Difference between revisions of "Python: Pandas - The Series Data Structure"
From MyWiki
| (2 intermediate revisions by the same user not shown) | |||
| Line 51: | Line 51: | ||
s | s | ||
| + | </source> | ||
| + | Querying a series | ||
| + | <source lang="python"> | ||
| + | sports = {'Archery': 'Bhutan', | ||
| + | 'Golf': 'Scotland', | ||
| + | 'Sumo': 'Japan', | ||
| + | 'Taekwondo': 'South Korea'} | ||
| + | s = pd.Series(sports) | ||
| + | s | ||
| + | s.iloc[3] | ||
| + | |||
| + | |||
| + | s.loc['Golf'] | ||
| + | |||
| + | |||
| + | s[3] | ||
| + | |||
| + | |||
| + | s['Golf'] | ||
| + | |||
| + | |||
| + | sports = {99: 'Bhutan', | ||
| + | 100: 'Scotland', | ||
| + | 101: 'Japan', | ||
| + | 102: 'South Korea'} | ||
| + | s = pd.Series(sports) | ||
| + | |||
| + | |||
| + | s[0] #With integer labels, s[0] is a label lookup, not s.iloc[0]; there is no label 0, so this raises a KeyError | ||
| + | |||
| + | s = pd.Series([100.00, 120.00, 101.00, 3.00]) | ||
| + | s | ||
| + | |||
| + | |||
| + | total = 0 | ||
| + | for item in s: | ||
| + | total+=item | ||
| + | print(total) | ||
| + | |||
| + | |||
| + | import numpy as np | ||
| + | |||
| + | total = np.sum(s) | ||
| + | print(total) | ||
| + | |||
| + | |||
| + | #Create a large Series of 10,000 random integers drawn from 0 to 999 | ||
| + | s = pd.Series(np.random.randint(0,1000,10000)) | ||
| + | s.head() | ||
| + | |||
| + | len(s) | ||
| + | |||
| + | |||
| + | %%timeit -n 100 | ||
| + | summary = 0 | ||
| + | for item in s: | ||
| + | summary+=item | ||
| + | |||
| + | |||
| + | %%timeit -n 100 | ||
| + | summary = np.sum(s) | ||
| + | |||
| + | |||
| + | s+=2 #adds two to each item in s using broadcasting | ||
| + | s.head() | ||
| + | |||
| + | |||
| + | for label, value in s.iteritems(): | ||
| + | s.set_value(label, value+2) | ||
| + | s.head() | ||
| + | |||
| + | |||
| + | %%timeit -n 10 | ||
| + | s = pd.Series(np.random.randint(0,1000,10000)) | ||
| + | for label, value in s.iteritems(): | ||
| + | s.loc[label]= value+2 | ||
| + | |||
| + | |||
| + | %%timeit -n 10 | ||
| + | s = pd.Series(np.random.randint(0,1000,10000)) | ||
| + | s+=2 | ||
| + | |||
| + | s = pd.Series([1, 2, 3]) | ||
| + | s.loc['Animal'] = 'Bears' | ||
| + | s | ||
| + | |||
| + | |||
| + | original_sports = pd.Series({'Archery': 'Bhutan', | ||
| + | 'Golf': 'Scotland', | ||
| + | 'Sumo': 'Japan', | ||
| + | 'Taekwondo': 'South Korea'}) | ||
| + | cricket_loving_countries = pd.Series(['Australia', | ||
| + | 'Barbados', | ||
| + | 'Pakistan', | ||
| + | 'England'], | ||
| + | index=['Cricket', | ||
| + | 'Cricket', | ||
| + | 'Cricket', | ||
| + | 'Cricket']) | ||
| + | all_countries = original_sports.append(cricket_loving_countries) | ||
| + | |||
| + | |||
| + | original_sports | ||
| + | |||
| + | cricket_loving_countries | ||
| + | |||
| + | all_countries | ||
| + | |||
| + | all_countries.loc['Cricket'] | ||
| + | |||
| + | </source> | ||
| + | |||
| + | The DataFrame Data Structure | ||
| + | |||
| + | <source lang="python"> | ||
</source> | </source> | ||
Latest revision as of 18:15, 28 July 2019
# Notebook transcript: creating pandas Series objects.
# A bare expression at the end of a cell is what the notebook displays for it.
import pandas as pd
pd.Series?  # IPython help syntax (not plain Python): shows the Series documentation

# Series built from a plain list of strings.
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)

# Series built from a plain list of integers.
numbers = [1, 2, 3]
pd.Series(numbers)

# Same lists with a missing entry (None), to compare how each Series
# represents the gap in the displayed output.
animals = ['Tiger', 'Bear', None]
pd.Series(animals)

numbers = [1, 2, None]
pd.Series(numbers)

# Deliberate equality checks: NaN compares unequal to everything, itself
# included, so np.isnan is the reliable way to test for it.
import numpy as np
np.nan == None

np.nan == np.nan

np.isnan(np.nan)

# Series built from a dict: the dict keys become the index labels.
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s

s.index

# Values and index labels supplied separately.
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s

# Dict plus an explicit index; 'Hockey' is not a key of sports.
# NOTE(review): pandas is expected to keep only the listed labels and fill
# 'Hockey' with NaN -- confirm against the displayed cell output.
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s
Querying a series
sports = {'Archery': 'Bhutan', 'Golf': 'Scotland', 'Sumo': 'Japan', 'Taekwondo': 'South Korea'} s = pd.Series(sports) s s.iloc[3] s.loc['Golf'] s[3] s['Golf'] sports = {99: 'Bhutan', 100: 'Scotland', 101: 'Japan', 102: 'South Korea'} s = pd.Series(sports) s[0] #This won't call s.iloc[0] as one might expect, it generates an error instead s = pd.Series([100.00, 120.00, 101.00, 3.00]) s total = 0 for item in s: total+=item print(total) import numpy as np total = np.sum(s) print(total) #this creates a big series of random numbers s = pd.Series(np.random.randint(0,1000,10000)) s.head() len(s) %%timeit -n 100 summary = 0 for item in s: summary+=item %%timeit -n 100 summary = np.sum(s) s+=2 #adds two to each item in s using broadcasting s.head() for label, value in s.iteritems(): s.set_value(label, value+2) s.head() %%timeit -n 10 s = pd.Series(np.random.randint(0,1000,10000)) for label, value in s.iteritems(): s.loc[label]= value+2 %%timeit -n 10 s = pd.Series(np.random.randint(0,1000,10000)) s+=2 s = pd.Series([1, 2, 3]) s.loc['Animal'] = 'Bears' s original_sports = pd.Series({'Archery': 'Bhutan', 'Golf': 'Scotland', 'Sumo': 'Japan', 'Taekwondo': 'South Korea'}) cricket_loving_countries = pd.Series(['Australia', 'Barbados', 'Pakistan', 'England'], index=['Cricket', 'Cricket', 'Cricket', 'Cricket']) all_countries = original_sports.append(cricket_loving_countries) original_sports cricket_loving_countries all_countries all_countries.loc['Cricket']
The DataFrame Data Structure