Combinando datos: *merge* y *join*
In [1]:
import pandas as pd
import numpy as np
class display(object):
"""Display HTML representation of multiple objects"""
template = """<div style="float: left; padding: 10px;">
<p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
</div>"""
def __init__(self, *args):
self.args = args
def _repr_html_(self):
return '\n'.join(self.template.format(a, eval(a)._repr_html_())
for a in self.args)
def __repr__(self):
return '\n\n'.join(a + '\n' + repr(eval(a))
for a in self.args)
Algebra Relacional¶
Categorias de uniones¶
Uniones uno-a-uno (one-to-one)¶
In [2]:
df1 = pd.DataFrame({'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
'group': ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame({'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
'hire_date': [2004, 2008, 2012, 2014]})
display('df1', 'df2')
Out[2]:
In [3]:
df3 = pd.merge(df1, df2)
df3
Out[3]:
Uniones muchas-a-uno (many-to-one)¶
In [4]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
'supervisor': ['Carly', 'Guido', 'Steve']})
display('df3', 'df4', 'pd.merge(df3, df4)')
Out[4]:
Uniones muchas-a-muchas (many-to-many)¶
In [5]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting',
'Engineering', 'Engineering', 'HR', 'HR'],
'skills': ['math', 'spreadsheets', 'coding', 'linux',
'spreadsheets', 'organization']})
display('df1', 'df5', "pd.merge(df1, df5)")
Out[5]:
Especificando la llave de unión en merge¶
La palabra reservada on
¶
In [6]:
display('df1', 'df2', "pd.merge(df1, df2, on='employee')")
Out[6]:
Las palabras reservadas left_on
y right_on
¶
In [7]:
df3 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'salary': [70000, 80000, 120000, 90000]})
display('df1', 'df3', 'pd.merge(df1, df3, left_on="employee", right_on="name")')
Out[7]:
In [8]:
pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)
Out[8]:
Las palabras reservadas left_index
y right_index
¶
In [9]:
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
display('df1a', 'df2a')
Out[9]:
In [10]:
display('df1a', 'df2a',
"pd.merge(df1a, df2a, left_index=True, right_index=True)")
Out[10]:
In [11]:
display('df1a', 'df2a', 'df1a.join(df2a)')
Out[11]:
In [12]:
display('df1a', 'df3', "pd.merge(df1a, df3, left_index=True, right_on='name')")
Out[12]:
Aritmética de conjuntos para uniones¶
In [13]:
df6 = pd.DataFrame({'name': ['Peter', 'Paul', 'Mary'],
'food': ['fish', 'beans', 'bread']},
columns=['name', 'food'])
df7 = pd.DataFrame({'name': ['Mary', 'Joseph'],
'drink': ['wine', 'beer']},
columns=['name', 'drink'])
display('df6', 'df7', 'pd.merge(df6, df7)')
Out[13]:
In [14]:
pd.merge(df6, df7, how='inner')
Out[14]:
In [15]:
display('df6', 'df7', "pd.merge(df6, df7, how='outer')")
Out[15]:
In [16]:
display('df6', 'df7', "pd.merge(df6, df7, how='left')")
Out[16]:
Nombres de columnas traslapadas: la plabra reservada suffixes
¶
In [17]:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [3, 1, 4, 2]})
display('df8', 'df9', 'pd.merge(df8, df9, on="name")')
Out[17]:
In [18]:
display('df8', 'df9', 'pd.merge(df8, df9, on="name", suffixes=["_L", "_R"])')
Out[18]:
Ejemplo: datos sobre los EU¶
In [19]:
# Following are shell commands to download the data
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv
# !curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv
In [20]:
pop = pd.read_csv('data/state-population.csv')
areas = pd.read_csv('data/state-areas.csv')
abbrevs = pd.read_csv('data/state-abbrevs.csv')
display('pop.head()', 'areas.head()', 'abbrevs.head()')
Out[20]:
In [21]:
merged = pd.merge(pop, abbrevs, how='outer',
left_on='state/region', right_on='abbreviation')
merged = merged.drop('abbreviation', 1) # drop duplicate info
merged.head()
Out[21]:
In [22]:
merged.isnull().any()
Out[22]:
In [23]:
merged[merged['population'].isnull()].head()
Out[23]:
In [24]:
merged.loc[merged['state'].isnull(), 'state/region'].unique()
Out[24]:
In [25]:
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States'
merged.isnull().any()
Out[25]:
In [26]:
final = pd.merge(merged, areas, on='state', how='left')
final.head()
Out[26]:
In [27]:
final.isnull().any()
Out[27]:
In [28]:
final['state'][final['area (sq. mi)'].isnull()].unique()
Out[28]:
In [29]:
final.dropna(inplace=True)
final.head()
Out[29]:
In [30]:
data2010 = final.query("year == 2010 & ages == 'total'")
data2010.head()
Out[30]:
In [31]:
data2010.set_index('state', inplace=True)
density = data2010['population'] / data2010['area (sq. mi)']
In [32]:
density.sort_values(ascending=False, inplace=True)
density.head()
Out[32]:
In [33]:
density.tail()
Out[33]: