Basic Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Tell plotly to use the 'colab' renderer by default when a figure is shown.
pio.renderers.default = 'colab'
from itables import show
Exam1 Python Commands Cheat Sheet
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio
# Tell plotly to use the 'colab' renderer by default when a figure is shown.
pio.renderers.default = 'colab'
from itables import show
Inside the parentheses you put the exact name/location of the data.
# Option 1: read the tables on a web page.
# NOTE: pd.read_html returns a *list* of data frames (one per table found),
# so pick the table you want with an index, e.g. DF[0].
my_website = "https://en.wikipedia.org/wiki/List_of_Academy_Award%E2%80%93winning_films"
DF = pd.read_html(my_website)

# Option 2: read a CSV file stored next to your notebook.
DF = pd.read_csv('mapdataall.csv')

# Option 3: read a CSV file from a web address.
file_location = 'https://joannabieri.com/introdatascience/data/starwars.csv'
DF = pd.read_csv(file_location)
Assume our data is stored in the variable DF.
# Size of the data as (number of rows, number of columns).
DF.shape

# Names of the columns.
DF.columns

# Summary statistics for the data.
DF.describe()

# Display the whole data frame as an interactive table.
show(DF)

# Select a single column by name.
DF['name']

# Select several columns by passing a list of names.
my_columns = ['name','hair_color', 'skin_color', 'eye_color']
DF[my_columns]
All plots have three basic parts:
# Part 1: create the figure from the data and the column to plot.
fig = px.histogram(DF, x='column name')

# Part 2: customize the layout (title, axis labels, figure size).
fig.update_layout(
    title=' ',
    title_x=0.5,
    xaxis_title=" ",
    yaxis_title=" ",
    autosize=False,
    width=800,
    height=500,
)

# Part 3: display the figure.
fig.show()
# Display the data frame DF with itables' show().
show(DF)
name | height | mass | hair_color | skin_color | eye_color | birth_year | sex | gender | homeworld | species | films | vehicles | starships |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loading ITables v2.1.4 from the internet...
(need help?) |
# Scatter Plot
# Part 1: create the figure. Height goes on the x axis, mass on the y axis;
# hovering shows the character name, and points are colored by sex.
fig = px.scatter(DF,
                 x='height',
                 y='mass',
                 hover_data='name',
                 color='sex')

# Part 2: customize the layout. The axis titles must match the columns
# chosen above (x is height, y is mass).
fig.update_layout(title='Mass vs. Height of Starwars Characters',
                  title_x=0.5,
                  xaxis_title="Height",
                  yaxis_title="Mass",
                  autosize=False,
                  width=800,
                  height=500)

# Part 3: display the figure.
fig.show()
# Histogram
# Part 1: create the figure. Heights are split into (about) 10 bins and the
# bars are colored by sex.
fig = px.histogram(DF,
                   x='height',
                   nbins=10,
                   color='sex')

# Part 2: customize the layout. bargap leaves a small gap between the bars.
fig.update_layout(bargap=0.02,
                  title='Height of Starwars Characters',
                  title_x=0.5,
                  yaxis_title="Frequency",
                  xaxis_title="Height",
                  autosize=False,
                  width=800,
                  height=500)

# Part 3: display the figure.
fig.show()
# Faceted Histogram
# Part 1: create the figure. One histogram of height is drawn per value of
# 'sex' (facet_col), with 2 facets per row (facet_col_wrap), and the bars
# are colored by eye color.
fig = px.histogram(DF,
                   x='height',
                   nbins=10,
                   facet_col='sex',
                   facet_col_wrap=2,
                   color='eye_color')

# Part 2: customize the layout. The axis titles must match the plot:
# height is on the x axis and the counts are on the y axis.
fig.update_layout(bargap=0.02,
                  title='Height of Starwars Characters',
                  title_x=0.5,
                  xaxis_title="Height",
                  yaxis_title="Frequency",
                  autosize=False,
                  width=800,
                  height=800)

# Part 3: display the figure.
fig.show()
All of these commands will give you information about how many categories are in a column (or columns).
# How many rows fall into each category of the 'sex' column.
show(DF['sex'].value_counts())

# The distinct values that appear in the 'sex' column.
show(DF['sex'].drop_duplicates())

# The distinct combinations of values across several columns.
my_columns = ['sex','eye_color']
DF[my_columns].drop_duplicates()
You can technically leave out by=, but including it sometimes helps me to understand the command. If you pass a list of columns, it sorts by more than one column at a time, in the order the columns are given.
# Sort the rows by height, tallest first (ascending=False means descending).
DF.sort_values(by='height',ascending=False)
Operator | Definition |
---|---|
< | less than |
> | greater than |
<= | less than or equal to |
>= | greater than or equal to |
== | exactly equal to |
!= | not equal to |
Operator | Definition |
---|---|
and | check if two things are both true |
or | check if one of two things is true |
in | checks if something is in another thing |
not | checks if something is false |
Operator | Definition |
---|---|
& | checks that two masks are both true |
When masking on more than one condition, wrap each condition in parentheses:
# Keep the rows whose homeworld is NOT Tatooine.
mask = DF['homeworld'] != 'Tatooine'
DF[mask]

# Keep the rows whose homeworld IS Tatooine.
mask = DF['homeworld'] == 'Tatooine'
DF[mask]

# Keep the rows with height over 100 and save them as a new data frame.
# .copy() makes DF_tall independent of DF.
mask = DF['height'] > 100
DF_tall = DF[mask].copy()

# Keep the rows whose homeworld is any one of three planets.
# A single row can only have one homeworld, so the conditions are combined
# with | (or); combining them with & (and) would never match any row.
mask = (DF['homeworld'] == 'Tatooine') | (DF['homeworld'] == 'Naboo') | (DF['homeworld'] == 'Alderaan')
DF_homeworld = DF[mask].copy()
To add a column you just enter a new column name into DF[] followed by a calculation
# New column computed from two existing columns, row by row.
DF['height_and_mass'] = DF['height'] + DF['mass']

# New True/False column: True only when both conditions hold for the row.
DF['is_human_female'] = (DF['species']=='Human') & (DF['sex']=='female')
The groupby command just separates the data in Python's memory. Then you apply a function to see the results. In this class we have been using the following pattern:
# Template: replace the placeholder names with your own column names.
my_columns = ['grouping column', 'another column'] # must include the column you are grouping by
DF[my_columns].groupby('grouping column').describe()
You can use all sorts of functions at the end:
If you just want to see one group, you can use get_group:
# Summary statistics of height for each value of 'sex'.
my_columns = ['sex','height']
DF[my_columns].groupby('sex').describe()

# Only the rows that belong to the 'female' group.
my_columns = ['sex','height']
DF[my_columns].groupby('sex').get_group('female')

# Mean height and mean mass for each value of 'sex'.
my_columns = ['sex','height','mass']
DF[my_columns].groupby('sex').mean()
If you have two or more data frames that share a column, you can join them together.
# Two small example data frames that share the 'id' column.
# ids 1 and 2 appear in both; id 3 is only in DF_fake1 and id 4 only in DF_fake2.
DF_fake1 = pd.DataFrame({'id':[1,2,3],'data1':['x1','x2','x3']})
DF_fake2 = pd.DataFrame({'id':[1,2,4],'data1':['y1','y2','y4']})
# Display each example data frame (one statement per line).
show(DF_fake1)
show(DF_fake2)
id | data1 |
---|---|
Loading ITables v2.1.4 from the internet...
(need help?) |
id | data1 |
---|---|
Loading ITables v2.1.4 from the internet...
(need help?) |
# Left join: keep every id from DF_fake1; missing matches from DF_fake2 become NaN.
pd.merge(DF_fake1, DF_fake2, on='id',how='left')
id | data1_x | data1_y | |
---|---|---|---|
0 | 1 | x1 | y1 |
1 | 2 | x2 | y2 |
2 | 3 | x3 | NaN |
# Right join: keep every id from DF_fake2; missing matches from DF_fake1 become NaN.
pd.merge(DF_fake1, DF_fake2, on='id',how='right')
id | data1_x | data1_y | |
---|---|---|---|
0 | 1 | x1 | y1 |
1 | 2 | x2 | y2 |
2 | 4 | NaN | y4 |
# Outer join: keep every id found in either data frame; gaps become NaN.
pd.merge(DF_fake1, DF_fake2, on='id',how='outer')
id | data1_x | data1_y | |
---|---|---|---|
0 | 1 | x1 | y1 |
1 | 2 | x2 | y2 |
2 | 3 | x3 | NaN |
3 | 4 | NaN | y4 |
# Inner join: keep only the ids found in both data frames.
pd.merge(DF_fake1, DF_fake2, on='id',how='inner')
id | data1_x | data1_y | |
---|---|---|---|
0 | 1 | x1 | y1 |
1 | 2 | x2 | y2 |
In the example below we are going to create a column in our data frame that counts the total number of films that a Starwars character has been in
# Count the films for each character: split the comma-separated string in
# the 'films' column into a list and take its length.
DF['number_films'] = DF['films'].apply(lambda x: len(x.split(',')))

'''
Why does this work???
Here we are looking at each piece of data in the films column.
The data looks like this:
"A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of the Sith"
When we do x.split(',') this returns a list:
["A New Hope", "The Empire Strikes Back", "Return of the Jedi", "Revenge of the Sith"]
We find the length of this list to count up the movies!
'''

# Look at the new column next to the character names.
DF[['name','number_films']]
# Apply a calculation to every value in a column: multiply each mass by 2.20462.
# NOTE(review): 2.20462 is the kilograms-to-pounds factor — assumes 'mass' is in kg; confirm.
DF['weight'] = DF['mass'].apply(lambda x: x*2.20462)