Chapter 8: Data Science Libraries #
This chapter serves as a primer on many common data science libraries in Python. It is organized as a cookbook you can use to look up examples. You don't need to master this material to be an expert programmer, but it is helpful as a reference when you need to look something up.
Learn numpy #
What is numpy? #
- Low-level multi-dimensional array library
- A programmer's Excel
- The building block for many key Python libraries (see the sketch after this list):
- Pandas
- Sklearn
- Tensorflow
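To make the last point concrete, here is a minimal sketch (using a small made-up DataFrame) showing that a pandas column is backed by a numpy array:
import pandas as pd
# pandas stores its columns as numpy arrays under the hood
example_df = pd.DataFrame({"height": [72, 69, 74], "weight": [180.0, 175.0, 210.0]})
print(type(example_df["height"].values))   # <class 'numpy.ndarray'>
print(example_df.values.shape)             # (3, 2)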
Hello World Numpy Workflow #
The numpy library is a scientific computing library. These are some examples of how to use it.
import numpy as np
Make an array #
a = np.arange(6).reshape(2, 3)
Print shape #
a.shape
(2, 3)
Print size #
a.size
6
Print type #
a.dtype.name
'int64'
Print contents #
a
array([[0, 1, 2],
[3, 4, 5]])
Create an Array #
One Dimensional Array #
a = np.array([2,4,6,8])
print(f"Shape {a.shape}")
print(f"Content: {a}")
Shape (4,)
Content: [2 4 6 8]
Two Dimensional Array #
a = np.array([(2,4,6,8),(20,40,60,80)])
print(f"Shape: {a.shape}")
print(f"Content: {a}")
Shape: (2, 4)
Content: [[ 2 4 6 8]
[20 40 60 80]]
Create Sequence of Numbers #
a = np.arange(1,20)
print(f"Shape: {a.shape}")
print(f"Content: {a}")
Shape: (19,)
Content: [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]
Create a zero-filled multi-dimensional array #
a = np.zeros( (2,3) )
print(f"Shape: {a.shape}")
print(f"Content: {a}")
Shape: (2, 3)
Content: [[0. 0. 0.]
[0. 0. 0.]]
Learn sklearn #
Supervised Machine Learning: Classification Modeling Workflow #
Key Evaluation Terms #
- **Precision**: Measures the fraction of examples predicted as positive that are actually positive. The range is 0 to 1; a larger value indicates better predictive accuracy.
- **Recall**: Measures the fraction of actual positives that are predicted as positive. The range is 0 to 1; a larger value indicates better predictive accuracy.
- **F1-score**: The harmonic mean of precision and recall.
- **AUC**: Measures the ability of the model to assign a higher score to positive examples than to negative examples.
- **False Positive Rate**: The false positive rate (FPR) measures the false alarm rate, or the fraction of actual negatives that are predicted as positive. The range is 0 to 1; a smaller value indicates better predictive accuracy.

These metrics can all be computed with scikit-learn, as shown in the sketch below.
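A minimal sketch of computing these metrics with sklearn.metrics, using made-up binary labels and scores purely for illustration:
from sklearn import metrics
# Toy ground-truth labels and model outputs (illustrative only)
y_true = [0, 0, 1, 1, 1, 0, 1, 0]
y_pred = [0, 1, 1, 1, 0, 0, 1, 0]                     # hard class predictions
y_score = [0.2, 0.6, 0.9, 0.8, 0.4, 0.1, 0.7, 0.3]    # predicted probabilities
print(metrics.precision_score(y_true, y_pred))   # precision
print(metrics.recall_score(y_true, y_pred))      # recall
print(metrics.f1_score(y_true, y_pred))          # F1-score
print(metrics.roc_auc_score(y_true, y_score))    # AUC
# False positive rate from the confusion matrix
tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
print(fp / (fp + tn))                            # FPR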
Digits Dataset #
sklearn modeling #
https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html
# Standard scientific Python imports
import matplotlib.pyplot as plt
# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics
# The digits dataset
digits = datasets.load_digits()
# The data that we are interested in is made of 8x8 images of digits, let's
# have a look at the first 4 images, stored in the `images` attribute of the
# dataset. If we were working from image files, we could load them using
# matplotlib.pyplot.imread. Note that each image must have the same size. For these
# images, we know which digit they represent: it is given in the 'target' of
# the dataset.
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:4]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)
# To apply a classifier on this data, we need to flatten the images to
# turn the data into a (samples, features) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)
# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])
# Now predict the value of the digit on the second half:
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])
print("Classification report for classifier %s:\n%s\n"
% (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:4]):
    plt.subplot(2, 4, index + 5)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)
plt.show()
Classification report for classifier
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
coef0=0.0, decision_function_shape='ovr', degree=3,
gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
random_state=None, shrinking=True, tol=0.001, verbose=False):
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        88
           1       0.99      0.97      0.98        91
           2       0.99      0.99      0.99        86
           3       0.98      0.87      0.92        91
           4       0.99      0.96      0.97        92
           5       0.95      0.97      0.96        91
           6       0.99      0.99      0.99        91
           7       0.96      0.99      0.97        89
           8       0.94      1.00      0.97        88
           9       0.93      0.98      0.95        92

    accuracy                           0.97       899
   macro avg       0.97      0.97      0.97       899
weighted avg       0.97      0.97      0.97       899
Confusion matrix:
[[87 0 0 0 1 0 0 0 0 0]
[ 0 88 1 0 0 0 0 0 1 1]
[ 0 0 85 1 0 0 0 0 0 0]
[ 0 0 0 79 0 3 0 4 5 0]
[ 0 0 0 0 88 0 0 0 0 4]
[ 0 0 0 0 0 88 1 0 0 2]
[ 0 1 0 0 0 0 90 0 0 0]
[ 0 0 0 0 0 1 0 88 0 0]
[ 0 0 0 0 0 0 0 0 88 0]
[ 0 0 0 1 0 1 0 0 0 90]]
Yellowbrick Confusion Matrix #
The yellowbrick library is here:
http://www.scikit-yb.org/en/latest/api/classifier/confusion_matrix.html
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from yellowbrick.classifier import ConfusionMatrix
# We'll use the handwritten digits data set from scikit-learn.
# Each feature of this dataset is an 8x8 pixel image of a handwritten number.
# Digits.data converts these 64 pixels into a single array of features
digits = load_digits()
X = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)
model = LogisticRegression()
# The ConfusionMatrix visualizer takes a model
cm = ConfusionMatrix(model, classes=[0,1,2,3,4,5,6,7,8,9])
# Fit fits the passed model. This is unnecessary if you pass the
# visualizer a pre-fitted model
cm.fit(X_train, y_train)
# To create the ConfusionMatrix, we need some test data.
# Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm.score(X_test, y_test)
# How did we do?
cm.poof()
ROCAUC #
http://www.scikit-yb.org/en/latest/api/classifier/rocauc.html
from yellowbrick.classifier import ROCAUC
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
classes=[0,1,2,3,4,5,6,7,8,9]
# Instantiate the visualizer with the classification model
visualizer = ROCAUC(model, classes=classes)
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
g = visualizer.poof() # Draw/show/poof the data
Supervised Machine Learning: Regression Modeling Workflow #
Ingest #
Source: http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_MLB_HeightsWeights
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/noahgift/"\
"functional_intro_to_python/master/data/mlb_weight_ht.csv")
df.head()
Find N/A Values #
df.shape
(1034, 6)
df.isnull().values.any()
True
df = df.dropna()
df.isnull().values.any()
False
df.shape
(1033, 6)
Clean #
df.rename(index=str,
columns={"Height(inches)": "Height", "Weight(pounds)": "Weight"},
inplace=True)
df.head()
Model #
from sklearn import linear_model
from sklearn.model_selection import train_test_split
Create Features #
var = df['Weight'].values
var.shape
(1033,)
y = df['Weight'].values #Target
y = y.reshape(-1, 1)
X = df['Height'].values #Feature(s)
X = X.reshape(-1,1)
#X = df[['Height', 'Age']].values
y.shape
(1033, 1)
Split data #
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(826, 1) (826, 1)
(207, 1) (207, 1)
Fit the model #
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)
lm.predict?
Returns a numpy array:
type(predictions)
numpy.ndarray
Plot Predictions #
from matplotlib import pyplot as plt
plt.scatter(y_test, predictions)
plt.xlabel("Actual Weight")
plt.ylabel("Predicted Weight")
Text(0, 0.5, 'Predicted Weight')
Print Accuracy (R²) of the Linear Regression Model #
model.score(X_test, y_test)
0.3074268236931288
Use Cross-Validation #
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
scores = cross_val_score(model, X, y, cv=6)
scores
array([0.29670427, 0.22459508, 0.29543549, 0.30012566, 0.19191046,
0.34579806])
Plot Cross-validation Predictions #
predictions = cross_val_predict(model, X, y, cv=6)
plt.scatter(y, predictions)
<matplotlib.collections.PathCollection at 0x7fbc7c3d0320>
accuracy = metrics.r2_score(y, predictions)
accuracy
0.280770222008195
Conclusion #
- Cross-validation gave a more robust estimate of model performance
- Adding more data or more features could improve the model (a sketch using both Height and Age follows this list)
- Major League Baseball players may be an unusual population for predicting weight
- A bigger dataset is available here: http://socr.ucla.edu/docs/resources/SOCR_Data/SOCR_Data_Dinov_020108_HeightsWeights.html
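As a follow-up to the second point, here is a minimal sketch that refits the model with both Height and Age as features. Treat the Age column name as an assumption about the CSV (it is hinted at by the commented-out line in the Create Features step), and expect the scores to vary by split:
# A hedged sketch: refit the regression with two features (Height and Age).
# Assumes the cleaned MLB DataFrame `df` from the Ingest/Clean steps is in scope
# and that the CSV has an Age column that survived the cleaning.
from sklearn import linear_model
from sklearn.model_selection import train_test_split, cross_val_score
X_multi = df[['Height', 'Age']].values   # two features instead of one
y_multi = df['Weight'].values
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=0)
lm_multi = linear_model.LinearRegression()
lm_multi.fit(Xm_train, ym_train)
# Compare these scores to the single-feature model above
print(lm_multi.score(Xm_test, ym_test))                          # held-out R²
print(cross_val_score(lm_multi, X_multi, y_multi, cv=6).mean())  # mean CV R²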
Unsupervised Machine Learning: Clustering #
Ingest #
import pandas as pd
df = pd.read_csv(
"https://raw.githubusercontent.com/noahgift/"\
"food/master/data/features.en.openfoodfacts.org.products.csv")
# drop columns we don't need
df.drop(["Unnamed: 0", "exceeded", "g_sum", "energy_100g"], axis=1, inplace=True)
df = df.drop(df.index[[1,11877]]) #drop outlier
df.rename(
index=str, columns={"reconstructed_energy": "energy_100g"},
inplace=True)
df.head()
df.columns
Index(['fat_100g', 'carbohydrates_100g', 'sugars_100g', 'proteins_100g',
'salt_100g', 'energy_100g', 'product'],
dtype='object')
Create Features to Cluster #
df_cluster_features = df.drop("product", axis=1)
Scale the data #
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(scaler.fit(df_cluster_features))
print(scaler.transform(df_cluster_features))
MinMaxScaler(copy=True, feature_range=(0, 1))
[[2.85700000e-01 6.42900000e-01 1.53063241e-01 6.89388819e-02
0.00000000e+00 5.06782123e-01]
[5.71400000e-01 1.78600000e-01 4.71343874e-02 2.06913199e-01
6.02500000e-04 6.33675978e-01]
[1.87500000e-01 5.78100000e-01 1.66205534e-01 1.70223038e-01
6.87500000e-05 4.36433520e-01]
...
[0.00000000e+00 1.33300000e-01 1.43577075e-01 3.44694410e-02
1.87500000e-05 5.06391061e-02]
[0.00000000e+00 1.62500000e-01 1.72430830e-01 3.44694410e-02
1.87500000e-05 6.17318436e-02]
[0.00000000e+00 0.00000000e+00 1.18577075e-02 3.44694410e-02
0.00000000e+00 0.00000000e+00]]
Add Cluster Labels #
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3)
kmeans = k_means.fit(scaler.transform(df_cluster_features))
df['cluster'] = kmeans.labels_
df.head()
Learn pandas #
Time Series Workflow #
Ingest Zillow #
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
from sklearn.cluster import KMeans
color = sns.color_palette()
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline
df = pd.read_csv("https://raw.githubusercontent.com/noahgift/"
"real_estate_ml/master/data/Zip_Zhvi_SingleFamilyResidence_2018.csv")
df[["City", "State"]].head()
median_prices = df.median()
#sf_prices = df["City"] == "San Francisco".median()
Median USA prices as of December 2018:
median_prices.tail()
2018-08 196900.000
2018-09 198100.000
2018-10 199600.000
2018-11 201100.000
2018-12 202150.000
dtype: float64
sf_df = df[df["City"] == "San Francisco"].median()
df_comparison = pd.concat([sf_df,median_prices], axis=1)
df_comparison.columns = ["San Francisco","Median USA"]
df_comparison.tail()
Transpose #
df_transposed = df.transpose()
#df_transposed.head(15)
df_transposed.columns
RangeIndex(start=0, stop=15508, step=1)
Create Cities DataFrame #
cities = df_transposed.iloc[2].values
cities_df = df_transposed.drop(df_transposed.index[:7])
cities_df.columns = cities
#cities_df.head()
Create time series #
from pandas.plotting import autocorrelation_plot
sf_values = cities_df.iloc[:, 9].values
index = pd.DatetimeIndex(cities_df.index.values)
sf_data = pd.Series(sf_values, index=index)
Autocorrelation plot #
autocorrelation_plot(sf_data)
<matplotlib.axes._subplots.AxesSubplot at 0x7fbc63393860>
sf_data.tail()
2018-08-01 3993000
2018-09-01 3999000
2018-10-01 4014600
2018-11-01 4009500
2018-12-01 4016600
dtype: object
Simple Plot #
sf_data.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fbc7c3ec668>
DataFrame Workflow #
Ingest #
import pandas as pd
df = pd.read_csv(
"https://raw.githubusercontent.com/noahgift/"\
"food/master/data/features.en.openfoodfacts.org.products.csv")
# drop columns we don't need
df.drop(["Unnamed: 0", "exceeded", "g_sum", "energy_100g"], axis=1, inplace=True)
df = df.drop(df.index[[1,11877]]) #drop outlier
df.rename(
index=str, columns={"reconstructed_energy": "energy_100g"},
inplace=True)
df.head()
EDA #
df.columns
Index(['fat_100g', 'carbohydrates_100g', 'sugars_100g', 'proteins_100g',
'salt_100g', 'energy_100g', 'product'],
dtype='object')
Rows and Attributes #
df.shape
(45026, 7)
First Five Columns #
df.head()
Descriptive Statistics #
df.describe()
Correlations #
df.corr()
Filtering by Quantiles #
Find fatty foods in the 98th percentile #
high_fat_df = df[df.fat_100g > df.fat_100g.quantile(.98)]
high_fat_text = high_fat_df['product'].values
len(high_fat_text)
878
high_fat_text[0]
'Organic Salted Nut Mix'
Find protein foods in the 98th percentile #
high_protein_df = df[df.proteins_100g > df.proteins_100g.quantile(.98)]
high_protein_text = high_protein_df['product'].values
len(high_protein_text)
896
high_protein_text[0]
'Organic Yellow Split Peas'
Learn seaborn #
Use seaborn for 2D plots #
Faceted Distribution Plots #
Generate distributions based on energy type
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import matplotlib.cbook
warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)
sns.set(style="white", palette="muted", color_codes=True)
# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True)
sns.despine(left=True)
# Plot each distribution in one of the 4 subplots
sns.distplot(df.proteins_100g, color="b", ax=axes[0, 0])
sns.distplot(df.sugars_100g, color="g", ax=axes[0, 1])
sns.distplot(df.fat_100g, color="r", ax=axes[1, 1])
sns.distplot(df.carbohydrates_100g, color="m", ax=axes[1, 0])
<matplotlib.axes._subplots.AxesSubplot at 0x7fbc6368b5f8>
Pairplot #
import seaborn as sns
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x7fbc63bb19b0>
lmplot #
import seaborn as sns
sns.lmplot(x="fat_100g", y="proteins_100g", data=df.sample(100))
<seaborn.axisgrid.FacetGrid at 0x7fbc6271a6d8>
heatmap #
sns.heatmap(df.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x7fbc632a1c88>
Specialized Visualization Libraries #
Yellowbrick #
Visualize Lasso Regression Model Accuracy with Yellowbrick #
Note: this example uses Lasso regression.
from yellowbrick.regressor import PredictionError
from sklearn.linear_model import Lasso
lasso = Lasso()
visualizer = PredictionError(lasso)
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
g = visualizer.poof() # Draw/show/poof the data
Visualize cross-validated scores for a linear regression model #
See this: http://www.scikit-yb.org/en/latest/api/model_selection/cross_validation.html
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from yellowbrick.model_selection import CVScores
# Create a new figure and axes
_, ax = plt.subplots()
cv = KFold(12)
oz = CVScores(
linear_model.LinearRegression(), ax=ax, cv=cv, scoring='r2'
)
oz.fit(X, y)
oz.poof()
Word Cloud #
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
High protein foods #
Find protein foods in the 98th percentile
high_protein_df = df[df.proteins_100g > df.proteins_100g.quantile(.98)]
high_protein_text = high_protein_df['product'].values
len(high_protein_text)
896
Word Cloud High Protein
wordcloud = WordCloud(
width = 3000,
height = 2000,
background_color = 'black',
stopwords = STOPWORDS).generate(str(high_protein_text))
fig = plt.figure(
figsize = (10, 7),
facecolor = 'k',
edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
High fat foods #
Find fatty foods in the 98th percentile
high_fat_df = df[df.fat_100g > df.fat_100g.quantile(.98)]
high_fat_text = high_fat_df['product'].values
len(high_fat_text)
878
Word Cloud High Fat
wordcloud = WordCloud(
width = 3000,
height = 2000,
background_color = 'black',
stopwords = STOPWORDS).generate(str(high_fat_text))
fig = plt.figure(
figsize = (10, 7),
facecolor = 'k',
edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
High sugar foods #
Find sugary foods in the 98th percentile
high_sugar_df = df[df.sugars_100g > df.sugars_100g.quantile(.98)]
high_sugar_text = high_sugar_df['product'].values
len(high_sugar_text)
893
Word Cloud High Sugar
wordcloud = WordCloud(
width = 3000,
height = 2000,
background_color = 'black',
stopwords = STOPWORDS).generate(str(high_sugar_text))
fig = plt.figure(
figsize = (10, 7),
facecolor = 'k',
edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
Learn Natural Language Processing Libraries #
NLTK Stopword Processing #
Setup Stop Words #
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
Preprocess Text #
dataset = df['product'].fillna("").values
raw_text_data = [d.split() for d in dataset]
Remove stop words #
# Each document is a list of words; filter stop words within each document
text_data = [[word for word in doc if word.lower() not in stop]
             for doc in raw_text_data]
Gensim Topic Modeling #
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(
corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)
2D Plots #
def enable_plotly_in_cell():
    import IPython
    from plotly.offline import init_notebook_mode
    display(IPython.core.display.HTML('''
      <script src="/static/components/requirejs/require.js"></script>
    '''))
    init_notebook_mode(connected=False)
from plotly.offline import init_notebook_mode
enable_plotly_in_cell()
init_notebook_mode(connected=False)
import cufflinks as cf
cf.go_offline()
df.sample(1000).iplot(kind='bubble',
size='energy_100g',
mode='markers',
x='fat_100g',
y='proteins_100g',
xTitle='Fat',
yTitle='Protein',
text="product")
Protein-Fat-Carb 3D Plot #
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode
enable_plotly_in_cell()
trace1 = go.Scatter3d(
x=df["fat_100g"],
y=df["carbohydrates_100g"],
z=df["proteins_100g"],
mode='markers',
text=df["product"],
marker=dict(
size=12,
# set color to an array/list of desired values
color=df["cluster"],
colorscale='Viridis', # choose a colorscale
opacity=0.8
)
)
data = [trace1]
layout = go.Layout(
showlegend=False,
title="Protein-Fat-Carb: Food Energy Types",
scene = dict(
xaxis = dict(title='X: Fat Content-100g'),
yaxis = dict(title="Y: Carbohydrate Content-100g"),
zaxis = dict(title="Z: Protein Content-100g"),
),
width=1000,
height=900,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='3d-scatter-colorscale')