If you find this content useful, consider buying a copy of this book.

Chapter 8: Data Science Libraries #

This chapter serves as a primer on many common data science libraries in Python. It is more of a cookbook you can use to look up examples. You don't need to master this material to be an expert programmer, but it is helpful as a reference when you need to look something up.

Learn numpy #


What is numpy? #

  • Low-level multi-dimensional array library
  • A programmer's Excel
  • The building block for many key Python libraries:
    • Pandas
    • Sklearn
    • Tensorflow

Hello World Numpy Workflow #

The numpy library is a scientific computing library. Here are some examples of how to use it.

import numpy as np

Make an array #

a = np.arange(6).reshape(2, 3)
a.shape
    (2, 3)
a.size
    6
a.dtype.name
    'int64'
a
    array([[0, 1, 2],
           [3, 4, 5]])

Create an Array #

One Dimensional Array #

a = np.array([2,4,6,8])
print(f"Shape {a.shape}")
print(f"Content: {a}")
    Shape (4,)
    Content: [2 4 6 8]

Two Dimensional Array #

a = np.array([(2,4,6,8),(20,40,60,80)])
print(f"Shape: {a.shape}")
print(f"Content: {a}")
    Shape: (2, 4)
    Content: [[ 2  4  6  8]
     [20 40 60 80]]

Create Sequence of Numbers #

a = np.arange(1,20)
print(f"Shape: {a.shape}")
print(f"Content: {a}")

    Shape: (19,)
    Content: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]

Create a zero-filled multi-dimensional array #

a = np.zeros( (2,3) )
print(f"Shape: {a.shape}")
print(f"Content: {a}")
    Shape: (2, 3)
    Content: [[0. 0. 0.]
     [0. 0. 0.]]
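
NumPy has a few related initializers worth knowing; as a small extra sketch (not part of the original example), np.ones and np.full pre-fill values, while np.empty only allocates memory, so its contents are arbitrary:

b = np.ones((2, 3))       # all ones
c = np.full((2, 3), 7)    # filled with a constant value
d = np.empty((2, 3))      # uninitialized; contents are whatever was already in memory
print(f"Ones: {b}")
print(f"Full: {c}")
print(f"Empty shape: {d.shape}")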

Learn sklearn #

Supervised Machine Learning: Classification Modeling Workflow #

Key Evaluation Terms #

  • Amazon ML Key Classification Metrics

  • sklearn classification metrics

  • Precision: Measures the fraction of examples predicted as positive that are actual positives. The range is 0 to 1. A larger value indicates better predictive accuracy.

  • Recall: Measures the fraction of actual positives that are predicted as positive. The range is 0 to 1. A larger value indicates better predictive accuracy.

  • F1-score: The harmonic mean of precision and recall.

  • AUC: Measures the ability of the model to predict a higher score for positive examples than for negative examples. The range is 0 to 1. A larger value indicates better predictive accuracy.

  • False Positive Rate: The false positive rate (FPR) measures the false alarm rate, or the fraction of actual negatives that are predicted as positive. The range is 0 to 1. A smaller value indicates better predictive accuracy.
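
To make these metrics concrete, here is a minimal sketch using sklearn.metrics; the labels and scores below are made-up toy values, not output from any model in this chapter:

from sklearn import metrics

# Toy ground-truth labels, hard predictions, and predicted probabilities
y_true = [1, 0, 1, 1, 0, 1, 0, 0]
y_pred = [1, 0, 1, 0, 0, 1, 1, 0]
y_score = [0.9, 0.2, 0.8, 0.4, 0.3, 0.7, 0.6, 0.1]

print(metrics.precision_score(y_true, y_pred))  # TP / (TP + FP)
print(metrics.recall_score(y_true, y_pred))     # TP / (TP + FN)
print(metrics.f1_score(y_true, y_pred))         # harmonic mean of precision and recall
print(metrics.roc_auc_score(y_true, y_score))   # uses the scores, not the hard predictions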

Digits Dataset #

sklearn modeling #

https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html

# Standard scientific Python imports
import matplotlib.pyplot as plt

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics

# The digits dataset
digits = datasets.load_digits()

# The data that we are interested in is made of 8x8 images of digits, let's
# have a look at the first 4 images, stored in the `images` attribute of the
# dataset.  If we were working from image files, we could load them using
# matplotlib.pyplot.imread.  Note that each image must have the same size. For these
# images, we know which digit they represent: it is given in the 'target' of
# the dataset.
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:4]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)

# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])

# Now predict the value of the digit on the second half:
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:4]):
    plt.subplot(2, 4, index + 5)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)

plt.show()
Classification report for classifier
SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
    coef0=0.0, decision_function_shape='ovr', degree=3,
    gamma=0.001, kernel='rbf', max_iter=-1, probability=False,
    random_state=None, shrinking=True, tol=0.001, verbose=False):
                  precision    recall  f1-score   support

               0       1.00      0.99      0.99        88
               1       0.99      0.97      0.98        91
               2       0.99      0.99      0.99        86
               3       0.98      0.87      0.92        91
               4       0.99      0.96      0.97        92
               5       0.95      0.97      0.96        91
               6       0.99      0.99      0.99        91
               7       0.96      0.99      0.97        89
               8       0.94      1.00      0.97        88
               9       0.93      0.98      0.95        92

        accuracy                           0.97       899
       macro avg       0.97      0.97      0.97       899
    weighted avg       0.97      0.97      0.97       899


    Confusion matrix:
    [[87  0  0  0  1  0  0  0  0  0]
     [ 0 88  1  0  0  0  0  0  1  1]
     [ 0  0 85  1  0  0  0  0  0  0]
     [ 0  0  0 79  0  3  0  4  5  0]
     [ 0  0  0  0 88  0  0  0  0  4]
     [ 0  0  0  0  0 88  1  0  0  2]
     [ 0  1  0  0  0  0 90  0  0  0]
     [ 0  0  0  0  0  1  0 88  0  0]
     [ 0  0  0  0  0  0  0  0 88  0]
     [ 0  0  0  1  0  1  0  0  0 90]]

png

Yellowbrick Confusion Matrix #

The yellowbrick library is here:

http://www.scikit-yb.org/en/latest/api/classifier/confusion_matrix.html

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from yellowbrick.classifier import ConfusionMatrix

# We'll use the handwritten digits data set from scikit-learn.
# Each feature of this dataset is an 8x8 pixel image of a handwritten number.
# Digits.data converts these 64 pixels into a single array of features
digits = load_digits()
X = digits.data
y = digits.target

X_train, X_test, y_train, y_test = train_test_split(
                                       X, y, test_size =0.2,
                                       random_state=1)

model = LogisticRegression()

# The ConfusionMatrix visualizer takes a model
cm = ConfusionMatrix(model, classes=[0,1,2,3,4,5,6,7,8,9])

# Fit fits the passed model. This is unnecessary if you pass the
# visualizer a pre-fitted model
cm.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data.
# Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm.score(X_test, y_test)

# How did we do?
cm.poof()

png

ROCAUC #

http://www.scikit-yb.org/en/latest/api/classifier/rocauc.html

from yellowbrick.classifier import ROCAUC
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
classes=[0,1,2,3,4,5,6,7,8,9]

# Instantiate the visualizer with the classification model
visualizer = ROCAUC(model, classes=classes)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
g = visualizer.poof()             # Draw/show/poof the data

png

Supervised Machine Learning: Regression Modeling Workflow #

Ingest #

Source: http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_MLB_HeightsWeights

import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/noahgift/"\
                 "functional_intro_to_python/master/data/mlb_weight_ht.csv")
df.head()

Find N/A

df.shape
    (1034, 6)
df.isnull().values.any()
    True
df = df.dropna()
df.isnull().values.any()
    False
df.shape
    (1033, 6)

Clean #

df.rename(index=str,
             columns={"Height(inches)": "Height", "Weight(pounds)": "Weight"},
             inplace=True)
df.head()

Model #

from sklearn import linear_model
from sklearn.model_selection import train_test_split

Create Features #

var = df['Weight'].values
var.shape
    (1033,)
y = df['Weight'].values #Target
y = y.reshape(-1, 1)
X = df['Height'].values #Feature(s)
X = X.reshape(-1,1)
#X = df[['Height', 'Age']].values
y.shape
    (1033, 1)

Split data #

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
    (826, 1) (826, 1)
    (207, 1) (207, 1)

Fit the model #

lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)
lm.predict?

Returns a NumPy array

type(predictions)
    numpy.ndarray

Plot Predictions #

from matplotlib import pyplot as plt
plt.scatter(y_test, predictions)
plt.xlabel("Actual Weight")
plt.ylabel("Predicted Weight")
    Text(0, 0.5, 'Predicted Weight')

png

model.score(X_test, y_test)
    0.3074268236931288

Use Cross-Validation #

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
scores = cross_val_score(model, X, y, cv=6)
scores
    array([0.29670427, 0.22459508, 0.29543549, 0.30012566, 0.19191046,
           0.34579806])

Plot Cross-validation Predictions #

predictions = cross_val_predict(model, X, y, cv=6)
plt.scatter(y, predictions)
    <matplotlib.collections.PathCollection at 0x7fbc7c3d0320>

png

accuracy = metrics.r2_score(y, predictions)
accuracy
    0.280770222008195

Conclusion #

Height alone is a weak predictor of weight in this dataset: the fitted model explains only about 30% of the variance (R² ≈ 0.31 on the hold-out set and ≈ 0.28 under cross-validation), so additional features such as age would likely be needed for a stronger model.

Unsupervised Machine Learning: Clustering #

Ingest #

import pandas as pd

df = pd.read_csv(
    "https://raw.githubusercontent.com/noahgift/"\
    "food/master/data/features.en.openfoodfacts.org.products.csv")
# drop columns we don't need
df.drop(["Unnamed: 0", "exceeded", "g_sum", "energy_100g"], axis=1, inplace=True)
df = df.drop(df.index[[1,11877]]) #drop outlier
df.rename(
    index=str, columns={"reconstructed_energy": "energy_100g"},
    inplace=True)
df.head()
df.columns
    Index(['fat_100g', 'carbohydrates_100g', 'sugars_100g', 'proteins_100g',
           'salt_100g', 'energy_100g', 'product'],
          dtype='object')

Create Features to Cluster #

df_cluster_features = df.drop("product", axis=1)

Scale the data #

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
print(scaler.fit(df_cluster_features))
print(scaler.transform(df_cluster_features))
    MinMaxScaler(copy=True, feature_range=(0, 1))
    [[2.85700000e-01 6.42900000e-01 1.53063241e-01 6.89388819e-02
      0.00000000e+00 5.06782123e-01]
     [5.71400000e-01 1.78600000e-01 4.71343874e-02 2.06913199e-01
      6.02500000e-04 6.33675978e-01]
     [1.87500000e-01 5.78100000e-01 1.66205534e-01 1.70223038e-01
      6.87500000e-05 4.36433520e-01]
     ...
     [0.00000000e+00 1.33300000e-01 1.43577075e-01 3.44694410e-02
      1.87500000e-05 5.06391061e-02]
     [0.00000000e+00 1.62500000e-01 1.72430830e-01 3.44694410e-02
      1.87500000e-05 6.17318436e-02]
     [0.00000000e+00 0.00000000e+00 1.18577075e-02 3.44694410e-02
      0.00000000e+00 0.00000000e+00]]

Add Cluster Labels #

from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=3)
kmeans = k_means.fit(scaler.transform(df_cluster_features))
df['cluster'] = kmeans.labels_
df.head()
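
As a quick sanity check (an extra sketch beyond the original workflow), you can count how many products landed in each cluster and compare their average nutrient profiles:

# Number of products per cluster
print(df['cluster'].value_counts())

# Average nutrient profile per cluster
print(df.groupby('cluster')[['fat_100g', 'carbohydrates_100g',
                             'sugars_100g', 'proteins_100g']].mean())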

Learn pandas #

Time Series Workflow #

Ingest Zillow #

import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
from sklearn.cluster import KMeans
color = sns.color_palette()
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline
df = pd.read_csv("https://raw.githubusercontent.com/noahgift/"
  "real_estate_ml/master/data/Zip_Zhvi_SingleFamilyResidence_2018.csv")
df[["City", "State"]].head()

median_prices = df.median()
#sf_prices = df["City"] == "San Francisco".median()

Median USA Prices, December 2018

median_prices.tail()
    2018-08   196900.000
    2018-09   198100.000
    2018-10   199600.000
    2018-11   201100.000
    2018-12   202150.000
    dtype: float64
sf_df = df[df["City"] == "San Francisco"].median()
df_comparison = pd.concat([sf_df,median_prices], axis=1)
df_comparison.columns = ["San Francisco","Median USA"]
df_comparison.tail()

Transpose #

df_transposed = df.transpose()
#df_transposed.head(15)
df_transposed.columns

    RangeIndex(start=0, stop=15508, step=1)

Create Cities DataFrame #

cities = df_transposed.iloc[2].values
cities_df = df_transposed.drop(df_transposed.index[:7])
cities_df.columns = cities
#cities_df.head()

Create time series #

from pandas.plotting import autocorrelation_plot
sf_values = cities_df.iloc[:, 9].values
index = pd.DatetimeIndex(cities_df.index.values)
sf_data = pd.Series(sf_values, index=index)

Autocorrelation Plot #

Reference: https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-autocorrelation

autocorrelation_plot(sf_data)
    <matplotlib.axes._subplots.AxesSubplot at 0x7fbc63393860>

png

sf_data.tail()
    2018-08-01    3993000
    2018-09-01    3999000
    2018-10-01    4014600
    2018-11-01    4009500
    2018-12-01    4016600
    dtype: object

Simple Plot #

sf_data.plot()
    <matplotlib.axes._subplots.AxesSubplot at 0x7fbc7c3ec668>

png

DataFrame Workflow #

Ingest #

import pandas as pd
df = pd.read_csv(
    "https://raw.githubusercontent.com/noahgift/"\
    "food/master/data/features.en.openfoodfacts.org.products.csv")
# drop columns we don't need
df.drop(["Unnamed: 0", "exceeded", "g_sum", "energy_100g"], axis=1, inplace=True)
df = df.drop(df.index[[1,11877]]) #drop outlier
df.rename(
  index=str, columns={"reconstructed_energy": "energy_100g"},
  inplace=True)
df.head()

EDA #

df.columns
    Index(['fat_100g', 'carbohydrates_100g', 'sugars_100g', 'proteins_100g',
           'salt_100g', 'energy_100g', 'product'],
          dtype='object')

Rows and Attributes #

df.shape
    (45026, 7)

First Five Columns #

df.head()

Descriptive Statistics #

df.describe()

Correlations #

df.corr()

Filtering by Quantiles #

Find fatty foods above the 98th percentile #

high_fat_df = df[df.fat_100g > df.fat_100g.quantile(.98)]
high_fat_text = high_fat_df['product'].values
len(high_fat_text)
    878
high_fat_text[0]
    'Organic Salted Nut Mix'

Find protein-rich foods above the 98th percentile #

high_protein_df = df[df.proteins_100g > df.proteins_100g.quantile(.98)]
high_protein_text = high_protein_df['product'].values
len(high_protein_text)
    896
high_protein_text[0]
    'Organic Yellow Split Peas'

Learn tensorflow #
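
A hello-world-style sketch, assuming TensorFlow 2.x is installed (the synthetic data and tiny Keras model below are illustrative only, not tied to the food dataset used elsewhere in this chapter):

import numpy as np
import tensorflow as tf

# Tensors behave much like numpy arrays
a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
print(tf.matmul(a, tf.ones((2, 2))))

# Fit a tiny Keras classifier on random data
X = np.random.rand(200, 4).astype("float32")
y = (X.sum(axis=1) > 2).astype("float32")

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(4,)),
    tf.keras.layers.Dense(8, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(X, y, epochs=5, verbose=0)
print(model.evaluate(X, y, verbose=0))  # [loss, accuracy]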

Use seaborn for 2D plots #

Faceted Distribution Plots #

Generate distribution plots for the macronutrient columns (protein, sugar, fat, and carbohydrates).

import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import matplotlib.cbook
warnings.filterwarnings("ignore",category=matplotlib.cbook.mplDeprecation)
sns.set(style="white", palette="muted", color_codes=True)


# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True)
sns.despine(left=True)

# Plot each distribution in one of the four subplots
sns.distplot(df.proteins_100g, color="b", ax=axes[0, 0])
sns.distplot(df.sugars_100g, color="g", ax=axes[0, 1])
sns.distplot(df.fat_100g, color="r", ax=axes[1, 1])
sns.distplot(df.carbohydrates_100g, color="m", ax=axes[1, 0])
    <matplotlib.axes._subplots.AxesSubplot at 0x7fbc6368b5f8>

png

Pairplot #

import seaborn as sns
sns.pairplot(df)
    <seaborn.axisgrid.PairGrid at 0x7fbc63bb19b0>

png

lmplot #

import seaborn as sns
sns.lmplot(x="fat_100g", y="proteins_100g",  data=df.sample(100))
    <seaborn.axisgrid.FacetGrid at 0x7fbc6271a6d8>

png

heatmap #

sns.heatmap(df.corr())
    <matplotlib.axes._subplots.AxesSubplot at 0x7fbc632a1c88>

png

Specialized Visualization Libraries #

Yellowbrick #

Visualize Lasso Regression Model Accuracy with Yellowbrick #

Note: this uses Lasso regression, fit on the height/weight data from the regression workflow above.

from yellowbrick.regressor import PredictionError
from sklearn.linear_model import Lasso

lasso = Lasso()
visualizer = PredictionError(lasso)
visualizer.fit(X_train, y_train) # Fit the training data to the visualizer
visualizer.score(X_test, y_test) # Evaluate the model on the test data
g = visualizer.poof()            # Draw/show/poof the data

png

Visualize cross-validated scores for Linear regression model #

See this: http://www.scikit-yb.org/en/latest/api/model_selection/cross_validation.html

from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from yellowbrick.model_selection import CVScores

# Create a new figure and axes
_, ax = plt.subplots()
cv = KFold(12)

oz = CVScores(
    linear_model.LinearRegression(), ax=ax, cv=cv, scoring='r2'
)

oz.fit(X, y)
oz.poof()

png

Word Cloud #

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

High protein foods #

Find protein-rich foods above the 98th percentile

high_protein_df = df[df.proteins_100g > df.proteins_100g.quantile(.98)]
high_protein_text = high_protein_df['product'].values
len(high_protein_text)
    896

Word Cloud High Protein

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(high_protein_text))

fig = plt.figure(
    figsize = (10, 7),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

png

High fat foods #

Find fatty foods above the 98th percentile

high_fat_df = df[df.fat_100g > df.fat_100g.quantile(.98)]
high_fat_text = high_fat_df['product'].values
len(high_fat_text)
    878

Word Cloud High Fat

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(high_fat_text))

fig = plt.figure(
    figsize = (10, 7),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

png

High sugar foods #

Find sugary foods above the 98th percentile

high_sugar_df = df[df.sugars_100g > df.sugars_100g.quantile(.98)]
high_sugar_text = high_sugar_df['product'].values
len(high_sugar_text)
    893

Word Cloud High Sugar

wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(str(high_sugar_text))

fig = plt.figure(
    figsize = (10, 7),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

png

Learn Natural Language Processing Libraries #

NLTK Stopword Processing #

Setup Stop Words #

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
    [nltk_data] Downloading package stopwords to /root/nltk_data...
    [nltk_data]   Unzipping corpora/stopwords.zip.

Preprocess Text #

dataset = df['product'].fillna("").values
raw_text_data = [d.split() for d in dataset]

Remove stop words #

text_data = [[word for word in document if word.lower() not in stop]
             for document in raw_text_data]

Gensim Topic Modeling #

from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

2D Plots #

def enable_plotly_in_cell():
  import IPython
  from plotly.offline import init_notebook_mode
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
  '''))
  init_notebook_mode(connected=False)

from plotly.offline import init_notebook_mode
enable_plotly_in_cell()
init_notebook_mode(connected=False)

import cufflinks as cf
cf.go_offline()
df.sample(1000).iplot(kind='bubble',
                      size='energy_100g',
                      mode='markers',
                      x='fat_100g',
                      y='proteins_100g',
                      xTitle='Fat',
                      yTitle='Protein',
                      text="product")

png

Protein-Fat-Carb 3D Plot #

import plotly.offline as py
import plotly.graph_objs as go

from plotly.offline import init_notebook_mode
enable_plotly_in_cell()


trace1 = go.Scatter3d(
    x=df["fat_100g"],
    y=df["carbohydrates_100g"],
    z=df["proteins_100g"],
    mode='markers',
    text=df["product"],
    marker=dict(
        size=12,
        # set color to an array/list of desired values
        color=df["cluster"],
        colorscale='Viridis',   # choose a colorscale
        opacity=0.8
    )
)
data = [trace1]
layout = go.Layout(
    showlegend=False,
    title="Protein-Fat-Carb:  Food Energy Types",
    scene = dict(
        xaxis = dict(title='X: Fat Content-100g'),
        yaxis = dict(title="Y:  Carbohydrate Content-100g"),
        zaxis = dict(title="Z:  Protein Content-100g"),
    ),
    width=1000,
    height=900,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='3d-scatter-colorscale')

png