# Import tools and libraries
from __future__ import print_function
from time import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn import metrics
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import numpy as np

# Download the data, if not already on disk, and load it as numpy arrays
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)

# introspect the images arrays to find the shapes (for plotting)
n_samples, h, w = lfw_people.images.shape

# for machine learning we use the flattened 2D data directly
# (relative pixel position info is ignored by this model)
X = lfw_people.data
n_features = X.shape[1]

# the label to predict is the id of the person
y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)
Output:
Total dataset size:
n_samples: 1288
n_features: 1850
n_classes: 7
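Before modelling, it can help to eyeball a few of the portraits. A minimal sketch, assuming only the variables loaded above (the layout values are arbitrary choices):

# Preview a few raw faces with their labels
fig, axes = plt.subplots(1, 4, figsize=(8, 3))
for ax, img, label in zip(axes, lfw_people.images, y):
    ax.imshow(img, cmap=plt.cm.gray)
    ax.set_title(target_names[label], size=10)
    ax.set_xticks(())
    ax.set_yticks(())
plt.show()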
# Split into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

scaler = StandardScaler()
# Fit on the training set only
scaler.fit(X_train)
# Apply the transform to both the training set and the test set
# (note: the PCA below is fit on the unscaled X_train; these scaled
# copies are not used again in the steps that follow)
train_img = scaler.transform(X_train)
test_img = scaler.transform(X_test)

# Compute a PCA (eigenfaces) on the face dataset (treated as an unlabelled
# dataset): unsupervised feature extraction / dimensionality reduction
variance = 0.95
print("Extracting eigenfaces with %f variance from %d faces"
      % (variance, X_train.shape[0]))

Output:
Extracting eigenfaces with 0.950000 variance from 966 faces

# PCA model: keep enough components to explain 95% of the variance;
# whiten=True rescales the components to unit variance
t0 = time()
pca = PCA(0.95, whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))
print("Projecting the input data on the eigenfaces orthonormal basis")
Output:
done in 0.823s
# Apply the PCA transform to project the faces onto the eigenface basis
t0 = time()
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("done in %0.3fs" % (time() - t0))

Output:
Projecting the input data on the eigenfaces orthonormal basis
done in 0.012s

# Reshape the principal components back into image form
eigenfaces = pca.components_.reshape(-1, h, w)
eigenfaces.shape
Output:
(135, 50, 37)
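The shape shows that 135 components were kept. As a sanity check, one can confirm that these actually capture the requested 95% of the variance (a minimal sketch; pca is the fitted model from above):

# How many components were kept, and how much variance do they explain?
print(pca.n_components_)                    # 135 in this run
print(pca.explained_variance_ratio_.sum())  # should be >= 0.95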
# plot the gallery of the most significant eigenfaces
# (plot_gallery is defined in the qualitative evaluation section below;
# run that definition first)
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)
plt.show()
# Train a logistic regression classification model
print("Fitting the classifier to the training set")
t0 = time()
clf = LogisticRegression(solver='lbfgs')
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
Output:
Fitting the classifier to the training set
done in 0.068s
# Quantitative evaluation of the model quality on the test set
print("Predicting people's names on the test set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))
Output:
Predicting people's names on the test set
done in 0.002s
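Because logistic regression is a probabilistic model, the classifier can also report how confident it is in each prediction. A minimal sketch using the fitted clf from above:

# Per-class probabilities for the first test face
proba = clf.predict_proba(X_test_pca[:1])[0]
for name, p in zip(target_names, proba):
    print("%-18s %.3f" % (name, p))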
# Evaluate the accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy
Output:
0.7763975155279503
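To put the ~0.78 accuracy in context, it is worth comparing against the trivial baseline of always predicting the most frequent person (George W Bush dominates the dataset, 135 of the 322 test faces, so the baseline is about 0.42). A minimal sketch using numpy:

# Majority-class baseline: always predict the most common label
baseline = np.bincount(y_test).max() / float(len(y_test))
baseline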
# Classification report (see the note on the F1 score below)
print(classification_report(y_test, y_pred, target_names=target_names))
Output:
                   precision    recall  f1-score   support

     Ariel Sharon       0.65      0.77      0.71        22
     Colin Powell       0.86      0.75      0.80        57
  Donald Rumsfeld       0.62      0.80      0.70        25
    George W Bush       0.84      0.82      0.83       135
Gerhard Schroeder       0.79      0.77      0.78        30
      Hugo Chavez       0.71      0.75      0.73        20
       Tony Blair       0.66      0.64      0.65        33

        micro avg       0.78      0.78      0.78       322
        macro avg       0.73      0.76      0.74       322
     weighted avg       0.78      0.78      0.78       322
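The F1 score reported above is the harmonic mean of precision and recall, F1 = 2 * P * R / (P + R), which penalises a classifier that trades one off heavily against the other. Verifying the George W Bush row by hand:

# F1 as the harmonic mean of precision and recall
p, r = 0.84, 0.82                # George W Bush row above
f1 = 2 * p * r / (p + r)
print("%.2f" % f1)               # 0.83, matching the report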
# Evaluate the confusion matrix
confusion = confusion_matrix(y_test, y_pred, labels=range(n_classes))

# Visualisation
import seaborn as sns; sns.set()
ax = sns.heatmap(confusion, annot=True, fmt="d")
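The raw heatmap indexes classes by number; passing the person names makes it easier to read. A small sketch (xticklabels/yticklabels are standard seaborn heatmap arguments):

# Same heatmap, with person names on the axes
ax = sns.heatmap(confusion, annot=True, fmt="d",
                 xticklabels=target_names, yticklabels=target_names)
plt.xlabel("predicted")
plt.ylabel("true")
plt.show()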
# Qualitative evaluation of the predictions using matplotlib
def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[i], size=12)
        plt.xticks(())
        plt.yticks(())

# plot the result of the prediction on a portion of the test set
def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)

prediction_titles = [title(y_pred, y_test, target_names, i)
                     for i in range(y_pred.shape[0])]
plot_gallery(X_test, prediction_titles, h, w)
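As a final qualitative check, the 95%-variance representation can be mapped back to pixel space with pca.inverse_transform to see how much facial detail the 135 components preserve. A minimal sketch reusing plot_gallery from above:

# Reconstruct test faces from their PCA projection and compare visually
X_test_restored = pca.inverse_transform(X_test_pca)
restored_titles = ["restored %d" % i for i in range(X_test_restored.shape[0])]
plot_gallery(X_test_restored, restored_titles, h, w)
plt.show()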