Case Study of the Titanic Data Set: A Classification Problem Statement using Stratified Cross-Validation
Dataset
https://s3.amazonaws.com/acadgildsite/wordpress_images/datasets/titanic/test.csv
Free Step-by-step Guide To Become A Data Scientist
Subscribe and get this detailed guide absolutely FREE
https://s3.amazonaws.com/acadgildsite/wordpress_images/datasets/titanic/train.csv
Prepare Datasets
import re

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Load the data sets:
#   X      - train data frame (features are derived from it in later steps)
#   y      - the 'Survived' target column taken from the train data
#   Z      - test data frame
X, Z = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))
y = X['Survived']

# Keep untouched copies: later feature-engineering steps read the raw
# 'Cabin' and 'Name' columns after they have been dropped from X and Z.
X_orig, Z_orig = X.copy(), Z.copy()

# Preview the first rows of the train data
X.head()
Output:
# Fill missing values in both the train and test sets:
#   'Age' and 'Fare' use the median of their own set; 'Embarked' uses the
#   port code 'S' (presumably the most frequent port - verify against the data).
# The filled column is assigned back instead of calling fillna(inplace=True)
# on a column selection: the chained in-place form is deprecated and has no
# effect on the parent frame once pandas Copy-on-Write is enabled.
X['Age'] = X['Age'].fillna(X['Age'].median())
X['Embarked'] = X['Embarked'].fillna('S')
Z['Age'] = Z['Age'].fillna(Z['Age'].median())
Z['Fare'] = Z['Fare'].fillna(Z['Fare'].median())

# One-hot encode the categorical columns of both sets.
# Maps source column -> prefix used for the generated dummy columns.
dummy_prefixes = {
    'Pclass': 'PClass',
    'Sex': 'Sex',
    'SibSp': 'SibSp',
    'Parch': 'Parch',
    'Embarked': 'Embarked',
}
X = pd.concat(
    [X] + [pd.get_dummies(X[column], prefix=prefix) for column, prefix in dummy_prefixes.items()],
    axis=1,
)
Z = pd.concat(
    [Z] + [pd.get_dummies(Z[column], prefix=prefix) for column, prefix in dummy_prefixes.items()],
    axis=1,
)

# Drop the encoded originals and the columns that are not used as features
# (the target 'Survived' only exists in the train set).
raw_columns = ['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked', 'PassengerId']
X = X.drop(columns=['Survived'] + raw_columns)
Z = Z.drop(columns=raw_columns)

# Align the test features with the train features. Dummy levels that occur
# only in the test set (e.g. 'Parch_9') are removed, and levels seen only in
# the train set are added as all-zero columns, so both frames end up with
# the same columns in the same order.
Z = Z.reindex(columns=X.columns, fill_value=0)

# Check data structure
X.head()
Output:
# Dimensions of the prepared train frame as (rows, columns)
X.shape
Output: (891, 24)
# Preview the first rows of the prepared test frame
Z.head()
Output:
# Dimensions of the prepared test frame as (rows, columns)
Z.shape
Output: (418, 24)
Feature Engineering
## Perform binning for Age and Fare
# Replace each numeric column with its quartile group (q=4 gives integer codes
# 0-3 because labels=False) and drop the original column. The same steps are
# applied to the train and the test set so both keep identical feature columns.
# NOTE: quartile edges are computed separately for each set.
for frame in (X, Z):
    for column in ('Age', 'Fare'):
        frame[column + '_cat'] = pd.qcut(frame[column], q=4, labels=False)
        frame.drop(columns=[column], inplace=True)

## Cabin usage: add the boolean feature 'has_Cabin', True when the passenger's
## raw 'Cabin' value is present. 'Cabin' was already dropped from X and Z, so
## the untouched copies are used.
X['has_Cabin'] = X_orig.Cabin.notnull()
Z['has_Cabin'] = Z_orig.Cabin.notnull()
Output:
## Extract the titles of passengers and do dummy treatment

# A title is a capitalised word preceded by a space and followed by a period,
# e.g. " Mr.". Raw string: "\." must reach the regex engine unchanged and is
# an invalid escape sequence in a normal string literal.
_TITLE_RE = re.compile(r' ([A-Z][a-z]+)\.')

# Spelling variants folded into their common equivalent.
_TITLE_ALIASES = {'Mlle': 'Miss', 'Mme': 'Mrs', 'Ms': 'Miss'}

# Titles grouped into the single category 'Special'.
_SPECIAL_TITLES = ['Don', 'Dona', 'Rev', 'Dr', 'Major', 'Lady', 'Sir', 'Col', 'Capt', 'Countess', 'Jonkheer']


def _extract_titles(names):
    """Return a one-column ('Title') frame of normalised titles for *names*.

    names: pandas Series of passenger name strings.
    The result shares the index of *names*.
    Raises AttributeError if a name contains no title matching the pattern.
    """
    titles = pd.DataFrame(index=names.index)
    titles['Title'] = names.apply(lambda full_name: _TITLE_RE.search(full_name).group(1))
    titles['Title'] = titles['Title'].replace(_TITLE_ALIASES)
    titles['Title'] = titles['Title'].replace(_SPECIAL_TITLES, 'Special')
    return titles


# Names were dropped from X and Z earlier, so read them from the untouched
# copies, then concatenate the title dummies with the parent data frames.
X_title = _extract_titles(X_orig.Name)
X = pd.concat([X, pd.get_dummies(X_title['Title'], prefix='Title')], axis=1)

Z_title = _extract_titles(Z_orig.Name)
Z = pd.concat([Z, pd.get_dummies(Z_title['Title'], prefix='Title')], axis=1)

# Check column names and cell values
X.head()
Output:
Decision Tree
# Stratified 5-fold splitter, shuffled with a fixed seed for reproducibility
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

# Candidate tree settings: depths 1-4 crossed with minimum leaf sizes 1-4
decision_tree_params = {
    'max_depth': [1, 2, 3, 4],
    'min_samples_leaf': [1, 2, 3, 4],
}

# Search the whole grid on the stratified folds using all available cores,
# then fit on the train data
decision_tree_grid = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=17),
    param_grid=decision_tree_params,
    cv=skf,
    n_jobs=-1,
    verbose=True,
).fit(X, y)

# Report the winning parameter combination and its cross-validation score
print('Best decision tree params:', decision_tree_grid.best_params_)
print('Best decision tree cross validation score:', decision_tree_grid.best_score_)
Output:
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best decision tree params: {‘max_depth’: 4, ‘min_samples_leaf’: 1}
Best decision tree cross validation score: 0.8170594837261503
# Predict on the test data set with the best tree found by the grid search
best_tree = decision_tree_grid.best_estimator_
decision_tree_predictions = best_tree.predict(Z)
Logit
# Candidate values for the tuning parameter C: 500 log-spaced points
# from 10**-2 to 10**3
c_values = np.logspace(-2, 3, num=500)

# Cross-validated search over C on the stratified folds, fitted on the train data
logit_searcher = LogisticRegressionCV(
    Cs=c_values,
    cv=skf,
    n_jobs=-1,
    verbose=1,
).fit(X, y)

# Report the selected tuning parameter
print('Best C:', logit_searcher.C_)
Output:
Best C: [0.68181623]
# Apply the fitted logistic-regression search to the test data set
logit_predictions = logit_searcher.predict(X=Z)