# Linear Regression Case Study 1

## Case Study on Donut Data Set using Linear Regression

The Donut data set was provided in an interview round, where the task was to solve the problem statement hands-on. The data set can be accessed through the following link.


## Reading in the data

# import all necessary libraries and tools
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
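The original post does not show the loading step or the file paths for the two splits; a minimal sketch, assuming the train and test splits are provided as CSV files (hypothetical file names):

# read the train and test data (hypothetical file names -- replace with the paths supplied with the assignment)
train_df = pd.read_csv('donut_train.csv')
test_df = pd.read_csv('donut_test.csv')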

# list all columns of the train data
train_df.columns

Output:
Index(['Donut ID', 'Donut Estimator 1', 'Donut Area of cross section',
       'Donut Area of circumference circle',
       'Donut area of central hole / Donut Area of circumscribed circle',
       'Donut Estimator 2', 'Donut Estimator 3', 'Donut Estimator 4',
       'Donut Estimator 5', 'Donut volume Estimator 6', 'Location',
       'Donut Density', 'Donut volume'],
      dtype='object')

# check whether either split contains missing values
train_df.isnull().values.any()
Output: False

test_df.isnull().values.any()
Output: False
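Both checks return False, so no imputation is needed. Had any missing values turned up, a per-column count is a quick way to locate them; a minimal sketch:

# count missing values per column, most affected columns first
train_df.isnull().sum().sort_values(ascending=False).head()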

## Handling categorical features

# create dummy variables for the Location column of the train data
season_dummies = pd.get_dummies(train_df.Location, prefix='Location')

# drop the first dummy column (its information is implied by the remaining ones)
season_dummies.drop(season_dummies.columns[0], axis=1, inplace=True)

# concatenate the original DataFrame and the dummy DataFrame (axis=0 means rows, axis=1 means columns)
train_df = pd.concat([train_df, season_dummies], axis=1)
train_df.drop(['Donut ID', 'Location'], axis=1, inplace=True)

# print 5 random rows
train_df.sample(n=5, random_state=1)

# create dummy variables for the test data
season_dummies = pd.get_dummies(test_df.Location, prefix='Location')

# drop the first dummy column
season_dummies.drop(season_dummies.columns[0], axis=1, inplace=True)

# concatenate the original DataFrame and the dummy DataFrame (axis=0 means rows, axis=1 means columns)
test_df = pd.concat([test_df, season_dummies], axis=1)
test_df.drop(['Donut ID', 'Location'], axis=1, inplace=True)

# print 5 random rows
test_df.sample(n=5, random_state=1)
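Dropping the first dummy column avoids the redundant level (the so-called dummy variable trap): with an intercept in the model, one Location level is already implied by the others. For reference, pandas can do the encoding and the drop in one step via `drop_first=True`; a minimal sketch, applied to a frame that still contains the Location column:

# one-step alternative: encode Location and drop the redundant first level
season_dummies = pd.get_dummies(train_df.Location, prefix='Location', drop_first=True)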

## Visualizing the data

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 14

# gather the feature columns (everything except the two targets)
feature_cols = [val for val in train_df.columns if val not in ['Donut Density', 'Donut volume']]
feature_cols

Output:
['Donut Estimator 1',
 'Donut Area of cross section',
 'Donut Area of circumference circle',
 'Donut area of central hole / Donut Area of circumscribed circle',
 'Donut Estimator 2',
 'Donut Estimator 3',
 'Donut Estimator 4',
 'Donut Estimator 5',
 'Donut volume Estimator 6',
 'Location_Texas']

# scatter plots of each feature against Donut Density
fig, axs = plt.subplots(1, len(feature_cols), sharey=True)
for index, feature in enumerate(feature_cols):
    train_df.plot(kind='scatter', x=feature, y='Donut Density', ax=axs[index], figsize=(16, 3))

# scatter plots of each feature against Donut volume
fig, axs = plt.subplots(1, len(feature_cols), sharey=True)
for index, feature in enumerate(feature_cols):
    train_df.plot(kind='scatter', x=feature, y='Donut volume', ax=axs[index], figsize=(16, 3))

# line plot of Donut Density over the index
train_df['Donut Density'].plot()

# line plot of Donut volume over the index
train_df['Donut volume'].plot()

## Check for multicollinearity

def remove_collinear_features(x, target, threshold):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation coefficient
        greater than the threshold. Removing collinear features can help a model
        generalize and improves the interpretability of the model.
    Inputs:
        x: dataframe containing the features and the target column
        target: name of the target column
        threshold: any features with correlations greater than this value are removed
    Output:
        dataframe that contains only the non-highly-collinear features
    '''
    # Define y and x
    y = x[target]
    x = x.drop(columns=[target])

    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i):
            item = corr_matrix.iloc[j:(j + 1), (i + 1):(i + 2)]
            col = item.columns
            row = item.index
            val = abs(item.values)

            # If correlation exceeds the threshold
            if val >= threshold:
                # Print the correlated features and the correlation value
                print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(col.values[0])

    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    x = x.drop(columns=drops)

    # Add the target back in to the data
    x[target] = y
    return x
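For reference, a common way to list highly correlated feature pairs is an upper-triangle mask over the absolute correlation matrix; a minimal sketch (illustrative only, not used in the rest of this post):

# sketch: feature columns whose absolute correlation with an earlier column exceeds 0.7
features_only = train_df.drop(columns=['Donut Density', 'Donut volume'])
corr = features_only.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] >= 0.7).any()]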

def plot_line(y, predicted, target):
    # scatter plot of actual vs. predicted values with a y = x reference line
    fig, ax = plt.subplots()
    ax.scatter(y, predicted, edgecolors=(0, 1, 1))
    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
    ax.set_xlabel('Actual {}'.format(target))
    ax.set_ylabel('Predicted {}'.format(target))
    plt.show()

## Creating baseline with null RMSE

Null RMSE is the RMSE that could be achieved by always predicting the mean response value. It is a benchmark against which you may want to measure your regression model.
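In formula form, with $\bar{y}$ denoting the mean of the $n$ test responses, the null RMSE is

$$\text{RMSE}_{\text{null}} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \bar{y}\right)^2}$$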

# define a function that accepts a list of features and a target and prints the null RMSE on the test set
def get_baseline(feature_cols, target):
    X_train, X_test, y_train, y_test = train_df[feature_cols], test_df[feature_cols], train_df[target], test_df[target]

    # create a NumPy array with the same shape as y_test
    y_null = np.zeros_like(y_test, dtype=float)

    # fill the array with the mean value of y_test
    y_null.fill(y_test.mean())

    # compute null RMSE
    rmse = np.sqrt(metrics.mean_squared_error(y_test, y_null))

    # a simple point estimate of the target (median of the train responses)
    baseline_guess = np.median(y_train)

    print('The baseline guess for %s is a score of %0.2f' % (target, baseline_guess))
    print("Baseline Performance on the test set for %s: RMSE = %0.4f" % (target, rmse))

get_baseline(feature_cols, 'Donut volume')

Output: The baseline guess for Donut volume is a score of 83.40
Baseline Performance on the test set for Donut volume: RMSE = 10.5856

## Fitting on the full feature set

# define a function that accepts a list of features and returns the fitted model and testing RMSE
def train_test_rmse(train_df, test_df, feature_cols, target):
    X_train, X_test, y_train, y_test = train_df[feature_cols], test_df[feature_cols], train_df[target], test_df[target]
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    plot_line(y_test, y_pred, target)
    return linreg, np.sqrt(metrics.mean_squared_error(y_test, y_pred))
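Besides RMSE, it can be useful to also report R² on the test split. A minimal sketch using `metrics.r2_score` (assuming a fitted model `linreg` and the test split `X_test`, `y_test` are available in scope):

# optional: report R^2 alongside RMSE for a fitted model
y_pred = linreg.predict(X_test)
print('R^2 on the test set: %0.4f' % metrics.r2_score(y_test, y_pred))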

# fit a model for each target: 'Donut Density' and 'Donut volume'

# Donut Density
linreg, rmse_ = train_test_rmse(train_df, test_df, feature_cols, 'Donut Density')
print(rmse_)
# print(linreg.score(X_test, y_test))

# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 4.205028275576086

[('Donut Estimator 1', 0.019969286018910806),
 ('Donut Area of cross section', 0.056824378116820666),
 ('Donut Area of circumference circle', -0.011141124820625037),
 ('Donut area of central hole / Donut Area of circumscribed circle', 18.37532899880506),
 ('Donut Estimator 2', -0.013662198041061776),
 ('Donut Estimator 3', 0.03211097048326648),
 ('Donut Estimator 4', 0.0005844272728200844),
 ('Donut Estimator 5', 0.001613921718232509),
 ('Donut volume Estimator 6', -0.05474623086568598),
 ('Location_Texas', -2.3923430721543717)]

# Donut volume
linreg, rmse_ = train_test_rmse(train_df, test_df, feature_cols, 'Donut volume')
print(rmse_)

# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 9.625589368125372

[('Donut Estimator 1', 0.015167756318128774),
 ('Donut Area of cross section', 0.12148190231269254),
 ('Donut Area of circumference circle', -0.06206356694138626),
 ('Donut area of central hole / Donut Area of circumscribed circle', 49.63679964426164),
 ('Donut Estimator 2', -0.016590844171362112),
 ('Donut Estimator 3', 0.1104276320893024),
 ('Donut Estimator 4', 0.004140928646805438),
 ('Donut Estimator 5', 0.00040124955448831573),
 ('Donut volume Estimator 6', -0.1388884821714789),
 ('Location_Texas', -5.674542346646663)]

## Check for missing values

# confirm again that no missing values were introduced by the transformations
train_df.isna().any().any()
test_df.isna().any().any()

## Check and treat outliers

from scipy import stats

# keep only rows where every column's z-score is within 3 standard deviations of its mean
train_df = train_df[(np.abs(stats.zscore(train_df)) < 3).all(axis=1)]
test_df = test_df[(np.abs(stats.zscore(test_df)) < 3).all(axis=1)]
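In other words, a row is kept only when every one of its values lies within three standard deviations of its column mean:

$$|z| = \frac{|x - \mu|}{\sigma} < 3$$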

linreg, rmse_ = train_test_rmse(train_df, test_df, feature_cols, 'Donut Density')
print(rmse_)

# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 3.746191877394081

[('Donut Estimator 1', 0.0288295162848659),
 ('Donut Area of cross section', 0.03642288126891477),
 ('Donut Area of circumference circle', -0.023677186635323165),
 ('Donut area of central hole / Donut Area of circumscribed circle', 15.21302451820518),
 ('Donut Estimator 2', 0.013458144257911404),
 ('Donut Estimator 3', 0.016877294651321272),
 ('Donut Estimator 4', -0.08072455883436068),
 ('Donut Estimator 5', 0.08147306364736875),
 ('Donut volume Estimator 6', -0.03985676958030972),
 ('Location_Texas', -0.7118962569491505)]

linreg, rmse_ = train_test_rmse(train_df, test_df, feature_cols, 'Donut volume')
print(rmse_)

# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 9.36845277397271

[('Donut Estimator 1', 0.02302115009651563),
 ('Donut Area of cross section', 0.07538537877128039),
 ('Donut Area of circumference circle', -0.07290185928673625),
 ('Donut area of central hole / Donut Area of circumscribed circle', 38.60569659761433),
 ('Donut Estimator 2', 0.0479577123217777),
 ('Donut Estimator 3', 0.05132482421369051),
 ('Donut Estimator 4', -0.14289021476649924),
 ('Donut Estimator 5', 0.15684599831219803),
 ('Donut volume Estimator 6', -0.0934921576017296),
 ('Location_Texas', -1.7729050771242068)]

## Treat multicollinearity

# visualize the correlation matrix in Seaborn using a heatmap
sns.heatmap(train_df.corr())

# remove features with pairwise correlation above 0.7 (the target is added back by the helper)
train_df_col = remove_collinear_features(train_df, 'Donut Density', 0.7)
feature_cols_ = train_df_col.columns
test_df_col = test_df[feature_cols_]

linreg, rmse_ = train_test_rmse(train_df_col, test_df_col, feature_cols_, 'Donut Density')
print(rmse_)

# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 6.069680214474494e-15

[('Donut Estimator 1', -3.286933510054274e-17),
 ('Donut Area of cross section', -7.979727989493313e-17),
 ('Donut Area of circumference circle', 1.0581813203458523e-16),
 ('Donut area of central hole / Donut Area of circumscribed circle', -7.361732751176575e-17),
 ('Donut Estimator 2', 7.178841397205427e-16),
 ('Donut Estimator 3', 0.9999999999999999)]

linreg, rmse_ = train_test_rmse(train_df_col, test_df_col, feature_cols_, 'Donut volume')
print(rmse_)

# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 9.255956005260917e-15

[('Donut Estimator 1', 2.39049709822129e-17),
 ('Donut Area of cross section', -1.249000902703301e-16),
 ('Donut Area of circumference circle', 1.5612511283791264e-16),
 ('Donut area of central hole / Donut Area of circumscribed circle', 1.0000000000000004),
 ('Donut Estimator 2', -9.774082584956822e-16),
 ('Donut Estimator 3', 3.870601755773251e-17)]

Conclusion: After treating outliers and multicollinearity, we get an almost perfect fit on the held-out test samples.

### Prateek

An alumnus of the NIE Institute of Technology, Mysore, Prateek is an ardent Data Science enthusiast. He has been working at Acadgild as a Data Engineer for the past three years and is a subject-matter expert in Big Data, the Hadoop ecosystem, and Spark.
