Free Shipping

Secure Payment

easy returns

# Linear regression Case Study 1

## Case Study on Donut Data Set using Linear Regression

The Donut data set was provided in an interview round and it was asked to solve the problem statement hands on. The data set can be accessed through the following link.

# read the data and set the datetime as the index
# import all necessary libraries and tools
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np
# list all columns of test and train data
train_df.columns

Output: Index([â€˜Donut IDâ€™, â€˜Donut Estimator 1â€™, â€˜Donut Area of cross sectionâ€™,

â€˜Donut Area of circumference circleâ€™,

â€˜Donut area of central hole / Donut Area of circumscribed circleâ€™,

â€˜DonutÂ  Estimator 2â€™, â€˜DonutÂ  Estimator 3â€™, â€˜DonutÂ  Estimator 4â€™,

â€˜DonutÂ  Estimator 5â€™, â€˜Donut volume Estimator 6â€™, â€˜Locationâ€™,

â€˜Donut Densityâ€™, â€˜Donut volumeâ€™],

dtype=â€™objectâ€™)

## Handling categorical features

# create dummy variables of train data
season_dummies = pd.get_dummies(train_df.Location, prefix=‘Location’)
# drop the first column
season_dummies.drop(season_dummies.columns[0], axis=1, inplace=True)
# concatenate the original DataFrame and the dummy DataFrame (axis=0 means rows, axis=1 means columns)
train_df = pd.concat([train_df, season_dummies], axis=1)
train_df.drop([‘Donut ID’,‘Location’],axis=1,inplace=True)
# print 5 random rows
train_df.sample(n=5, random_state=1)
# create dummy variables for the test data
season_dummies = pd.get_dummies(test_df.Location, prefix=‘Location’)
# drop the first column
season_dummies.drop(season_dummies.columns[0], axis=1, inplace=True)
# concatenate the original DataFrame and the dummy DataFrame (axis=0 means rows, axis=1 means columns)
test_df = pd.concat([test_df, season_dummies], axis=1)
test_df.drop([‘Donut ID’,‘Location’],axis=1,inplace=True)
# print 5 random rows
test_df.sample(n=5, random_state=1)

## Visualizing the data

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams[‘figure.figsize’] = (8, 6)
plt.rcParams[‘font.size’] = 14
# explore more features
feature_cols = [val for val in train_df.columns if val not in [‘Donut Density’,‘Donut volume’]]
feature_cols

Output: [â€˜Donut Estimator 1â€™,

â€˜Donut Area of cross sectionâ€™,

â€˜Donut Area of circumference circleâ€™,

â€˜Donut area of central hole / Donut Area of circumscribed circleâ€™,

â€˜DonutÂ  Estimator 2â€™,

â€˜DonutÂ  Estimator 3â€™,

â€˜DonutÂ  Estimator 4â€™,

â€˜DonutÂ  Estimator 5â€™,

â€˜Donut volume Estimator 6â€™,

â€˜Location_Texasâ€™]

# multiple scatter plots in Pandas
fig, axs = plt.subplots(1, len(feature_cols), sharey=True)
for index, feature in enumerate(feature_cols):
train_df.plot(kind=‘scatter’, x=feature, y=‘Donut volume’, ax=axs[index], figsize=(16, 3))
# line plot of rentals
train_df[‘Donut Density’].plot()
# line plot of rentals
train_df[‘Donut volume’].plot()

Check for multi-collinearity

def remove_collinear_features(x, target , threshold):
”’
Objective:
Remove collinear features in a dataframe with a correlation coefficient
greater than the threshold. Removing collinear features can help a model
to generalize and improves the interpretability of the model.
Inputs:
threshold: any features with correlations greater than this value are removed
Output:
dataframe that contains only the non-highly-collinear features
”’
# # Define y and x
y = x[target]
x = x.drop(columns = [target])
# Calculate the correlation matrix
corr_matrix = x.corr()
iters = range(len(corr_matrix.columns)1)
drop_cols = []
# Iterate through the correlation matrix and compare correlations
for i in iters:
for j in range(i):
item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
col = item.columns
row = item.index
val = abs(item.values)
# If correlation exceeds the threshold
if val >= threshold:
# Print the correlated features and the correlation value
print(col.values[0], “|”, row.values[0], “|”, round(val[0][0], 2))
drop_cols.append(col.values[0])
# Drop one of each pair of correlated columns
drops = set(drop_cols)
x = x.drop(columns = drops)
# Add the score back in to the data
x[target] = y
return x
import matplotlib.pyplot as plt
def plot_line(y, predicted,target):
fig, ax = plt.subplots()
ax.scatter(y, predicted, edgecolors=(0, 1, 1))
ax.plot([y.min(), y.max()], [y.min(), y.max()], ‘k–‘, lw=4)
ax.set_xlabel(‘Actual {} ‘.format(target))
ax.set_ylabel(‘Predicted {} ‘.format(target))
plt.show()

## Creating baseline with null RMSE

Null RMSE is the RMSE that could be achieved byÂ always predicting the mean response value. It is a benchmark against which you may want to measure your regression model.

# define a function that accepts a list of features and returns testing RMSE
def get_baseline(feature_cols,target):
X_train, X_test, y_train, y_test = train_df[feature_cols] , test_df[feature_cols] , train_df[target] , test_df[target]
# create a NumPy array with the same shape as y_test
y_null = np.zeros_like(y_test, dtype=float)
# fill the array with the mean value of y_test
y_null.fill(y_test.mean())
# compute null RMSE
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_null))
baseline_guess = np.median(y_train)
print(‘The baseline guess for %s is a score of %0.2f’ % (target,baseline_guess))
print(“Baseline Performance on the test set for %s : RMSE = %0.4f” % (target,rmse))
get_baseline(feature_cols,‘Donut volume’)

Output: The baseline guess for Donut volume is a score of 83.40

Baseline Performance on the test set for Donut volume: RMSE = 10.5856

Fitting on entire features

# define a function that accepts a list of features and returns testing RMSE
def train_test_rmse(train_df,test_df,feature_cols,target):
X = train_df[feature_cols] y = train_df[target] X_train, X_test, y_train, y_test = train_df[feature_cols] , test_df[feature_cols] , train_df[target] , test_df[target] linreg = LinearRegression()
linreg.fit(X_train, y_train)
y_pred = linreg.predict(X_test)
plot_line(y_test,y_pred,target)
return linreg,np.sqrt(metrics.mean_squared_error(y_test, y_pred))
# compare different sets of features
# â€˜Donut Densityâ€™, â€˜Donut volumeâ€™
# Donut Density
linreg,rmse_ = train_test_rmse(train_df,test_df,feature_cols,â€™Donut Densityâ€™)
print (rmse_)
#print(linreg.score(X_test, y_test))
# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 4.205028275576086

[(â€˜Donut Estimator 1â€™, 0.019969286018910806),

(â€˜Donut Area of cross sectionâ€™, 0.056824378116820666),

(â€˜Donut Area of circumference circleâ€™, -0.011141124820625037),

(â€˜Donut area of central hole / Donut Area of circumscribed circleâ€™,

18.37532899880506),

(â€˜DonutÂ  Estimator 2â€™, -0.013662198041061776),

(â€˜DonutÂ  Estimator 3â€™, 0.03211097048326648),

(â€˜DonutÂ  Estimator 4â€™, 0.0005844272728200844),

(â€˜DonutÂ  Estimator 5â€™, 0.001613921718232509),

(â€˜Donut volume Estimator 6â€™, -0.05474623086568598),

(â€˜Location_Texasâ€™, -2.3923430721543717)]

# Donut volume’
linreg,rmse_ = train_test_rmse(train_df,test_df,feature_cols,‘Donut volume’)
print (rmse_)
# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 9.625589368125372

[(â€˜Donut Estimator 1â€™, 0.015167756318128774),

(â€˜Donut Area of cross sectionâ€™, 0.12148190231269254),

(â€˜Donut Area of circumference circleâ€™, -0.06206356694138626),

(â€˜Donut area of central hole / Donut Area of circumscribed circleâ€™,

49.63679964426164),

(â€˜DonutÂ  Estimator 2â€™, -0.016590844171362112),

(â€˜DonutÂ  Estimator 3â€™, 0.1104276320893024),

(â€˜DonutÂ  Estimator 4â€™, 0.004140928646805438),

(â€˜DonutÂ  Estimator 5â€™, 0.00040124955448831573),

(â€˜Donut volume Estimator 6â€™, -0.1388884821714789),

(â€˜Location_Texasâ€™, -5.674542346646663)]

Check for missing values

train_df.isna().any().any()
test_df.isna().any().any()

Check and treat outliers

import numpy as np
from scipy import stats
train_df = train_df[(np.abs(stats.zscore(train_df)) < 3).all(axis=1)]
test_df = test_df[(np.abs(stats.zscore(test_df)) < 3).all(axis=1)]
linreg,rmse_ = train_test_rmse(train_df,test_df,feature_cols,‘Donut Density’)
print (rmse_)
# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 3.746191877394081

[(â€˜Donut Estimator 1â€™, 0.0288295162848659),

(â€˜Donut Area of cross sectionâ€™, 0.03642288126891477),

(â€˜Donut Area of circumference circleâ€™, -0.023677186635323165),

(â€˜Donut area of central hole / Donut Area of circumscribed circleâ€™,

15.21302451820518),

(â€˜DonutÂ  Estimator 2â€™, 0.013458144257911404),

(â€˜DonutÂ  Estimator 3â€™, 0.016877294651321272),

(â€˜DonutÂ  Estimator 4â€™, -0.08072455883436068),

(â€˜DonutÂ  Estimator 5â€™, 0.08147306364736875),

(â€˜Donut volume Estimator 6â€™, -0.03985676958030972),

(â€˜Location_Texasâ€™, -0.7118962569491505)]

linreg,rmse_ = train_test_rmse(train_df,test_df,feature_cols,‘Donut volume’)
print (rmse_)
# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 9.36845277397271

[(â€˜Donut Estimator 1â€™, 0.02302115009651563),

(â€˜Donut Area of cross sectionâ€™, 0.07538537877128039),

(â€˜Donut Area of circumference circleâ€™, -0.07290185928673625),

(â€˜Donut area of central hole / Donut Area of circumscribed circleâ€™,

38.60569659761433),

(â€˜DonutÂ  Estimator 2â€™, 0.0479577123217777),

(â€˜DonutÂ  Estimator 3â€™, 0.05132482421369051),

(â€˜DonutÂ  Estimator 4â€™, -0.14289021476649924),

(â€˜DonutÂ  Estimator 5â€™, 0.15684599831219803),

(â€˜Donut volume Estimator 6â€™, -0.0934921576017296),

(â€˜Location_Texasâ€™, -1.7729050771242068)]

## Treat multi-collinearity

# visualize correlation matrix in Seaborn using a heatmap
sns.heatmap(train_df.corr())

train_df_col = remove_collinear_features(train_df,‘Donut Density’,0.7)
feature_cols_ = train_df_col.columns
test_df_col = test_df[feature_cols_]
linreg,rmse_ = train_test_rmse(train_df_col,test_df_col,feature_cols_,‘Donut Density’)
print (rmse_)
# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 6.069680214474494e-15

[(â€˜Donut Estimator 1â€™, -3.286933510054274e-17),

(â€˜Donut Area of cross sectionâ€™, -7.979727989493313e-17),

(â€˜Donut Area of circumference circleâ€™, 1.0581813203458523e-16),

(â€˜Donut area of central hole / Donut Area of circumscribed circleâ€™,

-7.361732751176575e-17),

(â€˜DonutÂ  Estimator 2â€™, 7.178841397205427e-16),

(â€˜DonutÂ  Estimator 3â€™, 0.9999999999999999)]

linreg,rmse_ = train_test_rmse(train_df_col,test_df_col,feature_cols_,‘Donut volume’)
print (rmse_)
# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

Output: 9.255956005260917e-15

Out[30]:

[(â€˜Donut Estimator 1â€™, 2.39049709822129e-17),

(â€˜Donut Area of cross sectionâ€™, -1.249000902703301e-16),

(â€˜Donut Area of circumference circleâ€™, 1.5612511283791264e-16),

(â€˜Donut area of central hole / Donut Area of circumscribed circleâ€™,

1.0000000000000004),

(â€˜DonutÂ  Estimator 2â€™, -9.774082584956822e-16),

(â€˜DonutÂ  Estimator 3â€™, 3.870601755773251e-17)]

Â Conclusion:Â After treating outliers and multicollinearity we get almost perfect fit on out of bag samples.