UGBA 198 - 3 Lecture 2

https://ugba198.org

Ridge Regression and Cross-Validation

In [69]:
#importing the necessary packages 
%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

import numpy as np
import pandas as pd
import sklearn
In [70]:
#sklearn has implemented the regression models we will be using
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
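
An aside for readers on a current scikit-learn install: `sklearn.cross_validation` was removed in version 0.20 (its contents moved to `sklearn.model_selection`, used above), and `load_boston` was deprecated in 1.0 and removed in 1.2. A minimal sketch of building the same DataFrame without `load_boston`, following the workaround from scikit-learn's deprecation notice (it assumes the StatLib URL is still reachable):

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

#each record in the raw file is split across two physical lines
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
boston_df = pd.DataFrame(data, columns=feature_names)
boston_df['Price'] = target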
In [71]:
#loads the boston dataset into a dataframe
boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names)
print(boston.DESCR)

#add a column containing the house prices (the labels), which scikit-learn datasets store as `target`
boston_df['Price'] = boston.target

boston_df.head(5) #prints out first 5 rows of the dataset
Boston House Prices dataset
===========================

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
        - LSTAT    % lower status of the population
        - MEDV     Median value of owner-occupied homes in $1000's

    :Missing Attribute Values: None

    :Creator: Harrison, D. and Rubinfeld, D.L.

This is a copy of UCI ML housing dataset.
http://archive.ics.uci.edu/ml/datasets/Housing


This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

The Boston house-price data has been used in many machine learning papers that address regression
problems.   
     
**References**

   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.
   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.
   - many more! (see http://archive.ics.uci.edu/ml/datasets/Housing)

Out[71]:
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  PTRATIO       B  LSTAT  Price
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0     15.3  396.90   4.98   24.0
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0     17.8  396.90   9.14   21.6
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0     17.8  392.83   4.03   34.7
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0     18.7  394.63   2.94   33.4
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0     18.7  396.90   5.33   36.2
In [72]:
fullX = boston_df.drop('Price',axis=1) #fullX is a DataFrame of features
fullY = boston_df['Price'] #fullY is a Series of labels

#splitting out the test data from the training data - 30% of data is reserved for testing
X_train, X_test, y_train, y_test = train_test_split(fullX, fullY, test_size=0.3, random_state=3)
print(X_train.shape)
(354, 13)
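
Since the lecture is also about cross-validation: a single train/test split gives one noisy estimate of generalization, while k-fold cross-validation averages over k held-out folds. A minimal sketch using `cross_val_score` (the choice of `cv=5` here is arbitrary):

from sklearn.model_selection import cross_val_score

#5-fold cross-validated R^2: each fold is held out once while the
#model is fit on the remaining four folds
cv_scores = cross_val_score(LinearRegression(), X_train, y_train, cv=5)
print("mean cross-validated R^2:", cv_scores.mean())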
In [73]:
#histogram of prices
fig = plt.figure()
ax1 = plt.axes()

ax1.hist(fullY)

ax1.grid(axis='y', alpha=0.75)
ax1.set_xlabel('Price (in Thousands of Dollars)')
ax1.set_ylabel('Frequency')
ax1.set_title('Price Histogram')
Out[73]:
Text(0.5, 1.0, 'Price Histogram')
In [15]:
lr = LinearRegression()
lr.fit(X_train, y_train)

#score returns the coefficient of determination R^2 of the prediction.
train_score=lr.score(X_train, y_train)
test_score=lr.score(X_test, y_test)

print("ordinary linear regression train score:", train_score)
print("ordinary linear regression test score:", test_score)

fig = plt.figure()
ax2 = plt.axes()

ax2.plot(lr.coef_,alpha=0.4,linestyle='none',marker='o',markersize=7,color='green',label='Linear Regression')

ax2.set_xlabel('Coefficient Index',fontsize=16)
ax2.set_ylabel('Coefficient Magnitude',fontsize=16)

ax2.legend(fontsize=13,loc=4)
ordinary linear regression train score: 0.7419034960343789
ordinary linear regression test score: 0.7146895989294313
Out[15]:
<matplotlib.legend.Legend at 0x1a1f7556a0>
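
For reference, R^2 = 1 - RSS/TSS, where RSS is the residual sum of squares and TSS the total sum of squares around the mean. A short sketch reproducing the test score by hand (cross-checked against `sklearn.metrics.r2_score`):

from sklearn.metrics import r2_score

y_pred = lr.predict(X_test)
rss = np.sum((y_test - y_pred) ** 2)         #residual sum of squares
tss = np.sum((y_test - y_test.mean()) ** 2)  #total sum of squares
print(1 - rss / tss)            #matches lr.score(X_test, y_test)
print(r2_score(y_test, y_pred)) #same value via sklearn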
In [74]:
# Plot outputs


fig = plt.figure()
ax3 = plt.axes(projection='3d')


#chose two of the features (RM, index 5, and DIS, index 7) to plot
y_predict = lr.predict(X_test)
ax3.scatter3D(X_test.values[:,5], X_test.values[:,7], y_predict, color='black')

X,Y = np.meshgrid(np.arange(min(X_test.values[:,5]), max(X_test.values[:,5]), 1), 
                  np.arange(min(X_test.values[:,7]), max(X_test.values[:,7]), 1))
C = lr.coef_
print(lr.coef_.shape)
Z = C[5]*X + C[7]*Y + lr.intercept_

#note that the coefficients and intercept come from the full 13-feature fit, not a fit on the two plotted variables, so the plane is skewed
ax3.plot_surface(X, Y, Z, rstride=1, cstride=1, alpha=0.2, color="green")

ax3.set_xlabel(X_test.columns[5])
ax3.set_ylabel(X_test.columns[7])
ax3.set_zlabel(y_test.name)

plt.show()
(13,)
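
Because the plane above borrows its coefficients and intercept from the full 13-feature fit, it is not the least-squares plane for RM and DIS alone. A sketch of the unskewed alternative, refitting on just the two plotted columns (it reuses the meshgrid X, Y from the cell above; lr2 is a new name introduced here):

lr2 = LinearRegression()
lr2.fit(X_train[['RM', 'DIS']], y_train)
Z2 = lr2.coef_[0]*X + lr2.coef_[1]*Y + lr2.intercept_
#plotting Z2 instead of Z gives the exact 2-feature fit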
In [37]:
fig = plt.figure()
ax4 = plt.axes()

rr = Ridge(alpha=0.01) # the higher the alpha value, the more the coefficients are restricted; with a low alpha the
# coefficients are barely restricted, so ridge regression resembles ordinary linear regression
rr.fit(X_train, y_train)


Ridge_train_score = rr.score(X_train,y_train)
Ridge_test_score = rr.score(X_test, y_test)


print("ridge regression train score low alpha:", Ridge_train_score)
print("ridge regression test score low alpha:", Ridge_test_score)

ax4.plot(rr.coef_, alpha=1,linestyle='none', marker='*',
         markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markers

ax4.set_xlabel('Coefficient Index',fontsize=16)
ax4.set_ylabel('Coefficient Magnitude',fontsize=16)

ax4.legend(fontsize=13,loc=4)
ridge regression train score low alpha: 0.7419030253527293
ridge regression test score low alpha: 0.7145115044376253
Out[37]:
<matplotlib.legend.Legend at 0x1a211f28d0>
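
For reference, scikit-learn's Ridge minimizes the L2-penalized least-squares objective

\min_w \; \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_2^2

so alpha trades data fit against coefficient size: the penalty shrinks coefficients toward zero but never exactly to zero.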
In [75]:
fig = plt.figure()
ax5 = plt.axes()

rr100 = Ridge(alpha=100) # comparison with a high alpha value
rr100.fit(X_train, y_train)

Ridge_train_score100 = rr100.score(X_train,y_train)
Ridge_test_score100 = rr100.score(X_test, y_test)


print("ridge regression train score high alpha:", Ridge_train_score100)
print("ridge regression test score high alpha:", Ridge_test_score100)

ax5.plot(rr100.coef_,alpha=0.5,linestyle='none',marker='d',
         markersize=6,color='blue',label=r'Ridge; $\alpha = 100$') # alpha here is for transparency

ax5.set_xlabel('Coefficient Index',fontsize=16)
ax5.set_ylabel('Coefficient Magnitude',fontsize=16)

ax5.legend(fontsize=13,loc=4)
ridge regression train score high alpha: 0.7172809669938278
ridge regression test score high alpha: 0.6805838894730997
Out[75]:
<matplotlib.legend.Legend at 0x1a1f63c438>

Lasso Regression

In [76]:
fig = plt.figure()
ax6 = plt.axes()

lasso = Lasso(alpha=0.1) # lasso with a moderate alpha value for comparison
lasso.fit(X_train, y_train)

lasso_train_score = lasso.score(X_train,y_train)
lasso_test_score = lasso.score(X_test, y_test)


print("lasso regression train score .1 alpha:", lasso_train_score)
print("lasso regression test score .1 alpha:", lasso_test_score)

ax6.plot(lasso.coef_,alpha=0.5,linestyle='none',marker='x',
         markersize=6,color='black',label=r'Lasso; $\alpha = 0.1$') # alpha here is for transparency

ax6.set_xlabel('Coefficient Index',fontsize=16)
ax6.set_ylabel('Coefficient Magnitude',fontsize=16)

ax6.legend(fontsize=13,loc=4)
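
Lasso swaps ridge's L2 penalty for an L1 penalty; scikit-learn's objective is

\min_w \; \frac{1}{2 n_{\text{samples}}} \lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_1

Note the 1/(2n) scaling on the data term, which makes lasso's alpha not directly comparable to ridge's. The L1 penalty drives some coefficients exactly to zero, so lasso doubles as feature selection; a one-line check on the fit above:

print(np.sum(lasso.coef_ == 0), "coefficients driven exactly to zero")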