This is my lab from DS-GA-1003 NYU
I put all my code in my github

Lab1 Gradient Descent

Import and Init Dataset

compile feature normalization first

python
1
2
3
4
5
6
# Import
import matplotlib.pyplot as plt
import numpy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
python
1
2
3
4
5
6
7
8
# Init data
df = pd.read_csv('hw1-data.csv', delimiter=',')
X = df.values[:, :-1]
y = df.values[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100, random_state=10)
X_train, X_test = feature_normalization(X_train, X_test)
X_train = np.hstack((X_train, np.ones((X_train.shape[0], 1)))) # Add bias term
X_test = np.hstack((X_test, np.ones((X_test.shape[0], 1)))) # Add bias term

Feature Normalization

only operate train dataset.
shift all data to ensure they are all larger than 0, then rescale them into [0,1]

Xnorm=XXminXmaxXmin

Wanrning

数据标准化必须以特征列为单位。不能所有数据共同标准化。因为不同feature的数据尺度不同。

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def feature_normalization(train: np.ndarray, test: np.ndarray) -> [np.ndarray, np.ndarray]:
"""Rescale the data so that each feature in the training set is in
the interval [0,1], and apply the same transformations to the test
set, using the statistics computed on the training set.

Args:
train - training set, a 2D numpy array of size (num_instances, num_features)
test - test set, a 2D numpy array of size (num_instances, num_features)
Returns:
train_normalized - training set after normalization
test_normalized - test set after normalization

"""
min_value: np.ndarray = np.min(train, axis=0)
max_value: np.ndarray = np.max(train, axis=0)

for index in range(len(max_value)):
if min_value[index] == max_value[index]:
min_value[index] = 0

train_normalized = (train - min_value) / (max_value - min_value + 0.0)
test_normalized = (test - min_value) / (max_value - min_value + 0.0)

return train_normalized, test_normalized

Linear Regression with Full Batch

Compute Square Loss

Given a set of X, y, theta, compute the square loss for predicting y with X*theta
loss func: J(θ)=12mi=1m(hθ(xi)yi)2
In matrix formulation, hθ(xi)=θTxi
So J(θ)=12m(XθTy)(XθTy)T

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def compute_square_loss(X: np.ndarray, y: np.ndarray, theta: np.ndarray) -> float:
"""
Given a set of X, y, theta, compute the square loss for predicting y with X*theta

Args:
X - the feature vector, 2D numpy array of size (num_instances, num_features)
y - the label vector, 1D numpy array of size (num_instances)
theta - the parameter vector, 1D array of size (num_features)

Returns:
loss - the square loss, scalar
"""

num_instances = X.shape[0]
auction = np.dot(X, theta)
difference: np.ndarray = auction - y

loss: float = 0.5 / num_instances * np.dot(difference, difference)
return loss

Compute Square Loss Gradient

J(θ)=12m(XθTy)(XθTy)T
θJ(θ)=1m(XθTy)X

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def compute_square_loss_gradient(X: np.ndarray, y: np.ndarray, theta: np.ndarray) -> np.ndarray:
"""
Compute gradient of the square loss (as defined in compute_square_loss), at the point theta.

Args:
X - the feature vector, 2D numpy array of size (num_instances, num_features)
y - the label vector, 1D numpy array of size (num_instances)
theta - the parameter vector, 1D numpy array of size (num_features)

Returns:
grad - gradient vector, 1D numpy array of size (num_features)
"""

num_instances = X.shape[0]
difference: np.ndarray = np.dot(X, theta) - y
grad = 1.0 / (num_instances+0.0) * np.dot(difference.T, X)
return grad

Gradient Check

Check that the function compute_square_loss_gradient returns the correct gradient for the given X, y, and theta.
For all directions: let e1=(1,0,0,,0),e2=(0,1,0,,0),,ed=(0,,0,1)
Use approximation: J(θ+epsilonei)J(θepsilonei)(2epsilon) for each direction
Then for each direction of gradient, check if Euclidean distance of approximation and computed is exceed tolerance(1e-4 default)

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
def grad_checker(X: np.ndarray, y: np.ndarray, theta: np.ndarray, epsilon=0.01, tolerance=1e-4) -> bool:
"""Implement Gradient Checker
Check that the function compute_square_loss_gradient returns the
correct gradient for the given X, y, and theta.

Let d be the number of features. Here we numerically estimate the
gradient by approximating the directional derivative in each of
the d coordinate directions:
(e_1 = (1,0,0,...,0), e_2 = (0,1,0,...,0), ..., e_d = (0,...,0,1)

The approximation for the directional derivative of J at the point
theta in the direction e_i is given by:
( J(theta + epsilon * e_i) - J(theta - epsilon * e_i) ) / (2*epsilon).

We then look at the Euclidean distance between the gradient
computed using this approximation and the gradient computed by
compute_square_loss_gradient(X, y, theta). If the Euclidean
distance exceeds tolerance, we say the gradient is incorrect.

Args:
X - the feature vector, 2D numpy array of size (num_instances, num_features)
y - the label vector, 1D numpy array of size (num_instances)
theta - the parameter vector, 1D numpy array of size (num_features)
epsilon - the epsilon used in approximation
tolerance - the tolerance error

Return:
A boolean value indicate whether the gradient is correct or not

"""
true_gradient: np.ndarray = compute_square_loss_gradient(X, y, theta) # the true gradient
num_features = theta.shape[0]
approx_grad: np.ndarray = np.zeros(num_features) # Initialize the gradient we approximate
# TODO
for index in range(num_features):
e_i = np.zeros(num_features)
e_i[index] = 1
theta_plus = theta + epsilon * e_i
theta_minus = theta - epsilon * e_i
approx_grad[index] = \
(compute_square_loss(X, y, theta_plus) - compute_square_loss(X, y, theta_minus)) \
/ (2 * epsilon)
distance = np.linalg.norm(approx_grad-true_gradient)
return distance < tolerance

Generic Gradient Checker

Similar as Gradient Checker
The functions takes objective_func and gradient_func as parameters. And check whether gradient_func(X, y, theta) returned
the true gradient for objective_func(X, y, theta).
Eg: In LSR, the objective_func = compute_square_loss, and gradient_func = compute_square_loss_gradient

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
def generic_gradient_checker(X, y, theta, objective_func, gradient_func, epsilon=0.01, tolerance=1e-4):
"""
The functions takes objective_func and gradient_func as parameters. And check whether gradient_func(X, y, theta) returned
the true gradient for objective_func(X, y, theta).
Eg: In LSR, the objective_func = compute_square_loss, and gradient_func = compute_square_loss_gradient
"""
true_gradient: np.ndarray = gradient_func(X, y, theta) # the true gradient
num_features = theta.shape[0]
approx_grad: np.ndarray = np.zeros(num_features) # Initialize the gradient we approximate

for index in range(num_features):
e_i = np.zeros(num_features)
e_i[index] = 1
theta_plus = theta + epsilon * e_i
theta_minus = theta - epsilon * e_i
approx_grad[index] = \
(objective_func(X, y, theta_plus) - objective_func(X, y, theta_minus)) \
/ (2 * epsilon)
distance = np.linalg.norm(approx_grad - true_gradient)
return distance < tolerance

Batch Grad Descent

每次迭代计算梯度,然后θ下降一次梯度

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def batch_grad_descent(X: np.ndarray, y: np.ndarray, step_size=0.1, num_iter=1000, check_gradient=False) \
-> [np.ndarray, np.ndarray]:
"""
In this question you will implement batch gradient descent to
minimize the square loss objective

Args:
X - the feature vector, 2D numpy array of size (num_instances, num_features)
y - the label vector, 1D numpy array of size (num_instances)
step - step size in gradient descent
num_iter - number of iterations to run
check_gradient - a boolean value indicating whether checking the gradient when updating

Returns:
theta_hist - store the history of parameter vector in iteration, 2D numpy array of size (num_iter+1, num_features)
for instance, theta in iteration 0 should be theta_hist[0], theta in iteration (num_iter) is theta_hist[-1]
loss_hist - the history of objective function vector, 1D numpy array of size (num_iter+1)
"""
num_instances, num_features = X.shape[0], X.shape[1]
theta_hist = np.zeros((num_iter + 1, num_features)) # Initialize theta_hist
loss_hist = np.zeros(num_iter + 1) # initialize loss_hist
theta = np.ones(num_features) # initialize theta
# TODO
theta_hist[0] = theta
loss_hist[0] = compute_square_loss(X, y, theta)

for iteration in range(1, num_iter + 1):
if check_gradient:
assert (grad_checker(X, y, theta) is True)
grad = compute_square_loss_gradient(X, y, theta)
theta = theta - step_size*grad.transpose()
theta_hist[iteration] = theta
loss = compute_square_loss(X, y, theta.transpose())
loss_hist[iteration] = loss

return theta_hist, loss_hist

Draw Convergence Diagram

Try step sizes [0.01, 0.05, 0.1, 0.101, 0.2]. Plot the value of the objective function as a function of the number of steps for each step sizes. Briefly summarize your findings.

Note

If step size is too small, it will converge will slow. If step size is too large, it will oscillate and therefore, never converge.

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def draw(theta_hist, loss_hist, step_size, num_iter):
x_index = []
for i in range(num_iter + 1):
x_index.append(i)
plt.subplot(1, 2, 1)
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.title('Loss over Step Size')
if loss_hist[-1] > 1000:
plt.yscale('log')
plt.plot(x_index, loss_hist, label=step_size)
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(x_index, theta_hist.transpose()[0], label=step_size)
plt.xlabel('Iterations')
plt.ylabel('Theta')
plt.title('Theta over Step Size')
if theta_hist.transpose()[0][-1] > 100:
plt.yscale('symlog')
plt.legend()
plt.show()


def converge_test(X, y, num_iter=100):
step_sizes = np.array([0.0001, 0.01, 0.05, 0.1, 0.101, 0.2])
x_index = []
for i in range(num_iter + 1):
x_index.append(i)

for step_size in step_sizes:
theta_hist, loss_hist = batch_grad_descent(X, y,
step_size=step_size,
num_iter=num_iter,
check_gradient=False)
draw(theta_hist, loss_hist, step_size, num_iter)


converge_test(X_train, y_train)

png

png

png

png

png

png

Ridge Regression with L2 Regularized

Compute Regularized Square Loss Gradient

In Ridge Regression
Given a set of X, y, theta, compute the square loss for predicting y with X*theta
loss func: J(θ)=12mi=1m(hθ(xi)yi)2+λθTθ
In matrix formulation, hθ(xi)=θTxi
So J(θ)=12m(XθTy)(XθTy)T+λθTθ
J(θ)=1m(xθy)Tx+2λθT

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
def compute_regularized_square_loss_gradient(X, y, theta, lambda_reg):
"""
Compute the gradient of L2-regularized square loss function given X, y and theta

Args:
X - the feature vector, 2D numpy array of size (num_instances, num_features)
y - the label vector, 1D numpy array of size (num_instances)
theta - the parameter vector, 1D numpy array of size (num_features)
lambda_reg - the regularization coefficient

Returns:
grad - gradient vector, 1D numpy array of size (num_features)
"""
# TODO
square_loss_gradient = compute_square_loss_gradient(X, y, theta)
return square_loss_gradient + 2*lambda_reg*theta.T

Regularized Grad Descent

same as line regression, except use regularized squre loss function

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
def regularized_grad_descent(X, y, step_size=0.05, lambda_reg=1, num_iter=1000):
"""
Args:
X - the feature vector, 2D numpy array of size (num_instances, num_features)
y - the label vector, 1D numpy array of size (num_instances)
alpha - step size in gradient descent
lambda_reg - the regularization coefficient
numIter - number of iterations to run

Returns:
theta_hist - the history of parameter vector, 2D numpy array of size (num_iter+1, num_features)
loss_hist - the history of regularized loss value, 1D numpy array
"""
num_instances, num_features = X.shape[0], X.shape[1]
theta_hist = np.zeros((num_iter + 1, num_features)) # Initialize theta_hist
loss_hist = np.zeros(num_iter + 1) # initialize loss_hist
theta = np.ones(num_features) # initialize theta
theta_hist[0] = theta
loss_hist[0] = compute_square_loss(X, y, theta)

for iteration in range(1, num_iter + 1):
grad = compute_regularized_square_loss_gradient(X, y, theta, lambda_reg)
theta = theta - step_size * grad.T
theta_hist[iteration] = theta
loss_hist[iteration] = compute_square_loss(X, y, theta)

return theta_hist, loss_hist

Visualize Ridge Regression

python
1
2
3
4
5
6
7
8
def visualize_regularized_batch_gradient_descent(X, y):
num_iter = 10
for lambda_reg in [1e-5, 1e-3, 1e-1, 1, 10, 100]:
theta_hist, loss_hist = regularized_grad_descent(X, y, lambda_reg = lambda_reg, num_iter=num_iter)
draw(theta_hist, loss_hist, lambda_reg, num_iter)


visualize_regularized_batch_gradient_descent(X_train, y_train)

png

png

png

png

png

png

Stochastic Grad Descent(SGD)

If dataset is too large, compute loss gradient for the whole dataset is time-comsuming.
So we can divide a dataset into many batch, i.e. minibatch
Epoch: if you run some minibatch that coverge the dataset, then you run 1 epoch
The batch size of the code is 50% of whole dataset.
The epoch is 100

4 step size methods:

if step_size is a float, then the step size in every iteration is constant.
if step_size == “1/sqrt(t)”, step size=1t
if step_size == “1/t”, StepSize=1t
if step_size == “frac”, StepSize=StepSize01+StepSize0lambdat, whereas StepSize0 is set up by you

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def stochastic_grad_descent(X, y, step_size=0.1, lambda_reg: float = 1, num_iter=100) -> [np.ndarray, np.ndarray]:
"""
In this question you will implement stochastic gradient descent with a regularization term

Args:
:param X - the feature vector, 2D numpy array of size (num_instances, num_features)
:param y - the label vector, 1D numpy array of size (num_instances)
:param step_size - string or float. step size in gradient descent
NOTE: In SGD, it's not always a good idea to use a fixed step size. Usually it's set to 1/sqrt(t) or 1/t
if step_size is a float, then the step size in every iteration is alpha.
if step_size == "1/sqrt(t)", alpha = 1/sqrt(t)
if step_size == "1/t", alpha = 1/t
if step_size == "frac", step_size = step_size_0/(1+step_size_0*lambda*t)"
:param lambda_reg - the regularization coefficient
:param num_iter - number of epochs (i.e. number of times) to go through the whole training set

Returns:
:return theta_hist - the history of parameter vector, 3D numpy array of size (num_iter, num_instances, num_features)
:return loss hist - the history of regularized loss function vector, 2D numpy array of size(num_iter, num_instances)
"""
num_instances, num_features = X.shape[0], X.shape[1]
theta = np.ones(num_features) # Initialize theta

theta_hist = np.zeros((num_iter + 1, num_features)) # Initialize theta_hist
loss_hist = np.zeros(num_iter + 1) # Initialize loss_hist
theta_hist[0] = theta
loss_hist[0] = compute_square_loss(X, y, theta)
step_size_0: float = 1.0
step_size_method = step_size
if step_size_method == "frac":
step_size_0: float = 0.1

for t in range(1, num_iter + 1):
if step_size_method == "1/sqrt(t)":
step_size = 1.0 / np.sqrt(t + 1)
elif step_size_method == "1/t":
step_size = 1 / (t + 1)
elif step_size_method == "frac":
step_size = step_size_0 / (1.0 + step_size_0 * lambda_reg * (t + 1.0))
else:
pass

X_1, X_2, y_1, y_2 = train_test_split(X, y, test_size=0.5, random_state=10)
X_1, X_2 = feature_normalization(X_1, X_2)
grad = compute_regularized_square_loss_gradient(X_1, y_1, theta, lambda_reg)
theta = theta - step_size * grad.T
grad = compute_regularized_square_loss_gradient(X_2, y_2, theta, lambda_reg)
theta = theta - step_size * grad.T
loss_hist[t] = compute_square_loss(X, y, theta) + np.dot(theta, theta) * lambda_reg
theta_hist[t] = theta

return theta_hist, loss_hist

Visualize SGD by 4 methods

For “0.05”, it need you to choose this manually.
For “1/sqrt(1)” and “1/t”, they are diverged!
For “frac”, it’s oscillated but converged finally!
The reason why “1/sqrt(1)” and “1/t” are diverged is that: they have too large step size initially that let them run out of the curve. And their step sizes decrease too fast that they can not turn back to the minimum point. And frac mode solve this by decreasing slowly.

python
1
2
3
4
5
6
7
8
9
def visualize_sgd(X, y):
num_iter = 1000
for step_size in [0.05, "1/sqrt(t)", "1/t", "frac"]:
theta_hist, loss_hist = stochastic_grad_descent(X, y, step_size,
lambda_reg=1e-5, num_iter=num_iter)
draw(theta_hist, loss_hist, step_size, num_iter)


visualize_sgd(X_train, y_train)

png

png

png

png

Compare Full Batch and SGD

SGD converge a little quickly with comparing of iteration times. But Full Batch can adapt more large step size.

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def visualize_compare_batch_and_sgd(X, y):
num_iter = 100
for step_size in [0.001, 0.01, 0.05, 0.1, 0.2]:
theta_hist_batch, loss_hit_batch = batch_grad_descent(X, y, step_size, num_iter)
theta_hist_sgd, loss_hit_sgd = stochastic_grad_descent(X, y,
step_size=step_size,
num_iter=num_iter,
lambda_reg=1e-4)
plt.subplot(1, 2, 1)
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.title('Loss over Step Size = {0}'.format(step_size))
plt.plot(loss_hit_batch, label="Full Batch")
plt.plot(loss_hit_sgd, label="SGD")
if loss_hit_batch[-1] > 1000:
plt.yscale('log')
plt.legend()

plt.subplot(1, 2, 2)
plt.xlabel('Iterations')
plt.ylabel('Theta')
plt.title('Theta over Step Size = {0}'.format(step_size))
plt.plot(theta_hist_batch.transpose()[0], label="Full Batch")
plt.plot(theta_hist_sgd.transpose()[0], label="SGD")
if loss_hit_batch[-1] > 1000:
plt.yscale('log')
plt.legend()
plt.show()


visualize_compare_batch_and_sgd(X_train, y_train)

png

png

png

png

png

Overfit Analyse

Since you have different dataset for training and testing, if you apply for too large iterations when train, the model will just more only “remember” the train dataset(overfit). The model therefore are lack of generalization and get larger mean square error in test dataset
Therefore, we can add L2 regularization to solve this. But L2 will make trivial contribution if too small, or cause too large bias if too large.

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def overfit_analyse(X_train, y_train, X_test, y_test):
num_iter = 10000
for step_size in [0.05, 0.01]:
theta_hist_non_reg, _ = \
batch_grad_descent(X_train, y_train, step_size=step_size, num_iter=num_iter)

loss_trace_non_reg: list = \
[compute_square_loss(X_test, y_test, theta) for theta in theta_hist_non_reg]
min_index = np.argmin(loss_trace_non_reg)
plt.plot(loss_trace_non_reg[100:-1], label="No Regularization")
show_max = '[' + str(min_index) + ',' + str(loss_trace_non_reg[min_index])[0:4] + ']'
plt.annotate(show_max, xytext=(min_index - 100, loss_trace_non_reg[min_index]),
xy=(min_index - 100, loss_trace_non_reg[min_index]))

for lambda_reg in [1e-3, 1e-2, 0.1]:
theta_hist_reg, _ = \
regularized_grad_descent(X_train, y_train,
lambda_reg=lambda_reg, step_size=step_size, num_iter=num_iter)

loss_trace_reg: list = \
[compute_square_loss(X_test, y_test, theta)
for theta in theta_hist_reg]
plt.plot(loss_trace_reg[100:-1], label="Reg Lambda {0}".format(lambda_reg))
min_index = np.argmin(loss_trace_reg)
show_max = '[' + str(min_index) + ',' + str(loss_trace_reg[min_index])[0:4] + ']'
plt.annotate(show_max, xytext=(min_index - 100, loss_trace_reg[min_index]),
xy=(min_index - 100, loss_trace_reg[min_index]))

plt.legend()
plt.xlabel("Iterations")
plt.ylabel("Loss")
plt.title("Loss change over iterations with step size: {0}".format(step_size))
plt.show()


overfit_analyse(X_train, y_train, X_test, y_test)

png

png

Regularization and Coefficient Analyse

Larger regularization will make θ more close to zero.
This time, risk shift from (close to dataset) to (close to regularization, far from dataset).
Approximation Error close to infinity, whereas estimation and optimization error close to zero

python
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def regularization_coefficient_analyse(X_train, y_train, X_test, y_test):
num_iter = 10000
for step_size in [0.05, 0.01]:
theta_hist_non_reg, _ = \
batch_grad_descent(X_train, y_train, step_size=step_size, num_iter=num_iter)

theta_sum: list = []
lambda_list: list = []

theta_sum.append(theta_hist_non_reg[-1].sum())
lambda_list.append(0)

for lambda_reg in [1e-7, 1e-8, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2]:
theta_hist_reg, _ = \
regularized_grad_descent(X_train, y_train,
lambda_reg=lambda_reg, step_size=step_size, num_iter=num_iter)

theta_sum.append(theta_hist_reg[-1].sum())
lambda_list.append(lambda_reg)

plt.plot(lambda_list, theta_sum, label="step size: {0}".format(step_size))
plt.legend()

plt.xscale("log")
plt.xlabel("regularization lambda")
plt.ylabel("sum of coefficient of theta")
plt.title("Theta constraned over regularization lambda")
plt.show()


regularization_coefficient_analyse(X_train, y_train, X_test, y_test)

png

Risk Minimization Analyse

  1. show that for square loss L(y^,y)=12(yy^2) , its Bayes decision func is f(X)=E[Y|X=x]
    Risk Func R(f)=12E[(f(x)y^)2]=12E[yy^2|X=x]=12E[(y22yy^+y^2)|X=x]=12E[y2|X=x]E[yy^|X=x]+12E[y^2]
    Therefore, R(f)y=E[y|X=x]y^
    To minimize R(f), we get f(X)=y^=E[Y|X=x]
  2. show that for square loss L(y^,y)=|yy^| , its Bayes decision func is f(X)=median [Y|X=x]
    Too hard, too many math!
    Can reference: https://github.com/kinslyzhu/DS-GA-1003-ML/blob/master/assignment/hw1-sgd/yz3079_hw1.ipynb
python
1