Machine Learning Lab 7

This lab requires implementing a simple neural network and training it with gradient descent.

I ran into Xavier initialization and found a useful reference:

https://blog.csdn.net/qq_67720621/article/details/138045784
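
For reference, the standard Xavier (Glorot) scaling draws weights with variance 2/(fan_in + fan_out) (or simply 1/fan_in), while the sqrt(2/fan_in) factor used in the code below is the He variant usually paired with ReLU. A minimal sketch of the two, just for comparison (the function names are mine):

import numpy as np

def xavierInit(fan_in, fan_out):
    # Standard Xavier/Glorot scaling: variance 2 / (fan_in + fan_out)
    return np.random.randn(fan_in, fan_out) * np.sqrt(2.0 / (fan_in + fan_out))

def heInit(fan_in, fan_out):
    # He scaling: variance 2 / fan_in, the factor used in initWeights below
    return np.random.randn(fan_in, fan_out) * np.sqrt(2.0 / fan_in)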

The networkBackward function needs to be studied carefully against the lecture slides.
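
For my own reference, this is the recursion that networkBackward implements, written with the same indexing as the code (A[0] = X, A[i+1] is the activation of layer i, m is the number of samples), assuming a softmax output with cross-entropy loss and sigmoid hidden layers:

$$\frac{\partial L}{\partial Z_{l-1}} = A_l - Y$$

$$\frac{\partial L}{\partial Z_i} = \left(\frac{\partial L}{\partial Z_{i+1}}\, W_{i+1}^{\top}\right) \odot A_{i+1}(1 - A_{i+1}), \quad i < l-1$$

$$\frac{\partial L}{\partial W_i} = \frac{1}{m} A_i^{\top}\, \frac{\partial L}{\partial Z_i}, \qquad \frac{\partial L}{\partial B_i} = \frac{1}{m} \sum_{\text{rows}} \frac{\partial L}{\partial Z_i}$$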

Lab code

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

# Utilities
def onehotEncoder(Y, ny):
    return np.eye(ny)[Y]

# Xavier-style initialization (note: the sqrt(2/fan_in) factor below is actually the He variant)
def initWeights(M):
    l = len(M)
    W = []
    B = []

    for i in range(1, l):
        W.append(np.random.randn(M[i-1], M[i]) * np.sqrt(2.0/M[i-1]))  # (M[i-1], M[i]) is the shape of this layer's weight matrix
        # W.append(np.random.randn(M[i-1], M[i]) * np.sqrt(1.0/M[i-1]))  # TODO: try the standard Xavier scaling
        B.append(np.zeros([1, M[i]]))

    return W, B

# Forward propagation
def networkForward(X, W, B):
    l = len(W)
    A = [X] + [None for i in range(l)]

    for i in range(l):
        Z = np.dot(A[i], W[i]) + B[i]
        # Two cases for the next layer's activation
        if i == l-1:  # the output layer uses softmax
            A[i+1] = np.exp(Z) / np.sum(np.exp(Z), axis=1, keepdims=True)
        else:  # hidden layers use sigmoid
            A[i+1] = 1 / (1 + np.exp(-Z))

    return A

# Backward propagation
def networkBackward(Y, A, W):
    l = len(W)
    dW = [None for i in range(l)]
    dB = [None for i in range(l)]

    m = Y.shape[0]
    dA = A[-1] - Y  # start from the last layer: softmax with cross-entropy gives dZ = A - Y

    for i in reversed(range(l)):
        if i != l-1:
            dA = np.dot(dA, W[i+1].T) * (A[i+1]*(1-A[i+1]))  # sigmoid derivative
        dW[i] = np.dot(A[i].T, dA) / m
        dB[i] = np.sum(dA, axis=0, keepdims=True) / m

    return dW, dB

# Update weights by gradient descent
def updateWeights(W, B, dW, dB, lr):
    l = len(W)

    for i in range(l):
        W[i] -= lr * dW[i]
        B[i] -= lr * dB[i]

    return W, B

# Compute regularized cost function
def cost(A_l, Y, W):
    n = Y.shape[0]
    c = -np.sum(Y*np.log(A_l + 1e-8)) / n
    # TODO: a regularization term could be added here
    return c

def train(X, Y, M, lr, iterations):
    costs = []
    W, B = initWeights(M)

    for i in range(iterations):
        A = networkForward(X, W, B)
        c = cost(A[-1], Y, W)
        dW, dB = networkBackward(Y, A, W)
        W, B = updateWeights(W, B, dW, dB, lr)

        if i % 100 == 0:
            print("Cost after iteration %i: %f" %(i, c))
            costs.append(c)

    return W, B, costs

def predict(X, W, B, Y):
    Y_out = np.zeros([X.shape[0], Y.shape[1]])

    A = networkForward(X, W, B)
    idx = np.argmax(A[-1], axis=1)
    Y_out[range(Y.shape[0]), idx] = 1

    return Y_out

def test(Y, X, W, B):
    Y_out = predict(X, W, B, Y)
    # Fraction of samples whose predicted class matches the true class
    # (an element-wise mean over the one-hot matrices would overstate accuracy)
    acc = np.mean(np.argmax(Y_out, axis=1) == np.argmax(Y, axis=1))
    print("Training accuracy is: %f" %(acc))

    return acc

def output(X, W, B):
    A = networkForward(X, W, B)

    Y_hat = np.expand_dims(np.argmax(A[-1], axis=1), axis=1)
    idx = np.expand_dims(np.arange(Y_hat.shape[0]), axis=1)
    np.savetxt("predict.csv", np.concatenate([idx, Y_hat], axis=1), header="Index,ID", comments='', delimiter=',')

iterations = 5000  # Training loops
lr = 0.08          # Learning rate

X = np.load("train_data.npy")
Y = np.load("train_label.npy")
(n, m) = X.shape
Y = onehotEncoder(Y, 10)

# M = [784, 25, 10]
M = [784, 64, 64, 10]  # TODO: try more hidden layers to increase model capacity; this setup reaches about 0.86
W, B, costs = train(X, Y, M, lr, iterations)

plt.figure()
plt.plot(range(len(costs)), costs)

X = np.load("train_data.npy")
Y = np.load("train_label.npy")
Y = onehotEncoder(Y, 10)
test(Y, X, W, B)

X = np.load("test_data.npy")
output(X, W, B)

The best result I got was 0.86. The indexing in the backward pass feels really messy and I still can't fully wrap my head around it.
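
One way to sanity-check the confusing indexing is a numerical gradient check: nudge a single weight, recompute the (unregularized) cost, and compare against the analytic gradient from networkBackward. A rough sketch (gradCheck is a helper name of my own, not part of the lab framework):

def gradCheck(X, Y, W, B, eps=1e-5):
    # Compare analytic gradients with centered finite differences, one entry per layer
    A = networkForward(X, W, B)
    dW, dB = networkBackward(Y, A, W)
    for i in range(len(W)):
        r, c = 0, 0  # spot-check one entry; loop over more indices for a stronger test
        old = W[i][r, c]
        W[i][r, c] = old + eps
        cost_plus = cost(networkForward(X, W, B)[-1], Y, W)
        W[i][r, c] = old - eps
        cost_minus = cost(networkForward(X, W, B)[-1], Y, W)
        W[i][r, c] = old  # restore the original value
        numeric = (cost_plus - cost_minus) / (2 * eps)
        print("layer %d: analytic %.6e vs numeric %.6e" % (i, dW[i][r, c], numeric))

Running this on a small slice of the training data (say the first 20 samples) is enough to catch indexing mistakes.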

Below are notes from some of my experiments.

I adjusted the learning rate and the cost curve became a bit smoother; the network structure was M = [784, 256, 256, 10].

def train(X, Y, M, lr, iterations):
    costs = []
    W, B = initWeights(M)

    for i in range(iterations):
        A = networkForward(X, W, B)
        c = cost(A[-1], Y, W)
        dW, dB = networkBackward(Y, A, W)
        # Piecewise learning-rate schedule: start large, then decay in stages
        if i > 9000:
            W, B = updateWeights(W, B, dW, dB, 0.0005)
        elif i > 8000:
            W, B = updateWeights(W, B, dW, dB, 0.001)
        elif i >= 4000 and i <= 8000:
            W, B = updateWeights(W, B, dW, dB, 0.008)
        else:
            W, B = updateWeights(W, B, dW, dB, 0.02)

        if i % 100 == 0:
            print("Cost after iteration %i: %f" %(i, c))
            costs.append(c)

    return W, B, costs
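
The same schedule can also be written as a small helper so the thresholds live in one place; this is just a tidier rewrite of the if/elif chain above, not something I ran separately:

def stepLR(i):
    # Piecewise-constant learning rate, matching the thresholds used above
    if i > 9000:
        return 0.0005
    elif i > 8000:
        return 0.001
    elif i >= 4000:
        return 0.008
    else:
        return 0.02

# inside the training loop:
# W, B = updateWeights(W, B, dW, dB, stepLR(i))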

But the result still wasn't great; it might be overfitting, so I changed the structure to M = [784, 128, 128, 10].

Still not great: only about a 0.82 score.

Next I tried adding L2 regularization to see how it helps, with the network structure M = [784, 512, 256, 128, 64, 32, 10].

# Compute regularized cost function
def cost(A_l, Y, W):
    # n = Y.shape[0]
    # c = -np.sum(Y*np.log(A_l + 1e-8)) / n
    # return c

    # Cross-entropy loss with an L2 penalty added
    n = Y.shape[0]
    lambda_reg = 0.01
    c = -np.sum(Y*np.log(A_l + 1e-8)) / n
    reg_term = 0
    for w in W:
        reg_term += np.sum(np.square(w))
    reg_term *= lambda_reg / (2 * n)
    return c + reg_term
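
One thing worth noting about this attempt: adding the L2 term only to the cost changes the number that gets printed, but it does not shrink the weights. For the penalty to have an effect, the matching term also has to appear in the gradients, e.g. inside networkBackward (a sketch, assuming lambda_reg is passed in as an extra argument):

# after computing dW[i] = np.dot(A[i].T, dA) / m in networkBackward:
dW[i] += (lambda_reg / m) * W[i]  # gradient of (lambda_reg / (2m)) * sum(W[i]**2)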

This configuration runs far too slowly and the loss won't come down, so I dropped it.

I'm not sure how to improve it further...

About this lab

Not giving up, I also tried the ReLU activation. There are a few things to watch out for; this summary is worth a read: https://sakigami-yang.me/2017/08/11/thinking-from-softmax-03/

With ReLU, the main thing is to pick a fairly small learning rate to avoid exploding gradients or units dying outright.

Judging from the results, gradient descent with ReLU feels a bit more stable, but since I don't dare set the learning rate too high, more iterations are needed. Overall it should end up slightly better than sigmoid... hopefully.
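
A common way to guard against units dying outright is a leaky ReLU, which keeps a small slope for negative inputs. I did not test this, but a drop-in sketch would look like:

def leakyRelu(Z, alpha=0.01):
    # A small negative slope keeps gradients flowing when Z < 0
    return np.where(Z > 0, Z, alpha * Z)

def leakyReluDerivative(Z, alpha=0.01):
    return np.where(Z > 0, 1.0, alpha)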

Test code attached below:

# Trying the ReLU activation function to see how it performs
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

# Utilities
def onehotEncoder(Y, ny):
    return np.eye(ny)[Y]

# He-style initialization (sqrt(2/fan_in)), a good match for ReLU layers
def initWeights(M):
    l = len(M)
    W = []
    B = []

    for i in range(1, l):
        W.append(np.random.randn(M[i-1], M[i]) * np.sqrt(2.0/M[i-1]))
        B.append(np.zeros([1, M[i]]))

    return W, B

# ReLU Activation Function
def relu(Z):
    return np.maximum(0, Z)

# Derivative of ReLU
def reluDerivative(Z):
    return (Z > 0).astype(float)

# Forward propagation
def networkForward(X, W, B):
    l = len(W)
    A = [X] + [None for i in range(l)]

    for i in range(l):
        Z = np.dot(A[i], W[i]) + B[i]
        if i == l-1:  # Output layer uses softmax function
            A[i+1] = np.exp(Z) / np.sum(np.exp(Z), axis=1, keepdims=True)
        else:
            A[i+1] = relu(Z)

    return A

# Backward propagation
def networkBackward(Y, A, W):
    l = len(W)
    dW = [None for i in range(l)]
    dB = [None for i in range(l)]

    m = Y.shape[0]
    dA = A[-1] - Y  # Start from the last layer

    for i in reversed(range(l)):
        if i != l-1:
            # Passing the activation works here because relu(Z) > 0 exactly where Z > 0
            dA = np.dot(dA, W[i+1].T) * reluDerivative(A[i+1])
        dW[i] = np.dot(A[i].T, dA) / m
        dB[i] = np.sum(dA, axis=0, keepdims=True) / m

    return dW, dB

# Update weights by gradient descent
def updateWeights(W, B, dW, dB, lr):
    l = len(W)

    for i in range(l):
        W[i] -= lr * dW[i]
        B[i] -= lr * dB[i]

    return W, B

# Compute regularized cost function
def cost(A_l, Y, W):
    n = Y.shape[0]
    lambda_reg = 0.15  # TODO: the L2 strength probably also needs tuning
    c = -np.sum(Y*np.log(A_l + 1e-8)) / n
    reg_term = 0
    for w in W:
        reg_term += np.sum(np.square(w))
    reg_term *= lambda_reg / (2 * n)
    return c + reg_term

def train(X, Y, M, lr, iterations):
    costs = []
    W, B = initWeights(M)

    for i in range(iterations):
        A = networkForward(X, W, B)
        c = cost(A[-1], Y, W)
        dW, dB = networkBackward(Y, A, W)

        W, B = updateWeights(W, B, dW, dB, lr)

        if i % 100 == 0:
            print("Cost after iteration %i: %f" %(i, c))
            costs.append(c)

    return W, B, costs

def predict(X, W, B, Y):
    Y_out = np.zeros([X.shape[0], Y.shape[1]])

    A = networkForward(X, W, B)
    idx = np.argmax(A[-1], axis=1)
    Y_out[range(Y.shape[0]), idx] = 1

    return Y_out

def test(Y, X, W, B):
    Y_out = predict(X, W, B, Y)
    # Fraction of samples whose predicted class matches the true class
    # (an element-wise mean over the one-hot matrices would overstate accuracy)
    acc = np.mean(np.argmax(Y_out, axis=1) == np.argmax(Y, axis=1))
    print("Training accuracy is: %f" %(acc))

    return acc

def output(X, W, B):
    A = networkForward(X, W, B)

    Y_hat = np.expand_dims(np.argmax(A[-1], axis=1), axis=1)
    idx = np.expand_dims(np.arange(Y_hat.shape[0]), axis=1)
    np.savetxt("predict.csv", np.concatenate([idx, Y_hat], axis=1), header="Index,ID", comments='', delimiter=',')

iterations = 12000  # Training loops
lr = 0.005          # Learning rate
# 0.001 felt too small: the loss had not converged by the end of training
# 0.005 feels about right, but 6000 iterations was a bit short, so give it more

X = np.load("train_data.npy")
mu, sigma = np.mean(X, axis=0), np.std(X, axis=0) + 1e-8  # small epsilon guards against constant features
X = (X - mu) / sigma  # standardize the data
Y = np.load("train_label.npy")
(n, m) = X.shape
Y = onehotEncoder(Y, 10)

M = [784, 256, 10]
W, B, costs = train(X, Y, M, lr, iterations)

plt.figure()
plt.plot(range(len(costs)), costs)

X = np.load("train_data.npy")
X = (X - mu) / sigma  # apply the same standardization before evaluating
Y = np.load("train_label.npy")
Y = onehotEncoder(Y, 10)
test(Y, X, W, B)

X = np.load("test_data.npy")
X = (X - mu) / sigma  # use the training-set statistics on the test data as well
output(X, W, B)

