Machine Learning Lab 7

This lab requires implementing a simple neural network and training it with gradient descent.

I ran into Xavier initialization and found a useful reference:

https://blog.csdn.net/qq_67720621/article/details/138045784
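
For reference, the standard Xavier (Glorot) scaling draws weights with variance 2/(fan_in + fan_out) (or simply 1/fan_in), while the sqrt(2/fan_in) factor used in the code below is the He variant usually paired with ReLU. A minimal sketch of the two, just for comparison (the function names are mine):

import numpy as np

def xavierInit(fan_in, fan_out):
    # Standard Xavier/Glorot scaling: variance 2 / (fan_in + fan_out)
    return np.random.randn(fan_in, fan_out) * np.sqrt(2.0 / (fan_in + fan_out))

def heInit(fan_in, fan_out):
    # He scaling: variance 2 / fan_in, the factor used in initWeights below
    return np.random.randn(fan_in, fan_out) * np.sqrt(2.0 / fan_in)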

The networkBackward function needs to be studied carefully against the lecture slides.
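
For my own reference, this is the recursion that networkBackward implements, written with the same indexing as the code (A[0] = X, A[i+1] is the activation of layer i, m is the number of samples), assuming a softmax output with cross-entropy loss and sigmoid hidden layers:

$$\frac{\partial L}{\partial Z_{l-1}} = A_l - Y$$

$$\frac{\partial L}{\partial Z_i} = \left(\frac{\partial L}{\partial Z_{i+1}}\, W_{i+1}^{\top}\right) \odot A_{i+1}(1 - A_{i+1}), \quad i < l-1$$

$$\frac{\partial L}{\partial W_i} = \frac{1}{m} A_i^{\top}\, \frac{\partial L}{\partial Z_i}, \qquad \frac{\partial L}{\partial B_i} = \frac{1}{m} \sum_{\text{rows}} \frac{\partial L}{\partial Z_i}$$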

Lab code

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

# Utilities
def onehotEncoder(Y, ny):
    return np.eye(ny)[Y]

# Xavier-style initialization (note: the sqrt(2/fan_in) factor below is actually the He variant)
def initWeights(M):
    l = len(M)
    W = []
    B = []

    for i in range(1, l):
        W.append(np.random.randn(M[i-1], M[i]) * np.sqrt(2.0/M[i-1]))  # (M[i-1], M[i]) is the shape of this layer's weight matrix
        # W.append(np.random.randn(M[i-1], M[i]) * np.sqrt(1.0/M[i-1]))  # TODO: try the standard Xavier scaling
        B.append(np.zeros([1, M[i]]))

    return W, B

# Forward propagation
def networkForward(X, W, B):
    l = len(W)
    A = [X] + [None for i in range(l)]

    for i in range(l):
        Z = np.dot(A[i], W[i]) + B[i]
        # Two cases for the next layer's activation
        if i == l-1:  # the output layer uses softmax
            A[i+1] = np.exp(Z) / np.sum(np.exp(Z), axis=1, keepdims=True)
        else:  # hidden layers use sigmoid
            A[i+1] = 1 / (1 + np.exp(-Z))

    return A

# Backward propagation
def networkBackward(Y, A, W):
    l = len(W)
    dW = [None for i in range(l)]
    dB = [None for i in range(l)]

    m = Y.shape[0]
    dA = A[-1] - Y  # start from the last layer: softmax with cross-entropy gives dZ = A - Y

    for i in reversed(range(l)):
        if i != l-1:
            dA = np.dot(dA, W[i+1].T) * (A[i+1]*(1-A[i+1]))  # sigmoid derivative
        dW[i] = np.dot(A[i].T, dA) / m
        dB[i] = np.sum(dA, axis=0, keepdims=True) / m

    return dW, dB

# Update weights by gradient descent
def updateWeights(W, B, dW, dB, lr):
    l = len(W)

    for i in range(l):
        W[i] -= lr * dW[i]
        B[i] -= lr * dB[i]

    return W, B

# Compute regularized cost function
def cost(A_l, Y, W):
    n = Y.shape[0]
    c = -np.sum(Y*np.log(A_l + 1e-8)) / n
    # TODO: a regularization term could be added here
    return c

def train(X, Y, M, lr, iterations):
    costs = []
    W, B = initWeights(M)

    for i in range(iterations):
        A = networkForward(X, W, B)
        c = cost(A[-1], Y, W)
        dW, dB = networkBackward(Y, A, W)
        W, B = updateWeights(W, B, dW, dB, lr)

        if i % 100 == 0:
            print("Cost after iteration %i: %f" %(i, c))
            costs.append(c)

    return W, B, costs

def predict(X, W, B, Y):
    Y_out = np.zeros([X.shape[0], Y.shape[1]])

    A = networkForward(X, W, B)
    idx = np.argmax(A[-1], axis=1)
    Y_out[range(Y.shape[0]), idx] = 1

    return Y_out

def test(Y, X, W, B):
    Y_out = predict(X, W, B, Y)
    # Fraction of samples whose predicted class matches the true class
    # (an element-wise mean over the one-hot matrices would overstate accuracy)
    acc = np.mean(np.argmax(Y_out, axis=1) == np.argmax(Y, axis=1))
    print("Training accuracy is: %f" %(acc))

    return acc

def output(X, W, B):
    A = networkForward(X, W, B)

    Y_hat = np.expand_dims(np.argmax(A[-1], axis=1), axis=1)
    idx = np.expand_dims(np.arange(Y_hat.shape[0]), axis=1)
    np.savetxt("predict.csv", np.concatenate([idx, Y_hat], axis=1), header="Index,ID", comments='', delimiter=',')

iterations = 5000  # Training loops
lr = 0.08          # Learning rate

X = np.load("train_data.npy")
Y = np.load("train_label.npy")
(n, m) = X.shape
Y = onehotEncoder(Y, 10)

# M = [784, 25, 10]
M = [784, 64, 64, 10]  # TODO: try more hidden layers to increase model capacity; this setup reaches about 0.86
W, B, costs = train(X, Y, M, lr, iterations)

plt.figure()
plt.plot(range(len(costs)), costs)

X = np.load("train_data.npy")
Y = np.load("train_label.npy")
Y = onehotEncoder(Y, 10)
test(Y, X, W, B)

X = np.load("test_data.npy")
output(X, W, B)

The best result I got was 0.86. The indexing in the backward pass feels really messy and I still can't fully wrap my head around it.
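
One way to sanity-check the confusing indexing is a numerical gradient check: nudge a single weight, recompute the (unregularized) cost, and compare against the analytic gradient from networkBackward. A rough sketch (gradCheck is a helper name of my own, not part of the lab framework):

def gradCheck(X, Y, W, B, eps=1e-5):
    # Compare analytic gradients with centered finite differences, one entry per layer
    A = networkForward(X, W, B)
    dW, dB = networkBackward(Y, A, W)
    for i in range(len(W)):
        r, c = 0, 0  # spot-check one entry; loop over more indices for a stronger test
        old = W[i][r, c]
        W[i][r, c] = old + eps
        cost_plus = cost(networkForward(X, W, B)[-1], Y, W)
        W[i][r, c] = old - eps
        cost_minus = cost(networkForward(X, W, B)[-1], Y, W)
        W[i][r, c] = old  # restore the original value
        numeric = (cost_plus - cost_minus) / (2 * eps)
        print("layer %d: analytic %.6e vs numeric %.6e" % (i, dW[i][r, c], numeric))

Running this on a small slice of the training data (say the first 20 samples) is enough to catch indexing mistakes.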

Below are notes from some of my experiments.

I adjusted the learning rate and the cost curve became a bit smoother; the network structure was M = [784, 256, 256, 10].

def train(X, Y, M, lr, iterations):
    costs = []
    W, B = initWeights(M)

    for i in range(iterations):
        A = networkForward(X, W, B)
        c = cost(A[-1], Y, W)
        dW, dB = networkBackward(Y, A, W)
        # Piecewise learning-rate schedule: start large, then decay in stages
        if i > 9000:
            W, B = updateWeights(W, B, dW, dB, 0.0005)
        elif i > 8000:
            W, B = updateWeights(W, B, dW, dB, 0.001)
        elif i >= 4000 and i <= 8000:
            W, B = updateWeights(W, B, dW, dB, 0.008)
        else:
            W, B = updateWeights(W, B, dW, dB, 0.02)

        if i % 100 == 0:
            print("Cost after iteration %i: %f" %(i, c))
            costs.append(c)

    return W, B, costs
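
The same schedule can also be written as a small helper so the thresholds live in one place; this is just a tidier rewrite of the if/elif chain above, not something I ran separately:

def stepLR(i):
    # Piecewise-constant learning rate, matching the thresholds used above
    if i > 9000:
        return 0.0005
    elif i > 8000:
        return 0.001
    elif i >= 4000:
        return 0.008
    else:
        return 0.02

# inside the training loop:
# W, B = updateWeights(W, B, dW, dB, stepLR(i))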

But the result still wasn't great; it might be overfitting, so I changed the structure to M = [784, 128, 128, 10].

Still not great: only about a 0.82 score.

Next I tried adding L2 regularization to see how it helps, with the network structure M = [784, 512, 256, 128, 64, 32, 10].

# Compute regularized cost function
def cost(A_l, Y, W):
    # n = Y.shape[0]
    # c = -np.sum(Y*np.log(A_l + 1e-8)) / n
    # return c

    # Cross-entropy loss with an L2 penalty added
    n = Y.shape[0]
    lambda_reg = 0.01
    c = -np.sum(Y*np.log(A_l + 1e-8)) / n
    reg_term = 0
    for w in W:
        reg_term += np.sum(np.square(w))
    reg_term *= lambda_reg / (2 * n)
    return c + reg_term
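
One thing worth noting about this attempt: adding the L2 term only to the cost changes the number that gets printed, but it does not shrink the weights. For the penalty to have an effect, the matching term also has to appear in the gradients, e.g. inside networkBackward (a sketch, assuming lambda_reg is passed in as an extra argument):

# after computing dW[i] = np.dot(A[i].T, dA) / m in networkBackward:
dW[i] += (lambda_reg / m) * W[i]  # gradient of (lambda_reg / (2m)) * sum(W[i]**2)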

This configuration runs far too slowly and the loss won't come down, so I dropped it.

I'm not sure how to improve it further...

About this lab

Not giving up, I also tried the ReLU activation. There are a few things to watch out for; this summary is worth a read: https://sakigami-yang.me/2017/08/11/thinking-from-softmax-03/

With ReLU, the main thing is to pick a fairly small learning rate to avoid exploding gradients or units dying outright.

Judging from the results, gradient descent with ReLU feels a bit more stable, but since I don't dare set the learning rate too high, more iterations are needed. Overall it should end up slightly better than sigmoid... hopefully.
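
A common way to guard against units dying outright is a leaky ReLU, which keeps a small slope for negative inputs. I did not test this, but a drop-in sketch would look like:

def leakyRelu(Z, alpha=0.01):
    # A small negative slope keeps gradients flowing when Z < 0
    return np.where(Z > 0, Z, alpha * Z)

def leakyReluDerivative(Z, alpha=0.01):
    return np.where(Z > 0, 1.0, alpha)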

Test code attached below:

# Trying the ReLU activation function to see how it performs
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt

# Utilities
def onehotEncoder(Y, ny):
    return np.eye(ny)[Y]

# He-style initialization (sqrt(2/fan_in)), a good match for ReLU layers
def initWeights(M):
    l = len(M)
    W = []
    B = []

    for i in range(1, l):
        W.append(np.random.randn(M[i-1], M[i]) * np.sqrt(2.0/M[i-1]))
        B.append(np.zeros([1, M[i]]))

    return W, B

# ReLU Activation Function
def relu(Z):
    return np.maximum(0, Z)

# Derivative of ReLU
def reluDerivative(Z):
    return (Z > 0).astype(float)

# Forward propagation
def networkForward(X, W, B):
    l = len(W)
    A = [X] + [None for i in range(l)]

    for i in range(l):
        Z = np.dot(A[i], W[i]) + B[i]
        if i == l-1:  # Output layer uses softmax function
            A[i+1] = np.exp(Z) / np.sum(np.exp(Z), axis=1, keepdims=True)
        else:
            A[i+1] = relu(Z)

    return A

# Backward propagation
def networkBackward(Y, A, W):
    l = len(W)
    dW = [None for i in range(l)]
    dB = [None for i in range(l)]

    m = Y.shape[0]
    dA = A[-1] - Y  # Start from the last layer

    for i in reversed(range(l)):
        if i != l-1:
            # Passing the activation works here because relu(Z) > 0 exactly where Z > 0
            dA = np.dot(dA, W[i+1].T) * reluDerivative(A[i+1])
        dW[i] = np.dot(A[i].T, dA) / m
        dB[i] = np.sum(dA, axis=0, keepdims=True) / m

    return dW, dB

# Update weights by gradient descent
def updateWeights(W, B, dW, dB, lr):
    l = len(W)

    for i in range(l):
        W[i] -= lr * dW[i]
        B[i] -= lr * dB[i]

    return W, B

# Compute regularized cost function
def cost(A_l, Y, W):
    n = Y.shape[0]
    lambda_reg = 0.15  # TODO: the L2 strength probably also needs tuning
    c = -np.sum(Y*np.log(A_l + 1e-8)) / n
    reg_term = 0
    for w in W:
        reg_term += np.sum(np.square(w))
    reg_term *= lambda_reg / (2 * n)
    return c + reg_term

def train(X, Y, M, lr, iterations):
    costs = []
    W, B = initWeights(M)

    for i in range(iterations):
        A = networkForward(X, W, B)
        c = cost(A[-1], Y, W)
        dW, dB = networkBackward(Y, A, W)

        W, B = updateWeights(W, B, dW, dB, lr)

        if i % 100 == 0:
            print("Cost after iteration %i: %f" %(i, c))
            costs.append(c)

    return W, B, costs

def predict(X, W, B, Y):
    Y_out = np.zeros([X.shape[0], Y.shape[1]])

    A = networkForward(X, W, B)
    idx = np.argmax(A[-1], axis=1)
    Y_out[range(Y.shape[0]), idx] = 1

    return Y_out

def test(Y, X, W, B):
    Y_out = predict(X, W, B, Y)
    # Fraction of samples whose predicted class matches the true class
    # (an element-wise mean over the one-hot matrices would overstate accuracy)
    acc = np.mean(np.argmax(Y_out, axis=1) == np.argmax(Y, axis=1))
    print("Training accuracy is: %f" %(acc))

    return acc

def output(X, W, B):
    A = networkForward(X, W, B)

    Y_hat = np.expand_dims(np.argmax(A[-1], axis=1), axis=1)
    idx = np.expand_dims(np.arange(Y_hat.shape[0]), axis=1)
    np.savetxt("predict.csv", np.concatenate([idx, Y_hat], axis=1), header="Index,ID", comments='', delimiter=',')

iterations = 12000  # Training loops
lr = 0.005          # Learning rate
# 0.001 felt too small: the loss had not converged by the end of training
# 0.005 feels about right, but 6000 iterations was a bit short, so give it more

X = np.load("train_data.npy")
mu, sigma = np.mean(X, axis=0), np.std(X, axis=0) + 1e-8  # small epsilon guards against constant features
X = (X - mu) / sigma  # standardize the data
Y = np.load("train_label.npy")
(n, m) = X.shape
Y = onehotEncoder(Y, 10)

M = [784, 256, 10]
W, B, costs = train(X, Y, M, lr, iterations)

plt.figure()
plt.plot(range(len(costs)), costs)

X = np.load("train_data.npy")
X = (X - mu) / sigma  # apply the same standardization before evaluating
Y = np.load("train_label.npy")
Y = onehotEncoder(Y, 10)
test(Y, X, W, B)

X = np.load("test_data.npy")
X = (X - mu) / sigma  # use the training-set statistics on the test data as well
output(X, W, B)

