import numpy as np
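
# ----- Hyperparameters (toy-sized so every tensor is easy to inspect) -----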
input_dim = 3
hidden_dim = 4
output_dim = 2
sequence_length = 5
learning_rate = 0.01
epochs = 500
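
# ----- Toy data -----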
sample_input = np.random.rand(sequence_length, input_dim)

# Label rule: class 0 if the input sum exceeds half its maximum possible value
# (sequence_length * input_dim entries, each uniform in [0, 1)), else class 1.
if np.sum(sample_input) > (sequence_length * input_dim / 2):
    sample_y = np.array([1, 0]).reshape(-1, 1)  # one-hot column vector, class 0
else:
    sample_y = np.array([0, 1]).reshape(-1, 1)  # one-hot column vector, class 1

print(f"Sample Input Shape: {sample_input.shape}")
print(f"True Label: Class {np.argmax(sample_y)}")
print("-" * 30)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def sigmoid_derivative(x):
    # Sigmoid's derivative expressed through its own output: s * (1 - s).
    s = sigmoid(x)
    return s * (1 - s)


def tanh(x):
    return np.tanh(x)


def tanh_derivative(x):
    # d/dx tanh(x) = 1 - tanh(x)^2.
    return 1 - np.tanh(x)**2


def softmax(x):
    # Subtract the column-wise max before exponentiating for numerical stability.
    e_x = np.exp(x - np.max(x, axis=0, keepdims=True))
    return e_x / np.sum(e_x, axis=0, keepdims=True)


def cross_entropy_loss(y_pred, y_true):
    # The small epsilon guards against log(0).
    return -np.sum(y_true * np.log(y_pred + 1e-9))
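

# ----- A minimal LSTM classifier implemented directly in NumPy -----
# The four gate pre-activations are computed as one fused (4 * hidden_size)
# matrix product; row blocks are ordered [forget, input, candidate, output].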
class NumpyLSTM:

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # Fused gate parameters; rows [0:h] drive the forget gate, [h:2h] the
        # input gate, [2h:3h] the candidate cell, [3h:4h] the output gate.
        self.Wx = np.random.randn(4 * hidden_size, input_size) * 0.1
        self.Wh = np.random.randn(4 * hidden_size, hidden_size) * 0.1
        self.b = np.zeros((4 * hidden_size, 1))

        # Projection from the final hidden state to the class logits.
        self.Why = np.random.randn(output_size, hidden_size) * 0.1
        self.by = np.zeros((output_size, 1))

        # Gradient buffers, shaped like their parameters.
        self.dWx, self.dWh, self.db = np.zeros_like(self.Wx), np.zeros_like(self.Wh), np.zeros_like(self.b)
        self.dWhy, self.dby = np.zeros_like(self.Why), np.zeros_like(self.by)
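
    # Forward pass: run the sequence through the cell, caching the states and
    # gate activations that backward() needs.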
    def forward(self, inputs, y_true):
        self.inputs = inputs
        self.y_true = y_true
        seq_length = inputs.shape[0]

        # Hidden and cell states keyed by time step; key -1 holds the initial
        # (zero) state.
        self.h_states, self.c_states = {}, {}
        self.h_states[-1] = np.zeros((self.hidden_size, 1))
        self.c_states[-1] = np.zeros((self.hidden_size, 1))

        # Caches for backpropagation: pre-activations and gate activations.
        self.z_s, self.f_s, self.i_s, self.c_tilde_s, self.o_s = {}, {}, {}, {}, {}

        for t in range(seq_length):
            xt = self.inputs[t].reshape(-1, 1)
            h_prev = self.h_states[t - 1]
            c_prev = self.c_states[t - 1]

            # All four gate pre-activations in one fused matrix product.
            self.z_s[t] = self.Wx @ xt + self.Wh @ h_prev + self.b

            self.f_s[t] = sigmoid(self.z_s[t][:self.hidden_size, :])  # forget gate
            self.i_s[t] = sigmoid(self.z_s[t][self.hidden_size:2*self.hidden_size, :])  # input gate
            self.c_tilde_s[t] = tanh(self.z_s[t][2*self.hidden_size:3*self.hidden_size, :])  # candidate cell
            self.o_s[t] = sigmoid(self.z_s[t][3*self.hidden_size:, :])  # output gate

            # Cell update: keep part of the old cell, write part of the candidate.
            self.c_states[t] = self.f_s[t] * c_prev + self.i_s[t] * self.c_tilde_s[t]
            self.h_states[t] = self.o_s[t] * tanh(self.c_states[t])

        # Classify from the final hidden state only.
        self.final_h = self.h_states[seq_length - 1]
        self.logits = self.Why @ self.final_h + self.by
        self.y_pred = softmax(self.logits)

        self.loss = cross_entropy_loss(self.y_pred, self.y_true)
        return self.loss, self.y_pred
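
    # Backward pass (backpropagation through time): walk the cached states in
    # reverse, accumulating parameter gradients at every step.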
    def backward(self):
        # Zero the gradient buffers before accumulating over time steps.
        self.dWx, self.dWh, self.db = np.zeros_like(self.Wx), np.zeros_like(self.Wh), np.zeros_like(self.b)
        self.dWhy, self.dby = np.zeros_like(self.Why), np.zeros_like(self.by)

        dh_next = np.zeros_like(self.h_states[0])
        dc_next = np.zeros_like(self.c_states[0])

        # Softmax followed by cross-entropy gives the simple logit gradient
        # y_pred - y_true.
        d_logits = self.y_pred - self.y_true
        self.dWhy = d_logits @ self.final_h.T
        self.dby = d_logits
        dh_final = self.Why.T @ d_logits

        # The loss depends only on the final hidden state, so the recurrence
        # is seeded there.
        dh_next += dh_final

        for t in reversed(range(len(self.inputs))):
            xt = self.inputs[t].reshape(-1, 1)
            h_prev = self.h_states[t - 1]
            c_prev = self.c_states[t - 1]

            # h_t = o_t * tanh(c_t): split the incoming gradient between the
            # output gate and the cell state.
            do = dh_next * tanh(self.c_states[t])
            dc = dc_next + dh_next * self.o_s[t] * tanh_derivative(self.c_states[t])

            # Push each gate gradient through its activation back to the
            # pre-activation z.
            dz_o = do * sigmoid_derivative(self.z_s[t][3*self.hidden_size:, :])
            dc_tilde = dc * self.i_s[t]
            dz_c = dc_tilde * tanh_derivative(self.z_s[t][2*self.hidden_size:3*self.hidden_size, :])
            di = dc * self.c_tilde_s[t]
            dz_i = di * sigmoid_derivative(self.z_s[t][self.hidden_size:2*self.hidden_size, :])
            df = dc * c_prev
            dz_f = df * sigmoid_derivative(self.z_s[t][:self.hidden_size, :])

            # Stack in the same [forget, input, candidate, output] row order as Wx.
            dz = np.vstack((dz_f, dz_i, dz_c, dz_o))

            self.dWx += dz @ xt.T
            self.dWh += dz @ h_prev.T
            self.db += dz

            # Carry gradients to the previous step through Wh and the forget gate.
            dh_next = self.Wh.T @ dz
            dc_next = self.f_s[t] * dc

        # Clip elementwise to mitigate exploding gradients.
        for dparam in [self.dWx, self.dWh, self.db, self.dWhy, self.dby]:
            np.clip(dparam, -5, 5, out=dparam)
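
    # Plain gradient-descent update applied to every parameter.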
    def update(self):
        self.Wx -= self.learning_rate * self.dWx
        self.Wh -= self.learning_rate * self.dWh
        self.b -= self.learning_rate * self.db
        self.Why -= self.learning_rate * self.dWhy
        self.by -= self.learning_rate * self.dby
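

# ----- Demo: deliberately overfit one toy sample -----
# With a single example the loss should fall steadily toward zero, a quick
# check that the hand-written gradients are correct.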
if __name__ == '__main__':
    lstm = NumpyLSTM(input_size=input_dim, hidden_size=hidden_dim, output_size=output_dim, learning_rate=learning_rate)

    for epoch in range(epochs):
        # One full pass: forward for the loss, backward for the gradients,
        # then a gradient-descent update.
        loss, y_pred = lstm.forward(sample_input, sample_y)
        lstm.backward()
        lstm.update()

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")
            print(f"Predicted Probs: {y_pred.flatten()}")
            print(f"Predicted Class: {np.argmax(y_pred)}")
            print("-" * 20)

    print("\n--- Training Finished ---")
    final_loss, final_y_pred = lstm.forward(sample_input, sample_y)
    print(f"Final Loss: {final_loss:.4f}")
    print(f"Final Prediction: Class {np.argmax(final_y_pred)} (Probs: {final_y_pred.flatten()})")
    print(f"True Label: Class {np.argmax(sample_y)}")