Upload LSTM.py
Browse files
LSTM.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np

# --- 1. Sample data and hyperparameter definitions ---
input_dim = 3        # dimensionality of each per-timestep input vector
hidden_dim = 4       # dimensionality of the hidden/cell state
output_dim = 2       # number of output classes
sequence_length = 5  # number of timesteps in the sample sequence
learning_rate = 0.01
epochs = 500

# Example task: classify whether the total sum of the sequence data exceeds
# a threshold (half of the maximum possible sum).
sample_input = np.random.rand(sequence_length, input_dim)

if np.sum(sample_input) > (sequence_length * input_dim / 2):
    sample_y = np.array([1, 0]).reshape(-1, 1)  # Class 0
else:
    sample_y = np.array([0, 1]).reshape(-1, 1)  # Class 1

print(f"Sample Input Shape: {sample_input.shape}")
print(f"True Label: Class {np.argmax(sample_y)}")
print("-" * 30)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# --- 2. νμ ν¨μ μ μ (νμ±ν ν¨μ λ° μμ€ ν¨μ) ---
|
| 25 |
+
|
| 26 |
+
# μκ·Έλͺ¨μ΄λ νμ±ν ν¨μ
|
| 27 |
+
# Sigmoid activation function.
def sigmoid(x):
    """Element-wise logistic sigmoid, 1 / (1 + exp(-x)).

    Computed via exp(-|x|) so that np.exp never receives a large positive
    argument; the naive form overflows (with a RuntimeWarning and a result
    of inf in the denominator) for large-magnitude negative inputs.
    """
    z = np.exp(-np.abs(x))
    # For x >= 0: 1 / (1 + e^-x); for x < 0: e^x / (1 + e^x) — identical values.
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
|
| 29 |
+
|
| 30 |
+
# μκ·Έλͺ¨μ΄λ ν¨μμ λν¨μ
|
| 31 |
+
# Derivative of the sigmoid function.
def sigmoid_derivative(x):
    """Return d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x)), element-wise."""
    s = 1.0 / (1.0 + np.exp(-x))
    return s * (1.0 - s)
|
| 34 |
+
|
| 35 |
+
# νμ΄νΌλ³Όλ¦ νμ νΈ(tanh) νμ±ν ν¨μ
|
| 36 |
+
# Hyperbolic tangent activation function.
def tanh(x):
    """Element-wise hyperbolic tangent (delegates to NumPy)."""
    result = np.tanh(x)
    return result
|
| 38 |
+
|
| 39 |
+
# tanh ν¨μμ λν¨μ
|
| 40 |
+
# Derivative of the tanh function.
def tanh_derivative(x):
    """Return d/dx tanh(x) = 1 - tanh(x)^2, element-wise."""
    t = np.tanh(x)
    return 1.0 - t * t
|
| 42 |
+
|
| 43 |
+
# μννΈλ§₯μ€ ν¨μ
|
| 44 |
+
# Softmax function (column-wise, over axis 0).
def softmax(x):
    """Return softmax probabilities normalized down each column.

    The per-column maximum is subtracted before exponentiating so np.exp
    cannot overflow; the shift cancels out in the normalization.
    """
    shifted = x - x.max(axis=0, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=0, keepdims=True)
|
| 48 |
+
|
| 49 |
+
# ν¬λ‘μ€ μνΈλ‘νΌ μμ€ ν¨μ
|
| 50 |
+
# Cross-entropy loss function.
def cross_entropy_loss(y_pred, y_true):
    """Categorical cross-entropy between predicted probabilities and a
    one-hot label; the 1e-9 epsilon guards against log(0)."""
    log_probs = np.log(y_pred + 1e-9)
    return -np.sum(y_true * log_probs)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# --- 3. NumpyLSTM λͺ¨λΈ ν΄λμ€ ---
|
| 56 |
+
|
| 57 |
+
class NumpyLSTM:
    """Single-layer LSTM with a dense softmax output head, in pure NumPy.

    The four gates share fused parameters: Wx, Wh and b stack the forget,
    input, cell-candidate and output gate rows (in that order) into single
    (4 * hidden_size, ...) arrays, so all gate pre-activations come from
    one matrix multiply per timestep.
    """

    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        """Initialize model weights and gradient buffers.

        - input_size: dimensionality of each per-timestep input vector
        - hidden_size: dimensionality of the hidden and cell state vectors
        - output_size: dimensionality of the output (number of classes)
        - learning_rate: step size used by update()
        """
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # LSTM parameters (forget, input, cell-candidate, output gates).
        # Each gate sees both the input x and the previous hidden state h,
        # so the per-gate weight matrices are stacked into one array each.
        self.Wx = np.random.randn(4 * hidden_size, input_size) * 0.1
        self.Wh = np.random.randn(4 * hidden_size, hidden_size) * 0.1
        self.b = np.zeros((4 * hidden_size, 1))

        # Dense (output) layer parameters.
        self.Why = np.random.randn(output_size, hidden_size) * 0.1
        self.by = np.zeros((output_size, 1))

        # Gradient buffers (filled by backward(), consumed by update()).
        self.dWx, self.dWh, self.db = np.zeros_like(self.Wx), np.zeros_like(self.Wh), np.zeros_like(self.b)
        self.dWhy, self.dby = np.zeros_like(self.Why), np.zeros_like(self.by)

    def forward(self, inputs, y_true):
        """Run the forward pass and cache intermediates for backward().

        - inputs: 2D array of shape (sequence_length, input_size)
        - y_true: one-hot label of shape (output_size, 1)
        Returns (loss, y_pred) where y_pred is the softmax probability column.
        """
        self.inputs = inputs
        self.y_true = y_true
        seq_length = inputs.shape[0]

        # Hidden and cell states per timestep; index -1 holds the initial
        # zero states so timestep 0 can read h_states[-1] / c_states[-1].
        self.h_states, self.c_states = {}, {}
        self.h_states[-1] = np.zeros((self.hidden_size, 1))
        self.c_states[-1] = np.zeros((self.hidden_size, 1))

        # Intermediate values needed by the backward pass.
        self.z_s, self.f_s, self.i_s, self.c_tilde_s, self.o_s = {}, {}, {}, {}, {}

        # 1. LSTM cell forward pass (in time order).
        for t in range(seq_length):
            xt = self.inputs[t].reshape(-1, 1)  # input column at timestep t
            h_prev = self.h_states[t - 1]
            c_prev = self.c_states[t - 1]

            # (1) Fused linear combination: all four gate pre-activations
            # (f, i, c_tilde, o) computed with a single matrix product.
            self.z_s[t] = self.Wx @ xt + self.Wh @ h_prev + self.b

            # (2) Gate activations (row blocks of z, in stacking order).
            # Forget gate
            self.f_s[t] = sigmoid(self.z_s[t][:self.hidden_size, :])
            # Input gate
            self.i_s[t] = sigmoid(self.z_s[t][self.hidden_size:2*self.hidden_size, :])
            # Cell candidate
            self.c_tilde_s[t] = tanh(self.z_s[t][2*self.hidden_size:3*self.hidden_size, :])
            # Output gate
            self.o_s[t] = sigmoid(self.z_s[t][3*self.hidden_size:, :])

            # (3) Cell state and hidden state updates.
            self.c_states[t] = self.f_s[t] * c_prev + self.i_s[t] * self.c_tilde_s[t]
            self.h_states[t] = self.o_s[t] * tanh(self.c_states[t])

        # 2. Dense layer & softmax forward pass (on the final hidden state only).
        self.final_h = self.h_states[seq_length - 1]
        self.logits = self.Why @ self.final_h + self.by
        self.y_pred = softmax(self.logits)

        # 3. Loss computation.
        self.loss = cross_entropy_loss(self.y_pred, self.y_true)

        return self.loss, self.y_pred

    def backward(self):
        """Backpropagation through time (BPTT); fills the gradient buffers.

        Must be called after forward(), which caches the per-timestep
        intermediates this method reads.
        """
        # Reset gradient buffers.
        self.dWx, self.dWh, self.db = np.zeros_like(self.Wx), np.zeros_like(self.Wh), np.zeros_like(self.b)
        self.dWhy, self.dby = np.zeros_like(self.Why), np.zeros_like(self.by)

        # Gradients flowing back from the following (t+1) timestep.
        dh_next = np.zeros_like(self.h_states[0])
        dc_next = np.zeros_like(self.c_states[0])

        # 1. Dense & softmax layer backward pass.
        # For softmax + cross-entropy, dL/dlogits simplifies to y_pred - y_true.
        d_logits = self.y_pred - self.y_true
        self.dWhy = d_logits @ self.final_h.T
        self.dby = d_logits
        dh_final = self.Why.T @ d_logits  # gradient w.r.t. the final hidden state

        # Seed the recurrent gradient with the output-layer contribution.
        dh_next += dh_final

        # 2. LSTM cell backward pass (in reverse time order).
        for t in reversed(range(len(self.inputs))):
            xt = self.inputs[t].reshape(-1, 1)
            h_prev = self.h_states[t - 1]
            c_prev = self.c_states[t - 1]

            # (1) Gradients w.r.t. the hidden and cell states.
            # h_t = o_t * tanh(c_t), so dh splits into an o part and a c part.
            do = dh_next * tanh(self.c_states[t])
            dc = dc_next + dh_next * self.o_s[t] * tanh_derivative(self.c_states[t])

            # (2) Gradients w.r.t. each gate's pre-activation z.
            dz_o = do * sigmoid_derivative(self.z_s[t][3*self.hidden_size:, :])
            dc_tilde = dc * self.i_s[t]
            dz_c = dc_tilde * tanh_derivative(self.z_s[t][2*self.hidden_size:3*self.hidden_size, :])
            di = dc * self.c_tilde_s[t]
            dz_i = di * sigmoid_derivative(self.z_s[t][self.hidden_size:2*self.hidden_size, :])
            df = dc * c_prev
            dz_f = df * sigmoid_derivative(self.z_s[t][:self.hidden_size, :])

            # (3) Stack the four gate gradients back into fused-z row order.
            dz = np.vstack((dz_f, dz_i, dz_c, dz_o))

            # (4) Accumulate parameter gradients across timesteps.
            self.dWx += dz @ xt.T
            self.dWh += dz @ h_prev.T
            self.db += dz

            # (5) Gradients passed to the previous timestep.
            dh_next = self.Wh.T @ dz
            dc_next = self.f_s[t] * dc

        # Clip gradients in place to prevent exploding gradients.
        for dparam in [self.dWx, self.dWh, self.db, self.dWhy, self.dby]:
            np.clip(dparam, -5, 5, out=dparam)

    def update(self):
        """Apply one vanilla gradient-descent step using the stored gradients."""
        self.Wx -= self.learning_rate * self.dWx
        self.Wh -= self.learning_rate * self.dWh
        self.b -= self.learning_rate * self.db
        self.Why -= self.learning_rate * self.dWhy
        self.by -= self.learning_rate * self.dby
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
# --- 4. λͺ¨λΈ νμ΅ μ€ν ---
|
| 196 |
+
# --- 4. Model training run ---
if __name__ == '__main__':
    # Create the model instance.
    lstm = NumpyLSTM(input_size=input_dim, hidden_size=hidden_dim, output_size=output_dim, learning_rate=learning_rate)

    # Training loop: repeatedly fit the single (sample_input, sample_y) pair.
    for epoch in range(epochs):
        # 1. Forward pass (caches intermediates needed for backprop).
        loss, y_pred = lstm.forward(sample_input, sample_y)

        # 2. Backward pass (BPTT gradient computation).
        lstm.backward()

        # 3. Weight update (gradient descent).
        lstm.update()

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")
            print(f"Predicted Probs: {y_pred.flatten()}")
            print(f"Predicted Class: {np.argmax(y_pred)}")
            print("-" * 20)

    print("\n--- Training Finished ---")
    # One final forward pass to report the trained model's prediction.
    final_loss, final_y_pred = lstm.forward(sample_input, sample_y)
    print(f"Final Loss: {final_loss:.4f}")
    print(f"Final Prediction: Class {np.argmax(final_y_pred)} (Probs: {final_y_pred.flatten()})")
    print(f"True Label: Class {np.argmax(sample_y)}")
|