gihakkk committed f1b07c4 (verified) · 1 parent: 40ab3ba

Upload LSTM.py

Files changed (1): LSTM.py (+221, new file)

import numpy as np

# --- 1. Sample data and hyperparameter definitions ---
input_dim = 3
hidden_dim = 4
output_dim = 2
sequence_length = 5
learning_rate = 0.01
epochs = 500

# Toy task: if the total sum of the sequence exceeds half of its maximum
# possible value, the label is Class 0 ([1, 0]); otherwise Class 1 ([0, 1]).
sample_input = np.random.rand(sequence_length, input_dim)

if np.sum(sample_input) > (sequence_length * input_dim / 2):
    sample_y = np.array([1, 0]).reshape(-1, 1)  # Class 0
else:
    sample_y = np.array([0, 1]).reshape(-1, 1)  # Class 1

print(f"Sample Input Shape: {sample_input.shape}")
print(f"True Label: Class {np.argmax(sample_y)}")
print("-" * 30)


# --- 2. Helper functions (activations and loss) ---

# Sigmoid activation
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Derivative of the sigmoid
def sigmoid_derivative(x):
    s = sigmoid(x)
    return s * (1 - s)

# Hyperbolic tangent (tanh) activation
def tanh(x):
    return np.tanh(x)

# Derivative of tanh
def tanh_derivative(x):
    return 1 - np.tanh(x)**2

# Softmax
def softmax(x):
    # Subtract the column max for numerical stability (prevents overflow)
    e_x = np.exp(x - np.max(x, axis=0, keepdims=True))
    return e_x / np.sum(e_x, axis=0, keepdims=True)
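# For example, softmax(np.array([[1000.0], [1001.0]])) would overflow without
# the shift; with it we evaluate exp([-1, 0]) and get probs of about [0.269, 0.731].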

# Cross-entropy loss
def cross_entropy_loss(y_pred, y_true):
    # Add a tiny epsilon to y_pred to avoid log(0)
    return -np.sum(y_true * np.log(y_pred + 1e-9))


# --- 3. NumpyLSTM model class ---

class NumpyLSTM:
    # Initializes the model's weights and parameters.
    # - input_size: dimension of the input vector
    # - hidden_size: dimension of the hidden-state and cell-state vectors
    # - output_size: dimension of the output vector (number of classes)
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        # LSTM parameters (forget, input, cell-candidate, and output gates).
        # Every gate consumes both the input (x) and the previous hidden
        # state (h), so the four gates share stacked weight matrices.
        self.Wx = np.random.randn(4 * hidden_size, input_size) * 0.1
        self.Wh = np.random.randn(4 * hidden_size, hidden_size) * 0.1
        self.b = np.zeros((4 * hidden_size, 1))
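        # Row layout of the stacked matrices (H = hidden_size):
        #   rows [0, H)   -> forget gate f
        #   rows [H, 2H)  -> input gate i
        #   rows [2H, 3H) -> cell candidate c_tilde
        #   rows [3H, 4H) -> output gate o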

        # Dense (output) layer parameters
        self.Why = np.random.randn(output_size, hidden_size) * 0.1
        self.by = np.zeros((output_size, 1))

        # Buffers that will hold the gradients
        self.dWx, self.dWh, self.db = np.zeros_like(self.Wx), np.zeros_like(self.Wh), np.zeros_like(self.b)
        self.dWhy, self.dby = np.zeros_like(self.Why), np.zeros_like(self.by)

    # Runs the forward pass.
    # - inputs: 2D numpy array of shape (sequence length, input dimension)
    # - y_true: one-hot encoded label of shape (output dimension, 1)
    def forward(self, inputs, y_true):
        self.inputs = inputs
        self.y_true = y_true
        seq_length = inputs.shape[0]

        # Dictionaries holding the hidden and cell states per timestep
        self.h_states, self.c_states = {}, {}
        self.h_states[-1] = np.zeros((self.hidden_size, 1))
        self.c_states[-1] = np.zeros((self.hidden_size, 1))

        # Intermediate values cached for backpropagation
        self.z_s, self.f_s, self.i_s, self.c_tilde_s, self.o_s = {}, {}, {}, {}, {}

        # 1. LSTM cell forward pass (in time order)
        for t in range(seq_length):
            xt = self.inputs[t].reshape(-1, 1)  # input at the current timestep
            h_prev = self.h_states[t - 1]
            c_prev = self.c_states[t - 1]

            # (1) Linear combination for the gates:
            # computes all four gates (f, i, c_tilde, o) in a single product
            self.z_s[t] = self.Wx @ xt + self.Wh @ h_prev + self.b

            # (2) Gate activations
            # Forget gate
            self.f_s[t] = sigmoid(self.z_s[t][:self.hidden_size, :])
            # Input gate
            self.i_s[t] = sigmoid(self.z_s[t][self.hidden_size:2*self.hidden_size, :])
            # Cell candidate
            self.c_tilde_s[t] = tanh(self.z_s[t][2*self.hidden_size:3*self.hidden_size, :])
            # Output gate
            self.o_s[t] = sigmoid(self.z_s[t][3*self.hidden_size:, :])

            # (3) Update the cell and hidden states
            self.c_states[t] = self.f_s[t] * c_prev + self.i_s[t] * self.c_tilde_s[t]
            self.h_states[t] = self.o_s[t] * tanh(self.c_states[t])
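            # In equation form (* is element-wise):
            #   f_t  = sigmoid(W_f x_t + U_f h_{t-1} + b_f)   (likewise i_t, o_t)
            #   c~_t = tanh(W_c x_t + U_c h_{t-1} + b_c)
            #   c_t  = f_t * c_{t-1} + i_t * c~_t
            #   h_t  = o_t * tanh(c_t)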

        # 2. Dense layer & softmax forward pass
        self.final_h = self.h_states[seq_length - 1]
        self.logits = self.Why @ self.final_h + self.by
        self.y_pred = softmax(self.logits)

        # 3. Compute the loss
        self.loss = cross_entropy_loss(self.y_pred, self.y_true)

        return self.loss, self.y_pred

    # Runs backpropagation through time (BPTT) to compute the gradients.
    def backward(self):
        # Reset the gradients
        self.dWx, self.dWh, self.db = np.zeros_like(self.Wx), np.zeros_like(self.Wh), np.zeros_like(self.b)
        self.dWhy, self.dby = np.zeros_like(self.Why), np.zeros_like(self.by)

        # Gradients flowing back from the next timestep
        dh_next = np.zeros_like(self.h_states[0])
        dc_next = np.zeros_like(self.c_states[0])

        # 1. Dense & softmax layer backward pass
        d_logits = self.y_pred - self.y_true  # gradient of the loss w.r.t. the logits
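        # With p = softmax(z) and L = -sum(y * log(p)), the combined
        # derivative simplifies to dL/dz = p - y, hence the single subtraction.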
        self.dWhy = d_logits @ self.final_h.T
        self.dby = d_logits
        dh_final = self.Why.T @ d_logits  # gradient w.r.t. the LSTM's final hidden state

        # Seed dh_next with the gradient coming from the output layer
        dh_next += dh_final

        # 2. LSTM cell backward pass (in reverse time order)
        for t in reversed(range(len(self.inputs))):
            xt = self.inputs[t].reshape(-1, 1)
            h_prev = self.h_states[t - 1]
            c_prev = self.c_states[t - 1]

            # (1) Gradients w.r.t. the hidden and cell states
            do = dh_next * tanh(self.c_states[t])
            dc = dc_next + dh_next * self.o_s[t] * tanh_derivative(self.c_states[t])

            # (2) Gradients w.r.t. each gate's pre-activation value (z)
            dz_o = do * sigmoid_derivative(self.z_s[t][3*self.hidden_size:, :])
            dc_tilde = dc * self.i_s[t]
            dz_c = dc_tilde * tanh_derivative(self.z_s[t][2*self.hidden_size:3*self.hidden_size, :])
            di = dc * self.c_tilde_s[t]
            dz_i = di * sigmoid_derivative(self.z_s[t][self.hidden_size:2*self.hidden_size, :])
            df = dc * c_prev
            dz_f = df * sigmoid_derivative(self.z_s[t][:self.hidden_size, :])

            # (3) Stack the four gate gradients back into one vector
            dz = np.vstack((dz_f, dz_i, dz_c, dz_o))

            # (4) Accumulate the parameter gradients
            self.dWx += dz @ xt.T
            self.dWh += dz @ h_prev.T
            self.db += dz

            # (5) Gradients passed on to the previous timestep
            dh_next = self.Wh.T @ dz
            dc_next = self.f_s[t] * dc
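            # c_{t-1} enters c_t only through the f_t * c_{t-1} term, so
            # dL/dc_{t-1} = f_t * dL/dc_t; the hidden-state gradient flows
            # back solely through Wh because only the final h feeds the
            # output layer.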

        # Clip to guard against exploding gradients
        for dparam in [self.dWx, self.dWh, self.db, self.dWhy, self.dby]:
            np.clip(dparam, -5, 5, out=dparam)

    # Updates the parameters with the computed gradients (gradient descent).
    def update(self):
        self.Wx -= self.learning_rate * self.dWx
        self.Wh -= self.learning_rate * self.dWh
        self.b -= self.learning_rate * self.db
        self.Why -= self.learning_rate * self.dWhy
        self.by -= self.learning_rate * self.dby


# --- 4. Train the model ---
if __name__ == '__main__':
    # Create the model instance
    lstm = NumpyLSTM(input_size=input_dim, hidden_size=hidden_dim, output_size=output_dim, learning_rate=learning_rate)

    # Training loop
    for epoch in range(epochs):
        # 1. Forward pass
        loss, y_pred = lstm.forward(sample_input, sample_y)

        # 2. Backward pass
        lstm.backward()

        # 3. Update the weights
        lstm.update()

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")
            print(f"Predicted Probs: {y_pred.flatten()}")
            print(f"Predicted Class: {np.argmax(y_pred)}")
            print("-" * 20)

    print("\n--- Training Finished ---")
    final_loss, final_y_pred = lstm.forward(sample_input, sample_y)
    print(f"Final Loss: {final_loss:.4f}")
    print(f"Final Prediction: Class {np.argmax(final_y_pred)} (Probs: {final_y_pred.flatten()})")
    print(f"True Label: Class {np.argmax(sample_y)}")