gihakkk committed on
Commit
6140a7a
·
verified ·
1 Parent(s): cf1f6fc

Upload Transformer.py

Browse files
Files changed (1) hide show
  1. Transformer.py +205 -0
Transformer.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import numpy as np

# --- 0. Basic settings ---
batch_size = 4 # batch size B
d_model = 512 # model dimension D
d_k = 64 # per-head dimension (d_model / num_heads)
d_ff = 2048 # FFN inner dimension
vocab_size = 10000 # vocabulary size V
enc_seq_len = 10 # encoder sequence length S_enc
num_heads = 8

# Example input data, shape [B, S_enc, D]; scaled by 0.1 to keep values small.
input_data = np.random.randn(batch_size, enc_seq_len, d_model) * 0.1
# --- 1. Helper functions and weight initialisation ---

def init_weights(shape):
    """Simplified Xavier-style weight initialisation.

    1-D shapes (biases) are initialised to zero; matrices are drawn from a
    standard normal distribution scaled by 1/sqrt(fan_in).
    """
    if len(shape) != 1:
        fan_in = shape[0]
        return np.random.randn(*shape) * np.sqrt(1.0 / fan_in)
    return np.zeros(shape)
# --- 2. Core layer implementations ---

def layer_normalization(x, gamma, beta, epsilon=1e-5):
    """Layer normalisation over the last (feature) axis.

    x: [B, S, D]; gamma/beta broadcast against the normalised features.
    epsilon guards against division by zero for constant features.
    """
    mu = x.mean(axis=-1, keepdims=True)
    centered = x - mu
    var = (centered ** 2).mean(axis=-1, keepdims=True)
    normed = centered / np.sqrt(var + epsilon)
    return gamma * normed + beta
def scaled_dot_product_attention(Q, K, V, mask=None):
    """Scaled dot-product attention with batched heads.

    Q: [B, H, S_q, d_head], K: [B, H, S_k, d_head], V: [B, H, S_k, d_head].
    mask: optional additive mask broadcastable to [B, H, S_q, S_k]
    (masked slots hold large negative values such as -1e9).
    Returns (output [B, H, S_q, d_head], attention_weights [B, H, S_q, S_k]).

    Fix: the scale factor is derived from Q's own last dimension instead of
    the module-level global `d_k`, so the function works standalone and for
    any head size (behaviour is unchanged when d_head == d_k).
    """
    d_head = Q.shape[-1]
    scores = np.matmul(Q, K.transpose(0, 1, 3, 2))  # [B, H, S_q, S_k]
    scores = scores / np.sqrt(d_head)

    if mask is not None:
        scores = scores + mask  # additive mask: -1e9 kills masked positions

    # Numerically stable softmax over the key axis.
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    output = np.matmul(attention_weights, V)  # [B, H, S_q, d_head]
    return output, attention_weights
def multi_head_attention(Q, K, V, W_Q, W_K, W_V, W_O, mask=None):
    """
    Multi-head attention supporting different query/key sequence lengths.

    Q: [B, S_q, D], K: [B, S_k, D], V: [B, S_k, D]; W_Q/W_K/W_V/W_O: [D, D].
    Returns [B, S_q, D].

    Fix: the head split and merge use the shapes of the actual inputs
    instead of the module-level globals `d_k`/`d_model`, so any model
    dimension divisible by `num_heads` works (same behaviour for the
    default configuration, where D == d_model and D // num_heads == d_k).
    """
    # Read shapes dynamically from the inputs.
    B_q, S_q, D_q = Q.shape
    B_k, S_k, D_k = K.shape
    B_v, S_v, D_v = V.shape
    # (B_q, B_k, B_v must be the same batch size; S_k must equal S_v.)

    d_head = D_q // num_heads  # per-head dimension derived from the input

    # 1. Linear projections.
    Q_proj = np.matmul(Q, W_Q)  # [B_q, S_q, D]
    K_proj = np.matmul(K, W_K)  # [B_k, S_k, D]
    V_proj = np.matmul(V, W_V)  # [B_v, S_v, D]

    # 2. Split into heads and move the head axis forward:
    #    [B, num_heads, S, d_head].
    Q_multi = Q_proj.reshape(B_q, S_q, num_heads, d_head).transpose(0, 2, 1, 3)
    K_multi = K_proj.reshape(B_k, S_k, num_heads, d_head).transpose(0, 2, 1, 3)
    V_multi = V_proj.reshape(B_v, S_v, num_heads, d_head).transpose(0, 2, 1, 3)

    # 3. Attention per head.
    attended_output, _ = scaled_dot_product_attention(Q_multi, K_multi, V_multi, mask)

    # 4. Merge heads back (concatenate): [B_q, S_q, D_q].
    attended_output = attended_output.transpose(0, 2, 1, 3).reshape(B_q, S_q, D_q)

    # 5. Final output projection.
    output = np.matmul(attended_output, W_O)
    return output
def feed_forward_network(x, W1, b1, W2, b2):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear."""
    pre_activation = np.matmul(x, W1) + b1
    activated = np.maximum(pre_activation, 0)  # ReLU
    return np.matmul(activated, W2) + b2
# --- 3. Weight setup (all weights for a single layer) ---

# Encoder weights: Q/K/V/output projections, FFN matrices and biases,
# and LayerNorm scale/shift (gamma=1, beta=0 is the identity transform).
W_Q_enc, W_K_enc, W_V_enc, W_O_enc = init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model))
W1_enc, W2_enc = init_weights((d_model, d_ff)), init_weights((d_ff, d_model))
b1_enc, b2_enc = init_weights((1, d_ff)), init_weights((1, d_model))
gamma_enc1, beta_enc1 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
gamma_enc2, beta_enc2 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))

# Decoder weights: separate projections for masked self-attention and
# encoder-decoder cross-attention, plus FFN and three LayerNorm pairs.
W_Q_dec_self, W_K_dec_self, W_V_dec_self, W_O_dec_self = init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model))
W_Q_dec_cross, W_K_dec_cross, W_V_dec_cross, W_O_dec_cross = init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model))
W1_dec, W2_dec = init_weights((d_model, d_ff)), init_weights((d_ff, d_model))
b1_dec, b2_dec = init_weights((1, d_ff)), init_weights((1, d_model))
gamma_dec1, beta_dec1 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
gamma_dec2, beta_dec2 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
gamma_dec3, beta_dec3 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
# --- 4. Encoder block (with Add & Norm) ---

def encoder_block(x):
    """One encoder layer: self-attention and FFN, each with Add & Norm.

    x: [B, S_enc, D] -> [B, S_enc, D]
    """
    # Sub-layer 1: multi-head self-attention over the encoder input.
    self_attn = multi_head_attention(x, x, x, W_Q_enc, W_K_enc, W_V_enc, W_O_enc)

    # Add & Norm 1: residual connection around the attention sub-layer.
    normed_1 = layer_normalization(x + self_attn, gamma_enc1, beta_enc1)

    # Sub-layer 2: position-wise feed-forward network.
    ffn_out = feed_forward_network(normed_1, W1_enc, b1_enc, W2_enc, b2_enc)

    # Add & Norm 2: residual connection around the FFN sub-layer.
    return layer_normalization(normed_1 + ffn_out, gamma_enc2, beta_enc2)
# --- 5. Decoder block (with Add & Norm) ---

def create_look_ahead_mask(size):
    """Build an additive look-ahead mask that hides future positions.

    Returns [1, 1, size, size]: 0 on and below the diagonal, -1e9 above it,
    so adding it to attention scores zeroes out future-token weights.
    """
    strictly_upper = np.triu(np.ones((size, size)), k=1)
    additive = strictly_upper * -1e9
    return additive.reshape(1, 1, size, size)
def decoder_block(x, enc_output, look_ahead_mask):
    """One decoder layer: masked self-attention, cross-attention, and FFN,
    each followed by Add & Norm.

    x: [B, S_target, D]; enc_output: [B, S_source, D].
    """
    # Sub-layer 1: masked multi-head self-attention (no peeking ahead).
    self_attn = multi_head_attention(
        x, x, x, W_Q_dec_self, W_K_dec_self, W_V_dec_self, W_O_dec_self, mask=look_ahead_mask
    )
    # Add & Norm 1.
    normed_1 = layer_normalization(x + self_attn, gamma_dec1, beta_dec1)

    # Sub-layer 2: encoder-decoder (cross) attention.
    # Queries come from the decoder stream (normed_1);
    # keys and values come from the encoder output.
    cross_attn = multi_head_attention(
        normed_1, enc_output, enc_output, W_Q_dec_cross, W_K_dec_cross, W_V_dec_cross, W_O_dec_cross, mask=None
    )
    # Add & Norm 2 (residual connects back to normed_1).
    normed_2 = layer_normalization(normed_1 + cross_attn, gamma_dec2, beta_dec2)

    # Sub-layer 3: position-wise feed-forward network.
    ffn_out = feed_forward_network(normed_2, W1_dec, b1_dec, W2_dec, b2_dec)

    # Add & Norm 3.
    return layer_normalization(normed_2 + ffn_out, gamma_dec3, beta_dec3)
# --- 6. Final output (Linear + Softmax) ---

# Projection from the model dimension D to the vocabulary size V.
W_linear = init_weights((d_model, vocab_size))
b_linear = init_weights((1, vocab_size))
def final_output_layer(x):
    """Project decoder output to vocabulary logits and apply softmax.

    x: [B, S, D] -> probabilities [B, S, V].
    """
    logits = np.matmul(x, W_linear) + b_linear  # [B, S, V]

    # Numerically stable softmax over the vocabulary axis.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_logits = np.exp(shifted)
    return exp_logits / exp_logits.sum(axis=-1, keepdims=True)
# --- 7. Full transformer flow simulation ---

print("--- Add & Norm ์ ์šฉ๋œ ํŠธ๋žœ์Šคํฌ๋จธ ์‹œ๋ฎฌ๋ ˆ์ด์…˜ ์‹œ์ž‘ ---")

# 1. Run the encoder.
# input_data: (4, 10, 512)
enc_output_final = encoder_block(input_data)
print(f"์ธ์ฝ”๋” ์ตœ์ข… ์ถœ๋ ฅ ํ˜•ํƒœ (K, V ์†Œ์Šค): {enc_output_final.shape}")

# 2. Prepare the decoder input.
dec_seq_len = 5 # decoder sequence length
decoder_input_data = np.random.randn(batch_size, dec_seq_len, d_model) * 0.1
look_ahead_mask = create_look_ahead_mask(dec_seq_len) # [1, 1, 5, 5]

# 3. Run the decoder.
# decoder_input_data (Q): (4, 5, 512)
# enc_output_final (K, V): (4, 10, 512)
# Cross-attention works even though Q (S=5) and K/V (S=10) differ in length.
dec_output_final = decoder_block(decoder_input_data, enc_output_final, look_ahead_mask)
print(f"๋””์ฝ”๋” ์ตœ์ข… ์ถœ๋ ฅ ํ˜•ํƒœ: {dec_output_final.shape}")

# 4. Final output probabilities over the vocabulary.
probabilities = final_output_layer(dec_output_final)
print(f"์ตœ์ข… ํ™•๋ฅ  ๋ถ„ํฌ ํ˜•ํƒœ (B x S_target x V): {probabilities.shape}")

print("\n**์™„๋ฃŒ**")