# Transformer.py — minimal NumPy transformer demo
# (Hugging Face upload by gihakkk, commit 6140a7a)
import numpy as np
# --- 0. Basic settings ---
batch_size = 4 # batch size B
d_model = 512 # model dimension D
d_k = 64 # per-head dimension (d_model / num_heads)
d_ff = 2048 # FFN inner dimension
vocab_size = 10000 # vocabulary size V
enc_seq_len = 10 # encoder sequence length S_enc
num_heads = 8
# Example input data: shape [B, S_enc, D]
input_data = np.random.randn(batch_size, enc_seq_len, d_model) * 0.1
# --- 1. Helper functions and weight initialization ---
def init_weights(shape):
    """Simplified Xavier-style initializer.

    1-D shapes (bias vectors) are zero-initialized; anything else is drawn
    from a normal distribution scaled by sqrt(1 / fan_in), where fan_in is
    the first dimension.
    """
    if len(shape) != 1:
        fan_in = shape[0]
        return np.random.randn(*shape) * np.sqrt(1.0 / fan_in)
    return np.zeros(shape)
# --- 2. Core layer implementations ---
def layer_normalization(x, gamma, beta, epsilon=1e-5):
    """Normalize x over its last axis, then apply an affine scale/shift.

    x: [..., D]; gamma and beta broadcast against the normalized tensor.
    epsilon guards against division by zero for constant rows.
    """
    mu = x.mean(axis=-1, keepdims=True)
    centered = x - mu
    var = (centered ** 2).mean(axis=-1, keepdims=True)
    return gamma * (centered / np.sqrt(var + epsilon)) + beta
def scaled_dot_product_attention(Q, K, V, mask=None):
    """Scaled dot-product attention with a numerically stable softmax.

    Q: [B, H, S_q, d_k], K: [B, H, S_k, d_k], V: [B, H, S_k, d_k].
    mask, if given, is added to the raw scores (use large negative values
    to block positions before the softmax).
    Returns (output [B, H, S_q, d_k], attention_weights [B, H, S_q, S_k]).
    """
    # Bug fix: scale by the actual head width of Q instead of the
    # module-level global d_k, so the function works for any head size.
    d_head = Q.shape[-1]
    scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_head)
    if mask is not None:
        scores = scores + mask
    # Subtract the row max before exp for numerical stability.
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
    output = np.matmul(attention_weights, V)  # [B, H, S_q, d_k]
    return output, attention_weights
def multi_head_attention(Q, K, V, W_Q, W_K, W_V, W_O, mask=None):
    """Multi-head attention supporting differing query/key sequence lengths.

    Q: [B, S_q, D], K: [B, S_k, D], V: [B, S_k, D]; the W_* projections are
    [D, D]. mask, if given, is broadcast-added to the attention scores.
    D must be divisible by the module-level num_heads.
    Returns [B, S_q, D].
    """
    B, S_q, D = Q.shape
    _, S_k, _ = K.shape  # K and V must share batch size and sequence length
    # Per-head width derived from the input, instead of the global d_k.
    d_head = D // num_heads

    # 1. Linear projections.
    Q_proj = np.matmul(Q, W_Q)  # [B, S_q, D]
    K_proj = np.matmul(K, W_K)  # [B, S_k, D]
    V_proj = np.matmul(V, W_V)  # [B, S_k, D]

    # 2. Split into heads: [B, num_heads, S, d_head].
    Q_multi = Q_proj.reshape(B, S_q, num_heads, d_head).transpose(0, 2, 1, 3)
    K_multi = K_proj.reshape(B, S_k, num_heads, d_head).transpose(0, 2, 1, 3)
    V_multi = V_proj.reshape(B, S_k, num_heads, d_head).transpose(0, 2, 1, 3)

    # 3. Scaled dot-product attention per head.
    attended, _ = scaled_dot_product_attention(Q_multi, K_multi, V_multi, mask)

    # 4. Concatenate heads back to [B, S_q, D].
    # Bug fix: the merge previously reshaped with the global d_model, which
    # breaks whenever the projection width differs from d_model.
    merged = attended.transpose(0, 2, 1, 3).reshape(B, S_q, D)

    # 5. Final output projection.
    return np.matmul(merged, W_O)
def feed_forward_network(x, W1, b1, W2, b2):
    """Position-wise feed-forward network: linear -> ReLU -> linear.

    x: [B, S, D]; W1: [D, d_ff], W2: [d_ff, D]; biases broadcast over [B, S].
    """
    hidden = np.maximum(np.matmul(x, W1) + b1, 0.0)  # ReLU activation
    return np.matmul(hidden, W2) + b2
# --- 3. Weight setup (all parameters for one encoder and one decoder layer) ---
# Encoder weights.
W_Q_enc, W_K_enc, W_V_enc, W_O_enc = (
    init_weights((d_model, d_model)),
    init_weights((d_model, d_model)),
    init_weights((d_model, d_model)),
    init_weights((d_model, d_model)),
)
W1_enc, W2_enc = init_weights((d_model, d_ff)), init_weights((d_ff, d_model))
# Bug fix: biases start at zero. init_weights((1, n)) takes the 2-D random
# path with scale sqrt(1/1)=1, yielding unit-variance "biases" instead of
# the zeros the initializer's bias path intends.
b1_enc, b2_enc = np.zeros((1, d_ff)), np.zeros((1, d_model))
gamma_enc1, beta_enc1 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
gamma_enc2, beta_enc2 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
# Decoder weights.
W_Q_dec_self, W_K_dec_self, W_V_dec_self, W_O_dec_self = (
    init_weights((d_model, d_model)),
    init_weights((d_model, d_model)),
    init_weights((d_model, d_model)),
    init_weights((d_model, d_model)),
)
W_Q_dec_cross, W_K_dec_cross, W_V_dec_cross, W_O_dec_cross = (
    init_weights((d_model, d_model)),
    init_weights((d_model, d_model)),
    init_weights((d_model, d_model)),
    init_weights((d_model, d_model)),
)
W1_dec, W2_dec = init_weights((d_model, d_ff)), init_weights((d_ff, d_model))
b1_dec, b2_dec = np.zeros((1, d_ff)), np.zeros((1, d_model))
gamma_dec1, beta_dec1 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
gamma_dec2, beta_dec2 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
gamma_dec3, beta_dec3 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
# --- 4. Encoder block (with Add & Norm) ---
def encoder_block(x):
    """One encoder layer: self-attention + FFN, each with Add & Norm.

    x: [B, S_enc, D] -> [B, S_enc, D].
    """
    # Sub-layer 1: multi-head self-attention with a residual connection.
    attn = multi_head_attention(x, x, x, W_Q_enc, W_K_enc, W_V_enc, W_O_enc)
    normed = layer_normalization(x + attn, gamma_enc1, beta_enc1)
    # Sub-layer 2: position-wise feed-forward with a residual connection.
    ffn = feed_forward_network(normed, W1_enc, b1_enc, W2_enc, b2_enc)
    return layer_normalization(normed + ffn, gamma_enc2, beta_enc2)
# --- 5. Decoder block (with Add & Norm) ---
def create_look_ahead_mask(size):
    """Build an additive causal mask of shape [1, 1, size, size].

    Entries strictly above the diagonal (future positions) are -1e9 so the
    softmax assigns them ~zero weight; everything else is 0.
    """
    future = np.triu(np.ones((size, size)), k=1) * -1e9
    return future.reshape(1, 1, size, size)
def decoder_block(x, enc_output, look_ahead_mask):
    """One decoder layer: masked self-attention, cross-attention, FFN.

    x: [B, S_target, D]; enc_output: [B, S_source, D]. Each sub-layer is
    followed by a residual connection and layer normalization.
    Returns [B, S_target, D].
    """
    # Sub-layer 1: masked multi-head self-attention (the causal mask blocks
    # attention to future target positions).
    self_attn = multi_head_attention(
        x, x, x,
        W_Q_dec_self, W_K_dec_self, W_V_dec_self, W_O_dec_self,
        mask=look_ahead_mask,
    )
    step1 = layer_normalization(x + self_attn, gamma_dec1, beta_dec1)
    # Sub-layer 2: encoder-decoder cross-attention — queries come from the
    # decoder stream (step1), keys/values from the encoder output.
    cross_attn = multi_head_attention(
        step1, enc_output, enc_output,
        W_Q_dec_cross, W_K_dec_cross, W_V_dec_cross, W_O_dec_cross,
        mask=None,
    )
    step2 = layer_normalization(step1 + cross_attn, gamma_dec2, beta_dec2)
    # Sub-layer 3: position-wise feed-forward network.
    ffn = feed_forward_network(step2, W1_dec, b1_dec, W2_dec, b2_dec)
    return layer_normalization(step2 + ffn, gamma_dec3, beta_dec3)
# --- 6. Final output head (Linear + Softmax) ---
W_linear = init_weights((d_model, vocab_size))
# Bug fix: zero bias. init_weights((1, V)) would return unit-variance noise
# because the initializer's zero path only applies to 1-D shapes.
b_linear = np.zeros((1, vocab_size))
def final_output_layer(x):
    """Project decoder states to per-token vocabulary probabilities.

    x: [B, S, D] -> softmax over the vocabulary axis, [B, S, V].
    """
    logits = np.matmul(x, W_linear) + b_linear  # [B, S, V]
    # Stable softmax: shift by the row max before exponentiating.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=-1, keepdims=True)
# --- 7. End-to-end transformer flow simulation ---
print("--- Add & Norm ์ ์šฉ๋œ ํŠธ๋žœ์Šคํฌ๋จธ ์‹œ๋ฎฌ๋ ˆ์ด์…˜ ์‹œ์ž‘ ---")
# 1. Run the encoder.
# input_data: (4, 10, 512)
enc_output_final = encoder_block(input_data)
print(f"์ธ์ฝ”๋” ์ตœ์ข… ์ถœ๋ ฅ ํ˜•ํƒœ (K, V ์†Œ์Šค): {enc_output_final.shape}")
# 2. Prepare the decoder inputs.
dec_seq_len = 5 # decoder sequence length
decoder_input_data = np.random.randn(batch_size, dec_seq_len, d_model) * 0.1
look_ahead_mask = create_look_ahead_mask(dec_seq_len) # [1, 1, 5, 5]
# 3. Run the decoder.
# decoder_input_data (Q): (4, 5, 512)
# enc_output_final (K, V): (4, 10, 512)
# Cross-attention works even though Q (S=5) and K/V (S=10) lengths differ.
dec_output_final = decoder_block(decoder_input_data, enc_output_final, look_ahead_mask)
print(f"๋””์ฝ”๋” ์ตœ์ข… ์ถœ๋ ฅ ํ˜•ํƒœ: {dec_output_final.shape}")
# 4. Final output probabilities.
probabilities = final_output_layer(dec_output_final)
print(f"์ตœ์ข… ํ™•๋ฅ  ๋ถ„ํฌ ํ˜•ํƒœ (B x S_target x V): {probabilities.shape}")
print("\n**์™„๋ฃŒ**")