Upload Transformer.py
Transformer.py ADDED (+205 -0)

import numpy as np

# --- 0. Basic Settings ---
batch_size = 4      # batch size B
d_model = 512       # model dimension D
d_k = 64            # per-head dimension (d_model / num_heads)
d_ff = 2048         # FFN inner dimension
vocab_size = 10000  # vocabulary size V
enc_seq_len = 10    # encoder sequence length S_enc
num_heads = 8

# Example input data: shape [B, S_enc, D]
input_data = np.random.randn(batch_size, enc_seq_len, d_model) * 0.1

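# Note: the multi-head split below requires d_model == num_heads * d_k (8 * 64 = 512).
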
# --- 1. Helper Functions and Weight Initialization ---

def init_weights(shape):
    """Simplified He/Xavier-style initialization."""
    if len(shape) == 1:
        return np.zeros(shape)
    # np.sqrt(2.0 / shape[0]) would be He init; np.sqrt(1.0 / shape[0]) is the Xavier-style scale used here
    return np.random.randn(*shape) * np.sqrt(1.0 / shape[0])

# --- 2. Core Layer Implementations ---

def layer_normalization(x, gamma, beta, epsilon=1e-5):
    """Layer Normalization."""
    # x shape: [B, S, D]; normalize over the feature (last) axis
    mean = np.mean(x, axis=-1, keepdims=True)
    variance = np.mean((x - mean) ** 2, axis=-1, keepdims=True)
    x_normalized = (x - mean) / np.sqrt(variance + epsilon)
    output = gamma * x_normalized + beta
    return output

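# With gamma = 1 and beta = 0, each token vector leaves layer_normalization with
# (approximately) zero mean and unit variance along the feature axis.
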
def scaled_dot_product_attention(Q, K, V, mask=None):
    """Scaled Dot-Product Attention (supports batched inputs)."""
    # Q: [B, H, S_q, d_k], K: [B, H, S_k, d_k], V: [B, H, S_k, d_k]
    scores = np.matmul(Q, K.transpose(0, 1, 3, 2))  # [B, H, S_q, S_k]
    scores = scores / np.sqrt(d_k)

    if mask is not None:
        scores = scores + mask

    # Numerically stable softmax over the last axis
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    output = np.matmul(attention_weights, V)  # [B, H, S_q, d_k]
    return output, attention_weights

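# In equation form: Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V.
# With d_k = 64 the scores are scaled by 1/8, and every row of attention_weights sums to 1.
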
def multi_head_attention(Q, K, V, W_Q, W_K, W_V, W_O, mask=None):
    """
    Multi-Head Attention (fix: handles dynamic sequence lengths)
    Q: [B, S_q, D], K: [B, S_k, D], V: [B, S_k, D]
    """

    # Key fix: read the shapes dynamically from Q, K, V
    B_q, S_q, D_q = Q.shape
    B_k, S_k, D_k = K.shape
    B_v, S_v, D_v = V.shape
    # (B_q, B_k, B_v must all equal batch_size)
    # (S_k and S_v must be equal)

    # 1. Linear projections
    Q_proj = np.matmul(Q, W_Q)  # [B_q, S_q, D]
    K_proj = np.matmul(K, W_K)  # [B_k, S_k, D]
    V_proj = np.matmul(V, W_V)  # [B_v, S_v, D]

    # 2. Split into heads and rearrange dimensions
    # Q: [B_q, num_heads, S_q, d_k]
    Q_multi = Q_proj.reshape(B_q, S_q, num_heads, d_k).transpose(0, 2, 1, 3)
    # K: [B_k, num_heads, S_k, d_k]
    K_multi = K_proj.reshape(B_k, S_k, num_heads, d_k).transpose(0, 2, 1, 3)
    # V: [B_v, num_heads, S_v, d_k]
    V_multi = V_proj.reshape(B_v, S_v, num_heads, d_k).transpose(0, 2, 1, 3)

    # 3. Attention computation
    attended_output, _ = scaled_dot_product_attention(Q_multi, K_multi, V_multi, mask)

    # 4. Concatenate the heads: [B_q, S_q, D]
    attended_output = attended_output.transpose(0, 2, 1, 3).reshape(B_q, S_q, d_model)

    # 5. Final output projection
    output = np.matmul(attended_output, W_O)
    return output

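# Head split/merge summary: [B, S, D] -> reshape -> [B, S, H, d_k] -> transpose ->
# [B, H, S, d_k] for attention, then the inverse transpose + reshape concatenates the
# heads back into [B, S_q, D] before the final W_O projection.
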
def feed_forward_network(x, W1, b1, W2, b2):
    """Feed-Forward Network (FFN)"""
    hidden = np.matmul(x, W1) + b1
    hidden = np.maximum(0, hidden)  # ReLU
    output = np.matmul(hidden, W2) + b2
    return output

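# In equation form: FFN(x) = max(0, x W1 + b1) W2 + b2, applied independently at each
# position, expanding d_model = 512 to d_ff = 2048 and projecting back to 512.
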
# --- 3. Weight Setup (all weights for a single layer) ---

# Encoder weights
W_Q_enc, W_K_enc, W_V_enc, W_O_enc = init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model))
W1_enc, W2_enc = init_weights((d_model, d_ff)), init_weights((d_ff, d_model))
b1_enc, b2_enc = init_weights((1, d_ff)), init_weights((1, d_model))
gamma_enc1, beta_enc1 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
gamma_enc2, beta_enc2 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))

# Decoder weights
W_Q_dec_self, W_K_dec_self, W_V_dec_self, W_O_dec_self = init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model))
W_Q_dec_cross, W_K_dec_cross, W_V_dec_cross, W_O_dec_cross = init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model)), init_weights((d_model, d_model))
W1_dec, W2_dec = init_weights((d_model, d_ff)), init_weights((d_ff, d_model))
b1_dec, b2_dec = init_weights((1, d_ff)), init_weights((1, d_model))
gamma_dec1, beta_dec1 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
gamma_dec2, beta_dec2 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))
gamma_dec3, beta_dec3 = np.ones((1, 1, d_model)), np.zeros((1, 1, d_model))

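# Note: init_weights only zero-initializes 1-D shapes, so the (1, d_ff) and (1, d_model)
# bias tensors above start from random values rather than zeros.
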
# --- 4. Encoder Block (with Add & Norm) ---

def encoder_block(x):
    # x shape: [B, S_enc, D]

    # Sub-layer 1: Multi-Head Self-Attention
    attn_output = multi_head_attention(x, x, x, W_Q_enc, W_K_enc, W_V_enc, W_O_enc)

    # 1. Add & Norm
    x_1 = layer_normalization(attn_output + x, gamma_enc1, beta_enc1)

    # Sub-layer 2: Feed-Forward Network
    ffn_output = feed_forward_network(x_1, W1_enc, b1_enc, W2_enc, b2_enc)

    # 2. Add & Norm
    output = layer_normalization(ffn_output + x_1, gamma_enc2, beta_enc2)

    return output

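# Both sub-layers use the post-LN arrangement of the original Transformer:
# output = LayerNorm(Sublayer(x) + x), i.e. residual addition followed by normalization.
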
# --- 5. Decoder Block (with Add & Norm) ---

def create_look_ahead_mask(size):
    """Create the look-ahead mask (masks out future tokens)."""
    mask = np.triu(np.ones((size, size)), k=1)
    return (mask * -1e9)[np.newaxis, np.newaxis, :, :]  # [1, 1, S, S]

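# Example: create_look_ahead_mask(3) yields (before the [1, 1, S, S] reshape)
#   [[0, -1e9, -1e9],
#    [0,    0, -1e9],
#    [0,    0,    0]]
# so, once added to the scores, position i can only attend to positions <= i.
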
def decoder_block(x, enc_output, look_ahead_mask):
    # x shape: [B, S_target, D], enc_output shape: [B, S_source, D]

    # Sub-layer 1: Masked Multi-Head Self-Attention
    self_attn_output = multi_head_attention(
        x, x, x, W_Q_dec_self, W_K_dec_self, W_V_dec_self, W_O_dec_self, mask=look_ahead_mask
    )

    # 1. Add & Norm
    x_1 = layer_normalization(self_attn_output + x, gamma_dec1, beta_dec1)

    # Sub-layer 2: Multi-Head Encoder-Decoder Attention (Cross-Attention)
    # Q: decoder output (x_1); K, V: encoder output (enc_output)
    cross_attn_output = multi_head_attention(
        x_1, enc_output, enc_output, W_Q_dec_cross, W_K_dec_cross, W_V_dec_cross, W_O_dec_cross, mask=None
    )

    # 2. Add & Norm (residual connection uses x_1)
    x_2 = layer_normalization(cross_attn_output + x_1, gamma_dec2, beta_dec2)

    # Sub-layer 3: FFN
    ffn_output = feed_forward_network(x_2, W1_dec, b1_dec, W2_dec, b2_dec)

    # 3. Add & Norm
    output = layer_normalization(ffn_output + x_2, gamma_dec3, beta_dec3)

    return output

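# In cross-attention the query length (S_target) and key/value length (S_source) may
# differ: the attention weights have shape [B, H, S_target, S_source], and the block
# output keeps the decoder shape [B, S_target, D].
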
# --- 6. Final Output (Linear + Softmax) ---

W_linear = init_weights((d_model, vocab_size))
b_linear = init_weights((1, vocab_size))

def final_output_layer(x):
    # x: [B, S, D]
    logits = np.matmul(x, W_linear) + b_linear  # [B, S, V]

    # Softmax over the vocabulary axis (numerically stable)
    exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    probabilities = exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)

    return probabilities

# --- 7. Full Transformer Flow Simulation ---

print("--- Starting Transformer simulation with Add & Norm ---")

# 1. Run the encoder
# input_data: (4, 10, 512)
enc_output_final = encoder_block(input_data)
print(f"Encoder final output shape (source of K, V): {enc_output_final.shape}")

# 2. Prepare the decoder input
dec_seq_len = 5  # decoder sequence length
decoder_input_data = np.random.randn(batch_size, dec_seq_len, d_model) * 0.1
look_ahead_mask = create_look_ahead_mask(dec_seq_len)  # [1, 1, 5, 5]

# 3. Run the decoder
# decoder_input_data (Q): (4, 5, 512)
# enc_output_final (K, V): (4, 10, 512)
# Cross-attention works even though Q (S=5) and K/V (S=10) have different lengths
dec_output_final = decoder_block(decoder_input_data, enc_output_final, look_ahead_mask)
print(f"Decoder final output shape: {dec_output_final.shape}")

# 4. Final output
probabilities = final_output_layer(dec_output_final)
print(f"Final probability distribution shape (B x S_target x V): {probabilities.shape}")

print("\n**Done**")
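# Illustration: a greedy decoding step would simply pick the most likely token at each
# target position, e.g. predicted_ids = np.argmax(probabilities, axis=-1)  # shape (4, 5)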