"""权重初始化:LLM 模型权重初始化策略""" import torch import torch.nn as nn import math def init_weights(module, std=0.02): """ 初始化模型权重(适用于 LLM,参考 GPT/LLaMA) 参数: module: PyTorch 模块 std: 正态分布的标准差(默认: 0.02) 初始化策略: - nn.Embedding: 正态分布 N(0, std) - nn.Linear: 正态分布 N(0, std),偏置初始化为 0 - RMSNorm: 可学习参数(scale)初始化为 1.0(RMSNorm 会自动处理) """ if isinstance(module, nn.Embedding): # 词嵌入层:正态分布初始化 nn.init.normal_(module.weight, mean=0.0, std=std) elif isinstance(module, nn.Linear): # 线性层:权重正态分布初始化,偏置初始化为 0 nn.init.normal_(module.weight, mean=0.0, std=std) if module.bias is not None: nn.init.zeros_(module.bias) def init_weights_with_scaling(module, hidden_size=None, std=0.02): """ 初始化模型权重(带缩放,适用于输出层) 参数: module: PyTorch 模块 hidden_size: 隐藏层维度(用于输出层缩放) std: 基础标准差(默认: 0.02) 初始化策略: - nn.Embedding: 正态分布 N(0, std) - nn.Linear: - 输出层(如果 hidden_size 提供): N(0, std / sqrt(hidden_size)) - 其他层: N(0, std) - 偏置: 初始化为 0 """ if isinstance(module, nn.Embedding): # 词嵌入层:正态分布初始化 nn.init.normal_(module.weight, mean=0.0, std=std) elif isinstance(module, nn.Linear): # 线性层 if hidden_size is not None: # 输出层:使用缩放的标准差 output_std = std / math.sqrt(hidden_size) nn.init.normal_(module.weight, mean=0.0, std=output_std) else: # 普通线性层:标准正态分布初始化 nn.init.normal_(module.weight, mean=0.0, std=std) # 偏置初始化为 0 if module.bias is not None: nn.init.zeros_(module.bias) def apply_llm_init(model, std=0.02, init_output_layer=True): """ 对整个模型应用 LLM 权重初始化 参数: model: Transformer 模型 std: 正态分布的标准差(默认: 0.02) init_output_layer: 是否对输出层使用特殊初始化(默认: True) 返回: 初始化后的模型 """ # 获取 hidden_size(用于输出层初始化) hidden_size = None if hasattr(model, "config"): hidden_size = model.config.get("hidden_size") elif hasattr(model, "hidden_size"): hidden_size = model.hidden_size # 遍历所有模块并初始化 for module in model.modules(): if isinstance(module, (nn.Embedding, nn.Linear)): if init_output_layer and isinstance(module, nn.Linear): # 检查是否是输出层(lm_head) # 如果 tie_word_embeddings=True,lm_head 可能为 None,使用 embedding 的权重 if hasattr(model, "lm_head") and module is model.lm_head: # 输出层:使用缩放初始化 init_weights_with_scaling(module, hidden_size=hidden_size, std=std) else: # 普通线性层 init_weights(module, std=std) else: # 标准初始化 init_weights(module, std=std) return model if __name__ == "__main__": import sys import io # 设置输出编码为 UTF-8(Windows 兼容) if sys.platform == "win32": sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8") print("=" * 60) print("权重初始化测试") print("=" * 60) # 测试参数 vocab_size = 100 hidden_size = 320 intermediate_size = 960 std = 0.02 print(f"\n测试参数:") print(f" vocab_size: {vocab_size}") print(f" hidden_size: {hidden_size}") print(f" intermediate_size: {intermediate_size}") print(f" std: {std}") # 测试 Embedding 初始化 print("\n1. 测试 Embedding 初始化") embedding = nn.Embedding(vocab_size, hidden_size) init_weights(embedding, std=std) weight_mean = embedding.weight.mean().item() weight_std = embedding.weight.std().item() print(f" Embedding 权重均值: {weight_mean:.6f} (应该接近 0)") print(f" Embedding 权重标准差: {weight_std:.6f} (应该接近 {std})") # 测试 Linear 初始化 print("\n2. 测试 Linear 初始化") linear = nn.Linear(hidden_size, intermediate_size) init_weights(linear, std=std) weight_mean = linear.weight.mean().item() weight_std = linear.weight.std().item() bias_mean = linear.bias.mean().item() if linear.bias is not None else 0.0 print(f" Linear 权重均值: {weight_mean:.6f} (应该接近 0)") print(f" Linear 权重标准差: {weight_std:.6f} (应该接近 {std})") print(f" Linear 偏置均值: {bias_mean:.6f} (应该为 0)") # 测试输出层初始化(带缩放) print("\n3. 测试输出层初始化(带缩放)") output_layer = nn.Linear(hidden_size, vocab_size, bias=False) init_weights_with_scaling(output_layer, hidden_size=hidden_size, std=std) weight_mean = output_layer.weight.mean().item() weight_std = output_layer.weight.std().item() expected_std = std / math.sqrt(hidden_size) print(f" 输出层权重均值: {weight_mean:.6f} (应该接近 0)") print(f" 输出层权重标准差: {weight_std:.6f}") print(f" 期望标准差: {expected_std:.6f}") # 测试完整模型初始化 print("\n4. 测试完整模型初始化") from llm.model.transformer import Transformer config = { "vocab_size": vocab_size, "hidden_size": hidden_size, "num_hidden_layers": 2, # 使用较小的层数用于测试 "num_attention_heads": 10, "num_key_value_heads": 2, "intermediate_size": intermediate_size, "rms_norm_eps": 1e-5, "max_position_embeddings": 1024, "rope_theta": 10000.0, "sliding_window": 256, "sliding_window_overlap": True, "tie_word_embeddings": True, } model = Transformer(config) # 记录初始化前的权重统计 print(" 初始化前的权重统计:") for name, param in model.named_parameters(): if "embedding" in name or "weight" in name: print(f" {name}: mean={param.data.mean().item():.6f}, std={param.data.std().item():.6f}") # 应用初始化 apply_llm_init(model, std=std, init_output_layer=True) # 记录初始化后的权重统计 print("\n 初始化后的权重统计:") for name, param in model.named_parameters(): if "embedding" in name or ("weight" in name and "norm" not in name): print(f" {name}: mean={param.data.mean().item():.6f}, std={param.data.std().item():.6f}") # 验证初始化效果 print("\n5. 验证初始化效果") embedding_weight = model.embedding.embedding.weight print(f" Embedding 权重均值: {embedding_weight.mean().item():.6f}") print(f" Embedding 权重标准差: {embedding_weight.std().item():.6f}") # 检查第一个 Transformer Block 的线性层 first_block = model.layers[0] attn_q_proj = first_block.attn.q_proj print(f" Attention Q 投影权重均值: {attn_q_proj.weight.mean().item():.6f}") print(f" Attention Q 投影权重标准差: {attn_q_proj.weight.std().item():.6f}") ffn_gate_proj = first_block.ffn.gate_proj print(f" FFN Gate 投影权重均值: {ffn_gate_proj.weight.mean().item():.6f}") print(f" FFN Gate 投影权重标准差: {ffn_gate_proj.weight.std().item():.6f}") print("\n" + "=" * 60) print("所有测试完成!") print("=" * 60)