| """权重初始化:LLM 模型权重初始化策略"""
|
|
|
|
|
| import torch
|
| import torch.nn as nn
|
| import math
|
|
|
|
|
def init_weights(module, std=0.02):
    """
    Initialize model weights (LLM-style, following GPT/LLaMA practice).

    Args:
        module: a PyTorch module; only ``nn.Embedding`` and ``nn.Linear``
            are touched, every other module type is left as-is.
        std: standard deviation of the normal distribution (default: 0.02).

    Strategy:
        - nn.Embedding: weight ~ N(0, std)
        - nn.Linear: weight ~ N(0, std), bias (if present) set to 0
        - RMSNorm: its learnable scale starts at 1.0 from its own
          constructor, so it is deliberately not handled here.
    """
    if isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=std)
    elif isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=0.0, std=std)
        # Some projections (e.g. LLaMA-style) are created with bias=False.
        if module.bias is not None:
            nn.init.zeros_(module.bias)
|
|
|
|
|
def init_weights_with_scaling(module, hidden_size=None, std=0.02):
    """
    Initialize model weights with optional output-layer scaling.

    Args:
        module: a PyTorch module; only ``nn.Embedding`` and ``nn.Linear``
            are touched.
        hidden_size: hidden dimension used to scale the Linear init
            (intended for the output/lm_head projection). If None, the
            plain ``std`` is used.
        std: base standard deviation (default: 0.02).

    Strategy:
        - nn.Embedding: weight ~ N(0, std)
        - nn.Linear with hidden_size given: weight ~ N(0, std / sqrt(hidden_size))
        - nn.Linear otherwise: weight ~ N(0, std)
        - bias (if present): set to 0
    """
    if isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=std)
    elif isinstance(module, nn.Linear):
        # Shrinking the output projection by sqrt(hidden_size) keeps the
        # initial logits small — a common GPT-style width-scaling trick.
        effective_std = std if hidden_size is None else std / math.sqrt(hidden_size)
        nn.init.normal_(module.weight, mean=0.0, std=effective_std)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
|
|
|
|
|
def apply_llm_init(model, std=0.02, init_output_layer=True):
    """
    Apply LLM weight initialization to an entire model.

    Args:
        model: a transformer model (``nn.Module``).
        std: standard deviation of the normal distribution (default: 0.02).
        init_output_layer: whether to use the scaled initialization for
            the output projection ``model.lm_head`` (default: True).

    Returns:
        The same model, initialized in place.
    """
    # Resolve hidden_size from a dict-style config, an attribute-style
    # config object, or a direct model attribute — the original
    # ``model.config.get(...)`` crashed on non-dict configs.
    hidden_size = None
    config = getattr(model, "config", None)
    if config is not None:
        if isinstance(config, dict):
            hidden_size = config.get("hidden_size")
        else:
            hidden_size = getattr(config, "hidden_size", None)
    elif hasattr(model, "hidden_size"):
        hidden_size = model.hidden_size

    # Hoisted out of the loop: the lm_head identity never changes.
    lm_head = getattr(model, "lm_head", None)

    for module in model.modules():
        if not isinstance(module, (nn.Embedding, nn.Linear)):
            continue
        if init_output_layer and isinstance(module, nn.Linear) and module is lm_head:
            # NOTE(review): with tied word embeddings, lm_head.weight may
            # alias the embedding weight, so this scaled init would also
            # rescale the embedding — confirm against the model definition.
            init_weights_with_scaling(module, hidden_size=hidden_size, std=std)
        else:
            init_weights(module, std=std)

    return model
|
|
|
|
|
if __name__ == "__main__":
    import sys
    import io

    # The Windows console may not default to UTF-8; rewrap stdout so the
    # Chinese status messages print correctly.
    if sys.platform == "win32":
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")

    print("=" * 60)
    print("权重初始化测试")
    print("=" * 60)

    # Test parameters for the standalone checks below.
    vocab_size = 100
    hidden_size = 320
    intermediate_size = 960
    std = 0.02

    print(f"\n测试参数:")
    print(f" vocab_size: {vocab_size}")
    print(f" hidden_size: {hidden_size}")
    print(f" intermediate_size: {intermediate_size}")
    print(f" std: {std}")

    # 1. Embedding initialization: mean ~ 0, std ~ `std`.
    print("\n1. 测试 Embedding 初始化")
    embedding = nn.Embedding(vocab_size, hidden_size)
    init_weights(embedding, std=std)
    weight_mean = embedding.weight.mean().item()
    weight_std = embedding.weight.std().item()
    print(f" Embedding 权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f" Embedding 权重标准差: {weight_std:.6f} (应该接近 {std})")

    # 2. Linear initialization: weight ~ N(0, std), bias == 0.
    print("\n2. 测试 Linear 初始化")
    linear = nn.Linear(hidden_size, intermediate_size)
    init_weights(linear, std=std)
    weight_mean = linear.weight.mean().item()
    weight_std = linear.weight.std().item()
    bias_mean = linear.bias.mean().item() if linear.bias is not None else 0.0
    print(f" Linear 权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f" Linear 权重标准差: {weight_std:.6f} (应该接近 {std})")
    print(f" Linear 偏置均值: {bias_mean:.6f} (应该为 0)")

    # 3. Output layer with sqrt(hidden_size) scaling.
    print("\n3. 测试输出层初始化(带缩放)")
    output_layer = nn.Linear(hidden_size, vocab_size, bias=False)
    init_weights_with_scaling(output_layer, hidden_size=hidden_size, std=std)
    weight_mean = output_layer.weight.mean().item()
    weight_std = output_layer.weight.std().item()
    expected_std = std / math.sqrt(hidden_size)
    print(f" 输出层权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f" 输出层权重标准差: {weight_std:.6f}")
    print(f" 期望标准差: {expected_std:.6f}")

    # 4. Full-model initialization via the project's Transformer.
    print("\n4. 测试完整模型初始化")
    from llm.model.transformer import Transformer

    config = {
        "vocab_size": vocab_size,
        "hidden_size": hidden_size,
        "num_hidden_layers": 2,
        "num_attention_heads": 10,
        "num_key_value_heads": 2,
        "intermediate_size": intermediate_size,
        "rms_norm_eps": 1e-5,
        "max_position_embeddings": 1024,
        "rope_theta": 10000.0,
        "sliding_window": 256,
        "sliding_window_overlap": True,
        "tie_word_embeddings": True,
    }

    model = Transformer(config)

    print(" 初始化前的权重统计:")
    for name, param in model.named_parameters():
        if "embedding" in name or "weight" in name:
            print(f" {name}: mean={param.data.mean().item():.6f}, std={param.data.std().item():.6f}")

    apply_llm_init(model, std=std, init_output_layer=True)

    # Norm scales are skipped below: they stay at 1.0 by design.
    print("\n 初始化后的权重统计:")
    for name, param in model.named_parameters():
        if "embedding" in name or ("weight" in name and "norm" not in name):
            print(f" {name}: mean={param.data.mean().item():.6f}, std={param.data.std().item():.6f}")

    # 5. Spot-check a few concrete submodules after initialization.
    print("\n5. 验证初始化效果")
    embedding_weight = model.embedding.embedding.weight
    print(f" Embedding 权重均值: {embedding_weight.mean().item():.6f}")
    print(f" Embedding 权重标准差: {embedding_weight.std().item():.6f}")

    first_block = model.layers[0]
    attn_q_proj = first_block.attn.q_proj
    print(f" Attention Q 投影权重均值: {attn_q_proj.weight.mean().item():.6f}")
    print(f" Attention Q 投影权重标准差: {attn_q_proj.weight.std().item():.6f}")

    ffn_gate_proj = first_block.ffn.gate_proj
    print(f" FFN Gate 投影权重均值: {ffn_gate_proj.weight.mean().item():.6f}")
    print(f" FFN Gate 投影权重标准差: {ffn_gate_proj.weight.std().item():.6f}")

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
|
|