# Uploaded via Hugging Face upload-large-folder tool (commit bf6be45).
"""Weight initialization: LLM model weight-initialization strategies."""
import torch
import torch.nn as nn
import math
def init_weights(module, std=0.02):
    """Initialize a single module's weights, LLM-style (GPT/LLaMA).

    Args:
        module: a PyTorch module; only ``nn.Embedding`` and ``nn.Linear``
            are touched, everything else is left as-is.
        std: standard deviation of the normal distribution (default 0.02).

    Strategy:
        - nn.Embedding: weights ~ N(0, std)
        - nn.Linear: weights ~ N(0, std), bias reset to 0
        - RMSNorm: its learnable scale defaults to 1.0 on construction,
          so no action is needed here.
    """
    # Both embedding and linear layers share the same weight init.
    if isinstance(module, (nn.Embedding, nn.Linear)):
        nn.init.normal_(module.weight, mean=0.0, std=std)
    # Only linear layers may carry a bias; zero it when present.
    if isinstance(module, nn.Linear) and module.bias is not None:
        nn.init.zeros_(module.bias)
def init_weights_with_scaling(module, hidden_size=None, std=0.02):
    """Initialize module weights with optional output-layer scaling.

    Args:
        module: a PyTorch module; only ``nn.Embedding`` and ``nn.Linear``
            are touched.
        hidden_size: hidden dimension; when provided, Linear weights use the
            scaled deviation ``std / sqrt(hidden_size)`` (intended for the
            output projection / lm_head).
        std: base standard deviation (default 0.02).

    Strategy:
        - nn.Embedding: weights ~ N(0, std)
        - nn.Linear: N(0, std / sqrt(hidden_size)) when ``hidden_size`` is
          given, otherwise N(0, std); bias reset to 0.
    """
    if isinstance(module, nn.Embedding):
        # Token embeddings: plain normal init, never scaled.
        nn.init.normal_(module.weight, mean=0.0, std=std)
        return
    if not isinstance(module, nn.Linear):
        return
    # Shrink the std for the output projection when a hidden size is given.
    effective_std = std if hidden_size is None else std / math.sqrt(hidden_size)
    nn.init.normal_(module.weight, mean=0.0, std=effective_std)
    # Bias (when present) always starts at zero.
    if module.bias is not None:
        nn.init.zeros_(module.bias)
def apply_llm_init(model, std=0.02, init_output_layer=True):
    """Apply LLM-style weight initialization to an entire model, in place.

    Args:
        model: the Transformer model to initialize.
        std: standard deviation of the normal init (default 0.02).
        init_output_layer: when True, the output projection (``lm_head``)
            is initialized with the scaled deviation
            ``std / sqrt(hidden_size)`` (default True).

    Returns:
        The same model, after in-place initialization.
    """
    # Resolve hidden_size for the output-layer scaling. The previous version
    # only handled dict-style configs (model.config.get(...)) and crashed on
    # attribute-style config objects; accept both, then fall back to a
    # top-level model.hidden_size attribute.
    hidden_size = None
    cfg = getattr(model, "config", None)
    if isinstance(cfg, dict):
        hidden_size = cfg.get("hidden_size")
    elif cfg is not None:
        hidden_size = getattr(cfg, "hidden_size", None)
    if hidden_size is None:
        hidden_size = getattr(model, "hidden_size", None)
    # lm_head may be None when tie_word_embeddings=True (weights shared with
    # the embedding); the identity check below then simply never matches.
    lm_head = getattr(model, "lm_head", None)
    for module in model.modules():
        if not isinstance(module, (nn.Embedding, nn.Linear)):
            continue
        if init_output_layer and module is lm_head:
            # Output layer: scaled init keeps initial logits small.
            init_weights_with_scaling(module, hidden_size=hidden_size, std=std)
        else:
            # All other embeddings / linear layers: standard init.
            init_weights(module, std=std)
    return model
if __name__ == "__main__":
    import sys
    import io
    # Force UTF-8 stdout so the Chinese output renders on Windows consoles.
    if sys.platform == "win32":
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")
    print("=" * 60)
    print("权重初始化测试")
    print("=" * 60)
    # Smoke-test hyperparameters (deliberately small for speed).
    vocab_size = 100
    hidden_size = 320
    intermediate_size = 960
    std = 0.02
    print(f"\n测试参数:")
    print(f"  vocab_size: {vocab_size}")
    print(f"  hidden_size: {hidden_size}")
    print(f"  intermediate_size: {intermediate_size}")
    print(f"  std: {std}")
    # 1. Embedding init: weights should be ~N(0, std).
    print("\n1. 测试 Embedding 初始化")
    embedding = nn.Embedding(vocab_size, hidden_size)
    init_weights(embedding, std=std)
    weight_mean = embedding.weight.mean().item()
    weight_std = embedding.weight.std().item()
    print(f"  Embedding 权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f"  Embedding 权重标准差: {weight_std:.6f} (应该接近 {std})")
    # 2. Linear init: weights ~N(0, std), bias zeroed.
    print("\n2. 测试 Linear 初始化")
    linear = nn.Linear(hidden_size, intermediate_size)
    init_weights(linear, std=std)
    weight_mean = linear.weight.mean().item()
    weight_std = linear.weight.std().item()
    bias_mean = linear.bias.mean().item() if linear.bias is not None else 0.0
    print(f"  Linear 权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f"  Linear 权重标准差: {weight_std:.6f} (应该接近 {std})")
    print(f"  Linear 偏置均值: {bias_mean:.6f} (应该为 0)")
    # 3. Output-layer init with scaled std = std / sqrt(hidden_size).
    print("\n3. 测试输出层初始化(带缩放)")
    output_layer = nn.Linear(hidden_size, vocab_size, bias=False)
    init_weights_with_scaling(output_layer, hidden_size=hidden_size, std=std)
    weight_mean = output_layer.weight.mean().item()
    weight_std = output_layer.weight.std().item()
    expected_std = std / math.sqrt(hidden_size)
    print(f"  输出层权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f"  输出层权重标准差: {weight_std:.6f}")
    print(f"  期望标准差: {expected_std:.6f}")
    # 4. Full-model init via apply_llm_init on the project Transformer.
    print("\n4. 测试完整模型初始化")
    from llm.model.transformer import Transformer
    config = {
        "vocab_size": vocab_size,
        "hidden_size": hidden_size,
        "num_hidden_layers": 2,  # small layer count to keep the test fast
        "num_attention_heads": 10,
        "num_key_value_heads": 2,
        "intermediate_size": intermediate_size,
        "rms_norm_eps": 1e-5,
        "max_position_embeddings": 1024,
        "rope_theta": 10000.0,
        "sliding_window": 256,
        "sliding_window_overlap": True,
        "tie_word_embeddings": True,
    }
    model = Transformer(config)
    # Weight statistics before initialization, for comparison.
    print("  初始化前的权重统计:")
    for name, param in model.named_parameters():
        if "embedding" in name or "weight" in name:
            print(f"    {name}: mean={param.data.mean().item():.6f}, std={param.data.std().item():.6f}")
    # Run the full-model initialization.
    apply_llm_init(model, std=std, init_output_layer=True)
    # Weight statistics after initialization (norm layers excluded —
    # their scales stay at 1.0 by design).
    print("\n  初始化后的权重统计:")
    for name, param in model.named_parameters():
        if "embedding" in name or ("weight" in name and "norm" not in name):
            print(f"    {name}: mean={param.data.mean().item():.6f}, std={param.data.std().item():.6f}")
    # 5. Spot-check individual layers of the initialized model.
    # NOTE(review): assumes the project Transformer exposes
    # model.embedding.embedding, layers[i].attn.q_proj and
    # layers[i].ffn.gate_proj — verify against llm/model/transformer.py.
    print("\n5. 验证初始化效果")
    embedding_weight = model.embedding.embedding.weight
    print(f"  Embedding 权重均值: {embedding_weight.mean().item():.6f}")
    print(f"  Embedding 权重标准差: {embedding_weight.std().item():.6f}")
    # Inspect the first Transformer block's linear layers.
    first_block = model.layers[0]
    attn_q_proj = first_block.attn.q_proj
    print(f"  Attention Q 投影权重均值: {attn_q_proj.weight.mean().item():.6f}")
    print(f"  Attention Q 投影权重标准差: {attn_q_proj.weight.std().item():.6f}")
    ffn_gate_proj = first_block.ffn.gate_proj
    print(f"  FFN Gate 投影权重均值: {ffn_gate_proj.weight.mean().item():.6f}")
    print(f"  FFN Gate 投影权重标准差: {ffn_gate_proj.weight.std().item():.6f}")
    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)