| """权重初始化:LLM 模型权重初始化策略"""
|
|
|
|
|
| import torch
|
| import torch.nn as nn
|
| import math
|
|
|
|
|
def init_weights(module, std=0.02):
    """
    Initialize model weights (LLM-style, following GPT/LLaMA practice).

    Args:
        module: a PyTorch module; only ``nn.Embedding`` and ``nn.Linear``
            are touched, every other module type is left as-is.
        std: standard deviation of the normal distribution (default: 0.02).

    Strategy:
        - nn.Embedding: weight ~ N(0, std)
        - nn.Linear: weight ~ N(0, std), bias (if present) set to 0
        - RMSNorm: its learnable scale starts at 1.0 from its own
          constructor, so it is deliberately not handled here.
    """
    if isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=std)
    elif isinstance(module, nn.Linear):
        nn.init.normal_(module.weight, mean=0.0, std=std)
        # Some projections (e.g. LLaMA-style) are created with bias=False.
        if module.bias is not None:
            nn.init.zeros_(module.bias)
|
|
|
|
|
def init_weights_with_scaling(module, hidden_size=None, std=0.02):
    """
    Initialize model weights with optional output-layer scaling.

    Args:
        module: a PyTorch module; only ``nn.Embedding`` and ``nn.Linear``
            are touched.
        hidden_size: hidden dimension used to scale the Linear init
            (intended for the output/lm_head projection). If None, the
            plain ``std`` is used.
        std: base standard deviation (default: 0.02).

    Strategy:
        - nn.Embedding: weight ~ N(0, std)
        - nn.Linear with hidden_size given: weight ~ N(0, std / sqrt(hidden_size))
        - nn.Linear otherwise: weight ~ N(0, std)
        - bias (if present): set to 0
    """
    if isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=std)
    elif isinstance(module, nn.Linear):
        # Shrinking the output projection by sqrt(hidden_size) keeps the
        # initial logits small — a common GPT-style width-scaling trick.
        effective_std = std if hidden_size is None else std / math.sqrt(hidden_size)
        nn.init.normal_(module.weight, mean=0.0, std=effective_std)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
|
|
|
|
|
def apply_llm_init(model, std=0.02, init_output_layer=True):
    """
    Apply LLM weight initialization to an entire model.

    Args:
        model: a transformer model (``nn.Module``).
        std: standard deviation of the normal distribution (default: 0.02).
        init_output_layer: whether to use the scaled initialization for
            the output projection ``model.lm_head`` (default: True).

    Returns:
        The same model, initialized in place.
    """
    # Resolve hidden_size from a dict-style config, an attribute-style
    # config object, or a direct model attribute — the original
    # ``model.config.get(...)`` crashed on non-dict configs.
    hidden_size = None
    config = getattr(model, "config", None)
    if config is not None:
        if isinstance(config, dict):
            hidden_size = config.get("hidden_size")
        else:
            hidden_size = getattr(config, "hidden_size", None)
    elif hasattr(model, "hidden_size"):
        hidden_size = model.hidden_size

    # Hoisted out of the loop: the lm_head identity never changes.
    lm_head = getattr(model, "lm_head", None)

    for module in model.modules():
        if not isinstance(module, (nn.Embedding, nn.Linear)):
            continue
        if init_output_layer and isinstance(module, nn.Linear) and module is lm_head:
            # NOTE(review): with tied word embeddings, lm_head.weight may
            # alias the embedding weight, so this scaled init would also
            # rescale the embedding — confirm against the model definition.
            init_weights_with_scaling(module, hidden_size=hidden_size, std=std)
        else:
            init_weights(module, std=std)

    return model
|
|
|
|
|
if __name__ == "__main__":
    import sys
    import io

    # The Windows console may not default to UTF-8; rewrap stdout so the
    # Chinese status messages print correctly.
    if sys.platform == "win32":
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")

    print("=" * 60)
    print("权重初始化测试")
    print("=" * 60)

    # Test parameters for the standalone checks below.
    vocab_size = 100
    hidden_size = 320
    intermediate_size = 960
    std = 0.02

    print(f"\n测试参数:")
    print(f" vocab_size: {vocab_size}")
    print(f" hidden_size: {hidden_size}")
    print(f" intermediate_size: {intermediate_size}")
    print(f" std: {std}")

    # 1. Embedding initialization: mean ~ 0, std ~ `std`.
    print("\n1. 测试 Embedding 初始化")
    embedding = nn.Embedding(vocab_size, hidden_size)
    init_weights(embedding, std=std)
    weight_mean = embedding.weight.mean().item()
    weight_std = embedding.weight.std().item()
    print(f" Embedding 权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f" Embedding 权重标准差: {weight_std:.6f} (应该接近 {std})")

    # 2. Linear initialization: weight ~ N(0, std), bias == 0.
    print("\n2. 测试 Linear 初始化")
    linear = nn.Linear(hidden_size, intermediate_size)
    init_weights(linear, std=std)
    weight_mean = linear.weight.mean().item()
    weight_std = linear.weight.std().item()
    bias_mean = linear.bias.mean().item() if linear.bias is not None else 0.0
    print(f" Linear 权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f" Linear 权重标准差: {weight_std:.6f} (应该接近 {std})")
    print(f" Linear 偏置均值: {bias_mean:.6f} (应该为 0)")

    # 3. Output layer with sqrt(hidden_size) scaling.
    print("\n3. 测试输出层初始化(带缩放)")
    output_layer = nn.Linear(hidden_size, vocab_size, bias=False)
    init_weights_with_scaling(output_layer, hidden_size=hidden_size, std=std)
    weight_mean = output_layer.weight.mean().item()
    weight_std = output_layer.weight.std().item()
    expected_std = std / math.sqrt(hidden_size)
    print(f" 输出层权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f" 输出层权重标准差: {weight_std:.6f}")
    print(f" 期望标准差: {expected_std:.6f}")

    # 4. Full-model initialization via the project's Transformer.
    print("\n4. 测试完整模型初始化")
    from llm.model.transformer import Transformer

    config = {
        "vocab_size": vocab_size,
        "hidden_size": hidden_size,
        "num_hidden_layers": 2,
        "num_attention_heads": 10,
        "num_key_value_heads": 2,
        "intermediate_size": intermediate_size,
        "rms_norm_eps": 1e-5,
        "max_position_embeddings": 1024,
        "rope_theta": 10000.0,
        "sliding_window": 256,
        "sliding_window_overlap": True,
        "tie_word_embeddings": True,
    }

    model = Transformer(config)

    print(" 初始化前的权重统计:")
    for name, param in model.named_parameters():
        if "embedding" in name or "weight" in name:
            print(f" {name}: mean={param.data.mean().item():.6f}, std={param.data.std().item():.6f}")

    apply_llm_init(model, std=std, init_output_layer=True)

    # Norm scales are skipped below: they stay at 1.0 by design.
    print("\n 初始化后的权重统计:")
    for name, param in model.named_parameters():
        if "embedding" in name or ("weight" in name and "norm" not in name):
            print(f" {name}: mean={param.data.mean().item():.6f}, std={param.data.std().item():.6f}")

    # 5. Spot-check a few concrete submodules after initialization.
    print("\n5. 验证初始化效果")
    embedding_weight = model.embedding.embedding.weight
    print(f" Embedding 权重均值: {embedding_weight.mean().item():.6f}")
    print(f" Embedding 权重标准差: {embedding_weight.std().item():.6f}")

    first_block = model.layers[0]
    attn_q_proj = first_block.attn.q_proj
    print(f" Attention Q 投影权重均值: {attn_q_proj.weight.mean().item():.6f}")
    print(f" Attention Q 投影权重标准差: {attn_q_proj.weight.std().item():.6f}")

    ffn_gate_proj = first_block.ffn.gate_proj
    print(f" FFN Gate 投影权重均值: {ffn_gate_proj.weight.mean().item():.6f}")
    print(f" FFN Gate 投影权重标准差: {ffn_gate_proj.weight.std().item():.6f}")

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
|
|