"""权重初始化：LLM 模型权重初始化策略"""


import torch
import torch.nn as nn
import math


def init_weights(module, std=0.02):
    """
    Initialize one module's parameters in place (GPT/LLaMA-style scheme).

    Args:
        module: PyTorch module to initialize.
        std: Standard deviation of the zero-mean normal distribution
            used for weights (default: 0.02).

    Behavior:
        - nn.Embedding: weight is drawn from a normal distribution with
          mean 0 and standard deviation `std`.
        - nn.Linear: weight is drawn the same way; the bias, when present,
          is set to 0.
        - Any other module type is left untouched by this function.
    """
    is_embedding = isinstance(module, nn.Embedding)
    is_linear = not is_embedding and isinstance(module, nn.Linear)

    # Nothing to do for module types this scheme does not cover.
    if not (is_embedding or is_linear):
        return

    # Both supported module types share the same weight distribution.
    nn.init.normal_(module.weight, mean=0.0, std=std)

    # Only linear layers carry an (optional) bias that needs zeroing.
    if is_linear and module.bias is not None:
        nn.init.zeros_(module.bias)


def init_weights_with_scaling(module, hidden_size=None, std=0.02):
    """
    Initialize one module's parameters in place, optionally shrinking the
    standard deviation for a linear layer (intended for the output layer).

    Args:
        module: PyTorch module to initialize.
        hidden_size: Hidden dimension used to scale the linear-layer
            standard deviation; None disables the scaling.
        std: Base standard deviation (default: 0.02).

    Behavior:
        - nn.Embedding: weight drawn from a zero-mean normal distribution
          with standard deviation `std` (`hidden_size` is ignored).
        - nn.Linear: weight drawn from a zero-mean normal distribution with
          standard deviation `std / sqrt(hidden_size)` when `hidden_size`
          is given, otherwise `std`; the bias, when present, is set to 0.
        - Any other module type is left untouched by this function.
    """
    # Embeddings never use the scaled standard deviation.
    if isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=std)
        return

    if not isinstance(module, nn.Linear):
        return

    # Scale down only when the caller supplied a hidden size.
    weight_std = std if hidden_size is None else std / math.sqrt(hidden_size)
    nn.init.normal_(module.weight, mean=0.0, std=weight_std)

    if module.bias is not None:
        nn.init.zeros_(module.bias)


def apply_llm_init(model, std=0.02, init_output_layer=True):
    """
    Apply the LLM weight-initialization scheme to every nn.Embedding and
    nn.Linear submodule of `model`, in place.

    Args:
        model: Model (nn.Module) whose submodules are initialized.
        std: Standard deviation of the normal distribution (default: 0.02).
        init_output_layer: When True, the linear module found at
            `model.lm_head` is initialized with `init_weights_with_scaling`
            (scaled by the resolved hidden size); every other module uses
            `init_weights` (default: True).

    Returns:
        The same `model` object, after initialization.

    Hidden size resolution (first non-None value wins):
        1. `model.config` - read through `.get("hidden_size")` when the
           config offers a callable `get` (e.g. a dict), otherwise through
           the `hidden_size` attribute.
        2. `model.hidden_size`.
        If neither yields a value, the output layer falls back to the
        unscaled standard deviation.
    """
    # Resolve hidden_size (only used for the output-layer scaling).
    hidden_size = None
    config = getattr(model, "config", None)
    if config is not None:
        config_get = getattr(config, "get", None)
        if callable(config_get):
            # Mapping-style config (dict and friends).
            hidden_size = config_get("hidden_size")
        else:
            # Attribute-style config object.
            hidden_size = getattr(config, "hidden_size", None)
    if hidden_size is None:
        hidden_size = getattr(model, "hidden_size", None)

    # Look the output layer up once. It may be absent or None (the original
    # comment mentions tie_word_embeddings=True as such a case); then no
    # module matches and everything gets the standard initialization.
    # NOTE(review): if lm_head shares its weight tensor with an embedding,
    # both modules are visited and the initialization applied last is the
    # one that remains - confirm this is the intended result for tied weights.
    lm_head = getattr(model, "lm_head", None) if init_output_layer else None

    for module in model.modules():
        if not isinstance(module, (nn.Embedding, nn.Linear)):
            continue
        is_output_layer = (
            lm_head is not None
            and isinstance(module, nn.Linear)
            and module is lm_head
        )
        if is_output_layer:
            # Output layer: scaled initialization.
            init_weights_with_scaling(module, hidden_size=hidden_size, std=std)
        else:
            # Embeddings and ordinary linear layers: standard initialization.
            init_weights(module, std=std)

    return model


if __name__ == "__main__":
    import io
    import sys

    # Re-wrap stdout as UTF-8 on Windows so the messages below can be printed.
    if sys.platform == "win32":
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")

    rule = "=" * 60
    print(rule)
    print("权重初始化测试")
    print(rule)

    # Demo hyper-parameters.
    n_vocab = 100
    d_model = 320
    d_ffn = 960
    base_std = 0.02

    print("\n测试参数:")
    print(f"  vocab_size: {n_vocab}")
    print(f"  hidden_size: {d_model}")
    print(f"  intermediate_size: {d_ffn}")
    print(f"  std: {base_std}")

    # 1. Embedding initialization.
    print("\n1. 测试 Embedding 初始化")
    emb = nn.Embedding(n_vocab, d_model)
    init_weights(emb, std=base_std)
    emb_mean = emb.weight.mean().item()
    emb_std = emb.weight.std().item()
    print(f"   Embedding 权重均值: {emb_mean:.6f} (应该接近 0)")
    print(f"   Embedding 权重标准差: {emb_std:.6f} (应该接近 {base_std})")

    # 2. Linear initialization.
    print("\n2. 测试 Linear 初始化")
    proj = nn.Linear(d_model, d_ffn)
    init_weights(proj, std=base_std)
    proj_mean = proj.weight.mean().item()
    proj_std = proj.weight.std().item()
    if proj.bias is None:
        proj_bias_mean = 0.0
    else:
        proj_bias_mean = proj.bias.mean().item()
    print(f"   Linear 权重均值: {proj_mean:.6f} (应该接近 0)")
    print(f"   Linear 权重标准差: {proj_std:.6f} (应该接近 {base_std})")
    print(f"   Linear 偏置均值: {proj_bias_mean:.6f} (应该为 0)")

    # 3. Output-layer initialization with scaling.
    print("\n3. 测试输出层初始化（带缩放）")
    head = nn.Linear(d_model, n_vocab, bias=False)
    init_weights_with_scaling(head, hidden_size=d_model, std=base_std)
    head_mean = head.weight.mean().item()
    head_std = head.weight.std().item()
    target_std = base_std / math.sqrt(d_model)
    print(f"   输出层权重均值: {head_mean:.6f} (应该接近 0)")
    print(f"   输出层权重标准差: {head_std:.6f}")
    print(f"   期望标准差: {target_std:.6f}")

    # 4. Whole-model initialization.
    print("\n4. 测试完整模型初始化")
    from llm.model.transformer import Transformer

    model_config = {
        "vocab_size": n_vocab,
        "hidden_size": d_model,
        "num_hidden_layers": 2,  # small layer count keeps the demo quick
        "num_attention_heads": 10,
        "num_key_value_heads": 2,
        "intermediate_size": d_ffn,
        "rms_norm_eps": 1e-5,
        "max_position_embeddings": 1024,
        "rope_theta": 10000.0,
        "sliding_window": 256,
        "sliding_window_overlap": True,
        "tie_word_embeddings": True,
    }

    net = Transformer(model_config)

    def _dump_stats(network, keep):
        """Print mean/std of every named parameter whose name passes `keep`."""
        for pname, tensor in network.named_parameters():
            if keep(pname):
                print(f"     {pname}: mean={tensor.data.mean().item():.6f}, std={tensor.data.std().item():.6f}")

    # Weight statistics before initialization.
    print("   初始化前的权重统计:")
    _dump_stats(net, lambda pname: "embedding" in pname or "weight" in pname)

    # Apply the initialization scheme.
    apply_llm_init(net, std=base_std, init_output_layer=True)

    # Weight statistics after initialization (weights with "norm" in the name are skipped).
    print("\n   初始化后的权重统计:")
    _dump_stats(
        net,
        lambda pname: "embedding" in pname or ("weight" in pname and "norm" not in pname),
    )

    # 5. Spot-check a few individual weights.
    print("\n5. 验证初始化效果")
    tok_weight = net.embedding.embedding.weight
    print(f"   Embedding 权重均值: {tok_weight.mean().item():.6f}")
    print(f"   Embedding 权重标准差: {tok_weight.std().item():.6f}")

    # Linear layers of the first Transformer block.
    block0 = net.layers[0]
    q_proj = block0.attn.q_proj
    print(f"   Attention Q 投影权重均值: {q_proj.weight.mean().item():.6f}")
    print(f"   Attention Q 投影权重标准差: {q_proj.weight.std().item():.6f}")

    gate_proj = block0.ffn.gate_proj
    print(f"   FFN Gate 投影权重均值: {gate_proj.weight.mean().item():.6f}")
    print(f"   FFN Gate 投影权重标准差: {gate_proj.weight.std().item():.6f}")

    print("\n" + rule)
    print("所有测试完成！")
    print(rule)