LiManshu commited on
Commit
bf6be45
·
verified ·
1 Parent(s): b7488e1

Add files using upload-large-folder tool

Browse files
Files changed (49) hide show
  1. .gitattributes +3 -35
  2. README.md +40 -0
  3. checkpoints/best_model.pt +3 -0
  4. configs/model.yaml +43 -0
  5. data/vocab/char_vocab.json +139 -0
  6. inference.py +67 -0
  7. llm/__init__.py +5 -0
  8. llm/__pycache__/__init__.cpython-312.pyc +0 -0
  9. llm/data/__init__.py +1 -0
  10. llm/data/__pycache__/__init__.cpython-312.pyc +0 -0
  11. llm/data/__pycache__/tokenizer.cpython-312.pyc +0 -0
  12. llm/data/collate.py +167 -0
  13. llm/data/dataset.py +164 -0
  14. llm/data/tokenizer.py +126 -0
  15. llm/inference/__init__.py +5 -0
  16. llm/inference/__pycache__/__init__.cpython-312.pyc +0 -0
  17. llm/inference/__pycache__/generate.cpython-312.pyc +0 -0
  18. llm/inference/generate.py +179 -0
  19. llm/model/__init__.py +1 -0
  20. llm/model/__pycache__/__init__.cpython-312.pyc +0 -0
  21. llm/model/__pycache__/attention.cpython-312.pyc +0 -0
  22. llm/model/__pycache__/block.cpython-312.pyc +0 -0
  23. llm/model/__pycache__/embedding.cpython-312.pyc +0 -0
  24. llm/model/__pycache__/ffn.cpython-312.pyc +0 -0
  25. llm/model/__pycache__/norm.cpython-312.pyc +0 -0
  26. llm/model/__pycache__/rope.cpython-312.pyc +0 -0
  27. llm/model/__pycache__/transformer.cpython-312.pyc +0 -0
  28. llm/model/attention.py +435 -0
  29. llm/model/block.py +163 -0
  30. llm/model/embedding.py +35 -0
  31. llm/model/ffn.py +139 -0
  32. llm/model/norm.py +132 -0
  33. llm/model/rope.py +162 -0
  34. llm/model/transformer.py +280 -0
  35. llm/training/__init__.py +15 -0
  36. llm/training/loss.py +91 -0
  37. llm/training/metrics.py +175 -0
  38. llm/training/optim.py +223 -0
  39. llm/training/trainer.py +294 -0
  40. llm/utils/__init__.py +9 -0
  41. llm/utils/__pycache__/__init__.cpython-312.pyc +0 -0
  42. llm/utils/__pycache__/checkpoint.cpython-312.pyc +0 -0
  43. llm/utils/__pycache__/init.cpython-312.pyc +0 -0
  44. llm/utils/checkpoint.py +39 -0
  45. llm/utils/config.py +25 -0
  46. llm/utils/init.py +213 -0
  47. llm/utils/logging.py +18 -0
  48. llm/utils/seed.py +14 -0
  49. requirements.txt +4 -0
.gitattributes CHANGED
@@ -1,35 +1,3 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.pt filter=lfs diff=lfs merge=lfs -text
2
+ *.bin filter=lfs diff=lfs merge=lfs -text
3
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ library_name: pytorch
6
+ pipeline_tag: text-generation
7
+ tags:
8
+ - transformer
9
+ - character-level
10
+ - custom-code
11
+ ---
12
+
13
+ # nextShakespeare
14
+
15
+ `nextShakespeare` is a decoder-only, character-level Transformer language model
16
+ trained on Tiny Shakespeare style text. This repo uses custom PyTorch code
17
+ rather than `transformers` native model classes.
18
+
19
+ ## Model assets
20
+
21
+ - Weights: `checkpoints/best_model.pt`
22
+ - Config: `configs/model.yaml`
23
+ - Vocabulary: `data/vocab/char_vocab.json`
24
+ - Runtime code: `llm/`
25
+
26
+ ## Quickstart
27
+
28
+ ```bash
29
+ git clone https://huggingface.co/<your-username>/nextShakespeare
30
+ cd nextShakespeare
31
+ pip install -r requirements.txt
32
+ python inference.py --prompt $'First Citizen:\n' --max_length 200 --temperature 0.8
33
+ ```
34
+
35
+ ## Notes
36
+
37
+ - This is a custom-code checkpoint. It is not directly loadable via
38
+ `AutoModel.from_pretrained()` yet.
39
+ - For a web demo, see the companion Space:
40
+ `https://huggingface.co/spaces/<your-username>/manshu-init`
checkpoints/best_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bbbcb6c73219ad04334d069ca56b0362282ec43b14772e95ae77539b5589357
3
+ size 1248083791
configs/model.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # Model Architecture
3
+ # ============================================================
4
+
5
+ num_hidden_layers: 12 # Transformer 层数
6
+ hidden_size: 768 # 隐藏层维度
7
+ num_attention_heads: 12 # 注意力头数
8
+ num_key_value_heads: 4 # Key/Value 头数(GQA)
9
+ intermediate_size: 3072 # FFN 中间层维度
10
+
11
+ # ----------------------------
12
+ # Context / Position Encoding
13
+ # ----------------------------
14
+
15
+ max_position_embeddings: 2048 # 最大上下文长度
16
+ rope_theta: 10000 # RoPE 位置编码参数
17
+
18
+ # ----------------------------
19
+ # Attention Optimization
20
+ # ----------------------------
21
+
22
+ sliding_window: 1024 # 滑动窗口注意力大小
23
+ sliding_window_overlap: true # 是否允许窗口重叠
24
+ # 注意:当前所有层都使用滑动窗口
25
+
26
+ # ----------------------------
27
+ # Normalization
28
+ # ----------------------------
29
+
30
+ rms_norm_eps: 1e-5 # RMSNorm 数值稳定项
31
+
32
+ # ----------------------------
33
+ # Embedding
34
+ # ----------------------------
35
+
36
+ tie_word_embeddings: true # 是否绑定输入输出词嵌入
37
+
38
+ # ----------------------------
39
+ # Initialization
40
+ # ----------------------------
41
+
42
+ init_weights: true # 是否启用权重初始化
43
+ init_std: 0.02 # 权重初始化标准差
data/vocab/char_vocab.json ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "char_to_id": {
3
+ "<unk>": 0,
4
+ " ": 1,
5
+ "e": 2,
6
+ "t": 3,
7
+ "o": 4,
8
+ "a": 5,
9
+ "h": 6,
10
+ "s": 7,
11
+ "r": 8,
12
+ "n": 9,
13
+ "i": 10,
14
+ "\n": 11,
15
+ "l": 12,
16
+ "d": 13,
17
+ "u": 14,
18
+ "m": 15,
19
+ "y": 16,
20
+ ",": 17,
21
+ "w": 18,
22
+ "f": 19,
23
+ "c": 20,
24
+ "g": 21,
25
+ "I": 22,
26
+ "b": 23,
27
+ "p": 24,
28
+ ":": 25,
29
+ ".": 26,
30
+ "A": 27,
31
+ "v": 28,
32
+ "k": 29,
33
+ "T": 30,
34
+ "'": 31,
35
+ "E": 32,
36
+ "O": 33,
37
+ "N": 34,
38
+ "R": 35,
39
+ "S": 36,
40
+ "L": 37,
41
+ "C": 38,
42
+ ";": 39,
43
+ "W": 40,
44
+ "U": 41,
45
+ "H": 42,
46
+ "M": 43,
47
+ "B": 44,
48
+ "?": 45,
49
+ "G": 46,
50
+ "!": 47,
51
+ "D": 48,
52
+ "-": 49,
53
+ "F": 50,
54
+ "Y": 51,
55
+ "P": 52,
56
+ "K": 53,
57
+ "V": 54,
58
+ "j": 55,
59
+ "q": 56,
60
+ "x": 57,
61
+ "z": 58,
62
+ "J": 59,
63
+ "Q": 60,
64
+ "Z": 61,
65
+ "X": 62,
66
+ "3": 63,
67
+ "&": 64,
68
+ "$": 65
69
+ },
70
+ "id_to_char": {
71
+ "0": "<unk>",
72
+ "1": " ",
73
+ "2": "e",
74
+ "3": "t",
75
+ "4": "o",
76
+ "5": "a",
77
+ "6": "h",
78
+ "7": "s",
79
+ "8": "r",
80
+ "9": "n",
81
+ "10": "i",
82
+ "11": "\n",
83
+ "12": "l",
84
+ "13": "d",
85
+ "14": "u",
86
+ "15": "m",
87
+ "16": "y",
88
+ "17": ",",
89
+ "18": "w",
90
+ "19": "f",
91
+ "20": "c",
92
+ "21": "g",
93
+ "22": "I",
94
+ "23": "b",
95
+ "24": "p",
96
+ "25": ":",
97
+ "26": ".",
98
+ "27": "A",
99
+ "28": "v",
100
+ "29": "k",
101
+ "30": "T",
102
+ "31": "'",
103
+ "32": "E",
104
+ "33": "O",
105
+ "34": "N",
106
+ "35": "R",
107
+ "36": "S",
108
+ "37": "L",
109
+ "38": "C",
110
+ "39": ";",
111
+ "40": "W",
112
+ "41": "U",
113
+ "42": "H",
114
+ "43": "M",
115
+ "44": "B",
116
+ "45": "?",
117
+ "46": "G",
118
+ "47": "!",
119
+ "48": "D",
120
+ "49": "-",
121
+ "50": "F",
122
+ "51": "Y",
123
+ "52": "P",
124
+ "53": "K",
125
+ "54": "V",
126
+ "55": "j",
127
+ "56": "q",
128
+ "57": "x",
129
+ "58": "z",
130
+ "59": "J",
131
+ "60": "Q",
132
+ "61": "Z",
133
+ "62": "X",
134
+ "63": "3",
135
+ "64": "&",
136
+ "65": "$"
137
+ },
138
+ "vocab_size": 66
139
+ }
inference.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from pathlib import Path
3
+
4
+ import torch
5
+ import yaml
6
+
7
+ from llm.data.tokenizer import CharTokenizer
8
+ from llm.inference.generate import greedy_decode, sample_decode
9
+ from llm.model.transformer import Transformer
10
+ from llm.utils.checkpoint import load_model_only
11
+
12
+
13
def load_yaml(path: Path):
    """Read a UTF-8 YAML file at *path* and return the parsed data."""
    with open(path, "r", encoding="utf-8") as handle:
        return yaml.safe_load(handle)
16
+
17
+
18
def main():
    """Command-line entry point.

    Loads the model config, character tokenizer, and checkpoint, then
    generates text from ``--prompt`` using greedy decoding (temperature 0)
    or temperature / top-k / top-p sampling.
    """
    parser = argparse.ArgumentParser()
    # Use a real newline as the default.  The old default "First Citizen:\\n"
    # put a literal backslash+n into the prompt, which the char-level
    # tokenizer can only map to <unk>.
    parser.add_argument("--prompt", type=str, default="First Citizen:\n")
    parser.add_argument("--max_length", type=int, default=200)
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--top_k", type=int, default=50)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument("--checkpoint", type=str, default="checkpoints/best_model.pt")
    parser.add_argument("--config", type=str, default="configs/model.yaml")
    parser.add_argument("--vocab", type=str, default="data/vocab/char_vocab.json")
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_config = load_yaml(Path(args.config))
    tokenizer = CharTokenizer(vocab_path=args.vocab)
    # vocab_size always comes from the tokenizer, never the YAML, so the two
    # cannot drift apart.
    model_config["vocab_size"] = tokenizer.vocab_size

    model = Transformer(model_config)
    load_model_only(model, args.checkpoint)
    model.to(device)
    model.eval()

    # Shells pass "\n" through as backslash+n; translate that common escape
    # so `--prompt "First Citizen:\n"` (as documented) yields a real newline.
    prompt = args.prompt.replace("\\n", "\n")

    input_ids = tokenizer.encode(prompt)
    if not input_ids:
        # Never feed the model an empty sequence; fall back to <unk> (id 0).
        input_ids = [0]
    input_ids = torch.tensor([input_ids], dtype=torch.long)

    with torch.no_grad():
        if args.temperature == 0:
            # Temperature 0 means deterministic decoding.
            generated_ids = greedy_decode(
                model, input_ids, max_length=args.max_length, device=device
            )
        else:
            generated_ids = sample_decode(
                model,
                input_ids,
                max_length=args.max_length,
                temperature=args.temperature,
                top_k=args.top_k if args.top_k > 0 else None,
                top_p=args.top_p if args.top_p > 0 else None,
                device=device,
            )

    text = tokenizer.decode(generated_ids[0])
    print(text)
64
+
65
+
66
+ if __name__ == "__main__":
67
+ main()
llm/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """
2
+ LLM from Manshu - 从零实现的大语言模型
3
+ """
4
+
5
+ __version__ = "0.1.0"
llm/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (278 Bytes). View file
 
llm/data/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """数据处理模块"""
llm/data/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (218 Bytes). View file
 
llm/data/__pycache__/tokenizer.cpython-312.pyc ADDED
Binary file (5.23 kB). View file
 
llm/data/collate.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 数据整理函数:padding / batch
3
+ 将多个样本组合成批次
4
+ 处理不同长度的序列(padding)
5
+ 转换为模型需要的张量格式
6
+ """
7
+
8
+
9
+
10
+ # 2026-01-23
11
+
12
+ import torch
13
+
14
+
15
def _pad_to_length(sequences, length, pad_token_id):
    """Right-pad each 1-D tensor in *sequences* to *length* with pad_token_id."""
    padded = []
    for ids in sequences:
        pad_length = length - len(ids)
        if pad_length > 0:
            filler = torch.full((pad_length,), pad_token_id, dtype=ids.dtype)
            ids = torch.cat([ids, filler])
        padded.append(ids)
    return padded


def collate_fn(batch, pad_token_id=0):
    """Collate (input_ids, target_ids) pairs into stacked batch tensors.

    When every sequence in the batch has the same length the tensors are
    stacked directly (fast path, no padding); otherwise both streams are
    right-padded with ``pad_token_id`` to the common maximum length.

    Args:
        batch: list of ``(input_ids, target_ids)`` pairs of 1-D tensors.
        pad_token_id: value used to right-pad shorter sequences (default: 0).

    Returns:
        ``(input_ids_batch, target_ids_batch)``, each of shape
        ``(batch_size, max_seq_len)``.
    """
    input_ids_list = [item[0] for item in batch]
    target_ids_list = [item[1] for item in batch]

    input_lengths = [len(ids) for ids in input_ids_list]
    target_lengths = [len(ids) for ids in target_ids_list]

    # Fast path applies only when inputs and targets share a single length.
    all_same_length = (
        len(set(input_lengths)) == 1
        and len(set(target_lengths)) == 1
        and input_lengths[0] == target_lengths[0]
    )

    if all_same_length:
        input_ids_batch = torch.stack(input_ids_list, dim=0)      # (batch_size, seq_len)
        target_ids_batch = torch.stack(target_ids_list, dim=0)    # (batch_size, seq_len)
    else:
        # Pad both streams to the overall maximum, then stack.
        max_seq_len = max(max(input_lengths), max(target_lengths))
        input_ids_batch = torch.stack(
            _pad_to_length(input_ids_list, max_seq_len, pad_token_id), dim=0
        )
        target_ids_batch = torch.stack(
            _pad_to_length(target_ids_list, max_seq_len, pad_token_id), dim=0
        )

    return input_ids_batch, target_ids_batch
77
+
78
+
79
+ if __name__ == "__main__":
80
+ print("=" * 60)
81
+ print("数据整理函数测试")
82
+ print("=" * 60)
83
+
84
+ # 模拟批次数据
85
+ batch_size = 4
86
+ seq_len = 10
87
+
88
+ print("\n1. 创建模拟批次数据")
89
+ print(f" 批次大小: {batch_size}")
90
+ print(f" 序列长度: {seq_len}")
91
+
92
+ # 创建模拟数据
93
+ batch = []
94
+ for i in range(batch_size):
95
+ input_ids = torch.randint(0, 100, (seq_len,))
96
+ target_ids = torch.randint(0, 100, (seq_len,))
97
+ batch.append((input_ids, target_ids))
98
+ print(f" 样本 {i}: input_ids 形状={input_ids.shape}, target_ids 形状={target_ids.shape}")
99
+
100
+ # 测试 collate_fn
101
+ print("\n2. 测试 collate_fn")
102
+ input_ids_batch, target_ids_batch = collate_fn(batch)
103
+
104
+ print(f" 输入批次形状: {input_ids_batch.shape}")
105
+ print(f" 目标批次形状: {target_ids_batch.shape}")
106
+ print(f" 期望形状: ({batch_size}, {seq_len})")
107
+
108
+ # 验证形状
109
+ assert input_ids_batch.shape == (batch_size, seq_len), \
110
+ f"输入批次形状错误: {input_ids_batch.shape} != ({batch_size}, {seq_len})"
111
+ assert target_ids_batch.shape == (batch_size, seq_len), \
112
+ f"目标批次形状错误: {target_ids_batch.shape} != ({batch_size}, {seq_len})"
113
+
114
+ print(" 形状验证通过")
115
+
116
+ # 验证数据是否正确堆叠
117
+ print("\n3. 验证数据堆叠")
118
+ for i in range(batch_size):
119
+ input_match = torch.equal(input_ids_batch[i], batch[i][0])
120
+ target_match = torch.equal(target_ids_batch[i], batch[i][1])
121
+ print(f" 样本 {i}: input_ids 匹配={input_match}, target_ids 匹配={target_match}")
122
+ assert input_match and target_match, f"样本 {i} 数据不匹配"
123
+
124
+ print(" 数据验证通过")
125
+
126
+ # 测试不同序列长度(需要 padding)
127
+ print("\n4. 测试不同序列长度(需要 padding)")
128
+ batch_variable = [
129
+ (torch.randint(0, 100, (5,)), torch.randint(0, 100, (5,))),
130
+ (torch.randint(0, 100, (8,)), torch.randint(0, 100, (8,))),
131
+ (torch.randint(0, 100, (10,)), torch.randint(0, 100, (10,))),
132
+ ]
133
+
134
+ print(" 样本长度: [5, 8, 10]")
135
+ input_batch_var, target_batch_var = collate_fn(batch_variable, pad_token_id=0)
136
+
137
+ print(f" 输入批次形状: {input_batch_var.shape}")
138
+ print(f" 目标批次形状: {target_batch_var.shape}")
139
+ print(f" 期望形状: (3, 10)")
140
+
141
+ assert input_batch_var.shape == (3, 10), \
142
+ f"输入批次形状错误: {input_batch_var.shape} != (3, 10)"
143
+ assert target_batch_var.shape == (3, 10), \
144
+ f"目标批次形状错误: {target_batch_var.shape} != (3, 10)"
145
+
146
+ # 验证 padding 是否正确
147
+ print("\n5. 验证 padding")
148
+ for i, (orig_input, orig_target) in enumerate(batch_variable):
149
+ orig_len = len(orig_input)
150
+ # 检查原始数据是否正确
151
+ assert torch.equal(input_batch_var[i, :orig_len], orig_input), \
152
+ f"样本 {i} 的 input_ids 数据不匹配"
153
+ assert torch.equal(target_batch_var[i, :orig_len], orig_target), \
154
+ f"样本 {i} 的 target_ids 数据不匹配"
155
+ # 检查 padding 是否正确(应该都是 pad_token_id)
156
+ if orig_len < 10:
157
+ assert torch.all(input_batch_var[i, orig_len:] == 0), \
158
+ f"样本 {i} 的 input_ids padding 不正确"
159
+ assert torch.all(target_batch_var[i, orig_len:] == 0), \
160
+ f"样本 {i} 的 target_ids padding 不正确"
161
+ print(f" 样本 {i}: 长度={orig_len}, padding 验证通过")
162
+
163
+ print(" ✓ Padding 验证通过")
164
+
165
+ print("\n" + "=" * 60)
166
+ print("所有测试完成!")
167
+ print("=" * 60)
llm/data/dataset.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """数据集:CharDataset"""
2
+ # 2026-01-23
3
+
4
+ import numpy as np
5
+ import torch
6
+ from pathlib import Path
7
+
8
+
9
class CharDataset:
    """Character-level language-modeling dataset over a pre-tokenized ID array.

    Serves sliding-window samples: position ``idx`` yields
    ``data[idx : idx + context_window]`` as the input and the same window
    shifted right by one token as the target.
    """

    def __init__(self, data_path, tokenizer, context_window=128):
        """
        Args:
            data_path: path to a ``.npy`` file holding a 1-D array of token IDs.
            tokenizer: tokenizer object (kept for reference, e.g. vocab_size;
                the data on disk is already tokenized).
            context_window: sequence length of each training sample.

        Raises:
            FileNotFoundError: if ``data_path`` does not exist.
        """
        self.tokenizer = tokenizer
        self.context_window = context_window

        data_path = Path(data_path)
        if not data_path.exists():
            raise FileNotFoundError(f"数据文件不存在: {data_path}")

        # One flat 1-D numpy array of token IDs covering the whole corpus.
        self.data = np.load(str(data_path))
        if self.data.ndim > 1:
            self.data = self.data.flatten()

        print(f"数据集加载完成:")
        print(f" 数据文件: {data_path}")
        print(f" 数据长度: {len(self.data):,} tokens")
        print(f" 块大小: {context_window}")
        print(f" 可用样本数: {len(self)}")

    def __len__(self):
        """Number of available sliding-window samples.

        Each sample consumes ``context_window + 1`` tokens (input plus the
        shifted target), so there are ``len(data) - context_window`` start
        positions.  Clamped at 0 so a corpus shorter than the window yields
        an empty dataset instead of a negative length (fix: previously this
        could return a negative number).
        """
        return max(0, len(self.data) - self.context_window)

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)`` for sample *idx*.

        Both are LongTensors of shape ``(context_window,)``; the target is
        the input shifted right by one position.

        Raises:
            IndexError: if ``idx`` is outside ``[0, len(self))``.
        """
        if idx < 0 or idx >= len(self):
            raise IndexError(f"索引 {idx} 超出范围 [0, {len(self)})")

        # context_window + 1 tokens: the first W are inputs, the last W targets.
        chunk = self.data[idx:idx + self.context_window + 1]
        input_ids = torch.tensor(chunk[:-1], dtype=torch.long)
        target_ids = torch.tensor(chunk[1:], dtype=torch.long)
        return input_ids, target_ids
79
+
80
+
81
+ if __name__ == "__main__":
82
+ # 添加项目根目录到 Python 路径
83
+ from pathlib import Path
84
+ import sys
85
+
86
+ project_root = Path(__file__).parent.parent.parent
87
+ sys.path.insert(0, str(project_root))
88
+
89
+ from llm.data.tokenizer import CharTokenizer
90
+
91
+ print("=" * 60)
92
+ print("CharDataset 测试")
93
+ print("=" * 60)
94
+
95
+ # 测试参数
96
+ data_path = Path("data/processed/train.npy")
97
+ vocab_path = Path("data/vocab/char_vocab.json")
98
+ block_size = 256
99
+
100
+ # 检查文件是否存在
101
+ if not data_path.exists():
102
+ print(f"错误:数据文件不存在: {data_path}")
103
+ print("请先运行: python scripts/preprocess.py")
104
+ sys.exit(1)
105
+
106
+ if not vocab_path.exists():
107
+ print(f"错误:词汇表文件不存在: {vocab_path}")
108
+ print("请先运行: python scripts/preprocess.py")
109
+ sys.exit(1)
110
+
111
+ # 加载分词器
112
+ print("\n1. 加载分词器")
113
+ tokenizer = CharTokenizer(vocab_path=str(vocab_path))
114
+ print(f" 词汇表大小: {tokenizer.vocab_size}")
115
+
116
+ # 创建数据集
117
+ print("\n2. 创建数据集")
118
+ dataset = CharDataset(
119
+ data_path=str(data_path),
120
+ tokenizer=tokenizer,
121
+ block_size=block_size
122
+ )
123
+
124
+ # 测试数据集长度
125
+ print(f"\n3. 数据集信息")
126
+ print(f" 数据集大小: {len(dataset):,} 个样本")
127
+ print(f" 每个样本输入长度: {block_size}")
128
+ print(f" 每个样本目标长度: {block_size}")
129
+
130
+ # 测试获取样本
131
+ print("\n4. 测试获取样本")
132
+ input_ids, target_ids = dataset[0]
133
+ print(f" 样本 0:")
134
+ print(f" 输入形状: {input_ids.shape}")
135
+ print(f" 目标形状: {target_ids.shape}")
136
+ print(f" 输入前10个 token IDs: {input_ids[:10].tolist()}")
137
+ print(f" 目标前10个 token IDs: {target_ids[:10].tolist()}")
138
+
139
+ # 验证目标是否正确(应该是输入右移一位)
140
+ print(f"\n5. 验证目标序列")
141
+ expected_target = input_ids[1:].tolist()
142
+ actual_target = target_ids[:-1].tolist()
143
+ is_correct = expected_target == actual_target
144
+ print(f" 目标序列是否正确(右移一位): {is_correct}")
145
+ if not is_correct:
146
+ print(f" 期望: {expected_target[:10]}")
147
+ print(f" 实际: {actual_target[:10]}")
148
+
149
+ # 测试解码
150
+ print(f"\n6. 测试解码")
151
+ input_text = tokenizer.decode(input_ids[:50])
152
+ target_text = tokenizer.decode(target_ids[:50])
153
+ print(f" 输入文本(前50个字符): {repr(input_text)}")
154
+ print(f" 目标文本(前50个字符): {repr(target_text)}")
155
+
156
+ # 测试多个样本
157
+ print(f"\n7. 测试多个样本")
158
+ for i in [0, 100, len(dataset) - 1]:
159
+ input_ids, target_ids = dataset[i]
160
+ print(f" 样本 {i}: 输入形状 {input_ids.shape}, 目标形状 {target_ids.shape}")
161
+
162
+ print("\n" + "=" * 60)
163
+ print("所有测试完成!")
164
+ print("=" * 60)
llm/data/tokenizer.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """分词器:CharTokenizer,从切割文本得到 token IDs"""
2
+ # 2026-01-22
3
+
4
+ import json
5
+ from pathlib import Path
6
+ from collections import Counter
7
+
8
+
9
class CharTokenizer:
    """A character-level tokenizer (built for English text)."""

    def __init__(self, vocab_path=None):
        """Create a tokenizer, optionally loading a saved vocabulary.

        Args:
            vocab_path: path to a JSON vocabulary file; loaded if it exists.
        """
        self.vocab_path = vocab_path
        self.char_to_id = {}   # character -> integer ID
        self.id_to_char = {}   # integer ID -> character
        self.vocab_size = 0
        if vocab_path and Path(vocab_path).exists():
            self.load_vocab(vocab_path)

    def build_vocab(self, texts):
        """Build the vocabulary from raw text.

        Characters get IDs by descending frequency, after the reserved
        ``<unk>`` token at ID 0 (no <pad>/<bos>/<eos>: sliding-window
        training does not need them).

        Args:
            texts: a single string or a list of strings.
        """
        corpus = [texts] if isinstance(texts, str) else texts
        frequencies = Counter(''.join(corpus))

        # ID 0 is reserved for the unknown-character token.
        vocab = {'<unk>': 0}
        for char, _count in sorted(frequencies.items(), key=lambda kv: kv[1], reverse=True):
            vocab.setdefault(char, len(vocab))

        self.char_to_id = vocab
        self.id_to_char = {idx: ch for ch, idx in vocab.items()}
        self.vocab_size = len(vocab)

    def encode(self, text):
        """Map each character of *text* to its ID; unknown characters map to <unk>.

        Returns:
            list of integer token IDs.
        """
        unk_id = self.char_to_id.get('<unk>', 0)
        return [self.char_to_id.get(ch, unk_id) for ch in text]

    def decode(self, token_ids):
        """Convert a list (or tensor) of token IDs back into a string.

        ``<unk>`` placeholders (including unknown IDs) are dropped from the
        output.
        """
        # Accept PyTorch tensors transparently.
        if hasattr(token_ids, 'tolist'):
            token_ids = token_ids.tolist()
        pieces = (self.id_to_char.get(token_id, '<unk>') for token_id in token_ids)
        return ''.join(ch for ch in pieces if ch != '<unk>')

    def save_vocab(self, vocab_path):
        """Write the vocabulary to *vocab_path* as JSON, creating parent dirs."""
        Path(vocab_path).parent.mkdir(parents=True, exist_ok=True)
        payload = {
            'char_to_id': self.char_to_id,
            'id_to_char': {str(k): v for k, v in self.id_to_char.items()},
            'vocab_size': self.vocab_size
        }
        with open(vocab_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

    def load_vocab(self, vocab_path):
        """Load a vocabulary previously written by :meth:`save_vocab`."""
        with open(vocab_path, 'r', encoding='utf-8') as f:
            payload = json.load(f)
        self.char_to_id = payload['char_to_id']
        # JSON keys are strings; restore integer IDs.
        self.id_to_char = {int(k): v for k, v in payload['id_to_char'].items()}
        self.vocab_size = payload['vocab_size']
llm/inference/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """推理模块:文本生成"""
2
+
3
+ from llm.inference.generate import greedy_decode, sample_decode
4
+
5
+ __all__ = ['greedy_decode', 'sample_decode']
llm/inference/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (359 Bytes). View file
 
llm/inference/__pycache__/generate.cpython-312.pyc ADDED
Binary file (5.8 kB). View file
 
llm/inference/generate.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """文本生成:greedy / sampling"""
2
+ # 2026-01-23
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from llm.model.attention import create_causal_mask
7
+
8
+
9
def greedy_decode(model, input_ids, max_length=100, device="cpu", stop_token_ids=None):
    """Greedy decoding: at every step pick the highest-probability token.

    Args:
        model: Transformer model; called as ``model(ids, mask=...)`` and
            expected to return logits of shape (batch_size, seq_len, vocab_size).
        input_ids: prompt token IDs, shape (batch_size, seq_len).
        max_length: number of NEW tokens to generate (excludes the prompt).
        device: device to run generation on.
        stop_token_ids: optional int or list of ints; once every sequence has
            produced one of these, generation stops early.  NOTE(review): the
            stop token itself is still appended, and already-finished
            sequences keep receiving tokens until all sequences finish.

    Returns:
        generated_ids: token IDs, shape (batch_size, total_length).
    """
    model.eval()
    input_ids = input_ids.to(device)
    generated_ids = input_ids.clone()

    # Normalize stop_token_ids to a list.
    if stop_token_ids is None:
        stop_token_ids = []
    elif isinstance(stop_token_ids, int):
        stop_token_ids = [stop_token_ids]

    batch_size = generated_ids.size(0)
    # Per-sequence flag: has this sequence emitted a stop token yet?
    finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

    with torch.no_grad():
        for step in range(max_length):
            # Early exit once every sequence in the batch has finished.
            if finished.all():
                break

            # Current total sequence length (prompt + generated so far).
            seq_len = generated_ids.size(1)

            # Full forward pass over the whole sequence each step (no KV cache).
            causal_mask = create_causal_mask(seq_len, device=device)
            logits = model(generated_ids, mask=causal_mask)  # (batch_size, seq_len, vocab_size)

            # Only the last position predicts the next token.
            next_token_logits = logits[:, -1, :]  # (batch_size, vocab_size)

            # Greedy choice: argmax over the vocabulary.
            next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)  # (batch_size, 1)

            # Mark sequences that just produced a stop token.
            if stop_token_ids:
                for stop_id in stop_token_ids:
                    finished = finished | (next_token_id.squeeze(-1) == stop_id)

            # Append the chosen token (also to finished sequences; see note).
            generated_ids = torch.cat([generated_ids, next_token_id], dim=1)

    return generated_ids
65
+
66
+
67
def sample_decode(
    model,
    input_ids,
    max_length=100,
    temperature=1.0,
    top_k=0,
    top_p=0.0,
    device="cpu",
    stop_token_ids=None
):
    """Sampling decoder with temperature, top-k, and top-p (nucleus) filtering.

    Filters are applied in sequence: temperature scaling, then top-k, then
    top-p, then a single multinomial draw per sequence.

    Args:
        model: Transformer model; called as ``model(ids, mask=...)`` and
            expected to return logits of shape (batch_size, seq_len, vocab_size).
        input_ids: prompt token IDs, shape (batch_size, seq_len).
        max_length: number of NEW tokens to generate (excludes the prompt).
        temperature: sampling temperature (higher = more random, lower = more
            deterministic; 0 falls back to greedy decoding).
        top_k: keep only the k highest-probability tokens (0/None disables).
        top_p: nucleus sampling — keep the smallest set of tokens whose
            cumulative probability reaches p (0.0/None disables).
        device: device to run generation on.
        stop_token_ids: optional int or list of ints; once every sequence has
            produced one of these, generation stops early (the stop token is
            still appended).

    Returns:
        generated_ids: token IDs, shape (batch_size, total_length).
    """
    model.eval()
    input_ids = input_ids.to(device)
    generated_ids = input_ids.clone()

    # Normalize stop_token_ids to a list.
    if stop_token_ids is None:
        stop_token_ids = []
    elif isinstance(stop_token_ids, int):
        stop_token_ids = [stop_token_ids]

    batch_size = generated_ids.size(0)
    finished = torch.zeros(batch_size, dtype=torch.bool, device=device)

    # Temperature 0 means deterministic: delegate to greedy decoding.
    if temperature == 0:
        return greedy_decode(model, input_ids, max_length, device, stop_token_ids)

    with torch.no_grad():
        for step in range(max_length):
            # Early exit once every sequence in the batch has finished.
            if finished.all():
                break

            # Current total sequence length (prompt + generated so far).
            seq_len = generated_ids.size(1)

            # Full forward pass over the whole sequence each step (no KV cache).
            causal_mask = create_causal_mask(seq_len, device=device)
            logits = model(generated_ids, mask=causal_mask)  # (batch_size, seq_len, vocab_size)

            # Only the last position predicts the next token.
            next_token_logits = logits[:, -1, :]  # (batch_size, vocab_size)

            # Temperature scaling: divide logits before softmax.
            if temperature != 1.0:
                next_token_logits = next_token_logits / temperature

            # Top-K filtering: everything outside the k best becomes -inf.
            if top_k is not None and top_k > 0:
                top_k_logits, top_k_indices = torch.topk(next_token_logits, min(top_k, next_token_logits.size(-1)), dim=-1)
                # Scatter the surviving logits onto a -inf canvas.
                mask = torch.full_like(next_token_logits, float('-inf'))
                mask.scatter_(-1, top_k_indices, top_k_logits)
                next_token_logits = mask

            # Top-P (nucleus) filtering, applied after top-k.
            if top_p is not None and top_p > 0.0:
                probs = F.softmax(next_token_logits, dim=-1)

                # Sort tokens by descending probability.
                sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=-1)

                cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

                # Drop tokens once the cumulative mass exceeds top_p...
                sorted_indices_to_remove = cumulative_probs > top_p
                # ...but shift the removal mask right so the first token over
                # the threshold is kept — guarantees at least one candidate.
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = False

                # Map the sorted-order mask back to vocabulary order.
                indices_to_remove = sorted_indices_to_remove.scatter(
                    -1, sorted_indices, sorted_indices_to_remove
                )
                next_token_logits[indices_to_remove] = float('-inf')

            # Renormalize over surviving tokens.
            probs = F.softmax(next_token_logits, dim=-1)

            # One multinomial draw per sequence.
            next_token_id = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

            # Mark sequences that just produced a stop token.
            if stop_token_ids:
                for stop_id in stop_token_ids:
                    finished = finished | (next_token_id.squeeze(-1) == stop_id)

            # Append the chosen token (also to finished sequences).
            generated_ids = torch.cat([generated_ids, next_token_id], dim=1)

    return generated_ids
llm/model/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """模型组件模块"""
llm/model/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (219 Bytes). View file
 
llm/model/__pycache__/attention.cpython-312.pyc ADDED
Binary file (16.3 kB). View file
 
llm/model/__pycache__/block.cpython-312.pyc ADDED
Binary file (6.39 kB). View file
 
llm/model/__pycache__/embedding.cpython-312.pyc ADDED
Binary file (1.45 kB). View file
 
llm/model/__pycache__/ffn.cpython-312.pyc ADDED
Binary file (5.47 kB). View file
 
llm/model/__pycache__/norm.cpython-312.pyc ADDED
Binary file (7.19 kB). View file
 
llm/model/__pycache__/rope.cpython-312.pyc ADDED
Binary file (5.92 kB). View file
 
llm/model/__pycache__/transformer.cpython-312.pyc ADDED
Binary file (10.9 kB). View file
 
llm/model/attention.py ADDED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """注意力机制:MHA / GQA / 滑动窗口注意力"""
2
+ # 2026-01-23
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import math
7
+
8
+
9
def scaled_dot_product_attention(
    q, k, v, mask=None, sliding_window=None, sliding_window_overlap=True
):
    """Scaled dot-product attention (functional core).

    Args:
        q: queries, (batch_size, num_heads, seq_len, head_dim)
        k: keys,    (batch_size, num_heads, seq_len, head_dim)
        v: values,  (batch_size, num_heads, seq_len, head_dim)
        mask: optional boolean mask, (seq_len, seq_len) or
            (batch_size, seq_len, seq_len); True means "may attend".
        sliding_window: optional window size; when given, a sliding-window
            mask (which already includes causality) is applied first.
        sliding_window_overlap: symmetric window (True) vs. causal-only window.

    Returns:
        output: (batch_size, num_heads, seq_len, head_dim)
        attn_weights: (batch_size, num_heads, seq_len, seq_len)
    """
    seq_len = q.size(-2)
    head_dim = q.size(-1)

    # Similarity scores Q @ K^T, scaled by sqrt(d) for stable softmax.
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(head_dim)

    # Sliding-window mask first; it already encodes causality, so callers
    # using it typically pass mask=None.
    if sliding_window is not None:
        scores = apply_sliding_window_mask(
            scores, sliding_window, seq_len, sliding_window_overlap
        )

    # Optional explicit mask. Common shapes: (L, L) shared across batch/heads
    # (causal) or (B, L, L) per-batch (padding). Broadcast up to 4-D.
    if mask is not None:
        while mask.dim() < 4:
            mask = mask.unsqueeze(1) if mask.dim() == 3 else mask.unsqueeze(0)
        # Positions with mask == False become -inf, i.e. zero after softmax.
        scores = scores.masked_fill(~mask, float("-inf"))

    # Probability distribution over the key axis.
    attn_weights = torch.softmax(scores, dim=-1)

    # Weighted mixture of the values.
    return torch.matmul(attn_weights, v), attn_weights
58
+
59
+
60
def create_causal_mask(seq_len, device="cpu"):
    """Build a causal (lower-triangular) attention mask.

    Args:
        seq_len: sequence length.
        device: device for the returned tensor.

    Returns:
        (seq_len, seq_len) boolean tensor; entry [i, j] is True iff j <= i,
        i.e. each position may attend to itself and everything before it.
    """
    full = torch.ones(seq_len, seq_len, device=device)
    return torch.tril(full).bool()
73
+
74
+
75
def apply_sliding_window_mask(scores, window_size, seq_len, overlap=True):
    """Apply a causal sliding-window mask to attention scores.

    Replaces the previous O(seq_len) Python loops with vectorized tensor
    comparisons on the relative-position matrix; semantics are unchanged.

    Args:
        scores: attention scores, (batch_size, num_heads, seq_len, seq_len).
        window_size: window width.
        seq_len: sequence length (kept for interface compatibility; the
            effective length is read from `scores`, as before).
        overlap: True  -> symmetric window, j in [i - w//2, i + w//2];
                 False -> causal-only window, j in [i - w + 1, i].

    Returns:
        `scores` with disallowed positions set to -inf. Causality (j <= i)
        is always enforced in addition to the window constraint.
    """
    seq_len = scores.size(-1)
    device = scores.device

    # rel[i, j] = i - j: how far key j lies behind query i.
    pos = torch.arange(seq_len, device=device)
    rel = pos.unsqueeze(1) - pos.unsqueeze(0)  # (seq_len, seq_len)

    if overlap:
        # Symmetric window: |i - j| <= window_size // 2.
        half = window_size // 2
        window_mask = (rel >= -half) & (rel <= half)
    else:
        # Causal-only window: 0 <= i - j < window_size.
        window_mask = (rel >= 0) & (rel < window_size)

    # Intersect with the causal constraint (j <= i) and broadcast over
    # batch and head dimensions.
    combined_mask = window_mask & (rel >= 0)
    combined_mask = combined_mask.unsqueeze(0).unsqueeze(0)  # (1, 1, L, L)

    # Disallowed positions become -inf (zero weight after softmax).
    return scores.masked_fill(~combined_mask, float("-inf"))
120
+
121
+
122
class MultiHeadAttention(nn.Module):
    """
    Multi-head attention (MHA).

    Every query, key and value head is independent (no KV sharing).
    """

    def __init__(self, hidden_size, num_heads):
        """
        Args:
            hidden_size: model hidden dimension
            num_heads: number of attention heads
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads

        # The hidden dimension must split evenly across the heads.
        assert hidden_size % num_heads == 0, (
            f"hidden_size ({hidden_size}) 必须能被 num_heads ({num_heads}) 整除"
        )

        # Independent linear projections for Q, K and V.
        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)

        # Output projection back to hidden_size.
        self.o_proj = nn.Linear(hidden_size, hidden_size)

    def forward(
        self, x, mask=None, rope=None, sliding_window=None, sliding_window_overlap=True
    ):
        """
        Args:
            x: input, (batch_size, seq_len, hidden_size)
            mask: attention mask, (batch_size, seq_len, seq_len) or (seq_len, seq_len)
            rope: optional RoPE module applied to Q and K
            sliding_window: optional sliding-window size
            sliding_window_overlap: whether the window is symmetric

        Returns:
            (batch_size, seq_len, hidden_size) tensor
        """
        batch_size, seq_len, hidden_size = x.shape

        # 1. Project to Q, K, V.
        q = self.q_proj(x)  # (batch_size, seq_len, hidden_size)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # 2. Split into heads:
        # (batch_size, seq_len, hidden_size) -> (batch_size, seq_len, num_heads, head_dim)
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim)
        k = k.view(batch_size, seq_len, self.num_heads, self.head_dim)
        v = v.view(batch_size, seq_len, self.num_heads, self.head_dim)

        # 3. Move the head axis before the sequence axis for attention:
        # (B, L, H, D_h) -> (B, H, L, D_h)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        # 4. Rotary position embedding, if provided.
        if rope is not None:
            q, k = rope(q, k)

        # 5. Attention.
        # attn_output: (B, H, L, D_h) — per-head, per-token mixture of values.
        # attn_weights: (B, H, L, L) — attention of token i over all tokens.
        attn_output, attn_weights = scaled_dot_product_attention(
            q, k, v, mask, sliding_window, sliding_window_overlap
        )

        # 6. Move the heads back:
        # (batch_size, num_heads, seq_len, head_dim) -> (batch_size, seq_len, num_heads, head_dim)
        attn_output = attn_output.transpose(1, 2)

        # 7. Merge heads back into the hidden dimension:
        # (batch_size, seq_len, num_heads, head_dim) -> (batch_size, seq_len, hidden_size)
        attn_output = attn_output.contiguous().view(batch_size, seq_len, hidden_size)

        # 8. Final output projection.
        output = self.o_proj(attn_output)

        return output
214
+
215
+
216
class GroupedQueryAttention(nn.Module):
    """
    Grouped-query attention (GQA).

    Several query heads share one key/value head, shrinking the KV cache.
    """

    def __init__(self, hidden_size, num_heads, num_kv_heads):
        """
        Args:
            hidden_size: model hidden dimension
            num_heads: number of query heads
            num_kv_heads: number of key/value heads (must divide num_heads)
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = hidden_size // num_heads

        # hidden_size must split evenly across query heads.
        assert hidden_size % num_heads == 0, (
            f"hidden_size ({hidden_size}) 必须能被 num_heads ({num_heads}) 整除"
        )
        # Each KV head serves an equal-sized group of query heads.
        assert num_heads % num_kv_heads == 0, (
            f"num_heads ({num_heads}) 必须能被 num_kv_heads ({num_kv_heads}) 整除"
        )

        # Query projection: every query head has its own Q.
        self.q_proj = nn.Linear(hidden_size, hidden_size)

        # K and V projections are smaller: D -> num_kv_heads * head_dim,
        # later reshaped to (B, L, num_kv_heads, head_dim).
        kv_hidden_size = num_kv_heads * self.head_dim
        self.k_proj = nn.Linear(hidden_size, kv_hidden_size)
        self.v_proj = nn.Linear(hidden_size, kv_hidden_size)

        # Output projection.
        self.o_proj = nn.Linear(hidden_size, hidden_size)

    def forward(
        self, x, mask=None, rope=None, sliding_window=None, sliding_window_overlap=True
    ):
        """
        Args:
            x: input, (batch_size, seq_len, hidden_size)
            mask: attention mask, (batch_size, seq_len, seq_len) or (seq_len, seq_len)
            rope: optional RoPE module applied to Q and K
            sliding_window: optional sliding-window size
            sliding_window_overlap: whether the window is symmetric

        Returns:
            (batch_size, seq_len, hidden_size) tensor
        """
        batch_size, seq_len, hidden_size = x.shape

        # 1. Projections.
        q = self.q_proj(x)  # (batch_size, seq_len, hidden_size)
        k = self.k_proj(x)  # (batch_size, seq_len, num_kv_heads * head_dim)
        v = self.v_proj(x)  # (batch_size, seq_len, num_kv_heads * head_dim)

        # 2. Split Q into query heads.
        q = q.view(batch_size, seq_len, self.num_heads, self.head_dim)
        q = q.transpose(1, 2)  # (batch_size, num_heads, seq_len, head_dim)

        # 3. Split K and V into the (fewer) KV heads.
        k = k.view(batch_size, seq_len, self.num_kv_heads, self.head_dim)
        k = k.transpose(1, 2)  # (batch_size, num_kv_heads, seq_len, head_dim)

        v = v.view(batch_size, seq_len, self.num_kv_heads, self.head_dim)
        v = v.transpose(1, 2)  # (batch_size, num_kv_heads, seq_len, head_dim)

        # 4. Rotary position embedding (applied before KV replication).
        if rope is not None:
            q, k = rope(q, k)

        # 5. Replicate each KV head so head counts match,
        #    e.g. 10 query heads with 2 KV heads -> repeat each KV head 5x.
        repeat_kv = self.num_heads // self.num_kv_heads
        k = k.repeat_interleave(
            repeat_kv, dim=1
        )  # (batch_size, num_heads, seq_len, head_dim)
        v = v.repeat_interleave(
            repeat_kv, dim=1
        )  # (batch_size, num_heads, seq_len, head_dim)

        # 6. Attention.
        # attn_output: (B, H, L, D_h) — per-head, per-token mixture of values.
        # attn_weights: (B, H, L, L) — attention of token i over all tokens.
        attn_output, attn_weights = scaled_dot_product_attention(
            q, k, v, mask, sliding_window, sliding_window_overlap
        )

        # 7. Move the heads back and merge them.
        attn_output = attn_output.transpose(
            1, 2
        )  # (batch_size, seq_len, num_heads, head_dim)

        # (batch_size, seq_len, num_heads, head_dim) -> (batch_size, seq_len, hidden_size)
        attn_output = attn_output.contiguous().view(batch_size, seq_len, hidden_size)

        # 8. Output projection.
        output = self.o_proj(attn_output)

        return output
327
+
328
+
329
if __name__ == "__main__":
    print("=" * 60)
    print("注意力机制测试")
    print("=" * 60)

    # Test hyper-parameters (kept consistent with configs/model.yaml).
    hidden_size = 320
    num_heads = 10
    num_kv_heads = 2
    head_dim = hidden_size // num_heads
    batch_size = 2
    seq_len = 10

    print("\n1. 测试参数")
    print(f"  hidden_size: {hidden_size}")
    print(f"  num_heads: {num_heads}")
    print(f"  num_kv_heads: {num_kv_heads}")
    print(f"  head_dim: {head_dim}")
    print(f"  batch_size: {batch_size}")
    print(f"  seq_len: {seq_len}")

    # Exercise the functional attention primitive with a causal mask.
    print("\n2. 测试 scaled_dot_product_attention")
    q = torch.randn(batch_size, num_heads, seq_len, head_dim)
    k = torch.randn(batch_size, num_heads, seq_len, head_dim)
    v = torch.randn(batch_size, num_heads, seq_len, head_dim)
    causal_mask = create_causal_mask(seq_len)

    output, attn_weights = scaled_dot_product_attention(q, k, v, causal_mask)
    print(f"  输入 Q 形状: {q.shape}")
    print(f"  输出形状: {output.shape}")
    print(f"  注意力权重形状: {attn_weights.shape}")
    print(f"  注意力权重和(每行应为1): {attn_weights.sum(dim=-1)[0, 0, :5]}")

    # MHA smoke test.
    print("\n3. 测试 MultiHeadAttention")
    mha = MultiHeadAttention(hidden_size, num_heads)
    x = torch.randn(batch_size, seq_len, hidden_size)
    output_mha = mha(x, mask=causal_mask)
    print(f"  输入形状: {x.shape}")
    print(f"  输出形状: {output_mha.shape}")
    print(f"  参数数量: {sum(p.numel() for p in mha.parameters())}")

    # GQA smoke test; note the smaller K/V projection shapes.
    print("\n4. 测试 GroupedQueryAttention")
    gqa = GroupedQueryAttention(hidden_size, num_heads, num_kv_heads)
    output_gqa = gqa(x, mask=causal_mask)
    print(f"  输入形状: {x.shape}")
    print(f"  输出形状: {output_gqa.shape}")
    print(f"  参数数量: {sum(p.numel() for p in gqa.parameters())}")
    print(f"  Q 投影参数: {gqa.q_proj.weight.shape}")
    print(f"  K 投影参数: {gqa.k_proj.weight.shape}")
    print(f"  V 投影参数: {gqa.v_proj.weight.shape}")

    # GQA combined with rotary position embeddings.
    print("\n5. 测试 GroupedQueryAttention + RoPE")
    from pathlib import Path
    import sys

    # Make the project root importable when running this file directly.
    project_root = Path(__file__).parent.parent.parent
    sys.path.insert(0, str(project_root))
    from llm.model.rope import RoPE

    rope = RoPE(dim=head_dim, max_seq_len=1024, theta=10000.0)
    output_gqa_rope = gqa(x, mask=causal_mask, rope=rope)
    print(f"  输出形状: {output_gqa_rope.shape}")
    print(f"  与无 RoPE 的输出不同: {not torch.allclose(output_gqa, output_gqa_rope)}")

    # Sliding-window mask applied directly to raw scores.
    print("\n6. 测试滑动窗口掩码")
    window_size = 4
    scores = torch.randn(batch_size, num_heads, seq_len, seq_len)
    scores_windowed = apply_sliding_window_mask(
        scores, window_size, seq_len, overlap=True
    )
    print(f"  窗口大小: {window_size}")
    print(f"  原始分数形状: {scores.shape}")
    print(f"  掩码后分数形状: {scores_windowed.shape}")
    # Inspect the mask pattern of the first sample / first head.
    mask_check = scores_windowed[0, 0] != float("-inf")
    print(f"  位置 0 可关注的位置数: {mask_check[0].sum().item()}")
    print(f"  位置 5 可关注的位置数: {mask_check[5].sum().item()}")

    # GQA with a sliding window instead of an explicit mask.
    print("\n7. 测试 GroupedQueryAttention + 滑动窗口")
    output_gqa_window = gqa(
        x, mask=None, rope=rope, sliding_window=window_size, sliding_window_overlap=True
    )
    print(f"  输出形状: {output_gqa_window.shape}")
    print(
        f"  与无滑动窗口的输出不同: {not torch.allclose(output_gqa_rope, output_gqa_window)}"
    )

    # MHA with a sliding window (the unified interface supports it too).
    print("\n8. 测试 MultiHeadAttention + 滑动窗口")
    output_mha_window = mha(
        x, mask=None, rope=rope, sliding_window=window_size, sliding_window_overlap=True
    )
    print(f"  输出形状: {output_mha_window.shape}")
    print(
        f"  与无滑动窗口的输出不同: {not torch.allclose(output_mha, output_mha_window)}"
    )

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
llm/model/block.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Transformer Block"""
2
+ # 2026-01-23
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from pathlib import Path
8
+ import sys
9
+
10
+ project_root = Path(__file__).parent.parent.parent
11
+ sys.path.insert(0, str(project_root))
12
+
13
+ from llm.model.attention import GroupedQueryAttention
14
+ from llm.model.ffn import FFN
15
+ from llm.model.norm import RMSNorm
16
+ from llm.model.rope import RoPE
17
+
18
+
19
class TransformerBlock(nn.Module):
    """
    One Transformer layer (block).

    Structure:
      1. attention sub-layer (with residual connection)
      2. FFN sub-layer (with residual connection)

    Both sub-layers use the Pre-Norm pattern:
      x_norm = norm(x); output = x + sublayer(x_norm)
    """

    def __init__(self, config):
        """
        Args:
            config: dict of model hyper-parameters (hidden_size,
                num_attention_heads, num_key_value_heads, intermediate_size,
                rms_norm_eps, max_position_embeddings, rope_theta,
                sliding_window, sliding_window_overlap).
        """
        super().__init__()
        self.config = config

        hidden_size = config["hidden_size"]
        num_heads = config["num_attention_heads"]
        num_kv_heads = config.get("num_key_value_heads", num_heads)
        intermediate_size = config["intermediate_size"]
        # float() guards against a YAML config delivering eps as a string.
        rms_norm_eps = float(config.get("rms_norm_eps", 1e-5))

        # Sliding-window attention settings (None disables windowing).
        self.sliding_window = config.get("sliding_window")
        self.sliding_window_overlap = config.get("sliding_window_overlap", True)

        # Attention sub-layer (grouped-query attention).
        self.attn = GroupedQueryAttention(hidden_size, num_heads, num_kv_heads)

        # Normalization layers (Pre-Norm structure).
        self.attn_norm = RMSNorm(hidden_size, eps=rms_norm_eps)
        self.ffn_norm = RMSNorm(hidden_size, eps=rms_norm_eps)

        # Feed-forward sub-layer.
        self.ffn = FFN(hidden_size, intermediate_size)

        # Rotary position embedding, sized per attention head.
        max_position_embeddings = config.get("max_position_embeddings", 1024)
        rope_theta = config.get("rope_theta", 10000.0)
        head_dim = hidden_size // num_heads
        self.rope = RoPE(head_dim, max_position_embeddings, rope_theta)

    def forward(self, x, mask=None):
        """
        Args:
            x: input, (batch_size, seq_len, hidden_size)
            mask: attention mask, (batch_size, seq_len, seq_len) or (seq_len, seq_len)

        Returns:
            (batch_size, seq_len, hidden_size) tensor
        """
        # 1. Attention sub-layer: normalize first (Pre-Norm), then attend.
        x_norm = self.attn_norm(x)
        attn_output = self.attn(
            x_norm,
            mask=mask,
            rope=self.rope,
            sliding_window=self.sliding_window,
            sliding_window_overlap=self.sliding_window_overlap,
        )
        # Residual connection.
        x = x + attn_output

        # 2. FFN sub-layer: Pre-Norm, then feed-forward.
        x_norm = self.ffn_norm(x)
        ffn_output = self.ffn(x_norm)
        # Residual connection.
        x = x + ffn_output

        return x
99
+
100
+
101
if __name__ == "__main__":
    print("=" * 60)
    print("TransformerBlock 测试")
    print("=" * 60)

    # Config kept consistent with configs/model.yaml.
    config = {
        "hidden_size": 320,
        "num_attention_heads": 10,
        "num_key_value_heads": 2,
        "intermediate_size": 960,
        "rms_norm_eps": 1e-5,
        "max_position_embeddings": 1024,
        "rope_theta": 10000.0,
        "sliding_window": 256,
        "sliding_window_overlap": True,
    }

    batch_size = 2
    seq_len = 64
    hidden_size = config["hidden_size"]

    print("\n1. 测试参数")
    print(f"  hidden_size: {hidden_size}")
    print(f"  num_attention_heads: {config['num_attention_heads']}")
    print(f"  num_key_value_heads: {config['num_key_value_heads']}")
    print(f"  intermediate_size: {config['intermediate_size']}")
    print(f"  batch_size: {batch_size}")
    print(f"  seq_len: {seq_len}")

    # Build one block and run a forward pass.
    print("\n2. 创建 TransformerBlock")
    block = TransformerBlock(config)
    x = torch.randn(batch_size, seq_len, hidden_size)

    print(f"  输入形状: {x.shape}")
    output = block(x)
    print(f"  输出形状: {output.shape}")
    print(f"  形状匹配: {output.shape == x.shape}")

    # Parameter count.
    total_params = sum(p.numel() for p in block.parameters())
    print(f"  参数数量: {total_params:,}")

    # With a tiny input, the residual path should dominate the output.
    print("\n3. 验证残差连接")
    x_small = torch.randn(batch_size, seq_len, hidden_size) * 0.01
    output_small = block(x_small)
    diff = torch.abs(output_small - x_small).mean()
    print(f"  小输入测试: 输入输出差异均值 = {diff.item():.6f}")

    # Gradients should reach every parameter.
    print("\n4. 测试梯度计算")
    loss = output.sum()
    loss.backward()
    print(
        f"  所有参数是否有梯度: {all(p.grad is not None for p in block.parameters())}"
    )

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
llm/model/embedding.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """嵌入层:Token Embedding 词嵌入"""
2
+ # 2026-01-23
3
+
4
+ import torch.nn as nn
5
+
6
+
7
class TokenEmbedding(nn.Module):
    """Token embedding: maps integer token ids to dense vectors."""

    def __init__(self, vocab_size, hidden_size):
        """
        Args:
            vocab_size: number of entries in the vocabulary.
            hidden_size: embedding (hidden) dimension.
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

        # Lookup table from token id to hidden vector.
        self.embedding = nn.Embedding(vocab_size, hidden_size)

    def forward(self, input_ids):
        """Look up embeddings for a batch of token ids.

        Args:
            input_ids: (batch_size, seq_len) integer tensor of token ids.

        Returns:
            (batch_size, seq_len, hidden_size) float tensor.
        """
        embedded = self.embedding(input_ids)
        return embedded
llm/model/ffn.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """前馈神经网络(FFN)"""
2
+ # 2026-01-23
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+
8
def silu(x):
    """SiLU (a.k.a. swish) activation: SiLU(x) = x * sigmoid(x).

    Args:
        x: input tensor.

    Returns:
        Tensor of the same shape with SiLU applied elementwise.
    """
    gate = torch.sigmoid(x)
    return gate * x
21
+
22
+
23
class FFN(nn.Module):
    """SwiGLU feed-forward network.

    FFN(x) = down_proj( silu(gate_proj(x)) * up_proj(x) )
    """

    def __init__(self, hidden_size, intermediate_size):
        """
        Args:
            hidden_size: model hidden dimension.
            intermediate_size: width of the expanded intermediate layer.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size

        # The three projections of a SwiGLU block:
        #   gate_proj -> activated "gate" deciding which features pass through
        #   up_proj   -> raw (un-activated) features the gate modulates
        #   down_proj -> projection back down to hidden_size
        self.gate_proj = nn.Linear(hidden_size, intermediate_size)
        self.up_proj = nn.Linear(hidden_size, intermediate_size)
        self.down_proj = nn.Linear(intermediate_size, hidden_size)

        # Gate activation function.
        self.activation = silu

    def forward(self, x):
        """Apply the SwiGLU FFN.

        Args:
            x: (batch_size, seq_len, hidden_size) input.

        Returns:
            (batch_size, seq_len, hidden_size) output.
        """
        gated = self.activation(self.gate_proj(x))  # activated gate signal
        lifted = self.up_proj(x)  # un-activated features
        # Elementwise gating, then project back to hidden_size.
        return self.down_proj(gated * lifted)
84
+
85
+
86
if __name__ == "__main__":
    print("=" * 60)
    print("FFN 测试")
    print("=" * 60)

    # Hyper-parameters kept consistent with configs/model.yaml.
    hidden_size = 320
    intermediate_size = 960
    batch_size = 2
    seq_len = 10

    print("\n1. 测试参数")
    print(f"  hidden_size: {hidden_size}")
    print(f"  intermediate_size: {intermediate_size}")
    print(f"  batch_size: {batch_size}")
    print(f"  seq_len: {seq_len}")

    # SiLU sanity check on a few hand-picked values.
    print("\n2. 测试 SiLU 激活函数")
    x_test = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0])
    silu_output = silu(x_test)
    print(f"  输入: {x_test.tolist()}")
    print(f"  输出: {silu_output.tolist()}")
    print(f"  SiLU(0) 应该接近 0: {abs(silu_output[2].item()) < 0.01}")

    # FFN forward pass keeps the input shape.
    print("\n3. 测试 FFN(SwiGLU 结构)")
    ffn = FFN(hidden_size, intermediate_size)
    x = torch.randn(batch_size, seq_len, hidden_size)

    print(f"  输入形状: {x.shape}")
    output = ffn(x)
    print(f"  输出形状: {output.shape}")
    print(f"  形状匹配: {output.shape == x.shape}")

    # Parameter count and projection shapes.
    total_params = sum(p.numel() for p in ffn.parameters())
    print(f"  参数数量: {total_params}")
    print(f"  gate_proj 参数: {ffn.gate_proj.weight.shape}")
    print(f"  up_proj 参数: {ffn.up_proj.weight.shape}")
    print(f"  down_proj 参数: {ffn.down_proj.weight.shape}")

    # Recompute SwiGLU by hand and compare with forward().
    print("\n4. 验证 SwiGLU 结构")
    gate = silu(ffn.gate_proj(x))
    up = ffn.up_proj(x)
    gate_up = gate * up
    manual_output = ffn.down_proj(gate_up)
    print(f"  手动计算输出形状: {manual_output.shape}")
    print(f"  与 forward 输出一致: {torch.allclose(output, manual_output)}")

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
llm/model/norm.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """归一化层:RMSNorm"""
2
+ # 2026-01-22
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+
8
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learnable scale."""

    def __init__(self, dim, eps=1e-5):
        """
        Args:
            dim: size of the normalized (last) dimension.
            eps: numerical-stability term added under the square root.
        """
        super().__init__()
        # Coerce to float in case a YAML config delivered eps as a string.
        self.eps = float(eps)

        # Learnable per-channel gain (the "gamma"), initialized to ones so
        # the layer starts as a pure RMS normalization.
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Normalize `x` by its RMS over the last dimension and rescale.

        Args:
            x: tensor whose last dimension has size `dim`,
               e.g. (batch_size, seq_len, dim).

        Returns:
            Tensor of the same shape as `x`.
        """
        # Mean of squares over the feature dimension, kept for broadcasting.
        mean_sq = torch.mean(x.pow(2), dim=-1, keepdim=True)
        rms = torch.sqrt(mean_sq + self.eps)
        # Divide by the RMS, then scale by the learnable gain.
        return self.weight * (x / rms)
+
46
+
47
if __name__ == "__main__":
    print("=" * 60)
    print("RMSNorm 测试")
    print("=" * 60)

    # 1. Build the layer.
    dim = 32
    norm = RMSNorm(dim=dim, eps=1e-5)
    print("\n1. 创建 RMSNorm 层")
    print(f"  维度: {dim}")
    print(f"  eps: {norm.eps}")
    print(f"  权重形状: {norm.weight.shape}")
    print(f"  权重初始值(前5个): {norm.weight[:5]}")

    # 2. Random input and its statistics.
    batch_size = 2
    seq_len = 10
    x = torch.randn(batch_size, seq_len, dim)
    print("\n2. 创建测试输入")
    print(f"  输入形状: {x.shape}")
    print("  输入统计:")
    print(f"  - 均值: {x.mean().item():.4f}")
    print(f"  - 标准差: {x.std().item():.4f}")
    print(f"  - 最小值: {x.min().item():.4f}")
    print(f"  - 最大值: {x.max().item():.4f}")

    # 3. Forward pass.
    output = norm(x)
    print("\n3. 前向传播结果")
    print(f"  输出形状: {output.shape}")
    print("  输出统计:")
    print(f"  - 均值: {output.mean().item():.4f}")
    print(f"  - 标准差: {output.std().item():.4f}")

    # 4. After normalization the per-sample RMS should be close to 1
    #    (the learnable weight starts at all-ones).
    print("\n4. 验证归一化效果")
    rms_per_sample = torch.sqrt(torch.mean(output.pow(2), dim=-1))
    print("  每个样本的 RMS(归一化后):")
    print(f"  - 样本1: {rms_per_sample[0].mean().item():.4f}")
    print(f"  - 样本2: {rms_per_sample[1].mean().item():.4f}")
    print(f"  - 平均 RMS: {rms_per_sample.mean().item():.4f}")

    # 5. The scale must be a trainable parameter.
    print("\n5. 验证参数可学习性")
    print(f"  权重是否为 Parameter: {isinstance(norm.weight, nn.Parameter)}")
    print(f"  权重是否需要梯度: {norm.weight.requires_grad}")

    # 6. Gradient-flow check via a trivial loss.
    print("\n6. 测试梯度计算")
    loss = output.sum()
    loss.backward()
    print(f"  权重梯度是否存在: {norm.weight.grad is not None}")
    if norm.weight.grad is not None:
        print(f"  权重梯度形状: {norm.weight.grad.shape}")
        print(f"  权重梯度统计:")
        print(f"  - 均值: {norm.weight.grad.mean().item():.4f}")
        print(f"  - 标准差: {norm.weight.grad.std().item():.4f}")

    # 7. Shape robustness across batch/sequence sizes.
    print("\n7. 测试不同输入形状")
    test_cases = [
        (1, 5, dim),  # single sample
        (4, 20, dim),  # larger batch
        (1, 1, dim),  # single token
    ]

    for i, shape in enumerate(test_cases, 1):
        x_test = torch.randn(*shape)
        output_test = norm(x_test)
        print(f"  测试 {i}: 输入形状 {shape} -> 输出形状 {output_test.shape} ✓")

    # 8. Numerical stability on tiny inputs (eps prevents division by zero).
    print("\n8. 验证数值稳定性")
    x_small = torch.randn(1, 1, dim) * 1e-6
    output_small = norm(x_small)
    print(
        f"  极小输入测试: 输入范围 [{x_small.min().item():.2e}, {x_small.max().item():.2e}]"
    )
    print(f"  输出是否包含 NaN: {torch.isnan(output_small).any().item()}")
    print(f"  输出是否包含 Inf: {torch.isinf(output_small).any().item()}")

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
llm/model/rope.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RoPE 旋转位置编码"""
2
+ # 2026-01-22
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+
8
class RoPE(nn.Module):
    """Rotary position embedding (RoPE).

    Encodes positions by rotating pairs of feature channels of Q and K, so
    that dot products between rotated vectors depend on relative position.
    """

    def __init__(self, dim, max_seq_len=1024, theta=10000.0):
        """
        Args:
            dim: per-head dimension; must be even (channels rotate in pairs).
            max_seq_len: maximum sequence length the model should handle.
            theta: base of the rotation-frequency spectrum.
        """
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len
        self.theta = theta

        # Channels are rotated in pairs, so the head dim must be even.
        assert dim % 2 == 0, "dim 必须是偶数"

        # Per-pair inverse frequencies, depending only on the channel index:
        #   inv_freq[i] = 1 / theta**(2i / dim)
        # e.g. for dim=32: exponents 2i/dim = [0, 0.0625, ..., 0.9375], so
        # theta**(2i/dim) grows and inv_freq shrinks — low channel indices
        # rotate fast, high indices rotate slowly.
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
        # Buffer: saved/loaded and moved with the module, but not trained.
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, q, k, positions=None):
        """Rotate Q and K according to their positions.

        Args:
            q: (batch_size, num_heads, seq_len, head_dim) query tensor.
            k: (batch_size, num_heads, seq_len, head_dim) key tensor.
            positions: optional 1-D tensor of position indices; defaults to
                [0, 1, ..., seq_len - 1].

        Returns:
            (q_rot, k_rot), same shapes as the inputs.
        """
        # q[b, h, s, d]: sample b, head h, token s, feature component d.
        batch_size, num_heads, seq_len, head_dim = q.shape

        # Default positions 0..seq_len-1 on q's device.
        if positions is None:
            positions = torch.arange(seq_len, device=q.device)

        # Angle matrix: freqs[s, i] = position_s * inv_freq_i.
        freqs = torch.outer(positions.float(), self.inv_freq)

        # Rotation-matrix entries.
        cos = torch.cos(freqs)
        sin = torch.sin(freqs)

        # (seq_len, head_dim // 2) -> (1, 1, seq_len, head_dim // 2) so the
        # angles broadcast over batch and heads.
        cos = cos.unsqueeze(0).unsqueeze(0)
        sin = sin.unsqueeze(0).unsqueeze(0)

        # Split channels into two halves ("real" and "imaginary" parts).
        # NOTE(review): this is the half-split RoPE layout, not the
        # interleaved even/odd layout; the two are not checkpoint-compatible.
        q1, q2 = q.chunk(2, dim=-1)
        k1, k2 = k.chunk(2, dim=-1)

        # Apply the 2-D rotation  [cos -sin; sin cos]  to each (x1, x2) pair:
        #   [x1*cos - x2*sin,
        #    x1*sin + x2*cos]
        q_rot = torch.cat(
            [
                q1 * cos - q2 * sin,  # real part
                q1 * sin + q2 * cos,  # imaginary part
            ],
            dim=-1,
        )

        k_rot = torch.cat(
            [
                k1 * cos - k2 * sin,  # real part
                k1 * sin + k2 * cos,  # imaginary part
            ],
            dim=-1,
        )

        return q_rot, k_rot
106
+
107
+
108
if __name__ == "__main__":
    print("=" * 60)
    print("RoPE 测试")
    print("=" * 60)

    # Build a RoPE layer (head_dim must be even).
    head_dim = 32
    rope = RoPE(dim=head_dim, max_seq_len=1024, theta=10000.0)

    print(f"\n1. 创建 RoPE 层")
    print(f"  头维度: {head_dim}")
    print(f"  最大序列长度: {rope.max_seq_len}")
    print(f"  theta: {rope.theta}")
    print(f"  频率数量: {len(rope.inv_freq)}")
    print(f"  前5个频率: {rope.inv_freq[:5]}")

    # Random Q/K input.
    batch_size = 2
    num_heads = 10
    seq_len = 10

    q = torch.randn(batch_size, num_heads, seq_len, head_dim)
    k = torch.randn(batch_size, num_heads, seq_len, head_dim)

    print(f"\n2. 创建测试输入")
    print(f"  Q 形状: {q.shape}")
    print(f"  K 形状: {k.shape}")

    # Forward pass: shapes must be preserved.
    q_rot, k_rot = rope(q, k)

    print(f"\n3. 前向传播结果")
    print(f"  旋转后 Q 形状: {q_rot.shape}")
    print(f"  旋转后 K 形状: {k_rot.shape}")
    print(f"  形状是否匹配: {q_rot.shape == q.shape and k_rot.shape == k.shape}")

    # Different positions must receive different rotations.
    # (section title below had a mojibake character — restored to 旋)
    print(f"\n4. 验证旋转效果")
    q_pos0 = q_rot[0, 0, 0, :]  # first sample, first head, position 0
    q_pos1 = q_rot[0, 0, 1, :]  # first sample, first head, position 1
    print(f"  位置 0 的 Q(前5个值): {q_pos0[:5]}")
    print(f"  位置 1 的 Q(前5个值): {q_pos1[:5]}")
    print(f"  位置不同,编码不同: {not torch.allclose(q_pos0, q_pos1)}")

    # Explicit (non-contiguous) position indices.
    print(f"\n5. 测试不同位置")
    positions = torch.tensor([0, 5, 10])
    q_custom, k_custom = rope(q[:, :, :3, :], k[:, :, :3, :], positions=positions)
    print(f"  自定义位置: {positions.tolist()}")
    print(f"  输出形状: {q_custom.shape}")

    print(f"\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
llm/model/transformer.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Decoder-only Transformer 主模型"""
2
+ # 2026-01-23
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from pathlib import Path
8
+ import sys
9
+
10
+ # 添加项目根目录到 Python 路径
11
+ project_root = Path(__file__).parent.parent.parent
12
+ sys.path.insert(0, str(project_root))
13
+
14
+ from llm.model.embedding import TokenEmbedding
15
+ from llm.model.block import TransformerBlock
16
+ from llm.model.norm import RMSNorm
17
+
18
+
19
class Transformer(nn.Module):
    """Decoder-only Transformer language model.

    Pipeline: token embedding -> N TransformerBlocks -> RMSNorm ->
    vocabulary projection (optionally weight-tied to the embedding).
    """

    def __init__(self, config):
        """
        Initialize the model.

        Args:
            config: hyperparameter dict. Required keys: ``hidden_size``,
                ``num_hidden_layers``. Optional keys used here:
                ``vocab_size`` (default 1000, normally supplied by the data
                config), ``rms_norm_eps`` (default 1e-5),
                ``tie_word_embeddings`` (default True), ``init_std``
                (default 0.02), ``init_weights`` (default True). The full
                dict is forwarded to each TransformerBlock.
        """
        super().__init__()
        self.config = config

        vocab_size = config.get("vocab_size", 1000)
        hidden_size = config["hidden_size"]
        num_layers = config["num_hidden_layers"]
        rms_norm_eps = float(config.get("rms_norm_eps", 1e-5))
        tie_word_embeddings = config.get("tie_word_embeddings", True)

        # Token embedding table.
        self.embedding = TokenEmbedding(vocab_size, hidden_size)

        # Stack of decoder blocks.
        self.layers = nn.ModuleList(
            [TransformerBlock(config) for _ in range(num_layers)]
        )

        # Final normalization before the output projection.
        self.norm = RMSNorm(hidden_size, eps=rms_norm_eps)

        # Output head. With tied embeddings the input embedding matrix is
        # reused (transposed) inside forward(), so no separate layer exists.
        if tie_word_embeddings:
            self.lm_head = None
        else:
            self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

        # Weight initialization (enabled by default).
        init_std = config.get("init_std", 0.02)
        init_weights_enabled = config.get("init_weights", True)
        if init_weights_enabled:
            from llm.utils.init import apply_llm_init
            apply_llm_init(self, std=init_std, init_output_layer=True)

    def forward(self, input_ids, mask=None):
        """
        Forward pass.

        Args:
            input_ids: (batch_size, seq_len) token ids.
            mask: attention mask, (batch_size, seq_len, seq_len) or
                (seq_len, seq_len). NOTE(review): no causal mask is built
                here when ``mask`` is None — the trainer supplies one
                explicitly; confirm TransformerBlock's default behavior.

        Returns:
            logits: (batch_size, seq_len, vocab_size).
        """
        # 1. Embed tokens.
        x = self.embedding(input_ids)  # (batch_size, seq_len, hidden_size)

        # 2. Run the decoder stack.
        for layer in self.layers:
            x = layer(x, mask=mask)  # (batch_size, seq_len, hidden_size)

        # 3. Final normalization.
        x = self.norm(x)  # (batch_size, seq_len, hidden_size)

        # 4. Project hidden states to vocabulary logits.
        if self.lm_head is None:
            # Tied embeddings: F.linear(x, W) computes x @ W.T with
            # W = embedding.weight of shape (vocab_size, hidden_size).
            logits = F.linear(x, self.embedding.embedding.weight)
        else:
            logits = self.lm_head(x)  # (batch_size, seq_len, vocab_size)

        return logits

    def generate(
        self, input_ids, max_length=100, temperature=1.0, top_k=None, top_p=None
    ):
        """
        Autoregressive sampling.

        Args:
            input_ids: (batch_size, start_len) prompt token ids.
            max_length: total sequence length to generate up to (prompt
                included).
            temperature: softmax temperature; higher = more random.
            top_k: keep only the k most likely tokens before sampling.
            top_p: nucleus-sampling cumulative-probability threshold.

        Returns:
            (batch_size, generated_len) tensor of token ids.
        """
        self.eval()  # evaluation mode
        generated_ids = input_ids.clone()

        with torch.no_grad():
            for _ in range(max_length - input_ids.shape[1]):
                # Full re-encode of the sequence so far (no KV cache).
                # NOTE(review): called without a causal mask — confirm the
                # blocks mask internally, otherwise sampling sees the future.
                logits = self.forward(
                    generated_ids
                )  # (batch_size, seq_len, vocab_size)

                # Only the last position predicts the next token.
                next_token_logits = (
                    logits[:, -1, :] / temperature
                )  # (batch_size, vocab_size)

                # Top-K filtering: everything outside the k best -> -inf.
                if top_k is not None:
                    # Bug fix: clamp k so torch.topk never requests more
                    # entries than the vocabulary holds.
                    k = min(top_k, next_token_logits.size(-1))
                    top_k_values, top_k_indices = torch.topk(next_token_logits, k)
                    next_token_logits_filtered = torch.full_like(
                        next_token_logits, float("-inf")
                    )
                    next_token_logits_filtered.scatter_(1, top_k_indices, top_k_values)
                    next_token_logits = next_token_logits_filtered

                # Top-P (nucleus) filtering.
                if top_p is not None:
                    # Sort by probability, descending.
                    sorted_logits, sorted_indices = torch.sort(
                        next_token_logits, descending=True
                    )
                    sorted_probs = F.softmax(sorted_logits, dim=-1)

                    # Cumulative probability mass.
                    cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

                    # Remove tokens past the top_p mass; shift right so the
                    # single most likely token always survives.
                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                        ..., :-1
                    ].clone()
                    sorted_indices_to_remove[..., 0] = False

                    # Map the sorted-order mask back to vocabulary order.
                    indices_to_remove = sorted_indices_to_remove.scatter(
                        1, sorted_indices, sorted_indices_to_remove
                    )
                    next_token_logits[indices_to_remove] = float("-inf")

                # Sample the next token from the filtered distribution.
                probs = F.softmax(next_token_logits, dim=-1)  # (batch_size, vocab_size)
                next_token_id = torch.multinomial(
                    probs, num_samples=1
                )  # (batch_size, 1)

                # Append to the running sequence.
                generated_ids = torch.cat(
                    [generated_ids, next_token_id], dim=1
                )  # (batch_size, seq_len+1)

                # (Simplified: no early stop on EOS; fixed-length generation.)

        return generated_ids
180
+
181
+
182
if __name__ == "__main__":
    print("=" * 60)
    print("Transformer 模型测试")
    print("=" * 60)

    # Test parameters (kept in sync with configs/model.yaml).
    config = {
        "vocab_size": 100,  # toy vocabulary size
        "hidden_size": 320,
        "num_hidden_layers": 10,
        "num_attention_heads": 10,
        "num_key_value_heads": 2,
        "intermediate_size": 960,
        "rms_norm_eps": 1e-5,
        "max_position_embeddings": 1024,
        "rope_theta": 10000.0,
        "sliding_window": 256,
        "sliding_window_overlap": True,
        "tie_word_embeddings": True,
    }

    batch_size = 2
    seq_len = 10
    vocab_size = config["vocab_size"]

    print("\n1. 测试参数")
    print(f" vocab_size: {vocab_size}")
    print(f" hidden_size: {config['hidden_size']}")
    print(f" num_hidden_layers: {config['num_hidden_layers']}")
    print(f" batch_size: {batch_size}")
    print(f" seq_len: {seq_len}")

    # Build the model.
    print("\n2. 创建 Transformer 模型")
    model = Transformer(config)

    # Random token-id input.
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))

    print(f" 输入形状: {input_ids.shape}")

    # Forward-pass shape check.
    output = model(input_ids)
    print(f" 输出形状: {output.shape}")
    print(f" 输出形状正确: {output.shape == (batch_size, seq_len, vocab_size)}")

    # Parameter count.
    total_params = sum(p.numel() for p in model.parameters())
    print(f" 参数数量: {total_params:,}")

    # Weight-tying check.
    print("\n3. 验证词嵌入共享")
    if model.lm_head is None:
        print(" 使用共享的词嵌入权重(tie_word_embeddings=True)")
        print(f" embedding 参数形状: {model.embedding.embedding.weight.shape}")
    else:
        print(" 使用独立的输出层(tie_word_embeddings=False)")
        print(f" lm_head 参数形状: {model.lm_head.weight.shape}")

    # Gradient check with a simple cross-entropy loss.
    print("\n4. 测试梯度计算")
    targets = torch.randint(0, vocab_size, (batch_size, seq_len))
    loss = F.cross_entropy(output.view(-1, vocab_size), targets.view(-1))
    loss.backward()
    print(f" 损失值: {loss.item():.4f}")
    print(
        f" 所有参数是否有梯度: {all(p.grad is not None for p in model.parameters())}"
    )

    # Generation smoke test.
    print("\n5. 测试文本生成(简化版)")
    start_ids = torch.randint(0, vocab_size, (1, 5))
    print(f" 起始序列长度: {start_ids.shape[1]}")
    print(f" 起始 token IDs: {start_ids[0].tolist()}")

    # Debug aid: inspect the first next-token distribution.
    # (The original comment here was mojibake-corrupted; rewritten.)
    with torch.no_grad():
        logits = model(start_ids)
        next_logits = logits[0, -1, :]
        probs = F.softmax(next_logits, dim=-1)
        top_probs, top_indices = torch.topk(probs, k=5)
        print(" 第一次生成的前5个最可能 token:")
        # Fix: the index from enumerate() was unused — plain zip suffices.
        for idx, prob in zip(top_indices, top_probs):
            print(f" Token {idx.item()}: {prob.item():.4f}")

    # Sample with top-k to add diversity (recommended for untrained models).
    generated = model.generate(
        start_ids,
        max_length=15,
        temperature=1.5,  # higher temperature -> more randomness
        top_k=10,  # sample only from the 10 most likely tokens
    )
    print(f" 生成序列长度: {generated.shape[1]}")
    print(f" 生成的 token IDs: {generated[0].tolist()}")

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
llm/training/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """训练模块"""
2
+
3
+ from llm.training.metrics import (
4
+ calculate_perplexity,
5
+ calculate_accuracy,
6
+ calculate_top_k_accuracy,
7
+ calculate_metrics,
8
+ )
9
+
10
+ __all__ = [
11
+ "calculate_perplexity",
12
+ "calculate_accuracy",
13
+ "calculate_top_k_accuracy",
14
+ "calculate_metrics",
15
+ ]
llm/training/loss.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """损失函数:交叉熵损失"""
2
+ # 2026-01-23
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+
9
class CrossEntropyLoss(nn.Module):
    """Token-level cross-entropy loss for language modeling.

    Flattens the (batch, seq) grid of predictions and targets and
    delegates to ``F.cross_entropy``.
    """

    def __init__(self, ignore_index=-1):
        """
        Args:
            ignore_index: target value excluded from the loss
                (typically padding positions).
        """
        super().__init__()
        self.ignore_index = ignore_index

    def forward(self, logits, targets):
        """
        Compute the mean cross-entropy over all (non-ignored) positions.

        Args:
            logits: (batch_size, seq_len, vocab_size) model outputs.
            targets: (batch_size, seq_len) target token ids.

        Returns:
            Scalar loss tensor.
        """
        vocab_size = logits.size(-1)
        # Flatten every sequence position into one big batch so a single
        # cross-entropy call scores all of them uniformly.
        flat_logits = logits.view(-1, vocab_size)   # (B*S, V)
        flat_targets = targets.view(-1)             # (B*S,)

        # F.cross_entropy = log_softmax + negative log-likelihood in one pass.
        return F.cross_entropy(
            flat_logits, flat_targets, ignore_index=self.ignore_index
        )
47
+
48
+
49
if __name__ == "__main__":
    print("=" * 60)
    print("损失函数测试")
    print("=" * 60)

    # Exercise the loss on random data.
    print("\n1. 测试 CrossEntropyLoss")
    criterion = CrossEntropyLoss()

    batch_size, seq_len, vocab_size = 2, 10, 100

    # logits needs requires_grad=True so the backward check below works.
    logits = torch.randn(batch_size, seq_len, vocab_size, requires_grad=True)
    targets = torch.randint(0, vocab_size, (batch_size, seq_len))

    print(f" Logits 形状: {logits.shape}")
    print(f" Targets 形状: {targets.shape}")

    loss = criterion(logits, targets)
    print(f" 损失值: {loss.item():.4f}")

    # The loss must reduce to a scalar.
    print(f" 损失是否为标量: {loss.dim() == 0}")

    # Backward pass.
    loss.backward()
    print(f" 损失可以反向传播: True")
    print(f" Logits 梯度形状: {logits.grad.shape}")

    # ignore_index handling.
    print("\n2. 测试 ignore_index")
    criterion_ignore = CrossEntropyLoss(ignore_index=-1)
    targets_with_ignore = targets.clone()
    targets_with_ignore[0, 0] = -1  # mark one position as ignored
    loss_ignore = criterion_ignore(logits, targets_with_ignore)
    print(f" 使用 ignore_index 的损失值: {loss_ignore.item():.4f}")

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
llm/training/metrics.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """评估指标:困惑度、准确率等"""
2
+ # 2026-01-23
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+
7
+
8
def calculate_perplexity(loss):
    """Return the perplexity ``PPL = exp(loss)``.

    Perplexity is the standard language-modeling metric: it expresses the
    model's uncertainty about the next token, and lower is better.

    Args:
        loss: cross-entropy loss (scalar or tensor).

    Returns:
        Tensor of the same shape as ``loss``.
    """
    return torch.exp(loss)
24
+
25
+
26
def calculate_accuracy(logits, targets, ignore_index=-1):
    """
    Compute next-token prediction accuracy.

    Args:
        logits: (batch_size, seq_len, vocab_size) model outputs.
        targets: (batch_size, seq_len) target token ids.
        ignore_index: target value excluded from the statistics (typically
            padding); pass None to count every position.

    Returns:
        Accuracy as a Python float in [0, 1] (0.0 when nothing to score).
    """
    # Greedy predictions.
    predictions = torch.argmax(logits, dim=-1)  # (batch_size, seq_len)

    if ignore_index is not None:
        # Bug fix: the previous `ignore_index >= 0` guard never fired for
        # the (negative) default sentinel -1, so padded positions were
        # silently counted as errors. Mask whenever a sentinel is given.
        mask = targets != ignore_index
        matches = (predictions == targets) & mask
        total = int(mask.sum().item())
    else:
        matches = predictions == targets
        total = targets.numel()

    if total == 0:
        return 0.0
    return matches.sum().item() / total
57
+
58
+
59
def calculate_top_k_accuracy(logits, targets, k=5, ignore_index=-1):
    """
    Compute Top-K accuracy: a position counts as correct when the target
    token appears among the k highest-scoring predictions.

    Args:
        logits: (batch_size, seq_len, vocab_size) model outputs.
        targets: (batch_size, seq_len) target token ids.
        k: number of top candidates to consider (default: 5).
        ignore_index: target value excluded from the statistics; pass None
            to count every position.

    Returns:
        Top-K accuracy as a Python float in [0, 1].
    """
    # Indices of the k most likely tokens per position.
    _, top_k_indices = torch.topk(logits, k, dim=-1)  # (B, S, k)

    # Broadcast targets against the k candidates.
    targets_expanded = targets.unsqueeze(-1).expand_as(top_k_indices)  # (B, S, k)

    # Hit if the target appears anywhere in the top-k.
    matches = (top_k_indices == targets_expanded).any(dim=-1)  # (B, S)

    if ignore_index is not None:
        # Bug fix: `ignore_index >= 0` never masked the (negative) default
        # sentinel -1; mask whenever a sentinel is provided.
        mask = targets != ignore_index
        matches = matches & mask
        total = int(mask.sum().item())
    else:
        total = targets.numel()

    if total == 0:
        return 0.0
    return matches.sum().item() / total
92
+
93
+
94
def calculate_metrics(logits, targets, loss, ignore_index=-1):
    """
    Bundle all evaluation metrics into one dict.

    Args:
        logits: (batch_size, seq_len, vocab_size) model outputs.
        targets: (batch_size, seq_len) target token ids.
        loss: scalar loss (tensor or float).
        ignore_index: target value excluded from accuracy metrics.

    Returns:
        Dict with keys ``perplexity``, ``accuracy``, ``top5_accuracy``
        and ``loss``.
    """
    loss_value = loss.item() if isinstance(loss, torch.Tensor) else loss
    return {
        "perplexity": calculate_perplexity(loss).item(),
        "accuracy": calculate_accuracy(logits, targets, ignore_index=ignore_index),
        "top5_accuracy": calculate_top_k_accuracy(
            logits, targets, k=5, ignore_index=ignore_index
        ),
        "loss": loss_value,
    }
122
+
123
+
124
if __name__ == "__main__":
    print("=" * 60)
    print("评估指标测试")
    print("=" * 60)

    batch_size, seq_len, vocab_size = 4, 10, 100

    print(f"\n测试参数:")
    print(f" batch_size: {batch_size}")
    print(f" seq_len: {seq_len}")
    print(f" vocab_size: {vocab_size}")

    # Synthetic logits/targets and a matching cross-entropy loss.
    logits = torch.randn(batch_size, seq_len, vocab_size)
    targets = torch.randint(0, vocab_size, (batch_size, seq_len))
    loss = F.cross_entropy(logits.view(-1, vocab_size), targets.view(-1))

    print(f"\n1. 测试困惑度")
    ppl = calculate_perplexity(loss)
    print(f" 损失: {loss.item():.4f}")
    print(f" 困惑度: {ppl.item():.4f}")

    print(f"\n2. 测试准确率")
    accuracy = calculate_accuracy(logits, targets)
    print(f" 准确率: {accuracy:.4f} ({accuracy*100:.2f}%)")

    print(f"\n3. 测试 Top-5 准确率")
    top5_acc = calculate_top_k_accuracy(logits, targets, k=5)
    print(f" Top-5 准确率: {top5_acc:.4f} ({top5_acc*100:.2f}%)")

    print(f"\n4. 测试 ignore_index")
    targets_with_ignore = targets.clone()
    targets_with_ignore[0, :3] = -1  # mark the first three positions ignored
    accuracy_ignore = calculate_accuracy(logits, targets_with_ignore, ignore_index=-1)
    print(f" 使用 ignore_index 的准确率: {accuracy_ignore:.4f} ({accuracy_ignore*100:.2f}%)")

    print(f"\n5. 测试综合指标")
    all_metrics = calculate_metrics(logits, targets, loss)
    print(" 所有指标:")
    for key, value in all_metrics.items():
        if isinstance(value, float):
            print(f" {key}: {value:.4f}")
        else:
            print(f" {key}: {value}")

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
llm/training/optim.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """优化器:AdamW / 学习率调度"""
2
+ # 2026-01-23
3
+
4
+ import math
5
+ import torch.optim as optim
6
+ from torch.optim.lr_scheduler import LambdaLR
7
+
8
+
9
def get_optimizer(model, config):
    """Build an AdamW optimizer from a training-config dict.

    Recognized keys (all optional, coerced to float):
        learning_rate (1e-4), weight_decay (0.01),
        beta1 (0.9), beta2 (0.999), eps (1e-8).

    Args:
        model: module whose parameters will be optimized.
        config: training configuration dict.

    Returns:
        A configured ``torch.optim.AdamW`` instance.
    """
    # float() tolerates YAML/string-typed values such as "1e-4".
    hyper = {
        "lr": float(config.get("learning_rate", 1e-4)),
        "weight_decay": float(config.get("weight_decay", 0.01)),
        "betas": (
            float(config.get("beta1", 0.9)),
            float(config.get("beta2", 0.999)),
        ),
        "eps": float(config.get("eps", 1e-8)),
    }
    return optim.AdamW(model.parameters(), **hyper)
41
+
42
+
43
def get_lr_scheduler(optimizer, config):
    """Create a learning-rate scheduler with linear warmup.

    Args:
        optimizer: the optimizer to schedule.
        config: training-config dict with optional keys:
            lr_scheduler: "cosine" (default), "linear" or "constant".
            warmup_steps: warmup length in steps (default: 100).
            max_steps: total training steps (default: 10000).

    Returns:
        A ``LambdaLR`` scheduler, or None for "constant".

    Raises:
        ValueError: on an unknown ``lr_scheduler`` value.
    """
    kind = config.get("lr_scheduler", "cosine")
    total_steps = config.get("max_steps", 10000)
    warmup = config.get("warmup_steps", 100)

    def _warmup_factor(step):
        # Linear ramp from 0 to 1 across the warmup phase.
        return step / warmup if warmup > 0 else 1.0

    def _progress(step):
        # Post-warmup progress, clamped to [0, 1].
        return min((step - warmup) / (total_steps - warmup), 1.0)

    if kind == "cosine":
        # Cosine annealing after warmup: factor 1 -> 0.
        def schedule(step):
            if step < warmup:
                return _warmup_factor(step)
            # Degenerate configs (max_steps <= warmup) pin the minimum.
            if total_steps <= warmup:
                return 0.0
            return 0.5 * (1.0 + math.cos(_progress(step) * math.pi))

        return LambdaLR(optimizer, schedule)

    if kind == "linear":
        # Linear decay after warmup: factor 1 -> 0.1.
        def schedule(step):
            if step < warmup:
                return _warmup_factor(step)
            if total_steps <= warmup:
                return 0.1
            return 1.0 - 0.9 * _progress(step)

        return LambdaLR(optimizer, schedule)

    if kind == "constant":
        # Constant learning rate: no scheduler object at all.
        return None

    raise ValueError(
        f"未知的学习率调度器类型: {kind}。"
        f"支持的类型: cosine, linear, constant"
    )
113
+
114
+
115
if __name__ == "__main__":
    import sys
    import io

    # Force UTF-8 output on Windows consoles.
    if sys.platform == "win32":
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")

    print("=" * 60)
    print("优化器和学习率调度器测试")
    print("=" * 60)

    # Minimal model for exercising the optimizer.
    import torch.nn as nn

    class SimpleModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = nn.Linear(10, 1)

        def forward(self, x):
            return self.linear(x)

    model = SimpleModel()

    # Shared test configuration.
    config = {
        "learning_rate": 1e-3,
        "weight_decay": 0.01,
        "beta1": 0.9,
        "beta2": 0.999,
        "eps": 1e-8,
        "lr_scheduler": "cosine",
        "warmup_steps": 10,
        "max_steps": 100,
    }

    # Optimizer construction.
    # (Bug fix: this section title string was mojibake-corrupted; restored.)
    print("\n1. 测试优化器创建")
    optimizer = get_optimizer(model, config)
    print(f" 优化器类型: {type(optimizer).__name__}")
    print(f" 学习率: {optimizer.param_groups[0]['lr']}")
    print(f" 权重衰减: {optimizer.param_groups[0]['weight_decay']}")
    print(f" Beta1: {optimizer.param_groups[0]['betas'][0]}")
    print(f" Beta2: {optimizer.param_groups[0]['betas'][1]}")

    # Cosine schedule with warmup.
    print("\n2. 测试余弦退火学习率调度器(带预热)")
    scheduler = get_lr_scheduler(optimizer, config)
    print(f" 调度器类型: {type(scheduler).__name__}")
    print(f" 初始学习率: {optimizer.param_groups[0]['lr']:.6f}")

    # Step through training and watch the LR evolve.
    print("\n3. 模拟训练步骤,观察学习率变化(cosine)")
    lrs = []
    for step in range(0, 101, 10):
        current_lr = optimizer.param_groups[0]["lr"]
        lrs.append(current_lr)
        print(f" 步数 {step:3d}: 学习率 = {current_lr:.6f}")
        # optimizer.step() must precede scheduler.step(); drive it with a
        # dummy loss over the parameters (test-only).
        dummy_loss = sum(p.sum() for p in model.parameters())
        dummy_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if scheduler is not None:
            scheduler.step()

    # Linear schedule with warmup.
    print("\n4. 测试线性学习率调度器(带预热)")
    config_linear = config.copy()
    config_linear["lr_scheduler"] = "linear"
    optimizer_linear = get_optimizer(model, config_linear)
    scheduler_linear = get_lr_scheduler(optimizer_linear, config_linear)
    print(f" 调度器类型: {type(scheduler_linear).__name__}")

    print("\n5. 模拟训练步骤,观察学习率变化(linear)")
    for step in range(0, 101, 10):
        current_lr = optimizer_linear.param_groups[0]["lr"]
        print(f" 步数 {step:3d}: 学习率 = {current_lr:.6f}")
        dummy_loss = sum(p.sum() for p in model.parameters())
        dummy_loss.backward()
        optimizer_linear.step()
        optimizer_linear.zero_grad()
        if scheduler_linear is not None:
            scheduler_linear.step()

    # Constant learning rate: no scheduler object.
    print("\n6. 测试常数学习率")
    config_constant = config.copy()
    config_constant["lr_scheduler"] = "constant"
    optimizer_constant = get_optimizer(model, config_constant)
    scheduler_constant = get_lr_scheduler(optimizer_constant, config_constant)
    print(f" 调度器类型: {scheduler_constant}")
    print(f" 学习率: {optimizer_constant.param_groups[0]['lr']:.6f}")

    # Unknown scheduler types must raise ValueError.
    print("\n7. 测试错误类型处理")
    try:
        config_error = config.copy()
        config_error["lr_scheduler"] = "invalid"
        scheduler_error = get_lr_scheduler(optimizer, config_error)
    except ValueError as e:
        print(f" 正确捕获错误: {e}")

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
llm/training/trainer.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """训练器:训练循环"""
2
+ # 2026-01-23
3
+
4
+ import torch
5
+ from pathlib import Path
6
+ from collections import deque
7
+
8
+ from llm.training.loss import CrossEntropyLoss
9
+ from llm.training.optim import get_optimizer, get_lr_scheduler
10
+ from llm.utils.checkpoint import save_checkpoint
11
+ from llm.utils.logging import ProgressBar
12
+ from llm.model.attention import create_causal_mask
13
+
14
+
15
class Trainer:
    """Training-loop driver: optimization, evaluation and checkpointing."""

    def __init__(self, model, train_loader, val_loader, config):
        """
        Initialize the trainer.

        Args:
            model: Transformer model to train.
            train_loader: training DataLoader yielding (input_ids, target_ids).
            val_loader: validation DataLoader (may be falsy to skip eval).
            config: training-config dict (device, learning_rate, num_epochs,
                max_steps, gradient_accumulation_steps, save/eval settings...).
        """
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.config = config

        # Device placement.
        device_str = config.get("device", "cpu")
        self.device = torch.device(device_str)
        self.model.to(self.device)

        # Loss function.
        self.criterion = CrossEntropyLoss()

        # Optimizer.
        self.optimizer = get_optimizer(model, config)

        # Learning-rate scheduler (None means constant LR).
        self.scheduler = get_lr_scheduler(self.optimizer, config)

        # Training state.
        self.global_step = 0
        self.current_epoch = 0
        self.best_val_loss = float("inf")

        # Gradient accumulation factor.
        self.gradient_accumulation_steps = config.get("gradient_accumulation_steps", 1)

        # Checkpoint configuration.
        self.save_steps = config.get("save_steps", 500)
        self.eval_steps = config.get("eval_steps", 500)
        self.save_total_limit = config.get("save_total_limit", 3)
        self.checkpoint_dir = Path(config.get("checkpoint_dir", "checkpoints"))
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)

        # Saved checkpoint paths, bounded manually (no deque maxlen) so the
        # evicted entry's file can also be deleted from disk.
        self.saved_checkpoints = deque()
        self.best_checkpoint_path = None  # path of the current best model

    def train_step(self, batch):
        """
        Run one micro-batch: forward, loss, backward, and (every
        `gradient_accumulation_steps` calls) a parameter update.

        Args:
            batch: (input_ids, target_ids), each (batch_size, seq_len).

        Returns:
            The un-scaled loss value for this micro-batch (float).
        """
        input_ids, target_ids = batch
        input_ids = input_ids.to(self.device)
        target_ids = target_ids.to(self.device)

        # Causal mask so each position only attends to the past.
        seq_len = input_ids.size(1)
        causal_mask = create_causal_mask(seq_len, device=self.device)

        # Forward pass.
        logits = self.model(input_ids, mask=causal_mask)

        # Loss.
        loss = self.criterion(logits, target_ids)

        # Gradient accumulation: scale the loss down so the summed gradient
        # matches one full-batch step.
        loss = loss / self.gradient_accumulation_steps

        # Backward pass (gradients accumulate across micro-batches).
        loss.backward()

        # Update parameters only when the accumulation window is complete.
        if (self.global_step + 1) % self.gradient_accumulation_steps == 0:
            # Gradient clipping.
            max_grad_norm = self.config.get("max_grad_norm", 1.0)
            torch.nn.utils.clip_grad_norm_(
                self.model.parameters(), max_grad_norm
            )

            # Parameter update.
            self.optimizer.step()
            self.optimizer.zero_grad()

            # Learning-rate schedule update.
            if self.scheduler is not None:
                self.scheduler.step()

        self.global_step += 1

        return loss.item() * self.gradient_accumulation_steps  # undo the scaling

    def evaluate(self, max_batches=10):
        """
        Evaluate on the validation set.

        Args:
            max_batches: cap on evaluated batches (falsy disables the cap).

        Returns:
            val_loss: mean validation loss (inf when nothing was evaluated).
        """
        self.model.eval()
        total_loss = 0.0
        num_batches = 0

        with torch.no_grad():
            for batch in self.val_loader:
                if max_batches and num_batches >= max_batches:
                    break  # limit the number of evaluated batches
                input_ids, target_ids = batch
                input_ids = input_ids.to(self.device)
                target_ids = target_ids.to(self.device)

                # Causal mask.
                seq_len = input_ids.size(1)
                causal_mask = create_causal_mask(seq_len, device=self.device)

                # Forward pass.
                logits = self.model(input_ids, mask=causal_mask)

                # Loss.
                loss = self.criterion(logits, target_ids)
                total_loss += loss.item()
                num_batches += 1

        avg_loss = total_loss / num_batches if num_batches > 0 else float("inf")
        self.model.train()  # restore training mode

        return avg_loss

    def save_checkpoint(self, is_best=False):
        """
        Save a checkpoint and manage on-disk retention.

        Args:
            is_best: True to save/rotate the best-model checkpoint instead
                of a periodic one.

        Returns:
            Path of the file written.
        """
        # Pass a bare name; checkpoint.py appends the "_step_N" suffix.
        checkpoint_name = "best_model" if is_best else "checkpoint"

        checkpoint_path = save_checkpoint(
            model=self.model,
            optimizer=self.optimizer,
            epoch=self.current_epoch,
            step=self.global_step,
            loss=self.best_val_loss if is_best else None,
            checkpoint_dir=self.checkpoint_dir,
            name=checkpoint_name,
        )

        # Retention bookkeeping.
        if is_best:
            # Keep only the newest best-model file on disk.
            if self.best_checkpoint_path is not None and self.best_checkpoint_path.exists():
                self.best_checkpoint_path.unlink()
            self.best_checkpoint_path = checkpoint_path
        else:
            # When the queue is full, evict (and delete) the oldest one
            # before recording the new checkpoint.
            if len(self.saved_checkpoints) >= self.save_total_limit:
                old_checkpoint = self.saved_checkpoints.popleft()
                if old_checkpoint.exists():
                    old_checkpoint.unlink()
                    print(f"删除旧检查点: {old_checkpoint}")

            # Record the new checkpoint.
            self.saved_checkpoints.append(checkpoint_path)

        return checkpoint_path

    def train(self):
        """
        Main training loop over epochs and batches, with periodic
        evaluation and checkpointing.
        """
        num_epochs = self.config.get("num_epochs", 10)
        max_steps = self.config.get("max_steps", None)

        print("=" * 60)
        print("开始训练")
        print("=" * 60)
        print(f"设备: {self.device}")
        print(f"训练样本数: {len(self.train_loader.dataset)}")
        print(f"验证样本数: {len(self.val_loader.dataset) if self.val_loader else 0}")
        print(f"批次大小: {self.config.get('batch_size', 4)}")
        print(f"梯度累积步数: {self.gradient_accumulation_steps}")
        print(f"最大步数: {max_steps}")
        print(f"训练轮数: {num_epochs}")
        print("=" * 60)

        for epoch in range(num_epochs):
            self.current_epoch = epoch
            self.model.train()

            # Per-epoch progress bar.
            pbar = ProgressBar(
                total=len(self.train_loader), desc=f"Epoch {epoch+1}/{num_epochs}"
            )

            epoch_loss = 0.0
            num_batches = 0

            for batch_idx, batch in enumerate(self.train_loader):
                # Stop once the global step budget is exhausted.
                if max_steps is not None and self.global_step >= max_steps:
                    print(f"\n达到最大步数 {max_steps},停止训练")
                    pbar.close()
                    return

                # One training step.
                loss = self.train_step(batch)
                epoch_loss += loss
                num_batches += 1

                # Advance the progress bar.
                pbar.update(1)

                # Periodic evaluation.
                saved_best_at_this_step = False
                if self.global_step % self.eval_steps == 0 and self.val_loader:
                    val_loss = self.evaluate()
                    # Current learning rate, for logging.
                    current_lr = self.optimizer.param_groups[0]["lr"]

                    print(
                        f"\n步数 {self.global_step}: "
                        f"训练损失={loss:.4f}, "
                        f"验证损失={val_loss:.4f}, "
                        f"学习率={current_lr:.2e}"
                    )

                    # Track and save the best model.
                    if val_loss < self.best_val_loss:
                        self.best_val_loss = val_loss
                        checkpoint_path = self.save_checkpoint(is_best=True)
                        print(f"保存最佳模型: {checkpoint_path}")
                        saved_best_at_this_step = True

                # Periodic checkpoint (skipped when a best model was just
                # saved at this very step).
                if self.global_step % self.save_steps == 0 and not saved_best_at_this_step:
                    checkpoint_path = self.save_checkpoint(is_best=False)
                    print(f"保存检查点: {checkpoint_path}")

            pbar.close()

            # End-of-epoch evaluation and summary.
            avg_epoch_loss = epoch_loss / num_batches if num_batches > 0 else 0.0
            print(f"\nEpoch {epoch+1}/{num_epochs} 完成:")
            print(f" 平均训练损失: {avg_epoch_loss:.4f}")

            if self.val_loader:
                val_loss = self.evaluate()
                print(f" 验证损失: {val_loss:.4f}")

        print("\n" + "=" * 60)
        print("训练完成!")
        print("=" * 60)
        print(f"最佳验证损失: {self.best_val_loss:.4f}")

        # Report the best-model path.
        if self.best_checkpoint_path is not None and self.best_checkpoint_path.exists():
            print(f"\n最佳模型路径: {self.best_checkpoint_path}")
            print(" (文件名包含 'best_model' 的检查点就是最佳模型)")
        else:
            print("\n警告: 未找到最佳模型检查点")

        # List retained periodic checkpoints.
        if self.saved_checkpoints:
            print(f"\n保存的检查点数量: {len(self.saved_checkpoints)}")
            print("检查点列表:")
            for i, cp_path in enumerate(self.saved_checkpoints, 1):
                print(f" {i}. {cp_path}")
llm/utils/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
"""Utility module: re-exports the public weight-initialization helpers."""

from llm.utils.init import init_weights, init_weights_with_scaling, apply_llm_init

# Explicit public API of llm.utils.
__all__ = [
    "init_weights",
    "init_weights_with_scaling",
    "apply_llm_init",
]
llm/utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (358 Bytes). View file
 
llm/utils/__pycache__/checkpoint.cpython-312.pyc ADDED
Binary file (1.96 kB). View file
 
llm/utils/__pycache__/init.cpython-312.pyc ADDED
Binary file (10.1 kB). View file
 
llm/utils/checkpoint.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """检查点保存和加载"""
2
+ # 2026-01-23
3
+
4
+ import torch
5
+ from pathlib import Path
6
+
7
+
8
def save_checkpoint(model, optimizer, epoch, step, loss, checkpoint_dir, name='checkpoint'):
    """Persist full training state to disk.

    Args:
        model: module whose ``state_dict`` is saved.
        optimizer: optimizer whose ``state_dict`` is saved.
        epoch: current epoch number (stored for resuming).
        step: current global step (also embedded in the file name).
        loss: last loss value to record alongside the weights.
        checkpoint_dir: directory for the checkpoint (created if missing).
        name: file-name prefix (default: ``'checkpoint'``).

    Returns:
        Path of the written ``.pt`` file.
    """
    out_dir = Path(checkpoint_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Embed the step in the file name so successive saves never collide.
    target = out_dir / f"{name}_step_{step}.pt"

    payload = {
        'epoch': epoch,
        'step': step,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }
    torch.save(payload, target)

    return target
25
+
26
+
27
def load_checkpoint(model, optimizer, checkpoint_path):
    """Restore model and optimizer state to resume training.

    Args:
        model: module to load ``model_state_dict`` into.
        optimizer: optimizer to load ``optimizer_state_dict`` into.
        checkpoint_path: path to a checkpoint written by ``save_checkpoint``.

    Returns:
        Tuple ``(epoch, step, loss)``; ``loss`` is ``None`` for old
        checkpoints that did not record it.
    """
    # weights_only=False is required to restore the full optimizer state on
    # torch >= 2.6 (where the default flipped to True); these checkpoints are
    # produced by this project, i.e. trusted input.
    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    return checkpoint['epoch'], checkpoint['step'], checkpoint.get('loss', None)
33
+
34
+
35
def load_model_only(model, checkpoint_path):
    """Load only the model weights (for inference; no optimizer needed).

    Args:
        model: module to load ``model_state_dict`` into.
        checkpoint_path: path to a checkpoint written by ``save_checkpoint``.

    Returns:
        Tuple ``(epoch, step, loss)`` with defaults ``(0, 0, None)`` when
        the checkpoint lacks those fields.
    """
    # weights_only=False keeps pre-2.6 torch.load behavior (the default
    # changed in torch 2.6); checkpoints here are self-produced, trusted data.
    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
    model.load_state_dict(checkpoint['model_state_dict'])
    return checkpoint.get('epoch', 0), checkpoint.get('step', 0), checkpoint.get('loss', None)
llm/utils/config.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """配置加载:读取 YAML"""
2
+
3
+ import yaml
4
+ from pathlib import Path
5
+
6
+
7
def load_config(config_path):
    """Read one YAML config file and return its contents as a dict.

    Args:
        config_path: path to the YAML file.

    Returns:
        Parsed configuration (whatever ``yaml.safe_load`` yields).
    """
    with open(config_path, 'r', encoding='utf-8') as fh:
        return yaml.safe_load(fh)
12
+
13
+
14
def load_all_configs(config_dir='configs'):
    """Load every known config file found under ``config_dir``.

    Looks for ``model.yaml``, ``train.yaml`` and ``data.yaml``; files that do
    not exist are silently skipped.

    Args:
        config_dir: directory containing the YAML files (default: ``configs``).

    Returns:
        Dict mapping the file stem (e.g. ``'model'``) to its parsed config.
    """
    base = Path(config_dir)
    loaded = {}

    for stem in ('model', 'train', 'data'):
        candidate = base / f"{stem}.yaml"
        if candidate.exists():
            loaded[stem] = load_config(candidate)

    return loaded
llm/utils/init.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """权重初始化:LLM 模型权重初始化策略"""
2
+
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import math
7
+
8
+
9
def init_weights(module, std=0.02):
    """
    Initialize a module's weights LLM-style (as in GPT/LLaMA).

    Args:
        module: PyTorch module to initialize.
        std: standard deviation of the normal distribution (default: 0.02).

    Strategy:
        - nn.Embedding: weights drawn from N(0, std)
        - nn.Linear: weights drawn from N(0, std), bias zeroed
        - RMSNorm: its learnable scale already defaults to 1.0, so it is
          intentionally left untouched here
    """
    if isinstance(module, nn.Linear):
        # Linear layer: normal weights, zero bias.
        nn.init.normal_(module.weight, mean=0.0, std=std)
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        # Token embedding: plain normal initialization.
        nn.init.normal_(module.weight, mean=0.0, std=std)
30
+
31
+
32
def init_weights_with_scaling(module, hidden_size=None, std=0.02):
    """
    Initialize a module's weights, with depth-aware scaling for output layers.

    Args:
        module: PyTorch module to initialize.
        hidden_size: hidden dimension; when given, linear weights use the
            scaled std ``std / sqrt(hidden_size)`` (meant for the lm_head).
        std: base standard deviation (default: 0.02).

    Strategy:
        - nn.Embedding: N(0, std)
        - nn.Linear with hidden_size: N(0, std / sqrt(hidden_size))
        - nn.Linear without hidden_size: N(0, std)
        - biases are always zeroed
    """
    if isinstance(module, nn.Embedding):
        nn.init.normal_(module.weight, mean=0.0, std=std)
        return

    if not isinstance(module, nn.Linear):
        # Other module types (norms, activations, ...) are left alone.
        return

    # Shrink the std for output projections to keep initial logits small.
    scale = std if hidden_size is None else std / math.sqrt(hidden_size)
    nn.init.normal_(module.weight, mean=0.0, std=scale)

    if module.bias is not None:
        nn.init.zeros_(module.bias)
64
+
65
+
66
def apply_llm_init(model, std=0.02, init_output_layer=True):
    """
    Apply LLM-style weight initialization to an entire model, in place.

    Args:
        model: Transformer model.
        std: standard deviation of the normal distribution (default: 0.02).
        init_output_layer: when True, the lm_head gets the scaled
            initialization instead of the base one (default: True).

    Returns:
        The same model, with weights initialized.
    """
    # Resolve hidden_size so the output layer can be scale-initialized.
    # NOTE: with tie_word_embeddings=True the model may have no separate
    # lm_head; then every Linear simply gets the standard init.
    hidden_size = None
    if hasattr(model, "config"):
        hidden_size = model.config.get("hidden_size")
    elif hasattr(model, "hidden_size"):
        hidden_size = model.hidden_size

    output_head = getattr(model, "lm_head", None)

    for submodule in model.modules():
        if not isinstance(submodule, (nn.Embedding, nn.Linear)):
            continue
        is_head = (
            init_output_layer
            and isinstance(submodule, nn.Linear)
            and output_head is not None
            and submodule is output_head
        )
        if is_head:
            # Output projection: shrink the std by sqrt(hidden_size).
            init_weights_with_scaling(submodule, hidden_size=hidden_size, std=std)
        else:
            # Everything else: standard LLM init.
            init_weights(submodule, std=std)

    return model
102
+
103
+
104
if __name__ == "__main__":
    # Manual smoke test: exercises all three init helpers and prints
    # weight statistics for visual inspection. Run as a script, not pytest.
    import sys
    import io

    # Force UTF-8 stdout so the Chinese output renders on Windows consoles.
    if sys.platform == "win32":
        sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8")

    print("=" * 60)
    print("权重初始化测试")
    print("=" * 60)

    # Test parameters.
    vocab_size = 100
    hidden_size = 320
    intermediate_size = 960
    std = 0.02

    print(f"\n测试参数:")
    print(f" vocab_size: {vocab_size}")
    print(f" hidden_size: {hidden_size}")
    print(f" intermediate_size: {intermediate_size}")
    print(f" std: {std}")

    # 1. Embedding initialization.
    print("\n1. 测试 Embedding 初始化")
    embedding = nn.Embedding(vocab_size, hidden_size)
    init_weights(embedding, std=std)
    weight_mean = embedding.weight.mean().item()
    weight_std = embedding.weight.std().item()
    print(f" Embedding 权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f" Embedding 权重标准差: {weight_std:.6f} (应该接近 {std})")

    # 2. Linear initialization.
    print("\n2. 测试 Linear 初始化")
    linear = nn.Linear(hidden_size, intermediate_size)
    init_weights(linear, std=std)
    weight_mean = linear.weight.mean().item()
    weight_std = linear.weight.std().item()
    bias_mean = linear.bias.mean().item() if linear.bias is not None else 0.0
    print(f" Linear 权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f" Linear 权重标准差: {weight_std:.6f} (应该接近 {std})")
    print(f" Linear 偏置均值: {bias_mean:.6f} (应该为 0)")

    # 3. Output-layer initialization (with scaling).
    print("\n3. 测试输出层初始化(带缩放)")
    output_layer = nn.Linear(hidden_size, vocab_size, bias=False)
    init_weights_with_scaling(output_layer, hidden_size=hidden_size, std=std)
    weight_mean = output_layer.weight.mean().item()
    weight_std = output_layer.weight.std().item()
    expected_std = std / math.sqrt(hidden_size)
    print(f" 输出层权重均值: {weight_mean:.6f} (应该接近 0)")
    print(f" 输出层权重标准差: {weight_std:.6f}")
    print(f" 期望标准差: {expected_std:.6f}")

    # 4. Full-model initialization.
    print("\n4. 测试完整模型初始化")
    from llm.model.transformer import Transformer

    config = {
        "vocab_size": vocab_size,
        "hidden_size": hidden_size,
        "num_hidden_layers": 2,  # keep the model small for this test
        "num_attention_heads": 10,
        "num_key_value_heads": 2,
        "intermediate_size": intermediate_size,
        "rms_norm_eps": 1e-5,
        "max_position_embeddings": 1024,
        "rope_theta": 10000.0,
        "sliding_window": 256,
        "sliding_window_overlap": True,
        "tie_word_embeddings": True,
    }

    model = Transformer(config)

    # Weight statistics BEFORE initialization (for comparison).
    print(" 初始化前的权重统计:")
    for name, param in model.named_parameters():
        if "embedding" in name or "weight" in name:
            print(f" {name}: mean={param.data.mean().item():.6f}, std={param.data.std().item():.6f}")

    # Apply the initialization under test.
    apply_llm_init(model, std=std, init_output_layer=True)

    # Weight statistics AFTER initialization (norm scales excluded: they stay 1.0).
    print("\n 初始化后的权重统计:")
    for name, param in model.named_parameters():
        if "embedding" in name or ("weight" in name and "norm" not in name):
            print(f" {name}: mean={param.data.mean().item():.6f}, std={param.data.std().item():.6f}")

    # 5. Spot-check individual layers.
    print("\n5. 验证初始化效果")
    embedding_weight = model.embedding.embedding.weight
    print(f" Embedding 权重均值: {embedding_weight.mean().item():.6f}")
    print(f" Embedding 权重标准差: {embedding_weight.std().item():.6f}")

    # Inspect the linear projections of the first Transformer block.
    first_block = model.layers[0]
    attn_q_proj = first_block.attn.q_proj
    print(f" Attention Q 投影权重均值: {attn_q_proj.weight.mean().item():.6f}")
    print(f" Attention Q 投影权重标准差: {attn_q_proj.weight.std().item():.6f}")

    ffn_gate_proj = first_block.ffn.gate_proj
    print(f" FFN Gate 投影权重均值: {ffn_gate_proj.weight.mean().item():.6f}")
    print(f" FFN Gate 投影权重标准差: {ffn_gate_proj.weight.std().item():.6f}")

    print("\n" + "=" * 60)
    print("所有测试完成!")
    print("=" * 60)
llm/utils/logging.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """日志:tqdm / tensorboard"""
2
+
3
+ from tqdm import tqdm
4
+
5
+
6
class ProgressBar:
    """Thin wrapper around tqdm, usable directly or as a context manager.

    Context-manager use guarantees the bar is closed even when the loop
    body raises:

        with ProgressBar(total, desc="train") as pbar:
            ...
            pbar.update()
    """

    def __init__(self, total, desc=""):
        """Create the bar.

        Args:
            total: expected total number of updates.
            desc: label shown to the left of the bar (default: "").
        """
        self.pbar = tqdm(total=total, desc=desc)

    def update(self, n=1):
        """Advance the bar by n steps (default: 1)."""
        self.pbar.update(n)

    def close(self):
        """Finalize and release the bar."""
        self.pbar.close()

    def __enter__(self):
        # Enables ``with ProgressBar(...) as pbar:``.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always close the bar; never suppress the exception.
        self.close()
        return False
llm/utils/seed.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """随机种子设置"""
2
+
3
+ import random
4
+ import numpy as np
5
+ import torch
6
+
7
+
8
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs for reproducible runs.

    Args:
        seed: seed value applied to every generator (default: 42).
    """
    # Seed each framework's global generator with the same value.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # CUDA has its own per-device generators; seed them all when present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch>=2.0.0
2
+ pyyaml>=6.0
3
+ numpy>=2.0.0
4
+ huggingface_hub>=0.30.0