| |
| """Generate LoRA notebook.""" |
|
|
| import nbformat as nbf |
|
|
# Start from an empty nbformat-v4 notebook and declare which kernel and
# language version the generated notebook advertises.
nb = nbf.v4.new_notebook()
nb.metadata = dict(
    kernelspec=dict(display_name="Python 3", language="python", name="python3"),
    language_info=dict(name="python", version="3.12.0"),
)


# Cells are collected here in display order and attached to `nb` at the end.
cells = list()
def md(s):
    """Append a markdown cell whose source is ``s`` to the module-level ``cells``."""
    markdown_cell = nbf.v4.new_markdown_cell(s)
    cells.append(markdown_cell)
def code(s):
    """Append a code cell whose source is ``s`` to the module-level ``cells``."""
    code_cell = nbf.v4.new_code_cell(s)
    cells.append(code_cell)
|
|
# Title cell: notebook heading plus a one-line English summary.
md("# LoRA: Low-Rank Adaptation\n\nParameter-efficient fine-tuning of GPT via low-rank weight updates.")


# Background cell (Chinese markdown): introduces the frozen weight W_0, the
# low-rank pair B/A, the update h = W_0 x + (alpha / r) B A x, and a table
# comparing trainable-parameter counts for full fine-tuning vs. LoRA r=8.
# Backslashes are doubled because this is a regular (non-raw) string literal.
# NOTE(review): the table's 32,772 (0.58%) figure differs from the
# 4 * 4 * 2 * d * r estimate (65,536 at r=8, d=256) printed by the
# rank-comparison cell further down — confirm against lora_params_count().
md("""## 背景

LoRA(Hu et al. 2021)解决大模型全量微调成本过高的问题。核心思想:

冻结预训练权重 $W_0 \\in \\mathbb{R}^{d \\times k}$,学习一对低秩矩阵 $B \\in \\mathbb{R}^{d \\times r}, A \\in \\mathbb{R}^{r \\times k}$,
其中 $r \\ll \\min(d, k)$。

$$h = W_0 x + \\Delta W x = W_0 x + \\frac{\\alpha}{r} BAx$$

| 方法 | 可训练参数量 | 对 GPT (5.7M) |
|---|---|---|
| Full fine-tune | $d \\times k$ | 5,693,952 |
| LoRA r=8 | $(d + k) \\times r$ | **32,772** (0.58%) |

本项目在已训练好的本地 GPT(text8)上注入 LoRA,仅微调注意力层的 Q/K/V/O 投影矩阵。
""")
|
|
# Math cell (Chinese markdown): low-rank decomposition, merged-weight formula,
# and the per-matrix parameter ratio.
# The ratio r(d + k) / (d * k) is exactly r/k + r/d, and 4096 / 65536 is
# exactly 6.25%, so both relations are written with "=" rather than "\approx".
# Backslashes are doubled because this is a regular (non-raw) string literal.
md("""## 数学原理

### 低秩分解

$$\\Delta W = BA, \\quad B \\in \\mathbb{R}^{d \\times r}, A \\in \\mathbb{R}^{r \\times k}$$

推理时将 $\\Delta W$ 合并回原始权重,零额外开销:

$$W_{\\text{merged}} = W_0 + \\frac{\\alpha}{r} BA$$

### 参数效率

$$\\text{Ratio} = \\frac{r(d + k)}{d \\cdot k} = \\frac{r}{k} + \\frac{r}{d}$$

当 $r=8, d=k=256$ 时:
$$\\text{Ratio} = \\frac{8 \\times 512}{256 \\times 256} = \\frac{4096}{65536} = 6.25\\%$$
""")
|
|
# Setup cell: imports torch, the Hugging Face `datasets` loader and the
# project's GPT / tokenizer / LoRA / utility modules, then resolves the compute
# device with get_device() and prints it.
# NOTE(review): LoRALayer, load_config and set_seed are imported but not
# referenced by any cell generated in this script — confirm whether a seed call
# was intended. GPT is also not referenced by name; presumably it must be
# importable so the pickled checkpoint can be loaded — verify.
code("""\
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from datasets import load_dataset

from nlp.gpt.model import GPT
from nlp.gpt.tokenizer import WordTokenizer
from nlp.lora.model import LoRALayer, inject_lora, freeze_all_except_lora, lora_params_count
from utils.config import load_config
from utils.seed import set_seed
from utils.device import get_device

device = get_device()
print(f"Device: {device}")
""")
|
|
# Checkpoint cell: loads the pickled GPT model, reports its parameter count,
# and prints a table estimating LoRA parameter counts for several ranks.
# Changes versus the previous cell source:
#   * a NOTE flags that weights_only=False performs a full unpickle;
#   * the loop-invariant hidden size `d` is assigned once, before the loop;
#   * row column widths (12 / 8) now match the header so the table lines up
#     with the 25-character rule.
code("""\
# Load pretrained GPT
# NOTE: weights_only=False unpickles arbitrary Python objects; only load
# checkpoints that you produced yourself or otherwise trust.
checkpoint = "nlp/gpt/gpt_text8.pt"
model = torch.load(checkpoint, map_location="cpu", weights_only=False).to(device)
full_params = sum(p.numel() for p in model.parameters())
print(f"Full model: {full_params:,} params")

# Compare different ranks
# Quick count: each of 4 Linear × 4 layers × (d*r + r*d) = 4×4×2×d×r
d = 256
print(f"\\n{'r':>3} {'LoRA params':>12} {'Ratio':>8}")
print("-" * 25)
for r in [1, 4, 8, 16]:
    lora_count = 4 * 4 * 2 * d * r
    print(f"{r:3d} {lora_count:>12,d} {lora_count/full_params:>8.2%}")
""")
|
|
# Injection cell: calls freeze_all_except_lora(model), then inject_lora with
# r=8 and alpha=16, and prints the LoRA parameter count (absolute and relative
# to `full_params` from the previous cell) plus the number of parameters that
# still have requires_grad=True.
# NOTE(review): freezing happens *before* injection here. Whether that leaves
# only the LoRA matrices trainable depends on how inject_lora creates its
# parameters — confirm that the printed "Trainable" equals "LoRA params".
code("""\
# Inject LoRA with r=8
freeze_all_except_lora(model)
model = inject_lora(model, r=8, alpha=16)

lora_count = lora_params_count(model)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"LoRA params: {lora_count:,} ({lora_count/full_params:.2%})")
print(f"Trainable: {trainable:,}")
""")
|
|
# Training section heading (Chinese markdown) with the author's runtime
# estimate: 10 epochs at roughly 60 s each on an M4 Max with batch_size=32.
md("""## 训练

> ⏱ 预估耗时:**10 epoch × ~60s/epoch ≈ 10 分钟**(M4 Max, batch_size=32)
""")
|
|
# Data cell: loads the "afmck/text8" train split, lowercases and whitespace-
# splits the first record, groups the words into 32-word chunks, builds a
# WordTokenizer (vocab_size=5000) vocabulary from all chunks, keeps the first
# 5000 chunks longer than 5 characters, and wraps them in a Dataset yielding
# {"input_ids": LongTensor} items served by a shuffled DataLoader (batch 32).
# NOTE(review): the vocabulary is rebuilt here rather than restored with the
# checkpoint — confirm the token ids match those the pretrained GPT was
# trained with.
# NOTE(review): the `mask` returned by tokenizer.encode is unused; if encode
# pads to max_len, padded positions would contribute to the training loss —
# confirm.
code("""\
# Load text8 subset
ds = load_dataset("afmck/text8", split="train")
raw = ds[0]["text"]
words = raw.lower().split()
raw_chunks = [" ".join(words[i:i + 32]) for i in range(0, len(words), 32)]

tokenizer = WordTokenizer(vocab_size=5000)
tokenizer.build_vocab(raw_chunks)
sentences = [s.strip() for s in raw_chunks if len(s.strip()) > 5][:5000]

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, tokenizer, max_len=64):
        self.examples = []
        for text in texts:
            tokens, mask = tokenizer.encode(text, max_len)
            self.examples.append({"input_ids": torch.tensor(tokens, dtype=torch.long)})
    def __len__(self):
        return len(self.examples)
    def __getitem__(self, idx):
        return self.examples[idx]

dataset = TextDataset(sentences, tokenizer, max_len=64)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
print(f"Training chunks: {len(sentences):,}")
""")
|
|
# Training cell: next-token training with Adam over the parameters that still
# require gradients. Each batch is shifted by one position (inputs = all but
# the last token, labels = all but the first); per-epoch mean loss and its
# exponential (perplexity) are recorded in loss_hist / ppl_hist and printed.
# Changes versus the previous cell source:
#   * `math` is imported normally instead of via inline __import__('math');
#   * backward() and step() are on separate lines instead of ';'-joined;
#   * the trainable parameters are passed to Adam as a concrete list.
code("""\
import math

NUM_EPOCHS = 10
LR = 0.001

criterion = nn.CrossEntropyLoss()
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=LR)

loss_hist, ppl_hist = [], []

for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        labels = input_ids[:, 1:].contiguous()
        inputs = input_ids[:, :-1].contiguous()
        optimizer.zero_grad()
        logits, _ = model(inputs)
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg = total_loss / len(loader)
    ppl = math.exp(avg)
    loss_hist.append(avg)
    ppl_hist.append(ppl)
    print(f"Epoch [{epoch:2d}/{NUM_EPOCHS}] Loss: {avg:.4f} PPL: {ppl:.2f}")
""")
|
|
# Loss-curve section: a heading cell followed by a plotting cell that draws
# the per-epoch loss and perplexity side by side.
md("""## Loss 曲线""")


# The x-axis now uses 1-based epoch numbers so the plot agrees with the
# "Epoch [ 1/10]" lines printed during training (previously the points were
# plotted against their 0-based list index under an "Epoch" label). The
# ';'-chained calls are split onto one statement per line.
code("""\
import matplotlib.pyplot as plt

epochs = range(1, len(loss_hist) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

ax1.plot(epochs, loss_hist, marker='o')
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Loss")
ax1.grid(True)

ax2.plot(epochs, ppl_hist, marker='o', color='orange')
ax2.set_xlabel("Epoch")
ax2.set_ylabel("Perplexity")
ax2.grid(True)

plt.tight_layout()
plt.show()
""")
|
|
# Closing cell (Chinese markdown): four reflection questions covering the
# choice of rank r, why only the attention Q/K/V/O projections are adapted,
# merging LoRA weights for inference, and varying alpha (16 -> 4 or 64).
# The leading backslash after the opening quotes suppresses the first newline.
md("""\
## 思考题

1. LoRA 的 rank $r$ 越大越好还是越小越好?分析过拟合和表达能力之间的权衡。
2. 为什么只修改注意力层的 Q/K/V/O 而不修改 FFN?
3. 推理时如何将 LoRA 权重合并回原模型,实现零额外开销?
4. 尝试将 $\\alpha$ 从 16 改到 4 或 64,观察训练速度和 loss 的变化。
""")
|
|
# Attach the collected cells and serialise the notebook to disk.
# The file is opened explicitly as UTF-8: the cells contain CJK text and an
# emoji, and notebook JSON is expected to be UTF-8, whereas a bare
# open(..., "w") would use the platform's locale encoding.
nb.cells = cells
with open("nlp/lora/lora.ipynb", "w", encoding="utf-8") as f:
    nbf.write(nb, f)
print("Generated nlp/lora/lora.ipynb")
|
|