#!/usr/bin/env python3
"""Generate the BERT masked-language-model tutorial notebook.

The script assembles a list of markdown and code cells and writes them to
``nlp/bert/bert.ipynb`` with ``nbformat``. The cell sources are plain strings;
they are not executed here, only stored in the notebook.
"""
import nbformat as nbf

nb = nbf.v4.new_notebook()
nb.metadata = {
    "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
    "language_info": {"name": "python", "version": "3.12.0"},
}

# Cells are collected in order of appearance and attached to the notebook at
# the end of the script.
cells = []


def md(s):
    """Append a markdown cell with source *s* to the notebook cell list."""
    cells.append(nbf.v4.new_markdown_cell(s))


def code(s):
    """Append a code cell with source *s* to the notebook cell list."""
    cells.append(nbf.v4.new_code_cell(s))


md("""\
# BERT: Bidirectional Encoder Representations from Transformers

Transformer Encoder pre-trained with Masked Language Model (MLM).
""")

md("""\
## 背景

BERT(2018)通过**掩码语言模型(MLM)**在无标注文本上预训练,
然后用少量标注数据微调下游任务。核心思想是**"熵增降噪"**:

1. **熵增**:随机掩码 15% 的 token,增加不确定性
2. **降噪**:训练 Transformer 根据上下文预测被掩码的词

与 GPT 不同,BERT 使用**双向注意力**——每个 token 可以 attend 到所有 token。
""")

md("""\
## 数学原理

### Masked Language Model

随机选择 15% 的位置进行干扰:

- 80% 替换为 `[MASK]`
- 10% 替换为随机 token
- 10% 保持不变

损失只计算被掩码的位置:

$$\\mathcal{L} = -\\sum_{i \\in \\mathcal{M}} \\log P(x_i \\mid \\mathbf{x}_{\\setminus i})$$

其中 $\\mathcal{M}$ 是被掩码的位置集合。

### 架构

```
Input → Token Embed + Segment Embed + Position Encoding → [EncoderBlock × N] → LayerNorm → MLM Head → vocab logits
```
""")

code("""\
import math

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

from nlp.bert.tokenizer import CharTokenizer
from nlp.bert.model import BERTForMLM
from utils.device import get_device

device = get_device()
print(f"Device: {device}")
""")

code("""\
# 构建字符级分词器
tokenizer = CharTokenizer()
print(f"Vocabulary size: {tokenizer.vocab_size}")

# 加载 text8
ds = load_dataset("afmck/text8", split="train")
raw = ds[0]["text"]
chunk_size = 1000
chunks = [raw[i:i + chunk_size] for i in range(0, len(raw), chunk_size)]
chunks = chunks[:5000]
print(f"Chunks: {len(chunks):,}")
""")

# Dataset cell. Special tokens ([CLS]/[SEP]/[PAD]) are never selected for
# masking, so their labels must be -100 as well; otherwise the loss (which
# ignores only -100) would also be computed on those positions, contradicting
# the "loss only on masked positions" rule stated in the notebook.
code("""\
class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=128, mask_prob=0.15):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mask_prob = mask_prob
        self.examples = []
        for text in texts:
            tokens, _ = tokenizer.encode(text, max_len)
            self.examples.append(tokens)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        tokens = list(self.examples[idx])
        labels = list(tokens)
        for i in range(len(tokens)):
            if tokens[i] in (self.tokenizer.cls_id, self.tokenizer.sep_id, self.tokenizer.pad_id):
                labels[i] = -100
                continue
            if torch.rand(1).item() < self.mask_prob:
                r = torch.rand(1).item()
                if r < 0.8:
                    tokens[i] = self.tokenizer.mask_id
                elif r < 0.9:
                    tokens[i] = torch.randint(5, self.tokenizer.vocab_size, (1,)).item()
            else:
                labels[i] = -100
        return {
            "input_ids": torch.tensor(tokens, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
            "attention_mask": torch.tensor(
                [1 if t != self.tokenizer.pad_id else 0 for t in tokens], dtype=torch.long
            ),
        }


dataset = TextDataset(chunks, tokenizer, max_len=128)
loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)
""")

code("""\
model = BERTForMLM(
    vocab_size=tokenizer.vocab_size,
    d_model=128,
    n_heads=4,
    n_layers=4,
    max_len=128,
).to(device)
print(f"Parameters: {model.num_params():,}")
""")

md("""\
## 训练

> ⏱ 预估耗时:**10 epoch × ~60s/epoch ≈ 10 分钟**(M4 Max, batch_size=32)
""")

code("""\
NUM_EPOCHS = 10
LR = 1e-4

criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = optim.AdamW(model.parameters(), lr=LR)

loss_history = []
ppl_history = []

for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        optimizer.zero_grad()
        logits, _ = model(input_ids, attention_mask)
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    avg_loss = total_loss / num_batches
    perplexity = math.exp(avg_loss)
    loss_history.append(avg_loss)
    ppl_history.append(perplexity)
    print(f"Epoch [{epoch:2d}/{NUM_EPOCHS}] Loss: {avg_loss:.4f} PPL: {perplexity:.2f}")
""")

md("""## Loss 曲线""")

code("""\
import matplotlib.pyplot as plt
from utils.device import get_device

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.plot(loss_history, marker='o')
ax1.set_xlabel("Epoch"); ax1.set_ylabel("Loss"); ax1.set_title("Training Loss"); ax1.grid(True)
ax2.plot(ppl_history, marker='o', color='orange')
ax2.set_xlabel("Epoch"); ax2.set_ylabel("Perplexity"); ax2.set_title("Perplexity"); ax2.grid(True)
plt.tight_layout(); plt.show()
""")

md("""## [MASK] 预测演示

训练完成后,输入一段带 `[MASK]` 的文本,观察模型预测结果。""")

# Prediction cell. Two inference-time fixes relative to the training cell:
#   * model.eval() is called, because the training loop leaves the model in
#     train mode;
#   * the same pad-based attention mask used by TextDataset is passed to the
#     model, so short prompts padded to max_len do not attend to padding.
code("""\
def predict_mask(text, model, tokenizer, top_k=5):
    tokens, _ = tokenizer.encode(text, max_len=128)
    input_ids = torch.tensor([tokens], dtype=torch.long).to(device)
    attention_mask = torch.tensor(
        [[1 if t != tokenizer.pad_id else 0 for t in tokens]], dtype=torch.long
    ).to(device)
    mask_id = tokenizer.mask_id

    model.eval()
    with torch.no_grad():
        logits, _ = model(input_ids, attention_mask)
    probs = torch.softmax(logits[0], dim=-1)

    mask_positions = [i for i, t in enumerate(tokens) if t == mask_id]
    for pos in mask_positions:
        top_probs, top_indices = torch.topk(probs[pos], top_k)
        preds = [tokenizer.id_to_word[idx.item()] for idx in top_indices]
        print(f"Position {pos}: {preds} (probs: {top_probs.tolist()})")


# 示例:输入含 [MASK] 的句子
prompts = [
    "once upon a [MASK] there was a beautiful princess",
    "the [MASK] is shining brightly in the sky today",
]
for text in prompts:
    print(f"\\nInput: {text}")
    print("-" * 50)
    predict_mask(text, model, tokenizer)
""")

md("""\
## 思考题

1. BERT 为什么用 80/10/10 的掩码策略?100% [MASK] 会怎样?
2. 双向注意力(BERT)和因果注意力(GPT)分别适合什么任务?
3. 把 `mask_prob` 改到 0.5(50% 掩码),loss 会上升还是下降?试试看。
4. BERT 的 [CLS] token 在预训练中没有明确任务,为什么能用于分类?
""")

nb.cells = cells

out = "nlp/bert/bert.ipynb"
# The notebook contains non-ASCII text, so the encoding is pinned to UTF-8
# instead of relying on the platform default.
with open(out, "w", encoding="utf-8") as f:
    nbf.write(nb, f)
print(f"Generated {out}")