| |
| """Generate Word2Vec notebook.""" |
|
|
| import nbformat as nbf |
|
|
| nb = nbf.v4.new_notebook() |
| nb.metadata = { |
| "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"}, |
| "language_info": {"name": "python", "version": "3.12.0"}, |
| } |
|
|
| cells = [] |
| def md(s): cells.append(nbf.v4.new_markdown_cell(s)) |
| def code(s): cells.append(nbf.v4.new_code_cell(s)) |
|
|
| md("""\ |
| # Word2Vec: Word Embeddings |
| |
| CBOW + Skip-gram with Negative Sampling, trained on text8. |
| """) |
|
|
| md("""\ |
| ## 背景 |
| |
| Word2Vec(Mikolov 2013)学习词的分布式表示(dense vectors), |
| 使得语义相近的词在向量空间中也相近。 |
| |
| 两种架构: |
| - **CBOW**:用上下文词预测中心词 |
| - **Skip-gram**:用中心词预测上下文词 |
| |
| 两种加速技巧: |
| - **Negative Sampling**:训练时只更新少量负样本,避免完整的 softmax |
| - **Subsampling**:高频词("the", "a")被概率性丢弃,加速训练 |
| """) |
|
|
| md("""\ |
| ## 数学原理 |
| |
| ### Skip-gram with Negative Sampling |
| |
| 给定中心词 $w_t$ 和上下文词 $w_c$: |
| |
| $$\\mathcal{L} = -\\log \\sigma(v_{w_c} \\cdot v_{w_t}) - \\sum_{k=1}^K \\log \\sigma(-v_{w_k} \\cdot v_{w_t})$$ |
| |
| 其中 $w_k$ 是从噪声分布 $P(w) \\propto \\text{count}(w)^{0.75}$ 采样的负样本。 |
| |
| ### CBOW |
| |
| $$\\mathcal{L} = -\\log \\sigma(v_{w_t} \\cdot \\bar{v}_{\\text{ctx}}) - \\sum_{k=1}^K \\log \\sigma(-v_{w_k} \\cdot \\bar{v}_{\\text{ctx}})$$ |
| |
| 其中 $\\bar{v}_{\\text{ctx}} = \\frac{1}{|C|} \\sum_{c \\in C} v_{w_c}$。 |
| """) |
|
|
| code("""\ |
| import torch |
| from torch.utils.data import DataLoader |
| from nlp.word2vec.train import ( |
| load_texts, build_vocab, subsample, generate_training_pairs, |
| NoiseSampler, CBOWDataset, SkipGramDataset, train_epoch, |
| ) |
| from nlp.word2vec.model import Word2Vec |
| from utils.device import get_device |
| |
| device = get_device() |
| print(f"Device: {device}") |
| """) |
|
|
| code("""\ |
| # 加载数据 |
| print("Loading text8...") |
| texts = load_texts() |
| word_to_id, id_to_word, vocab = build_vocab(texts, min_count=3) |
| vocab_size = len(word_to_id) |
| print(f"Vocabulary: {vocab_size:,}") |
| """) |
|
|
| code("""\ |
| # 生成训练数据 |
| tokenized = subsample(texts, word_to_id) |
| cbow_pairs, skipgram_pairs = generate_training_pairs(tokenized, window_size=2) |
| |
| max_pairs = 100000 |
| cbow_pairs = cbow_pairs[:max_pairs] |
| sg_pairs = skipgram_pairs[:max_pairs] |
| |
| cbow_dataset = CBOWDataset(cbow_pairs, max_window=2) |
| sg_dataset = SkipGramDataset(sg_pairs, vocab_size) |
| cbow_loader = DataLoader(cbow_dataset, batch_size=128, shuffle=True) |
| sg_loader = DataLoader(sg_dataset, batch_size=128, shuffle=True) |
| noise_sampler = NoiseSampler(vocab, vocab_size) |
| print(f"CBOW: {len(cbow_pairs):,} Skip-gram: {len(sg_pairs):,}") |
| """) |
|
|
| md("""## 训练 |
| |
| > ⏱ 预估耗时:**5 epoch × ~1min/epoch ≈ 5 分钟**(M4 Max, batch_size=128) |
| """) |
|
|
| code("""\ |
| NUM_EPOCHS = 5 |
| LR = 0.01 |
| K_NEG = 5 |
| |
| model_sg = Word2Vec(vocab_size, embed_dim=50) |
| optimizer = torch.optim.Adam(model_sg.parameters(), lr=LR) |
| |
| sg_loss_hist = [] |
| for epoch in range(1, NUM_EPOCHS + 1): |
| loss = train_epoch(model_sg, sg_loader, noise_sampler, optimizer, k=K_NEG, mode="skipgram") |
| sg_loss_hist.append(loss) |
| print(f"Skip-gram Epoch [{epoch}/{NUM_EPOCHS}] Loss: {loss:.4f}") |
| """) |
|
|
| code("""\ |
| import matplotlib.pyplot as plt |
| from utils.device import get_device |
| plt.plot(sg_loss_hist, marker='o') |
| plt.xlabel("Epoch"); plt.ylabel("Loss"); plt.title("Skip-gram Training Loss"); plt.grid(True) |
| plt.show() |
| """) |
|
|
| md("""## 词向量相似度搜索 |
| |
| 训练完成后,用余弦相似度查找语义相近的词。""") |
|
|
| code("""\ |
| embeddings = model_sg.get_embeddings() |
| |
| def similar_words(word, top_k=10): |
| if word not in word_to_id: |
| print(f"'{word}' not in vocabulary") |
| return |
| idx = word_to_id[word] |
| vec = embeddings[idx] |
| sims = (embeddings @ vec) / (torch.norm(embeddings, dim=1) * torch.norm(vec)) |
| vals, inds = torch.topk(sims, top_k + 1) |
| print(f"Words similar to '{word}':") |
| for val, idx in zip(vals[1:], inds[1:]): |
| print(f" {id_to_word[idx.item()]:<15} {val.item():.4f}") |
| |
| for w in ["computer", "science", "king", "water"]: |
| print(); similar_words(w) |
| """) |
|
|
| md("""\ |
| ## 思考题 |
| |
| 1. Skip-gram 和 CBOW 分别擅长什么?(低频词 vs 高频词)" |
| 2. 负样本数量 $k$ 越大越好还是越小越好?典型值是多少? |
| 3. 为什么噪声分布要用 $\\text{count}(w)^{0.75}$ 而不是原始频率? |
| 4. 词嵌入的维度(50)对语义质量有什么影响?试试改到 100。 |
| """) |
|
|
| nb.cells = cells |
| out = "nlp/word2vec/word2vec.ipynb" |
| with open(out, "w") as f: |
| nbf.write(nb, f) |
| print(f"Generated {out}") |
|
|