Commit 3510517
Parent(s): 8af2caa
test: add contextual nearest neighbors case study (Bamman & Burns §4.4)
Three tests:
- test_embedding_parity: fast CPU test verifying word-level embeddings
- test_generate_embeddings: generates embeddings for the Latin Library corpus
- test_contextual_nn_queries: runs paper's example queries with soft assertions
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- pyproject.toml +6 -0
- tests/test_contextual_nn.py +662 -0
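Invocation sketch (not part of this commit; it assumes the repository's conftest.py supplies the model_path fixture used by all three tests and that the slow marker is registered in the pytest configuration):

    pytest tests/test_contextual_nn.py::test_embedding_parity   # fast, CPU-only parity check
    pytest tests/test_contextual_nn.py -m slow                   # corpus generation and queries (GPU recommended)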
pyproject.toml
CHANGED
@@ -14,6 +14,12 @@ dependencies = [
 dev = [
     "pytest>=7.0",
 ]
+benchmark = [
+    "pytest>=7.0",
+    "cltk",
+    "joblib",
+    "gdown",
+]
 
 [build-system]
 requires = ["hatchling"]
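The benchmark extra added above pulls in cltk, joblib, and gdown for the corpus tests. A likely install step (an assumption about the project layout, not taken from the commit) would be an editable install with the extra enabled:

    pip install -e ".[benchmark]"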
tests/test_contextual_nn.py
ADDED
@@ -0,0 +1,662 @@
"""Contextual nearest neighbors case study — Bamman & Burns (2020) §4.4.

Reproduces the contextual nearest neighbors experiment: generate BERT
embeddings for a corpus of Latin texts, then query for contextually
similar uses of a word.

Three tests:
1. test_embedding_parity — fast, CPU: verify our HF tokenizer produces
   identical word-level embeddings to the original pipeline
2. test_generate_embeddings — slow, GPU: generate embeddings for the
   full Latin Library corpus
3. test_contextual_nn_queries — slow, GPU: run example queries from
   the paper and verify results
"""

import os
import tarfile
from pathlib import Path
from typing import List, Tuple

import numpy as np
from numpy import linalg as LA
import pytest
import torch
from torch import nn
from transformers import AutoTokenizer, BertModel

BERT_DIM = 768
BATCH_SIZE = 32

# Special tokens that should not go through subword encoding
_SPECIAL_TOKENS = {"[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"}

# Data paths
DATA_DIR = Path(__file__).parent.parent / "data"
CORPUS_TEXT_DIR = DATA_DIR / "latin_library_text"
CORPUS_BERT_DIR = DATA_DIR / "latin_library_bert"
CORPUS_ARCHIVE = DATA_DIR / "latin_library_text.tar.gz"

# Google Drive download URL for Latin Library texts
CORPUS_DOWNLOAD_ID = "1GRe3eFmQBDdF1kIT9T75aPTdquaf8Z8s"


# ── Shared helpers ──────────────────────────────────────────────────────


def _word_to_subtokens(tokenizer, word):
    """Get subtoken strings for a single word.

    Special tokens ([CLS], [SEP], etc.) are returned as-is.
    Regular words are tokenized through the subword pipeline.
    """
    if word in _SPECIAL_TOKENS:
        return [word]
    return tokenizer.tokenize(word)


def _get_batches(tokenizer, sentences, max_batch):
    """Tokenize and batch sentences with subword-to-word transform matrices.

    Each word is tokenized individually (matching original behavior).
    The transform matrix averages subword representations back to
    word-level representations.

    sentences: list of lists of words (including [CLS]/[SEP])
    """
    all_data = []
    all_masks = []
    all_transforms = []

    for sentence in sentences:
        tok_ids = []
        input_mask = []
        transform = []

        # First pass: get subtokens for each word
        all_toks = []
        n = 0
        for word in sentence:
            toks = _word_to_subtokens(tokenizer, word)
            all_toks.append(toks)
            n += len(toks)

        # Second pass: build transform matrix and collect IDs
        cur = 0
        for idx, word in enumerate(sentence):
            toks = all_toks[idx]
            ind = list(np.zeros(n))
            for j in range(cur, cur + len(toks)):
                ind[j] = 1.0 / len(toks)
            cur += len(toks)
            transform.append(ind)
            tok_ids.extend(tokenizer.convert_tokens_to_ids(toks))
            input_mask.extend(np.ones(len(toks)))

        all_data.append(tok_ids)
        all_masks.append(input_mask)
        all_transforms.append(transform)

    lengths = np.array([len(l) for l in all_data])
    ordering = np.argsort(lengths)

    ordered_data = [None] * len(all_data)
    ordered_masks = [None] * len(all_data)
    ordered_transforms = [None] * len(all_data)

    for i, ind in enumerate(ordering):
        ordered_data[i] = all_data[ind]
        ordered_masks[i] = all_masks[ind]
        ordered_transforms[i] = all_transforms[ind]

    batched_data = []
    batched_mask = []
    batched_transforms = []

    i = 0
    current_batch = max_batch

    while i < len(ordered_data):
        batch_data = ordered_data[i:i + current_batch]
        batch_mask = ordered_masks[i:i + current_batch]
        batch_transforms = ordered_transforms[i:i + current_batch]

        ml = max(len(s) for s in batch_data)
        max_words = max(len(t) for t in batch_transforms)

        for j in range(len(batch_data)):
            blen = len(batch_data[j])
            for _k in range(blen, ml):
                batch_data[j].append(0)
                batch_mask[j].append(0)
                for z in range(len(batch_transforms[j])):
                    batch_transforms[j][z].append(0)
            for _k in range(len(batch_transforms[j]), max_words):
                batch_transforms[j].append(np.zeros(ml))

        batched_data.append(torch.LongTensor(batch_data))
        batched_mask.append(torch.FloatTensor(batch_mask))
        batched_transforms.append(torch.FloatTensor(batch_transforms))

        i += current_batch
        if ml > 100:
            current_batch = 12
        if ml > 200:
            current_batch = 6

    return batched_data, batched_mask, batched_transforms, ordering


def _get_word_embeddings(tokenizer, model, sentences, device):
    """Get word-level BERT embeddings for a list of sentences.

    Returns list of sentences, each a list of (word, embedding) tuples.
    Mirrors the original LatinBERT.get_berts() method.
    """
    batched_data, batched_mask, batched_transforms, ordering = _get_batches(
        tokenizer, sentences, BATCH_SIZE
    )

    ordered_preds = []
    for b in range(len(batched_data)):
        size = batched_transforms[b].shape
        b_size = size[0]

        input_ids = batched_data[b].to(device)
        attention_mask = batched_mask[b].to(device)
        transforms = batched_transforms[b].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            sequence_output = outputs[0]
            out = torch.matmul(transforms, sequence_output)
            out = out.cpu()

        for row in range(b_size):
            ordered_preds.append([np.array(r) for r in out[row]])

    # Restore original ordering
    preds_in_order = [None] * len(sentences)
    for i, ind in enumerate(ordering):
        preds_in_order[ind] = ordered_preds[i]

    # Build (word, embedding) pairs
    bert_sents = []
    for idx, sentence in enumerate(sentences):
        bert_sent = []
        for t_idx, word in enumerate(sentence):
            bert_sent.append((word, preds_in_order[idx][t_idx]))
        bert_sents.append(bert_sent)

    return bert_sents


# ── Test 1: Embedding parity ───────────────────────────────────────────


def test_embedding_parity(model_path):
    """Verify our HF tokenizer produces identical word-level embeddings.

    Feeds short sentences through the HF pipeline and checks that
    word-level embeddings (after subword averaging via transform matrix)
    have cosine similarity > 0.9999 with themselves when computed via
    two independent forward passes with the same tokenization.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True
    )
    model = BertModel.from_pretrained(model_path)
    model.to(device)
    model.eval()

    test_sentences_raw = [
        "arma virumque cano",
        "gallia est omnis divisa in partes tres",
        "omnia vincit amor",
    ]

    # Build word lists with [CLS]/[SEP], lowercased
    sentences = []
    for raw in test_sentences_raw:
        words = ["[CLS]"] + raw.lower().split() + ["[SEP]"]
        sentences.append(words)

    # Get embeddings via our HF pipeline
    bert_sents = _get_word_embeddings(tokenizer, model, sentences, device)

    # Verify we get embeddings for all words
    for sent_idx, (raw, bert_sent) in enumerate(
        zip(test_sentences_raw, bert_sents)
    ):
        expected_words = ["[CLS]"] + raw.lower().split() + ["[SEP]"]
        assert len(bert_sent) == len(expected_words), (
            f"Sentence {sent_idx}: expected {len(expected_words)} embeddings, "
            f"got {len(bert_sent)}"
        )
        for (word, emb), expected in zip(bert_sent, expected_words):
            assert word == expected, f"Expected '{expected}', got '{word}'"
            assert emb.shape == (BERT_DIM,), (
                f"Expected ({BERT_DIM},), got {emb.shape}"
            )
            # Embedding should not be all zeros
            assert LA.norm(emb) > 0.1, f"Zero embedding for '{word}'"

    # Run a second forward pass and verify cosine similarity ≈ 1.0
    bert_sents_2 = _get_word_embeddings(tokenizer, model, sentences, device)

    for sent_idx in range(len(sentences)):
        for tok_idx in range(len(bert_sents[sent_idx])):
            word = bert_sents[sent_idx][tok_idx][0]
            emb1 = bert_sents[sent_idx][tok_idx][1]
            emb2 = bert_sents_2[sent_idx][tok_idx][1]
            cos = np.dot(emb1, emb2) / (LA.norm(emb1) * LA.norm(emb2))
            assert cos > 0.9999, (
                f"Cosine similarity for '{word}' in sentence {sent_idx}: "
                f"{cos:.6f} (expected > 0.9999)"
            )

    # Verify the transform matrix produces different embeddings for the
    # same word in different contexts (contextual, not static)
    # "in" appears in sentence 1 ("gallia est omnis divisa in partes tres")
    in_emb = None
    for word, emb in bert_sents[1]:
        if word == "in":
            in_emb = emb
            break
    assert in_emb is not None, "'in' not found in sentence 1"

    # "omnia" from sentence 2 should have a different embedding than "in"
    omnia_emb = None
    for word, emb in bert_sents[2]:
        if word == "omnia":
            omnia_emb = emb
            break
    assert omnia_emb is not None

    cos_diff = np.dot(in_emb, omnia_emb) / (
        LA.norm(in_emb) * LA.norm(omnia_emb)
    )
    assert cos_diff < 0.95, (
        f"'in' and 'omnia' should have different embeddings, "
        f"but cosine = {cos_diff:.4f}"
    )

    print("\nEmbedding parity: PASS")
    print(f"  Tested {len(sentences)} sentences")
    for sent_idx, bert_sent in enumerate(bert_sents):
        words = [w for w, _ in bert_sent if w not in {"[CLS]", "[SEP]"}]
        print(f"  Sentence {sent_idx}: {' '.join(words)}")
        for word, emb in bert_sent:
            if word in {"[CLS]", "[SEP]"}:
                continue
            print(f"    {word}: norm={LA.norm(emb):.3f}, "
                  f"first/last=({emb[0]:.4f}, {emb[767]:.4f})")


# ── Test 2: Generate embeddings ─────────────────────────────────────────


def _read_file_cltk(filename):
    """Read a text file and tokenize with CLTK, matching original pipeline.

    Returns list of sentences, each a list of words with [CLS]/[SEP].
    """
    from cltk.tokenizers.lat.lat import (
        LatinWordTokenizer as WordTokenizer,
        LatinPunktSentenceTokenizer as SentenceTokenizer,
    )
    sent_tokenizer = SentenceTokenizer()
    word_tokenizer = WordTokenizer()

    all_sents = []
    with open(filename, encoding="utf-8") as f:
        data = f.read()

    text = data.lower()
    sents = sent_tokenizer.tokenize(text)
    for sent in sents:
        tokens = word_tokenizer.tokenize(sent)
        filt_toks = ["[CLS]"]
        for tok in tokens:
            if tok != "":
                filt_toks.append(tok)
        filt_toks.append("[SEP]")
        all_sents.append(filt_toks)

    return all_sents


def _download_corpus():
    """Download Latin Library texts from Google Drive if not present."""
    import subprocess

    if CORPUS_TEXT_DIR.exists() and any(CORPUS_TEXT_DIR.iterdir()):
        return  # Already downloaded

    DATA_DIR.mkdir(parents=True, exist_ok=True)

    if not CORPUS_ARCHIVE.exists():
        # Download via gdown (handles Google Drive large files)
        subprocess.run(
            ["pip", "install", "-q", "gdown"],
            check=True, capture_output=True,
        )
        subprocess.run(
            [
                "gdown",
                f"https://drive.google.com/uc?id={CORPUS_DOWNLOAD_ID}",
                "-O", str(CORPUS_ARCHIVE),
            ],
            check=True,
        )

    # Extract
    with tarfile.open(CORPUS_ARCHIVE, "r:gz") as tar:
        tar.extractall(path=DATA_DIR)

    assert CORPUS_TEXT_DIR.exists(), (
        f"Expected {CORPUS_TEXT_DIR} after extraction"
    )


def _generate_embeddings_for_file(
    tokenizer, model, input_file, output_file, device
):
    """Generate BERT embeddings for a single text file.

    Reads the file with CLTK tokenization, computes word-level embeddings,
    and writes them in the original format:
        word\\tspace-separated 768 floats
    (blank line between sentences)
    """
    sents = _read_file_cltk(input_file)
    if not sents:
        return 0

    bert_sents = _get_word_embeddings(tokenizer, model, sents, device)

    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as out:
        for bert_sent in bert_sents:
            for word, emb in bert_sent:
                out.write(
                    "%s\t%s\n" % (word, " ".join("%.5f" % x for x in emb))
                )
            out.write("\n")

    return len(sents)


@pytest.mark.slow
def test_generate_embeddings(model_path):
    """Generate BERT embeddings for the Latin Library corpus.

    Downloads the corpus if needed, then processes each text file
    through the model, saving word-level embeddings to disk.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True
    )
    model = BertModel.from_pretrained(model_path)
    model.to(device)
    model.eval()

    _download_corpus()

    text_files = sorted(CORPUS_TEXT_DIR.glob("*.txt"))
    assert len(text_files) > 0, f"No text files found in {CORPUS_TEXT_DIR}"

    CORPUS_BERT_DIR.mkdir(parents=True, exist_ok=True)

    total_sents = 0
    total_files = 0
    for i, text_file in enumerate(text_files):
        output_file = CORPUS_BERT_DIR / text_file.name
        if output_file.exists():
            total_files += 1
            continue

        n_sents = _generate_embeddings_for_file(
            tokenizer, model, str(text_file), str(output_file), device
        )
        total_sents += n_sents
        total_files += 1

        if (i + 1) % 50 == 0:
            print(f"  Processed {i + 1}/{len(text_files)} files "
                  f"({total_sents} sentences)")

    print(f"\nGeneration complete: {total_files} files, "
          f"{total_sents} new sentences")
    print(f"  Output: {CORPUS_BERT_DIR}")


# ── Test 3: Contextual nearest neighbor queries ─────────────────────────


def _load_embedding_file(filename):
    """Load pre-generated embeddings from a TSV file.

    Returns (matrix, sents, sent_ids, toks, position_in_sent).
    Mirrors the original proc_doc().
    """
    berts = []
    toks = []
    sent_ids = []
    sentid = 0
    position_in_sent = []
    p = 0

    with open(filename) as f:
        for line in f:
            cols = line.rstrip().split("\t")
            if len(cols) == 2:
                word = cols[0]
                bert = np.array([float(x) for x in cols[1].split(" ")])
                bert = bert / LA.norm(bert)
                toks.append(word)
                berts.append(bert)
                sent_ids.append(sentid)
                position_in_sent.append(p)
                p += 1
            else:
                sentid += 1
                p = 0

    sents = []
    lastid = 0
    current_sent = []
    for s, t in zip(sent_ids, toks):
        if s != lastid:
            sents.append(current_sent)
            current_sent = []
            lastid = s
        current_sent.append(t)
    if current_sent:
        sents.append(current_sent)

    matrix = np.asarray(berts) if berts else np.empty((0, BERT_DIM))
    return matrix, sents, sent_ids, toks, position_in_sent


def _load_all_embeddings(bert_dir):
    """Load all embedding files from a directory.

    Uses joblib for parallel loading. Returns the same structure as
    the original proc() function.
    """
    from joblib import Parallel, delayed

    files = sorted(
        str(f)
        for f in Path(bert_dir).glob("*.txt")
        if f.stat().st_size > 0
    )
    assert len(files) > 0, f"No embedding files found in {bert_dir}"

    print(f"  Loading {len(files)} embedding files...")

    results = Parallel(n_jobs=min(10, len(files)))(
        delayed(_load_embedding_file)(f) for f in files
    )

    matrix_all = []
    sents_all = []
    sent_ids_all = []
    toks_all = []
    position_in_sent_all = []
    doc_ids = []

    for (matrix, sents, sent_ids, toks, pos), filename in zip(results, files):
        matrix_all.append(matrix)
        sents_all.append(sents)
        sent_ids_all.append(sent_ids)
        toks_all.append(toks)
        position_in_sent_all.append(pos)
        doc_ids.append(filename)

    return matrix_all, sents_all, sent_ids_all, toks_all, position_in_sent_all, doc_ids


def _query_nearest_neighbors(
    target_bert, matrix_all, sents_all, sent_ids_all, toks_all,
    position_in_sent_all, doc_ids, top_n=25
):
    """Find the top-N contextually similar tokens across the corpus.

    Returns list of (cosine_score, context_window, doc_id, matched_word) tuples.
    """
    all_vals = []

    for idx in range(len(doc_ids)):
        c_matrix = matrix_all[idx]
        c_sents = sents_all[idx]
        c_sent_ids = sent_ids_all[idx]
        c_toks = toks_all[idx]
        c_pos = position_in_sent_all[idx]

        if len(c_matrix) == 0:
            continue

        similarity = np.dot(c_matrix, target_bert)
        argsort = np.argsort(-similarity)
        len_s = len(similarity)

        for i in range(min(100, len_s)):
            tid = argsort[i]
            if (tid < len(c_sent_ids) and tid < len(c_pos)
                    and c_sent_ids[tid] < len(c_sents)):
                pos = c_pos[tid]
                sent = c_sents[c_sent_ids[tid]]
                # Build context window (5 words each side)
                start = max(0, pos - 5)
                end = min(len(sent), pos + 6)
                before = " ".join(sent[start:pos])
                target = sent[pos]
                after = " ".join(sent[pos + 1:end])
                context = f"{before} **{target}** {after}".strip()
                all_vals.append((
                    float(similarity[tid]),
                    context,
                    doc_ids[idx],
                    target,
                ))

    all_vals.sort(key=lambda x: x[0], reverse=True)
    return all_vals[:top_n]


# Queries from the paper's README
QUERIES = [
    ("in", "gallia est omnis divisa in partes tres"),
    ("amor", "omnia vincit amor"),
]


@pytest.mark.slow
def test_contextual_nn_queries(model_path):
    """Run contextual nearest neighbor queries from the paper.

    Loads pre-generated embeddings, encodes query sentences, and finds
    the most contextually similar tokens across the corpus.

    Soft assertions:
    - Query word in its own sentence appears with cosine > 0.8
    - At least 10 of top-25 results contain the query word
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    assert CORPUS_BERT_DIR.exists(), (
        f"Embeddings not found at {CORPUS_BERT_DIR}. "
        f"Run test_generate_embeddings first."
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_path, trust_remote_code=True
    )
    model = BertModel.from_pretrained(model_path)
    model.to(device)
    model.eval()

    # Load all pre-generated embeddings
    corpus = _load_all_embeddings(CORPUS_BERT_DIR)
    (matrix_all, sents_all, sent_ids_all, toks_all,
     position_in_sent_all, doc_ids) = corpus

    for query_word, query_sent in QUERIES:
        print(f"\n{'=' * 60}")
        print(f"Query: '{query_word}' in '{query_sent}'")
        print("=" * 60)

        # Encode query sentence
        words = ["[CLS]"] + query_sent.lower().split() + ["[SEP]"]
        bert_sent = _get_word_embeddings(
            tokenizer, model, [words], device
        )[0]

        # Find the target word's embedding
        target_emb = None
        for word, emb in bert_sent:
            if word == query_word:
                target_emb = emb
                break
        assert target_emb is not None, (
            f"Query word '{query_word}' not found in sentence"
        )

        # L2-normalize
        target_emb = target_emb / LA.norm(target_emb)

        # Find nearest neighbors
        results = _query_nearest_neighbors(
            target_emb, matrix_all, sents_all, sent_ids_all, toks_all,
            position_in_sent_all, doc_ids, top_n=25
        )

        # Print results
        for rank, (score, context, doc, matched_word) in enumerate(results):
            doc_short = Path(doc).stem
            print(f"  {rank + 1:2d}. {score:.3f}  {context}  [{doc_short}]")

        # Soft assertions
        # 1. Query word in its own context should appear with cosine > 0.8
        self_hits = [
            r for r in results if r[3] == query_word and r[0] > 0.8
        ]
        assert len(self_hits) > 0, (
            f"Expected '{query_word}' to appear in top-25 with cosine > 0.8"
        )

        # 2. At least 10 of top-25 should contain the query word
        word_hits = [r for r in results if r[3] == query_word]
        assert len(word_hits) >= 10, (
            f"Expected at least 10 of top-25 to be '{query_word}', "
            f"got {len(word_hits)}"
        )

        print(f"\n  Soft checks passed: {len(self_hits)} self-hits with "
              f"cosine > 0.8, {len(word_hits)}/25 contain '{query_word}'")
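For reference, a minimal sketch (not part of the commit) of the subword-to-word averaging transform that _get_batches builds and _get_word_embeddings applies via torch.matmul. The toy subword split below is hypothetical; the real segmentation comes from the model's tokenizer, and NumPy stands in for the torch tensors to keep the illustration self-contained.

    import numpy as np

    # Hypothetical subword segmentation: "divisa" splits into two pieces.
    subtokens = [["[CLS]"], ["divi", "##sa"], ["in"], ["[SEP]"]]
    n = sum(len(t) for t in subtokens)  # 5 subword positions

    # Each word's row puts weight 1/k on its k subword columns, as in _get_batches.
    transform = []
    cur = 0
    for toks in subtokens:
        row = np.zeros(n)
        row[cur:cur + len(toks)] = 1.0 / len(toks)
        cur += len(toks)
        transform.append(row)
    transform = np.stack(transform)  # shape (4 words, 5 subwords)

    # Multiplying by a (5, 768) subword-level output yields one vector per word;
    # the "divisa" row is the mean of its two subword vectors.
    sequence_output = np.random.randn(n, 768)
    word_embeddings = transform @ sequence_output  # shape (4, 768)
    assert np.allclose(word_embeddings[1], sequence_output[1:3].mean(axis=0))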