# File size: 6,632 Bytes

# 337273e

"""
Batched BLiMP scorer for İvme — fast, GPU-parallel.

Scores all 67 BLiMP subtasks by batching sentence pairs through the model
instead of looping one at a time. On a Blackwell this runs the whole suite
in well under a minute.

Method: for each (good, bad) pair, compute total log-prob of each sentence
and count a win when logprob(good) > logprob(bad). Sentences are padded into
batches and scored with a length mask so padding contributes nothing.

Usage:
    python eval_blimp.py --checkpoint checkpoints/ivme_base_ema.pt
    python eval_blimp.py --checkpoint checkpoints/ivme_base_ema.pt --batch_size 256
"""

from __future__ import annotations
import argparse
import json
import sys
import torch
import torch.nn.functional as F
from tokenizers import Tokenizer
from datasets import load_dataset

sys.path.insert(0, ".")
from model import IvmeConfig, IvmeConversate

# Tokenizer definition file, resolved relative to the current working directory.
TOKENIZER_PATH = "ivme_tokenizer.json"

# Identifiers of the 67 BLiMP subtasks, one per line. Each name is matched
# against the dataset's "UID" column, and results are reported in this order.
BLIMP_TASKS = [
    "adjunct_island",
    "anaphor_gender_agreement",
    "anaphor_number_agreement",
    "animate_subject_passive",
    "animate_subject_trans",
    "causative",
    "complex_NP_island",
    "coordinate_structure_constraint_complex_left_branch",
    "coordinate_structure_constraint_object_extraction",
    "determiner_noun_agreement_1",
    "determiner_noun_agreement_2",
    "determiner_noun_agreement_irregular_1",
    "determiner_noun_agreement_irregular_2",
    "determiner_noun_agreement_with_adj_2",
    "determiner_noun_agreement_with_adj_irregular_1",
    "determiner_noun_agreement_with_adj_irregular_2",
    "determiner_noun_agreement_with_adjective_1",
    "distractor_agreement_relational_noun",
    "distractor_agreement_relative_clause",
    "drop_argument",
    "ellipsis_n_bar_1",
    "ellipsis_n_bar_2",
    "existential_there_object_raising",
    "existential_there_quantifiers_1",
    "existential_there_quantifiers_2",
    "existential_there_subject_raising",
    "expletive_it_object_raising",
    "inchoative",
    "intransitive",
    "irregular_past_participle_adjectives",
    "irregular_past_participle_verbs",
    "irregular_plural_subject_verb_agreement_1",
    "irregular_plural_subject_verb_agreement_2",
    "left_branch_island_echo_question",
    "left_branch_island_simple_question",
    "matrix_question_npi_licensor_present",
    "npi_present_1",
    "npi_present_2",
    "only_npi_licensor_present",
    "only_npi_scope",
    "passive_1",
    "passive_2",
    "principle_A_c_command",
    "principle_A_case_1",
    "principle_A_case_2",
    "principle_A_domain_1",
    "principle_A_domain_2",
    "principle_A_domain_3",
    "principle_A_reconstruction",
    "regular_plural_subject_verb_agreement_1",
    "regular_plural_subject_verb_agreement_2",
    "sentential_negation_npi_licensor_present",
    "sentential_negation_npi_scope",
    "sentential_subject_island",
    "superlative_quantifiers_1",
    "superlative_quantifiers_2",
    "tough_vs_raising_1",
    "tough_vs_raising_2",
    "transitive",
    "wh_island",
    "wh_questions_object_gap",
    "wh_questions_subject_gap",
    "wh_questions_subject_gap_long_distance",
    "wh_vs_that_no_gap",
    "wh_vs_that_no_gap_long_distance",
    "wh_vs_that_with_gap",
    "wh_vs_that_with_gap_long_distance",
]


@torch.no_grad()
def batch_logprobs(model, token_lists, device, pad_id, max_len):
    """Return the total log-probability of each sequence in a padded batch.

    Every sequence is truncated to at most ``max_len`` tokens and right-padded
    with ``pad_id`` to the longest (truncated) length in the batch. Position
    ``t`` of the model output is used to score token ``t + 1``, so the first
    token of a sequence is never scored; padded positions are excluded.

    Args:
        model: callable taking a ``(batch, width)`` long tensor and returning
            a pair whose first element is logits indexed as
            ``(batch, position, vocab)``.
        token_lists: list[list[int]] of token ids, one list per sequence.
        device: ``torch.device`` on which the batch is built and scored.
        pad_id: token id used to fill the padded positions.
        max_len: maximum number of tokens kept per sequence.

    Returns:
        A float32 tensor of shape ``(len(token_lists),)`` holding the summed
        token log-probabilities of each sequence. Sequences with fewer than
        two tokens score ``0.0``; an empty batch yields an empty tensor.
    """
    if not token_lists:
        return torch.zeros(0, dtype=torch.float32, device=device)

    width = min(max(len(t) for t in token_lists), max_len)
    if width < 2:
        # No sequence has a (context, target) pair, so nothing can be scored
        # and there is no point in running the model.
        return torch.zeros(len(token_lists), dtype=torch.float32, device=device)

    rows = [list(t[:width]) for t in token_lists]
    lengths = torch.tensor([len(r) for r in rows], dtype=torch.long, device=device)
    # Pad on the host and move the whole batch in a single transfer instead of
    # issuing one small host-to-device copy per row.
    padded = [r + [pad_id] * (width - len(r)) for r in rows]
    inp = torch.tensor(padded, dtype=torch.long, device=device)

    with torch.autocast(device_type=device.type, dtype=torch.bfloat16,
                        enabled=device.type == "cuda"):
        logits, _ = model(inp)

    # Normalise in float32 regardless of the autocast dtype.
    logp = F.log_softmax(logits.float(), dim=-1)
    targets = inp[:, 1:]
    tok_lp = logp[:, :-1, :].gather(-1, targets.unsqueeze(-1)).squeeze(-1)

    # A sequence of n tokens contributes its first n - 1 target positions.
    positions = torch.arange(width - 1, device=device)
    valid = positions.unsqueeze(0) < (lengths - 1).unsqueeze(1)
    # masked_fill rather than multiplying by a 0/1 mask: a padded position
    # whose log-prob is -inf would otherwise turn the sum into NaN (-inf * 0).
    return tok_lp.masked_fill(~valid, 0.0).sum(dim=1)


def main():
    """Score a checkpoint on every BLiMP subtask and write accuracies to JSON.

    Command-line flags:
        --checkpoint  path to a checkpoint with ``cfg`` and ``model`` entries
                      (required).
        --batch_size  number of sentence pairs scored per batch (default 256).
        --output      destination JSON file (default ``blimp_results.json``).

    For each subtask the good and bad sentences are tokenized, scored in
    batches with ``batch_logprobs``, and a pair counts as correct when the
    good sentence has the strictly higher total log-prob. The reported average
    is pooled over all scored examples.

    Raises:
        SystemExit: on an invalid ``--batch_size`` or when no dataset row
            matches any known subtask.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--checkpoint", required=True)
    ap.add_argument("--batch_size", type=int, default=256)
    ap.add_argument("--output", default="blimp_results.json")
    args = ap.parse_args()
    if args.batch_size < 1:
        # A negative step would make the batching range empty and silently
        # report 0% on every task; zero would raise an opaque ValueError.
        ap.error("--batch_size must be a positive integer")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tok = Tokenizer.from_file(TOKENIZER_PATH)
    pad_id = tok.token_to_id("<|pad|>")
    if pad_id is None:
        # Tokenizer has no dedicated pad token; padded positions are masked
        # out when scoring, so id 0 is used as a filler.
        pad_id = 0

    # NOTE(security): weights_only=False unpickles arbitrary objects. It is
    # kept because the checkpoint stores a config object under "cfg"; only
    # load checkpoints that come from a trusted source.
    ckpt = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
    cfg = ckpt["cfg"]
    cfg.attn_backend = "sdpa"
    max_len = cfg.max_seq_len
    model = IvmeConversate(cfg).to(device)
    model.load_state_dict(ckpt["model"])
    model.eval()
    print(f"[blimp] model loaded: {model.num_params()/1e6:.1f}M on {device}")

    print("[blimp] loading full BLiMP dataset (one download)...")
    full_ds = load_dataset("WillHeld/blimp", split="train")
    by_task = {t: {"good": [], "bad": []} for t in BLIMP_TASKS}
    for row in full_ds:
        uid = row["UID"]
        if uid in by_task:
            by_task[uid]["good"].append(row["sentence_good"])
            by_task[uid]["bad"].append(row["sentence_bad"])
    # Report what was actually bucketed, not the raw dataset size, so that a
    # UID mismatch shows up here instead of going unnoticed.
    n_bucketed = sum(len(pairs["good"]) for pairs in by_task.values())
    print(f"[blimp] {n_bucketed} examples bucketed into {len(BLIMP_TASKS)} subtasks\n")

    results = {}
    total_correct = total_examples = 0

    for i, task in enumerate(BLIMP_TASKS):
        goods = by_task[task]["good"]
        bads = by_task[task]["bad"]
        if not goods:
            # Nothing to score; skipping avoids a division by zero below.
            print(f"[{i+1:02d}/{len(BLIMP_TASKS)}] {task:<55} skipped (no examples in dataset)")
            continue

        good_tok = [tok.encode(s).ids for s in goods]
        bad_tok = [tok.encode(s).ids for s in bads]

        correct = 0
        for start in range(0, len(good_tok), args.batch_size):
            gb = good_tok[start : start + args.batch_size]
            bb = bad_tok[start : start + args.batch_size]
            g_lp = batch_logprobs(model, gb, device, pad_id, max_len)
            b_lp = batch_logprobs(model, bb, device, pad_id, max_len)
            # Ties count as a miss: the good sentence must be strictly likelier.
            correct += (g_lp > b_lp).sum().item()

        acc = correct / len(goods)
        results[task] = acc
        total_correct += correct
        total_examples += len(goods)
        running = total_correct / total_examples
        print(f"[{i+1:02d}/{len(BLIMP_TASKS)}] {task:<55} {acc*100:5.1f}%  "
              f"(avg: {running*100:.2f}%)")

    if total_examples == 0:
        raise SystemExit("[blimp] no examples matched any BLiMP subtask; nothing to score")

    final = total_correct / total_examples
    print(f"\n{'='*60}")
    print(f"  BLiMP average: {final*100:.2f}%   (random baseline: 50%)")
    print(f"{'='*60}")

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump({"tasks": results, "average": final}, f, indent=2)
    print(f"\n[blimp] saved -> {args.output}")


if __name__ == "__main__":
    main()