"""Memorization probe (W4.6): can a web-trained model complete benchmark rows verbatim?

Legacy-public benchmarks (hospital et al., GitHub since 2019) sit inside every base
model's training window; a HIGH verbatim-completion rate red-flags memorized gold.
A low rate does not prove absence — the contamination statement stays assumption-based.
Control: a date-stamped post-cutoff wild harvest (expected ~0).

    uv run python -m eval.contamination_probe
"""
from __future__ import annotations

import json
import random
import subprocess
from pathlib import Path

import pandas as pd

# Directory one level above the folder containing this file
# (presumably the project root — data/ and eval/ paths are built from it).
ROOT = Path(__file__).resolve().parent.parent

# Probe protocol knobs.
N_ROWS = 30        # upper bound on rows sampled per table
N_GIVEN = 5        # leading columns revealed to the model in the prompt
MODEL = "glm-5.1"  # model name passed to the `oll` CLI via --model


def probe(df: pd.DataFrame, name: str) -> dict:
    """Estimate how often the model reproduces held-back cells of ``df`` verbatim.

    A fixed-seed sample of up to ``N_ROWS`` rows is drawn. For each sampled row
    the first ``N_GIVEN`` columns are shown to the model through the ``oll``
    CLI and the next (up to) four columns are requested. A requested cell
    counts as a hit when its stripped, lower-cased gold value occurs as a
    substring of the lower-cased model output.

    Args:
        df: Table to probe; cell values are stringified before comparison.
        name: Dataset label interpolated into the prompt and echoed in the result.

    Returns:
        Dict with keys ``table``, ``rows`` (rows sampled), ``cells_asked``
        (cells scored), ``verbatim_hits``, ``rate`` (hits / cells scored,
        rounded to 4 places; 0.0 when nothing was scored) and ``failed_calls``
        (model invocations that exited non-zero and were left unscored).

    Raises:
        subprocess.TimeoutExpired: If a single model call exceeds 120 seconds.
    """
    rng = random.Random(0)  # fixed seed: the same rows are probed on every run
    rows = rng.sample(range(len(df)), min(N_ROWS, len(df)))
    cols = list(df.columns)
    given, asked = cols[:N_GIVEN], cols[N_GIVEN:N_GIVEN + 4]
    # The requested answer format is the same for every row, so build it once.
    answer_format = "; ".join(f"{c}=<value>" for c in asked)
    hits = total = failed_calls = 0
    # With no columns left to ask about there is nothing to score, so do not
    # spend model calls on prompts that request no fields.
    rows_to_query = rows if asked else []
    for r in rows_to_query:
        row = df.iloc[r]  # materialize the row once instead of once per cell
        prompt = (f"This is a row from the well-known public dataset '{name}'. "
                  f"Complete the remaining fields EXACTLY as they appear in the dataset. "
                  f"Known fields: "
                  + "; ".join(f"{c}={row[c]}" for c in given)
                  + ". Respond ONLY with: " + answer_format)
        result = subprocess.run(["oll", prompt, "--model", MODEL, "--max-tokens", "200"],
                                capture_output=True, text=True, timeout=120)
        if result.returncode != 0:
            # A failed invocation says nothing about memorization; scoring its
            # output as misses would silently deflate the rate. Leave the row
            # unscored and surface the failure in the result instead.
            failed_calls += 1
            continue
        out = result.stdout.lower()
        for c in asked:
            # Every requested cell stays in the denominator, including
            # missing ("nan"/empty) gold values that can never be a hit.
            total += 1
            v = str(row[c]).strip().lower()
            if v not in ("", "nan") and v in out:
                hits += 1
    return {"table": name, "rows": len(rows), "cells_asked": total,
            "verbatim_hits": hits, "rate": round(hits / max(total, 1), 4),
            "failed_calls": failed_calls}


def main() -> None:
    """Probe the hospital benchmark and the wild control table, then save and print results.

    Reads both CSVs under ``ROOT / "data"`` as all-string frames, runs ``probe``
    on each, writes the combined report to
    ``ROOT / "eval" / "results" / "contamination_probe.json"`` and prints the
    per-table results to stdout.
    """
    hosp = pd.read_csv(ROOT / "data" / "real" / "hospital" / "clean.csv").astype(str)
    wild = pd.read_csv(ROOT / "data" / "wild" / "glassdoor_jobs.csv").astype(str)
    res = {"model": MODEL, "protocol": f"{N_ROWS} rows, {N_GIVEN} given cols, 4 asked cols, exact-substring match",
           "probes": [probe(hosp, "hospital (Raha benchmark)"),
                      probe(wild, "glassdoor_jobs (post-cutoff wild harvest)")]}
    out_path = ROOT / "eval" / "results" / "contamination_probe.json"
    # Create the results directory if needed so the run does not fail only
    # after all model calls have already been made.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Context manager guarantees the report is flushed and the handle closed.
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(res, fh, indent=1)
    print(json.dumps(res["probes"], indent=1))


# Run the probe only when this file is executed as a script or via `python -m`,
# not when it is imported.
if __name__ == "__main__":
    main()