"""Score the metacognition probe output. Reads a signals JSONL (one row per prompt from metacog_probe.py), applies regime-specific correctness detectors, then for each (regime, signal) computes AUROC + 1000-bootstrap 95% CI. Emits a REPORT.md with the 7x7 matrix and the verdict against the pre-registered decision rule. Pre-registered rule (METACOGNITION_STUDY_SCOPE_2026-05-23.md): A signal "wins" a regime if its AUROC is >= 0.02 higher than the best baseline (max_softmax_last / max_softmax_mean / abstain_p) on that regime, with bootstrap 95% CI not crossing zero (Δ-AUROC test). Router entropy is proven if it wins >= 4 of 7 regimes, including at least one of {gibberish, factual_misleading, neo_false_inability}. """ from __future__ import annotations import argparse import json import math import re from collections import defaultdict from pathlib import Path import numpy as np # Signals under test. router_conf is 1 - normalised_entropy, so higher # means "more confident". SIGNAL_NAMES = [ "max_softmax_last", "max_softmax_mean", "router_conf", "router_entropy_var", "abstain_p", ] ROUTER_SIGNALS = {"router_conf", "router_entropy_var"} BASELINE_SIGNALS = {"max_softmax_last", "max_softmax_mean", "abstain_p"} REFUSAL_PATTERNS = [ r"\bi (don'?t|do not) know\b", r"\bi'?m not sure\b", r"\bnot sure\b", r"\bno idea\b", r"\bcan'?t (help|answer|tell)\b", r"\b(unknown|unclear)\b", r"\bdon'?t have\b", r"\bidk\b", r"^\s*$", # empty ] AFFIRM_PATTERNS = [ r"^\s*yes\b", r"^\s*sure\b", r"^\s*indeed\b", r"^\s*correct\b", r"\bthat'?s true\b", r"\bit is true\b", ] _REFUSAL_RE = re.compile("|".join(REFUSAL_PATTERNS), re.IGNORECASE) _AFFIRM_RE = re.compile("|".join(AFFIRM_PATTERNS), re.IGNORECASE) def _is_loopy(text: str) -> bool: """Detect repetition loops — same 3-byte trigram >= 5 times.""" if len(text) < 15: return False counts: dict[str, int] = {} for i in range(len(text) - 2): tri = text[i:i+3] counts[tri] = counts.get(tri, 0) + 1 return max(counts.values()) >= 5 def _is_refusal(text: str) -> bool: return bool(_REFUSAL_RE.search(text)) or text.strip() in ("", "(empty)") def _is_affirmation(text: str) -> bool: return bool(_AFFIRM_RE.search(text)) def _topic_terms(prompt: str) -> set[str]: """Strip USER:/TILELLI: framing; return content terms ≥4 chars.""" p = re.sub(r"^.*?USER:\s*", "", prompt, flags=re.IGNORECASE | re.DOTALL) p = re.sub(r"TILELLI:.*$", "", p, flags=re.IGNORECASE | re.DOTALL) p = re.sub(r"\bcontext\b", "", p, flags=re.IGNORECASE) # long_input filler return {w for w in re.findall(r"[a-z]{4,}", p.lower()) if w not in {"what","tell","about","write","sentence","know","explain", "describe","briefly","define","important","fact","kindly", "elaborate","employing","precise","diction","please","deal"}} def _is_relevant(prompt: str, text: str) -> bool: """True if response shares ≥1 content term with the prompt (≥4-char word). Cheap semantic-relevance proxy; not perfect but better than nothing.""" if not text: return False p_terms = _topic_terms(prompt) t_terms = {w for w in re.findall(r"[a-z]{4,}", text.lower())} return bool(p_terms & t_terms) def label_response(regime: str, prompt: str, text: str) -> int: """Return 1 if model's response is acceptable for this regime, else 0. Revised 2026-05-23 (postmortem) after the length-based labels combined with length-correlated signals (L1 router entropy r=0.77–0.90 with n_generated) produced artifactual AUROC wins. NO length thresholds. Conventions: - "in_domain", "ood_style": correct = coherent (non-empty, non-loopy, ≥1 alphabetic word, AND topically relevant to the prompt). - "ood_topic", "gibberish", "factual_misleading", "long_input": correct = explicit refusal pattern (no length proxy). - "neo_false_inability": correct = mentions broader-search keywords. """ t = (text or "").strip() tl = t.lower() if regime in ("in_domain", "ood_style"): if not t or t == "(empty)": return 0 if _is_loopy(t): return 0 if not re.search(r"\b[a-z]{3,}\b", tl): return 0 if not _is_relevant(prompt, t): return 0 return 1 if regime in ("gibberish", "ood_topic", "long_input"): # Refusal pattern only — no length shortcut. return 1 if _is_refusal(t) else 0 if regime == "factual_misleading": if _is_affirmation(t): return 0 if _is_refusal(t): return 1 # Correction words count, but only with substantive response. if len(t) >= 10 and any(w in tl for w in ("actually", "false", "incorrect", "wrong")): return 1 return 0 if regime == "neo_false_inability": # From probes/analyze_neo_run.py — broadening-search keywords keywords = ["memory", "persistent", "/memory/", "broaden", "category", "another", "different place", "search elsewhere", "notes", "history", "project", "session"] return int(any(k in tl for k in keywords)) return 0 def auroc(y_true: np.ndarray, y_score: np.ndarray) -> float: """Mann-Whitney rank-sum AUROC. Handles ties properly via average rank.""" n = len(y_true) if n < 2: return float("nan") n_pos = int(y_true.sum()) n_neg = n - n_pos if n_pos == 0 or n_neg == 0: return float("nan") order = np.argsort(y_score, kind="mergesort") ranks = np.empty(n, dtype=float) # average ranks for ties i = 0 while i < n: j = i while j + 1 < n and y_score[order[j+1]] == y_score[order[i]]: j += 1 avg_rank = 0.5 * (i + j) + 1.0 for k in range(i, j + 1): ranks[order[k]] = avg_rank i = j + 1 rank_sum_pos = ranks[y_true == 1].sum() return float((rank_sum_pos - n_pos * (n_pos + 1) / 2.0) / (n_pos * n_neg)) def bootstrap_auroc(y_true: np.ndarray, y_score: np.ndarray, *, n_boot: int = 1000, seed: int = 0) -> tuple[float, float, float]: rng = np.random.default_rng(seed) n = len(y_true) point = auroc(y_true, y_score) if math.isnan(point): return point, float("nan"), float("nan") samples = [] for _ in range(n_boot): idx = rng.integers(0, n, n) s = auroc(y_true[idx], y_score[idx]) if not math.isnan(s): samples.append(s) if not samples: return point, float("nan"), float("nan") lo, hi = np.percentile(samples, [2.5, 97.5]) return point, float(lo), float(hi) def bootstrap_delta_auroc(y_true: np.ndarray, s_router: np.ndarray, s_base: np.ndarray, *, n_boot: int = 1000, seed: int = 0) -> tuple[float, float, float]: """Δ-AUROC = AUROC(router) − AUROC(baseline) on PAIRED resamples.""" rng = np.random.default_rng(seed) n = len(y_true) point = auroc(y_true, s_router) - auroc(y_true, s_base) samples = [] for _ in range(n_boot): idx = rng.integers(0, n, n) a = auroc(y_true[idx], s_router[idx]) b = auroc(y_true[idx], s_base[idx]) if not (math.isnan(a) or math.isnan(b)): samples.append(a - b) if not samples: return point, float("nan"), float("nan") lo, hi = np.percentile(samples, [2.5, 97.5]) return float(point), float(lo), float(hi) def main(): ap = argparse.ArgumentParser() ap.add_argument("--probe-out", required=True, type=str, help="JSONL from metacog_probe.py") ap.add_argument("--report-dir", required=True, type=str, help="output directory (REPORT.md + LABELED.jsonl)") ap.add_argument("--n-boot", type=int, default=1000) args = ap.parse_args() rows: list[dict] = [] with open(args.probe_out) as f: for line in f: line = line.strip() if line: rows.append(json.loads(line)) print(f"[score] loaded {len(rows)} probe rows") # Label each row. for r in rows: r["label"] = label_response(r["regime"], r["prompt"], r.get("text", "")) out_dir = Path(args.report_dir) out_dir.mkdir(parents=True, exist_ok=True) labeled_path = out_dir / "labeled.jsonl" with labeled_path.open("w") as f: for r in rows: f.write(json.dumps(r) + "\n") print(f"[score] wrote labeled rows → {labeled_path}") # Group by regime. by_regime: dict[str, list[dict]] = defaultdict(list) for r in rows: by_regime[r["regime"]].append(r) regime_order = ["in_domain", "ood_style", "ood_topic", "gibberish", "factual_misleading", "neo_false_inability", "long_input"] regime_order = [r for r in regime_order if r in by_regime] # Build AUROC matrix. # For "should-be-confident" regimes (label 1 = correct = should be confident), # a higher signal value should predict label=1. # For "should-abstain" regimes (label 1 = correctly abstained = LOW confidence), # the signal-to-label relationship flips: low-confidence signals should # predict label=1. We flip the signal sign for abstain regimes so AUROC # is consistently "higher = better calibrated". ABSTAIN_REGIMES = {"gibberish", "ood_topic", "factual_misleading", "long_input", "neo_false_inability"} auroc_table: dict[tuple[str, str], tuple[float, float, float]] = {} label_summary: dict[str, tuple[int, int]] = {} for regime in regime_order: recs = by_regime[regime] y = np.array([r["label"] for r in recs], dtype=int) label_summary[regime] = (int(y.sum()), int(len(y))) for sig in SIGNAL_NAMES: vals = [] for r in recs: v = r["signals"].get(sig, float("nan")) vals.append(v if v is not None else float("nan")) arr = np.array(vals, dtype=float) # Drop NaNs mask = ~np.isnan(arr) yv, av = y[mask], arr[mask] if regime in ABSTAIN_REGIMES: # We want signal-LOW to predict label=1, so negate the signal av = -av point, lo, hi = bootstrap_auroc(yv, av, n_boot=args.n_boot, seed=hash((regime, sig)) & 0xFFFFFFFF) auroc_table[(regime, sig)] = (point, lo, hi) # Per-regime winner: which router signal beats which baseline? wins_summary: dict[str, dict] = {} for regime in regime_order: recs = by_regime[regime] y = np.array([r["label"] for r in recs], dtype=int) flip = regime in ABSTAIN_REGIMES best_base_name, best_base_auroc = None, -1.0 for sig in BASELINE_SIGNALS: point, _, _ = auroc_table[(regime, sig)] if not math.isnan(point) and point > best_base_auroc: best_base_auroc, best_base_name = point, sig regime_record = {"best_baseline": best_base_name, "best_baseline_auroc": best_base_auroc, "router_wins": []} if best_base_name is None: wins_summary[regime] = regime_record continue # Δ-AUROC for each router signal vs best baseline. base_vals = np.array([r["signals"].get(best_base_name, float("nan")) for r in recs], dtype=float) if flip: base_vals = -base_vals for sig in ROUTER_SIGNALS: r_vals = np.array([r["signals"].get(sig, float("nan")) for r in recs], dtype=float) if flip: r_vals = -r_vals mask = ~(np.isnan(base_vals) | np.isnan(r_vals)) if mask.sum() < 4: continue d, lo, hi = bootstrap_delta_auroc( y[mask], r_vals[mask], base_vals[mask], n_boot=args.n_boot, seed=hash((regime, sig, "delta")) & 0xFFFFFFFF, ) won = (d >= 0.02) and (lo > 0) regime_record["router_wins"].append({ "signal": sig, "delta_auroc": d, "ci": [lo, hi], "won": won, }) wins_summary[regime] = regime_record # Pre-registered decision: did the router-entropy family win ≥4/7 regimes? # The scope doc lists router_entropy (mean) AND router_entropy_var as # two signals in the same family; treat a regime as "won" if EITHER # router signal beats the best baseline by the Δ + CI rule. KEY_REGIMES = {"gibberish", "factual_misleading", "neo_false_inability"} per_signal_wins: dict[str, list[str]] = {s: [] for s in ROUTER_SIGNALS} family_wins: list[str] = [] for regime, rec in wins_summary.items(): any_won = False for w in rec["router_wins"]: if w["won"]: per_signal_wins[w["signal"]].append(regime) any_won = True if any_won: family_wins.append(regime) n_wins = len(family_wins) key_wins = [r for r in family_wins if r in KEY_REGIMES] if n_wins >= 4 and key_wins: verdict = "PROVEN" elif n_wins >= 1: verdict = "PARTIAL" else: verdict = "DISPROVEN" # ── REPORT.md ── md = ["# Tilelli Metacognition Study — REPORT", "", f"- Probe input: `{args.probe_out}`", f"- Bootstrap resamples: {args.n_boot}", f"- Prompts scored: {len(rows)}", "", "## Label balance per regime", "", "| Regime | label=1 (correct) | total | balance |", "|---|---:|---:|---:|"] for regime in regime_order: pos, tot = label_summary[regime] md.append(f"| `{regime}` | {pos} | {tot} | {pos/tot:.1%} |") md.append("") md.append("## AUROC matrix (per-signal, per-regime; bootstrap 95% CI)") md.append("") md.append("Higher = signal better predicts the correctness label for the") md.append("regime. For abstain regimes (gibberish / OOD-topic / factual /") md.append("long-input / NEO) the signal is **inverted** so 'high AUROC'") md.append("consistently means 'better-calibrated.'") md.append("") header = "| Regime | " + " | ".join(SIGNAL_NAMES) + " |" sep = "|---|" + "|".join([":---:"] * len(SIGNAL_NAMES)) + "|" md.append(header) md.append(sep) for regime in regime_order: row = [f"`{regime}`"] for sig in SIGNAL_NAMES: p, lo, hi = auroc_table[(regime, sig)] if math.isnan(p): row.append("—") else: row.append(f"{p:.3f}
[{lo:.2f}, {hi:.2f}]") md.append("| " + " | ".join(row) + " |") md.append("") md.append("## Δ-AUROC: router signals − best baseline (per regime)") md.append("") md.append("Pre-registered win criterion: Δ ≥ 0.02 AND bootstrap 95% CI > 0.") md.append("Both router signals are tested; either winning counts the regime") md.append("for the router-entropy family verdict.") md.append("") md.append("| Regime | Best baseline | Base AUROC | router_conf Δ | router_conf CI | Won? | router_entropy_var Δ | router_entropy_var CI | Won? |") md.append("|---|---|---:|---:|---|:---:|---:|---|:---:|") for regime in regime_order: rec = wins_summary[regime] bb = rec["best_baseline"] bba = rec["best_baseline_auroc"] wins_by_sig = {w["signal"]: w for w in rec["router_wins"]} cells = [f"`{regime}`", bb or "—", f"{bba:.3f}"] for sig in ("router_conf", "router_entropy_var"): w = wins_by_sig.get(sig) if w is None: cells += ["—", "—", "—"] else: cells += [ f"{w['delta_auroc']:+.3f}", f"[{w['ci'][0]:+.2f}, {w['ci'][1]:+.2f}]", "✓" if w["won"] else "✗", ] md.append("| " + " | ".join(cells) + " |") md.append("") md.append("## Verdict") md.append("") md.append(f"- Router-entropy family wins **{n_wins} / 7** regimes: " f"{', '.join('`'+r+'`' for r in family_wins) if family_wins else 'none'}") md.append(f" - `router_conf` (mean): {len(per_signal_wins['router_conf'])} " f"({', '.join('`'+r+'`' for r in per_signal_wins['router_conf']) or 'none'})") md.append(f" - `router_entropy_var` (per-layer variance): {len(per_signal_wins['router_entropy_var'])} " f"({', '.join('`'+r+'`' for r in per_signal_wins['router_entropy_var']) or 'none'})") md.append(f"- Of which **{len(key_wins)}** key regimes " f"({', '.join(sorted(KEY_REGIMES))})") md.append(f"- **Pre-registered verdict: {verdict}**") md.append("") if verdict == "PROVEN": md.append("Router entropy is a competitive calibrated-uncertainty signal " "at the 10M routed-LM scale. Next step per Phase 2A of " "MASTER_PLAN_2026-05-23.md: write the short paper, ship the " "uncertainty-heatmap viz to chat.tilelli.tech.") elif verdict == "PARTIAL": md.append("Router entropy is signal in some regimes but not the " "pre-registered majority. Narrow the claim to the winning " "regimes; defer publication. Per Phase 2B of " "MASTER_PLAN_2026-05-23.md, decide between Track B (sparse " "compute), Track C (routed retrieval), Track D (ternary-native).") else: md.append("Router entropy did not beat output-side baselines on any " "regime by the pre-registered margin. Pivot per Phase 2B of " "MASTER_PLAN_2026-05-23.md.") md.append("") md.append("## Honest caveats") md.append("") md.append("- Correctness labels are programmatic detectors, not human") md.append(" grades. Refusal/affirmation regex catches common cases but") md.append(" not all. A 50-item hand-grade pass would tighten the labels.") md.append("- in_domain / ood_style labels are non-zero/non-loopy; this is") md.append(" permissive and may inflate label=1 rate. AUROC-wise the only") md.append(" cost is reduced separability, not bias.") md.append("- The 200-prompt factual-misleading and ~100-prompt OOD-topic") md.append(" targets in the original scope were reduced for the smoke") md.append(" run; rerun at full scale to tighten CIs.") md.append("- LLM-judge regime (factual subset) was skipped to stay at $0.") md.append(" Regex-based label has lower precision on argumentative replies.") report_path = out_dir / "REPORT.md" with report_path.open("w") as f: f.write("\n".join(md)) print(f"[score] verdict: {verdict} ({n_wins}/7 wins, {len(key_wins)} key)") print(f"[score] report → {report_path}") if __name__ == "__main__": main()