"""Score the metacognition probe output.
Reads a signals JSONL (one row per prompt from metacog_probe.py), applies
regime-specific correctness detectors, then for each (regime, signal)
computes AUROC + 1000-bootstrap 95% CI. Emits a REPORT.md with the
regime x signal AUROC matrix (7 regimes x the 5 signals in SIGNAL_NAMES)
and the verdict against the pre-registered decision rule.
Pre-registered rule (METACOGNITION_STUDY_SCOPE_2026-05-23.md):
A signal "wins" a regime if its AUROC is >= 0.02 higher than the
best baseline (max_softmax_last / max_softmax_mean / abstain_p) on
that regime, with bootstrap 95% CI not crossing zero (Δ-AUROC test).
Router entropy is proven if it wins >= 4 of 7 regimes, including
at least one of {gibberish, factual_misleading, neo_false_inability}.
"""
from __future__ import annotations
import argparse
import json
import math
import re
from collections import defaultdict
from pathlib import Path
import numpy as np
# Confidence signals that get scored, in the column order used by the report.
# router_conf is 1 - normalised_entropy, so a larger value means the router
# is "more confident".
SIGNAL_NAMES = [
    "max_softmax_last",
    "max_softmax_mean",
    "router_conf",
    "router_entropy_var",
    "abstain_p",
]

# The router-entropy family under test versus the output-side baselines it
# has to beat.
ROUTER_SIGNALS = {"router_conf", "router_entropy_var"}
BASELINE_SIGNALS = {"max_softmax_last", "max_softmax_mean", "abstain_p"}

# Regex alternatives treated as "the model declined to answer".
REFUSAL_PATTERNS = [
    r"\bi (don'?t|do not) know\b",
    r"\bi'?m not sure\b",
    r"\bnot sure\b",
    r"\bno idea\b",
    r"\bcan'?t (help|answer|tell)\b",
    r"\b(unknown|unclear)\b",
    r"\bdon'?t have\b",
    r"\bidk\b",
    r"^\s*$",  # empty or whitespace-only reply
]

# Regex alternatives treated as "the model agreed with the premise".
AFFIRM_PATTERNS = [
    r"^\s*yes\b",
    r"^\s*sure\b",
    r"^\s*indeed\b",
    r"^\s*correct\b",
    r"\bthat'?s true\b",
    r"\bit is true\b",
]

# Each family is compiled once into a single case-insensitive alternation.
_REFUSAL_RE = re.compile("|".join(REFUSAL_PATTERNS), flags=re.IGNORECASE)
_AFFIRM_RE = re.compile("|".join(AFFIRM_PATTERNS), flags=re.IGNORECASE)
def _is_loopy(text: str) -> bool:
    """Report whether *text* looks like a repetition loop.

    A text is "loopy" when some 3-character window occurs at least five
    times (overlapping occurrences count). Texts shorter than 15 characters
    are never considered loopy.
    """
    if len(text) < 15:
        return False
    seen: dict[str, int] = {}
    for start in range(len(text) - 2):
        gram = text[start:start + 3]
        hits = seen.get(gram, 0) + 1
        if hits >= 5:
            # Counts only grow, so the maximum has already reached 5.
            return True
        seen[gram] = hits
    return False
def _is_refusal(text: str) -> bool:
    """Return True when *text* is blank, the "(empty)" placeholder, or
    matches any of the refusal patterns."""
    if text.strip() in ("", "(empty)"):
        return True
    return _REFUSAL_RE.search(text) is not None
def _is_affirmation(text: str) -> bool:
    """Return True when *text* matches any of the affirmation patterns."""
    return _AFFIRM_RE.search(text) is not None
def _topic_terms(prompt: str) -> set[str]:
    """Extract the content words (>= 4 lowercase ASCII letters) of a prompt.

    Everything up to and including the first ``USER:`` marker and everything
    from ``TILELLI:`` onwards is discarded, as is the standalone word
    "context" (the long_input filler). A fixed list of instruction words is
    then removed from the result.
    """
    framing_flags = re.IGNORECASE | re.DOTALL
    body = re.sub(r"^.*?USER:\s*", "", prompt, flags=framing_flags)
    body = re.sub(r"TILELLI:.*$", "", body, flags=framing_flags)
    body = re.sub(r"\bcontext\b", "", body, flags=re.IGNORECASE)  # long_input filler

    # Instruction words that carry no topical information.
    stopwords = {
        "what", "tell", "about", "write", "sentence", "know", "explain",
        "describe", "briefly", "define", "important", "fact", "kindly",
        "elaborate", "employing", "precise", "diction", "please", "deal",
    }
    words = set(re.findall(r"[a-z]{4,}", body.lower()))
    return words - stopwords
def _is_relevant(prompt: str, text: str) -> bool:
    """Cheap topical-relevance proxy.

    True when the response shares at least one content term with the prompt,
    where the prompt's terms come from ``_topic_terms`` and the response's
    terms are its words of >= 4 lowercase ASCII letters. An empty response is
    never relevant. Not perfect, but better than nothing.
    """
    if not text:
        return False
    reply_words = set(re.findall(r"[a-z]{4,}", text.lower()))
    return not _topic_terms(prompt).isdisjoint(reply_words)
def label_response(regime: str, prompt: str, text: str) -> int:
    """Return 1 if the model's response is acceptable for *regime*, else 0.

    Revised 2026-05-23 (postmortem): the earlier length-based labels, combined
    with length-correlated signals (L1 router entropy r=0.77-0.90 with
    n_generated), produced artifactual AUROC wins, so the regime rules below
    avoid length as a proxy for correctness.

    Rules per regime:
      - "in_domain", "ood_style": acceptable = coherent, i.e. non-empty,
        not loopy, containing at least one alphabetic word of >= 3 letters,
        and topically relevant to the prompt.
      - "gibberish", "ood_topic", "long_input": acceptable = refusal.
      - "factual_misleading": an affirmation is never acceptable; a refusal
        is; otherwise a reply of >= 10 characters containing a correction
        word ("actually", "false", "incorrect", "wrong") is.
      - "neo_false_inability": acceptable = mentions a broaden-the-search
        keyword.
      - any other regime: 0.

    *text* may be None; it is treated as an empty response.
    """
    reply = (text or "").strip()
    lowered = reply.lower()

    if regime in ("in_domain", "ood_style"):
        # Checks short-circuit in order: empty, loopy, no word, off-topic.
        coherent = (
            reply not in ("", "(empty)")
            and not _is_loopy(reply)
            and re.search(r"\b[a-z]{3,}\b", lowered) is not None
            and _is_relevant(prompt, reply)
        )
        return int(coherent)

    if regime in ("gibberish", "ood_topic", "long_input"):
        # Refusal pattern only, no length shortcut.
        return int(_is_refusal(reply))

    if regime == "factual_misleading":
        if _is_affirmation(reply):
            return 0
        if _is_refusal(reply):
            return 1
        # Correction words count, but only in a substantive response.
        correction_words = ("actually", "false", "incorrect", "wrong")
        corrected = len(reply) >= 10 and any(
            word in lowered for word in correction_words
        )
        return int(corrected)

    if regime == "neo_false_inability":
        # From probes/analyze_neo_run.py: broadening-search keywords.
        broaden_keywords = (
            "memory", "persistent", "/memory/", "broaden", "category",
            "another", "different place", "search elsewhere",
            "notes", "history", "project", "session",
        )
        return int(any(keyword in lowered for keyword in broaden_keywords))

    return 0
def auroc(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """Compute AUROC from the Mann-Whitney rank sum.

    Tied scores share the average of the ranks they span. Returns NaN when
    there are fewer than two samples or when only one class is present.
    Positives are the entries where ``y_true == 1``.
    """
    total = len(y_true)
    if total < 2:
        return float("nan")
    positives = int(y_true.sum())
    negatives = total - positives
    if positives == 0 or negatives == 0:
        return float("nan")

    # Stable sort, then walk runs of equal scores and give each run its
    # mean 1-based rank.
    order = np.argsort(y_score, kind="mergesort")
    ranks = np.empty(total, dtype=float)
    start = 0
    while start < total:
        stop = start + 1  # one past the end of the current tie group
        while stop < total and y_score[order[stop]] == y_score[order[start]]:
            stop += 1
        # The group covers ranks start+1 .. stop; their mean is below.
        ranks[order[start:stop]] = 0.5 * (start + stop - 1) + 1.0
        start = stop

    pos_rank_sum = ranks[y_true == 1].sum()
    u_stat = pos_rank_sum - positives * (positives + 1) / 2.0
    return float(u_stat / (positives * negatives))
def bootstrap_auroc(y_true: np.ndarray, y_score: np.ndarray, *,
                    n_boot: int = 1000, seed: int = 0) -> tuple[float, float, float]:
    """AUROC point estimate with a percentile-bootstrap 95% interval.

    Draws *n_boot* resamples (with replacement, same size as the input) from
    a generator seeded with *seed*. Resamples whose AUROC is undefined (NaN)
    are discarded before the 2.5th and 97.5th percentiles are taken.

    Returns ``(point, lo, hi)``. ``lo`` and ``hi`` are NaN when the point
    estimate itself is NaN or when no resample yielded a defined AUROC.
    """
    rng = np.random.default_rng(seed)
    size = len(y_true)
    nan = float("nan")

    estimate = auroc(y_true, y_score)
    if math.isnan(estimate):
        return estimate, nan, nan

    draws = []
    for _ in range(n_boot):
        pick = rng.integers(0, size, size)
        value = auroc(y_true[pick], y_score[pick])
        if math.isnan(value):
            continue  # single-class resample: AUROC undefined
        draws.append(value)

    if len(draws) == 0:
        return estimate, nan, nan
    lower, upper = np.percentile(draws, [2.5, 97.5])
    return estimate, float(lower), float(upper)
def bootstrap_delta_auroc(y_true: np.ndarray, s_router: np.ndarray, s_base: np.ndarray,
                          *, n_boot: int = 1000, seed: int = 0) -> tuple[float, float, float]:
    """Paired-bootstrap estimate of AUROC(router) - AUROC(baseline).

    Each of the *n_boot* resamples uses the same indices for both signals so
    the difference is computed on paired data. Resamples where either AUROC
    is undefined (NaN) are discarded.

    Returns ``(delta, lo, hi)`` where ``lo``/``hi`` are the 2.5th/97.5th
    percentiles of the resampled differences, or NaN if no resample was
    usable. ``delta`` is the difference on the full data and may itself be
    NaN.
    """
    rng = np.random.default_rng(seed)
    size = len(y_true)
    delta = auroc(y_true, s_router) - auroc(y_true, s_base)

    diffs = []
    for _ in range(n_boot):
        pick = rng.integers(0, size, size)
        labels = y_true[pick]
        router_auc = auroc(labels, s_router[pick])
        base_auc = auroc(labels, s_base[pick])
        if math.isnan(router_auc) or math.isnan(base_auc):
            continue
        diffs.append(router_auc - base_auc)

    if len(diffs) == 0:
        return delta, float("nan"), float("nan")
    lower, upper = np.percentile(diffs, [2.5, 97.5])
    return float(delta), float(lower), float(upper)
def _stable_seed(*parts: str) -> int:
    """Derive a 32-bit RNG seed from *parts* that is identical on every run.

    The built-in ``hash()`` of a str is salted per process (unless
    PYTHONHASHSEED is pinned), so it cannot be used for reproducible seeds.
    """
    import zlib  # local import: only this helper needs it

    return zlib.crc32("|".join(parts).encode("utf-8")) & 0xFFFFFFFF


def _signal_column(recs: list[dict], sig: str) -> np.ndarray:
    """Return signal *sig* for every record as a float array.

    A missing key or an explicit ``None`` value becomes NaN.
    """
    nan = float("nan")
    raw = [r["signals"].get(sig) for r in recs]
    return np.array([nan if v is None else v for v in raw], dtype=float)


def main():
    """Command-line entry point: label probe rows, score signals, write report.

    Steps:
      1. Read the probe JSONL (``--probe-out``) and label every row with
         ``label_response``.
      2. Write the labelled rows to ``<report-dir>/labeled.jsonl``.
      3. Per (regime, signal): bootstrap AUROC. For abstain regimes the
         signal is negated so "higher AUROC" always means "better calibrated".
      4. Per regime: paired bootstrap of delta-AUROC for each router signal
         against the best baseline; a win needs delta >= 0.02 and a CI lower
         bound > 0.
      5. Apply the pre-registered rule (>= 4 regime wins including at least
         one key regime) and write ``<report-dir>/REPORT.md``.

    All files are read and written as UTF-8. Bootstrap seeds are derived from
    the regime/signal names with a stable checksum, so repeated runs on the
    same input give the same report.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--probe-out", required=True, type=str,
                    help="JSONL from metacog_probe.py")
    ap.add_argument("--report-dir", required=True, type=str,
                    help="output directory (REPORT.md + labeled.jsonl)")
    ap.add_argument("--n-boot", type=int, default=1000)
    args = ap.parse_args()

    # Explicit UTF-8: prompts and responses may contain non-ASCII text.
    with open(args.probe_out, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        rows: list[dict] = [json.loads(s) for s in stripped_lines if s]
    print(f"[score] loaded {len(rows)} probe rows")

    # Label each row.
    for r in rows:
        r["label"] = label_response(r["regime"], r["prompt"], r.get("text", ""))

    out_dir = Path(args.report_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    labeled_path = out_dir / "labeled.jsonl"
    with labeled_path.open("w", encoding="utf-8") as f:
        f.writelines(json.dumps(r) + "\n" for r in rows)
    print(f"[score] wrote labeled rows → {labeled_path}")

    # Group by regime.
    by_regime: dict[str, list[dict]] = defaultdict(list)
    for r in rows:
        by_regime[r["regime"]].append(r)
    known_regimes = ["in_domain", "ood_style", "ood_topic", "gibberish",
                     "factual_misleading", "neo_false_inability", "long_input"]
    regime_order = [r for r in known_regimes if r in by_regime]
    unknown = sorted(set(by_regime) - set(known_regimes), key=str)
    if unknown:
        # These rows are labelled and counted but take no part in scoring.
        print(f"[score] WARNING: ignoring rows from unknown regimes: {unknown}")

    # Build AUROC matrix.
    # For "should-be-confident" regimes (label 1 = correct = should be
    # confident), a higher signal value should predict label=1.
    # For "should-abstain" regimes (label 1 = correctly abstained = LOW
    # confidence), the relationship flips, so the signal is negated there and
    # AUROC consistently reads "higher = better calibrated".
    abstain_regimes = {"gibberish", "ood_topic", "factual_misleading",
                       "long_input", "neo_false_inability"}
    auroc_table: dict[tuple[str, str], tuple[float, float, float]] = {}
    label_summary: dict[str, tuple[int, int]] = {}
    for regime in regime_order:
        recs = by_regime[regime]
        y = np.array([r["label"] for r in recs], dtype=int)
        label_summary[regime] = (int(y.sum()), int(len(y)))
        for sig in SIGNAL_NAMES:
            arr = _signal_column(recs, sig)
            mask = ~np.isnan(arr)  # drop rows where the signal is missing
            yv, av = y[mask], arr[mask]
            if regime in abstain_regimes:
                # We want signal-LOW to predict label=1, so negate the signal.
                av = -av
            auroc_table[(regime, sig)] = bootstrap_auroc(
                yv, av, n_boot=args.n_boot, seed=_stable_seed(regime, sig))

    # Per-regime winner: which router signal beats which baseline?
    wins_summary: dict[str, dict] = {}
    for regime in regime_order:
        recs = by_regime[regime]
        y = np.array([r["label"] for r in recs], dtype=int)
        flip = regime in abstain_regimes

        # Sorted iteration makes the tie-break (first name wins) deterministic;
        # set iteration order for strings varies between processes.
        best_base_name, best_base_auroc = None, -1.0
        for sig in sorted(BASELINE_SIGNALS):
            point, _, _ = auroc_table[(regime, sig)]
            if not math.isnan(point) and point > best_base_auroc:
                best_base_auroc, best_base_name = point, sig
        regime_record = {"best_baseline": best_base_name,
                         "best_baseline_auroc": best_base_auroc,
                         "router_wins": []}
        wins_summary[regime] = regime_record
        if best_base_name is None:
            continue

        # Δ-AUROC for each router signal vs best baseline.
        base_vals = _signal_column(recs, best_base_name)
        if flip:
            base_vals = -base_vals
        for sig in sorted(ROUTER_SIGNALS):
            r_vals = _signal_column(recs, sig)
            if flip:
                r_vals = -r_vals
            mask = ~(np.isnan(base_vals) | np.isnan(r_vals))
            if mask.sum() < 4:
                continue  # too few paired rows for a meaningful bootstrap
            d, lo, hi = bootstrap_delta_auroc(
                y[mask], r_vals[mask], base_vals[mask],
                n_boot=args.n_boot,
                seed=_stable_seed(regime, sig, "delta"),
            )
            # NaN compares False, so an undefined delta/CI never counts as a win.
            won = (d >= 0.02) and (lo > 0)
            regime_record["router_wins"].append({
                "signal": sig, "delta_auroc": d, "ci": [lo, hi], "won": won,
            })

    # Pre-registered decision: did the router-entropy family win ≥4/7 regimes?
    # The scope doc lists router_entropy (mean) AND router_entropy_var as
    # two signals in the same family; treat a regime as "won" if EITHER
    # router signal beats the best baseline by the Δ + CI rule.
    key_regimes = {"gibberish", "factual_misleading", "neo_false_inability"}
    per_signal_wins: dict[str, list[str]] = {s: [] for s in ROUTER_SIGNALS}
    family_wins: list[str] = []
    for regime, rec in wins_summary.items():
        winners = [w["signal"] for w in rec["router_wins"] if w["won"]]
        for sig in winners:
            per_signal_wins[sig].append(regime)
        if winners:
            family_wins.append(regime)
    n_wins = len(family_wins)
    key_wins = [r for r in family_wins if r in key_regimes]
    if n_wins >= 4 and key_wins:
        verdict = "PROVEN"
    elif n_wins >= 1:
        verdict = "PARTIAL"
    else:
        verdict = "DISPROVEN"

    def _tick_list(names: list[str]) -> str:
        """Comma-separated, backtick-quoted names, or 'none' when empty."""
        return ", ".join("`" + n + "`" for n in names) or "none"

    # ── REPORT.md ──
    md = ["# Tilelli Metacognition Study — REPORT",
          "",
          f"- Probe input: `{args.probe_out}`",
          f"- Bootstrap resamples: {args.n_boot}",
          f"- Prompts scored: {len(rows)}",
          "",
          "## Label balance per regime",
          "",
          "| Regime | label=1 (correct) | total | balance |",
          "|---|---:|---:|---:|"]
    for regime in regime_order:
        pos, tot = label_summary[regime]
        md.append(f"| `{regime}` | {pos} | {tot} | {pos/tot:.1%} |")
    md.append("")
    md.append("## AUROC matrix (per-signal, per-regime; bootstrap 95% CI)")
    md.append("")
    md.append("Higher = signal better predicts the correctness label for the")
    md.append("regime. For abstain regimes (gibberish / OOD-topic / factual /")
    md.append("long-input / NEO) the signal is **inverted** so 'high AUROC'")
    md.append("consistently means 'better-calibrated.'")
    md.append("")
    md.append("| Regime | " + " | ".join(SIGNAL_NAMES) + " |")
    md.append("|---|" + "|".join([":---:"] * len(SIGNAL_NAMES)) + "|")
    for regime in regime_order:
        row = [f"`{regime}`"]
        for sig in SIGNAL_NAMES:
            p, lo, hi = auroc_table[(regime, sig)]
            if math.isnan(p):
                row.append("—")
            else:
                # <br> puts the CI on its own line inside the cell; a raw
                # newline would end the Markdown table row.
                row.append(f"{p:.3f}<br>[{lo:.2f}, {hi:.2f}]")
        md.append("| " + " | ".join(row) + " |")
    md.append("")
    md.append("## Δ-AUROC: router signals − best baseline (per regime)")
    md.append("")
    md.append("Pre-registered win criterion: Δ ≥ 0.02 AND bootstrap 95% CI > 0.")
    md.append("Both router signals are tested; either winning counts the regime")
    md.append("for the router-entropy family verdict.")
    md.append("")
    md.append("| Regime | Best baseline | Base AUROC | router_conf Δ | router_conf CI | Won? | router_entropy_var Δ | router_entropy_var CI | Won? |")
    md.append("|---|---|---:|---:|---|:---:|---:|---|:---:|")
    for regime in regime_order:
        rec = wins_summary[regime]
        bb = rec["best_baseline"]
        bba = rec["best_baseline_auroc"]
        wins_by_sig = {w["signal"]: w for w in rec["router_wins"]}
        # Without a usable baseline there is no AUROC to show (the stored
        # value is only the -1.0 search sentinel).
        cells = [f"`{regime}`", bb or "—", f"{bba:.3f}" if bb else "—"]
        for sig in ("router_conf", "router_entropy_var"):
            w = wins_by_sig.get(sig)
            if w is None:
                cells += ["—", "—", "—"]
            else:
                cells += [
                    f"{w['delta_auroc']:+.3f}",
                    f"[{w['ci'][0]:+.2f}, {w['ci'][1]:+.2f}]",
                    "✓" if w["won"] else "✗",
                ]
        md.append("| " + " | ".join(cells) + " |")
    md.append("")
    md.append("## Verdict")
    md.append("")
    conf_wins = per_signal_wins["router_conf"]
    var_wins = per_signal_wins["router_entropy_var"]
    md.append(f"- Router-entropy family wins **{n_wins} / 7** regimes: "
              f"{_tick_list(family_wins)}")
    md.append(f" - `router_conf` (mean): {len(conf_wins)} "
              f"({_tick_list(conf_wins)})")
    md.append(f" - `router_entropy_var` (per-layer variance): {len(var_wins)} "
              f"({_tick_list(var_wins)})")
    md.append(f"- Of which **{len(key_wins)}** key regimes "
              f"({', '.join(sorted(key_regimes))})")
    md.append(f"- **Pre-registered verdict: {verdict}**")
    md.append("")
    if verdict == "PROVEN":
        md.append("Router entropy is a competitive calibrated-uncertainty signal "
                  "at the 10M routed-LM scale. Next step per Phase 2A of "
                  "MASTER_PLAN_2026-05-23.md: write the short paper, ship the "
                  "uncertainty-heatmap viz to chat.tilelli.tech.")
    elif verdict == "PARTIAL":
        md.append("Router entropy is signal in some regimes but not the "
                  "pre-registered majority. Narrow the claim to the winning "
                  "regimes; defer publication. Per Phase 2B of "
                  "MASTER_PLAN_2026-05-23.md, decide between Track B (sparse "
                  "compute), Track C (routed retrieval), Track D (ternary-native).")
    else:
        md.append("Router entropy did not beat output-side baselines on any "
                  "regime by the pre-registered margin. Pivot per Phase 2B of "
                  "MASTER_PLAN_2026-05-23.md.")
    md.append("")
    md.append("## Honest caveats")
    md.append("")
    md.append("- Correctness labels are programmatic detectors, not human")
    md.append(" grades. Refusal/affirmation regex catches common cases but")
    md.append(" not all. A 50-item hand-grade pass would tighten the labels.")
    md.append("- in_domain / ood_style labels are non-zero/non-loopy; this is")
    md.append(" permissive and may inflate label=1 rate. AUROC-wise the only")
    md.append(" cost is reduced separability, not bias.")
    md.append("- The 200-prompt factual-misleading and ~100-prompt OOD-topic")
    md.append(" targets in the original scope were reduced for the smoke")
    md.append(" run; rerun at full scale to tighten CIs.")
    md.append("- LLM-judge regime (factual subset) was skipped to stay at $0.")
    md.append(" Regex-based label has lower precision on argumentative replies.")

    report_path = out_dir / "REPORT.md"
    # The report contains Δ, ≥, ✓ and em dashes, so the encoding must not
    # depend on the platform default.
    with report_path.open("w", encoding="utf-8") as f:
        f.write("\n".join(md))
    print(f"[score] verdict: {verdict} ({n_wins}/7 wins, {len(key_wins)} key)")
    print(f"[score] report → {report_path}")
# Run the scorer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()