"""Cascade runner: measure base-8B per axis under different prompt/inference strategies, compare to baseline, apply the decision gate. Reuses eval.kappa scoring; the 8B cache makes unchanged prompts free. Usage: python -m eval.cascade baseline v2 # measure these versions, print before/after κ table Versions are registered in VERSIONS (a builders module per key). Results persist to eval/_cache/cascade_results.json so later steps accumulate. """ from __future__ import annotations import importlib import json import os import sys import time from eval import kappa as K from prompt_card.scoring import observable_axes as OA RESULTS = os.path.join(os.path.dirname(__file__), "_cache", "cascade_results.json") VERSIONS = {"baseline": OA, "v2": "eval.prompts_v2", "v3": "eval.prompts_v3", "v4": "eval.prompts_v4"} def _builders(v): b = VERSIONS[v] return importlib.import_module(b) if isinstance(b, str) else b def measure(builders, gt, convs, embedder, client): """Return per-axis headline κ + detail. Cache-served calls are free.""" prompts, plan, geom = K.build_prompts(gt, convs, embedder, builders=builders) pi = {} for it in plan: pi.setdefault(it[0], []).append(it) n_before = client.misses responses = client.run_all(prompts) new_calls = client.misses - n_before out = {"_new_calls": new_calls} # technique / input_quality (per-category + axis-level "any") for axis, fields in (("technique", K.TECH), ("input_quality", K.IQ)): per, fail = K.score_binary_axis(gt, responses, pi, axis, fields) cats = {f: K.cohen_kappa(*per[f]) for f in fields} n = len(per[fields[0]][0]) anyt = [int(any(per[f][0][j] for f in fields)) for j in range(n)] anyp = [int(any(per[f][1][j] for f in fields)) for j in range(n)] feat_ks = [cats[f] for f in fields if sum(per[f][0]) > 0] # only categories with positives headline = (sum(feat_ks) / len(feat_ks)) if feat_ks else None out[axis] = {"headline": headline, "axis_any": K.cohen_kappa(anyt, anyp), "cats": {f: (cats[f], K.binary_counts(*per[f]), sum(per[f][0])) for f in fields}, "parse_fail": fail} # interaction yt, yp, fail = K.score_interaction(gt, responses, pi) out["interaction"] = {"headline": K.cohen_kappa(yt, yp), "counts": K.binary_counts(yt, yp), "pos": sum(yt), "n": len(yt), "parse_fail": fail} # focus (sweep best T) best = None for T in [0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70]: fyt, fyp, info = K.score_focus(gt, responses, pi, geom, T) k = K.cohen_kappa(fyt, fyp) cand = (k if k is not None else -9, info["recall"] or 0) if best is None or cand > best[0]: best = (cand, T, k, info["recall"]) out["focus"] = {"headline": best[2], "T": best[1], "recall": best[3]} return out def gate(k): if k is None: return "N/A" if k >= 0.6: return "SOLID" if k >= 0.4: return "OK" if k >= 0.2: return "try-next" return "must-next" def main(version_keys): base_url = os.environ.get("OPENBMB_BASE_URL"); token = os.environ.get("OPENBMB_TOKEN") if not base_url or not token: print("ERROR: set OPENBMB_BASE_URL and OPENBMB_TOKEN", file=sys.stderr); sys.exit(2) from prompt_card.llm.minicpm import MiniCPMClient gt = K.load_gt(); convs = K.load_convs(); embedder = K.FastEmbedder() client = K.CachedClient(MiniCPMClient(base_url, token), workers=8) prior = {} if os.path.exists(RESULTS): prior = json.load(open(RESULTS)) results = dict(prior) for v in version_keys: t0 = time.time() print(f"\n=== measuring '{v}' ===", flush=True) r = measure(_builders(v), gt, convs, embedder, client) r["_secs"] = round(time.time() - t0, 1) results[v] = r print(f" new 8B calls: {r['_new_calls']} · {r['_secs']}s", flush=True) json.dump(results, open(RESULTS, "w"), indent=1, default=str) axes = ["technique", "input_quality", "interaction", "focus"] print("\n================ κ comparison (headline per axis) ================") head = "axis".ljust(16) + "".join(v.ljust(12) for v in version_keys) + "gate(last)" print(head) for ax in axes: row = ax.ljust(16) last = None for v in version_keys: k = results[v][ax]["headline"]; last = k row += (f"{k:+.3f}" if k is not None else "N/A").ljust(12) print(row + gate(last)) # per-category technique/IQ detail for the last version last = version_keys[-1] print(f"\n--- per-category detail ({last}) ---") for ax in ("technique", "input_quality"): for f, (k, c, npos) in results[last][ax]["cats"].items(): ks = f"{k:+.3f}" if k is not None else "N/A" print(f" {ax[:4]}.{f:22} κ={ks} pos={npos} [TN {c['tn']} FP {c['fp']} FN {c['fn']} TP {c['tp']}]") fi = results[last]["interaction"]; c = fi["counts"] print(f" interaction.refinement κ={fi['headline']:+.3f} pos={fi['pos']}/{fi['n']} " f"[TN {c['tn']} FP {c['fp']} FN {c['fn']} TP {c['tp']}]") ff = results[last]["focus"] print(f" focus.topic_shift κ={ff['headline']:+.3f} T={ff['T']} recall={ff['recall']:.2f}") if __name__ == "__main__": keys = sys.argv[1:] or ["baseline", "v2"] main(keys)