Spaces:
Runtime error
Runtime error
| """Cascade runner: measure base-8B per axis under different prompt/inference strategies, compare to | |
| baseline, apply the decision gate. Reuses eval.kappa scoring; the 8B cache makes unchanged prompts free. | |
| Usage: | |
| python -m eval.cascade baseline v2 # measure these versions, print before/after κ table | |
| Versions are registered in VERSIONS (a builders module per key). Results persist to | |
| eval/_cache/cascade_results.json so later steps accumulate. | |
| """ | |
| from __future__ import annotations | |
| import importlib | |
| import json | |
| import os | |
| import sys | |
| import time | |
| from eval import kappa as K | |
| from prompt_card.scoring import observable_axes as OA | |
| RESULTS = os.path.join(os.path.dirname(__file__), "_cache", "cascade_results.json") | |
| VERSIONS = {"baseline": OA, "v2": "eval.prompts_v2", "v3": "eval.prompts_v3", "v4": "eval.prompts_v4"} | |
| def _builders(v): | |
| b = VERSIONS[v] | |
| return importlib.import_module(b) if isinstance(b, str) else b | |
| def measure(builders, gt, convs, embedder, client): | |
| """Return per-axis headline κ + detail. Cache-served calls are free.""" | |
| prompts, plan, geom = K.build_prompts(gt, convs, embedder, builders=builders) | |
| pi = {} | |
| for it in plan: | |
| pi.setdefault(it[0], []).append(it) | |
| n_before = client.misses | |
| responses = client.run_all(prompts) | |
| new_calls = client.misses - n_before | |
| out = {"_new_calls": new_calls} | |
| # technique / input_quality (per-category + axis-level "any") | |
| for axis, fields in (("technique", K.TECH), ("input_quality", K.IQ)): | |
| per, fail = K.score_binary_axis(gt, responses, pi, axis, fields) | |
| cats = {f: K.cohen_kappa(*per[f]) for f in fields} | |
| n = len(per[fields[0]][0]) | |
| anyt = [int(any(per[f][0][j] for f in fields)) for j in range(n)] | |
| anyp = [int(any(per[f][1][j] for f in fields)) for j in range(n)] | |
| feat_ks = [cats[f] for f in fields if sum(per[f][0]) > 0] # only categories with positives | |
| headline = (sum(feat_ks) / len(feat_ks)) if feat_ks else None | |
| out[axis] = {"headline": headline, "axis_any": K.cohen_kappa(anyt, anyp), | |
| "cats": {f: (cats[f], K.binary_counts(*per[f]), sum(per[f][0])) for f in fields}, | |
| "parse_fail": fail} | |
| # interaction | |
| yt, yp, fail = K.score_interaction(gt, responses, pi) | |
| out["interaction"] = {"headline": K.cohen_kappa(yt, yp), "counts": K.binary_counts(yt, yp), | |
| "pos": sum(yt), "n": len(yt), "parse_fail": fail} | |
| # focus (sweep best T) | |
| best = None | |
| for T in [0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70]: | |
| fyt, fyp, info = K.score_focus(gt, responses, pi, geom, T) | |
| k = K.cohen_kappa(fyt, fyp) | |
| cand = (k if k is not None else -9, info["recall"] or 0) | |
| if best is None or cand > best[0]: | |
| best = (cand, T, k, info["recall"]) | |
| out["focus"] = {"headline": best[2], "T": best[1], "recall": best[3]} | |
| return out | |
| def gate(k): | |
| if k is None: | |
| return "N/A" | |
| if k >= 0.6: | |
| return "SOLID" | |
| if k >= 0.4: | |
| return "OK" | |
| if k >= 0.2: | |
| return "try-next" | |
| return "must-next" | |
| def main(version_keys): | |
| base_url = os.environ.get("OPENBMB_BASE_URL"); token = os.environ.get("OPENBMB_TOKEN") | |
| if not base_url or not token: | |
| print("ERROR: set OPENBMB_BASE_URL and OPENBMB_TOKEN", file=sys.stderr); sys.exit(2) | |
| from prompt_card.llm.minicpm import MiniCPMClient | |
| gt = K.load_gt(); convs = K.load_convs(); embedder = K.FastEmbedder() | |
| client = K.CachedClient(MiniCPMClient(base_url, token), workers=8) | |
| prior = {} | |
| if os.path.exists(RESULTS): | |
| prior = json.load(open(RESULTS)) | |
| results = dict(prior) | |
| for v in version_keys: | |
| t0 = time.time() | |
| print(f"\n=== measuring '{v}' ===", flush=True) | |
| r = measure(_builders(v), gt, convs, embedder, client) | |
| r["_secs"] = round(time.time() - t0, 1) | |
| results[v] = r | |
| print(f" new 8B calls: {r['_new_calls']} · {r['_secs']}s", flush=True) | |
| json.dump(results, open(RESULTS, "w"), indent=1, default=str) | |
| axes = ["technique", "input_quality", "interaction", "focus"] | |
| print("\n================ κ comparison (headline per axis) ================") | |
| head = "axis".ljust(16) + "".join(v.ljust(12) for v in version_keys) + "gate(last)" | |
| print(head) | |
| for ax in axes: | |
| row = ax.ljust(16) | |
| last = None | |
| for v in version_keys: | |
| k = results[v][ax]["headline"]; last = k | |
| row += (f"{k:+.3f}" if k is not None else "N/A").ljust(12) | |
| print(row + gate(last)) | |
| # per-category technique/IQ detail for the last version | |
| last = version_keys[-1] | |
| print(f"\n--- per-category detail ({last}) ---") | |
| for ax in ("technique", "input_quality"): | |
| for f, (k, c, npos) in results[last][ax]["cats"].items(): | |
| ks = f"{k:+.3f}" if k is not None else "N/A" | |
| print(f" {ax[:4]}.{f:22} κ={ks} pos={npos} [TN {c['tn']} FP {c['fp']} FN {c['fn']} TP {c['tp']}]") | |
| fi = results[last]["interaction"]; c = fi["counts"] | |
| print(f" interaction.refinement κ={fi['headline']:+.3f} pos={fi['pos']}/{fi['n']} " | |
| f"[TN {c['tn']} FP {c['fp']} FN {c['fn']} TP {c['tp']}]") | |
| ff = results[last]["focus"] | |
| print(f" focus.topic_shift κ={ff['headline']:+.3f} T={ff['T']} recall={ff['recall']:.2f}") | |
| if __name__ == "__main__": | |
| keys = sys.argv[1:] or ["baseline", "v2"] | |
| main(keys) | |