File size: 4,632 Bytes
51a9974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
Run the skill-uplift eval against real frontier models via OpenRouter.

PREREQUISITES (do these first; never hardcode the key):
    export OPENROUTER_API_KEY=sk-or-...
    # optional overrides if the catalog has moved:
    export EVAL_ANSWERER_MODEL=anthropic/claude-opus-4.6
    export EVAL_GRADER_MODEL=openai/gpt-5.1
Run from your repo root so it imports the REAL skill_builder.build_skill_md:
    python run_skill_eval.py

It will: verify the models are live -> run grader calibration -> run the eval ->
write skill_eval_runs/_report.json and print a plain-language summary. It reports
whatever number comes out. A modest or zero uplift is a valid, honest result.
"""

from __future__ import annotations

import json
import sys

from openrouter_client import (
    make_completer, verify_models,
    DEFAULT_ANSWERER_MODEL, DEFAULT_GRADER_MODEL,
)
from skill_uplift_eval import run_eval, run_calibration, GRADER_SYSTEM
from sample_cases import CASES

# Hand-labeled calibration pairs: for each task WE decided which of the two
# answers is better, and the grader is expected to agree with that judgment.
# Replace the rows below with your own hand-labeled pairs as needed.
#
# Each row is (task, better_answer, worse_answer); the rows are expanded into
# dicts with exactly those keys, in that order.
_CALIBRATION_KEYS = ("task", "better_answer", "worse_answer")
_CALIBRATION_ROWS = (
    (
        "Efficiently compute the sum of values on every root-to-leaf path's max in a tree.",
        "Do a single post-order (bottom-up) DFS; at each node combine the max of its children's results in O(n) total.",
        "From each node, separately walk down every path to a leaf and take the max; simple but O(n^2) or worse.",
    ),
    (
        "Find the absorption probability in a Markov chain with two absorbing states.",
        "Write first-step equations for the transient states only, excluding absorbing rows, and solve the linear system.",
        "Run a large Monte Carlo simulation and estimate the fraction of runs that end in the target state.",
    ),
    (
        "Avoid a stack overflow when recursing over a very deep tree in Python.",
        "Convert the recursion to an explicit stack/iterative form, or raise the recursion limit knowingly; the depth is the real cause.",
        "Wrap the recursion in a try/except and hope it does not overflow.",
    ),
)
CALIBRATION = [dict(zip(_CALIBRATION_KEYS, row)) for row in _CALIBRATION_ROWS]


def _models_are_live(answerer_model, grader_model) -> bool:
    """Print the catalog status of both models; return True only if all are OK.

    ``verify_models`` yields a mapping of model id -> status. A status that is
    exactly ``True`` is shown as ``OK``; any other value is printed verbatim
    and counts as a failure.
    """
    print("Verifying models are live on OpenRouter...")
    status = verify_models(answerer_model, grader_model)
    for model_id, ok in status.items():
        print(f"  {model_id}: {'OK' if ok is True else ok}")
    # Identity check on purpose: a truthy non-True status (e.g. a message
    # string) must still be treated as "not live".
    return all(ok is True for ok in status.values())


def _print_calibration(cal) -> None:
    """Print the overall grader agreement followed by one line per calibration row."""
    print("grader agreement:", cal["agreement"])
    for row in cal["rows"]:
        print(f"  agree={row['agree']}  better={row['better']} worse={row['worse']}  {row['task']}")


def _print_result(answerer_model, grader_model, cal, report) -> None:
    """Print the final summary: models used, calibration, aggregates and per-case rows."""
    print("\n========== RESULT ==========")
    print(f"answerer: {answerer_model}   grader: {grader_model}")
    print(f"calibration: {cal['agreement']}")
    print(f"cases scored: {report['n_scored']}  (excluded as leaked: {report['n_leaked_excluded']})")
    print(f"baseline (no skill): {report['baseline_no_skill_mean']}")
    print(f"with skill:          {report['with_skill_mean']}")
    print(f"UPLIFT:              {report['uplift']}")
    print(f"win / tie / loss:    {report['wins']} / {report['ties']} / {report['losses']}")
    print("per-case:")
    for case in report["per_case"]:
        flag = " [LEAKED, excluded]" if case["leaked"] else ""
        print(f"  {case['name']}: no={case['no_skill']} with={case['with_skill']} Δ={case['delta']}{flag}")
    print("\nraw generations saved to:", report["raw_saved_to"])
    print("\nReport the number above as-is. Modest or zero uplift is a valid result.")


def main() -> int:
    """Verify the models, calibrate the grader, run the eval and print a summary.

    Steps, in order:
      1. Warn (without aborting) if the answerer and grader are the same model.
      2. Check both model ids via ``verify_models``; stop early if any is not OK.
      3. Build the answerer and grader completers.
      4. Run grader calibration against ``CALIBRATION`` and print the outcome.
      5. Run the eval over ``CASES`` (output directory ``./skill_eval_runs``)
         and print the resulting report.

    Calibration is informational only: the eval runs regardless of the
    agreement value, and the result is reported as-is.

    Returns:
        0 when the eval ran to completion, 1 when model verification failed.
    """
    answerer_model = DEFAULT_ANSWERER_MODEL
    grader_model = DEFAULT_GRADER_MODEL
    if answerer_model == grader_model:
        print("WARNING: answerer and grader are the same model; pick different "
              "models to reduce self-grading bias (set EVAL_GRADER_MODEL).")

    if not _models_are_live(answerer_model, grader_model):
        print("\nOne or more model ids are not in the live catalog. Set "
              "EVAL_ANSWERER_MODEL / EVAL_GRADER_MODEL to current ids "
              "(see https://openrouter.ai/models) and re-run.")
        return 1

    # The answerer gets a little sampling temperature and a larger budget; the
    # grader runs at temperature 0 with the grading system prompt.
    answerer = make_completer(answerer_model, temperature=0.2, max_tokens=900)
    grader = make_completer(grader_model, system=GRADER_SYSTEM,
                            temperature=0.0, max_tokens=300)

    print("\n--- Calibration (does the blind grader agree with human labels?) ---")
    cal = run_calibration(grader, CALIBRATION)
    _print_calibration(cal)

    print("\n--- Running skill-uplift eval ---")
    report = run_eval(CASES, answerer, grader, out_dir="./skill_eval_runs")

    _print_result(answerer_model, grader_model, cal, report)
    return 0


# Script entry point: run the eval and hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)