Upload training/simple_strategies.py
Browse files- training/simple_strategies.py +103 -0
training/simple_strategies.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Simple strategies: always-cheap, cheap-then-frontier, threshold routing.
|
| 2 |
+
|
| 3 |
+
Compute ground-truth costs using SWE-Router traces. No model predictions
|
| 4 |
+
— just pure strategy evaluation against known outcomes.
|
| 5 |
+
"""
|
| 6 |
+
import json
|
| 7 |
+
from collections import defaultdict
|
| 8 |
+
from datasets import load_dataset
|
| 9 |
+
|
| 10 |
+
# Every model whose traces are loaded (used to build the dataset names).
MODELS = [
    "deepseek-v4-flash",
    "gpt-5-nano",
    "gpt-5-mini",
    "deepseek-v3.2",
    "gemini-2.5-pro",
    "claude-opus-4.7",
    "gpt-5.2",
    "gemini-3-pro",
]

# Tier id -> the single model that represents that tier in a strategy.
TIER_MODEL = {
    1: "deepseek-v4-flash",
    2: "gpt-5-mini",
    3: "gemini-2.5-pro",
    4: "claude-opus-4.7",
    5: "gemini-3-pro",
}
|
| 15 |
+
|
| 16 |
+
# Load the "test" split of each model's SWE-Router dataset and index the rows
# as traces[instance_id][model] -> {"resolved": ..., "cost": ...}.
print("Loading...")
traces = defaultdict(dict)
for model_name in MODELS:
    dataset = load_dataset(
        f"SWE-Router/swebench-verified-{model_name}", split="test"
    )
    for record in dataset:
        outcome = {
            "resolved": record["resolved"],
            "cost": float(record["instance_cost"]),
        }
        traces[record["instance_id"]][model_name] = outcome

# Number of distinct task ids seen across all loaded models.
N = len(traces)
|
| 23 |
+
|
| 24 |
+
def eval_strategy(name, tier_sequence, traces_by_task=None, tier_model=None,
                  missing_cost=0.30):
    """Score a fixed escalation strategy against recorded per-task outcomes.

    For every task the tiers in ``tier_sequence`` are tried in order. The cost
    of each attempt is added to the running total, and escalation stops at the
    first tier whose recorded run resolved the task.

    Args:
        name: Label copied unchanged into the result.
        tier_sequence: Tier ids to try, in order.
        traces_by_task: Mapping of task id -> {model name -> {"resolved",
            "cost"}}. Defaults to the module-level ``traces``.
        tier_model: Mapping of tier id -> model name. Defaults to the
            module-level ``TIER_MODEL``.
        missing_cost: Cost charged for an attempt when the task has no
            recorded trace (or no "cost" entry) for that tier's model. Such an
            attempt is treated as unresolved.

    Returns:
        A dict with keys "name", "resolved" (count of resolved tasks), "rate"
        (resolved / number of tasks), "total_cost" and "avg_cost" (total cost
        / number of tasks). "rate" and "avg_cost" are 0.0 when there are no
        tasks.

    Raises:
        KeyError: If a tier in ``tier_sequence`` is not a key of the
            tier-to-model mapping.
    """
    # Resolve the defaults at call time so the module globals are only
    # touched when the caller did not supply explicit data.
    task_traces = traces if traces_by_task is None else traces_by_task
    models_by_tier = TIER_MODEL if tier_model is None else tier_model

    total_cost = 0
    resolved = 0
    for per_model in task_traces.values():
        for tier in tier_sequence:
            attempt = per_model.get(models_by_tier[tier], {})
            # Every attempted tier is paid for, including the failed ones.
            total_cost += attempt.get("cost", missing_cost)
            if attempt.get("resolved", False):
                resolved += 1
                break

    n_tasks = len(task_traces)
    if n_tasks == 0:
        # Nothing to average over; avoid dividing by zero.
        rate = avg_cost = 0.0
    else:
        rate = resolved / n_tasks
        avg_cost = total_cost / n_tasks
    return {
        "name": name,
        "resolved": resolved,
        "rate": rate,
        "total_cost": total_cost,
        "avg_cost": avg_cost,
    }
|
| 33 |
+
|
| 34 |
+
# Each entry is (display name, tiers to try in order). The names are also used
# as lookup keys further down, so they must stay exactly as written.
strategies = [
    ("Always Tier 1 (cheapest)", [1]),
    ("Always Tier 4 (frontier)", [4]),
    ("Tier 1 β Tier 4", [1, 4]),
    ("Tier 1 β Tier 2 β Tier 4", [1, 2, 4]),
    ("Tier 1 β Tier 4 β Tier 5", [1, 4, 5]),
    ("Tier 1 β Tier 2 β Tier 3 β Tier 4", [1, 2, 3, 4]),
    ("All tiers 1β5", [1, 2, 3, 4, 5]),
    ("Tier 4 β Tier 5", [4, 5]),
]

# One result dict per strategy, in the same order as `strategies`.
results = [eval_strategy(*entry) for entry in strategies]
|
| 46 |
+
|
| 47 |
+
# Baselines: average cost and resolve rate when a single model is always used.
# A task with no trace for the model is charged a fixed fallback cost
# (0.317 for claude-opus-4.7, 0.014 for deepseek-v4-flash) and is not
# counted as resolved.
frontier_runs = [tt.get("claude-opus-4.7", {}) for tt in traces.values()]
cheap_runs = [tt.get("deepseek-v4-flash", {}) for tt in traces.values()]

fc = sum(run.get("cost", 0.317) for run in frontier_runs) / N
fr = sum(1 for run in frontier_runs if run.get("resolved")) / N
cc = sum(run.get("cost", 0.014) for run in cheap_runs) / N
cr = sum(1 for run in cheap_runs if run.get("resolved")) / N
|
| 52 |
+
|
| 53 |
+
# Oracle: per task, pay only for the cheapest model whose recorded run
# resolved it. When no model resolved the task, charge the claude-opus-4.7
# cost (0.317 if that trace is missing) and count the task as unresolved.
oc, orr = 0, 0
for tt in traces.values():
    solved_runs = [run for run in tt.values() if run["resolved"]]
    if solved_runs:
        # min() keeps the first run among equal costs, like a strict "<" scan.
        cheapest = min(solved_runs, key=lambda run: run["cost"])
        oc += cheapest["cost"]
        orr += 1
    else:
        oc += tt.get("claude-opus-4.7", {}).get("cost", 0.317)

# Turn the totals into a per-task average cost and a resolve rate.
oc /= N
orr /= N
|
| 62 |
+
|
| 63 |
+
# Results table: one row per evaluated strategy, followed by the single-model
# baselines and the oracle. The last column is the saving in average cost
# relative to the frontier baseline `fc` (positive means cheaper).
print(f"\n{'='*70}")
# Report the number of tasks actually loaded instead of a hard-coded 500.
print(f"SIMPLE STRATEGIES ON SWE-BENCH ({N} tasks)")
print(f"{'='*70}")
print(f"\n{'Strategy':<35} {'Resolved':>10} {'Rate':>8} {'AvgCost':>10} {'vsFrontier':>12}")
print("-"*75)

for r in results:
    saving_pct = (1 - r["avg_cost"]/fc)*100
    print(f" {r['name']:<33} {r['resolved']:>10} {r['rate']*100:>7.1f}% ${r['avg_cost']:>9.4f} {saving_pct:>+11.1f}%")

# fr, cr and orr are rates, so multiply by N to show them as task counts.
print(f"\n {'Frontier baseline':<33} {fr*N:>10.0f} {fr*100:>7.1f}% ${fc:>9.4f} {'--':>12}")
print(f" {'Always cheap':<33} {cr*N:>10.0f} {cr*100:>7.1f}% ${cc:>9.4f} {(1-cc/fc)*100:>+11.1f}%")
print(f" {'Oracle':<33} {orr*N:>10.0f} {orr*100:>7.1f}% ${oc:>9.4f} {(1-oc/fc)*100:>+11.1f}%")
|
| 76 |
+
|
| 77 |
+
# Pareto: list every (avg cost, resolve rate) point and mark whether another
# point dominates it, i.e. is no more expensive and no less successful while
# being strictly better on at least one of the two.
print(f"\n{'='*70}")
print("PARETO ANALYSIS")
print(f"{'='*70}")
points = [(r["avg_cost"], r["rate"], r["name"]) for r in results]
points += [(cc, cr, "Always cheap"), (fc, fr, "Frontier"), (oc, orr, "Oracle")]

# Points are visited by descending rate, then ascending cost, so anything that
# could dominate the current point has already been visited. `visited`
# therefore holds ALL earlier points, not only the non-dominated ones.
visited = []
for cost, rate, name in sorted(points, key=lambda x: (-x[1], x[0])):
    dominated = any(
        pc <= cost and pr >= rate and (pc < cost or pr > rate)
        for pc, pr, _ in visited
    )
    visited.append((cost, rate, name))
    # NOTE(review): the marker glyphs were mis-encoded in the source; check
    # mark / cross mark is the presumed original pair. Confirm.
    marker = "❌" if dominated else "✅"
    print(f" {name:<35} {rate*100:>7.1f}% ${cost:>9.4f} {marker}")
|
| 88 |
+
|
| 89 |
+
# Key: what does the router vs simple T1→T4 look like?
# `results` is built from `strategies` in order, so pair them up and look the
# two cascades up by tier sequence rather than by display name.
results_by_tiers = {
    tuple(tiers): result for (_, tiers), result in zip(strategies, results)
}
t1t4 = results_by_tiers[(1, 4)]
t1t2t4 = results_by_tiers[(1, 2, 4)]

print(f"\n{'='*70}")
print("KEY COMPARISON: Router vs Simple Strategies")
print(f"{'='*70}")
print(f" Tier 1→Tier 4: {t1t4['rate']*100:.1f}% @ ${t1t4['avg_cost']:.4f} ({t1t4['resolved']} resolved, ${t1t4['total_cost']:.2f} total)")
print(f" Tier 1→T2→T4: {t1t2t4['rate']*100:.1f}% @ ${t1t2t4['avg_cost']:.4f} ({t1t2t4['resolved']} resolved, ${t1t2t4['total_cost']:.2f} total)")
print(f" Frontier only: {fr*100:.1f}% @ ${fc:.4f}")
# Hard-coded reference figures for the router; they are not computed here.
print(" v10+feedback: 85.2% @ $0.4425")

# How many does T1→T4 gain vs frontier?
t1t4_resolved = t1t4['resolved']
# fr is count / N, so fr * N can land just below the integer count; round()
# recovers it, whereas int() would truncate and could be off by one.
frontier_resolved = round(fr*N)
print(f"\n T1→T4 resolves {t1t4_resolved} vs frontier's {frontier_resolved}")
# The denominator is clamped to 1, so when T1→T4 resolves no more tasks than
# the frontier this prints the plain difference in total cost.
extra_resolved = max(t1t4_resolved - frontier_resolved, 1)
print(f" Extra cost per additional resolve: ${(t1t4['total_cost']-fc*N)/extra_resolved:.2f}")
|