narcolepticchicken committed on
Commit
cb2ec45
·
verified ·
1 Parent(s): e7550e8

Upload training/simple_strategies.py

Browse files
Files changed (1) hide show
  1. training/simple_strategies.py +103 -0
training/simple_strategies.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Simple strategies: always-cheap, cheap-then-frontier, threshold routing.
2
+
3
+ Compute ground-truth costs using SWE-Router traces. No model predictions
4
+ — just pure strategy evaluation against known outcomes.
5
+ """
6
+ import json
7
+ from collections import defaultdict
8
+ from datasets import load_dataset
9
+
10
# Models whose SWE-Router trace datasets are loaded by this script.
MODELS = [
    "deepseek-v4-flash",
    "gpt-5-nano",
    "gpt-5-mini",
    "deepseek-v3.2",
    "gemini-2.5-pro",
    "claude-opus-4.7",
    "gpt-5.2",
    "gemini-3-pro",
]

# Routing tier id -> name of the model evaluated at that tier.
TIER_MODEL = {
    1: "deepseek-v4-flash",
    2: "gpt-5-mini",
    3: "gemini-2.5-pro",
    4: "claude-opus-4.7",
    5: "gemini-3-pro",
}
15
+
16
print("Loading...")
# traces[instance_id][model_name] -> {"resolved": ..., "cost": float}
traces = defaultdict(dict)
for model_name in MODELS:
    dataset = load_dataset(f"SWE-Router/swebench-verified-{model_name}", split="test")
    for record in dataset:
        outcome = {
            "resolved": record["resolved"],
            "cost": float(record["instance_cost"]),
        }
        traces[record["instance_id"]][model_name] = outcome
# Number of distinct task instances seen across all loaded datasets.
N = len(traces)
23
+
24
def eval_strategy(name, tier_sequence, trace_table=None, tier_models=None,
                  missing_cost=0.30):
    """Replay an escalation strategy over recorded traces and tally the result.

    For every task the tiers in ``tier_sequence`` are tried in order. The
    recorded cost of each attempted tier is added to the running total, and
    escalation stops at the first tier whose trace is marked resolved.

    Args:
        name: Label copied into the returned summary.
        tier_sequence: Tier ids to attempt, in order.
        trace_table: Mapping ``task id -> {model name -> {"resolved", "cost"}}``.
            Defaults to the module-level ``traces``.
        tier_models: Mapping ``tier id -> model name``. Defaults to the
            module-level ``TIER_MODEL``.
        missing_cost: Cost charged for an attempt when the task has no trace
            (or no recorded cost) for that tier's model; such an attempt
            counts as unresolved.

    Returns:
        dict with keys ``name``, ``resolved`` (count), ``rate``,
        ``total_cost`` and ``avg_cost``. ``rate`` and ``avg_cost`` are 0.0
        when there are no tasks.

    Raises:
        KeyError: If a tier in ``tier_sequence`` is not in the tier mapping.
    """
    # Fall back to the module globals only when the caller supplied nothing,
    # so existing two-argument calls behave exactly as before.
    table = traces if trace_table is None else trace_table
    models = TIER_MODEL if tier_models is None else tier_models

    total_cost = 0
    resolved = 0
    for task_traces in table.values():
        for tier in tier_sequence:
            attempt = task_traces.get(models[tier], {})
            # Every attempted tier is paid for, whether or not it succeeds.
            total_cost += attempt.get("cost", missing_cost)
            if attempt.get("resolved", False):
                resolved += 1
                break

    task_count = len(table)
    # Guard the averages so an empty trace table does not divide by zero.
    rate = resolved / task_count if task_count else 0.0
    avg_cost = total_cost / task_count if task_count else 0.0
    return {
        "name": name,
        "resolved": resolved,
        "rate": rate,
        "total_cost": total_cost,
        "avg_cost": avg_cost,
    }
33
+
34
# Each entry pairs a display label with the ordered tier ids to attempt.
strategies = [
    ("Always Tier 1 (cheapest)", [1]),
    ("Always Tier 4 (frontier)", [4]),
    ("Tier 1 β†’ Tier 4", [1, 4]),
    ("Tier 1 β†’ Tier 2 β†’ Tier 4", [1, 2, 4]),
    ("Tier 1 β†’ Tier 4 β†’ Tier 5", [1, 4, 5]),
    ("Tier 1 β†’ Tier 2 β†’ Tier 3 β†’ Tier 4", [1, 2, 3, 4]),
    ("All tiers 1β†’5", [1, 2, 3, 4, 5]),
    ("Tier 4 β†’ Tier 5", [4, 5]),
]

# One summary dict per strategy, in the order listed above.
results = [eval_strategy(*spec) for spec in strategies]
46
+
47
# Baselines: per-task traces of the frontier model and of the cheapest model.
# A task without a trace for the model contributes an empty dict, so it is
# charged the fallback cost and counted as unresolved.
frontier_runs = [tt.get("claude-opus-4.7", {}) for tt in traces.values()]
cheap_runs = [tt.get("deepseek-v4-flash", {}) for tt in traces.values()]

# fc / cc: average cost per task; fr / cr: fraction of tasks resolved.
fc = sum(run.get("cost", 0.317) for run in frontier_runs) / N
fr = sum(1 for run in frontier_runs if run.get("resolved")) / N
cc = sum(run.get("cost", 0.014) for run in cheap_runs) / N
cr = sum(1 for run in cheap_runs if run.get("resolved")) / N
52
+
53
# Oracle: for each task pay only for the cheapest model that resolved it.
oc, orr = 0, 0
for tt in traces.values():
    solved_costs = [mt["cost"] for mt in tt.values() if mt["resolved"]]
    if solved_costs:
        oc += min(solved_costs)
        orr += 1
    else:
        # No model resolved the task: charge the frontier model's cost.
        oc += tt.get("claude-opus-4.7", {}).get("cost", 0.317)
# Turn the running totals into a per-task average cost and a resolve rate.
oc /= N
orr /= N
62
+
63
# Summary table: one row per strategy, followed by the reference baselines.
print(f"\n{'='*70}")
# Report the number of tasks actually loaded; every rate below is divided by
# N, so a hard-coded count would be wrong whenever N differs from it.
print(f"SIMPLE STRATEGIES ON SWE-BENCH ({N} tasks)")
print(f"{'='*70}")
print(f"\n{'Strategy':<35} {'Resolved':>10} {'Rate':>8} {'AvgCost':>10} {'vsFrontier':>12}")
print("-"*75)

for r in results:
    # Percentage saved relative to the frontier-only average cost
    # (negative when the strategy costs more than the frontier baseline).
    saving_pct = (1 - r["avg_cost"]/fc)*100
    print(f" {r['name']:<33} {r['resolved']:>10} {r['rate']*100:>7.1f}% ${r['avg_cost']:>9.4f} {saving_pct:>+11.1f}%")

# fr, cr and orr are rates, so multiply by N to show resolved counts.
print(f"\n {'Frontier baseline':<33} {fr*N:>10.0f} {fr*100:>7.1f}% ${fc:>9.4f} {'--':>12}")
print(f" {'Always cheap':<33} {cr*N:>10.0f} {cr*100:>7.1f}% ${cc:>9.4f} {(1-cc/fc)*100:>+11.1f}%")
print(f" {'Oracle':<33} {orr*N:>10.0f} {orr*100:>7.1f}% ${oc:>9.4f} {(1-oc/fc)*100:>+11.1f}%")
76
+
77
# Pareto: list every (cost, rate) point and flag the dominated ones.
print(f"\n{'='*70}")
print("PARETO ANALYSIS")
print(f"{'='*70}")
points = [(r["avg_cost"], r["rate"], r["name"]) for r in results]
points.extend([(cc, cr, "Always cheap"), (fc, fr, "Frontier"), (oc, orr, "Oracle")])

# Visit points from highest rate to lowest (cheaper first on ties), so any
# point that could dominate the current one has already been visited.
ranked = sorted(points, key=lambda p: (-p[1], p[0]))
pareto = []
for cost, rate, name in ranked:
    dominated = False
    for seen_cost, seen_rate, _ in pareto:
        no_worse = seen_cost <= cost and seen_rate >= rate
        strictly_better = seen_cost < cost or seen_rate > rate
        if no_worse and strictly_better:
            dominated = True
            break
    pareto.append((cost, rate, name))
    marker = '❌' if dominated else 'βœ…'
    print(f" {name:<35} {rate*100:>7.1f}% ${cost:>9.4f} {marker}")
88
+
89
# Key comparison: the two cheap-first escalation strategies against the
# frontier-only baseline and the router's reported figures.
t1t4 = next(r for r in results if r["name"]=="Tier 1 β†’ Tier 4")
t1t2t4 = next(r for r in results if r["name"]=="Tier 1 β†’ Tier 2 β†’ Tier 4")
print(f"\n{'='*70}")
print("KEY COMPARISON: Router vs Simple Strategies")
print(f"{'='*70}")
print(f" Tier 1β†’Tier 4: {t1t4['rate']*100:.1f}% @ ${t1t4['avg_cost']:.4f} ({t1t4['resolved']} resolved, ${t1t4['total_cost']:.2f} total)")
print(f" Tier 1β†’T2β†’T4: {t1t2t4['rate']*100:.1f}% @ ${t1t2t4['avg_cost']:.4f} ({t1t2t4['resolved']} resolved, ${t1t2t4['total_cost']:.2f} total)")
print(f" Frontier only: {fr*100:.1f}% @ ${fc:.4f}")
# Hard-coded figures for the router run; they are not computed in this script.
print(f" v10+feedback: 85.2% @ $0.4425")

# How many tasks does the Tier 1 -> Tier 4 strategy resolve compared with
# running the frontier model alone?
t1t4_resolved = t1t4['resolved']
# fr is count / N, so fr * N can land just below the integer count;
# round() recovers the count where int() would truncate it by one.
frontier_resolved = round(fr * N)
extra_resolved = t1t4_resolved - frontier_resolved
extra_cost = t1t4['total_cost'] - fc * N
print(f"\n T1β†’T4 resolves {t1t4_resolved} vs frontier's {frontier_resolved}")
if extra_resolved > 0:
    print(f" Extra cost per additional resolve: ${extra_cost / extra_resolved:.2f}")
else:
    # With no additional resolves a per-resolve cost is undefined; say so
    # instead of dividing by a placeholder of 1.
    print(" Extra cost per additional resolve: n/a (no additional resolves)")