narcolepticchicken
/

agent-cost-optimizer

Model card Files Files and versions

xet

Community

narcolepticchicken committed 11 days ago

Commit

cb2ec45

verified ·

1 Parent(s): e7550e8

Upload training/simple_strategies.py

Browse files

Files changed (1) hide show

training/simple_strategies.py +103 -0

training/simple_strategies.py ADDED Viewed

	@@ -0,0 +1,103 @@

+"""Simple strategies: always-cheap, cheap-then-frontier, threshold routing.
+Compute ground-truth costs using SWE-Router traces. No model predictions
+— just pure strategy evaluation against known outcomes.
+"""
+import json
+from collections import defaultdict
+from datasets import load_dataset
+MODELS = ["deepseek-v4-flash","gpt-5-nano","gpt-5-mini","deepseek-v3.2",
+          "gemini-2.5-pro","claude-opus-4.7","gpt-5.2","gemini-3-pro"]
+TIER_MODEL = {1:'deepseek-v4-flash',2:'gpt-5-mini',3:'gemini-2.5-pro',
+              4:'claude-opus-4.7',5:'gemini-3-pro'}
+print("Loading...")
+traces = defaultdict(dict)
+for m in MODELS:
+    ds = load_dataset(f"SWE-Router/swebench-verified-{m}", split="test")
+    for row in ds:
+        traces[row["instance_id"]][m] = {"resolved":row["resolved"],"cost":float(row["instance_cost"])}
+N = len(traces)
+def eval_strategy(name, tier_sequence):
+    """tier_sequence: list of tiers to try in order. Stop on first success."""
+    tc = 0; res = 0
+    for tid, tt in traces.items():
+        for tier in tier_sequence:
+            mt = tt.get(TIER_MODEL[tier],{})
+            tc += mt.get("cost",0.30)
+            if mt.get("resolved",False): res += 1; break
+    return {"name":name,"resolved":res,"rate":res/N,"total_cost":tc,"avg_cost":tc/N}
+strategies = [
+    ("Always Tier 1 (cheapest)", [1]),
+    ("Always Tier 4 (frontier)", [4]),
+    ("Tier 1 → Tier 4", [1,4]),
+    ("Tier 1 → Tier 2 → Tier 4", [1,2,4]),
+    ("Tier 1 → Tier 4 → Tier 5", [1,4,5]),
+    ("Tier 1 → Tier 2 → Tier 3 → Tier 4", [1,2,3,4]),
+    ("All tiers 1→5", [1,2,3,4,5]),
+    ("Tier 4 → Tier 5", [4,5]),
+]
+results = [eval_strategy(name, seq) for name, seq in strategies]
+# Baselines
+fc = sum(tt.get("claude-opus-4.7",{}).get("cost",0.317) for tt in traces.values())/N
+fr = sum(1 for tt in traces.values() if tt.get("claude-opus-4.7",{}).get("resolved"))/N
+cc = sum(tt.get("deepseek-v4-flash",{}).get("cost",0.014) for tt in traces.values())/N
+cr = sum(1 for tt in traces.values() if tt.get("deepseek-v4-flash",{}).get("resolved"))/N
+# Oracle
+oc, orr = 0, 0
+for tt in traces.values():
+    best = None
+    for m, mt in tt.items():
+        if mt["resolved"] and (best is None or mt["cost"] < best["cost"]): best = mt
+    if best: oc += best["cost"]; orr += 1
+    else: oc += tt.get("claude-opus-4.7",{}).get("cost",0.317)
+oc /= N; orr /= N
+print(f"\n{'='*70}")
+print("SIMPLE STRATEGIES ON SWE-BENCH (500 tasks)")
+print(f"{'='*70}")
+print(f"\n{'Strategy':<35} {'Resolved':>10} {'Rate':>8} {'AvgCost':>10} {'vsFrontier':>12}")
+print("-"*75)
+for r in results:
+    cr_pct = (1 - r["avg_cost"]/fc)*100
+    print(f"  {r['name']:<33} {r['resolved']:>10} {r['rate']*100:>7.1f}% ${r['avg_cost']:>9.4f} {cr_pct:>+11.1f}%")
+print(f"\n  {'Frontier baseline':<33} {fr*N:>10.0f} {fr*100:>7.1f}% ${fc:>9.4f} {'--':>12}")
+print(f"  {'Always cheap':<33} {cr*N:>10.0f} {cr*100:>7.1f}% ${cc:>9.4f} {(1-cc/fc)*100:>+11.1f}%")
+print(f"  {'Oracle':<33} {orr*N:>10.0f} {orr*100:>7.1f}% ${oc:>9.4f} {(1-oc/fc)*100:>+11.1f}%")
+# Pareto
+print(f"\n{'='*70}")
+print("PARETO ANALYSIS")
+print(f"{'='*70}")
+points = [(r["avg_cost"], r["rate"], r["name"]) for r in results]
+points += [(cc, cr, "Always cheap"), (fc, fr, "Frontier"), (oc, orr, "Oracle")]
+pareto = []
+for cost, rate, name in sorted(points, key=lambda x: (-x[1], x[0])):
+    dom = any(pc<=cost and pr>=rate and (pc<cost or pr>rate) for pc,pr,_ in pareto)
+    pareto.append((cost,rate,name))
+    print(f"  {name:<35} {rate*100:>7.1f}% ${cost:>9.4f} {'✅' if not dom else '❌'}")
+# Key: what does the router vs simple T1→T4 look like?
+t1t4 = next(r for r in results if r["name"]=="Tier 1 → Tier 4")
+t1t2t4 = next(r for r in results if r["name"]=="Tier 1 → Tier 2 → Tier 4")
+print(f"\n{'='*70}")
+print("KEY COMPARISON: Router vs Simple Strategies")
+print(f"{'='*70}")
+print(f"  Tier 1→Tier 4:  {t1t4['rate']*100:.1f}% @ ${t1t4['avg_cost']:.4f}  ({t1t4['resolved']} resolved, ${t1t4['total_cost']:.2f} total)")
+print(f"  Tier 1→T2→T4:  {t1t2t4['rate']*100:.1f}% @ ${t1t2t4['avg_cost']:.4f}  ({t1t2t4['resolved']} resolved, ${t1t2t4['total_cost']:.2f} total)")
+print(f"  Frontier only:  {fr*100:.1f}% @ ${fc:.4f}")
+print(f"  v10+feedback:   85.2% @ $0.4425")
+# How many does T1→T4 lose vs frontier?
+v10_resolved = t1t4['resolved']
+print(f"\n  T1→T4 resolves {v10_resolved} vs frontier's {int(fr*N)}")
+print(f"  Extra cost per additional resolve: ${(t1t4['total_cost']-fc*N)/max(v10_resolved-int(fr*N),1):.2f}")