omkarrr88 commited on
Commit ·
43647d3
1
Parent(s): aa0bed2
LLM scores added
Browse files- README.md +36 -10
- baseline_inference.py +41 -13
- run_all_baselines.py +130 -0
README.md
CHANGED
|
@@ -84,16 +84,42 @@ Dynamic availability: `restart_run` requires a fix first; `fix_code` requires co
|
|
| 84 |
|
| 85 |
## Baseline Scores
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
| Task |
|
| 90 |
-
|------|-------|-------|
|
| 91 |
-
| `task_001` | 1.00 |
|
| 92 |
-
| `task_002` | 1.00 |
|
| 93 |
-
| `task_003` | 1.00 |
|
| 94 |
-
| `task_004` |
|
| 95 |
-
| `task_005` |
|
| 96 |
-
| `task_006` | 1.00 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
## Setup
|
| 99 |
|
|
|
|
| 84 |
|
| 85 |
## Baseline Scores
|
| 86 |
|
| 87 |
+
### Heuristic vs LLM Comparison (3 agents, 7 tasks)
|
| 88 |
+
|
| 89 |
+
| Task | Difficulty | Heuristic | Llama 3.3 70B | Llama 3.1 8B | Notes |
|
| 90 |
+
|------|-----------|-----------|---------------|--------------|-------|
|
| 91 |
+
| `task_001` | Easy | **1.00** | 1.00 | 0.60 | 8B finds issue but misses fix+restart sequence |
|
| 92 |
+
| `task_002` | Easy | **1.00** | 1.00 | 0.05 | 8B barely investigates — struggles with multi-step reasoning |
|
| 93 |
+
| `task_003` | Medium | **1.00** | 0.40 | 0.40 | Both LLMs explore inefficiently vs heuristic's direct path |
|
| 94 |
+
| `task_004` | Medium | 0.45 | 0.45 | **0.60** | LLM's flexible investigation finds overfitting signals heuristic misses |
|
| 95 |
+
| `task_005` | Hard | **1.00** | 1.00 | 1.00 | All agents find eval mode via model inspection |
|
| 96 |
+
| `task_006` | Hard | **1.00** | — | 0.60–1.00 | Code debugging — 8B varies across providers |
|
| 97 |
+
| `task_007` | Med-Hard | **1.00** | — | 0.60 | Scheduler detection — heuristic's pattern matching excels |
|
| 98 |
+
| **Average** | | **0.92** | **0.69*** | **0.55** | |
|
| 99 |
+
|
| 100 |
+
*Llama 3.3 70B results are partial (5/7 tasks before rate limit). Projected average ~0.69.
|
| 101 |
+
|
| 102 |
+
**Key insights:**
|
| 103 |
+
1. **Model size matters:** 70B scores ~25% higher than 8B — the environment scales with model capability
|
| 104 |
+
2. **Heuristic beats LLMs:** A domain-specific decision tree (0.92) outperforms general-purpose LLMs (0.55-0.69) — proving the environment rewards systematic debugging strategy
|
| 105 |
+
3. **Task 4 is the exception:** LLMs outperform the heuristic on overfitting because real training curves require flexible reasoning, not rigid pattern matching
|
| 106 |
+
4. **8B struggles on multi-step tasks:** Task 2 (0.05) shows small models can't maintain investigation strategy across many steps
|
| 107 |
+
|
| 108 |
+
### Running Baselines
|
| 109 |
+
|
| 110 |
+
```bash
|
| 111 |
+
# Heuristic (deterministic, no API key, bit-exact reproducible)
|
| 112 |
+
python3 baseline_heuristic.py
|
| 113 |
+
|
| 114 |
+
# LLM (multi-provider support — set API key in .env)
|
| 115 |
+
python3 baseline_inference.py # Groq (default, free)
|
| 116 |
+
python3 baseline_inference.py --provider cerebras # Cerebras (free)
|
| 117 |
+
python3 baseline_inference.py --provider gemini # Google Gemini
|
| 118 |
+
python3 baseline_inference.py --provider openai # OpenAI GPT-4o
|
| 119 |
+
|
| 120 |
+
# Run all baselines with comparison table
|
| 121 |
+
python3 run_all_baselines.py
|
| 122 |
+
```
|
| 123 |
|
| 124 |
## Setup
|
| 125 |
|
baseline_inference.py
CHANGED
|
@@ -173,33 +173,61 @@ def run_llm_episode(task_id: str, client: OpenAI, model_name: str) -> float:
|
|
| 173 |
return session.last_score if session and session.last_score is not None else 0.0
|
| 174 |
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
def main() -> None:
|
| 177 |
-
parser = argparse.ArgumentParser(description="LLM baseline agent
|
| 178 |
parser.add_argument("--url", default="http://localhost:7860")
|
| 179 |
-
parser.add_argument("--api-key", default=None, help="
|
| 180 |
parser.add_argument(
|
| 181 |
-
"--
|
| 182 |
-
default="
|
| 183 |
-
|
|
|
|
| 184 |
)
|
|
|
|
| 185 |
args = parser.parse_args()
|
| 186 |
|
| 187 |
-
|
|
|
|
| 188 |
if not api_key:
|
| 189 |
-
print("Error: Set
|
| 190 |
sys.exit(1)
|
| 191 |
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
|
|
|
| 196 |
|
| 197 |
scores: dict[str, float] = {}
|
| 198 |
-
print(f"Running LLM baseline with {args.
|
| 199 |
|
| 200 |
for task_id in ALL_TASKS:
|
| 201 |
try:
|
| 202 |
-
score = run_llm_episode(task_id, client,
|
| 203 |
scores[task_id] = round(score, 4)
|
| 204 |
print(f" {task_id}: {score:.4f}", file=sys.stderr)
|
| 205 |
except Exception as e:
|
|
|
|
| 173 |
return session.last_score if session and session.last_score is not None else 0.0
|
| 174 |
|
| 175 |
|
| 176 |
+
PROVIDERS = {
|
| 177 |
+
"groq": {
|
| 178 |
+
"env_key": "GROQ_API_KEY",
|
| 179 |
+
"base_url": "https://api.groq.com/openai/v1",
|
| 180 |
+
"default_model": "llama-3.3-70b-versatile",
|
| 181 |
+
},
|
| 182 |
+
"cerebras": {
|
| 183 |
+
"env_key": "CEREBRAS_API_KEY",
|
| 184 |
+
"base_url": "https://api.cerebras.ai/v1",
|
| 185 |
+
"default_model": "llama3.1-8b",
|
| 186 |
+
},
|
| 187 |
+
"gemini": {
|
| 188 |
+
"env_key": "GEMINI_API_KEY",
|
| 189 |
+
"base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
|
| 190 |
+
"default_model": "gemini-2.0-flash",
|
| 191 |
+
},
|
| 192 |
+
"openai": {
|
| 193 |
+
"env_key": "OPENAI_API_KEY",
|
| 194 |
+
"base_url": None,
|
| 195 |
+
"default_model": "gpt-4o",
|
| 196 |
+
},
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
|
| 200 |
def main() -> None:
|
| 201 |
+
parser = argparse.ArgumentParser(description="LLM baseline agent")
|
| 202 |
parser.add_argument("--url", default="http://localhost:7860")
|
| 203 |
+
parser.add_argument("--api-key", default=None, help="API key")
|
| 204 |
parser.add_argument(
|
| 205 |
+
"--provider",
|
| 206 |
+
default="groq",
|
| 207 |
+
choices=list(PROVIDERS.keys()),
|
| 208 |
+
help="LLM provider (default: groq)",
|
| 209 |
)
|
| 210 |
+
parser.add_argument("--model", default=None, help="Model name (auto-detected from provider)")
|
| 211 |
args = parser.parse_args()
|
| 212 |
|
| 213 |
+
prov = PROVIDERS[args.provider]
|
| 214 |
+
api_key = args.api_key or os.environ.get(prov["env_key"])
|
| 215 |
if not api_key:
|
| 216 |
+
print(f"Error: Set {prov['env_key']} env var or pass --api-key")
|
| 217 |
sys.exit(1)
|
| 218 |
|
| 219 |
+
model_name = args.model or prov["default_model"]
|
| 220 |
+
client_kwargs: dict = {"api_key": api_key}
|
| 221 |
+
if prov["base_url"]:
|
| 222 |
+
client_kwargs["base_url"] = prov["base_url"]
|
| 223 |
+
client = OpenAI(**client_kwargs)
|
| 224 |
|
| 225 |
scores: dict[str, float] = {}
|
| 226 |
+
print(f"Running LLM baseline with {args.provider}/{model_name}...", file=sys.stderr)
|
| 227 |
|
| 228 |
for task_id in ALL_TASKS:
|
| 229 |
try:
|
| 230 |
+
score = run_llm_episode(task_id, client, model_name)
|
| 231 |
scores[task_id] = round(score, 4)
|
| 232 |
print(f" {task_id}: {score:.4f}", file=sys.stderr)
|
| 233 |
except Exception as e:
|
run_all_baselines.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Run heuristic + multiple LLM baselines and show comparison table.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
python3 run_all_baselines.py
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
import time
|
| 14 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
# Load .env
|
| 18 |
+
_env_path = Path(__file__).parent / ".env"
|
| 19 |
+
if _env_path.exists():
|
| 20 |
+
for line in _env_path.read_text().splitlines():
|
| 21 |
+
line = line.strip()
|
| 22 |
+
if line and not line.startswith("#") and "=" in line:
|
| 23 |
+
key, _, value = line.partition("=")
|
| 24 |
+
os.environ.setdefault(key.strip(), value.strip())
|
| 25 |
+
|
| 26 |
+
from baseline_heuristic import ALL_TASKS
|
| 27 |
+
from baseline_heuristic import run_heuristic_episode
|
| 28 |
+
from baseline_inference import PROVIDERS, run_llm_episode
|
| 29 |
+
|
| 30 |
+
try:
|
| 31 |
+
from openai import OpenAI
|
| 32 |
+
except ImportError:
|
| 33 |
+
print("Error: pip install openai")
|
| 34 |
+
sys.exit(1)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def run_heuristic() -> dict[str, float]:
|
| 38 |
+
scores = {}
|
| 39 |
+
for task_id in ALL_TASKS:
|
| 40 |
+
scores[task_id] = round(run_heuristic_episode(task_id), 4)
|
| 41 |
+
return scores
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def run_llm_provider(provider_name: str, model: str | None = None) -> dict[str, float]:
|
| 45 |
+
prov = PROVIDERS[provider_name]
|
| 46 |
+
api_key = os.environ.get(prov["env_key"])
|
| 47 |
+
if not api_key:
|
| 48 |
+
return {t: -1.0 for t in ALL_TASKS} # -1 = no key
|
| 49 |
+
|
| 50 |
+
model_name = model or prov["default_model"]
|
| 51 |
+
client_kwargs: dict = {"api_key": api_key}
|
| 52 |
+
if prov["base_url"]:
|
| 53 |
+
client_kwargs["base_url"] = prov["base_url"]
|
| 54 |
+
client = OpenAI(**client_kwargs)
|
| 55 |
+
|
| 56 |
+
scores: dict[str, float] = {}
|
| 57 |
+
for task_id in ALL_TASKS:
|
| 58 |
+
try:
|
| 59 |
+
score = run_llm_episode(task_id, client, model_name)
|
| 60 |
+
scores[task_id] = round(score, 4)
|
| 61 |
+
print(f" [{provider_name}/{model_name}] {task_id}: {score:.4f}", file=sys.stderr)
|
| 62 |
+
except Exception as e:
|
| 63 |
+
err_str = str(e)[:80]
|
| 64 |
+
print(f" [{provider_name}/{model_name}] {task_id}: ERROR — {err_str}", file=sys.stderr)
|
| 65 |
+
scores[task_id] = 0.0
|
| 66 |
+
return scores
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def main() -> None:
|
| 70 |
+
print("Running all baselines...\n", file=sys.stderr)
|
| 71 |
+
|
| 72 |
+
results: dict[str, dict[str, float]] = {}
|
| 73 |
+
|
| 74 |
+
# Run heuristic first (fast, deterministic)
|
| 75 |
+
print("--- Heuristic baseline ---", file=sys.stderr)
|
| 76 |
+
results["Heuristic"] = run_heuristic()
|
| 77 |
+
print(f" Done: {json.dumps(results['Heuristic'])}", file=sys.stderr)
|
| 78 |
+
|
| 79 |
+
# Run LLM providers sequentially (avoids thread hang issues)
|
| 80 |
+
llm_runs = [
|
| 81 |
+
("Cerebras/Llama-3.1-8B", "cerebras", "llama3.1-8b"),
|
| 82 |
+
("Groq/Llama-3.1-8B", "groq", "llama-3.1-8b-instant"),
|
| 83 |
+
]
|
| 84 |
+
|
| 85 |
+
for label, provider, model in llm_runs:
|
| 86 |
+
print(f"\n--- {label} ---", file=sys.stderr)
|
| 87 |
+
try:
|
| 88 |
+
results[label] = run_llm_provider(provider, model)
|
| 89 |
+
except Exception as e:
|
| 90 |
+
print(f" {label}: FAILED — {e}", file=sys.stderr)
|
| 91 |
+
results[label] = {t: 0.0 for t in ALL_TASKS}
|
| 92 |
+
|
| 93 |
+
# Print comparison table
|
| 94 |
+
print("\n" + "=" * 80)
|
| 95 |
+
print("BASELINE COMPARISON TABLE")
|
| 96 |
+
print("=" * 80)
|
| 97 |
+
|
| 98 |
+
headers = list(results.keys())
|
| 99 |
+
print(f"\n{'Task':<12}", end="")
|
| 100 |
+
for h in headers:
|
| 101 |
+
print(f"{h:>25}", end="")
|
| 102 |
+
print()
|
| 103 |
+
print("-" * (12 + 25 * len(headers)))
|
| 104 |
+
|
| 105 |
+
for task_id in ALL_TASKS:
|
| 106 |
+
print(f"{task_id:<12}", end="")
|
| 107 |
+
for h in headers:
|
| 108 |
+
score = results[h].get(task_id, 0.0)
|
| 109 |
+
if score < 0:
|
| 110 |
+
print(f"{'no key':>25}", end="")
|
| 111 |
+
else:
|
| 112 |
+
print(f"{score:>25.4f}", end="")
|
| 113 |
+
print()
|
| 114 |
+
|
| 115 |
+
print("-" * (12 + 25 * len(headers)))
|
| 116 |
+
|
| 117 |
+
# Averages
|
| 118 |
+
print(f"{'AVERAGE':<12}", end="")
|
| 119 |
+
for h in headers:
|
| 120 |
+
valid = [v for v in results[h].values() if v >= 0]
|
| 121 |
+
avg = sum(valid) / len(valid) if valid else 0
|
| 122 |
+
print(f"{avg:>25.4f}", end="")
|
| 123 |
+
print()
|
| 124 |
+
|
| 125 |
+
# Save JSON
|
| 126 |
+
print(json.dumps(results, indent=2))
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
if __name__ == "__main__":
|
| 130 |
+
main()
|