Upload jobs/run_real_llm_standalone.py
Browse files- jobs/run_real_llm_standalone.py +396 -1
jobs/run_real_llm_standalone.py
CHANGED
|
@@ -1 +1,396 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Self-contained GPU job for real LLM code benchmark.
|
| 3 |
+
Includes all OCC classes inline to avoid import issues.
|
| 4 |
+
"""
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import random
|
| 8 |
+
import re
|
| 9 |
+
import subprocess
|
| 10 |
+
import sys
|
| 11 |
+
import tempfile
|
| 12 |
+
import time
|
| 13 |
+
from dataclasses import dataclass, field
|
| 14 |
+
from enum import Enum
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 17 |
+
|
| 18 |
+
from datasets import load_dataset
|
| 19 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 20 |
+
import torch
|
| 21 |
+
|
| 22 |
+
# ============== ORACLE (INLINE) ==============
|
| 23 |
+
|
| 24 |
+
@dataclass
|
| 25 |
+
class OracleResult:
|
| 26 |
+
raw_score: float
|
| 27 |
+
cost_adjusted_score: float
|
| 28 |
+
confidence: float
|
| 29 |
+
evidence: Dict[str, Any]
|
| 30 |
+
reason: str
|
| 31 |
+
failure_tags: List[str] = field(default_factory=list)
|
| 32 |
+
reward_value: float = 0.0
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class ImpactOracle:
|
| 36 |
+
def __init__(self, compute_penalty_rate=0.0001, gaming_penalty=2.0):
|
| 37 |
+
self.compute_penalty_rate = compute_penalty_rate
|
| 38 |
+
self.gaming_penalty = gaming_penalty
|
| 39 |
+
|
| 40 |
+
def score(self, mode, action, context, result, agent_id=""):
|
| 41 |
+
correctness = result.get("correctness", 0.0)
|
| 42 |
+
compute_cost = result.get("compute_cost", 0.0)
|
| 43 |
+
public_pass = result.get("public_pass", correctness)
|
| 44 |
+
hidden_pass = result.get("hidden_tests_pass", correctness)
|
| 45 |
+
|
| 46 |
+
failure_tags = []
|
| 47 |
+
if public_pass and not hidden_pass:
|
| 48 |
+
failure_tags.append("gaming_hidden_tests")
|
| 49 |
+
|
| 50 |
+
raw = correctness * 1.0 - compute_cost * self.compute_penalty_rate
|
| 51 |
+
if "gaming_hidden_tests" in failure_tags:
|
| 52 |
+
raw -= self.gaming_penalty
|
| 53 |
+
|
| 54 |
+
cost_adj = raw - compute_cost * self.compute_penalty_rate
|
| 55 |
+
return OracleResult(
|
| 56 |
+
raw_score=raw,
|
| 57 |
+
cost_adjusted_score=cost_adj,
|
| 58 |
+
confidence=result.get("confidence", correctness),
|
| 59 |
+
evidence={"correctness": correctness},
|
| 60 |
+
reason=f"correctness={correctness:.2f}, cost={compute_cost}",
|
| 61 |
+
failure_tags=failure_tags,
|
| 62 |
+
reward_value=cost_adj,
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# ============== LEDGER (INLINE) ==============
|
| 66 |
+
|
| 67 |
+
@dataclass
|
| 68 |
+
class LedgerEntry:
|
| 69 |
+
agent_id: str
|
| 70 |
+
task_id: str
|
| 71 |
+
action_id: str
|
| 72 |
+
earned_credit: float
|
| 73 |
+
spent_credit: float
|
| 74 |
+
decayed_credit: float
|
| 75 |
+
remaining_credit: float
|
| 76 |
+
reason: str
|
| 77 |
+
oracle_score: float
|
| 78 |
+
compute_cost: float
|
| 79 |
+
timestamp: float
|
| 80 |
+
capability_scope: str = "global"
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
class CreditLedger:
|
| 84 |
+
def __init__(self, decay_lambda=0.05):
|
| 85 |
+
self.entries = []
|
| 86 |
+
self.balances = {}
|
| 87 |
+
self.decay_lambda = decay_lambda
|
| 88 |
+
|
| 89 |
+
def earn(self, agent_id, task_id, action_id, amount, oracle_score, compute_cost, reason, capability_scope="global"):
|
| 90 |
+
now = time.time()
|
| 91 |
+
self._apply_decay(agent_id, now, capability_scope)
|
| 92 |
+
current = self._get(agent_id, capability_scope)
|
| 93 |
+
new_bal = current + amount
|
| 94 |
+
self.entries.append(LedgerEntry(agent_id, task_id, action_id, amount, 0.0, 0.0, new_bal, reason, oracle_score, compute_cost, now, capability_scope))
|
| 95 |
+
self._set(agent_id, capability_scope, new_bal)
|
| 96 |
+
|
| 97 |
+
def spend(self, agent_id, task_id, action_id, amount, capability_scope="global", reason="spend"):
|
| 98 |
+
now = time.time()
|
| 99 |
+
self._apply_decay(agent_id, now, capability_scope)
|
| 100 |
+
current = self._get(agent_id, capability_scope)
|
| 101 |
+
if current < amount:
|
| 102 |
+
return False
|
| 103 |
+
new_bal = current - amount
|
| 104 |
+
self.entries.append(LedgerEntry(agent_id, task_id, action_id, 0.0, amount, 0.0, new_bal, reason, 0.0, 0.0, now, capability_scope))
|
| 105 |
+
self._set(agent_id, capability_scope, new_bal)
|
| 106 |
+
return True
|
| 107 |
+
|
| 108 |
+
def balance(self, agent_id, capability_scope="global"):
|
| 109 |
+
now = time.time()
|
| 110 |
+
self._apply_decay(agent_id, now, capability_scope)
|
| 111 |
+
return self._get(agent_id, capability_scope)
|
| 112 |
+
|
| 113 |
+
def _get(self, agent_id, cap):
|
| 114 |
+
return self.balances.get(agent_id, {}).get(cap, 0.0)
|
| 115 |
+
|
| 116 |
+
def _set(self, agent_id, cap, val):
|
| 117 |
+
if agent_id not in self.balances:
|
| 118 |
+
self.balances[agent_id] = {}
|
| 119 |
+
self.balances[agent_id][cap] = val
|
| 120 |
+
|
| 121 |
+
def _apply_decay(self, agent_id, now, cap):
|
| 122 |
+
current = self._get(agent_id, cap)
|
| 123 |
+
if current <= 0:
|
| 124 |
+
return
|
| 125 |
+
decayed = current * (1 - self.decay_lambda)
|
| 126 |
+
if decayed < current:
|
| 127 |
+
self.entries.append(LedgerEntry(agent_id, "decay", "decay", 0.0, 0.0, current - decayed, decayed, "credit_decay", 0.0, 0.0, now, cap))
|
| 128 |
+
self._set(agent_id, cap, decayed)
|
| 129 |
+
|
| 130 |
+
# ============== BROKER (INLINE) ==============
|
| 131 |
+
|
| 132 |
+
class Decision(Enum):
|
| 133 |
+
ALLOW = "allow"
|
| 134 |
+
DENY = "deny"
|
| 135 |
+
REQUIRE_APPROVAL = "require_approval"
|
| 136 |
+
DOWNGRADE = "downgrade"
|
| 137 |
+
ESCALATE = "escalate"
|
| 138 |
+
ASK_JUSTIFICATION = "ask_justification"
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
@dataclass
|
| 142 |
+
class ResourceDecision:
|
| 143 |
+
decision: Decision
|
| 144 |
+
reason: str
|
| 145 |
+
capability: str
|
| 146 |
+
downgrade_to: Optional[str] = None
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class ResourceBroker:
|
| 150 |
+
RESOURCE_RISK = {"model_call": "medium", "retrieval_call": "low", "verifier_call": "medium",
|
| 151 |
+
"debate_turn": "low", "file_write": "high", "shell_execute": "high",
|
| 152 |
+
"memory_write": "medium", "human_escalation": "high", "larger_model": "medium"}
|
| 153 |
+
DEFAULT_THRESHOLDS = {"low": 0.5, "medium": 2.0, "high": 5.0}
|
| 154 |
+
|
| 155 |
+
def __init__(self, thresholds=None, urgency_boost=0.5):
|
| 156 |
+
self.thresholds = thresholds or self.DEFAULT_THRESHOLDS.copy()
|
| 157 |
+
self.urgency_boost = urgency_boost
|
| 158 |
+
self.denial_history = {}
|
| 159 |
+
|
| 160 |
+
def request(self, capability, agent_id, credit_balance, task_state=None, risk_score=0.0, gaming_flags=None):
|
| 161 |
+
task_state = task_state or {}
|
| 162 |
+
gaming_flags = gaming_flags or []
|
| 163 |
+
risk_class = self.RESOURCE_RISK.get(capability, "medium")
|
| 164 |
+
threshold = self.thresholds.get(risk_class, 2.0)
|
| 165 |
+
urgency = task_state.get("urgency", 0.0)
|
| 166 |
+
adjusted = max(0.1, threshold - urgency * self.urgency_boost)
|
| 167 |
+
|
| 168 |
+
if gaming_flags:
|
| 169 |
+
return ResourceDecision(Decision.DENY, f"Gaming: {gaming_flags}", capability)
|
| 170 |
+
if risk_class == "high" and risk_score > 0.7:
|
| 171 |
+
return ResourceDecision(Decision.REQUIRE_APPROVAL, f"High risk {risk_score:.2f}", capability)
|
| 172 |
+
if credit_balance >= adjusted:
|
| 173 |
+
return ResourceDecision(Decision.ALLOW, f"Balance {credit_balance:.2f} >= {adjusted:.2f}", capability)
|
| 174 |
+
if credit_balance >= adjusted * 0.5:
|
| 175 |
+
if risk_class == "medium":
|
| 176 |
+
return ResourceDecision(Decision.DOWNGRADE, f"Downgrading from {capability}", capability, "retrieval_call")
|
| 177 |
+
return ResourceDecision(Decision.ASK_JUSTIFICATION, f"Justification required", capability)
|
| 178 |
+
denials = self.denial_history.get(agent_id, 0)
|
| 179 |
+
if denials > 3:
|
| 180 |
+
return ResourceDecision(Decision.ESCALATE, f"Denied {denials} times", capability)
|
| 181 |
+
self.denial_history[agent_id] = denials + 1
|
| 182 |
+
return ResourceDecision(Decision.DENY, f"Balance {credit_balance:.2f} < {adjusted:.2f}", capability)
|
| 183 |
+
|
| 184 |
+
# ============== BENCHMARK (INLINE) ==============
|
| 185 |
+
|
| 186 |
+
def extract_function_body(code, entry_point):
|
| 187 |
+
pattern = rf'def\s+{re.escape(entry_point)}\s*\([^)]*\).*?(?=\ndef\s|\Z)'
|
| 188 |
+
match = re.search(pattern, code, re.DOTALL)
|
| 189 |
+
return match.group(0) if match else code
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def run_tests(code, test_code, timeout=5):
|
| 193 |
+
full = code + "\n\n" + test_code + "\n\ncheck()\n"
|
| 194 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
|
| 195 |
+
f.write(full)
|
| 196 |
+
tmp = f.name
|
| 197 |
+
try:
|
| 198 |
+
result = subprocess.run(['python', tmp], capture_output=True, text=True, timeout=timeout)
|
| 199 |
+
passed = result.returncode == 0
|
| 200 |
+
error = result.stderr if not passed else ""
|
| 201 |
+
except subprocess.TimeoutExpired:
|
| 202 |
+
passed = False
|
| 203 |
+
error = "Timeout"
|
| 204 |
+
except Exception as e:
|
| 205 |
+
passed = False
|
| 206 |
+
error = str(e)
|
| 207 |
+
finally:
|
| 208 |
+
os.unlink(tmp)
|
| 209 |
+
return passed, error
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
class RealLLMBenchmark:
|
| 213 |
+
def __init__(self, model_name="Qwen/Qwen2.5-Coder-0.5B-Instruct", n_problems=20, seed=42):
|
| 214 |
+
self.model_name = model_name
|
| 215 |
+
self.n_problems = n_problems
|
| 216 |
+
self.seed = seed
|
| 217 |
+
self.oracle = ImpactOracle()
|
| 218 |
+
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 219 |
+
print(f"Using device: {self.device}")
|
| 220 |
+
|
| 221 |
+
def load_problems(self):
|
| 222 |
+
ds = load_dataset("openai/openai_humaneval", split="test")
|
| 223 |
+
problems = []
|
| 224 |
+
for i, item in enumerate(ds):
|
| 225 |
+
if i >= self.n_problems:
|
| 226 |
+
break
|
| 227 |
+
problems.append({
|
| 228 |
+
"task_id": item["task_id"],
|
| 229 |
+
"prompt": item["prompt"],
|
| 230 |
+
"test": item["test"],
|
| 231 |
+
"entry_point": item["entry_point"],
|
| 232 |
+
})
|
| 233 |
+
return problems
|
| 234 |
+
|
| 235 |
+
def load_model(self):
|
| 236 |
+
print(f"Loading {self.model_name}...")
|
| 237 |
+
tok = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
|
| 238 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 239 |
+
self.model_name,
|
| 240 |
+
trust_remote_code=True,
|
| 241 |
+
dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
|
| 242 |
+
device_map="auto" if self.device == "cuda" else None,
|
| 243 |
+
)
|
| 244 |
+
if self.device == "cpu":
|
| 245 |
+
model = model.to("cpu")
|
| 246 |
+
print("Model loaded.")
|
| 247 |
+
return model, tok
|
| 248 |
+
|
| 249 |
+
def generate(self, model, tok, prompt, max_new_tokens=256, temperature=0.7):
|
| 250 |
+
inputs = tok(prompt, return_tensors="pt").to(model.device)
|
| 251 |
+
with torch.no_grad():
|
| 252 |
+
outputs = model.generate(
|
| 253 |
+
**inputs,
|
| 254 |
+
max_new_tokens=max_new_tokens,
|
| 255 |
+
temperature=temperature,
|
| 256 |
+
do_sample=True,
|
| 257 |
+
pad_token_id=tok.eos_token_id,
|
| 258 |
+
)
|
| 259 |
+
gen = tok.decode(outputs[0], skip_special_tokens=True)
|
| 260 |
+
return gen[len(prompt):]
|
| 261 |
+
|
| 262 |
+
def run_baseline(self, problems, model, tok):
|
| 263 |
+
results = []
|
| 264 |
+
total_compute = 0
|
| 265 |
+
for problem in problems:
|
| 266 |
+
code = self.generate(model, tok, problem["prompt"], max_new_tokens=256)
|
| 267 |
+
tokens = len(tok.encode(code))
|
| 268 |
+
total_compute += tokens
|
| 269 |
+
func = extract_function_body(code, problem["entry_point"])
|
| 270 |
+
full = problem["prompt"] + func
|
| 271 |
+
passed, error = run_tests(full, problem["test"])
|
| 272 |
+
results.append({"task_id": problem["task_id"], "passed": passed, "tokens": tokens, "error": error})
|
| 273 |
+
print(f" {problem['task_id']}: passed={passed}, tokens={tokens}")
|
| 274 |
+
return {
|
| 275 |
+
"accuracy": sum(1 for r in results if r["passed"]) / len(results),
|
| 276 |
+
"total_compute": total_compute,
|
| 277 |
+
"mean_tokens": total_compute / len(problems),
|
| 278 |
+
"results": results,
|
| 279 |
+
}
|
| 280 |
+
|
| 281 |
+
def run_occ(self, problems, model, tok):
|
| 282 |
+
ledger = CreditLedger(decay_lambda=0.02)
|
| 283 |
+
broker = ResourceBroker()
|
| 284 |
+
ledger.earn("code_agent", "seed", "seed", 25.0, 0.0, 0.0, "initial", "model_call")
|
| 285 |
+
results = []
|
| 286 |
+
total_compute = 0
|
| 287 |
+
|
| 288 |
+
for problem in problems:
|
| 289 |
+
budget_remaining = 2000
|
| 290 |
+
attempts = 0
|
| 291 |
+
passed = False
|
| 292 |
+
best_code = ""
|
| 293 |
+
best_score = -999
|
| 294 |
+
|
| 295 |
+
while budget_remaining > 100 and attempts < 3 and not passed:
|
| 296 |
+
attempts += 1
|
| 297 |
+
balance = ledger.balance("code_agent", "model_call")
|
| 298 |
+
dec = broker.request("model_call", "code_agent", balance,
|
| 299 |
+
task_state={"attempts": attempts, "budget_remaining": budget_remaining})
|
| 300 |
+
if dec.decision == Decision.DENY:
|
| 301 |
+
break
|
| 302 |
+
|
| 303 |
+
temp = 0.5 if attempts == 1 else 0.8
|
| 304 |
+
max_tok = 128 if attempts == 1 else 256
|
| 305 |
+
code = self.generate(model, tok, problem["prompt"], max_new_tokens=max_tok, temperature=temp)
|
| 306 |
+
tokens = len(tok.encode(code))
|
| 307 |
+
budget_remaining -= tokens
|
| 308 |
+
total_compute += tokens
|
| 309 |
+
|
| 310 |
+
func = extract_function_body(code, problem["entry_point"])
|
| 311 |
+
full = problem["prompt"] + func
|
| 312 |
+
passed_now, error = run_tests(full, problem["test"])
|
| 313 |
+
score = 1.0 if passed_now else 0.0
|
| 314 |
+
|
| 315 |
+
oracle_res = self.oracle.score(
|
| 316 |
+
mode="code",
|
| 317 |
+
action={"attempt": attempts},
|
| 318 |
+
context={},
|
| 319 |
+
result={"correctness": score, "pass_at_k": score, "regression": False, "compute_cost": tokens, "public_pass": passed_now, "hidden_tests_pass": passed_now},
|
| 320 |
+
agent_id="code_agent",
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
if oracle_res.raw_score > best_score:
|
| 324 |
+
best_score = oracle_res.raw_score
|
| 325 |
+
best_code = code
|
| 326 |
+
passed = passed_now
|
| 327 |
+
|
| 328 |
+
if passed:
|
| 329 |
+
ledger.earn("code_agent", problem["task_id"], f"att_{attempts}", 5.0, oracle_res.raw_score, tokens, "pass", "model_call")
|
| 330 |
+
else:
|
| 331 |
+
ledger.spend("code_agent", problem["task_id"], f"att_{attempts}", 1.0, "model_call", reason="fail")
|
| 332 |
+
|
| 333 |
+
if attempts >= 2 and not passed:
|
| 334 |
+
break
|
| 335 |
+
|
| 336 |
+
results.append({"task_id": problem["task_id"], "passed": passed, "attempts": attempts,
|
| 337 |
+
"tokens_used": 2000 - budget_remaining, "best_score": best_score})
|
| 338 |
+
print(f" {problem['task_id']}: passed={passed}, attempts={attempts}")
|
| 339 |
+
|
| 340 |
+
return {
|
| 341 |
+
"accuracy": sum(1 for r in results if r["passed"]) / len(results),
|
| 342 |
+
"total_compute": total_compute,
|
| 343 |
+
"mean_tokens": total_compute / len(problems),
|
| 344 |
+
"mean_attempts": sum(r["attempts"] for r in results) / len(results),
|
| 345 |
+
"results": results,
|
| 346 |
+
}
|
| 347 |
+
|
| 348 |
+
def run_all(self):
|
| 349 |
+
problems = self.load_problems()
|
| 350 |
+
print(f"Loaded {len(problems)} problems")
|
| 351 |
+
model, tok = self.load_model()
|
| 352 |
+
|
| 353 |
+
print("\n--- Baseline ---")
|
| 354 |
+
baseline = self.run_baseline(problems, model, tok)
|
| 355 |
+
print(f"Baseline: accuracy={baseline['accuracy']:.3f}, compute={baseline['total_compute']}")
|
| 356 |
+
|
| 357 |
+
print("\n--- OCC ---")
|
| 358 |
+
occ = self.run_occ(problems, model, tok)
|
| 359 |
+
print(f"OCC: accuracy={occ['accuracy']:.3f}, compute={occ['total_compute']}")
|
| 360 |
+
|
| 361 |
+
comp = {
|
| 362 |
+
"baseline_accuracy": baseline["accuracy"],
|
| 363 |
+
"occ_accuracy": occ["accuracy"],
|
| 364 |
+
"baseline_compute": baseline["total_compute"],
|
| 365 |
+
"occ_compute": occ["total_compute"],
|
| 366 |
+
"compute_reduction": 1.0 - (occ["total_compute"] / max(baseline["total_compute"], 1)),
|
| 367 |
+
"accuracy_delta": occ["accuracy"] - baseline["accuracy"],
|
| 368 |
+
}
|
| 369 |
+
|
| 370 |
+
return {"baseline_fixed": baseline, "occ_budget": occ, "comparison": comp}
|
| 371 |
+
|
| 372 |
+
|
| 373 |
+
def main():
|
| 374 |
+
bench = RealLLMBenchmark(n_problems=20, seed=42)
|
| 375 |
+
results = bench.run_all()
|
| 376 |
+
|
| 377 |
+
print("\n" + "=" * 60)
|
| 378 |
+
print("REAL LLM CODE BENCHMARK")
|
| 379 |
+
print("=" * 60)
|
| 380 |
+
comp = results["comparison"]
|
| 381 |
+
print(f"Baseline accuracy: {comp['baseline_accuracy']:.3f}")
|
| 382 |
+
print(f"OCC accuracy: {comp['occ_accuracy']:.3f}")
|
| 383 |
+
print(f"Baseline compute: {comp['baseline_compute']}")
|
| 384 |
+
print(f"OCC compute: {comp['occ_compute']}")
|
| 385 |
+
print(f"Compute reduction: {comp['compute_reduction']:.1%}")
|
| 386 |
+
print(f"Accuracy delta: {comp['accuracy_delta']:+.3f}")
|
| 387 |
+
|
| 388 |
+
out_dir = Path("/app/occ/reports")
|
| 389 |
+
out_dir.mkdir(parents=True, exist_ok=True)
|
| 390 |
+
with open(out_dir / "benchmark_code_real_llm_results.json", "w") as f:
|
| 391 |
+
json.dump(results, f, indent=2, default=str)
|
| 392 |
+
print(f"\nSaved to {out_dir / 'benchmark_code_real_llm_results.json'}")
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
if __name__ == "__main__":
|
| 396 |
+
main()
|