narcolepticchicken commited on
Commit
5a238af
·
verified ·
1 Parent(s): fddd9d2

Upload jobs/run_real_llm_standalone.py

Browse files
Files changed (1) hide show
  1. jobs/run_real_llm_standalone.py +396 -1
jobs/run_real_llm_standalone.py CHANGED
@@ -1 +1,396 @@
1
- See /app/occ/jobs/run_real_llm_standalone.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Self-contained GPU job for real LLM code benchmark.
3
+ Includes all OCC classes inline to avoid import issues.
4
+ """
5
+ import json
6
+ import os
7
+ import random
8
+ import re
9
+ import subprocess
10
+ import sys
11
+ import tempfile
12
+ import time
13
+ from dataclasses import dataclass, field
14
+ from enum import Enum
15
+ from pathlib import Path
16
+ from typing import Any, Dict, List, Optional, Tuple
17
+
18
+ from datasets import load_dataset
19
+ from transformers import AutoModelForCausalLM, AutoTokenizer
20
+ import torch
21
+
22
+ # ============== ORACLE (INLINE) ==============
23
+
24
+ @dataclass
25
+ class OracleResult:
26
+ raw_score: float
27
+ cost_adjusted_score: float
28
+ confidence: float
29
+ evidence: Dict[str, Any]
30
+ reason: str
31
+ failure_tags: List[str] = field(default_factory=list)
32
+ reward_value: float = 0.0
33
+
34
+
35
+ class ImpactOracle:
36
+ def __init__(self, compute_penalty_rate=0.0001, gaming_penalty=2.0):
37
+ self.compute_penalty_rate = compute_penalty_rate
38
+ self.gaming_penalty = gaming_penalty
39
+
40
+ def score(self, mode, action, context, result, agent_id=""):
41
+ correctness = result.get("correctness", 0.0)
42
+ compute_cost = result.get("compute_cost", 0.0)
43
+ public_pass = result.get("public_pass", correctness)
44
+ hidden_pass = result.get("hidden_tests_pass", correctness)
45
+
46
+ failure_tags = []
47
+ if public_pass and not hidden_pass:
48
+ failure_tags.append("gaming_hidden_tests")
49
+
50
+ raw = correctness * 1.0 - compute_cost * self.compute_penalty_rate
51
+ if "gaming_hidden_tests" in failure_tags:
52
+ raw -= self.gaming_penalty
53
+
54
+ cost_adj = raw - compute_cost * self.compute_penalty_rate
55
+ return OracleResult(
56
+ raw_score=raw,
57
+ cost_adjusted_score=cost_adj,
58
+ confidence=result.get("confidence", correctness),
59
+ evidence={"correctness": correctness},
60
+ reason=f"correctness={correctness:.2f}, cost={compute_cost}",
61
+ failure_tags=failure_tags,
62
+ reward_value=cost_adj,
63
+ )
64
+
65
+ # ============== LEDGER (INLINE) ==============
66
+
67
+ @dataclass
68
+ class LedgerEntry:
69
+ agent_id: str
70
+ task_id: str
71
+ action_id: str
72
+ earned_credit: float
73
+ spent_credit: float
74
+ decayed_credit: float
75
+ remaining_credit: float
76
+ reason: str
77
+ oracle_score: float
78
+ compute_cost: float
79
+ timestamp: float
80
+ capability_scope: str = "global"
81
+
82
+
83
+ class CreditLedger:
84
+ def __init__(self, decay_lambda=0.05):
85
+ self.entries = []
86
+ self.balances = {}
87
+ self.decay_lambda = decay_lambda
88
+
89
+ def earn(self, agent_id, task_id, action_id, amount, oracle_score, compute_cost, reason, capability_scope="global"):
90
+ now = time.time()
91
+ self._apply_decay(agent_id, now, capability_scope)
92
+ current = self._get(agent_id, capability_scope)
93
+ new_bal = current + amount
94
+ self.entries.append(LedgerEntry(agent_id, task_id, action_id, amount, 0.0, 0.0, new_bal, reason, oracle_score, compute_cost, now, capability_scope))
95
+ self._set(agent_id, capability_scope, new_bal)
96
+
97
+ def spend(self, agent_id, task_id, action_id, amount, capability_scope="global", reason="spend"):
98
+ now = time.time()
99
+ self._apply_decay(agent_id, now, capability_scope)
100
+ current = self._get(agent_id, capability_scope)
101
+ if current < amount:
102
+ return False
103
+ new_bal = current - amount
104
+ self.entries.append(LedgerEntry(agent_id, task_id, action_id, 0.0, amount, 0.0, new_bal, reason, 0.0, 0.0, now, capability_scope))
105
+ self._set(agent_id, capability_scope, new_bal)
106
+ return True
107
+
108
+ def balance(self, agent_id, capability_scope="global"):
109
+ now = time.time()
110
+ self._apply_decay(agent_id, now, capability_scope)
111
+ return self._get(agent_id, capability_scope)
112
+
113
+ def _get(self, agent_id, cap):
114
+ return self.balances.get(agent_id, {}).get(cap, 0.0)
115
+
116
+ def _set(self, agent_id, cap, val):
117
+ if agent_id not in self.balances:
118
+ self.balances[agent_id] = {}
119
+ self.balances[agent_id][cap] = val
120
+
121
+ def _apply_decay(self, agent_id, now, cap):
122
+ current = self._get(agent_id, cap)
123
+ if current <= 0:
124
+ return
125
+ decayed = current * (1 - self.decay_lambda)
126
+ if decayed < current:
127
+ self.entries.append(LedgerEntry(agent_id, "decay", "decay", 0.0, 0.0, current - decayed, decayed, "credit_decay", 0.0, 0.0, now, cap))
128
+ self._set(agent_id, cap, decayed)
129
+
130
+ # ============== BROKER (INLINE) ==============
131
+
132
+ class Decision(Enum):
133
+ ALLOW = "allow"
134
+ DENY = "deny"
135
+ REQUIRE_APPROVAL = "require_approval"
136
+ DOWNGRADE = "downgrade"
137
+ ESCALATE = "escalate"
138
+ ASK_JUSTIFICATION = "ask_justification"
139
+
140
+
141
+ @dataclass
142
+ class ResourceDecision:
143
+ decision: Decision
144
+ reason: str
145
+ capability: str
146
+ downgrade_to: Optional[str] = None
147
+
148
+
149
+ class ResourceBroker:
150
+ RESOURCE_RISK = {"model_call": "medium", "retrieval_call": "low", "verifier_call": "medium",
151
+ "debate_turn": "low", "file_write": "high", "shell_execute": "high",
152
+ "memory_write": "medium", "human_escalation": "high", "larger_model": "medium"}
153
+ DEFAULT_THRESHOLDS = {"low": 0.5, "medium": 2.0, "high": 5.0}
154
+
155
+ def __init__(self, thresholds=None, urgency_boost=0.5):
156
+ self.thresholds = thresholds or self.DEFAULT_THRESHOLDS.copy()
157
+ self.urgency_boost = urgency_boost
158
+ self.denial_history = {}
159
+
160
+ def request(self, capability, agent_id, credit_balance, task_state=None, risk_score=0.0, gaming_flags=None):
161
+ task_state = task_state or {}
162
+ gaming_flags = gaming_flags or []
163
+ risk_class = self.RESOURCE_RISK.get(capability, "medium")
164
+ threshold = self.thresholds.get(risk_class, 2.0)
165
+ urgency = task_state.get("urgency", 0.0)
166
+ adjusted = max(0.1, threshold - urgency * self.urgency_boost)
167
+
168
+ if gaming_flags:
169
+ return ResourceDecision(Decision.DENY, f"Gaming: {gaming_flags}", capability)
170
+ if risk_class == "high" and risk_score > 0.7:
171
+ return ResourceDecision(Decision.REQUIRE_APPROVAL, f"High risk {risk_score:.2f}", capability)
172
+ if credit_balance >= adjusted:
173
+ return ResourceDecision(Decision.ALLOW, f"Balance {credit_balance:.2f} >= {adjusted:.2f}", capability)
174
+ if credit_balance >= adjusted * 0.5:
175
+ if risk_class == "medium":
176
+ return ResourceDecision(Decision.DOWNGRADE, f"Downgrading from {capability}", capability, "retrieval_call")
177
+ return ResourceDecision(Decision.ASK_JUSTIFICATION, f"Justification required", capability)
178
+ denials = self.denial_history.get(agent_id, 0)
179
+ if denials > 3:
180
+ return ResourceDecision(Decision.ESCALATE, f"Denied {denials} times", capability)
181
+ self.denial_history[agent_id] = denials + 1
182
+ return ResourceDecision(Decision.DENY, f"Balance {credit_balance:.2f} < {adjusted:.2f}", capability)
183
+
184
+ # ============== BENCHMARK (INLINE) ==============
185
+
186
+ def extract_function_body(code, entry_point):
187
+ pattern = rf'def\s+{re.escape(entry_point)}\s*\([^)]*\).*?(?=\ndef\s|\Z)'
188
+ match = re.search(pattern, code, re.DOTALL)
189
+ return match.group(0) if match else code
190
+
191
+
192
+ def run_tests(code, test_code, timeout=5):
193
+ full = code + "\n\n" + test_code + "\n\ncheck()\n"
194
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
195
+ f.write(full)
196
+ tmp = f.name
197
+ try:
198
+ result = subprocess.run(['python', tmp], capture_output=True, text=True, timeout=timeout)
199
+ passed = result.returncode == 0
200
+ error = result.stderr if not passed else ""
201
+ except subprocess.TimeoutExpired:
202
+ passed = False
203
+ error = "Timeout"
204
+ except Exception as e:
205
+ passed = False
206
+ error = str(e)
207
+ finally:
208
+ os.unlink(tmp)
209
+ return passed, error
210
+
211
+
212
+ class RealLLMBenchmark:
213
+ def __init__(self, model_name="Qwen/Qwen2.5-Coder-0.5B-Instruct", n_problems=20, seed=42):
214
+ self.model_name = model_name
215
+ self.n_problems = n_problems
216
+ self.seed = seed
217
+ self.oracle = ImpactOracle()
218
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
219
+ print(f"Using device: {self.device}")
220
+
221
+ def load_problems(self):
222
+ ds = load_dataset("openai/openai_humaneval", split="test")
223
+ problems = []
224
+ for i, item in enumerate(ds):
225
+ if i >= self.n_problems:
226
+ break
227
+ problems.append({
228
+ "task_id": item["task_id"],
229
+ "prompt": item["prompt"],
230
+ "test": item["test"],
231
+ "entry_point": item["entry_point"],
232
+ })
233
+ return problems
234
+
235
+ def load_model(self):
236
+ print(f"Loading {self.model_name}...")
237
+ tok = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
238
+ model = AutoModelForCausalLM.from_pretrained(
239
+ self.model_name,
240
+ trust_remote_code=True,
241
+ dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
242
+ device_map="auto" if self.device == "cuda" else None,
243
+ )
244
+ if self.device == "cpu":
245
+ model = model.to("cpu")
246
+ print("Model loaded.")
247
+ return model, tok
248
+
249
+ def generate(self, model, tok, prompt, max_new_tokens=256, temperature=0.7):
250
+ inputs = tok(prompt, return_tensors="pt").to(model.device)
251
+ with torch.no_grad():
252
+ outputs = model.generate(
253
+ **inputs,
254
+ max_new_tokens=max_new_tokens,
255
+ temperature=temperature,
256
+ do_sample=True,
257
+ pad_token_id=tok.eos_token_id,
258
+ )
259
+ gen = tok.decode(outputs[0], skip_special_tokens=True)
260
+ return gen[len(prompt):]
261
+
262
+ def run_baseline(self, problems, model, tok):
263
+ results = []
264
+ total_compute = 0
265
+ for problem in problems:
266
+ code = self.generate(model, tok, problem["prompt"], max_new_tokens=256)
267
+ tokens = len(tok.encode(code))
268
+ total_compute += tokens
269
+ func = extract_function_body(code, problem["entry_point"])
270
+ full = problem["prompt"] + func
271
+ passed, error = run_tests(full, problem["test"])
272
+ results.append({"task_id": problem["task_id"], "passed": passed, "tokens": tokens, "error": error})
273
+ print(f" {problem['task_id']}: passed={passed}, tokens={tokens}")
274
+ return {
275
+ "accuracy": sum(1 for r in results if r["passed"]) / len(results),
276
+ "total_compute": total_compute,
277
+ "mean_tokens": total_compute / len(problems),
278
+ "results": results,
279
+ }
280
+
281
+ def run_occ(self, problems, model, tok):
282
+ ledger = CreditLedger(decay_lambda=0.02)
283
+ broker = ResourceBroker()
284
+ ledger.earn("code_agent", "seed", "seed", 25.0, 0.0, 0.0, "initial", "model_call")
285
+ results = []
286
+ total_compute = 0
287
+
288
+ for problem in problems:
289
+ budget_remaining = 2000
290
+ attempts = 0
291
+ passed = False
292
+ best_code = ""
293
+ best_score = -999
294
+
295
+ while budget_remaining > 100 and attempts < 3 and not passed:
296
+ attempts += 1
297
+ balance = ledger.balance("code_agent", "model_call")
298
+ dec = broker.request("model_call", "code_agent", balance,
299
+ task_state={"attempts": attempts, "budget_remaining": budget_remaining})
300
+ if dec.decision == Decision.DENY:
301
+ break
302
+
303
+ temp = 0.5 if attempts == 1 else 0.8
304
+ max_tok = 128 if attempts == 1 else 256
305
+ code = self.generate(model, tok, problem["prompt"], max_new_tokens=max_tok, temperature=temp)
306
+ tokens = len(tok.encode(code))
307
+ budget_remaining -= tokens
308
+ total_compute += tokens
309
+
310
+ func = extract_function_body(code, problem["entry_point"])
311
+ full = problem["prompt"] + func
312
+ passed_now, error = run_tests(full, problem["test"])
313
+ score = 1.0 if passed_now else 0.0
314
+
315
+ oracle_res = self.oracle.score(
316
+ mode="code",
317
+ action={"attempt": attempts},
318
+ context={},
319
+ result={"correctness": score, "pass_at_k": score, "regression": False, "compute_cost": tokens, "public_pass": passed_now, "hidden_tests_pass": passed_now},
320
+ agent_id="code_agent",
321
+ )
322
+
323
+ if oracle_res.raw_score > best_score:
324
+ best_score = oracle_res.raw_score
325
+ best_code = code
326
+ passed = passed_now
327
+
328
+ if passed:
329
+ ledger.earn("code_agent", problem["task_id"], f"att_{attempts}", 5.0, oracle_res.raw_score, tokens, "pass", "model_call")
330
+ else:
331
+ ledger.spend("code_agent", problem["task_id"], f"att_{attempts}", 1.0, "model_call", reason="fail")
332
+
333
+ if attempts >= 2 and not passed:
334
+ break
335
+
336
+ results.append({"task_id": problem["task_id"], "passed": passed, "attempts": attempts,
337
+ "tokens_used": 2000 - budget_remaining, "best_score": best_score})
338
+ print(f" {problem['task_id']}: passed={passed}, attempts={attempts}")
339
+
340
+ return {
341
+ "accuracy": sum(1 for r in results if r["passed"]) / len(results),
342
+ "total_compute": total_compute,
343
+ "mean_tokens": total_compute / len(problems),
344
+ "mean_attempts": sum(r["attempts"] for r in results) / len(results),
345
+ "results": results,
346
+ }
347
+
348
+ def run_all(self):
349
+ problems = self.load_problems()
350
+ print(f"Loaded {len(problems)} problems")
351
+ model, tok = self.load_model()
352
+
353
+ print("\n--- Baseline ---")
354
+ baseline = self.run_baseline(problems, model, tok)
355
+ print(f"Baseline: accuracy={baseline['accuracy']:.3f}, compute={baseline['total_compute']}")
356
+
357
+ print("\n--- OCC ---")
358
+ occ = self.run_occ(problems, model, tok)
359
+ print(f"OCC: accuracy={occ['accuracy']:.3f}, compute={occ['total_compute']}")
360
+
361
+ comp = {
362
+ "baseline_accuracy": baseline["accuracy"],
363
+ "occ_accuracy": occ["accuracy"],
364
+ "baseline_compute": baseline["total_compute"],
365
+ "occ_compute": occ["total_compute"],
366
+ "compute_reduction": 1.0 - (occ["total_compute"] / max(baseline["total_compute"], 1)),
367
+ "accuracy_delta": occ["accuracy"] - baseline["accuracy"],
368
+ }
369
+
370
+ return {"baseline_fixed": baseline, "occ_budget": occ, "comparison": comp}
371
+
372
+
373
+ def main():
374
+ bench = RealLLMBenchmark(n_problems=20, seed=42)
375
+ results = bench.run_all()
376
+
377
+ print("\n" + "=" * 60)
378
+ print("REAL LLM CODE BENCHMARK")
379
+ print("=" * 60)
380
+ comp = results["comparison"]
381
+ print(f"Baseline accuracy: {comp['baseline_accuracy']:.3f}")
382
+ print(f"OCC accuracy: {comp['occ_accuracy']:.3f}")
383
+ print(f"Baseline compute: {comp['baseline_compute']}")
384
+ print(f"OCC compute: {comp['occ_compute']}")
385
+ print(f"Compute reduction: {comp['compute_reduction']:.1%}")
386
+ print(f"Accuracy delta: {comp['accuracy_delta']:+.3f}")
387
+
388
+ out_dir = Path("/app/occ/reports")
389
+ out_dir.mkdir(parents=True, exist_ok=True)
390
+ with open(out_dir / "benchmark_code_real_llm_results.json", "w") as f:
391
+ json.dump(results, f, indent=2, default=str)
392
+ print(f"\nSaved to {out_dir / 'benchmark_code_real_llm_results.json'}")
393
+
394
+
395
+ if __name__ == "__main__":
396
+ main()