narcolepticchicken commited on
Commit
c130a96
·
verified ·
1 Parent(s): 5a238af

Upload jobs/run_real_llm_standalone_v2.py

Browse files
Files changed (1) hide show
  1. jobs/run_real_llm_standalone_v2.py +460 -0
jobs/run_real_llm_standalone_v2.py ADDED
@@ -0,0 +1,460 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Self-contained GPU job for real LLM code benchmark — V2 with Qwen chat templating.
3
+ V1 failed because HumanEval sends raw Python stubs but Qwen-Coder expects chat formatting.
4
+ This version wraps prompts in <|im_start|> markers.
5
+ """
6
+ import json
7
+ import os
8
+ import random
9
+ import re
10
+ import subprocess
11
+ import sys
12
+ import tempfile
13
+ import time
14
+ from dataclasses import dataclass, field
15
+ from enum import Enum
16
+ from pathlib import Path
17
+ from typing import Any, Dict, List, Optional, Tuple
18
+
19
+ from datasets import load_dataset
20
+ from transformers import AutoModelForCausalLM, AutoTokenizer
21
+ import torch
22
+
23
+ # ============== ORACLE (INLINE) ==============
24
+
25
+ @dataclass
26
+ class OracleResult:
27
+ raw_score: float
28
+ cost_adjusted_score: float
29
+ confidence: float
30
+ evidence: Dict[str, Any]
31
+ reason: str
32
+ failure_tags: List[str] = field(default_factory=list)
33
+ reward_value: float = 0.0
34
+
35
+
36
+ class ImpactOracle:
37
+ def __init__(self, compute_penalty_rate=0.0001, gaming_penalty=2.0):
38
+ self.compute_penalty_rate = compute_penalty_rate
39
+ self.gaming_penalty = gaming_penalty
40
+
41
+ def score(self, mode, action, context, result, agent_id=""):
42
+ correctness = result.get("correctness", 0.0)
43
+ compute_cost = result.get("compute_cost", 0.0)
44
+ public_pass = result.get("public_pass", correctness)
45
+ hidden_pass = result.get("hidden_tests_pass", correctness)
46
+
47
+ failure_tags = []
48
+ if public_pass and not hidden_pass:
49
+ failure_tags.append("gaming_hidden_tests")
50
+
51
+ raw = correctness * 1.0 - compute_cost * self.compute_penalty_rate
52
+ if "gaming_hidden_tests" in failure_tags:
53
+ raw -= self.gaming_penalty
54
+
55
+ cost_adj = raw - compute_cost * self.compute_penalty_rate
56
+ return OracleResult(
57
+ raw_score=raw,
58
+ cost_adjusted_score=cost_adj,
59
+ confidence=result.get("confidence", correctness),
60
+ evidence={"correctness": correctness},
61
+ reason=f"correctness={correctness:.2f}, cost={compute_cost}",
62
+ failure_tags=failure_tags,
63
+ reward_value=cost_adj,
64
+ )
65
+
66
+
67
+ # ============== LEDGER (INLINE) ==============
68
+
69
+ @dataclass
70
+ class LedgerEntry:
71
+ agent_id: str
72
+ task_id: str
73
+ action_id: str
74
+ earned_credit: float
75
+ spent_credit: float
76
+ decayed_credit: float
77
+ remaining_credit: float
78
+ reason: str
79
+ oracle_score: float
80
+ compute_cost: float
81
+ timestamp: float
82
+ capability_scope: str = "global"
83
+
84
+
85
+ class CreditLedger:
86
+ def __init__(self, decay_lambda=0.05):
87
+ self.entries = []
88
+ self.balances = {}
89
+ self.decay_lambda = decay_lambda
90
+
91
+ def earn(self, agent_id, task_id, action_id, amount, oracle_score, compute_cost, reason, capability_scope="global"):
92
+ now = time.time()
93
+ self._apply_decay(agent_id, now, capability_scope)
94
+ current = self._get(agent_id, capability_scope)
95
+ new_bal = current + amount
96
+ self.entries.append(LedgerEntry(agent_id, task_id, action_id, amount, 0.0, 0.0, new_bal, reason, oracle_score, compute_cost, now, capability_scope))
97
+ self._set(agent_id, capability_scope, new_bal)
98
+
99
+ def spend(self, agent_id, task_id, action_id, amount, capability_scope="global", reason="spend"):
100
+ now = time.time()
101
+ self._apply_decay(agent_id, now, capability_scope)
102
+ current = self._get(agent_id, capability_scope)
103
+ if current < amount:
104
+ return False
105
+ new_bal = current - amount
106
+ self.entries.append(LedgerEntry(agent_id, task_id, action_id, 0.0, amount, 0.0, new_bal, reason, 0.0, 0.0, now, capability_scope))
107
+ self._set(agent_id, capability_scope, new_bal)
108
+ return True
109
+
110
+ def balance(self, agent_id, capability_scope="global"):
111
+ now = time.time()
112
+ self._apply_decay(agent_id, now, capability_scope)
113
+ return self._get(agent_id, capability_scope)
114
+
115
+ def _get(self, agent_id, cap):
116
+ return self.balances.get(agent_id, {}).get(cap, 0.0)
117
+
118
+ def _set(self, agent_id, cap, val):
119
+ if agent_id not in self.balances:
120
+ self.balances[agent_id] = {}
121
+ self.balances[agent_id][cap] = val
122
+
123
+ def _apply_decay(self, agent_id, now, cap):
124
+ current = self._get(agent_id, cap)
125
+ if current <= 0:
126
+ return
127
+ decayed = current * (1 - self.decay_lambda)
128
+ if decayed < current:
129
+ self.entries.append(LedgerEntry(agent_id, "decay", "decay", 0.0, 0.0, current - decayed, decayed, "credit_decay", 0.0, 0.0, now, cap))
130
+ self._set(agent_id, cap, decayed)
131
+
132
+
133
+ # ============== BROKER (INLINE) ==============
134
+
135
+ class Decision(Enum):
136
+ ALLOW = "allow"
137
+ DENY = "deny"
138
+ REQUIRE_APPROVAL = "require_approval"
139
+ DOWNGRADE = "downgrade"
140
+ ESCALATE = "escalate"
141
+ ASK_JUSTIFICATION = "ask_justification"
142
+
143
+
144
+ @dataclass
145
+ class ResourceDecision:
146
+ decision: Decision
147
+ reason: str
148
+ capability: str
149
+ downgrade_to: Optional[str] = None
150
+
151
+
152
+ class ResourceBroker:
153
+ RESOURCE_RISK = {"model_call": "medium", "retrieval_call": "low", "verifier_call": "medium",
154
+ "debate_turn": "low", "file_write": "high", "shell_execute": "high",
155
+ "memory_write": "medium", "human_escalation": "high", "larger_model": "medium"}
156
+ DEFAULT_THRESHOLDS = {"low": 0.5, "medium": 2.0, "high": 5.0}
157
+
158
+ def __init__(self, thresholds=None, urgency_boost=0.5):
159
+ self.thresholds = thresholds or self.DEFAULT_THRESHOLDS.copy()
160
+ self.urgency_boost = urgency_boost
161
+ self.denial_history = {}
162
+
163
+ def request(self, capability, agent_id, credit_balance, task_state=None, risk_score=0.0, gaming_flags=None):
164
+ task_state = task_state or {}
165
+ gaming_flags = gaming_flags or []
166
+ risk_class = self.RESOURCE_RISK.get(capability, "medium")
167
+ threshold = self.thresholds.get(risk_class, 2.0)
168
+ urgency = task_state.get("urgency", 0.0)
169
+ adjusted = max(0.1, threshold - urgency * self.urgency_boost)
170
+
171
+ if gaming_flags:
172
+ return ResourceDecision(Decision.DENY, f"Gaming: {gaming_flags}", capability)
173
+ if risk_class == "high" and risk_score > 0.7:
174
+ return ResourceDecision(Decision.REQUIRE_APPROVAL, f"High risk {risk_score:.2f}", capability)
175
+ if credit_balance >= adjusted:
176
+ return ResourceDecision(Decision.ALLOW, f"Balance {credit_balance:.2f} >= {adjusted:.2f}", capability)
177
+ if credit_balance >= adjusted * 0.5:
178
+ if risk_class == "medium":
179
+ return ResourceDecision(Decision.DOWNGRADE, f"Downgrading from {capability}", capability, "retrieval_call")
180
+ return ResourceDecision(Decision.ASK_JUSTIFICATION, f"Justification required", capability)
181
+ denials = self.denial_history.get(agent_id, 0)
182
+ if denials > 3:
183
+ return ResourceDecision(Decision.ESCALATE, f"Denied {denials} times", capability)
184
+ self.denial_history[agent_id] = denials + 1
185
+ return ResourceDecision(Decision.DENY, f"Balance {credit_balance:.2f} < {adjusted:.2f}", capability)
186
+
187
+
188
+ # ============== HELPERS ==============
189
+
190
+ def extract_function_body(code, entry_point):
191
+ """Extract just the function body from generated code."""
192
+ pattern = rf'def\s+{re.escape(entry_point)}\s*\([^)]*\).*?(?=\n(?:def\s|\S|$))'
193
+ match = re.search(pattern, code, re.DOTALL)
194
+ if match:
195
+ return match.group(0)
196
+ # Fallback: take everything from 'def entry_point' to end
197
+ pattern2 = rf'def\s+{re.escape(entry_point)}\s*\([^)]*\).*'
198
+ match2 = re.search(pattern2, code, re.DOTALL)
199
+ return match2.group(0) if match2 else code
200
+
201
+
202
+ def run_tests(code, test_code, timeout=5):
203
+ full = code + "\n\n" + test_code + "\n\ncheck()\n"
204
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
205
+ f.write(full)
206
+ tmp = f.name
207
+ try:
208
+ result = subprocess.run(['python', tmp], capture_output=True, text=True, timeout=timeout)
209
+ passed = result.returncode == 0
210
+ error = result.stderr if not passed else ""
211
+ except subprocess.TimeoutExpired:
212
+ passed = False
213
+ error = "Timeout"
214
+ except Exception as e:
215
+ passed = False
216
+ error = str(e)
217
+ finally:
218
+ os.unlink(tmp)
219
+ return passed, error
220
+
221
+
222
+ # ============== QWEN CHAT TEMPLATE WRAPPER (V2 FIX) ==============
223
+
224
+ QWEN_SYSTEM = (
225
+ "You are an expert Python programmer. "
226
+ "Complete the function definition. Return ONLY the completed function body — no explanations, no markdown formatting."
227
+ )
228
+
229
+
230
+ def wrap_prompt_chat(humaneval_prompt: str, tok: AutoTokenizer) -> str:
231
+ """
232
+ V2 FIX: Wrap raw HumanEval prompt as Qwen chat-formatted input.
233
+
234
+ HumanEval sends:
235
+ 'def has_close_elements(numbers, threshold):\n """ Check if...'
236
+
237
+ Qwen-Coder-Instruct expects:
238
+ <|im_start|>system
239
+ You are an expert Python programmer...
240
+ <|im_end|>
241
+ <|im_start|>user
242
+ def has_close_elements(numbers, threshold):
243
+ """ Check if...
244
+ <|im_end|>
245
+ <|im_start|>assistant
246
+ """
247
+ messages = [
248
+ {"role": "system", "content": QWEN_SYSTEM},
249
+ {"role": "user", "content": humaneval_prompt.strip()},
250
+ ]
251
+ # Use the tokenizer's chat template if available
252
+ if hasattr(tok, "apply_chat_template") and tok.chat_template:
253
+ formatted = tok.apply_chat_template(
254
+ messages,
255
+ tokenize=False,
256
+ add_generation_prompt=True, # adds <|im_start|>assistant at end
257
+ )
258
+ return formatted
259
+ else:
260
+ # Manual fallback for Qwen-style models
261
+ return (
262
+ f"<|im_start|>system\n{QWEN_SYSTEM}<|im_end|>\n"
263
+ f"<|im_start|>user\n{humaneval_prompt.strip()}<|im_end|>\n"
264
+ f"<|im_start|>assistant\n"
265
+ )
266
+
267
+
268
+ # ============== BENCHMARK ==============
269
+
270
+ class RealLLMBenchmarkV2:
271
+ def __init__(self, model_name="Qwen/Qwen2.5-Coder-0.5B-Instruct", n_problems=20, seed=42):
272
+ self.model_name = model_name
273
+ self.n_problems = n_problems
274
+ self.seed = seed
275
+ self.oracle = ImpactOracle()
276
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
277
+ print(f"Using device: {self.device}")
278
+
279
+ def load_problems(self):
280
+ ds = load_dataset("openai/openai_humaneval", split="test")
281
+ problems = []
282
+ for i, item in enumerate(ds):
283
+ if i >= self.n_problems:
284
+ break
285
+ problems.append({
286
+ "task_id": item["task_id"],
287
+ "prompt": item["prompt"],
288
+ "test": item["test"],
289
+ "entry_point": item["entry_point"],
290
+ })
291
+ return problems
292
+
293
+ def load_model(self):
294
+ print(f"Loading {self.model_name}...")
295
+ tok = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
296
+ model = AutoModelForCausalLM.from_pretrained(
297
+ self.model_name,
298
+ trust_remote_code=True,
299
+ torch_dtype=torch.bfloat16 if self.device == "cuda" else torch.float32,
300
+ device_map="auto" if self.device == "cuda" else None,
301
+ )
302
+ if self.device == "cpu":
303
+ model = model.to("cpu").float()
304
+ print(f"Model loaded. Chat template: {bool(tok.chat_template)}")
305
+ return model, tok
306
+
307
+ def generate(self, model, tok, prompt_raw, max_new_tokens=256, temperature=0.7):
308
+ """Generate with chat template wrapping."""
309
+ chat_prompt = wrap_prompt_chat(prompt_raw, tok)
310
+ inputs = tok(chat_prompt, return_tensors="pt").to(model.device)
311
+ with torch.no_grad():
312
+ outputs = model.generate(
313
+ **inputs,
314
+ max_new_tokens=max_new_tokens,
315
+ temperature=temperature,
316
+ do_sample=True,
317
+ pad_token_id=tok.eos_token_id,
318
+ )
319
+ gen = tok.decode(outputs[0], skip_special_tokens=True)
320
+ # Strip the chat prompt prefix to get just the generated code
321
+ prompt_decoded = tok.decode(inputs.input_ids[0], skip_special_tokens=True)
322
+ code = gen[len(prompt_decoded):].strip()
323
+ return code
324
+
325
+ def run_baseline(self, problems, model, tok):
326
+ results = []
327
+ total_compute = 0
328
+ for problem in problems:
329
+ code = self.generate(model, tok, problem["prompt"], max_new_tokens=256)
330
+ tokens = len(tok.encode(code))
331
+ total_compute += tokens
332
+ func = extract_function_body(code, problem["entry_point"])
333
+ full = problem["prompt"] + func
334
+ passed, error = run_tests(full, problem["test"])
335
+ results.append({"task_id": problem["task_id"], "passed": passed, "tokens": tokens, "error": error})
336
+ print(f" {problem['task_id']}: passed={passed}, tokens={tokens}")
337
+ return {
338
+ "accuracy": sum(1 for r in results if r["passed"]) / len(results),
339
+ "total_compute": total_compute,
340
+ "mean_tokens": total_compute / len(problems),
341
+ "results": results,
342
+ }
343
+
344
+ def run_occ(self, problems, model, tok):
345
+ ledger = CreditLedger(decay_lambda=0.02)
346
+ broker = ResourceBroker()
347
+ ledger.earn("code_agent", "seed", "seed", 25.0, 0.0, 0.0, "initial", "model_call")
348
+ results = []
349
+ total_compute = 0
350
+
351
+ for problem in problems:
352
+ budget_remaining = 2000
353
+ attempts = 0
354
+ passed = False
355
+ best_code = ""
356
+ best_score = -999
357
+
358
+ while budget_remaining > 100 and attempts < 3 and not passed:
359
+ attempts += 1
360
+ balance = ledger.balance("code_agent", "model_call")
361
+ dec = broker.request("model_call", "code_agent", balance,
362
+ task_state={"attempts": attempts, "budget_remaining": budget_remaining})
363
+ if dec.decision == Decision.DENY:
364
+ break
365
+
366
+ temp = 0.5 if attempts == 1 else 0.8
367
+ max_tok = 128 if attempts == 1 else 256
368
+ code = self.generate(model, tok, problem["prompt"], max_new_tokens=max_tok, temperature=temp)
369
+ tokens = len(tok.encode(code))
370
+ budget_remaining -= tokens
371
+ total_compute += tokens
372
+
373
+ func = extract_function_body(code, problem["entry_point"])
374
+ full = problem["prompt"] + func
375
+ passed_now, error = run_tests(full, problem["test"])
376
+ score = 1.0 if passed_now else 0.0
377
+
378
+ oracle_res = self.oracle.score(
379
+ mode="code",
380
+ action={"attempt": attempts},
381
+ context={},
382
+ result={"correctness": score, "pass_at_k": score, "regression": False,
383
+ "compute_cost": tokens, "public_pass": passed_now, "hidden_tests_pass": passed_now},
384
+ agent_id="code_agent",
385
+ )
386
+
387
+ if oracle_res.raw_score > best_score:
388
+ best_score = oracle_res.raw_score
389
+ best_code = code
390
+ passed = passed_now
391
+
392
+ if passed:
393
+ ledger.earn("code_agent", problem["task_id"], f"att_{attempts}", 5.0, oracle_res.raw_score, tokens, "pass", "model_call")
394
+ else:
395
+ ledger.spend("code_agent", problem["task_id"], f"att_{attempts}", 1.0, "model_call", reason="fail")
396
+
397
+ if attempts >= 2 and not passed:
398
+ break
399
+
400
+ results.append({"task_id": problem["task_id"], "passed": passed, "attempts": attempts,
401
+ "tokens_used": 2000 - budget_remaining, "best_score": best_score})
402
+ print(f" {problem['task_id']}: passed={passed}, attempts={attempts}")
403
+
404
+ return {
405
+ "accuracy": sum(1 for r in results if r["passed"]) / len(results),
406
+ "total_compute": total_compute,
407
+ "mean_tokens": total_compute / len(problems),
408
+ "mean_attempts": sum(r["attempts"] for r in results) / len(results),
409
+ "results": results,
410
+ }
411
+
412
+ def run_all(self):
413
+ problems = self.load_problems()
414
+ print(f"Loaded {len(problems)} problems")
415
+ model, tok = self.load_model()
416
+
417
+ print("\n--- Baseline ---")
418
+ baseline = self.run_baseline(problems, model, tok)
419
+ print(f"Baseline: accuracy={baseline['accuracy']:.3f}, compute={baseline['total_compute']}")
420
+
421
+ print("\n--- OCC ---")
422
+ occ = self.run_occ(problems, model, tok)
423
+ print(f"OCC: accuracy={occ['accuracy']:.3f}, compute={occ['total_compute']}")
424
+
425
+ comp = {
426
+ "baseline_accuracy": baseline["accuracy"],
427
+ "occ_accuracy": occ["accuracy"],
428
+ "baseline_compute": baseline["total_compute"],
429
+ "occ_compute": occ["total_compute"],
430
+ "compute_reduction": 1.0 - (occ["total_compute"] / max(baseline["total_compute"], 1)),
431
+ "accuracy_delta": occ["accuracy"] - baseline["accuracy"],
432
+ }
433
+
434
+ return {"baseline_fixed": baseline, "occ_budget": occ, "comparison": comp}
435
+
436
+
437
+ def main():
438
+ bench = RealLLMBenchmarkV2(n_problems=20, seed=42)
439
+ results = bench.run_all()
440
+
441
+ print("\n" + "=" * 60)
442
+ print("REAL LLM CODE BENCHMARK (V2 — Chat Template Fix)")
443
+ print("=" * 60)
444
+ comp = results["comparison"]
445
+ print(f"Baseline accuracy: {comp['baseline_accuracy']:.3f}")
446
+ print(f"OCC accuracy: {comp['occ_accuracy']:.3f}")
447
+ print(f"Baseline compute: {comp['baseline_compute']}")
448
+ print(f"OCC compute: {comp['occ_compute']}")
449
+ print(f"Compute reduction: {comp['compute_reduction']:.1%}")
450
+ print(f"Accuracy delta: {comp['accuracy_delta']:+.3f}")
451
+
452
+ out_dir = Path("/app/occ/reports")
453
+ out_dir.mkdir(parents=True, exist_ok=True)
454
+ with open(out_dir / "benchmark_code_real_llm_v2_results.json", "w") as f:
455
+ json.dump(results, f, indent=2, default=str)
456
+ print(f"\nSaved to {out_dir / 'benchmark_code_real_llm_v2_results.json'}")
457
+
458
+
459
+ if __name__ == "__main__":
460
+ main()