narcolepticchicken committed on
Commit
81e2e8f
·
verified ·
1 Parent(s): 87be1ca

Upload jobs/run_real_llm_standalone_v7.py

Browse files
Files changed (1) hide show
  1. jobs/run_real_llm_standalone_v7.py +347 -0
jobs/run_real_llm_standalone_v7.py ADDED
@@ -0,0 +1,347 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Self-contained GPU job for real LLM code benchmark — V7.
3
+ FIXES:
4
+ 1. Better markdown stripping (only remove outer fences, not internal backticks)
5
+ 2. Larger model: Qwen2.5-Coder-1.5B-Instruct
6
+ 3. 512 max_new_tokens
7
+ 4. Better error logging: shows the actual test file that failed
8
+ """
9
+ import json
10
+ import os
11
+ import re
12
+ import subprocess
13
+ import sys
14
+ import tempfile
15
+ import time
16
+ from dataclasses import dataclass, field
17
+ from enum import Enum
18
+ from pathlib import Path
19
+ from typing import Any, Dict, List, Optional
20
+
21
+ from datasets import load_dataset
22
+ from transformers import AutoModelForCausalLM, AutoTokenizer
23
+ import torch
24
+
25
+
26
+ # --- CORE (INLINE) ---
27
+
28
@dataclass
class OracleResult:
    """Scores and supporting details produced by ``ImpactOracle.score``."""

    # Score as computed by the oracle before the final cost subtraction.
    raw_score: float
    # ``raw_score`` with the compute penalty subtracted.
    cost_adjusted_score: float
    # Confidence reported for the result.
    confidence: float
    # Free-form evidence backing the score.
    evidence: Dict[str, Any]
    # Human-readable summary of how the score was derived.
    reason: str
    # Tags describing detected failure modes; a fresh list per instance.
    failure_tags: List[str] = field(default_factory=list)
    # Reward handed back to the caller.
    reward_value: float = 0.0
34
+
35
+
36
class ImpactOracle:
    """Turn a task result dict into an ``OracleResult`` with cost and gaming penalties."""

    def __init__(self, compute_penalty_rate=0.0001, gaming_penalty=2.0):
        """Store the per-unit compute penalty and the flat gaming penalty."""
        self.compute_penalty_rate = compute_penalty_rate
        self.gaming_penalty = gaming_penalty

    def score(self, mode, action, context, result, agent_id=""):
        """Score one action outcome.

        Args:
            mode, action, context, agent_id: accepted for interface
                compatibility; not read by this implementation.
            result: mapping read for ``correctness``, ``compute_cost``,
                ``public_pass``, ``hidden_tests_pass`` and ``confidence``.
                The pass flags and the confidence default to ``correctness``.

        Returns:
            OracleResult whose ``reward_value`` equals the cost-adjusted score.
        """
        correctness = result.get("correctness", 0.0)
        compute_cost = result.get("compute_cost", 0.0)
        public_pass = result.get("public_pass", correctness)
        hidden_pass = result.get("hidden_tests_pass", correctness)

        failure_tags = []
        # Passing the public tests while failing the hidden ones is treated as gaming.
        if public_pass and not hidden_pass:
            failure_tags.append("gaming_hidden_tests")

        raw = correctness * 1.0 - compute_cost * self.compute_penalty_rate
        if "gaming_hidden_tests" in failure_tags:
            raw -= self.gaming_penalty

        # NOTE(review): the compute penalty is already part of ``raw`` and is
        # subtracted a second time here -- confirm the double charge is intended.
        cost_adj = raw - compute_cost * self.compute_penalty_rate

        return OracleResult(
            raw,
            cost_adj,
            result.get("confidence", correctness),
            {"correctness": correctness},
            f"corr={correctness:.2f}, cost={compute_cost}",
            failure_tags,
            cost_adj,
        )
53
+
54
+
55
@dataclass
class LedgerEntry:
    """One row of the credit ledger: an earn, spend, or decay event."""

    agent_id: str
    task_id: str
    action_id: str
    # Credit added by this event (0.0 for spend/decay rows).
    earned_credit: float
    # Credit removed by an explicit spend (0.0 otherwise).
    spent_credit: float
    # Credit removed by decay (0.0 otherwise).
    decayed_credit: float
    # Balance of the scope after this event was applied.
    remaining_credit: float
    reason: str
    oracle_score: float
    compute_cost: float
    # Wall-clock time recorded when the event was written.
    timestamp: float
    capability_scope: str = "global"
60
+
61
+
62
class CreditLedger:
    """Per-agent, per-capability credit balances with an append-only event log.

    NOTE(review): decay is applied once per call to ``earn``, ``spend`` *and*
    ``balance`` -- it does not depend on elapsed time, and reading a balance
    therefore shrinks it. Confirm this is the intended model.
    """

    def __init__(self, decay_lambda=0.05):
        """Create an empty ledger; ``decay_lambda`` is the fraction lost per decay step."""
        self.entries = []
        self.balances = {}
        self.decay_lambda = decay_lambda

    def earn(self, agent_id, task_id, action_id, amount, oracle_score, compute_cost, reason, capability_scope="global"):
        """Decay the scope, then credit ``amount`` to it and log the event."""
        now = time.time()
        self._apply_decay(agent_id, now, capability_scope)
        new_balance = self._get(agent_id, capability_scope) + amount
        self.entries.append(LedgerEntry(
            agent_id=agent_id,
            task_id=task_id,
            action_id=action_id,
            earned_credit=amount,
            spent_credit=0.0,
            decayed_credit=0.0,
            remaining_credit=new_balance,
            reason=reason,
            oracle_score=oracle_score,
            compute_cost=compute_cost,
            timestamp=now,
            capability_scope=capability_scope,
        ))
        self._set(agent_id, capability_scope, new_balance)

    def spend(self, agent_id, task_id, action_id, amount, capability_scope="global", reason="spend"):
        """Decay the scope, then debit ``amount`` from it.

        Returns:
            False (and logs nothing for the spend) when the decayed balance is
            below ``amount``; True once the debit has been recorded.
        """
        now = time.time()
        self._apply_decay(agent_id, now, capability_scope)
        current = self._get(agent_id, capability_scope)
        if current < amount:
            return False
        new_balance = current - amount
        self.entries.append(LedgerEntry(
            agent_id=agent_id,
            task_id=task_id,
            action_id=action_id,
            earned_credit=0.0,
            spent_credit=amount,
            decayed_credit=0.0,
            remaining_credit=new_balance,
            reason=reason,
            oracle_score=0.0,
            compute_cost=0.0,
            timestamp=now,
            capability_scope=capability_scope,
        ))
        self._set(agent_id, capability_scope, new_balance)
        return True

    def balance(self, agent_id, capability_scope="global"):
        """Apply one decay step and return the resulting balance (0.0 if unknown)."""
        self._apply_decay(agent_id, time.time(), capability_scope)
        return self._get(agent_id, capability_scope)

    def _get(self, agent_id, cap):
        """Return the stored balance for (agent, scope), defaulting to 0.0."""
        return self.balances.get(agent_id, {}).get(cap, 0.0)

    def _set(self, agent_id, cap, val):
        """Store ``val`` as the balance for (agent, scope)."""
        self.balances.setdefault(agent_id, {})[cap] = val

    def _apply_decay(self, agent_id, now, cap):
        """Multiply a positive balance by ``1 - decay_lambda`` and log the loss.

        ``now`` is only written into the log row; it does not affect the amount.
        """
        current = self._get(agent_id, cap)
        if current <= 0:
            return
        decayed = current * (1 - self.decay_lambda)
        # Only record a row when the balance actually shrank.
        if decayed < current:
            self.entries.append(LedgerEntry(
                agent_id=agent_id,
                task_id="decay",
                action_id="decay",
                earned_credit=0.0,
                spent_credit=0.0,
                decayed_credit=current - decayed,
                remaining_credit=decayed,
                reason="credit_decay",
                oracle_score=0.0,
                compute_cost=0.0,
                timestamp=now,
                capability_scope=cap,
            ))
            self._set(agent_id, cap, decayed)
96
+
97
+
98
class Decision(Enum):
    """Possible outcomes of a resource request."""

    ALLOW = "allow"
    DENY = "deny"
    REQUIRE_APPROVAL = "require_approval"
    DOWNGRADE = "downgrade"
    ESCALATE = "escalate"
    ASK_JUSTIFICATION = "ask_justification"
101
+
102
+
103
@dataclass
class ResourceDecision:
    """Verdict returned for a single capability request."""

    decision: Decision
    # Human-readable explanation of the verdict.
    reason: str
    # Capability that was requested.
    capability: str
    # Cheaper capability suggested instead; only set for downgrade verdicts.
    downgrade_to: Optional[str] = None
106
+
107
+
108
class ResourceBroker:
    """Gate capability requests on the caller's credit balance and risk class."""

    # Risk class per capability; unknown capabilities are treated as "medium".
    RESOURCE_RISK = {
        "model_call": "medium",
        "retrieval_call": "low",
        "verifier_call": "medium",
        "debate_turn": "low",
        "file_write": "high",
        "shell_execute": "high",
        "memory_write": "medium",
        "human_escalation": "high",
        "larger_model": "medium",
    }
    # Credit balance required to be allowed outright, per risk class.
    DEFAULT_THRESHOLDS = {"low": 0.5, "medium": 2.0, "high": 5.0}

    def __init__(self, thresholds=None, urgency_boost=0.5):
        """Use ``thresholds`` when truthy, otherwise a private copy of the defaults."""
        self.thresholds = thresholds or self.DEFAULT_THRESHOLDS.copy()
        self.urgency_boost = urgency_boost
        # agent_id -> number of balance-based denials issued so far.
        self.denial_history = {}

    def request(self, capability, agent_id, credit_balance, task_state=None, risk_score=0.0, gaming_flags=None):
        """Decide whether ``agent_id`` may use ``capability``.

        Checks run in order: gaming flags (deny), high-risk capability with
        ``risk_score`` above 0.7 (require approval), balance at or above the
        urgency-adjusted threshold (allow), balance at or above half of it
        (downgrade for medium risk, otherwise ask for justification), and
        finally deny -- or escalate once the agent has been denied more than
        three times.

        Args:
            task_state: optional mapping; only its ``urgency`` key is read.
            gaming_flags: optional collection; any truthy value forces a deny.

        Returns:
            ResourceDecision describing the verdict.
        """
        task_state = task_state or {}
        gaming_flags = gaming_flags or []

        risk_class = self.RESOURCE_RISK.get(capability, "medium")
        threshold = self.thresholds.get(risk_class, 2.0)
        urgency = task_state.get("urgency", 0.0)
        # Urgency lowers the bar, but never below 0.1.
        adjusted = max(0.1, threshold - urgency * self.urgency_boost)

        if gaming_flags:
            return ResourceDecision(Decision.DENY, f"Gaming: {gaming_flags}", capability)

        if risk_class == "high" and risk_score > 0.7:
            return ResourceDecision(Decision.REQUIRE_APPROVAL, f"High risk {risk_score:.2f}", capability)

        if credit_balance >= adjusted:
            return ResourceDecision(Decision.ALLOW, f"Balance {credit_balance:.2f} >= {adjusted:.2f}", capability)

        if credit_balance >= adjusted * 0.5:
            if risk_class == "medium":
                return ResourceDecision(Decision.DOWNGRADE, f"Downgrading from {capability}", capability, "retrieval_call")
            return ResourceDecision(Decision.ASK_JUSTIFICATION, "Justification required", capability)

        denials = self.denial_history.get(agent_id, 0)
        if denials > 3:
            # Escalations do not increase the denial counter.
            return ResourceDecision(Decision.ESCALATE, f"Denied {denials} times", capability)

        self.denial_history[agent_id] = denials + 1
        return ResourceDecision(Decision.DENY, f"Balance {credit_balance:.2f} < {adjusted:.2f}", capability)
135
+
136
+
137
+ # --- HELPERS ---
138
+
139
def strip_markdown_fences(text: str) -> str:
    """Remove only the outer markdown code fences, not internal backticks.

    Handles three shapes of model output:

    * an opening fence line (with or without a language tag) at the very
      start is dropped;
    * a closing fence on the last line is dropped;
    * when the text opened with a fence but continues with prose after the
      closing fence, everything from the last unindented fence line onward is
      dropped, since a bare fence line can never be valid Python.

    Backticks inside the code, including indented fence lines such as those
    in a docstring, are left untouched.
    """
    text = text.strip()

    opened = text.startswith("```")
    if opened:
        _header, newline, remainder = text.partition("\n")
        # A lone fence header with nothing after it carries no code at all.
        text = remainder if newline else ""

    lines = text.splitlines()
    if lines and lines[-1].strip() == "```":
        lines = lines[:-1]
    elif opened:
        # Trailing explanation after the closing fence: cut at that fence.
        # Only column-0 fences count so indented docstring fences survive.
        for index in range(len(lines) - 1, -1, -1):
            if lines[index].rstrip() == "```":
                lines = lines[:index]
                break

    return "\n".join(lines).strip()
153
+
154
+
155
def contains_function_definition(code: str, entry_point: str) -> bool:
    """Return True when ``code`` contains ``def <entry_point>`` as a whole word.

    The name is regex-escaped, and the trailing word boundary keeps
    ``def foo_bar`` from matching an entry point of ``foo``. The search is
    purely textual, so a match inside a comment or string also counts.
    """
    pattern = rf'\bdef\s+{re.escape(entry_point)}\b'
    return re.search(pattern, code) is not None
157
+
158
+
159
def run_test(code: str, test_code: str, timeout: int = 20, entry_point: Optional[str] = None):
    """Run ``code`` followed by ``test_code`` in a fresh interpreter.

    The two sources are written to a temporary ``.py`` file, followed by a
    call to ``check``. The file is executed with the interpreter running this
    script, so the child sees the same environment.

    Args:
        code: candidate solution source.
        test_code: test source that must define ``check``.
        timeout: seconds before the child process is abandoned.
        entry_point: when given, the file ends with ``check(<entry_point>)``
            for tests that take the candidate as an argument; when omitted,
            the file ends with a bare ``check()`` call.

    Returns:
        ``(passed, error)``: ``passed`` is True on exit code 0; ``error`` is
        the first 500 characters of stderr, ``"Timeout"``, or the text of any
        other exception, and is empty on success.
    """
    call = f"check({entry_point})" if entry_point else "check()"
    full = code + "\n\n" + test_code + "\n\n" + call + "\n"

    # delete=False so the child process can open the file by name; removal is
    # handled in ``finally``. UTF-8 matches Python's default source encoding.
    handle = tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False, encoding='utf-8')
    tmp = handle.name
    try:
        with handle:
            handle.write(full)
        result = subprocess.run(
            [sys.executable or 'python', tmp],
            capture_output=True, text=True, timeout=timeout,
        )
        passed = result.returncode == 0
        error = result.stderr[:500] if not passed else ""
    except subprocess.TimeoutExpired:
        passed = False
        error = "Timeout"
    except Exception as e:
        # Best-effort harness: any launch or write failure counts as a failed test.
        passed = False
        error = str(e)[:500]
    finally:
        os.unlink(tmp)
    return passed, error
175
+
176
+
177
def wrap_prompt_chat(prompt: str, tokenizer) -> str:
    """Wrap ``prompt`` in a system + user chat turn ready for generation.

    When the tokenizer offers ``apply_chat_template`` and has a non-empty
    ``chat_template``, that template renders the messages (untokenized, with
    the generation prompt appended). Otherwise a plain
    ``system/user/assistant`` text layout is returned. A tokenizer lacking
    either attribute takes the plain fallback and does not raise.
    """
    system = "You are an expert Python programmer. Write the COMPLETE solution including function signature, docstring if needed, and body."
    user = prompt.strip()
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]

    apply_template = getattr(tokenizer, "apply_chat_template", None)
    if apply_template is not None and getattr(tokenizer, "chat_template", None):
        return apply_template(messages, tokenize=False, add_generation_prompt=True)

    return f"system\n{system}\nuser\n{user}\nassistant\n"
186
+
187
+
188
+ # --- BENCHMARK ---
189
+
190
class RealLLMBenchmarkV7:
    """Compare single-shot generation with a credit-gated retry loop on code tasks.

    ``run_baseline`` generates once per problem; ``run_occ`` lets a
    ``ResourceBroker`` and ``CreditLedger`` decide whether further attempts
    are allowed. Both report accuracy and the number of generated tokens.
    """

    # Source appended to every test so the harness's bare ``check()`` call
    # works whether the test defines ``check(candidate)`` or ``check()``.
    # NOTE(review): HumanEval-style test sources are expected to define
    # ``check(candidate)`` -- confirm against the dataset card.
    _CHECK_ADAPTER = (
        "\n\nimport inspect as _occ_inspect\n"
        "_occ_check = check\n"
        "\n\n"
        "def check():\n"
        "    if _occ_inspect.signature(_occ_check).parameters:\n"
        "        _occ_check({entry_point})\n"
        "    else:\n"
        "        _occ_check()\n"
    )

    def __init__(self, model_name="Qwen/Qwen2.5-Coder-1.5B-Instruct", n_problems=20, seed=42):
        """Record the configuration and pick CUDA when available, else CPU."""
        self.model_name = model_name
        self.n_problems = n_problems
        # Stored for reference only; generation below is greedy.
        self.seed = seed
        self.oracle = ImpactOracle()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}, model: {self.model_name}")

    @classmethod
    def _bind_check(cls, test_src, entry_point):
        """Return ``test_src`` extended so a bare ``check()`` tests ``entry_point``."""
        return test_src + cls._CHECK_ADAPTER.format(entry_point=entry_point)

    @staticmethod
    def _candidate_sources(problem, code):
        """Return ``(primary, fallback)`` program sources for ``code``.

        A completion that already defines the entry point is tried as-is
        first; otherwise it is treated as a function body and the prompt is
        prepended. The fallback is the other of the two layouts.
        """
        standalone = code
        with_prompt = problem["prompt"] + code
        if contains_function_definition(code, problem["entry_point"]):
            return standalone, with_prompt
        return with_prompt, standalone

    @staticmethod
    def _accuracy(results):
        """Return the fraction of passed results, or 0.0 for an empty list."""
        if not results:
            return 0.0
        return sum(1 for r in results if r["passed"]) / len(results)

    def load_problems(self):
        """Load the first ``n_problems`` items of the HumanEval+ test split."""
        ds = load_dataset("evalplus/humanevalplus", split="test")
        problems = []
        for item in ds:
            # Stop as soon as enough problems are collected.
            if len(problems) >= self.n_problems:
                break
            problems.append({
                "task_id": item["task_id"],
                "prompt": item["prompt"],
                "test": item["test"],
                "entry_point": item["entry_point"],
            })
        return problems

    def load_model(self):
        """Load tokenizer and model (bfloat16 on CUDA, float32 on CPU)."""
        print(f"Loading {self.model_name}...")
        tok = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
        on_gpu = self.device == "cuda"
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name, trust_remote_code=True,
            torch_dtype=torch.bfloat16 if on_gpu else torch.float32,
            device_map="auto" if on_gpu else None,
        )
        if self.device == "cpu":
            model = model.to("cpu").float()
        print(f"Model loaded. Chat template: {bool(tok.chat_template)}")
        return model, tok

    def generate(self, model, tok, prompt_raw: str, max_new_tokens: int = 512):
        """Greedily generate a completion and return only the new text."""
        chat_prompt = wrap_prompt_chat(prompt_raw, tok)
        inputs = tok(chat_prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, pad_token_id=tok.eos_token_id)
        # Slice by token count, not by decoded-string length: decoding the
        # prompt on its own is not guaranteed to reproduce a prefix of the
        # decoded full sequence.
        prompt_len = inputs.input_ids.shape[1]
        completion_ids = outputs[0][prompt_len:]
        return tok.decode(completion_ids, skip_special_tokens=True).strip()

    def evaluate_one(self, problem, model, tok, max_new_tokens=512):
        """Generate once and test the result under both source layouts.

        Returns:
            ``(passed, tokens, raw, error)`` where ``tokens`` is the
            re-encoded length of the raw completion and ``error`` is the
            shorter of the two failure messages when both layouts fail.
        """
        raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_new_tokens)
        tokens = len(tok.encode(raw))
        code = strip_markdown_fences(raw)

        primary, fallback = self._candidate_sources(problem, code)
        test_src = self._bind_check(problem["test"], problem["entry_point"])

        passed, error = run_test(primary, test_src)

        # If the preferred layout fails, try the other one before giving up.
        if not passed:
            passed_alt, error_alt = run_test(fallback, test_src)
            if passed_alt:
                passed, error = True, ""
            elif len(error_alt) <= len(error):
                error = error_alt

        return passed, tokens, raw, error

    def run_baseline(self, problems, model, tok, max_new_tokens=512):
        """Evaluate every problem once; return accuracy, compute and per-task rows."""
        results = []
        total_compute = 0
        for problem in problems:
            passed, tokens, raw, error = self.evaluate_one(problem, model, tok, max_new_tokens)
            total_compute += tokens
            results.append({"task_id": problem["task_id"], "passed": passed, "tokens": tokens, "raw": raw[:300], "error": error[:200]})
            print(f" {problem['task_id']}: passed={passed}, tokens={tokens}")
            if not passed:
                print(f" error={error[:200]!r}")
                # Debug: show the start of the preferred test file for every failure.
                code = strip_markdown_fences(raw)
                primary, _fallback = self._candidate_sources(problem, code)
                test_src = self._bind_check(problem["test"], problem["entry_point"])
                test_file = primary + "\n\n" + test_src + "\n\ncheck()\n"
                print(f" [DEBUG test file first 300 chars]: {test_file[:300]!r}")
        return {"accuracy": self._accuracy(results), "total_compute": total_compute, "results": results}

    def run_occ(self, problems, model, tok, max_new_tokens_first=256, max_new_tokens_retry=512):
        """Evaluate with credit-gated attempts.

        Each problem gets a 3000-token budget. Before every attempt the broker
        is asked for a ``model_call``; only an outright DENY stops the loop.
        A pass earns 5 credits, a failure spends 1, and a second failed
        attempt ends the problem.
        """
        ledger = CreditLedger(decay_lambda=0.02)
        broker = ResourceBroker()
        ledger.earn("code_agent", "seed", "seed", 25.0, 0.0, 0.0, "initial", "model_call")
        results = []
        total_compute = 0

        for problem in problems:
            budget_remaining = 3000
            attempts = 0
            passed = False
            test_src = self._bind_check(problem["test"], problem["entry_point"])
            while budget_remaining > 100 and attempts < 3 and not passed:
                attempts += 1
                balance = ledger.balance("code_agent", "model_call")
                dec = broker.request("model_call", "code_agent", balance,
                                     task_state={"attempts": attempts, "budget_remaining": budget_remaining})
                if dec.decision == Decision.DENY:
                    break
                max_tok = max_new_tokens_first if attempts == 1 else max_new_tokens_retry
                code_raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_tok)
                tokens = len(tok.encode(code_raw))
                budget_remaining -= tokens
                total_compute += tokens
                code = strip_markdown_fences(code_raw)
                test_code, _fallback = self._candidate_sources(problem, code)
                passed, error = run_test(test_code, test_src)
                score = 1.0 if passed else 0.0
                ora = self.oracle.score("code", {"attempt": attempts}, {},
                                        {"correctness": score, "compute_cost": tokens, "public_pass": passed, "hidden_tests_pass": passed}, "code_agent")
                if passed:
                    ledger.earn("code_agent", problem["task_id"], f"att_{attempts}", 5.0, ora.raw_score, tokens, "pass", "model_call")
                else:
                    ledger.spend("code_agent", problem["task_id"], f"att_{attempts}", 1.0, "model_call", "fail")
                if attempts >= 2 and not passed:
                    break
            results.append({"task_id": problem["task_id"], "passed": passed, "attempts": attempts})
            print(f" {problem['task_id']}: passed={passed}, attempts={attempts}")
        return {"accuracy": self._accuracy(results), "total_compute": total_compute, "results": results}

    def run_all(self):
        """Run both strategies on the same problems and summarise the difference."""
        problems = self.load_problems()
        print(f"Loaded {len(problems)} problems")
        model, tok = self.load_model()
        print("\n--- Baseline ---")
        baseline = self.run_baseline(problems, model, tok)
        print(f"Baseline: acc={baseline['accuracy']:.3f}, compute={baseline['total_compute']}")
        print("\n--- OCC ---")
        occ = self.run_occ(problems, model, tok)
        print(f"OCC: acc={occ['accuracy']:.3f}, compute={occ['total_compute']}")
        return {
            "baseline": baseline, "occ": occ,
            "comparison": {
                "baseline_accuracy": baseline["accuracy"], "occ_accuracy": occ["accuracy"],
                "baseline_compute": baseline["total_compute"], "occ_compute": occ["total_compute"],
                # max(..., 1) avoids dividing by zero when the baseline generated nothing.
                "compute_reduction": 1.0 - (occ["total_compute"] / max(baseline["total_compute"], 1)),
                "accuracy_delta": occ["accuracy"] - baseline["accuracy"],
            }
        }
324
+
325
+
326
def main():
    """Run the benchmark, print the comparison, and save the results as JSON."""
    bench = RealLLMBenchmarkV7(n_problems=20, seed=42)
    results = bench.run_all()

    print("\n" + "=" * 60)
    print("REAL LLM CODE BENCHMARK (V7)")
    print("=" * 60)
    comp = results["comparison"]
    print(f"Baseline accuracy: {comp['baseline_accuracy']:.3f}")
    print(f"OCC accuracy: {comp['occ_accuracy']:.3f}")
    print(f"Baseline compute: {comp['baseline_compute']}")
    print(f"OCC compute: {comp['occ_compute']}")
    print(f"Compute reduction: {comp['compute_reduction']:.1%}")
    print(f"Accuracy delta: {comp['accuracy_delta']:+.3f}")

    out_dir = Path("/app/occ/reports")
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "benchmark_code_real_llm_v7_results.json"
    # default=str stringifies anything json cannot encode natively.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, default=str)
    print(f"\nSaved to {out_path}")


if __name__ == "__main__":
    main()