Upload jobs/run_real_llm_standalone_v7.py
Browse files
jobs/run_real_llm_standalone_v7.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
"""
|
| 2 |
Self-contained GPU job for real LLM code benchmark — V7.
|
| 3 |
FIXES:
|
| 4 |
-
1.
|
| 5 |
2. Larger model: Qwen2.5-Coder-1.5B-Instruct
|
| 6 |
3. 512 max_new_tokens
|
| 7 |
-
4.
|
| 8 |
"""
|
| 9 |
import json
|
| 10 |
import os
|
|
@@ -136,20 +136,22 @@ class ResourceBroker:
|
|
| 136 |
|
| 137 |
# --- HELPERS ---
|
| 138 |
|
| 139 |
-
def
|
| 140 |
-
"""
|
| 141 |
text = text.strip()
|
| 142 |
-
#
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
| 153 |
|
| 154 |
|
| 155 |
def contains_function_definition(code: str, entry_point: str) -> bool:
|
|
@@ -226,7 +228,7 @@ class RealLLMBenchmarkV7:
|
|
| 226 |
def evaluate_one(self, problem, model, tok, max_new_tokens=512):
|
| 227 |
raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_new_tokens)
|
| 228 |
tokens = len(tok.encode(raw))
|
| 229 |
-
code =
|
| 230 |
|
| 231 |
# Strategy 1: Generated code is complete function
|
| 232 |
if contains_function_definition(code, problem["entry_point"]):
|
|
@@ -249,24 +251,23 @@ class RealLLMBenchmarkV7:
|
|
| 249 |
else:
|
| 250 |
error = error if len(error) < len(error2) else error2
|
| 251 |
|
| 252 |
-
return passed, tokens, raw, error
|
| 253 |
|
| 254 |
def run_baseline(self, problems, model, tok, max_new_tokens=512):
|
| 255 |
results = []; total_compute = 0
|
| 256 |
-
for problem in problems:
|
| 257 |
-
passed, tokens, raw, error = self.evaluate_one(problem, model, tok, max_new_tokens)
|
| 258 |
total_compute += tokens
|
| 259 |
results.append({"task_id": problem["task_id"], "passed": passed, "tokens": tokens, "raw": raw[:300], "error": error[:200]})
|
| 260 |
print(f" {problem['task_id']}: passed={passed}, tokens={tokens}")
|
| 261 |
if not passed:
|
| 262 |
print(f" error={error[:200]!r}")
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
test_file
|
| 269 |
-
print(f" [DEBUG test file first 300 chars]: {test_file[:300]!r}")
|
| 270 |
return {"accuracy": sum(1 for r in results if r["passed"]) / len(results), "total_compute": total_compute, "results": results}
|
| 271 |
|
| 272 |
def run_occ(self, problems, model, tok, max_new_tokens_first=256, max_new_tokens_retry=512):
|
|
@@ -286,7 +287,7 @@ class RealLLMBenchmarkV7:
|
|
| 286 |
max_tok = max_new_tokens_first if attempts == 1 else max_new_tokens_retry
|
| 287 |
code_raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_tok)
|
| 288 |
tokens = len(tok.encode(code_raw)); budget_remaining -= tokens; total_compute += tokens
|
| 289 |
-
code =
|
| 290 |
if contains_function_definition(code, problem["entry_point"]):
|
| 291 |
test_code = code
|
| 292 |
else:
|
|
|
|
| 1 |
"""
|
| 2 |
Self-contained GPU job for real LLM code benchmark — V7.
|
| 3 |
FIXES:
|
| 4 |
+
1. Regex-based markdown extraction (handles ```python...``` blocks)
|
| 5 |
2. Larger model: Qwen2.5-Coder-1.5B-Instruct
|
| 6 |
3. 512 max_new_tokens
|
| 7 |
+
4. Debug: show stripped code for first 3 failures
|
| 8 |
"""
|
| 9 |
import json
|
| 10 |
import os
|
|
|
|
| 136 |
|
| 137 |
# --- HELPERS ---
|
| 138 |
|
| 139 |
+
def extract_code_block(text: str) -> str:
    """Return the code inside the first markdown fenced block of *text*.

    Three cases are handled, in order:

    1. A closed fence (```lang ... ```): the content between the fences is
       returned. The closing fence does not have to start on its own line,
       so a reply such as "```python\ncode```\nexplanation" yields "code".
    2. An opening fence with no closing fence (for example when generation
       stopped at the token limit): everything after the opening fence line
       is returned.
    3. No fence at all: the whitespace-stripped input is returned unchanged.

    Args:
        text: Raw model output, possibly wrapping code in a fenced block.

    Returns:
        The extracted code with surrounding whitespace stripped.
    """
    text = text.strip()

    # Opening fence, optional language tag, end of that line, then lazily
    # everything up to the next fence. No newline is required before the
    # closing fence, so trailing prose after it is never swept into the code.
    closed = re.search(r'```(?:\w+)?\s*\n(.*?)```', text, re.DOTALL)
    if closed:
        # NOTE(review): strip() also removes the first line's indentation;
        # confirm callers never need a body-only (indented) completion.
        return closed.group(1).strip()

    # Reaching this point means no closing fence follows any opening fence,
    # so the captured remainder cannot contain ``` and needs no trimming.
    unterminated = re.search(r'```(?:\w+)?\s*\n(.*)', text, re.DOTALL)
    if unterminated:
        return unterminated.group(1).strip()

    return text
|
| 155 |
|
| 156 |
|
| 157 |
def contains_function_definition(code: str, entry_point: str) -> bool:
|
|
|
|
| 228 |
def evaluate_one(self, problem, model, tok, max_new_tokens=512):
|
| 229 |
raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_new_tokens)
|
| 230 |
tokens = len(tok.encode(raw))
|
| 231 |
+
code = extract_code_block(raw)
|
| 232 |
|
| 233 |
# Strategy 1: Generated code is complete function
|
| 234 |
if contains_function_definition(code, problem["entry_point"]):
|
|
|
|
| 251 |
else:
|
| 252 |
error = error if len(error) < len(error2) else error2
|
| 253 |
|
| 254 |
+
return passed, tokens, raw, error, code
|
| 255 |
|
| 256 |
def run_baseline(self, problems, model, tok, max_new_tokens=512):
    """Evaluate each problem once and summarise pass rate and token usage.

    Calls ``self.evaluate_one`` for every problem, prints a one-line status
    per problem, and prints extra diagnostics (extracted code and the start
    of the assembled test file) for the first three failing problems.

    Args:
        problems: Iterable of problem dicts; the keys read here are
            "task_id", "entry_point", "prompt" and "test".
        model: Passed through to ``evaluate_one``.
        tok: Passed through to ``evaluate_one``.
        max_new_tokens: Generation limit passed through to ``evaluate_one``.

    Returns:
        Dict with "accuracy" (fraction of problems passed, 0.0 when there
        are no problems), "total_compute" (sum of the per-problem token
        counts) and "results" (one record per problem with "raw" truncated
        to 300 characters and "error" to 200).
    """
    results = []
    total_compute = 0
    # Count failures rather than loop positions, so the diagnostics cover
    # the first three failures wherever they occur in the run.
    debug_dumps_left = 3

    for problem in problems:
        passed, tokens, raw, error, code = self.evaluate_one(problem, model, tok, max_new_tokens)
        total_compute += tokens
        results.append({"task_id": problem["task_id"], "passed": passed, "tokens": tokens, "raw": raw[:300], "error": error[:200]})
        print(f" {problem['task_id']}: passed={passed}, tokens={tokens}")
        if passed:
            continue

        print(f" error={error[:200]!r}")
        if debug_dumps_left > 0:
            debug_dumps_left -= 1
            print(" [STRIPPED CODE first 300 chars]:")
            print(code[:300])
            print(" [TEST FILE first 300 chars]:")
            # Rebuild the test file for display: the code alone when it
            # defines the entry point, otherwise the prompt followed by it.
            if contains_function_definition(code, problem["entry_point"]):
                test_file = code + "\n\n" + problem["test"] + "\n\ncheck()\n"
            else:
                test_file = problem["prompt"] + code + "\n\n" + problem["test"] + "\n\ncheck()\n"
            print(test_file[:300])

    # An empty problem list would otherwise divide by zero.
    accuracy = sum(1 for r in results if r["passed"]) / len(results) if results else 0.0
    return {"accuracy": accuracy, "total_compute": total_compute, "results": results}
|
| 272 |
|
| 273 |
def run_occ(self, problems, model, tok, max_new_tokens_first=256, max_new_tokens_retry=512):
|
|
|
|
| 287 |
max_tok = max_new_tokens_first if attempts == 1 else max_new_tokens_retry
|
| 288 |
code_raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_tok)
|
| 289 |
tokens = len(tok.encode(code_raw)); budget_remaining -= tokens; total_compute += tokens
|
| 290 |
+
code = extract_code_block(code_raw)
|
| 291 |
if contains_function_definition(code, problem["entry_point"]):
|
| 292 |
test_code = code
|
| 293 |
else:
|