narcolepticchicken committed on
Commit
565e5ab
·
verified ·
1 Parent(s): 81e2e8f

Upload jobs/run_real_llm_standalone_v7.py

Browse files
Files changed (1) hide show
  1. jobs/run_real_llm_standalone_v7.py +28 -27
jobs/run_real_llm_standalone_v7.py CHANGED
@@ -1,10 +1,10 @@
1
  """
2
  Self-contained GPU job for real LLM code benchmark — V7.
3
  FIXES:
4
- 1. Better markdown stripping (only remove outer fences, not internal backticks)
5
  2. Larger model: Qwen2.5-Coder-1.5B-Instruct
6
  3. 512 max_new_tokens
7
- 4. Better error logging: shows the actual test file that failed
8
  """
9
  import json
10
  import os
@@ -136,20 +136,22 @@ class ResourceBroker:
136
 
137
  # --- HELPERS ---
138
 
139
- def strip_markdown_fences(text: str) -> str:
140
- """Remove only the outer markdown code fences, not internal backticks."""
141
  text = text.strip()
142
- # Remove leading ```python or ``` on first line only
143
- if text.startswith("```"):
144
- first_newline = text.find("\n")
145
- if first_newline != -1:
146
- text = text[first_newline+1:]
147
- # Remove trailing ``` only if on the last line
148
- lines = text.splitlines()
149
- if lines and lines[-1].strip() == "```":
150
- lines = lines[:-1]
151
- text = "\n".join(lines)
152
- return text.strip()
 
 
153
 
154
 
155
  def contains_function_definition(code: str, entry_point: str) -> bool:
@@ -226,7 +228,7 @@ class RealLLMBenchmarkV7:
226
  def evaluate_one(self, problem, model, tok, max_new_tokens=512):
227
  raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_new_tokens)
228
  tokens = len(tok.encode(raw))
229
- code = strip_markdown_fences(raw)
230
 
231
  # Strategy 1: Generated code is complete function
232
  if contains_function_definition(code, problem["entry_point"]):
@@ -249,24 +251,23 @@ class RealLLMBenchmarkV7:
249
  else:
250
  error = error if len(error) < len(error2) else error2
251
 
252
- return passed, tokens, raw, error
253
 
254
  def run_baseline(self, problems, model, tok, max_new_tokens=512):
255
  results = []; total_compute = 0
256
- for problem in problems:
257
- passed, tokens, raw, error = self.evaluate_one(problem, model, tok, max_new_tokens)
258
  total_compute += tokens
259
  results.append({"task_id": problem["task_id"], "passed": passed, "tokens": tokens, "raw": raw[:300], "error": error[:200]})
260
  print(f" {problem['task_id']}: passed={passed}, tokens={tokens}")
261
  if not passed:
262
  print(f" error={error[:200]!r}")
263
- # Debug: show test file for first 3 failures
264
- code = strip_markdown_fences(raw)
265
- if contains_function_definition(code, problem["entry_point"]):
266
- test_file = code + "\n\n" + problem["test"] + "\n\ncheck()\n"
267
- else:
268
- test_file = problem["prompt"] + code + "\n\n" + problem["test"] + "\n\ncheck()\n"
269
- print(f" [DEBUG test file first 300 chars]: {test_file[:300]!r}")
270
  return {"accuracy": sum(1 for r in results if r["passed"]) / len(results), "total_compute": total_compute, "results": results}
271
 
272
  def run_occ(self, problems, model, tok, max_new_tokens_first=256, max_new_tokens_retry=512):
@@ -286,7 +287,7 @@ class RealLLMBenchmarkV7:
286
  max_tok = max_new_tokens_first if attempts == 1 else max_new_tokens_retry
287
  code_raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_tok)
288
  tokens = len(tok.encode(code_raw)); budget_remaining -= tokens; total_compute += tokens
289
- code = strip_markdown_fences(code_raw)
290
  if contains_function_definition(code, problem["entry_point"]):
291
  test_code = code
292
  else:
 
1
  """
2
  Self-contained GPU job for real LLM code benchmark — V7.
3
  FIXES:
4
+ 1. Regex-based markdown extraction (handles ```python...``` blocks)
5
  2. Larger model: Qwen2.5-Coder-1.5B-Instruct
6
  3. 512 max_new_tokens
7
+ 4. Debug: show stripped code for first 3 failures
8
  """
9
  import json
10
  import os
 
136
 
137
  # --- HELPERS ---
138
 
139
+ def extract_code_block(text: str) -> str:
140
+ """Extract code from markdown fenced code block."""
141
  text = text.strip()
142
+ # Match ```python or ``` followed by content until ```
143
+ match = re.search(r'```(?:\w+)?\s*\n(.*?)\n```', text, re.DOTALL)
144
+ if match:
145
+ return match.group(1).strip()
146
+ # Fallback: opening fence whose closing fence is missing or not preceded by a newline
147
+ match2 = re.search(r'```(?:\w+)?\s*\n(.*)', text, re.DOTALL)
148
+ if match2:
149
+ candidate = match2.group(1).strip()
150
+ # Remove trailing ``` if present
151
+ if candidate.endswith("```"):
152
+ candidate = candidate[:-3].strip()
153
+ return candidate
154
+ return text
155
 
156
 
157
  def contains_function_definition(code: str, entry_point: str) -> bool:
 
228
  def evaluate_one(self, problem, model, tok, max_new_tokens=512):
229
  raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_new_tokens)
230
  tokens = len(tok.encode(raw))
231
+ code = extract_code_block(raw)
232
 
233
  # Strategy 1: Generated code is complete function
234
  if contains_function_definition(code, problem["entry_point"]):
 
251
  else:
252
  error = error if len(error) < len(error2) else error2
253
 
254
+ return passed, tokens, raw, error, code
255
 
256
  def run_baseline(self, problems, model, tok, max_new_tokens=512):
257
  results = []; total_compute = 0
258
+ for i, problem in enumerate(problems):
259
+ passed, tokens, raw, error, code = self.evaluate_one(problem, model, tok, max_new_tokens)
260
  total_compute += tokens
261
  results.append({"task_id": problem["task_id"], "passed": passed, "tokens": tokens, "raw": raw[:300], "error": error[:200]})
262
  print(f" {problem['task_id']}: passed={passed}, tokens={tokens}")
263
  if not passed:
264
  print(f" error={error[:200]!r}")
265
+ if i < 3: # Debug output only for failures among the first 3 problems (i is the problem index, not a failure count)
266
+ print(f" [STRIPPED CODE first 300 chars]:")
267
+ print(code[:300])
268
+ print(f" [TEST FILE first 300 chars]:")
269
+ test_file = code + "\n\n" + problem["test"] + "\n\ncheck()\n" if contains_function_definition(code, problem["entry_point"]) else problem["prompt"] + code + "\n\n" + problem["test"] + "\n\ncheck()\n"
270
+ print(test_file[:300])
 
271
  return {"accuracy": sum(1 for r in results if r["passed"]) / len(results), "total_compute": total_compute, "results": results}
272
 
273
  def run_occ(self, problems, model, tok, max_new_tokens_first=256, max_new_tokens_retry=512):
 
287
  max_tok = max_new_tokens_first if attempts == 1 else max_new_tokens_retry
288
  code_raw = self.generate(model, tok, problem["prompt"], max_new_tokens=max_tok)
289
  tokens = len(tok.encode(code_raw)); budget_remaining -= tokens; total_compute += tokens
290
+ code = extract_code_block(code_raw)
291
  if contains_function_definition(code, problem["entry_point"]):
292
  test_code = code
293
  else: