narcolepticchicken committed on
Commit
c7a0b84
·
verified ·
1 Parent(s): 71a9c04

Upload jobs/run_real_llm_diagnostic.py

Browse files
Files changed (1) hide show
  1. jobs/run_real_llm_diagnostic.py +117 -0
jobs/run_real_llm_diagnostic.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Diagnostic script for real LLM code generation on HumanEval.
3
+ Shows exactly what the model generates and what error the test produces.
4
+ """
5
+ import os
6
+ import re
7
+ import subprocess
8
+ import sys
9
+ import tempfile
10
+ from datasets import load_dataset
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer
12
+ import torch
13
+
14
+
15
def strip_markdown_fences(text: str) -> str:
    """Return the code contained in a (possibly Markdown-fenced) model reply.

    Handles the shapes a chat model reply commonly takes:

    * A complete fenced block anywhere in the reply, optionally preceded
      and/or followed by prose: the contents of the FIRST such block are
      returned and everything outside it is discarded.
    * An opening fence with no closing fence (e.g. generation was cut off
      by the token limit): everything after the opening fence line is
      returned.
    * No fence at all: the text is returned with surrounding whitespace
      removed.

    Args:
        text: Raw text to clean.

    Returns:
        The extracted code with leading and trailing whitespace stripped.
    """
    text = text.strip()

    # Both fences must start a line, so backticks that appear inside the code
    # itself (e.g. in a string literal) do not terminate the block early.
    fenced = re.search(r"^```[^\n]*\n(.*?)^```", text, re.DOTALL | re.MULTILINE)
    if fenced:
        return fenced.group(1).strip()

    # Unterminated fence: drop the opening fence line (and any prose before
    # it) and keep the rest.
    opening = re.search(r"^```[^\n]*\n?", text, re.MULTILINE)
    if opening:
        return text[opening.end():].strip()

    return text
23
+
24
+
25
def run_tests(code: str, test_code: str, timeout: int = 15):
    """Run ``code`` followed by ``test_code`` and a bare ``check()`` call.

    The three pieces are concatenated into one script, written to a temporary
    ``.py`` file and executed in a subprocess.

    NOTE(review): ``check()`` is invoked with no arguments. If ``test_code``
    defines ``check(candidate)`` the call fails with a ``TypeError`` --
    confirm against the dataset's test format.

    Args:
        code: Candidate solution source.
        test_code: Test harness source; expected to define ``check``.
        timeout: Seconds to wait for the subprocess before giving up.

    Returns:
        A ``(passed, error, full)`` tuple: ``passed`` is True when the
        subprocess exits with status 0; ``error`` is ``""`` on success,
        ``"Timeout"`` on timeout, otherwise up to 500 characters of
        diagnostic text; ``full`` is the exact script that was executed.
    """
    full = code + "\n\n" + test_code + "\n\ncheck()\n"
    # Generated code may contain non-ASCII characters; write it as UTF-8 (the
    # default Python source encoding) instead of the locale encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full)
        tmp = f.name
    try:
        # sys.executable guarantees the same interpreter/virtualenv as this
        # script; a bare "python" may be missing from PATH or be a different
        # installation.
        result = subprocess.run(
            [sys.executable, tmp], capture_output=True, text=True, timeout=timeout
        )
        passed = result.returncode == 0
        # The exception type and message sit at the END of a traceback, so
        # keep the tail rather than the head when truncating.
        error = "" if passed else result.stderr[-500:]
    except subprocess.TimeoutExpired:
        passed, error = False, "Timeout"
    except Exception as e:
        # Best effort: report launch failures instead of aborting the run.
        passed, error = False, str(e)[:500]
    finally:
        os.unlink(tmp)
    return passed, error, full
41
+
42
+
43
def _run_script(source: str, timeout: int = 15):
    """Execute ``source`` as a standalone script; return ``(passed, stderr_tail)``.

    The temporary file is always removed, and a timeout is reported as a
    failure instead of propagating and aborting the diagnostic run.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(source)
        tmp = f.name
    try:
        # Use the interpreter running this script, not whatever "python" is on PATH.
        result = subprocess.run(
            [sys.executable, tmp], capture_output=True, text=True, timeout=timeout
        )
        # The exception line is at the end of a traceback, so keep the tail.
        return result.returncode == 0, result.stderr[-300:]
    except subprocess.TimeoutExpired:
        return False, "Timeout"
    finally:
        os.unlink(tmp)


def main():
    """Generate a solution for the first HumanEval+ task and diagnose test runs.

    Loads the first item of ``evalplus/humanevalplus``, asks
    ``Qwen/Qwen2.5-Coder-0.5B-Instruct`` for a complete solution using greedy
    decoding, prints the raw and fence-stripped output, then executes several
    differently assembled test scripts and prints the pass/fail status and
    stderr excerpt of each.
    """
    ds = load_dataset("evalplus/humanevalplus", split="test")
    item = ds[0]
    task_id = item["task_id"]
    prompt = item["prompt"]
    test = item["test"]
    entry_point = item["entry_point"]

    print(f"Task: {task_id}")
    print(f"Entry point: {entry_point}")
    print("\n--- HUMANEVAL PROMPT ---")
    print(prompt[:500])
    print("\n--- HUMANEVAL TEST (first 300 chars) ---")
    print(test[:300])
    print("...")

    model_name = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"\nLoading {model_name} on {device}...")
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None,
    )

    system = "You are an expert Python programmer. Write the COMPLETE solution including function signature, docstring if needed, and body."
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt.strip()},
    ]
    chat_prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tok(chat_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=300, do_sample=False, pad_token_id=tok.eos_token_id
        )
    # Slice off the prompt by TOKEN count. Decoding the whole sequence and
    # cutting by the decoded prompt's string length breaks whenever the
    # decode of the prompt is not an exact prefix of the decode of the full
    # sequence.
    prompt_len = inputs["input_ids"].shape[1]
    code = tok.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    print("\n--- GENERATED CODE (raw) ---")
    print(code)
    print("\n--- STRIPPED ---")
    stripped = strip_markdown_fences(code)
    print(stripped)
    print("\n--- FULL TEST FILE ---")
    passed, error, full = run_tests(stripped, test)
    print(full[:800])
    print("\n--- RESULT ---")
    print(f"Passed: {passed}")
    print(f"Error: {error}")

    # Try without appending check(). Nothing invokes the tests in this
    # variant, so a pass only shows that the script imports and parses.
    passed2, error2 = _run_script(stripped + "\n\n" + test + "\n")
    print("\n--- WITHOUT EXTRA check() ---")
    print(f"Passed: {passed2}")
    print(f"Error: {error2}")

    # Try with just the prompt + stripped (in case model only generates body).
    # NOTE(review): `stripped` has lost its leading indentation, so a
    # body-only completion will not line up under the prompt's signature.
    passed3, error3 = _run_script(prompt + stripped + "\n\n" + test + "\n")
    print("\n--- PROMPT + STRIPPED + TEST ---")
    print(f"Passed: {passed3}")
    print(f"Error: {error3}")

    # Actually exercise the tests against the generated function.
    # NOTE(review): assumes the dataset's test code defines check(candidate),
    # as HumanEval-style harnesses do -- confirm against the printed test.
    passed4, error4 = _run_script(
        stripped + "\n\n" + test + f"\n\ncheck({entry_point})\n"
    )
    print("\n--- WITH check(entry_point) ---")
    print(f"Passed: {passed4}")
    print(f"Error: {error4}")
114
+
115
+
116
if __name__ == "__main__":
    # Run the diagnostic only when executed as a script, so importing this
    # module does not load the dataset or the model.
    main()