narcolepticchicken
/

occ-stack

ml-intern

Model card Files Files and versions

xet

Community

narcolepticchicken committed 26 days ago

Commit

c7a0b84

verified ·

1 Parent(s): 71a9c04

Upload jobs/run_real_llm_diagnostic.py

Browse files

Files changed (1) hide show

jobs/run_real_llm_diagnostic.py +117 -0

jobs/run_real_llm_diagnostic.py ADDED Viewed

	@@ -0,0 +1,117 @@

+"""
+Diagnostic script for real LLM code generation on HumanEval.
+Shows exactly what the model generates and what error the test produces.
+"""
+import os
+import re
+import subprocess
+import sys
+import tempfile
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
def strip_markdown_fences(text: str) -> str:
    """Return the code inside the first Markdown fenced block of *text*.

    Chat models frequently wrap code in a fenced block and surround it with
    prose ("Here is the solution: ..."), so the opening fence is searched for
    on any line, not only the first one, and everything after the matching
    closing fence is discarded.

    Args:
        text: Raw model output.

    Returns:
        The content of the first fenced block, stripped of surrounding
        whitespace. If the block is never closed, everything after the
        opening fence is returned. If *text* contains no fence line at all,
        the stripped text is returned unchanged.
    """
    text = text.strip()
    lines = text.splitlines()

    # An opening fence may carry an info string (e.g. "```python"), so only
    # the prefix is checked.
    opening = next(
        (i for i, line in enumerate(lines) if line.lstrip().startswith("```")),
        None,
    )
    if opening is None:
        return text

    body = lines[opening + 1:]
    # A closing fence is a line consisting of the backticks alone.
    closing = next(
        (i for i, line in enumerate(body) if line.strip() == "```"),
        None,
    )
    if closing is not None:
        body = body[:closing]
    return "\n".join(body).strip()
def run_tests(code: str, test_code: str, timeout: int = 15):
    """Run *code* followed by *test_code* in a separate Python process.

    The two sources are concatenated, a bare ``check()`` call is appended,
    and the result is written to a temporary ``.py`` file that is executed
    with the interpreter running this script.

    Args:
        code: Candidate solution source.
        test_code: Test source; the appended call expects it to define
            ``check``.
        timeout: Seconds to wait for the child process before giving up.

    Returns:
        A ``(passed, error, full)`` tuple. ``passed`` is True when the child
        exits with status 0. ``error`` is empty on success, ``"Timeout"``
        when the time limit is hit, and otherwise at most 500 characters of
        diagnostic text. ``full`` is the exact source that was executed.
    """
    full = code + "\n\n" + test_code + "\n\ncheck()\n"
    # Python reads source files as UTF-8, so write the file as UTF-8 instead
    # of relying on the locale's default encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full)
        tmp = f.name
    try:
        # sys.executable keeps the child in the same interpreter/environment;
        # a bare "python" may be absent from PATH or be a different install.
        result = subprocess.run(
            [sys.executable or "python", tmp],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        passed = result.returncode == 0
        # The exception type and message are the last lines of a traceback,
        # so keep the tail rather than the head when truncating.
        error = "" if passed else result.stderr[-500:]
    except subprocess.TimeoutExpired:
        passed = False
        error = "Timeout"
    except Exception as e:
        passed = False
        error = str(e)[:500]
    finally:
        os.unlink(tmp)
    return passed, error, full
def _run_script(source: str, timeout: int = 15):
    """Execute *source* as a standalone script and return ``(passed, stderr)``.

    The temporary file is always removed, and a timeout is reported as a
    failed run with the error text ``"Timeout"`` instead of propagating.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(source)
        tmp = f.name
    try:
        result = subprocess.run(
            [sys.executable or "python", tmp],
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        return result.returncode == 0, result.stderr
    except subprocess.TimeoutExpired:
        return False, "Timeout"
    finally:
        os.unlink(tmp)


def main(model_name: str = "Qwen/Qwen2.5-Coder-0.5B-Instruct", task_index: int = 0):
    """Generate a solution for one HumanEval+ task and show how it fares.

    Loads one item of the ``evalplus/humanevalplus`` test split, asks the
    model for a complete solution, prints the raw and fence-stripped output,
    and then executes three assemblies of the test file:

    1. stripped code + test + a bare ``check()`` call (via ``run_tests``),
    2. stripped code + test,
    3. task prompt + stripped code + test (in case only a body was generated).

    Args:
        model_name: Hugging Face model identifier to load.
        task_index: Index of the dataset item to diagnose.
    """
    ds = load_dataset("evalplus/humanevalplus", split="test")
    item = ds[task_index]
    task_id = item["task_id"]
    prompt = item["prompt"]
    test = item["test"]
    entry_point = item["entry_point"]

    print(f"Task: {task_id}")
    print(f"Entry point: {entry_point}")
    print("\n--- HUMANEVAL PROMPT ---")
    print(prompt[:500])
    print("\n--- HUMANEVAL TEST (first 300 chars) ---")
    print(test[:300])
    print("...")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"\nLoading {model_name} on {device}...")
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, trust_remote_code=True,
        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None,
    )

    system = "You are an expert Python programmer. Write the COMPLETE solution including function signature, docstring if needed, and body."
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": prompt.strip()},
    ]
    chat_prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tok(chat_prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=300, do_sample=False, pad_token_id=tok.eos_token_id)

    # Drop the prompt by token count before decoding. Slicing the decoded
    # string by the decoded prompt's length is only correct when decoding
    # round-trips the prompt text exactly.
    prompt_len = inputs["input_ids"].shape[1]
    code = tok.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    print("\n--- GENERATED CODE (raw) ---")
    print(code)
    print("\n--- STRIPPED ---")
    stripped = strip_markdown_fences(code)
    print(stripped)

    print("\n--- FULL TEST FILE ---")
    passed, error, full = run_tests(stripped, test)
    print(full[:800])
    print("\n--- RESULT ---")
    print(f"Passed: {passed}")
    print(f"Error: {error}")

    # Try without appending check()
    passed2, error2 = _run_script(stripped + "\n\n" + test + "\n")
    print("\n--- WITHOUT EXTRA check() ---")
    print(f"Passed: {passed2}")
    print(f"Error: {error2[:300]}")

    # Try with just the prompt + stripped (in case model only generates body)
    passed3, error3 = _run_script(prompt + stripped + "\n\n" + test + "\n")
    print("\n--- PROMPT + STRIPPED + TEST ---")
    print(f"Passed: {passed3}")
    print(f"Error: {error3[:300]}")


if __name__ == "__main__":
    main()