| """HYPERNET N1 - OFFICIAL HUMANEVAL WITH CODE EXECUTION""" |
| import os, sys, json, time, requests, subprocess |
| from datetime import datetime |
|
|
| HYPERNET_URL = "http://localhost:5000" |
| AUTH_TOKEN = "cpn-steve-kawa-hypernet-alpha" |
| LANES = ["lola", "claude", "grok", "deep"] |
|
|
def call_lane(query, lane):
    """POST one query to a Hypernet lane; return parsed JSON or an error dict."""
    try:
        r = requests.post(f"{HYPERNET_URL}/api/v1/run",
                          headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
                          json={"query": query, "lane": lane}, timeout=120)
        if r.status_code == 200:
            return r.json()
    except requests.RequestException:
        # Connection errors and timeouts degrade to the generic failure marker.
        pass
    return {"error": "failed"}

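# Assumption: the /api/v1/run endpoint replies with JSON containing a
# "response_text" field holding the model's raw answer; run_benchmark
# below keys off exactly that field.
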
def extract_code(response):
    """Pull the code out of a model reply, preferring a ```python fence."""
    code = response
    if "```python" in code:
        code = code.split("```python")[1].split("```")[0]
    elif "```" in code:
        code = code.split("```")[1].split("```")[0]
    return code.strip()

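# Example (hypothetical model reply):
#   extract_code("Sure!\n```python\ndef add(a, b):\n    return a + b\n```")
#   -> "def add(a, b):\n    return a + b"
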
def test_solution(problem, solution):
    """Run the official unit tests against the extracted solution in a subprocess."""
    code = extract_code(solution)
    # Assemble prompt + solution + official tests into one standalone script.
    test_code = f'''{problem["prompt"]}
{code}

{problem["test"]}
check({problem["entry_point"]})
print("PASS")
'''
    try:
        result = subprocess.run([sys.executable, "-c", test_code],
                                capture_output=True, text=True, timeout=10)
        return result.returncode == 0 and "PASS" in result.stdout
    except (subprocess.TimeoutExpired, OSError):
        return False

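# NOTE: test_solution executes model-generated code in a fresh interpreter
# with a 10-second timeout, but with no real sandboxing; run this inside a
# container or VM if the lanes are not trusted.
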
def run_benchmark(problems, limit=10):
    """Query every lane on each problem and tally pass@1 per lane."""
    results = {"lanes": {lane: {"pass": 0, "fail": 0} for lane in LANES}, "problems": []}

    print(f"\n{'='*60}")
    print(f"OFFICIAL HUMANEVAL - {limit} PROBLEMS - CODE EXECUTION")
    print(f"{'='*60}\n")

    for i, p in enumerate(problems[:limit]):
        print(f"[{i+1}/{limit}] {p['task_id']}")
        prob_result = {"task_id": p["task_id"], "lanes": {}}

        for lane in LANES:
            prompt = f"Solve this Python function. Return ONLY the implementation, no explanation.\n\n{p['prompt']}"
            resp = call_lane(prompt, lane)

            if resp.get("response_text"):
                passed = test_solution(p, resp["response_text"])
                prob_result["lanes"][lane] = passed
                results["lanes"][lane]["pass" if passed else "fail"] += 1
                print(f"  {lane}: {'PASS' if passed else 'FAIL'}")
            else:
                # No usable response counts as a failure for pass@1 purposes.
                prob_result["lanes"][lane] = False
                results["lanes"][lane]["fail"] += 1
                print(f"  {lane}: ERROR")

        results["problems"].append(prob_result)
        print()

    print(f"{'='*60}")
    print("RESULTS (pass@1)")
    print(f"{'='*60}")
    for lane, stats in results["lanes"].items():
        total = stats["pass"] + stats["fail"]
        pct = (stats["pass"] / total * 100) if total > 0 else 0
        print(f"  {lane:10s}: {stats['pass']:3d}/{total:3d} ({pct:.1f}%)")

    return results

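# The saved JSON has this shape (illustrative values):
# {"lanes": {"lola": {"pass": 8, "fail": 2}, ...},
#  "problems": [{"task_id": "HumanEval/0", "lanes": {"lola": true, ...}}, ...]}
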
if __name__ == "__main__":
    from datasets import load_dataset
    print("Loading official HumanEval...")
    ds = load_dataset("openai/openai_humaneval")
    problems = [dict(item) for item in ds["test"]]
    print(f"Loaded {len(problems)} problems\n")

    print("Options:")
    print("  1. Run 10 problems (test)")
    print("  2. Run 50 problems")
    print("  3. Run ALL 164 problems")
    choice = input("Choice (1/2/3): ").strip()

    # String keys avoid a ValueError on non-numeric input; default to 10.
    limit = {"1": 10, "2": 50, "3": 164}.get(choice, 10)
    results = run_benchmark(problems, limit)

    outfile = f"humaneval_results_{datetime.now().strftime('%H%M%S')}.json"
    with open(outfile, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {outfile}")