| """HYPERNET N1 - OFFICIAL HUMANEVAL WITH CODE EXECUTION""" |
| import os, sys, json, time, requests, subprocess |
| from datetime import datetime |
|
|
| HYPERNET_URL = "http://localhost:5000" |
| AUTH_TOKEN = "cpn-steve-kawa-hypernet-alpha" |
| LANES = ["lola", "claude", "grok", "deep"] |
|
|
def call_lane(query, lane):
    """POST one query to a Hypernet lane; return parsed JSON or an error dict."""
    try:
        r = requests.post(f"{HYPERNET_URL}/api/v1/run",
                          headers={"Authorization": f"Bearer {AUTH_TOKEN}"},
                          json={"query": query, "lane": lane}, timeout=120)
        if r.status_code == 200:
            return r.json()
    except requests.RequestException:
        # Connection errors and timeouts degrade to the generic failure marker.
        pass
    return {"error": "failed"}

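# Assumption: the /api/v1/run endpoint replies with JSON containing a
# "response_text" field holding the model's raw answer; run_benchmark
# below keys off exactly that field.
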
def extract_code(response):
    """Pull the code out of a model reply, preferring a ```python fence."""
    code = response
    if "```python" in code:
        code = code.split("```python")[1].split("```")[0]
    elif "```" in code:
        code = code.split("```")[1].split("```")[0]
    return code.strip()

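# Example (hypothetical model reply):
#   extract_code("Sure!\n```python\ndef add(a, b):\n    return a + b\n```")
#   -> "def add(a, b):\n    return a + b"
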
def test_solution(problem, solution):
    """Run the official unit tests against the extracted solution in a subprocess."""
    code = extract_code(solution)
    # Assemble prompt + solution + official tests into one standalone script.
    test_code = f'''{problem["prompt"]}
{code}

{problem["test"]}
check({problem["entry_point"]})
print("PASS")
'''
    try:
        result = subprocess.run([sys.executable, "-c", test_code],
                                capture_output=True, text=True, timeout=10)
        return result.returncode == 0 and "PASS" in result.stdout
    except (subprocess.TimeoutExpired, OSError):
        return False

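# NOTE: test_solution executes model-generated code in a fresh interpreter
# with a 10-second timeout, but with no real sandboxing; run this inside a
# container or VM if the lanes are not trusted.
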
def run_benchmark(problems, limit=10):
    """Query every lane on each problem and tally pass@1 per lane."""
    results = {"lanes": {lane: {"pass": 0, "fail": 0} for lane in LANES}, "problems": []}

    print(f"\n{'='*60}")
    print(f"OFFICIAL HUMANEVAL - {limit} PROBLEMS - CODE EXECUTION")
    print(f"{'='*60}\n")

    for i, p in enumerate(problems[:limit]):
        print(f"[{i+1}/{limit}] {p['task_id']}")
        prob_result = {"task_id": p["task_id"], "lanes": {}}

        for lane in LANES:
            prompt = f"Solve this Python function. Return ONLY the implementation, no explanation.\n\n{p['prompt']}"
            resp = call_lane(prompt, lane)

            if resp.get("response_text"):
                passed = test_solution(p, resp["response_text"])
                prob_result["lanes"][lane] = passed
                results["lanes"][lane]["pass" if passed else "fail"] += 1
                print(f"  {lane}: {'PASS' if passed else 'FAIL'}")
            else:
                # No usable response counts as a failure for pass@1 purposes.
                prob_result["lanes"][lane] = False
                results["lanes"][lane]["fail"] += 1
                print(f"  {lane}: ERROR")

        results["problems"].append(prob_result)
        print()

    print(f"{'='*60}")
    print("RESULTS (pass@1)")
    print(f"{'='*60}")
    for lane, stats in results["lanes"].items():
        total = stats["pass"] + stats["fail"]
        pct = (stats["pass"] / total * 100) if total > 0 else 0
        print(f"  {lane:10s}: {stats['pass']:3d}/{total:3d} ({pct:.1f}%)")

    return results

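# The saved JSON has this shape (illustrative values):
# {"lanes": {"lola": {"pass": 8, "fail": 2}, ...},
#  "problems": [{"task_id": "HumanEval/0", "lanes": {"lola": true, ...}}, ...]}
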
if __name__ == "__main__":
    from datasets import load_dataset
    print("Loading official HumanEval...")
    ds = load_dataset("openai/openai_humaneval")
    problems = [dict(item) for item in ds["test"]]
    print(f"Loaded {len(problems)} problems\n")

    print("Options:")
    print("  1. Run 10 problems (test)")
    print("  2. Run 50 problems")
    print("  3. Run ALL 164 problems")
    choice = input("Choice (1/2/3): ").strip()

    # String keys avoid a ValueError on non-numeric input; default to 10.
    limit = {"1": 10, "2": 50, "3": 164}.get(choice, 10)
    results = run_benchmark(problems, limit)

    outfile = f"humaneval_results_{datetime.now().strftime('%H%M%S')}.json"
    with open(outfile, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\nResults saved to {outfile}")