# fox1.3 / evaluate.py
# Source: teolm30's Hugging Face repository (uploaded file page)
# Commit f7a5fb7 (verified): "Add evaluate.py: Benchmark evaluation on HumanEval + MBPP"
#!/usr/bin/env python3
"""
Fox1.3 Evaluation Script
Evaluates on HumanEval and MBPP benchmarks
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
import json
import logging
from typing import List, Dict
# Module-level logging setup: INFO so per-task benchmark progress is visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Hugging Face model repo to evaluate.
MODEL_NAME = "teolm30/fox1.3"
# Prefer GPU when available; greedy generation on CPU is extremely slow.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def load_model():
    """Load the evaluation model and its tokenizer.

    Returns:
        Tuple of (model, tokenizer). The model is loaded in fp16 and
        placed automatically across available devices; the tokenizer's
        pad token is aliased to EOS so batched generation works.
    """
    logger.info(f"Loading model: {MODEL_NAME}")
    tok = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    # Causal LMs often ship without a pad token; reuse EOS for padding.
    tok.pad_token = tok.eos_token
    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    return lm, tok
def run_humaneval(model, tokenizer) -> Dict:
    """Run the HumanEval benchmark (greedy decoding, pass@1).

    Args:
        model: Loaded causal LM used for generation.
        tokenizer: Matching tokenizer (pad token must be set).

    Returns:
        Dict with keys ``benchmark``, ``pass_at_1``, ``correct``,
        ``total`` and a per-task ``results`` list of statuses.
    """
    logger.info("Loading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test")
    # NOTE: no device_map here — the model is already device-mapped by
    # load_model(); passing it again to pipeline() conflicts.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
    )
    correct = 0
    total = len(dataset)
    results = []
    for i, item in enumerate(dataset):
        # A HumanEval prompt IS the task: imports + function signature +
        # docstring. Feed it unmodified — truncating at the first "def "
        # (as a naive split would) drops the function to be completed.
        full_prompt = item["prompt"]
        test = item["test"]
        entry_point = item.get("entry_point")
        try:
            output = pipe(full_prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]
            completion = generated[len(full_prompt):]
            # Strip markdown fences if the model emitted them.
            if "```python" in completion:
                completion = completion.split("```python")[1].split("```")[0]
            elif "```" in completion:
                completion = completion.split("```")[1].split("```")[0]
            # The completion continues the prompt, so it is only valid
            # Python when appended to the prompt text.
            program = full_prompt + completion
            # SECURITY: exec() runs untrusted model output in-process.
            # Acceptable only for local evaluation; use a sandboxed
            # subprocess with a timeout for anything else.
            exec_globals = {}
            exec(program, exec_globals)
            try:
                # The test field defines check(candidate); it must be
                # CALLED with the candidate function or nothing is asserted.
                exec(test, exec_globals)
                if entry_point and "check" in exec_globals:
                    exec_globals["check"](exec_globals[entry_point])
                correct += 1
                status = "PASS"
            except AssertionError:
                status = "FAIL"
            except Exception as e:
                status = f"ERROR: {str(e)[:50]}"
        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"
        results.append({
            "task_id": item.get("task_id", i),
            "status": status
        })
        if (i + 1) % 10 == 0:
            logger.info(f"HumanEval progress: {i+1}/{total} | Running pass@1: {correct}/{i+1}")
    pass_at_1 = correct / total
    logger.info(f"HumanEval PASS@1: {pass_at_1:.4f} ({correct}/{total})")
    return {
        "benchmark": "HumanEval",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results
    }
def run_mbpp(model, tokenizer) -> Dict:
    """Run the MBPP (sanitized) benchmark (greedy decoding, pass@1).

    Args:
        model: Loaded causal LM used for generation.
        tokenizer: Matching tokenizer (pad token must be set).

    Returns:
        Dict with keys ``benchmark``, ``pass_at_1``, ``correct``,
        ``total`` and a per-task ``results`` list of statuses.
    """
    logger.info("Loading MBPP dataset...")
    # Hub id for sanitized MBPP is google-research-datasets/mbpp.
    dataset = load_dataset("google-research-datasets/mbpp", "sanitized", split="test")
    # NOTE: no device_map here — the model is already device-mapped by
    # load_model(); passing it again to pipeline() conflicts.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
    )
    correct = 0
    total = min(len(dataset), 374)  # Standard subset size
    results = []
    # Slicing a HF Dataset (dataset[:total]) yields a dict of columns, so
    # iterating it would walk column NAMES, not rows — use select() instead.
    for i, item in enumerate(dataset.select(range(total))):
        prompt = item["prompt"]
        test_list = item["test_list"]
        full_prompt = f"### Instruction:\nWrite a Python function.\n\n### Input:\n{prompt}\n\n### Response:\n"
        try:
            output = pipe(full_prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]
            # Extract code from response, stripping markdown fences if present.
            response = generated[len(full_prompt):].strip()
            if "```python" in response:
                response = response.split("```python")[1].split("```")[0].strip()
            elif "```" in response:
                response = response.split("```")[1].split("```")[0].strip()
            # SECURITY: exec() runs untrusted model output in-process.
            # Acceptable only for local evaluation; sandbox otherwise.
            exec_globals = {}
            # Sanitized MBPP tests may rely on extra imports (test_imports).
            for import_stmt in item.get("test_imports", []):
                exec(import_stmt, exec_globals)
            exec(response, exec_globals)
            all_passed = True
            for test_code in test_list:
                try:
                    exec(test_code, exec_globals)
                except AssertionError:
                    all_passed = False
                    break
                except Exception:
                    all_passed = False
                    break
            if all_passed:
                correct += 1
                status = "PASS"
            else:
                status = "FAIL"
        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"
        results.append({
            # Sanitized MBPP identifies rows by task_id (not "prompts_id").
            "prompt_id": item.get("task_id", i),
            "status": status
        })
        if (i + 1) % 50 == 0:
            logger.info(f"MBPP progress: {i+1}/{total} | Running pass@1: {correct}/{i+1}")
    pass_at_1 = correct / total
    logger.info(f"MBPP PASS@1: {pass_at_1:.4f} ({correct}/{total})")
    return {
        "benchmark": "MBPP",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results
    }
def main():
    """Evaluate the model on all benchmarks and save a JSON summary.

    Returns:
        The summary dict that was written to ``benchmark_results.json``.
    """
    logger.info(f"Using device: {DEVICE}")
    model, tokenizer = load_model()
    # Run each benchmark once; order matters only for log readability.
    all_results = [
        run_humaneval(model, tokenizer),
        run_mbpp(model, tokenizer),
    ]
    summary = {
        "model": MODEL_NAME,
        "benchmarks": {
            r["benchmark"]: {
                "pass_at_1": r["pass_at_1"],
                "correct": r["correct"],
                "total": r["total"],
            }
            for r in all_results
        },
    }
    banner = "=" * 50
    logger.info("\n" + banner)
    logger.info("BENCHMARK RESULTS SUMMARY")
    logger.info(banner)
    for r in all_results:
        logger.info(f"{r['benchmark']}: {r['pass_at_1']:.4f} ({r['correct']}/{r['total']})")
    # Persist the summary (per-task details are intentionally excluded).
    output_file = "benchmark_results.json"
    with open(output_file, "w") as f:
        json.dump(summary, f, indent=2)
    logger.info(f"Results saved to {output_file}")
    return summary
if __name__ == "__main__":
    main()