# fox1.3 / evaluate.py
# Source: teolm30's Hugging Face repository (uploaded file page)
# Commit f7a5fb7 (verified): "Add evaluate.py: Benchmark evaluation on HumanEval + MBPP"
#!/usr/bin/env python3
"""
Fox1.3 Evaluation Script
Evaluates on HumanEval and MBPP benchmarks
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
import json
import logging
from typing import List, Dict
# Module-level logging setup: INFO so per-task benchmark progress is visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Hugging Face model repo to evaluate.
MODEL_NAME = "teolm30/fox1.3"
# Prefer GPU when available; greedy generation on CPU is extremely slow.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def load_model():
    """Load the evaluation model and its tokenizer.

    Returns:
        Tuple of (model, tokenizer). The model is loaded in fp16 and
        placed automatically across available devices; the tokenizer's
        pad token is aliased to EOS so batched generation works.
    """
    logger.info(f"Loading model: {MODEL_NAME}")
    tok = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    # Causal LMs often ship without a pad token; reuse EOS for padding.
    tok.pad_token = tok.eos_token
    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    return lm, tok
def run_humaneval(model, tokenizer) -> Dict:
    """Run the HumanEval benchmark (greedy decoding, pass@1).

    Args:
        model: Loaded causal LM used for generation.
        tokenizer: Matching tokenizer (pad token must be set).

    Returns:
        Dict with keys ``benchmark``, ``pass_at_1``, ``correct``,
        ``total`` and a per-task ``results`` list of statuses.
    """
    logger.info("Loading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test")
    # NOTE: no device_map here — the model is already device-mapped by
    # load_model(); passing it again to pipeline() conflicts.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
    )
    correct = 0
    total = len(dataset)
    results = []
    for i, item in enumerate(dataset):
        # A HumanEval prompt IS the task: imports + function signature +
        # docstring. Feed it unmodified — truncating at the first "def "
        # (as a naive split would) drops the function to be completed.
        full_prompt = item["prompt"]
        test = item["test"]
        entry_point = item.get("entry_point")
        try:
            output = pipe(full_prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]
            completion = generated[len(full_prompt):]
            # Strip markdown fences if the model emitted them.
            if "```python" in completion:
                completion = completion.split("```python")[1].split("```")[0]
            elif "```" in completion:
                completion = completion.split("```")[1].split("```")[0]
            # The completion continues the prompt, so it is only valid
            # Python when appended to the prompt text.
            program = full_prompt + completion
            # SECURITY: exec() runs untrusted model output in-process.
            # Acceptable only for local evaluation; use a sandboxed
            # subprocess with a timeout for anything else.
            exec_globals = {}
            exec(program, exec_globals)
            try:
                # The test field defines check(candidate); it must be
                # CALLED with the candidate function or nothing is asserted.
                exec(test, exec_globals)
                if entry_point and "check" in exec_globals:
                    exec_globals["check"](exec_globals[entry_point])
                correct += 1
                status = "PASS"
            except AssertionError:
                status = "FAIL"
            except Exception as e:
                status = f"ERROR: {str(e)[:50]}"
        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"
        results.append({
            "task_id": item.get("task_id", i),
            "status": status
        })
        if (i + 1) % 10 == 0:
            logger.info(f"HumanEval progress: {i+1}/{total} | Running pass@1: {correct}/{i+1}")
    pass_at_1 = correct / total
    logger.info(f"HumanEval PASS@1: {pass_at_1:.4f} ({correct}/{total})")
    return {
        "benchmark": "HumanEval",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results
    }
def run_mbpp(model, tokenizer) -> Dict:
    """Run the MBPP (sanitized) benchmark (greedy decoding, pass@1).

    Args:
        model: Loaded causal LM used for generation.
        tokenizer: Matching tokenizer (pad token must be set).

    Returns:
        Dict with keys ``benchmark``, ``pass_at_1``, ``correct``,
        ``total`` and a per-task ``results`` list of statuses.
    """
    logger.info("Loading MBPP dataset...")
    # Hub id for sanitized MBPP is google-research-datasets/mbpp.
    dataset = load_dataset("google-research-datasets/mbpp", "sanitized", split="test")
    # NOTE: no device_map here — the model is already device-mapped by
    # load_model(); passing it again to pipeline() conflicts.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
    )
    correct = 0
    total = min(len(dataset), 374)  # Standard subset size
    results = []
    # Slicing a HF Dataset (dataset[:total]) yields a dict of columns, so
    # iterating it would walk column NAMES, not rows — use select() instead.
    for i, item in enumerate(dataset.select(range(total))):
        prompt = item["prompt"]
        test_list = item["test_list"]
        full_prompt = f"### Instruction:\nWrite a Python function.\n\n### Input:\n{prompt}\n\n### Response:\n"
        try:
            output = pipe(full_prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]
            # Extract code from response, stripping markdown fences if present.
            response = generated[len(full_prompt):].strip()
            if "```python" in response:
                response = response.split("```python")[1].split("```")[0].strip()
            elif "```" in response:
                response = response.split("```")[1].split("```")[0].strip()
            # SECURITY: exec() runs untrusted model output in-process.
            # Acceptable only for local evaluation; sandbox otherwise.
            exec_globals = {}
            # Sanitized MBPP tests may rely on extra imports (test_imports).
            for import_stmt in item.get("test_imports", []):
                exec(import_stmt, exec_globals)
            exec(response, exec_globals)
            all_passed = True
            for test_code in test_list:
                try:
                    exec(test_code, exec_globals)
                except AssertionError:
                    all_passed = False
                    break
                except Exception:
                    all_passed = False
                    break
            if all_passed:
                correct += 1
                status = "PASS"
            else:
                status = "FAIL"
        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"
        results.append({
            # Sanitized MBPP identifies rows by task_id (not "prompts_id").
            "prompt_id": item.get("task_id", i),
            "status": status
        })
        if (i + 1) % 50 == 0:
            logger.info(f"MBPP progress: {i+1}/{total} | Running pass@1: {correct}/{i+1}")
    pass_at_1 = correct / total
    logger.info(f"MBPP PASS@1: {pass_at_1:.4f} ({correct}/{total})")
    return {
        "benchmark": "MBPP",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results
    }
def main():
    """Evaluate the model on all benchmarks and save a JSON summary.

    Returns:
        The summary dict that was written to ``benchmark_results.json``.
    """
    logger.info(f"Using device: {DEVICE}")
    model, tokenizer = load_model()
    # Run each benchmark once; order matters only for log readability.
    all_results = [
        run_humaneval(model, tokenizer),
        run_mbpp(model, tokenizer),
    ]
    summary = {
        "model": MODEL_NAME,
        "benchmarks": {
            r["benchmark"]: {
                "pass_at_1": r["pass_at_1"],
                "correct": r["correct"],
                "total": r["total"],
            }
            for r in all_results
        },
    }
    banner = "=" * 50
    logger.info("\n" + banner)
    logger.info("BENCHMARK RESULTS SUMMARY")
    logger.info(banner)
    for r in all_results:
        logger.info(f"{r['benchmark']}: {r['pass_at_1']:.4f} ({r['correct']}/{r['total']})")
    # Persist the summary (per-task details are intentionally excluded).
    output_file = "benchmark_results.json"
    with open(output_file, "w") as f:
        json.dump(summary, f, indent=2)
    logger.info(f"Results saved to {output_file}")
    return summary
if __name__ == "__main__":
    main()