#!/usr/bin/env python3
"""
Fox1.3 Evaluation Script
Evaluates on HumanEval and MBPP benchmarks
"""

import json
import logging
import textwrap
from typing import Dict, List, Optional

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_NAME = "teolm30/fox1.3"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Markers at which a raw (un-fenced) HumanEval completion is cut off, so that
# text generated after the end of the target function is not executed.
_STOP_SEQUENCES = ("\ndef ", "\nclass ", "\nif __name__", "\nprint(", "\n```")


def load_model():
    """Load the causal LM and its tokenizer named by MODEL_NAME.

    The tokenizer's pad token is set to its EOS token. The model is loaded in
    float16 with ``device_map="auto"``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    logger.info(f"Loading model: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    return model, tokenizer


def _build_pipeline(model, tokenizer):
    """Create the greedy text-generation pipeline shared by both benchmarks."""
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
        device_map="auto",
    )


def _generate_completion(pipe, tokenizer, prompt: str) -> str:
    """Return only the newly generated text for *prompt* (prompt removed)."""
    output = pipe(prompt, pad_token_id=tokenizer.eos_token_id)
    return output[0]["generated_text"][len(prompt):]


def _extract_fenced_code(text: str) -> Optional[str]:
    """Return the contents of the first Markdown code fence in *text*.

    A ```` ```python ```` fence is preferred over a bare ```` ``` ```` fence.
    Surrounding newlines are removed but indentation is kept. Returns ``None``
    when *text* contains no fence at all.
    """
    for opener in ("```python", "```"):
        if opener in text:
            return text.split(opener, 1)[1].split("```", 1)[0].strip("\n")
    return None


def _truncate_at_stop(completion: str) -> str:
    """Cut *completion* at the earliest stop sequence, if any is present."""
    cut = len(completion)
    for stop in _STOP_SEQUENCES:
        idx = completion.find(stop)
        if idx != -1:
            cut = min(cut, idx)
    return completion[:cut]


def _build_humaneval_program(prompt: str, completion: str, entry_point: str) -> str:
    """Combine a HumanEval prompt and a model completion into runnable code.

    The prompt (imports, signature, docstring) is always kept so that helper
    imports remain available. Three completion shapes are handled:

    * a fenced block that redefines ``entry_point`` - appended after the
      prompt, whose docstring-only stub is then overridden;
    * a fenced block holding only the function body - re-indented under the
      prompt's signature;
    * a raw continuation - truncated at the first stop sequence and appended
      verbatim, so the body's indentation is preserved.
    """
    has_fenced_block = "```python" in completion or completion.count("```") >= 2
    if has_fenced_block:
        snippet = textwrap.dedent(_extract_fenced_code(completion))
        if f"def {entry_point}(" in snippet:
            return f"{prompt}\n\n{snippet}\n"
        return f"{prompt}\n{textwrap.indent(snippet, '    ')}\n"
    return prompt + _truncate_at_stop(completion)


def _run_humaneval_case(program: str, test: str, entry_point: str) -> str:
    """Execute one HumanEval candidate and return "PASS", "FAIL" or "ERROR: ...".

    SECURITY: this exec()s model-generated code in the current process with no
    sandbox and no timeout. Run the script only in an isolated environment.
    """
    namespace: Dict = {}
    try:
        exec(program, namespace)
        # The test source only *defines* check(candidate); it must be called
        # explicitly, otherwise no assertion ever runs.
        exec(test, namespace)
        namespace["check"](namespace[entry_point])
    except AssertionError:
        return "FAIL"
    except (Exception, SystemExit) as e:
        # SystemExit is included so generated code calling exit() cannot
        # terminate the whole evaluation run.
        return f"ERROR: {str(e)[:50]}"
    return "PASS"


def _run_mbpp_case(code: str, test_imports: List[str], test_list: List[str]) -> str:
    """Execute one MBPP candidate and return "PASS", "FAIL" or "ERROR: ...".

    A failure while defining the candidate (or running the dataset's test
    imports) is reported as an error; any failing or crashing test assertion
    is reported as "FAIL".

    SECURITY: this exec()s model-generated code in the current process with no
    sandbox and no timeout. Run the script only in an isolated environment.
    """
    namespace: Dict = {}
    try:
        exec(code, namespace)
        for import_line in test_imports:
            exec(import_line, namespace)
    except (Exception, SystemExit) as e:
        return f"ERROR: {str(e)[:50]}"

    for test_code in test_list:
        try:
            exec(test_code, namespace)
        except (Exception, SystemExit):
            return "FAIL"
    return "PASS"


def run_humaneval(model, tokenizer) -> Dict:
    """Run HumanEval benchmark.

    Each problem's full prompt (signature and docstring included) is given to
    the model; the completion is joined with the prompt and verified by
    calling the dataset's ``check`` function on the ``entry_point`` function.

    Returns:
        A dict with keys ``benchmark``, ``pass_at_1``, ``correct``, ``total``
        and ``results`` (one ``{"task_id", "status"}`` entry per problem).
    """
    logger.info("Loading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test")

    pipe = _build_pipeline(model, tokenizer)

    correct = 0
    total = len(dataset)
    results = []

    for i, item in enumerate(dataset):
        prompt = item["prompt"]
        try:
            completion = _generate_completion(pipe, tokenizer, prompt)
            program = _build_humaneval_program(prompt, completion, item["entry_point"])
            status = _run_humaneval_case(program, item["test"], item["entry_point"])
        except Exception as e:
            # Best effort: a generation failure on one problem must not abort
            # the whole benchmark.
            status = f"ERROR: {str(e)[:50]}"

        if status == "PASS":
            correct += 1

        results.append({
            "task_id": item.get("task_id", i),
            "status": status,
        })

        if (i + 1) % 10 == 0:
            logger.info(
                "HumanEval progress: %d/%d | Running pass@1: %d/%d",
                i + 1, total, correct, i + 1,
            )

    # Guard against an empty dataset.
    pass_at_1 = correct / total if total else 0.0
    logger.info(f"HumanEval PASS@1: {pass_at_1:.4f} ({correct}/{total})")

    return {
        "benchmark": "HumanEval",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results,
    }


def run_mbpp(model, tokenizer) -> Dict:
    """Run MBPP benchmark.

    At most 374 problems from the sanitized test split are evaluated. Each
    generated solution is executed and then checked against every assertion
    in the problem's ``test_list``.

    Returns:
        A dict with keys ``benchmark``, ``pass_at_1``, ``correct``, ``total``
        and ``results`` (one ``{"prompt_id", "status"}`` entry per problem).
    """
    logger.info("Loading MBPP dataset...")
    # NOTE(review): MBPP appears to be published on the Hub under
    # "google-research-datasets/mbpp"; confirm that this repo id resolves.
    dataset = load_dataset("google-research/mbpp", "sanitized", split="test")

    pipe = _build_pipeline(model, tokenizer)

    correct = 0
    total = min(len(dataset), 374)  # Standard subset size
    results = []

    # Slicing a Dataset (dataset[:total]) yields a dict of columns, not rows;
    # select() keeps row-wise iteration.
    subset = dataset.select(range(total))

    for i, item in enumerate(subset):
        prompt = item["prompt"]
        full_prompt = (
            "### Instruction:\nWrite a Python function.\n\n"
            f"### Input:\n{prompt}\n\n### Response:\n"
        )

        try:
            response = _generate_completion(pipe, tokenizer, full_prompt).strip()
            fenced = _extract_fenced_code(response)
            code = fenced.strip() if fenced is not None else response
            status = _run_mbpp_case(
                code,
                item.get("test_imports") or [],
                item["test_list"],
            )
        except Exception as e:
            # Best effort: a generation failure on one problem must not abort
            # the whole benchmark.
            status = f"ERROR: {str(e)[:50]}"

        if status == "PASS":
            correct += 1

        results.append({
            "prompt_id": item.get("task_id", i),
            "status": status,
        })

        if (i + 1) % 50 == 0:
            logger.info(
                "MBPP progress: %d/%d | Running pass@1: %d/%d",
                i + 1, total, correct, i + 1,
            )

    # Guard against an empty dataset.
    pass_at_1 = correct / total if total else 0.0
    logger.info(f"MBPP PASS@1: {pass_at_1:.4f} ({correct}/{total})")

    return {
        "benchmark": "MBPP",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results,
    }


def main():
    """Load the model, run both benchmarks, log and save a JSON summary.

    Returns:
        The summary dict that is also written to ``benchmark_results.json``.
    """
    logger.info(f"Using device: {DEVICE}")

    model, tokenizer = load_model()

    # Run benchmarks
    humaneval_results = run_humaneval(model, tokenizer)
    mbpp_results = run_mbpp(model, tokenizer)

    # Summary
    summary = {
        "model": MODEL_NAME,
        "benchmarks": {
            "HumanEval": {
                "pass_at_1": humaneval_results["pass_at_1"],
                "correct": humaneval_results["correct"],
                "total": humaneval_results["total"],
            },
            "MBPP": {
                "pass_at_1": mbpp_results["pass_at_1"],
                "correct": mbpp_results["correct"],
                "total": mbpp_results["total"],
            },
        },
    }

    logger.info("\n" + "=" * 50)
    logger.info("BENCHMARK RESULTS SUMMARY")
    logger.info("=" * 50)
    logger.info(
        f"HumanEval: {humaneval_results['pass_at_1']:.4f} "
        f"({humaneval_results['correct']}/{humaneval_results['total']})"
    )
    logger.info(
        f"MBPP: {mbpp_results['pass_at_1']:.4f} "
        f"({mbpp_results['correct']}/{mbpp_results['total']})"
    )

    # Save results
    output_file = "benchmark_results.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    logger.info(f"Results saved to {output_file}")

    return summary


if __name__ == "__main__":
    main()