"""
Fox1.3 Evaluation Script
Evaluates on HumanEval and MBPP benchmarks
"""
|
|
import json
import logging
from typing import Dict, List

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
|
| logging.basicConfig(level=logging.INFO) |
| logger = logging.getLogger(__name__) |
|
|
| MODEL_NAME = "teolm30/fox1.3" |
| DEVICE = "cuda" if torch.cuda.is_available() else "cpu" |
|
|
def load_model():
    """Load the evaluation model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple: the model in fp16 with automatic
        device placement, and a tokenizer whose pad token is set to EOS.
    """
    logger.info(f"Loading model: {MODEL_NAME}")

    tok = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    # No dedicated pad token on this model; reuse EOS so batched
    # generation does not complain about missing padding.
    tok.pad_token = tok.eos_token

    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    return lm, tok
|
|
def run_humaneval(model, tokenizer) -> Dict:
    """Run the HumanEval benchmark and compute greedy pass@1.

    Args:
        model: Causal LM to evaluate.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Dict with keys ``benchmark``, ``pass_at_1``, ``correct``,
        ``total`` and per-task ``results`` (task_id + status).
    """
    logger.info("Loading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test")

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
        device_map="auto",
    )

    correct = 0
    total = len(dataset)
    results = []

    for i, item in enumerate(dataset):
        # HumanEval prompts already contain the function signature and
        # docstring; the model is expected to complete the body. The
        # previous version truncated everything from "def " onward,
        # which removed the very signature the model needed.
        full_prompt = item["prompt"]
        test = item["test"]
        entry_point = item["entry_point"]

        try:
            output = pipe(full_prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]

            # Keep only the newly generated continuation.
            completion = generated[len(full_prompt):]

            # Strip markdown code fences if the model emitted them.
            if "```python" in completion:
                completion = completion.split("```python")[1].split("```")[0]
            elif "```" in completion:
                completion = completion.split("```")[1].split("```")[0]

            # Candidate program = prompt (signature + imports) + completion.
            program = full_prompt + completion

            # WARNING: exec of model-generated code is inherently unsafe;
            # run this script only inside a sandbox/container.
            exec_globals = {}
            try:
                exec(program, exec_globals)
                # The HumanEval "test" field only *defines* check(candidate);
                # it must be called with the entry-point function, otherwise
                # no assertion ever runs (the old code exec'd the test string
                # twice without invoking check, passing everything that parsed).
                exec(test, exec_globals)
                exec_globals["check"](exec_globals[entry_point])
                correct += 1
                status = "PASS"
            except AssertionError:
                status = "FAIL"
            except Exception as e:
                status = f"ERROR: {str(e)[:50]}"

        except Exception as e:
            # Generation-side failure (OOM, tokenizer error, ...).
            status = f"ERROR: {str(e)[:50]}"

        results.append({
            "task_id": item.get("task_id", i),
            "status": status,
        })

        if (i + 1) % 10 == 0:
            logger.info(f"HumanEval progress: {i+1}/{total} | Running pass@{1}: {correct}/{i+1}")

    # Guard against an empty split so we never divide by zero.
    pass_at_1 = correct / total if total else 0.0
    logger.info(f"HumanEval PASS@1: {pass_at_1:.4f} ({correct}/{total})")

    return {
        "benchmark": "HumanEval",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results,
    }
|
|
def run_mbpp(model, tokenizer) -> Dict:
    """Run the MBPP (sanitized) benchmark and compute greedy pass@1.

    Args:
        model: Causal LM to evaluate.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Dict with keys ``benchmark``, ``pass_at_1``, ``correct``,
        ``total`` and per-task ``results`` (prompt_id + status).
    """
    logger.info("Loading MBPP dataset...")
    dataset = load_dataset("google-research/mbpp", "sanitized", split="test")

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
        device_map="auto",
    )

    correct = 0
    total = min(len(dataset), 374)
    results = []

    # NOTE: `dataset[:total]` on a HF Dataset returns a dict of *columns*,
    # so the old loop iterated over column-name strings and crashed on
    # item["prompt"]. `select` yields proper row dicts.
    for i, item in enumerate(dataset.select(range(total))):
        prompt = item["prompt"]
        test_list = item["test_list"]

        full_prompt = f"### Instruction:\nWrite a Python function.\n\n### Input:\n{prompt}\n\n### Response:\n"

        try:
            output = pipe(full_prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]

            # Keep only the newly generated continuation.
            response = generated[len(full_prompt):].strip()

            # Strip markdown code fences if the model emitted them.
            if "```python" in response:
                response = response.split("```python")[1].split("```")[0].strip()
            elif "```" in response:
                response = response.split("```")[1].split("```")[0].strip()

            # WARNING: exec of model-generated code is inherently unsafe;
            # run this script only inside a sandbox/container.
            exec_globals = {}
            # Sanitized MBPP ships test-side imports (e.g. `import math`)
            # that must be in scope before the asserts run.
            for test_import in item.get("test_imports") or []:
                exec(test_import, exec_globals)
            exec(response, exec_globals)

            # All asserts must pass for the task to count as correct.
            all_passed = True
            for test_code in test_list:
                try:
                    exec(test_code, exec_globals)
                except Exception:
                    # AssertionError or any runtime error fails the task.
                    all_passed = False
                    break

            if all_passed:
                correct += 1
                status = "PASS"
            else:
                status = "FAIL"

        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"

        results.append({
            # Sanitized MBPP exposes "task_id"; the old lookup used the
            # nonexistent "prompts_id" key and always fell back to i.
            "prompt_id": item.get("task_id", i),
            "status": status,
        })

        if (i + 1) % 50 == 0:
            logger.info(f"MBPP progress: {i+1}/{total} | Running pass@1: {correct}/{i+1}")

    # Guard against an empty split so we never divide by zero.
    pass_at_1 = correct / total if total else 0.0
    logger.info(f"MBPP PASS@1: {pass_at_1:.4f} ({correct}/{total})")

    return {
        "benchmark": "MBPP",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results,
    }
|
|
def main():
    """Evaluate the model on both benchmarks and persist a JSON summary.

    Returns:
        The summary dict (model id plus per-benchmark pass@1 numbers).
    """
    logger.info(f"Using device: {DEVICE}")

    model, tokenizer = load_model()

    humaneval_results = run_humaneval(model, tokenizer)
    mbpp_results = run_mbpp(model, tokenizer)

    # Fold both result dicts into one summary keyed by benchmark name.
    summary = {
        "model": MODEL_NAME,
        "benchmarks": {
            res["benchmark"]: {
                "pass_at_1": res["pass_at_1"],
                "correct": res["correct"],
                "total": res["total"],
            }
            for res in (humaneval_results, mbpp_results)
        },
    }

    logger.info("\n" + "=" * 50)
    logger.info("BENCHMARK RESULTS SUMMARY")
    logger.info("=" * 50)
    logger.info(f"HumanEval: {humaneval_results['pass_at_1']:.4f} ({humaneval_results['correct']}/{humaneval_results['total']})")
    logger.info(f"MBPP: {mbpp_results['pass_at_1']:.4f} ({mbpp_results['correct']}/{mbpp_results['total']})")

    output_file = "benchmark_results.json"
    with open(output_file, "w") as f:
        json.dump(summary, f, indent=2)
    logger.info(f"Results saved to {output_file}")

    return summary
|
|
| if __name__ == "__main__": |
| main() |
|
|