#!/usr/bin/env python3
"""
Fox1.3 Evaluation Script
Evaluates on HumanEval and MBPP benchmarks
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
import json
import logging
from typing import List, Dict

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_NAME = "teolm30/fox1.3"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def load_model():
    logger.info(f"Loading model: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    
    return model, tokenizer

def run_humaneval(model, tokenizer) -> Dict:
    """Run HumanEval benchmark."""
    logger.info("Loading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test")
    
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
        device_map="auto"
    )
    
    correct = 0
    total = len(dataset)
    results = []
    
    for i, item in enumerate(dataset):
        prompt = item["prompt"]
        test = item["test"]
        entry_point = item["entry_point"]
        
        # HumanEval prompts already contain the function signature and
        # docstring; feed them to the model unchanged and let it complete
        # the body. Truncating the prompt at "def " would drop the task.
        full_prompt = prompt
        
        try:
            output = pipe(full_prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]
            
            # Keep only the newly generated completion
            completion = generated[len(full_prompt):]
            
            # Strip markdown fences if the model emitted them
            if "```python" in completion:
                completion = completion.split("```python")[1].split("```")[0]
            elif "```" in completion:
                completion = completion.split("```")[1].split("```")[0]
            
            # Assemble the full program: prompt + completion + test harness.
            # The HumanEval "test" field only defines check(); it has to be
            # called with the entry point for the assertions to actually run.
            program = full_prompt + completion + "\n\n" + test + f"\ncheck({entry_point})\n"
            
            exec_globals = {}
            try:
                exec(program, exec_globals)
                correct += 1
                status = "PASS"
            except AssertionError:
                status = "FAIL"
            except Exception as e:
                status = f"ERROR: {str(e)[:50]}"
                
        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"
        
        results.append({
            "task_id": item.get("task_id", i),
            "status": status
        })
        
        if (i + 1) % 10 == 0:
            logger.info(f"HumanEval progress: {i+1}/{total} | Running pass@{1}: {correct}/{i+1}")
    
    pass_at_1 = correct / total
    logger.info(f"HumanEval PASS@1: {pass_at_1:.4f} ({correct}/{total})")
    
    return {
        "benchmark": "HumanEval",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results
    }
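
# For reference: the unbiased pass@k estimator from the HumanEval paper
# (Chen et al., 2021). This script draws a single greedy sample per task,
# so run_humaneval() reports plain pass@1; the helper below is only a
# sketch of how pass@k would be estimated from n samples with c passing.
def estimate_pass_at_k(n: int, c: int, k: int) -> float:
    """Return 1 - C(n - c, k) / C(n, k) in a numerically stable form."""
    if n - c < k:
        return 1.0
    result = 1.0
    for i in range(n - c + 1, n + 1):
        result *= 1.0 - k / i
    return 1.0 - result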

def run_mbpp(model, tokenizer) -> Dict:
    """Run MBPP benchmark."""
    logger.info("Loading MBPP dataset...")
    dataset = load_dataset("google-research/mbpp", "sanitized", split="test")
    
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None,
        device_map="auto"
    )
    
    correct = 0
    total = min(len(dataset), 374)  # Cap on the number of problems evaluated
    results = []
    
    # Slicing a Hugging Face Dataset returns a dict of columns, not rows,
    # so use select() to iterate over individual examples
    for i, item in enumerate(dataset.select(range(total))):
        prompt = item["prompt"]
        test_list = item["test_list"]
        
        full_prompt = f"### Instruction:\nWrite a Python function.\n\n### Input:\n{prompt}\n\n### Response:\n"
        
        try:
            output = pipe(full_prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]
            
            # Extract code from response
            response = generated[len(full_prompt):].strip()
            
            if "```python" in response:
                response = response.split("```python")[1].split("```")[0].strip()
            elif "```" in response:
                response = response.split("```")[1].split("```")[0].strip()
            
            # Test the generated code; the sanitized MBPP split ships any
            # required imports separately in "test_imports"
            exec_globals = {}
            for import_stmt in item.get("test_imports", []):
                exec(import_stmt, exec_globals)
            exec(response, exec_globals)
            
            all_passed = True
            for test_code in test_list:
                try:
                    exec(test_code, exec_globals)
                except Exception:
                    # Covers failed assertions as well as runtime errors
                    all_passed = False
                    break
            
            if all_passed:
                correct += 1
                status = "PASS"
            else:
                status = "FAIL"
                
        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"
        
        results.append({
            "prompt_id": item.get("prompts_id", i),
            "status": status
        })
        
        if (i + 1) % 50 == 0:
            logger.info(f"MBPP progress: {i+1}/{total} | Running pass@1: {correct}/{i+1}")
    
    pass_at_1 = correct / total
    logger.info(f"MBPP PASS@1: {pass_at_1:.4f} ({correct}/{total})")
    
    return {
        "benchmark": "MBPP",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results
    }
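
# Note on safety: both benchmark loops exec() model-generated code in the
# current process, which is convenient but unsafe and can hang on
# non-terminating programs. A minimal sketch of a safer alternative using
# only the standard library: run a candidate program in a subprocess with a
# timeout and treat a non-zero exit code or a timeout as a failure. It is
# not wired into the loops above.
def run_program_sandboxed(program: str, timeout_s: float = 10.0) -> bool:
    """Execute a candidate program in a child Python process and return
    True only if it exits cleanly within the timeout."""
    import subprocess
    import sys
    try:
        proc = subprocess.run(
            [sys.executable, "-c", program],
            capture_output=True,
            timeout=timeout_s,
        )
        return proc.returncode == 0
    except subprocess.TimeoutExpired:
        return False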

def main():
    logger.info(f"Using device: {DEVICE}")
    
    model, tokenizer = load_model()
    
    # Run benchmarks
    humaneval_results = run_humaneval(model, tokenizer)
    mbpp_results = run_mbpp(model, tokenizer)
    
    # Summary
    summary = {
        "model": MODEL_NAME,
        "benchmarks": {
            "HumanEval": {
                "pass_at_1": humaneval_results["pass_at_1"],
                "correct": humaneval_results["correct"],
                "total": humaneval_results["total"]
            },
            "MBPP": {
                "pass_at_1": mbpp_results["pass_at_1"],
                "correct": mbpp_results["correct"],
                "total": mbpp_results["total"]
            }
        }
    }
    
    logger.info("\n" + "="*50)
    logger.info("BENCHMARK RESULTS SUMMARY")
    logger.info("="*50)
    logger.info(f"HumanEval: {humaneval_results['pass_at_1']:.4f} ({humaneval_results['correct']}/{humaneval_results['total']})")
    logger.info(f"MBPP:      {mbpp_results['pass_at_1']:.4f} ({mbpp_results['correct']}/{mbpp_results['total']})")
    
    # Save results
    output_file = "benchmark_results.json"
    with open(output_file, "w") as f:
        json.dump(summary, f, indent=2)
    logger.info(f"Results saved to {output_file}")
    
    return summary

if __name__ == "__main__":
    main()