Add evaluate.py: Benchmark evaluation on HumanEval + MBPP
evaluate.py
ADDED
@@ -0,0 +1,241 @@
#!/usr/bin/env python3
"""
Fox1.3 Evaluation Script
Evaluates on HumanEval and MBPP benchmarks
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from datasets import load_dataset
import json
import logging
from typing import List, Dict

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

MODEL_NAME = "teolm30/fox1.3"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def load_model():
    logger.info(f"Loading model: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )

    return model, tokenizer

def run_humaneval(model, tokenizer) -> Dict:
    """Run HumanEval benchmark."""
    logger.info("Loading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test")

    # The model is already dispatched via device_map="auto" in load_model(),
    # so no device placement is passed to the pipeline here.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None
    )

    correct = 0
    total = len(dataset)
    results = []

    for i, item in enumerate(dataset):
        # HumanEval prompts end with the target function's signature and
        # docstring; the model completes the body, so the prompt is used whole.
        prompt = item["prompt"]
        test = item["test"]
        entry_point = item["entry_point"]

        try:
            output = pipe(prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]
            completion = generated[len(prompt):]

            # Strip markdown fences in case the model restates the function
            if "```python" in completion:
                completion = completion.split("```python")[1].split("```")[0]
            elif "```" in completion:
                completion = completion.split("```")[1].split("```")[0]

            # A bare function body only parses together with the prompt;
            # a restated full function can be executed on its own.
            if completion.lstrip().startswith(("def ", "import ", "from ", "class ")):
                program = completion
            else:
                program = prompt + completion

            exec_globals = {}
            exec(program, exec_globals)
            # The test field defines check(candidate) but never calls it; the
            # asserts only run when check is invoked on the entry point.
            exec(test, exec_globals)
            try:
                exec_globals["check"](exec_globals[entry_point])
                correct += 1
                status = "PASS"
            except AssertionError:
                status = "FAIL"
            except Exception as e:
                status = f"ERROR: {str(e)[:50]}"

        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"

        results.append({
            "task_id": item.get("task_id", i),
            "status": status
        })

        if (i + 1) % 10 == 0:
            logger.info(f"HumanEval progress: {i+1}/{total} | Running pass@1: {correct}/{i+1}")

    pass_at_1 = correct / total
    logger.info(f"HumanEval PASS@1: {pass_at_1:.4f} ({correct}/{total})")

    return {
        "benchmark": "HumanEval",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results
    }

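# Note on the metric: with greedy decoding there is one sample per task, so
# pass@1 above is simply correct/total. If the script were extended to draw
# n samples per task, the unbiased estimator from the HumanEval paper,
# pass@k = 1 - C(n-c, k) / C(n, k), would apply. Minimal sketch of that
# estimator (a hypothetical helper, not called anywhere in this file):
def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k given c correct completions out of n samples."""
    from math import comb
    if n - c < k:
        # Too few incorrect samples to fill a size-k draw, so every
        # size-k subset contains at least one correct completion.
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)
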
def run_mbpp(model, tokenizer) -> Dict:
    """Run MBPP benchmark."""
    logger.info("Loading MBPP dataset...")
    dataset = load_dataset("google-research-datasets/mbpp", "sanitized", split="test")

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=256,
        do_sample=False,
        temperature=None,
        top_p=None
    )

    correct = 0
    total = min(len(dataset), 374)  # cap on the number of problems evaluated
    results = []

    # Slicing a Dataset returns a dict of columns, not rows; select() keeps
    # per-example access.
    for i, item in enumerate(dataset.select(range(total))):
        prompt = item["prompt"]
        test_list = item["test_list"]

        full_prompt = f"### Instruction:\nWrite a Python function.\n\n### Input:\n{prompt}\n\n### Response:\n"

        try:
            output = pipe(full_prompt, pad_token_id=tokenizer.eos_token_id)
            generated = output[0]["generated_text"]

            # Extract code from response
            response = generated[len(full_prompt):].strip()

            if "```python" in response:
                response = response.split("```python")[1].split("```")[0].strip()
            elif "```" in response:
                response = response.split("```")[1].split("```")[0].strip()

            # Test the generated code
            exec_globals = {}
            exec(response, exec_globals)
            # The sanitized split ships per-task imports that the asserts need
            for test_import in item.get("test_imports", []):
                exec(test_import, exec_globals)

            all_passed = True
            for test_code in test_list:
                try:
                    exec(test_code, exec_globals)
                except Exception:  # includes AssertionError
                    all_passed = False
                    break

            if all_passed:
                correct += 1
                status = "PASS"
            else:
                status = "FAIL"

        except Exception as e:
            status = f"ERROR: {str(e)[:50]}"

        results.append({
            "task_id": item.get("task_id", i),
            "status": status
        })

        if (i + 1) % 50 == 0:
            logger.info(f"MBPP progress: {i+1}/{total} | Running pass@1: {correct}/{i+1}")

    pass_at_1 = correct / total
    logger.info(f"MBPP PASS@1: {pass_at_1:.4f} ({correct}/{total})")

    return {
        "benchmark": "MBPP",
        "pass_at_1": pass_at_1,
        "correct": correct,
        "total": total,
        "results": results
    }

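# Note: the exec() calls above run model-generated code in this process with
# no timeout or isolation, so a completion containing an infinite loop or a
# destructive system call can hang or damage the evaluation run. A minimal
# sketch of a subprocess guard that could wrap those calls (hypothetical
# helpers, not wired into the functions above):
import multiprocessing

def _exec_snippets(snippets: List[str], queue) -> None:
    # Child process: run all snippets in one namespace, report the outcome.
    env = {}
    try:
        for snippet in snippets:
            exec(snippet, env)
        queue.put("PASS")
    except AssertionError:
        queue.put("FAIL")
    except Exception as e:
        queue.put(f"ERROR: {str(e)[:50]}")

def run_guarded(snippets: List[str], timeout: float = 10.0) -> str:
    """Execute code snippets in a child process with a hard timeout."""
    queue = multiprocessing.Queue()
    proc = multiprocessing.Process(target=_exec_snippets, args=(snippets, queue))
    proc.start()
    proc.join(timeout)
    if proc.is_alive():
        proc.terminate()
        proc.join()
        return "ERROR: timeout"
    return queue.get() if not queue.empty() else "ERROR: no result"
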
def main():
    logger.info(f"Using device: {DEVICE}")

    model, tokenizer = load_model()

    # Run benchmarks
    humaneval_results = run_humaneval(model, tokenizer)
    mbpp_results = run_mbpp(model, tokenizer)

    # Summary
    summary = {
        "model": MODEL_NAME,
        "benchmarks": {
            "HumanEval": {
                "pass_at_1": humaneval_results["pass_at_1"],
                "correct": humaneval_results["correct"],
                "total": humaneval_results["total"]
            },
            "MBPP": {
                "pass_at_1": mbpp_results["pass_at_1"],
                "correct": mbpp_results["correct"],
                "total": mbpp_results["total"]
            }
        }
    }

    logger.info("\n" + "="*50)
    logger.info("BENCHMARK RESULTS SUMMARY")
    logger.info("="*50)
    logger.info(f"HumanEval: {humaneval_results['pass_at_1']:.4f} ({humaneval_results['correct']}/{humaneval_results['total']})")
    logger.info(f"MBPP: {mbpp_results['pass_at_1']:.4f} ({mbpp_results['correct']}/{mbpp_results['total']})")

    # Save results
    output_file = "benchmark_results.json"
    with open(output_file, "w") as f:
        json.dump(summary, f, indent=2)
    logger.info(f"Results saved to {output_file}")

    return summary

if __name__ == "__main__":
    main()
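Assuming torch, transformers, and datasets are installed, python evaluate.py should run both benchmarks end to end and write benchmark_results.json to the working directory; with greedy decoding the reported pass@1 figures are deterministic for a given environment. Since the script executes model-generated code directly, running it inside a container or VM is the safer choice.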