"""
Benchmark script for evaluating Helion-V2 on standard benchmarks.
Covers MMLU, HellaSwag, ARC, GSM8K, and TruthfulQA.
"""

import argparse
import json
import re
from typing import Dict

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
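

# All evaluations below use greedy decoding. MMLU and ARC read the answer
# letter from the generated text, HellaSwag ranks candidate endings by mean
# LM loss, GSM8K parses the last number in the generation, and TruthfulQA
# uses a simple string match against the reference choices.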
class BenchmarkEvaluator:
    """Evaluator for running benchmarks on Helion-V2."""

    def __init__(self, model_name: str, device: str = "cuda"):
        """Initialize evaluator with model."""
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Some checkpoints ship without a pad token; fall back to EOS so that
        # generate() does not complain about missing padding.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
        )
        self.model.eval()
        self.device = device

    def evaluate_mmlu(self, num_shots: int = 5) -> float:
        """
        Evaluate on MMLU (Massive Multitask Language Understanding).

        Args:
            num_shots: Number of examples for few-shot prompting
                (currently unused; the prompt below is zero-shot)

        Returns:
            Average accuracy across all subjects
        """
        print("\n=== Evaluating MMLU ===")
        dataset = load_dataset("cais/mmlu", "all", split="test")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="MMLU"):
            question = item["question"]
            choices = item["choices"]
            answer = item["answer"]

            # Format prompt
            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{chr(65+i)}. {choice}\n"
            prompt += "Answer:"

            # Get model prediction
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=1,
                    temperature=0.0,
                    do_sample=False,
                )
            # Decode only the newly generated token (max_new_tokens=1)
            response = self.tokenizer.decode(outputs[0][-1:], skip_special_tokens=True).strip()

            # Check if correct
            if response.upper() in ['A', 'B', 'C', 'D']:
                predicted_idx = ord(response.upper()) - ord('A')
                if predicted_idx == answer:
                    correct += 1

            total += 1
            if total >= 1000:  # Limit for testing
                break

        accuracy = correct / total if total > 0 else 0
        print(f"MMLU Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_hellaswag(self) -> float:
        """
        Evaluate on HellaSwag (commonsense reasoning).

        Returns:
            Accuracy on HellaSwag
        """
        print("\n=== Evaluating HellaSwag ===")
        dataset = load_dataset("Rowan/hellaswag", split="validation")
        # Slicing a Dataset with [:1000] returns a dict of columns, not rows;
        # use select() to keep per-example iteration.
        dataset = dataset.select(range(min(1000, len(dataset))))

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="HellaSwag"):
            context = item["ctx"]
            endings = item["endings"]
            label = int(item["label"])

            # Rank endings by length-normalized log-likelihood (negative mean LM loss)
            best_score = float('-inf')
            best_idx = -1
            for idx, ending in enumerate(endings):
                full_text = context + " " + ending
                inputs = self.tokenizer(full_text, return_tensors="pt").to(self.device)
                with torch.no_grad():
                    outputs = self.model(**inputs, labels=inputs["input_ids"])
                score = -outputs.loss.item()
                if score > best_score:
                    best_score = score
                    best_idx = idx

            if best_idx == label:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"HellaSwag Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_arc(self, challenge: bool = True) -> float:
        """
        Evaluate on ARC (AI2 Reasoning Challenge).

        Args:
            challenge: Use ARC-Challenge (harder) vs ARC-Easy

        Returns:
            Accuracy on ARC
        """
        subset = "ARC-Challenge" if challenge else "ARC-Easy"
        print(f"\n=== Evaluating {subset} ===")
        dataset = load_dataset("ai2_arc", subset, split="test")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc=subset):
            question = item["question"]
            choices = item["choices"]["text"]
            labels = item["choices"]["label"]
            answer_key = item["answerKey"]

            # Format prompt
            prompt = f"Question: {question}\n"
            for label, choice in zip(labels, choices):
                prompt += f"{label}. {choice}\n"
            prompt += "Answer:"

            # Get model prediction
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=5,
                    temperature=0.0,
                    do_sample=False,
                )
            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            ).strip()

            # Extract answer: first character of the generated text
            predicted = response[0] if response else ""
            if predicted.upper() == answer_key.upper():
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"{subset} Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_gsm8k(self) -> float:
        """
        Evaluate on GSM8K (grade school math).

        Returns:
            Accuracy on GSM8K
        """
        print("\n=== Evaluating GSM8K ===")
        dataset = load_dataset("gsm8k", "main", split="test")
        # Sample a subset for speed; select() keeps row-wise iteration
        # (slicing with [:500] would return a dict of columns instead).
        dataset = dataset.select(range(min(500, len(dataset))))

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="GSM8K"):
            question = item["question"]
            # Gold answer follows "####"; drop thousands separators so it parses as a float
            answer = item["answer"].split("####")[-1].strip().replace(",", "")

            # Format with chain-of-thought prompt
            prompt = f"Question: {question}\nLet's solve this step by step:\n"
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=400,
                    temperature=0.0,
                    do_sample=False,
                )
            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            )

            # Extract numerical answer: take the last number in the generation
            numbers = re.findall(r'-?\d+\.?\d*', response)
            if numbers:
                predicted = numbers[-1]
                if predicted.replace('.', '').replace('-', '').isdigit():
                    if float(predicted) == float(answer):
                        correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"GSM8K Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_truthfulqa(self) -> float:
        """
        Evaluate on TruthfulQA (truthfulness on multiple-choice questions).

        Note: this uses a simplified string-match score over the mc2 targets,
        not the official probability-based MC2 metric.

        Returns:
            Simplified multiple-choice accuracy
        """
        print("\n=== Evaluating TruthfulQA ===")
        dataset = load_dataset("truthful_qa", "multiple_choice", split="validation")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="TruthfulQA"):
            question = item["question"]
            mc2_targets = item["mc2_targets"]
            choices = mc2_targets["choices"]
            labels = mc2_targets["labels"]

            # Format prompt
            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{i+1}. {choice}\n"
            prompt += "Select all correct answers:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    temperature=0.0,
                    do_sample=False,
                )
            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            )

            # Simple scoring: count the item as correct if any true answer is
            # mentioned in the response, by text or by its option number
            response_lower = response.lower()
            found_correct = False
            for idx, (choice, label) in enumerate(zip(choices, labels)):
                if label == 1 and (choice.lower() in response_lower or str(idx+1) in response):
                    found_correct = True
                    break

            if found_correct:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"TruthfulQA Accuracy (simplified MC): {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def run_all_benchmarks(self) -> Dict[str, float]:
        """
        Run all available benchmarks.

        Returns:
            Dictionary of benchmark results
        """
        results = {}

        try:
            results["MMLU"] = self.evaluate_mmlu()
        except Exception as e:
            print(f"MMLU evaluation failed: {e}")
            results["MMLU"] = 0.0

        try:
            results["HellaSwag"] = self.evaluate_hellaswag()
        except Exception as e:
            print(f"HellaSwag evaluation failed: {e}")
            results["HellaSwag"] = 0.0

        try:
            results["ARC-Challenge"] = self.evaluate_arc(challenge=True)
        except Exception as e:
            print(f"ARC-Challenge evaluation failed: {e}")
            results["ARC-Challenge"] = 0.0

        try:
            results["GSM8K"] = self.evaluate_gsm8k()
        except Exception as e:
            print(f"GSM8K evaluation failed: {e}")
            results["GSM8K"] = 0.0

        try:
            results["TruthfulQA"] = self.evaluate_truthfulqa()
        except Exception as e:
            print(f"TruthfulQA evaluation failed: {e}")
            results["TruthfulQA"] = 0.0

        return results
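

# The results dictionary (and the JSON file written by main()) maps benchmark
# names to accuracies in [0, 1], e.g. {"MMLU": 0.62, "GSM8K": 0.41}
# (the numbers here are illustrative placeholders, not measured scores).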


def main():
    parser = argparse.ArgumentParser(description="Benchmark Helion-V2")
    parser.add_argument(
        "--model",
        type=str,
        default="DeepXR/Helion-V2",
        help="Model name or path"
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device to use"
    )
    parser.add_argument(
        "--benchmark",
        type=str,
        choices=["all", "mmlu", "hellaswag", "arc", "gsm8k", "truthfulqa"],
        default="all",
        help="Benchmark to run"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="benchmark_results.json",
        help="Output file for results"
    )
    args = parser.parse_args()

    evaluator = BenchmarkEvaluator(args.model, args.device)

    if args.benchmark == "all":
        results = evaluator.run_all_benchmarks()
    else:
        benchmark_map = {
            "mmlu": evaluator.evaluate_mmlu,
            "hellaswag": evaluator.evaluate_hellaswag,
            "arc": evaluator.evaluate_arc,
            "gsm8k": evaluator.evaluate_gsm8k,
            "truthfulqa": evaluator.evaluate_truthfulqa,
        }
        score = benchmark_map[args.benchmark]()
        results = {args.benchmark: score}

    # Save results
    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)

    print("\n=== Final Results ===")
    for benchmark, score in results.items():
        print(f"{benchmark}: {score:.2%}")
    print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()