""" Benchmark script for evaluating Helion-V2 on standard benchmarks. Includes MMLU, HellaSwag, ARC, TruthfulQA, GSM8K, and HumanEval. """ import torch import json import numpy as np from transformers import AutoTokenizer, AutoModelForCausalLM from datasets import load_dataset from tqdm import tqdm import argparse from typing import Dict, List, Tuple import re class BenchmarkEvaluator: """Evaluator for running benchmarks on Helion-V2.""" def __init__(self, model_name: str, device: str = "cuda"): """Initialize evaluator with model.""" print(f"Loading model: {model_name}") self.tokenizer = AutoTokenizer.from_pretrained(model_name) self.model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float16, device_map=device, ) self.model.eval() self.device = device def evaluate_mmlu(self, num_shots: int = 5) -> float: """ Evaluate on MMLU (Massive Multitask Language Understanding). Args: num_shots: Number of examples for few-shot learning Returns: Average accuracy across all subjects """ print("\n=== Evaluating MMLU ===") dataset = load_dataset("cais/mmlu", "all", split="test") correct = 0 total = 0 for item in tqdm(dataset, desc="MMLU"): question = item["question"] choices = item["choices"] answer = item["answer"] # Format prompt prompt = f"Question: {question}\n" for i, choice in enumerate(choices): prompt += f"{chr(65+i)}. {choice}\n" prompt += "Answer:" # Get model prediction inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device) with torch.no_grad(): outputs = self.model.generate( **inputs, max_new_tokens=1, temperature=0.0, do_sample=False, ) response = self.tokenizer.decode(outputs[0][-1:], skip_special_tokens=True).strip() # Check if correct if response.upper() in ['A', 'B', 'C', 'D']: predicted_idx = ord(response.upper()) - ord('A') if predicted_idx == answer: correct += 1 total += 1 if total >= 1000: # Limit for testing break accuracy = correct / total if total > 0 else 0 print(f"MMLU Accuracy: {accuracy:.2%} ({correct}/{total})") return accuracy def evaluate_hellaswag(self) -> float: """ Evaluate on HellaSwag (commonsense reasoning). Returns: Accuracy on HellaSwag """ print("\n=== Evaluating HellaSwag ===") dataset = load_dataset("Rowan/hellaswag", split="validation") correct = 0 total = 0 for item in tqdm(dataset[:1000], desc="HellaSwag"): context = item["ctx"] endings = item["endings"] label = int(item["label"]) # Calculate log-likelihood for each ending best_score = float('-inf') best_idx = -1 for idx, ending in enumerate(endings): full_text = context + " " + ending inputs = self.tokenizer(full_text, return_tensors="pt").to(self.device) with torch.no_grad(): outputs = self.model(**inputs, labels=inputs["input_ids"]) score = -outputs.loss.item() if score > best_score: best_score = score best_idx = idx if best_idx == label: correct += 1 total += 1 accuracy = correct / total if total > 0 else 0 print(f"HellaSwag Accuracy: {accuracy:.2%} ({correct}/{total})") return accuracy def evaluate_arc(self, challenge: bool = True) -> float: """ Evaluate on ARC (AI2 Reasoning Challenge). 

    def evaluate_arc(self, challenge: bool = True) -> float:
        """
        Evaluate on ARC (AI2 Reasoning Challenge).

        Args:
            challenge: Use ARC-Challenge (harder) vs ARC-Easy

        Returns:
            Accuracy on ARC
        """
        subset = "ARC-Challenge" if challenge else "ARC-Easy"
        print(f"\n=== Evaluating {subset} ===")
        dataset = load_dataset("ai2_arc", subset, split="test")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc=subset):
            question = item["question"]
            choices = item["choices"]["text"]
            labels = item["choices"]["label"]
            answer_key = item["answerKey"]

            # Format prompt
            prompt = f"Question: {question}\n"
            for label, choice in zip(labels, choices):
                prompt += f"{label}. {choice}\n"
            prompt += "Answer:"

            # Get model prediction
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=5,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            ).strip()

            # Extract answer (first character of the response)
            predicted = response[0] if response else ""
            if predicted.upper() == answer_key.upper():
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"{subset} Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_gsm8k(self) -> float:
        """
        Evaluate on GSM8K (grade school math).

        Returns:
            Accuracy on GSM8K
        """
        print("\n=== Evaluating GSM8K ===")
        dataset = load_dataset("gsm8k", "main", split="test")
        # Sample for speed; select() keeps per-example dicts (plain slicing
        # of a datasets.Dataset would return a dict of columns).
        dataset = dataset.select(range(min(500, len(dataset))))

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="GSM8K"):
            question = item["question"]
            # Gold answers look like "... #### 42"; large numbers may contain
            # commas, so strip them before comparing.
            answer = item["answer"].split("####")[-1].strip().replace(",", "")

            # Format with chain-of-thought prompt
            prompt = f"Question: {question}\nLet's solve this step by step:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=400,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            )

            # Extract numerical answer: take the last number in the response
            numbers = re.findall(r"-?\d+\.?\d*", response.replace(",", ""))
            if numbers:
                try:
                    if float(numbers[-1]) == float(answer):
                        correct += 1
                except ValueError:
                    pass

            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"GSM8K Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy
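
    # --- Optional helper, not part of the original script ---
    # evaluate_mmlu() accepts num_shots but currently builds a zero-shot
    # prompt. A minimal sketch of few-shot prompt construction; it assumes
    # demonstration items come from a held-out split (e.g. MMLU's "dev"
    # split) with the same question/choices/answer fields.
    @staticmethod
    def build_few_shot_prefix(dev_items, num_shots: int = 5) -> str:
        """Concatenate num_shots solved examples into a prompt prefix."""
        prefix = ""
        for item in list(dev_items)[:num_shots]:
            prefix += f"Question: {item['question']}\n"
            for i, choice in enumerate(item["choices"]):
                prefix += f"{chr(65 + i)}. {choice}\n"
            # MMLU stores the gold answer as an index 0-3; render it as A-D.
            prefix += f"Answer: {chr(65 + item['answer'])}\n\n"
        return prefix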

    def evaluate_truthfulqa(self) -> float:
        """
        Evaluate on TruthfulQA (truthfulness and informativeness).

        Returns:
            Accuracy under a simplified generative proxy for the MC2 score:
            a prediction counts as correct if any true answer is mentioned
        """
        print("\n=== Evaluating TruthfulQA ===")
        dataset = load_dataset("truthful_qa", "multiple_choice", split="validation")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="TruthfulQA"):
            question = item["question"]
            mc2_targets = item["mc2_targets"]
            choices = mc2_targets["choices"]
            labels = mc2_targets["labels"]

            # Format prompt
            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{i + 1}. {choice}\n"
            prompt += "Select all correct answers:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            )

            # Simple scoring: correct if any true answer is mentioned, either
            # verbatim or by its option number (word-bounded so that "1" does
            # not match inside "10")
            response_lower = response.lower()
            found_correct = False
            for idx, (choice, label) in enumerate(zip(choices, labels)):
                if label == 1 and (
                    choice.lower() in response_lower
                    or re.search(rf"\b{idx + 1}\b", response)
                ):
                    found_correct = True
                    break

            if found_correct:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"TruthfulQA Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def run_all_benchmarks(self) -> Dict[str, float]:
        """
        Run all available benchmarks.

        Returns:
            Dictionary of benchmark results
        """
        benchmarks = [
            ("MMLU", self.evaluate_mmlu),
            ("HellaSwag", self.evaluate_hellaswag),
            ("ARC-Challenge", lambda: self.evaluate_arc(challenge=True)),
            ("GSM8K", self.evaluate_gsm8k),
            ("TruthfulQA", self.evaluate_truthfulqa),
        ]

        results = {}
        for name, run in benchmarks:
            try:
                results[name] = run()
            except Exception as e:
                print(f"{name} evaluation failed: {e}")
                results[name] = 0.0
        return results


def main():
    parser = argparse.ArgumentParser(description="Benchmark Helion-V2")
    parser.add_argument(
        "--model",
        type=str,
        default="DeepXR/Helion-V2",
        help="Model name or path",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device to use",
    )
    parser.add_argument(
        "--benchmark",
        type=str,
        choices=["all", "mmlu", "hellaswag", "arc", "gsm8k", "truthfulqa"],
        default="all",
        help="Benchmark to run",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="benchmark_results.json",
        help="Output file for results",
    )
    args = parser.parse_args()

    evaluator = BenchmarkEvaluator(args.model, args.device)

    if args.benchmark == "all":
        results = evaluator.run_all_benchmarks()
    else:
        benchmark_map = {
            "mmlu": evaluator.evaluate_mmlu,
            "hellaswag": evaluator.evaluate_hellaswag,
            "arc": evaluator.evaluate_arc,
            "gsm8k": evaluator.evaluate_gsm8k,
            "truthfulqa": evaluator.evaluate_truthfulqa,
        }
        score = benchmark_map[args.benchmark]()
        results = {args.benchmark: score}

    # Save results
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2)

    print("\n=== Final Results ===")
    for benchmark, score in results.items():
        print(f"{benchmark}: {score:.2%}")
    print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()
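
# Example usage (the script filename below is assumed; the flags match the
# argument parser above):
#
#   python benchmark.py --model DeepXR/Helion-V2 --benchmark all
#   python benchmark.py --benchmark gsm8k --output gsm8k_results.json
#
# Or programmatically:
#
#   evaluator = BenchmarkEvaluator("DeepXR/Helion-V2", device="cuda")
#   print(evaluator.evaluate_arc(challenge=False))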