"""
Benchmark script for evaluating Helion-V2 on standard benchmarks.
Includes MMLU, HellaSwag, ARC, TruthfulQA, and GSM8K.
"""
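
# Example invocations (the script filename below is an assumption; substitute
# this file's actual name in the repo):
#
#   python benchmark.py --benchmark all
#   python benchmark.py --benchmark gsm8k --model DeepXR/Helion-V2 --output results.json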
|
|
import argparse
import json
import re
from typing import Dict, List

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
|
|
class BenchmarkEvaluator:
    """Evaluator for running benchmarks on Helion-V2."""

    def __init__(self, model_name: str, device: str = "cuda"):
        """Initialize evaluator with model."""
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
        )
        self.model.eval()
        self.device = device

    def evaluate_mmlu(self) -> float:
        """
        Evaluate on MMLU (Massive Multitask Language Understanding).

        Note: this is a zero-shot evaluation; few-shot prompting is not
        implemented here (see the sketch below this method).

        Returns:
            Average accuracy across all subjects
        """
        print("\n=== Evaluating MMLU ===")
        dataset = load_dataset("cais/mmlu", "all", split="test")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="MMLU"):
            question = item["question"]
            choices = item["choices"]
            answer = item["answer"]

            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{chr(65 + i)}. {choice}\n"
            prompt += "Answer:"

            # temperature is ignored (and warned about) when do_sample=False,
            # so greedy decoding is requested without it.
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=1,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            ).strip()

            if response.upper() in ("A", "B", "C", "D"):
                predicted_idx = ord(response.upper()) - ord("A")
                if predicted_idx == answer:
                    correct += 1

            total += 1

            # Cap the run at 1000 questions to keep evaluation time manageable.
            if total >= 1000:
                break

        accuracy = correct / total if total > 0 else 0
        print(f"MMLU Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy
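
    # Hedged sketch (not wired into evaluate_mmlu above): few-shot prompting
    # could be added by prefixing k solved examples from the MMLU dev split.
    # The method name and its use of the dev split are assumptions, not part
    # of the original script.
    def build_mmlu_fewshot_prefix(self, num_shots: int = 5) -> str:
        """Build a k-shot prompt prefix from the MMLU dev split (sketch)."""
        dev = load_dataset("cais/mmlu", "all", split="dev")
        prefix = ""
        for item in dev.select(range(num_shots)):
            prefix += f"Question: {item['question']}\n"
            for i, choice in enumerate(item["choices"]):
                prefix += f"{chr(65 + i)}. {choice}\n"
            prefix += f"Answer: {chr(65 + item['answer'])}\n\n"
        return prefix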

    def evaluate_hellaswag(self) -> float:
        """
        Evaluate on HellaSwag (commonsense reasoning).

        Returns:
            Accuracy on HellaSwag
        """
        print("\n=== Evaluating HellaSwag ===")
        dataset = load_dataset("Rowan/hellaswag", split="validation")
        # Slicing a Dataset with [:1000] yields a dict of columns, not rows;
        # use select() to iterate over the first 1000 examples.
        dataset = dataset.select(range(min(1000, len(dataset))))

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="HellaSwag"):
            context = item["ctx"]
            endings = item["endings"]
            label = int(item["label"])

            # Pick the ending with the highest average log-likelihood. Note
            # that the mean loss also covers the shared context tokens.
            best_score = float("-inf")
            best_idx = -1

            for idx, ending in enumerate(endings):
                full_text = context + " " + ending
                inputs = self.tokenizer(full_text, return_tensors="pt").to(self.device)

                with torch.no_grad():
                    outputs = self.model(**inputs, labels=inputs["input_ids"])
                    score = -outputs.loss.item()

                if score > best_score:
                    best_score = score
                    best_idx = idx

            if best_idx == label:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"HellaSwag Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy
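
    # Hedged alternative (not used above): score only the ending tokens by
    # masking the context positions with -100, so the shared context does not
    # dilute the per-token average. This helper is a sketch, not part of the
    # original script; it assumes the context tokenization is a prefix of the
    # full tokenization, which holds for typical BPE tokenizers when the
    # ending starts with a space.
    def _ending_only_score(self, context: str, ending: str) -> float:
        """Average log-likelihood of `ending` given `context` (sketch)."""
        ctx_ids = self.tokenizer(context, return_tensors="pt").input_ids
        full = self.tokenizer(context + " " + ending, return_tensors="pt").input_ids.to(self.device)
        labels = full.clone()
        labels[:, : ctx_ids.shape[1]] = -100  # ignore context tokens in the loss
        with torch.no_grad():
            loss = self.model(input_ids=full, labels=labels).loss
        return -loss.item()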

    def evaluate_arc(self, challenge: bool = True) -> float:
        """
        Evaluate on ARC (AI2 Reasoning Challenge).

        Args:
            challenge: Use ARC-Challenge (harder) vs ARC-Easy

        Returns:
            Accuracy on ARC
        """
        subset = "ARC-Challenge" if challenge else "ARC-Easy"
        print(f"\n=== Evaluating {subset} ===")

        dataset = load_dataset("ai2_arc", subset, split="test")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc=subset):
            question = item["question"]
            choices = item["choices"]["text"]
            labels = item["choices"]["label"]
            answer_key = item["answerKey"]

            prompt = f"Question: {question}\n"
            for label, choice in zip(labels, choices):
                prompt += f"{label}. {choice}\n"
            prompt += "Answer:"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=5,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            ).strip()

            # ARC answer keys are single characters ("A"-"E" or "1"-"4"), so
            # compare against the first character of the response.
            predicted = response[0] if response else ""

            if predicted.upper() == answer_key.upper():
                correct += 1

            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"{subset} Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_gsm8k(self) -> float:
        """
        Evaluate on GSM8K (grade school math).

        Returns:
            Accuracy on GSM8K
        """
        print("\n=== Evaluating GSM8K ===")
        dataset = load_dataset("gsm8k", "main", split="test")
        # Slicing with [:500] returns a dict of columns; use select() instead.
        dataset = dataset.select(range(min(500, len(dataset))))

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="GSM8K"):
            question = item["question"]
            # The reference answer follows "####" and may contain commas
            # (e.g. "70,000"), which float() cannot parse.
            answer = item["answer"].split("####")[-1].strip().replace(",", "")

            prompt = f"Question: {question}\nLet's solve this step by step:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=400,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            )

            # Take the last number in the response as the predicted answer.
            numbers = re.findall(r"-?\d+\.?\d*", response)
            if numbers:
                try:
                    if float(numbers[-1]) == float(answer):
                        correct += 1
                except ValueError:
                    pass

            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"GSM8K Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy
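
    # Hedged helper sketch: "last number wins" extraction is brittle when the
    # model formats answers with currency symbols or thousands separators.
    # This normalizer is an assumption, not part of the original script.
    @staticmethod
    def _normalize_number(text: str) -> str:
        """Normalize a numeric string like '$1,234.50' to '1234.50'."""
        return text.replace("$", "").replace(",", "").rstrip(".")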

    def evaluate_truthfulqa(self) -> float:
        """
        Evaluate on TruthfulQA (truthfulness and informativeness).

        Note: this is a lenient generation-based heuristic over the
        mc2_targets, not the standard MC2 metric (which is computed from
        log-probabilities over the answer choices).

        Returns:
            Heuristic accuracy on mc2_targets
        """
        print("\n=== Evaluating TruthfulQA ===")
        dataset = load_dataset("truthful_qa", "multiple_choice", split="validation")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="TruthfulQA"):
            question = item["question"]
            mc2_targets = item["mc2_targets"]
            choices = mc2_targets["choices"]
            labels = mc2_targets["labels"]

            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{i + 1}. {choice}\n"
            prompt += "Select all correct answers:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:],
                skip_special_tokens=True,
            )

            # Credit the example if the response mentions any correct choice,
            # either verbatim or by its whole-word index (a bare substring
            # check on the index would match "1" inside "10").
            response_lower = response.lower()
            found_correct = False
            for idx, (choice, label) in enumerate(zip(choices, labels)):
                if label == 1 and (
                    choice.lower() in response_lower
                    or re.search(rf"\b{idx + 1}\b", response)
                ):
                    found_correct = True
                    break

            if found_correct:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"TruthfulQA Heuristic Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy
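
    # Hedged sketch of the standard MC2 metric (not wired into the heuristic
    # above): score each choice by its total log-likelihood given the
    # question, softmax across choices, and sum the probability mass on the
    # correct ones. The "Q:/A:" prompt format is an assumption, not part of
    # the original script.
    def _truthfulqa_mc2_score(self, question: str, choices: List[str], labels: List[int]) -> float:
        """Probability mass assigned to the correct choices (sketch)."""
        total_logprobs = []
        for choice in choices:
            text = f"Q: {question}\nA: {choice}"
            inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
            with torch.no_grad():
                loss = self.model(**inputs, labels=inputs["input_ids"]).loss
            # loss is the mean over seq_len - 1 predicted tokens; recover the
            # total. The shared question prefix contributes the same amount to
            # every choice, so it cancels in the softmax below.
            n_predicted = inputs["input_ids"].shape[1] - 1
            total_logprobs.append(-loss.item() * n_predicted)
        probs = torch.softmax(torch.tensor(total_logprobs), dim=0)
        return sum(p.item() for p, l in zip(probs, labels) if l == 1)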

    def run_all_benchmarks(self) -> Dict[str, float]:
        """
        Run all available benchmarks.

        Returns:
            Dictionary of benchmark results
        """
        benchmarks = {
            "MMLU": self.evaluate_mmlu,
            "HellaSwag": self.evaluate_hellaswag,
            "ARC-Challenge": lambda: self.evaluate_arc(challenge=True),
            "GSM8K": self.evaluate_gsm8k,
            "TruthfulQA": self.evaluate_truthfulqa,
        }

        results = {}
        for name, run in benchmarks.items():
            try:
                results[name] = run()
            except Exception as e:
                print(f"{name} evaluation failed: {e}")
                results[name] = 0.0

        return results
|
|
|
|
def main():
    parser = argparse.ArgumentParser(description="Benchmark Helion-V2")
    parser.add_argument(
        "--model",
        type=str,
        default="DeepXR/Helion-V2",
        help="Model name or path",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device to use",
    )
    parser.add_argument(
        "--benchmark",
        type=str,
        choices=["all", "mmlu", "hellaswag", "arc", "gsm8k", "truthfulqa"],
        default="all",
        help="Benchmark to run",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="benchmark_results.json",
        help="Output file for results",
    )

    args = parser.parse_args()

    evaluator = BenchmarkEvaluator(args.model, args.device)

    if args.benchmark == "all":
        results = evaluator.run_all_benchmarks()
    else:
        benchmark_map = {
            "mmlu": evaluator.evaluate_mmlu,
            "hellaswag": evaluator.evaluate_hellaswag,
            "arc": evaluator.evaluate_arc,
            "gsm8k": evaluator.evaluate_gsm8k,
            "truthfulqa": evaluator.evaluate_truthfulqa,
        }
        results = {args.benchmark: benchmark_map[args.benchmark]()}

    with open(args.output, "w") as f:
        json.dump(results, f, indent=2)

    print("\n=== Final Results ===")
    for benchmark, score in results.items():
        print(f"{benchmark}: {score:.2%}")
    print(f"\nResults saved to {args.output}")
|
|
|
|
if __name__ == "__main__":
    main()