"""
Benchmark script for evaluating Helion-V2 on standard benchmarks.

Includes MMLU, HellaSwag, ARC, TruthfulQA, and GSM8K (HumanEval support is
not yet implemented).
"""

import argparse
import json
import re
from typing import Dict

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM


class BenchmarkEvaluator:
    """Evaluator for running benchmarks on Helion-V2."""

    def __init__(self, model_name: str, device: str = "cuda"):
        """Initialize evaluator with model."""
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Some tokenizers ship without a pad token; fall back to EOS so that
        # generate() does not warn or fail when padding is needed.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Weights are loaded in float16 and placed on the requested device
        # (a CUDA GPU by default).
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
        )
        self.model.eval()
        self.device = device

    def evaluate_mmlu(self, num_shots: int = 5) -> float:
        """
        Evaluate on MMLU (Massive Multitask Language Understanding).

        Args:
            num_shots: Number of few-shot examples (reserved; the current
                prompt is zero-shot and does not use this argument)

        Returns:
            Accuracy over the first 1,000 pooled test examples
        """
        print("\n=== Evaluating MMLU ===")
        dataset = load_dataset("cais/mmlu", "all", split="test")

        correct = 0
        total = 0
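
        # Greedily generate a single token per question and compare it with the
        # gold choice letter (A-D); evaluation stops after 1,000 examples.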
        for item in tqdm(dataset, desc="MMLU"):
            question = item["question"]
            choices = item["choices"]
            answer = item["answer"]

            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{chr(65 + i)}. {choice}\n"
            prompt += "Answer:"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=1,
                    do_sample=False,
                )

            # Decode only the newly generated token.
            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            ).strip()

            if response.upper() in ['A', 'B', 'C', 'D']:
                predicted_idx = ord(response.upper()) - ord('A')
                if predicted_idx == answer:
                    correct += 1

            total += 1

            if total >= 1000:
                break

        accuracy = correct / total if total > 0 else 0
        print(f"MMLU Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_hellaswag(self) -> float:
        """
        Evaluate on HellaSwag (commonsense reasoning).

        Returns:
            Accuracy on the first 1,000 validation examples
        """
        print("\n=== Evaluating HellaSwag ===")
        dataset = load_dataset("Rowan/hellaswag", split="validation")

        correct = 0
        total = 0
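
        # Rank candidate endings by the model's negated LM loss (average per-token
        # log-likelihood) over context + ending, and predict the highest-scoring one.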
        # Dataset slicing returns a dict of columns, so use select() to iterate rows.
        for item in tqdm(dataset.select(range(min(1000, len(dataset)))), desc="HellaSwag"):
            context = item["ctx"]
            endings = item["endings"]
            label = int(item["label"])

            best_score = float('-inf')
            best_idx = -1

            for idx, ending in enumerate(endings):
                full_text = context + " " + ending
                inputs = self.tokenizer(full_text, return_tensors="pt").to(self.device)

                with torch.no_grad():
                    outputs = self.model(**inputs, labels=inputs["input_ids"])
                    score = -outputs.loss.item()

                if score > best_score:
                    best_score = score
                    best_idx = idx

            if best_idx == label:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"HellaSwag Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_arc(self, challenge: bool = True) -> float:
        """
        Evaluate on ARC (AI2 Reasoning Challenge).

        Args:
            challenge: Use ARC-Challenge (harder) vs ARC-Easy

        Returns:
            Accuracy on ARC
        """
        subset = "ARC-Challenge" if challenge else "ARC-Easy"
        print(f"\n=== Evaluating {subset} ===")

        dataset = load_dataset("ai2_arc", subset, split="test")

        correct = 0
        total = 0
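
        # Choices are listed with the dataset's own labels (letters A-E or digits
        # 1-4), so the first character of the generated answer is compared with
        # answerKey directly.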
        for item in tqdm(dataset, desc=subset):
            question = item["question"]
            choices = item["choices"]["text"]
            labels = item["choices"]["label"]
            answer_key = item["answerKey"]

            prompt = f"Question: {question}\n"
            for label, choice in zip(labels, choices):
                prompt += f"{label}. {choice}\n"
            prompt += "Answer:"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=5,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            ).strip()

            predicted = response[0] if response else ""

            if predicted.upper() == answer_key.upper():
                correct += 1

            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"{subset} Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_gsm8k(self) -> float:
        """
        Evaluate on GSM8K (grade school math).

        Returns:
            Accuracy on the first 500 test examples
        """
        print("\n=== Evaluating GSM8K ===")
        dataset = load_dataset("gsm8k", "main", split="test")

        correct = 0
        total = 0
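
        # Generate a step-by-step solution and take the last number in the output
        # as the predicted final answer; the gold answer follows the "####" marker.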
        for item in tqdm(dataset.select(range(min(500, len(dataset)))), desc="GSM8K"):
            question = item["question"]
            # Gold answers may contain thousands separators (e.g. "1,080").
            answer = item["answer"].split("####")[-1].strip().replace(",", "")

            prompt = f"Question: {question}\nLet's solve this step by step:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=400,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            )

            numbers = re.findall(r'-?\d[\d,]*\.?\d*', response)
            if numbers:
                predicted = numbers[-1].replace(",", "")
                try:
                    if float(predicted) == float(answer):
                        correct += 1
                except ValueError:
                    pass

            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"GSM8K Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_truthfulqa(self) -> float:
        """
        Evaluate on TruthfulQA (truthfulness and informativeness).

        Returns:
            Accuracy under a generation-based approximation of the MC2 metric
        """
        print("\n=== Evaluating TruthfulQA ===")
        dataset = load_dataset("truthful_qa", "multiple_choice", split="validation")

        correct = 0
        total = 0
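
        # Note: this is a rough, generation-based proxy for MC2; a response counts
        # as correct if it mentions any labelled-correct choice by text or index.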
        for item in tqdm(dataset, desc="TruthfulQA"):
            question = item["question"]
            mc2_targets = item["mc2_targets"]
            choices = mc2_targets["choices"]
            labels = mc2_targets["labels"]

            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{i + 1}. {choice}\n"
            prompt += "Select all correct answers:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            )

            response_lower = response.lower()
            found_correct = False
            for idx, (choice, label) in enumerate(zip(choices, labels)):
                if label == 1 and (choice.lower() in response_lower or str(idx + 1) in response):
                    found_correct = True
                    break

            if found_correct:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"TruthfulQA Accuracy (approx. MC2): {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def run_all_benchmarks(self) -> Dict[str, float]:
        """
        Run all available benchmarks.

        Returns:
            Dictionary of benchmark results
        """
        results = {}
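
        # Each benchmark runs inside try/except so a single failure (e.g. a dataset
        # download error) does not abort the remaining evaluations.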
        benchmarks = [
            ("MMLU", self.evaluate_mmlu),
            ("HellaSwag", self.evaluate_hellaswag),
            ("ARC-Challenge", lambda: self.evaluate_arc(challenge=True)),
            ("GSM8K", self.evaluate_gsm8k),
            ("TruthfulQA", self.evaluate_truthfulqa),
        ]
        for name, run in benchmarks:
            try:
                results[name] = run()
            except Exception as e:
                print(f"{name} evaluation failed: {e}")
                results[name] = 0.0

        return results


def main():
    parser = argparse.ArgumentParser(description="Benchmark Helion-V2")
    parser.add_argument(
        "--model",
        type=str,
        default="DeepXR/Helion-V2",
        help="Model name or path"
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device to use"
    )
    parser.add_argument(
        "--benchmark",
        type=str,
        choices=["all", "mmlu", "hellaswag", "arc", "gsm8k", "truthfulqa"],
        default="all",
        help="Benchmark to run"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="benchmark_results.json",
        help="Output file for results"
    )

    args = parser.parse_args()

    evaluator = BenchmarkEvaluator(args.model, args.device)

    if args.benchmark == "all":
        results = evaluator.run_all_benchmarks()
    else:
        benchmark_map = {
            "mmlu": evaluator.evaluate_mmlu,
            "hellaswag": evaluator.evaluate_hellaswag,
            "arc": evaluator.evaluate_arc,
            "gsm8k": evaluator.evaluate_gsm8k,
            "truthfulqa": evaluator.evaluate_truthfulqa,
        }
        score = benchmark_map[args.benchmark]()
        results = {args.benchmark: score}

    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)

    print("\n=== Final Results ===")
    for benchmark, score in results.items():
        print(f"{benchmark}: {score:.2%}")
    print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()