"""
Benchmark script for evaluating Helion-V2 on standard benchmarks.

Includes MMLU, HellaSwag, ARC, TruthfulQA, and GSM8K (HumanEval support is
not yet implemented).
"""

import argparse
import json
import re
from typing import Dict

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM


class BenchmarkEvaluator:
    """Evaluator for running benchmarks on Helion-V2."""

    def __init__(self, model_name: str, device: str = "cuda"):
        """Initialize evaluator with model."""
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Some tokenizers ship without a pad token; fall back to EOS so that
        # generate() does not warn or fail when padding is needed.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        # Weights are loaded in float16 and placed on the requested device
        # (a CUDA GPU by default).
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
        )
        self.model.eval()
        self.device = device

    def evaluate_mmlu(self, num_shots: int = 5) -> float:
        """
        Evaluate on MMLU (Massive Multitask Language Understanding).

        Args:
            num_shots: Number of few-shot examples (reserved; the current
                prompt is zero-shot and does not use this argument)

        Returns:
            Accuracy over the first 1,000 pooled test examples
        """
        print("\n=== Evaluating MMLU ===")
        dataset = load_dataset("cais/mmlu", "all", split="test")

        correct = 0
        total = 0
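
        # Greedily generate a single token per question and compare it with the
        # gold choice letter (A-D); evaluation stops after 1,000 examples.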
        for item in tqdm(dataset, desc="MMLU"):
            question = item["question"]
            choices = item["choices"]
            answer = item["answer"]

            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{chr(65 + i)}. {choice}\n"
            prompt += "Answer:"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=1,
                    do_sample=False,
                )

            # Decode only the newly generated token.
            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            ).strip()

            if response.upper() in ['A', 'B', 'C', 'D']:
                predicted_idx = ord(response.upper()) - ord('A')
                if predicted_idx == answer:
                    correct += 1

            total += 1

            if total >= 1000:
                break

        accuracy = correct / total if total > 0 else 0
        print(f"MMLU Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_hellaswag(self) -> float:
        """
        Evaluate on HellaSwag (commonsense reasoning).

        Returns:
            Accuracy on the first 1,000 validation examples
        """
        print("\n=== Evaluating HellaSwag ===")
        dataset = load_dataset("Rowan/hellaswag", split="validation")

        correct = 0
        total = 0
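
        # Rank candidate endings by the model's negated LM loss (average per-token
        # log-likelihood) over context + ending, and predict the highest-scoring one.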
        # Dataset slicing returns a dict of columns, so use select() to iterate rows.
        for item in tqdm(dataset.select(range(min(1000, len(dataset)))), desc="HellaSwag"):
            context = item["ctx"]
            endings = item["endings"]
            label = int(item["label"])

            best_score = float('-inf')
            best_idx = -1

            for idx, ending in enumerate(endings):
                full_text = context + " " + ending
                inputs = self.tokenizer(full_text, return_tensors="pt").to(self.device)

                with torch.no_grad():
                    outputs = self.model(**inputs, labels=inputs["input_ids"])
                    score = -outputs.loss.item()

                if score > best_score:
                    best_score = score
                    best_idx = idx

            if best_idx == label:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"HellaSwag Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_arc(self, challenge: bool = True) -> float:
        """
        Evaluate on ARC (AI2 Reasoning Challenge).

        Args:
            challenge: Use ARC-Challenge (harder) vs ARC-Easy

        Returns:
            Accuracy on ARC
        """
        subset = "ARC-Challenge" if challenge else "ARC-Easy"
        print(f"\n=== Evaluating {subset} ===")

        dataset = load_dataset("ai2_arc", subset, split="test")

        correct = 0
        total = 0
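
        # Choices are listed with the dataset's own labels (letters A-E or digits
        # 1-4), so the first character of the generated answer is compared with
        # answerKey directly.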
        for item in tqdm(dataset, desc=subset):
            question = item["question"]
            choices = item["choices"]["text"]
            labels = item["choices"]["label"]
            answer_key = item["answerKey"]

            prompt = f"Question: {question}\n"
            for label, choice in zip(labels, choices):
                prompt += f"{label}. {choice}\n"
            prompt += "Answer:"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=5,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            ).strip()

            predicted = response[0] if response else ""

            if predicted.upper() == answer_key.upper():
                correct += 1

            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"{subset} Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_gsm8k(self) -> float:
        """
        Evaluate on GSM8K (grade school math).

        Returns:
            Accuracy on the first 500 test examples
        """
        print("\n=== Evaluating GSM8K ===")
        dataset = load_dataset("gsm8k", "main", split="test")

        correct = 0
        total = 0
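
        # Generate a step-by-step solution and take the last number in the output
        # as the predicted final answer; the gold answer follows the "####" marker.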
        for item in tqdm(dataset.select(range(min(500, len(dataset)))), desc="GSM8K"):
            question = item["question"]
            # Gold answers may contain thousands separators (e.g. "1,080").
            answer = item["answer"].split("####")[-1].strip().replace(",", "")

            prompt = f"Question: {question}\nLet's solve this step by step:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=400,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            )

            numbers = re.findall(r'-?\d[\d,]*\.?\d*', response)
            if numbers:
                predicted = numbers[-1].replace(",", "")
                try:
                    if float(predicted) == float(answer):
                        correct += 1
                except ValueError:
                    pass

            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"GSM8K Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_truthfulqa(self) -> float:
        """
        Evaluate on TruthfulQA (truthfulness and informativeness).

        Returns:
            Accuracy under a generation-based approximation of the MC2 metric
        """
        print("\n=== Evaluating TruthfulQA ===")
        dataset = load_dataset("truthful_qa", "multiple_choice", split="validation")

        correct = 0
        total = 0
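
        # Note: this is a rough, generation-based proxy for MC2; a response counts
        # as correct if it mentions any labelled-correct choice by text or index.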
        for item in tqdm(dataset, desc="TruthfulQA"):
            question = item["question"]
            mc2_targets = item["mc2_targets"]
            choices = mc2_targets["choices"]
            labels = mc2_targets["labels"]

            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{i + 1}. {choice}\n"
            prompt += "Select all correct answers:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            )

            response_lower = response.lower()
            found_correct = False
            for idx, (choice, label) in enumerate(zip(choices, labels)):
                if label == 1 and (choice.lower() in response_lower or str(idx + 1) in response):
                    found_correct = True
                    break

            if found_correct:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"TruthfulQA Accuracy (approx. MC2): {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def run_all_benchmarks(self) -> Dict[str, float]:
        """
        Run all available benchmarks.

        Returns:
            Dictionary of benchmark results
        """
        results = {}
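
        # Each benchmark runs inside try/except so a single failure (e.g. a dataset
        # download error) does not abort the remaining evaluations.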
        benchmarks = [
            ("MMLU", self.evaluate_mmlu),
            ("HellaSwag", self.evaluate_hellaswag),
            ("ARC-Challenge", lambda: self.evaluate_arc(challenge=True)),
            ("GSM8K", self.evaluate_gsm8k),
            ("TruthfulQA", self.evaluate_truthfulqa),
        ]
        for name, run in benchmarks:
            try:
                results[name] = run()
            except Exception as e:
                print(f"{name} evaluation failed: {e}")
                results[name] = 0.0

        return results


def main():
    parser = argparse.ArgumentParser(description="Benchmark Helion-V2")
    parser.add_argument(
        "--model",
        type=str,
        default="DeepXR/Helion-V2",
        help="Model name or path"
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device to use"
    )
    parser.add_argument(
        "--benchmark",
        type=str,
        choices=["all", "mmlu", "hellaswag", "arc", "gsm8k", "truthfulqa"],
        default="all",
        help="Benchmark to run"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="benchmark_results.json",
        help="Output file for results"
    )

    args = parser.parse_args()

    evaluator = BenchmarkEvaluator(args.model, args.device)

    if args.benchmark == "all":
        results = evaluator.run_all_benchmarks()
    else:
        benchmark_map = {
            "mmlu": evaluator.evaluate_mmlu,
            "hellaswag": evaluator.evaluate_hellaswag,
            "arc": evaluator.evaluate_arc,
            "gsm8k": evaluator.evaluate_gsm8k,
            "truthfulqa": evaluator.evaluate_truthfulqa,
        }
        score = benchmark_map[args.benchmark]()
        results = {args.benchmark: score}

    with open(args.output, 'w') as f:
        json.dump(results, f, indent=2)

    print("\n=== Final Results ===")
    for benchmark, score in results.items():
        print(f"{benchmark}: {score:.2%}")
    print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()