Create benchmark.py
benchmark.py +377 -0
benchmark.py
ADDED
"""
Benchmark script for evaluating Helion-V2 on standard benchmarks.
Includes MMLU, HellaSwag, ARC, TruthfulQA, and GSM8K.
"""

import argparse
import json
import re
from typing import Dict

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM


class BenchmarkEvaluator:
    """Evaluator for running benchmarks on Helion-V2."""

    def __init__(self, model_name: str, device: str = "cuda"):
        """Initialize evaluator with model."""
        print(f"Loading model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map=device,
        )
        self.model.eval()
        self.device = device

    def evaluate_mmlu(self, num_shots: int = 5) -> float:
        """
        Evaluate on MMLU (Massive Multitask Language Understanding).

        Args:
            num_shots: Number of examples for few-shot learning (accepted for
                API compatibility; the prompt below is zero-shot, see the
                sketch after this method)

        Returns:
            Average accuracy across all subjects
        """
        print("\n=== Evaluating MMLU ===")
        dataset = load_dataset("cais/mmlu", "all", split="test")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="MMLU"):
            question = item["question"]
            choices = item["choices"]
            answer = item["answer"]  # integer index of the correct choice

            # Format the question as a lettered multiple-choice prompt
            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{chr(65 + i)}. {choice}\n"
            prompt += "Answer:"

            # Greedy decoding of a single answer token (do_sample=False is
            # already greedy; no temperature needed)
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=1,
                    do_sample=False,
                )

            response = self.tokenizer.decode(outputs[0][-1:], skip_special_tokens=True).strip()

            # Count as correct only if the model emitted a valid option letter
            if response.upper() in ["A", "B", "C", "D"]:
                predicted_idx = ord(response.upper()) - ord("A")
                if predicted_idx == answer:
                    correct += 1

            total += 1

            if total >= 1000:  # Cap the number of examples to keep runtime manageable
                break

        accuracy = correct / total if total > 0 else 0
        print(f"MMLU Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

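    def _build_mmlu_fewshot_prefix(self, num_shots: int) -> str:
        # Illustrative sketch, not part of the original file: evaluate_mmlu
        # accepts num_shots but builds a zero-shot prompt. A k-shot prefix in
        # the same format could be drawn from MMLU's "dev" split (assuming the
        # conventional use of that split for in-context examples) and
        # prepended to each test prompt.
        dev = load_dataset("cais/mmlu", "all", split="dev")
        prefix = ""
        for item in dev.select(range(num_shots)):
            prefix += f"Question: {item['question']}\n"
            for i, choice in enumerate(item["choices"]):
                prefix += f"{chr(65 + i)}. {choice}\n"
            prefix += f"Answer: {chr(65 + item['answer'])}\n\n"
        return prefix
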
    def evaluate_hellaswag(self) -> float:
        """
        Evaluate on HellaSwag (commonsense reasoning).

        Returns:
            Accuracy on HellaSwag
        """
        print("\n=== Evaluating HellaSwag ===")
        dataset = load_dataset("Rowan/hellaswag", split="validation")

        correct = 0
        total = 0

        # Slicing a Dataset with [:1000] returns a dict of columns, not rows;
        # use .select() to iterate over the first 1000 examples.
        for item in tqdm(dataset.select(range(min(1000, len(dataset)))), desc="HellaSwag"):
            context = item["ctx"]
            endings = item["endings"]
            label = int(item["label"])

            # Pick the ending whose full sequence has the lowest average loss.
            # This is a simplification: the standard metric scores only the
            # ending tokens, length-normalized (see the sketch after this
            # method).
            best_score = float("-inf")
            best_idx = -1

            for idx, ending in enumerate(endings):
                full_text = context + " " + ending
                inputs = self.tokenizer(full_text, return_tensors="pt").to(self.device)

                with torch.no_grad():
                    outputs = self.model(**inputs, labels=inputs["input_ids"])
                    score = -outputs.loss.item()

                if score > best_score:
                    best_score = score
                    best_idx = idx

            if best_idx == label:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"HellaSwag Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

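    def _ending_logprob(self, context: str, ending: str) -> float:
        # Illustrative sketch, not part of the original file: the standard
        # HellaSwag protocol scores only the ending tokens, normalized by
        # length, instead of averaging the loss over context + ending as
        # evaluate_hellaswag does. The context/ending token split below is
        # approximate, since tokenizing the context alone may differ slightly
        # from its tokenization inside the full text.
        ctx_len = self.tokenizer(context, return_tensors="pt")["input_ids"].shape[1]
        full_ids = self.tokenizer(context + " " + ending, return_tensors="pt")["input_ids"].to(self.device)
        with torch.no_grad():
            logits = self.model(full_ids).logits
        # Log-probability of each token given its prefix
        log_probs = torch.log_softmax(logits[0, :-1], dim=-1)
        targets = full_ids[0, 1:]
        token_lp = log_probs[torch.arange(targets.shape[0], device=targets.device), targets]
        ending_lp = token_lp[ctx_len - 1:]  # positions that predict ending tokens
        return (ending_lp.sum() / max(ending_lp.numel(), 1)).item()
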
    def evaluate_arc(self, challenge: bool = True) -> float:
        """
        Evaluate on ARC (AI2 Reasoning Challenge).

        Args:
            challenge: Use ARC-Challenge (harder) vs ARC-Easy

        Returns:
            Accuracy on ARC
        """
        subset = "ARC-Challenge" if challenge else "ARC-Easy"
        print(f"\n=== Evaluating {subset} ===")

        dataset = load_dataset("ai2_arc", subset, split="test")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc=subset):
            question = item["question"]
            choices = item["choices"]["text"]
            labels = item["choices"]["label"]  # option labels, e.g. A-D or 1-4
            answer_key = item["answerKey"]

            # Format the prompt using the dataset's own option labels
            prompt = f"Question: {question}\n"
            for label, choice in zip(labels, choices):
                prompt += f"{label}. {choice}\n"
            prompt += "Answer:"

            # Greedy decoding of a short answer
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=5,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            ).strip()

            # The first character of the response is taken as the predicted label
            predicted = response[0] if response else ""

            if predicted.upper() == answer_key.upper():
                correct += 1

            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"{subset} Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_gsm8k(self) -> float:
        """
        Evaluate on GSM8K (grade school math).

        Returns:
            Accuracy on GSM8K
        """
        print("\n=== Evaluating GSM8K ===")
        dataset = load_dataset("gsm8k", "main", split="test")

        correct = 0
        total = 0

        # Evaluate a 500-example sample for speed; use .select() because
        # slicing a Dataset returns columns rather than rows.
        for item in tqdm(dataset.select(range(min(500, len(dataset)))), desc="GSM8K"):
            question = item["question"]
            # The gold answer follows "####"; strip thousands separators so
            # float() can parse values like "1,000"
            answer = item["answer"].split("####")[-1].strip().replace(",", "")

            # Chain-of-thought style prompt
            prompt = f"Question: {question}\nLet's solve this step by step:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=400,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            )

            # Take the last number in the response as the predicted answer
            numbers = re.findall(r"-?\d+\.?\d*", response)
            if numbers:
                predicted = numbers[-1]
                try:
                    if float(predicted) == float(answer):
                        correct += 1
                except ValueError:
                    pass

            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"GSM8K Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

    def evaluate_truthfulqa(self) -> float:
        """
        Evaluate on TruthfulQA (truthfulness and informativeness).

        Returns:
            Multiple-choice accuracy (a simplified proxy for the MC2 metric)
        """
        print("\n=== Evaluating TruthfulQA ===")
        dataset = load_dataset("truthful_qa", "multiple_choice", split="validation")

        correct = 0
        total = 0

        for item in tqdm(dataset, desc="TruthfulQA"):
            question = item["question"]
            mc2_targets = item["mc2_targets"]
            choices = mc2_targets["choices"]
            labels = mc2_targets["labels"]  # 1 = true answer, 0 = false answer

            # Format the prompt with numbered options
            prompt = f"Question: {question}\n"
            for i, choice in enumerate(choices):
                prompt += f"{i + 1}. {choice}\n"
            prompt += "Select all correct answers:\n"

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=100,
                    do_sample=False,
                )

            response = self.tokenizer.decode(
                outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
            )

            # Simple scoring: count the example as correct if any true answer
            # is mentioned, by text or by option number. This is not the
            # official log-probability-based MC2 scoring (see the sketch after
            # this method).
            response_lower = response.lower()
            found_correct = False
            for idx, (choice, label) in enumerate(zip(choices, labels)):
                if label == 1 and (choice.lower() in response_lower or str(idx + 1) in response):
                    found_correct = True
                    break

            if found_correct:
                correct += 1
            total += 1

        accuracy = correct / total if total > 0 else 0
        print(f"TruthfulQA Accuracy: {accuracy:.2%} ({correct}/{total})")
        return accuracy

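    def _mc2_score(self, question: str, choices: list, labels: list) -> float:
        # Illustrative sketch, not part of the original file: the official MC2
        # metric is the probability mass assigned to the true answers,
        # normalized over all answer choices and computed from per-answer
        # log-likelihoods rather than from generated text. Scoring each full
        # Q/A sequence with the average token loss is itself an approximation.
        log_likelihoods = []
        for choice in choices:
            text = f"Q: {question}\nA: {choice}"
            inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
            with torch.no_grad():
                loss = self.model(**inputs, labels=inputs["input_ids"]).loss
            log_likelihoods.append(-loss.item())
        probs = torch.softmax(torch.tensor(log_likelihoods), dim=0)
        return sum(p.item() for p, l in zip(probs, labels) if l == 1)
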
    def run_all_benchmarks(self) -> Dict[str, float]:
        """
        Run all available benchmarks.

        Returns:
            Dictionary of benchmark results
        """
        results = {}

        try:
            results["MMLU"] = self.evaluate_mmlu()
        except Exception as e:
            print(f"MMLU evaluation failed: {e}")
            results["MMLU"] = 0.0

        try:
            results["HellaSwag"] = self.evaluate_hellaswag()
        except Exception as e:
            print(f"HellaSwag evaluation failed: {e}")
            results["HellaSwag"] = 0.0

        try:
            results["ARC-Challenge"] = self.evaluate_arc(challenge=True)
        except Exception as e:
            print(f"ARC-Challenge evaluation failed: {e}")
            results["ARC-Challenge"] = 0.0

        try:
            results["GSM8K"] = self.evaluate_gsm8k()
        except Exception as e:
            print(f"GSM8K evaluation failed: {e}")
            results["GSM8K"] = 0.0

        try:
            results["TruthfulQA"] = self.evaluate_truthfulqa()
        except Exception as e:
            print(f"TruthfulQA evaluation failed: {e}")
            results["TruthfulQA"] = 0.0

        return results


def main():
    parser = argparse.ArgumentParser(description="Benchmark Helion-V2")
    parser.add_argument(
        "--model",
        type=str,
        default="DeepXR/Helion-V2",
        help="Model name or path",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda",
        help="Device to use",
    )
    parser.add_argument(
        "--benchmark",
        type=str,
        choices=["all", "mmlu", "hellaswag", "arc", "gsm8k", "truthfulqa"],
        default="all",
        help="Benchmark to run",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="benchmark_results.json",
        help="Output file for results",
    )

    args = parser.parse_args()

    evaluator = BenchmarkEvaluator(args.model, args.device)

    if args.benchmark == "all":
        results = evaluator.run_all_benchmarks()
    else:
        benchmark_map = {
            "mmlu": evaluator.evaluate_mmlu,
            "hellaswag": evaluator.evaluate_hellaswag,
            "arc": evaluator.evaluate_arc,
            "gsm8k": evaluator.evaluate_gsm8k,
            "truthfulqa": evaluator.evaluate_truthfulqa,
        }
        score = benchmark_map[args.benchmark]()
        results = {args.benchmark: score}

    # Save results
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2)

    print("\n=== Final Results ===")
    for benchmark, score in results.items():
        print(f"{benchmark}: {score:.2%}")
    print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()
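
Example usage, based on the argparse flags defined above (scores are printed and written as JSON to --output):

python benchmark.py                                   # run every benchmark with the defaults
python benchmark.py --benchmark gsm8k                 # run a single benchmark
python benchmark.py --model DeepXR/Helion-V2 --device cuda --output results.json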