Trouter-Library committed on
Commit f1a0ba2 · verified · 1 Parent(s): c1ead59

Create deployment.py

Files changed (1)
  1. deployment.py +345 -0
deployment.py ADDED
@@ -0,0 +1,345 @@
"""
Helion-V1 Production Deployment Script
Optimized for serving with vLLM, TGI, or custom inference servers
"""

import os
import json
import logging
from typing import Dict, List, Optional
from dataclasses import dataclass
import asyncio

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

@dataclass
class DeploymentConfig:
    """Configuration for model deployment."""
    model_name: str = "DeepXR/Helion-V1"
    tensor_parallel_size: int = 1
    max_model_len: int = 4096
    max_num_seqs: int = 256
    gpu_memory_utilization: float = 0.90
    trust_remote_code: bool = True
    quantization: Optional[str] = None  # "awq", "gptq", or None
    dtype: str = "bfloat16"
    enforce_eager: bool = False

    # Safety settings
    max_tokens: int = 2048
    temperature: float = 0.7
    top_p: float = 0.9
    frequency_penalty: float = 0.1
    presence_penalty: float = 0.1

    # Rate limiting
    rate_limit_requests_per_minute: int = 60
    rate_limit_tokens_per_minute: int = 90000

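# Example (sketch, illustrative values only): overriding the defaults for a
# two-GPU, AWQ-quantized deployment. AWQ kernels generally expect float16,
# and the memory fraction should match the hardware actually available.
#
#   config = DeploymentConfig(
#       tensor_parallel_size=2,
#       quantization="awq",
#       dtype="float16",
#       gpu_memory_utilization=0.85,
#   )
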
class HelionDeployment:
    """
    Production deployment handler for Helion-V1.
    Supports vLLM, Text Generation Inference, and custom servers.
    """

    def __init__(self, config: DeploymentConfig):
        self.config = config
        self.model = None
        self.tokenizer = None

    def deploy_vllm(self):
        """Deploy using vLLM for high-throughput inference."""
        try:
            from vllm import LLM, SamplingParams

            logger.info("Initializing vLLM engine...")

            self.model = LLM(
                model=self.config.model_name,
                tensor_parallel_size=self.config.tensor_parallel_size,
                max_model_len=self.config.max_model_len,
                max_num_seqs=self.config.max_num_seqs,
                gpu_memory_utilization=self.config.gpu_memory_utilization,
                trust_remote_code=self.config.trust_remote_code,
                quantization=self.config.quantization,
                dtype=self.config.dtype,
                enforce_eager=self.config.enforce_eager
            )

            logger.info("✅ vLLM engine initialized successfully")
            return True

        except ImportError:
            logger.error("vLLM not installed. Install with: pip install vllm")
            return False
        except Exception as e:
            logger.error(f"Failed to initialize vLLM: {e}")
            return False

    def get_sampling_params(self) -> 'SamplingParams':
        """Get vLLM sampling parameters."""
        from vllm import SamplingParams

        return SamplingParams(
            temperature=self.config.temperature,
            top_p=self.config.top_p,
            max_tokens=self.config.max_tokens,
            frequency_penalty=self.config.frequency_penalty,
            presence_penalty=self.config.presence_penalty
        )

    def generate_vllm(self, prompts: List[str]) -> List[str]:
        """Generate responses using vLLM."""
        if not self.model:
            raise RuntimeError("Model not initialized. Call deploy_vllm() first.")

        sampling_params = self.get_sampling_params()
        outputs = self.model.generate(prompts, sampling_params)

        return [output.outputs[0].text for output in outputs]

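    # Example (sketch) of offline batch generation with the vLLM backend; assumes
    # the DeepXR/Helion-V1 weights can be resolved from the Hub or a local path:
    #
    #   deployment = HelionDeployment(DeploymentConfig())
    #   if deployment.deploy_vllm():
    #       replies = deployment.generate_vllm(["Summarize vLLM in one sentence."])
    #       print(replies[0])
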
    def create_fastapi_server(self):
        """Create FastAPI server for HTTP API."""
        try:
            from fastapi import FastAPI, HTTPException
            from fastapi.middleware.cors import CORSMiddleware
            from pydantic import BaseModel
            from transformers import AutoTokenizer
            import uvicorn

            app = FastAPI(
                title="Helion-V1 API",
                description="Safe and helpful AI assistant API",
                version="1.0.0"
            )

            # CORS middleware
            app.add_middleware(
                CORSMiddleware,
                allow_origins=["*"],
                allow_credentials=True,
                allow_methods=["*"],
                allow_headers=["*"],
            )

            # Load the tokenizer once at startup rather than on every request
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

            # Request schema (note: max_tokens/temperature/top_p are accepted but
            # generation currently uses the DeploymentConfig defaults)
            class ChatRequest(BaseModel):
                messages: List[Dict[str, str]]
                max_tokens: Optional[int] = 512
                temperature: Optional[float] = 0.7
                top_p: Optional[float] = 0.9

            class ChatResponse(BaseModel):
                response: str
                model: str
                usage: Dict[str, int]

            @app.post("/v1/chat/completions", response_model=ChatResponse)
            async def chat_completion(request: ChatRequest):
                """OpenAI-compatible chat completion endpoint."""
                try:
                    # Format messages with the model's chat template
                    prompt = tokenizer.apply_chat_template(
                        request.messages,
                        tokenize=False,
                        add_generation_prompt=True
                    )

                    # Generate response
                    responses = self.generate_vllm([prompt])

                    return ChatResponse(
                        response=responses[0],
                        model=self.config.model_name,
                        usage={
                            "prompt_tokens": len(tokenizer.encode(prompt)),
                            "completion_tokens": len(tokenizer.encode(responses[0])),
                            "total_tokens": len(tokenizer.encode(prompt + responses[0]))
                        }
                    )

                except Exception as e:
                    logger.error(f"Generation error: {e}")
                    raise HTTPException(status_code=500, detail=str(e))

            @app.get("/health")
            async def health_check():
                """Health check endpoint."""
                return {"status": "healthy", "model": self.config.model_name}

            @app.get("/")
            async def root():
                """Root endpoint."""
                return {
                    "name": "Helion-V1 API",
                    "version": "1.0.0",
                    "status": "online"
                }

            return app

        except ImportError:
            logger.error("FastAPI not installed. Install with: pip install fastapi uvicorn")
            return None

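    # Example request against the chat endpoint once the server is running
    # (host/port follow the uvicorn defaults in main(); payload fields follow ChatRequest):
    #
    #   curl -X POST http://localhost:8000/v1/chat/completions \
    #     -H "Content-Type: application/json" \
    #     -d '{"messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 128}'
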
    def export_onnx(self, output_path: str = "./helion_onnx"):
        """Export model to ONNX format for optimized deployment."""
        try:
            from optimum.onnxruntime import ORTModelForCausalLM
            from transformers import AutoTokenizer

            logger.info("Exporting model to ONNX...")

            model = ORTModelForCausalLM.from_pretrained(
                self.config.model_name,
                export=True
            )
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)

            model.save_pretrained(output_path)
            tokenizer.save_pretrained(output_path)

            logger.info(f"✅ Model exported to {output_path}")
            return True

        except ImportError:
            logger.error("Optimum not installed. Install with: pip install optimum[onnxruntime-gpu]")
            return False
        except Exception as e:
            logger.error(f"ONNX export failed: {e}")
            return False

    def create_docker_config(self, output_path: str = "./"):
        """Generate Dockerfile for containerized deployment."""
        dockerfile_content = f"""FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

# Set working directory
WORKDIR /app

# Install Python and dependencies (curl is required by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y \\
    python3.10 \\
    python3-pip \\
    git \\
    curl \\
    && rm -rf /var/lib/apt/lists/*

# Install Python packages
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Install vLLM for high-performance inference
RUN pip3 install vllm

# Copy application code
COPY . .

# Set environment variables
ENV MODEL_NAME={self.config.model_name}
ENV MAX_MODEL_LEN={self.config.max_model_len}
ENV GPU_MEMORY_UTILIZATION={self.config.gpu_memory_utilization}
ENV TENSOR_PARALLEL_SIZE={self.config.tensor_parallel_size}

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \\
    CMD curl -f http://localhost:8000/health || exit 1

# Run the application
CMD ["python3", "deployment.py", "--server"]
"""

        dockerfile_path = os.path.join(output_path, "Dockerfile")
        with open(dockerfile_path, 'w') as f:
            f.write(dockerfile_content)

        # Also create docker-compose.yml
        docker_compose_content = f"""version: '3.8'

services:
  helion-v1:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MODEL_NAME={self.config.model_name}
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    volumes:
      - model_cache:/root/.cache/huggingface
    restart: unless-stopped

volumes:
  model_cache:
"""

        compose_path = os.path.join(output_path, "docker-compose.yml")
        with open(compose_path, 'w') as f:
            f.write(docker_compose_content)

        logger.info(f"✅ Docker configuration created in {output_path}")
        logger.info("Build with: docker-compose build")
        logger.info("Run with: docker-compose up -d")

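# A compose-free alternative (sketch; the image tag is arbitrary):
#   docker build -t helion-v1 .
#   docker run --gpus all -p 8000:8000 helion-v1
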
def main():
    """Main deployment function."""
    import argparse

    parser = argparse.ArgumentParser(description="Deploy Helion-V1")
    parser.add_argument("--model", default="DeepXR/Helion-V1", help="Model name or path")
    parser.add_argument("--backend", choices=["vllm", "tgi", "fastapi"], default="vllm")
    parser.add_argument("--server", action="store_true", help="Start HTTP server")
    parser.add_argument("--export-onnx", action="store_true", help="Export to ONNX")
    parser.add_argument("--create-docker", action="store_true", help="Create Docker config")
    parser.add_argument("--tensor-parallel", type=int, default=1)
    parser.add_argument("--quantization", choices=["awq", "gptq"], default=None)

    args = parser.parse_args()

    # Create config
    config = DeploymentConfig(
        model_name=args.model,
        tensor_parallel_size=args.tensor_parallel,
        quantization=args.quantization
    )

    deployment = HelionDeployment(config)

    if args.export_onnx:
        deployment.export_onnx()

    if args.create_docker:
        deployment.create_docker_config()

    if args.server:
        if args.backend == "vllm":
            if deployment.deploy_vllm():
                app = deployment.create_fastapi_server()
                if app:
                    import uvicorn
                    logger.info("🚀 Starting Helion-V1 server on http://0.0.0.0:8000")
                    uvicorn.run(app, host="0.0.0.0", port=8000)
        else:
            logger.error(f"Backend {args.backend} not implemented yet")
    elif not (args.export_onnx or args.create_docker):
        logger.info("No action specified. Use --help for options.")


if __name__ == "__main__":
    main()
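# Example invocations (sketch; flags as defined in main() above):
#   python deployment.py --server --backend vllm
#   python deployment.py --create-docker
#   python deployment.py --export-onnx --model DeepXR/Helion-V1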