"""
Helion Orchestrator - Complete Model Management System
Unified interface for training, inference, deployment, evaluation, and monitoring
"""
import os
import sys
import json
import logging
import argparse
import subprocess
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
from datetime import datetime
import torch
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class MemoryManager:
"""
Conversation memory manager for Helion.
Stores and retrieves conversation history for context-aware responses.
"""
def __init__(self, memory_file: str = "helion_memory.json", window_size: int = 10):
self.memory_file = Path(memory_file)
self.window_size = window_size
self.conversations: Dict[str, List[Dict]] = {}
self.load()
def add_interaction(self, conversation_id: str, user_input: str, assistant_response: str):
"""
Add interaction to memory.
Args:
conversation_id: Unique conversation identifier
user_input: User's message
assistant_response: Assistant's response
"""
if conversation_id not in self.conversations:
self.conversations[conversation_id] = []
self.conversations[conversation_id].append({
"timestamp": datetime.now().isoformat(),
"user": user_input,
"assistant": assistant_response
})
# Keep only last N interactions per conversation
if len(self.conversations[conversation_id]) > self.window_size:
self.conversations[conversation_id] = self.conversations[conversation_id][-self.window_size:]
self.save()
def get_context(self, conversation_id: str, max_length: int = 500) -> str:
"""
Get conversation context as a summary string.
Args:
conversation_id: Conversation ID
max_length: Maximum context length in characters
Returns:
Context string
"""
if conversation_id not in self.conversations:
return ""
interactions = self.conversations[conversation_id]
# Build context from recent interactions
context_parts = []
total_length = 0
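        # Walk interactions newest-to-oldest, prepending kept parts so the
        # final context string stays in chronological order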
for interaction in reversed(interactions):
part = f"User: {interaction['user'][:100]} | Assistant: {interaction['assistant'][:100]}"
if total_length + len(part) > max_length:
break
context_parts.insert(0, part)
total_length += len(part)
return " | ".join(context_parts)
def get_conversation(self, conversation_id: str) -> List[Dict]:
"""Get full conversation history."""
return self.conversations.get(conversation_id, [])
def clear_conversation(self, conversation_id: str):
"""Clear specific conversation."""
if conversation_id in self.conversations:
del self.conversations[conversation_id]
self.save()
def clear_all(self):
"""Clear all conversations."""
self.conversations = {}
self.save()
def save(self):
"""Save memory to file."""
try:
self.memory_file.parent.mkdir(parents=True, exist_ok=True)
with open(self.memory_file, 'w') as f:
json.dump(self.conversations, f, indent=2)
except Exception as e:
logger.error(f"Failed to save memory: {e}")
def load(self):
"""Load memory from file."""
try:
if self.memory_file.exists():
with open(self.memory_file, 'r') as f:
self.conversations = json.load(f)
logger.info(f"Loaded {len(self.conversations)} conversations from memory")
except Exception as e:
logger.warning(f"Failed to load memory: {e}")
self.conversations = {}
@dataclass
class HelionConfig:
"""Central configuration for all Helion operations."""
model_name: str = "DeepXR/Helion-V1.5"
base_model: str = "meta-llama/Llama-2-7b-hf"
output_dir: str = "./helion_workspace"
cache_dir: str = "./helion_cache"
log_dir: str = "./helion_logs"
# Inference settings
max_tokens: int = 512
temperature: float = 0.7
top_p: float = 0.9
device: str = "auto"
load_in_4bit: bool = False
# Training settings
dataset_path: Optional[str] = None
num_epochs: int = 3
batch_size: int = 4
learning_rate: float = 2e-5
# Server settings
server_host: str = "0.0.0.0"
server_port: int = 8000
enable_safeguards: bool = True
enable_tools: bool = False
# Memory settings
enable_memory: bool = True
    memory_window: int = 10  # Keep the last N interactions per conversation
memory_file: str = "helion_memory.json"
# HuggingFace
hf_token: Optional[str] = None
push_to_hub: bool = False
class HelionOrchestrator:
"""
Master orchestrator for all Helion model operations.
Provides unified interface for training, inference, deployment, and management.
"""
def __init__(self, config: Optional[HelionConfig] = None):
self.config = config or HelionConfig()
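        # Fall back to the HF_TOKEN environment variable when no token is set explicitly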
self.config.hf_token = self.config.hf_token or os.getenv("HF_TOKEN")
# Create directories
for dir_path in [self.config.output_dir, self.config.cache_dir, self.config.log_dir]:
Path(dir_path).mkdir(parents=True, exist_ok=True)
# Model components
self.model = None
self.tokenizer = None
self.safeguards = None
self.tool_system = None
self.memory = None
self.session_log = []
# Initialize memory if enabled
if self.config.enable_memory:
self._init_memory()
# ==================== Model Loading ====================
def load_model(
self,
model_name: Optional[str] = None,
quantization: Optional[str] = None
) -> bool:
"""
Load model and tokenizer with optional quantization.
Args:
model_name: Model to load (default: config model)
quantization: Quantization type ('4bit', '8bit', None)
Returns:
True if successful
"""
try:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
model_name = model_name or self.config.model_name
logger.info(f"Loading model: {model_name}")
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
token=self.config.hf_token,
cache_dir=self.config.cache_dir
)
# Quantization config
load_kwargs = {
"device_map": self.config.device,
"torch_dtype": torch.bfloat16,
"cache_dir": self.config.cache_dir,
"token": self.config.hf_token
}
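        # Note: bfloat16 needs hardware support (e.g. Ampere-or-newer NVIDIA GPUs);
        # swap torch_dtype to torch.float16 above if targeting older cards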
if quantization == "4bit" or self.config.load_in_4bit:
load_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16
)
elif quantization == "8bit":
load_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_8bit=True
)
# Load model
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
**load_kwargs
)
self.model.eval()
# Load safeguards if enabled
if self.config.enable_safeguards:
self._load_safeguards()
# Load tools if enabled
if self.config.enable_tools:
self._load_tools()
logger.info("✅ Model loaded successfully")
self._log_event("model_loaded", {"model": model_name, "quantization": quantization})
return True
except Exception as e:
logger.error(f"Failed to load model: {e}")
return False
def _load_safeguards(self):
"""Load safeguard system."""
try:
from safeguards_v15 import HelionSafeguardSystem, SafeguardConfig, PolicyMode
config = SafeguardConfig(policy_mode=PolicyMode.MODERATE)
self.safeguards = HelionSafeguardSystem(config)
logger.info("Safeguards loaded")
except ImportError:
logger.warning("Safeguards module not found")
def _load_tools(self):
"""Load tool system."""
try:
from tools_system import HelionToolSystem
self.tool_system = HelionToolSystem(self.model, self.tokenizer)
logger.info("Tool system loaded")
except ImportError:
logger.warning("Tools module not found")
def _init_memory(self):
"""Initialize memory system."""
self.memory = MemoryManager(
memory_file=os.path.join(self.config.output_dir, self.config.memory_file),
window_size=self.config.memory_window
)
logger.info("Memory system initialized")
    def unload_model(self):
        """Unload model to free memory."""
        if self.model:
            # Drop references and reset to None so later `if not self.model`
            # checks keep working (a bare `del` would leave the attribute undefined)
            self.model = None
            self.tokenizer = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            logger.info("Model unloaded")
# ==================== Inference ====================
def generate(
self,
prompt: str,
max_tokens: Optional[int] = None,
temperature: Optional[float] = None,
system_prompt: Optional[str] = None,
use_safeguards: bool = True,
use_memory: bool = True,
conversation_id: Optional[str] = None
) -> Dict[str, Any]:
"""
Generate response from prompt.
Args:
prompt: Input prompt
max_tokens: Max tokens to generate
temperature: Sampling temperature
system_prompt: Optional system prompt
use_safeguards: Apply safeguard checks
use_memory: Use conversation memory
conversation_id: Conversation identifier for memory
Returns:
Dict with response and metadata
"""
if not self.model:
raise RuntimeError("Model not loaded. Call load_model() first.")
        # Explicit None checks so legitimate falsy values (e.g. temperature=0.0)
        # are not silently replaced by the config defaults
        if max_tokens is None:
            max_tokens = self.config.max_tokens
        if temperature is None:
            temperature = self.config.temperature
# Retrieve memory context if enabled
memory_context = ""
if use_memory and self.memory and conversation_id:
memory_context = self.memory.get_context(conversation_id)
# Build messages
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
# Add memory context if available
if memory_context:
messages.append({"role": "system", "content": f"Previous context: {memory_context}"})
messages.append({"role": "user", "content": prompt})
# Check with safeguards
if use_safeguards and self.safeguards:
allowed, response = self.safeguards.filter_message(prompt)
if not allowed:
return {
"response": response,
"blocked": True,
"reason": "safeguard_violation"
}
# Generate
input_ids = self.tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt"
).to(self.model.device)
with torch.no_grad():
output = self.model.generate(
input_ids,
max_new_tokens=max_tokens,
temperature=temperature,
top_p=self.config.top_p,
do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id or self.tokenizer.eos_token_id,  # Llama-style tokenizers often lack a pad token
eos_token_id=self.tokenizer.eos_token_id
)
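        # Decode only the newly generated tokens (everything after the prompt)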
response_text = self.tokenizer.decode(
output[0][input_ids.shape[1]:],
skip_special_tokens=True
).strip()
# Store in memory if enabled
if use_memory and self.memory and conversation_id:
self.memory.add_interaction(conversation_id, prompt, response_text)
result = {
"response": response_text,
"blocked": False,
"prompt_tokens": input_ids.shape[1],
"completion_tokens": output.shape[1] - input_ids.shape[1],
"total_tokens": output.shape[1],
"conversation_id": conversation_id
}
self._log_event("generation", {"prompt": prompt[:100], "tokens": result["total_tokens"]})
return result
def chat(
self,
messages: List[Dict[str, str]],
use_memory: bool = True,
conversation_id: Optional[str] = None,
**kwargs
) -> Dict[str, Any]:
"""
Multi-turn chat completion.
Args:
messages: List of message dicts
use_memory: Use memory for context
conversation_id: Conversation ID for memory
**kwargs: Generation parameters
Returns:
Dict with response and metadata
"""
if not self.model:
raise RuntimeError("Model not loaded")
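        # Note: unlike generate(), chat() does not run safeguard filtering, so
        # the "blocked" key in its result is always False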
# Add memory context if available
if use_memory and self.memory and conversation_id:
memory_context = self.memory.get_context(conversation_id)
if memory_context:
# Insert memory context before user messages
messages = [
{"role": "system", "content": f"Previous context: {memory_context}"}
] + messages
# Similar to generate but maintains conversation
input_ids = self.tokenizer.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt"
).to(self.model.device)
with torch.no_grad():
output = self.model.generate(
input_ids,
max_new_tokens=kwargs.get("max_tokens", self.config.max_tokens),
temperature=kwargs.get("temperature", self.config.temperature),
top_p=kwargs.get("top_p", self.config.top_p),
do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id or self.tokenizer.eos_token_id,  # Llama-style tokenizers often lack a pad token
eos_token_id=self.tokenizer.eos_token_id
)
response = self.tokenizer.decode(
output[0][input_ids.shape[1]:],
skip_special_tokens=True
).strip()
# Store in memory
if use_memory and self.memory and conversation_id:
user_message = messages[-1]["content"]
self.memory.add_interaction(conversation_id, user_message, response)
return {"response": response, "blocked": False}
def interactive_chat(self):
"""Start interactive chat session."""
if not self.model:
print("❌ Model not loaded. Run: orchestrator.load_model()")
return
print("\n" + "="*60)
print("Helion Interactive Chat with Memory")
print("Commands: /quit, /clear, /save, /memory, /newconv, /help")
print("="*60 + "\n")
conversation = []
conversation_id = f"chat_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
# Show memory status
if self.memory:
print(f"💾 Memory: Enabled (ID: {conversation_id})")
            # Check for previous context (only possible if an existing ID is
            # reused; freshly timestamped IDs start empty)
prev_context = self.memory.get_context(conversation_id)
if prev_context:
print(f"📝 Retrieved previous context\n")
while True:
try:
user_input = input("\n🧑 You: ").strip()
if not user_input:
continue
# Handle commands
if user_input.startswith("/"):
if user_input == "/quit":
if self.memory:
self.memory.save()
print("Goodbye!")
break
elif user_input == "/clear":
conversation = []
print("Conversation cleared.")
continue
elif user_input == "/memory":
self._show_memory(conversation_id)
continue
elif user_input == "/newconv":
if self.memory:
self.memory.save()
conversation = []
conversation_id = f"chat_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
print(f"New conversation started (ID: {conversation_id})")
continue
elif user_input.startswith("/save"):
self._save_conversation(conversation, user_input.split()[1] if len(user_input.split()) > 1 else None)
continue
elif user_input == "/help":
self._print_chat_help()
continue
else:
print("Unknown command. Type /help for options.")
continue
conversation.append({"role": "user", "content": user_input})
result = self.chat(
conversation,
use_memory=True,
conversation_id=conversation_id
)
if result.get("blocked"):
print(f"🤖 Helion: {result['response']}")
conversation.pop()
else:
print(f"🤖 Helion: {result['response']}")
conversation.append({"role": "assistant", "content": result['response']})
except KeyboardInterrupt:
print("\n\nInterrupted. Type /quit to exit.")
            except Exception as e:
                logger.error(f"Error: {e}")
                # Drop the failed user turn (if any) so the next attempt starts clean
                if conversation and conversation[-1]["role"] == "user":
                    conversation.pop()
# ==================== Training ====================
def train(
self,
dataset_path: Optional[str] = None,
output_dir: Optional[str] = None,
**kwargs
) -> bool:
"""
Train or fine-tune model.
Args:
dataset_path: Path to training data
output_dir: Output directory
**kwargs: Training parameters
Returns:
True if successful
"""
dataset_path = dataset_path or self.config.dataset_path
output_dir = output_dir or os.path.join(self.config.output_dir, "trained_model")
if not dataset_path:
logger.error("No dataset provided")
return False
logger.info(f"Starting training with dataset: {dataset_path}")
try:
# Import training script
from autotrain_v15 import HelionV15Trainer, HelionV15Config
train_config = HelionV15Config(
base_model=self.config.base_model,
dataset_name=dataset_path,
output_dir=output_dir,
                # `or` fallbacks also cover explicit None values (main() passes
                # epochs=None when --epochs is omitted on the CLI)
                num_epochs=kwargs.get("epochs") or self.config.num_epochs,
                batch_size=kwargs.get("batch_size") or self.config.batch_size,
                learning_rate=kwargs.get("learning_rate") or self.config.learning_rate,
hf_token=self.config.hf_token
)
trainer = HelionV15Trainer(train_config)
success = trainer.run_pipeline()
if success:
logger.info("✅ Training completed successfully")
self._log_event("training_completed", {"dataset": dataset_path, "output": output_dir})
return success
except Exception as e:
logger.error(f"Training failed: {e}")
return False
# ==================== Evaluation ====================
def evaluate(
self,
benchmark: str = "all",
output_file: Optional[str] = None
) -> Dict[str, Any]:
"""
Evaluate model on benchmarks.
Args:
benchmark: Benchmark name or 'all'
output_file: Save results to file
Returns:
Evaluation results
"""
if not self.model:
logger.error("Model not loaded")
return {}
logger.info(f"Running evaluation: {benchmark}")
results = {
"model": self.config.model_name,
"timestamp": datetime.now().isoformat(),
"benchmarks": {}
}
# Sample evaluation (integrate with actual benchmarks)
test_prompts = [
"What is 2+2?",
"Explain machine learning",
"Write a Python function to reverse a string"
]
for i, prompt in enumerate(test_prompts):
result = self.generate(prompt, max_tokens=256)
results["benchmarks"][f"test_{i}"] = {
"prompt": prompt,
"response": result["response"][:100],
"tokens": result.get("total_tokens", 0)
}
if output_file:
with open(output_file, 'w') as f:
json.dump(results, f, indent=2)
logger.info(f"Results saved to {output_file}")
self._log_event("evaluation", {"benchmark": benchmark})
return results
# ==================== Deployment ====================
def start_server(
self,
host: Optional[str] = None,
port: Optional[int] = None,
reload: bool = False
):
"""
Start API server.
Args:
host: Server host
port: Server port
reload: Enable auto-reload
"""
host = host or self.config.server_host
port = port or self.config.server_port
logger.info(f"Starting server on {host}:{port}")
        try:
            import uvicorn
            # Set environment variables before server.py is imported, so the
            # module sees them at import time
            os.environ["MODEL_NAME"] = self.config.model_name
            os.environ["SAFEGUARD_MODE"] = "moderate" if self.config.enable_safeguards else "permissive"
            from server import app  # noqa: F401 -- fail fast if server.py is missing
            uvicorn.run(
                "server:app",
                host=host,
                port=port,
                reload=reload
            )
except ImportError:
logger.error("Server dependencies not installed. Run: pip install fastapi uvicorn")
except Exception as e:
logger.error(f"Failed to start server: {e}")
def deploy(self, method: str = "docker", **kwargs):
"""
Deploy model using specified method.
Args:
method: Deployment method ('docker', 'vllm', 'tgi')
**kwargs: Deployment parameters
"""
logger.info(f"Deploying with method: {method}")
if method == "docker":
self._deploy_docker(**kwargs)
elif method == "vllm":
self._deploy_vllm(**kwargs)
elif method == "tgi":
self._deploy_tgi(**kwargs)
else:
logger.error(f"Unknown deployment method: {method}")
def _deploy_docker(self, **kwargs):
"""Create Docker deployment."""
dockerfile_content = f"""FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
WORKDIR /app
RUN apt-get update && apt-get install -y python3 python3-pip git
COPY requirements.txt .
RUN pip3 install -r requirements.txt
COPY . .
ENV MODEL_NAME={self.config.model_name}
EXPOSE 8000
CMD ["python3", "server.py", "--host", "0.0.0.0", "--port", "8000"]
"""
with open(Path(self.config.output_dir) / "Dockerfile", 'w') as f:
f.write(dockerfile_content)
logger.info("Dockerfile created. Build with: docker build -t helion-v15 .")
def _deploy_vllm(self, **kwargs):
"""Deploy with vLLM."""
cmd = f"python -m vllm.entrypoints.openai.api_server --model {self.config.model_name}"
logger.info(f"vLLM command: {cmd}")
subprocess.run(cmd, shell=True)
def _deploy_tgi(self, **kwargs):
"""Deploy with Text Generation Inference."""
cmd = f"""docker run --gpus all -p 8080:80 \\
ghcr.io/huggingface/text-generation-inference:latest \\
--model-id {self.config.model_name}"""
logger.info(f"TGI command:\n{cmd}")
# ==================== Utilities ====================
def benchmark_speed(self, num_requests: int = 10) -> Dict[str, float]:
"""
Benchmark inference speed.
Args:
num_requests: Number of test requests
Returns:
Speed metrics
"""
if not self.model:
logger.error("Model not loaded")
return {}
import time
logger.info(f"Running speed benchmark ({num_requests} requests)...")
test_prompt = "Explain artificial intelligence in one sentence."
times = []
tokens = []
for i in range(num_requests):
start = time.time()
result = self.generate(test_prompt, max_tokens=50, use_safeguards=False)
elapsed = time.time() - start
times.append(elapsed)
tokens.append(result.get("completion_tokens", 0))
avg_time = sum(times) / len(times)
avg_tokens = sum(tokens) / len(tokens)
tokens_per_sec = avg_tokens / avg_time
metrics = {
"avg_latency": avg_time,
"avg_tokens": avg_tokens,
"tokens_per_second": tokens_per_sec,
"requests": num_requests
}
logger.info(f"Benchmark results: {tokens_per_sec:.2f} tokens/sec, {avg_time:.2f}s avg latency")
return metrics
def export_model(self, format: str = "gguf", output_path: Optional[str] = None):
"""
Export model to different formats.
Args:
format: Export format ('gguf', 'onnx', 'tensorrt')
output_path: Output path
"""
output_path = output_path or os.path.join(self.config.output_dir, f"exported_{format}")
logger.info(f"Exporting model to {format} format...")
if format == "gguf":
logger.info("GGUF export: Use llama.cpp conversion tools")
elif format == "onnx":
logger.info("ONNX export: Use optimum library")
elif format == "tensorrt":
logger.info("TensorRT export: Use TensorRT conversion")
else:
logger.error(f"Unknown format: {format}")
def push_to_hub(
self,
repo_id: Optional[str] = None,
private: bool = False
) -> bool:
"""
Push model to HuggingFace Hub.
Args:
repo_id: Repository ID
private: Make repository private
Returns:
True if successful
"""
repo_id = repo_id or self.config.model_name
if not self.config.hf_token:
logger.error("HuggingFace token not set")
return False
try:
from huggingface_hub import HfApi
api = HfApi(token=self.config.hf_token)
logger.info(f"Pushing to hub: {repo_id}")
api.create_repo(repo_id, exist_ok=True, private=private)
api.upload_folder(
folder_path=self.config.output_dir,
repo_id=repo_id,
repo_type="model"
)
logger.info("✅ Model pushed successfully")
return True
except Exception as e:
logger.error(f"Failed to push to hub: {e}")
return False
def get_info(self) -> Dict[str, Any]:
"""Get orchestrator status and info."""
info = {
"model_loaded": self.model is not None,
"model_name": self.config.model_name,
"device": str(self.model.device) if self.model else None,
"safeguards_enabled": self.safeguards is not None,
"tools_enabled": self.tool_system is not None,
"memory_enabled": self.memory is not None,
"config": asdict(self.config),
"session_events": len(self.session_log)
}
if self.model:
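            # Peak CUDA memory allocated by this process, reported in GiB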
info["model_memory"] = torch.cuda.max_memory_allocated() / 1024**3 if torch.cuda.is_available() else 0
if self.memory:
info["total_conversations"] = len(self.memory.conversations)
info["total_interactions"] = sum(len(conv) for conv in self.memory.conversations.values())
return info
def _show_memory(self, conversation_id: str):
"""Display memory for conversation."""
if not self.memory:
print("Memory not enabled")
return
context = self.memory.get_context(conversation_id)
interactions = self.memory.get_conversation(conversation_id)
print(f"\n{'='*60}")
print(f"Memory for Conversation: {conversation_id}")
print(f"{'='*60}")
print(f"Total interactions: {len(interactions)}")
print(f"\nContext summary:\n{context[:200]}..." if len(context) > 200 else f"\nContext:\n{context}")
print(f"{'='*60}\n")
def _log_event(self, event_type: str, data: Dict[str, Any]):
"""Log orchestrator event."""
event = {
"timestamp": datetime.now().isoformat(),
"type": event_type,
"data": data
}
self.session_log.append(event)
# Save to log file
log_file = Path(self.config.log_dir) / "orchestrator.jsonl"
with open(log_file, 'a') as f:
f.write(json.dumps(event) + '\n')
def _save_conversation(self, conversation: List[Dict], filename: Optional[str] = None):
"""Save conversation to file."""
filename = filename or f"conversation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
filepath = Path(self.config.output_dir) / filename
with open(filepath, 'w') as f:
json.dump(conversation, f, indent=2)
print(f"💾 Conversation saved to {filepath}")
def _print_chat_help(self):
"""Print chat help."""
print("""
Available Commands:
/quit - Exit chat and save memory
/clear - Clear current conversation
/save [name] - Save conversation to file
/memory - Show memory for this conversation
/newconv - Start a new conversation (saves current)
/help - Show this help message
""")
def main():
"""CLI interface for Helion Orchestrator."""
parser = argparse.ArgumentParser(
description="Helion Orchestrator - Complete Model Management",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
# Load model and start interactive chat
python helion_orchestrator.py chat --model DeepXR/Helion-V1.5
# Train model
python helion_orchestrator.py train --dataset ./data/train.jsonl --epochs 3
# Start API server
python helion_orchestrator.py serve --port 8000
# Evaluate model
python helion_orchestrator.py eval --benchmark all
# Benchmark speed
python helion_orchestrator.py benchmark --requests 20
"""
)
parser.add_argument("command", choices=[
"chat", "generate", "train", "eval", "serve",
"benchmark", "export", "push", "info"
])
# Common args
parser.add_argument("--model", help="Model name or path")
parser.add_argument("--4bit", action="store_true", help="Use 4-bit quantization")
parser.add_argument("--no-safeguards", action="store_true", help="Disable safeguards")
# Command-specific args
parser.add_argument("--prompt", help="Prompt for generation")
parser.add_argument("--dataset", help="Dataset path for training")
parser.add_argument("--epochs", type=int, help="Training epochs")
parser.add_argument("--port", type=int, default=8000, help="Server port")
parser.add_argument("--benchmark", help="Benchmark name")
parser.add_argument("--requests", type=int, default=10, help="Number of benchmark requests")
parser.add_argument("--format", help="Export format")
parser.add_argument("--repo-id", help="HuggingFace repo ID")
args = parser.parse_args()
# Create config
config = HelionConfig(
model_name=args.model or "DeepXR/Helion-V1.5",
        load_in_4bit=args.load_4bit,
enable_safeguards=not args.no_safeguards
)
# Create orchestrator
orchestrator = HelionOrchestrator(config)
# Execute command
if args.command == "chat":
orchestrator.load_model()
orchestrator.interactive_chat()
elif args.command == "generate":
if not args.prompt:
print("Error: --prompt required")
sys.exit(1)
orchestrator.load_model()
result = orchestrator.generate(args.prompt)
print(f"\nResponse:\n{result['response']}\n")
print(f"Tokens: {result.get('total_tokens', 0)}")
elif args.command == "train":
if not args.dataset:
print("Error: --dataset required")
sys.exit(1)
orchestrator.train(dataset_path=args.dataset, epochs=args.epochs)
elif args.command == "eval":
orchestrator.load_model()
orchestrator.evaluate(benchmark=args.benchmark or "all")
elif args.command == "serve":
orchestrator.start_server(port=args.port)
elif args.command == "benchmark":
orchestrator.load_model()
orchestrator.benchmark_speed(num_requests=args.requests)
elif args.command == "export":
orchestrator.load_model()
orchestrator.export_model(format=args.format or "gguf")
elif args.command == "push":
orchestrator.push_to_hub(repo_id=args.repo_id)
elif args.command == "info":
info = orchestrator.get_info()
print(json.dumps(info, indent=2))
if __name__ == "__main__":
main()