Helion-V1.5 / helion_orchestrator.py

Update helion_orchestrator.py

fc7a401 verified 6 months ago

35.5 kB

	"""
	Helion Orchestrator - Complete Model Management System
	Unified interface for training, inference, deployment, evaluation, and monitoring
	"""

	import os
	import sys
	import json
	import logging
	import argparse
	import subprocess
	from pathlib import Path
	from typing import Dict, List, Optional, Any
	from dataclasses import dataclass, asdict
	from datetime import datetime
	import torch

	logging.basicConfig(
	level=logging.INFO,
	format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
	)
	logger = logging.getLogger(__name__)


	class MemoryManager:
	"""
	Conversation memory manager for Helion.
	Stores and retrieves conversation history for context-aware responses.
	"""

	def __init__(self, memory_file: str = "helion_memory.json", window_size: int = 10):
	self.memory_file = Path(memory_file)
	self.window_size = window_size
	self.conversations: Dict[str, List[Dict]] = {}
	self.load()

	def add_interaction(self, conversation_id: str, user_input: str, assistant_response: str):
	"""
	Add interaction to memory.

	Args:
	conversation_id: Unique conversation identifier
	user_input: User's message
	assistant_response: Assistant's response
	"""
	if conversation_id not in self.conversations:
	self.conversations[conversation_id] = []

	self.conversations[conversation_id].append({
	"timestamp": datetime.now().isoformat(),
	"user": user_input,
	"assistant": assistant_response
	})

	# Keep only last N interactions per conversation
	if len(self.conversations[conversation_id]) > self.window_size:
	self.conversations[conversation_id] = self.conversations[conversation_id][-self.window_size:]

	self.save()

	def get_context(self, conversation_id: str, max_length: int = 500) -> str:
	"""
	Get conversation context as a summary string.

	Args:
	conversation_id: Conversation ID
	max_length: Maximum context length in characters

	Returns:
	Context string
	"""
	if conversation_id not in self.conversations:
	return ""

	interactions = self.conversations[conversation_id]

	# Build context from recent interactions
	context_parts = []
	total_length = 0

	for interaction in reversed(interactions):
	part = f"User: {interaction['user'][:100]} \| Assistant: {interaction['assistant'][:100]}"
	if total_length + len(part) > max_length:
	break
	context_parts.insert(0, part)
	total_length += len(part)

	return " \| ".join(context_parts)

	def get_conversation(self, conversation_id: str) -> List[Dict]:
	"""Get full conversation history."""
	return self.conversations.get(conversation_id, [])

	def clear_conversation(self, conversation_id: str):
	"""Clear specific conversation."""
	if conversation_id in self.conversations:
	del self.conversations[conversation_id]
	self.save()

	def clear_all(self):
	"""Clear all conversations."""
	self.conversations = {}
	self.save()

	def save(self):
	"""Save memory to file."""
	try:
	self.memory_file.parent.mkdir(parents=True, exist_ok=True)
	with open(self.memory_file, 'w') as f:
	json.dump(self.conversations, f, indent=2)
	except Exception as e:
	logger.error(f"Failed to save memory: {e}")

	def load(self):
	"""Load memory from file."""
	try:
	if self.memory_file.exists():
	with open(self.memory_file, 'r') as f:
	self.conversations = json.load(f)
	logger.info(f"Loaded {len(self.conversations)} conversations from memory")
	except Exception as e:
	logger.warning(f"Failed to load memory: {e}")
	self.conversations = {}


	@dataclass
	class HelionConfig:
	"""Central configuration for all Helion operations."""
	model_name: str = "DeepXR/Helion-V1.5"
	base_model: str = "meta-llama/Llama-2-7b-hf"
	output_dir: str = "./helion_workspace"
	cache_dir: str = "./helion_cache"
	log_dir: str = "./helion_logs"

	# Inference settings
	max_tokens: int = 512
	temperature: float = 0.7
	top_p: float = 0.9
	device: str = "auto"
	load_in_4bit: bool = False

	# Training settings
	dataset_path: Optional[str] = None
	num_epochs: int = 3
	batch_size: int = 4
	learning_rate: float = 2e-5

	# Server settings
	server_host: str = "0.0.0.0"
	server_port: int = 8000
	enable_safeguards: bool = True
	enable_tools: bool = False

	# Memory settings
	enable_memory: bool = True
	memory_window: int = 10 # Remember last N conversations
	memory_file: str = "helion_memory.json"

	# HuggingFace
	hf_token: Optional[str] = None
	push_to_hub: bool = False


	class HelionOrchestrator:
	"""
	Master orchestrator for all Helion model operations.
	Provides unified interface for training, inference, deployment, and management.
	"""

	def __init__(self, config: Optional[HelionConfig] = None):
	self.config = config or HelionConfig()
	self.config.hf_token = self.config.hf_token or os.getenv("HF_TOKEN")

	# Create directories
	for dir_path in [self.config.output_dir, self.config.cache_dir, self.config.log_dir]:
	Path(dir_path).mkdir(parents=True, exist_ok=True)

	# Model components
	self.model = None
	self.tokenizer = None
	self.safeguards = None
	self.tool_system = None
	self.memory = None

	self.session_log = []

	# Initialize memory if enabled
	if self.config.enable_memory:
	self._init_memory()

	# ==================== Model Loading ====================

	def load_model(
	self,
	model_name: Optional[str] = None,
	quantization: Optional[str] = None
	) -> bool:
	"""
	Load model and tokenizer with optional quantization.

	Args:
	model_name: Model to load (default: config model)
	quantization: Quantization type ('4bit', '8bit', None)

	Returns:
	True if successful
	"""
	try:
	from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

	model_name = model_name or self.config.model_name
	logger.info(f"Loading model: {model_name}")

	# Load tokenizer
	self.tokenizer = AutoTokenizer.from_pretrained(
	model_name,
	token=self.config.hf_token,
	cache_dir=self.config.cache_dir
	)

	# Quantization config
	load_kwargs = {
	"device_map": self.config.device,
	"torch_dtype": torch.bfloat16,
	"cache_dir": self.config.cache_dir,
	"token": self.config.hf_token
	}

	if quantization == "4bit" or self.config.load_in_4bit:
	load_kwargs["quantization_config"] = BitsAndBytesConfig(
	load_in_4bit=True,
	bnb_4bit_compute_dtype=torch.bfloat16
	)
	elif quantization == "8bit":
	load_kwargs["quantization_config"] = BitsAndBytesConfig(
	load_in_8bit=True
	)

	# Load model
	self.model = AutoModelForCausalLM.from_pretrained(
	model_name,
	**load_kwargs
	)
	self.model.eval()

	# Load safeguards if enabled
	if self.config.enable_safeguards:
	self._load_safeguards()

	# Load tools if enabled
	if self.config.enable_tools:
	self._load_tools()

	logger.info("✅ Model loaded successfully")
	self._log_event("model_loaded", {"model": model_name, "quantization": quantization})
	return True

	except Exception as e:
	logger.error(f"Failed to load model: {e}")
	return False

	def _load_safeguards(self):
	"""Load safeguard system."""
	try:
	from safeguards_v15 import HelionSafeguardSystem, SafeguardConfig, PolicyMode

	config = SafeguardConfig(policy_mode=PolicyMode.MODERATE)
	self.safeguards = HelionSafeguardSystem(config)
	logger.info("Safeguards loaded")
	except ImportError:
	logger.warning("Safeguards module not found")

	def _load_tools(self):
	"""Load tool system."""
	try:
	from tools_system import HelionToolSystem

	self.tool_system = HelionToolSystem(self.model, self.tokenizer)
	logger.info("Tool system loaded")
	except ImportError:
	logger.warning("Tools module not found")

	def _init_memory(self):
	"""Initialize memory system."""
	self.memory = MemoryManager(
	memory_file=os.path.join(self.config.output_dir, self.config.memory_file),
	window_size=self.config.memory_window
	)
	logger.info("Memory system initialized")

	def unload_model(self):
	"""Unload model to free memory."""
	if self.model:
	del self.model
	del self.tokenizer
	torch.cuda.empty_cache()
	logger.info("Model unloaded")

	# ==================== Inference ====================

	def generate(
	self,
	prompt: str,
	max_tokens: Optional[int] = None,
	temperature: Optional[float] = None,
	system_prompt: Optional[str] = None,
	use_safeguards: bool = True,
	use_memory: bool = True,
	conversation_id: Optional[str] = None
	) -> Dict[str, Any]:
	"""
	Generate response from prompt.

	Args:
	prompt: Input prompt
	max_tokens: Max tokens to generate
	temperature: Sampling temperature
	system_prompt: Optional system prompt
	use_safeguards: Apply safeguard checks
	use_memory: Use conversation memory
	conversation_id: Conversation identifier for memory

	Returns:
	Dict with response and metadata
	"""
	if not self.model:
	raise RuntimeError("Model not loaded. Call load_model() first.")

	max_tokens = max_tokens or self.config.max_tokens
	temperature = temperature or self.config.temperature

	# Retrieve memory context if enabled
	memory_context = ""
	if use_memory and self.memory and conversation_id:
	memory_context = self.memory.get_context(conversation_id)

	# Build messages
	messages = []
	if system_prompt:
	messages.append({"role": "system", "content": system_prompt})

	# Add memory context if available
	if memory_context:
	messages.append({"role": "system", "content": f"Previous context: {memory_context}"})

	messages.append({"role": "user", "content": prompt})

	# Check with safeguards
	if use_safeguards and self.safeguards:
	allowed, response = self.safeguards.filter_message(prompt)
	if not allowed:
	return {
	"response": response,
	"blocked": True,
	"reason": "safeguard_violation"
	}

	# Generate
	input_ids = self.tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	return_tensors="pt"
	).to(self.model.device)

	with torch.no_grad():
	output = self.model.generate(
	input_ids,
	max_new_tokens=max_tokens,
	temperature=temperature,
	top_p=self.config.top_p,
	do_sample=True,
	pad_token_id=self.tokenizer.pad_token_id,
	eos_token_id=self.tokenizer.eos_token_id
	)

	response_text = self.tokenizer.decode(
	output[0][input_ids.shape[1]:],
	skip_special_tokens=True
	).strip()

	# Store in memory if enabled
	if use_memory and self.memory and conversation_id:
	self.memory.add_interaction(conversation_id, prompt, response_text)

	result = {
	"response": response_text,
	"blocked": False,
	"prompt_tokens": input_ids.shape[1],
	"completion_tokens": output.shape[1] - input_ids.shape[1],
	"total_tokens": output.shape[1],
	"conversation_id": conversation_id
	}

	self._log_event("generation", {"prompt": prompt[:100], "tokens": result["total_tokens"]})
	return result

	def chat(
	self,
	messages: List[Dict[str, str]],
	use_memory: bool = True,
	conversation_id: Optional[str] = None,
	**kwargs
	) -> Dict[str, Any]:
	"""
	Multi-turn chat completion.

	Args:
	messages: List of message dicts
	use_memory: Use memory for context
	conversation_id: Conversation ID for memory
	**kwargs: Generation parameters

	Returns:
	Dict with response and metadata
	"""
	if not self.model:
	raise RuntimeError("Model not loaded")

	# Add memory context if available
	if use_memory and self.memory and conversation_id:
	memory_context = self.memory.get_context(conversation_id)
	if memory_context:
	# Insert memory context before user messages
	messages = [
	{"role": "system", "content": f"Previous context: {memory_context}"}
	] + messages

	# Similar to generate but maintains conversation
	input_ids = self.tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	return_tensors="pt"
	).to(self.model.device)

	with torch.no_grad():
	output = self.model.generate(
	input_ids,
	max_new_tokens=kwargs.get("max_tokens", self.config.max_tokens),
	temperature=kwargs.get("temperature", self.config.temperature),
	top_p=kwargs.get("top_p", self.config.top_p),
	do_sample=True,
	pad_token_id=self.tokenizer.pad_token_id,
	eos_token_id=self.tokenizer.eos_token_id
	)

	response = self.tokenizer.decode(
	output[0][input_ids.shape[1]:],
	skip_special_tokens=True
	).strip()

	# Store in memory
	if use_memory and self.memory and conversation_id:
	user_message = messages[-1]["content"]
	self.memory.add_interaction(conversation_id, user_message, response)

	return {"response": response, "blocked": False}

	def interactive_chat(self):
	"""Start interactive chat session."""
	if not self.model:
	print("❌ Model not loaded. Run: orchestrator.load_model()")
	return

	print("\n" + "="*60)
	print("Helion Interactive Chat with Memory")
	print("Commands: /quit, /clear, /save, /memory, /newconv, /help")
	print("="*60 + "\n")

	conversation = []
	conversation_id = f"chat_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

	# Show memory status
	if self.memory:
	print(f"💾 Memory: Enabled (ID: {conversation_id})")
	# Check if there's previous context
	prev_context = self.memory.get_context(conversation_id)
	if prev_context:
	print(f"📝 Retrieved previous context\n")

	while True:
	try:
	user_input = input("\n🧑 You: ").strip()

	if not user_input:
	continue

	# Handle commands
	if user_input.startswith("/"):
	if user_input == "/quit":
	if self.memory:
	self.memory.save()
	print("Goodbye!")
	break
	elif user_input == "/clear":
	conversation = []
	print("Conversation cleared.")
	continue
	elif user_input == "/memory":
	self._show_memory(conversation_id)
	continue
	elif user_input == "/newconv":
	if self.memory:
	self.memory.save()
	conversation = []
	conversation_id = f"chat_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
	print(f"New conversation started (ID: {conversation_id})")
	continue
	elif user_input.startswith("/save"):
	self._save_conversation(conversation, user_input.split()[1] if len(user_input.split()) > 1 else None)
	continue
	elif user_input == "/help":
	self._print_chat_help()
	continue
	else:
	print("Unknown command. Type /help for options.")
	continue

	conversation.append({"role": "user", "content": user_input})

	result = self.chat(
	conversation,
	use_memory=True,
	conversation_id=conversation_id
	)

	if result.get("blocked"):
	print(f"🤖 Helion: {result['response']}")
	conversation.pop()
	else:
	print(f"🤖 Helion: {result['response']}")
	conversation.append({"role": "assistant", "content": result['response']})

	except KeyboardInterrupt:
	print("\n\nInterrupted. Type /quit to exit.")
	except Exception as e:
	logger.error(f"Error: {e}")
	conversation.pop()

	# ==================== Training ====================

	def train(
	self,
	dataset_path: Optional[str] = None,
	output_dir: Optional[str] = None,
	**kwargs
	) -> bool:
	"""
	Train or fine-tune model.

	Args:
	dataset_path: Path to training data
	output_dir: Output directory
	**kwargs: Training parameters

	Returns:
	True if successful
	"""
	dataset_path = dataset_path or self.config.dataset_path
	output_dir = output_dir or os.path.join(self.config.output_dir, "trained_model")

	if not dataset_path:
	logger.error("No dataset provided")
	return False

	logger.info(f"Starting training with dataset: {dataset_path}")

	try:
	# Import training script
	from autotrain_v15 import HelionV15Trainer, HelionV15Config

	train_config = HelionV15Config(
	base_model=self.config.base_model,
	dataset_name=dataset_path,
	output_dir=output_dir,
	num_epochs=kwargs.get("epochs", self.config.num_epochs),
	batch_size=kwargs.get("batch_size", self.config.batch_size),
	learning_rate=kwargs.get("learning_rate", self.config.learning_rate),
	hf_token=self.config.hf_token
	)

	trainer = HelionV15Trainer(train_config)
	success = trainer.run_pipeline()

	if success:
	logger.info("✅ Training completed successfully")
	self._log_event("training_completed", {"dataset": dataset_path, "output": output_dir})

	return success

	except Exception as e:
	logger.error(f"Training failed: {e}")
	return False

	# ==================== Evaluation ====================

	def evaluate(
	self,
	benchmark: str = "all",
	output_file: Optional[str] = None
	) -> Dict[str, Any]:
	"""
	Evaluate model on benchmarks.

	Args:
	benchmark: Benchmark name or 'all'
	output_file: Save results to file

	Returns:
	Evaluation results
	"""
	if not self.model:
	logger.error("Model not loaded")
	return {}

	logger.info(f"Running evaluation: {benchmark}")

	results = {
	"model": self.config.model_name,
	"timestamp": datetime.now().isoformat(),
	"benchmarks": {}
	}

	# Sample evaluation (integrate with actual benchmarks)
	test_prompts = [
	"What is 2+2?",
	"Explain machine learning",
	"Write a Python function to reverse a string"
	]

	for i, prompt in enumerate(test_prompts):
	result = self.generate(prompt, max_tokens=256)
	results["benchmarks"][f"test_{i}"] = {
	"prompt": prompt,
	"response": result["response"][:100],
	"tokens": result.get("total_tokens", 0)
	}

	if output_file:
	with open(output_file, 'w') as f:
	json.dump(results, f, indent=2)
	logger.info(f"Results saved to {output_file}")

	self._log_event("evaluation", {"benchmark": benchmark})
	return results

	# ==================== Deployment ====================

	def start_server(
	self,
	host: Optional[str] = None,
	port: Optional[int] = None,
	reload: bool = False
	):
	"""
	Start API server.

	Args:
	host: Server host
	port: Server port
	reload: Enable auto-reload
	"""
	host = host or self.config.server_host
	port = port or self.config.server_port

	logger.info(f"Starting server on {host}:{port}")

	try:
	import uvicorn
	from server import app

	# Set environment variables
	os.environ["MODEL_NAME"] = self.config.model_name
	os.environ["SAFEGUARD_MODE"] = "moderate" if self.config.enable_safeguards else "permissive"

	uvicorn.run(
	"server:app",
	host=host,
	port=port,
	reload=reload
	)

	except ImportError:
	logger.error("Server dependencies not installed. Run: pip install fastapi uvicorn")
	except Exception as e:
	logger.error(f"Failed to start server: {e}")

	def deploy(self, method: str = "docker", **kwargs):
	"""
	Deploy model using specified method.

	Args:
	method: Deployment method ('docker', 'vllm', 'tgi')
	**kwargs: Deployment parameters
	"""
	logger.info(f"Deploying with method: {method}")

	if method == "docker":
	self._deploy_docker(**kwargs)
	elif method == "vllm":
	self._deploy_vllm(**kwargs)
	elif method == "tgi":
	self._deploy_tgi(**kwargs)
	else:
	logger.error(f"Unknown deployment method: {method}")

	def _deploy_docker(self, **kwargs):
	"""Create Docker deployment."""
	dockerfile_content = f"""FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

	WORKDIR /app

	RUN apt-get update && apt-get install -y python3 python3-pip git

	COPY requirements.txt .
	RUN pip3 install -r requirements.txt

	COPY . .

	ENV MODEL_NAME={self.config.model_name}
	EXPOSE 8000

	CMD ["python3", "server.py", "--host", "0.0.0.0", "--port", "8000"]
	"""

	with open(Path(self.config.output_dir) / "Dockerfile", 'w') as f:
	f.write(dockerfile_content)

	logger.info("Dockerfile created. Build with: docker build -t helion-v15 .")

	def _deploy_vllm(self, **kwargs):
	"""Deploy with vLLM."""
	cmd = f"python -m vllm.entrypoints.openai.api_server --model {self.config.model_name}"
	logger.info(f"vLLM command: {cmd}")
	subprocess.run(cmd, shell=True)

	def _deploy_tgi(self, **kwargs):
	"""Deploy with Text Generation Inference."""
	cmd = f"""docker run --gpus all -p 8080:80 \\
	ghcr.io/huggingface/text-generation-inference:latest \\
	--model-id {self.config.model_name}"""
	logger.info(f"TGI command:\n{cmd}")

	# ==================== Utilities ====================

	def benchmark_speed(self, num_requests: int = 10) -> Dict[str, float]:
	"""
	Benchmark inference speed.

	Args:
	num_requests: Number of test requests

	Returns:
	Speed metrics
	"""
	if not self.model:
	logger.error("Model not loaded")
	return {}

	import time

	logger.info(f"Running speed benchmark ({num_requests} requests)...")

	test_prompt = "Explain artificial intelligence in one sentence."
	times = []
	tokens = []

	for i in range(num_requests):
	start = time.time()
	result = self.generate(test_prompt, max_tokens=50, use_safeguards=False)
	elapsed = time.time() - start

	times.append(elapsed)
	tokens.append(result.get("completion_tokens", 0))

	avg_time = sum(times) / len(times)
	avg_tokens = sum(tokens) / len(tokens)
	tokens_per_sec = avg_tokens / avg_time

	metrics = {
	"avg_latency": avg_time,
	"avg_tokens": avg_tokens,
	"tokens_per_second": tokens_per_sec,
	"requests": num_requests
	}

	logger.info(f"Benchmark results: {tokens_per_sec:.2f} tokens/sec, {avg_time:.2f}s avg latency")
	return metrics

	def export_model(self, format: str = "gguf", output_path: Optional[str] = None):
	"""
	Export model to different formats.

	Args:
	format: Export format ('gguf', 'onnx', 'tensorrt')
	output_path: Output path
	"""
	output_path = output_path or os.path.join(self.config.output_dir, f"exported_{format}")
	logger.info(f"Exporting model to {format} format...")

	if format == "gguf":
	logger.info("GGUF export: Use llama.cpp conversion tools")
	elif format == "onnx":
	logger.info("ONNX export: Use optimum library")
	elif format == "tensorrt":
	logger.info("TensorRT export: Use TensorRT conversion")
	else:
	logger.error(f"Unknown format: {format}")

	def push_to_hub(
	self,
	repo_id: Optional[str] = None,
	private: bool = False
	) -> bool:
	"""
	Push model to HuggingFace Hub.

	Args:
	repo_id: Repository ID
	private: Make repository private

	Returns:
	True if successful
	"""
	repo_id = repo_id or self.config.model_name

	if not self.config.hf_token:
	logger.error("HuggingFace token not set")
	return False

	try:
	from huggingface_hub import HfApi

	api = HfApi(token=self.config.hf_token)

	logger.info(f"Pushing to hub: {repo_id}")

	api.create_repo(repo_id, exist_ok=True, private=private)
	api.upload_folder(
	folder_path=self.config.output_dir,
	repo_id=repo_id,
	repo_type="model"
	)

	logger.info("✅ Model pushed successfully")
	return True

	except Exception as e:
	logger.error(f"Failed to push to hub: {e}")
	return False

	def get_info(self) -> Dict[str, Any]:
	"""Get orchestrator status and info."""
	info = {
	"model_loaded": self.model is not None,
	"model_name": self.config.model_name,
	"device": str(self.model.device) if self.model else None,
	"safeguards_enabled": self.safeguards is not None,
	"tools_enabled": self.tool_system is not None,
	"memory_enabled": self.memory is not None,
	"config": asdict(self.config),
	"session_events": len(self.session_log)
	}

	if self.model:
	info["model_memory"] = torch.cuda.max_memory_allocated() / 1024**3 if torch.cuda.is_available() else 0

	if self.memory:
	info["total_conversations"] = len(self.memory.conversations)
	info["total_interactions"] = sum(len(conv) for conv in self.memory.conversations.values())

	return info

	def _show_memory(self, conversation_id: str):
	"""Display memory for conversation."""
	if not self.memory:
	print("Memory not enabled")
	return

	context = self.memory.get_context(conversation_id)
	interactions = self.memory.get_conversation(conversation_id)

	print(f"\n{'='*60}")
	print(f"Memory for Conversation: {conversation_id}")
	print(f"{'='*60}")
	print(f"Total interactions: {len(interactions)}")
	print(f"\nContext summary:\n{context[:200]}..." if len(context) > 200 else f"\nContext:\n{context}")
	print(f"{'='*60}\n")

	def _log_event(self, event_type: str, data: Dict[str, Any]):
	"""Log orchestrator event."""
	event = {
	"timestamp": datetime.now().isoformat(),
	"type": event_type,
	"data": data
	}
	self.session_log.append(event)

	# Save to log file
	log_file = Path(self.config.log_dir) / "orchestrator.jsonl"
	with open(log_file, 'a') as f:
	f.write(json.dumps(event) + '\n')

	def _save_conversation(self, conversation: List[Dict], filename: Optional[str] = None):
	"""Save conversation to file."""
	filename = filename or f"conversation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
	filepath = Path(self.config.output_dir) / filename

	with open(filepath, 'w') as f:
	json.dump(conversation, f, indent=2)

	print(f"💾 Conversation saved to {filepath}")

	def _print_chat_help(self):
	"""Print chat help."""
	print("""
	Available Commands:
	/quit - Exit chat and save memory
	/clear - Clear current conversation
	/save [name] - Save conversation to file
	/memory - Show memory for this conversation
	/newconv - Start a new conversation (saves current)
	/help - Show this help message
	""")


	def main():
	"""CLI interface for Helion Orchestrator."""
	parser = argparse.ArgumentParser(
	description="Helion Orchestrator - Complete Model Management",
	formatter_class=argparse.RawDescriptionHelpFormatter,
	epilog="""
	Examples:
	# Load model and start interactive chat
	python helion_orchestrator.py chat --model DeepXR/Helion-V1.5

	# Train model
	python helion_orchestrator.py train --dataset ./data/train.jsonl --epochs 3

	# Start API server
	python helion_orchestrator.py serve --port 8000

	# Evaluate model
	python helion_orchestrator.py eval --benchmark all

	# Benchmark speed
	python helion_orchestrator.py benchmark --requests 20
	"""
	)

	parser.add_argument("command", choices=[
	"chat", "generate", "train", "eval", "serve",
	"benchmark", "export", "push", "info"
	])

	# Common args
	parser.add_argument("--model", help="Model name or path")
	parser.add_argument("--4bit", action="store_true", help="Use 4-bit quantization")
	parser.add_argument("--no-safeguards", action="store_true", help="Disable safeguards")

	# Command-specific args
	parser.add_argument("--prompt", help="Prompt for generation")
	parser.add_argument("--dataset", help="Dataset path for training")
	parser.add_argument("--epochs", type=int, help="Training epochs")
	parser.add_argument("--port", type=int, default=8000, help="Server port")
	parser.add_argument("--benchmark", help="Benchmark name")
	parser.add_argument("--requests", type=int, default=10, help="Number of benchmark requests")
	parser.add_argument("--format", help="Export format")
	parser.add_argument("--repo-id", help="HuggingFace repo ID")

	args = parser.parse_args()

	# Create config
	config = HelionConfig(
	model_name=args.model or "DeepXR/Helion-V1.5",
	load_in_4bit=args.__dict__.get('4bit', False),
	enable_safeguards=not args.no_safeguards
	)

	# Create orchestrator
	orchestrator = HelionOrchestrator(config)

	# Execute command
	if args.command == "chat":
	orchestrator.load_model()
	orchestrator.interactive_chat()

	elif args.command == "generate":
	if not args.prompt:
	print("Error: --prompt required")
	sys.exit(1)
	orchestrator.load_model()
	result = orchestrator.generate(args.prompt)
	print(f"\nResponse:\n{result['response']}\n")
	print(f"Tokens: {result.get('total_tokens', 0)}")

	elif args.command == "train":
	if not args.dataset:
	print("Error: --dataset required")
	sys.exit(1)
	orchestrator.train(dataset_path=args.dataset, epochs=args.epochs)

	elif args.command == "eval":
	orchestrator.load_model()
	orchestrator.evaluate(benchmark=args.benchmark or "all")

	elif args.command == "serve":
	orchestrator.start_server(port=args.port)

	elif args.command == "benchmark":
	orchestrator.load_model()
	orchestrator.benchmark_speed(num_requests=args.requests)

	elif args.command == "export":
	orchestrator.load_model()
	orchestrator.export_model(format=args.format or "gguf")

	elif args.command == "push":
	orchestrator.push_to_hub(repo_id=args.repo_id)

	elif args.command == "info":
	info = orchestrator.get_info()
	print(json.dumps(info, indent=2))


	if __name__ == "__main__":
	main()