| """ |
| Helion Orchestrator - Complete Model Management System |
| Unified interface for training, inference, deployment, evaluation, and monitoring |
| """ |
|
|
| import os |
| import sys |
| import json |
| import logging |
| import argparse |
| import subprocess |
| from pathlib import Path |
| from typing import Dict, List, Optional, Any |
| from dataclasses import dataclass, asdict |
| from datetime import datetime |
| import torch |
|
|
| logging.basicConfig( |
| level=logging.INFO, |
| format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' |
| ) |
| logger = logging.getLogger(__name__) |
|
|
|
|
| class MemoryManager: |
| """ |
| Conversation memory manager for Helion. |
| Stores and retrieves conversation history for context-aware responses. |
| """ |
| |
| def __init__(self, memory_file: str = "helion_memory.json", window_size: int = 10): |
| self.memory_file = Path(memory_file) |
| self.window_size = window_size |
| self.conversations: Dict[str, List[Dict]] = {} |
| self.load() |
| |
| def add_interaction(self, conversation_id: str, user_input: str, assistant_response: str): |
| """ |
| Add interaction to memory. |
| |
| Args: |
| conversation_id: Unique conversation identifier |
| user_input: User's message |
| assistant_response: Assistant's response |
| """ |
| if conversation_id not in self.conversations: |
| self.conversations[conversation_id] = [] |
| |
| self.conversations[conversation_id].append({ |
| "timestamp": datetime.now().isoformat(), |
| "user": user_input, |
| "assistant": assistant_response |
| }) |
| |
| |
| if len(self.conversations[conversation_id]) > self.window_size: |
| self.conversations[conversation_id] = self.conversations[conversation_id][-self.window_size:] |
| |
| self.save() |
| |
| def get_context(self, conversation_id: str, max_length: int = 500) -> str: |
| """ |
| Get conversation context as a summary string. |
| |
| Args: |
| conversation_id: Conversation ID |
| max_length: Maximum context length in characters |
| |
| Returns: |
| Context string |
| """ |
| if conversation_id not in self.conversations: |
| return "" |
| |
| interactions = self.conversations[conversation_id] |
| |
| |
| context_parts = [] |
| total_length = 0 |
| |
| for interaction in reversed(interactions): |
| part = f"User: {interaction['user'][:100]} | Assistant: {interaction['assistant'][:100]}" |
| if total_length + len(part) > max_length: |
| break |
| context_parts.insert(0, part) |
| total_length += len(part) |
| |
| return " | ".join(context_parts) |
| |
| def get_conversation(self, conversation_id: str) -> List[Dict]: |
| """Get full conversation history.""" |
| return self.conversations.get(conversation_id, []) |
| |
| def clear_conversation(self, conversation_id: str): |
| """Clear specific conversation.""" |
| if conversation_id in self.conversations: |
| del self.conversations[conversation_id] |
| self.save() |
| |
| def clear_all(self): |
| """Clear all conversations.""" |
| self.conversations = {} |
| self.save() |
| |
| def save(self): |
| """Save memory to file.""" |
| try: |
| self.memory_file.parent.mkdir(parents=True, exist_ok=True) |
| with open(self.memory_file, 'w') as f: |
| json.dump(self.conversations, f, indent=2) |
| except Exception as e: |
| logger.error(f"Failed to save memory: {e}") |
| |
| def load(self): |
| """Load memory from file.""" |
| try: |
| if self.memory_file.exists(): |
| with open(self.memory_file, 'r') as f: |
| self.conversations = json.load(f) |
| logger.info(f"Loaded {len(self.conversations)} conversations from memory") |
| except Exception as e: |
| logger.warning(f"Failed to load memory: {e}") |
| self.conversations = {} |
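
# Usage sketch for MemoryManager (the path is illustrative; the file is
# created on first save):
#
#   memory = MemoryManager(memory_file="./demo/helion_memory.json", window_size=5)
#   memory.add_interaction("conv-1", "What's the capital of France?", "Paris.")
#   memory.add_interaction("conv-1", "And of Spain?", "Madrid.")
#   print(memory.get_context("conv-1"))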


@dataclass
class HelionConfig:
    """Central configuration for all Helion operations."""
    # Model and paths
    model_name: str = "DeepXR/Helion-V1.5"
    base_model: str = "meta-llama/Llama-2-7b-hf"
    output_dir: str = "./helion_workspace"
    cache_dir: str = "./helion_cache"
    log_dir: str = "./helion_logs"

    # Inference
    max_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.9
    device: str = "auto"
    load_in_4bit: bool = False

    # Training
    dataset_path: Optional[str] = None
    num_epochs: int = 3
    batch_size: int = 4
    learning_rate: float = 2e-5

    # Server
    server_host: str = "0.0.0.0"
    server_port: int = 8000
    enable_safeguards: bool = True
    enable_tools: bool = False

    # Memory
    enable_memory: bool = True
    memory_window: int = 10
    memory_file: str = "helion_memory.json"

    # HuggingFace Hub
    hf_token: Optional[str] = None
    push_to_hub: bool = False


class HelionOrchestrator:
    """
    Master orchestrator for all Helion model operations.
    Provides a unified interface for training, inference, deployment, and management.
    """

    def __init__(self, config: Optional[HelionConfig] = None):
        self.config = config or HelionConfig()
        self.config.hf_token = self.config.hf_token or os.getenv("HF_TOKEN")

        # Create working directories
        for dir_path in [self.config.output_dir, self.config.cache_dir, self.config.log_dir]:
            Path(dir_path).mkdir(parents=True, exist_ok=True)

        self.model = None
        self.tokenizer = None
        self.safeguards = None
        self.tool_system = None
        self.memory = None

        self.session_log = []

        if self.config.enable_memory:
            self._init_memory()

    def load_model(
        self,
        model_name: Optional[str] = None,
        quantization: Optional[str] = None
    ) -> bool:
        """
        Load model and tokenizer with optional quantization.

        Args:
            model_name: Model to load (default: config model)
            quantization: Quantization type ('4bit', '8bit', None)

        Returns:
            True if successful
        """
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

            model_name = model_name or self.config.model_name
            logger.info(f"Loading model: {model_name}")

            self.tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                token=self.config.hf_token,
                cache_dir=self.config.cache_dir
            )
            # Llama-family tokenizers often ship without a pad token; fall back to EOS
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            load_kwargs = {
                "device_map": self.config.device,
                "torch_dtype": torch.bfloat16,
                "cache_dir": self.config.cache_dir,
                "token": self.config.hf_token
            }

            if quantization == "4bit" or self.config.load_in_4bit:
                load_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.bfloat16
                )
            elif quantization == "8bit":
                load_kwargs["quantization_config"] = BitsAndBytesConfig(
                    load_in_8bit=True
                )

            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                **load_kwargs
            )
            self.model.eval()

            if self.config.enable_safeguards:
                self._load_safeguards()

            if self.config.enable_tools:
                self._load_tools()

            logger.info("✅ Model loaded successfully")
            self._log_event("model_loaded", {"model": model_name, "quantization": quantization})
            return True

        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            return False
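
    # Loading sketch (assumes a CUDA GPU with enough free memory; '4bit'
    # additionally requires the bitsandbytes package):
    #
    #   orch = HelionOrchestrator()
    #   if orch.load_model(quantization="4bit"):
    #       print(orch.get_info()["model_loaded"])  # True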

    def _load_safeguards(self):
        """Load safeguard system."""
        try:
            from safeguards_v15 import HelionSafeguardSystem, SafeguardConfig, PolicyMode

            config = SafeguardConfig(policy_mode=PolicyMode.MODERATE)
            self.safeguards = HelionSafeguardSystem(config)
            logger.info("Safeguards loaded")
        except ImportError:
            logger.warning("Safeguards module not found")

    def _load_tools(self):
        """Load tool system."""
        try:
            from tools_system import HelionToolSystem

            self.tool_system = HelionToolSystem(self.model, self.tokenizer)
            logger.info("Tool system loaded")
        except ImportError:
            logger.warning("Tools module not found")

    def _init_memory(self):
        """Initialize memory system."""
        self.memory = MemoryManager(
            memory_file=os.path.join(self.config.output_dir, self.config.memory_file),
            window_size=self.config.memory_window
        )
        logger.info("Memory system initialized")

    def unload_model(self):
        """Unload model to free memory."""
        if self.model:
            # Drop references and reset attributes so later `if self.model` checks
            # don't raise AttributeError
            self.model = None
            self.tokenizer = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            logger.info("Model unloaded")

    def generate(
        self,
        prompt: str,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        system_prompt: Optional[str] = None,
        use_safeguards: bool = True,
        use_memory: bool = True,
        conversation_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Generate a response from a prompt.

        Args:
            prompt: Input prompt
            max_tokens: Max tokens to generate
            temperature: Sampling temperature
            system_prompt: Optional system prompt
            use_safeguards: Apply safeguard checks
            use_memory: Use conversation memory
            conversation_id: Conversation identifier for memory

        Returns:
            Dict with response and metadata
        """
        if not self.model:
            raise RuntimeError("Model not loaded. Call load_model() first.")

        # Explicit None checks so 0 / 0.0 are honored as valid overrides
        max_tokens = max_tokens if max_tokens is not None else self.config.max_tokens
        temperature = temperature if temperature is not None else self.config.temperature

        memory_context = ""
        if use_memory and self.memory and conversation_id:
            memory_context = self.memory.get_context(conversation_id)

        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})

        if memory_context:
            messages.append({"role": "system", "content": f"Previous context: {memory_context}"})

        messages.append({"role": "user", "content": prompt})

        # Safeguard check before any generation work
        if use_safeguards and self.safeguards:
            allowed, response = self.safeguards.filter_message(prompt)
            if not allowed:
                return {
                    "response": response,
                    "blocked": True,
                    "reason": "safeguard_violation"
                }

        input_ids = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(self.model.device)

        with torch.no_grad():
            output = self.model.generate(
                input_ids,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=self.config.top_p,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        response_text = self.tokenizer.decode(
            output[0][input_ids.shape[1]:],
            skip_special_tokens=True
        ).strip()

        if use_memory and self.memory and conversation_id:
            self.memory.add_interaction(conversation_id, prompt, response_text)

        result = {
            "response": response_text,
            "blocked": False,
            "prompt_tokens": input_ids.shape[1],
            "completion_tokens": output.shape[1] - input_ids.shape[1],
            "total_tokens": output.shape[1],
            "conversation_id": conversation_id
        }

        self._log_event("generation", {"prompt": prompt[:100], "tokens": result["total_tokens"]})
        return result
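
    # Generation sketch; passing a conversation_id threads memory through
    # successive calls (the ID string is arbitrary):
    #
    #   out = orch.generate("Summarize the Helion project in one line.",
    #                       max_tokens=64, conversation_id="demo")
    #   if not out["blocked"]:
    #       print(out["response"], out["completion_tokens"])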

    def chat(
        self,
        messages: List[Dict[str, str]],
        use_memory: bool = True,
        conversation_id: Optional[str] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Multi-turn chat completion.

        Args:
            messages: List of message dicts
            use_memory: Use memory for context
            conversation_id: Conversation ID for memory
            **kwargs: Generation parameters

        Returns:
            Dict with response and metadata
        """
        if not self.model:
            raise RuntimeError("Model not loaded")

        if use_memory and self.memory and conversation_id:
            memory_context = self.memory.get_context(conversation_id)
            if memory_context:
                # Prepend memory as a system message without mutating the caller's list
                messages = [
                    {"role": "system", "content": f"Previous context: {memory_context}"}
                ] + messages

        input_ids = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(self.model.device)

        with torch.no_grad():
            output = self.model.generate(
                input_ids,
                max_new_tokens=kwargs.get("max_tokens", self.config.max_tokens),
                temperature=kwargs.get("temperature", self.config.temperature),
                top_p=kwargs.get("top_p", self.config.top_p),
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        response = self.tokenizer.decode(
            output[0][input_ids.shape[1]:],
            skip_special_tokens=True
        ).strip()

        if use_memory and self.memory and conversation_id:
            user_message = messages[-1]["content"]
            self.memory.add_interaction(conversation_id, user_message, response)

        return {"response": response, "blocked": False}

    def interactive_chat(self):
        """Start an interactive chat session."""
        if not self.model:
            print("❌ Model not loaded. Run: orchestrator.load_model()")
            return

        print("\n" + "="*60)
        print("Helion Interactive Chat with Memory")
        print("Commands: /quit, /clear, /save, /memory, /newconv, /help")
        print("="*60 + "\n")

        conversation = []
        conversation_id = f"chat_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        if self.memory:
            print(f"💾 Memory: Enabled (ID: {conversation_id})")
            # Only non-empty if this conversation ID has been used before
            prev_context = self.memory.get_context(conversation_id)
            if prev_context:
                print("📝 Retrieved previous context\n")

        while True:
            try:
                user_input = input("\n🧑 You: ").strip()

                if not user_input:
                    continue

                # Slash commands
                if user_input.startswith("/"):
                    if user_input == "/quit":
                        if self.memory:
                            self.memory.save()
                        print("Goodbye!")
                        break
                    elif user_input == "/clear":
                        conversation = []
                        print("Conversation cleared.")
                        continue
                    elif user_input == "/memory":
                        self._show_memory(conversation_id)
                        continue
                    elif user_input == "/newconv":
                        if self.memory:
                            self.memory.save()
                        conversation = []
                        conversation_id = f"chat_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
                        print(f"New conversation started (ID: {conversation_id})")
                        continue
                    elif user_input.startswith("/save"):
                        parts = user_input.split()
                        self._save_conversation(conversation, parts[1] if len(parts) > 1 else None)
                        continue
                    elif user_input == "/help":
                        self._print_chat_help()
                        continue
                    else:
                        print("Unknown command. Type /help for options.")
                        continue

                conversation.append({"role": "user", "content": user_input})

                result = self.chat(
                    conversation,
                    use_memory=True,
                    conversation_id=conversation_id
                )

                if result.get("blocked"):
                    print(f"🤖 Helion: {result['response']}")
                    conversation.pop()
                else:
                    print(f"🤖 Helion: {result['response']}")
                    conversation.append({"role": "assistant", "content": result['response']})

            except KeyboardInterrupt:
                print("\n\nInterrupted. Type /quit to exit.")
            except Exception as e:
                logger.error(f"Error: {e}")
                # Roll back the unanswered user turn so history stays consistent
                if conversation and conversation[-1]["role"] == "user":
                    conversation.pop()

    def train(
        self,
        dataset_path: Optional[str] = None,
        output_dir: Optional[str] = None,
        **kwargs
    ) -> bool:
        """
        Train or fine-tune the model.

        Args:
            dataset_path: Path to training data
            output_dir: Output directory
            **kwargs: Training parameters

        Returns:
            True if successful
        """
        dataset_path = dataset_path or self.config.dataset_path
        output_dir = output_dir or os.path.join(self.config.output_dir, "trained_model")

        if not dataset_path:
            logger.error("No dataset provided")
            return False

        logger.info(f"Starting training with dataset: {dataset_path}")

        try:
            from autotrain_v15 import HelionV15Trainer, HelionV15Config

            # `or` rather than kwargs defaults, so an explicit None from the CLI
            # still falls back to the config values
            train_config = HelionV15Config(
                base_model=self.config.base_model,
                dataset_name=dataset_path,
                output_dir=output_dir,
                num_epochs=kwargs.get("epochs") or self.config.num_epochs,
                batch_size=kwargs.get("batch_size") or self.config.batch_size,
                learning_rate=kwargs.get("learning_rate") or self.config.learning_rate,
                hf_token=self.config.hf_token
            )

            trainer = HelionV15Trainer(train_config)
            success = trainer.run_pipeline()

            if success:
                logger.info("✅ Training completed successfully")
                self._log_event("training_completed", {"dataset": dataset_path, "output": output_dir})

            return success

        except Exception as e:
            logger.error(f"Training failed: {e}")
            return False
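
    # Training-data sketch. The accepted schema is defined by autotrain_v15's
    # HelionV15Config/Trainer (not shown here); a common JSONL chat layout,
    # assumed purely for illustration, is one conversation per line:
    #
    #   {"messages": [{"role": "user", "content": "What is 2+2?"},
    #                 {"role": "assistant", "content": "4"}]}
    #
    # With a file like that in place:
    #
    #   orchestrator.train(dataset_path="./data/train.jsonl", epochs=3)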

    def evaluate(
        self,
        benchmark: str = "all",
        output_file: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Evaluate the model.

        Note: this currently runs a small built-in smoke test; the `benchmark`
        name is recorded in the event log only.

        Args:
            benchmark: Benchmark name or 'all'
            output_file: Save results to file

        Returns:
            Evaluation results
        """
        if not self.model:
            logger.error("Model not loaded")
            return {}

        logger.info(f"Running evaluation: {benchmark}")

        results = {
            "model": self.config.model_name,
            "timestamp": datetime.now().isoformat(),
            "benchmarks": {}
        }

        test_prompts = [
            "What is 2+2?",
            "Explain machine learning",
            "Write a Python function to reverse a string"
        ]

        for i, prompt in enumerate(test_prompts):
            result = self.generate(prompt, max_tokens=256)
            results["benchmarks"][f"test_{i}"] = {
                "prompt": prompt,
                "response": result["response"][:100],
                "tokens": result.get("total_tokens", 0)
            }

        if output_file:
            with open(output_file, 'w') as f:
                json.dump(results, f, indent=2)
            logger.info(f"Results saved to {output_file}")

        self._log_event("evaluation", {"benchmark": benchmark})
        return results

    def start_server(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        reload: bool = False
    ):
        """
        Start the API server.

        Args:
            host: Server host
            port: Server port
            reload: Enable auto-reload
        """
        host = host or self.config.server_host
        port = port or self.config.server_port

        logger.info(f"Starting server on {host}:{port}")

        try:
            import uvicorn
            from server import app  # noqa: F401 - fail fast if the server module is missing

            # Pass runtime settings to the server via the environment
            os.environ["MODEL_NAME"] = self.config.model_name
            os.environ["SAFEGUARD_MODE"] = "moderate" if self.config.enable_safeguards else "permissive"

            uvicorn.run(
                "server:app",
                host=host,
                port=port,
                reload=reload
            )

        except ImportError:
            logger.error("Server dependencies not installed. Run: pip install fastapi uvicorn")
        except Exception as e:
            logger.error(f"Failed to start server: {e}")
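
    # Client sketch for a running server. The actual routes are defined in
    # server.py (not shown here); a /generate endpoint is assumed purely for
    # illustration:
    #
    #   import requests
    #   r = requests.post("http://localhost:8000/generate",
    #                     json={"prompt": "Hello", "max_tokens": 32})
    #   print(r.json())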

    def deploy(self, method: str = "docker", **kwargs):
        """
        Deploy the model using the specified method.

        Args:
            method: Deployment method ('docker', 'vllm', 'tgi')
            **kwargs: Deployment parameters
        """
        logger.info(f"Deploying with method: {method}")

        if method == "docker":
            self._deploy_docker(**kwargs)
        elif method == "vllm":
            self._deploy_vllm(**kwargs)
        elif method == "tgi":
            self._deploy_tgi(**kwargs)
        else:
            logger.error(f"Unknown deployment method: {method}")

    def _deploy_docker(self, **kwargs):
        """Create a Docker deployment."""
        dockerfile_content = f"""FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

WORKDIR /app

RUN apt-get update && apt-get install -y python3 python3-pip git

COPY requirements.txt .
RUN pip3 install -r requirements.txt

COPY . .

ENV MODEL_NAME={self.config.model_name}
EXPOSE 8000

CMD ["python3", "server.py", "--host", "0.0.0.0", "--port", "8000"]
"""

        with open(Path(self.config.output_dir) / "Dockerfile", 'w') as f:
            f.write(dockerfile_content)

        logger.info("Dockerfile created. Build with: docker build -t helion-v15 .")

    def _deploy_vllm(self, **kwargs):
        """Deploy with vLLM."""
        cmd = f"python -m vllm.entrypoints.openai.api_server --model {self.config.model_name}"
        logger.info(f"vLLM command: {cmd}")
        subprocess.run(cmd, shell=True)

    def _deploy_tgi(self, **kwargs):
        """Deploy with Text Generation Inference."""
        cmd = f"""docker run --gpus all -p 8080:80 \\
    ghcr.io/huggingface/text-generation-inference:latest \\
    --model-id {self.config.model_name}"""
        logger.info(f"TGI command:\n{cmd}")

    def benchmark_speed(self, num_requests: int = 10) -> Dict[str, float]:
        """
        Benchmark inference speed.

        Args:
            num_requests: Number of test requests

        Returns:
            Speed metrics
        """
        if not self.model:
            logger.error("Model not loaded")
            return {}

        import time

        logger.info(f"Running speed benchmark ({num_requests} requests)...")

        test_prompt = "Explain artificial intelligence in one sentence."
        times = []
        tokens = []

        for _ in range(num_requests):
            start = time.time()
            result = self.generate(test_prompt, max_tokens=50, use_safeguards=False)
            elapsed = time.time() - start

            times.append(elapsed)
            tokens.append(result.get("completion_tokens", 0))

        avg_time = sum(times) / len(times)
        avg_tokens = sum(tokens) / len(tokens)
        tokens_per_sec = avg_tokens / avg_time

        metrics = {
            "avg_latency": avg_time,
            "avg_tokens": avg_tokens,
            "tokens_per_second": tokens_per_sec,
            "requests": num_requests
        }

        logger.info(f"Benchmark results: {tokens_per_sec:.2f} tokens/sec, {avg_time:.2f}s avg latency")
        return metrics
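
    # Example run (numbers vary with hardware and quantization):
    #
    #   metrics = orchestrator.benchmark_speed(num_requests=20)
    #   print(f"{metrics['tokens_per_second']:.1f} tok/s, "
    #         f"{metrics['avg_latency']:.2f}s avg latency")
    #
    # Each timing includes tokenization and memory bookkeeping, so this is a
    # lower bound on raw decode throughput.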

    def export_model(self, format: str = "gguf", output_path: Optional[str] = None):
        """
        Export the model to a different format.

        Note: this is a stub that points at the appropriate external tooling;
        no conversion is performed here.

        Args:
            format: Export format ('gguf', 'onnx', 'tensorrt')
            output_path: Output path
        """
        output_path = output_path or os.path.join(self.config.output_dir, f"exported_{format}")
        logger.info(f"Exporting model to {format} format...")

        if format == "gguf":
            logger.info("GGUF export: use the llama.cpp conversion tools")
        elif format == "onnx":
            logger.info("ONNX export: use the optimum library")
        elif format == "tensorrt":
            logger.info("TensorRT export: use the TensorRT conversion tools")
        else:
            logger.error(f"Unknown format: {format}")

    def push_to_hub(
        self,
        repo_id: Optional[str] = None,
        private: bool = False
    ) -> bool:
        """
        Push the model to the Hugging Face Hub.

        Args:
            repo_id: Repository ID
            private: Make repository private

        Returns:
            True if successful
        """
        repo_id = repo_id or self.config.model_name

        if not self.config.hf_token:
            logger.error("Hugging Face token not set")
            return False

        try:
            from huggingface_hub import HfApi

            api = HfApi(token=self.config.hf_token)

            logger.info(f"Pushing to hub: {repo_id}")

            api.create_repo(repo_id, exist_ok=True, private=private)
            api.upload_folder(
                folder_path=self.config.output_dir,
                repo_id=repo_id,
                repo_type="model"
            )

            logger.info("✅ Model pushed successfully")
            return True

        except Exception as e:
            logger.error(f"Failed to push to hub: {e}")
            return False
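
    # Usage sketch (requires a valid HF_TOKEN and write access; the repo name
    # below is hypothetical):
    #
    #   orch.push_to_hub(repo_id="your-org/helion-v15-finetune", private=True)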

    def get_info(self) -> Dict[str, Any]:
        """Get orchestrator status and info."""
        # Redact the token so the info dict can be printed or logged safely
        config_dict = asdict(self.config)
        if config_dict.get("hf_token"):
            config_dict["hf_token"] = "***"

        info = {
            "model_loaded": self.model is not None,
            "model_name": self.config.model_name,
            "device": str(self.model.device) if self.model else None,
            "safeguards_enabled": self.safeguards is not None,
            "tools_enabled": self.tool_system is not None,
            "memory_enabled": self.memory is not None,
            "config": config_dict,
            "session_events": len(self.session_log)
        }

        if self.model:
            # Peak CUDA memory in GiB (0 when running on CPU)
            info["model_memory"] = torch.cuda.max_memory_allocated() / 1024**3 if torch.cuda.is_available() else 0

        if self.memory:
            info["total_conversations"] = len(self.memory.conversations)
            info["total_interactions"] = sum(len(conv) for conv in self.memory.conversations.values())

        return info

    def _show_memory(self, conversation_id: str):
        """Display memory for a conversation."""
        if not self.memory:
            print("Memory not enabled")
            return

        context = self.memory.get_context(conversation_id)
        interactions = self.memory.get_conversation(conversation_id)

        print(f"\n{'='*60}")
        print(f"Memory for Conversation: {conversation_id}")
        print(f"{'='*60}")
        print(f"Total interactions: {len(interactions)}")
        if len(context) > 200:
            print(f"\nContext summary:\n{context[:200]}...")
        else:
            print(f"\nContext:\n{context}")
        print(f"{'='*60}\n")

    def _log_event(self, event_type: str, data: Dict[str, Any]):
        """Log an orchestrator event."""
        event = {
            "timestamp": datetime.now().isoformat(),
            "type": event_type,
            "data": data
        }
        self.session_log.append(event)

        # Append to the persistent JSONL log; never let logging break the caller
        log_file = Path(self.config.log_dir) / "orchestrator.jsonl"
        try:
            with open(log_file, 'a') as f:
                f.write(json.dumps(event) + '\n')
        except OSError as e:
            logger.error(f"Failed to write event log: {e}")

    def _save_conversation(self, conversation: List[Dict], filename: Optional[str] = None):
        """Save a conversation to file."""
        filename = filename or f"conversation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        filepath = Path(self.config.output_dir) / filename

        with open(filepath, 'w') as f:
            json.dump(conversation, f, indent=2)

        print(f"💾 Conversation saved to {filepath}")

    def _print_chat_help(self):
        """Print chat help."""
        print("""
Available Commands:
  /quit          - Exit chat and save memory
  /clear         - Clear current conversation
  /save [name]   - Save conversation to file
  /memory        - Show memory for this conversation
  /newconv       - Start a new conversation (saves current)
  /help          - Show this help message
""")


def main():
    """CLI interface for the Helion Orchestrator."""
    parser = argparse.ArgumentParser(
        description="Helion Orchestrator - Complete Model Management",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Load model and start interactive chat
  python helion_orchestrator.py chat --model DeepXR/Helion-V1.5

  # Train model
  python helion_orchestrator.py train --dataset ./data/train.jsonl --epochs 3

  # Start API server
  python helion_orchestrator.py serve --port 8000

  # Evaluate model
  python helion_orchestrator.py eval --benchmark all

  # Benchmark speed
  python helion_orchestrator.py benchmark --requests 20
"""
    )

    parser.add_argument("command", choices=[
        "chat", "generate", "train", "eval", "serve",
        "benchmark", "export", "push", "info"
    ])

    # Model options; an explicit dest avoids the invalid attribute name args.4bit
    parser.add_argument("--model", help="Model name or path")
    parser.add_argument("--4bit", dest="use_4bit", action="store_true", help="Use 4-bit quantization")
    parser.add_argument("--no-safeguards", action="store_true", help="Disable safeguards")

    # Command-specific options
    parser.add_argument("--prompt", help="Prompt for generation")
    parser.add_argument("--dataset", help="Dataset path for training")
    parser.add_argument("--epochs", type=int, help="Training epochs")
    parser.add_argument("--port", type=int, default=8000, help="Server port")
    parser.add_argument("--benchmark", help="Benchmark name")
    parser.add_argument("--requests", type=int, default=10, help="Number of benchmark requests")
    parser.add_argument("--format", help="Export format")
    parser.add_argument("--repo-id", help="HuggingFace repo ID")

    args = parser.parse_args()

    config = HelionConfig(
        model_name=args.model or "DeepXR/Helion-V1.5",
        load_in_4bit=args.use_4bit,
        enable_safeguards=not args.no_safeguards
    )

    orchestrator = HelionOrchestrator(config)

    if args.command == "chat":
        orchestrator.load_model()
        orchestrator.interactive_chat()

    elif args.command == "generate":
        if not args.prompt:
            print("Error: --prompt required")
            sys.exit(1)
        orchestrator.load_model()
        result = orchestrator.generate(args.prompt)
        print(f"\nResponse:\n{result['response']}\n")
        print(f"Tokens: {result.get('total_tokens', 0)}")

    elif args.command == "train":
        if not args.dataset:
            print("Error: --dataset required")
            sys.exit(1)
        orchestrator.train(dataset_path=args.dataset, epochs=args.epochs)

    elif args.command == "eval":
        orchestrator.load_model()
        orchestrator.evaluate(benchmark=args.benchmark or "all")

    elif args.command == "serve":
        orchestrator.start_server(port=args.port)

    elif args.command == "benchmark":
        orchestrator.load_model()
        orchestrator.benchmark_speed(num_requests=args.requests)

    elif args.command == "export":
        orchestrator.load_model()
        orchestrator.export_model(format=args.format or "gguf")

    elif args.command == "push":
        orchestrator.push_to_hub(repo_id=args.repo_id)

    elif args.command == "info":
        info = orchestrator.get_info()
        print(json.dumps(info, indent=2))


if __name__ == "__main__":
    main()