"""
HuggingFace Model Orchestrator
===============================
Loads and manages HuggingFace models for the dual-LLM pipeline.
Supports LFM2-8B-A1B-Dimensional-Entanglement and 9xdSq-LIMPS-FemTO-R1C.
"""

import gc
import logging
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple

import torch

try:
    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
    from transformers import BitsAndBytesConfig, GenerationConfig
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("⚠️ Transformers not available - install with: pip install transformers")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class ModelConfig:
    """Configuration for HuggingFace model loading."""
    model_name: str
    device: str = "auto"
    torch_dtype: torch.dtype = torch.bfloat16
    trust_remote_code: bool = True
    use_cache: bool = True
    low_cpu_mem_usage: bool = True
    quantization_config: Optional[Any] = None
    max_memory: Optional[Dict[int, str]] = None
    offload_folder: Optional[str] = None


@dataclass
class GenerationSettings:
    """Settings for text generation."""
    max_new_tokens: int = 512
    temperature: float = 0.7
    top_p: float = 0.9
    top_k: int = 50
    repetition_penalty: float = 1.1
    do_sample: bool = True
    pad_token_id: Optional[int] = None
    eos_token_id: Optional[int] = None
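
# The defaults above feed both model.generation_config (set at load time) and the
# per-call fallbacks in generate_with_primary/secondary; individual calls can still
# override them, e.g. (a sketch, assuming an orchestrator has been constructed):
#
#   orchestrator.generation_settings = GenerationSettings(temperature=0.2, max_new_tokens=256)
#   text = orchestrator.generate_with_primary(prompt, top_p=0.95)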


class HuggingFaceModelOrchestrator:
    """Orchestrator for managing HuggingFace models in the dual-LLM pipeline."""

    def __init__(self, primary_config: ModelConfig, secondary_config: ModelConfig):
        if not TRANSFORMERS_AVAILABLE:
            raise ImportError("transformers is required - install with: pip install transformers")

        self.primary_config = primary_config
        self.secondary_config = secondary_config

        self.primary_model = None
        self.primary_tokenizer = None
        self.secondary_model = None
        self.secondary_tokenizer = None

        self.device = self._determine_device()
        self.generation_settings = GenerationSettings()

        # Caches reserved for reusing already-loaded models/tokenizers
        self.model_cache = {}
        self.tokenizer_cache = {}

        logger.info("🤖 Initializing HuggingFace Model Orchestrator")
        logger.info(f"   Primary Model: {primary_config.model_name}")
        logger.info(f"   Secondary Model: {secondary_config.model_name}")
        logger.info(f"   Device: {self.device}")

    def _determine_device(self) -> str:
        """Determine the best device for model loading."""
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            logger.info(f"🖥️ GPU detected: {gpu_count} devices, {gpu_memory:.1f}GB total memory")

            if gpu_memory > 20:
                return "cuda:0"
            else:
                logger.warning("⚠️ Limited GPU memory, defaulting to CPU")
                return "cpu"
        else:
            logger.info("🖥️ No GPU detected, using CPU")
            return "cpu"

    def _setup_quantization_config(self) -> Optional["BitsAndBytesConfig"]:
        """Set up 8-bit quantization for memory optimization (CUDA only)."""
        if torch.cuda.is_available():
            try:
                return BitsAndBytesConfig(
                    load_in_8bit=True,
                    llm_int8_enable_fp32_cpu_offload=True,
                    llm_int8_skip_modules=["lm_head"]
                )
            except Exception as e:
                logger.warning(f"⚠️ Quantization setup failed: {e}")
                return None
        return None

    def _load_model_and_tokenizer(self, config: ModelConfig, model_key: str) -> Tuple[Any, Any]:
        """Load a single model and tokenizer with error handling."""
        logger.info(f"📥 Loading {model_key} model: {config.model_name}")

        try:
            # Tokenizer
            tokenizer = AutoTokenizer.from_pretrained(
                config.model_name,
                trust_remote_code=config.trust_remote_code,
                use_fast=True
            )

            # Ensure a pad token exists for padded generation
            if tokenizer.pad_token is None:
                if tokenizer.eos_token is not None:
                    tokenizer.pad_token = tokenizer.eos_token
                else:
                    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

            # Model configuration
            model_config = AutoConfig.from_pretrained(
                config.model_name,
                trust_remote_code=config.trust_remote_code
            )

            # Quantization only applies off-CPU; skip the 8-bit fallback for CPU targets
            quantization_config = config.quantization_config
            if quantization_config is None and config.device != "cpu":
                quantization_config = self._setup_quantization_config()

            # Model weights
            model = AutoModelForCausalLM.from_pretrained(
                config.model_name,
                config=model_config,
                torch_dtype=config.torch_dtype,
                device_map=config.device,
                trust_remote_code=config.trust_remote_code,
                use_cache=config.use_cache,
                low_cpu_mem_usage=config.low_cpu_mem_usage,
                quantization_config=quantization_config,
                max_memory=config.max_memory,
                offload_folder=config.offload_folder
            )

            # Default generation configuration
            model.generation_config = GenerationConfig(
                max_new_tokens=self.generation_settings.max_new_tokens,
                temperature=self.generation_settings.temperature,
                top_p=self.generation_settings.top_p,
                top_k=self.generation_settings.top_k,
                repetition_penalty=self.generation_settings.repetition_penalty,
                do_sample=self.generation_settings.do_sample,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

            logger.info(f"✅ {model_key} model loaded successfully")
            logger.info(f"   Model size: {model.num_parameters():,} parameters")
            logger.info(f"   Device: {next(model.parameters()).device}")

            return model, tokenizer

        except Exception as e:
            logger.error(f"❌ Failed to load {model_key} model: {e}")
            raise

    def load_primary_model(self) -> bool:
        """Load the primary LFM2-8B model."""
        try:
            self.primary_model, self.primary_tokenizer = self._load_model_and_tokenizer(
                self.primary_config, "Primary"
            )
            return True
        except Exception as e:
            logger.error(f"❌ Primary model loading failed: {e}")
            return False

    def load_secondary_model(self) -> bool:
        """Load the secondary FemTO-R1C model."""
        try:
            # Use the configured (GPU) settings only when there is ample GPU memory;
            # otherwise fall back to a CPU / float32 configuration.
            use_gpu = False
            if torch.cuda.is_available():
                gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
                use_gpu = gpu_memory >= 24

            if use_gpu:
                secondary_config = self.secondary_config
            else:
                secondary_config = ModelConfig(
                    model_name=self.secondary_config.model_name,
                    device="cpu",
                    torch_dtype=torch.float32,
                    trust_remote_code=self.secondary_config.trust_remote_code,
                    use_cache=self.secondary_config.use_cache,
                    low_cpu_mem_usage=self.secondary_config.low_cpu_mem_usage
                )

            self.secondary_model, self.secondary_tokenizer = self._load_model_and_tokenizer(
                secondary_config, "Secondary"
            )
            return True
        except Exception as e:
            logger.error(f"❌ Secondary model loading failed: {e}")
            return False

    def load_all_models(self) -> bool:
        """Load both primary and secondary models."""
        logger.info("🚀 Loading all HuggingFace models...")

        # The primary model is required; the secondary model is best-effort.
        if not self.load_primary_model():
            return False

        if not self.load_secondary_model():
            logger.warning("⚠️ Secondary model failed, continuing with primary only")
        else:
            logger.info("✅ All models loaded successfully")

        return True

    def generate_with_primary(self, prompt: str, **kwargs) -> str:
        """Generate text using the primary model."""
        if self.primary_model is None or self.primary_tokenizer is None:
            raise RuntimeError("Primary model not loaded")

        try:
            # Tokenize and move inputs to the model's device
            inputs = self.primary_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
            inputs = {k: v.to(self.primary_model.device) for k, v in inputs.items()}

            # Generate, allowing per-call overrides of the default settings
            with torch.no_grad():
                outputs = self.primary_model.generate(
                    **inputs,
                    max_new_tokens=kwargs.get('max_new_tokens', self.generation_settings.max_new_tokens),
                    temperature=kwargs.get('temperature', self.generation_settings.temperature),
                    top_p=kwargs.get('top_p', self.generation_settings.top_p),
                    top_k=kwargs.get('top_k', self.generation_settings.top_k),
                    repetition_penalty=kwargs.get('repetition_penalty', self.generation_settings.repetition_penalty),
                    do_sample=kwargs.get('do_sample', self.generation_settings.do_sample),
                    pad_token_id=self.primary_tokenizer.pad_token_id,
                    eos_token_id=self.primary_tokenizer.eos_token_id
                )

            # Decode only the newly generated tokens
            generated_text = self.primary_tokenizer.decode(
                outputs[0][inputs['input_ids'].shape[1]:],
                skip_special_tokens=True
            )

            return generated_text

        except Exception as e:
            logger.error(f"❌ Primary model generation failed: {e}")
            raise

    def generate_with_secondary(self, prompt: str, **kwargs) -> str:
        """Generate text using the secondary model."""
        if self.secondary_model is None or self.secondary_tokenizer is None:
            raise RuntimeError("Secondary model not loaded")

        try:
            # Tokenize and move inputs to the model's device
            inputs = self.secondary_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
            inputs = {k: v.to(self.secondary_model.device) for k, v in inputs.items()}

            # Generate, allowing per-call overrides of the default settings
            with torch.no_grad():
                outputs = self.secondary_model.generate(
                    **inputs,
                    max_new_tokens=kwargs.get('max_new_tokens', self.generation_settings.max_new_tokens),
                    temperature=kwargs.get('temperature', self.generation_settings.temperature),
                    top_p=kwargs.get('top_p', self.generation_settings.top_p),
                    top_k=kwargs.get('top_k', self.generation_settings.top_k),
                    repetition_penalty=kwargs.get('repetition_penalty', self.generation_settings.repetition_penalty),
                    do_sample=kwargs.get('do_sample', self.generation_settings.do_sample),
                    pad_token_id=self.secondary_tokenizer.pad_token_id,
                    eos_token_id=self.secondary_tokenizer.eos_token_id
                )

            # Decode only the newly generated tokens
            generated_text = self.secondary_tokenizer.decode(
                outputs[0][inputs['input_ids'].shape[1]:],
                skip_special_tokens=True
            )

            return generated_text

        except Exception as e:
            logger.error(f"❌ Secondary model generation failed: {e}")
            raise

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about loaded models."""
        info = {
            "primary_model": {
                "loaded": self.primary_model is not None,
                "name": self.primary_config.model_name,
                "parameters": self.primary_model.num_parameters() if self.primary_model else 0,
                "device": str(next(self.primary_model.parameters()).device) if self.primary_model else "Not loaded"
            },
            "secondary_model": {
                "loaded": self.secondary_model is not None,
                "name": self.secondary_config.model_name,
                "parameters": self.secondary_model.num_parameters() if self.secondary_model else 0,
                "device": str(next(self.secondary_model.parameters()).device) if self.secondary_model else "Not loaded"
            },
            "system": {
                "device": self.device,
                "cuda_available": torch.cuda.is_available(),
                "cuda_devices": torch.cuda.device_count() if torch.cuda.is_available() else 0
            }
        }

        return info

    def cleanup(self):
        """Clean up models and free memory."""
        logger.info("🧹 Cleaning up HuggingFace models...")

        # Drop references so the weights can be garbage collected
        self.primary_model = None
        self.primary_tokenizer = None
        self.secondary_model = None
        self.secondary_tokenizer = None

        self.model_cache.clear()
        self.tokenizer_cache.clear()

        # Reclaim host and GPU memory
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        logger.info("✅ Cleanup completed")


def create_model_orchestrator() -> HuggingFaceModelOrchestrator:
    """Create a configured model orchestrator with the specified models."""

    # Primary model: LFM2-8B-A1B-Dimensional-Entanglement
    primary_config = ModelConfig(
        model_name="9x25dillon/LFM2-8B-A1B-Dimensional-Entanglement",
        device="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        use_cache=True,
        low_cpu_mem_usage=True
    )

    # Secondary model: 9xdSq-LIMPS-FemTO-R1C
    secondary_config = ModelConfig(
        model_name="9x25dillon/9xdSq-LIMPS-FemTO-R1C",
        device="auto",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        use_cache=True,
        low_cpu_mem_usage=True
    )

    return HuggingFaceModelOrchestrator(primary_config, secondary_config)


def main():
    """Demo function to test the model orchestrator."""
    print("🚀 Testing HuggingFace Model Orchestrator")
    print("=" * 50)

    orchestrator = create_model_orchestrator()

    try:
        if orchestrator.load_all_models():
            print("✅ All models loaded successfully")

            # Report what was loaded
            info = orchestrator.get_model_info()
            print("\n📊 Model Information:")
            print(f"   Primary: {info['primary_model']['name']}")
            print(f"   Parameters: {info['primary_model']['parameters']:,}")
            print(f"   Device: {info['primary_model']['device']}")

            if info['secondary_model']['loaded']:
                print(f"   Secondary: {info['secondary_model']['name']}")
                print(f"   Parameters: {info['secondary_model']['parameters']:,}")
                print(f"   Device: {info['secondary_model']['device']}")

            # Smoke-test generation on both models
            test_prompt = "Explain the concept of dimensional entanglement in AI systems."
            print(f"\n🧪 Testing generation with prompt: '{test_prompt}'")

            try:
                primary_output = orchestrator.generate_with_primary(test_prompt, max_new_tokens=100)
                print(f"✅ Primary model output: {primary_output[:200]}...")
            except Exception as e:
                print(f"❌ Primary model generation failed: {e}")

            try:
                secondary_output = orchestrator.generate_with_secondary(test_prompt, max_new_tokens=100)
                print(f"✅ Secondary model output: {secondary_output[:200]}...")
            except Exception as e:
                print(f"❌ Secondary model generation failed: {e}")

        else:
            print("❌ Failed to load models")

    except Exception as e:
        print(f"❌ Error: {e}")

    finally:
        orchestrator.cleanup()
        print("\n🧹 Cleanup completed")


if __name__ == "__main__":
    main()