# llm_clients/shared_models.py
"""
Shared model manager to avoid loading the same model multiple times.
This significantly improves memory usage and startup time.
"""

from typing import Optional, Dict, Any
import threading
import os


class SharedModelManager:
    """Singleton class to manage shared model instances."""

    _instance = None
    _lock = threading.Lock()
    _models: Dict[str, Any] = {}
    _model_components: Dict[str, Dict[str, Any]] = {}  # Store actual model components

    def __new__(cls):
        # Double-checked locking so only one instance is ever created
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
        return cls._instance

    def get_finetuned_model_components(self, model_name: str = "zazaman/fmb") -> Optional[Dict[str, Any]]:
        """
        Get or load shared model components (model, tokenizer, classifier).

        Args:
            model_name: Name of the model to load

        Returns:
            Dict with 'model', 'tokenizer', 'classifier' components, or None if loading fails
        """
        model_key = f"finetuned_components_{model_name}"

        if model_key not in self._model_components:
            try:
                print(f"🔄 Loading shared finetuned model components: {model_name}")

                # Import here to avoid circular imports
                from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
                import torch

                # Set up cache directory for HF Spaces compatibility
                if not os.getenv('HF_HOME'):
                    cache_dir = os.path.expanduser("~/.cache/huggingface")
                    os.environ['HF_HOME'] = cache_dir
                    os.environ['TRANSFORMERS_CACHE'] = os.path.join(cache_dir, 'transformers')

                    # Create cache directories if they don't exist
                    os.makedirs(cache_dir, exist_ok=True)
                    os.makedirs(os.path.join(cache_dir, 'transformers'), exist_ok=True)
                    print(f"   📁 Using cache directory: {cache_dir}")

                # Apply optimizations: disable torch.compile/dynamo to avoid
                # compilation errors in constrained deployment environments
                torch._dynamo.config.suppress_errors = True
                torch._dynamo.config.disable = True
                os.environ["TORCH_COMPILE_DISABLE"] = "1"
                os.environ["TORCHDYNAMO_DISABLE"] = "1"
                os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

                print(f"   📥 Downloading model from Hugging Face: {model_name}")

                # Load model and tokenizer with an explicit cache directory
                model = AutoModelForSequenceClassification.from_pretrained(
                    model_name,
                    torch_dtype=torch.float32,
                    device_map=None,
                    cache_dir=os.environ.get('TRANSFORMERS_CACHE'),
                    local_files_only=False,   # Allow downloading
                    trust_remote_code=False,  # Security best practice
                )
                tokenizer = AutoTokenizer.from_pretrained(
                    model_name,
                    cache_dir=os.environ.get('TRANSFORMERS_CACHE'),
                    local_files_only=False,
                    trust_remote_code=False,
                )

                # Disable compilation
                if hasattr(model, '_compiler_config'):
                    model._compiler_config = None

                # Move to CPU
                device = "cpu"
                model = model.to(device)

                print("   🧠 Creating classifier pipeline...")

                # Create classifier pipeline
                classifier = pipeline(
                    "text-classification",
                    model=model,
                    tokenizer=tokenizer,
                    device=device,
                    framework="pt",
                    torch_dtype=torch.float32,
                )

                # Store components
                self._model_components[model_key] = {
                    "model": model,
                    "tokenizer": tokenizer,
                    "classifier": classifier,
                    "device": device,
                    "model_name": model_name,
                }

                print(f"✅ Shared finetuned model components loaded successfully: {model_name}")
                print(f"   Device: {device}")
                print(f"   Cache: {os.environ.get('TRANSFORMERS_CACHE', 'default')}")

            except PermissionError as e:
                print(f"❌ Permission error loading model {model_name}: {e}")
                print("   This might be a cache directory issue in the deployment environment.")
                print("   Suggestion: check HF_HOME and cache directory permissions.")
                self._model_components[model_key] = None
                return None
            except Exception as e:
                print(f"❌ Failed to load shared finetuned model components {model_name}: {e}")
                print(f"   Error type: {type(e).__name__}")
                if "connection" in str(e).lower() or "network" in str(e).lower():
                    print("   This appears to be a network issue. Check internet connectivity.")
                elif "disk" in str(e).lower() or "space" in str(e).lower():
                    print("   This appears to be a disk space issue.")
                self._model_components[model_key] = None
                return None

        return self._model_components[model_key]

    def get_finetuned_guard_client(self, model_name: str = "zazaman/fmb") -> Optional[Any]:
        """
        Get or create a shared FinetunedGuardClient instance that uses shared model components.

        Args:
            model_name: Name of the model to load

        Returns:
            FinetunedGuardClient instance, or None if loading fails
        """
        model_key = f"finetuned_guard_{model_name}"

        if model_key not in self._models:
            try:
                # Get shared model components
                components = self.get_finetuned_model_components(model_name)
                if not components:
                    return None

                from .finetuned_guard import FinetunedGuardClient

                print(f"   🔍 Creating FinetunedGuardClient with shared model components: {model_name}")
                model_config = {"model_name": model_name}

                # Create a client that will use the shared components
                client = FinetunedGuardClient(model_config, "", shared_components=components)
                self._models[model_key] = client
                print(f"✅ Shared finetuned guard client created successfully: {model_name}")
            except Exception as e:
                print(f"❌ Failed to create shared finetuned guard client {model_name}: {e}")
                self._models[model_key] = None
                return None

        return self._models[model_key]

    def clear_models(self):
        """Clear all cached models (useful for testing)."""
        self._models.clear()
        self._model_components.clear()

    def get_model_info(self) -> Dict[str, bool]:
        """Get information about which models loaded successfully."""
        return {
            model_key: model is not None
            for model_key, model in self._models.items()
        }


# Global singleton instance
shared_model_manager = SharedModelManager()