# TerraSyncra/app/utils/model_manager.py
"""
Lazy Model Manager for CPU Optimization

Loads models on-demand instead of at import time.
"""
import os
import logging
import torch
from typing import Optional
from functools import lru_cache

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global model cache: every entry stays None until its loader is first called,
# after which the loaded object is reused for the lifetime of the process.
_models = {
    "expert_model": None,
    "expert_tokenizer": None,
    "translation_model": None,
    "translation_tokenizer": None,
    "embedder": None,
    "lang_identifier": None,
    "classifier": None,
}

_device = "cpu"  # Force CPU for HuggingFace Spaces

# Fallback HF cache location when config does not define `hf_cache`.
_DEFAULT_HF_CACHE = '/models/huggingface'


def _get_cache_dir():
    """Return the HuggingFace cache directory from config, with a default fallback."""
    from app.utils import config
    return getattr(config, 'hf_cache', _DEFAULT_HF_CACHE)


def get_device():
    """Always return CPU for HuggingFace Spaces."""
    return _device


def load_expert_model(model_name: str, use_quantization: bool = True):
    """
    Lazy load expert model with optional quantization.

    Args:
        model_name: Model identifier
        use_quantization: Accepted for API compatibility only.
            BitsAndBytesConfig is GPU-only, so quantization is currently
            ignored on CPU; float32 is used instead.

    Returns:
        (tokenizer, model) tuple; cached after the first call.
    """
    if _models["expert_model"] is not None:
        return _models["expert_tokenizer"], _models["expert_model"]

    from transformers import AutoTokenizer, AutoModelForCausalLM

    logger.info("Loading expert model (%s)...", model_name)

    # Get cache directory from config (computed once; the original
    # duplicated this lookup).
    cache_dir = _get_cache_dir()

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=True,  # Use fast tokenizer
        cache_dir=cache_dir,
    )

    # Load model with CPU optimizations.
    # Note: float32 is the most compatible dtype on CPU. For quantization
    # on CPU, consider smaller models or ONNX runtime instead.
    model_kwargs = {
        "torch_dtype": torch.float32,
        "device_map": "cpu",
        "low_cpu_mem_usage": True,
    }

    logger.info("Loading model in float32 for CPU compatibility")

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir=cache_dir,
        **model_kwargs,
    )
    model.eval()  # Set to evaluation mode

    _models["expert_model"] = model
    _models["expert_tokenizer"] = tokenizer

    logger.info("Expert model loaded successfully")
    return tokenizer, model


def load_translation_model(model_name: str):
    """
    Lazy load translation model (NLLB seq2seq).

    Args:
        model_name: Model identifier

    Returns:
        (tokenizer, model) tuple; cached after the first call.
    """
    if _models["translation_model"] is not None:
        return _models["translation_tokenizer"], _models["translation_model"]

    from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

    logger.info("Loading translation model (%s)...", model_name)

    cache_dir = _get_cache_dir()

    tokenizer = NllbTokenizer.from_pretrained(
        model_name,
        cache_dir=cache_dir,
    )

    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # CPU uses float32
        cache_dir=cache_dir,
        device_map="cpu",
        low_cpu_mem_usage=True,
    )
    model.eval()

    _models["translation_model"] = model
    _models["translation_tokenizer"] = tokenizer

    logger.info("Translation model loaded successfully")
    return tokenizer, model


def load_embedder(model_name: str):
    """
    Lazy load sentence transformer embedder.

    Args:
        model_name: SentenceTransformer model identifier

    Returns:
        The loaded SentenceTransformer; cached after the first call.
    """
    if _models["embedder"] is not None:
        return _models["embedder"]

    from sentence_transformers import SentenceTransformer

    logger.info("Loading embedder (%s)...", model_name)

    embedder = SentenceTransformer(
        model_name,
        device=_device,
        cache_folder=_get_cache_dir(),
    )

    _models["embedder"] = embedder
    logger.info("Embedder loaded successfully")
    return embedder


def load_lang_identifier(repo_id: str, filename: str = "model.bin"):
    """
    Lazy load FastText language identifier from the HuggingFace Hub.

    Args:
        repo_id: Hub repository id hosting the FastText model
        filename: File name inside the repo (default "model.bin")

    Returns:
        The loaded fasttext model; cached after the first call.
    """
    if _models["lang_identifier"] is not None:
        return _models["lang_identifier"]

    import fasttext
    from huggingface_hub import hf_hub_download

    logger.info("Loading language identifier (%s)...", repo_id)

    lang_model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        cache_dir=_get_cache_dir(),
    )
    lang_identifier = fasttext.load_model(lang_model_path)

    _models["lang_identifier"] = lang_identifier
    logger.info("Language identifier loaded successfully")
    return lang_identifier


def load_classifier(classifier_path: str):
    """
    Lazy load intent classifier from a local joblib file.

    Args:
        classifier_path: Filesystem path to the serialized classifier

    Returns:
        The loaded classifier, or None if the file is missing or unloadable
        (best-effort by design; callers handle a None classifier).
    """
    if _models["classifier"] is not None:
        return _models["classifier"]

    from pathlib import Path

    logger.info("Loading classifier (%s)...", classifier_path)

    # Check existence BEFORE importing joblib: a missing file should hit the
    # cheap warning path without paying (or failing on) the joblib import.
    if not Path(classifier_path).exists():
        logger.warning("Classifier not found at %s", classifier_path)
        return None

    import joblib

    try:
        classifier = joblib.load(classifier_path)
    except Exception as e:
        logger.error("Failed to load classifier: %s", e)
        return None

    _models["classifier"] = classifier
    logger.info("Classifier loaded successfully")
    return classifier


def clear_model_cache():
    """Clear all loaded models from memory and force garbage collection."""
    import gc

    # Dropping the reference is enough for gc to reclaim the model; the
    # dict itself is mutated in place (no `global` rebinding needed).
    for key in _models:
        _models[key] = None

    gc.collect()
    logger.info("Model cache cleared")


def get_model_memory_usage():
    """
    Get approximate memory usage of loaded models.

    Returns:
        dict mapping loaded-model name to a rough human-readable estimate;
        empty dict when nothing is loaded.
    """
    usage = {}
    if _models["expert_model"] is not None:
        # Rough estimate: 4B params * 4 bytes = 16 GB
        usage["expert_model"] = "~16 GB"
    if _models["translation_model"] is not None:
        usage["translation_model"] = "~2-5 GB"
    if _models["embedder"] is not None:
        usage["embedder"] = "~1 GB"
    if _models["lang_identifier"] is not None:
        usage["lang_identifier"] = "~200 MB"
    return usage