import logging
import os
from functools import lru_cache
from typing import Optional

import torch

logging.basicConfig(level=logging.INFO)

# Process-wide registry of lazily loaded models. Every slot starts as None and
# is filled by the matching load_* function on its first successful call.
_models = {
    "expert_model": None,
    "expert_tokenizer": None,
    "multimodal_model": None,
    "multimodal_processor": None,
    "translation_model": None,
    "translation_tokenizer": None,
    "embedder": None,
    "lang_identifier": None,
    "classifier": None,
}

# All loaders in this module target the CPU.
_device = "cpu"

# Cache directory used when ``app.utils.config`` has no ``hf_cache`` attribute.
_DEFAULT_HF_CACHE = "/models/huggingface"

# (registry key, rough footprint) pairs reported by get_model_memory_usage().
# Expert model estimate: 4B params * 4 bytes = 16 GB.
_MEMORY_ESTIMATES = (
    ("expert_model", "~16 GB"),
    ("translation_model", "~2-5 GB"),
    ("embedder", "~1 GB"),
    ("lang_identifier", "~200 MB"),
)


def _hf_cache_dir():
    """Return ``config.hf_cache``, or the default cache path if it is unset."""
    # Function-local import, matching how this module defers all of its
    # non-stdlib imports until a loader is actually called.
    from app.utils import config

    return getattr(config, "hf_cache", _DEFAULT_HF_CACHE)


def get_device() -> str:
    """Return the device string the loaders use (always ``"cpu"`` here)."""
    return _device


def load_expert_model(model_name: str, use_quantization: bool = True):
    """Lazy load the causal-LM expert model and its tokenizer.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        use_quantization: Accepted for backward compatibility; currently
            ignored -- the model is always loaded in float32 on the CPU.

    Returns:
        A ``(tokenizer, model)`` tuple. Repeated calls return the cached pair
        regardless of ``model_name``.
    """
    if _models["expert_model"] is not None:
        return _models["expert_tokenizer"], _models["expert_model"]

    from transformers import AutoModelForCausalLM, AutoTokenizer

    cache_dir = _hf_cache_dir()
    logging.info("Loading expert model (%s)...", model_name)

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=True,
        cache_dir=cache_dir,
    )

    logging.info("Loading model in float32 for CPU compatibility")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir=cache_dir,
        torch_dtype=torch.float32,
        device_map="cpu",
        low_cpu_mem_usage=True,
    )
    model.eval()

    _models["expert_model"] = model
    _models["expert_tokenizer"] = tokenizer
    logging.info("Expert model loaded successfully")
    return tokenizer, model


def load_multimodal_model(model_name: str):
    """Lazy load multimodal Qwen2-VL model (vision-language).

    Used for photo/video-aware advisory.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(processor, model)`` tuple, or ``(None, None)`` when loading
        fails; the failure is logged and the registry slots stay empty so a
        later call retries the load.
    """
    if _models["multimodal_model"] is not None:
        return _models["multimodal_processor"], _models["multimodal_model"]

    # With latest transformers + qwen-vl-utils, Qwen2VLForConditionalGeneration
    # and AutoProcessor support full image/video chat as in official docs.
    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

    cache_dir = _hf_cache_dir()
    logging.info("Loading multimodal expert model (%s)...", model_name)

    try:
        processor = AutoProcessor.from_pretrained(
            model_name,
            cache_dir=cache_dir,
        )
        model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float32,  # CPU deployment
            cache_dir=cache_dir,
            device_map="cpu",
            low_cpu_mem_usage=True,
        )
        model.eval()
    except Exception as e:
        # Deliberate best-effort: callers treat (None, None) as "no
        # multimodal support" instead of crashing.
        logging.error(
            "Failed to load multimodal model %s: %s. "
            "Falling back to text-only expert model.",
            model_name,
            e,
        )
        _models["multimodal_model"] = None
        _models["multimodal_processor"] = None
        return None, None

    _models["multimodal_model"] = model
    _models["multimodal_processor"] = processor
    logging.info("Multimodal expert model loaded successfully")
    return processor, model


def load_translation_model(model_name: str):
    """Lazy load translation model.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple; cached after the first call.
    """
    if _models["translation_model"] is not None:
        return _models["translation_tokenizer"], _models["translation_model"]

    from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

    cache_dir = _hf_cache_dir()
    logging.info("Loading translation model (%s)...", model_name)

    tokenizer = NllbTokenizer.from_pretrained(
        model_name,
        cache_dir=cache_dir,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # CPU uses float32
        cache_dir=cache_dir,
        device_map="cpu",
        low_cpu_mem_usage=True,
    )
    model.eval()

    _models["translation_model"] = model
    _models["translation_tokenizer"] = tokenizer
    logging.info("Translation model loaded successfully")
    return tokenizer, model


def load_embedder(model_name: str):
    """Lazy load sentence transformer embedder.

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.

    Returns:
        The ``SentenceTransformer`` instance; cached after the first call.
    """
    if _models["embedder"] is not None:
        return _models["embedder"]

    from sentence_transformers import SentenceTransformer

    cache_folder = _hf_cache_dir()
    logging.info("Loading embedder (%s)...", model_name)

    embedder = SentenceTransformer(
        model_name,
        device=_device,
        cache_folder=cache_folder,
    )

    _models["embedder"] = embedder
    logging.info("Embedder loaded successfully")
    return embedder


def load_lang_identifier(repo_id: str, filename: str = "model.bin"):
    """Lazy load FastText language identifier.

    Args:
        repo_id: Hugging Face Hub repository holding the model file.
        filename: Name of the model file inside that repository.

    Returns:
        The model returned by ``fasttext.load_model``; cached after the
        first call.
    """
    if _models["lang_identifier"] is not None:
        return _models["lang_identifier"]

    import fasttext
    from huggingface_hub import hf_hub_download

    cache_dir = _hf_cache_dir()
    logging.info("Loading language identifier (%s)...", repo_id)

    lang_model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        cache_dir=cache_dir,
    )
    lang_identifier = fasttext.load_model(lang_model_path)

    _models["lang_identifier"] = lang_identifier
    logging.info("Language identifier loaded successfully")
    return lang_identifier


def load_classifier(classifier_path: str):
    """Lazy load intent classifier.

    Args:
        classifier_path: Filesystem path of the joblib-serialized classifier.

    Returns:
        The loaded classifier, or ``None`` when the file does not exist or
        ``joblib.load`` fails (both cases are logged, nothing is cached).
    """
    if _models["classifier"] is not None:
        return _models["classifier"]

    from pathlib import Path

    logging.info("Loading classifier (%s)...", classifier_path)

    if not Path(classifier_path).exists():
        logging.warning("Classifier not found at %s", classifier_path)
        return None

    # Imported only once there is a file to read, so a missing classifier
    # does not require joblib to be installed.
    import joblib

    try:
        # NOTE(review): joblib.load unpickles the file; only point this at
        # trusted classifier artifacts.
        classifier = joblib.load(classifier_path)
    except Exception as e:
        logging.error("Failed to load classifier: %s", e)
        return None

    _models["classifier"] = classifier
    logging.info("Classifier loaded successfully")
    return classifier


def clear_model_cache() -> None:
    """Clear all loaded models from memory.

    Every registry slot is reset to ``None`` and a garbage collection pass is
    requested so the released objects can be reclaimed.
    """
    import gc

    # Overwrite values in place. Deleting and re-inserting keys while
    # iterating the dict can raise "RuntimeError: dictionary keys changed
    # during iteration" and also reorders the registry.
    for key in _models:
        _models[key] = None

    gc.collect()
    logging.info("Model cache cleared")


def get_model_memory_usage() -> dict:
    """Get approximate memory usage of loaded models.

    Returns:
        A dict mapping the registry key of each currently loaded model to a
        rough, hard-coded size string. The multimodal model and the
        classifier are not reported.
    """
    return {
        key: estimate
        for key, estimate in _MEMORY_ESTIMATES
        if _models[key] is not None
    }