# TerraSyncra/app/utils/model_manager.py
"""
Lazy Model Manager for CPU Optimization
Loads models on-demand instead of at import time.
"""
import logging

import torch

logging.basicConfig(level=logging.INFO)
# Global model cache
_models = {
    "expert_model": None,
    "expert_tokenizer": None,
    "translation_model": None,
    "translation_tokenizer": None,
    "embedder": None,
    "lang_identifier": None,
    "classifier": None,
}

_device = "cpu"  # Force CPU for HuggingFace Spaces


def get_device():
    """Always return CPU for HuggingFace Spaces."""
    return _device
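
# Usage sketch of the lazy-loading pattern (illustrative only: the model name and the
# request handler below are hypothetical, not taken from this repo):
#
#     from app.utils import model_manager
#
#     def handle_request(question: str):
#         # Nothing heavy is loaded at import time; the first call pays the load cost,
#         # later calls reuse the cached instances in _models.
#         tokenizer, model = model_manager.load_expert_model("your-org/your-expert-model")
#         ...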

def load_expert_model(model_name: str, use_quantization: bool = True):
    """
    Lazy load expert model with optional quantization.

    Args:
        model_name: Model identifier
        use_quantization: Requested INT8 quantization. Currently ignored on CPU,
            since BitsAndBytesConfig is GPU-only (see the note below).
    """
    if _models["expert_model"] is not None:
        return _models["expert_tokenizer"], _models["expert_model"]

    from transformers import AutoTokenizer, AutoModelForCausalLM
    from app.utils import config

    logging.info(f"Loading expert model ({model_name})...")

    # Get cache directory from config
    cache_dir = getattr(config, 'hf_cache', '/models/huggingface')

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=True,  # Use fast tokenizer
        cache_dir=cache_dir
    )

    # Load model with CPU optimizations
    model_kwargs = {
        "torch_dtype": torch.float32,  # Use float32 for CPU
        "device_map": "cpu",
        "low_cpu_mem_usage": True,
    }

    # Note: For CPU, we use float32 (most compatible).
    # For quantization on CPU, consider using smaller models or ONNX Runtime.
    # BitsAndBytesConfig is GPU-only, so we skip it for CPU deployment.
    logging.info("Loading model in float32 for CPU compatibility")

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir=cache_dir,
        **model_kwargs
    )
    model.eval()  # Set to evaluation mode
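
    # Sketch (not enabled): one way to actually honor use_quantization on CPU would be
    # PyTorch dynamic quantization of the Linear layers. This is an assumption about a
    # possible approach, not something this module currently does, and it does not suit
    # every architecture:
    #
    #     if use_quantization:
    #         model = torch.ao.quantization.quantize_dynamic(
    #             model, {torch.nn.Linear}, dtype=torch.qint8
    #         )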
| _models["expert_model"] = model | |
| _models["expert_tokenizer"] = tokenizer | |
| logging.info("Expert model loaded successfully") | |
| return tokenizer, model | |

def load_translation_model(model_name: str):
    """Lazy load translation model."""
    if _models["translation_model"] is not None:
        return _models["translation_tokenizer"], _models["translation_model"]

    from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
    from app.utils import config

    logging.info(f"Loading translation model ({model_name})...")
    cache_dir = getattr(config, 'hf_cache', '/models/huggingface')

    tokenizer = NllbTokenizer.from_pretrained(
        model_name,
        cache_dir=cache_dir
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # CPU uses float32
        cache_dir=cache_dir,
        device_map="cpu",
        low_cpu_mem_usage=True
    )
    model.eval()

    _models["translation_model"] = model
    _models["translation_tokenizer"] = tokenizer
    logging.info("Translation model loaded successfully")
    return tokenizer, model
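
# Usage sketch for the NLLB-style translation model (illustrative; the model name,
# language codes, and max_length below are assumptions, not values from this repo):
#
#     tokenizer, model = load_translation_model("facebook/nllb-200-distilled-600M")
#     tokenizer.src_lang = "eng_Latn"
#     inputs = tokenizer("Hello, world!", return_tensors="pt")
#     out = model.generate(
#         **inputs,
#         forced_bos_token_id=tokenizer.convert_tokens_to_ids("fra_Latn"),
#         max_length=64,
#     )
#     print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])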

def load_embedder(model_name: str):
    """Lazy load sentence transformer embedder."""
    if _models["embedder"] is not None:
        return _models["embedder"]

    from sentence_transformers import SentenceTransformer
    from app.utils import config

    logging.info(f"Loading embedder ({model_name})...")
    cache_folder = getattr(config, 'hf_cache', '/models/huggingface')

    embedder = SentenceTransformer(
        model_name,
        device=_device,
        cache_folder=cache_folder
    )
    _models["embedder"] = embedder
    logging.info("Embedder loaded successfully")
    return embedder
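
# Usage sketch (the model name is illustrative; scoring by normalized dot product is
# an example, not necessarily how this app compares embeddings):
#
#     embedder = load_embedder("sentence-transformers/all-MiniLM-L6-v2")
#     vecs = embedder.encode(["first text", "second text"], normalize_embeddings=True)
#     score = float(vecs[0] @ vecs[1])  # cosine similarity, since vectors are normalized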

def load_lang_identifier(repo_id: str, filename: str = "model.bin"):
    """Lazy load FastText language identifier."""
    if _models["lang_identifier"] is not None:
        return _models["lang_identifier"]

    import fasttext
    from huggingface_hub import hf_hub_download
    from app.utils import config

    logging.info(f"Loading language identifier ({repo_id})...")
    cache_dir = getattr(config, 'hf_cache', '/models/huggingface')

    lang_model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        cache_dir=cache_dir
    )
    lang_identifier = fasttext.load_model(lang_model_path)
    _models["lang_identifier"] = lang_identifier
    logging.info("Language identifier loaded successfully")
    return lang_identifier
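
# Usage sketch (the repo id is an example; the label format depends on the model,
# e.g. "__label__eng_Latn" for the HF language-identification FastText models):
#
#     lid = load_lang_identifier("facebook/fasttext-language-identification")
#     labels, probs = lid.predict("Bonjour tout le monde", k=1)
#     lang = labels[0].replace("__label__", "")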

def load_classifier(classifier_path: str):
    """Lazy load intent classifier."""
    if _models["classifier"] is not None:
        return _models["classifier"]

    import joblib
    from pathlib import Path

    logging.info(f"Loading classifier ({classifier_path})...")
    if not Path(classifier_path).exists():
        logging.warning(f"Classifier not found at {classifier_path}")
        return None

    try:
        classifier = joblib.load(classifier_path)
        _models["classifier"] = classifier
        logging.info("Classifier loaded successfully")
        return classifier
    except Exception as e:
        logging.error(f"Failed to load classifier: {e}")
        return None

def clear_model_cache():
    """Clear all loaded models from memory."""
    # Reassign instead of deleting keys mid-iteration; dropping the references
    # is what allows the models to be garbage collected.
    for key in _models:
        _models[key] = None
    import gc
    gc.collect()
    logging.info("Model cache cleared")

def get_model_memory_usage():
    """Get approximate memory usage of loaded models."""
    usage = {}
    if _models["expert_model"] is not None:
        # Rough estimate: 4B params * 4 bytes (float32) = ~16 GB
        usage["expert_model"] = "~16 GB"
    if _models["translation_model"] is not None:
        usage["translation_model"] = "~2-5 GB"
    if _models["embedder"] is not None:
        usage["embedder"] = "~1 GB"
    if _models["lang_identifier"] is not None:
        usage["lang_identifier"] = "~200 MB"
    return usage
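
# If exact numbers are needed instead of the rough strings above, torch modules can be
# measured directly (a sketch, assuming the loaded object is a torch nn.Module; the
# sentence-transformers and fasttext objects would need their own handling):
#
#     def _module_size_gb(module: torch.nn.Module) -> float:
#         n_bytes = sum(p.numel() * p.element_size() for p in module.parameters())
#         n_bytes += sum(b.numel() * b.element_size() for b in module.buffers())
#         return n_bytes / 1024 ** 3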