# TerraSyncra/app/utils/model_manager.py
"""
Lazy Model Manager for CPU Optimization
Loads models on-demand instead of at import time.
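
Typical usage (illustrative; the model identifiers below are placeholders,
not the project's configured defaults):

    from app.utils import model_manager

    tokenizer, model = model_manager.load_expert_model("org/causal-lm-name")
    embedder = model_manager.load_embedder("sentence-transformers/all-MiniLM-L6-v2")
    model_manager.clear_model_cache()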
"""
import logging

import torch

logging.basicConfig(level=logging.INFO)

# Global model cache
_models = {
    "expert_model": None,
    "expert_tokenizer": None,
    "translation_model": None,
    "translation_tokenizer": None,
    "embedder": None,
    "lang_identifier": None,
    "classifier": None,
}

_device = "cpu"  # Force CPU for HuggingFace Spaces


def get_device():
    """Always return CPU for HuggingFace Spaces."""
    return _device


def load_expert_model(model_name: str, use_quantization: bool = True):
    """
    Lazy load expert model with optional quantization.
    
    Args:
        model_name: Model identifier
        use_quantization: Use INT8 quantization for CPU (recommended)
    """
    if _models["expert_model"] is not None:
        return _models["expert_tokenizer"], _models["expert_model"]
    
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from app.utils import config
    
    logging.info(f"Loading expert model ({model_name})...")
    
    # Get cache directory from config
    cache_dir = getattr(config, 'hf_cache', '/models/huggingface')
    
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        use_fast=True,  # Use fast tokenizer
        cache_dir=cache_dir
    )
    
    # Load model with CPU optimizations
    model_kwargs = {
        "torch_dtype": torch.float32,  # Use float32 for CPU
        "device_map": "cpu",
        "low_cpu_mem_usage": True,
    }
    
    # Note: for CPU we default to float32 (most compatible).
    # BitsAndBytesConfig is GPU-only, so it is not used for CPU deployment;
    # CPU-side alternatives include PyTorch dynamic quantization (sketched
    # below), smaller models, or ONNX Runtime.
    logging.info("Loading model in float32 for CPU compatibility")
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        cache_dir=cache_dir,
        **model_kwargs
    )
    
    model.eval()  # Set to evaluation mode
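
    # Illustrative sketch (an assumption, not the original flow): dynamic INT8
    # quantization of nn.Linear layers is one CPU-side option mentioned above;
    # whether it preserves output quality for this model is untested here.
    if use_quantization:
        logging.info("Applying dynamic INT8 quantization to Linear layers (sketch)")
        model = torch.quantization.quantize_dynamic(
            model, {torch.nn.Linear}, dtype=torch.qint8
        )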
    
    _models["expert_model"] = model
    _models["expert_tokenizer"] = tokenizer
    
    logging.info("Expert model loaded successfully")
    return tokenizer, model


def load_translation_model(model_name: str):
    """Lazy load translation model."""
    if _models["translation_model"] is not None:
        return _models["translation_tokenizer"], _models["translation_model"]
    
    from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
    from app.utils import config
    
    logging.info(f"Loading translation model ({model_name})...")
    
    cache_dir = getattr(config, 'hf_cache', '/models/huggingface')
    
    tokenizer = NllbTokenizer.from_pretrained(
        model_name,
        cache_dir=cache_dir
    )
    
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # CPU uses float32
        cache_dir=cache_dir,
        device_map="cpu",
        low_cpu_mem_usage=True
    )
    
    model.eval()
    
    _models["translation_model"] = model
    _models["translation_tokenizer"] = tokenizer
    
    logging.info("Translation model loaded successfully")
    return tokenizer, model


def load_embedder(model_name: str):
    """Lazy load sentence transformer embedder."""
    if _models["embedder"] is not None:
        return _models["embedder"]
    
    from sentence_transformers import SentenceTransformer
    from app.utils import config
    
    logging.info(f"Loading embedder ({model_name})...")
    
    cache_folder = getattr(config, 'hf_cache', '/models/huggingface')
    
    embedder = SentenceTransformer(
        model_name,
        device=_device,
        cache_folder=cache_folder
    )
    
    _models["embedder"] = embedder
    
    logging.info("Embedder loaded successfully")
    return embedder


def load_lang_identifier(repo_id: str, filename: str = "model.bin"):
    """Lazy load FastText language identifier."""
    if _models["lang_identifier"] is not None:
        return _models["lang_identifier"]
    
    import fasttext
    from huggingface_hub import hf_hub_download
    from app.utils import config
    
    logging.info(f"Loading language identifier ({repo_id})...")
    
    cache_dir = getattr(config, 'hf_cache', '/models/huggingface')
    
    lang_model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        cache_dir=cache_dir
    )
    
    lang_identifier = fasttext.load_model(lang_model_path)
    
    _models["lang_identifier"] = lang_identifier
    
    logging.info("Language identifier loaded successfully")
    return lang_identifier
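

def detect_language(text: str, k: int = 1):
    """
    Illustrative helper (an assumption, not part of the original module): shows
    how the lazily loaded FastText identifier is typically queried. Assumes
    load_lang_identifier() has been called and that the model emits
    fasttext-style "__label__xx" labels.
    """
    identifier = _models["lang_identifier"]
    if identifier is None:
        raise RuntimeError("Call load_lang_identifier() before detect_language()")
    # fasttext rejects newlines in input, so collapse them first
    labels, probs = identifier.predict(text.replace("\n", " "), k=k)
    return [(label.replace("__label__", ""), float(p)) for label, p in zip(labels, probs)]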


def load_classifier(classifier_path: str):
    """Lazy load intent classifier."""
    if _models["classifier"] is not None:
        return _models["classifier"]
    
    import joblib
    from pathlib import Path
    
    logging.info(f"Loading classifier ({classifier_path})...")
    
    if not Path(classifier_path).exists():
        logging.warning(f"Classifier not found at {classifier_path}")
        return None
    
    try:
        classifier = joblib.load(classifier_path)
        _models["classifier"] = classifier
        logging.info("Classifier loaded successfully")
        return classifier
    except Exception as e:
        logging.error(f"Failed to load classifier: {e}")
        return None


def clear_model_cache():
    """Clear all loaded models from memory."""
    import gc
    # Reassign in place rather than deleting keys while iterating,
    # which would mutate the dict during iteration.
    for key in _models:
        _models[key] = None
    gc.collect()
    logging.info("Model cache cleared")


def get_model_memory_usage():
    """Get approximate memory usage of loaded models."""
    usage = {}
    if _models["expert_model"] is not None:
        # Rough estimate assuming a ~4B-parameter model in float32: 4e9 params * 4 bytes ≈ 16 GB
        usage["expert_model"] = "~16 GB"
    if _models["translation_model"] is not None:
        usage["translation_model"] = "~2-5 GB"
    if _models["embedder"] is not None:
        usage["embedder"] = "~1 GB"
    if _models["lang_identifier"] is not None:
        usage["lang_identifier"] = "~200 MB"
    return usage
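

def get_model_memory_usage_exact():
    """
    Illustrative sketch (an assumption, not part of the original API): compute
    actual parameter memory for the torch-based models that are currently
    loaded, instead of the rough estimates above.
    """
    usage_bytes = {}
    for key in ("expert_model", "translation_model"):
        model = _models[key]
        if model is not None:
            usage_bytes[key] = sum(
                p.numel() * p.element_size() for p in model.parameters()
            )
    return usage_bytes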