| """ | |
| Knowledge Universe β Shared Embedding Model Singleton | |
| ===================================================== | |
| RICK'S FIX β prevents loading all-MiniLM-L6-v2 twice in one process. | |
| Problem: | |
| LocalLLMReranker.__init__() calls SentenceTransformer("all-MiniLM-L6-v2") | |
| CoverageConfidenceScorer._get_model() also calls SentenceTransformer("all-MiniLM-L6-v2") | |
| HuggingFace Spaces free tier has 2GB RAM. | |
| The model is ~90MB on disk but ~300MB in RAM after loading. | |
| Loading it twice = 600MB just for embeddings. You're running into your limit. | |
| Even if sentence_transformers caches internally, the initialization | |
| path still incurs latency on the second call. | |
| Fix: | |
| One module-level singleton. Both classes import _get_shared_model(). | |
| Model loads exactly once per process, stays loaded forever. | |
| Pre-warm call in lifespan() ensures it's ready before the first request. | |
| Usage: | |
| from src.integrations.shared_model import get_shared_model, prewarm_model | |
| model = get_shared_model() | |
| embeddings = model.encode(texts, convert_to_tensor=True) | |
| """ | |

import logging
import threading
from typing import Optional

logger = logging.getLogger(__name__)

# Module-level singleton state. SentenceTransformer is imported lazily inside
# get_shared_model(), so the annotation uses a string forward reference.
_model: Optional["SentenceTransformer"] = None
_model_lock = threading.Lock()
_MODEL_NAME = "all-MiniLM-L6-v2"


def get_shared_model():
    """
    Returns the shared SentenceTransformer instance.

    Thread-safe. Loads once, cached forever.
    Raises on failure; callers should handle gracefully.
    """
    global _model

    if _model is not None:
        return _model

    with _model_lock:
        # Double-checked locking pattern
        if _model is None:
            logger.info(f"Loading shared embedding model: {_MODEL_NAME}")
            try:
                from sentence_transformers import SentenceTransformer
                _model = SentenceTransformer(_MODEL_NAME)
                logger.info(f"Shared model loaded: {_MODEL_NAME}")
            except Exception as e:
                logger.error(f"Failed to load shared model: {e}")
                raise
    return _model
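

# Hypothetical before/after for the two callers named in the module docstring.
# The real LocalLLMReranker.__init__() signature may differ; only the model
# acquisition line is the point here:
#
#     class LocalLLMReranker:
#         def __init__(self):
#             # Before: self.model = SentenceTransformer("all-MiniLM-L6-v2")
#             self.model = get_shared_model()  # reuses the singleton, no second load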


def prewarm_model() -> bool:
    """
    Force-initialize the model. Call this at startup, before any requests.

    Returns True if successful, False if the model is unavailable.

    Add to lifespan() in main.py:
        from src.integrations.shared_model import prewarm_model
        prewarm_model()
    """
    try:
        model = get_shared_model()
        # Encode a dummy sentence to fully initialize the model
        # (lazy components like the tokenizer warm up on first encode, not __init__).
        model.encode("knowledge universe warmup", convert_to_tensor=True)
        logger.info("Shared embedding model pre-warmed and ready")
        return True
    except Exception as e:
        logger.warning(f"Model pre-warm failed (non-fatal): {e}")
        return False
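

# Hedged sketch of the lifespan() wiring mentioned in the docstring above.
# Assumes main.py uses FastAPI's lifespan-style startup; the actual app
# setup may differ:
#
#     from contextlib import asynccontextmanager
#     from fastapi import FastAPI
#     from src.integrations.shared_model import prewarm_model
#
#     @asynccontextmanager
#     async def lifespan(app: FastAPI):
#         prewarm_model()  # blocking here is fine: we're at startup, not in a request
#         yield
#
#     app = FastAPI(lifespan=lifespan)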


def is_model_loaded() -> bool:
    """Check if the model is already loaded without triggering a load."""
    return _model is not None
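

if __name__ == "__main__":
    # Minimal smoke test, assuming sentence-transformers is installed and the
    # model weights are downloadable or already cached. Verifies the singleton
    # contract: both calls return the same object, and encode() works.
    logging.basicConfig(level=logging.INFO)
    first = get_shared_model()
    second = get_shared_model()
    assert first is second, "get_shared_model() returned two different instances"
    embedding = first.encode("knowledge universe smoke test", convert_to_tensor=True)
    print(f"loaded={is_model_loaded()} shape={tuple(embedding.shape)}")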