""" Knowledge Universe — Shared Embedding Model Singleton ===================================================== RICK'S FIX — prevents loading all-MiniLM-L6-v2 twice in one process. Problem: LocalLLMReranker.__init__() calls SentenceTransformer("all-MiniLM-L6-v2") CoverageConfidenceScorer._get_model() also calls SentenceTransformer("all-MiniLM-L6-v2") HuggingFace Spaces free tier has 2GB RAM. The model is ~90MB on disk but ~300MB in RAM after loading. Loading it twice = 600MB just for embeddings. You're running into your limit. Even if sentence_transformers caches internally, the initialization path still incurs latency on the second call. Fix: One module-level singleton. Both classes import _get_shared_model(). Model loads exactly once per process, stays loaded forever. Pre-warm call in lifespan() ensures it's ready before the first request. Usage: from src.integrations.shared_model import get_shared_model, prewarm_model model = get_shared_model() embeddings = model.encode(texts, convert_to_tensor=True) """ import logging import threading from typing import Optional logger = logging.getLogger(__name__) _model = None _model_lock = threading.Lock() _MODEL_NAME = "all-MiniLM-L6-v2" def get_shared_model(): """ Returns the shared SentenceTransformer instance. Thread-safe. Loads once, cached forever. Raises on failure — callers should handle gracefully. """ global _model if _model is not None: return _model with _model_lock: # Double-checked locking pattern if _model is None: logger.info(f"Loading shared embedding model: {_MODEL_NAME}") try: from sentence_transformers import SentenceTransformer _model = SentenceTransformer(_MODEL_NAME) logger.info(f"Shared model loaded: {_MODEL_NAME}") except Exception as e: logger.error(f"Failed to load shared model: {e}") raise return _model def prewarm_model() -> bool: """ Force-initialize the model. Call this at startup before any requests. Returns True if successful, False if model unavailable. Add to lifespan() in main.py: from src.integrations.shared_model import prewarm_model prewarm_model() """ try: model = get_shared_model() # Encode a dummy sentence to fully initialize the model # (lazy components like tokenizer warm up on first encode, not __init__) model.encode("knowledge universe warmup", convert_to_tensor=True) logger.info("Shared embedding model pre-warmed and ready") return True except Exception as e: logger.warning(f"Model pre-warm failed (non-fatal): {e}") return False def is_model_loaded() -> bool: """Check if model is already loaded without triggering a load.""" return _model is not None