"""
BGE-M3 embedding model singleton — Phase 3.

Responsibilities:
  - Load BAAI/bge-m3 once (lazily on first call or eagerly via get_model())
  - encode_query(text) → (dense: np.ndarray[1024], sparse: dict[int, float])
  - LRU cache on query text to avoid re-encoding repeats
  - CPU float32, no GPU dependency
  - Thread-safe (model is read-only after load)
"""
from __future__ import annotations

import threading
from functools import lru_cache

import numpy as np

from app import config

# ── Module-level singleton ────────────────────────────────────────────────────

# Process-wide model instance; stays None until get_model() performs the first load.
_model = None
# Held by get_model() while constructing the model so that concurrent first
# callers do not each build their own copy.
_model_lock = threading.Lock()


def get_model():
    """
    Hand back the shared BGE-M3 model, building it on the first call.

    The model is constructed at most once per process: callers that find it
    already loaded return immediately, while first-time callers queue on
    ``_model_lock`` and only one of them performs the load.

    main.py's lifespan hook invokes this eagerly so that the ~15 s
    model-download cost is not charged to the first request.

    Returns:
        The module-level ``BGEM3FlagModel`` instance.
    """
    global _model

    if _model is None:
        with _model_lock:
            # Another thread may have finished loading while we were waiting
            # for the lock, so look again before doing the expensive work.
            if _model is None:
                # Imported here so merely importing this module stays cheap.
                from FlagEmbedding import BGEM3FlagModel

                print(f"[embed_svc] Loading {config.BGE_M3_MODEL} on {config.BGE_M3_DEVICE}...")

                device = config.BGE_M3_DEVICE
                # Half precision needs CUDA, so it is enabled for every
                # device except plain "cpu".
                half_precision = device != "cpu"
                _model = BGEM3FlagModel(
                    config.BGE_M3_MODEL,
                    use_fp16=half_precision,
                    device=device,
                )
                print("[embed_svc] Model loaded successfully")

    return _model


# ── Cached query encoding ────────────────────────────────────────────────────

@lru_cache(maxsize=config.ENCODE_CACHE_SIZE)
def _encode_cached(text: str) -> tuple:
    """
    Encode a single query string.  Returns (dense_vec, sparse_dict).

    The LRU cache key is the raw text string (the only argument), so
    repeated queries skip BGE-M3 inference entirely.

    Note that ``lru_cache`` stores and returns the *same* tuple object on
    every hit: the array and dict inside it are shared between all callers
    of this function and must be treated as read-only.

    Args:
        text: Non-empty query text; used verbatim as the cache key.

    Returns:
        (dense_vec, sparse_dict) where dense_vec is a float32 np.ndarray and
        sparse_dict maps int token ids to plain Python floats.
    """
    model = get_model()
    out = model.encode(
        [text],
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=False,
        max_length=512,
    )

    # Always coerce to a float32 ndarray.  Checking only "is it an ndarray?"
    # would let a float16 array through when the model runs with
    # use_fp16=True, breaking the documented float32 contract.  asarray is a
    # no-op (no copy) when the input is already a float32 ndarray.
    dense = np.asarray(out["dense_vecs"][0], dtype=np.float32)

    # Normalise keys/values to plain int/float (the model may hand back
    # string keys or numpy/tensor scalars).
    sparse_clean = {
        int(token_id): float(weight)
        for token_id, weight in out["lexical_weights"][0].items()
    }

    return (dense, sparse_clean)


def encode_query(text: str) -> tuple[np.ndarray, dict[int, float]]:
    """
    Encode a query string into dense + sparse representations.

    Leading/trailing whitespace is stripped before encoding, so queries that
    differ only in surrounding whitespace share one cache entry.  A query
    that is empty after stripping is never sent to the model.

    Args:
        text: User's search query (raw or rewritten).

    Returns:
        (dense_vec, sparse_dict) where:
          dense_vec:   np.ndarray of shape (1024,), float32
          sparse_dict: {int_token_id: float_weight}

        Both objects are fresh copies owned by the caller; mutating them
        does not affect results returned for later identical queries.
    """
    text = text.strip()
    if not text:
        return np.zeros(1024, dtype=np.float32), {}

    dense, sparse = _encode_cached(text)
    # _encode_cached is memoised with lru_cache, which returns the very same
    # array/dict objects on every hit.  Copy them so an in-place edit by one
    # caller (e.g. normalising the vector) cannot corrupt the cached entry.
    return dense.copy(), dict(sparse)