"""Lightweight indoor/outdoor classifier for routing PaGeR's twin scale heads.

Wraps CLIP ViT-B/32 (OpenAI weights via ``open_clip``) behind a tiny
``classify(cubemap) -> ("indoor"|"outdoor", p_outdoor)`` API. The classifier
reuses the 4 equatorial faces of the cubemap that PaGeR's depth pipeline
already computes from the input panorama (Front, Right, Back, Left at indices
0-3 of the standard 'stack' face order), so it adds no extra projection cost.

Each of the 4 faces is resized to 224×224, normalised with CLIP's image mean
/ std, and run through CLIP's image encoder. Indoor / outdoor are represented
as L2-normalised mean text-embedding centroids over a small fixed prompt
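ensemble; for each face the two cosine similarities (indoor, outdoor) are
multiplied by CLIP's 100× logit scale and softmax'd, and the resulting
per-face outdoor probabilities are averaged across the 4 faces to give
``P(outdoor)``.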

The default prompt set ('rich') was validated across the three evaluation
datasets shipped with PaGeR (Stanford2D3DS, Matterport3D360, ZuriPano) and
gives universally good routing without per-dataset tuning.
"""

from __future__ import annotations

from typing import Sequence, Union

import open_clip
import torch
import torch.nn.functional as F


# Rich prompt ensemble — verified universal across PaGeR's eval datasets.
# Default value of ``IndoorOutdoorClassifier``'s ``indoor_prompts`` argument:
# the CLIP text embeddings of these prompts are averaged into the "indoor"
# class centroid.
DEFAULT_INDOOR_PROMPTS: tuple[str, ...] = (
    "an indoor scene",
    "the interior of a building",
    "a room inside a building",
    "a hallway",
    "an office room",
    "a bedroom",
    "a living room",
    "a kitchen",
    "an indoor space with walls and a ceiling",
)
# Default value of ``IndoorOutdoorClassifier``'s ``outdoor_prompts`` argument:
# the counterpart ensemble that is averaged into the "outdoor" class centroid.
DEFAULT_OUTDOOR_PROMPTS: tuple[str, ...] = (
    "an outdoor scene",
    "the outdoors",
    "outside in nature",
    "a street view",
    "a city street",
    "an urban panorama",
    "a park",
    "a landscape",
    "a residential neighborhood",
)

# CLIP image-encoder normalisation (different from ImageNet).
# Per-channel (R, G, B) mean and standard deviation. They are reshaped to
# (1, 3, 1, 1) tensors in ``register_clip_norm`` and applied to the resized
# [0, 1] cubemap faces in ``p_outdoor`` as ``(x - mean) / std``.
_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)


class IndoorOutdoorClassifier:
    """CLIP ViT-B/32 indoor/outdoor scene classifier operating on cubemap faces.

    Parameters
    ----------
    device:
        Torch device the CLIP model lives on. Inputs are moved to this device
        inside :meth:`p_outdoor` (which :meth:`classify` calls).
    indoor_prompts, outdoor_prompts:
        Text prompts used to build the two class centroids. Each centroid is
        the L2-normalised mean of its prompts' L2-normalised CLIP embeddings.
        A bare string is treated as one prompt (not as a sequence of
        characters). Neither collection may be empty.
    model_name, pretrained:
        Passed to :func:`open_clip.create_model_and_transforms`. The defaults
        (``"ViT-B-32"`` / ``"openai"``) give a 151M-param image+text model
        with the image encoder running at ~10 ms / sample on a recent GPU.

    Raises
    ------
    ValueError
        If ``indoor_prompts`` or ``outdoor_prompts`` contains no prompts.

    Notes
    -----
    The image preprocessing in :meth:`p_outdoor` (224×224 resize,
    ``_CLIP_MEAN`` / ``_CLIP_STD`` normalisation) and the 100× logit scale
    are fixed constants; they are not derived from ``model_name``.
    """

    def __init__(
        self,
        device: Union[str, torch.device] = "cuda",
        indoor_prompts: Sequence[str] = DEFAULT_INDOOR_PROMPTS,
        outdoor_prompts: Sequence[str] = DEFAULT_OUTDOOR_PROMPTS,
        model_name: str = "ViT-B-32",
        pretrained: str = "openai",
    ) -> None:
        # Validate the prompts first: it is cheap, and it turns an empty
        # prompt set into an immediate error here instead of a NaN centroid
        # (mean over zero embeddings) on the first ``classify`` call, which
        # would make every comparison against the threshold False and hence
        # silently label everything "indoor".
        self._indoor_prompts = self._as_prompt_tuple(indoor_prompts, "indoor_prompts")
        self._outdoor_prompts = self._as_prompt_tuple(outdoor_prompts, "outdoor_prompts")

        self.device = device if isinstance(device, torch.device) else torch.device(device)

        # ``force_quick_gelu`` matches the activation that OpenAI's CLIP was
        # trained with; without it open_clip emits a warning and silently
        # falls back to standard GELU, which gives slightly different logits.
        model, _, _ = open_clip.create_model_and_transforms(
            model_name, pretrained=pretrained, device=self.device,
            force_quick_gelu=(pretrained == "openai"),
        )
        self.model = model.eval()
        self.tokenizer = open_clip.get_tokenizer(model_name)

        # The text centroids are built lazily on first ``classify`` call -- see
        # ``_ensure_centroids``. Building them here would run a CLIP text-encoder
        # forward pass at construction time, which breaks on HF ZeroGPU: a GPU
        # is only attached inside a ``@spaces.GPU`` window, so a forward run at
        # import/startup has no device. Loading the weights (above) is fine;
        # only the forward must wait for a GPU window.
        self.text_indoor: torch.Tensor | None = None    # (1, D), filled lazily
        self.text_outdoor: torch.Tensor | None = None   # (1, D), filled lazily

        # Image-side normalisation buffers (kept on device for fast inference).
        self.register_clip_norm(self.device)

    @staticmethod
    def _as_prompt_tuple(prompts: Union[str, Sequence[str]], name: str) -> tuple[str, ...]:
        """Normalise a prompt argument to a non-empty tuple of strings.

        A bare ``str`` is wrapped as a single prompt; ``tuple("a room")``
        would otherwise explode it into one prompt per character.

        Raises
        ------
        ValueError
            If the resulting collection is empty.
        """
        if isinstance(prompts, str):
            prompts = (prompts,)
        as_tuple = tuple(prompts)
        if not as_tuple:
            raise ValueError(f"{name} must contain at least one prompt")
        return as_tuple

    @torch.inference_mode()
    def _ensure_centroids(self) -> None:
        """Build the indoor/outdoor text centroids on first use (idempotent).

        Each centroid is the L2-normalised mean of its prompts' L2-normalised
        CLIP text embeddings. Deferred out of ``__init__`` so the text-encoder
        forward pass runs inside the caller's GPU window rather than at
        construction time."""
        if self.text_indoor is not None:
            return
        centroids = {}
        for key, prompts in (("indoor", self._indoor_prompts),
                             ("outdoor", self._outdoor_prompts)):
            toks = self.tokenizer(list(prompts)).to(self.device)
            feats = F.normalize(self.model.encode_text(toks), dim=-1)
            centroids[key] = F.normalize(feats.mean(dim=0, keepdim=True), dim=-1)
        # Assign both only after both were computed, so a failure part-way
        # through leaves the instance in the "not built yet" state.
        self.text_indoor = centroids["indoor"]    # (1, D)
        self.text_outdoor = centroids["outdoor"]  # (1, D)

    def register_clip_norm(self, device: torch.device) -> None:
        """Create the ``(1, 3, 1, 1)`` float32 mean / std buffers on ``device``."""
        self._clip_mean = torch.tensor(
            _CLIP_MEAN, dtype=torch.float32, device=device).view(1, 3, 1, 1)
        self._clip_std = torch.tensor(
            _CLIP_STD, dtype=torch.float32, device=device).view(1, 3, 1, 1)

    @torch.inference_mode()
    def p_outdoor(self, cubemap_01: torch.Tensor) -> float:
        """Return P(outdoor) for one panorama.

        Parameters
        ----------
        cubemap_01:
            Cubemap tensor of shape ``(6, 3, F, F)`` or ``(1, 6, 3, F, F)``
            with raw RGB in ``[0, 1]`` (no normalisation). Only faces 0-3
            (Front, Right, Back, Left from the standard 'stack' order) are
            used; the top / bottom faces are ignored. For a 5-D input only
            the first element along the leading axis is used. Any dtype is
            accepted; values are cast to float32 before processing.

        Returns
        -------
        float
            Mean over the 4 faces of the per-face softmax probability of the
            "outdoor" class.

        Raises
        ------
        ValueError
            If the (possibly un-batched) tensor is not 4-D or has fewer than
            4 faces.
        """
        if cubemap_01.ndim == 5:
            cubemap_01 = cubemap_01[0]
        if cubemap_01.ndim != 4 or cubemap_01.shape[0] < 4:
            raise ValueError(
                f"Expected cubemap of shape (6, 3, F, F); got {tuple(cubemap_01.shape)}"
            )
        self._ensure_centroids()
        # Cast to float32 up front: a float64 cubemap (e.g. NumPy-derived)
        # would otherwise stay float64 through normalisation and hit a dtype
        # mismatch in the encoder, and half inputs would be resampled at
        # reduced precision.
        eq = cubemap_01[:4].to(self.device, dtype=torch.float32).clamp(0, 1)
        eq = F.interpolate(eq, size=(224, 224), mode="bilinear",
                           align_corners=False, antialias=True)
        eq = (eq - self._clip_mean) / self._clip_std
        feats = F.normalize(self.model.encode_image(eq), dim=-1)         # (4, D)
        s_in = (feats @ self.text_indoor.T).squeeze(-1)                  # (4,)
        s_out = (feats @ self.text_outdoor.T).squeeze(-1)                # (4,)
        # Two-way softmax per face over (indoor, outdoor) at 100x logit scale;
        # column 1 is the outdoor probability.
        probs = torch.stack([s_in, s_out], dim=-1).mul(100.0).softmax(dim=-1)
        return float(probs[:, 1].mean().item())

    def classify(self, cubemap_01: torch.Tensor, threshold: float = 0.5) -> tuple[str, float]:
        """Return ``(label, p_outdoor)``; ``label`` is ``"indoor"`` if
        ``p_outdoor < threshold`` else ``"outdoor"``."""
        p = self.p_outdoor(cubemap_01)
        return ("outdoor" if p >= threshold else "indoor", p)


# Process-wide cache for ``get_classifier``; ``None`` until the first call.
_SINGLETON: "IndoorOutdoorClassifier | None" = None


def get_classifier(device: Union[str, torch.device] = "cuda") -> IndoorOutdoorClassifier:
    """Return the shared, lazily constructed :class:`IndoorOutdoorClassifier`.

    The instance is created on the first call and cached at module level.
    ``device`` is only consulted when the instance is created; later calls
    return the cached instance unchanged, whatever ``device`` they pass.
    """
    global _SINGLETON
    cached = _SINGLETON
    if cached is not None:
        return cached
    cached = IndoorOutdoorClassifier(device=device)
    _SINGLETON = cached
    return cached