"""Lightweight indoor/outdoor classifier for routing PaGeR's twin scale heads. Wraps CLIP ViT-B/32 (OpenAI weights via ``open_clip``) behind a tiny ``classify(cubemap) -> ("indoor"|"outdoor", p_outdoor)`` API. The classifier reuses the 4 equatorial faces of the cubemap that PaGeR's depth pipeline already computes from the input panorama (Front, Right, Back, Left at indices 0-3 of the standard 'stack' face order), so it adds no extra projection cost. Each of the 4 faces is resized to 224×224, normalised with CLIP's image mean / std, and run through CLIP's image encoder. Indoor / outdoor are represented as L2-normalised mean text-embedding centroids over a small fixed prompt ensemble; per-face cosine similarities are softmax'd with CLIP's 100× logit scale and averaged across faces to give ``P(outdoor)``. The default prompt set ('rich') was validated across the three evaluation datasets shipped with PaGeR (Stanford2D3DS, Matterport3D360, ZuriPano) and gives universally good routing without per-dataset tuning. """ from __future__ import annotations from typing import Sequence, Union import open_clip import torch import torch.nn.functional as F # Rich prompt ensemble — verified universal across PaGeR's eval datasets. DEFAULT_INDOOR_PROMPTS: tuple[str, ...] = ( "an indoor scene", "the interior of a building", "a room inside a building", "a hallway", "an office room", "a bedroom", "a living room", "a kitchen", "an indoor space with walls and a ceiling", ) DEFAULT_OUTDOOR_PROMPTS: tuple[str, ...] = ( "an outdoor scene", "the outdoors", "outside in nature", "a street view", "a city street", "an urban panorama", "a park", "a landscape", "a residential neighborhood", ) # CLIP image-encoder normalisation (different from ImageNet). _CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073) _CLIP_STD = (0.26862954, 0.26130258, 0.27577711) class IndoorOutdoorClassifier: """CLIP ViT-B/32 indoor/outdoor scene classifier operating on cubemap faces. Parameters ---------- device: Torch device the CLIP model lives on. Inputs are moved to this device inside :meth:`classify`. indoor_prompts, outdoor_prompts: Text prompts used to build the two class centroids. Each centroid is the L2-normalised mean of its prompts' L2-normalised CLIP embeddings. model_name, pretrained: Passed to :func:`open_clip.create_model_and_transforms`. The defaults (``"ViT-B-32"`` / ``"openai"``) give a 151M-param image+text model with the image encoder running at ~10 ms / sample on a recent GPU. """ def __init__( self, device: Union[str, torch.device] = "cuda", indoor_prompts: Sequence[str] = DEFAULT_INDOOR_PROMPTS, outdoor_prompts: Sequence[str] = DEFAULT_OUTDOOR_PROMPTS, model_name: str = "ViT-B-32", pretrained: str = "openai", ) -> None: self.device = torch.device(device) if not isinstance(device, torch.device) else device # ``force_quick_gelu`` matches the activation that OpenAI's CLIP was # trained with; without it open_clip emits a warning and silently # falls back to standard GELU, which gives slightly different logits. model, _, _ = open_clip.create_model_and_transforms( model_name, pretrained=pretrained, device=self.device, force_quick_gelu=(pretrained == "openai"), ) self.model = model.eval() self.tokenizer = open_clip.get_tokenizer(model_name) # The text centroids are built lazily on first ``classify`` call -- see # ``_ensure_centroids``. Building them here would run a CLIP text-encoder # forward pass at construction time, which breaks on HF ZeroGPU: a GPU # is only attached inside a ``@spaces.GPU`` window, so a forward run at # import/startup has no device. Loading the weights (above) is fine; # only the forward must wait for a GPU window. self._indoor_prompts = tuple(indoor_prompts) self._outdoor_prompts = tuple(outdoor_prompts) self.text_indoor = None # (1, D), filled lazily self.text_outdoor = None # (1, D), filled lazily # Image-side normalisation buffers (kept on device for fast inference). self.register_clip_norm(self.device) @torch.inference_mode() def _ensure_centroids(self) -> None: """Build the indoor/outdoor text centroids on first use (idempotent). Each centroid is the L2-normalised mean of its prompts' L2-normalised CLIP text embeddings. Deferred out of ``__init__`` so the forward pass runs inside the caller's GPU window (see the note there).""" if self.text_indoor is not None: return centroids = {} for key, prompts in (("indoor", self._indoor_prompts), ("outdoor", self._outdoor_prompts)): toks = self.tokenizer(list(prompts)).to(self.device) feats = F.normalize(self.model.encode_text(toks), dim=-1) centroids[key] = F.normalize(feats.mean(dim=0, keepdim=True), dim=-1) self.text_indoor = centroids["indoor"] # (1, D) self.text_outdoor = centroids["outdoor"] # (1, D) def register_clip_norm(self, device: torch.device) -> None: self._clip_mean = torch.tensor(_CLIP_MEAN, device=device).view(1, 3, 1, 1) self._clip_std = torch.tensor(_CLIP_STD, device=device).view(1, 3, 1, 1) @torch.inference_mode() def p_outdoor(self, cubemap_01: torch.Tensor) -> float: """Return P(outdoor) for one panorama. Parameters ---------- cubemap_01: Cubemap tensor of shape ``(6, 3, F, F)`` or ``(1, 6, 3, F, F)`` with raw RGB in ``[0, 1]`` (no normalisation). Only faces 0-3 (Front, Right, Back, Left from the standard 'stack' order) are used; the top / bottom faces are ignored. """ if cubemap_01.ndim == 5: cubemap_01 = cubemap_01[0] if cubemap_01.ndim != 4 or cubemap_01.shape[0] < 4: raise ValueError( f"Expected cubemap of shape (6, 3, F, F); got {tuple(cubemap_01.shape)}" ) self._ensure_centroids() eq = cubemap_01[:4].to(self.device).clamp(0, 1) eq = F.interpolate(eq, size=(224, 224), mode="bilinear", align_corners=False, antialias=True) eq = (eq - self._clip_mean) / self._clip_std feats = F.normalize(self.model.encode_image(eq), dim=-1) # (4, D) s_in = (feats @ self.text_indoor.T).squeeze(-1) # (4,) s_out = (feats @ self.text_outdoor.T).squeeze(-1) # (4,) probs = torch.stack([s_in, s_out], dim=-1).mul(100.0).softmax(dim=-1) return float(probs[:, 1].mean().item()) def classify(self, cubemap_01: torch.Tensor, threshold: float = 0.5): """Return ``(label, p_outdoor)``; ``label`` is ``"indoor"`` if ``p_outdoor < threshold`` else ``"outdoor"``.""" p = self.p_outdoor(cubemap_01) return ("outdoor" if p >= threshold else "indoor", p) _SINGLETON: "IndoorOutdoorClassifier | None" = None def get_classifier(device: Union[str, torch.device] = "cuda") -> IndoorOutdoorClassifier: """Return a process-wide singleton classifier (loaded on first call).""" global _SINGLETON if _SINGLETON is None: _SINGLETON = IndoorOutdoorClassifier(device=device) return _SINGLETON