Spaces:
Running on Zero
Running on Zero
"""Lightweight indoor/outdoor classifier for routing PaGeR's twin scale heads.

Wraps CLIP ViT-B/32 (OpenAI weights via ``open_clip``) behind a tiny
``classify(cubemap) -> ("indoor"|"outdoor", p_outdoor)`` API. The classifier
reuses the 4 equatorial faces of the cubemap that PaGeR's depth pipeline
already computes from the input panorama (Front, Right, Back, Left at indices
0-3 of the standard 'stack' face order), so it adds no extra projection cost.

Each of the 4 faces is resized to 224×224, normalised with CLIP's image mean
/ std, and run through CLIP's image encoder. Indoor / outdoor are represented
as L2-normalised mean text-embedding centroids over a small fixed prompt
ensemble; per-face cosine similarities are softmax'd with CLIP's 100× logit
scale and averaged across faces to give ``P(outdoor)``.

The default prompt set ('rich') was validated across the three evaluation
datasets shipped with PaGeR (Stanford2D3DS, Matterport3D360, ZuriPano) and
gives universally good routing without per-dataset tuning.
"""
| from __future__ import annotations | |
| from typing import Sequence, Union | |
| import open_clip | |
| import torch | |
| import torch.nn.functional as F | |
# "Rich" prompt ensembles used to build the two class centroids. Per the
# module notes, this set was verified to route well across PaGeR's evaluation
# datasets without per-dataset tuning. Order is kept stable on purpose.
DEFAULT_INDOOR_PROMPTS: tuple[str, ...] = (
    # Generic descriptions of being inside.
    "an indoor scene",
    "the interior of a building",
    "a room inside a building",
    # Specific room types.
    "a hallway",
    "an office room",
    "a bedroom",
    "a living room",
    "a kitchen",
    # Structural cue.
    "an indoor space with walls and a ceiling",
)

DEFAULT_OUTDOOR_PROMPTS: tuple[str, ...] = (
    # Generic descriptions of being outside.
    "an outdoor scene",
    "the outdoors",
    "outside in nature",
    # Urban scenes.
    "a street view",
    "a city street",
    "an urban panorama",
    # Open / residential scenes.
    "a park",
    "a landscape",
    "a residential neighborhood",
)

# Per-channel (R, G, B) statistics used to normalise images for CLIP's image
# encoder. These are CLIP-specific and differ from the ImageNet values.
_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
class IndoorOutdoorClassifier:
    """CLIP ViT-B/32 indoor/outdoor scene classifier operating on cubemap faces.

    Parameters
    ----------
    device:
        Torch device the CLIP model lives on. Inputs are moved to this device
        inside :meth:`p_outdoor` (and therefore :meth:`classify`).
    indoor_prompts, outdoor_prompts:
        Text prompts used to build the two class centroids. Each centroid is
        the L2-normalised mean of its prompts' L2-normalised CLIP embeddings.
    model_name, pretrained:
        Passed to :func:`open_clip.create_model_and_transforms`. The defaults
        (``"ViT-B-32"`` / ``"openai"``) give a 151M-param image+text model
        with the image encoder running at ~10 ms / sample on a recent GPU.
    """

    def __init__(
        self,
        device: Union[str, torch.device] = "cuda",
        indoor_prompts: Sequence[str] = DEFAULT_INDOOR_PROMPTS,
        outdoor_prompts: Sequence[str] = DEFAULT_OUTDOOR_PROMPTS,
        model_name: str = "ViT-B-32",
        pretrained: str = "openai",
    ) -> None:
        # ``torch.device`` accepts both strings and existing device objects.
        self.device = torch.device(device)

        # ``force_quick_gelu`` matches the activation that OpenAI's CLIP was
        # trained with; without it open_clip emits a warning and silently
        # falls back to standard GELU, which gives slightly different logits.
        model, _, _ = open_clip.create_model_and_transforms(
            model_name,
            pretrained=pretrained,
            device=self.device,
            force_quick_gelu=(pretrained == "openai"),
        )
        self.model = model.eval()
        self.tokenizer = open_clip.get_tokenizer(model_name)

        # The text centroids are built lazily on the first ``p_outdoor`` /
        # ``classify`` call -- see ``_ensure_centroids``. Building them here
        # would run a CLIP text-encoder forward pass at construction time,
        # which breaks on HF ZeroGPU: a GPU is only attached inside a
        # ``@spaces.GPU`` window, so a forward run at import/startup has no
        # device. Loading the weights (above) is fine; only the forward must
        # wait for a GPU window.
        self._indoor_prompts = tuple(indoor_prompts)
        self._outdoor_prompts = tuple(outdoor_prompts)
        self.text_indoor = None   # (1, D), filled lazily
        self.text_outdoor = None  # (1, D), filled lazily

        # Image-side normalisation buffers (kept on device for fast inference).
        self.register_clip_norm(self.device)

    def _ensure_centroids(self) -> None:
        """Build the indoor/outdoor text centroids on first use (idempotent).

        Each centroid is the L2-normalised mean of its prompts' L2-normalised
        CLIP text embeddings. Deferred out of ``__init__`` so the forward pass
        runs inside the caller's GPU window.
        """
        if self.text_indoor is not None:
            return

        centroids = {}
        # The centroids are cached for the lifetime of the object, so they
        # must not carry an autograd graph: ``.eval()`` alone does not disable
        # gradient tracking, and a tracked result would keep the whole text
        # encoder graph alive.
        with torch.no_grad():
            for key, prompts in (
                ("indoor", self._indoor_prompts),
                ("outdoor", self._outdoor_prompts),
            ):
                toks = self.tokenizer(list(prompts)).to(self.device)
                feats = F.normalize(self.model.encode_text(toks), dim=-1)
                centroids[key] = F.normalize(
                    feats.mean(dim=0, keepdim=True), dim=-1
                )

        # Assign both only after both were computed, so a failure above
        # leaves the object in the "not built yet" state.
        self.text_indoor = centroids["indoor"]    # (1, D)
        self.text_outdoor = centroids["outdoor"]  # (1, D)

    def register_clip_norm(self, device: torch.device) -> None:
        """Create the CLIP mean / std tensors on ``device``.

        Both are shaped ``(1, 3, 1, 1)`` so they broadcast over a batch of
        channel-first images.
        """
        self._clip_mean = torch.tensor(_CLIP_MEAN, device=device).view(1, 3, 1, 1)
        self._clip_std = torch.tensor(_CLIP_STD, device=device).view(1, 3, 1, 1)

    def p_outdoor(self, cubemap_01: torch.Tensor) -> float:
        """Return P(outdoor) for one panorama.

        Parameters
        ----------
        cubemap_01:
            Cubemap tensor of shape ``(6, 3, F, F)`` or ``(1, 6, 3, F, F)``
            with raw RGB in ``[0, 1]`` (no normalisation). Only faces 0-3
            (Front, Right, Back, Left from the standard 'stack' order) are
            used; the top / bottom faces are ignored. For a 5-D input only
            the first batch element is used. Values are clamped to
            ``[0, 1]`` before normalisation.

        Returns
        -------
        float
            Mean over the 4 faces of the per-face softmax probability of the
            outdoor class.

        Raises
        ------
        ValueError
            If the input (after dropping a leading batch axis) is not 4-D or
            has fewer than 4 faces.
        """
        if cubemap_01.ndim == 5:
            cubemap_01 = cubemap_01[0]
        if cubemap_01.ndim != 4 or cubemap_01.shape[0] < 4:
            raise ValueError(
                f"Expected cubemap of shape (6, 3, F, F); got {tuple(cubemap_01.shape)}"
            )

        self._ensure_centroids()

        # Pure inference: skip autograd bookkeeping so no activations are
        # retained for a backward pass that never happens.
        with torch.no_grad():
            eq = cubemap_01[:4].to(self.device).clamp(0, 1)
            eq = F.interpolate(
                eq,
                size=(224, 224),
                mode="bilinear",
                align_corners=False,
                antialias=True,
            )
            eq = (eq - self._clip_mean) / self._clip_std
            feats = F.normalize(self.model.encode_image(eq), dim=-1)  # (4, D)

            s_in = (feats @ self.text_indoor.T).squeeze(-1)    # (4,)
            s_out = (feats @ self.text_outdoor.T).squeeze(-1)  # (4,)
            # Column 0 = indoor, column 1 = outdoor; 100.0 is the logit scale
            # applied to the cosine similarities before the softmax.
            probs = torch.stack([s_in, s_out], dim=-1).mul(100.0).softmax(dim=-1)

        return float(probs[:, 1].mean().item())

    def classify(
        self, cubemap_01: torch.Tensor, threshold: float = 0.5
    ) -> tuple[str, float]:
        """Return ``(label, p_outdoor)``.

        ``label`` is ``"indoor"`` if ``p_outdoor < threshold`` else
        ``"outdoor"``.
        """
        p = self.p_outdoor(cubemap_01)
        return ("outdoor" if p >= threshold else "indoor", p)
# Process-wide cached classifier; populated by the first ``get_classifier`` call.
_SINGLETON: "IndoorOutdoorClassifier | None" = None


def get_classifier(device: Union[str, torch.device] = "cuda") -> IndoorOutdoorClassifier:
    """Return the process-wide classifier, constructing it on the first call.

    ``device`` is only used when the classifier is first created; later calls
    return the already-cached instance regardless of the ``device`` passed.
    """
    global _SINGLETON
    cached = _SINGLETON
    if cached is not None:
        return cached
    _SINGLETON = IndoorOutdoorClassifier(device=device)
    return _SINGLETON