# PaGeR / src/utils/scene_classifier.py
# Commit aeea5fe (vulus98): Fix ZeroGPU startup: disable Xet, run inference behind @spaces.GPU
"""Lightweight indoor/outdoor classifier for routing PaGeR's twin scale heads.
Wraps CLIP ViT-B/32 (OpenAI weights via ``open_clip``) behind a tiny
``classify(cubemap) -> ("indoor"|"outdoor", p_outdoor)`` API. The classifier
reuses the 4 equatorial faces of the cubemap that PaGeR's depth pipeline
already computes from the input panorama (Front, Right, Back, Left at indices
0-3 of the standard 'stack' face order), so it adds no extra projection cost.
Each of the 4 faces is resized to 224×224, normalised with CLIP's image mean
/ std, and run through CLIP's image encoder. Indoor / outdoor are represented
as L2-normalised mean text-embedding centroids over a small fixed prompt
ensemble; per-face cosine similarities are softmax'd with CLIP's 100× logit
scale and averaged across faces to give ``P(outdoor)``.
The default prompt set ('rich') was validated across the three evaluation
datasets shipped with PaGeR (Stanford2D3DS, Matterport3D360, ZuriPano) and
gives universally good routing without per-dataset tuning.
"""
from __future__ import annotations
from typing import Sequence, Union
import open_clip
import torch
import torch.nn.functional as F
# Rich prompt ensemble — verified universal across PaGeR's eval datasets.
DEFAULT_INDOOR_PROMPTS: tuple[str, ...] = (
    "an indoor scene",
    "the interior of a building",
    "a room inside a building",
    "a hallway",
    "an office room",
    "a bedroom",
    "a living room",
    "a kitchen",
    "an indoor space with walls and a ceiling",
)
DEFAULT_OUTDOOR_PROMPTS: tuple[str, ...] = (
    "an outdoor scene",
    "the outdoors",
    "outside in nature",
    "a street view",
    "a city street",
    "an urban panorama",
    "a park",
    "a landscape",
    "a residential neighborhood",
)

# CLIP image-encoder normalisation (different from ImageNet).
_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)


class IndoorOutdoorClassifier:
    """CLIP ViT-B/32 indoor/outdoor scene classifier operating on cubemap faces.

    Parameters
    ----------
    device:
        Torch device the CLIP model lives on. Inputs are moved to this device
        inside :meth:`p_outdoor` / :meth:`classify`.
    indoor_prompts, outdoor_prompts:
        Text prompts used to build the two class centroids. Each centroid is
        the L2-normalised mean of its prompts' L2-normalised CLIP embeddings.
        A single bare string is treated as a one-prompt ensemble.
    model_name, pretrained:
        Passed to :func:`open_clip.create_model_and_transforms`. The defaults
        (``"ViT-B-32"`` / ``"openai"``) give a 151M-param image+text model
        with the image encoder running at ~10 ms / sample on a recent GPU.

    Raises
    ------
    ValueError
        If either prompt collection is empty.
    """

    # Temperature applied to the cosine similarities before the softmax.
    _LOGIT_SCALE = 100.0
    # Spatial size each face is resized to before entering the image encoder.
    _INPUT_SIZE = (224, 224)

    def __init__(
        self,
        device: Union[str, torch.device] = "cuda",
        indoor_prompts: Sequence[str] = DEFAULT_INDOOR_PROMPTS,
        outdoor_prompts: Sequence[str] = DEFAULT_OUTDOOR_PROMPTS,
        model_name: str = "ViT-B-32",
        pretrained: str = "openai",
    ) -> None:
        # Validate the prompts first so a bad argument fails before the
        # (comparatively expensive) weight load below.
        self._indoor_prompts = self._as_prompt_tuple(indoor_prompts, "indoor_prompts")
        self._outdoor_prompts = self._as_prompt_tuple(outdoor_prompts, "outdoor_prompts")

        self.device = device if isinstance(device, torch.device) else torch.device(device)

        # ``force_quick_gelu`` matches the activation that OpenAI's CLIP was
        # trained with; without it open_clip emits a warning and silently
        # falls back to standard GELU, which gives slightly different logits.
        model, _, _ = open_clip.create_model_and_transforms(
            model_name,
            pretrained=pretrained,
            device=self.device,
            force_quick_gelu=(pretrained == "openai"),
        )
        self.model = model.eval()
        self.tokenizer = open_clip.get_tokenizer(model_name)

        # The text centroids are built lazily on the first ``p_outdoor`` /
        # ``classify`` call (see ``_ensure_centroids``). Building them here
        # would run a CLIP text-encoder forward pass at construction time,
        # which breaks on HF ZeroGPU: a GPU is only attached inside a
        # ``@spaces.GPU`` window, so a forward run at import/startup has no
        # device. Loading the weights (above) is fine; only the forward must
        # wait for a GPU window.
        self.text_indoor = None   # (1, D), filled lazily
        self.text_outdoor = None  # (1, D), filled lazily

        # Image-side normalisation buffers (kept on device for fast inference).
        self.register_clip_norm(self.device)

    @staticmethod
    def _as_prompt_tuple(prompts: Union[str, Sequence[str]], name: str) -> tuple[str, ...]:
        """Return ``prompts`` as a non-empty tuple of strings.

        ``name`` is only used to label the error message.

        Raises
        ------
        ValueError
            If ``prompts`` contains no entries.
        """
        # A bare string would otherwise be split into one prompt per character
        # by ``tuple()``.
        if isinstance(prompts, str):
            prompts = (prompts,)
        prompts = tuple(prompts)
        if not prompts:
            # The mean over zero embeddings would give a NaN centroid.
            raise ValueError(f"{name} must contain at least one prompt")
        return prompts

    @torch.inference_mode()
    def _ensure_centroids(self) -> None:
        """Build the indoor/outdoor text centroids on first use (idempotent).

        Each centroid is the L2-normalised mean of its prompts' L2-normalised
        CLIP text embeddings. The work is deferred out of ``__init__`` so the
        text-encoder forward pass runs at inference time, inside the caller's
        GPU window.
        """
        if self.text_indoor is not None:
            return
        centroids = {}
        for key, prompts in (("indoor", self._indoor_prompts),
                             ("outdoor", self._outdoor_prompts)):
            toks = self.tokenizer(list(prompts)).to(self.device)
            feats = F.normalize(self.model.encode_text(toks), dim=-1)
            centroids[key] = F.normalize(feats.mean(dim=0, keepdim=True), dim=-1)
        # Assign both only after both were computed, so a failure above never
        # leaves the instance half-initialised.
        self.text_indoor = centroids["indoor"]    # (1, D)
        self.text_outdoor = centroids["outdoor"]  # (1, D)

    def register_clip_norm(self, device: torch.device) -> None:
        """Create the CLIP mean / std tensors on ``device``, shaped ``(1, 3, 1, 1)``."""
        self._clip_mean = torch.tensor(_CLIP_MEAN, device=device).view(1, 3, 1, 1)
        self._clip_std = torch.tensor(_CLIP_STD, device=device).view(1, 3, 1, 1)

    @torch.inference_mode()
    def p_outdoor(self, cubemap_01: torch.Tensor) -> float:
        """Return P(outdoor) for one panorama.

        Parameters
        ----------
        cubemap_01:
            Cubemap tensor of shape ``(6, 3, F, F)`` or ``(1, 6, 3, F, F)``
            with raw RGB in ``[0, 1]`` (no normalisation). Only faces 0-3
            (Front, Right, Back, Left from the standard 'stack' order) are
            used; the top / bottom faces are ignored. For a 5-D input only
            the first batch entry is used. Any dtype is accepted; the faces
            are cast to the dtype of the normalisation buffers (float32).

        Raises
        ------
        ValueError
            If the (un-batched) tensor is not 4-D, has fewer than 4 faces, or
            does not have exactly 3 channels.
        """
        if cubemap_01.ndim == 5:
            cubemap_01 = cubemap_01[0]
        # The channel check matters: a 1-channel input would otherwise be
        # silently broadcast against the (1, 3, 1, 1) mean / std.
        if (
            cubemap_01.ndim != 4
            or cubemap_01.shape[0] < 4
            or cubemap_01.shape[1] != 3
        ):
            raise ValueError(
                f"Expected cubemap of shape (6, 3, F, F); got {tuple(cubemap_01.shape)}"
            )
        self._ensure_centroids()

        # Cast before any arithmetic so float64 / half / integer inputs reach
        # the encoder in the same dtype as the normalisation buffers.
        faces = cubemap_01[:4].to(device=self.device, dtype=self._clip_mean.dtype)
        faces = faces.clamp(0, 1)
        faces = F.interpolate(faces, size=self._INPUT_SIZE, mode="bilinear",
                              align_corners=False, antialias=True)
        faces = (faces - self._clip_mean) / self._clip_std

        feats = F.normalize(self.model.encode_image(faces), dim=-1)  # (4, D)
        s_in = (feats @ self.text_indoor.T).squeeze(-1)               # (4,)
        s_out = (feats @ self.text_outdoor.T).squeeze(-1)             # (4,)
        # Per-face two-way softmax over [indoor, outdoor], then average the
        # outdoor probability across the 4 faces.
        logits = torch.stack([s_in, s_out], dim=-1).mul(self._LOGIT_SCALE)
        probs = logits.softmax(dim=-1)
        return float(probs[:, 1].mean().item())

    def classify(self, cubemap_01: torch.Tensor, threshold: float = 0.5) -> tuple[str, float]:
        """Return ``(label, p_outdoor)``.

        ``label`` is ``"indoor"`` if ``p_outdoor < threshold`` else
        ``"outdoor"``. See :meth:`p_outdoor` for the accepted input and the
        errors it raises.
        """
        p = self.p_outdoor(cubemap_01)
        return ("outdoor" if p >= threshold else "indoor", p)
# Process-wide cache for the classifier built by ``get_classifier``.
_SINGLETON: "IndoorOutdoorClassifier | None" = None


def get_classifier(device: Union[str, torch.device] = "cuda") -> IndoorOutdoorClassifier:
    """Return the shared classifier instance, constructing it on the first call.

    ``device`` is only consulted when the instance is created; once a
    classifier is cached, later calls return that same object regardless of
    the ``device`` they pass.
    """
    global _SINGLETON
    cached = _SINGLETON
    if cached is not None:
        return cached
    cached = IndoorOutdoorClassifier(device=device)
    _SINGLETON = cached
    return cached