Spaces:

prs-eth
/

PaGeR

Running on Zero

App Files Files Community

PaGeR / src /utils /scene_classifier.py

vulus98

Fix ZeroGPU startup: disable Xet, run inference behind @spaces.GPU

aeea5fe about 1 month ago

Raw

History Blame Contribute Delete

7.58 kB

	"""Lightweight indoor/outdoor classifier for routing PaGeR's twin scale heads.

	Wraps CLIP ViT-B/32 (OpenAI weights via ``open_clip``) behind a tiny
	``classify(cubemap) -> ("indoor"|"outdoor", p_outdoor)`` API. The classifier
	reuses the 4 equatorial faces of the cubemap that PaGeR's depth pipeline
	already computes from the input panorama (Front, Right, Back, Left at indices
	0-3 of the standard 'stack' face order), so it adds no extra projection cost.

	Each of the 4 faces is resized to 224×224, normalised with CLIP's image mean
	/ std, and run through CLIP's image encoder. Indoor / outdoor are represented
	as L2-normalised mean text-embedding centroids over a small fixed prompt
	ensemble; per-face cosine similarities are softmax'd with CLIP's 100× logit
	scale and averaged across faces to give ``P(outdoor)``.

	The default prompt set ('rich') was validated across the three evaluation
	datasets shipped with PaGeR (Stanford2D3DS, Matterport3D360, ZuriPano) and
	gives universally good routing without per-dataset tuning.
	"""

	from __future__ import annotations

	from typing import Sequence, Union

	import open_clip
	import torch
	import torch.nn.functional as F


	# Prompt ensembles used to build the two CLIP text centroids. This is the
	# 'rich' set: the same prompts are used for every PaGeR evaluation dataset.

	# Prompts describing enclosed, man-made interiors.
	DEFAULT_INDOOR_PROMPTS: tuple[str, ...] = (
	    "an indoor scene",
	    "the interior of a building",
	    "a room inside a building",
	    "a hallway",
	    "an office room",
	    "a bedroom",
	    "a living room",
	    "a kitchen",
	    "an indoor space with walls and a ceiling",
	)

	# Prompts describing open-air scenes, both natural and urban.
	DEFAULT_OUTDOOR_PROMPTS: tuple[str, ...] = (
	    "an outdoor scene",
	    "the outdoors",
	    "outside in nature",
	    "a street view",
	    "a city street",
	    "an urban panorama",
	    "a park",
	    "a landscape",
	    "a residential neighborhood",
	)

	# Per-channel (R, G, B) statistics applied to images before CLIP's image
	# encoder. These are CLIP's own values, not the ImageNet ones.
	_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
	_CLIP_STD = (0.26862954, 0.26130258, 0.27577711)


	class IndoorOutdoorClassifier:
	    """CLIP ViT-B/32 indoor/outdoor scene classifier operating on cubemap faces.

	    Parameters
	    ----------
	    device:
	        Torch device the CLIP model lives on. Inputs are moved to this device
	        inside :meth:`p_outdoor` (and therefore :meth:`classify`).
	    indoor_prompts, outdoor_prompts:
	        Text prompts used to build the two class centroids. Each centroid is
	        the L2-normalised mean of its prompts' L2-normalised CLIP embeddings.
	        Both collections must contain at least one prompt.
	    model_name, pretrained:
	        Passed to :func:`open_clip.create_model_and_transforms`. The defaults
	        (``"ViT-B-32"`` / ``"openai"``) give a 151M-param image+text model
	        with the image encoder running at ~10 ms / sample on a recent GPU.

	    Raises
	    ------
	    ValueError
	        If ``indoor_prompts`` or ``outdoor_prompts`` is empty.
	    """

	    def __init__(
	        self,
	        device: Union[str, torch.device] = "cuda",
	        indoor_prompts: Sequence[str] = DEFAULT_INDOOR_PROMPTS,
	        outdoor_prompts: Sequence[str] = DEFAULT_OUTDOOR_PROMPTS,
	        model_name: str = "ViT-B-32",
	        pretrained: str = "openai",
	    ) -> None:
	        # Snapshot and validate the prompts before loading any weights: a
	        # centroid is a mean over prompt embeddings, so an empty collection
	        # cannot produce one and should fail here rather than on first use.
	        self._indoor_prompts = tuple(indoor_prompts)
	        self._outdoor_prompts = tuple(outdoor_prompts)
	        if not self._indoor_prompts or not self._outdoor_prompts:
	            raise ValueError(
	                "indoor_prompts and outdoor_prompts must each contain at least one prompt"
	            )

	        self.device = torch.device(device) if not isinstance(device, torch.device) else device

	        # ``force_quick_gelu`` matches the activation that OpenAI's CLIP was
	        # trained with; without it open_clip emits a warning and silently
	        # falls back to standard GELU, which gives slightly different logits.
	        model, _, _ = open_clip.create_model_and_transforms(
	            model_name,
	            pretrained=pretrained,
	            device=self.device,
	            force_quick_gelu=(pretrained == "openai"),
	        )
	        self.model = model.eval()
	        self.tokenizer = open_clip.get_tokenizer(model_name)

	        # The text centroids are built lazily on the first ``p_outdoor`` /
	        # ``classify`` call -- see ``_ensure_centroids``. Building them here
	        # would run a CLIP text-encoder forward pass at construction time,
	        # which breaks on HF ZeroGPU: a GPU is only attached inside a
	        # ``@spaces.GPU`` window, so a forward run at import/startup has no
	        # device. Loading the weights (above) is fine; only the forward must
	        # wait for a GPU window.
	        self.text_indoor = None   # (1, D), filled lazily
	        self.text_outdoor = None  # (1, D), filled lazily

	        # Image-side normalisation buffers (kept on device for fast inference).
	        self.register_clip_norm(self.device)

	    @torch.inference_mode()
	    def _ensure_centroids(self) -> None:
	        """Build the indoor/outdoor text centroids on first use (idempotent).

	        Each centroid is the L2-normalised mean of its prompts' L2-normalised
	        CLIP text embeddings. The work is deferred out of ``__init__`` so the
	        text-encoder forward pass runs inside the caller's GPU window.
	        """
	        # Both attributes are assigned together at the end, but check both so
	        # a half-initialised state can never be mistaken for "already built".
	        if self.text_indoor is not None and self.text_outdoor is not None:
	            return
	        centroids = {}
	        for key, prompts in (
	            ("indoor", self._indoor_prompts),
	            ("outdoor", self._outdoor_prompts),
	        ):
	            toks = self.tokenizer(list(prompts)).to(self.device)
	            feats = F.normalize(self.model.encode_text(toks), dim=-1)
	            centroids[key] = F.normalize(feats.mean(dim=0, keepdim=True), dim=-1)
	        self.text_indoor = centroids["indoor"]    # (1, D)
	        self.text_outdoor = centroids["outdoor"]  # (1, D)

	    def register_clip_norm(self, device: torch.device) -> None:
	        """Create the CLIP mean / std tensors on ``device``.

	        Both are stored with shape ``(1, 3, 1, 1)`` so they broadcast over a
	        ``(N, 3, H, W)`` image batch.
	        """
	        self._clip_mean = torch.tensor(_CLIP_MEAN, device=device).view(1, 3, 1, 1)
	        self._clip_std = torch.tensor(_CLIP_STD, device=device).view(1, 3, 1, 1)

	    @torch.inference_mode()
	    def p_outdoor(self, cubemap_01: torch.Tensor) -> float:
	        """Return P(outdoor) for one panorama.

	        Parameters
	        ----------
	        cubemap_01:
	            Cubemap tensor of shape ``(6, 3, F, F)`` or ``(1, 6, 3, F, F)``
	            with raw RGB in ``[0, 1]`` (no normalisation). Only faces 0-3
	            (Front, Right, Back, Left from the standard 'stack' order) are
	            used; the top / bottom faces are ignored. For a 5-D input only
	            the first batch element is evaluated.

	        Returns
	        -------
	        float
	            Mean over the 4 faces of the per-face softmax probability of the
	            outdoor class.

	        Raises
	        ------
	        ValueError
	            If the (batch-stripped) input is not 4-D, has fewer than 4 faces,
	            or does not have exactly 3 channels.
	        """
	        if cubemap_01.ndim == 5:
	            cubemap_01 = cubemap_01[0]
	        # The channel check matters: a 1-channel input would otherwise
	        # broadcast silently against the (1, 3, 1, 1) mean / std below.
	        if (
	            cubemap_01.ndim != 4
	            or cubemap_01.shape[0] < 4
	            or cubemap_01.shape[1] != 3
	        ):
	            raise ValueError(
	                f"Expected cubemap of shape (6, 3, F, F); got {tuple(cubemap_01.shape)}"
	            )
	        self._ensure_centroids()

	        # Cast to the dtype of the normalisation buffers so that e.g. float64
	        # inputs do not reach the image encoder in a mismatched dtype.
	        eq = cubemap_01[:4].to(device=self.device, dtype=self._clip_mean.dtype)
	        eq = eq.clamp(0, 1)
	        # 224x224 is the input resolution this classifier feeds to CLIP.
	        eq = F.interpolate(
	            eq, size=(224, 224), mode="bilinear", align_corners=False, antialias=True
	        )
	        eq = (eq - self._clip_mean) / self._clip_std

	        feats = F.normalize(self.model.encode_image(eq), dim=-1)   # (4, D)
	        s_in = (feats @ self.text_indoor.T).squeeze(-1)            # (4,)
	        s_out = (feats @ self.text_outdoor.T).squeeze(-1)          # (4,)
	        # Scale the cosine similarities by 100 before the 2-way softmax;
	        # column 1 of the result is the outdoor probability.
	        probs = torch.stack([s_in, s_out], dim=-1).mul(100.0).softmax(dim=-1)
	        return float(probs[:, 1].mean().item())

	    def classify(
	        self, cubemap_01: torch.Tensor, threshold: float = 0.5
	    ) -> tuple[str, float]:
	        """Return ``(label, p_outdoor)``.

	        ``label`` is ``"indoor"`` if ``p_outdoor < threshold`` else
	        ``"outdoor"``. See :meth:`p_outdoor` for the accepted input shapes
	        and the errors it raises.
	        """
	        p = self.p_outdoor(cubemap_01)
	        return ("outdoor" if p >= threshold else "indoor", p)


	# Process-wide cached classifier; ``None`` until ``get_classifier`` is first called.
	_SINGLETON: "IndoorOutdoorClassifier | None" = None


	def get_classifier(device: Union[str, torch.device] = "cuda") -> IndoorOutdoorClassifier:
	    """Return a process-wide singleton classifier (loaded on first call).

	    Parameters
	    ----------
	    device:
	        Device passed to :class:`IndoorOutdoorClassifier` when the instance
	        is created. It is only honoured by the call that creates the
	        singleton; later calls return the cached instance unchanged,
	        whatever ``device`` they pass.
	    """
	    global _SINGLETON
	    if _SINGLETON is None:
	        _SINGLETON = IndoorOutdoorClassifier(device=device)
	    return _SINGLETON