Spaces:

hyper3labs
/

HyperView-ABO-Catalog

Running

App Files Community

github-actions[bot] committed on Jun 4

Commit

ccf4b11

1 Parent(s): 1779c3f

Deploy hyper3labs/HyperView-ABO-Catalog from Hyper3Labs/hyperview-spaces@fd3578c

Browse files

Files changed (16) hide show

Dockerfile +4 -8
README.md +9 -8
demo.py +2 -14
hyper3_clip/__init__.py +0 -3
hyper3_clip/models/__init__.py +0 -3
hyper3_clip/models/encoders.py +0 -173
hyper3_clip/models/experimental.py +0 -587
hyper3_clip/models/himo.py +0 -55
hyper3_clip/models/hyper3_clip.py +0 -958
hyper3_clip/models/lorentz.py +0 -265
hyper3_clip/models/losses.py +0 -1400
hyper3_clip/models/objectives.py +0 -580
hyper3_clip/models/tren.py +0 -255
hyper3_clip/training/__init__.py +0 -1
hyper3_clip/training/distributed.py +0 -149
hyper3_clip_provider.py +0 -115

Dockerfile CHANGED Viewed

@@ -20,7 +20,8 @@ WORKDIR $HOME/app
 RUN pip install --upgrade pip
-ARG HYPERVIEW_VERSION=0.6.0
 # Install CPU-only PyTorch first so the Space does not pull the default CUDA bundle.
 RUN pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
@@ -33,14 +34,9 @@ import hyperview as hv
 print("hyperview", hv.__version__, inspect.signature(hv.launch))
 PY
 RUN pip install \
     "datasets>=4.5.0" \
-    "Pillow>=12.0.0" \
-    "timm>=1.0.0" \
-    "transformers==4.49.0" \
-    "safetensors>=0.4.0" \
-    "pyyaml>=6.0.0" \
-    "sentencepiece>=0.2.0" \
-    "protobuf>=4.25.0"
 COPY --chown=user . .

 RUN pip install --upgrade pip
+ARG HYPERVIEW_VERSION=0.6.1
+ARG HYPER_MODELS_VERSION=0.3.0
 # Install CPU-only PyTorch first so the Space does not pull the default CUDA bundle.
 RUN pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
 print("hyperview", hv.__version__, inspect.signature(hv.launch))
 PY
 RUN pip install \
+    "hyper-models[ml]==${HYPER_MODELS_VERSION}" \
     "datasets>=4.5.0" \
+    "Pillow>=12.0.0"
 COPY --chown=user . .

README.md CHANGED Viewed

@@ -14,7 +14,7 @@ This demo builds a small Amazon Berkeley Objects product-catalog subset and open
 HyperView with two pinned scatter panels plus a comparison readout:
 - CLIP ViT-B/32 in a Euclidean 2D layout
-- Hyper3-CLIP `hyper3labs/hyper3-clip-v0.5` in a Poincare 2D layout
 The right-side panel uses fixed product examples to compare nearest-neighbor
 behavior for the same query under each model.
@@ -45,7 +45,7 @@ variables or edit the second entry in `MODEL_SPECS`:
 ```bash
 ABO_CANDIDATE_DISPLAY_NAME="New Model" \
-ABO_CANDIDATE_PROVIDER="hyper3-clip" \
 ABO_CANDIDATE_MODEL="new-model-id" \
 ABO_CANDIDATE_LAYOUT="poincare:2d" \
 ABO_CANDIDATE_GEOMETRY="poincare" \
@@ -61,10 +61,11 @@ JavaScript.
 This folder is intended to deploy to `hyper3labs/HyperView-ABO-Catalog` from
 the `hyperview-spaces` deployment repository.
-The Dockerfile installs `hyperview==0.6.0` from PyPI. The released HyperView
-wheel includes the built frontend assets, so this Space does not carry a local
-`static/` bundle or copy frontend files into the installed package.
-Hyper3-CLIP weights are loaded from the gated
-`hyper3labs/hyper3-clip-v0.5` model repository at runtime. The Space needs an
-`HF_TOKEN` secret with access to that model.

 HyperView with two pinned scatter panels plus a comparison readout:
 - CLIP ViT-B/32 in a Euclidean 2D layout
+- Hyper3-CLIP `hyper3-clip-v0.5` from `hyper-models` in a Poincare 2D layout
 The right-side panel uses fixed product examples to compare nearest-neighbor
 behavior for the same query under each model.
 ```bash
 ABO_CANDIDATE_DISPLAY_NAME="New Model" \
+ABO_CANDIDATE_PROVIDER="hyper-models" \
 ABO_CANDIDATE_MODEL="new-model-id" \
 ABO_CANDIDATE_LAYOUT="poincare:2d" \
 ABO_CANDIDATE_GEOMETRY="poincare" \
 This folder is intended to deploy to `hyper3labs/HyperView-ABO-Catalog` from
 the `hyperview-spaces` deployment repository.
+The Dockerfile installs `hyperview==0.6.1` and `hyper-models[ml]==0.3.0` from
+PyPI. The released HyperView wheel includes the built frontend assets, so this
+Space does not carry a local `static/` bundle or copy frontend files into the
+installed package.
+Hyper3-CLIP weights are loaded through the `hyper-models` catalog entry for the
+gated `hyper3labs/hyper3-clip-v0.5` model repository at runtime. The Space needs
+an `HF_TOKEN` secret with access to that model.

demo.py CHANGED Viewed

@@ -64,8 +64,8 @@ MODEL_SPECS = [
         "key": "candidate",
         "display_name": os.environ.get("ABO_CANDIDATE_DISPLAY_NAME", "Hyper3-CLIP"),
         "button_label": os.environ.get("ABO_CANDIDATE_BUTTON_LABEL", "Hyper3-CLIP query"),
-        "provider": os.environ.get("ABO_CANDIDATE_PROVIDER", "hyper3-clip"),
-        "model": os.environ.get("ABO_CANDIDATE_MODEL", "hyper3labs/hyper3-clip-v0.5"),
         "layout": os.environ.get("ABO_CANDIDATE_LAYOUT", "poincare:2d"),
         "geometry": os.environ.get("ABO_CANDIDATE_GEOMETRY", "poincare"),
         "layout_dimension": int(os.environ.get("ABO_CANDIDATE_LAYOUT_DIMENSION", "2")),
@@ -341,17 +341,6 @@ def supported_kwargs(func: Any, kwargs: dict[str, Any]) -> dict[str, Any]:
     return {key: value for key, value in kwargs.items() if key in params}
-def register_hyper3_clip_provider() -> None:
-    from hyperview.runtime import ProviderRegistry
-    ProviderRegistry().register_python(
-        "hyper3-clip",
-        "hyper3_clip_provider:Hyper3ClipEmbeddings",
-        description="Hyper3-CLIP v0.5 image embeddings from hyper3labs/hyper3-clip-v0.5",
-        overwrite=True,
-    )
 def api_base_url() -> str:
     host = "127.0.0.1" if SPACE_HOST == "0.0.0.0" else SPACE_HOST
     return f"http://{host}:{SPACE_PORT}"
@@ -500,7 +489,6 @@ def launch_demo(dataset: hv.Dataset, layouts: dict[str, str]) -> hv.Session:
 def main() -> None:
-    register_hyper3_clip_provider()
     dataset, layouts = build_dataset()
     print("Layouts:", flush=True)
     for spec in MODEL_SPECS:

         "key": "candidate",
         "display_name": os.environ.get("ABO_CANDIDATE_DISPLAY_NAME", "Hyper3-CLIP"),
         "button_label": os.environ.get("ABO_CANDIDATE_BUTTON_LABEL", "Hyper3-CLIP query"),
+        "provider": os.environ.get("ABO_CANDIDATE_PROVIDER", "hyper-models"),
+        "model": os.environ.get("ABO_CANDIDATE_MODEL", "hyper3-clip-v0.5"),
         "layout": os.environ.get("ABO_CANDIDATE_LAYOUT", "poincare:2d"),
         "geometry": os.environ.get("ABO_CANDIDATE_GEOMETRY", "poincare"),
         "layout_dimension": int(os.environ.get("ABO_CANDIDATE_LAYOUT_DIMENSION", "2")),
     return {key: value for key, value in kwargs.items() if key in params}
 def api_base_url() -> str:
     host = "127.0.0.1" if SPACE_HOST == "0.0.0.0" else SPACE_HOST
     return f"http://{host}:{SPACE_PORT}"
 def main() -> None:
     dataset, layouts = build_dataset()
     print("Layouts:", flush=True)
     for spec in MODEL_SPECS:

hyper3_clip/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from hyper3_clip.models.hyper3_clip import Hyper3CLIP
-__all__ = ["Hyper3CLIP"]

hyper3_clip/models/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from hyper3_clip.models.hyper3_clip import Hyper3CLIP
-__all__ = ["Hyper3CLIP"]

hyper3_clip/models/encoders.py DELETED Viewed

@@ -1,173 +0,0 @@
-from __future__ import annotations
-import timm
-import torch
-from torch import nn
-from transformers import (
-    AutoConfig,
-    AutoModel,
-    AutoTokenizer,
-    CLIPTextConfig,
-    CLIPTextModel,
-    CLIPTextModelWithProjection,
-    CLIPVisionConfig,
-    CLIPVisionModel,
-    CLIPVisionModelWithProjection,
-    SiglipTextConfig,
-    SiglipTextModel,
-    SiglipVisionConfig,
-    SiglipVisionModel,
-)
-class VisionEncoder(nn.Module):
-    def __init__(self, backbone_name: str, pretrained: bool = True) -> None:
-        super().__init__()
-        self.kind = "timm"
-        if backbone_name.startswith("hf_clip_projected:"):
-            self.kind = "hf_clip_projected"
-            model_name = backbone_name.removeprefix("hf_clip_projected:")
-            self.backbone = (
-                CLIPVisionModelWithProjection.from_pretrained(model_name)
-                if pretrained
-                else CLIPVisionModelWithProjection(CLIPVisionConfig.from_pretrained(model_name))
-            )
-            self.output_dim = self.backbone.config.projection_dim
-        elif backbone_name.startswith("hf_clip:"):
-            self.kind = "hf_vision"
-            model_name = backbone_name.removeprefix("hf_clip:")
-            self.backbone = (
-                CLIPVisionModel.from_pretrained(model_name)
-                if pretrained
-                else CLIPVisionModel(CLIPVisionConfig.from_pretrained(model_name))
-            )
-            self.output_dim = self.backbone.config.hidden_size
-        elif backbone_name.startswith("hf_siglip:"):
-            self.kind = "hf_vision"
-            model_name = backbone_name.removeprefix("hf_siglip:")
-            self.backbone = (
-                SiglipVisionModel.from_pretrained(model_name)
-                if pretrained
-                else SiglipVisionModel(SiglipVisionConfig.from_pretrained(model_name))
-            )
-            self.output_dim = self.backbone.config.hidden_size
-        else:
-            self.backbone = timm.create_model(
-                backbone_name,
-                pretrained=pretrained,
-                num_classes=0,
-                global_pool="avg",
-            )
-            self.output_dim = self.backbone.num_features
-    def forward(self, image: torch.Tensor) -> torch.Tensor:
-        if self.kind == "hf_clip_projected":
-            return self.backbone(pixel_values=image).image_embeds
-        if self.kind == "hf_vision":
-            out = self.backbone(pixel_values=image)
-            if hasattr(out, "pooler_output") and out.pooler_output is not None:
-                return out.pooler_output
-            return out.last_hidden_state[:, 0]
-        return self.backbone(image)
-    def forward_with_tokens(self, image: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-        if self.kind == "hf_clip_projected":
-            out = self.backbone(pixel_values=image)
-            tokens = getattr(out, "last_hidden_state", None)
-            if tokens is None and hasattr(out, "vision_model_output"):
-                tokens = out.vision_model_output.last_hidden_state
-            if tokens is None:
-                raise RuntimeError("Projected CLIP vision output did not include patch tokens")
-            return out.image_embeds, tokens
-        if self.kind == "hf_vision":
-            out = self.backbone(pixel_values=image)
-            if hasattr(out, "pooler_output") and out.pooler_output is not None:
-                pooled = out.pooler_output
-            else:
-                pooled = out.last_hidden_state[:, 0]
-            return pooled, out.last_hidden_state
-        if not hasattr(self.backbone, "forward_features"):
-            pooled = self.backbone(image)
-            return pooled, pooled[:, None, :]
-        features = self.backbone.forward_features(image)
-        if hasattr(self.backbone, "forward_head"):
-            pooled = self.backbone.forward_head(features, pre_logits=False)
-        else:
-            pooled = self.backbone(image)
-        return pooled, _tokens_from_features(features)
-class TextEncoder(nn.Module):
-    def __init__(self, model_name: str, pretrained: bool = True, pooling: str = "auto") -> None:
-        super().__init__()
-        if pooling not in {"auto", "pooler", "cls", "mean"}:
-            raise ValueError(f"Unsupported text pooling {pooling!r}; expected auto, pooler, cls, or mean")
-        self.kind = "hf_text"
-        self.pooling = pooling
-        tokenizer_name = model_name.removeprefix("hf_clip_projected:")
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-        model_name_lower = model_name.lower()
-        if model_name.startswith("hf_clip_projected:"):
-            self.kind = "hf_clip_projected"
-            projected_model_name = model_name.removeprefix("hf_clip_projected:")
-            if pretrained:
-                self.backbone = CLIPTextModelWithProjection.from_pretrained(projected_model_name)
-            else:
-                self.backbone = CLIPTextModelWithProjection(CLIPTextConfig.from_pretrained(projected_model_name))
-            self.output_dim = self.backbone.config.projection_dim
-        elif "siglip" in model_name_lower:
-            if pretrained:
-                self.backbone = SiglipTextModel.from_pretrained(model_name)
-            else:
-                self.backbone = SiglipTextModel(SiglipTextConfig.from_pretrained(model_name))
-            self.output_dim = self.backbone.config.hidden_size
-        elif "clip" in model_name_lower:
-            if pretrained:
-                self.backbone = CLIPTextModel.from_pretrained(model_name)
-            else:
-                self.backbone = CLIPTextModel(CLIPTextConfig.from_pretrained(model_name))
-            self.output_dim = self.backbone.config.hidden_size
-        else:
-            if pretrained:
-                self.backbone = AutoModel.from_pretrained(model_name)
-            else:
-                self.backbone = AutoModel.from_config(AutoConfig.from_pretrained(model_name))
-            hidden_size = getattr(self.backbone.config, "hidden_size", None)
-            if hidden_size is None:
-                raise ValueError(f"Unsupported text model config for {model_name}")
-            self.output_dim = hidden_size
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
-        if self.kind == "hf_clip_projected":
-            return out.text_embeds
-        if self.pooling == "mean":
-            mask = attention_mask.to(dtype=out.last_hidden_state.dtype).unsqueeze(-1)
-            summed = (out.last_hidden_state * mask).sum(dim=1)
-            denom = mask.sum(dim=1).clamp_min(1.0)
-            return summed / denom
-        if self.pooling in {"auto", "pooler"} and hasattr(out, "pooler_output") and out.pooler_output is not None:
-            return out.pooler_output
-        return out.last_hidden_state[:, 0]
-def _tokens_from_features(features: torch.Tensor | dict | tuple | list) -> torch.Tensor:
-    if isinstance(features, dict):
-        for key in ("x", "last_hidden_state", "features"):
-            if key in features:
-                features = features[key]
-                break
-        else:
-            features = next(iter(features.values()))
-    if isinstance(features, tuple | list):
-        features = features[0]
-    if not torch.is_tensor(features):
-        raise TypeError(f"Expected tensor features, got {type(features)!r}")
-    if features.ndim == 4:
-        return features.flatten(2).transpose(1, 2)
-    if features.ndim == 3:
-        return features
-    if features.ndim == 2:
-        return features[:, None, :]
-    raise ValueError(f"Unsupported feature tensor shape {tuple(features.shape)}")

hyper3_clip/models/experimental.py DELETED Viewed

@@ -1,587 +0,0 @@
-from __future__ import annotations
-from collections.abc import Callable
-import torch
-import torch.nn.functional as F
-from torch import Tensor, nn
-from hyper3_clip.models.lorentz import exp_map0, metric_pairwise_dist
-from hyper3_clip.models.losses import beta_cal_loss
-from hyper3_clip.models.tren import TRENRegionEncoder
-from hyper3_clip.training.distributed import gather_variable_with_grad, gather_with_grad, get_rank
-ProjectionHeadFactory = Callable[[int, int, int | None], nn.Module]
-class ExperimentalObjectiveMixin:
-    @staticmethod
-    def _validate_experimental_options(
-        *,
-        proclip_geometry: str,
-        proclip_projection_hidden_dim: int | None,
-        proclip_component_dim: int | None,
-        beta_clip_weight: float,
-        beta_clip_global_weight: float,
-        beta_clip_beta: float,
-        beta_clip_variant: str,
-        beta_clip_similarity: str,
-        beta_clip_num_heads: int,
-        beta_clip_mlp_ratio: float,
-        tren_weight: float,
-        tren_visual_distill_weight: float,
-        tren_text_distill_weight: float,
-        tren_region_text_weight: float,
-        tren_num_region_tokens: int,
-        tren_num_decoder_layers: int,
-        tren_num_attention_heads: int,
-        tren_prompt_grid_size: int,
-        tren_dropout: float,
-    ) -> None:
-        if proclip_geometry not in {"product", "hyperbolic", "euclidean", "spherical", "clip"}:
-            raise ValueError("proclip_geometry must be 'product', 'hyperbolic', 'euclidean', 'spherical', or 'clip'")
-        if proclip_projection_hidden_dim is not None and proclip_projection_hidden_dim <= 0:
-            raise ValueError("proclip_projection_hidden_dim must be positive when set")
-        if proclip_component_dim is not None and proclip_component_dim <= 0:
-            raise ValueError("proclip_component_dim must be positive when set")
-        if beta_clip_variant not in {"ce", "bce"}:
-            raise ValueError("beta_clip_variant must be 'ce' or 'bce'")
-        if beta_clip_similarity not in {"metric", "dot"}:
-            raise ValueError("beta_clip_similarity must be 'metric' or 'dot'")
-        if beta_clip_weight < 0.0:
-            raise ValueError("beta_clip_weight must be non-negative")
-        if beta_clip_global_weight < 0.0:
-            raise ValueError("beta_clip_global_weight must be non-negative")
-        if beta_clip_beta < 0.0:
-            raise ValueError("beta_clip_beta must be non-negative")
-        if beta_clip_num_heads <= 0:
-            raise ValueError("beta_clip_num_heads must be positive")
-        if beta_clip_mlp_ratio <= 0.0:
-            raise ValueError("beta_clip_mlp_ratio must be positive")
-        if tren_weight < 0.0:
-            raise ValueError("tren_weight must be non-negative")
-        if tren_visual_distill_weight < 0.0 or tren_text_distill_weight < 0.0 or tren_region_text_weight < 0.0:
-            raise ValueError("T-REN loss weights must be non-negative")
-        if tren_num_region_tokens <= 0:
-            raise ValueError("tren_num_region_tokens must be positive")
-        if tren_num_decoder_layers <= 0:
-            raise ValueError("tren_num_decoder_layers must be positive")
-        if tren_num_attention_heads <= 0:
-            raise ValueError("tren_num_attention_heads must be positive")
-        if tren_prompt_grid_size <= 0:
-            raise ValueError("tren_prompt_grid_size must be positive")
-        if tren_dropout < 0.0:
-            raise ValueError("tren_dropout must be non-negative")
-    def _init_experimental_modules(
-        self,
-        *,
-        beta_clip_num_heads: int,
-        beta_clip_mlp_ratio: float,
-        tren_num_region_tokens: int,
-        tren_num_decoder_layers: int,
-        tren_num_attention_heads: int,
-        tren_prompt_grid_size: int,
-        tren_dropout: float,
-        projection_hidden_dim: int | None,
-        proclip_projection_hidden_dim: int | None,
-        projection_head: ProjectionHeadFactory,
-    ) -> None:
-        if self.beta_query_pooling_enabled:
-            if self.vision_encoder.output_dim % beta_clip_num_heads != 0:
-                raise ValueError("vision encoder output_dim must be divisible by beta_clip_num_heads")
-            beta_clip_hidden_dim = max(1, int(round(self.vision_encoder.output_dim * beta_clip_mlp_ratio)))
-            self.beta_clip_text_query_proj = nn.Linear(self.text_encoder.output_dim, self.vision_encoder.output_dim)
-            self.beta_clip_cross_attention = nn.MultiheadAttention(
-                self.vision_encoder.output_dim,
-                beta_clip_num_heads,
-                batch_first=True,
-            )
-            self.beta_clip_mlp_norm = nn.LayerNorm(self.vision_encoder.output_dim)
-            self.beta_clip_pool_mlp = nn.Sequential(
-                nn.Linear(self.vision_encoder.output_dim, beta_clip_hidden_dim),
-                nn.GELU(),
-                nn.Linear(beta_clip_hidden_dim, self.vision_encoder.output_dim),
-            )
-        if self.beta_clip_enabled:
-            self.beta_clip_logit_scale = nn.Parameter(torch.tensor(1 / 0.07).log())
-        if self.tren_enabled:
-            self.tren_region_encoder = TRENRegionEncoder(
-                vision_dim=self.vision_encoder.output_dim,
-                text_dim=self.text_encoder.output_dim,
-                num_region_tokens=tren_num_region_tokens,
-                num_decoder_layers=tren_num_decoder_layers,
-                num_attention_heads=tren_num_attention_heads,
-                prompt_grid_size=tren_prompt_grid_size,
-                dropout=tren_dropout,
-            )
-            self.tren_logit_scale = nn.Parameter(torch.tensor(1 / 0.07).log())
-        if self.proclip_enabled:
-            component_dim = self._proclip_component_dim
-            spherical_dim = self._proclip_spherical_ambient_dim
-            proclip_hidden_dim = proclip_projection_hidden_dim
-            if proclip_hidden_dim is None:
-                proclip_hidden_dim = projection_hidden_dim
-            if self.proclip_dedicated_hyperbolic:
-                self.proclip_image_hyperbolic_proj = projection_head(
-                    self.vision_encoder.output_dim, self.embed_dim, proclip_hidden_dim
-                )
-                self.proclip_text_hyperbolic_proj = projection_head(
-                    self.text_encoder.output_dim, self.embed_dim, proclip_hidden_dim
-                )
-            self.proclip_image_euclidean_proj = projection_head(
-                self.vision_encoder.output_dim, component_dim, proclip_hidden_dim
-            )
-            self.proclip_text_euclidean_proj = projection_head(
-                self.text_encoder.output_dim, component_dim, proclip_hidden_dim
-            )
-            self.proclip_image_spherical_proj = projection_head(
-                self.vision_encoder.output_dim, spherical_dim, proclip_hidden_dim
-            )
-            self.proclip_text_spherical_proj = projection_head(
-                self.text_encoder.output_dim, spherical_dim, proclip_hidden_dim
-            )
-            self.proclip_logit_scale = nn.Parameter(torch.tensor(1 / 0.07).log())
-            self.proclip_log_weights = nn.Parameter(torch.zeros(3))
-    @property
-    def proclip_enabled(self) -> bool:
-        return (
-            self.objective_name == "proclip"
-            or self.proclip_component_dim is not None
-            or self.proclip_weight > 0.0
-            or self.proclip_retrieval
-        )
-    @property
-    def beta_clip_enabled(self) -> bool:
-        return self.beta_clip_weight > 0.0
-    @property
-    def beta_query_pooling_enabled(self) -> bool:
-        return self.beta_clip_enabled or (
-            self.objective_name == "uncha"
-            and self.uncha_entailment_loss in {"hier_beta_argent", "hier_beta_sourcepart_argent"}
-        )
-    @property
-    def tren_enabled(self) -> bool:
-        return self.tren_weight > 0.0
-    @property
-    def _proclip_component_dim(self) -> int:
-        return int(self.proclip_component_dim or self.embed_dim)
-    @property
-    def _proclip_spherical_ambient_dim(self) -> int:
-        return self._proclip_component_dim + 1
-    def _clamp_experimental_logit_scales(self) -> None:
-        if self.proclip_enabled:
-            self.proclip_logit_scale.clamp_(max=4.6052)
-        if self.beta_clip_enabled:
-            self.beta_clip_logit_scale.clamp_(max=4.6052)
-        if self.tren_enabled:
-            self.tren_logit_scale.clamp_(max=4.6052)
-    def _detached_experimental_logit_scales(self) -> dict[str, torch.Tensor]:
-        logs = {}
-        if self.proclip_enabled:
-            logs.update(self._detached_proclip_logs())
-        if self.beta_clip_enabled:
-            logs["beta_clip_logit_scale"] = self.beta_clip_logit_scale.exp().detach()
-        if self.tren_enabled:
-            logs["tren_logit_scale"] = self.tren_logit_scale.exp().detach()
-        return logs
-    def _beta_clip_global_contrastive_loss(
-        self,
-        *,
-        image_euc: torch.Tensor,
-        text_euc: torch.Tensor,
-        targets: torch.Tensor,
-    ) -> torch.Tensor:
-        image_feats = F.normalize(image_euc.float(), dim=-1)
-        text_feats = F.normalize(text_euc.float(), dim=-1)
-        all_image_feats = gather_with_grad(image_feats)
-        all_text_feats = gather_with_grad(text_feats)
-        if self.objective_name == "hycoclip":
-            scale = self.logit_scale.exp().clamp(max=100.0)
-        elif self.objective_name == "proclip":
-            scale = self.proclip_logit_scale.exp().clamp(max=100.0)
-        else:
-            scale = self.global_logit_scale.exp().clamp(max=100.0)
-        logits_i_t = image_feats @ all_text_feats.T * scale
-        logits_t_i = text_feats @ all_image_feats.T * scale
-        return 0.5 * (F.cross_entropy(logits_i_t, targets) + F.cross_entropy(logits_t_i, targets))
-    def _beta_query_entailment_embeddings(
-        self,
-        *,
-        image_tokens: torch.Tensor,
-        beta_query_input_ids: torch.Tensor | None,
-        beta_query_attention_mask: torch.Tensor | None,
-        beta_query_owner: torch.Tensor | None,
-        beta_query_parent: torch.Tensor | None,
-        beta_query_weight: torch.Tensor | None,
-        beta_query_source_part: torch.Tensor | None,
-        kappa: torch.Tensor,
-        query_base: torch.Tensor | None = None,
-    ) -> dict[str, torch.Tensor]:
-        if beta_query_input_ids is None or beta_query_attention_mask is None or beta_query_owner is None:
-            raise ValueError(f"{self.uncha_entailment_loss} requires beta query tensors from the collator")
-        if beta_query_parent is None or beta_query_weight is None:
-            raise ValueError(f"{self.uncha_entailment_loss} requires beta query hierarchy metadata from the collator")
-        if self.uncha_entailment_loss == "hier_beta_sourcepart_argent" and beta_query_source_part is None:
-            raise ValueError("hier_beta_sourcepart_argent requires beta_query_source_part from the collator")
-        if beta_query_input_ids.shape[0] == 0:
-            source_part = (
-                beta_query_source_part.to(device=image_tokens.device, dtype=torch.long)
-                if beta_query_source_part is not None
-                else beta_query_owner.new_zeros((0,), device=image_tokens.device, dtype=torch.long)
-            )
-            return {
-                "beta_query_image_feats": image_tokens.new_zeros((0, self.embed_dim)),
-                "beta_query_text_feats": image_tokens.new_zeros((0, self.embed_dim)),
-                "beta_query_owner": beta_query_owner.to(device=image_tokens.device, dtype=torch.long),
-                "beta_query_parent": beta_query_parent.to(device=image_tokens.device, dtype=torch.long),
-                "beta_query_weight": beta_query_weight.to(device=image_tokens.device, dtype=torch.float32),
-                "beta_query_source_part": source_part,
-            }
-        query_owner = beta_query_owner.to(device=image_tokens.device, dtype=torch.long)
-        if query_base is None:
-            query_base = self.encode_text_base(beta_query_input_ids, beta_query_attention_mask)
-        conditioned_image_base = self._beta_clip_text_conditioned_pool(image_tokens, query_base, query_owner)
-        query_image_euc = self.image_proj(conditioned_image_base)
-        query_text_euc = self.text_proj(query_base)
-        return {
-            "beta_query_image_feats": self.project_image_features(query_image_euc),
-            "beta_query_text_feats": self.project_text_features(query_text_euc),
-            "beta_query_owner": query_owner,
-            "beta_query_parent": beta_query_parent.to(device=image_tokens.device, dtype=torch.long),
-            "beta_query_weight": beta_query_weight.to(device=image_tokens.device, dtype=torch.float32),
-            **(
-                {"beta_query_source_part": beta_query_source_part.to(device=image_tokens.device, dtype=torch.long)}
-                if beta_query_source_part is not None
-                else {}
-            ),
-        }
-    def _beta_clip_auxiliary_loss(
-        self,
-        *,
-        image_tokens: torch.Tensor,
-        beta_query_input_ids: torch.Tensor | None,
-        beta_query_attention_mask: torch.Tensor | None,
-        beta_query_owner: torch.Tensor | None,
-        global_targets: torch.Tensor,
-        kappa: torch.Tensor,
-    ) -> torch.Tensor:
-        if beta_query_input_ids is None or beta_query_attention_mask is None or beta_query_owner is None:
-            raise ValueError("beta-CLIP auxiliary requires beta query tensors from the collator")
-        if beta_query_input_ids.shape[0] == 0:
-            return image_tokens.new_zeros(())
-        beta_query_owner = beta_query_owner.to(device=image_tokens.device, dtype=torch.long)
-        query_base = self.encode_text_base(beta_query_input_ids, beta_query_attention_mask)
-        conditioned_image_base = self._beta_clip_text_conditioned_pool(image_tokens, query_base, beta_query_owner)
-        query_image_euc = self.image_proj(conditioned_image_base)
-        query_text_euc = self.text_proj(query_base)
-        if self.beta_clip_similarity == "dot":
-            query_image_feats = F.normalize(query_image_euc.float(), dim=-1)
-            query_text_feats = F.normalize(query_text_euc.float(), dim=-1)
-        else:
-            query_image_feats = self.project_image_features(query_image_euc)
-            query_text_feats = self.project_text_features(query_text_euc)
-        all_query_image_feats, query_counts = gather_variable_with_grad(query_image_feats)
-        all_query_text_feats, _ = gather_variable_with_grad(query_text_feats)
-        query_offset = query_counts[: get_rank()].sum() if query_counts.numel() > 1 else query_counts.new_zeros(())
-        query_targets = torch.arange(query_image_feats.size(0), device=query_image_feats.device) + query_offset
-        query_group_ids = global_targets.index_select(0, beta_query_owner)
-        all_query_group_ids, _ = gather_variable_with_grad(query_group_ids)
-        scale = self.beta_clip_logit_scale.exp().clamp(max=100.0)
-        if self.beta_clip_similarity == "dot":
-            logits_i_t = query_image_feats @ all_query_text_feats.T * scale
-            logits_t_i = query_text_feats @ all_query_image_feats.T * scale
-        else:
-            logits_i_t = -metric_pairwise_dist(
-                query_image_feats,
-                all_query_text_feats,
-                kappa,
-                product_metric=self.phyclip_product_metric,
-            ) * scale
-            logits_t_i = -metric_pairwise_dist(
-                query_text_feats,
-                all_query_image_feats,
-                kappa,
-                product_metric=self.phyclip_product_metric,
-            ) * scale
-        return 0.5 * (
-            beta_cal_loss(
-                logits_i_t,
-                targets=query_targets,
-                group_ids=query_group_ids,
-                all_group_ids=all_query_group_ids,
-                beta=self.beta_clip_beta,
-                variant=self.beta_clip_variant,
-            )
-            + beta_cal_loss(
-                logits_t_i,
-                targets=query_targets,
-                group_ids=query_group_ids,
-                all_group_ids=all_query_group_ids,
-                beta=self.beta_clip_beta,
-                variant=self.beta_clip_variant,
-            )
-        )
-    def _beta_clip_text_conditioned_pool(
-        self,
-        image_tokens: torch.Tensor,
-        query_base: torch.Tensor,
-        query_owner: torch.Tensor,
-    ) -> torch.Tensor:
-        if image_tokens.ndim != 3:
-            raise ValueError("beta-CLIP image tokens must have shape [batch, tokens, dim]")
-        if getattr(self, "group_beta_query_pooling", False):
-            return self._beta_clip_text_conditioned_pool_grouped(image_tokens, query_base, query_owner)
-        if self.beta_clip_drop_cls_token and image_tokens.size(1) > 1:
-            image_tokens = image_tokens[:, 1:, :]
-        selected_tokens = image_tokens.index_select(0, query_owner).to(dtype=query_base.dtype)
-        query = self.beta_clip_text_query_proj(query_base).unsqueeze(1)
-        attended, _ = self.beta_clip_cross_attention(query, selected_tokens, selected_tokens, need_weights=False)
-        pooled = attended.squeeze(1)
-        return pooled + self.beta_clip_pool_mlp(self.beta_clip_mlp_norm(pooled))
-    def _beta_clip_text_conditioned_pool_grouped(
-        self,
-        image_tokens: torch.Tensor,
-        query_base: torch.Tensor,
-        query_owner: torch.Tensor,
-    ) -> torch.Tensor:
-        """Text-conditioned pooling that runs cross-attention once per image instead of once per query.
-
-        Queries are packed into a zero-padded ``[num_images, max_queries, dim]`` tensor
-        (one row per image), attended against that image's tokens in a single call,
-        and then gathered back into the original query order.
-
-        Raises:
-            IndexError: if ``query_owner`` holds an index outside ``[0, image_tokens.size(0))``.
-        """
-        # No queries: return an empty result without touching the attention module.
-        if query_owner.numel() == 0:
-            return query_base.new_zeros((0, self.vision_encoder.output_dim))
-        if query_owner.min().item() < 0 or query_owner.max().item() >= image_tokens.size(0):
-            raise IndexError("beta_query_owner contains an out-of-range image index")
-        # Optionally skip the first token of every image when more than one token exists.
-        tokens = image_tokens[:, 1:, :] if self.beta_clip_drop_cls_token and image_tokens.size(1) > 1 else image_tokens
-        tokens = tokens.to(dtype=query_base.dtype)
-        query_projected = self.beta_clip_text_query_proj(query_base)
-        # Number of queries owned by each image; the largest count sets the padded width.
-        counts = torch.bincount(query_owner, minlength=image_tokens.size(0))
-        max_queries = int(counts.max().item())
-        # Sort queries by owner so each owner's queries are contiguous, then subtract the
-        # owner's start offset to give every query a distinct slot within its owner's row.
-        order = torch.argsort(query_owner)
-        sorted_owner = query_owner.index_select(0, order)
-        owner_offsets = torch.zeros_like(counts)
-        owner_offsets[1:] = counts.cumsum(0)[:-1]
-        sorted_positions = torch.arange(query_owner.numel(), device=query_owner.device) - owner_offsets.index_select(
-            0, sorted_owner
-        )
-        # Undo the sort so positions[i] is the slot of the i-th query in its original order.
-        positions = torch.empty_like(sorted_positions)
-        positions[order] = sorted_positions
-        # Scatter queries into the padded layout; unused slots stay zero.
-        packed_query = query_projected.new_zeros((image_tokens.size(0), max_queries, query_projected.size(-1)))
-        packed_query[query_owner, positions] = query_projected
-        # presumably the attention module is batch-first, so row b attends over tokens[b] — TODO confirm
-        attended, _ = self.beta_clip_cross_attention(packed_query, tokens, tokens, need_weights=False)
-        # Gather only the real (non-padding) slots, restoring the original query order.
-        pooled = attended[query_owner, positions]
-        return pooled + self.beta_clip_pool_mlp(self.beta_clip_mlp_norm(pooled))
-    def _tren_auxiliary_losses(
-        self,
-        *,
-        image_tokens: torch.Tensor,
-        part_owner: torch.Tensor,
-        part_image_base: torch.Tensor,
-        part_text_base: torch.Tensor,
-    ) -> dict[str, torch.Tensor]:
-        zero = image_tokens.new_zeros(())
-        if part_owner.numel() == 0:
-            return {
-                "tren_loss": zero,
-                "tren_visual_distill_loss": zero,
-                "tren_text_distill_loss": zero,
-                "tren_region_text_contrastive_loss": zero,
-                "tren_assignment_count": part_owner.new_tensor(0),
-            }
-        tren_outputs = self.tren_region_encoder(image_tokens)
-        visual_tokens = tren_outputs["visual_tokens"].flatten(1, 2)
-        text_tokens = tren_outputs["text_aligned_tokens"].flatten(1, 2)
-        matched_visual: list[torch.Tensor] = []
-        matched_text: list[torch.Tensor] = []
-        target_visual: list[torch.Tensor] = []
-        target_text: list[torch.Tensor] = []
-        for owner in range(image_tokens.size(0)):
-            region_mask = part_owner == owner
-            if not bool(region_mask.any()):
-                continue
-            owner_target_visual = part_image_base[region_mask].detach()
-            owner_target_text = part_text_base[region_mask].detach()
-            owner_visual_tokens = visual_tokens[owner]
-            owner_text_tokens = text_tokens[owner]
-            pred_indices, target_indices = _greedy_region_assignment(owner_visual_tokens, owner_target_visual)
-            if pred_indices.numel() == 0:
-                continue
-            matched_visual.append(owner_visual_tokens.index_select(0, pred_indices))
-            matched_text.append(owner_text_tokens.index_select(0, pred_indices))
-            target_visual.append(owner_target_visual.index_select(0, target_indices))
-            target_text.append(owner_target_text.index_select(0, target_indices))
-        if not matched_visual:
-            return {
-                "tren_loss": zero,
-                "tren_visual_distill_loss": zero,
-                "tren_text_distill_loss": zero,
-                "tren_region_text_contrastive_loss": zero,
-                "tren_assignment_count": part_owner.new_tensor(0),
-            }
-        matched_visual_tensor = torch.cat(matched_visual, dim=0)
-        matched_text_tensor = torch.cat(matched_text, dim=0)
-        target_visual_tensor = torch.cat(target_visual, dim=0)
-        target_text_tensor = torch.cat(target_text, dim=0)
-        visual_distill = 1.0 - F.cosine_similarity(matched_visual_tensor, target_visual_tensor, dim=-1).mean()
-        text_distill = 1.0 - F.cosine_similarity(matched_text_tensor, target_text_tensor, dim=-1).mean()
-        region_text = _symmetric_dot_contrastive(
-            matched_text_tensor,
-            target_text_tensor,
-            scale=self.tren_logit_scale.exp().clamp(max=100.0),
-        )
-        total = (
-            self.tren_visual_distill_weight * visual_distill
-            + self.tren_text_distill_weight * text_distill
-            + self.tren_region_text_weight * region_text
-        )
-        return {
-            "tren_loss": total,
-            "tren_visual_distill_loss": visual_distill,
-            "tren_text_distill_loss": text_distill,
-            "tren_region_text_contrastive_loss": region_text,
-            "tren_assignment_count": part_owner.new_tensor(matched_visual_tensor.size(0)),
-        }
-    def _project_proclip_image_base(self, base_feats: torch.Tensor, hyperbolic: torch.Tensor) -> torch.Tensor:
-        if self.proclip_geometry == "clip":
-            return F.normalize(base_feats.float(), dim=-1)
-        if self.proclip_dedicated_hyperbolic:
-            hyperbolic = exp_map0(self.proclip_image_hyperbolic_proj(base_feats.float()), self._kappa().float())
-        return self._pack_proclip_features(
-            hyperbolic=hyperbolic,
-            euclidean=self.proclip_image_euclidean_proj(base_feats.float()),
-            spherical=self.proclip_image_spherical_proj(base_feats.float()),
-        )
-    def _project_proclip_text_base(self, base_feats: torch.Tensor, hyperbolic: torch.Tensor) -> torch.Tensor:
-        if self.proclip_geometry == "clip":
-            return F.normalize(base_feats.float(), dim=-1)
-        if self.proclip_dedicated_hyperbolic:
-            hyperbolic = exp_map0(self.proclip_text_hyperbolic_proj(base_feats.float()), self._kappa().float())
-        return self._pack_proclip_features(
-            hyperbolic=hyperbolic,
-            euclidean=self.proclip_text_euclidean_proj(base_feats.float()),
-            spherical=self.proclip_text_spherical_proj(base_feats.float()),
-        )
-    def _pack_proclip_features(self, hyperbolic: torch.Tensor, euclidean: torch.Tensor, spherical: torch.Tensor) -> torch.Tensor:
-        spherical = F.normalize(spherical.float(), dim=-1)
-        if self.proclip_geometry == "hyperbolic":
-            return hyperbolic.float()
-        if self.proclip_geometry == "euclidean":
-            return euclidean.float()
-        if self.proclip_geometry == "spherical":
-            return spherical
-        return torch.cat([hyperbolic.float(), euclidean.float(), spherical], dim=-1)
-    def _split_proclip_features(self, feats: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        hyperbolic_dim = self.embed_dim + 1
-        component_dim = self._proclip_component_dim
-        spherical_dim = self._proclip_spherical_ambient_dim
-        hyperbolic = feats[:, :hyperbolic_dim]
-        euclidean = feats[:, hyperbolic_dim : hyperbolic_dim + component_dim]
-        spherical = feats[:, hyperbolic_dim + component_dim : hyperbolic_dim + component_dim + spherical_dim]
-        return hyperbolic, euclidean, spherical
-    def _proclip_similarity_scores(self, image_feats: torch.Tensor, text_feats: torch.Tensor) -> torch.Tensor:
-        if self.proclip_geometry == "clip":
-            return image_feats.float() @ text_feats.float().T
-        if self.proclip_geometry == "hyperbolic":
-            return -metric_pairwise_dist(image_feats, text_feats, self._kappa()).square()
-        if self.proclip_geometry == "euclidean":
-            return -torch.cdist(image_feats.float(), text_feats.float(), p=2).square()
-        if self.proclip_geometry == "spherical":
-            dot = (image_feats.float() @ text_feats.float().T).clamp(min=-1.0 + 1e-6, max=1.0 - 1e-6)
-            return -torch.acos(dot).square()
-        image_hyp, image_euc, image_sph = self._split_proclip_features(image_feats)
-        text_hyp, text_euc, text_sph = self._split_proclip_features(text_feats)
-        weights = self.proclip_log_weights.exp().to(device=image_feats.device, dtype=torch.float32)
-        hyperbolic_dist2 = metric_pairwise_dist(image_hyp, text_hyp, self._kappa()).square()
-        euclidean_dist2 = torch.cdist(image_euc.float(), text_euc.float(), p=2).square()
-        spherical_dot = (image_sph.float() @ text_sph.float().T).clamp(min=-1.0 + 1e-6, max=1.0 - 1e-6)
-        spherical_dist2 = torch.acos(spherical_dot).square()
-        return -(weights[0] * hyperbolic_dist2 + weights[1] * euclidean_dist2 + weights[2] * spherical_dist2)
-    def _proclip_contrastive_loss(
-        self,
-        image_feats: torch.Tensor,
-        text_feats: torch.Tensor,
-        all_image_feats: torch.Tensor,
-        all_text_feats: torch.Tensor,
-        targets: torch.Tensor,
-    ) -> torch.Tensor:
-        scale = self.proclip_logit_scale.exp().clamp(max=100.0)
-        logits_i_t = self._proclip_similarity_scores(image_feats, all_text_feats) * scale
-        logits_t_i = self._proclip_similarity_scores(text_feats, all_image_feats) * scale
-        return 0.5 * (F.cross_entropy(logits_i_t, targets) + F.cross_entropy(logits_t_i, targets))
-    def _detached_proclip_logs(self) -> dict[str, torch.Tensor]:
-        weights = self.proclip_log_weights.exp().detach()
-        return {
-            "proclip_logit_scale": self.proclip_logit_scale.exp().detach(),
-            "proclip_hyperbolic_weight": weights[0],
-            "proclip_euclidean_weight": weights[1],
-            "proclip_spherical_weight": weights[2],
-        }
-def _greedy_region_assignment(pred_tokens: torch.Tensor, target_tokens: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    if pred_tokens.numel() == 0 or target_tokens.numel() == 0:
-        empty = torch.zeros((0,), dtype=torch.long, device=pred_tokens.device)
-        return empty, empty
-    similarities = F.normalize(pred_tokens.float(), dim=-1) @ F.normalize(target_tokens.float(), dim=-1).T
-    pair_scores = similarities.flatten()
-    order = torch.argsort(pair_scores, descending=True)
-    used_pred = torch.zeros(pred_tokens.size(0), dtype=torch.bool, device=pred_tokens.device)
-    used_target = torch.zeros(target_tokens.size(0), dtype=torch.bool, device=pred_tokens.device)
-    pred_indices: list[torch.Tensor] = []
-    target_indices: list[torch.Tensor] = []
-    for flat_index in order:
-        pred_index = torch.div(flat_index, target_tokens.size(0), rounding_mode="floor")
-        target_index = flat_index % target_tokens.size(0)
-        if used_pred[pred_index] or used_target[target_index]:
-            continue
-        used_pred[pred_index] = True
-        used_target[target_index] = True
-        pred_indices.append(pred_index)
-        target_indices.append(target_index)
-        if len(target_indices) == target_tokens.size(0):
-            break
-    if not pred_indices:
-        empty = torch.zeros((0,), dtype=torch.long, device=pred_tokens.device)
-        return empty, empty
-    return torch.stack(pred_indices), torch.stack(target_indices)
-def _symmetric_dot_contrastive(region_tokens: torch.Tensor, text_tokens: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
-    if region_tokens.size(0) == 1:
-        return region_tokens.new_zeros(())
-    region_tokens = F.normalize(region_tokens.float(), dim=-1)
-    text_tokens = F.normalize(text_tokens.float(), dim=-1)
-    logits = region_tokens @ text_tokens.T * scale
-    targets = torch.arange(logits.size(0), device=logits.device)
-    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.T, targets))

hyper3_clip/models/himo.py DELETED Viewed

@@ -1,55 +0,0 @@
-from __future__ import annotations
-import torch
-from torch import Tensor
-def hide_reconstruct_embeddings(
-    embeddings: Tensor,
-    *,
-    variance_threshold: float = 0.9,
-    detach_pca: bool = True,
-    eps: float = 1e-8,
-) -> Tensor:
-    """HiMo-CLIP HiDe: PCA-reconstruct embeddings using top principal components.
-    Given a batch of embeddings ``U ∈ R^{B×D}``, compute mean-centered embeddings,
-    perform SVD/PCA, choose the smallest number of components whose cumulative
-    explained variance exceeds ``variance_threshold``, and reconstruct each
-    embedding from this principal subspace:
-        u'_i = P^T (P (u_i - ū)) + ū
-    where P stacks the selected principal components as rows.
-    """
-    if embeddings.ndim != 2:
-        raise ValueError("hide_reconstruct_embeddings expects a [batch, dim] tensor")
-    if not (0.0 < variance_threshold <= 1.0):
-        raise ValueError("variance_threshold must be in (0, 1]")
-    if embeddings.size(0) < 2:
-        return embeddings
-    u = embeddings.to(dtype=torch.float32)
-    mean = u.mean(dim=0, keepdim=True)
-    centered = u - mean
-    if detach_pca:
-        centered_for_pca = centered.detach()
-    else:
-        centered_for_pca = centered
-    # SVD: centered = U S Vh, principal components are rows of Vh.
-    _, s, vh = torch.linalg.svd(centered_for_pca, full_matrices=False)
-    if s.numel() == 0 or float((s.square().sum()).item()) <= eps:
-        return embeddings
-    explained = s.square()
-    cumulative = explained.cumsum(dim=0) / explained.sum().clamp_min(eps)
-    m = int((cumulative >= variance_threshold).to(dtype=torch.int64).argmax().item()) + 1
-    m = max(1, min(m, vh.size(0)))
-    p = vh[:m]
-    if detach_pca:
-        p = p.detach()
-    recon = (centered @ p.T) @ p + mean
-    return recon.to(dtype=embeddings.dtype)

hyper3_clip/models/hyper3_clip.py DELETED Viewed

@@ -1,958 +0,0 @@
-from __future__ import annotations
-import torch
-import torch.nn.functional as F
-from torch import nn
-from hyper3_clip.models.encoders import TextEncoder, VisionEncoder
-from hyper3_clip.models.experimental import ExperimentalObjectiveMixin
-from hyper3_clip.models.himo import hide_reconstruct_embeddings
-from hyper3_clip.models.lorentz import exp_map0, metric_similarity
-from hyper3_clip.models.objectives import build_objective
-from hyper3_clip.training.distributed import (
-    gather_with_grad,
-    get_rank,
-    get_world_size,
-    local_target_indices,
-)
-class Hyper3CLIP(ExperimentalObjectiveMixin, nn.Module):
-    def __init__(
-        self,
-        vision_backbone: str,
-        text_model_name: str,
-        embed_dim: int,
-        curv_init: float,
-        learn_curv: bool,
-        entail_weight: float,
-        inter_aperture_scale: float,
-        intra_aperture_scale: float,
-        objective: str = "hycoclip",
-        uncha_piecewise_factor: float = 0.1,
-        uncha_calibration_alpha: float = 10.0,
-        uncha_stop_grad_calibration: bool = True,
-        vision_pretrained: bool = True,
-        text_pretrained: bool = True,
-        text_pooling: str = "auto",
-        freeze_vision_encoder: bool = False,
-        freeze_text_encoder: bool = False,
-        normalize_encoder_features: bool = False,
-        projection_hidden_dim: int | None = None,
-        uncha_entailment_geometry: str = "lorentz",
-        uncha_aggregate_weight: float = 0.0,
-        uncha_entailment_loss: str = "piecewise",
-        uncha_argent_beta: float = 1.0,
-        uncha_argent_norm_weight: float = 0.0,
-        uncha_argent_aux_weight: float = 0.5,
-        uncha_argent_aggregation: str = "uncha",
-        uncha_part_weight_power: float = 0.0,
-        uncha_contrastive_loss: str = "ce",
-        uncha_sigmoid_bias_init: float = -10.0,
-        uncha_sigmoid_negative_weight: float = 1.0,
-        uncha_part_quality_mode: str = "none",
-        uncha_part_quality_topk: int = 5,
-        uncha_part_quality_temperature: float = 4.0,
-        uncha_entailment_warmup_steps: int = 0,
-        uncha_contrastive_global_weight: float = 1.0,
-        uncha_contrastive_local_weight: float = 1.0,
-        uncha_contrastive_global_local_weight: float = 1.0,
-        uncha_global_local_mode: str = "repeat",
-        uncha_global_local_metric: str = "distance",
-        uncha_global_local_angle_aux_weight: float = 0.0,
-        uncha_global_local_angle_aux_mode: str = "contrastive",
-        uncha_global_local_angle_aux_scale: float = 5.5,
-        uncha_global_local_angle_aux_aperture_scale: float = 1.0,
-        uncha_beta_cal_beta: float = 0.0,
-        uncha_beta_cal_variant: str = "ce",
-        uncha_beta_cal_weight: float = 0.0,
-        uncha_himo_component_weight: float = 0.0,
-        uncha_himo_variance_threshold: float = 0.9,
-        uncha_himo_detach_pca: bool = True,
-        uncha_radius_order_weight: float = 0.0,
-        uncha_radius_order_margin: float = 0.0,
-        uncha_gramian_align_weight: float = 0.0,
-        phyclip_subspace_dim: int | None = None,
-        phyclip_product_metric: str = "l1",
-        proclip_weight: float = 0.0,
-        proclip_component_dim: int | None = None,
-        proclip_retrieval: bool = False,
-        proclip_geometry: str = "product",
-        proclip_dedicated_hyperbolic: bool = False,
-        proclip_projection_hidden_dim: int | None = None,
-        beta_clip_weight: float = 0.0,
-        beta_clip_global_weight: float = 0.0,
-        beta_clip_beta: float = 0.5,
-        beta_clip_variant: str = "ce",
-        beta_clip_similarity: str = "metric",
-        beta_clip_num_heads: int = 8,
-        beta_clip_mlp_ratio: float = 4.0,
-        beta_clip_drop_cls_token: bool = True,
-        tren_weight: float = 0.0,
-        tren_visual_distill_weight: float = 1.0,
-        tren_text_distill_weight: float = 1.0,
-        tren_region_text_weight: float = 1.0,
-        tren_num_region_tokens: int = 3,
-        tren_num_decoder_layers: int = 2,
-        tren_num_attention_heads: int = 8,
-        tren_prompt_grid_size: int = 7,
-        tren_dropout: float = 0.1,
-        fuse_whole_part_encoder_forwards: bool = False,
-        fuse_beta_query_encoder_forwards: bool = False,
-        group_beta_query_pooling: bool = False,
-        objective_autocast_dtype: str = "float32",
-    ) -> None:
-        """Validate the configuration and build encoders, projection heads, learnable scales and the objective.
-
-        ``objective`` selects one of ``"hycoclip"``, ``"uncha"`` or ``"proclip"``. The
-        ``uncha_*`` options are forwarded to ``build_objective``; the ``proclip_*``,
-        ``beta_clip_*`` and ``tren_*`` options are validated and turned into modules
-        by helpers defined outside this method.
-
-        Raises:
-            ValueError: if any option checked below has an unsupported value.
-        """
-        super().__init__()
-        # Validate option values up front, before any encoder is constructed.
-        if objective not in {"hycoclip", "uncha", "proclip"}:
-            raise ValueError(f"Unsupported objective {objective!r}; expected 'hycoclip', 'uncha', or 'proclip'")
-        if phyclip_product_metric not in {"l1", "l2"}:
-            raise ValueError("phyclip_product_metric must be 'l1' or 'l2'")
-        # Checks for the ProCLIP, beta-CLIP and TREN options live in a helper that is
-        # not defined here (presumably on ExperimentalObjectiveMixin).
-        self._validate_experimental_options(
-            proclip_geometry=proclip_geometry,
-            proclip_projection_hidden_dim=proclip_projection_hidden_dim,
-            proclip_component_dim=proclip_component_dim,
-            beta_clip_weight=beta_clip_weight,
-            beta_clip_global_weight=beta_clip_global_weight,
-            beta_clip_beta=beta_clip_beta,
-            beta_clip_variant=beta_clip_variant,
-            beta_clip_similarity=beta_clip_similarity,
-            beta_clip_num_heads=beta_clip_num_heads,
-            beta_clip_mlp_ratio=beta_clip_mlp_ratio,
-            tren_weight=tren_weight,
-            tren_visual_distill_weight=tren_visual_distill_weight,
-            tren_text_distill_weight=tren_text_distill_weight,
-            tren_region_text_weight=tren_region_text_weight,
-            tren_num_region_tokens=tren_num_region_tokens,
-            tren_num_decoder_layers=tren_num_decoder_layers,
-            tren_num_attention_heads=tren_num_attention_heads,
-            tren_prompt_grid_size=tren_prompt_grid_size,
-            tren_dropout=tren_dropout,
-        )
-        # NOTE(review): the aliases 'fp32', 'fp16' and 'bf16' are accepted here but the
-        # error message lists only the long names.
-        if objective_autocast_dtype not in {"float32", "fp32", "float16", "fp16", "bfloat16", "bf16"}:
-            raise ValueError("objective_autocast_dtype must be one of 'float32', 'float16', or 'bfloat16'")
-        if uncha_contrastive_loss not in {"ce", "sigmoid", "siglip", "siglip_metric"}:
-            raise ValueError("uncha_contrastive_loss must be 'ce', 'sigmoid', 'siglip', or 'siglip_metric'")
-        if uncha_global_local_metric not in {"distance", "angle"}:
-            raise ValueError("uncha_global_local_metric must be 'distance' or 'angle'")
-        if uncha_global_local_angle_aux_mode not in {"contrastive", "positive_hinge"}:
-            raise ValueError("uncha_global_local_angle_aux_mode must be 'contrastive' or 'positive_hinge'")
-        if uncha_global_local_angle_aux_weight < 0.0:
-            raise ValueError("uncha_global_local_angle_aux_weight must be non-negative")
-        if uncha_global_local_angle_aux_scale <= 0.0:
-            raise ValueError("uncha_global_local_angle_aux_scale must be positive")
-        if uncha_global_local_angle_aux_aperture_scale <= 0.0:
-            raise ValueError("uncha_global_local_angle_aux_aperture_scale must be positive")
-        if uncha_entailment_warmup_steps < 0:
-            raise ValueError("uncha_entailment_warmup_steps must be non-negative")
-        # Store the configuration on the module; weights and flags are coerced to float/bool.
-        self.objective_name = objective
-        self.uncha_contrastive_loss = uncha_contrastive_loss
-        self.uncha_entailment_loss = uncha_entailment_loss
-        self.uncha_entailment_warmup_steps = uncha_entailment_warmup_steps
-        self.uncha_himo_component_weight = float(uncha_himo_component_weight)
-        self.uncha_himo_variance_threshold = float(uncha_himo_variance_threshold)
-        self.uncha_himo_detach_pca = bool(uncha_himo_detach_pca)
-        self.proclip_weight = float(proclip_weight)
-        self.proclip_retrieval = bool(proclip_retrieval)
-        self.proclip_geometry = proclip_geometry
-        self.proclip_dedicated_hyperbolic = bool(proclip_dedicated_hyperbolic)
-        self.beta_clip_weight = float(beta_clip_weight)
-        self.beta_clip_global_weight = float(beta_clip_global_weight)
-        self.beta_clip_beta = float(beta_clip_beta)
-        self.beta_clip_variant = beta_clip_variant
-        self.beta_clip_similarity = beta_clip_similarity
-        self.beta_clip_drop_cls_token = bool(beta_clip_drop_cls_token)
-        self.tren_weight = float(tren_weight)
-        self.tren_visual_distill_weight = float(tren_visual_distill_weight)
-        self.tren_text_distill_weight = float(tren_text_distill_weight)
-        self.tren_region_text_weight = float(tren_region_text_weight)
-        self.fuse_whole_part_encoder_forwards = bool(fuse_whole_part_encoder_forwards)
-        self.fuse_beta_query_encoder_forwards = bool(fuse_beta_query_encoder_forwards)
-        self.group_beta_query_pooling = bool(group_beta_query_pooling)
-        self.objective_autocast_dtype = objective_autocast_dtype
-        self.freeze_vision_encoder = bool(freeze_vision_encoder)
-        self.freeze_text_encoder = bool(freeze_text_encoder)
-        self.normalize_encoder_features = bool(normalize_encoder_features)
-        self.phyclip_subspace_dim = phyclip_subspace_dim
-        self.phyclip_product_metric = phyclip_product_metric
-        self.proclip_component_dim = proclip_component_dim
-        if projection_hidden_dim is not None and projection_hidden_dim <= 0:
-            raise ValueError("projection_hidden_dim must be positive when set")
-        # proclip_enabled is not defined in this method — presumably a property supplied
-        # by ExperimentalObjectiveMixin; confirm it only reads attributes assigned above.
-        if self.proclip_enabled and phyclip_subspace_dim is not None:
-            raise ValueError("ProCLIP mixed-curvature proxy cannot be combined with PHyCLIP Lorentz factors")
-        # PHyCLIP mode: embed_dim must divide evenly into phyclip_subspace_dim-wide
-        # factors. A factor count of 0 is stored when the mode is off.
-        if phyclip_subspace_dim is not None:
-            if phyclip_subspace_dim <= 0:
-                raise ValueError("phyclip_subspace_dim must be positive when set")
-            if embed_dim % phyclip_subspace_dim != 0:
-                raise ValueError("embed_dim must be divisible by phyclip_subspace_dim")
-            self.phyclip_num_factors = embed_dim // phyclip_subspace_dim
-        else:
-            self.phyclip_num_factors = 0
-        # Backbone encoders; the tokenizer is re-exported from the text encoder.
-        self.vision_encoder = VisionEncoder(vision_backbone, pretrained=vision_pretrained)
-        self.text_encoder = TextEncoder(text_model_name, pretrained=text_pretrained, pooling=text_pooling)
-        self.tokenizer = self.text_encoder.tokenizer
-        self.embed_dim = embed_dim
-        # Frozen encoders get no gradients and are put in eval mode; train() re-applies
-        # eval mode to them on every mode switch.
-        if self.freeze_vision_encoder:
-            self.vision_encoder.requires_grad_(False)
-            self.vision_encoder.eval()
-        if self.freeze_text_encoder:
-            self.text_encoder.requires_grad_(False)
-            self.text_encoder.eval()
-        # Heads from each encoder's output_dim to embed_dim (_projection_head is defined
-        # elsewhere in the file; presumably an MLP when projection_hidden_dim is set).
-        self.image_proj = _projection_head(self.vision_encoder.output_dim, embed_dim, projection_hidden_dim)
-        self.text_proj = _projection_head(self.text_encoder.output_dim, embed_dim, projection_hidden_dim)
-        # Modules for the beta-CLIP / TREN / ProCLIP options are created by a helper
-        # defined outside this method.
-        self._init_experimental_modules(
-            beta_clip_num_heads=beta_clip_num_heads,
-            beta_clip_mlp_ratio=beta_clip_mlp_ratio,
-            tren_num_region_tokens=tren_num_region_tokens,
-            tren_num_decoder_layers=tren_num_decoder_layers,
-            tren_num_attention_heads=tren_num_attention_heads,
-            tren_prompt_grid_size=tren_prompt_grid_size,
-            tren_dropout=tren_dropout,
-            projection_hidden_dim=projection_hidden_dim,
-            proclip_projection_hidden_dim=proclip_projection_hidden_dim,
-            projection_head=_projection_head,
-        )
-        # Learnable logit scales stored in log space, initialised to log(1 / temperature).
-        # The 'proclip' objective creates none of these here.
-        if objective == "hycoclip":
-            self.logit_scale = nn.Parameter(torch.tensor(1 / 0.07).log())
-        elif objective == "uncha":
-            self.global_logit_scale = nn.Parameter(torch.tensor(1 / 0.07).log())
-            self.local_logit_scale = nn.Parameter(torch.tensor(1 / 0.05).log())
-            self.global_local_logit_scale = nn.Parameter(torch.tensor(1 / 0.06).log())
-            # The sigmoid-style contrastive losses additionally get a learnable bias per scale.
-            if uncha_contrastive_loss in {"sigmoid", "siglip", "siglip_metric"}:
-                self.global_logit_bias = nn.Parameter(torch.tensor(float(uncha_sigmoid_bias_init)))
-                self.local_logit_bias = nn.Parameter(torch.tensor(float(uncha_sigmoid_bias_init)))
-                self.global_local_logit_bias = nn.Parameter(torch.tensor(float(uncha_sigmoid_bias_init)))
-        # Per-modality feature scales in log space, initialised to log(dim ** -0.5):
-        # one entry per factor in PHyCLIP mode, a scalar otherwise.
-        alpha_dim = phyclip_subspace_dim or embed_dim
-        alpha_shape = (self.phyclip_num_factors,) if self.phyclip_enabled else ()
-        self.visual_alpha = nn.Parameter(torch.full(alpha_shape, alpha_dim**-0.5).log())
-        self.textual_alpha = nn.Parameter(torch.full(alpha_shape, alpha_dim**-0.5).log())
-        # Curvature is stored in log space and is trainable only when learn_curv is set;
-        # _kappa() clamps it to [curv_init / 10, curv_init * 10].
-        curv_shape = (self.phyclip_num_factors,) if self.phyclip_enabled else ()
-        log_curv = torch.full(curv_shape, curv_init).log()
-        self.log_curv = nn.Parameter(log_curv, requires_grad=learn_curv)
-        self.curv_min = curv_init / 10.0
-        self.curv_max = curv_init * 10.0
-        # The 'proclip' objective does not go through build_objective, so self.objective
-        # stays None for it.
-        self.objective = None
-        if objective != "proclip":
-            self.objective = build_objective(
-                objective=objective,
-                entail_weight=entail_weight,
-                inter_aperture_scale=inter_aperture_scale,
-                intra_aperture_scale=intra_aperture_scale,
-                uncha_piecewise_factor=uncha_piecewise_factor,
-                uncha_calibration_alpha=uncha_calibration_alpha,
-                uncha_stop_grad_calibration=uncha_stop_grad_calibration,
-                uncha_entailment_geometry=uncha_entailment_geometry,
-                uncha_aggregate_weight=uncha_aggregate_weight,
-                uncha_entailment_loss=uncha_entailment_loss,
-                uncha_argent_beta=uncha_argent_beta,
-                uncha_argent_norm_weight=uncha_argent_norm_weight,
-                uncha_argent_aux_weight=uncha_argent_aux_weight,
-                uncha_argent_aggregation=uncha_argent_aggregation,
-                uncha_part_weight_power=uncha_part_weight_power,
-                uncha_contrastive_loss=uncha_contrastive_loss,
-                uncha_sigmoid_negative_weight=uncha_sigmoid_negative_weight,
-                uncha_part_quality_mode=uncha_part_quality_mode,
-                uncha_part_quality_topk=uncha_part_quality_topk,
-                uncha_part_quality_temperature=uncha_part_quality_temperature,
-                uncha_contrastive_global_weight=uncha_contrastive_global_weight,
-                uncha_contrastive_local_weight=uncha_contrastive_local_weight,
-                uncha_contrastive_global_local_weight=uncha_contrastive_global_local_weight,
-                uncha_global_local_mode=uncha_global_local_mode,
-                uncha_global_local_metric=uncha_global_local_metric,
-                uncha_global_local_angle_aux_weight=uncha_global_local_angle_aux_weight,
-                uncha_global_local_angle_aux_mode=uncha_global_local_angle_aux_mode,
-                uncha_global_local_angle_aux_scale=uncha_global_local_angle_aux_scale,
-                uncha_global_local_angle_aux_aperture_scale=uncha_global_local_angle_aux_aperture_scale,
-                uncha_beta_cal_beta=uncha_beta_cal_beta,
-                uncha_beta_cal_variant=uncha_beta_cal_variant,
-                uncha_beta_cal_weight=uncha_beta_cal_weight,
-                uncha_himo_component_weight=uncha_himo_component_weight,
-                uncha_radius_order_weight=uncha_radius_order_weight,
-                uncha_radius_order_margin=uncha_radius_order_margin,
-                uncha_gramian_align_weight=uncha_gramian_align_weight,
-                product_metric=phyclip_product_metric,
-            )
-    def train(self, mode: bool = True) -> Hyper3CLIP:
-        super().train(mode)
-        if self.freeze_vision_encoder:
-            self.vision_encoder.eval()
-        if self.freeze_text_encoder:
-            self.text_encoder.eval()
-        return self
-    @property
-    def phyclip_enabled(self) -> bool:
-        return self.phyclip_subspace_dim is not None
-    def _kappa(self) -> torch.Tensor:
-        return self.log_curv.exp().clamp(min=self.curv_min, max=self.curv_max)
-    def encode_image(self, image: torch.Tensor, project: bool = True) -> torch.Tensor:
-        feats = self.image_proj(self.encode_image_base(image))
-        if not project:
-            return feats
-        return self.project_image_features(feats)
-    def encode_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, project: bool = True) -> torch.Tensor:
-        feats = self.text_proj(self.encode_text_base(input_ids, attention_mask))
-        if not project:
-            return feats
-        return self.project_text_features(feats)
-    def encode_image_base(self, image: torch.Tensor) -> torch.Tensor:
-        with torch.set_grad_enabled(self.training and not self.freeze_vision_encoder):
-            feats = self.vision_encoder(image)
-        feats = feats.detach() if self.freeze_vision_encoder else feats
-        return F.normalize(feats.float(), dim=-1) if self.normalize_encoder_features else feats
-    def encode_image_base_with_tokens(self, image: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-        with torch.set_grad_enabled(self.training and not self.freeze_vision_encoder):
-            feats, tokens = self.vision_encoder.forward_with_tokens(image)
-        if self.freeze_vision_encoder:
-            feats = feats.detach()
-            tokens = tokens.detach()
-        if self.normalize_encoder_features:
-            feats = F.normalize(feats.float(), dim=-1)
-        return feats, tokens
-    def encode_text_base(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        with torch.set_grad_enabled(self.training and not self.freeze_text_encoder):
-            feats = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
-        feats = feats.detach() if self.freeze_text_encoder else feats
-        return F.normalize(feats.float(), dim=-1) if self.normalize_encoder_features else feats
-    def project_image_features(self, feats: torch.Tensor) -> torch.Tensor:
-        if self.phyclip_enabled:
-            return self._project_product_features(feats, self.visual_alpha)
-        return exp_map0(feats.float() * self.visual_alpha.exp().float(), self._kappa().float())
-    def project_text_features(self, feats: torch.Tensor) -> torch.Tensor:
-        if self.phyclip_enabled:
-            return self._project_product_features(feats, self.textual_alpha)
-        return exp_map0(feats.float() * self.textual_alpha.exp().float(), self._kappa().float())
-    def similarity_scores(self, image_feats: torch.Tensor, text_feats: torch.Tensor) -> torch.Tensor:
-        return metric_similarity(image_feats, text_feats, self._kappa(), product_metric=self.phyclip_product_metric)
-    def encode_retrieval_image(self, image: torch.Tensor) -> torch.Tensor:
-        base = self.encode_image_base(image)
-        tangent = self.image_proj(base)
-        if self.proclip_retrieval:
-            return self._project_proclip_image_base(base, self.project_image_features(tangent))
-        return self.project_image_features(tangent)
-    def encode_retrieval_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        base = self.encode_text_base(input_ids, attention_mask)
-        tangent = self.text_proj(base)
-        if self.proclip_retrieval:
-            return self._project_proclip_text_base(base, self.project_text_features(tangent))
-        return self.project_text_features(tangent)
-    def retrieval_similarity_scores(self, image_feats: torch.Tensor, text_feats: torch.Tensor) -> torch.Tensor:
-        if self.proclip_retrieval:
-            return self._proclip_similarity_scores(image_feats, text_feats)
-        return self.similarity_scores(image_feats, text_feats)
-    @property
-    def retrieval_requires_chunking(self) -> bool:
-        return self.phyclip_enabled or self.proclip_retrieval
-    def _objective_autocast(self, device_type: str):
-        dtype = {
-            "float32": torch.float32,
-            "fp32": torch.float32,
-            "float16": torch.float16,
-            "fp16": torch.float16,
-            "bfloat16": torch.bfloat16,
-            "bf16": torch.bfloat16,
-        }[self.objective_autocast_dtype]
-        enabled = device_type != "cpu" and dtype is not torch.float32
-        return torch.autocast(device_type=device_type, dtype=dtype, enabled=enabled)
-    def forward(
-        self,
-        image: torch.Tensor,
-        part_images: torch.Tensor,
-        text_input_ids: torch.Tensor,
-        text_attention_mask: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-        part_owner: torch.Tensor,
-        step: int | None = None,
-        beta_query_input_ids: torch.Tensor | None = None,
-        beta_query_attention_mask: torch.Tensor | None = None,
-        beta_query_owner: torch.Tensor | None = None,
-        beta_query_type: torch.Tensor | None = None,
-        beta_query_parent: torch.Tensor | None = None,
-        beta_query_weight: torch.Tensor | None = None,
-        beta_query_source_part: torch.Tensor | None = None,
-    ) -> dict[str, torch.Tensor]:
-        with torch.no_grad():
-            self._clamp_logit_scales()
-            self.visual_alpha.clamp_(max=0.0)
-            self.textual_alpha.clamp_(max=0.0)
-        kappa = self._kappa()
-        feature_dim = self.embed_dim
-        beta_image_tokens = None
-        beta_query_base = None
-        part_image_base = part_images.new_zeros((0, self.vision_encoder.output_dim))
-        part_text_base = part_images.new_zeros((0, self.text_encoder.output_dim))
-        hier_beta_enabled = self.objective_name == "uncha" and self.uncha_entailment_loss in {
-            "hier_beta_argent",
-            "hier_beta_sourcepart_argent",
-        }
-        if (
-            hier_beta_enabled
-            and self.fuse_beta_query_encoder_forwards
-            and not self.tren_enabled
-            and beta_query_input_ids is not None
-            and beta_query_attention_mask is not None
-            and part_images.shape[0] > 0
-        ):
-            (
-                image_base,
-                text_base,
-                image_euc,
-                text_euc,
-                image_feats,
-                text_feats,
-                part_image_feats,
-                part_text_feats,
-                part_image_euc,
-                part_text_euc,
-                part_image_base,
-                part_text_base,
-                beta_image_tokens,
-                beta_query_base,
-            ) = self._encode_hier_beta_whole_parts_and_queries(
-                image=image,
-                part_images=part_images,
-                text_input_ids=text_input_ids,
-                text_attention_mask=text_attention_mask,
-                part_text_input_ids=part_text_input_ids,
-                part_text_attention_mask=part_text_attention_mask,
-                beta_query_input_ids=beta_query_input_ids,
-                beta_query_attention_mask=beta_query_attention_mask,
-            )
-        elif self.beta_query_pooling_enabled or self.tren_enabled:
-            image_base, beta_image_tokens = self.encode_image_base_with_tokens(image)
-            text_base = self.encode_text_base(text_input_ids, text_attention_mask)
-            image_euc = self.image_proj(image_base)
-            text_euc = self.text_proj(text_base)
-            image_feats = self.project_image_features(image_euc)
-            text_feats = self.project_text_features(text_euc)
-            (
-                part_image_feats,
-                part_text_feats,
-                part_image_euc,
-                part_text_euc,
-                part_image_base,
-                part_text_base,
-            ) = self._encode_parts_with_base(
-                part_images=part_images,
-                part_text_input_ids=part_text_input_ids,
-                part_text_attention_mask=part_text_attention_mask,
-                feature_dim=feature_dim,
-            )
-        elif self.fuse_whole_part_encoder_forwards and self.objective_name != "proclip" and part_images.shape[0] > 0:
-            (
-                image_base,
-                text_base,
-                image_euc,
-                text_euc,
-                image_feats,
-                text_feats,
-                part_image_feats,
-                part_text_feats,
-                part_image_euc,
-                part_text_euc,
-                part_image_base,
-                part_text_base,
-            ) = self._encode_whole_and_parts(
-                image=image,
-                part_images=part_images,
-                text_input_ids=text_input_ids,
-                text_attention_mask=text_attention_mask,
-                part_text_input_ids=part_text_input_ids,
-                part_text_attention_mask=part_text_attention_mask,
-            )
-        else:
-            image_base = self.encode_image_base(image)
-            text_base = self.encode_text_base(text_input_ids, text_attention_mask)
-            image_euc = self.image_proj(image_base)
-            text_euc = self.text_proj(text_base)
-            image_feats = self.project_image_features(image_euc)
-            text_feats = self.project_text_features(text_euc)
-            (
-                part_image_feats,
-                part_text_feats,
-                part_image_euc,
-                part_text_euc,
-                part_image_base,
-                part_text_base,
-            ) = self._encode_parts_with_base(
-                part_images=part_images,
-                part_text_input_ids=part_text_input_ids,
-                part_text_attention_mask=part_text_attention_mask,
-                feature_dim=feature_dim,
-            )
-        targets = local_target_indices(image_feats.size(0), image_feats.device)
-        if self.objective_name == "proclip":
-            proclip_image_feats = self._project_proclip_image_base(image_base, image_feats)
-            proclip_text_feats = self._project_proclip_text_base(text_base, text_feats)
-            proclip_loss = self._proclip_contrastive_loss(
-                image_feats=proclip_image_feats,
-                text_feats=proclip_text_feats,
-                all_image_feats=gather_with_grad(proclip_image_feats),
-                all_text_feats=gather_with_grad(proclip_text_feats),
-                targets=targets,
-            )
-            zero = proclip_loss.new_zeros(())
-            return {
-                "loss": proclip_loss,
-                "contrastive_loss": proclip_loss,
-                "entailment_loss": zero,
-                "part_count": part_owner.new_tensor(0),
-                "proclip_contrastive_loss": proclip_loss,
-                **self._detached_kappa_logs(kappa),
-                **self._detached_logit_scales(),
-            }
-        himo_text_feats = None
-        all_himo_text_feats = None
-        if self.objective_name == "uncha" and self.uncha_himo_component_weight > 0.0:
-            all_text_euc = gather_with_grad(text_euc)
-            all_component_euc = hide_reconstruct_embeddings(
-                all_text_euc,
-                variance_threshold=self.uncha_himo_variance_threshold,
-                detach_pca=self.uncha_himo_detach_pca,
-            )
-            if get_world_size() > 1:
-                start = text_euc.size(0) * get_rank()
-                end = start + text_euc.size(0)
-                component_euc = all_component_euc[start:end]
-            else:
-                component_euc = all_component_euc
-            himo_text_feats = self.project_text_features(component_euc)
-            all_himo_text_feats = gather_with_grad(himo_text_feats)
-        all_image_feats = gather_with_grad(image_feats)
-        all_text_feats = gather_with_grad(text_feats)
-        all_image_euc = None
-        all_text_euc = None
-        if self.objective_name == "uncha" and self.uncha_contrastive_loss == "siglip":
-            all_image_euc = gather_with_grad(image_euc)
-            all_text_euc = gather_with_grad(text_euc)
-        part_owner = part_owner.to(device=image_feats.device, dtype=torch.long)
-        beta_query_embeddings = {}
-        if self.objective_name == "uncha" and self.uncha_entailment_loss in {
-            "hier_beta_argent",
-            "hier_beta_sourcepart_argent",
-        }:
-            if beta_image_tokens is None:
-                raise RuntimeError(f"{self.uncha_entailment_loss} requires image patch tokens")
-            with torch.autocast(device_type=image.device.type, enabled=False):
-                beta_query_embeddings = self._beta_query_entailment_embeddings(
-                    image_tokens=beta_image_tokens.float(),
-                    beta_query_input_ids=beta_query_input_ids,
-                    beta_query_attention_mask=beta_query_attention_mask,
-                    beta_query_owner=beta_query_owner,
-                    beta_query_parent=beta_query_parent,
-                    beta_query_weight=beta_query_weight,
-                    beta_query_source_part=beta_query_source_part,
-                    kappa=kappa.float(),
-                    query_base=beta_query_base,
-                )
-        with self._objective_autocast(image.device.type):
-            if self.objective is None:
-                raise RuntimeError("Non-ProCLIP forward requires an objective module")
-            losses = self.objective(
-                {
-                    "image_feats": image_feats,
-                    "text_feats": text_feats,
-                    "part_image_feats": part_image_feats,
-                    "part_text_feats": part_text_feats,
-                    "part_owner": part_owner,
-                    "all_image_feats": all_image_feats,
-                    "all_text_feats": all_text_feats,
-                    **(
-                        {
-                            "image_euc_feats": image_euc,
-                            "text_euc_feats": text_euc,
-                            "part_image_euc_feats": part_image_euc,
-                            "part_text_euc_feats": part_text_euc,
-                            "all_image_euc_feats": all_image_euc,
-                            "all_text_euc_feats": all_text_euc,
-                        }
-                        if all_image_euc is not None and all_text_euc is not None
-                        else {}
-                    ),
-                    "targets": targets,
-                    "kappa": kappa,
-                    "entail_weight_scale": self._entail_weight_scale(step, image_feats.device),
-                    **beta_query_embeddings,
-                    **(
-                        {
-                            "himo_text_feats": himo_text_feats,
-                            "all_himo_text_feats": all_himo_text_feats,
-                        }
-                        if himo_text_feats is not None
-                        else {}
-                    ),
-                },
-                self._objective_logit_scales(),
-            )
-        if self.beta_clip_global_weight > 0.0:
-            with torch.autocast(device_type=image.device.type, enabled=False):
-                beta_clip_global_loss = self._beta_clip_global_contrastive_loss(
-                    image_euc=image_euc,
-                    text_euc=text_euc,
-                    targets=targets,
-                )
-            losses = {
-                **losses,
-                "loss": losses["loss"] + self.beta_clip_global_weight * beta_clip_global_loss,
-                "beta_clip_global_loss": beta_clip_global_loss,
-            }
-        if self.beta_clip_enabled:
-            if beta_image_tokens is None:
-                raise RuntimeError("beta-CLIP auxiliary requires image patch tokens")
-            with torch.autocast(device_type=image.device.type, enabled=False):
-                beta_clip_loss = self._beta_clip_auxiliary_loss(
-                    image_tokens=beta_image_tokens.float(),
-                    beta_query_input_ids=beta_query_input_ids,
-                    beta_query_attention_mask=beta_query_attention_mask,
-                    beta_query_owner=beta_query_owner,
-                    global_targets=targets,
-                    kappa=kappa.float(),
-                )
-            losses = {
-                **losses,
-                "loss": losses["loss"] + self.beta_clip_weight * beta_clip_loss,
-                "beta_clip_loss": beta_clip_loss,
-            }
-        if self.tren_enabled:
-            if beta_image_tokens is None:
-                raise RuntimeError("T-REN auxiliary requires image patch tokens")
-            with torch.autocast(device_type=image.device.type, enabled=False):
-                tren_losses = self._tren_auxiliary_losses(
-                    image_tokens=beta_image_tokens.float(),
-                    part_owner=part_owner,
-                    part_image_base=part_image_base.float(),
-                    part_text_base=part_text_base.float(),
-                )
-            losses = {
-                **losses,
-                "loss": losses["loss"] + self.tren_weight * tren_losses["tren_loss"],
-                **tren_losses,
-            }
-        if self.proclip_enabled and self.proclip_weight > 0.0:
-            proclip_image_feats = self._project_proclip_image_base(image_base, image_feats)
-            proclip_text_feats = self._project_proclip_text_base(text_base, text_feats)
-            proclip_loss = self._proclip_contrastive_loss(
-                image_feats=proclip_image_feats,
-                text_feats=proclip_text_feats,
-                all_image_feats=gather_with_grad(proclip_image_feats),
-                all_text_feats=gather_with_grad(proclip_text_feats),
-                targets=targets,
-            )
-            losses = {
-                **losses,
-                "loss": losses["loss"] + self.proclip_weight * proclip_loss,
-                "proclip_contrastive_loss": proclip_loss,
-            }
-        return {**losses, **self._detached_kappa_logs(kappa), **self._detached_logit_scales()}
-    def _encode_parts(
-        self,
-        part_images: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-        feature_dim: int,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        if part_images.shape[0] == 0:
-            empty = part_images.new_zeros((0, feature_dim))
-            return empty, empty, empty, empty
-        part_image_euc = self.image_proj(self.encode_image_base(part_images))
-        part_text_euc = self.text_proj(self.encode_text_base(part_text_input_ids, part_text_attention_mask))
-        part_image_feats = self.project_image_features(part_image_euc)
-        part_text_feats = self.project_text_features(part_text_euc)
-        return part_image_feats, part_text_feats, part_image_euc, part_text_euc
-    def _encode_parts_with_base(
-        self,
-        part_images: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-        feature_dim: int,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        if part_images.shape[0] == 0:
-            empty = part_images.new_zeros((0, feature_dim))
-            empty_image_base = part_images.new_zeros((0, self.vision_encoder.output_dim))
-            empty_text_base = part_images.new_zeros((0, self.text_encoder.output_dim))
-            return empty, empty, empty, empty, empty_image_base, empty_text_base
-        part_image_base = self.encode_image_base(part_images)
-        part_text_base = self.encode_text_base(part_text_input_ids, part_text_attention_mask)
-        part_image_euc = self.image_proj(part_image_base)
-        part_text_euc = self.text_proj(part_text_base)
-        part_image_feats = self.project_image_features(part_image_euc)
-        part_text_feats = self.project_text_features(part_text_euc)
-        return part_image_feats, part_text_feats, part_image_euc, part_text_euc, part_image_base, part_text_base
-    def _encode_whole_and_parts(
-        self,
-        image: torch.Tensor,
-        part_images: torch.Tensor,
-        text_input_ids: torch.Tensor,
-        text_attention_mask: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-    ) -> tuple[
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-    ]:
-        batch_size = image.shape[0]
-        part_count = part_images.shape[0]
-        image_base_all = self.encode_image_base(torch.cat([image, part_images], dim=0))
-        image_euc_all = self.image_proj(image_base_all)
-        image_feats_all = self.project_image_features(image_euc_all)
-        text_ids, text_mask = self._concat_text_batches(
-            text_input_ids,
-            text_attention_mask,
-            part_text_input_ids,
-            part_text_attention_mask,
-        )
-        text_base_all = self.encode_text_base(text_ids, text_mask)
-        text_euc_all = self.text_proj(text_base_all)
-        text_feats_all = self.project_text_features(text_euc_all)
-        image_base, part_image_base = image_base_all.split([batch_size, part_count], dim=0)
-        text_base, part_text_base = text_base_all.split([batch_size, part_count], dim=0)
-        image_euc, part_image_euc = image_euc_all.split([batch_size, part_count], dim=0)
-        text_euc, part_text_euc = text_euc_all.split([batch_size, part_count], dim=0)
-        image_feats, part_image_feats = image_feats_all.split([batch_size, part_count], dim=0)
-        text_feats, part_text_feats = text_feats_all.split([batch_size, part_count], dim=0)
-        return (
-            image_base,
-            text_base,
-            image_euc,
-            text_euc,
-            image_feats,
-            text_feats,
-            part_image_feats,
-            part_text_feats,
-            part_image_euc,
-            part_text_euc,
-            part_image_base,
-            part_text_base,
-        )
-    def _encode_hier_beta_whole_parts_and_queries(
-        self,
-        image: torch.Tensor,
-        part_images: torch.Tensor,
-        text_input_ids: torch.Tensor,
-        text_attention_mask: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-        beta_query_input_ids: torch.Tensor,
-        beta_query_attention_mask: torch.Tensor,
-    ) -> tuple[
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-    ]:
-        batch_size = image.shape[0]
-        part_count = part_images.shape[0]
-        query_count = beta_query_input_ids.shape[0]
-        image_base_all, image_tokens_all = self.encode_image_base_with_tokens(torch.cat([image, part_images], dim=0))
-        image_euc_all = self.image_proj(image_base_all)
-        image_feats_all = self.project_image_features(image_euc_all)
-        image_base, part_image_base = image_base_all.split([batch_size, part_count], dim=0)
-        image_euc, part_image_euc = image_euc_all.split([batch_size, part_count], dim=0)
-        image_feats, part_image_feats = image_feats_all.split([batch_size, part_count], dim=0)
-        beta_image_tokens = image_tokens_all[:batch_size]
-        text_ids, text_mask = self._concat_text_batch_list(
-            (text_input_ids, text_attention_mask),
-            (part_text_input_ids, part_text_attention_mask),
-            (beta_query_input_ids, beta_query_attention_mask),
-        )
-        text_base_all = self.encode_text_base(text_ids, text_mask)
-        text_euc_all = self.text_proj(text_base_all)
-        text_feats_all = self.project_text_features(text_euc_all)
-        text_base, part_text_base, beta_query_base = text_base_all.split([batch_size, part_count, query_count], dim=0)
-        text_euc, part_text_euc, _ = text_euc_all.split([batch_size, part_count, query_count], dim=0)
-        text_feats, part_text_feats, _ = text_feats_all.split([batch_size, part_count, query_count], dim=0)
-        return (
-            image_base,
-            text_base,
-            image_euc,
-            text_euc,
-            image_feats,
-            text_feats,
-            part_image_feats,
-            part_text_feats,
-            part_image_euc,
-            part_text_euc,
-            part_image_base,
-            part_text_base,
-            beta_image_tokens,
-            beta_query_base,
-        )
-    def _concat_text_batches(
-        self,
-        text_input_ids: torch.Tensor,
-        text_attention_mask: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        return self._concat_text_batch_list(
-            (text_input_ids, text_attention_mask),
-            (part_text_input_ids, part_text_attention_mask),
-        )
-    def _concat_text_batch_list(
-        self,
-        *batches: tuple[torch.Tensor, torch.Tensor],
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        target_length = max(input_ids.shape[1] for input_ids, _ in batches)
-        pad_token_id = self.text_encoder.tokenizer.pad_token_id
-        if pad_token_id is None:
-            pad_token_id = 0
-        return (
-            torch.cat([_pad_sequence_dim(input_ids, target_length, pad_token_id) for input_ids, _ in batches], dim=0),
-            torch.cat([_pad_sequence_dim(attention_mask, target_length, 0) for _, attention_mask in batches], dim=0),
-        )
-    def _clamp_logit_scales(self) -> None:
-        if self.objective_name == "proclip":
-            self.proclip_logit_scale.clamp_(max=4.6052)
-            self._clamp_experimental_logit_scales()
-            return
-        if self.objective_name == "hycoclip":
-            self.logit_scale.clamp_(max=4.6052)
-            self._clamp_experimental_logit_scales()
-            return
-        self.global_logit_scale.clamp_(max=4.6052)
-        self.local_logit_scale.clamp_(max=4.6052)
-        self.global_local_logit_scale.clamp_(max=4.6052)
-        self._clamp_experimental_logit_scales()
-    def _objective_logit_scales(self) -> torch.Tensor | dict[str, torch.Tensor]:
-        if self.objective_name == "hycoclip":
-            return self.logit_scale
-        if self.objective_name == "proclip":
-            return self.proclip_logit_scale
-        return {
-            "global": self.global_logit_scale,
-            "local": self.local_logit_scale,
-            "global_local": self.global_local_logit_scale,
-            **(
-                {
-                    "global_bias": self.global_logit_bias,
-                    "local_bias": self.local_logit_bias,
-                    "global_local_bias": self.global_local_logit_bias,
-                }
-                if self.uncha_contrastive_loss in {"sigmoid", "siglip", "siglip_metric"}
-                else {}
-            ),
-        }
-    def _detached_logit_scales(self) -> dict[str, torch.Tensor]:
-        if self.objective_name == "proclip":
-            return self._detached_experimental_logit_scales()
-        if self.objective_name == "hycoclip":
-            logs = {"logit_scale": self.logit_scale.exp().detach()}
-            logs.update(self._detached_experimental_logit_scales())
-            return logs
-        logs = {
-            "global_logit_scale": self.global_logit_scale.exp().detach(),
-            "local_logit_scale": self.local_logit_scale.exp().detach(),
-            "global_local_logit_scale": self.global_local_logit_scale.exp().detach(),
-        }
-        if self.uncha_contrastive_loss in {"sigmoid", "siglip", "siglip_metric"}:
-            logs.update(
-                {
-                    "global_logit_bias": self.global_logit_bias.detach(),
-                    "local_logit_bias": self.local_logit_bias.detach(),
-                    "global_local_logit_bias": self.global_local_logit_bias.detach(),
-                }
-            )
-        logs.update(self._detached_experimental_logit_scales())
-        return logs
-    def _project_product_features(self, feats: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
-        product_feats = feats.float().reshape(feats.size(0), self.phyclip_num_factors, self.phyclip_subspace_dim)
-        product_feats = product_feats * alpha.exp().float().view(1, -1, 1)
-        return exp_map0(product_feats, self._kappa().float().view(1, -1, 1))
-    def _detached_kappa_logs(self, kappa: torch.Tensor) -> dict[str, torch.Tensor]:
-        detached = kappa.detach()
-        if detached.numel() == 1:
-            return {"kappa": detached.reshape(())}
-        return {
-            "kappa": detached.mean(),
-            "kappa_min": detached.min(),
-            "kappa_max": detached.max(),
-        }
-    def _entail_weight_scale(self, step: int | None, device: torch.device) -> torch.Tensor:
-        if self.uncha_entailment_warmup_steps <= 0 or step is None:
-            return torch.ones((), device=device)
-        scale = min(1.0, float(step + 1) / float(self.uncha_entailment_warmup_steps))
-        return torch.tensor(scale, device=device)
-def _projection_head(input_dim: int, output_dim: int, hidden_dim: int | None) -> nn.Module:
-    if hidden_dim is None:
-        return nn.Linear(input_dim, output_dim)
-    return nn.Sequential(
-        nn.Linear(input_dim, hidden_dim),
-        nn.ReLU(),
-        nn.Linear(hidden_dim, output_dim),
-    )
-def _pad_sequence_dim(tensor: torch.Tensor, target_length: int, value: int) -> torch.Tensor:
-    pad = target_length - tensor.shape[1]
-    if pad <= 0:
-        return tensor
-    return F.pad(tensor, (0, pad), value=value)

hyper3_clip/models/lorentz.py DELETED Viewed

@@ -1,265 +0,0 @@
-from __future__ import annotations
-import math
-import torch
-from torch import Tensor
-def lorentz_inner(x: Tensor, y: Tensor) -> Tensor:
-    """Compute batched Lorentzian inner product for matching rows."""
-    x = x.float()
-    y = y.float()
-    return -x[..., 0] * y[..., 0] + (x[..., 1:] * y[..., 1:]).sum(dim=-1)
-def pairwise_lorentz_inner(x: Tensor, y: Tensor) -> Tensor:
-    """Compute all-pairs Lorentzian inner products."""
-    x = x.float()
-    y = y.float()
-    time = -x[:, :1] @ y[:, :1].T
-    space = x[:, 1:] @ y[:, 1:].T
-    return time + space
-def exp_map0(u: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """Exponential map at the origin from tangent space to hyperboloid."""
-    u = u.float()
-    kappa = kappa.float()
-    sqrt_k = torch.sqrt(kappa)
-    norm_u = torch.linalg.norm(u, dim=-1, keepdim=True).clamp_min(eps)
-    scaled = sqrt_k * norm_u
-    clipped_scaled = scaled.clamp_max(math.asinh(2**15))
-    time = torch.cosh(clipped_scaled) / sqrt_k
-    space = torch.sinh(clipped_scaled) * u / scaled.clamp_min(eps)
-    return torch.cat([time, space], dim=-1)
-def log_map0(x: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """Logarithmic map at the origin from hyperboloid to tangent space.
-    Inverts ``exp_map0`` for points on the Lorentz model hyperboloid. Returns
-    vectors in the Euclidean tangent space at the origin (no time coordinate).
-    """
-    x = x.float()
-    dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-    kappa = kappa.to(dtype=torch.float32).flatten()
-    if x.dim() == 2:
-        if kappa.numel() != 1:
-            raise ValueError("log_map0 expects scalar kappa for non-product embeddings")
-        sqrt_k = torch.sqrt(kappa.reshape(()))
-        alpha = torch.acosh((sqrt_k * x[:, 0]).clamp_min(1.0 + dist_eps))
-        coef = alpha / torch.sinh(alpha).clamp_min(dist_eps)
-        return x[:, 1:] * coef.unsqueeze(-1)
-    if x.dim() == 3:
-        if kappa.numel() == 1:
-            kappa = kappa.expand(x.shape[1])
-        if kappa.numel() != x.shape[1]:
-            raise ValueError(f"Expected {x.shape[1]} curvatures for product space, got {kappa.numel()}")
-        sqrt_k = torch.sqrt(kappa).view(1, -1)
-        alpha = torch.acosh((sqrt_k * x[..., 0]).clamp_min(1.0 + dist_eps))
-        coef = alpha / torch.sinh(alpha).clamp_min(dist_eps)
-        return x[..., 1:] * coef.unsqueeze(-1)
-    raise ValueError("log_map0 expects [batch, dim + 1] or [batch, factors, dim + 1] tensors")
-def pairwise_dist(x: Tensor, y: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """Pairwise geodesic distance on the Lorentz model."""
-    kappa = kappa.float()
-    dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-    prod = (-kappa) * pairwise_lorentz_inner(x, y)
-    prod = prod.clamp_min(1.0 + dist_eps)
-    return torch.acosh(prod) / torch.sqrt(kappa)
-def product_pairwise_dist(
-    x: Tensor,
-    y: Tensor,
-    kappa: Tensor,
-    metric: str = "l1",
-    eps: float = 1e-8,
-) -> Tensor:
-    """Pairwise distance in an l1/l2 product of Lorentz factors.
-    Inputs have shape ``[batch, factors, dim + 1]``. For ``metric="l1"``, this
-    matches the official PHyCLIP implementation's mean distance over factors.
-    """
-    if x.dim() != 3 or y.dim() != 3:
-        raise ValueError("product_pairwise_dist expects [batch, factors, dim + 1] tensors")
-    if x.shape[1] != y.shape[1] or x.shape[2] != y.shape[2]:
-        raise ValueError("Product Lorentz tensors must have matching factor and feature dimensions")
-    kappa = _product_kappa(kappa, x.shape[1], x.device).to(dtype=torch.float32)
-    dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-    x = x.float()
-    y = y.float()
-    inner = -x[:, None, :, 0] * y[None, :, :, 0] + torch.einsum("bkd,nkd->bnk", x[..., 1:], y[..., 1:])
-    prod = (-kappa.view(1, 1, -1)) * inner
-    dist = torch.acosh(prod.clamp_min(1.0 + dist_eps)) / torch.sqrt(kappa).view(1, 1, -1)
-    if metric == "l1":
-        return dist.mean(dim=-1)
-    if metric == "l2":
-        return dist.square().mean(dim=-1).sqrt()
-    raise ValueError(f"Unsupported product metric {metric!r}; expected 'l1' or 'l2'")
-def metric_pairwise_dist(x: Tensor, y: Tensor, kappa: Tensor, product_metric: str = "l1") -> Tensor:
-    """Pairwise distance for either a single Lorentz space or a product space."""
-    if x.dim() == 3 or y.dim() == 3:
-        return product_pairwise_dist(x, y, kappa, metric=product_metric)
-    return pairwise_dist(x, y, kappa)
-def paired_dist(x: Tensor, y: Tensor, kappa: Tensor, product_metric: str = "l1", eps: float = 1e-8) -> Tensor:
-    """Row-wise distance for either a single Lorentz space or a product space."""
-    if x.dim() == 3 or y.dim() == 3:
-        if x.shape != y.shape:
-            raise ValueError("Product paired_dist expects matching tensor shapes")
-        kappa = _product_kappa(kappa, x.shape[1], x.device).to(dtype=torch.float32)
-        dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-        x = x.float()
-        y = y.float()
-        inner = -x[..., 0] * y[..., 0] + (x[..., 1:] * y[..., 1:]).sum(dim=-1)
-        prod = (-kappa.view(1, -1)) * inner
-        dist = torch.acosh(prod.clamp_min(1.0 + dist_eps)) / torch.sqrt(kappa).view(1, -1)
-        if product_metric == "l1":
-            return dist.mean(dim=-1)
-        if product_metric == "l2":
-            return dist.square().mean(dim=-1).sqrt()
-        raise ValueError(f"Unsupported product metric {product_metric!r}; expected 'l1' or 'l2'")
-    kappa = kappa.float()
-    dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-    prod = (-kappa) * lorentz_inner(x, y)
-    prod = prod.clamp_min(1.0 + dist_eps)
-    return torch.acosh(prod) / torch.sqrt(kappa)
-def radial_distance(x: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """Geodesic distance from the origin.
-    For points on the hyperboloid, the time coordinate satisfies
-    ``x0 = cosh(sqrt(kappa) * r) / sqrt(kappa)``, so we can recover the radial
-    distance via ``r = arcosh(sqrt(kappa) * x0) / sqrt(kappa)``.
-    """
-    dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-    x = x.float()
-    kappa = kappa.to(dtype=torch.float32).flatten()
-    if x.dim() == 2:
-        if kappa.numel() != 1:
-            raise ValueError("radial_distance expects scalar kappa for non-product embeddings")
-        sqrt_k = torch.sqrt(kappa.reshape(()))
-        arg = (sqrt_k * x[:, 0]).clamp_min(1.0 + dist_eps)
-        return torch.acosh(arg) / sqrt_k
-    if x.dim() == 3:
-        if kappa.numel() == 1:
-            kappa = kappa.expand(x.shape[1])
-        if kappa.numel() != x.shape[1]:
-            raise ValueError(f"Expected {x.shape[1]} curvatures for product space, got {kappa.numel()}")
-        sqrt_k = torch.sqrt(kappa).view(1, -1)
-        arg = (sqrt_k * x[..., 0]).clamp_min(1.0 + dist_eps)
-        dist = torch.acosh(arg) / sqrt_k
-        return dist.mean(dim=-1)
-    raise ValueError("radial_distance expects [batch, dim + 1] or [batch, factors, dim + 1] tensors")
-def metric_similarity(x: Tensor, y: Tensor, kappa: Tensor, product_metric: str = "l1") -> Tensor:
-    """Retrieval/classification similarity for single-space and PHyCLIP-style models."""
-    if x.dim() == 3 or y.dim() == 3:
-        return -product_pairwise_dist(x, y, kappa, metric=product_metric)
-    return pairwise_lorentz_inner(x, y)
-def half_aperture(general: Tensor, kappa: Tensor, min_radius: float = 0.1, eps: float = 1e-8) -> Tensor:
-    """Cone half-aperture for entailment cone centered at general concept."""
-    general = general.float()
-    kappa = kappa.float()
-    aperture_eps = max(eps, 16.0 * torch.finfo(general.dtype).eps)
-    general_norm = torch.linalg.norm(general[:, 1:], dim=-1)
-    ratio = (2.0 * min_radius) / (general_norm * torch.sqrt(kappa) + aperture_eps)
-    ratio = ratio.clamp(max=1.0 - aperture_eps)
-    return torch.asin(ratio)
-def oxy_angle(specific: Tensor, general: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """Exterior angle between specific point and entailment cone at general point."""
-    specific = specific.float()
-    general = general.float()
-    kappa = kappa.float()
-    angle_eps = max(eps, 16.0 * torch.finfo(specific.dtype).eps)
-    inner = lorentz_inner(specific, general)
-    numerator = specific[:, 0] + kappa * inner * general[:, 0]
-    general_norm = torch.linalg.norm(general[:, 1:], dim=-1).clamp_min(angle_eps)
-    denom_term = (kappa * inner).pow(2) - 1.0
-    denom = general_norm * torch.sqrt(denom_term.clamp_min(angle_eps))
-    cosine = (numerator / denom).clamp(min=-1.0 + angle_eps, max=1.0 - angle_eps)
-    return torch.acos(cosine)
-def pairwise_oxy_angle(specific: Tensor, general: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """All-pairs exterior angle between specific points and entailment cones at general points."""
-    specific = specific.float()
-    general = general.float()
-    kappa = kappa.to(dtype=torch.float32).flatten()
-    if kappa.numel() != 1:
-        raise ValueError("pairwise_oxy_angle expects scalar kappa for non-product embeddings")
-    kappa_scalar = kappa.reshape(())
-    angle_eps = max(eps, 16.0 * torch.finfo(specific.dtype).eps)
-    inner = -specific[:, None, 0] * general[None, :, 0] + torch.einsum("nd,md->nm", specific[:, 1:], general[:, 1:])
-    numerator = specific[:, None, 0] + kappa_scalar * inner * general[None, :, 0]
-    general_norm = torch.linalg.norm(general[:, 1:], dim=-1).clamp_min(angle_eps)
-    denom_term = (kappa_scalar * inner).pow(2) - 1.0
-    denom = general_norm[None, :] * torch.sqrt(denom_term.clamp_min(angle_eps))
-    cosine = (numerator / denom).clamp(min=-1.0 + angle_eps, max=1.0 - angle_eps)
-    return torch.acos(cosine)
-def product_pairwise_oxy_angle(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    metric: str = "l1",
-    eps: float = 1e-8,
-) -> Tensor:
-    """All-pairs exterior angle in an l1/l2 product of Lorentz factors."""
-    if specific.dim() != 3 or general.dim() != 3:
-        raise ValueError("product_pairwise_oxy_angle expects [batch, factors, dim + 1] tensors")
-    if specific.shape[1] != general.shape[1] or specific.shape[2] != general.shape[2]:
-        raise ValueError("Product Lorentz tensors must have matching factor and feature dimensions")
-    kappa = _product_kappa(kappa, specific.shape[1], specific.device).to(dtype=torch.float32)
-    angle_eps = max(eps, 16.0 * torch.finfo(specific.dtype).eps)
-    specific = specific.float()
-    general = general.float()
-    inner = -specific[:, None, :, 0] * general[None, :, :, 0] + torch.einsum(
-        "nkd,mkd->nmk",
-        specific[..., 1:],
-        general[..., 1:],
-    )
-    numerator = specific[:, None, :, 0] + (kappa.view(1, 1, -1) * inner) * general[None, :, :, 0]
-    general_norm = torch.linalg.norm(general[..., 1:], dim=-1).clamp_min(angle_eps)
-    denom_term = (kappa.view(1, 1, -1) * inner).pow(2) - 1.0
-    denom = general_norm[None, :, :] * torch.sqrt(denom_term.clamp_min(angle_eps))
-    cosine = (numerator / denom).clamp(min=-1.0 + angle_eps, max=1.0 - angle_eps)
-    angles = torch.acos(cosine)
-    if metric == "l1":
-        return angles.mean(dim=-1)
-    if metric == "l2":
-        return angles.square().mean(dim=-1).sqrt()
-    raise ValueError(f"Unsupported product metric {metric!r}; expected 'l1' or 'l2'")
-def metric_pairwise_oxy_angle(specific: Tensor, general: Tensor, kappa: Tensor, product_metric: str = "l1") -> Tensor:
-    """All-pairs oxy-angle for either a single Lorentz space or a product space."""
-    if specific.dim() == 3 or general.dim() == 3:
-        return product_pairwise_oxy_angle(specific, general, kappa, metric=product_metric)
-    return pairwise_oxy_angle(specific, general, kappa)
-def _product_kappa(kappa: Tensor, num_factors: int, device: torch.device) -> Tensor:
-    kappa = kappa.to(device=device, dtype=torch.float32).flatten()
-    if kappa.numel() == 1:
-        return kappa.expand(num_factors)
-    if kappa.numel() != num_factors:
-        raise ValueError(f"Expected {num_factors} curvatures for product space, got {kappa.numel()}")
-    return kappa

hyper3_clip/models/losses.py DELETED Viewed

@@ -1,1400 +0,0 @@
-from __future__ import annotations
-import math
-import torch
-from torch import Tensor
-import torch.nn.functional as F
-from hyper3_clip.models.lorentz import (
-    half_aperture,
-    metric_pairwise_dist,
-    metric_pairwise_oxy_angle,
-    oxy_angle,
-    paired_dist,
-    radial_distance,
-)
-def contrastive_ce(logits: Tensor, targets: Tensor | None = None, weights: Tensor | None = None) -> Tensor:
-    if targets is None:
-        targets = torch.arange(logits.size(0), device=logits.device)
-    losses = F.cross_entropy(logits, targets, reduction="none")
-    return weighted_mean(losses, weights)
-def contrastive_sigmoid(
-    logits: Tensor,
-    targets: Tensor | None = None,
-    weights: Tensor | None = None,
-    negative_weight: float = 1.0,
-) -> Tensor:
-    if targets is None:
-        targets = torch.arange(logits.size(0), device=logits.device)
-    labels = torch.zeros_like(logits)
-    labels[torch.arange(logits.size(0), device=logits.device), targets] = 1.0
-    losses = F.binary_cross_entropy_with_logits(logits, labels, reduction="none")
-    if negative_weight != 1.0:
-        element_weights = torch.where(labels > 0.0, torch.ones_like(labels), logits.new_full((), negative_weight))
-        losses = losses * element_weights
-    losses = losses.mean(dim=1)
-    return weighted_mean(losses, weights)
-def contrastive_siglip(
-    logits: Tensor,
-    targets: Tensor | None = None,
-    weights: Tensor | None = None,
-    negative_weight: float = 1.0,
-) -> Tensor:
-    """SigLIP pairwise sigmoid loss (Zhai et al., ICCV 2023).
-    Uses labels in {+1, -1} with a per-row sum (not mean) over pairs:
-      L_i = sum_j softplus(- y_ij * logit_ij)
-    """
-    if logits.ndim != 2:
-        raise ValueError("contrastive_siglip expects a [batch, classes] logit matrix")
-    if targets is None:
-        targets = torch.arange(logits.size(0), device=logits.device)
-    labels = logits.new_full(logits.shape, -1.0)
-    labels[torch.arange(logits.size(0), device=logits.device), targets] = 1.0
-    losses = F.softplus(-(labels * logits))
-    if negative_weight != 1.0:
-        element_weights = torch.where(labels > 0.0, torch.ones_like(labels), logits.new_full((), negative_weight))
-        losses = losses * element_weights
-    row_losses = losses.sum(dim=1)
-    return weighted_mean(row_losses, weights)
-def weighted_mean(values: Tensor, weights: Tensor | None = None) -> Tensor:
-    if weights is None:
-        return values.mean()
-    weights = weights.to(device=values.device, dtype=values.dtype)
-    while weights.dim() < values.dim():
-        weights = weights.unsqueeze(-1)
-    return (values * weights).sum() / weights.sum().clamp_min(torch.finfo(values.dtype).eps)
-def gramian_volume_loss(vectors: Tensor, weights: Tensor | None = None, eps: float = 1e-4) -> Tensor:
-    """GRAM-style volume loss for sets of vectors.
-    ``vectors`` is expected to have shape ``[batch, k, dim]``. Each set of k
-    vectors is L2-normalized along ``dim``, then we compute the Gramian
-    ``G = V V^T`` and return ``sqrt(det(G + eps I))`` averaged over the batch.
-    """
-    if vectors.ndim != 3:
-        raise ValueError("gramian_volume_loss expects a [batch, k, dim] tensor")
-    if eps <= 0.0:
-        raise ValueError("gramian_volume_loss eps must be positive")
-    vectors = F.normalize(vectors.float(), dim=-1, eps=1e-8)
-    gram = vectors @ vectors.transpose(-1, -2)
-    k = gram.size(-1)
-    gram = gram + eps * torch.eye(k, device=gram.device, dtype=gram.dtype)
-    sign, logabsdet = torch.linalg.slogdet(gram)
-    volume = torch.exp(0.5 * logabsdet)
-    volume = torch.where(sign > 0, volume, volume.new_ones(volume.shape))
-    return weighted_mean(volume, weights)
-def radius_order_hinge(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    margin: float,
-    weights: Tensor | None = None,
-) -> Tensor:
-    if specific.shape[0] != general.shape[0]:
-        raise ValueError("radius_order_hinge expects matching batch dimensions")
-    if margin < 0.0:
-        raise ValueError("radius_order_hinge margin must be non-negative")
-    specific_radius = radial_distance(specific, kappa)
-    general_radius = radial_distance(general, kappa)
-    losses = F.relu(float(margin) + general_radius - specific_radius)
-    return weighted_mean(losses, weights)
-def soft_contrastive_ce(logits: Tensor, target_weights: Tensor, weights: Tensor | None = None) -> Tensor:
-    if logits.ndim != 2 or target_weights.ndim != 2:
-        raise ValueError("soft_contrastive_ce expects [batch, classes] tensors")
-    if logits.shape != target_weights.shape:
-        raise ValueError("soft_contrastive_ce requires logits and target_weights to have matching shapes")
-    log_probs = F.log_softmax(logits, dim=1)
-    losses = -(target_weights.to(dtype=log_probs.dtype) * log_probs).sum(dim=1)
-    return weighted_mean(losses, weights)
-def beta_cal_loss(
-    logits: Tensor,
-    *,
-    targets: Tensor,
-    group_ids: Tensor,
-    all_group_ids: Tensor,
-    beta: float,
-    variant: str,
-    weights: Tensor | None = None,
-) -> Tensor:
-    if beta < 0.0:
-        raise ValueError("beta_cal_loss beta must be non-negative")
-    if variant not in {"ce", "bce"}:
-        raise ValueError("beta_cal_loss variant must be 'ce' or 'bce'")
-    if logits.ndim != 2:
-        raise ValueError("beta_cal_loss expects a [batch, classes] logit matrix")
-    if targets.shape != (logits.size(0),):
-        raise ValueError("beta_cal_loss targets must have shape [batch]")
-    if group_ids.shape != (logits.size(0),):
-        raise ValueError("beta_cal_loss group_ids must have shape [batch]")
-    if all_group_ids.shape != (logits.size(1),):
-        raise ValueError("beta_cal_loss all_group_ids must have shape [classes]")
-    same_group = group_ids[:, None] == all_group_ids[None, :]
-    same_pair = targets[:, None] == torch.arange(logits.size(1), device=logits.device)[None, :]
-    if variant == "ce":
-        target_weights = logits.new_zeros(logits.shape)
-        target_weights = torch.where(same_pair, logits.new_ones(()), target_weights)
-        target_weights = torch.where(same_group & ~same_pair, logits.new_full((), float(beta)), target_weights)
-        target_weights = target_weights / target_weights.sum(dim=1, keepdim=True).clamp_min(
-            torch.finfo(target_weights.dtype).eps
-        )
-        return soft_contrastive_ce(logits, target_weights, weights)
-    labels = same_group.to(dtype=logits.dtype)
-    element_weights = logits.new_ones(logits.shape)
-    element_weights = torch.where(same_group & ~same_pair, logits.new_full((), float(beta)), element_weights)
-    element_losses = F.binary_cross_entropy_with_logits(logits, labels, reduction="none") * element_weights
-    row_losses = element_losses.mean(dim=1)
-    return weighted_mean(row_losses, weights)
-def compositional_contrastive_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    box_image_feats: Tensor,
-    box_text_feats: Tensor,
-    kappa: Tensor,
-    logit_scale: Tensor,
-    all_image_feats: Tensor | None = None,
-    all_text_feats: Tensor | None = None,
-    targets: Tensor | None = None,
-) -> Tensor:
-    scale = logit_scale.exp().clamp(max=100.0)
-    all_image_feats = image_feats if all_image_feats is None else all_image_feats
-    all_text_feats = text_feats if all_text_feats is None else all_text_feats
-    logits_i_t = -metric_pairwise_dist(image_feats, all_text_feats, kappa) * scale
-    logits_t_i = -metric_pairwise_dist(text_feats, all_image_feats, kappa) * scale
-    logits_bi_t = -metric_pairwise_dist(box_image_feats, all_text_feats, kappa) * scale
-    logits_bt_i = -metric_pairwise_dist(box_text_feats, all_image_feats, kappa) * scale
-    return 0.25 * (
-        contrastive_ce(logits_i_t, targets)
-        + contrastive_ce(logits_t_i, targets)
-        + contrastive_ce(logits_bi_t, targets)
-        + contrastive_ce(logits_bt_i, targets)
-    )
-def multi_part_contrastive_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_feats: Tensor,
-    part_text_feats: Tensor,
-    part_mask: Tensor,
-    kappa: Tensor,
-    logit_scale: Tensor,
-    all_image_feats: Tensor | None = None,
-    all_text_feats: Tensor | None = None,
-    targets: Tensor | None = None,
-) -> Tensor:
-    scale = logit_scale.exp().clamp(max=100.0)
-    all_image_feats = image_feats if all_image_feats is None else all_image_feats
-    all_text_feats = text_feats if all_text_feats is None else all_text_feats
-    if targets is None:
-        targets = torch.arange(image_feats.size(0), device=image_feats.device)
-    part_image_flat, part_text_flat, part_targets = _flatten_valid_parts(part_image_feats, part_text_feats, part_mask, targets)
-    logits_i_t = -metric_pairwise_dist(image_feats, all_text_feats, kappa) * scale
-    logits_t_i = -metric_pairwise_dist(text_feats, all_image_feats, kappa) * scale
-    logits_pi_t = -metric_pairwise_dist(part_image_flat, all_text_feats, kappa) * scale
-    logits_pt_i = -metric_pairwise_dist(part_text_flat, all_image_feats, kappa) * scale
-    return 0.25 * (
-        contrastive_ce(logits_i_t, targets)
-        + contrastive_ce(logits_t_i, targets)
-        + contrastive_ce(logits_pi_t, part_targets)
-        + contrastive_ce(logits_pt_i, part_targets)
-    )
-def packed_part_contrastive_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_feats: Tensor,
-    part_text_feats: Tensor,
-    part_owner: Tensor,
-    kappa: Tensor,
-    logit_scale: Tensor,
-    all_image_feats: Tensor | None = None,
-    all_text_feats: Tensor | None = None,
-    targets: Tensor | None = None,
-) -> Tensor:
-    scale = logit_scale.exp().clamp(max=100.0)
-    all_image_feats = image_feats if all_image_feats is None else all_image_feats
-    all_text_feats = text_feats if all_text_feats is None else all_text_feats
-    if targets is None:
-        targets = torch.arange(image_feats.size(0), device=image_feats.device)
-    logits_i_t = -metric_pairwise_dist(image_feats, all_text_feats, kappa) * scale
-    logits_t_i = -metric_pairwise_dist(text_feats, all_image_feats, kappa) * scale
-    global_loss = 0.5 * (contrastive_ce(logits_i_t, targets) + contrastive_ce(logits_t_i, targets))
-    if part_image_feats.numel() == 0:
-        return global_loss
-    part_targets = targets[part_owner]
-    logits_pi_t = -metric_pairwise_dist(part_image_feats, all_text_feats, kappa) * scale
-    logits_pt_i = -metric_pairwise_dist(part_text_feats, all_image_feats, kappa) * scale
-    part_loss = 0.5 * (contrastive_ce(logits_pi_t, part_targets) + contrastive_ce(logits_pt_i, part_targets))
-    return 0.5 * (global_loss + part_loss)
-def factor_oxy_angle(specific: Tensor, general: Tensor, kappa: Tensor) -> Tensor:
-    if specific.dim() != 3:
-        return oxy_angle(specific=specific, general=general, kappa=kappa)
-    batch_size, num_factors, feature_dim = specific.shape
-    kappa = _factor_kappa(kappa, num_factors, specific.device)
-    factor_kappa = kappa.view(1, num_factors).expand(batch_size, num_factors).reshape(-1)
-    return oxy_angle(
-        specific=specific.reshape(batch_size * num_factors, feature_dim),
-        general=general.reshape(batch_size * num_factors, feature_dim),
-        kappa=factor_kappa,
-    ).reshape(batch_size, num_factors)
-def factor_half_aperture(general: Tensor, kappa: Tensor) -> Tensor:
-    if general.dim() != 3:
-        return half_aperture(general=general, kappa=kappa)
-    batch_size, num_factors, feature_dim = general.shape
-    kappa = _factor_kappa(kappa, num_factors, general.device)
-    factor_kappa = kappa.view(1, num_factors).expand(batch_size, num_factors).reshape(-1)
-    return half_aperture(
-        general=general.reshape(batch_size * num_factors, feature_dim),
-        kappa=factor_kappa,
-    ).reshape(batch_size, num_factors)
-def _factor_kappa(kappa: Tensor, num_factors: int, device: torch.device) -> Tensor:
-    kappa = kappa.to(device=device, dtype=torch.float32).flatten()
-    if kappa.numel() == 1:
-        return kappa.expand(num_factors)
-    if kappa.numel() != num_factors:
-        raise ValueError(f"Expected {num_factors} curvatures for product space, got {kappa.numel()}")
-    return kappa
-def entailment_residual(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    aperture_scale: float,
-) -> Tensor:
-    angles = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
-    apertures = factor_half_aperture(general=general, kappa=kappa)
-    return torch.clamp(angles - (aperture_scale * apertures), min=0.0).mean()
-def weighted_entailment_residual(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    aperture_scale: float,
-    weights: Tensor | None = None,
-) -> Tensor:
-    angles = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
-    apertures = factor_half_aperture(general=general, kappa=kappa)
-    residuals = torch.clamp(angles - (aperture_scale * apertures), min=0.0)
-    if residuals.dim() == 2:
-        residuals = residuals.mean(dim=-1)
-    return weighted_mean(residuals, weights)
-def compositional_entailment_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    box_image_feats: Tensor,
-    box_text_feats: Tensor,
-    kappa: Tensor,
-    inter_aperture_scale: float,
-    intra_aperture_scale: float,
-) -> Tensor:
-    text_to_image = entailment_residual(
-        specific=image_feats,
-        general=text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    box_text_to_box_image = entailment_residual(
-        specific=box_image_feats,
-        general=box_text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    box_image_to_image = entailment_residual(
-        specific=image_feats,
-        general=box_image_feats,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    box_text_to_text = entailment_residual(
-        specific=text_feats,
-        general=box_text_feats,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    return 0.5 * (text_to_image + box_text_to_box_image + box_image_to_image + box_text_to_text)
-def multi_part_entailment_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_feats: Tensor,
-    part_text_feats: Tensor,
-    part_mask: Tensor,
-    kappa: Tensor,
-    inter_aperture_scale: float,
-    intra_aperture_scale: float,
-) -> Tensor:
-    part_image_flat = part_image_feats[part_mask]
-    part_text_flat = part_text_feats[part_mask]
-    image_for_parts = image_feats[:, None, :].expand_as(part_image_feats)[part_mask]
-    text_for_parts = text_feats[:, None, :].expand_as(part_text_feats)[part_mask]
-    text_to_image = entailment_residual(
-        specific=image_feats,
-        general=text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    part_text_to_part_image = entailment_residual(
-        specific=part_image_flat,
-        general=part_text_flat,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    part_image_to_image = entailment_residual(
-        specific=image_for_parts,
-        general=part_image_flat,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    part_text_to_text = entailment_residual(
-        specific=text_for_parts,
-        general=part_text_flat,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    return 0.5 * (text_to_image + part_text_to_part_image + part_image_to_image + part_text_to_text)
-def packed_part_entailment_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_feats: Tensor,
-    part_text_feats: Tensor,
-    part_owner: Tensor,
-    kappa: Tensor,
-    inter_aperture_scale: float,
-    intra_aperture_scale: float,
-) -> Tensor:
-    text_to_image = entailment_residual(
-        specific=image_feats,
-        general=text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    if part_image_feats.numel() == 0:
-        return text_to_image
-    image_for_parts = image_feats[part_owner]
-    text_for_parts = text_feats[part_owner]
-    part_text_to_part_image = entailment_residual(
-        specific=part_image_feats,
-        general=part_text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    part_image_to_image = entailment_residual(
-        specific=image_for_parts,
-        general=part_image_feats,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    part_text_to_text = entailment_residual(
-        specific=text_for_parts,
-        general=part_text_feats,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    return 0.5 * (text_to_image + part_text_to_part_image + part_image_to_image + part_text_to_text)
-def uncha_contrastive_losses(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    image_for_parts: Tensor,
-    text_for_parts: Tensor,
-    kappa: Tensor,
-    global_logit_scale: Tensor,
-    local_logit_scale: Tensor,
-    global_local_logit_scale: Tensor,
-    image_euc_feats: Tensor | None = None,
-    text_euc_feats: Tensor | None = None,
-    part_image_euc_flat: Tensor | None = None,
-    part_text_euc_flat: Tensor | None = None,
-    image_for_parts_euc: Tensor | None = None,
-    text_for_parts_euc: Tensor | None = None,
-    all_image_feats: Tensor | None = None,
-    all_text_feats: Tensor | None = None,
-    all_part_image_feats: Tensor | None = None,
-    all_part_text_feats: Tensor | None = None,
-    all_image_for_parts: Tensor | None = None,
-    all_text_for_parts: Tensor | None = None,
-    all_image_euc_feats: Tensor | None = None,
-    all_text_euc_feats: Tensor | None = None,
-    all_part_image_euc_feats: Tensor | None = None,
-    all_part_text_euc_feats: Tensor | None = None,
-    all_image_for_parts_euc: Tensor | None = None,
-    all_text_for_parts_euc: Tensor | None = None,
-    global_targets: Tensor | None = None,
-    part_targets: Tensor | None = None,
-    part_weights: Tensor | None = None,
-    product_metric: str = "l1",
-    loss_type: str = "ce",
-    contrastive_global_weight: float = 1.0,
-    contrastive_local_weight: float = 1.0,
-    contrastive_global_local_weight: float = 1.0,
-    beta_cal_beta: float = 0.0,
-    beta_cal_variant: str = "ce",
-    beta_cal_weight: float = 0.0,
-    part_group_ids: Tensor | None = None,
-    all_part_group_ids: Tensor | None = None,
-    global_logit_bias: Tensor | None = None,
-    local_logit_bias: Tensor | None = None,
-    global_local_logit_bias: Tensor | None = None,
-    sigmoid_negative_weight: float = 1.0,
-    global_local_mode: str = "repeat",
-    global_local_metric: str = "distance",
-    global_local_angle_aux_weight: float = 0.0,
-    global_local_angle_aux_mode: str = "contrastive",
-    global_local_angle_aux_scale: float = 5.5,
-    global_local_angle_aux_aperture_scale: float = 1.0,
-) -> dict[str, Tensor]:
-    if loss_type not in {"ce", "sigmoid", "siglip", "siglip_metric"}:
-        raise ValueError(
-            f"Unsupported contrastive loss {loss_type!r}; expected 'ce', 'sigmoid', 'siglip', or 'siglip_metric'"
-        )
-    if global_local_mode not in {"repeat", "inbatch"}:
-        raise ValueError("global_local_mode must be 'repeat' or 'inbatch'")
-    if global_local_metric not in {"distance", "angle"}:
-        raise ValueError("global_local_metric must be 'distance' or 'angle'")
-    if global_local_angle_aux_mode not in {"contrastive", "positive_hinge"}:
-        raise ValueError("global_local_angle_aux_mode must be 'contrastive' or 'positive_hinge'")
-    if global_local_angle_aux_weight < 0.0:
-        raise ValueError("global_local_angle_aux_weight must be non-negative")
-    if global_local_angle_aux_scale <= 0.0:
-        raise ValueError("global_local_angle_aux_scale must be positive")
-    if global_local_angle_aux_aperture_scale <= 0.0:
-        raise ValueError("global_local_angle_aux_aperture_scale must be positive")
-    all_image_feats = image_feats if all_image_feats is None else all_image_feats
-    all_text_feats = text_feats if all_text_feats is None else all_text_feats
-    all_part_image_feats = part_image_flat if all_part_image_feats is None else all_part_image_feats
-    all_part_text_feats = part_text_flat if all_part_text_feats is None else all_part_text_feats
-    all_image_for_parts = image_for_parts if all_image_for_parts is None else all_image_for_parts
-    all_text_for_parts = text_for_parts if all_text_for_parts is None else all_text_for_parts
-    if global_targets is None:
-        global_targets = torch.arange(image_feats.size(0), device=image_feats.device)
-    if part_targets is None:
-        part_targets = torch.arange(part_image_flat.size(0), device=part_image_flat.device)
-    global_scale = global_logit_scale.exp().clamp(max=100.0)
-    local_scale = local_logit_scale.exp().clamp(max=100.0)
-    global_local_scale = global_local_logit_scale.exp().clamp(max=100.0)
-    if loss_type == "siglip":
-        if image_euc_feats is None or text_euc_feats is None:
-            raise ValueError("siglip contrastive requires image_euc_feats and text_euc_feats")
-        if image_feats.dim() != 2 or text_feats.dim() != 2:
-            raise ValueError("siglip contrastive is only supported for non-product features")
-        all_image_euc_feats = image_euc_feats if all_image_euc_feats is None else all_image_euc_feats
-        all_text_euc_feats = text_euc_feats if all_text_euc_feats is None else all_text_euc_feats
-        zimg = F.normalize(image_euc_feats.float(), dim=-1)
-        ztxt = F.normalize(text_euc_feats.float(), dim=-1)
-        zimg_all = F.normalize(all_image_euc_feats.float(), dim=-1)
-        ztxt_all = F.normalize(all_text_euc_feats.float(), dim=-1)
-        image_logits = (zimg @ ztxt_all.T) * global_scale
-        text_logits = (ztxt @ zimg_all.T) * global_scale
-    else:
-        image_logits = -metric_pairwise_dist(image_feats, all_text_feats, kappa, product_metric=product_metric) * global_scale
-        text_logits = -metric_pairwise_dist(text_feats, all_image_feats, kappa, product_metric=product_metric) * global_scale
-    if loss_type in {"sigmoid", "siglip", "siglip_metric"}:
-        bias = image_logits.new_zeros(()) if global_logit_bias is None else global_logit_bias.to(image_logits.device)
-        image_logits = image_logits + bias
-        text_logits = text_logits + bias
-    global_contrastive = 0.5 * (
-        _contrastive_loss(image_logits, global_targets, None, loss_type, sigmoid_negative_weight)
-        + _contrastive_loss(text_logits, global_targets, None, loss_type, sigmoid_negative_weight)
-    )
-    if part_image_flat.numel() == 0:
-        zero = image_feats.new_zeros(())
-        contrastive = contrastive_global_weight * global_contrastive
-        return {
-            "contrastive_loss": contrastive,
-            "global_contrastive_loss": global_contrastive,
-            "local_contrastive_loss": zero,
-            "global_local_contrastive_loss": zero,
-            "global_local_angle_aux_loss": zero,
-            "beta_cal_loss": zero,
-        }
-    if loss_type == "siglip":
-        if part_image_euc_flat is None or part_text_euc_flat is None:
-            raise ValueError("siglip contrastive requires part_image_euc_flat and part_text_euc_flat when parts exist")
-        all_part_image_euc_feats = part_image_euc_flat if all_part_image_euc_feats is None else all_part_image_euc_feats
-        all_part_text_euc_feats = part_text_euc_flat if all_part_text_euc_feats is None else all_part_text_euc_feats
-        zpi = F.normalize(part_image_euc_flat.float(), dim=-1)
-        zpt = F.normalize(part_text_euc_flat.float(), dim=-1)
-        zpi_all = F.normalize(all_part_image_euc_feats.float(), dim=-1)
-        zpt_all = F.normalize(all_part_text_euc_feats.float(), dim=-1)
-        part_image_logits = (zpi @ zpt_all.T) * local_scale
-        part_text_logits = (zpt @ zpi_all.T) * local_scale
-    else:
-        part_image_logits = -metric_pairwise_dist(part_image_flat, all_part_text_feats, kappa, product_metric=product_metric) * local_scale
-        part_text_logits = -metric_pairwise_dist(part_text_flat, all_part_image_feats, kappa, product_metric=product_metric) * local_scale
-    if loss_type in {"sigmoid", "siglip", "siglip_metric"}:
-        bias = part_image_logits.new_zeros(()) if local_logit_bias is None else local_logit_bias.to(part_image_logits.device)
-        part_image_logits = part_image_logits + bias
-        part_text_logits = part_text_logits + bias
-    local_contrastive = 0.5 * (
-        _contrastive_loss(part_image_logits, part_targets, part_weights, loss_type, sigmoid_negative_weight)
-        + _contrastive_loss(part_text_logits, part_targets, part_weights, loss_type, sigmoid_negative_weight)
-    )
-    global_local_contrastive = image_feats.new_zeros(())
-    global_local_angle_aux = image_feats.new_zeros(())
-    if contrastive_global_local_weight != 0.0:
-        if global_local_mode == "inbatch":
-            if part_group_ids is None:
-                raise ValueError("inbatch global-local contrastive requires part_group_ids to be provided")
-            global_local_targets = part_group_ids
-            all_text_for_global_local = all_text_feats
-            all_image_for_global_local = all_image_feats
-            all_text_for_global_local_euc = all_text_euc_feats
-            all_image_for_global_local_euc = all_image_euc_feats
-        else:
-            global_local_targets = part_targets
-            all_text_for_global_local = all_text_for_parts
-            all_image_for_global_local = all_image_for_parts
-            all_text_for_global_local_euc = all_text_for_parts_euc
-            all_image_for_global_local_euc = all_image_for_parts_euc
-        image_uncertainty = embedding_uncertainty(part_image_flat).detach()
-        text_uncertainty = embedding_uncertainty(part_text_flat).detach()
-        image_temp = torch.exp(-0.5 * image_uncertainty).clamp(min=0.1, max=10.0)
-        text_temp = torch.exp(-0.5 * text_uncertainty).clamp(min=0.1, max=10.0)
-        if loss_type == "siglip":
-            if part_image_euc_flat is None or part_text_euc_flat is None:
-                raise ValueError("siglip global-local contrastive requires part_image_euc_flat/part_text_euc_flat")
-            if all_text_for_global_local_euc is None or all_image_for_global_local_euc is None:
-                raise ValueError("siglip global-local contrastive requires all_image_euc_feats/all_text_euc_feats")
-            zpi = F.normalize(part_image_euc_flat.float(), dim=-1)
-            zpt = F.normalize(part_text_euc_flat.float(), dim=-1)
-            zimg_all = F.normalize(all_image_for_global_local_euc.float(), dim=-1)
-            ztxt_all = F.normalize(all_text_for_global_local_euc.float(), dim=-1)
-            part_image_to_whole_text = (zpi @ ztxt_all.T) * image_temp[:, None] * global_local_scale
-            part_text_to_whole_image = (zpt @ zimg_all.T) * text_temp[:, None] * global_local_scale
-        else:
-            if global_local_metric == "angle":
-                part_image_to_whole_text = -metric_pairwise_oxy_angle(
-                    part_image_flat,
-                    all_text_for_global_local,
-                    kappa,
-                    product_metric=product_metric,
-                )
-                part_text_to_whole_image = -metric_pairwise_oxy_angle(
-                    part_text_flat,
-                    all_image_for_global_local,
-                    kappa,
-                    product_metric=product_metric,
-                )
-            else:
-                part_image_to_whole_text = -metric_pairwise_dist(
-                    part_image_flat, all_text_for_global_local, kappa, product_metric=product_metric
-                )
-                part_text_to_whole_image = -metric_pairwise_dist(
-                    part_text_flat, all_image_for_global_local, kappa, product_metric=product_metric
-                )
-            part_image_to_whole_text = part_image_to_whole_text * image_temp[:, None] * global_local_scale
-            part_text_to_whole_image = part_text_to_whole_image * text_temp[:, None] * global_local_scale
-        if loss_type in {"sigmoid", "siglip", "siglip_metric"}:
-            bias = (
-                part_image_to_whole_text.new_zeros(())
-                if global_local_logit_bias is None
-                else global_local_logit_bias.to(part_image_to_whole_text.device)
-            )
-            part_image_to_whole_text = part_image_to_whole_text + bias
-            part_text_to_whole_image = part_text_to_whole_image + bias
-        global_local_contrastive = 0.5 * (
-            _contrastive_loss(part_image_to_whole_text, global_local_targets, part_weights, loss_type, sigmoid_negative_weight)
-            + _contrastive_loss(part_text_to_whole_image, global_local_targets, part_weights, loss_type, sigmoid_negative_weight)
-        )
-        if global_local_angle_aux_weight > 0.0:
-            if global_local_angle_aux_mode == "positive_hinge":
-                positive_text = all_text_for_global_local.index_select(0, global_local_targets)
-                positive_image = all_image_for_global_local.index_select(0, global_local_targets)
-                global_local_angle_aux = 0.5 * (
-                    weighted_entailment_residual(
-                        specific=part_image_flat,
-                        general=positive_text,
-                        kappa=kappa,
-                        aperture_scale=global_local_angle_aux_aperture_scale,
-                        weights=part_weights,
-                    )
-                    + weighted_entailment_residual(
-                        specific=part_text_flat,
-                        general=positive_image,
-                        kappa=kappa,
-                        aperture_scale=global_local_angle_aux_aperture_scale,
-                        weights=part_weights,
-                    )
-                )
-            elif loss_type != "siglip":
-                angle_scale = part_image_flat.new_tensor(float(global_local_angle_aux_scale))
-                part_image_to_whole_text_angle = -metric_pairwise_oxy_angle(
-                    part_image_flat,
-                    all_text_for_global_local,
-                    kappa,
-                    product_metric=product_metric,
-                ) * image_temp[:, None] * angle_scale
-                part_text_to_whole_image_angle = -metric_pairwise_oxy_angle(
-                    part_text_flat,
-                    all_image_for_global_local,
-                    kappa,
-                    product_metric=product_metric,
-                ) * text_temp[:, None] * angle_scale
-                if loss_type in {"sigmoid", "siglip_metric"}:
-                    bias = (
-                        part_image_to_whole_text_angle.new_zeros(())
-                        if global_local_logit_bias is None
-                        else global_local_logit_bias.to(part_image_to_whole_text_angle.device)
-                    )
-                    part_image_to_whole_text_angle = part_image_to_whole_text_angle + bias
-                    part_text_to_whole_image_angle = part_text_to_whole_image_angle + bias
-                global_local_angle_aux = 0.5 * (
-                    _contrastive_loss(
-                        part_image_to_whole_text_angle,
-                        global_local_targets,
-                        part_weights,
-                        loss_type,
-                        sigmoid_negative_weight,
-                    )
-                    + _contrastive_loss(
-                        part_text_to_whole_image_angle,
-                        global_local_targets,
-                        part_weights,
-                        loss_type,
-                        sigmoid_negative_weight,
-                    )
-                )
-    beta_cal = image_feats.new_zeros(())
-    if beta_cal_weight > 0.0 and beta_cal_beta > 0.0:
-        if part_group_ids is None or all_part_group_ids is None:
-            raise ValueError("beta_cal requires part_group_ids and all_part_group_ids to be provided")
-        beta_cal = 0.5 * (
-            beta_cal_loss(
-                part_image_logits,
-                targets=part_targets,
-                group_ids=part_group_ids,
-                all_group_ids=all_part_group_ids,
-                beta=beta_cal_beta,
-                variant=beta_cal_variant,
-                weights=part_weights,
-            )
-            + beta_cal_loss(
-                part_text_logits,
-                targets=part_targets,
-                group_ids=part_group_ids,
-                all_group_ids=all_part_group_ids,
-                beta=beta_cal_beta,
-                variant=beta_cal_variant,
-                weights=part_weights,
-            )
-        )
-    contrastive = (
-        contrastive_global_weight * global_contrastive
-        + contrastive_local_weight * local_contrastive
-        + contrastive_global_local_weight * global_local_contrastive
-        + global_local_angle_aux_weight * global_local_angle_aux
-        + beta_cal_weight * beta_cal
-    )
-    return {
-        "contrastive_loss": contrastive,
-        "global_contrastive_loss": global_contrastive,
-        "local_contrastive_loss": local_contrastive,
-        "global_local_contrastive_loss": global_local_contrastive,
-        "global_local_angle_aux_loss": global_local_angle_aux,
-        "beta_cal_loss": beta_cal,
-    }
-def _contrastive_loss(
-    logits: Tensor,
-    targets: Tensor,
-    weights: Tensor | None,
-    loss_type: str,
-    sigmoid_negative_weight: float,
-) -> Tensor:
-    if loss_type == "ce":
-        return contrastive_ce(logits, targets, weights)
-    if loss_type == "sigmoid":
-        return contrastive_sigmoid(logits, targets, weights, negative_weight=sigmoid_negative_weight)
-    if loss_type in {"siglip", "siglip_metric"}:
-        return contrastive_siglip(logits, targets, weights, negative_weight=sigmoid_negative_weight)
-    raise ValueError(f"Unsupported contrastive loss {loss_type!r}")
def uncha_entailment_losses(
    image_feats: Tensor,
    text_feats: Tensor,
    part_image_flat: Tensor,
    part_text_flat: Tensor,
    image_for_parts: Tensor,
    text_for_parts: Tensor,
    kappa: Tensor,
    inter_aperture_scale: float,
    intra_aperture_scale: float,
    piecewise_factor: float = 0.1,
    calibration_alpha: float = 10.0,
    stop_grad_calibration: bool = True,
    geometry: str = "lorentz",
    part_weights: Tensor | None = None,
) -> dict[str, Tensor]:
    """Assemble the global, part-level and cross-level entailment terms.

    Every residual comes from ``piecewise_entailment_residual`` using the
    shared ``kappa``, ``piecewise_factor`` and ``geometry``. Image/text pairs
    (image as ``specific``, text as ``general``) use ``inter_aperture_scale``;
    whole/part pairs within one modality (whole as ``specific``, part as
    ``general``) use ``intra_aperture_scale``.

    The two cross terms are passed through
    ``uncertainty_calibrated_entailment_loss`` together with
    ``embedding_uncertainty`` of the part embeddings, which yields an
    entailment value and a calibration value each.

    When ``part_image_flat`` has no elements, only the global text-image term
    is computed and every part-related entry is a scalar zero.

    Returns:
        A dict holding ``entailment_loss`` (the total) plus its components.
    """

    def residual(specific: Tensor, general: Tensor, aperture_scale: float) -> Tensor:
        # All residuals share curvature, piecewise factor and geometry.
        return piecewise_entailment_residual(
            specific=specific,
            general=general,
            kappa=kappa,
            aperture_scale=aperture_scale,
            factor=piecewise_factor,
            geometry=geometry,
        )

    def calibrated(residuals: Tensor, part_embeddings: Tensor) -> tuple[Tensor, Tensor]:
        # Returns (entailment, calibration) for one cross-level residual.
        return uncertainty_calibrated_entailment_loss(
            residuals,
            embedding_uncertainty(part_embeddings),
            alpha=calibration_alpha,
            stop_grad=stop_grad_calibration,
            weights=part_weights,
        )

    global_pair_loss = 0.5 * residual(image_feats, text_feats, inter_aperture_scale).mean()

    if part_image_flat.numel() == 0:
        # No parts in this batch: report zeros for everything part-related.
        zero = image_feats.new_zeros(())
        return {
            "entailment_loss": global_pair_loss,
            "text_image_entailment_loss": global_pair_loss,
            "part_text_image_entailment_loss": zero,
            "cross_image_entailment_loss": zero,
            "cross_text_entailment_loss": zero,
            "cross_image_calibration_loss": zero,
            "cross_text_calibration_loss": zero,
        }

    part_pair_residual = residual(part_image_flat, part_text_flat, inter_aperture_scale)
    image_cross_residual = residual(image_for_parts, part_image_flat, intra_aperture_scale)
    text_cross_residual = residual(text_for_parts, part_text_flat, intra_aperture_scale)

    part_pair_loss = 0.5 * weighted_mean(part_pair_residual, part_weights)
    image_cross_loss, image_cross_calibration = calibrated(image_cross_residual, part_image_flat)
    text_cross_loss, text_cross_calibration = calibrated(text_cross_residual, part_text_flat)

    total = (
        global_pair_loss
        + part_pair_loss
        + 0.5 * (image_cross_loss + text_cross_loss)
        + image_cross_calibration
        + text_cross_calibration
    )
    return {
        "entailment_loss": total,
        "text_image_entailment_loss": global_pair_loss,
        "part_text_image_entailment_loss": part_pair_loss,
        "cross_image_entailment_loss": image_cross_loss,
        "cross_text_entailment_loss": text_cross_loss,
        "cross_image_calibration_loss": image_cross_calibration,
        "cross_text_calibration_loss": text_cross_calibration,
    }
-def uncha_argent_entailment_losses(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    image_for_parts: Tensor,
-    text_for_parts: Tensor,
-    kappa: Tensor,
-    beta: float = 1.0,
-    part_weights: Tensor | None = None,
-    product_metric: str = "l1",
-    aggregation: str = "uncha",
-) -> dict[str, Tensor]:
-    if aggregation not in {"uncha", "equal"}:
-        raise ValueError("aggregation must be 'uncha' or 'equal'")
-    text_image = argent_adaptive_entailment_residual(
-        specific=image_feats,
-        general=text_feats,
-        kappa=kappa,
-        adaptive_weight=False,
-        beta=beta,
-        product_metric=product_metric,
-    )
-    text_image_entailment = 0.5 * text_image.mean()
-    if part_image_flat.numel() == 0:
-        zero = image_feats.new_zeros(())
-        norm_regularization = argent_norm_regularization_loss(image_feats, text_feats)
-        return {
-            "entailment_loss": text_image_entailment,
-            "text_image_entailment_loss": text_image_entailment,
-            "part_text_image_entailment_loss": zero,
-            "cross_image_entailment_loss": zero,
-            "cross_text_entailment_loss": zero,
-            "cross_image_calibration_loss": zero,
-            "cross_text_calibration_loss": zero,
-            "norm_regularization_loss": norm_regularization,
-        }
-    part_text_image = argent_adaptive_entailment_residual(
-        specific=part_image_flat,
-        general=part_text_flat,
-        kappa=kappa,
-        adaptive_weight=False,
-        beta=beta,
-        product_metric=product_metric,
-    )
-    cross_image = argent_adaptive_entailment_residual(
-        specific=image_for_parts,
-        general=part_image_flat,
-        kappa=kappa,
-        adaptive_weight=True,
-        beta=beta,
-        product_metric=product_metric,
-    )
-    cross_text = argent_adaptive_entailment_residual(
-        specific=text_for_parts,
-        general=part_text_flat,
-        kappa=kappa,
-        adaptive_weight=True,
-        beta=beta,
-        product_metric=product_metric,
-    )
-    part_text_image_entailment = 0.5 * weighted_mean(part_text_image, part_weights)
-    cross_image_entailment = 0.5 * weighted_mean(cross_image, part_weights)
-    cross_text_entailment = 0.5 * weighted_mean(cross_text, part_weights)
-    norm_regularization = argent_norm_regularization_loss(image_feats, text_feats, part_image_flat, part_text_flat)
-    if aggregation == "equal":
-        entailment = text_image_entailment + part_text_image_entailment + cross_image_entailment + cross_text_entailment
-    else:
-        entailment = text_image_entailment + part_text_image_entailment + 0.5 * (
-            cross_image_entailment + cross_text_entailment
-        )
-    diagnostics = argent_entailment_diagnostics(
-        image_feats=image_feats,
-        text_feats=text_feats,
-        part_image_flat=part_image_flat,
-        part_text_flat=part_text_flat,
-        image_for_parts=image_for_parts,
-        text_for_parts=text_for_parts,
-        kappa=kappa,
-        product_metric=product_metric,
-    )
-    return {
-        "entailment_loss": entailment,
-        "text_image_entailment_loss": text_image_entailment,
-        "part_text_image_entailment_loss": part_text_image_entailment,
-        "cross_image_entailment_loss": cross_image_entailment,
-        "cross_text_entailment_loss": cross_text_entailment,
-        "cross_image_calibration_loss": image_feats.new_zeros(()),
-        "cross_text_calibration_loss": image_feats.new_zeros(()),
-        "norm_regularization_loss": norm_regularization,
-        **diagnostics,
-    }
def hierarchical_beta_argent_entailment_losses(
    image_feats: Tensor,
    text_feats: Tensor,
    part_image_flat: Tensor,
    part_text_flat: Tensor,
    image_for_parts: Tensor,
    text_for_parts: Tensor,
    beta_query_image_feats: Tensor,
    beta_query_text_feats: Tensor,
    beta_query_owner: Tensor,
    beta_query_parent: Tensor,
    beta_query_weight: Tensor,
    kappa: Tensor,
    beta_query_source_part: Tensor | None = None,
    beta: float = 1.0,
    part_weights: Tensor | None = None,
    product_metric: str = "l1",
    aggregation: str = "uncha",
) -> dict[str, Tensor]:
    """Extend ``uncha_argent_entailment_losses`` with beta-query terms.

    The base dict is computed first. If there are beta queries, four groups
    of residuals are added on top of it:

    * query image vs. query text (non-adaptive),
    * owning whole image (``image_feats`` indexed by ``beta_query_owner``)
      vs. query image (adaptive),
    * parent query text vs. child query text, for queries whose
      ``beta_query_parent`` index is in range and whose weight is positive
      (adaptive),
    * source part vs. query, for both modalities, when
      ``beta_query_source_part`` is given, parts exist, and the index is in
      range with positive weight (adaptive).

    Query weights are clamped at zero and divided by their mean before use.
    The norm regularizer is recomputed so it also covers the query features.

    Raises:
        ValueError: If ``beta_query_weight`` or ``beta_query_source_part``
            does not hold one value per beta query.
    """
    base = uncha_argent_entailment_losses(
        image_feats=image_feats,
        text_feats=text_feats,
        part_image_flat=part_image_flat,
        part_text_flat=part_text_flat,
        image_for_parts=image_for_parts,
        text_for_parts=text_for_parts,
        kappa=kappa,
        beta=beta,
        part_weights=part_weights,
        product_metric=product_metric,
        aggregation=aggregation,
    )

    if beta_query_image_feats.numel() == 0:
        # No queries: keep the base losses and report zeros for every extra key.
        zero_losses = {
            key: image_feats.new_zeros(())
            for key in (
                "hier_beta_query_text_entailment_loss",
                "hier_beta_visual_entailment_loss",
                "hier_beta_text_entailment_loss",
                "hier_beta_sourcepart_visual_entailment_loss",
                "hier_beta_sourcepart_text_entailment_loss",
            )
        }
        return {
            **base,
            **zero_losses,
            "hier_beta_query_count": beta_query_owner.new_tensor(0),
            "hier_beta_sourcepart_query_count": beta_query_owner.new_tensor(0),
        }

    def residual(specific: Tensor, general: Tensor, *, adaptive: bool) -> Tensor:
        # Shared keyword plumbing for every query-related residual.
        return argent_adaptive_entailment_residual(
            specific=specific,
            general=general,
            kappa=kappa,
            adaptive_weight=adaptive,
            beta=beta,
            product_metric=product_metric,
        )

    device = image_feats.device
    query_owner = beta_query_owner.to(device=device, dtype=torch.long)
    query_weights = beta_query_weight.to(device=device, dtype=torch.float32).clamp_min(0.0)
    if query_weights.numel() != beta_query_image_feats.size(0):
        raise ValueError("beta_query_weight must have one value per beta query")
    # Rescale to unit mean; the eps clamp guards an all-zero weight vector.
    weight_eps = torch.finfo(query_weights.dtype).eps
    query_weights = query_weights / query_weights.mean().clamp_min(weight_eps)
    has_weight = query_weights > 0.0

    query_pair_residual = residual(beta_query_image_feats, beta_query_text_feats, adaptive=False)
    owner_residual = residual(
        image_feats.index_select(0, query_owner), beta_query_image_feats, adaptive=True
    )
    query_text_entailment = 0.5 * weighted_mean(query_pair_residual, query_weights)
    visual_entailment = 0.5 * weighted_mean(owner_residual, query_weights)

    parent = beta_query_parent.to(device=device, dtype=torch.long)
    parent_mask = (parent >= 0) & (parent < beta_query_text_feats.size(0)) & has_weight
    text_entailment = image_feats.new_zeros(())
    if bool(parent_mask.any()):
        child_text = beta_query_text_feats[parent_mask]
        parent_text = beta_query_text_feats[parent[parent_mask]]
        parent_child_residual = residual(parent_text, child_text, adaptive=True)
        text_entailment = 0.5 * weighted_mean(parent_child_residual, query_weights[parent_mask])

    sourcepart_visual_entailment = image_feats.new_zeros(())
    sourcepart_text_entailment = image_feats.new_zeros(())
    sourcepart_query_count = beta_query_owner.new_tensor(0)
    if beta_query_source_part is not None and part_image_flat.numel() > 0:
        source_part = beta_query_source_part.to(device=device, dtype=torch.long)
        if source_part.numel() != beta_query_image_feats.size(0):
            raise ValueError("beta_query_source_part must have one value per beta query")
        source_mask = (
            (source_part >= 0)
            & (source_part < part_image_flat.size(0))
            & has_weight
        )
        if bool(source_mask.any()):
            picked_parts = source_part[source_mask]
            picked_weights = query_weights[source_mask]
            visual_residual = residual(
                part_image_flat.index_select(0, picked_parts),
                beta_query_image_feats[source_mask],
                adaptive=True,
            )
            text_residual = residual(
                part_text_flat.index_select(0, picked_parts),
                beta_query_text_feats[source_mask],
                adaptive=True,
            )
            sourcepart_visual_entailment = 0.5 * weighted_mean(visual_residual, picked_weights)
            sourcepart_text_entailment = 0.5 * weighted_mean(text_residual, picked_weights)
            sourcepart_query_count = beta_query_owner.new_tensor(int(source_mask.sum().item()))

    norm_regularization = argent_norm_regularization_loss(
        image_feats,
        text_feats,
        part_image_flat,
        part_text_flat,
        beta_query_image_feats,
        beta_query_text_feats,
    )
    sourcepart_entailment = 0.5 * (sourcepart_visual_entailment + sourcepart_text_entailment)
    query_entailment = query_text_entailment + 0.5 * (visual_entailment + text_entailment) + sourcepart_entailment
    return {
        **base,
        # The next two keys override the values inherited from ``base``.
        "entailment_loss": base["entailment_loss"] + query_entailment,
        "norm_regularization_loss": norm_regularization,
        "hier_beta_query_text_entailment_loss": query_text_entailment,
        "hier_beta_visual_entailment_loss": visual_entailment,
        "hier_beta_text_entailment_loss": text_entailment,
        "hier_beta_sourcepart_visual_entailment_loss": sourcepart_visual_entailment,
        "hier_beta_sourcepart_text_entailment_loss": sourcepart_text_entailment,
        "hier_beta_query_count": beta_query_owner.new_tensor(beta_query_owner.numel()),
        "hier_beta_sourcepart_query_count": sourcepart_query_count,
    }
def argent_entailment_diagnostics(
    image_feats: Tensor,
    text_feats: Tensor,
    part_image_flat: Tensor,
    part_text_flat: Tensor,
    image_for_parts: Tensor,
    text_for_parts: Tensor,
    kappa: Tensor,
    product_metric: str = "l1",
) -> dict[str, Tensor]:
    """Return gradient-free scalar diagnostics for the entailment pairs.

    For each (specific, general) pair the dict reports:

    * the mean of ``factor_oxy_angle`` (2-D results are first averaged over
      their last dimension),
    * the mean "pent" score, ``clamp(1 - 2 * angle / pi, 0, 1)``,
    * for the two whole/part pairs, the mean ``lorentz_dist`` and the mean of
      ``1 - exp(-distance)``.

    It also reports the mean norm of ``_space_components`` for the four
    embedding sets. Any statistic whose ``specific`` input (or embedding) has
    no elements is reported as a scalar zero.

    Each angle and distance tensor is computed once and shared between the
    statistics derived from it, and everything runs under ``torch.no_grad()``
    because the values are only reported, never back-propagated.
    """
    zero = image_feats.new_zeros(())

    def angle_stats(specific: Tensor, general: Tensor) -> tuple[Tensor, Tensor]:
        # Returns (mean angle, mean pent score) from a single angle computation.
        if specific.numel() == 0:
            return zero, zero
        angles = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
        if angles.dim() == 2:
            angles = angles.mean(dim=-1)
        pent = torch.clamp(1.0 - (2.0 * angles / math.pi), min=0.0, max=1.0)
        return angles.mean(), pent.mean()

    def distance_stats(specific: Tensor, general: Tensor) -> tuple[Tensor, Tensor]:
        # Returns (mean distance, mean 1 - exp(-distance)) from a single distance computation.
        if specific.numel() == 0:
            return zero, zero
        distance = lorentz_dist(specific, general, kappa, product_metric=product_metric)
        return distance.mean(), (1.0 - torch.exp(-distance)).mean()

    def space_norm_mean(embedding: Tensor) -> Tensor:
        if embedding.numel() == 0:
            return zero
        return torch.linalg.norm(_space_components(embedding).float(), dim=-1).mean()

    with torch.no_grad():
        text_image_angle, text_image_pent = angle_stats(image_feats, text_feats)
        part_pair_angle, part_pair_pent = angle_stats(part_image_flat, part_text_flat)
        cross_image_angle, cross_image_pent = angle_stats(image_for_parts, part_image_flat)
        cross_image_distance, cross_image_adaptive = distance_stats(image_for_parts, part_image_flat)
        cross_text_angle, cross_text_pent = angle_stats(text_for_parts, part_text_flat)
        cross_text_distance, cross_text_adaptive = distance_stats(text_for_parts, part_text_flat)
        image_norm = space_norm_mean(image_feats)
        text_norm = space_norm_mean(text_feats)
        part_image_norm = space_norm_mean(part_image_flat)
        part_text_norm = space_norm_mean(part_text_flat)

    return {
        "argent_text_image_angle_mean": text_image_angle,
        "argent_text_image_pent_mean": text_image_pent,
        "argent_part_text_image_angle_mean": part_pair_angle,
        "argent_part_text_image_pent_mean": part_pair_pent,
        "argent_cross_image_angle_mean": cross_image_angle,
        "argent_cross_image_pent_mean": cross_image_pent,
        "argent_cross_image_distance_mean": cross_image_distance,
        "argent_cross_image_adaptive_weight_mean": cross_image_adaptive,
        "argent_cross_text_angle_mean": cross_text_angle,
        "argent_cross_text_pent_mean": cross_text_pent,
        "argent_cross_text_distance_mean": cross_text_distance,
        "argent_cross_text_adaptive_weight_mean": cross_text_adaptive,
        "argent_image_space_norm_mean": image_norm,
        "argent_text_space_norm_mean": text_norm,
        "argent_part_image_space_norm_mean": part_image_norm,
        "argent_part_text_space_norm_mean": part_text_norm,
    }
-def part_quality_weights(
-    image_for_parts: Tensor,
-    text_for_parts: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    part_owner: Tensor,
-    batch_size: int,
-    kappa: Tensor,
-    mode: str,
-    topk: int = 5,
-    temperature: float = 4.0,
-    product_metric: str = "l1",
-) -> tuple[Tensor | None, Tensor, Tensor]:
-    """Score every part and turn the scores into per-part weights.
-
-    The score of a part is the mean of ``exp(-distance)`` over three pairings:
-    part image vs. owner image, part text vs. owner text, and part image vs.
-    part text. Scores and weights are computed under ``torch.no_grad()``.
-
-    Args:
-        image_for_parts: Owner image embedding for each part.
-        text_for_parts: Owner text embedding for each part.
-        part_image_flat: Part image embeddings.
-        part_text_flat: Part text embeddings.
-        part_owner: Owner index of each part.
-        batch_size: Number of owners iterated when grouping parts.
-        kappa: Curvature argument forwarded to ``lorentz_dist``.
-        mode: ``"none"``, ``"soft"`` (per-owner softmax) or ``"topk"``.
-        topk: Parts kept per owner in ``"topk"`` mode.
-        temperature: Multiplier on scores before the softmax in ``"soft"`` mode.
-        product_metric: Forwarded to ``lorentz_dist``.
-
-    Returns:
-        ``(weights, scores, keep)``. ``weights`` is ``None`` and the other two
-        are zero vectors when ``mode`` is ``"none"`` or there are no parts;
-        otherwise ``weights`` is rescaled by its mean and ``keep`` marks the
-        parts with a strictly positive weight.
-
-    Raises:
-        ValueError: If ``mode`` is not one of the supported values.
-    """
-    supported_modes = ("none", "soft", "topk")
-    if mode not in supported_modes:
-        raise ValueError(f"Unsupported part quality mode {mode!r}; expected 'none', 'soft', or 'topk'")
-    weighting_disabled = mode == "none"
-    if weighting_disabled or part_image_flat.numel() == 0:
-        placeholder = part_image_flat.new_zeros((part_image_flat.size(0),))
-        return None, placeholder, placeholder
-    # (left, right) operands of the three distances that make up a score.
-    pairings = (
-        (part_image_flat, image_for_parts),
-        (part_text_flat, text_for_parts),
-        (part_image_flat, part_text_flat),
-    )
-    with torch.no_grad():
-        affinities = [
-            torch.exp(-lorentz_dist(left, right, kappa, product_metric=product_metric))
-            for left, right in pairings
-        ]
-        scores = torch.stack(affinities).mean(dim=0).clamp_min(0.0)
-        if mode == "topk":
-            raw_weights = _owner_topk_weights(scores, part_owner, batch_size, topk)
-        else:
-            raw_weights = _owner_softmax_weights(scores, part_owner, batch_size, temperature)
-        # Rescale by the mean weight; the floor avoids dividing by zero.
-        floor = torch.finfo(raw_weights.dtype).eps
-        weights = raw_weights / raw_weights.mean().clamp_min(floor)
-    keep = (weights > 0.0).to(dtype=scores.dtype)
-    return weights, scores, keep
-def _owner_softmax_weights(scores: Tensor, part_owner: Tensor, batch_size: int, temperature: float) -> Tensor:
-    """Return per-owner softmax weights, scaled by the owner's part count.
-
-    For every owner index in ``range(batch_size)`` that has at least one part,
-    the owner's scores are multiplied by ``temperature``, passed through a
-    softmax and multiplied by the number of parts of that owner. Parts whose
-    owner is outside that range keep weight zero.
-    """
-    result = torch.zeros_like(scores)
-    for owner_id in range(batch_size):
-        selector = part_owner == owner_id
-        if bool(selector.any()):
-            group = scores[selector]
-            result[selector] = torch.softmax(group * temperature, dim=0) * group.numel()
-    return result
-def _owner_topk_weights(scores: Tensor, part_owner: Tensor, batch_size: int, topk: int) -> Tensor:
-    """Return a 0/1 weight per part, keeping the ``topk`` best parts per owner.
-
-    Owners with fewer than ``topk`` parts keep all of their parts.
-
-    Raises:
-        ValueError: If ``topk`` is not positive.
-    """
-    if topk <= 0:
-        raise ValueError("topk must be positive for top-k part quality weighting")
-    selection = torch.zeros_like(scores)
-    for owner_id in range(batch_size):
-        member_idx = (part_owner == owner_id).nonzero(as_tuple=False).flatten()
-        member_count = member_idx.numel()
-        if member_count > 0:
-            best = torch.topk(scores[member_idx], k=min(topk, member_count)).indices
-            selection[member_idx[best]] = 1.0
-    return selection
-def argent_adaptive_entailment_residual(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    adaptive_weight: bool,
-    beta: float = 1.0,
-    product_metric: str = "l1",
-) -> Tensor:
-    """Return the element-wise Huber penalty on the angle from ``factor_oxy_angle``.
-
-    A 2-D angle tensor is averaged over its last dimension first. When
-    ``adaptive_weight`` is true, each angle is multiplied by
-    ``1 - exp(-distance)`` with the distance taken from ``lorentz_dist``.
-    The penalty is a Huber loss against zero with ``delta=beta`` and no
-    reduction.
-    """
-    exterior = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
-    if exterior.dim() == 2:
-        exterior = exterior.mean(dim=-1)
-    if adaptive_weight:
-        pair_distance = lorentz_dist(specific=specific, general=general, kappa=kappa, product_metric=product_metric)
-        exterior = exterior * (1.0 - torch.exp(-pair_distance))
-    zero_target = torch.zeros_like(exterior)
-    return F.huber_loss(exterior, zero_target, delta=beta, reduction="none")
-def lorentz_dist(specific: Tensor, general: Tensor, kappa: Tensor, product_metric: str = "l1") -> Tensor:
-    """Return ``paired_dist`` of ``specific`` and ``general`` for the given ``kappa`` and ``product_metric``."""
-    distances = paired_dist(specific, general, kappa, product_metric=product_metric)
-    return distances
-def argent_norm_regularization_loss(*embeddings: Tensor, eps: float = 1e-6) -> Tensor:
-    """Average the ``r**2 - log(r)`` penalty over all non-empty embeddings.
-
-    ``r`` is the norm of an embedding's space components, floored at ``eps``.
-    Each non-empty tensor contributes the mean of its penalties, and the
-    result is the mean of those per-tensor means.
-
-    Raises:
-        ValueError: If every given tensor is empty (or none is given).
-    """
-    def _mean_penalty(embedding: Tensor) -> Tensor:
-        radius = torch.linalg.norm(_space_components(embedding).float(), dim=-1).clamp_min(eps)
-        return (radius.square() - torch.log(radius)).mean()
-    penalties = [_mean_penalty(embedding) for embedding in embeddings if embedding.numel() != 0]
-    if len(penalties) == 0:
-        raise ValueError("argent_norm_regularization_loss requires at least one non-empty embedding tensor")
-    return torch.stack(penalties).mean()
-def piecewise_entailment_residual(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    aperture_scale: float,
-    factor: float = 0.1,
-    geometry: str = "lorentz",
-) -> Tensor:
-    """Return a two-piece penalty on the angle relative to a scaled aperture.
-
-    With ``residual = angle - scale * aperture`` the penalty is
-    ``residual + factor * angle`` where the residual is positive and
-    ``factor * angle`` elsewhere. A 2-D result is averaged over its last
-    dimension.
-
-    Raises:
-        ValueError: If ``geometry`` is neither ``"lorentz"`` nor ``"euclidean"``.
-    """
-    if geometry not in ("lorentz", "euclidean"):
-        raise ValueError(f"Unsupported entailment geometry {geometry!r}; expected 'lorentz' or 'euclidean'")
-    if geometry == "euclidean":
-        angles = euclidean_angle(specific=specific, general=general)
-        apertures = euclidean_half_aperture(general=general, aperture_scale=aperture_scale)
-        # The Euclidean aperture already received aperture_scale, so do not scale it again.
-        scale = 1.0
-    else:
-        angles = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
-        apertures = factor_half_aperture(general=general, kappa=kappa)
-        scale = aperture_scale
-    residual = angles - scale * apertures
-    base_penalty = factor * angles
-    penalty = torch.where(residual > 0.0, residual + base_penalty, base_penalty)
-    if penalty.dim() == 2:
-        return penalty.mean(dim=-1)
-    return penalty
-def euclidean_angle(specific: Tensor, general: Tensor, eps: float = 1e-6) -> Tensor:
-    """Return the angle between the space components of ``specific`` and ``general``.
-
-    The cosine is computed in float32 along the last dimension. Its
-    denominator is floored, and the cosine is clamped strictly inside
-    ``(-1, 1)`` by ``max(eps, 16 * machine epsilon)`` before ``acos``.
-    """
-    specific_vec = _space_components(specific).float()
-    general_vec = _space_components(general).float()
-    dot = torch.sum(specific_vec * general_vec, dim=-1)
-    norm_product = torch.linalg.norm(specific_vec, dim=-1) * torch.linalg.norm(general_vec, dim=-1)
-    margin = max(eps, 16.0 * torch.finfo(specific_vec.dtype).eps)
-    cosine = torch.clamp(dot / norm_product.clamp_min(margin), min=-1.0 + margin, max=1.0 - margin)
-    return torch.acos(cosine)
-def euclidean_half_aperture(general: Tensor, aperture_scale: float, eps: float = 1e-8) -> Tensor:
-    """Return ``atan(aperture_scale / r)``, where ``r`` is the norm of ``general``'s space components floored at ``eps``."""
-    radius = torch.linalg.norm(_space_components(general).float(), dim=-1).clamp_min(eps)
-    scale = torch.as_tensor(aperture_scale, device=general.device, dtype=general.dtype)
-    return torch.atan(scale / radius)
-def aggregate_part_consistency_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    part_owner: Tensor,
-    part_weights: Tensor | None = None,
-) -> Tensor:
-    """Penalise cosine disagreement between per-owner part aggregates and global features.
-
-    The space components of all parts are summed per owner (optionally
-    weighted) and divided by the owner's part count or total weight. The loss
-    is the average of four cosine residuals: image aggregate vs. image, text
-    aggregate vs. text, and the two cross-modal pairings. Owners without
-    parts, or with a non-positive total weight, are excluded.
-
-    Args:
-        image_feats: Global image embeddings, one row per owner.
-        text_feats: Global text embeddings, one row per owner.
-        part_image_flat: Part image embeddings.
-        part_text_flat: Part text embeddings.
-        part_owner: Owner row index of each part.
-        part_weights: Optional weight per part; uniform weights when ``None``.
-
-    Returns:
-        A scalar loss. Zero when there are no parts or when no owner has a
-        positive total weight.
-
-    Raises:
-        ValueError: If ``part_weights`` does not have one element per part.
-    """
-    if part_image_flat.numel() == 0:
-        return image_feats.new_zeros(())
-    batch_size = image_feats.size(0)
-    image_space = _space_components(image_feats).reshape(batch_size, -1).float()
-    text_space = _space_components(text_feats).reshape(batch_size, -1).float()
-    part_image_space = _space_components(part_image_flat).reshape(part_image_flat.size(0), -1).float()
-    part_text_space = _space_components(part_text_flat).reshape(part_text_flat.size(0), -1).float()
-    if part_weights is None:
-        counts = torch.bincount(part_owner, minlength=batch_size).to(device=image_feats.device, dtype=image_space.dtype)
-        denom = counts
-        valid = counts > 0
-        weights = part_image_space.new_ones((part_image_space.size(0),))
-    else:
-        weights = part_weights.to(device=image_feats.device, dtype=image_space.dtype).flatten()
-        if weights.numel() != part_owner.numel():
-            raise ValueError("part_weights must have the same number of elements as part_owner when provided")
-        denom = torch.zeros(batch_size, device=image_feats.device, dtype=image_space.dtype)
-        denom.index_add_(0, part_owner, weights)
-        valid = denom > 0
-    if not bool(valid.any()):
-        # Every owner was filtered out (e.g. all part weights are zero). The
-        # means below would then run over empty tensors and yield NaN, so
-        # report "no penalty" instead, as for the no-parts case above.
-        return image_feats.new_zeros(())
-    image_agg = image_space.new_zeros(image_space.shape)
-    text_agg = text_space.new_zeros(text_space.shape)
-    image_agg.index_add_(0, part_owner, part_image_space * weights[:, None])
-    text_agg.index_add_(0, part_owner, part_text_space * weights[:, None])
-    # Denominators are floored at 1.0 before dividing. Cosine similarity does
-    # not depend on a positive per-row scale, so this only affects magnitude.
-    image_agg = image_agg[valid] / denom[valid, None].clamp_min(1.0)
-    text_agg = text_agg[valid] / denom[valid, None].clamp_min(1.0)
-    image_space = image_space[valid]
-    text_space = text_space[valid]
-    return 0.25 * (
-        cosine_residual(image_agg, image_space)
-        + cosine_residual(text_agg, text_space)
-        + cosine_residual(image_agg, text_space)
-        + cosine_residual(text_agg, image_space)
-    )
-def cosine_residual(x: Tensor, y: Tensor) -> Tensor:
-    """Return the mean of ``1 - cosine_similarity(x, y)`` taken along the last dimension."""
-    similarity = F.cosine_similarity(x, y, dim=-1)
-    gap = 1.0 - similarity
-    return gap.mean()
-def uncertainty_calibrated_entailment_loss(
-    entail_residual: Tensor,
-    log_uncertainty: Tensor,
-    alpha: float = 10.0,
-    stop_grad: bool = True,
-    weights: Tensor | None = None,
-) -> tuple[Tensor, Tensor]:
-    """Return the ``weighted_mean`` of the halved residual and of a calibration loss.
-
-    The calibration loss is ``alpha * (0.5 * residual / uncertainty
-    + 0.5 * log_uncertainty + entropy)``. ``uncertainty`` is the clamped
-    exponential of ``log_uncertainty``, and ``entropy`` is the entropy of the
-    softmax over all flattened ``log_uncertainty`` values. With ``stop_grad``
-    the residual is detached inside the calibration loss only.
-    """
-    if stop_grad:
-        calibration_residual = entail_residual.detach()
-    else:
-        calibration_residual = entail_residual
-    sigma = torch.exp(log_uncertainty).clamp(min=1e-6, max=1e6)
-    scaled_residual = calibration_residual / (sigma + 1e-6)
-    calibration_term = 0.5 * scaled_residual + 0.5 * log_uncertainty
-    flat_prob = torch.softmax(log_uncertainty.flatten(), dim=0)
-    entropy = -(flat_prob * torch.log(flat_prob + 1e-8)).sum()
-    calibration_loss = alpha * (calibration_term + entropy)
-    halved_residual = 0.5 * entail_residual
-    return weighted_mean(halved_residual, weights), weighted_mean(calibration_loss, weights)
-def embedding_uncertainty(x: Tensor) -> Tensor:
-    """Return ``softplus(-r)``, where ``r`` is the norm of ``x``'s space components.
-
-    When the norm still has more than one dimension it is averaged over its
-    last dimension before the softplus.
-    """
-    radius = torch.linalg.norm(_space_components(x).float(), dim=-1)
-    radius = radius.mean(dim=-1) if radius.dim() > 1 else radius
-    return F.softplus(-radius)
-def _space_components(x: Tensor) -> Tensor:
-    """Drop the first entry of the last dimension; return ``x`` unchanged if that dimension has size <= 1."""
-    # NOTE(review): index 0 is presumably the Lorentz time coordinate - confirm.
-    if x.shape[-1] > 1:
-        return x[..., 1:]
-    return x
-def _flatten_valid_parts(part_image_feats: Tensor, part_text_feats: Tensor, part_mask: Tensor, targets: Tensor) -> tuple[Tensor, Tensor, Tensor]:
-    """Select the entries where ``part_mask`` is set, plus the owner target of each selected part."""
-    per_part_targets = targets.unsqueeze(1).expand_as(part_mask)
-    kept_image = part_image_feats[part_mask]
-    kept_text = part_text_feats[part_mask]
-    return kept_image, kept_text, per_part_targets[part_mask]

hyper3_clip/models/objectives.py DELETED Viewed

@@ -1,580 +0,0 @@
-from __future__ import annotations
-from collections.abc import Mapping
-import torch
-from torch import Tensor, nn
-from hyper3_clip.models.lorentz import log_map0, metric_pairwise_dist
-from hyper3_clip.models.losses import (
-    aggregate_part_consistency_loss,
-    contrastive_ce,
-    gramian_volume_loss,
-    hierarchical_beta_argent_entailment_losses,
-    packed_part_contrastive_loss,
-    packed_part_entailment_loss,
-    part_quality_weights,
-    radius_order_hinge,
-    uncha_argent_entailment_losses,
-    uncha_contrastive_losses,
-    uncha_entailment_losses,
-)
-from hyper3_clip.training.distributed import gather_variable_many_with_grad, gather_variable_no_grad, get_rank
-class HyCoCLIPObjective(nn.Module):
-    """Objective that adds a weighted part entailment loss to a part contrastive loss.
-
-    ``product_metric`` is stored on the module but is not passed to either
-    loss helper in ``forward``.
-    """
-    def __init__(
-        self,
-        entail_weight: float,
-        inter_aperture_scale: float,
-        intra_aperture_scale: float,
-        product_metric: str = "l1",
-    ) -> None:
-        super().__init__()
-        self.product_metric = product_metric
-        self.intra_aperture_scale = intra_aperture_scale
-        self.inter_aperture_scale = inter_aperture_scale
-        self.entail_weight = entail_weight
-    def forward(self, embeddings: Mapping[str, Tensor], logit_scale: Tensor) -> dict[str, Tensor]:
-        """Return the total loss, both loss terms and the number of parts.
-
-        ``embeddings`` must provide ``part_owner``, ``image_feats``,
-        ``text_feats``, ``part_image_feats``, ``part_text_feats`` and
-        ``kappa``. ``all_image_feats``, ``all_text_feats`` and ``targets``
-        are optional and passed as ``None`` when absent.
-        """
-        owner = embeddings["part_owner"].long()
-        # Arguments that both loss helpers receive.
-        shared_inputs = {
-            "image_feats": embeddings["image_feats"],
-            "text_feats": embeddings["text_feats"],
-            "part_image_feats": embeddings["part_image_feats"],
-            "part_text_feats": embeddings["part_text_feats"],
-            "part_owner": owner,
-            "kappa": embeddings["kappa"],
-        }
-        contrastive_term = packed_part_contrastive_loss(
-            **shared_inputs,
-            logit_scale=logit_scale,
-            all_image_feats=embeddings.get("all_image_feats"),
-            all_text_feats=embeddings.get("all_text_feats"),
-            targets=embeddings.get("targets"),
-        )
-        entailment_term = packed_part_entailment_loss(
-            **shared_inputs,
-            inter_aperture_scale=self.inter_aperture_scale,
-            intra_aperture_scale=self.intra_aperture_scale,
-        )
-        return {
-            "loss": contrastive_term + self.entail_weight * entailment_term,
-            "contrastive_loss": contrastive_term,
-            "entailment_loss": entailment_term,
-            "part_count": owner.new_tensor(owner.numel()),
-        }
-class UNCHAObjective(nn.Module):
-    """Composite objective combining contrastive, entailment and auxiliary part losses.
-
-    The constructor validates the enumerated string options and a few numeric
-    ranges, then stores every hyperparameter on the module. ``forward`` sums
-    the contrastive loss with the weighted HiMo, entailment, aggregate
-    consistency, radius-order, Gramian and norm-regularization terms, and
-    returns the total together with the individual components.
-    """
-    def __init__(
-        self,
-        entail_weight: float,
-        inter_aperture_scale: float,
-        intra_aperture_scale: float,
-        piecewise_factor: float = 0.1,
-        calibration_alpha: float = 10.0,
-        stop_grad_calibration: bool = True,
-        entailment_geometry: str = "lorentz",
-        aggregate_weight: float = 0.0,
-        entailment_loss: str = "piecewise",
-        argent_beta: float = 1.0,
-        argent_norm_weight: float = 0.0,
-        argent_aux_weight: float = 0.5,
-        argent_aggregation: str = "uncha",
-        part_weight_power: float = 0.0,
-        product_metric: str = "l1",
-        contrastive_loss: str = "ce",
-        sigmoid_negative_weight: float = 1.0,
-        part_quality_mode: str = "none",
-        part_quality_topk: int = 5,
-        part_quality_temperature: float = 4.0,
-        contrastive_global_weight: float = 1.0,
-        contrastive_local_weight: float = 1.0,
-        contrastive_global_local_weight: float = 1.0,
-        beta_cal_beta: float = 0.0,
-        beta_cal_variant: str = "ce",
-        beta_cal_weight: float = 0.0,
-        himo_component_weight: float = 0.0,
-        global_local_mode: str = "repeat",
-        global_local_metric: str = "distance",
-        global_local_angle_aux_weight: float = 0.0,
-        global_local_angle_aux_mode: str = "contrastive",
-        global_local_angle_aux_scale: float = 5.5,
-        global_local_angle_aux_aperture_scale: float = 1.0,
-        radius_order_weight: float = 0.0,
-        radius_order_margin: float = 0.0,
-        gramian_align_weight: float = 0.0,
-    ) -> None:
-        super().__init__()
-        # Reject unsupported option strings and out-of-range scalars up front.
-        # entailment_geometry is not checked here.
-        if entailment_loss not in {
-            "piecewise",
-            "argent",
-            "piecewise_argent",
-            "hier_beta_argent",
-            "hier_beta_sourcepart_argent",
-        }:
-            raise ValueError(
-                f"Unsupported UNCHA entailment loss {entailment_loss!r}; "
-                "expected 'piecewise', 'argent', 'piecewise_argent', 'hier_beta_argent', "
-                "or 'hier_beta_sourcepart_argent'"
-            )
-        if contrastive_loss not in {"ce", "sigmoid", "siglip", "siglip_metric"}:
-            raise ValueError("contrastive_loss must be 'ce', 'sigmoid', 'siglip', or 'siglip_metric'")
-        if beta_cal_variant not in {"ce", "bce"}:
-            raise ValueError("beta_cal_variant must be 'ce' or 'bce'")
-        if argent_aggregation not in {"uncha", "equal"}:
-            raise ValueError("argent_aggregation must be 'uncha' or 'equal'")
-        if part_quality_mode not in {"none", "soft", "topk"}:
-            raise ValueError("part_quality_mode must be 'none', 'soft', or 'topk'")
-        if global_local_mode not in {"repeat", "inbatch"}:
-            raise ValueError("global_local_mode must be 'repeat' or 'inbatch'")
-        if global_local_metric not in {"distance", "angle"}:
-            raise ValueError("global_local_metric must be 'distance' or 'angle'")
-        if global_local_angle_aux_mode not in {"contrastive", "positive_hinge"}:
-            raise ValueError("global_local_angle_aux_mode must be 'contrastive' or 'positive_hinge'")
-        if global_local_angle_aux_weight < 0.0:
-            raise ValueError("global_local_angle_aux_weight must be non-negative")
-        if global_local_angle_aux_scale <= 0.0:
-            raise ValueError("global_local_angle_aux_scale must be positive")
-        if global_local_angle_aux_aperture_scale <= 0.0:
-            raise ValueError("global_local_angle_aux_aperture_scale must be positive")
-        if part_quality_topk <= 0:
-            raise ValueError("part_quality_topk must be positive")
-        # Store the hyperparameters; several weights and scales are coerced to float.
-        self.entail_weight = entail_weight
-        self.inter_aperture_scale = inter_aperture_scale
-        self.intra_aperture_scale = intra_aperture_scale
-        self.piecewise_factor = piecewise_factor
-        self.calibration_alpha = calibration_alpha
-        self.stop_grad_calibration = stop_grad_calibration
-        self.entailment_geometry = entailment_geometry
-        self.aggregate_weight = aggregate_weight
-        self.entailment_loss = entailment_loss
-        self.argent_beta = argent_beta
-        self.argent_norm_weight = argent_norm_weight
-        self.argent_aux_weight = argent_aux_weight
-        self.argent_aggregation = argent_aggregation
-        self.part_weight_power = part_weight_power
-        self.product_metric = product_metric
-        self.contrastive_loss = contrastive_loss
-        self.sigmoid_negative_weight = sigmoid_negative_weight
-        self.part_quality_mode = part_quality_mode
-        self.part_quality_topk = part_quality_topk
-        self.part_quality_temperature = part_quality_temperature
-        self.contrastive_global_weight = float(contrastive_global_weight)
-        self.contrastive_local_weight = float(contrastive_local_weight)
-        self.contrastive_global_local_weight = float(contrastive_global_local_weight)
-        self.beta_cal_beta = float(beta_cal_beta)
-        self.beta_cal_variant = beta_cal_variant
-        self.beta_cal_weight = float(beta_cal_weight)
-        self.himo_component_weight = float(himo_component_weight)
-        self.global_local_mode = global_local_mode
-        self.global_local_metric = global_local_metric
-        self.global_local_angle_aux_weight = float(global_local_angle_aux_weight)
-        self.global_local_angle_aux_mode = global_local_angle_aux_mode
-        self.global_local_angle_aux_scale = float(global_local_angle_aux_scale)
-        self.global_local_angle_aux_aperture_scale = float(global_local_angle_aux_aperture_scale)
-        self.radius_order_weight = float(radius_order_weight)
-        self.radius_order_margin = float(radius_order_margin)
-        self.gramian_align_weight = float(gramian_align_weight)
-    def forward(self, embeddings: Mapping[str, Tensor], logit_scales: Mapping[str, Tensor]) -> dict[str, Tensor]:
-        """Compute the total loss and its components for one batch.
-
-        Args:
-            embeddings: Must contain ``part_owner``, ``part_image_feats``,
-                ``part_text_feats``, ``image_feats``, ``text_feats``,
-                ``kappa`` and ``targets``. Euclidean features, gathered
-                ``all_*`` features, HiMo text features, beta-query tensors
-                and ``entail_weight_scale`` are read when present.
-            logit_scales: Must contain ``global``, ``local`` and
-                ``global_local``; the matching ``*_bias`` keys are optional.
-
-        Returns:
-            A dict with ``loss`` (the weighted total), every entry returned by
-            the contrastive and entailment helpers, the auxiliary loss terms,
-            ``part_count`` and part-quality statistics.
-
-        Raises:
-            ValueError: If ``targets`` is missing, if HiMo text features are
-                given without ``all_himo_text_feats``, or if a hierarchical
-                beta entailment mode lacks its beta-query embeddings.
-        """
-        part_owner = embeddings["part_owner"].long()
-        part_count = part_owner.new_tensor(part_owner.numel())
-        part_image_flat = embeddings["part_image_feats"]
-        part_text_flat = embeddings["part_text_feats"]
-        image_feats = embeddings["image_feats"]
-        text_feats = embeddings["text_feats"]
-        # Global embedding of each part's owner; empty placeholders when there are no parts.
-        if part_owner.numel() == 0:
-            image_for_parts = image_feats.new_zeros((0, image_feats.size(-1)))
-            text_for_parts = text_feats.new_zeros((0, text_feats.size(-1)))
-        else:
-            image_for_parts = image_feats[part_owner]
-            text_for_parts = text_feats[part_owner]
-        # Per-part weights: a count-based factor combined with an optional quality-based factor.
-        count_part_weights = _part_weights(part_owner, image_feats.size(0), self.part_weight_power)
-        quality_part_weights, quality_scores, quality_keep = part_quality_weights(
-            image_for_parts=image_for_parts,
-            text_for_parts=text_for_parts,
-            part_image_flat=part_image_flat,
-            part_text_flat=part_text_flat,
-            part_owner=part_owner,
-            batch_size=image_feats.size(0),
-            kappa=embeddings["kappa"],
-            mode=self.part_quality_mode,
-            topk=self.part_quality_topk,
-            temperature=self.part_quality_temperature,
-            product_metric=self.product_metric,
-        )
-        part_weights = _combine_part_weights(count_part_weights, quality_part_weights)
-        # Owner-repeated global features are only gathered in "repeat" mode with a non-zero weight.
-        # NOTE(review): the gather helpers presumably collect tensors across distributed ranks - confirm.
-        needs_repeated_global_local = self.global_local_mode == "repeat" and self.contrastive_global_local_weight != 0.0
-        part_feature_tensors = [part_image_flat, part_text_flat]
-        if needs_repeated_global_local:
-            part_feature_tensors.extend([image_for_parts, text_for_parts])
-        gathered_part_features, part_counts = gather_variable_many_with_grad(part_feature_tensors)
-        all_part_image_feats = gathered_part_features[0]
-        all_part_text_feats = gathered_part_features[1]
-        all_image_for_parts = gathered_part_features[2] if needs_repeated_global_local else None
-        all_text_for_parts = gathered_part_features[3] if needs_repeated_global_local else None
-        # Optional Euclidean features follow the same gathering; they stay None when not supplied.
-        image_euc_feats = embeddings.get("image_euc_feats")
-        text_euc_feats = embeddings.get("text_euc_feats")
-        part_image_euc_flat = embeddings.get("part_image_euc_feats")
-        part_text_euc_flat = embeddings.get("part_text_euc_feats")
-        image_for_parts_euc = None
-        text_for_parts_euc = None
-        all_part_image_euc_feats = None
-        all_part_text_euc_feats = None
-        all_image_for_parts_euc = None
-        all_text_for_parts_euc = None
-        if (
-            image_euc_feats is not None
-            and text_euc_feats is not None
-            and part_owner.numel() > 0
-            and needs_repeated_global_local
-        ):
-            image_for_parts_euc = image_euc_feats[part_owner]
-            text_for_parts_euc = text_euc_feats[part_owner]
-        if part_image_euc_flat is not None and part_text_euc_flat is not None:
-            euc_feature_tensors = [part_image_euc_flat, part_text_euc_flat]
-            if image_for_parts_euc is not None and text_for_parts_euc is not None:
-                euc_feature_tensors.extend([image_for_parts_euc, text_for_parts_euc])
-            gathered_euc_features, _ = gather_variable_many_with_grad(euc_feature_tensors)
-            all_part_image_euc_feats = gathered_euc_features[0]
-            all_part_text_euc_feats = gathered_euc_features[1]
-            if image_for_parts_euc is not None and text_for_parts_euc is not None:
-                all_image_for_parts_euc = gathered_euc_features[2]
-                all_text_for_parts_euc = gathered_euc_features[3]
-        if "targets" not in embeddings:
-            raise ValueError("UNCHAObjective requires 'targets' to compute group-aware losses")
-        global_targets = embeddings["targets"]
-        # Each part inherits the target of its owner as its group id.
-        part_group_ids = global_targets[part_owner] if part_owner.numel() > 0 else part_owner.new_zeros((0,))
-        all_part_group_ids = None
-        if self.beta_cal_weight > 0.0 and self.beta_cal_beta > 0.0:
-            all_part_group_ids, _ = gather_variable_no_grad(part_group_ids)
-        # Shift local part indices by the part counts that precede this rank in part_counts.
-        part_offset = part_counts[: get_rank()].sum() if part_counts.numel() > 1 else part_counts.new_zeros(())
-        part_targets = torch.arange(part_image_flat.size(0), device=part_image_flat.device) + part_offset
-        contrastive = uncha_contrastive_losses(
-            image_feats=image_feats,
-            text_feats=text_feats,
-            part_image_flat=part_image_flat,
-            part_text_flat=part_text_flat,
-            image_for_parts=image_for_parts,
-            text_for_parts=text_for_parts,
-            image_euc_feats=image_euc_feats,
-            text_euc_feats=text_euc_feats,
-            part_image_euc_flat=part_image_euc_flat,
-            part_text_euc_flat=part_text_euc_flat,
-            image_for_parts_euc=image_for_parts_euc,
-            text_for_parts_euc=text_for_parts_euc,
-            kappa=embeddings["kappa"],
-            global_logit_scale=logit_scales["global"],
-            local_logit_scale=logit_scales["local"],
-            global_local_logit_scale=logit_scales["global_local"],
-            all_image_feats=embeddings.get("all_image_feats"),
-            all_text_feats=embeddings.get("all_text_feats"),
-            all_part_image_feats=all_part_image_feats,
-            all_part_text_feats=all_part_text_feats,
-            all_image_for_parts=all_image_for_parts,
-            all_text_for_parts=all_text_for_parts,
-            all_image_euc_feats=embeddings.get("all_image_euc_feats"),
-            all_text_euc_feats=embeddings.get("all_text_euc_feats"),
-            all_part_image_euc_feats=all_part_image_euc_feats,
-            all_part_text_euc_feats=all_part_text_euc_feats,
-            all_image_for_parts_euc=all_image_for_parts_euc,
-            all_text_for_parts_euc=all_text_for_parts_euc,
-            global_targets=global_targets,
-            part_targets=part_targets,
-            part_weights=part_weights,
-            product_metric=self.product_metric,
-            loss_type=self.contrastive_loss,
-            contrastive_global_weight=self.contrastive_global_weight,
-            contrastive_local_weight=self.contrastive_local_weight,
-            contrastive_global_local_weight=self.contrastive_global_local_weight,
-            beta_cal_beta=self.beta_cal_beta,
-            beta_cal_variant=self.beta_cal_variant,
-            beta_cal_weight=self.beta_cal_weight,
-            part_group_ids=part_group_ids,
-            all_part_group_ids=all_part_group_ids,
-            global_logit_bias=logit_scales.get("global_bias"),
-            local_logit_bias=logit_scales.get("local_bias"),
-            global_local_logit_bias=logit_scales.get("global_local_bias"),
-            sigmoid_negative_weight=self.sigmoid_negative_weight,
-            global_local_mode=self.global_local_mode,
-            global_local_metric=self.global_local_metric,
-            global_local_angle_aux_weight=self.global_local_angle_aux_weight,
-            global_local_angle_aux_mode=self.global_local_angle_aux_mode,
-            global_local_angle_aux_scale=self.global_local_angle_aux_scale,
-            global_local_angle_aux_aperture_scale=self.global_local_angle_aux_aperture_scale,
-        )
-        # Optional HiMo text term: symmetric contrastive loss with negated pairwise distances as logits.
-        himo_component_loss = image_feats.new_zeros(())
-        if self.himo_component_weight > 0.0 and embeddings.get("himo_text_feats") is not None:
-            himo_text_feats = embeddings["himo_text_feats"]
-            all_himo_text_feats = embeddings.get("all_himo_text_feats")
-            if all_himo_text_feats is None:
-                raise ValueError("himo_text_feats requires all_himo_text_feats for distributed contrastive loss")
-            # The exponentiated global logit scale is capped at 100.
-            scale = logit_scales["global"].exp().clamp(max=100.0)
-            logits_i_t = -metric_pairwise_dist(image_feats, all_himo_text_feats, embeddings["kappa"], product_metric=self.product_metric) * scale
-            logits_t_i = -metric_pairwise_dist(himo_text_feats, embeddings["all_image_feats"], embeddings["kappa"], product_metric=self.product_metric) * scale
-            himo_component_loss = 0.5 * (contrastive_ce(logits_i_t, global_targets) + contrastive_ce(logits_t_i, global_targets))
-        # Select the entailment formulation configured at construction time.
-        if self.entailment_loss == "argent":
-            entailment = uncha_argent_entailment_losses(
-                image_feats=image_feats,
-                text_feats=text_feats,
-                part_image_flat=part_image_flat,
-                part_text_flat=part_text_flat,
-                image_for_parts=image_for_parts,
-                text_for_parts=text_for_parts,
-                kappa=embeddings["kappa"],
-                beta=self.argent_beta,
-                part_weights=part_weights,
-                product_metric=self.product_metric,
-                aggregation=self.argent_aggregation,
-            )
-        elif self.entailment_loss in {"hier_beta_argent", "hier_beta_sourcepart_argent"}:
-            # Both hierarchical modes need the beta-query tensors; the source-part variant needs one more.
-            required = (
-                "beta_query_image_feats",
-                "beta_query_text_feats",
-                "beta_query_owner",
-                "beta_query_parent",
-                "beta_query_weight",
-            )
-            if self.entailment_loss == "hier_beta_sourcepart_argent":
-                required = (*required, "beta_query_source_part")
-            missing = [key for key in required if embeddings.get(key) is None]
-            if missing:
-                raise ValueError(f"{self.entailment_loss} requires beta query embeddings: missing {missing}")
-            entailment = hierarchical_beta_argent_entailment_losses(
-                image_feats=image_feats,
-                text_feats=text_feats,
-                part_image_flat=part_image_flat,
-                part_text_flat=part_text_flat,
-                image_for_parts=image_for_parts,
-                text_for_parts=text_for_parts,
-                beta_query_image_feats=embeddings["beta_query_image_feats"],
-                beta_query_text_feats=embeddings["beta_query_text_feats"],
-                beta_query_owner=embeddings["beta_query_owner"],
-                beta_query_parent=embeddings["beta_query_parent"],
-                beta_query_weight=embeddings["beta_query_weight"],
-                beta_query_source_part=embeddings.get("beta_query_source_part")
-                if self.entailment_loss == "hier_beta_sourcepart_argent"
-                else None,
-                kappa=embeddings["kappa"],
-                beta=self.argent_beta,
-                part_weights=part_weights,
-                product_metric=self.product_metric,
-                aggregation=self.argent_aggregation,
-            )
-        else:
-            # "piecewise" and "piecewise_argent" both start from the piecewise entailment losses.
-            piecewise_entailment = uncha_entailment_losses(
-                image_feats=image_feats,
-                text_feats=text_feats,
-                part_image_flat=part_image_flat,
-                part_text_flat=part_text_flat,
-                image_for_parts=image_for_parts,
-                text_for_parts=text_for_parts,
-                kappa=embeddings["kappa"],
-                inter_aperture_scale=self.inter_aperture_scale,
-                intra_aperture_scale=self.intra_aperture_scale,
-                piecewise_factor=self.piecewise_factor,
-                calibration_alpha=self.calibration_alpha,
-                stop_grad_calibration=self.stop_grad_calibration,
-                geometry=self.entailment_geometry,
-                part_weights=part_weights,
-            )
-            if self.entailment_loss == "piecewise_argent":
-                argent_entailment = uncha_argent_entailment_losses(
-                    image_feats=image_feats,
-                    text_feats=text_feats,
-                    part_image_flat=part_image_flat,
-                    part_text_flat=part_text_flat,
-                    image_for_parts=image_for_parts,
-                    text_for_parts=text_for_parts,
-                    kappa=embeddings["kappa"],
-                    beta=self.argent_beta,
-                    part_weights=part_weights,
-                    product_metric=self.product_metric,
-                    aggregation=self.argent_aggregation,
-                )
-                # Add the ARGENT loss as a weighted auxiliary term and report both parts separately.
-                entailment = {
-                    **piecewise_entailment,
-                    "entailment_loss": piecewise_entailment["entailment_loss"]
-                    + self.argent_aux_weight * argent_entailment["entailment_loss"],
-                    "piecewise_entailment_loss": piecewise_entailment["entailment_loss"],
-                    "argent_entailment_loss": argent_entailment["entailment_loss"],
-                    "norm_regularization_loss": argent_entailment["norm_regularization_loss"],
-                }
-            else:
-                entailment = piecewise_entailment
-        # Computed regardless of aggregate_weight, so it is always reported in the output.
-        aggregate = aggregate_part_consistency_loss(
-            image_feats=image_feats,
-            text_feats=text_feats,
-            part_image_flat=part_image_flat,
-            part_text_flat=part_text_flat,
-            part_owner=part_owner,
-            part_weights=part_weights,
-        )
-        radius_order = image_feats.new_zeros(())
-        if self.radius_order_weight > 0.0:
-            radius_order = (
-                radius_order_hinge(image_feats, text_feats, embeddings["kappa"], self.radius_order_margin)
-                + radius_order_hinge(part_image_flat, part_text_flat, embeddings["kappa"], self.radius_order_margin, part_weights)
-                + radius_order_hinge(image_for_parts, part_image_flat, embeddings["kappa"], self.radius_order_margin, part_weights)
-                + radius_order_hinge(text_for_parts, part_text_flat, embeddings["kappa"], self.radius_order_margin, part_weights)
-            )
-        gramian_align = image_feats.new_zeros(())
-        if self.gramian_align_weight > 0.0 and part_owner.numel() > 0:
-            def _tangent_flat(x: Tensor) -> Tensor:
-                # Apply log_map0 and flatten 3-D results to one row per part.
-                tangent = log_map0(x, embeddings["kappa"])
-                return tangent.reshape(tangent.size(0), -1) if tangent.dim() == 3 else tangent
-            # Four vectors per part: owner image, owner text, part image, part text.
-            gramian_vectors = torch.stack(
-                [
-                    _tangent_flat(image_for_parts),
-                    _tangent_flat(text_for_parts),
-                    _tangent_flat(part_image_flat),
-                    _tangent_flat(part_text_flat),
-                ],
-                dim=1,
-            )
-            gramian_align = gramian_volume_loss(gramian_vectors, part_weights)
-        # Optional multiplier on the entailment weight; defaults to a scalar one.
-        entail_weight_scale = embeddings.get("entail_weight_scale", image_feats.new_ones(()))
-        total = (
-            contrastive["contrastive_loss"]
-            + self.himo_component_weight * himo_component_loss
-            + self.entail_weight * entail_weight_scale * entailment["entailment_loss"]
-            + self.aggregate_weight * aggregate
-            + self.radius_order_weight * radius_order
-            + self.gramian_align_weight * gramian_align
-            + self.argent_norm_weight * entailment.get(
-                "norm_regularization_loss",
-                image_feats.new_zeros(()),
-            )
-        )
-        # Later entries override earlier ones, so keys from the helper dicts can replace "loss".
-        return {
-            "loss": total,
-            **contrastive,
-            "himo_component_contrastive_loss": himo_component_loss,
-            **entailment,
-            "aggregate_consistency_loss": aggregate,
-            "radius_order_loss": radius_order,
-            "gramian_align_loss": gramian_align,
-            "part_count": part_count,
-            "entail_weight_scale": entail_weight_scale.detach(),
-            "part_quality_mean": (
-                image_feats.new_zeros(()) if quality_scores.numel() == 0 else quality_scores.mean().detach()
-            ),
-            "part_quality_keep_fraction": (
-                image_feats.new_zeros(()) if quality_keep.numel() == 0 else quality_keep.mean().detach()
-            ),
-        }
-def build_objective(
-    objective: str,
-    entail_weight: float,
-    inter_aperture_scale: float,
-    intra_aperture_scale: float,
-    uncha_piecewise_factor: float = 0.1,
-    uncha_calibration_alpha: float = 10.0,
-    uncha_stop_grad_calibration: bool = True,
-    uncha_entailment_geometry: str = "lorentz",
-    uncha_aggregate_weight: float = 0.0,
-    uncha_entailment_loss: str = "piecewise",
-    uncha_argent_beta: float = 1.0,
-    uncha_argent_norm_weight: float = 0.0,
-    uncha_argent_aux_weight: float = 0.5,
-    uncha_argent_aggregation: str = "uncha",
-    uncha_part_weight_power: float = 0.0,
-    uncha_contrastive_loss: str = "ce",
-    uncha_sigmoid_negative_weight: float = 1.0,
-    uncha_part_quality_mode: str = "none",
-    uncha_part_quality_topk: int = 5,
-    uncha_part_quality_temperature: float = 4.0,
-    uncha_contrastive_global_weight: float = 1.0,
-    uncha_contrastive_local_weight: float = 1.0,
-    uncha_contrastive_global_local_weight: float = 1.0,
-    uncha_beta_cal_beta: float = 0.0,
-    uncha_beta_cal_variant: str = "ce",
-    uncha_beta_cal_weight: float = 0.0,
-    uncha_himo_component_weight: float = 0.0,
-    uncha_global_local_mode: str = "repeat",
-    uncha_global_local_metric: str = "distance",
-    uncha_global_local_angle_aux_weight: float = 0.0,
-    uncha_global_local_angle_aux_mode: str = "contrastive",
-    uncha_global_local_angle_aux_scale: float = 5.5,
-    uncha_global_local_angle_aux_aperture_scale: float = 1.0,
-    uncha_radius_order_weight: float = 0.0,
-    uncha_radius_order_margin: float = 0.0,
-    uncha_gramian_align_weight: float = 0.0,
-    product_metric: str = "l1",
-) -> nn.Module:
-    """Instantiate the training objective selected by ``objective``.
-    ``"hycoclip"`` builds a ``HyCoCLIPObjective`` from the four shared settings
-    (``entail_weight``, both aperture scales and ``product_metric``).
-    ``"uncha"`` builds a ``UNCHAObjective`` from those same settings plus every
-    ``uncha_*`` argument, each forwarded under its name with the ``uncha_``
-    prefix removed. Any other ``objective`` raises ``ValueError``.
-    """
-    if objective not in ("hycoclip", "uncha"):
-        raise ValueError(f"Unsupported objective {objective!r}; expected 'hycoclip' or 'uncha'")
-    # Keyword arguments accepted by both objective constructors.
-    shared = {
-        "entail_weight": entail_weight,
-        "inter_aperture_scale": inter_aperture_scale,
-        "intra_aperture_scale": intra_aperture_scale,
-        "product_metric": product_metric,
-    }
-    if objective == "hycoclip":
-        return HyCoCLIPObjective(**shared)
-    return UNCHAObjective(
-        **shared,
-        piecewise_factor=uncha_piecewise_factor,
-        calibration_alpha=uncha_calibration_alpha,
-        stop_grad_calibration=uncha_stop_grad_calibration,
-        entailment_geometry=uncha_entailment_geometry,
-        aggregate_weight=uncha_aggregate_weight,
-        entailment_loss=uncha_entailment_loss,
-        argent_beta=uncha_argent_beta,
-        argent_norm_weight=uncha_argent_norm_weight,
-        argent_aux_weight=uncha_argent_aux_weight,
-        argent_aggregation=uncha_argent_aggregation,
-        part_weight_power=uncha_part_weight_power,
-        contrastive_loss=uncha_contrastive_loss,
-        sigmoid_negative_weight=uncha_sigmoid_negative_weight,
-        part_quality_mode=uncha_part_quality_mode,
-        part_quality_topk=uncha_part_quality_topk,
-        part_quality_temperature=uncha_part_quality_temperature,
-        contrastive_global_weight=uncha_contrastive_global_weight,
-        contrastive_local_weight=uncha_contrastive_local_weight,
-        contrastive_global_local_weight=uncha_contrastive_global_local_weight,
-        beta_cal_beta=uncha_beta_cal_beta,
-        beta_cal_variant=uncha_beta_cal_variant,
-        beta_cal_weight=uncha_beta_cal_weight,
-        himo_component_weight=uncha_himo_component_weight,
-        global_local_mode=uncha_global_local_mode,
-        global_local_metric=uncha_global_local_metric,
-        global_local_angle_aux_weight=uncha_global_local_angle_aux_weight,
-        global_local_angle_aux_mode=uncha_global_local_angle_aux_mode,
-        global_local_angle_aux_scale=uncha_global_local_angle_aux_scale,
-        global_local_angle_aux_aperture_scale=uncha_global_local_angle_aux_aperture_scale,
-        radius_order_weight=uncha_radius_order_weight,
-        radius_order_margin=uncha_radius_order_margin,
-        gramian_align_weight=uncha_gramian_align_weight,
-    )
-def _part_weights(part_owner: Tensor, batch_size: int, power: float) -> Tensor | None:
-    """Return mean-normalised per-part weights that shrink with the owner's part count.
-    Each part receives ``count(owner) ** -power`` (counts clamped to at least 1) and the
-    result is divided by its mean. ``None`` is returned when ``power`` is not positive
-    or when there are no parts.
-    """
-    if part_owner.numel() == 0 or power <= 0.0:
-        return None
-    per_owner = torch.bincount(part_owner, minlength=batch_size)
-    per_owner = per_owner.to(dtype=torch.float32, device=part_owner.device)
-    raw = per_owner[part_owner].clamp_min(1.0).pow(-power)
-    # eps floor keeps the normalisation finite if the mean underflows to zero.
-    floor = torch.finfo(raw.dtype).eps
-    return raw / raw.mean().clamp_min(floor)
-def _combine_part_weights(count_weights: Tensor | None, quality_weights: Tensor | None) -> Tensor | None:
-    """Merge count-based and quality-based part weights.
-    When either input is ``None`` the other one is returned untouched (possibly ``None``).
-    Otherwise the element-wise product is returned, rescaled by its mean.
-    """
-    if count_weights is None or quality_weights is None:
-        return quality_weights if count_weights is None else count_weights
-    product = count_weights * quality_weights
-    floor = torch.finfo(product.dtype).eps
-    return product / product.mean().clamp_min(floor)

hyper3_clip/models/tren.py DELETED Viewed

@@ -1,255 +0,0 @@
-from __future__ import annotations
-import math
-import torch
-import torch.nn.functional as F
-from torch import Tensor, nn
-class FourierPositionEncoding2D(nn.Module):
-    """Random Fourier feature encoding of 2-D coordinates.
-    A fixed Gaussian projection, drawn from a private generator seeded with 42 and
-    stored as the ``gaussian_matrix`` buffer, maps each coordinate pair to
-    ``dim // 2`` phases; the output concatenates their sines and cosines.
-    """
-    def __init__(self, dim: int, scale: float = 1.0) -> None:
-        super().__init__()
-        dim_is_valid = dim > 0 and dim % 2 == 0
-        if not dim_is_valid:
-            raise ValueError("FourierPositionEncoding2D dim must be a positive even integer")
-        if scale <= 0.0:
-            raise ValueError("FourierPositionEncoding2D scale must be positive")
-        # A dedicated generator keeps the projection reproducible without touching the global RNG state.
-        rng = torch.Generator()
-        rng.manual_seed(42)
-        frequencies = torch.randn((2, dim // 2), generator=rng)
-        self.register_buffer("gaussian_matrix", scale * frequencies)
-    def forward(self, coords: Tensor) -> Tensor:
-        """Encode ``coords`` (last axis of size 2) into ``dim`` sine/cosine features."""
-        # Coordinates are shifted from [0, 1] to [-1, 1] before projection.
-        centered = 2.0 * coords.float() - 1.0
-        phase = 2.0 * math.pi * (centered @ self.gaussian_matrix)
-        return torch.cat([torch.sin(phase), torch.cos(phase)], dim=-1)
-class _MLPBlock(nn.Module):
-    """Feed-forward block: Linear -> GELU -> Dropout -> Linear, mapping ``dim`` back to ``dim``."""
-    def __init__(self, dim: int, hidden_dim: int, dropout: float) -> None:
-        super().__init__()
-        # Layers are created in application order so parameter initialisation draws are unchanged.
-        layers = [
-            nn.Linear(dim, hidden_dim),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(hidden_dim, dim),
-        ]
-        self.net = nn.Sequential(*layers)
-    def forward(self, x: Tensor) -> Tensor:
-        """Apply the feed-forward stack to ``x``."""
-        hidden = self.net(x)
-        return hidden
-class _AttentionLayer(nn.Module):
-    """Multi-head scaled dot-product attention with LayerNorm applied per head to queries and keys.
-    The value projection and the output projection can each be replaced by
-    ``nn.Identity``. ``forward`` returns both the attended output and the
-    attention weights.
-    """
-    def __init__(
-        self,
-        q_dim: int,
-        kv_dim: int,
-        hidden_dim: int,
-        *,
-        num_heads: int,
-        dropout: float,
-        use_bias: bool = False,
-        use_v_proj: bool = True,
-        use_out_proj: bool = True,
-    ) -> None:
-        """Build the projections.
-        Raises ``ValueError`` when ``hidden_dim`` is not divisible by ``num_heads``,
-        or when the value projection is disabled while ``kv_dim != hidden_dim``.
-        """
-        super().__init__()
-        if hidden_dim % num_heads != 0:
-            raise ValueError("hidden_dim must be divisible by num_heads")
-        # Without a value projection the raw values are split into heads directly,
-        # so they must already be hidden_dim wide.
-        if not use_v_proj and kv_dim != hidden_dim:
-            raise ValueError("kv_dim must equal hidden_dim when value projection is disabled")
-        self.hidden_dim = hidden_dim
-        self.num_heads = num_heads
-        self.head_dim = hidden_dim // num_heads
-        self.q_proj = nn.Linear(q_dim, hidden_dim, bias=use_bias)
-        self.k_proj = nn.Linear(kv_dim, hidden_dim, bias=use_bias)
-        self.v_proj = nn.Linear(kv_dim, hidden_dim, bias=use_bias) if use_v_proj else nn.Identity()
-        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=use_bias) if use_out_proj else nn.Identity()
-        # The norms act on the last axis of the per-head tensors (size head_dim).
-        self.q_norm = nn.LayerNorm(self.head_dim)
-        self.k_norm = nn.LayerNorm(self.head_dim)
-        self.dropout = nn.Dropout(dropout)
-        self.scale = self.head_dim**-0.5
-        # Every Linear that exists is re-initialised with Kaiming-normal weights (fan_in, linear gain);
-        # Identity stand-ins are skipped by the isinstance checks.
-        nn.init.kaiming_normal_(self.q_proj.weight, mode="fan_in", nonlinearity="linear")
-        nn.init.kaiming_normal_(self.k_proj.weight, mode="fan_in", nonlinearity="linear")
-        if isinstance(self.v_proj, nn.Linear):
-            nn.init.kaiming_normal_(self.v_proj.weight, mode="fan_in", nonlinearity="linear")
-        if isinstance(self.out_proj, nn.Linear):
-            nn.init.kaiming_normal_(self.out_proj.weight, mode="fan_in", nonlinearity="linear")
-    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
-        """Attend from ``q`` over ``k``/``v``.
-        ``q`` and ``k`` are unpacked as 3-D ``(batch, length, dim)`` tensors, and ``v``
-        is reshaped with the key length, so it must have as many positions as ``k``.
-        Returns ``(output, attn_weights)``: ``output`` is ``(batch, q_len, hidden_dim)``
-        and ``attn_weights`` is ``(batch, num_heads, q_len, kv_len)`` with dropout
-        already applied.
-        """
-        batch_size, q_len, _ = q.shape
-        _, kv_len, _ = k.shape
-        # Project, then split the feature axis into heads: (batch, heads, length, head_dim).
-        query = self.q_proj(q).view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key = self.k_proj(k).view(batch_size, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
-        value = self.v_proj(v).view(batch_size, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
-        query = self.q_norm(query)
-        key = self.k_norm(key)
-        attn_scores = torch.matmul(query, key.transpose(-2, -1)) * self.scale
-        attn_weights = self.dropout(F.softmax(attn_scores, dim=-1))
-        out = torch.matmul(attn_weights, value)
-        # Merge the heads back into a single feature axis.
-        out = out.transpose(1, 2).contiguous().view(batch_size, q_len, self.hidden_dim)
-        return self.out_proj(out), attn_weights
-class _CrossAttentionBlock(nn.Module):
-    """Cross-attention followed by an MLP, each with a residual connection, then a final LayerNorm.
-    Only the query is normalised before attention; the context is passed to the
-    attention layer as given and serves as both keys and values.
-    """
-    def __init__(self, dim: int, *, num_heads: int, dropout: float) -> None:
-        super().__init__()
-        self.query_norm = nn.LayerNorm(dim)
-        # Query, key/value and hidden widths are all ``dim``.
-        self.cross_attn = _AttentionLayer(dim, dim, dim, num_heads=num_heads, dropout=dropout)
-        self.dropout = nn.Dropout(dropout)
-        self.mlp_norm = nn.LayerNorm(dim)
-        # The MLP widens to 2 * dim internally.
-        self.mlp = _MLPBlock(dim, 2 * dim, dropout)
-        self.out_norm = nn.LayerNorm(dim)
-    def forward(self, query: Tensor, context: Tensor) -> Tensor:
-        """Update ``query`` by attending over ``context``; the attention weights are discarded."""
-        x, _ = self.cross_attn(self.query_norm(query), context, context)
-        # Residual around attention uses the un-normalised query.
-        x = query + self.dropout(x)
-        return self.out_norm(x + self.mlp(self.mlp_norm(x)))
-class TRENRegionEncoder(nn.Module):
-    """T-REN-style point-prompted region token encoder.
-    The module follows the public T-REN architecture: learned k-per-prompt
-    query tokens, Fourier 2D prompt/patch position encodings, alternating
-    cross-attention and per-prompt self-attention, then final single-head
-    attention that pools unprojected patch tokens into region tokens.
-    """
-    def __init__(
-        self,
-        vision_dim: int,
-        text_dim: int,
-        *,
-        hidden_dim: int | None = None,
-        num_region_tokens: int = 3,
-        num_decoder_layers: int = 2,
-        num_attention_heads: int = 8,
-        prompt_grid_size: int = 7,
-        dropout: float = 0.1,
-    ) -> None:
-        """Validate the configuration and build the decoder stack.
-        Raises ``ValueError`` when a count is not positive, when ``hidden_dim``
-        differs from ``vision_dim``, is odd, or is not divisible by
-        ``num_attention_heads``.
-        """
-        super().__init__()
-        if num_region_tokens <= 0:
-            raise ValueError("num_region_tokens must be positive")
-        if num_decoder_layers <= 0:
-            raise ValueError("num_decoder_layers must be positive")
-        if prompt_grid_size <= 0:
-            raise ValueError("prompt_grid_size must be positive")
-        # ``hidden_dim or vision_dim``: None (or 0) falls back to vision_dim.
-        hidden_dim = int(hidden_dim or vision_dim)
-        if hidden_dim != vision_dim:
-            raise ValueError("TRENRegionEncoder currently requires hidden_dim == vision_dim")
-        if hidden_dim % 2 != 0:
-            raise ValueError("TRENRegionEncoder hidden_dim must be even for Fourier features")
-        if hidden_dim % num_attention_heads != 0:
-            raise ValueError("TRENRegionEncoder hidden_dim must be divisible by num_attention_heads")
-        self.vision_dim = vision_dim
-        self.text_dim = text_dim
-        self.hidden_dim = hidden_dim
-        self.num_region_tokens = num_region_tokens
-        self.prompt_grid_size = prompt_grid_size
-        # One encoder is shared by patch positions and prompt positions.
-        self.position_encoder = FourierPositionEncoding2D(hidden_dim)
-        # Learned query table: one row per region token, reused for every prompt.
-        self.region_token_embeddings = nn.Embedding(num_region_tokens, hidden_dim)
-        nn.init.normal_(self.region_token_embeddings.weight, std=0.02)
-        # Each decoder layer pairs a cross-attention block (queries -> patches) ...
-        self.region_attention_layers = nn.ModuleList(
-            [_CrossAttentionBlock(hidden_dim, num_heads=num_attention_heads, dropout=dropout) for _ in range(num_decoder_layers)]
-        )
-        self.region_attention_norms = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_decoder_layers)])
-        # ... with a self-attention layer over the region tokens of a single prompt.
-        self.prompt_attention_layers = nn.ModuleList(
-            [
-                _AttentionLayer(
-                    hidden_dim,
-                    hidden_dim,
-                    hidden_dim,
-                    num_heads=num_attention_heads,
-                    dropout=dropout,
-                )
-                for _ in range(num_decoder_layers)
-            ]
-        )
-        self.prompt_attention_norms = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_decoder_layers)])
-        # Final pooling head: single head, no dropout, values and output left unprojected.
-        self.token_prediction_head = _AttentionLayer(
-            hidden_dim,
-            hidden_dim,
-            hidden_dim,
-            num_heads=1,
-            dropout=0.0,
-            use_v_proj=False,
-            use_out_proj=False,
-        )
-        # Maps pooled region tokens from hidden_dim to text_dim.
-        self.text_alignment_block = nn.Sequential(
-            nn.Linear(hidden_dim, 2 * hidden_dim),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(2 * hidden_dim, text_dim),
-        )
-    def forward(self, image_tokens: Tensor) -> dict[str, Tensor]:
-        """Pool patch tokens into region tokens for a fixed grid of point prompts.
-        ``image_tokens`` must be ``[batch, tokens, dim]`` with a square number of
-        tokens, optionally preceded by one extra token that is dropped.
-        Returns a dict with:
-        ``visual_tokens``  (batch, prompts, region_tokens, hidden_dim);
-        ``text_aligned_tokens``  (batch, prompts, region_tokens, text_dim);
-        ``region_masks``  (batch, prompts, region_tokens, grid, grid), attention maps rescaled so each map peaks at 1;
-        ``prompt_coords``  (prompts, 2) prompt centres as (x, y).
-        """
-        patch_tokens, patch_grid = _patch_tokens_and_grid(image_tokens)
-        batch_size, patch_count, _ = patch_tokens.shape
-        patch_coords = _grid_coords(patch_grid, patch_grid, patch_tokens.device)
-        prompt_coords = _grid_coords(self.prompt_grid_size, self.prompt_grid_size, patch_tokens.device)
-        prompt_count = prompt_coords.size(0)
-        feature_pos = self.position_encoder(patch_coords).to(dtype=patch_tokens.dtype)
-        prompt_pos = self.position_encoder(prompt_coords).to(dtype=patch_tokens.dtype)
-        # Keys carry positional information; the final pooling step uses the raw patch tokens as values.
-        kv = patch_tokens + feature_pos.unsqueeze(0)
-        prompt_pos = prompt_pos.view(1, prompt_count, 1, self.hidden_dim)
-        # Broadcast the learned region-token table to every (image, prompt) pair:
-        # (batch, prompts, region_tokens, hidden_dim).
-        q = self.region_token_embeddings.weight.to(dtype=patch_tokens.dtype)
-        q = q.view(1, 1, self.num_region_tokens, self.hidden_dim).expand(
-            batch_size,
-            prompt_count,
-            self.num_region_tokens,
-            self.hidden_dim,
-        )
-        for region_layer, region_norm, prompt_layer, prompt_norm in zip(
-            self.region_attention_layers,
-            self.region_attention_norms,
-            self.prompt_attention_layers,
-            self.prompt_attention_norms,
-            strict=True,
-        ):
-            # The prompt position is re-added at the start of every decoder layer.
-            q = q + prompt_pos
-            # Cross-attention: all prompt/region queries of an image attend to its patch tokens.
-            q = q.reshape(batch_size, prompt_count * self.num_region_tokens, self.hidden_dim)
-            q = region_layer(q, kv)
-            q = q.reshape(batch_size, prompt_count, self.num_region_tokens, self.hidden_dim)
-            q = region_norm(q)
-            # Self-attention among the region tokens of one prompt (prompts folded into the batch axis).
-            q = q.reshape(batch_size * prompt_count, self.num_region_tokens, self.hidden_dim)
-            q, _ = prompt_layer(q, q, q)
-            q = prompt_norm(q)
-            q = q.reshape(batch_size, prompt_count, self.num_region_tokens, self.hidden_dim)
-        flat_q = q.reshape(batch_size, prompt_count * self.num_region_tokens, self.hidden_dim)
-        visual_tokens, attn_weights = self.token_prediction_head(flat_q, kv, patch_tokens)
-        visual_tokens = visual_tokens.reshape(batch_size, prompt_count, self.num_region_tokens, self.hidden_dim)
-        # The pooling head has a single head, so squeeze(1) removes the head axis.
-        attn_weights = attn_weights.squeeze(1).reshape(batch_size, prompt_count, self.num_region_tokens, patch_count)
-        # Rescale each attention map by its maximum; the eps floor avoids dividing by zero.
-        region_masks = attn_weights / attn_weights.amax(dim=-1, keepdim=True).clamp_min(torch.finfo(attn_weights.dtype).eps)
-        region_masks = region_masks.reshape(batch_size, prompt_count, self.num_region_tokens, patch_grid, patch_grid)
-        text_aligned_tokens = self.text_alignment_block(visual_tokens)
-        return {
-            "visual_tokens": visual_tokens,
-            "text_aligned_tokens": text_aligned_tokens,
-            "region_masks": region_masks,
-            "prompt_coords": prompt_coords,
-        }
-def _patch_tokens_and_grid(tokens: Tensor) -> tuple[Tensor, int]:
-    """Return the patch tokens and the side length of their square grid.
-    If the token count is a perfect square the tokens are returned unchanged.
-    If the count minus one is a perfect square, the first token is dropped
-    (presumably a class token - confirm against the vision backbone).
-    Raises ``ValueError`` when ``tokens`` is not 3-D, when the sequence is
-    empty, or when neither count is a perfect square.
-    """
-    if tokens.ndim != 3:
-        raise ValueError("TRENRegionEncoder expects image tokens with shape [batch, tokens, dim]")
-    token_count = tokens.size(1)
-    # Zero tokens is formally a 0 x 0 square, but a 0-wide grid makes the
-    # coordinate builder divide by zero, so it is rejected here instead.
-    if token_count > 0:
-        grid = math.isqrt(token_count)
-        if grid * grid == token_count:
-            return tokens, grid
-        grid = math.isqrt(token_count - 1)
-        if grid * grid == token_count - 1:
-            return tokens[:, 1:, :], grid
-    raise ValueError(f"Cannot infer a square patch grid from {token_count} image tokens")
-def _grid_coords(height: int, width: int, device: torch.device) -> Tensor:
-    """Return cell-centre coordinates of a ``height`` x ``width`` grid over the unit square.
-    The result has shape ``(height * width, 2)``, is ordered row by row, and
-    stores each point as ``(x, y)``.
-    """
-    half_row = 0.5 / height
-    half_col = 0.5 / width
-    rows = torch.linspace(half_row, 1.0 - half_row, height, device=device)
-    cols = torch.linspace(half_col, 1.0 - half_col, width, device=device)
-    grid_y, grid_x = torch.meshgrid(rows, cols, indexing="ij")
-    return torch.stack((grid_x, grid_y), dim=-1).reshape(-1, 2)

hyper3_clip/training/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-__all__: list[str] = []

hyper3_clip/training/distributed.py DELETED Viewed

@@ -1,149 +0,0 @@
-from __future__ import annotations
-from collections.abc import Sequence
-import os
-import torch
-import torch.distributed as dist
-from torch.distributed.nn import all_gather as differentiable_all_gather
-from torch import Tensor
-def init_distributed() -> None:
-    """Initialise the default process group when launcher variables are present.
-    Does nothing unless both ``RANK`` and ``WORLD_SIZE`` are set in the
-    environment and no process group exists yet. With CUDA available the
-    ``nccl`` backend is used and the current device is set from the local
-    rank; otherwise ``gloo`` is used.
-    """
-    launched = "RANK" in os.environ and "WORLD_SIZE" in os.environ
-    if not launched or dist.is_initialized():
-        return
-    use_cuda = torch.cuda.is_available()
-    if use_cuda:
-        torch.cuda.set_device(get_local_rank())
-    dist.init_process_group(backend="nccl" if use_cuda else "gloo")
-def is_distributed() -> bool:
-    """Report whether ``torch.distributed`` is available and a process group is initialised."""
-    if not dist.is_available():
-        return False
-    return dist.is_initialized()
-def barrier() -> None:
-    """Block until every rank arrives; does nothing outside a distributed run."""
-    if not is_distributed():
-        return
-    dist.barrier()
-def destroy_distributed() -> None:
-    """Tear down the process group if one is initialised."""
-    if not is_distributed():
-        return
-    dist.destroy_process_group()
-def get_rank() -> int:
-    """Return this process's rank, or ``0`` outside a distributed run."""
-    if is_distributed():
-        return dist.get_rank()
-    return 0
-def get_world_size() -> int:
-    """Return the number of ranks, or ``1`` outside a distributed run."""
-    if is_distributed():
-        return dist.get_world_size()
-    return 1
-def get_local_rank() -> int:
-    """Read the ``LOCAL_RANK`` environment variable as an int (``0`` when unset)."""
-    raw = os.environ.get("LOCAL_RANK", "0")
-    return int(raw)
-def is_main_process() -> bool:
-    """Return ``True`` on rank 0."""
-    rank = get_rank()
-    return rank == 0
-def gather_with_grad(tensor: Tensor) -> Tensor:
-    """Concatenate ``tensor`` from every rank along dim 0 using the differentiable all-gather.
-    With a world size of 1 the input is returned unchanged.
-    """
-    if get_world_size() == 1:
-        return tensor
-    shards = differentiable_all_gather(tensor.contiguous())
-    return torch.cat(list(shards), dim=0)
-def gather_variable_with_grad(tensor: Tensor) -> tuple[Tensor, Tensor]:
-    """Differentiably gather tensors whose first dimension differs between ranks.
-    Returns the concatenated rows with padding removed, together with the
-    per-rank row counts. With a world size of 1 the input itself is returned.
-    """
-    counts, padded_len, valid_rows = _variable_gather_metadata(tensor)
-    if get_world_size() == 1:
-        result = tensor
-    else:
-        result = _gather_variable_from_metadata(tensor, padded_len, valid_rows)
-    return result, counts
-def gather_variable_many_with_grad(tensors: Sequence[Tensor]) -> tuple[list[Tensor], Tensor]:
-    """Gather same-length variable tensors while sharing count metadata.
-    Tensors with matching dtype/rank/trailing shape are packed along the last
-    dimension so a single differentiable all-gather can serve several feature
-    tensors with the same variable first dimension.
-    Returns the gathered tensors in input order plus the per-rank row counts.
-    With a world size of 1 the inputs are returned as a list without any
-    communication.
-    Raises ``ValueError`` for an empty sequence, a 0-dim tensor, tensors on
-    different devices, or tensors whose first dimensions differ.
-    """
-    if not tensors:
-        raise ValueError("gather_variable_many_with_grad requires at least one tensor")
-    first = tensors[0]
-    for tensor in tensors:
-        # Must precede any ``shape[0]`` access: indexing the shape of a 0-dim
-        # tensor raises IndexError, which would hide this message.
-        if tensor.dim() == 0:
-            raise ValueError("variable gather tensors must have at least one dimension")
-        if tensor.device != first.device:
-            raise ValueError("all tensors must be on the same device")
-        if tensor.shape[0] != first.shape[0]:
-            raise ValueError("all tensors must have the same first dimension")
-    # The first dimension is shared, so one count exchange serves every tensor.
-    count_tensor, max_count, keep = _variable_gather_metadata(first)
-    if get_world_size() == 1:
-        return list(tensors), count_tensor
-    gathered: list[Tensor | None] = [None] * len(tensors)
-    # Tensors that agree on dtype, rank and middle dimensions can be concatenated on the last axis.
-    groups: dict[tuple[torch.dtype, torch.Size, int], list[int]] = {}
-    for index, tensor in enumerate(tensors):
-        key = (tensor.dtype, tensor.shape[1:-1], tensor.dim()) if tensor.dim() > 1 else (tensor.dtype, torch.Size(), 1)
-        groups.setdefault(key, []).append(index)
-    for indices in groups.values():
-        group_tensors = [tensors[index] for index in indices]
-        # Singletons and 1-D tensors (no feature axis to pack along) are gathered one by one.
-        if len(group_tensors) == 1 or group_tensors[0].dim() == 1:
-            for index, tensor in zip(indices, group_tensors, strict=True):
-                gathered[index] = _gather_variable_from_metadata(tensor, max_count, keep)
-            continue
-        widths = [tensor.shape[-1] for tensor in group_tensors]
-        packed = torch.cat(group_tensors, dim=-1)
-        gathered_packed = _gather_variable_from_metadata(packed, max_count, keep)
-        # Undo the packing, restoring each tensor's own last-dimension width.
-        for index, chunk in zip(indices, gathered_packed.split(widths, dim=-1), strict=True):
-            gathered[index] = chunk
-    if any(tensor is None for tensor in gathered):
-        raise RuntimeError("internal error while gathering variable tensors")
-    return [tensor for tensor in gathered if tensor is not None], count_tensor
-def gather_variable_no_grad(tensor: Tensor) -> tuple[Tensor, Tensor]:
-    """Gather tensors with rank-dependent first dimension using the plain, non-differentiable all-gather.
-    Returns the concatenated rows with padding removed, together with the
-    per-rank row counts. With a world size of 1 the input itself is returned.
-    """
-    counts, padded_len, valid_rows = _variable_gather_metadata(tensor)
-    world_size = get_world_size()
-    if world_size == 1:
-        return tensor, counts
-    # Zero-pad to the longest rank so every rank contributes an equally shaped buffer.
-    buffer = tensor.new_zeros((padded_len, *tensor.shape[1:]))
-    buffer[: tensor.shape[0]] = tensor
-    received = [torch.zeros_like(buffer) for _ in range(world_size)]
-    dist.all_gather(received, buffer.contiguous())
-    stacked = torch.cat(received, dim=0)
-    return stacked[valid_rows], counts
-def _variable_gather_metadata(tensor: Tensor) -> tuple[Tensor, int, Tensor]:
-    """Exchange per-rank row counts and build the mask that strips padding after a padded gather.
-    Returns ``(counts, max_count, keep)``: ``counts`` holds one row count per
-    rank, ``max_count`` is the largest of them, and ``keep`` is a boolean mask
-    of length ``world_size * max_count`` that is true for real rows.
-    With a world size of 1 no communication happens and ``keep`` is all true.
-    """
-    ranks = get_world_size()
-    rows = tensor.shape[0]
-    own_count = torch.tensor([rows], device=tensor.device, dtype=torch.long)
-    if ranks == 1:
-        everything = torch.ones(rows, device=tensor.device, dtype=torch.bool)
-        return own_count, rows, everything
-    per_rank = [torch.zeros_like(own_count) for _ in range(ranks)]
-    dist.all_gather(per_rank, own_count)
-    counts = torch.cat(per_rank)
-    widest = int(counts.max().item())
-    # Row ``r * widest + i`` of the padded gather is real data exactly when ``i < counts[r]``.
-    slot = torch.arange(widest, device=tensor.device)
-    keep = (slot.unsqueeze(0) < counts.unsqueeze(1)).reshape(-1)
-    return counts, widest, keep
-def _gather_variable_from_metadata(tensor: Tensor, max_count: int, keep: Tensor) -> Tensor:
-    """Zero-pad ``tensor`` to ``max_count`` rows, all-gather it differentiably, and drop the padding rows via ``keep``."""
-    trailing = tuple(tensor.shape[1:])
-    buffer = tensor.new_zeros((max_count, *trailing))
-    buffer[: tensor.shape[0]] = tensor
-    shards = differentiable_all_gather(buffer.contiguous())
-    stacked = torch.cat(list(shards), dim=0)
-    return stacked[keep]
-def local_target_indices(batch_size: int, device: torch.device) -> Tensor:
-    """Return ``arange(batch_size)`` shifted by ``batch_size * rank`` on ``device``."""
-    offset = batch_size * get_rank()
-    return torch.arange(batch_size, device=device) + offset

hyper3_clip_provider.py DELETED Viewed

@@ -1,115 +0,0 @@
-"""HyperView embedding provider for the Hyper3-CLIP v0.5 HF checkpoint."""
-from __future__ import annotations
-import os
-from pathlib import Path
-from typing import Any
-import numpy as np
-import torch
-import yaml
-from huggingface_hub import snapshot_download
-from lancedb.embeddings import EmbeddingFunction
-from pydantic import PrivateAttr
-from safetensors.torch import load_file
-class Hyper3ClipEmbeddings(EmbeddingFunction):
-    """Image embeddings from Hyper3-CLIP v0.5 in Lorentz/hyperboloid space."""
-    # Hugging Face repo id passed to snapshot_download.
-    name: str = "hyper3labs/hyper3-clip-v0.5"
-    # Number of images encoded per forward pass.
-    batch_size: int = 8
-    # Device string handed to torch.device for the model and input batches.
-    device: str = "cpu"
-    # Lazily populated by _ensure_model; None until the first use.
-    _model: Any = PrivateAttr(default=None)
-    _transform: Any = PrivateAttr(default=None)
-    @property
-    def geometry(self) -> str:
-        """Name of the embedding geometry; always ``"hyperboloid"``."""
-        return "hyperboloid"
-    @property
-    def curvature(self) -> float:
-        """First element of the model's ``_kappa()`` value as a float; loads the model if needed."""
-        self._ensure_model()
-        return float(self._model._kappa().detach().cpu().reshape(-1)[0].item())
-    def ndims(self) -> int:
-        """Embedding width reported to the caller (hard-coded)."""
-        # NOTE(review): presumably 512 space-like coordinates plus one time-like coordinate - confirm against the checkpoint config.
-        return 513
-    def _ensure_model(self) -> None:
-        """Download the checkpoint, build the model and the preprocessing pipeline once.
-        Later calls return immediately because ``_model`` is already set.
-        """
-        if self._model is not None:
-            return
-        # Imported here rather than at module import time.
-        from hyper3_clip import Hyper3CLIP
-        from torchvision import transforms
-        # HF_TOKEN is optional; None is passed through when it is unset.
-        token = os.environ.get("HF_TOKEN")
-        local_dir = snapshot_download(
-            self.name,
-            allow_patterns=["config.yaml", "model.safetensors"],
-            token=token,
-        )
-        root = Path(local_dir)
-        config = yaml.safe_load((root / "config.yaml").read_text(encoding="utf-8"))
-        model = Hyper3CLIP(**config["model"])
-        # Weights are read onto the CPU first, then the model is moved to the configured device.
-        state = load_file(root / "model.safetensors", device="cpu")
-        model.load_state_dict(state)
-        model.to(torch.device(self.device))
-        model.eval()
-        self._model = model
-        # Image size comes from config["data"]["image_size"], defaulting to 224.
-        image_size = int(config.get("data", {}).get("image_size", 224))
-        self._transform = transforms.Compose(
-            [
-                transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BICUBIC),
-                transforms.CenterCrop(image_size),
-                transforms.ToTensor(),
-                transforms.Normalize(
-                    mean=(0.485, 0.456, 0.406),
-                    std=(0.229, 0.224, 0.225),
-                ),
-            ]
-        )
-    def compute_source_embeddings(
-        self,
-        inputs: Any,
-        *args: Any,
-        **kwargs: Any,
-    ) -> list[np.ndarray | None]:
-        """Embed images given as ``Sample`` objects, file-path strings, or PIL images.
-        Every input is converted to RGB, encoded in batches of ``batch_size``
-        under ``torch.inference_mode``, and returned as one float32 array per
-        image. Raises ``TypeError`` for any other input type. Extra positional
-        and keyword arguments are accepted but unused.
-        """
-        from PIL import Image
-        from hyperview.core.sample import Sample
-        self._ensure_model()
-        device = torch.device(self.device)
-        images = []
-        for item in self.sanitize_input(inputs):
-            if isinstance(item, Sample):
-                with item.load_image() as img:
-                    images.append(img.convert("RGB"))
-            elif isinstance(item, str):
-                # A string is treated as a path and opened with PIL.
-                with Image.open(item) as img:
-                    images.append(img.convert("RGB"))
-            elif isinstance(item, Image.Image):
-                images.append(item.convert("RGB"))
-            else:
-                raise TypeError(f"Unsupported input type: {type(item)}")
-        outputs: list[np.ndarray | None] = []
-        with torch.inference_mode():
-            for start in range(0, len(images), self.batch_size):
-                batch = images[start:start + self.batch_size]
-                tensor = torch.stack([self._transform(image) for image in batch]).to(device)
-                encoded = self._model.encode_image(tensor).detach().cpu().numpy().astype(np.float32)
-                # Extending with a 2-D array appends one row (one embedding) per image.
-                outputs.extend(encoded)
-        return outputs
-    def compute_query_embeddings(
-        self,
-        query: Any,
-        *args: Any,
-        **kwargs: Any,
-    ) -> list[np.ndarray | None]:
-        """Embed a single query by wrapping it in a list and reusing the source-embedding path."""
-        return self.compute_source_embeddings([query], *args, **kwargs)