Spaces:

hyper3labs
/

HyperView-VisA-Manufacturing

Sleeping

App Files Files Community

mnm-matin committed 28 days ago

Commit

90cc3c3

verified ·

1 Parent(s): 4e18ed4

Update manufacturing Space to latest HyperView public API

Browse files

Files changed (34) hide show

.hyperview/extensions/manufacturing-readout/panel.js +86 -66
Dockerfile +3 -1
README.md +6 -6
demo.py +28 -27
hyper3_clip/__init__.py +0 -3
hyper3_clip/data/__init__.py +0 -14
hyper3_clip/data/collators.py +0 -209
hyper3_clip/data/grit_cleaning.py +0 -554
hyper3_clip/data/grit_webdataset.py +0 -133
hyper3_clip/data/manifest_dataset.py +0 -120
hyper3_clip/data/mixed_dataset.py +0 -68
hyper3_clip/data/transforms.py +0 -125
hyper3_clip/data/types.py +0 -48
hyper3_clip/evaluation/__init__.py +0 -20
hyper3_clip/evaluation/classification.py +0 -105
hyper3_clip/evaluation/hierarchical.py +0 -118
hyper3_clip/evaluation/pep.py +0 -462
hyper3_clip/evaluation/retrieval.py +0 -215
hyper3_clip/models/__init__.py +0 -3
hyper3_clip/models/encoders.py +0 -173
hyper3_clip/models/experimental.py +0 -587
hyper3_clip/models/himo.py +0 -55
hyper3_clip/models/hyper3_clip.py +0 -958
hyper3_clip/models/lorentz.py +0 -265
hyper3_clip/models/losses.py +0 -1400
hyper3_clip/models/objectives.py +0 -580
hyper3_clip/models/tren.py +0 -255
hyper3_clip/training/__init__.py +0 -1
hyper3_clip/training/checkpointing.py +0 -91
hyper3_clip/training/distributed.py +0 -149
hyper3_clip/training/engine.py +0 -442
hyper3_clip/training/logging.py +0 -15
hyper3_clip/utils/io.py +0 -29
hyper3_clip_provider.py +0 -133

.hyperview/extensions/manufacturing-readout/panel.js CHANGED Viewed

@@ -39,7 +39,6 @@ function normalizeModels(value) {
       displayName: String(model.displayName || model.display_name || model.key || `Model ${index + 1}`),
       buttonLabel: String(model.buttonLabel || model.button_label || `${model.key || "Model"} query`),
       layoutKey: model.layoutKey || model.layout_key || null,
-      spaceKey: model.spaceKey || model.space_key || null,
     }))
     .filter((model) => model.layoutKey);
 }
@@ -178,6 +177,77 @@ function CompactEvidence({ item, models }) {
   );
 }
 function StepBlock({ number, title, children }) {
   return React.createElement(
     "div",
@@ -320,8 +390,7 @@ function choiceFromSimilarity(similarity, examples, models) {
   const sourceKey = source.includes(":") ? source.split(":").pop() : null;
   const model =
     models.find((candidate) => candidate.key === sourceKey) ||
-    models.find((candidate) => candidate.layoutKey === similarity.layout_key) ||
-    models.find((candidate) => candidate.spaceKey === similarity.space_key);
   if (!model) return null;
   const metric = advantageMetric(model.key, item.id);
   return {
@@ -332,18 +401,6 @@ function choiceFromSimilarity(similarity, examples, models) {
   };
 }
-async function sendJson(path, payload, method = "POST") {
-  const response = await fetch(path, {
-    method,
-    headers: { "Content-Type": "application/json" },
-    body: JSON.stringify(payload),
-  });
-  if (!response.ok) {
-    throw new Error(await response.text());
-  }
-  return response.json();
-}
 function buttonText(model) {
   if (model.key === "candidate") return "Hyper3";
   if (model.key === "clip") return "CLIP";
@@ -416,6 +473,7 @@ function WalkthroughCard({ item, models, onSelectQuery, loadingKey, activeModelK
       { style: { color: colors.mutedText, fontSize: 10, lineHeight: 1.3 } },
       "Wrong-line references send operators to the wrong golden sample.",
     ),
     React.createElement(CompactEvidence, { item, models }),
   );
 }
@@ -424,7 +482,6 @@ export default function ManufacturingPanel() {
   const props = usePanelProps() || {};
   const commands = usePanelCommands();
   const runtimeState = usePanelRuntimeState ? usePanelRuntimeState() : {};
-  const workspaceId = String(props.workspaceId || props.workspace_id || "manufacturing-visa-reference-clip-hyper3clip");
   const models = normalizeModels(props.models);
   const examples = Array.isArray(props.examples) ? props.examples : [];
   const primaryExample = examples.find((item) => item.id === "fryum") || examples[0] || null;
@@ -449,6 +506,7 @@ export default function ManufacturingPanel() {
       const item = examples.find((example) => example.queryId === sampleId);
       const metric = advantageMetric(model.key, item?.id);
       const nextChoice = {
         modelName: model.displayName,
         queryLabel: title(item?.queryLabel || "fryum"),
         metricLine: metric?.line || null,
@@ -458,63 +516,25 @@ export default function ManufacturingPanel() {
       setActiveChoice(nextChoice);
       setLoadingKey(choiceKey);
       try {
-        if (commands.setActiveLayout) {
-          await commands.setActiveLayout(model.layoutKey, { persist: "none" });
-        }
-        if (commands.showSimilar) {
-          await commands.showSimilar({
-            sampleId,
-            layoutKey: model.layoutKey,
-            spaceKey: model.spaceKey,
-            k: 10,
-            source: `manufacturing-demo:${model.key}`,
-            focus: "samples",
-            persist: "none",
-          });
-        }
-        await sendJson("/api/control/ui/state", {
-          workspace_id: workspaceId,
-          set_active_layout: true,
-          active_layout_key: model.layoutKey,
-          set_selection: true,
-          selected_ids: [sampleId],
-          set_similarity_query: true,
-          similarity_query: {
-            sample_id: sampleId,
-            layout_key: model.layoutKey,
-            space_key: model.spaceKey,
-            k: 10,
-            source: `manufacturing-demo:${model.key}`,
-          },
-        }, "PATCH");
         setActiveChoice(nextChoice);
         setActiveModelKey(model.key);
       } catch (error) {
-        try {
-          await sendJson("/api/control/ui/layout", {
-            workspace_id: workspaceId,
-            layout_key: model.layoutKey,
-          });
-          await sendJson("/api/control/ui/similarity", {
-            workspace_id: workspaceId,
-            sample_id: sampleId,
-            layout_key: model.layoutKey,
-            space_key: model.spaceKey,
-            k: 10,
-            source: `manufacturing-demo:${model.key}`,
-          });
-          setActiveChoice(nextChoice);
-          setActiveModelKey(model.key);
-          return;
-        } catch (fallbackError) {
-          const message = fallbackError instanceof Error ? fallbackError.message : String(fallbackError);
-          setPanelError(`Could not show neighbors: ${message}`);
-        }
       } finally {
         setLoadingKey(null);
       }
     },
-    [commands, examples, workspaceId],
   );
   return React.createElement(

       displayName: String(model.displayName || model.display_name || model.key || `Model ${index + 1}`),
       buttonLabel: String(model.buttonLabel || model.button_label || `${model.key || "Model"} query`),
       layoutKey: model.layoutKey || model.layout_key || null,
     }))
     .filter((model) => model.layoutKey);
 }
   );
 }
+function ActiveNeighbors({ item, modelKey }) {
+  if (!item || !modelKey) return null;
+  const summary = item.summaries?.[modelKey] || {};
+  const neighbors = Array.isArray(summary.neighbors) ? summary.neighbors.slice(0, 5) : [];
+  if (!neighbors.length) return null;
+  const cell = {
+    padding: "4px 3px",
+    borderBottom: `1px solid ${colors.border}`,
+    fontSize: 10,
+    color: colors.bodyText,
+  };
+  const head = { ...cell, color: colors.mutedText, fontSize: 9, textTransform: "uppercase" };
+  return React.createElement(
+    "div",
+    {
+      style: {
+        borderTop: `1px solid ${colors.border}`,
+        paddingTop: 7,
+        display: "flex",
+        flexDirection: "column",
+        gap: 4,
+      },
+    },
+    React.createElement(
+      "div",
+      { style: { color: colors.strongText, fontSize: 11.5, fontWeight: 900 } },
+      modelKey === "candidate" ? "Hyper3 Top Refs" : "CLIP Top Refs",
+    ),
+    React.createElement(
+      "table",
+      { style: { width: "100%", borderCollapse: "collapse" } },
+      React.createElement(
+        "thead",
+        null,
+        React.createElement(
+          "tr",
+          null,
+          React.createElement("th", { style: head, align: "left" }, "Rank"),
+          React.createElement("th", { style: head, align: "left" }, "SKU"),
+          React.createElement("th", { style: head, align: "right" }, "Signal"),
+        ),
+      ),
+      React.createElement(
+        "tbody",
+        null,
+        neighbors.map((neighbor) => {
+          const signal = neighbor.sameSkuNormal
+            ? "correct normal"
+            : neighbor.pipeFryumConfusion
+              ? "wrong line"
+              : neighbor.sameSku
+                ? "same SKU"
+                : "other";
+          const signalColor = neighbor.sameSkuNormal
+            ? colors.good
+            : neighbor.pipeFryumConfusion
+              ? colors.error
+              : colors.bodyText;
+          return React.createElement(
+            "tr",
+            { key: `${modelKey}-${neighbor.rank}-${neighbor.id}` },
+            React.createElement("td", { style: { ...cell, color: colors.strongText, fontWeight: 800 } }, `#${neighbor.rank}`),
+            React.createElement("td", { style: cell }, pretty(neighbor.sku)),
+            React.createElement("td", { style: { ...cell, color: signalColor, fontWeight: 800 }, align: "right" }, signal),
+          );
+        }),
+      ),
+    ),
+  );
+}
 function StepBlock({ number, title, children }) {
   return React.createElement(
     "div",
   const sourceKey = source.includes(":") ? source.split(":").pop() : null;
   const model =
     models.find((candidate) => candidate.key === sourceKey) ||
+    models.find((candidate) => candidate.layoutKey === similarity.layout_key);
   if (!model) return null;
   const metric = advantageMetric(model.key, item.id);
   return {
   };
 }
 function buttonText(model) {
   if (model.key === "candidate") return "Hyper3";
   if (model.key === "clip") return "CLIP";
       { style: { color: colors.mutedText, fontSize: 10, lineHeight: 1.3 } },
       "Wrong-line references send operators to the wrong golden sample.",
     ),
+    React.createElement(ActiveNeighbors, { item, modelKey: activeModelKey }),
     React.createElement(CompactEvidence, { item, models }),
   );
 }
   const props = usePanelProps() || {};
   const commands = usePanelCommands();
   const runtimeState = usePanelRuntimeState ? usePanelRuntimeState() : {};
   const models = normalizeModels(props.models);
   const examples = Array.isArray(props.examples) ? props.examples : [];
   const primaryExample = examples.find((item) => item.id === "fryum") || examples[0] || null;
       const item = examples.find((example) => example.queryId === sampleId);
       const metric = advantageMetric(model.key, item?.id);
       const nextChoice = {
+        modelKey: model.key,
         modelName: model.displayName,
         queryLabel: title(item?.queryLabel || "fryum"),
         metricLine: metric?.line || null,
       setActiveChoice(nextChoice);
       setLoadingKey(choiceKey);
       try {
+        await commands.setActiveLayout(model.layoutKey, { persist: true });
+        await commands.showSimilar({
+          sampleId,
+          layoutKey: model.layoutKey,
+          k: 10,
+          source: `manufacturing-demo:${model.key}`,
+          focus: "samples",
+          persist: true,
+        });
         setActiveChoice(nextChoice);
         setActiveModelKey(model.key);
       } catch (error) {
+        const message = error instanceof Error ? error.message : String(error);
+        setPanelError(`Could not show neighbors: ${message}`);
       } finally {
         setLoadingKey(null);
       }
     },
+    [commands, examples],
   );
   return React.createElement(

Dockerfile CHANGED Viewed

@@ -20,7 +20,8 @@ WORKDIR $HOME/app
 RUN pip install --upgrade pip
-ARG HYPERVIEW_VERSION=0.6.0
 # Install CPU-only PyTorch first so the Space does not pull the default CUDA bundle.
 RUN pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
@@ -33,6 +34,7 @@ import hyperview as hv
 print("hyperview", hv.__version__, inspect.signature(hv.launch))
 PY
 RUN pip install \
     "datasets>=4.5.0" \
     "Pillow>=12.0.0" \
     "timm>=1.0.0" \

 RUN pip install --upgrade pip
+ARG HYPERVIEW_VERSION=0.6.2
+ARG HYPER_MODELS_VERSION=0.3.0
 # Install CPU-only PyTorch first so the Space does not pull the default CUDA bundle.
 RUN pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
 print("hyperview", hv.__version__, inspect.signature(hv.launch))
 PY
 RUN pip install \
+    "hyper-models[ml]==${HYPER_MODELS_VERSION}" \
     "datasets>=4.5.0" \
     "Pillow>=12.0.0" \
     "timm>=1.0.0" \

README.md CHANGED Viewed

@@ -14,7 +14,7 @@ This Space builds a balanced subset of the VisA industrial visual anomaly
 dataset and opens HyperView with two side-by-side embedding spaces:
 - CLIP ViT-B/32 in a Euclidean 2D layout
-- Hyper3-CLIP `hyper3labs/hyper3-clip-v0.5` in a Poincare 2D layout
 The workflow is inspection reference retrieval: given a production-line
 inspection image, retrieve the right normal references for the same SKU or
@@ -59,8 +59,8 @@ VISA_SAMPLES_PER_CATEGORY=12 HYPERVIEW_PORT=6265 \
   uv run python hyperview-spaces/spaces/manufacturing-visa-reference-clip-hyper3clip/demo.py
 ```
-Hyper3-CLIP weights are loaded from the gated
-`hyper3labs/hyper3-clip-v0.5` model repository at runtime. The Space needs an
-`HF_TOKEN` secret with access to that model. If unavailable, the Space can start
-with a clearly labeled CLIP fallback unless `HYPERVIEW_ALLOW_CANDIDATE_FALLBACK=0`
-is set.

 dataset and opens HyperView with two side-by-side embedding spaces:
 - CLIP ViT-B/32 in a Euclidean 2D layout
+- Hyper3-CLIP `hyper3-clip-v0.5` from `hyper-models` in a Poincare 2D layout
 The workflow is inspection reference retrieval: given a production-line
 inspection image, retrieve the right normal references for the same SKU or
   uv run python hyperview-spaces/spaces/manufacturing-visa-reference-clip-hyper3clip/demo.py
 ```
+Hyper3-CLIP weights are loaded through the `hyper-models` catalog entry for the
+gated `hyper3labs/hyper3-clip-v0.5` model repository at runtime. The Space needs
+an `HF_TOKEN` secret with access to that model. If unavailable, the Space can
+start with a clearly labeled CLIP fallback unless
+`HYPERVIEW_ALLOW_CANDIDATE_FALLBACK=0` is set.

demo.py CHANGED Viewed

@@ -18,7 +18,6 @@ from PIL import Image, ImageOps
 import hyperview as hv
 SPACE_DIR = Path(__file__).resolve().parent
 SPACE_HOST = os.environ.get("HYPERVIEW_HOST", "127.0.0.1")
 SPACE_PORT = int(os.environ.get("HYPERVIEW_PORT", "6265"))
@@ -29,6 +28,11 @@ EXTENSION_DIR = SPACE_DIR / ".hyperview" / "extensions" / "manufacturing-readout
 SAMPLES_PER_CATEGORY = int(os.environ.get("VISA_SAMPLES_PER_CATEGORY", "4"))
 TRAIN_FRACTION = float(os.environ.get("VISA_TRAIN_FRACTION", "0.5"))
 IMAGE_MAX_SIZE = (640, 640)
 ALLOW_CANDIDATE_FALLBACK = os.environ.get("HYPERVIEW_ALLOW_CANDIDATE_FALLBACK", "1").lower() in {
     "1",
     "true",
@@ -90,8 +94,8 @@ MODEL_SPECS = [
         "key": "candidate",
         "display_name": os.environ.get("VISA_CANDIDATE_DISPLAY_NAME", "Hyper3-CLIP"),
         "button_label": os.environ.get("VISA_CANDIDATE_BUTTON_LABEL", "Show Hyper3 neighbors"),
-        "provider": os.environ.get("VISA_CANDIDATE_PROVIDER", "hyper3-clip"),
-        "model": os.environ.get("VISA_CANDIDATE_MODEL", "hyper3labs/hyper3-clip-v0.5"),
         "layout": os.environ.get("VISA_CANDIDATE_LAYOUT", "poincare:2d"),
         "geometry": os.environ.get("VISA_CANDIDATE_GEOMETRY", "poincare"),
         "layout_dimension": int(os.environ.get("VISA_CANDIDATE_LAYOUT_DIMENSION", "2")),
@@ -219,6 +223,7 @@ def add_visa_samples(dataset: hv.Dataset) -> None:
     media_dir = media_root()
     added = 0
     updated = 0
     for record in select_visa_records():
         sample_id = safe_sample_id(record["category"], record["split_name"], record["row_index"], record["defect_label"])
         destination = Path(record["local_path"]) if record.get("local_path") else media_dir / f"{sample_id}.jpg"
@@ -234,12 +239,17 @@ def add_visa_samples(dataset: hv.Dataset) -> None:
             "source_dataset": "BrachioLab/visa",
         }
         existed = sample_id in existing_ids
         dataset.add_image(str(destination), label=record["category"], metadata=metadata, sample_id=sample_id)
         if existed:
             updated += 1
         else:
             added += 1
             existing_ids.add(sample_id)
     print(f"Prepared VisA samples ({added} added, {updated} updated).", flush=True)
@@ -262,6 +272,7 @@ def ensure_layouts(dataset: hv.Dataset) -> dict[str, str]:
                 )
                 print(warning, flush=True)
                 RUNTIME_WARNINGS.append(warning)
                 spec.update(
                     {
                         "display_name": "Hyper3-CLIP unavailable (CLIP fallback)",
@@ -270,21 +281,22 @@ def ensure_layouts(dataset: hv.Dataset) -> dict[str, str]:
                         "layout_dimension": MODEL_SPECS[0]["layout_dimension"],
                         "panel_title": "Hyper3-CLIP unavailable - showing CLIP fallback",
                         "fallback": True,
-                        "space_key": MODEL_SPECS[0].get("space_key"),
                     }
                 )
-                layouts[spec["key"]] = layouts["clip"]
                 continue
             raise
-        spec["space_key"] = space_key
         print(f"Ensuring {spec['display_name']} layout...", flush=True)
-        layouts[spec["key"]] = dataset.compute_visualization(
             space_key=space_key,
             layout=spec["layout"],
             n_neighbors=20,
             min_dist=0.08,
             metric=spec["metric"],
         )
     return layouts
@@ -305,24 +317,19 @@ def model_panel_props(layouts: dict[str, str]) -> list[dict[str, Any]]:
                 "displayName": spec["display_name"],
                 "buttonLabel": spec["button_label"],
                 "layoutKey": layout_key,
-                "spaceKey": spec.get("space_key") or space_key_from_layout(layout_key),
             }
         )
     return props
-def space_key_from_layout(layout_key: str) -> str:
-    return layout_key.split("__euclidean_umap", 1)[0].split("__poincare_umap", 1)[0]
 def reference_summary(dataset: hv.Dataset, sample_id: str, model_key: str) -> dict[str, Any]:
     spec = next((item for item in MODEL_SPECS if item["key"] == model_key), None)
-    if spec is None or spec.get("space_key") is None:
         return {}
     query = dataset[sample_id]
     query_sku = query.metadata.get("sku")
     query_family = query.metadata.get("product_family")
-    neighbors = dataset.find_similar(sample_id, k=10, space_key=str(spec["space_key"]))
     sku_hits = sum(1 for sample, _distance in neighbors if sample.metadata.get("sku") == query_sku)
     family_hits = sum(1 for sample, _distance in neighbors if sample.metadata.get("product_family") == query_family)
     normal_refs = sum(1 for sample, _distance in neighbors if sample.metadata.get("workflow_role") == "normal_reference")
@@ -431,17 +438,6 @@ def category_strength_rows(dataset: hv.Dataset) -> list[dict[str, str]]:
     return sorted(rows, key=lambda row: float(row["delta"]), reverse=True)[:3]
-def register_hyper3_clip_provider() -> None:
-    from hyperview.runtime import ProviderRegistry
-    ProviderRegistry().register_python(
-        "hyper3-clip",
-        "hyper3_clip_provider:Hyper3ClipEmbeddings",
-        description="Hyper3-CLIP v0.5 image embeddings from hyper3labs/hyper3-clip-v0.5",
-        overwrite=True,
-    )
 def build_demo_view(dataset: hv.Dataset, layouts: dict[str, str]) -> hv.ui.View:
     scatter_panels = [
         hv.ui.Scatter(
@@ -460,14 +456,20 @@ def build_demo_view(dataset: hv.Dataset, layouts: dict[str, str]) -> hv.ui.View:
             extension="manufacturing-readout",
             panel="manufacturing-comparison",
             position="right",
             props={
-                "workspaceId": WORKSPACE_ID,
                 "models": model_panel_props(layouts),
                 "examples": build_examples(dataset),
                 "strengthRows": category_strength_rows(dataset),
                 "warnings": RUNTIME_WARNINGS,
             },
         ),
     )
@@ -491,7 +493,6 @@ def launch_demo(dataset: hv.Dataset, layouts: dict[str, str]) -> hv.Session:
 def main() -> None:
-    register_hyper3_clip_provider()
     dataset, layouts = build_dataset()
     print("Layouts:", flush=True)
     for spec in MODEL_SPECS:

 import hyperview as hv
 SPACE_DIR = Path(__file__).resolve().parent
 SPACE_HOST = os.environ.get("HYPERVIEW_HOST", "127.0.0.1")
 SPACE_PORT = int(os.environ.get("HYPERVIEW_PORT", "6265"))
 SAMPLES_PER_CATEGORY = int(os.environ.get("VISA_SAMPLES_PER_CATEGORY", "4"))
 TRAIN_FRACTION = float(os.environ.get("VISA_TRAIN_FRACTION", "0.5"))
 IMAGE_MAX_SIZE = (640, 640)
+FORCE_SAMPLE_REFRESH = os.environ.get("HYPERVIEW_VISA_FORCE_REFRESH", "").lower() in {
+    "1",
+    "true",
+    "yes",
+}
 ALLOW_CANDIDATE_FALLBACK = os.environ.get("HYPERVIEW_ALLOW_CANDIDATE_FALLBACK", "1").lower() in {
     "1",
     "true",
         "key": "candidate",
         "display_name": os.environ.get("VISA_CANDIDATE_DISPLAY_NAME", "Hyper3-CLIP"),
         "button_label": os.environ.get("VISA_CANDIDATE_BUTTON_LABEL", "Show Hyper3 neighbors"),
+        "provider": os.environ.get("VISA_CANDIDATE_PROVIDER", "hyper-models"),
+        "model": os.environ.get("VISA_CANDIDATE_MODEL", "hyper3-clip-v0.5"),
         "layout": os.environ.get("VISA_CANDIDATE_LAYOUT", "poincare:2d"),
         "geometry": os.environ.get("VISA_CANDIDATE_GEOMETRY", "poincare"),
         "layout_dimension": int(os.environ.get("VISA_CANDIDATE_LAYOUT_DIMENSION", "2")),
     media_dir = media_root()
     added = 0
     updated = 0
+    skipped = 0
     for record in select_visa_records():
         sample_id = safe_sample_id(record["category"], record["split_name"], record["row_index"], record["defect_label"])
         destination = Path(record["local_path"]) if record.get("local_path") else media_dir / f"{sample_id}.jpg"
             "source_dataset": "BrachioLab/visa",
         }
         existed = sample_id in existing_ids
+        if existed and not FORCE_SAMPLE_REFRESH:
+            skipped += 1
+            continue
         dataset.add_image(str(destination), label=record["category"], metadata=metadata, sample_id=sample_id)
         if existed:
             updated += 1
         else:
             added += 1
             existing_ids.add(sample_id)
+    if skipped:
+        print(f"Skipped {skipped} existing VisA sample rows.", flush=True)
     print(f"Prepared VisA samples ({added} added, {updated} updated).", flush=True)
                 )
                 print(warning, flush=True)
                 RUNTIME_WARNINGS.append(warning)
+                fallback_layout_key = layouts["clip"]
                 spec.update(
                     {
                         "display_name": "Hyper3-CLIP unavailable (CLIP fallback)",
                         "layout_dimension": MODEL_SPECS[0]["layout_dimension"],
                         "panel_title": "Hyper3-CLIP unavailable - showing CLIP fallback",
                         "fallback": True,
+                        "layout_key": fallback_layout_key,
                     }
                 )
+                layouts[spec["key"]] = fallback_layout_key
                 continue
             raise
         print(f"Ensuring {spec['display_name']} layout...", flush=True)
+        layout_key = dataset.compute_visualization(
             space_key=space_key,
             layout=spec["layout"],
             n_neighbors=20,
             min_dist=0.08,
             metric=spec["metric"],
         )
+        spec["layout_key"] = layout_key
+        layouts[spec["key"]] = layout_key
     return layouts
                 "displayName": spec["display_name"],
                 "buttonLabel": spec["button_label"],
                 "layoutKey": layout_key,
             }
         )
     return props
 def reference_summary(dataset: hv.Dataset, sample_id: str, model_key: str) -> dict[str, Any]:
     spec = next((item for item in MODEL_SPECS if item["key"] == model_key), None)
+    if spec is None or spec.get("layout_key") is None:
         return {}
     query = dataset[sample_id]
     query_sku = query.metadata.get("sku")
     query_family = query.metadata.get("product_family")
+    neighbors = dataset.find_similar(sample_id, k=10, layout_key=str(spec["layout_key"]))
     sku_hits = sum(1 for sample, _distance in neighbors if sample.metadata.get("sku") == query_sku)
     family_hits = sum(1 for sample, _distance in neighbors if sample.metadata.get("product_family") == query_family)
     normal_refs = sum(1 for sample, _distance in neighbors if sample.metadata.get("workflow_role") == "normal_reference")
     return sorted(rows, key=lambda row: float(row["delta"]), reverse=True)[:3]
 def build_demo_view(dataset: hv.Dataset, layouts: dict[str, str]) -> hv.ui.View:
     scatter_panels = [
         hv.ui.Scatter(
             extension="manufacturing-readout",
             panel="manufacturing-comparison",
             position="right",
+            layout=hv.ui.PanelLayout(width=340, min_width=300),
             props={
                 "models": model_panel_props(layouts),
                 "examples": build_examples(dataset),
                 "strengthRows": category_strength_rows(dataset),
                 "warnings": RUNTIME_WARNINGS,
             },
         ),
+        hv.ui.Samples(
+            id="manufacturing-neighbors",
+            title="Step 2 - Retrieved References",
+            position="bottom",
+            layout=hv.ui.PanelLayout(height=220, min_height=180),
+        ),
     )
 def main() -> None:
     dataset, layouts = build_dataset()
     print("Layouts:", flush=True)
     for spec in MODEL_SPECS:

hyper3_clip/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from hyper3_clip.models.hyper3_clip import Hyper3CLIP
-__all__ = ["Hyper3CLIP"]

hyper3_clip/data/__init__.py DELETED Viewed

@@ -1,14 +0,0 @@
-from hyper3_clip.data.collators import collate_grounded
-from hyper3_clip.data.grit_webdataset import ProcessedGritDataset
-from hyper3_clip.data.manifest_dataset import GroundedManifestDataset
-from hyper3_clip.data.mixed_dataset import MixedGroundedIterableDataset
-from hyper3_clip.data.types import GroundedParent, GroundedRecord
-__all__ = [
-    "GroundedManifestDataset",
-    "GroundedParent",
-    "GroundedRecord",
-    "MixedGroundedIterableDataset",
-    "ProcessedGritDataset",
-    "collate_grounded",
-]

hyper3_clip/data/collators.py DELETED Viewed

@@ -1,209 +0,0 @@
-from __future__ import annotations
-import re
-from typing import Any
-import torch
-def _attention_mask(tokens) -> torch.Tensor:
-    if "attention_mask" in tokens:
-        return tokens["attention_mask"]
-    return torch.ones_like(tokens["input_ids"])
-def collate_grounded(
-    batch: list[dict[str, Any]],
-    tokenizer,
-    max_text_length: int,
-    *,
-    beta_clip_queries: bool = False,
-    beta_clip_max_sentences: int = 5,
-    beta_clip_max_phrases: int = 30,
-    beta_clip_max_queries_per_image: int | None = None,
-    beta_clip_use_part_texts: bool = True,
-) -> dict[str, torch.Tensor]:
-    images = torch.stack([item["image"] for item in batch])
-    captions = [item["caption"] for item in batch]
-    part_image_rows: list[torch.Tensor] = []
-    part_texts: list[str] = []
-    part_owner: list[int] = []
-    for batch_index, item in enumerate(batch):
-        for part_index, part_image in enumerate(item["part_images"]):
-            part_image_rows.append(part_image)
-            part_texts.append(item["part_texts"][part_index])
-            part_owner.append(batch_index)
-    text = tokenizer(captions, padding=True, truncation=True, max_length=max_text_length, return_tensors="pt")
-    text_attention_mask = _attention_mask(text)
-    if part_image_rows:
-        part_images = torch.stack(part_image_rows)
-        part_text = tokenizer(part_texts, padding=True, truncation=True, max_length=max_text_length, return_tensors="pt")
-        part_text_input_ids = part_text["input_ids"]
-        part_text_attention_mask = _attention_mask(part_text)
-    else:
-        part_images = images.new_zeros((0, *images.shape[1:]))
-        empty_text_shape = (0, text["input_ids"].shape[1])
-        part_text_input_ids = text["input_ids"].new_zeros(empty_text_shape)
-        part_text_attention_mask = text_attention_mask.new_zeros(empty_text_shape)
-    collated = {
-        "image": images,
-        "part_images": part_images,
-        "part_owner": torch.tensor(part_owner, dtype=torch.long),
-        "text_input_ids": text["input_ids"],
-        "text_attention_mask": text_attention_mask,
-        "part_text_input_ids": part_text_input_ids,
-        "part_text_attention_mask": part_text_attention_mask,
-    }
-    if beta_clip_queries:
-        query_texts: list[str] = []
-        query_owner: list[int] = []
-        query_type: list[int] = []
-        query_parent: list[int] = []
-        query_weight: list[float] = []
-        query_source_part: list[int] = []
-        part_offsets = []
-        cursor = 0
-        for item in batch:
-            part_offsets.append(cursor)
-            cursor += len(item["part_images"])
-        for batch_index, item in enumerate(batch):
-            image_queries = _beta_clip_query_items_for_item(
-                caption=item["caption"],
-                part_texts=item["part_texts"],
-                max_sentences=beta_clip_max_sentences,
-                max_phrases=beta_clip_max_phrases,
-                max_queries=beta_clip_max_queries_per_image,
-                use_part_texts=beta_clip_use_part_texts,
-            )
-            query_offset = len(query_texts)
-            for query in image_queries:
-                query_texts.append(query["text"])
-                query_owner.append(batch_index)
-                query_type.append(query["type"])
-                local_parent = query["parent"]
-                query_parent.append(-1 if local_parent < 0 else query_offset + local_parent)
-                query_weight.append(query["weight"])
-                local_part = query["source_part"]
-                query_source_part.append(-1 if local_part < 0 else part_offsets[batch_index] + local_part)
-        query_tokens = tokenizer(query_texts, padding=True, truncation=True, max_length=max_text_length, return_tensors="pt")
-        collated.update(
-            {
-                "beta_query_input_ids": query_tokens["input_ids"],
-                "beta_query_attention_mask": _attention_mask(query_tokens),
-                "beta_query_owner": torch.tensor(query_owner, dtype=torch.long),
-                "beta_query_type": torch.tensor(query_type, dtype=torch.long),
-                "beta_query_parent": torch.tensor(query_parent, dtype=torch.long),
-                "beta_query_weight": torch.tensor(query_weight, dtype=torch.float32),
-                "beta_query_source_part": torch.tensor(query_source_part, dtype=torch.long),
-            }
-        )
-    return collated
-def _beta_clip_queries_for_item(
-    *,
-    caption: str,
-    part_texts: list[str],
-    max_sentences: int,
-    max_phrases: int,
-    max_queries: int | None,
-    use_part_texts: bool,
-) -> list[str]:
-    return [
-        query["text"]
-        for query in _beta_clip_query_items_for_item(
-            caption=caption,
-            part_texts=part_texts,
-            max_sentences=max_sentences,
-            max_phrases=max_phrases,
-            max_queries=max_queries,
-            use_part_texts=use_part_texts,
-        )
-    ]
-def _beta_clip_query_items_for_item(
-    *,
-    caption: str,
-    part_texts: list[str],
-    max_sentences: int,
-    max_phrases: int,
-    max_queries: int | None,
-    use_part_texts: bool,
-) -> list[dict[str, str | int | float]]:
-    queries: list[str] = []
-    query_items: list[dict[str, str | int | float]] = []
-    seen: set[str] = set()
-    def add_query(text: str, *, query_type: int, parent: int = 0, weight: float = 1.0, source_part: int = -1) -> int:
-        normalized = " ".join(str(text).strip().split())
-        key = normalized.casefold()
-        if len(normalized) >= 3 and key not in seen:
-            seen.add(key)
-            queries.append(normalized)
-            query_items.append(
-                {
-                    "text": normalized,
-                    "type": query_type,
-                    "parent": parent,
-                    "weight": weight,
-                    "source_part": source_part,
-                }
-            )
-            return len(queries) - 1
-        return -1
-    caption_index = add_query(caption, query_type=0, parent=-1, weight=0.0)
-    if caption_index < 0:
-        caption_index = 0
-    sentence_indices: list[int] = []
-    for sentence in _split_sentences(caption)[: max(0, max_sentences)]:
-        sentence_index = add_query(sentence, query_type=1, parent=caption_index, weight=1.0)
-        if sentence_index >= 0:
-            sentence_indices.append(sentence_index)
-    phrase_parent = sentence_indices[0] if sentence_indices else caption_index
-    phrase_count = 0
-    if use_part_texts:
-        for part_index, part_text in enumerate(part_texts):
-            if phrase_count >= max_phrases:
-                break
-            before = len(queries)
-            add_query(part_text, query_type=2, parent=caption_index, weight=0.75, source_part=part_index)
-            phrase_count += int(len(queries) > before)
-    if phrase_count < max_phrases:
-        for phrase in _extract_lightweight_phrases(caption):
-            if phrase_count >= max_phrases:
-                break
-            before = len(queries)
-            add_query(phrase, query_type=3, parent=phrase_parent, weight=0.5)
-            phrase_count += int(len(queries) > before)
-    if max_queries is not None:
-        return query_items[: max(1, int(max_queries))]
-    return query_items
-def _split_sentences(text: str) -> list[str]:
-    return [part.strip() for part in re.split(r"(?<=[.!?;])\s+|\n+", text) if part.strip()]
-def _extract_lightweight_phrases(text: str) -> list[str]:
-    chunks = re.split(r"[,;:()]|\s+(?:and|with|near|beside|behind|under|above|around|next to)\s+", text, flags=re.I)
-    phrases: list[str] = []
-    for chunk in chunks:
-        words = re.findall(r"[A-Za-z0-9]+(?:[-'][A-Za-z0-9]+)?", chunk)
-        if 2 <= len(words) <= 8:
-            phrases.append(" ".join(words))
-        elif len(words) > 8:
-            for start in range(0, len(words) - 1, 4):
-                phrase = " ".join(words[start : start + 6])
-                if len(phrase.split()) >= 2:
-                    phrases.append(phrase)
-    return phrases

hyper3_clip/data/grit_cleaning.py DELETED Viewed

@@ -1,554 +0,0 @@
-from __future__ import annotations
-import math
-import re
-import unicodedata
-from dataclasses import dataclass
-from typing import Any
-import numpy as np
-from PIL import Image
-SPACE_RE = re.compile(r"\s+")
-URL_RE = re.compile(r"(https?://|www\.|\.com\b|\.net\b|\.org\b)", re.IGNORECASE)
-EMAIL_RE = re.compile(r"\b[\w.+-]+@[\w.-]+\.\w+\b")
-HTML_RE = re.compile(r"<[^>]+>")
-TOKEN_RE = re.compile(r"[a-z0-9]+(?:[-'][a-z0-9]+)?")
-LEADING_DETERMINERS = {
-    "a",
-    "an",
-    "the",
-    "this",
-    "that",
-    "these",
-    "those",
-    "his",
-    "her",
-    "its",
-    "their",
-    "my",
-    "our",
-    "your",
-}
-QUANTITY_WORDS = {
-    "one",
-    "two",
-    "three",
-    "four",
-    "five",
-    "six",
-    "many",
-    "several",
-    "some",
-    "few",
-    "group",
-    "pair",
-}
-VISUAL_MODIFIERS = {
-    "big",
-    "small",
-    "large",
-    "little",
-    "old",
-    "young",
-    "new",
-    "red",
-    "blue",
-    "green",
-    "yellow",
-    "white",
-    "black",
-    "brown",
-    "gray",
-    "grey",
-    "orange",
-    "pink",
-    "purple",
-    "colorful",
-    "colourful",
-    "wooden",
-    "metal",
-    "plastic",
-    "striped",
-}
-NON_VISUAL_HEADS = {
-    "background",
-    "foreground",
-    "caption",
-    "copyright",
-    "credit",
-    "item",
-    "edge",
-    "image",
-    "left",
-    "logo",
-    "method",
-    "middle",
-    "number",
-    "photo",
-    "photograph",
-    "picture",
-    "place",
-    "right",
-    "scene",
-    "side",
-    "statement",
-    "stock",
-    "text",
-    "thing",
-    "view",
-    "watermark",
-}
-NON_VISUAL_PHRASES = {
-    "available at",
-    "all rights reserved",
-    "click here",
-    "copyright",
-    "getty images",
-    "istock",
-    "shutterstock",
-    "stock photo",
-}
-ACTION_SPLITS = (
-    " standing ",
-    " sitting ",
-    " lying ",
-    " walking ",
-    " running ",
-    " flying ",
-    " eating ",
-    " holding ",
-    " wearing ",
-    " playing ",
-)
-PREPOSITION_SPLITS = (
-    " next to ",
-    " in front of ",
-    " on top of ",
-    " inside ",
-    " outside ",
-    " with ",
-    " without ",
-    " near ",
-    " beside ",
-    " behind ",
-    " under ",
-    " over ",
-    " from ",
-    " into ",
-    " across ",
-    " around ",
-    " at ",
-    " on ",
-    " in ",
-    " of ",
-)
-CANONICAL_REWRITES = {
-    "aeroplane": "airplane",
-    "aircraft": "airplane",
-    "bike": "bicycle",
-    "cell phone": "phone",
-    "mobile phone": "phone",
-    "motorbike": "motorcycle",
-    "plant pot": "potted plant",
-    "tv": "television",
-}
-TOKEN_SYNONYMS = {
-    "airplane": {"airplane", "aeroplane", "aircraft", "plane"},
-    "bicycle": {"bicycle", "bike"},
-    "motorcycle": {"motorcycle", "motorbike"},
-    "person": {"person", "people", "man", "woman", "boy", "girl", "teenager", "teenagers"},
-    "people": {"person", "people", "man", "woman", "men", "women", "children", "teenager", "teenagers"},
-    "phone": {"phone", "cell", "mobile", "telephone"},
-    "television": {"television", "tv"},
-}
-HUMAN_GROUP_WORDS = {
-    "adults",
-    "boys",
-    "children",
-    "crowd",
-    "girls",
-    "kids",
-    "men",
-    "people",
-    "teenagers",
-    "teens",
-    "women",
-}
-HUMAN_SINGULAR_WORDS = {
-    "adult",
-    "baby",
-    "boy",
-    "child",
-    "girl",
-    "kid",
-    "man",
-    "person",
-    "teenager",
-    "woman",
-}
-HUMAN_ROLE_WORDS = {
-    "actor",
-    "actress",
-    "artist",
-    "athlete",
-    "boss",
-    "coach",
-    "doctor",
-    "lawyer",
-    "manager",
-    "minister",
-    "musician",
-    "player",
-    "politician",
-    "president",
-    "singer",
-    "teacher",
-}
-DEFAULT_HYPERNYMS = {
-    "airplane": ("aircraft", "vehicle"),
-    "apple": ("fruit", "food"),
-    "backpack": ("bag", "accessory"),
-    "baseball bat": ("bat", "sports equipment"),
-    "bear": ("mammal", "animal"),
-    "bicycle": ("vehicle",),
-    "bird": ("animal",),
-    "boat": ("vehicle",),
-    "bottle": ("container",),
-    "bus": ("vehicle",),
-    "car": ("vehicle",),
-    "cat": ("mammal", "animal"),
-    "chair": ("furniture",),
-    "cup": ("container",),
-    "dog": ("mammal", "animal"),
-    "flower": ("plant",),
-    "fork": ("utensil",),
-    "horse": ("mammal", "animal"),
-    "knife": ("utensil",),
-    "lamp": ("light", "furniture"),
-    "laptop": ("computer", "electronic device"),
-    "person": ("human", "animal"),
-    "phone": ("electronic device",),
-    "potted plant": ("plant",),
-    "shirt": ("clothing",),
-    "shoe": ("footwear", "clothing"),
-    "skis": ("sports equipment",),
-    "spoon": ("utensil",),
-    "sports ball": ("ball", "sports equipment"),
-    "table": ("furniture",),
-    "television": ("electronic device",),
-    "train": ("vehicle",),
-    "tree": ("plant",),
-    "truck": ("vehicle",),
-}
-@dataclass(frozen=True)
-class ImageQuality:
-    width: int
-    height: int
-    brightness: float
-    contrast: float
-    entropy: float
-    black_border_fraction: float
-@dataclass(frozen=True)
-class ParentCleanDecision:
-    original_text: str
-    canonical_text: str
-    keep: bool
-    quality_score: float
-    reasons: tuple[str, ...]
-    hypernyms: tuple[str, ...]
-    image_quality: ImageQuality | None = None
-def clean_parent(
-    parent_text: str,
-    caption: str = "",
-    parent_image: Image.Image | None = None,
-    min_score: float = 0.45,
-    hypernym_map: dict[str, tuple[str, ...]] | None = None,
-) -> ParentCleanDecision:
-    canonical = canonicalize_parent_text(parent_text)
-    reasons: list[str] = []
-    fatal = False
-    if not canonical:
-        reasons.append("empty_after_canonicalization")
-        fatal = True
-    if looks_like_boilerplate(parent_text):
-        reasons.append("boilerplate_or_url")
-        fatal = True
-    if canonical and is_non_visual_parent(canonical):
-        reasons.append("non_visual_parent")
-        fatal = True
-    if canonical and len(canonical.split()) > 6:
-        reasons.append("too_long_for_clean_parent")
-    if canonical and caption_duplicates_parent(caption, canonical):
-        reasons.append("duplicates_caption")
-        fatal = True
-    if canonical and not caption_mentions_parent(caption, canonical):
-        reasons.append("caption_does_not_mention_parent")
-    image_quality = image_quality_stats(parent_image) if parent_image is not None else None
-    if image_quality is not None:
-        if image_quality.entropy < 1.0 or image_quality.contrast < 3.0:
-            reasons.append("low_information_crop")
-        if image_quality.black_border_fraction > 0.65:
-            reasons.append("mostly_black_border")
-        if (
-            "caption_does_not_mention_parent" in reasons
-            and "low_information_crop" in reasons
-            and "mostly_black_border" in reasons
-        ):
-            reasons.append("text_slide_or_bad_crop")
-            fatal = True
-    score = parent_quality_score(canonical, reasons, image_quality)
-    hmap = DEFAULT_HYPERNYMS if hypernym_map is None else hypernym_map
-    hypernyms = tuple(hmap.get(canonical, ()))
-    keep = not fatal and score >= min_score
-    return ParentCleanDecision(
-        original_text=parent_text,
-        canonical_text=canonical,
-        keep=keep,
-        quality_score=score,
-        reasons=tuple(reasons),
-        hypernyms=hypernyms,
-        image_quality=image_quality,
-    )
-def canonicalize_parent_text(text: str) -> str:
-    text = normalize_text(text)
-    if not text:
-        return ""
-    text = strip_boilerplate_tail(text)
-    for marker in ACTION_SPLITS:
-        if marker in text:
-            text = text.split(marker, maxsplit=1)[0].strip()
-            break
-    human = canonicalize_human_text(text)
-    if human:
-        return human
-    for marker in PREPOSITION_SPLITS:
-        if marker in text:
-            text = text.split(marker, maxsplit=1)[0].strip()
-            break
-    tokens = TOKEN_RE.findall(text)
-    while tokens and (tokens[0] in LEADING_DETERMINERS or tokens[0] in QUANTITY_WORDS):
-        tokens.pop(0)
-    if len(tokens) > 2:
-        while tokens and tokens[0] in VISUAL_MODIFIERS:
-            tokens.pop(0)
-    candidate = " ".join(tokens).strip()
-    candidate = CANONICAL_REWRITES.get(candidate, candidate)
-    if candidate.endswith("s") and candidate[:-1] in DEFAULT_HYPERNYMS:
-        candidate = candidate[:-1]
-    return candidate
-def canonicalize_human_text(text: str) -> str:
-    tokens = TOKEN_RE.findall(text)
-    if not tokens:
-        return ""
-    token_set = set(tokens)
-    if token_set.intersection(HUMAN_GROUP_WORDS):
-        return "people"
-    for word in ("baby", "woman", "man", "girl", "boy", "child", "teenager", "person"):
-        if word in token_set:
-            return word
-    if token_set.intersection(HUMAN_ROLE_WORDS):
-        return "person"
-    return ""
-def normalize_text(text: str) -> str:
-    text = unicodedata.normalize("NFKC", str(text))
-    text = HTML_RE.sub(" ", text)
-    text = text.replace("_", " ").replace("/", " ")
-    text = text.strip().lower()
-    text = text.strip(" \t\r\n\"'.,;:!?()[]{}")
-    return SPACE_RE.sub(" ", text)
-def strip_boilerplate_tail(text: str) -> str:
-    for marker in (" - available at ", " available at ", " | ", " © ", " copyright "):
-        if marker in text:
-            text = text.split(marker, maxsplit=1)[0]
-    return text.strip()
-def looks_like_boilerplate(text: str) -> bool:
-    normalized = normalize_text(text)
-    if URL_RE.search(normalized) or EMAIL_RE.search(normalized):
-        return True
-    return any(phrase in normalized for phrase in NON_VISUAL_PHRASES)
-def is_non_visual_parent(canonical: str) -> bool:
-    tokens = canonical.split()
-    if not tokens:
-        return True
-    if canonical in NON_VISUAL_HEADS:
-        return True
-    if tokens[-1] in NON_VISUAL_HEADS:
-        return True
-    if all(token.isdigit() for token in tokens):
-        return True
-    return False
-def caption_mentions_parent(caption: str, canonical_parent: str) -> bool:
-    if not caption or not canonical_parent:
-        return True
-    caption_tokens = set(TOKEN_RE.findall(normalize_text(caption)))
-    parent_tokens = TOKEN_RE.findall(canonical_parent)
-    if not parent_tokens:
-        return False
-    for token in parent_tokens:
-        synonyms = TOKEN_SYNONYMS.get(token, {token})
-        if not caption_tokens.intersection(synonyms):
-            return False
-    return True
-def caption_duplicates_parent(caption: str, canonical_parent: str) -> bool:
-    if not caption or not canonical_parent:
-        return False
-    caption_tokens = TOKEN_RE.findall(normalize_text(caption))
-    parent_tokens = TOKEN_RE.findall(canonical_parent)
-    if len(parent_tokens) < 6 or not caption_tokens:
-        return False
-    caption_set = set(caption_tokens)
-    parent_set = set(parent_tokens)
-    overlap = len(caption_set.intersection(parent_set))
-    return overlap / max(len(parent_set), 1) >= 0.85 and overlap / max(len(caption_set), 1) >= 0.65
-def image_quality_stats(image: Image.Image | None) -> ImageQuality | None:
-    if image is None:
-        return None
-    rgb = image.convert("RGB")
-    width, height = rgb.size
-    gray = np.asarray(rgb.convert("L"), dtype=np.float32)
-    brightness = float(gray.mean())
-    contrast = float(gray.std())
-    hist, _ = np.histogram(gray, bins=64, range=(0, 256), density=True)
-    hist = hist[hist > 0]
-    entropy = float(-(hist * np.log2(hist)).sum())
-    border = _border_pixels(gray)
-    black_border_fraction = float((border < 8).mean()) if border.size else 0.0
-    return ImageQuality(
-        width=width,
-        height=height,
-        brightness=brightness,
-        contrast=contrast,
-        entropy=entropy,
-        black_border_fraction=black_border_fraction,
-    )
-def parent_quality_score(canonical: str, reasons: list[str], image_quality: ImageQuality | None) -> float:
-    if not canonical:
-        return 0.0
-    score = 1.0
-    penalties = {
-        "caption_does_not_mention_parent": 0.20,
-        "too_long_for_clean_parent": 0.20,
-        "low_information_crop": 0.25,
-        "mostly_black_border": 0.15,
-        "non_visual_parent": 0.60,
-        "boilerplate_or_url": 0.80,
-        "duplicates_caption": 0.80,
-        "text_slide_or_bad_crop": 0.80,
-    }
-    for reason in reasons:
-        score -= penalties.get(reason, 0.10)
-    if image_quality is not None:
-        if image_quality.brightness < 8 or image_quality.brightness > 247:
-            score -= 0.10
-        if image_quality.contrast > 8 and image_quality.entropy > 2:
-            score += 0.05
-    if canonical in DEFAULT_HYPERNYMS:
-        score += 0.05
-    return max(0.0, min(1.0, score))
-def merge_vlm_decision(
-    cheap: ParentCleanDecision,
-    vlm_payload: dict[str, Any] | None,
-    vlm_can_rescue: bool = False,
-) -> ParentCleanDecision:
-    if not vlm_payload:
-        return cheap
-    reasons = list(cheap.reasons)
-    canonical = normalize_text(vlm_payload.get("canonical_parent") or cheap.canonical_text)
-    if canonical:
-        canonical = canonicalize_parent_text(canonical)
-    hypernyms = tuple(
-        normalize_text(value)
-        for value in vlm_payload.get("hypernyms", cheap.hypernyms)
-        if normalize_text(value)
-    )
-    quality_score = float(vlm_payload.get("quality_score", cheap.quality_score))
-    keep_payload = vlm_payload.get("keep")
-    if keep_payload is False:
-        reasons.append("vlm_reject")
-        keep = False
-    elif keep_payload is True and vlm_can_rescue:
-        keep = quality_score >= 0.45 and bool(canonical)
-    else:
-        keep = cheap.keep and keep_payload is not False
-    reject_reason = normalize_text(vlm_payload.get("reject_reason") or "")
-    if reject_reason:
-        reasons.append(f"vlm:{reject_reason}")
-    return ParentCleanDecision(
-        original_text=cheap.original_text,
-        canonical_text=canonical,
-        keep=keep,
-        quality_score=max(0.0, min(1.0, quality_score)),
-        reasons=tuple(dict.fromkeys(reasons)),
-        hypernyms=hypernyms,
-        image_quality=cheap.image_quality,
-    )
-def expand_parent_texts(decision: ParentCleanDecision, add_hypernyms: bool) -> tuple[str, ...]:
-    if not decision.keep:
-        return ()
-    values = [decision.canonical_text]
-    if add_hypernyms:
-        values.extend(decision.hypernyms)
-    return tuple(dict.fromkeys(value for value in values if value))
-def _border_pixels(gray: np.ndarray) -> np.ndarray:
-    if gray.ndim != 2 or min(gray.shape) < 4:
-        return np.asarray([], dtype=np.float32)
-    width = max(1, min(gray.shape) // 16)
-    top = gray[:width, :].reshape(-1)
-    bottom = gray[-width:, :].reshape(-1)
-    left = gray[:, :width].reshape(-1)
-    right = gray[:, -width:].reshape(-1)
-    return np.concatenate([top, bottom, left, right])
-def finite_float(value: float) -> float:
-    return value if math.isfinite(value) else 0.0

hyper3_clip/data/grit_webdataset.py DELETED Viewed

@@ -1,133 +0,0 @@
-from __future__ import annotations
-import copy
-import glob
-import hashlib
-import random
-from collections.abc import Iterator, Sequence
-from pathlib import Path
-from typing import Any
-import torch
-import webdataset as wds
-from PIL import Image
-from torch.utils.data import IterableDataset, get_worker_info
-from hyper3_clip.data.transforms import build_train_transform
-from hyper3_clip.training.distributed import get_rank, get_world_size
-PART_SAMPLING_MODES = {"random_one", "all"}
-class ProcessedGritDataset(IterableDataset):
-    """Reader for official HyCoCLIP processed GRIT shards."""
-    def __init__(
-        self,
-        tarfiles: Sequence[str],
-        image_size: int,
-        seed: int,
-        shuffle_buffer: int = 4000,
-        part_sampling: str = "random_one",
-        max_parts: int | None = None,
-        train_transform: str = "wide_random_crop",
-        image_normalization: str = "imagenet",
-        deterministic_transforms: bool = False,
-    ) -> None:
-        self.tarfiles = _expand_tarfiles(tarfiles)
-        if not self.tarfiles:
-            raise FileNotFoundError(f"No GRIT processed shards matched {tarfiles!r}")
-        rank = get_rank()
-        world_size = get_world_size()
-        self.tarfiles = self.tarfiles[rank::world_size]
-        self.shuffle_buffer = shuffle_buffer
-        self.seed = seed
-        if part_sampling not in PART_SAMPLING_MODES:
-            raise ValueError(f"part_sampling must be one of {sorted(PART_SAMPLING_MODES)}, got {part_sampling!r}")
-        if max_parts is not None and max_parts <= 0:
-            raise ValueError("max_parts must be positive when set")
-        self.part_sampling = part_sampling
-        self.max_parts = max_parts
-        self.deterministic_transforms = deterministic_transforms
-        self.transform = build_train_transform(image_size, preset=train_transform, normalization=image_normalization)
-    def __iter__(self) -> Iterator[dict[str, Any]]:
-        worker = get_worker_info()
-        worker_id = worker.id if worker is not None else 0
-        shuffle_rng = random.Random(self.seed + get_rank() * 1_000_003 + worker_id)
-        part_rng = random.Random(self.seed + 31_415_926 + get_rank() * 1_000_003 + worker_id)
-        pipeline: Any = wds.DataPipeline(
-            wds.SimpleShardList(self.tarfiles, seed=self.seed),
-            wds.split_by_worker,
-            wds.tarfile_to_samples(),
-            wds.shuffle(self.shuffle_buffer, initial=self.shuffle_buffer, rng=shuffle_rng),
-            wds.decode("pil", handler=wds.warn_and_continue),
-        )
-        while True:
-            pipeline_copy = copy.deepcopy(pipeline)
-            for sample in pipeline_copy:
-                yield self._decode_sample(sample, part_rng)
-    def _decode_sample(self, sample: dict[str, Any], rng: random.Random) -> dict[str, Any]:
-        num_parents = int(_as_text(sample["numparents.txt"]))
-        parent_indices = self._select_parent_indices(num_parents, rng)
-        parent_keys = [f"parent{parent_index:03d}" for parent_index in parent_indices]
-        sample_key = _as_text(sample.get("__key__", ""))
-        return {
-            "image": self._transform_image(sample["child.jpg"], sample_key, "child"),
-            "caption": _as_text(sample["child.txt"]),
-            "part_images": [
-                self._transform_image(sample[f"{parent_key}.jpg"], sample_key, parent_key) for parent_key in parent_keys
-            ],
-            "part_texts": [_as_text(sample[f"{parent_key}.txt"]) for parent_key in parent_keys],
-        }
-    def _select_parent_indices(self, num_parents: int, rng: random.Random) -> list[int]:
-        if self.part_sampling == "random_one":
-            return [rng.randrange(num_parents)]
-        parent_indices = list(range(num_parents))
-        if self.max_parts is not None and len(parent_indices) > self.max_parts:
-            parent_indices = sorted(rng.sample(parent_indices, k=self.max_parts))
-        return parent_indices
-    def _transform_image(self, value: Any, sample_key: str, role: str) -> torch.Tensor:
-        image = _as_image(value)
-        if not self.deterministic_transforms:
-            return self.transform(image)
-        transform_seed = _stable_seed(self.seed, sample_key, role)
-        python_random_state = random.getstate()
-        try:
-            random.seed(transform_seed)
-            with torch.random.fork_rng(devices=[]):
-                torch.manual_seed(transform_seed)
-                return self.transform(image)
-        finally:
-            random.setstate(python_random_state)
-def _expand_tarfiles(tarfiles: Sequence[str]) -> list[str]:
-    expanded: list[str] = []
-    for pattern in tarfiles:
-        matches = sorted(glob.glob(pattern))
-        expanded.extend(matches if matches else [pattern])
-    return [str(Path(path)) for path in expanded]
-def _as_text(value: Any) -> str:
-    return value.decode("utf-8") if isinstance(value, bytes) else str(value)
-def _as_image(value: Any) -> Image.Image:
-    if not isinstance(value, Image.Image):
-        raise TypeError(f"Expected PIL image from WebDataset decode, got {type(value)!r}")
-    return value.convert("RGB")
-def _stable_seed(seed: int, *parts: str) -> int:
-    digest = hashlib.blake2b(digest_size=8)
-    digest.update(str(seed).encode("utf-8"))
-    for part in parts:
-        digest.update(b"\0")
-        digest.update(part.encode("utf-8"))
-    return int.from_bytes(digest.digest(), byteorder="big", signed=False)

hyper3_clip/data/manifest_dataset.py DELETED Viewed

@@ -1,120 +0,0 @@
-from __future__ import annotations
-import json
-import random
-from pathlib import Path
-from typing import Any
-import torch
-from PIL import Image
-from torch.utils.data import Dataset, get_worker_info
-from hyper3_clip.data.collators import collate_grounded as collate_grounded
-from hyper3_clip.data.transforms import build_train_transform
-from hyper3_clip.data.types import GroundedParent, GroundedRecord
-__all__ = ["GroundedManifestDataset", "collate_grounded"]
-PART_SAMPLING_MODES = {"random_one", "all"}
-class GroundedManifestDataset(Dataset):
-    """Manifest dataset with one full image/caption and one or more grounded parents per row."""
-    def __init__(
-        self,
-        manifests: list[str] | str | Path,
-        image_size: int,
-        seed: int,
-        manifest_weights: list[float] | None = None,
-        part_sampling: str = "random_one",
-        max_parts: int | None = None,
-        train_transform: str = "wide_random_crop",
-        image_normalization: str = "imagenet",
-    ) -> None:
-        manifest_paths = [str(manifests)] if isinstance(manifests, str | Path) else manifests
-        self.records: list[GroundedRecord] = []
-        source_records: list[list[GroundedRecord]] = []
-        for manifest_path in manifest_paths:
-            rows: list[GroundedRecord] = []
-            with Path(manifest_path).open("r", encoding="utf-8") as handle:
-                for line in handle:
-                    if line.strip():
-                        rows.append(GroundedRecord.from_json(json.loads(line)))
-            source_records.append(rows)
-        if manifest_weights is None:
-            for rows in source_records:
-                self.records.extend(rows)
-        else:
-            if len(manifest_weights) != len(source_records):
-                raise ValueError("manifest_weights must match manifests length")
-            max_len = max(len(rows) for rows in source_records if rows)
-            for rows, weight in zip(source_records, manifest_weights):
-                if not rows or weight <= 0.0:
-                    continue
-                target_len = max(1, int(round(max_len * weight)))
-                for idx in range(target_len):
-                    self.records.append(rows[idx % len(rows)])
-        self.seed = seed
-        if part_sampling not in PART_SAMPLING_MODES:
-            raise ValueError(f"part_sampling must be one of {sorted(PART_SAMPLING_MODES)}, got {part_sampling!r}")
-        if max_parts is not None and max_parts <= 0:
-            raise ValueError("max_parts must be positive when set")
-        self.part_sampling = part_sampling
-        self.max_parts = max_parts
-        self.transform = build_train_transform(image_size, preset=train_transform, normalization=image_normalization)
-    def __len__(self) -> int:
-        return len(self.records)
-    def __getitem__(self, index: int) -> dict[str, Any]:
-        record = self.records[index]
-        parents = self._select_parents(index, record.parents)
-        return {
-            "image": self._load_image(record.image_path),
-            "part_images": [self._load_parent_image(record.image_path, parent) for parent in parents],
-            "caption": record.caption,
-            "part_texts": [parent.text for parent in parents],
-        }
-    def _select_parents(self, index: int, parents: tuple[GroundedParent, ...]) -> tuple[GroundedParent, ...]:
-        if self.part_sampling == "all":
-            if self.max_parts is None or len(parents) <= self.max_parts:
-                return parents
-            worker = get_worker_info()
-            worker_id = worker.id if worker is not None else 0
-            rng = random.Random(self.seed + index + 1_000_003 * worker_id)
-            parent_indices = sorted(rng.sample(range(len(parents)), k=self.max_parts))
-            return tuple(parents[parent_index] for parent_index in parent_indices)
-        worker = get_worker_info()
-        worker_id = worker.id if worker is not None else 0
-        rng = random.Random(self.seed + index + 1_000_003 * worker_id)
-        return (parents[rng.randrange(len(parents))],)
-    def _load_image(self, path: Path) -> torch.Tensor:
-        with Image.open(path) as image:
-            return self.transform(image.convert("RGB"))
-    def _load_parent_image(self, image_path: Path, parent: GroundedParent) -> torch.Tensor:
-        source_path = parent.image_path or image_path
-        with Image.open(source_path) as image:
-            rgb = image.convert("RGB")
-            if parent.bbox is not None:
-                rgb = _crop_bbox(rgb, parent.bbox)
-            return self.transform(rgb)
-def _crop_bbox(image: Image.Image, bbox: tuple[float, float, float, float]) -> Image.Image:
-    width, height = image.size
-    left, top, right, bottom = bbox
-    crop_box = (
-        max(0, min(width, int(round(left)))),
-        max(0, min(height, int(round(top)))),
-        max(0, min(width, int(round(right)))),
-        max(0, min(height, int(round(bottom)))),
-    )
-    if crop_box[2] <= crop_box[0] or crop_box[3] <= crop_box[1]:
-        return image
-    return image.crop(crop_box)

hyper3_clip/data/mixed_dataset.py DELETED Viewed

@@ -1,68 +0,0 @@
-from __future__ import annotations
-import random
-from collections.abc import Iterator
-from typing import Any
-from torch.utils.data import Dataset, IterableDataset, get_worker_info
-from hyper3_clip.training.distributed import get_rank, get_world_size
-class MixedGroundedIterableDataset(IterableDataset):
-    """Infinite stream that mixes a primary stream with a finite grounded dataset.
-    This is intended for cleaned processed-GRIT plus explicit taxonomy hierarchy
-    manifests. The primary stream remains the pacing dataset, while auxiliary
-    examples are sampled with a fixed probability.
-    """
-    def __init__(
-        self,
-        primary: IterableDataset,
-        auxiliary: Dataset,
-        auxiliary_probability: float,
-        seed: int,
-    ) -> None:
-        if not 0.0 <= auxiliary_probability <= 1.0:
-            raise ValueError("auxiliary_probability must be in [0, 1]")
-        if len(auxiliary) == 0:
-            raise ValueError("auxiliary dataset must not be empty")
-        self.primary = primary
-        self.auxiliary = auxiliary
-        self.auxiliary_probability = auxiliary_probability
-        self.seed = seed
-    def __iter__(self) -> Iterator[dict[str, Any]]:
-        worker = get_worker_info()
-        worker_id = worker.id if worker is not None else 0
-        num_workers = worker.num_workers if worker is not None else 1
-        rank = get_rank()
-        world_size = get_world_size()
-        rng = random.Random(self.seed + 1_000_003 * rank + 9_176 * worker_id)
-        primary_iter = iter(self.primary)
-        auxiliary_iter = self._iter_auxiliary_indices(rng, rank, world_size, worker_id, num_workers)
-        while True:
-            if rng.random() < self.auxiliary_probability:
-                yield self.auxiliary[next(auxiliary_iter)]
-            else:
-                yield next(primary_iter)
-    def _iter_auxiliary_indices(
-        self,
-        rng: random.Random,
-        rank: int,
-        world_size: int,
-        worker_id: int,
-        num_workers: int,
-    ) -> Iterator[int]:
-        indices = list(range(len(self.auxiliary)))
-        indices = indices[rank::world_size]
-        indices = indices[worker_id::num_workers]
-        if not indices:
-            indices = list(range(len(self.auxiliary)))
-        while True:
-            shuffled = list(indices)
-            rng.shuffle(shuffled)
-            yield from shuffled

hyper3_clip/data/transforms.py DELETED Viewed

@@ -1,125 +0,0 @@
-from __future__ import annotations
-from torchvision import transforms
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
-CLIP_STD = (0.26862954, 0.26130258, 0.27577711)
-SIGLIP_MEAN = (0.5, 0.5, 0.5)
-SIGLIP_STD = (0.5, 0.5, 0.5)
-def normalization_stats(normalization: str) -> tuple[tuple[float, float, float], tuple[float, float, float]]:
-    if normalization == "imagenet":
-        return IMAGENET_MEAN, IMAGENET_STD
-    if normalization == "clip":
-        return CLIP_MEAN, CLIP_STD
-    if normalization == "siglip":
-        return SIGLIP_MEAN, SIGLIP_STD
-    raise ValueError("normalization must be one of 'imagenet', 'clip', or 'siglip'")
-def build_train_transform(
-    image_size: int,
-    preset: str = "wide_random_crop",
-    normalization: str = "imagenet",
-) -> transforms.Compose:
-    if preset == "wide_random_crop":
-        steps = [
-            transforms.RandomResizedCrop(
-                size=image_size,
-                scale=(0.5, 1.0),
-                interpolation=transforms.InterpolationMode.BICUBIC,
-            ),
-            transforms.ToTensor(),
-        ]
-    elif preset == "wide_random_crop_light_color":
-        steps = [
-            transforms.RandomResizedCrop(
-                size=image_size,
-                scale=(0.5, 1.0),
-                interpolation=transforms.InterpolationMode.BICUBIC,
-            ),
-            transforms.RandomApply(
-                [
-                    transforms.ColorJitter(
-                        brightness=0.2,
-                        contrast=0.2,
-                        saturation=0.2,
-                        hue=0.05,
-                    )
-                ],
-                p=0.4,
-            ),
-            transforms.ToTensor(),
-        ]
-    elif preset == "medium_random_crop":
-        steps = [
-            transforms.RandomResizedCrop(
-                size=image_size,
-                scale=(0.6, 1.0),
-                interpolation=transforms.InterpolationMode.BICUBIC,
-            ),
-            transforms.ToTensor(),
-        ]
-    elif preset == "center_crop":
-        steps = [
-            transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BICUBIC),
-            transforms.CenterCrop(image_size),
-            transforms.ToTensor(),
-        ]
-    elif preset == "tight_crop_color_jitter_gray":
-        steps = [
-            transforms.RandomResizedCrop(
-                size=image_size,
-                scale=(0.8, 1.0),
-                interpolation=transforms.InterpolationMode.BICUBIC,
-            ),
-            transforms.RandomApply(
-                [
-                    transforms.ColorJitter(
-                        brightness=0.4,
-                        contrast=0.4,
-                        saturation=0.4,
-                        hue=0.1,
-                    )
-                ],
-                p=0.8,
-            ),
-            transforms.RandomGrayscale(p=0.2),
-            transforms.ToTensor(),
-        ]
-    else:
-        raise ValueError(
-            f"Unsupported train transform preset {preset!r}; "
-            "expected 'wide_random_crop', 'wide_random_crop_light_color', "
-            "'medium_random_crop', 'tight_crop_color_jitter_gray', or 'center_crop'"
-        )
-    mean, std = normalization_stats(normalization)
-    return transforms.Compose([*steps, transforms.Normalize(mean=mean, std=std)])
-def build_eval_transform(image_size: int, normalization: str = "imagenet") -> transforms.Compose:
-    mean, std = normalization_stats(normalization)
-    return transforms.Compose(
-        [
-            transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BICUBIC),
-            transforms.CenterCrop(image_size),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=mean, std=std),
-        ]
-    )
-def build_retrieval_transform(image_size: int, normalization: str = "imagenet") -> transforms.Compose:
-    mean, std = normalization_stats(normalization)
-    return transforms.Compose(
-        [
-            transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC),
-            transforms.ToTensor(),
-            transforms.Normalize(mean=mean, std=std),
-        ]
-    )

hyper3_clip/data/types.py DELETED Viewed

@@ -1,48 +0,0 @@
-from __future__ import annotations
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any
@dataclass(frozen=True)
class GroundedParent:
    """Immutable description of one parent entry attached to a grounded record."""

    # Text associated with this parent.
    text: str
    # Optional path to an image for this parent.
    image_path: Path | None = None
    # Optional four-value box; the coordinate convention is not defined here.
    bbox: tuple[float, float, float, float] | None = None
@dataclass(frozen=True)
class GroundedRecord:
    """Immutable image/caption record together with its parent entries."""

    image_path: Path
    caption: str
    parents: tuple[GroundedParent, ...]

    @classmethod
    def from_json(cls, payload: dict[str, Any]) -> "GroundedRecord":
        """Build a record from a decoded JSON mapping.

        When ``payload`` has no ``"parents"`` entry, a single parent is
        synthesized from the flat ``box_text`` / ``box_image_path`` / ``bbox``
        keys. Each parent mapping may use either ``text`` or ``box_text`` and
        either ``image_path`` or ``box_image_path``.

        Raises:
            ValueError: if a bbox does not have exactly four values, or if
                the parent list is empty.
            KeyError: if ``image_path`` or ``caption`` is missing.
        """
        raw_parents = payload.get("parents")
        if raw_parents is None:
            # No explicit parent list: fall back to the flat box_* keys.
            raw_parents = [
                {
                    "text": payload.get("box_text", ""),
                    "image_path": payload.get("box_image_path"),
                    "bbox": payload.get("bbox"),
                }
            ]

        collected: list[GroundedParent] = []
        for entry in raw_parents:
            label = str(entry.get("text") or entry.get("box_text") or "").strip()
            location = entry.get("image_path") or entry.get("box_image_path")
            raw_bbox = entry.get("bbox")
            if raw_bbox is None:
                box = None
            elif len(raw_bbox) != 4:
                raise ValueError(f"Expected four bbox values, got {raw_bbox!r}")
            else:
                box = tuple(float(value) for value in raw_bbox)
            parent_image = Path(location) if location else None
            collected.append(GroundedParent(text=label, image_path=parent_image, bbox=box))

        if not collected:
            raise ValueError("Grounded records must include at least one parent")
        return cls(
            image_path=Path(payload["image_path"]),
            caption=str(payload["caption"]),
            parents=tuple(collected),
        )

hyper3_clip/evaluation/__init__.py DELETED Viewed

@@ -1,20 +0,0 @@
-from hyper3_clip.evaluation.classification import evaluate_imagenet_zero_shot
-from hyper3_clip.evaluation.hierarchical import evaluate_imagenet_hierarchical
-from hyper3_clip.evaluation.pep import PEPEntailmentDataset, evaluate_pep_entailment
-from hyper3_clip.evaluation.retrieval import (
-    CocoCaptionRetrieval,
-    CocoKarpathyCaptionRetrieval,
-    Flickr30kCaptionRetrieval,
-    evaluate_caption_retrieval,
-)
-__all__ = [
-    "CocoCaptionRetrieval",
-    "CocoKarpathyCaptionRetrieval",
-    "Flickr30kCaptionRetrieval",
-    "PEPEntailmentDataset",
-    "evaluate_caption_retrieval",
-    "evaluate_imagenet_hierarchical",
-    "evaluate_imagenet_zero_shot",
-    "evaluate_pep_entailment",
-]

hyper3_clip/evaluation/classification.py DELETED Viewed

@@ -1,105 +0,0 @@
-from __future__ import annotations
-from pathlib import Path
-import torch
-from torch.utils.data import DataLoader
-from torch.utils.data import Subset
-from torchvision import datasets
-from hyper3_clip.data.transforms import build_eval_transform
-from hyper3_clip.models.hyper3_clip import Hyper3CLIP
-IMAGENET_PROMPTS = (
-    "i took a picture : itap of a {}.",
-    "pics : a bad photo of the {}.",
-    "pics : a origami {}.",
-    "pics : a photo of the large {}.",
-    "pics : a {} in a video game.",
-    "pics : art of the {}.",
-    "pics : a photo of the small {}.",
-)
@torch.inference_mode()
def evaluate_imagenet_zero_shot(
    model: Hyper3CLIP,
    imagenet_val_root: str | Path,
    device: torch.device,
    batch_size: int = 128,
    image_size: int = 224,
    max_text_length: int = 77,
    max_items: int | None = None,
    prompts: tuple[str, ...] = IMAGENET_PROMPTS,
    num_workers: int = 4,
) -> dict[str, float]:
    """Run zero-shot classification over an ``ImageFolder`` validation tree.

    Args:
        model: model providing ``encode_image``, ``similarity_scores`` and the
            text-side methods used by ``_build_text_classifier``.
        imagenet_val_root: root directory passed to ``datasets.ImageFolder``.
        device: device used for inference.
        batch_size: images per batch.
        image_size: side length handed to ``build_eval_transform``.
        max_text_length: tokenizer truncation length for the class prompts.
        max_items: if given, only the first ``max_items`` dataset entries
            are evaluated.
        prompts: prompt templates, each formatted with the class name.
        num_workers: number of ``DataLoader`` worker processes (previously
            hard-coded to 4).

    Returns:
        ``top1`` (fraction), ``top1_pct`` and ``mean_per_class_acc_pct``.
        All three are 0 when no image was evaluated.
    """
    model.eval()
    dataset = datasets.ImageFolder(str(imagenet_val_root), transform=build_eval_transform(image_size))
    class_names = _imagenet_prompt_names(dataset.classes, Path(imagenet_val_root))
    classifier = _build_text_classifier(model, class_names, prompts, device, max_text_length)
    eval_dataset = Subset(dataset, range(min(max_items, len(dataset)))) if max_items is not None else dataset
    loader = DataLoader(
        eval_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=device.type == "cuda",
    )

    correct = 0
    total = 0
    num_classes = len(dataset.classes)
    per_class_correct = torch.zeros(num_classes, dtype=torch.float64)
    per_class_total = torch.zeros(num_classes, dtype=torch.float64)
    for images, targets in loader:
        images = images.to(device, non_blocking=True)
        device_targets = targets.to(device, non_blocking=True)
        predictions = model.similarity_scores(model.encode_image(images), classifier).argmax(dim=1)
        # Move results to the CPU once per batch; the per-class counters live there.
        cpu_matches = (predictions == device_targets).cpu()
        cpu_targets = device_targets.cpu()
        correct += int(cpu_matches.sum().item())
        total += cpu_targets.numel()
        per_class_correct.scatter_add_(0, cpu_targets, cpu_matches.double())
        per_class_total.scatter_add_(0, cpu_targets, torch.ones_like(cpu_targets, dtype=torch.float64))

    observed_classes = per_class_total > 0
    if bool(observed_classes.any()):
        mean_per_class = (per_class_correct[observed_classes] / per_class_total[observed_classes]).mean().item()
    else:
        # Nothing evaluated: report 0 like ``top1`` does instead of NaN.
        mean_per_class = 0.0
    top1 = correct / max(total, 1)
    return {"top1": top1, "top1_pct": 100.0 * top1, "mean_per_class_acc_pct": 100.0 * mean_per_class}
def _build_text_classifier(
    model: Hyper3CLIP,
    class_names: list[str],
    prompts: tuple[str, ...],
    device: torch.device,
    max_text_length: int,
) -> torch.Tensor:
    """Build one text embedding per class by averaging over prompt templates.

    For each class, underscores in the name are replaced with spaces, every
    template in ``prompts`` is formatted with that name, the texts are encoded
    with ``project=False``, averaged over the prompt axis, and the mean is
    passed through ``model.project_text_features``. The per-class results are
    stacked along dimension 0.
    """

    def _class_embedding(class_name: str) -> torch.Tensor:
        spoken_name = class_name.replace("_", " ")
        encoded = model.tokenizer(
            [template.format(spoken_name) for template in prompts],
            padding=True,
            truncation=True,
            max_length=max_text_length,
            return_tensors="pt",
        ).to(device)
        # Some tokenizers may not return a mask; attend to every token then.
        if "attention_mask" in encoded:
            mask = encoded.attention_mask
        else:
            mask = torch.ones_like(encoded.input_ids)
        unprojected = model.encode_text(encoded.input_ids, mask, project=False)
        pooled = unprojected.float().mean(dim=0, keepdim=True)
        return model.project_text_features(pooled).squeeze(0)

    return torch.stack([_class_embedding(name) for name in class_names], dim=0)
-def _imagenet_prompt_names(class_names: list[str], imagenet_val_root: Path) -> list[str]:
-    if len(class_names) == 1000 and all(_looks_like_wnid(class_name) for class_name in class_names):
-        from torchvision.models._meta import _IMAGENET_CATEGORIES
-        label_index = imagenet_val_root / "imagenet_label_to_wnid.tsv"
-        if label_index.exists():
-            wnid_to_label: dict[str, int] = {}
-            for line in label_index.read_text(encoding="utf-8").splitlines():
-                label, wnid = line.split("\t", maxsplit=1)
-                wnid_to_label[wnid] = int(label)
-            return [_IMAGENET_CATEGORIES[wnid_to_label[class_name]] for class_name in class_names]
-        return list(_IMAGENET_CATEGORIES)
-    return class_names
-def _looks_like_wnid(class_name: str) -> bool:
-    return len(class_name) == 9 and class_name.startswith("n") and class_name[1:].isdigit()

hyper3_clip/evaluation/hierarchical.py DELETED Viewed

@@ -1,118 +0,0 @@
-from __future__ import annotations
-import csv
-import pickle
-from pathlib import Path
-import networkx as nx
-import torch
-from torch.utils.data import DataLoader, Subset
-from torchvision import datasets
-from hyper3_clip.data.transforms import build_eval_transform
-from hyper3_clip.evaluation.classification import IMAGENET_PROMPTS, _build_text_classifier, _imagenet_prompt_names, _looks_like_wnid
-from hyper3_clip.models.hyper3_clip import Hyper3CLIP
@torch.inference_mode()
def evaluate_imagenet_hierarchical(
    model: Hyper3CLIP,
    imagenet_val_root: str | Path,
    assets_root: str | Path,
    device: torch.device,
    batch_size: int = 128,
    image_size: int = 224,
    max_text_length: int = 77,
    max_items: int | None = None,
    prompts: tuple[str, ...] = IMAGENET_PROMPTS,
) -> dict[str, float]:
    """Compute hierarchy-aware zero-shot metrics over an ``ImageFolder`` tree.

    Args:
        model: model providing ``encode_image``, ``similarity_scores`` and the
            text-side methods used by ``_build_text_classifier``.
        imagenet_val_root: root directory passed to ``datasets.ImageFolder``.
        assets_root: directory containing ``all_synsets.pkl``,
            ``all_ancestors_indices.pkl`` and ``imagenet_isa.txt``.
        device: device used for inference.
        batch_size: images per batch.
        image_size: side length handed to ``build_eval_transform``.
        max_text_length: tokenizer truncation length for the class prompts.
        max_items: if given, only the first ``max_items`` dataset entries
            are evaluated.
        prompts: prompt templates, each formatted with the class name.

    Returns:
        Per-sample averages keyed ``tie``, ``lca``, ``jaccard``,
        ``hierarchical_precision`` and ``hierarchical_recall``.
    """
    model.eval()
    imagenet_root = Path(imagenet_val_root)
    dataset = datasets.ImageFolder(str(imagenet_root), transform=build_eval_transform(image_size))
    class_names = _imagenet_prompt_names(dataset.classes, imagenet_root)
    classifier = _build_text_classifier(model, class_names, prompts, device, max_text_length)
    eval_dataset = Subset(dataset, range(min(max_items, len(dataset)))) if max_items is not None else dataset
    loader = DataLoader(eval_dataset, batch_size=batch_size, num_workers=4, pin_memory=device.type == "cuda")

    assets_path = Path(assets_root)
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # point ``assets_root`` at trusted files.
    # Use context managers so the file handles are closed deterministically.
    with (assets_path / "all_synsets.pkl").open("rb") as handle:
        synsets_ordering = pickle.load(handle)
    with (assets_path / "all_ancestors_indices.pkl").open("rb") as handle:
        ancestor_indices = pickle.load(handle)
    graph = _create_graph_from_edges(assets_path / "imagenet_isa.txt")
    dataset_to_official = _dataset_to_official_indices(dataset.classes, imagenet_root, synsets_ordering).to(device)

    totals = torch.zeros(5, dtype=torch.float64)
    total_count = 0
    for images, targets in loader:
        images = images.to(device, non_blocking=True)
        # Translate ImageFolder indices into the label space of the assets.
        official_targets = dataset_to_official[targets.to(device, non_blocking=True)]
        dataset_predictions = model.similarity_scores(model.encode_image(images), classifier).argmax(dim=1)
        official_predictions = dataset_to_official[dataset_predictions]
        batch_totals = _hierarchical_totals(
            official_predictions.cpu().tolist(),
            official_targets.cpu().tolist(),
            ancestor_indices,
            graph,
            synsets_ordering,
        )
        totals += torch.tensor(batch_totals, dtype=torch.float64)
        total_count += int(official_targets.numel())

    averages = totals / max(total_count, 1)
    return {
        "tie": float(averages[0].item()),
        "lca": float(averages[1].item()),
        "jaccard": float(averages[2].item()),
        "hierarchical_precision": float(averages[3].item()),
        "hierarchical_recall": float(averages[4].item()),
    }
def _create_graph_from_edges(edge_file: Path) -> nx.DiGraph:
    """Read a space-delimited ``parent child`` edge list into a directed graph.

    Every row must contain exactly two fields; any other row makes the
    unpacking raise ``ValueError``.
    """
    graph = nx.DiGraph()
    with edge_file.open("r", encoding="utf-8") as handle:
        for row in csv.reader(handle, delimiter=" "):
            parent, child = row
            graph.add_edge(parent, child)
    return graph
-def _dataset_to_official_indices(class_names: list[str], imagenet_val_root: Path, synsets_ordering: list[str]) -> torch.Tensor:
-    label_index = imagenet_val_root / "imagenet_label_to_wnid.tsv"
-    if label_index.exists():
-        wnid_to_label = {}
-        for line in label_index.read_text(encoding="utf-8").splitlines():
-            label, wnid = line.split("\t", maxsplit=1)
-            wnid_to_label[wnid] = int(label)
-        return torch.tensor([wnid_to_label[class_name] for class_name in class_names], dtype=torch.long)
-    if all(_looks_like_wnid(class_name) for class_name in class_names):
-        synset_to_label = {synset: label for label, synset in enumerate(synsets_ordering)}
-        return torch.tensor([synset_to_label[class_name] for class_name in class_names], dtype=torch.long)
-    return torch.arange(len(class_names), dtype=torch.long)
def _hierarchical_totals(
    predicted_labels: list[int],
    true_labels: list[int],
    ancestor_indices: list[list[int]],
    graph: nx.DiGraph,
    synsets_ordering: list[str],
) -> tuple[float, float, float, float, float]:
    """Sum five hierarchy metrics over paired predicted/true labels.

    Returns un-normalized totals in this order: shortest-path length between
    the two synsets in the undirected graph, ``|pred ancestors| - |shared| + 1``,
    Jaccard overlap of the ancestor sets, shared/pred-ancestor ratio and
    shared/true-ancestor ratio. The caller divides by the sample count.
    """
    undirected = graph.to_undirected()
    tie_sum = 0.0
    lca_sum = 0.0
    jaccard_sum = 0.0
    precision_sum = 0.0
    recall_sum = 0.0
    for predicted, actual in zip(predicted_labels, true_labels):
        predicted_synset = synsets_ordering[predicted]
        actual_synset = synsets_ordering[actual]
        predicted_set = set(ancestor_indices[predicted])
        actual_set = set(ancestor_indices[actual])
        shared = len(predicted_set & actual_set)
        combined = len(predicted_set | actual_set)
        tie_sum += nx.shortest_path_length(undirected, source=predicted_synset, target=actual_synset)
        lca_sum += len(predicted_set) - shared + 1
        jaccard_sum += shared / combined
        precision_sum += shared / len(predicted_set)
        recall_sum += shared / len(actual_set)
    return tie_sum, lca_sum, jaccard_sum, precision_sum, recall_sum

hyper3_clip/evaluation/pep.py DELETED Viewed

@@ -1,462 +0,0 @@
-from __future__ import annotations
-import csv
-import hashlib
-import json
-import math
-from urllib.parse import urlparse
-import urllib.request
-from dataclasses import dataclass
-from io import BytesIO
-from pathlib import Path
-from typing import Any
-import torch
-from PIL import Image
-from torch.utils.data import Dataset
-from hyper3_clip.data.transforms import build_eval_transform
-from hyper3_clip.models.hyper3_clip import Hyper3CLIP
-from hyper3_clip.models.losses import factor_oxy_angle
@dataclass(frozen=True)
class PEPSample:
    """Immutable PEP evaluation sample: one image with its caption sets."""

    # Identifier used in error messages and returned by the dataset.
    image_id: str
    # Local image file, if one is known.
    image_path: Path | None
    # Remote image location, used only when no local path is set.
    image_url: str | None
    # Captions treated as entailed by the image.
    positive_captions: tuple[str, ...]
    # Captions treated as not entailed; empty when none were annotated.
    negative_captions: tuple[str, ...] = ()
class PEPEntailmentDataset(Dataset):
    """Dataset yielding transformed images together with their PEP captions."""

    def __init__(
        self,
        annotations_path: str | Path,
        image_root: str | Path | None = None,
        image_size: int = 224,
        max_items: int | None = None,
        image_cache_dir: str | Path | None = None,
        allow_image_download: bool = False,
    ) -> None:
        """Load the annotations and prepare the evaluation transform.

        Args:
            annotations_path: annotation file handed to ``load_pep_samples``.
            image_root: base directory for relative image paths.
            image_size: side length handed to ``build_eval_transform``.
            max_items: if given, keep only the first ``max_items`` samples.
            image_cache_dir: directory used to cache downloaded images.
            allow_image_download: whether URL-only samples may be fetched.
        """
        loaded_samples, global_negatives = load_pep_samples(
            annotations_path,
            image_root=image_root,
            max_items=max_items,
        )
        self.samples = loaded_samples
        self.global_negative_captions = global_negatives
        self.transform = build_eval_transform(image_size)
        self.image_cache_dir = None if image_cache_dir is None else Path(image_cache_dir)
        self.allow_image_download = allow_image_download

    def __len__(self) -> int:
        """Return the number of loaded samples."""
        return len(self.samples)

    def __getitem__(self, index: int) -> dict[str, Any]:
        """Return the transformed image, its id and both caption tuples."""
        sample = self.samples[index]
        loaded = _load_sample_image(sample, self.image_cache_dir, self.allow_image_download)
        rgb_image = loaded.convert("RGB")
        return {
            "image": self.transform(rgb_image),
            "image_id": sample.image_id,
            "positive_captions": sample.positive_captions,
            "negative_captions": sample.negative_captions,
        }
@torch.inference_mode()
def evaluate_pep_entailment(
    model: Hyper3CLIP,
    annotations_path: str | Path,
    device: torch.device,
    image_root: str | Path | None = None,
    image_size: int = 224,
    max_text_length: int = 77,
    batch_size: int = 128,
    max_items: int | None = None,
    image_cache_dir: str | Path | None = None,
    allow_image_download: bool = False,
    negative_pool_strategy: str = "annotation",
    max_negatives_per_image: int | None = None,
    pair_batch_size: int = 8192,
) -> dict[str, float]:
    """Score image/caption entailment pairs and report ROC AUC and AP.

    Every image is paired with its own positive captions (label 1) and with
    negative captions (label 0). Negatives come from the sample's own
    annotations or the file-level pool; with ``negative_pool_strategy`` set
    to ``"all_fine_captions"`` the last positive caption of every other
    sample is used when no annotated negatives exist.

    Raises:
        ValueError: for an unknown strategy, an empty dataset, or when the
            pairs do not contain both labels.
    """
    if negative_pool_strategy not in {"annotation", "all_fine_captions"}:
        raise ValueError("negative_pool_strategy must be 'annotation' or 'all_fine_captions'")
    model.eval()

    dataset = PEPEntailmentDataset(
        annotations_path,
        image_root=image_root,
        image_size=image_size,
        max_items=max_items,
        image_cache_dir=image_cache_dir,
        allow_image_download=allow_image_download,
    )
    if len(dataset) == 0:
        raise ValueError("PEP evaluation requires at least one sample")

    image_feats = _encode_images(model, dataset, device, batch_size)
    pair_image_indices, pair_captions, labels = _build_pep_pairs(
        dataset.samples,
        dataset.global_negative_captions,
        negative_pool_strategy=negative_pool_strategy,
        max_negatives_per_image=max_negatives_per_image,
    )
    if not any(labels) or all(labels):
        raise ValueError("PEP evaluation requires both positive and negative pairs")

    # Encode each distinct caption once, then map the pairs onto those rows.
    unique_captions = sorted(set(pair_captions))
    caption_rows = {caption: row for row, caption in enumerate(unique_captions)}
    text_feats = _encode_texts(model, unique_captions, device, max_text_length, batch_size)
    pair_text_indices = [caption_rows[caption] for caption in pair_captions]

    scores = _score_pep_pairs(
        model,
        image_feats,
        text_feats,
        pair_image_indices,
        pair_text_indices,
        device,
        pair_batch_size=pair_batch_size,
    )
    auc = _roc_auc_score(labels, scores)
    ap = _average_precision_score(labels, scores)

    scored_pairs = list(zip(scores, labels))
    positive_scores = [score for score, label in scored_pairs if label == 1]
    negative_scores = [score for score, label in scored_pairs if label == 0]
    positive_count = sum(labels)
    return {
        "auc_roc": auc,
        "average_precision": ap,
        "auc_roc_pct": 100.0 * auc,
        "average_precision_pct": 100.0 * ap,
        "num_samples": float(len(dataset.samples)),
        "num_pairs": float(len(labels)),
        "num_positive_pairs": float(positive_count),
        "num_negative_pairs": float(len(labels) - positive_count),
        "mean_positive_score": float(sum(positive_scores) / len(positive_scores)),
        "mean_negative_score": float(sum(negative_scores) / len(negative_scores)),
    }
def probabilistic_entailment_score(
    specific: torch.Tensor,
    general: torch.Tensor,
    kappa: torch.Tensor,
) -> torch.Tensor:
    """Map ``factor_oxy_angle`` angles to scores in ``[0, 1]``.

    A two-dimensional angle tensor is first averaged over its last axis.
    The score is ``1 - 2 * angle / pi`` clamped to ``[0, 1]``, so an angle of
    0 scores 1 and angles of ``pi / 2`` or more score 0.
    """
    angles = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
    if angles.dim() == 2:
        angles = angles.mean(dim=-1)
    raw_score = 1.0 - (2.0 * angles / math.pi)
    return raw_score.clamp(min=0.0, max=1.0)
def load_pep_samples(
    annotations_path: str | Path,
    image_root: str | Path | None = None,
    max_items: int | None = None,
) -> tuple[list[PEPSample], tuple[str, ...]]:
    """Load PEP samples from a ``.jsonl``, ``.json``, ``.csv`` or ``.tsv`` file.

    Returns the samples (truncated to ``max_items`` when given) and the
    file-level negative caption pool. Only a ``.json`` file whose top level
    is a mapping can provide a non-empty pool.

    Raises:
        ValueError: if the file suffix is not one of the supported formats.
    """
    path = Path(annotations_path)
    suffix = path.suffix.lower()

    if suffix == ".jsonl":
        lines = path.read_text(encoding="utf-8").splitlines()
        samples = [_sample_from_mapping(json.loads(line), image_root) for line in lines if line.strip()]
        return _limit_samples(samples, max_items), ()

    if suffix == ".json":
        payload = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(payload, dict):
            samples_payload = payload.get("samples", payload.get("data", []))
            global_negatives = _caption_tuple(
                payload.get("negative_captions") or payload.get("global_negative_captions") or payload.get("negative_pool")
            )
        else:
            # A bare top-level list is the sample list itself.
            samples_payload = payload
            global_negatives = ()
        samples = [_sample_from_mapping(entry, image_root) for entry in samples_payload]
        return _limit_samples(samples, max_items), global_negatives

    if suffix in {".csv", ".tsv"}:
        return _load_csv_samples(path, image_root, max_items)

    raise ValueError(f"Unsupported PEP annotations format {path.suffix!r}; expected .json, .jsonl, .csv, or .tsv")
def _load_csv_samples(
    path: Path,
    image_root: str | Path | None,
    max_items: int | None,
) -> tuple[list[PEPSample], tuple[str, ...]]:
    """Load PEP samples from a CSV (comma) or TSV (tab) file.

    Two layouts are supported:

    * pair-labelled rows with ``caption`` and ``label`` columns, grouped per
      image into positive and negative caption lists;
    * one row per sample, handed to ``_sample_from_mapping`` unchanged.

    Returns the samples (truncated to ``max_items`` when given) and an empty
    file-level negative pool.

    Raises:
        ValueError: via ``_sample_from_mapping``, e.g. when an image in a
            pair-labelled file has no positive caption at all.
    """
    delimiter = "\t" if path.suffix.lower() == ".tsv" else ","
    with path.open("r", encoding="utf-8", newline="") as handle:
        rows = list(csv.DictReader(handle, delimiter=delimiter))
    if not rows:
        return [], ()

    has_pair_labels = "caption" in rows[0] and "label" in rows[0]
    if not has_pair_labels:
        samples = [_sample_from_mapping(row, image_root) for row in rows]
        return _limit_samples(samples, max_items), ()

    grouped: dict[str, dict[str, Any]] = {}
    for row in rows:
        key = _image_key(row)
        item = grouped.get(key)
        if item is None:
            # Keep the image-level columns of the first row but drop the
            # pair-level ones: a leftover "caption" would otherwise be picked
            # up as a positive caption for images with only negative rows.
            item = {name: value for name, value in row.items() if name not in ("caption", "label")}
            item["positive_captions"] = []
            item["negative_captions"] = []
            grouped[key] = item
        bucket = "positive_captions" if _truthy_label(row["label"]) else "negative_captions"
        item[bucket].append(row["caption"])

    samples = [_sample_from_mapping(item, image_root) for item in grouped.values()]
    return _limit_samples(samples, max_items), ()
def _sample_from_mapping(item: dict[str, Any], image_root: str | Path | None) -> PEPSample:
    """Convert one annotation mapping into a ``PEPSample``.

    The image id falls back from ``image_id``/``id``/``uid`` to the resolved
    image path and then to the URL.

    Raises:
        ValueError: if the mapping has no positive captions, or neither an
            image path nor an image URL.
    """
    positive_captions = _extract_positive_captions(item)
    if not positive_captions:
        raise ValueError(f"PEP sample {item.get('id', item.get('image_id', '<unknown>'))!r} has no positive captions")

    resolved_path = _extract_image_path(item, image_root)
    remote_url = _first_present(item, ("image_url", "url"))
    if resolved_path is None and not remote_url:
        raise ValueError(f"PEP sample {item.get('id', item.get('image_id', '<unknown>'))!r} has no image path or URL")

    explicit_id = _first_present(item, ("image_id", "id", "uid"))
    negative_captions = _caption_tuple(_first_present(item, ("negative_captions", "negatives", "negative_pool")))
    return PEPSample(
        image_id=str(explicit_id or resolved_path or remote_url),
        image_path=resolved_path,
        image_url=str(remote_url) if remote_url else None,
        positive_captions=positive_captions,
        negative_captions=negative_captions,
    )
def _extract_positive_captions(item: dict[str, Any]) -> tuple[str, ...]:
    """Return the positive captions of an annotation mapping.

    The first non-empty value among ``positive_captions``,
    ``hierarchical_captions``, ``caption_hierarchy`` and ``captions`` is used.
    If that yields nothing, a single ``caption`` value is used instead.
    """
    multi_caption_keys = ("positive_captions", "hierarchical_captions", "caption_hierarchy", "captions")
    captions = _caption_tuple(_first_present(item, multi_caption_keys))
    if captions:
        return captions
    single = item.get("caption")
    if not single:
        return ()
    return (str(single).strip(),)
-def _caption_tuple(raw: Any) -> tuple[str, ...]:
-    if raw is None:
-        return ()
-    if isinstance(raw, str):
-        stripped = raw.strip()
-        if not stripped:
-            return ()
-        if stripped.startswith("["):
-            try:
-                parsed = json.loads(stripped)
-                return _caption_tuple(parsed)
-            except json.JSONDecodeError:
-                pass
-        separator = "=>" if "=>" in stripped else "||" if "||" in stripped else None
-        values = stripped.split(separator) if separator else [stripped]
-        return tuple(value.strip() for value in values if value.strip())
-    if isinstance(raw, (list, tuple)):
-        return tuple(str(value).strip() for value in raw if str(value).strip())
-    return (str(raw).strip(),)
def _extract_image_path(item: dict[str, Any], image_root: str | Path | None) -> Path | None:
    """Resolve the local image path of an annotation mapping, if any.

    An explicit ``image_path``/``path``/``file_name``/``filename`` value wins
    and is joined onto ``image_root`` when it is relative. Otherwise, when
    both an image URL and ``image_root`` are available, the path is derived
    from the URL. Returns ``None`` when no local path can be determined.
    """
    raw = _first_present(item, ("image_path", "path", "file_name", "filename"))
    if raw is not None:
        candidate = Path(str(raw))
        if image_root is not None and not candidate.is_absolute():
            candidate = Path(image_root) / candidate
        return candidate

    image_url = _first_present(item, ("image_url", "url"))
    if image_url is None or image_root is None:
        return None
    return _url_to_local_image_path(str(image_url), Path(image_root))
-def _url_to_local_image_path(url: str, image_root: Path) -> Path:
-    url_path = Path(urlparse(url).path)
-    filename = url_path.name
-    if not filename:
-        raise ValueError(f"Cannot infer image filename from URL {url!r}")
-    candidates = [image_root / filename]
-    if url_path.parent.name:
-        candidates.append(image_root / url_path.parent.name / filename)
-    for candidate in candidates:
-        if candidate.exists():
-            return candidate
-    return candidates[0]
-def _first_present(item: dict[str, Any], keys: tuple[str, ...]) -> Any:
-    for key in keys:
-        value = item.get(key)
-        if value not in (None, ""):
-            return value
-    return None
def _image_key(row: dict[str, Any]) -> str:
    """Return the string used to group CSV rows belonging to the same image.

    The first non-empty identity column is used, tried in the order listed
    below. If none is present the result is the string ``"None"``.
    """
    identity_columns = (
        "image_id",
        "id",
        "image_path",
        "path",
        "file_name",
        "filename",
        "image_url",
        "url",
    )
    return str(_first_present(row, identity_columns))
-def _truthy_label(raw: Any) -> bool:
-    return str(raw).strip().lower() in {"1", "true", "yes", "positive", "pos"}
-def _limit_samples(samples: list[PEPSample], max_items: int | None) -> list[PEPSample]:
-    return samples[:max_items] if max_items is not None else samples
def _load_sample_image(sample: PEPSample, image_cache_dir: Path | None, allow_image_download: bool) -> Image.Image:
    """Load the RGB image of ``sample`` from disk, the cache, or its URL.

    A local ``image_path`` always wins. Otherwise the image is downloaded
    from ``image_url``, which requires ``allow_image_download``. Without a
    cache directory the bytes are decoded straight from memory. With one,
    the download is stored under a name derived from the URL and reused on
    later calls.

    Raises:
        ValueError: if the sample has neither a path nor a URL, or if it
            needs a download while ``allow_image_download`` is false.
    """
    if sample.image_path is not None:
        with Image.open(sample.image_path) as image:
            return image.convert("RGB")
    if not sample.image_url:
        raise ValueError(f"PEP sample {sample.image_id!r} has no image path or URL")
    if not allow_image_download:
        raise ValueError("PEP sample uses image_url; set allow_image_download=true and image_cache_dir to evaluate it")

    if image_cache_dir is None:
        with urllib.request.urlopen(sample.image_url, timeout=30) as response:
            return Image.open(BytesIO(response.read())).convert("RGB")

    image_cache_dir.mkdir(parents=True, exist_ok=True)
    cache_path = image_cache_dir / _url_cache_name(sample.image_url)
    if not cache_path.exists():
        # Same 30 s timeout as the uncached branch, so a stalled server
        # cannot hang the evaluation.
        with urllib.request.urlopen(sample.image_url, timeout=30) as response:
            payload = response.read()
        # Write to a sibling file and rename it into place, so an interrupted
        # download never leaves a truncated file at the cache path.
        partial_path = cache_path.with_name(cache_path.name + ".part")
        partial_path.write_bytes(payload)
        partial_path.replace(cache_path)
    with Image.open(cache_path) as image:
        return image.convert("RGB")
-def _url_cache_name(url: str) -> str:
-    suffix = Path(url.split("?", maxsplit=1)[0]).suffix or ".jpg"
-    digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
-    return f"{digest}{suffix}"
def _encode_images(
    model: Hyper3CLIP,
    dataset: PEPEntailmentDataset,
    device: torch.device,
    batch_size: int,
) -> torch.Tensor:
    """Encode every dataset image and return the features concatenated on CPU.

    Images are fetched one by one, stacked into batches of ``batch_size``
    (the final batch may be smaller) and passed to ``model.encode_image``.
    """
    encoded: list[torch.Tensor] = []
    pending: list[torch.Tensor] = []
    last_index = len(dataset) - 1
    for index in range(len(dataset)):
        pending.append(dataset[index]["image"])
        if len(pending) != batch_size and index != last_index:
            continue
        # Batch is full, or this is the final (possibly partial) batch.
        stacked = torch.stack(pending).to(device)
        encoded.append(model.encode_image(stacked).cpu())
        pending = []
    return torch.cat(encoded)
def _encode_texts(
    model: Hyper3CLIP,
    captions: list[str],
    device: torch.device,
    max_text_length: int,
    batch_size: int,
) -> torch.Tensor:
    """Tokenize and encode ``captions`` in batches; return the features on CPU."""
    chunks = [captions[start : start + batch_size] for start in range(0, len(captions), batch_size)]
    encoded: list[torch.Tensor] = []
    for chunk in chunks:
        tokens = model.tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=max_text_length,
            return_tensors="pt",
        ).to(device)
        # Some tokenizers may not return a mask; attend to every token then.
        if "attention_mask" in tokens:
            mask = tokens.attention_mask
        else:
            mask = torch.ones_like(tokens.input_ids)
        encoded.append(model.encode_text(tokens.input_ids, mask).cpu())
    return torch.cat(encoded)
-def _build_pep_pairs(
-    samples: list[PEPSample],
-    global_negative_captions: tuple[str, ...],
-    negative_pool_strategy: str,
-    max_negatives_per_image: int | None,
-) -> tuple[list[int], list[str], list[int]]:
-    fine_caption_pool = tuple(sample.positive_captions[-1] for sample in samples)
-    pair_image_indices: list[int] = []
-    pair_captions: list[str] = []
-    labels: list[int] = []
-    for image_index, sample in enumerate(samples):
-        positives = set(sample.positive_captions)
-        for caption in sample.positive_captions:
-            pair_image_indices.append(image_index)
-            pair_captions.append(caption)
-            labels.append(1)
-        negatives = sample.negative_captions or global_negative_captions
-        if not negatives and negative_pool_strategy == "all_fine_captions":
-            negatives = tuple(caption for idx, caption in enumerate(fine_caption_pool) if idx != image_index)
-        negatives = tuple(caption for caption in negatives if caption not in positives)
-        if max_negatives_per_image is not None:
-            negatives = negatives[:max_negatives_per_image]
-        for caption in negatives:
-            pair_image_indices.append(image_index)
-            pair_captions.append(caption)
-            labels.append(0)
-    return pair_image_indices, pair_captions, labels
def _score_pep_pairs(
    model: Hyper3CLIP,
    image_feats: torch.Tensor,
    text_feats: torch.Tensor,
    pair_image_indices: list[int],
    pair_text_indices: list[int],
    device: torch.device,
    pair_batch_size: int,
) -> list[float]:
    """Score every (image row, text row) pair and return plain floats.

    Pairs are processed in windows of ``pair_batch_size``: the selected
    feature rows are moved to ``device`` and scored with
    ``probabilistic_entailment_score``, image features first and text
    features second.
    """
    kappa = model._kappa().detach().to(device)
    total_pairs = len(pair_image_indices)
    chunk_scores: list[torch.Tensor] = []
    for start in range(0, total_pairs, pair_batch_size):
        window = slice(start, start + pair_batch_size)
        image_rows = torch.tensor(pair_image_indices[window], dtype=torch.long)
        text_rows = torch.tensor(pair_text_indices[window], dtype=torch.long)
        selected_images = image_feats.index_select(0, image_rows).to(device)
        selected_texts = text_feats.index_select(0, text_rows).to(device)
        chunk_scores.append(probabilistic_entailment_score(selected_images, selected_texts, kappa).cpu())
    return torch.cat(chunk_scores).tolist()
-def _roc_auc_score(labels: list[int], scores: list[float]) -> float:
-    positives = sum(labels)
-    negatives = len(labels) - positives
-    if positives == 0 or negatives == 0:
-        raise ValueError("ROC AUC requires both positive and negative labels")
-    sorted_pairs = sorted(zip(scores, labels), key=lambda pair: pair[0])
-    rank_sum_pos = 0.0
-    rank = 1
-    index = 0
-    while index < len(sorted_pairs):
-        end = index + 1
-        while end < len(sorted_pairs) and sorted_pairs[end][0] == sorted_pairs[index][0]:
-            end += 1
-        avg_rank = (rank + rank + (end - index) - 1) / 2.0
-        rank_sum_pos += avg_rank * sum(label for _, label in sorted_pairs[index:end])
-        rank += end - index
-        index = end
-    return (rank_sum_pos - positives * (positives + 1) / 2.0) / (positives * negatives)
-def _average_precision_score(labels: list[int], scores: list[float]) -> float:
-    positives = sum(labels)
-    if positives == 0:
-        raise ValueError("Average precision requires at least one positive label")
-    sorted_pairs = sorted(zip(scores, labels), key=lambda pair: pair[0], reverse=True)
-    true_positives = 0
-    false_positives = 0
-    previous_recall = 0.0
-    ap = 0.0
-    index = 0
-    while index < len(sorted_pairs):
-        end = index + 1
-        while end < len(sorted_pairs) and sorted_pairs[end][0] == sorted_pairs[index][0]:
-            end += 1
-        true_positives += sum(label for _, label in sorted_pairs[index:end])
-        false_positives += (end - index) - sum(label for _, label in sorted_pairs[index:end])
-        recall = true_positives / positives
-        precision = true_positives / (true_positives + false_positives)
-        ap += (recall - previous_recall) * precision
-        previous_recall = recall
-        index = end
-    return ap

hyper3_clip/evaluation/retrieval.py DELETED Viewed

@@ -1,215 +0,0 @@
-from __future__ import annotations
-import json
-from collections import defaultdict
-from pathlib import Path
-from typing import Any
-import torch
-from PIL import Image
-from torch.utils.data import Dataset
-from hyper3_clip.data.transforms import build_retrieval_transform
-from hyper3_clip.models.hyper3_clip import Hyper3CLIP
-class CocoCaptionRetrieval(Dataset):
-    def __init__(
-        self,
-        root: str | Path,
-        image_size: int = 224,
-        max_items: int | None = None,
-        image_normalization: str = "imagenet",
-    ) -> None:
-        self.root = Path(root)
-        with (self.root / "annotations" / "captions_val2017.json").open("r", encoding="utf-8") as handle:
-            payload = json.load(handle)
-        images = {item["id"]: item["file_name"] for item in payload["images"]}
-        captions: dict[int, list[str]] = defaultdict(list)
-        for annotation in payload["annotations"]:
-            captions[int(annotation["image_id"])].append(str(annotation["caption"]))
-        self.items = [
-            {"image_id": image_id, "image_path": self.root / "val2017" / images[image_id], "captions": captions[image_id]}
-            for image_id in sorted(captions)
-        ]
-        if max_items is not None:
-            self.items = self.items[:max_items]
-        self.transform = build_retrieval_transform(image_size, normalization=image_normalization)
-    def __len__(self) -> int:
-        return len(self.items)
-    def __getitem__(self, index: int) -> dict[str, Any]:
-        item = self.items[index]
-        with Image.open(item["image_path"]) as image:
-            tensor = self.transform(image.convert("RGB"))
-        return {"image": tensor, "captions": item["captions"], "image_id": item["image_id"]}
-class CocoKarpathyCaptionRetrieval(Dataset):
-    def __init__(
-        self,
-        root: str | Path,
-        split: str = "test",
-        image_size: int = 224,
-        max_items: int | None = None,
-        image_normalization: str = "imagenet",
-    ) -> None:
-        self.root = Path(root)
-        with (self.root / "karpathy" / "dataset_coco.json").open("r", encoding="utf-8") as handle:
-            payload = json.load(handle)
-        images = [item for item in payload["images"] if item["split"] == split]
-        if max_items is not None:
-            images = images[:max_items]
-        self.items = [
-            {
-                "image_id": item["imgid"],
-                "image_path": self.root / item["filepath"] / item["filename"],
-                "captions": [sentence["raw"].strip() for sentence in item["sentences"]],
-            }
-            for item in images
-        ]
-        self.transform = build_retrieval_transform(image_size, normalization=image_normalization)
-    def __len__(self) -> int:
-        return len(self.items)
-    def __getitem__(self, index: int) -> dict[str, Any]:
-        item = self.items[index]
-        with Image.open(item["image_path"]) as image:
-            tensor = self.transform(image.convert("RGB"))
-        return {"image": tensor, "captions": item["captions"], "image_id": item["image_id"]}
-class Flickr30kCaptionRetrieval(Dataset):
-    def __init__(
-        self,
-        root: str | Path,
-        split: str = "test",
-        image_size: int = 224,
-        max_items: int | None = None,
-        image_normalization: str = "imagenet",
-    ) -> None:
-        self.root = Path(root)
-        with (self.root / "dataset_flickr30k.json").open("r", encoding="utf-8") as handle:
-            payload = json.load(handle)
-        self.items = []
-        for index, image_payload in enumerate(payload["images"]):
-            if image_payload.get("split") != split:
-                continue
-            captions = [str(sentence.get("raw") or " ".join(sentence.get("tokens", []))) for sentence in image_payload["sentences"]]
-            self.items.append(
-                {
-                    "image_id": index,
-                    "image_path": self.root / "flickr30k_images" / image_payload["filename"],
-                    "captions": captions,
-                }
-            )
-        if max_items is not None:
-            self.items = self.items[:max_items]
-        self.transform = build_retrieval_transform(image_size, normalization=image_normalization)
-    def __len__(self) -> int:
-        return len(self.items)
-    def __getitem__(self, index: int) -> dict[str, Any]:
-        item = self.items[index]
-        with Image.open(item["image_path"]) as image:
-            tensor = self.transform(image.convert("RGB"))
-        return {"image": tensor, "captions": item["captions"], "image_id": item["image_id"]}
-@torch.inference_mode()
-def evaluate_caption_retrieval(
-    model: Hyper3CLIP,
-    dataset: Dataset,
-    device: torch.device,
-    max_text_length: int = 77,
-    batch_size: int = 128,
-) -> dict[str, float]:
-    model.eval()
-    image_feats: list[torch.Tensor] = []
-    captions: list[str] = []
-    text_feats: list[torch.Tensor] = []
-    text_to_image: list[int] = []
-    image_batch: list[torch.Tensor] = []
-    for item_index in range(len(dataset)):
-        item = dataset[item_index]
-        image_batch.append(item["image"])
-        if len(image_batch) == batch_size or item_index == len(dataset) - 1:
-            images = torch.stack(image_batch).to(device)
-            image_feats.append(model.encode_retrieval_image(images).cpu())
-            image_batch = []
-        captions.extend(item["captions"])
-        text_to_image.extend([item_index] * len(item["captions"]))
-    for start in range(0, len(captions), batch_size):
-        caption_batch = captions[start : start + batch_size]
-        tokenized = model.tokenizer(
-            caption_batch,
-            padding=True,
-            truncation=True,
-            max_length=max_text_length,
-            return_tensors="pt",
-        ).to(device)
-        attention_mask = (
-            tokenized.attention_mask if "attention_mask" in tokenized else torch.ones_like(tokenized.input_ids)
-        )
-        text_feats.append(model.encode_retrieval_text(tokenized.input_ids, attention_mask).cpu())
-    images = torch.cat(image_feats).to(device)
-    texts = torch.cat(text_feats).to(device)
-    scores_i2t = _retrieval_similarity_scores(model, images, texts, chunk_size=max(1, min(batch_size, 64)))
-    scores_t2i = scores_i2t.transpose(0, 1)
-    target_device = scores_i2t.device
-    text_targets = torch.tensor(text_to_image, device=target_device)
-    fractions = {
-        "image_to_text_r1": _recall_at_k(scores_i2t, _image_to_text_targets(text_to_image, len(dataset), target_device), 1),
-        "image_to_text_r5": _recall_at_k(scores_i2t, _image_to_text_targets(text_to_image, len(dataset), target_device), 5),
-        "image_to_text_r10": _recall_at_k(scores_i2t, _image_to_text_targets(text_to_image, len(dataset), target_device), 10),
-        "text_to_image_r1": _single_target_recall_at_k(scores_t2i, text_targets, 1),
-        "text_to_image_r5": _single_target_recall_at_k(scores_t2i, text_targets, 5),
-        "text_to_image_r10": _single_target_recall_at_k(scores_t2i, text_targets, 10),
-    }
-    return {
-        **fractions,
-        "i2t_r1": 100.0 * fractions["image_to_text_r1"],
-        "i2t_r5": 100.0 * fractions["image_to_text_r5"],
-        "i2t_r10": 100.0 * fractions["image_to_text_r10"],
-        "t2i_r1": 100.0 * fractions["text_to_image_r1"],
-        "t2i_r5": 100.0 * fractions["text_to_image_r5"],
-        "t2i_r10": 100.0 * fractions["text_to_image_r10"],
-    }
-def _retrieval_similarity_scores(
-    model: Hyper3CLIP, images: torch.Tensor, texts: torch.Tensor, chunk_size: int
-) -> torch.Tensor:
-    if not getattr(model, "retrieval_requires_chunking", False):
-        return model.retrieval_similarity_scores(images, texts)
-    chunks: list[torch.Tensor] = []
-    for start in range(0, images.shape[0], chunk_size):
-        chunk_scores = model.retrieval_similarity_scores(images[start : start + chunk_size], texts)
-        chunks.append(chunk_scores.cpu())
-    return torch.cat(chunks, dim=0)
-def _image_to_text_targets(text_to_image: list[int], num_images: int, device: torch.device) -> list[torch.Tensor]:
-    targets: list[list[int]] = [[] for _ in range(num_images)]
-    for text_index, image_index in enumerate(text_to_image):
-        targets[image_index].append(text_index)
-    return [torch.tensor(indices, device=device) for indices in targets]
-def _recall_at_k(scores: torch.Tensor, targets: list[torch.Tensor], k: int) -> float:
-    topk = scores.topk(k=min(k, scores.shape[1]), dim=1).indices
-    hits = [bool(torch.isin(targets[row], topk[row]).any().item()) for row in range(scores.shape[0])]
-    return float(sum(hits) / len(hits))
-def _single_target_recall_at_k(scores: torch.Tensor, targets: torch.Tensor, k: int) -> float:
-    topk = scores.topk(k=min(k, scores.shape[1]), dim=1).indices
-    return float((topk == targets[:, None]).any(dim=1).float().mean().item())

hyper3_clip/models/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from hyper3_clip.models.hyper3_clip import Hyper3CLIP
-__all__ = ["Hyper3CLIP"]

hyper3_clip/models/encoders.py DELETED Viewed

@@ -1,173 +0,0 @@
-from __future__ import annotations
-import timm
-import torch
-from torch import nn
-from transformers import (
-    AutoConfig,
-    AutoModel,
-    AutoTokenizer,
-    CLIPTextConfig,
-    CLIPTextModel,
-    CLIPTextModelWithProjection,
-    CLIPVisionConfig,
-    CLIPVisionModel,
-    CLIPVisionModelWithProjection,
-    SiglipTextConfig,
-    SiglipTextModel,
-    SiglipVisionConfig,
-    SiglipVisionModel,
-)
-class VisionEncoder(nn.Module):
-    def __init__(self, backbone_name: str, pretrained: bool = True) -> None:
-        super().__init__()
-        self.kind = "timm"
-        if backbone_name.startswith("hf_clip_projected:"):
-            self.kind = "hf_clip_projected"
-            model_name = backbone_name.removeprefix("hf_clip_projected:")
-            self.backbone = (
-                CLIPVisionModelWithProjection.from_pretrained(model_name)
-                if pretrained
-                else CLIPVisionModelWithProjection(CLIPVisionConfig.from_pretrained(model_name))
-            )
-            self.output_dim = self.backbone.config.projection_dim
-        elif backbone_name.startswith("hf_clip:"):
-            self.kind = "hf_vision"
-            model_name = backbone_name.removeprefix("hf_clip:")
-            self.backbone = (
-                CLIPVisionModel.from_pretrained(model_name)
-                if pretrained
-                else CLIPVisionModel(CLIPVisionConfig.from_pretrained(model_name))
-            )
-            self.output_dim = self.backbone.config.hidden_size
-        elif backbone_name.startswith("hf_siglip:"):
-            self.kind = "hf_vision"
-            model_name = backbone_name.removeprefix("hf_siglip:")
-            self.backbone = (
-                SiglipVisionModel.from_pretrained(model_name)
-                if pretrained
-                else SiglipVisionModel(SiglipVisionConfig.from_pretrained(model_name))
-            )
-            self.output_dim = self.backbone.config.hidden_size
-        else:
-            self.backbone = timm.create_model(
-                backbone_name,
-                pretrained=pretrained,
-                num_classes=0,
-                global_pool="avg",
-            )
-            self.output_dim = self.backbone.num_features
-    def forward(self, image: torch.Tensor) -> torch.Tensor:
-        if self.kind == "hf_clip_projected":
-            return self.backbone(pixel_values=image).image_embeds
-        if self.kind == "hf_vision":
-            out = self.backbone(pixel_values=image)
-            if hasattr(out, "pooler_output") and out.pooler_output is not None:
-                return out.pooler_output
-            return out.last_hidden_state[:, 0]
-        return self.backbone(image)
-    def forward_with_tokens(self, image: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-        if self.kind == "hf_clip_projected":
-            out = self.backbone(pixel_values=image)
-            tokens = getattr(out, "last_hidden_state", None)
-            if tokens is None and hasattr(out, "vision_model_output"):
-                tokens = out.vision_model_output.last_hidden_state
-            if tokens is None:
-                raise RuntimeError("Projected CLIP vision output did not include patch tokens")
-            return out.image_embeds, tokens
-        if self.kind == "hf_vision":
-            out = self.backbone(pixel_values=image)
-            if hasattr(out, "pooler_output") and out.pooler_output is not None:
-                pooled = out.pooler_output
-            else:
-                pooled = out.last_hidden_state[:, 0]
-            return pooled, out.last_hidden_state
-        if not hasattr(self.backbone, "forward_features"):
-            pooled = self.backbone(image)
-            return pooled, pooled[:, None, :]
-        features = self.backbone.forward_features(image)
-        if hasattr(self.backbone, "forward_head"):
-            pooled = self.backbone.forward_head(features, pre_logits=False)
-        else:
-            pooled = self.backbone(image)
-        return pooled, _tokens_from_features(features)
-class TextEncoder(nn.Module):
-    def __init__(self, model_name: str, pretrained: bool = True, pooling: str = "auto") -> None:
-        super().__init__()
-        if pooling not in {"auto", "pooler", "cls", "mean"}:
-            raise ValueError(f"Unsupported text pooling {pooling!r}; expected auto, pooler, cls, or mean")
-        self.kind = "hf_text"
-        self.pooling = pooling
-        tokenizer_name = model_name.removeprefix("hf_clip_projected:")
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
-        model_name_lower = model_name.lower()
-        if model_name.startswith("hf_clip_projected:"):
-            self.kind = "hf_clip_projected"
-            projected_model_name = model_name.removeprefix("hf_clip_projected:")
-            if pretrained:
-                self.backbone = CLIPTextModelWithProjection.from_pretrained(projected_model_name)
-            else:
-                self.backbone = CLIPTextModelWithProjection(CLIPTextConfig.from_pretrained(projected_model_name))
-            self.output_dim = self.backbone.config.projection_dim
-        elif "siglip" in model_name_lower:
-            if pretrained:
-                self.backbone = SiglipTextModel.from_pretrained(model_name)
-            else:
-                self.backbone = SiglipTextModel(SiglipTextConfig.from_pretrained(model_name))
-            self.output_dim = self.backbone.config.hidden_size
-        elif "clip" in model_name_lower:
-            if pretrained:
-                self.backbone = CLIPTextModel.from_pretrained(model_name)
-            else:
-                self.backbone = CLIPTextModel(CLIPTextConfig.from_pretrained(model_name))
-            self.output_dim = self.backbone.config.hidden_size
-        else:
-            if pretrained:
-                self.backbone = AutoModel.from_pretrained(model_name)
-            else:
-                self.backbone = AutoModel.from_config(AutoConfig.from_pretrained(model_name))
-            hidden_size = getattr(self.backbone.config, "hidden_size", None)
-            if hidden_size is None:
-                raise ValueError(f"Unsupported text model config for {model_name}")
-            self.output_dim = hidden_size
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
-        if self.kind == "hf_clip_projected":
-            return out.text_embeds
-        if self.pooling == "mean":
-            mask = attention_mask.to(dtype=out.last_hidden_state.dtype).unsqueeze(-1)
-            summed = (out.last_hidden_state * mask).sum(dim=1)
-            denom = mask.sum(dim=1).clamp_min(1.0)
-            return summed / denom
-        if self.pooling in {"auto", "pooler"} and hasattr(out, "pooler_output") and out.pooler_output is not None:
-            return out.pooler_output
-        return out.last_hidden_state[:, 0]
-def _tokens_from_features(features: torch.Tensor | dict | tuple | list) -> torch.Tensor:
-    if isinstance(features, dict):
-        for key in ("x", "last_hidden_state", "features"):
-            if key in features:
-                features = features[key]
-                break
-        else:
-            features = next(iter(features.values()))
-    if isinstance(features, tuple | list):
-        features = features[0]
-    if not torch.is_tensor(features):
-        raise TypeError(f"Expected tensor features, got {type(features)!r}")
-    if features.ndim == 4:
-        return features.flatten(2).transpose(1, 2)
-    if features.ndim == 3:
-        return features
-    if features.ndim == 2:
-        return features[:, None, :]
-    raise ValueError(f"Unsupported feature tensor shape {tuple(features.shape)}")

hyper3_clip/models/experimental.py DELETED Viewed

@@ -1,587 +0,0 @@
-from __future__ import annotations
-from collections.abc import Callable
-import torch
-import torch.nn.functional as F
-from torch import Tensor, nn
-from hyper3_clip.models.lorentz import exp_map0, metric_pairwise_dist
-from hyper3_clip.models.losses import beta_cal_loss
-from hyper3_clip.models.tren import TRENRegionEncoder
-from hyper3_clip.training.distributed import gather_variable_with_grad, gather_with_grad, get_rank
-ProjectionHeadFactory = Callable[[int, int, int | None], nn.Module]
-class ExperimentalObjectiveMixin:
-    @staticmethod
-    def _validate_experimental_options(
-        *,
-        proclip_geometry: str,
-        proclip_projection_hidden_dim: int | None,
-        proclip_component_dim: int | None,
-        beta_clip_weight: float,
-        beta_clip_global_weight: float,
-        beta_clip_beta: float,
-        beta_clip_variant: str,
-        beta_clip_similarity: str,
-        beta_clip_num_heads: int,
-        beta_clip_mlp_ratio: float,
-        tren_weight: float,
-        tren_visual_distill_weight: float,
-        tren_text_distill_weight: float,
-        tren_region_text_weight: float,
-        tren_num_region_tokens: int,
-        tren_num_decoder_layers: int,
-        tren_num_attention_heads: int,
-        tren_prompt_grid_size: int,
-        tren_dropout: float,
-    ) -> None:
-        if proclip_geometry not in {"product", "hyperbolic", "euclidean", "spherical", "clip"}:
-            raise ValueError("proclip_geometry must be 'product', 'hyperbolic', 'euclidean', 'spherical', or 'clip'")
-        if proclip_projection_hidden_dim is not None and proclip_projection_hidden_dim <= 0:
-            raise ValueError("proclip_projection_hidden_dim must be positive when set")
-        if proclip_component_dim is not None and proclip_component_dim <= 0:
-            raise ValueError("proclip_component_dim must be positive when set")
-        if beta_clip_variant not in {"ce", "bce"}:
-            raise ValueError("beta_clip_variant must be 'ce' or 'bce'")
-        if beta_clip_similarity not in {"metric", "dot"}:
-            raise ValueError("beta_clip_similarity must be 'metric' or 'dot'")
-        if beta_clip_weight < 0.0:
-            raise ValueError("beta_clip_weight must be non-negative")
-        if beta_clip_global_weight < 0.0:
-            raise ValueError("beta_clip_global_weight must be non-negative")
-        if beta_clip_beta < 0.0:
-            raise ValueError("beta_clip_beta must be non-negative")
-        if beta_clip_num_heads <= 0:
-            raise ValueError("beta_clip_num_heads must be positive")
-        if beta_clip_mlp_ratio <= 0.0:
-            raise ValueError("beta_clip_mlp_ratio must be positive")
-        if tren_weight < 0.0:
-            raise ValueError("tren_weight must be non-negative")
-        if tren_visual_distill_weight < 0.0 or tren_text_distill_weight < 0.0 or tren_region_text_weight < 0.0:
-            raise ValueError("T-REN loss weights must be non-negative")
-        if tren_num_region_tokens <= 0:
-            raise ValueError("tren_num_region_tokens must be positive")
-        if tren_num_decoder_layers <= 0:
-            raise ValueError("tren_num_decoder_layers must be positive")
-        if tren_num_attention_heads <= 0:
-            raise ValueError("tren_num_attention_heads must be positive")
-        if tren_prompt_grid_size <= 0:
-            raise ValueError("tren_prompt_grid_size must be positive")
-        if tren_dropout < 0.0:
-            raise ValueError("tren_dropout must be non-negative")
-    def _init_experimental_modules(
-        self,
-        *,
-        beta_clip_num_heads: int,
-        beta_clip_mlp_ratio: float,
-        tren_num_region_tokens: int,
-        tren_num_decoder_layers: int,
-        tren_num_attention_heads: int,
-        tren_prompt_grid_size: int,
-        tren_dropout: float,
-        projection_hidden_dim: int | None,
-        proclip_projection_hidden_dim: int | None,
-        projection_head: ProjectionHeadFactory,
-    ) -> None:
-        if self.beta_query_pooling_enabled:
-            if self.vision_encoder.output_dim % beta_clip_num_heads != 0:
-                raise ValueError("vision encoder output_dim must be divisible by beta_clip_num_heads")
-            beta_clip_hidden_dim = max(1, int(round(self.vision_encoder.output_dim * beta_clip_mlp_ratio)))
-            self.beta_clip_text_query_proj = nn.Linear(self.text_encoder.output_dim, self.vision_encoder.output_dim)
-            self.beta_clip_cross_attention = nn.MultiheadAttention(
-                self.vision_encoder.output_dim,
-                beta_clip_num_heads,
-                batch_first=True,
-            )
-            self.beta_clip_mlp_norm = nn.LayerNorm(self.vision_encoder.output_dim)
-            self.beta_clip_pool_mlp = nn.Sequential(
-                nn.Linear(self.vision_encoder.output_dim, beta_clip_hidden_dim),
-                nn.GELU(),
-                nn.Linear(beta_clip_hidden_dim, self.vision_encoder.output_dim),
-            )
-        if self.beta_clip_enabled:
-            self.beta_clip_logit_scale = nn.Parameter(torch.tensor(1 / 0.07).log())
-        if self.tren_enabled:
-            self.tren_region_encoder = TRENRegionEncoder(
-                vision_dim=self.vision_encoder.output_dim,
-                text_dim=self.text_encoder.output_dim,
-                num_region_tokens=tren_num_region_tokens,
-                num_decoder_layers=tren_num_decoder_layers,
-                num_attention_heads=tren_num_attention_heads,
-                prompt_grid_size=tren_prompt_grid_size,
-                dropout=tren_dropout,
-            )
-            self.tren_logit_scale = nn.Parameter(torch.tensor(1 / 0.07).log())
-        if self.proclip_enabled:
-            component_dim = self._proclip_component_dim
-            spherical_dim = self._proclip_spherical_ambient_dim
-            proclip_hidden_dim = proclip_projection_hidden_dim
-            if proclip_hidden_dim is None:
-                proclip_hidden_dim = projection_hidden_dim
-            if self.proclip_dedicated_hyperbolic:
-                self.proclip_image_hyperbolic_proj = projection_head(
-                    self.vision_encoder.output_dim, self.embed_dim, proclip_hidden_dim
-                )
-                self.proclip_text_hyperbolic_proj = projection_head(
-                    self.text_encoder.output_dim, self.embed_dim, proclip_hidden_dim
-                )
-            self.proclip_image_euclidean_proj = projection_head(
-                self.vision_encoder.output_dim, component_dim, proclip_hidden_dim
-            )
-            self.proclip_text_euclidean_proj = projection_head(
-                self.text_encoder.output_dim, component_dim, proclip_hidden_dim
-            )
-            self.proclip_image_spherical_proj = projection_head(
-                self.vision_encoder.output_dim, spherical_dim, proclip_hidden_dim
-            )
-            self.proclip_text_spherical_proj = projection_head(
-                self.text_encoder.output_dim, spherical_dim, proclip_hidden_dim
-            )
-            self.proclip_logit_scale = nn.Parameter(torch.tensor(1 / 0.07).log())
-            self.proclip_log_weights = nn.Parameter(torch.zeros(3))
-    @property
-    def proclip_enabled(self) -> bool:
-        return (
-            self.objective_name == "proclip"
-            or self.proclip_component_dim is not None
-            or self.proclip_weight > 0.0
-            or self.proclip_retrieval
-        )
-    @property
-    def beta_clip_enabled(self) -> bool:
-        return self.beta_clip_weight > 0.0
-    @property
-    def beta_query_pooling_enabled(self) -> bool:
-        return self.beta_clip_enabled or (
-            self.objective_name == "uncha"
-            and self.uncha_entailment_loss in {"hier_beta_argent", "hier_beta_sourcepart_argent"}
-        )
-    @property
-    def tren_enabled(self) -> bool:
-        return self.tren_weight > 0.0
-    @property
-    def _proclip_component_dim(self) -> int:
-        return int(self.proclip_component_dim or self.embed_dim)
-    @property
-    def _proclip_spherical_ambient_dim(self) -> int:
-        return self._proclip_component_dim + 1
-    def _clamp_experimental_logit_scales(self) -> None:
-        if self.proclip_enabled:
-            self.proclip_logit_scale.clamp_(max=4.6052)
-        if self.beta_clip_enabled:
-            self.beta_clip_logit_scale.clamp_(max=4.6052)
-        if self.tren_enabled:
-            self.tren_logit_scale.clamp_(max=4.6052)
-    def _detached_experimental_logit_scales(self) -> dict[str, torch.Tensor]:
-        logs = {}
-        if self.proclip_enabled:
-            logs.update(self._detached_proclip_logs())
-        if self.beta_clip_enabled:
-            logs["beta_clip_logit_scale"] = self.beta_clip_logit_scale.exp().detach()
-        if self.tren_enabled:
-            logs["tren_logit_scale"] = self.tren_logit_scale.exp().detach()
-        return logs
-    def _beta_clip_global_contrastive_loss(
-        self,
-        *,
-        image_euc: torch.Tensor,
-        text_euc: torch.Tensor,
-        targets: torch.Tensor,
-    ) -> torch.Tensor:
-        image_feats = F.normalize(image_euc.float(), dim=-1)
-        text_feats = F.normalize(text_euc.float(), dim=-1)
-        all_image_feats = gather_with_grad(image_feats)
-        all_text_feats = gather_with_grad(text_feats)
-        if self.objective_name == "hycoclip":
-            scale = self.logit_scale.exp().clamp(max=100.0)
-        elif self.objective_name == "proclip":
-            scale = self.proclip_logit_scale.exp().clamp(max=100.0)
-        else:
-            scale = self.global_logit_scale.exp().clamp(max=100.0)
-        logits_i_t = image_feats @ all_text_feats.T * scale
-        logits_t_i = text_feats @ all_image_feats.T * scale
-        return 0.5 * (F.cross_entropy(logits_i_t, targets) + F.cross_entropy(logits_t_i, targets))
-    def _beta_query_entailment_embeddings(
-        self,
-        *,
-        image_tokens: torch.Tensor,
-        beta_query_input_ids: torch.Tensor | None,
-        beta_query_attention_mask: torch.Tensor | None,
-        beta_query_owner: torch.Tensor | None,
-        beta_query_parent: torch.Tensor | None,
-        beta_query_weight: torch.Tensor | None,
-        beta_query_source_part: torch.Tensor | None,
-        kappa: torch.Tensor,
-        query_base: torch.Tensor | None = None,
-    ) -> dict[str, torch.Tensor]:
-        if beta_query_input_ids is None or beta_query_attention_mask is None or beta_query_owner is None:
-            raise ValueError(f"{self.uncha_entailment_loss} requires beta query tensors from the collator")
-        if beta_query_parent is None or beta_query_weight is None:
-            raise ValueError(f"{self.uncha_entailment_loss} requires beta query hierarchy metadata from the collator")
-        if self.uncha_entailment_loss == "hier_beta_sourcepart_argent" and beta_query_source_part is None:
-            raise ValueError("hier_beta_sourcepart_argent requires beta_query_source_part from the collator")
-        if beta_query_input_ids.shape[0] == 0:
-            source_part = (
-                beta_query_source_part.to(device=image_tokens.device, dtype=torch.long)
-                if beta_query_source_part is not None
-                else beta_query_owner.new_zeros((0,), device=image_tokens.device, dtype=torch.long)
-            )
-            return {
-                "beta_query_image_feats": image_tokens.new_zeros((0, self.embed_dim)),
-                "beta_query_text_feats": image_tokens.new_zeros((0, self.embed_dim)),
-                "beta_query_owner": beta_query_owner.to(device=image_tokens.device, dtype=torch.long),
-                "beta_query_parent": beta_query_parent.to(device=image_tokens.device, dtype=torch.long),
-                "beta_query_weight": beta_query_weight.to(device=image_tokens.device, dtype=torch.float32),
-                "beta_query_source_part": source_part,
-            }
-        query_owner = beta_query_owner.to(device=image_tokens.device, dtype=torch.long)
-        if query_base is None:
-            query_base = self.encode_text_base(beta_query_input_ids, beta_query_attention_mask)
-        conditioned_image_base = self._beta_clip_text_conditioned_pool(image_tokens, query_base, query_owner)
-        query_image_euc = self.image_proj(conditioned_image_base)
-        query_text_euc = self.text_proj(query_base)
-        return {
-            "beta_query_image_feats": self.project_image_features(query_image_euc),
-            "beta_query_text_feats": self.project_text_features(query_text_euc),
-            "beta_query_owner": query_owner,
-            "beta_query_parent": beta_query_parent.to(device=image_tokens.device, dtype=torch.long),
-            "beta_query_weight": beta_query_weight.to(device=image_tokens.device, dtype=torch.float32),
-            **(
-                {"beta_query_source_part": beta_query_source_part.to(device=image_tokens.device, dtype=torch.long)}
-                if beta_query_source_part is not None
-                else {}
-            ),
-        }
-    def _beta_clip_auxiliary_loss(
-        self,
-        *,
-        image_tokens: torch.Tensor,
-        beta_query_input_ids: torch.Tensor | None,
-        beta_query_attention_mask: torch.Tensor | None,
-        beta_query_owner: torch.Tensor | None,
-        global_targets: torch.Tensor,
-        kappa: torch.Tensor,
-    ) -> torch.Tensor:
-        if beta_query_input_ids is None or beta_query_attention_mask is None or beta_query_owner is None:
-            raise ValueError("beta-CLIP auxiliary requires beta query tensors from the collator")
-        if beta_query_input_ids.shape[0] == 0:
-            return image_tokens.new_zeros(())
-        beta_query_owner = beta_query_owner.to(device=image_tokens.device, dtype=torch.long)
-        query_base = self.encode_text_base(beta_query_input_ids, beta_query_attention_mask)
-        conditioned_image_base = self._beta_clip_text_conditioned_pool(image_tokens, query_base, beta_query_owner)
-        query_image_euc = self.image_proj(conditioned_image_base)
-        query_text_euc = self.text_proj(query_base)
-        if self.beta_clip_similarity == "dot":
-            query_image_feats = F.normalize(query_image_euc.float(), dim=-1)
-            query_text_feats = F.normalize(query_text_euc.float(), dim=-1)
-        else:
-            query_image_feats = self.project_image_features(query_image_euc)
-            query_text_feats = self.project_text_features(query_text_euc)
-        all_query_image_feats, query_counts = gather_variable_with_grad(query_image_feats)
-        all_query_text_feats, _ = gather_variable_with_grad(query_text_feats)
-        query_offset = query_counts[: get_rank()].sum() if query_counts.numel() > 1 else query_counts.new_zeros(())
-        query_targets = torch.arange(query_image_feats.size(0), device=query_image_feats.device) + query_offset
-        query_group_ids = global_targets.index_select(0, beta_query_owner)
-        all_query_group_ids, _ = gather_variable_with_grad(query_group_ids)
-        scale = self.beta_clip_logit_scale.exp().clamp(max=100.0)
-        if self.beta_clip_similarity == "dot":
-            logits_i_t = query_image_feats @ all_query_text_feats.T * scale
-            logits_t_i = query_text_feats @ all_query_image_feats.T * scale
-        else:
-            logits_i_t = -metric_pairwise_dist(
-                query_image_feats,
-                all_query_text_feats,
-                kappa,
-                product_metric=self.phyclip_product_metric,
-            ) * scale
-            logits_t_i = -metric_pairwise_dist(
-                query_text_feats,
-                all_query_image_feats,
-                kappa,
-                product_metric=self.phyclip_product_metric,
-            ) * scale
-        return 0.5 * (
-            beta_cal_loss(
-                logits_i_t,
-                targets=query_targets,
-                group_ids=query_group_ids,
-                all_group_ids=all_query_group_ids,
-                beta=self.beta_clip_beta,
-                variant=self.beta_clip_variant,
-            )
-            + beta_cal_loss(
-                logits_t_i,
-                targets=query_targets,
-                group_ids=query_group_ids,
-                all_group_ids=all_query_group_ids,
-                beta=self.beta_clip_beta,
-                variant=self.beta_clip_variant,
-            )
-        )
-    def _beta_clip_text_conditioned_pool(
-        self,
-        image_tokens: torch.Tensor,
-        query_base: torch.Tensor,
-        query_owner: torch.Tensor,
-    ) -> torch.Tensor:
-        if image_tokens.ndim != 3:
-            raise ValueError("beta-CLIP image tokens must have shape [batch, tokens, dim]")
-        if getattr(self, "group_beta_query_pooling", False):
-            return self._beta_clip_text_conditioned_pool_grouped(image_tokens, query_base, query_owner)
-        if self.beta_clip_drop_cls_token and image_tokens.size(1) > 1:
-            image_tokens = image_tokens[:, 1:, :]
-        selected_tokens = image_tokens.index_select(0, query_owner).to(dtype=query_base.dtype)
-        query = self.beta_clip_text_query_proj(query_base).unsqueeze(1)
-        attended, _ = self.beta_clip_cross_attention(query, selected_tokens, selected_tokens, need_weights=False)
-        pooled = attended.squeeze(1)
-        return pooled + self.beta_clip_pool_mlp(self.beta_clip_mlp_norm(pooled))
-    def _beta_clip_text_conditioned_pool_grouped(
-        self,
-        image_tokens: torch.Tensor,
-        query_base: torch.Tensor,
-        query_owner: torch.Tensor,
-    ) -> torch.Tensor:
-        if query_owner.numel() == 0:
-            return query_base.new_zeros((0, self.vision_encoder.output_dim))
-        if query_owner.min().item() < 0 or query_owner.max().item() >= image_tokens.size(0):
-            raise IndexError("beta_query_owner contains an out-of-range image index")
-        tokens = image_tokens[:, 1:, :] if self.beta_clip_drop_cls_token and image_tokens.size(1) > 1 else image_tokens
-        tokens = tokens.to(dtype=query_base.dtype)
-        query_projected = self.beta_clip_text_query_proj(query_base)
-        counts = torch.bincount(query_owner, minlength=image_tokens.size(0))
-        max_queries = int(counts.max().item())
-        order = torch.argsort(query_owner)
-        sorted_owner = query_owner.index_select(0, order)
-        owner_offsets = torch.zeros_like(counts)
-        owner_offsets[1:] = counts.cumsum(0)[:-1]
-        sorted_positions = torch.arange(query_owner.numel(), device=query_owner.device) - owner_offsets.index_select(
-            0, sorted_owner
-        )
-        positions = torch.empty_like(sorted_positions)
-        positions[order] = sorted_positions
-        packed_query = query_projected.new_zeros((image_tokens.size(0), max_queries, query_projected.size(-1)))
-        packed_query[query_owner, positions] = query_projected
-        attended, _ = self.beta_clip_cross_attention(packed_query, tokens, tokens, need_weights=False)
-        pooled = attended[query_owner, positions]
-        return pooled + self.beta_clip_pool_mlp(self.beta_clip_mlp_norm(pooled))
-    def _tren_auxiliary_losses(
-        self,
-        *,
-        image_tokens: torch.Tensor,
-        part_owner: torch.Tensor,
-        part_image_base: torch.Tensor,
-        part_text_base: torch.Tensor,
-    ) -> dict[str, torch.Tensor]:
-        zero = image_tokens.new_zeros(())
-        if part_owner.numel() == 0:
-            return {
-                "tren_loss": zero,
-                "tren_visual_distill_loss": zero,
-                "tren_text_distill_loss": zero,
-                "tren_region_text_contrastive_loss": zero,
-                "tren_assignment_count": part_owner.new_tensor(0),
-            }
-        tren_outputs = self.tren_region_encoder(image_tokens)
-        visual_tokens = tren_outputs["visual_tokens"].flatten(1, 2)
-        text_tokens = tren_outputs["text_aligned_tokens"].flatten(1, 2)
-        matched_visual: list[torch.Tensor] = []
-        matched_text: list[torch.Tensor] = []
-        target_visual: list[torch.Tensor] = []
-        target_text: list[torch.Tensor] = []
-        for owner in range(image_tokens.size(0)):
-            region_mask = part_owner == owner
-            if not bool(region_mask.any()):
-                continue
-            owner_target_visual = part_image_base[region_mask].detach()
-            owner_target_text = part_text_base[region_mask].detach()
-            owner_visual_tokens = visual_tokens[owner]
-            owner_text_tokens = text_tokens[owner]
-            pred_indices, target_indices = _greedy_region_assignment(owner_visual_tokens, owner_target_visual)
-            if pred_indices.numel() == 0:
-                continue
-            matched_visual.append(owner_visual_tokens.index_select(0, pred_indices))
-            matched_text.append(owner_text_tokens.index_select(0, pred_indices))
-            target_visual.append(owner_target_visual.index_select(0, target_indices))
-            target_text.append(owner_target_text.index_select(0, target_indices))
-        if not matched_visual:
-            return {
-                "tren_loss": zero,
-                "tren_visual_distill_loss": zero,
-                "tren_text_distill_loss": zero,
-                "tren_region_text_contrastive_loss": zero,
-                "tren_assignment_count": part_owner.new_tensor(0),
-            }
-        matched_visual_tensor = torch.cat(matched_visual, dim=0)
-        matched_text_tensor = torch.cat(matched_text, dim=0)
-        target_visual_tensor = torch.cat(target_visual, dim=0)
-        target_text_tensor = torch.cat(target_text, dim=0)
-        visual_distill = 1.0 - F.cosine_similarity(matched_visual_tensor, target_visual_tensor, dim=-1).mean()
-        text_distill = 1.0 - F.cosine_similarity(matched_text_tensor, target_text_tensor, dim=-1).mean()
-        region_text = _symmetric_dot_contrastive(
-            matched_text_tensor,
-            target_text_tensor,
-            scale=self.tren_logit_scale.exp().clamp(max=100.0),
-        )
-        total = (
-            self.tren_visual_distill_weight * visual_distill
-            + self.tren_text_distill_weight * text_distill
-            + self.tren_region_text_weight * region_text
-        )
-        return {
-            "tren_loss": total,
-            "tren_visual_distill_loss": visual_distill,
-            "tren_text_distill_loss": text_distill,
-            "tren_region_text_contrastive_loss": region_text,
-            "tren_assignment_count": part_owner.new_tensor(matched_visual_tensor.size(0)),
-        }
-    def _project_proclip_image_base(self, base_feats: torch.Tensor, hyperbolic: torch.Tensor) -> torch.Tensor:
-        if self.proclip_geometry == "clip":
-            return F.normalize(base_feats.float(), dim=-1)
-        if self.proclip_dedicated_hyperbolic:
-            hyperbolic = exp_map0(self.proclip_image_hyperbolic_proj(base_feats.float()), self._kappa().float())
-        return self._pack_proclip_features(
-            hyperbolic=hyperbolic,
-            euclidean=self.proclip_image_euclidean_proj(base_feats.float()),
-            spherical=self.proclip_image_spherical_proj(base_feats.float()),
-        )
-    def _project_proclip_text_base(self, base_feats: torch.Tensor, hyperbolic: torch.Tensor) -> torch.Tensor:
-        if self.proclip_geometry == "clip":
-            return F.normalize(base_feats.float(), dim=-1)
-        if self.proclip_dedicated_hyperbolic:
-            hyperbolic = exp_map0(self.proclip_text_hyperbolic_proj(base_feats.float()), self._kappa().float())
-        return self._pack_proclip_features(
-            hyperbolic=hyperbolic,
-            euclidean=self.proclip_text_euclidean_proj(base_feats.float()),
-            spherical=self.proclip_text_spherical_proj(base_feats.float()),
-        )
-    def _pack_proclip_features(self, hyperbolic: torch.Tensor, euclidean: torch.Tensor, spherical: torch.Tensor) -> torch.Tensor:
-        spherical = F.normalize(spherical.float(), dim=-1)
-        if self.proclip_geometry == "hyperbolic":
-            return hyperbolic.float()
-        if self.proclip_geometry == "euclidean":
-            return euclidean.float()
-        if self.proclip_geometry == "spherical":
-            return spherical
-        return torch.cat([hyperbolic.float(), euclidean.float(), spherical], dim=-1)
-    def _split_proclip_features(self, feats: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        hyperbolic_dim = self.embed_dim + 1
-        component_dim = self._proclip_component_dim
-        spherical_dim = self._proclip_spherical_ambient_dim
-        hyperbolic = feats[:, :hyperbolic_dim]
-        euclidean = feats[:, hyperbolic_dim : hyperbolic_dim + component_dim]
-        spherical = feats[:, hyperbolic_dim + component_dim : hyperbolic_dim + component_dim + spherical_dim]
-        return hyperbolic, euclidean, spherical
-    def _proclip_similarity_scores(self, image_feats: torch.Tensor, text_feats: torch.Tensor) -> torch.Tensor:
-        if self.proclip_geometry == "clip":
-            return image_feats.float() @ text_feats.float().T
-        if self.proclip_geometry == "hyperbolic":
-            return -metric_pairwise_dist(image_feats, text_feats, self._kappa()).square()
-        if self.proclip_geometry == "euclidean":
-            return -torch.cdist(image_feats.float(), text_feats.float(), p=2).square()
-        if self.proclip_geometry == "spherical":
-            dot = (image_feats.float() @ text_feats.float().T).clamp(min=-1.0 + 1e-6, max=1.0 - 1e-6)
-            return -torch.acos(dot).square()
-        image_hyp, image_euc, image_sph = self._split_proclip_features(image_feats)
-        text_hyp, text_euc, text_sph = self._split_proclip_features(text_feats)
-        weights = self.proclip_log_weights.exp().to(device=image_feats.device, dtype=torch.float32)
-        hyperbolic_dist2 = metric_pairwise_dist(image_hyp, text_hyp, self._kappa()).square()
-        euclidean_dist2 = torch.cdist(image_euc.float(), text_euc.float(), p=2).square()
-        spherical_dot = (image_sph.float() @ text_sph.float().T).clamp(min=-1.0 + 1e-6, max=1.0 - 1e-6)
-        spherical_dist2 = torch.acos(spherical_dot).square()
-        return -(weights[0] * hyperbolic_dist2 + weights[1] * euclidean_dist2 + weights[2] * spherical_dist2)
-    def _proclip_contrastive_loss(
-        self,
-        image_feats: torch.Tensor,
-        text_feats: torch.Tensor,
-        all_image_feats: torch.Tensor,
-        all_text_feats: torch.Tensor,
-        targets: torch.Tensor,
-    ) -> torch.Tensor:
-        scale = self.proclip_logit_scale.exp().clamp(max=100.0)
-        logits_i_t = self._proclip_similarity_scores(image_feats, all_text_feats) * scale
-        logits_t_i = self._proclip_similarity_scores(text_feats, all_image_feats) * scale
-        return 0.5 * (F.cross_entropy(logits_i_t, targets) + F.cross_entropy(logits_t_i, targets))
-    def _detached_proclip_logs(self) -> dict[str, torch.Tensor]:
-        weights = self.proclip_log_weights.exp().detach()
-        return {
-            "proclip_logit_scale": self.proclip_logit_scale.exp().detach(),
-            "proclip_hyperbolic_weight": weights[0],
-            "proclip_euclidean_weight": weights[1],
-            "proclip_spherical_weight": weights[2],
-        }
-def _greedy_region_assignment(pred_tokens: torch.Tensor, target_tokens: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-    if pred_tokens.numel() == 0 or target_tokens.numel() == 0:
-        empty = torch.zeros((0,), dtype=torch.long, device=pred_tokens.device)
-        return empty, empty
-    similarities = F.normalize(pred_tokens.float(), dim=-1) @ F.normalize(target_tokens.float(), dim=-1).T
-    pair_scores = similarities.flatten()
-    order = torch.argsort(pair_scores, descending=True)
-    used_pred = torch.zeros(pred_tokens.size(0), dtype=torch.bool, device=pred_tokens.device)
-    used_target = torch.zeros(target_tokens.size(0), dtype=torch.bool, device=pred_tokens.device)
-    pred_indices: list[torch.Tensor] = []
-    target_indices: list[torch.Tensor] = []
-    for flat_index in order:
-        pred_index = torch.div(flat_index, target_tokens.size(0), rounding_mode="floor")
-        target_index = flat_index % target_tokens.size(0)
-        if used_pred[pred_index] or used_target[target_index]:
-            continue
-        used_pred[pred_index] = True
-        used_target[target_index] = True
-        pred_indices.append(pred_index)
-        target_indices.append(target_index)
-        if len(target_indices) == target_tokens.size(0):
-            break
-    if not pred_indices:
-        empty = torch.zeros((0,), dtype=torch.long, device=pred_tokens.device)
-        return empty, empty
-    return torch.stack(pred_indices), torch.stack(target_indices)
-def _symmetric_dot_contrastive(region_tokens: torch.Tensor, text_tokens: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
-    if region_tokens.size(0) == 1:
-        return region_tokens.new_zeros(())
-    region_tokens = F.normalize(region_tokens.float(), dim=-1)
-    text_tokens = F.normalize(text_tokens.float(), dim=-1)
-    logits = region_tokens @ text_tokens.T * scale
-    targets = torch.arange(logits.size(0), device=logits.device)
-    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.T, targets))

hyper3_clip/models/himo.py DELETED Viewed

@@ -1,55 +0,0 @@
-from __future__ import annotations
-import torch
-from torch import Tensor
-def hide_reconstruct_embeddings(
-    embeddings: Tensor,
-    *,
-    variance_threshold: float = 0.9,
-    detach_pca: bool = True,
-    eps: float = 1e-8,
-) -> Tensor:
-    """HiMo-CLIP HiDe: PCA-reconstruct embeddings using top principal components.
-    Given a batch of embeddings ``U ∈ R^{B×D}``, compute mean-centered embeddings,
-    perform SVD/PCA, choose the smallest number of components whose cumulative
-    explained variance exceeds ``variance_threshold``, and reconstruct each
-    embedding from this principal subspace:
-        u'_i = P^T (P (u_i - ū)) + ū
-    where P stacks the selected principal components as rows.
-    """
-    if embeddings.ndim != 2:
-        raise ValueError("hide_reconstruct_embeddings expects a [batch, dim] tensor")
-    if not (0.0 < variance_threshold <= 1.0):
-        raise ValueError("variance_threshold must be in (0, 1]")
-    if embeddings.size(0) < 2:
-        return embeddings
-    u = embeddings.to(dtype=torch.float32)
-    mean = u.mean(dim=0, keepdim=True)
-    centered = u - mean
-    if detach_pca:
-        centered_for_pca = centered.detach()
-    else:
-        centered_for_pca = centered
-    # SVD: centered = U S Vh, principal components are rows of Vh.
-    _, s, vh = torch.linalg.svd(centered_for_pca, full_matrices=False)
-    if s.numel() == 0 or float((s.square().sum()).item()) <= eps:
-        return embeddings
-    explained = s.square()
-    cumulative = explained.cumsum(dim=0) / explained.sum().clamp_min(eps)
-    m = int((cumulative >= variance_threshold).to(dtype=torch.int64).argmax().item()) + 1
-    m = max(1, min(m, vh.size(0)))
-    p = vh[:m]
-    if detach_pca:
-        p = p.detach()
-    recon = (centered @ p.T) @ p + mean
-    return recon.to(dtype=embeddings.dtype)

hyper3_clip/models/hyper3_clip.py DELETED Viewed

@@ -1,958 +0,0 @@
-from __future__ import annotations
-import torch
-import torch.nn.functional as F
-from torch import nn
-from hyper3_clip.models.encoders import TextEncoder, VisionEncoder
-from hyper3_clip.models.experimental import ExperimentalObjectiveMixin
-from hyper3_clip.models.himo import hide_reconstruct_embeddings
-from hyper3_clip.models.lorentz import exp_map0, metric_similarity
-from hyper3_clip.models.objectives import build_objective
-from hyper3_clip.training.distributed import (
-    gather_with_grad,
-    get_rank,
-    get_world_size,
-    local_target_indices,
-)
-class Hyper3CLIP(ExperimentalObjectiveMixin, nn.Module):
-    def __init__(
-        self,
-        vision_backbone: str,
-        text_model_name: str,
-        embed_dim: int,
-        curv_init: float,
-        learn_curv: bool,
-        entail_weight: float,
-        inter_aperture_scale: float,
-        intra_aperture_scale: float,
-        objective: str = "hycoclip",
-        uncha_piecewise_factor: float = 0.1,
-        uncha_calibration_alpha: float = 10.0,
-        uncha_stop_grad_calibration: bool = True,
-        vision_pretrained: bool = True,
-        text_pretrained: bool = True,
-        text_pooling: str = "auto",
-        freeze_vision_encoder: bool = False,
-        freeze_text_encoder: bool = False,
-        normalize_encoder_features: bool = False,
-        projection_hidden_dim: int | None = None,
-        uncha_entailment_geometry: str = "lorentz",
-        uncha_aggregate_weight: float = 0.0,
-        uncha_entailment_loss: str = "piecewise",
-        uncha_argent_beta: float = 1.0,
-        uncha_argent_norm_weight: float = 0.0,
-        uncha_argent_aux_weight: float = 0.5,
-        uncha_argent_aggregation: str = "uncha",
-        uncha_part_weight_power: float = 0.0,
-        uncha_contrastive_loss: str = "ce",
-        uncha_sigmoid_bias_init: float = -10.0,
-        uncha_sigmoid_negative_weight: float = 1.0,
-        uncha_part_quality_mode: str = "none",
-        uncha_part_quality_topk: int = 5,
-        uncha_part_quality_temperature: float = 4.0,
-        uncha_entailment_warmup_steps: int = 0,
-        uncha_contrastive_global_weight: float = 1.0,
-        uncha_contrastive_local_weight: float = 1.0,
-        uncha_contrastive_global_local_weight: float = 1.0,
-        uncha_global_local_mode: str = "repeat",
-        uncha_global_local_metric: str = "distance",
-        uncha_global_local_angle_aux_weight: float = 0.0,
-        uncha_global_local_angle_aux_mode: str = "contrastive",
-        uncha_global_local_angle_aux_scale: float = 5.5,
-        uncha_global_local_angle_aux_aperture_scale: float = 1.0,
-        uncha_beta_cal_beta: float = 0.0,
-        uncha_beta_cal_variant: str = "ce",
-        uncha_beta_cal_weight: float = 0.0,
-        uncha_himo_component_weight: float = 0.0,
-        uncha_himo_variance_threshold: float = 0.9,
-        uncha_himo_detach_pca: bool = True,
-        uncha_radius_order_weight: float = 0.0,
-        uncha_radius_order_margin: float = 0.0,
-        uncha_gramian_align_weight: float = 0.0,
-        phyclip_subspace_dim: int | None = None,
-        phyclip_product_metric: str = "l1",
-        proclip_weight: float = 0.0,
-        proclip_component_dim: int | None = None,
-        proclip_retrieval: bool = False,
-        proclip_geometry: str = "product",
-        proclip_dedicated_hyperbolic: bool = False,
-        proclip_projection_hidden_dim: int | None = None,
-        beta_clip_weight: float = 0.0,
-        beta_clip_global_weight: float = 0.0,
-        beta_clip_beta: float = 0.5,
-        beta_clip_variant: str = "ce",
-        beta_clip_similarity: str = "metric",
-        beta_clip_num_heads: int = 8,
-        beta_clip_mlp_ratio: float = 4.0,
-        beta_clip_drop_cls_token: bool = True,
-        tren_weight: float = 0.0,
-        tren_visual_distill_weight: float = 1.0,
-        tren_text_distill_weight: float = 1.0,
-        tren_region_text_weight: float = 1.0,
-        tren_num_region_tokens: int = 3,
-        tren_num_decoder_layers: int = 2,
-        tren_num_attention_heads: int = 8,
-        tren_prompt_grid_size: int = 7,
-        tren_dropout: float = 0.1,
-        fuse_whole_part_encoder_forwards: bool = False,
-        fuse_beta_query_encoder_forwards: bool = False,
-        group_beta_query_pooling: bool = False,
-        objective_autocast_dtype: str = "float32",
-    ) -> None:
-        super().__init__()
-        if objective not in {"hycoclip", "uncha", "proclip"}:
-            raise ValueError(f"Unsupported objective {objective!r}; expected 'hycoclip', 'uncha', or 'proclip'")
-        if phyclip_product_metric not in {"l1", "l2"}:
-            raise ValueError("phyclip_product_metric must be 'l1' or 'l2'")
-        self._validate_experimental_options(
-            proclip_geometry=proclip_geometry,
-            proclip_projection_hidden_dim=proclip_projection_hidden_dim,
-            proclip_component_dim=proclip_component_dim,
-            beta_clip_weight=beta_clip_weight,
-            beta_clip_global_weight=beta_clip_global_weight,
-            beta_clip_beta=beta_clip_beta,
-            beta_clip_variant=beta_clip_variant,
-            beta_clip_similarity=beta_clip_similarity,
-            beta_clip_num_heads=beta_clip_num_heads,
-            beta_clip_mlp_ratio=beta_clip_mlp_ratio,
-            tren_weight=tren_weight,
-            tren_visual_distill_weight=tren_visual_distill_weight,
-            tren_text_distill_weight=tren_text_distill_weight,
-            tren_region_text_weight=tren_region_text_weight,
-            tren_num_region_tokens=tren_num_region_tokens,
-            tren_num_decoder_layers=tren_num_decoder_layers,
-            tren_num_attention_heads=tren_num_attention_heads,
-            tren_prompt_grid_size=tren_prompt_grid_size,
-            tren_dropout=tren_dropout,
-        )
-        if objective_autocast_dtype not in {"float32", "fp32", "float16", "fp16", "bfloat16", "bf16"}:
-            raise ValueError("objective_autocast_dtype must be one of 'float32', 'float16', or 'bfloat16'")
-        if uncha_contrastive_loss not in {"ce", "sigmoid", "siglip", "siglip_metric"}:
-            raise ValueError("uncha_contrastive_loss must be 'ce', 'sigmoid', 'siglip', or 'siglip_metric'")
-        if uncha_global_local_metric not in {"distance", "angle"}:
-            raise ValueError("uncha_global_local_metric must be 'distance' or 'angle'")
-        if uncha_global_local_angle_aux_mode not in {"contrastive", "positive_hinge"}:
-            raise ValueError("uncha_global_local_angle_aux_mode must be 'contrastive' or 'positive_hinge'")
-        if uncha_global_local_angle_aux_weight < 0.0:
-            raise ValueError("uncha_global_local_angle_aux_weight must be non-negative")
-        if uncha_global_local_angle_aux_scale <= 0.0:
-            raise ValueError("uncha_global_local_angle_aux_scale must be positive")
-        if uncha_global_local_angle_aux_aperture_scale <= 0.0:
-            raise ValueError("uncha_global_local_angle_aux_aperture_scale must be positive")
-        if uncha_entailment_warmup_steps < 0:
-            raise ValueError("uncha_entailment_warmup_steps must be non-negative")
-        self.objective_name = objective
-        self.uncha_contrastive_loss = uncha_contrastive_loss
-        self.uncha_entailment_loss = uncha_entailment_loss
-        self.uncha_entailment_warmup_steps = uncha_entailment_warmup_steps
-        self.uncha_himo_component_weight = float(uncha_himo_component_weight)
-        self.uncha_himo_variance_threshold = float(uncha_himo_variance_threshold)
-        self.uncha_himo_detach_pca = bool(uncha_himo_detach_pca)
-        self.proclip_weight = float(proclip_weight)
-        self.proclip_retrieval = bool(proclip_retrieval)
-        self.proclip_geometry = proclip_geometry
-        self.proclip_dedicated_hyperbolic = bool(proclip_dedicated_hyperbolic)
-        self.beta_clip_weight = float(beta_clip_weight)
-        self.beta_clip_global_weight = float(beta_clip_global_weight)
-        self.beta_clip_beta = float(beta_clip_beta)
-        self.beta_clip_variant = beta_clip_variant
-        self.beta_clip_similarity = beta_clip_similarity
-        self.beta_clip_drop_cls_token = bool(beta_clip_drop_cls_token)
-        self.tren_weight = float(tren_weight)
-        self.tren_visual_distill_weight = float(tren_visual_distill_weight)
-        self.tren_text_distill_weight = float(tren_text_distill_weight)
-        self.tren_region_text_weight = float(tren_region_text_weight)
-        self.fuse_whole_part_encoder_forwards = bool(fuse_whole_part_encoder_forwards)
-        self.fuse_beta_query_encoder_forwards = bool(fuse_beta_query_encoder_forwards)
-        self.group_beta_query_pooling = bool(group_beta_query_pooling)
-        self.objective_autocast_dtype = objective_autocast_dtype
-        self.freeze_vision_encoder = bool(freeze_vision_encoder)
-        self.freeze_text_encoder = bool(freeze_text_encoder)
-        self.normalize_encoder_features = bool(normalize_encoder_features)
-        self.phyclip_subspace_dim = phyclip_subspace_dim
-        self.phyclip_product_metric = phyclip_product_metric
-        self.proclip_component_dim = proclip_component_dim
-        if projection_hidden_dim is not None and projection_hidden_dim <= 0:
-            raise ValueError("projection_hidden_dim must be positive when set")
-        if self.proclip_enabled and phyclip_subspace_dim is not None:
-            raise ValueError("ProCLIP mixed-curvature proxy cannot be combined with PHyCLIP Lorentz factors")
-        if phyclip_subspace_dim is not None:
-            if phyclip_subspace_dim <= 0:
-                raise ValueError("phyclip_subspace_dim must be positive when set")
-            if embed_dim % phyclip_subspace_dim != 0:
-                raise ValueError("embed_dim must be divisible by phyclip_subspace_dim")
-            self.phyclip_num_factors = embed_dim // phyclip_subspace_dim
-        else:
-            self.phyclip_num_factors = 0
-        self.vision_encoder = VisionEncoder(vision_backbone, pretrained=vision_pretrained)
-        self.text_encoder = TextEncoder(text_model_name, pretrained=text_pretrained, pooling=text_pooling)
-        self.tokenizer = self.text_encoder.tokenizer
-        self.embed_dim = embed_dim
-        if self.freeze_vision_encoder:
-            self.vision_encoder.requires_grad_(False)
-            self.vision_encoder.eval()
-        if self.freeze_text_encoder:
-            self.text_encoder.requires_grad_(False)
-            self.text_encoder.eval()
-        self.image_proj = _projection_head(self.vision_encoder.output_dim, embed_dim, projection_hidden_dim)
-        self.text_proj = _projection_head(self.text_encoder.output_dim, embed_dim, projection_hidden_dim)
-        self._init_experimental_modules(
-            beta_clip_num_heads=beta_clip_num_heads,
-            beta_clip_mlp_ratio=beta_clip_mlp_ratio,
-            tren_num_region_tokens=tren_num_region_tokens,
-            tren_num_decoder_layers=tren_num_decoder_layers,
-            tren_num_attention_heads=tren_num_attention_heads,
-            tren_prompt_grid_size=tren_prompt_grid_size,
-            tren_dropout=tren_dropout,
-            projection_hidden_dim=projection_hidden_dim,
-            proclip_projection_hidden_dim=proclip_projection_hidden_dim,
-            projection_head=_projection_head,
-        )
-        if objective == "hycoclip":
-            self.logit_scale = nn.Parameter(torch.tensor(1 / 0.07).log())
-        elif objective == "uncha":
-            self.global_logit_scale = nn.Parameter(torch.tensor(1 / 0.07).log())
-            self.local_logit_scale = nn.Parameter(torch.tensor(1 / 0.05).log())
-            self.global_local_logit_scale = nn.Parameter(torch.tensor(1 / 0.06).log())
-            if uncha_contrastive_loss in {"sigmoid", "siglip", "siglip_metric"}:
-                self.global_logit_bias = nn.Parameter(torch.tensor(float(uncha_sigmoid_bias_init)))
-                self.local_logit_bias = nn.Parameter(torch.tensor(float(uncha_sigmoid_bias_init)))
-                self.global_local_logit_bias = nn.Parameter(torch.tensor(float(uncha_sigmoid_bias_init)))
-        alpha_dim = phyclip_subspace_dim or embed_dim
-        alpha_shape = (self.phyclip_num_factors,) if self.phyclip_enabled else ()
-        self.visual_alpha = nn.Parameter(torch.full(alpha_shape, alpha_dim**-0.5).log())
-        self.textual_alpha = nn.Parameter(torch.full(alpha_shape, alpha_dim**-0.5).log())
-        curv_shape = (self.phyclip_num_factors,) if self.phyclip_enabled else ()
-        log_curv = torch.full(curv_shape, curv_init).log()
-        self.log_curv = nn.Parameter(log_curv, requires_grad=learn_curv)
-        self.curv_min = curv_init / 10.0
-        self.curv_max = curv_init * 10.0
-        self.objective = None
-        if objective != "proclip":
-            self.objective = build_objective(
-                objective=objective,
-                entail_weight=entail_weight,
-                inter_aperture_scale=inter_aperture_scale,
-                intra_aperture_scale=intra_aperture_scale,
-                uncha_piecewise_factor=uncha_piecewise_factor,
-                uncha_calibration_alpha=uncha_calibration_alpha,
-                uncha_stop_grad_calibration=uncha_stop_grad_calibration,
-                uncha_entailment_geometry=uncha_entailment_geometry,
-                uncha_aggregate_weight=uncha_aggregate_weight,
-                uncha_entailment_loss=uncha_entailment_loss,
-                uncha_argent_beta=uncha_argent_beta,
-                uncha_argent_norm_weight=uncha_argent_norm_weight,
-                uncha_argent_aux_weight=uncha_argent_aux_weight,
-                uncha_argent_aggregation=uncha_argent_aggregation,
-                uncha_part_weight_power=uncha_part_weight_power,
-                uncha_contrastive_loss=uncha_contrastive_loss,
-                uncha_sigmoid_negative_weight=uncha_sigmoid_negative_weight,
-                uncha_part_quality_mode=uncha_part_quality_mode,
-                uncha_part_quality_topk=uncha_part_quality_topk,
-                uncha_part_quality_temperature=uncha_part_quality_temperature,
-                uncha_contrastive_global_weight=uncha_contrastive_global_weight,
-                uncha_contrastive_local_weight=uncha_contrastive_local_weight,
-                uncha_contrastive_global_local_weight=uncha_contrastive_global_local_weight,
-                uncha_global_local_mode=uncha_global_local_mode,
-                uncha_global_local_metric=uncha_global_local_metric,
-                uncha_global_local_angle_aux_weight=uncha_global_local_angle_aux_weight,
-                uncha_global_local_angle_aux_mode=uncha_global_local_angle_aux_mode,
-                uncha_global_local_angle_aux_scale=uncha_global_local_angle_aux_scale,
-                uncha_global_local_angle_aux_aperture_scale=uncha_global_local_angle_aux_aperture_scale,
-                uncha_beta_cal_beta=uncha_beta_cal_beta,
-                uncha_beta_cal_variant=uncha_beta_cal_variant,
-                uncha_beta_cal_weight=uncha_beta_cal_weight,
-                uncha_himo_component_weight=uncha_himo_component_weight,
-                uncha_radius_order_weight=uncha_radius_order_weight,
-                uncha_radius_order_margin=uncha_radius_order_margin,
-                uncha_gramian_align_weight=uncha_gramian_align_weight,
-                product_metric=phyclip_product_metric,
-            )
-    def train(self, mode: bool = True) -> Hyper3CLIP:
-        super().train(mode)
-        if self.freeze_vision_encoder:
-            self.vision_encoder.eval()
-        if self.freeze_text_encoder:
-            self.text_encoder.eval()
-        return self
-    @property
-    def phyclip_enabled(self) -> bool:
-        return self.phyclip_subspace_dim is not None
-    def _kappa(self) -> torch.Tensor:
-        return self.log_curv.exp().clamp(min=self.curv_min, max=self.curv_max)
-    def encode_image(self, image: torch.Tensor, project: bool = True) -> torch.Tensor:
-        feats = self.image_proj(self.encode_image_base(image))
-        if not project:
-            return feats
-        return self.project_image_features(feats)
-    def encode_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, project: bool = True) -> torch.Tensor:
-        feats = self.text_proj(self.encode_text_base(input_ids, attention_mask))
-        if not project:
-            return feats
-        return self.project_text_features(feats)
-    def encode_image_base(self, image: torch.Tensor) -> torch.Tensor:
-        with torch.set_grad_enabled(self.training and not self.freeze_vision_encoder):
-            feats = self.vision_encoder(image)
-        feats = feats.detach() if self.freeze_vision_encoder else feats
-        return F.normalize(feats.float(), dim=-1) if self.normalize_encoder_features else feats
-    def encode_image_base_with_tokens(self, image: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-        with torch.set_grad_enabled(self.training and not self.freeze_vision_encoder):
-            feats, tokens = self.vision_encoder.forward_with_tokens(image)
-        if self.freeze_vision_encoder:
-            feats = feats.detach()
-            tokens = tokens.detach()
-        if self.normalize_encoder_features:
-            feats = F.normalize(feats.float(), dim=-1)
-        return feats, tokens
-    def encode_text_base(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        with torch.set_grad_enabled(self.training and not self.freeze_text_encoder):
-            feats = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
-        feats = feats.detach() if self.freeze_text_encoder else feats
-        return F.normalize(feats.float(), dim=-1) if self.normalize_encoder_features else feats
-    def project_image_features(self, feats: torch.Tensor) -> torch.Tensor:
-        if self.phyclip_enabled:
-            return self._project_product_features(feats, self.visual_alpha)
-        return exp_map0(feats.float() * self.visual_alpha.exp().float(), self._kappa().float())
-    def project_text_features(self, feats: torch.Tensor) -> torch.Tensor:
-        if self.phyclip_enabled:
-            return self._project_product_features(feats, self.textual_alpha)
-        return exp_map0(feats.float() * self.textual_alpha.exp().float(), self._kappa().float())
-    def similarity_scores(self, image_feats: torch.Tensor, text_feats: torch.Tensor) -> torch.Tensor:
-        return metric_similarity(image_feats, text_feats, self._kappa(), product_metric=self.phyclip_product_metric)
-    def encode_retrieval_image(self, image: torch.Tensor) -> torch.Tensor:
-        base = self.encode_image_base(image)
-        tangent = self.image_proj(base)
-        if self.proclip_retrieval:
-            return self._project_proclip_image_base(base, self.project_image_features(tangent))
-        return self.project_image_features(tangent)
-    def encode_retrieval_text(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        base = self.encode_text_base(input_ids, attention_mask)
-        tangent = self.text_proj(base)
-        if self.proclip_retrieval:
-            return self._project_proclip_text_base(base, self.project_text_features(tangent))
-        return self.project_text_features(tangent)
-    def retrieval_similarity_scores(self, image_feats: torch.Tensor, text_feats: torch.Tensor) -> torch.Tensor:
-        if self.proclip_retrieval:
-            return self._proclip_similarity_scores(image_feats, text_feats)
-        return self.similarity_scores(image_feats, text_feats)
-    @property
-    def retrieval_requires_chunking(self) -> bool:
-        return self.phyclip_enabled or self.proclip_retrieval
-    def _objective_autocast(self, device_type: str):
-        dtype = {
-            "float32": torch.float32,
-            "fp32": torch.float32,
-            "float16": torch.float16,
-            "fp16": torch.float16,
-            "bfloat16": torch.bfloat16,
-            "bf16": torch.bfloat16,
-        }[self.objective_autocast_dtype]
-        enabled = device_type != "cpu" and dtype is not torch.float32
-        return torch.autocast(device_type=device_type, dtype=dtype, enabled=enabled)
-    def forward(
-        self,
-        image: torch.Tensor,
-        part_images: torch.Tensor,
-        text_input_ids: torch.Tensor,
-        text_attention_mask: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-        part_owner: torch.Tensor,
-        step: int | None = None,
-        beta_query_input_ids: torch.Tensor | None = None,
-        beta_query_attention_mask: torch.Tensor | None = None,
-        beta_query_owner: torch.Tensor | None = None,
-        beta_query_type: torch.Tensor | None = None,
-        beta_query_parent: torch.Tensor | None = None,
-        beta_query_weight: torch.Tensor | None = None,
-        beta_query_source_part: torch.Tensor | None = None,
-    ) -> dict[str, torch.Tensor]:
-        """Run the training forward pass and return the loss/log dictionary.
-        Clamps the learnable logit scales and alpha parameters in place, encodes
-        the whole image/text pairs and their parts through one of four encoding
-        paths, then evaluates the configured objective and adds every enabled
-        auxiliary loss to ``"loss"``.
-        Args:
-            image: Whole-image batch passed to the vision encoder.
-            part_images: Part-image batch; may have zero rows.
-            text_input_ids: Token ids for the whole-image texts.
-            text_attention_mask: Attention mask matching ``text_input_ids``.
-            part_text_input_ids: Token ids for the part texts.
-            part_text_attention_mask: Attention mask matching ``part_text_input_ids``.
-            part_owner: Per-part index tensor, moved to the feature device as ``long``
-                (presumably the index of the owning whole sample; confirm against the collator).
-            step: Optional training step, used only for the entailment warm-up scale.
-            beta_query_input_ids, beta_query_attention_mask, beta_query_owner,
-            beta_query_parent, beta_query_weight, beta_query_source_part: Optional
-                beta-query tensors, forwarded unchanged to the beta-query helpers
-                when the corresponding losses are active.
-            beta_query_type: Accepted but not read anywhere in this method.
-        Returns:
-            A dict of tensors. With ``objective_name == "proclip"`` it holds
-            ``loss``, ``contrastive_loss``, ``entailment_loss``, ``part_count`` and
-            ``proclip_contrastive_loss``; otherwise it is the objective's output
-            extended with any auxiliary-loss entries. Both variants also carry the
-            detached kappa and logit-scale logs.
-        Raises:
-            RuntimeError: If a loss that needs image patch tokens is enabled but no
-                token-producing encoding path ran, or if ``self.objective`` is
-                ``None`` for a non-ProCLIP objective.
-        """
-        # Bound the learnable scales in place before they are used; alphas are
-        # capped at 0, so exp(alpha) used by the projections never exceeds 1.
-        with torch.no_grad():
-            self._clamp_logit_scales()
-            self.visual_alpha.clamp_(max=0.0)
-            self.textual_alpha.clamp_(max=0.0)
-        kappa = self._kappa()
-        feature_dim = self.embed_dim
-        beta_image_tokens = None
-        beta_query_base = None
-        # Empty defaults; every encoding branch below reassigns both names.
-        part_image_base = part_images.new_zeros((0, self.vision_encoder.output_dim))
-        part_text_base = part_images.new_zeros((0, self.text_encoder.output_dim))
-        hier_beta_enabled = self.objective_name == "uncha" and self.uncha_entailment_loss in {
-            "hier_beta_argent",
-            "hier_beta_sourcepart_argent",
-        }
-        # Path 1: hierarchical beta entailment with fused encoder passes (whole +
-        # parts through the vision encoder, whole + parts + queries through the
-        # text encoder). Requires beta queries and at least one part.
-        if (
-            hier_beta_enabled
-            and self.fuse_beta_query_encoder_forwards
-            and not self.tren_enabled
-            and beta_query_input_ids is not None
-            and beta_query_attention_mask is not None
-            and part_images.shape[0] > 0
-        ):
-            (
-                image_base,
-                text_base,
-                image_euc,
-                text_euc,
-                image_feats,
-                text_feats,
-                part_image_feats,
-                part_text_feats,
-                part_image_euc,
-                part_text_euc,
-                part_image_base,
-                part_text_base,
-                beta_image_tokens,
-                beta_query_base,
-            ) = self._encode_hier_beta_whole_parts_and_queries(
-                image=image,
-                part_images=part_images,
-                text_input_ids=text_input_ids,
-                text_attention_mask=text_attention_mask,
-                part_text_input_ids=part_text_input_ids,
-                part_text_attention_mask=part_text_attention_mask,
-                beta_query_input_ids=beta_query_input_ids,
-                beta_query_attention_mask=beta_query_attention_mask,
-            )
-        # Path 2: patch tokens are needed (beta-query pooling or T-REN); whole
-        # inputs and parts are encoded in separate passes.
-        elif self.beta_query_pooling_enabled or self.tren_enabled:
-            image_base, beta_image_tokens = self.encode_image_base_with_tokens(image)
-            text_base = self.encode_text_base(text_input_ids, text_attention_mask)
-            image_euc = self.image_proj(image_base)
-            text_euc = self.text_proj(text_base)
-            image_feats = self.project_image_features(image_euc)
-            text_feats = self.project_text_features(text_euc)
-            (
-                part_image_feats,
-                part_text_feats,
-                part_image_euc,
-                part_text_euc,
-                part_image_base,
-                part_text_base,
-            ) = self._encode_parts_with_base(
-                part_images=part_images,
-                part_text_input_ids=part_text_input_ids,
-                part_text_attention_mask=part_text_attention_mask,
-                feature_dim=feature_dim,
-            )
-        # Path 3: no tokens needed; whole inputs and parts share one encoder pass
-        # per modality. Not used for the ProCLIP objective or when there are no parts.
-        elif self.fuse_whole_part_encoder_forwards and self.objective_name != "proclip" and part_images.shape[0] > 0:
-            (
-                image_base,
-                text_base,
-                image_euc,
-                text_euc,
-                image_feats,
-                text_feats,
-                part_image_feats,
-                part_text_feats,
-                part_image_euc,
-                part_text_euc,
-                part_image_base,
-                part_text_base,
-            ) = self._encode_whole_and_parts(
-                image=image,
-                part_images=part_images,
-                text_input_ids=text_input_ids,
-                text_attention_mask=text_attention_mask,
-                part_text_input_ids=part_text_input_ids,
-                part_text_attention_mask=part_text_attention_mask,
-            )
-        # Path 4: fallback; separate passes without patch tokens.
-        else:
-            image_base = self.encode_image_base(image)
-            text_base = self.encode_text_base(text_input_ids, text_attention_mask)
-            image_euc = self.image_proj(image_base)
-            text_euc = self.text_proj(text_base)
-            image_feats = self.project_image_features(image_euc)
-            text_feats = self.project_text_features(text_euc)
-            (
-                part_image_feats,
-                part_text_feats,
-                part_image_euc,
-                part_text_euc,
-                part_image_base,
-                part_text_base,
-            ) = self._encode_parts_with_base(
-                part_images=part_images,
-                part_text_input_ids=part_text_input_ids,
-                part_text_attention_mask=part_text_attention_mask,
-                feature_dim=feature_dim,
-            )
-        targets = local_target_indices(image_feats.size(0), image_feats.device)
-        # ProCLIP-only objective: a single contrastive loss, returned early. The
-        # entailment loss is reported as zero and the part count as 0.
-        if self.objective_name == "proclip":
-            proclip_image_feats = self._project_proclip_image_base(image_base, image_feats)
-            proclip_text_feats = self._project_proclip_text_base(text_base, text_feats)
-            proclip_loss = self._proclip_contrastive_loss(
-                image_feats=proclip_image_feats,
-                text_feats=proclip_text_feats,
-                all_image_feats=gather_with_grad(proclip_image_feats),
-                all_text_feats=gather_with_grad(proclip_text_feats),
-                targets=targets,
-            )
-            zero = proclip_loss.new_zeros(())
-            return {
-                "loss": proclip_loss,
-                "contrastive_loss": proclip_loss,
-                "entailment_loss": zero,
-                "part_count": part_owner.new_tensor(0),
-                "proclip_contrastive_loss": proclip_loss,
-                **self._detached_kappa_logs(kappa),
-                **self._detached_logit_scales(),
-            }
-        himo_text_feats = None
-        all_himo_text_feats = None
-        # Optional HiMo component features: reconstructed from the gathered text
-        # tangent features, then projected like ordinary text features.
-        if self.objective_name == "uncha" and self.uncha_himo_component_weight > 0.0:
-            all_text_euc = gather_with_grad(text_euc)
-            all_component_euc = hide_reconstruct_embeddings(
-                all_text_euc,
-                variance_threshold=self.uncha_himo_variance_threshold,
-                detach_pca=self.uncha_himo_detach_pca,
-            )
-            if get_world_size() > 1:
-                # Slice this rank's rows back out of the gathered result; assumes
-                # every rank contributes text_euc.size(0) rows. TODO confirm.
-                start = text_euc.size(0) * get_rank()
-                end = start + text_euc.size(0)
-                component_euc = all_component_euc[start:end]
-            else:
-                component_euc = all_component_euc
-            himo_text_feats = self.project_text_features(component_euc)
-            all_himo_text_feats = gather_with_grad(himo_text_feats)
-        all_image_feats = gather_with_grad(image_feats)
-        all_text_feats = gather_with_grad(text_feats)
-        # NOTE(review): any all_text_euc gathered in the HiMo branch above is
-        # discarded here and gathered again below when the siglip loss is active.
-        all_image_euc = None
-        all_text_euc = None
-        if self.objective_name == "uncha" and self.uncha_contrastive_loss == "siglip":
-            all_image_euc = gather_with_grad(image_euc)
-            all_text_euc = gather_with_grad(text_euc)
-        part_owner = part_owner.to(device=image_feats.device, dtype=torch.long)
-        beta_query_embeddings = {}
-        # Hierarchical beta entailment embeddings, computed with autocast disabled
-        # on float32 tokens and curvature.
-        if self.objective_name == "uncha" and self.uncha_entailment_loss in {
-            "hier_beta_argent",
-            "hier_beta_sourcepart_argent",
-        }:
-            if beta_image_tokens is None:
-                raise RuntimeError(f"{self.uncha_entailment_loss} requires image patch tokens")
-            with torch.autocast(device_type=image.device.type, enabled=False):
-                beta_query_embeddings = self._beta_query_entailment_embeddings(
-                    image_tokens=beta_image_tokens.float(),
-                    beta_query_input_ids=beta_query_input_ids,
-                    beta_query_attention_mask=beta_query_attention_mask,
-                    beta_query_owner=beta_query_owner,
-                    beta_query_parent=beta_query_parent,
-                    beta_query_weight=beta_query_weight,
-                    beta_query_source_part=beta_query_source_part,
-                    kappa=kappa.float(),
-                    query_base=beta_query_base,
-                )
-        # Main objective, evaluated under the configured objective autocast dtype.
-        with self._objective_autocast(image.device.type):
-            if self.objective is None:
-                raise RuntimeError("Non-ProCLIP forward requires an objective module")
-            losses = self.objective(
-                {
-                    "image_feats": image_feats,
-                    "text_feats": text_feats,
-                    "part_image_feats": part_image_feats,
-                    "part_text_feats": part_text_feats,
-                    "part_owner": part_owner,
-                    "all_image_feats": all_image_feats,
-                    "all_text_feats": all_text_feats,
-                    # Tangent-space ("euc") features are only supplied when they
-                    # were gathered above, i.e. for the siglip contrastive loss.
-                    **(
-                        {
-                            "image_euc_feats": image_euc,
-                            "text_euc_feats": text_euc,
-                            "part_image_euc_feats": part_image_euc,
-                            "part_text_euc_feats": part_text_euc,
-                            "all_image_euc_feats": all_image_euc,
-                            "all_text_euc_feats": all_text_euc,
-                        }
-                        if all_image_euc is not None and all_text_euc is not None
-                        else {}
-                    ),
-                    "targets": targets,
-                    "kappa": kappa,
-                    "entail_weight_scale": self._entail_weight_scale(step, image_feats.device),
-                    **beta_query_embeddings,
-                    **(
-                        {
-                            "himo_text_feats": himo_text_feats,
-                            "all_himo_text_feats": all_himo_text_feats,
-                        }
-                        if himo_text_feats is not None
-                        else {}
-                    ),
-                },
-                self._objective_logit_scales(),
-            )
-        # Auxiliary losses: each enabled term is computed with autocast disabled
-        # (except ProCLIP below), added to "loss" with its weight, and logged
-        # under its own key.
-        if self.beta_clip_global_weight > 0.0:
-            with torch.autocast(device_type=image.device.type, enabled=False):
-                beta_clip_global_loss = self._beta_clip_global_contrastive_loss(
-                    image_euc=image_euc,
-                    text_euc=text_euc,
-                    targets=targets,
-                )
-            losses = {
-                **losses,
-                "loss": losses["loss"] + self.beta_clip_global_weight * beta_clip_global_loss,
-                "beta_clip_global_loss": beta_clip_global_loss,
-            }
-        if self.beta_clip_enabled:
-            if beta_image_tokens is None:
-                raise RuntimeError("beta-CLIP auxiliary requires image patch tokens")
-            with torch.autocast(device_type=image.device.type, enabled=False):
-                beta_clip_loss = self._beta_clip_auxiliary_loss(
-                    image_tokens=beta_image_tokens.float(),
-                    beta_query_input_ids=beta_query_input_ids,
-                    beta_query_attention_mask=beta_query_attention_mask,
-                    beta_query_owner=beta_query_owner,
-                    global_targets=targets,
-                    kappa=kappa.float(),
-                )
-            losses = {
-                **losses,
-                "loss": losses["loss"] + self.beta_clip_weight * beta_clip_loss,
-                "beta_clip_loss": beta_clip_loss,
-            }
-        if self.tren_enabled:
-            if beta_image_tokens is None:
-                raise RuntimeError("T-REN auxiliary requires image patch tokens")
-            with torch.autocast(device_type=image.device.type, enabled=False):
-                tren_losses = self._tren_auxiliary_losses(
-                    image_tokens=beta_image_tokens.float(),
-                    part_owner=part_owner,
-                    part_image_base=part_image_base.float(),
-                    part_text_base=part_text_base.float(),
-                )
-            losses = {
-                **losses,
-                "loss": losses["loss"] + self.tren_weight * tren_losses["tren_loss"],
-                **tren_losses,
-            }
-        if self.proclip_enabled and self.proclip_weight > 0.0:
-            proclip_image_feats = self._project_proclip_image_base(image_base, image_feats)
-            proclip_text_feats = self._project_proclip_text_base(text_base, text_feats)
-            proclip_loss = self._proclip_contrastive_loss(
-                image_feats=proclip_image_feats,
-                text_feats=proclip_text_feats,
-                all_image_feats=gather_with_grad(proclip_image_feats),
-                all_text_feats=gather_with_grad(proclip_text_feats),
-                targets=targets,
-            )
-            losses = {
-                **losses,
-                "loss": losses["loss"] + self.proclip_weight * proclip_loss,
-                "proclip_contrastive_loss": proclip_loss,
-            }
-        return {**losses, **self._detached_kappa_logs(kappa), **self._detached_logit_scales()}
-    def _encode_parts(
-        self,
-        part_images: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-        feature_dim: int,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        if part_images.shape[0] == 0:
-            empty = part_images.new_zeros((0, feature_dim))
-            return empty, empty, empty, empty
-        part_image_euc = self.image_proj(self.encode_image_base(part_images))
-        part_text_euc = self.text_proj(self.encode_text_base(part_text_input_ids, part_text_attention_mask))
-        part_image_feats = self.project_image_features(part_image_euc)
-        part_text_feats = self.project_text_features(part_text_euc)
-        return part_image_feats, part_text_feats, part_image_euc, part_text_euc
-    def _encode_parts_with_base(
-        self,
-        part_images: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-        feature_dim: int,
-    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        if part_images.shape[0] == 0:
-            empty = part_images.new_zeros((0, feature_dim))
-            empty_image_base = part_images.new_zeros((0, self.vision_encoder.output_dim))
-            empty_text_base = part_images.new_zeros((0, self.text_encoder.output_dim))
-            return empty, empty, empty, empty, empty_image_base, empty_text_base
-        part_image_base = self.encode_image_base(part_images)
-        part_text_base = self.encode_text_base(part_text_input_ids, part_text_attention_mask)
-        part_image_euc = self.image_proj(part_image_base)
-        part_text_euc = self.text_proj(part_text_base)
-        part_image_feats = self.project_image_features(part_image_euc)
-        part_text_feats = self.project_text_features(part_text_euc)
-        return part_image_feats, part_text_feats, part_image_euc, part_text_euc, part_image_base, part_text_base
-    def _encode_whole_and_parts(
-        self,
-        image: torch.Tensor,
-        part_images: torch.Tensor,
-        text_input_ids: torch.Tensor,
-        text_attention_mask: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-    ) -> tuple[
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-    ]:
-        batch_size = image.shape[0]
-        part_count = part_images.shape[0]
-        image_base_all = self.encode_image_base(torch.cat([image, part_images], dim=0))
-        image_euc_all = self.image_proj(image_base_all)
-        image_feats_all = self.project_image_features(image_euc_all)
-        text_ids, text_mask = self._concat_text_batches(
-            text_input_ids,
-            text_attention_mask,
-            part_text_input_ids,
-            part_text_attention_mask,
-        )
-        text_base_all = self.encode_text_base(text_ids, text_mask)
-        text_euc_all = self.text_proj(text_base_all)
-        text_feats_all = self.project_text_features(text_euc_all)
-        image_base, part_image_base = image_base_all.split([batch_size, part_count], dim=0)
-        text_base, part_text_base = text_base_all.split([batch_size, part_count], dim=0)
-        image_euc, part_image_euc = image_euc_all.split([batch_size, part_count], dim=0)
-        text_euc, part_text_euc = text_euc_all.split([batch_size, part_count], dim=0)
-        image_feats, part_image_feats = image_feats_all.split([batch_size, part_count], dim=0)
-        text_feats, part_text_feats = text_feats_all.split([batch_size, part_count], dim=0)
-        return (
-            image_base,
-            text_base,
-            image_euc,
-            text_euc,
-            image_feats,
-            text_feats,
-            part_image_feats,
-            part_text_feats,
-            part_image_euc,
-            part_text_euc,
-            part_image_base,
-            part_text_base,
-        )
-    def _encode_hier_beta_whole_parts_and_queries(
-        self,
-        image: torch.Tensor,
-        part_images: torch.Tensor,
-        text_input_ids: torch.Tensor,
-        text_attention_mask: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-        beta_query_input_ids: torch.Tensor,
-        beta_query_attention_mask: torch.Tensor,
-    ) -> tuple[
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-        torch.Tensor,
-    ]:
-        batch_size = image.shape[0]
-        part_count = part_images.shape[0]
-        query_count = beta_query_input_ids.shape[0]
-        image_base_all, image_tokens_all = self.encode_image_base_with_tokens(torch.cat([image, part_images], dim=0))
-        image_euc_all = self.image_proj(image_base_all)
-        image_feats_all = self.project_image_features(image_euc_all)
-        image_base, part_image_base = image_base_all.split([batch_size, part_count], dim=0)
-        image_euc, part_image_euc = image_euc_all.split([batch_size, part_count], dim=0)
-        image_feats, part_image_feats = image_feats_all.split([batch_size, part_count], dim=0)
-        beta_image_tokens = image_tokens_all[:batch_size]
-        text_ids, text_mask = self._concat_text_batch_list(
-            (text_input_ids, text_attention_mask),
-            (part_text_input_ids, part_text_attention_mask),
-            (beta_query_input_ids, beta_query_attention_mask),
-        )
-        text_base_all = self.encode_text_base(text_ids, text_mask)
-        text_euc_all = self.text_proj(text_base_all)
-        text_feats_all = self.project_text_features(text_euc_all)
-        text_base, part_text_base, beta_query_base = text_base_all.split([batch_size, part_count, query_count], dim=0)
-        text_euc, part_text_euc, _ = text_euc_all.split([batch_size, part_count, query_count], dim=0)
-        text_feats, part_text_feats, _ = text_feats_all.split([batch_size, part_count, query_count], dim=0)
-        return (
-            image_base,
-            text_base,
-            image_euc,
-            text_euc,
-            image_feats,
-            text_feats,
-            part_image_feats,
-            part_text_feats,
-            part_image_euc,
-            part_text_euc,
-            part_image_base,
-            part_text_base,
-            beta_image_tokens,
-            beta_query_base,
-        )
-    def _concat_text_batches(
-        self,
-        text_input_ids: torch.Tensor,
-        text_attention_mask: torch.Tensor,
-        part_text_input_ids: torch.Tensor,
-        part_text_attention_mask: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        return self._concat_text_batch_list(
-            (text_input_ids, text_attention_mask),
-            (part_text_input_ids, part_text_attention_mask),
-        )
-    def _concat_text_batch_list(
-        self,
-        *batches: tuple[torch.Tensor, torch.Tensor],
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        target_length = max(input_ids.shape[1] for input_ids, _ in batches)
-        pad_token_id = self.text_encoder.tokenizer.pad_token_id
-        if pad_token_id is None:
-            pad_token_id = 0
-        return (
-            torch.cat([_pad_sequence_dim(input_ids, target_length, pad_token_id) for input_ids, _ in batches], dim=0),
-            torch.cat([_pad_sequence_dim(attention_mask, target_length, 0) for _, attention_mask in batches], dim=0),
-        )
-    def _clamp_logit_scales(self) -> None:
-        if self.objective_name == "proclip":
-            self.proclip_logit_scale.clamp_(max=4.6052)
-            self._clamp_experimental_logit_scales()
-            return
-        if self.objective_name == "hycoclip":
-            self.logit_scale.clamp_(max=4.6052)
-            self._clamp_experimental_logit_scales()
-            return
-        self.global_logit_scale.clamp_(max=4.6052)
-        self.local_logit_scale.clamp_(max=4.6052)
-        self.global_local_logit_scale.clamp_(max=4.6052)
-        self._clamp_experimental_logit_scales()
-    def _objective_logit_scales(self) -> torch.Tensor | dict[str, torch.Tensor]:
-        if self.objective_name == "hycoclip":
-            return self.logit_scale
-        if self.objective_name == "proclip":
-            return self.proclip_logit_scale
-        return {
-            "global": self.global_logit_scale,
-            "local": self.local_logit_scale,
-            "global_local": self.global_local_logit_scale,
-            **(
-                {
-                    "global_bias": self.global_logit_bias,
-                    "local_bias": self.local_logit_bias,
-                    "global_local_bias": self.global_local_logit_bias,
-                }
-                if self.uncha_contrastive_loss in {"sigmoid", "siglip", "siglip_metric"}
-                else {}
-            ),
-        }
-    def _detached_logit_scales(self) -> dict[str, torch.Tensor]:
-        if self.objective_name == "proclip":
-            return self._detached_experimental_logit_scales()
-        if self.objective_name == "hycoclip":
-            logs = {"logit_scale": self.logit_scale.exp().detach()}
-            logs.update(self._detached_experimental_logit_scales())
-            return logs
-        logs = {
-            "global_logit_scale": self.global_logit_scale.exp().detach(),
-            "local_logit_scale": self.local_logit_scale.exp().detach(),
-            "global_local_logit_scale": self.global_local_logit_scale.exp().detach(),
-        }
-        if self.uncha_contrastive_loss in {"sigmoid", "siglip", "siglip_metric"}:
-            logs.update(
-                {
-                    "global_logit_bias": self.global_logit_bias.detach(),
-                    "local_logit_bias": self.local_logit_bias.detach(),
-                    "global_local_logit_bias": self.global_local_logit_bias.detach(),
-                }
-            )
-        logs.update(self._detached_experimental_logit_scales())
-        return logs
-    def _project_product_features(self, feats: torch.Tensor, alpha: torch.Tensor) -> torch.Tensor:
-        product_feats = feats.float().reshape(feats.size(0), self.phyclip_num_factors, self.phyclip_subspace_dim)
-        product_feats = product_feats * alpha.exp().float().view(1, -1, 1)
-        return exp_map0(product_feats, self._kappa().float().view(1, -1, 1))
-    def _detached_kappa_logs(self, kappa: torch.Tensor) -> dict[str, torch.Tensor]:
-        detached = kappa.detach()
-        if detached.numel() == 1:
-            return {"kappa": detached.reshape(())}
-        return {
-            "kappa": detached.mean(),
-            "kappa_min": detached.min(),
-            "kappa_max": detached.max(),
-        }
-    def _entail_weight_scale(self, step: int | None, device: torch.device) -> torch.Tensor:
-        if self.uncha_entailment_warmup_steps <= 0 or step is None:
-            return torch.ones((), device=device)
-        scale = min(1.0, float(step + 1) / float(self.uncha_entailment_warmup_steps))
-        return torch.tensor(scale, device=device)
-def _projection_head(input_dim: int, output_dim: int, hidden_dim: int | None) -> nn.Module:
-    if hidden_dim is None:
-        return nn.Linear(input_dim, output_dim)
-    return nn.Sequential(
-        nn.Linear(input_dim, hidden_dim),
-        nn.ReLU(),
-        nn.Linear(hidden_dim, output_dim),
-    )
-def _pad_sequence_dim(tensor: torch.Tensor, target_length: int, value: int) -> torch.Tensor:
-    pad = target_length - tensor.shape[1]
-    if pad <= 0:
-        return tensor
-    return F.pad(tensor, (0, pad), value=value)

hyper3_clip/models/lorentz.py DELETED Viewed

@@ -1,265 +0,0 @@
-from __future__ import annotations
-import math
-import torch
-from torch import Tensor
-def lorentz_inner(x: Tensor, y: Tensor) -> Tensor:
-    """Compute batched Lorentzian inner product for matching rows."""
-    x = x.float()
-    y = y.float()
-    return -x[..., 0] * y[..., 0] + (x[..., 1:] * y[..., 1:]).sum(dim=-1)
-def pairwise_lorentz_inner(x: Tensor, y: Tensor) -> Tensor:
-    """Compute all-pairs Lorentzian inner products."""
-    x = x.float()
-    y = y.float()
-    time = -x[:, :1] @ y[:, :1].T
-    space = x[:, 1:] @ y[:, 1:].T
-    return time + space
-def exp_map0(u: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """Exponential map at the origin from tangent space to hyperboloid."""
-    u = u.float()
-    kappa = kappa.float()
-    sqrt_k = torch.sqrt(kappa)
-    norm_u = torch.linalg.norm(u, dim=-1, keepdim=True).clamp_min(eps)
-    scaled = sqrt_k * norm_u
-    clipped_scaled = scaled.clamp_max(math.asinh(2**15))
-    time = torch.cosh(clipped_scaled) / sqrt_k
-    space = torch.sinh(clipped_scaled) * u / scaled.clamp_min(eps)
-    return torch.cat([time, space], dim=-1)
-def log_map0(x: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """Logarithmic map at the origin from hyperboloid to tangent space.
-    Inverts ``exp_map0`` for points on the Lorentz model hyperboloid. Returns
-    vectors in the Euclidean tangent space at the origin (no time coordinate).
-    """
-    x = x.float()
-    dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-    kappa = kappa.to(dtype=torch.float32).flatten()
-    if x.dim() == 2:
-        if kappa.numel() != 1:
-            raise ValueError("log_map0 expects scalar kappa for non-product embeddings")
-        sqrt_k = torch.sqrt(kappa.reshape(()))
-        alpha = torch.acosh((sqrt_k * x[:, 0]).clamp_min(1.0 + dist_eps))
-        coef = alpha / torch.sinh(alpha).clamp_min(dist_eps)
-        return x[:, 1:] * coef.unsqueeze(-1)
-    if x.dim() == 3:
-        if kappa.numel() == 1:
-            kappa = kappa.expand(x.shape[1])
-        if kappa.numel() != x.shape[1]:
-            raise ValueError(f"Expected {x.shape[1]} curvatures for product space, got {kappa.numel()}")
-        sqrt_k = torch.sqrt(kappa).view(1, -1)
-        alpha = torch.acosh((sqrt_k * x[..., 0]).clamp_min(1.0 + dist_eps))
-        coef = alpha / torch.sinh(alpha).clamp_min(dist_eps)
-        return x[..., 1:] * coef.unsqueeze(-1)
-    raise ValueError("log_map0 expects [batch, dim + 1] or [batch, factors, dim + 1] tensors")
-def pairwise_dist(x: Tensor, y: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """Pairwise geodesic distance on the Lorentz model."""
-    kappa = kappa.float()
-    dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-    prod = (-kappa) * pairwise_lorentz_inner(x, y)
-    prod = prod.clamp_min(1.0 + dist_eps)
-    return torch.acosh(prod) / torch.sqrt(kappa)
-def product_pairwise_dist(
-    x: Tensor,
-    y: Tensor,
-    kappa: Tensor,
-    metric: str = "l1",
-    eps: float = 1e-8,
-) -> Tensor:
-    """Pairwise distance in an l1/l2 product of Lorentz factors.
-    Inputs have shape ``[batch, factors, dim + 1]``. For ``metric="l1"``, this
-    matches the official PHyCLIP implementation's mean distance over factors.
-    """
-    if x.dim() != 3 or y.dim() != 3:
-        raise ValueError("product_pairwise_dist expects [batch, factors, dim + 1] tensors")
-    if x.shape[1] != y.shape[1] or x.shape[2] != y.shape[2]:
-        raise ValueError("Product Lorentz tensors must have matching factor and feature dimensions")
-    kappa = _product_kappa(kappa, x.shape[1], x.device).to(dtype=torch.float32)
-    dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-    x = x.float()
-    y = y.float()
-    inner = -x[:, None, :, 0] * y[None, :, :, 0] + torch.einsum("bkd,nkd->bnk", x[..., 1:], y[..., 1:])
-    prod = (-kappa.view(1, 1, -1)) * inner
-    dist = torch.acosh(prod.clamp_min(1.0 + dist_eps)) / torch.sqrt(kappa).view(1, 1, -1)
-    if metric == "l1":
-        return dist.mean(dim=-1)
-    if metric == "l2":
-        return dist.square().mean(dim=-1).sqrt()
-    raise ValueError(f"Unsupported product metric {metric!r}; expected 'l1' or 'l2'")
-def metric_pairwise_dist(x: Tensor, y: Tensor, kappa: Tensor, product_metric: str = "l1") -> Tensor:
-    """Pairwise distance for either a single Lorentz space or a product space."""
-    if x.dim() == 3 or y.dim() == 3:
-        return product_pairwise_dist(x, y, kappa, metric=product_metric)
-    return pairwise_dist(x, y, kappa)
-def paired_dist(x: Tensor, y: Tensor, kappa: Tensor, product_metric: str = "l1", eps: float = 1e-8) -> Tensor:
-    """Row-wise distance for either a single Lorentz space or a product space."""
-    if x.dim() == 3 or y.dim() == 3:
-        if x.shape != y.shape:
-            raise ValueError("Product paired_dist expects matching tensor shapes")
-        kappa = _product_kappa(kappa, x.shape[1], x.device).to(dtype=torch.float32)
-        dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-        x = x.float()
-        y = y.float()
-        inner = -x[..., 0] * y[..., 0] + (x[..., 1:] * y[..., 1:]).sum(dim=-1)
-        prod = (-kappa.view(1, -1)) * inner
-        dist = torch.acosh(prod.clamp_min(1.0 + dist_eps)) / torch.sqrt(kappa).view(1, -1)
-        if product_metric == "l1":
-            return dist.mean(dim=-1)
-        if product_metric == "l2":
-            return dist.square().mean(dim=-1).sqrt()
-        raise ValueError(f"Unsupported product metric {product_metric!r}; expected 'l1' or 'l2'")
-    kappa = kappa.float()
-    dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-    prod = (-kappa) * lorentz_inner(x, y)
-    prod = prod.clamp_min(1.0 + dist_eps)
-    return torch.acosh(prod) / torch.sqrt(kappa)
-def radial_distance(x: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """Geodesic distance from the origin.
-    For points on the hyperboloid, the time coordinate satisfies
-    ``x0 = cosh(sqrt(kappa) * r) / sqrt(kappa)``, so we can recover the radial
-    distance via ``r = arcosh(sqrt(kappa) * x0) / sqrt(kappa)``.
-    """
-    dist_eps = max(eps, 16.0 * torch.finfo(x.dtype).eps)
-    x = x.float()
-    kappa = kappa.to(dtype=torch.float32).flatten()
-    if x.dim() == 2:
-        if kappa.numel() != 1:
-            raise ValueError("radial_distance expects scalar kappa for non-product embeddings")
-        sqrt_k = torch.sqrt(kappa.reshape(()))
-        arg = (sqrt_k * x[:, 0]).clamp_min(1.0 + dist_eps)
-        return torch.acosh(arg) / sqrt_k
-    if x.dim() == 3:
-        if kappa.numel() == 1:
-            kappa = kappa.expand(x.shape[1])
-        if kappa.numel() != x.shape[1]:
-            raise ValueError(f"Expected {x.shape[1]} curvatures for product space, got {kappa.numel()}")
-        sqrt_k = torch.sqrt(kappa).view(1, -1)
-        arg = (sqrt_k * x[..., 0]).clamp_min(1.0 + dist_eps)
-        dist = torch.acosh(arg) / sqrt_k
-        return dist.mean(dim=-1)
-    raise ValueError("radial_distance expects [batch, dim + 1] or [batch, factors, dim + 1] tensors")
-def metric_similarity(x: Tensor, y: Tensor, kappa: Tensor, product_metric: str = "l1") -> Tensor:
-    """Retrieval/classification similarity for single-space and PHyCLIP-style models."""
-    if x.dim() == 3 or y.dim() == 3:
-        return -product_pairwise_dist(x, y, kappa, metric=product_metric)
-    return pairwise_lorentz_inner(x, y)
-def half_aperture(general: Tensor, kappa: Tensor, min_radius: float = 0.1, eps: float = 1e-8) -> Tensor:
-    """Cone half-aperture for entailment cone centered at general concept."""
-    general = general.float()
-    kappa = kappa.float()
-    aperture_eps = max(eps, 16.0 * torch.finfo(general.dtype).eps)
-    general_norm = torch.linalg.norm(general[:, 1:], dim=-1)
-    ratio = (2.0 * min_radius) / (general_norm * torch.sqrt(kappa) + aperture_eps)
-    ratio = ratio.clamp(max=1.0 - aperture_eps)
-    return torch.asin(ratio)
-def oxy_angle(specific: Tensor, general: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """Exterior angle between specific point and entailment cone at general point."""
-    specific = specific.float()
-    general = general.float()
-    kappa = kappa.float()
-    angle_eps = max(eps, 16.0 * torch.finfo(specific.dtype).eps)
-    inner = lorentz_inner(specific, general)
-    numerator = specific[:, 0] + kappa * inner * general[:, 0]
-    general_norm = torch.linalg.norm(general[:, 1:], dim=-1).clamp_min(angle_eps)
-    denom_term = (kappa * inner).pow(2) - 1.0
-    denom = general_norm * torch.sqrt(denom_term.clamp_min(angle_eps))
-    cosine = (numerator / denom).clamp(min=-1.0 + angle_eps, max=1.0 - angle_eps)
-    return torch.acos(cosine)
-def pairwise_oxy_angle(specific: Tensor, general: Tensor, kappa: Tensor, eps: float = 1e-8) -> Tensor:
-    """All-pairs exterior angle between specific points and entailment cones at general points."""
-    specific = specific.float()
-    general = general.float()
-    kappa = kappa.to(dtype=torch.float32).flatten()
-    if kappa.numel() != 1:
-        raise ValueError("pairwise_oxy_angle expects scalar kappa for non-product embeddings")
-    kappa_scalar = kappa.reshape(())
-    angle_eps = max(eps, 16.0 * torch.finfo(specific.dtype).eps)
-    inner = -specific[:, None, 0] * general[None, :, 0] + torch.einsum("nd,md->nm", specific[:, 1:], general[:, 1:])
-    numerator = specific[:, None, 0] + kappa_scalar * inner * general[None, :, 0]
-    general_norm = torch.linalg.norm(general[:, 1:], dim=-1).clamp_min(angle_eps)
-    denom_term = (kappa_scalar * inner).pow(2) - 1.0
-    denom = general_norm[None, :] * torch.sqrt(denom_term.clamp_min(angle_eps))
-    cosine = (numerator / denom).clamp(min=-1.0 + angle_eps, max=1.0 - angle_eps)
-    return torch.acos(cosine)
-def product_pairwise_oxy_angle(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    metric: str = "l1",
-    eps: float = 1e-8,
-) -> Tensor:
-    """All-pairs exterior angle in an l1/l2 product of Lorentz factors."""
-    if specific.dim() != 3 or general.dim() != 3:
-        raise ValueError("product_pairwise_oxy_angle expects [batch, factors, dim + 1] tensors")
-    if specific.shape[1] != general.shape[1] or specific.shape[2] != general.shape[2]:
-        raise ValueError("Product Lorentz tensors must have matching factor and feature dimensions")
-    kappa = _product_kappa(kappa, specific.shape[1], specific.device).to(dtype=torch.float32)
-    angle_eps = max(eps, 16.0 * torch.finfo(specific.dtype).eps)
-    specific = specific.float()
-    general = general.float()
-    inner = -specific[:, None, :, 0] * general[None, :, :, 0] + torch.einsum(
-        "nkd,mkd->nmk",
-        specific[..., 1:],
-        general[..., 1:],
-    )
-    numerator = specific[:, None, :, 0] + (kappa.view(1, 1, -1) * inner) * general[None, :, :, 0]
-    general_norm = torch.linalg.norm(general[..., 1:], dim=-1).clamp_min(angle_eps)
-    denom_term = (kappa.view(1, 1, -1) * inner).pow(2) - 1.0
-    denom = general_norm[None, :, :] * torch.sqrt(denom_term.clamp_min(angle_eps))
-    cosine = (numerator / denom).clamp(min=-1.0 + angle_eps, max=1.0 - angle_eps)
-    angles = torch.acos(cosine)
-    if metric == "l1":
-        return angles.mean(dim=-1)
-    if metric == "l2":
-        return angles.square().mean(dim=-1).sqrt()
-    raise ValueError(f"Unsupported product metric {metric!r}; expected 'l1' or 'l2'")
-def metric_pairwise_oxy_angle(specific: Tensor, general: Tensor, kappa: Tensor, product_metric: str = "l1") -> Tensor:
-    """All-pairs oxy-angle for either a single Lorentz space or a product space."""
-    if specific.dim() == 3 or general.dim() == 3:
-        return product_pairwise_oxy_angle(specific, general, kappa, metric=product_metric)
-    return pairwise_oxy_angle(specific, general, kappa)
-def _product_kappa(kappa: Tensor, num_factors: int, device: torch.device) -> Tensor:
-    kappa = kappa.to(device=device, dtype=torch.float32).flatten()
-    if kappa.numel() == 1:
-        return kappa.expand(num_factors)
-    if kappa.numel() != num_factors:
-        raise ValueError(f"Expected {num_factors} curvatures for product space, got {kappa.numel()}")
-    return kappa

hyper3_clip/models/losses.py DELETED Viewed

@@ -1,1400 +0,0 @@
-from __future__ import annotations
-import math
-import torch
-from torch import Tensor
-import torch.nn.functional as F
-from hyper3_clip.models.lorentz import (
-    half_aperture,
-    metric_pairwise_dist,
-    metric_pairwise_oxy_angle,
-    oxy_angle,
-    paired_dist,
-    radial_distance,
-)
-def contrastive_ce(logits: Tensor, targets: Tensor | None = None, weights: Tensor | None = None) -> Tensor:
-    if targets is None:
-        targets = torch.arange(logits.size(0), device=logits.device)
-    losses = F.cross_entropy(logits, targets, reduction="none")
-    return weighted_mean(losses, weights)
-def contrastive_sigmoid(
-    logits: Tensor,
-    targets: Tensor | None = None,
-    weights: Tensor | None = None,
-    negative_weight: float = 1.0,
-) -> Tensor:
-    if targets is None:
-        targets = torch.arange(logits.size(0), device=logits.device)
-    labels = torch.zeros_like(logits)
-    labels[torch.arange(logits.size(0), device=logits.device), targets] = 1.0
-    losses = F.binary_cross_entropy_with_logits(logits, labels, reduction="none")
-    if negative_weight != 1.0:
-        element_weights = torch.where(labels > 0.0, torch.ones_like(labels), logits.new_full((), negative_weight))
-        losses = losses * element_weights
-    losses = losses.mean(dim=1)
-    return weighted_mean(losses, weights)
-def contrastive_siglip(
-    logits: Tensor,
-    targets: Tensor | None = None,
-    weights: Tensor | None = None,
-    negative_weight: float = 1.0,
-) -> Tensor:
-    """SigLIP pairwise sigmoid loss (Zhai et al., ICCV 2023).
-    Uses labels in {+1, -1} with a per-row sum (not mean) over pairs:
-      L_i = sum_j softplus(- y_ij * logit_ij)
-    """
-    if logits.ndim != 2:
-        raise ValueError("contrastive_siglip expects a [batch, classes] logit matrix")
-    if targets is None:
-        targets = torch.arange(logits.size(0), device=logits.device)
-    labels = logits.new_full(logits.shape, -1.0)
-    labels[torch.arange(logits.size(0), device=logits.device), targets] = 1.0
-    losses = F.softplus(-(labels * logits))
-    if negative_weight != 1.0:
-        element_weights = torch.where(labels > 0.0, torch.ones_like(labels), logits.new_full((), negative_weight))
-        losses = losses * element_weights
-    row_losses = losses.sum(dim=1)
-    return weighted_mean(row_losses, weights)
-def weighted_mean(values: Tensor, weights: Tensor | None = None) -> Tensor:
-    if weights is None:
-        return values.mean()
-    weights = weights.to(device=values.device, dtype=values.dtype)
-    while weights.dim() < values.dim():
-        weights = weights.unsqueeze(-1)
-    return (values * weights).sum() / weights.sum().clamp_min(torch.finfo(values.dtype).eps)
-def gramian_volume_loss(vectors: Tensor, weights: Tensor | None = None, eps: float = 1e-4) -> Tensor:
-    """GRAM-style volume loss for sets of vectors.
-    ``vectors`` is expected to have shape ``[batch, k, dim]``. Each set of k
-    vectors is L2-normalized along ``dim``, then we compute the Gramian
-    ``G = V V^T`` and return ``sqrt(det(G + eps I))`` averaged over the batch.
-    """
-    if vectors.ndim != 3:
-        raise ValueError("gramian_volume_loss expects a [batch, k, dim] tensor")
-    if eps <= 0.0:
-        raise ValueError("gramian_volume_loss eps must be positive")
-    vectors = F.normalize(vectors.float(), dim=-1, eps=1e-8)
-    gram = vectors @ vectors.transpose(-1, -2)
-    k = gram.size(-1)
-    gram = gram + eps * torch.eye(k, device=gram.device, dtype=gram.dtype)
-    sign, logabsdet = torch.linalg.slogdet(gram)
-    volume = torch.exp(0.5 * logabsdet)
-    volume = torch.where(sign > 0, volume, volume.new_ones(volume.shape))
-    return weighted_mean(volume, weights)
-def radius_order_hinge(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    margin: float,
-    weights: Tensor | None = None,
-) -> Tensor:
-    if specific.shape[0] != general.shape[0]:
-        raise ValueError("radius_order_hinge expects matching batch dimensions")
-    if margin < 0.0:
-        raise ValueError("radius_order_hinge margin must be non-negative")
-    specific_radius = radial_distance(specific, kappa)
-    general_radius = radial_distance(general, kappa)
-    losses = F.relu(float(margin) + general_radius - specific_radius)
-    return weighted_mean(losses, weights)
-def soft_contrastive_ce(logits: Tensor, target_weights: Tensor, weights: Tensor | None = None) -> Tensor:
-    if logits.ndim != 2 or target_weights.ndim != 2:
-        raise ValueError("soft_contrastive_ce expects [batch, classes] tensors")
-    if logits.shape != target_weights.shape:
-        raise ValueError("soft_contrastive_ce requires logits and target_weights to have matching shapes")
-    log_probs = F.log_softmax(logits, dim=1)
-    losses = -(target_weights.to(dtype=log_probs.dtype) * log_probs).sum(dim=1)
-    return weighted_mean(losses, weights)
-def beta_cal_loss(
-    logits: Tensor,
-    *,
-    targets: Tensor,
-    group_ids: Tensor,
-    all_group_ids: Tensor,
-    beta: float,
-    variant: str,
-    weights: Tensor | None = None,
-) -> Tensor:
-    if beta < 0.0:
-        raise ValueError("beta_cal_loss beta must be non-negative")
-    if variant not in {"ce", "bce"}:
-        raise ValueError("beta_cal_loss variant must be 'ce' or 'bce'")
-    if logits.ndim != 2:
-        raise ValueError("beta_cal_loss expects a [batch, classes] logit matrix")
-    if targets.shape != (logits.size(0),):
-        raise ValueError("beta_cal_loss targets must have shape [batch]")
-    if group_ids.shape != (logits.size(0),):
-        raise ValueError("beta_cal_loss group_ids must have shape [batch]")
-    if all_group_ids.shape != (logits.size(1),):
-        raise ValueError("beta_cal_loss all_group_ids must have shape [classes]")
-    same_group = group_ids[:, None] == all_group_ids[None, :]
-    same_pair = targets[:, None] == torch.arange(logits.size(1), device=logits.device)[None, :]
-    if variant == "ce":
-        target_weights = logits.new_zeros(logits.shape)
-        target_weights = torch.where(same_pair, logits.new_ones(()), target_weights)
-        target_weights = torch.where(same_group & ~same_pair, logits.new_full((), float(beta)), target_weights)
-        target_weights = target_weights / target_weights.sum(dim=1, keepdim=True).clamp_min(
-            torch.finfo(target_weights.dtype).eps
-        )
-        return soft_contrastive_ce(logits, target_weights, weights)
-    labels = same_group.to(dtype=logits.dtype)
-    element_weights = logits.new_ones(logits.shape)
-    element_weights = torch.where(same_group & ~same_pair, logits.new_full((), float(beta)), element_weights)
-    element_losses = F.binary_cross_entropy_with_logits(logits, labels, reduction="none") * element_weights
-    row_losses = element_losses.mean(dim=1)
-    return weighted_mean(row_losses, weights)
-def compositional_contrastive_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    box_image_feats: Tensor,
-    box_text_feats: Tensor,
-    kappa: Tensor,
-    logit_scale: Tensor,
-    all_image_feats: Tensor | None = None,
-    all_text_feats: Tensor | None = None,
-    targets: Tensor | None = None,
-) -> Tensor:
-    scale = logit_scale.exp().clamp(max=100.0)
-    all_image_feats = image_feats if all_image_feats is None else all_image_feats
-    all_text_feats = text_feats if all_text_feats is None else all_text_feats
-    logits_i_t = -metric_pairwise_dist(image_feats, all_text_feats, kappa) * scale
-    logits_t_i = -metric_pairwise_dist(text_feats, all_image_feats, kappa) * scale
-    logits_bi_t = -metric_pairwise_dist(box_image_feats, all_text_feats, kappa) * scale
-    logits_bt_i = -metric_pairwise_dist(box_text_feats, all_image_feats, kappa) * scale
-    return 0.25 * (
-        contrastive_ce(logits_i_t, targets)
-        + contrastive_ce(logits_t_i, targets)
-        + contrastive_ce(logits_bi_t, targets)
-        + contrastive_ce(logits_bt_i, targets)
-    )
-def multi_part_contrastive_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_feats: Tensor,
-    part_text_feats: Tensor,
-    part_mask: Tensor,
-    kappa: Tensor,
-    logit_scale: Tensor,
-    all_image_feats: Tensor | None = None,
-    all_text_feats: Tensor | None = None,
-    targets: Tensor | None = None,
-) -> Tensor:
-    scale = logit_scale.exp().clamp(max=100.0)
-    all_image_feats = image_feats if all_image_feats is None else all_image_feats
-    all_text_feats = text_feats if all_text_feats is None else all_text_feats
-    if targets is None:
-        targets = torch.arange(image_feats.size(0), device=image_feats.device)
-    part_image_flat, part_text_flat, part_targets = _flatten_valid_parts(part_image_feats, part_text_feats, part_mask, targets)
-    logits_i_t = -metric_pairwise_dist(image_feats, all_text_feats, kappa) * scale
-    logits_t_i = -metric_pairwise_dist(text_feats, all_image_feats, kappa) * scale
-    logits_pi_t = -metric_pairwise_dist(part_image_flat, all_text_feats, kappa) * scale
-    logits_pt_i = -metric_pairwise_dist(part_text_flat, all_image_feats, kappa) * scale
-    return 0.25 * (
-        contrastive_ce(logits_i_t, targets)
-        + contrastive_ce(logits_t_i, targets)
-        + contrastive_ce(logits_pi_t, part_targets)
-        + contrastive_ce(logits_pt_i, part_targets)
-    )
-def packed_part_contrastive_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_feats: Tensor,
-    part_text_feats: Tensor,
-    part_owner: Tensor,
-    kappa: Tensor,
-    logit_scale: Tensor,
-    all_image_feats: Tensor | None = None,
-    all_text_feats: Tensor | None = None,
-    targets: Tensor | None = None,
-) -> Tensor:
-    scale = logit_scale.exp().clamp(max=100.0)
-    all_image_feats = image_feats if all_image_feats is None else all_image_feats
-    all_text_feats = text_feats if all_text_feats is None else all_text_feats
-    if targets is None:
-        targets = torch.arange(image_feats.size(0), device=image_feats.device)
-    logits_i_t = -metric_pairwise_dist(image_feats, all_text_feats, kappa) * scale
-    logits_t_i = -metric_pairwise_dist(text_feats, all_image_feats, kappa) * scale
-    global_loss = 0.5 * (contrastive_ce(logits_i_t, targets) + contrastive_ce(logits_t_i, targets))
-    if part_image_feats.numel() == 0:
-        return global_loss
-    part_targets = targets[part_owner]
-    logits_pi_t = -metric_pairwise_dist(part_image_feats, all_text_feats, kappa) * scale
-    logits_pt_i = -metric_pairwise_dist(part_text_feats, all_image_feats, kappa) * scale
-    part_loss = 0.5 * (contrastive_ce(logits_pi_t, part_targets) + contrastive_ce(logits_pt_i, part_targets))
-    return 0.5 * (global_loss + part_loss)
-def factor_oxy_angle(specific: Tensor, general: Tensor, kappa: Tensor) -> Tensor:
-    if specific.dim() != 3:
-        return oxy_angle(specific=specific, general=general, kappa=kappa)
-    batch_size, num_factors, feature_dim = specific.shape
-    kappa = _factor_kappa(kappa, num_factors, specific.device)
-    factor_kappa = kappa.view(1, num_factors).expand(batch_size, num_factors).reshape(-1)
-    return oxy_angle(
-        specific=specific.reshape(batch_size * num_factors, feature_dim),
-        general=general.reshape(batch_size * num_factors, feature_dim),
-        kappa=factor_kappa,
-    ).reshape(batch_size, num_factors)
-def factor_half_aperture(general: Tensor, kappa: Tensor) -> Tensor:
-    if general.dim() != 3:
-        return half_aperture(general=general, kappa=kappa)
-    batch_size, num_factors, feature_dim = general.shape
-    kappa = _factor_kappa(kappa, num_factors, general.device)
-    factor_kappa = kappa.view(1, num_factors).expand(batch_size, num_factors).reshape(-1)
-    return half_aperture(
-        general=general.reshape(batch_size * num_factors, feature_dim),
-        kappa=factor_kappa,
-    ).reshape(batch_size, num_factors)
-def _factor_kappa(kappa: Tensor, num_factors: int, device: torch.device) -> Tensor:
-    kappa = kappa.to(device=device, dtype=torch.float32).flatten()
-    if kappa.numel() == 1:
-        return kappa.expand(num_factors)
-    if kappa.numel() != num_factors:
-        raise ValueError(f"Expected {num_factors} curvatures for product space, got {kappa.numel()}")
-    return kappa
-def entailment_residual(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    aperture_scale: float,
-) -> Tensor:
-    angles = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
-    apertures = factor_half_aperture(general=general, kappa=kappa)
-    return torch.clamp(angles - (aperture_scale * apertures), min=0.0).mean()
-def weighted_entailment_residual(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    aperture_scale: float,
-    weights: Tensor | None = None,
-) -> Tensor:
-    angles = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
-    apertures = factor_half_aperture(general=general, kappa=kappa)
-    residuals = torch.clamp(angles - (aperture_scale * apertures), min=0.0)
-    if residuals.dim() == 2:
-        residuals = residuals.mean(dim=-1)
-    return weighted_mean(residuals, weights)
-def compositional_entailment_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    box_image_feats: Tensor,
-    box_text_feats: Tensor,
-    kappa: Tensor,
-    inter_aperture_scale: float,
-    intra_aperture_scale: float,
-) -> Tensor:
-    text_to_image = entailment_residual(
-        specific=image_feats,
-        general=text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    box_text_to_box_image = entailment_residual(
-        specific=box_image_feats,
-        general=box_text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    box_image_to_image = entailment_residual(
-        specific=image_feats,
-        general=box_image_feats,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    box_text_to_text = entailment_residual(
-        specific=text_feats,
-        general=box_text_feats,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    return 0.5 * (text_to_image + box_text_to_box_image + box_image_to_image + box_text_to_text)
-def multi_part_entailment_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_feats: Tensor,
-    part_text_feats: Tensor,
-    part_mask: Tensor,
-    kappa: Tensor,
-    inter_aperture_scale: float,
-    intra_aperture_scale: float,
-) -> Tensor:
-    part_image_flat = part_image_feats[part_mask]
-    part_text_flat = part_text_feats[part_mask]
-    image_for_parts = image_feats[:, None, :].expand_as(part_image_feats)[part_mask]
-    text_for_parts = text_feats[:, None, :].expand_as(part_text_feats)[part_mask]
-    text_to_image = entailment_residual(
-        specific=image_feats,
-        general=text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    part_text_to_part_image = entailment_residual(
-        specific=part_image_flat,
-        general=part_text_flat,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    part_image_to_image = entailment_residual(
-        specific=image_for_parts,
-        general=part_image_flat,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    part_text_to_text = entailment_residual(
-        specific=text_for_parts,
-        general=part_text_flat,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    return 0.5 * (text_to_image + part_text_to_part_image + part_image_to_image + part_text_to_text)
-def packed_part_entailment_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_feats: Tensor,
-    part_text_feats: Tensor,
-    part_owner: Tensor,
-    kappa: Tensor,
-    inter_aperture_scale: float,
-    intra_aperture_scale: float,
-) -> Tensor:
-    text_to_image = entailment_residual(
-        specific=image_feats,
-        general=text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    if part_image_feats.numel() == 0:
-        return text_to_image
-    image_for_parts = image_feats[part_owner]
-    text_for_parts = text_feats[part_owner]
-    part_text_to_part_image = entailment_residual(
-        specific=part_image_feats,
-        general=part_text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-    )
-    part_image_to_image = entailment_residual(
-        specific=image_for_parts,
-        general=part_image_feats,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    part_text_to_text = entailment_residual(
-        specific=text_for_parts,
-        general=part_text_feats,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-    )
-    return 0.5 * (text_to_image + part_text_to_part_image + part_image_to_image + part_text_to_text)
-def uncha_contrastive_losses(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    image_for_parts: Tensor,
-    text_for_parts: Tensor,
-    kappa: Tensor,
-    global_logit_scale: Tensor,
-    local_logit_scale: Tensor,
-    global_local_logit_scale: Tensor,
-    image_euc_feats: Tensor | None = None,
-    text_euc_feats: Tensor | None = None,
-    part_image_euc_flat: Tensor | None = None,
-    part_text_euc_flat: Tensor | None = None,
-    image_for_parts_euc: Tensor | None = None,
-    text_for_parts_euc: Tensor | None = None,
-    all_image_feats: Tensor | None = None,
-    all_text_feats: Tensor | None = None,
-    all_part_image_feats: Tensor | None = None,
-    all_part_text_feats: Tensor | None = None,
-    all_image_for_parts: Tensor | None = None,
-    all_text_for_parts: Tensor | None = None,
-    all_image_euc_feats: Tensor | None = None,
-    all_text_euc_feats: Tensor | None = None,
-    all_part_image_euc_feats: Tensor | None = None,
-    all_part_text_euc_feats: Tensor | None = None,
-    all_image_for_parts_euc: Tensor | None = None,
-    all_text_for_parts_euc: Tensor | None = None,
-    global_targets: Tensor | None = None,
-    part_targets: Tensor | None = None,
-    part_weights: Tensor | None = None,
-    product_metric: str = "l1",
-    loss_type: str = "ce",
-    contrastive_global_weight: float = 1.0,
-    contrastive_local_weight: float = 1.0,
-    contrastive_global_local_weight: float = 1.0,
-    beta_cal_beta: float = 0.0,
-    beta_cal_variant: str = "ce",
-    beta_cal_weight: float = 0.0,
-    part_group_ids: Tensor | None = None,
-    all_part_group_ids: Tensor | None = None,
-    global_logit_bias: Tensor | None = None,
-    local_logit_bias: Tensor | None = None,
-    global_local_logit_bias: Tensor | None = None,
-    sigmoid_negative_weight: float = 1.0,
-    global_local_mode: str = "repeat",
-    global_local_metric: str = "distance",
-    global_local_angle_aux_weight: float = 0.0,
-    global_local_angle_aux_mode: str = "contrastive",
-    global_local_angle_aux_scale: float = 5.5,
-    global_local_angle_aux_aperture_scale: float = 1.0,
-) -> dict[str, Tensor]:
-    if loss_type not in {"ce", "sigmoid", "siglip", "siglip_metric"}:
-        raise ValueError(
-            f"Unsupported contrastive loss {loss_type!r}; expected 'ce', 'sigmoid', 'siglip', or 'siglip_metric'"
-        )
-    if global_local_mode not in {"repeat", "inbatch"}:
-        raise ValueError("global_local_mode must be 'repeat' or 'inbatch'")
-    if global_local_metric not in {"distance", "angle"}:
-        raise ValueError("global_local_metric must be 'distance' or 'angle'")
-    if global_local_angle_aux_mode not in {"contrastive", "positive_hinge"}:
-        raise ValueError("global_local_angle_aux_mode must be 'contrastive' or 'positive_hinge'")
-    if global_local_angle_aux_weight < 0.0:
-        raise ValueError("global_local_angle_aux_weight must be non-negative")
-    if global_local_angle_aux_scale <= 0.0:
-        raise ValueError("global_local_angle_aux_scale must be positive")
-    if global_local_angle_aux_aperture_scale <= 0.0:
-        raise ValueError("global_local_angle_aux_aperture_scale must be positive")
-    all_image_feats = image_feats if all_image_feats is None else all_image_feats
-    all_text_feats = text_feats if all_text_feats is None else all_text_feats
-    all_part_image_feats = part_image_flat if all_part_image_feats is None else all_part_image_feats
-    all_part_text_feats = part_text_flat if all_part_text_feats is None else all_part_text_feats
-    all_image_for_parts = image_for_parts if all_image_for_parts is None else all_image_for_parts
-    all_text_for_parts = text_for_parts if all_text_for_parts is None else all_text_for_parts
-    if global_targets is None:
-        global_targets = torch.arange(image_feats.size(0), device=image_feats.device)
-    if part_targets is None:
-        part_targets = torch.arange(part_image_flat.size(0), device=part_image_flat.device)
-    global_scale = global_logit_scale.exp().clamp(max=100.0)
-    local_scale = local_logit_scale.exp().clamp(max=100.0)
-    global_local_scale = global_local_logit_scale.exp().clamp(max=100.0)
-    if loss_type == "siglip":
-        if image_euc_feats is None or text_euc_feats is None:
-            raise ValueError("siglip contrastive requires image_euc_feats and text_euc_feats")
-        if image_feats.dim() != 2 or text_feats.dim() != 2:
-            raise ValueError("siglip contrastive is only supported for non-product features")
-        all_image_euc_feats = image_euc_feats if all_image_euc_feats is None else all_image_euc_feats
-        all_text_euc_feats = text_euc_feats if all_text_euc_feats is None else all_text_euc_feats
-        zimg = F.normalize(image_euc_feats.float(), dim=-1)
-        ztxt = F.normalize(text_euc_feats.float(), dim=-1)
-        zimg_all = F.normalize(all_image_euc_feats.float(), dim=-1)
-        ztxt_all = F.normalize(all_text_euc_feats.float(), dim=-1)
-        image_logits = (zimg @ ztxt_all.T) * global_scale
-        text_logits = (ztxt @ zimg_all.T) * global_scale
-    else:
-        image_logits = -metric_pairwise_dist(image_feats, all_text_feats, kappa, product_metric=product_metric) * global_scale
-        text_logits = -metric_pairwise_dist(text_feats, all_image_feats, kappa, product_metric=product_metric) * global_scale
-    if loss_type in {"sigmoid", "siglip", "siglip_metric"}:
-        bias = image_logits.new_zeros(()) if global_logit_bias is None else global_logit_bias.to(image_logits.device)
-        image_logits = image_logits + bias
-        text_logits = text_logits + bias
-    global_contrastive = 0.5 * (
-        _contrastive_loss(image_logits, global_targets, None, loss_type, sigmoid_negative_weight)
-        + _contrastive_loss(text_logits, global_targets, None, loss_type, sigmoid_negative_weight)
-    )
-    if part_image_flat.numel() == 0:
-        zero = image_feats.new_zeros(())
-        contrastive = contrastive_global_weight * global_contrastive
-        return {
-            "contrastive_loss": contrastive,
-            "global_contrastive_loss": global_contrastive,
-            "local_contrastive_loss": zero,
-            "global_local_contrastive_loss": zero,
-            "global_local_angle_aux_loss": zero,
-            "beta_cal_loss": zero,
-        }
-    if loss_type == "siglip":
-        if part_image_euc_flat is None or part_text_euc_flat is None:
-            raise ValueError("siglip contrastive requires part_image_euc_flat and part_text_euc_flat when parts exist")
-        all_part_image_euc_feats = part_image_euc_flat if all_part_image_euc_feats is None else all_part_image_euc_feats
-        all_part_text_euc_feats = part_text_euc_flat if all_part_text_euc_feats is None else all_part_text_euc_feats
-        zpi = F.normalize(part_image_euc_flat.float(), dim=-1)
-        zpt = F.normalize(part_text_euc_flat.float(), dim=-1)
-        zpi_all = F.normalize(all_part_image_euc_feats.float(), dim=-1)
-        zpt_all = F.normalize(all_part_text_euc_feats.float(), dim=-1)
-        part_image_logits = (zpi @ zpt_all.T) * local_scale
-        part_text_logits = (zpt @ zpi_all.T) * local_scale
-    else:
-        part_image_logits = -metric_pairwise_dist(part_image_flat, all_part_text_feats, kappa, product_metric=product_metric) * local_scale
-        part_text_logits = -metric_pairwise_dist(part_text_flat, all_part_image_feats, kappa, product_metric=product_metric) * local_scale
-    if loss_type in {"sigmoid", "siglip", "siglip_metric"}:
-        bias = part_image_logits.new_zeros(()) if local_logit_bias is None else local_logit_bias.to(part_image_logits.device)
-        part_image_logits = part_image_logits + bias
-        part_text_logits = part_text_logits + bias
-    local_contrastive = 0.5 * (
-        _contrastive_loss(part_image_logits, part_targets, part_weights, loss_type, sigmoid_negative_weight)
-        + _contrastive_loss(part_text_logits, part_targets, part_weights, loss_type, sigmoid_negative_weight)
-    )
-    global_local_contrastive = image_feats.new_zeros(())
-    global_local_angle_aux = image_feats.new_zeros(())
-    if contrastive_global_local_weight != 0.0:
-        if global_local_mode == "inbatch":
-            if part_group_ids is None:
-                raise ValueError("inbatch global-local contrastive requires part_group_ids to be provided")
-            global_local_targets = part_group_ids
-            all_text_for_global_local = all_text_feats
-            all_image_for_global_local = all_image_feats
-            all_text_for_global_local_euc = all_text_euc_feats
-            all_image_for_global_local_euc = all_image_euc_feats
-        else:
-            global_local_targets = part_targets
-            all_text_for_global_local = all_text_for_parts
-            all_image_for_global_local = all_image_for_parts
-            all_text_for_global_local_euc = all_text_for_parts_euc
-            all_image_for_global_local_euc = all_image_for_parts_euc
-        image_uncertainty = embedding_uncertainty(part_image_flat).detach()
-        text_uncertainty = embedding_uncertainty(part_text_flat).detach()
-        image_temp = torch.exp(-0.5 * image_uncertainty).clamp(min=0.1, max=10.0)
-        text_temp = torch.exp(-0.5 * text_uncertainty).clamp(min=0.1, max=10.0)
-        if loss_type == "siglip":
-            if part_image_euc_flat is None or part_text_euc_flat is None:
-                raise ValueError("siglip global-local contrastive requires part_image_euc_flat/part_text_euc_flat")
-            if all_text_for_global_local_euc is None or all_image_for_global_local_euc is None:
-                raise ValueError("siglip global-local contrastive requires all_image_euc_feats/all_text_euc_feats")
-            zpi = F.normalize(part_image_euc_flat.float(), dim=-1)
-            zpt = F.normalize(part_text_euc_flat.float(), dim=-1)
-            zimg_all = F.normalize(all_image_for_global_local_euc.float(), dim=-1)
-            ztxt_all = F.normalize(all_text_for_global_local_euc.float(), dim=-1)
-            part_image_to_whole_text = (zpi @ ztxt_all.T) * image_temp[:, None] * global_local_scale
-            part_text_to_whole_image = (zpt @ zimg_all.T) * text_temp[:, None] * global_local_scale
-        else:
-            if global_local_metric == "angle":
-                part_image_to_whole_text = -metric_pairwise_oxy_angle(
-                    part_image_flat,
-                    all_text_for_global_local,
-                    kappa,
-                    product_metric=product_metric,
-                )
-                part_text_to_whole_image = -metric_pairwise_oxy_angle(
-                    part_text_flat,
-                    all_image_for_global_local,
-                    kappa,
-                    product_metric=product_metric,
-                )
-            else:
-                part_image_to_whole_text = -metric_pairwise_dist(
-                    part_image_flat, all_text_for_global_local, kappa, product_metric=product_metric
-                )
-                part_text_to_whole_image = -metric_pairwise_dist(
-                    part_text_flat, all_image_for_global_local, kappa, product_metric=product_metric
-                )
-            part_image_to_whole_text = part_image_to_whole_text * image_temp[:, None] * global_local_scale
-            part_text_to_whole_image = part_text_to_whole_image * text_temp[:, None] * global_local_scale
-        if loss_type in {"sigmoid", "siglip", "siglip_metric"}:
-            bias = (
-                part_image_to_whole_text.new_zeros(())
-                if global_local_logit_bias is None
-                else global_local_logit_bias.to(part_image_to_whole_text.device)
-            )
-            part_image_to_whole_text = part_image_to_whole_text + bias
-            part_text_to_whole_image = part_text_to_whole_image + bias
-        global_local_contrastive = 0.5 * (
-            _contrastive_loss(part_image_to_whole_text, global_local_targets, part_weights, loss_type, sigmoid_negative_weight)
-            + _contrastive_loss(part_text_to_whole_image, global_local_targets, part_weights, loss_type, sigmoid_negative_weight)
-        )
-        if global_local_angle_aux_weight > 0.0:
-            if global_local_angle_aux_mode == "positive_hinge":
-                positive_text = all_text_for_global_local.index_select(0, global_local_targets)
-                positive_image = all_image_for_global_local.index_select(0, global_local_targets)
-                global_local_angle_aux = 0.5 * (
-                    weighted_entailment_residual(
-                        specific=part_image_flat,
-                        general=positive_text,
-                        kappa=kappa,
-                        aperture_scale=global_local_angle_aux_aperture_scale,
-                        weights=part_weights,
-                    )
-                    + weighted_entailment_residual(
-                        specific=part_text_flat,
-                        general=positive_image,
-                        kappa=kappa,
-                        aperture_scale=global_local_angle_aux_aperture_scale,
-                        weights=part_weights,
-                    )
-                )
-            elif loss_type != "siglip":
-                angle_scale = part_image_flat.new_tensor(float(global_local_angle_aux_scale))
-                part_image_to_whole_text_angle = -metric_pairwise_oxy_angle(
-                    part_image_flat,
-                    all_text_for_global_local,
-                    kappa,
-                    product_metric=product_metric,
-                ) * image_temp[:, None] * angle_scale
-                part_text_to_whole_image_angle = -metric_pairwise_oxy_angle(
-                    part_text_flat,
-                    all_image_for_global_local,
-                    kappa,
-                    product_metric=product_metric,
-                ) * text_temp[:, None] * angle_scale
-                if loss_type in {"sigmoid", "siglip_metric"}:
-                    bias = (
-                        part_image_to_whole_text_angle.new_zeros(())
-                        if global_local_logit_bias is None
-                        else global_local_logit_bias.to(part_image_to_whole_text_angle.device)
-                    )
-                    part_image_to_whole_text_angle = part_image_to_whole_text_angle + bias
-                    part_text_to_whole_image_angle = part_text_to_whole_image_angle + bias
-                global_local_angle_aux = 0.5 * (
-                    _contrastive_loss(
-                        part_image_to_whole_text_angle,
-                        global_local_targets,
-                        part_weights,
-                        loss_type,
-                        sigmoid_negative_weight,
-                    )
-                    + _contrastive_loss(
-                        part_text_to_whole_image_angle,
-                        global_local_targets,
-                        part_weights,
-                        loss_type,
-                        sigmoid_negative_weight,
-                    )
-                )
-    beta_cal = image_feats.new_zeros(())
-    if beta_cal_weight > 0.0 and beta_cal_beta > 0.0:
-        if part_group_ids is None or all_part_group_ids is None:
-            raise ValueError("beta_cal requires part_group_ids and all_part_group_ids to be provided")
-        beta_cal = 0.5 * (
-            beta_cal_loss(
-                part_image_logits,
-                targets=part_targets,
-                group_ids=part_group_ids,
-                all_group_ids=all_part_group_ids,
-                beta=beta_cal_beta,
-                variant=beta_cal_variant,
-                weights=part_weights,
-            )
-            + beta_cal_loss(
-                part_text_logits,
-                targets=part_targets,
-                group_ids=part_group_ids,
-                all_group_ids=all_part_group_ids,
-                beta=beta_cal_beta,
-                variant=beta_cal_variant,
-                weights=part_weights,
-            )
-        )
-    contrastive = (
-        contrastive_global_weight * global_contrastive
-        + contrastive_local_weight * local_contrastive
-        + contrastive_global_local_weight * global_local_contrastive
-        + global_local_angle_aux_weight * global_local_angle_aux
-        + beta_cal_weight * beta_cal
-    )
-    return {
-        "contrastive_loss": contrastive,
-        "global_contrastive_loss": global_contrastive,
-        "local_contrastive_loss": local_contrastive,
-        "global_local_contrastive_loss": global_local_contrastive,
-        "global_local_angle_aux_loss": global_local_angle_aux,
-        "beta_cal_loss": beta_cal,
-    }
-def _contrastive_loss(
-    logits: Tensor,
-    targets: Tensor,
-    weights: Tensor | None,
-    loss_type: str,
-    sigmoid_negative_weight: float,
-) -> Tensor:
-    if loss_type == "ce":
-        return contrastive_ce(logits, targets, weights)
-    if loss_type == "sigmoid":
-        return contrastive_sigmoid(logits, targets, weights, negative_weight=sigmoid_negative_weight)
-    if loss_type in {"siglip", "siglip_metric"}:
-        return contrastive_siglip(logits, targets, weights, negative_weight=sigmoid_negative_weight)
-    raise ValueError(f"Unsupported contrastive loss {loss_type!r}")
-def uncha_entailment_losses(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    image_for_parts: Tensor,
-    text_for_parts: Tensor,
-    kappa: Tensor,
-    inter_aperture_scale: float,
-    intra_aperture_scale: float,
-    piecewise_factor: float = 0.1,
-    calibration_alpha: float = 10.0,
-    stop_grad_calibration: bool = True,
-    geometry: str = "lorentz",
-    part_weights: Tensor | None = None,
-) -> dict[str, Tensor]:
-    text_image = piecewise_entailment_residual(
-        specific=image_feats,
-        general=text_feats,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-        factor=piecewise_factor,
-        geometry=geometry,
-    )
-    text_image_entailment = 0.5 * text_image.mean()
-    if part_image_flat.numel() == 0:
-        zero = image_feats.new_zeros(())
-        return {
-            "entailment_loss": text_image_entailment,
-            "text_image_entailment_loss": text_image_entailment,
-            "part_text_image_entailment_loss": zero,
-            "cross_image_entailment_loss": zero,
-            "cross_text_entailment_loss": zero,
-            "cross_image_calibration_loss": zero,
-            "cross_text_calibration_loss": zero,
-        }
-    part_text_image = piecewise_entailment_residual(
-        specific=part_image_flat,
-        general=part_text_flat,
-        kappa=kappa,
-        aperture_scale=inter_aperture_scale,
-        factor=piecewise_factor,
-        geometry=geometry,
-    )
-    cross_image = piecewise_entailment_residual(
-        specific=image_for_parts,
-        general=part_image_flat,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-        factor=piecewise_factor,
-        geometry=geometry,
-    )
-    cross_text = piecewise_entailment_residual(
-        specific=text_for_parts,
-        general=part_text_flat,
-        kappa=kappa,
-        aperture_scale=intra_aperture_scale,
-        factor=piecewise_factor,
-        geometry=geometry,
-    )
-    part_text_image_entailment = 0.5 * weighted_mean(part_text_image, part_weights)
-    cross_image_entailment, cross_image_calibration = uncertainty_calibrated_entailment_loss(
-        cross_image,
-        embedding_uncertainty(part_image_flat),
-        alpha=calibration_alpha,
-        stop_grad=stop_grad_calibration,
-        weights=part_weights,
-    )
-    cross_text_entailment, cross_text_calibration = uncertainty_calibrated_entailment_loss(
-        cross_text,
-        embedding_uncertainty(part_text_flat),
-        alpha=calibration_alpha,
-        stop_grad=stop_grad_calibration,
-        weights=part_weights,
-    )
-    entailment = (
-        text_image_entailment
-        + part_text_image_entailment
-        + 0.5 * (cross_image_entailment + cross_text_entailment)
-        + cross_image_calibration
-        + cross_text_calibration
-    )
-    return {
-        "entailment_loss": entailment,
-        "text_image_entailment_loss": text_image_entailment,
-        "part_text_image_entailment_loss": part_text_image_entailment,
-        "cross_image_entailment_loss": cross_image_entailment,
-        "cross_text_entailment_loss": cross_text_entailment,
-        "cross_image_calibration_loss": cross_image_calibration,
-        "cross_text_calibration_loss": cross_text_calibration,
-    }
-def uncha_argent_entailment_losses(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    image_for_parts: Tensor,
-    text_for_parts: Tensor,
-    kappa: Tensor,
-    beta: float = 1.0,
-    part_weights: Tensor | None = None,
-    product_metric: str = "l1",
-    aggregation: str = "uncha",
-) -> dict[str, Tensor]:
-    if aggregation not in {"uncha", "equal"}:
-        raise ValueError("aggregation must be 'uncha' or 'equal'")
-    text_image = argent_adaptive_entailment_residual(
-        specific=image_feats,
-        general=text_feats,
-        kappa=kappa,
-        adaptive_weight=False,
-        beta=beta,
-        product_metric=product_metric,
-    )
-    text_image_entailment = 0.5 * text_image.mean()
-    if part_image_flat.numel() == 0:
-        zero = image_feats.new_zeros(())
-        norm_regularization = argent_norm_regularization_loss(image_feats, text_feats)
-        return {
-            "entailment_loss": text_image_entailment,
-            "text_image_entailment_loss": text_image_entailment,
-            "part_text_image_entailment_loss": zero,
-            "cross_image_entailment_loss": zero,
-            "cross_text_entailment_loss": zero,
-            "cross_image_calibration_loss": zero,
-            "cross_text_calibration_loss": zero,
-            "norm_regularization_loss": norm_regularization,
-        }
-    part_text_image = argent_adaptive_entailment_residual(
-        specific=part_image_flat,
-        general=part_text_flat,
-        kappa=kappa,
-        adaptive_weight=False,
-        beta=beta,
-        product_metric=product_metric,
-    )
-    cross_image = argent_adaptive_entailment_residual(
-        specific=image_for_parts,
-        general=part_image_flat,
-        kappa=kappa,
-        adaptive_weight=True,
-        beta=beta,
-        product_metric=product_metric,
-    )
-    cross_text = argent_adaptive_entailment_residual(
-        specific=text_for_parts,
-        general=part_text_flat,
-        kappa=kappa,
-        adaptive_weight=True,
-        beta=beta,
-        product_metric=product_metric,
-    )
-    part_text_image_entailment = 0.5 * weighted_mean(part_text_image, part_weights)
-    cross_image_entailment = 0.5 * weighted_mean(cross_image, part_weights)
-    cross_text_entailment = 0.5 * weighted_mean(cross_text, part_weights)
-    norm_regularization = argent_norm_regularization_loss(image_feats, text_feats, part_image_flat, part_text_flat)
-    if aggregation == "equal":
-        entailment = text_image_entailment + part_text_image_entailment + cross_image_entailment + cross_text_entailment
-    else:
-        entailment = text_image_entailment + part_text_image_entailment + 0.5 * (
-            cross_image_entailment + cross_text_entailment
-        )
-    diagnostics = argent_entailment_diagnostics(
-        image_feats=image_feats,
-        text_feats=text_feats,
-        part_image_flat=part_image_flat,
-        part_text_flat=part_text_flat,
-        image_for_parts=image_for_parts,
-        text_for_parts=text_for_parts,
-        kappa=kappa,
-        product_metric=product_metric,
-    )
-    return {
-        "entailment_loss": entailment,
-        "text_image_entailment_loss": text_image_entailment,
-        "part_text_image_entailment_loss": part_text_image_entailment,
-        "cross_image_entailment_loss": cross_image_entailment,
-        "cross_text_entailment_loss": cross_text_entailment,
-        "cross_image_calibration_loss": image_feats.new_zeros(()),
-        "cross_text_calibration_loss": image_feats.new_zeros(()),
-        "norm_regularization_loss": norm_regularization,
-        **diagnostics,
-    }
-def hierarchical_beta_argent_entailment_losses(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    image_for_parts: Tensor,
-    text_for_parts: Tensor,
-    beta_query_image_feats: Tensor,
-    beta_query_text_feats: Tensor,
-    beta_query_owner: Tensor,
-    beta_query_parent: Tensor,
-    beta_query_weight: Tensor,
-    kappa: Tensor,
-    beta_query_source_part: Tensor | None = None,
-    beta: float = 1.0,
-    part_weights: Tensor | None = None,
-    product_metric: str = "l1",
-    aggregation: str = "uncha",
-) -> dict[str, Tensor]:
-    base = uncha_argent_entailment_losses(
-        image_feats=image_feats,
-        text_feats=text_feats,
-        part_image_flat=part_image_flat,
-        part_text_flat=part_text_flat,
-        image_for_parts=image_for_parts,
-        text_for_parts=text_for_parts,
-        kappa=kappa,
-        beta=beta,
-        part_weights=part_weights,
-        product_metric=product_metric,
-        aggregation=aggregation,
-    )
-    if beta_query_image_feats.numel() == 0:
-        return {
-            **base,
-            "hier_beta_query_text_entailment_loss": image_feats.new_zeros(()),
-            "hier_beta_visual_entailment_loss": image_feats.new_zeros(()),
-            "hier_beta_text_entailment_loss": image_feats.new_zeros(()),
-            "hier_beta_sourcepart_visual_entailment_loss": image_feats.new_zeros(()),
-            "hier_beta_sourcepart_text_entailment_loss": image_feats.new_zeros(()),
-            "hier_beta_query_count": beta_query_owner.new_tensor(0),
-            "hier_beta_sourcepart_query_count": beta_query_owner.new_tensor(0),
-        }
-    query_owner = beta_query_owner.to(device=image_feats.device, dtype=torch.long)
-    query_weights = beta_query_weight.to(device=image_feats.device, dtype=torch.float32).clamp_min(0.0)
-    if query_weights.numel() != beta_query_image_feats.size(0):
-        raise ValueError("beta_query_weight must have one value per beta query")
-    query_weights = query_weights / query_weights.mean().clamp_min(torch.finfo(query_weights.dtype).eps)
-    query_text = argent_adaptive_entailment_residual(
-        specific=beta_query_image_feats,
-        general=beta_query_text_feats,
-        kappa=kappa,
-        adaptive_weight=False,
-        beta=beta,
-        product_metric=product_metric,
-    )
-    visual_hierarchy = argent_adaptive_entailment_residual(
-        specific=image_feats.index_select(0, query_owner),
-        general=beta_query_image_feats,
-        kappa=kappa,
-        adaptive_weight=True,
-        beta=beta,
-        product_metric=product_metric,
-    )
-    query_text_entailment = 0.5 * weighted_mean(query_text, query_weights)
-    visual_entailment = 0.5 * weighted_mean(visual_hierarchy, query_weights)
-    parent = beta_query_parent.to(device=image_feats.device, dtype=torch.long)
-    parent_mask = (parent >= 0) & (parent < beta_query_text_feats.size(0)) & (query_weights > 0.0)
-    if bool(parent_mask.any()):
-        child_text = beta_query_text_feats[parent_mask]
-        parent_text = beta_query_text_feats[parent[parent_mask]]
-        text_hierarchy = argent_adaptive_entailment_residual(
-            specific=parent_text,
-            general=child_text,
-            kappa=kappa,
-            adaptive_weight=True,
-            beta=beta,
-            product_metric=product_metric,
-        )
-        text_entailment = 0.5 * weighted_mean(text_hierarchy, query_weights[parent_mask])
-    else:
-        text_entailment = image_feats.new_zeros(())
-    sourcepart_visual_entailment = image_feats.new_zeros(())
-    sourcepart_text_entailment = image_feats.new_zeros(())
-    sourcepart_query_count = beta_query_owner.new_tensor(0)
-    if beta_query_source_part is not None and part_image_flat.numel() > 0:
-        source_part = beta_query_source_part.to(device=image_feats.device, dtype=torch.long)
-        if source_part.numel() != beta_query_image_feats.size(0):
-            raise ValueError("beta_query_source_part must have one value per beta query")
-        source_mask = (
-            (source_part >= 0)
-            & (source_part < part_image_flat.size(0))
-            & (query_weights > 0.0)
-        )
-        if bool(source_mask.any()):
-            source_indices = source_part[source_mask]
-            sourcepart_visual = argent_adaptive_entailment_residual(
-                specific=part_image_flat.index_select(0, source_indices),
-                general=beta_query_image_feats[source_mask],
-                kappa=kappa,
-                adaptive_weight=True,
-                beta=beta,
-                product_metric=product_metric,
-            )
-            sourcepart_text = argent_adaptive_entailment_residual(
-                specific=part_text_flat.index_select(0, source_indices),
-                general=beta_query_text_feats[source_mask],
-                kappa=kappa,
-                adaptive_weight=True,
-                beta=beta,
-                product_metric=product_metric,
-            )
-            source_weights = query_weights[source_mask]
-            sourcepart_visual_entailment = 0.5 * weighted_mean(sourcepart_visual, source_weights)
-            sourcepart_text_entailment = 0.5 * weighted_mean(sourcepart_text, source_weights)
-            sourcepart_query_count = beta_query_owner.new_tensor(int(source_mask.sum().item()))
-    norm_regularization = argent_norm_regularization_loss(
-        image_feats,
-        text_feats,
-        part_image_flat,
-        part_text_flat,
-        beta_query_image_feats,
-        beta_query_text_feats,
-    )
-    sourcepart_entailment = 0.5 * (sourcepart_visual_entailment + sourcepart_text_entailment)
-    query_entailment = query_text_entailment + 0.5 * (visual_entailment + text_entailment) + sourcepart_entailment
-    return {
-        **base,
-        "entailment_loss": base["entailment_loss"] + query_entailment,
-        "norm_regularization_loss": norm_regularization,
-        "hier_beta_query_text_entailment_loss": query_text_entailment,
-        "hier_beta_visual_entailment_loss": visual_entailment,
-        "hier_beta_text_entailment_loss": text_entailment,
-        "hier_beta_sourcepart_visual_entailment_loss": sourcepart_visual_entailment,
-        "hier_beta_sourcepart_text_entailment_loss": sourcepart_text_entailment,
-        "hier_beta_query_count": beta_query_owner.new_tensor(beta_query_owner.numel()),
-        "hier_beta_sourcepart_query_count": sourcepart_query_count,
-    }
-def argent_entailment_diagnostics(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    image_for_parts: Tensor,
-    text_for_parts: Tensor,
-    kappa: Tensor,
-    product_metric: str = "l1",
-) -> dict[str, Tensor]:
-    zero = image_feats.new_zeros(())
-    def angle_mean(specific: Tensor, general: Tensor) -> Tensor:
-        if specific.numel() == 0:
-            return zero
-        angles = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
-        if angles.dim() == 2:
-            angles = angles.mean(dim=-1)
-        return angles.detach().mean()
-    def pent_mean(specific: Tensor, general: Tensor) -> Tensor:
-        if specific.numel() == 0:
-            return zero
-        angles = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
-        if angles.dim() == 2:
-            angles = angles.mean(dim=-1)
-        scores = torch.clamp(1.0 - (2.0 * angles / math.pi), min=0.0, max=1.0)
-        return scores.detach().mean()
-    def distance_mean(specific: Tensor, general: Tensor) -> Tensor:
-        if specific.numel() == 0:
-            return zero
-        return lorentz_dist(specific, general, kappa, product_metric=product_metric).detach().mean()
-    def adaptive_weight_mean(specific: Tensor, general: Tensor) -> Tensor:
-        if specific.numel() == 0:
-            return zero
-        weights = 1.0 - torch.exp(-lorentz_dist(specific, general, kappa, product_metric=product_metric))
-        return weights.detach().mean()
-    def space_norm_mean(embedding: Tensor) -> Tensor:
-        if embedding.numel() == 0:
-            return zero
-        return torch.linalg.norm(_space_components(embedding).float(), dim=-1).detach().mean()
-    return {
-        "argent_text_image_angle_mean": angle_mean(image_feats, text_feats),
-        "argent_text_image_pent_mean": pent_mean(image_feats, text_feats),
-        "argent_part_text_image_angle_mean": angle_mean(part_image_flat, part_text_flat),
-        "argent_part_text_image_pent_mean": pent_mean(part_image_flat, part_text_flat),
-        "argent_cross_image_angle_mean": angle_mean(image_for_parts, part_image_flat),
-        "argent_cross_image_pent_mean": pent_mean(image_for_parts, part_image_flat),
-        "argent_cross_image_distance_mean": distance_mean(image_for_parts, part_image_flat),
-        "argent_cross_image_adaptive_weight_mean": adaptive_weight_mean(image_for_parts, part_image_flat),
-        "argent_cross_text_angle_mean": angle_mean(text_for_parts, part_text_flat),
-        "argent_cross_text_pent_mean": pent_mean(text_for_parts, part_text_flat),
-        "argent_cross_text_distance_mean": distance_mean(text_for_parts, part_text_flat),
-        "argent_cross_text_adaptive_weight_mean": adaptive_weight_mean(text_for_parts, part_text_flat),
-        "argent_image_space_norm_mean": space_norm_mean(image_feats),
-        "argent_text_space_norm_mean": space_norm_mean(text_feats),
-        "argent_part_image_space_norm_mean": space_norm_mean(part_image_flat),
-        "argent_part_text_space_norm_mean": space_norm_mean(part_text_flat),
-    }
-def part_quality_weights(
-    image_for_parts: Tensor,
-    text_for_parts: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    part_owner: Tensor,
-    batch_size: int,
-    kappa: Tensor,
-    mode: str,
-    topk: int = 5,
-    temperature: float = 4.0,
-    product_metric: str = "l1",
-) -> tuple[Tensor | None, Tensor, Tensor]:
-    """Score every part and convert the scores into per-part weights.
-
-    A part's score is the mean of three ``exp(-distance)`` affinities:
-    part image vs. owner image, part text vs. owner text, and part image vs.
-    part text. Scores and weights are computed under ``torch.no_grad()``.
-
-    Args:
-        mode: ``"none"`` disables weighting, ``"soft"`` uses a per-owner
-            softmax, ``"topk"`` keeps the ``topk`` best parts of each owner.
-        topk: Only used in ``"topk"`` mode.
-        temperature: Only used in ``"soft"`` mode.
-
-    Returns:
-        ``(weights, scores, keep)``. ``weights`` is ``None`` when weighting is
-        disabled or there are no parts (``scores`` and ``keep`` are then the
-        same all-zero vector); otherwise it is rescaled to mean 1. ``keep`` is
-        ``weights > 0`` cast to the dtype of ``scores``.
-
-    Raises:
-        ValueError: If ``mode`` is not one of the supported values.
-    """
-    supported_modes = ("none", "soft", "topk")
-    if mode not in supported_modes:
-        raise ValueError(f"Unsupported part quality mode {mode!r}; expected 'none', 'soft', or 'topk'")
-    num_parts = part_image_flat.size(0)
-    if mode == "none" or part_image_flat.numel() == 0:
-        placeholder = part_image_flat.new_zeros((num_parts,))
-        return None, placeholder, placeholder
-    # (specific, general) pairs whose distances feed the quality score.
-    pairs = (
-        (part_image_flat, image_for_parts),
-        (part_text_flat, text_for_parts),
-        (part_image_flat, part_text_flat),
-    )
-    with torch.no_grad():
-        affinities = [
-            torch.exp(-lorentz_dist(specific, general, kappa, product_metric=product_metric))
-            for specific, general in pairs
-        ]
-        scores = torch.stack(affinities).mean(dim=0).clamp_min(0.0)
-        raw = (
-            _owner_softmax_weights(scores, part_owner, batch_size, temperature)
-            if mode == "soft"
-            else _owner_topk_weights(scores, part_owner, batch_size, topk)
-        )
-        # Normalise to mean 1; the clamp avoids dividing by an all-zero mean.
-        tiny = torch.finfo(raw.dtype).eps
-        weights = raw / raw.mean().clamp_min(tiny)
-    keep = (weights > 0.0).to(dtype=scores.dtype)
-    return weights, scores, keep
-def _owner_softmax_weights(scores: Tensor, part_owner: Tensor, batch_size: int, temperature: float) -> Tensor:
-    """Return per-part softmax weights computed separately within each owner.
-
-    For every owner id in ``range(batch_size)`` the scores of its parts are
-    multiplied by ``temperature``, soft-maxed, and scaled by the number of
-    parts so that each owner's weights sum to its part count. Parts whose
-    owner id is outside that range keep weight 0.
-    """
-    out = torch.zeros_like(scores)
-    for owner_id in range(batch_size):
-        members = part_owner == owner_id
-        if bool(members.any()):
-            group = scores[members]
-            out[members] = torch.softmax(group * temperature, dim=0) * group.numel()
-    return out
-def _owner_topk_weights(scores: Tensor, part_owner: Tensor, batch_size: int, topk: int) -> Tensor:
-    """Return a 0/1 weight per part that keeps each owner's ``topk`` best parts.
-
-    Owners with fewer than ``topk`` parts keep all of them; owners without
-    parts are skipped.
-
-    Raises:
-        ValueError: If ``topk`` is not positive.
-    """
-    if topk <= 0:
-        raise ValueError("topk must be positive for top-k part quality weighting")
-    out = torch.zeros_like(scores)
-    for owner_id in range(batch_size):
-        members = (part_owner == owner_id).nonzero(as_tuple=False).flatten()
-        member_count = members.numel()
-        if member_count > 0:
-            best = scores[members].topk(k=min(topk, member_count)).indices
-            out[members[best]] = 1.0
-    return out
-def argent_adaptive_entailment_residual(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    adaptive_weight: bool,
-    beta: float = 1.0,
-    product_metric: str = "l1",
-) -> Tensor:
-    """Huber penalty on the angle between ``specific`` and ``general``.
-
-    The angle comes from ``factor_oxy_angle``; a 2-D angle tensor is averaged
-    over its last axis. With ``adaptive_weight`` the angle is first scaled by
-    ``1 - exp(-distance)``, so pairs at zero distance contribute nothing.
-
-    Returns:
-        The unreduced Huber loss against zero, using ``delta=beta``.
-    """
-    angle = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
-    if angle.dim() == 2:
-        angle = angle.mean(dim=-1)
-    if adaptive_weight:
-        separation = lorentz_dist(specific=specific, general=general, kappa=kappa, product_metric=product_metric)
-        scale = 1.0 - torch.exp(-separation)
-        angle = angle * scale
-    target = torch.zeros_like(angle)
-    return F.huber_loss(angle, target, delta=beta, reduction="none")
-def lorentz_dist(specific: Tensor, general: Tensor, kappa: Tensor, product_metric: str = "l1") -> Tensor:
-    """Return the paired distance between ``specific`` and ``general``.
-
-    Thin alias that forwards all arguments unchanged to ``paired_dist``; the
-    meaning of ``kappa`` and ``product_metric`` is defined there.
-    """
-    return paired_dist(specific, general, kappa, product_metric=product_metric)
-def argent_norm_regularization_loss(*embeddings: Tensor, eps: float = 1e-6) -> Tensor:
-    """Average ``r**2 - log(r)`` over the space-component norms ``r``.
-
-    Each non-empty embedding tensor contributes the mean of that expression
-    over its norms (clamped below by ``eps``); the result is the mean of
-    those per-tensor values. Empty tensors are ignored.
-
-    Raises:
-        ValueError: If no non-empty embedding tensor was supplied.
-    """
-    def _penalty(embedding: Tensor) -> Tensor:
-        radius = torch.linalg.norm(_space_components(embedding).float(), dim=-1).clamp_min(eps)
-        return (radius.square() - radius.log()).mean()
-    per_tensor = [_penalty(embedding) for embedding in embeddings if embedding.numel() != 0]
-    if len(per_tensor) == 0:
-        raise ValueError("argent_norm_regularization_loss requires at least one non-empty embedding tensor")
-    return torch.stack(per_tensor).mean()
-def piecewise_entailment_residual(
-    specific: Tensor,
-    general: Tensor,
-    kappa: Tensor,
-    aperture_scale: float,
-    factor: float = 0.1,
-    geometry: str = "lorentz",
-) -> Tensor:
-    """Piecewise entailment penalty of ``specific`` with respect to ``general``.
-
-    The penalty is ``factor * angle`` everywhere, plus the excess
-    ``angle - scale * aperture`` wherever that excess is positive. In the
-    Lorentz geometry ``scale`` is ``aperture_scale``; in the Euclidean
-    geometry ``aperture_scale`` is passed to the aperture itself and the
-    multiplier is 1.
-
-    Returns:
-        The penalty, averaged over the last axis when it is 2-D.
-
-    Raises:
-        ValueError: If ``geometry`` is neither ``"lorentz"`` nor ``"euclidean"``.
-    """
-    if geometry not in ("lorentz", "euclidean"):
-        raise ValueError(f"Unsupported entailment geometry {geometry!r}; expected 'lorentz' or 'euclidean'")
-    if geometry == "lorentz":
-        angle = factor_oxy_angle(specific=specific, general=general, kappa=kappa)
-        aperture = factor_half_aperture(general=general, kappa=kappa)
-        multiplier = aperture_scale
-    else:
-        angle = euclidean_angle(specific=specific, general=general)
-        aperture = euclidean_half_aperture(general=general, aperture_scale=aperture_scale)
-        multiplier = 1.0
-    excess = angle - multiplier * aperture
-    baseline = factor * angle
-    penalty = torch.where(excess > 0.0, excess + baseline, baseline)
-    if penalty.dim() == 2:
-        return penalty.mean(dim=-1)
-    return penalty
-def euclidean_angle(specific: Tensor, general: Tensor, eps: float = 1e-6) -> Tensor:
-    """Return the angle (radians) between the space components of two embeddings.
-
-    The cosine is computed in float32 along the last axis, its denominator is
-    clamped away from zero, and the cosine itself is clamped strictly inside
-    ``(-1, 1)`` before ``acos``. The margin is ``max(eps, 16 * machine eps)``.
-    """
-    lhs = _space_components(specific).float()
-    rhs = _space_components(general).float()
-    margin = max(eps, 16.0 * torch.finfo(lhs.dtype).eps)
-    dot = (lhs * rhs).sum(dim=-1)
-    norm_product = torch.linalg.norm(lhs, dim=-1) * torch.linalg.norm(rhs, dim=-1)
-    cosine = dot / norm_product.clamp_min(margin)
-    cosine = cosine.clamp(min=-1.0 + margin, max=1.0 - margin)
-    return torch.acos(cosine)
-def euclidean_half_aperture(general: Tensor, aperture_scale: float, eps: float = 1e-8) -> Tensor:
-    """Return ``atan(aperture_scale / norm)`` for each embedding in ``general``.
-
-    ``norm`` is the float32 norm of ``_space_components(general)`` along the
-    last axis, clamped below by ``eps`` to avoid division by zero.
-    """
-    general_norm = torch.linalg.norm(_space_components(general).float(), dim=-1).clamp_min(eps)
-    # The scale is materialised with ``general``'s dtype/device, while the norm was cast to float32.
-    return torch.atan(torch.as_tensor(aperture_scale, device=general.device, dtype=general.dtype) / general_norm)
-def aggregate_part_consistency_loss(
-    image_feats: Tensor,
-    text_feats: Tensor,
-    part_image_flat: Tensor,
-    part_text_flat: Tensor,
-    part_owner: Tensor,
-    part_weights: Tensor | None = None,
-) -> Tensor:
-    """Cosine consistency between per-owner part aggregates and global embeddings.
-
-    Part space components are (optionally weighted) averaged per owner via
-    ``part_owner`` and compared with the owner's image and text space
-    components, both within and across modalities. The result is the mean of
-    the four cosine residuals.
-
-    Args:
-        part_owner: Index of the owning row of ``image_feats`` for each part.
-        part_weights: Optional per-part weights; must have as many elements
-            as ``part_owner``. ``None`` means uniform weights.
-
-    Returns:
-        A scalar loss. Zero when there are no parts, or when no owner has a
-        positive total weight (the means below would otherwise be taken over
-        empty tensors and evaluate to NaN).
-
-    Raises:
-        ValueError: If ``part_weights`` does not match ``part_owner`` in size.
-    """
-    if part_image_flat.numel() == 0:
-        return image_feats.new_zeros(())
-    batch_size = image_feats.size(0)
-    image_space = _space_components(image_feats).reshape(batch_size, -1).float()
-    text_space = _space_components(text_feats).reshape(batch_size, -1).float()
-    part_image_space = _space_components(part_image_flat).reshape(part_image_flat.size(0), -1).float()
-    part_text_space = _space_components(part_text_flat).reshape(part_text_flat.size(0), -1).float()
-    if part_weights is None:
-        counts = torch.bincount(part_owner, minlength=batch_size).to(device=image_feats.device, dtype=image_space.dtype)
-        denom = counts
-        valid = counts > 0
-        weights = part_image_space.new_ones((part_image_space.size(0),))
-    else:
-        weights = part_weights.to(device=image_feats.device, dtype=image_space.dtype).flatten()
-        if weights.numel() != part_owner.numel():
-            raise ValueError("part_weights must have the same number of elements as part_owner when provided")
-        denom = torch.zeros(batch_size, device=image_feats.device, dtype=image_space.dtype)
-        denom.index_add_(0, part_owner, weights)
-        valid = denom > 0
-    # Every owner can end up invalid (e.g. all supplied weights are zero).
-    # Mirror the empty-input convention instead of returning NaN.
-    if not bool(valid.any()):
-        return image_feats.new_zeros(())
-    image_agg = image_space.new_zeros(image_space.shape)
-    text_agg = text_space.new_zeros(text_space.shape)
-    image_agg.index_add_(0, part_owner, part_image_space * weights[:, None])
-    text_agg.index_add_(0, part_owner, part_text_space * weights[:, None])
-    # The divisor only rescales each row by a positive scalar, which the
-    # cosine residual ignores, so clamping fractional totals to 1 is harmless.
-    image_agg = image_agg[valid] / denom[valid, None].clamp_min(1.0)
-    text_agg = text_agg[valid] / denom[valid, None].clamp_min(1.0)
-    image_space = image_space[valid]
-    text_space = text_space[valid]
-    return 0.25 * (
-        cosine_residual(image_agg, image_space)
-        + cosine_residual(text_agg, text_space)
-        + cosine_residual(image_agg, text_space)
-        + cosine_residual(text_agg, image_space)
-    )
-def cosine_residual(x: Tensor, y: Tensor) -> Tensor:
-    """Return the mean of ``1 - cosine_similarity(x, y)`` along the last axis."""
-    similarity = F.cosine_similarity(x, y, dim=-1)
-    residual = 1.0 - similarity
-    return residual.mean()
-def uncertainty_calibrated_entailment_loss(
-    entail_residual: Tensor,
-    log_uncertainty: Tensor,
-    alpha: float = 10.0,
-    stop_grad: bool = True,
-    weights: Tensor | None = None,
-) -> tuple[Tensor, Tensor]:
-    """Split an entailment residual into a mean loss and a calibration loss.
-
-    Args:
-        entail_residual: Per-sample entailment residual.
-        log_uncertainty: Per-sample log-uncertainty.
-        alpha: Multiplier applied to the calibration loss.
-        stop_grad: When true, the residual is detached inside the calibration
-            term, so that term cannot send gradients into the residual.
-        weights: Optional weights forwarded to ``weighted_mean``.
-
-    Returns:
-        ``(mean_loss, calibration_loss)``, both reduced with ``weighted_mean``.
-    """
-    mean_loss = 0.5 * entail_residual
-    # Clamp exp(log_uncertainty) to a finite, strictly positive range before dividing by it.
-    uncertainty = torch.exp(log_uncertainty).clamp(min=1e-6, max=1e6)
-    residual = entail_residual.detach() if stop_grad else entail_residual
-    scaled_entail = residual / (uncertainty + 1e-6)
-    calibration_term = 0.5 * scaled_entail + 0.5 * log_uncertainty
-    # Scalar entropy of the softmax over all (flattened) log-uncertainties;
-    # it is broadcast onto every element of the calibration term below.
-    prob = torch.softmax(log_uncertainty.flatten(), dim=0)
-    entropy = -(prob * torch.log(prob + 1e-8)).sum()
-    calibration_loss = alpha * (calibration_term + entropy)
-    return weighted_mean(mean_loss, weights), weighted_mean(calibration_loss, weights)
-def embedding_uncertainty(x: Tensor) -> Tensor:
-    """Return ``softplus(-radius)``, where ``radius`` is the space-component norm.
-
-    The norm is taken in float32 along the last axis; if the result still has
-    more than one axis it is averaged over its last axis first.
-    """
-    radius = torch.linalg.norm(_space_components(x).float(), dim=-1)
-    reduced = radius.mean(dim=-1) if radius.dim() > 1 else radius
-    return F.softplus(-reduced)
-def _space_components(x: Tensor) -> Tensor:
-    """Drop the first entry of the last axis; width-1 inputs are returned unchanged."""
-    if x.shape[-1] <= 1:
-        return x
-    return x[..., 1:]
-def _flatten_valid_parts(part_image_feats: Tensor, part_text_feats: Tensor, part_mask: Tensor, targets: Tensor) -> tuple[Tensor, Tensor, Tensor]:
-    """Select the parts marked by ``part_mask`` together with their owners' targets.
-
-    ``targets`` is broadcast along a new second axis to the shape of
-    ``part_mask`` so that every selected part inherits its row's target.
-    """
-    targets_per_part = targets.unsqueeze(1).expand_as(part_mask)
-    kept_images = part_image_feats[part_mask]
-    kept_texts = part_text_feats[part_mask]
-    return kept_images, kept_texts, targets_per_part[part_mask]

hyper3_clip/models/objectives.py DELETED Viewed

@@ -1,580 +0,0 @@
-from __future__ import annotations
-from collections.abc import Mapping
-import torch
-from torch import Tensor, nn
-from hyper3_clip.models.lorentz import log_map0, metric_pairwise_dist
-from hyper3_clip.models.losses import (
-    aggregate_part_consistency_loss,
-    contrastive_ce,
-    gramian_volume_loss,
-    hierarchical_beta_argent_entailment_losses,
-    packed_part_contrastive_loss,
-    packed_part_entailment_loss,
-    part_quality_weights,
-    radius_order_hinge,
-    uncha_argent_entailment_losses,
-    uncha_contrastive_losses,
-    uncha_entailment_losses,
-)
-from hyper3_clip.training.distributed import gather_variable_many_with_grad, gather_variable_no_grad, get_rank
-class HyCoCLIPObjective(nn.Module):
-    """Sum of a packed-part contrastive loss and a weighted packed-part entailment loss."""
-    def __init__(
-        self,
-        entail_weight: float,
-        inter_aperture_scale: float,
-        intra_aperture_scale: float,
-        product_metric: str = "l1",
-    ) -> None:
-        """Store the entailment weight, the two aperture scales and the product metric."""
-        super().__init__()
-        self.entail_weight = entail_weight
-        self.inter_aperture_scale = inter_aperture_scale
-        self.intra_aperture_scale = intra_aperture_scale
-        # NOTE(review): stored but not forwarded to either loss in ``forward`` - confirm intended.
-        self.product_metric = product_metric
-    def forward(self, embeddings: Mapping[str, Tensor], logit_scale: Tensor) -> dict[str, Tensor]:
-        """Compute the total loss and its components.
-
-        Returns:
-            A dict with ``loss``, ``contrastive_loss``, ``entailment_loss`` and
-            ``part_count`` (the number of entries in ``part_owner``).
-        """
-        owner = embeddings["part_owner"].long()
-        num_parts = owner.new_tensor(owner.numel())
-        # Arguments shared by both loss functions.
-        shared = {
-            "image_feats": embeddings["image_feats"],
-            "text_feats": embeddings["text_feats"],
-            "part_image_feats": embeddings["part_image_feats"],
-            "part_text_feats": embeddings["part_text_feats"],
-            "part_owner": owner,
-            "kappa": embeddings["kappa"],
-        }
-        contrastive_term = packed_part_contrastive_loss(
-            **shared,
-            logit_scale=logit_scale,
-            all_image_feats=embeddings.get("all_image_feats"),
-            all_text_feats=embeddings.get("all_text_feats"),
-            targets=embeddings.get("targets"),
-        )
-        entailment_term = packed_part_entailment_loss(
-            **shared,
-            inter_aperture_scale=self.inter_aperture_scale,
-            intra_aperture_scale=self.intra_aperture_scale,
-        )
-        outputs = {"loss": contrastive_term + self.entail_weight * entailment_term}
-        outputs["contrastive_loss"] = contrastive_term
-        outputs["entailment_loss"] = entailment_term
-        outputs["part_count"] = num_parts
-        return outputs
-class UNCHAObjective(nn.Module):
-    """Composite training objective over global and part-level embeddings.
-
-    ``forward`` adds up a contrastive loss, an optional HiMo component
-    contrastive loss, an entailment loss (piecewise, ARGENT, or a combination),
-    an aggregate part-consistency loss, a radius-order hinge, a gramian
-    alignment loss and a norm regulariser, each scaled by its configured weight.
-    """
-    def __init__(
-        self,
-        entail_weight: float,
-        inter_aperture_scale: float,
-        intra_aperture_scale: float,
-        piecewise_factor: float = 0.1,
-        calibration_alpha: float = 10.0,
-        stop_grad_calibration: bool = True,
-        entailment_geometry: str = "lorentz",
-        aggregate_weight: float = 0.0,
-        entailment_loss: str = "piecewise",
-        argent_beta: float = 1.0,
-        argent_norm_weight: float = 0.0,
-        argent_aux_weight: float = 0.5,
-        argent_aggregation: str = "uncha",
-        part_weight_power: float = 0.0,
-        product_metric: str = "l1",
-        contrastive_loss: str = "ce",
-        sigmoid_negative_weight: float = 1.0,
-        part_quality_mode: str = "none",
-        part_quality_topk: int = 5,
-        part_quality_temperature: float = 4.0,
-        contrastive_global_weight: float = 1.0,
-        contrastive_local_weight: float = 1.0,
-        contrastive_global_local_weight: float = 1.0,
-        beta_cal_beta: float = 0.0,
-        beta_cal_variant: str = "ce",
-        beta_cal_weight: float = 0.0,
-        himo_component_weight: float = 0.0,
-        global_local_mode: str = "repeat",
-        global_local_metric: str = "distance",
-        global_local_angle_aux_weight: float = 0.0,
-        global_local_angle_aux_mode: str = "contrastive",
-        global_local_angle_aux_scale: float = 5.5,
-        global_local_angle_aux_aperture_scale: float = 1.0,
-        radius_order_weight: float = 0.0,
-        radius_order_margin: float = 0.0,
-        gramian_align_weight: float = 0.0,
-    ) -> None:
-        """Validate the categorical/range options and store every setting on ``self``.
-
-        Raises:
-            ValueError: If one of the validated options has an unsupported
-                value (see the checks below).
-        """
-        super().__init__()
-        # NOTE(review): ``entailment_geometry`` is not validated here; an
-        # unsupported value is presumably only rejected later, inside the
-        # piecewise entailment path - confirm.
-        if entailment_loss not in {
-            "piecewise",
-            "argent",
-            "piecewise_argent",
-            "hier_beta_argent",
-            "hier_beta_sourcepart_argent",
-        }:
-            raise ValueError(
-                f"Unsupported UNCHA entailment loss {entailment_loss!r}; "
-                "expected 'piecewise', 'argent', 'piecewise_argent', 'hier_beta_argent', "
-                "or 'hier_beta_sourcepart_argent'"
-            )
-        if contrastive_loss not in {"ce", "sigmoid", "siglip", "siglip_metric"}:
-            raise ValueError("contrastive_loss must be 'ce', 'sigmoid', 'siglip', or 'siglip_metric'")
-        if beta_cal_variant not in {"ce", "bce"}:
-            raise ValueError("beta_cal_variant must be 'ce' or 'bce'")
-        if argent_aggregation not in {"uncha", "equal"}:
-            raise ValueError("argent_aggregation must be 'uncha' or 'equal'")
-        if part_quality_mode not in {"none", "soft", "topk"}:
-            raise ValueError("part_quality_mode must be 'none', 'soft', or 'topk'")
-        if global_local_mode not in {"repeat", "inbatch"}:
-            raise ValueError("global_local_mode must be 'repeat' or 'inbatch'")
-        if global_local_metric not in {"distance", "angle"}:
-            raise ValueError("global_local_metric must be 'distance' or 'angle'")
-        if global_local_angle_aux_mode not in {"contrastive", "positive_hinge"}:
-            raise ValueError("global_local_angle_aux_mode must be 'contrastive' or 'positive_hinge'")
-        if global_local_angle_aux_weight < 0.0:
-            raise ValueError("global_local_angle_aux_weight must be non-negative")
-        if global_local_angle_aux_scale <= 0.0:
-            raise ValueError("global_local_angle_aux_scale must be positive")
-        if global_local_angle_aux_aperture_scale <= 0.0:
-            raise ValueError("global_local_angle_aux_aperture_scale must be positive")
-        if part_quality_topk <= 0:
-            raise ValueError("part_quality_topk must be positive")
-        self.entail_weight = entail_weight
-        self.inter_aperture_scale = inter_aperture_scale
-        self.intra_aperture_scale = intra_aperture_scale
-        self.piecewise_factor = piecewise_factor
-        self.calibration_alpha = calibration_alpha
-        self.stop_grad_calibration = stop_grad_calibration
-        self.entailment_geometry = entailment_geometry
-        self.aggregate_weight = aggregate_weight
-        self.entailment_loss = entailment_loss
-        self.argent_beta = argent_beta
-        self.argent_norm_weight = argent_norm_weight
-        self.argent_aux_weight = argent_aux_weight
-        self.argent_aggregation = argent_aggregation
-        self.part_weight_power = part_weight_power
-        self.product_metric = product_metric
-        self.contrastive_loss = contrastive_loss
-        self.sigmoid_negative_weight = sigmoid_negative_weight
-        self.part_quality_mode = part_quality_mode
-        self.part_quality_topk = part_quality_topk
-        self.part_quality_temperature = part_quality_temperature
-        # The remaining numeric options are coerced to plain Python floats.
-        self.contrastive_global_weight = float(contrastive_global_weight)
-        self.contrastive_local_weight = float(contrastive_local_weight)
-        self.contrastive_global_local_weight = float(contrastive_global_local_weight)
-        self.beta_cal_beta = float(beta_cal_beta)
-        self.beta_cal_variant = beta_cal_variant
-        self.beta_cal_weight = float(beta_cal_weight)
-        self.himo_component_weight = float(himo_component_weight)
-        self.global_local_mode = global_local_mode
-        self.global_local_metric = global_local_metric
-        self.global_local_angle_aux_weight = float(global_local_angle_aux_weight)
-        self.global_local_angle_aux_mode = global_local_angle_aux_mode
-        self.global_local_angle_aux_scale = float(global_local_angle_aux_scale)
-        self.global_local_angle_aux_aperture_scale = float(global_local_angle_aux_aperture_scale)
-        self.radius_order_weight = float(radius_order_weight)
-        self.radius_order_margin = float(radius_order_margin)
-        self.gramian_align_weight = float(gramian_align_weight)
-    def forward(self, embeddings: Mapping[str, Tensor], logit_scales: Mapping[str, Tensor]) -> dict[str, Tensor]:
-        """Compute the total loss and all logged components.
-
-        Args:
-            embeddings: Must contain ``part_owner``, ``part_image_feats``,
-                ``part_text_feats``, ``image_feats``, ``text_feats``, ``kappa``
-                and ``targets``; several optional keys (Euclidean features,
-                gathered features, HiMo and beta-query tensors,
-                ``entail_weight_scale``) are read when present.
-            logit_scales: Must contain ``global``, ``local`` and
-                ``global_local``; the matching ``*_bias`` keys are optional.
-
-        Returns:
-            A dict holding ``loss`` plus every entry returned by the
-            contrastive and entailment helpers and the diagnostics built below.
-
-        Raises:
-            ValueError: If ``targets`` is missing, if HiMo text features come
-                without their gathered counterpart, or if a hierarchical-beta
-                entailment loss lacks its required beta-query tensors.
-        """
-        part_owner = embeddings["part_owner"].long()
-        part_count = part_owner.new_tensor(part_owner.numel())
-        part_image_flat = embeddings["part_image_feats"]
-        part_text_flat = embeddings["part_text_feats"]
-        image_feats = embeddings["image_feats"]
-        text_feats = embeddings["text_feats"]
-        # Repeat each owner's global embedding once per part (empty placeholders when there are no parts).
-        if part_owner.numel() == 0:
-            image_for_parts = image_feats.new_zeros((0, image_feats.size(-1)))
-            text_for_parts = text_feats.new_zeros((0, text_feats.size(-1)))
-        else:
-            image_for_parts = image_feats[part_owner]
-            text_for_parts = text_feats[part_owner]
-        # Per-part weights: a count-based term combined with an optional quality-based term.
-        count_part_weights = _part_weights(part_owner, image_feats.size(0), self.part_weight_power)
-        quality_part_weights, quality_scores, quality_keep = part_quality_weights(
-            image_for_parts=image_for_parts,
-            text_for_parts=text_for_parts,
-            part_image_flat=part_image_flat,
-            part_text_flat=part_text_flat,
-            part_owner=part_owner,
-            batch_size=image_feats.size(0),
-            kappa=embeddings["kappa"],
-            mode=self.part_quality_mode,
-            topk=self.part_quality_topk,
-            temperature=self.part_quality_temperature,
-            product_metric=self.product_metric,
-        )
-        part_weights = _combine_part_weights(count_part_weights, quality_part_weights)
-        # The repeated global features are only gathered when the "repeat" global-local term is active.
-        needs_repeated_global_local = self.global_local_mode == "repeat" and self.contrastive_global_local_weight != 0.0
-        part_feature_tensors = [part_image_flat, part_text_flat]
-        if needs_repeated_global_local:
-            part_feature_tensors.extend([image_for_parts, text_for_parts])
-        gathered_part_features, part_counts = gather_variable_many_with_grad(part_feature_tensors)
-        all_part_image_feats = gathered_part_features[0]
-        all_part_text_feats = gathered_part_features[1]
-        all_image_for_parts = gathered_part_features[2] if needs_repeated_global_local else None
-        all_text_for_parts = gathered_part_features[3] if needs_repeated_global_local else None
-        # Optional Euclidean counterparts of the embeddings; each stays None when not provided.
-        image_euc_feats = embeddings.get("image_euc_feats")
-        text_euc_feats = embeddings.get("text_euc_feats")
-        part_image_euc_flat = embeddings.get("part_image_euc_feats")
-        part_text_euc_flat = embeddings.get("part_text_euc_feats")
-        image_for_parts_euc = None
-        text_for_parts_euc = None
-        all_part_image_euc_feats = None
-        all_part_text_euc_feats = None
-        all_image_for_parts_euc = None
-        all_text_for_parts_euc = None
-        if (
-            image_euc_feats is not None
-            and text_euc_feats is not None
-            and part_owner.numel() > 0
-            and needs_repeated_global_local
-        ):
-            image_for_parts_euc = image_euc_feats[part_owner]
-            text_for_parts_euc = text_euc_feats[part_owner]
-        # NOTE(review): the number of tensors gathered below depends on whether
-        # this process has any parts (see the condition above). If the gather is
-        # a cross-rank collective, ranks could disagree on the list length -
-        # confirm against gather_variable_many_with_grad.
-        if part_image_euc_flat is not None and part_text_euc_flat is not None:
-            euc_feature_tensors = [part_image_euc_flat, part_text_euc_flat]
-            if image_for_parts_euc is not None and text_for_parts_euc is not None:
-                euc_feature_tensors.extend([image_for_parts_euc, text_for_parts_euc])
-            gathered_euc_features, _ = gather_variable_many_with_grad(euc_feature_tensors)
-            all_part_image_euc_feats = gathered_euc_features[0]
-            all_part_text_euc_feats = gathered_euc_features[1]
-            if image_for_parts_euc is not None and text_for_parts_euc is not None:
-                all_image_for_parts_euc = gathered_euc_features[2]
-                all_text_for_parts_euc = gathered_euc_features[3]
-        if "targets" not in embeddings:
-            raise ValueError("UNCHAObjective requires 'targets' to compute group-aware losses")
-        global_targets = embeddings["targets"]
-        # Each part inherits the target of its owner as a group id.
-        part_group_ids = global_targets[part_owner] if part_owner.numel() > 0 else part_owner.new_zeros((0,))
-        all_part_group_ids = None
-        # Group ids are only gathered when the beta-calibration term is switched on.
-        if self.beta_cal_weight > 0.0 and self.beta_cal_beta > 0.0:
-            all_part_group_ids, _ = gather_variable_no_grad(part_group_ids)
-        # Local part indices are shifted by the summed part counts of lower ranks
-        # (presumably to index into the gathered part tensors - confirm).
-        part_offset = part_counts[: get_rank()].sum() if part_counts.numel() > 1 else part_counts.new_zeros(())
-        part_targets = torch.arange(part_image_flat.size(0), device=part_image_flat.device) + part_offset
-        contrastive = uncha_contrastive_losses(
-            image_feats=image_feats,
-            text_feats=text_feats,
-            part_image_flat=part_image_flat,
-            part_text_flat=part_text_flat,
-            image_for_parts=image_for_parts,
-            text_for_parts=text_for_parts,
-            image_euc_feats=image_euc_feats,
-            text_euc_feats=text_euc_feats,
-            part_image_euc_flat=part_image_euc_flat,
-            part_text_euc_flat=part_text_euc_flat,
-            image_for_parts_euc=image_for_parts_euc,
-            text_for_parts_euc=text_for_parts_euc,
-            kappa=embeddings["kappa"],
-            global_logit_scale=logit_scales["global"],
-            local_logit_scale=logit_scales["local"],
-            global_local_logit_scale=logit_scales["global_local"],
-            all_image_feats=embeddings.get("all_image_feats"),
-            all_text_feats=embeddings.get("all_text_feats"),
-            all_part_image_feats=all_part_image_feats,
-            all_part_text_feats=all_part_text_feats,
-            all_image_for_parts=all_image_for_parts,
-            all_text_for_parts=all_text_for_parts,
-            all_image_euc_feats=embeddings.get("all_image_euc_feats"),
-            all_text_euc_feats=embeddings.get("all_text_euc_feats"),
-            all_part_image_euc_feats=all_part_image_euc_feats,
-            all_part_text_euc_feats=all_part_text_euc_feats,
-            all_image_for_parts_euc=all_image_for_parts_euc,
-            all_text_for_parts_euc=all_text_for_parts_euc,
-            global_targets=global_targets,
-            part_targets=part_targets,
-            part_weights=part_weights,
-            product_metric=self.product_metric,
-            loss_type=self.contrastive_loss,
-            contrastive_global_weight=self.contrastive_global_weight,
-            contrastive_local_weight=self.contrastive_local_weight,
-            contrastive_global_local_weight=self.contrastive_global_local_weight,
-            beta_cal_beta=self.beta_cal_beta,
-            beta_cal_variant=self.beta_cal_variant,
-            beta_cal_weight=self.beta_cal_weight,
-            part_group_ids=part_group_ids,
-            all_part_group_ids=all_part_group_ids,
-            global_logit_bias=logit_scales.get("global_bias"),
-            local_logit_bias=logit_scales.get("local_bias"),
-            global_local_logit_bias=logit_scales.get("global_local_bias"),
-            sigmoid_negative_weight=self.sigmoid_negative_weight,
-            global_local_mode=self.global_local_mode,
-            global_local_metric=self.global_local_metric,
-            global_local_angle_aux_weight=self.global_local_angle_aux_weight,
-            global_local_angle_aux_mode=self.global_local_angle_aux_mode,
-            global_local_angle_aux_scale=self.global_local_angle_aux_scale,
-            global_local_angle_aux_aperture_scale=self.global_local_angle_aux_aperture_scale,
-        )
-        # Optional symmetric contrastive term between images and HiMo text features.
-        himo_component_loss = image_feats.new_zeros(())
-        if self.himo_component_weight > 0.0 and embeddings.get("himo_text_feats") is not None:
-            himo_text_feats = embeddings["himo_text_feats"]
-            all_himo_text_feats = embeddings.get("all_himo_text_feats")
-            if all_himo_text_feats is None:
-                raise ValueError("himo_text_feats requires all_himo_text_feats for distributed contrastive loss")
-            # The exponentiated global logit scale is capped at 100.
-            scale = logit_scales["global"].exp().clamp(max=100.0)
-            logits_i_t = -metric_pairwise_dist(image_feats, all_himo_text_feats, embeddings["kappa"], product_metric=self.product_metric) * scale
-            # Unlike elsewhere, "all_image_feats" is indexed directly here and is therefore required on this path.
-            logits_t_i = -metric_pairwise_dist(himo_text_feats, embeddings["all_image_feats"], embeddings["kappa"], product_metric=self.product_metric) * scale
-            himo_component_loss = 0.5 * (contrastive_ce(logits_i_t, global_targets) + contrastive_ce(logits_t_i, global_targets))
-        # Entailment: pure ARGENT, hierarchical-beta ARGENT, or piecewise (optionally plus ARGENT).
-        if self.entailment_loss == "argent":
-            entailment = uncha_argent_entailment_losses(
-                image_feats=image_feats,
-                text_feats=text_feats,
-                part_image_flat=part_image_flat,
-                part_text_flat=part_text_flat,
-                image_for_parts=image_for_parts,
-                text_for_parts=text_for_parts,
-                kappa=embeddings["kappa"],
-                beta=self.argent_beta,
-                part_weights=part_weights,
-                product_metric=self.product_metric,
-                aggregation=self.argent_aggregation,
-            )
-        elif self.entailment_loss in {"hier_beta_argent", "hier_beta_sourcepart_argent"}:
-            required = (
-                "beta_query_image_feats",
-                "beta_query_text_feats",
-                "beta_query_owner",
-                "beta_query_parent",
-                "beta_query_weight",
-            )
-            # The source-part variant additionally needs the source-part tensor.
-            if self.entailment_loss == "hier_beta_sourcepart_argent":
-                required = (*required, "beta_query_source_part")
-            missing = [key for key in required if embeddings.get(key) is None]
-            if missing:
-                raise ValueError(f"{self.entailment_loss} requires beta query embeddings: missing {missing}")
-            entailment = hierarchical_beta_argent_entailment_losses(
-                image_feats=image_feats,
-                text_feats=text_feats,
-                part_image_flat=part_image_flat,
-                part_text_flat=part_text_flat,
-                image_for_parts=image_for_parts,
-                text_for_parts=text_for_parts,
-                beta_query_image_feats=embeddings["beta_query_image_feats"],
-                beta_query_text_feats=embeddings["beta_query_text_feats"],
-                beta_query_owner=embeddings["beta_query_owner"],
-                beta_query_parent=embeddings["beta_query_parent"],
-                beta_query_weight=embeddings["beta_query_weight"],
-                beta_query_source_part=embeddings.get("beta_query_source_part")
-                if self.entailment_loss == "hier_beta_sourcepart_argent"
-                else None,
-                kappa=embeddings["kappa"],
-                beta=self.argent_beta,
-                part_weights=part_weights,
-                product_metric=self.product_metric,
-                aggregation=self.argent_aggregation,
-            )
-        else:
-            piecewise_entailment = uncha_entailment_losses(
-                image_feats=image_feats,
-                text_feats=text_feats,
-                part_image_flat=part_image_flat,
-                part_text_flat=part_text_flat,
-                image_for_parts=image_for_parts,
-                text_for_parts=text_for_parts,
-                kappa=embeddings["kappa"],
-                inter_aperture_scale=self.inter_aperture_scale,
-                intra_aperture_scale=self.intra_aperture_scale,
-                piecewise_factor=self.piecewise_factor,
-                calibration_alpha=self.calibration_alpha,
-                stop_grad_calibration=self.stop_grad_calibration,
-                geometry=self.entailment_geometry,
-                part_weights=part_weights,
-            )
-            if self.entailment_loss == "piecewise_argent":
-                argent_entailment = uncha_argent_entailment_losses(
-                    image_feats=image_feats,
-                    text_feats=text_feats,
-                    part_image_flat=part_image_flat,
-                    part_text_flat=part_text_flat,
-                    image_for_parts=image_for_parts,
-                    text_for_parts=text_for_parts,
-                    kappa=embeddings["kappa"],
-                    beta=self.argent_beta,
-                    part_weights=part_weights,
-                    product_metric=self.product_metric,
-                    aggregation=self.argent_aggregation,
-                )
-                # Combined loss: piecewise + argent_aux_weight * ARGENT; both parts are also logged separately.
-                entailment = {
-                    **piecewise_entailment,
-                    "entailment_loss": piecewise_entailment["entailment_loss"]
-                    + self.argent_aux_weight * argent_entailment["entailment_loss"],
-                    "piecewise_entailment_loss": piecewise_entailment["entailment_loss"],
-                    "argent_entailment_loss": argent_entailment["entailment_loss"],
-                    "norm_regularization_loss": argent_entailment["norm_regularization_loss"],
-                }
-            else:
-                entailment = piecewise_entailment
-        # Computed unconditionally; it only affects the total through aggregate_weight.
-        aggregate = aggregate_part_consistency_loss(
-            image_feats=image_feats,
-            text_feats=text_feats,
-            part_image_flat=part_image_flat,
-            part_text_flat=part_text_flat,
-            part_owner=part_owner,
-            part_weights=part_weights,
-        )
-        radius_order = image_feats.new_zeros(())
-        if self.radius_order_weight > 0.0:
-            # Four hinge terms: image/text, part image/part text, and each global embedding vs. its parts.
-            radius_order = (
-                radius_order_hinge(image_feats, text_feats, embeddings["kappa"], self.radius_order_margin)
-                + radius_order_hinge(part_image_flat, part_text_flat, embeddings["kappa"], self.radius_order_margin, part_weights)
-                + radius_order_hinge(image_for_parts, part_image_flat, embeddings["kappa"], self.radius_order_margin, part_weights)
-                + radius_order_hinge(text_for_parts, part_text_flat, embeddings["kappa"], self.radius_order_margin, part_weights)
-            )
-        gramian_align = image_feats.new_zeros(())
-        if self.gramian_align_weight > 0.0 and part_owner.numel() > 0:
-            def _tangent_flat(x: Tensor) -> Tensor:
-                # Apply log_map0 and flatten 3-D results to (rows, -1).
-                tangent = log_map0(x, embeddings["kappa"])
-                return tangent.reshape(tangent.size(0), -1) if tangent.dim() == 3 else tangent
-            # One group of four vectors per part, stacked along a new axis 1.
-            gramian_vectors = torch.stack(
-                [
-                    _tangent_flat(image_for_parts),
-                    _tangent_flat(text_for_parts),
-                    _tangent_flat(part_image_flat),
-                    _tangent_flat(part_text_flat),
-                ],
-                dim=1,
-            )
-            gramian_align = gramian_volume_loss(gramian_vectors, part_weights)
-        # Optional external multiplier for the entailment weight; defaults to 1.
-        entail_weight_scale = embeddings.get("entail_weight_scale", image_feats.new_ones(()))
-        total = (
-            contrastive["contrastive_loss"]
-            + self.himo_component_weight * himo_component_loss
-            + self.entail_weight * entail_weight_scale * entailment["entailment_loss"]
-            + self.aggregate_weight * aggregate
-            + self.radius_order_weight * radius_order
-            + self.gramian_align_weight * gramian_align
-            + self.argent_norm_weight * entailment.get(
-                "norm_regularization_loss",
-                image_feats.new_zeros(()),
-            )
-        )
-        return {
-            "loss": total,
-            **contrastive,
-            "himo_component_contrastive_loss": himo_component_loss,
-            **entailment,
-            "aggregate_consistency_loss": aggregate,
-            "radius_order_loss": radius_order,
-            "gramian_align_loss": gramian_align,
-            "part_count": part_count,
-            "entail_weight_scale": entail_weight_scale.detach(),
-            "part_quality_mean": (
-                image_feats.new_zeros(()) if quality_scores.numel() == 0 else quality_scores.mean().detach()
-            ),
-            "part_quality_keep_fraction": (
-                image_feats.new_zeros(()) if quality_keep.numel() == 0 else quality_keep.mean().detach()
-            ),
-        }
-def build_objective(
-    objective: str,
-    entail_weight: float,
-    inter_aperture_scale: float,
-    intra_aperture_scale: float,
-    uncha_piecewise_factor: float = 0.1,
-    uncha_calibration_alpha: float = 10.0,
-    uncha_stop_grad_calibration: bool = True,
-    uncha_entailment_geometry: str = "lorentz",
-    uncha_aggregate_weight: float = 0.0,
-    uncha_entailment_loss: str = "piecewise",
-    uncha_argent_beta: float = 1.0,
-    uncha_argent_norm_weight: float = 0.0,
-    uncha_argent_aux_weight: float = 0.5,
-    uncha_argent_aggregation: str = "uncha",
-    uncha_part_weight_power: float = 0.0,
-    uncha_contrastive_loss: str = "ce",
-    uncha_sigmoid_negative_weight: float = 1.0,
-    uncha_part_quality_mode: str = "none",
-    uncha_part_quality_topk: int = 5,
-    uncha_part_quality_temperature: float = 4.0,
-    uncha_contrastive_global_weight: float = 1.0,
-    uncha_contrastive_local_weight: float = 1.0,
-    uncha_contrastive_global_local_weight: float = 1.0,
-    uncha_beta_cal_beta: float = 0.0,
-    uncha_beta_cal_variant: str = "ce",
-    uncha_beta_cal_weight: float = 0.0,
-    uncha_himo_component_weight: float = 0.0,
-    uncha_global_local_mode: str = "repeat",
-    uncha_global_local_metric: str = "distance",
-    uncha_global_local_angle_aux_weight: float = 0.0,
-    uncha_global_local_angle_aux_mode: str = "contrastive",
-    uncha_global_local_angle_aux_scale: float = 5.5,
-    uncha_global_local_angle_aux_aperture_scale: float = 1.0,
-    uncha_radius_order_weight: float = 0.0,
-    uncha_radius_order_margin: float = 0.0,
-    uncha_gramian_align_weight: float = 0.0,
-    product_metric: str = "l1",
-) -> nn.Module:
-    """Construct the training objective module selected by ``objective``.
-    ``"hycoclip"`` builds a ``HyCoCLIPObjective`` from the shared arguments
-    (``entail_weight``, the two aperture scales and ``product_metric``).
-    ``"uncha"`` builds a ``UNCHAObjective`` from the shared arguments plus every
-    ``uncha_*`` argument, each forwarded under the same name with the
-    ``uncha_`` prefix removed.  The ``uncha_*`` arguments are not used for
-    ``"hycoclip"``.
-    Raises:
-        ValueError: If ``objective`` is neither ``"hycoclip"`` nor ``"uncha"``.
-    """
-    if objective == "hycoclip":
-        return HyCoCLIPObjective(
-            entail_weight=entail_weight,
-            inter_aperture_scale=inter_aperture_scale,
-            intra_aperture_scale=intra_aperture_scale,
-            product_metric=product_metric,
-        )
-    if objective == "uncha":
-        # One-to-one forwarding: keyword ``x`` receives argument ``uncha_x``.
-        return UNCHAObjective(
-            entail_weight=entail_weight,
-            inter_aperture_scale=inter_aperture_scale,
-            intra_aperture_scale=intra_aperture_scale,
-            piecewise_factor=uncha_piecewise_factor,
-            calibration_alpha=uncha_calibration_alpha,
-            stop_grad_calibration=uncha_stop_grad_calibration,
-            entailment_geometry=uncha_entailment_geometry,
-            aggregate_weight=uncha_aggregate_weight,
-            entailment_loss=uncha_entailment_loss,
-            argent_beta=uncha_argent_beta,
-            argent_norm_weight=uncha_argent_norm_weight,
-            argent_aux_weight=uncha_argent_aux_weight,
-            argent_aggregation=uncha_argent_aggregation,
-            part_weight_power=uncha_part_weight_power,
-            product_metric=product_metric,
-            contrastive_loss=uncha_contrastive_loss,
-            sigmoid_negative_weight=uncha_sigmoid_negative_weight,
-            part_quality_mode=uncha_part_quality_mode,
-            part_quality_topk=uncha_part_quality_topk,
-            part_quality_temperature=uncha_part_quality_temperature,
-            contrastive_global_weight=uncha_contrastive_global_weight,
-            contrastive_local_weight=uncha_contrastive_local_weight,
-            contrastive_global_local_weight=uncha_contrastive_global_local_weight,
-            beta_cal_beta=uncha_beta_cal_beta,
-            beta_cal_variant=uncha_beta_cal_variant,
-            beta_cal_weight=uncha_beta_cal_weight,
-            himo_component_weight=uncha_himo_component_weight,
-            global_local_mode=uncha_global_local_mode,
-            global_local_metric=uncha_global_local_metric,
-            global_local_angle_aux_weight=uncha_global_local_angle_aux_weight,
-            global_local_angle_aux_mode=uncha_global_local_angle_aux_mode,
-            global_local_angle_aux_scale=uncha_global_local_angle_aux_scale,
-            global_local_angle_aux_aperture_scale=uncha_global_local_angle_aux_aperture_scale,
-            radius_order_weight=uncha_radius_order_weight,
-            radius_order_margin=uncha_radius_order_margin,
-            gramian_align_weight=uncha_gramian_align_weight,
-        )
-    raise ValueError(f"Unsupported objective {objective!r}; expected 'hycoclip' or 'uncha'")
-def _part_weights(part_owner: Tensor, batch_size: int, power: float) -> Tensor | None:
-    if power <= 0.0 or part_owner.numel() == 0:
-        return None
-    counts = torch.bincount(part_owner, minlength=batch_size).to(dtype=torch.float32, device=part_owner.device)
-    weights = counts[part_owner].clamp_min(1.0).pow(-power)
-    return weights / weights.mean().clamp_min(torch.finfo(weights.dtype).eps)
-def _combine_part_weights(count_weights: Tensor | None, quality_weights: Tensor | None) -> Tensor | None:
-    if count_weights is None:
-        return quality_weights
-    if quality_weights is None:
-        return count_weights
-    weights = count_weights * quality_weights
-    return weights / weights.mean().clamp_min(torch.finfo(weights.dtype).eps)

hyper3_clip/models/tren.py DELETED Viewed

@@ -1,255 +0,0 @@
-from __future__ import annotations
-import math
-import torch
-import torch.nn.functional as F
-from torch import Tensor, nn
-class FourierPositionEncoding2D(nn.Module):
-    def __init__(self, dim: int, scale: float = 1.0) -> None:
-        super().__init__()
-        if dim <= 0 or dim % 2 != 0:
-            raise ValueError("FourierPositionEncoding2D dim must be a positive even integer")
-        if scale <= 0.0:
-            raise ValueError("FourierPositionEncoding2D scale must be positive")
-        generator = torch.Generator()
-        generator.manual_seed(42)
-        self.register_buffer("gaussian_matrix", scale * torch.randn((2, dim // 2), generator=generator))
-    def forward(self, coords: Tensor) -> Tensor:
-        projected = (2.0 * coords.float() - 1.0) @ self.gaussian_matrix
-        projected = 2.0 * math.pi * projected
-        return torch.cat([torch.sin(projected), torch.cos(projected)], dim=-1)
-class _MLPBlock(nn.Module):
-    def __init__(self, dim: int, hidden_dim: int, dropout: float) -> None:
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.Linear(dim, hidden_dim),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(hidden_dim, dim),
-        )
-    def forward(self, x: Tensor) -> Tensor:
-        return self.net(x)
-class _AttentionLayer(nn.Module):
-    """Multi-head scaled dot-product attention with per-head LayerNorm on queries and keys.
-    Queries and keys are always linearly projected to ``hidden_dim``; the value
-    and output projections can each be replaced by an identity.  ``forward``
-    returns both the attended output and the attention weights.
-    """
-    def __init__(
-        self,
-        q_dim: int,
-        kv_dim: int,
-        hidden_dim: int,
-        *,
-        num_heads: int,
-        dropout: float,
-        use_bias: bool = False,
-        use_v_proj: bool = True,
-        use_out_proj: bool = True,
-    ) -> None:
-        """Build the projections, per-head norms and dropout.
-        Raises:
-            ValueError: If ``hidden_dim`` is not divisible by ``num_heads``, or if
-                the value projection is disabled while ``kv_dim != hidden_dim``.
-        """
-        super().__init__()
-        if hidden_dim % num_heads != 0:
-            raise ValueError("hidden_dim must be divisible by num_heads")
-        # Without a value projection the raw values are split into heads directly,
-        # so they must already have ``hidden_dim`` features.
-        if not use_v_proj and kv_dim != hidden_dim:
-            raise ValueError("kv_dim must equal hidden_dim when value projection is disabled")
-        self.hidden_dim = hidden_dim
-        self.num_heads = num_heads
-        self.head_dim = hidden_dim // num_heads
-        self.q_proj = nn.Linear(q_dim, hidden_dim, bias=use_bias)
-        self.k_proj = nn.Linear(kv_dim, hidden_dim, bias=use_bias)
-        self.v_proj = nn.Linear(kv_dim, hidden_dim, bias=use_bias) if use_v_proj else nn.Identity()
-        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=use_bias) if use_out_proj else nn.Identity()
-        # Normalisation is applied per head, over the ``head_dim`` features.
-        self.q_norm = nn.LayerNorm(self.head_dim)
-        self.k_norm = nn.LayerNorm(self.head_dim)
-        self.dropout = nn.Dropout(dropout)
-        self.scale = self.head_dim**-0.5
-        # Re-initialise every linear projection that actually exists.
-        nn.init.kaiming_normal_(self.q_proj.weight, mode="fan_in", nonlinearity="linear")
-        nn.init.kaiming_normal_(self.k_proj.weight, mode="fan_in", nonlinearity="linear")
-        if isinstance(self.v_proj, nn.Linear):
-            nn.init.kaiming_normal_(self.v_proj.weight, mode="fan_in", nonlinearity="linear")
-        if isinstance(self.out_proj, nn.Linear):
-            nn.init.kaiming_normal_(self.out_proj.weight, mode="fan_in", nonlinearity="linear")
-    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
-        """Attend from ``q`` over ``k``/``v``.
-        ``q`` and ``k`` are unpacked as 3-D ``(batch, length, features)`` tensors.
-        Returns ``(output, attn_weights)`` where ``output`` is
-        ``(batch, q_len, hidden_dim)`` and ``attn_weights`` is
-        ``(batch, num_heads, q_len, kv_len)`` taken after dropout.
-        """
-        batch_size, q_len, _ = q.shape
-        _, kv_len, _ = k.shape
-        # Project, then split features into heads: (batch, heads, length, head_dim).
-        query = self.q_proj(q).view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        key = self.k_proj(k).view(batch_size, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
-        value = self.v_proj(v).view(batch_size, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
-        query = self.q_norm(query)
-        key = self.k_norm(key)
-        attn_scores = torch.matmul(query, key.transpose(-2, -1)) * self.scale
-        attn_weights = self.dropout(F.softmax(attn_scores, dim=-1))
-        out = torch.matmul(attn_weights, value)
-        # Merge the heads back into a single feature dimension.
-        out = out.transpose(1, 2).contiguous().view(batch_size, q_len, self.hidden_dim)
-        return self.out_proj(out), attn_weights
-class _CrossAttentionBlock(nn.Module):
-    def __init__(self, dim: int, *, num_heads: int, dropout: float) -> None:
-        super().__init__()
-        self.query_norm = nn.LayerNorm(dim)
-        self.cross_attn = _AttentionLayer(dim, dim, dim, num_heads=num_heads, dropout=dropout)
-        self.dropout = nn.Dropout(dropout)
-        self.mlp_norm = nn.LayerNorm(dim)
-        self.mlp = _MLPBlock(dim, 2 * dim, dropout)
-        self.out_norm = nn.LayerNorm(dim)
-    def forward(self, query: Tensor, context: Tensor) -> Tensor:
-        x, _ = self.cross_attn(self.query_norm(query), context, context)
-        x = query + self.dropout(x)
-        return self.out_norm(x + self.mlp(self.mlp_norm(x)))
-class TRENRegionEncoder(nn.Module):
-    """T-REN-style point-prompted region token encoder.
-    The module follows the public T-REN architecture: learned k-per-prompt
-    query tokens, Fourier 2D prompt/patch position encodings, alternating
-    cross-attention and per-prompt self-attention, then final single-head
-    attention that pools unprojected patch tokens into region tokens.
-    """
-    def __init__(
-        self,
-        vision_dim: int,
-        text_dim: int,
-        *,
-        hidden_dim: int | None = None,
-        num_region_tokens: int = 3,
-        num_decoder_layers: int = 2,
-        num_attention_heads: int = 8,
-        prompt_grid_size: int = 7,
-        dropout: float = 0.1,
-    ) -> None:
-        """Validate the configuration and build the decoder layers.
-        ``hidden_dim`` falls back to ``vision_dim`` when omitted (or falsy) and
-        must currently equal it.
-        Raises:
-            ValueError: If ``num_region_tokens``, ``num_decoder_layers`` or
-                ``prompt_grid_size`` is not positive, if ``hidden_dim`` differs
-                from ``vision_dim``, is odd, or is not divisible by
-                ``num_attention_heads``.
-        """
-        super().__init__()
-        if num_region_tokens <= 0:
-            raise ValueError("num_region_tokens must be positive")
-        if num_decoder_layers <= 0:
-            raise ValueError("num_decoder_layers must be positive")
-        if prompt_grid_size <= 0:
-            raise ValueError("prompt_grid_size must be positive")
-        hidden_dim = int(hidden_dim or vision_dim)
-        if hidden_dim != vision_dim:
-            raise ValueError("TRENRegionEncoder currently requires hidden_dim == vision_dim")
-        if hidden_dim % 2 != 0:
-            raise ValueError("TRENRegionEncoder hidden_dim must be even for Fourier features")
-        if hidden_dim % num_attention_heads != 0:
-            raise ValueError("TRENRegionEncoder hidden_dim must be divisible by num_attention_heads")
-        self.vision_dim = vision_dim
-        self.text_dim = text_dim
-        self.hidden_dim = hidden_dim
-        self.num_region_tokens = num_region_tokens
-        self.prompt_grid_size = prompt_grid_size
-        self.position_encoder = FourierPositionEncoding2D(hidden_dim)
-        # One learned embedding per region token, shared by every prompt.
-        self.region_token_embeddings = nn.Embedding(num_region_tokens, hidden_dim)
-        nn.init.normal_(self.region_token_embeddings.weight, std=0.02)
-        # Per decoder layer: a cross-attention block over patches plus its norm ...
-        self.region_attention_layers = nn.ModuleList(
-            [_CrossAttentionBlock(hidden_dim, num_heads=num_attention_heads, dropout=dropout) for _ in range(num_decoder_layers)]
-        )
-        self.region_attention_norms = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_decoder_layers)])
-        # ... and a self-attention layer among one prompt's region tokens plus its norm.
-        self.prompt_attention_layers = nn.ModuleList(
-            [
-                _AttentionLayer(
-                    hidden_dim,
-                    hidden_dim,
-                    hidden_dim,
-                    num_heads=num_attention_heads,
-                    dropout=dropout,
-                )
-                for _ in range(num_decoder_layers)
-            ]
-        )
-        self.prompt_attention_norms = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_decoder_layers)])
-        # Final pooling head: single head, no dropout, and no value/output
-        # projection, so its output is an attention-weighted sum of the raw values.
-        self.token_prediction_head = _AttentionLayer(
-            hidden_dim,
-            hidden_dim,
-            hidden_dim,
-            num_heads=1,
-            dropout=0.0,
-            use_v_proj=False,
-            use_out_proj=False,
-        )
-        # MLP that maps pooled visual tokens from ``hidden_dim`` to ``text_dim``.
-        self.text_alignment_block = nn.Sequential(
-            nn.Linear(hidden_dim, 2 * hidden_dim),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(2 * hidden_dim, text_dim),
-        )
-    def forward(self, image_tokens: Tensor) -> dict[str, Tensor]:
-        """Encode image tokens into region tokens for a fixed grid of point prompts.
-        Returns a dict with:
-            ``visual_tokens``: ``(batch, prompts, num_region_tokens, hidden_dim)``.
-            ``text_aligned_tokens``: ``visual_tokens`` passed through the text
-                alignment MLP (last dimension ``text_dim``).
-            ``region_masks``: ``(batch, prompts, num_region_tokens, grid, grid)``
-                attention maps divided by their per-map maximum.
-            ``prompt_coords``: ``(prompts, 2)`` prompt locations.
-        """
-        # Recover the square patch grid (a single leading extra token is dropped if present).
-        patch_tokens, patch_grid = _patch_tokens_and_grid(image_tokens)
-        batch_size, patch_count, _ = patch_tokens.shape
-        patch_coords = _grid_coords(patch_grid, patch_grid, patch_tokens.device)
-        prompt_coords = _grid_coords(self.prompt_grid_size, self.prompt_grid_size, patch_tokens.device)
-        prompt_count = prompt_coords.size(0)
-        feature_pos = self.position_encoder(patch_coords).to(dtype=patch_tokens.dtype)
-        prompt_pos = self.position_encoder(prompt_coords).to(dtype=patch_tokens.dtype)
-        # Keys carry positional information; the raw patch tokens are kept for final pooling.
-        kv = patch_tokens + feature_pos.unsqueeze(0)
-        prompt_pos = prompt_pos.view(1, prompt_count, 1, self.hidden_dim)
-        # Broadcast the learned region embeddings to every (batch, prompt) pair.
-        q = self.region_token_embeddings.weight.to(dtype=patch_tokens.dtype)
-        q = q.view(1, 1, self.num_region_tokens, self.hidden_dim).expand(
-            batch_size,
-            prompt_count,
-            self.num_region_tokens,
-            self.hidden_dim,
-        )
-        for region_layer, region_norm, prompt_layer, prompt_norm in zip(
-            self.region_attention_layers,
-            self.region_attention_norms,
-            self.prompt_attention_layers,
-            self.prompt_attention_norms,
-            strict=True,
-        ):
-            # The prompt position is re-added at the start of every decoder layer.
-            q = q + prompt_pos
-            # Cross-attention: all prompt/region queries attend to the patches together.
-            q = q.reshape(batch_size, prompt_count * self.num_region_tokens, self.hidden_dim)
-            q = region_layer(q, kv)
-            q = q.reshape(batch_size, prompt_count, self.num_region_tokens, self.hidden_dim)
-            q = region_norm(q)
-            # Self-attention: prompts are folded into the batch dimension, so region
-            # tokens only attend to tokens of the same prompt.
-            q = q.reshape(batch_size * prompt_count, self.num_region_tokens, self.hidden_dim)
-            q, _ = prompt_layer(q, q, q)
-            q = prompt_norm(q)
-            q = q.reshape(batch_size, prompt_count, self.num_region_tokens, self.hidden_dim)
-        flat_q = q.reshape(batch_size, prompt_count * self.num_region_tokens, self.hidden_dim)
-        # Keys are position-augmented patches; values are the raw patch tokens.
-        visual_tokens, attn_weights = self.token_prediction_head(flat_q, kv, patch_tokens)
-        visual_tokens = visual_tokens.reshape(batch_size, prompt_count, self.num_region_tokens, self.hidden_dim)
-        # The pooling head has a single head, so the head dimension can be squeezed away.
-        attn_weights = attn_weights.squeeze(1).reshape(batch_size, prompt_count, self.num_region_tokens, patch_count)
-        # Rescale each attention map so its largest entry is 1 (eps guards the division).
-        region_masks = attn_weights / attn_weights.amax(dim=-1, keepdim=True).clamp_min(torch.finfo(attn_weights.dtype).eps)
-        region_masks = region_masks.reshape(batch_size, prompt_count, self.num_region_tokens, patch_grid, patch_grid)
-        text_aligned_tokens = self.text_alignment_block(visual_tokens)
-        return {
-            "visual_tokens": visual_tokens,
-            "text_aligned_tokens": text_aligned_tokens,
-            "region_masks": region_masks,
-            "prompt_coords": prompt_coords,
-        }
-def _patch_tokens_and_grid(tokens: Tensor) -> tuple[Tensor, int]:
-    if tokens.ndim != 3:
-        raise ValueError("TRENRegionEncoder expects image tokens with shape [batch, tokens, dim]")
-    token_count = tokens.size(1)
-    grid = int(math.isqrt(token_count))
-    if grid * grid == token_count:
-        return tokens, grid
-    grid = int(math.isqrt(token_count - 1))
-    if grid * grid == token_count - 1:
-        return tokens[:, 1:, :], grid
-    raise ValueError(f"Cannot infer a square patch grid from {token_count} image tokens")
-def _grid_coords(height: int, width: int, device: torch.device) -> Tensor:
-    y = torch.linspace(0.5 / height, 1.0 - 0.5 / height, height, device=device)
-    x = torch.linspace(0.5 / width, 1.0 - 0.5 / width, width, device=device)
-    yy, xx = torch.meshgrid(y, x, indexing="ij")
-    return torch.stack([xx, yy], dim=-1).reshape(-1, 2)

hyper3_clip/training/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	- __all__: list[str] = []

hyper3_clip/training/checkpointing.py DELETED Viewed

@@ -1,91 +0,0 @@
-from __future__ import annotations
-from pathlib import Path
-import random
-from typing import Any
-import numpy as np
-import torch
-from torch import nn
-def save_checkpoint(
-    path: str | Path,
-    step: int,
-    model: nn.Module,
-    optimizer: torch.optim.Optimizer,
-    scheduler: Any,
-    scaler: Any,
-    config: dict,
-) -> None:
-    checkpoint_path = Path(path)
-    tmp_path = checkpoint_path.with_name(f"{checkpoint_path.name}.tmp")
-    checkpoint = {
-        "step": step,
-        "model": model.state_dict(),
-        "optimizer": optimizer.state_dict(),
-        "scheduler": scheduler.state_dict(),
-        "scaler": scaler.state_dict(),
-        "config": config,
-        "rng": _rng_state(),
-    }
-    torch.save(checkpoint, tmp_path)
-    tmp_path.replace(checkpoint_path)
-def load_checkpoint(
-    path: str | Path,
-    model: nn.Module,
-    optimizer: torch.optim.Optimizer,
-    scheduler: Any,
-    scaler: Any,
-    device: torch.device,
-    *,
-    model_only: bool = False,
-    strict_model: bool = True,
-) -> int:
-    checkpoint = torch.load(path, map_location=device, weights_only=False)
-    model.load_state_dict(checkpoint["model"], strict=strict_model)
-    if model_only:
-        return int(checkpoint["step"])
-    optimizer.load_state_dict(checkpoint["optimizer"])
-    scheduler.load_state_dict(checkpoint["scheduler"])
-    scaler.load_state_dict(checkpoint["scaler"])
-    _set_rng_state(checkpoint["rng"])
-    return int(checkpoint["step"])
-def latest_checkpoint(output_dir: str | Path) -> Path | None:
-    paths = sorted(Path(output_dir).glob("checkpoint_step_*.pt"))
-    if not paths:
-        return None
-    return max(paths, key=_checkpoint_step)
-def _checkpoint_step(path: Path) -> int:
-    return int(path.stem.rsplit("_", 1)[1])
-def _rng_state() -> dict[str, Any]:
-    state: dict[str, Any] = {
-        "python": random.getstate(),
-        "numpy": np.random.get_state(),
-        "torch": torch.get_rng_state(),
-    }
-    if torch.cuda.is_available():
-        state["cuda"] = torch.cuda.get_rng_state_all()
-    return state
-def _set_rng_state(state: dict[str, Any]) -> None:
-    random.setstate(state["python"])
-    np.random.set_state(state["numpy"])
-    torch.set_rng_state(_cpu_byte_tensor(state["torch"]))
-    if torch.cuda.is_available() and "cuda" in state:
-        torch.cuda.set_rng_state_all([_cpu_byte_tensor(cuda_state) for cuda_state in state["cuda"]])
-def _cpu_byte_tensor(value: Any) -> torch.ByteTensor:
-    if isinstance(value, torch.Tensor):
-        return value.detach().to(device="cpu", dtype=torch.uint8)
-    return torch.as_tensor(value, dtype=torch.uint8, device="cpu")

hyper3_clip/training/distributed.py DELETED Viewed

@@ -1,149 +0,0 @@
-from __future__ import annotations
-from collections.abc import Sequence
-import os
-import torch
-import torch.distributed as dist
-from torch.distributed.nn import all_gather as differentiable_all_gather
-from torch import Tensor
-def init_distributed() -> None:
-    if "RANK" in os.environ and "WORLD_SIZE" in os.environ and not dist.is_initialized():
-        backend = "nccl" if torch.cuda.is_available() else "gloo"
-        if torch.cuda.is_available():
-            torch.cuda.set_device(get_local_rank())
-        dist.init_process_group(backend=backend)
-def is_distributed() -> bool:
-    return dist.is_available() and dist.is_initialized()
-def barrier() -> None:
-    if is_distributed():
-        dist.barrier()
-def destroy_distributed() -> None:
-    if is_distributed():
-        dist.destroy_process_group()
-def get_rank() -> int:
-    return dist.get_rank() if is_distributed() else 0
-def get_world_size() -> int:
-    return dist.get_world_size() if is_distributed() else 1
-def get_local_rank() -> int:
-    return int(os.environ.get("LOCAL_RANK", "0"))
-def is_main_process() -> bool:
-    return get_rank() == 0
-def gather_with_grad(tensor: Tensor) -> Tensor:
-    world_size = get_world_size()
-    if world_size == 1:
-        return tensor
-    return torch.cat(list(differentiable_all_gather(tensor.contiguous())), dim=0)
-def gather_variable_with_grad(tensor: Tensor) -> tuple[Tensor, Tensor]:
-    """Gather tensors with variable first-dimension lengths across ranks."""
-    count_tensor, max_count, keep = _variable_gather_metadata(tensor)
-    if get_world_size() == 1:
-        return tensor, count_tensor
-    return _gather_variable_from_metadata(tensor, max_count, keep), count_tensor
-def gather_variable_many_with_grad(tensors: Sequence[Tensor]) -> tuple[list[Tensor], Tensor]:
-    """Gather same-length variable tensors while sharing count metadata.
-    Tensors with matching dtype, rank and middle dimensions (everything between
-    the first and the last) are packed along the last dimension so a single
-    differentiable all-gather can serve several feature tensors with the same
-    variable first dimension.  1-D tensors and tensors alone in their group are
-    gathered individually.
-    Returns the gathered tensors in input order and the per-rank row counts.
-    Raises:
-        ValueError: If ``tensors`` is empty, the tensors are on different devices
-            or disagree on the first dimension, or (multi-process only) a tensor
-            is 0-dimensional.
-    """
-    if not tensors:
-        raise ValueError("gather_variable_many_with_grad requires at least one tensor")
-    first = tensors[0]
-    for tensor in tensors:
-        if tensor.device != first.device:
-            raise ValueError("all tensors must be on the same device")
-        if tensor.shape[0] != first.shape[0]:
-            raise ValueError("all tensors must have the same first dimension")
-    # Count metadata is computed once from the first tensor and reused for all.
-    count_tensor, max_count, keep = _variable_gather_metadata(first)
-    if get_world_size() == 1:
-        return list(tensors), count_tensor
-    gathered: list[Tensor | None] = [None] * len(tensors)
-    # Group input positions by (dtype, middle shape, rank); members of one group
-    # can be concatenated along their last dimension.
-    groups: dict[tuple[torch.dtype, torch.Size, int], list[int]] = {}
-    for index, tensor in enumerate(tensors):
-        if tensor.dim() == 0:
-            raise ValueError("variable gather tensors must have at least one dimension")
-        key = (tensor.dtype, tensor.shape[1:-1], tensor.dim()) if tensor.dim() > 1 else (tensor.dtype, torch.Size(), 1)
-        groups.setdefault(key, []).append(index)
-    for indices in groups.values():
-        group_tensors = [tensors[index] for index in indices]
-        # Nothing to pack for singletons; 1-D tensors have no separate last dimension to pack along.
-        if len(group_tensors) == 1 or group_tensors[0].dim() == 1:
-            for index, tensor in zip(indices, group_tensors, strict=True):
-                gathered[index] = _gather_variable_from_metadata(tensor, max_count, keep)
-            continue
-        # Pack, gather once, then split back using the original last-dimension widths.
-        widths = [tensor.shape[-1] for tensor in group_tensors]
-        packed = torch.cat(group_tensors, dim=-1)
-        gathered_packed = _gather_variable_from_metadata(packed, max_count, keep)
-        for index, chunk in zip(indices, gathered_packed.split(widths, dim=-1), strict=True):
-            gathered[index] = chunk
-    if any(tensor is None for tensor in gathered):
-        raise RuntimeError("internal error while gathering variable tensors")
-    return [tensor for tensor in gathered if tensor is not None], count_tensor
-def gather_variable_no_grad(tensor: Tensor) -> tuple[Tensor, Tensor]:
-    """Gather variable-length tensors that do not require autograd."""
-    count_tensor, max_count, keep = _variable_gather_metadata(tensor)
-    if get_world_size() == 1:
-        return tensor, count_tensor
-    padded = tensor.new_zeros((max_count, *tensor.shape[1:]))
-    padded[: tensor.shape[0]] = tensor
-    gathered = [torch.zeros_like(padded) for _ in range(get_world_size())]
-    dist.all_gather(gathered, padded.contiguous())
-    return torch.cat(gathered, dim=0)[keep], count_tensor
-def _variable_gather_metadata(tensor: Tensor) -> tuple[Tensor, int, Tensor]:
-    world_size = get_world_size()
-    local_count = torch.tensor([tensor.shape[0]], device=tensor.device, dtype=torch.long)
-    if world_size == 1:
-        keep = torch.ones(tensor.shape[0], device=tensor.device, dtype=torch.bool)
-        return local_count, tensor.shape[0], keep
-    counts = [torch.zeros_like(local_count) for _ in range(world_size)]
-    dist.all_gather(counts, local_count)
-    count_tensor = torch.cat(counts)
-    max_count = int(count_tensor.max().item())
-    keep = torch.zeros(world_size * max_count, device=tensor.device, dtype=torch.bool)
-    for rank, count in enumerate(count_tensor.tolist()):
-        start = rank * max_count
-        keep[start : start + count] = True
-    return count_tensor, max_count, keep
-def _gather_variable_from_metadata(tensor: Tensor, max_count: int, keep: Tensor) -> Tensor:
-    padded_shape = (max_count, *tensor.shape[1:])
-    padded = tensor.new_zeros(padded_shape)
-    padded[: tensor.shape[0]] = tensor
-    gathered = torch.cat(list(differentiable_all_gather(padded.contiguous())), dim=0)
-    return gathered[keep]
-def local_target_indices(batch_size: int, device: torch.device) -> Tensor:
-    return torch.arange(batch_size, device=device) + batch_size * get_rank()

hyper3_clip/training/engine.py DELETED Viewed

@@ -1,442 +0,0 @@
-from __future__ import annotations
-from datetime import datetime, timezone
-import json
-import os
-from pathlib import Path
-import time
-import torch
-from torch import nn
-from torch.optim import AdamW, Optimizer
-from torch.nn.parallel import DistributedDataParallel
-from torch.utils.data import DataLoader, DistributedSampler, IterableDataset
-from torch.amp import GradScaler
-from hyper3_clip.data import (
-    GroundedManifestDataset,
-    MixedGroundedIterableDataset,
-    ProcessedGritDataset,
-    collate_grounded,
-)
-from hyper3_clip.models.hyper3_clip import Hyper3CLIP
-from hyper3_clip.training.checkpointing import latest_checkpoint, load_checkpoint, save_checkpoint
-from hyper3_clip.training.distributed import (
-    barrier,
-    destroy_distributed,
-    get_local_rank,
-    get_rank,
-    get_world_size,
-    init_distributed,
-    is_main_process,
-)
-from hyper3_clip.training.logging import JsonlLogger
-from hyper3_clip.utils.io import ensure_dir, save_yaml, set_seed
-try:
-    from hypercluster.hooks import RunControl
-except ImportError:  # pragma: no cover - hypercluster is only present in cluster allocations.
-    RunControl = None
-class CosineWithWarmup:
-    """Linear warmup followed by cosine decay, written into every param group's ``lr``.
-
-    The schedule keeps no running state: ``step(step_idx)`` computes the rate for
-    that index directly, so ``state_dict`` only carries the three hyperparameters.
-    """
-    def __init__(self, optimizer: torch.optim.Optimizer, warmup_steps: int, total_steps: int, base_lr: float) -> None:
-        self.optimizer = optimizer
-        self.warmup_steps = warmup_steps
-        self.total_steps = total_steps
-        self.base_lr = base_lr
-    def step(self, step_idx: int) -> None:
-        """Set the learning rate for ``step_idx`` on all param groups of the optimizer."""
-        if step_idx < self.warmup_steps:
-            # Ramp from base_lr / warmup_steps up to base_lr.
-            lr = self.base_lr * float(step_idx + 1) / float(max(1, self.warmup_steps))
-        else:
-            decay_span = max(1, self.total_steps - self.warmup_steps)
-            # Clamp so indices at or past total_steps stay at the end of the decay;
-            # unclamped, the cosine turns around and the rate climbs again.
-            progress = min(1.0, float(step_idx - self.warmup_steps) / float(decay_span))
-            lr = self.base_lr * 0.5 * (1.0 + torch.cos(torch.tensor(progress * torch.pi)).item())
-        for group in self.optimizer.param_groups:
-            group["lr"] = lr
-    def state_dict(self) -> dict[str, int | float]:
-        """Return the schedule hyperparameters for checkpointing."""
-        return {"warmup_steps": self.warmup_steps, "total_steps": self.total_steps, "base_lr": self.base_lr}
-    def load_state_dict(self, state: dict[str, int | float]) -> None:
-        """Restore the hyperparameters produced by ``state_dict``."""
-        self.warmup_steps = int(state["warmup_steps"])
-        self.total_steps = int(state["total_steps"])
-        self.base_lr = float(state["base_lr"])
-def _build_optimizer(model: nn.Module, cfg: dict) -> AdamW:
-    """Build AdamW with two param groups: weight-decayed and decay-exempt.
-
-    A trainable parameter is exempt when it has fewer than two dimensions, its
-    leaf name is "bias" or is listed in ``cfg["optimizer"]["no_decay_params"]``,
-    or its full name contains "norm" (case-insensitive). Frozen parameters are
-    left out of the optimizer.
-    """
-    exempt_leaves = set(cfg.get("optimizer", {}).get("no_decay_params", []))
-    def _skips_decay(name: str, param) -> bool:
-        leaf = name.rsplit(".", 1)[-1]
-        return param.ndim < 2 or leaf in exempt_leaves or leaf == "bias" or "norm" in name.lower()
-    regularized = []
-    exempt = []
-    for name, param in model.named_parameters():
-        if param.requires_grad:
-            bucket = exempt if _skips_decay(name, param) else regularized
-            bucket.append(param)
-    training_cfg = cfg["training"]
-    groups = [
-        {"params": regularized, "weight_decay": training_cfg["weight_decay"]},
-        {"params": exempt, "weight_decay": 0.0},
-    ]
-    return AdamW(groups, lr=training_cfg["lr"], betas=tuple(training_cfg["betas"]))
-def run_training(config: dict) -> None:
-    """Run the full training loop described by ``config``.
-
-    Sets up distributed state, seeding, the model (optionally wrapped in
-    DistributedDataParallel), the data loader, optimizer, LR schedule and grad
-    scaler, optionally resumes from a checkpoint, then trains until
-    ``config["training"]["total_steps"]`` optimizer steps have completed. The main
-    process writes config.yaml, metadata.json, the JSONL log and checkpoints under
-    ``config["output_dir"]``.
-
-    Raises SystemExit(run_control.PAUSED_EXIT_CODE) after checkpointing when the
-    optional hypercluster run control requests a pause.
-    """
-    init_distributed()
-    # Each rank gets a distinct seed derived from the configured one.
-    set_seed(config["seed"] + get_rank())
-    ensure_dir(config["output_dir"])
-    started_at = utc_timestamp()
-    if is_main_process():
-        save_yaml(Path(config["output_dir"]) / "config.yaml", config)
-        write_metadata(config, status="running", started_at=started_at)
-    # Device selection: pin to the local-rank GPU only when LOCAL_RANK is set in the environment.
-    if torch.cuda.is_available():
-        if "LOCAL_RANK" in os.environ:
-            device = torch.device(f"cuda:{get_local_rank()}")
-            torch.cuda.set_device(device)
-        else:
-            device = torch.device("cuda")
-    else:
-        device = torch.device("cpu")
-    if device.type == "cuda":
-        torch.cuda.reset_peak_memory_stats()
-        torch.backends.cudnn.benchmark = bool(config["training"].get("cudnn_benchmark", False))
-    # raw_model stays the unwrapped module; it is what gets checkpointed and what
-    # the optimizer and tokenizer are taken from. `model` is the (possibly DDP) forward target.
-    raw_model = Hyper3CLIP(**config["model"]).to(device)
-    channels_last = str(config["training"].get("memory_format", "")).lower() == "channels_last"
-    if channels_last:
-        raw_model = raw_model.to(memory_format=torch.channels_last)
-    model: nn.Module = raw_model
-    if get_world_size() > 1:
-        device_ids = [get_local_rank()] if device.type == "cuda" else None
-        model = DistributedDataParallel(
-            raw_model,
-            device_ids=device_ids,
-            broadcast_buffers=False,
-            find_unused_parameters=bool(config["training"].get("find_unused_parameters", False)),
-        )
-    dataset = _build_dataset(config["data"], config["seed"])
-    sampler = _build_sampler(dataset)
-    local_batch_size = _local_batch_size(config["training"])
-    # Loader options are read from config["data"] first, falling back to config["training"].
-    num_workers = config["data"].get("num_workers", config["training"].get("num_workers", 4))
-    dataloader_kwargs = {}
-    # persistent_workers / prefetch_factor are only passed when worker processes are used.
-    if num_workers > 0:
-        dataloader_kwargs["persistent_workers"] = bool(
-            config["data"].get("persistent_workers", config["training"].get("persistent_workers", False))
-        )
-        prefetch_factor = config["data"].get("prefetch_factor", config["training"].get("prefetch_factor"))
-        if prefetch_factor is not None:
-            dataloader_kwargs["prefetch_factor"] = int(prefetch_factor)
-    beta_clip_data_config = config["data"].get("beta_clip", {})
-    dataloader = DataLoader(
-        dataset,
-        batch_size=local_batch_size,
-        sampler=sampler,
-        # Shuffle in the loader only when there is no sampler and the dataset is map-style.
-        shuffle=sampler is None and not isinstance(dataset, IterableDataset),
-        num_workers=num_workers,
-        pin_memory=bool(config["data"].get("pin_memory", True)),
-        drop_last=True,
-        collate_fn=lambda x: collate_grounded(
-            x,
-            tokenizer=raw_model.text_encoder.tokenizer,
-            max_text_length=config["data"]["max_text_length"],
-            beta_clip_queries=bool(beta_clip_data_config.get("enabled", False)),
-            beta_clip_max_sentences=int(beta_clip_data_config.get("max_sentences", 5)),
-            beta_clip_max_phrases=int(beta_clip_data_config.get("max_phrases", 30)),
-            beta_clip_max_queries_per_image=beta_clip_data_config.get("max_queries_per_image"),
-            beta_clip_use_part_texts=bool(beta_clip_data_config.get("use_part_texts", True)),
-        ),
-        **dataloader_kwargs,
-    )
-    optimizer = _build_optimizer(model=raw_model, cfg=config)
-    scheduler = CosineWithWarmup(
-        optimizer=optimizer,
-        warmup_steps=config["training"]["warmup_steps"],
-        total_steps=config["training"]["total_steps"],
-        base_lr=config["training"]["lr"],
-    )
-    scaler = GradScaler(device.type, enabled=config["training"]["amp"])
-    start_step = _resume_step(config, raw_model, optimizer, scheduler, scaler, device)
-    # RunControl is None when the hypercluster import failed at module load.
-    run_control = RunControl.from_env() if RunControl is not None else None
-    logger = JsonlLogger(Path(config["output_dir"]) / "train_log.jsonl")
-    model.train()
-    # `step` counts completed optimizer steps; `micro_step` counts forward/backward passes.
-    step = start_step
-    micro_step = 0
-    grad_accum_steps = max(1, int(config["training"].get("grad_accum_steps", 1)))
-    non_blocking_transfer = bool(config["training"].get("non_blocking_transfer", True))
-    micro_batch_global_size = local_batch_size * get_world_size()
-    effective_global_batch_size = micro_batch_global_size * grad_accum_steps
-    last_step_time = time.perf_counter()
-    optimizer.zero_grad(set_to_none=True)
-    # NOTE(review): if the dataloader yields no batches, `step` never advances and this loop does not end.
-    while step < config["training"]["total_steps"]:
-        if sampler is not None:
-            # The current step is passed as the sampler's epoch value on each pass over the data.
-            sampler.set_epoch(step)
-        for batch in dataloader:
-            if step >= config["training"]["total_steps"]:
-                break
-            # Start of an accumulation window: clear gradients and set the LR for this step.
-            if micro_step % grad_accum_steps == 0:
-                optimizer.zero_grad(set_to_none=True)
-                scheduler.step(step)
-            batch = {k: v.to(device, non_blocking=non_blocking_transfer) for k, v in batch.items()}
-            if channels_last:
-                batch["image"] = batch["image"].contiguous(memory_format=torch.channels_last)
-                batch["part_images"] = batch["part_images"].contiguous(memory_format=torch.channels_last)
-            with torch.autocast(device_type=device.type, dtype=torch.float16, enabled=config["training"]["amp"]):
-                out = model(**batch, step=step)
-                # Scale down so gradients summed over the window match one large batch.
-                loss = out["loss"] / grad_accum_steps
-            scaler.scale(loss).backward()
-            micro_step += 1
-            # Keep accumulating until the window is full.
-            if micro_step % grad_accum_steps != 0:
-                continue
-            if config["training"]["max_grad_norm"] > 0:
-                # Gradients are unscaled before clipping so the threshold applies to true magnitudes.
-                scaler.unscale_(optimizer)
-                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), config["training"]["max_grad_norm"])
-            else:
-                grad_norm = None
-            scaler.step(optimizer)
-            scaler.update()
-            completed_steps = step + 1
-            now = time.perf_counter()
-            step_time_seconds = now - last_step_time
-            last_step_time = now
-            if completed_steps == 1 or completed_steps % config["training"]["log_interval"] == 0:
-                remaining_steps = config["training"]["total_steps"] - completed_steps
-                # The logged losses come from `out`, i.e. the last micro-batch of the window.
-                row = {
-                    "timestamp": datetime.now(timezone.utc).replace(microsecond=0).isoformat(),
-                    "step": completed_steps,
-                    "loss": float(out["loss"].detach().cpu().item()),
-                    "contrastive_loss": float(out["contrastive_loss"].detach().cpu().item()),
-                    "entailment_loss": float(out["entailment_loss"].detach().cpu().item()),
-                    "part_count": int(out["part_count"].detach().cpu().item()),
-                    "kappa": float(out["kappa"].detach().cpu().item()),
-                    "lr": optimizer.param_groups[0]["lr"],
-                    "grad_norm": None if grad_norm is None else float(grad_norm.detach().cpu().item()),
-                    "step_time_seconds": step_time_seconds,
-                    "steps_per_second": 1.0 / max(step_time_seconds, 1e-12),
-                    "samples_per_second": effective_global_batch_size / max(step_time_seconds, 1e-12),
-                    "samples_seen": completed_steps * effective_global_batch_size,
-                    "progress": completed_steps / config["training"]["total_steps"],
-                    "eta_seconds": remaining_steps * step_time_seconds,
-                    "rank": get_rank(),
-                    "world_size": get_world_size(),
-                    "local_batch_size": local_batch_size,
-                    "micro_batch_global_size": micro_batch_global_size,
-                    "global_batch_size": effective_global_batch_size,
-                    "grad_accum_steps": grad_accum_steps,
-                }
-                if device.type == "cuda":
-                    row["cuda_max_memory_allocated_mb"] = torch.cuda.max_memory_allocated() / (1024**2)
-                # Add any other single-element tensors the model returned that are not already logged.
-                for key, value in out.items():
-                    if key in row or key == "loss":
-                        continue
-                    if torch.is_tensor(value) and value.numel() == 1:
-                        row[key] = _scalar_log_value(value)
-                # The row is built on every rank but only the main process writes it.
-                if is_main_process():
-                    logger.write(row)
-                    print(_format_log_row(row), flush=True)
-            if is_main_process() and completed_steps > 0 and completed_steps % config["training"]["ckpt_interval"] == 0:
-                ckpt_path = str(Path(config["output_dir"]) / f"checkpoint_step_{completed_steps}.pt")
-                save_checkpoint(ckpt_path, completed_steps, raw_model, optimizer, scheduler, scaler, config)
-            step = completed_steps
-            # Pause request: checkpoint on the main process, sync all ranks, tear down, exit.
-            if run_control is not None and run_control.should_pause():
-                ckpt_path = str(Path(config["output_dir"]) / f"checkpoint_step_{completed_steps}.pt")
-                if is_main_process():
-                    save_checkpoint(ckpt_path, completed_steps, raw_model, optimizer, scheduler, scaler, config)
-                    (Path(config["output_dir"]) / "latest_checkpoint.txt").write_text(f"{ckpt_path}\n", encoding="utf-8")
-                    run_control.report_checkpoint(ckpt_path)
-                    write_metadata(config, status="paused", started_at=started_at, ended_at=utc_timestamp(), final_step=completed_steps)
-                barrier()
-                destroy_distributed()
-                raise SystemExit(run_control.PAUSED_EXIT_CODE)
-    # Normal completion: wait for all ranks, write the final checkpoint and metadata, then shut down.
-    barrier()
-    if is_main_process():
-        final_ckpt = str(Path(config["output_dir"]) / "checkpoint_final.pt")
-        save_checkpoint(final_ckpt, step, raw_model, optimizer, scheduler, scaler, config)
-        write_metadata(config, status="completed", started_at=started_at, ended_at=utc_timestamp(), final_step=step)
-    barrier()
-    destroy_distributed()
-def utc_timestamp() -> str:
-    """Return the current UTC time as an ISO-8601 string with second precision."""
-    current = datetime.now(tz=timezone.utc)
-    return current.isoformat(timespec="seconds")
-def write_metadata(
-    config: dict,
-    *,
-    status: str,
-    started_at: str,
-    ended_at: str | None = None,
-    final_step: int | None = None,
-) -> None:
-    """Write ``metadata.json`` (run identity, status, timing, scheduler and env info) into ``output_dir``."""
-    def _first_env(*names: str) -> str | None:
-        # Mirrors an `a or b or c` chain: first truthy value, otherwise the last lookup's result.
-        value = None
-        for name in names:
-            value = os.environ.get(name)
-            if value:
-                break
-        return value
-    project = config["project"]
-    data_section = config.get("data", {})
-    model_section = config.get("model", {})
-    tags = {
-        "data": data_section.get("type", "unknown"),
-        "model": model_section.get("vision_backbone", "unknown"),
-        "objective": model_section.get("objective", "hycoclip"),
-    }
-    job = {
-        "job_id": _first_env("JOB_ID", "SCHEDULER_JOB_ID", "SLURM_JOB_ID"),
-        "partition": _first_env("JOB_PARTITION", "SCHEDULER_PARTITION", "SLURM_JOB_PARTITION"),
-        "num_nodes": _first_env("NUM_NODES", "SLURM_JOB_NUM_NODES"),
-        "node_list": _first_env("NODE_LIST", "SLURM_JOB_NODELIST"),
-        "gpus": _first_env("GPU_DEVICES", "SLURM_JOB_GPUS", "SLURM_GPUS"),
-    }
-    env = {
-        "hostname": os.environ.get("HOSTNAME"),
-        "world_size": str(get_world_size()),
-        "rank": str(get_rank()),
-    }
-    metadata = {
-        "run_id": project["experiment"],
-        "experiment_name": project["name"],
-        "status": status,
-        "start_time": started_at,
-        "end_time": ended_at,
-        "final_step": final_step,
-        "tags": tags,
-        "job": job,
-        "env": env,
-    }
-    serialized = json.dumps(metadata, indent=2, sort_keys=True)
-    target = Path(config["output_dir"]) / "metadata.json"
-    target.write_text(serialized + "\n", encoding="utf-8")
-def _build_dataset(data_config: dict, seed: int) -> GroundedManifestDataset | ProcessedGritDataset | MixedGroundedIterableDataset:
-    """Construct the training dataset selected by ``data_config["type"]``.
-
-    Supported types are "manifest", "processed_grit" and
-    "mixed_processed_grit_manifest". When the type is absent it is inferred:
-    "processed_grit" if ``tarfiles`` is set, otherwise "manifest".
-
-    Raises ValueError for an unknown type or when a manifest-based type has no
-    manifests configured.
-    """
-    def _processed_grit():
-        # The pure-GRIT and mixed configurations build the tar-backed dataset identically.
-        return ProcessedGritDataset(
-            tarfiles=data_config["tarfiles"],
-            image_size=data_config["image_size"],
-            seed=seed,
-            shuffle_buffer=data_config.get("shuffle_buffer", 4000),
-            part_sampling=data_config.get("part_sampling", "random_one"),
-            max_parts=data_config.get("max_parts"),
-            train_transform=data_config.get("train_transform", "wide_random_crop"),
-            image_normalization=data_config.get("image_normalization", "imagenet"),
-            deterministic_transforms=data_config.get("deterministic_transforms", False),
-        )
-    data_type = data_config.get("type")
-    if data_type is None:
-        data_type = "processed_grit" if data_config.get("tarfiles") else "manifest"
-    if data_type == "processed_grit":
-        return _processed_grit()
-    if data_type == "manifest":
-        manifests = data_config.get("manifests") or data_config.get("manifest")
-        if manifests is None:
-            raise ValueError("Manifest training requires data.manifests or data.manifest")
-        return GroundedManifestDataset(
-            manifests=manifests,
-            image_size=data_config["image_size"],
-            seed=seed,
-            manifest_weights=data_config.get("manifest_weights"),
-            part_sampling=data_config.get("part_sampling", "random_one"),
-            max_parts=data_config.get("max_parts"),
-            train_transform=data_config.get("train_transform", "wide_random_crop"),
-            image_normalization=data_config.get("image_normalization", "imagenet"),
-        )
-    if data_type != "mixed_processed_grit_manifest":
-        raise ValueError(f"Unsupported data.type {data_type!r}")
-    # Mixed mode: manifest settings come from data.manifest_data, falling back to top-level data keys.
-    manifest_config = data_config.get("manifest_data", {})
-    manifests = manifest_config.get("manifests") or manifest_config.get("manifest") or data_config.get("manifests")
-    if manifests is None:
-        raise ValueError("Mixed GRIT+manifest training requires data.manifest_data.manifests")
-    primary = _processed_grit()
-    auxiliary = GroundedManifestDataset(
-        manifests=manifests,
-        image_size=manifest_config.get("image_size", data_config["image_size"]),
-        # The auxiliary stream uses a different seed from the primary one.
-        seed=seed + 47,
-        manifest_weights=manifest_config.get("manifest_weights"),
-        part_sampling=manifest_config.get("part_sampling", data_config.get("manifest_part_sampling", "all")),
-        max_parts=manifest_config.get("max_parts", data_config.get("manifest_max_parts")),
-        train_transform=manifest_config.get("train_transform", data_config.get("train_transform", "wide_random_crop")),
-        image_normalization=manifest_config.get("image_normalization", data_config.get("image_normalization", "imagenet")),
-    )
-    return MixedGroundedIterableDataset(
-        primary=primary,
-        auxiliary=auxiliary,
-        auxiliary_probability=float(data_config.get("manifest_probability", 0.15)),
-        seed=seed,
-    )
-def _build_sampler(dataset: GroundedManifestDataset | ProcessedGritDataset | MixedGroundedIterableDataset) -> DistributedSampler | None:
-    """Return a shuffling DistributedSampler for map-style datasets in multi-process runs, else None."""
-    world_size = get_world_size()
-    needs_sampler = world_size != 1 and not isinstance(dataset, IterableDataset)
-    if not needs_sampler:
-        return None
-    return DistributedSampler(dataset, num_replicas=world_size, rank=get_rank(), shuffle=True, drop_last=True)
-def _local_batch_size(training_config: dict) -> int:
-    """Return the per-process batch size.
-
-    ``batch_size`` is used directly when present; otherwise ``global_batch_size``
-    is split evenly across the world size (ValueError if it does not divide).
-    """
-    if "batch_size" not in training_config:
-        total = int(training_config["global_batch_size"])
-        per_rank, leftover = divmod(total, get_world_size())
-        if leftover != 0:
-            raise ValueError("training.global_batch_size must be divisible by world size")
-        return per_rank
-    return int(training_config["batch_size"])
-def _resume_step(
-    config: dict,
-    model: nn.Module,
-    optimizer: Optimizer,
-    scheduler: CosineWithWarmup,
-    scaler: GradScaler,
-    device: torch.device,
-) -> int:
-    """Load a checkpoint if one is configured and return the step to resume from.
-
-    The checkpoint path is looked up in this order: the environment variable named
-    by ``training.resume_from_env`` (default "RESUME_FROM_CHECKPOINT"), then
-    ``training.resume_from``, then ``latest_checkpoint(output_dir)`` when
-    ``training.resume`` is true. Returns 0 when no path is found; otherwise
-    returns whatever ``load_checkpoint`` returns.
-
-    An empty string from the environment or from ``training.resume_from`` counts
-    as "not set" and falls through to the next source.
-    """
-    training_config = config["training"]
-    resume_env = training_config.get("resume_from_env", "RESUME_FROM_CHECKPOINT")
-    # `or None` folds an exported-but-empty variable into "unset" instead of
-    # handing "" to load_checkpoint as if it were a path.
-    resume_path = (os.environ.get(str(resume_env)) or None) if resume_env else None
-    if resume_path is None:
-        resume_path = training_config.get("resume_from") or None
-    if resume_path is None and training_config.get("resume", False):
-        resume_path = latest_checkpoint(config["output_dir"])
-    if resume_path is None:
-        return 0
-    return load_checkpoint(
-        resume_path,
-        model,
-        optimizer,
-        scheduler,
-        scaler,
-        device,
-        model_only=bool(training_config.get("resume_model_only", False)),
-        strict_model=bool(training_config.get("resume_strict_model", True)),
-    )
-def _format_log_row(row: dict) -> str:
-    """Render a log row as space-separated ``key=value`` pairs in insertion order."""
-    pairs = [f"{key}={value}" for key, value in row.items()]
-    return " ".join(pairs)
-def _scalar_log_value(value: torch.Tensor) -> float | int:
-    """Convert a one-element tensor to a Python number: int for bool/integer dtypes, float otherwise."""
-    integral_dtypes = (torch.bool, torch.int8, torch.int16, torch.int32, torch.int64, torch.uint8)
-    scalar = value.detach().cpu()
-    convert = int if scalar.dtype in integral_dtypes else float
-    return convert(scalar.item())

hyper3_clip/training/logging.py DELETED Viewed

@@ -1,15 +0,0 @@
-from __future__ import annotations
-import json
-from pathlib import Path
-from typing import Any
-class JsonlLogger:
-    """Append-only JSON Lines writer; the file is reopened for every row."""
-    def __init__(self, path: str | Path) -> None:
-        target = Path(path)
-        # Create the containing directory up front so the first write cannot fail on it.
-        target.parent.mkdir(parents=True, exist_ok=True)
-        self.path = target
-    def write(self, row: dict[str, Any]) -> None:
-        """Serialize ``row`` as one JSON line and append it to the log file."""
-        line = json.dumps(row)
-        with self.path.open("a", encoding="utf-8") as stream:
-            stream.write(f"{line}\n")

hyper3_clip/utils/io.py DELETED Viewed

@@ -1,29 +0,0 @@
-from __future__ import annotations
-from pathlib import Path
-import random
-import numpy as np
-import torch
-import yaml
-def load_yaml(path: str) -> dict:
-    """Parse the UTF-8 YAML file at ``path`` with the safe loader and return the result."""
-    text = Path(path).read_text(encoding="utf-8")
-    return yaml.safe_load(text)
-def save_yaml(path: str | Path, payload: dict) -> None:
-    """Write ``payload`` to ``path`` as UTF-8 YAML, keeping key insertion order."""
-    target = Path(path)
-    with target.open("w", encoding="utf-8") as stream:
-        yaml.safe_dump(payload, stream, sort_keys=False)
-def ensure_dir(path: str) -> None:
-    """Create ``path`` and any missing parents; do nothing if it already exists."""
-    directory = Path(path)
-    directory.mkdir(parents=True, exist_ok=True)
-def set_seed(seed: int) -> None:
-    """Seed Python's ``random``, NumPy, and torch (CPU and all CUDA devices) with ``seed``."""
-    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
-    for seeder in seeders:
-        seeder(seed)

hyper3_clip_provider.py DELETED Viewed

@@ -1,133 +0,0 @@
-"""HyperView embedding provider for the Hyper3-CLIP v0.5 HF checkpoint."""
-from __future__ import annotations
-import os
-from pathlib import Path
-from typing import Any
-import numpy as np
-import torch
-import yaml
-from huggingface_hub import snapshot_download
-from lancedb.embeddings import EmbeddingFunction
-from pydantic import PrivateAttr
-from safetensors.torch import load_file
-class Hyper3ClipEmbeddings(EmbeddingFunction):
-    """Image embeddings from Hyper3-CLIP v0.5 in Lorentz/hyperboloid space.
-
-    The checkpoint and preprocessing pipeline are loaded lazily on first use
-    (see ``_ensure_model``), so constructing the provider does no network or disk I/O.
-    """
-    # Hugging Face repo id the checkpoint is downloaded from.
-    name: str = "hyper3labs/hyper3-clip-v0.5"
-    # Number of images encoded per forward pass.
-    batch_size: int = 8
-    # Torch device string the model is moved to.
-    device: str = "cpu"
-    _model: Any = PrivateAttr(default=None)
-    _transform: Any = PrivateAttr(default=None)
-    @property
-    def geometry(self) -> str:
-        return "hyperboloid"
-    @property
-    def curvature(self) -> float:
-        """Return the first element of the model's ``_kappa()`` as a float (loads the model if needed)."""
-        self._ensure_model()
-        return float(self._model._kappa().detach().cpu().reshape(-1)[0].item())
-    def ndims(self) -> int:
-        # Hard-coded output width; presumably 512 plus one hyperboloid time coordinate - TODO confirm.
-        return 513
-    def _ensure_model(self) -> None:
-        """Download config and weights, build the model, and set up preprocessing, once."""
-        if self._model is not None:
-            return
-        from hyper3_clip import Hyper3CLIP
-        from torchvision import transforms
-        token = os.environ.get("HF_TOKEN")
-        local_dir = snapshot_download(
-            self.name,
-            allow_patterns=["config.yaml", "model.safetensors"],
-            token=token,
-        )
-        root = Path(local_dir)
-        config = yaml.safe_load((root / "config.yaml").read_text(encoding="utf-8"))
-        model = Hyper3CLIP(**config["model"])
-        state = load_file(root / "model.safetensors", device="cpu")
-        state = _normalize_checkpoint_keys(state, model)
-        model.load_state_dict(state)
-        model.to(torch.device(self.device))
-        model.eval()
-        self._model = model
-        image_size = int(config.get("data", {}).get("image_size", 224))
-        self._transform = transforms.Compose(
-            [
-                transforms.Resize(image_size, interpolation=transforms.InterpolationMode.BICUBIC),
-                transforms.CenterCrop(image_size),
-                transforms.ToTensor(),
-                transforms.Normalize(
-                    mean=(0.485, 0.456, 0.406),
-                    std=(0.229, 0.224, 0.225),
-                ),
-            ]
-        )
-    def compute_source_embeddings(
-        self,
-        inputs: Any,
-        *args: Any,
-        **kwargs: Any,
-    ) -> list[np.ndarray | None]:
-        """Encode images and return one float32 vector per input, in input order.
-
-        Each input may be a hyperview ``Sample``, a file path string, or a PIL image.
-        Images are decoded one batch at a time, so peak memory is bounded by
-        ``batch_size`` decoded images rather than by the whole input.
-
-        Raises TypeError for any other input type.
-        """
-        from PIL import Image
-        from hyperview.core.sample import Sample
-        self._ensure_model()
-        device = torch.device(self.device)
-        def _to_rgb(item: Any) -> Any:
-            # Normalize every supported input kind to an in-memory RGB PIL image.
-            if isinstance(item, Sample):
-                with item.load_image() as img:
-                    return img.convert("RGB")
-            if isinstance(item, str):
-                with Image.open(item) as img:
-                    return img.convert("RGB")
-            if isinstance(item, Image.Image):
-                return item.convert("RGB")
-            raise TypeError(f"Unsupported input type: {type(item)}")
-        items = list(self.sanitize_input(inputs))
-        outputs: list[np.ndarray | None] = []
-        with torch.inference_mode():
-            for start in range(0, len(items), self.batch_size):
-                chunk = items[start:start + self.batch_size]
-                # Decode and preprocess only this batch; earlier images are already released.
-                tensor = torch.stack([self._transform(_to_rgb(item)) for item in chunk]).to(device)
-                encoded = self._model.encode_image(tensor).detach().cpu().numpy().astype(np.float32)
-                outputs.extend(encoded)
-        return outputs
-    def compute_query_embeddings(
-        self,
-        query: Any,
-        *args: Any,
-        **kwargs: Any,
-    ) -> list[np.ndarray | None]:
-        """Embed a single query by passing it through the image path (same accepted input kinds)."""
-        return self.compute_source_embeddings([query], *args, **kwargs)
-def _normalize_checkpoint_keys(state: dict[str, torch.Tensor], model: torch.nn.Module) -> dict[str, torch.Tensor]:
-    """Rename ``text_encoder.backbone.text_model.*`` keys when the model lacks that wrapper level.
-
-    The input dict is returned untouched when the checkpoint has no such keys or
-    when the model itself still uses them. Otherwise a new dict is built in which
-    a key is renamed only if the shortened name exists in the model.
-    """
-    old_prefix = "text_encoder.backbone.text_model."
-    new_prefix = "text_encoder.backbone."
-    model_keys = set(model.state_dict())
-    checkpoint_has_old = any(key.startswith(old_prefix) for key in state)
-    if not checkpoint_has_old:
-        return state
-    model_has_old = any(key.startswith(old_prefix) for key in model_keys)
-    if model_has_old:
-        return state
-    normalized: dict[str, torch.Tensor] = {}
-    for key, value in state.items():
-        target = key
-        if key.startswith(old_prefix):
-            shortened = new_prefix + key[len(old_prefix):]
-            # Rename only when the model really has the shortened key.
-            if shortened in model_keys:
-                target = shortened
-        normalized[target] = value
-    return normalized