JasonYinnnn committed on
Commit
2a36119
·
1 Parent(s): 7dabaaa

Move threeDFixer to a safe file

Browse files
app.py CHANGED
@@ -18,17 +18,7 @@ import random
18
  import imageio
19
  from einops import repeat
20
  from huggingface_hub import snapshot_download
21
- from threeDFixer.moge.model.v2 import MoGeModel
22
- from threeDFixer.pipelines import ThreeDFixerPipeline
23
- from threeDFixer.datasets.utils import (
24
- edge_mask_morph_gradient,
25
- process_scene_image,
26
- process_instance_image,
27
- transform_vertices,
28
- normalize_vertices,
29
- project2ply
30
- )
31
- from threeDFixer.utils import render_utils, postprocessing_utils
32
  from transformers import AutoModelForMaskGeneration, AutoProcessor
33
  from scripts.grounding_sam import plot_segmentation, segment
34
  import copy
@@ -192,6 +182,11 @@ def run_depth_estimation(
192
  ) -> Image.Image:
193
  rgb_image = image_prompts["image"].convert("RGB")
194
 
 
 
 
 
 
195
  rgb_image = rgb_image.resize((1024, 1024), Image.Resampling.LANCZOS)
196
 
197
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -291,45 +286,6 @@ def set_random_seed(seed):
291
  if torch.cuda.is_available():
292
  torch.cuda.manual_seed_all(seed)
293
 
294
- def export_single_glb_from_outputs(
295
- outputs,
296
- fine_scale,
297
- fine_trans,
298
- coarse_scale,
299
- coarse_trans,
300
- trans,
301
- scale,
302
- rot,
303
- work_space,
304
- instance_name,
305
- run_id
306
- ):
307
-
308
- with torch.enable_grad():
309
- glb = postprocessing_utils.to_glb(
310
- outputs["gaussian"][0],
311
- outputs["mesh"][0],
312
- simplify=0.95,
313
- texture_size=1024,
314
- transform_fn=lambda x: transform_vertices(
315
- x,
316
- ops=["scale", "translation", "scale", "translation"],
317
- params=[fine_scale, fine_trans[None], coarse_scale, coarse_trans[None]],
318
- ),
319
- verbose=False
320
- )
321
-
322
- instance_glb_path = os.path.abspath(
323
- os.path.join(work_space, f"{run_id}_{instance_name}.glb")
324
- )
325
-
326
- glb.apply_translation(-trans) \
327
- .apply_scale(1.0 / (scale + 1e-6)) \
328
- .apply_transform(rot) \
329
- .export(instance_glb_path)
330
-
331
- return instance_glb_path, glb
332
-
333
 
334
  def export_scene_glb(trimeshes, work_space, scene_name):
335
  scene_path = os.path.abspath(os.path.join(work_space, scene_name))
@@ -356,6 +312,59 @@ def run_generation(
356
  cfg_interval_end: float = 1.0,
357
  t_rescale: float = 3.0,
358
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
359
  global dpt_pack
360
  global work_space
361
  global generated_object_map
 
18
  import imageio
19
  from einops import repeat
20
  from huggingface_hub import snapshot_download
21
+ from moge.model.v2 import MoGeModel
 
 
 
 
 
 
 
 
 
 
22
  from transformers import AutoModelForMaskGeneration, AutoProcessor
23
  from scripts.grounding_sam import plot_segmentation, segment
24
  import copy
 
182
  ) -> Image.Image:
183
  rgb_image = image_prompts["image"].convert("RGB")
184
 
185
+ from threeDFixer.datasets.utils import (
186
+ normalize_vertices,
187
+ project2ply
188
+ )
189
+
190
  rgb_image = rgb_image.resize((1024, 1024), Image.Resampling.LANCZOS)
191
 
192
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
286
  if torch.cuda.is_available():
287
  torch.cuda.manual_seed_all(seed)
288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
 
290
  def export_scene_glb(trimeshes, work_space, scene_name):
291
  scene_path = os.path.abspath(os.path.join(work_space, scene_name))
 
312
  cfg_interval_end: float = 1.0,
313
  t_rescale: float = 3.0,
314
  ):
315
+
316
+ from threeDFixer.pipelines import ThreeDFixerPipeline
317
+ from threeDFixer.datasets.utils import (
318
+ edge_mask_morph_gradient,
319
+ process_scene_image,
320
+ process_instance_image,
321
+ )
322
+ from threeDFixer.utils import render_utils
323
+
324
+ def export_single_glb_from_outputs(
325
+ outputs,
326
+ fine_scale,
327
+ fine_trans,
328
+ coarse_scale,
329
+ coarse_trans,
330
+ trans,
331
+ scale,
332
+ rot,
333
+ work_space,
334
+ instance_name,
335
+ run_id
336
+ ):
337
+
338
+ from threeDFixer.datasets.utils import (
339
+ transform_vertices,
340
+ )
341
+ from threeDFixer.utils import postprocessing_utils
342
+
343
+ with torch.enable_grad():
344
+ glb = postprocessing_utils.to_glb(
345
+ outputs["gaussian"][0],
346
+ outputs["mesh"][0],
347
+ simplify=0.95,
348
+ texture_size=1024,
349
+ transform_fn=lambda x: transform_vertices(
350
+ x,
351
+ ops=["scale", "translation", "scale", "translation"],
352
+ params=[fine_scale, fine_trans[None], coarse_scale, coarse_trans[None]],
353
+ ),
354
+ verbose=False
355
+ )
356
+
357
+ instance_glb_path = os.path.abspath(
358
+ os.path.join(work_space, f"{run_id}_{instance_name}.glb")
359
+ )
360
+
361
+ glb.apply_translation(-trans) \
362
+ .apply_scale(1.0 / (scale + 1e-6)) \
363
+ .apply_transform(rot) \
364
+ .export(instance_glb_path)
365
+
366
+ return instance_glb_path, glb
367
+
368
  global dpt_pack
369
  global work_space
370
  global generated_object_map
moge/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
moge/model/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copied from the MoGe project:
# https://github.com/microsoft/MoGe
# Original license: MIT
# Copyright (c) the MoGe authors

import importlib
from typing import *

if TYPE_CHECKING:
    from .v1 import MoGeModel as MoGeModelV1
    from .v2 import MoGeModel as MoGeModelV2


def import_model_class_by_version(version: str) -> Type[Union['MoGeModelV1', 'MoGeModelV2']]:
    """Dynamically load the ``MoGeModel`` class for a given model version.

    Args:
        version: Either ``'v1'`` or ``'v2'``.

    Returns:
        The ``MoGeModel`` class defined in the matching versioned submodule.

    Raises:
        AssertionError: If ``version`` is not one of the supported values.
        ValueError: If the submodule for ``version`` cannot be imported.
    """
    assert version in ['v1', 'v2'], f'Unsupported model version: {version}'

    try:
        version_module = importlib.import_module(f'.{version}', __package__)
    except ModuleNotFoundError:
        raise ValueError(f'Model version "{version}" not found.')

    return getattr(version_module, 'MoGeModel')
moge/model/dinov2/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# Version of the vendored DINOv2 package.
__version__ = "0.0.1"
moge/model/dinov2/hub/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
moge/model/dinov2/hub/backbones.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

from enum import Enum
from typing import Union

import torch

from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name


class Weights(Enum):
    """Available pretrained weight sets."""

    LVD142M = "LVD142M"


def _make_dinov2_model(
    *,
    arch_name: str = "vit_large",
    img_size: int = 518,
    patch_size: int = 14,
    init_values: float = 1.0,
    ffn_layer: str = "mlp",
    block_chunks: int = 0,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.LVD142M,
    **kwargs,
):
    """Instantiate a DINOv2 ViT backbone and optionally load pretrained weights.

    Raises:
        AssertionError: If ``weights`` is a string that does not name a
            member of :class:`Weights`.
    """
    from ..models import vision_transformer as vits

    # Accept the weight set either as an enum member or as its string name.
    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
    vit_kwargs = dict(
        img_size=img_size,
        patch_size=patch_size,
        init_values=init_values,
        ffn_layer=ffn_layer,
        block_chunks=block_chunks,
        num_register_tokens=num_register_tokens,
        interpolate_antialias=interpolate_antialias,
        interpolate_offset=interpolate_offset,
    )
    vit_kwargs.update(**kwargs)
    model = vits.__dict__[arch_name](**vit_kwargs)

    if pretrained:
        # The register-token count appears only in the checkpoint file name,
        # not in the directory name.
        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
        url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
        state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
        model.load_state_dict(state_dict, strict=True)

    return model


def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """DINOv2 ViT-S/14, optionally pretrained on the LVD-142M dataset."""
    return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """DINOv2 ViT-B/14, optionally pretrained on the LVD-142M dataset."""
    return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """DINOv2 ViT-L/14, optionally pretrained on the LVD-142M dataset."""
    return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """DINOv2 ViT-g/14 (SwiGLU FFN), optionally pretrained on the LVD-142M dataset."""
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        **kwargs,
    )


def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """DINOv2 ViT-S/14 with 4 register tokens, optionally pretrained on LVD-142M."""
    return _make_dinov2_model(
        arch_name="vit_small",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """DINOv2 ViT-B/14 with 4 register tokens, optionally pretrained on LVD-142M."""
    return _make_dinov2_model(
        arch_name="vit_base",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """DINOv2 ViT-L/14 with 4 register tokens, optionally pretrained on LVD-142M."""
    return _make_dinov2_model(
        arch_name="vit_large",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """DINOv2 ViT-g/14 (SwiGLU FFN) with 4 register tokens, optionally pretrained on LVD-142M."""
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )
moge/model/dinov2/hub/utils.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import itertools
7
+ import math
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ _DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
15
+
16
+
17
+ def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
18
+ compact_arch_name = arch_name.replace("_", "")[:4]
19
+ registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
20
+ return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
21
+
22
+
23
class CenterPadding(nn.Module):
    """Zero-pad every spatial dimension of the input up to a multiple of ``multiple``.

    Padding is split as evenly as possible between the two sides of each
    trailing dimension (all dims after the first two are treated as spatial).
    """

    def __init__(self, multiple):
        super().__init__()
        self.multiple = multiple

    def _get_pad(self, size):
        # Round up to the next multiple, then split the slack left/right.
        target = math.ceil(size / self.multiple) * self.multiple
        total = target - size
        left = total // 2
        return left, total - left

    @torch.inference_mode()
    def forward(self, x):
        # F.pad expects pad amounts ordered from the last dimension backwards,
        # hence the reversed slice over x's trailing dimensions.
        pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
        return F.pad(x, pads)
moge/model/dinov2/layers/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .dino_head import DINOHead
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
moge/model/dinov2/layers/attention.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py

import logging
import os
import warnings

from torch import Tensor
from torch import nn
import torch.nn.functional as F


logger = logging.getLogger("dinov2")


# xFormers can be force-disabled through the XFORMERS_DISABLED env var.
XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if not XFORMERS_ENABLED:
        raise ImportError
    from xformers.ops import memory_efficient_attention, unbind

    XFORMERS_AVAILABLE = True
except ImportError:
    XFORMERS_AVAILABLE = False


class Attention(nn.Module):
    """Multi-head self-attention backed by torch's fused SDPA kernel."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        batch, seq_len, channels = x.shape
        # One matmul for q/k/v, then split heads: (3, B, H, N, C // H).
        qkv = (
            self.qkv(x)
            .reshape(batch, seq_len, 3, self.num_heads, channels // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = qkv.unbind(0)  # each (B, H, N, C // H)

        # attn_bias is forwarded as SDPA's attention mask.
        out = F.scaled_dot_product_attention(q, k, v, attn_bias)
        out = out.permute(0, 2, 1, 3).reshape(batch, seq_len, channels)

        out = self.proj(out)
        return self.proj_drop(out)


class MemEffAttention(Attention):
    """Attention variant using xFormers' memory-efficient kernel when available."""

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            # Fall back to the SDPA implementation; nested-tensor attn_bias
            # is only supported through xFormers.
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            return super().forward(x)

        batch, seq_len, channels = x.shape
        qkv = self.qkv(x).reshape(batch, seq_len, 3, self.num_heads, channels // self.num_heads)

        q, k, v = unbind(qkv, 2)

        out = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        out = out.reshape([batch, seq_len, channels])

        out = self.proj(out)
        return self.proj_drop(out)
moge/model/dinov2/layers/block.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py

import logging
import os
from typing import Callable, List, Any, Tuple, Dict
import warnings

import torch
from torch import nn, Tensor

from .attention import Attention, MemEffAttention
from .drop_path import DropPath
from .layer_scale import LayerScale
from .mlp import Mlp


logger = logging.getLogger("dinov2")


# xFormers can be force-disabled through the XFORMERS_DISABLED env var.
XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if not XFORMERS_ENABLED:
        raise ImportError
    from xformers.ops import fmha, scaled_index_add, index_select_cat

    XFORMERS_AVAILABLE = True
except ImportError:
    XFORMERS_AVAILABLE = False


class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP residual branches,
    each with optional LayerScale and stochastic depth (drop path)."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        # LayerScale only when init_values is truthy, otherwise identity.
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor) -> Tensor:
        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # Batch-subset stochastic depth; its overhead only pays off
            # for drop-path rates above 0.1.
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x


def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
) -> Tensor:
    """Compute the residual branch only for a random subset of the batch and
    scatter-add the rescaled result back onto the full batch."""
    # 1) extract subset using permutation
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    x_subset = x[brange]

    # 2) apply residual_func to get residual
    residual = residual_func(x_subset)

    x_flat = x.flatten(1)
    residual = residual.flatten(1)

    # Rescale so the expected residual magnitude matches full-batch training.
    residual_scale_factor = b / sample_subset_size

    # 3) add the residual
    x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    return x_plus_residual.view_as(x)


def get_branges_scales(x, sample_drop_ratio=0.0):
    """Pick the kept-sample indices and the matching residual rescale factor."""
    b, n, d = x.shape
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    residual_scale_factor = b / sample_subset_size
    return brange, residual_scale_factor


def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    """Scatter-add a (possibly LayerScale-scaled) residual into rows ``brange`` of ``x``."""
    if scaling_vector is None:
        x_flat = x.flatten(1)
        residual = residual.flatten(1)
        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    else:
        # Fused xFormers path: scale-and-index-add in one kernel.
        x_plus_residual = scaled_index_add(
            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
        )
    return x_plus_residual


# Cache of BlockDiagonalMask objects keyed by the (batch, seqlen) shape tuple.
attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache
    """
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache.keys():
        seqlens = []
        for b, x in zip(batch_sizes, x_list):
            for _ in range(b):
                seqlens.append(x.shape[1])
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors


def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> Tensor:
    """Stochastic-depth residual add over a list of nested tensors (xFormers path)."""
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func to get residual, and split the result
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    outputs = []
    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
    return outputs


class NestedTensorBlock(Block):
    """Block variant that can run a list of tensors as one nested batch
    through xFormers' block-diagonal attention."""

    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            if not XFORMERS_AVAILABLE:
                raise AssertionError("xFormers is required for using nested tensors")
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
moge/model/dinov2/layers/dino_head.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.nn.init import trunc_normal_
9
+ from torch.nn.utils import weight_norm
10
+
11
+
12
+ class DINOHead(nn.Module):
13
+ def __init__(
14
+ self,
15
+ in_dim,
16
+ out_dim,
17
+ use_bn=False,
18
+ nlayers=3,
19
+ hidden_dim=2048,
20
+ bottleneck_dim=256,
21
+ mlp_bias=True,
22
+ ):
23
+ super().__init__()
24
+ nlayers = max(nlayers, 1)
25
+ self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
26
+ self.apply(self._init_weights)
27
+ self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
28
+ self.last_layer.weight_g.data.fill_(1)
29
+
30
+ def _init_weights(self, m):
31
+ if isinstance(m, nn.Linear):
32
+ trunc_normal_(m.weight, std=0.02)
33
+ if isinstance(m, nn.Linear) and m.bias is not None:
34
+ nn.init.constant_(m.bias, 0)
35
+
36
+ def forward(self, x):
37
+ x = self.mlp(x)
38
+ eps = 1e-6 if x.dtype == torch.float16 else 1e-12
39
+ x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
40
+ x = self.last_layer(x)
41
+ return x
42
+
43
+
44
+ def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
45
+ if nlayers == 1:
46
+ return nn.Linear(in_dim, bottleneck_dim, bias=bias)
47
+ else:
48
+ layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
49
+ if use_bn:
50
+ layers.append(nn.BatchNorm1d(hidden_dim))
51
+ layers.append(nn.GELU())
52
+ for _ in range(nlayers - 2):
53
+ layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
54
+ if use_bn:
55
+ layers.append(nn.BatchNorm1d(hidden_dim))
56
+ layers.append(nn.GELU())
57
+ layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
58
+ return nn.Sequential(*layers)
moge/model/dinov2/layers/drop_path.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# References:
#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py


from torch import nn


def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    """Randomly zero entire samples of ``x`` (stochastic depth), rescaling survivors.

    A no-op at eval time or when ``drop_prob`` is zero.
    """
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast across all remaining dims.
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        # Rescale survivors so the expected output matches the input.
        mask.div_(keep_prob)
    return x * mask


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
moge/model/dinov2/layers/layer_scale.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the Apache License, Version 2.0
# found in the LICENSE file in the root directory of this source tree.

# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110

from typing import Union

import torch
from torch import Tensor
from torch import nn


class LayerScale(nn.Module):
    """Learnable per-channel scaling, initialized to a small constant."""

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        # The in-place multiply saves memory when x is not reused elsewhere.
        if self.inplace:
            return x.mul_(self.gamma)
        return x * self.gamma
moge/model/dinov2/layers/mlp.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
9
+
10
+
11
+ from typing import Callable, Optional
12
+
13
+ from torch import Tensor, nn
14
+
15
+
16
class Mlp(nn.Module):
    """Two-layer transformer feed-forward block: Linear -> act -> Linear.

    The same dropout module is applied after the activation and again after
    the second projection. Hidden and output widths default to the input
    width when not given.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        # fc1 -> act -> drop -> fc2 -> drop, in order.
        for layer in (self.fc1, self.act, self.drop, self.fc2, self.drop):
            x = layer(x)
        return x
moge/model/dinov2/layers/patch_embed.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ from typing import Callable, Optional, Tuple, Union
11
+
12
+ from torch import Tensor
13
+ import torch.nn as nn
14
+
15
+
16
def make_2tuple(x):
    """Normalise an int or 2-tuple to a 2-tuple, e.g. ``16 -> (16, 16)``."""
    if isinstance(x, tuple):
        assert len(x) == 2
        return x

    assert isinstance(x, int)
    return (x, x)


class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer (``None`` means no normalisation;
            an ``nn.Identity`` placeholder is stored internally).
        flatten_embedding: When False, ``forward`` returns (B,H',W',D)
            instead of the flattened (B,N,D).
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        # Non-overlapping patchification: a conv with kernel == stride == patch size.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        """Embed an image batch; H and W must be multiples of the patch size."""
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # B C H W
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    def flops(self) -> float:
        """Approximate multiply-accumulate count of a forward pass."""
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        # BUGFIX: self.norm is never None (it is nn.Identity() when norm_layer
        # is not given), so the original `if self.norm is not None` always
        # counted norm flops even without a real norm layer.
        if not isinstance(self.norm, nn.Identity):
            flops += Ho * Wo * self.embed_dim
        return flops
moge/model/dinov2/layers/swiglu_ffn.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ from typing import Callable, Optional
8
+ import warnings
9
+
10
+ from torch import Tensor, nn
11
+ import torch.nn.functional as F
12
+
13
+
14
class SwiGLUFFN(nn.Module):
    """SwiGLU feed-forward: one fused projection producing gate and value.

    ``w12`` projects to twice the hidden width; the first half passes through
    SiLU and gates the second half before the output projection ``w3``.
    ``act_layer`` and ``drop`` are accepted only for interface parity and are
    unused.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        gate, value = self.w12(x).chunk(2, dim=-1)
        return self.w3(F.silu(gate) * value)
35
+
36
+
37
# Prefer the fused xFormers SwiGLU implementation when xFormers is installed
# and not explicitly disabled via the XFORMERS_DISABLED environment variable.
XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import SwiGLU

        XFORMERS_AVAILABLE = True
        # warnings.warn("xFormers is available (SwiGLU)")
    else:
        # warnings.warn("xFormers is disabled (SwiGLU)")
        raise ImportError
except ImportError:
    # xFormers missing or disabled: fall back to the pure-PyTorch SwiGLUFFN
    # defined above under the same name, so downstream code is unaffected.
    SwiGLU = SwiGLUFFN
    XFORMERS_AVAILABLE = False

    # warnings.warn("xFormers is not available (SwiGLU)")
52
+
53
+
54
class SwiGLUFFNFused(SwiGLU):
    """SwiGLU FFN with a 2/3-scaled, 8-aligned hidden width.

    The requested hidden width is scaled by 2/3 (keeping the parameter count
    comparable to a plain MLP of the same nominal width) and rounded up to a
    multiple of 8 for hardware efficiency. ``act_layer`` and ``drop`` are
    accepted only for interface parity and are unused.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        requested_hidden = hidden_features or in_features
        # 2/3 scaling, then round up to the next multiple of 8.
        aligned_hidden = (int(requested_hidden * 2 / 3) + 7) // 8 * 8
        super().__init__(
            in_features=in_features,
            hidden_features=aligned_hidden,
            out_features=out_features or in_features,
            bias=bias,
        )
moge/model/dinov2/models/__init__.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+
8
+ from . import vision_transformer as vits
9
+
10
+
11
+ logger = logging.getLogger("dinov2")
12
+
13
+
14
def build_model(args, only_teacher=False, img_size=224):
    """Instantiate DINOv2 ViT backbone(s) from a config namespace.

    Returns ``(teacher, embed_dim)`` when ``only_teacher`` is set, otherwise
    ``(student, teacher, embed_dim)``; the student additionally receives the
    configured stochastic-depth settings. Non-ViT architectures fall through
    and yield ``None``, as in the original.
    """
    args.arch = args.arch.removesuffix("_memeff")
    if "vit" not in args.arch:
        return None
    shared_kwargs = dict(
        img_size=img_size,
        patch_size=args.patch_size,
        init_values=args.layerscale,
        ffn_layer=args.ffn_layer,
        block_chunks=args.block_chunks,
        qkv_bias=args.qkv_bias,
        proj_bias=args.proj_bias,
        ffn_bias=args.ffn_bias,
        num_register_tokens=args.num_register_tokens,
        interpolate_offset=args.interpolate_offset,
        interpolate_antialias=args.interpolate_antialias,
    )
    model_cls = vits.__dict__[args.arch]
    teacher = model_cls(**shared_kwargs)
    if only_teacher:
        return teacher, teacher.embed_dim
    student = model_cls(
        **shared_kwargs,
        drop_path_rate=args.drop_path_rate,
        drop_path_uniform=args.drop_path_uniform,
    )
    return student, teacher, student.embed_dim
40
+
41
+
42
def build_model_from_cfg(cfg, only_teacher=False):
    """Build model(s) from an OmegaConf config (student section + global crop size)."""
    return build_model(
        cfg.student,
        only_teacher=only_teacher,
        img_size=cfg.crops.global_crops_size,
    )
moge/model/dinov2/models/vision_transformer.py ADDED
@@ -0,0 +1,407 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable, Optional, List
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from ..layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
    """Recursively apply ``fn(module=..., name=...)`` over a module tree.

    Children are always visited (with dotted qualified names); the root
    itself is visited only when ``include_root`` is true. ``depth_first``
    selects post-order (children before parent) versus pre-order.
    Returns ``module`` for chaining.
    """
    if include_root and not depth_first:
        fn(module=module, name=name)
    for child_name, child in module.named_children():
        qualified = f"{name}.{child_name}" if name else child_name
        named_apply(fn=fn, module=child, name=qualified, depth_first=depth_first, include_root=True)
    if include_root and depth_first:
        fn(module=module, name=name)
    return module
35
+
36
+
37
class BlockChunk(nn.ModuleList):
    """A callable ModuleList: applies its member blocks sequentially."""

    def forward(self, x):
        out = x
        for block in self:
            out = block(out)
        return out
42
+
43
+
44
class DinoVisionTransformer(nn.Module):
    """DINOv2 Vision Transformer backbone (optionally with register tokens)."""

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        ffn_bias=True,
        proj_bias=True,
        drop_path_rate=0.0,
        drop_path_uniform=False,
        init_values=None,  # for layerscale: None or 0 => no layerscale
        embed_layer=PatchEmbed,
        act_layer=nn.GELU,
        block_fn=Block,
        ffn_layer="mlp",
        block_chunks=1,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.1,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
            interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
        """
        super().__init__()
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 1
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.num_register_tokens = num_register_tokens
        self.interpolate_antialias = interpolate_antialias
        self.interpolate_offset = interpolate_offset

        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        assert num_register_tokens >= 0
        self.register_tokens = (
            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
        )

        if drop_path_uniform is True:
            dpr = [drop_path_rate] * depth
        else:
            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        if ffn_layer == "mlp":
            logger.info("using MLP layer as FFN")
            ffn_layer = Mlp
        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
            logger.info("using SwiGLU layer as FFN")
            ffn_layer = SwiGLUFFNFused
        elif ffn_layer == "identity":
            logger.info("using Identity layer as FFN")

            def f(*args, **kwargs):
                return nn.Identity()

            ffn_layer = f
        else:
            raise NotImplementedError

        blocks_list = [
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                ffn_layer=ffn_layer,
                init_values=init_values,
            )
            for i in range(depth)
        ]
        if block_chunks > 0:
            self.chunked_blocks = True
            chunked_blocks = []
            chunksize = depth // block_chunks
            for i in range(0, depth, chunksize):
                # this is to keep the block index consistent if we chunk the block list
                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
        else:
            self.chunked_blocks = False
            self.blocks = nn.ModuleList(blocks_list)

        self.norm = norm_layer(embed_dim)
        self.head = nn.Identity()

        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))

        self.init_weights()

    @property
    def onnx_compatible_mode(self):
        # When enabled, interpolate_pos_encoding avoids data-dependent
        # shortcuts/scale factors so the graph can be exported to ONNX.
        return getattr(self, "_onnx_compatible_mode", False)

    @onnx_compatible_mode.setter
    def onnx_compatible_mode(self, value: bool):
        self._onnx_compatible_mode = value

    def init_weights(self):
        """Initialise position/cls/register tokens and all Linear layers (timm scheme)."""
        trunc_normal_(self.pos_embed, std=0.02)
        nn.init.normal_(self.cls_token, std=1e-6)
        if self.register_tokens is not None:
            nn.init.normal_(self.register_tokens, std=1e-6)
        named_apply(init_weights_vit_timm, self)

    def interpolate_pos_encoding(self, x, h, w):
        """Bicubically resample the patch position embedding to an h x w input."""
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if not self.onnx_compatible_mode and npatch == N and w == h:
            return self.pos_embed
        pos_embed = self.pos_embed.float()
        class_pos_embed = pos_embed[:, 0, :]
        patch_pos_embed = pos_embed[:, 1:, :]
        dim = x.shape[-1]
        h0, w0 = h // self.patch_size, w // self.patch_size
        M = int(math.sqrt(N))  # Recover the number of patches in each dimension
        assert N == M * M
        kwargs = {}
        if not self.onnx_compatible_mode and self.interpolate_offset > 0:
            # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
            # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
            sx = float(w0 + self.interpolate_offset) / M
            sy = float(h0 + self.interpolate_offset) / M
            kwargs["scale_factor"] = (sy, sx)
        else:
            # Simply specify an output size instead of a scale factor
            kwargs["size"] = (h0, w0)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
            mode="bicubic",
            antialias=self.interpolate_antialias,
            **kwargs,
        )

        assert (h0, w0) == patch_pos_embed.shape[-2:]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).flatten(1, 2)
        return torch.cat((class_pos_embed[:, None, :].expand(patch_pos_embed.shape[0], -1, -1), patch_pos_embed), dim=1).to(previous_dtype)

    def prepare_tokens_with_masks(self, x, masks=None):
        """Patchify, optionally mask, prepend cls (and register) tokens, add pos embed."""
        B, nc, h, w = x.shape
        x = self.patch_embed(x)

        if masks is not None:
            # Replace masked patches with the learned mask token.
            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)

        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        x = x + self.interpolate_pos_encoding(x, h, w)

        if self.register_tokens is not None:
            # Registers sit between the cls token and the patch tokens.
            x = torch.cat(
                (
                    x[:, :1],
                    self.register_tokens.expand(x.shape[0], -1, -1),
                    x[:, 1:],
                ),
                dim=1,
            )

        return x

    def forward_features_list(self, x_list, masks_list):
        """Run the backbone on a list of (input, mask) pairs (multi-crop training)."""
        # BUGFIX: the original unpacked three names from a two-sequence zip
        # ("for x, masks, ar in zip(x_list, masks_list)"), which raises
        # ValueError on the first iteration.
        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
        for blk in self.blocks:
            x = blk(x)

        all_x = x
        output = []
        for x, masks in zip(all_x, masks_list):
            x_norm = self.norm(x)
            output.append(
                {
                    "x_norm_clstoken": x_norm[:, 0],
                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
                    "x_prenorm": x,
                    "masks": masks,
                }
            )
        return output

    def forward_features(self, x, masks=None):
        """Run the backbone and return the normalised token dict."""
        if isinstance(x, list):
            return self.forward_features_list(x, masks)

        x = self.prepare_tokens_with_masks(x, masks)

        for blk in self.blocks:
            x = blk(x)

        x_norm = self.norm(x)
        return {
            "x_norm_clstoken": x_norm[:, 0],
            "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
            "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
            "x_prenorm": x,
            "masks": masks,
        }

    def _get_intermediate_layers_not_chunked(self, x, n=1):
        x = self.prepare_tokens_with_masks(x)
        # If n is an int, take the n last blocks. If it's a list, take them
        output, total_block_len = [], len(self.blocks)
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if i in blocks_to_take:
                output.append(x)
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def _get_intermediate_layers_chunked(self, x, n=1):
        x = self.prepare_tokens_with_masks(x)
        output, i, total_block_len = [], 0, len(self.blocks[-1])
        # If n is an int, take the n last blocks. If it's a list, take them
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for block_chunk in self.blocks:
            for blk in block_chunk[i:]:  # Passing the nn.Identity()
                x = blk(x)
                if i in blocks_to_take:
                    output.append(x)
                i += 1
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def get_intermediate_layers(
        self,
        x: torch.Tensor,
        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
        reshape: bool = False,
        return_class_token: bool = False,
        norm=True,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
        """Return features from selected blocks, optionally normalised / reshaped to maps."""
        if self.chunked_blocks:
            outputs = self._get_intermediate_layers_chunked(x, n)
        else:
            outputs = self._get_intermediate_layers_not_chunked(x, n)
        if norm:
            outputs = [self.norm(out) for out in outputs]
        class_tokens = [out[:, 0] for out in outputs]
        outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
        if reshape:
            B, _, w, h = x.shape
            outputs = [
                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
                for out in outputs
            ]
        if return_class_token:
            return tuple(zip(outputs, class_tokens))
        return tuple(outputs)

    def forward(self, *args, is_training=False, **kwargs):
        """Return the token dict while training, else the (identity) head over the cls token."""
        ret = self.forward_features(*args, **kwargs)
        if is_training:
            return ret
        else:
            return self.head(ret["x_norm_clstoken"])
341
+
342
+
343
def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility)."""
    if not isinstance(module, nn.Linear):
        return
    trunc_normal_(module.weight, std=0.02)
    if module.bias is not None:
        nn.init.zeros_(module.bias)
349
+
350
+
351
def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
    """ViT-S backbone (embed 384, depth 12, 6 heads) with memory-efficient attention."""
    return DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
363
+
364
+
365
def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
    """ViT-B backbone (embed 768, depth 12, 12 heads) with memory-efficient attention."""
    return DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
377
+
378
+
379
def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
    """ViT-L backbone (embed 1024, depth 24, 16 heads) with memory-efficient attention."""
    return DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
391
+
392
+
393
def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
    """
    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
    """
    return DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1536,
        depth=40,
        num_heads=24,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
moge/model/dinov2/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
moge/model/dinov2/utils/cluster.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from enum import Enum
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Optional
10
+
11
+
12
+ class ClusterType(Enum):
13
+ AWS = "aws"
14
+ FAIR = "fair"
15
+ RSC = "rsc"
16
+
17
+
18
+ def _guess_cluster_type() -> ClusterType:
19
+ uname = os.uname()
20
+ if uname.sysname == "Linux":
21
+ if uname.release.endswith("-aws"):
22
+ # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws"
23
+ return ClusterType.AWS
24
+ elif uname.nodename.startswith("rsc"):
25
+ # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc"
26
+ return ClusterType.RSC
27
+
28
+ return ClusterType.FAIR
29
+
30
+
31
+ def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]:
32
+ if cluster_type is None:
33
+ return _guess_cluster_type()
34
+
35
+ return cluster_type
36
+
37
+
38
def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Root checkpoint directory for the (detected) cluster, or ``None``."""
    cluster_type = get_cluster_type(cluster_type)
    if cluster_type is None:
        return None

    dirnames = {
        ClusterType.AWS: "checkpoints",
        ClusterType.FAIR: "checkpoint",
        ClusterType.RSC: "checkpoint/dino",
    }
    return Path("/") / dirnames[cluster_type]
49
+
50
+
51
def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Per-user checkpoint directory ($USER under the cluster root), or ``None``."""
    root = get_checkpoint_path(cluster_type)
    if root is None:
        return None

    username = os.environ.get("USER")
    assert username is not None
    return root / username
59
+
60
+
61
def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]:
    """Default SLURM partition name for the (detected) cluster, or ``None``."""
    cluster_type = get_cluster_type(cluster_type)
    if cluster_type is None:
        return None

    partitions = {
        ClusterType.AWS: "learnlab",
        ClusterType.FAIR: "learnlab",
        ClusterType.RSC: "learn",
    }
    return partitions[cluster_type]
72
+
73
+
74
def get_slurm_executor_parameters(
    nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs
) -> Dict[str, Any]:
    """Build submitit/SLURM executor kwargs for the (detected) cluster.

    Starts from a one-task-per-GPU baseline, applies cluster-specific
    adjustments, then lets caller-supplied ``kwargs`` override anything.
    """
    params = {
        "mem_gb": 0,  # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html
        "gpus_per_node": num_gpus_per_node,
        "tasks_per_node": num_gpus_per_node,  # one task per GPU
        "cpus_per_task": 10,
        "nodes": nodes,
        "slurm_partition": get_slurm_partition(cluster_type),
    }
    # Cluster-specific adjustments.
    detected = get_cluster_type(cluster_type)
    if detected == ClusterType.AWS:
        params["cpus_per_task"] = 12
        del params["mem_gb"]
    elif detected == ClusterType.RSC:
        params["cpus_per_task"] = 12
    # Caller overrides win over everything above.
    params.update(kwargs)
    return params
moge/model/dinov2/utils/config.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import logging
8
+ import os
9
+
10
+ from omegaconf import OmegaConf
11
+
12
+ import dinov2.distributed as distributed
13
+ from dinov2.logging import setup_logging
14
+ from dinov2.utils import utils
15
+ from dinov2.configs import dinov2_default_config
16
+
17
+
18
+ logger = logging.getLogger("dinov2")
19
+
20
+
21
def apply_scaling_rules_to_cfg(cfg):  # to fix
    """Scale ``cfg.optim.lr`` from the base LR by sqrt(global batch size / 1024)."""
    if cfg.optim.scaling_rule != "sqrt_wrt_1024":
        raise NotImplementedError
    base_lr = cfg.optim.base_lr
    global_batch = cfg.train.batch_size_per_gpu * distributed.get_global_size()
    cfg.optim.lr = base_lr * math.sqrt(global_batch / 1024.0)
    logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}")
    return cfg
30
+
31
+
32
def write_config(cfg, output_dir, name="config.yaml"):
    """Log the config and save it as YAML under ``output_dir``; return the saved path."""
    logger.info(OmegaConf.to_yaml(cfg))
    saved_path = os.path.join(output_dir, name)
    with open(saved_path, "w") as stream:
        OmegaConf.save(config=cfg, f=stream)
    return saved_path
38
+
39
+
40
def get_cfg_from_args(args):
    """Merge the default config, the config file, and CLI overrides (in that order).

    Side effect: normalises ``args.output_dir`` to an absolute path and
    appends a matching ``train.output_dir`` override to ``args.opts``.
    """
    args.output_dir = os.path.abspath(args.output_dir)
    args.opts += [f"train.output_dir={args.output_dir}"]
    base_cfg = OmegaConf.create(dinov2_default_config)
    file_cfg = OmegaConf.load(args.config_file)
    return OmegaConf.merge(base_cfg, file_cfg, OmegaConf.from_cli(args.opts))
47
+
48
+
49
def default_setup(args):
    """Initialise distributed mode, logging, and per-rank RNG seeds for a run."""
    distributed.enable(overwrite=True)
    seed = getattr(args, "seed", 0)
    rank = distributed.get_global_rank()

    # Rebind the module-level logger after logging has been configured.
    global logger
    setup_logging(output=args.output_dir, level=logging.INFO)
    logger = logging.getLogger("dinov2")

    # Offset the seed by rank so workers do not share RNG streams.
    utils.fix_random_seeds(seed + rank)
    logger.info("git:\n {}\n".format(utils.get_sha()))
    logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
61
+
62
+
63
def setup(args):
    """
    Create configs and perform basic setups.
    """
    # Order matters: the output directory must exist before default_setup()
    # points logging at it, and LR scaling must run before the config is
    # persisted so the saved YAML reflects the effective learning rate.
    cfg = get_cfg_from_args(args)
    os.makedirs(args.output_dir, exist_ok=True)
    default_setup(args)
    apply_scaling_rules_to_cfg(cfg)
    write_config(cfg, args.output_dir)
    return cfg
moge/model/dinov2/utils/dtype.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ from typing import Dict, Union
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+
13
# Anything accepted where a dtype can be specified: a string name
# ("float32"), a NumPy dtype, or a torch dtype.
TypeSpec = Union[str, np.dtype, torch.dtype]


_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
    np.dtype("bool"): torch.bool,
    np.dtype("uint8"): torch.uint8,
    np.dtype("int8"): torch.int8,
    np.dtype("int16"): torch.int16,
    np.dtype("int32"): torch.int32,
    np.dtype("int64"): torch.int64,
    np.dtype("float16"): torch.float16,
    np.dtype("float32"): torch.float32,
    np.dtype("float64"): torch.float64,
    np.dtype("complex64"): torch.complex64,
    np.dtype("complex128"): torch.complex128,
}


def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
    """Coerce a string / numpy dtype / torch dtype to the matching torch dtype.

    Raises:
        KeyError: if the numpy dtype has no torch equivalent in the table.
        TypeError: if a string does not name a valid numpy dtype.
    """
    if isinstance(dtype, torch.dtype):
        return dtype
    if isinstance(dtype, str):
        dtype = np.dtype(dtype)
    # BUGFIX: assertion message typo "nunpy" -> "numpy".
    assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}"
    return _NUMPY_TO_TORCH_DTYPE[dtype]
moge/model/dinov2/utils/param_groups.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import defaultdict
7
+ import logging
8
+
9
+
10
+ logger = logging.getLogger("dinov2")
11
+
12
+
13
def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False):
    """
    Calculate lr decay rate for different ViT blocks.

    Stem parameters (pos_embed, patch_embed, tokens) get the deepest decay
    (layer 0); each transformer block gets decay according to its index; all
    remaining parameters (e.g. heads) get no decay.

    Args:
        name (string): parameter name.
        lr_decay_rate (float): base lr decay rate.
        num_layers (int): number of ViT blocks.
        force_is_backbone (bool): treat the name as a backbone parameter even
            without the "backbone" prefix.
        chunked_blocks (bool): parameter names come from a chunked-FSDP model.

    Returns:
        lr decay rate for the given parameter.
    """
    stem_tokens = ("pos_embed", "patch_embed", "mask_token", "cls_token", "register_tokens")
    layer_id = num_layers + 1  # default: shallowest (no decay)
    if name.startswith("backbone") or force_is_backbone:
        if any("." + token in name for token in stem_tokens):
            layer_id = 0
        elif force_is_backbone and any(token in name for token in stem_tokens):
            layer_id = 0
        elif ".blocks." in name and ".residual." not in name:
            # e.g. "backbone.blocks.<i>...." -> block index i
            layer_id = int(name[name.find(".blocks."):].split(".")[2]) + 1
        elif chunked_blocks and "blocks." in name and "residual." not in name:
            # chunked layout inserts a chunk index before the block index
            layer_id = int(name[name.find("blocks."):].split(".")[2]) + 1
        elif "blocks." in name and "residual." not in name:
            layer_id = int(name[name.find("blocks."):].split(".")[1]) + 1

    return lr_decay_rate ** (num_layers + 1 - layer_id)
49
+
50
+
51
def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0):
    """
    Build one optimizer parameter group per trainable parameter, attaching
    layer-wise lr-decay and weight-decay multipliers.

    Args:
        model: module whose named_parameters() are grouped; the block count is
            detected from `n_blocks` (chunked FSDP), `blocks`, or `backbone.blocks`.
        lr_decay_rate (float): base of the per-layer lr decay (1.0 disables decay).
        patch_embed_lr_mult (float): extra lr multiplier for patch-embed parameters.

    Returns:
        list of dicts with keys: params, is_last_layer, lr_multiplier,
        wd_multiplier, name — consumable by fuse_params_groups / the optimizer.
    """
    chunked_blocks = False
    if hasattr(model, "n_blocks"):
        logger.info("chunked fsdp")
        n_blocks = model.n_blocks
        chunked_blocks = model.chunked_blocks
    elif hasattr(model, "blocks"):
        logger.info("first code branch")
        n_blocks = len(model.blocks)
    elif hasattr(model, "backbone"):
        logger.info("second code branch")
        n_blocks = len(model.backbone.blocks)
    else:
        logger.info("else code branch")
        n_blocks = 0
    all_param_groups = []

    for name, param in model.named_parameters():
        # Strip FSDP wrapper prefixes so layer-id parsing sees the raw names.
        name = name.replace("_fsdp_wrapped_module.", "")
        if not param.requires_grad:
            continue
        decay_rate = get_vit_lr_decay_rate(
            name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks
        )
        d = {"params": param, "is_last_layer": False, "lr_multiplier": decay_rate, "wd_multiplier": 1.0, "name": name}

        if "last_layer" in name:
            d.update({"is_last_layer": True})

        # Biases, norm parameters, and gamma scales are excluded from weight decay.
        if name.endswith(".bias") or "norm" in name or "gamma" in name:
            d.update({"wd_multiplier": 0.0})

        if "patch_embed" in name:
            d.update({"lr_multiplier": d["lr_multiplier"] * patch_embed_lr_mult})

        all_param_groups.append(d)
        logger.info(f"""{name}: lr_multiplier: {d["lr_multiplier"]}, wd_multiplier: {d["wd_multiplier"]}""")

    return all_param_groups
90
+
91
+
92
def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")):
    """
    Merge per-parameter groups that share identical hyperparameter settings.

    Groups are bucketed by the values of *keys*; each bucket keeps those key
    values once and collects every member's "params" entry into a list.

    Args:
        all_params_groups: list of dicts as produced by get_params_groups_with_decay.
        keys: hyperparameter keys that define group identity.

    Returns:
        dict_values of fused group dicts (one per distinct setting).
    """
    fused = defaultdict(lambda: {"params": []})
    for group in all_params_groups:
        # Identity is the concatenation of key/value pairs.
        signature = "".join(k + str(group[k]) + "_" for k in keys)
        bucket = fused[signature]
        for k in keys:
            bucket[k] = group[k]
        bucket["params"].append(group["params"])
    return fused.values()
moge/model/dinov2/utils/utils.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import os
8
+ import random
9
+ import subprocess
10
+ from urllib.parse import urlparse
11
+
12
+ import numpy as np
13
+ import torch
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
def load_pretrained_weights(model, pretrained_weights, checkpoint_key):
    """
    Load a checkpoint into *model* from a URL or local path (non-strict).

    Args:
        model: target nn.Module.
        pretrained_weights (str): URL or filesystem path of the checkpoint.
        checkpoint_key (str | None): if given and present in the checkpoint dict,
            load that sub-dict (e.g. "teacher") instead of the whole file.
    """
    if urlparse(pretrained_weights).scheme:  # If it looks like an URL
        state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
    else:
        state_dict = torch.load(pretrained_weights, map_location="cpu")
    if checkpoint_key is not None and checkpoint_key in state_dict:
        logger.info(f"Take key {checkpoint_key} in provided checkpoint dict")
        state_dict = state_dict[checkpoint_key]
    # remove `module.` prefix (left by DistributedDataParallel checkpoints)
    state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
    # remove `backbone.` prefix induced by multicrop wrapper
    state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
    # strict=False: tolerate missing/unexpected keys; the load message is logged below
    msg = model.load_state_dict(state_dict, strict=False)
    logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg))
34
+
35
+
36
def fix_random_seeds(seed=31):
    """
    Fix random seeds.

    Seeds every RNG used by training (torch CPU, all CUDA devices, numpy, and
    Python's `random`) with the same value for reproducibility.
    """
    for seeder in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
        seeder(seed)
44
+
45
+
46
def get_sha():
    """
    Return a human-readable summary of the git state of this source tree:
    commit sha, whether there are uncommitted changes, and the branch name.

    Best-effort: falls back to "N/A"/"clean" defaults when git is unavailable
    or the file is not inside a git checkout.
    """
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _run(command):
        # Run a git command in this file's directory and return stripped output.
        return subprocess.check_output(command, cwd=cwd).decode("ascii").strip()

    sha = "N/A"
    diff = "clean"
    branch = "N/A"
    try:
        sha = _run(["git", "rev-parse", "HEAD"])
        # Plain `git diff` first — NOTE(review): presumably to refresh the index
        # so diff-index reports accurately; confirm against git docs.
        subprocess.check_output(["git", "diff"], cwd=cwd)
        diff = _run(["git", "diff-index", "HEAD"])
        diff = "has uncommitted changes" if diff else "clean"
        branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    except Exception:
        pass  # deliberate best-effort: keep the defaults
    message = f"sha: {sha}, status: {diff}, branch: {branch}"
    return message
65
+
66
+
67
class CosineScheduler(object):
    """
    Precomputed per-iteration schedule: an optional freeze-at-zero phase,
    a linear warmup, then cosine decay from `base_value` to `final_value`.

    Indexing past `total_iters` returns `final_value`.
    """

    def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0):
        super().__init__()
        self.final_value = final_value
        self.total_iters = total_iters

        # Phase 1: frozen at zero (e.g. for weight decay on frozen params).
        frozen = np.zeros((freeze_iters))
        # Phase 2: linear warmup from start_warmup_value to base_value.
        warmup = np.linspace(start_warmup_value, base_value, warmup_iters)
        # Phase 3: cosine decay over the remaining iterations.
        steps = np.arange(total_iters - warmup_iters - freeze_iters)
        cosine = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * steps / len(steps)))

        self.schedule = np.concatenate((frozen, warmup, cosine))
        assert len(self.schedule) == self.total_iters

    def __getitem__(self, it):
        # Past the schedule's end, hold the final value.
        return self.final_value if it >= self.total_iters else self.schedule[it]
88
+
89
+
90
def has_batchnorms(model):
    """Return True iff any submodule of *model* is a (sync) batch-norm layer."""
    bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)
    return any(isinstance(module, bn_types) for _, module in model.named_modules())
moge/model/modules.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ from typing import *
7
+ from numbers import Number
8
+ import importlib
9
+ import itertools
10
+ import functools
11
+ import sys
12
+
13
+ import torch
14
+ from torch import Tensor
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+
18
+ from .dinov2.models.vision_transformer import DinoVisionTransformer
19
+ from .utils import wrap_dinov2_attention_with_sdpa, wrap_module_with_gradient_checkpointing, unwrap_module_with_gradient_checkpointing
20
+ from ..utils.geometry_torch import normalized_view_plane_uv
21
+
22
+
23
class ResidualConvBlock(nn.Module):
    """
    Pre-activation residual block: two (norm, activation, conv) stages plus a
    skip connection (1x1 conv when the channel count changes, identity otherwise).

    Args:
        in_channels: input channel count.
        out_channels: output channel count (defaults to in_channels).
        hidden_channels: intermediate channel count (defaults to in_channels).
        kernel_size: spatial kernel size of both convs.
        padding_mode: conv padding mode.
        activation: one of 'relu', 'leaky_relu', 'silu', 'elu'.
        in_norm: normalization before the first conv ('group_norm',
            'layer_norm', 'instance_norm', or 'none').
        hidden_norm: normalization before the second conv.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int = None,
        hidden_channels: int = None,
        kernel_size: int = 3,
        padding_mode: str = 'replicate',
        activation: Literal['relu', 'leaky_relu', 'silu', 'elu'] = 'relu',
        in_norm: Literal['group_norm', 'layer_norm', 'instance_norm', 'none'] = 'layer_norm',
        hidden_norm: Literal['group_norm', 'layer_norm', 'instance_norm'] = 'group_norm',
    ):
        super().__init__()
        out_channels = in_channels if out_channels is None else out_channels
        hidden_channels = in_channels if hidden_channels is None else hidden_channels

        def make_activation():
            # Fresh instance per call so the two stages hold distinct modules.
            if activation == 'relu':
                return nn.ReLU()
            if activation == 'leaky_relu':
                return nn.LeakyReLU(negative_slope=0.2)
            if activation == 'silu':
                return nn.SiLU()
            if activation == 'elu':
                return nn.ELU()
            raise ValueError(f'Unsupported activation function: {activation}')

        def make_norm(kind, channels):
            # 'layer_norm' here is channel-wise (GroupNorm with one group).
            if kind == 'group_norm':
                return nn.GroupNorm(channels // 32, channels)
            if kind == 'layer_norm':
                return nn.GroupNorm(1, channels)
            if kind == 'instance_norm':
                return nn.InstanceNorm2d(channels)
            return nn.Identity()

        conv_kwargs = dict(kernel_size=kernel_size, padding=kernel_size // 2, padding_mode=padding_mode)
        self.layers = nn.Sequential(
            make_norm(in_norm, in_channels),
            make_activation(),
            nn.Conv2d(in_channels, hidden_channels, **conv_kwargs),
            make_norm(hidden_norm, hidden_channels),
            make_activation(),
            nn.Conv2d(hidden_channels, out_channels, **conv_kwargs),
        )

        self.skip_connection = (
            nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)
            if in_channels != out_channels else nn.Identity()
        )

    def forward(self, x):
        # Residual sum of the conv branch and the (possibly projected) skip.
        return self.layers(x) + self.skip_connection(x)
74
+
75
+
76
class DINOv2Encoder(nn.Module):
    "Wrapped DINOv2 encoder supporting gradient checkpointing. Input is RGB image in range [0, 1]."
    backbone: DinoVisionTransformer
    image_mean: torch.Tensor
    image_std: torch.Tensor
    dim_features: int

    def __init__(self, backbone: str, intermediate_layers: Union[int, List[int]], dim_out: int, **deprecated_kwargs):
        """
        Args:
            backbone: name of a dinov2 hub constructor (resolved from
                `.dinov2.hub.backbones`).
            intermediate_layers: number of final layers, or explicit layer
                indices, to tap from the backbone.
            dim_out: channel dimension the tapped features are projected to.
            **deprecated_kwargs: ignored; kept for config backward compatibility.
        """
        super(DINOv2Encoder, self).__init__()

        self.intermediate_layers = intermediate_layers

        # Load the backbone architecture (pretrained weights come via init_weights)
        self.hub_loader = getattr(importlib.import_module(".dinov2.hub.backbones", __package__), backbone)
        self.backbone_name = backbone
        self.backbone = self.hub_loader(pretrained=False)

        # Feature width read off the first attention block's qkv input size
        self.dim_features = self.backbone.blocks[0].attn.qkv.in_features
        self.num_features = intermediate_layers if isinstance(intermediate_layers, int) else len(intermediate_layers)

        # One 1x1 projection per tapped layer; outputs are summed in forward()
        self.output_projections = nn.ModuleList([
            nn.Conv2d(in_channels=self.dim_features, out_channels=dim_out, kernel_size=1, stride=1, padding=0,)
            for _ in range(self.num_features)
        ])

        # ImageNet normalization constants applied to the [0, 1] input
        self.register_buffer("image_mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
        self.register_buffer("image_std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))

    @property
    def onnx_compatible_mode(self):
        # Defaults to False when never set; mirrored onto the backbone below.
        return getattr(self, "_onnx_compatible_mode", False)

    @onnx_compatible_mode.setter
    def onnx_compatible_mode(self, value: bool):
        self._onnx_compatible_mode = value
        self.backbone.onnx_compatible_mode = value

    def init_weights(self):
        # Instantiate a pretrained copy via the hub loader and copy its weights in.
        pretrained_backbone_state_dict = self.hub_loader(pretrained=True).state_dict()
        self.backbone.load_state_dict(pretrained_backbone_state_dict)

    def enable_gradient_checkpointing(self):
        # Recompute each transformer block in backward to trade compute for memory.
        for i in range(len(self.backbone.blocks)):
            wrap_module_with_gradient_checkpointing(self.backbone.blocks[i])

    def enable_pytorch_native_sdpa(self):
        # Swap each block's attention for torch's scaled_dot_product_attention path.
        for i in range(len(self.backbone.blocks)):
            wrap_dinov2_attention_with_sdpa(self.backbone.blocks[i].attn)

    def forward(self, image: torch.Tensor, token_rows: Union[int, torch.LongTensor], token_cols: Union[int, torch.LongTensor], return_class_token: bool = False) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Encode an RGB image into a (token_rows, token_cols) feature map.

        Args:
            image: (B, 3, H, W) tensor in [0, 1].
            token_rows / token_cols: token grid size; the image is resized to
                14x that in each dimension (ViT patch size 14).
            return_class_token: if True, also return the class token of the
                last tapped layer.

        Returns:
            (B, dim_out, token_rows, token_cols) features; plus the class token
            when requested.
        """
        # Antialiased resize is not ONNX-exportable, hence the mode switch.
        image_14 = F.interpolate(image, (token_rows * 14, token_cols * 14), mode="bilinear", align_corners=False, antialias=not self.onnx_compatible_mode)
        image_14 = (image_14 - self.image_mean) / self.image_std

        # Get intermediate layers from the backbone
        features = self.backbone.get_intermediate_layers(image_14, n=self.intermediate_layers, return_class_token=True)

        # Project each tapped layer to dim_out and sum them into one feature map
        x = torch.stack([
            proj(feat.permute(0, 2, 1).unflatten(2, (token_rows, token_cols)).contiguous())
            for proj, (feat, clstoken) in zip(self.output_projections, features)
        ], dim=1).sum(dim=1)

        if return_class_token:
            return x, features[-1][1]
        else:
            return x
142
+
143
+
144
class Resampler(nn.Sequential):
    """
    A conv layer paired with an up- or down-sampling operator, chosen by `type_`.

    Upsamplers: 'pixel_shuffle', 'nearest', 'bilinear', 'conv_transpose'.
    Downsamplers: 'pixel_unshuffle', 'avg_pool', 'max_pool'.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        type_: resampling strategy.
        scale_factor: spatial scaling ratio (default 2).
    """
    def __init__(self,
        in_channels: int,
        out_channels: int,
        type_: Literal['pixel_shuffle', 'nearest', 'bilinear', 'conv_transpose', 'pixel_unshuffle', 'avg_pool', 'max_pool'],
        scale_factor: int = 2,
    ):
        if type_ == 'pixel_shuffle':
            nn.Sequential.__init__(self,
                nn.Conv2d(in_channels, out_channels * (scale_factor ** 2), kernel_size=3, stride=1, padding=1, padding_mode='replicate'),
                nn.PixelShuffle(scale_factor),
                nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, padding_mode='replicate')
            )
            # Copy phase-0 weights into every other sub-pixel phase so all
            # shuffled positions start identical — NOTE(review): presumably so
            # the initial output is equivalent to nearest-neighbor upsampling;
            # confirm against upstream MoGe.
            for i in range(1, scale_factor ** 2):
                self[0].weight.data[i::scale_factor ** 2] = self[0].weight.data[0::scale_factor ** 2]
                self[0].bias.data[i::scale_factor ** 2] = self[0].bias.data[0::scale_factor ** 2]
        elif type_ in ['nearest', 'bilinear']:
            nn.Sequential.__init__(self,
                nn.Upsample(scale_factor=scale_factor, mode=type_, align_corners=False if type_ == 'bilinear' else None),
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, padding_mode='replicate')
            )
        elif type_ == 'conv_transpose':
            nn.Sequential.__init__(self,
                nn.ConvTranspose2d(in_channels, out_channels, kernel_size=scale_factor, stride=scale_factor),
                nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, padding_mode='replicate')
            )
            # Broadcast the top-left tap to the whole kernel — NOTE(review):
            # likewise appears intended to start as nearest-like upsampling.
            self[0].weight.data[:] = self[0].weight.data[:, :, :1, :1]
        elif type_ == 'pixel_unshuffle':
            nn.Sequential.__init__(self,
                nn.PixelUnshuffle(scale_factor),
                nn.Conv2d(in_channels * (scale_factor ** 2), out_channels, kernel_size=3, stride=1, padding=1, padding_mode='replicate')
            )
        elif type_ == 'avg_pool':
            nn.Sequential.__init__(self,
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, padding_mode='replicate'),
                nn.AvgPool2d(kernel_size=scale_factor, stride=scale_factor),
            )
        elif type_ == 'max_pool':
            nn.Sequential.__init__(self,
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, padding_mode='replicate'),
                nn.MaxPool2d(kernel_size=scale_factor, stride=scale_factor),
            )
        else:
            raise ValueError(f'Unsupported resampler type: {type_}')
188
+
189
class MLP(nn.Sequential):
    """
    Simple multi-layer perceptron over the given layer widths: a Linear+ReLU
    pair for every hidden transition, and a bare Linear producing the output.
    """
    def __init__(self, dims: Sequence[int]):
        layers = []
        for dim_in, dim_out in zip(dims[:-2], dims[1:-1]):
            layers.append(nn.Linear(dim_in, dim_out))
            layers.append(nn.ReLU(inplace=True))
        # Final projection has no activation.
        layers.append(nn.Linear(dims[-2], dims[-1]))
        nn.Sequential.__init__(self, *layers)
198
+
199
+
200
class ConvStack(nn.Module):
    """
    Coarse-to-fine stack of residual conv stages connected by resamplers.

    Each stage i: optional 1x1 input projection -> (added to the running state
    for i > 0) -> residual blocks -> optional 1x1 output projection. Between
    stages, the running state is resampled (typically 2x upsampled).

    Args:
        dim_in: per-stage input channels (None disables that stage's input proj).
        dim_res_blocks: per-stage working channel widths.
        dim_out: per-stage output channels (None disables that stage's output proj).
        resamplers: resampler type per transition (or one type for all).
        dim_times_res_block_hidden: hidden-width multiplier inside each residual block.
        num_res_blocks: residual blocks per stage (int or per-stage list).
        res_block_in_norm / res_block_hidden_norm: norms forwarded to ResidualConvBlock.
        activation: activation forwarded to ResidualConvBlock.
    """
    def __init__(self,
        dim_in: List[Optional[int]],
        dim_res_blocks: List[int],
        dim_out: List[Optional[int]],
        resamplers: Union[Literal['pixel_shuffle', 'nearest', 'bilinear', 'conv_transpose', 'pixel_unshuffle', 'avg_pool', 'max_pool'], List],
        dim_times_res_block_hidden: int = 1,
        num_res_blocks: int = 1,
        res_block_in_norm: Literal['layer_norm', 'group_norm' , 'instance_norm', 'none'] = 'layer_norm',
        res_block_hidden_norm: Literal['layer_norm', 'group_norm' , 'instance_norm', 'none'] = 'group_norm',
        activation: Literal['relu', 'leaky_relu', 'silu', 'elu'] = 'relu',
    ):
        super().__init__()
        # Scalar configs are broadcast across stages via itertools.repeat.
        self.input_blocks = nn.ModuleList([
            nn.Conv2d(dim_in_, dim_res_block_, kernel_size=1, stride=1, padding=0) if dim_in_ is not None else nn.Identity()
            for dim_in_, dim_res_block_ in zip(dim_in if isinstance(dim_in, Sequence) else itertools.repeat(dim_in), dim_res_blocks)
        ])
        # One resampler per transition between consecutive stages.
        self.resamplers = nn.ModuleList([
            Resampler(dim_prev, dim_succ, scale_factor=2, type_=resampler)
            for i, (dim_prev, dim_succ, resampler) in enumerate(zip(
                dim_res_blocks[:-1],
                dim_res_blocks[1:],
                resamplers if isinstance(resamplers, Sequence) else itertools.repeat(resamplers)
            ))
        ])
        self.res_blocks = nn.ModuleList([
            nn.Sequential(
                *(
                    ResidualConvBlock(
                        dim_res_block_, dim_res_block_, dim_times_res_block_hidden * dim_res_block_,
                        activation=activation, in_norm=res_block_in_norm, hidden_norm=res_block_hidden_norm
                    ) for _ in range(num_res_blocks[i] if isinstance(num_res_blocks, list) else num_res_blocks)
                )
            ) for i, dim_res_block_ in enumerate(dim_res_blocks)
        ])
        self.output_blocks = nn.ModuleList([
            nn.Conv2d(dim_res_block_, dim_out_, kernel_size=1, stride=1, padding=0) if dim_out_ is not None else nn.Identity()
            for dim_out_, dim_res_block_ in zip(dim_out if isinstance(dim_out, Sequence) else itertools.repeat(dim_out), dim_res_blocks)
        ])

    def enable_gradient_checkpointing(self):
        # Wrap resamplers and individual residual blocks for activation recomputation.
        for i in range(len(self.resamplers)):
            self.resamplers[i] = wrap_module_with_gradient_checkpointing(self.resamplers[i])
        for i in range(len(self.res_blocks)):
            for j in range(len(self.res_blocks[i])):
                self.res_blocks[i][j] = wrap_module_with_gradient_checkpointing(self.res_blocks[i][j])

    def forward(self, in_features: List[torch.Tensor]):
        """
        Args:
            in_features: one feature map per stage (entries for later stages
                are added to the upsampled running state).

        Returns:
            List of per-stage outputs after the output projections.
        """
        out_features = []
        for i in range(len(self.res_blocks)):
            feature = self.input_blocks[i](in_features[i])
            if i == 0:
                x = feature
            elif feature is not None:
                # Fuse this stage's input with the resampled running state.
                x = x + feature
            x = self.res_blocks[i](x)
            out_features.append(self.output_blocks[i](x))
            if i < len(self.res_blocks) - 1:
                x = self.resamplers[i](x)
        return out_features
moge/model/transforms.py ADDED
@@ -0,0 +1,1344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ from typing import *
7
+ from numbers import Number
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+
12
+ import inspect
13
+ from functools import wraps
14
+
15
+ import warnings
16
+
17
def suppress_traceback(fn):
    """
    Decorator that trims the two innermost wrapper frames from the traceback of
    any exception raised by *fn*, so errors point at the caller's code.
    """
    @wraps(fn)
    def inner(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception as exc:
            # Drop the frames introduced by this wrapper before re-raising.
            exc.__traceback__ = exc.__traceback__.tb_next.tb_next
            raise
    return inner
26
+
27
+
28
class no_warnings:
    """
    Apply a warnings filter around a function call or a code block.

    Usable both as a decorator (`@no_warnings()`) and as a context manager
    (`with no_warnings(): ...`). The default action is 'ignore'; extra keyword
    arguments are forwarded to `warnings.simplefilter`.
    """

    def __init__(self, action: str = 'ignore', **kwargs):
        self.action = action
        self.filter_kwargs = kwargs

    def __call__(self, fn):
        # Decorator form: a fresh catch_warnings scope per invocation.
        @wraps(fn)
        def guarded(*args, **kwargs):
            with warnings.catch_warnings():
                warnings.simplefilter(self.action, **self.filter_kwargs)
                return fn(*args, **kwargs)
        return guarded

    def __enter__(self):
        # Context-manager form: delegate scoping to warnings.catch_warnings.
        self.warnings_manager = warnings.catch_warnings()
        self.warnings_manager.__enter__()
        warnings.simplefilter(self.action, **self.filter_kwargs)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.warnings_manager.__exit__(exc_type, exc_val, exc_tb)
48
+
49
+
50
def get_device(args, kwargs):
    """
    Return the common device of all tensor arguments, or None if there are none.

    Raises:
        ValueError: if tensor arguments live on different devices.
    """
    device = None
    for value in (list(args) + list(kwargs.values())):
        if not isinstance(value, torch.Tensor):
            continue
        if device is None:
            device = value.device
        elif device != value.device:
            raise ValueError("All tensors must be on the same device.")
    return device
59
+
60
+
61
def get_args_order(func, args, kwargs):
    """
    Get the order of the arguments of a function.

    Maps the supplied positional and keyword arguments back to their indices in
    *func*'s declared signature.

    Returns:
        (args_order, kwargs_order): list of signature indices for the positional
        arguments, and a dict mapping each matched keyword name to its index.
    """
    declared = inspect.getfullargspec(func).args
    index_of = {name: i for i, name in enumerate(declared)}
    # Keyword arguments claim their declared slots first.
    kwargs_order = {name: index_of[name] for name in kwargs if name in declared}
    # Positional arguments fill the remaining slots in declaration order.
    remaining = [name for name in declared if name not in kwargs_order]
    args_order = [index_of[name] for name in remaining[:len(args)]]
    return args_order, kwargs_order
77
+
78
+
79
def broadcast_args(args, kwargs, args_dim, kwargs_dim):
    """
    Broadcast the leading ("spatial") dimensions of all tensor arguments to a
    common shape, numpy-style, leaving each argument's trailing `*_dim` core
    dimensions untouched.

    Args:
        args: list of positional arguments (tensors are broadcast in place).
        kwargs: dict of keyword arguments (tensor values are broadcast).
        args_dim: per-positional-argument count of trailing core dims
            (None = do not broadcast this argument).
        kwargs_dim: same, keyed by keyword name.

    Returns:
        (args, kwargs, spatial): the broadcast arguments and the resolved
        common spatial shape as a list.

    Raises:
        ValueError: if two arguments have incompatible spatial dimensions.
    """
    spatial = []
    for arg, arg_dim in zip(args + list(kwargs.values()), args_dim + list(kwargs_dim.values())):
        if isinstance(arg, torch.Tensor) and arg_dim is not None:
            arg_spatial = arg.shape[:arg.ndim - arg_dim]
            # Left-pad the running spatial shape with 1s so shapes align on the right.
            if len(arg_spatial) > len(spatial):
                spatial = [1] * (len(arg_spatial) - len(spatial)) + spatial
            # Compare right-aligned dimensions. Fixed off-by-one: the original
            # used range(len(arg_spatial)) so j=0 indexed spatial[-0] == spatial[0],
            # which compared LEADING dims and wrongly rejected e.g. (3,) vs (2, 3).
            for j in range(1, len(arg_spatial) + 1):
                if spatial[-j] < arg_spatial[-j]:
                    if spatial[-j] == 1:
                        spatial[-j] = arg_spatial[-j]
                    else:
                        raise ValueError("Cannot broadcast arguments.")
    for i, arg in enumerate(args):
        if isinstance(arg, torch.Tensor) and args_dim[i] is not None:
            args[i] = torch.broadcast_to(arg, [*spatial, *arg.shape[arg.ndim - args_dim[i]:]])
    for key, arg in kwargs.items():
        if isinstance(arg, torch.Tensor) and kwargs_dim[key] is not None:
            kwargs[key] = torch.broadcast_to(arg, [*spatial, *arg.shape[arg.ndim - kwargs_dim[key]:]])
    return args, kwargs, spatial
99
+
100
@suppress_traceback
def batched(*dims):
    """
    Decorator that allows a function to be called with batched arguments.

    `dims` gives, per signature position, the number of trailing "core"
    dimensions of that argument (None = pass through untouched). The wrapper:
    numbers/lists are promoted to tensors, leading (batch) dims are broadcast
    to a common shape, arguments are flattened to a single batch dim, the
    wrapped function is called once, and results get the batch shape restored.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, device=torch.device('cpu'), **kwargs):
            args = list(args)
            # get arguments dimensions
            args_order, kwargs_order = get_args_order(func, args, kwargs)
            args_dim = [dims[i] for i in args_order]
            kwargs_dim = {key: dims[i] for key, i in kwargs_order.items()}
            # convert to torch tensor (device taken from tensor args if any)
            device = get_device(args, kwargs) or device
            for i, arg in enumerate(args):
                if isinstance(arg, (Number, list, tuple)) and args_dim[i] is not None:
                    args[i] = torch.tensor(arg, device=device)
            for key, arg in kwargs.items():
                if isinstance(arg, (Number, list, tuple)) and kwargs_dim[key] is not None:
                    kwargs[key] = torch.tensor(arg, device=device)
            # broadcast arguments to a common leading (spatial) shape
            args, kwargs, spatial = broadcast_args(args, kwargs, args_dim, kwargs_dim)
            # flatten all leading dims into one batch dim before the call
            for i, (arg, arg_dim) in enumerate(zip(args, args_dim)):
                if isinstance(arg, torch.Tensor) and arg_dim is not None:
                    args[i] = arg.reshape([-1, *arg.shape[arg.ndim-arg_dim:]])
            for key, arg in kwargs.items():
                if isinstance(arg, torch.Tensor) and kwargs_dim[key] is not None:
                    kwargs[key] = arg.reshape([-1, *arg.shape[arg.ndim-kwargs_dim[key]:]])
            # call function
            results = func(*args, **kwargs)
            type_results = type(results)
            results = list(results) if isinstance(results, (tuple, list)) else [results]
            # restore spatial dimensions on every result
            for i, result in enumerate(results):
                results[i] = result.reshape([*spatial, *result.shape[1:]])
            if type_results == tuple:
                results = tuple(results)
            elif type_results == list:
                results = list(results)
            else:
                results = results[0]
            return results
        return wrapper
    return decorator
145
+
146
# Public API of this module: camera/projection matrix constructors and
# conversions, rotation representations, and 2D transform helpers.
__all__ = [
    'perspective',
    'perspective_from_fov',
    'perspective_from_fov_xy',
    'intrinsics_from_focal_center',
    'intrinsics_from_fov',
    'intrinsics_from_fov_xy',
    'focal_to_fov',
    'fov_to_focal',
    'intrinsics_to_fov',
    'view_look_at',
    'extrinsics_look_at',
    'perspective_to_intrinsics',
    'intrinsics_to_perspective',
    'extrinsics_to_view',
    'view_to_extrinsics',
    'normalize_intrinsics',
    'crop_intrinsics',
    'pixel_to_uv',
    'pixel_to_ndc',
    'uv_to_pixel',
    'project_depth',
    'depth_buffer_to_linear',
    'project_gl',
    'project_cv',
    'unproject_gl',
    'unproject_cv',
    'skew_symmetric',
    'rotation_matrix_from_vectors',
    'euler_axis_angle_rotation',
    'euler_angles_to_matrix',
    'matrix_to_euler_angles',
    'matrix_to_quaternion',
    'quaternion_to_matrix',
    'matrix_to_axis_angle',
    'axis_angle_to_matrix',
    'axis_angle_to_quaternion',
    'quaternion_to_axis_angle',
    'slerp',
    'interpolate_extrinsics',
    'interpolate_view',
    'extrinsics_to_essential',
    'to4x4',
    'rotation_matrix_2d',
    'rotate_2d',
    'translate_2d',
    'scale_2d',
    'apply_2d',
]
195
+
196
+
197
@batched(0,0,0,0)
def perspective(
    fov_y: Union[float, torch.Tensor],
    aspect: Union[float, torch.Tensor],
    near: Union[float, torch.Tensor],
    far: Union[float, torch.Tensor]
) -> torch.Tensor:
    """
    Get OpenGL perspective matrix

    Args:
        fov_y (float | torch.Tensor): field of view in y axis
        aspect (float | torch.Tensor): aspect ratio
        near (float | torch.Tensor): near plane to clip
        far (float | torch.Tensor): far plane to clip

    Returns:
        (torch.Tensor): [..., 4, 4] perspective matrix
    """
    # @batched flattens leading dims, so inside the body everything is [N].
    N = fov_y.shape[0]
    ret = torch.zeros((N, 4, 4), dtype=fov_y.dtype, device=fov_y.device)
    ret[:, 0, 0] = 1. / (torch.tan(fov_y / 2) * aspect)  # x focal term
    ret[:, 1, 1] = 1. / (torch.tan(fov_y / 2))           # y focal term
    ret[:, 2, 2] = (near + far) / (near - far)           # depth remap
    ret[:, 2, 3] = 2. * near * far / (near - far)
    ret[:, 3, 2] = -1.                                   # perspective divide by -z
    return ret
224
+
225
+
226
def perspective_from_fov(
    fov: Union[float, torch.Tensor],
    width: Union[int, torch.Tensor],
    height: Union[int, torch.Tensor],
    near: Union[float, torch.Tensor],
    far: Union[float, torch.Tensor]
) -> torch.Tensor:
    """
    Get OpenGL perspective matrix from the field of view of the largest
    image dimension.

    Args:
        fov (float | torch.Tensor): field of view in largest dimension
        width (int | torch.Tensor): image width
        height (int | torch.Tensor): image height
        near (float | torch.Tensor): near plane to clip
        far (float | torch.Tensor): far plane to clip

    Returns:
        (torch.Tensor): [..., 4, 4] perspective matrix
    """
    # Convert the dominant-axis FoV into the vertical FoV expected by perspective().
    half_tan = torch.tan(fov / 2)
    fov_y = 2 * torch.atan(half_tan * height / torch.maximum(width, height))
    return perspective(fov_y, width / height, near, far)
249
+
250
+
251
def perspective_from_fov_xy(
    fov_x: Union[float, torch.Tensor],
    fov_y: Union[float, torch.Tensor],
    near: Union[float, torch.Tensor],
    far: Union[float, torch.Tensor]
) -> torch.Tensor:
    """
    Get OpenGL perspective matrix from the fields of view in the x and y axes.

    Args:
        fov_x (float | torch.Tensor): field of view in x axis
        fov_y (float | torch.Tensor): field of view in y axis
        near (float | torch.Tensor): near plane to clip
        far (float | torch.Tensor): far plane to clip

    Returns:
        (torch.Tensor): [..., 4, 4] perspective matrix
    """
    # The aspect ratio implied by the two FoVs is tan(fov_x/2) / tan(fov_y/2).
    return perspective(fov_y, torch.tan(fov_x / 2) / torch.tan(fov_y / 2), near, far)
271
+
272
+
273
@batched(0,0,0,0)
def intrinsics_from_focal_center(
    fx: Union[float, torch.Tensor],
    fy: Union[float, torch.Tensor],
    cx: Union[float, torch.Tensor],
    cy: Union[float, torch.Tensor]
) -> torch.Tensor:
    """
    Get OpenCV intrinsics matrix

    Args:
        fx (float | torch.Tensor): focal length in x axis
        fy (float | torch.Tensor): focal length in y axis
        cx (float | torch.Tensor): principal point in x axis
        cy (float | torch.Tensor): principal point in y axis

    Returns:
        (torch.Tensor): [..., 3, 3] OpenCV intrinsics matrix
    """
    # @batched flattens leading dims, so fx/fy/cx/cy are all [N] here.
    N = fx.shape[0]
    zeros = torch.zeros(N, dtype=fx.dtype, device=fx.device)
    ones = torch.ones(N, dtype=fx.dtype, device=fx.device)
    # Row-major 3x3: [[fx, 0, cx], [0, fy, cy], [0, 0, 1]].
    # (Removed a dead torch.zeros((N, 3, 3)) allocation that was immediately
    # overwritten in the original.)
    return torch.stack([fx, zeros, cx, zeros, fy, cy, zeros, zeros, ones], dim=-1).unflatten(-1, (3, 3))
297
+
298
+
299
@batched(0, 0, 0, 0, 0, 0)
def intrinsics_from_fov(
    fov_max: Union[float, torch.Tensor] = None,
    fov_min: Union[float, torch.Tensor] = None,
    fov_x: Union[float, torch.Tensor] = None,
    fov_y: Union[float, torch.Tensor] = None,
    width: Union[int, torch.Tensor] = None,
    height: Union[int, torch.Tensor] = None,
) -> torch.Tensor:
    """
    Get normalized OpenCV intrinsics matrix from given field of view.
    You can provide either fov_max, fov_min, fov_x or fov_y

    Args:
        fov_max (float | torch.Tensor): field of view in largest dimension
        fov_min (float | torch.Tensor): field of view in smallest dimension
        fov_x (float | torch.Tensor): field of view in x axis
        fov_y (float | torch.Tensor): field of view in y axis
        width (int | torch.Tensor): image width
        height (int | torch.Tensor): image height

    Returns:
        (torch.Tensor): [..., 3, 3] OpenCV intrinsics matrix

    Raises:
        ValueError: if none of fov_max, fov_min, fov_x, fov_y is provided.
    """
    if fov_max is not None:
        fx = torch.maximum(width, height) / width / (2 * torch.tan(fov_max / 2))
        fy = torch.maximum(width, height) / height / (2 * torch.tan(fov_max / 2))
    elif fov_min is not None:
        fx = torch.minimum(width, height) / width / (2 * torch.tan(fov_min / 2))
        fy = torch.minimum(width, height) / height / (2 * torch.tan(fov_min / 2))
    elif fov_x is not None and fov_y is not None:
        fx = 1 / (2 * torch.tan(fov_x / 2))
        fy = 1 / (2 * torch.tan(fov_y / 2))
    elif fov_x is not None:
        fx = 1 / (2 * torch.tan(fov_x / 2))
        fy = fx * width / height
    elif fov_y is not None:
        fy = 1 / (2 * torch.tan(fov_y / 2))
        fx = fy * height / width
    else:
        # Previously fell through with fx/fy unbound, raising an opaque
        # NameError; fail with a descriptive error instead.
        raise ValueError("One of fov_max, fov_min, fov_x or fov_y must be provided.")
    # Normalized intrinsics: principal point at the image center in uv space.
    cx = 0.5
    cy = 0.5
    ret = intrinsics_from_focal_center(fx, fy, cx, cy)
    return ret
342
+
343
+
344
+
345
def intrinsics_from_fov_xy(
    fov_x: Union[float, torch.Tensor],
    fov_y: Union[float, torch.Tensor]
) -> torch.Tensor:
    """
    Get OpenCV intrinsics matrix from field of view in x and y axis

    Args:
        fov_x (float | torch.Tensor): field of view in x axis (radians)
        fov_y (float | torch.Tensor): field of view in y axis (radians)

    Returns:
        (torch.Tensor): [..., 3, 3] OpenCV intrinsics matrix
    """
    # Normalized focal lengths from the half-angle tangents; principal point
    # sits at the center of the normalized image plane.
    fx = 0.5 / torch.tan(fov_x / 2)
    fy = 0.5 / torch.tan(fov_y / 2)
    center = 0.5
    return intrinsics_from_focal_center(fx, fy, center, center)
363
+
364
+
365
def focal_to_fov(focal: torch.Tensor):
    """Convert a normalized focal length to the corresponding field of view (radians)."""
    half_angle = torch.atan(0.5 / focal)
    return half_angle * 2
367
+
368
+
369
def fov_to_focal(fov: torch.Tensor):
    """Convert a field of view (radians) to the corresponding normalized focal length."""
    half_fov = fov / 2
    return 0.5 / torch.tan(half_fov)
371
+
372
+
373
def intrinsics_to_fov(intrinsics: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    "NOTE: approximate FOV by assuming centered principal point"
    # Inverse of fov -> focal: fov = 2 * atan(0.5 / f), applied to the two
    # diagonal focal entries of the normalized intrinsics.
    fx = intrinsics[..., 0, 0]
    fy = intrinsics[..., 1, 1]
    return 2 * torch.atan(0.5 / fx), 2 * torch.atan(0.5 / fy)
378
+
379
+
380
@batched(1,1,1)
def view_look_at(
    eye: torch.Tensor,
    look_at: torch.Tensor,
    up: torch.Tensor
) -> torch.Tensor:
    """
    Get OpenGL view matrix looking at something

    Args:
        eye (torch.Tensor): [..., 3] the eye position
        look_at (torch.Tensor): [..., 3] the position to look at
        up (torch.Tensor): [..., 3] head up direction (y axis in screen space). Not necessarily othogonal to view direction

    Returns:
        (torch.Tensor): [..., 4, 4], view matrix
    """
    batch = eye.shape[0]
    # OpenGL cameras look down -z, so +z points from the target back to the eye.
    z_axis = eye - look_at
    x_axis = torch.cross(up, z_axis, dim=-1)
    y_axis = torch.cross(z_axis, x_axis, dim=-1)
    x_axis = x_axis / x_axis.norm(dim=-1, keepdim=True)
    y_axis = y_axis / y_axis.norm(dim=-1, keepdim=True)
    z_axis = z_axis / z_axis.norm(dim=-1, keepdim=True)
    rotation = torch.stack([x_axis, y_axis, z_axis], dim=-2)
    # Translation brings the eye to the origin: t = -R @ eye.
    translation = -torch.matmul(rotation, eye[..., None])
    view = torch.zeros((batch, 4, 4), dtype=eye.dtype, device=eye.device)
    view[:, :3, :3] = rotation
    view[:, :3, 3] = translation[:, :, 0]
    view[:, 3, 3] = 1.
    return view
412
+
413
+
414
@batched(1, 1, 1)
def extrinsics_look_at(
    eye: torch.Tensor,
    look_at: torch.Tensor,
    up: torch.Tensor
) -> torch.Tensor:
    """
    Get OpenCV extrinsics matrix looking at something

    Args:
        eye (torch.Tensor): [..., 3] the eye position
        look_at (torch.Tensor): [..., 3] the position to look at
        up (torch.Tensor): [..., 3] head up direction (-y axis in screen space). Not necessarily othogonal to view direction

    Returns:
        (torch.Tensor): [..., 4, 4], extrinsics matrix
    """
    batch = eye.shape[0]
    # OpenCV cameras look down +z, with y pointing down; hence -up below.
    z_axis = look_at - eye
    x_axis = torch.cross(-up, z_axis, dim=-1)
    y_axis = torch.cross(z_axis, x_axis, dim=-1)
    x_axis = x_axis / x_axis.norm(dim=-1, keepdim=True)
    y_axis = y_axis / y_axis.norm(dim=-1, keepdim=True)
    z_axis = z_axis / z_axis.norm(dim=-1, keepdim=True)
    rotation = torch.stack([x_axis, y_axis, z_axis], dim=-2)
    # Translation brings the eye to the origin: t = -R @ eye.
    translation = -torch.matmul(rotation, eye[..., None])
    extr = torch.zeros((batch, 4, 4), dtype=eye.dtype, device=eye.device)
    extr[:, :3, :3] = rotation
    extr[:, :3, 3] = translation[:, :, 0]
    extr[:, 3, 3] = 1.
    return extr
446
+
447
+
448
@batched(2)
def perspective_to_intrinsics(
    perspective: torch.Tensor
) -> torch.Tensor:
    """
    OpenGL perspective matrix to OpenCV intrinsics

    Args:
        perspective (torch.Tensor): [..., 4, 4] OpenGL perspective matrix

    Returns:
        (torch.Tensor): shape [..., 3, 3] OpenCV intrinsics
    """
    # Rows 0, 1, 3 must have a zero in the last column for a projection matrix.
    # NOTE: torch.allclose requires a Tensor second operand — the original
    # passed the Python int 0, which raises a TypeError at runtime.
    last_col = perspective[:, [0, 1, 3], 3]
    assert torch.allclose(last_col, torch.zeros_like(last_col)), "The perspective matrix is not a projection matrix"
    # Map NDC ([-1, 1], y-up) to uv ([0, 1], y-down) and flip the handedness of
    # the camera axes (OpenGL looks down -z, OpenCV down +z).
    ret = torch.tensor([[0.5, 0., 0.5], [0., -0.5, 0.5], [0., 0., 1.]], dtype=perspective.dtype, device=perspective.device) \
        @ perspective[:, [0, 1, 3], :3] \
        @ torch.diag(torch.tensor([1, -1, -1], dtype=perspective.dtype, device=perspective.device))
    return ret / ret[:, 2, 2, None, None]
466
+
467
+
468
@batched(2,0,0)
def intrinsics_to_perspective(
    intrinsics: torch.Tensor,
    near: Union[float, torch.Tensor],
    far: Union[float, torch.Tensor],
) -> torch.Tensor:
    """
    OpenCV intrinsics to OpenGL perspective matrix

    Args:
        intrinsics (torch.Tensor): [..., 3, 3] OpenCV intrinsics matrix
        near (float | torch.Tensor): [...] near plane to clip
        far (float | torch.Tensor): [...] far plane to clip
    Returns:
        (torch.Tensor): [..., 4, 4] OpenGL perspective matrix
    """
    batch = intrinsics.shape[0]
    # Pull focal lengths and principal point off the (normalized) intrinsics.
    fx = intrinsics[:, 0, 0]
    fy = intrinsics[:, 1, 1]
    cx = intrinsics[:, 0, 2]
    cy = intrinsics[:, 1, 2]
    proj = torch.zeros((batch, 4, 4), dtype=intrinsics.dtype, device=intrinsics.device)
    proj[:, 0, 0] = 2 * fx
    proj[:, 1, 1] = 2 * fy
    # Off-center terms account for a principal point away from the image center.
    proj[:, 0, 2] = -2 * cx + 1
    proj[:, 1, 2] = 2 * cy - 1
    # Standard OpenGL depth mapping of [near, far] onto clip space.
    proj[:, 2, 2] = (near + far) / (near - far)
    proj[:, 2, 3] = 2. * near * far / (near - far)
    proj[:, 3, 2] = -1.
    return proj
496
+
497
+
498
@batched(2)
def extrinsics_to_view(
    extrinsics: torch.Tensor
) -> torch.Tensor:
    """
    OpenCV camera extrinsics to OpenGL view matrix

    Args:
        extrinsics (torch.Tensor): [..., 4, 4] OpenCV camera extrinsics matrix

    Returns:
        (torch.Tensor): [..., 4, 4] OpenGL view matrix
    """
    # Negating the y and z rows converts between the OpenCV (y down, z forward)
    # and OpenGL (y up, z backward) camera conventions.
    row_signs = torch.tensor([1, -1, -1, 1], dtype=extrinsics.dtype, device=extrinsics.device)
    return extrinsics * row_signs[:, None]
512
+
513
+
514
@batched(2)
def view_to_extrinsics(
    view: torch.Tensor
) -> torch.Tensor:
    """
    OpenGL view matrix to OpenCV camera extrinsics

    Args:
        view (torch.Tensor): [..., 4, 4] OpenGL view matrix

    Returns:
        (torch.Tensor): [..., 4, 4] OpenCV camera extrinsics matrix
    """
    # The conversion is an involution: flip the sign of the y and z rows,
    # exactly as in extrinsics_to_view.
    row_signs = torch.tensor([1, -1, -1, 1], dtype=view.dtype, device=view.device)
    return view * row_signs[:, None]
528
+
529
+
530
@batched(2,0,0)
def normalize_intrinsics(
    intrinsics: torch.Tensor,
    width: Union[int, torch.Tensor],
    height: Union[int, torch.Tensor]
) -> torch.Tensor:
    """
    Normalize camera intrinsics(s) to uv space

    Args:
        intrinsics (torch.Tensor): [..., 3, 3] camera intrinsics(s) to normalize
        width (int | torch.Tensor): [...] image width(s)
        height (int | torch.Tensor): [...] image height(s)

    Returns:
        (torch.Tensor): [..., 3, 3] normalized camera intrinsics(s)
    """
    zeros = torch.zeros_like(width)
    ones = torch.ones_like(width)
    # Stack along the LAST dim so each batch element keeps its 9 matrix entries
    # contiguous. The previous stack along dim 0 produced shape (9, N), whose
    # reshape to (N, 3, 3) interleaves batch elements whenever N > 1.
    # The 0.5 offsets place uv coordinates at pixel centers.
    transform = torch.stack([
        1 / width, zeros, 0.5 / width,
        zeros, 1 / height, 0.5 / height,
        zeros, zeros, ones
    ], dim=-1).reshape(*zeros.shape, 3, 3).to(intrinsics)
    return transform @ intrinsics
555
+
556
+
557
+
558
@batched(2,0,0,0,0,0,0)
def crop_intrinsics(
    intrinsics: torch.Tensor,
    width: Union[int, torch.Tensor],
    height: Union[int, torch.Tensor],
    left: Union[int, torch.Tensor],
    top: Union[int, torch.Tensor],
    crop_width: Union[int, torch.Tensor],
    crop_height: Union[int, torch.Tensor]
) -> torch.Tensor:
    """
    Evaluate the new intrinsics(s) after crop the image: cropped_img = img[top:top+crop_height, left:left+crop_width]

    Args:
        intrinsics (torch.Tensor): [..., 3, 3] camera intrinsics(s) to crop
        width (int | torch.Tensor): [...] image width(s)
        height (int | torch.Tensor): [...] image height(s)
        left (int | torch.Tensor): [...] left crop boundary
        top (int | torch.Tensor): [...] top crop boundary
        crop_width (int | torch.Tensor): [...] crop width
        crop_height (int | torch.Tensor): [...] crop height

    Returns:
        (torch.Tensor): [..., 3, 3] cropped camera intrinsics(s)
    """
    zeros = torch.zeros_like(width)
    ones = torch.ones_like(width)
    # Stack along the LAST dim so each batch element keeps its 9 matrix entries
    # contiguous. The previous stack along dim 0 produced shape (9, N), whose
    # reshape to (N, 3, 3) interleaves batch elements whenever N > 1.
    transform = torch.stack([
        width / crop_width, zeros, -left / crop_width,
        zeros, height / crop_height, -top / crop_height,
        zeros, zeros, ones
    ], dim=-1).reshape(*zeros.shape, 3, 3).to(intrinsics)
    return transform @ intrinsics
591
+
592
+
593
@batched(1,0,0)
def pixel_to_uv(
    pixel: torch.Tensor,
    width: Union[int, torch.Tensor],
    height: Union[int, torch.Tensor]
) -> torch.Tensor:
    """
    Args:
        pixel (torch.Tensor): [..., 2] pixel coordinrates defined in image space, x range is (0, W - 1), y range is (0, H - 1)
        width (int | torch.Tensor): [...] image width(s)
        height (int | torch.Tensor): [...] image height(s)

    Returns:
        (torch.Tensor): [..., 2] pixel coordinrates defined in uv space, the range is (0, 1)
    """
    if not torch.is_floating_point(pixel):
        pixel = pixel.float()
    size = torch.stack([width, height], dim=-1).to(pixel)
    # Pixel centers sit at half-integer coordinates, hence the +0.5 shift.
    return (pixel + 0.5) / size
612
+
613
+
614
@batched(1,0,0)
def uv_to_pixel(
    uv: torch.Tensor,
    width: Union[int, torch.Tensor],
    height: Union[int, torch.Tensor]
) -> torch.Tensor:
    """
    Args:
        uv (torch.Tensor): [..., 2] pixel coordinrates defined in uv space, the range is (0, 1)
        width (int | torch.Tensor): [...] image width(s)
        height (int | torch.Tensor): [...] image height(s)

    Returns:
        (torch.Tensor): [..., 2] pixel coordinates in image space; inverse of pixel_to_uv
    """
    size = torch.stack([width, height], dim=-1).to(uv)
    # Undo the half-pixel center offset applied by pixel_to_uv.
    return uv * size - 0.5
631
+
632
+
633
@batched(1,0,0)
def pixel_to_ndc(
    pixel: torch.Tensor,
    width: Union[int, torch.Tensor],
    height: Union[int, torch.Tensor]
) -> torch.Tensor:
    """
    Args:
        pixel (torch.Tensor): [..., 2] pixel coordinrates defined in image space, x range is (0, W - 1), y range is (0, H - 1)
        width (int | torch.Tensor): [...] image width(s)
        height (int | torch.Tensor): [...] image height(s)

    Returns:
        (torch.Tensor): [..., 2] pixel coordinrates defined in ndc space, the range is (-1, 1)
    """
    if not torch.is_floating_point(pixel):
        pixel = pixel.float()
    # ndc = uv * [2, -2] + [-1, 1]: x maps (0,1) -> (-1,1), y is flipped since
    # ndc is y-up while image space is y-down.
    # BUG FIX: the original divided by (size * [2, -2]), putting the [2, -2]
    # factor in the denominator instead of multiplying uv by it.
    uv = (pixel + 0.5) / torch.stack([width, height], dim=-1).to(pixel)
    return uv * torch.tensor([2, -2], dtype=pixel.dtype, device=pixel.device) \
        + torch.tensor([-1, 1], dtype=pixel.dtype, device=pixel.device)
653
+
654
+
655
@batched(0,0,0)
def project_depth(
    depth: torch.Tensor,
    near: Union[float, torch.Tensor],
    far: Union[float, torch.Tensor]
) -> torch.Tensor:
    """
    Project linear depth to depth value in screen space

    Args:
        depth (torch.Tensor): [...] depth value
        near (float | torch.Tensor): [...] near plane to clip
        far (float | torch.Tensor): [...] far plane to clip

    Returns:
        (torch.Tensor): [..., 1] depth value in screen space, value ranging in [0, 1]
    """
    # Standard z-buffer mapping: depth == near -> 0, depth == far -> 1.
    depth_range = far - near
    return (far - near * far / depth) / depth_range
673
+
674
+
675
@batched(0,0,0)
def depth_buffer_to_linear(
    depth: torch.Tensor,
    near: Union[float, torch.Tensor],
    far: Union[float, torch.Tensor]
) -> torch.Tensor:
    """
    Linearize depth value to linear depth

    Args:
        depth (torch.Tensor): [...] screen depth value, ranging in [0, 1]
        near (float | torch.Tensor): [...] near plane to clip
        far (float | torch.Tensor): [...] far plane to clip

    Returns:
        (torch.Tensor): [...] linear depth
    """
    # Exact inverse of project_depth: 0 -> near, 1 -> far.
    scaled = (far - near) * depth
    return near * far / (far - scaled)
693
+
694
+
695
@batched(2, 2, 2, 2)
def project_gl(
    points: torch.Tensor,
    model: torch.Tensor = None,
    view: torch.Tensor = None,
    perspective: torch.Tensor = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Project 3D points to 2D following the OpenGL convention (except for row major matrice)

    Args:
        points (torch.Tensor): [..., N, 3 or 4] 3D points to project, if the last
            dimension is 4, the points are assumed to be in homogeneous coordinates
        model (torch.Tensor): [..., 4, 4] model matrix
        view (torch.Tensor): [..., 4, 4] view matrix
        perspective (torch.Tensor): [..., 4, 4] perspective matrix

    Returns:
        scr_coord (torch.Tensor): [..., N, 3] screen space coordinates, value ranging in [0, 1].
            The origin (0., 0., 0.) is corresponding to the left & bottom & nearest
        linear_depth (torch.Tensor): [..., N] linear depth
    """
    assert perspective is not None, "perspective matrix is required"

    # Promote to homogeneous coordinates if needed.
    if points.shape[-1] == 3:
        points = torch.cat([points, torch.ones_like(points[..., :1])], dim=-1)
    # Compose the MVP matrix right-to-left; the assert guarantees perspective.
    mvp = perspective
    for factor in (view, model):
        if factor is not None:
            mvp = mvp @ factor
    clip_coord = points @ mvp.transpose(-1, -2)
    # Perspective divide, then shift NDC [-1, 1] into screen space [0, 1].
    ndc_coord = clip_coord[..., :3] / clip_coord[..., 3:]
    scr_coord = ndc_coord * 0.5 + 0.5
    # In OpenGL the clip-space w component equals the view-space depth.
    return scr_coord, clip_coord[..., 3]
731
+
732
+
733
@batched(2, 2, 2)
def project_cv(
    points: torch.Tensor,
    extrinsics: torch.Tensor = None,
    intrinsics: torch.Tensor = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Project 3D points to 2D following the OpenCV convention

    Args:
        points (torch.Tensor): [..., N, 3] or [..., N, 4] 3D points to project, if the last
            dimension is 4, the points are assumed to be in homogeneous coordinates
        extrinsics (torch.Tensor): [..., 4, 4] extrinsics matrix
        intrinsics (torch.Tensor): [..., 3, 3] intrinsics matrix

    Returns:
        uv_coord (torch.Tensor): [..., N, 2] uv coordinates, value ranging in [0, 1].
            The origin (0., 0.) is corresponding to the left & top
        linear_depth (torch.Tensor): [..., N] linear depth
    """
    assert intrinsics is not None, "intrinsics matrix is required"
    # Promote to homogeneous coordinates if needed.
    if points.shape[-1] == 3:
        points = torch.cat([points, torch.ones_like(points[..., :1])], dim=-1)
    # World -> camera, then camera -> image plane.
    if extrinsics is not None:
        points = points @ extrinsics.transpose(-1, -2)
    projected = points[..., :3] @ intrinsics.transpose(-2, -1)
    # Perspective divide by the camera-space z coordinate.
    uv_coord = projected[..., :2] / projected[..., 2:]
    return uv_coord, projected[..., 2]
762
+
763
+
764
@batched(2, 2, 2, 2)
def unproject_gl(
    screen_coord: torch.Tensor,
    model: torch.Tensor = None,
    view: torch.Tensor = None,
    perspective: torch.Tensor = None
) -> torch.Tensor:
    """
    Unproject screen space coordinates to 3D view space following the OpenGL convention (except for row major matrice)

    Args:
        screen_coord (torch.Tensor): [... N, 3] screen space coordinates, value ranging in [0, 1].
            The origin (0., 0., 0.) is corresponding to the left & bottom & nearest
        model (torch.Tensor): [..., 4, 4] model matrix
        view (torch.Tensor): [..., 4, 4] view matrix
        perspective (torch.Tensor): [..., 4, 4] perspective matrix

    Returns:
        points (torch.Tensor): [..., N, 3] 3d points
    """
    assert perspective is not None, "perspective matrix is required"
    # Screen [0, 1] -> NDC [-1, 1], then lift to homogeneous clip coordinates.
    ndc = screen_coord * 2 - 1
    clip_coord = torch.cat([ndc, torch.ones_like(ndc[..., :1])], dim=-1)
    # Invert the composed MVP and apply it.
    mvp = perspective
    for factor in (view, model):
        if factor is not None:
            mvp = mvp @ factor
    unprojected = clip_coord @ torch.inverse(mvp).transpose(-1, -2)
    return unprojected[..., :3] / unprojected[..., 3:]
796
+
797
+
798
@batched(2, 1, 2, 2)
def unproject_cv(
    uv_coord: torch.Tensor,
    depth: torch.Tensor = None,
    extrinsics: torch.Tensor = None,
    intrinsics: torch.Tensor = None
) -> torch.Tensor:
    """
    Unproject uv coordinates to 3D view space following the OpenCV convention

    Args:
        uv_coord (torch.Tensor): [..., N, 2] uv coordinates, value ranging in [0, 1].
            The origin (0., 0.) is corresponding to the left & top
        depth (torch.Tensor): [..., N] depth value
        extrinsics (torch.Tensor): [..., 4, 4] extrinsics matrix
        intrinsics (torch.Tensor): [..., 3, 3] intrinsics matrix

    Returns:
        points (torch.Tensor): [..., N, 3] 3d points
    """
    assert intrinsics is not None, "intrinsics matrix is required"
    # Back-project through the inverse intrinsics: rays at unit depth.
    homogeneous_uv = torch.cat([uv_coord, torch.ones_like(uv_coord[..., :1])], dim=-1)
    rays = homogeneous_uv @ torch.inverse(intrinsics).transpose(-2, -1)
    # Scale rays by depth if provided.
    if depth is not None:
        rays = rays * depth[..., None]
    # Camera -> world if extrinsics are given.
    if extrinsics is not None:
        homogeneous = torch.cat([rays, torch.ones_like(rays[..., :1])], dim=-1)
        rays = (homogeneous @ torch.inverse(extrinsics).transpose(-2, -1))[..., :3]
    return rays
827
+
828
+
829
def euler_axis_angle_rotation(axis: str, angle: torch.Tensor) -> torch.Tensor:
    """
    Return the rotation matrices for one of the rotations about an axis
    of which Euler angles describe, for each value of the angle given.

    Args:
        axis: Axis label "X" or "Y or "Z".
        angle: any shape tensor of Euler angles in radians

    Returns:
        Rotation matrices as tensor of shape (..., 3, 3).
    """
    c, s = torch.cos(angle), torch.sin(angle)
    one, zero = torch.ones_like(angle), torch.zeros_like(angle)

    # Row-major entries of the elementary rotation about each axis.
    entries_by_axis = {
        "X": (one, zero, zero, zero, c, -s, zero, s, c),
        "Y": (c, zero, s, zero, one, zero, -s, zero, c),
        "Z": (c, -s, zero, s, c, zero, zero, zero, one),
    }
    if axis not in entries_by_axis:
        raise ValueError("letter must be either X, Y or Z.")

    return torch.stack(entries_by_axis[axis], -1).reshape(angle.shape + (3, 3))
857
+
858
+
859
def euler_angles_to_matrix(euler_angles: torch.Tensor, convention: str = 'XYZ') -> torch.Tensor:
    """
    Convert rotations given as Euler angles in radians to rotation matrices.

    Args:
        euler_angles: Euler angles in radians as tensor of shape (..., 3), XYZ
        convention: permutation of "X", "Y" or "Z", representing the order of Euler rotations to apply.

    Returns:
        Rotation matrices as tensor of shape (..., 3, 3).
    """
    if euler_angles.dim() == 0 or euler_angles.shape[-1] != 3:
        raise ValueError("Invalid input euler angles.")
    if len(convention) != 3:
        raise ValueError("Convention must have 3 letters.")
    if convention[1] in (convention[0], convention[2]):
        raise ValueError(f"Invalid convention {convention}.")
    for letter in convention:
        if letter not in ("X", "Y", "Z"):
            raise ValueError(f"Invalid letter {letter} in convention string.")
    # One elementary rotation per convention letter; each letter picks its own
    # angle component from the fixed XYZ layout of `euler_angles`.
    first, second, third = (
        euler_axis_angle_rotation(letter, euler_angles[..., 'XYZ'.index(letter)])
        for letter in convention
    )
    # Composition order: the first convention letter is applied first.
    return third @ second @ first
885
+
886
+
887
def skew_symmetric(v: torch.Tensor):
    "Skew symmetric matrix from a 3D vector"
    assert v.shape[-1] == 3, "v must be 3D"
    x, y, z = v.unbind(dim=-1)
    zero = torch.zeros_like(x)
    # [v]_x such that [v]_x @ w == cross(v, w).
    entries = [
        zero, -z, y,
        z, zero, -x,
        -y, x, zero,
    ]
    return torch.stack(entries, dim=-1).reshape(*v.shape[:-1], 3, 3)
897
+
898
+
899
def rotation_matrix_from_vectors(v1: torch.Tensor, v2: torch.Tensor):
    """
    Rotation matrix that rotates v1 to v2, via Rodrigues' formula.

    Args:
        v1 (torch.Tensor): [..., 3] source direction(s)
        v2 (torch.Tensor): [..., 3] target direction(s)

    Returns:
        (torch.Tensor): [..., 3, 3] rotation matrix

    NOTE: degenerate when v1 and v2 are anti-parallel (c == -1 gives a
    division by zero); callers must avoid that case.
    """
    I = torch.eye(3).to(v1)
    v1 = F.normalize(v1, dim=-1)
    v2 = F.normalize(v2, dim=-1)
    v = torch.cross(v1, v2, dim=-1)
    c = torch.sum(v1 * v2, dim=-1)
    K = skew_symmetric(v)
    # Broadcast the scalar factor over the trailing 3x3 dims. The original
    # `[None, None]` PREPENDED two dims, which breaks batched inputs
    # (e.g. c of shape (B,) became (1, 1, B) against K @ K of shape (B, 3, 3)).
    R = I + K + (1 / (1 + c))[..., None, None] * (K @ K)
    return R
909
+
910
+
911
def _angle_from_tan(
    axis: str, other_axis: str, data, horizontal: bool, tait_bryan: bool
) -> torch.Tensor:
    """
    Extract the first or third Euler angle from the two members of
    the matrix which are positive constant times its sine and cosine.

    Args:
        axis: Axis label "X" or "Y or "Z" for the angle we are finding.
        other_axis: Axis label "X" or "Y or "Z" for the middle axis in the
            convention.
        data: Rotation matrices as tensor of shape (..., 3, 3).
        horizontal: Whether we are looking for the angle for the third axis,
            which means the relevant entries are in the same row of the
            rotation matrix. If not, they are in the same column.
        tait_bryan: Whether the first and third axes in the convention differ.

    Returns:
        Euler Angles in radians for each matrix in data as a tensor
        of shape (...).
    """

    # Indices of the sine/cosine carrying entries for each axis.
    i1, i2 = {"X": (2, 1), "Y": (0, 2), "Z": (1, 0)}[axis]
    if horizontal:
        i2, i1 = i1, i2
    # Cyclic axis pairs keep the natural sign; anti-cyclic pairs flip it.
    even = (axis + other_axis) in ["XY", "YZ", "ZX"]
    if horizontal == even:
        return torch.atan2(data[..., i1], data[..., i2])
    if tait_bryan:
        return torch.atan2(-data[..., i2], data[..., i1])
    return torch.atan2(data[..., i2], -data[..., i1])
942
+
943
+
944
def matrix_to_euler_angles(matrix: torch.Tensor, convention: str) -> torch.Tensor:
    """
    Convert rotations given as rotation matrices to Euler angles in radians.
    NOTE: The composition order eg. `XYZ` means `Rz * Ry * Rx` (like blender), instead of `Rx * Ry * Rz` (like pytorch3d)

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).
        convention: Convention string of three uppercase letters.

    Returns:
        Euler angles in radians as tensor of shape (..., 3), in the order of XYZ (like blender), instead of convention (like pytorch3d)
    """
    # Convention must be a permutation of the letters X, Y, Z.
    if not all(c in 'XYZ' for c in convention) or not all(c in convention for c in 'XYZ'):
        raise ValueError(f"Invalid convention {convention}.")
    if not matrix.shape[-2:] == (3, 3):
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")

    i0 = 'XYZ'.index(convention[0])
    i2 = 'XYZ'.index(convention[2])
    # Tait-Bryan conventions (distinct first/third axes) use asin for the
    # middle angle; proper Euler conventions use acos.
    tait_bryan = i0 != i2
    if tait_bryan:
        central_angle = torch.asin(matrix[..., i2, i0] * (-1.0 if i2 - i0 in [-1, 2] else 1.0))
    else:
        central_angle = torch.acos(matrix[..., i2, i2])

    # Angles in composition order
    o = [
        _angle_from_tan(
            convention[0], convention[1], matrix[..., i2, :], True, tait_bryan
        ),
        central_angle,
        _angle_from_tan(
            convention[2], convention[1], matrix[..., i0], False, tait_bryan
        ),
    ]
    # Reorder from composition order back to fixed XYZ component order.
    return torch.stack([o[convention.index(c)] for c in 'XYZ'], -1)
980
+
981
+
982
def axis_angle_to_matrix(axis_angle: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    """Convert axis-angle representation (rotation vector) to rotation matrix, whose direction is the axis of rotation and length is the angle of rotation

    Args:
        axis_angle (torch.Tensor): shape (..., 3), axis-angle vcetors
        eps (float): perturbation added before taking the norm to avoid a
            zero division for the zero rotation

    Returns:
        torch.Tensor: shape (..., 3, 3) The rotation matrices for the given axis-angle parameters
    """
    batch_shape = axis_angle.shape[:-1]
    device, dtype = axis_angle.device, axis_angle.dtype

    angle = torch.norm(axis_angle + eps, dim=-1, keepdim=True)
    axis = axis_angle / angle

    # angle has a trailing singleton dim, so cos/sin broadcast over 3x3 blocks.
    cos = torch.cos(angle)[..., None, :]
    sin = torch.sin(angle)[..., None, :]

    # BUG FIX: torch.split with split size 3 on a size-3 dim yields a single
    # chunk, so unpacking into (rx, ry, rz) raised a ValueError. Split into
    # chunks of size 1 to get the three (..., 1) components.
    rx, ry, rz = torch.split(axis, 1, dim=-1)
    zeros = torch.zeros((*batch_shape, 1), dtype=dtype, device=device)
    # Cross-product (skew-symmetric) matrix K of the unit axis.
    K = torch.cat([zeros, -rz, ry, rz, zeros, -rx, -ry, rx, zeros], dim=-1).view((*batch_shape, 3, 3))

    # Rodrigues' formula: R = I + sin(a) K + (1 - cos(a)) K^2.
    ident = torch.eye(3, dtype=dtype, device=device)
    rot_mat = ident + sin * K + (1 - cos) * torch.matmul(K, K)
    return rot_mat
1007
+
1008
+
1009
def matrix_to_axis_angle(rot_mat: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    """Convert a batch of 3x3 rotation matrices to axis-angle representation (rotation vector)

    Args:
        rot_mat (torch.Tensor): shape (..., 3, 3), the rotation matrices to convert

    Returns:
        torch.Tensor: shape (..., 3), the axis-angle vectors corresponding to the given rotation matrices
    """
    # Route through quaternions, which is numerically stabler than reading the
    # axis off the matrix directly.
    return quaternion_to_axis_angle(matrix_to_quaternion(rot_mat), eps=eps)
1021
+
1022
+
1023
def quaternion_to_axis_angle(quaternion: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    """Convert a batch of quaternions (w, x, y, z) to axis-angle representation (rotation vector)

    Args:
        quaternion (torch.Tensor): shape (..., 4), the quaternions to convert

    Returns:
        torch.Tensor: shape (..., 3), the axis-angle vectors corresponding to the given quaternions
    """
    assert quaternion.shape[-1] == 4
    vector_part = quaternion[..., 1:]
    scalar_part = quaternion[..., 0:1]
    vector_norm = torch.norm(vector_part, dim=-1, keepdim=True)
    # atan2 handles quaternions of any (non-unit) magnitude gracefully.
    angle = 2 * torch.atan2(vector_norm, scalar_part)
    unit_axis = vector_part / vector_norm.clamp(min=eps)
    return angle * unit_axis
1037
+
1038
+
1039
def axis_angle_to_quaternion(axis_angle: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    """Convert axis-angle representation (rotation vector) to quaternion (w, x, y, z)

    Args:
        axis_angle (torch.Tensor): shape (..., 3), axis-angle vcetors

    Returns:
        torch.Tensor: shape (..., 4) The quaternions for the given axis-angle parameters
    """
    angle = torch.norm(axis_angle, dim=-1, keepdim=True)
    unit_axis = F.normalize(axis_angle, dim=-1, eps=eps)
    half = angle / 2
    # q = (cos(a/2), sin(a/2) * axis)
    return torch.cat([torch.cos(half), torch.sin(half) * unit_axis], dim=-1)
1052
+
1053
+
1054
def matrix_to_quaternion(rot_mat: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    """Convert 3x3 rotation matrix to quaternion (w, x, y, z)

    Args:
        rot_mat (torch.Tensor): shape (..., 3, 3), the rotation matrices to convert

    Returns:
        torch.Tensor: shape (..., 4), the quaternions corresponding to the given rotation matrices
    """
    # Extract the diagonal and off-diagonal elements of the rotation matrix
    # (m00, m11, m22 are unused here; the diagonal is read via `diag` below).
    m00, m01, m02, m10, m11, m12, m20, m21, m22 = rot_mat.flatten(-2).unbind(dim=-1)

    diag = torch.diagonal(rot_mat, dim1=-2, dim2=-1)
    # Each row of M gives the diagonal combination for |w|, |x|, |y|, |z|
    # (Shepperd's method: 4*q_i^2 = 1 + M_i . diag).
    M = torch.tensor([
        [1, 1, 1],
        [1, -1, -1],
        [-1, 1, -1],
        [-1, -1, 1]
    ], dtype=rot_mat.dtype, device=rot_mat.device)
    wxyz = (1 + diag @ M.transpose(-1, -2)).clamp_(0).sqrt().mul(0.5)
    # Pick the numerically largest component as the sign reference.
    _, max_idx = wxyz.max(dim=-1)
    # Signs of products of quaternion component pairs, read off the matrix.
    xw = torch.sign(m21 - m12)
    yw = torch.sign(m02 - m20)
    zw = torch.sign(m10 - m01)
    yz = torch.sign(m21 + m12)
    xz = torch.sign(m02 + m20)
    xy = torch.sign(m01 + m10)
    ones = torch.ones_like(xw)
    # Resolve each component's sign relative to the largest component.
    sign = torch.where(
        max_idx[..., None] == 0,
        torch.stack([ones, xw, yw, zw], dim=-1),
        torch.where(
            max_idx[..., None] == 1,
            torch.stack([xw, ones, xy, xz], dim=-1),
            torch.where(
                max_idx[..., None] == 2,
                torch.stack([yw, xy, ones, yz], dim=-1),
                torch.stack([zw, xz, yz, ones], dim=-1)
            )
        )
    )
    quat = sign * wxyz
    quat = F.normalize(quat, dim=-1, eps=eps)
    return quat
1098
+
1099
+
1100
def quaternion_to_matrix(quaternion: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    """Converts a batch of quaternions (w, x, y, z) to rotation matrices

    Args:
        quaternion (torch.Tensor): shape (..., 4), the quaternions to convert

    Returns:
        torch.Tensor: shape (..., 3, 3), the rotation matrices corresponding to the given quaternions
    """
    assert quaternion.shape[-1] == 4
    quaternion = F.normalize(quaternion, dim=-1, eps=eps)
    w, x, y, z = quaternion.unbind(dim=-1)
    zero = torch.zeros_like(w)
    identity = torch.eye(3, dtype=quaternion.dtype, device=quaternion.device)
    vec = quaternion[..., 1:]
    # Symmetric part: outer(v, v) - |v|^2 I; antisymmetric part: skew(v).
    symmetric = vec[..., :, None] * vec[..., None, :] - identity * (vec ** 2).sum(dim=-1)[..., None, None]
    skew = torch.stack([
        zero, -z, y,
        z, zero, -x,
        -y, x, zero
    ], dim=-1).unflatten(-1, (3, 3))
    # R = I + 2 (vv^T - |v|^2 I + w [v]_x)
    return identity + 2 * (symmetric + w[..., None, None] * skew)
1123
+
1124
+
1125
def slerp(rot_mat_1: torch.Tensor, rot_mat_2: torch.Tensor, t: Union[Number, torch.Tensor]) -> torch.Tensor:
    """Spherical linear interpolation between two rotation matrices

    Args:
        rot_mat_1 (torch.Tensor): shape (..., 3, 3), the first rotation matrix
        rot_mat_2 (torch.Tensor): shape (..., 3, 3), the second rotation matrix
        t (torch.Tensor): scalar or shape (...,), the interpolation factor

    Returns:
        torch.Tensor: shape (..., 3, 3), the interpolated rotation matrix
    """
    assert rot_mat_1.shape[-2:] == (3, 3)
    if isinstance(t, Number):
        t = torch.tensor(t, dtype=rot_mat_1.dtype, device=rot_mat_1.device)
    # Interpolate linearly in axis-angle space, then map back to a matrix.
    vec_a = matrix_to_axis_angle(rot_mat_1)
    vec_b = matrix_to_axis_angle(rot_mat_2)
    weight = t[..., None]
    blended = (1 - weight) * vec_a + weight * vec_b
    return axis_angle_to_matrix(blended)
1144
+
1145
+
1146
def interpolate_extrinsics(ext1: torch.Tensor, ext2: torch.Tensor, t: Union[Number, torch.Tensor]) -> torch.Tensor:
    """Interpolate extrinsics between two camera poses. Linear interpolation for translation, spherical linear interpolation for rotation.

    Args:
        ext1 (torch.Tensor): shape (..., 4, 4), the first camera pose
        ext2 (torch.Tensor): shape (..., 4, 4), the second camera pose
        t (torch.Tensor): scalar or shape (...,), the interpolation factor

    Returns:
        torch.Tensor: shape (..., 4, 4), the interpolated camera pose
    """
    # Interpolate camera-to-world poses (inverse extrinsics), then invert back.
    pose1 = torch.inverse(ext1)
    pose2 = torch.inverse(ext2)
    return torch.inverse(interpolate_transform(pose1, pose2, t))
1158
+
1159
+
1160
def interpolate_view(view1: torch.Tensor, view2: torch.Tensor, t: Union[Number, torch.Tensor]):
    """Interpolate between two view matrices.

    View matrices share the extrinsics convention, so this delegates directly to
    `interpolate_extrinsics` (linear translation, spherical rotation).

    Args:
        view1 (torch.Tensor): shape (..., 4, 4), the first view matrix
        view2 (torch.Tensor): shape (..., 4, 4), the second view matrix
        t (Number | torch.Tensor): scalar or shape (...,), the interpolation factor

    Returns:
        torch.Tensor: shape (..., 4, 4), the interpolated view matrix
    """
    result = interpolate_extrinsics(view1, view2, t)
    return result
1172
+
1173
+
1174
def interpolate_transform(transform1: torch.Tensor, transform2: torch.Tensor, t: Union[Number, torch.Tensor]):
    """Interpolate two rigid 4x4 transforms.

    Translation is interpolated linearly; rotation via `slerp`.

    Args:
        transform1 (torch.Tensor): shape (..., 4, 4), the first transform
        transform2 (torch.Tensor): shape (..., 4, 4), the second transform
        t (Number | torch.Tensor): scalar or shape (...,), the interpolation factor

    Returns:
        torch.Tensor: shape (..., 4, 4), the interpolated transform
    """
    assert transform1.shape[-2:] == (4, 4) and transform2.shape[-2:] == (4, 4)
    if isinstance(t, Number):
        t = torch.tensor(t, dtype=transform1.dtype, device=transform1.device)
    pos = (1 - t[..., None]) * transform1[..., :3, 3] + t[..., None] * transform2[..., :3, 3]
    rot = slerp(transform1[..., :3, :3], transform2[..., :3, :3], t)
    transform = torch.cat([rot, pos[..., None]], dim=-1)
    # BUGFIX: the original referenced an undefined name `ext` here, which raised
    # NameError on every call; the intended operand is `transform` itself.
    transform = torch.cat([transform, torch.tensor([0, 0, 0, 1], dtype=transform.dtype, device=transform.device).expand_as(transform[..., :1, :])], dim=-2)
    return transform
1183
+
1184
+
1185
def extrinsics_to_essential(extrinsics: torch.Tensor):
    """
    extrinsics matrix `[[R, t] [0, 0, 0, 1]]` such that `x' = R (x - t)` to essential matrix such that `x' E x = 0`

    Args:
        extrinsics (torch.Tensor): [..., 4, 4] extrinsics matrix

    Returns:
        (torch.Tensor): [..., 3, 3] essential matrix
    """
    assert extrinsics.shape[-2:] == (4, 4)
    R = extrinsics[..., :3, :3]
    t = extrinsics[..., :3, 3]
    # BUGFIX: `zeros` must match a single component of t (shape (...,)), not t
    # itself (shape (..., 3)) — the original torch.stack raised a shape-mismatch
    # error. The components are also stacked along the LAST dim so that batched
    # inputs reshape to (..., 3, 3) correctly (the original stacked along dim 0).
    zeros = torch.zeros_like(t[..., 0])
    t_x = torch.stack([
        zeros, -t[..., 2], t[..., 1],
        t[..., 2], zeros, -t[..., 0],
        -t[..., 1], t[..., 0], zeros
    ], dim=-1).reshape(*t.shape[:-1], 3, 3)
    return R @ t_x
1205
+
1206
+
1207
def to4x4(R: torch.Tensor, t: torch.Tensor):
    """
    Compose a rotation matrix and a translation vector into a 4x4 homogeneous
    transformation matrix `[[R, t], [0, 0, 0, 1]]`.

    Args:
        R (torch.Tensor): [..., 3, 3] rotation matrix
        t (torch.Tensor): [..., 3] translation vector

    Returns:
        (torch.Tensor): [..., 4, 4] transformation matrix
    """
    assert R.shape[-2:] == (3, 3)
    assert t.shape[-1] == 3
    assert R.shape[:-2] == t.shape[:-1]
    # Top 3x4 block: rotation with the translation as its last column.
    top = torch.cat([R, t[..., None]], dim=-1)
    # Constant homogeneous row, broadcast over any batch dims.
    bottom = torch.tensor([0, 0, 0, 1], dtype=R.dtype, device=R.device).expand(*R.shape[:-2], 1, 4)
    return torch.cat([top, bottom], dim=-2)
1225
+
1226
+
1227
def rotation_matrix_2d(theta: Union[float, torch.Tensor]):
    """
    2x2 matrix for a 2D rotation by angle `theta`.

    Args:
        theta (float | torch.Tensor): rotation angle in radians, arbitrary shape (...,)

    Returns:
        (torch.Tensor): (..., 2, 2) rotation matrix
    """
    if isinstance(theta, float):
        theta = torch.tensor(theta)
    c, s = torch.cos(theta), torch.sin(theta)
    # Row-major flattened [[c, -s], [s, c]], reshaped into the trailing 2x2.
    flat = torch.stack([c, -s, s, c], dim=-1)
    return flat.unflatten(-1, (2, 2))
1243
+
1244
+
1245
def rotate_2d(theta: Union[float, torch.Tensor], center: torch.Tensor = None):
    """
    3x3 homogeneous matrix for a 2D rotation around `center`:
    ```
    [[Rxx, Rxy, tx],
     [Ryx, Ryy, ty],
     [0,   0,   1]]
    ```
    Args:
        theta (float | torch.Tensor): rotation angle in radians, arbitrary shape (...,)
        center (torch.Tensor): rotation center, arbitrary shape (..., 2). Default to (0, 0)

    Returns:
        (torch.Tensor): (..., 3, 3) transformation matrix
    """
    if isinstance(theta, float):
        theta = torch.tensor(theta)
    if center is not None:
        theta = theta.to(center)
    else:
        center = torch.zeros(2).to(theta).expand(*theta.shape, -1)
    R = rotation_matrix_2d(theta)
    # Translation column that makes the rotation pivot around `center`: c - R c.
    pivot = center[..., :, None]
    top = torch.cat([R, pivot - R @ pivot], dim=-1)
    last_row = torch.tensor([[0, 0, 1]], dtype=center.dtype, device=center.device).expand(*center.shape[:-1], -1, -1)
    return torch.cat([top, last_row], dim=-2)
1274
+
1275
+
1276
def translate_2d(translation: torch.Tensor):
    """
    3x3 homogeneous matrix for a 2D translation:
    ```
    [[1, 0, tx],
     [0, 1, ty],
     [0, 0, 1]]
    ```
    Args:
        translation (torch.Tensor): translation vector, arbitrary shape (..., 2)

    Returns:
        (torch.Tensor): (..., 3, 3) transformation matrix
    """
    batch_shape = translation.shape[:-1]
    identity = torch.eye(2, dtype=translation.dtype, device=translation.device).expand(*batch_shape, -1, -1)
    top = torch.cat([identity, translation[..., None]], dim=-1)
    last_row = torch.tensor([[0, 0, 1]], dtype=translation.dtype, device=translation.device).expand(*batch_shape, -1, -1)
    return torch.cat([top, last_row], dim=-2)
1297
+
1298
+
1299
def scale_2d(scale: Union[float, torch.Tensor], center: torch.Tensor = None):
    """
    3x3 homogeneous matrix for a 2D uniform scale about `center`:
    ```
    [[s, 0, tx],
     [0, s, ty],
     [0, 0, 1]]
    ```
    Args:
        scale (float | torch.Tensor): scale factor, arbitrary shape (...,)
        center (torch.Tensor): scale center, arbitrary shape (..., 2). Default to (0, 0)

    Returns:
        (torch.Tensor): (..., 3, 3) transformation matrix
    """
    if isinstance(scale, float):
        scale = torch.tensor(scale)
    if center is not None:
        scale = scale.to(center)
    if center is None:
        center = torch.zeros(2, dtype=scale.dtype, device=scale.device).expand(*scale.shape, -1)
    # BUGFIX: multiply via scale[..., None, None] so a batched scale of shape
    # (...,) broadcasts against the (..., 2, 2) identity; the original
    # `scale * eye` only worked for scalar scale and mis-broadcast otherwise.
    identity = torch.eye(2, dtype=scale.dtype, device=scale.device).expand(*scale.shape, -1, -1)
    return torch.cat([
        torch.cat([
            scale[..., None, None] * identity,
            # Translation that keeps `center` fixed: c - s * c.
            center[..., :, None] - center[..., :, None] * scale[..., None, None],
        ], dim=-1),
        torch.tensor([[0, 0, 1]], dtype=scale.dtype, device=scale.device).expand(*center.shape[:-1], -1, -1),
    ], dim=-2)
1327
+
1328
+
1329
def apply_2d(transform: torch.Tensor, points: torch.Tensor):
    """
    Apply a (3x3 or 2x3) 2D affine transformation to points:
    ```
    p' = R @ p + t
    ```
    Args:
        transform (torch.Tensor): (..., 2 or 3, 3) transformation matrix
        points (torch.Tensor): (..., N, 2) points to transform

    Returns:
        (torch.Tensor): (..., N, 2) transformed points
    """
    assert transform.shape[-2:] == (3, 3) or transform.shape[-2:] == (2, 3), "transform must be 3x3 or 2x3"
    assert points.shape[-1] == 2, "points must be 2D"
    # BUGFIX: the translation column must broadcast over the points axis as
    # (..., 1, 2); the original indexed it as transform[..., :2, None, 2]
    # — shape (..., 2, 1) — which errors for N != 2 and silently computes
    # wrong values when N == 2.
    return points @ transform[..., :2, :2].mT + transform[..., None, :2, 2]
moge/model/utils.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ from typing import *
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+
12
def wrap_module_with_gradient_checkpointing(module: nn.Module):
    """Patch `module` in place so its forward runs under activation checkpointing.

    The module's class is swapped for a dynamically created subclass whose
    `forward` wraps the original one in `torch.utils.checkpoint.checkpoint`
    (non-reentrant mode). The original class is stored on `_restore_cls` so the
    patch can be undone by `unwrap_module_with_gradient_checkpointing`.
    """
    from torch.utils.checkpoint import checkpoint
    original_cls = module.__class__

    def _checkpointed_forward(self, *args, **kwargs):
        # Recompute activations in backward instead of storing them.
        return checkpoint(original_cls.forward, self, *args, use_reentrant=False, **kwargs)

    wrapper_cls = type(
        '_CheckpointingWrapper',
        (original_cls,),
        {'_restore_cls': original_cls, 'forward': _checkpointed_forward},
    )
    module.__class__ = wrapper_cls
    return module
21
+
22
+
23
def unwrap_module_with_gradient_checkpointing(module: nn.Module):
    """Undo `wrap_module_with_gradient_checkpointing`, restoring the module's
    original class.

    Returns the module, for symmetry with the wrapping function (the original
    returned None; returning the module is backward compatible).
    """
    module.__class__ = module.__class__._restore_cls
    return module
25
+
26
+
27
def wrap_dinov2_attention_with_sdpa(module: nn.Module):
    """Patch a DINOv2 attention module in place to use PyTorch-native
    scaled_dot_product_attention (SDPA) in its forward pass.

    Expects `module` to expose `qkv`, `num_heads`, `proj` and `proj_drop`
    (the DINOv2 attention interface).
    """
    # NOTE(review): this is a lexicographic string comparison; fine for 2.x
    # but it would misjudge a hypothetical major version >= 10.
    assert torch.__version__ >= '2.0', "SDPA requires PyTorch 2.0 or later"

    class _AttentionWrapper(module.__class__):
        def forward(self, x: torch.Tensor, attn_bias=None) -> torch.Tensor:
            batch, seq_len, channels = x.shape
            head_dim = channels // self.num_heads
            # (B, N, 3, H, C//H) -> (3, B, H, N, C//H)
            qkv = self.qkv(x).reshape(batch, seq_len, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
            q, k, v = qkv.unbind(0)  # each (B, H, N, C//H)

            attn_out = F.scaled_dot_product_attention(q, k, v, attn_bias)
            attn_out = attn_out.permute(0, 2, 1, 3).reshape(batch, seq_len, channels)

            return self.proj_drop(self.proj(attn_out))

    module.__class__ = _AttentionWrapper
    return module
44
+
45
+
46
def sync_ddp_hook(state, bucket: torch.distributed.GradBucket) -> torch.futures.Future[torch.Tensor]:
    """DDP communication hook: average gradients across the default process group.

    Divides the bucket's gradient in place by the world size, all-reduces it
    (sum), and returns an already-completed future holding the result.
    """
    group = torch.distributed.group.WORLD
    grad = bucket.buffer()
    # Pre-divide so the summed all-reduce yields the mean.
    grad.div_(group.size())
    torch.distributed.all_reduce(grad, group=group)
    future = torch.futures.Future()
    future.set_result(grad)
    return future
moge/model/v2.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is modified from MoGe:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+ # Modifications Copyright (c) 2026 Ze-Xin Yin, Robot labs of Horizon Robotics, and D-Robotics.
6
+
7
+
8
+ from typing import *
9
+ from numbers import Number
10
+ from functools import partial
11
+ from pathlib import Path
12
+ import warnings
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+ import torch.utils
18
+ import torch.utils.checkpoint
19
+ import torch.amp
20
+ import torch.version
21
+ import utils3d
22
+ from huggingface_hub import hf_hub_download
23
+
24
+ from ..utils.geometry_torch import normalized_view_plane_uv, recover_focal_shift, angle_diff_vec3
25
+ from .utils import wrap_dinov2_attention_with_sdpa, wrap_module_with_gradient_checkpointing, unwrap_module_with_gradient_checkpointing
26
+ from .modules import DINOv2Encoder, MLP, ConvStack
27
+ from . import transforms
28
+
29
+ from einops import rearrange
30
+
31
def image_uv(height: int, width: int, left: int = None, top: int = None, right: int = None, bottom: int = None, device: torch.device = None, dtype: torch.dtype = None) -> torch.Tensor:
    """
    Get image space UV grid, ranging in [0, 1]. Each entry is the normalized
    pixel-center coordinate.

    >>> image_uv(10, 10):
    [[[0.05, 0.05], [0.15, 0.05], ..., [0.95, 0.05]],
     [[0.05, 0.15], [0.15, 0.15], ..., [0.95, 0.15]],
      ...      ...      ...
     [[0.05, 0.95], [0.15, 0.95], ..., [0.95, 0.95]]]

    Args:
        height (int): image height
        width (int): image width
        left, top, right, bottom (int, optional): crop window; defaults to the full image

    Returns:
        torch.Tensor: shape (height, width, 2) (or the crop-window size if given)
    """
    left = 0 if left is None else left
    top = 0 if top is None else top
    right = width if right is None else right
    bottom = height if bottom is None else bottom
    # Pixel centers: (index + 0.5) / size along each axis.
    u = torch.linspace((left + 0.5) / width, (right - 0.5) / width, right - left, device=device, dtype=dtype)
    v = torch.linspace((top + 0.5) / height, (bottom - 0.5) / height, bottom - top, device=device, dtype=dtype)
    grid_u, grid_v = torch.meshgrid(u, v, indexing='xy')
    return torch.stack([grid_u, grid_v], dim=-1)
57
+
58
def depth_to_points(depth: torch.Tensor, intrinsics: torch.Tensor, extrinsics: torch.Tensor = None):
    """Unproject a depth map to a 3D point map using the pinhole camera model.

    Args:
        depth (torch.Tensor): (..., H, W) depth map
        intrinsics (torch.Tensor): (..., 3, 3) normalized camera intrinsics
        extrinsics (torch.Tensor, optional): (..., 4, 4) camera extrinsics

    Returns:
        torch.Tensor: per-pixel 3D points (as produced by `transforms.unproject_cv`)
    """
    height, width = depth.shape[-2:]
    uv = image_uv(width=width, height=height, dtype=depth.dtype, device=depth.device)
    # Broadcast the camera matrices over the pixel dimension.
    ext = extrinsics[..., None, :, :] if extrinsics is not None else None
    return transforms.unproject_cv(uv, depth, intrinsics=intrinsics[..., None, :, :], extrinsics=ext)
63
+
64
class MoGeModel(nn.Module):
    """Monocular geometry model (MoGe v2).

    A DINOv2 encoder feeds a shared convolutional neck whose multi-scale
    features are decoded by optional heads: an affine point map, a validity
    mask, surface normals, and a global metric scale predicted from the
    class token.
    """
    encoder: DINOv2Encoder
    neck: ConvStack
    points_head: ConvStack
    mask_head: ConvStack
    scale_head: MLP

    def __init__(self,
        encoder: Dict[str, Any],
        neck: Dict[str, Any],
        points_head: Dict[str, Any] = None,
        mask_head: Dict[str, Any] = None,
        normal_head: Dict[str, Any] = None,
        scale_head: Dict[str, Any] = None,
        remap_output: Literal['linear', 'sinh', 'exp', 'sinh_exp'] = 'linear',
        num_tokens_range: List[int] = [1200, 3600],
        **deprecated_kwargs
    ):
        """Build the model from per-submodule config dicts.

        Each `*_head` argument is a kwargs dict for the corresponding head
        constructor; when None, that head (and its output) is omitted.
        `remap_output` selects the nonlinearity applied to the raw point map.
        `num_tokens_range` is the [min, max] number of base ViT tokens used by
        `infer` when `num_tokens` is not given. Unknown kwargs are accepted
        for checkpoint compatibility and ignored with a warning.
        """
        super(MoGeModel, self).__init__()
        if deprecated_kwargs:
            warnings.warn(f"The following deprecated/invalid arguments are ignored: {deprecated_kwargs}")

        self.remap_output = remap_output
        self.num_tokens_range = num_tokens_range

        self.encoder = DINOv2Encoder(**encoder)
        self.neck = ConvStack(**neck)
        # Heads are optional: attributes only exist when configured, and the
        # rest of the class gates on hasattr(self, <head>) accordingly.
        if points_head is not None:
            self.points_head = ConvStack(**points_head)
        if mask_head is not None:
            self.mask_head = ConvStack(**mask_head)
        if normal_head is not None:
            self.normal_head = ConvStack(**normal_head)
        if scale_head is not None:
            self.scale_head = MLP(**scale_head)

    @property
    def device(self) -> torch.device:
        # Device of the first parameter; assumes all parameters are co-located.
        return next(self.parameters()).device

    @property
    def dtype(self) -> torch.dtype:
        # Dtype of the first parameter; assumes a homogeneous parameter dtype.
        return next(self.parameters()).dtype

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, Path, IO[bytes]], model_kwargs: Optional[Dict[str, Any]] = None, **hf_kwargs) -> 'MoGeModel':
        """
        Load a model from a checkpoint file.

        ### Parameters:
        - `pretrained_model_name_or_path`: path to the checkpoint file or repo id.
        - `model_kwargs`: additional keyword arguments to override the parameters in the checkpoint.
        - `hf_kwargs`: additional keyword arguments to pass to the `hf_hub_download` function. Ignored if `pretrained_model_name_or_path` is a local path.

        ### Returns:
        - A new instance of `MoGe` with the parameters loaded from the checkpoint.
        """
        # Prefer a local file; otherwise treat the argument as a HF Hub repo id.
        if Path(pretrained_model_name_or_path).exists():
            checkpoint_path = pretrained_model_name_or_path
        else:
            checkpoint_path = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                repo_type="model",
                filename="model.pt",
                **hf_kwargs
            )
        checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=True)

        model_config = checkpoint['model_config']
        if model_kwargs is not None:
            model_config.update(model_kwargs)
        model = cls(**model_config)
        # strict=False tolerates missing/extra keys across checkpoint versions.
        model.load_state_dict(checkpoint['model'], strict=False)

        return model

    def init_weights(self):
        """Initialize the encoder's weights (heads keep their constructor init)."""
        self.encoder.init_weights()

    def enable_gradient_checkpointing(self):
        """Enable activation checkpointing on the encoder, neck and all present heads."""
        self.encoder.enable_gradient_checkpointing()
        self.neck.enable_gradient_checkpointing()
        for head in ['points_head', 'normal_head', 'mask_head']:
            if hasattr(self, head):
                getattr(self, head).enable_gradient_checkpointing()

    def enable_pytorch_native_sdpa(self):
        """Switch the encoder's attention to PyTorch-native SDPA."""
        self.encoder.enable_pytorch_native_sdpa()

    def _remap_points(self, points: torch.Tensor) -> torch.Tensor:
        """Apply the configured output nonlinearity to the raw (..., 3) point map.

        'exp' and 'sinh_exp' make the z component strictly positive.
        """
        if self.remap_output == 'linear':
            pass
        elif self.remap_output =='sinh':
            points = torch.sinh(points)
        elif self.remap_output == 'exp':
            xy, z = points.split([2, 1], dim=-1)
            z = torch.exp(z)
            points = torch.cat([xy * z, z], dim=-1)
        elif self.remap_output =='sinh_exp':
            xy, z = points.split([2, 1], dim=-1)
            points = torch.cat([torch.sinh(xy), torch.exp(z)], dim=-1)
        else:
            raise ValueError(f"Invalid remap output type: {self.remap_output}")
        return points

    @torch.inference_mode()
    def infer_feature_tokens(self, image: torch.Tensor, num_tokens: int, tokens_layer: int = -1) -> torch.Tensor:
        """Run encoder + neck only and return one feature level (no heads).

        `num_tokens` is rounded down to a base_h x base_w token grid matching
        the image aspect ratio; `tokens_layer` indexes the neck's output levels.
        """
        batch_size, _, img_h, img_w = image.shape
        device, dtype = image.device, image.dtype

        aspect_ratio = img_w / img_h
        base_h, base_w = int((num_tokens / aspect_ratio) ** 0.5), int((num_tokens * aspect_ratio) ** 0.5)
        num_tokens = base_h * base_w

        # Backbones encoding
        features = self.encoder(image, base_h, base_w, return_class_token=False)
        features = [features, None, None, None, None]

        # Concat UVs for aspect ratio input
        for level in range(5):
            uv = normalized_view_plane_uv(width=base_w * 2 ** level, height=base_h * 2 ** level, aspect_ratio=aspect_ratio, dtype=dtype, device=device)
            uv = uv.permute(2, 0, 1).unsqueeze(0).expand(batch_size, -1, -1, -1)
            if features[level] is None:
                features[level] = uv
            else:
                features[level] = torch.concat([features[level], uv], dim=1)

        # Shared neck
        features = self.neck(features)[tokens_layer]
        return features

    def forward(self, image: torch.Tensor, num_tokens: int) -> Dict[str, torch.Tensor]:
        """Raw forward pass: encoder -> neck -> configured heads.

        Returns a dict with whichever of 'points' (B, H, W, 3), 'normal'
        (B, H, W, 3, unit-norm), 'mask' (B, H, W, sigmoid probabilities) and
        'metric_scale' (B, positive) are configured.
        """
        batch_size, _, img_h, img_w = image.shape
        device, dtype = image.device, image.dtype

        aspect_ratio = img_w / img_h
        base_h, base_w = int((num_tokens / aspect_ratio) ** 0.5), int((num_tokens * aspect_ratio) ** 0.5)
        num_tokens = base_h * base_w

        # Backbones encoding
        features, cls_token = self.encoder(image, base_h, base_w, return_class_token=True)
        features = [features, None, None, None, None]

        # Concat UVs for aspect ratio input
        for level in range(5):
            uv = normalized_view_plane_uv(width=base_w * 2 ** level, height=base_h * 2 ** level, aspect_ratio=aspect_ratio, dtype=dtype, device=device)
            uv = uv.permute(2, 0, 1).unsqueeze(0).expand(batch_size, -1, -1, -1)
            if features[level] is None:
                features[level] = uv
            else:
                features[level] = torch.concat([features[level], uv], dim=1)

        # Shared neck
        features = self.neck(features)

        # Heads decoding (each head consumes the neck pyramid; take its finest output)
        points, normal, mask = (getattr(self, head)(features)[-1] if hasattr(self, head) else None for head in ['points_head', 'normal_head', 'mask_head'])
        metric_scale = self.scale_head(cls_token) if hasattr(self, 'scale_head') else None

        # Resize head outputs back to the input resolution
        points, normal, mask = (F.interpolate(v, (img_h, img_w), mode='bilinear', align_corners=False, antialias=False) if v is not None else None for v in [points, normal, mask])

        # Remap output
        if points is not None:
            points = points.permute(0, 2, 3, 1)
            points = self._remap_points(points)     # slightly improves the performance in case of very large output values
        if normal is not None:
            normal = normal.permute(0, 2, 3, 1)
            normal = F.normalize(normal, dim=-1)
        if mask is not None:
            mask = mask.squeeze(1).sigmoid()
        if metric_scale is not None:
            # exp keeps the predicted scale strictly positive.
            metric_scale = metric_scale.squeeze(1).exp()

        return_dict = {
            'points': points,
            'normal': normal,
            'mask': mask,
            'metric_scale': metric_scale
        }
        return_dict = {k: v for k, v in return_dict.items() if v is not None}

        return return_dict

    @torch.inference_mode()
    def infer(
        self,
        image: torch.Tensor,
        num_tokens: int = None,
        resolution_level: int = 9,
        force_projection: bool = True,
        apply_mask: Literal[False, True, 'blend'] = True,
        fov_x: Optional[Union[Number, torch.Tensor]] = None,
        use_fp16: bool = True,
    ) -> Dict[str, torch.Tensor]:
        """
        User-friendly inference function

        ### Parameters
        - `image`: input image tensor of shape (B, 3, H, W) or (3, H, W)
        - `num_tokens`: the number of base ViT tokens to use for inference. Suggested range: 1200 ~ 2500.
            More tokens will result in significantly higher accuracy and finer details, but slower inference time.
            When None, it is derived from `resolution_level` (0-9) within `self.num_tokens_range`.
        - `force_projection`: if True, the output point map will be computed using the actual depth map. Default: True
        - `apply_mask`: if True, the output point map will be masked using the predicted mask. Default: True
        - `fov_x`: the horizontal camera FoV in degrees. If None, it will be inferred from the predicted point map. Default: None
        - `use_fp16`: if True, use mixed precision to speed up inference. Default: True

        ### Returns

        A dictionary containing the following keys:
        - `points`: output tensor of shape (B, H, W, 3) or (H, W, 3).
        - `depth`: tensor of shape (B, H, W) or (H, W) containing the depth map.
        - `intrinsics`: tensor of shape (B, 3, 3) or (3, 3) containing the camera intrinsics.
        """
        # Accept both batched and single images; remember to squeeze at the end.
        if image.dim() == 3:
            omit_batch_dim = True
            image = image.unsqueeze(0)
        else:
            omit_batch_dim = False
        image = image.to(dtype=self.dtype, device=self.device)

        original_height, original_width = image.shape[-2:]
        area = original_height * original_width
        aspect_ratio = original_width / original_height

        # Determine the number of base tokens to use
        if num_tokens is None:
            min_tokens, max_tokens = self.num_tokens_range
            num_tokens = int(min_tokens + (resolution_level / 9) * (max_tokens - min_tokens))

        # Forward pass
        with torch.autocast(device_type=self.device.type, dtype=torch.float16, enabled=use_fp16 and self.dtype != torch.float16):
            output = self.forward(image, num_tokens=num_tokens)
        points, normal, mask, metric_scale = (output.get(k, None) for k in ['points', 'normal', 'mask', 'metric_scale'])

        # Always process the output in fp32 precision
        points, normal, mask, metric_scale, fov_x = map(lambda x: x.float() if isinstance(x, torch.Tensor) else x, [points, normal, mask, metric_scale, fov_x])
        with torch.autocast(device_type=self.device.type, dtype=torch.float32):
            if mask is not None:
                mask_binary = mask > 0.5
            else:
                mask_binary = None

            if points is not None:
                # Convert affine point map to camera-space. Recover depth and intrinsics from point map.
                # NOTE: Focal here is the focal length relative to half the image diagonal
                if fov_x is None:
                    # Recover focal and shift from predicted point map
                    focal, shift = recover_focal_shift(points, mask_binary)
                else:
                    # Focal is known, recover shift only
                    focal = aspect_ratio / (1 + aspect_ratio ** 2) ** 0.5 / torch.tan(torch.deg2rad(torch.as_tensor(fov_x, device=points.device, dtype=points.dtype) / 2))
                    if focal.ndim == 0:
                        focal = focal[None].expand(points.shape[0])
                    _, shift = recover_focal_shift(points, mask_binary, focal=focal)
                fx, fy = focal / 2 * (1 + aspect_ratio ** 2) ** 0.5 / aspect_ratio, focal / 2 * (1 + aspect_ratio ** 2) ** 0.5
                intrinsics = utils3d.torch.intrinsics_from_focal_center(fx, fy, 0.5, 0.5)
                points[..., 2] += shift[..., None, None]
                if mask_binary is not None:
                    mask_binary &= points[..., 2] > 0        # in case depth contains negative values (which should never happen in practice)
                depth = points[..., 2].clone()
            else:
                depth, intrinsics = None, None

            # If projection constraint is forced, recompute the point map using the actual depth map & intrinsics
            if force_projection and depth is not None:
                points = depth_to_points(depth, intrinsics=intrinsics)

            # Apply metric scale
            if metric_scale is not None:
                if points is not None:
                    points *= metric_scale[:, None, None, None]
                if depth is not None:
                    depth *= metric_scale[:, None, None]

            # Apply mask (invalid pixels become inf points/depth and zero normals)
            if apply_mask and mask_binary is not None:
                points = torch.where(mask_binary[..., None], points, torch.inf) if points is not None else None
                depth = torch.where(mask_binary, depth, torch.inf) if depth is not None else None
                normal = torch.where(mask_binary[..., None], normal, torch.zeros_like(normal)) if normal is not None else None

        return_dict = {
            'points': points,
            'intrinsics': intrinsics,
            'depth': depth,
            'mask': mask_binary,
            'normal': normal,
            'metric_scale': metric_scale
        }
        return_dict = {k: v for k, v in return_dict.items() if v is not None}

        if omit_batch_dim:
            return_dict = {k: v.squeeze(0) for k, v in return_dict.items()}

        return return_dict
moge/utils/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
moge/utils/download.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ from pathlib import Path
7
+ from typing import *
8
+ import requests
9
+
10
+ from tqdm import tqdm
11
+
12
+
13
+ __all__ = ["download_file", "download_bytes"]
14
+
15
+
16
def download_file(url: str, filepath: Union[str, Path], headers: dict = None, resume: bool = True) -> None:
    """Download `url` to `filepath`, optionally resuming a partial download.

    Args:
        url: source URL.
        filepath: destination path.
        headers: extra HTTP headers to send (a `Range` header is added when resuming).
        resume: when True and the file already exists, request only the remaining bytes.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
    """
    # Copy so we never mutate the caller's dict when adding the Range header.
    headers = dict(headers) if headers else {}

    # Initialize local variables
    file_path = Path(filepath)
    downloaded_bytes = 0

    # Check if we should resume the download
    if resume and file_path.exists():
        downloaded_bytes = file_path.stat().st_size
        headers['Range'] = f"bytes={downloaded_bytes}-"

    # Make a GET request to fetch the file
    with requests.get(url, stream=True, headers=headers) as response:
        response.raise_for_status()  # This will raise an HTTPError if the status is 4xx/5xx

        # BUGFIX: if we requested a byte range but the server ignored it
        # (status 200 instead of 206 Partial Content), the response body is the
        # whole file; appending it to the partial file would corrupt the
        # download. Restart from scratch in that case.
        if downloaded_bytes and response.status_code != 206:
            downloaded_bytes = 0
            mode = 'wb'
        else:
            mode = 'ab'

        # Calculate the total size to download
        total_size = downloaded_bytes + int(response.headers.get('content-length', 0))

        # Display a progress bar while downloading
        with (
            tqdm(desc=f"Downloading {file_path.name}", total=total_size, unit='B', unit_scale=True, leave=False) as pbar,
            open(file_path, mode) as file,
        ):
            # Set the initial position of the progress bar
            pbar.update(downloaded_bytes)

            # Write the content to the file in chunks
            for chunk in response.iter_content(chunk_size=4096):
                file.write(chunk)
                pbar.update(len(chunk))
48
+
49
+
50
def download_bytes(url: str, headers: dict = None) -> bytes:
    """Fetch `url` and return the raw response body as bytes.

    Args:
        url: source URL.
        headers: optional extra HTTP headers.

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status.
    """
    with requests.get(url, stream=True, headers=headers or {}) as response:
        # Raise on any 4xx/5xx status before touching the body.
        response.raise_for_status()
        return response.content
60
+
moge/utils/geometry_numpy.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ from typing import *
7
+ from functools import partial
8
+ import math
9
+
10
+ import cv2
11
+ import numpy as np
12
+ from scipy.signal import fftconvolve
13
+ import numpy as np
14
+ import utils3d
15
+
16
+ from .tools import timeit
17
+
18
+
19
+ def weighted_mean_numpy(x: np.ndarray, w: np.ndarray = None, axis: Union[int, Tuple[int,...]] = None, keepdims: bool = False, eps: float = 1e-7) -> np.ndarray:
20
+ if w is None:
21
+ return np.mean(x, axis=axis)
22
+ else:
23
+ w = w.astype(x.dtype)
24
+ return (x * w).mean(axis=axis) / np.clip(w.mean(axis=axis), eps, None)
25
+
26
+
27
+ def harmonic_mean_numpy(x: np.ndarray, w: np.ndarray = None, axis: Union[int, Tuple[int,...]] = None, keepdims: bool = False, eps: float = 1e-7) -> np.ndarray:
28
+ if w is None:
29
+ return 1 / (1 / np.clip(x, eps, None)).mean(axis=axis)
30
+ else:
31
+ w = w.astype(x.dtype)
32
+ return 1 / (weighted_mean_numpy(1 / (x + eps), w, axis=axis, keepdims=keepdims, eps=eps) + eps)
33
+
34
+
35
+ def normalized_view_plane_uv_numpy(width: int, height: int, aspect_ratio: float = None, dtype: np.dtype = np.float32) -> np.ndarray:
36
+ "UV with left-top corner as (-width / diagonal, -height / diagonal) and right-bottom corner as (width / diagonal, height / diagonal)"
37
+ if aspect_ratio is None:
38
+ aspect_ratio = width / height
39
+
40
+ span_x = aspect_ratio / (1 + aspect_ratio ** 2) ** 0.5
41
+ span_y = 1 / (1 + aspect_ratio ** 2) ** 0.5
42
+
43
+ u = np.linspace(-span_x * (width - 1) / width, span_x * (width - 1) / width, width, dtype=dtype)
44
+ v = np.linspace(-span_y * (height - 1) / height, span_y * (height - 1) / height, height, dtype=dtype)
45
+ u, v = np.meshgrid(u, v, indexing='xy')
46
+ uv = np.stack([u, v], axis=-1)
47
+ return uv
48
+
49
+
50
def focal_to_fov_numpy(focal: np.ndarray):
    """Convert a normalized focal length to a field of view in radians."""
    half_fov = np.arctan(0.5 / focal)
    return 2 * half_fov
52
+
53
+
54
def fov_to_focal_numpy(fov: np.ndarray):
    """Convert a field of view in radians to a normalized focal length."""
    return 0.5 / np.tan(0.5 * fov)
56
+
57
+
58
def intrinsics_to_fov_numpy(intrinsics: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Extract horizontal and vertical FoV (radians) from normalized intrinsics.

    Args:
        intrinsics (np.ndarray): (..., 3, 3) normalized camera intrinsics.

    Returns:
        Tuple[np.ndarray, np.ndarray]: (fov_x, fov_y) in radians.
    """
    fx = intrinsics[..., 0, 0]
    fy = intrinsics[..., 1, 1]
    return focal_to_fov_numpy(fx), focal_to_fov_numpy(fy)
62
+
63
+
64
def point_map_to_depth_legacy_numpy(points: np.ndarray):
    """Legacy: recover depth, FoV and z-shift from an affine point map.

    Solves a linear least-squares problem for (focal, shift) such that
    `focal * xy ~= uv * (z + shift)` over all pixels, where `uv` is the
    normalized view-plane grid.

    Args:
        points (np.ndarray): (..., H, W, 3) affine point map.

    Returns:
        tuple: (depth (..., H, W), fov_x, fov_y, shift).

    NOTE(review): `focal, shift = solution` unpacks along axis 0, which is only
    correct for unbatched input (solution shape (2,)) — verify before using
    with leading batch dims.
    """
    height, width = points.shape[-3:-1]
    diagonal = (height ** 2 + width ** 2) ** 0.5
    uv = normalized_view_plane_uv_numpy(width, height, dtype=points.dtype)  # (H, W, 2)
    _, uv = np.broadcast_arrays(points[..., :2], uv)

    # Solve least squares problem for the two unknowns (focal, shift)
    b = (uv * points[..., 2:]).reshape(*points.shape[:-3], -1)  # (..., H * W * 2)
    A = np.stack([points[..., :2], -uv], axis=-1).reshape(*points.shape[:-3], -1, 2)  # (..., H * W * 2, 2)

    # Normal equations with a small Tikhonov term for numerical stability.
    M = A.swapaxes(-2, -1) @ A
    solution = (np.linalg.inv(M + 1e-6 * np.eye(2)) @ (A.swapaxes(-2, -1) @ b[..., None])).squeeze(-1)
    focal, shift = solution

    depth = points[..., 2] + shift[..., None, None]
    fov_x = np.arctan(width / diagonal / focal) * 2
    fov_y = np.arctan(height / diagonal / focal) * 2
    return depth, fov_x, fov_y, shift
82
+
83
+
84
def solve_optimal_focal_shift(uv: np.ndarray, xyz: np.ndarray):
    "Solve `min |focal * xy / (z + shift) - uv|` with respect to shift and focal"
    from scipy.optimize import least_squares
    uv = uv.reshape(-1, 2)
    xy = xyz[..., :2].reshape(-1, 2)
    z = xyz[..., 2].reshape(-1)

    def residuals(shift: np.ndarray):
        # For a candidate shift, the optimal focal has a closed form:
        # f = <xy_proj, uv> / <xy_proj, xy_proj>.
        xy_proj = xy / (z + shift)[:, None]
        f = (xy_proj * uv).sum() / np.square(xy_proj).sum()
        return (f * xy_proj - uv).ravel()

    solution = least_squares(residuals, x0=0, ftol=1e-3, method='lm')
    optim_shift = solution['x'].squeeze().astype(np.float32)

    # Recompute the closed-form focal at the optimal shift.
    xy_proj = xy / (z + optim_shift)[:, None]
    optim_focal = (xy_proj * uv).sum() / np.square(xy_proj).sum()

    return optim_shift, optim_focal
102
+
103
+
104
def solve_optimal_shift(uv: np.ndarray, xyz: np.ndarray, focal: float):
    "Solve `min |focal * xy / (z + shift) - uv|` with respect to shift"
    from scipy.optimize import least_squares
    uv_flat = uv.reshape(-1, 2)
    xy_flat = xyz[..., :2].reshape(-1, 2)
    z_flat = xyz[..., 2].reshape(-1)

    def residuals(shift: np.ndarray) -> np.ndarray:
        # Projection residual at the (known) focal for a candidate shift.
        projected = xy_flat / (z_flat + shift)[:, None]
        return (focal * projected - uv_flat).ravel()

    result = least_squares(residuals, x0=0, ftol=1e-3, method='lm')
    return result['x'].squeeze().astype(np.float32)
118
+
119
+
120
def recover_focal_shift_numpy(points: np.ndarray, mask: np.ndarray = None, focal: float = None, downsample_size: Tuple[int, int] = (64, 64)):
    """
    Recover the focal length and Z-axis shift of a point map.

    ### Parameters
    - `points`: point map of shape (H, W, 3).
    - `mask`: optional boolean validity mask of shape (H, W).
    - `focal`: if given, only the shift is solved for.
    - `downsample_size`: size of the downsampled map used by the solver
      (downsampling gives an approximate but fast solution).

    ### Returns
    - `focal`: estimated focal length, relative to the half diagonal of the map.
    - `shift`: Z-axis shift to translate the point map to camera space.
    """
    import cv2
    assert points.shape[-1] == 3, "Points should (H, W, 3)"

    height, width = points.shape[-3], points.shape[-2]

    uv = normalized_view_plane_uv_numpy(width=width, height=height)

    if mask is None:
        points_lr = cv2.resize(points, downsample_size, interpolation=cv2.INTER_LINEAR).reshape(-1, 3)
        uv_lr = cv2.resize(uv, downsample_size, interpolation=cv2.INTER_LINEAR).reshape(-1, 2)
    else:
        (points_lr, uv_lr), mask_lr = mask_aware_nearest_resize_numpy((points, uv), mask, downsample_size)
        # BUGFIX: keep only valid pixels. Previously the full low-res maps
        # (including junk values at invalid pixels) were fed to the solver;
        # the torch counterpart filters by the mask.
        points_lr, uv_lr = points_lr[mask_lr], uv_lr[mask_lr]

    if points_lr.size < 2:
        # Degenerate input: fall back to identity focal and zero shift.
        return 1., 0.

    if focal is None:
        # BUGFIX: solve_optimal_focal_shift returns (shift, focal); the
        # previous `focal, shift = ...` unpacking was swapped.
        shift, focal = solve_optimal_focal_shift(uv_lr, points_lr)
    else:
        shift = solve_optimal_shift(uv_lr, points_lr, focal)

    return focal, shift
144
+
145
+
146
def mask_aware_nearest_resize_numpy(
    inputs: Union[np.ndarray, Tuple[np.ndarray, ...], None],
    mask: np.ndarray,
    size: Tuple[int, int],
    return_index: bool = False
) -> Tuple[Union[np.ndarray, Tuple[np.ndarray, ...], None], np.ndarray, Tuple[np.ndarray, ...]]:
    """
    Resize 2D map by nearest interpolation. Return the nearest neighbor index and mask of the resized map.

    Only pixels marked valid in `mask` are considered as nearest-neighbor
    candidates, so invalid pixels never bleed into the resized output.

    ### Parameters
    - `inputs`: a single or a list of input 2D map(s) of shape (..., H, W, ...).
    - `mask`: input 2D mask of shape (..., H, W)
    - `size`: target size (width, height)

    ### Returns
    - `*resized_maps`: resized map(s) of shape (..., target_height, target_width, ...).
    - `resized_mask`: mask of the resized map of shape (..., target_height, target_width)
    - `nearest_idx`: if return_index is True, nearest neighbor index of the resized map of shape (..., target_height, target_width) for each dimension.
    """
    height, width = mask.shape[-2:]
    target_width, target_height = size
    # Fractional and integer footprint of one target pixel in source pixels.
    filter_h_f, filter_w_f = max(1, height / target_height), max(1, width / target_width)
    filter_h_i, filter_w_i = math.ceil(filter_h_f), math.ceil(filter_w_f)
    filter_size = filter_h_i * filter_w_i
    padding_h, padding_w = filter_h_i // 2 + 1, filter_w_i // 2 + 1

    # Window the original mask and uv.
    # Zero-pad so every target window lies fully inside the padded arrays.
    uv = utils3d.numpy.image_pixel_center(width=width, height=height, dtype=np.float32)
    indices = np.arange(height * width, dtype=np.int32).reshape(height, width)
    padded_uv = np.full((height + 2 * padding_h, width + 2 * padding_w, 2), 0, dtype=np.float32)
    padded_uv[padding_h:padding_h + height, padding_w:padding_w + width] = uv
    padded_mask = np.full((*mask.shape[:-2], height + 2 * padding_h, width + 2 * padding_w), False, dtype=bool)
    padded_mask[..., padding_h:padding_h + height, padding_w:padding_w + width] = mask
    padded_indices = np.full((height + 2 * padding_h, width + 2 * padding_w), 0, dtype=np.int32)
    padded_indices[padding_h:padding_h + height, padding_w:padding_w + width] = indices
    windowed_uv = utils3d.numpy.sliding_window_2d(padded_uv, (filter_h_i, filter_w_i), 1, axis=(0, 1))
    windowed_mask = utils3d.numpy.sliding_window_2d(padded_mask, (filter_h_i, filter_w_i), 1, axis=(-2, -1))
    windowed_indices = utils3d.numpy.sliding_window_2d(padded_indices, (filter_h_i, filter_w_i), 1, axis=(0, 1))

    # Gather the target pixels's local window
    target_centers = utils3d.numpy.image_uv(width=target_width, height=target_height, dtype=np.float32) * np.array([width, height], dtype=np.float32)
    target_lefttop = target_centers - np.array((filter_w_f / 2, filter_h_f / 2), dtype=np.float32)
    target_window = np.round(target_lefttop).astype(np.int32) + np.array((padding_w, padding_h), dtype=np.int32)

    target_window_centers = windowed_uv[target_window[..., 1], target_window[..., 0], :, :, :].reshape(target_height, target_width, 2, filter_size)  # (target_height, tgt_width, 2, filter_size)
    target_window_mask = windowed_mask[..., target_window[..., 1], target_window[..., 0], :, :].reshape(*mask.shape[:-2], target_height, target_width, filter_size)  # (..., target_height, tgt_width, filter_size)
    target_window_indices = windowed_indices[target_window[..., 1], target_window[..., 0], :, :].reshape(*([-1] * (mask.ndim - 2)), target_height, target_width, filter_size)  # (target_height, tgt_width, filter_size)

    # Compute nearest neighbor in the local window for each pixel
    dist = np.square(target_window_centers - target_centers[..., None])
    dist = dist[..., 0, :] + dist[..., 1, :]
    # Invalid candidates get infinite distance so argmin never selects them
    # (if a whole window is invalid, target_mask marks that pixel invalid).
    dist = np.where(target_window_mask, dist, np.inf)  # (..., target_height, tgt_width, filter_size)
    nearest_in_window = np.argmin(dist, axis=-1, keepdims=True)  # (..., target_height, tgt_width, 1)
    nearest_idx = np.take_along_axis(target_window_indices, nearest_in_window, axis=-1).squeeze(-1)  # (..., target_height, tgt_width)
    nearest_i, nearest_j = nearest_idx // width, nearest_idx % width
    target_mask = np.any(target_window_mask, axis=-1)
    batch_indices = [np.arange(n).reshape([1] * i + [n] + [1] * (mask.ndim - i - 1)) for i, n in enumerate(mask.shape[:-2])]

    index = (*batch_indices, nearest_i, nearest_j)

    if inputs is None:
        outputs = None
    elif isinstance(inputs, np.ndarray):
        outputs = inputs[index]
    elif isinstance(inputs, Sequence):
        outputs = tuple(x[index] for x in inputs)
    else:
        raise ValueError(f'Invalid input type: {type(inputs)}')

    if return_index:
        return outputs, target_mask, index
    else:
        return outputs, target_mask
219
+
220
+
221
def mask_aware_area_resize_numpy(image: np.ndarray, mask: np.ndarray, target_width: int, target_height: int) -> Tuple[Tuple[np.ndarray, ...], np.ndarray]:
    """
    Downsample a 2D map by mask-aware area (box) averaging: each target pixel
    is the area-weighted mean of the valid source pixels overlapping its footprint.

    ### Parameters
    - `image`: Input 2D image of shape (..., H, W, C), or (..., H, W) without a channel dim.
    - `mask`: Input 2D mask of shape (..., H, W)
    - `target_width`: target width of the resized map
    - `target_height`: target height of the resized map

    ### Returns
    - `target_image`: area-averaged image of shape (..., target_height, target_width[, C]).
    - `target_mask`: Mask of the resized map of shape (..., target_height, target_width)
    """
    height, width = mask.shape[-2:]

    # NOTE(review): this heuristic treats the image as channel-less when its
    # last two dims equal (H, W); it can misfire if W coincidentally equals C.
    if image.shape[-2:] == (height, width):
        omit_channel_dim = True
    else:
        omit_channel_dim = False
    if omit_channel_dim:
        image = image[..., None]

    # Zero out invalid pixels so they contribute nothing to the weighted sum.
    image = np.where(mask[..., None], image, 0)

    filter_h_f, filter_w_f = max(1, height / target_height), max(1, width / target_width)
    filter_h_i, filter_w_i = math.ceil(filter_h_f) + 1, math.ceil(filter_w_f) + 1
    filter_size = filter_h_i * filter_w_i
    padding_h, padding_w = filter_h_i // 2 + 1, filter_w_i // 2 + 1

    # Window the original mask and uv (non-copy)
    uv = utils3d.numpy.image_pixel_center(width=width, height=height, dtype=np.float32)
    indices = np.arange(height * width, dtype=np.int32).reshape(height, width)
    padded_uv = np.full((height + 2 * padding_h, width + 2 * padding_w, 2), 0, dtype=np.float32)
    padded_uv[padding_h:padding_h + height, padding_w:padding_w + width] = uv
    padded_mask = np.full((*mask.shape[:-2], height + 2 * padding_h, width + 2 * padding_w), False, dtype=bool)
    padded_mask[..., padding_h:padding_h + height, padding_w:padding_w + width] = mask
    padded_indices = np.full((height + 2 * padding_h, width + 2 * padding_w), 0, dtype=np.int32)
    padded_indices[padding_h:padding_h + height, padding_w:padding_w + width] = indices
    windowed_uv = utils3d.numpy.sliding_window_2d(padded_uv, (filter_h_i, filter_w_i), 1, axis=(0, 1))
    windowed_mask = utils3d.numpy.sliding_window_2d(padded_mask, (filter_h_i, filter_w_i), 1, axis=(-2, -1))
    windowed_indices = utils3d.numpy.sliding_window_2d(padded_indices, (filter_h_i, filter_w_i), 1, axis=(0, 1))

    # Gather the target pixels's local window
    target_center = utils3d.numpy.image_uv(width=target_width, height=target_height, dtype=np.float32) * np.array([width, height], dtype=np.float32)
    target_lefttop = target_center - np.array((filter_w_f / 2, filter_h_f / 2), dtype=np.float32)
    target_bottomright = target_center + np.array((filter_w_f / 2, filter_h_f / 2), dtype=np.float32)
    target_window = np.floor(target_lefttop).astype(np.int32) + np.array((padding_w, padding_h), dtype=np.int32)

    target_window_centers = windowed_uv[target_window[..., 1], target_window[..., 0], :, :, :].reshape(target_height, target_width, 2, filter_size)  # (target_height, tgt_width, 2, filter_size)
    target_window_mask = windowed_mask[..., target_window[..., 1], target_window[..., 0], :, :].reshape(*mask.shape[:-2], target_height, target_width, filter_size)  # (..., target_height, tgt_width, filter_size)
    target_window_indices = windowed_indices[target_window[..., 1], target_window[..., 0], :, :].reshape(target_height, target_width, filter_size)  # (target_height, tgt_width, filter_size)

    # Compute pixel area in the local windows:
    # overlap of each source pixel (a 1x1 box around its center) with the
    # target pixel's footprint, clipped to zero when they do not intersect.
    target_window_lefttop = np.maximum(target_window_centers - 0.5, target_lefttop[..., None])
    target_window_bottomright = np.minimum(target_window_centers + 0.5, target_bottomright[..., None])
    target_window_area = (target_window_bottomright - target_window_lefttop).clip(0, None)
    target_window_area = np.where(target_window_mask, target_window_area[..., 0, :] * target_window_area[..., 1, :], 0)

    # Weighted sum by area
    target_window_image = image.reshape(*image.shape[:-3], height * width, -1)[..., target_window_indices, :].swapaxes(-2, -1)
    # A target pixel is valid only if enough valid source area covers it.
    target_mask = np.sum(target_window_area, axis=-1) >= 0.25
    target_image = weighted_mean_numpy(target_window_image, target_window_area[..., None, :], axis=-1)

    if omit_channel_dim:
        target_image = target_image[..., 0]

    return target_image, target_mask
289
+
290
+
291
def norm3d(x: np.ndarray) -> np.ndarray:
    "Faster `np.linalg.norm(x, axis=-1)` for 3D vectors"
    xs, ys, zs = x[..., 0], x[..., 1], x[..., 2]
    return np.sqrt(np.square(xs) + np.square(ys) + np.square(zs))
294
+
295
+
296
def depth_occlusion_edge_numpy(depth: np.ndarray, mask: np.ndarray, thickness: int = 1, tol: float = 0.1):
    """
    Detect occlusion edges in a depth map by comparing each pixel's disparity
    against the masked mean disparity of its local window.

    ### Parameters
    - `depth`: (H, W) depth map.
    - `mask`: (H, W) boolean validity mask.
    - `thickness`: edge half-width; also sets the local window radius.
    - `tol`: relative disparity deviation that counts as an edge.

    ### Returns
    - `edge_mask`: boolean map of pixels near a foreground/background depth discontinuity.
    """
    # Work in disparity (1/depth); invalid pixels contribute 0.
    disp = np.where(mask, 1 / depth, 0)
    disp_pad = np.pad(disp, (thickness, thickness), constant_values=0)
    mask_pad = np.pad(mask, (thickness, thickness), constant_values=False)
    kernel_size = 2 * thickness + 1
    disp_window = utils3d.numpy.sliding_window_2d(disp_pad, (kernel_size, kernel_size), 1, axis=(-2, -1))  # [..., H, W, kernel_size ** 2]
    mask_window = utils3d.numpy.sliding_window_2d(mask_pad, (kernel_size, kernel_size), 1, axis=(-2, -1))  # [..., H, W, kernel_size ** 2]

    # Pixels noticeably nearer than their neighborhood are foreground edges;
    # noticeably farther ones are background edges.
    disp_mean = weighted_mean_numpy(disp_window, mask_window, axis=(-2, -1))
    fg_edge_mask = mask & (disp > (1 + tol) * disp_mean)
    bg_edge_mask = mask & (disp_mean > (1 + tol) * disp)

    # Keep only locations where dilated fg and bg edges overlap, i.e. true
    # occlusion boundaries with both sides present.
    edge_mask = (cv2.dilate(fg_edge_mask.astype(np.uint8), np.ones((3, 3), dtype=np.uint8), iterations=thickness) > 0) \
        & (cv2.dilate(bg_edge_mask.astype(np.uint8), np.ones((3, 3), dtype=np.uint8), iterations=thickness) > 0)

    return edge_mask
312
+
313
+
314
def disk_kernel(radius: int) -> np.ndarray:
    """
    Build a normalized disk-shaped averaging kernel.

    Args:
        radius (int): Radius of the disk (in pixels).

    Returns:
        np.ndarray: (2*radius+1, 2*radius+1) convolution kernel summing to 1.
    """
    coords = np.arange(-radius, radius + 1)
    xs, ys = np.meshgrid(coords, coords)
    # Mark every cell whose center lies inside the circle of the given radius.
    kernel = ((xs ** 2 + ys ** 2) <= radius ** 2).astype(np.float32)
    kernel /= np.sum(kernel)
    return kernel
332
+
333
+
334
def disk_blur(image: np.ndarray, radius: int) -> np.ndarray:
    """
    Blur an image with a disk kernel using FFT convolution.

    Args:
        image (np.ndarray): 2D (grayscale) or 3D (multi-channel) image.
        radius (int): Blur radius in pixels; 0 returns the input unchanged.

    Returns:
        np.ndarray: Blurred image of the same shape.
    """
    if radius == 0:
        return image
    kernel = disk_kernel(radius)
    if image.ndim == 2:
        return fftconvolve(image, kernel, mode='same')
    if image.ndim == 3:
        # Convolve each channel independently and restack along the last axis.
        blurred_channels = [
            fftconvolve(image[..., c], kernel, mode='same')
            for c in range(image.shape[2])
        ]
        return np.stack(blurred_channels, axis=-1)
    raise ValueError("Image must be 2D or 3D.")
359
+
360
+
361
def depth_of_field(
    img: np.ndarray,
    disp: np.ndarray,
    focus_disp : float,
    max_blur_radius : int = 10,
) -> np.ndarray:
    """
    Apply a depth-of-field effect to an image, driven by a disparity map.

    Args:
        img (numpy.ndarray): (H, W, 3) input image.
        disp (numpy.ndarray): (H, W) disparity map of the scene.
        focus_disp (float): Disparity value that stays in focus.
        max_blur_radius (int): Maximum blur radius (in pixels).

    Returns:
        numpy.ndarray: (H, W, 3) output image with depth of field effect applied.
    """
    # Precalculate dilated disparity maps, one per candidate blur radius.
    max_disp = np.max(disp)
    disp = disp / max_disp
    focus_disp = focus_disp / max_disp
    dilated_disp = []
    for radius in range(max_blur_radius + 1):
        dilated_disp.append(cv2.dilate(disp, cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2*radius+1, 2*radius+1)), iterations=1))

    # Determine the blur radius for each pixel based on the depth map.
    # Nearer (larger-disparity) neighbors are allowed to enlarge a pixel's
    # blur so foreground bokeh spills over edges instead of stopping at them.
    blur_radii = np.clip(abs(disp - focus_disp) * max_blur_radius, 0, max_blur_radius).astype(np.int32)
    for radius in range(max_blur_radius + 1):
        dialted_blur_radii = np.clip(abs(dilated_disp[radius] - focus_disp) * max_blur_radius, 0, max_blur_radius).astype(np.int32)
        mask = (dialted_blur_radii >= radius) & (dialted_blur_radii >= blur_radii) & (dilated_disp[radius] > disp)
        blur_radii[mask] = dialted_blur_radii[mask]
    blur_radii = np.clip(blur_radii, 0, max_blur_radius)
    # Smooth the radius field to avoid visible seams between blur levels.
    blur_radii = cv2.blur(blur_radii, (5, 5))

    # Precalculate the blurred image for each blur radius actually used.
    unique_radii = np.unique(blur_radii)
    precomputed = {}
    for radius in range(max_blur_radius + 1):
        if radius not in unique_radii:
            continue
        precomputed[radius] = disk_blur(img, radius)

    # Composite: each pixel takes its value from the matching blur level.
    output = np.zeros_like(img)
    for r in unique_radii:
        mask = blur_radii == r
        output[mask] = precomputed[r][mask]

    return output
moge/utils/geometry_torch.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ from typing import *
7
+ import math
8
+ from collections import namedtuple
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ import torch.types
15
+ import utils3d
16
+
17
+ from .tools import timeit
18
+ from .geometry_numpy import solve_optimal_focal_shift, solve_optimal_shift
19
+
20
+
21
def weighted_mean(x: torch.Tensor, w: torch.Tensor = None, dim: Union[int, torch.Size] = None, keepdim: bool = False, eps: float = 1e-7) -> torch.Tensor:
    """Mean of `x` weighted by `w` along `dim`; plain mean when `w` is None."""
    if w is None:
        return x.mean(dim=dim, keepdim=keepdim)
    w = w.to(x.dtype)
    weighted_sum = (x * w).mean(dim=dim, keepdim=keepdim)
    normalizer = w.mean(dim=dim, keepdim=keepdim).add(eps)
    return weighted_sum / normalizer
27
+
28
+
29
def harmonic_mean(x: torch.Tensor, w: torch.Tensor = None, dim: Union[int, torch.Size] = None, keepdim: bool = False, eps: float = 1e-7) -> torch.Tensor:
    """(Weighted) harmonic mean of `x` along `dim`, stabilized by `eps`."""
    reciprocals = x.add(eps).reciprocal()
    if w is None:
        return reciprocals.mean(dim=dim, keepdim=keepdim).reciprocal()
    w = w.to(x.dtype)
    return weighted_mean(reciprocals, w, dim=dim, keepdim=keepdim, eps=eps).add(eps).reciprocal()
35
+
36
+
37
def geometric_mean(x: torch.Tensor, w: torch.Tensor = None, dim: Union[int, torch.Size] = None, keepdim: bool = False, eps: float = 1e-7) -> torch.Tensor:
    """
    (Weighted) geometric mean of `x` along `dim`.

    ### Parameters
    - `x`: input tensor (stabilized by adding `eps` before the log).
    - `w`: optional weights, broadcastable to `x`.
    - `dim` / `keepdim`: reduction dimension(s) and whether to keep them.
    - `eps`: numerical stabilizer.
    """
    if w is None:
        # BUGFIX: `keepdim` was previously ignored on the unweighted branch
        # (the sibling weighted_mean/harmonic_mean honor it).
        return x.add(eps).log().mean(dim=dim, keepdim=keepdim).exp()
    else:
        w = w.to(x.dtype)
        return weighted_mean(x.add(eps).log(), w, dim=dim, keepdim=keepdim, eps=eps).exp()
43
+
44
+
45
def normalized_view_plane_uv(width: int, height: int, aspect_ratio: float = None, dtype: torch.dtype = None, device: torch.device = None) -> torch.Tensor:
    "UV with left-top corner as (-width / diagonal, -height / diagonal) and right-bottom corner as (width / diagonal, height / diagonal)"
    if aspect_ratio is None:
        aspect_ratio = width / height

    # Half-extents of the view plane, normalized so the half-diagonal is 1.
    span_x = aspect_ratio / (1 + aspect_ratio ** 2) ** 0.5
    span_y = 1 / (1 + aspect_ratio ** 2) ** 0.5

    # Pixel-center sample positions along each axis.
    u_edge = span_x * (width - 1) / width
    v_edge = span_y * (height - 1) / height
    u = torch.linspace(-u_edge, u_edge, width, dtype=dtype, device=device)
    v = torch.linspace(-v_edge, v_edge, height, dtype=dtype, device=device)
    grid_u, grid_v = torch.meshgrid(u, v, indexing='xy')
    return torch.stack([grid_u, grid_v], dim=-1)
58
+
59
+
60
def gaussian_blur_2d(input: torch.Tensor, kernel_size: int, sigma: float) -> torch.Tensor:
    """
    Depthwise Gaussian blur for NCHW tensors with replicate padding.

    ### Parameters
    - `input`: tensor of shape (N, C, H, W).
    - `kernel_size`: side length of the square Gaussian kernel (odd sizes preserve H, W).
    - `sigma`: standard deviation of the Gaussian.

    ### Returns
    - Blurred tensor with the same channel count as `input`.
    """
    kernel = torch.exp(-(torch.arange(-kernel_size // 2 + 1, kernel_size // 2 + 1, dtype=input.dtype, device=input.device) ** 2) / (2 * sigma ** 2))
    kernel = kernel / kernel.sum()
    # Separable outer product -> 2D kernel.
    kernel = (kernel[:, None] * kernel[None, :]).reshape(1, 1, kernel_size, kernel_size)
    # BUGFIX: with `groups=C`, conv2d requires a weight of shape (C, 1, k, k);
    # the previous (1, 1, k, k) kernel failed for any multi-channel input.
    kernel = kernel.expand(input.shape[1], 1, kernel_size, kernel_size)
    input = F.pad(input, (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2), mode='replicate')
    input = F.conv2d(input, kernel, groups=input.shape[1])
    return input
67
+
68
+
69
def focal_to_fov(focal: torch.Tensor):
    """Normalized focal length -> field of view in radians."""
    half_fov = torch.atan(0.5 / focal)
    return half_fov * 2
71
+
72
+
73
def fov_to_focal(fov: torch.Tensor):
    """Field of view in radians -> normalized focal length (inverse of `focal_to_fov`)."""
    half_angle_tan = torch.tan(fov / 2)
    return 0.5 / half_angle_tan
75
+
76
+
77
def angle_diff_vec3(v1: torch.Tensor, v2: torch.Tensor, eps: float = 1e-12):
    """Angle (radians) between 3D vectors along the last dim, stable near 0 and pi via atan2."""
    cross_norm = torch.cross(v1, v2, dim=-1).norm(dim=-1)
    dot = (v1 * v2).sum(dim=-1)
    return torch.atan2(cross_norm + eps, dot)
79
+
80
def intrinsics_to_fov(intrinsics: torch.Tensor):
    """
    Returns field of view in radians from normalized intrinsics matrix.
    ### Parameters:
    - intrinsics: torch.Tensor of shape (..., 3, 3)

    ### Returns:
    - fov_x: torch.Tensor of shape (...)
    - fov_y: torch.Tensor of shape (...)
    """
    fov_x = 2 * torch.atan(0.5 / intrinsics[..., 0, 0])
    fov_y = 2 * torch.atan(0.5 / intrinsics[..., 1, 1])
    return fov_x, fov_y
93
+
94
+
95
def point_map_to_depth_legacy(points: torch.Tensor):
    """
    Legacy closed-form recovery of depth, FoV and Z shift from a point map,
    via a linear least-squares fit of a shared (focal, shift) pair.

    ### Parameters
    - `points`: point map of shape (..., H, W, 3).

    ### Returns
    - `depth`: (..., H, W) depth after applying the recovered shift.
    - `fov_x`, `fov_y`: (...) fields of view in radians.
    - `shift`: (...) recovered Z-axis shift.
    """
    height, width = points.shape[-3:-1]
    diagonal = (height ** 2 + width ** 2) ** 0.5
    uv = normalized_view_plane_uv(width, height, dtype=points.dtype, device=points.device)  # (H, W, 2)

    # Solve least squares problem
    b = (uv * points[..., 2:]).flatten(-3, -1)  # (..., H * W * 2)
    A = torch.stack([points[..., :2], -uv.expand_as(points[..., :2])], dim=-1).flatten(-4, -2)  # (..., H * W * 2, 2)

    M = A.transpose(-2, -1) @ A
    # Tikhonov-regularized normal equations; solution holds (focal, shift) in the last dim.
    solution = (torch.inverse(M + 1e-6 * torch.eye(2).to(A)) @ (A.transpose(-2, -1) @ b[..., None])).squeeze(-1)
    focal, shift = solution.unbind(-1)

    depth = points[..., 2] + shift[..., None, None]
    fov_x = torch.atan(width / diagonal / focal) * 2
    fov_y = torch.atan(height / diagonal / focal) * 2
    return depth, fov_x, fov_y, shift
112
+
113
+
114
def view_plane_uv_to_focal(uv: torch.Tensor):
    """Closed-form least-squares focal length from a predicted view-plane UV map of shape (..., H, W, 2)."""
    reference_uv = normalized_view_plane_uv(width=uv.shape[-2], height=uv.shape[-3], device=uv.device, dtype=uv.dtype)
    numerator = (uv * reference_uv).sum()
    denominator = uv.square().sum().add(1e-12)
    return numerator / denominator
118
+
119
+
120
def recover_focal_shift(points: torch.Tensor, mask: torch.Tensor = None, focal: torch.Tensor = None, downsample_size: Tuple[int, int] = (64, 64)):
    """
    Recover the depth map and FoV from a point map with unknown z shift and focal.

    Note that it assumes:
    - the optical center is at the center of the map
    - the map is undistorted
    - the map is isometric in the x and y directions

    ### Parameters:
    - `points: torch.Tensor` of shape (..., H, W, 3)
    - `mask: torch.Tensor` of shape (..., H, W). Optional valid-pixel mask.
    - `focal: torch.Tensor` of shape (...). Optional known focal; if given, only the shift is solved.
    - `downsample_size: Tuple[int, int]` in (height, width), the size of the downsampled map. Downsampling produces approximate solution and is efficient for large maps.

    ### Returns:
    - `focal`: torch.Tensor of shape (...) the estimated focal length, relative to the half diagonal of the map
    - `shift`: torch.Tensor of shape (...) Z-axis shift to translate the point map to camera space
    """
    shape = points.shape
    height, width = points.shape[-3], points.shape[-2]
    diagonal = (height ** 2 + width ** 2) ** 0.5

    # Flatten all leading batch dims so the per-item solver loop below is simple.
    points = points.reshape(-1, *shape[-3:])
    mask = None if mask is None else mask.reshape(-1, *shape[-3:-1])
    focal = focal.reshape(-1) if focal is not None else None
    uv = normalized_view_plane_uv(width, height, dtype=points.dtype, device=points.device)  # (H, W, 2)

    # Downsample point map, uv grid and mask to keep the solver fast.
    points_lr = F.interpolate(points.permute(0, 3, 1, 2), downsample_size, mode='nearest').permute(0, 2, 3, 1)
    uv_lr = F.interpolate(uv.unsqueeze(0).permute(0, 3, 1, 2), downsample_size, mode='nearest').squeeze(0).permute(1, 2, 0)
    mask_lr = None if mask is None else F.interpolate(mask.to(torch.float32).unsqueeze(1), downsample_size, mode='nearest').squeeze(1) > 0

    # The optimization itself runs per item on CPU/numpy (scipy-based solvers).
    uv_lr_np = uv_lr.cpu().numpy()
    points_lr_np = points_lr.detach().cpu().numpy()
    focal_np = focal.cpu().numpy() if focal is not None else None
    mask_lr_np = None if mask is None else mask_lr.cpu().numpy()
    optim_shift, optim_focal = [], []
    for i in range(points.shape[0]):
        points_lr_i_np = points_lr_np[i] if mask is None else points_lr_np[i][mask_lr_np[i]]
        uv_lr_i_np = uv_lr_np if mask is None else uv_lr_np[mask_lr_np[i]]
        if uv_lr_i_np.shape[0] < 2:
            # Degenerate (almost fully masked) item: identity focal, zero shift.
            optim_focal.append(1)
            optim_shift.append(0)
            continue
        if focal is None:
            optim_shift_i, optim_focal_i = solve_optimal_focal_shift(uv_lr_i_np, points_lr_i_np)
            optim_focal.append(float(optim_focal_i))
        else:
            optim_shift_i = solve_optimal_shift(uv_lr_i_np, points_lr_i_np, focal_np[i])
        optim_shift.append(float(optim_shift_i))
    optim_shift = torch.tensor(optim_shift, device=points.device, dtype=points.dtype).reshape(shape[:-3])

    if focal is None:
        optim_focal = torch.tensor(optim_focal, device=points.device, dtype=points.dtype).reshape(shape[:-3])
    else:
        optim_focal = focal.reshape(shape[:-3])

    return optim_focal, optim_shift
176
+
177
+
178
def mask_aware_nearest_resize(
    inputs: Union[torch.Tensor, Sequence[torch.Tensor], None],
    mask: torch.BoolTensor,
    size: Tuple[int, int],
    return_index: bool = False
) -> Tuple[Union[torch.Tensor, Sequence[torch.Tensor], None], torch.BoolTensor, Tuple[torch.LongTensor, ...]]:
    """
    Resize 2D map by nearest interpolation. Return the nearest neighbor index and mask of the resized map.

    Only pixels valid in `mask` are candidates, so invalid pixels never leak
    into the resized output.

    ### Parameters
    - `inputs`: a single or a list of input 2D map(s) of shape (..., H, W, ...).
    - `mask`: input 2D mask of shape (..., H, W)
    - `size`: target size (target_width, target_height)

    ### Returns
    - `*resized_maps`: resized map(s) of shape (..., target_height, target_width, ...).
    - `resized_mask`: mask of the resized map of shape (..., target_height, target_width)
    - `nearest_idx`: if return_index is True, nearest neighbor index of the resized map of shape (..., target_height, target_width) for each dimension, .
    """
    height, width = mask.shape[-2:]
    target_width, target_height = size
    device = mask.device
    # Fractional and integer footprint of one target pixel in source pixels.
    filter_h_f, filter_w_f = max(1, height / target_height), max(1, width / target_width)
    filter_h_i, filter_w_i = math.ceil(filter_h_f), math.ceil(filter_w_f)
    filter_size = filter_h_i * filter_w_i
    padding_h, padding_w = filter_h_i // 2 + 1, filter_w_i // 2 + 1

    # Window the original mask and uv.
    # Zero-pad so every target window lies fully inside the padded tensors.
    uv = utils3d.torch.image_pixel_center(width=width, height=height, dtype=torch.float32, device=device)
    indices = torch.arange(height * width, dtype=torch.long, device=device).reshape(height, width)
    padded_uv = torch.full((height + 2 * padding_h, width + 2 * padding_w, 2), 0, dtype=torch.float32, device=device)
    padded_uv[padding_h:padding_h + height, padding_w:padding_w + width] = uv
    padded_mask = torch.full((*mask.shape[:-2], height + 2 * padding_h, width + 2 * padding_w), False, dtype=torch.bool, device=device)
    padded_mask[..., padding_h:padding_h + height, padding_w:padding_w + width] = mask
    padded_indices = torch.full((height + 2 * padding_h, width + 2 * padding_w), 0, dtype=torch.long, device=device)
    padded_indices[padding_h:padding_h + height, padding_w:padding_w + width] = indices
    windowed_uv = utils3d.torch.sliding_window_2d(padded_uv, (filter_h_i, filter_w_i), 1, dim=(0, 1))
    windowed_mask = utils3d.torch.sliding_window_2d(padded_mask, (filter_h_i, filter_w_i), 1, dim=(-2, -1))
    windowed_indices = utils3d.torch.sliding_window_2d(padded_indices, (filter_h_i, filter_w_i), 1, dim=(0, 1))

    # Gather the target pixels's local window
    target_uv = utils3d.torch.image_uv(width=target_width, height=target_height, dtype=torch.float32, device=device) * torch.tensor([width, height], dtype=torch.float32, device=device)
    target_lefttop = target_uv - torch.tensor((filter_w_f / 2, filter_h_f / 2), dtype=torch.float32, device=device)
    target_window = torch.round(target_lefttop).long() + torch.tensor((padding_w, padding_h), dtype=torch.long, device=device)

    target_window_uv = windowed_uv[target_window[..., 1], target_window[..., 0], :, :, :].reshape(target_height, target_width, 2, filter_size)  # (target_height, tgt_width, 2, filter_size)
    target_window_mask = windowed_mask[..., target_window[..., 1], target_window[..., 0], :, :].reshape(*mask.shape[:-2], target_height, target_width, filter_size)  # (..., target_height, tgt_width, filter_size)
    target_window_indices = windowed_indices[target_window[..., 1], target_window[..., 0], :, :].reshape(target_height, target_width, filter_size)  # (target_height, tgt_width, filter_size)
    target_window_indices = target_window_indices.expand_as(target_window_mask)

    # Compute nearest neighbor in the local window for each pixel.
    # Invalid candidates get infinite distance so argmin never picks them.
    dist = torch.where(target_window_mask, torch.norm(target_window_uv - target_uv[..., None], dim=-2), torch.inf)  # (..., target_height, tgt_width, filter_size)
    nearest = torch.argmin(dist, dim=-1, keepdim=True)  # (..., target_height, tgt_width, 1)
    nearest_idx = torch.gather(target_window_indices, index=nearest, dim=-1).squeeze(-1)  # (..., target_height, tgt_width)
    target_mask = torch.any(target_window_mask, dim=-1)
    nearest_i, nearest_j = nearest_idx // width, nearest_idx % width
    batch_indices = [torch.arange(n, device=device).reshape([1] * i + [n] + [1] * (mask.dim() - i - 1)) for i, n in enumerate(mask.shape[:-2])]

    index = (*batch_indices, nearest_i, nearest_j)

    if inputs is None:
        outputs = None
    elif isinstance(inputs, torch.Tensor):
        outputs = inputs[index]
    elif isinstance(inputs, Sequence):
        outputs = tuple(x[index] for x in inputs)
    else:
        raise ValueError(f'Invalid input type: {type(inputs)}')

    if return_index:
        return outputs, target_mask, index
    else:
        return outputs, target_mask
251
+
252
+
253
def theshold_depth_change(depth: torch.Tensor, mask: torch.Tensor, pooler: Literal['min', 'max'], rtol: float = 0.2, kernel_size: int = 3):
    """
    Flag pixels whose depth differs by more than `rtol` (relative) from the
    min/max of their local `kernel_size` neighborhood.

    NOTE(review): the name is a typo of "threshold_depth_change"; kept as-is
    for caller compatibility.

    ### Parameters
    - `depth`: (..., H, W) depth map.
    - `mask`: (..., H, W) validity mask; invalid pixels are excluded from pooling.
    - `pooler`: 'max' flags pixels much nearer than some neighbor; 'min' flags pixels much farther.
    - `rtol`: relative depth-change threshold.
    - `kernel_size`: neighborhood size for pooling.

    ### Returns
    - Boolean tensor of shape (..., H, W).
    """
    *batch_shape, height, width = depth.shape
    depth = depth.reshape(-1, 1, height, width)
    mask = mask.reshape(-1, 1, height, width)
    if pooler =='max':
        # Invalid pixels become -inf so they never win the max pooling.
        pooled_depth = F.max_pool2d(torch.where(mask, depth, -torch.inf), kernel_size, stride=1, padding=kernel_size // 2)
        output_mask = pooled_depth > depth * (1 + rtol)
    elif pooler =='min':
        # Min pooling implemented as negated max pooling over negated depth.
        pooled_depth = -F.max_pool2d(-torch.where(mask, depth, torch.inf), kernel_size, stride=1, padding=kernel_size // 2)
        output_mask = pooled_depth < depth * (1 - rtol)
    else:
        raise ValueError(f'Unsupported pooler: {pooler}')
    output_mask = output_mask.reshape(*batch_shape, height, width)
    return output_mask
267
+
268
+
269
def depth_occlusion_edge(depth: torch.FloatTensor, mask: torch.BoolTensor, kernel_size: int = 3, tol: float = 0.1):
    """
    Detect occlusion edges in a depth map via a local affine disparity fit.

    NOTE(review): this definition is shadowed by a second `depth_occlusion_edge`
    defined later in this module; only the later one is effective at import time.

    ### Parameters
    - `depth`: (..., H, W) depth map.
    - `mask`: (..., H, W) validity mask.
    - `kernel_size`: local window size for the affine fit.
    - `tol`: relative disparity deviation that counts as an edge.

    ### Returns
    - `fg_edge_mask`, `bg_edge_mask`: boolean masks of foreground/background edge pixels.
    """
    device, dtype = depth.device, depth.dtype

    # Work in disparity (1/depth); invalid pixels contribute 0.
    disp = torch.where(mask, 1 / depth, 0)
    disp_pad = F.pad(disp, (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2), value=0)
    mask_pad = F.pad(mask, (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2), value=False)
    disp_window = utils3d.torch.sliding_window_2d(disp_pad, (kernel_size, kernel_size), 1, dim=(-2, -1)).flatten(-2)  # [..., H, W, kernel_size ** 2]
    mask_window = utils3d.torch.sliding_window_2d(mask_pad, (kernel_size, kernel_size), 1, dim=(-2, -1)).flatten(-2)  # [..., H, W, kernel_size ** 2]

    # Design matrix of an affine (a*x + b*y + c) disparity model over the window;
    # invalid pixels are zeroed out of the fit via the mask.
    x = torch.linspace(-kernel_size // 2, kernel_size // 2, kernel_size, device=device, dtype=dtype)
    A = torch.stack([*torch.meshgrid(x, x, indexing='xy'), torch.ones((kernel_size, kernel_size), device=device, dtype=dtype)], dim=-1).reshape(kernel_size ** 2, 3)  # [kernel_size ** 2, 3]
    A = mask_window[..., None] * A
    I = torch.eye(3, device=device, dtype=dtype)

    # Regularized least-squares projection of each window onto the affine model.
    affine_disp_window = (disp_window[..., None, :] @ A @ torch.inverse(A.mT @ A + 1e-5 * I) @ A.mT).clamp_min(1e-12)[..., 0, :]  # [..., H, W, kernel_size ** 2]
    diff = torch.where(mask_window, torch.maximum(affine_disp_window, disp_window) / torch.minimum(affine_disp_window, disp_window) - 1, 0)

    # Edge where the affine model fails to explain the window's disparity.
    edge_mask = mask & (diff > tol).any(dim=-1)

    # Split into foreground (nearer than neighborhood mean) and background edges.
    disp_mean = weighted_mean(disp_window, mask_window, dim=-1)
    fg_edge_mask = edge_mask & (disp > disp_mean)
    # fg_edge_mask = edge_mask & theshold_depth_change(depth, mask, pooler='max', rtol=tol, kernel_size=kernel_size)
    bg_edge_mask = edge_mask & ~fg_edge_mask
    return fg_edge_mask, bg_edge_mask
293
+
294
+
295
def depth_occlusion_edge(depth: torch.FloatTensor, mask: torch.BoolTensor, kernel_size: int = 3, tol: float = 0.1):
    """Split masked depth pixels into foreground/background occlusion-edge masks.

    Simpler variant (this definition overrides the earlier one with the same name):
    a pixel is a foreground edge if its disparity (1/depth) exceeds the local masked
    mean by a relative factor > tol, and a background edge in the symmetric case.
    Each side is then kept only where the opposite side occurs within a slightly
    larger window, so fg/bg edges always appear in facing pairs.
    Relies on `utils3d.torch.sliding_window_2d` and a module-level `weighted_mean`.
    """
    device, dtype = depth.device, depth.dtype  # dtype currently unused; kept for parity with the variant above

    disp = torch.where(mask, 1 / depth, 0)
    disp_pad = F.pad(disp, (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2), value=0)
    mask_pad = F.pad(mask, (kernel_size // 2, kernel_size // 2, kernel_size // 2, kernel_size // 2), value=False)
    disp_window = utils3d.torch.sliding_window_2d(disp_pad, (kernel_size, kernel_size), 1, dim=(-2, -1))  # [..., H, W, kernel_size, kernel_size]
    mask_window = utils3d.torch.sliding_window_2d(mask_pad, (kernel_size, kernel_size), 1, dim=(-2, -1))  # [..., H, W, kernel_size, kernel_size]

    # Local mean disparity over the masked window samples.
    disp_mean = weighted_mean(disp_window, mask_window, dim=(-2, -1))
    fg_edge_mask = mask & (disp / disp_mean > 1 + tol)
    bg_edge_mask = mask & (disp_mean / disp > 1 + tol)

    # Keep an edge pixel only if the opposite-side edge exists nearby (max_pool2d as dilation).
    fg_edge_mask = fg_edge_mask & F.max_pool2d(bg_edge_mask.float(), kernel_size + 2, stride=1, padding=kernel_size // 2 + 1).bool()
    bg_edge_mask = bg_edge_mask & F.max_pool2d(fg_edge_mask.float(), kernel_size + 2, stride=1, padding=kernel_size // 2 + 1).bool()

    return fg_edge_mask, bg_edge_mask
312
+
313
+
314
def dilate_with_mask(input: torch.Tensor, mask: torch.BoolTensor, filter: Literal['min', 'max', 'mean', 'median'] = 'mean', iterations: int = 1) -> Tuple[torch.Tensor, torch.BoolTensor]:
    """Iteratively fill invalid pixels of `input` from their valid 4-neighbors.

    Each iteration grows the valid region by one 4-connected ring: every invalid
    pixel that has at least one valid 4-neighbor receives the min/max/mean/median
    of those neighbors (per `filter`); already-valid pixels are left untouched.

    Parameters:
        input: values over a (..., H, W) grid.
        mask: bool validity mask, same trailing shape as `input`.
        filter: aggregation used to fill from valid neighbors.
        iterations: number of one-ring growth steps.
    Returns:
        (filled input, grown validity mask).
        FIX: the original return annotation claimed a single Tensor, but a
        (tensor, mask) tuple has always been returned.
    """
    # 4-connected (cross-shaped) structuring element.
    kernel = torch.tensor([[False, True, False], [True, True, True], [False, True, False]], device=input.device, dtype=torch.bool)
    for _ in range(iterations):
        input_window = utils3d.torch.sliding_window_2d(F.pad(input, (1, 1, 1, 1), mode='constant', value=0), window_size=3, stride=1, dim=(-2, -1))
        mask_window = kernel & utils3d.torch.sliding_window_2d(F.pad(mask, (1, 1, 1, 1), mode='constant', value=False), window_size=3, stride=1, dim=(-2, -1))
        if filter == 'min':
            # FIX: Tensor.min/max do not accept a tuple `dim`; amin/amax do.
            input = torch.where(mask, input, torch.where(mask_window, input_window, torch.inf).amin(dim=(-2, -1)))
        elif filter == 'max':
            input = torch.where(mask, input, torch.where(mask_window, input_window, -torch.inf).amax(dim=(-2, -1)))
        elif filter == 'mean':
            input = torch.where(mask, input, torch.where(mask_window, input_window, torch.nan).nanmean(dim=(-2, -1)))
        elif filter == 'median':
            input = torch.where(mask, input, torch.where(mask_window, input_window, torch.nan).flatten(-2).nanmedian(dim=-1).values)
        # Flatten before `any` for compatibility with torch versions lacking tuple-dim any().
        mask = mask_window.flatten(-2).any(dim=-1)
    return input, mask
329
+
330
+
331
def refine_depth_with_normal(depth: torch.Tensor, normal: torch.Tensor, intrinsics: torch.Tensor, iterations: int = 10, damp: float = 1e-3, eps: float = 1e-12, kernel_size: int = 5) -> torch.Tensor:
    """Refine a depth map so its log-depth gradients better agree with a normal map.

    Builds per-pixel log-depth gradients implied by `normal` under camera
    `intrinsics`, turns them into a weighted discrete Laplacian, then runs damped
    Jacobi-style iterations on log-depth. Border pixels (within kernel_size // 2)
    are left unchanged.

    Parameters:
        depth: (..., H, W) depth map. NOTE: no longer mutated in place (see FIX below).
        normal: (..., H, W, 3) camera-space normal map — assumed layout; TODO confirm.
        intrinsics: (..., 3, 3) normalized camera intrinsics matrix.
        iterations: number of smoothing iterations.
        damp: damping toward the original log-depth.
        eps: numerical floor for depths/weights.
        kernel_size: neighborhood size for gradient aggregation.
    Returns:
        Refined depth map, same shape as `depth`.
    """
    device, dtype = depth.device, depth.dtype
    height, width = depth.shape[-2:]
    radius = kernel_size // 2

    # Normalized-UV offsets of each window sample relative to the center pixel.
    duv = torch.stack(torch.meshgrid(torch.linspace(-radius / width, radius / width, kernel_size, device=device, dtype=dtype), torch.linspace(-radius / height, radius / height, kernel_size, device=device, dtype=dtype), indexing='xy'), dim=-1).to(dtype=dtype, device=device)

    # FIX: was `depth.clamp_min_(eps)` which silently mutated the caller's tensor in place.
    log_depth = depth.clamp_min(eps).log()
    log_depth_diff = utils3d.torch.sliding_window_2d(log_depth, window_size=kernel_size, stride=1, dim=(-2, -1)) - log_depth[..., radius:-radius, radius:-radius, None, None]

    # Edge-aware weights: samples with large log-depth jumps contribute less.
    weight = torch.exp(-(log_depth_diff / duv.norm(dim=-1).clamp_min_(eps) / 10).square())
    tot_weight = weight.sum(dim=(-2, -1)).clamp_min_(eps)

    uv = utils3d.torch.image_uv(height=height, width=width, device=device, dtype=dtype)
    K_inv = torch.inverse(intrinsics)

    # Analytic gradient of log-depth implied by the normal under the pinhole model.
    grad = -(normal[..., None, :2] @ K_inv[..., None, None, :2, :2]).squeeze(-2) \
        / (normal[..., None, 2:] + normal[..., None, :2] @ (K_inv[..., None, None, :2, :2] @ uv[..., :, None] + K_inv[..., None, None, :2, 2:])).squeeze(-2)
    laplacian = (weight * ((utils3d.torch.sliding_window_2d(grad, window_size=kernel_size, stride=1, dim=(-3, -2)) + grad[..., radius:-radius, radius:-radius, :, None, None]) * (duv.permute(2, 0, 1) / 2)).sum(dim=-3)).sum(dim=(-2, -1))

    # Clamp to keep a few outlier normals from destabilizing the iteration.
    laplacian = laplacian.clamp(-0.1, 0.1)
    log_depth_refine = log_depth.clone()

    # Damped Jacobi iterations with 0.1/0.9 under-relaxation; borders stay fixed.
    for _ in range(iterations):
        log_depth_refine[..., radius:-radius, radius:-radius] = 0.1 * log_depth_refine[..., radius:-radius, radius:-radius] + 0.9 * (damp * log_depth[..., radius:-radius, radius:-radius] - laplacian + (weight * utils3d.torch.sliding_window_2d(log_depth_refine, window_size=kernel_size, stride=1, dim=(-2, -1))).sum(dim=(-2, -1))) / (tot_weight + damp)

    depth_refine = log_depth_refine.exp()

    return depth_refine
moge/utils/io.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ import os
7
+ os.environ['OPENCV_IO_ENABLE_OPENEXR'] = '1'
8
+ from typing import IO
9
+ import zipfile
10
+ import json
11
+ import io
12
+ from typing import *
13
+ from pathlib import Path
14
+ import re
15
+ from PIL import Image, PngImagePlugin
16
+
17
+ import numpy as np
18
+ import cv2
19
+
20
+ from .tools import timeit
21
+
22
+
23
def save_glb(
    save_path: Union[str, os.PathLike],
    vertices: np.ndarray,
    faces: np.ndarray,
    vertex_uvs: np.ndarray,
    texture: np.ndarray,
    vertex_normals: Optional[np.ndarray] = None,
):
    """Export a textured triangle mesh as a GLB file via trimesh.

    Parameters:
        save_path: output path (format inferred by trimesh from the extension).
        vertices: (V, 3) float vertex positions.
        faces: (F, 3) int triangle indices.
        vertex_uvs: (V, 2) per-vertex texture coordinates.
        texture: (H, W, 3) uint8 base-color image applied as a PBR material.
        vertex_normals: optional (V, 3) per-vertex normals.
    """
    # Imported locally so trimesh/PIL stay optional dependencies of this module.
    import trimesh
    import trimesh.visual
    from PIL import Image

    trimesh.Trimesh(
        vertices=vertices,
        vertex_normals=vertex_normals,
        faces=faces,
        visual = trimesh.visual.texture.TextureVisuals(
            uv=vertex_uvs,
            material=trimesh.visual.material.PBRMaterial(
                baseColorTexture=Image.fromarray(texture),
                metallicFactor=0.5,
                roughnessFactor=1.0
            )
        ),
        process=False  # keep vertex order and indices exactly as given
    ).export(save_path)
49
+
50
+
51
def save_ply(
    save_path: Union[str, os.PathLike],
    vertices: np.ndarray,
    faces: np.ndarray,
    vertex_colors: np.ndarray,
    vertex_normals: Optional[np.ndarray] = None,
):
    """Export a vertex-colored triangle mesh as a PLY file via trimesh.

    Parameters:
        save_path: output path (format inferred by trimesh from the extension).
        vertices: (V, 3) float vertex positions.
        faces: (F, 3) int triangle indices.
        vertex_colors: (V, 3) or (V, 4) per-vertex colors.
        vertex_normals: optional (V, 3) per-vertex normals.
    """
    # Imported locally so trimesh stays an optional dependency of this module.
    # FIX: removed unused local imports (`trimesh.visual`, `PIL.Image`).
    import trimesh

    trimesh.Trimesh(
        vertices=vertices,
        faces=faces,
        vertex_colors=vertex_colors,
        vertex_normals=vertex_normals,
        process=False  # keep vertex order and indices exactly as given
    ).export(save_path)
69
+
70
+
71
def read_image(path: Union[str, os.PathLike, IO]) -> np.ndarray:
    """
    Read an image file, returning a uint8 RGB array of shape (H, W, 3).

    Accepts a filesystem path or a binary file-like object. Decoding is done by
    OpenCV (BGR) and converted to RGB before returning.
    """
    if isinstance(path, (str, os.PathLike)):
        data = Path(path).read_bytes()
    else:
        data = path.read()
    # cv2.imdecode yields BGR; convert to the RGB convention used by this module.
    image = cv2.cvtColor(cv2.imdecode(np.frombuffer(data, np.uint8), cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)
    return image
81
+
82
+
83
def write_image(path: Union[str, os.PathLike, IO], image: np.ndarray, quality: int = 95):
    """
    Write an image, given a uint8 RGB array of shape (H, W, 3).

    Accepts a filesystem path or a binary file-like object.
    NOTE: the data is always JPEG-encoded (lossy, with the given `quality`),
    regardless of the extension in `path`.
    """
    data = cv2.imencode('.jpg', cv2.cvtColor(image, cv2.COLOR_RGB2BGR), [cv2.IMWRITE_JPEG_QUALITY, quality])[1].tobytes()
    if isinstance(path, (str, os.PathLike)):
        Path(path).write_bytes(data)
    else:
        path.write(data)
92
+
93
+
94
def read_depth(path: Union[str, os.PathLike, IO]) -> Tuple[np.ndarray, float]:
    """
    Read a depth image encoded by `write_depth`, returning (depth, unit).

    The 16-bit PNG stores depth logarithmically between the `near`/`far` values
    recorded in the PNG text metadata: pixel 0 decodes to NaN (unknown),
    65535 to +inf, and 1..65534 to `near ** (1 - t) * far ** t`.

    Returns:
        depth: float32 array of shape (H, W).
        unit: the depth unit from metadata, or None if absent.

    NOTE(review): if the 'near'/'far' text chunks are missing, `float(None)`
    raises TypeError — assumes the file was produced by `write_depth`.
    """
    if isinstance(path, (str, os.PathLike)):
        data = Path(path).read_bytes()
    else:
        data = path.read()
    pil_image = Image.open(io.BytesIO(data))
    near = float(pil_image.info.get('near'))
    far = float(pil_image.info.get('far'))
    unit = float(pil_image.info.get('unit')) if 'unit' in pil_image.info else None
    depth = np.array(pil_image)
    # Sentinel codes: 0 = unknown (NaN), 65535 = infinity.
    mask_nan, mask_inf = depth == 0, depth == 65535
    # Map 1..65534 to [0, 1], then exponentially interpolate between near and far.
    depth = (depth.astype(np.float32) - 1) / 65533
    depth = near ** (1 - depth) * far ** depth
    depth[mask_nan] = np.nan
    depth[mask_inf] = np.inf
    return depth, unit
113
+
114
+
115
def write_depth(
    path: Union[str, os.PathLike, IO],
    depth: np.ndarray,
    unit: float = None,
    max_range: float = 1e5,
    compression_level: int = 7,
):
    """
    Encode and write a depth image as 16-bit PNG format.
    ### Parameters:
    - `path: Union[str, os.PathLike, IO]`
        The file path or file object to write to.
    - `depth: np.ndarray`
        The depth array, float32 array of shape (H, W).
        May contain `NaN` for invalid values and `Inf` for infinite values.
    - `unit: float = None`
        The unit of the depth values.
    - `max_range: float = 1e5`
        Maximum far/near ratio preserved by the encoding.
    - `compression_level: int = 7`
        PNG compression level.

    Depth values are encoded as follows:
    - 0: unknown
    - 1 ~ 65534: depth values in logarithmic
    - 65535: infinity

    metadata is stored in the PNG file as text fields:
    - `near`: the minimum depth value
    - `far`: the maximum depth value
    - `unit`: the unit of the depth values (optional)

    NOTE(review): assumes at least one finite depth value; an all-NaN/inf input
    makes `.min()` on an empty selection raise.
    """
    mask_values, mask_nan, mask_inf = np.isfinite(depth), np.isnan(depth), np.isinf(depth)

    depth = depth.astype(np.float32)
    # FIX: removed dead statement `mask_finite = depth` (assigned but never used).
    near = max(depth[mask_values].min(), 1e-5)
    far = max(near * 1.1, min(depth[mask_values].max(), near * max_range))
    # Logarithmic quantization into 1..65534; 0 and 65535 are reserved sentinels.
    depth = 1 + np.round((np.log(np.nan_to_num(depth, nan=0).clip(near, far) / near) / np.log(far / near)).clip(0, 1) * 65533).astype(np.uint16)  # 1~65534
    depth[mask_nan] = 0
    depth[mask_inf] = 65535

    pil_image = Image.fromarray(depth)
    pnginfo = PngImagePlugin.PngInfo()
    pnginfo.add_text('near', str(near))
    pnginfo.add_text('far', str(far))
    if unit is not None:
        pnginfo.add_text('unit', str(unit))
    pil_image.save(path, pnginfo=pnginfo, compress_level=compression_level)
160
+
161
+
162
def read_segmentation(path: Union[str, os.PathLike, IO]) -> Tuple[np.ndarray, Dict[str, int]]:
    """
    Read a segmentation mask written by `write_segmentation`.
    ### Parameters:
    - `path: Union[str, os.PathLike, IO]`
        The file path or file object to read from.
    ### Returns:
    - `Tuple[np.ndarray, Dict[str, int]]`
        A tuple containing:
        - `mask`: uint8 or uint16 numpy.ndarray of shape (H, W).
        - `labels`: Dict[str, int]. The label mapping, a dictionary of {label_name: label_id},
          or None if the PNG has no 'labels' text chunk.
    """
    if isinstance(path, (str, os.PathLike)):
        data = Path(path).read_bytes()
    else:
        data = path.read()
    pil_image = Image.open(io.BytesIO(data))
    # The label mapping is stored as a JSON string in the PNG 'labels' text chunk.
    labels = json.loads(pil_image.info['labels']) if 'labels' in pil_image.info else None
    mask = np.array(pil_image)
    return mask, labels
182
+
183
+
184
def write_segmentation(path: Union[str, os.PathLike, IO], mask: np.ndarray, labels: Dict[str, int] = None, compression_level: int = 7):
    """
    Write a segmentation mask and label mapping, as PNG format.
    ### Parameters:
    - `path: Union[str, os.PathLike, IO]`
        The file path or file object to write to.
    - `mask: np.ndarray`
        The segmentation mask, uint8 or uint16 array of shape (H, W).
    - `labels: Dict[str, int] = None`
        The label mapping, a dictionary of {label_name: label_id}.
    - `compression_level: int = 7`
        The compression level for PNG compression.
    """
    assert mask.dtype == np.uint8 or mask.dtype == np.uint16, f"Unsupported dtype {mask.dtype}"
    pil_image = Image.fromarray(mask)
    pnginfo = PngImagePlugin.PngInfo()
    if labels is not None:
        # Store the mapping as compact ASCII JSON in a PNG text chunk.
        labels_json = json.dumps(labels, ensure_ascii=True, separators=(',', ':'))
        pnginfo.add_text('labels', labels_json)
    pil_image.save(path, pnginfo=pnginfo, compress_level=compression_level)
204
+
205
+
206
+
207
def read_normal(path: Union[str, os.PathLike, IO]) -> np.ndarray:
    """
    Read a normal image written by `write_normal`, returning a float32 array
    of shape (H, W, 3) of unit normals; all-zero pixels decode to NaN.
    """
    if isinstance(path, (str, os.PathLike)):
        data = Path(path).read_bytes()
    else:
        data = path.read()
    normal = cv2.cvtColor(cv2.imdecode(np.frombuffer(data, np.uint8), cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB)
    # All-zero pixels are the sentinel for invalid normals.
    mask_nan = np.all(normal == 0, axis=-1)
    # Undo the [0, 65535] quantization; y and z are stored sign-flipped (see write_normal).
    normal = (normal.astype(np.float32) / 65535 - 0.5) * [2.0, -2.0, -2.0]
    # Renormalize to compensate for quantization error.
    normal = normal / (np.sqrt(np.square(normal[..., 0]) + np.square(normal[..., 1]) + np.square(normal[..., 2])) + 1e-12)
    normal[mask_nan] = np.nan
    return normal
221
+
222
+
223
def write_normal(path: Union[str, os.PathLike, IO], normal: np.ndarray, compression_level: int = 7) -> np.ndarray:
    """
    Write a normal image as 16-bit PNG, given a float32 array of shape (H, W, 3).

    Components are mapped from [-1, 1] to [0, 65535] with y and z sign-flipped;
    NaN normals are stored as all-zero pixels (the sentinel `read_normal` expects).
    """
    mask_nan = np.isnan(normal).any(axis=-1)
    normal = ((normal * [0.5, -0.5, -0.5] + 0.5).clip(0, 1) * 65535).astype(np.uint16)
    normal[mask_nan] = 0
    data = cv2.imencode('.png', cv2.cvtColor(normal, cv2.COLOR_RGB2BGR), [cv2.IMWRITE_PNG_COMPRESSION, compression_level])[1].tobytes()
    if isinstance(path, (str, os.PathLike)):
        Path(path).write_bytes(data)
    else:
        path.write(data)
235
+
236
+
237
def read_meta(path: Union[str, os.PathLike, IO]) -> Dict[str, Any]:
    """Load a JSON metadata file into a dictionary."""
    text = Path(path).read_text()
    return json.loads(text)
239
+
240
def write_meta(path: Union[str, os.PathLike, IO], meta: Dict[str, Any]):
    """Serialize `meta` as JSON and write it to `path`."""
    serialized = json.dumps(meta)
    Path(path).write_text(serialized)
moge/utils/panorama.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ import os
7
+ os.environ['OPENCV_IO_ENABLE_OPENEXR'] = '1'
8
+ from pathlib import Path
9
+ from typing import *
10
+ import itertools
11
+ import json
12
+ import warnings
13
+
14
+ import cv2
15
+ import numpy as np
16
+ from numpy import ndarray
17
+ from tqdm import tqdm, trange
18
+ from scipy.sparse import csr_array, hstack, vstack
19
+ from scipy.ndimage import convolve
20
+ from scipy.sparse.linalg import lsmr
21
+
22
+ import utils3d
23
+
24
+
25
def get_panorama_cameras():
    """Return virtual cameras covering the full sphere for panorama splitting.

    One camera per icosahedron vertex, each looking outward from the origin with
    a 90x90-degree FoV (shared intrinsics), z-up.

    Returns:
        (extrinsics, intrinsics): float32 extrinsics array and a list with the
        same intrinsics repeated once per camera.
    """
    vertices, _ = utils3d.numpy.icosahedron()
    intrinsics = utils3d.numpy.intrinsics_from_fov(fov_x=np.deg2rad(90), fov_y=np.deg2rad(90))
    extrinsics = utils3d.numpy.extrinsics_look_at([0, 0, 0], vertices, [0, 0, 1]).astype(np.float32)
    return extrinsics, [intrinsics] * len(vertices)
30
+
31
+
32
def spherical_uv_to_directions(uv: np.ndarray):
    """Map equirectangular UV coordinates in [0, 1]^2 to unit direction vectors.

    u runs westward (theta = (1 - u) * 2*pi), v runs from the +z pole (v = 0)
    to the -z pole (v = 1). Inverse of `directions_to_spherical_uv`.
    """
    theta = (1 - uv[..., 0]) * (2 * np.pi)
    phi = uv[..., 1] * np.pi
    sin_phi = np.sin(phi)
    directions = np.stack([sin_phi * np.cos(theta), sin_phi * np.sin(theta), np.cos(phi)], axis=-1)
    return directions
36
+
37
+
38
def directions_to_spherical_uv(directions: np.ndarray):
    """Map direction vectors to equirectangular UV coordinates in [0, 1]^2.

    Inputs need not be unit length; they are normalized first.
    Inverse of `spherical_uv_to_directions`.
    """
    unit = directions / np.linalg.norm(directions, axis=-1, keepdims=True)
    # NB: % binds tighter than -, so this is 1 - ((atan2 / 2pi) mod 1).
    u = 1 - np.arctan2(unit[..., 1], unit[..., 0]) / (2 * np.pi) % 1.0
    v = np.arccos(unit[..., 2]) / np.pi
    return np.stack([u, v], axis=-1)
43
+
44
+
45
def split_panorama_image(image: np.ndarray, extrinsics: np.ndarray, intrinsics: np.ndarray, resolution: int):
    """Resample an equirectangular panorama into one square perspective view per camera.

    Parameters:
        image: (H, W, C) equirectangular panorama.
        extrinsics / intrinsics: per-camera matrices, e.g. from `get_panorama_cameras`.
        resolution: output side length of each perspective view.
    Returns:
        List of (resolution, resolution, C) views, one per camera.
    """
    height, width = image.shape[:2]
    uv = utils3d.numpy.image_uv(width=resolution, height=resolution)
    splitted_images = []
    for i in range(len(extrinsics)):
        # For each output pixel, find the panorama location its view ray hits.
        spherical_uv = directions_to_spherical_uv(utils3d.numpy.unproject_cv(uv, extrinsics=extrinsics[i], intrinsics=intrinsics[i]))
        pixels = utils3d.numpy.uv_to_pixel(spherical_uv, width=width, height=height).astype(np.float32)

        splitted_image = cv2.remap(image, pixels[..., 0], pixels[..., 1], interpolation=cv2.INTER_LINEAR)
        splitted_images.append(splitted_image)
    return splitted_images
56
+
57
+
58
def poisson_equation(width: int, height: int, wrap_x: bool = False, wrap_y: bool = False) -> csr_array:
    """Build the sparse 5-point Laplacian operator for a height x width grid.

    Each row encodes -4 * x[center] + sum of the 4 neighbors. With wrap_x/wrap_y
    the grid is periodic along that axis; otherwise edge padding replicates the
    border pixel (duplicate column entries are summed by the sparse format, so
    every row still sums to zero).

    FIX: return annotation said Tuple[csr_array, ndarray], but only the matrix
    `A` is returned.

    Returns:
        csr_array of shape (height * width, height * width).
    """
    grid_index = np.arange(height * width).reshape(height, width)
    grid_index = np.pad(grid_index, ((0, 0), (1, 1)), mode='wrap' if wrap_x else 'edge')
    grid_index = np.pad(grid_index, ((1, 1), (0, 0)), mode='wrap' if wrap_y else 'edge')

    data = np.array([[-4, 1, 1, 1, 1]], dtype=np.float32).repeat(height * width, axis=0).reshape(-1)
    indices = np.stack([
        grid_index[1:-1, 1:-1],
        grid_index[:-2, 1:-1],     # up
        grid_index[2:, 1:-1],      # down
        grid_index[1:-1, :-2],     # left
        grid_index[1:-1, 2:]       # right
    ], axis=-1).reshape(-1)
    indptr = np.arange(0, height * width * 5 + 1, 5)  # exactly 5 entries per row
    A = csr_array((data, indices, indptr), shape=(height * width, height * width))

    return A
75
+
76
+
77
def grad_equation(width: int, height: int, wrap_x: bool = False, wrap_y: bool = False) -> csr_array:
    """Build the sparse forward-difference (gradient) operator for a height x width grid.

    Rows come in two groups: horizontal differences x[i, j] - x[i, j + 1] first,
    then vertical differences x[i, j] - x[i + 1, j]. With wrap_x/wrap_y the grid is
    periodic along that axis (one extra wrapped column/row of pairs is added).

    FIX: return annotation said Tuple[csr_array, np.ndarray], but only the matrix
    `A` is returned.

    Returns:
        csr_array of shape (#horizontal pairs + #vertical pairs, height * width).
    """
    grid_index = np.arange(width * height).reshape(height, width)
    if wrap_x:
        grid_index = np.pad(grid_index, ((0, 0), (0, 1)), mode='wrap')
    if wrap_y:
        grid_index = np.pad(grid_index, ((0, 1), (0, 0)), mode='wrap')

    # Each row holds a (+1, -1) pair: +1 on the first pixel, -1 on its neighbor.
    data = np.concatenate([
        np.concatenate([
            np.ones((grid_index.shape[0], grid_index.shape[1] - 1), dtype=np.float32).reshape(-1, 1),    # x[i,j]
            -np.ones((grid_index.shape[0], grid_index.shape[1] - 1), dtype=np.float32).reshape(-1, 1),   # x[i,j-1]
        ], axis=1).reshape(-1),
        np.concatenate([
            np.ones((grid_index.shape[0] - 1, grid_index.shape[1]), dtype=np.float32).reshape(-1, 1),    # x[i,j]
            -np.ones((grid_index.shape[0] - 1, grid_index.shape[1]), dtype=np.float32).reshape(-1, 1),   # x[i-1,j]
        ], axis=1).reshape(-1),
    ])
    indices = np.concatenate([
        np.concatenate([
            grid_index[:, :-1].reshape(-1, 1),
            grid_index[:, 1:].reshape(-1, 1),
        ], axis=1).reshape(-1),
        np.concatenate([
            grid_index[:-1, :].reshape(-1, 1),
            grid_index[1:, :].reshape(-1, 1),
        ], axis=1).reshape(-1),
    ])
    indptr = np.arange(0, grid_index.shape[0] * (grid_index.shape[1] - 1) * 2 + (grid_index.shape[0] - 1) * grid_index.shape[1] * 2 + 1, 2)  # 2 entries per row
    A = csr_array((data, indices, indptr), shape=(grid_index.shape[0] * (grid_index.shape[1] - 1) + (grid_index.shape[0] - 1) * grid_index.shape[1], height * width))

    return A
108
+
109
+
110
def merge_panorama_depth(width: int, height: int, distance_maps: List[np.ndarray], pred_masks: List[np.ndarray], extrinsics: List[np.ndarray], intrinsics: List[np.ndarray]):
    """Fuse per-view distance maps into one equirectangular panorama depth map.

    Warps each view's log-distance onto the panorama, accumulates masked
    gradients and Laplacians, and solves the resulting overdetermined linear
    system in log space with LSMR. Recurses on a half-resolution version
    (above 256 px) to provide a warm start for the solver.

    Parameters:
        width, height: output panorama resolution.
        distance_maps: per-view distance maps (one per camera).
        pred_masks: per-view boolean validity masks.
        extrinsics, intrinsics: per-view camera matrices.
    Returns:
        (panorama_depth, panorama_mask): float32 (height, width) depth and the
        boolean union of all warped validity masks.
    """
    # Coarse-to-fine: solve a half-size problem first and upsample it as x0.
    if max(width, height) > 256:
        panorama_depth_init, _ = merge_panorama_depth(width // 2, height // 2, distance_maps, pred_masks, extrinsics, intrinsics)
        panorama_depth_init = cv2.resize(panorama_depth_init, (width, height), cv2.INTER_LINEAR)
    else:
        panorama_depth_init = None

    uv = utils3d.numpy.image_uv(width=width, height=height)
    spherical_directions = spherical_uv_to_directions(uv)

    # Warp each view to the panorama
    panorama_log_distance_grad_maps, panorama_grad_masks = [], []
    panorama_log_distance_laplacian_maps, panorama_laplacian_masks = [], []
    panorama_pred_masks = []
    for i in range(len(distance_maps)):
        # Project every panorama direction into view i; keep only rays that land
        # strictly inside the image with positive depth.
        projected_uv, projected_depth = utils3d.numpy.project_cv(spherical_directions, extrinsics=extrinsics[i], intrinsics=intrinsics[i])
        projection_valid_mask = (projected_depth > 0) & (projected_uv > 0).all(axis=-1) & (projected_uv < 1).all(axis=-1)

        projected_pixels = utils3d.numpy.uv_to_pixel(np.clip(projected_uv, 0, 1), width=distance_maps[i].shape[1], height=distance_maps[i].shape[0]).astype(np.float32)

        log_splitted_distance = np.log(distance_maps[i])
        panorama_log_distance_map = np.where(projection_valid_mask, cv2.remap(log_splitted_distance, projected_pixels[..., 0], projected_pixels[..., 1], cv2.INTER_LINEAR, borderMode=cv2.BORDER_REPLICATE), 0)
        panorama_pred_mask = projection_valid_mask & (cv2.remap(pred_masks[i].astype(np.uint8), projected_pixels[..., 0], projected_pixels[..., 1], cv2.INTER_NEAREST, borderMode=cv2.BORDER_REPLICATE) > 0)

        # calculate gradient map (x wraps around the panorama seam)
        padded = np.pad(panorama_log_distance_map, ((0, 0), (0, 1)), mode='wrap')
        grad_x, grad_y = padded[:, :-1] - padded[:, 1:], padded[:-1, :] - padded[1:, :]

        padded = np.pad(panorama_pred_mask, ((0, 0), (0, 1)), mode='wrap')
        mask_x, mask_y = padded[:, :-1] & padded[:, 1:], padded[:-1, :] & padded[1:, :]

        panorama_log_distance_grad_maps.append((grad_x, grad_y))
        panorama_grad_masks.append((mask_x, mask_y))

        # calculate laplacian map (5-point stencil; valid only where all 5 taps are valid)
        padded = np.pad(panorama_log_distance_map, ((1, 1), (0, 0)), mode='edge')
        padded = np.pad(padded, ((0, 0), (1, 1)), mode='wrap')
        laplacian = convolve(padded, np.array([[0, 1, 0], [1, -4, 1], [0, 1, 0]], dtype=np.float32))[1:-1, 1:-1]

        padded = np.pad(panorama_pred_mask, ((1, 1), (0, 0)), mode='edge')
        padded = np.pad(padded, ((0, 0), (1, 1)), mode='wrap')
        mask = convolve(padded.astype(np.uint8), np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]], dtype=np.uint8))[1:-1, 1:-1] == 5

        panorama_log_distance_laplacian_maps.append(laplacian)
        panorama_laplacian_masks.append(mask)

        panorama_pred_masks.append(panorama_pred_mask)

    # Average per-view gradients/Laplacians over the views that observe each pixel.
    panorama_log_distance_grad_x = np.stack([grad_map[0] for grad_map in panorama_log_distance_grad_maps], axis=0)
    panorama_log_distance_grad_y = np.stack([grad_map[1] for grad_map in panorama_log_distance_grad_maps], axis=0)
    panorama_grad_mask_x = np.stack([mask_map[0] for mask_map in panorama_grad_masks], axis=0)
    panorama_grad_mask_y = np.stack([mask_map[1] for mask_map in panorama_grad_masks], axis=0)

    panorama_log_distance_grad_x = np.sum(panorama_log_distance_grad_x * panorama_grad_mask_x, axis=0) / np.sum(panorama_grad_mask_x, axis=0).clip(1e-3)
    panorama_log_distance_grad_y = np.sum(panorama_log_distance_grad_y * panorama_grad_mask_y, axis=0) / np.sum(panorama_grad_mask_y, axis=0).clip(1e-3)

    panorama_laplacian_maps = np.stack(panorama_log_distance_laplacian_maps, axis=0)
    panorama_laplacian_masks = np.stack(panorama_laplacian_masks, axis=0)
    panorama_laplacian_map = np.sum(panorama_laplacian_maps * panorama_laplacian_masks, axis=0) / np.sum(panorama_laplacian_masks, axis=0).clip(1e-3)

    grad_x_mask = np.any(panorama_grad_mask_x, axis=0).reshape(-1)
    grad_y_mask = np.any(panorama_grad_mask_y, axis=0).reshape(-1)
    grad_mask = np.concatenate([grad_x_mask, grad_y_mask])
    laplacian_mask = np.any(panorama_laplacian_masks, axis=0).reshape(-1)

    # Solve overdetermined system
    A = vstack([
        grad_equation(width, height, wrap_x=True, wrap_y=False)[grad_mask],
        poisson_equation(width, height, wrap_x=True, wrap_y=False)[laplacian_mask],
    ])
    b = np.concatenate([
        panorama_log_distance_grad_x.reshape(-1)[grad_x_mask],
        panorama_log_distance_grad_y.reshape(-1)[grad_y_mask],
        panorama_laplacian_map.reshape(-1)[laplacian_mask]
    ])
    x, *_ = lsmr(
        A, b,
        atol=1e-5, btol=1e-5,
        x0=np.log(panorama_depth_init).reshape(-1) if panorama_depth_init is not None else None,
        show=False,
    )

    panorama_depth = np.exp(x).reshape(height, width).astype(np.float32)
    panorama_mask = np.any(panorama_pred_masks, axis=0)

    return panorama_depth, panorama_mask
196
+
moge/utils/pipeline.py ADDED
@@ -0,0 +1,508 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+ from abc import abstractmethod
3
+ from queue import Empty, Full
4
+ from threading import Thread
5
+ from queue import Queue
6
+ from multiprocessing import Process
7
+ from threading import Thread, Event
8
+ import multiprocessing
9
+ import threading
10
+ import inspect
11
+ import time
12
+ import uuid
13
+ from copy import deepcopy
14
+ import itertools
15
+ import functools
16
+
17
+ # Copied from the MoGe project:
18
+ # https://github.com/microsoft/MoGe
19
+ # Original license: MIT
20
+ # Copyright (c) the MoGe authors
21
+
22
+ __all__ = [
23
+ 'Node',
24
+ 'Link',
25
+ 'ConcurrentNode',
26
+ 'Worker',
27
+ 'WorkerFunction',
28
+ 'Provider',
29
+ 'ProviderFunction',
30
+ 'Sequential',
31
+ 'Batch',
32
+ 'Unbatch',
33
+ 'Parallel',
34
+ 'Graph',
35
+ 'Buffer',
36
+ ]
37
+
38
+ TERMINATE_CHECK_INTERVAL = 0.5
39
+
40
+
41
+ class _ItemWrapper:
42
+ def __init__(self, data: Any, id: Union[int, List[int]] = None):
43
+ self.data = data
44
+ self.id = id
45
+
46
+
47
class Terminate(Exception):
    """Raised internally to unwind a node's loop once its terminate flag is set."""
49
+
50
+
51
def _get_queue_item(queue: Queue, terminate_flag: Event, timeout: float = None) -> _ItemWrapper:
    """Blocking `queue.get` that periodically polls `terminate_flag`.

    Raises `Terminate` as soon as the flag is observed set, and `queue.Empty`
    if `timeout` (seconds) elapses without obtaining an item. With no timeout,
    blocks until an item arrives or the flag is set.
    """
    while True:
        try:
            # Wake up at least every TERMINATE_CHECK_INTERVAL seconds to re-check the flag.
            item: _ItemWrapper = queue.get(block=True, timeout=TERMINATE_CHECK_INTERVAL if timeout is None else min(timeout, TERMINATE_CHECK_INTERVAL))
            if terminate_flag.is_set():
                raise Terminate()
            return item
        except Empty:
            if terminate_flag.is_set():
                raise Terminate()

            if timeout is not None:
                # NOTE(review): decrements by the fixed interval rather than the
                # actual elapsed time, so the effective timeout is approximate.
                timeout -= TERMINATE_CHECK_INTERVAL
                if timeout <= 0:
                    raise Empty()
66
+
67
+
68
def _put_queue_item(queue: Queue, item: _ItemWrapper, terminate_flag: Event):
    """Blocking `queue.put` that periodically polls `terminate_flag`.

    Retries every TERMINATE_CHECK_INTERVAL seconds while the queue is full;
    raises `Terminate` as soon as the flag is observed set.
    """
    while True:
        try:
            queue.put(item, block=True, timeout=TERMINATE_CHECK_INTERVAL)
            if terminate_flag.is_set():
                raise Terminate()
            return
        except Full:
            if terminate_flag.is_set():
                raise Terminate()
78
+
79
class Node:
    """Base pipeline node: an input queue and an output queue plus lifecycle hooks.

    Subclasses implement `start`/`terminate`/`join`; callers feed data with
    `put` and collect results with `get`. Usable as a context manager, which
    starts the node on entry and stops it on exit.
    """
    def __init__(self, in_buffer_size: int = 1, out_buffer_size: int = 1) -> None:
        self.input: Queue = Queue(maxsize=in_buffer_size)
        self.output: Queue = Queue(maxsize=out_buffer_size)
        self.in_buffer_size = in_buffer_size
        self.out_buffer_size = out_buffer_size

    @abstractmethod
    def start(self):
        """Begin processing (spawn threads/processes as needed)."""
        pass

    @abstractmethod
    def terminate(self):
        """Signal the node to stop; does not wait for completion."""
        pass

    def stop(self):
        """Terminate and then wait for the node to finish."""
        self.terminate()
        self.join()

    @abstractmethod
    def join(self):
        """Block until the node's workers have exited."""
        pass

    def put(self, data: Any, key: str = None, block: bool = True) -> None:
        """Enqueue one input item. `key` is accepted but currently unused."""
        item = _ItemWrapper(data)
        self.input.put(item, block=block)

    def get(self, key: str = None, block: bool = True) -> Any:
        """Dequeue one result, unwrapping the internal item envelope."""
        item: _ItemWrapper = self.output.get(block=block)
        return item.data

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.terminate()
        self.join()
117
+
118
+
119
class ConcurrentNode(Node):
    """Node whose loop runs in a background thread or process.

    Subclasses implement `_loop_fn`, which receives the node's input/output
    queues and a terminate flag of the matching kind (threading or
    multiprocessing Event).
    """
    job: Union[Thread, Process]

    def __init__(self, running_as: Literal['thread', 'process'] = 'thread', in_buffer_size: int = 1, out_buffer_size: int = 1) -> None:
        super().__init__(in_buffer_size, out_buffer_size)
        self.running_as = running_as

    @abstractmethod
    def _loop_fn(self, input: Queue, output: Queue, terminate_flag: Event):
        """Body executed in the worker thread/process."""
        pass

    def start(self):
        # NOTE(review): any `running_as` other than 'thread'/'process' leaves
        # `job`/`terminate_flag` unbound and raises UnboundLocalError below.
        if self.running_as == 'thread':
            terminate_flag = threading.Event()
            job = Thread(target=self._loop_fn, args=(self.input, self.output, terminate_flag))
        elif self.running_as == 'process':
            terminate_flag = multiprocessing.Event()
            job = Process(target=self._loop_fn, args=(self.input, self.output, terminate_flag))
        job.start()
        self.job = job
        self.terminate_flag = terminate_flag

    def terminate(self):
        self.terminate_flag.set()

    def join(self):
        self.job.join()
146
+
147
+
148
class Worker(ConcurrentNode):
    """Concurrent node that applies `work` to each input item and emits the result.

    Item ids are propagated from input to output so downstream consumers can
    correlate results with their originating items.
    """
    def __init__(self, running_as: Literal['thread', 'process'] = 'thread', in_buffer_size: int = 0, out_buffer_size: int = 0) -> None:
        super().__init__(running_as, in_buffer_size, out_buffer_size)

    def init(self) -> None:
        """
        Called once when the thread or process is started, to initialize any
        resources that are only held in the worker thread/process.
        """
        pass

    @abstractmethod
    def work(self, *args, **kwargs) -> Union[Any, Dict[str, Any]]:
        """
        This method defines the job that the node should do for each input item.
        An item obtained from the input queue is passed as arguments to this method, and the result is placed in the output queue.
        The method is executed concurrently with other nodes.
        """
        pass

    def _loop_fn(self, input: Queue, output: Queue, terminate_flag: Event):
        # Runs in the worker thread/process until the terminate flag fires.
        self.init()
        try:
            while True:
                item = _get_queue_item(input, terminate_flag)
                result = self.work(item.data)
                _put_queue_item(output, _ItemWrapper(result, item.id), terminate_flag)

        except Terminate:
            return
177
+
178
+
179
class Provider(ConcurrentNode):
    """
    A node that provides data to successive nodes. It takes no input and provides data to the output queue.
    """
    def __init__(self, running_as: Literal['thread', 'process'], out_buffer_size: int = 1) -> None:
        # in_buffer_size is 0: a provider never consumes from its input queue.
        super().__init__(running_as, 0, out_buffer_size)

    def init(self) -> None:
        """
        Called once when the thread or process is started, to initialize any
        resources that are only held in the worker thread/process.
        """
        pass

    @abstractmethod
    def provide(self) -> Generator[Any, None, None]:
        """Yield the items this provider feeds into the pipeline."""
        pass

    def _loop_fn(self, input: Queue, output: Queue, terminate_flag: Event):
        # Runs in the worker thread/process; stops when provide() is exhausted
        # or the terminate flag fires mid-put.
        self.init()
        try:
            for data in self.provide():
                _put_queue_item(output, _ItemWrapper(data), terminate_flag)
        except Terminate:
            return
203
+
204
+
205
class WorkerFunction(Worker):
    """Adapter that wraps a plain callable as a `Worker` node.

    Each input item is passed to `fn` and the return value is emitted.
    """
    def __init__(self, fn: Callable, running_as: Literal['thread', 'process'] = 'thread', in_buffer_size: int = 1, out_buffer_size: int = 1) -> None:
        # FIX: `running_as` was annotated with the bare string 'thread' (a bogus
        # annotation) and had no default; give it the proper Literal annotation
        # and a default of 'thread', matching the other node classes.
        super().__init__(running_as, in_buffer_size, out_buffer_size)
        self.fn = fn

    def work(self, *args, **kwargs):
        return self.fn(*args, **kwargs)
212
+
213
+
214
class ProviderFunction(Provider):
    """A Provider that delegates its `provide` to a plain generator function."""

    def __init__(self, fn: Callable, running_as: Literal['thread', 'process'] = 'thread', out_buffer_size: int = 1) -> None:
        # FIX: `running_as` was annotated with the bare string 'thread' (not a
        # valid type) and had no default; now Literal[...] with default 'thread',
        # consistent with the rest of the module.
        super().__init__(running_as, out_buffer_size)
        self.fn = fn  # generator function yielding the items to provide

    def provide(self):
        yield from self.fn()
222
+
223
+
224
class Link:
    # A background thread that continuously forwards items from one queue (src)
    # to another (dst); used by Graph to wire node outputs to node inputs.
    def __init__(self, src: Queue, dst: Queue):
        self.src = src
        self.dst = dst

    def _thread_fn(self):
        # Forward items until Terminate is raised by the queue helpers
        # (presumably when terminate_flag is set -- defined elsewhere in this module).
        try:
            while True:
                item = _get_queue_item(self.src, self.terminate_flag)
                _put_queue_item(self.dst, item, self.terminate_flag)
        except Terminate:
            return

    def start(self):
        self.terminate_flag = threading.Event()
        self.thread = Thread(target=self._thread_fn)
        self.thread.start()

    def terminate(self):
        # Only signals the flag; the forwarding thread exits on its next queue operation.
        self.terminate_flag.set()

    def join(self):
        self.thread.join()
247
+
248
+
249
class Graph(Node):
    """
    Graph pipeline of nodes and links.

    Nodes are registered with `add()` and wired with `link()`/`chain()`;
    `None` stands for the graph's own input/output boundary.
    """
    nodes: List[Node]
    links: List[Link]

    def __init__(self, in_buffer_size: int = 1, out_buffer_size: int = 1):
        super().__init__(in_buffer_size, out_buffer_size)
        self.nodes = []
        self.links = []

    def add(self, node: Node):
        "Register a node as part of this graph (does not wire it to anything)."
        self.nodes.append(node)

    def link(self, src: Optional[Node], dst: Optional[Node]):
        """
        Links the output of the source node to the input of the destination node.
        If the source or destination node is None, the pipeline's input or output is used.
        """
        # FIX: annotation previously claimed Union[Node, Tuple[Node, str]], but the
        # implementation only ever handles a Node or None.
        src_queue = self.input if src is None else src.output
        dst_queue = self.output if dst is None else dst.input
        self.links.append(Link(src_queue, dst_queue))

    def chain(self, nodes: Iterable[Optional[Node]]):
        """
        Link the output of each node to the input of the next node.
        """
        nodes = list(nodes)
        for i in range(len(nodes) - 1):
            self.link(nodes[i], nodes[i + 1])

    def start(self):
        for node in self.nodes:
            node.start()
        for link in self.links:
            link.start()

    def terminate(self):
        for node in self.nodes:
            node.terminate()
        for link in self.links:
            link.terminate()

    def join(self):
        for node in self.nodes:
            node.join()
        for link in self.links:
            link.join()

    def __iter__(self):
        providers = [node for node in self.nodes if isinstance(node, Provider)]
        if len(providers) == 0:
            raise ValueError("No provider node found in the pipeline. If you want to iterate over the pipeline, the pipeline must be driven by a provider node.")
        with self:
            # NOTE: this loops forever; iteration only ends when the consumer stops.
            # while all(provider.job.is_alive() for provider in providers):
            while True:
                yield self.get()

    def __call__(self, data: Any) -> Any:
        """
        Submit data to the pipeline's input queue, and return the output data asynchronously.
        NOTE: The pipeline must be streamed (i.e., every output item is uniquely associated with an input item) for this to work.
        """
        # FIX: previously an unimplemented TODO that silently returned None;
        # fail loudly so callers cannot mistake the stub for a real result.
        raise NotImplementedError("Graph.__call__ is not implemented yet.")
314
+
315
+
316
class Sequential(Graph):
    """
    Pipeline of nodes in sequential order, where each node takes the output of the previous node as input.
    The order of input and output items is preserved (FIFO).
    """
    def __init__(self, nodes: List[Union[Node, Callable]], function_running_as: Literal['thread', 'process'] = 'thread', in_buffer_size: int = 1, out_buffer_size: int = 1):
        """
        Initialize the pipeline with a list of nodes to execute sequentially.
        ### Parameters:
        - nodes: List of nodes or functions to execute sequentially. Generator functions are wrapped in provider nodes, and other functions are wrapped in worker nodes.
        - function_running_as: Whether to wrap the function as a thread or process worker. Defaults to 'thread'.
        - in_buffer_size: Maximum size of the input queue of the pipeline. Defaults to 1.
        - out_buffer_size: Maximum size of the output queue of the pipeline. Defaults to 1.
        """
        # FIX: docstring previously claimed the buffer sizes default to 0 (unlimited),
        # contradicting the actual signature defaults of 1.
        super().__init__(in_buffer_size, out_buffer_size)
        for node in nodes:
            if isinstance(node, Node):
                pass
            elif callable(node):
                if inspect.isgeneratorfunction(node):
                    node = ProviderFunction(node, function_running_as)
                else:
                    node = WorkerFunction(node, function_running_as)
            else:
                raise ValueError(f"Invalid node type: {type(node)}")
            self.add(node)
        # None at both ends wires the chain to the pipeline's own input/output.
        self.chain([None, *self.nodes, None])
343
+
344
+
345
class Parallel(Node):
    """
    A FIFO node that runs multiple nodes in parallel to process the input items. Each input item is handed to one of the nodes whoever is available.
    NOTE: It is FIFO if and only if all the nested nodes are FIFO.
    """
    nodes: List[Node]

    def __init__(self, nodes: Iterable[Node], in_buffer_size: int = 1, out_buffer_size: int = 1, function_running_as: Literal['thread', 'process'] = 'thread'):
        super().__init__(in_buffer_size, out_buffer_size)
        self.nodes = []
        for node in nodes:
            if isinstance(node, Node):
                pass
            elif isinstance(node, Callable):
                # Bare functions are wrapped: generator functions become providers,
                # everything else becomes a worker.
                if inspect.isgeneratorfunction(node):
                    node = ProviderFunction(node, function_running_as)
                else:
                    node = WorkerFunction(node, function_running_as)
            else:
                raise ValueError(f"Invalid node type: {type(node)}")
            self.nodes.append(node)
        # Records, per dispatched item, which node's output queue it will arrive
        # on, so the collector thread can re-emit results in dispatch order (FIFO).
        self.output_order = Queue()
        # Serializes (take item, record order, hand to node) so the order stored
        # in `output_order` matches the actual hand-off order.
        self.lock = threading.Lock()

    def _in_thread_fn(self, node: Node):
        # One dispatcher thread per nested node: pulls from the shared input and
        # feeds this particular node.
        try:
            while True:
                with self.lock:
                    # A better idea: first make sure its node is vacant, then get it a new item.
                    # Currently we will not be able to know which node is busy until there is at least one item already waiting in the queue of the node.
                    # This could lead to suboptimal scheduling.
                    item = _get_queue_item(self.input, self.terminate_flag)
                    self.output_order.put(node.output)
                    _put_queue_item(node.input, item, self.terminate_flag)
        except Terminate:
            return

    def _out_thread_fn(self):
        # Single collector thread: pops the next expected output queue, waits for
        # its result, and forwards it -- preserving dispatch order.
        try:
            while True:
                queue = _get_queue_item(self.output_order, self.terminate_flag)
                item = _get_queue_item(queue, self.terminate_flag)
                _put_queue_item(self.output, item, self.terminate_flag)
        except Terminate:
            return

    def start(self):
        self.terminate_flag = threading.Event()
        self.in_threads = []
        for node in self.nodes:
            thread = Thread(target=self._in_thread_fn, args=(node,))
            thread.start()
            self.in_threads.append(thread)
        thread = Thread(target=self._out_thread_fn)
        thread.start()
        self.out_thread = thread
        for node in self.nodes:
            node.start()

    def terminate(self):
        self.terminate_flag.set()
        for node in self.nodes:
            node.terminate()

    def join(self):
        for thread in self.in_threads:
            thread.join()
        self.out_thread.join()
413
+
414
+
415
class UnorderedParallel(Graph):
    """
    Pipeline of nodes in parallel, where each input item is handed to one of the nodes whoever is available.
    NOTE: The order of the output items is NOT guaranteed to be the same as the input items, depending on how fast the nodes handle their input.
    """
    def __init__(self, nodes: List[Union[Node, Callable]], function_running_as: Literal['thread', 'process'] = 'thread', in_buffer_size: int = 1, out_buffer_size: int = 1):
        """
        Initialize the pipeline with a list of nodes to execute in parallel. If a function is given, it is wrapped in a worker node.
        ### Parameters:
        - nodes: List of nodes or functions to execute in parallel. Generator functions are wrapped in provider nodes, and other functions are wrapped in worker nodes.
        - function_running_as: Whether to wrap the function as a thread or process worker. Defaults to 'thread'.
        - in_buffer_size: Maximum size of the input queue of the pipeline. Defaults to 1.
        - out_buffer_size: Maximum size of the output queue of the pipeline. Defaults to 1.
        """
        # FIX: docstring previously claimed the buffer sizes default to 0 (unlimited),
        # contradicting the actual signature defaults of 1.
        super().__init__(in_buffer_size, out_buffer_size)
        for node in nodes:
            if isinstance(node, Node):
                pass
            elif callable(node):
                if inspect.isgeneratorfunction(node):
                    node = ProviderFunction(node, function_running_as)
                else:
                    node = WorkerFunction(node, function_running_as)
            else:
                raise ValueError(f"Invalid node type: {type(node)}")
            self.add(node)
        # Wire every node directly between the pipeline's input and output;
        # whichever node is free picks up the next item.
        for node in self.nodes:
            self.chain([None, node, None])
443
+
444
+
445
class Batch(ConcurrentNode):
    """
    Groups every `batch_size` items into a batch (a list of items) and passes the batch to successive nodes.
    The `patience` parameter specifies the maximum time to wait for a batch to be filled before sending it to the next node,
    i.e., when the earliest item in the batch is out of `patience` seconds, the batch is sent regardless of its size.
    """
    def __init__(self, batch_size: int, patience: float = None, in_buffer_size: int = 1, out_buffer_size: int = 1):
        assert batch_size > 0, "Batch size must be greater than 0."
        super().__init__('thread', in_buffer_size, out_buffer_size)
        self.batch_size = batch_size  # number of items per emitted batch
        self.patience = patience      # max seconds to wait after the first item; None = wait indefinitely

    def _loop_fn(self, input: Queue, output: Queue, terminate_flag: Event):
        try:
            while True:
                batch_id, batch_data = [], []
                # Try to fill the batch
                for i in range(self.batch_size):
                    if i == 0 or self.patience is None:
                        # Block indefinitely for the first item, so an empty batch
                        # is never emitted.
                        timeout = None
                    else:
                        # Remaining patience, counted from when the first item of
                        # this batch arrived.
                        timeout = self.patience - (time.time() - earliest_time)
                        if timeout < 0:
                            break
                    try:
                        item = _get_queue_item(input, terminate_flag, timeout)
                    except Empty:
                        # Patience ran out: flush whatever has been collected so far.
                        break

                    if i == 0:
                        earliest_time = time.time()
                    batch_data.append(item.data)
                    batch_id.append(item.id)

                # The batch keeps the per-item ids so Unbatch can restore pairing.
                batch = _ItemWrapper(batch_data, batch_id)
                _put_queue_item(output, batch, terminate_flag)
        except Terminate:
            return
483
+
484
+
485
class Unbatch(ConcurrentNode):
    """
    Ungroups every batch (a list of items) into individual items and passes them to successive nodes.
    """
    def __init__(self, in_buffer_size: int = 1, out_buffer_size: int = 1):
        super().__init__('thread', in_buffer_size, out_buffer_size)

    def _loop_fn(self, input: Queue, output: Queue, terminate_flag: Event):
        try:
            while True:
                batch = _get_queue_item(input, terminate_flag)
                # batch.id may be falsy (untracked items); pad with None ids then.
                for id, data in zip(batch.id or itertools.repeat(None), batch.data):
                    item = _ItemWrapper(data, id)
                    _put_queue_item(output, item, terminate_flag)
        except Terminate:
            return
501
+
502
+
503
class Buffer(Node):
    "A FIFO node that buffers items in a queue. Useful to achieve better temporal balance when its successor node has a variable processing time."
    def __init__(self, size: int):
        super().__init__(size, size)
        self.size = size
        # Input and output share a single bounded queue: items simply pass
        # through the buffer with no processing.
        self.input = self.output = Queue(maxsize=size)
moge/utils/tools.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ from typing import *
7
+ import time
8
+ from pathlib import Path
9
+ from numbers import Number
10
+ from functools import wraps
11
+ import warnings
12
+ import math
13
+ import json
14
+ import os
15
+ import importlib
16
+ import importlib.util
17
+
18
+
19
def catch_exception(fn):
    """Decorator that catches any exception from `fn`, prints a short traceback, and returns None instead of propagating."""
    @wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception:
            import traceback
            # FIX: end was 'r' (printed a literal letter); '\r' keeps the message on
            # one line, e.g. when running alongside a progress bar.
            print(f"Exception in {fn.__name__}", end='\r')
            # print({', '.join(repr(arg) for arg in args)}, {', '.join(f'{k}={v!r}' for k, v in kwargs.items())})
            traceback.print_exc(chain=False)
            time.sleep(0.1)  # give the console a moment to flush before continuing
            return None
    return wrapper
32
+
33
+
34
class CallbackOnException:
    """Context manager that swallows a given exception type and invokes a callback instead.

    Any other exception propagates unchanged.
    """

    def __init__(self, callback: Callable, exception: type):
        self.exception = exception
        self.callback = callback

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        should_handle = isinstance(exc_val, self.exception)
        if should_handle:
            self.callback()
        # Returning True suppresses the matched exception.
        return should_handle
47
+
48
def traverse_nested_dict_keys(d: Dict[str, Dict]) -> Generator[Tuple[str, ...], None, None]:
    """Yield the key paths (as tuples) of every leaf value in a nested dict."""
    for key, value in d.items():
        if isinstance(value, dict):
            # Recurse and prepend the current key to each sub-path.
            yield from ((key,) + suffix for suffix in traverse_nested_dict_keys(value))
        else:
            yield (key,)
55
+
56
+
57
def get_nested_dict(d: Dict[str, Dict], keys: Tuple[str, ...], default: Any = None):
    """Look up a key path in a nested dict, returning `default` when any level is missing.

    FIX: the previous implementation chained `d.get(k, default)` and only stopped
    when the intermediate result was None, so a non-None `default` (or any
    non-dict intermediate value) would itself be used as the container for the
    next lookup and crash with AttributeError.
    """
    for k in keys:
        if not isinstance(d, dict) or k not in d:
            return default
        d = d[k]
    return d
63
+
64
def set_nested_dict(d: Dict[str, Dict], keys: Tuple[str, ...], value: Any):
    """Assign `value` at the given key path, creating intermediate dicts as needed."""
    *parents, leaf = keys
    node = d
    for k in parents:
        node = node.setdefault(k, {})
    node[leaf] = value
68
+
69
+
70
def key_average(list_of_dicts: list) -> Dict[str, Any]:
    """
    Returns a dictionary with the average value of each key in the input list of dictionaries.

    Missing and NaN values are skipped; a key with no usable values averages to NaN.
    """
    all_paths = set()
    for entry in list_of_dicts:
        all_paths.update(traverse_nested_dict_keys(entry))
    averaged = {}
    for path in sorted(all_paths):
        values = [
            v for entry in list_of_dicts
            if (v := get_nested_dict(entry, path)) is not None and not math.isnan(v)
        ]
        mean = sum(values) / len(values) if values else float('nan')
        set_nested_dict(averaged, path, mean)
    return averaged
88
+
89
+
90
def flatten_nested_dict(d: Dict[str, Any], parent_key: Tuple[str, ...] = None) -> Dict[Tuple[str, ...], Any]:
    """
    Flattens a nested dictionary into a single-level dictionary, with keys as tuples.
    """
    prefix = () if parent_key is None else parent_key
    flat: Dict[Tuple[str, ...], Any] = {}
    for key, value in d.items():
        path = prefix + (key,)
        if isinstance(value, MutableMapping):
            flat.update(flatten_nested_dict(value, path))
        else:
            flat[path] = value
    return flat
104
+
105
+
106
def unflatten_nested_dict(d: Dict[str, Any]) -> Dict[str, Any]:
    """
    Unflattens a single-level dictionary into a nested dictionary, with keys as tuples.
    """
    result: Dict[str, Any] = {}
    for path, value in d.items():
        node = result
        for part in path[:-1]:
            node = node.setdefault(part, {})
        node[path[-1]] = value
    return result
119
+
120
+
121
def read_jsonl(file):
    """Read a JSON-lines file into a list of objects (one object per line)."""
    import json
    with open(file, 'r') as f:
        return [json.loads(line) for line in f]
126
+
127
+
128
def write_jsonl(data: List[dict], file):
    """Write a list of objects to a JSON-lines file (one object per line)."""
    import json
    lines = [json.dumps(item) + '\n' for item in data]
    with open(file, 'w') as f:
        f.writelines(lines)
133
+
134
+
135
def to_hierachical_dataframe(data: List[Dict[Tuple[str, ...], Any]]):
    """Build a pandas DataFrame with hierarchical (MultiIndex) columns from a list of nested dicts."""
    import pandas as pd
    flattened = [flatten_nested_dict(d) for d in data]
    frame = pd.DataFrame(flattened).sort_index(axis=1)
    # Tuple column keys become a proper MultiIndex.
    frame.columns = pd.MultiIndex.from_tuples(frame.columns)
    return frame
142
+
143
+
144
def recursive_replace(d: Union[List, Dict, str], mapping: Dict[str, str]):
    """Recursively apply string replacements to every string inside a nested list/dict structure.

    Containers are modified in place; the (possibly replaced) value is also returned.
    """
    if isinstance(d, str):
        for needle, replacement in mapping.items():
            d = d.replace(needle, replacement)
        return d
    if isinstance(d, list):
        for idx in range(len(d)):
            d[idx] = recursive_replace(d[idx], mapping)
    elif isinstance(d, dict):
        for key in d:
            d[key] = recursive_replace(d[key], mapping)
    return d
155
+
156
+
157
class timeit:
    """Wall-clock timer usable as a context manager or as a decorator.

    With `average=True`, completed timings are accumulated per `name` in the
    class-level `_history` and the running average is reported instead of the
    individual time.
    """
    _history: Dict[str, List['timeit']] = {}

    def __init__(self, name: str = None, verbose: bool = True, average: bool = False):
        self.name = name
        self.verbose = verbose
        self.start = None
        self.end = None
        self.average = average
        if average and name not in timeit._history:
            timeit._history[name] = []

    def __call__(self, func: Callable):
        import inspect
        # FIX: the nested timeit previously dropped `verbose`/`average`, so e.g.
        # timeit(name, average=True) used as a decorator silently ignored those
        # settings. Also add functools.wraps to preserve the wrapped metadata.
        name = self.name or func.__qualname__
        if inspect.iscoroutinefunction(func):
            @wraps(func)
            async def wrapper(*args, **kwargs):
                with timeit(name, verbose=self.verbose, average=self.average):
                    ret = await func(*args, **kwargs)
                return ret
            return wrapper
        else:
            @wraps(func)
            def wrapper(*args, **kwargs):
                with timeit(name, verbose=self.verbose, average=self.average):
                    ret = func(*args, **kwargs)
                return ret
            return wrapper

    def __enter__(self):
        self.start = time.time()
        return self

    @property
    def time(self) -> float:
        """Elapsed seconds of the completed timing."""
        assert self.start is not None, "Time not yet started."
        assert self.end is not None, "Time not yet ended."
        return self.end - self.start

    @property
    def average_time(self) -> float:
        """Average elapsed seconds over all recorded timings with this name."""
        assert self.average, "Average time not available."
        return sum(t.time for t in timeit._history[self.name]) / len(timeit._history[self.name])

    @property
    def history(self) -> List['timeit']:
        return timeit._history.get(self.name, [])

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time.time()
        if self.average:
            timeit._history[self.name].append(self)
        if self.verbose:
            if self.average:
                avg = self.average_time
                print(f"{self.name or 'It'} took {avg:.6f} seconds in average.")
            else:
                print(f"{self.name or 'It'} took {self.time:.6f} seconds.")
213
+
214
+
215
def strip_common_prefix_suffix(strings: List[str]) -> List[str]:
    """Remove the longest common prefix and the longest common suffix from every string.

    FIX: the previous implementation relied on the loop variables after the loop:
    when a loop completed without `break` (e.g. identical strings, or a
    single-element list) the common prefix/suffix length was off by one, and an
    empty first string raised NameError. Loops are now bounded by the shortest
    string, so short strings can no longer trigger an IndexError either.
    """
    if not strings:
        return []
    first = strings[0]
    min_len = min(len(s) for s in strings)

    # Longest common prefix length across all strings.
    prefix = min_len
    for i in range(min_len):
        if any(s[i] != first[i] for s in strings):
            prefix = i
            break

    # Longest common suffix length, never overlapping the stripped prefix.
    suffix = 0
    for i in range(1, min_len - prefix + 1):
        if any(s[-i] != first[-i] for s in strings):
            break
        suffix = i

    return [s[prefix:len(s) - suffix] for s in strings]
227
+
228
+
229
def multithead_execute(inputs: List[Any], num_workers: int, pbar = None):
    # Decorator factory: `multithead_execute(inputs, n)(fn)` runs fn over all
    # inputs on a thread pool, updating a tqdm progress bar per completed item.
    # Results are discarded; exceptions are printed (catch_exception) instead of
    # propagating, with wrapper frames trimmed (suppress_traceback).
    from concurrent.futures import ThreadPoolExecutor
    from contextlib import nullcontext  # NOTE(review): imported but unused
    from tqdm import tqdm

    # Reuse the caller-supplied progress bar if given, otherwise create one.
    if pbar is not None:
        pbar.total = len(inputs) if hasattr(inputs, '__len__') else None
    else:
        pbar = tqdm(total=len(inputs) if hasattr(inputs, '__len__') else None)

    def decorator(fn: Callable):
        # `pbar` is itself used as a context manager so it is closed on exit.
        with (
            ThreadPoolExecutor(max_workers=num_workers) as executor,
            pbar
        ):
            pbar.refresh()
            @catch_exception
            @suppress_traceback
            def _fn(input):
                ret = fn(input)
                pbar.update()
                return ret
            # Executor.map submits all tasks eagerly; the (lazy) result iterator
            # is deliberately not consumed since results are not needed.
            executor.map(_fn, inputs)
            # Redundant with the `with` block, but harmless: wait for completion.
            executor.shutdown(wait=True)

    return decorator
255
+
256
+
257
def suppress_traceback(fn):
    """Decorator that trims the two innermost wrapper frames off an exception's
    traceback, so reported errors point at the caller's code rather than the
    decorating machinery."""
    @wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            tb = e.__traceback__
            # FIX: guard against short tracebacks -- `tb_next` can be None (e.g.
            # when the exception originates directly in C code), which previously
            # raised AttributeError and masked the real error.
            if tb is not None and tb.tb_next is not None:
                e.__traceback__ = tb.tb_next.tb_next
            raise
    return wrapper
266
+
267
+
268
class no_warnings:
    """Filter warnings (default: ignore all), usable as a decorator or a context manager.

    Extra keyword arguments are forwarded to `warnings.simplefilter`.
    """

    def __init__(self, action: str = 'ignore', **kwargs):
        self.action = action
        self.filter_kwargs = kwargs

    def _apply_filter(self) -> None:
        # Install this instance's warning filter in the active catch_warnings scope.
        warnings.simplefilter(self.action, **self.filter_kwargs)

    def __call__(self, fn):
        @wraps(fn)
        def wrapped(*args, **kwargs):
            with warnings.catch_warnings():
                self._apply_filter()
                return fn(*args, **kwargs)
        return wrapped

    def __enter__(self):
        self.warnings_manager = warnings.catch_warnings()
        self.warnings_manager.__enter__()
        self._apply_filter()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.warnings_manager.__exit__(exc_type, exc_val, exc_tb)
288
+
289
+
290
def import_file_as_module(file_path: Union[str, os.PathLike], module_name: str):
    """Load a Python source file as a module object registered under `module_name`.

    The module is executed immediately; it is NOT inserted into sys.modules.
    """
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)
    return loaded
moge/utils/vis.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ from typing import *
7
+
8
+ import numpy as np
9
+ import matplotlib
10
+
11
+
12
def colorize_depth(depth: np.ndarray, mask: np.ndarray = None, normalize: bool = True, cmap: str = 'Spectral') -> np.ndarray:
    """Colorize a depth map through its disparity (1/depth) with a matplotlib colormap; returns uint8 RGB."""
    valid = depth > 0 if mask is None else (depth > 0) & mask
    depth = np.where(valid, depth, np.nan)  # invalid pixels become NaN
    disp = 1 / depth
    if normalize:
        # Robust normalization: stretch between the 0.1% and 99% disparity quantiles.
        lo, hi = np.nanquantile(disp, 0.001), np.nanquantile(disp, 0.99)
        disp = (disp - lo) / (hi - lo)
    rgb = matplotlib.colormaps[cmap](1.0 - disp)[..., :3]
    rgb = np.nan_to_num(rgb, copy=False)  # NaNs (invalid pixels) -> 0 (black)
    return np.ascontiguousarray((rgb.clip(0, 1) * 255).astype(np.uint8))
24
+
25
+
26
def colorize_depth_affine(depth: np.ndarray, mask: np.ndarray = None, cmap: str = 'Spectral') -> np.ndarray:
    """Colorize a depth map directly (affinely normalized to its quantile range); returns uint8 RGB."""
    if mask is not None:
        depth = np.where(mask, depth, np.nan)

    lo, hi = np.nanquantile(depth, 0.001), np.nanquantile(depth, 0.999)
    normalized = (depth - lo) / (hi - lo)
    rgb = np.nan_to_num(matplotlib.colormaps[cmap](normalized)[..., :3], copy=False)
    return np.ascontiguousarray((rgb.clip(0, 1) * 255).astype(np.uint8))
35
+
36
+
37
def colorize_disparity(disparity: np.ndarray, mask: np.ndarray = None, normalize: bool = True, cmap: str = 'Spectral') -> np.ndarray:
    """Colorize a disparity map with a matplotlib colormap; returns uint8 RGB."""
    if mask is not None:
        disparity = np.where(mask, disparity, np.nan)

    if normalize:
        # Robust normalization between the 0.1% and 99.9% quantiles.
        lo, hi = np.nanquantile(disparity, 0.001), np.nanquantile(disparity, 0.999)
        disparity = (disparity - lo) / (hi - lo)
    rgb = np.nan_to_num(matplotlib.colormaps[cmap](1.0 - disparity)[..., :3], copy=False)
    return np.ascontiguousarray((rgb.clip(0, 1) * 255).astype(np.uint8))
47
+
48
+
49
def colorize_segmentation(segmentation: np.ndarray, cmap: str = 'Set1') -> np.ndarray:
    """Map integer segment ids onto a repeating 20-color cycle of the colormap; returns uint8 RGB."""
    rgb = matplotlib.colormaps[cmap]((segmentation % 20) / 20)[..., :3]
    return np.ascontiguousarray((rgb.clip(0, 1) * 255).astype(np.uint8))
53
+
54
+
55
def colorize_normal(normal: np.ndarray, mask: np.ndarray = None) -> np.ndarray:
    """Encode normal vectors as RGB: components remapped from [-1, 1] to [0, 1], with the
    y and z components negated; returns uint8. Masked-out pixels map to mid-gray zero vectors."""
    if mask is not None:
        normal = np.where(mask[..., None], normal, 0)
    remapped = normal * [0.5, -0.5, -0.5] + 0.5
    return (remapped.clip(0, 1) * 255).astype(np.uint8)
61
+
62
+
63
def colorize_error_map(error_map: np.ndarray, mask: np.ndarray = None, cmap: str = 'plasma', value_range: Tuple[float, float] = None):
    """Colorize a scalar error map with a matplotlib colormap; masked-out pixels become black; returns uint8 RGB."""
    if value_range is None:
        vmin, vmax = np.nanmin(error_map), np.nanmax(error_map)
    else:
        vmin, vmax = value_range
    colormap = matplotlib.colormaps[cmap]
    colored = colormap(((error_map - vmin) / (vmax - vmin)).clip(0, 1))[..., :3]
    if mask is not None:
        colored = np.where(mask[..., None], colored, 0)
    return np.ascontiguousarray((colored.clip(0, 1) * 255).astype(np.uint8))
moge/utils/webfile.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ import requests
7
+ from typing import *
8
+
9
+ __all__ = ["WebFile"]
10
+
11
+
12
class WebFile:
    """
    Random-access, file-like reader over an HTTP(S) URL using Range requests.

    Supports `seekable`/`seek`/`tell`/`read` so it can back consumers that
    expect a seekable binary file (e.g. zipfile).
    """
    def __init__(self, url: str, session: Optional[requests.Session] = None, headers: Optional[Dict[str, str]] = None, size: Optional[int] = None):
        self.url = url
        self.session = session or requests.Session()
        self.session.headers.update(headers or {})
        self._offset = 0  # current read position in bytes
        # Passing `size` skips the extra size request (used when sharing a WebFile).
        self.size = size if size is not None else self._fetch_size()

    def _fetch_size(self):
        # Streamed GET: the body is never downloaded, only headers are read.
        with self.session.get(self.url, stream=True) as response:
            response.raise_for_status()
            content_length = response.headers.get("Content-Length")
            if content_length is None:
                raise ValueError("Missing Content-Length in header")
            return int(content_length)

    def _fetch_data(self, offset: int, n: int) -> bytes:
        # HTTP Range end positions are inclusive, so the last valid index is
        # size - 1. FIX: previously clamped to self.size, one byte past EOF.
        headers = {"Range": f"bytes={offset}-{min(offset + n - 1, self.size - 1)}"}
        response = self.session.get(self.url, headers=headers)
        response.raise_for_status()
        return response.content

    def seekable(self) -> bool:
        return True

    def tell(self) -> int:
        return self._offset

    def available(self) -> int:
        # Bytes remaining between the current offset and end of file.
        return self.size - self._offset

    def seek(self, offset: int, whence: int = 0) -> None:
        if whence == 0:        # absolute
            new_offset = offset
        elif whence == 1:      # relative to current position
            new_offset = self._offset + offset
        elif whence == 2:      # relative to end of file
            new_offset = self.size + offset
        else:
            raise ValueError("Invalid value for whence")

        # Clamp into [0, size] rather than raising, mirroring lenient file semantics.
        self._offset = max(0, min(new_offset, self.size))

    def read(self, n: Optional[int] = None) -> bytes:
        """Read up to `n` bytes from the current offset (all remaining bytes when n is None or negative)."""
        if n is None or n < 0:
            n = self.available()
        else:
            n = min(n, self.available())

        if n == 0:
            return b''

        data = self._fetch_data(self._offset, n)
        self._offset += len(data)

        return data

    def close(self) -> None:
        # Nothing to release: the (possibly shared) session is deliberately left open.
        pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        pass
77
+
78
+
moge/utils/webzipfile.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from the MoGe project:
2
+ # https://github.com/microsoft/MoGe
3
+ # Original license: MIT
4
+ # Copyright (c) the MoGe authors
5
+
6
+ from typing import *
7
+ import io
8
+ import os
9
+ from zipfile import (
10
+ ZipInfo, BadZipFile, ZipFile, ZipExtFile,
11
+ sizeFileHeader, structFileHeader, stringFileHeader,
12
+ _FH_SIGNATURE, _FH_FILENAME_LENGTH, _FH_EXTRA_FIELD_LENGTH, _FH_GENERAL_PURPOSE_FLAG_BITS,
13
+ _MASK_COMPRESSED_PATCH, _MASK_STRONG_ENCRYPTION, _MASK_UTF_FILENAME, _MASK_ENCRYPTED
14
+ )
15
+ import struct
16
+ from requests import Session
17
+
18
+ from .webfile import WebFile
19
+
20
+
21
class _SharedWebFile(WebFile):
    # Lightweight per-reader view of an already-opened WebFile: shares the
    # parent's URL, session and size (avoiding a second Content-Length request)
    # while keeping an independent read offset, so multiple ZIP members can be
    # read concurrently.
    def __init__(self, webfile: WebFile, pos: int):
        super().__init__(webfile.url, webfile.session, size=webfile.size)
        self.seek(pos)
25
+
26
+
27
class WebZipFile(ZipFile):
    "Lock-free version of ZipFile that reads from a WebFile, allowing for concurrent reads."
    # NOTE(review): `open` below closely mirrors CPython's ZipFile.open, but hands each
    # reader its own _SharedWebFile instead of a shared, lock-guarded file object.
    # It relies on zipfile's private helpers/constants (imported at module top),
    # which may change between Python versions -- verify against the target runtime.
    def __init__(self, url: str, session: Optional[Session] = None, headers: Optional[Dict[str, str]] = None):
        """Open the ZIP file with mode read 'r', write 'w', exclusive create 'x',
        or append 'a'."""
        webf = WebFile(url, session=session, headers=headers)
        super().__init__(webf, mode='r')

    def open(self, name, mode="r", pwd=None, *, force_zip64=False):
        """Return file-like object for 'name'.

        name is a string for the file name within the ZIP file, or a ZipInfo
        object.

        mode should be 'r' to read a file already in the ZIP file, or 'w' to
        write to a file newly added to the archive.

        pwd is the password to decrypt files (only used for reading).

        When writing, if the file size is not known in advance but may exceed
        2 GiB, pass force_zip64 to use the ZIP64 format, which can handle large
        files. If the size is known in advance, it is best to pass a ZipInfo
        instance for name, with zinfo.file_size set.
        """
        if mode not in {"r", "w"}:
            raise ValueError('open() requires mode "r" or "w"')
        if pwd and (mode == "w"):
            raise ValueError("pwd is only supported for reading files")
        if not self.fp:
            raise ValueError(
                "Attempt to use ZIP archive that was already closed")

        assert mode == "r", "Only read mode is supported for now"

        # Make sure we have an info object
        if isinstance(name, ZipInfo):
            # 'name' is already an info object
            zinfo = name
        elif mode == 'w':
            zinfo = ZipInfo(name)
            zinfo.compress_type = self.compression
            zinfo._compresslevel = self.compresslevel
        else:
            # Get info object for name
            zinfo = self.getinfo(name)

        if mode == 'w':
            return self._open_to_write(zinfo, force_zip64=force_zip64)

        if self._writing:
            raise ValueError("Can't read from the ZIP file while there "
                    "is an open writing handle on it. "
                    "Close the writing handle before trying to read.")

        # Open for reading:
        self._fileRefCnt += 1
        # Each reader gets its own WebFile positioned at the member's local
        # header, so no shared-file lock is needed (unlike stdlib ZipFile).
        zef_file = _SharedWebFile(self.fp, zinfo.header_offset)

        try:
            # Skip the file header:
            fheader = zef_file.read(sizeFileHeader)
            if len(fheader) != sizeFileHeader:
                raise BadZipFile("Truncated file header")
            fheader = struct.unpack(structFileHeader, fheader)
            if fheader[_FH_SIGNATURE] != stringFileHeader:
                raise BadZipFile("Bad magic number for file header")

            fname = zef_file.read(fheader[_FH_FILENAME_LENGTH])
            if fheader[_FH_EXTRA_FIELD_LENGTH]:
                zef_file.seek(fheader[_FH_EXTRA_FIELD_LENGTH], whence=1)

            if zinfo.flag_bits & _MASK_COMPRESSED_PATCH:
                # Zip 2.7: compressed patched data
                raise NotImplementedError("compressed patched data (flag bit 5)")

            if zinfo.flag_bits & _MASK_STRONG_ENCRYPTION:
                # strong encryption
                raise NotImplementedError("strong encryption (flag bit 6)")

            if fheader[_FH_GENERAL_PURPOSE_FLAG_BITS] & _MASK_UTF_FILENAME:
                # UTF-8 filename
                fname_str = fname.decode("utf-8")
            else:
                fname_str = fname.decode(self.metadata_encoding or "cp437")

            if fname_str != zinfo.orig_filename:
                raise BadZipFile(
                    'File name in directory %r and header %r differ.'
                    % (zinfo.orig_filename, fname))

            # check for encrypted flag & handle password
            is_encrypted = zinfo.flag_bits & _MASK_ENCRYPTED
            if is_encrypted:
                if not pwd:
                    pwd = self.pwd
                if pwd and not isinstance(pwd, bytes):
                    raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__)
                if not pwd:
                    raise RuntimeError("File %r is encrypted, password "
                                       "required for extraction" % name)
            else:
                pwd = None

            return ZipExtFile(zef_file, mode, zinfo, pwd, True)
        except:
            # Ensure the per-reader WebFile is released if header parsing fails.
            zef_file.close()
            raise