Spaces: Running on Zero
Commit · e7682cd
Parent(s): 3dc2778
Added Depth-Anything-3 models

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- Depth-Anything-3-anysize/.flake8 +3 -0
- Depth-Anything-3-anysize/.gitattributes +0 -0
- Depth-Anything-3-anysize/.gitignore +36 -0
- Depth-Anything-3-anysize/.pre-commit-config.yaml +59 -0
- Depth-Anything-3-anysize/README.md +10 -0
- Depth-Anything-3-anysize/app.py +110 -0
- Depth-Anything-3-anysize/depth3_anysize.py +52 -0
- Depth-Anything-3-anysize/pyproject.toml +93 -0
- Depth-Anything-3-anysize/requirements.txt +24 -0
- Depth-Anything-3-anysize/src/depth_anything_3/api.py +483 -0
- Depth-Anything-3-anysize/src/depth_anything_3/app/css_and_html.py +594 -0
- Depth-Anything-3-anysize/src/depth_anything_3/app/gradio_app.py +747 -0
- Depth-Anything-3-anysize/src/depth_anything_3/app/modules/__init__.py +43 -0
- Depth-Anything-3-anysize/src/depth_anything_3/app/modules/event_handlers.py +629 -0
- Depth-Anything-3-anysize/src/depth_anything_3/app/modules/file_handlers.py +304 -0
- Depth-Anything-3-anysize/src/depth_anything_3/app/modules/model_inference.py +292 -0
- Depth-Anything-3-anysize/src/depth_anything_3/app/modules/ui_components.py +474 -0
- Depth-Anything-3-anysize/src/depth_anything_3/app/modules/utils.py +207 -0
- Depth-Anything-3-anysize/src/depth_anything_3/app/modules/visualization.py +434 -0
- Depth-Anything-3-anysize/src/depth_anything_3/cfg.py +144 -0
- Depth-Anything-3-anysize/src/depth_anything_3/cli.py +748 -0
- Depth-Anything-3-anysize/src/depth_anything_3/configs/da3-base.yaml +45 -0
- Depth-Anything-3-anysize/src/depth_anything_3/configs/da3-giant.yaml +71 -0
- Depth-Anything-3-anysize/src/depth_anything_3/configs/da3-large.yaml +45 -0
- Depth-Anything-3-anysize/src/depth_anything_3/configs/da3-small.yaml +45 -0
- Depth-Anything-3-anysize/src/depth_anything_3/configs/da3metric-large.yaml +28 -0
- Depth-Anything-3-anysize/src/depth_anything_3/configs/da3mono-large.yaml +28 -0
- Depth-Anything-3-anysize/src/depth_anything_3/configs/da3nested-giant-large.yaml +10 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/__init__.py +20 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/cam_dec.py +45 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/cam_enc.py +80 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/da3.py +377 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/dinov2.py +64 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/__init__.py +25 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/attention.py +100 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/block.py +143 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/drop_path.py +35 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/layer_scale.py +31 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/mlp.py +40 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/patch_embed.py +94 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/rope.py +200 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/swiglu_ffn.py +62 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/vision_transformer.py +437 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dpt.py +458 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/dualdpt.py +488 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/gs_adapter.py +200 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/gsdpt.py +133 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/utils/attention.py +109 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/utils/block.py +81 -0
- Depth-Anything-3-anysize/src/depth_anything_3/model/utils/gs_renderer.py +340 -0
Depth-Anything-3-anysize/.flake8
ADDED
@@ -0,0 +1,3 @@
+[flake8]
+max-line-length = 100
+ignore = E203 E741 W503 E731
Depth-Anything-3-anysize/.gitattributes
ADDED
File without changes
Depth-Anything-3-anysize/.gitignore
ADDED
@@ -0,0 +1,36 @@
+# Python cache
+__pycache__/
+*.py[cod]
+
+
+# Distribution / packaging
+workspace/
+build/
+dist/
+*.egg-info/
+.gradio/
+
+# Test/coverage
+.coverage
+.pytest_cache/
+htmlcov/
+.tox/
+gallery*/
+debug*/
+DA3HF*/
+gradio_workspace/
+eval_workspace/
+FILTER*/
+input_images*/
+*.gradio/
+
+# Jupyter notebooks
+.ipynb_checkpoints
+
+# OS files
+.DS_Store
+
+.vscode
+src/debug_main.py
+temp*.png
+/outputs
Depth-Anything-3-anysize/.pre-commit-config.yaml
ADDED
@@ -0,0 +1,59 @@
+repos:
+  - repo: 'https://github.com/pre-commit/pre-commit-hooks'
+    rev: v4.5.0
+    hooks:
+      - id: check-added-large-files
+        args:
+          - '--maxkb=125'
+      - id: check-ast
+      - id: check-executables-have-shebangs
+      - id: check-merge-conflict
+      - id: check-symlinks
+      - id: check-toml
+      - id: check-yaml
+      - id: debug-statements
+      - id: detect-private-key
+      - id: end-of-file-fixer
+      - id: no-commit-to-branch
+        args:
+          - '--branch'
+          - 'master'
+      - id: pretty-format-json
+        exclude: '.*\.ipynb$'
+        args:
+          - '--autofix'
+          - '--indent'
+          - '4'
+      - id: trailing-whitespace
+        args:
+          - '--markdown-linebreak-ext=md'
+  - repo: 'https://github.com/pycqa/isort'
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        args:
+          - '--settings-file'
+          - 'pyproject.toml'
+          - '--filter-files'
+  - repo: 'https://github.com/asottile/pyupgrade'
+    rev: v3.15.2
+    hooks:
+      - id: pyupgrade
+        args: [--py38-plus, --keep-runtime-typing]
+  - repo: 'https://github.com/psf/black.git'
+    rev: 24.3.0
+    hooks:
+      - id: black
+        args:
+          - '--config=pyproject.toml'
+  - repo: 'https://github.com/PyCQA/flake8'
+    rev: 7.0.0
+    hooks:
+      - id: flake8
+        args:
+          - '--config=.flake8'
+  - repo: 'https://github.com/myint/autoflake'
+    rev: v1.4
+    hooks:
+      - id: autoflake
+        args: [ '--remove-all-unused-imports', '--recursive', '--remove-unused-variables', '--in-place']
Depth-Anything-3-anysize/README.md
ADDED
@@ -0,0 +1,10 @@
+# Depth Anything 3 AnySize
+
+## 🔄 Key Modifications from the [Original Repo](https://github.com/ByteDance-Seed/Depth-Anything-3)
+- **Native-Resolution Inputs:** Images are now processed at their original resolution by default. During inference, inputs are padded to the ViT patch size, and outputs (depth/confidence/sky maps and processed images) are cropped back to the source height and width. Larger inputs therefore increase memory and compute requirements.
+- **Updated Defaults:** The CLI defaults to `--process-res None --process-res-method keep`, and the API uses `process_res=None, process_res_method="keep"`. See `docs/CLI.md` and `docs/API.md` for details.
+- **Optional Downscaling:** For faster inference and lower memory usage, set `process_res` (e.g., `720`) together with a resize strategy such as `--process-res-method upper_bound_resize`.
+- **Original Baseline:** Previously, images were resized to 504 px on the long side.
+- **Implementation Details:** Input padding is handled in `src/depth_anything_3/utils/io/input_processor.py`, and output cropping is managed in `src/depth_anything_3/api.py`.
+
+--------------------------------------
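For reference, a minimal sketch of how these README defaults map onto the Python API is shown below. It is an illustration, not part of the commit: the checkpoint name, sample image path, and argument names are taken from the files elsewhere in this diff (`app.py`, `depth3_anysize.py`, and the `DepthAnything3.inference` signature in `api.py`).

```python
import torch
from depth_anything_3.api import DepthAnything3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DepthAnything3.from_pretrained("depth-anything/DA3-LARGE").to(device)
model.eval()

# Default behaviour of this fork: keep the native resolution.
# Inputs are only padded to the ViT patch size, then cropped back.
native = model.inference(
    image=["assets/examples/SOH/demo.png"],
    process_res=None,
    process_res_method="keep",
)

# Optional downscaling for faster inference and lower memory:
# cap the processing resolution at 720 px before padding.
downscaled = model.inference(
    image=["assets/examples/SOH/demo.png"],
    process_res=720,
    process_res_method="upper_bound_resize",
)

print(native.depth.shape, downscaled.depth.shape)
```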
Depth-Anything-3-anysize/app.py
ADDED
@@ -0,0 +1,110 @@
+from __future__ import annotations
+
+from typing import Dict, Optional, Tuple
+
+import gradio as gr
+import numpy as np
+import torch
+from PIL import Image
+
+from depth_anything_3.api import DepthAnything3
+from depth_anything_3.utils.visualize import visualize_depth
+
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+MODEL_SOURCES: Dict[str, str] = {
+    "Depth Anything v3 Nested Giant Large": "depth-anything/DA3NESTED-GIANT-LARGE",
+    "Depth Anything v3 Giant": "depth-anything/DA3-GIANT",
+    "Depth Anything v3 Large": "depth-anything/DA3-LARGE",
+    "Depth Anything v3 Base": "depth-anything/DA3-BASE",
+    "Depth Anything v3 Small": "depth-anything/DA3-SMALL",
+    "Depth Anything v3 Metric Large": "depth-anything/DA3METRIC-LARGE",
+    "Depth Anything v3 Mono Large": "depth-anything/DA3MONO-LARGE",
+}
+_MODEL_CACHE: Dict[str, DepthAnything3] = {}
+
+
+def _load_model(model_label: str) -> DepthAnything3:
+    repo_id = MODEL_SOURCES[model_label]
+    if repo_id not in _MODEL_CACHE:
+        model = DepthAnything3.from_pretrained(repo_id)
+        model = model.to(device=DEVICE)
+        model.eval()
+        _MODEL_CACHE[repo_id] = model
+    return _MODEL_CACHE[repo_id]
+
+
+def _prep_image(image: np.ndarray) -> np.ndarray:
+    if image.ndim == 2:
+        image = np.stack([image] * 3, axis=-1)
+    if image.dtype != np.uint8:
+        image = np.clip(image, 0, 255).astype(np.uint8)
+    return image
+
+
+def run_inference(
+    model_label: str,
+    image: Optional[np.ndarray],
+) -> tuple[Tuple[np.ndarray, np.ndarray], str]:
+    if image is None:
+        raise gr.Error("Upload an image before running inference.")
+    rgb = _prep_image(image)
+    model = _load_model(model_label)
+    prediction = model.inference(
+        image=[Image.fromarray(rgb)],
+        process_res=None,
+        process_res_method="keep",
+    )
+    depth_map = prediction.depth[0]
+    depth_vis = visualize_depth(depth_map, cmap="Spectral")
+    processed_rgb = (
+        prediction.processed_images[0]
+        if prediction.processed_images is not None
+        else rgb
+    )
+    slider_value: Tuple[np.ndarray, np.ndarray] = (processed_rgb, depth_vis)
+    lines = [
+        f"**Model:** `{MODEL_SOURCES[model_label]}`",
+        f"**Device:** `{DEVICE}`",
+        f"**Depth shape:** `{tuple(prediction.depth.shape)}`",
+    ]
+    if prediction.extrinsics is not None:
+        lines.append(f"**Extrinsics shape:** `{prediction.extrinsics.shape}`")
+    if prediction.intrinsics is not None:
+        lines.append(f"**Intrinsics shape:** `{prediction.intrinsics.shape}`")
+    return slider_value, "\n".join(lines)
+
+
+def build_app() -> gr.Blocks:
+    with gr.Blocks(title="Depth Anything v3 - Any Size Demo") as demo:
+        gr.Markdown(
+            """
+            ## Depth Anything v3 (Any-Size Demo)
+            Upload an image, pick a pretrained model, and compare RGB against the inferred depth.
+            """
+        )
+        with gr.Row():
+            model_dropdown = gr.Dropdown(
+                choices=list(MODEL_SOURCES.keys()),
+                value="Depth Anything v3 Large",
+                label="Model",
+            )
+            image_input = gr.Image(type="numpy", label="Input Image", image_mode="RGB")
+        run_button = gr.Button("Run Inference", variant="primary")
+        with gr.Row():
+            comparison_slider = gr.ImageSlider(label="RGB vs Depth")
+            info_panel = gr.Markdown()
+        run_button.click(
+            fn=run_inference,
+            inputs=[model_dropdown, image_input],
+            outputs=[comparison_slider, info_panel],
+        )
+    return demo
+
+
+def main() -> None:
+    app = build_app()
+    app.queue(max_size=8).launch()
+
+
+if __name__ == "__main__":
+    main()
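Because `app.py` keeps loaded checkpoints in `_MODEL_CACHE`, its `run_inference` handler can also be exercised without launching the Gradio UI, for example as a quick smoke test. The sketch below is only an illustration under that assumption; the function name and return shape come from the `app.py` shown above, and the random image is just a stand-in input.

```python
import numpy as np

from app import run_inference  # app.py at the Space root, as added in this commit

# Smoke-test the Space's handler without starting the Gradio server.
dummy = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
(slider_rgb, slider_depth), info_md = run_inference("Depth Anything v3 Small", dummy)
print(info_md)
print(slider_rgb.shape, slider_depth.shape)
```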
Depth-Anything-3-anysize/depth3_anysize.py
ADDED
@@ -0,0 +1,52 @@
+import os
+import numpy as np
+import matplotlib.pyplot as plt
+from PIL import Image
+import torch
+from depth_anything_3.api import DepthAnything3
+from depth_anything_3.utils.visualize import visualize_depth
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = DepthAnything3.from_pretrained("depth-anything/DA3-LARGE")
+model = model.to(device)
+model.eval()
+print(f"Model loaded on {device}")
+
+# Load sample images and run inference
+image_paths = [
+    "assets/examples/SOH/demo.png",
+]
+
+# Run inference
+prediction = model.inference(
+    image=image_paths,
+    # export_dir=None,
+    # export_format="glb"
+)
+print(f"Depth shape: {prediction.depth.shape}")
+print(f"Extrinsics: {prediction.extrinsics.shape if prediction.extrinsics is not None else 'None'}")
+print(f"Intrinsics: {prediction.intrinsics.shape if prediction.intrinsics is not None else 'None'}")
+
+# Visualize input images and depth maps
+n_images = prediction.depth.shape[0]
+
+fig, axes = plt.subplots(2, n_images, figsize=(12, 6))
+
+if n_images == 1:
+    axes = axes.reshape(2, 1)
+
+for i in range(n_images):
+    # Show original image
+    if prediction.processed_images is not None:
+        axes[0, i].imshow(prediction.processed_images[i])
+    axes[0, i].set_title(f"Input {i+1}")
+    axes[0, i].axis('off')
+
+    # Show depth map
+    depth_vis = visualize_depth(prediction.depth[i], cmap="Spectral")
+    axes[1, i].imshow(depth_vis)
+    axes[1, i].set_title(f"Depth {i+1}")
+    axes[1, i].axis('off')
+
+plt.tight_layout()
+plt.show()
Depth-Anything-3-anysize/pyproject.toml
ADDED
@@ -0,0 +1,93 @@
+[build-system]
+requires = ["hatchling>=1.25", "hatch-vcs>=0.4"]
+build-backend = "hatchling.build"
+
+[project]
+name = "depth-anything-3"
+version = "0.0.0"
+description = "Depth Anything 3"
+readme = "README.md"
+requires-python = ">=3.9, <=3.13"
+license = { text = "Apache-2.0" }
+authors = [{ name = "Your Name" }]
+
+dependencies = [
+    "pre-commit",
+    "trimesh",
+    "torch>=2",
+    "torchvision",
+    "einops",
+    "huggingface_hub",
+    "imageio",
+    "numpy<2",
+    "opencv-python",
+    "open3d",
+    "fastapi",
+    "uvicorn",
+    "requests",
+    "typer",
+    "pillow",
+    "omegaconf",
+    "evo",
+    "e3nn",
+    "moviepy",
+    "plyfile",
+    "pillow_heif",
+    "safetensors",
+    "uvicorn",
+    "moviepy==1.0.3",
+    "typer>=0.9.0",
+    "pycolmap",
+]
+
+[project.optional-dependencies]
+app = ["gradio>=5", "pillow>=9.0"]  # requires that python3>=3.10
+gs = ["gsplat @ git+https://github.com/nerfstudio-project/gsplat.git@0b4dddf04cb687367602c01196913cde6a743d70"]
+all = ["depth-anything-3[app,gs]"]
+
+
+[project.scripts]
+da3 = "depth_anything_3.cli:app"
+
+[project.urls]
+Homepage = "https://github.com/ByteDance-Seed/Depth-Anything-3"
+
+[tool.hatch.version]
+source = "vcs"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/depth_anything_3"]
+
+[tool.hatch.build.targets.sdist]
+include = [
+    "README.md",
+    "pyproject.toml",
+    "src/depth_anything_3",
+]
+
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.mypy]
+plugins = ["jaxtyping.mypy_plugin"]
+
+[tool.black]
+line-length = 99
+target-version = ['py37', 'py38', 'py39', 'py310', 'py311']
+include = '\.pyi?$'
+exclude = '''
+/(
+  | \.git
+)/
+'''
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
+include_trailing_comma = true
+known_third_party = ["bson","cruise","cv2","dataloader","diffusers","omegaconf","tensorflow","torch","torchvision","transformers","gsplat"]
+known_first_party = ["common", "data", "models", "projects"]
+sections = ["FUTURE","STDLIB","THIRDPARTY","FIRSTPARTY","LOCALFOLDER"]
+skip_gitignore = true
+line_length = 99
+no_lines_before="THIRDPARTY"
Depth-Anything-3-anysize/requirements.txt
ADDED
@@ -0,0 +1,24 @@
+torchvision
+einops
+huggingface_hub
+imageio
+opencv-python
+open3d
+fastapi
+requests
+evo
+e3nn
+moviepy==1.0.3
+plyfile
+pillow_heif
+safetensors
+pycolmap
+torch>=2
+uvicorn
+typer>=0.9.0
+pillow
+pre-commit
+trimesh
+numpy<2
+omegaconf
+-e .[all]
Depth-Anything-3-anysize/src/depth_anything_3/api.py
ADDED
@@ -0,0 +1,483 @@
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Depth Anything 3 API module.
+
+This module provides the main API for Depth Anything 3, including model loading,
+inference, and export capabilities. It supports both single and nested model architectures.
+"""
+
+from __future__ import annotations
+
+import time
+from typing import Optional, Sequence
+import numpy as np
+import torch
+import torch.nn as nn
+from huggingface_hub import PyTorchModelHubMixin
+from PIL import Image
+
+from depth_anything_3.cfg import create_object, load_config
+from depth_anything_3.registry import MODEL_REGISTRY
+from depth_anything_3.specs import Prediction
+from depth_anything_3.utils.export import export
+from depth_anything_3.utils.geometry import affine_inverse
+from depth_anything_3.utils.io.input_processor import InputProcessor
+from depth_anything_3.utils.io.output_processor import OutputProcessor
+from depth_anything_3.utils.logger import logger
+from depth_anything_3.utils.pose_align import align_poses_umeyama
+
+torch.backends.cudnn.benchmark = False
+# logger.info("CUDNN Benchmark Disabled")
+
+SAFETENSORS_NAME = "model.safetensors"
+CONFIG_NAME = "config.json"
+
+
+class DepthAnything3(nn.Module, PyTorchModelHubMixin):
+    """
+    Depth Anything 3 main API class.
+
+    This class provides a high-level interface for depth estimation using Depth Anything 3.
+    It supports both single and nested model architectures with metric scaling capabilities.
+
+    Features:
+    - Hugging Face Hub integration via PyTorchModelHubMixin
+    - Support for multiple model presets (vitb, vitg, nested variants)
+    - Automatic mixed precision inference
+    - Export capabilities for various formats (GLB, PLY, NPZ, etc.)
+    - Camera pose estimation and metric depth scaling
+
+    Usage:
+        # Load from Hugging Face Hub
+        model = DepthAnything3.from_pretrained("huggingface/model-name")
+
+        # Or create with specific preset
+        model = DepthAnything3(preset="vitg")
+
+        # Run inference
+        prediction = model.inference(images, export_dir="output", export_format="glb")
+    """
+
+    _commit_hash: str | None = None  # Set by mixin when loading from Hub
+
+    def __init__(self, model_name: str = "da3-large", **kwargs):
+        """
+        Initialize DepthAnything3 with specified preset.
+
+        Args:
+            model_name: The name of the model preset to use.
+                Examples: 'da3-giant', 'da3-large', 'da3metric-large', 'da3nested-giant-large'.
+            **kwargs: Additional keyword arguments (currently unused).
+        """
+        super().__init__()
+        self.model_name = model_name
+
+        # Build the underlying network
+        self.config = load_config(MODEL_REGISTRY[self.model_name])
+        self.model = create_object(self.config)
+        self.model.eval()
+
+        # Initialize processors
+        self.input_processor = InputProcessor()
+        self.output_processor = OutputProcessor()
+
+        # Device management (set by user)
+        self.device = None
+
+    @torch.inference_mode()
+    def forward(
+        self,
+        image: torch.Tensor,
+        extrinsics: torch.Tensor | None = None,
+        intrinsics: torch.Tensor | None = None,
+        export_feat_layers: list[int] | None = None,
+        infer_gs: bool = False,
+    ) -> dict[str, torch.Tensor]:
+        """
+        Forward pass through the model.
+
+        Args:
+            image: Input batch with shape ``(B, N, 3, H, W)`` on the model device.
+            extrinsics: Optional camera extrinsics with shape ``(B, N, 4, 4)``.
+            intrinsics: Optional camera intrinsics with shape ``(B, N, 3, 3)``.
+            export_feat_layers: Layer indices to return intermediate features for.
+
+        Returns:
+            Dictionary containing model predictions
+        """
+        # Determine optimal autocast dtype
+        autocast_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        with torch.no_grad():
+            with torch.autocast(device_type=image.device.type, dtype=autocast_dtype):
+                return self.model(image, extrinsics, intrinsics, export_feat_layers, infer_gs)
+
+    def inference(
+        self,
+        image: list[np.ndarray | Image.Image | str],
+        extrinsics: np.ndarray | None = None,
+        intrinsics: np.ndarray | None = None,
+        align_to_input_ext_scale: bool = True,
+        infer_gs: bool = False,
+        render_exts: np.ndarray | None = None,
+        render_ixts: np.ndarray | None = None,
+        render_hw: tuple[int, int] | None = None,
+        process_res: int | None = None,
+        process_res_method: str = "keep",
+        export_dir: str | None = None,
+        export_format: str = "mini_npz",
+        export_feat_layers: Sequence[int] | None = None,
+        # GLB export parameters
+        conf_thresh_percentile: float = 40.0,
+        num_max_points: int = 1_000_000,
+        show_cameras: bool = True,
+        # Feat_vis export parameters
+        feat_vis_fps: int = 15,
+        # Other export parameters, e.g., gs_ply, gs_video
+        export_kwargs: Optional[dict] = {},
+    ) -> Prediction:
+        """
+        Run inference on input images.
+
+        Args:
+            image: List of input images (numpy arrays, PIL Images, or file paths)
+            extrinsics: Camera extrinsics (N, 4, 4)
+            intrinsics: Camera intrinsics (N, 3, 3)
+            align_to_input_ext_scale: whether to align the input pose scale to the prediction
+            infer_gs: Enable the 3D Gaussian branch (needed for `gs_ply`/`gs_video` exports)
+            render_exts: Optional render extrinsics for Gaussian video export
+            render_ixts: Optional render intrinsics for Gaussian video export
+            render_hw: Optional render resolution for Gaussian video export
+            process_res: Processing resolution
+            process_res_method: Resize method for processing
+            export_dir: Directory to export results
+            export_format: Export format (mini_npz, npz, glb, ply, gs, gs_video)
+            export_feat_layers: Layer indices to export intermediate features from
+            conf_thresh_percentile: [GLB] Lower percentile for adaptive confidence threshold (default: 40.0)  # noqa: E501
+            num_max_points: [GLB] Maximum number of points in the point cloud (default: 1,000,000)
+            show_cameras: [GLB] Show camera wireframes in the exported scene (default: True)
+            feat_vis_fps: [FEAT_VIS] Frame rate for output video (default: 15)
+            export_kwargs: additional arguments to export functions.
+
+        Returns:
+            Prediction object containing depth maps and camera parameters
+        """
+        if "gs" in export_format:
+            assert infer_gs, "must set `infer_gs=True` to perform gs-related export."
+
+        if "colmap" in export_format:
+            assert isinstance(image[0], str), "`image` must be image paths for COLMAP export."
+
+        # Preprocess images
+        imgs_cpu, extrinsics, intrinsics, pad_meta = self._preprocess_inputs(
+            image, extrinsics, intrinsics, process_res, process_res_method
+        )
+
+        # Prepare tensors for model
+        imgs, ex_t, in_t = self._prepare_model_inputs(imgs_cpu, extrinsics, intrinsics)
+
+        # Normalize extrinsics
+        ex_t_norm = self._normalize_extrinsics(ex_t.clone() if ex_t is not None else None)
+
+        # Run model forward pass
+        export_feat_layers = list(export_feat_layers) if export_feat_layers is not None else []
+
+        raw_output = self._run_model_forward(imgs, ex_t_norm, in_t, export_feat_layers, infer_gs)
+
+        # Convert raw output to prediction
+        prediction = self._convert_to_prediction(raw_output)
+
+        # Crop padded regions back to original sizes if needed
+        prediction = self._crop_to_original(prediction, pad_meta)
+
+        # Align prediction to extrinsincs
+        prediction = self._align_to_input_extrinsics_intrinsics(
+            extrinsics, intrinsics, prediction, align_to_input_ext_scale
+        )
+
+        # Add processed images for visualization
+        prediction = self._add_processed_images(prediction, imgs_cpu, pad_meta)
+
+        # Export if requested
+        if export_dir is not None:
+
+            if "gs" in export_format:
+                if infer_gs and "gs_video" not in export_format:
+                    export_format = f"{export_format}-gs_video"
+                if "gs_video" in export_format:
+                    if "gs_video" not in export_kwargs:
+                        export_kwargs["gs_video"] = {}
+                    export_kwargs["gs_video"].update(
+                        {
+                            "extrinsics": render_exts,
+                            "intrinsics": render_ixts,
+                            "out_image_hw": render_hw,
+                        }
+                    )
+            # Add GLB export parameters
+            if "glb" in export_format:
+                if "glb" not in export_kwargs:
+                    export_kwargs["glb"] = {}
+                export_kwargs["glb"].update(
+                    {
+                        "conf_thresh_percentile": conf_thresh_percentile,
+                        "num_max_points": num_max_points,
+                        "show_cameras": show_cameras,
+                    }
+                )
+            # Add Feat_vis export parameters
+            if "feat_vis" in export_format:
+                if "feat_vis" not in export_kwargs:
+                    export_kwargs["feat_vis"] = {}
+                export_kwargs["feat_vis"].update(
+                    {
+                        "fps": feat_vis_fps,
+                    }
+                )
+            # Add COLMAP export parameters
+            if "colmap" in export_format:
+                if "colmap" not in export_kwargs:
+                    export_kwargs["colmap"] = {}
+                export_kwargs["colmap"].update(
+                    {
+                        "image_paths": image,
+                        "conf_thresh_percentile": conf_thresh_percentile,
+                        "process_res_method": process_res_method,
+                    }
+                )
+            self._export_results(prediction, export_format, export_dir, **export_kwargs)
+
+        return prediction
+
+    def _preprocess_inputs(
+        self,
+        image: list[np.ndarray | Image.Image | str],
+        extrinsics: np.ndarray | None = None,
+        intrinsics: np.ndarray | None = None,
+        process_res: int | None = None,
+        process_res_method: str = "keep",
+    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None, list[dict]]:
+        """Preprocess input images using input processor."""
+        start_time = time.time()
+        imgs_cpu, extrinsics, intrinsics, pad_meta = self.input_processor(
+            image,
+            extrinsics.copy() if extrinsics is not None else None,
+            intrinsics.copy() if intrinsics is not None else None,
+            process_res,
+            process_res_method,
+        )
+        end_time = time.time()
+        logger.info(
+            "Processed Images Done taking",
+            end_time - start_time,
+            "seconds. Shape: ",
+            imgs_cpu.shape,
+        )
+        return imgs_cpu, extrinsics, intrinsics, pad_meta
+
+    def _prepare_model_inputs(
+        self,
+        imgs_cpu: torch.Tensor,
+        extrinsics: torch.Tensor | None,
+        intrinsics: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
+        """Prepare tensors for model input."""
+        device = self._get_model_device()
+
+        # Move images to model device
+        imgs = imgs_cpu.to(device, non_blocking=True)[None].float()
+
+        # Convert camera parameters to tensors
+        ex_t = (
+            extrinsics.to(device, non_blocking=True)[None].float()
+            if extrinsics is not None
+            else None
+        )
+        in_t = (
+            intrinsics.to(device, non_blocking=True)[None].float()
+            if intrinsics is not None
+            else None
+        )
+
+        return imgs, ex_t, in_t
+
+    def _normalize_extrinsics(self, ex_t: torch.Tensor | None) -> torch.Tensor | None:
+        """Normalize extrinsics"""
+        if ex_t is None:
+            return None
+        transform = affine_inverse(ex_t[:, :1])
+        ex_t_norm = ex_t @ transform
+        c2ws = affine_inverse(ex_t_norm)
+        translations = c2ws[..., :3, 3]
+        dists = translations.norm(dim=-1)
+        median_dist = torch.median(dists)
+        median_dist = torch.clamp(median_dist, min=1e-1)
+        ex_t_norm[..., :3, 3] = ex_t_norm[..., :3, 3] / median_dist
+        return ex_t_norm
+
+    def _align_to_input_extrinsics_intrinsics(
+        self,
+        extrinsics: torch.Tensor | None,
+        intrinsics: torch.Tensor | None,
+        prediction: Prediction,
+        align_to_input_ext_scale: bool = True,
+        ransac_view_thresh: int = 10,
+    ) -> Prediction:
+        """Align depth map to input extrinsics"""
+        if extrinsics is None:
+            return prediction
+        prediction.intrinsics = intrinsics.numpy()
+        _, _, scale, aligned_extrinsics = align_poses_umeyama(
+            prediction.extrinsics,
+            extrinsics.numpy(),
+            ransac=len(extrinsics) >= ransac_view_thresh,
+            return_aligned=True,
+            random_state=42,
+        )
+        if align_to_input_ext_scale:
+            prediction.extrinsics = extrinsics[..., :3, :].numpy()
+            prediction.depth /= scale
+        else:
+            prediction.extrinsics = aligned_extrinsics
+        return prediction
+
+    def _run_model_forward(
+        self,
+        imgs: torch.Tensor,
+        ex_t: torch.Tensor | None,
+        in_t: torch.Tensor | None,
+        export_feat_layers: Sequence[int] | None = None,
+        infer_gs: bool = False,
+    ) -> dict[str, torch.Tensor]:
+        """Run model forward pass."""
+        device = imgs.device
+        need_sync = device.type == "cuda"
+        if need_sync:
+            torch.cuda.synchronize(device)
+        start_time = time.time()
+        feat_layers = list(export_feat_layers) if export_feat_layers is not None else None
+        output = self.forward(imgs, ex_t, in_t, feat_layers, infer_gs)
+        if need_sync:
+            torch.cuda.synchronize(device)
+        end_time = time.time()
+        logger.info(f"Model Forward Pass Done. Time: {end_time - start_time} seconds")
+        return output
+
+    def _convert_to_prediction(self, raw_output: dict[str, torch.Tensor]) -> Prediction:
+        """Convert raw model output to Prediction object."""
+        start_time = time.time()
+        output = self.output_processor(raw_output)
+        end_time = time.time()
+        logger.info(f"Conversion to Prediction Done. Time: {end_time - start_time} seconds")
+        return output
+
+    def _add_processed_images(
+        self, prediction: Prediction, imgs_cpu: torch.Tensor, pad_meta: list[dict]
+    ) -> Prediction:
+        """Add processed images to prediction for visualization."""
+        # Convert from (N, 3, H, W) to (N, H, W, 3) and denormalize
+        processed_imgs = imgs_cpu.permute(0, 2, 3, 1).cpu().numpy()  # (N, H, W, 3)
+
+        # Denormalize from ImageNet normalization
+        mean = np.array([0.485, 0.456, 0.406])
+        std = np.array([0.229, 0.224, 0.225])
+        processed_imgs = processed_imgs * std + mean
+        processed_imgs = np.clip(processed_imgs, 0, 1)
+        processed_imgs = (processed_imgs * 255).astype(np.uint8)
+
+        # Crop to original size if padding was applied
+        if pad_meta:
+            cropped_imgs = []
+            for i, meta in enumerate(pad_meta):
+                img = processed_imgs[i]
+                pt, pb, pl, pr = meta.get("pad", (0, 0, 0, 0))
+                if any((pt, pb, pl, pr)):
+                    img = img[pt : img.shape[0] - pb if pb > 0 else img.shape[0], pl : img.shape[1] - pr if pr > 0 else img.shape[1]]
+                cropped_imgs.append(img)
+            processed_imgs = np.stack(cropped_imgs, axis=0)
+
+        prediction.processed_images = processed_imgs
+        return prediction
+
+    def _export_results(
+        self, prediction: Prediction, export_format: str, export_dir: str, **kwargs
+    ) -> None:
+        """Export results to specified format and directory."""
+        start_time = time.time()
+        export(prediction, export_format, export_dir, **kwargs)
+        end_time = time.time()
+        logger.info(f"Export Results Done. Time: {end_time - start_time} seconds")
+
+    def _get_model_device(self) -> torch.device:
+        """
+        Get the device where the model is located.
+
+        Returns:
+            Device where the model parameters are located
+
+        Raises:
+            ValueError: If no tensors are found in the model
+        """
+        if self.device is not None:
+            return self.device
+
+        # Find device from parameters
+        for param in self.parameters():
+            self.device = param.device
+            return param.device
+
+        # Find device from buffers
+        for buffer in self.buffers():
+            self.device = buffer.device
+            return buffer.device
+
+        raise ValueError("No tensor found in model")
+
+    def _crop_to_original(self, prediction: Prediction, pad_meta: list[dict]) -> Prediction:
+        """
+        Remove padding added for patch divisibility to restore original HxW.
+        """
+        if not pad_meta:
+            return prediction
+        depth_list = []
+        conf_list = [] if prediction.conf is not None else None
+        sky_list = [] if prediction.sky is not None else None
+
+        for idx, meta in enumerate(pad_meta):
+            pt, pb, pl, pr = meta.get("pad", (0, 0, 0, 0))
+
+            def crop(arr: np.ndarray | None) -> np.ndarray | None:
+                if arr is None:
+                    return None
+                h, w = arr.shape[-2], arr.shape[-1]
+                return arr[pt : h - pb if pb > 0 else h, pl : w - pr if pr > 0 else w]
+
+            depth_list.append(crop(prediction.depth[idx]) if prediction.depth is not None else None)
+            if conf_list is not None:
+                conf_list.append(crop(prediction.conf[idx]))
+            if sky_list is not None:
+                sky_list.append(crop(prediction.sky[idx]))
+
+            if prediction.intrinsics is not None:
+                prediction.intrinsics[idx, 0, 2] -= pl
+                prediction.intrinsics[idx, 1, 2] -= pt
+
+        if depth_list:
+            prediction.depth = np.stack(depth_list, axis=0)
+        if conf_list is not None:
+            prediction.conf = np.stack(conf_list, axis=0)
+        if sky_list is not None:
+            prediction.sky = np.stack(sky_list, axis=0)
+
+        return prediction
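The `_crop_to_original` and `_add_processed_images` helpers above both consume a per-view `pad_meta` entry of the form `{"pad": (top, bottom, left, right)}` produced during preprocessing. The sketch below only illustrates that bookkeeping; it is not the repo's actual `InputProcessor` (which lives in `utils/io/input_processor.py` and is outside this 50-file view), and the patch size of 14 is an assumption.

```python
import numpy as np

# Illustration of the pad-to-patch-multiple / crop-back round trip described above.
PATCH = 14  # assumed DINOv2-style ViT patch size

def pad_to_multiple(img: np.ndarray, patch: int = PATCH):
    """Pad an (H, W, 3) image on the bottom/right so H and W are multiples of `patch`."""
    h, w = img.shape[:2]
    pb = (-h) % patch
    pr = (-w) % patch
    padded = np.pad(img, ((0, pb), (0, pr), (0, 0)), mode="edge")
    # Mirrors the (top, bottom, left, right) tuple consumed by _crop_to_original.
    return padded, {"pad": (0, pb, 0, pr)}

def crop_back(arr: np.ndarray, meta: dict) -> np.ndarray:
    """Undo the padding on an (H, W) prediction, as _crop_to_original does per view."""
    pt, pb, pl, pr = meta["pad"]
    h, w = arr.shape[-2], arr.shape[-1]
    return arr[pt : h - pb if pb > 0 else h, pl : w - pr if pr > 0 else w]

img = np.zeros((721, 1081, 3), dtype=np.uint8)
padded, meta = pad_to_multiple(img)                   # (728, 1084, 3)
depth = np.zeros(padded.shape[:2], dtype=np.float32)  # stand-in for a model output
assert crop_back(depth, meta).shape == img.shape[:2]  # restored to the source H x W
```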
Depth-Anything-3-anysize/src/depth_anything_3/app/css_and_html.py
ADDED
@@ -0,0 +1,594 @@
+# flake8: noqa: E501
+
+# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+CSS and HTML content for the Depth Anything 3 Gradio application.
+This module contains all the CSS styles and HTML content blocks
+used in the Gradio interface.
+"""
+
+# CSS Styles for the Gradio interface
+GRADIO_CSS = """
+/* Add Font Awesome CDN with all styles including brands and colors */
+@import url('https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css');
+
+/* Add custom styles for colored icons */
+.fa-color-blue {
+    color: #3b82f6;
+}
+
+.fa-color-purple {
+    color: #8b5cf6;
+}
+
+.fa-color-cyan {
+    color: #06b6d4;
+}
+
+.fa-color-green {
+    color: #10b981;
+}
+
+.fa-color-yellow {
+    color: #f59e0b;
+}
+
+.fa-color-red {
+    color: #ef4444;
+}
+
+.link-btn {
+    display: inline-flex;
+    align-items: center;
+    gap: 8px;
+    text-decoration: none;
+    padding: 12px 24px;
+    border-radius: 50px;
+    font-weight: 500;
+    transition: all 0.3s ease;
+}
+
+/* Dark mode tech theme */
+@media (prefers-color-scheme: dark) {
+    html, body {
+        background: #1e293b;
+        color: #ffffff;
+    }
+
+    .gradio-container {
+        background: #1e293b;
+        color: #ffffff;
+    }
+
+    .link-btn {
+        background: rgba(255, 255, 255, 0.2);
+        color: white;
+        backdrop-filter: blur(10px);
+        border: 1px solid rgba(255, 255, 255, 0.3);
+    }
+
+    .link-btn:hover {
+        background: rgba(255, 255, 255, 0.3);
+        transform: translateY(-2px);
+        box-shadow: 0 8px 25px rgba(0, 0, 0, 0.2);
+    }
+
+    .tech-bg {
+        background: linear-gradient(135deg, #0f172a, #1e293b); /* Darker colors */
+        position: relative;
+        overflow: hidden;
+    }
+
+    .tech-bg::before {
+        content: '';
+        position: absolute;
+        top: 0;
+        left: 0;
+        right: 0;
+        bottom: 0;
+        background:
+            radial-gradient(circle at 20% 80%, rgba(59, 130, 246, 0.15) 0%, transparent 50%), /* Reduced opacity */
+            radial-gradient(circle at 80% 20%, rgba(139, 92, 246, 0.15) 0%, transparent 50%), /* Reduced opacity */
+            radial-gradient(circle at 40% 40%, rgba(18, 194, 233, 0.1) 0%, transparent 50%); /* Reduced opacity */
+        animation: techPulse 8s ease-in-out infinite;
+    }
+
+    .gradio-container .panel,
+    .gradio-container .block,
+    .gradio-container .form {
+        background: rgba(0, 0, 0, 0.3);
+        border: 1px solid rgba(59, 130, 246, 0.2);
+        border-radius: 10px;
+    }
+
+    .gradio-container * {
+        color: #ffffff;
+    }
+
+    .gradio-container label {
+        color: #e0e0e0;
+    }
+
+    .gradio-container .markdown {
+        color: #e0e0e0;
+    }
+}
+
+/* Light mode tech theme */
+@media (prefers-color-scheme: light) {
+    html, body {
+        background: #ffffff;
+        color: #1e293b;
+    }
+
+    .gradio-container {
+        background: #ffffff;
+        color: #1e293b;
+    }
+
+    .tech-bg {
+        background: linear-gradient(135deg, #ffffff, #f1f5f9);
+        position: relative;
+        overflow: hidden;
+    }
+
+    .link-btn {
+        background: rgba(59, 130, 246, 0.15);
+        color: var(--body-text-color);
+        border: 1px solid rgba(59, 130, 246, 0.3);
+    }
+
+    .link-btn:hover {
+        background: rgba(59, 130, 246, 0.25);
+        transform: translateY(-2px);
+        box-shadow: 0 8px 25px rgba(59, 130, 246, 0.2);
+    }
+
+    .tech-bg::before {
+        content: '';
+        position: absolute;
+        top: 0;
+        left: 0;
+        right: 0;
+        bottom: 0;
+        background:
+            radial-gradient(circle at 20% 80%, rgba(59, 130, 246, 0.1) 0%, transparent 50%),
+            radial-gradient(circle at 80% 20%, rgba(139, 92, 246, 0.1) 0%, transparent 50%),
+            radial-gradient(circle at 40% 40%, rgba(18, 194, 233, 0.08) 0%, transparent 50%);
+        animation: techPulse 8s ease-in-out infinite;
+    }
+
+    .gradio-container .panel,
+    .gradio-container .block,
+    .gradio-container .form {
+        background: rgba(255, 255, 255, 0.8);
+        border: 1px solid rgba(59, 130, 246, 0.3);
+        border-radius: 10px;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    }
+
+    .gradio-container * {
+        color: #1e293b;
+    }
+
+    .gradio-container label {
+        color: #334155;
+    }
+
+    .gradio-container .markdown {
+        color: #334155;
+    }
+}
+
+
+
+
+@keyframes techPulse {
+    0%, 100% { opacity: 0.5; }
+    50% { opacity: 0.8; }
+}
+
+/* Custom log with tech gradient */
+.custom-log * {
+    font-style: italic;
+    font-size: 22px !important;
+    background: linear-gradient(135deg, #3b82f6, #8b5cf6);
+    background-size: 400% 400%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    font-weight: bold !important;
+    color: transparent !important;
+    text-align: center !important;
+    animation: techGradient 3s ease infinite;
+}
+
+@keyframes techGradient {
+    0% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+    100% { background-position: 0% 50%; }
+}
+
+@keyframes metricPulse {
+    0%, 100% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+}
+
+@keyframes pointcloudPulse {
+    0%, 100% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+}
+
+@keyframes camerasPulse {
+    0%, 100% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+}
+
+@keyframes gaussiansPulse {
+    0%, 100% { background-position: 0% 50%; }
+    50% { background-position: 100% 50%; }
+}
+
+/* Special colors for key terms - Global styles */
+.metric-text {
+    background: linear-gradient(45deg, #ff6b6b, #ff8e53, #ff6b6b);
+    background-size: 200% 200%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent !important;
+    animation: metricPulse 2s ease-in-out infinite;
+    font-weight: 700;
+    text-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
+}
+
+.pointcloud-text {
+    background: linear-gradient(45deg, #4ecdc4, #44a08d, #4ecdc4);
+    background-size: 200% 200%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent !important;
+    animation: pointcloudPulse 2.5s ease-in-out infinite;
+    font-weight: 700;
+    text-shadow: 0 0 10px rgba(78, 205, 196, 0.5);
+}
+
+.cameras-text {
+    background: linear-gradient(45deg, #667eea, #764ba2, #667eea);
+    background-size: 200% 200%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent !important;
+    animation: camerasPulse 3s ease-in-out infinite;
+    font-weight: 700;
+    text-shadow: 0 0 10px rgba(102, 126, 234, 0.5);
+}
+
+.gaussians-text {
+    background: linear-gradient(45deg, #f093fb, #f5576c, #f093fb);
+    background-size: 200% 200%;
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent !important;
+    animation: gaussiansPulse 2.2s ease-in-out infinite;
+    font-weight: 700;
+    text-shadow: 0 0 10px rgba(240, 147, 251, 0.5);
+}
+
+.example-log * {
+    font-style: italic;
+    font-size: 16px !important;
+    background: linear-gradient(135deg, #3b82f6, #8b5cf6);
+    -webkit-background-clip: text;
+    background-clip: text;
+    color: transparent !important;
+}
+
+#my_radio .wrap {
+    display: flex;
+    flex-wrap: nowrap;
+    justify-content: center;
+    align-items: center;
+}
+
+#my_radio .wrap label {
+    display: flex;
+    width: 50%;
+    justify-content: center;
+    align-items: center;
+    margin: 0;
+    padding: 10px 0;
+    box-sizing: border-box;
+}
+
+/* Align navigation buttons with dropdown bottom */
+.navigation-row {
+    display: flex !important;
+    align-items: flex-end !important;
+    gap: 8px !important;
+}
+
+.navigation-row > div:nth-child(1),
+.navigation-row > div:nth-child(3) {
+    align-self: flex-end !important;
+}
+
+.navigation-row > div:nth-child(2) {
+    flex: 1 !important;
+}
+
+/* Make thumbnails clickable with pointer cursor */
+.clickable-thumbnail img {
+    cursor: pointer !important;
+}
+
+.clickable-thumbnail:hover img {
+    cursor: pointer !important;
+    opacity: 0.8;
+    transition: opacity 0.3s ease;
+}
+
+/* Make thumbnail containers narrower horizontally */
+.clickable-thumbnail {
+    padding: 5px 2px !important;
+    margin: 0 2px !important;
+}
+
+.clickable-thumbnail .image-container {
+    margin: 0 !important;
+    padding: 0 !important;
+}
+
+.scene-info {
+    text-align: center !important;
+    padding: 5px 2px !important;
+    margin: 0 !important;
+}
+"""
+
+
+def get_header_html(logo_base64=None):
+    """
+    Generate the main header HTML with logo and title.
+
+    Args:
+        logo_base64 (str, optional): Base64 encoded logo image
+
+    Returns:
+        str: HTML string for the header
|
| 370 |
+
"""
|
| 371 |
+
return """
|
| 372 |
+
<div class="tech-bg" style="text-align: center; margin-bottom: 5px; padding: 40px 20px; border-radius: 15px; position: relative; overflow: hidden;">
|
| 373 |
+
<div style="position: relative; z-index: 2;">
|
| 374 |
+
<h1 style="margin: 0; font-size: 3.5em; font-weight: 700;
|
| 375 |
+
background: linear-gradient(135deg, #3b82f6, #8b5cf6);
|
| 376 |
+
background-size: 400% 400%;
|
| 377 |
+
-webkit-background-clip: text;
|
| 378 |
+
background-clip: text;
|
| 379 |
+
color: transparent;
|
| 380 |
+
animation: techGradient 3s ease infinite;
|
| 381 |
+
text-shadow: 0 0 30px rgba(59, 130, 246, 0.5);
|
| 382 |
+
letter-spacing: 2px;">
|
| 383 |
+
Depth Anything 3
|
| 384 |
+
</h1>
|
| 385 |
+
<p style="margin: 15px 0 0 0; font-size: 2.16em; font-weight: 300;" class="header-subtitle">
|
| 386 |
+
Recovering the Visual Space from Any Views
|
| 387 |
+
</p>
|
| 388 |
+
<div style="margin-top: 20px;">
|
| 389 |
+
<!-- Revert buttons to original inline styles -->
|
| 390 |
+
<a href="https://depth-anything-3.github.io" target="_blank" class="link-btn">
|
| 391 |
+
<i class="fas fa-globe" style="margin-right: 8px;"></i> Project Page
|
| 392 |
+
</a>
|
| 393 |
+
<a href="https://arxiv.org/abs/2406.09414" target="_blank" class="link-btn">
|
| 394 |
+
<i class="fas fa-file-pdf" style="margin-right: 8px;"></i> Paper
|
| 395 |
+
</a>
|
| 396 |
+
<a href="https://github.com/ByteDance-Seed/Depth-Anything-3" target="_blank" class="link-btn">
|
| 397 |
+
<i class="fab fa-github" style="margin-right: 8px;"></i> Code
|
| 398 |
+
</a>
|
| 399 |
+
</div>
|
| 400 |
+
</div>
|
| 401 |
+
</div>
|
| 402 |
+
|
| 403 |
+
<style>
|
| 404 |
+
/* Ensure tech-bg class is properly applied in dark mode */
|
| 405 |
+
@media (prefers-color-scheme: dark) {
|
| 406 |
+
.header-subtitle {
|
| 407 |
+
color: #cbd5e1;
|
| 408 |
+
}
|
| 409 |
+
/* Increase priority to ensure background color is properly applied */
|
| 410 |
+
.tech-bg {
|
| 411 |
+
background: linear-gradient(135deg, #0f172a, #1e293b) !important;
|
| 412 |
+
}
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
@media (prefers-color-scheme: light) {
|
| 416 |
+
.header-subtitle {
|
| 417 |
+
color: #475569;
|
| 418 |
+
}
|
| 419 |
+
/* Also add explicit background color for light mode */
|
| 420 |
+
.tech-bg {
|
| 421 |
+
background: linear-gradient(135deg, rgba(59, 130, 246, 0.1) 0%, rgba(139, 92, 246, 0.1) 100%) !important;
|
| 422 |
+
}
|
| 423 |
+
}
|
| 424 |
+
</style>
|
| 425 |
+
"""
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
def get_description_html():
|
| 429 |
+
"""
|
| 430 |
+
Generate the main description and getting started HTML.
|
| 431 |
+
|
| 432 |
+
Returns:
|
| 433 |
+
str: HTML string for the description
|
| 434 |
+
"""
|
| 435 |
+
return """
|
| 436 |
+
<div class="description-container" style="padding: 25px; border-radius: 15px; margin: 0 0 20px 0;">
|
| 437 |
+
<h2 class="description-title" style="margin-top: 0; font-size: 1.6em; text-align: center;">
|
| 438 |
+
<i class="fas fa-bullseye fa-color-red" style="margin-right: 8px;"></i> What This Demo Does
|
| 439 |
+
</h2>
|
| 440 |
+
<div class="description-content" style="padding: 20px; border-radius: 10px; margin: 15px 0; text-align: center;">
|
| 441 |
+
<p class="description-main" style="line-height: 1.6; margin: 0; font-size: 1.45em;">
|
| 442 |
+
<strong>Upload images or videos</strong> → <strong>Get <span class="metric-text">Metric</span> <span class="pointcloud-text">Point Clouds</span>, <span class="cameras-text">Cameras</span> and <span class="gaussians-text">Novel Views</span></strong> → <strong>Explore in 3D</strong>
|
| 443 |
+
</p>
|
| 444 |
+
</div>
|
| 445 |
+
|
| 446 |
+
<div style="text-align: center; margin-top: 15px;">
|
| 447 |
+
<p class="description-tip" style="font-style: italic; margin: 0;">
|
| 448 |
+
<i class="fas fa-lightbulb fa-color-yellow" style="margin-right: 8px;"></i> <strong>Tip:</strong> Landscape-oriented images or videos are preferred for best 3D recovering.
|
| 449 |
+
</p>
|
| 450 |
+
</div>
|
| 451 |
+
</div>
|
| 452 |
+
|
| 453 |
+
<style>
|
| 454 |
+
@media (prefers-color-scheme: dark) {
|
| 455 |
+
.description-container {
|
| 456 |
+
background: linear-gradient(135deg, rgba(59, 130, 246, 0.1) 0%, rgba(139, 92, 246, 0.1) 100%);
|
| 457 |
+
border: 1px solid rgba(59, 130, 246, 0.2);
|
| 458 |
+
}
|
| 459 |
+
.description-title { color: #3b82f6; }
|
| 460 |
+
.description-content { background: rgba(0, 0, 0, 0.3); }
|
| 461 |
+
.description-main { color: #e0e0e0; }
|
| 462 |
+
.description-text { color: #cbd5e1; }
|
| 463 |
+
.description-tip { color: #cbd5e1; }
|
| 464 |
+
}
|
| 465 |
+
|
| 466 |
+
@media (prefers-color-scheme: light) {
|
| 467 |
+
.description-container {
|
| 468 |
+
background: linear-gradient(135deg, rgba(59, 130, 246, 0.05) 0%, rgba(139, 92, 246, 0.05) 100%);
|
| 469 |
+
border: 1px solid rgba(59, 130, 246, 0.3);
|
| 470 |
+
}
|
| 471 |
+
.description-title { color: #3b82f6; }
|
| 472 |
+
.description-content { background: transparent; }
|
| 473 |
+
.description-main { color: #1e293b; }
|
| 474 |
+
.description-text { color: #475569; }
|
| 475 |
+
.description-tip { color: #475569; }
|
| 476 |
+
}
|
| 477 |
+
</style>
|
| 478 |
+
"""
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
def get_acknowledgements_html():
|
| 482 |
+
"""
|
| 483 |
+
Generate the acknowledgements section HTML.
|
| 484 |
+
|
| 485 |
+
Returns:
|
| 486 |
+
str: HTML string for the acknowledgements
|
| 487 |
+
"""
|
| 488 |
+
return """
|
| 489 |
+
<div style="background: linear-gradient(135deg, rgba(59, 130, 246, 0.1) 0%, rgba(139, 92, 246, 0.1) 100%);
|
| 490 |
+
padding: 25px; border-radius: 15px; margin: 20px 0; border: 1px solid rgba(59, 130, 246, 0.2);">
|
| 491 |
+
<h3 style="color: #3b82f6; margin-top: 0; text-align: center; font-size: 1.4em;">
|
| 492 |
+
<i class="fas fa-trophy fa-color-yellow" style="margin-right: 8px;"></i> Research Credits & Acknowledgments
|
| 493 |
+
</h3>
|
| 494 |
+
|
| 495 |
+
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin: 15px 0;">
|
| 496 |
+
<!-- Original Research Section (Left) -->
|
| 497 |
+
<div style="text-align: center;">
|
| 498 |
+
<h4 style="color: #8b5cf6; margin: 10px 0;"><i class="fas fa-flask fa-color-green" style="margin-right: 8px;"></i> Original Research</h4>
|
| 499 |
+
<p style="color: #e0e0e0; margin: 5px 0;">
|
| 500 |
+
<a href="https://depth-anything-3.github.io" target="_blank"
|
| 501 |
+
style="color: #3b82f6; text-decoration: none; font-weight: 600;">
|
| 502 |
+
Depth Anything 3
|
| 503 |
+
</a>
|
| 504 |
+
</p>
|
| 505 |
+
</div>
|
| 506 |
+
|
| 507 |
+
<!-- Previous Versions Section (Right) -->
|
| 508 |
+
<div style="text-align: center;">
|
| 509 |
+
<h4 style="color: #8b5cf6; margin: 10px 0;"><i class="fas fa-history fa-color-blue" style="margin-right: 8px;"></i> Previous Versions</h4>
|
| 510 |
+
<div style="display: flex; flex-direction: row; gap: 15px; justify-content: center; align-items: center;">
|
| 511 |
+
<p style="color: #e0e0e0; margin: 0;">
|
| 512 |
+
<a href="https://huggingface.co/spaces/LiheYoung/Depth-Anything" target="_blank"
|
| 513 |
+
style="color: #3b82f6; text-decoration: none; font-weight: 600;">
|
| 514 |
+
Depth-Anything
|
| 515 |
+
</a>
|
| 516 |
+
</p>
|
| 517 |
+
<span style="color: #e0e0e0;">•</span>
|
| 518 |
+
<p style="color: #e0e0e0; margin: 0;">
|
| 519 |
+
<a href="https://huggingface.co/spaces/depth-anything/Depth-Anything-V2" target="_blank"
|
| 520 |
+
style="color: #3b82f6; text-decoration: none; font-weight: 600;">
|
| 521 |
+
Depth-Anything-V2
|
| 522 |
+
</a>
|
| 523 |
+
</p>
|
| 524 |
+
</div>
|
| 525 |
+
</div>
|
| 526 |
+
</div>
|
| 527 |
+
|
| 528 |
+
<!-- HF Demo Adapted from - Centered at the bottom of the whole block -->
|
| 529 |
+
<div style="margin-top: 20px; padding-top: 15px; border-top: 1px solid rgba(59, 130, 246, 0.3); text-align: center;">
|
| 530 |
+
<p style="color: #a0a0a0; font-size: 0.9em; margin: 0;">
|
| 531 |
+
<i class="fas fa-code-branch fa-color-gray" style="margin-right: 5px;"></i> HF demo adapted from <a href="https://huggingface.co/spaces/facebook/map-anything" target="_blank" style="color: inherit; text-decoration: none;">Map Anything</a>
|
| 532 |
+
</p>
|
| 533 |
+
</div>
|
| 534 |
+
</div>
|
| 535 |
+
"""
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
def get_gradio_theme():
|
| 539 |
+
"""
|
| 540 |
+
Get the configured Gradio theme with adaptive tech colors.
|
| 541 |
+
|
| 542 |
+
Returns:
|
| 543 |
+
gr.themes.Base: Configured Gradio theme
|
| 544 |
+
"""
|
| 545 |
+
import gradio as gr
|
| 546 |
+
|
| 547 |
+
return gr.themes.Base(
|
| 548 |
+
primary_hue=gr.themes.Color(
|
| 549 |
+
c50="#eff6ff",
|
| 550 |
+
c100="#dbeafe",
|
| 551 |
+
c200="#bfdbfe",
|
| 552 |
+
c300="#93c5fd",
|
| 553 |
+
c400="#60a5fa",
|
| 554 |
+
c500="#3b82f6",
|
| 555 |
+
c600="#2563eb",
|
| 556 |
+
c700="#1d4ed8",
|
| 557 |
+
c800="#1e40af",
|
| 558 |
+
c900="#1e3a8a",
|
| 559 |
+
c950="#172554",
|
| 560 |
+
),
|
| 561 |
+
secondary_hue=gr.themes.Color(
|
| 562 |
+
c50="#f5f3ff",
|
| 563 |
+
c100="#ede9fe",
|
| 564 |
+
c200="#ddd6fe",
|
| 565 |
+
c300="#c4b5fd",
|
| 566 |
+
c400="#a78bfa",
|
| 567 |
+
c500="#8b5cf6",
|
| 568 |
+
c600="#7c3aed",
|
| 569 |
+
c700="#6d28d9",
|
| 570 |
+
c800="#5b21b6",
|
| 571 |
+
c900="#4c1d95",
|
| 572 |
+
c950="#2e1065",
|
| 573 |
+
),
|
| 574 |
+
neutral_hue=gr.themes.Color(
|
| 575 |
+
c50="#f8fafc",
|
| 576 |
+
c100="#f1f5f9",
|
| 577 |
+
c200="#e2e8f0",
|
| 578 |
+
c300="#cbd5e1",
|
| 579 |
+
c400="#94a3b8",
|
| 580 |
+
c500="#64748b",
|
| 581 |
+
c600="#475569",
|
| 582 |
+
c700="#334155",
|
| 583 |
+
c800="#1e293b",
|
| 584 |
+
c900="#0f172a",
|
| 585 |
+
c950="#020617",
|
| 586 |
+
),
|
| 587 |
+
)
|
| 588 |
+
|
| 589 |
+
|
| 590 |
+
# Measure tab instructions HTML
|
| 591 |
+
MEASURE_INSTRUCTIONS_HTML = """
|
| 592 |
+
### Click points on the image to compute distance.
|
| 593 |
+
> <i class="fas fa-triangle-exclamation fa-color-red" style="margin-right: 5px;"></i> Metric scale estimation is difficult on aerial/drone images.
|
| 594 |
+
"""
|
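For orientation, here is a minimal sketch of how the helpers above would typically be wired into a Gradio UI. The exact wiring lives in the app's UIComponents module (not part of this hunk), so the use of gr.HTML for the header and description blocks is an illustrative assumption, not the repository's actual code:

import gradio as gr

from depth_anything_3.app.css_and_html import (
    GRADIO_CSS,
    get_description_html,
    get_gradio_theme,
    get_header_html,
)

# Theme and CSS are applied at the Blocks level; the HTML helpers are
# rendered as static blocks at the top of the page (assumed wiring).
with gr.Blocks(theme=get_gradio_theme(), css=GRADIO_CSS) as demo:
    gr.HTML(get_header_html())
    gr.HTML(get_description_html())

demo.launch()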
Depth-Anything-3-anysize/src/depth_anything_3/app/gradio_app.py
ADDED
@@ -0,0 +1,747 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Refactored Gradio App for Depth Anything 3.

This is the main application file that orchestrates all components.
The original functionality has been split into modular components for better maintainability.
"""

import argparse
import os
from typing import Any, Dict, List
import gradio as gr

from depth_anything_3.app.css_and_html import GRADIO_CSS, get_gradio_theme
from depth_anything_3.app.modules.event_handlers import EventHandlers
from depth_anything_3.app.modules.ui_components import UIComponents

# Set environment variables
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


class DepthAnything3App:
    """
    Main application class for Depth Anything 3 Gradio app.
    """

    def __init__(self, model_dir: str = None, workspace_dir: str = None, gallery_dir: str = None):
        """
        Initialize the application.

        Args:
            model_dir: Path to the model directory
            workspace_dir: Path to the workspace directory
            gallery_dir: Path to the gallery directory
        """
        self.model_dir = model_dir
        self.workspace_dir = workspace_dir
        self.gallery_dir = gallery_dir

        # Set environment variables for directories
        if self.model_dir:
            os.environ["DA3_MODEL_DIR"] = self.model_dir
        if self.workspace_dir:
            os.environ["DA3_WORKSPACE_DIR"] = self.workspace_dir
        if self.gallery_dir:
            os.environ["DA3_GALLERY_DIR"] = self.gallery_dir

        self.event_handlers = EventHandlers()
        self.ui_components = UIComponents()

    def cache_examples(
        self,
        show_cam: bool = True,
        filter_black_bg: bool = False,
        filter_white_bg: bool = False,
        save_percentage: float = 20.0,
        num_max_points: int = 1000,
        cache_gs_tag: str = "",
        gs_trj_mode: str = "smooth",
        gs_video_quality: str = "low",
    ) -> None:
        """
        Pre-cache all example scenes at startup.

        Args:
            show_cam: Whether to show camera in visualization
            filter_black_bg: Whether to filter black background
            filter_white_bg: Whether to filter white background
            save_percentage: Filter percentage for point cloud
            num_max_points: Maximum number of points
            cache_gs_tag: Tag to match scene names for high-res+3DGS caching (e.g., "dl3dv")
            gs_trj_mode: Trajectory mode for 3DGS
            gs_video_quality: Video quality for 3DGS
        """
        from depth_anything_3.app.modules.utils import get_scene_info

        examples_dir = os.path.join(self.workspace_dir, "examples")
        if not os.path.exists(examples_dir):
            print(f"Examples directory not found: {examples_dir}")
            return

        scenes = get_scene_info(examples_dir)
        if not scenes:
            print("No example scenes found to cache.")
            return

        print(f"\n{'='*60}")
        print(f"Caching {len(scenes)} example scenes...")
        print(f"{'='*60}\n")

        for i, scene in enumerate(scenes, 1):
            scene_name = scene["name"]

            # Check if scene name matches the gs tag for high-res+3DGS caching
            use_high_res_gs = cache_gs_tag and cache_gs_tag.lower() in scene_name.lower()

            if use_high_res_gs:
                print(f"[{i}/{len(scenes)}] Caching scene: {scene_name} (HIGH-RES + 3DGS)")
                print(f"  - Number of images: {scene['num_images']}")
                print(f"  - Matched tag: '{cache_gs_tag}' - using high_res + 3DGS")
            else:
                print(f"[{i}/{len(scenes)}] Caching scene: {scene_name} (LOW-RES)")
                print(f"  - Number of images: {scene['num_images']}")

            try:
                # Load example scene
                _, target_dir, _, _, _, _, _, _, _ = self.event_handlers.load_example_scene(
                    scene_name
                )

                if target_dir and target_dir != "None":
                    # Run reconstruction with appropriate settings
                    print("  - Running reconstruction...")
                    result = self.event_handlers.gradio_demo(
                        target_dir=target_dir,
                        show_cam=show_cam,
                        filter_black_bg=filter_black_bg,
                        filter_white_bg=filter_white_bg,
                        process_res_method="high_res" if use_high_res_gs else "low_res",
                        selected_first_frame="",
                        save_percentage=save_percentage,
                        num_max_points=num_max_points,
                        infer_gs=use_high_res_gs,
                        gs_trj_mode=gs_trj_mode,
                        gs_video_quality=gs_video_quality,
                    )

                    # Check if successful
                    if result[0] is not None:  # reconstruction_output
                        print(f"  ✓ Scene '{scene_name}' cached successfully")
                    else:
                        print(f"  ✗ Scene '{scene_name}' caching failed: {result[1]}")
                else:
                    print(f"  ✗ Scene '{scene_name}' loading failed")

            except Exception as e:
                print(f"  ✗ Error caching scene '{scene_name}': {str(e)}")

            print()

        print("=" * 60)
        print("Example scene caching completed!")
        print("=" * 60 + "\n")

    def create_app(self) -> gr.Blocks:
        """
        Create and configure the Gradio application.

        Returns:
            Configured Gradio Blocks interface
        """

        # Initialize theme
        def get_theme():
            return get_gradio_theme()

        with gr.Blocks(theme=get_theme(), css=GRADIO_CSS) as demo:
            # State variables for the tabbed interface
            is_example = gr.Textbox(label="is_example", visible=False, value="None")
            processed_data_state = gr.State(value=None)
            measure_points_state = gr.State(value=[])
            selected_first_frame_state = gr.State(value="")
            selected_image_index_state = gr.State(value=0)  # Track selected image index
            # current_view_index = gr.State(value=0)  # noqa: F841 Track current view index

            # Header and description
            self.ui_components.create_header_section()
            self.ui_components.create_description_section()

            target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")

            # Main content area
            with gr.Row():
                with gr.Column(scale=2):
                    # Upload section
                    (
                        input_video,
                        s_time_interval,
                        input_images,
                        image_gallery,
                        select_first_frame_btn,
                    ) = self.ui_components.create_upload_section()

                with gr.Column(scale=4):
                    with gr.Column():
                        # gr.Markdown("**Metric 3D Reconstruction (Point Cloud and Camera Poses)**")
                        # Reconstruction control section (buttons) - moved below tabs

                        log_output = gr.Markdown(
                            "Please upload a video or images, then click Reconstruct.",
                            elem_classes=["custom-log"],
                        )

                        # Tabbed interface
                        with gr.Tabs():
                            with gr.Tab("Point Cloud & Cameras"):
                                reconstruction_output = (
                                    self.ui_components.create_3d_viewer_section()
                                )

                            with gr.Tab("Metric Depth"):
                                (
                                    prev_measure_btn,
                                    measure_view_selector,
                                    next_measure_btn,
                                    measure_image,
                                    measure_depth_image,
                                    measure_text,
                                ) = self.ui_components.create_measure_section()

                            with gr.Tab("3DGS Rendered Novel Views"):
                                gs_video, gs_info = self.ui_components.create_nvs_video()

            # Inference control section (before inference)
            (process_res_method_dropdown, infer_gs) = (
                self.ui_components.create_inference_control_section()
            )

            # Display control section - includes 3DGS options, buttons, and Visualization Options  # noqa: E501
            (
                show_cam,
                filter_black_bg,
                filter_white_bg,
                save_percentage,
                num_max_points,
                gs_trj_mode,
                gs_video_quality,
                submit_btn,
                clear_btn,
            ) = self.ui_components.create_display_control_section()

            # bind visibility of gs_trj_mode to infer_gs
            infer_gs.change(
                fn=lambda checked: (
                    gr.update(visible=checked),
                    gr.update(visible=checked),
                    gr.update(visible=checked),
                    gr.update(visible=(not checked)),
                ),
                inputs=infer_gs,
                outputs=[gs_trj_mode, gs_video_quality, gs_video, gs_info],
            )

            # Example scenes section
            gr.Markdown("## Example Scenes")

            scenes = self.ui_components.create_example_scenes_section()
            scene_components = self.ui_components.create_example_scene_grid(scenes)

            # Set up event handlers
            self._setup_event_handlers(
                demo,
                is_example,
                processed_data_state,
                measure_points_state,
                target_dir_output,
                input_video,
                input_images,
                s_time_interval,
                image_gallery,
                reconstruction_output,
                log_output,
                show_cam,
                filter_black_bg,
                filter_white_bg,
                process_res_method_dropdown,
                save_percentage,
                submit_btn,
                clear_btn,
                num_max_points,
                infer_gs,
                select_first_frame_btn,
                selected_first_frame_state,
                selected_image_index_state,
                measure_view_selector,
                measure_image,
                measure_depth_image,
                measure_text,
                prev_measure_btn,
                next_measure_btn,
                scenes,
                scene_components,
                gs_video,
                gs_info,
                gs_trj_mode,
                gs_video_quality,
            )

            # Acknowledgements
            self.ui_components.create_acknowledgements_section()

        return demo

    def _setup_event_handlers(
        self,
        demo: gr.Blocks,
        is_example: gr.Textbox,
        processed_data_state: gr.State,
        measure_points_state: gr.State,
        target_dir_output: gr.Textbox,
        input_video: gr.Video,
        input_images: gr.File,
        s_time_interval: gr.Slider,
        image_gallery: gr.Gallery,
        reconstruction_output: gr.Model3D,
        log_output: gr.Markdown,
        show_cam: gr.Checkbox,
        filter_black_bg: gr.Checkbox,
        filter_white_bg: gr.Checkbox,
        process_res_method_dropdown: gr.Dropdown,
        save_percentage: gr.Slider,
        submit_btn: gr.Button,
        clear_btn: gr.ClearButton,
        num_max_points: gr.Slider,
        infer_gs: gr.Checkbox,
        select_first_frame_btn: gr.Button,
        selected_first_frame_state: gr.State,
        selected_image_index_state: gr.State,
        measure_view_selector: gr.Dropdown,
        measure_image: gr.Image,
        measure_depth_image: gr.Image,
        measure_text: gr.Markdown,
        prev_measure_btn: gr.Button,
        next_measure_btn: gr.Button,
        scenes: List[Dict[str, Any]],
        scene_components: List[gr.Image],
        gs_video: gr.Video,
        gs_info: gr.Markdown,
        gs_trj_mode: gr.Dropdown,
        gs_video_quality: gr.Dropdown,
    ) -> None:
        """
        Set up all event handlers for the application.

        Args:
            demo: Gradio Blocks interface
            All other arguments: Gradio components to connect
        """
        # Configure clear button
        clear_btn.add(
            [
                input_video,
                input_images,
                reconstruction_output,
                log_output,
                target_dir_output,
                image_gallery,
                gs_video,
            ]
        )

        # Main reconstruction button
        submit_btn.click(
            fn=self.event_handlers.clear_fields, inputs=[], outputs=[reconstruction_output]
        ).then(fn=self.event_handlers.update_log, inputs=[], outputs=[log_output]).then(
            fn=self.event_handlers.gradio_demo,
            inputs=[
                target_dir_output,
                show_cam,
                filter_black_bg,
                filter_white_bg,
                process_res_method_dropdown,
                selected_first_frame_state,
                save_percentage,
                # pass num_max_points
                num_max_points,
                infer_gs,
                gs_trj_mode,
                gs_video_quality,
            ],
            outputs=[
                reconstruction_output,
                log_output,
                processed_data_state,
                measure_image,
                measure_depth_image,
                measure_text,
                measure_view_selector,
                gs_video,
                gs_video,  # gs_video visibility
                gs_info,  # gs_info visibility
            ],
        ).then(
            fn=lambda: "False",
            inputs=[],
            outputs=[is_example],  # set is_example to "False"
        )

        # Real-time visualization updates
        self._setup_visualization_handlers(
            show_cam,
            filter_black_bg,
            filter_white_bg,
            process_res_method_dropdown,
            target_dir_output,
            is_example,
            reconstruction_output,
            log_output,
        )

        # File upload handlers
        input_video.change(
            fn=self.event_handlers.handle_uploads,
            inputs=[input_video, input_images, s_time_interval],
            outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
        )
        input_images.change(
            fn=self.event_handlers.handle_uploads,
            inputs=[input_video, input_images, s_time_interval],
            outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
        )

        # Image gallery click handler (for selecting first frame)
        def handle_image_selection(evt: gr.SelectData):
            if evt is None or evt.index is None:
                return "No image selected", 0
            selected_index = evt.index
            return f"Selected image {selected_index} as potential first frame", selected_index

        image_gallery.select(
            fn=handle_image_selection,
            outputs=[log_output, selected_image_index_state],
        )

        # Select first frame handler
        select_first_frame_btn.click(
            fn=self.event_handlers.select_first_frame,
            inputs=[image_gallery, selected_image_index_state],
            outputs=[image_gallery, log_output, selected_first_frame_state],
        )

        # Navigation handlers
        self._setup_navigation_handlers(
            prev_measure_btn,
            next_measure_btn,
            measure_view_selector,
            measure_image,
            measure_depth_image,
            measure_points_state,
            processed_data_state,
        )

        # Measurement handler
        measure_image.select(
            fn=self.event_handlers.measure,
            inputs=[processed_data_state, measure_points_state, measure_view_selector],
            outputs=[measure_image, measure_depth_image, measure_points_state, measure_text],
        )

        # Example scene handlers
        self._setup_example_scene_handlers(
            scenes,
            scene_components,
            reconstruction_output,
            target_dir_output,
            image_gallery,
            log_output,
            is_example,
            processed_data_state,
            measure_view_selector,
            measure_image,
            measure_depth_image,
            gs_video,
            gs_info,
        )

    def _setup_visualization_handlers(
        self,
        show_cam: gr.Checkbox,
        filter_black_bg: gr.Checkbox,
        filter_white_bg: gr.Checkbox,
        process_res_method_dropdown: gr.Dropdown,
        target_dir_output: gr.Textbox,
        is_example: gr.Textbox,
        reconstruction_output: gr.Model3D,
        log_output: gr.Markdown,
    ) -> None:
        """Set up visualization update handlers."""
        # Common inputs for visualization updates
        viz_inputs = [
            target_dir_output,
            show_cam,
            is_example,
            filter_black_bg,
            filter_white_bg,
            process_res_method_dropdown,
        ]

        # Set up change handlers for all visualization controls
        for component in [show_cam, filter_black_bg, filter_white_bg]:
            component.change(
                fn=self.event_handlers.update_visualization,
                inputs=viz_inputs,
                outputs=[reconstruction_output, log_output],
            )

    def _setup_navigation_handlers(
        self,
        prev_measure_btn: gr.Button,
        next_measure_btn: gr.Button,
        measure_view_selector: gr.Dropdown,
        measure_image: gr.Image,
        measure_depth_image: gr.Image,
        measure_points_state: gr.State,
        processed_data_state: gr.State,
    ) -> None:
        """Set up navigation handlers for measure tab."""
        # Measure tab navigation
        prev_measure_btn.click(
            fn=lambda processed_data, current_selector: self.event_handlers.navigate_measure_view(
                processed_data, current_selector, -1
            ),
            inputs=[processed_data_state, measure_view_selector],
            outputs=[
                measure_view_selector,
                measure_image,
                measure_depth_image,
                measure_points_state,
            ],
        )

        next_measure_btn.click(
            fn=lambda processed_data, current_selector: self.event_handlers.navigate_measure_view(
                processed_data, current_selector, 1
            ),
            inputs=[processed_data_state, measure_view_selector],
            outputs=[
                measure_view_selector,
                measure_image,
                measure_depth_image,
                measure_points_state,
            ],
        )

        measure_view_selector.change(
            fn=lambda processed_data, selector_value: (
                self.event_handlers.update_measure_view(
                    processed_data, int(selector_value.split()[1]) - 1
                )
                if selector_value
                else (None, None, [])
            ),
            inputs=[processed_data_state, measure_view_selector],
            outputs=[measure_image, measure_depth_image, measure_points_state],
        )

    def _setup_example_scene_handlers(
        self,
        scenes: List[Dict[str, Any]],
        scene_components: List[gr.Image],
        reconstruction_output: gr.Model3D,
        target_dir_output: gr.Textbox,
        image_gallery: gr.Gallery,
        log_output: gr.Markdown,
        is_example: gr.Textbox,
        processed_data_state: gr.State,
        measure_view_selector: gr.Dropdown,
        measure_image: gr.Image,
        measure_depth_image: gr.Image,
        gs_video: gr.Video,
        gs_info: gr.Markdown,
    ) -> None:
        """Set up example scene handlers."""

        def load_and_update_measure(name):
            result = self.event_handlers.load_example_scene(name)
            # result = (reconstruction_output, target_dir, image_paths, log_message, processed_data, measure_view_selector, gs_video, gs_video_vis, gs_info_vis)  # noqa: E501

            # Update measure view if processed_data is available
            measure_img = None
            measure_depth = None
            if result[4] is not None:  # processed_data exists
                measure_img, measure_depth, _ = (
                    self.event_handlers.visualization_handler.update_measure_view(result[4], 0)
                )

            return result + ("True", measure_img, measure_depth)

        for i, scene in enumerate(scenes):
            if i < len(scene_components):
                scene_components[i].select(
                    fn=lambda name=scene["name"]: load_and_update_measure(name),
                    outputs=[
                        reconstruction_output,
                        target_dir_output,
                        image_gallery,
                        log_output,
                        processed_data_state,
                        measure_view_selector,
                        gs_video,
                        gs_video,  # gs_video_visibility
                        gs_info,  # gs_info_visibility
                        is_example,
                        measure_image,
                        measure_depth_image,
                    ],
                )

    def launch(self, host: str = "127.0.0.1", port: int = 7860, **kwargs) -> None:
        """
        Launch the application.

        Args:
            host: Host address to bind to
            port: Port number to bind to
            **kwargs: Additional arguments for demo.launch()
        """
        demo = self.create_app()
        demo.queue(max_size=20).launch(
            show_error=True, ssr_mode=False, server_name=host, server_port=port, **kwargs
        )


def main():
    """Main function to run the application."""
    parser = argparse.ArgumentParser(
        description="Depth Anything 3 Gradio Application",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage
  python gradio_app.py --help
  python gradio_app.py --host 0.0.0.0 --port 8080
  python gradio_app.py --model-dir /path/to/model --workspace-dir /path/to/workspace

  # Cache examples at startup (all low-res)
  python gradio_app.py --cache-examples

  # Cache with selective high-res+3DGS for scenes matching tag
  python gradio_app.py --cache-examples --cache-gs-tag dl3dv
  # This will use high-res + 3DGS for scenes containing "dl3dv" in their name,
  # and low-res only for other scenes
        """,
    )

    # Server configuration
    parser.add_argument(
        "--host", default="127.0.0.1", help="Host address to bind to (default: 127.0.0.1)"
    )
    parser.add_argument(
        "--port", type=int, default=7860, help="Port number to bind to (default: 7860)"
    )

    # Directory configuration
    parser.add_argument(
        "--model-dir",
        default="depth-anything/DA3NESTED-GIANT-LARGE",
        help="Path to the model directory (default: depth-anything/DA3NESTED-GIANT-LARGE)",
    )
    parser.add_argument(
        "--workspace-dir",
        default="workspace/gradio",  # noqa: E501
        help="Path to the workspace directory (default: workspace/gradio)",  # noqa: E501
    )
    parser.add_argument(
        "--gallery-dir",
        default="workspace/gallery",
        help="Path to the gallery directory (default: workspace/gallery)",  # noqa: E501
    )

    # Additional Gradio options
    parser.add_argument("--share", action="store_true", help="Create a public link for the app")
    parser.add_argument("--debug", action="store_true", help="Enable debug mode")

    # Example caching options
    parser.add_argument(
        "--cache-examples",
        action="store_true",
        help="Pre-cache all example scenes at startup for faster loading",
    )
    parser.add_argument(
        "--cache-gs-tag",
        type=str,
        default="",
        help="Tag to match scene names for high-res+3DGS caching (e.g., 'dl3dv'). Scenes containing this tag will use high_res and infer_gs=True; others will use low_res only.",  # noqa: E501
    )

    args = parser.parse_args()

    # Create directories if they don't exist
    os.makedirs(args.workspace_dir, exist_ok=True)
    os.makedirs(args.gallery_dir, exist_ok=True)

    # Initialize and launch the application
    app = DepthAnything3App(
        model_dir=args.model_dir, workspace_dir=args.workspace_dir, gallery_dir=args.gallery_dir
    )

    # Prepare launch arguments
    launch_kwargs = {"share": args.share, "debug": args.debug}

    print("Starting Depth Anything 3 Gradio App...")
    print(f"Host: {args.host}")
    print(f"Port: {args.port}")
    print(f"Model Directory: {args.model_dir}")
    print(f"Workspace Directory: {args.workspace_dir}")
    print(f"Gallery Directory: {args.gallery_dir}")
    print(f"Share: {args.share}")
    print(f"Debug: {args.debug}")
    print(f"Cache Examples: {args.cache_examples}")
    if args.cache_examples:
        if args.cache_gs_tag:
            print(
                f"Cache GS Tag: '{args.cache_gs_tag}' (scenes matching this tag will use high-res + 3DGS)"  # noqa: E501
            )  # noqa: E501
        else:
            print("Cache GS Tag: None (all scenes will use low-res only)")

    # Pre-cache examples if requested
    if args.cache_examples:
        print("\n" + "=" * 60)
        print("Pre-caching mode enabled")
        if args.cache_gs_tag:
            print(f"Scenes containing '{args.cache_gs_tag}' will use HIGH-RES + 3DGS")
            print("Other scenes will use LOW-RES only")
        else:
            print("All scenes will use LOW-RES only")
        print("=" * 60)
        app.cache_examples(
            show_cam=True,
            filter_black_bg=False,
            filter_white_bg=False,
            save_percentage=5.0,
            num_max_points=1000,
            cache_gs_tag=args.cache_gs_tag,
            gs_trj_mode="smooth",
            gs_video_quality="low",
        )

    app.launch(host=args.host, port=args.port, **launch_kwargs)


if __name__ == "__main__":
    main()
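As a usage note, the app above can also be started without the CLI wrapper by instantiating DepthAnything3App directly; the sketch below mirrors what main() does, with the directory values taken from the parser defaults (adjust them for your setup):

from depth_anything_3.app.gradio_app import DepthAnything3App

# Values mirror the argparse defaults shown above; they are not the only valid choices.
app = DepthAnything3App(
    model_dir="depth-anything/DA3NESTED-GIANT-LARGE",
    workspace_dir="workspace/gradio",
    gallery_dir="workspace/gallery",
)
# launch() builds the Blocks UI via create_app() and serves it with a request queue.
app.launch(host="0.0.0.0", port=7860, share=False)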
Depth-Anything-3-anysize/src/depth_anything_3/app/modules/__init__.py
ADDED
@@ -0,0 +1,43 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Modules package for Depth Anything 3 Gradio app.

This package contains all the modular components for the Gradio application.
"""

from depth_anything_3.app.modules.event_handlers import EventHandlers
from depth_anything_3.app.modules.file_handlers import FileHandler
from depth_anything_3.app.modules.model_inference import ModelInference
from depth_anything_3.app.modules.ui_components import UIComponents
from depth_anything_3.app.modules.utils import (
    create_depth_visualization,
    get_logo_base64,
    get_scene_info,
    save_to_gallery_func,
)
from depth_anything_3.app.modules.visualization import VisualizationHandler

__all__ = [
    "ModelInference",
    "FileHandler",
    "VisualizationHandler",
    "EventHandlers",
    "UIComponents",
    "create_depth_visualization",
    "save_to_gallery_func",
    "get_scene_info",
    "get_logo_base64",
]
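Because this package __init__ re-exports the handler classes and utilities, downstream code can import them from the package root instead of the individual submodules; a small hedged example (the example path is illustrative, not mandated by the package):

from depth_anything_3.app.modules import EventHandlers, UIComponents, get_scene_info

handlers = EventHandlers()  # wires ModelInference, FileHandler and VisualizationHandler together
ui = UIComponents()  # builds the Gradio layout sections
scenes = get_scene_info("workspace/gradio/examples")  # example workspace path, adjust as needed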
Depth-Anything-3-anysize/src/depth_anything_3/app/modules/event_handlers.py
ADDED
@@ -0,0 +1,629 @@
| 1 |
+
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
Event handling module for Depth Anything 3 Gradio app.
|
| 17 |
+
|
| 18 |
+
This module handles all event callbacks and user interactions.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
import time
|
| 23 |
+
from glob import glob
|
| 24 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 25 |
+
import gradio as gr
|
| 26 |
+
import numpy as np
|
| 27 |
+
import torch
|
| 28 |
+
|
| 29 |
+
from depth_anything_3.app.modules.file_handlers import FileHandler
|
| 30 |
+
from depth_anything_3.app.modules.model_inference import ModelInference
|
| 31 |
+
from depth_anything_3.utils.memory import cleanup_cuda_memory
|
| 32 |
+
from depth_anything_3.app.modules.visualization import VisualizationHandler
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class EventHandlers:
|
| 36 |
+
"""
|
| 37 |
+
Handles all event callbacks and user interactions for the Gradio app.
|
| 38 |
+
"""
|
| 39 |
+
|
| 40 |
+
def __init__(self):
|
| 41 |
+
"""Initialize the event handlers."""
|
| 42 |
+
self.model_inference = ModelInference()
|
| 43 |
+
self.file_handler = FileHandler()
|
| 44 |
+
self.visualization_handler = VisualizationHandler()
|
| 45 |
+
|
| 46 |
+
def clear_fields(self) -> None:
|
| 47 |
+
"""
|
| 48 |
+
Clears the 3D viewer, the stored target_dir, and empties the gallery.
|
| 49 |
+
"""
|
| 50 |
+
return None
|
| 51 |
+
|
| 52 |
+
def update_log(self) -> str:
|
| 53 |
+
"""
|
| 54 |
+
Display a quick log message while waiting.
|
| 55 |
+
"""
|
| 56 |
+
return "Loading and Reconstructing..."
|
| 57 |
+
|
| 58 |
+
def save_current_visualization(
|
| 59 |
+
self,
|
| 60 |
+
target_dir: str,
|
| 61 |
+
save_percentage: float,
|
| 62 |
+
show_cam: bool,
|
| 63 |
+
filter_black_bg: bool,
|
| 64 |
+
filter_white_bg: bool,
|
| 65 |
+
processed_data: Optional[Dict],
|
| 66 |
+
scene_name: str = "",
|
| 67 |
+
) -> str:
|
| 68 |
+
"""
|
| 69 |
+
Save current visualization results to gallery with specified save percentage.
|
| 70 |
+
|
| 71 |
+
Args:
|
| 72 |
+
target_dir: Directory containing results
|
| 73 |
+
save_percentage: Percentage of points to save (0-100)
|
| 74 |
+
show_cam: Whether to show cameras
|
| 75 |
+
filter_black_bg: Whether to filter black background
|
| 76 |
+
filter_white_bg: Whether to filter white background
|
| 77 |
+
processed_data: Processed data from reconstruction
|
| 78 |
+
|
| 79 |
+
Returns:
|
| 80 |
+
Status message
|
| 81 |
+
"""
|
| 82 |
+
if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
|
| 83 |
+
return "No reconstruction available. Please run 'Reconstruct' first."
|
| 84 |
+
|
| 85 |
+
if processed_data is None:
|
| 86 |
+
return "No processed data available. Please run 'Reconstruct' first."
|
| 87 |
+
|
| 88 |
+
try:
|
| 89 |
+
# Add debug information
|
| 90 |
+
print("[DEBUG] save_current_visualization called with:")
|
| 91 |
+
print(f" target_dir: {target_dir}")
|
| 92 |
+
print(f" save_percentage: {save_percentage}")
|
| 93 |
+
print(f" show_cam: {show_cam}")
|
| 94 |
+
print(f" filter_black_bg: {filter_black_bg}")
|
| 95 |
+
print(f" filter_white_bg: {filter_white_bg}")
|
| 96 |
+
print(f" processed_data: {processed_data is not None}")
|
| 97 |
+
|
| 98 |
+
# Import the gallery save function
|
| 99 |
+
# Create gallery name with user input or auto-generated
|
| 100 |
+
import datetime
|
| 101 |
+
|
| 102 |
+
from .utils import save_to_gallery_func
|
| 103 |
+
|
| 104 |
+
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 105 |
+
if scene_name and scene_name.strip():
|
| 106 |
+
gallery_name = f"{scene_name.strip()}_{timestamp}_pct{save_percentage:.0f}"
|
| 107 |
+
else:
|
| 108 |
+
gallery_name = f"save_{timestamp}_pct{save_percentage:.0f}"
|
| 109 |
+
|
| 110 |
+
print(f"[DEBUG] Saving to gallery with name: {gallery_name}")
|
| 111 |
+
|
| 112 |
+
# Save entire process folder to gallery
|
| 113 |
+
success, message = save_to_gallery_func(
|
| 114 |
+
target_dir=target_dir, processed_data=processed_data, gallery_name=gallery_name
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
if success:
|
| 118 |
+
print(f"[DEBUG] Gallery save completed successfully: {message}")
|
| 119 |
+
return (
|
| 120 |
+
"Successfully saved to gallery!\n"
|
| 121 |
+
f"Gallery name: {gallery_name}\n"
|
| 122 |
+
f"Save percentage: {save_percentage}%\n"
|
| 123 |
+
f"Show cameras: {show_cam}\n"
|
| 124 |
+
f"Filter black bg: {filter_black_bg}\n"
|
| 125 |
+
f"Filter white bg: {filter_white_bg}\n\n"
|
| 126 |
+
f"{message}"
|
| 127 |
+
)
|
| 128 |
+
else:
|
| 129 |
+
print(f"[DEBUG] Gallery save failed: {message}")
|
| 130 |
+
return f"Failed to save to gallery: {message}"
|
| 131 |
+
|
| 132 |
+
except Exception as e:
|
| 133 |
+
return f"Error saving visualization: {str(e)}"
|
| 134 |
+
|
| 135 |
+
def gradio_demo(
|
| 136 |
+
self,
|
| 137 |
+
target_dir: str,
|
| 138 |
+
show_cam: bool = True,
|
| 139 |
+
filter_black_bg: bool = False,
|
| 140 |
+
filter_white_bg: bool = False,
|
| 141 |
+
process_res_method: str = "keep",
|
| 142 |
+
selected_first_frame: str = "",
|
| 143 |
+
save_percentage: float = 30.0,
|
| 144 |
+
num_max_points: int = 1_000_000,
|
| 145 |
+
infer_gs: bool = False,
|
| 146 |
+
gs_trj_mode: str = "extend",
|
| 147 |
+
gs_video_quality: str = "high",
|
| 148 |
+
) -> Tuple[
|
| 149 |
+
Optional[str],
|
| 150 |
+
str,
|
| 151 |
+
Optional[Dict],
|
| 152 |
+
Optional[np.ndarray],
|
| 153 |
+
Optional[np.ndarray],
|
| 154 |
+
str,
|
| 155 |
+
gr.Dropdown,
|
| 156 |
+
Optional[str], # gs video path
|
| 157 |
+
gr.update, # gs video visibility update
|
| 158 |
+
gr.update, # gs info visibility update
|
| 159 |
+
]:
|
| 160 |
+
"""
|
| 161 |
+
Perform reconstruction using the already-created target_dir/images.
|
| 162 |
+
|
| 163 |
+
Args:
|
| 164 |
+
target_dir: Directory containing images
|
| 165 |
+
show_cam: Whether to show camera
|
| 166 |
+
filter_black_bg: Whether to filter black background
|
| 167 |
+
filter_white_bg: Whether to filter white background
|
| 168 |
+
process_res_method: Method for resizing input images
|
| 169 |
+
selected_first_frame: Selected first frame filename
|
| 170 |
+
            infer_gs: Whether to infer 3D Gaussian Splatting
            save_percentage: Percentage of points to keep when exporting (0-100)
            num_max_points: Maximum number of exported points, given in thousands (K)
            gs_trj_mode: Trajectory mode for the 3DGS video ("extend" or "smooth")
            gs_video_quality: Quality preset for the rendered 3DGS video
|
| 171 |
+
|
| 172 |
+
Returns:
|
| 173 |
+
Tuple of reconstruction results
|
| 174 |
+
"""
|
| 175 |
+
if not os.path.isdir(target_dir) or target_dir == "None":
|
| 176 |
+
return (
|
| 177 |
+
None,
|
| 178 |
+
"No valid target directory found. Please upload first.",
|
| 179 |
+
None,
|
| 180 |
+
None,
|
| 181 |
+
None,
|
| 182 |
+
"",
|
| 183 |
+
None,
|
| 184 |
+
None,
|
| 185 |
+
gr.update(visible=False), # gs_video
|
| 186 |
+
gr.update(visible=True), # gs_info
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
start_time = time.time()
|
| 190 |
+
cleanup_cuda_memory()
|
| 191 |
+
|
| 192 |
+
# Get image files for logging
|
| 193 |
+
target_dir_images = os.path.join(target_dir, "images")
|
| 194 |
+
all_files = (
|
| 195 |
+
sorted(os.listdir(target_dir_images)) if os.path.isdir(target_dir_images) else []
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
print("Running DepthAnything3 model...")
|
| 199 |
+
print(f"Selected first frame: {selected_first_frame}")
|
| 200 |
+
|
| 201 |
+
# Validate selected_first_frame against current image list
|
| 202 |
+
if selected_first_frame and target_dir_images:
|
| 203 |
+
current_files = (
|
| 204 |
+
sorted(os.listdir(target_dir_images)) if os.path.isdir(target_dir_images) else []
|
| 205 |
+
)
|
| 206 |
+
if selected_first_frame not in current_files:
|
| 207 |
+
print(
|
| 208 |
+
f"Selected first frame '{selected_first_frame}' not found in "
|
| 209 |
+
"current images. Using default order."
|
| 210 |
+
)
|
| 211 |
+
selected_first_frame = "" # Reset to use default order
|
| 212 |
+
|
| 213 |
+
with torch.no_grad():
|
| 214 |
+
            prediction, processed_data = self.model_inference.run_inference(
                target_dir,
                filter_black_bg=filter_black_bg,
                filter_white_bg=filter_white_bg,
                process_res_method=process_res_method,
                show_camera=show_cam,
                selected_first_frame=selected_first_frame,
                save_percentage=save_percentage,
                num_max_points=int(num_max_points * 1000),  # Convert K (UI value) to an absolute point count
                infer_gs=infer_gs,
                gs_trj_mode=gs_trj_mode,
                gs_video_quality=gs_video_quality,
            )
|
| 225 |
+
|
| 226 |
+
# The GLB file is already generated by the API
|
| 227 |
+
glbfile = os.path.join(target_dir, "scene.glb")
|
| 228 |
+
|
| 229 |
+
# Handle 3DGS video based on infer_gs flag
|
| 230 |
+
gsvideo_path = None
|
| 231 |
+
gs_video_visible = False
|
| 232 |
+
gs_info_visible = True
|
| 233 |
+
|
| 234 |
+
if infer_gs:
|
| 235 |
+
try:
|
| 236 |
+
gsvideo_path = sorted(glob(os.path.join(target_dir, "gs_video", "*.mp4")))[-1]
|
| 237 |
+
gs_video_visible = True
|
| 238 |
+
gs_info_visible = False
|
| 239 |
+
except IndexError:
|
| 240 |
+
gsvideo_path = None
|
| 241 |
+
print("3DGS video not found, but infer_gs was enabled")
|
| 242 |
+
|
| 243 |
+
# Cleanup
|
| 244 |
+
cleanup_cuda_memory()
|
| 245 |
+
|
| 246 |
+
end_time = time.time()
|
| 247 |
+
print(f"Total time: {end_time - start_time:.2f} seconds")
|
| 248 |
+
log_msg = f"Reconstruction Success ({len(all_files)} frames). Waiting for visualization."
|
| 249 |
+
|
| 250 |
+
# Populate visualization tabs with processed data
|
| 251 |
+
depth_vis, measure_img, measure_depth_vis, measure_pts = (
|
| 252 |
+
self.visualization_handler.populate_visualization_tabs(processed_data)
|
| 253 |
+
)
|
| 254 |
+
|
| 255 |
+
# Update view selectors based on available views
|
| 256 |
+
depth_selector, measure_selector = self.visualization_handler.update_view_selectors(
|
| 257 |
+
processed_data
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
return (
|
| 261 |
+
glbfile,
|
| 262 |
+
log_msg,
|
| 263 |
+
processed_data,
|
| 264 |
+
measure_img, # measure_image
|
| 265 |
+
measure_depth_vis, # measure_depth_image
|
| 266 |
+
"", # measure_text (empty initially)
|
| 267 |
+
measure_selector, # measure_view_selector
|
| 268 |
+
gsvideo_path,
|
| 269 |
+
gr.update(visible=gs_video_visible), # gs_video visibility
|
| 270 |
+
gr.update(visible=gs_info_visible), # gs_info visibility
|
| 271 |
+
)
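A rough sketch of the session-directory layout this method reads and writes, inferred from the code above rather than from a formal spec; the helper below is an illustrative convenience, not part of the module.

# target_dir/
#   images/             input frames consumed by run_inference
#   scene.glb           GLB point cloud (plus cameras) written by the export step
#   predictions.npz     cached depths, confidences and camera parameters
#   depth_vis/*.jpg     per-view colorized depth maps
#   gs_video/*.mp4      3DGS novel-view renders (only when infer_gs=True)
import os

def has_reconstruction(target_dir: str) -> bool:
    """Illustrative helper: True once gradio_demo has produced a viewable scene."""
    return os.path.isfile(os.path.join(target_dir, "scene.glb"))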
|
| 272 |
+
|
| 273 |
+
def update_visualization(
|
| 274 |
+
self,
|
| 275 |
+
target_dir: str,
|
| 276 |
+
show_cam: bool,
|
| 277 |
+
is_example: str,
|
| 278 |
+
filter_black_bg: bool = False,
|
| 279 |
+
filter_white_bg: bool = False,
|
| 280 |
+
process_res_method: str = "keep",
|
| 281 |
+
) -> Tuple[gr.update, str]:
|
| 282 |
+
"""
|
| 283 |
+
Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
|
| 284 |
+
and return it for the 3D viewer.
|
| 285 |
+
|
| 286 |
+
Args:
|
| 287 |
+
target_dir: Directory containing results
|
| 288 |
+
show_cam: Whether to show camera
|
| 289 |
+
is_example: Whether this is an example scene
|
| 290 |
+
filter_black_bg: Whether to filter black background
|
| 291 |
+
filter_white_bg: Whether to filter white background
|
| 292 |
+
process_res_method: Method for resizing input images
|
| 293 |
+
|
| 294 |
+
Returns:
|
| 295 |
+
Tuple of (glb_file, log_message)
|
| 296 |
+
"""
|
| 297 |
+
if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
|
| 298 |
+
return (
|
| 299 |
+
gr.update(),
|
| 300 |
+
"No reconstruction available. Please click the Reconstruct button first.",
|
| 301 |
+
)
|
| 302 |
+
|
| 303 |
+
# Check if GLB exists (could be cached example or reconstructed scene)
|
| 304 |
+
glbfile = os.path.join(target_dir, "scene.glb")
|
| 305 |
+
if os.path.exists(glbfile):
|
| 306 |
+
return (
|
| 307 |
+
glbfile,
|
| 308 |
+
(
|
| 309 |
+
"Visualization loaded from cache."
|
| 310 |
+
if is_example == "True"
|
| 311 |
+
else "Visualization updated."
|
| 312 |
+
),
|
| 313 |
+
)
|
| 314 |
+
|
| 315 |
+
# If no GLB but it's an example that hasn't been reconstructed yet
|
| 316 |
+
if is_example == "True":
|
| 317 |
+
return (
|
| 318 |
+
gr.update(),
|
| 319 |
+
"No reconstruction available. Please click the Reconstruct button first.",
|
| 320 |
+
)
|
| 321 |
+
|
| 322 |
+
# For non-examples, check predictions.npz
|
| 323 |
+
predictions_path = os.path.join(target_dir, "predictions.npz")
|
| 324 |
+
if not os.path.exists(predictions_path):
|
| 325 |
+
error_message = (
|
| 326 |
+
f"No reconstruction available at {predictions_path}. "
|
| 327 |
+
"Please run 'Reconstruct' first."
|
| 328 |
+
)
|
| 329 |
+
return gr.update(), error_message
|
| 330 |
+
|
| 331 |
+
loaded = np.load(predictions_path, allow_pickle=True)
|
| 332 |
+
predictions = {key: loaded[key] for key in loaded.keys()} # noqa: F841
|
| 333 |
+
|
| 334 |
+
return (
|
| 335 |
+
glbfile,
|
| 336 |
+
"Visualization updated.",
|
| 337 |
+
)
|
| 338 |
+
|
| 339 |
+
def handle_uploads(
|
| 340 |
+
self,
|
| 341 |
+
input_video: Optional[str],
|
| 342 |
+
input_images: Optional[List],
|
| 343 |
+
s_time_interval: float = 10.0,
|
| 344 |
+
) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[str]]:
|
| 345 |
+
"""
|
| 346 |
+
Handle file uploads and update gallery.
|
| 347 |
+
|
| 348 |
+
Args:
|
| 349 |
+
input_video: Path to input video file
|
| 350 |
+
input_images: List of input image files
|
| 351 |
+
s_time_interval: Sampling FPS (frames per second) for frame extraction
|
| 352 |
+
|
| 353 |
+
Returns:
|
| 354 |
+
Tuple of (reconstruction_output, target_dir, image_paths, log_message)
|
| 355 |
+
"""
|
| 356 |
+
return self.file_handler.update_gallery_on_upload(
|
| 357 |
+
input_video, input_images, s_time_interval
|
| 358 |
+
)
|
| 359 |
+
|
| 360 |
+
def load_example_scene(self, scene_name: str, examples_dir: str = None) -> Tuple[
|
| 361 |
+
Optional[str],
|
| 362 |
+
Optional[str],
|
| 363 |
+
Optional[List],
|
| 364 |
+
str,
|
| 365 |
+
Optional[Dict],
|
| 366 |
+
gr.Dropdown,
|
| 367 |
+
Optional[str],
|
| 368 |
+
gr.update,
|
| 369 |
+
gr.update,
|
| 370 |
+
]:
|
| 371 |
+
"""
|
| 372 |
+
Load a scene from examples directory.
|
| 373 |
+
|
| 374 |
+
Args:
|
| 375 |
+
scene_name: Name of the scene to load
|
| 376 |
+
examples_dir: Path to examples directory (if None, uses workspace_dir/examples)
|
| 377 |
+
|
| 378 |
+
Returns:
|
| 379 |
+
Tuple of (reconstruction_output, target_dir, image_paths, log_message, processed_data, measure_view_selector, gs_video, gs_video_vis, gs_info_vis) # noqa: E501
|
| 380 |
+
"""
|
| 381 |
+
if examples_dir is None:
|
| 382 |
+
# Get workspace directory from environment variable
|
| 383 |
+
workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
|
| 384 |
+
examples_dir = os.path.join(workspace_dir, "examples")
|
| 385 |
+
|
| 386 |
+
reconstruction_output, target_dir, image_paths, log_message = (
|
| 387 |
+
self.file_handler.load_example_scene(scene_name, examples_dir)
|
| 388 |
+
)
|
| 389 |
+
|
| 390 |
+
# Try to load cached processed data if available
|
| 391 |
+
processed_data = None
|
| 392 |
+
measure_view_selector = gr.Dropdown(choices=["View 1"], value="View 1")
|
| 393 |
+
gs_video_path = None
|
| 394 |
+
gs_video_visible = False
|
| 395 |
+
gs_info_visible = True
|
| 396 |
+
|
| 397 |
+
if target_dir and target_dir != "None":
|
| 398 |
+
predictions_path = os.path.join(target_dir, "predictions.npz")
|
| 399 |
+
if os.path.exists(predictions_path):
|
| 400 |
+
try:
|
| 401 |
+
# Load predictions from cache
|
| 402 |
+
loaded = np.load(predictions_path, allow_pickle=True)
|
| 403 |
+
predictions = {key: loaded[key] for key in loaded.keys()}
|
| 404 |
+
|
| 405 |
+
# Reconstruct processed_data structure
|
| 406 |
+
num_images = len(predictions.get("images", []))
|
| 407 |
+
processed_data = {}
|
| 408 |
+
|
| 409 |
+
for i in range(num_images):
|
| 410 |
+
processed_data[i] = {
|
| 411 |
+
"image": predictions["images"][i] if "images" in predictions else None,
|
| 412 |
+
"depth": predictions["depths"][i] if "depths" in predictions else None,
|
| 413 |
+
"depth_image": os.path.join(
|
| 414 |
+
target_dir, "depth_vis", f"{i:04d}.jpg" # Fixed: use .jpg not .png
|
| 415 |
+
),
|
| 416 |
+
"intrinsics": (
|
| 417 |
+
predictions["intrinsics"][i]
|
| 418 |
+
if "intrinsics" in predictions
|
| 419 |
+
and i < len(predictions["intrinsics"])
|
| 420 |
+
else None
|
| 421 |
+
),
|
| 422 |
+
"mask": None,
|
| 423 |
+
}
|
| 424 |
+
|
| 425 |
+
# Update measure view selector
|
| 426 |
+
choices = [f"View {i + 1}" for i in range(num_images)]
|
| 427 |
+
measure_view_selector = gr.Dropdown(choices=choices, value=choices[0])
|
| 428 |
+
|
| 429 |
+
except Exception as e:
|
| 430 |
+
print(f"Error loading cached data: {e}")
|
| 431 |
+
|
| 432 |
+
# Check for cached 3DGS video
|
| 433 |
+
gs_video_dir = os.path.join(target_dir, "gs_video")
|
| 434 |
+
if os.path.exists(gs_video_dir):
|
| 435 |
+
try:
|
| 436 |
+
from glob import glob
|
| 437 |
+
|
| 438 |
+
gs_videos = sorted(glob(os.path.join(gs_video_dir, "*.mp4")))
|
| 439 |
+
if gs_videos:
|
| 440 |
+
gs_video_path = gs_videos[-1]
|
| 441 |
+
gs_video_visible = True
|
| 442 |
+
gs_info_visible = False
|
| 443 |
+
print(f"Loaded cached 3DGS video: {gs_video_path}")
|
| 444 |
+
except Exception as e:
|
| 445 |
+
print(f"Error loading cached 3DGS video: {e}")
|
| 446 |
+
|
| 447 |
+
return (
|
| 448 |
+
reconstruction_output,
|
| 449 |
+
target_dir,
|
| 450 |
+
image_paths,
|
| 451 |
+
log_message,
|
| 452 |
+
processed_data,
|
| 453 |
+
measure_view_selector,
|
| 454 |
+
gs_video_path,
|
| 455 |
+
gr.update(visible=gs_video_visible),
|
| 456 |
+
gr.update(visible=gs_info_visible),
|
| 457 |
+
)
|
| 458 |
+
|
| 459 |
+
def navigate_depth_view(
|
| 460 |
+
self,
|
| 461 |
+
processed_data: Optional[Dict[int, Dict[str, Any]]],
|
| 462 |
+
current_selector: str,
|
| 463 |
+
direction: int,
|
| 464 |
+
) -> Tuple[str, Optional[str]]:
|
| 465 |
+
"""
|
| 466 |
+
Navigate depth view.
|
| 467 |
+
|
| 468 |
+
Args:
|
| 469 |
+
processed_data: Processed data dictionary
|
| 470 |
+
current_selector: Current selector value
|
| 471 |
+
direction: Direction to navigate
|
| 472 |
+
|
| 473 |
+
Returns:
|
| 474 |
+
Tuple of (new_selector_value, depth_vis)
|
| 475 |
+
"""
|
| 476 |
+
return self.visualization_handler.navigate_depth_view(
|
| 477 |
+
processed_data, current_selector, direction
|
| 478 |
+
)
|
| 479 |
+
|
| 480 |
+
def update_depth_view(
|
| 481 |
+
self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
|
| 482 |
+
) -> Optional[str]:
|
| 483 |
+
"""
|
| 484 |
+
Update depth view for a specific view index.
|
| 485 |
+
|
| 486 |
+
Args:
|
| 487 |
+
processed_data: Processed data dictionary
|
| 488 |
+
view_index: Index of the view to update
|
| 489 |
+
|
| 490 |
+
Returns:
|
| 491 |
+
Path to depth visualization image or None
|
| 492 |
+
"""
|
| 493 |
+
return self.visualization_handler.update_depth_view(processed_data, view_index)
|
| 494 |
+
|
| 495 |
+
def navigate_measure_view(
|
| 496 |
+
self,
|
| 497 |
+
processed_data: Optional[Dict[int, Dict[str, Any]]],
|
| 498 |
+
current_selector: str,
|
| 499 |
+
direction: int,
|
| 500 |
+
) -> Tuple[str, Optional[np.ndarray], Optional[np.ndarray], List]:
|
| 501 |
+
"""
|
| 502 |
+
Navigate measure view.
|
| 503 |
+
|
| 504 |
+
Args:
|
| 505 |
+
processed_data: Processed data dictionary
|
| 506 |
+
current_selector: Current selector value
|
| 507 |
+
direction: Direction to navigate
|
| 508 |
+
|
| 509 |
+
Returns:
|
| 510 |
+
Tuple of (new_selector_value, measure_image, depth_right_half, measure_points)
|
| 511 |
+
"""
|
| 512 |
+
return self.visualization_handler.navigate_measure_view(
|
| 513 |
+
processed_data, current_selector, direction
|
| 514 |
+
)
|
| 515 |
+
|
| 516 |
+
def update_measure_view(
|
| 517 |
+
self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
|
| 518 |
+
) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], List]:
|
| 519 |
+
"""
|
| 520 |
+
Update measure view for a specific view index.
|
| 521 |
+
|
| 522 |
+
Args:
|
| 523 |
+
processed_data: Processed data dictionary
|
| 524 |
+
view_index: Index of the view to update
|
| 525 |
+
|
| 526 |
+
Returns:
|
| 527 |
+
Tuple of (measure_image, depth_right_half, measure_points)
|
| 528 |
+
"""
|
| 529 |
+
return self.visualization_handler.update_measure_view(processed_data, view_index)
|
| 530 |
+
|
| 531 |
+
def measure(
|
| 532 |
+
self,
|
| 533 |
+
processed_data: Optional[Dict[int, Dict[str, Any]]],
|
| 534 |
+
measure_points: List,
|
| 535 |
+
current_view_selector: str,
|
| 536 |
+
event: gr.SelectData,
|
| 537 |
+
) -> List:
|
| 538 |
+
"""
|
| 539 |
+
Handle measurement on images.
|
| 540 |
+
|
| 541 |
+
Args:
|
| 542 |
+
processed_data: Processed data dictionary
|
| 543 |
+
measure_points: List of current measure points
|
| 544 |
+
current_view_selector: Current view selector value
|
| 545 |
+
event: Gradio select event
|
| 546 |
+
|
| 547 |
+
Returns:
|
| 548 |
+
List of [image, depth_right_half, measure_points, text]
|
| 549 |
+
"""
|
| 550 |
+
return self.visualization_handler.measure(
|
| 551 |
+
processed_data, measure_points, current_view_selector, event
|
| 552 |
+
)
|
| 553 |
+
|
| 554 |
+
def select_first_frame(
|
| 555 |
+
self, image_gallery: List, selected_index: int = 0
|
| 556 |
+
) -> Tuple[List, str, str]:
|
| 557 |
+
"""
|
| 558 |
+
Select the first frame from the image gallery.
|
| 559 |
+
|
| 560 |
+
Args:
|
| 561 |
+
image_gallery: List of images in the gallery
|
| 562 |
+
selected_index: Index of the selected image (default: 0)
|
| 563 |
+
|
| 564 |
+
Returns:
|
| 565 |
+
Tuple of (updated_image_gallery, log_message, selected_frame_path)
|
| 566 |
+
"""
|
| 567 |
+
try:
|
| 568 |
+
if not image_gallery or len(image_gallery) == 0:
|
| 569 |
+
return image_gallery, "No images available to select as first frame.", ""
|
| 570 |
+
|
| 571 |
+
# Handle None or invalid selected_index
|
| 572 |
+
            if (
                selected_index is None
                or selected_index < 0
                or selected_index >= len(image_gallery)
            ):
                print(f"Invalid selected_index: {selected_index}, using default: 0")
                selected_index = 0
|
| 579 |
+
|
| 580 |
+
# Get the selected image based on index
|
| 581 |
+
selected_image = image_gallery[selected_index]
|
| 582 |
+
print(f"Selected image index: {selected_index}")
|
| 583 |
+
print(f"Total images: {len(image_gallery)}")
|
| 584 |
+
|
| 585 |
+
# Extract the file path from the selected image
|
| 586 |
+
selected_frame_path = ""
|
| 587 |
+
print(f"Selected image type: {type(selected_image)}")
|
| 588 |
+
print(f"Selected image: {selected_image}")
|
| 589 |
+
|
| 590 |
+
if isinstance(selected_image, tuple):
|
| 591 |
+
# Gradio Gallery returns tuple (path, None)
|
| 592 |
+
selected_frame_path = selected_image[0]
|
| 593 |
+
elif isinstance(selected_image, str):
|
| 594 |
+
selected_frame_path = selected_image
|
| 595 |
+
elif hasattr(selected_image, "name"):
|
| 596 |
+
selected_frame_path = selected_image.name
|
| 597 |
+
elif isinstance(selected_image, dict):
|
| 598 |
+
if "name" in selected_image:
|
| 599 |
+
selected_frame_path = selected_image["name"]
|
| 600 |
+
elif "path" in selected_image:
|
| 601 |
+
selected_frame_path = selected_image["path"]
|
| 602 |
+
elif "src" in selected_image:
|
| 603 |
+
selected_frame_path = selected_image["src"]
|
| 604 |
+
else:
|
| 605 |
+
# Try to convert to string
|
| 606 |
+
selected_frame_path = str(selected_image)
|
| 607 |
+
|
| 608 |
+
print(f"Extracted path: {selected_frame_path}")
|
| 609 |
+
|
| 610 |
+
# Extract filename from the path for matching
|
| 611 |
+
import os
|
| 612 |
+
|
| 613 |
+
selected_filename = os.path.basename(selected_frame_path)
|
| 614 |
+
print(f"Selected filename: {selected_filename}")
|
| 615 |
+
|
| 616 |
+
# Move the selected image to the front
|
| 617 |
+
updated_gallery = [selected_image] + [
|
| 618 |
+
img for img in image_gallery if img != selected_image
|
| 619 |
+
]
|
| 620 |
+
|
| 621 |
+
log_message = (
|
| 622 |
+
f"Selected frame: {selected_filename}. "
|
| 623 |
+
f"Moved to first position. Total frames: {len(updated_gallery)}"
|
| 624 |
+
)
|
| 625 |
+
return updated_gallery, log_message, selected_filename
|
| 626 |
+
|
| 627 |
+
except Exception as e:
|
| 628 |
+
print(f"Error selecting first frame: {e}")
|
| 629 |
+
return image_gallery, f"Error selecting first frame: {e}", ""
|
Depth-Anything-3-anysize/src/depth_anything_3/app/modules/file_handlers.py
ADDED
|
@@ -0,0 +1,304 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
File handling module for Depth Anything 3 Gradio app.
|
| 17 |
+
|
| 18 |
+
This module handles file uploads, video processing, and file operations.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
import shutil
|
| 23 |
+
import time
|
| 24 |
+
from datetime import datetime
|
| 25 |
+
from typing import List, Optional, Tuple
|
| 26 |
+
import cv2
|
| 27 |
+
from PIL import Image
|
| 28 |
+
from pillow_heif import register_heif_opener
|
| 29 |
+
|
| 30 |
+
register_heif_opener()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class FileHandler:
|
| 34 |
+
"""
|
| 35 |
+
Handles file uploads and processing for the Gradio app.
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
def __init__(self):
|
| 39 |
+
"""Initialize the file handler."""
|
| 40 |
+
|
| 41 |
+
def handle_uploads(
|
| 42 |
+
self,
|
| 43 |
+
input_video: Optional[str],
|
| 44 |
+
input_images: Optional[List],
|
| 45 |
+
s_time_interval: float = 10.0,
|
| 46 |
+
) -> Tuple[str, List[str]]:
|
| 47 |
+
"""
|
| 48 |
+
Create a new 'target_dir' + 'images' subfolder, and place user-uploaded
|
| 49 |
+
images or extracted frames from video into it.
|
| 50 |
+
|
| 51 |
+
Args:
|
| 52 |
+
input_video: Path to input video file
|
| 53 |
+
input_images: List of input image files
|
| 54 |
+
s_time_interval: Sampling FPS (frames per second) for frame extraction
|
| 55 |
+
|
| 56 |
+
Returns:
|
| 57 |
+
Tuple of (target_dir, image_paths)
|
| 58 |
+
"""
|
| 59 |
+
start_time = time.time()
|
| 60 |
+
|
| 61 |
+
# Get workspace directory from environment variable or use default
|
| 62 |
+
workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
|
| 63 |
+
if not os.path.exists(workspace_dir):
|
| 64 |
+
os.makedirs(workspace_dir)
|
| 65 |
+
|
| 66 |
+
# Create input_images subdirectory
|
| 67 |
+
input_images_dir = os.path.join(workspace_dir, "input_images")
|
| 68 |
+
if not os.path.exists(input_images_dir):
|
| 69 |
+
os.makedirs(input_images_dir)
|
| 70 |
+
|
| 71 |
+
# Create a unique folder name within input_images
|
| 72 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
|
| 73 |
+
target_dir = os.path.join(input_images_dir, f"session_{timestamp}")
|
| 74 |
+
target_dir_images = os.path.join(target_dir, "images")
|
| 75 |
+
|
| 76 |
+
# Clean up if somehow that folder already exists
|
| 77 |
+
if os.path.exists(target_dir):
|
| 78 |
+
shutil.rmtree(target_dir)
|
| 79 |
+
os.makedirs(target_dir)
|
| 80 |
+
os.makedirs(target_dir_images)
|
| 81 |
+
|
| 82 |
+
image_paths = []
|
| 83 |
+
|
| 84 |
+
# Handle images
|
| 85 |
+
if input_images is not None:
|
| 86 |
+
image_paths.extend(self._process_images(input_images, target_dir_images))
|
| 87 |
+
|
| 88 |
+
# Handle video
|
| 89 |
+
if input_video is not None:
|
| 90 |
+
image_paths.extend(
|
| 91 |
+
self._process_video(input_video, target_dir_images, s_time_interval)
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
# Sort final images for gallery
|
| 95 |
+
image_paths = sorted(image_paths)
|
| 96 |
+
|
| 97 |
+
end_time = time.time()
|
| 98 |
+
print(f"Files copied to {target_dir_images}; took {end_time - start_time:.3f} seconds")
|
| 99 |
+
return target_dir, image_paths
|
| 100 |
+
|
| 101 |
+
def _process_images(self, input_images: List, target_dir_images: str) -> List[str]:
|
| 102 |
+
"""
|
| 103 |
+
Process uploaded images.
|
| 104 |
+
|
| 105 |
+
Args:
|
| 106 |
+
input_images: List of input image files
|
| 107 |
+
target_dir_images: Target directory for images
|
| 108 |
+
|
| 109 |
+
Returns:
|
| 110 |
+
List of processed image paths
|
| 111 |
+
"""
|
| 112 |
+
image_paths = []
|
| 113 |
+
|
| 114 |
+
for file_data in input_images:
|
| 115 |
+
if isinstance(file_data, dict) and "name" in file_data:
|
| 116 |
+
file_path = file_data["name"]
|
| 117 |
+
else:
|
| 118 |
+
file_path = file_data
|
| 119 |
+
|
| 120 |
+
# Check if the file is a HEIC image
|
| 121 |
+
file_ext = os.path.splitext(file_path)[1].lower()
|
| 122 |
+
if file_ext in [".heic", ".heif"]:
|
| 123 |
+
# Convert HEIC to JPEG for better gallery compatibility
|
| 124 |
+
try:
|
| 125 |
+
with Image.open(file_path) as img:
|
| 126 |
+
# Convert to RGB if necessary (HEIC can have different color modes)
|
| 127 |
+
if img.mode not in ("RGB", "L"):
|
| 128 |
+
img = img.convert("RGB")
|
| 129 |
+
|
| 130 |
+
# Create JPEG filename
|
| 131 |
+
base_name = os.path.splitext(os.path.basename(file_path))[0]
|
| 132 |
+
dst_path = os.path.join(target_dir_images, f"{base_name}.jpg")
|
| 133 |
+
|
| 134 |
+
# Save as JPEG with high quality
|
| 135 |
+
img.save(dst_path, "JPEG", quality=95)
|
| 136 |
+
image_paths.append(dst_path)
|
| 137 |
+
print(
|
| 138 |
+
f"Converted HEIC to JPEG: {os.path.basename(file_path)} -> "
|
| 139 |
+
f"{os.path.basename(dst_path)}"
|
| 140 |
+
)
|
| 141 |
+
except Exception as e:
|
| 142 |
+
print(f"Error converting HEIC file {file_path}: {e}")
|
| 143 |
+
# Fall back to copying as is
|
| 144 |
+
dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
|
| 145 |
+
shutil.copy(file_path, dst_path)
|
| 146 |
+
image_paths.append(dst_path)
|
| 147 |
+
else:
|
| 148 |
+
# Regular image files - copy as is
|
| 149 |
+
dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
|
| 150 |
+
shutil.copy(file_path, dst_path)
|
| 151 |
+
image_paths.append(dst_path)
|
| 152 |
+
|
| 153 |
+
return image_paths
|
| 154 |
+
|
| 155 |
+
def _process_video(
|
| 156 |
+
self, input_video: str, target_dir_images: str, s_time_interval: float
|
| 157 |
+
) -> List[str]:
|
| 158 |
+
"""
|
| 159 |
+
Process video file and extract frames.
|
| 160 |
+
|
| 161 |
+
Args:
|
| 162 |
+
input_video: Path to input video file
|
| 163 |
+
target_dir_images: Target directory for extracted frames
|
| 164 |
+
s_time_interval: Sampling FPS (frames per second) for frame extraction
|
| 165 |
+
|
| 166 |
+
Returns:
|
| 167 |
+
List of extracted frame paths
|
| 168 |
+
"""
|
| 169 |
+
image_paths = []
|
| 170 |
+
|
| 171 |
+
if isinstance(input_video, dict) and "name" in input_video:
|
| 172 |
+
video_path = input_video["name"]
|
| 173 |
+
else:
|
| 174 |
+
video_path = input_video
|
| 175 |
+
|
| 176 |
+
vs = cv2.VideoCapture(video_path)
|
| 177 |
+
fps = vs.get(cv2.CAP_PROP_FPS)
|
| 178 |
+
frame_interval = max(1, int(fps / s_time_interval)) # Convert FPS to frame interval
|
| 179 |
+
|
| 180 |
+
count = 0
|
| 181 |
+
video_frame_num = 0
|
| 182 |
+
while True:
|
| 183 |
+
gotit, frame = vs.read()
|
| 184 |
+
if not gotit:
|
| 185 |
+
break
|
| 186 |
+
count += 1
|
| 187 |
+
if count % frame_interval == 0:
|
| 188 |
+
image_path = os.path.join(target_dir_images, f"{video_frame_num:06}.png")
|
| 189 |
+
cv2.imwrite(image_path, frame)
|
| 190 |
+
image_paths.append(image_path)
|
| 191 |
+
video_frame_num += 1
|
| 192 |
+
|
| 193 |
+
return image_paths
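The stride computation above converts the requested sampling FPS into a stride in source frames; a small standalone restatement with worked values:

def sampling_stride(source_fps: float, target_fps: float) -> int:
    """Mirrors the frame_interval computation above: keep every N-th frame."""
    return max(1, int(source_fps / target_fps))

assert sampling_stride(30.0, 10.0) == 3   # 30 fps clip sampled at 10 fps keeps every 3rd frame
assert sampling_stride(30.0, 60.0) == 1   # cannot upsample; every frame is kept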
|
| 194 |
+
|
| 195 |
+
def update_gallery_on_upload(
|
| 196 |
+
self,
|
| 197 |
+
input_video: Optional[str],
|
| 198 |
+
input_images: Optional[List],
|
| 199 |
+
s_time_interval: float = 10.0,
|
| 200 |
+
) -> Tuple[Optional[str], Optional[str], Optional[List], Optional[str]]:
|
| 201 |
+
"""
|
| 202 |
+
Handle file uploads and update gallery.
|
| 203 |
+
|
| 204 |
+
Args:
|
| 205 |
+
input_video: Path to input video file
|
| 206 |
+
input_images: List of input image files
|
| 207 |
+
s_time_interval: Sampling FPS (frames per second) for frame extraction
|
| 208 |
+
|
| 209 |
+
Returns:
|
| 210 |
+
Tuple of (reconstruction_output, target_dir, image_paths, log_message)
|
| 211 |
+
"""
|
| 212 |
+
if not input_video and not input_images:
|
| 213 |
+
return None, None, None, None
|
| 214 |
+
|
| 215 |
+
target_dir, image_paths = self.handle_uploads(input_video, input_images, s_time_interval)
|
| 216 |
+
return (
|
| 217 |
+
None,
|
| 218 |
+
target_dir,
|
| 219 |
+
image_paths,
|
| 220 |
+
"Upload complete. Click 'Reconstruct' to begin 3D processing.",
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
def load_example_scene(
|
| 224 |
+
self, scene_name: str, examples_dir: str = "examples"
|
| 225 |
+
) -> Tuple[Optional[str], Optional[str], Optional[List], str]:
|
| 226 |
+
"""
|
| 227 |
+
Load a scene from examples directory.
|
| 228 |
+
|
| 229 |
+
Args:
|
| 230 |
+
scene_name: Name of the scene to load
|
| 231 |
+
examples_dir: Path to examples directory
|
| 232 |
+
|
| 233 |
+
Returns:
|
| 234 |
+
Tuple of (reconstruction_output, target_dir, image_paths, log_message)
|
| 235 |
+
"""
|
| 236 |
+
from depth_anything_3.app.modules.utils import get_scene_info
|
| 237 |
+
|
| 238 |
+
scenes = get_scene_info(examples_dir)
|
| 239 |
+
|
| 240 |
+
# Find the selected scene
|
| 241 |
+
selected_scene = None
|
| 242 |
+
for scene in scenes:
|
| 243 |
+
if scene["name"] == scene_name:
|
| 244 |
+
selected_scene = scene
|
| 245 |
+
break
|
| 246 |
+
|
| 247 |
+
if selected_scene is None:
|
| 248 |
+
return None, None, None, "Scene not found"
|
| 249 |
+
|
| 250 |
+
# Use fixed directory name for examples (not timestamp-based)
|
| 251 |
+
workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
|
| 252 |
+
input_images_dir = os.path.join(workspace_dir, "input_images")
|
| 253 |
+
if not os.path.exists(input_images_dir):
|
| 254 |
+
os.makedirs(input_images_dir)
|
| 255 |
+
|
| 256 |
+
# Create a fixed folder name based on scene name
|
| 257 |
+
target_dir = os.path.join(input_images_dir, f"example_{scene_name}")
|
| 258 |
+
target_dir_images = os.path.join(target_dir, "images")
|
| 259 |
+
|
| 260 |
+
# Check if already cached (GLB file exists)
|
| 261 |
+
glb_path = os.path.join(target_dir, "scene.glb")
|
| 262 |
+
is_cached = os.path.exists(glb_path)
|
| 263 |
+
|
| 264 |
+
# Create directory if it doesn't exist
|
| 265 |
+
if not os.path.exists(target_dir):
|
| 266 |
+
os.makedirs(target_dir)
|
| 267 |
+
os.makedirs(target_dir_images)
|
| 268 |
+
|
| 269 |
+
# Copy images if directory is new or empty
|
| 270 |
+
if not os.path.exists(target_dir_images) or len(os.listdir(target_dir_images)) == 0:
|
| 271 |
+
os.makedirs(target_dir_images, exist_ok=True)
|
| 272 |
+
image_paths = []
|
| 273 |
+
for file_path in selected_scene["image_files"]:
|
| 274 |
+
dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
|
| 275 |
+
shutil.copy(file_path, dst_path)
|
| 276 |
+
image_paths.append(dst_path)
|
| 277 |
+
else:
|
| 278 |
+
# Use existing images
|
| 279 |
+
image_paths = sorted(
|
| 280 |
+
[
|
| 281 |
+
os.path.join(target_dir_images, f)
|
| 282 |
+
for f in os.listdir(target_dir_images)
|
| 283 |
+
if f.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif"))
|
| 284 |
+
]
|
| 285 |
+
)
|
| 286 |
+
|
| 287 |
+
# Return cached GLB if available
|
| 288 |
+
if is_cached:
|
| 289 |
+
return (
|
| 290 |
+
glb_path, # Return cached reconstruction
|
| 291 |
+
target_dir, # Set target directory
|
| 292 |
+
image_paths, # Set gallery
|
| 293 |
+
f"Loaded cached scene '{scene_name}' with {selected_scene['num_images']} images.",
|
| 294 |
+
)
|
| 295 |
+
else:
|
| 296 |
+
return (
|
| 297 |
+
None, # No cached reconstruction
|
| 298 |
+
target_dir, # Set target directory
|
| 299 |
+
image_paths, # Set gallery
|
| 300 |
+
(
|
| 301 |
+
f"Loaded scene '{scene_name}' with {selected_scene['num_images']} images. "
|
| 302 |
+
"Click 'Reconstruct' to begin 3D processing."
|
| 303 |
+
),
|
| 304 |
+
)
|
Depth-Anything-3-anysize/src/depth_anything_3/app/modules/model_inference.py
ADDED
|
@@ -0,0 +1,292 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
Model inference module for Depth Anything 3 Gradio app.
|
| 17 |
+
|
| 18 |
+
This module handles all model-related operations including inference,
|
| 19 |
+
data processing, and result preparation.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
import glob
|
| 23 |
+
import os
|
| 24 |
+
from typing import Any, Dict, Optional, Tuple
|
| 25 |
+
import numpy as np
|
| 26 |
+
import torch
|
| 27 |
+
|
| 28 |
+
from depth_anything_3.api import DepthAnything3
|
| 29 |
+
from depth_anything_3.utils.memory import cleanup_cuda_memory
|
| 30 |
+
from depth_anything_3.utils.export.glb import export_to_glb
|
| 31 |
+
from depth_anything_3.utils.export.gs import export_to_gs_video
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
class ModelInference:
|
| 35 |
+
"""
|
| 36 |
+
Handles model inference and data processing for Depth Anything 3.
|
| 37 |
+
"""
|
| 38 |
+
|
| 39 |
+
def __init__(self):
|
| 40 |
+
"""Initialize the model inference handler."""
|
| 41 |
+
self.model = None
|
| 42 |
+
|
| 43 |
+
def initialize_model(self, device: str = "cuda") -> None:
|
| 44 |
+
"""
|
| 45 |
+
Initialize the DepthAnything3 model.
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
device: Device to load the model on
|
| 49 |
+
"""
|
| 50 |
+
if self.model is None:
|
| 51 |
+
# Get model directory from environment variable or use default
|
| 52 |
+
model_dir = os.environ.get(
|
| 53 |
+
"DA3_MODEL_DIR", "/dev/shm/da3_models/DA3HF-VITG-METRIC_VITL"
|
| 54 |
+
)
|
| 55 |
+
self.model = DepthAnything3.from_pretrained(model_dir)
|
| 56 |
+
self.model = self.model.to(device)
|
| 57 |
+
else:
|
| 58 |
+
self.model = self.model.to(device)
|
| 59 |
+
|
| 60 |
+
self.model.eval()
|
| 61 |
+
|
| 62 |
+
def run_inference(
|
| 63 |
+
self,
|
| 64 |
+
target_dir: str,
|
| 65 |
+
filter_black_bg: bool = False,
|
| 66 |
+
filter_white_bg: bool = False,
|
| 67 |
+
process_res_method: str = "keep",
|
| 68 |
+
show_camera: bool = True,
|
| 69 |
+
selected_first_frame: Optional[str] = None,
|
| 70 |
+
save_percentage: float = 30.0,
|
| 71 |
+
num_max_points: int = 1_000_000,
|
| 72 |
+
infer_gs: bool = False,
|
| 73 |
+
gs_trj_mode: str = "extend",
|
| 74 |
+
gs_video_quality: str = "high",
|
| 75 |
+
) -> Tuple[Any, Dict[int, Dict[str, Any]]]:
|
| 76 |
+
"""
|
| 77 |
+
Run DepthAnything3 model inference on images.
|
| 78 |
+
|
| 79 |
+
Args:
|
| 80 |
+
target_dir: Directory containing images
|
| 81 |
+
filter_black_bg: Whether to filter black background
|
| 84 |
+
filter_white_bg: Whether to filter white background
|
| 85 |
+
process_res_method: Method for resizing input images
|
| 86 |
+
show_camera: Whether to show camera in 3D view
|
| 87 |
+
selected_first_frame: Selected first frame filename
|
| 88 |
+
save_percentage: Percentage of points to save (0-100)
|
| 89 |
+
            infer_gs: Whether to infer 3D Gaussian Splatting
            num_max_points: Maximum number of points for the exported GLB point cloud
            gs_trj_mode: Trajectory mode for the 3DGS video ("extend" or "smooth")
            gs_video_quality: Quality preset for the rendered 3DGS video
|
| 90 |
+
|
| 91 |
+
Returns:
|
| 92 |
+
Tuple of (prediction, processed_data)
|
| 93 |
+
"""
|
| 94 |
+
print(f"Processing images from {target_dir}")
|
| 95 |
+
|
| 96 |
+
# Device check
|
| 97 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 98 |
+
device = torch.device(device)
|
| 99 |
+
|
| 100 |
+
# Initialize model if needed
|
| 101 |
+
self.initialize_model(device)
|
| 102 |
+
|
| 103 |
+
# Get image paths
|
| 104 |
+
print("Loading images...")
|
| 105 |
+
image_folder_path = os.path.join(target_dir, "images")
|
| 106 |
+
all_image_paths = sorted(glob.glob(os.path.join(image_folder_path, "*")))
|
| 107 |
+
|
| 108 |
+
# Filter for image files
|
| 109 |
+
image_extensions = [".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]
|
| 110 |
+
all_image_paths = [
|
| 111 |
+
path
|
| 112 |
+
for path in all_image_paths
|
| 113 |
+
if any(path.lower().endswith(ext) for ext in image_extensions)
|
| 114 |
+
]
|
| 115 |
+
|
| 116 |
+
print(f"Found {len(all_image_paths)} images")
|
| 117 |
+
print(f"All image paths: {all_image_paths}")
|
| 118 |
+
|
| 119 |
+
# Apply first frame selection logic
|
| 120 |
+
if selected_first_frame:
|
| 121 |
+
# Find the image with matching filename
|
| 122 |
+
selected_path = None
|
| 123 |
+
for path in all_image_paths:
|
| 124 |
+
if os.path.basename(path) == selected_first_frame:
|
| 125 |
+
selected_path = path
|
| 126 |
+
break
|
| 127 |
+
|
| 128 |
+
if selected_path:
|
| 129 |
+
# Move selected frame to the front
|
| 130 |
+
image_paths = [selected_path] + [
|
| 131 |
+
path for path in all_image_paths if path != selected_path
|
| 132 |
+
]
|
| 133 |
+
print(f"User selected first frame: {selected_first_frame} -> {selected_path}")
|
| 134 |
+
print(f"Reordered image paths: {image_paths}")
|
| 135 |
+
else:
|
| 136 |
+
# Use default order if no match found
|
| 137 |
+
image_paths = all_image_paths
|
| 138 |
+
print(
|
| 139 |
+
f"Selected frame '{selected_first_frame}' not found in image paths. "
|
| 140 |
+
"Using default order."
|
| 141 |
+
)
|
| 142 |
+
first_frame_display = image_paths[0] if image_paths else "No images"
|
| 143 |
+
print(f"Using default order (first frame): {first_frame_display}")
|
| 144 |
+
else:
|
| 145 |
+
# Use default order (sorted)
|
| 146 |
+
image_paths = all_image_paths
|
| 147 |
+
first_frame_display = image_paths[0] if image_paths else "No images"
|
| 148 |
+
print(f"Using default order (first frame): {first_frame_display}")
|
| 149 |
+
|
| 150 |
+
if len(image_paths) == 0:
|
| 151 |
+
raise ValueError("No images found. Check your upload.")
|
| 152 |
+
|
| 153 |
+
# Map UI options to actual method names
|
| 154 |
+
method_mapping = {
|
| 155 |
+
"high_res": "lower_bound_resize",
|
| 156 |
+
"low_res": "upper_bound_resize",
|
| 157 |
+
"keep": "keep",
|
| 158 |
+
"original": "original",
|
| 159 |
+
}
|
| 160 |
+
actual_method = method_mapping.get(process_res_method, process_res_method)
|
| 161 |
+
process_res_value = None if actual_method in ("keep", "original") else 504
|
| 162 |
+
|
| 163 |
+
# Run model inference
|
| 164 |
+
print(f"Running inference with method: {actual_method}")
|
| 165 |
+
with torch.no_grad():
|
| 166 |
+
prediction = self.model.inference(
|
| 167 |
+
image_paths,
|
| 168 |
+
export_dir=None,
|
| 169 |
+
process_res=process_res_value,
|
| 170 |
+
process_res_method=actual_method,
|
| 171 |
+
infer_gs=infer_gs,
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
export_to_glb(
|
| 175 |
+
prediction,
|
| 176 |
+
filter_black_bg=filter_black_bg,
|
| 177 |
+
filter_white_bg=filter_white_bg,
|
| 178 |
+
export_dir=target_dir,
|
| 179 |
+
show_cameras=show_camera,
|
| 180 |
+
conf_thresh_percentile=save_percentage,
|
| 181 |
+
num_max_points=int(num_max_points),
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
# export to gs video if needed
|
| 185 |
+
if infer_gs:
|
| 186 |
+
mode_mapping = {"extend": "extend", "smooth": "interpolate_smooth"}
|
| 187 |
+
print(f"GS mode: {gs_trj_mode}; Backend mode: {mode_mapping[gs_trj_mode]}")
|
| 188 |
+
export_to_gs_video(
|
| 189 |
+
prediction,
|
| 190 |
+
export_dir=target_dir,
|
| 191 |
+
chunk_size=4,
|
| 192 |
+
trj_mode=mode_mapping.get(gs_trj_mode, "extend"),
|
| 193 |
+
enable_tqdm=True,
|
| 194 |
+
vis_depth="hcat",
|
| 195 |
+
video_quality=gs_video_quality,
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
# Save predictions.npz for caching metric depth data
|
| 199 |
+
self._save_predictions_cache(target_dir, prediction)
|
| 200 |
+
|
| 201 |
+
# Process results
|
| 202 |
+
processed_data = self._process_results(target_dir, prediction, image_paths)
|
| 203 |
+
|
| 204 |
+
# Clean up using centralized memory utilities for consistency with backend
|
| 205 |
+
cleanup_cuda_memory()
|
| 206 |
+
|
| 207 |
+
return prediction, processed_data
|
| 208 |
+
|
| 209 |
+
def _save_predictions_cache(self, target_dir: str, prediction: Any) -> None:
|
| 210 |
+
"""
|
| 211 |
+
Save predictions data to predictions.npz for caching.
|
| 212 |
+
|
| 213 |
+
Args:
|
| 214 |
+
target_dir: Directory to save the cache
|
| 215 |
+
prediction: Model prediction object
|
| 216 |
+
"""
|
| 217 |
+
try:
|
| 218 |
+
output_file = os.path.join(target_dir, "predictions.npz")
|
| 219 |
+
|
| 220 |
+
# Build save dict with prediction data
|
| 221 |
+
save_dict = {}
|
| 222 |
+
|
| 223 |
+
# Save processed images if available
|
| 224 |
+
if prediction.processed_images is not None:
|
| 225 |
+
save_dict["images"] = prediction.processed_images
|
| 226 |
+
|
| 227 |
+
# Save depth data
|
| 228 |
+
if prediction.depth is not None:
|
| 229 |
+
save_dict["depths"] = np.round(prediction.depth, 6)
|
| 230 |
+
|
| 231 |
+
# Save confidence if available
|
| 232 |
+
if prediction.conf is not None:
|
| 233 |
+
save_dict["conf"] = np.round(prediction.conf, 2)
|
| 234 |
+
|
| 235 |
+
# Save camera parameters
|
| 236 |
+
if prediction.extrinsics is not None:
|
| 237 |
+
save_dict["extrinsics"] = prediction.extrinsics
|
| 238 |
+
if prediction.intrinsics is not None:
|
| 239 |
+
save_dict["intrinsics"] = prediction.intrinsics
|
| 240 |
+
|
| 241 |
+
# Save to file
|
| 242 |
+
np.savez_compressed(output_file, **save_dict)
|
| 243 |
+
print(f"Saved predictions cache to: {output_file}")
|
| 244 |
+
|
| 245 |
+
except Exception as e:
|
| 246 |
+
print(f"Warning: Failed to save predictions cache: {e}")
|
| 247 |
+
|
| 248 |
+
def _process_results(
|
| 249 |
+
self, target_dir: str, prediction: Any, image_paths: list
|
| 250 |
+
) -> Dict[int, Dict[str, Any]]:
|
| 251 |
+
"""
|
| 252 |
+
Process model results into structured data.
|
| 253 |
+
|
| 254 |
+
Args:
|
| 255 |
+
target_dir: Directory containing results
|
| 256 |
+
prediction: Model prediction object
|
| 257 |
+
image_paths: List of input image paths
|
| 258 |
+
|
| 259 |
+
Returns:
|
| 260 |
+
Dictionary containing processed data for each view
|
| 261 |
+
"""
|
| 262 |
+
processed_data = {}
|
| 263 |
+
|
| 264 |
+
# Read generated depth visualization files
|
| 265 |
+
depth_vis_dir = os.path.join(target_dir, "depth_vis")
|
| 266 |
+
|
| 267 |
+
if os.path.exists(depth_vis_dir):
|
| 268 |
+
depth_files = sorted(glob.glob(os.path.join(depth_vis_dir, "*.jpg")))
|
| 269 |
+
for i, depth_file in enumerate(depth_files):
|
| 270 |
+
# Use processed images directly from API
|
| 271 |
+
processed_image = None
|
| 272 |
+
if prediction.processed_images is not None and i < len(
|
| 273 |
+
prediction.processed_images
|
| 274 |
+
):
|
| 275 |
+
processed_image = prediction.processed_images[i]
|
| 276 |
+
|
| 277 |
+
processed_data[i] = {
|
| 278 |
+
"depth_image": depth_file,
|
| 279 |
+
"image": processed_image,
|
| 280 |
+
"original_image_path": image_paths[i] if i < len(image_paths) else None,
|
| 281 |
+
"depth": prediction.depth[i] if i < len(prediction.depth) else None,
|
| 282 |
+
"intrinsics": (
|
| 283 |
+
prediction.intrinsics[i]
|
| 284 |
+
if prediction.intrinsics is not None and i < len(prediction.intrinsics)
|
| 285 |
+
else None
|
| 286 |
+
),
|
| 287 |
+
"mask": None, # No mask information available
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
return processed_data
|
| 291 |
+
|
| 292 |
+
# cleanup() removed: call cleanup_cuda_memory() directly where needed.
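An end-to-end usage sketch under assumed paths: point run_inference at a prepared session folder (one that already contains an images/ subdirectory) after choosing a local checkpoint via DA3_MODEL_DIR.

# Illustrative only; both paths below are assumptions.
import os

os.environ.setdefault("DA3_MODEL_DIR", "/path/to/da3_checkpoint")
inference = ModelInference()
prediction, processed_data = inference.run_inference(
    target_dir="gradio_workspace/input_images/session_demo",
    process_res_method="low_res",   # mapped to "upper_bound_resize" above
    show_camera=True,
    save_percentage=30.0,
)
print(f"{len(processed_data)} views processed")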
|
Depth-Anything-3-anysize/src/depth_anything_3/app/modules/ui_components.py
ADDED
|
@@ -0,0 +1,474 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
UI components module for Depth Anything 3 Gradio app.
|
| 17 |
+
|
| 18 |
+
This module contains UI component definitions and layout functions.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
from typing import Any, Dict, List, Tuple
|
| 23 |
+
import gradio as gr
|
| 24 |
+
|
| 25 |
+
from depth_anything_3.app.modules.utils import get_logo_base64, get_scene_info
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class UIComponents:
|
| 29 |
+
"""
|
| 30 |
+
Handles UI component creation and layout for the Gradio app.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
def __init__(self):
|
| 34 |
+
"""Initialize the UI components handler."""
|
| 35 |
+
|
| 36 |
+
def create_upload_section(self) -> Tuple[gr.Video, gr.Slider, gr.File, gr.Gallery, gr.Button]:
|
| 37 |
+
"""
|
| 38 |
+
Create the upload section with video, images, and gallery components.
|
| 39 |
+
|
| 40 |
+
Returns:
|
| 41 |
+
A tuple of Gradio components: (input_video, s_time_interval, input_images,
|
| 42 |
+
image_gallery, select_first_frame_btn).
|
| 43 |
+
"""
|
| 44 |
+
input_video = gr.Video(label="Upload Video", interactive=True)
|
| 45 |
+
s_time_interval = gr.Slider(
|
| 46 |
+
minimum=0.1,
|
| 47 |
+
maximum=60,
|
| 48 |
+
value=10,
|
| 49 |
+
step=0.1,
|
| 50 |
+
label="Sampling FPS (Frames Per Second)",
|
| 51 |
+
interactive=True,
|
| 52 |
+
visible=True,
|
| 53 |
+
)
|
| 54 |
+
input_images = gr.File(file_count="multiple", label="Upload Images", interactive=True)
|
| 55 |
+
image_gallery = gr.Gallery(
|
| 56 |
+
label="Preview",
|
| 57 |
+
columns=4,
|
| 58 |
+
height="300px",
|
| 59 |
+
show_download_button=True,
|
| 60 |
+
object_fit="contain",
|
| 61 |
+
preview=True,
|
| 62 |
+
interactive=False,
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
# Select first frame button (moved below image gallery)
|
| 66 |
+
select_first_frame_btn = gr.Button("Select First Frame", scale=1)
|
| 67 |
+
|
| 68 |
+
return input_video, s_time_interval, input_images, image_gallery, select_first_frame_btn
|
| 69 |
+
|
| 70 |
+
def create_3d_viewer_section(self) -> gr.Model3D:
|
| 71 |
+
"""
|
| 72 |
+
Create the 3D viewer component.
|
| 73 |
+
|
| 74 |
+
Returns:
|
| 75 |
+
3D model viewer component
|
| 76 |
+
"""
|
| 77 |
+
return gr.Model3D(
|
| 78 |
+
height=520,
|
| 79 |
+
zoom_speed=0.5,
|
| 80 |
+
pan_speed=0.5,
|
| 81 |
+
clear_color=[0.0, 0.0, 0.0, 0.0],
|
| 82 |
+
key="persistent_3d_viewer",
|
| 83 |
+
elem_id="reconstruction_3d_viewer",
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
def create_nvs_video(self) -> Tuple[gr.Video, gr.Markdown]:
|
| 87 |
+
"""
|
| 88 |
+
Create the 3DGS rendered video display component and info message.
|
| 89 |
+
|
| 90 |
+
Returns:
|
| 91 |
+
Tuple of (video component, info message component)
|
| 92 |
+
"""
|
| 93 |
+
with gr.Column():
|
| 94 |
+
gs_info = gr.Markdown(
|
| 95 |
+
(
|
| 96 |
+
"‼️ **3D Gaussian Splatting rendering is currently DISABLED.** <br><br><br>"
|
| 97 |
+
"To render novel views from 3DGS, "
|
| 98 |
+
"enable **Infer 3D Gaussian Splatting** below. <br>"
|
| 99 |
+
"Next, in **Visualization Options**, "
|
| 100 |
+
"*optionally* configure the **rendering trajectory** (default: smooth) "
|
| 101 |
+
"and **video quality** (default: low), "
|
| 102 |
+
"then click **Reconstruct**."
|
| 103 |
+
),
|
| 104 |
+
visible=True,
|
| 105 |
+
height=520,
|
| 106 |
+
)
|
| 107 |
+
gs_video = gr.Video(
|
| 108 |
+
height=520,
|
| 109 |
+
label="3DGS Rendered NVS Video (depth shown for reference only)",
|
| 110 |
+
interactive=False,
|
| 111 |
+
visible=False,
|
| 112 |
+
)
|
| 113 |
+
return gs_video, gs_info
|
| 114 |
+
|
| 115 |
+
def create_depth_section(self) -> Tuple[gr.Button, gr.Dropdown, gr.Button, gr.Image]:
|
| 116 |
+
"""
|
| 117 |
+
Create the depth visualization section.
|
| 118 |
+
|
| 119 |
+
Returns:
|
| 120 |
+
A tuple of (prev_depth_btn, depth_view_selector, next_depth_btn, depth_map)
|
| 121 |
+
"""
|
| 122 |
+
with gr.Row(elem_classes=["navigation-row"]):
|
| 123 |
+
prev_depth_btn = gr.Button("◀ Previous", size="sm", scale=1)
|
| 124 |
+
depth_view_selector = gr.Dropdown(
|
| 125 |
+
choices=["View 1"],
|
| 126 |
+
value="View 1",
|
| 127 |
+
label="Select View",
|
| 128 |
+
scale=2,
|
| 129 |
+
interactive=True,
|
| 130 |
+
allow_custom_value=True,
|
| 131 |
+
)
|
| 132 |
+
next_depth_btn = gr.Button("Next ▶", size="sm", scale=1)
|
| 133 |
+
depth_map = gr.Image(
|
| 134 |
+
type="numpy",
|
| 135 |
+
label="Colorized Depth Map",
|
| 136 |
+
format="png",
|
| 137 |
+
interactive=False,
|
| 138 |
+
)
|
| 139 |
+
|
| 140 |
+
return prev_depth_btn, depth_view_selector, next_depth_btn, depth_map
|
| 141 |
+
|
| 142 |
+
def create_measure_section(
|
| 143 |
+
self,
|
| 144 |
+
) -> Tuple[gr.Button, gr.Dropdown, gr.Button, gr.Image, gr.Image, gr.Markdown]:
|
| 145 |
+
"""
|
| 146 |
+
Create the measurement section.
|
| 147 |
+
|
| 148 |
+
Returns:
|
| 149 |
+
A tuple of (prev_measure_btn, measure_view_selector, next_measure_btn, measure_image,
|
| 150 |
+
measure_depth_image, measure_text)
|
| 151 |
+
"""
|
| 152 |
+
from depth_anything_3.app.css_and_html import MEASURE_INSTRUCTIONS_HTML
|
| 153 |
+
|
| 154 |
+
gr.Markdown(MEASURE_INSTRUCTIONS_HTML)
|
| 155 |
+
with gr.Row(elem_classes=["navigation-row"]):
|
| 156 |
+
prev_measure_btn = gr.Button("◀ Previous", size="sm", scale=1)
|
| 157 |
+
measure_view_selector = gr.Dropdown(
|
| 158 |
+
choices=["View 1"],
|
| 159 |
+
value="View 1",
|
| 160 |
+
label="Select View",
|
| 161 |
+
scale=2,
|
| 162 |
+
interactive=True,
|
| 163 |
+
allow_custom_value=True,
|
| 164 |
+
)
|
| 165 |
+
next_measure_btn = gr.Button("Next ▶", size="sm", scale=1)
|
| 166 |
+
with gr.Row():
|
| 167 |
+
measure_image = gr.Image(
|
| 168 |
+
type="numpy",
|
| 169 |
+
show_label=False,
|
| 170 |
+
format="webp",
|
| 171 |
+
interactive=False,
|
| 172 |
+
sources=[],
|
| 173 |
+
label="RGB Image",
|
| 174 |
+
scale=1,
|
| 175 |
+
height=275,
|
| 176 |
+
)
|
| 177 |
+
measure_depth_image = gr.Image(
|
| 178 |
+
type="numpy",
|
| 179 |
+
show_label=False,
|
| 180 |
+
format="webp",
|
| 181 |
+
interactive=False,
|
| 182 |
+
sources=[],
|
| 183 |
+
label="Depth Visualization (Right Half)",
|
| 184 |
+
scale=1,
|
| 185 |
+
height=275,
|
| 186 |
+
)
|
| 187 |
+
gr.Markdown(
|
| 188 |
+
"**Note:** Images have been adjusted to model processing size. "
|
| 189 |
+
"Click two points on the RGB image to measure distance."
|
| 190 |
+
)
|
| 191 |
+
measure_text = gr.Markdown("")
|
| 192 |
+
|
| 193 |
+
return (
|
| 194 |
+
prev_measure_btn,
|
| 195 |
+
measure_view_selector,
|
| 196 |
+
next_measure_btn,
|
| 197 |
+
measure_image,
|
| 198 |
+
measure_depth_image,
|
| 199 |
+
measure_text,
|
| 200 |
+
)
|
| 201 |
+
|
| 202 |
+
def create_inference_control_section(self) -> Tuple[gr.Dropdown, gr.Checkbox]:
|
| 203 |
+
"""
|
| 204 |
+
Create the inference control section (before inference).
|
| 205 |
+
|
| 206 |
+
Returns:
|
| 207 |
+
Tuple of (process_res_method_dropdown, infer_gs)
|
| 208 |
+
"""
|
| 209 |
+
with gr.Row():
|
| 210 |
+
process_res_method_dropdown = gr.Dropdown(
|
| 211 |
+
choices=["high_res", "low_res"],
|
| 212 |
+
value="low_res",
|
| 213 |
+
label="Image Processing Method",
|
| 214 |
+
info="low_res for much more images",
|
| 215 |
+
scale=1,
|
| 216 |
+
)
|
| 217 |
+
# Checkbox for enabling 3DGS inference; the info text carries a red warning icon
|
| 218 |
+
infer_gs = gr.Checkbox(
|
| 219 |
+
label="Infer 3D Gaussian Splatting",
|
| 220 |
+
value=False,
|
| 221 |
+
info=(
|
| 222 |
+
'Enable novel view rendering from 3DGS (<i class="fas fa-triangle-exclamation '
|
| 223 |
+
'fa-color-red"></i> requires extra processing time)'
|
| 224 |
+
),
|
| 225 |
+
scale=1,
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
return (process_res_method_dropdown, infer_gs)
|
| 229 |
+
|
| 230 |
+
def create_display_control_section(
|
| 231 |
+
self,
|
| 232 |
+
) -> Tuple[
|
| 233 |
+
gr.Checkbox,
|
| 234 |
+
gr.Checkbox,
|
| 235 |
+
gr.Checkbox,
|
| 236 |
+
gr.Slider,
|
| 237 |
+
gr.Slider,
|
| 238 |
+
gr.Dropdown,
|
| 239 |
+
gr.Dropdown,
|
| 240 |
+
gr.Button,
|
| 241 |
+
gr.ClearButton,
|
| 242 |
+
]:
|
| 243 |
+
"""
|
| 244 |
+
Create the display control section (options for visualization).
|
| 245 |
+
|
| 246 |
+
Returns:
|
| 247 |
+
Tuple of display control components including buttons
|
| 248 |
+
"""
|
| 249 |
+
with gr.Column():
|
| 250 |
+
# 3DGS options at the top
|
| 251 |
+
with gr.Row():
|
| 252 |
+
gs_trj_mode = gr.Dropdown(
|
| 253 |
+
choices=["smooth", "extend"],
|
| 254 |
+
value="smooth",
|
| 255 |
+
label=("Rendering trajectory for 3DGS viewpoints (requires n_views ≥ 2)"),
|
| 256 |
+
info=("'smooth' for view interpolation; 'extend' for longer trajectory"),
|
| 257 |
+
visible=False, # initially hidden
|
| 258 |
+
)
|
| 259 |
+
gs_video_quality = gr.Dropdown(
|
| 260 |
+
choices=["low", "medium", "high"],
|
| 261 |
+
value="low",
|
| 262 |
+
label=("Video quality for 3DGS rendered outputs"),
|
| 263 |
+
info=("'low' for faster loading speed; 'high' for better visual quality"),
|
| 264 |
+
visible=False, # initially hidden
|
| 265 |
+
)
|
| 266 |
+
|
| 267 |
+
# Reconstruct and Clear buttons (before Visualization Options)
|
| 268 |
+
with gr.Row():
|
| 269 |
+
submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
|
| 270 |
+
clear_btn = gr.ClearButton(scale=1)
|
| 271 |
+
|
| 272 |
+
gr.Markdown("### Visualization Options: (Click Reconstruct to update)")
|
| 273 |
+
show_cam = gr.Checkbox(label="Show Camera", value=True)
|
| 274 |
+
filter_black_bg = gr.Checkbox(label="Filter Black Background", value=False)
|
| 275 |
+
filter_white_bg = gr.Checkbox(label="Filter White Background", value=False)
|
| 276 |
+
save_percentage = gr.Slider(
|
| 277 |
+
minimum=0,
|
| 278 |
+
maximum=100,
|
| 279 |
+
value=10,
|
| 280 |
+
step=1,
|
| 281 |
+
label="Filter Percentage",
|
| 282 |
+
info="Confidence Threshold (%): Higher values filter more points.",
|
| 283 |
+
)
|
| 284 |
+
num_max_points = gr.Slider(
|
| 285 |
+
minimum=1000,
|
| 286 |
+
maximum=100000,
|
| 287 |
+
value=1000,
|
| 288 |
+
step=1000,
|
| 289 |
+
label="Max Points (K points)",
|
| 290 |
+
info="Maximum number of points to export to GLB (in thousands)",
|
| 291 |
+
)
|
| 292 |
+
|
| 293 |
+
return (
|
| 294 |
+
show_cam,
|
| 295 |
+
filter_black_bg,
|
| 296 |
+
filter_white_bg,
|
| 297 |
+
save_percentage,
|
| 298 |
+
num_max_points,
|
| 299 |
+
gs_trj_mode,
|
| 300 |
+
gs_video_quality,
|
| 301 |
+
submit_btn,
|
| 302 |
+
clear_btn,
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
def create_control_section(
|
| 306 |
+
self,
|
| 307 |
+
) -> Tuple[
|
| 308 |
+
gr.Button,
|
| 309 |
+
gr.ClearButton,
|
| 310 |
+
gr.Dropdown,
|
| 311 |
+
gr.Checkbox,
|
| 312 |
+
gr.Checkbox,
|
| 313 |
+
gr.Checkbox,
|
| 314 |
+
gr.Checkbox,
|
| 315 |
+
gr.Checkbox,
|
| 316 |
+
gr.Dropdown,
|
| 317 |
+
gr.Checkbox,
|
| 318 |
+
gr.Textbox,
|
| 319 |
+
]:
|
| 320 |
+
"""
|
| 321 |
+
Create the control section with buttons and options.
|
| 322 |
+
|
| 323 |
+
Returns:
|
| 324 |
+
Tuple of control components
|
| 325 |
+
"""
|
| 326 |
+
with gr.Row():
|
| 327 |
+
submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
|
| 328 |
+
clear_btn = gr.ClearButton(
|
| 329 |
+
scale=1,
|
| 330 |
+
)
|
| 331 |
+
|
| 332 |
+
with gr.Row():
|
| 333 |
+
frame_filter = gr.Dropdown(
|
| 334 |
+
choices=["All"], value="All", label="Show Points from Frame"
|
| 335 |
+
)
|
| 336 |
+
with gr.Column():
|
| 337 |
+
gr.Markdown("### Visualization Option: (Click Reconstruct to update)")
|
| 338 |
+
show_cam = gr.Checkbox(label="Show Camera", value=True)
|
| 339 |
+
show_mesh = gr.Checkbox(label="Show Mesh", value=True)
|
| 340 |
+
filter_black_bg = gr.Checkbox(label="Filter Black Background", value=False)
|
| 341 |
+
filter_white_bg = gr.Checkbox(label="Filter White Background", value=False)
|
| 342 |
+
gr.Markdown("### Reconstruction Options: (updated on next run)")
|
| 343 |
+
apply_mask_checkbox = gr.Checkbox(
|
| 344 |
+
label="Apply mask for predicted ambiguous depth classes & edges",
|
| 345 |
+
value=True,
|
| 346 |
+
)
|
| 347 |
+
process_res_method_dropdown = gr.Dropdown(
|
| 348 |
+
choices=[
|
| 349 |
+
"upper_bound_resize",
|
| 350 |
+
"upper_bound_crop",
|
| 351 |
+
"lower_bound_resize",
|
| 352 |
+
"lower_bound_crop",
|
| 353 |
+
],
|
| 354 |
+
value="upper_bound_resize",
|
| 355 |
+
label="Image Processing Method",
|
| 356 |
+
info="Method for resizing input images",
|
| 357 |
+
)
|
| 358 |
+
save_to_gallery_checkbox = gr.Checkbox(
|
| 359 |
+
label="Save to Gallery",
|
| 360 |
+
value=False,
|
| 361 |
+
info="Save current reconstruction results to gallery directory",
|
| 362 |
+
)
|
| 363 |
+
gallery_name_input = gr.Textbox(
|
| 364 |
+
label="Gallery Name",
|
| 365 |
+
placeholder="Enter a name for the gallery folder",
|
| 366 |
+
value="",
|
| 367 |
+
info="Leave empty for auto-generated name with timestamp",
|
| 368 |
+
)
|
| 369 |
+
|
| 370 |
+
return (
|
| 371 |
+
submit_btn,
|
| 372 |
+
clear_btn,
|
| 373 |
+
frame_filter,
|
| 374 |
+
show_cam,
|
| 375 |
+
show_mesh,
|
| 376 |
+
filter_black_bg,
|
| 377 |
+
filter_white_bg,
|
| 378 |
+
apply_mask_checkbox,
|
| 379 |
+
process_res_method_dropdown,
|
| 380 |
+
save_to_gallery_checkbox,
|
| 381 |
+
gallery_name_input,
|
| 382 |
+
)
|
| 383 |
+
|
| 384 |
+
def create_example_scenes_section(self) -> List[Dict[str, Any]]:
|
| 385 |
+
"""
|
| 386 |
+
Create the example scenes section.
|
| 387 |
+
|
| 388 |
+
Returns:
|
| 389 |
+
List of scene information dictionaries
|
| 390 |
+
"""
|
| 391 |
+
# Get workspace directory from environment variable
|
| 392 |
+
workspace_dir = os.environ.get("DA3_WORKSPACE_DIR", "gradio_workspace")
|
| 393 |
+
examples_dir = os.path.join(workspace_dir, "examples")
|
| 394 |
+
|
| 395 |
+
# Get scene information
|
| 396 |
+
scenes = get_scene_info(examples_dir)
|
| 397 |
+
|
| 398 |
+
return scenes
|
| 399 |
+
|
| 400 |
+
def create_example_scene_grid(self, scenes: List[Dict[str, Any]]) -> List[gr.Image]:
|
| 401 |
+
"""
|
| 402 |
+
Create the example scene grid.
|
| 403 |
+
|
| 404 |
+
Args:
|
| 405 |
+
scenes: List of scene information dictionaries
|
| 406 |
+
|
| 407 |
+
Returns:
|
| 408 |
+
List of scene image components
|
| 409 |
+
"""
|
| 410 |
+
scene_components = []
|
| 411 |
+
|
| 412 |
+
if scenes:
|
| 413 |
+
for i in range(0, len(scenes), 4): # Process 4 scenes per row
|
| 414 |
+
with gr.Row():
|
| 415 |
+
for j in range(4):
|
| 416 |
+
scene_idx = i + j
|
| 417 |
+
if scene_idx < len(scenes):
|
| 418 |
+
scene = scenes[scene_idx]
|
| 419 |
+
with gr.Column(scale=1, elem_classes=["clickable-thumbnail"]):
|
| 420 |
+
# Clickable thumbnail
|
| 421 |
+
scene_img = gr.Image(
|
| 422 |
+
value=scene["thumbnail"],
|
| 423 |
+
height=150,
|
| 424 |
+
interactive=False,
|
| 425 |
+
show_label=False,
|
| 426 |
+
elem_id=f"scene_thumb_{scene['name']}",
|
| 427 |
+
sources=[],
|
| 428 |
+
)
|
| 429 |
+
scene_components.append(scene_img)
|
| 430 |
+
|
| 431 |
+
# Scene name and image count as text below thumbnail
|
| 432 |
+
gr.Markdown(
|
| 433 |
+
f"**{scene['name']}** \n {scene['num_images']} images",
|
| 434 |
+
elem_classes=["scene-info"],
|
| 435 |
+
)
|
| 436 |
+
else:
|
| 437 |
+
# Empty column to maintain grid structure
|
| 438 |
+
with gr.Column(scale=1):
|
| 439 |
+
pass
|
| 440 |
+
|
| 441 |
+
return scene_components
|
| 442 |
+
|
| 443 |
+
def create_header_section(self) -> gr.HTML:
|
| 444 |
+
"""
|
| 445 |
+
Create the header section with logo and title.
|
| 446 |
+
|
| 447 |
+
Returns:
|
| 448 |
+
Header HTML component
|
| 449 |
+
"""
|
| 450 |
+
from depth_anything_3.app.css_and_html import get_header_html
|
| 451 |
+
|
| 452 |
+
return gr.HTML(get_header_html(get_logo_base64()))
|
| 453 |
+
|
| 454 |
+
def create_description_section(self) -> gr.HTML:
|
| 455 |
+
"""
|
| 456 |
+
Create the description section.
|
| 457 |
+
|
| 458 |
+
Returns:
|
| 459 |
+
Description HTML component
|
| 460 |
+
"""
|
| 461 |
+
from depth_anything_3.app.css_and_html import get_description_html
|
| 462 |
+
|
| 463 |
+
return gr.HTML(get_description_html())
|
| 464 |
+
|
| 465 |
+
def create_acknowledgements_section(self) -> gr.HTML:
|
| 466 |
+
"""
|
| 467 |
+
Create the acknowledgements section.
|
| 468 |
+
|
| 469 |
+
Returns:
|
| 470 |
+
Acknowledgements HTML component
|
| 471 |
+
"""
|
| 472 |
+
from depth_anything_3.app.css_and_html import get_acknowledgements_html
|
| 473 |
+
|
| 474 |
+
return gr.HTML(get_acknowledgements_html())
|
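The builder methods above only construct components; arranging them is left to the caller. Below is a minimal, hedged sketch of how such builders could be composed inside a `gr.Blocks` layout. The `UIComponents` class name, the import path, and the tab arrangement are assumptions for illustration, not taken from this file:

```python
import gradio as gr

# Class name and import path are assumed for this sketch.
from depth_anything_3.app.modules.ui_components import UIComponents

ui = UIComponents()
with gr.Blocks() as demo:
    ui.create_header_section()
    ui.create_description_section()
    with gr.Row():
        with gr.Column():
            process_res_method, infer_gs = ui.create_inference_control_section()
            display_controls = ui.create_display_control_section()
        with gr.Column():
            viewer = ui.create_3d_viewer_section()
            gs_video, gs_info = ui.create_nvs_video()
    with gr.Tab("Depth"):
        prev_d, depth_sel, next_d, depth_map = ui.create_depth_section()
    with gr.Tab("Measure"):
        measure_widgets = ui.create_measure_section()
    ui.create_acknowledgements_section()

# demo.launch()  # uncomment to serve the sketch locally
```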
Depth-Anything-3-anysize/src/depth_anything_3/app/modules/utils.py
ADDED
|
@@ -0,0 +1,207 @@
|
| 1 |
+
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
Utility functions for Depth Anything 3 Gradio app.
|
| 17 |
+
|
| 18 |
+
This module contains helper functions for data processing, visualization,
|
| 19 |
+
and file operations.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
import json
|
| 24 |
+
import os
|
| 25 |
+
import shutil
|
| 26 |
+
from datetime import datetime
|
| 27 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 28 |
+
import numpy as np
|
| 29 |
+
|
| 30 |
+
def create_depth_visualization(depth: np.ndarray) -> Optional[np.ndarray]:
|
| 31 |
+
"""
|
| 32 |
+
Create a colored depth visualization.
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
depth: Depth array
|
| 36 |
+
|
| 37 |
+
Returns:
|
| 38 |
+
Colored depth visualization or None
|
| 39 |
+
"""
|
| 40 |
+
if depth is None:
|
| 41 |
+
return None
|
| 42 |
+
|
| 43 |
+
# Normalize depth to 0-1 range
|
| 44 |
+
depth_min = depth[depth > 0].min() if (depth > 0).any() else 0
|
| 45 |
+
depth_max = depth.max()
|
| 46 |
+
|
| 47 |
+
if depth_max <= depth_min:
|
| 48 |
+
return None
|
| 49 |
+
|
| 50 |
+
# Normalize depth
|
| 51 |
+
depth_norm = (depth - depth_min) / (depth_max - depth_min)
|
| 52 |
+
depth_norm = np.clip(depth_norm, 0, 1)
|
| 53 |
+
|
| 54 |
+
# Apply colormap (using matplotlib's viridis colormap)
|
| 55 |
+
import matplotlib.cm as cm
|
| 56 |
+
|
| 57 |
+
# Convert to colored image
|
| 58 |
+
depth_colored = cm.viridis(depth_norm)[:, :, :3] # Remove alpha channel
|
| 59 |
+
depth_colored = (depth_colored * 255).astype(np.uint8)
|
| 60 |
+
|
| 61 |
+
return depth_colored
|
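A quick way to sanity-check this colormap helper; the import path is inferred from this file's location and the depth ramp values are made up:

```python
import numpy as np

from depth_anything_3.app.modules.utils import create_depth_visualization  # path assumed

depth = np.zeros((240, 320), dtype=np.float32)
depth[60:180, 80:240] = np.linspace(1.0, 5.0, 160)[None, :]  # synthetic depth ramp, 1-5 m
vis = create_depth_visualization(depth)
# Zero pixels are clipped to the low end of viridis; valid pixels span the full colormap.
print(vis.shape, vis.dtype)  # (240, 320, 3) uint8
```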
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def save_to_gallery_func(
|
| 65 |
+
target_dir: str, processed_data: Dict[int, Dict[str, Any]], gallery_name: Optional[str] = None
|
| 66 |
+
) -> Tuple[bool, str]:
|
| 67 |
+
"""
|
| 68 |
+
Save the current reconstruction results to the gallery directory.
|
| 69 |
+
|
| 70 |
+
Args:
|
| 71 |
+
target_dir: Source directory containing reconstruction results
|
| 72 |
+
processed_data: Processed data dictionary
|
| 73 |
+
gallery_name: Name for the gallery folder
|
| 74 |
+
|
| 75 |
+
Returns:
|
| 76 |
+
Tuple of (success, message)
|
| 77 |
+
"""
|
| 78 |
+
try:
|
| 79 |
+
# Get gallery directory from environment variable or use default
|
| 80 |
+
gallery_dir = os.environ.get(
|
| 81 |
+
"DA3_GALLERY_DIR",
|
| 82 |
+
"workspace/gallery",
|
| 83 |
+
)
|
| 84 |
+
if not os.path.exists(gallery_dir):
|
| 85 |
+
os.makedirs(gallery_dir)
|
| 86 |
+
|
| 87 |
+
# Use provided name or create a unique name
|
| 88 |
+
if gallery_name is None or gallery_name.strip() == "":
|
| 89 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
| 90 |
+
gallery_name = f"reconstruction_{timestamp}"
|
| 91 |
+
|
| 92 |
+
gallery_path = os.path.join(gallery_dir, gallery_name)
|
| 93 |
+
|
| 94 |
+
# Check if directory already exists
|
| 95 |
+
if os.path.exists(gallery_path):
|
| 96 |
+
return False, f"Save failed: folder '{gallery_name}' already exists"
|
| 97 |
+
|
| 98 |
+
# Create the gallery directory
|
| 99 |
+
os.makedirs(gallery_path, exist_ok=True)
|
| 100 |
+
|
| 101 |
+
# Copy GLB file
|
| 102 |
+
glb_source = os.path.join(target_dir, "scene.glb")
|
| 103 |
+
glb_dest = os.path.join(gallery_path, "scene.glb")
|
| 104 |
+
if os.path.exists(glb_source):
|
| 105 |
+
shutil.copy2(glb_source, glb_dest)
|
| 106 |
+
|
| 107 |
+
# Copy depth visualization images
|
| 108 |
+
depth_vis_dir = os.path.join(target_dir, "depth_vis")
|
| 109 |
+
if os.path.exists(depth_vis_dir):
|
| 110 |
+
gallery_depth_vis = os.path.join(gallery_path, "depth_vis")
|
| 111 |
+
shutil.copytree(depth_vis_dir, gallery_depth_vis)
|
| 112 |
+
|
| 113 |
+
# Copy original images
|
| 114 |
+
images_source = os.path.join(target_dir, "images")
|
| 115 |
+
if os.path.exists(images_source):
|
| 116 |
+
gallery_images = os.path.join(gallery_path, "images")
|
| 117 |
+
shutil.copytree(images_source, gallery_images)
|
| 118 |
+
|
| 119 |
+
scene_preview_source = os.path.join(target_dir, "scene.jpg")
|
| 120 |
+
scene_preview_dest = os.path.join(gallery_path, "scene.jpg")
|
| 121 |
+
if os.path.exists(scene_preview_source):
    shutil.copy2(scene_preview_source, scene_preview_dest)
|
| 122 |
+
|
| 123 |
+
# Save metadata
|
| 124 |
+
metadata = {
|
| 125 |
+
"timestamp": datetime.now().strftime("%Y%m%d_%H%M%S"),
|
| 126 |
+
"num_images": len(processed_data) if processed_data else 0,
|
| 127 |
+
"gallery_name": gallery_name,
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
with open(os.path.join(gallery_path, "metadata.json"), "w") as f:
|
| 131 |
+
json.dump(metadata, f, indent=2)
|
| 132 |
+
|
| 133 |
+
print(f"Saved reconstruction to gallery: {gallery_path}")
|
| 134 |
+
return True, f"Save successful: saved to {gallery_path}"
|
| 135 |
+
|
| 136 |
+
except Exception as e:
|
| 137 |
+
print(f"Error saving to gallery: {e}")
|
| 138 |
+
return False, f"Save failed: {str(e)}"
|
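A typical call site for this helper, with hypothetical paths; `DA3_GALLERY_DIR` is the only environment variable it reads:

```python
import os

from depth_anything_3.app.modules.utils import save_to_gallery_func  # path assumed

os.environ["DA3_GALLERY_DIR"] = "/tmp/da3_gallery"  # optional; defaults to workspace/gallery
ok, message = save_to_gallery_func(
    target_dir="/tmp/da3_run",       # hypothetical reconstruction output directory
    processed_data={0: {}, 1: {}},   # two processed views; only the count is stored in metadata
    gallery_name=None,               # None/empty -> reconstruction_<timestamp>
)
print(ok, message)
```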
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def get_scene_info(examples_dir: str) -> List[Dict[str, Any]]:
|
| 142 |
+
"""
|
| 143 |
+
Get information about scenes in the examples directory.
|
| 144 |
+
|
| 145 |
+
Args:
|
| 146 |
+
examples_dir: Path to examples directory
|
| 147 |
+
|
| 148 |
+
Returns:
|
| 149 |
+
List of scene information dictionaries
|
| 150 |
+
"""
|
| 151 |
+
import glob
|
| 152 |
+
|
| 153 |
+
scenes = []
|
| 154 |
+
if not os.path.exists(examples_dir):
|
| 155 |
+
return scenes
|
| 156 |
+
|
| 157 |
+
for scene_folder in sorted(os.listdir(examples_dir)):
|
| 158 |
+
scene_path = os.path.join(examples_dir, scene_folder)
|
| 159 |
+
if os.path.isdir(scene_path):
|
| 160 |
+
# Find all image files in the scene folder
|
| 161 |
+
image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff", "*.tif"]
|
| 162 |
+
image_files = []
|
| 163 |
+
for ext in image_extensions:
|
| 164 |
+
image_files.extend(glob.glob(os.path.join(scene_path, ext)))
|
| 165 |
+
image_files.extend(glob.glob(os.path.join(scene_path, ext.upper())))
|
| 166 |
+
|
| 167 |
+
if image_files:
|
| 168 |
+
# Sort images and get the first one for thumbnail
|
| 169 |
+
image_files = sorted(image_files)
|
| 170 |
+
first_image = image_files[0]
|
| 171 |
+
num_images = len(image_files)
|
| 172 |
+
|
| 173 |
+
scenes.append(
|
| 174 |
+
{
|
| 175 |
+
"name": scene_folder,
|
| 176 |
+
"path": scene_path,
|
| 177 |
+
"thumbnail": first_image,
|
| 178 |
+
"num_images": num_images,
|
| 179 |
+
"image_files": image_files,
|
| 180 |
+
}
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
return scenes
|
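The expected on-disk layout is one sub-folder per scene containing plain image files. A hedged example (folder and file names are hypothetical):

```python
from depth_anything_3.app.modules.utils import get_scene_info  # path assumed

# gradio_workspace/examples/
# ├── courtyard/   IMG_0001.jpg ... IMG_0024.jpg
# └── office/      frame_000.png ... frame_011.png
for scene in get_scene_info("gradio_workspace/examples"):
    print(scene["name"], scene["num_images"], scene["thumbnail"])
# courtyard 24 gradio_workspace/examples/courtyard/IMG_0001.jpg
# office 12 gradio_workspace/examples/office/frame_000.png
```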
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# NOTE: cleanup was moved to a single canonical helper in
|
| 187 |
+
# `depth_anything_3.utils.memory.cleanup_cuda_memory`.
|
| 188 |
+
# Callers should import and call that directly instead of using this module.
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def get_logo_base64() -> Optional[str]:
|
| 192 |
+
"""
|
| 193 |
+
Convert WAI logo to base64 for embedding in HTML.
|
| 194 |
+
|
| 195 |
+
Returns:
|
| 196 |
+
Base64 encoded logo string or None
|
| 197 |
+
"""
|
| 198 |
+
import base64
|
| 199 |
+
|
| 200 |
+
logo_path = "examples/WAI-Logo/wai_logo.png"
|
| 201 |
+
try:
|
| 202 |
+
with open(logo_path, "rb") as img_file:
|
| 203 |
+
img_data = img_file.read()
|
| 204 |
+
base64_str = base64.b64encode(img_data).decode()
|
| 205 |
+
return f"data:image/png;base64,{base64_str}"
|
| 206 |
+
except FileNotFoundError:
|
| 207 |
+
return None
|
Depth-Anything-3-anysize/src/depth_anything_3/app/modules/visualization.py
ADDED
|
@@ -0,0 +1,434 @@
|
| 1 |
+
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
Visualization module for Depth Anything 3 Gradio app.
|
| 17 |
+
|
| 18 |
+
This module handles visualization updates, navigation, and measurement functionality.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
import os
|
| 22 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 23 |
+
import cv2
|
| 24 |
+
import gradio as gr
|
| 25 |
+
import numpy as np
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class VisualizationHandler:
|
| 29 |
+
"""
|
| 30 |
+
Handles visualization updates and navigation for the Gradio app.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
def __init__(self):
|
| 34 |
+
"""Initialize the visualization handler."""
|
| 35 |
+
|
| 36 |
+
def update_view_selectors(
|
| 37 |
+
self, processed_data: Optional[Dict[int, Dict[str, Any]]]
|
| 38 |
+
) -> Tuple[gr.Dropdown, gr.Dropdown]:
|
| 39 |
+
"""
|
| 40 |
+
Update view selector dropdowns based on available views.
|
| 41 |
+
|
| 42 |
+
Args:
|
| 43 |
+
processed_data: Processed data dictionary
|
| 44 |
+
|
| 45 |
+
Returns:
|
| 46 |
+
Tuple of (depth_view_selector, measure_view_selector)
|
| 47 |
+
"""
|
| 48 |
+
if processed_data is None or len(processed_data) == 0:
|
| 49 |
+
choices = ["View 1"]
|
| 50 |
+
else:
|
| 51 |
+
num_views = len(processed_data)
|
| 52 |
+
choices = [f"View {i + 1}" for i in range(num_views)]
|
| 53 |
+
|
| 54 |
+
return (
|
| 55 |
+
gr.Dropdown(choices=choices, value=choices[0]), # depth_view_selector
|
| 56 |
+
gr.Dropdown(choices=choices, value=choices[0]), # measure_view_selector
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
def get_view_data_by_index(
|
| 60 |
+
self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
|
| 61 |
+
) -> Optional[Dict[str, Any]]:
|
| 62 |
+
"""
|
| 63 |
+
Get view data by index, handling bounds.
|
| 64 |
+
|
| 65 |
+
Args:
|
| 66 |
+
processed_data: Processed data dictionary
|
| 67 |
+
view_index: Index of the view to get
|
| 68 |
+
|
| 69 |
+
Returns:
|
| 70 |
+
View data dictionary or None
|
| 71 |
+
"""
|
| 72 |
+
if processed_data is None or len(processed_data) == 0:
|
| 73 |
+
return None
|
| 74 |
+
|
| 75 |
+
view_keys = list(processed_data.keys())
|
| 76 |
+
if view_index < 0 or view_index >= len(view_keys):
|
| 77 |
+
view_index = 0
|
| 78 |
+
|
| 79 |
+
return processed_data[view_keys[view_index]]
|
| 80 |
+
|
| 81 |
+
def update_depth_view(
|
| 82 |
+
self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
|
| 83 |
+
) -> Optional[str]:
|
| 84 |
+
"""
|
| 85 |
+
Update depth view for a specific view index.
|
| 86 |
+
|
| 87 |
+
Args:
|
| 88 |
+
processed_data: Processed data dictionary
|
| 89 |
+
view_index: Index of the view to update
|
| 90 |
+
|
| 91 |
+
Returns:
|
| 92 |
+
Path to depth visualization image or None
|
| 93 |
+
"""
|
| 94 |
+
view_data = self.get_view_data_by_index(processed_data, view_index)
|
| 95 |
+
if view_data is None or view_data.get("depth_image") is None:
|
| 96 |
+
return None
|
| 97 |
+
|
| 98 |
+
# Return the depth visualization image directly
|
| 99 |
+
return view_data["depth_image"]
|
| 100 |
+
|
| 101 |
+
def navigate_depth_view(
|
| 102 |
+
self,
|
| 103 |
+
processed_data: Optional[Dict[int, Dict[str, Any]]],
|
| 104 |
+
current_selector_value: str,
|
| 105 |
+
direction: int,
|
| 106 |
+
) -> Tuple[str, Optional[str]]:
|
| 107 |
+
"""
|
| 108 |
+
Navigate depth view (direction: -1 for previous, +1 for next).
|
| 109 |
+
|
| 110 |
+
Args:
|
| 111 |
+
processed_data: Processed data dictionary
|
| 112 |
+
current_selector_value: Current selector value
|
| 113 |
+
direction: Direction to navigate (-1 for previous, +1 for next)
|
| 114 |
+
|
| 115 |
+
Returns:
|
| 116 |
+
Tuple of (new_selector_value, depth_vis)
|
| 117 |
+
"""
|
| 118 |
+
if processed_data is None or len(processed_data) == 0:
|
| 119 |
+
return "View 1", None
|
| 120 |
+
|
| 121 |
+
# Parse current view number
|
| 122 |
+
try:
|
| 123 |
+
current_view = int(current_selector_value.split()[1]) - 1
|
| 124 |
+
except (AttributeError, IndexError, ValueError):  # malformed "View N" value; default to the first view
|
| 125 |
+
current_view = 0
|
| 126 |
+
|
| 127 |
+
num_views = len(processed_data)
|
| 128 |
+
new_view = (current_view + direction) % num_views
|
| 129 |
+
|
| 130 |
+
new_selector_value = f"View {new_view + 1}"
|
| 131 |
+
depth_vis = self.update_depth_view(processed_data, new_view)
|
| 132 |
+
|
| 133 |
+
return new_selector_value, depth_vis
|
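Navigation wraps around with a modulo, so stepping forward from the last view returns to the first. A small hedged example (the import path and file names are made up):

```python
from depth_anything_3.app.modules.visualization import VisualizationHandler  # path assumed

handler = VisualizationHandler()
data = {0: {"depth_image": "vis_0.png"}, 1: {"depth_image": "vis_1.png"}}  # hypothetical paths
print(handler.navigate_depth_view(data, "View 2", +1))  # ('View 1', 'vis_0.png')
print(handler.navigate_depth_view(data, "View 1", -1))  # ('View 2', 'vis_1.png')
```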
| 134 |
+
|
| 135 |
+
def update_measure_view(
|
| 136 |
+
self, processed_data: Optional[Dict[int, Dict[str, Any]]], view_index: int
|
| 137 |
+
) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], List]:
|
| 138 |
+
"""
|
| 139 |
+
Update measure view for a specific view index.
|
| 140 |
+
|
| 141 |
+
Args:
|
| 142 |
+
processed_data: Processed data dictionary
|
| 143 |
+
view_index: Index of the view to update
|
| 144 |
+
|
| 145 |
+
Returns:
|
| 146 |
+
Tuple of (measure_image, depth_right_half, measure_points)
|
| 147 |
+
"""
|
| 148 |
+
view_data = self.get_view_data_by_index(processed_data, view_index)
|
| 149 |
+
if view_data is None:
|
| 150 |
+
return None, None, [] # image, depth_right_half, measure_points
|
| 151 |
+
|
| 152 |
+
# Get the processed (resized) image
|
| 153 |
+
if "image" in view_data and view_data["image"] is not None:
|
| 154 |
+
image = view_data["image"].copy()
|
| 155 |
+
else:
|
| 156 |
+
return None, None, []
|
| 157 |
+
|
| 158 |
+
# Ensure image is in uint8 format
|
| 159 |
+
if image.dtype != np.uint8:
|
| 160 |
+
if image.max() <= 1.0:
|
| 161 |
+
image = (image * 255).astype(np.uint8)
|
| 162 |
+
else:
|
| 163 |
+
image = image.astype(np.uint8)
|
| 164 |
+
|
| 165 |
+
# Extract right half of the depth visualization (pure depth part)
|
| 166 |
+
depth_image_path = view_data.get("depth_image", None)
|
| 167 |
+
depth_right_half = None
|
| 168 |
+
|
| 169 |
+
if depth_image_path and os.path.exists(depth_image_path):
|
| 170 |
+
try:
|
| 171 |
+
# Load the combined depth visualization image
|
| 172 |
+
depth_combined = cv2.imread(depth_image_path)
|
| 173 |
+
if depth_combined is not None:
|
| 174 |
+
depth_combined = cv2.cvtColor(depth_combined, cv2.COLOR_BGR2RGB)
|
| 175 |
+
height, width = depth_combined.shape[:2]
|
| 176 |
+
# Extract right half (depth visualization part)
|
| 177 |
+
depth_right_half = depth_combined[:, width // 2 :]
|
| 178 |
+
except Exception as e:
|
| 179 |
+
print(f"Error extracting depth right half: {e}")
|
| 180 |
+
|
| 181 |
+
return image, depth_right_half, []
|
| 182 |
+
|
| 183 |
+
def navigate_measure_view(
|
| 184 |
+
self,
|
| 185 |
+
processed_data: Optional[Dict[int, Dict[str, Any]]],
|
| 186 |
+
current_selector_value: str,
|
| 187 |
+
direction: int,
|
| 188 |
+
) -> Tuple[str, Optional[np.ndarray], Optional[np.ndarray], List]:
|
| 189 |
+
"""
|
| 190 |
+
Navigate measure view (direction: -1 for previous, +1 for next).
|
| 191 |
+
|
| 192 |
+
Args:
|
| 193 |
+
processed_data: Processed data dictionary
|
| 194 |
+
current_selector_value: Current selector value
|
| 195 |
+
direction: Direction to navigate (-1 for previous, +1 for next)
|
| 196 |
+
|
| 197 |
+
Returns:
|
| 198 |
+
Tuple of (new_selector_value, measure_image, depth_right_half, measure_points)
|
| 199 |
+
"""
|
| 200 |
+
if processed_data is None or len(processed_data) == 0:
|
| 201 |
+
return "View 1", None, None, []
|
| 202 |
+
|
| 203 |
+
# Parse current view number
|
| 204 |
+
try:
|
| 205 |
+
current_view = int(current_selector_value.split()[1]) - 1
|
| 206 |
+
except (AttributeError, IndexError, ValueError):  # malformed "View N" value; default to the first view
|
| 207 |
+
current_view = 0
|
| 208 |
+
|
| 209 |
+
num_views = len(processed_data)
|
| 210 |
+
new_view = (current_view + direction) % num_views
|
| 211 |
+
|
| 212 |
+
new_selector_value = f"View {new_view + 1}"
|
| 213 |
+
measure_image, depth_right_half, measure_points = self.update_measure_view(
|
| 214 |
+
processed_data, new_view
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
return new_selector_value, measure_image, depth_right_half, measure_points
|
| 218 |
+
|
| 219 |
+
def populate_visualization_tabs(
|
| 220 |
+
self, processed_data: Optional[Dict[int, Dict[str, Any]]]
|
| 221 |
+
) -> Tuple[Optional[str], Optional[np.ndarray], Optional[np.ndarray], List]:
|
| 222 |
+
"""
|
| 223 |
+
Populate the depth and measure tabs with processed data.
|
| 224 |
+
|
| 225 |
+
Args:
|
| 226 |
+
processed_data: Processed data dictionary
|
| 227 |
+
|
| 228 |
+
Returns:
|
| 229 |
+
Tuple of (depth_vis, measure_img, depth_right_half, measure_points)
|
| 230 |
+
"""
|
| 231 |
+
if processed_data is None or len(processed_data) == 0:
|
| 232 |
+
return None, None, None, []
|
| 233 |
+
|
| 234 |
+
# Use update function to get depth visualization
|
| 235 |
+
depth_vis = self.update_depth_view(processed_data, 0)
|
| 236 |
+
measure_img, depth_right_half, _ = self.update_measure_view(processed_data, 0)
|
| 237 |
+
|
| 238 |
+
return depth_vis, measure_img, depth_right_half, []
|
| 239 |
+
|
| 240 |
+
def reset_measure(
|
| 241 |
+
self, processed_data: Optional[Dict[int, Dict[str, Any]]]
|
| 242 |
+
) -> Tuple[Optional[np.ndarray], List, str]:
|
| 243 |
+
"""
|
| 244 |
+
Reset measure points.
|
| 245 |
+
|
| 246 |
+
Args:
|
| 247 |
+
processed_data: Processed data dictionary
|
| 248 |
+
|
| 249 |
+
Returns:
|
| 250 |
+
Tuple of (image, measure_points, text)
|
| 251 |
+
"""
|
| 252 |
+
if processed_data is None or len(processed_data) == 0:
|
| 253 |
+
return None, [], ""
|
| 254 |
+
|
| 255 |
+
# Return the first view image
|
| 256 |
+
first_view = list(processed_data.values())[0]
|
| 257 |
+
return first_view["image"], [], ""
|
| 258 |
+
|
| 259 |
+
def measure(
|
| 260 |
+
self,
|
| 261 |
+
processed_data: Optional[Dict[int, Dict[str, Any]]],
|
| 262 |
+
measure_points: List,
|
| 263 |
+
current_view_selector: str,
|
| 264 |
+
event: gr.SelectData,
|
| 265 |
+
) -> List:
|
| 266 |
+
"""
|
| 267 |
+
Handle measurement on images.
|
| 268 |
+
|
| 269 |
+
Args:
|
| 270 |
+
processed_data: Processed data dictionary
|
| 271 |
+
measure_points: List of current measure points
|
| 272 |
+
current_view_selector: Current view selector value
|
| 273 |
+
event: Gradio select event
|
| 274 |
+
|
| 275 |
+
Returns:
|
| 276 |
+
List of [image, depth_right_half, measure_points, text]
|
| 277 |
+
"""
|
| 278 |
+
try:
|
| 279 |
+
print(f"Measure function called with selector: {current_view_selector}")
|
| 280 |
+
|
| 281 |
+
if processed_data is None or len(processed_data) == 0:
|
| 282 |
+
return [None, [], "No data available"]
|
| 283 |
+
|
| 284 |
+
# Use the currently selected view instead of always using the first view
|
| 285 |
+
try:
|
| 286 |
+
current_view_index = int(current_view_selector.split()[1]) - 1
|
| 287 |
+
except (AttributeError, IndexError, ValueError):  # malformed "View N" value; default to view 0
|
| 288 |
+
current_view_index = 0
|
| 289 |
+
|
| 290 |
+
print(f"Using view index: {current_view_index}")
|
| 291 |
+
|
| 292 |
+
# Get view data safely
|
| 293 |
+
if current_view_index < 0 or current_view_index >= len(processed_data):
|
| 294 |
+
current_view_index = 0
|
| 295 |
+
|
| 296 |
+
view_keys = list(processed_data.keys())
|
| 297 |
+
current_view = processed_data[view_keys[current_view_index]]
|
| 298 |
+
|
| 299 |
+
if current_view is None:
|
| 300 |
+
return [None, [], "No view data available"]
|
| 301 |
+
|
| 302 |
+
point2d = event.index[0], event.index[1]
|
| 303 |
+
print(f"Clicked point: {point2d}")
|
| 304 |
+
|
| 305 |
+
measure_points.append(point2d)
|
| 306 |
+
|
| 307 |
+
# Get image and depth visualization
|
| 308 |
+
image, depth_right_half, _ = self.update_measure_view(
|
| 309 |
+
processed_data, current_view_index
|
| 310 |
+
)
|
| 311 |
+
if image is None:
|
| 312 |
+
return [None, [], "No image available"]
|
| 313 |
+
|
| 314 |
+
image = image.copy()
|
| 315 |
+
|
| 316 |
+
# Ensure image is in uint8 format for proper cv2 operations
|
| 317 |
+
try:
|
| 318 |
+
if image.dtype != np.uint8:
|
| 319 |
+
if image.max() <= 1.0:
|
| 320 |
+
# Image is in [0, 1] range, convert to [0, 255]
|
| 321 |
+
image = (image * 255).astype(np.uint8)
|
| 322 |
+
else:
|
| 323 |
+
# Image is already in [0, 255] range
|
| 324 |
+
image = image.astype(np.uint8)
|
| 325 |
+
except Exception as e:
|
| 326 |
+
print(f"Image conversion error: {e}")
|
| 327 |
+
return [None, [], f"Image conversion error: {e}"]
|
| 328 |
+
|
| 329 |
+
# Draw circles for points
|
| 330 |
+
try:
|
| 331 |
+
for p in measure_points:
|
| 332 |
+
if 0 <= p[0] < image.shape[1] and 0 <= p[1] < image.shape[0]:
|
| 333 |
+
image = cv2.circle(image, p, radius=5, color=(255, 0, 0), thickness=2)
|
| 334 |
+
except Exception as e:
|
| 335 |
+
print(f"Drawing error: {e}")
|
| 336 |
+
return [None, [], f"Drawing error: {e}"]
|
| 337 |
+
|
| 338 |
+
# Get depth information from processed_data
|
| 339 |
+
depth_text = ""
|
| 340 |
+
try:
|
| 341 |
+
for i, p in enumerate(measure_points):
|
| 342 |
+
if (
|
| 343 |
+
current_view["depth"] is not None
|
| 344 |
+
and 0 <= p[1] < current_view["depth"].shape[0]
|
| 345 |
+
and 0 <= p[0] < current_view["depth"].shape[1]
|
| 346 |
+
):
|
| 347 |
+
d = current_view["depth"][p[1], p[0]]
|
| 348 |
+
depth_text += f"- **P{i + 1} depth: {d:.2f}m**\n"
|
| 349 |
+
else:
|
| 350 |
+
depth_text += f"- **P{i + 1}: Click position ({p[0]}, {p[1]}) - No depth information**\n" # noqa: E501
|
| 351 |
+
except Exception as e:
|
| 352 |
+
print(f"Depth text error: {e}")
|
| 353 |
+
depth_text = f"Error computing depth: {e}\n"
|
| 354 |
+
|
| 355 |
+
if len(measure_points) == 2:
|
| 356 |
+
try:
|
| 357 |
+
point1, point2 = measure_points
|
| 358 |
+
# Draw line
|
| 359 |
+
if (
|
| 360 |
+
0 <= point1[0] < image.shape[1]
|
| 361 |
+
and 0 <= point1[1] < image.shape[0]
|
| 362 |
+
and 0 <= point2[0] < image.shape[1]
|
| 363 |
+
and 0 <= point2[1] < image.shape[0]
|
| 364 |
+
):
|
| 365 |
+
image = cv2.line(image, point1, point2, color=(255, 0, 0), thickness=2)
|
| 366 |
+
|
| 367 |
+
# Compute 3D distance using depth information and camera intrinsics
|
| 368 |
+
distance_text = "- **Distance: Unable to calculate 3D distance**"
|
| 369 |
+
if (
|
| 370 |
+
current_view["depth"] is not None
|
| 371 |
+
and 0 <= point1[1] < current_view["depth"].shape[0]
|
| 372 |
+
and 0 <= point1[0] < current_view["depth"].shape[1]
|
| 373 |
+
and 0 <= point2[1] < current_view["depth"].shape[0]
|
| 374 |
+
and 0 <= point2[0] < current_view["depth"].shape[1]
|
| 375 |
+
):
|
| 376 |
+
try:
|
| 377 |
+
# Get depth values at the two points
|
| 378 |
+
d1 = current_view["depth"][point1[1], point1[0]]
|
| 379 |
+
d2 = current_view["depth"][point2[1], point2[0]]
|
| 380 |
+
|
| 381 |
+
# Convert 2D pixel coordinates to 3D world coordinates
|
| 382 |
+
if current_view["intrinsics"] is not None:
|
| 383 |
+
# Get camera intrinsics
|
| 384 |
+
K = current_view["intrinsics"] # 3x3 intrinsic matrix
|
| 385 |
+
fx, fy = K[0, 0], K[1, 1] # focal lengths
|
| 386 |
+
cx, cy = K[0, 2], K[1, 2] # principal point
|
| 387 |
+
|
| 388 |
+
# Convert pixel coordinates to normalized camera coordinates
|
| 389 |
+
# Point 1: (u1, v1) -> (x1, y1, z1)
|
| 390 |
+
u1, v1 = point1[0], point1[1]
|
| 391 |
+
x1 = (u1 - cx) * d1 / fx
|
| 392 |
+
y1 = (v1 - cy) * d1 / fy
|
| 393 |
+
z1 = d1
|
| 394 |
+
|
| 395 |
+
# Point 2: (u2, v2) -> (x2, y2, z2)
|
| 396 |
+
u2, v2 = point2[0], point2[1]
|
| 397 |
+
x2 = (u2 - cx) * d2 / fx
|
| 398 |
+
y2 = (v2 - cy) * d2 / fy
|
| 399 |
+
z2 = d2
|
| 400 |
+
|
| 401 |
+
# Calculate 3D Euclidean distance
|
| 402 |
+
p1_3d = np.array([x1, y1, z1])
|
| 403 |
+
p2_3d = np.array([x2, y2, z2])
|
| 404 |
+
distance_3d = np.linalg.norm(p1_3d - p2_3d)
|
| 405 |
+
|
| 406 |
+
distance_text = f"- **Distance: {distance_3d:.2f}m**"
|
| 407 |
+
else:
|
| 408 |
+
# Fallback to simplified calculation if no intrinsics
|
| 409 |
+
pixel_distance = np.sqrt(
|
| 410 |
+
(point1[0] - point2[0]) ** 2 + (point1[1] - point2[1]) ** 2
|
| 411 |
+
)
|
| 412 |
+
avg_depth = (d1 + d2) / 2
|
| 413 |
+
scale_factor = avg_depth / 1000 # Rough scaling factor
|
| 414 |
+
estimated_3d_distance = pixel_distance * scale_factor
|
| 415 |
+
distance_text = f"- **Distance: {estimated_3d_distance:.2f}m (estimated, no intrinsics)**" # noqa: E501
|
| 416 |
+
|
| 417 |
+
except Exception as e:
|
| 418 |
+
print(f"Distance computation error: {e}")
|
| 419 |
+
distance_text = f"- **Distance computation error: {e}**"
|
| 420 |
+
|
| 421 |
+
measure_points = []
|
| 422 |
+
text = depth_text + distance_text
|
| 423 |
+
print(f"Measurement complete: {text}")
|
| 424 |
+
return [image, depth_right_half, measure_points, text]
|
| 425 |
+
except Exception as e:
|
| 426 |
+
print(f"Final measurement error: {e}")
|
| 427 |
+
return [None, [], f"Measurement error: {e}"]
|
| 428 |
+
else:
|
| 429 |
+
print(f"Single point measurement: {depth_text}")
|
| 430 |
+
return [image, depth_right_half, measure_points, depth_text]
|
| 431 |
+
|
| 432 |
+
except Exception as e:
|
| 433 |
+
print(f"Overall measure function error: {e}")
|
| 434 |
+
return [None, [], f"Measure function error: {e}"]
|
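The distance computation above back-projects each clicked pixel into camera coordinates with the pinhole model x = (u - cx)·d/fx, y = (v - cy)·d/fy, z = d, then takes the Euclidean norm of the difference. A self-contained sketch of that formula with toy numbers (the intrinsics below are invented, not model outputs):

```python
import numpy as np


def pixel_pair_distance(K: np.ndarray, p1, p2, d1: float, d2: float) -> float:
    """3D distance between two pixels with known depths under a pinhole camera."""
    fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]

    def backproject(p, d):
        u, v = p
        return np.array([(u - cx) * d / fx, (v - cy) * d / fy, d])

    return float(np.linalg.norm(backproject(p1, d1) - backproject(p2, d2)))


# Toy 640x480 camera with a 500 px focal length (assumed values)
K = np.array([[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]])
print(pixel_pair_distance(K, (320, 240), (420, 240), 2.0, 2.0))  # 0.4 (metres)
```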
Depth-Anything-3-anysize/src/depth_anything_3/cfg.py
ADDED
|
@@ -0,0 +1,144 @@
|
| 1 |
+
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
"""
|
| 16 |
+
Configuration utility functions
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import importlib
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
from typing import Any, Callable, List, Optional, Union
|
| 22 |
+
from omegaconf import DictConfig, ListConfig, OmegaConf
|
| 23 |
+
|
| 24 |
+
try:
|
| 25 |
+
OmegaConf.register_new_resolver("eval", eval)
|
| 26 |
+
except Exception as e:
|
| 27 |
+
# the resolver may already be registered (e.g. on re-import); report and continue
|
| 28 |
+
print(f"Error registering eval resolver: {e}")
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def load_config(path: str, argv: Optional[List[str]] = None) -> Union[DictConfig, ListConfig]:
|
| 32 |
+
"""
|
| 33 |
+
Load a configuration. Will resolve inheritance.
|
| 34 |
+
Supports both file paths and module paths (e.g., depth_anything_3.configs.giant).
|
| 35 |
+
"""
|
| 36 |
+
# Check if path is a module path (contains dots but no slashes and doesn't end with .yaml)
|
| 37 |
+
if "." in path and "/" not in path and not path.endswith(".yaml"):
|
| 38 |
+
# It's a module path, load from package resources
|
| 39 |
+
path_parts = path.split(".")[1:]
|
| 40 |
+
config_path = Path(__file__).resolve().parent
|
| 41 |
+
for part in path_parts:
|
| 42 |
+
config_path = config_path.joinpath(part)
|
| 43 |
+
config_path = config_path.with_suffix(".yaml")
|
| 44 |
+
config = OmegaConf.load(str(config_path))
|
| 45 |
+
else:
|
| 46 |
+
# It's a file path (absolute, relative, or with .yaml extension)
|
| 47 |
+
config = OmegaConf.load(path)
|
| 48 |
+
|
| 49 |
+
if argv is not None:
|
| 50 |
+
config_argv = OmegaConf.from_dotlist(argv)
|
| 51 |
+
config = OmegaConf.merge(config, config_argv)
|
| 52 |
+
config = resolve_recursive(config, resolve_inheritance)
|
| 53 |
+
return config
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def resolve_recursive(
|
| 57 |
+
config: Any,
|
| 58 |
+
resolver: Callable[[Union[DictConfig, ListConfig]], Union[DictConfig, ListConfig]],
|
| 59 |
+
) -> Any:
|
| 60 |
+
config = resolver(config)
|
| 61 |
+
if isinstance(config, DictConfig):
|
| 62 |
+
for k in config.keys():
|
| 63 |
+
v = config.get(k)
|
| 64 |
+
if isinstance(v, (DictConfig, ListConfig)):
|
| 65 |
+
config[k] = resolve_recursive(v, resolver)
|
| 66 |
+
if isinstance(config, ListConfig):
|
| 67 |
+
for i in range(len(config)):
|
| 68 |
+
v = config.get(i)
|
| 69 |
+
if isinstance(v, (DictConfig, ListConfig)):
|
| 70 |
+
config[i] = resolve_recursive(v, resolver)
|
| 71 |
+
return config
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def resolve_inheritance(config: Union[DictConfig, ListConfig]) -> Any:
|
| 75 |
+
"""
|
| 76 |
+
Recursively resolve inheritance if the config contains:
|
| 77 |
+
__inherit__: path/to/parent.yaml or a ListConfig of such paths.
|
| 78 |
+
"""
|
| 79 |
+
if isinstance(config, DictConfig):
|
| 80 |
+
inherit = config.pop("__inherit__", None)
|
| 81 |
+
|
| 82 |
+
if inherit:
|
| 83 |
+
inherit_list = inherit if isinstance(inherit, ListConfig) else [inherit]
|
| 84 |
+
|
| 85 |
+
parent_config = None
|
| 86 |
+
for parent_path in inherit_list:
|
| 87 |
+
assert isinstance(parent_path, str)
|
| 88 |
+
parent_config = (
|
| 89 |
+
load_config(parent_path)
|
| 90 |
+
if parent_config is None
|
| 91 |
+
else OmegaConf.merge(parent_config, load_config(parent_path))
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
if len(config.keys()) > 0:
|
| 95 |
+
config = OmegaConf.merge(parent_config, config)
|
| 96 |
+
else:
|
| 97 |
+
config = parent_config
|
| 98 |
+
return config
|
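A minimal round trip through `__inherit__` and the dotlist overrides, written so it can run from a temporary directory. The keys `backbone` and `patch_size` are placeholders for illustration, not real config fields:

```python
import pathlib
import tempfile

from depth_anything_3.cfg import load_config  # path assumed

tmp = pathlib.Path(tempfile.mkdtemp())
(tmp / "parent.yaml").write_text("model:\n  backbone: vitl\n  patch_size: 14\n")
(tmp / "child.yaml").write_text(
    f"__inherit__: {tmp / 'parent.yaml'}\n"
    "model:\n"
    "  backbone: vitg\n"  # overrides the parent's value
)
cfg = load_config(str(tmp / "child.yaml"), argv=["model.patch_size=16"])
print(cfg.model.backbone, cfg.model.patch_size)  # vitg 16
```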
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def import_item(path: str, name: str) -> Any:
|
| 102 |
+
"""
|
| 103 |
+
Import a python item. Example: import_item("path.to.file", "MyClass") -> MyClass
|
| 104 |
+
"""
|
| 105 |
+
return getattr(importlib.import_module(path), name)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def create_object(config: DictConfig) -> Any:
|
| 109 |
+
"""
|
| 110 |
+
Create an object from config.
|
| 111 |
+
The config is expected to contain the following:
|
| 112 |
+
__object__:
|
| 113 |
+
path: path.to.module
|
| 114 |
+
name: MyClass
|
| 115 |
+
args: as_config | as_params (default to as_config)
|
| 116 |
+
"""
|
| 117 |
+
config = DictConfig(config)
|
| 118 |
+
item = import_item(
|
| 119 |
+
path=config.__object__.path,
|
| 120 |
+
name=config.__object__.name,
|
| 121 |
+
)
|
| 122 |
+
args = config.__object__.get("args", "as_config")
|
| 123 |
+
if args == "as_config":
|
| 124 |
+
return item(config)
|
| 125 |
+
if args == "as_params":
|
| 126 |
+
config = OmegaConf.to_object(config)
|
| 127 |
+
config.pop("__object__")
|
| 128 |
+
return item(**config)
|
| 129 |
+
raise NotImplementedError(f"Unknown args type: {args}")
|
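With `args: as_params`, `create_object` imports `path.name` and calls it with the remaining keys as keyword arguments. A tiny hedged example using a standard-library class so it runs anywhere:

```python
from omegaconf import OmegaConf

from depth_anything_3.cfg import create_object  # path assumed

cfg = OmegaConf.create(
    {
        "__object__": {"path": "collections", "name": "OrderedDict", "args": "as_params"},
        "alpha": 1,
        "beta": 2,
    }
)
obj = create_object(cfg)
print(type(obj).__name__, dict(obj))  # OrderedDict {'alpha': 1, 'beta': 2}
```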
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def create_dataset(path: str, *args, **kwargs) -> Any:
|
| 133 |
+
"""
|
| 134 |
+
Create a dataset. Requires the file to contain a "create_dataset" function.
|
| 135 |
+
"""
|
| 136 |
+
return import_item(path, "create_dataset")(*args, **kwargs)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def to_dict_recursive(config_obj):
|
| 140 |
+
if isinstance(config_obj, DictConfig):
|
| 141 |
+
return {k: to_dict_recursive(v) for k, v in config_obj.items()}
|
| 142 |
+
elif isinstance(config_obj, ListConfig):
|
| 143 |
+
return [to_dict_recursive(item) for item in config_obj]
|
| 144 |
+
return config_obj
|
Depth-Anything-3-anysize/src/depth_anything_3/cli.py
ADDED
|
@@ -0,0 +1,748 @@
|
| 1 |
+
# flake8: noqa: E402
|
| 2 |
+
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""
|
| 16 |
+
Refactored Depth Anything 3 CLI
|
| 17 |
+
Clean, modular command-line interface
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import os
|
| 23 |
+
from typing import Optional
|
| 24 |
+
import typer
|
| 25 |
+
|
| 26 |
+
from depth_anything_3.services import start_server
|
| 27 |
+
from depth_anything_3.services.gallery import gallery as gallery_main
|
| 28 |
+
from depth_anything_3.services.inference_service import run_inference
|
| 29 |
+
from depth_anything_3.services.input_handlers import (
|
| 30 |
+
ColmapHandler,
|
| 31 |
+
ImageHandler,
|
| 32 |
+
ImagesHandler,
|
| 33 |
+
InputHandler,
|
| 34 |
+
VideoHandler,
|
| 35 |
+
parse_export_feat,
|
| 36 |
+
)
|
| 37 |
+
from depth_anything_3.utils.constants import (
|
| 38 |
+
DEFAULT_EXPORT_DIR,
|
| 39 |
+
DEFAULT_GALLERY_DIR,
|
| 40 |
+
DEFAULT_GRADIO_DIR,
|
| 41 |
+
DEFAULT_MODEL,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
|
| 45 |
+
|
| 46 |
+
app = typer.Typer(help="Depth Anything 3 - Video depth estimation CLI", add_completion=False)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ============================================================================
|
| 50 |
+
# Input type detection utilities
|
| 51 |
+
# ============================================================================
|
| 52 |
+
|
| 53 |
+
# Supported file extensions
|
| 54 |
+
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff", ".tif"}
|
| 55 |
+
VIDEO_EXTENSIONS = {".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".webm", ".m4v"}
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def detect_input_type(input_path: str) -> str:
|
| 59 |
+
"""
|
| 60 |
+
Detect input type from path.
|
| 61 |
+
|
| 62 |
+
Returns:
|
| 63 |
+
- "image": Single image file
|
| 64 |
+
- "images": Directory containing images
|
| 65 |
+
- "video": Video file
|
| 66 |
+
- "colmap": COLMAP directory structure
|
| 67 |
+
- "unknown": Cannot determine type
|
| 68 |
+
"""
|
| 69 |
+
if not os.path.exists(input_path):
|
| 70 |
+
return "unknown"
|
| 71 |
+
|
| 72 |
+
# Check if it's a file
|
| 73 |
+
if os.path.isfile(input_path):
|
| 74 |
+
ext = os.path.splitext(input_path)[1].lower()
|
| 75 |
+
if ext in IMAGE_EXTENSIONS:
|
| 76 |
+
return "image"
|
| 77 |
+
elif ext in VIDEO_EXTENSIONS:
|
| 78 |
+
return "video"
|
| 79 |
+
return "unknown"
|
| 80 |
+
|
| 81 |
+
# Check if it's a directory
|
| 82 |
+
if os.path.isdir(input_path):
|
| 83 |
+
# Check for COLMAP structure
|
| 84 |
+
images_dir = os.path.join(input_path, "images")
|
| 85 |
+
sparse_dir = os.path.join(input_path, "sparse")
|
| 86 |
+
|
| 87 |
+
if os.path.isdir(images_dir) and os.path.isdir(sparse_dir):
|
| 88 |
+
return "colmap"
|
| 89 |
+
|
| 90 |
+
# Check if directory contains image files
|
| 91 |
+
for item in os.listdir(input_path):
|
| 92 |
+
item_path = os.path.join(input_path, item)
|
| 93 |
+
if os.path.isfile(item_path):
|
| 94 |
+
ext = os.path.splitext(item)[1].lower()
|
| 95 |
+
if ext in IMAGE_EXTENSIONS:
|
| 96 |
+
return "images"
|
| 97 |
+
|
| 98 |
+
return "unknown"
|
| 99 |
+
|
| 100 |
+
return "unknown"
|
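Because detection is purely filesystem-based, it is easy to probe. The paths below are hypothetical and must exist for anything other than "unknown" to be returned:

```python
from depth_anything_3.cli import detect_input_type  # path assumed

for path in ["photo.jpg", "clip.mp4", "frames/", "scan/"]:
    print(path, "->", detect_input_type(path))
# photo.jpg -> image          (existing image file)
# clip.mp4  -> video          (existing video file)
# frames/   -> images         (directory that contains image files)
# scan/     -> colmap         (directory with images/ and sparse/ subdirectories)
```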
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# ============================================================================
|
| 104 |
+
# Common parameters and configuration
|
| 105 |
+
# ============================================================================
|
| 106 |
+
|
| 107 |
+
# ============================================================================
|
| 108 |
+
# Inference commands
|
| 109 |
+
# ============================================================================
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@app.command()
|
| 113 |
+
def auto(
|
| 114 |
+
input_path: str = typer.Argument(
|
| 115 |
+
..., help="Path to input (image, directory, video, or COLMAP)"
|
| 116 |
+
),
|
| 117 |
+
model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
|
| 118 |
+
export_dir: str = typer.Option(DEFAULT_EXPORT_DIR, help="Export directory"),
|
| 119 |
+
export_format: str = typer.Option("glb", help="Export format"),
|
| 120 |
+
device: str = typer.Option("cuda", help="Device to use"),
|
| 121 |
+
use_backend: bool = typer.Option(False, help="Use backend service for inference"),
|
| 122 |
+
backend_url: str = typer.Option(
|
| 123 |
+
"http://localhost:8008", help="Backend URL (default: http://localhost:8008)"
|
| 124 |
+
),
|
| 125 |
+
process_res: Optional[int] = typer.Option(
|
| 126 |
+
None, help="Processing resolution; None keeps original size"
|
| 127 |
+
),
|
| 128 |
+
process_res_method: str = typer.Option("keep", help="Processing resolution method"),
|
| 129 |
+
export_feat: str = typer.Option(
|
| 130 |
+
"",
|
| 131 |
+
help="[FEAT_VIS]Export features from specified layers using comma-separated indices (e.g., '0,1,2').",
|
| 132 |
+
),
|
| 133 |
+
auto_cleanup: bool = typer.Option(
|
| 134 |
+
False, help="Automatically clean export directory if it exists (no prompt)"
|
| 135 |
+
),
|
| 136 |
+
# Video-specific options
|
| 137 |
+
fps: float = typer.Option(1.0, help="[Video] Sampling FPS for frame extraction"),
|
| 138 |
+
# COLMAP-specific options
|
| 139 |
+
sparse_subdir: str = typer.Option(
|
| 140 |
+
"", help="[COLMAP] Sparse reconstruction subdirectory (e.g., '0' for sparse/0/)"
|
| 141 |
+
),
|
| 142 |
+
align_to_input_ext_scale: bool = typer.Option(
|
| 143 |
+
True, help="[COLMAP] Align prediction to input extrinsics scale"
|
| 144 |
+
),
|
| 145 |
+
# GLB export options
|
| 146 |
+
conf_thresh_percentile: float = typer.Option(
|
| 147 |
+
40.0, help="[GLB] Lower percentile for adaptive confidence threshold"
|
| 148 |
+
),
|
| 149 |
+
num_max_points: int = typer.Option(
|
| 150 |
+
1_000_000, help="[GLB] Maximum number of points in the point cloud"
|
| 151 |
+
),
|
| 152 |
+
show_cameras: bool = typer.Option(
|
| 153 |
+
True, help="[GLB] Show camera wireframes in the exported scene"
|
| 154 |
+
),
|
| 155 |
+
# Feat_vis export options
|
| 156 |
+
feat_vis_fps: int = typer.Option(15, help="[FEAT_VIS] Frame rate for output video"),
|
| 157 |
+
):
|
| 158 |
+
"""
|
| 159 |
+
Automatically detect input type and run appropriate processing.
|
| 160 |
+
|
| 161 |
+
Supports:
|
| 162 |
+
- Single image file (.jpg, .png, etc.)
|
| 163 |
+
- Directory of images
|
| 164 |
+
- Video file (.mp4, .avi, etc.)
|
| 165 |
+
- COLMAP directory (with 'images' and 'sparse' subdirectories)
|
| 166 |
+
"""
|
| 167 |
+
# Detect input type
|
| 168 |
+
input_type = detect_input_type(input_path)
|
| 169 |
+
|
| 170 |
+
if input_type == "unknown":
|
| 171 |
+
typer.echo(f"❌ Error: Cannot determine input type for: {input_path}", err=True)
|
| 172 |
+
typer.echo("Supported inputs:", err=True)
|
| 173 |
+
typer.echo(" - Single image file (.jpg, .png, etc.)", err=True)
|
| 174 |
+
typer.echo(" - Directory containing images", err=True)
|
| 175 |
+
typer.echo(" - Video file (.mp4, .avi, etc.)", err=True)
|
| 176 |
+
typer.echo(" - COLMAP directory (with 'images/' and 'sparse/' subdirectories)", err=True)
|
| 177 |
+
raise typer.Exit(1)
|
| 178 |
+
|
| 179 |
+
# Display detected type
|
| 180 |
+
typer.echo(f"🔍 Detected input type: {input_type.upper()}")
|
| 181 |
+
typer.echo(f"📁 Input path: {input_path}")
|
| 182 |
+
typer.echo()
|
| 183 |
+
|
| 184 |
+
# Determine backend URL based on use_backend flag
|
| 185 |
+
final_backend_url = backend_url if use_backend else None
|
| 186 |
+
|
| 187 |
+
# Parse export_feat parameter
|
| 188 |
+
export_feat_layers = parse_export_feat(export_feat)
|
| 189 |
+
|
| 190 |
+
# Route to appropriate handler
|
| 191 |
+
if input_type == "image":
|
| 192 |
+
typer.echo("Processing single image...")
|
| 193 |
+
# Process input
|
| 194 |
+
image_files = ImageHandler.process(input_path)
|
| 195 |
+
|
| 196 |
+
# Handle export directory
|
| 197 |
+
export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
|
| 198 |
+
|
| 199 |
+
# Run inference
|
| 200 |
+
run_inference(
|
| 201 |
+
image_paths=image_files,
|
| 202 |
+
export_dir=export_dir,
|
| 203 |
+
model_dir=model_dir,
|
| 204 |
+
device=device,
|
| 205 |
+
backend_url=final_backend_url,
|
| 206 |
+
export_format=export_format,
|
| 207 |
+
process_res=process_res,
|
| 208 |
+
process_res_method=process_res_method,
|
| 209 |
+
export_feat_layers=export_feat_layers,
|
| 210 |
+
conf_thresh_percentile=conf_thresh_percentile,
|
| 211 |
+
num_max_points=num_max_points,
|
| 212 |
+
show_cameras=show_cameras,
|
| 213 |
+
feat_vis_fps=feat_vis_fps,
|
| 214 |
+
)
|
| 215 |
+
|
| 216 |
+
elif input_type == "images":
|
| 217 |
+
typer.echo("Processing directory of images...")
|
| 218 |
+
# Process input - use default extensions
|
| 219 |
+
image_files = ImagesHandler.process(input_path, "png,jpg,jpeg")
|
| 220 |
+
|
| 221 |
+
# Handle export directory
|
| 222 |
+
export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
|
| 223 |
+
|
| 224 |
+
# Run inference
|
| 225 |
+
run_inference(
|
| 226 |
+
image_paths=image_files,
|
| 227 |
+
export_dir=export_dir,
|
| 228 |
+
model_dir=model_dir,
|
| 229 |
+
device=device,
|
| 230 |
+
backend_url=final_backend_url,
|
| 231 |
+
export_format=export_format,
|
| 232 |
+
process_res=process_res,
|
| 233 |
+
process_res_method=process_res_method,
|
| 234 |
+
export_feat_layers=export_feat_layers,
|
| 235 |
+
conf_thresh_percentile=conf_thresh_percentile,
|
| 236 |
+
num_max_points=num_max_points,
|
| 237 |
+
show_cameras=show_cameras,
|
| 238 |
+
feat_vis_fps=feat_vis_fps,
|
| 239 |
+
)
|
| 240 |
+
|
| 241 |
+
elif input_type == "video":
|
| 242 |
+
typer.echo(f"Processing video with FPS={fps}...")
|
| 243 |
+
# Handle export directory
|
| 244 |
+
export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
|
| 245 |
+
|
| 246 |
+
# Process input
|
| 247 |
+
image_files = VideoHandler.process(input_path, export_dir, fps)
|
| 248 |
+
|
| 249 |
+
# Run inference
|
| 250 |
+
run_inference(
|
| 251 |
+
image_paths=image_files,
|
| 252 |
+
export_dir=export_dir,
|
| 253 |
+
model_dir=model_dir,
|
| 254 |
+
device=device,
|
| 255 |
+
backend_url=final_backend_url,
|
| 256 |
+
export_format=export_format,
|
| 257 |
+
process_res=process_res,
|
| 258 |
+
process_res_method=process_res_method,
|
| 259 |
+
export_feat_layers=export_feat_layers,
|
| 260 |
+
conf_thresh_percentile=conf_thresh_percentile,
|
| 261 |
+
num_max_points=num_max_points,
|
| 262 |
+
show_cameras=show_cameras,
|
| 263 |
+
feat_vis_fps=feat_vis_fps,
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
elif input_type == "colmap":
|
| 267 |
+
typer.echo(
|
| 268 |
+
f"Processing COLMAP directory (sparse subdirectory: '{sparse_subdir or 'default'}')..."
|
| 269 |
+
)
|
| 270 |
+
# Process input
|
| 271 |
+
image_files, extrinsics, intrinsics = ColmapHandler.process(input_path, sparse_subdir)
|
| 272 |
+
|
| 273 |
+
# Handle export directory
|
| 274 |
+
export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)
|
| 275 |
+
|
| 276 |
+
# Run inference
|
| 277 |
+
run_inference(
|
| 278 |
+
image_paths=image_files,
|
| 279 |
+
export_dir=export_dir,
|
| 280 |
+
model_dir=model_dir,
|
| 281 |
+
device=device,
|
| 282 |
+
backend_url=final_backend_url,
|
| 283 |
+
export_format=export_format,
|
| 284 |
+
process_res=process_res,
|
| 285 |
+
process_res_method=process_res_method,
|
| 286 |
+
export_feat_layers=export_feat_layers,
|
| 287 |
+
extrinsics=extrinsics,
|
| 288 |
+
intrinsics=intrinsics,
|
| 289 |
+
align_to_input_ext_scale=align_to_input_ext_scale,
|
| 290 |
+
conf_thresh_percentile=conf_thresh_percentile,
|
| 291 |
+
num_max_points=num_max_points,
|
| 292 |
+
show_cameras=show_cameras,
|
| 293 |
+
feat_vis_fps=feat_vis_fps,
|
| 294 |
+
)
|
| 295 |
+
|
| 296 |
+
typer.echo()
|
| 297 |
+
typer.echo("✅ Processing completed successfully!")


@app.command()
def image(
    image_path: str = typer.Argument(..., help="Path to input image file"),
    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
    export_dir: str = typer.Option(DEFAULT_EXPORT_DIR, help="Export directory"),
    export_format: str = typer.Option("glb", help="Export format"),
    device: str = typer.Option("cuda", help="Device to use"),
    use_backend: bool = typer.Option(False, help="Use backend service for inference"),
    backend_url: str = typer.Option(
        "http://localhost:8008", help="Backend URL (default: http://localhost:8008)"
    ),
    process_res: Optional[int] = typer.Option(
        None, help="Processing resolution; None keeps original size"
    ),
    process_res_method: str = typer.Option("keep", help="Processing resolution method"),
    export_feat: str = typer.Option(
        "",
        help="[FEAT_VIS] Export features from specified layers using comma-separated indices (e.g., '0,1,2').",
    ),
    auto_cleanup: bool = typer.Option(
        False, help="Automatically clean export directory if it exists (no prompt)"
    ),
    # GLB export options
    conf_thresh_percentile: float = typer.Option(
        40.0, help="[GLB] Lower percentile for adaptive confidence threshold"
    ),
    num_max_points: int = typer.Option(
        1_000_000, help="[GLB] Maximum number of points in the point cloud"
    ),
    show_cameras: bool = typer.Option(
        True, help="[GLB] Show camera wireframes in the exported scene"
    ),
    # Feat_vis export options
    feat_vis_fps: int = typer.Option(15, help="[FEAT_VIS] Frame rate for output video"),
):
    """Run camera pose and depth estimation on a single image."""
    # Process input
    image_files = ImageHandler.process(image_path)

    # Handle export directory
    export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)

    # Parse export_feat parameter
    export_feat_layers = parse_export_feat(export_feat)

    # Determine backend URL based on use_backend flag
    final_backend_url = backend_url if use_backend else None

    # Run inference
    run_inference(
        image_paths=image_files,
        export_dir=export_dir,
        model_dir=model_dir,
        device=device,
        backend_url=final_backend_url,
        export_format=export_format,
        process_res=process_res,
        process_res_method=process_res_method,
        export_feat_layers=export_feat_layers,
        conf_thresh_percentile=conf_thresh_percentile,
        num_max_points=num_max_points,
        show_cameras=show_cameras,
        feat_vis_fps=feat_vis_fps,
    )


@app.command()
def images(
    images_dir: str = typer.Argument(..., help="Path to directory containing input images"),
    image_extensions: str = typer.Option(
        "png,jpg,jpeg", help="Comma-separated image file extensions to process"
    ),
    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
    export_dir: str = typer.Option(DEFAULT_EXPORT_DIR, help="Export directory"),
    export_format: str = typer.Option("glb", help="Export format"),
    device: str = typer.Option("cuda", help="Device to use"),
    use_backend: bool = typer.Option(False, help="Use backend service for inference"),
    backend_url: str = typer.Option(
        "http://localhost:8008", help="Backend URL (default: http://localhost:8008)"
    ),
    process_res: Optional[int] = typer.Option(
        None, help="Processing resolution; None keeps original size"
    ),
    process_res_method: str = typer.Option("keep", help="Processing resolution method"),
    export_feat: str = typer.Option(
        "",
        help="[FEAT_VIS] Export features from specified layers using comma-separated indices (e.g., '0,1,2').",
    ),
    auto_cleanup: bool = typer.Option(
        False, help="Automatically clean export directory if it exists (no prompt)"
    ),
    # GLB export options
    conf_thresh_percentile: float = typer.Option(
        40.0, help="[GLB] Lower percentile for adaptive confidence threshold"
    ),
    num_max_points: int = typer.Option(
        1_000_000, help="[GLB] Maximum number of points in the point cloud"
    ),
    show_cameras: bool = typer.Option(
        True, help="[GLB] Show camera wireframes in the exported scene"
    ),
    # Feat_vis export options
    feat_vis_fps: int = typer.Option(15, help="[FEAT_VIS] Frame rate for output video"),
):
    """Run camera pose and depth estimation on a directory of images."""
    # Process input
    image_files = ImagesHandler.process(images_dir, image_extensions)

    # Handle export directory
    export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)

    # Parse export_feat parameter
    export_feat_layers = parse_export_feat(export_feat)

    # Determine backend URL based on use_backend flag
    final_backend_url = backend_url if use_backend else None

    # Run inference
    run_inference(
        image_paths=image_files,
        export_dir=export_dir,
        model_dir=model_dir,
        device=device,
        backend_url=final_backend_url,
        export_format=export_format,
        process_res=process_res,
        process_res_method=process_res_method,
        export_feat_layers=export_feat_layers,
        conf_thresh_percentile=conf_thresh_percentile,
        num_max_points=num_max_points,
        show_cameras=show_cameras,
        feat_vis_fps=feat_vis_fps,
    )


@app.command()
def colmap(
    colmap_dir: str = typer.Argument(
        ..., help="Path to COLMAP directory containing 'images' and 'sparse' subdirectories"
    ),
    sparse_subdir: str = typer.Option(
        "", help="Sparse reconstruction subdirectory (e.g., '0' for sparse/0/, empty for sparse/)"
    ),
    align_to_input_ext_scale: bool = typer.Option(
        True, help="Align prediction to input extrinsics scale"
    ),
    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
    export_dir: str = typer.Option(DEFAULT_EXPORT_DIR, help="Export directory"),
    export_format: str = typer.Option("glb", help="Export format"),
    device: str = typer.Option("cuda", help="Device to use"),
    use_backend: bool = typer.Option(False, help="Use backend service for inference"),
    backend_url: str = typer.Option(
        "http://localhost:8008", help="Backend URL (default: http://localhost:8008)"
    ),
    process_res: Optional[int] = typer.Option(
        None, help="Processing resolution; None keeps original size"
    ),
    process_res_method: str = typer.Option("keep", help="Processing resolution method"),
    export_feat: str = typer.Option(
        "",
        help="Export features from specified layers using comma-separated indices (e.g., '0,1,2').",
    ),
    auto_cleanup: bool = typer.Option(
        False, help="Automatically clean export directory if it exists (no prompt)"
    ),
    # GLB export options
    conf_thresh_percentile: float = typer.Option(
        40.0, help="[GLB] Lower percentile for adaptive confidence threshold"
    ),
    num_max_points: int = typer.Option(
        1_000_000, help="[GLB] Maximum number of points in the point cloud"
    ),
    show_cameras: bool = typer.Option(
        True, help="[GLB] Show camera wireframes in the exported scene"
    ),
    # Feat_vis export options
    feat_vis_fps: int = typer.Option(15, help="[FEAT_VIS] Frame rate for output video"),
):
    """Run pose conditioned depth estimation on COLMAP data."""
    # Process input
    image_files, extrinsics, intrinsics = ColmapHandler.process(colmap_dir, sparse_subdir)

    # Handle export directory
    export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)

    # Parse export_feat parameter
    export_feat_layers = parse_export_feat(export_feat)

    # Determine backend URL based on use_backend flag
    final_backend_url = backend_url if use_backend else None

    # Run inference
    run_inference(
        image_paths=image_files,
        export_dir=export_dir,
        model_dir=model_dir,
        device=device,
        backend_url=final_backend_url,
        export_format=export_format,
        process_res=process_res,
        process_res_method=process_res_method,
        export_feat_layers=export_feat_layers,
        extrinsics=extrinsics,
        intrinsics=intrinsics,
        align_to_input_ext_scale=align_to_input_ext_scale,
        conf_thresh_percentile=conf_thresh_percentile,
        num_max_points=num_max_points,
        show_cameras=show_cameras,
        feat_vis_fps=feat_vis_fps,
    )


@app.command()
def video(
    video_path: str = typer.Argument(..., help="Path to input video file"),
    fps: float = typer.Option(1.0, help="Sampling FPS for frame extraction"),
    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
    export_dir: str = typer.Option(DEFAULT_EXPORT_DIR, help="Export directory"),
    export_format: str = typer.Option("glb", help="Export format"),
    device: str = typer.Option("cuda", help="Device to use"),
    use_backend: bool = typer.Option(False, help="Use backend service for inference"),
    backend_url: str = typer.Option(
        "http://localhost:8008", help="Backend URL (default: http://localhost:8008)"
    ),
    process_res: Optional[int] = typer.Option(
        None, help="Processing resolution; None keeps original size"
    ),
    process_res_method: str = typer.Option("keep", help="Processing resolution method"),
    export_feat: str = typer.Option(
        "",
        help="[FEAT_VIS] Export features from specified layers using comma-separated indices (e.g., '0,1,2').",
    ),
    auto_cleanup: bool = typer.Option(
        False, help="Automatically clean export directory if it exists (no prompt)"
    ),
    # GLB export options
    conf_thresh_percentile: float = typer.Option(
        40.0, help="[GLB] Lower percentile for adaptive confidence threshold"
    ),
    num_max_points: int = typer.Option(
        1_000_000, help="[GLB] Maximum number of points in the point cloud"
    ),
    show_cameras: bool = typer.Option(
        True, help="[GLB] Show camera wireframes in the exported scene"
    ),
    # Feat_vis export options
    feat_vis_fps: int = typer.Option(15, help="[FEAT_VIS] Frame rate for output video"),
):
    """Run depth estimation on video by extracting frames and processing them."""
    # Handle export directory
    export_dir = InputHandler.handle_export_dir(export_dir, auto_cleanup)

    # Process input
    image_files = VideoHandler.process(video_path, export_dir, fps)

    # Parse export_feat parameter
    export_feat_layers = parse_export_feat(export_feat)

    # Determine backend URL based on use_backend flag
    final_backend_url = backend_url if use_backend else None

    # Run inference
    run_inference(
        image_paths=image_files,
        export_dir=export_dir,
        model_dir=model_dir,
        device=device,
        backend_url=final_backend_url,
        export_format=export_format,
        process_res=process_res,
        process_res_method=process_res_method,
        export_feat_layers=export_feat_layers,
        conf_thresh_percentile=conf_thresh_percentile,
        num_max_points=num_max_points,
        show_cameras=show_cameras,
        feat_vis_fps=feat_vis_fps,
    )


# ============================================================================
# Service management commands
# ============================================================================


@app.command()
def backend(
    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
    device: str = typer.Option("cuda", help="Device to use"),
    host: str = typer.Option("127.0.0.1", help="Host to bind to"),
    port: int = typer.Option(8008, help="Port to bind to"),
    gallery_dir: str = typer.Option(DEFAULT_GALLERY_DIR, help="Gallery directory path (optional)"),
):
    """Start model backend service with integrated gallery."""
    typer.echo("=" * 60)
    typer.echo("🚀 Starting Depth Anything 3 Backend Server")
    typer.echo("=" * 60)
    typer.echo(f"Model directory: {model_dir}")
    typer.echo(f"Device: {device}")

    # Check if gallery directory exists
    if gallery_dir and os.path.exists(gallery_dir):
        typer.echo(f"Gallery directory: {gallery_dir}")
    else:
        gallery_dir = None  # Disable gallery if directory doesn't exist

    typer.echo()
    typer.echo("📡 Server URLs (Ctrl/CMD+Click to open):")
    typer.echo(f" 🏠 Home: http://{host}:{port}")
    typer.echo(f" 📊 Dashboard: http://{host}:{port}/dashboard")
    typer.echo(f" 📈 API Status: http://{host}:{port}/status")

    if gallery_dir:
        typer.echo(f" 🎨 Gallery: http://{host}:{port}/gallery/")

    typer.echo("=" * 60)

    try:
        start_server(model_dir, device, host, port, gallery_dir)
    except KeyboardInterrupt:
        typer.echo("\n👋 Backend server stopped.")
    except Exception as e:
        typer.echo(f"❌ Failed to start backend: {e}")
        raise typer.Exit(1)


# ============================================================================
# Application launch commands
# ============================================================================


@app.command()
def gradio(
    model_dir: str = typer.Option(DEFAULT_MODEL, help="Model directory path"),
    workspace_dir: str = typer.Option(DEFAULT_GRADIO_DIR, help="Workspace directory path"),
    gallery_dir: str = typer.Option(DEFAULT_GALLERY_DIR, help="Gallery directory path"),
    host: str = typer.Option("127.0.0.1", help="Host address to bind to"),
    port: int = typer.Option(7860, help="Port number to bind to"),
    share: bool = typer.Option(False, help="Create a public link for the app"),
    debug: bool = typer.Option(False, help="Enable debug mode"),
    cache_examples: bool = typer.Option(
        False, help="Pre-cache all example scenes at startup for faster loading"
    ),
    cache_gs_tag: str = typer.Option(
        "",
        help="Tag to match scene names for high-res+3DGS caching (e.g., 'dl3dv'). Scenes containing this tag will use high_res and infer_gs=True; others will use low_res only.",
    ),
):
    """Launch Depth Anything 3 Gradio interactive web application"""
    from depth_anything_3.app.gradio_app import DepthAnything3App

    # Create necessary directories
    os.makedirs(workspace_dir, exist_ok=True)
    os.makedirs(gallery_dir, exist_ok=True)

    typer.echo("Launching Depth Anything 3 Gradio application...")
    typer.echo(f"Model directory: {model_dir}")
    typer.echo(f"Workspace directory: {workspace_dir}")
    typer.echo(f"Gallery directory: {gallery_dir}")
    typer.echo(f"Host: {host}")
    typer.echo(f"Port: {port}")
    typer.echo(f"Share: {share}")
    typer.echo(f"Debug mode: {debug}")
    typer.echo(f"Cache examples: {cache_examples}")
    if cache_examples:
        if cache_gs_tag:
            typer.echo(
                f"Cache GS Tag: '{cache_gs_tag}' (scenes matching this tag will use high-res + 3DGS)"
            )
        else:
            typer.echo("Cache GS Tag: None (all scenes will use low-res only)")

    try:
        # Initialize and launch application
        app = DepthAnything3App(
            model_dir=model_dir, workspace_dir=workspace_dir, gallery_dir=gallery_dir
        )

        # Pre-cache examples if requested
        if cache_examples:
            typer.echo("\n" + "=" * 60)
            typer.echo("Pre-caching mode enabled")
            if cache_gs_tag:
                typer.echo(f"Scenes containing '{cache_gs_tag}' will use HIGH-RES + 3DGS")
                typer.echo("Other scenes will use LOW-RES only")
            else:
                typer.echo("All scenes will use LOW-RES only")
            typer.echo("=" * 60)
            app.cache_examples(
                show_cam=True,
                filter_black_bg=False,
                filter_white_bg=False,
                save_percentage=20.0,
                num_max_points=1000,
                cache_gs_tag=cache_gs_tag,
                gs_trj_mode="smooth",
                gs_video_quality="low",
            )

        # Prepare launch arguments
        launch_kwargs = {"share": share, "debug": debug}

        app.launch(host=host, port=port, **launch_kwargs)

    except KeyboardInterrupt:
        typer.echo("\nGradio application stopped.")
    except Exception as e:
        typer.echo(f"Failed to launch Gradio application: {e}")
        raise typer.Exit(1)


@app.command()
def gallery(
    gallery_dir: str = typer.Option(DEFAULT_GALLERY_DIR, help="Gallery root directory"),
    host: str = typer.Option("127.0.0.1", help="Host address to bind to"),
    port: int = typer.Option(8007, help="Port number to bind to"),
    open_browser: bool = typer.Option(False, help="Open browser after launch"),
):
    """Launch Depth Anything 3 Gallery server"""

    # Validate gallery directory
    if not os.path.exists(gallery_dir):
        raise typer.BadParameter(f"Gallery directory not found: {gallery_dir}")

    typer.echo("Launching Depth Anything 3 Gallery server...")
    typer.echo(f"Gallery directory: {gallery_dir}")
    typer.echo(f"Host: {host}")
    typer.echo(f"Port: {port}")
    typer.echo(f"Auto-open browser: {open_browser}")

    try:
        # Set command line arguments
        import sys

        sys.argv = ["gallery", "--dir", gallery_dir, "--host", host, "--port", str(port)]
        if open_browser:
            sys.argv.append("--open")

        # Launch gallery server
        gallery_main()

    except KeyboardInterrupt:
        typer.echo("\nGallery server stopped.")
    except Exception as e:
        typer.echo(f"Failed to launch Gallery server: {e}")
        raise typer.Exit(1)


if __name__ == "__main__":
    app()
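
A minimal, hedged sketch of driving this CLI in-process with Typer's test runner; the input path, the flag values, and the availability of a checkpoint under the default model directory are assumptions, not part of the commit.

from typer.testing import CliRunner

from depth_anything_3.cli import app

runner = CliRunner()
# Roughly equivalent to invoking the `auto` command from a shell on a video input;
# "scene.mp4" is a placeholder path.
result = runner.invoke(app, ["auto", "scene.mp4", "--fps", "2", "--export-format", "glb"])
print(result.exit_code, result.output)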

Depth-Anything-3-anysize/src/depth_anything_3/configs/da3-base.yaml
ADDED
@@ -0,0 +1,45 @@
__object__:
  path: depth_anything_3.model.da3
  name: DepthAnything3Net
  args: as_params

net:
  __object__:
    path: depth_anything_3.model.dinov2.dinov2
    name: DinoV2
    args: as_params

  name: vitb
  out_layers: [5, 7, 9, 11]
  alt_start: 4
  qknorm_start: 4
  rope_start: 4
  cat_token: True

head:
  __object__:
    path: depth_anything_3.model.dualdpt
    name: DualDPT
    args: as_params

  dim_in: &head_dim_in 1536
  output_dim: 2
  features: &head_features 128
  out_channels: &head_out_channels [96, 192, 384, 768]


cam_enc:
  __object__:
    path: depth_anything_3.model.cam_enc
    name: CameraEnc
    args: as_params

  dim_out: 768

cam_dec:
  __object__:
    path: depth_anything_3.model.cam_dec
    name: CameraDec
    args: as_params

  dim_in: 1536
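
The `__object__` blocks above follow the project's config convention, which is resolved by depth_anything_3.cfg.create_object (not part of this excerpt): `path` and `name` identify the class to import and, with `args: as_params`, the sibling keys become constructor keyword arguments. A hedged re-implementation sketch of that convention, for orientation only:

import importlib

def build_from_object_cfg(cfg: dict):
    # Hedged sketch only; the real resolver lives in depth_anything_3.cfg.create_object.
    meta = cfg["__object__"]                 # {"path": ..., "name": ..., "args": "as_params"}
    cls = getattr(importlib.import_module(meta["path"]), meta["name"])
    params = {k: v for k, v in cfg.items() if k != "__object__"}
    return cls(**params)                     # e.g. DualDPT(dim_in=1536, output_dim=2, ...)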

Depth-Anything-3-anysize/src/depth_anything_3/configs/da3-giant.yaml
ADDED
@@ -0,0 +1,71 @@
__object__:
  path: depth_anything_3.model.da3
  name: DepthAnything3Net
  args: as_params

net:
  __object__:
    path: depth_anything_3.model.dinov2.dinov2
    name: DinoV2
    args: as_params

  name: vitg
  out_layers: [19, 27, 33, 39]
  alt_start: 13
  qknorm_start: 13
  rope_start: 13
  cat_token: True

head:
  __object__:
    path: depth_anything_3.model.dualdpt
    name: DualDPT
    args: as_params

  dim_in: &head_dim_in 3072
  output_dim: 2
  features: &head_features 256
  out_channels: &head_out_channels [256, 512, 1024, 1024]


cam_enc:
  __object__:
    path: depth_anything_3.model.cam_enc
    name: CameraEnc
    args: as_params

  dim_out: 1536

cam_dec:
  __object__:
    path: depth_anything_3.model.cam_dec
    name: CameraDec
    args: as_params

  dim_in: 3072


gs_head:
  __object__:
    path: depth_anything_3.model.gsdpt
    name: GSDPT
    args: as_params

  dim_in: *head_dim_in
  output_dim: 38  # should align with gs_adapter's setting, for gs params
  features: *head_features
  out_channels: *head_out_channels


gs_adapter:
  __object__:
    path: depth_anything_3.model.gs_adapter
    name: GaussianAdapter
    args: as_params

  sh_degree: 2
  pred_color: false  # predict SH coefficient if false
  pred_offset_depth: true
  pred_offset_xy: true
  gaussian_scale_min: 1e-5
  gaussian_scale_max: 30.0
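
The comment on `gs_head.output_dim` says it must match the adapter, and model/da3.py checks `output_dim == gs_adapter.d_in + 1`. One plausible accounting for 38 channels with `sh_degree: 2` and both offsets enabled is sketched below; the exact per-Gaussian layout of GaussianAdapter is an assumption here, since gs_adapter.py is outside this excerpt.

sh_degree = 2
sh_coeffs = 3 * (sh_degree + 1) ** 2          # 27 spherical-harmonic color coefficients
xy_offset, depth_offset = 2, 1                # pred_offset_xy / pred_offset_depth
scales, rotation = 3, 4                       # anisotropic scale + quaternion
d_in = xy_offset + depth_offset + scales + rotation + sh_coeffs  # 37 (assumed layout)
output_dim = d_in + 1                         # plus one density/confidence channel -> 38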

Depth-Anything-3-anysize/src/depth_anything_3/configs/da3-large.yaml
ADDED
@@ -0,0 +1,45 @@
__object__:
  path: depth_anything_3.model.da3
  name: DepthAnything3Net
  args: as_params

net:
  __object__:
    path: depth_anything_3.model.dinov2.dinov2
    name: DinoV2
    args: as_params

  name: vitl
  out_layers: [11, 15, 19, 23]
  alt_start: 8
  qknorm_start: 8
  rope_start: 8
  cat_token: True

head:
  __object__:
    path: depth_anything_3.model.dualdpt
    name: DualDPT
    args: as_params

  dim_in: &head_dim_in 2048
  output_dim: 2
  features: &head_features 256
  out_channels: &head_out_channels [256, 512, 1024, 1024]


cam_enc:
  __object__:
    path: depth_anything_3.model.cam_enc
    name: CameraEnc
    args: as_params

  dim_out: 1024

cam_dec:
  __object__:
    path: depth_anything_3.model.cam_dec
    name: CameraDec
    args: as_params

  dim_in: 2048

Depth-Anything-3-anysize/src/depth_anything_3/configs/da3-small.yaml
ADDED
@@ -0,0 +1,45 @@
__object__:
  path: depth_anything_3.model.da3
  name: DepthAnything3Net
  args: as_params

net:
  __object__:
    path: depth_anything_3.model.dinov2.dinov2
    name: DinoV2
    args: as_params

  name: vits
  out_layers: [5, 7, 9, 11]
  alt_start: 4
  qknorm_start: 4
  rope_start: 4
  cat_token: True

head:
  __object__:
    path: depth_anything_3.model.dualdpt
    name: DualDPT
    args: as_params

  dim_in: &head_dim_in 768
  output_dim: 2
  features: &head_features 64
  out_channels: &head_out_channels [48, 96, 192, 384]


cam_enc:
  __object__:
    path: depth_anything_3.model.cam_enc
    name: CameraEnc
    args: as_params

  dim_out: 384

cam_dec:
  __object__:
    path: depth_anything_3.model.cam_dec
    name: CameraDec
    args: as_params

  dim_in: 768

Depth-Anything-3-anysize/src/depth_anything_3/configs/da3metric-large.yaml
ADDED
@@ -0,0 +1,28 @@
__object__:
  path: depth_anything_3.model.da3
  name: DepthAnything3Net
  args: as_params

net:
  __object__:
    path: depth_anything_3.model.dinov2.dinov2
    name: DinoV2
    args: as_params

  name: vitl
  out_layers: [4, 11, 17, 23]
  alt_start: -1  # -1 means disable
  qknorm_start: -1
  rope_start: -1
  cat_token: False

head:
  __object__:
    path: depth_anything_3.model.dpt
    name: DPT
    args: as_params

  dim_in: 1024
  output_dim: 1
  features: 256
  out_channels: [256, 512, 1024, 1024]

Depth-Anything-3-anysize/src/depth_anything_3/configs/da3mono-large.yaml
ADDED
@@ -0,0 +1,28 @@
__object__:
  path: depth_anything_3.model.da3
  name: DepthAnything3Net
  args: as_params

net:
  __object__:
    path: depth_anything_3.model.dinov2.dinov2
    name: DinoV2
    args: as_params

  name: vitl
  out_layers: [4, 11, 17, 23]
  alt_start: -1  # -1 means disable
  qknorm_start: -1
  rope_start: -1
  cat_token: False

head:
  __object__:
    path: depth_anything_3.model.dpt
    name: DPT
    args: as_params

  dim_in: 1024
  output_dim: 1
  features: 256
  out_channels: [256, 512, 1024, 1024]

Depth-Anything-3-anysize/src/depth_anything_3/configs/da3nested-giant-large.yaml
ADDED
@@ -0,0 +1,10 @@
__object__:
  path: depth_anything_3.model.da3
  name: NestedDepthAnything3Net
  args: as_params

anyview:
  __inherit__: depth_anything_3.configs.da3-giant

metric:
  __inherit__: depth_anything_3.configs.da3metric-large
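
This nested preset simply hands the two inherited configurations to NestedDepthAnything3Net (see model/da3.py below). A hedged sketch of the wiring, assuming the project's config machinery in depth_anything_3.cfg normally resolves `__inherit__` rather than loading files by hand:

from omegaconf import OmegaConf

from depth_anything_3.model.da3 import NestedDepthAnything3Net

# Placeholder paths relative to the package source.
anyview_cfg = OmegaConf.load("src/depth_anything_3/configs/da3-giant.yaml")
metric_cfg = OmegaConf.load("src/depth_anything_3/configs/da3metric-large.yaml")
model = NestedDepthAnything3Net(anyview=anyview_cfg, metric=metric_cfg)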

Depth-Anything-3-anysize/src/depth_anything_3/model/__init__.py
ADDED
@@ -0,0 +1,20 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from depth_anything_3.model.da3 import DepthAnything3Net, NestedDepthAnything3Net

__export__ = [
    NestedDepthAnything3Net,
    DepthAnything3Net,
]

Depth-Anything-3-anysize/src/depth_anything_3/model/cam_dec.py
ADDED
@@ -0,0 +1,45 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn as nn


class CameraDec(nn.Module):
    def __init__(self, dim_in=1536):
        super().__init__()
        output_dim = dim_in
        self.backbone = nn.Sequential(
            nn.Linear(output_dim, output_dim),
            nn.ReLU(),
            nn.Linear(output_dim, output_dim),
            nn.ReLU(),
        )
        self.fc_t = nn.Linear(output_dim, 3)
        self.fc_qvec = nn.Linear(output_dim, 4)
        self.fc_fov = nn.Sequential(nn.Linear(output_dim, 2), nn.ReLU())

    def forward(self, feat, camera_encoding=None, *args, **kwargs):
        B, N = feat.shape[:2]
        feat = feat.reshape(B * N, -1)
        feat = self.backbone(feat)
        out_t = self.fc_t(feat.float()).reshape(B, N, 3)
        if camera_encoding is None:
            out_qvec = self.fc_qvec(feat.float()).reshape(B, N, 4)
            out_fov = self.fc_fov(feat.float()).reshape(B, N, 2)
        else:
            out_qvec = camera_encoding[..., 3:7]
            out_fov = camera_encoding[..., -2:]
        pose_enc = torch.cat([out_t, out_qvec, out_fov], dim=-1)
        return pose_enc
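
A minimal shape check for the decoder above: the pose encoding it returns concatenates translation (3), quaternion (4), and field of view (2) per view.

import torch

from depth_anything_3.model.cam_dec import CameraDec

dec = CameraDec(dim_in=1536)
feat = torch.randn(2, 4, 1536)       # dummy (batch, views, channels) tokens
pose_enc = dec(feat)
assert pose_enc.shape == (2, 4, 9)   # [t | qvec | fov] per view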

Depth-Anything-3-anysize/src/depth_anything_3/model/cam_enc.py
ADDED
@@ -0,0 +1,80 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch.nn as nn

from depth_anything_3.model.utils.attention import Mlp
from depth_anything_3.model.utils.block import Block
from depth_anything_3.model.utils.transform import extri_intri_to_pose_encoding
from depth_anything_3.utils.geometry import affine_inverse


class CameraEnc(nn.Module):
    """
    CameraHead predicts camera parameters from token representations using iterative refinement.

    It applies a series of transformer blocks (the "trunk") to dedicated camera tokens.
    """

    def __init__(
        self,
        dim_out: int = 1024,
        dim_in: int = 9,
        trunk_depth: int = 4,
        target_dim: int = 9,
        num_heads: int = 16,
        mlp_ratio: int = 4,
        init_values: float = 0.01,
        **kwargs,
    ):
        super().__init__()
        self.target_dim = target_dim
        self.trunk_depth = trunk_depth
        self.trunk = nn.Sequential(
            *[
                Block(
                    dim=dim_out,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    init_values=init_values,
                )
                for _ in range(trunk_depth)
            ]
        )
        self.token_norm = nn.LayerNorm(dim_out)
        self.trunk_norm = nn.LayerNorm(dim_out)
        self.pose_branch = Mlp(
            in_features=dim_in,
            hidden_features=dim_out // 2,
            out_features=dim_out,
            drop=0,
        )

    def forward(
        self,
        ext,
        ixt,
        image_size,
    ) -> tuple:
        c2ws = affine_inverse(ext)
        pose_encoding = extri_intri_to_pose_encoding(
            c2ws,
            ixt,
            image_size,
        )
        pose_tokens = self.pose_branch(pose_encoding)
        pose_tokens = self.token_norm(pose_tokens)
        pose_tokens = self.trunk(pose_tokens)
        pose_tokens = self.trunk_norm(pose_tokens)
        return pose_tokens

Depth-Anything-3-anysize/src/depth_anything_3/model/da3.py
ADDED
@@ -0,0 +1,377 @@
| 1 |
+
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import torch
|
| 18 |
+
import torch.nn as nn
|
| 19 |
+
from addict import Dict
|
| 20 |
+
from omegaconf import DictConfig, OmegaConf
|
| 21 |
+
|
| 22 |
+
from depth_anything_3.cfg import create_object
|
| 23 |
+
from depth_anything_3.model.utils.transform import pose_encoding_to_extri_intri
|
| 24 |
+
from depth_anything_3.utils.alignment import (
|
| 25 |
+
apply_metric_scaling,
|
| 26 |
+
compute_alignment_mask,
|
| 27 |
+
compute_sky_mask,
|
| 28 |
+
least_squares_scale_scalar,
|
| 29 |
+
sample_tensor_for_quantile,
|
| 30 |
+
set_sky_regions_to_max_depth,
|
| 31 |
+
)
|
| 32 |
+
from depth_anything_3.utils.geometry import affine_inverse, as_homogeneous, map_pdf_to_opacity
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _wrap_cfg(cfg_obj):
|
| 36 |
+
return OmegaConf.create(cfg_obj)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class DepthAnything3Net(nn.Module):
|
| 40 |
+
"""
|
| 41 |
+
Depth Anything 3 network for depth estimation and camera pose estimation.
|
| 42 |
+
|
| 43 |
+
This network consists of:
|
| 44 |
+
- Backbone: DinoV2 feature extractor
|
| 45 |
+
- Head: DPT or DualDPT for depth prediction
|
| 46 |
+
- Optional camera decoders for pose estimation
|
| 47 |
+
- Optional GSDPT for 3DGS prediction
|
| 48 |
+
|
| 49 |
+
Args:
|
| 50 |
+
preset: Configuration preset containing network dimensions and settings
|
| 51 |
+
|
| 52 |
+
Returns:
|
| 53 |
+
Dictionary containing:
|
| 54 |
+
- depth: Predicted depth map (B, H, W)
|
| 55 |
+
- depth_conf: Depth confidence map (B, H, W)
|
| 56 |
+
- extrinsics: Camera extrinsics (B, N, 4, 4)
|
| 57 |
+
- intrinsics: Camera intrinsics (B, N, 3, 3)
|
| 58 |
+
- gaussians: 3D Gaussian Splats (world space), type: model.gs_adapter.Gaussians
|
| 59 |
+
- aux: Auxiliary features for specified layers
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
# Patch size for feature extraction
|
| 63 |
+
PATCH_SIZE = 14
|
| 64 |
+
|
| 65 |
+
def __init__(self, net, head, cam_dec=None, cam_enc=None, gs_head=None, gs_adapter=None):
|
| 66 |
+
"""
|
| 67 |
+
Initialize DepthAnything3Net with given yaml-initialized configuration.
|
| 68 |
+
"""
|
| 69 |
+
super().__init__()
|
| 70 |
+
self.backbone = net if isinstance(net, nn.Module) else create_object(_wrap_cfg(net))
|
| 71 |
+
self.head = head if isinstance(head, nn.Module) else create_object(_wrap_cfg(head))
|
| 72 |
+
self.cam_dec, self.cam_enc = None, None
|
| 73 |
+
if cam_dec is not None:
|
| 74 |
+
self.cam_dec = (
|
| 75 |
+
cam_dec if isinstance(cam_dec, nn.Module) else create_object(_wrap_cfg(cam_dec))
|
| 76 |
+
)
|
| 77 |
+
self.cam_enc = (
|
| 78 |
+
cam_enc if isinstance(cam_enc, nn.Module) else create_object(_wrap_cfg(cam_enc))
|
| 79 |
+
)
|
| 80 |
+
self.gs_adapter, self.gs_head = None, None
|
| 81 |
+
if gs_head is not None and gs_adapter is not None:
|
| 82 |
+
self.gs_adapter = (
|
| 83 |
+
gs_adapter
|
| 84 |
+
if isinstance(gs_adapter, nn.Module)
|
| 85 |
+
else create_object(_wrap_cfg(gs_adapter))
|
| 86 |
+
)
|
| 87 |
+
gs_out_dim = self.gs_adapter.d_in + 1
|
| 88 |
+
if isinstance(gs_head, nn.Module):
|
| 89 |
+
assert (
|
| 90 |
+
gs_head.out_dim == gs_out_dim
|
| 91 |
+
), f"gs_head.out_dim should be {gs_out_dim}, got {gs_head.out_dim}"
|
| 92 |
+
self.gs_head = gs_head
|
| 93 |
+
else:
|
| 94 |
+
assert (
|
| 95 |
+
gs_head["output_dim"] == gs_out_dim
|
| 96 |
+
), f"gs_head output_dim should set to {gs_out_dim}, got {gs_head['output_dim']}"
|
| 97 |
+
self.gs_head = create_object(_wrap_cfg(gs_head))
|
| 98 |
+
|
| 99 |
+
def forward(
|
| 100 |
+
self,
|
| 101 |
+
x: torch.Tensor,
|
| 102 |
+
extrinsics: torch.Tensor | None = None,
|
| 103 |
+
intrinsics: torch.Tensor | None = None,
|
| 104 |
+
export_feat_layers: list[int] | None = [],
|
| 105 |
+
infer_gs: bool = False,
|
| 106 |
+
) -> Dict[str, torch.Tensor]:
|
| 107 |
+
"""
|
| 108 |
+
Forward pass through the network.
|
| 109 |
+
|
| 110 |
+
Args:
|
| 111 |
+
x: Input images (B, N, 3, H, W)
|
| 112 |
+
extrinsics: Camera extrinsics (B, N, 4, 4) - unused
|
| 113 |
+
intrinsics: Camera intrinsics (B, N, 3, 3) - unused
|
| 114 |
+
feat_layers: List of layer indices to extract features from
|
| 115 |
+
|
| 116 |
+
Returns:
|
| 117 |
+
Dictionary containing predictions and auxiliary features
|
| 118 |
+
"""
|
| 119 |
+
# Extract features using backbone
|
| 120 |
+
if extrinsics is not None:
|
| 121 |
+
with torch.autocast(device_type=x.device.type, enabled=False):
|
| 122 |
+
cam_token = self.cam_enc(extrinsics, intrinsics, x.shape[-2:])
|
| 123 |
+
else:
|
| 124 |
+
cam_token = None
|
| 125 |
+
|
| 126 |
+
feats, aux_feats = self.backbone(
|
| 127 |
+
x, cam_token=cam_token, export_feat_layers=export_feat_layers
|
| 128 |
+
)
|
| 129 |
+
# feats = [[item for item in feat] for feat in feats]
|
| 130 |
+
H, W = x.shape[-2], x.shape[-1]
|
| 131 |
+
|
| 132 |
+
# Process features through depth head
|
| 133 |
+
with torch.autocast(device_type=x.device.type, enabled=False):
|
| 134 |
+
output = self._process_depth_head(feats, H, W)
|
| 135 |
+
output = self._process_camera_estimation(feats, H, W, output)
|
| 136 |
+
if infer_gs:
|
| 137 |
+
output = self._process_gs_head(feats, H, W, output, x, extrinsics, intrinsics)
|
| 138 |
+
|
| 139 |
+
# Extract auxiliary features if requested
|
| 140 |
+
output.aux = self._extract_auxiliary_features(aux_feats, export_feat_layers, H, W)
|
| 141 |
+
|
| 142 |
+
return output
|
| 143 |
+
|
| 144 |
+
def _process_depth_head(
|
| 145 |
+
self, feats: list[torch.Tensor], H: int, W: int
|
| 146 |
+
) -> Dict[str, torch.Tensor]:
|
| 147 |
+
"""Process features through the depth prediction head."""
|
| 148 |
+
return self.head(feats, H, W, patch_start_idx=0)
|
| 149 |
+
|
| 150 |
+
def _process_camera_estimation(
|
| 151 |
+
self, feats: list[torch.Tensor], H: int, W: int, output: Dict[str, torch.Tensor]
|
| 152 |
+
) -> Dict[str, torch.Tensor]:
|
| 153 |
+
"""Process camera pose estimation if camera decoder is available."""
|
| 154 |
+
if self.cam_dec is not None:
|
| 155 |
+
pose_enc = self.cam_dec(feats[-1][1])
|
| 156 |
+
# Remove ray information as it's not needed for pose estimation
|
| 157 |
+
if "ray" in output:
|
| 158 |
+
del output.ray
|
| 159 |
+
if "ray_conf" in output:
|
| 160 |
+
del output.ray_conf
|
| 161 |
+
|
| 162 |
+
# Convert pose encoding to extrinsics and intrinsics
|
| 163 |
+
c2w, ixt = pose_encoding_to_extri_intri(pose_enc, (H, W))
|
| 164 |
+
output.extrinsics = affine_inverse(c2w)
|
| 165 |
+
output.intrinsics = ixt
|
| 166 |
+
|
| 167 |
+
return output
|
| 168 |
+
|
| 169 |
+
def _process_gs_head(
|
| 170 |
+
self,
|
| 171 |
+
feats: list[torch.Tensor],
|
| 172 |
+
H: int,
|
| 173 |
+
W: int,
|
| 174 |
+
output: Dict[str, torch.Tensor],
|
| 175 |
+
in_images: torch.Tensor,
|
| 176 |
+
extrinsics: torch.Tensor | None = None,
|
| 177 |
+
intrinsics: torch.Tensor | None = None,
|
| 178 |
+
) -> Dict[str, torch.Tensor]:
|
| 179 |
+
"""Process 3DGS parameters estimation if 3DGS head is available."""
|
| 180 |
+
if self.gs_head is None or self.gs_adapter is None:
|
| 181 |
+
return output
|
| 182 |
+
assert output.get("depth", None) is not None, "must provide MV depth for the GS head."
|
| 183 |
+
|
| 184 |
+
# The depth is defined in the DA3 model's camera space,
|
| 185 |
+
# so even with provided GT camera poses,
|
| 186 |
+
# we instead use the predicted camera poses for better alignment.
|
| 187 |
+
ctx_extr = output.get("extrinsics", None)
|
| 188 |
+
ctx_intr = output.get("intrinsics", None)
|
| 189 |
+
assert (
|
| 190 |
+
ctx_extr is not None and ctx_intr is not None
|
| 191 |
+
), "must process camera info first if GT is not available"
|
| 192 |
+
|
| 193 |
+
gt_extr = extrinsics
|
| 194 |
+
# homo the extr if needed
|
| 195 |
+
ctx_extr = as_homogeneous(ctx_extr)
|
| 196 |
+
if gt_extr is not None:
|
| 197 |
+
gt_extr = as_homogeneous(gt_extr)
|
| 198 |
+
|
| 199 |
+
# forward through the gs_dpt head to get 'camera space' parameters
|
| 200 |
+
gs_outs = self.gs_head(
|
| 201 |
+
feats=feats,
|
| 202 |
+
H=H,
|
| 203 |
+
W=W,
|
| 204 |
+
patch_start_idx=0,
|
| 205 |
+
images=in_images,
|
| 206 |
+
)
|
| 207 |
+
raw_gaussians = gs_outs.raw_gs
|
| 208 |
+
densities = gs_outs.raw_gs_conf
|
| 209 |
+
|
| 210 |
+
# convert to 'world space' 3DGS parameters; ready to export and render
|
| 211 |
+
# gt_extr could be None, and will be used to align the pose scale if available
|
| 212 |
+
gs_world = self.gs_adapter(
|
| 213 |
+
extrinsics=ctx_extr,
|
| 214 |
+
intrinsics=ctx_intr,
|
| 215 |
+
depths=output.depth,
|
| 216 |
+
opacities=map_pdf_to_opacity(densities),
|
| 217 |
+
raw_gaussians=raw_gaussians,
|
| 218 |
+
image_shape=(H, W),
|
| 219 |
+
gt_extrinsics=gt_extr,
|
| 220 |
+
)
|
| 221 |
+
output.gaussians = gs_world
|
| 222 |
+
|
| 223 |
+
return output
|
| 224 |
+
|
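`self.gs_adapter` (gs_adapter.py in this commit) turns the camera-space depth and raw Gaussian parameters into world-space splats. The geometric core of any such adapter is unprojecting each pixel through the intrinsics and mapping the result into world coordinates with the camera pose; the helper below is an illustrative, self-contained sketch of that step (pixel-center convention and names are assumptions, not the repository's code).

import torch

def unproject_depth(depth: torch.Tensor, intrinsics: torch.Tensor, extrinsics_w2c: torch.Tensor) -> torch.Tensor:
    """depth (H, W), intrinsics (3, 3), extrinsics_w2c (4, 4) -> world-space points (H, W, 3)."""
    H, W = depth.shape
    v, u = torch.meshgrid(
        torch.arange(H, dtype=torch.float32), torch.arange(W, dtype=torch.float32), indexing="ij"
    )
    pix = torch.stack([u + 0.5, v + 0.5, torch.ones_like(u)], dim=-1)  # homogeneous pixel centers
    rays_cam = pix @ torch.linalg.inv(intrinsics).T   # back-project through K^-1
    pts_cam = rays_cam * depth[..., None]             # scale each ray by its depth
    c2w = torch.linalg.inv(extrinsics_w2c)            # world-to-camera -> camera-to-world
    return pts_cam @ c2w[:3, :3].T + c2w[:3, 3]

K = torch.tensor([[100.0, 0.0, 32.0], [0.0, 100.0, 24.0], [0.0, 0.0, 1.0]])
pts = unproject_depth(torch.full((48, 64), 2.0), K, torch.eye(4))
print(pts.shape)  # torch.Size([48, 64, 3]); with an identity pose every point sits at z = 2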
| 225 |
+
def _extract_auxiliary_features(
|
| 226 |
+
self, feats: list[torch.Tensor], feat_layers: list[int], H: int, W: int
|
| 227 |
+
) -> Dict[str, torch.Tensor]:
|
| 228 |
+
"""Extract auxiliary features from specified layers."""
|
| 229 |
+
aux_features = Dict()
|
| 230 |
+
assert len(feats) == len(feat_layers)
|
| 231 |
+
for feat, feat_layer in zip(feats, feat_layers):
|
| 232 |
+
# Reshape features to spatial dimensions
|
| 233 |
+
feat_reshaped = feat.reshape(
|
| 234 |
+
[
|
| 235 |
+
feat.shape[0],
|
| 236 |
+
feat.shape[1],
|
| 237 |
+
H // self.PATCH_SIZE,
|
| 238 |
+
W // self.PATCH_SIZE,
|
| 239 |
+
feat.shape[-1],
|
| 240 |
+
]
|
| 241 |
+
)
|
| 242 |
+
aux_features[f"feat_layer_{feat_layer}"] = feat_reshaped
|
| 243 |
+
|
| 244 |
+
return aux_features
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
class NestedDepthAnything3Net(nn.Module):
|
| 248 |
+
"""
|
| 249 |
+
Nested Depth Anything 3 network with metric scaling capabilities.
|
| 250 |
+
|
| 251 |
+
This network combines two DepthAnything3Net branches:
|
| 252 |
+
- Main branch: Standard depth estimation
|
| 253 |
+
- Metric branch: Metric depth estimation for scaling alignment
|
| 254 |
+
|
| 255 |
+
The network performs depth alignment using least squares scaling
|
| 256 |
+
and handles sky region masking for improved depth estimation.
|
| 257 |
+
|
| 258 |
+
Args:
|
| 259 |
+
anyview: Configuration for the main depth estimation branch
|
| 260 |
+
metric: Configuration for the metric depth estimation branch
|
| 261 |
+
"""
|
| 262 |
+
|
| 263 |
+
def __init__(self, anyview: DictConfig, metric: DictConfig):
|
| 264 |
+
"""
|
| 265 |
+
Initialize NestedDepthAnything3Net with two branches.
|
| 266 |
+
|
| 267 |
+
Args:
|
| 268 |
+
anyview: Configuration for the main depth estimation branch
|
| 269 |
+
metric: Configuration for the metric depth estimation branch
|
| 270 |
+
"""
|
| 271 |
+
super().__init__()
|
| 272 |
+
self.da3 = create_object(anyview)
|
| 273 |
+
self.da3_metric = create_object(metric)
|
| 274 |
+
|
| 275 |
+
def forward(
|
| 276 |
+
self,
|
| 277 |
+
x: torch.Tensor,
|
| 278 |
+
extrinsics: torch.Tensor | None = None,
|
| 279 |
+
intrinsics: torch.Tensor | None = None,
|
| 280 |
+
export_feat_layers: list[int] | None = [],
|
| 281 |
+
infer_gs: bool = False,
|
| 282 |
+
) -> Dict[str, torch.Tensor]:
|
| 283 |
+
"""
|
| 284 |
+
Forward pass through both branches with metric scaling alignment.
|
| 285 |
+
|
| 286 |
+
Args:
|
| 287 |
+
x: Input images (B, N, 3, H, W)
|
| 288 |
+
extrinsics: Camera extrinsics (B, N, 4, 4), forwarded to the main branch
|
| 289 |
+
intrinsics: Camera intrinsics (B, N, 3, 3), forwarded to the main branch
|
| 290 |
+
export_feat_layers: List of layer indices to extract auxiliary features from
|
| 291 |
+
infer_gs: Whether to additionally predict 3DGS parameters
|
| 292 |
+
|
| 293 |
+
Returns:
|
| 294 |
+
Dictionary containing aligned depth predictions and camera parameters
|
| 295 |
+
"""
|
| 296 |
+
# Get predictions from both branches
|
| 297 |
+
output = self.da3(
|
| 298 |
+
x, extrinsics, intrinsics, export_feat_layers=export_feat_layers, infer_gs=infer_gs
|
| 299 |
+
)
|
| 300 |
+
metric_output = self.da3_metric(x, infer_gs=infer_gs)
|
| 301 |
+
|
| 302 |
+
# Apply metric scaling and alignment
|
| 303 |
+
output = self._apply_metric_scaling(output, metric_output)
|
| 304 |
+
output = self._apply_depth_alignment(output, metric_output)
|
| 305 |
+
output = self._handle_sky_regions(output, metric_output)
|
| 306 |
+
|
| 307 |
+
return output
|
| 308 |
+
|
| 309 |
+
def _apply_metric_scaling(
|
| 310 |
+
self, output: Dict[str, torch.Tensor], metric_output: Dict[str, torch.Tensor]
|
| 311 |
+
) -> Dict[str, torch.Tensor]:
|
| 312 |
+
"""Apply metric scaling to the metric depth output."""
|
| 313 |
+
# Scale metric depth based on camera intrinsics
|
| 314 |
+
metric_output.depth = apply_metric_scaling(
|
| 315 |
+
metric_output.depth,
|
| 316 |
+
output.intrinsics,
|
| 317 |
+
)
|
| 318 |
+
return output
|
| 319 |
+
|
| 320 |
+
def _apply_depth_alignment(
|
| 321 |
+
self, output: Dict[str, torch.Tensor], metric_output: Dict[str, torch.Tensor]
|
| 322 |
+
) -> Dict[str, torch.Tensor]:
|
| 323 |
+
"""Apply depth alignment using least squares scaling."""
|
| 324 |
+
# Compute non-sky mask
|
| 325 |
+
non_sky_mask = compute_sky_mask(metric_output.sky, threshold=0.3)
|
| 326 |
+
|
| 327 |
+
# Ensure we have enough non-sky pixels
|
| 328 |
+
assert non_sky_mask.sum() > 10, "Insufficient non-sky pixels for alignment"
|
| 329 |
+
|
| 330 |
+
# Sample depth confidence for quantile computation
|
| 331 |
+
depth_conf_ns = output.depth_conf[non_sky_mask]
|
| 332 |
+
depth_conf_sampled = sample_tensor_for_quantile(depth_conf_ns, max_samples=100000)
|
| 333 |
+
median_conf = torch.quantile(depth_conf_sampled, 0.5)
|
| 334 |
+
|
| 335 |
+
# Compute alignment mask
|
| 336 |
+
align_mask = compute_alignment_mask(
|
| 337 |
+
output.depth_conf, non_sky_mask, output.depth, metric_output.depth, median_conf
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
# Compute scale factor using least squares
|
| 341 |
+
valid_depth = output.depth[align_mask]
|
| 342 |
+
valid_metric_depth = metric_output.depth[align_mask]
|
| 343 |
+
scale_factor = least_squares_scale_scalar(valid_metric_depth, valid_depth)
|
| 344 |
+
|
| 345 |
+
# Apply scaling to depth and extrinsics
|
| 346 |
+
output.depth *= scale_factor
|
| 347 |
+
output.extrinsics[:, :, :3, 3] *= scale_factor
|
| 348 |
+
output.is_metric = 1
|
| 349 |
+
output.scale_factor = scale_factor.item()
|
| 350 |
+
|
| 351 |
+
return output
|
| 352 |
+
|
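`least_squares_scale_scalar` is imported from the package's utilities and is not shown in this hunk. For a single scale s minimizing ||s * d - d_metric||^2 over the masked pixels the closed form is s = sum(d * d_metric) / sum(d * d); a minimal sketch (the real helper's argument order is an assumption here):

import torch

def least_squares_scale(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """Scalar s minimizing ||s * pred - target||^2 (illustrative, not the repository helper)."""
    return (pred * target).sum() / (pred * pred).sum().clamp_min(1e-12)

# toy check: recover a known scale between relative and metric depth
d = torch.rand(1000) + 0.5
m = 3.7 * d + 0.01 * torch.randn(1000)
print(float(least_squares_scale(d, m)))  # ~3.7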
| 353 |
+
def _handle_sky_regions(
|
| 354 |
+
self,
|
| 355 |
+
output: Dict[str, torch.Tensor],
|
| 356 |
+
metric_output: Dict[str, torch.Tensor],
|
| 357 |
+
sky_depth_def: float = 200.0,
|
| 358 |
+
) -> Dict[str, torch.Tensor]:
|
| 359 |
+
"""Handle sky regions by setting them to maximum depth."""
|
| 360 |
+
non_sky_mask = compute_sky_mask(metric_output.sky, threshold=0.3)
|
| 361 |
+
|
| 362 |
+
# Compute maximum depth for non-sky regions
|
| 363 |
+
# Use sampling to safely compute quantile on large tensors
|
| 364 |
+
non_sky_depth = output.depth[non_sky_mask]
|
| 365 |
+
if non_sky_depth.numel() > 100000:
|
| 366 |
+
idx = torch.randint(0, non_sky_depth.numel(), (100000,), device=non_sky_depth.device)
|
| 367 |
+
sampled_depth = non_sky_depth[idx]
|
| 368 |
+
else:
|
| 369 |
+
sampled_depth = non_sky_depth
|
| 370 |
+
non_sky_max = min(torch.quantile(sampled_depth, 0.99), sky_depth_def)
|
| 371 |
+
|
| 372 |
+
# Set sky regions to maximum depth and high confidence
|
| 373 |
+
output.depth, output.depth_conf = set_sky_regions_to_max_depth(
|
| 374 |
+
output.depth, output.depth_conf, non_sky_mask, max_depth=non_sky_max
|
| 375 |
+
)
|
| 376 |
+
|
| 377 |
+
return output
|
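Both the alignment step and the sky handling subsample before calling `torch.quantile`; besides keeping the computation cheap, this sidesteps the input-size limit `torch.quantile` enforces (it raises on tensors beyond roughly 16 million elements). A self-contained sketch of the pattern:

import torch

def safe_quantile(x: torch.Tensor, q: float, max_samples: int = 100_000) -> torch.Tensor:
    """Approximate quantile via random subsampling when x is too large for torch.quantile."""
    x = x.flatten()
    if x.numel() > max_samples:
        idx = torch.randint(0, x.numel(), (max_samples,), device=x.device)
        x = x[idx]
    return torch.quantile(x, q)

vals = torch.rand(5_000_000)
print(float(safe_quantile(vals, 0.99)))  # close to 0.99 for uniform data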
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/dinov2.py
ADDED
|
@@ -0,0 +1,64 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
# References:
|
| 7 |
+
# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
|
| 8 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
from typing import List
|
| 12 |
+
import torch.nn as nn
|
| 13 |
+
|
| 14 |
+
from depth_anything_3.model.dinov2.vision_transformer import (
|
| 15 |
+
vit_base,
|
| 16 |
+
vit_giant2,
|
| 17 |
+
vit_large,
|
| 18 |
+
vit_small,
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class DinoV2(nn.Module):
|
| 23 |
+
def __init__(
|
| 24 |
+
self,
|
| 25 |
+
name: str,
|
| 26 |
+
out_layers: List[int],
|
| 27 |
+
alt_start: int = -1,
|
| 28 |
+
qknorm_start: int = -1,
|
| 29 |
+
rope_start: int = -1,
|
| 30 |
+
cat_token: bool = True,
|
| 31 |
+
**kwargs,
|
| 32 |
+
):
|
| 33 |
+
super().__init__()
|
| 34 |
+
assert name in {"vits", "vitb", "vitl", "vitg"}
|
| 35 |
+
self.name = name
|
| 36 |
+
self.out_layers = out_layers
|
| 37 |
+
self.alt_start = alt_start
|
| 38 |
+
self.qknorm_start = qknorm_start
|
| 39 |
+
self.rope_start = rope_start
|
| 40 |
+
self.cat_token = cat_token
|
| 41 |
+
encoder_map = {
|
| 42 |
+
"vits": vit_small,
|
| 43 |
+
"vitb": vit_base,
|
| 44 |
+
"vitl": vit_large,
|
| 45 |
+
"vitg": vit_giant2,
|
| 46 |
+
}
|
| 47 |
+
encoder_fn = encoder_map[self.name]
|
| 48 |
+
ffn_layer = "swiglufused" if self.name == "vitg" else "mlp"
|
| 49 |
+
self.pretrained = encoder_fn(
|
| 50 |
+
img_size=518,
|
| 51 |
+
patch_size=14,
|
| 52 |
+
ffn_layer=ffn_layer,
|
| 53 |
+
alt_start=alt_start,
|
| 54 |
+
qknorm_start=qknorm_start,
|
| 55 |
+
rope_start=rope_start,
|
| 56 |
+
cat_token=cat_token,
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
def forward(self, x, **kwargs):
|
| 60 |
+
return self.pretrained.get_intermediate_layers(
|
| 61 |
+
x,
|
| 62 |
+
self.out_layers,
|
| 63 |
+
**kwargs,
|
| 64 |
+
)
|
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# from .attention import MemEffAttention
|
| 8 |
+
from .block import Block
|
| 9 |
+
from .layer_scale import LayerScale
|
| 10 |
+
from .mlp import Mlp
|
| 11 |
+
from .patch_embed import PatchEmbed
|
| 12 |
+
from .rope import PositionGetter, RotaryPositionEmbedding2D
|
| 13 |
+
from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
|
| 14 |
+
|
| 15 |
+
__all__ = [
    "Mlp",
    "PatchEmbed",
    "SwiGLUFFN",
    "SwiGLUFFNFused",
    "Block",
    # "MemEffAttention",
    "LayerScale",
    "PositionGetter",
    "RotaryPositionEmbedding2D",
]
|
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/attention.py
ADDED
|
@@ -0,0 +1,100 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# References:
|
| 8 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 9 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
import torch.nn.functional as F
|
| 13 |
+
from torch import Tensor, nn
|
| 14 |
+
|
| 15 |
+
logger = logging.getLogger("dinov2")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class Attention(nn.Module):
|
| 19 |
+
def __init__(
|
| 20 |
+
self,
|
| 21 |
+
dim: int,
|
| 22 |
+
num_heads: int = 8,
|
| 23 |
+
qkv_bias: bool = False,
|
| 24 |
+
proj_bias: bool = True,
|
| 25 |
+
attn_drop: float = 0.0,
|
| 26 |
+
proj_drop: float = 0.0,
|
| 27 |
+
norm_layer: nn.Module = nn.LayerNorm,
|
| 28 |
+
qk_norm: bool = False,
|
| 29 |
+
fused_attn: bool = True, # use F.scaled_dot_product_attention or not
|
| 30 |
+
rope=None,
|
| 31 |
+
) -> None:
|
| 32 |
+
super().__init__()
|
| 33 |
+
assert dim % num_heads == 0, "dim should be divisible by num_heads"
|
| 34 |
+
self.num_heads = num_heads
|
| 35 |
+
head_dim = dim // num_heads
|
| 36 |
+
self.scale = head_dim**-0.5
|
| 37 |
+
self.fused_attn = fused_attn
|
| 38 |
+
|
| 39 |
+
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
|
| 40 |
+
self.q_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
|
| 41 |
+
self.k_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
|
| 42 |
+
self.attn_drop = nn.Dropout(attn_drop)
|
| 43 |
+
self.proj = nn.Linear(dim, dim, bias=proj_bias)
|
| 44 |
+
self.proj_drop = nn.Dropout(proj_drop)
|
| 45 |
+
self.rope = rope
|
| 46 |
+
|
| 47 |
+
def forward(self, x: Tensor, pos=None, attn_mask=None) -> Tensor:
|
| 48 |
+
B, N, C = x.shape
|
| 49 |
+
qkv = (
|
| 50 |
+
self.qkv(x)
|
| 51 |
+
.reshape(B, N, 3, self.num_heads, C // self.num_heads)
|
| 52 |
+
.permute(2, 0, 3, 1, 4)
|
| 53 |
+
)
|
| 54 |
+
q, k, v = qkv[0], qkv[1], qkv[2]
|
| 55 |
+
q, k = self.q_norm(q), self.k_norm(k)
|
| 56 |
+
if self.rope is not None and pos is not None:
|
| 57 |
+
q = self.rope(q, pos)
|
| 58 |
+
k = self.rope(k, pos)
|
| 59 |
+
if self.fused_attn:
|
| 60 |
+
x = F.scaled_dot_product_attention(
|
| 61 |
+
q,
|
| 62 |
+
k,
|
| 63 |
+
v,
|
| 64 |
+
dropout_p=self.attn_drop.p if self.training else 0.0,
|
| 65 |
+
attn_mask=(
|
| 66 |
+
(attn_mask)[:, None].repeat(1, self.num_heads, 1, 1)
|
| 67 |
+
if attn_mask is not None
|
| 68 |
+
else None
|
| 69 |
+
),
|
| 70 |
+
)
|
| 71 |
+
else:
|
| 72 |
+
q = q * self.scale
|
| 73 |
+
attn = q @ k.transpose(-2, -1)
|
| 74 |
+
attn = attn.softmax(dim=-1)
|
| 75 |
+
attn = self.attn_drop(attn)
|
| 76 |
+
x = attn @ v
|
| 77 |
+
|
| 78 |
+
x = x.transpose(1, 2).reshape(B, N, C)
|
| 79 |
+
x = self.proj(x)
|
| 80 |
+
x = self.proj_drop(x)
|
| 81 |
+
return x
|
| 82 |
+
|
| 83 |
+
def _forward(self, x: Tensor) -> Tensor:
|
| 84 |
+
B, N, C = x.shape
|
| 85 |
+
qkv = (
|
| 86 |
+
self.qkv(x)
|
| 87 |
+
.reshape(B, N, 3, self.num_heads, C // self.num_heads)
|
| 88 |
+
.permute(2, 0, 3, 1, 4)
|
| 89 |
+
)
|
| 90 |
+
|
| 91 |
+
q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
|
| 92 |
+
attn = q @ k.transpose(-2, -1)
|
| 93 |
+
|
| 94 |
+
attn = attn.softmax(dim=-1)
|
| 95 |
+
attn = self.attn_drop(attn)
|
| 96 |
+
|
| 97 |
+
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
|
| 98 |
+
x = self.proj(x)
|
| 99 |
+
x = self.proj_drop(x)
|
| 100 |
+
return x
|
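`Attention.forward` dispatches between `F.scaled_dot_product_attention` and an explicit softmax path depending on `fused_attn`; with the default 1/sqrt(head_dim) scaling the two are numerically equivalent, as this self-contained check illustrates:

import torch
import torch.nn.functional as F

B, heads, N, d = 2, 4, 16, 32
q, k, v = (torch.randn(B, heads, N, d) for _ in range(3))

fused = F.scaled_dot_product_attention(q, k, v)

attn = (q * d ** -0.5) @ k.transpose(-2, -1)   # same scaling as the non-fused branch above
manual = attn.softmax(dim=-1) @ v

print(torch.allclose(fused, manual, atol=1e-5))  # True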
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/block.py
ADDED
|
@@ -0,0 +1,143 @@
| 1 |
+
# flake8: noqa: F821
|
| 2 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 3 |
+
# All rights reserved.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
# References:
|
| 9 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 10 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
|
| 11 |
+
|
| 12 |
+
import logging
|
| 13 |
+
from typing import Callable, Optional
|
| 14 |
+
import torch
|
| 15 |
+
from torch import Tensor, nn
|
| 16 |
+
|
| 17 |
+
from .attention import Attention
|
| 18 |
+
from .drop_path import DropPath
|
| 19 |
+
from .layer_scale import LayerScale
|
| 20 |
+
from .mlp import Mlp
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger("dinov2")
|
| 23 |
+
XFORMERS_AVAILABLE = True
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class Block(nn.Module):
|
| 27 |
+
def __init__(
|
| 28 |
+
self,
|
| 29 |
+
dim: int,
|
| 30 |
+
num_heads: int,
|
| 31 |
+
mlp_ratio: float = 4.0,
|
| 32 |
+
qkv_bias: bool = False,
|
| 33 |
+
proj_bias: bool = True,
|
| 34 |
+
ffn_bias: bool = True,
|
| 35 |
+
drop: float = 0.0,
|
| 36 |
+
attn_drop: float = 0.0,
|
| 37 |
+
init_values=None,
|
| 38 |
+
drop_path: float = 0.0,
|
| 39 |
+
act_layer: Callable[..., nn.Module] = nn.GELU,
|
| 40 |
+
norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
|
| 41 |
+
attn_class: Callable[..., nn.Module] = Attention,
|
| 42 |
+
ffn_layer: Callable[..., nn.Module] = Mlp,
|
| 43 |
+
qk_norm: bool = False,
|
| 44 |
+
rope=None,
|
| 45 |
+
ln_eps: float = 1e-6,
|
| 46 |
+
) -> None:
|
| 47 |
+
super().__init__()
|
| 48 |
+
# print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
|
| 49 |
+
self.norm1 = norm_layer(dim, eps=ln_eps)
|
| 50 |
+
self.attn = attn_class(
|
| 51 |
+
dim,
|
| 52 |
+
num_heads=num_heads,
|
| 53 |
+
qkv_bias=qkv_bias,
|
| 54 |
+
proj_bias=proj_bias,
|
| 55 |
+
attn_drop=attn_drop,
|
| 56 |
+
proj_drop=drop,
|
| 57 |
+
qk_norm=qk_norm,
|
| 58 |
+
rope=rope,
|
| 59 |
+
)
|
| 60 |
+
self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
|
| 61 |
+
self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
| 62 |
+
|
| 63 |
+
self.norm2 = norm_layer(dim, eps=ln_eps)
|
| 64 |
+
mlp_hidden_dim = int(dim * mlp_ratio)
|
| 65 |
+
self.mlp = ffn_layer(
|
| 66 |
+
in_features=dim,
|
| 67 |
+
hidden_features=mlp_hidden_dim,
|
| 68 |
+
act_layer=act_layer,
|
| 69 |
+
drop=drop,
|
| 70 |
+
bias=ffn_bias,
|
| 71 |
+
)
|
| 72 |
+
self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
|
| 73 |
+
self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
|
| 74 |
+
|
| 75 |
+
self.sample_drop_ratio = drop_path
|
| 76 |
+
|
| 77 |
+
def forward(self, x: Tensor, pos=None, attn_mask=None) -> Tensor:
|
| 78 |
+
def attn_residual_func(x: Tensor, pos=None, attn_mask=None) -> Tensor:
|
| 79 |
+
return self.ls1(self.attn(self.norm1(x), pos=pos, attn_mask=attn_mask))
|
| 80 |
+
|
| 81 |
+
def ffn_residual_func(x: Tensor) -> Tensor:
|
| 82 |
+
return self.ls2(self.mlp(self.norm2(x)))
|
| 83 |
+
|
| 84 |
+
if self.training and self.sample_drop_ratio > 0.1:
|
| 85 |
+
# the overhead is compensated only for a drop path rate larger than 0.1
|
| 86 |
+
x = drop_add_residual_stochastic_depth(
|
| 87 |
+
x,
|
| 88 |
+
residual_func=attn_residual_func,
|
| 89 |
+
sample_drop_ratio=self.sample_drop_ratio,
|
| 90 |
+
pos=pos,
|
| 91 |
+
)
|
| 92 |
+
x = drop_add_residual_stochastic_depth(
|
| 93 |
+
x,
|
| 94 |
+
residual_func=ffn_residual_func,
|
| 95 |
+
sample_drop_ratio=self.sample_drop_ratio,
|
| 96 |
+
)
|
| 97 |
+
elif self.training and self.sample_drop_ratio > 0.0:
|
| 98 |
+
x = x + self.drop_path1(attn_residual_func(x, pos=pos, attn_mask=attn_mask))
|
| 99 |
+
x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
|
| 100 |
+
else:
|
| 101 |
+
x = x + attn_residual_func(x, pos=pos, attn_mask=attn_mask)
|
| 102 |
+
x = x + ffn_residual_func(x)
|
| 103 |
+
return x
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def drop_add_residual_stochastic_depth(
|
| 107 |
+
x: Tensor,
|
| 108 |
+
residual_func: Callable[[Tensor], Tensor],
|
| 109 |
+
sample_drop_ratio: float = 0.0,
|
| 110 |
+
pos: Optional[Tensor] = None,
|
| 111 |
+
) -> Tensor:
|
| 112 |
+
# 1) extract subset using permutation
|
| 113 |
+
b, n, d = x.shape
|
| 114 |
+
sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
|
| 115 |
+
brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
|
| 116 |
+
x_subset = x[brange]
|
| 117 |
+
|
| 118 |
+
# 2) apply residual_func to get residual
|
| 119 |
+
if pos is not None:
|
| 120 |
+
# if necessary, apply rope to the subset
|
| 121 |
+
pos = pos[brange]
|
| 122 |
+
residual = residual_func(x_subset, pos=pos)
|
| 123 |
+
else:
|
| 124 |
+
residual = residual_func(x_subset)
|
| 125 |
+
|
| 126 |
+
x_flat = x.flatten(1)
|
| 127 |
+
residual = residual.flatten(1)
|
| 128 |
+
|
| 129 |
+
residual_scale_factor = b / sample_subset_size
|
| 130 |
+
|
| 131 |
+
# 3) add the residual
|
| 132 |
+
x_plus_residual = torch.index_add(
|
| 133 |
+
x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor
|
| 134 |
+
)
|
| 135 |
+
return x_plus_residual.view_as(x)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def get_branges_scales(x, sample_drop_ratio=0.0):
|
| 139 |
+
b, n, d = x.shape
|
| 140 |
+
sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
|
| 141 |
+
brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
|
| 142 |
+
residual_scale_factor = b / sample_subset_size
|
| 143 |
+
return brange, residual_scale_factor
|
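`drop_add_residual_stochastic_depth` evaluates the residual branch on a random subset of the batch and rescales it by b / subset_size through `torch.index_add`, so the update is unbiased: averaged over many draws it converges to the dense residual update. A small self-contained check of that property (illustrative only):

import torch

torch.manual_seed(0)
b, n, d = 8, 4, 16
x = torch.randn(b, n, d)
residual_func = torch.nn.Linear(d, d)
subset = max(int(b * (1 - 0.5)), 1)          # 50% sample drop ratio

with torch.no_grad():
    dense = residual_func(x)                 # what the non-stochastic path would add
    est = torch.zeros_like(x)
    trials = 2000
    for _ in range(trials):
        brange = torch.randperm(b)[:subset]
        upd = torch.zeros_like(x)
        upd[brange] = residual_func(x[brange]) * (b / subset)
        est += upd / trials

print(torch.allclose(est, dense, atol=0.3))  # True: the sparse, rescaled update matches in expectation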
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/drop_path.py
ADDED
|
@@ -0,0 +1,35 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# References:
|
| 8 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 9 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
from torch import nn
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def drop_path(x, drop_prob: float = 0.0, training: bool = False):
|
| 16 |
+
if drop_prob == 0.0 or not training:
|
| 17 |
+
return x
|
| 18 |
+
keep_prob = 1 - drop_prob
|
| 19 |
+
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
|
| 20 |
+
random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
|
| 21 |
+
if keep_prob > 0.0:
|
| 22 |
+
random_tensor.div_(keep_prob)
|
| 23 |
+
output = x * random_tensor
|
| 24 |
+
return output
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class DropPath(nn.Module):
|
| 28 |
+
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
|
| 29 |
+
|
| 30 |
+
def __init__(self, drop_prob=None):
|
| 31 |
+
super().__init__()
|
| 32 |
+
self.drop_prob = drop_prob
|
| 33 |
+
|
| 34 |
+
def forward(self, x):
|
| 35 |
+
return drop_path(x, self.drop_prob, self.training)
|
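Dividing the surviving samples by keep_prob makes DropPath expectation-preserving, so no rescaling is needed at inference time; a quick self-contained check:

import torch

torch.manual_seed(0)
x = torch.ones(10_000, 1, 1)
keep_prob = 0.7
mask = x.new_empty(x.shape[0], 1, 1).bernoulli_(keep_prob) / keep_prob
print((x * mask).mean().item())  # ~1.0 even though ~30% of samples were zeroed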
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/layer_scale.py
ADDED
|
@@ -0,0 +1,31 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 # noqa: E501
|
| 8 |
+
|
| 9 |
+
from typing import Union
|
| 10 |
+
import torch
|
| 11 |
+
from torch import Tensor, nn
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class LayerScale(nn.Module):
|
| 15 |
+
def __init__(
|
| 16 |
+
self,
|
| 17 |
+
dim: int,
|
| 18 |
+
init_values: Union[float, Tensor] = 1e-5,
|
| 19 |
+
inplace: bool = False,
|
| 20 |
+
) -> None:
|
| 21 |
+
super().__init__()
|
| 22 |
+
self.dim = dim
|
| 23 |
+
self.inplace = inplace
|
| 24 |
+
self.init_values = init_values
|
| 25 |
+
self.gamma = nn.Parameter(init_values * torch.ones(dim))
|
| 26 |
+
|
| 27 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 28 |
+
return x.mul_(self.gamma) if self.inplace else x * self.gamma
|
| 29 |
+
|
| 30 |
+
def extra_repr(self) -> str:
|
| 31 |
+
return f"{self.dim}, init_values={self.init_values}, inplace={self.inplace}"
|
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/mlp.py
ADDED
|
@@ -0,0 +1,40 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# References:
|
| 8 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 9 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
from typing import Callable, Optional
|
| 13 |
+
from torch import Tensor, nn
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Mlp(nn.Module):
|
| 17 |
+
def __init__(
|
| 18 |
+
self,
|
| 19 |
+
in_features: int,
|
| 20 |
+
hidden_features: Optional[int] = None,
|
| 21 |
+
out_features: Optional[int] = None,
|
| 22 |
+
act_layer: Callable[..., nn.Module] = nn.GELU,
|
| 23 |
+
drop: float = 0.0,
|
| 24 |
+
bias: bool = True,
|
| 25 |
+
) -> None:
|
| 26 |
+
super().__init__()
|
| 27 |
+
out_features = out_features or in_features
|
| 28 |
+
hidden_features = hidden_features or in_features
|
| 29 |
+
self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
|
| 30 |
+
self.act = act_layer()
|
| 31 |
+
self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
|
| 32 |
+
self.drop = nn.Dropout(drop)
|
| 33 |
+
|
| 34 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 35 |
+
x = self.fc1(x)
|
| 36 |
+
x = self.act(x)
|
| 37 |
+
x = self.drop(x)
|
| 38 |
+
x = self.fc2(x)
|
| 39 |
+
x = self.drop(x)
|
| 40 |
+
return x
|
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/patch_embed.py
ADDED
|
@@ -0,0 +1,94 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# References:
|
| 8 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 9 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
|
| 10 |
+
|
| 11 |
+
from typing import Callable, Optional, Tuple, Union
|
| 12 |
+
import torch.nn as nn
|
| 13 |
+
from torch import Tensor
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def make_2tuple(x):
|
| 17 |
+
if isinstance(x, tuple):
|
| 18 |
+
assert len(x) == 2
|
| 19 |
+
return x
|
| 20 |
+
|
| 21 |
+
assert isinstance(x, int)
|
| 22 |
+
return (x, x)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class PatchEmbed(nn.Module):
|
| 26 |
+
"""
|
| 27 |
+
2D image to patch embedding: (B,C,H,W) -> (B,N,D)
|
| 28 |
+
|
| 29 |
+
Args:
|
| 30 |
+
img_size: Image size.
|
| 31 |
+
patch_size: Patch token size.
|
| 32 |
+
in_chans: Number of input image channels.
|
| 33 |
+
embed_dim: Number of linear projection output channels.
|
| 34 |
+
norm_layer: Normalization layer.
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
def __init__(
|
| 38 |
+
self,
|
| 39 |
+
img_size: Union[int, Tuple[int, int]] = 224,
|
| 40 |
+
patch_size: Union[int, Tuple[int, int]] = 16,
|
| 41 |
+
in_chans: int = 3,
|
| 42 |
+
embed_dim: int = 768,
|
| 43 |
+
norm_layer: Optional[Callable] = None,
|
| 44 |
+
flatten_embedding: bool = True,
|
| 45 |
+
) -> None:
|
| 46 |
+
super().__init__()
|
| 47 |
+
|
| 48 |
+
image_HW = make_2tuple(img_size)
|
| 49 |
+
patch_HW = make_2tuple(patch_size)
|
| 50 |
+
patch_grid_size = (
|
| 51 |
+
image_HW[0] // patch_HW[0],
|
| 52 |
+
image_HW[1] // patch_HW[1],
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
self.img_size = image_HW
|
| 56 |
+
self.patch_size = patch_HW
|
| 57 |
+
self.patches_resolution = patch_grid_size
|
| 58 |
+
self.num_patches = patch_grid_size[0] * patch_grid_size[1]
|
| 59 |
+
|
| 60 |
+
self.in_chans = in_chans
|
| 61 |
+
self.embed_dim = embed_dim
|
| 62 |
+
|
| 63 |
+
self.flatten_embedding = flatten_embedding
|
| 64 |
+
|
| 65 |
+
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
|
| 66 |
+
self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
|
| 67 |
+
|
| 68 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 69 |
+
_, _, H, W = x.shape
|
| 70 |
+
patch_H, patch_W = self.patch_size
|
| 71 |
+
|
| 72 |
+
assert (
|
| 73 |
+
H % patch_H == 0
|
| 74 |
+
), f"Input image height {H} is not a multiple of patch height {patch_H}"
|
| 75 |
+
assert (
|
| 76 |
+
W % patch_W == 0
|
| 77 |
+
), f"Input image width {W} is not a multiple of patch width: {patch_W}"
|
| 78 |
+
|
| 79 |
+
x = self.proj(x) # B C H W
|
| 80 |
+
H, W = x.size(2), x.size(3)
|
| 81 |
+
x = x.flatten(2).transpose(1, 2) # B HW C
|
| 82 |
+
x = self.norm(x)
|
| 83 |
+
if not self.flatten_embedding:
|
| 84 |
+
x = x.reshape(-1, H, W, self.embed_dim) # B H W C
|
| 85 |
+
return x
|
| 86 |
+
|
| 87 |
+
def flops(self) -> float:
|
| 88 |
+
Ho, Wo = self.patches_resolution
|
| 89 |
+
flops = (
|
| 90 |
+
Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
|
| 91 |
+
)
|
| 92 |
+
if self.norm is not None:
|
| 93 |
+
flops += Ho * Wo * self.embed_dim
|
| 94 |
+
return flops
|
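A shape walkthrough of the patch projection: at the 518-pixel training resolution with 14-pixel patches the token grid is 37 x 37, i.e. 1369 patch tokens. The strided convolution below reproduces the (B, C, H, W) -> (B, N, D) mapping on its own:

import torch
import torch.nn as nn

patch, dim = 14, 1024
proj = nn.Conv2d(3, dim, kernel_size=patch, stride=patch)

img = torch.randn(1, 3, 518, 518)
feat = proj(img)                          # (1, 1024, 37, 37)
tokens = feat.flatten(2).transpose(1, 2)  # (1, 1369, 1024)
print(feat.shape, tokens.shape)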
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/rope.py
ADDED
|
@@ -0,0 +1,200 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
# Implementation of 2D Rotary Position Embeddings (RoPE).
|
| 8 |
+
|
| 9 |
+
# This module provides a clean implementation of 2D Rotary Position Embeddings,
|
| 10 |
+
# which extends the original RoPE concept to handle 2D spatial positions.
|
| 11 |
+
|
| 12 |
+
# Inspired by:
|
| 13 |
+
# https://github.com/meta-llama/codellama/blob/main/llama/model.py
|
| 14 |
+
# https://github.com/naver-ai/rope-vit
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
from typing import Dict, Tuple
|
| 18 |
+
import torch
|
| 19 |
+
import torch.nn as nn
|
| 20 |
+
import torch.nn.functional as F
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class PositionGetter:
|
| 24 |
+
"""Generates and caches 2D spatial positions for patches in a grid.
|
| 25 |
+
|
| 26 |
+
This class efficiently manages the generation of spatial coordinates for patches
|
| 27 |
+
in a 2D grid, caching results to avoid redundant computations.
|
| 28 |
+
|
| 29 |
+
Attributes:
|
| 30 |
+
position_cache: Dictionary storing precomputed position tensors for different
|
| 31 |
+
grid dimensions.
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
def __init__(self):
|
| 35 |
+
"""Initializes the position generator with an empty cache."""
|
| 36 |
+
self.position_cache: Dict[Tuple[int, int], torch.Tensor] = {}
|
| 37 |
+
|
| 38 |
+
def __call__(
|
| 39 |
+
self, batch_size: int, height: int, width: int, device: torch.device
|
| 40 |
+
) -> torch.Tensor:
|
| 41 |
+
"""Generates spatial positions for a batch of patches.
|
| 42 |
+
|
| 43 |
+
Args:
|
| 44 |
+
batch_size: Number of samples in the batch.
|
| 45 |
+
height: Height of the grid in patches.
|
| 46 |
+
width: Width of the grid in patches.
|
| 47 |
+
device: Target device for the position tensor.
|
| 48 |
+
|
| 49 |
+
Returns:
|
| 50 |
+
Tensor of shape (batch_size, height*width, 2) containing y,x coordinates
|
| 51 |
+
for each position in the grid, repeated for each batch item.
|
| 52 |
+
"""
|
| 53 |
+
if (height, width) not in self.position_cache:
|
| 54 |
+
y_coords = torch.arange(height, device=device)
|
| 55 |
+
x_coords = torch.arange(width, device=device)
|
| 56 |
+
positions = torch.cartesian_prod(y_coords, x_coords)
|
| 57 |
+
self.position_cache[height, width] = positions
|
| 58 |
+
|
| 59 |
+
cached_positions = self.position_cache[height, width]
|
| 60 |
+
return cached_positions.view(1, height * width, 2).expand(batch_size, -1, -1).clone()
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class RotaryPositionEmbedding2D(nn.Module):
|
| 64 |
+
"""2D Rotary Position Embedding implementation.
|
| 65 |
+
|
| 66 |
+
This module applies rotary position embeddings to input tokens based on their
|
| 67 |
+
2D spatial positions. It handles the position-dependent rotation of features
|
| 68 |
+
separately for vertical and horizontal dimensions.
|
| 69 |
+
|
| 70 |
+
Args:
|
| 71 |
+
frequency: Base frequency for the position embeddings. Default: 100.0
|
| 72 |
+
scaling_factor: Scaling factor for frequency computation. Default: 1.0
|
| 73 |
+
|
| 74 |
+
Attributes:
|
| 75 |
+
base_frequency: Base frequency for computing position embeddings.
|
| 76 |
+
scaling_factor: Factor to scale the computed frequencies.
|
| 77 |
+
frequency_cache: Cache for storing precomputed frequency components.
|
| 78 |
+
"""
|
| 79 |
+
|
| 80 |
+
def __init__(self, frequency: float = 100.0, scaling_factor: float = 1.0):
|
| 81 |
+
"""Initializes the 2D RoPE module."""
|
| 82 |
+
super().__init__()
|
| 83 |
+
self.base_frequency = frequency
|
| 84 |
+
self.scaling_factor = scaling_factor
|
| 85 |
+
self.frequency_cache: Dict[Tuple, Tuple[torch.Tensor, torch.Tensor]] = {}
|
| 86 |
+
|
| 87 |
+
def _compute_frequency_components(
|
| 88 |
+
self, dim: int, seq_len: int, device: torch.device, dtype: torch.dtype
|
| 89 |
+
) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 90 |
+
"""Computes frequency components for rotary embeddings.
|
| 91 |
+
|
| 92 |
+
Args:
|
| 93 |
+
dim: Feature dimension (must be even).
|
| 94 |
+
seq_len: Maximum sequence length.
|
| 95 |
+
device: Target device for computations.
|
| 96 |
+
dtype: Data type for the computed tensors.
|
| 97 |
+
|
| 98 |
+
Returns:
|
| 99 |
+
Tuple of (cosine, sine) tensors for frequency components.
|
| 100 |
+
"""
|
| 101 |
+
cache_key = (dim, seq_len, device, dtype)
|
| 102 |
+
if cache_key not in self.frequency_cache:
|
| 103 |
+
# Compute frequency bands
|
| 104 |
+
exponents = torch.arange(0, dim, 2, device=device).float() / dim
|
| 105 |
+
inv_freq = 1.0 / (self.base_frequency**exponents)
|
| 106 |
+
|
| 107 |
+
# Generate position-dependent frequencies
|
| 108 |
+
positions = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
|
| 109 |
+
angles = torch.einsum("i,j->ij", positions, inv_freq)
|
| 110 |
+
|
| 111 |
+
# Compute and cache frequency components
|
| 112 |
+
angles = angles.to(dtype)
|
| 113 |
+
angles = torch.cat((angles, angles), dim=-1)
|
| 114 |
+
cos_components = angles.cos().to(dtype)
|
| 115 |
+
sin_components = angles.sin().to(dtype)
|
| 116 |
+
self.frequency_cache[cache_key] = (cos_components, sin_components)
|
| 117 |
+
|
| 118 |
+
return self.frequency_cache[cache_key]
|
| 119 |
+
|
| 120 |
+
@staticmethod
|
| 121 |
+
def _rotate_features(x: torch.Tensor) -> torch.Tensor:
|
| 122 |
+
"""Performs feature rotation by splitting and recombining feature dimensions.
|
| 123 |
+
|
| 124 |
+
Args:
|
| 125 |
+
x: Input tensor to rotate.
|
| 126 |
+
|
| 127 |
+
Returns:
|
| 128 |
+
Rotated feature tensor.
|
| 129 |
+
"""
|
| 130 |
+
feature_dim = x.shape[-1]
|
| 131 |
+
x1, x2 = x[..., : feature_dim // 2], x[..., feature_dim // 2 :]
|
| 132 |
+
return torch.cat((-x2, x1), dim=-1)
|
| 133 |
+
|
| 134 |
+
def _apply_1d_rope(
|
| 135 |
+
self,
|
| 136 |
+
tokens: torch.Tensor,
|
| 137 |
+
positions: torch.Tensor,
|
| 138 |
+
cos_comp: torch.Tensor,
|
| 139 |
+
sin_comp: torch.Tensor,
|
| 140 |
+
) -> torch.Tensor:
|
| 141 |
+
"""Applies 1D rotary position embeddings along one dimension.
|
| 142 |
+
|
| 143 |
+
Args:
|
| 144 |
+
tokens: Input token features.
|
| 145 |
+
positions: Position indices.
|
| 146 |
+
cos_comp: Cosine components for rotation.
|
| 147 |
+
sin_comp: Sine components for rotation.
|
| 148 |
+
|
| 149 |
+
Returns:
|
| 150 |
+
Tokens with applied rotary position embeddings.
|
| 151 |
+
"""
|
| 152 |
+
# Embed positions with frequency components
|
| 153 |
+
cos = F.embedding(positions, cos_comp)[:, None, :, :]
|
| 154 |
+
sin = F.embedding(positions, sin_comp)[:, None, :, :]
|
| 155 |
+
# Apply rotation
|
| 156 |
+
return (tokens * cos) + (self._rotate_features(tokens) * sin)
|
| 157 |
+
|
| 158 |
+
def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
|
| 159 |
+
"""Applies 2D rotary position embeddings to input tokens.
|
| 160 |
+
|
| 161 |
+
Args:
|
| 162 |
+
tokens: Input tensor of shape (batch_size, n_heads, n_tokens, dim).
|
| 163 |
+
The feature dimension (dim) must be divisible by 4.
|
| 164 |
+
positions: Position tensor of shape (batch_size, n_tokens, 2) containing
|
| 165 |
+
the y and x coordinates for each token.
|
| 166 |
+
|
| 167 |
+
Returns:
|
| 168 |
+
Tensor of same shape as input with applied 2D rotary position embeddings.
|
| 169 |
+
|
| 170 |
+
Raises:
|
| 171 |
+
AssertionError: If input dimensions are invalid or positions are malformed.
|
| 172 |
+
"""
|
| 173 |
+
# Validate inputs
|
| 174 |
+
assert tokens.size(-1) % 2 == 0, "Feature dimension must be even"
|
| 175 |
+
assert (
|
| 176 |
+
positions.ndim == 3 and positions.shape[-1] == 2
|
| 177 |
+
), "Positions must have shape (batch_size, n_tokens, 2)"
|
| 178 |
+
|
| 179 |
+
# Compute feature dimension for each spatial direction
|
| 180 |
+
feature_dim = tokens.size(-1) // 2
|
| 181 |
+
|
| 182 |
+
# Get frequency components
|
| 183 |
+
max_position = int(positions.max()) + 1
|
| 184 |
+
cos_comp, sin_comp = self._compute_frequency_components(
|
| 185 |
+
feature_dim, max_position, tokens.device, tokens.dtype
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
# Split features for vertical and horizontal processing
|
| 189 |
+
vertical_features, horizontal_features = tokens.chunk(2, dim=-1)
|
| 190 |
+
|
| 191 |
+
# Apply RoPE separately for each dimension
|
| 192 |
+
vertical_features = self._apply_1d_rope(
|
| 193 |
+
vertical_features, positions[..., 0], cos_comp, sin_comp
|
| 194 |
+
)
|
| 195 |
+
horizontal_features = self._apply_1d_rope(
|
| 196 |
+
horizontal_features, positions[..., 1], cos_comp, sin_comp
|
| 197 |
+
)
|
| 198 |
+
|
| 199 |
+
# Combine processed features
|
| 200 |
+
return torch.cat((vertical_features, horizontal_features), dim=-1)
|
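The property RoPE is built around, and which the 2D module above applies independently to the y and x coordinates, is that attention scores depend only on relative position. A self-contained 1D illustration using the same half-split rotation convention as `_rotate_features`:

import torch

def rope_1d(x: torch.Tensor, pos: torch.Tensor, base: float = 100.0) -> torch.Tensor:
    """x: (n, d) with d even, pos: (n,) positions; same half-split rotation as above."""
    d = x.shape[-1]
    inv_freq = 1.0 / (base ** (torch.arange(0, d, 2).float() / d))
    ang = pos.float()[:, None] * inv_freq[None, :]
    ang = torch.cat((ang, ang), dim=-1)
    x1, x2 = x[..., : d // 2], x[..., d // 2:]
    return x * ang.cos() + torch.cat((-x2, x1), dim=-1) * ang.sin()

q, k = torch.randn(64), torch.randn(64)

def score(m: int, n: int) -> torch.Tensor:
    return (rope_1d(q[None], torch.tensor([m]))[0] * rope_1d(k[None], torch.tensor([n]))[0]).sum()

# The same offset (n - m = 4) gives the same score regardless of absolute position.
print(torch.allclose(score(3, 7), score(10, 14), atol=1e-5))  # True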
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/layers/swiglu_ffn.py
ADDED
|
@@ -0,0 +1,62 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
from typing import Callable, Optional
|
| 8 |
+
import torch.nn.functional as F
|
| 9 |
+
from torch import Tensor, nn
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SwiGLUFFN(nn.Module):
|
| 13 |
+
def __init__(
|
| 14 |
+
self,
|
| 15 |
+
in_features: int,
|
| 16 |
+
hidden_features: Optional[int] = None,
|
| 17 |
+
out_features: Optional[int] = None,
|
| 18 |
+
act_layer: Callable[..., nn.Module] = None,
|
| 19 |
+
drop: float = 0.0,
|
| 20 |
+
bias: bool = True,
|
| 21 |
+
) -> None:
|
| 22 |
+
super().__init__()
|
| 23 |
+
out_features = out_features or in_features
|
| 24 |
+
hidden_features = hidden_features or in_features
|
| 25 |
+
self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
|
| 26 |
+
self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
|
| 27 |
+
|
| 28 |
+
def forward(self, x: Tensor) -> Tensor:
|
| 29 |
+
x12 = self.w12(x)
|
| 30 |
+
x1, x2 = x12.chunk(2, dim=-1)
|
| 31 |
+
hidden = F.silu(x1) * x2
|
| 32 |
+
return self.w3(hidden)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
try:
|
| 36 |
+
from xformers.ops import SwiGLU
|
| 37 |
+
|
| 38 |
+
XFORMERS_AVAILABLE = True
|
| 39 |
+
except ImportError:
|
| 40 |
+
SwiGLU = SwiGLUFFN
|
| 41 |
+
XFORMERS_AVAILABLE = False
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class SwiGLUFFNFused(SwiGLU):
|
| 45 |
+
def __init__(
|
| 46 |
+
self,
|
| 47 |
+
in_features: int,
|
| 48 |
+
hidden_features: Optional[int] = None,
|
| 49 |
+
out_features: Optional[int] = None,
|
| 50 |
+
act_layer: Callable[..., nn.Module] = None,
|
| 51 |
+
drop: float = 0.0,
|
| 52 |
+
bias: bool = True,
|
| 53 |
+
) -> None:
|
| 54 |
+
out_features = out_features or in_features
|
| 55 |
+
hidden_features = hidden_features or in_features
|
| 56 |
+
hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
|
| 57 |
+
super().__init__(
|
| 58 |
+
in_features=in_features,
|
| 59 |
+
hidden_features=hidden_features,
|
| 60 |
+
out_features=out_features,
|
| 61 |
+
bias=bias,
|
| 62 |
+
)
|
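`SwiGLUFFNFused` shrinks the requested hidden width to roughly two thirds (SwiGLU uses two input projections, so this keeps the parameter count comparable to a plain two-matrix MLP) and rounds up to a multiple of 8: for example, hidden_features = 4096 becomes int(4096 * 2 / 3) = 2730, then (2730 + 7) // 8 * 8 = 2736.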
Depth-Anything-3-anysize/src/depth_anything_3/model/dinov2/vision_transformer.py
ADDED
|
@@ -0,0 +1,437 @@
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
# References:
|
| 7 |
+
# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
|
| 8 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
|
| 9 |
+
|
| 10 |
+
import math
|
| 11 |
+
from typing import Callable, List, Sequence, Tuple, Union
|
| 12 |
+
import numpy as np
|
| 13 |
+
import torch
|
| 14 |
+
import torch.nn as nn
|
| 15 |
+
import torch.utils.checkpoint
|
| 16 |
+
from einops import rearrange
|
| 17 |
+
|
| 18 |
+
from depth_anything_3.utils.logger import logger
|
| 19 |
+
|
| 20 |
+
from .layers import LayerScale # noqa: F401
|
| 21 |
+
from .layers import Mlp # noqa: F401
|
| 22 |
+
from .layers import ( # noqa: F401
|
| 23 |
+
Block,
|
| 24 |
+
PatchEmbed,
|
| 25 |
+
PositionGetter,
|
| 26 |
+
RotaryPositionEmbedding2D,
|
| 27 |
+
SwiGLUFFNFused,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# logger = logging.getLogger("dinov2")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
|
| 34 |
+
"""
|
| 35 |
+
embed_dim: output dimension for each position
|
| 36 |
+
pos: a list of positions to be encoded: size (M,)
|
| 37 |
+
out: (M, D)
|
| 38 |
+
"""
|
| 39 |
+
assert embed_dim % 2 == 0
|
| 40 |
+
omega = np.arange(embed_dim // 2, dtype=float)
|
| 41 |
+
omega /= embed_dim / 2.0
|
| 42 |
+
omega = 1.0 / 10000**omega # (D/2,)
|
| 43 |
+
|
| 44 |
+
pos = pos.reshape(-1) # (M,)
|
| 45 |
+
out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
|
| 46 |
+
|
| 47 |
+
emb_sin = np.sin(out) # (M, D/2)
|
| 48 |
+
emb_cos = np.cos(out) # (M, D/2)
|
| 49 |
+
|
| 50 |
+
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
|
| 51 |
+
return emb
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def named_apply(
|
| 55 |
+
fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False
|
| 56 |
+
) -> nn.Module:
|
| 57 |
+
if not depth_first and include_root:
|
| 58 |
+
fn(module=module, name=name)
|
| 59 |
+
for child_name, child_module in module.named_children():
|
| 60 |
+
child_name = ".".join((name, child_name)) if name else child_name
|
| 61 |
+
named_apply(
|
| 62 |
+
fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True
|
| 63 |
+
)
|
| 64 |
+
if depth_first and include_root:
|
| 65 |
+
fn(module=module, name=name)
|
| 66 |
+
return module
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class BlockChunk(nn.ModuleList):
|
| 70 |
+
def forward(self, x):
|
| 71 |
+
for b in self:
|
| 72 |
+
x = b(x)
|
| 73 |
+
return x
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class DinoVisionTransformer(nn.Module):
|
| 77 |
+
def __init__(
|
| 78 |
+
self,
|
| 79 |
+
img_size=224,
|
| 80 |
+
patch_size=16,
|
| 81 |
+
in_chans=3,
|
| 82 |
+
embed_dim=768,
|
| 83 |
+
depth=12,
|
| 84 |
+
num_heads=12,
|
| 85 |
+
mlp_ratio=4.0,
|
| 86 |
+
qkv_bias=True,
|
| 87 |
+
ffn_bias=True,
|
| 88 |
+
proj_bias=True,
|
| 89 |
+
drop_path_rate=0.0,
|
| 90 |
+
drop_path_uniform=False,
|
| 91 |
+
init_values=1.0, # for layerscale: None or 0 => no layerscale
|
| 92 |
+
embed_layer=PatchEmbed,
|
| 93 |
+
act_layer=nn.GELU,
|
| 94 |
+
block_fn=Block,
|
| 95 |
+
ffn_layer="mlp",
|
| 96 |
+
block_chunks=1,
|
| 97 |
+
num_register_tokens=0,
|
| 98 |
+
interpolate_antialias=False,
|
| 99 |
+
interpolate_offset=0.1,
|
| 100 |
+
alt_start=-1,
|
| 101 |
+
qknorm_start=-1,
|
| 102 |
+
rope_start=-1,
|
| 103 |
+
rope_freq=100,
|
| 104 |
+
plus_cam_token=False,
|
| 105 |
+
cat_token=True,
|
| 106 |
+
):
|
| 107 |
+
"""
|
| 108 |
+
Args:
|
| 109 |
+
img_size (int, tuple): input image size
|
| 110 |
+
patch_size (int, tuple): patch size
|
| 111 |
+
in_chans (int): number of input channels
|
| 112 |
+
embed_dim (int): embedding dimension
|
| 113 |
+
depth (int): depth of transformer
|
| 114 |
+
num_heads (int): number of attention heads
|
| 115 |
+
mlp_ratio (int): ratio of mlp hidden dim to embedding dim
|
| 116 |
+
qkv_bias (bool): enable bias for qkv if True
|
| 117 |
+
proj_bias (bool): enable bias for proj in attn if True
|
| 118 |
+
ffn_bias (bool): enable bias for ffn if True
|
| 119 |
+
drop_path_rate (float): stochastic depth rate
drop_path_uniform (bool): apply uniform drop rate across blocks
|
| 120 |
+
init_values (float): layer-scale init values
|
| 121 |
+
embed_layer (nn.Module): patch embedding layer
|
| 122 |
+
act_layer (nn.Module): MLP activation layer
|
| 123 |
+
block_fn (nn.Module): transformer block class
|
| 124 |
+
ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
|
| 125 |
+
block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
|
| 126 |
+
num_register_tokens: (int) number of extra cls tokens (so-called "registers")
|
| 127 |
+
interpolate_antialias: (str) flag to apply anti-aliasing when interpolating
|
| 128 |
+
positional embeddings
|
| 129 |
+
interpolate_offset: (float) work-around offset to apply when interpolating
|
| 130 |
+
positional embeddings
|
| 131 |
+
block_prompt: (bool) whether to add ray embeddings to the block input
|
| 132 |
+
"""
|
| 133 |
+
super().__init__()
|
| 134 |
+
self.patch_start_idx = 1
|
| 135 |
+
norm_layer = nn.LayerNorm
|
| 136 |
+
self.num_features = self.embed_dim = (
|
| 137 |
+
embed_dim # num_features for consistency with other models
|
| 138 |
+
)
|
| 139 |
+
self.alt_start = alt_start
|
| 140 |
+
self.qknorm_start = qknorm_start
|
| 141 |
+
self.rope_start = rope_start
|
| 142 |
+
self.cat_token = cat_token
|
| 143 |
+
self.num_tokens = 1
|
| 144 |
+
self.n_blocks = depth
|
| 145 |
+
self.num_heads = num_heads
|
| 146 |
+
self.patch_size = patch_size
|
| 147 |
+
self.num_register_tokens = num_register_tokens
|
| 148 |
+
self.interpolate_antialias = interpolate_antialias
|
| 149 |
+
self.interpolate_offset = interpolate_offset
|
| 150 |
+
|
| 151 |
+
self.patch_embed = embed_layer(
|
| 152 |
+
img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim
|
| 153 |
+
)
|
| 154 |
+
num_patches = self.patch_embed.num_patches
|
| 155 |
+
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
|
| 156 |
+
if self.alt_start != -1:
|
| 157 |
+
self.camera_token = nn.Parameter(torch.randn(1, 2, embed_dim))
|
| 158 |
+
self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
|
| 159 |
+
assert num_register_tokens >= 0
|
        self.register_tokens = (
            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim))
            if num_register_tokens
            else None
        )

        if drop_path_uniform is True:
            dpr = [drop_path_rate] * depth
        else:
            dpr = [
                x.item() for x in torch.linspace(0, drop_path_rate, depth)
            ]  # stochastic depth decay rule
        if ffn_layer == "mlp":
            logger.info("using MLP layer as FFN")
            ffn_layer = Mlp
        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
            logger.info("using SwiGLU layer as FFN")
            ffn_layer = SwiGLUFFNFused
        elif ffn_layer == "identity":
            logger.info("using Identity layer as FFN")

            def f(*args, **kwargs):
                return nn.Identity()

            ffn_layer = f
        else:
            raise NotImplementedError

        if self.rope_start != -1:
            self.rope = RotaryPositionEmbedding2D(frequency=rope_freq) if rope_freq > 0 else None
            self.position_getter = PositionGetter() if self.rope is not None else None
        else:
            self.rope = None
        blocks_list = [
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                ffn_layer=ffn_layer,
                init_values=init_values,
                qk_norm=i >= qknorm_start if qknorm_start != -1 else False,
                rope=self.rope if i >= rope_start and rope_start != -1 else None,
            )
            for i in range(depth)
        ]
        self.blocks = nn.ModuleList(blocks_list)
        self.norm = norm_layer(embed_dim)

    def interpolate_pos_encoding(self, x, w, h):
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        pos_embed = self.pos_embed.float()
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_size
        h0 = h // self.patch_size
        M = int(math.sqrt(N))  # Recover the number of patches in each dimension
        assert N == M * M
        kwargs = {}
        if self.interpolate_offset:
            # Historical kludge: add a small number to avoid floating point error in the
            # interpolation, see https://github.com/facebookresearch/dino/issues/8
            # Note: still needed for backward-compatibility, the underlying operators are using
            # both output size and scale factors
            sx = float(w0 + self.interpolate_offset) / M
            sy = float(h0 + self.interpolate_offset) / M
            kwargs["scale_factor"] = (sx, sy)
        else:
            # Simply specify an output size instead of a scale factor
            kwargs["size"] = (w0, h0)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
            mode="bicubic",
            antialias=self.interpolate_antialias,
            **kwargs,
        )
        assert (w0, h0) == patch_pos_embed.shape[-2:]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)

    def prepare_cls_token(self, B, S):
        cls_token = self.cls_token.expand(B, S, -1)
        cls_token = cls_token.reshape(B * S, -1, self.embed_dim)
        return cls_token

    def prepare_tokens_with_masks(self, x, masks=None, cls_token=None, **kwargs):
        B, S, nc, w, h = x.shape
        x = rearrange(x, "b s c h w -> (b s) c h w")
        x = self.patch_embed(x)
        if masks is not None:
            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
        cls_token = self.prepare_cls_token(B, S)
        x = torch.cat((cls_token, x), dim=1)
        x = x + self.interpolate_pos_encoding(x, w, h)
        if self.register_tokens is not None:
            x = torch.cat(
                (
                    x[:, :1],
                    self.register_tokens.expand(x.shape[0], -1, -1),
                    x[:, 1:],
                ),
                dim=1,
            )
        x = rearrange(x, "(b s) n c -> b s n c", b=B, s=S)
        return x

    def _prepare_rope(self, B, S, H, W, device):
        pos = None
        pos_nodiff = None
        if self.rope is not None:
            pos = self.position_getter(
                B * S, H // self.patch_size, W // self.patch_size, device=device
            )
            pos = rearrange(pos, "(b s) n c -> b s n c", b=B)
            pos_nodiff = torch.zeros_like(pos).to(pos.dtype)
            if self.patch_start_idx > 0:
                pos = pos + 1
                pos_special = torch.zeros(B * S, self.patch_start_idx, 2).to(device).to(pos.dtype)
                pos_special = rearrange(pos_special, "(b s) n c -> b s n c", b=B)
                pos = torch.cat([pos_special, pos], dim=2)
                pos_nodiff = pos_nodiff + 1
                pos_nodiff = torch.cat([pos_special, pos_nodiff], dim=2)
        return pos, pos_nodiff

    def _get_intermediate_layers_not_chunked(self, x, n=1, export_feat_layers=[], **kwargs):
        B, S, _, H, W = x.shape
        x = self.prepare_tokens_with_masks(x)
        output, total_block_len, aux_output = [], len(self.blocks), []
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        pos, pos_nodiff = self._prepare_rope(B, S, H, W, x.device)

        for i, blk in enumerate(self.blocks):
            if i < self.rope_start or self.rope is None:
                g_pos, l_pos = None, None
            else:
                g_pos = pos_nodiff
                l_pos = pos
            if self.alt_start != -1 and i == self.alt_start:
                if kwargs.get("cam_token", None) is not None:
                    logger.info("Using camera conditions provided by the user")
                    cam_token = kwargs.get("cam_token")
                else:
                    ref_token = self.camera_token[:, :1].expand(B, -1, -1)
                    src_token = self.camera_token[:, 1:].expand(B, S - 1, -1)
                    cam_token = torch.cat([ref_token, src_token], dim=1)
                x[:, :, 0] = cam_token

            if self.alt_start != -1 and i >= self.alt_start and i % 2 == 1:
                x = self.process_attention(
                    x, blk, "global", pos=g_pos, attn_mask=kwargs.get("attn_mask", None)
                )
            else:
                x = self.process_attention(x, blk, "local", pos=l_pos)
                local_x = x

            if i in blocks_to_take:
                out_x = torch.cat([local_x, x], dim=-1) if self.cat_token else x
                output.append((out_x[:, :, 0], out_x))
            if i in export_feat_layers:
                aux_output.append(x)
        return output, aux_output

    def process_attention(self, x, block, attn_type="global", pos=None, attn_mask=None):
        b, s, n = x.shape[:3]
        if attn_type == "local":
            x = rearrange(x, "b s n c -> (b s) n c")
            if pos is not None:
                pos = rearrange(pos, "b s n c -> (b s) n c")
        elif attn_type == "global":
            x = rearrange(x, "b s n c -> b (s n) c")
            if pos is not None:
                pos = rearrange(pos, "b s n c -> b (s n) c")
        else:
            raise ValueError(f"Invalid attention type: {attn_type}")

        x = block(x, pos=pos, attn_mask=attn_mask)

        if attn_type == "local":
            x = rearrange(x, "(b s) n c -> b s n c", b=b, s=s)
        elif attn_type == "global":
            x = rearrange(x, "b (s n) c -> b s n c", b=b, s=s)
        return x

    def get_intermediate_layers(
        self,
        x: torch.Tensor,
        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
        export_feat_layers: List[int] = [],
        **kwargs,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
        outputs, aux_outputs = self._get_intermediate_layers_not_chunked(
            x, n, export_feat_layers=export_feat_layers, **kwargs
        )
        camera_tokens = [out[0] for out in outputs]
        if outputs[0][1].shape[-1] == self.embed_dim:
            outputs = [self.norm(out[1]) for out in outputs]
        elif outputs[0][1].shape[-1] == (self.embed_dim * 2):
            outputs = [
                torch.cat(
                    [out[1][..., : self.embed_dim], self.norm(out[1][..., self.embed_dim :])],
                    dim=-1,
                )
                for out in outputs
            ]
        else:
            raise ValueError(f"Invalid output shape: {outputs[0][1].shape}")
        aux_outputs = [self.norm(out) for out in aux_outputs]
        outputs = [out[..., 1 + self.num_register_tokens :, :] for out in outputs]
        aux_outputs = [out[..., 1 + self.num_register_tokens :, :] for out in aux_outputs]
        return tuple(zip(outputs, camera_tokens)), aux_outputs


def vit_small(patch_size=16, num_register_tokens=0, depth=12, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=depth,
        num_heads=6,
        mlp_ratio=4,
        # block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_base(patch_size=16, num_register_tokens=0, depth=12, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=768,
        depth=depth,
        num_heads=12,
        mlp_ratio=4,
        # block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_large(patch_size=16, num_register_tokens=0, depth=24, **kwargs):
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1024,
        depth=depth,
        num_heads=16,
        mlp_ratio=4,
        # block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_giant2(patch_size=16, num_register_tokens=0, depth=40, **kwargs):
    """
    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
    """
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1536,
        depth=depth,
        num_heads=24,
        mlp_ratio=4,
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model
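The alternating attention above hinges on two einops rearranges. The following self-contained sketch (toy shapes chosen only for illustration; it does not call any repository code) shows what `process_attention` does to the token layout in its "local" and "global" modes: local attention folds the view axis into the batch so each view attends to its own tokens, while global attention folds the views into the token axis so all views attend jointly.

import torch
from einops import rearrange

B, S, N, C = 2, 4, 10, 16  # batch, views, tokens per view, channels (illustrative values)
tokens = torch.randn(B, S, N, C)

# "local": each of the B*S views becomes an independent sequence of N tokens.
local = rearrange(tokens, "b s n c -> (b s) n c")    # [(B*S), N, C]

# "global": the S views are concatenated along the token axis, so attention spans all views.
global_ = rearrange(tokens, "b s n c -> b (s n) c")  # [B, S*N, C]

# The inverse rearranges restore the [B, S, N, C] layout after the block has run.
assert torch.equal(rearrange(local, "(b s) n c -> b s n c", b=B, s=S), tokens)
assert torch.equal(rearrange(global_, "b (s n) c -> b s n c", b=B, s=S), tokens)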
Depth-Anything-3-anysize/src/depth_anything_3/model/dpt.py
ADDED
@@ -0,0 +1,458 @@
# flake8: noqa E501
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict as TyDict
from typing import List, Sequence, Tuple
import torch
import torch.nn as nn
from addict import Dict
from einops import rearrange

from depth_anything_3.model.utils.head_utils import (
    Permute,
    create_uv_grid,
    custom_interpolate,
    position_grid_to_embed,
)


class DPT(nn.Module):
    """
    DPT for dense prediction (main head + optional sky head, sky always 1 channel).

    Returns:
        - Main head:
            * If output_dim>1: { head_name, f"{head_name}_conf" }
            * If output_dim==1: { head_name }
        - Sky head (if use_sky_head=True): { sky_name }  # [B, S, 1, H/down_ratio, W/down_ratio]
    """

    def __init__(
        self,
        dim_in: int,
        *,
        patch_size: int = 14,
        output_dim: int = 1,
        activation: str = "exp",
        conf_activation: str = "expp1",
        features: int = 256,
        out_channels: Sequence[int] = (256, 512, 1024, 1024),
        pos_embed: bool = False,
        down_ratio: int = 1,
        head_name: str = "depth",
        # ---- sky head (fixed 1 channel) ----
        use_sky_head: bool = True,
        sky_name: str = "sky",
        sky_activation: str = "relu",  # 'sigmoid' / 'relu' / 'linear'
        use_ln_for_heads: bool = False,  # If needed, apply LayerNorm on intermediate features of both heads
        norm_type: str = "idt",  # use to match legacy GS-DPT head, "idt" / "layer"
        fusion_block_inplace: bool = False,
    ) -> None:
        super().__init__()

        # -------------------- configuration --------------------
        self.patch_size = patch_size
        self.activation = activation
        self.conf_activation = conf_activation
        self.pos_embed = pos_embed
        self.down_ratio = down_ratio

        # Names
        self.head_main = head_name
        self.sky_name = sky_name

        # Main head: output dimension and confidence switch
        self.out_dim = output_dim
        self.has_conf = output_dim > 1

        # Sky head parameters (always 1 channel)
        self.use_sky_head = use_sky_head
        self.sky_activation = sky_activation

        # Fixed 4 intermediate outputs
        self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)

        # -------------------- token pre-norm + per-stage projection --------------------
        if norm_type == "layer":
            self.norm = nn.LayerNorm(dim_in)
        elif norm_type == "idt":
            self.norm = nn.Identity()
        else:
            raise Exception(f"Unknown norm_type {norm_type}, should be 'layer' or 'idt'.")
        self.projects = nn.ModuleList(
            [nn.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0) for oc in out_channels]
        )

        # -------------------- Spatial re-size (align to common scale before fusion) --------------------
        # Design consistent with original: relative to patch grid (x4, x2, x1, /2)
        self.resize_layers = nn.ModuleList(
            [
                nn.ConvTranspose2d(
                    out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0
                ),
                nn.ConvTranspose2d(
                    out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0
                ),
                nn.Identity(),
                nn.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1),
            ]
        )

        # -------------------- scratch: stage adapters + main fusion chain --------------------
        self.scratch = _make_scratch(list(out_channels), features, expand=False)

        # Main fusion chain
        self.scratch.refinenet1 = _make_fusion_block(features, inplace=fusion_block_inplace)
        self.scratch.refinenet2 = _make_fusion_block(features, inplace=fusion_block_inplace)
        self.scratch.refinenet3 = _make_fusion_block(features, inplace=fusion_block_inplace)
        self.scratch.refinenet4 = _make_fusion_block(
            features, has_residual=False, inplace=fusion_block_inplace
        )

        # Heads (shared neck1; then split into two heads)
        head_features_1 = features
        head_features_2 = 32
        self.scratch.output_conv1 = nn.Conv2d(
            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
        )

        ln_seq = (
            [Permute((0, 2, 3, 1)), nn.LayerNorm(head_features_2), Permute((0, 3, 1, 2))]
            if use_ln_for_heads
            else []
        )

        # Main head
        self.scratch.output_conv2 = nn.Sequential(
            nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
            *ln_seq,
            nn.ReLU(inplace=True),
            nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
        )

        # Sky head (fixed 1 channel)
        if self.use_sky_head:
            self.scratch.sky_output_conv2 = nn.Sequential(
                nn.Conv2d(
                    head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1
                ),
                *ln_seq,
                nn.ReLU(inplace=True),
                nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
            )

    # -------------------------------------------------------------------------
    # Public forward (supports frame chunking to save memory)
    # -------------------------------------------------------------------------
    def forward(
        self,
        feats: List[torch.Tensor],
        H: int,
        W: int,
        patch_start_idx: int,
        chunk_size: int = 8,
        **kwargs,
    ) -> Dict:
        """
        Args:
            feats: List of 4 entries, each entry is a tensor like [B, S, T, C] (or the 0th element of tuple/list is that tensor).
            H, W: Original image dimensions
            patch_start_idx: Starting index of patch tokens in sequence (for cropping non-patch tokens)
            chunk_size: Chunk size along time dimension S

        Returns:
            Dict[str, Tensor]
        """
        B, S, N, C = feats[0][0].shape
        feats = [feat[0].reshape(B * S, N, C) for feat in feats]

        # update image info, used by the GS-DPT head
        extra_kwargs = {}
        if "images" in kwargs:
            extra_kwargs.update({"images": rearrange(kwargs["images"], "B S ... -> (B S) ...")})

        if chunk_size is None or chunk_size >= S:
            out_dict = self._forward_impl(feats, H, W, patch_start_idx, **extra_kwargs)
            out_dict = {k: v.view(B, S, *v.shape[1:]) for k, v in out_dict.items()}
            return Dict(out_dict)

        out_dicts: List[TyDict[str, torch.Tensor]] = []
        for s0 in range(0, S, chunk_size):
            s1 = min(s0 + chunk_size, S)
            kw = {}
            if "images" in extra_kwargs:
                kw.update({"images": extra_kwargs["images"][s0:s1]})
            out_dicts.append(
                self._forward_impl([f[s0:s1] for f in feats], H, W, patch_start_idx, **kw)
            )
        out_dict = {k: torch.cat([od[k] for od in out_dicts], dim=0) for k in out_dicts[0].keys()}
        out_dict = {k: v.view(B, S, *v.shape[1:]) for k, v in out_dict.items()}
        return Dict(out_dict)

    # -------------------------------------------------------------------------
    # Internal forward (single chunk)
    # -------------------------------------------------------------------------
    def _forward_impl(
        self,
        feats: List[torch.Tensor],
        H: int,
        W: int,
        patch_start_idx: int,
    ) -> TyDict[str, torch.Tensor]:
        B, _, C = feats[0].shape
        ph, pw = H // self.patch_size, W // self.patch_size
        resized_feats = []
        for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
            x = feats[take_idx][:, patch_start_idx:]  # [B*S, N_patch, C]
            x = self.norm(x)
            # permute -> contiguous before reshape to keep conv input contiguous
            x = x.permute(0, 2, 1).contiguous().reshape(B, C, ph, pw)  # [B*S, C, ph, pw]

            x = self.projects[stage_idx](x)
            if self.pos_embed:
                x = self._add_pos_embed(x, W, H)
            x = self.resize_layers[stage_idx](x)  # Align scale
            resized_feats.append(x)

        # 2) Fusion pyramid (main branch only)
        fused = self._fuse(resized_feats)

        # 3) Upsample to target resolution, optionally add position encoding again
        h_out = int(ph * self.patch_size / self.down_ratio)
        w_out = int(pw * self.patch_size / self.down_ratio)

        fused = self.scratch.output_conv1(fused)
        fused = custom_interpolate(fused, (h_out, w_out), mode="bilinear", align_corners=True)
        if self.pos_embed:
            fused = self._add_pos_embed(fused, W, H)

        # 4) Shared neck1
        feat = fused

        # 5) Main head: logits -> activation
        main_logits = self.scratch.output_conv2(feat)
        outs: TyDict[str, torch.Tensor] = {}
        if self.has_conf:
            fmap = main_logits.permute(0, 2, 3, 1)
            pred = self._apply_activation_single(fmap[..., :-1], self.activation)
            conf = self._apply_activation_single(fmap[..., -1], self.conf_activation)
            outs[self.head_main] = pred.squeeze(1)
            outs[f"{self.head_main}_conf"] = conf.squeeze(1)
        else:
            outs[self.head_main] = self._apply_activation_single(
                main_logits, self.activation
            ).squeeze(1)

        # 6) Sky head (fixed 1 channel)
        if self.use_sky_head:
            sky_logits = self.scratch.sky_output_conv2(feat)
            outs[self.sky_name] = self._apply_sky_activation(sky_logits).squeeze(1)

        return outs

    # -------------------------------------------------------------------------
    # Subroutines
    # -------------------------------------------------------------------------
    def _fuse(self, feats: List[torch.Tensor]) -> torch.Tensor:
        """
        4-layer top-down fusion, returns finest scale features (after fusion, before neck1).
        """
        l1, l2, l3, l4 = feats

        l1_rn = self.scratch.layer1_rn(l1)
        l2_rn = self.scratch.layer2_rn(l2)
        l3_rn = self.scratch.layer3_rn(l3)
        l4_rn = self.scratch.layer4_rn(l4)

        # 4 -> 3 -> 2 -> 1
        out = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
        out = self.scratch.refinenet3(out, l3_rn, size=l2_rn.shape[2:])
        out = self.scratch.refinenet2(out, l2_rn, size=l1_rn.shape[2:])
        out = self.scratch.refinenet1(out, l1_rn)
        return out

    def _apply_activation_single(
        self, x: torch.Tensor, activation: str = "linear"
    ) -> torch.Tensor:
        """
        Apply activation to single channel output, maintaining semantic consistency with value branch in multi-channel case.
        Supports: exp / relu / sigmoid / softplus / tanh / linear / expp1
        """
        act = activation.lower() if isinstance(activation, str) else activation
        if act == "exp":
            return torch.exp(x)
        if act == "expp1":
            return torch.exp(x) + 1
        if act == "expm1":
            return torch.expm1(x)
        if act == "relu":
            return torch.relu(x)
        if act == "sigmoid":
            return torch.sigmoid(x)
        if act == "softplus":
            return torch.nn.functional.softplus(x)
        if act == "tanh":
            return torch.tanh(x)
        # Default linear
        return x

    def _apply_sky_activation(self, x: torch.Tensor) -> torch.Tensor:
        """
        Sky head activation (fixed 1 channel):
            * 'sigmoid' -> Sigmoid probability map
            * 'relu'    -> ReLU positive domain output
            * 'linear'  -> Original value (logits)
        """
        act = (
            self.sky_activation.lower()
            if isinstance(self.sky_activation, str)
            else self.sky_activation
        )
        if act == "sigmoid":
            return torch.sigmoid(x)
        if act == "relu":
            return torch.relu(x)
        # 'linear'
        return x

    def _add_pos_embed(self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
        """Simple UV position encoding directly added to feature map."""
        pw, ph = x.shape[-1], x.shape[-2]
        pe = create_uv_grid(pw, ph, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
        pe = position_grid_to_embed(pe, x.shape[1]) * ratio
        pe = pe.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
        return x + pe


# -----------------------------------------------------------------------------
# Building blocks (preserved, consistent with original)
# -----------------------------------------------------------------------------
def _make_fusion_block(
    features: int,
    size: Tuple[int, int] = None,
    has_residual: bool = True,
    groups: int = 1,
    inplace: bool = False,
) -> nn.Module:
    return FeatureFusionBlock(
        features=features,
        activation=nn.ReLU(inplace=inplace),
        deconv=False,
        bn=False,
        expand=False,
        align_corners=True,
        size=size,
        has_residual=has_residual,
        groups=groups,
    )


def _make_scratch(
    in_shape: List[int], out_shape: int, groups: int = 1, expand: bool = False
) -> nn.Module:
    scratch = nn.Module()
    # Optional expansion by stage
    c1 = out_shape
    c2 = out_shape * (2 if expand else 1)
    c3 = out_shape * (4 if expand else 1)
    c4 = out_shape * (8 if expand else 1)

    scratch.layer1_rn = nn.Conv2d(in_shape[0], c1, 3, 1, 1, bias=False, groups=groups)
    scratch.layer2_rn = nn.Conv2d(in_shape[1], c2, 3, 1, 1, bias=False, groups=groups)
    scratch.layer3_rn = nn.Conv2d(in_shape[2], c3, 3, 1, 1, bias=False, groups=groups)
    scratch.layer4_rn = nn.Conv2d(in_shape[3], c4, 3, 1, 1, bias=False, groups=groups)
    return scratch


class ResidualConvUnit(nn.Module):
    """Lightweight residual convolution block for fusion"""

    def __init__(self, features: int, activation: nn.Module, bn: bool, groups: int = 1) -> None:
        super().__init__()
        self.bn = bn
        self.groups = groups
        self.conv1 = nn.Conv2d(features, features, 3, 1, 1, bias=True, groups=groups)
        self.conv2 = nn.Conv2d(features, features, 3, 1, 1, bias=True, groups=groups)
        self.norm1 = None
        self.norm2 = None
        self.activation = activation
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore[override]
        out = self.activation(x)
        out = self.conv1(out)
        if self.norm1 is not None:
            out = self.norm1(out)

        out = self.activation(out)
        out = self.conv2(out)
        if self.norm2 is not None:
            out = self.norm2(out)

        return self.skip_add.add(out, x)


class FeatureFusionBlock(nn.Module):
    """Top-down fusion block: (optional) residual merge + upsampling + 1x1 contraction"""

    def __init__(
        self,
        features: int,
        activation: nn.Module,
        deconv: bool = False,
        bn: bool = False,
        expand: bool = False,
        align_corners: bool = True,
        size: Tuple[int, int] = None,
        has_residual: bool = True,
        groups: int = 1,
    ) -> None:
        super().__init__()
        self.align_corners = align_corners
        self.size = size
        self.has_residual = has_residual

        self.resConfUnit1 = (
            ResidualConvUnit(features, activation, bn, groups=groups) if has_residual else None
        )
        self.resConfUnit2 = ResidualConvUnit(features, activation, bn, groups=groups)

        out_features = (features // 2) if expand else features
        self.out_conv = nn.Conv2d(features, out_features, 1, 1, 0, bias=True, groups=groups)
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs: torch.Tensor, size: Tuple[int, int] = None) -> torch.Tensor:  # type: ignore[override]
        """
        xs:
            - xs[0]: Top branch input
            - xs[1]: Lateral input (can do residual addition with top branch)
        """
        y = xs[0]
        if self.has_residual and len(xs) > 1 and self.resConfUnit1 is not None:
            y = self.skip_add.add(y, self.resConfUnit1(xs[1]))

        y = self.resConfUnit2(y)

        # Upsampling
        if (size is None) and (self.size is None):
            up_kwargs = {"scale_factor": 2}
        elif size is None:
            up_kwargs = {"size": self.size}
        else:
            up_kwargs = {"size": size}

        y = custom_interpolate(y, **up_kwargs, mode="bilinear", align_corners=self.align_corners)
        y = self.out_conv(y)
        return y
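To make the scale alignment in `resize_layers` concrete, here is a self-contained sketch (toy patch-grid size and the default stage widths from the class above; it only uses torch and does not import the repository) showing how the four stages are brought to strides of x4, x2, x1 and 1/2 relative to the patch grid before fusion.

import torch
import torch.nn as nn

out_channels = (256, 512, 1024, 1024)  # default stage widths used above
ph, pw = 16, 16                        # toy patch grid, i.e. (H // patch_size, W // patch_size)

resize_layers = nn.ModuleList(
    [
        nn.ConvTranspose2d(out_channels[0], out_channels[0], kernel_size=4, stride=4),
        nn.ConvTranspose2d(out_channels[1], out_channels[1], kernel_size=2, stride=2),
        nn.Identity(),
        nn.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1),
    ]
)

for i, layer in enumerate(resize_layers):
    feat = torch.randn(1, out_channels[i], ph, pw)
    print(i, tuple(layer(feat).shape[-2:]))  # -> (64, 64), (32, 32), (16, 16), (8, 8)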
Depth-Anything-3-anysize/src/depth_anything_3/model/dualdpt.py
ADDED
@@ -0,0 +1,488 @@
# flake8: noqa E501
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Sequence, Tuple
import torch
import torch.nn as nn
from addict import Dict

from depth_anything_3.model.dpt import _make_fusion_block, _make_scratch
from depth_anything_3.model.utils.head_utils import (
    Permute,
    create_uv_grid,
    custom_interpolate,
    position_grid_to_embed,
)


class DualDPT(nn.Module):
    """
    Dual-head DPT for dense prediction with an always-on auxiliary head.

    Architectural notes:
      - Sky/object branches are removed.
      - `intermediate_layer_idx` is fixed to (0, 1, 2, 3).
      - Auxiliary head has its **own** fusion blocks (no fusion_inplace / no sharing).
      - Auxiliary head is internally multi-level; **only the final level** is returned.
      - Returns a **dict** with keys from `head_names`, e.g.:
            { main_name, f"{main_name}_conf", aux_name, f"{aux_name}_conf" }
      - `feature_only` is fixed to False.
    """

    def __init__(
        self,
        dim_in: int,
        *,
        patch_size: int = 14,
        output_dim: int = 2,
        activation: str = "exp",
        conf_activation: str = "expp1",
        features: int = 256,
        out_channels: Sequence[int] = (256, 512, 1024, 1024),
        pos_embed: bool = True,
        down_ratio: int = 1,
        aux_pyramid_levels: int = 4,
        aux_out1_conv_num: int = 5,
        head_names: Tuple[str, str] = ("depth", "ray"),
    ) -> None:
        super().__init__()

        # -------------------- configuration --------------------
        self.patch_size = patch_size
        self.activation = activation
        self.conf_activation = conf_activation
        self.pos_embed = pos_embed
        self.down_ratio = down_ratio

        self.aux_levels = aux_pyramid_levels
        self.aux_out1_conv_num = aux_out1_conv_num

        # names ONLY come from config (no hard-coded strings elsewhere)
        self.head_main, self.head_aux = head_names

        # Always expect 4 scales; enforce intermediate idx = (0, 1, 2, 3)
        self.intermediate_layer_idx: Tuple[int, int, int, int] = (0, 1, 2, 3)

        # -------------------- token pre-norm + per-stage projection --------------------
        self.norm = nn.LayerNorm(dim_in)
        self.projects = nn.ModuleList(
            [nn.Conv2d(dim_in, oc, kernel_size=1, stride=1, padding=0) for oc in out_channels]
        )

        # -------------------- spatial re-sizers (align to common scale before fusion) --------------------
        # design: stage strides (x4, x2, x1, /2) relative to patch grid to align to a common pivot scale
        self.resize_layers = nn.ModuleList(
            [
                nn.ConvTranspose2d(
                    out_channels[0], out_channels[0], kernel_size=4, stride=4, padding=0
                ),
                nn.ConvTranspose2d(
                    out_channels[1], out_channels[1], kernel_size=2, stride=2, padding=0
                ),
                nn.Identity(),
                nn.Conv2d(out_channels[3], out_channels[3], kernel_size=3, stride=2, padding=1),
            ]
        )

        # -------------------- scratch: stage adapters + fusion (main & aux are separate) --------------------
        self.scratch = _make_scratch(list(out_channels), features, expand=False)

        # Main fusion chain (independent)
        self.scratch.refinenet1 = _make_fusion_block(features)
        self.scratch.refinenet2 = _make_fusion_block(features)
        self.scratch.refinenet3 = _make_fusion_block(features)
        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False)

        # Primary head neck + head (independent)
        head_features_1 = features
        head_features_2 = 32
        self.scratch.output_conv1 = nn.Conv2d(
            head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
        )
        self.scratch.output_conv2 = nn.Sequential(
            nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
        )

        # Auxiliary fusion chain (completely separate; no sharing, i.e., "fusion_inplace=False")
        self.scratch.refinenet1_aux = _make_fusion_block(features)
        self.scratch.refinenet2_aux = _make_fusion_block(features)
        self.scratch.refinenet3_aux = _make_fusion_block(features)
        self.scratch.refinenet4_aux = _make_fusion_block(features, has_residual=False)

        # Aux pre-head per level (we will only *return final level*)
        self.scratch.output_conv1_aux = nn.ModuleList(
            [self._make_aux_out1_block(head_features_1) for _ in range(self.aux_levels)]
        )

        # Aux final projection per level
        use_ln = True
        ln_seq = (
            [Permute((0, 2, 3, 1)), nn.LayerNorm(head_features_2), Permute((0, 3, 1, 2))]
            if use_ln
            else []
        )
        self.scratch.output_conv2_aux = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Conv2d(
                        head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1
                    ),
                    *ln_seq,
                    nn.ReLU(inplace=True),
                    nn.Conv2d(head_features_2, 7, kernel_size=1, stride=1, padding=0),
                )
                for _ in range(self.aux_levels)
            ]
        )

    # -------------------------------------------------------------------------
    # Public forward (supports frame chunking for memory)
    # -------------------------------------------------------------------------

    def forward(
        self,
        feats: List[torch.Tensor],
        H: int,
        W: int,
        patch_start_idx: int,
        chunk_size: int = 8,
    ) -> Dict[str, torch.Tensor]:
        """
        Args:
            feats: List of 4 entries; each entry's first element is a [B, S, T, C] token tensor from the transformer.
            H, W: Original image dimensions.
            patch_start_idx: Patch-token start in the token sequence (to drop non-patch tokens).
            chunk_size: Optional chunking along S for memory.

        Returns:
            Dict[str, Tensor] with keys based on `head_names`, e.g.:
                self.head_main, f"{self.head_main}_conf",
                self.head_aux, f"{self.head_aux}_conf"
            Shapes:
                main:    [B, S, out_dim, H/down_ratio, W/down_ratio]
                main_cf: [B, S, 1, H/down_ratio, W/down_ratio]
                aux:     [B, S, 7, H/down_ratio, W/down_ratio]
                aux_cf:  [B, S, 1, H/down_ratio, W/down_ratio]
        """
        B, S, N, C = feats[0][0].shape
        feats = [feat[0].reshape(B * S, N, C) for feat in feats]
        if chunk_size is None or chunk_size >= S:
            out_dict = self._forward_impl(feats, H, W, patch_start_idx)
            out_dict = {k: v.reshape(B, S, *v.shape[1:]) for k, v in out_dict.items()}
            return Dict(out_dict)
        out_dicts = []
        for s0 in range(0, S, chunk_size):
            s1 = min(s0 + chunk_size, S)
            out_dict = self._forward_impl(
                [feat[s0:s1] for feat in feats],
                H,
                W,
                patch_start_idx,
            )
            out_dicts.append(out_dict)
        out_dict = {
            k: torch.cat([out_dict[k] for out_dict in out_dicts], dim=0)
            for k in out_dicts[0].keys()
        }
        out_dict = {k: v.view(B, S, *v.shape[1:]) for k, v in out_dict.items()}
        return Dict(out_dict)

    # -------------------------------------------------------------------------
    # Internal forward (single chunk)
    # -------------------------------------------------------------------------

    def _forward_impl(
        self,
        feats: List[torch.Tensor],
        H: int,
        W: int,
        patch_start_idx: int,
    ) -> Dict[str, torch.Tensor]:
        B, _, C = feats[0].shape
        ph, pw = H // self.patch_size, W // self.patch_size
        resized_feats = []
        for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
            x = feats[take_idx][:, patch_start_idx:]
            x = self.norm(x)
            x = x.permute(0, 2, 1).reshape(B, C, ph, pw)  # [B*S, C, ph, pw]

            x = self.projects[stage_idx](x)
            if self.pos_embed:
                x = self._add_pos_embed(x, W, H)
            x = self.resize_layers[stage_idx](x)  # align scales
            resized_feats.append(x)

        # 2) Fuse pyramid (main & aux are completely independent)
        fused_main, fused_aux_pyr = self._fuse(resized_feats)

        # 3) Upsample to target resolution and (optional) add pos-embed again
        h_out = int(ph * self.patch_size / self.down_ratio)
        w_out = int(pw * self.patch_size / self.down_ratio)

        fused_main = custom_interpolate(
            fused_main, (h_out, w_out), mode="bilinear", align_corners=True
        )
        if self.pos_embed:
            fused_main = self._add_pos_embed(fused_main, W, H)

        # Primary head: conv1 -> conv2 -> activate
        # fused_main = self.scratch.output_conv1(fused_main)
        main_logits = self.scratch.output_conv2(fused_main)
        fmap = main_logits.permute(0, 2, 3, 1)
        main_pred = self._apply_activation_single(fmap[..., :-1], self.activation)
        main_conf = self._apply_activation_single(fmap[..., -1], self.conf_activation)

        # Auxiliary head (multi-level inside) -> only last level returned (after activation)
        last_aux = fused_aux_pyr[-1]
        if self.pos_embed:
            last_aux = self._add_pos_embed(last_aux, W, H)
        # neck (per-level pre-conv) then final projection (only for last level)
        # last_aux = self.scratch.output_conv1_aux[-1](last_aux)
        last_aux_logits = self.scratch.output_conv2_aux[-1](last_aux)
        fmap_last = last_aux_logits.permute(0, 2, 3, 1)
        aux_pred = self._apply_activation_single(fmap_last[..., :-1], "linear")
        aux_conf = self._apply_activation_single(fmap_last[..., -1], self.conf_activation)
        return {
            self.head_main: main_pred.squeeze(-1),
            f"{self.head_main}_conf": main_conf,
            self.head_aux: aux_pred,
            f"{self.head_aux}_conf": aux_conf,
        }

    # -------------------------------------------------------------------------
    # Subroutines
    # -------------------------------------------------------------------------

    def _fuse(self, feats: List[torch.Tensor]) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        """
        Feature pyramid fusion.
        Returns:
            fused_main: Tensor at finest scale (after refinenet1)
            aux_pyr: List of aux tensors at each level (pre out_conv1_aux)
        """
        l1, l2, l3, l4 = feats

        l1_rn = self.scratch.layer1_rn(l1)
        l2_rn = self.scratch.layer2_rn(l2)
        l3_rn = self.scratch.layer3_rn(l3)
        l4_rn = self.scratch.layer4_rn(l4)

        # level 4 -> 3
        out = self.scratch.refinenet4(l4_rn, size=l3_rn.shape[2:])
        aux_out = self.scratch.refinenet4_aux(l4_rn, size=l3_rn.shape[2:])
        aux_list: List[torch.Tensor] = []
        if self.aux_levels >= 4:
            aux_list.append(aux_out)

        # level 3 -> 2
        out = self.scratch.refinenet3(out, l3_rn, size=l2_rn.shape[2:])
        aux_out = self.scratch.refinenet3_aux(aux_out, l3_rn, size=l2_rn.shape[2:])
        if self.aux_levels >= 3:
            aux_list.append(aux_out)

        # level 2 -> 1
        out = self.scratch.refinenet2(out, l2_rn, size=l1_rn.shape[2:])
        aux_out = self.scratch.refinenet2_aux(aux_out, l2_rn, size=l1_rn.shape[2:])
        if self.aux_levels >= 2:
            aux_list.append(aux_out)

        # level 1 (final)
        out = self.scratch.refinenet1(out, l1_rn)
        aux_out = self.scratch.refinenet1_aux(aux_out, l1_rn)
        aux_list.append(aux_out)

        out = self.scratch.output_conv1(out)
        aux_list = [self.scratch.output_conv1_aux[i](aux) for i, aux in enumerate(aux_list)]

        return out, aux_list

    def _add_pos_embed(self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
        """Simple UV positional embedding added to feature maps."""
        pw, ph = x.shape[-1], x.shape[-2]
        pe = create_uv_grid(pw, ph, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
        pe = position_grid_to_embed(pe, x.shape[1]) * ratio
        pe = pe.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
        return x + pe

    def _make_aux_out1_block(self, in_ch: int) -> nn.Sequential:
        """Factory for the aux pre-head stack before the final 1x1 projection."""
        if self.aux_out1_conv_num == 5:
            return nn.Sequential(
                nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
                nn.Conv2d(in_ch // 2, in_ch, 3, 1, 1),
                nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
                nn.Conv2d(in_ch // 2, in_ch, 3, 1, 1),
                nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
            )
        if self.aux_out1_conv_num == 3:
            return nn.Sequential(
                nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
                nn.Conv2d(in_ch // 2, in_ch, 3, 1, 1),
                nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1),
            )
        if self.aux_out1_conv_num == 1:
            return nn.Sequential(nn.Conv2d(in_ch, in_ch // 2, 3, 1, 1))
        raise ValueError(f"aux_out1_conv_num {self.aux_out1_conv_num} not supported")

    def _apply_activation_single(
        self, x: torch.Tensor, activation: str = "linear"
    ) -> torch.Tensor:
        """
        Apply activation to single channel output, maintaining semantic consistency with value branch in multi-channel case.
        Supports: exp / relu / sigmoid / softplus / tanh / linear / expp1
        """
        act = activation.lower() if isinstance(activation, str) else activation
        if act == "exp":
            return torch.exp(x)
        if act == "expm1":
            return torch.expm1(x)
        if act == "expp1":
            return torch.exp(x) + 1
        if act == "relu":
            return torch.relu(x)
        if act == "sigmoid":
            return torch.sigmoid(x)
        if act == "softplus":
            return torch.nn.functional.softplus(x)
        if act == "tanh":
            return torch.tanh(x)
        # Default linear
        return x


# # -----------------------------------------------------------------------------
# # Building blocks (tidy)
# # -----------------------------------------------------------------------------


# def _make_fusion_block(
#     features: int,
#     size: Tuple[int, int] = None,
#     has_residual: bool = True,
#     groups: int = 1,
#     inplace: bool = False,  # <- activation uses inplace=True by default; not related to "fusion_inplace"
# ) -> nn.Module:
#     return FeatureFusionBlock(
#         features=features,
#         activation=nn.ReLU(inplace=inplace),
#         deconv=False,
#         bn=False,
#         expand=False,
#         align_corners=True,
#         size=size,
#         has_residual=has_residual,
#         groups=groups,
#     )


# def _make_scratch(
#     in_shape: List[int], out_shape: int, groups: int = 1, expand: bool = False
# ) -> nn.Module:
#     scratch = nn.Module()
#     # optionally expand widths by stage
#     c1 = out_shape
#     c2 = out_shape * (2 if expand else 1)
#     c3 = out_shape * (4 if expand else 1)
#     c4 = out_shape * (8 if expand else 1)

#     scratch.layer1_rn = nn.Conv2d(in_shape[0], c1, 3, 1, 1, bias=False, groups=groups)
#     scratch.layer2_rn = nn.Conv2d(in_shape[1], c2, 3, 1, 1, bias=False, groups=groups)
#     scratch.layer3_rn = nn.Conv2d(in_shape[2], c3, 3, 1, 1, bias=False, groups=groups)
#     scratch.layer4_rn = nn.Conv2d(in_shape[3], c4, 3, 1, 1, bias=False, groups=groups)
#     return scratch


# class ResidualConvUnit(nn.Module):
#     """Lightweight residual conv block used within fusion."""

#     def __init__(self, features: int, activation: nn.Module, bn: bool, groups: int = 1) -> None:
#         super().__init__()
#         self.bn = bn
#         self.groups = groups
#         self.conv1 = nn.Conv2d(features, features, 3, 1, 1, bias=True, groups=groups)
#         self.conv2 = nn.Conv2d(features, features, 3, 1, 1, bias=True, groups=groups)
#         self.norm1 = None
#         self.norm2 = None
#         self.activation = activation
#         self.skip_add = nn.quantized.FloatFunctional()

#     def forward(self, x: torch.Tensor) -> torch.Tensor:  # type: ignore[override]
#         out = self.activation(x)
#         out = self.conv1(out)
#         if self.norm1 is not None:
#             out = self.norm1(out)

#         out = self.activation(out)
#         out = self.conv2(out)
#         if self.norm2 is not None:
#             out = self.norm2(out)

#         return self.skip_add.add(out, x)


# class FeatureFusionBlock(nn.Module):
#     """Top-down fusion block: (optional) residual merge + upsample + 1x1 shrink."""

#     def __init__(
#         self,
#         features: int,
#         activation: nn.Module,
#         deconv: bool = False,
#         bn: bool = False,
#         expand: bool = False,
#         align_corners: bool = True,
#         size: Tuple[int, int] = None,
#         has_residual: bool = True,
#         groups: int = 1,
#     ) -> None:
#         super().__init__()
#         self.align_corners = align_corners
#         self.size = size
#         self.has_residual = has_residual

#         self.resConfUnit1 = (
#             ResidualConvUnit(features, activation, bn, groups=groups) if has_residual else None
#         )
#         self.resConfUnit2 = ResidualConvUnit(features, activation, bn, groups=groups)

#         out_features = (features // 2) if expand else features
#         self.out_conv = nn.Conv2d(features, out_features, 1, 1, 0, bias=True, groups=groups)
#         self.skip_add = nn.quantized.FloatFunctional()

#     def forward(self, *xs: torch.Tensor, size: Tuple[int, int] = None) -> torch.Tensor:  # type: ignore[override]
#         """
#         xs:
#           - xs[0]: top input
#           - xs[1]: (optional) lateral (to be added with residual)
#         """
#         y = xs[0]
#         if self.has_residual and len(xs) > 1 and self.resConfUnit1 is not None:
#             y = self.skip_add.add(y, self.resConfUnit1(xs[1]))

#         y = self.resConfUnit2(y)

#         # upsample
#         if (size is None) and (self.size is None):
#             up_kwargs = {"scale_factor": 2}
#         elif size is None:
#             up_kwargs = {"size": self.size}
#         else:
#             up_kwargs = {"size": size}

#         y = custom_interpolate(y, **up_kwargs, mode="bilinear", align_corners=self.align_corners)
#         y = self.out_conv(y)
#         return y
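Both DPT variants share the same memory-saving pattern in `forward`: run `_forward_impl` on chunks of frames, concatenate the per-chunk dictionaries along the flattened frame axis, then reshape back to [B, S, ...]. The toy sketch below (a stand-in head and B = 1, as in typical single-batch inference; it is not the repository's `_forward_impl`) checks that the chunked path reproduces the unchunked result.

import torch

def toy_impl(x):
    # Stand-in for a per-chunk head: [S_chunk, C] -> dict of per-frame outputs.
    return {"depth": x * 2.0, "depth_conf": x.abs()}

B, S, C, chunk_size = 1, 6, 3, 4
feats = torch.randn(B * S, C)

# Unchunked reference.
ref = {k: v.view(B, S, C) for k, v in toy_impl(feats).items()}

# Chunk along the frame axis, run per chunk, concatenate along dim 0, reshape to [B, S, ...].
chunks = [toy_impl(feats[s0 : min(s0 + chunk_size, S)]) for s0 in range(0, S, chunk_size)]
merged = {k: torch.cat([c[k] for c in chunks], dim=0).view(B, S, C) for k in chunks[0]}

assert all(torch.equal(ref[k], merged[k]) for k in ref)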
Depth-Anything-3-anysize/src/depth_anything_3/model/gs_adapter.py
ADDED
@@ -0,0 +1,200 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import torch
from einops import einsum, rearrange, repeat
from torch import nn

from depth_anything_3.model.utils.transform import cam_quat_xyzw_to_world_quat_wxyz
from depth_anything_3.specs import Gaussians
from depth_anything_3.utils.geometry import affine_inverse, get_world_rays, sample_image_grid
from depth_anything_3.utils.pose_align import batch_align_poses_umeyama
from depth_anything_3.utils.sh_helpers import rotate_sh


class GaussianAdapter(nn.Module):

    def __init__(
        self,
        sh_degree: int = 0,
        pred_color: bool = False,
        pred_offset_depth: bool = False,
        pred_offset_xy: bool = True,
        gaussian_scale_min: float = 1e-5,
        gaussian_scale_max: float = 30.0,
    ):
        super().__init__()
        self.sh_degree = sh_degree
        self.pred_color = pred_color
        self.pred_offset_depth = pred_offset_depth
        self.pred_offset_xy = pred_offset_xy
        self.gaussian_scale_min = gaussian_scale_min
        self.gaussian_scale_max = gaussian_scale_max

        # Create a mask for the spherical harmonics coefficients. This ensures that at
        # initialization, the coefficients are biased towards having a large DC
        # component and small view-dependent components.
        if not pred_color:
            self.register_buffer(
                "sh_mask",
                torch.ones((self.d_sh,), dtype=torch.float32),
                persistent=False,
            )
            for degree in range(1, sh_degree + 1):
                self.sh_mask[degree**2 : (degree + 1) ** 2] = 0.1 * 0.25**degree

    def forward(
        self,
        extrinsics: torch.Tensor,  # "*#batch 4 4"
        intrinsics: torch.Tensor,  # "*#batch 3 3"
        depths: torch.Tensor,  # "*#batch"
        opacities: torch.Tensor,  # "*#batch" | "*#batch _"
        raw_gaussians: torch.Tensor,  # "*#batch _"
        image_shape: tuple[int, int],
        eps: float = 1e-8,
        gt_extrinsics: Optional[torch.Tensor] = None,  # "*#batch 4 4"
        **kwargs,
    ) -> Gaussians:
        device = extrinsics.device
        dtype = raw_gaussians.dtype
        H, W = image_shape
        b, v = raw_gaussians.shape[:2]

        # get cam2worlds and intr_normed to adapt to the 3DGS codebase
        cam2worlds = affine_inverse(extrinsics)
        intr_normed = intrinsics.clone().detach()
        intr_normed[..., 0, :] /= W
        intr_normed[..., 1, :] /= H

        # 1. compute 3DGS means
        # 1.1) offset the predicted depth if needed
        if self.pred_offset_depth:
            gs_depths = depths + raw_gaussians[..., -1]
            raw_gaussians = raw_gaussians[..., :-1]
        else:
            gs_depths = depths
        # 1.2) align predicted poses with GT if needed
        if gt_extrinsics is not None and not torch.equal(extrinsics, gt_extrinsics):
            try:
                _, _, pose_scales = batch_align_poses_umeyama(
                    gt_extrinsics.detach().float(),
                    extrinsics.detach().float(),
                )
            except Exception:
                pose_scales = torch.ones_like(extrinsics[:, 0, 0, 0])
            pose_scales = torch.clamp(pose_scales, min=1 / 3.0, max=3.0)
            cam2worlds[:, :, :3, 3] = cam2worlds[:, :, :3, 3] * rearrange(
                pose_scales, "b -> b () ()"
            )  # [b, i, j]
            gs_depths = gs_depths * rearrange(pose_scales, "b -> b () () ()")  # [b, v, h, w]
        # 1.3) casting xy in image space
        xy_ray, _ = sample_image_grid((H, W), device)
        xy_ray = xy_ray[None, None, ...].expand(b, v, -1, -1, -1)  # b v h w xy
        # offset xy if needed
        if self.pred_offset_xy:
            pixel_size = 1 / torch.tensor((W, H), dtype=xy_ray.dtype, device=device)
            offset_xy = raw_gaussians[..., :2]
            xy_ray = xy_ray + offset_xy * pixel_size
            raw_gaussians = raw_gaussians[..., 2:]  # skip the offset_xy
        # 1.4) unproject depth + xy to world ray
        origins, directions = get_world_rays(
            xy_ray,
            repeat(cam2worlds, "b v i j -> b v h w i j", h=H, w=W),
            repeat(intr_normed, "b v i j -> b v h w i j", h=H, w=W),
        )
        gs_means_world = origins + directions * gs_depths[..., None]
        gs_means_world = rearrange(gs_means_world, "b v h w d -> b (v h w) d")

        # 2. compute other GS attributes
        scales, rotations, sh = raw_gaussians.split((3, 4, 3 * self.d_sh), dim=-1)

        # 2.1) 3DGS scales
        # make the scale invariant to resolution
        scale_min = self.gaussian_scale_min
        scale_max = self.gaussian_scale_max
        scales = scale_min + (scale_max - scale_min) * scales.sigmoid()
        pixel_size = 1 / torch.tensor((W, H), dtype=dtype, device=device)
        multiplier = self.get_scale_multiplier(intr_normed, pixel_size)
        gs_scales = scales * gs_depths[..., None] * multiplier[..., None, None, None]
        gs_scales = rearrange(gs_scales, "b v h w d -> b (v h w) d")

        # 2.2) 3DGS quaternion (world space)
        # due to a historical issue, assume quaternion in order xyzw, not wxyz
        # Normalize the quaternion features to yield a valid quaternion.
        rotations = rotations / (rotations.norm(dim=-1, keepdim=True) + eps)
        # rotate them to world space
        cam_quat_xyzw = rearrange(rotations, "b v h w c -> b (v h w) c")
        c2w_mat = repeat(
            cam2worlds,
            "b v i j -> b (v h w) i j",
            h=H,
            w=W,
        )
        world_quat_wxyz = cam_quat_xyzw_to_world_quat_wxyz(cam_quat_xyzw, c2w_mat)
        gs_rotations_world = world_quat_wxyz  # b (v h w) c

        # 2.3) 3DGS color / SH coefficient (world space)
        sh = rearrange(sh, "... (xyz d_sh) -> ... xyz d_sh", xyz=3)
        if not self.pred_color:
            sh = sh * self.sh_mask

        if self.pred_color or self.sh_degree == 0:
            # predict pre-computed color or predict only the DC band, no need to transform
            gs_sh_world = sh
        else:
            gs_sh_world = rotate_sh(sh, cam2worlds[:, :, None, None, None, :3, :3])
        gs_sh_world = rearrange(gs_sh_world, "b v h w xyz d_sh -> b (v h w) xyz d_sh")

        # 2.4) 3DGS opacity
        gs_opacities = rearrange(opacities, "b v h w ... -> b (v h w) ...")

        return Gaussians(
            means=gs_means_world,
            harmonics=gs_sh_world,
            opacities=gs_opacities,
            scales=gs_scales,
            rotations=gs_rotations_world,
        )

    def get_scale_multiplier(
        self,
        intrinsics: torch.Tensor,  # "*#batch 3 3"
        pixel_size: torch.Tensor,  # "*#batch 2"
        multiplier: float = 0.1,
    ) -> torch.Tensor:  # " *batch"
        xy_multipliers = multiplier * einsum(
            intrinsics[..., :2, :2].float().inverse().to(intrinsics),
            pixel_size,
            "... i j, j -> ... i",
        )
        return xy_multipliers.sum(dim=-1)

    @property
    def d_sh(self) -> int:
        return 1 if self.pred_color else (self.sh_degree + 1) ** 2

    @property
    def d_in(self) -> int:
        # provided as reference for the gs_dpt output dim
        raw_gs_dim = 0
        if self.pred_offset_xy:
            raw_gs_dim += 2
        raw_gs_dim += 3  # scales
        raw_gs_dim += 4  # quaternion
        raw_gs_dim += 3 * self.d_sh  # color
        if self.pred_offset_depth:
            raw_gs_dim += 1

        return raw_gs_dim
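The adapter's `d_in` property pins down how many raw channels the upstream GS head must emit per pixel. A minimal sanity check of that bookkeeping for the default configuration (illustrative only; it assumes the class above is importable from the package path shown):

import torch
from depth_anything_3.model.gs_adapter import GaussianAdapter

adapter = GaussianAdapter(sh_degree=0, pred_offset_xy=True, pred_offset_depth=False)
# 2 (xy offset) + 3 (scales) + 4 (quaternion) + 3 * 1 (DC-only SH) = 12 raw channels per Gaussian
assert adapter.d_in == 12
assert adapter.d_sh == 1  # (sh_degree + 1) ** 2 with sh_degree = 0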
Depth-Anything-3-anysize/src/depth_anything_3/model/gsdpt.py
ADDED
@@ -0,0 +1,133 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict as TyDict
from typing import List, Sequence

import torch
import torch.nn as nn

from depth_anything_3.model.dpt import DPT
from depth_anything_3.model.utils.head_utils import activate_head_gs, custom_interpolate


class GSDPT(DPT):

    def __init__(
        self,
        dim_in: int,
        patch_size: int = 14,
        output_dim: int = 4,
        activation: str = "linear",
        conf_activation: str = "sigmoid",
        features: int = 256,
        out_channels: Sequence[int] = (256, 512, 1024, 1024),
        pos_embed: bool = True,
        feature_only: bool = False,
        down_ratio: int = 1,
        conf_dim: int = 1,
        norm_type: str = "idt",  # use to match legacy GS-DPT head, "idt" / "layer"
        fusion_block_inplace: bool = False,
    ) -> None:
        super().__init__(
            dim_in=dim_in,
            patch_size=patch_size,
            output_dim=output_dim,
            activation=activation,
            conf_activation=conf_activation,
            features=features,
            out_channels=out_channels,
            pos_embed=pos_embed,
            down_ratio=down_ratio,
            head_name="raw_gs",
            use_sky_head=False,
            norm_type=norm_type,
            fusion_block_inplace=fusion_block_inplace,
        )
        self.conf_dim = conf_dim
        if conf_dim and conf_dim > 1:
            assert (
                conf_activation == "linear"
            ), "use linear prediction when using view-dependent opacity"

        merger_out_dim = features if feature_only else features // 2
        self.images_merger = nn.Sequential(
            nn.Conv2d(3, merger_out_dim // 4, 3, 1, 1),  # fewer channels first
            nn.GELU(),
            nn.Conv2d(merger_out_dim // 4, merger_out_dim // 2, 3, 1, 1),
            nn.GELU(),
            nn.Conv2d(merger_out_dim // 2, merger_out_dim, 3, 1, 1),
            nn.GELU(),
        )

    # -------------------------------------------------------------------------
    # Internal forward (single chunk)
    # -------------------------------------------------------------------------
    def _forward_impl(
        self,
        feats: List[torch.Tensor],
        H: int,
        W: int,
        patch_start_idx: int,
        images: torch.Tensor,
    ) -> TyDict[str, torch.Tensor]:
        B, _, C = feats[0].shape
        ph, pw = H // self.patch_size, W // self.patch_size
        resized_feats = []
        for stage_idx, take_idx in enumerate(self.intermediate_layer_idx):
            x = feats[take_idx][:, patch_start_idx:]  # [B*S, N_patch, C]
            x = self.norm(x)
            x = x.permute(0, 2, 1).reshape(B, C, ph, pw)  # [B*S, C, ph, pw]

            x = self.projects[stage_idx](x)
            if self.pos_embed:
                x = self._add_pos_embed(x, W, H)
            x = self.resize_layers[stage_idx](x)  # Align scale
            resized_feats.append(x)

        # 2) Fusion pyramid (main branch only)
        fused = self._fuse(resized_feats)
        fused = self.scratch.output_conv1(fused)

        # 3) Upsample to target resolution, optionally add position encoding again
        h_out = int(ph * self.patch_size / self.down_ratio)
        w_out = int(pw * self.patch_size / self.down_ratio)

        fused = custom_interpolate(fused, (h_out, w_out), mode="bilinear", align_corners=True)

        # inject the image information here
        fused = fused + self.images_merger(images)

        if self.pos_embed:
            fused = self._add_pos_embed(fused, W, H)

        # 4) Shared neck1
        # feat = self.scratch.output_conv1(fused)
        feat = fused

        # 5) Main head: logits -> activate_head or single channel activation
        main_logits = self.scratch.output_conv2(feat)
        outs: TyDict[str, torch.Tensor] = {}
        if self.has_conf:
            pred, conf = activate_head_gs(
                main_logits,
                activation=self.activation,
                conf_activation=self.conf_activation,
                conf_dim=self.conf_dim,
            )
            outs[self.head_main] = pred.squeeze(1)
            outs[f"{self.head_main}_conf"] = conf.squeeze(1)
        else:
            outs[self.head_main] = self._apply_activation_single(main_logits).squeeze(1)

        return outs
Depth-Anything-3-anysize/src/depth_anything_3/model/utils/attention.py
ADDED
@@ -0,0 +1,109 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110  # noqa

from typing import Callable, Optional, Union

import torch
import torch.nn.functional as F
from torch import Tensor, nn


class Attention(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = nn.LayerNorm,
        qk_norm: bool = False,
        rope=None,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)
        self.rope = rope

    def forward(self, x: Tensor, pos=None, attn_mask=None) -> Tensor:
        # Debug breakpoint removed for production
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)
        q = self.rope(q, pos) if self.rope is not None else q
        k = self.rope(k, pos) if self.rope is not None else k
        x = F.scaled_dot_product_attention(
            q,
            k,
            v,
            dropout_p=self.attn_drop.p if self.training else 0.0,
            attn_mask=attn_mask,
        )
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class LayerScale(nn.Module):
    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


class Mlp(nn.Module):
    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x
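Both modules above preserve the (batch, tokens, dim) layout of their input, which is what lets the transformer block below wire them as residual branches. A minimal shape check (illustrative only; it assumes the module is importable from the package path shown):

import torch
from depth_anything_3.model.utils.attention import Attention, Mlp

attn = Attention(dim=64, num_heads=8)
mlp = Mlp(in_features=64, hidden_features=256)
x = torch.randn(2, 197, 64)          # (batch, tokens, dim)
assert attn(x).shape == x.shape      # attention keeps the token layout
assert mlp(x).shape == x.shape       # MLP maps dim -> hidden -> dim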
Depth-Anything-3-anysize/src/depth_anything_3/model/utils/block.py
ADDED
@@ -0,0 +1,81 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Callable

from torch import Tensor, nn

from .attention import Attention, LayerScale, Mlp


class Block(nn.Module):
    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        qk_norm: bool = False,
        rope=None,
    ) -> None:
        super().__init__()

        self.norm1 = norm_layer(dim)

        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            qk_norm=qk_norm,
            rope=rope,
        )

        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()

        self.sample_drop_ratio = 0.0  # Equivalent to always having drop_path=0

    def forward(self, x: Tensor, pos=None, attn_mask=None) -> Tensor:
        def attn_residual_func(x: Tensor, pos=None, attn_mask=None) -> Tensor:
            return self.ls1(self.attn(self.norm1(x), pos=pos, attn_mask=attn_mask))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        # drop_path is always 0, so always take the else branch
        x = x + attn_residual_func(x, pos=pos, attn_mask=attn_mask)
        x = x + ffn_residual_func(x)
        return x
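A small sketch of stacking several of these blocks, since their forward signature (pos, attn_mask) means they are typically driven by an explicit loop rather than nn.Sequential (illustrative only; it assumes the package import path shown):

import torch
from depth_anything_3.model.utils.block import Block

layers = [Block(dim=64, num_heads=8, init_values=1e-5) for _ in range(4)]
x = torch.randn(1, 50, 64)
for layer in layers:
    x = layer(x)  # each block applies x + ls1(attn(norm1(x))), then x + ls2(mlp(norm2(x)))
assert x.shape == (1, 50, 64)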
Depth-Anything-3-anysize/src/depth_anything_3/model/utils/gs_renderer.py
ADDED
@@ -0,0 +1,340 @@
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
from math import isqrt
from typing import Literal, Optional

import torch
from einops import rearrange, repeat
from tqdm import tqdm

from depth_anything_3.specs import Gaussians
from depth_anything_3.utils.camera_trj_helpers import (
    interpolate_extrinsics,
    interpolate_intrinsics,
    render_dolly_zoom_path,
    render_stabilization_path,
    render_wander_path,
    render_wobble_inter_path,
)
from depth_anything_3.utils.geometry import affine_inverse, as_homogeneous, get_fov
from depth_anything_3.utils.logger import logger

try:
    from gsplat import rasterization
except ImportError:
    logger.warn(
        "Dependency `gsplat` is required for rendering 3DGS. "
        "Install via: pip install git+https://github.com/nerfstudio-project/"
        "gsplat.git@0b4dddf04cb687367602c01196913cde6a743d70"
    )


def render_3dgs(
    extrinsics: torch.Tensor,  # "batch_views 4 4", w2c
    intrinsics: torch.Tensor,  # "batch_views 3 3", normalized
    image_shape: tuple[int, int],
    gaussian: Gaussians,
    background_color: Optional[torch.Tensor] = None,  # "batch_views 3"
    use_sh: bool = True,
    num_view: int = 1,
    color_mode: Literal["RGB+D", "RGB+ED"] = "RGB+D",
    **kwargs,
) -> tuple[
    torch.Tensor,  # "batch_views 3 height width"
    torch.Tensor,  # "batch_views height width"
]:
    # extract gaussian params
    gaussian_means = gaussian.means
    gaussian_scales = gaussian.scales
    gaussian_quats = gaussian.rotations
    gaussian_opacities = gaussian.opacities
    gaussian_sh_coefficients = gaussian.harmonics
    b, _, _ = extrinsics.shape

    if background_color is None:
        background_color = repeat(torch.tensor([0.0, 0.0, 0.0]), "c -> b c", b=b).to(
            gaussian_sh_coefficients
        )

    if use_sh:
        _, _, _, n = gaussian_sh_coefficients.shape
        degree = isqrt(n) - 1
        shs = rearrange(gaussian_sh_coefficients, "b g xyz n -> b g n xyz").contiguous()
    else:  # use color
        shs = (
            gaussian_sh_coefficients.squeeze(-1).sigmoid().contiguous()
        )  # (b, g, c), normed to (0, 1)

    h, w = image_shape

    fov_x, fov_y = get_fov(intrinsics).unbind(dim=-1)
    tan_fov_x = (0.5 * fov_x).tan()
    tan_fov_y = (0.5 * fov_y).tan()
    focal_length_x = w / (2 * tan_fov_x)
    focal_length_y = h / (2 * tan_fov_y)

    view_matrix = extrinsics.float()

    all_images = []
    all_radii = []
    all_depths = []
    # render views batch by batch; each batch contains one scene
    # assume the Gaussian parameters are originally repeated along the view dim
    batch_scene = b // num_view

    def index_i_gs_attr(full_attr, idx):
        # return rearrange(full_attr, "(b v) ... -> b v ...", v=num_view)[idx, 0]
        return full_attr[idx]

    for i in range(batch_scene):
        K = repeat(
            torch.tensor(
                [
                    [0, 0, w / 2.0],
                    [0, 0, h / 2.0],
                    [0, 0, 1],
                ]
            ),
            "i j -> v i j",
            v=num_view,
        ).to(gaussian_means)
        K[:, 0, 0] = focal_length_x.reshape(batch_scene, num_view)[i]
        K[:, 1, 1] = focal_length_y.reshape(batch_scene, num_view)[i]

        i_means = index_i_gs_attr(gaussian_means, i)  # [N, 3]
        i_scales = index_i_gs_attr(gaussian_scales, i)
        i_quats = index_i_gs_attr(gaussian_quats, i)
        i_opacities = index_i_gs_attr(gaussian_opacities, i)  # [N,]
        i_colors = index_i_gs_attr(shs, i)  # [N, K, 3]
        i_viewmats = rearrange(view_matrix, "(b v) ... -> b v ...", v=num_view)[i]  # [v, 4, 4]
        i_backgrounds = rearrange(background_color, "(b v) ... -> b v ...", v=num_view)[
            i
        ]  # [v, 3]

        render_colors, render_alphas, info = rasterization(
            means=i_means,
            quats=i_quats,  # [N, 4]
            scales=i_scales,  # [N, 3]
            opacities=i_opacities,
            colors=i_colors,
            viewmats=i_viewmats,  # [v, 4, 4]
            Ks=K,  # [v, 3, 3]
            backgrounds=i_backgrounds,
            render_mode=color_mode,
            width=w,
            height=h,
            packed=False,
            sh_degree=degree if use_sh else None,
        )
        depth = render_colors[..., -1].unbind(dim=0)

        image = rearrange(render_colors[..., :3], "v h w c -> v c h w").unbind(dim=0)
        radii = info["radii"].unbind(dim=0)
        try:
            info["means2d"].retain_grad()  # [1, N, 2]
        except Exception:
            pass
        all_images.extend(image)
        all_depths.extend(depth)
        all_radii.extend(radii)

    return torch.stack(all_images), torch.stack(all_depths)


def run_renderer_in_chunk_w_trj_mode(
    gaussians: Gaussians,
    extrinsics: torch.Tensor,  # world2cam, "batch view 4 4" | "batch view 3 4"
    intrinsics: torch.Tensor,  # unnormed intrinsics, "batch view 3 3"
    image_shape: tuple[int, int],
    chunk_size: Optional[int] = 8,
    trj_mode: Literal[
        "original",
        "smooth",
        "interpolate",
        "interpolate_smooth",
        "wander",
        "dolly_zoom",
        "extend",
        "wobble_inter",
    ] = "smooth",
    input_shape: Optional[tuple[int, int]] = None,
    enable_tqdm: Optional[bool] = False,
    **kwargs,
) -> tuple[
    torch.Tensor,  # color, "batch view 3 height width"
    torch.Tensor,  # depth, "batch view height width"
]:
    cam2world = affine_inverse(as_homogeneous(extrinsics))
    if input_shape is not None:
        in_h, in_w = input_shape
    else:
        in_h, in_w = image_shape
    intr_normed = intrinsics.clone().detach()
    intr_normed[..., 0, :] /= in_w
    intr_normed[..., 1, :] /= in_h
    if extrinsics.shape[1] <= 1:
        assert trj_mode in [
            "wander",
            "dolly_zoom",
        ], "Please set trj_mode to 'wander' or 'dolly_zoom' when n_views=1"

    def _smooth_trj_fn_batch(raw_c2ws, k_size=50):
        try:
            smooth_c2ws = torch.stack(
                [render_stabilization_path(c2w_i, k_size) for c2w_i in raw_c2ws],
                dim=0,
            )
        except Exception as e:
            print(f"[DEBUG] Path smoothing failed with error: {e}.")
            smooth_c2ws = raw_c2ws
        return smooth_c2ws

    # get rendered trj
    if trj_mode == "original":
        tgt_c2w = cam2world
        tgt_intr = intr_normed
    elif trj_mode == "smooth":
        tgt_c2w = _smooth_trj_fn_batch(cam2world)
        tgt_intr = intr_normed
    elif trj_mode in ["interpolate", "interpolate_smooth", "extend"]:
        inter_len = 8
        total_len = (cam2world.shape[1] - 1) * inter_len
        if total_len > 24 * 18:  # no more than 18s
            inter_len = max(1, 24 * 10 // (cam2world.shape[1] - 1))
        if total_len < 24 * 2:  # no less than 2s
            inter_len = max(1, 24 * 2 // (cam2world.shape[1] - 1))

        if inter_len > 2:
            t = torch.linspace(0, 1, inter_len, dtype=torch.float32, device=cam2world.device)
            t = (torch.cos(torch.pi * (t + 1)) + 1) / 2
            tgt_c2w_b = []
            tgt_intr_b = []
            for b_idx in range(cam2world.shape[0]):
                tgt_c2w = []
                tgt_intr = []
                for cur_idx in range(cam2world.shape[1] - 1):
                    tgt_c2w.append(
                        interpolate_extrinsics(
                            cam2world[b_idx, cur_idx], cam2world[b_idx, cur_idx + 1], t
                        )[(0 if cur_idx == 0 else 1) :]
                    )
                    tgt_intr.append(
                        interpolate_intrinsics(
                            intr_normed[b_idx, cur_idx], intr_normed[b_idx, cur_idx + 1], t
                        )[(0 if cur_idx == 0 else 1) :]
                    )
                tgt_c2w_b.append(torch.cat(tgt_c2w))
                tgt_intr_b.append(torch.cat(tgt_intr))
            tgt_c2w = torch.stack(tgt_c2w_b)  # b v 4 4
            tgt_intr = torch.stack(tgt_intr_b)  # b v 3 3
        else:
            tgt_c2w = cam2world
            tgt_intr = intr_normed
        if trj_mode in ["interpolate_smooth", "extend"]:
            tgt_c2w = _smooth_trj_fn_batch(tgt_c2w)
        if trj_mode == "extend":
            # apply dolly_zoom and wander at the middle frame
            assert cam2world.shape[0] == 1, "extend only supports batch_size=1 currently."
            mid_idx = tgt_c2w.shape[1] // 2
            c2w_wd, intr_wd = render_wander_path(
                tgt_c2w[0, mid_idx],
                tgt_intr[0, mid_idx],
                h=in_h,
                w=in_w,
                num_frames=max(36, min(60, mid_idx // 2)),
                max_disp=24.0,
            )
            c2w_dz, intr_dz = render_dolly_zoom_path(
                tgt_c2w[0, mid_idx],
                tgt_intr[0, mid_idx],
                h=in_h,
                w=in_w,
                num_frames=max(36, min(60, mid_idx // 2)),
            )
            tgt_c2w = torch.cat(
                [
                    tgt_c2w[:, :mid_idx],
                    c2w_wd.unsqueeze(0),
                    c2w_dz.unsqueeze(0),
                    tgt_c2w[:, mid_idx:],
                ],
                dim=1,
            )
            tgt_intr = torch.cat(
                [
                    tgt_intr[:, :mid_idx],
                    intr_wd.unsqueeze(0),
                    intr_dz.unsqueeze(0),
                    tgt_intr[:, mid_idx:],
                ],
                dim=1,
            )
    elif trj_mode in ["wander", "dolly_zoom"]:
        if trj_mode == "wander":
            render_fn = render_wander_path
            extra_kwargs = {"max_disp": 24.0}
        else:
            render_fn = render_dolly_zoom_path
            extra_kwargs = {"D_focus": 30.0, "max_disp": 2.0}
        tgt_c2w = []
        tgt_intr = []
        for b_idx in range(cam2world.shape[0]):
            c2w_i, intr_i = render_fn(
                cam2world[b_idx, 0], intr_normed[b_idx, 0], h=in_h, w=in_w, **extra_kwargs
            )
            tgt_c2w.append(c2w_i)
            tgt_intr.append(intr_i)
        tgt_c2w = torch.stack(tgt_c2w)
        tgt_intr = torch.stack(tgt_intr)
    elif trj_mode == "wobble_inter":
        tgt_c2w, tgt_intr = render_wobble_inter_path(
            cam2world=cam2world,
            intr_normed=intr_normed,
            inter_len=10,
            n_skip=3,
        )
    else:
        raise Exception(f"trj mode [{trj_mode}] is not implemented.")

    _, v = tgt_c2w.shape[:2]
    tgt_extr = affine_inverse(tgt_c2w)
    if chunk_size is None:
        chunk_size = v
    chunk_size = min(v, chunk_size)
    all_colors = []
    all_depths = []
    for chunk_idx in tqdm(
        range(math.ceil(v / chunk_size)),
        desc="Rendering novel views",
        disable=(not enable_tqdm),
        leave=False,
    ):
        s = int(chunk_idx * chunk_size)
        e = int((chunk_idx + 1) * chunk_size)
        cur_n_view = tgt_extr[:, s:e].shape[1]
        color, depth = render_3dgs(
            extrinsics=rearrange(tgt_extr[:, s:e], "b v ... -> (b v) ..."),  # w2c
            intrinsics=rearrange(tgt_intr[:, s:e], "b v ... -> (b v) ..."),  # normed
            image_shape=image_shape,
            gaussian=gaussians,
            num_view=cur_n_view,
            **kwargs,
        )
        all_colors.append(rearrange(color, "(b v) ... -> b v ...", v=cur_n_view))
        all_depths.append(rearrange(depth, "(b v) ... -> b v ...", v=cur_n_view))
    all_colors = torch.cat(all_colors, dim=1)
    all_depths = torch.cat(all_depths, dim=1)

    return all_colors, all_depths