Spaces:

ritianyu
/

InfiniDepth

Running on Zero

File size: 33,462 Bytes

f4cf59f
 
 
78bb21c
4b6a4d3
78bb21c
 
f4cf59f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13ac8c9
7819ac9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7af552
 
 
 
7819ac9
c7af552
24d5d34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b2b3e5
 
 
 
16ea76a
f4cf59f
 
 
 
 
 
 
2b2b3e5
 
 
 
 
f4cf59f
 
2b2b3e5
f4cf59f
2b2b3e5
f4cf59f
 
 
2b2b3e5
f4cf59f
2b2b3e5
16ea76a
f4cf59f
 
 
13ac8c9
6afe9df
f4cf59f
bf21000
f4cf59f
 
3a76675
 
24d5d34
 
 
f4cf59f
2b2b3e5
 
24d5d34
 
6afe9df
2b2b3e5
f4cf59f
2b2b3e5
 
 
 
 
1c7685e
7819ac9
f4cf59f
 
2b2b3e5
 
 
 
 
4b6a4d3
f4cf59f
7819ac9
 
 
4b6a4d3
f4cf59f
 
2b2b3e5
 
 
 
 
 
4b6a4d3
f4cf59f
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b6a4d3
 
2b2b3e5
 
 
 
f4cf59f
 
4b6a4d3
 
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
f4cf59f
 
2b2b3e5
 
4b6a4d3
 
78bb21c
 
f4cf59f
 
24d5d34
 
 
 
f4cf59f
 
7819ac9
f4cf59f
 
 
 
7819ac9
 
f4cf59f
 
b6640e8
7819ac9
 
2b2b3e5
 
 
 
 
 
7819ac9
 
f4cf59f
7819ac9
 
b6640e8
 
f4cf59f
 
7819ac9
f320042
 
 
7819ac9
 
2b2b3e5
5699cd4
b6640e8
f4cf59f
 
 
 
 
 
 
 
2b2b3e5
 
 
 
 
 
 
 
 
f4cf59f
 
78bb21c
f4cf59f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b2b3e5
2dc583e
f4cf59f
 
 
 
 
 
 
2b2b3e5
 
f4cf59f
 
 
 
 
 
 
2b2b3e5
 
 
 
 
 
 
 
5699cd4
2b2b3e5
 
 
 
 
 
 
 
 
24d5d34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4cf59f
 
 
 
 
 
 
 
 
 
 
 
24d5d34
 
 
 
 
f4cf59f
 
 
24d5d34
 
 
 
 
f4cf59f
 
 
 
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a76675
f2e9a22
2b2b3e5
 
24d5d34
 
 
 
 
 
 
 
 
 
 
 
 
 
f4cf59f
24d5d34
f4cf59f
 
 
24d5d34
 
 
 
 
 
 
 
 
f4cf59f
2b2b3e5
 
f4cf59f
2b2b3e5
 
 
 
 
2dc583e
 
 
2b2b3e5
 
 
 
 
 
24d5d34
f4cf59f
2b2b3e5
78bb21c
f4cf59f
78bb21c
5e71ded
78bb21c
f4cf59f
 
 
 
78bb21c
 
 
 
f4cf59f
78bb21c
f4cf59f
 
 
24d5d34
2b2b3e5
f4cf59f
7819ac9
f4cf59f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7819ac9
2dc583e
78bb21c
f4cf59f
 
 
 
 
 
 
 
 
78bb21c
f4cf59f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78bb21c
f4cf59f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b2b3e5
f4cf59f
 
 
 
 
 
 
 
 
 
 
 
 
2b2b3e5
 
 
 
 
24d5d34
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24d5d34
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5699cd4
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5699cd4
2b2b3e5
 
5699cd4
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a76675
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4cf59f
 
 
 
2b2b3e5
f4cf59f
 
e1959a2
f4cf59f
 
24d5d34
f4cf59f
 
2b2b3e5
 
 
 
 
24d5d34
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
f4cf59f
2b2b3e5
 
f4cf59f
2b2b3e5
5e71ded
 
f4cf59f
 
5e71ded
f4cf59f
 
 
 
 
78bb21c
f4cf59f
8a982e6
 
 
 
78bb21c
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24d5d34
f4cf59f
78bb21c
2b2b3e5
f4cf59f
2b2b3e5
6c08972
 
f2e9a22
 
 
 
 
 
 
6c08972
f2e9a22
2b2b3e5
6c08972
 
2b2b3e5
 
 
 
 
4b6a4d3
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
 
6c08972
2b2b3e5
 
 
 
 
 
 
 
 
 
 
 
f2e9a22
2b2b3e5
f4cf59f
 
2b2b3e5
 
 
 
 
 
 
 
f4cf59f
 
24d5d34
 
 
 
 
 
f4cf59f
 
24d5d34
 
f4cf59f
78bb21c
f4cf59f
 
2b2b3e5
5699cd4
f4cf59f
1c7685e
f4cf59f
2b2b3e5
 
f2e9a22
2b2b3e5
f4cf59f
5699cd4
 
 
 
 
 
 
 
 
 
f4cf59f
 
78bb21c
2b2b3e5
f4cf59f
 
78bb21c
5e71ded
78bb21c
f4cf59f
 
 
 
78bb21c
 
 
 
 
5699cd4
 
 
 
 
 
 
 
 
 
78bb21c
 
 
 
7819ac9
e1959a2

from __future__ import annotations

from functools import lru_cache
import os
from pathlib import Path
from typing import Optional


def _ensure_localhost_no_proxy() -> None:
    """Make sure loopback hosts are listed in ``NO_PROXY`` and ``no_proxy``.

    Each variable is parsed as a comma-separated list. Missing loopback
    entries are appended and the variable is rewritten; a variable that
    already lists all of them (and is non-empty) is left untouched.
    """
    loopback_hosts = ("127.0.0.1", "localhost", "::1")
    for env_name in ("NO_PROXY", "no_proxy"):
        raw = os.environ.get(env_name, "")
        stripped = (chunk.strip() for chunk in raw.split(","))
        entries = [entry for entry in stripped if entry]
        missing = [host for host in loopback_hosts if host not in entries]
        if missing or not raw:
            os.environ[env_name] = ",".join(entries + missing)


# Runs at import time, ahead of the third-party imports further down.
_ensure_localhost_no_proxy()


def _ensure_hf_cache_dirs() -> None:
    """Pin the Hugging Face cache environment variables and create the directories.

    ``HF_HOME`` defaults to ``/tmp/huggingface``; the hub and assets caches
    default to ``hub`` / ``assets`` beneath it. Values already present in the
    environment win. ``HUGGINGFACE_HUB_CACHE`` and ``HF_HUB_DISABLE_XET`` are
    only set when they are not already defined.
    """
    env = os.environ
    home_dir = env.get("HF_HOME", "/tmp/huggingface")
    resolved = {
        "HF_HOME": home_dir,
        "HF_HUB_CACHE": env.get("HF_HUB_CACHE", os.path.join(home_dir, "hub")),
        "HF_ASSETS_CACHE": env.get("HF_ASSETS_CACHE", os.path.join(home_dir, "assets")),
    }

    env.update(resolved)
    env.setdefault("HUGGINGFACE_HUB_CACHE", resolved["HF_HUB_CACHE"])
    env.setdefault("HF_HUB_DISABLE_XET", "1")

    # Dict order is home, hub, assets -- the directories are created in that order.
    for directory in resolved.values():
        os.makedirs(directory, exist_ok=True)


# Runs at import time, ahead of the `huggingface_hub` import further down.
_ensure_hf_cache_dirs()

import cv2
import gradio as gr
import numpy as np
import torch
from huggingface_hub import hf_hub_download

try:
    import spaces
except ImportError:
    class _SpacesFallback:
        """Stand-in for the ``spaces`` package when it cannot be imported.

        ``GPU`` behaves as a no-op decorator, usable both bare (``@spaces.GPU``)
        and with keyword arguments (``@spaces.GPU(duration=...)``).
        """

        @staticmethod
        def GPU(fn=None, **_kwargs):
            def _passthrough(target):
                return target

            # Bare usage hands back the function itself; parametrised usage
            # hands back a decorator that does the same.
            return fn if callable(fn) else _passthrough

    spaces = _SpacesFallback()

from InfiniDepth.gs import GSPixelAlignPredictor, export_ply
from InfiniDepth.utils.gs_utils import (
    _build_sparse_uniform_gaussians,
)
from InfiniDepth.utils.hf_demo_utils import (
    DemoArtifacts,
    ensure_session_output_dir,
    export_point_cloud_assets,
    preview_depth_file,
    save_demo_artifacts,
    scan_example_cases,
)
from InfiniDepth.utils.hf_gs_viewer import (
    APP_TEMP_ROOT as GS_VIEWER_ROOT,
    build_embedded_viewer_html,
    build_viewer_error_html,
)
from InfiniDepth.utils.inference_utils import (
    apply_sky_mask_to_depth,
    build_camera_matrices,
    build_scaled_intrinsics_matrix,
    filter_gaussians_by_statistical_outlier,
    prepare_metric_depth_inputs,
    resolve_camera_intrinsics_for_inference,
    resolve_output_size_from_mode,
    run_optional_sampling_sky_mask,
    run_optional_sky_mask,
    unpack_gaussians_for_export,
)
from InfiniDepth.utils.io_utils import depth_to_disparity
from InfiniDepth.utils.model_utils import build_model
from InfiniDepth.utils.sampling_utils import SAMPLING_METHODS


# Directory containing this file; checkpoints and example data are resolved relative to it.
APP_ROOT = Path(__file__).resolve().parent
EXAMPLES_DIR = APP_ROOT / "example_data"
# (height, width) that input images are resized to before inference.
INPUT_SIZE = (768, 1024)
# Passed to `ensure_session_output_dir` together with the session hash.
APP_NAME = "infinidepth-hf-demo"
GS_TASK_CHOICE = "3DGS"
TASK_CHOICES = ["Depth", GS_TASK_CHOICE]
RGB_MODEL_TYPE = "InfiniDepth"
DEPTH_SENSOR_MODEL_TYPE = "InfiniDepth_DepthSensor"
MODEL_CHOICES = [RGB_MODEL_TYPE, DEPTH_SENSOR_MODEL_TYPE]
OUTPUT_MODE_CHOICES = ["upsample", "original", "specific"]
# Forwarded to `model.inference_for_gs` as `sample_point_num` / `coord_deterministic_sampling`.
GS_SAMPLE_POINT_NUM = 2000000
GS_COORD_DETERMINISTIC_SAMPLING = True
# `duration` arguments handed to the `spaces.GPU` decorator of each inference entry point.
DEPTH_GPU_DURATION_SECONDS = 180
GS_GPU_DURATION_SECONDS = 240

# Local checkpoint locations, keyed by model type; checked before falling back to the Hub.
LOCAL_DEPTH_MODEL_PATHS = {
    "InfiniDepth": APP_ROOT / "checkpoints/depth/infinidepth.ckpt",
    "InfiniDepth_DepthSensor": APP_ROOT / "checkpoints/depth/infinidepth_depthsensor.ckpt",
}
LOCAL_GS_MODEL_PATHS = {
    "InfiniDepth": APP_ROOT / "checkpoints/gs/infinidepth_gs.ckpt",
    "InfiniDepth_DepthSensor": APP_ROOT / "checkpoints/gs/infinidepth_depthsensor_gs.ckpt",
}
# Hub repository and per-model filenames used when the local checkpoint is missing.
HF_REPO_ID = "ritianyu/InfiniDepth"
HF_DEPTH_FILENAMES = {
    "InfiniDepth": "infinidepth.ckpt",
    "InfiniDepth_DepthSensor": "infinidepth_depthsensor.ckpt",
}
HF_GS_FILENAMES = {
    "InfiniDepth": "infinidepth_gs.ckpt",
    "InfiniDepth_DepthSensor": "infinidepth_depthsensor_gs.ckpt",
}
LOCAL_MOGE2_PATH = APP_ROOT / "checkpoints/moge-2-vitl-normal/model.pt"
HF_MOGE2_FILENAME = "moge2.pt"
LOCAL_SKYSEG_PATH = APP_ROOT / "checkpoints/sky/skyseg.onnx"
HF_SKYSEG_FILENAME = "skyseg.onnx"

# Example cases are scanned once at import time; the first one (if any) is the default selection.
EXAMPLE_CASES = scan_example_cases(EXAMPLES_DIR)
EXAMPLE_LOOKUP = {case.name: case for case in EXAMPLE_CASES}
DEFAULT_EXAMPLE_NAME = EXAMPLE_CASES[0].name if EXAMPLE_CASES else None
DEFAULT_EXAMPLE_INDEX = 0 if EXAMPLE_CASES else None
EXAMPLE_GALLERY_ITEMS = [(case.image_path, case.gallery_caption) for case in EXAMPLE_CASES]
# Tab ids chosen by `_primary_view_for_task`.
DEPTH_VIEW_TAB_ID = "pcd-viewer-tab"
GS_VIEW_TAB_ID = "gs-viewer-tab"
# Module-level side effect: registers the GS viewer root with Gradio's static paths.
gr.set_static_paths(paths=[str(GS_VIEWER_ROOT)])

# Stylesheet passed to `gr.Blocks(css=CSS, ...)`. Selectors are keyed by element
# ids such as `top-workspace`, `controls-column` and `example-gallery`.
CSS = """
#top-workspace {
    align-items: stretch;
}

#controls-column,
#inputs-column,
#outputs-column {
    min-width: 0;
}

#example-gallery {
    min-height: 280px;
}

#input-image {
    min-height: 420px;
}

#input-depth-preview {
    min-height: 240px;
}

#depth-model3d-viewer {
    height: 700px;
}

#depth-model3d-viewer canvas,
#depth-model3d-viewer model-viewer,
#depth-model3d-viewer .wrap,
#depth-model3d-viewer .container {
    height: 100% !important;
    max-height: 100% !important;
}

#gs-viewer-html {
    min-height: 748px;
    padding-bottom: 0.75rem;
}

#gs-viewer-html iframe {
    display: block;
    width: 100%;
    height: 700px !important;
    min-height: 700px !important;
}

#depth-preview,
#depth-comparison,
#depth-color {
    min-height: 260px;
}
"""


def _ensure_cuda() -> None:
    """Raise ``gr.Error`` unless ``torch`` reports an available CUDA device."""
    if torch.cuda.is_available():
        return

    message = (
        "No CUDA device is available for this request. On Hugging Face ZeroGPU, "
        "GPU access is only attached while the decorated inference call is running."
    )
    raise gr.Error(message)


def _resolve_repo_asset(local_path: Path, filename: str) -> str:
    """Return a filesystem path for an asset, preferring the local copy.

    When ``local_path`` exists it is returned as a string; otherwise
    ``filename`` is fetched from ``HF_REPO_ID`` via ``hf_hub_download`` and
    that call's result is returned.
    """
    if not local_path.exists():
        return hf_hub_download(repo_id=HF_REPO_ID, filename=filename)
    return str(local_path)


@lru_cache(maxsize=2)
def _resolve_depth_checkpoint(model_type: str) -> str:
    """Return the depth checkpoint path for ``model_type`` (memoized per model type)."""
    local_ckpt = LOCAL_DEPTH_MODEL_PATHS[model_type]
    hub_filename = HF_DEPTH_FILENAMES[model_type]
    return _resolve_repo_asset(local_ckpt, hub_filename)


@lru_cache(maxsize=2)
def _resolve_gs_checkpoint(model_type: str) -> str:
    """Return the GS checkpoint path for ``model_type`` (memoized per model type)."""
    local_ckpt = LOCAL_GS_MODEL_PATHS[model_type]
    hub_filename = HF_GS_FILENAMES[model_type]
    return _resolve_repo_asset(local_ckpt, hub_filename)


@lru_cache(maxsize=1)
def _resolve_skyseg_path() -> str:
    """Return the sky-segmentation model path (local copy or Hub download), memoized."""
    skyseg_path = _resolve_repo_asset(LOCAL_SKYSEG_PATH, HF_SKYSEG_FILENAME)
    return skyseg_path


@lru_cache(maxsize=1)
def _resolve_moge2_source() -> str:
    """Return the MoGe-2 weights path (local copy or Hub download), memoized."""
    moge2_path = _resolve_repo_asset(LOCAL_MOGE2_PATH, HF_MOGE2_FILENAME)
    return moge2_path


@lru_cache(maxsize=1)
def _preload_repo_assets() -> tuple[str, ...]:
    """Resolve every asset the app uses and return their paths as one tuple.

    Order: depth checkpoints for each entry of ``MODEL_CHOICES``, then GS
    checkpoints for each entry, then the MoGe-2 weights, then the sky
    segmentation model.
    """
    resolved = [_resolve_depth_checkpoint(name) for name in MODEL_CHOICES]
    resolved.extend(_resolve_gs_checkpoint(name) for name in MODEL_CHOICES)
    resolved.append(_resolve_moge2_source())
    resolved.append(_resolve_skyseg_path())
    return tuple(resolved)


@lru_cache(maxsize=2)
def _load_model(model_type: str):
    """Build the depth model for ``model_type`` (memoized per model type).

    Raises ``gr.Error`` through ``_ensure_cuda`` when no CUDA device is available.
    """
    _ensure_cuda()
    return build_model(
        model_type=model_type,
        model_path=_resolve_depth_checkpoint(model_type),
    )


@lru_cache(maxsize=4)
def _load_gs_predictor(model_type: str, dino_feature_dim: int):
    """Create a GS predictor on CUDA, load its checkpoint, and switch it to eval mode.

    Results are memoized per ``(model_type, dino_feature_dim)`` pair. Raises
    ``gr.Error`` through ``_ensure_cuda`` when no CUDA device is available.
    """
    _ensure_cuda()
    gs_predictor = GSPixelAlignPredictor(dino_feature_dim=dino_feature_dim)
    gs_predictor = gs_predictor.to(torch.device("cuda"))
    checkpoint_path = _resolve_gs_checkpoint(model_type)
    gs_predictor.load_from_infinidepth_gs_checkpoint(checkpoint_path)
    gs_predictor.eval()
    return gs_predictor


def _to_optional_float(value: Optional[float]) -> Optional[float]:
    """Convert ``value`` to ``float``, mapping ``None`` and ``""`` to ``None``."""
    is_blank = value is None or value == ""
    return None if is_blank else float(value)


def _to_rgb_uint8(image: np.ndarray) -> np.ndarray:
    """Return ``image`` as an ``(H, W, 3)`` ``uint8`` array.

    * ``uint8`` input is returned unchanged.
    * Floating-point input whose maximum is ``<= 1.0`` is treated as being in
      ``[0, 1]`` and scaled by 255; otherwise it is treated as being in
      ``[0, 255]``. Values are clipped to the chosen range and truncated.
      NaNs are replaced by 0 first so that a single NaN cannot defeat the
      range detection.
    * Any other dtype is clipped to ``[0, 255]`` and cast.

    Raises:
        gr.Error: if the input is not a 3-channel image or has no pixels.
    """
    image = np.asarray(image)
    if image.ndim != 3 or image.shape[2] != 3:
        raise gr.Error("Input image must be an RGB image.")
    if image.size == 0:
        # `.max()` below would raise a bare ValueError on a zero-size array.
        raise gr.Error("Input image is empty.")

    if image.dtype == np.uint8:
        return image

    if np.issubdtype(image.dtype, np.floating):
        # With a NaN present `image.max()` is NaN, `NaN <= 1.0` is False, and a
        # [0, 1] image would be treated as [0, 255] and come out almost black.
        # Infinities are mapped to large finite values and removed by the clip.
        image = np.nan_to_num(image, nan=0.0)
        if image.max() <= 1.0:
            return (np.clip(image, 0.0, 1.0) * 255.0).astype(np.uint8)
        return np.clip(image, 0.0, 255.0).astype(np.uint8)

    return np.clip(image, 0, 255).astype(np.uint8)


def _prepare_image_tensors(image_rgb: np.ndarray) -> tuple[np.ndarray, torch.Tensor, tuple[int, int]]:
    """Normalise an RGB image and build the model input tensor.

    Returns a 3-tuple:
      * the ``uint8`` RGB image at its original resolution,
      * a float tensor of the image resized to ``INPUT_SIZE``, moved to
        channel-first order with a leading batch axis and divided by 255,
      * the original ``(height, width)``.
    """
    rgb_u8 = _to_rgb_uint8(image_rgb)
    original_hw = (rgb_u8.shape[0], rgb_u8.shape[1])

    # cv2.resize takes (width, height), hence the swapped order.
    target_h, target_w = INPUT_SIZE
    resized_u8 = cv2.resize(rgb_u8, (target_w, target_h), interpolation=cv2.INTER_AREA)

    channels_first = torch.from_numpy(resized_u8).permute(2, 0, 1)
    batched = channels_first.unsqueeze(0).float() / 255.0
    return rgb_u8, batched, original_hw


def _format_depth_status(
    model_type: str,
    metric_depth_source: str,
    intrinsics_source: str,
    output_hw: tuple[int, int],
    depth_file: Optional[str],
) -> str:
    """Build the Markdown status text shown after a depth run.

    Only the file name of ``depth_file`` is shown; a missing depth file is
    reported as ``None``. Fields are separated by blank lines.
    """
    depth_label = Path(depth_file).name if depth_file else "None"
    out_h, out_w = output_hw[0], output_hw[1]
    fields = [
        "Task: `Depth`",
        f"Model: `{model_type}`",
        f"Input depth: `{depth_label}`",
        f"Metric alignment source: `{metric_depth_source}`",
        f"Camera intrinsics source: `{intrinsics_source}`",
        f"Output size: `{out_h} x {out_w}`",
    ]
    return "\n\n".join(fields)


def _format_gs_status(
    model_type: str,
    metric_depth_source: str,
    intrinsics_source: str,
    depth_file: Optional[str],
    gaussian_count: int,
) -> str:
    """Build the Markdown status text shown after a GS run.

    Only the file name of ``depth_file`` is shown; a missing depth file is
    reported as ``None``. Fields are separated by blank lines.
    """
    depth_label = Path(depth_file).name if depth_file else "None"
    fields = [
        "Task: `GS`",
        f"Model: `{model_type}`",
        f"Input depth: `{depth_label}`",
        f"Metric alignment source: `{metric_depth_source}`",
        f"Camera intrinsics source: `{intrinsics_source}`",
        f"Exported gaussians: `{gaussian_count}`",
    ]
    return "\n\n".join(fields)


def _model_availability_note(depth_path: Optional[str], model_type: str, *, auto_switched: bool = False) -> str:
    """Describe which model types are usable given the current depth input.

    The wording depends on whether a depth file is loaded and on whether the
    model selection was just switched automatically (``auto_switched``).
    """
    if not depth_path:
        if auto_switched:
            return (
                "No input depth loaded. Switched model back to `InfiniDepth`. "
                "Upload a depth file to enable `InfiniDepth_DepthSensor`."
            )
        return "No input depth loaded. `InfiniDepth` will be used until you upload a depth file."

    switched_to_sensor = auto_switched and model_type == DEPTH_SENSOR_MODEL_TYPE
    if switched_to_sensor:
        return (
            "Depth file loaded. Switched model to `InfiniDepth_DepthSensor`. "
            "You can still switch back to `InfiniDepth` for RGB-only inference."
        )
    return (
        "Depth file loaded. `InfiniDepth_DepthSensor` is available. "
        "You can also keep `InfiniDepth` for RGB-only inference."
    )


def _compose_depth_info_message(base_message: str, note: str) -> str:
    """Append ``note`` to ``base_message`` after a blank line; an empty note is dropped."""
    if not note:
        return base_message
    return "\n\n".join((base_message, note))


def _load_example_image(example_name: str) -> tuple[np.ndarray, Optional[str], Optional[np.ndarray], str, str]:
    """Load a bundled example case by name.

    Returns:
        ``(image_rgb, depth_path, depth_preview, model_type, detail)``. For an
        RGB-only case ``depth_path`` and ``depth_preview`` are ``None`` and
        ``model_type`` is ``RGB_MODEL_TYPE``; for a case with paired depth the
        model type is ``DEPTH_SENSOR_MODEL_TYPE``.

    Raises:
        gr.Error: if no example is selected, the name is unknown, the image
            cannot be read, or the paired depth file cannot be previewed.
    """
    if not example_name:
        raise gr.Error("Select an example case first.")
    # A stale or unknown name would otherwise surface as a bare KeyError.
    if example_name not in EXAMPLE_LOOKUP:
        raise gr.Error(f"Unknown example case: {example_name}")
    case = EXAMPLE_LOOKUP[example_name]

    image_bgr = cv2.imread(case.image_path, cv2.IMREAD_COLOR)
    if image_bgr is None:
        raise gr.Error(f"Failed to load example image: {case.image_path}")
    # OpenCV decodes to BGR; the rest of the app works in RGB.
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

    depth_path = case.depth_path
    if depth_path is None:
        detail = _compose_depth_info_message(
            f"Loaded example `{case.name}`.",
            _model_availability_note(None, RGB_MODEL_TYPE),
        )
        return image_rgb, None, None, RGB_MODEL_TYPE, detail

    # Same wrapping as `_update_depth_preview`, so a broken depth file is
    # reported to the user as a readable message.
    try:
        preview, depth_msg = preview_depth_file(depth_path)
    except Exception as exc:
        raise gr.Error(f"Failed to preview depth file: {exc}") from exc

    model_type = DEPTH_SENSOR_MODEL_TYPE
    detail = _compose_depth_info_message(
        f"Loaded example `{case.name}` with paired depth. {depth_msg}",
        _model_availability_note(depth_path, model_type, auto_switched=True),
    )
    return image_rgb, depth_path, preview, model_type, detail


def _selected_example_message(example_name: Optional[str]) -> str:
    """Return the Markdown line describing the currently selected example.

    Falls back to a prompt when nothing is selected or the name is unknown.
    """
    is_known = bool(example_name) and example_name in EXAMPLE_LOOKUP
    if not is_known:
        return "Select an example thumbnail, then click `Load Example`."

    selected = EXAMPLE_LOOKUP[example_name]
    if selected.has_depth:
        mode_label = "RGB + depth"
    else:
        mode_label = "RGB only"
    return f"Selected example: `{selected.name}` ({mode_label})"


def _select_example(evt: gr.SelectData):
    """Translate a gallery selection event into ``(example_name, message)``.

    Returns ``(None, prompt)`` when there are no example cases or the event
    carries no index. A tuple/list index contributes its first element.
    """
    raw_index = evt.index if EXAMPLE_CASES else None
    if raw_index is None:
        return None, _selected_example_message(None)

    position = raw_index[0] if isinstance(raw_index, (tuple, list)) else raw_index
    chosen = EXAMPLE_CASES[int(position)]
    return chosen.name, _selected_example_message(chosen.name)


def _primary_view_for_task(task_type: str):
    """Return a ``gr.update`` selecting the viewer tab that matches ``task_type``."""
    if task_type == GS_TASK_CHOICE:
        return gr.update(selected=GS_VIEW_TAB_ID)
    return gr.update(selected=DEPTH_VIEW_TAB_ID)


def _reset_uploaded_image_state(
    _image: Optional[np.ndarray],
    depth_path: Optional[str],
) -> tuple[None, None, str, str]:
    """Reset depth-related state after the input image changes.

    Returns ``(None, None, RGB_MODEL_TYPE, note)``; the note mentions the
    cleared depth file only when one was previously loaded.
    """
    if depth_path:
        note = (
            "Image updated. Cleared the previous depth file. Upload a new depth file to enable "
            "`InfiniDepth_DepthSensor`."
        )
    else:
        note = "Image updated. Upload a depth file to enable `InfiniDepth_DepthSensor`."
    return None, None, RGB_MODEL_TYPE, note


def _update_depth_preview(depth_path: Optional[str], model_type: str) -> tuple[Optional[np.ndarray], str, str]:
    """Preview the depth file and pick the model type that matches it.

    Returns ``(preview, next_model, message)``. The model becomes
    ``DEPTH_SENSOR_MODEL_TYPE`` when a depth path is present and
    ``RGB_MODEL_TYPE`` otherwise; the message notes whether that differs from
    the current ``model_type``.

    Raises:
        gr.Error: if ``preview_depth_file`` fails.
    """
    try:
        preview, depth_msg = preview_depth_file(depth_path)
    except Exception as exc:
        raise gr.Error(f"Failed to preview depth file: {exc}") from exc

    next_model = DEPTH_SENSOR_MODEL_TYPE if depth_path else RGB_MODEL_TYPE
    switched = model_type != next_model
    note = _model_availability_note(depth_path, next_model, auto_switched=switched)
    message = _compose_depth_info_message(depth_msg, note)
    return preview, next_model, message


def _settings_visibility(task_type: str, output_resolution_mode: str):
    """Return five ``gr.update(visible=...)`` values for the depth settings.

    Slots 0 and 4 follow the task being ``"Depth"``; slot 1 additionally
    requires the ``"upsample"`` mode; slots 2 and 3 additionally require the
    ``"specific"`` mode.
    """
    depth_selected = task_type == "Depth"
    show_upsample = depth_selected and output_resolution_mode == "upsample"
    show_specific = depth_selected and output_resolution_mode == "specific"
    flags = (depth_selected, show_upsample, show_specific, show_specific, depth_selected)
    return tuple(gr.update(visible=flag) for flag in flags)


def _normalize_filtered_gaussians(filtered_result):
    """Return the first element when given a tuple, otherwise the value unchanged."""
    return filtered_result[0] if isinstance(filtered_result, tuple) else filtered_result


def _resolve_requested_output_size(
    output_resolution_mode: str,
    specific_height: Optional[int],
    specific_width: Optional[int],
) -> tuple[int, int]:
    """Validate the user-supplied output size and return it as ``(height, width)``.

    In ``"specific"`` mode both values must be present and positive, otherwise
    ``gr.Error`` is raised. In the other modes the size fields are hidden by
    ``_settings_visibility``, so blank values fall back to ``INPUT_SIZE``
    instead of crashing on ``int(None)``.
    """
    missing = specific_height is None or specific_width is None
    if output_resolution_mode != "specific":
        return INPUT_SIZE if missing else (int(specific_height), int(specific_width))
    if missing:
        raise gr.Error("Specific height and width are required for the `specific` output mode.")
    height, width = int(specific_height), int(specific_width)
    if height <= 0 or width <= 0:
        raise gr.Error("Specific height and width must be positive integers.")
    return height, width


@spaces.GPU(duration=DEPTH_GPU_DURATION_SECONDS)
@torch.no_grad()
def _run_depth_inference(
    image: np.ndarray,
    depth_file: Optional[str],
    model_type: str,
    output_resolution_mode: str,
    upsample_ratio: int,
    specific_height: int,
    specific_width: int,
    enable_skyseg_model: bool,
    filter_point_cloud: bool,
    fx_org: Optional[float],
    fy_org: Optional[float],
    cx_org: Optional[float],
    cy_org: Optional[float],
    request: gr.Request,
):
    """Run depth inference on one image and export the resulting artifacts.

    Returns:
        An 8-tuple: status Markdown, comparison image path, colour depth path,
        grey depth path, GLB path, the list of downloadable files, and two
        trailing ``None`` values.

    Raises:
        gr.Error: if CUDA is unavailable, no image was provided, the
            depth-sensor model is selected without a depth file, or the
            requested output size is blank or non-positive in ``specific`` mode.
    """
    _ensure_cuda()
    if image is None:
        raise gr.Error("Upload an image or load an example before running inference.")
    if model_type == DEPTH_SENSOR_MODEL_TYPE and not depth_file:
        raise gr.Error("InfiniDepth_DepthSensor requires an input depth file.")
    # Validate before any model is loaded so that bad input fails quickly.
    requested_size = _resolve_requested_output_size(output_resolution_mode, specific_height, specific_width)

    skyseg_path = _resolve_skyseg_path() if enable_skyseg_model else None

    image_rgb, image_tensor, (org_h, org_w) = _prepare_image_tensors(image)
    device = torch.device("cuda")
    image_tensor = image_tensor.to(device)
    model = _load_model(model_type)

    gt_depth, prompt_depth, gt_depth_mask, use_gt_depth, moge2_intrinsics = prepare_metric_depth_inputs(
        input_depth_path=depth_file,
        input_size=INPUT_SIZE,
        image=image_tensor,
        device=device,
        moge2_pretrained=_resolve_moge2_source(),
    )

    # The model is fed disparity rather than depth.
    gt_disp = depth_to_disparity(gt_depth)
    prompt_disp = depth_to_disparity(prompt_depth)

    fx_org, fy_org, cx_org, cy_org, intrinsics_source = resolve_camera_intrinsics_for_inference(
        fx_org=_to_optional_float(fx_org),
        fy_org=_to_optional_float(fy_org),
        cx_org=_to_optional_float(cx_org),
        cy_org=_to_optional_float(cy_org),
        org_h=org_h,
        org_w=org_w,
        image=image_tensor,
        moge2_pretrained=_resolve_moge2_source(),
        moge2_intrinsics=moge2_intrinsics,
    )

    _, _, h, w = image_tensor.shape
    fx, fy, cx, cy, _ = build_scaled_intrinsics_matrix(
        fx_org=fx_org,
        fy_org=fy_org,
        cx_org=cx_org,
        cy_org=cy_org,
        org_h=org_h,
        org_w=org_w,
        h=h,
        w=w,
        device=image_tensor.device,
    )

    # When sky masking is disabled no path was resolved; the local path string is passed instead.
    sky_mask = run_optional_sky_mask(
        image=image_tensor,
        enable_skyseg_model=enable_skyseg_model,
        sky_model_ckpt_path=skyseg_path or str(LOCAL_SKYSEG_PATH),
    )

    h_out, w_out = resolve_output_size_from_mode(
        output_resolution_mode=output_resolution_mode,
        org_h=org_h,
        org_w=org_w,
        h=h,
        w=w,
        output_size=requested_size,
        upsample_ratio=int(upsample_ratio),
    )

    query_2d_uniform_coord = SAMPLING_METHODS["2d_uniform"]((h_out, w_out)).unsqueeze(0).to(device)
    pred_2d_uniform_depth, _ = model.inference(
        image=image_tensor,
        query_coord=query_2d_uniform_coord,
        gt_depth=gt_disp,
        gt_depth_mask=gt_depth_mask,
        prompt_depth=prompt_disp,
        prompt_mask=prompt_disp > 0,
    )
    # Rearrange the per-query predictions into a (1, 1, h_out, w_out) depth map.
    pred_depthmap = pred_2d_uniform_depth.permute(0, 2, 1).reshape(1, 1, h_out, w_out)

    pred_depthmap, pred_2d_uniform_depth = apply_sky_mask_to_depth(
        pred_depthmap=pred_depthmap,
        pred_2d_uniform_depth=pred_2d_uniform_depth,
        sky_mask=sky_mask,
        h_sample=h_out,
        w_sample=w_out,
        sky_depth_value=200.0,
    )

    session_hash = getattr(request, "session_hash", None)
    output_dir = ensure_session_output_dir(APP_NAME, session_hash)

    pred_depth_np = pred_depthmap.squeeze(0).squeeze(0).detach().cpu().numpy().astype(np.float32)
    artifacts = save_demo_artifacts(image_rgb=image_rgb, pred_depth=pred_depth_np, output_dir=output_dir)
    ply_path, glb_path = export_point_cloud_assets(
        sampled_coord=query_2d_uniform_coord.squeeze(0).cpu(),
        sampled_depth=pred_2d_uniform_depth.squeeze(0).squeeze(-1).cpu(),
        rgb_image=image_tensor.squeeze(0).cpu(),
        fx=fx,
        fy=fy,
        cx=cx,
        cy=cy,
        output_dir=output_dir,
        filter_flying_points=filter_point_cloud,
    )
    # Rebuild the artifact record so that it also carries the point-cloud paths.
    artifacts = DemoArtifacts(
        comparison_path=artifacts.comparison_path,
        color_depth_path=artifacts.color_depth_path,
        gray_depth_path=artifacts.gray_depth_path,
        raw_depth_path=artifacts.raw_depth_path,
        ply_path=ply_path,
        glb_path=glb_path,
    )

    metric_depth_source = "user depth" if use_gt_depth and depth_file else "MoGe-2"
    status = _format_depth_status(
        model_type=model_type,
        metric_depth_source=metric_depth_source,
        intrinsics_source=intrinsics_source,
        output_hw=(h_out, w_out),
        depth_file=depth_file,
    )
    return (
        status,
        artifacts.comparison_path,
        artifacts.color_depth_path,
        artifacts.gray_depth_path,
        glb_path,
        artifacts.download_files(),
        None,
        None,
    )


@spaces.GPU(duration=GS_GPU_DURATION_SECONDS)
@torch.no_grad()
def _run_gs_inference(
    image: np.ndarray,
    depth_file: Optional[str],
    model_type: str,
    enable_skyseg_model: bool,
    fx_org: Optional[float],
    fy_org: Optional[float],
    cx_org: Optional[float],
    cy_org: Optional[float],
    request: gr.Request,
):
    """Run GS inference on one image and export the gaussians as a PLY file.

    Returns:
        An 8-tuple: status Markdown, five ``None`` values, the embedded viewer
        HTML (or an error page if building the viewer failed), and a
        one-element list with the PLY path.

    Raises:
        gr.Error: if CUDA is unavailable, no image was provided, the
            depth-sensor model is selected without a depth file, the model
            returns no 3D-uniform query outputs, or no gaussians survive
            filtering.
    """
    _ensure_cuda()
    if image is None:
        raise gr.Error("Upload an image or load an example before running inference.")
    if model_type == DEPTH_SENSOR_MODEL_TYPE and not depth_file:
        raise gr.Error("InfiniDepth_DepthSensor requires an input depth file for GS inference.")

    image_rgb, image_tensor, (org_h, org_w) = _prepare_image_tensors(image)
    # The full-resolution RGB copy is not used by this function.
    del image_rgb
    device = torch.device("cuda")
    image_tensor = image_tensor.to(device)
    model = _load_model(model_type)

    gt_depth, prompt_depth, gt_depth_mask, use_gt_depth, moge2_intrinsics = prepare_metric_depth_inputs(
        input_depth_path=depth_file,
        input_size=INPUT_SIZE,
        image=image_tensor,
        device=device,
        moge2_pretrained=_resolve_moge2_source(),
    )

    # The model is fed disparity rather than depth.
    gt_disp = depth_to_disparity(gt_depth)
    prompt_disp = depth_to_disparity(prompt_depth)

    fx_org, fy_org, cx_org, cy_org, intrinsics_source = resolve_camera_intrinsics_for_inference(
        fx_org=_to_optional_float(fx_org),
        fy_org=_to_optional_float(fy_org),
        cx_org=_to_optional_float(cx_org),
        cy_org=_to_optional_float(cy_org),
        org_h=org_h,
        org_w=org_w,
        image=image_tensor,
        moge2_pretrained=_resolve_moge2_source(),
        moge2_intrinsics=moge2_intrinsics,
    )

    b, _, h, w = image_tensor.shape
    # Only the intrinsics/extrinsics matrices are needed here; the scalar outputs are discarded.
    _, _, _, _, intrinsics, extrinsics = build_camera_matrices(
        fx_org=fx_org,
        fy_org=fy_org,
        cx_org=cx_org,
        cy_org=cy_org,
        org_h=org_h,
        org_w=org_w,
        h=h,
        w=w,
        batch=b,
        device=device,
    )

    # The sky model is only resolved (possibly downloaded) when sky masking is enabled.
    skyseg_path = _resolve_skyseg_path() if enable_skyseg_model else str(LOCAL_SKYSEG_PATH)
    sky_mask = run_optional_sampling_sky_mask(
        image=image_tensor,
        enable_skyseg_model=enable_skyseg_model,
        sky_model_ckpt_path=skyseg_path,
        dilate_px=0,
    )

    depthmap, dino_tokens, query_3d_uniform_coord, pred_depth_3d = model.inference_for_gs(
        image=image_tensor,
        intrinsics=intrinsics,
        gt_depth=gt_disp,
        gt_depth_mask=gt_depth_mask,
        prompt_depth=prompt_disp,
        prompt_mask=prompt_disp > 0,
        sky_mask=sky_mask,
        sample_point_num=GS_SAMPLE_POINT_NUM,
        coord_deterministic_sampling=GS_COORD_DETERMINISTIC_SAMPLING,
    )
    if query_3d_uniform_coord is None or pred_depth_3d is None:
        raise gr.Error("GS inference did not return 3D-uniform query outputs.")

    # The predictor is sized from the last dimension of the DINO tokens.
    gs_predictor = _load_gs_predictor(model_type, int(dino_tokens.shape[-1]))
    dense_gaussians = gs_predictor(
        image=image_tensor,
        depthmap=depthmap,
        dino_tokens=dino_tokens,
        intrinsics=intrinsics,
        extrinsics=extrinsics,
    )

    pixel_gaussians = _build_sparse_uniform_gaussians(
        dense_gaussians=dense_gaussians,
        query_3d_uniform_coord=query_3d_uniform_coord,
        pred_depth_3d=pred_depth_3d,
        intrinsics=intrinsics,
        extrinsics=extrinsics,
        h=h,
        w=w,
    )
    # The outlier filter may return a tuple; only its first element is kept.
    pixel_gaussians = _normalize_filtered_gaussians(filter_gaussians_by_statistical_outlier(pixel_gaussians))
    # The gaussian count is read from dimension 1 of `means`.
    gaussian_count = int(pixel_gaussians.means.shape[1])
    if gaussian_count == 0:
        raise gr.Error("No valid gaussians remained after filtering.")

    means, harmonics, opacities, scales, rotations = unpack_gaussians_for_export(pixel_gaussians)

    session_hash = getattr(request, "session_hash", None)
    output_dir = ensure_session_output_dir(APP_NAME, session_hash)
    ply_path = output_dir / "gaussians.ply"

    # Camera metadata is written in terms of the original image size and intrinsics.
    export_ply(
        means=means,
        harmonics=harmonics,
        opacities=opacities,
        path=ply_path,
        scales=scales,
        rotations=rotations,
        focal_length_px=(fx_org, fy_org),
        principal_point_px=(cx_org, cy_org),
        image_shape=(org_h, org_w),
        extrinsic_matrix=extrinsics[0],
    )

    # Best effort: a viewer failure is shown as an error page instead of failing
    # the request, so the PLY download is still returned.
    try:
        gs_viewer_html = build_embedded_viewer_html(ply_path)
    except Exception as exc:
        print(f"[Warning] Failed to build embedded GS viewer: {exc}")
        gs_viewer_html = build_viewer_error_html(str(exc), ply_path)

    metric_depth_source = "user depth" if use_gt_depth and depth_file else "MoGe-2"
    status = _format_gs_status(
        model_type=model_type,
        metric_depth_source=metric_depth_source,
        intrinsics_source=intrinsics_source,
        depth_file=depth_file,
        gaussian_count=gaussian_count,
    )
    download_files = [str(ply_path)]

    return (
        status,
        None,
        None,
        None,
        None,
        None,
        gs_viewer_html,
        download_files,
    )


def _run_inference(
    task_type: str,
    image: np.ndarray,
    depth_file: Optional[str],
    model_type: str,
    output_resolution_mode: str,
    upsample_ratio: int,
    specific_height: int,
    specific_width: int,
    enable_skyseg_model: bool,
    filter_point_cloud: bool,
    fx_org: Optional[float],
    fy_org: Optional[float],
    cx_org: Optional[float],
    cy_org: Optional[float],
    request: gr.Request,
):
    """Dispatch a request to GS or depth inference based on ``task_type``.

    ``GS_TASK_CHOICE`` routes to ``_run_gs_inference`` (which ignores the
    depth-only settings); any other task routes to ``_run_depth_inference``.
    The selected function's 8-tuple is returned unchanged.

    Raises:
        gr.Error: if no image was provided or the depth-sensor model is
            selected without a depth file.
    """
    is_gs_task = task_type == GS_TASK_CHOICE

    # Reject requests that cannot succeed before entering the
    # `spaces.GPU`-decorated functions, so no GPU call is made for them.
    # The messages match the ones those functions raise themselves.
    if image is None:
        raise gr.Error("Upload an image or load an example before running inference.")
    if model_type == DEPTH_SENSOR_MODEL_TYPE and not depth_file:
        suffix = " for GS inference" if is_gs_task else ""
        raise gr.Error(f"InfiniDepth_DepthSensor requires an input depth file{suffix}.")

    if is_gs_task:
        return _run_gs_inference(
            image=image,
            depth_file=depth_file,
            model_type=model_type,
            enable_skyseg_model=enable_skyseg_model,
            fx_org=fx_org,
            fy_org=fy_org,
            cx_org=cx_org,
            cy_org=cy_org,
            request=request,
        )

    return _run_depth_inference(
        image=image,
        depth_file=depth_file,
        model_type=model_type,
        output_resolution_mode=output_resolution_mode,
        upsample_ratio=upsample_ratio,
        specific_height=specific_height,
        specific_width=specific_width,
        enable_skyseg_model=enable_skyseg_model,
        filter_point_cloud=filter_point_cloud,
        fx_org=fx_org,
        fy_org=fy_org,
        cx_org=cx_org,
        cy_org=cy_org,
        request=request,
    )


def _clear_outputs():
    return "", None, None, None, None, None, "", None


# Top-level Gradio UI. Builds a three-column workspace (controls, inputs,
# outputs) and then wires component events to their handler functions.
# Components are laid out in the order they are created inside each context
# manager, so statement order here is significant.
with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# InfiniDepth Demo")
    gr.Markdown(
        "Switch between depth inference and GS inference. `InfiniDepth` works with RGB-only inputs, while `InfiniDepth_DepthSensor` is enabled only after you upload a depth file or load an example with paired depth."
    )

    # Name of the example currently highlighted in the gallery; written by the
    # gallery select handler and read by the "Load Example" button.
    selected_example_name = gr.State(DEFAULT_EXAMPLE_NAME)

    with gr.Row(elem_id="top-workspace"):
        # ---- Column 1: task/model selection, examples and settings ----
        with gr.Column(scale=4, min_width=320, elem_id="controls-column"):
            task_type = gr.Radio(label="Inference Task", choices=TASK_CHOICES, value="Depth")
            model_type = gr.Radio(label="Model Type", choices=MODEL_CHOICES, value=RGB_MODEL_TYPE)

            gr.Markdown("### Example Data")
            example_gallery = gr.Gallery(
                value=EXAMPLE_GALLERY_ITEMS,
                label="Example Data",
                show_label=False,
                columns=2,
                height=280,
                object_fit="cover",
                allow_preview=False,
                selected_index=DEFAULT_EXAMPLE_INDEX,
                elem_id="example-gallery",
            )
            example_selection = gr.Markdown(_selected_example_message(DEFAULT_EXAMPLE_NAME))
            load_example_btn = gr.Button("Load Example")

            with gr.Accordion("Depth Settings", open=True):
                output_resolution_mode = gr.Dropdown(
                    label="Output Resolution Mode",
                    choices=OUTPUT_MODE_CHOICES,
                    value="upsample",
                )
                upsample_ratio = gr.Slider(label="Upsample Ratio", minimum=1, maximum=4, step=1, value=1)
                # The explicit height/width fields start hidden; they are among
                # the outputs of `_settings_visibility`, which presumably
                # toggles them based on the selected mode -- confirm there.
                specific_height = gr.Number(label="Specific Height", value=INPUT_SIZE[0], precision=0, visible=False)
                specific_width = gr.Number(label="Specific Width", value=INPUT_SIZE[1], precision=0, visible=False)
                enable_skyseg_model = gr.Checkbox(label="Apply Sky Mask", value=False)
                filter_point_cloud = gr.Checkbox(label="Filter Flying Points", value=True)

            # Intrinsics are free-text fields that default to empty.
            # NOTE(review): an empty field presumably means "estimate
            # automatically" (see the "auto" placeholder) -- confirm in the
            # inference handlers, which receive these values as text.
            with gr.Accordion("Optional Camera Intrinsics", open=False):
                fx_org = gr.Textbox(label="fx", value="", placeholder="auto")
                fy_org = gr.Textbox(label="fy", value="", placeholder="auto")
                cx_org = gr.Textbox(label="cx", value="", placeholder="auto")
                cy_org = gr.Textbox(label="cy", value="", placeholder="auto")

        # ---- Column 2: user inputs (RGB image, optional depth) ----
        with gr.Column(scale=5, min_width=360, elem_id="inputs-column"):
            input_image = gr.Image(
                label="Input Image",
                image_mode="RGB",
                type="numpy",
                sources=["upload", "clipboard", "webcam"],
                height=420,
                elem_id="input-image",
            )
            input_depth_file = gr.File(
                label="Optional Depth File",
                type="filepath",
                file_types=[".png", ".npy", ".npz", ".h5", ".hdf5", ".exr"],
            )
            input_depth_preview = gr.Image(
                label="Input Depth Preview",
                type="numpy",
                height=240,
                elem_id="input-depth-preview",
            )
            depth_info = gr.Markdown("No input depth loaded. `InfiniDepth` will be used until you upload a depth file.")
            submit_btn = gr.Button("Run Inference", variant="primary")

        # ---- Column 3: inference results ----
        with gr.Column(scale=8, min_width=640, elem_id="outputs-column"):
            status_output = gr.Markdown()

            # Primary 3D viewers. The tabs carry explicit ids and the Tabs
            # container is an output of `_primary_view_for_task`, presumably
            # so the tab matching the selected task can be activated.
            with gr.Tabs(selected=DEPTH_VIEW_TAB_ID, elem_id="primary-view-tabs") as primary_view_tabs:
                with gr.Tab("PCD Viewer", id=DEPTH_VIEW_TAB_ID, render_children=True):
                    depth_model_3d = gr.Model3D(
                        label="Point Cloud Viewer",
                        display_mode="solid",
                        clear_color=[1.0, 1.0, 1.0, 1.0],
                        height=700,
                        elem_id="depth-model3d-viewer",
                    )
                with gr.Tab("GS Viewer", id=GS_VIEW_TAB_ID, render_children=True):
                    gs_viewer_html = gr.HTML(elem_id="gs-viewer-html")

            # Secondary outputs: 2D depth visualisations and downloadable files.
            with gr.Tabs(elem_id="secondary-output-tabs"):
                with gr.Tab("Depth Analysis", render_children=True):
                    depth_comparison = gr.Image(
                        label="RGB vs Depth",
                        type="filepath",
                        height=280,
                        elem_id="depth-comparison",
                    )
                    with gr.Row():
                        color_depth = gr.Image(
                            label="Colorized Depth",
                            type="filepath",
                            height=260,
                            elem_id="depth-color",
                        )
                        gray_depth = gr.Image(
                            label="Grayscale Depth",
                            type="filepath",
                            height=260,
                            elem_id="depth-preview",
                        )
                with gr.Tab("Downloads", render_children=True):
                    with gr.Row():
                        depth_download_files = gr.File(label="Depth Files", type="filepath")
                        gs_download_files = gr.File(label="GS Files", type="filepath")

    # ---- Event wiring ----

    # Changing the task triggers two independent listeners: one refreshes the
    # depth-settings controls, the other updates the primary viewer tabs.
    task_type.change(
        fn=_settings_visibility,
        inputs=[task_type, output_resolution_mode],
        outputs=[output_resolution_mode, upsample_ratio, specific_height, specific_width, filter_point_cloud],
    )
    task_type.change(
        fn=_primary_view_for_task,
        inputs=[task_type],
        outputs=[primary_view_tabs],
    )

    # Same handler and wiring as the first task_type listener, fired when the
    # resolution mode itself changes.
    output_resolution_mode.change(
        fn=_settings_visibility,
        inputs=[task_type, output_resolution_mode],
        outputs=[output_resolution_mode, upsample_ratio, specific_height, specific_width, filter_point_cloud],
    )

    # Selecting a gallery item only records the choice and updates the message;
    # the image itself is loaded by the "Load Example" button below.
    example_gallery.select(
        fn=_select_example,
        outputs=[selected_example_name, example_selection],
    )

    # `.input` (rather than `.change`) is used for the image, so this listener
    # is tied to user-driven input on the image component.
    input_image.input(
        fn=_reset_uploaded_image_state,
        inputs=[input_image, input_depth_file],
        outputs=[input_depth_file, input_depth_preview, model_type, depth_info],
    )

    # A new (or removed) depth file refreshes the preview, the model selection
    # and the info text.
    input_depth_file.change(
        fn=_update_depth_preview,
        inputs=[input_depth_file, model_type],
        outputs=[input_depth_preview, model_type, depth_info],
    )

    load_example_btn.click(
        fn=_load_example_image,
        inputs=[selected_example_name],
        outputs=[input_image, input_depth_file, input_depth_preview, model_type, depth_info],
    )

    # "Run Inference" is a three-step chain; each `.then` step starts after the
    # previous one finishes:
    #   1. update the primary viewer tabs for the selected task,
    #   2. blank all eight output components,
    #   3. run the inference and fill the same eight components.
    # The two `outputs` lists below must stay identical in order and length,
    # since both handlers return one value per listed component.
    submit_btn.click(
        fn=_primary_view_for_task,
        inputs=[task_type],
        outputs=[primary_view_tabs],
    ).then(
        fn=_clear_outputs,
        outputs=[
            status_output,
            depth_comparison,
            color_depth,
            gray_depth,
            depth_model_3d,
            depth_download_files,
            gs_viewer_html,
            gs_download_files,
        ],
    ).then(
        fn=_run_inference,
        # `_run_inference` also declares a trailing `request: gr.Request`
        # parameter that is not listed here; Gradio supplies it itself based
        # on the type annotation.
        inputs=[
            task_type,
            input_image,
            input_depth_file,
            model_type,
            output_resolution_mode,
            upsample_ratio,
            specific_height,
            specific_width,
            enable_skyseg_model,
            filter_point_cloud,
            fx_org,
            fy_org,
            cx_org,
            cy_org,
        ],
        outputs=[
            status_output,
            depth_comparison,
            color_depth,
            gray_depth,
            depth_model_3d,
            depth_download_files,
            gs_viewer_html,
            gs_download_files,
        ],
    )


if __name__ == "__main__":
    # Run the asset preload step first so it completes before the server
    # starts accepting requests.
    _preload_repo_assets()

    # Enable the request queue, then start serving the app.
    queued_demo = demo.queue()
    queued_demo.launch()