weikaih commited on
Commit
f71ac1d
·
verified ·
1 Parent(s): 2fcc11c

WildDet3D Gradio demo

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. __pycache__/vis3d_glb.cpython-311.pyc +0 -0
  3. app.py +822 -0
  4. assets/demo/intrinsics.npy +3 -0
  5. assets/demo/rgb.png +3 -0
  6. requirements.txt +59 -0
  7. third_party/lingbot_depth/mdm/model/__init__.py +15 -0
  8. third_party/lingbot_depth/mdm/model/dinov2_rgbd/__init__.py +6 -0
  9. third_party/lingbot_depth/mdm/model/dinov2_rgbd/hub/__init__.py +4 -0
  10. third_party/lingbot_depth/mdm/model/dinov2_rgbd/hub/backbones.py +162 -0
  11. third_party/lingbot_depth/mdm/model/dinov2_rgbd/hub/utils.py +39 -0
  12. third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/__init__.py +12 -0
  13. third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/attention.py +100 -0
  14. third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/block.py +259 -0
  15. third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/dino_head.py +58 -0
  16. third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/drop_path.py +34 -0
  17. third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/layer_scale.py +27 -0
  18. third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/mlp.py +40 -0
  19. third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/patch_embed.py +88 -0
  20. third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/patch_embed_mlp.py +153 -0
  21. third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/swiglu_ffn.py +72 -0
  22. third_party/lingbot_depth/mdm/model/dinov2_rgbd/models/__init__.py +55 -0
  23. third_party/lingbot_depth/mdm/model/dinov2_rgbd/models/mask_utils.py +137 -0
  24. third_party/lingbot_depth/mdm/model/dinov2_rgbd/models/vision_transformer.py +479 -0
  25. third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/__init__.py +4 -0
  26. third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/cluster.py +95 -0
  27. third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/config.py +72 -0
  28. third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/dtype.py +37 -0
  29. third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/param_groups.py +103 -0
  30. third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/utils.py +95 -0
  31. third_party/lingbot_depth/mdm/model/modules_decoder.py +185 -0
  32. third_party/lingbot_depth/mdm/model/modules_rgbd_encoder.py +152 -0
  33. third_party/lingbot_depth/mdm/model/utils.py +127 -0
  34. third_party/lingbot_depth/mdm/model/v2.py +297 -0
  35. third_party/lingbot_depth/mdm/utils/__init__.py +0 -0
  36. third_party/lingbot_depth/mdm/utils/geo.py +105 -0
  37. third_party/lingbot_depth/mdm/utils/io.py +270 -0
  38. third_party/lingbot_depth/mdm/utils/tools.py +289 -0
  39. third_party/lingbot_depth/mdm/utils/vis.py +65 -0
  40. third_party/lingbot_depth/pyproject.toml +26 -0
  41. third_party/sam3/pyproject.toml +135 -0
  42. third_party/sam3/sam3/__init__.py +9 -0
  43. third_party/sam3/sam3/__pycache__/__init__.cpython-311.pyc +0 -0
  44. third_party/sam3/sam3/__pycache__/logger.cpython-311.pyc +0 -0
  45. third_party/sam3/sam3/__pycache__/model_builder.cpython-311.pyc +0 -0
  46. third_party/sam3/sam3/agent/__init__.py +3 -0
  47. third_party/sam3/sam3/agent/agent_core.py +565 -0
  48. third_party/sam3/sam3/agent/client_llm.py +207 -0
  49. third_party/sam3/sam3/agent/client_sam3.py +139 -0
  50. third_party/sam3/sam3/agent/helpers/__init__.py +3 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/demo/rgb.png filter=lfs diff=lfs merge=lfs -text
37
+ third_party/sam3/sam3/model/__pycache__/video_tracking_multiplex.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
38
+ third_party/sam3/sam3/perflib/tests/assets/masks.tiff filter=lfs diff=lfs merge=lfs -text
__pycache__/vis3d_glb.cpython-311.pyc ADDED
Binary file (28.6 kB). View file
 
app.py ADDED
@@ -0,0 +1,822 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Gradio Web Demo for WildDet3D (5-mode).
2
+
3
+ Supports 5 prompt modes:
4
+ - Text: Enter text like "chair.table" (one-to-many)
5
+ - Visual: Click box on image, text="visual" (one-to-many)
6
+ - Visual+Label: Click box + category label (one-to-many)
7
+ - Geometry: Click box on image, text="geometric" (one-to-one)
8
+ - Geometry+Label: Click box + category label (one-to-one)
9
+ - Point: Click on image to select point
10
+
11
+ Requirements:
12
+ pip install gradio>=5.0.0
13
+
14
+ Usage:
15
+ python demo/huggingface/app.py
16
+
17
+ Then open http://localhost:7860 in browser.
18
+ """
19
+
20
+ import os
21
+ import sys
22
+ from pathlib import Path
23
+
24
+ # Add paths: support both local dev and HuggingFace Space.
25
+ # Local dev: demo/huggingface/app.py -> repo root = ../../
26
+ # HF Space: wilddet3d/ is bundled in the same directory as app.py
27
+ _this_dir = Path(__file__).resolve().parent
28
+ if (_this_dir / "wilddet3d").exists():
29
+ # HuggingFace Space: everything bundled next to app.py
30
+ sys.path.insert(0, str(_this_dir))
31
+ else:
32
+ # Local dev: repo root is two levels up
33
+ repo_root = _this_dir.parent.parent
34
+ sys.path.insert(0, str(repo_root))
35
+
36
+ import spaces
37
+ import gradio as gr
38
+ import numpy as np
39
+ import torch
40
+ import cv2
41
+ from PIL import Image
42
+
43
+ from wilddet3d.inference import build_model, WildDet3DPredictor
44
+ from wilddet3d.preprocessing import preprocess
45
+ from wilddet3d.vis.visualize import draw_3d_boxes
46
+ from vis3d_glb import (
47
+ depth_to_pointcloud, create_scene_glb, create_mesh_scene_glb,
48
+ )
49
+
50
+
51
def draw_points_on_image(image, points, color=(0, 255, 0), radius=8):
    """Render prompt points onto a copy of *image*.

    Args:
        image: numpy array (H, W, 3).
        points: iterable of (x, y, label) tuples; label 1 is positive.
        color: fill color for positive points (green by default).
        radius: circle radius in pixels.

    Returns:
        A new image array with a filled, white-ringed dot per point.
    """
    canvas = image.copy()
    negative_color = (255, 0, 0)
    for px, py, lbl in points:
        center = (int(px), int(py))
        fill = color if lbl == 1 else negative_color
        # Filled dot plus a white outline ring for visibility.
        cv2.circle(canvas, center, radius, fill, -1)
        cv2.circle(canvas, center, radius + 2, (255, 255, 255), 2)
    return canvas
69
+
70
+
71
def draw_box_on_image(image, box, color=(0, 0, 255), thickness=3):
    """Render a rectangle prompt onto a copy of *image*.

    Args:
        image: numpy array (H, W, 3).
        box: [x1, y1, x2, y2] corner coordinates.
        color: rectangle color, (0, 0, 255) by default.
        thickness: rectangle line thickness in pixels.

    Returns:
        A new image array with the rectangle drawn.
    """
    canvas = image.copy()
    x1, y1, x2, y2 = map(int, box)
    cv2.rectangle(canvas, (x1, y1), (x2, y2), color, thickness)
    return canvas
87
+
88
+
89
# HuggingFace Model repo for checkpoints; used by _resolve_checkpoint()
# when no local checkpoint is found.
HF_MODEL_REPO = "weikaih/WildDet3D"
HF_CKPT_NAME = "wilddet3d.pt"

# Local checkpoint paths (tried in order before falling back to the Hub)
LOCAL_CHECKPOINTS = [
    "ckpt/wilddet3d.pt",  # release repo layout
]

# Default demo image path and its matching camera intrinsics file
DEFAULT_IMAGE_PATH = "assets/demo/rgb.png"
DEFAULT_INTRINSICS_PATH = "assets/demo/intrinsics.npy"

# Global model (loaded once, lazily, by get_model())
_cached_model = None
104
+
105
+
106
def _resolve_checkpoint():
    """Locate the model checkpoint.

    Prefers the first existing path in LOCAL_CHECKPOINTS; otherwise
    downloads HF_CKPT_NAME from HF_MODEL_REPO (HF_TOKEN env var is
    forwarded when set).

    Returns:
        Filesystem path to the checkpoint file.
    """
    existing = next((p for p in LOCAL_CHECKPOINTS if os.path.exists(p)), None)
    if existing is not None:
        return existing
    # Lazy import: only needed when no local checkpoint is present.
    from huggingface_hub import hf_hub_download
    print(f"Downloading checkpoint from {HF_MODEL_REPO}...")
    return hf_hub_download(
        repo_id=HF_MODEL_REPO,
        filename=HF_CKPT_NAME,
        token=os.environ.get("HF_TOKEN"),
    )
118
+
119
+
120
def get_model():
    """Return the global WildDet3D model, loading it on first use."""
    global _cached_model
    if _cached_model is not None:
        return _cached_model

    ckpt_path = _resolve_checkpoint()
    print(f"Loading WildDet3D model from {ckpt_path}...")
    _cached_model = build_model(
        checkpoint=ckpt_path,
        score_threshold=0.0,
        canonical_rotation=True,
        skip_pretrained=True,
    )
    print("Model loaded!")
    return _cached_model
134
+
135
+
136
def load_default_image():
    """Return the bundled demo image as a numpy array, or None if absent."""
    if not os.path.exists(DEFAULT_IMAGE_PATH):
        return None
    return np.array(Image.open(DEFAULT_IMAGE_PATH))
141
+
142
+
143
def load_default_intrinsics():
    """Return default camera parameters as (fx, fy, cx, cy).

    Reads them from DEFAULT_INTRINSICS_PATH when that file exists;
    otherwise falls back to hard-coded demo values.
    """
    if not os.path.exists(DEFAULT_INTRINSICS_PATH):
        return 518.86, 519.47, 325.58, 253.74
    K = np.load(DEFAULT_INTRINSICS_PATH)
    return float(K[0, 0]), float(K[1, 1]), float(K[0, 2]), float(K[1, 2])
154
+
155
+
156
def format_intrinsics(K):
    """Return a human-readable "fx=..., cy=..." summary of intrinsics.

    Accepts a 3x3 matrix (numpy or torch); a batched (B, 3, 3) input is
    reduced to its first matrix. None yields "Not available".
    """
    if K is None:
        return "Not available"
    if isinstance(K, torch.Tensor):
        K = K.cpu().numpy()
    mat = K[0] if K.ndim == 3 else K
    fx, fy = mat[0, 0], mat[1, 1]
    cx, cy = mat[0, 2], mat[1, 2]
    return f"fx={fx:.2f}, fy={fy:.2f}, cx={cx:.2f}, cy={cy:.2f}"
168
+
169
+
170
def scale_intrinsics_to_original(K, input_hw, original_hw):
    """Rescale camera intrinsics from model-input to original resolution.

    Args:
        K: 3x3 or batched (B, 3, 3) intrinsics, numpy or torch; None
            passes straight through.
        input_hw: (H, W) of the model input.
        original_hw: (H, W) of the original image.

    Returns:
        A rescaled copy of K (the input is never mutated), or None.
    """
    if K is None:
        return None

    # Work on a copy so the caller's matrix is untouched.
    scaled = K.clone() if isinstance(K, torch.Tensor) else K.copy()

    sx = original_hw[1] / input_hw[1]
    sy = original_hw[0] / input_hw[0]

    if scaled.ndim == 3:
        # Batched case: scale every matrix in the batch.
        scaled[:, 0, 0] *= sx
        scaled[:, 1, 1] *= sy
        scaled[:, 0, 2] *= sx
        scaled[:, 1, 2] *= sy
    else:
        scaled[0, 0] *= sx
        scaled[1, 1] *= sy
        scaled[0, 2] *= sx
        scaled[1, 2] *= sy

    return scaled
198
+
199
+
200
def transform_coords_to_input_space(x, y, original_hw, input_hw, padding):
    """Map a point from original-image space into preprocessed-input space.

    The preprocessed input is the original image resized into the padded
    content region, so the mapping is scale-then-offset.

    Args:
        x, y: coordinates in the original image.
        original_hw: (H, W) of the original image.
        input_hw: (H, W) of the preprocessed input (e.g., 1008x1008).
        padding: (pad_left, pad_right, pad_top, pad_bottom) in input pixels.

    Returns:
        (x, y) in preprocessed-input space.
    """
    pad_left, pad_right, pad_top, pad_bottom = padding
    orig_h, orig_w = original_hw

    # Size of the region actually occupied by image content.
    content_w = input_hw[1] - (pad_left + pad_right)
    content_h = input_hw[0] - (pad_top + pad_bottom)

    return (
        x * (content_w / orig_w) + pad_left,
        y * (content_h / orig_h) + pad_top,
    )
225
+
226
+
227
def on_image_select(
    evt: gr.SelectData, image, original_image, state,
    prompt_mode, point_label,
):
    """Handle a click on the input image and visualize the click.

    Args:
        evt: Gradio select event; ``evt.index`` holds the clicked (x, y).
        image: Currently displayed image (may already carry overlays).
        original_image: Clean copy of the uploaded image, if available.
        state: Dict with "points" (list of (x, y, label)) and "box"
            (list of corner (x, y) tuples) accumulated so far.
        prompt_mode: "Text", "Box-to-Multi-Object",
            "Box-to-Single-Object", or "Point".
        point_label: Radio value; any string containing "Positive"
            maps to label 1, everything else to 0.

    Returns:
        (new_state, info_message, visualization_image).
    """
    if image is None:
        return state, "Please upload an image first", None

    x, y = evt.index[0], evt.index[1]
    # Positive/negative flag is only meaningful in Point mode.
    label = 1 if "Positive" in point_label else 0

    # Copy the state so we never mutate Gradio's stored object in place.
    new_state = {
        "points": list(state.get("points", [])),
        "box": list(state.get("box", [])),
    }

    # Always draw on the clean original so overlays do not stack up.
    vis_image = (
        original_image.copy()
        if original_image is not None
        else image.copy()
    )

    if prompt_mode == "Point":
        new_state["points"].append((x, y, label))
        new_state["box"] = []  # point and box prompts are mutually exclusive
        label_str = "+" if label == 1 else "-"
        info = (
            f"Points: {len(new_state['points'])} total. "
            f"Last: ({x}, {y}) [{label_str}]"
        )
        vis_image = draw_points_on_image(vis_image, new_state["points"])

    elif prompt_mode in ("Box-to-Multi-Object", "Box-to-Single-Object"):
        new_state["points"] = []
        box_clicks = list(new_state.get("box", []))
        box_clicks.append((x, y))

        if len(box_clicks) == 1:
            # First corner recorded; wait for the opposite corner.
            new_state["box"] = box_clicks
            info = (
                f"[{prompt_mode}] Corner 1: ({x}, {y}) "
                f"- click again for corner 2"
            )
            vis_image = draw_points_on_image(vis_image, [(x, y, 1)])

        elif len(box_clicks) >= 2:
            # Normalize corners to (min, min, max, max) ordering.
            # NOTE(review): only the first two clicks define the box; a
            # third click is appended but ignored until "Clear Clicks".
            x1, y1 = box_clicks[0]
            x2, y2 = box_clicks[1]
            box = [min(x1, x2), min(y1, y2), max(x1, x2), max(y1, y2)]
            new_state["box"] = [(box[0], box[1]), (box[2], box[3])]
            info = (
                f"[{prompt_mode}] Box: "
                f"({box[0]}, {box[1]}) -> ({box[2]}, {box[3]})"
            )
            vis_image = draw_box_on_image(vis_image, box)
        else:
            info = f"Box clicks: {box_clicks}"
    else:
        info = "Text mode - just enter text and click Run"

    return new_state, info, vis_image
288
+
289
+
290
def clear_clicks(state, original_image):
    """Drop all recorded clicks and restore the unannotated image.

    Returns:
        (fresh_state, status_message, restored_image) where the image is
        a copy of the original, or None when no original is available.
    """
    restored = None if original_image is None else original_image.copy()
    return {"points": [], "box": []}, "Cleared - ready for new clicks", restored
298
+
299
+
300
@spaces.GPU
def run_wilddet3d(
    image,
    state,
    prompt_mode,
    text_prompt,
    use_label,
    label_text,
    score_thres,
    use_predicted_K,
    fx, fy, cx, cy,
    enable_3d_vis=True,
    remove_edges=True,
    point_density=2,
    use_textured_mesh=True,
):
    """Run WildDet3D inference for the selected prompt mode.

    Args:
        image: Uploaded RGB(A) numpy image, or None.
        state: Click-state dict with "points" and "box" entries.
        prompt_mode: "Text", "Box-to-Multi-Object",
            "Box-to-Single-Object", or "Point".
        text_prompt: "."-separated category names (Text mode only).
        use_label: Whether to attach *label_text* to box/point prompts.
        label_text: Optional category label for box/point prompts.
        score_thres: Minimum detection score kept in visualizations.
        use_predicted_K: If True, let the model predict intrinsics;
            otherwise fx/fy/cx/cy are used.
        fx, fy, cx, cy: Manual camera intrinsics (pixels).
        enable_3d_vis: Build a GLB scene when True.
        remove_edges: Drop depth-discontinuity geometry in the 3D scene.
        point_density: Point-cloud subsample stride (>= 1).
        use_textured_mesh: Textured mesh instead of a point cloud.

    Returns:
        (2D visualization PIL image, intrinsics info string,
        GLB file path or None, depth visualization PIL image or None).
    """
    if image is None:
        return None, "Please upload an image first", None, None

    # Convert RGBA to RGB if needed
    if image.ndim == 3 and image.shape[2] == 4:
        image = image[:, :, :3]

    device = "cuda" if torch.cuda.is_available() else "cpu"
    detector = get_model()

    # Build intrinsics matrix (or None if using the model's prediction)
    if use_predicted_K:
        intrinsics = None
    else:
        intrinsics = np.array([
            [fx, 0, cx],
            [0, fy, cy],
            [0, 0, 1]
        ], dtype=np.float32)

    # Preprocess image (resize/pad to model input; returns dict with
    # "images", "intrinsics", "input_hw", "original_hw", "padding", ...)
    data = preprocess(image.astype(np.float32), intrinsics)

    # Build prompt_text for box/point modes: "visual" triggers
    # one-to-many matching, "geometric" one-to-one.
    if prompt_mode == "Box-to-Multi-Object":
        prefix = "visual"
    elif prompt_mode == "Box-to-Single-Object":
        prefix = "geometric"
    else:
        prefix = "geometric"  # Point mode default

    if prompt_mode != "Text":
        if use_label and label_text and label_text.strip():
            geo_prompt_text = f"{prefix}: {label_text.strip()}"
        else:
            geo_prompt_text = prefix

    # Initialize prompt info for visualization
    prompt_points = None
    prompt_box = None

    # Run based on prompt mode
    if prompt_mode == "Text":
        input_texts = [
            t.strip() for t in text_prompt.split(".") if t.strip()
        ]
        if not input_texts:
            input_texts = ["object"]

        results = detector(
            images=data["images"].to(device),
            intrinsics=data["intrinsics"].to(device)[None],
            input_hw=[data["input_hw"]],
            original_hw=[data["original_hw"]],
            padding=[data["padding"]],
            input_texts=input_texts,
            return_predicted_intrinsics=True,
        )
        (
            boxes, boxes3d, scores, scores_2d, scores_3d,
            class_ids, depth_maps, predicted_K,
        ) = results
        class_id_mapping = {i: t for i, t in enumerate(input_texts)}

    elif prompt_mode in ("Box-to-Multi-Object", "Box-to-Single-Object"):
        box_coords = state.get("box", [])
        if len(box_coords) < 2:
            return (
                None,
                "Please click twice on the image to define a box",
                None,
                None,
            )

        # Map both corners from original-image to model-input space.
        x1_orig, y1_orig = box_coords[0]
        x2_orig, y2_orig = box_coords[1]
        x1, y1 = transform_coords_to_input_space(
            x1_orig, y1_orig,
            data["original_hw"], data["input_hw"], data["padding"],
        )
        x2, y2 = transform_coords_to_input_space(
            x2_orig, y2_orig,
            data["original_hw"], data["input_hw"], data["padding"],
        )
        box_xyxy = [float(x1), float(y1), float(x2), float(y2)]

        # Keep the original-space box for drawing on the output image.
        prompt_box = [x1_orig, y1_orig, x2_orig, y2_orig]

        results = detector(
            images=data["images"].to(device),
            intrinsics=data["intrinsics"].to(device)[None],
            input_hw=[data["input_hw"]],
            original_hw=[data["original_hw"]],
            padding=[data["padding"]],
            input_boxes=[box_xyxy],
            prompt_text=geo_prompt_text,
            return_predicted_intrinsics=True,
        )
        (
            boxes, boxes3d, scores, scores_2d, scores_3d,
            class_ids, depth_maps, predicted_K,
        ) = results
        class_id_mapping = {0: geo_prompt_text}

    elif prompt_mode == "Point":
        points = state.get("points", [])
        if not points:
            return (
                None,
                "Please click on the image to select a point",
                None,
                None,
            )

        # Map every clicked point into model-input space.
        transformed_points = []
        for x_orig, y_orig, lbl in points:
            x, y = transform_coords_to_input_space(
                x_orig, y_orig,
                data["original_hw"], data["input_hw"], data["padding"],
            )
            transformed_points.append((x, y, lbl))

        prompt_points = points

        results = detector(
            images=data["images"].to(device),
            intrinsics=data["intrinsics"].to(device)[None],
            input_hw=[data["input_hw"]],
            original_hw=[data["original_hw"]],
            padding=[data["padding"]],
            input_points=[transformed_points],
            prompt_text=geo_prompt_text,
            return_predicted_intrinsics=True,
        )
        (
            boxes, boxes3d, scores, scores_2d, scores_3d,
            class_ids, depth_maps, predicted_K,
        ) = results
        class_id_mapping = {0: geo_prompt_text}

    else:
        return None, f"Unknown prompt mode: {prompt_mode}", None, None

    # Scale predicted intrinsics to original resolution for display
    predicted_K_scaled = scale_intrinsics_to_original(
        predicted_K,
        input_hw=data["input_hw"],
        original_hw=data["original_hw"],
    )

    # Format intrinsics info (manual values shown too when supplied)
    orig_h, orig_w = data["original_hw"]
    intrinsics_info = f"Image: {orig_w}x{orig_h}\n"
    intrinsics_info += f"Predicted: {format_intrinsics(predicted_K_scaled)}"
    if not use_predicted_K:
        intrinsics_info = f"Image: {orig_w}x{orig_h}\n"
        intrinsics_info += (
            f"Used: fx={fx:.2f}, fy={fy:.2f}, "
            f"cx={cx:.2f}, cy={cy:.2f}\n"
        )
        intrinsics_info += (
            f"Predicted: {format_intrinsics(predicted_K_scaled)}"
        )

    # 2D visualization with prompts overlaid
    img_2d = visualize_results(
        data, boxes3d, scores, scores_2d, scores_3d,
        class_ids, class_id_mapping, score_thres,
        prompt_points=prompt_points, prompt_box=prompt_box,
    )

    # Depth map visualization (padding cropped, inverted TURBO colormap)
    depth_vis_img = None
    if depth_maps is not None and len(depth_maps) > 0:
        depth_np_raw = depth_maps[0].cpu().numpy()
        d = depth_np_raw.squeeze()

        # Crop away the letterbox padding before normalizing.
        pad_l, pad_r, pad_t, pad_b = data["padding"]
        h_end = d.shape[0] - pad_b if pad_b > 0 else d.shape[0]
        w_end = d.shape[1] - pad_r if pad_r > 0 else d.shape[1]
        d_crop = d[pad_t:h_end, pad_l:w_end]

        # Normalize over valid (non-near-zero) depths only.
        d_valid = d_crop[d_crop > 0.01]
        if len(d_valid) > 0:
            d_min, d_max = d_valid.min(), d_valid.max()
            d_norm = np.clip(
                (d_crop - d_min) / (d_max - d_min + 1e-6), 0, 1
            )
            d_norm = (1.0 - d_norm) * 255
            d_norm = d_norm.astype(np.uint8)
            depth_vis_img = cv2.applyColorMap(d_norm, cv2.COLORMAP_TURBO)
            depth_vis_img = cv2.cvtColor(depth_vis_img, cv2.COLOR_BGR2RGB)
            depth_vis_img = Image.fromarray(depth_vis_img)

    # 3D visualization (optional GLB export)
    glb_path = None
    if enable_3d_vis and depth_maps is not None and len(depth_maps) > 0:
        depth_np = depth_maps[0].cpu().numpy()

        # Undo ImageNet normalization to recover an RGB uint8 image.
        input_img = data["images"].cpu()
        mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
        std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
        input_img = (input_img * std + mean).clamp(0, 1) * 255
        input_img = (
            input_img.squeeze(0).permute(1, 2, 0).numpy().astype(np.uint8)
        )

        K_for_unproj = data["intrinsics"].cpu().numpy()

        # Keep only boxes above the score threshold, as numpy arrays.
        filtered_boxes3d_np = []
        for i in range(len(boxes3d)):
            mask = scores[i] >= score_thres
            filtered_boxes3d_np.append(boxes3d[i][mask].cpu().numpy())

        glb_path = "/tmp/wilddet3d_scene.glb"

        if use_textured_mesh:
            create_mesh_scene_glb(
                depth_np, input_img, K_for_unproj,
                filtered_boxes3d_np, glb_path,
                max_depth=20.0,
                padding=data["padding"],
                remove_edge=remove_edges,
                edge_rtol=0.04,
            )
        else:
            subsample = max(1, int(point_density))
            points, point_colors = depth_to_pointcloud(
                depth_np, input_img, K_for_unproj,
                max_depth=20.0, subsample=subsample,
                padding=data["padding"],
                remove_edge=remove_edges,
                edge_rtol=0.04,
            )
            create_scene_glb(
                points, point_colors, filtered_boxes3d_np, glb_path
            )

    return img_2d, intrinsics_info, glb_path, depth_vis_img
556
+
557
+
558
def visualize_results(
    data, boxes3d, scores, scores_2d, scores_3d, class_ids,
    class_id_mapping, score_thres,
    prompt_points=None, prompt_box=None,
):
    """Visualize 3D detection results using wilddet3d.vis.draw_3d_boxes.

    Args:
        data: Preprocessing dict; "original_images" and
            "original_intrinsics" are read here.
        boxes3d, scores, scores_2d, scores_3d, class_ids: Per-image
            detection outputs (lists of tensors).
        class_id_mapping: Dict mapping class id -> display name.
        score_thres: Detections with score below this are dropped.
        prompt_points: Optional (x, y, label) prompt points to overlay.
        prompt_box: Optional [x1, y1, x2, y2] prompt box to overlay.

    Returns:
        PIL image of the original frame with prompts and 3D boxes drawn.
    """
    # Filter every per-detection list by the score threshold; missing
    # 2D/3D score lists are replaced with zeros of matching shape.
    filtered_boxes3d = []
    filtered_scores_2d = []
    filtered_scores_3d = []
    filtered_class_ids = []

    for i in range(len(boxes3d)):
        mask = scores[i] >= score_thres
        filtered_boxes3d.append(boxes3d[i][mask])
        if scores_2d is not None:
            filtered_scores_2d.append(scores_2d[i][mask])
        else:
            filtered_scores_2d.append(torch.zeros_like(scores[i][mask]))
        if scores_3d is not None:
            filtered_scores_3d.append(scores_3d[i][mask])
        else:
            filtered_scores_3d.append(torch.zeros_like(scores[i][mask]))
        filtered_class_ids.append(class_ids[i][mask])

    # Get original image and draw prompts on it
    original_img = data["original_images"].cpu().numpy().astype(np.uint8)

    if prompt_points is not None and len(prompt_points) > 0:
        original_img = draw_points_on_image(original_img, prompt_points)

    if prompt_box is not None and len(prompt_box) == 4:
        original_img = draw_box_on_image(original_img, prompt_box)

    # Use wilddet3d's draw_3d_boxes for visualization; boxes are drawn
    # in original resolution, so use the original intrinsics.
    K = data["original_intrinsics"].cpu().numpy()
    if K.ndim == 3:
        K = K[0]

    class_names = [
        class_id_mapping.get(i, str(i))
        for i in range(max(len(class_id_mapping), 1))
    ]

    # Draw 3D boxes with 2D/3D score labels; fall back to the plain
    # image when nothing survives the threshold.
    if len(filtered_boxes3d) > 0 and len(filtered_boxes3d[0]) > 0:
        pil_img = draw_3d_boxes(
            image=original_img,
            boxes3d=filtered_boxes3d[0],
            intrinsics=K,
            scores_2d=filtered_scores_2d[0],
            scores_3d=filtered_scores_3d[0],
            class_ids=filtered_class_ids[0],
            class_names=class_names,
            n_colors=max(len(class_id_mapping), 1),
        )
    else:
        pil_img = Image.fromarray(original_img)

    return pil_img
617
+
618
+
619
# Load default values for the demo widgets
default_fx, default_fy, default_cx, default_cy = load_default_intrinsics()
default_image = load_default_image()

# Build Gradio interface
with gr.Blocks(title="WildDet3D: 3D Detection") as demo:
    gr.Markdown("# WildDet3D: Open-Vocabulary 3D Detection in the Wild")
    gr.Markdown("""
    **How to use:**
    - **Text**: Enter object names (e.g., "chair.table"), click Run
    - **Box-to-Multi-Object**: Draw box -> detect ALL similar objects (one-to-many)
    - **Box-to-Single-Object**: Draw box -> detect ONLY the boxed object (one-to-one)
    - **Point**: Click on object, click Run
    - **+ Label**: Check this to attach a category name (e.g., "chair") to box/point prompts
    """)

    # State for click coordinates and the clean original image
    click_state = gr.State({"points": [], "box": []})
    original_image_state = gr.State(
        default_image.copy() if default_image is not None else None
    )

    with gr.Row():
        # Left column: Input
        with gr.Column(scale=1):
            input_image = gr.Image(
                label="Input Image (click for Box/Point mode)",
                type="numpy",
                value=default_image,
                interactive=True,
                sources=["upload", "clipboard"],
            )

            # Prompt settings
            prompt_mode = gr.Radio(
                choices=[
                    "Text",
                    "Box-to-Multi-Object",
                    "Box-to-Single-Object",
                    "Point",
                ],
                value="Text",
                label="Prompt Mode",
            )
            text_prompt = gr.Textbox(
                label="Text Prompt (e.g. 'chair.table')",
                value="chair.table",
                placeholder="Enter object names separated by '.'",
                visible=True,
            )
            use_label = gr.Checkbox(
                label="+ Label (attach category name to box/point prompt)",
                value=False,
                visible=False,
            )
            label_text = gr.Textbox(
                label="Category Label (e.g. 'chair')",
                value="",
                placeholder="Category name for the selected object",
                visible=False,
            )

            # Point label for Point mode
            point_label = gr.Radio(
                choices=["Positive (include)", "Negative (exclude)"],
                value="Positive (include)",
                label="Point Label (for Point mode)",
                visible=False,
            )

            # Click info display
            click_info = gr.Textbox(
                label="Click Info",
                value="Select mode and click on image",
                interactive=False,
            )

            with gr.Row():
                clear_btn = gr.Button("Clear Clicks")
                run_btn = gr.Button("Run Detection", variant="primary")

            # Intrinsics settings (manual values ignored when predicted
            # intrinsics are enabled)
            use_predicted_K = gr.Checkbox(
                label="Use Predicted Intrinsics",
                value=True,
            )
            with gr.Row():
                fx = gr.Number(label="fx", value=default_fx)
                fy = gr.Number(label="fy", value=default_fy)
                cx = gr.Number(label="cx", value=default_cx)
                cy = gr.Number(label="cy", value=default_cy)

            score_thres = gr.Slider(
                minimum=0, maximum=1, value=0.3, step=0.05,
                label="Score Threshold",
            )

            # 3D visualization settings
            gr.Markdown("### 3D Visualization Settings")
            enable_3d_vis = gr.Checkbox(
                label="Enable 3D Point Cloud / Mesh Visualization",
                value=False,
            )
            gr.Markdown(
                "*Notice: the model takes the depth latent to generate "
                "3D boxes, so the boxes and the point cloud might not "
                "exactly match.*"
            )
            use_textured_mesh = gr.Checkbox(
                label="Textured Mesh (otherwise point cloud)",
                value=True,
            )
            remove_edges = gr.Checkbox(
                label="Remove depth edges (cleaner geometry)",
                value=True,
            )
            point_density = gr.Slider(
                minimum=1, maximum=8, value=2, step=1,
                label="Point Subsample (point cloud mode only, 1=dense)",
            )

        # Right column: Output
        with gr.Column(scale=1):
            output_image = gr.Image(
                label="2D Detection Results", type="pil"
            )
            depth_image = gr.Image(label="Depth Map", type="pil")
            output_3d = gr.Model3D(
                label="3D View (Mesh/Point Cloud + Boxes)",
                clear_color=(0.1, 0.1, 0.1, 1.0),
            )
            intrinsics_info = gr.Textbox(
                label="Intrinsics Info", interactive=False
            )

    # Toggle widget visibility based on the selected prompt mode
    def on_mode_change(mode):
        is_text = mode == "Text"
        is_point = mode == "Point"
        return (
            gr.update(visible=is_text),  # text_prompt
            gr.update(visible=not is_text),  # use_label
            gr.update(visible=not is_text),  # label_text
            gr.update(visible=is_point),  # point_label
        )

    prompt_mode.change(
        on_mode_change,
        inputs=[prompt_mode],
        outputs=[text_prompt, use_label, label_text, point_label],
    )

    # Connect events: clicks on the image feed the click state
    input_image.select(
        on_image_select,
        inputs=[
            input_image, original_image_state, click_state,
            prompt_mode, point_label,
        ],
        outputs=[click_state, click_info, input_image],
    )

    clear_btn.click(
        clear_clicks,
        inputs=[click_state, original_image_state],
        outputs=[click_state, click_info, input_image],
    )

    # When a new image is uploaded, save a clean copy and reset clicks
    def on_image_upload(image):
        if image is None:
            return None, {"points": [], "box": []}, "Upload an image"
        return (
            image.copy(),
            {"points": [], "box": []},
            "Image loaded - select mode and click",
        )

    input_image.upload(
        on_image_upload,
        inputs=[input_image],
        outputs=[original_image_state, click_state, click_info],
    )

    run_btn.click(
        run_wilddet3d,
        inputs=[
            input_image, click_state, prompt_mode, text_prompt,
            use_label, label_text, score_thres, use_predicted_K,
            fx, fy, cx, cy,
            enable_3d_vis, remove_edges, point_density, use_textured_mesh,
        ],
        outputs=[output_image, intrinsics_info, output_3d, depth_image],
    )
813
+
814
+
815
if __name__ == "__main__":
    print("=" * 60)
    print("WildDet3D Web Demo")
    print("=" * 60)
    print()
    print("Starting server...")
    # Honor the standard Gradio env var so hosting platforms can
    # override the port; default to 7860.
    port = int(os.environ.get("GRADIO_SERVER_PORT", 7860))
    demo.launch(share=False, server_name="0.0.0.0", server_port=port)
assets/demo/intrinsics.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5e46d677b736c45d98fda89d2b4b6b8e88028f8c7a5e25df6c9c3e61f6c6fed
3
+ size 164
assets/demo/rgb.png ADDED

Git LFS Details

  • SHA256: 377def0b77a5d11be17fdf3f48466a7dfcde7fff9fd10e1e2f68c57efb18736e
  • Pointer size: 131 Bytes
  • Size of remote file: 449 kB
requirements.txt ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Vis4D (same approach: install dependencies, not vis4d itself)
2
+ absl-py
3
+ appdirs
4
+ cloudpickle
5
+ cython
6
+ devtools
7
+ h5py
8
+ jsonargparse[signatures]
9
+ lightning
10
+ ml_collections==1.1.0
11
+ numpy>=1.21.0,<2.0.0
12
+ opencv-python
13
+ pandas
14
+ pillow
15
+ plyfile
16
+ pycocotools
17
+ pydantic>=2.0
18
+ setuptools
19
+ tensorboard
20
+ termcolor
21
+ terminaltables
22
+ timm>=0.6.0
23
+ torch>=2.0.0
24
+ torchvision>=0.15.1
25
+ tqdm
26
+ utm
27
+ wheel
28
+ scipy
29
+
30
+ # Git utils
31
+ gitdb
32
+ GitPython
33
+
34
+ # WildDet3D
35
+ einops
36
+ fvcore
37
+ nltk
38
+ transformers
39
+ fairscale
40
+ mmengine
41
+ decord
42
+
43
+ # SAM3 dependencies
44
+ ftfy
45
+ regex
46
+ iopath
47
+ omegaconf
48
+ hydra-core
49
+ scikit-image
50
+ scikit-learn
51
+ open_clip_torch
52
+
53
+ # 3D visualization
54
+ pygltflib
55
+ trimesh
56
+ utils3d
57
+
58
+ # Depth estimation
59
+ huggingface_hub
third_party/lingbot_depth/mdm/model/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import importlib
from typing import TYPE_CHECKING, Type

if TYPE_CHECKING:
    from .v2 import MDMModel as MDMModelV2


def import_model_class_by_version(version: str) -> 'Type[MDMModelV2]':
    """Dynamically import and return the MDMModel class for a model version.

    Args:
        version: Version identifier of the model submodule (only 'v2' today).

    Returns:
        The ``MDMModel`` class exported by the ``.{version}`` submodule.

    Raises:
        AssertionError: If ``version`` is not a supported version string.
        ValueError: If the versioned submodule cannot be imported.
    """
    assert version in ['v2'], f'Unsupported model version: {version}'

    try:
        # Resolve the submodule relative to this package (e.g. ``.v2``).
        module = importlib.import_module(f'.{version}', __package__)
    except ModuleNotFoundError as err:
        # Chain the original error so the missing-module cause stays visible.
        raise ValueError(f'Model version "{version}" not found.') from err
    cls = getattr(module, 'MDMModel')
    return cls
third_party/lingbot_depth/mdm/model/dinov2_rgbd/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ __version__ = "0.0.1"
third_party/lingbot_depth/mdm/model/dinov2_rgbd/hub/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
third_party/lingbot_depth/mdm/model/dinov2_rgbd/hub/backbones.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from enum import Enum
7
+ from typing import Union
8
+
9
+ import torch
10
+
11
+ from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name
12
+
13
+
14
class Weights(Enum):
    # Pretraining-dataset tag used by the released DINOv2 checkpoints.
    LVD142M = "LVD142M"


def _make_dinov2_model(
    *,
    arch_name: str = "vit_large",
    img_size: int = 518,
    patch_size: int = 14,
    init_values: float = 1.0,
    ffn_layer: str = "mlp",
    block_chunks: int = 0,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.LVD142M,
    **kwargs,
):
    """Instantiate a DINOv2 ViT backbone and optionally load released weights.

    Args:
        arch_name: Factory name looked up in ``..models.vision_transformer``
            (e.g. ``"vit_small"``, ``"vit_large"``, ``"vit_giant2"``).
        img_size: Input image size the ViT is configured for.
        patch_size: ViT patch size.
        init_values: LayerScale initial value.
        ffn_layer: FFN variant name (``"mlp"``, or ``"swiglufused"`` for giant).
        block_chunks: Block chunking factor passed to the ViT (0 disables).
        num_register_tokens: Number of register tokens (``*_reg`` variants).
        interpolate_antialias: Antialias flag for pos-embed interpolation.
        interpolate_offset: Offset used during pos-embed interpolation.
        pretrained: When True, download and load the official checkpoint.
        weights: Released-weights selector (enum member or its string name).
        **kwargs: Extra overrides forwarded to the ViT constructor.

    Raises:
        AssertionError: If ``weights`` is a string not matching a Weights member.
    """
    # Imported lazily to avoid a hard import cycle at module load time.
    from ..models import vision_transformer as vits

    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
    vit_kwargs = dict(
        img_size=img_size,
        patch_size=patch_size,
        init_values=init_values,
        ffn_layer=ffn_layer,
        block_chunks=block_chunks,
        num_register_tokens=num_register_tokens,
        interpolate_antialias=interpolate_antialias,
        interpolate_offset=interpolate_offset,
    )
    vit_kwargs.update(**kwargs)
    # Look up the architecture factory by name, e.g. ``vits.vit_large``.
    model = vits.__dict__[arch_name](**vit_kwargs)

    if pretrained:
        # The register-token count only affects the checkpoint file name,
        # not the directory (which uses the base name).
        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
        url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
        state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
        model.load_state_dict(state_dict, strict=True)

    return model
62
+
63
+
64
def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitl16(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    ViT-Large backbone intended for a 16-pixel patch size, built WITHOUT
    pretrained weights.

    NOTE(review): despite accepting ``pretrained``, this always passes
    ``pretrained=False`` (there is no released L/16 checkpoint), and it does
    not force ``patch_size=16`` itself — the caller is expected to supply
    ``img_size``/``patch_size`` via kwargs. Confirm this is intentional.
    """
    # kwargs.update({'img_size': 224, 'patch_size': 16, })
    return _make_dinov2_model(arch_name="vit_large", pretrained=False, weights=weights, **kwargs)


def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",  # giant models use the fused SwiGLU FFN
        weights=weights,
        pretrained=pretrained,
        **kwargs,
    )


def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_small",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_base",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_large",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )
third_party/lingbot_depth/mdm/model/dinov2_rgbd/hub/utils.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import itertools
7
+ import math
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ _DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
15
+
16
+
17
+ def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
18
+ compact_arch_name = arch_name.replace("_", "")[:4]
19
+ registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
20
+ return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
21
+
22
+
23
class CenterPadding(nn.Module):
    """Zero-pad the trailing (spatial) dims of a tensor up to a multiple.

    The slack for each dimension is split as evenly as possible between the
    two sides, with the extra element going to the right/bottom side.
    """

    def __init__(self, multiple):
        super().__init__()
        self.multiple = multiple

    def _get_pad(self, size):
        # Round up to the next multiple, then center the content.
        target = math.ceil(size / self.multiple) * self.multiple
        slack = target - size
        left = slack // 2
        return left, slack - left

    @torch.inference_mode()
    def forward(self, x):
        # F.pad wants pads ordered from the last dim backwards, two values
        # per dim; x.shape[:1:-1] walks the trailing dims in exactly that
        # order (W, H, ...) while skipping the leading batch dim.
        pad_spec = []
        for extent in x.shape[:1:-1]:
            pad_spec.extend(self._get_pad(extent))
        return F.pad(x, pad_spec)
third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .dino_head import DINOHead
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
12
+ from .patch_embed_mlp import PatchEmbed as PatchEmbedMLP
third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/attention.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ import logging
11
+ import os
12
+ import warnings
13
+
14
+ import torch.nn.functional as F
15
+ from torch import Tensor
16
+ from torch import nn
17
+
18
+
19
+ logger = logging.getLogger("dinov2")
20
+
21
+
22
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
23
+ try:
24
+ if XFORMERS_ENABLED:
25
+ from xformers.ops import memory_efficient_attention, unbind
26
+
27
+ XFORMERS_AVAILABLE = True
28
+ # warnings.warn("xFormers is available (Attention)")
29
+ else:
30
+ # warnings.warn("xFormers is disabled (Attention)")
31
+ raise ImportError
32
+ except ImportError:
33
+ XFORMERS_AVAILABLE = False
34
+ # warnings.warn("xFormers is not available (Attention)")
35
+
36
+
37
class Attention(nn.Module):
    """Multi-head self-attention backed by PyTorch's fused SDPA kernel."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # Kept for API compatibility; F.scaled_dot_product_attention applies
        # its own 1/sqrt(head_dim) scaling internally.
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        batch, tokens, channels = x.shape
        head_dim = channels // self.num_heads
        # Single projection to stacked q/k/v:
        # (B, N, 3C) -> (B, N, 3, H, hd) -> (3, B, H, N, hd)
        stacked = (
            self.qkv(x)
            .reshape(batch, tokens, 3, self.num_heads, head_dim)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = stacked.unbind(0)  # each (B, H, N, hd)

        # attn_bias is forwarded as SDPA's additive attention mask.
        out = F.scaled_dot_product_attention(q, k, v, attn_bias)
        out = out.permute(0, 2, 1, 3).reshape(batch, tokens, channels)

        return self.proj_drop(self.proj(out))
82
+
83
class MemEffAttention(Attention):
    """Attention variant that uses xFormers' memory-efficient kernel.

    Falls back to the parent SDPA implementation when xFormers is not
    importable (module-level ``XFORMERS_AVAILABLE`` flag). Block-diagonal
    ``attn_bias`` values (nested-tensor mode) require xFormers and raise
    otherwise.
    """

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            # NOTE: attn_bias is guaranteed None here, so dropping it is safe.
            return super().forward(x)

        B, N, C = x.shape
        # xFormers consumes the (B, N, 3, H, C // H) layout directly.
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/block.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ import logging
11
+ import os
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+ import warnings
14
+
15
+ import torch
16
+ from torch import nn, Tensor
17
+
18
+ from .attention import Attention, MemEffAttention
19
+ from .drop_path import DropPath
20
+ from .layer_scale import LayerScale
21
+ from .mlp import Mlp
22
+
23
+
24
+ logger = logging.getLogger("dinov2")
25
+
26
+
27
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
28
+ try:
29
+ if XFORMERS_ENABLED:
30
+ from xformers.ops import fmha, scaled_index_add, index_select_cat
31
+
32
+ XFORMERS_AVAILABLE = True
33
+ # warnings.warn("xFormers is available (Block)")
34
+ else:
35
+ # warnings.warn("xFormers is disabled (Block)")
36
+ raise ImportError
37
+ except ImportError:
38
+ XFORMERS_AVAILABLE = False
39
+ # warnings.warn("xFormers is not available (Block)")
40
+
41
+
42
class Block(nn.Module):
    """Pre-norm transformer block: attention + FFN residual branches, each
    with optional LayerScale and stochastic depth (drop path).

    When training with a drop-path rate above 0.1, whole samples are dropped
    from the residual computation (batch-subset trick) instead of masking,
    which amortizes the cost of the skipped work.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        # LayerScale only when init_values is truthy; otherwise a no-op.
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor) -> Tensor:
        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            # Fixed (was drop_path1 with an upstream FIXME): the FFN branch
            # gets its own DropPath module. Both are built identically, so
            # the sampling distribution is unchanged.
            x = x + self.drop_path2(ffn_residual_func(x))
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x
114
+
115
+
116
def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
) -> Tensor:
    """Batch-wise stochastic depth.

    Runs ``residual_func`` only on a random subset of the batch and
    scatter-adds the result back, upscaled so the expected output matches
    the dense computation.
    """
    batch, _, _ = x.shape
    # At least one sample always takes the residual branch.
    keep = max(int(batch * (1 - sample_drop_ratio)), 1)
    kept_idx = torch.randperm(batch, device=x.device)[:keep]

    # Residual branch is evaluated only on the sampled rows.
    branch_out = residual_func(x[kept_idx]).flatten(1)

    # Upweight the sparse residual to keep E[output] unchanged.
    scale = batch / keep
    merged = torch.index_add(
        x.flatten(1), 0, kept_idx, branch_out.to(dtype=x.dtype), alpha=scale
    )
    return merged.view_as(x)
138
+
139
+
140
def get_branges_scales(x, sample_drop_ratio=0.0):
    """Sample the batch indices to keep and the compensating residual scale.

    Returns (kept_indices, batch / kept_count); the scale keeps the expected
    residual magnitude equal to the dense computation.
    """
    batch, _, _ = x.shape
    keep = max(int(batch * (1 - sample_drop_ratio)), 1)  # never drop everything
    kept_idx = torch.randperm(batch, device=x.device)[:keep]
    return kept_idx, batch / keep
146
+
147
+
148
def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    """Scatter-add a scaled residual into the rows selected by ``brange``.

    Without ``scaling_vector`` this is a plain flattened ``index_add``;
    with it, the fused xFormers ``scaled_index_add`` applies the
    per-channel LayerScale gamma at the same time.
    """
    if scaling_vector is None:
        flat = x.flatten(1)
        return torch.index_add(
            flat,
            0,
            brange,
            residual.flatten(1).to(dtype=x.dtype),
            alpha=residual_scale_factor,
        )
    # Fused path — requires xFormers to be importable.
    return scaled_index_add(
        x,
        brange,
        residual.to(dtype=x.dtype),
        scaling=scaling_vector,
        alpha=residual_scale_factor,
    )
158
+
159
+
160
# Memoizes one BlockDiagonalMask per ((batch, seqlen), ...) signature.
# NOTE(review): grows without bound across distinct shape combinations —
# acceptable for a fixed training setup, but confirm for variable shapes.
attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache
    """
    # One batch size per tensor: the brange subset if given, else the full batch.
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache.keys():
        # Build a block-diagonal mask so sequences of different lengths can
        # be packed into one attention call without cross-sample attention.
        seqlens = []
        for b, x in zip(batch_sizes, x_list):
            for _ in range(b):
                seqlens.append(x.shape[1])
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes  # remembered for .split() later
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        # Fused xFormers select+concat over flattened tokens -> (1, total, D).
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        # Concatenate all samples along the token dim as a single pseudo-batch.
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors
185
+
186
+
187
def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> List[Tensor]:
    """Stochastic-depth residual update for a list of packed tensors.

    Samples a per-tensor subset of the batch, runs one packed attention/FFN
    call over the concatenation, and scatter-adds each (scaled) residual
    back into its source tensor.
    """
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func to get residual, and split the result
    # back into per-tensor chunks using the mask's remembered batch sizes.
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    outputs = []
    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
    return outputs
208
+
209
+
210
class NestedTensorBlock(Block):
    """Transformer block that can also process a list of tensors packed into
    a single attention call via an xFormers block-diagonal mask."""

    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        # Packed attn_bias handling only exists in the xFormers attention.
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            # LayerScale gammas are folded into the fused scatter-add.
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                # Fixed: gate on ls2 (was ls1). ls1/ls2 are always created
                # together with the same type, so behavior is unchanged,
                # but the original check read the wrong attribute.
                scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            # Pack all tensors into one pseudo-batch, run both residual
            # branches, then split back into the original list layout.
            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        if isinstance(x_or_x_list, Tensor):
            # Plain tensor: behave exactly like the parent Block.
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            if not XFORMERS_AVAILABLE:
                raise AssertionError("xFormers is required for using nested tensors")
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/dino_head.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.nn.init import trunc_normal_
9
+ from torch.nn.utils import weight_norm
10
+
11
+
12
class DINOHead(nn.Module):
    """DINO projection head: an MLP bottleneck followed by an L2-normalize
    and a weight-normalized prototype layer."""

    def __init__(
        self,
        in_dim,
        out_dim,
        use_bn=False,
        nlayers=3,
        hidden_dim=2048,
        bottleneck_dim=256,
        mlp_bias=True,
    ):
        super().__init__()
        nlayers = max(nlayers, 1)
        self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
        # Initialize the MLP before adding last_layer, which has its own init.
        self.apply(self._init_weights)
        # Weight-normalized prototypes with the norm ("weight_g") pinned to 1.
        # NOTE(review): torch.nn.utils.weight_norm is deprecated in recent
        # PyTorch in favor of nn.utils.parametrizations.weight_norm —
        # confirm the target torch version before migrating.
        self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
        self.last_layer.weight_g.data.fill_(1)

    def _init_weights(self, m):
        # Truncated-normal weights and zero biases for every Linear layer.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.mlp(x)
        # Larger eps in fp16 avoids dividing by an underflowed norm.
        eps = 1e-6 if x.dtype == torch.float16 else 1e-12
        x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
        x = self.last_layer(x)
        return x
42
+
43
+
44
+ def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
45
+ if nlayers == 1:
46
+ return nn.Linear(in_dim, bottleneck_dim, bias=bias)
47
+ else:
48
+ layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
49
+ if use_bn:
50
+ layers.append(nn.BatchNorm1d(hidden_dim))
51
+ layers.append(nn.GELU())
52
+ for _ in range(nlayers - 2):
53
+ layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
54
+ if use_bn:
55
+ layers.append(nn.BatchNorm1d(hidden_dim))
56
+ layers.append(nn.GELU())
57
+ layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
58
+ return nn.Sequential(*layers)
third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/drop_path.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
9
+
10
+
11
+ from torch import nn
12
+
13
+
14
def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    """Zero out whole samples with probability ``drop_prob`` (stochastic
    depth), rescaling survivors by 1/keep_prob to preserve the expectation.

    A no-op when ``drop_prob == 0`` or when not training.
    """
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast over all non-batch dims.
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        # Rescale so the expected activation magnitude is unchanged.
        random_tensor.div_(keep_prob)
    output = x * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: float = 0.0):
        # Default changed from None to 0.0: with None, a training-mode
        # forward crashed on ``1 - None``; 0.0 is the equivalent no-op and
        # keeps eval-mode behavior identical.
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/layer_scale.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
7
+
8
+ from typing import Union
9
+
10
+ import torch
11
+ from torch import Tensor
12
+ from torch import nn
13
+
14
+
15
class LayerScale(nn.Module):
    """Learnable per-channel scaling of a residual branch (CaiT-style)."""

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        # One learnable gain per channel, initialized to a small constant.
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        if self.inplace:
            return x.mul_(self.gamma)
        return x * self.gamma
third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/mlp.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
9
+
10
+
11
+ from typing import Callable, Optional
12
+
13
+ from torch import Tensor, nn
14
+
15
+
16
class Mlp(nn.Module):
    """Two-layer feed-forward network: fc1 -> act -> drop -> fc2 -> drop."""

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        # Hidden and output widths default to the input width.
        hidden = hidden_features or in_features
        out = out_features or in_features
        self.fc1 = nn.Linear(in_features, hidden, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden, out, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        # The same Dropout module is applied after both linear layers.
        return self.drop(self.fc2(self.drop(self.act(self.fc1(x)))))
third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/patch_embed.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ from typing import Callable, Optional, Tuple, Union
11
+
12
+ from torch import Tensor
13
+ import torch.nn as nn
14
+
15
+
16
def make_2tuple(x):
    """Normalize an int or 2-tuple into an (h, w) 2-tuple."""
    if isinstance(x, int):
        return (x, x)
    assert isinstance(x, tuple)
    assert len(x) == 2
    return x


class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        img_hw = make_2tuple(img_size)
        patch_hw = make_2tuple(patch_size)
        grid_hw = (img_hw[0] // patch_hw[0], img_hw[1] // patch_hw[1])

        self.img_size = img_hw
        self.patch_size = patch_hw
        self.patches_resolution = grid_hw
        self.num_patches = grid_hw[0] * grid_hw[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.flatten_embedding = flatten_embedding

        # Non-overlapping conv == per-patch linear projection.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_hw, stride=patch_hw)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # (B, D, H/pH, W/pW)
        grid_h, grid_w = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # (B, N, D)
        x = self.norm(x)
        if not self.flatten_embedding:
            # Restore the spatial grid layout instead of a flat token list.
            x = x.reshape(-1, grid_h, grid_w, self.embed_dim)  # (B, H', W', D)
        return x

    def flops(self) -> float:
        Ho, Wo = self.patches_resolution
        total = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        if self.norm is not None:
            # NOTE(review): self.norm is never None (Identity fallback), so
            # this norm term is always counted — confirm intent.
            total += Ho * Wo * self.embed_dim
        return total
third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/patch_embed_mlp.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ from typing import Callable, Optional, Tuple, Union
11
+
12
+ import torch
13
+ from torch import Tensor
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+
17
def make_2tuple(x):
    """Return *x* as an (a, b) pair: 2-tuples pass through, ints are duplicated."""
    if isinstance(x, int):
        return (x, x)
    assert isinstance(x, tuple) and len(x) == 2
    return x
24
+
25
class PixelUnshuffle(nn.Module):
    """Space-to-depth rearrangement: (..., C, H, W) -> (..., C*r*r, H/r, W/r).

    Behaves like ``F.pixel_unshuffle`` but additionally supports empty tensors
    by computing the output shape directly with ``view``.
    """

    def __init__(self, downscale_factor):
        super().__init__()
        self.downscale_factor = downscale_factor

    def forward(self, input):
        r = self.downscale_factor
        if input.numel() != 0:
            return F.pixel_unshuffle(input, r)
        # Empty-tensor fallback (the stock torch op does not handle this case):
        # just reshape to the expected output dimensions.
        C, H, W = input.shape[-3:]
        assert H and W and H % r == W % r == 0
        return input.view(*input.shape[:-3], C * r * r, H // r, W // r)
38
+
39
class Permute(nn.Module):
    """``Tensor.permute`` wrapped as a module, so it can live inside ``nn.Sequential``."""

    dims: tuple[int, ...]

    def __init__(self, dims: tuple[int, ...]) -> None:
        super().__init__()
        self.dims = tuple(dims)

    def __repr__(self):
        return "Permute" + repr(self.dims)

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return torch.permute(input, self.dims)
50
+
51
from itertools import repeat
import collections.abc


def _ntuple(n):
    """Build a converter that broadcasts a scalar to an n-tuple.

    Non-string iterables are returned unchanged (timm convention).
    """

    def parse(value):
        if isinstance(value, collections.abc.Iterable) and not isinstance(value, str):
            return value
        return tuple(repeat(value, n))

    return parse


# Pair-broadcast helper used by Mlp for per-layer bias/dropout settings.
to_2tuple = _ntuple(2)


class Mlp(nn.Module):
    """Two-layer feed-forward block (Linear -> act -> drop -> Linear -> drop),
    as used in Vision Transformer, MLP-Mixer and related networks.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.):
        super().__init__()
        # Hidden/output widths default to the input width; bias and drop may be
        # given per-layer as a pair or broadcast from a single value.
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        bias_pair = to_2tuple(bias)
        drop_pair = to_2tuple(drop)

        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias_pair[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_pair[0])
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias_pair[1])
        self.drop2 = nn.Dropout(drop_pair[1])

    def forward(self, x):
        x = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(x))
83
+
84
class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    MLP variant: instead of a strided conv, patches are formed by
    PixelUnshuffle (space-to-depth) and projected by a per-patch MLP.

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        # Normalize int-or-tuple arguments to (H, W) pairs.
        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        # When False, forward() returns a (B, H', W', C) grid instead of flat (B, N, C).
        self.flatten_embedding = flatten_embedding

        # Space-to-depth, then a per-patch MLP (hidden width 4*embed_dim) over
        # the flattened pixels of each patch.
        # NOTE(review): uses the raw `patch_size` here rather than `patch_HW`,
        # so only square integer patch sizes work — confirm callers never pass a tuple.
        self.proj = nn.Sequential(
            PixelUnshuffle(patch_size),
            Permute((0,2,3,1)),
            Mlp(in_chans * patch_size * patch_size, 4*embed_dim, embed_dim),
            Permute((0,3,1,2)),
        )

        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        """Embed images (B, C, H, W) into patch tokens (B, N, D), or a (B, H', W', D) grid when flatten_embedding is False."""
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # B C H W
        H, W = x.size(2), x.size(3)  # patch-grid resolution after projection
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    def flops(self) -> float:
        """Approximate FLOPs of one forward pass.

        NOTE(review): this is the conv-style formula inherited from the conv
        PatchEmbed; the actual MLP projection above (hidden = 4*embed_dim)
        performs more work, so this figure underestimates — confirm before
        relying on it.
        """
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        # NOTE(review): self.norm defaults to nn.Identity(), never None, so
        # this branch always fires even for a no-op norm.
        if self.norm is not None:
            flops += Ho * Wo * self.embed_dim
        return flops
third_party/lingbot_depth/mdm/model/dinov2_rgbd/layers/swiglu_ffn.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ from typing import Callable, Optional
8
+ import warnings
9
+
10
+ from torch import Tensor, nn
11
+ import torch.nn.functional as F
12
+
13
+
14
class SwiGLUFFN(nn.Module):
    """SwiGLU feed-forward network.

    A single fused projection produces both the gate and the value
    (2 * hidden units); the SiLU-gated product is then projected back out.
    ``act_layer`` and ``drop`` are accepted only for interface parity with
    ``Mlp`` and are ignored.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        hidden = hidden_features or in_features
        out = out_features or in_features
        self.w12 = nn.Linear(in_features, 2 * hidden, bias=bias)
        self.w3 = nn.Linear(hidden, out, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        gate, value = self.w12(x).chunk(2, dim=-1)
        return self.w3(F.silu(gate) * value)
35
+
36
+
37
# Optional xFormers acceleration: opt out by setting the XFORMERS_DISABLED
# environment variable. When xFormers is missing or disabled, fall back to
# the pure-PyTorch SwiGLUFFN above under the same `SwiGLU` name.
XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import SwiGLU

        XFORMERS_AVAILABLE = True
        # warnings.warn("xFormers is available (SwiGLU)")
    else:
        # warnings.warn("xFormers is disabled (SwiGLU)")
        raise ImportError
except ImportError:
    # Fallback path: alias the stdlib/PyTorch implementation.
    SwiGLU = SwiGLUFFN
    XFORMERS_AVAILABLE = False

    # warnings.warn("xFormers is not available (SwiGLU)")
52
+
53
+
54
class SwiGLUFFNFused(SwiGLU):
    """SwiGLU FFN with an adjusted hidden width.

    The requested hidden width is scaled by 2/3 and rounded up to a multiple
    of 8 before being passed to the base ``SwiGLU`` (xFormers' fused op when
    available, otherwise the pure-PyTorch fallback). ``act_layer`` and
    ``drop`` are accepted for interface parity and ignored.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        # Scale by 2/3 (presumably to keep parameter count comparable to a
        # plain MLP — confirm) and round up to a multiple of 8.
        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
        super().__init__(
            in_features=in_features,
            hidden_features=hidden_features,
            out_features=out_features,
            bias=bias,
        )
third_party/lingbot_depth/mdm/model/dinov2_rgbd/models/__init__.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ '''
7
+ Docstring for MDM.mdm.model.dinov2_rgbd.models_vlmae
8
+ =======================================================
9
+ This version is modified from the original DINOv2 to support the MIM(masked image modeling) of RGBD input.
10
+ (The original DINOv2 is available at https://github.com/facebookresearch/dinov2.)
11
+
12
+ Core Changes:
13
+ 1. We add the depth input into the original DINOv2 transformer encoder.
14
+
15
+ 2. We support the Variable Mask Ratio MAE for both RGB and Depth input.
16
+ '''
17
+
18
+ import logging
19
+
20
+ from . import vision_transformer as vits
21
+
22
+ logger = logging.getLogger("dinov2")
23
+
24
+
25
def build_model(args, only_teacher=False, img_size=224):
    """Instantiate ViT student/teacher models from a config namespace.

    Returns ``(teacher, embed_dim)`` when ``only_teacher`` is set, otherwise
    ``(student, teacher, embed_dim)``. The student additionally gets the
    configured stochastic-depth (drop-path) settings.
    """
    # NOTE: mutates the caller's args — strips a legacy "_memeff" arch suffix.
    args.arch = args.arch.removesuffix("_memeff")
    if "vit" in args.arch:
        vit_kwargs = dict(
            img_size=img_size,
            patch_size=args.patch_size,
            init_values=args.layerscale,
            ffn_layer=args.ffn_layer,
            block_chunks=args.block_chunks,
            qkv_bias=args.qkv_bias,
            proj_bias=args.proj_bias,
            ffn_bias=args.ffn_bias,
            num_register_tokens=args.num_register_tokens,
            interpolate_offset=args.interpolate_offset,
            interpolate_antialias=args.interpolate_antialias,
        )
        # Look up the constructor (vit_small / vit_base / ...) by arch name.
        teacher = vits.__dict__[args.arch](**vit_kwargs)
        if only_teacher:
            return teacher, teacher.embed_dim
        student = vits.__dict__[args.arch](
            **vit_kwargs,
            drop_path_rate=args.drop_path_rate,
            drop_path_uniform=args.drop_path_uniform,
        )
        embed_dim = student.embed_dim
    # NOTE(review): a non-ViT arch falls through and returns None implicitly.
    return student, teacher, embed_dim
51
+
52
+
53
def build_model_from_cfg(cfg, only_teacher=False):
    """Build model(s) from a full config: student section + global crop size as image size."""
    return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size)
55
+
third_party/lingbot_depth/mdm/model/dinov2_rgbd/models/mask_utils.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
def depth_masking(
    x,
    patch_num_h,
    patch_num_w,
    depth_values,
    depth_mask_threshold_ratio=None,
    depth_mask_threshold_num=None,
    valid_depth_range=(0.1, 10.0),
):
    """Drop patch tokens whose underlying depth pixels are mostly invalid.

    Args:
        x: [B, N, D] patch-embedded features.
        patch_num_h: height of the patch grid.
        patch_num_w: width of the patch grid.
        depth_values: [B, 1, H_img, W_img] raw depth map.
        depth_mask_threshold_ratio: float or per-sample list — minimum valid-pixel
            ratio a patch must reach to be kept.
        depth_mask_threshold_num: int or per-sample list — minimum valid-pixel
            count a patch must reach to be kept.
        valid_depth_range: (min, max) depth considered valid.

    Returns:
        visible_list: per-sample [N_visible_i, D] tensors of kept patches.
        mask_info: dict with per-sample 'visible_indices', 'mask_indices'
            and 'num_visible'.
    """
    B, N, D = x.shape
    device = x.device

    assert N == patch_num_h * patch_num_w, \
        f"N={N} must equal patch_num_h * patch_num_w = {patch_num_h * patch_num_w}"

    # [B, N] boolean map; True marks a patch to drop.
    invalid = _compute_depth_invalid_mask(
        depth_values,
        patch_num_h,
        patch_num_w,
        depth_mask_threshold_ratio,
        depth_mask_threshold_num,
        valid_depth_range,
    )

    visible_list = []
    mask_info = {
        'visible_indices': [],
        'mask_indices': [],
        'num_visible': [],
    }

    # Samples keep different numbers of patches, so results stay per-sample lists.
    for sample_invalid, sample_tokens in zip(invalid, x):
        keep_idx = torch.where(~sample_invalid)[0]
        drop_idx = torch.where(sample_invalid)[0]

        visible_list.append(sample_tokens[keep_idx])

        mask_info['visible_indices'].append(keep_idx)
        mask_info['mask_indices'].append(drop_idx)
        mask_info['num_visible'].append(len(keep_idx))

    return visible_list, mask_info
67
+
68
+ def _compute_depth_invalid_mask(
69
+ depth_values,
70
+ H_patch,
71
+ W_patch,
72
+ threshold_ratio,
73
+ threshold_num,
74
+ valid_range
75
+ ):
76
+ """
77
+ Compute depth validity for each patch
78
+
79
+ Args:
80
+ depth_values: [B, 1, H_img, W_img] raw depth map
81
+ H_patch, W_patch: patch grid dimensions
82
+ threshold_ratio: float or list, valid depth ratio threshold
83
+ threshold_num: int or list, valid depth pixel count threshold
84
+ valid_range: tuple, (min_depth, max_depth)
85
+
86
+ Returns:
87
+ invalid_mask: [B, N] bool tensor, True indicates this patch is invalid
88
+ """
89
+ B, _, H_img, W_img = depth_values.shape
90
+ N = H_patch * W_patch
91
+ device = depth_values.device
92
+
93
+ min_depth, max_depth = valid_range
94
+
95
+ # Calculate pixel size for each patch
96
+ patch_h = H_img // H_patch
97
+ patch_w = W_img // W_patch
98
+
99
+ assert H_img % H_patch == 0 and W_img % W_patch == 0, \
100
+ f"Image size ({H_img}, {W_img}) must be divisible by patch grid ({H_patch}, {W_patch})"
101
+
102
+ # Reshape depth map into patches: [B, 1, H_img, W_img] -> [B, H_patch, patch_h, W_patch, patch_w]
103
+ depth_reshaped = depth_values.view(B, 1, H_patch, patch_h, W_patch, patch_w)
104
+
105
+ # Transpose and flatten: [B, H_patch, W_patch, patch_h, patch_w] -> [B, N, patch_h*patch_w]
106
+ depth_reshaped = depth_reshaped.permute(0, 2, 4, 1, 3, 5).reshape(B, N, -1)
107
+
108
+ # Calculate valid depth
109
+ valid_depth = (depth_reshaped >= min_depth) & (depth_reshaped <= max_depth)
110
+ valid_depth_ratio = valid_depth.float().mean(dim=-1) # [B, N]
111
+ valid_depth_num = valid_depth.float().sum(dim=-1) # [B, N]
112
+
113
+ # Handle list-form thresholds (different thresholds for each sample in batch)
114
+ if isinstance(threshold_ratio, list) or isinstance(threshold_num, list):
115
+ invalid_mask = torch.zeros(B, N, dtype=torch.bool, device=device)
116
+
117
+ for i in range(B):
118
+ tr = threshold_ratio[i] if isinstance(threshold_ratio, list) else threshold_ratio
119
+ tn = threshold_num[i] if isinstance(threshold_num, list) else threshold_num
120
+
121
+ sample_mask = torch.zeros(N, dtype=torch.bool, device=device)
122
+ if tr is not None:
123
+ sample_mask |= (valid_depth_ratio[i] < tr)
124
+ if tn is not None:
125
+ sample_mask |= (valid_depth_num[i] < tn)
126
+
127
+ invalid_mask[i] = sample_mask
128
+ else:
129
+ # Uniform threshold
130
+ invalid_mask = torch.zeros(B, N, dtype=torch.bool, device=device)
131
+
132
+ if threshold_ratio is not None:
133
+ invalid_mask |= (valid_depth_ratio < threshold_ratio)
134
+ if threshold_num is not None:
135
+ invalid_mask |= (valid_depth_num < threshold_num)
136
+
137
+ return invalid_mask
third_party/lingbot_depth/mdm/model/dinov2_rgbd/models/vision_transformer.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable, Optional, List
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from ..layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+ from ..layers import PatchEmbedMLP
22
+
23
+ from .mask_utils import depth_masking
24
+
25
+ logger = logging.getLogger("dinov2_rgbd")
26
+
27
def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
    """Recursively apply ``fn(module=..., name=...)`` over a module tree.

    Children are always visited (they recurse with include_root=True); the
    root itself is visited only when ``include_root`` is set. ``depth_first``
    controls whether a node is visited before or after its children.
    Returns the (possibly mutated) root module.
    """
    if not depth_first and include_root:
        fn(module=module, name=name)
    for child_name, child in module.named_children():
        qualified = f"{name}.{child_name}" if name else child_name
        named_apply(fn=fn, module=child, name=qualified, depth_first=depth_first, include_root=True)
    if depth_first and include_root:
        fn(module=module, name=name)
    return module
36
+
37
+
38
class BlockChunk(nn.ModuleList):
    """A callable ModuleList: applies its members to the input in sequence."""

    def forward(self, x):
        out = x
        for module in self:
            out = module(out)
        return out
43
+
44
+
45
class DinoVisionTransformer(nn.Module):
    """DINOv2-style ViT extended for RGB-D input.

    RGB and depth maps are patch-embedded separately, tagged with distinct
    data-type offsets on their positional encodings, optionally depth-masked
    (patches with no valid depth are dropped), and concatenated as
    [cls | registers | image tokens | depth tokens] before the transformer
    blocks.
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        ffn_bias=True,
        proj_bias=True,
        drop_path_rate=0.0,
        drop_path_uniform=False,
        init_values=None,  # for layerscale: None or 0 => no layerscale
        embed_layer=PatchEmbed,
        act_layer=nn.GELU,
        block_fn=Block,
        ffn_layer="mlp",
        block_chunks=1,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.1,
        img_depth_fuse_mode='',
        depth_mask_ratio:Union[float, List[float]]=0.6,
        img_mask_ratio:Union[float, List[float]]=0.0,
        depth_mask_patch_grid_size: int=1,
        img_mask_patch_grid_size: int=1,
        depth_emb_mode='',
        # depth_emb_mode='conv_1c'
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            weight_init (str): weight init scheme
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
            interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
        """
        super().__init__()
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 1
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.num_register_tokens = num_register_tokens
        self.interpolate_antialias = interpolate_antialias
        self.interpolate_offset = interpolate_offset

        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        # Depth gets its own single-channel patch embedder in 'conv_1c' mode;
        # otherwise it stays None (prepare_tokens_with_masks asserts it exists).
        self.depth_emb_mode = depth_emb_mode
        if self.depth_emb_mode == 'conv_1c':
            self.depth_patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=1, embed_dim=embed_dim)
        else:
            self.depth_patch_embed = None

        self.img_depth_fuse_mode = img_depth_fuse_mode

        self.depth_mask_patch_grid_size = depth_mask_patch_grid_size
        self.img_mask_patch_grid_size = img_mask_patch_grid_size
        assert self.depth_mask_patch_grid_size == 1, "depth_mask_patch_grid_size must be 1 in current version"
        assert self.img_mask_patch_grid_size == 1, "img_mask_patch_grid_size must be 1 in current version"

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        assert num_register_tokens >= 0
        self.register_tokens = (
            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
        )

        if drop_path_uniform is True:
            dpr = [drop_path_rate] * depth
        else:
            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        if ffn_layer == "mlp":
            logger.info("using MLP layer as FFN")
            ffn_layer = Mlp
        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
            logger.info("using SwiGLU layer as FFN")
            ffn_layer = SwiGLUFFNFused
        elif ffn_layer == "identity":
            logger.info("using Identity layer as FFN")

            def f(*args, **kwargs):
                return nn.Identity()

            ffn_layer = f
        else:
            raise NotImplementedError

        blocks_list = [
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                ffn_layer=ffn_layer,
                init_values=init_values,
            )
            for i in range(depth)
        ]
        if block_chunks > 0:
            self.chunked_blocks = True
            chunked_blocks = []
            chunksize = depth // block_chunks
            for i in range(0, depth, chunksize):
                # this is to keep the block index consistent if we chunk the block list
                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
        else:
            self.chunked_blocks = False
            self.blocks = nn.ModuleList(blocks_list)

        self.norm = norm_layer(embed_dim)
        self.head = nn.Identity()

        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))

        self.init_weights()

    @property
    def onnx_compatible_mode(self):
        # Defaults to False when never set; toggling it disables the
        # scale-factor interpolation path and the same-size early returns below.
        return getattr(self, "_onnx_compatible_mode", False)

    @onnx_compatible_mode.setter
    def onnx_compatible_mode(self, value: bool):
        self._onnx_compatible_mode = value

    def init_weights(self):
        """Initialize positional/cls/register parameters, then timm-style Linear init everywhere."""
        trunc_normal_(self.pos_embed, std=0.02)
        nn.init.normal_(self.cls_token, std=1e-6)
        if self.register_tokens is not None:
            nn.init.normal_(self.register_tokens, std=1e-6)
        named_apply(init_weights_vit_timm, self)

    def interpolate_pos_encoding(self, x, h, w):
        """Resize the (cls + patch) positional embedding to an (h, w) input via bicubic interpolation."""
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1
        batch_size = x.shape[0]
        N = self.pos_embed.shape[1] - 1
        # Fast path: token count already matches a square input.
        if not self.onnx_compatible_mode and npatch == N and w == h:
            return self.pos_embed
        pos_embed = self.pos_embed.float()
        class_pos_embed = pos_embed[:, 0, :]
        patch_pos_embed = pos_embed[:, 1:, :]
        dim = x.shape[-1]
        h0, w0 = h // self.patch_size, w // self.patch_size
        M = int(math.sqrt(N))  # Recover the number of patches in each dimension
        assert N == M * M
        kwargs = {}
        if not self.onnx_compatible_mode and self.interpolate_offset > 0:
            # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
            # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
            sx = float(w0 + self.interpolate_offset) / M
            sy = float(h0 + self.interpolate_offset) / M
            kwargs["scale_factor"] = (sy, sx)
        else:
            # Simply specify an output size instead of a scale factor
            kwargs["size"] = (h0, w0)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
            mode="bicubic",
            antialias=self.interpolate_antialias,
            **kwargs,
        )

        assert (h0, w0) == patch_pos_embed.shape[-2:]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).flatten(1, 2)
        return torch.cat((class_pos_embed[:, None, :].expand(patch_pos_embed.shape[0], -1, -1), patch_pos_embed), dim=1).to(previous_dtype)

    def interpolate_pos_encoding_without_cls(self, x, h, w, input_pos_embed):
        """Same as interpolate_pos_encoding but for a patch-only embedding (no cls slot)."""
        previous_dtype = x.dtype
        npatch = x.shape[1]
        batch_size = x.shape[0]
        N = input_pos_embed.shape[1]
        if not self.onnx_compatible_mode and npatch == N and w == h:
            return input_pos_embed
        patch_pos_embed = input_pos_embed.float()
        dim = x.shape[-1]
        h0, w0 = h // self.patch_size, w // self.patch_size
        M = int(math.sqrt(N))  # Recover the number of patches in each dimension
        assert N == M * M
        kwargs = {}
        if not self.onnx_compatible_mode and self.interpolate_offset > 0:
            # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
            # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
            sx = float(w0 + self.interpolate_offset) / M
            sy = float(h0 + self.interpolate_offset) / M
            kwargs["scale_factor"] = (sy, sx)
        else:
            # Simply specify an output size instead of a scale factor
            kwargs["size"] = (h0, w0)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
            mode="bicubic",
            antialias=self.interpolate_antialias,
            **kwargs,
        )
        assert (h0, w0) == patch_pos_embed.shape[-2:]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).flatten(1, 2)
        return patch_pos_embed.to(previous_dtype)

    def prepare_tokens_with_masks(self, x_img, x_depth, x_img_mask=None, x_depth_mask=None, masks=None, **kwargs):
        """Embed RGB + depth into a per-sample list of [1, T_i, D] token sequences.

        Zero depth pixels are replaced with -10 so that depth_masking (valid
        range starting at -9.5) treats them as invalid and drops their patches.
        Token order per sample: cls, registers (if any), image tokens, depth tokens.
        """
        assert masks is None, "extra masks are not supported for this model."
        B, nc, h_img, w_img = x_img.shape
        _, _, h_depth, w_depth = x_depth.shape
        x_depth_raw = x_depth.clone()
        x_depth_raw[x_depth_raw == 0] = -10

        depth_patch_num_h, depth_patch_num_w = h_depth // self.patch_size, w_depth // self.patch_size

        # patchify, embed image tokens and depth tokens
        x_img = self.patch_embed(x_img) # batch, length_img, dim
        assert self.depth_patch_embed is not None
        x_depth = self.depth_patch_embed(x_depth) # batch, length_depth, dim
        assert depth_patch_num_h * depth_patch_num_w == x_depth.shape[1]

        # get full pose enc of img and depth
        # 1-> img data type enc
        # 2-> depth data type enc
        img_pose_enc = 1 + self.interpolate_pos_encoding_without_cls(x_img, h_img, w_img, self.pos_embed[:, 1:]).repeat(B, 1, 1)
        depth_pose_enc = 2 + self.interpolate_pos_encoding_without_cls(x_depth, h_depth, w_depth, self.pos_embed[:, 1:]).repeat(B, 1, 1)

        # add pose enc to img and depth
        x_img = x_img + img_pose_enc
        x_depth = x_depth + depth_pose_enc

        ## mask depth tokens
        if kwargs.get('enable_depth_mask', True):
            # Drop depth patches with fewer than 1 valid pixel (i.e. fully invalid).
            x_depth_masked, depth_mask_info = depth_masking(
                x_depth,
                depth_patch_num_h,
                depth_patch_num_w,
                depth_values=x_depth_raw,
                depth_mask_threshold_num=[1]*B,
                valid_depth_range=(-9.5, 200.0)
            )
        else:
            x_depth_masked = x_depth
            depth_mask_info = None

        ## mask image tokens
        # NOTE(review): image masking is currently a no-op here.
        x_img_masked = x_img
        img_mask_info = None

        # get cls token
        x_cls = self.cls_token.squeeze(0) + self.pos_embed.squeeze(0)[:1] # 1, dim

        # cat cls, img and depth tokens
        assert self.img_depth_fuse_mode == 'cat_token', "Only cat_token mode is supported for this model."
        x_masked_list = []
        for i in range(B):
            if self.register_tokens is not None:
                x_mased = torch.cat([x_cls, self.register_tokens.squeeze(0), x_img_masked[i], x_depth_masked[i]], dim=0) # 1 + num_register_tokens + length_img + length_depth, dim
            else:
                x_mased = torch.cat([x_cls, x_img_masked[i], x_depth_masked[i]], dim=0) # 1 + length_img + length_depth, dim
            x_mased = x_mased.unsqueeze(0) # 1, 1 + num_register_tokens + length_img + length_depth, dim
            x_masked_list.append(x_mased)

        return x_masked_list

    def _get_intermediate_layers_not_chunked(self, x_img, x_depth, x_img_mask=None, x_depth_mask=None, n=1, return_mae_aux=False, **kwargs):
        """Run the (unchunked) blocks, collecting outputs of the n last blocks (or the listed indices)."""
        x = self.prepare_tokens_with_masks(x_img, x_depth, x_img_mask, x_depth_mask, **kwargs)

        # Without depth masking every sample has the same length, so the
        # per-sample list can be batched into a single tensor.
        if not kwargs.get('enable_depth_mask', True):
            x = torch.cat(x, dim=0)

        # If n is an int, take the n last blocks. If it's a list, take them
        output, total_block_len = [], len(self.blocks)
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if i in blocks_to_take:
                output.append(x)
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"

        if not kwargs.get('enable_depth_mask', True):
            # Re-split the batched tensor back into per-sample chunks.
            output = [list(torch.split(out, 1, dim=0)) for out in output]
        return output

    def _get_intermediate_layers_chunked(self, x_img, x_depth, x_img_mask=None, x_depth_mask=None, n=1, return_mae_aux=False, **kwargs):
        """Chunked-blocks variant of _get_intermediate_layers_not_chunked (for FSDP-wrapped block chunks)."""
        x = self.prepare_tokens_with_masks(x_img, x_depth, x_img_mask, x_depth_mask, **kwargs)
        output, i, total_block_len = [], 0, len(self.blocks[-1])
        # If n is an int, take the n last blocks. If it's a list, take them
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for block_chunk in self.blocks:
            for blk in block_chunk[i:]:  # Passing the nn.Identity()
                x = blk(x)
                if i in blocks_to_take:
                    output.append(x)
                i += 1
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"

        return output

    def extract_features(self, outputs, norm=True):
        """Split raw block outputs into (patch features, cls tokens), optionally layer-normed.

        Handles both batched tensors and per-sample lists of tensors.
        """
        feat_outputs = []
        class_tokens = []
        # Patch features start after the cls token and any register tokens.
        feat_start_idx = 1 + self.num_register_tokens

        def process_output(out):
            normed = self.norm(out) if norm else out
            return normed[:, feat_start_idx:], normed[:, 0]

        for output in outputs:
            if isinstance(output, list):
                feats, tokens = zip(*[process_output(out) for out in output])
                feat_outputs.append(list(feats))
                class_tokens.append(list(tokens))
            else:
                feat, token = process_output(output)
                feat_outputs.append(feat)
                class_tokens.append(token)

        return feat_outputs, class_tokens

    def get_intermediate_layers_mae(
        self,
        x_img: torch.Tensor,
        x_depth: torch.Tensor,
        x_img_mask: torch.Tensor=None,
        x_depth_mask: torch.Tensor=None,
        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
        reshape: bool = False,
        return_class_token: bool = False,
        norm=True,
        return_mae_aux=True,
        **kwargs
    ):
        """Public entry point: intermediate features (and optionally cls tokens) for RGB-D input."""
        assert reshape is False, "reshape is not supported for now"
        if self.chunked_blocks:
            outputs = self._get_intermediate_layers_chunked(x_img, x_depth, x_img_mask, x_depth_mask, n, return_mae_aux=return_mae_aux,**kwargs)
        else:
            outputs = self._get_intermediate_layers_not_chunked(x_img, x_depth, x_img_mask, x_depth_mask, n, return_mae_aux=return_mae_aux,**kwargs)

        feat_outputs, class_tokens = self.extract_features(outputs, norm)

        if return_class_token:
            return tuple(zip(feat_outputs, class_tokens))
        return tuple(feat_outputs)
414
+
415
+
416
def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility).

    Linear layers get trunc-normal weights (std 0.02) and zeroed biases;
    every other module type is left untouched.
    """
    if not isinstance(module, nn.Linear):
        return
    trunc_normal_(module.weight, std=0.02)
    if module.bias is not None:
        nn.init.zeros_(module.bias)
422
+
423
def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
    """ViT-S/16 (embed 384, 12 blocks, 6 heads) with memory-efficient attention."""
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model
435
+
436
+
437
def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
    """ViT-B/16 (embed 768, 12 blocks, 12 heads) with memory-efficient attention."""
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model
449
+
450
+
451
def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
    """ViT-L/16 (embed 1024, 24 blocks, 16 heads) with memory-efficient attention."""
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model
463
+
464
+
465
def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
    """
    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
    """
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1536,
        depth=40,
        num_heads=24,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model
third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/cluster.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from enum import Enum
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Optional
10
+
11
+
12
class ClusterType(Enum):
    """Known compute environments."""

    AWS = "aws"
    FAIR = "fair"
    RSC = "rsc"


def _guess_cluster_type() -> ClusterType:
    """Infer the current cluster from kernel release / hostname; FAIR is the fallback."""
    info = os.uname()
    if info.sysname == "Linux":
        # AWS kernel releases look like "5.4.0-1051-aws"; RSC hostnames start with "rsc".
        if info.release.endswith("-aws"):
            return ClusterType.AWS
        if info.nodename.startswith("rsc"):
            return ClusterType.RSC
    return ClusterType.FAIR


def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]:
    """Return the given cluster type, guessing it when None is passed."""
    return _guess_cluster_type() if cluster_type is None else cluster_type


def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Root checkpoint directory for the (possibly guessed) cluster."""
    resolved = get_cluster_type(cluster_type)
    if resolved is None:
        return None

    dirnames = {
        ClusterType.AWS: "checkpoints",
        ClusterType.FAIR: "checkpoint",
        ClusterType.RSC: "checkpoint/dino",
    }
    return Path("/") / dirnames[resolved]


def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Per-user checkpoint directory; requires the USER environment variable."""
    root = get_checkpoint_path(cluster_type)
    if root is None:
        return None

    user = os.environ.get("USER")
    assert user is not None
    return root / user


def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]:
    """Default SLURM partition name for the cluster."""
    resolved = get_cluster_type(cluster_type)
    if resolved is None:
        return None

    partitions = {
        ClusterType.AWS: "learnlab",
        ClusterType.FAIR: "learnlab",
        ClusterType.RSC: "learn",
    }
    return partitions[resolved]


def get_slurm_executor_parameters(
    nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs
) -> Dict[str, Any]:
    """Build submitit/SLURM executor parameters with per-cluster tweaks.

    Explicit keyword overrides in **kwargs always win (applied last).
    """
    params = {
        "mem_gb": 0,  # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html
        "gpus_per_node": num_gpus_per_node,
        "tasks_per_node": num_gpus_per_node,  # one task per GPU
        "cpus_per_task": 10,
        "nodes": nodes,
        "slurm_partition": get_slurm_partition(cluster_type),
    }
    # Cluster-specific adjustments.
    resolved = get_cluster_type(cluster_type)
    if resolved == ClusterType.AWS:
        params["cpus_per_task"] = 12
        del params["mem_gb"]  # AWS submission does not use the mem_gb request
    elif resolved == ClusterType.RSC:
        params["cpus_per_task"] = 12
    # Caller-supplied overrides are applied last.
    params.update(kwargs)
    return params
third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/config.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import logging
8
+ import os
9
+
10
+ from omegaconf import OmegaConf
11
+
12
+ import dinov2.distributed as distributed
13
+ from dinov2.logging import setup_logging
14
+ from dinov2.utils import utils
15
+ from dinov2.configs import dinov2_default_config
16
+
17
+
18
+ logger = logging.getLogger("dinov2")
19
+
20
+
21
def apply_scaling_rules_to_cfg(cfg):  # TODO: generalize beyond the single sqrt rule
    """Scale cfg.optim.lr from cfg.optim.base_lr by the global batch size.

    Only the "sqrt_wrt_1024" rule is implemented:
    lr = base_lr * sqrt(batch_size_per_gpu * world_size / 1024).
    Mutates and returns *cfg*; raises NotImplementedError for other rules.
    """
    if cfg.optim.scaling_rule == "sqrt_wrt_1024":
        base_lr = cfg.optim.base_lr
        cfg.optim.lr = base_lr
        cfg.optim.lr *= math.sqrt(cfg.train.batch_size_per_gpu * distributed.get_global_size() / 1024.0)
        logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}")
    else:
        raise NotImplementedError
    return cfg


def write_config(cfg, output_dir, name="config.yaml"):
    """Log the config and save it as YAML under *output_dir*; return the saved path."""
    logger.info(OmegaConf.to_yaml(cfg))
    saved_cfg_path = os.path.join(output_dir, name)
    with open(saved_cfg_path, "w") as f:
        OmegaConf.save(config=cfg, f=f)
    return saved_cfg_path


def get_cfg_from_args(args):
    """Merge the default config, the config file, and CLI overrides (in that order)."""
    args.output_dir = os.path.abspath(args.output_dir)
    # Propagate the resolved output dir into the config via a CLI-style override.
    args.opts += [f"train.output_dir={args.output_dir}"]
    default_cfg = OmegaConf.create(dinov2_default_config)
    cfg = OmegaConf.load(args.config_file)
    cfg = OmegaConf.merge(default_cfg, cfg, OmegaConf.from_cli(args.opts))
    return cfg


def default_setup(args):
    """Enable distributed mode, configure logging, and seed RNGs per rank."""
    distributed.enable(overwrite=True)
    seed = getattr(args, "seed", 0)
    rank = distributed.get_global_rank()

    # Re-bind the module-level logger after file logging has been configured.
    global logger
    setup_logging(output=args.output_dir, level=logging.INFO)
    logger = logging.getLogger("dinov2")

    utils.fix_random_seeds(seed + rank)  # distinct seed per rank
    logger.info("git:\n  {}\n".format(utils.get_sha()))
    logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))


def setup(args):
    """
    Create configs and perform basic setups.
    """
    cfg = get_cfg_from_args(args)
    os.makedirs(args.output_dir, exist_ok=True)
    default_setup(args)
    apply_scaling_rules_to_cfg(cfg)
    write_config(cfg, args.output_dir)
    return cfg
third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/dtype.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ from typing import Dict, Union
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+
13
# Anything convertible to a torch dtype: a dtype name, a numpy dtype, or a torch dtype.
TypeSpec = Union[str, np.dtype, torch.dtype]


# Mapping from numpy dtypes to their torch equivalents.
_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
    np.dtype("bool"): torch.bool,
    np.dtype("uint8"): torch.uint8,
    np.dtype("int8"): torch.int8,
    np.dtype("int16"): torch.int16,
    np.dtype("int32"): torch.int32,
    np.dtype("int64"): torch.int64,
    np.dtype("float16"): torch.float16,
    np.dtype("float32"): torch.float32,
    np.dtype("float64"): torch.float64,
    np.dtype("complex64"): torch.complex64,
    np.dtype("complex128"): torch.complex128,
}


def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
    """Convert a dtype name, numpy dtype, or torch dtype to a torch dtype.

    Torch dtypes pass through unchanged; strings are first parsed as numpy
    dtypes. Raises KeyError for numpy dtypes with no torch equivalent.
    """
    if isinstance(dtype, torch.dtype):
        return dtype
    if isinstance(dtype, str):
        dtype = np.dtype(dtype)
    # Fixed typo in the original message ("nunpy" -> "numpy").
    assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}"
    return _NUMPY_TO_TORCH_DTYPE[dtype]
third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/param_groups.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import defaultdict
7
+ import logging
8
+
9
+
10
+ logger = logging.getLogger("dinov2")
11
+
12
+
13
def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False):
    """
    Calculate lr decay rate for different ViT blocks.
    Args:
        name (string): parameter name.
        lr_decay_rate (float): base lr decay rate.
        num_layers (int): number of ViT blocks.
    Returns:
        lr decay rate for the given parameter: lr_decay_rate ** (num_layers + 1 - layer_id),
        where layer_id is 0 for embeddings/tokens, block_index + 1 for block
        parameters, and num_layers + 1 (no decay) otherwise.
    """
    dotted_embed_keys = (".pos_embed", ".patch_embed", ".mask_token", ".cls_token", ".register_tokens")
    bare_embed_keys = ("pos_embed", "patch_embed", "mask_token", "cls_token", "register_tokens")

    layer_id = num_layers + 1  # default: no decay
    if name.startswith("backbone") or force_is_backbone:
        if any(key in name for key in dotted_embed_keys):
            layer_id = 0
        elif force_is_backbone and any(key in name for key in bare_embed_keys):
            layer_id = 0
        elif ".blocks." in name and ".residual." not in name:
            layer_id = int(name[name.find(".blocks."):].split(".")[2]) + 1
        elif chunked_blocks and "blocks." in name and "residual." not in name:
            layer_id = int(name[name.find("blocks."):].split(".")[2]) + 1
        elif "blocks." in name and "residual." not in name:
            layer_id = int(name[name.find("blocks."):].split(".")[1]) + 1

    return lr_decay_rate ** (num_layers + 1 - layer_id)
49
+
50
+
51
def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0):
    """Build one optimizer param group per trainable parameter with layerwise LR decay.

    Each group carries: `params` (the tensor), `lr_multiplier` (from
    `get_vit_lr_decay_rate`, further scaled by *patch_embed_lr_mult* for
    patch-embed weights), `wd_multiplier` (0 for biases / norm / gamma
    parameters), `is_last_layer`, and `name`.
    """
    # Discover the number of transformer blocks (and whether they are FSDP-chunked).
    chunked_blocks = False
    if hasattr(model, "n_blocks"):
        logger.info("chunked fsdp")
        n_blocks = model.n_blocks
        chunked_blocks = model.chunked_blocks
    elif hasattr(model, "blocks"):
        logger.info("first code branch")
        n_blocks = len(model.blocks)
    elif hasattr(model, "backbone"):
        logger.info("second code branch")
        n_blocks = len(model.backbone.blocks)
    else:
        logger.info("else code branch")
        n_blocks = 0
    all_param_groups = []

    for name, param in model.named_parameters():
        name = name.replace("_fsdp_wrapped_module.", "")  # strip FSDP wrapper prefix
        if not param.requires_grad:
            continue
        decay_rate = get_vit_lr_decay_rate(
            name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks
        )
        d = {"params": param, "is_last_layer": False, "lr_multiplier": decay_rate, "wd_multiplier": 1.0, "name": name}

        if "last_layer" in name:
            d.update({"is_last_layer": True})

        # No weight decay for biases and normalization / gamma parameters.
        if name.endswith(".bias") or "norm" in name or "gamma" in name:
            d.update({"wd_multiplier": 0.0})

        if "patch_embed" in name:
            d.update({"lr_multiplier": d["lr_multiplier"] * patch_embed_lr_mult})

        all_param_groups.append(d)
        logger.info(f"""{name}: lr_multiplier: {d["lr_multiplier"]}, wd_multiplier: {d["wd_multiplier"]}""")

    return all_param_groups
90
+
91
+
92
def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")):
    """Merge per-parameter groups that share the same settings for *keys*.

    Returns a view of dicts, each holding the shared key values plus a
    `params` list with every parameter that matched that combination.
    """
    fused_params_groups = defaultdict(lambda: {"params": []})
    for group in all_params_groups:
        # Identifier string uniquely encodes the key/value combination.
        identifier = "".join(f"{key}{group[key]}_" for key in keys)
        bucket = fused_params_groups[identifier]
        for key in keys:
            bucket[key] = group[key]
        bucket["params"].append(group["params"])

    return fused_params_groups.values()
third_party/lingbot_depth/mdm/model/dinov2_rgbd/utils/utils.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import os
8
+ import random
9
+ import subprocess
10
+ from urllib.parse import urlparse
11
+
12
+ import numpy as np
13
+ import torch
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
def load_pretrained_weights(model, pretrained_weights, checkpoint_key):
    """Load a checkpoint (local path or URL) into *model*, non-strictly.

    Args:
        model: module to load weights into.
        pretrained_weights: filesystem path or URL of the checkpoint.
        checkpoint_key: optional sub-dict key (e.g. "teacher") extracted first
            when present in the loaded state dict.
    """
    if urlparse(pretrained_weights).scheme:  # If it looks like an URL
        state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
    else:
        state_dict = torch.load(pretrained_weights, map_location="cpu")
    if checkpoint_key is not None and checkpoint_key in state_dict:
        logger.info(f"Take key {checkpoint_key} in provided checkpoint dict")
        state_dict = state_dict[checkpoint_key]
    # remove `module.` prefix (left by DataParallel/DDP wrapping)
    state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
    # remove `backbone.` prefix induced by multicrop wrapper
    state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
    # strict=False tolerates missing/unexpected keys; the mismatch report is logged
    msg = model.load_state_dict(state_dict, strict=False)
    logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg))
34
+
35
+
36
def fix_random_seeds(seed=31):
    """
    Fix random seeds for torch (CPU and all CUDA devices), numpy, and the stdlib RNG.
    """
    for seeder in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
        seeder(seed)
44
+
45
+
46
def get_sha():
    """Return a one-line description of the git state of this source tree.

    Falls back to "N/A"/"clean" placeholders when git (or a checkout) is
    unavailable; never raises.
    """
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _git(*argv):
        # Run a git command in this file's directory and return stripped output.
        return subprocess.check_output(list(argv), cwd=cwd).decode("ascii").strip()

    sha, diff, branch = "N/A", "clean", "N/A"
    try:
        sha = _git("git", "rev-parse", "HEAD")
        subprocess.check_output(["git", "diff"], cwd=cwd)
        diff = "has uncommitted changes" if _git("git", "diff-index", "HEAD") else "clean"
        branch = _git("git", "rev-parse", "--abbrev-ref", "HEAD")
    except Exception:
        pass  # not a git checkout, or git missing: keep placeholders
    return f"sha: {sha}, status: {diff}, branch: {branch}"
65
+
66
+
67
class CosineScheduler(object):
    """Cosine decay schedule with optional warmup and an initial frozen phase.

    The schedule is: `freeze_iters` zeros, then a linear warmup from
    `start_warmup_value` to `base_value` over `warmup_iters` steps, then a
    cosine decay down to `final_value`. Indexing at or past `total_iters`
    returns `final_value`.
    """

    def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0):
        super().__init__()
        self.final_value = final_value
        self.total_iters = total_iters

        frozen_part = np.zeros((freeze_iters))
        warmup_part = np.linspace(start_warmup_value, base_value, warmup_iters)
        steps = np.arange(total_iters - warmup_iters - freeze_iters)
        cosine_part = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * steps / len(steps)))

        self.schedule = np.concatenate((frozen_part, warmup_part, cosine_part))
        assert len(self.schedule) == self.total_iters

    def __getitem__(self, it):
        # Clamp to the final value once the schedule is exhausted.
        return self.final_value if it >= self.total_iters else self.schedule[it]
88
+
89
+
90
def has_batchnorms(model):
    """Return True if *model* contains any (sync) batch-norm submodule."""
    bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)
    return any(isinstance(module, bn_types) for module in model.modules())
third_party/lingbot_depth/mdm/model/modules_decoder.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+ from numbers import Number
3
+ import importlib
4
+ import itertools
5
+ import functools
6
+ import sys
7
+
8
+ import torch
9
+ from torch import Tensor
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from .utils import wrap_module_with_gradient_checkpointing
14
+
15
+
16
class ResidualConvBlock(nn.Module):
    """Two-convolution residual block with configurable normalization/activation.

    Main path: norm -> act -> conv -> norm -> act -> conv (pre-activation
    ordering). The skip path is a 1x1 conv when in/out channel counts differ,
    identity otherwise. 'layer_norm' is implemented as a single-group GroupNorm.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int = None,
        hidden_channels: int = None,
        kernel_size: int = 3,
        padding_mode: str = 'replicate',
        activation: Literal['relu', 'leaky_relu', 'silu', 'elu'] = 'relu',
        in_norm: Literal['group_norm', 'layer_norm', 'instance_norm', 'none'] = 'layer_norm',
        hidden_norm: Literal['group_norm', 'layer_norm', 'instance_norm'] = 'group_norm',
    ):
        super(ResidualConvBlock, self).__init__()
        out_channels = in_channels if out_channels is None else out_channels
        hidden_channels = in_channels if hidden_channels is None else hidden_channels

        activations = {
            'relu': nn.ReLU,
            'leaky_relu': functools.partial(nn.LeakyReLU, negative_slope=0.2),
            'silu': nn.SiLU,
            'elu': nn.ELU,
        }
        if activation not in activations:
            raise ValueError(f'Unsupported activation function: {activation}')
        activation_cls = activations[activation]

        def make_norm(kind, channels):
            # group_norm uses channels // 32 groups; layer_norm is GroupNorm(1, C).
            if kind == 'group_norm':
                return nn.GroupNorm(channels // 32, channels)
            if kind == 'layer_norm':
                return nn.GroupNorm(1, channels)
            if kind == 'instance_norm':
                return nn.InstanceNorm2d(channels)
            return nn.Identity()

        conv_kwargs = dict(kernel_size=kernel_size, padding=kernel_size // 2, padding_mode=padding_mode)
        self.layers = nn.Sequential(
            make_norm(in_norm, in_channels),
            activation_cls(),
            nn.Conv2d(in_channels, hidden_channels, **conv_kwargs),
            make_norm(hidden_norm, hidden_channels),
            activation_cls(),
            nn.Conv2d(hidden_channels, out_channels, **conv_kwargs),
        )

        if in_channels != out_channels:
            self.skip_connection = nn.Conv2d(in_channels, out_channels, kernel_size=1, padding=0)
        else:
            self.skip_connection = nn.Identity()

    def forward(self, x):
        """Return layers(x) + skip(x)."""
        return self.layers(x) + self.skip_connection(x)
67
+
68
+
69
class Resampler(nn.Sequential):
    """Convolutional spatial up/down-sampler.

    Upsamplers: 'pixel_shuffle', 'nearest', 'bilinear', 'conv_transpose'.
    Downsamplers: 'pixel_unshuffle', 'avg_pool', 'max_pool'.
    Each variant pairs the resampling op with 3x3 replicate-padded convs.
    """

    def __init__(self,
        in_channels: int,
        out_channels: int,
        type_: Literal['pixel_shuffle', 'nearest', 'bilinear', 'conv_transpose', 'pixel_unshuffle', 'avg_pool', 'max_pool'],
        scale_factor: int = 2,
    ):
        conv3x3 = functools.partial(nn.Conv2d, kernel_size=3, stride=1, padding=1, padding_mode='replicate')
        if type_ == 'pixel_shuffle':
            nn.Sequential.__init__(self,
                conv3x3(in_channels, out_channels * (scale_factor ** 2)),
                nn.PixelShuffle(scale_factor),
                conv3x3(out_channels, out_channels),
            )
            # Copy the first shuffle group's weights into the others so every
            # sub-pixel position starts identical (nearest-like initialization).
            step = scale_factor ** 2
            for i in range(1, step):
                self[0].weight.data[i::step] = self[0].weight.data[0::step]
                self[0].bias.data[i::step] = self[0].bias.data[0::step]
        elif type_ in ['nearest', 'bilinear']:
            nn.Sequential.__init__(self,
                nn.Upsample(scale_factor=scale_factor, mode=type_, align_corners=False if type_ == 'bilinear' else None),
                conv3x3(in_channels, out_channels),
            )
        elif type_ == 'conv_transpose':
            nn.Sequential.__init__(self,
                nn.ConvTranspose2d(in_channels, out_channels, kernel_size=scale_factor, stride=scale_factor),
                conv3x3(out_channels, out_channels),
            )
            # Broadcast the top-left tap over the whole kernel (nearest-like init).
            self[0].weight.data[:] = self[0].weight.data[:, :, :1, :1]
        elif type_ == 'pixel_unshuffle':
            nn.Sequential.__init__(self,
                nn.PixelUnshuffle(scale_factor),
                conv3x3(in_channels * (scale_factor ** 2), out_channels),
            )
        elif type_ == 'avg_pool':
            nn.Sequential.__init__(self,
                conv3x3(in_channels, out_channels),
                nn.AvgPool2d(kernel_size=scale_factor, stride=scale_factor),
            )
        elif type_ == 'max_pool':
            nn.Sequential.__init__(self,
                conv3x3(in_channels, out_channels),
                nn.MaxPool2d(kernel_size=scale_factor, stride=scale_factor),
            )
        else:
            raise ValueError(f'Unsupported resampler type: {type_}')
113
+
114
+
115
class MLP(nn.Sequential):
    """Plain multi-layer perceptron.

    `dims` lists layer widths; each hidden transition is Linear + inplace
    ReLU, and the final transition is a bare Linear.
    """

    def __init__(self, dims: Sequence[int]):
        layers = []
        for width_in, width_out in zip(dims[:-2], dims[1:-1]):
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.ReLU(inplace=True))
        layers.append(nn.Linear(dims[-2], dims[-1]))
        nn.Sequential.__init__(self, *layers)
124
+
125
+
126
class ConvStack(nn.Module):
    """Multi-scale convolutional stack.

    At each scale i the stack applies: an optional 1x1 input projection
    (Identity when dim_in[i] is None), a chain of residual blocks, an
    optional 1x1 output projection, and — except at the last scale — a
    resampler to the next scale. Scale 0's projected input seeds the running
    state; later scales' projected inputs are added to it when not None.
    `forward` returns the list of per-scale output features.
    """

    def __init__(self,
        dim_in: List[Optional[int]],
        dim_res_blocks: List[int],
        dim_out: List[Optional[int]],
        resamplers: Union[Literal['pixel_shuffle', 'nearest', 'bilinear', 'conv_transpose', 'pixel_unshuffle', 'avg_pool', 'max_pool'], List],
        dim_times_res_block_hidden: int = 1,
        num_res_blocks: int = 1,  # int (same count per scale) or per-scale list
        res_block_in_norm: Literal['layer_norm', 'group_norm' , 'instance_norm', 'none'] = 'layer_norm',
        res_block_hidden_norm: Literal['layer_norm', 'group_norm' , 'instance_norm', 'none'] = 'group_norm',
        activation: Literal['relu', 'leaky_relu', 'silu', 'elu'] = 'relu',
    ):
        super().__init__()
        # Per-scale 1x1 input projections; Identity where no input is expected.
        self.input_blocks = nn.ModuleList([
            nn.Conv2d(dim_in_, dim_res_block_, kernel_size=1, stride=1, padding=0) if dim_in_ is not None else nn.Identity()
            for dim_in_, dim_res_block_ in zip(dim_in if isinstance(dim_in, Sequence) else itertools.repeat(dim_in), dim_res_blocks)
        ])
        # One resampler between each pair of consecutive scales (always scale factor 2).
        self.resamplers = nn.ModuleList([
            Resampler(dim_prev, dim_succ, scale_factor=2, type_=resampler)
            for i, (dim_prev, dim_succ, resampler) in enumerate(zip(
                dim_res_blocks[:-1],
                dim_res_blocks[1:],
                resamplers if isinstance(resamplers, Sequence) else itertools.repeat(resamplers)
            ))
        ])
        self.res_blocks = nn.ModuleList([
            nn.Sequential(
                *(
                    ResidualConvBlock(
                        dim_res_block_, dim_res_block_, dim_times_res_block_hidden * dim_res_block_,
                        activation=activation, in_norm=res_block_in_norm, hidden_norm=res_block_hidden_norm
                    ) for _ in range(num_res_blocks[i] if isinstance(num_res_blocks, list) else num_res_blocks)
                )
            ) for i, dim_res_block_ in enumerate(dim_res_blocks)
        ])
        # Per-scale 1x1 output projections; Identity where raw features are wanted.
        self.output_blocks = nn.ModuleList([
            nn.Conv2d(dim_res_block_, dim_out_, kernel_size=1, stride=1, padding=0) if dim_out_ is not None else nn.Identity()
            for dim_out_, dim_res_block_ in zip(dim_out if isinstance(dim_out, Sequence) else itertools.repeat(dim_out), dim_res_blocks)
        ])

    def enable_gradient_checkpointing(self):
        # Wrap resamplers and each residual block so activations are recomputed
        # in the backward pass instead of stored.
        for i in range(len(self.resamplers)):
            self.resamplers[i] = wrap_module_with_gradient_checkpointing(self.resamplers[i])
        for i in range(len(self.res_blocks)):
            for j in range(len(self.res_blocks[i])):
                self.res_blocks[i][j] = wrap_module_with_gradient_checkpointing(self.res_blocks[i][j])

    def forward(self, in_features: List[torch.Tensor]):
        """Run the stack; in_features[i] may be None for scales past the first.

        NOTE(review): scale 0's feature is used unconditionally (must not be
        None); later scales are skip-added only when their feature is not None.
        """
        out_features = []
        for i in range(len(self.res_blocks)):
            feature = self.input_blocks[i](in_features[i])
            if i == 0:
                x = feature
            elif feature is not None:
                x = x + feature
            x = self.res_blocks[i](x)
            out_features.append(self.output_blocks[i](x))
            if i < len(self.res_blocks) - 1:
                x = self.resamplers[i](x)
        return out_features
third_party/lingbot_depth/mdm/model/modules_rgbd_encoder.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+ from numbers import Number
3
+ import importlib
4
+ import itertools
5
+ import functools
6
+ import sys
7
+
8
+ import torch
9
+ from torch import Tensor
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ from .dinov2_rgbd.models.vision_transformer import DinoVisionTransformer
14
+ from .utils import wrap_dinov2_attention_with_sdpa, wrap_module_with_gradient_checkpointing
15
+
16
+
17
class DINOv2_RGBD_Encoder(nn.Module):
    """DINOv2 backbone variant that jointly encodes an RGB image and a depth map.

    Several intermediate transformer layers are projected to `dim_out`
    channels with 1x1 convolutions and summed into a single feature map of
    shape (B, dim_out, token_rows, token_cols).
    """

    backbone: DinoVisionTransformer
    image_mean: torch.Tensor  # ImageNet normalization mean, registered buffer (1, 3, 1, 1)
    image_std: torch.Tensor   # ImageNet normalization std, registered buffer (1, 3, 1, 1)
    dim_features: int         # backbone embedding width (inferred from the qkv projection)

    # NOTE(review): `ignore_layers` has a mutable default ([]); it is only read,
    # never mutated, so it is harmless here, but None/tuple would be safer.
    def __init__(self, backbone: str, intermediate_layers: Union[int, List[int]], dim_out: int, ignore_layers: Union[str, List[str]]=[], in_chans: int=3, strict: bool=True, img_depth_fuse_mode='', depth_emb_mode='', depth_mask_ratio=0.6, img_mask_ratio=0.0, **deprecated_kwargs):
        super(DINOv2_RGBD_Encoder, self).__init__()

        self.intermediate_layers = intermediate_layers
        self.strict = strict
        self.ignore_layers = ignore_layers
        self.img_mask_ratio = img_mask_ratio
        # Load the backbone: `backbone` names a builder in dinov2_rgbd.hub.backbones.
        self.hub_loader = getattr(importlib.import_module(".dinov2_rgbd.hub.backbones", __package__), backbone)
        self.backbone_name = backbone
        self.backbone = self.hub_loader(pretrained=False,
                                        in_chans=in_chans,
                                        img_depth_fuse_mode=img_depth_fuse_mode,
                                        depth_emb_mode=depth_emb_mode,
                                        depth_mask_ratio=depth_mask_ratio,
                                        img_mask_ratio=img_mask_ratio)

        # Embedding width read off the first block's qkv input dimension.
        self.dim_features = self.backbone.blocks[0].attn.qkv.in_features
        self.num_features = intermediate_layers if isinstance(intermediate_layers, int) else len(intermediate_layers)

        if img_mask_ratio > 0:
            # Learnable token substituted for masked image patches (MAE-style).
            self.mask_token_mae = nn.Parameter(torch.zeros(1, 1, self.dim_features))
            torch.nn.init.normal_(self.mask_token_mae, std=.02)

        # One 1x1 projection per tapped intermediate layer.
        self.output_projections = nn.ModuleList([
            nn.Conv2d(in_channels=self.dim_features, out_channels=dim_out, kernel_size=1, stride=1, padding=0,)
            for _ in range(self.num_features)
        ])

        self.register_buffer("image_mean", torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1))
        self.register_buffer("image_std", torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1))

    @property
    def onnx_compatible_mode(self):
        # Defaults to False until explicitly set.
        return getattr(self, "_onnx_compatible_mode", False)

    @onnx_compatible_mode.setter
    def onnx_compatible_mode(self, value: bool):
        # Propagate the flag to the backbone as well.
        self._onnx_compatible_mode = value
        self.backbone.onnx_compatible_mode = value

    def init_weights(self):
        """Load pretrained backbone weights, skipping keys matching ignore_layers."""
        pretrained_backbone_state_dict = self.hub_loader(pretrained=True).state_dict()
        ignore_layers = []
        if isinstance(self.ignore_layers, str):
            ignore_layers = [self.ignore_layers]
        else:
            ignore_layers = self.ignore_layers

        if len(ignore_layers) == 0:
            self.backbone.load_state_dict(pretrained_backbone_state_dict, strict=self.strict)
        else:
            # Keep only keys that contain none of the ignore substrings.
            state_dict = {}
            for k, v in pretrained_backbone_state_dict.items():
                is_ignore = False
                for ig_k in ignore_layers:
                    if ig_k in k:
                        is_ignore = True
                        break
                if not is_ignore:
                    state_dict[k] = v
            self.backbone.load_state_dict(state_dict, strict=self.strict)

    def enable_gradient_checkpointing(self):
        """Wrap every backbone block with activation checkpointing."""
        for i in range(len(self.backbone.blocks)):
            wrap_module_with_gradient_checkpointing(self.backbone.blocks[i])

    def enable_pytorch_native_sdpa(self):
        """Swap each block's attention forward for torch-native SDPA."""
        for i in range(len(self.backbone.blocks)):
            wrap_dinov2_attention_with_sdpa(self.backbone.blocks[i].attn)

    def forward(self,
                image: torch.Tensor,
                depth: torch.Tensor,
                token_rows: Union[int, torch.LongTensor],
                token_cols: Union[int, torch.LongTensor],
                return_class_token: bool = False,
                remap_depth_in: str='linear',
                **kwargs):
        """Encode an RGB image plus aligned depth into a fused feature map.

        Args:
            image: (B, 3, H, W) RGB tensor; normalized with ImageNet statistics
                internally (assumes values in [0, 1] — TODO confirm with caller).
            depth: aligned depth map; non-finite and <= 0.01 values are zeroed.
            token_rows, token_cols: token grid size; inputs are resized to
                14 pixels per token on each side.
            return_class_token: also return the last tapped layer's class token.
            remap_depth_in: 'linear' (identity) or 'log' (log-depth, invalid -> 0).

        Returns:
            (x, cls_token, None, None) when return_class_token is True, else
            (x, None, None), where x is (B, dim_out, token_rows, token_cols).
            NOTE(review): the two branches return tuples of different arity.
        """
        image_14 = F.interpolate(image, (token_rows * 14, token_cols * 14), mode="bilinear", align_corners=False, antialias=not self.onnx_compatible_mode)
        image_14 = (image_14 - self.image_mean) / self.image_std

        depth_14 = F.interpolate(depth, (token_rows * 14, token_cols * 14), mode="nearest")

        # set invalid depth value to zero
        depth_14[torch.isinf(depth_14)] = 0.0
        depth_14[torch.isnan(depth_14)] = 0.0
        dmask_14 = (depth_14 > 0.01).detach()
        depth_14 = depth_14 * dmask_14.float()

        if remap_depth_in == 'linear':
            pass # do nothing
        elif remap_depth_in == 'log':
            depth_14 = torch.log(depth_14)
            depth_14[~dmask_14] = 0.0
            depth_14 = torch.nan_to_num(depth_14, nan=0.0, posinf=0.0, neginf=0.0)
        else:
            raise NotImplementedError

        # Get intermediate layers from the backbone
        features = self.backbone.get_intermediate_layers_mae(
            x_img=image_14,
            x_depth=depth_14,
            n=self.intermediate_layers,
            return_class_token=True,
            **kwargs)

        assert self.img_mask_ratio == 0, "img_mask_ratio is not supported in this encoder"

        # When the backbone returns per-crop lists, concatenate them along batch.
        if isinstance(features[0][0], list):
            num_valid_tokens = token_rows * token_cols
            features = tuple(
                (
                    torch.cat([feat[:, :num_valid_tokens].contiguous() for feat in feats], dim=0),
                    torch.cat(cls_tokens, dim=0)
                )
                for feats, cls_tokens in features
            )

        # Project features to the desired dimensionality
        x = torch.stack([
            proj(feat.permute(0, 2, 1)[:, :, :token_rows*token_cols].unflatten(2, (token_rows, token_cols)).contiguous())
            for proj, (feat, clstoken) in zip(self.output_projections, features)
        ], dim=1).sum(dim=1)
        cls_token = features[-1][1]

        if return_class_token:
            return x, cls_token, None, None
        else:
            return x, None, None
third_party/lingbot_depth/mdm/model/utils.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+
7
def wrap_module_with_gradient_checkpointing(module: nn.Module):
    """Patch *module* in place so its forward runs under activation checkpointing.

    The instance's class is swapped for a dynamically created subclass whose
    forward wraps the original forward in `torch.utils.checkpoint.checkpoint`
    (non-reentrant). The original class is stashed in `_restore_cls` so that
    `unwrap_module_with_gradient_checkpointing` can revert the patch.
    Returns the same (mutated) module.
    """
    from torch.utils.checkpoint import checkpoint
    class _CheckpointingWrapper(module.__class__):
        _restore_cls = module.__class__
        def forward(self, *args, **kwargs):
            return checkpoint(super().forward, *args, use_reentrant=False, **kwargs)

    module.__class__ = _CheckpointingWrapper
    return module
16
+
17
+
18
def unwrap_module_with_gradient_checkpointing(module: nn.Module):
    """Undo `wrap_module_with_gradient_checkpointing` by restoring the saved class."""
    module.__class__ = module.__class__._restore_cls
20
+
21
+
22
def wrap_dinov2_attention_with_sdpa(module: nn.Module):
    """Swap a DINOv2 attention module's forward for torch-native SDPA.

    Patches *module* in place (class swap) and returns it. The module must
    provide `qkv`, `proj`, `proj_drop` and `num_heads`.
    """
    assert torch.__version__ >= '2.0', "SDPA requires PyTorch 2.0 or later"
    class _AttentionWrapper(module.__class__):
        def forward(self, x: torch.Tensor, attn_bias=None) -> torch.Tensor:
            batch, tokens, channels = x.shape
            head_dim = channels // self.num_heads
            # (B, N, 3C) -> (B, N, 3, H, D) -> (3, B, H, N, D)
            qkv = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
            query, key, value = qkv[0], qkv[1], qkv[2]  # each (B, H, N, D)

            attended = F.scaled_dot_product_attention(query, key, value, attn_bias)
            attended = attended.permute(0, 2, 1, 3).reshape(batch, tokens, channels)

            return self.proj_drop(self.proj(attended))
    module.__class__ = _AttentionWrapper
    return module
39
+
40
def wrap_dinov3_attention_with_sdpa(module: nn.Module):
    """Swap a DINOv3 attention module's forward for torch-native SDPA.

    Same patch as `wrap_dinov2_attention_with_sdpa` (the implementations are
    currently identical); kept as a separate entry point for the v3 backbone.
    Patches *module* in place (class swap) and returns it.
    """
    assert torch.__version__ >= '2.0', "SDPA requires PyTorch 2.0 or later"
    class _AttentionWrapper(module.__class__):
        def forward(self, x: torch.Tensor, attn_bias=None) -> torch.Tensor:
            n_batch, n_tokens, n_channels = x.shape
            projected = self.qkv(x).reshape(n_batch, n_tokens, 3, self.num_heads, n_channels // self.num_heads)
            # (B, N, 3, H, D) -> (3, B, H, N, D), then split into q/k/v
            q, k, v = projected.permute(2, 0, 3, 1, 4).unbind(0)

            out = F.scaled_dot_product_attention(q, k, v, attn_bias)
            out = out.permute(0, 2, 1, 3).reshape(n_batch, n_tokens, n_channels)

            return self.proj_drop(self.proj(out))
    module.__class__ = _AttentionWrapper
    return module
57
+
58
def sync_ddp_hook(state, bucket: torch.distributed.GradBucket) -> torch.futures.Future[torch.Tensor]:
    """DDP communication hook: synchronously all-reduce averaged gradients.

    Divides the bucket's gradient by the world size, sums it across all ranks
    in the WORLD group (in place), and returns an already-completed future
    holding the averaged gradient, as required by the DDP comm-hook API.
    """
    group_to_use = torch.distributed.group.WORLD
    world_size = group_to_use.size()
    grad = bucket.buffer()
    # Pre-divide so the all-reduce sum yields the mean gradient.
    grad.div_(world_size)
    torch.distributed.all_reduce(grad, group=group_to_use)
    fut = torch.futures.Future()
    fut.set_result(grad)
    return fut
67
+
68
def depth_to_pointcloud(depth, intrinsic_normalized, depth_scale=1.0):
    """
    Convert depth map to point cloud (pure Tensor version, no point filtering)

    Args:
        depth: torch.Tensor, shape (H, W) or (B, H, W), depth map
        intrinsic_normalized: torch.Tensor, shape (3, 3) or (B, 3, 3), normalized intrinsic matrix
            Normalized intrinsics: fx' = fx/W, fy' = fy/H, cx' = cx/W, cy' = cy/H
        depth_scale: float, divisor applied to depth values; default 1.0
            (pass e.g. 1000.0 for millimeter-encoded depth).
            Fixed: the docstring previously claimed a default of 1000.0,
            contradicting the signature's actual default of 1.0.

    Returns:
        points: torch.Tensor, shape (H, W, 3) or (B, H, W, 3), point cloud coordinates (x, y, z)
    """
    # Handle batch dimension
    if depth.dim() == 2:
        depth = depth.unsqueeze(0)  # (1, H, W)
        intrinsic_normalized = intrinsic_normalized.unsqueeze(0)  # (1, 3, 3)
        squeeze_output = True
    else:
        squeeze_output = False

    B, H, W = depth.shape
    device = depth.device

    # Denormalize intrinsics
    fx = intrinsic_normalized[:, 0, 0] * W  # (B,)
    fy = intrinsic_normalized[:, 1, 1] * H
    cx = intrinsic_normalized[:, 0, 2] * W
    cy = intrinsic_normalized[:, 1, 2] * H

    # Create pixel coordinate grid (H, W)
    v, u = torch.meshgrid(
        torch.arange(H, device=device, dtype=torch.float32),
        torch.arange(W, device=device, dtype=torch.float32),
        indexing='ij'
    )

    # Expand to batch dimension (B, H, W)
    u = u.unsqueeze(0).expand(B, -1, -1)
    v = v.unsqueeze(0).expand(B, -1, -1)

    # Backproject to 3D space
    z = depth / depth_scale  # (B, H, W)

    # Expand intrinsic dimensions for broadcasting (B, 1, 1)
    fx = fx.view(B, 1, 1)
    fy = fy.view(B, 1, 1)
    cx = cx.view(B, 1, 1)
    cy = cy.view(B, 1, 1)

    # Standard pinhole back-projection: X = (u - cx) * Z / fx, Y = (v - cy) * Z / fy
    x = (u - cx) * z / fx  # (B, H, W)
    y = (v - cy) * z / fy  # (B, H, W)

    # Stack coordinates (B, H, W, 3)
    points = torch.stack([x, y, z], dim=-1)

    if squeeze_output:
        points = points.squeeze(0)  # (H, W, 3)

    return points
third_party/lingbot_depth/mdm/model/v2.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+ from numbers import Number
3
+ from functools import partial
4
+ from pathlib import Path
5
+ import warnings
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import torch.utils
11
+ import torch.utils.checkpoint
12
+ import torch.amp
13
+ import torch.version
14
+ from huggingface_hub import hf_hub_download
15
+
16
+ from .modules_rgbd_encoder import DINOv2_RGBD_Encoder
17
+ from .modules_decoder import MLP, ConvStack
18
+ from ..utils.geo import depth_to_pointcloud, normalized_view_plane_uv
19
+
20
+
21
class MDMModel(nn.Module):
    """Monocular depth model: DINOv2 RGB-D encoder + convolutional neck and task heads.

    Consumes an RGB image together with an input depth map and predicts a
    refined depth map ('depth_reg'), plus optionally a validity mask
    (and normals, if a normal head were present).
    """
    # Class-level attribute annotations. NOTE(review): `points_head`,
    # `scale_head` and `onnx_compatible_mode` are declared but never assigned
    # in this version; forward() guards every optional head with hasattr().
    encoder: DINOv2_RGBD_Encoder
    neck: ConvStack
    points_head: ConvStack
    mask_head: ConvStack
    scale_head: MLP
    onnx_compatible_mode: bool

    def __init__(self,
        encoder: Dict[str, Any],
        neck: Dict[str, Any],
        depth_head: Dict[str, Any] = None,
        mask_head: Dict[str, Any] = None,
        normal_head: Dict[str, Any] = None,
        scale_head: Dict[str, Any] = None,
        remap_output: Literal['linear', 'sinh', 'exp', 'sinh_exp'] = 'linear',
        remap_depth_in: Literal['linear', 'log'] = 'log',
        remap_depth_out: Literal['linear', 'exp'] = 'exp',
        num_tokens_range: List[int] = [1200, 3600],  # NOTE(review): mutable default — safe only while never mutated
        **deprecated_kwargs
    ):
        """Build sub-modules from their keyword-argument config dicts.

        NOTE(review): `normal_head` and `scale_head` configs are accepted but
        no module is constructed for them here, so forward() never produces
        'normal' output or a metric scale in this version — confirm intended.
        """
        super(MDMModel, self).__init__()
        if deprecated_kwargs:
            warnings.warn(f"The following deprecated/invalid arguments are ignored: {deprecated_kwargs}")

        self.remap_output = remap_output          # stored but not read in this file
        self.num_tokens_range = num_tokens_range  # [min, max] token budget used by infer()
        self.remap_depth_in = remap_depth_in      # remapping applied to the input depth before encoding
        self.remap_depth_out = remap_depth_out    # mapping from head output back to depth values

        self.encoder = DINOv2_RGBD_Encoder(**encoder)

        self.neck = ConvStack(**neck)
        if depth_head is not None:
            self.depth_head = ConvStack(**depth_head)
        if mask_head is not None:
            self.mask_head = ConvStack(**mask_head)

    @property
    def device(self) -> torch.device:
        """Device of the model parameters (first parameter's device)."""
        return next(self.parameters()).device

    @property
    def dtype(self) -> torch.dtype:
        """Dtype of the model parameters (first parameter's dtype)."""
        return next(self.parameters()).dtype

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, Path, IO[bytes]],
        model_kwargs: Optional[Dict[str, Any]] = None,
        **hf_kwargs) -> 'MDMModel':
        """Load a model from a local checkpoint path or a Hugging Face Hub repo id.

        The checkpoint is a torch file with 'model_config' (constructor kwargs)
        and 'model' (state dict). `model_kwargs` entries override the stored config.
        """
        if Path(pretrained_model_name_or_path).exists():
            checkpoint_path = pretrained_model_name_or_path
        else:
            checkpoint_path = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                repo_type="model",
                filename="model.pt",
                **hf_kwargs
            )
        checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=True)

        model_config = checkpoint['model_config']
        if model_kwargs is not None:
            model_config.update(model_kwargs)
        model = cls(**model_config)
        # strict=False: tolerate heads present in the checkpoint but absent here (and vice versa).
        model.load_state_dict(checkpoint['model'], strict=False)

        return model

    def init_weights(self):
        """Initialize encoder weights (delegates to the encoder)."""
        self.encoder.init_weights()

    def enable_pytorch_native_sdpa(self):
        """Switch the encoder's attention to PyTorch-native scaled_dot_product_attention."""
        self.encoder.enable_pytorch_native_sdpa()

    def forward(self,
        image: torch.Tensor,
        num_tokens: Union[int, torch.LongTensor],
        depth: Union[None, torch.Tensor]=None,
        **kwargs) -> Dict[str, torch.Tensor]:
        """Run the full model.

        Args:
            image: (B, 3, H, W) input image.
            num_tokens: target number of encoder tokens; converted to a
                (base_h, base_w) token grid preserving the image aspect ratio.
            depth: (B, H, W) or (B, 1, H, W) input depth — required in this version.

        Returns:
            Dict with 'depth_reg' (B, H, W) and, when the heads exist,
            'normal' (B, H, W, 3, unit vectors) and 'mask' (B, H, W, probabilities).
        """
        batch_size, _, img_h, img_w = image.shape
        device, dtype = image.device, image.dtype

        assert depth is not None  # in this version, depth is required
        if depth.dim() == 3:
            depth = depth.unsqueeze(1)  # from (B, H, W) to (B, 1, H, W)

        # Derive a token grid whose product ≈ num_tokens and whose h:w ratio matches the image.
        aspect_ratio = img_w / img_h
        base_h, base_w = (num_tokens / aspect_ratio) ** 0.5, (num_tokens * aspect_ratio) ** 0.5
        if isinstance(base_h, torch.Tensor):
            base_h, base_w = base_h.round().long(), base_w.round().long()
        else:
            base_h, base_w = round(base_h), round(base_w)

        # Backbones encoding
        features, cls_token, _, _ = self.encoder(image, depth, base_h, base_w, return_class_token=True, remap_depth_in=self.remap_depth_in, **kwargs)

        # Broadcast-add the class token to every spatial location, then arrange
        # a 5-level pyramid (only level 0 carries encoder features here).
        features = features + cls_token[..., None, None]
        features = [features, None, None, None, None]

        # Concat UVs for aspect ratio input
        for level in range(5):
            uv = normalized_view_plane_uv(width=base_w * 2 ** level, height=base_h * 2 ** level, aspect_ratio=aspect_ratio, dtype=dtype, device=device)
            uv = uv.permute(2, 0, 1).unsqueeze(0).expand(batch_size, -1, -1, -1)
            if features[level] is None:
                features[level] = uv
            else:
                features[level] = torch.concat([features[level], uv], dim=1)

        # Shared neck
        features = self.neck(features)

        # Heads decoding: each head is optional; missing heads yield None.
        depth_reg, normal, mask = (getattr(self, head)(features)[-1] if hasattr(self, head) else None for head in ['depth_head', 'normal_head', 'mask_head'])
        metric_scale = self.scale_head(cls_token) if hasattr(self, 'scale_head') else None

        # Resize all dense outputs back to the input resolution.
        depth_reg, normal, mask = (F.interpolate(v, (img_h, img_w), mode='bilinear', align_corners=False, antialias=False) if v is not None else None for v in [depth_reg, normal, mask])

        # Remap output
        if depth_reg is not None:
            if self.remap_depth_out == 'exp':
                depth_reg = depth_reg.exp().squeeze(1)  # head predicts log-depth
            elif self.remap_depth_out == 'linear':
                depth_reg = depth_reg.squeeze(1)
            else:
                raise ValueError(f"Invalid remap_depth_out: {self.remap_depth_out}")
        if normal is not None:
            normal = normal.permute(0, 2, 3, 1)
            normal = F.normalize(normal, dim=-1)
        if mask is not None:
            mask_prob = mask.squeeze(1).sigmoid()
        else:
            mask_prob = None
        if metric_scale is not None:
            metric_scale = metric_scale.squeeze(1).exp()

        return_dict = {
            'depth_reg': depth_reg,
            'normal': normal,
            'mask': mask_prob,
        }
        return_dict = {k: v for k, v in return_dict.items() if v is not None}

        return return_dict

    @torch.inference_mode()
    def infer(
        self,
        image: torch.Tensor,
        depth_in: torch.Tensor = None,
        num_tokens: int = None,
        resolution_level: int = 9,
        apply_mask: bool = True,
        use_fp16: bool = True,
        intrinsics: Optional[torch.Tensor] = None,
        **kwargs
    ) -> Dict[str, torch.Tensor]:
        """Inference wrapper: handles batching, autocast and post-processing.

        Args:
            image: (3, H, W) or (B, 3, H, W) image tensor.
            depth_in: (H, W) or batched input depth.
                NOTE(review): device/dtype conversion is applied only on the
                2-D path; already-batched depth is forwarded as-is — confirm intended.
            num_tokens: explicit token budget; if None, interpolated from
                `resolution_level` (0..9) within self.num_tokens_range.
            apply_mask: replace masked-out pixels with +inf in depth/points.
            use_fp16: run the forward pass under bfloat16 autocast.
            intrinsics: optional normalized (3, 3) intrinsics; when given,
                a camera-space point cloud is also returned.

        Returns:
            Dict with 'depth', 'mask' and (if intrinsics given) 'points';
            the batch dim is squeezed when the input was unbatched.
        """
        if image.dim() == 3:
            omit_batch_dim = True
            image = image.unsqueeze(0)
        else:
            omit_batch_dim = False
        image = image.to(dtype=self.dtype, device=self.device)

        if (depth_in is not None) and (depth_in.dim() == 2):
            depth_in = depth_in.unsqueeze(0).to(dtype=self.dtype, device=self.device)

        original_height, original_width = image.shape[-2:]
        area = original_height * original_width
        aspect_ratio = original_width / original_height

        # Determine the number of base tokens to use
        if num_tokens is None:
            min_tokens, max_tokens = self.num_tokens_range
            num_tokens = int(min_tokens + (resolution_level / 9) * (max_tokens - min_tokens))

        # Forward pass
        with torch.autocast(device_type=self.device.type, dtype=torch.bfloat16, enabled=use_fp16 and self.dtype != torch.bfloat16):
            output = self.forward(image, num_tokens=num_tokens, depth=depth_in, **kwargs)
        depth_reg, mask = (output.get(k, None) for k in ['depth_reg', 'mask'])

        # Always process the output in fp32 precision
        depth_reg, mask = map(lambda x: x.float() if isinstance(x, torch.Tensor) else x, [depth_reg, mask])
        with torch.autocast(device_type=self.device.type, dtype=torch.float32):
            if mask is not None:
                mask_binary = mask > 0.5
            else:
                mask_binary = None

            depth = depth_reg
            if intrinsics is not None:
                points = depth_to_pointcloud(depth, intrinsics)
            else:
                points = None

            # Apply mask: invalid pixels become +inf (not dropped).
            if apply_mask and mask_binary is not None:
                points = torch.where(mask_binary[..., None], points, torch.inf) if points is not None else None
                depth = torch.where(mask_binary, depth, torch.inf) if depth is not None else None

        return_dict = {
            'points': points,
            'depth': depth,
            'mask': mask_binary,
        }
        return_dict = {k: v for k, v in return_dict.items() if v is not None}

        if omit_batch_dim:
            return_dict = {k: v.squeeze(0) for k, v in return_dict.items()}

        return return_dict

    def forward_feat(self,
        image: torch.Tensor,
        num_tokens: Union[int, torch.LongTensor],
        depth: Union[None, torch.Tensor]=None,
        **kwargs) -> Dict[str, torch.Tensor]:
        """Encoder-only forward: returns raw backbone features and the class token
        (same token-grid computation as forward(), no neck/heads)."""
        batch_size, _, img_h, img_w = image.shape
        device, dtype = image.device, image.dtype

        assert depth is not None  # in this version, depth is required
        if depth.dim() == 3:
            depth = depth.unsqueeze(1)  # from (B, H, W) to (B, 1, H, W)

        aspect_ratio = img_w / img_h
        base_h, base_w = (num_tokens / aspect_ratio) ** 0.5, (num_tokens * aspect_ratio) ** 0.5
        if isinstance(base_h, torch.Tensor):
            base_h, base_w = base_h.round().long(), base_w.round().long()
        else:
            base_h, base_w = round(base_h), round(base_w)

        # Backbones encoding
        features, cls_token, _, _ = self.encoder(image, depth, base_h, base_w, return_class_token=True, remap_depth_in=self.remap_depth_in, **kwargs)

        return features, cls_token


    @torch.inference_mode()
    def infer_feat(
        self,
        image: torch.Tensor,
        depth_in: torch.Tensor = None,
        num_tokens: int = None,
        resolution_level: int = 9,
        apply_mask: bool = True,
        use_fp16: bool = True,
        intrinsics: Optional[torch.Tensor] = None,
        **kwargs
    ):
        """Inference wrapper around forward_feat(); mirrors infer()'s batching,
        token-budget and autocast handling but returns (features, cls_token).
        `apply_mask`/`intrinsics` are accepted for signature parity but unused here.
        """
        if image.dim() == 3:
            omit_batch_dim = True
            image = image.unsqueeze(0)
        else:
            omit_batch_dim = False
        image = image.to(dtype=self.dtype, device=self.device)

        if (depth_in is not None) and (depth_in.dim() == 2):
            depth_in = depth_in.unsqueeze(0).to(dtype=self.dtype, device=self.device)

        original_height, original_width = image.shape[-2:]
        area = original_height * original_width
        aspect_ratio = original_width / original_height

        # Determine the number of base tokens to use
        if num_tokens is None:
            min_tokens, max_tokens = self.num_tokens_range
            num_tokens = int(min_tokens + (resolution_level / 9) * (max_tokens - min_tokens))

        # Forward pass
        with torch.autocast(device_type=self.device.type, dtype=torch.bfloat16, enabled=use_fp16 and self.dtype != torch.bfloat16):
            features, cls_token = self.forward_feat(image, num_tokens=num_tokens, depth=depth_in, **kwargs)

        return features, cls_token
third_party/lingbot_depth/mdm/utils/__init__.py ADDED
File without changes
third_party/lingbot_depth/mdm/utils/geo.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def normalized_view_plane_uv(width: int, height: int, aspect_ratio: float = None, dtype: torch.dtype = None, device: torch.device = None) -> torch.Tensor:
    "UV with left-top corner as (-width / diagonal, -height / diagonal) and right-bottom corner as (width / diagonal, height / diagonal)"
    if aspect_ratio is None:
        aspect_ratio = width / height

    # Half-extent of the view plane along each axis, normalized by the diagonal.
    diagonal = (1 + aspect_ratio ** 2) ** 0.5
    span_x = aspect_ratio / diagonal
    span_y = 1 / diagonal

    # Sample pixel centers: endpoints are pulled in by half a pixel pitch.
    u_extent = span_x * (width - 1) / width
    v_extent = span_y * (height - 1) / height
    u = torch.linspace(-u_extent, u_extent, width, dtype=dtype, device=device)
    v = torch.linspace(-v_extent, v_extent, height, dtype=dtype, device=device)

    grid_u, grid_v = torch.meshgrid(u, v, indexing='xy')
    return torch.stack([grid_u, grid_v], dim=-1)
16
+
17
def depth_to_pointcloud(depth, intrinsic_normalized, depth_scale=1.0):
    """
    Back-project a depth map into a camera-space point cloud (no filtering).

    Args:
        depth: torch.Tensor of shape (H, W) or (B, H, W).
        intrinsic_normalized: torch.Tensor of shape (3, 3) or (B, 3, 3) holding
            resolution-normalized intrinsics (fx/W, fy/H, cx/W, cy/H).
        depth_scale: divisor applied to raw depth values (default 1.0).

    Returns:
        torch.Tensor of shape (H, W, 3) or (B, H, W, 3) with (x, y, z) coordinates.
    """
    batched = depth.dim() != 2
    if not batched:
        depth = depth[None]                                  # (1, H, W)
        intrinsic_normalized = intrinsic_normalized[None]    # (1, 3, 3)

    B, H, W = depth.shape
    dev = depth.device

    # Recover pixel-unit intrinsics, shaped (B, 1, 1) for broadcasting.
    fx = (intrinsic_normalized[:, 0, 0] * W).view(B, 1, 1)
    fy = (intrinsic_normalized[:, 1, 1] * H).view(B, 1, 1)
    cx = (intrinsic_normalized[:, 0, 2] * W).view(B, 1, 1)
    cy = (intrinsic_normalized[:, 1, 2] * H).view(B, 1, 1)

    # Pixel grid: rows index v (height), columns index u (width).
    rows, cols = torch.meshgrid(
        torch.arange(H, device=dev, dtype=torch.float32),
        torch.arange(W, device=dev, dtype=torch.float32),
        indexing='ij',
    )
    cols = cols[None].expand(B, -1, -1)
    rows = rows[None].expand(B, -1, -1)

    # Pinhole back-projection.
    z = depth / depth_scale
    x = (cols - cx) * z / fx
    y = (rows - cy) * z / fy

    pts = torch.stack([x, y, z], dim=-1)  # (B, H, W, 3)
    return pts if batched else pts.squeeze(0)
77
+
78
+
79
# Usage example (runs only when this module is executed directly)
if __name__ == "__main__":
    # Single image: (H, W) depth with a normalized 3x3 intrinsic matrix
    # (focal lengths divided by W/H, principal point divided by W/H).
    depth = torch.rand(480, 640) * 5000  # Depth values
    intrinsic_norm = torch.tensor([
        [525.0/640, 0, 319.5/640],
        [0, 525.0/480, 239.5/480],
        [0, 0, 1]
    ])

    points = depth_to_pointcloud(depth, intrinsic_norm)
    print(f"Point cloud shape: {points.shape}")  # (480, 640, 3)

    # Batch processing: share the same intrinsics across the batch.
    depth_batch = torch.rand(4, 480, 640) * 5000
    intrinsic_batch = intrinsic_norm.unsqueeze(0).expand(4, -1, -1)

    points_batch = depth_to_pointcloud(depth_batch, intrinsic_batch)
    print(f"Batch point cloud shape: {points_batch.shape}")  # (4, 480, 640, 3)

    # Flatten to (N, 3) format if needed
    points_flat = points.reshape(-1, 3)
    print(f"Flattened shape: {points_flat.shape}")  # (480*640, 3)

    # Batch flatten to (B, N, 3)
    points_batch_flat = points_batch.reshape(4, -1, 3)
    print(f"Batch flattened shape: {points_batch_flat.shape}")  # (4, 480*640, 3)
third_party/lingbot_depth/mdm/utils/io.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ['OPENCV_IO_ENABLE_OPENEXR'] = '1'
3
+ from typing import IO
4
+ import zipfile
5
+ import json
6
+ import io
7
+ from typing import *
8
+ from pathlib import Path
9
+ import re
10
+ from PIL import Image, PngImagePlugin
11
+
12
+ import numpy as np
13
+ import cv2
14
+
15
+ from .tools import timeit
16
+
17
+
18
def save_glb(
    save_path: Union[str, os.PathLike],
    vertices: np.ndarray,
    faces: np.ndarray,
    vertex_uvs: np.ndarray,
    texture: np.ndarray,
    vertex_normals: Optional[np.ndarray] = None,
):
    """Export a UV-textured triangle mesh to a GLB file via trimesh."""
    import trimesh
    import trimesh.visual
    from PIL import Image

    material = trimesh.visual.material.PBRMaterial(
        baseColorTexture=Image.fromarray(texture),
        metallicFactor=0.5,
        roughnessFactor=1.0
    )
    visual = trimesh.visual.texture.TextureVisuals(uv=vertex_uvs, material=material)
    mesh = trimesh.Trimesh(
        vertices=vertices,
        vertex_normals=vertex_normals,
        faces=faces,
        visual=visual,
        process=False  # keep vertex order/attributes exactly as given
    )
    mesh.export(save_path)
44
+
45
+
46
def save_ply(
    save_path: Union[str, os.PathLike],
    vertices: np.ndarray,
    faces: np.ndarray,
    vertex_colors: np.ndarray,
    vertex_normals: Optional[np.ndarray] = None,
):
    """Export a per-vertex-colored triangle mesh to a PLY file via trimesh.

    Args:
        save_path: output file path.
        vertices: (N, 3) vertex positions.
        faces: (M, 3) triangle indices.
        vertex_colors: (N, 3) or (N, 4) per-vertex colors.
        vertex_normals: optional (N, 3) per-vertex normals.
    """
    # Imported lazily so trimesh is only required when this helper is used.
    # (Removed the unused `from PIL import Image` of the original.)
    import trimesh
    import trimesh.visual

    trimesh.Trimesh(
        vertices=vertices,
        faces=faces,
        vertex_colors=vertex_colors,
        vertex_normals=vertex_normals,
        process=False  # keep vertex order/attributes exactly as given
    ).export(save_path)
64
+
65
+
66
def read_image(path: Union[str, os.PathLike, IO]) -> np.ndarray:
    """
    Read a image, return uint8 RGB array of shape (H, W, 3).
    """
    raw = Path(path).read_bytes() if isinstance(path, (str, os.PathLike)) else path.read()
    # Decode to BGR (OpenCV default), then convert to RGB for the caller.
    bgr = cv2.imdecode(np.frombuffer(raw, np.uint8), cv2.IMREAD_COLOR)
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
76
+
77
+
78
def write_image(path: Union[str, os.PathLike, IO], image: np.ndarray, quality: int = 95):
    """
    Write a image, input uint8 RGB array of shape (H, W, 3).
    Encodes as JPEG with the given quality.
    """
    bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    _, encoded = cv2.imencode('.jpg', bgr, [cv2.IMWRITE_JPEG_QUALITY, quality])
    payload = encoded.tobytes()
    if isinstance(path, (str, os.PathLike)):
        Path(path).write_bytes(payload)
    else:
        path.write(payload)
87
+
88
+
89
def read_depth(path: Union[str, os.PathLike, IO]) -> np.ndarray:
    """
    Read a depth image, return float32 depth array of shape (H, W).

    Inverse of `write_depth`: 16-bit PNG codes are mapped back through a
    logarithmic scale between the 'near'/'far' values stored as PNG text
    metadata. Code 0 decodes to NaN (unknown) and 65535 to +inf.
    """
    if isinstance(path, (str, os.PathLike)):
        data = Path(path).read_bytes()
    else:
        data = path.read()
    pil_image = Image.open(io.BytesIO(data))
    # Metadata written by write_depth; float(None) raises if fields are absent.
    near = float(pil_image.info.get('near'))
    far = float(pil_image.info.get('far'))
    depth = np.array(pil_image)
    mask_nan, mask_inf = depth == 0, depth == 65535
    # Codes 1..65534 -> t in [0, 1], then exponential interpolation near^(1-t) * far^t.
    depth = (depth.astype(np.float32) - 1) / 65533
    depth = near ** (1 - depth) * far ** depth
    if 'unit' in pil_image.info:  # Legacy support for depth units
        unit = float(pil_image.info.get('unit'))
        depth = depth * unit
    depth[mask_nan] = np.nan
    depth[mask_inf] = np.inf
    return depth
110
+
111
def write_depth(
    path: Union[str, os.PathLike, IO],
    depth: np.ndarray,
    max_range: float = 1e5,
    compression_level: int = 7,
):
    """
    Encode and write a depth image as 16-bit PNG format.

    ## Parameters:
    - `path: Union[str, os.PathLike, IO]`
        The file path or file object to write to.
    - `depth: np.ndarray`
        The depth array, float32 array of shape (H, W).
        May contain `NaN` for invalid values and `Inf` for infinite values.
    - `max_range: float`
        Maximum far/near ratio; far is clamped to `near * max_range`.
    - `compression_level: int`
        PNG compression level.

    Depth values are encoded as follows:
    - 0: unknown (NaN)
    - 1 ~ 65534: depth values on a logarithmic scale between near and far
    - 65535: infinity

    Metadata is stored in the PNG file as text fields:
    - `near`: the minimum depth value
    - `far`: the maximum depth value
    """
    mask_values, mask_nan, mask_inf = np.isfinite(depth), np.isnan(depth), np.isinf(depth)

    depth = depth.astype(np.float32)
    # (Removed the dead assignment `mask_finite = depth` from the original.)
    near = max(depth[mask_values].min(), 1e-5)
    far = max(near * 1.1, min(depth[mask_values].max(), near * max_range))
    # Map depth logarithmically into codes 1..65534.
    depth = 1 + np.round((np.log(np.nan_to_num(depth, nan=0).clip(near, far) / near) / np.log(far / near)).clip(0, 1) * 65533).astype(np.uint16)
    depth[mask_nan] = 0
    depth[mask_inf] = 65535

    pil_image = Image.fromarray(depth)
    pnginfo = PngImagePlugin.PngInfo()
    pnginfo.add_text('near', str(near))
    pnginfo.add_text('far', str(far))
    pil_image.save(path, pnginfo=pnginfo, compress_level=compression_level)
150
+
151
+
152
def read_segmentation(path: Union[str, os.PathLike, IO]) -> Tuple[np.ndarray, Dict[str, int]]:
    """
    Read a segmentation mask.

    ### Parameters:
    - `path: Union[str, os.PathLike, IO]`
        The file path or file object to read from.

    ### Returns:
    - `Tuple[np.ndarray, Dict[str, int]]`
        - `mask`: uint8 or uint16 numpy.ndarray of shape (H, W).
        - `labels`: the {label_name: label_id} mapping stored in the PNG
          metadata, or None when absent.
    """
    raw = Path(path).read_bytes() if isinstance(path, (str, os.PathLike)) else path.read()
    pil_image = Image.open(io.BytesIO(raw))
    labels = json.loads(pil_image.info['labels']) if 'labels' in pil_image.info else None
    return np.array(pil_image), labels
172
+
173
+
174
def write_segmentation(path: Union[str, os.PathLike, IO], mask: np.ndarray, labels: Dict[str, int] = None, compression_level: int = 7):
    """
    Write a segmentation mask and label mapping, as PNG format.

    ### Parameters:
    - `path: Union[str, os.PathLike, IO]`
        The file path or file object to write to.
    - `mask: np.ndarray`
        The segmentation mask, uint8 or uint16 array of shape (H, W).
    - `labels: Dict[str, int] = None`
        The label mapping, stored as compact JSON in a PNG text field.
    - `compression_level: int = 7`
        The compression level for PNG compression.
    """
    assert mask.dtype == np.uint8 or mask.dtype == np.uint16, f"Unsupported dtype {mask.dtype}"
    pnginfo = PngImagePlugin.PngInfo()
    if labels is not None:
        compact_json = json.dumps(labels, ensure_ascii=True, separators=(',', ':'))
        pnginfo.add_text('labels', compact_json)
    Image.fromarray(mask).save(path, pnginfo=pnginfo, compress_level=compression_level)
194
+
195
+
196
+
197
def read_normal(path: Union[str, os.PathLike, IO]) -> np.ndarray:
    """
    Read a normal image, return float32 normal array of shape (H, W, 3).

    Inverse of `write_normal`: 16-bit RGB values are mapped back to [-1, 1]
    with the Y and Z components negated, then re-normalized to unit length.
    All-zero pixels decode to NaN (invalid).
    """
    if isinstance(path, (str, os.PathLike)):
        data = Path(path).read_bytes()
    else:
        data = path.read()
    normal = cv2.cvtColor(cv2.imdecode(np.frombuffer(data, np.uint8), cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGB)
    mask_nan = np.all(normal == 0, axis=-1)
    normal = (normal.astype(np.float32) / 65535 - 0.5) * [2.0, -2.0, -2.0]
    # BUG FIX: the per-pixel norm has shape (H, W); keep a trailing axis so the
    # division broadcasts against the (H, W, 3) vectors instead of raising a
    # shape-mismatch error as the original did.
    norm = np.sqrt(np.sum(np.square(normal), axis=-1, keepdims=True)) + 1e-12
    normal = normal / norm
    normal[mask_nan] = np.nan
    return normal
211
+
212
+
213
def write_normal(path: Union[str, os.PathLike, IO], normal: np.ndarray, compression_level: int = 7) -> np.ndarray:
    """
    Write a normal image, input float32 normal array of shape (H, W, 3).
    Components are mapped to 16-bit PNG values with Y/Z negated; NaN pixels
    are stored as all-zero (see `read_normal`).
    """
    invalid = np.isnan(normal).any(axis=-1)
    encoded = ((normal * [0.5, -0.5, -0.5] + 0.5).clip(0, 1) * 65535).astype(np.uint16)
    encoded[invalid] = 0
    png = cv2.imencode('.png', cv2.cvtColor(encoded, cv2.COLOR_RGB2BGR), [cv2.IMWRITE_PNG_COMPRESSION, compression_level])[1].tobytes()
    if isinstance(path, (str, os.PathLike)):
        Path(path).write_bytes(png)
    else:
        path.write(png)
225
+
226
+
227
def read_mask(path: Union[str, os.PathLike, IO[bytes]]) -> np.ndarray:
    """
    Read a binary mask, return bool array of shape (H, W).
    Any non-zero pixel counts as True; multi-channel images use channel 0.
    """
    raw = Path(path).read_bytes() if isinstance(path, (str, os.PathLike)) else path.read()
    decoded = cv2.imdecode(np.frombuffer(raw, np.uint8), cv2.IMREAD_UNCHANGED)
    if decoded.ndim == 3:
        decoded = decoded[..., 0]
    return decoded > 0
239
+
240
+
241
def write_mask(path: Union[str, os.PathLike, IO[bytes]], mask: np.ndarray, compression_level: int = 7):
    """
    Write a binary mask, input bool array of shape (H, W).
    True pixels are stored as 255, False as 0, in an 8-bit PNG.
    """
    assert mask.dtype == bool, f"Mask must be bool array, got {mask.dtype}"
    # bool -> uint8 {0, 255}; the original's second redundant astype is removed.
    mask_u8 = mask.astype(np.uint8) * 255
    data = cv2.imencode('.png', mask_u8, [cv2.IMWRITE_PNG_COMPRESSION, compression_level])[1].tobytes()
    if isinstance(path, (str, os.PathLike)):
        Path(path).write_bytes(data)
    else:
        path.write(data)
252
+
253
+
254
JSON_TYPE = Union[str, int, float, bool, None, Dict[str, "JSON"], List["JSON"]]


def read_json(path: Union[str, os.PathLike, IO[str]]) -> JSON_TYPE:
    """Load JSON from a file path or an open text stream."""
    text = Path(path).read_text() if isinstance(path, (str, os.PathLike)) else path.read()
    return json.loads(text)


def write_json(path: Union[str, os.PathLike, IO[str]], content: JSON_TYPE):
    """Serialize `content` as JSON to a file path or an open text stream."""
    text = json.dumps(content)
    if isinstance(path, (str, os.PathLike)):
        Path(path).write_text(text)
    else:
        path.write(text)
third_party/lingbot_depth/mdm/utils/tools.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+ import time
3
+ from pathlib import Path
4
+ from numbers import Number
5
+ from functools import wraps
6
+ import warnings
7
+ import math
8
+ import json
9
+ import os
10
+ import importlib
11
+ import importlib.util
12
+
13
+
14
def catch_exception(fn):
    """Decorator: run `fn`, returning None (after printing the traceback) on any exception."""
    @wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception:
            import traceback
            # BUG FIX: the original used end='r', which appended a literal 'r'
            # to the message; print with a normal newline instead.
            print(f"Exception in {fn.__name__}")
            traceback.print_exc(chain=False)
            time.sleep(0.1)  # let the traceback flush before interleaved output
            return None
    return wrapper
27
+
28
+
29
class CallbackOnException:
    """Context manager that swallows a given exception type and invokes a callback instead.

    Other exception types propagate unchanged.
    """

    def __init__(self, callback: Callable, exception: type):
        self.exception = exception
        self.callback = callback

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Returning True suppresses the exception; only do so for the
        # configured type, after running the callback.
        handled = isinstance(exc_val, self.exception)
        if handled:
            self.callback()
        return handled
42
+
43
def traverse_nested_dict_keys(d: Dict[str, Dict]) -> Generator[Tuple[str, ...], None, None]:
    """Yield the key-path tuple of every leaf (non-dict) value in a nested dict."""
    for key, value in d.items():
        if isinstance(value, dict):
            yield from ((key,) + tail for tail in traverse_nested_dict_keys(value))
        else:
            yield (key,)
50
+
51
+
52
def get_nested_dict(d: Dict[str, Dict], keys: Tuple[str, ...], default: Any = None):
    """Walk `keys` into nested dict `d`; stop early and return `default` when a level is missing."""
    node = d
    for key in keys:
        node = node.get(key, default)
        if node is None:
            break
    return node
58
+
59
def set_nested_dict(d: Dict[str, Dict], keys: Tuple[str, ...], value: Any):
    """Set `value` at key-path `keys` in nested dict `d`, creating intermediate dicts as needed."""
    node = d
    for key in keys[:-1]:
        node = node.setdefault(key, {})
    node[keys[-1]] = value
63
+
64
+
65
def key_average(list_of_dicts: list) -> Dict[str, Any]:
    """
    Returns a dictionary with the average value of each key in the input list of dictionaries.
    Missing and NaN entries are skipped; a key with no valid values averages to NaN.
    """
    # Collect every leaf key-path that occurs in any record.
    all_paths = set()
    for record in list_of_dicts:
        all_paths.update(traverse_nested_dict_keys(record))

    averaged: Dict[str, Any] = {}
    for path in sorted(all_paths):
        valid = []
        for record in list_of_dicts:
            value = get_nested_dict(record, path)
            if value is not None and not math.isnan(value):
                valid.append(value)
        mean = sum(valid) / len(valid) if valid else float('nan')
        set_nested_dict(averaged, path, mean)
    return averaged
83
+
84
+
85
def flatten_nested_dict(d: Dict[str, Any], parent_key: Tuple[str, ...] = None) -> Dict[Tuple[str, ...], Any]:
    """
    Flattens a nested dictionary into a single-level dictionary, with keys as tuples.
    """
    prefix = () if parent_key is None else parent_key
    flat: Dict[Tuple[str, ...], Any] = {}
    for key, value in d.items():
        path = prefix + (key,)
        if isinstance(value, MutableMapping):
            flat.update(flatten_nested_dict(value, path))
        else:
            flat[path] = value
    return flat
99
+
100
+
101
def unflatten_nested_dict(d: Dict[str, Any]) -> Dict[str, Any]:
    """
    Unflattens a single-level dictionary into a nested dictionary, with keys as tuples.
    Inverse of `flatten_nested_dict`.
    """
    nested: Dict[str, Any] = {}
    for key_path, value in d.items():
        node = nested
        for part in key_path[:-1]:
            node = node.setdefault(part, {})
        node[key_path[-1]] = value
    return nested
114
+
115
+
116
def read_jsonl(file):
    """Read a JSON-Lines file into a list of parsed objects (one per line)."""
    import json
    with open(file, 'r') as f:
        return [json.loads(line) for line in f]
121
+
122
+
123
def write_jsonl(data: List[dict], file):
    """Write a list of dicts as JSON-Lines (one JSON object per line)."""
    import json
    lines = [json.dumps(item) + '\n' for item in data]
    with open(file, 'w') as f:
        f.writelines(lines)
128
+
129
+
130
def to_hierachical_dataframe(data: List[Dict[Tuple[str, ...], Any]]):
    """Build a pandas DataFrame with hierarchical (MultiIndex) columns from nested dict records.

    NOTE(review): the annotation suggests tuple keys, but each record is
    flattened here first, so plain nested dicts are the actual expected input.
    """
    import pandas as pd
    # Flatten each record so every leaf key becomes a tuple path usable as a MultiIndex entry.
    data = [flatten_nested_dict(d) for d in data]
    df = pd.DataFrame(data)
    # Sort columns so related sub-keys group together before building the MultiIndex.
    df = df.sort_index(axis=1)
    df.columns = pd.MultiIndex.from_tuples(df.columns)
    return df
137
+
138
+
139
def recursive_replace(d: Union[List, Dict, str], mapping: Dict[str, str]):
    """Recursively apply substring replacements to every string in a nested
    list/dict structure. Containers are modified in place; other leaf types
    are returned unchanged."""
    if isinstance(d, str):
        for old, new in mapping.items():
            d = d.replace(old, new)
        return d
    if isinstance(d, list):
        for index in range(len(d)):
            d[index] = recursive_replace(d[index], mapping)
        return d
    if isinstance(d, dict):
        for key in d:
            d[key] = recursive_replace(d[key], mapping)
        return d
    return d
150
+
151
+
152
class timeit:
    """Wall-clock timer usable as a context manager or a decorator.

    With average=True, completed runs are appended to a class-level history
    keyed by `name`, and the running average is reported instead of the
    single-run time.
    """
    # Shared across all instances: name -> list of completed timeit objects
    # (populated only when average=True).
    _history: Dict[str, List['timeit']] = {}

    def __init__(self, name: str = None, verbose: bool = True, average: bool = False):
        self.name = name
        self.verbose = verbose
        self.start = None
        self.end = None
        self.average = average
        if average and name not in timeit._history:
            timeit._history[name] = []

    def __call__(self, func: Callable):
        """Decorator form; supports both sync and async callables.

        NOTE(review): the inner `timeit(...)` is built with default flags, so
        `verbose`/`average` set on the decorator instance are not forwarded —
        confirm intended.
        """
        import inspect
        if inspect.iscoroutinefunction(func):
            async def wrapper(*args, **kwargs):
                with timeit(self.name or func.__qualname__):
                    ret = await func(*args, **kwargs)
                return ret
            return wrapper
        else:
            def wrapper(*args, **kwargs):
                with timeit(self.name or func.__qualname__):
                    ret = func(*args, **kwargs)
                return ret
            return wrapper

    def __enter__(self):
        self.start = time.time()
        return self

    @property
    def time(self) -> float:
        """Elapsed seconds of a completed run."""
        assert self.start is not None, "Time not yet started."
        assert self.end is not None, "Time not yet ended."
        return self.end - self.start

    @property
    def average_time(self) -> float:
        """Mean elapsed time over all recorded runs with this name (average=True only)."""
        assert self.average, "Average time not available."
        return sum(t.time for t in timeit._history[self.name]) / len(timeit._history[self.name])

    @property
    def history(self) -> List['timeit']:
        """All recorded runs sharing this timer's name (empty if none recorded)."""
        return timeit._history.get(self.name, [])

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time.time()
        # Record before reporting so the just-finished run is included in the average.
        if self.average:
            timeit._history[self.name].append(self)
        if self.verbose:
            if self.average:
                avg = self.average_time
                print(f"{self.name or 'It'} took {avg:.6f} seconds in average.")
            else:
                print(f"{self.name or 'It'} took {self.time:.6f} seconds.")
209
+
210
def strip_common_prefix_suffix(strings: List[str]) -> List[str]:
    """Remove the longest common prefix and suffix shared by all strings.

    Fixes over the original index-scanning version:
    - no IndexError when some string is shorter than the first one;
    - identical strings (or a single string) now strip to empty strings
      instead of leaving residue from loops that completed without `break`.

    The suffix is capped so it never overlaps the prefix in the shortest string.
    """
    prefix_len = len(os.path.commonprefix(strings))
    suffix_len = len(os.path.commonprefix([s[::-1] for s in strings]))
    # Prevent the prefix and suffix from overlapping in the shortest string.
    shortest = min(len(s) for s in strings)
    suffix_len = max(0, min(suffix_len, shortest - prefix_len))
    return [s[prefix_len:len(s) - suffix_len] for s in strings]
222
+
223
+
224
def multithead_execute(inputs: List[Any], num_workers: int, pbar = None):
    """Decorator factory: run the decorated function over `inputs` in a thread pool.

    Usage:
        @multithead_execute(items, num_workers=8)
        def process(item): ...

    The decorated function is executed immediately for its side effects; the
    decorator returns None. A tqdm progress bar is created unless one is
    passed in. Each call is wrapped so exceptions are printed and swallowed
    (via catch_exception / suppress_traceback defined in this module).

    NOTE(review): executor.map results are never consumed, so return values
    are discarded; combined with catch_exception, per-item failures are only
    logged — confirm intended.
    """
    from concurrent.futures import ThreadPoolExecutor
    from contextlib import nullcontext
    from tqdm import tqdm

    # Reuse the caller's progress bar if given, otherwise create one sized to the inputs.
    if pbar is not None:
        pbar.total = len(inputs) if hasattr(inputs, '__len__') else None
    else:
        pbar = tqdm(total=len(inputs) if hasattr(inputs, '__len__') else None)

    def decorator(fn: Callable):
        with (
            ThreadPoolExecutor(max_workers=num_workers) as executor,
            pbar
        ):
            pbar.refresh()
            @catch_exception
            @suppress_traceback
            def _fn(input):
                ret = fn(input)
                pbar.update()
                return ret
            executor.map(_fn, inputs)
            executor.shutdown(wait=True)

    return decorator
252
def suppress_traceback(fn):
    """Decorator: re-raise exceptions from `fn` with up to two innermost
    wrapper frames trimmed from the traceback.

    BUG FIX: the original did `e.__traceback__.tb_next.tb_next` unconditionally,
    which raises AttributeError when the traceback has fewer than two extra
    frames; this version advances frame-by-frame and stops safely.
    """
    @wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            tb = e.__traceback__
            for _ in range(2):
                if tb is not None and tb.tb_next is not None:
                    tb = tb.tb_next
            e.__traceback__ = tb
            raise
    return wrapper
261
+
262
+
263
class no_warnings:
    """Apply a warnings filter (default: ignore everything) as a decorator or context manager."""

    def __init__(self, action: str = 'ignore', **kwargs):
        self.action = action
        self.filter_kwargs = kwargs

    def __call__(self, fn):
        """Decorator form: run `fn` inside a fresh filtered-warnings scope."""
        @wraps(fn)
        def wrapper(*args, **kwargs):
            with type(self)(self.action, **self.filter_kwargs):
                return fn(*args, **kwargs)
        return wrapper

    def __enter__(self):
        # Snapshot the current filter state so __exit__ can restore it.
        self.warnings_manager = warnings.catch_warnings()
        self.warnings_manager.__enter__()
        warnings.simplefilter(self.action, **self.filter_kwargs)

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.warnings_manager.__exit__(exc_type, exc_val, exc_tb)
284
+
285
def import_file_as_module(file_path: Union[str, os.PathLike], module_name: str):
    """Load a Python source file and return it as a module object named `module_name`."""
    module_spec = importlib.util.spec_from_file_location(module_name, file_path)
    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded
third_party/lingbot_depth/mdm/utils/vis.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+
3
+ import numpy as np
4
+ import matplotlib
5
+ import trimesh
6
+ import random
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import os
10
+
11
def colorize_depth(depth: np.ndarray, mask: np.ndarray = None, normalize: bool = True, cmap: str = 'Spectral') -> np.ndarray:
    """Render a depth map as an RGB uint8 image via inverse depth (disparity).

    Pixels with non-positive depth (and, if given, pixels outside ``mask``)
    become NaN and map to black. When ``normalize`` is set, disparity is
    rescaled by its robust quantile range before colorizing.
    """
    valid = depth > 0 if mask is None else (depth > 0) & mask
    disp = 1 / np.where(valid, depth, np.nan)
    if normalize:
        # NOTE(review): upper quantile is 0.99 here, while the sibling
        # colorizers use 0.999 — presumably intentional headroom; confirm.
        lo = np.nanquantile(disp, 0.001)
        hi = np.nanquantile(disp, 0.99)
        disp = (disp - lo) / (hi - lo)
    rgb = matplotlib.colormaps[cmap](1.0 - disp)[..., :3]
    rgb = np.nan_to_num(rgb, copy=False)  # NaNs (invalid pixels) -> 0 -> black
    return np.ascontiguousarray((rgb.clip(0, 1) * 255).astype(np.uint8))
25
+
26
+
27
def colorize_depth_affine(depth: np.ndarray, mask: np.ndarray = None, cmap: str = 'Spectral') -> np.ndarray:
    """Render depth known only up to an affine transform as RGB uint8.

    The map is rescaled by its robust (0.1%/99.9%) quantile range and fed
    to the colormap directly (no inversion); masked-out pixels become NaN
    and render black.
    """
    if mask is not None:
        depth = np.where(mask, depth, np.nan)
    lo = np.nanquantile(depth, 0.001)
    hi = np.nanquantile(depth, 0.999)
    scaled = (depth - lo) / (hi - lo)
    rgb = np.nan_to_num(matplotlib.colormaps[cmap](scaled)[..., :3], copy=False)
    return np.ascontiguousarray((rgb.clip(0, 1) * 255).astype(np.uint8))
36
+
37
+
38
def colorize_disparity(disparity: np.ndarray, mask: np.ndarray = None, normalize: bool = True, cmap: str = 'Spectral') -> np.ndarray:
    """Render a disparity map as an RGB uint8 image.

    Optionally rescales by the robust (0.1%/99.9%) quantile range; the
    colormap input is flipped (``1 - disparity``) so near is warm. Masked
    pixels become NaN and render black.
    """
    if mask is not None:
        disparity = np.where(mask, disparity, np.nan)
    if normalize:
        lo = np.nanquantile(disparity, 0.001)
        hi = np.nanquantile(disparity, 0.999)
        disparity = (disparity - lo) / (hi - lo)
    rgb = np.nan_to_num(matplotlib.colormaps[cmap](1.0 - disparity)[..., :3], copy=False)
    return np.ascontiguousarray((rgb.clip(0, 1) * 255).astype(np.uint8))
48
+
49
+
50
def colorize_normal(normal: np.ndarray, mask: np.ndarray = None) -> np.ndarray:
    """Map surface normals in [-1, 1]^3 to an RGB uint8 image.

    X keeps its sign while Y and Z are flipped before shifting into
    [0, 1] (presumably a camera-convention change, e.g. OpenGL vs
    OpenCV — confirm against callers). Masked pixels are zeroed first,
    so they render mid-gray (127, 127, 127).
    """
    if mask is not None:
        normal = np.where(mask[..., None], normal, 0)
    shifted = normal * np.array([0.5, -0.5, -0.5]) + 0.5
    return (shifted.clip(0, 1) * 255).astype(np.uint8)
56
+
57
+
58
def colorize_error_map(error_map: np.ndarray, mask: np.ndarray = None, cmap: str = 'plasma', value_range: Tuple[float, float] = None):
    """Render per-pixel error magnitudes with a sequential colormap.

    ``value_range`` pins (vmin, vmax); otherwise the map's own NaN-aware
    min/max are used. Values are clipped to [0, 1] after rescaling and
    masked-out pixels render black.
    """
    if value_range is not None:
        vmin, vmax = value_range
    else:
        vmin, vmax = np.nanmin(error_map), np.nanmax(error_map)
    normalized = ((error_map - vmin) / (vmax - vmin)).clip(0, 1)
    # Keep the colormap object in its own name instead of shadowing `cmap`.
    colormap = matplotlib.colormaps[cmap]
    colored = colormap(normalized)[..., :3]
    if mask is not None:
        colored = np.where(mask[..., None], colored, 0)
    return np.ascontiguousarray((colored.clip(0, 1) * 255).astype(np.uint8))
third_party/lingbot_depth/pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "mdm"
7
+ version = "1.0.0"
8
+ readme = "README.md"
9
+ dependencies = [
10
+ "click",
11
+ "opencv-python",
12
+ "scipy",
13
+ "matplotlib",
14
+ "trimesh",
15
+ "pillow",
16
+ "huggingface_hub",
17
+ "numpy",
18
+ "torch==2.6.0",
19
+ "torchvision",
20
+ "xformers==v0.0.29.post2",
21
+ ]
22
+ requires-python = ">=3.9"
23
+
24
+ [tool.setuptools.packages.find]
25
+ where = ["."]
26
+ include = ["mdm*"]
third_party/sam3/pyproject.toml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sam3"
7
+ dynamic = ["version"]
8
+ description = "SAM3 (Segment Anything Model 3) implementation"
9
+ readme = "README.md"
10
+ requires-python = ">=3.8"
11
+ license = {file = "LICENSE"}
12
+ authors = [
13
+ {name = "Meta AI Research"}
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Intended Audience :: Science/Research",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.8",
21
+ "Programming Language :: Python :: 3.9",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
26
+ ]
27
+ dependencies = [
28
+ "timm>=1.0.17",
29
+ "numpy>=1.26,<2",
30
+ "tqdm",
31
+ "ftfy==6.1.1",
32
+ "regex",
33
+ "iopath>=0.1.10",
34
+ "typing_extensions",
35
+ "huggingface_hub",
36
+ ]
37
+
38
+ [project.optional-dependencies]
39
+ dev = [
40
+ "pytest",
41
+ "pytest-cov",
42
+ "black==24.2.0",
43
+ "ufmt==2.8.0",
44
+ "ruff-api==0.1.0",
45
+ "usort==1.0.2",
46
+ "gitpython==3.1.31",
47
+ "yt-dlp",
48
+ "pandas",
49
+ "opencv-python",
50
+ "pycocotools",
51
+ "numba",
52
+ "python-rapidjson",
53
+ ]
54
+ notebooks = [
55
+ "matplotlib",
56
+ "jupyter",
57
+ "notebook",
58
+ "ipywidgets",
59
+ "ipycanvas",
60
+ "ipympl",
61
+ "pycocotools",
62
+ "decord",
63
+ "opencv-python",
64
+ "einops",
65
+ "scikit-image",
66
+ "scikit-learn",
67
+ ]
68
+ train = [
69
+ "hydra-core",
70
+ "submitit",
71
+ "tensorboard",
72
+ "zstandard",
73
+ "scipy",
74
+ "torchmetrics",
75
+ "fvcore",
76
+ "fairscale",
77
+ "scikit-image",
78
+ "scikit-learn",
79
+ ]
80
+
81
+ [project.urls]
82
+ "Homepage" = "https://github.com/facebookresearch/sam3"
83
+ "Bug Tracker" = "https://github.com/facebookresearch/sam3/issues"
84
+
85
+ [tool.setuptools.packages.find]
86
+ include = ["sam3*"]
87
+ exclude = ["build*", "scripts*", "examples*"]
88
+
89
+ [tool.setuptools.package-data]
90
+ sam3 = ["assets/*.txt.gz"]
91
+
92
+ [tool.setuptools.dynamic]
93
+ version = {attr = "sam3.__version__"}
94
+
95
+ [tool.black]
96
+ line-length = 88
97
+ target-version = ['py38', 'py39', 'py310', 'py311', 'py312']
98
+ include = '\.pyi?$'
99
+
100
+ [tool.isort]
101
+ profile = "black"
102
+ multi_line_output = 3
103
+
104
+ [tool.usort]
105
+ first_party_detection = false
106
+
107
+ [tool.ufmt]
108
+ formatter = "ruff-api"
109
+
110
+ [tool.mypy]
111
+ python_version = "3.12"
112
+ warn_return_any = true
113
+ warn_unused_configs = true
114
+ disallow_untyped_defs = true
115
+ disallow_incomplete_defs = true
116
+
117
+ [[tool.mypy.overrides]]
118
+ module = [
119
+ "torch.*",
120
+ "torchvision.*",
121
+ "timm.*",
122
+ "numpy.*",
123
+ "PIL.*",
124
+ "tqdm.*",
125
+ "ftfy.*",
126
+ "regex.*",
127
+ "iopath.*",
128
+ ]
129
+ ignore_missing_imports = true
130
+
131
+ [tool.pytest.ini_options]
132
+ testpaths = ["tests"]
133
+ python_files = "test_*.py"
134
+ python_classes = "Test*"
135
+ python_functions = "test_*"
third_party/sam3/sam3/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ # pyre-unsafe
4
+
5
+ from .model_builder import build_sam3_image_model, build_sam3_predictor
6
+
7
+ __version__ = "0.1.0"
8
+
9
+ __all__ = ["build_sam3_image_model", "build_sam3_predictor"]
third_party/sam3/sam3/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (453 Bytes). View file
 
third_party/sam3/sam3/__pycache__/logger.cpython-311.pyc ADDED
Binary file (3.67 kB). View file
 
third_party/sam3/sam3/__pycache__/model_builder.cpython-311.pyc ADDED
Binary file (42.9 kB). View file
 
third_party/sam3/sam3/agent/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ # pyre-unsafe
third_party/sam3/sam3/agent/agent_core.py ADDED
@@ -0,0 +1,565 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ # pyre-unsafe
4
+
5
+ import copy
6
+ import json
7
+ import os
8
+
9
+ import cv2
10
+ from PIL import Image
11
+
12
+ from .client_llm import send_generate_request
13
+ from .client_sam3 import call_sam_service
14
+ from .viz import visualize
15
+
16
+
17
def save_debug_messages(messages_list, debug, debug_folder_path, debug_jsonl_path):
    """Persist the conversation history as JSON Lines when debugging is on.

    No-op unless both ``debug`` is truthy and ``debug_jsonl_path`` is set.
    The folder is created on demand and the file is rewritten from scratch
    on every call.

    Fix: the old code wrote ``json.dumps(msg, indent=4)`` records back to
    back, producing a file that is neither valid JSON nor valid JSONL.
    One compact object per line keeps it machine-parseable, matching the
    ``jsonl`` in the parameter name (NOTE(review): callers still pass a
    ``.json`` filename — consider renaming it to ``.jsonl``).
    """
    if debug and debug_jsonl_path:
        # Ensure the debug directory exists before writing
        os.makedirs(debug_folder_path, exist_ok=True)
        with open(debug_jsonl_path, "w") as f:
            for msg in messages_list:
                f.write(json.dumps(msg) + "\n")
25
+
26
+
27
def cleanup_debug_files(debug, debug_folder_path, debug_jsonl_path):
    """Remove the debug history file and its folder after a successful run.

    Does nothing unless debugging was enabled and a folder path was set.
    Any filesystem error (e.g. a non-empty folder) is reported as a
    warning rather than raised, so cleanup can never fail the caller.
    """
    if not (debug and debug_folder_path):
        return
    try:
        # File first, then the (now hopefully empty) directory.
        for path, remove in ((debug_jsonl_path, os.remove), (debug_folder_path, os.rmdir)):
            if os.path.exists(path):
                remove(path)
    except Exception as e:
        print(f"Warning: Could not clean up debug files: {e}")
37
+
38
+
39
def count_images(messages):
    """Count the total number of images present in the messages history.

    Only messages whose ``content`` is a list are scanned; each content
    item that is a dict with ``type == "image"`` counts once. String
    contents (e.g. system prompts) contribute zero.
    """
    total = 0
    for message in messages:
        content = message.get("content")
        if not isinstance(content, list):
            continue
        total += sum(
            1
            for item in content
            if isinstance(item, dict) and item.get("type") == "image"
        )
    return total
54
+
55
+
56
+ def _prune_messages_for_next_round(
57
+ messages_list,
58
+ used_text_prompts,
59
+ latest_sam3_text_prompt,
60
+ img_path,
61
+ initial_text_prompt,
62
+ ):
63
+ """Return a new messages list that contains only:
64
+ 1) messages[:2] (with optional warning text added to the second message's content)
65
+ 2) the latest assistant message (and everything after it) that contains a segment_phrase tool call
66
+ """
67
+ # There should not be more than 10 messages in the conversation history
68
+ assert len(messages_list) < 10
69
+
70
+ # Part 1: always keep the first two message JSONs
71
+ part1 = copy.deepcopy(messages_list[:2])
72
+
73
+ # Part 2: search backwards for the latest assistant message containing a segment_phrase tool call
74
+ part2_start_idx = None
75
+ for idx in range(len(messages_list) - 1, 1, -1):
76
+ msg = messages_list[idx]
77
+ # We only consider assistant messages with a "content" list
78
+ if msg.get("role") != "assistant" or "content" not in msg:
79
+ continue
80
+ # Look for any content element that is a text containing the segment_phrase tool call
81
+ for content in msg["content"]:
82
+ if (
83
+ isinstance(content, dict)
84
+ and content.get("type") == "text"
85
+ and "<tool>" in content.get("text", "")
86
+ and "segment_phrase" in content.get("text", "")
87
+ ):
88
+ part2_start_idx = idx
89
+ break
90
+ if part2_start_idx is not None:
91
+ break
92
+
93
+ part2 = messages_list[part2_start_idx:] if part2_start_idx is not None else []
94
+
95
+ # Part 3: decide whether to add warning text to the second message in part1
96
+ previously_used = (
97
+ [p for p in used_text_prompts if p != latest_sam3_text_prompt]
98
+ if latest_sam3_text_prompt
99
+ else list(used_text_prompts)
100
+ )
101
+ if part2 and len(previously_used) > 0:
102
+ warning_text = f'Note that we have previously called the segment_phrase tool with each "text_prompt" in this list: {list(previously_used)}, but none of the generated results were satisfactory. So make sure that you do not use any of these phrases as the "text_prompt" to call the segment_phrase tool again.'
103
+ # Replace the second message entirely to keep exactly 2 content items
104
+ part1[1] = {
105
+ "role": "user",
106
+ "content": [
107
+ {"type": "image", "image": img_path},
108
+ {
109
+ "type": "text",
110
+ "text": f"The above image is the raw input image. The initial user input query is: '{initial_text_prompt}'."
111
+ + " "
112
+ + warning_text,
113
+ },
114
+ ],
115
+ }
116
+ assert len(part1[1]["content"]) == 2
117
+
118
+ # Build the new messages list: part1 (with optional warning), then part2
119
+ new_messages = list(part1)
120
+ new_messages.extend(part2)
121
+ return new_messages
122
+
123
+
124
def agent_inference(
    img_path: str,
    initial_text_prompt: str,
    debug: bool = False,
    send_generate_request=send_generate_request,
    call_sam_service=call_sam_service,
    max_generations: int = 100,
    output_dir="../../sam3_agent_out",
):
    """
    Given a text prompt and an image, this tool will perform all aspects of agentic problem solving,
    while saving sam3 and MLLM outputs to their respective directories.

    Args:
        img_path: Path to the input image
        initial_text_prompt: Initial text prompt from the user
        debug: Whether to enable debug mode
        send_generate_request: Callable that sends a messages list to the MLLM
        call_sam_service: Callable that runs SAM3 for a text prompt on an image
        max_generations: Maximum number of send_generate_request calls allowed (default: 100)
        output_dir: Root folder for SAM3/agent artifacts

    Returns:
        (messages, final_outputs, rendered_final_output) on success.

    Raises:
        ValueError: On malformed tool calls, exceeded generation budget, or
            a None MLLM response (history is dumped to ``none_out`` first).
    """
    # setup dir
    sam_output_dir = os.path.join(output_dir, "sam_out")
    error_save_dir = os.path.join(output_dir, "none_out")
    debug_save_dir = os.path.join(output_dir, "agent_debug_out")
    os.makedirs(sam_output_dir, exist_ok=True)
    os.makedirs(error_save_dir, exist_ok=True)
    os.makedirs(debug_save_dir, exist_ok=True)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    MLLM_SYSTEM_PROMPT_PATH = os.path.join(
        current_dir, "system_prompts/system_prompt.txt"
    )
    ITERATIVE_CHECKING_SYSTEM_PROMPT_PATH = os.path.join(
        current_dir, "system_prompts/system_prompt_iterative_checking.txt"
    )
    # init variables
    PATH_TO_LATEST_OUTPUT_JSON = ""
    LATEST_SAM3_TEXT_PROMPT = ""
    USED_TEXT_PROMPTS = (
        set()
    )  # Track all previously used text prompts for segment_phrase
    generation_count = 0  # Counter for number of send_generate_request calls

    # debug setup
    debug_folder_path = None
    debug_jsonl_path = None
    if debug:
        debug_folder_path = os.path.join(
            debug_save_dir, f"{img_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]}"
        )
        debug_jsonl_path = os.path.join(debug_folder_path, "debug_history.json")
        os.makedirs(debug_folder_path, exist_ok=True)

    # The helper functions are now defined outside the agent_inference function
    with open(MLLM_SYSTEM_PROMPT_PATH, "r") as f:
        system_prompt = f.read().strip()
    with open(ITERATIVE_CHECKING_SYSTEM_PROMPT_PATH, "r") as f:
        iterative_checking_system_prompt = f.read().strip()

    # Construct the initial message list
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img_path},
                {
                    "type": "text",
                    "text": f"The above image is the raw input image. The initial user input query is: '{initial_text_prompt}'.",
                },
            ],
        },
    ]
    print(f"> Text prompt: {initial_text_prompt}")
    print(f"> Image path: {img_path}")

    print("\n\n")
    print("-" * 30 + f" Round {str(generation_count + 1)}" + "-" * 30)
    print("\n\n")
    generated_text = send_generate_request(messages)
    print(f"\n>>> MLLM Response [start]\n{generated_text}\n<<< MLLM Response [end]\n")
    while generated_text is not None:
        save_debug_messages(messages, debug, debug_folder_path, debug_jsonl_path)
        # FIX: this used to be `assert (condition, message)` — an assertion on
        # a non-empty tuple, which is always true and never fired.
        assert "<tool>" in generated_text, (
            f"Generated text does not contain <tool> tag: {generated_text}"
        )
        generated_text = generated_text.split("</tool>", 1)[0] + "</tool>"
        tool_call_json_str = (
            generated_text.split("<tool>")[-1]
            .split("</tool>")[0]
            .strip()
            .replace(r"}}}", r"}}")  # remove extra } if any
        )
        try:
            tool_call = json.loads(tool_call_json_str)
        except json.JSONDecodeError:
            raise ValueError(f"Invalid JSON in tool call: {tool_call_json_str}")

        if PATH_TO_LATEST_OUTPUT_JSON == "":
            # The first tool call must be segment_phrase or report_no_mask
            assert (
                tool_call["name"] == "segment_phrase"
                or tool_call["name"] == "report_no_mask"
            )

        if tool_call["name"] == "segment_phrase":
            print("🔍 Calling segment_phrase tool...")
            assert list(tool_call["parameters"].keys()) == ["text_prompt"]

            # Check if this text_prompt has been used before
            current_text_prompt = tool_call["parameters"]["text_prompt"]
            if current_text_prompt in USED_TEXT_PROMPTS:
                print(
                    f"❌ Text prompt '{current_text_prompt}' has been used before. Requesting a different prompt."
                )
                duplicate_prompt_message = f"You have previously used '{current_text_prompt}' as your text_prompt to call the segment_phrase tool. You may not use it again. Please call the segment_phrase tool again with a different, perhaps more general, or more creative simple noun phrase prompt, while adhering to all the rules stated in the system prompt. You must also never use any of the following text_prompt(s): {str(list(USED_TEXT_PROMPTS))}."
                messages.append(
                    {
                        "role": "assistant",
                        "content": [{"type": "text", "text": generated_text}],
                    }
                )
                messages.append(
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": duplicate_prompt_message}],
                    }
                )
            else:
                # Add the text_prompt to the set of used prompts
                USED_TEXT_PROMPTS.add(current_text_prompt)
                LATEST_SAM3_TEXT_PROMPT = current_text_prompt
                PATH_TO_LATEST_OUTPUT_JSON = call_sam_service(
                    image_path=img_path,
                    text_prompt=current_text_prompt,
                    output_folder_path=sam_output_dir,
                )
                # FIX: close the JSON file instead of leaking the handle.
                with open(PATH_TO_LATEST_OUTPUT_JSON, "r") as f:
                    sam3_outputs = json.load(f)
                sam3_output_image_path = sam3_outputs["output_image_path"]
                num_masks = len(sam3_outputs["pred_boxes"])

                messages.append(
                    {
                        "role": "assistant",
                        "content": [{"type": "text", "text": generated_text}],
                    }
                )
                if num_masks == 0:
                    print("❌ No masks generated by SAM3, reporting no mask to Qwen.")
                    sam3_output_text_message = f"The segment_phrase tool did not generate any masks for the text_prompt '{current_text_prompt}'. Now, please call the segment_phrase tool again with a different, perhaps more general, or more creative simple noun phrase text_prompt, while adhering to all the rules stated in the system prompt. Please be reminded that the original user query was '{initial_text_prompt}'."
                    messages.append(
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": sam3_output_text_message}
                            ],
                        }
                    )
                else:
                    sam3_output_text_message = rf"The segment_phrase tool generated {num_masks} available masks. All {num_masks} available masks are rendered in this image below, now you must analyze the {num_masks} available mask(s) carefully, compare them against the raw input image and the original user query, and determine your next action. Please be reminded that the original user query was '{initial_text_prompt}'."
                    messages.append(
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": sam3_output_text_message},
                                {"type": "image", "image": sam3_output_image_path},
                            ],
                        }
                    )
                print("\n\n>>> sam3_output_text_message:\n", sam3_output_text_message)

        elif tool_call["name"] == "examine_each_mask":
            print("🔍 Calling examine_each_mask tool...")
            assert LATEST_SAM3_TEXT_PROMPT != ""

            # Make sure that the last message is a image
            assert (
                messages[-1]["content"][1]["type"] == "image"
            ), "Second content element should be an image"
            messages.pop()  # Remove the last user message
            # Add simplified replacement message
            simplified_message = {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "The segment_phrase tool generated several masks. Now you must analyze the mask(s) carefully, compare them against the raw input image and the original user query, and determine your next action.",
                    }
                ],
            }
            messages.append(simplified_message)

            # FIX: close the JSON file instead of leaking the handle.
            with open(PATH_TO_LATEST_OUTPUT_JSON, "r") as f:
                current_outputs = json.load(f)
            num_masks = len(current_outputs["pred_masks"])
            masks_to_keep = []

            # MLLM check the mask one by one
            for i in range(num_masks):
                print(f"🔍 Checking mask {i + 1}/{num_masks}...")
                image_w_mask_i, image_w_zoomed_in_mask_i = visualize(current_outputs, i)

                image_w_zoomed_in_mask_i_path = os.path.join(
                    sam_output_dir, rf"{LATEST_SAM3_TEXT_PROMPT}.png".replace("/", "_")
                ).replace(".png", f"_zoom_in_mask_{i + 1}.png")
                image_w_mask_i_path = os.path.join(
                    sam_output_dir, rf"{LATEST_SAM3_TEXT_PROMPT}.png".replace("/", "_")
                ).replace(".png", f"_selected_mask_{i + 1}.png")
                image_w_zoomed_in_mask_i.save(image_w_zoomed_in_mask_i_path)
                image_w_mask_i.save(image_w_mask_i_path)

                iterative_checking_messages = [
                    {"role": "system", "content": iterative_checking_system_prompt},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": f"The raw input image: "},
                            {"type": "image", "image": img_path},
                            {
                                "type": "text",
                                "text": f"The initial user input query is: '{initial_text_prompt}'",
                            },
                            {
                                "type": "text",
                                "text": f"Image with the predicted segmentation mask rendered on it: ",
                            },
                            {"type": "image", "image": image_w_mask_i_path},
                            {
                                "type": "text",
                                "text": f"Image with the zoomed-in mask: ",
                            },
                            {"type": "image", "image": image_w_zoomed_in_mask_i_path},
                        ],
                    },
                ]
                checking_generated_text = send_generate_request(
                    iterative_checking_messages
                )

                # Process the generated text to determine if the mask should be kept or rejected
                if checking_generated_text is None:
                    raise ValueError(
                        "Generated text is None, which is unexpected. Please check the Qwen server and the input parameters."
                    )
                print(f"Generated text for mask {i + 1}: {checking_generated_text}")
                verdict = (
                    checking_generated_text.split("<verdict>")[-1]
                    .split("</verdict>")[0]
                    .strip()
                )
                if "Accept" in verdict:
                    assert "Reject" not in verdict
                    print(f"Mask {i + 1} accepted, keeping it in the outputs.")
                    masks_to_keep.append(i)
                elif "Reject" in verdict:
                    assert "Accept" not in verdict
                    print(f"Mask {i + 1} rejected, removing it from the outputs.")
                else:
                    raise ValueError(
                        f"Unexpected verdict in generated text: {checking_generated_text}. Expected 'Accept' or 'Reject'."
                    )

            updated_outputs = {
                "original_image_path": current_outputs["original_image_path"],
                "orig_img_h": current_outputs["orig_img_h"],
                "orig_img_w": current_outputs["orig_img_w"],
                "pred_boxes": [current_outputs["pred_boxes"][i] for i in masks_to_keep],
                "pred_scores": [
                    current_outputs["pred_scores"][i] for i in masks_to_keep
                ],
                "pred_masks": [current_outputs["pred_masks"][i] for i in masks_to_keep],
            }

            image_w_check_masks = visualize(updated_outputs)
            image_w_check_masks_path = os.path.join(
                sam_output_dir, rf"{LATEST_SAM3_TEXT_PROMPT}.png"
            ).replace(
                ".png",
                f"_selected_masks_{'-'.join(map(str, [i + 1 for i in masks_to_keep]))}.png".replace(
                    "/", "_"
                ),
            )
            image_w_check_masks.save(image_w_check_masks_path)
            # save the updated json outputs and append to message history
            messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": generated_text}],
                }
            )
            if len(masks_to_keep) == 0:
                messages.append(
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"The original user query was: '{initial_text_prompt}'. The examine_each_mask tool examined and rejected all of the masks generated by the segment_phrase tool. Now, please call the segment_phrase tool again with a different, perhaps more general, or more creative simple noun phrase text_prompt, while adhering to all the rules stated in the system prompt.",
                            }
                        ],
                    }
                )
            else:
                messages.append(
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"The original user query was: '{initial_text_prompt}'. After calling the examine_each_mask tool on the available masks, the number of available masks is now {len(masks_to_keep)}. All {len(masks_to_keep)} available masks are rendered in this image below, now you must analyze the {len(masks_to_keep)} available mask(s) carefully, compare them against the raw input image and the original user query, and determine your next action.",
                            },
                            {"type": "image", "image": image_w_check_masks_path},
                        ],
                    }
                )

            # Create a new filename based on the original path to avoid filename length issues
            base_path = PATH_TO_LATEST_OUTPUT_JSON
            # Remove any existing "masks_" suffix to avoid duplication
            if "masks_" in base_path:
                base_path = base_path.split("masks_")[0] + ".json"
            # Create new filename with current masks; use a clearer suffix when empty
            if len(masks_to_keep) == 0:
                PATH_TO_LATEST_OUTPUT_JSON = base_path.replace(
                    ".json", "masks_none.json"
                )
            else:
                PATH_TO_LATEST_OUTPUT_JSON = base_path.replace(
                    ".json", f"masks_{'_'.join(map(str, masks_to_keep))}.json"
                )
            # FIX: close the JSON file instead of leaking the handle.
            with open(PATH_TO_LATEST_OUTPUT_JSON, "w") as f:
                json.dump(updated_outputs, f, indent=4)

        elif tool_call["name"] == "select_masks_and_return":
            print("🔍 Calling select_masks_and_return tool...")
            # FIX: close the JSON file instead of leaking the handle.
            with open(PATH_TO_LATEST_OUTPUT_JSON, "r") as f:
                current_outputs = json.load(f)

            assert list(tool_call["parameters"].keys()) == ["final_answer_masks"]
            masks_to_keep = tool_call["parameters"]["final_answer_masks"]

            # Keep only valid mask indices, remove duplicates, and preserve deterministic ascending order
            available_masks = set(range(1, len(current_outputs["pred_masks"]) + 1))
            masks_to_keep = sorted({i for i in masks_to_keep if i in available_masks})
            # Change this to a update message telling the model to try again along with information about errors made.

            final_outputs = {
                "original_image_path": current_outputs["original_image_path"],
                "orig_img_h": current_outputs["orig_img_h"],
                "orig_img_w": current_outputs["orig_img_w"],
                "pred_boxes": [
                    current_outputs["pred_boxes"][i - 1] for i in masks_to_keep
                ],
                "pred_scores": [
                    current_outputs["pred_scores"][i - 1] for i in masks_to_keep
                ],
                "pred_masks": [
                    current_outputs["pred_masks"][i - 1] for i in masks_to_keep
                ],
            }

            rendered_final_output = visualize(final_outputs)
            messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": generated_text}],
                }
            )

            # Clean up debug files before successful return
            cleanup_debug_files(debug, debug_folder_path, debug_jsonl_path)
            return messages, final_outputs, rendered_final_output

        elif tool_call["name"] == "report_no_mask":
            print("🔍 Calling report_no_mask tool...")
            height, width = cv2.imread(img_path).shape[:2]
            final_outputs = {
                "original_image_path": img_path,
                "orig_img_h": height,
                "orig_img_w": width,
                "pred_boxes": [],
                "pred_scores": [],
                "pred_masks": [],
            }
            rendered_final_output = Image.open(img_path)
            messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": generated_text}],
                }
            )
            # FIX: this successful return now also clears debug artifacts,
            # matching the select_masks_and_return path.
            cleanup_debug_files(debug, debug_folder_path, debug_jsonl_path)
            return messages, final_outputs, rendered_final_output

        else:
            raise ValueError(f"Unknown tool call: {tool_call['name']}")

        # sometimes the MLLM don't know when to stop, and generates multiple tool calls in one round, so we need to split the generated text by </tool> and only keep the first one

        for message in messages:
            if message["role"] == "assistant" and "content" in message:
                for content in message["content"]:
                    if (
                        isinstance(content, dict)
                        and content.get("type") == "text"
                        and "text" in content
                    ):
                        content["text"] = (
                            content["text"].split("</tool>", 1)[0] + "</tool>\n\n"
                        )
        # Prune the messages history before the next MLLM generation round according to the 3-part rules.
        # This keeps history compact and ensures the model sees only the allowed parts.
        messages = _prune_messages_for_next_round(
            messages,
            USED_TEXT_PROMPTS,
            LATEST_SAM3_TEXT_PROMPT,
            img_path,
            initial_text_prompt,
        )
        # make sure there can never be more than 2 images in the context
        assert count_images(messages) <= 2
        generation_count += 1
        if generation_count > max_generations:
            raise ValueError(
                f"Exceeded maximum number of allowed generation requests ({max_generations})"
            )

        print("\n\n")
        print("-" * 30 + f" Round {str(generation_count + 1)}" + "-" * 30)
        print("\n\n")
        generated_text = send_generate_request(messages)
        print(
            f"\n>>> MLLM Response [start]\n{generated_text}\n<<< MLLM Response [end]\n"
        )

    print("\n\n>>> SAM 3 Agent execution ended.\n\n")

    error_save_path = os.path.join(
        error_save_dir,
        f"{img_path.rsplit('/', 1)[-1].rsplit('.', 1)[0]}_error_history.json",
    )
    with open(error_save_path, "w") as f:
        json.dump(messages, f, indent=4)
    print("Saved messages history that caused error to:", error_save_path)
    raise ValueError(
        rf"Generated text is None, which is unexpected. Please check the Qwen server and the input parameters for image path: {img_path} and initial text prompt: {initial_text_prompt}."
    )
third_party/sam3/sam3/agent/client_llm.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ # pyre-unsafe
4
+
5
+ import base64
6
+ import os
7
+ from typing import Any, Optional
8
+
9
+ from openai import OpenAI
10
+
11
+
12
def get_image_base64_and_mime(image_path):
    """Return (base64_string, mime_type) for the image at *image_path*.

    The MIME type is inferred from the file extension, defaulting to JPEG
    for unknown extensions. On any failure (unreadable or missing file) the
    error is printed and (None, None) is returned so callers can skip the
    image gracefully.
    """
    known_mime_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".bmp": "image/bmp",
    }
    try:
        extension = os.path.splitext(image_path)[1].lower()
        # Fall back to JPEG when the extension is not recognized.
        mime_type = known_mime_types.get(extension, "image/jpeg")

        with open(image_path, "rb") as image_file:
            raw_bytes = image_file.read()
        encoded = base64.b64encode(raw_bytes).decode("utf-8")
        return encoded, mime_type
    except Exception as e:
        print(f"Error converting image to base64: {e}")
        return None, None
34
+
35
+
36
def send_generate_request(
    messages,
    server_url=None,
    model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
    api_key=None,
    max_tokens=4096,
):
    """
    Send a chat-completion request to an OpenAI-compatible API endpoint.

    Any {"type": "image"} entries in user-message content are converted to
    base64 "image_url" entries before sending; images that cannot be read
    are skipped with a warning instead of failing the whole request.

    Args:
        messages (list): Message dicts (role/content) in OpenAI chat format.
        server_url (str, optional): Base URL of the server, e.g.
            "http://127.0.0.1:8000". None uses the OpenAI client default.
        model (str): Model name to use for generation.
        api_key (str, optional): API key for the endpoint; None falls back
            to the OpenAI client's default environment-based lookup.
        max_tokens (int): Maximum number of completion tokens to generate.

    Returns:
        str or None: The generated response text, or None when the request
        fails or the response contains no choices.
    """
    # Process messages to convert image paths to base64 image_url entries.
    processed_messages = []
    for message in messages:
        processed_message = message.copy()
        if message["role"] == "user" and "content" in message:
            processed_content = []
            for c in message["content"]:
                if isinstance(c, dict) and c.get("type") == "image":
                    image_path = c["image"]
                    print("image_path", image_path)
                    # Escape "?" in the path so it is not treated as a query.
                    new_image_path = image_path.replace("?", "%3F")

                    try:
                        base64_image, mime_type = get_image_base64_and_mime(
                            new_image_path
                        )
                        if base64_image is None:
                            print(
                                f"Warning: Could not convert image to base64: {new_image_path}"
                            )
                            continue

                        # Proper image_url structure carrying base64 data.
                        processed_content.append(
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}",
                                    "detail": "high",
                                },
                            }
                        )
                    except FileNotFoundError:
                        print(f"Warning: Image file not found: {new_image_path}")
                        continue
                    except Exception as e:
                        print(f"Warning: Error processing image {new_image_path}: {e}")
                        continue
                else:
                    processed_content.append(c)

            processed_message["content"] = processed_content
        processed_messages.append(processed_message)

    # Create an OpenAI client pointed at the (possibly custom) base URL.
    client = OpenAI(api_key=api_key, base_url=server_url)

    try:
        print(f"🔍 Calling model {model}...")
        response = client.chat.completions.create(
            model=model,
            messages=processed_messages,
            max_completion_tokens=max_tokens,
            n=1,
        )

        # Extract the response content; None signals an empty/odd response.
        if response.choices and len(response.choices) > 0:
            return response.choices[0].message.content
        else:
            print(f"Unexpected response format: {response}")
            return None

    except Exception as e:
        print(f"Request failed: {e}")
        return None
128
+
129
+
130
def send_direct_request(
    llm: Any,
    messages: list[dict[str, Any]],
    sampling_params: Any,
) -> Optional[str]:
    """
    Run inference directly on a vLLM model instance (no server round-trip).

    User-message image entries are converted to base64 "image_url" entries
    before being handed to the model; unreadable images are skipped with a
    warning.

    Args:
        llm: Initialized vLLM LLM instance (created by the caller)
        messages: Message dicts with role and content (OpenAI format)
        sampling_params: vLLM SamplingParams instance (created externally)

    Returns:
        str: Generated response text, or None if inference fails
    """
    try:

        def _as_image_url(entry):
            # Convert an {"type": "image"} entry to a base64 image_url
            # entry; return None when the image cannot be read.
            new_image_path = entry["image"].replace("?", "%3F")
            try:
                encoded, mime = get_image_base64_and_mime(new_image_path)
                if encoded is None:
                    print(f"Warning: Could not convert image: {new_image_path}")
                    return None
                # vLLM expects the OpenAI-style image_url structure.
                return {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime};base64,{encoded}"},
                }
            except Exception as e:
                print(f"Warning: Error processing image {new_image_path}: {e}")
                return None

        converted_messages = []
        for original in messages:
            rewritten = original.copy()
            if original["role"] == "user" and "content" in original:
                new_content = []
                for entry in original["content"]:
                    if isinstance(entry, dict) and entry.get("type") == "image":
                        replacement = _as_image_url(entry)
                        if replacement is not None:
                            new_content.append(replacement)
                    else:
                        new_content.append(entry)
                rewritten["content"] = new_content
            converted_messages.append(rewritten)

        print("🔍 Running direct inference with vLLM...")

        outputs = llm.chat(
            messages=converted_messages,
            sampling_params=sampling_params,
        )

        # First candidate of the first output is the generated text.
        if outputs:
            return outputs[0].outputs[0].text
        print(f"Unexpected output format: {outputs}")
        return None

    except Exception as e:
        print(f"Direct inference failed: {e}")
        return None
third_party/sam3/sam3/agent/client_sam3.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ # pyre-unsafe
4
+
5
+ import json
6
+ import os
7
+
8
+ import torch
9
+ from PIL import Image
10
+ from sam3.model.box_ops import box_xyxy_to_xywh
11
+ from sam3.train.masks_ops import rle_encode
12
+
13
+ from .helpers.mask_overlap_removal import remove_overlapping_masks
14
+ from .viz import visualize
15
+
16
+
17
def sam3_inference(processor, image_path, text_prompt):
    """Run SAM 3 image inference for a text prompt and package the outputs.

    Returns a dict with the original image size, xywh boxes normalized to
    [0, 1], RLE-encoded masks (their "counts" strings), and per-prediction
    scores.
    """
    image = Image.open(image_path)
    img_w, img_h = image.size

    # Model inference: register the image, then prompt with text.
    state = processor.set_image(image)
    state = processor.set_text_prompt(state=state, prompt=text_prompt)

    # Normalize xyxy box coordinates into [0, 1] by the image size.
    x0, y0, x1, y1 = state["boxes"].unbind(-1)
    boxes_xyxy = torch.stack(
        [x0 / img_w, y0 / img_h, x1 / img_w, y1 / img_h],
        dim=-1,
    )
    boxes_xywh = box_xyxy_to_xywh(boxes_xyxy).tolist()

    # RLE-encode the masks and keep only the compact "counts" strings.
    encoded_masks = [m["counts"] for m in rle_encode(state["masks"].squeeze(1))]

    return {
        "orig_img_h": img_h,
        "orig_img_w": img_w,
        "pred_boxes": boxes_xywh,
        "pred_masks": encoded_masks,
        "pred_scores": state["scores"].tolist(),
    }
49
+
50
+
51
def _sort_predictions_by_score(response):
    """Reorder pred_scores/pred_boxes/pred_masks in lockstep, highest score first.

    No-op when "pred_scores" is absent or empty. Mutates and returns *response*.
    """
    scores = response.get("pred_scores")
    if not scores:
        return response
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    for key in ("pred_scores", "pred_boxes", "pred_masks"):
        response[key] = [response[key][i] for i in order]
    return response


def _drop_invalid_masks(response):
    """Drop predictions whose RLE "counts" string is too short (<= 4 chars) to
    be a valid encoding, keeping masks/boxes/scores aligned. Mutates and
    returns *response*.
    """
    kept = [
        (mask, box, score)
        for mask, box, score in zip(
            response["pred_masks"], response["pred_boxes"], response["pred_scores"]
        )
        if len(mask) > 4
    ]
    response["pred_masks"] = [m for m, _, _ in kept]
    response["pred_boxes"] = [b for _, b, _ in kept]
    response["pred_scores"] = [s for _, _, s in kept]
    return response


def call_sam_service(
    sam3_processor,
    image_path: str,
    text_prompt: str,
    output_folder_path: str = "sam3_output",
):
    """
    Run SAM 3 on an image with a text prompt and persist the results.

    The raw predictions are de-overlapped, sorted by score, filtered of
    degenerate masks, saved as JSON, and rendered as a visualization image.

    Args:
        sam3_processor: Initialized SAM 3 processor used for inference.
        image_path: Path to the input image.
        text_prompt: Text prompt describing what to segment.
        output_folder_path: Root folder for the JSON/PNG outputs.

    Returns:
        str: Path of the output JSON file. NOTE: this path is returned even
        when inference fails (the error is only printed), so callers should
        verify the file exists before reading it.
    """
    print(f"📞 Loading image '{image_path}' and sending with prompt '{text_prompt}'...")

    # "/" in the prompt would otherwise create nested directories.
    text_prompt_for_save_path = (
        text_prompt.replace("/", "_") if "/" in text_prompt else text_prompt
    )

    os.makedirs(
        os.path.join(output_folder_path, image_path.replace("/", "-")), exist_ok=True
    )
    output_json_path = os.path.join(
        output_folder_path,
        image_path.replace("/", "-"),
        rf"{text_prompt_for_save_path}.json",
    )
    output_image_path = os.path.join(
        output_folder_path,
        image_path.replace("/", "-"),
        rf"{text_prompt_for_save_path}.png",
    )

    try:
        # 1. Run inference and remove overlapping duplicate masks.
        serialized_response = sam3_inference(sam3_processor, image_path, text_prompt)
        serialized_response = remove_overlapping_masks(serialized_response)
        serialized_response = {
            "original_image_path": image_path,
            "output_image_path": output_image_path,
            **serialized_response,
        }

        # 2. Reorder predictions by score, highest to lowest.
        serialized_response = _sort_predictions_by_score(serialized_response)

        # 3. Remove invalid RLE masks that are too short to decode.
        serialized_response = _drop_invalid_masks(serialized_response)

        with open(output_json_path, "w") as f:
            json.dump(serialized_response, f, indent=4)
        print(f"✅ Raw JSON response saved to '{output_json_path}'")

        # 4. Render and save the visualization next to the JSON output.
        print("🔍 Rendering visualizations on the image ...")
        viz_image = visualize(serialized_response)
        os.makedirs(os.path.dirname(output_image_path), exist_ok=True)
        viz_image.save(output_image_path)
        print("✅ Saved visualization at:", output_image_path)
    except Exception as e:
        print(f"❌ Error calling service: {e}")

    return output_json_path
third_party/sam3/sam3/agent/helpers/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
2
+
3
+ # pyre-unsafe