Clara211111 committed on
Commit
1efbda0
·
1 Parent(s): 5c8071a

upload demo

Browse files
Files changed (37) hide show
  1. app.py +725 -4
  2. flow3r/models/dinov2/__init__.py +6 -0
  3. flow3r/models/dinov2/hub/__init__.py +4 -0
  4. flow3r/models/dinov2/hub/backbones.py +156 -0
  5. flow3r/models/dinov2/hub/utils.py +39 -0
  6. flow3r/models/dinov2/layers/__init__.py +11 -0
  7. flow3r/models/dinov2/layers/attention.py +89 -0
  8. flow3r/models/dinov2/layers/block.py +259 -0
  9. flow3r/models/dinov2/layers/dino_head.py +58 -0
  10. flow3r/models/dinov2/layers/drop_path.py +34 -0
  11. flow3r/models/dinov2/layers/layer_scale.py +27 -0
  12. flow3r/models/dinov2/layers/mlp.py +40 -0
  13. flow3r/models/dinov2/layers/patch_embed.py +88 -0
  14. flow3r/models/dinov2/layers/swiglu_ffn.py +72 -0
  15. flow3r/models/dinov2/models/__init__.py +43 -0
  16. flow3r/models/dinov2/models/vision_transformer.py +404 -0
  17. flow3r/models/dinov2/utils/__init__.py +4 -0
  18. flow3r/models/dinov2/utils/cluster.py +95 -0
  19. flow3r/models/dinov2/utils/config.py +72 -0
  20. flow3r/models/dinov2/utils/dtype.py +37 -0
  21. flow3r/models/dinov2/utils/param_groups.py +103 -0
  22. flow3r/models/dinov2/utils/utils.py +95 -0
  23. flow3r/models/flow3r.py +233 -0
  24. flow3r/models/flow_head/dpt_head.py +498 -0
  25. flow3r/models/flow_head/utils.py +108 -0
  26. flow3r/models/layers/attention.py +403 -0
  27. flow3r/models/layers/block.py +406 -0
  28. flow3r/models/layers/camera_head.py +93 -0
  29. flow3r/models/layers/pos_embed.py +174 -0
  30. flow3r/models/layers/transformer_head.py +389 -0
  31. flow3r/utils/alignment.py +499 -0
  32. flow3r/utils/basic.py +223 -0
  33. flow3r/utils/cropping.py +197 -0
  34. flow3r/utils/debug.py +63 -0
  35. flow3r/utils/flow_utils.py +472 -0
  36. flow3r/utils/geometry.py +367 -0
  37. requirements.txt +15 -0
app.py CHANGED
@@ -1,8 +1,729 @@
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ import cv2
9
+ import torch
10
+ import numpy as np
11
  import gradio as gr
12
+ import sys
13
+ import shutil
14
+ from datetime import datetime
15
+ import glob
16
+ import gc
17
+ import time
18
+ import trimesh
19
+ import matplotlib
20
+
21
+ from flow3r.models.flow3r import Flow3r
22
+ from flow3r.utils.basic import load_images_as_tensor
23
+ from flow3r.utils.geometry import depth_edge
24
+
25
+ from scipy.spatial.transform import Rotation
26
+ from huggingface_hub import hf_hub_download
27
+
28
# Select GPU when available; note run_model() below hard-requires CUDA anyway.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Initializing and loading Flow3r model...")

# Download the released checkpoint from the Hugging Face Hub and load it in full.
model = Flow3r()
ckpt_path = hf_hub_download(repo_id="Clara211111/flow3r", filename="flow3r.bin")
# NOTE(review): weights_only=False unpickles arbitrary objects — acceptable for
# this first-party checkpoint, but never point this at untrusted files.
checkpoint = torch.load(ckpt_path, weights_only=False, map_location='cpu')
model.load_state_dict(checkpoint, strict=True)

model.eval()
model = model.to(device)
39
+
40
+ # -------------------------------------------------------------------------
41
+ # Utils
42
+ # -------------------------------------------------------------------------
43
def predictions_to_glb(
    predictions,
    conf_thres=50.0,
    filter_by_frames="all",
    show_cam=True,
) -> trimesh.Scene:
    """
    Converts predictions to a 3D scene represented as a GLB file.

    Args:
        predictions (dict): Dictionary containing model predictions with keys:
            - points: 3D point coordinates (S, H, W, 3)
            - conf: Confidence scores (S, H, W); defaults to all-ones when absent
            - images: Input images (S, H, W, 3) or (S, 3, H, W)
            - camera_poses: Camera-to-world matrices, one per frame
        conf_thres (float): Percentage threshold for filtering low-confidence
            points (default: 50.0); interpreted as conf_thres/100 below.
        filter_by_frames (str): Frame filter specification, either "all"/"All"
            or an "index: filename" string (default: "all")
        show_cam (bool): Include camera visualization (default: True)

    Returns:
        trimesh.Scene: Processed 3D scene containing point cloud and cameras

    Raises:
        ValueError: If input predictions structure is invalid
    """
    if not isinstance(predictions, dict):
        raise ValueError("predictions must be a dictionary")

    if conf_thres is None:
        conf_thres = 10

    print("Building GLB scene")
    selected_frame_idx = None
    if filter_by_frames != "all" and filter_by_frames != "All":
        try:
            # Dropdown entries look like "12: name" — extract the index part before the colon
            selected_frame_idx = int(filter_by_frames.split(":")[0])
        except (ValueError, IndexError):
            pass

    pred_world_points = predictions["points"]
    # Missing confidence → treat every point as fully confident
    pred_world_points_conf = predictions.get("conf", np.ones_like(pred_world_points[..., 0]))

    images = predictions["images"]
    camera_poses = predictions["camera_poses"]

    # Restrict everything to a single frame when the user picked one
    if selected_frame_idx is not None:
        pred_world_points = pred_world_points[selected_frame_idx][None]
        pred_world_points_conf = pred_world_points_conf[selected_frame_idx][None]
        images = images[selected_frame_idx][None]
        camera_poses = camera_poses[selected_frame_idx][None]

    vertices_3d = pred_world_points.reshape(-1, 3)
    # Handle different image formats - check if images need transposing
    if images.ndim == 4 and images.shape[1] == 3:  # NCHW format
        colors_rgb = np.transpose(images, (0, 2, 3, 1))
    else:  # Assume already in NHWC format
        colors_rgb = images
    # assumes pixel values are in [0, 1] — TODO confirm against load_images_as_tensor
    colors_rgb = (colors_rgb.reshape(-1, 3) * 255).astype(np.uint8)

    conf = pred_world_points_conf.reshape(-1)
    # Convert percentage threshold to an absolute confidence value
    if conf_thres == 0.0:
        conf_threshold = 0.0
    else:
        # conf_threshold = np.percentile(conf, conf_thres)
        conf_threshold = conf_thres / 100

    # Also drop (near-)zero confidences, e.g. points zeroed on depth edges
    conf_mask = (conf >= conf_threshold) & (conf > 1e-5)

    vertices_3d = vertices_3d[conf_mask]
    colors_rgb = colors_rgb[conf_mask]

    if vertices_3d is None or np.asarray(vertices_3d).size == 0:
        # Degenerate case: keep the scene non-empty with a single white point
        vertices_3d = np.array([[1, 0, 0]])
        colors_rgb = np.array([[255, 255, 255]])
        scene_scale = 1
    else:
        # Calculate the 5th and 95th percentiles along each axis to get a
        # robust bounding box that ignores outlier points
        lower_percentile = np.percentile(vertices_3d, 5, axis=0)
        upper_percentile = np.percentile(vertices_3d, 95, axis=0)

        # Diagonal length of the percentile bounding box
        scene_scale = np.linalg.norm(upper_percentile - lower_percentile)

    colormap = matplotlib.colormaps.get_cmap("gist_rainbow")

    # Initialize a 3D scene
    scene_3d = trimesh.Scene()

    # Add point cloud data to the scene
    point_cloud_data = trimesh.PointCloud(vertices=vertices_3d, colors=colors_rgb)

    scene_3d.add_geometry(point_cloud_data)

    num_cameras = len(camera_poses)

    if show_cam:
        # Add one camera frustum mesh per frame, rainbow-colored by frame index
        for i in range(num_cameras):
            camera_to_world = camera_poses[i]
            rgba_color = colormap(i / num_cameras)
            current_color = tuple(int(255 * x) for x in rgba_color[:3])

            # integrate_camera_into_scene(scene_3d, camera_to_world, current_color, scene_scale)
            integrate_camera_into_scene(scene_3d, camera_to_world, current_color, 1.)  # fixed camera size

    # Rotate scene for better visualization in the viewer
    align_rotation = np.eye(4)
    align_rotation[:3, :3] = Rotation.from_euler("y", 100, degrees=True).as_matrix()  # plane rotate
    align_rotation[:3, :3] = align_rotation[:3, :3] @ Rotation.from_euler("x", 155, degrees=True).as_matrix()  # roll
    scene_3d.apply_transform(align_rotation)

    print("GLB Scene built")
    return scene_3d
161
+
162
def get_opengl_conversion_matrix() -> np.ndarray:
    """
    Constructs and returns the OpenGL conversion matrix.

    OpenGL cameras look down -Z with +Y up, so the y and z axes are negated.

    Returns:
        numpy.ndarray: A 4x4 OpenGL conversion matrix.
    """
    return np.diag([1.0, -1.0, -1.0, 1.0])
177
+
178
def integrate_camera_into_scene(scene: trimesh.Scene, transform: np.ndarray, face_colors: tuple, scene_scale: float):
    """
    Integrates a fake camera mesh into the 3D scene.

    The camera is rendered as a 4-sided cone (a pyramid-shaped frustum) built
    from three slightly offset copies of the same cone, which gives the mesh
    visible thickness.

    Args:
        scene (trimesh.Scene): The 3D scene to add the camera model.
        transform (np.ndarray): 4x4 camera-to-world matrix for positioning.
        face_colors (tuple): RGB color of the camera faces.
        scene_scale (float): Scale of the scene; sizes the frustum.
    """

    cam_width = scene_scale * 0.05
    cam_height = scene_scale * 0.1

    # Rotate the cone 45° about z so the 4-sided base reads as a camera frame,
    # and shift it back so the apex sits at the camera origin
    rot_45_degree = np.eye(4)
    rot_45_degree[:3, :3] = Rotation.from_euler("z", 45, degrees=True).as_matrix()
    rot_45_degree[2, 3] = -cam_height

    opengl_transform = get_opengl_conversion_matrix()
    # Combine transformations: pose ∘ axis-convention flip ∘ cone alignment
    complete_transform = transform @ opengl_transform @ rot_45_degree
    camera_cone_shape = trimesh.creation.cone(cam_width, cam_height, sections=4)

    # A 2° twist used for the third vertex copy below
    slight_rotation = np.eye(4)
    slight_rotation[:3, :3] = Rotation.from_euler("z", 2, degrees=True).as_matrix()

    # Three vertex sets: original, 95%-scaled, and slightly rotated — the faces
    # from compute_camera_faces() stitch them into a thin shell
    vertices_combined = np.concatenate(
        [
            camera_cone_shape.vertices,
            0.95 * camera_cone_shape.vertices,
            transform_points(slight_rotation, camera_cone_shape.vertices),
        ]
    )
    vertices_transformed = transform_points(complete_transform, vertices_combined)

    mesh_faces = compute_camera_faces(camera_cone_shape)

    # Add the camera mesh to the scene
    camera_mesh = trimesh.Trimesh(vertices=vertices_transformed, faces=mesh_faces)
    camera_mesh.visual.face_colors[:, :3] = face_colors
    scene.add_geometry(camera_mesh)
221
+
222
def transform_points(transformation: np.ndarray, points: np.ndarray, dim: int = None) -> np.ndarray:
    """
    Applies a 4x4 (or general homogeneous) transformation to a set of points.

    Args:
        transformation (np.ndarray): Transformation matrix.
        points (np.ndarray): Points to be transformed, shape (..., D).
        dim (int, optional): Output dimension; defaults to the input dimension.

    Returns:
        np.ndarray: Transformed points with the same leading shape.
    """
    pts = np.asarray(points)
    leading_shape = pts.shape[:-1]
    out_dim = dim or pts.shape[-1]

    # Transpose so we can right-multiply: rows of mat_t are columns of the matrix
    mat_t = transformation.swapaxes(-1, -2)
    linear, translation = mat_t[..., :-1, :], mat_t[..., -1:, :]
    transformed = pts @ linear + translation

    # Keep only the first out_dim coordinates and restore the leading shape
    return transformed[..., :out_dim].reshape(*leading_shape, out_dim)
245
+
246
def compute_camera_faces(cone_shape: trimesh.Trimesh) -> np.ndarray:
    """
    Computes the faces for the camera mesh.

    Stitches triangles across the three stacked vertex copies produced by
    integrate_camera_into_scene (offsets 0, n and 2n into the vertex array),
    skipping faces that touch the cone apex (vertex 0).

    Args:
        cone_shape (trimesh.Trimesh): The shape of the camera cone.

    Returns:
        np.ndarray: Array of faces for the camera mesh.
    """
    n = len(cone_shape.vertices)
    triangles = []

    for face in cone_shape.faces:
        # Apex-adjacent faces are left open
        if 0 in face:
            continue
        v1, v2, v3 = face
        o1, o2, o3 = face + n
        p1, p2, p3 = face + 2 * n

        triangles.extend(
            [
                (v1, v2, o2),
                (v1, o1, v3),
                (o3, v2, v3),
                (v1, v2, p2),
                (v1, p1, v3),
                (p3, v2, v3),
            ]
        )

    # Append reversed windings so the shell is visible from both sides
    triangles += [(c, b, a) for a, b, c in triangles]
    return np.array(triangles)
280
+
281
+ # -------------------------------------------------------------------------
282
+ # 1) Core model inference
283
+ # -------------------------------------------------------------------------
284
def run_model(target_dir, model) -> dict:
    """
    Run Flow3r inference on every image under ``target_dir/images``.

    Args:
        target_dir (str): Session directory containing an ``images`` subfolder.
        model: The loaded Flow3r model.

    Returns:
        dict: Model predictions with tensors converted to numpy arrays and the
        batch dimension removed. Contains at least ``images``, ``points``,
        ``conf`` and ``camera_poses``.

    Raises:
        ValueError: If CUDA is unavailable or no images are found.
    """
    print(f"Processing images from {target_dir}")

    # Inference is CUDA-only (the autocast/bf16 path below requires it), so the
    # old `device = "cuda" if ... else "cpu"` fallback was dead code — fail fast.
    if not torch.cuda.is_available():
        raise ValueError("CUDA is not available. Check your environment.")
    device = "cuda"

    model = model.to(device)
    model.eval()

    # Load and preprocess images
    image_names = sorted(glob.glob(os.path.join(target_dir, "images", "*")))
    print(f"Found {len(image_names)} images")
    if len(image_names) == 0:
        raise ValueError("No images found. Check your upload.")

    # interval = 10 if target_dir.endswith('.mp4') else 1
    interval = 1
    imgs = load_images_as_tensor(os.path.join(target_dir, "images"), interval=interval).to(device)  # (N, 3, H, W)

    # Run inference
    print("Running inference...")
    # bf16 needs compute capability >= 8 (Ampere or newer); otherwise use fp16
    dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16

    with torch.no_grad():
        # torch.amp.autocast replaces the deprecated torch.cuda.amp.autocast
        with torch.amp.autocast("cuda", dtype=dtype):
            predictions = model(imgs[None])  # add batch dimension
            predictions['images'] = imgs[None].permute(0, 1, 3, 4, 2)  # NCHW -> NHWC for visualization
            predictions['conf'] = torch.sigmoid(predictions['conf'])
            # Zero confidence on depth discontinuities so edge points get filtered out
            edge = depth_edge(predictions['local_points'][..., 2], rtol=0.03)
            predictions['conf'][edge] = 0.0
            del predictions['local_points']

    # Convert tensors to numpy and drop the batch dimension
    for key, value in predictions.items():
        if isinstance(value, torch.Tensor):
            predictions[key] = value.cpu().numpy().squeeze(0)

    # Clean up
    torch.cuda.empty_cache()
    return predictions
328
+
329
+
330
+ # -------------------------------------------------------------------------
331
+ # 2) Handle uploaded video/images --> produce target_dir + images
332
+ # -------------------------------------------------------------------------
333
def handle_uploads(input_video, input_images):
    """
    Create a new 'target_dir' + 'images' subfolder, and place user-uploaded
    images or extracted frames from video into it. Return (target_dir, image_paths).
    """
    start_time = time.time()
    gc.collect()
    torch.cuda.empty_cache()

    # Unique, timestamped working directory per upload session
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    target_dir = f"input_images_{timestamp}"
    target_dir_images = os.path.join(target_dir, "images")

    # Clean up if somehow that folder already exists
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)
    os.makedirs(target_dir_images)

    image_paths = []

    # --- Handle images ---
    if input_images is not None:
        for file_data in input_images:
            # Gradio may hand back dicts ({"name": path}) or plain path strings
            if isinstance(file_data, dict) and "name" in file_data:
                file_path = file_data["name"]
            else:
                file_path = file_data
            dst_path = os.path.join(target_dir_images, os.path.basename(file_path))
            shutil.copy(file_path, dst_path)
            image_paths.append(dst_path)

    # --- Handle video ---
    if input_video is not None:
        if isinstance(input_video, dict) and "name" in input_video:
            video_path = input_video["name"]
        else:
            video_path = input_video

        vs = cv2.VideoCapture(video_path)
        try:
            fps = vs.get(cv2.CAP_PROP_FPS)
            # Sample ~1 frame/sec. BUGFIX: videos with missing/zero FPS metadata
            # previously produced frame_interval == 0 and crashed on `count % 0`.
            frame_interval = max(int(fps), 1) if fps and fps > 0 else 1

            count = 0
            video_frame_num = 0
            while True:
                gotit, frame = vs.read()
                if not gotit:
                    break
                count += 1
                if count % frame_interval == 0:
                    image_path = os.path.join(target_dir_images, f"{video_frame_num:06}.png")
                    cv2.imwrite(image_path, frame)
                    image_paths.append(image_path)
                    video_frame_num += 1
        finally:
            # BUGFIX: release the capture handle (was leaked on every upload)
            vs.release()

    # Sort final images for gallery
    image_paths = sorted(image_paths)

    end_time = time.time()
    print(f"Files copied to {target_dir_images}; took {end_time - start_time:.3f} seconds")
    return target_dir, image_paths
396
+
397
+
398
+ # -------------------------------------------------------------------------
399
+ # 3) Update gallery on upload
400
+ # -------------------------------------------------------------------------
401
def update_gallery_on_upload(input_video, input_images):
    """
    Whenever user uploads or changes files, immediately handle them
    and show in the gallery. Return (target_dir, image_paths).
    If nothing is uploaded, returns "None" and empty list.
    """
    # Nothing uploaded yet: leave viewer, target dir, gallery and log untouched
    if not (input_video or input_images):
        return None, None, None, None

    target_dir, image_paths = handle_uploads(input_video, input_images)
    return None, target_dir, image_paths, "Upload complete. Click 'Reconstruct' to begin 3D processing."
411
+
412
+
413
+ # -------------------------------------------------------------------------
414
+ # 4) Reconstruction: uses the target_dir plus any viz parameters
415
+ # -------------------------------------------------------------------------
416
def gradio_demo(
    target_dir,
    conf_thres=3.0,
    frame_filter="All",
    show_cam=True,
):
    """
    Perform reconstruction using the already-created target_dir/images.

    Returns a 3-tuple of (glb_path_or_None, log_message, frame_filter_update),
    matching the three Gradio output components wired to this callback.
    """
    if not os.path.isdir(target_dir) or target_dir == "None":
        # BUGFIX: this branch previously returned 4 values while the success
        # path (and the 3 wired outputs) return 3, making Gradio error out.
        return None, "No valid target directory found. Please upload first.", None

    start_time = time.time()
    gc.collect()
    torch.cuda.empty_cache()

    # Populate the frame-filter dropdown with "index: filename" entries; the
    # numeric prefix is what predictions_to_glb parses back out.
    # BUGFIX: the label previously hard-coded "(unknown)" and ignored `filename`.
    target_dir_images = os.path.join(target_dir, "images")
    all_files = sorted(os.listdir(target_dir_images)) if os.path.isdir(target_dir_images) else []
    all_files = [f"{i}: {filename}" for i, filename in enumerate(all_files)]
    frame_filter_choices = ["All"] + all_files

    print("Running run_model...")
    with torch.no_grad():
        predictions = run_model(target_dir, model)

    # Save predictions so later parameter tweaks re-render without re-running the model
    prediction_save_path = os.path.join(target_dir, "predictions.npz")
    np.savez(prediction_save_path, **predictions)

    # Handle None frame_filter
    if frame_filter is None:
        frame_filter = "All"

    # One cached GLB per (confidence, frame filter, camera toggle) combination
    glbfile = os.path.join(
        target_dir,
        f"glbscene_{conf_thres}_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}.glb",
    )

    # Convert predictions to GLB
    glbscene = predictions_to_glb(
        predictions,
        conf_thres=conf_thres,
        filter_by_frames=frame_filter,
        show_cam=show_cam,
    )
    glbscene.export(file_obj=glbfile)

    # Cleanup
    del predictions
    gc.collect()
    torch.cuda.empty_cache()

    end_time = time.time()
    print(f"Total time: {end_time - start_time:.2f} seconds (including IO)")
    log_msg = f"Reconstruction Success ({len(all_files)} frames). Waiting for visualization."

    return glbfile, log_msg, gr.Dropdown(choices=frame_filter_choices, value=frame_filter, interactive=True)
476
+
477
+
478
+ # -------------------------------------------------------------------------
479
+ # 5) Helper functions for UI resets + re-visualization
480
+ # -------------------------------------------------------------------------
481
def clear_fields():
    """Reset the 3D viewer component before a new reconstruction starts."""
    return None
486
+
487
+
488
def update_log():
    """Show a quick progress message while reconstruction is running."""
    return "Loading and Reconstructing..."
493
+
494
+
495
def update_visualization(
    target_dir, conf_thres, frame_filter, show_cam, is_example
):
    """
    Reload saved predictions from npz, create (or reuse) the GLB for the new
    parameters, and return it for the 3D viewer. If is_example == "True", skip.
    """
    no_recon_msg = "No reconstruction available. Please click the Reconstruct button first."

    # Example clicks are display-only; never rebuild for them
    if is_example == "True":
        return None, no_recon_msg

    if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
        return None, no_recon_msg

    predictions_path = os.path.join(target_dir, "predictions.npz")
    if not os.path.exists(predictions_path):
        return None, f"No reconstruction available at {predictions_path}. Please run 'Reconstruct' first."

    loaded = np.load(predictions_path)
    predictions = {
        name: np.array(loaded[name])
        for name in ("images", "points", "conf", "camera_poses")
    }

    glbfile = os.path.join(
        target_dir,
        f"glbscene_{conf_thres}_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}.glb",
    )

    # Reuse a previously exported GLB when this parameter combination was seen
    if not os.path.exists(glbfile):
        scene = predictions_to_glb(
            predictions,
            conf_thres=conf_thres,
            filter_by_frames=frame_filter,
            show_cam=show_cam,
        )
        scene.export(file_obj=glbfile)

    return glbfile, "Updating Visualization"
540
+
541
+
542
+ # -------------------------------------------------------------------------
543
+ # Example images
544
+ # -------------------------------------------------------------------------
545
+
546
# Example assets referenced by the demo (paths relative to the app root)
great_wall_video = "examples/videos/great_wall.mp4"
colosseum_video = "examples/videos/Colosseum.mp4"
room_video = "examples/videos/room.mp4"
kitchen_video = "examples/videos/kitchen.mp4"
fern_video = "examples/videos/fern.mp4"
single_cartoon_video = "examples/videos/single_cartoon.mp4"
single_oil_painting_video = "examples/videos/single_oil_painting.mp4"
pyramid_video = "examples/videos/pyramid.mp4"


# -------------------------------------------------------------------------
# 6) Build Gradio UI
# -------------------------------------------------------------------------
theme = gr.themes.Ocean()
theme.set(
    checkbox_label_background_fill_selected="*button_primary_background_fill",
    checkbox_label_text_color_selected="*button_primary_text_color",
)

with gr.Blocks(
    theme=theme,
    css="""
    .custom-log * {
    font-style: italic;
    font-size: 22px !important;
    background-image: linear-gradient(120deg, #0ea5e9 0%, #6ee7b7 60%, #34d399 100%);
    -webkit-background-clip: text;
    background-clip: text;
    font-weight: bold !important;
    color: transparent !important;
    text-align: center !important;
    }

    .example-log * {
    font-style: italic;
    font-size: 16px !important;
    background-image: linear-gradient(120deg, #0ea5e9 0%, #6ee7b7 60%, #34d399 100%);
    -webkit-background-clip: text;
    background-clip: text;
    color: transparent !important;
    }

    #my_radio .wrap {
    display: flex;
    flex-wrap: nowrap;
    justify-content: center;
    align-items: center;
    }

    #my_radio .wrap label {
    display: flex;
    width: 50%;
    justify-content: center;
    align-items: center;
    margin: 0;
    padding: 10px 0;
    box-sizing: border-box;
    }
    """,
) as demo:
    # Instead of gr.State, we use a hidden Textbox:
    is_example = gr.Textbox(label="is_example", visible=False, value="None")
    # NOTE(review): num_images is declared but never wired to any event below —
    # confirm whether it can be removed.
    num_images = gr.Textbox(label="num_images", visible=False, value="None")

    gr.HTML(
        """
        <h1>Flow3r: Factored Flow Prediction for Visual Geometry Learning</h1>
        <p>
        <a href="https://github.com/Kidrauh/flow3r">GitHub Repository</a> |
        <a href="https://flow3r-project.github.io/">Project Page</a>
        </p>

        <div style="font-size: 16px; line-height: 1.5;">
        <p>Upload a video or a set of images to create a 3D reconstruction of a scene or object. Flow3r takes these images and generates a 3D point cloud, along with estimated camera poses.</p>

        </div>
        """
    )

    # Hidden textbox carrying the per-session upload directory created by handle_uploads()
    target_dir_output = gr.Textbox(label="Target Dir", visible=False, value="None")

    with gr.Row():
        with gr.Column(scale=2):
            input_video = gr.Video(label="Upload Video", interactive=True)
            input_images = gr.File(file_count="multiple", label="Upload Images", interactive=True)

            image_gallery = gr.Gallery(
                label="Preview",
                columns=4,
                height="300px",
                # show_download_button=True,
                object_fit="contain",
                preview=True,
            )

        with gr.Column(scale=4):
            with gr.Column():
                gr.Markdown("**3D Reconstruction (Point Cloud and Camera Poses)**")
                log_output = gr.Markdown(
                    "Please upload a video or images, then click Reconstruct.", elem_classes=["custom-log"]
                )
                reconstruction_output = gr.Model3D(height=520, zoom_speed=0.5, pan_speed=0.5)

            with gr.Row():
                submit_btn = gr.Button("Reconstruct", scale=1, variant="primary")
                clear_btn = gr.ClearButton(
                    [input_video, input_images, reconstruction_output, log_output, target_dir_output, image_gallery],
                    scale=1,
                )

            with gr.Row():
                conf_thres = gr.Slider(minimum=0, maximum=100, value=0, step=0.1, label="Confidence Threshold (%)")
                frame_filter = gr.Dropdown(choices=["All"], value="All", label="Show Points from Frame")
                with gr.Column():
                    show_cam = gr.Checkbox(label="Show Camera", value=True)

    # Reconstruct pipeline: clear viewer -> show progress log -> run model ->
    # flip is_example to "False" so the sliders below re-render live.
    submit_btn.click(fn=clear_fields, inputs=[], outputs=[reconstruction_output]).then(
        fn=update_log, inputs=[], outputs=[log_output]
    ).then(
        fn=gradio_demo,
        inputs=[
            target_dir_output,
            conf_thres,
            frame_filter,
            show_cam,
        ],
        outputs=[reconstruction_output, log_output, frame_filter],
    ).then(
        fn=lambda: "False", inputs=[], outputs=[is_example]  # set is_example to "False"
    )

    # -------------------------------------------------------------------------
    # Real-time Visualization Updates
    # -------------------------------------------------------------------------
    conf_thres.change(
        update_visualization,
        [
            target_dir_output,
            conf_thres,
            frame_filter,
            show_cam,
            is_example,
        ],
        [reconstruction_output, log_output],
    )
    frame_filter.change(
        update_visualization,
        [
            target_dir_output,
            conf_thres,
            frame_filter,
            show_cam,
            is_example,
        ],
        [reconstruction_output, log_output],
    )

    show_cam.change(
        update_visualization,
        [
            target_dir_output,
            conf_thres,
            frame_filter,
            show_cam,
            is_example,
        ],
        [reconstruction_output, log_output],
    )
    # -------------------------------------------------------------------------
    # Auto-update gallery whenever user uploads or changes their files
    # -------------------------------------------------------------------------
    input_video.change(
        fn=update_gallery_on_upload,
        inputs=[input_video, input_images],
        outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
    )
    input_images.change(
        fn=update_gallery_on_upload,
        inputs=[input_video, input_images],
        outputs=[reconstruction_output, target_dir_output, image_gallery, log_output],
    )

# queue() serializes requests so concurrent users don't contend for the model/GPU
demo.queue(max_size=20).launch(show_error=True, share=True)
flow3r/models/dinov2/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ __version__ = "0.0.1"
flow3r/models/dinov2/hub/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
flow3r/models/dinov2/hub/backbones.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from enum import Enum
7
+ from typing import Union
8
+
9
+ import torch
10
+
11
+ from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name
12
+
13
+
14
class Weights(Enum):
    # Pretraining dataset tags of the released DINOv2 checkpoints
    LVD142M = "LVD142M"


def _make_dinov2_model(
    *,
    arch_name: str = "vit_large",
    img_size: int = 518,
    patch_size: int = 14,
    init_values: float = 1.0,
    ffn_layer: str = "mlp",
    block_chunks: int = 0,
    num_register_tokens: int = 0,
    interpolate_antialias: bool = False,
    interpolate_offset: float = 0.1,
    pretrained: bool = True,
    weights: Union[Weights, str] = Weights.LVD142M,
    **kwargs,
):
    """
    Build a DINOv2 vision transformer and optionally load pretrained weights.

    Keyword Args:
        arch_name: Constructor name looked up in models.vision_transformer
            (e.g. "vit_small", "vit_large", "vit_giant2").
        img_size / patch_size: Input resolution and ViT patch size.
        init_values, ffn_layer, block_chunks, num_register_tokens,
        interpolate_antialias, interpolate_offset: Forwarded to the ViT.
        pretrained: Download and load the matching checkpoint when True.
        weights: A Weights enum member or its name as a string.
        **kwargs: Extra overrides forwarded to the ViT constructor.

    Returns:
        The constructed (and optionally pretrained) torch.nn.Module.
    """
    # Imported lazily to avoid a circular import at package load time
    from ..models import vision_transformer as vits

    if isinstance(weights, str):
        try:
            weights = Weights[weights]
        except KeyError:
            raise AssertionError(f"Unsupported weights: {weights}")

    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
    vit_kwargs = dict(
        img_size=img_size,
        patch_size=patch_size,
        init_values=init_values,
        ffn_layer=ffn_layer,
        block_chunks=block_chunks,
        num_register_tokens=num_register_tokens,
        interpolate_antialias=interpolate_antialias,
        interpolate_offset=interpolate_offset,
    )
    vit_kwargs.update(**kwargs)
    model = vits.__dict__[arch_name](**vit_kwargs)

    if pretrained:
        # The register-token count is part of the checkpoint file name
        model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
        url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
        state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
        model.load_state_dict(state_dict, strict=True)

    return model
62
+
63
+
64
# Public torch.hub entry points — thin wrappers around _make_dinov2_model that
# pin the architecture (and, for *_reg variants, the register-token settings).
def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_small", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_base", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(arch_name="vit_large", pretrained=pretrained, weights=weights, **kwargs)


def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset.
    """
    # The giant model uses the fused SwiGLU feed-forward layer
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        **kwargs,
    )


def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-S/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_small",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-B/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_base",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-L/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_large",
        pretrained=pretrained,
        weights=weights,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )


def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
    """
    DINOv2 ViT-g/14 model with registers (optionally) pretrained on the LVD-142M dataset.
    """
    return _make_dinov2_model(
        arch_name="vit_giant2",
        ffn_layer="swiglufused",
        weights=weights,
        pretrained=pretrained,
        num_register_tokens=4,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        **kwargs,
    )
flow3r/models/dinov2/hub/utils.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import itertools
7
+ import math
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ _DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
15
+
16
+
17
+ def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
18
+ compact_arch_name = arch_name.replace("_", "")[:4]
19
+ registers_suffix = f"_reg{num_register_tokens}" if num_register_tokens else ""
20
+ return f"dinov2_{compact_arch_name}{patch_size}{registers_suffix}"
21
+
22
+
23
+ class CenterPadding(nn.Module):
24
+ def __init__(self, multiple):
25
+ super().__init__()
26
+ self.multiple = multiple
27
+
28
+ def _get_pad(self, size):
29
+ new_size = math.ceil(size / self.multiple) * self.multiple
30
+ pad_size = new_size - size
31
+ pad_size_left = pad_size // 2
32
+ pad_size_right = pad_size - pad_size_left
33
+ return pad_size_left, pad_size_right
34
+
35
+ @torch.inference_mode()
36
+ def forward(self, x):
37
+ pads = list(itertools.chain.from_iterable(self._get_pad(m) for m in x.shape[:1:-1]))
38
+ output = F.pad(x, pads)
39
+ return output
flow3r/models/dinov2/layers/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from .dino_head import DINOHead
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
flow3r/models/dinov2/layers/attention.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ import logging
11
+ import os
12
+ import warnings
13
+
14
+ from torch import Tensor
15
+ from torch import nn
16
+
17
+
18
+ logger = logging.getLogger("dinov2")
19
+
20
+
21
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
22
+ try:
23
+ if XFORMERS_ENABLED:
24
+ from xformers.ops import memory_efficient_attention, unbind
25
+
26
+ XFORMERS_AVAILABLE = True
27
+ # warnings.warn("xFormers is available (Attention)")
28
+ else:
29
+ # warnings.warn("xFormers is disabled (Attention)")
30
+ raise ImportError
31
+ except ImportError:
32
+ XFORMERS_AVAILABLE = False
33
+ # warnings.warn("xFormers is not available (Attention)")
34
+
35
+
36
+ class Attention(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int = 8,
41
+ qkv_bias: bool = False,
42
+ proj_bias: bool = True,
43
+ attn_drop: float = 0.0,
44
+ proj_drop: float = 0.0,
45
+ ) -> None:
46
+ super().__init__()
47
+ self.num_heads = num_heads
48
+ head_dim = dim // num_heads
49
+ self.scale = head_dim**-0.5
50
+
51
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
52
+ self.attn_drop = nn.Dropout(attn_drop)
53
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
54
+ self.proj_drop = nn.Dropout(proj_drop)
55
+
56
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
57
+ B, N, C = x.shape
58
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
59
+
60
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
61
+ attn = q @ k.transpose(-2, -1)
62
+
63
+ attn = attn.softmax(dim=-1)
64
+ attn = self.attn_drop(attn)
65
+
66
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
67
+ x = self.proj(x)
68
+ x = self.proj_drop(x)
69
+ return x
70
+
71
+
72
+ class MemEffAttention(Attention):
73
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
74
+ if not XFORMERS_AVAILABLE:
75
+ if attn_bias is not None:
76
+ raise AssertionError("xFormers is required for using nested tensors")
77
+ return super().forward(x)
78
+
79
+ B, N, C = x.shape
80
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
81
+
82
+ q, k, v = unbind(qkv, 2)
83
+
84
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
85
+ x = x.reshape([B, N, C])
86
+
87
+ x = self.proj(x)
88
+ x = self.proj_drop(x)
89
+ return x
flow3r/models/dinov2/layers/block.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ import logging
11
+ import os
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+ import warnings
14
+
15
+ import torch
16
+ from torch import nn, Tensor
17
+
18
+ from .attention import Attention, MemEffAttention
19
+ from .drop_path import DropPath
20
+ from .layer_scale import LayerScale
21
+ from .mlp import Mlp
22
+
23
+
24
+ logger = logging.getLogger("dinov2")
25
+
26
+
27
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
28
+ try:
29
+ if XFORMERS_ENABLED:
30
+ from xformers.ops import fmha, scaled_index_add, index_select_cat
31
+
32
+ XFORMERS_AVAILABLE = True
33
+ # warnings.warn("xFormers is available (Block)")
34
+ else:
35
+ # warnings.warn("xFormers is disabled (Block)")
36
+ raise ImportError
37
+ except ImportError:
38
+ XFORMERS_AVAILABLE = False
39
+ # warnings.warn("xFormers is not available (Block)")
40
+
41
+
42
+ class Block(nn.Module):
43
+ def __init__(
44
+ self,
45
+ dim: int,
46
+ num_heads: int,
47
+ mlp_ratio: float = 4.0,
48
+ qkv_bias: bool = False,
49
+ proj_bias: bool = True,
50
+ ffn_bias: bool = True,
51
+ drop: float = 0.0,
52
+ attn_drop: float = 0.0,
53
+ init_values=None,
54
+ drop_path: float = 0.0,
55
+ act_layer: Callable[..., nn.Module] = nn.GELU,
56
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
57
+ attn_class: Callable[..., nn.Module] = Attention,
58
+ ffn_layer: Callable[..., nn.Module] = Mlp,
59
+ ) -> None:
60
+ super().__init__()
61
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
62
+ self.norm1 = norm_layer(dim)
63
+ self.attn = attn_class(
64
+ dim,
65
+ num_heads=num_heads,
66
+ qkv_bias=qkv_bias,
67
+ proj_bias=proj_bias,
68
+ attn_drop=attn_drop,
69
+ proj_drop=drop,
70
+ )
71
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
72
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
73
+
74
+ self.norm2 = norm_layer(dim)
75
+ mlp_hidden_dim = int(dim * mlp_ratio)
76
+ self.mlp = ffn_layer(
77
+ in_features=dim,
78
+ hidden_features=mlp_hidden_dim,
79
+ act_layer=act_layer,
80
+ drop=drop,
81
+ bias=ffn_bias,
82
+ )
83
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
84
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
85
+
86
+ self.sample_drop_ratio = drop_path
87
+
88
+ def forward(self, x: Tensor) -> Tensor:
89
+ def attn_residual_func(x: Tensor) -> Tensor:
90
+ return self.ls1(self.attn(self.norm1(x)))
91
+
92
+ def ffn_residual_func(x: Tensor) -> Tensor:
93
+ return self.ls2(self.mlp(self.norm2(x)))
94
+
95
+ if self.training and self.sample_drop_ratio > 0.1:
96
+ # the overhead is compensated only for a drop path rate larger than 0.1
97
+ x = drop_add_residual_stochastic_depth(
98
+ x,
99
+ residual_func=attn_residual_func,
100
+ sample_drop_ratio=self.sample_drop_ratio,
101
+ )
102
+ x = drop_add_residual_stochastic_depth(
103
+ x,
104
+ residual_func=ffn_residual_func,
105
+ sample_drop_ratio=self.sample_drop_ratio,
106
+ )
107
+ elif self.training and self.sample_drop_ratio > 0.0:
108
+ x = x + self.drop_path1(attn_residual_func(x))
109
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
110
+ else:
111
+ x = x + attn_residual_func(x)
112
+ x = x + ffn_residual_func(x)
113
+ return x
114
+
115
+
116
+ def drop_add_residual_stochastic_depth(
117
+ x: Tensor,
118
+ residual_func: Callable[[Tensor], Tensor],
119
+ sample_drop_ratio: float = 0.0,
120
+ ) -> Tensor:
121
+ # 1) extract subset using permutation
122
+ b, n, d = x.shape
123
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
124
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
125
+ x_subset = x[brange]
126
+
127
+ # 2) apply residual_func to get residual
128
+ residual = residual_func(x_subset)
129
+
130
+ x_flat = x.flatten(1)
131
+ residual = residual.flatten(1)
132
+
133
+ residual_scale_factor = b / sample_subset_size
134
+
135
+ # 3) add the residual
136
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
137
+ return x_plus_residual.view_as(x)
138
+
139
+
140
+ def get_branges_scales(x, sample_drop_ratio=0.0):
141
+ b, n, d = x.shape
142
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
143
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
144
+ residual_scale_factor = b / sample_subset_size
145
+ return brange, residual_scale_factor
146
+
147
+
148
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
149
+ if scaling_vector is None:
150
+ x_flat = x.flatten(1)
151
+ residual = residual.flatten(1)
152
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
153
+ else:
154
+ x_plus_residual = scaled_index_add(
155
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
156
+ )
157
+ return x_plus_residual
158
+
159
+
160
+ attn_bias_cache: Dict[Tuple, Any] = {}
161
+
162
+
163
+ def get_attn_bias_and_cat(x_list, branges=None):
164
+ """
165
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
166
+ """
167
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
168
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
169
+ if all_shapes not in attn_bias_cache.keys():
170
+ seqlens = []
171
+ for b, x in zip(batch_sizes, x_list):
172
+ for _ in range(b):
173
+ seqlens.append(x.shape[1])
174
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
175
+ attn_bias._batch_sizes = batch_sizes
176
+ attn_bias_cache[all_shapes] = attn_bias
177
+
178
+ if branges is not None:
179
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
180
+ else:
181
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
182
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
183
+
184
+ return attn_bias_cache[all_shapes], cat_tensors
185
+
186
+
187
+ def drop_add_residual_stochastic_depth_list(
188
+ x_list: List[Tensor],
189
+ residual_func: Callable[[Tensor, Any], Tensor],
190
+ sample_drop_ratio: float = 0.0,
191
+ scaling_vector=None,
192
+ ) -> Tensor:
193
+ # 1) generate random set of indices for dropping samples in the batch
194
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
195
+ branges = [s[0] for s in branges_scales]
196
+ residual_scale_factors = [s[1] for s in branges_scales]
197
+
198
+ # 2) get attention bias and index+concat the tensors
199
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
200
+
201
+ # 3) apply residual_func to get residual, and split the result
202
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
203
+
204
+ outputs = []
205
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
206
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
207
+ return outputs
208
+
209
+
210
+ class NestedTensorBlock(Block):
211
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
212
+ """
213
+ x_list contains a list of tensors to nest together and run
214
+ """
215
+ assert isinstance(self.attn, MemEffAttention)
216
+
217
+ if self.training and self.sample_drop_ratio > 0.0:
218
+
219
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
220
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
221
+
222
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
223
+ return self.mlp(self.norm2(x))
224
+
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=attn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
230
+ )
231
+ x_list = drop_add_residual_stochastic_depth_list(
232
+ x_list,
233
+ residual_func=ffn_residual_func,
234
+ sample_drop_ratio=self.sample_drop_ratio,
235
+ scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
236
+ )
237
+ return x_list
238
+ else:
239
+
240
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
241
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
242
+
243
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
244
+ return self.ls2(self.mlp(self.norm2(x)))
245
+
246
+ attn_bias, x = get_attn_bias_and_cat(x_list)
247
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
248
+ x = x + ffn_residual_func(x)
249
+ return attn_bias.split(x)
250
+
251
+ def forward(self, x_or_x_list):
252
+ if isinstance(x_or_x_list, Tensor):
253
+ return super().forward(x_or_x_list)
254
+ elif isinstance(x_or_x_list, list):
255
+ if not XFORMERS_AVAILABLE:
256
+ raise AssertionError("xFormers is required for using nested tensors")
257
+ return self.forward_nested(x_or_x_list)
258
+ else:
259
+ raise AssertionError
flow3r/models/dinov2/layers/dino_head.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.nn.init import trunc_normal_
9
+ from torch.nn.utils import weight_norm
10
+
11
+
12
+ class DINOHead(nn.Module):
13
+ def __init__(
14
+ self,
15
+ in_dim,
16
+ out_dim,
17
+ use_bn=False,
18
+ nlayers=3,
19
+ hidden_dim=2048,
20
+ bottleneck_dim=256,
21
+ mlp_bias=True,
22
+ ):
23
+ super().__init__()
24
+ nlayers = max(nlayers, 1)
25
+ self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
26
+ self.apply(self._init_weights)
27
+ self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
28
+ self.last_layer.weight_g.data.fill_(1)
29
+
30
+ def _init_weights(self, m):
31
+ if isinstance(m, nn.Linear):
32
+ trunc_normal_(m.weight, std=0.02)
33
+ if isinstance(m, nn.Linear) and m.bias is not None:
34
+ nn.init.constant_(m.bias, 0)
35
+
36
+ def forward(self, x):
37
+ x = self.mlp(x)
38
+ eps = 1e-6 if x.dtype == torch.float16 else 1e-12
39
+ x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
40
+ x = self.last_layer(x)
41
+ return x
42
+
43
+
44
+ def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
45
+ if nlayers == 1:
46
+ return nn.Linear(in_dim, bottleneck_dim, bias=bias)
47
+ else:
48
+ layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
49
+ if use_bn:
50
+ layers.append(nn.BatchNorm1d(hidden_dim))
51
+ layers.append(nn.GELU())
52
+ for _ in range(nlayers - 2):
53
+ layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
54
+ if use_bn:
55
+ layers.append(nn.BatchNorm1d(hidden_dim))
56
+ layers.append(nn.GELU())
57
+ layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
58
+ return nn.Sequential(*layers)
flow3r/models/dinov2/layers/drop_path.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
9
+
10
+
11
+ from torch import nn
12
+
13
+
14
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
15
+ if drop_prob == 0.0 or not training:
16
+ return x
17
+ keep_prob = 1 - drop_prob
18
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
19
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
20
+ if keep_prob > 0.0:
21
+ random_tensor.div_(keep_prob)
22
+ output = x * random_tensor
23
+ return output
24
+
25
+
26
+ class DropPath(nn.Module):
27
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
28
+
29
+ def __init__(self, drop_prob=None):
30
+ super(DropPath, self).__init__()
31
+ self.drop_prob = drop_prob
32
+
33
+ def forward(self, x):
34
+ return drop_path(x, self.drop_prob, self.training)
flow3r/models/dinov2/layers/layer_scale.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
7
+
8
+ from typing import Union
9
+
10
+ import torch
11
+ from torch import Tensor
12
+ from torch import nn
13
+
14
+
15
+ class LayerScale(nn.Module):
16
+ def __init__(
17
+ self,
18
+ dim: int,
19
+ init_values: Union[float, Tensor] = 1e-5,
20
+ inplace: bool = False,
21
+ ) -> None:
22
+ super().__init__()
23
+ self.inplace = inplace
24
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
25
+
26
+ def forward(self, x: Tensor) -> Tensor:
27
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
flow3r/models/dinov2/layers/mlp.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
9
+
10
+
11
+ from typing import Callable, Optional
12
+
13
+ from torch import Tensor, nn
14
+
15
+
16
+ class Mlp(nn.Module):
17
+ def __init__(
18
+ self,
19
+ in_features: int,
20
+ hidden_features: Optional[int] = None,
21
+ out_features: Optional[int] = None,
22
+ act_layer: Callable[..., nn.Module] = nn.GELU,
23
+ drop: float = 0.0,
24
+ bias: bool = True,
25
+ ) -> None:
26
+ super().__init__()
27
+ out_features = out_features or in_features
28
+ hidden_features = hidden_features or in_features
29
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
30
+ self.act = act_layer()
31
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
32
+ self.drop = nn.Dropout(drop)
33
+
34
+ def forward(self, x: Tensor) -> Tensor:
35
+ x = self.fc1(x)
36
+ x = self.act(x)
37
+ x = self.drop(x)
38
+ x = self.fc2(x)
39
+ x = self.drop(x)
40
+ return x
flow3r/models/dinov2/layers/patch_embed.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ from typing import Callable, Optional, Tuple, Union
11
+
12
+ from torch import Tensor
13
+ import torch.nn as nn
14
+
15
+
16
+ def make_2tuple(x):
17
+ if isinstance(x, tuple):
18
+ assert len(x) == 2
19
+ return x
20
+
21
+ assert isinstance(x, int)
22
+ return (x, x)
23
+
24
+
25
+ class PatchEmbed(nn.Module):
26
+ """
27
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
28
+
29
+ Args:
30
+ img_size: Image size.
31
+ patch_size: Patch token size.
32
+ in_chans: Number of input image channels.
33
+ embed_dim: Number of linear projection output channels.
34
+ norm_layer: Normalization layer.
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ img_size: Union[int, Tuple[int, int]] = 224,
40
+ patch_size: Union[int, Tuple[int, int]] = 16,
41
+ in_chans: int = 3,
42
+ embed_dim: int = 768,
43
+ norm_layer: Optional[Callable] = None,
44
+ flatten_embedding: bool = True,
45
+ ) -> None:
46
+ super().__init__()
47
+
48
+ image_HW = make_2tuple(img_size)
49
+ patch_HW = make_2tuple(patch_size)
50
+ patch_grid_size = (
51
+ image_HW[0] // patch_HW[0],
52
+ image_HW[1] // patch_HW[1],
53
+ )
54
+
55
+ self.img_size = image_HW
56
+ self.patch_size = patch_HW
57
+ self.patches_resolution = patch_grid_size
58
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
59
+
60
+ self.in_chans = in_chans
61
+ self.embed_dim = embed_dim
62
+
63
+ self.flatten_embedding = flatten_embedding
64
+
65
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
66
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
67
+
68
+ def forward(self, x: Tensor) -> Tensor:
69
+ _, _, H, W = x.shape
70
+ patch_H, patch_W = self.patch_size
71
+
72
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
73
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
74
+
75
+ x = self.proj(x) # B C H W
76
+ H, W = x.size(2), x.size(3)
77
+ x = x.flatten(2).transpose(1, 2) # B HW C
78
+ x = self.norm(x)
79
+ if not self.flatten_embedding:
80
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
81
+ return x
82
+
83
+ def flops(self) -> float:
84
+ Ho, Wo = self.patches_resolution
85
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
86
+ if self.norm is not None:
87
+ flops += Ho * Wo * self.embed_dim
88
+ return flops
flow3r/models/dinov2/layers/swiglu_ffn.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ from typing import Callable, Optional
8
+ import warnings
9
+
10
+ from torch import Tensor, nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ class SwiGLUFFN(nn.Module):
15
+ def __init__(
16
+ self,
17
+ in_features: int,
18
+ hidden_features: Optional[int] = None,
19
+ out_features: Optional[int] = None,
20
+ act_layer: Callable[..., nn.Module] = None,
21
+ drop: float = 0.0,
22
+ bias: bool = True,
23
+ ) -> None:
24
+ super().__init__()
25
+ out_features = out_features or in_features
26
+ hidden_features = hidden_features or in_features
27
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
28
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
29
+
30
+ def forward(self, x: Tensor) -> Tensor:
31
+ x12 = self.w12(x)
32
+ x1, x2 = x12.chunk(2, dim=-1)
33
+ hidden = F.silu(x1) * x2
34
+ return self.w3(hidden)
35
+
36
+
37
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
38
+ try:
39
+ if XFORMERS_ENABLED:
40
+ from xformers.ops import SwiGLU
41
+
42
+ XFORMERS_AVAILABLE = True
43
+ # warnings.warn("xFormers is available (SwiGLU)")
44
+ else:
45
+ # warnings.warn("xFormers is disabled (SwiGLU)")
46
+ raise ImportError
47
+ except ImportError:
48
+ SwiGLU = SwiGLUFFN
49
+ XFORMERS_AVAILABLE = False
50
+
51
+ # warnings.warn("xFormers is not available (SwiGLU)")
52
+
53
+
54
+ class SwiGLUFFNFused(SwiGLU):
55
+ def __init__(
56
+ self,
57
+ in_features: int,
58
+ hidden_features: Optional[int] = None,
59
+ out_features: Optional[int] = None,
60
+ act_layer: Callable[..., nn.Module] = None,
61
+ drop: float = 0.0,
62
+ bias: bool = True,
63
+ ) -> None:
64
+ out_features = out_features or in_features
65
+ hidden_features = hidden_features or in_features
66
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
67
+ super().__init__(
68
+ in_features=in_features,
69
+ hidden_features=hidden_features,
70
+ out_features=out_features,
71
+ bias=bias,
72
+ )
flow3r/models/dinov2/models/__init__.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+
8
+ from . import vision_transformer as vits
9
+
10
+
11
+ logger = logging.getLogger("dinov2")
12
+
13
+
14
+ def build_model(args, only_teacher=False, img_size=224):
15
+ args.arch = args.arch.removesuffix("_memeff")
16
+ if "vit" in args.arch:
17
+ vit_kwargs = dict(
18
+ img_size=img_size,
19
+ patch_size=args.patch_size,
20
+ init_values=args.layerscale,
21
+ ffn_layer=args.ffn_layer,
22
+ block_chunks=args.block_chunks,
23
+ qkv_bias=args.qkv_bias,
24
+ proj_bias=args.proj_bias,
25
+ ffn_bias=args.ffn_bias,
26
+ num_register_tokens=args.num_register_tokens,
27
+ interpolate_offset=args.interpolate_offset,
28
+ interpolate_antialias=args.interpolate_antialias,
29
+ )
30
+ teacher = vits.__dict__[args.arch](**vit_kwargs)
31
+ if only_teacher:
32
+ return teacher, teacher.embed_dim
33
+ student = vits.__dict__[args.arch](
34
+ **vit_kwargs,
35
+ drop_path_rate=args.drop_path_rate,
36
+ drop_path_uniform=args.drop_path_uniform,
37
+ )
38
+ embed_dim = student.embed_dim
39
+ return student, teacher, embed_dim
40
+
41
+
42
+ def build_model_from_cfg(cfg, only_teacher=False):
43
+ return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size)
flow3r/models/dinov2/models/vision_transformer.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ from torch.utils.checkpoint import checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from ..layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+ from ...layers.attention import FlashAttention
22
+
23
+
24
+ # logger = logging.getLogger("dinov2")
25
+
26
+
27
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
28
+ if not depth_first and include_root:
29
+ fn(module=module, name=name)
30
+ for child_name, child_module in module.named_children():
31
+ child_name = ".".join((name, child_name)) if name else child_name
32
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
33
+ if depth_first and include_root:
34
+ fn(module=module, name=name)
35
+ return module
36
+
37
+
38
+ class BlockChunk(nn.ModuleList):
39
+ def forward(self, x):
40
+ for b in self:
41
+ x = b(x)
42
+ return x
43
+
44
+
45
class DinoVisionTransformer(nn.Module):
    """DINOv2 Vision Transformer backbone (patch embedding + transformer blocks)."""

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        ffn_bias=True,
        proj_bias=True,
        drop_path_rate=0.0,
        drop_path_uniform=False,
        init_values=None,  # for layerscale: None or 0 => no layerscale
        embed_layer=PatchEmbed,
        act_layer=nn.GELU,
        block_fn=Block,
        ffn_layer="mlp",
        block_chunks=1,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.1,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            weight_init (str): weight init scheme
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
            interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
        """
        super().__init__()
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 1  # the single CLS token
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.num_register_tokens = num_register_tokens
        self.interpolate_antialias = interpolate_antialias
        self.interpolate_offset = interpolate_offset

        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        # Learnable CLS token and positional embeddings (CLS + one per patch).
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        assert num_register_tokens >= 0
        # Optional "register" tokens (extra CLS-like tokens without positional embedding).
        self.register_tokens = (
            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
        )

        if drop_path_uniform is True:
            dpr = [drop_path_rate] * depth
        else:
            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        if ffn_layer == "mlp":
            # logger.info("using MLP layer as FFN")
            ffn_layer = Mlp
        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
            # logger.info("using SwiGLU layer as FFN")
            ffn_layer = SwiGLUFFNFused
        elif ffn_layer == "identity":
            # logger.info("using Identity layer as FFN")

            def f(*args, **kwargs):
                return nn.Identity()

            ffn_layer = f
        else:
            raise NotImplementedError

        # NOTE: attn_class is hard-coded to FlashAttention here, overriding any
        # attn_class baked into block_fn via functools.partial (keyword given
        # explicitly wins only if block_fn does not already fix it; a partial
        # with attn_class set would raise — confirm against the Block factories).
        blocks_list = [
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                ffn_layer=ffn_layer,
                init_values=init_values,
                attn_class=FlashAttention
            )
            for i in range(depth)
        ]
        if block_chunks > 0:
            self.chunked_blocks = True
            chunked_blocks = []
            chunksize = depth // block_chunks
            for i in range(0, depth, chunksize):
                # this is to keep the block index consistent if we chunk the block list
                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
        else:
            self.chunked_blocks = False
            self.blocks = nn.ModuleList(blocks_list)

        self.norm = norm_layer(embed_dim)
        self.head = nn.Identity()

        # Token substituted for masked patches during masked-image modeling.
        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))

        self.init_weights()

    def init_weights(self):
        """Initialize positional/CLS/register tokens and all Linear layers."""
        trunc_normal_(self.pos_embed, std=0.02)
        nn.init.normal_(self.cls_token, std=1e-6)
        if self.register_tokens is not None:
            nn.init.normal_(self.register_tokens, std=1e-6)
        named_apply(init_weights_vit_timm, self)

    def interpolate_pos_encoding(self, x, w, h):
        """Bicubically resize the patch positional embeddings to a (w, h) input.

        Returns the original pos_embed unchanged when the patch count matches
        and the input is square; otherwise interpolates the patch grid and
        re-attaches the CLS position.
        """
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        pos_embed = self.pos_embed.float()
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_size
        h0 = h // self.patch_size
        M = int(math.sqrt(N))  # Recover the number of patches in each dimension
        assert N == M * M
        kwargs = {}
        if self.interpolate_offset:
            # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
            # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
            sx = float(w0 + self.interpolate_offset) / M
            sy = float(h0 + self.interpolate_offset) / M
            kwargs["scale_factor"] = (sx, sy)
        else:
            # Simply specify an output size instead of a scale factor
            kwargs["size"] = (w0, h0)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
            mode="bicubic",
            antialias=self.interpolate_antialias,
            **kwargs,
        )
        assert (w0, h0) == patch_pos_embed.shape[-2:]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)

    def prepare_tokens_with_masks(self, x, masks=None):
        """Patchify, optionally mask, prepend CLS (and registers), add pos embed."""
        B, nc, w, h = x.shape
        x = self.patch_embed(x)
        if masks is not None:
            # Replace masked patch embeddings with the learned mask token.
            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)

        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        x = x + self.interpolate_pos_encoding(x, w, h)

        if self.register_tokens is not None:
            # Registers are inserted between CLS and the patch tokens and get
            # no positional embedding (pos embed was already added above).
            x = torch.cat(
                (
                    x[:, :1],
                    self.register_tokens.expand(x.shape[0], -1, -1),
                    x[:, 1:],
                ),
                dim=1,
            )

        return x

    def forward_features_list(self, x_list, masks_list):
        """Run the backbone over a list of (images, masks) pairs jointly.

        Uses gradient checkpointing during training to save memory.
        """
        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
        for blk in self.blocks:
            if self.training:
                x = checkpoint(blk, x, use_reentrant=False)
            else:
                x = blk(x)

        all_x = x
        output = []
        for x, masks in zip(all_x, masks_list):
            x_norm = self.norm(x)
            output.append(
                {
                    "x_norm_clstoken": x_norm[:, 0],
                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
                    "x_prenorm": x,
                    "masks": masks,
                }
            )
        return output

    def forward_features(self, x, masks=None):
        """Run the backbone and return normalized CLS/register/patch tokens."""
        if isinstance(x, list):
            return self.forward_features_list(x, masks)

        x = self.prepare_tokens_with_masks(x, masks)

        for blk in self.blocks:
            if self.training:
                # Gradient checkpointing trades compute for activation memory.
                x = checkpoint(blk, x, use_reentrant=False)
            else:
                x = blk(x)

        x_norm = self.norm(x)
        return {
            "x_norm_clstoken": x_norm[:, 0],
            "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
            "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
            "x_prenorm": x,
            "masks": masks,
        }

    def _get_intermediate_layers_not_chunked(self, x, n=1):
        # Collect outputs of selected blocks when self.blocks is a flat list.
        x = self.prepare_tokens_with_masks(x)
        # If n is an int, take the n last blocks. If it's a list, take them
        output, total_block_len = [], len(self.blocks)
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if i in blocks_to_take:
                output.append(x)
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def _get_intermediate_layers_chunked(self, x, n=1):
        # Collect outputs of selected blocks when self.blocks is chunked;
        # leading nn.Identity() padding keeps global block indices consistent.
        x = self.prepare_tokens_with_masks(x)
        output, i, total_block_len = [], 0, len(self.blocks[-1])
        # If n is an int, take the n last blocks. If it's a list, take them
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for block_chunk in self.blocks:
            for blk in block_chunk[i:]:  # Passing the nn.Identity()
                x = blk(x)
                if i in blocks_to_take:
                    output.append(x)
                i += 1
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def get_intermediate_layers(
        self,
        x: torch.Tensor,
        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
        reshape: bool = False,
        return_class_token: bool = False,
        norm=True,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
        """Return patch-token features from selected blocks.

        Args:
            x: input image batch.
            n: number of last blocks to take, or an explicit list of indices.
            reshape: return features as (B, C, w', h') maps instead of (B, L, C).
            return_class_token: additionally return the CLS token per layer.
            norm: apply the final LayerNorm to each collected output.
        """
        if self.chunked_blocks:
            outputs = self._get_intermediate_layers_chunked(x, n)
        else:
            outputs = self._get_intermediate_layers_not_chunked(x, n)
        if norm:
            outputs = [self.norm(out) for out in outputs]
        class_tokens = [out[:, 0] for out in outputs]
        # Drop CLS + register tokens, keep patch tokens only.
        outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
        if reshape:
            B, _, w, h = x.shape
            outputs = [
                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
                for out in outputs
            ]
        if return_class_token:
            return tuple(zip(outputs, class_tokens))
        return tuple(outputs)

    def forward(self, *args, is_training=False, **kwargs):
        """Full feature dict when is_training, else head applied to the CLS token."""
        ret = self.forward_features(*args, **kwargs)
        if is_training:
            return ret
        else:
            return self.head(ret["x_norm_clstoken"])
338
+
339
+
340
def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility)"""
    if not isinstance(module, nn.Linear):
        return
    trunc_normal_(module.weight, std=0.02)
    if module.bias is not None:
        nn.init.zeros_(module.bias)
346
+
347
+
348
def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
    """Build a ViT-Small backbone (embed dim 384, depth 12, 6 heads)."""
    arch = dict(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
    )
    return DinoVisionTransformer(**arch, **kwargs)
360
+
361
+
362
def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
    """Build a ViT-Base backbone (embed dim 768, depth 12, 12 heads)."""
    arch = dict(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
    )
    return DinoVisionTransformer(**arch, **kwargs)
374
+
375
+
376
def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
    """Build a ViT-Large backbone (embed dim 1024, depth 24, 16 heads)."""
    arch = dict(
        patch_size=patch_size,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
    )
    return DinoVisionTransformer(**arch, **kwargs)
388
+
389
+
390
def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
    """
    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
    """
    arch = dict(
        patch_size=patch_size,
        embed_dim=1536,
        depth=40,
        num_heads=24,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
    )
    return DinoVisionTransformer(**arch, **kwargs)
flow3r/models/dinov2/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
flow3r/models/dinov2/utils/cluster.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from enum import Enum
7
+ import os
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Optional
10
+
11
+
12
class ClusterType(Enum):
    # Known compute clusters; the values are the short string identifiers
    # used when a cluster type needs to be serialized or configured.
    AWS = "aws"
    FAIR = "fair"
    RSC = "rsc"
16
+
17
+
18
def _guess_cluster_type() -> ClusterType:
    """Guess which cluster we run on from kernel release / hostname; FAIR is the fallback."""
    info = os.uname()
    if info.sysname == "Linux":
        if info.release.endswith("-aws"):
            # Linux kernel versions on AWS instances are of the form "5.4.0-1051-aws"
            return ClusterType.AWS
        if info.nodename.startswith("rsc"):
            # Linux kernel versions on RSC instances are standard ones but hostnames start with "rsc"
            return ClusterType.RSC
    return ClusterType.FAIR
29
+
30
+
31
def get_cluster_type(cluster_type: Optional[ClusterType] = None) -> Optional[ClusterType]:
    """Return the given cluster type, guessing it when None is passed."""
    return _guess_cluster_type() if cluster_type is None else cluster_type
36
+
37
+
38
def get_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Return the cluster-wide checkpoint root directory, or None if unknown."""
    cluster_type = get_cluster_type(cluster_type)
    if cluster_type is None:
        return None

    dirname = {
        ClusterType.AWS: "checkpoints",
        ClusterType.FAIR: "checkpoint",
        ClusterType.RSC: "checkpoint/dino",
    }[cluster_type]
    return Path("/") / dirname
49
+
50
+
51
def get_user_checkpoint_path(cluster_type: Optional[ClusterType] = None) -> Optional[Path]:
    """Return the per-user checkpoint directory ($USER under the cluster root), or None."""
    root = get_checkpoint_path(cluster_type)
    if root is None:
        return None

    user = os.environ.get("USER")
    assert user is not None
    return root / user
59
+
60
+
61
def get_slurm_partition(cluster_type: Optional[ClusterType] = None) -> Optional[str]:
    """Return the default SLURM partition for the (possibly guessed) cluster."""
    cluster_type = get_cluster_type(cluster_type)
    if cluster_type is None:
        return None

    partitions = {
        ClusterType.AWS: "learnlab",
        ClusterType.FAIR: "learnlab",
        ClusterType.RSC: "learn",
    }
    return partitions[cluster_type]
72
+
73
+
74
def get_slurm_executor_parameters(
    nodes: int, num_gpus_per_node: int, cluster_type: Optional[ClusterType] = None, **kwargs
) -> Dict[str, Any]:
    """Build SLURM executor parameters for the given topology.

    Starts from sensible defaults, applies per-cluster adjustments, then lets
    explicit keyword arguments override everything.
    """
    params = {
        "mem_gb": 0,  # Requests all memory on a node, see https://slurm.schedmd.com/sbatch.html
        "gpus_per_node": num_gpus_per_node,
        "tasks_per_node": num_gpus_per_node,  # one task per GPU
        "cpus_per_task": 10,
        "nodes": nodes,
        "slurm_partition": get_slurm_partition(cluster_type),
    }
    # apply cluster-specific adjustments
    cluster_type = get_cluster_type(cluster_type)
    if cluster_type == ClusterType.AWS:
        params["cpus_per_task"] = 12
        del params["mem_gb"]
    elif cluster_type == ClusterType.RSC:
        params["cpus_per_task"] = 12
    # set additional parameters / apply overrides
    params.update(kwargs)
    return params
flow3r/models/dinov2/utils/config.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import logging
8
+ import os
9
+
10
+ from omegaconf import OmegaConf
11
+
12
+ import dinov2.distributed as distributed
13
+ from dinov2.logging import setup_logging
14
+ from dinov2.utils import utils
15
+ from dinov2.configs import dinov2_default_config
16
+
17
+
18
+ logger = logging.getLogger("dinov2")
19
+
20
+
21
def apply_scaling_rules_to_cfg(cfg):  # to fix
    """Derive cfg.optim.lr from cfg.optim.base_lr using the configured scaling rule."""
    if cfg.optim.scaling_rule != "sqrt_wrt_1024":
        raise NotImplementedError
    # Square-root scaling of the learning rate with the global batch size,
    # normalized so that a world batch of 1024 keeps the base learning rate.
    base_lr = cfg.optim.base_lr
    world_batch = cfg.train.batch_size_per_gpu * distributed.get_global_size()
    cfg.optim.lr = base_lr * math.sqrt(world_batch / 1024.0)
    logger.info(f"sqrt scaling learning rate; base: {base_lr}, new: {cfg.optim.lr}")
    return cfg
30
+
31
+
32
def write_config(cfg, output_dir, name="config.yaml"):
    """Log the resolved config and save it as YAML under output_dir; return the path."""
    logger.info(OmegaConf.to_yaml(cfg))
    saved_cfg_path = os.path.join(output_dir, name)
    with open(saved_cfg_path, "w") as f:
        OmegaConf.save(config=cfg, f=f)
    return saved_cfg_path
38
+
39
+
40
def get_cfg_from_args(args):
    """Merge (in increasing priority) defaults, the config file, and CLI overrides."""
    args.output_dir = os.path.abspath(args.output_dir)
    # Force the output directory into the config via a synthetic CLI override.
    args.opts += [f"train.output_dir={args.output_dir}"]
    base_cfg = OmegaConf.create(dinov2_default_config)
    file_cfg = OmegaConf.load(args.config_file)
    return OmegaConf.merge(base_cfg, file_cfg, OmegaConf.from_cli(args.opts))
47
+
48
+
49
def default_setup(args):
    # Shared bootstrapping for all entry points: distributed init, logging,
    # deterministic-but-rank-dependent seeding, and environment diagnostics.
    distributed.enable(overwrite=True)
    seed = getattr(args, "seed", 0)
    rank = distributed.get_global_rank()

    global logger  # re-bind the module-level logger once logging is configured
    setup_logging(output=args.output_dir, level=logging.INFO)
    logger = logging.getLogger("dinov2")

    # Offset the seed by rank so each process gets a distinct RNG stream.
    utils.fix_random_seeds(seed + rank)
    logger.info("git:\n {}\n".format(utils.get_sha()))
    logger.info("\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())))
61
+
62
+
63
def setup(args):
    """
    Create configs and perform basic setups.
    """
    # Order matters: the config must exist before scaling rules are applied,
    # and logging must be set up (default_setup) before write_config logs it.
    cfg = get_cfg_from_args(args)
    os.makedirs(args.output_dir, exist_ok=True)
    default_setup(args)
    apply_scaling_rules_to_cfg(cfg)
    write_config(cfg, args.output_dir)
    return cfg
flow3r/models/dinov2/utils/dtype.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ from typing import Dict, Union
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+
13
# A dtype specification: a numpy dtype name string, a numpy dtype, or a torch dtype.
TypeSpec = Union[str, np.dtype, torch.dtype]


# Mapping from numpy dtypes to their torch equivalents.
_NUMPY_TO_TORCH_DTYPE: Dict[np.dtype, torch.dtype] = {
    np.dtype("bool"): torch.bool,
    np.dtype("uint8"): torch.uint8,
    np.dtype("int8"): torch.int8,
    np.dtype("int16"): torch.int16,
    np.dtype("int32"): torch.int32,
    np.dtype("int64"): torch.int64,
    np.dtype("float16"): torch.float16,
    np.dtype("float32"): torch.float32,
    np.dtype("float64"): torch.float64,
    np.dtype("complex64"): torch.complex64,
    np.dtype("complex128"): torch.complex128,
}


def as_torch_dtype(dtype: TypeSpec) -> torch.dtype:
    """Normalize a dtype specification to a torch.dtype.

    Accepts a torch dtype (returned as-is), a numpy dtype, or a string
    understood by ``np.dtype`` (e.g. ``"float32"``).

    Raises:
        KeyError: if the numpy dtype has no torch equivalent.
    """
    if isinstance(dtype, torch.dtype):
        return dtype
    if isinstance(dtype, str):
        dtype = np.dtype(dtype)
    # Fixed typo in the assertion message ("nunpy" -> "numpy").
    assert isinstance(dtype, np.dtype), f"Expected an instance of numpy dtype, got {type(dtype)}"
    return _NUMPY_TO_TORCH_DTYPE[dtype]
flow3r/models/dinov2/utils/param_groups.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ from collections import defaultdict
7
+ import logging
8
+
9
+
10
+ logger = logging.getLogger("dinov2")
11
+
12
+
13
def get_vit_lr_decay_rate(name, lr_decay_rate=1.0, num_layers=12, force_is_backbone=False, chunked_blocks=False):
    """
    Calculate lr decay rate for different ViT blocks.
    Args:
        name (string): parameter name.
        lr_decay_rate (float): base lr decay rate.
        num_layers (int): number of ViT blocks.
    Returns:
        lr decay rate for the given parameter.
    """
    stem_keys = ("pos_embed", "patch_embed", "mask_token", "cls_token", "register_tokens")
    # Default: treat the parameter as belonging to the head (no decay applied).
    layer_id = num_layers + 1
    if name.startswith("backbone") or force_is_backbone:
        if any("." + key in name for key in stem_keys):
            # Stem parameters (tokens / embeddings) decay the most.
            layer_id = 0
        elif force_is_backbone and any(key in name for key in stem_keys):
            layer_id = 0
        elif ".blocks." in name and ".residual." not in name:
            layer_id = int(name[name.find(".blocks.") :].split(".")[2]) + 1
        elif chunked_blocks and "blocks." in name and "residual." not in name:
            # Chunked (FSDP) layout: "blocks.<chunk>.<block>...."
            layer_id = int(name[name.find("blocks.") :].split(".")[2]) + 1
        elif "blocks." in name and "residual." not in name:
            layer_id = int(name[name.find("blocks.") :].split(".")[1]) + 1

    return lr_decay_rate ** (num_layers + 1 - layer_id)
49
+
50
+
51
def get_params_groups_with_decay(model, lr_decay_rate=1.0, patch_embed_lr_mult=1.0):
    """Build one optimizer parameter group per trainable parameter.

    Each group carries a layer-wise lr multiplier (see get_vit_lr_decay_rate),
    a weight-decay multiplier (0 for biases/norm/gamma parameters), a
    last-layer flag, and an extra lr boost for patch-embedding parameters.
    """
    # Figure out how many transformer blocks the model has and whether the
    # block list is chunked (FSDP layout).
    chunked_blocks = False
    if hasattr(model, "n_blocks"):
        logger.info("chunked fsdp")
        n_blocks = model.n_blocks
        chunked_blocks = model.chunked_blocks
    elif hasattr(model, "blocks"):
        logger.info("first code branch")
        n_blocks = len(model.blocks)
    elif hasattr(model, "backbone"):
        logger.info("second code branch")
        n_blocks = len(model.backbone.blocks)
    else:
        logger.info("else code branch")
        n_blocks = 0

    groups = []
    for name, param in model.named_parameters():
        name = name.replace("_fsdp_wrapped_module.", "")
        if not param.requires_grad:
            continue

        decay_rate = get_vit_lr_decay_rate(
            name, lr_decay_rate, num_layers=n_blocks, force_is_backbone=n_blocks > 0, chunked_blocks=chunked_blocks
        )
        lr_multiplier = decay_rate * patch_embed_lr_mult if "patch_embed" in name else decay_rate
        # Biases and normalization/scale parameters get no weight decay.
        wd_multiplier = 0.0 if (name.endswith(".bias") or "norm" in name or "gamma" in name) else 1.0

        group = {
            "params": param,
            "is_last_layer": "last_layer" in name,
            "lr_multiplier": lr_multiplier,
            "wd_multiplier": wd_multiplier,
            "name": name,
        }
        groups.append(group)
        logger.info(f"""{name}: lr_multiplier: {group["lr_multiplier"]}, wd_multiplier: {group["wd_multiplier"]}""")

    return groups
90
+
91
+
92
def fuse_params_groups(all_params_groups, keys=("lr_multiplier", "wd_multiplier", "is_last_layer")):
    """Merge per-parameter groups that share identical values for the given keys.

    Returns a view of fused groups, each holding the shared key values and a
    list of the parameters it covers.
    """
    fused = defaultdict(lambda: {"params": []})
    for group in all_params_groups:
        # The identifier is the concatenation of key/value pairs, so groups
        # with equal hyper-parameters collapse into one fused entry.
        identifier = "".join(k + str(group[k]) + "_" for k in keys)
        entry = fused[identifier]
        for k in keys:
            entry[k] = group[k]
        entry["params"].append(group["params"])

    return fused.values()
flow3r/models/dinov2/utils/utils.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import os
8
+ import random
9
+ import subprocess
10
+ from urllib.parse import urlparse
11
+
12
+ import numpy as np
13
+ import torch
14
+ from torch import nn
15
+
16
+
17
+ # logger = logging.getLogger("dinov2")
18
+
19
+
20
def load_pretrained_weights(model, pretrained_weights, checkpoint_key):
    """Load a checkpoint (local path or URL) into model, non-strictly.

    Optionally descends into state_dict[checkpoint_key] and strips the
    "module." (DDP) and "backbone." (multicrop wrapper) key prefixes.
    """
    if urlparse(pretrained_weights).scheme:  # If it looks like an URL
        state_dict = torch.hub.load_state_dict_from_url(pretrained_weights, map_location="cpu")
    else:
        state_dict = torch.load(pretrained_weights, map_location="cpu")
    if checkpoint_key is not None and checkpoint_key in state_dict:
        # logger.info(f"Take key {checkpoint_key} in provided checkpoint dict")
        state_dict = state_dict[checkpoint_key]
    # Strip wrapper prefixes: "module." first, then "backbone." on the result.
    cleaned = {}
    for key, value in state_dict.items():
        cleaned[key.replace("module.", "").replace("backbone.", "")] = value
    msg = model.load_state_dict(cleaned, strict=False)
    # logger.info("Pretrained weights found at {} and loaded with msg: {}".format(pretrained_weights, msg))
34
+
35
+
36
def fix_random_seeds(seed=31):
    """
    Fix random seeds.
    """
    # Seed every RNG this codebase relies on: torch (CPU and all GPUs),
    # numpy, and the Python standard library.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed, random.seed):
        seeder(seed)
+
45
+
46
def get_sha():
    """Return a one-line description of the current git state: sha, dirty status, branch.

    Falls back to "N/A"/"clean" placeholders when git is unavailable or the
    module does not live inside a repository.
    """
    cwd = os.path.dirname(os.path.abspath(__file__))

    def _git(command):
        return subprocess.check_output(command, cwd=cwd).decode("ascii").strip()

    sha, diff, branch = "N/A", "clean", "N/A"
    try:
        sha = _git(["git", "rev-parse", "HEAD"])
        subprocess.check_output(["git", "diff"], cwd=cwd)
        diff = "has uncommitted changes" if _git(["git", "diff-index", "HEAD"]) else "clean"
        branch = _git(["git", "rev-parse", "--abbrev-ref", "HEAD"])
    except Exception:
        pass
    return f"sha: {sha}, status: {diff}, branch: {branch}"
65
+
66
+
67
class CosineScheduler(object):
    """Cosine decay schedule with optional freeze and linear warmup phases.

    Indexing with an iteration number returns the scheduled value; indices at
    or past total_iters clamp to final_value.
    """

    def __init__(self, base_value, final_value, total_iters, warmup_iters=0, start_warmup_value=0, freeze_iters=0):
        super().__init__()
        self.final_value = final_value
        self.total_iters = total_iters

        # Phase 1: frozen at zero; phase 2: linear warmup; phase 3: cosine decay.
        frozen = np.zeros((freeze_iters))
        warmup = np.linspace(start_warmup_value, base_value, warmup_iters)
        steps = np.arange(total_iters - warmup_iters - freeze_iters)
        cosine = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * steps / len(steps)))
        self.schedule = np.concatenate((frozen, warmup, cosine))

        assert len(self.schedule) == self.total_iters

    def __getitem__(self, it):
        return self.final_value if it >= self.total_iters else self.schedule[it]
88
+
89
+
90
def has_batchnorms(model):
    """Return True if the model contains any batch-norm (or sync batch-norm) layer."""
    bn_types = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, nn.SyncBatchNorm)
    return any(isinstance(module, bn_types) for _, module in model.named_modules())
flow3r/models/flow3r.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from functools import partial
4
+ from copy import deepcopy
5
+
6
+ from .dinov2.layers import Mlp
7
+ from ..utils.geometry import homogenize_points
8
+ from .layers.pos_embed import RoPE2D, PositionGetter
9
+ from .layers.block import BlockRope
10
+ from .layers.attention import FlashAttentionRope
11
+ from .layers.transformer_head import TransformerDecoder, LinearPts3d, ContextTransformerDecoder
12
+ from .layers.camera_head import CameraHead
13
+ from .flow_head.dpt_head import DPTHead
14
+ from .dinov2.hub.backbones import dinov2_vitl14_reg
15
+
16
+
17
class Flow3r(nn.Module):
    """Multi-view 3D reconstruction + optical-flow model.

    A frozen-architecture DINOv2 ViT-L/14 encoder feeds a shared transformer
    decoder whose features branch into four heads: local 3D points,
    confidence, camera pose, and (optionally) 2D flow between frame pairs.
    """

    def __init__(
        self,
        pos_type='rope100',
        decoder_size='large',
    ):
        """
        Args:
            pos_type: positional-encoding spec; only 'rope<freq>' is supported
                (e.g. 'rope100' -> RoPE2D with freq=100).
            decoder_size: 'small' | 'base' | 'large' preset for the shared decoder.
        """
        super().__init__()

        # ----------------------
        # Encoder
        # ----------------------
        self.encoder = dinov2_vitl14_reg(pretrained=False)
        self.patch_size = 14
        # The mask token is only needed for masked-image modeling; drop it.
        del self.encoder.mask_token

        # ----------------------
        # Positonal Encoding
        # ----------------------
        self.pos_type = pos_type if pos_type is not None else 'none'
        self.rope = None
        if self.pos_type.startswith('rope'):  # eg rope100
            if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions")
            freq = float(self.pos_type[len('rope'):])
            self.rope = RoPE2D(freq=freq)
            self.position_getter = PositionGetter()
        else:
            raise NotImplementedError

        # ----------------------
        # Decoder
        # ----------------------
        if decoder_size == 'small':
            dec_embed_dim = 384
            dec_num_heads = 6
            mlp_ratio = 4
            dec_depth = 24
        elif decoder_size == 'base':
            dec_embed_dim = 768
            dec_num_heads = 12
            mlp_ratio = 4
            dec_depth = 24
        elif decoder_size == 'large':
            dec_embed_dim = 1024
            dec_num_heads = 16
            mlp_ratio = 4
            dec_depth = 36
        else:
            raise NotImplementedError
        self.decoder = nn.ModuleList([
            BlockRope(
                dim=dec_embed_dim,
                num_heads=dec_num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=True,
                proj_bias=True,
                ffn_bias=True,
                drop_path=0.0,
                norm_layer=partial(nn.LayerNorm, eps=1e-6),
                act_layer=nn.GELU,
                ffn_layer=Mlp,
                init_values=0.01,
                qk_norm=True,
                attn_class=FlashAttentionRope,
                rope=self.rope
            ) for _ in range(dec_depth)])
        self.dec_embed_dim = dec_embed_dim

        # ----------------------
        # Register_token
        # ----------------------
        # Learned per-frame special tokens prepended before the patch tokens.
        num_register_tokens = 5
        self.patch_start_idx = num_register_tokens
        self.register_token = nn.Parameter(torch.randn(1, 1, num_register_tokens, self.dec_embed_dim))
        nn.init.normal_(self.register_token, std=1e-6)

        # ----------------------
        # Local Points Decoder
        # ----------------------
        # Consumes the concatenation of the last two decoder layers (2 * dim).
        self.point_decoder = TransformerDecoder(
            in_dim=2*self.dec_embed_dim,
            dec_embed_dim=1024,
            dec_num_heads=16,
            out_dim=1024,
            rope=self.rope
        )
        self.point_head = LinearPts3d(patch_size=14, dec_embed_dim=1024, output_dim=3)

        # ----------------------
        # Camera Pose Decoder
        # ----------------------
        self.camera_decoder = TransformerDecoder(
            in_dim=2*self.dec_embed_dim,
            dec_embed_dim=1024,
            dec_num_heads=16,  # 8
            out_dim=512,
            rope=self.rope,
            use_checkpoint=False
        )
        self.camera_head = CameraHead(dim=512)

        # ----------------------
        # Motion Flow Decoder
        # ----------------------
        self.flow_head = DPTHead(
            patch_size=14,
            output_dim=2,
        )

        # ----------------------
        # Conf Decoder
        # ----------------------
        # Same architecture as the point decoder, separate weights.
        self.conf_decoder = deepcopy(self.point_decoder)
        self.conf_head = LinearPts3d(patch_size=14, dec_embed_dim=1024, output_dim=1)

        # For ImageNet Normalize
        image_mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
        image_std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)

        self.register_buffer("image_mean", image_mean)
        self.register_buffer("image_std", image_std)

    def decode(self, hidden, N, H, W):
        """Run the shared decoder over encoder tokens.

        Args:
            hidden: encoder patch tokens, shape (B*N, h*w, C).
            N: number of frames per sample; H, W: input image size in pixels.

        Returns:
            (features, pos): features are the concatenated outputs of the last
            two decoder layers, shape (B*N, tokens, 2*C); pos are the RoPE
            positions, shape (B*N, tokens, 2).
        """
        BN, hw, _ = hidden.shape
        B = BN // N

        final_output = []

        hidden = hidden.reshape(B*N, hw, -1)

        register_token = self.register_token.repeat(B, N, 1, 1).reshape(B*N, *self.register_token.shape[-2:])

        # Concatenate special tokens with patch tokens
        hidden = torch.cat([register_token, hidden], dim=1)
        hw = hidden.shape[1]

        if self.pos_type.startswith('rope'):
            pos = self.position_getter(B * N, H//self.patch_size, W//self.patch_size, hidden.device)

        if self.patch_start_idx > 0:
            # do not use position embedding for special tokens (camera and register tokens)
            # so set pos to 0 for the special tokens
            # (patch positions are shifted by +1 so that 0 stays reserved for
            # the special tokens prepended below)
            pos = pos + 1
            pos_special = torch.zeros(B * N, self.patch_start_idx, 2).to(hidden.device).to(pos.dtype)
            pos = torch.cat([pos_special, pos], dim=1)

        for i in range(len(self.decoder)):
            blk = self.decoder[i]

            # Alternate attention scope: even layers attend within a single
            # frame (B*N, hw), odd layers attend across all frames (B, N*hw).
            if i % 2 == 0:
                pos = pos.reshape(B*N, hw, -1)
                hidden = hidden.reshape(B*N, hw, -1)
            else:
                pos = pos.reshape(B, N*hw, -1)
                hidden = hidden.reshape(B, N*hw, -1)

            hidden = blk(hidden, xpos=pos)

            # Keep the outputs of the last two layers for the heads.
            if i+1 in [len(self.decoder)-1, len(self.decoder)]:
                final_output.append(hidden.reshape(B*N, hw, -1))

        return torch.cat([final_output[0], final_output[1]], dim=-1), pos.reshape(B*N, hw, -1)

    def forward(self, imgs, pair_indices=None):
        """Predict points, confidence, camera poses, and (optionally) flow.

        Args:
            imgs: images of shape (B, N, 3, H, W); H and W are assumed to be
                multiples of the 14-pixel patch size.
            pair_indices: frame-pair indices for the flow head; when None the
                flow branch is skipped and `flow` is returned as None.

        Returns:
            dict with world-frame `points`, camera-frame `local_points`,
            `conf`, per-frame 4x4 `camera_poses`, and `flow` (or None).
        """
        imgs = (imgs - self.image_mean) / self.image_std
        # print("the shape of imgs is", imgs.shape)

        B, N, _, H, W = imgs.shape
        patch_h, patch_w = H // 14, W // 14

        # encode by dinov2
        imgs = imgs.reshape(B*N, _, H, W)
        hidden = self.encoder(imgs, is_training=True)

        if isinstance(hidden, dict):
            hidden = hidden["x_norm_patchtokens"]

        hidden, pos = self.decode(hidden, N, H, W)

        point_hidden, point_intermediate = self.point_decoder(hidden, xpos=pos, return_intermediate=True)
        conf_hidden = self.conf_decoder(hidden, xpos=pos)
        camera_hidden, camera_intermediate = self.camera_decoder(hidden, xpos=pos, return_intermediate=True)

        # Heads run in fp32 regardless of any autocast context.
        with torch.amp.autocast(device_type='cuda', enabled=False):
            # local points
            point_hidden = point_hidden.float()
            ret = self.point_head([point_hidden[:, self.patch_start_idx:]], (H, W)).reshape(B, N, H, W, -1)
            xy, z = ret.split([2, 1], dim=-1)
            # Depth predicted in log space; exp keeps it positive.
            z = torch.exp(z)
            local_points = torch.cat([xy * z, z], dim=-1)

            # confidence
            conf_hidden = conf_hidden.float()
            conf = self.conf_head([conf_hidden[:, self.patch_start_idx:]], (H, W)).reshape(B, N, H, W, -1)

            # camera
            camera_hidden = camera_hidden.float()
            camera_poses = self.camera_head(camera_hidden[:, self.patch_start_idx:], patch_h, patch_w).reshape(B, N, 4, 4)

            # Flow
            if pair_indices is not None:
                flow = self.flow_head([t.float() for t in point_intermediate], [t.float() for t in camera_intermediate], pair_indices, self.patch_start_idx,(H, W), B, N)
            else:
                flow = None

            # unproject local points using camera poses
            points = torch.einsum('bnij, bnhwj -> bnhwi', camera_poses, homogenize_points(local_points))[..., :3]

        return dict(
            points=points,
            local_points=local_points,
            conf=conf,
            camera_poses=camera_poses,
            flow=flow,
        )
flow3r/models/flow_head/dpt_head.py ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ # Inspired by https://github.com/DepthAnything/Depth-Anything-V2
9
+
10
+
11
+ import os
12
+ from typing import List, Dict, Tuple, Union
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ from .utils import create_uv_grid, position_grid_to_embed
17
+
18
+
19
class DPTHead(nn.Module):
    """
    DPT head for dense prediction tasks.

    Follows "Vision Transformers for Dense Prediction"
    (https://arxiv.org/abs/2103.13413): transformer tokens from several layers
    are projected to multi-scale feature maps, fused through a RefineNet-style
    pyramid, and decoded into a dense per-pixel output. In this variant the
    head fuses per-frame patch tokens with camera tokens from a frame pair and
    predicts a dense flow map for each pair.

    Args:
        dim_in (int): Channel dimension of the incoming tokens.
        patch_size (int, optional): ViT patch size. Default is 14.
        output_dim (int, optional): Number of output channels. Default is 4.
        activation (str, optional): Activation type identifier. Default "inv_log".
            (Stored only; not applied inside this head.)
        conf_activation (str, optional): Confidence activation identifier.
            Default "expp1". (Stored only; not applied inside this head.)
        features (int, optional): Channel width of the fusion pyramid. Default 256.
        out_channels (List[int], optional): Projection widths for the four levels.
            Defaults to [256, 512, 1024, 1024].
        intermediate_layer_idx (List[int], optional): Indices of the aggregated-token
            layers used by the head. Defaults to [4, 11, 17, 23].
        pos_embed (bool, optional): Whether to add sinusoidal positional
            embeddings to intermediate feature maps. Default True.
        feature_only (bool, optional): If True, only `output_conv1` (full width)
            is built and no `output_conv2` prediction head exists.
            NOTE(review): `_dpt_fuse_and_predict` still calls `output_conv2`,
            so `forward` is only usable with feature_only=False — confirm intent.
        down_ratio (int, optional): Downscaling factor for the output resolution.
            Default is 1.
    """

    def __init__(
        self,
        dim_in: int = 1024,
        patch_size: int = 14,
        output_dim: int = 4,
        activation: str = "inv_log",
        conf_activation: str = "expp1",
        features: int = 256,
        out_channels: List[int] = None,
        intermediate_layer_idx: List[int] = None,
        pos_embed: bool = True,
        feature_only: bool = False,
        down_ratio: int = 1,
    ) -> None:
        super(DPTHead, self).__init__()
        # Avoid mutable default arguments; these lists are the original defaults.
        if out_channels is None:
            out_channels = [256, 512, 1024, 1024]
        if intermediate_layer_idx is None:
            intermediate_layer_idx = [4, 11, 17, 23]

        self.patch_size = patch_size
        self.activation = activation
        self.conf_activation = conf_activation
        self.pos_embed = pos_embed
        self.feature_only = feature_only
        self.down_ratio = down_ratio
        self.intermediate_layer_idx = intermediate_layer_idx
        self.dim_in = dim_in
        self.output_dim = output_dim

        # Fuses [cam_i, cam_j, patch_i] (3 * dim_in) back down to dim_in.
        self.mlp = nn.Sequential(
            nn.Linear(3 * dim_in, 2 * dim_in),
            nn.ReLU(),
            nn.Linear(2 * dim_in, 2 * dim_in),
            nn.ReLU(),
            nn.Linear(2 * dim_in, dim_in),
        )

        self.norm = nn.LayerNorm(dim_in)

        # Projection layers for each output channel from tokens.
        self.projects = nn.ModuleList(
            [
                nn.Conv2d(
                    in_channels=dim_in,
                    out_channels=oc,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                )
                for oc in out_channels
            ]
        )

        # Resize layers: 4x up, 2x up, identity, 2x down — the DPT multi-scale path.
        self.resize_layers = nn.ModuleList(
            [
                nn.ConvTranspose2d(
                    in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0
                ),
                nn.ConvTranspose2d(
                    in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0
                ),
                nn.Identity(),
                nn.Conv2d(
                    in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1
                ),
            ]
        )

        self.scratch = _make_scratch(
            out_channels,
            features,
            expand=False,
        )

        # Attach the RefineNet-style fusion blocks to scratch.
        self.scratch.stem_transpose = None
        self.scratch.refinenet1 = _make_fusion_block(features)
        self.scratch.refinenet2 = _make_fusion_block(features)
        self.scratch.refinenet3 = _make_fusion_block(features)
        # The deepest level has no finer skip input.
        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False)

        head_features_1 = features
        head_features_2 = 32

        if feature_only:
            self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1, kernel_size=3, stride=1, padding=1)
        else:
            self.scratch.output_conv1 = nn.Conv2d(
                head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
            )
            conv2_in_channels = head_features_1 // 2

            self.scratch.output_conv2 = nn.Sequential(
                nn.Conv2d(conv2_in_channels, head_features_2, kernel_size=3, stride=1, padding=1),
                nn.ReLU(inplace=True),
                nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
            )

    def forward(
        self,
        patch_intermediate_4: List[torch.Tensor],  # len=4, each (B*N, tokens, dim_in)
        camera_intermediate_4: List[torch.Tensor],  # len=4, each (B*N, tokens, dim_in)
        pair_indices: torch.Tensor,  # (B, S, 2)
        patch_start_idx: int,
        img_shape: Tuple[int, int],
        B: int,
        N: int,
    ) -> torch.Tensor:
        """
        Fuse four intermediate token layers into a dense per-pair flow prediction.

        Args:
            patch_intermediate_4 (List[Tensor]): Four patch-token tensors from
                different transformer layers, each (B*N, tokens, dim_in).
            camera_intermediate_4 (List[Tensor]): Four camera-token tensors,
                each (B*N, tokens, dim_in).
            pair_indices (Tensor): (B, S, 2) frame-index pairs per flow map.
            patch_start_idx (int): Index where patch tokens begin; earlier
                tokens are special tokens (e.g. camera/register tokens).
            img_shape (Tuple[int, int]): (H, W) of the input images.
            B (int): Batch size.
            N (int): Number of frames per sample.

        Returns:
            Tensor: Flow of shape (B, S, H, W, output_dim).
        """
        feats_4 = []
        for l in range(4):
            feat_l = self._fuse_one_layer(
                patch_intermediate_4[l],
                camera_intermediate_4[l],
                patch_start_idx,
                pair_indices,
                img_shape,
                B,
                N,
            )
            feats_4.append(feat_l)

        flow = self._dpt_fuse_and_predict(feats_4, img_shape)  # (B*S, output_dim, H, W)

        H, W = img_shape
        S = pair_indices.shape[1]
        return flow.permute(0, 2, 3, 1).reshape(B, S, H, W, self.output_dim)

    def _apply_pos_embed(self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
        """Add a scaled sinusoidal positional embedding to feature map x (B, C, h, w)."""
        patch_w = x.shape[-1]
        patch_h = x.shape[-2]
        pos_embed = create_uv_grid(patch_w, patch_h, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
        pos_embed = position_grid_to_embed(pos_embed, x.shape[1])
        pos_embed = pos_embed * ratio
        pos_embed = pos_embed.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
        return x + pos_embed

    def _fuse_one_layer(
        self,
        patch_hidden_l: torch.Tensor,  # (B*N, tokens, dim_in)
        camera_hidden_l: torch.Tensor,  # (B*N, tokens, dim_in)
        patch_start_idx: int,
        pair_indices: torch.Tensor,  # (B, S, 2)
        img_shape: Tuple[int, int],
        B: int,
        N: int,
    ) -> torch.Tensor:
        """
        Combine source-frame patch tokens with source/target camera tokens for
        one intermediate layer.

        Returns:
            Tensor: feature map of shape (B*S, dim_in, patch_h, patch_w).
        """
        H, W = img_shape
        hw = patch_hidden_l[:, patch_start_idx:].shape[1]
        patch_h, patch_w = H // self.patch_size, W // self.patch_size
        assert hw == patch_h * patch_w, (hw, patch_h, patch_w)

        # Drop special tokens and reshape to (B, N, hw, C).
        patch_hidden_l = patch_hidden_l[:, patch_start_idx:].reshape(B, N, hw, self.dim_in)
        camera_hidden_l = camera_hidden_l[:, patch_start_idx:].reshape(B, N, hw, self.dim_in)

        S = pair_indices.shape[1]
        batch_idx = torch.arange(B, device=pair_indices.device).unsqueeze(1).expand(B, S)
        idx_i = pair_indices[:, :, 0]
        idx_j = pair_indices[:, :, 1]

        patch_i = patch_hidden_l[batch_idx, idx_i]  # (B, S, hw, dim_in)
        cam_i = camera_hidden_l[batch_idx, idx_i]  # (B, S, hw, dim_in)
        cam_j = camera_hidden_l[batch_idx, idx_j]  # (B, S, hw, dim_in)

        # Average cam_j to get a single camera token per pair, broadcast over hw.
        cam_j = cam_j.mean(dim=2, keepdim=True).expand(-1, -1, hw, -1)

        # Concatenate and flatten the pair dimension into the batch.
        concat = torch.cat([cam_i, cam_j, patch_i], dim=-1)  # (B, S, hw, 3*dim_in)
        x = concat.reshape(B * S, hw, 3 * self.dim_in)

        # MLP fuse + norm.
        x = self.mlp(x)  # (B*S, hw, dim_in)
        x = self.norm(x)

        # Tokens -> spatial grid.
        feat = x.transpose(1, 2).reshape(B * S, self.dim_in, patch_h, patch_w)
        return feat

    def _dpt_fuse_and_predict(
        self,
        feats_4: List[torch.Tensor],
        img_shape: Tuple[int, int],
    ) -> torch.Tensor:
        """
        Run the standard DPT fusion pyramid and predict flow.

        Args:
            feats_4 (List[Tensor]): Four (B*S, dim_in, ph, pw) feature maps.
            img_shape (Tuple[int, int]): Target (H, W) resolution.

        Returns:
            Tensor: (B*S, output_dim, H, W).
        """
        H, W = img_shape
        out = []
        for i in range(4):
            x = feats_4[i]  # (B*S, dim_in, ph, pw)
            x = self.projects[i](x)
            if self.pos_embed:
                x = self._apply_pos_embed(x, W, H)
            x = self.resize_layers[i](x)  # multi-scale path
            out.append(x)

        x = self.scratch_forward(out)  # (B*S, features//2, ...)
        x = custom_interpolate(
            x, (H, W),
            mode="bilinear",
            align_corners=True,
        )
        if self.pos_embed:
            # BUG FIX: this result was previously assigned to `out` (the stale
            # per-level feature list) and `output_conv2` was then fed `out`,
            # which crashed whenever pos_embed was False. Keep everything in `x`.
            x = self._apply_pos_embed(x, W, H)

        flow = self.scratch.output_conv2(x)  # (B*S, output_dim, H, W)
        return flow

    def scratch_forward(self, features: List[torch.Tensor]) -> torch.Tensor:
        """
        Forward pass through the fusion blocks.

        Args:
            features (List[Tensor]): Feature maps from the four levels,
                finest first.

        Returns:
            Tensor: Fused feature map after output_conv1.
        """
        layer_1, layer_2, layer_3, layer_4 = features

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        # Coarse-to-fine fusion; free intermediates eagerly to limit peak memory.
        out = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
        del layer_4_rn, layer_4

        out = self.scratch.refinenet3(out, layer_3_rn, size=layer_2_rn.shape[2:])
        del layer_3_rn, layer_3

        out = self.scratch.refinenet2(out, layer_2_rn, size=layer_1_rn.shape[2:])
        del layer_2_rn, layer_2

        out = self.scratch.refinenet1(out, layer_1_rn)
        del layer_1_rn, layer_1

        out = self.scratch.output_conv1(out)
        return out
305
+
306
+
307
+
308
+ ################################################################################
309
+ # Modules
310
+ ################################################################################
311
+
312
+
313
def _make_fusion_block(features: int, size: int = None, has_residual: bool = True, groups: int = 1) -> nn.Module:
    """Build a FeatureFusionBlock with the fixed defaults used by this head."""
    act = nn.ReLU(inplace=True)
    return FeatureFusionBlock(
        features,
        act,
        deconv=False,
        bn=False,
        expand=False,
        align_corners=True,
        size=size,
        has_residual=has_residual,
        groups=groups,
    )
325
+
326
+
327
+ def _make_scratch(in_shape: List[int], out_shape: int, groups: int = 1, expand: bool = False) -> nn.Module:
328
+ scratch = nn.Module()
329
+ out_shape1 = out_shape
330
+ out_shape2 = out_shape
331
+ out_shape3 = out_shape
332
+ if len(in_shape) >= 4:
333
+ out_shape4 = out_shape
334
+
335
+ if expand:
336
+ out_shape1 = out_shape
337
+ out_shape2 = out_shape * 2
338
+ out_shape3 = out_shape * 4
339
+ if len(in_shape) >= 4:
340
+ out_shape4 = out_shape * 8
341
+
342
+ scratch.layer1_rn = nn.Conv2d(
343
+ in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
344
+ )
345
+ scratch.layer2_rn = nn.Conv2d(
346
+ in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
347
+ )
348
+ scratch.layer3_rn = nn.Conv2d(
349
+ in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
350
+ )
351
+ if len(in_shape) >= 4:
352
+ scratch.layer4_rn = nn.Conv2d(
353
+ in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
354
+ )
355
+ return scratch
356
+
357
+
358
class ResidualConvUnit(nn.Module):
    """Pre-activation residual unit: x + conv2(act(conv2_input)) over two 3x3 convs."""

    def __init__(self, features, activation, bn, groups=1):
        """
        Args:
            features (int): Number of input/output channels.
            activation (nn.Module): Activation applied before each conv.
            bn (bool): Stored flag; no norm layers are instantiated here.
            groups (int): Conv groups.
        """
        super().__init__()

        self.bn = bn
        self.groups = groups
        self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
        self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)

        # Placeholders only; kept None so the norm branches below are no-ops.
        self.norm1 = None
        self.norm2 = None

        self.activation = activation
        # FloatFunctional keeps the residual add quantization-friendly.
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        """Apply act->conv1->(norm)->act->conv2->(norm) and add the input back."""
        branch = self.conv1(self.activation(x))
        if self.norm1 is not None:
            branch = self.norm1(branch)

        branch = self.conv2(self.activation(branch))
        if self.norm2 is not None:
            branch = self.norm2(branch)

        return self.skip_add.add(branch, x)
401
+
402
+
403
class FeatureFusionBlock(nn.Module):
    """Fuses a coarse feature map with an optional skip branch, then upsamples."""

    def __init__(
        self,
        features,
        activation,
        deconv=False,
        bn=False,
        expand=False,
        align_corners=True,
        size=None,
        has_residual=True,
        groups=1,
    ):
        """
        Args:
            features (int): Number of input channels.
            activation (nn.Module): Activation used inside the residual units.
            deconv (bool): Stored flag (unused in forward).
            bn (bool): Passed through to the residual units.
            expand (bool): If True, halve the channel count in the output conv.
            align_corners (bool): Interpolation flag for the upsample.
            size: Fixed output size for the upsample, if any.
            has_residual (bool): Whether a skip input (xs[1]) is fused in.
            groups (int): Conv groups.
        """
        super(FeatureFusionBlock, self).__init__()

        self.deconv = deconv
        self.align_corners = align_corners
        self.groups = groups
        self.expand = expand

        out_features = features // 2 if expand else features
        self.out_conv = nn.Conv2d(
            features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=self.groups
        )

        if has_residual:
            self.resConfUnit1 = ResidualConvUnit(features, activation, bn, groups=self.groups)
        self.has_residual = has_residual
        self.resConfUnit2 = ResidualConvUnit(features, activation, bn, groups=self.groups)

        self.skip_add = nn.quantized.FloatFunctional()
        self.size = size

    def forward(self, *xs, size=None):
        """
        Fuse xs[0] (and xs[1] when has_residual), resize, and project.

        Returns:
            Tensor: fused, resized feature map.
        """
        fused = xs[0]

        if self.has_residual:
            fused = self.skip_add.add(fused, self.resConfUnit1(xs[1]))

        fused = self.resConfUnit2(fused)

        # Explicit `size` wins, then the configured self.size, else 2x upsample.
        if size is not None:
            resize_kwargs = {"size": size}
        elif self.size is not None:
            resize_kwargs = {"size": self.size}
        else:
            resize_kwargs = {"scale_factor": 2}

        fused = custom_interpolate(fused, **resize_kwargs, mode="bilinear", align_corners=self.align_corners)
        return self.out_conv(fused)
471
+
472
+
473
def custom_interpolate(
    x: torch.Tensor,
    size: Tuple[int, int] = None,
    scale_factor: float = None,
    mode: str = "bilinear",
    align_corners: bool = True,
) -> torch.Tensor:
    """
    Interpolate like nn.functional.interpolate, but chunk along the batch
    dimension when the output would exceed an INT_MAX-related element limit.
    """
    if size is None:
        size = (int(x.shape[-2] * scale_factor), int(x.shape[-1] * scale_factor))

    INT_MAX = 1610612736
    total_elements = x.shape[0] * x.shape[1] * size[0] * size[1]

    if total_elements <= INT_MAX:
        return nn.functional.interpolate(x, size=size, mode=mode, align_corners=align_corners)

    # Split into enough batch chunks that each interpolate call stays small.
    num_chunks = total_elements // INT_MAX + 1
    resized = [
        nn.functional.interpolate(piece, size=size, mode=mode, align_corners=align_corners)
        for piece in torch.chunk(x, chunks=num_chunks, dim=0)
    ]
    return torch.cat(resized, dim=0).contiguous()
flow3r/models/flow_head/utils.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+
11
def position_grid_to_embed(pos_grid: torch.Tensor, embed_dim: int, omega_0: float = 100) -> torch.Tensor:
    """
    Convert a 2D position grid (H, W, 2) into sinusoidal embeddings (H, W, embed_dim).

    Args:
        pos_grid: Tensor of shape (H, W, 2) of 2D coordinates.
        embed_dim: Output channel dimension (split evenly between x and y).
        omega_0: Base frequency passed through to make_sincos_pos_embed.

    Returns:
        Tensor of shape (H, W, embed_dim).
    """
    H, W, grid_dim = pos_grid.shape
    assert grid_dim == 2
    coords = pos_grid.reshape(-1, grid_dim)  # (H*W, 2)

    # Half of the channels encode x, the other half y.
    half = embed_dim // 2
    emb_x = make_sincos_pos_embed(half, coords[:, 0], omega_0=omega_0)  # (H*W, half)
    emb_y = make_sincos_pos_embed(half, coords[:, 1], omega_0=omega_0)  # (H*W, half)

    return torch.cat([emb_x, emb_y], dim=-1).view(H, W, embed_dim)
+ return emb.view(H, W, embed_dim) # [H, W, D]
34
+
35
+
36
def make_sincos_pos_embed(embed_dim: int, pos: torch.Tensor, omega_0: float = 100) -> torch.Tensor:
    """
    Generate a 1D sine/cosine positional embedding for the given positions.

    Args:
        embed_dim: Embedding dimension (must be even; half sin, half cos).
        pos: Tensor of positions, flattened to (M,).
        omega_0: Base of the geometric frequency progression.

    Returns:
        Float tensor of shape (M, embed_dim): [sin(pos*w), cos(pos*w)].
    """
    assert embed_dim % 2 == 0
    half = embed_dim // 2

    # Frequencies omega_0 ** (-i / half), computed in double for accuracy.
    freqs = 1.0 / omega_0 ** (torch.arange(half, dtype=torch.double, device=pos.device) / half)

    # Outer product of positions with frequencies -> phase matrix (M, half).
    phases = torch.einsum("m,d->md", pos.reshape(-1), freqs)

    return torch.cat([torch.sin(phases), torch.cos(phases)], dim=1).float()
+ return emb.float()
60
+
61
+
62
+ # Inspired by https://github.com/microsoft/moge
63
+
64
+
65
+ def create_uv_grid(
66
+ width: int, height: int, aspect_ratio: float = None, dtype: torch.dtype = None, device: torch.device = None
67
+ ) -> torch.Tensor:
68
+ """
69
+ Create a normalized UV grid of shape (width, height, 2).
70
+
71
+ The grid spans horizontally and vertically according to an aspect ratio,
72
+ ensuring the top-left corner is at (-x_span, -y_span) and the bottom-right
73
+ corner is at (x_span, y_span), normalized by the diagonal of the plane.
74
+
75
+ Args:
76
+ width (int): Number of points horizontally.
77
+ height (int): Number of points vertically.
78
+ aspect_ratio (float, optional): Width-to-height ratio. Defaults to width/height.
79
+ dtype (torch.dtype, optional): Data type of the resulting tensor.
80
+ device (torch.device, optional): Device on which the tensor is created.
81
+
82
+ Returns:
83
+ torch.Tensor: A (width, height, 2) tensor of UV coordinates.
84
+ """
85
+ # Derive aspect ratio if not explicitly provided
86
+ if aspect_ratio is None:
87
+ aspect_ratio = float(width) / float(height)
88
+
89
+ # Compute normalized spans for X and Y
90
+ diag_factor = (aspect_ratio**2 + 1.0) ** 0.5
91
+ span_x = aspect_ratio / diag_factor
92
+ span_y = 1.0 / diag_factor
93
+
94
+ # Establish the linspace boundaries
95
+ left_x = -span_x * (width - 1) / width
96
+ right_x = span_x * (width - 1) / width
97
+ top_y = -span_y * (height - 1) / height
98
+ bottom_y = span_y * (height - 1) / height
99
+
100
+ # Generate 1D coordinates
101
+ x_coords = torch.linspace(left_x, right_x, steps=width, dtype=dtype, device=device)
102
+ y_coords = torch.linspace(top_y, bottom_y, steps=height, dtype=dtype, device=device)
103
+
104
+ # Create 2D meshgrid (width x height) and stack into UV
105
+ uu, vv = torch.meshgrid(x_coords, y_coords, indexing="xy")
106
+ uv_grid = torch.stack((uu, vv), dim=-1)
107
+
108
+ return uv_grid
flow3r/models/layers/attention.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ import logging
11
+ import os
12
+ import warnings
13
+
14
+ from torch import Tensor
15
+ from torch import nn
16
+ import torch
17
+
18
+ from torch.nn.functional import scaled_dot_product_attention
19
+ from torch.nn.attention import SDPBackend
20
+
21
+ XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
22
+ try:
23
+ if XFORMERS_ENABLED:
24
+ from xformers.ops import memory_efficient_attention, unbind
25
+
26
+ XFORMERS_AVAILABLE = True
27
+ # warnings.warn("xFormers is available (Attention)")
28
+ else:
29
+ # warnings.warn("xFormers is disabled (Attention)")
30
+ raise ImportError
31
+ except ImportError:
32
+ XFORMERS_AVAILABLE = False
33
+ # warnings.warn("xFormers is not available (Attention)")
34
+
35
+
36
class Attention(nn.Module):
    """Plain multi-head self-attention with an explicit softmax(QK^T)V path."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.scale = (dim // num_heads) ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        """Self-attend over x of shape (B, N, C). `attn_bias` is ignored here."""
        B, N, C = x.shape
        head_dim = C // self.num_heads
        # (B, N, 3C) -> (3, B, heads, N, head_dim)
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)

        scores = (q * self.scale) @ k.transpose(-2, -1)
        weights = self.attn_drop(scores.softmax(dim=-1))

        out = (weights @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(out))
+ return x
70
+
71
+
72
class MemEffAttention(Attention):
    """Attention variant that routes through xFormers memory_efficient_attention."""

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            # attn_bias is guaranteed None here, so the base path is equivalent.
            return super().forward(x)

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
        q, k, v = qkv.unbind(dim=2)

        out = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        out = out.reshape([B, N, C])

        return self.proj_drop(self.proj(out))
+ return x
91
+
92
+
93
+
94
class FlashAttention(Attention):
    """Attention variant using torch SDPA (flash backend when inputs are bf16)."""

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        # NOTE(review): attn_bias is accepted but never forwarded to SDPA —
        # confirm callers never pass a bias on this path.
        B, N, C = x.shape
        # (B, N, 3C) -> (B, heads, 3, N, head_dim)
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1, 3)
        q, k, v = (qkv[:, :, i] for i in range(3))

        backends = (
            SDPBackend.FLASH_ATTENTION
            if q.dtype == torch.bfloat16
            else [SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]
        )
        with nn.attention.sdpa_kernel(backends):
            out = scaled_dot_product_attention(q, k, v)

        out = out.transpose(1, 2).reshape([B, N, C])
        return self.proj_drop(self.proj(out))
+ return x
114
+
115
+
116
+ """
117
+ Following is written by GPT-4o
118
+ """
119
class CrossAttentionRope(nn.Module):
    """Multi-head cross-attention with optional rotary embedding and QK norm."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        qk_norm: bool = False,
        norm_layer: nn.Module = nn.LayerNorm,
        rope=None,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        # Separate projections so query and key/value may come from different sources.
        self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)

        self.q_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(head_dim) if qk_norm else nn.Identity()

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

        self.rope = rope

    def forward(self, query: Tensor, key: Tensor, value: Tensor, attn_bias=None, qpos=None, kpos=None) -> Tensor:
        """
        Args:
            query: (B, N, C) query tokens.
            key: (B, M, C) key tokens.
            value: (B, M, C) value tokens.
            attn_bias: optional additive attention bias.
            qpos/kpos: positions passed to the rotary embedding, if configured.

        Returns:
            (B, N, C) attended output.
        """
        B, N, C = query.shape
        M = key.shape[1]
        head_dim = C // self.num_heads

        # Project and reshape to (B, heads, seq, head_dim).
        q = self.q_proj(query).reshape(B, N, self.num_heads, head_dim).permute(0, 2, 1, 3)
        k = self.k_proj(key).reshape(B, M, self.num_heads, head_dim).permute(0, 2, 1, 3)
        v = self.v_proj(value).reshape(B, M, self.num_heads, head_dim).permute(0, 2, 1, 3)
        q = self.q_norm(q).to(v.dtype)
        k = self.k_norm(k).to(v.dtype)

        if self.rope is not None:
            q = self.rope(q, qpos)
            k = self.rope(k, kpos)

        scores = (q * self.scale) @ k.transpose(-2, -1)  # (B, heads, N, M)
        if attn_bias is not None:
            scores = scores + attn_bias

        weights = self.attn_drop(scores.softmax(dim=-1))

        out = (weights @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(out))
+ return x
192
+
193
+
194
class MemEffCrossAttentionRope(CrossAttentionRope):
    """Cross-attention with RoPE using xFormers memory_efficient_attention."""

    def forward(self, query: Tensor, key: Tensor, value: Tensor, attn_bias=None, qpos=None, kpos=None) -> Tensor:
        """
        Args:
            query: (B, N, C) query tokens.
            key: (B, M, C) key tokens.
            value: (B, M, C) value tokens.
            attn_bias: optional attention bias (requires xFormers).
            qpos/kpos: positions passed to the rotary embedding, if configured.

        Returns:
            (B, N, C) attended output.
        """
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            # BUG FIX: qpos/kpos were previously dropped on this fallback path,
            # silently disabling the rotary embedding when xFormers is missing.
            return super().forward(query, key, value, attn_bias, qpos=qpos, kpos=kpos)

        B, N, C = query.shape
        _, M, _ = key.shape

        # Project query, key, and value to (B, seq, heads, head_dim).
        q = self.q_proj(query).reshape(B, N, self.num_heads, C // self.num_heads)
        k = self.k_proj(key).reshape(B, M, self.num_heads, C // self.num_heads)
        v = self.v_proj(value).reshape(B, M, self.num_heads, C // self.num_heads)

        # QK norm and rope operate on (B, heads, seq, head_dim).
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        q, k = self.q_norm(q).to(v.dtype), self.k_norm(k).to(v.dtype)

        if self.rope is not None:
            q = self.rope(q, qpos)
            k = self.rope(k, kpos)

        # xFormers expects (B, seq, heads, head_dim).
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape(B, N, C)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
+ return x
237
+
238
class FlashCrossAttentionRope(CrossAttentionRope):
    """Cross-attention with RoPE routed through torch SDPA (flash backend for bf16)."""

    def forward(self, query: Tensor, key: Tensor, value: Tensor, attn_bias=None, qpos=None, kpos=None) -> Tensor:
        B, N, C = query.shape
        M = key.shape[1]
        head_dim = C // self.num_heads

        # Project query/key/value and move to (B, num_heads, seq, head_dim).
        q = self.q_proj(query).reshape(B, N, self.num_heads, head_dim).permute(0, 2, 1, 3)
        k = self.k_proj(key).reshape(B, M, self.num_heads, head_dim).permute(0, 2, 1, 3)
        v = self.v_proj(value).reshape(B, M, self.num_heads, head_dim).permute(0, 2, 1, 3)

        q = self.q_norm(q).to(v.dtype)
        k = self.k_norm(k).to(v.dtype)
        if self.rope is not None:
            q = self.rope(q, qpos)
            k = self.rope(k, kpos)

        # Dropout only during training, matching nn.Dropout semantics.
        dropout_p = self.attn_drop.p if self.training else 0.0

        backends = (
            SDPBackend.FLASH_ATTENTION
            if q.dtype == torch.bfloat16
            else [SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]
        )
        with nn.attention.sdpa_kernel(backends):
            out = scaled_dot_product_attention(q, k, v, attn_mask=attn_bias, dropout_p=dropout_p)

        out = out.transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(out))
+ return x
271
+
272
class AttentionRope(nn.Module):
    """Multi-head self-attention with optional rotary embedding and QK norm."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        qk_norm: bool = False,
        norm_layer: nn.Module = nn.LayerNorm,
        rope=None,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

        self.q_norm = norm_layer(head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(head_dim) if qk_norm else nn.Identity()

        self.rope = rope

    def forward(self, x: Tensor, attn_bias=None, xpos=None) -> Tensor:
        """
        Self-attend over x of shape (B, N, C); `xpos` feeds the rotary
        embedding when one is configured. `attn_bias` is ignored here.
        """
        B, N, C = x.shape
        head_dim = C // self.num_heads
        # (B, N, 3C) -> (3, B, heads, N, head_dim)
        q, k, v = self.qkv(x).reshape(B, N, 3, self.num_heads, head_dim).permute(2, 0, 3, 1, 4).unbind(0)
        q = self.q_norm(q).to(v.dtype)
        k = self.k_norm(k).to(v.dtype)

        if self.rope is not None:
            q = self.rope(q, xpos)
            k = self.rope(k, xpos)

        scores = (q * self.scale) @ k.transpose(-2, -1)
        weights = self.attn_drop(scores.softmax(dim=-1))

        out = (weights @ v).transpose(1, 2).reshape(B, N, C)
        return self.proj_drop(self.proj(out))
+ return x
320
+
321
+
322
class MemEffAttentionRope(AttentionRope):
    """Self-attention using xFormers' memory-efficient kernel.

    Falls back to the parent's plain attention when xFormers is unavailable.
    """

    def forward(self, x: Tensor, attn_bias=None, xpos=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            # Fixed: the original fallback called `super().forward(x)` and
            # dropped `xpos`, so RoPE was silently skipped (or crashed) on the
            # non-xFormers path.  Forward it so both paths behave the same.
            return super().forward(x, xpos=xpos)

        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        # (B, N, 3, heads, hd) -> (B, heads, 3, N, hd), then split q/k/v.
        qkv = qkv.transpose(1, 3)
        q, k, v = [qkv[:, :, i] for i in range(3)]
        q, k = self.q_norm(q).to(v.dtype), self.k_norm(k).to(v.dtype)

        if self.rope is not None:
            q = self.rope(q, xpos)
            k = self.rope(k, xpos)

        # memory_efficient_attention expects a (B, N, heads, head_dim) layout.
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
355
+
356
+
357
class FlashAttentionRope(AttentionRope):
    """Self-attention routed through torch's fused SDPA kernels.

    NOTE(review): unlike FlashCrossAttentionRope, this class does not pass
    `attn_drop` or `attn_bias` to scaled_dot_product_attention — attention
    dropout and masks are silently ignored here; confirm this is intended.
    """

    def forward(self, x: Tensor, attn_bias=None, xpos=None) -> Tensor:
        B, N, C = x.shape
        # Fused qkv projection: (B, N, 3C) -> (B, heads, 3, N, head_dim).
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1, 3)

        # q, k, v = unbind(qkv, 2)
        q, k, v = [qkv[:,:,i] for i in range(3)]
        q, k = self.q_norm(q).to(v.dtype), self.k_norm(k).to(v.dtype)

        if self.rope is not None:
            # Rotary position embedding on queries and keys.
            q = self.rope(q, xpos)
            k = self.rope(k, xpos)

        if q.dtype == torch.bfloat16:
            # bf16 inputs can use the flash-attention backend.
            with nn.attention.sdpa_kernel(SDPBackend.FLASH_ATTENTION):
                x = scaled_dot_product_attention(q, k, v)
        else:
            # Other dtypes fall back to math / memory-efficient backends.
            with nn.attention.sdpa_kernel([SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION]):
                x = scaled_dot_product_attention(q, k, v)

        # Back to the (B, N, C) token layout.
        x = x.transpose(1, 2).reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)
        return x
382
+
383
def get_attn_score(blk_class, x, frame_num, token_length, xpos=None):
    """Compute a per-frame attention score from a block's attention layer.

    Re-runs the block's qkv projection (including qk-norm and RoPE) on the
    pre-normed tokens, then reduces the raw (pre-softmax) attention logits:
    sum over heads, mean over the token dimensions of each frame pair, sum
    over target frames — giving one score per frame.

    Assumes N == frame_num * token_length — TODO confirm with callers.
    """
    x = blk_class.norm1(x)

    B, N, C = x.shape
    qkv = blk_class.attn.qkv(x).reshape(B, N, 3, blk_class.attn.num_heads, C // blk_class.attn.num_heads)

    # (B, N, 3, heads, hd) -> (B, heads, 3, N, hd)
    qkv = qkv.transpose(1, 3)
    # q, k, v = unbind(qkv, 2)
    q, k, v = [qkv[:,:,i] for i in range(3)]
    q, k = blk_class.attn.q_norm(q).to(v.dtype), blk_class.attn.k_norm(k).to(v.dtype)

    if blk_class.attn.rope is not None:
        q = blk_class.attn.rope(q, xpos)
        k = blk_class.attn.rope(k, xpos)

    # -> (B, N, heads, hd); permuted back to (B, heads, N, hd) in the matmul below.
    q = q.transpose(1, 2)
    k = k.transpose(1, 2)

    # (B, heads, N, N) logits -> sum heads -> (B, N, N)
    # -> (B, frame, tok, frame, tok) -> mean over tokens -> sum over target frames.
    score = (q.permute(0, 2, 1, 3) * blk_class.attn.scale @ k.permute(0, 2, 1, 3).transpose(-2, -1)).sum(dim=1).reshape(B, frame_num, token_length, frame_num, token_length).mean(dim=[2, 4]).sum(-1)

    return score
flow3r/models/layers/block.py ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
9
+
10
+ import logging
11
+ import os
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+ import warnings
14
+
15
+ import torch
16
+ from torch import nn, Tensor
17
+
18
+ from .attention import Attention, MemEffAttention, CrossAttentionRope, MemEffCrossAttentionRope, FlashAttentionRope
19
+ from ..dinov2.layers.drop_path import DropPath
20
+ from ..dinov2.layers.layer_scale import LayerScale
21
+ from ..dinov2.layers.mlp import Mlp
22
+
23
+
24
# xFormers can be disabled explicitly by setting the XFORMERS_DISABLED env var;
# otherwise availability is probed by attempting the import below.
XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import fmha, scaled_index_add, index_select_cat

        XFORMERS_AVAILABLE = True
        # warnings.warn("xFormers is available (Block)")
    else:
        # warnings.warn("xFormers is disabled (Block)")
        raise ImportError
except ImportError:
    XFORMERS_AVAILABLE = False
    # warnings.warn("xFormers is not available (Block)")
37
+
38
+
39
class Block(nn.Module):
    """Standard pre-norm transformer block (attention + MLP, residual adds).

    Supports LayerScale (enabled via `init_values`) and stochastic depth
    (`drop_path`); above a 0.1 drop rate a batched sample-dropping scheme
    replaces per-sample masking for efficiency.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        # LayerScale only when init_values is given; DropPath only when rate > 0.
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor) -> Tensor:
        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            # Fixed: the FFN branch now uses drop_path2 (was drop_path1, per the
            # old FIXME).  Behavior is equivalent — both modules share the same
            # rate and sample independently — but this matches the intent.
            x = x + self.drop_path2(ffn_residual_func(x))
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x
112
+
113
+
114
def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
) -> Tensor:
    """Batch-level stochastic depth.

    Runs the residual branch on a random subset of the batch only, then
    adds it back rescaled by batch/subset so the expectation is unchanged.
    """
    batch = x.shape[0]

    # 1) choose a random subset of samples (never empty).
    keep = max(int(batch * (1 - sample_drop_ratio)), 1)
    kept_indices = torch.randperm(batch, device=x.device)[:keep]

    # 2) residual branch runs on the kept samples only.
    branch_out = residual_func(x[kept_indices]).flatten(1)

    # 3) scatter-add the rescaled residual back into the full batch.
    scale = batch / keep
    result = torch.index_add(
        x.flatten(1), 0, kept_indices, branch_out.to(dtype=x.dtype), alpha=scale
    )
    return result.view_as(x)
136
+
137
+
138
def get_branges_scales(x, sample_drop_ratio=0.0):
    """Return (random subset indices, rescale factor) for stochastic depth.

    The subset has size max(int(B * (1 - ratio)), 1); the factor B/subset
    keeps the residual's expected magnitude unchanged.
    """
    batch = x.shape[0]
    keep = max(int(batch * (1 - sample_drop_ratio)), 1)
    subset = torch.randperm(batch, device=x.device)[:keep]
    return subset, batch / keep
144
+
145
+
146
def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    """Add a per-subset residual back into `x` at rows `brange`.

    Without a scaling vector the result is returned flattened to (B, N*D)
    (callers view_as afterwards); with one, xFormers' fused scaled_index_add
    also applies the LayerScale gamma and keeps the original shape.
    """
    if scaling_vector is None:
        # Pure-torch path: index_add on flattened tokens.
        return torch.index_add(
            x.flatten(1), 0, brange, residual.flatten(1).to(dtype=x.dtype), alpha=residual_scale_factor
        )
    # Fused xFormers path (residual scaled per-channel by scaling_vector).
    return scaled_index_add(
        x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
    )
156
+
157
+
158
# Module-level cache mapping a tuple of (batch_size, seq_len) pairs to the
# xFormers BlockDiagonalMask built for that configuration.
attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache
    """
    # Effective batch size per tensor: the selected subset size when branges
    # is given, otherwise the full batch.
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache.keys():
        # One sequence-length entry per (sample, tensor) pair; the resulting
        # block-diagonal mask prevents attention across samples.
        seqlens = []
        for b, x in zip(batch_sizes, x_list):
            for _ in range(b):
                seqlens.append(x.shape[1])
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        # Fused select+concat into a single (1, total_tokens, D) tensor.
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        # Concatenate everything along the token dimension at batch size 1.
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors
183
+
184
+
185
def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> List[Tensor]:
    """Stochastic-depth residual add over a list of token tensors.

    Each tensor gets its own random sample subset; the subsets are packed
    into one ragged batch (with a block-diagonal attention bias), the
    residual branch runs once over the packed batch, and the pieces are
    scattered back into their inputs.

    Fixed: the return annotation said `Tensor`, but the function returns one
    tensor per input (a list).
    """
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func once over the packed batch, split per input
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    outputs = []
    for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
        outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
    return outputs
206
+
207
+
208
class NestedTensorBlock(Block):
    """Block that can also process a list of token tensors in one pass by
    packing them into a single "nested" sequence with a block-diagonal
    attention mask (requires xFormers)."""

    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                # Fixed: this line checked isinstance(self.ls1, ...) while
                # reading self.ls2.gamma.  ls1/ls2 are constructed together so
                # behavior was unchanged, but the check now gates the value it
                # actually reads.
                scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            # Pack, run both residual branches once, then split back per input.
            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        """Dispatch: single Tensor -> plain Block.forward; list -> nested path."""
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            if not XFORMERS_AVAILABLE:
                raise AssertionError("xFormers is required for using nested tensors")
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
258
+
259
class BlockRope(nn.Module):
    """Pre-norm transformer block whose attention supports QK-norm and RoPE.

    Same structure as `Block`, but `qk_norm`/`rope` are forwarded to the
    attention class and token positions (`xpos`) flow through `forward`.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        qk_norm: bool=False,
        rope=None
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            qk_norm=qk_norm,
            rope=rope
        )

        # LayerScale only when init_values is given; DropPath only when rate > 0.
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor, xpos=None) -> Tensor:
        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x), xpos=xpos))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            # Fixed: FFN branch now uses drop_path2 (was drop_path1, per the old
            # FIXME).  Equivalent behavior since both modules share the same rate.
            x = x + self.drop_path2(ffn_residual_func(x))
        else:
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x
336
+
337
+
338
class CrossBlockRope(nn.Module):
    """Decoder-style transformer block: self-attention, then cross-attention
    to a second token stream `y`, then an MLP — each branch pre-normed,
    optionally LayerScaled, and added residually.  RoPE positions
    (`xpos`/`ypos`) are forwarded to both attention modules."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        cross_attn_class: Callable[..., nn.Module] = CrossAttentionRope,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        init_values=None,
        qk_norm: bool=False,
        rope=None
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        # Self-attention branch (output scaled by ls1).
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            rope=rope,
            qk_norm=qk_norm
        )

        # Cross-attention branch.  NOTE: ls_y scales the cross-attention
        # output while ls2 scales the MLP output (see forward).
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.ls_y = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.norm_y = norm_layer(dim)
        self.cross_attn = cross_attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            rope=rope,
            qk_norm=qk_norm
        )

        # MLP branch.
        self.norm3 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            bias=ffn_bias,
        )

    def forward(self, x: Tensor, y: Tensor, xpos=None, ypos=None) -> Tensor:
        """x: query tokens; y: context tokens (keys/values for cross-attn)."""
        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x), xpos=xpos))

        def cross_attn_residual_func(x: Tensor, y: Tensor) -> Tensor:
            # Query side is normed by norm2; `y` is expected already normed.
            return self.ls_y(self.cross_attn(self.norm2(x), y, y, qpos=xpos, kpos=ypos))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm3(x)))

        x = x + attn_residual_func(x)
        # Normalize the context once, outside the residual helper.
        y_ = self.norm_y(y)
        x = x + cross_attn_residual_func(x, y_)
        x = x + ffn_residual_func(x)

        return x
flow3r/models/layers/camera_head.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from copy import deepcopy
4
+ import torch.nn.functional as F
5
+
6
+ # code adapted from 'https://github.com/nianticlabs/marepo/blob/9a45e2bb07e5bb8cb997620088d352b439b13e0e/transformer/transformer.py#L172'
7
class ResConvBlock(nn.Module):
    """
    Residual token-wise MLP block (historically 1x1 convolutions, now
    implemented with nn.Linear on the channel/last dimension).

    NOTE(review): when in_channels != out_channels the skip path is a
    nn.Conv2d, but forward feeds it the same (B, tokens, C) tensor the
    Linear layers consume — that combination would fail at runtime; only
    the equal-channel (Identity skip) configuration is exercised by
    CameraHead.  Confirm before using unequal channel counts.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        # Identity skip when channel counts match, 1x1 conv projection otherwise.
        if self.in_channels == self.out_channels:
            self.head_skip = nn.Identity()
        else:
            self.head_skip = nn.Conv2d(self.in_channels, self.out_channels, 1, 1, 0)

        # Three token-wise linear layers (each equivalent to a 1x1 conv).
        self.res_conv1 = nn.Linear(self.in_channels, self.out_channels)
        self.res_conv2 = nn.Linear(self.out_channels, self.out_channels)
        self.res_conv3 = nn.Linear(self.out_channels, self.out_channels)

    def forward(self, res):
        hidden = F.relu(self.res_conv1(res))
        hidden = F.relu(self.res_conv2(hidden))
        hidden = F.relu(self.res_conv3(hidden))
        return self.head_skip(res) + hidden
31
+
32
class CameraHead(nn.Module):
    """Regress a per-view 4x4 camera pose from patch-token features.

    Two residual MLP blocks refine the tokens, 2D average pooling collapses
    them to one vector per view, and two linear heads predict a 3-vector
    translation plus a 9D rotation that is projected onto SO(3) via SVD.
    """

    def __init__(self, dim=512):
        super().__init__()
        output_dim = dim
        # Two token-wise residual refinement blocks.
        self.res_conv = nn.ModuleList([deepcopy(ResConvBlock(output_dim, output_dim))
                                    for _ in range(2)])
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        self.more_mlps = nn.Sequential(
            nn.Linear(output_dim,output_dim),
            nn.ReLU(),
            nn.Linear(output_dim,output_dim),
            nn.ReLU()
        )
        # Separate heads: translation (3) and 9D rotation representation.
        self.fc_t = nn.Linear(output_dim, 3)
        self.fc_rot = nn.Linear(output_dim, 9)

    def forward(self, feat, patch_h, patch_w):
        # feat: (B*N, patch_h*patch_w, C) patch tokens; returns (B*N, 4, 4).
        BN, hw, c = feat.shape

        for i in range(2):
            feat = self.res_conv[i](feat)

        # feat = self.avgpool(feat)
        # Reshape tokens back to a (BN, C, H, W) map so the 2D average pool
        # collapses the spatial dimensions to a single feature vector.
        feat = self.avgpool(feat.permute(0, 2, 1).reshape(BN, -1, patch_h, patch_w).contiguous()) ##########
        feat = feat.view(feat.size(0), -1)

        feat = self.more_mlps(feat)  # [B, D_]
        # Pose regression is forced to fp32 even under autocast, for stability.
        with torch.amp.autocast(device_type='cuda', enabled=False):
            out_t = self.fc_t(feat.float())  # [B,3]
            out_r = self.fc_rot(feat.float())  # [B,9]
            pose = self.convert_pose_to_4x4(BN, out_r, out_t, feat.device)

        return pose

    def convert_pose_to_4x4(self, B, out_r, out_t, device):
        # Assemble [R|t] into homogeneous (B, 4, 4) matrices.
        out_r = self.svd_orthogonalize(out_r)  # [N,3,3]
        pose = torch.zeros((B, 4, 4), device=device)
        pose[:, :3, :3] = out_r
        pose[:, :3, 3] = out_t
        pose[:, 3, 3] = 1.
        return pose

    def svd_orthogonalize(self, m):
        """Convert 9D representation to SO(3) using SVD orthogonalization.

        Args:
          m: [BATCH, 3, 3] 3x3 matrices.

        Returns:
          [BATCH, 3, 3] SO(3) rotation matrices.
        """
        if m.dim() < 3:
            m = m.reshape((-1, 3, 3))
        # Row-normalize, then orthogonalize via SVD.
        # NOTE(review): torch.svd is deprecated in favor of torch.linalg.svd;
        # kept as-is here to preserve exact behavior.
        m_transpose = torch.transpose(torch.nn.functional.normalize(m, p=2, dim=-1), dim0=-1, dim1=-2)
        u, s, v = torch.svd(m_transpose)
        det = torch.det(torch.matmul(v, u.transpose(-2, -1)))
        # Check orientation reflection.
        r = torch.matmul(
            torch.cat([v[:, :, :-1], v[:, :, -1:] * det.view(-1, 1, 1)], dim=2),
            u.transpose(-2, -1)
        )
        return r
flow3r/models/layers/pos_embed.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2022-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+
4
+
5
+ # --------------------------------------------------------
6
+ # Position embedding utils
7
+ # --------------------------------------------------------
8
+
9
+
10
+
11
+ import numpy as np
12
+
13
+ import torch
14
+
15
+ # --------------------------------------------------------
16
+ # 2D sine-cosine position embedding
17
+ # References:
18
+ # MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
19
+ # Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
20
+ # MoCo v3: https://github.com/facebookresearch/moco-v3
21
+ # --------------------------------------------------------
22
def get_2d_sincos_pos_embed(embed_dim, grid_size, n_cls_token=0):
    """Build a (grid_size**2, embed_dim) 2D sine-cosine position embedding.

    When n_cls_token > 0, that many all-zero rows are prepended for the
    extra (class) tokens, giving (n_cls_token + grid_size**2, embed_dim).
    """
    coords = np.arange(grid_size, dtype=np.float32)
    # meshgrid with w first, matching the original MAE implementation.
    grid = np.stack(np.meshgrid(coords, coords), axis=0)
    grid = grid.reshape([2, 1, grid_size, grid_size])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if n_cls_token > 0:
        cls_rows = np.zeros([n_cls_token, embed_dim])
        pos_embed = np.concatenate([cls_rows, pos_embed], axis=0)
    return pos_embed
38
+
39
+
40
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Encode a 2-channel coordinate grid into (H*W, embed_dim).

    The first half of the channels encodes grid[0] (h), the second half
    grid[1] (w).
    """
    assert embed_dim % 2 == 0

    half = embed_dim // 2
    emb_h = get_1d_sincos_pos_embed_from_grid(half, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(half, grid[1])  # (H*W, D/2)
    return np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
49
+
50
+
51
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position (must be even)
    pos: array of positions to encode, any shape (flattened to (M,))
    returns: (M, embed_dim) — sin in the first half, cos in the second
    """
    assert embed_dim % 2 == 0
    # Frequencies 1 / 10000^(i / (D/2)) for i = 0 .. D/2 - 1.
    freqs = 1.0 / 10000 ** (np.arange(embed_dim // 2, dtype=float) / (embed_dim / 2.0))

    angles = np.outer(pos.reshape(-1), freqs)  # (M, D/2)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)
70
+
71
+
72
+ # --------------------------------------------------------
73
+ # Interpolate position embeddings for high-resolution
74
+ # References:
75
+ # MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
76
+ # DeiT: https://github.com/facebookresearch/deit
77
+ # --------------------------------------------------------
78
def interpolate_pos_embed(model, checkpoint_model):
    """Resize a checkpoint's 2D position embedding to the model's patch grid.

    Mutates `checkpoint_model['pos_embed']` in place: extra tokens (cls/dist)
    are kept untouched; only the patch-position rows are bicubically
    resampled.  Assumes a square patch grid on both sides.
    """
    if 'pos_embed' in checkpoint_model:
        pos_embed_checkpoint = checkpoint_model['pos_embed']
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        # Number of non-patch tokens prepended to the sequence (e.g. cls).
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # height (== width) for the new position embedding
        new_size = int(num_patches ** 0.5)
        # class_token and dist_token are kept unchanged
        if orig_size != new_size:
            print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
            # (1, H*W, D) -> (1, D, H, W) for 2D interpolation.
            pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
            pos_tokens = torch.nn.functional.interpolate(
                pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
            # Back to (1, H'*W', D) and re-attach the extra tokens.
            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
            checkpoint_model['pos_embed'] = new_pos_embed
100
+
101
+
102
+ #----------------------------------------------------------
103
+ # RoPE2D: RoPE implementation in 2D
104
+ #----------------------------------------------------------
105
+
106
try:
    # Prefer the CUDA-compiled RoPE2D kernel when the extension is installed.
    from models.curope import cuRoPE2D
    RoPE2D = cuRoPE2D
except ImportError:
    print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead')

    class RoPE2D(torch.nn.Module):
        """Pure-PyTorch fallback for 2D rotary position embedding.

        The feature dimension is split in half: the first half is rotated by
        each token's y position, the second half by its x position.
        """

        def __init__(self, freq=100.0, F0=1.0):
            super().__init__()
            self.base = freq
            self.F0 = F0
            # Memoized (cos, sin) tables keyed by (dim, seq_len, device, dtype).
            self.cache = {}

        def get_cos_sin(self, D, seq_len, device, dtype):
            # Build (once) the cos/sin tables for all positions < seq_len.
            if (D,seq_len,device,dtype) not in self.cache:
                inv_freq = 1.0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D))
                t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
                freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype)
                freqs = torch.cat((freqs, freqs), dim=-1)
                cos = freqs.cos()  # (Seq, Dim)
                sin = freqs.sin()
                self.cache[D,seq_len,device,dtype] = (cos,sin)
            return self.cache[D,seq_len,device,dtype]

        @staticmethod
        def rotate_half(x):
            # (a, b) -> (-b, a) on the two halves of the last dimension.
            x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
            return torch.cat((-x2, x1), dim=-1)

        def apply_rope1d(self, tokens, pos1d, cos, sin):
            # Gather per-token cos/sin rows and apply the complex rotation.
            assert pos1d.ndim==2
            cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :]
            sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :]
            return (tokens * cos) + (self.rotate_half(tokens) * sin)

        def forward(self, tokens, positions):
            """
            input:
                * tokens: batch_size x nheads x ntokens x dim
                * positions: batch_size x ntokens x 2 (y and x position of each token)
            output:
                * tokens after appplying RoPE2D (batch_size x nheads x ntokens x dim)
            """
            assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two"
            D = tokens.size(3) // 2
            assert positions.ndim==3 and positions.shape[-1] == 2  # Batch, Seq, 2
            cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype)
            # split features into two along the feature dimension, and apply rope1d on each half
            y, x = tokens.chunk(2, dim=-1)
            y = self.apply_rope1d(y, positions[:,:,0], cos, sin)
            x = self.apply_rope1d(x, positions[:,:,1], cos, sin)
            tokens = torch.cat((y, x), dim=-1)
            return tokens
160
+
161
+ # patch embedding
162
class PositionGetter(object):
    """Produce (y, x) grid positions for every patch token, with caching."""

    def __init__(self):
        # Maps (h, w) -> flattened (h*w, 2) coordinate tensor.
        self.cache_positions = {}

    def __call__(self, b, h, w, device):
        key = (h, w)
        if key not in self.cache_positions:
            rows = torch.arange(h, device=device)
            cols = torch.arange(w, device=device)
            # All (y, x) pairs in row-major order: shape (h*w, 2).
            self.cache_positions[key] = torch.cartesian_prod(rows, cols)
        grid = self.cache_positions[key].view(1, h * w, 2)
        return grid.expand(b, -1, 2).clone()
flow3r/models/layers/transformer_head.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .attention import FlashAttentionRope, FlashCrossAttentionRope
2
+ from .block import BlockRope, CrossBlockRope
3
+ from ..dinov2.layers import Mlp
4
+ import torch
5
+ import torch.nn as nn
6
+ from functools import partial
7
+ from torch.utils.checkpoint import checkpoint
8
+ import torch.nn.functional as F
9
+ from flow3r.models.flow_head.utils import create_uv_grid, position_grid_to_embed
10
+
11
class TransformerDecoder(nn.Module):
    """Self-attention decoder: project input tokens, run `depth` RoPE
    transformer blocks, and map the result to `out_dim` with a linear layer."""

    def __init__(
        self,
        in_dim,
        out_dim,
        dec_embed_dim=512,
        depth=5,
        dec_num_heads=8,
        mlp_ratio=4,
        rope=None,
        need_project=True,
        use_checkpoint=False,
    ):
        super().__init__()

        # Optional input projection; Identity when the caller already supplies
        # features of width dec_embed_dim.
        self.projects = nn.Linear(in_dim, dec_embed_dim) if need_project else nn.Identity()
        self.use_checkpoint = use_checkpoint

        self.blocks = nn.ModuleList([
            BlockRope(
                dim=dec_embed_dim,
                num_heads=dec_num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=True,
                proj_bias=True,
                ffn_bias=True,
                drop_path=0.0,
                norm_layer=partial(nn.LayerNorm, eps=1e-6),
                act_layer=nn.GELU,
                ffn_layer=Mlp,
                init_values=None,
                qk_norm=False,
                attn_class=FlashAttentionRope,
                rope=rope
            ) for _ in range(depth)])

        self.linear_out = nn.Linear(dec_embed_dim, out_dim)

    def forward(self, hidden, xpos=None, return_intermediate=False):
        """Run the decoder.

        Args:
            hidden: (B, S, in_dim) input tokens.
            xpos: optional RoPE positions forwarded to every block.
            return_intermediate: if True, additionally return the hidden states
                of the last four blocks (before the output projection).

        Returns:
            (B, S, out_dim) tensor, or (out, [last-4 hidden states]) when
            return_intermediate is True.
        """
        hidden = self.projects(hidden)
        intermediate = []
        for i, blk in enumerate(self.blocks):
            # Gradient checkpointing only during training: trades compute for memory.
            if self.use_checkpoint and self.training:
                hidden = checkpoint(blk, hidden, xpos=xpos, use_reentrant=False)
            else:
                hidden = blk(hidden, xpos=xpos)

            if return_intermediate:
                intermediate.append(hidden)

        out = self.linear_out(hidden)

        if return_intermediate:
            return out, intermediate[-4:]

        return out
68
+
69
class LinearPts3d (nn.Module):
    """
    Linear prediction head (dust3r-style).
    Every token is projected to a patch_size x patch_size tile of `output_dim`
    channels; tiles are reassembled into a full-resolution map by pixel shuffle.
    """

    def __init__(self, patch_size, dec_embed_dim, output_dim=3,):
        super().__init__()
        self.patch_size = patch_size

        self.proj = nn.Linear(dec_embed_dim, (output_dim)*self.patch_size**2)

    def forward(self, decout, img_shape):
        """Predict a (B, H, W, output_dim) map from the last decoder output.

        Args:
            decout: sequence of decoder outputs; only the last (B, S, D) is used.
            img_shape: (H, W) of the target image.
        """
        H, W = img_shape
        last = decout[-1]                       # (B, S, D)
        B = last.shape[0]
        grid_h, grid_w = H // self.patch_size, W // self.patch_size

        per_token = self.proj(last)             # (B, S, output_dim * patch_size^2)
        planes = per_token.transpose(-1, -2).reshape(B, -1, grid_h, grid_w)
        full = F.pixel_shuffle(planes, self.patch_size)  # (B, output_dim, H, W)

        return full.permute(0, 2, 3, 1)
100
+
101
class LinearFlow2d (nn.Module):
    """
    Linear 2D-flow head. For each image pair (i, j) it fuses the patch
    features of view i with the camera features of both views (tagged with a
    learned which-view embedding) through an MLP, then unpacks every token
    into a patch_size x patch_size tile of flow values.
    """

    def __init__(self, patch_size, dec_embed_dim, output_dim=2, camera_dim=512, num_heads=8, rope=None):
        super().__init__()
        self.patch_size = patch_size
        self.dec_embed_dim = dec_embed_dim
        self.camera_dim = camera_dim

        # Learned embeddings distinguishing the first (0) and second (1) camera.
        self.camera_pos_embed = nn.Parameter(torch.randn(2, 1, camera_dim))
        nn.init.normal_(self.camera_pos_embed, std=0.02)

        # Fuses [camera_i | camera_j | patch_i] down to dec_embed_dim features.
        self.mlp = nn.Sequential(
            nn.Linear(2*camera_dim + dec_embed_dim, 2*dec_embed_dim),
            nn.ReLU(),
            nn.Linear(2*dec_embed_dim, 2*dec_embed_dim),
            nn.ReLU(),
            nn.Linear(2*dec_embed_dim, dec_embed_dim),
        )

        # Per-token projection to a flattened pixel tile.
        self.proj = nn.Linear(dec_embed_dim, (output_dim)*self.patch_size**2)

    def forward(self, patch_hidden, camera_hidden, pair_indices, img_shape, B, N):
        """
        Args:
            patch_hidden: (B*N, hw, dec_embed_dim) motion decoder output.
            camera_hidden: (B*N, hw, camera_dim) camera decoder output.
            pair_indices: (B, S, 2) long tensor of per-batch (i, j) view indices.
            img_shape: (H, W).
            B, N: batch size and number of images per sample.

        Returns:
            (B, S, H, W, output_dim) flow maps, one per pair.

        Raises:
            ValueError: if pair_indices is not a 3-D tensor.
        """
        H, W = img_shape
        hw = patch_hidden.shape[1]

        # Recover the (batch, view) layout.
        per_view_patch = patch_hidden.reshape(B, N, hw, self.dec_embed_dim)
        per_view_camera = camera_hidden.reshape(B, N, hw, self.camera_dim)

        if not (isinstance(pair_indices, torch.Tensor) and pair_indices.dim() == 3):
            raise ValueError("Invalid pair_indices type")

        S = pair_indices.shape[1]
        rows = torch.arange(B, device=pair_indices.device).unsqueeze(1).expand(B, S)
        src_idx = pair_indices[:, :, 0]
        tgt_idx = pair_indices[:, :, 1]

        # Gather per-pair features and tag camera features by their view role.
        patch_feat = per_view_patch[rows, src_idx]                              # (B, S, hw, D)
        cam_src = per_view_camera[rows, src_idx] + self.camera_pos_embed[0]
        cam_tgt = per_view_camera[rows, tgt_idx] + self.camera_pos_embed[1]

        stacked = torch.cat([cam_src, cam_tgt, patch_feat], dim=-1)
        total_pairs = B * S
        fused = self.mlp(stacked.reshape(total_pairs, hw, 2*self.camera_dim + self.dec_embed_dim))

        # Unfold each token into its pixel tile and reassemble the image grid.
        tiles = self.proj(fused)                                                # (T, hw, out*p^2)
        grid_h, grid_w = H // self.patch_size, W // self.patch_size
        planes = tiles.transpose(-1, -2).reshape(total_pairs, -1, grid_h, grid_w)
        flow = F.pixel_shuffle(planes, self.patch_size)                         # (T, out, H, W)

        return flow.permute(0, 2, 3, 1).reshape(B, S, H, W, -1)
218
+
219
class DPTFlow2d (nn.Module):
    """
    Simplified DPT-style head for 2D flow with a single-layer input.
    Fuses per-view patch features with the camera features of both views of a
    pair via an MLP, then refines convolutionally at patch resolution and at
    full image resolution to produce a dense `output_dim`-channel flow map.
    """

    def __init__(self, patch_size, dec_embed_dim, output_dim=2, camera_dim=512, rope=None, features=256):
        super().__init__()
        self.patch_size = patch_size
        self.dec_embed_dim = dec_embed_dim
        self.camera_dim = camera_dim

        # MLP to fuse camera features (both views) and patch features.
        self.mlp = nn.Sequential(
            nn.Linear(2*camera_dim + dec_embed_dim, 2*dec_embed_dim),
            nn.ReLU(),
            nn.Linear(2*dec_embed_dim, 2*dec_embed_dim),
            nn.ReLU(),
            nn.Linear(2*dec_embed_dim, dec_embed_dim),
        )

        self.norm = nn.LayerNorm(dec_embed_dim)

        # 1x1 projection into the refinement width, then two conv stages:
        # refine_low at patch resolution, refine_high at full image resolution.
        self.project = nn.Conv2d(dec_embed_dim, features, kernel_size=1, stride=1, padding=0)
        self.refine_low = nn.Sequential(
            nn.Conv2d(features, features, 3, padding=1),
            nn.GELU(),
            nn.Conv2d(features, features, 3, padding=1),
            nn.GELU(),
        )
        self.refine_high = nn.Sequential(
            nn.Conv2d(features, features, 3, padding=1),
            nn.GELU(),
            nn.Conv2d(features, features, 3, padding=1),
            nn.GELU(),
        )
        self.out_head = nn.Sequential(
            nn.Conv2d(features, 64, 3, padding=1),
            nn.GELU(),
            nn.Conv2d(64, output_dim, 1),
        )

    def _apply_pos_embed(self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
        """
        Add a (scaled) positional embedding, derived from a UV grid matching
        x's spatial size, to the feature map x of shape (B, C, h, w).
        """
        patch_w = x.shape[-1]
        patch_h = x.shape[-2]
        pos_embed = create_uv_grid(patch_w, patch_h, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
        pos_embed = position_grid_to_embed(pos_embed, x.shape[1])
        # ratio keeps the positional signal small relative to the features.
        pos_embed = pos_embed * ratio
        pos_embed = pos_embed.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
        return x + pos_embed


    def forward(self, patch_hidden, camera_hidden, pair_indices, img_shape, B, N):
        """
        Args:
            patch_hidden: (B*N, hw, dec_embed_dim) - motion decoder output
            camera_hidden: (B*N, hw, camera_dim) - camera decoder output
            pair_indices: Tensor of shape (B, S, 2), indices (i, j) relative to
                each batch element.
            img_shape: (H, W)
            B: batch size
            N: sequence length (number of images)
        Returns:
            flow: (B, S, H, W, output_dim)
        """
        H, W = img_shape
        hw = patch_hidden.shape[1]

        # Reshape from (B*N, hw, dim) to (B, N, hw, dim)
        patch_hidden = patch_hidden.reshape(B, N, hw, self.dec_embed_dim)
        camera_hidden = camera_hidden.reshape(B, N, hw, self.camera_dim)

        # Handle Tensor input (B, S, 2)
        S = pair_indices.shape[1]
        batch_idx = torch.arange(B, device=pair_indices.device).unsqueeze(1).expand(B, S)

        # Extract indices for i and j images: (B, S)
        idx_i = pair_indices[:, :, 0]
        idx_j = pair_indices[:, :, 1]

        # Extract patch features: (B, S, hw, dim)
        patch_feat = patch_hidden[batch_idx, idx_i]

        # Extract camera features: (B, S, hw, dim)
        camera_i = camera_hidden[batch_idx, idx_i]
        camera_j = camera_hidden[batch_idx, idx_j]
        # Concatenate camera features and patch features: (B, S, hw, 2*camera_dim + dec_embed_dim)
        concat_features = torch.cat([camera_i, camera_j, patch_feat], dim=-1)

        # Flatten B and S dimensions
        total_pairs = B * S
        input_features = concat_features.reshape(total_pairs, hw, 2*self.camera_dim + self.dec_embed_dim)

        # Apply MLP
        fused = self.mlp(input_features)  # (T, hw, dec_embed_dim)

        patch_h, patch_w = H // self.patch_size, W // self.patch_size
        assert hw == patch_h * patch_w, (hw, patch_h, patch_w)
        fused = self.norm(fused)
        feat = fused.transpose(1, 2).reshape(total_pairs, self.dec_embed_dim, patch_h, patch_w)  # (T,D,h,w)

        # Refine at patch resolution ...
        feat = self.project(feat)  # (T,features,h,w)
        feat = self._apply_pos_embed(feat, W, H)
        feat = self.refine_low(feat)

        # ... then upsample to full resolution and refine again.
        feat = F.interpolate(feat, size=(H, W), mode="bilinear", align_corners=True)
        feat = self._apply_pos_embed(feat, W, H)
        feat = self.refine_high(feat)

        flow = self.out_head(feat)  # (T,output_dim,H,W)
        return flow.permute(0, 2, 3, 1).reshape(B, S, H, W, -1)
342
+
343
class ContextTransformerDecoder(nn.Module):
    """Cross-attention decoder: tokens attend to a separately projected
    context sequence through `depth` RoPE cross-attention blocks, then are
    mapped to `out_dim` with a linear layer."""

    def __init__(
        self,
        in_dim,
        out_dim,
        dec_embed_dim=512,
        depth=5,
        dec_num_heads=8,
        mlp_ratio=4,
        rope=None,
    ):
        super().__init__()

        # Separate projections for the query stream (x) and the context stream (y).
        self.projects_x = nn.Linear(in_dim, dec_embed_dim)
        self.projects_y = nn.Linear(in_dim, dec_embed_dim)

        self.blocks = nn.ModuleList([
            CrossBlockRope(
                dim=dec_embed_dim,
                num_heads=dec_num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=True,
                proj_bias=True,
                ffn_bias=True,
                norm_layer=partial(nn.LayerNorm, eps=1e-6),
                act_layer=nn.GELU,
                ffn_layer=Mlp,
                init_values=None,
                qk_norm=False,
                attn_class=FlashAttentionRope,
                cross_attn_class=FlashCrossAttentionRope,
                rope=rope
            ) for _ in range(depth)])

        self.linear_out = nn.Linear(dec_embed_dim, out_dim)

    def forward(self, hidden, context, xpos=None, ypos=None):
        """Decode `hidden` conditioned on `context`.

        Args:
            hidden: (B, S, in_dim) query tokens.
            context: (B, T, in_dim) context tokens attended to by every block.
            xpos, ypos: optional RoPE positions for queries / context.

        Returns:
            (B, S, out_dim) tensor.
        """
        hidden = self.projects_x(hidden)
        context = self.projects_y(context)

        for i, blk in enumerate(self.blocks):
            hidden = blk(hidden, context, xpos=xpos, ypos=ypos)

        out = self.linear_out(hidden)

        # BUG FIX: the original computed `out` but never returned it, so
        # forward() always yielded None.
        return out
389
+
flow3r/utils/alignment.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+ import math
3
+ from collections import namedtuple
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import torch.types
10
+ # import utils3d
11
+
12
+
13
def scatter_min(size: int, dim: int, index: torch.LongTensor, src: torch.Tensor) -> torch.return_types.min:
    """Scatter-reduce `src` along `dim` into `size` buckets, keeping the minimum.

    Returns a (values, indices) pair mimicking torch.min: the values tensor
    holds the minimum of all src entries scattered into each bucket (inf for
    empty buckets), and indices holds the position along `dim` in `src` where
    that minimum came from (-1 for empty buckets).
    """
    out_shape = src.shape[:dim] + (size,) + src.shape[dim + 1:]
    values = torch.full(out_shape, float('inf'), dtype=src.dtype, device=src.device)
    values = values.scatter_reduce(dim=dim, index=index, src=src, reduce='amin', include_self=False)
    # Locate every src element equal to its bucket's minimum ...
    hits = torch.where(src == torch.gather(values, dim=dim, index=index))
    argmins = torch.full(out_shape, -1, dtype=torch.long, device=src.device)
    # ... and record its position along `dim` (ties overwrite each other).
    argmins[(*hits[:dim], index[hits], *hits[dim + 1:])] = hits[dim]
    return torch.return_types.min((values, argmins))
21
+
22
+
23
def split_batch_fwd(fn: Callable, chunk_size: int, *args, **kwargs):
    """Apply `fn` in chunks along the batch (first) dimension and concatenate.

    Tensor arguments (positional and keyword) are split into `chunk_size`-sized
    chunks along dim 0; non-tensor arguments are repeated for every chunk. The
    per-chunk results are concatenated along dim 0 (element-wise if `fn`
    returns a tuple).

    Args:
        fn: callable applied to each chunk.
        chunk_size: maximum batch size per call.
        *args, **kwargs: forwarded to `fn`; at least one must be a tensor.

    Returns:
        Concatenated result tensor, or tuple of concatenated tensors.
    """
    batch_size = next(x for x in (*args, *kwargs.values()) if isinstance(x, torch.Tensor)).shape[0]
    n_chunks = batch_size // chunk_size + (batch_size % chunk_size > 0)
    splited_args = tuple(arg.split(chunk_size, dim=0) if isinstance(arg, torch.Tensor) else [arg] * n_chunks for arg in args)
    # BUG FIX: the per-key chunk sequences were wrapped in an extra one-element
    # list (`{k: [ ... ]}`), so `v[i]` indexed the wrapper instead of the i-th
    # chunk: IndexError for every chunk after the first.
    splited_kwargs = {k: v.split(chunk_size, dim=0) if isinstance(v, torch.Tensor) else [v] * n_chunks for k, v in kwargs.items()}
    results = []
    for i in range(n_chunks):
        chunk_args = tuple(arg[i] for arg in splited_args)
        chunk_kwargs = {k: v[i] for k, v in splited_kwargs.items()}
        results.append(fn(*chunk_args, **chunk_kwargs))

    if isinstance(results[0], tuple):
        return tuple(torch.cat(r, dim=0) for r in zip(*results))
    else:
        return torch.cat(results, dim=0)
38
+
39
+
40
+ def _pad_inf(x_: torch.Tensor):
41
+ return torch.cat([torch.full_like(x_[..., :1], -torch.inf), x_, torch.full_like(x_[..., :1], torch.inf)], dim=-1)
42
+
43
+
44
+ def _pad_cumsum(cumsum: torch.Tensor):
45
+ return torch.cat([torch.zeros_like(cumsum[..., :1]), cumsum, cumsum[..., -1:]], dim=-1)
46
+
47
+
48
+ def _compute_residual(a: torch.Tensor, xyw: torch.Tensor, trunc: float):
49
+ return a.mul(xyw[..., 0]).sub_(xyw[..., 1]).abs_().mul_(xyw[..., 2]).clamp_max_(trunc).sum(dim=-1)
50
+
51
+
52
def align(x: torch.Tensor, y: torch.Tensor, w: torch.Tensor, trunc: Optional[Union[float, torch.Tensor]] = None, eps: float = 1e-7) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]:
    """
    If trunc is None, solve `min sum_i w_i * |a * x_i - y_i|`, otherwise solve `min sum_i min(trunc, w_i * |a * x_i - y_i|)`.

    w_i must be >= 0.

    ### Parameters:
    - `x`: tensor of shape (..., n)
    - `y`: tensor of shape (..., n)
    - `w`: tensor of shape (..., n)
    - `trunc`: optional, float or tensor of shape (..., n) or None

    ### Returns:
    - `a`: tensor of shape (...), differentiable
    - `loss`: tensor of shape (...), value of loss function at `a`, detached
    - `index`: tensor of shape (...), where a = y[idx] / x[idx]
    """
    if trunc is None:
        # Weighted-L1 line fit: the optimum is attained at one of the ratios
        # y_i / x_i (a weighted median); find where the subgradient crosses zero.
        x, y, w = torch.broadcast_tensors(x, y, w)
        sign = torch.sign(x)
        x, y = x * sign, y * sign
        y_div_x = y / x.clamp_min(eps)
        y_div_x, argsort = y_div_x.sort(dim=-1)

        wx = torch.gather(x * w, dim=-1, index=argsort)
        # Subgradient of the objective at each sorted breakpoint.
        derivatives = 2 * wx.cumsum(dim=-1) - wx.sum(dim=-1, keepdim=True)
        search = torch.searchsorted(derivatives, torch.zeros_like(derivatives[..., :1]), side='left').clamp_max(derivatives.shape[-1] - 1)

        a = y_div_x.gather(dim=-1, index=search).squeeze(-1)
        index = argsort.gather(dim=-1, index=search).squeeze(-1)
        loss = (w * (a[..., None] * x - y).abs()).sum(dim=-1)

    else:
        # Reshape to (batch_size, n) for simplicity
        x, y, w = torch.broadcast_tensors(x, y, w)
        batch_shape = x.shape[:-1]
        batch_size = math.prod(batch_shape)
        x, y, w = x.reshape(-1, x.shape[-1]), y.reshape(-1, y.shape[-1]), w.reshape(-1, w.shape[-1])

        sign = torch.sign(x)
        x, y = x * sign, y * sign
        wx, wy = w * x, w * y
        xyw = torch.stack([x, y, w], dim=-1)  # Stacked for convenient gathering

        # Breakpoints of the truncated objective: A where the residual is zero,
        # B/C where w*|a*x - y| crosses the truncation level from below/above.
        y_div_x = A = y / x.clamp_min(eps)
        B = (wy - trunc) / wx.clamp_min(eps)
        C = (wy + trunc) / wx.clamp_min(eps)
        with torch.no_grad():
            # Calculate prefix sums in the orders of A, B, C
            A, A_argsort = A.sort(dim=-1)
            Q_A = torch.cumsum(torch.gather(wx, dim=-1, index=A_argsort), dim=-1)
            A, Q_A = _pad_inf(A), _pad_cumsum(Q_A)  # Pad [-inf, A1, ..., An, inf] and [0, Q1, ..., Qn, Qn] to handle edge cases.

            B, B_argsort = B.sort(dim=-1)
            Q_B = torch.cumsum(torch.gather(wx, dim=-1, index=B_argsort), dim=-1)
            B, Q_B = _pad_inf(B), _pad_cumsum(Q_B)

            C, C_argsort = C.sort(dim=-1)
            Q_C = torch.cumsum(torch.gather(wx, dim=-1, index=C_argsort), dim=-1)
            C, Q_C = _pad_inf(C), _pad_cumsum(Q_C)

            # Calculate left and right derivatives at each candidate a = y_i/x_i
            j_A = torch.searchsorted(A, y_div_x, side='left').sub_(1)
            j_B = torch.searchsorted(B, y_div_x, side='left').sub_(1)
            j_C = torch.searchsorted(C, y_div_x, side='left').sub_(1)
            left_derivative = 2 * torch.gather(Q_A, dim=-1, index=j_A) - torch.gather(Q_B, dim=-1, index=j_B) - torch.gather(Q_C, dim=-1, index=j_C)
            j_A = torch.searchsorted(A, y_div_x, side='right').sub_(1)
            j_B = torch.searchsorted(B, y_div_x, side='right').sub_(1)
            j_C = torch.searchsorted(C, y_div_x, side='right').sub_(1)
            right_derivative = 2 * torch.gather(Q_A, dim=-1, index=j_A) - torch.gather(Q_B, dim=-1, index=j_B) - torch.gather(Q_C, dim=-1, index=j_C)

            # Find extrema (local minima of the piecewise-linear objective)
            is_extrema = (left_derivative < 0) & (right_derivative >= 0)
            is_extrema[..., 0] |= ~is_extrema.any(dim=-1)  # In case all derivatives are zero, take the first one as extrema.
            where_extrema_batch, where_extrema_index = torch.where(is_extrema)

            # Calculate objective value at extrema
            extrema_a = y_div_x[where_extrema_batch, where_extrema_index]  # (num_extrema,)
            MAX_ELEMENTS = 4096 ** 2  # Split into small batches to avoid OOM in case there are too many extrema. (~1G)
            SPLIT_SIZE = MAX_ELEMENTS // x.shape[-1]
            extrema_value = torch.cat([
                _compute_residual(extrema_a_split[:, None], xyw[extrema_i_split, :, :], trunc)
                for extrema_a_split, extrema_i_split in zip(extrema_a.split(SPLIT_SIZE), where_extrema_batch.split(SPLIT_SIZE))
            ])  # (num_extrema,)

            # Find the minimum among each batch element's extrema
            minima, indices = scatter_min(size=batch_size, dim=0, index=where_extrema_batch, src=extrema_value)  # (batch_size,)
            index = where_extrema_index[indices]

        # Recompute a by indexing so gradients flow through x and y.
        a = torch.gather(y, dim=-1, index=index[..., None]) / torch.gather(x, dim=-1, index=index[..., None]).clamp_min(eps)
        a = a.reshape(batch_shape)
        loss = minima.reshape(batch_shape)
        index = index.reshape(batch_shape)

    return a, loss, index
147
+
148
+
149
def align_depth_scale(depth_src: torch.Tensor, depth_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Solve for the single multiplicative scale aligning `depth_src` to
    `depth_tgt` under an (optionally truncated) weighted-L1 objective.

    ### Parameters:
    - `depth_src: torch.Tensor` of shape (..., N)
    - `depth_tgt: torch.Tensor` of shape (..., N)
    - `weight`: non-negative weights of shape (..., N)
    - `trunc`: optional truncation level for the robust loss

    ### Returns:
    - `scale: torch.Tensor` of shape (...)
    """
    scale, _loss, _index = align(depth_src, depth_tgt, weight, trunc)
    return scale
161
+
162
+
163
def align_depth_affine(depth_src: torch.Tensor, depth_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Align `depth_src` to `depth_tgt` with given constant weights.

    ### Parameters:
    - `depth_src: torch.Tensor` of shape (..., N)
    - `depth_tgt: torch.Tensor` of shape (..., N)
    - `weight: torch.Tensor` of shape (..., N)
    - `trunc: float` or tensor of shape (..., N) or None

    ### Returns:
    - `scale: torch.Tensor` of shape (...).
    - `shift: torch.Tensor` of shape (...).
    """
    dtype, device = depth_src.dtype, depth_src.device

    # Flatten batch dimensions for simplicity
    batch_shape, n = depth_src.shape[:-1], depth_src.shape[-1]
    batch_size = math.prod(batch_shape)
    depth_src, depth_tgt, weight = depth_src.reshape(batch_size, n), depth_tgt.reshape(batch_size, n), weight.reshape(batch_size, n)

    # Here, we take anchors only for non-zero weights.
    # Although the results will be still correct even anchor points have zero weight,
    # it is wasting computation and may cause instability in some cases, e.g. too many extrema.
    anchors_where_batch, anchors_where_n = torch.where(weight > 0)

    # Stop gradient when solving optimal anchors
    with torch.no_grad():
        depth_src_anchor = depth_src[anchors_where_batch, anchors_where_n]  # (anchors)
        depth_tgt_anchor = depth_tgt[anchors_where_batch, anchors_where_n]  # (anchors)

        # Subtracting the anchor reduces the affine problem to a pure-scale
        # problem for each candidate anchor point.
        depth_src_anchored = depth_src[anchors_where_batch, :] - depth_src_anchor[..., None]  # (anchors, n)
        depth_tgt_anchored = depth_tgt[anchors_where_batch, :] - depth_tgt_anchor[..., None]  # (anchors, n)
        weight_anchored = weight[anchors_where_batch, :]  # (anchors, n)

        scale, loss, index = align(depth_src_anchored, depth_tgt_anchored, weight_anchored, trunc)  # (anchors)

        # Keep, per batch element, the anchor with the lowest residual.
        loss, index_anchor = scatter_min(size=batch_size, dim=0, index=anchors_where_batch, src=loss)  # (batch_size,)

        # Reproduce by indexing for shorter compute graph
        index_1 = anchors_where_n[index_anchor]  # (batch_size,)
        index_2 = index[index_anchor]  # (batch_size,)

    # Recompute scale/shift from the two defining points so gradients flow.
    tgt_1, src_1 = torch.gather(depth_tgt, dim=1, index=index_1[..., None]).squeeze(-1), torch.gather(depth_src, dim=1, index=index_1[..., None]).squeeze(-1)
    tgt_2, src_2 = torch.gather(depth_tgt, dim=1, index=index_2[..., None]).squeeze(-1), torch.gather(depth_src, dim=1, index=index_2[..., None]).squeeze(-1)

    scale = (tgt_2 - tgt_1) / torch.where(src_2 != src_1, src_2 - src_1, 1e-7)
    shift = tgt_1 - scale * src_1

    scale, shift = scale.reshape(batch_shape), shift.reshape(batch_shape)

    return scale, shift
215
+
216
def align_depth_affine_irls(depth_src: torch.Tensor, depth_tgt: torch.Tensor, weight: Optional[torch.Tensor], max_iter: int = 100, eps: float = 1e-12):
    """
    Affine (scale, shift) alignment of depth via iteratively reweighted least squares.

    Repeatedly solves the weighted normal equations and resets the weights to
    1/|residual|, approximating a robust L1 fit.

    ### Parameters:
    - `depth_src`, `depth_tgt`: tensors of shape (..., N)
    - `weight`: initial non-negative weights of shape (..., N)
    - `max_iter`: number of IRLS iterations (no early stopping)
    - `eps`: floor on residual magnitude when inverting to weights

    ### Returns:
    - `(scale, shift)`: tensors of shape (...)
    """
    w = weight
    design = torch.stack([depth_src, torch.ones_like(depth_src)], dim=-1)  # (..., N, 2)
    target = depth_tgt

    for _ in range(max_iter):
        # Weighted normal equations: beta = (X^T W y) (X^T W X)^-T
        gram = design.transpose(-1, -2) @ (w[..., None] * design)
        beta = (design.transpose(-1, -2) @ (w * target)) @ gram.inverse().transpose(-2, -1)
        residual = target - (design @ beta[..., None])[..., 0]
        w = 1 / residual.abs().clamp_min(eps)

    return beta[..., 0], beta[..., 1]
231
+
232
+
233
def align_points_scale(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Solve for a single scale aligning `points_src` to `points_tgt`.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weight: torch.Tensor` of shape (..., N), broadcast over xyz

    ### Returns:
    - `scale: torch.Tensor` of shape (...). Only positive solutions are
      guaranteed; filter out negative scales before using it.
    """
    flat_src = points_src.flatten(-2)
    flat_tgt = points_tgt.flatten(-2)
    flat_w = weight[..., None].expand_as(points_src).flatten(-2)
    scale, _loss, _index = align(flat_src, flat_tgt, flat_w, trunc)
    return scale
249
+
250
+
251
def align_points_scale_z_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Align `points_src` to `points_tgt` with respect to a shared xyz scale and z shift.
    It is similar to `align_affine` but scale and shift are applied to different dimensions.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weights: torch.Tensor` of shape (..., N)

    ### Returns:
    - `scale: torch.Tensor` of shape (...).
    - `shift: torch.Tensor` of shape (..., 3). x and y shifts are zeros.
    """
    dtype, device = points_src.dtype, points_src.device

    # Flatten batch dimensions for simplicity
    batch_shape, n = points_src.shape[:-2], points_src.shape[-2]
    batch_size = math.prod(batch_shape)
    points_src, points_tgt, weight = points_src.reshape(batch_size, n, 3), points_tgt.reshape(batch_size, n, 3), weight.reshape(batch_size, n)

    # Take anchors only where the weight is non-zero.
    anchor_where_batch, anchor_where_n = torch.where(weight > 0)
    with torch.no_grad():
        # Anchors shift only in z (x/y components zeroed), matching the
        # scale-plus-z-shift model being solved.
        zeros = torch.zeros(anchor_where_batch.shape[0], device=device, dtype=dtype)
        points_src_anchor = torch.stack([zeros, zeros, points_src[anchor_where_batch, anchor_where_n, 2]], dim=-1)  # (anchors, 3)
        points_tgt_anchor = torch.stack([zeros, zeros, points_tgt[anchor_where_batch, anchor_where_n, 2]], dim=-1)  # (anchors, 3)

        points_src_anchored = points_src[anchor_where_batch, :, :] - points_src_anchor[..., None, :]  # (anchors, n, 3)
        points_tgt_anchored = points_tgt[anchor_where_batch, :, :] - points_tgt_anchor[..., None, :]  # (anchors, n, 3)
        weight_anchored = weight[anchor_where_batch, :, None].expand(-1, -1, 3)  # (anchors, n, 3)

        # Solve optimal scale and shift for each anchor
        MAX_ELEMENTS = 2 ** 20
        scale, loss, index = split_batch_fwd(align, MAX_ELEMENTS // n, points_src_anchored.flatten(-2), points_tgt_anchored.flatten(-2), weight_anchored.flatten(-2), trunc)  # (anchors,)

        loss, index_anchor = scatter_min(size=batch_size, dim=0, index=anchor_where_batch, src=loss)  # (batch_size,)

        # Reproduce by indexing for shorter compute graph.
        # index_2 addresses the flattened (n*3) coordinate array; index_1 is
        # the matching coordinate of the winning anchor point.
        index_2 = index[index_anchor]  # (batch_size,) [0, 3n)
        index_1 = anchor_where_n[index_anchor] * 3 + index_2 % 3  # (batch_size,) [0, 3n)

    # Recompute scale/shift from the two defining coordinates so gradients flow.
    zeros = torch.zeros((batch_size, n), device=device, dtype=dtype)
    points_tgt_00z, points_src_00z = torch.stack([zeros, zeros, points_tgt[..., 2]], dim=-1), torch.stack([zeros, zeros, points_src[..., 2]], dim=-1)
    tgt_1, src_1 = torch.gather(points_tgt_00z.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1), torch.gather(points_src_00z.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1)
    tgt_2, src_2 = torch.gather(points_tgt.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1), torch.gather(points_src.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1)

    scale = (tgt_2 - tgt_1) / torch.where(src_2 != src_1, src_2 - src_1, 1.0)
    shift = torch.gather(points_tgt_00z, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2) - scale[..., None] * torch.gather(points_src_00z, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2)
    scale, shift = scale.reshape(batch_shape), shift.reshape(*batch_shape, 3)

    return scale, shift
303
+
304
+
305
def align_points_scale_xyz_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None, max_iters: int = 30, eps: float = 1e-6):
    """
    Align `points_src` to `points_tgt` with respect to a shared scalar scale and a 3D (xyz) shift.
    It is similar to `align_affine` but scale and shift are applied to different dimensions.

    Strategy: every positively-weighted point is treated as a candidate anchor;
    a robust 1D alignment (`align`) is solved per anchor on the anchored
    residuals, and per batch element the anchor with the lowest loss wins.
    The winning scale/shift are then re-derived by direct indexing so the
    autograd graph stays short.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weight: torch.Tensor` of shape (..., N); only points with weight > 0 serve as anchors
    - `trunc`: optional truncation threshold forwarded to `align`
    - `max_iters`, `eps`: unused here; kept for signature parity with sibling aligners

    ### Returns:
    - `scale: torch.Tensor` of shape (...).
    - `shift: torch.Tensor` of shape (..., 3)
    """
    dtype, device = points_src.dtype, points_src.device  # NOTE(review): currently unused

    # Flatten batch dimensions for simplicity
    batch_shape, n = points_src.shape[:-2], points_src.shape[-2]
    batch_size = math.prod(batch_shape)
    points_src, points_tgt, weight = points_src.reshape(batch_size, n, 3), points_tgt.reshape(batch_size, n, 3), weight.reshape(batch_size, n)

    # Take anchors: one candidate per positively-weighted point
    anchor_where_batch, anchor_where_n = torch.where(weight > 0)

    # Anchor coordinates are detached so gradients flow only through the residuals
    with torch.no_grad():
        points_src_anchor = points_src[anchor_where_batch, anchor_where_n]  # (anchors, 3)
        points_tgt_anchor = points_tgt[anchor_where_batch, anchor_where_n]  # (anchors, 3)

    points_src_anchored = points_src[anchor_where_batch, :, :] - points_src_anchor[..., None, :]  # (anchors, n, 3)
    points_tgt_anchored = points_tgt[anchor_where_batch, :, :] - points_tgt_anchor[..., None, :]  # (anchors, n, 3)
    weight_anchored = weight[anchor_where_batch, :, None].expand(-1, -1, 3)  # (anchors, n, 3)

    # Solve optimal scale and shift for each anchor.
    # NOTE(review): `split_batch_fwd` presumably chunks the anchor batch to bound
    # peak memory at MAX_ELEMENTS — confirm; the sibling aligner divides by n here.
    MAX_ELEMENTS = 2 ** 20
    scale, loss, index = split_batch_fwd(align, MAX_ELEMENTS // 2, points_src_anchored.flatten(-2), points_tgt_anchored.flatten(-2), weight_anchored.flatten(-2), trunc)  # (anchors,)

    # Get optimal scale and shift for each batch element: keep the minimum-loss anchor
    loss, index_anchor = scatter_min(size=batch_size, dim=0, index=anchor_where_batch, src=loss)  # (batch_size,)

    # Flat coordinate indices (into the length-3n flattened point arrays) of the
    # two coordinates that define the winning solution
    index_2 = index[index_anchor]  # (batch_size,) [0, 3n)
    index_1 = anchor_where_n[index_anchor] * 3 + index_2 % 3  # (batch_size,) [0, 3n)

    # Reproduce by indexing for a shorter compute graph
    src_1, tgt_1 = torch.gather(points_src.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1), torch.gather(points_tgt.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1)
    src_2, tgt_2 = torch.gather(points_src.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1), torch.gather(points_tgt.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1)

    # Scale from the two picked coordinates; the torch.where guards against 0/0
    scale = (tgt_2 - tgt_1) / torch.where(src_2 != src_1, src_2 - src_1, 1.0)
    # Shift re-derived at the winning anchor point (index_1 // 3 is its point index)
    shift = torch.gather(points_tgt, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2) - scale[..., None] * torch.gather(points_src, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2)

    scale, shift = scale.reshape(batch_shape), shift.reshape(*batch_shape, 3)

    return scale, shift
356
+
357
+
358
def align_points_z_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None, max_iters: int = 30, eps: float = 1e-6):
    """
    Estimate a pure Z-axis translation aligning `points_src` to `points_tgt`.

    The scalar z-offset is solved by the shared `align` helper, applied to the
    per-point depth residuals, and embedded into a [0, 0, shift_z] vector.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weight: torch.Tensor` of shape (..., N)
    - `trunc`: optional truncation threshold forwarded to `align`

    ### Returns:
    - `shift: torch.Tensor` of shape (..., 3), with zero x/y components
    """
    dtype, device = points_src.dtype, points_src.device

    depth_residual = points_tgt[..., 2] - points_src[..., 2]
    z_shift, _, _ = align(torch.ones_like(points_src[..., 2]), depth_residual, weight, trunc)

    xy_zeros = torch.zeros_like(z_shift)
    return torch.stack([xy_zeros, xy_zeros, z_shift], dim=-1)
377
+
378
+
379
def align_points_xyz_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None, max_iters: int = 30, eps: float = 1e-6):
    """
    Estimate a full 3D translation aligning `points_src` to `points_tgt`.

    Each axis is solved independently by the shared `align` helper, applied to
    the per-axis coordinate residuals.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weight: torch.Tensor` of shape (..., N)
    - `trunc`: optional truncation threshold forwarded to `align`

    ### Returns:
    - `shift: torch.Tensor` of shape (..., 3)
    """
    dtype, device = points_src.dtype, points_src.device

    residuals = (points_tgt - points_src).swapaxes(-2, -1)
    ones = torch.ones_like(points_src).swapaxes(-2, -1)
    shift, _, _ = align(ones, residuals, weight[..., None, :], trunc)

    return shift
397
+
398
+
399
def align_affine_lstsq(x: torch.Tensor, y: torch.Tensor, w: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Solve `min sum_i w_i * (a * x_i + b - y_i) ^ 2`, where `a` and `b` are scalars, with respect to `a` and `b` using least squares.

    ### Parameters:
    - `x: torch.Tensor` of shape (..., N)
    - `y: torch.Tensor` of shape (..., N)
    - `w: torch.Tensor` of shape (..., N); uniform weights when None

    ### Returns:
    - `a: torch.Tensor` of shape (...,)
    - `b: torch.Tensor` of shape (...,)
    """
    w_sqrt = torch.ones_like(x) if w is None else w.sqrt()
    # Weighted least squares: BOTH design-matrix columns must be scaled by
    # sqrt(w). The previous version left the constant column unweighted, which
    # solved `a * sqrt(w_i) * x_i + b = sqrt(w_i) * y_i` instead — wrong for
    # any non-uniform w. (Matches the weighting in align_affine_lstsq_z_shift.)
    A = torch.stack([w_sqrt * x, w_sqrt], dim=-1)
    B = (w_sqrt * y)[..., None]
    a, b = torch.linalg.lstsq(A, B)[0].squeeze(-1).unbind(-1)
    return a, b
417
+
418
+
419
def align_affine_lstsq_z_shift(x: torch.Tensor, y: torch.Tensor, w: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Least-squares fit of `y ~= a * x + [0, 0, shift_z]` for 3D point clouds.

    `a` is a single isotropic scale and the translation is constrained to the
    Z axis. Minimizes `sum_i w_i * ||a * x_i + b - y_i||^2` in closed form by
    stacking all three coordinates of every point into one linear system for
    `torch.linalg.lstsq`.

    ### Parameters:
    - `x: torch.Tensor` of shape (..., N, 3), source point cloud.
    - `y: torch.Tensor` of shape (..., N, 3), target point cloud.
    - `w: torch.Tensor` (optional) of shape (..., N), per-point weights;
      uniform weights when None.

    ### Returns:
    - `a: torch.Tensor` of shape (...,), scalar scale.
    - `b: torch.Tensor` of shape (..., 3), translation of the form [0, 0, shift_z].
    """
    if x.shape[-1] != 3 or y.shape[-1] != 3:
        raise ValueError("Input tensors x and y must have 3 features in the last dimension (X, Y, Z). "
                         f"Got x shape: {x.shape}, y shape: {y.shape}")
    if x.shape[:-1] != y.shape[:-1]:
        raise ValueError("Input tensors x and y must have matching shapes up to the last dimension. "
                         f"Got x shape: {x.shape}, y shape: {y.shape}")
    if w is not None and w.shape != x.shape[:-1]:
        raise ValueError("Weights w, if provided, must have shape (..., N) matching x and y's point dimensions. "
                         f"Got w shape: {w.shape}, x shape: {x.shape}")

    batch_dims = x.shape[:-2]
    n_pts = x.shape[-2]
    # Axis holding the points, counted from the front (so we can concatenate
    # the X/Y/Z rows of the stacked system along it).
    cat_dim = len(batch_dims)

    if w is None:
        sw = torch.ones(*batch_dims, n_pts, device=x.device, dtype=x.dtype)
    else:
        sw = w.sqrt()

    # Column multiplying the scalar scale: sqrt(w) * [x_X; x_Y; x_Z].
    scale_col = torch.cat([sw * x[..., :, c] for c in range(3)], dim=cat_dim)  # (..., 3N)
    # Column multiplying shift_z: zero for the X/Y rows, sqrt(w) for the Z rows.
    zero_rows = torch.zeros_like(sw)
    shift_col = torch.cat([zero_rows, zero_rows, sw], dim=cat_dim)  # (..., 3N)

    design = torch.stack([scale_col, shift_col], dim=-1)  # (..., 3N, 2)
    rhs = torch.cat([sw * y[..., :, c] for c in range(3)], dim=cat_dim)  # (..., 3N)

    solution = torch.linalg.lstsq(design, rhs.unsqueeze(-1))[0]  # (..., 2, 1)
    a_val = solution[..., 0, 0]
    shift_z = solution[..., 1, 0]

    zeros = torch.zeros_like(a_val)
    return a_val, torch.stack([zeros, zeros, shift_z], dim=-1)
flow3r/utils/basic.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import os.path as osp
3
+ import math
4
+ import cv2
5
+ from PIL import Image
6
+ import torch
7
+ from torchvision import transforms
8
+ from plyfile import PlyData, PlyElement
9
+ import numpy as np
10
+
11
def load_images_as_tensor(path='data/truck', interval=1, PIXEL_LIMIT=255000):
    """
    Loads images from a directory or video, resizes them to a uniform size,
    then converts and stacks them into a single [N, 3, H, W] PyTorch tensor.

    Args:
        path: directory of .png/.jpg/.jpeg files, or a .mp4 video file.
        interval: keep every `interval`-th image/frame.
        PIXEL_LIMIT: upper bound on W*H of the resized frames. The target size
            is snapped to multiples of 14 — presumably the ViT patch size,
            TODO confirm against the model config.

    Returns:
        torch.Tensor of shape [N, 3, H, W], values in [0, 1]; an empty tensor
        when nothing could be loaded.
    """
    sources = []

    # --- 1. Load image paths or video frames ---
    if osp.isdir(path):
        print(f"Loading images from directory: {path}")
        filenames = sorted([x for x in os.listdir(path) if x.lower().endswith(('.png', '.jpg', '.jpeg'))])
        for i in range(0, len(filenames), interval):
            img_path = osp.join(path, filenames[i])
            try:
                sources.append(Image.open(img_path).convert('RGB'))
            except Exception as e:
                # Best effort: skip unreadable files rather than aborting the batch
                print(f"Could not load image {filenames[i]}: {e}")
    elif path.lower().endswith('.mp4'):
        print(f"Loading frames from video: {path}")
        cap = cv2.VideoCapture(path)
        if not cap.isOpened(): raise IOError(f"Cannot open video file: {path}")
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret: break
            if frame_idx % interval == 0:
                # OpenCV decodes BGR; convert before handing to PIL
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                sources.append(Image.fromarray(rgb_frame))
            frame_idx += 1
        cap.release()
    else:
        raise ValueError(f"Unsupported path. Must be a directory or a .mp4 file: {path}")

    if not sources:
        print("No images found or loaded.")
        return torch.empty(0)

    print(f"Found {len(sources)} images/frames. Processing...")

    # --- 2. Determine a uniform target size for all images based on the first image ---
    # This is necessary to ensure all tensors have the same dimensions for stacking.
    first_img = sources[0]
    W_orig, H_orig = first_img.size
    # Scale so that W*H is close to PIXEL_LIMIT while keeping the aspect ratio
    scale = math.sqrt(PIXEL_LIMIT / (W_orig * H_orig)) if W_orig * H_orig > 0 else 1
    W_target, H_target = W_orig * scale, H_orig * scale
    # Round both sides to multiples of 14, then shrink until under PIXEL_LIMIT,
    # trimming whichever side overshoots the target aspect ratio
    k, m = round(W_target / 14), round(H_target / 14)
    while (k * 14) * (m * 14) > PIXEL_LIMIT:
        if k / m > W_target / H_target: k -= 1
        else: m -= 1
    TARGET_W, TARGET_H = max(1, k) * 14, max(1, m) * 14
    print(f"All images will be resized to a uniform size: ({TARGET_W}, {TARGET_H})")

    # --- 3. Resize images and convert them to tensors in the [0, 1] range ---
    tensor_list = []
    # Define a transform to convert a PIL Image to a CxHxW tensor and normalize to [0,1]
    to_tensor_transform = transforms.ToTensor()

    for img_pil in sources:
        try:
            # Resize to the uniform target size
            # NOTE(review): Image.Resampling requires Pillow >= 9.1; cropping.py
            # guards this with a fallback — consider the same here.
            resized_img = img_pil.resize((TARGET_W, TARGET_H), Image.Resampling.LANCZOS)
            # Convert to tensor
            img_tensor = to_tensor_transform(resized_img)
            tensor_list.append(img_tensor)
        except Exception as e:
            print(f"Error processing an image: {e}")

    if not tensor_list:
        print("No images were successfully processed.")
        return torch.empty(0)

    # --- 4. Stack the list of tensors into a single [N, C, H, W] batch tensor ---
    return torch.stack(tensor_list, dim=0)
84
+
85
+
86
def tensor_to_pil(tensor):
    """
    Convert a PyTorch tensor (or NumPy array) to a PIL image.

    Tensors are detached and moved to CPU first; the layout handling
    (squeezing, channel moving) is delegated to `array_to_pil`.

    Args:
        tensor (torch.Tensor): Input tensor. Expected shape can be [C, H, W], [H, W, C], or [H, W].

    Returns:
        PIL.Image: The converted PIL image.
    """
    data = tensor.detach().cpu().numpy() if torch.is_tensor(tensor) else tensor
    return array_to_pil(data)
103
+
104
+
105
def array_to_pil(array):
    """
    Convert a NumPy array to a PIL image.

    Singleton dimensions are squeezed, and a leading 3-channel axis is moved
    to the end. Values are assumed to be in [0, 1] and are scaled to uint8.

    Args:
        array (np.ndarray): Input array. Expected shape can be [C, H, W], [H, W, C], or [H, W].

    Returns:
        PIL.Image: "L" image for 2D input, "RGB" for 3-channel input.

    Raises:
        ValueError: for any other shape after squeezing.
    """
    array = np.squeeze(array)

    # Channel-first -> channel-last when a leading 3-channel axis is present
    if array.ndim == 3 and array.shape[0] == 3:
        array = np.transpose(array, (1, 2, 0))

    if array.ndim == 2:  # grayscale [H, W]
        return Image.fromarray((array * 255).astype(np.uint8), mode="L")
    if array.ndim == 3 and array.shape[2] == 3:  # color [H, W, 3]
        return Image.fromarray((array * 255).astype(np.uint8), mode="RGB")
    raise ValueError(f"Unsupported array shape for PIL conversion: {array.shape}")
131
+
132
+
133
def rotate_target_dim_to_last_axis(x, target_dim=3):
    """
    Move the last axis of size `target_dim` to the end of `x`'s shape.

    Works on both NumPy arrays and PyTorch tensors. The search runs backwards
    so the axis closest to the end wins; if no axis matches, or the matching
    axis is already last, `x` is returned unchanged.

    Args:
        x: np.ndarray or torch.Tensor.
        target_dim: axis size to move to the last position (default 3).

    Returns:
        A view/permutation of `x` with the matched axis last.
    """
    shape = x.shape
    axis_to_move = -1
    # Iterate backwards to find the first occurrence from the end
    # (which corresponds to the last dimension of size `target_dim`).
    for i in range(len(shape) - 1, -1, -1):
        if shape[i] == target_dim:
            axis_to_move = i
            break

    # If the axis is found and it's not already in the last position, move it.
    if axis_to_move != -1 and axis_to_move != len(shape) - 1:
        dims_order = list(range(len(shape)))
        dims_order.pop(axis_to_move)
        dims_order.append(axis_to_move)

        # Bug fix: np.ndarray.transpose accepts a full axes permutation, but
        # torch.Tensor.transpose only swaps two dims, so `x.transpose(*dims_order)`
        # raised for tensors with ndim > 2. Use permute for tensors instead.
        if torch.is_tensor(x):
            ret = x.permute(*dims_order)
        else:
            ret = x.transpose(*dims_order)
    else:
        ret = x

    return ret
156
+
157
+
158
def write_ply(
    xyz,
    rgb=None,
    path='output.ply',
) -> None:
    """
    Write a colored point cloud to a PLY file.

    Args:
        xyz: point coordinates (torch.Tensor or np.ndarray); any axis of size 3
            is rotated to the last position, then flattened to (-1, 3).
        rgb: optional per-point colors, same layout as `xyz`. Values > 1 are
            assumed to be in [0, 255] and rescaled to [0, 1]. When None, points
            are colorized procedurally from their normalized position.
        path: output file path.
    """
    if torch.is_tensor(xyz):
        xyz = xyz.detach().cpu().numpy()

    if torch.is_tensor(rgb):
        rgb = rgb.detach().cpu().numpy()

    # Heuristic: treat colors with values above 1 as 0-255 encoded
    if rgb is not None and rgb.max() > 1:
        rgb = rgb / 255.

    xyz = rotate_target_dim_to_last_axis(xyz, 3)
    xyz = xyz.reshape(-1, 3)

    if rgb is not None:
        rgb = rotate_target_dim_to_last_axis(rgb, 3)
        rgb = rgb.reshape(-1, 3)

    if rgb is None:
        # Procedural coloring: hue derived from the point's normalized position,
        # followed by a manual HSV -> RGB conversion (S=0.9, V=0.8).
        min_coord = np.min(xyz, axis=0)
        max_coord = np.max(xyz, axis=0)
        normalized_coord = (xyz - min_coord) / (max_coord - min_coord + 1e-8)

        # Hue is a weighted mix of the normalized x/y/z coordinates
        hue = 0.7 * normalized_coord[:,0] + 0.2 * normalized_coord[:,1] + 0.1 * normalized_coord[:,2]
        hsv = np.stack([hue, 0.9*np.ones_like(hue), 0.8*np.ones_like(hue)], axis=1)

        # Standard HSV->RGB: chroma c, intermediate x, match value m,
        # then pick the (c, x, 0) permutation per 60-degree hue sector
        c = hsv[:,2:] * hsv[:,1:2]
        x = c * (1 - np.abs( (hsv[:,0:1]*6) % 2 - 1 ))
        m = hsv[:,2:] - c

        rgb = np.zeros_like(hsv)
        cond = (0 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 1)
        rgb[cond] = np.hstack([c[cond], x[cond], np.zeros_like(x[cond])])
        cond = (1 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 2)
        rgb[cond] = np.hstack([x[cond], c[cond], np.zeros_like(x[cond])])
        cond = (2 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 3)
        rgb[cond] = np.hstack([np.zeros_like(x[cond]), c[cond], x[cond]])
        cond = (3 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 4)
        rgb[cond] = np.hstack([np.zeros_like(x[cond]), x[cond], c[cond]])
        cond = (4 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 5)
        rgb[cond] = np.hstack([x[cond], np.zeros_like(x[cond]), c[cond]])
        cond = (5 <= hsv[:,0]*6%6) & (hsv[:,0]*6%6 < 6)
        rgb[cond] = np.hstack([c[cond], np.zeros_like(x[cond]), x[cond]])
        rgb = (rgb + m)

    # PLY vertex layout: position, (zeroed) normal, uint8 color
    dtype = [
        ("x", "f4"),
        ("y", "f4"),
        ("z", "f4"),
        ("nx", "f4"),
        ("ny", "f4"),
        ("nz", "f4"),
        ("red", "u1"),
        ("green", "u1"),
        ("blue", "u1"),
    ]
    normals = np.zeros_like(xyz)
    elements = np.empty(xyz.shape[0], dtype=dtype)
    attributes = np.concatenate((xyz, normals, rgb * 255), axis=1)
    elements[:] = list(map(tuple, attributes))
    vertex_element = PlyElement.describe(elements, "vertex")
    ply_data = PlyData([vertex_element])
    ply_data.write(path)
flow3r/utils/cropping.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # cropping utilities
6
+ # --------------------------------------------------------
7
+ import PIL.Image
8
+ import os
9
+ os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
10
+ import cv2 # noqa
11
+ import numpy as np # noqa
12
+ try:
13
+ lanczos = PIL.Image.Resampling.LANCZOS
14
+ bicubic = PIL.Image.Resampling.BICUBIC
15
+ except AttributeError:
16
+ lanczos = PIL.Image.LANCZOS
17
+ bicubic = PIL.Image.BICUBIC
18
+
19
+ from utils.basic import colmap_to_opencv_intrinsics, opencv_to_colmap_intrinsics
20
+
21
class ImageList:
    """ Convenience class to aply the same operation to a whole set of images.
    """

    def __init__(self, images):
        # Accept a single image or any tuple/list/set of images; arrays are
        # promoted to PIL images.
        if not isinstance(images, (tuple, list, set)):
            images = [images]
        self.images = [
            img if isinstance(img, PIL.Image.Image) else PIL.Image.fromarray(img)
            for img in images
        ]

    def __len__(self):
        return len(self.images)

    def to_pil(self):
        """Return the single held image, or a tuple when holding several."""
        if len(self.images) > 1:
            return tuple(self.images)
        return self.images[0]

    @property
    def size(self):
        """Common (width, height) of all held images; they must all agree."""
        all_sizes = [im.size for im in self.images]
        assert all(all_sizes[0] == s for s in all_sizes)
        return all_sizes[0]

    def resize(self, *args, **kwargs):
        return ImageList(self._dispatch('resize', *args, **kwargs))

    def crop(self, *args, **kwargs):
        return ImageList(self._dispatch('crop', *args, **kwargs))

    def _dispatch(self, func, *args, **kwargs):
        # Forward the named PIL method to every held image.
        return [getattr(im, func)(*args, **kwargs) for im in self.images]
54
+
55
+
56
def rescale_image_depthmap(image, depthmap, camera_intrinsics, output_resolution, force=True, normal=None, far_mask=None):
    """ Jointly rescale a (image, depthmap) pair -- plus optional normal map and
    far mask -- so that (out_width, out_height) >= output_res.

    Returns:
        (image, depthmap, camera_intrinsics, normal, far_mask)
    """
    image = ImageList(image)
    input_resolution = np.array(image.size)  # (W,H)
    output_resolution = np.array(output_resolution)
    if depthmap is not None:
        # can also use this with masks instead of depthmaps
        assert tuple(depthmap.shape[:2]) == image.size[::-1]

    # define output resolution
    assert output_resolution.shape == (2,)
    scale_final = max(output_resolution / image.size) + 1e-8
    if scale_final >= 1 and not force:  # image is already smaller than what is asked
        # Bug fix: this early exit used to return only 3 values while the
        # normal path returns 5, breaking tuple unpacking at call sites.
        return (image.to_pil(), depthmap, camera_intrinsics, normal, far_mask)
    output_resolution = np.floor(input_resolution * scale_final).astype(int)

    # first rescale the image so that it contains the crop
    image = image.resize(tuple(output_resolution), resample=lanczos if scale_final < 1 else bicubic)
    if depthmap is not None:
        # nearest-neighbour keeps depth values unblended across object edges
        depthmap = cv2.resize(depthmap, output_resolution, fx=scale_final,
                              fy=scale_final, interpolation=cv2.INTER_NEAREST)

    if normal is not None:
        normal = cv2.resize(normal, output_resolution, fx=scale_final,
                            fy=scale_final, interpolation=cv2.INTER_NEAREST)
    if far_mask is not None:
        far_mask = cv2.resize(far_mask, output_resolution, fx=scale_final,
                              fy=scale_final, interpolation=cv2.INTER_NEAREST)

    # no offset here; simple rescaling
    camera_intrinsics = camera_matrix_of_crop(
        camera_intrinsics, input_resolution, output_resolution, scaling=scale_final)

    return image.to_pil(), depthmap, camera_intrinsics, normal, far_mask
92
+
93
def center_crop_image_depthmap(image, depthmap, camera_intrinsics, crop_scale, normal=None, far_mask=None):
    """
    Center-crop an image (plus optional depthmap / normal / far_mask) by a
    given fraction, and shift the camera principal point accordingly.

    Parameters:
    - image: PIL.Image or similar, the input image.
    - depthmap: np.ndarray, the corresponding depth map (or None).
    - camera_intrinsics: np.ndarray, the 3x3 camera intrinsics matrix.
    - crop_scale: float in (0, 1], the fraction of the image to keep.

    Returns:
    - (cropped image, cropped depthmap, adjusted intrinsics, normal, far_mask)
    """
    assert 0 < crop_scale <= 1, "crop_scale must be between 0 and 1"

    image = ImageList(image)
    full_size = np.array(image.size)  # (width, height)
    if depthmap is not None:
        assert depthmap.shape[:2] == tuple(image.size[::-1]), "Depthmap size must match image size"

    # Size kept after cropping, and the effective per-axis crop fraction
    kept_size = np.floor(full_size * crop_scale).astype(int)
    crop_scale = kept_size / full_size

    # Center the crop: split the removed margin evenly on both sides
    left, top = ((full_size - kept_size) / 2).astype(int)
    right = left + kept_size[0]
    bottom = top + kept_size[1]

    image = image.crop((left, top, right, bottom))
    if depthmap is not None:
        depthmap = depthmap[top:bottom, left:right]
    if normal is not None:
        normal = normal[top:bottom, left:right]
    if far_mask is not None:
        far_mask = far_mask[top:bottom, left:right]

    # Cropping only moves the principal point; focal lengths are unchanged
    adjusted_intrinsics = camera_intrinsics.copy()
    adjusted_intrinsics[0, 2] -= left   # cx
    adjusted_intrinsics[1, 2] -= top    # cy

    return image.to_pil(), depthmap, adjusted_intrinsics, normal, far_mask
154
+
155
+
156
def camera_matrix_of_crop(input_camera_matrix, input_resolution, output_resolution, scaling=1, offset_factor=0.5, offset=None):
    """
    Intrinsics of a view that is first scaled by `scaling`, then cropped to
    `output_resolution` with the crop origin at `offset` (default: centered
    via `offset_factor` of the margins).
    """
    # Margins to offset the origin
    margins = np.asarray(input_resolution) * scaling - output_resolution
    assert np.all(margins >= 0.0)
    if offset is None:
        offset = offset_factor * margins

    # Do the scale/offset arithmetic in the COLMAP intrinsics convention,
    # then convert back to OpenCV convention (see the utils.basic helpers).
    colmap_K = opencv_to_colmap_intrinsics(input_camera_matrix)
    colmap_K[:2, :] *= scaling
    colmap_K[:2, 2] -= offset
    return colmap_to_opencv_intrinsics(colmap_K)
170
+
171
+
172
def crop_image_depthmap(image, depthmap, camera_intrinsics, crop_bbox, normal=None, far_mask=None):
    """
    Crop all modalities of a view to `crop_bbox` = (left, top, right, bottom),
    shifting the principal point into the crop's frame.
    """
    l, t, r, b = crop_bbox

    image = ImageList(image).crop((l, t, r, b))
    depthmap = depthmap[t:b, l:r]
    if normal is not None:
        normal = normal[t:b, l:r]
    if far_mask is not None:
        far_mask = far_mask[t:b, l:r]

    # Only the principal point moves under a crop
    camera_intrinsics = camera_intrinsics.copy()
    camera_intrinsics[0, 2] -= l
    camera_intrinsics[1, 2] -= t

    return image.to_pil(), depthmap, camera_intrinsics, normal, far_mask
191
+
192
+
193
def bbox_from_intrinsics_in_out(input_camera_matrix, output_camera_matrix, output_resolution):
    """
    Crop bounding box (l, t, r, b) implied by the principal-point shift
    between an input and an output intrinsics matrix.
    """
    out_width, out_height = output_resolution
    # Top-left corner = rounded difference of the principal points
    left, top = np.int32(np.round(input_camera_matrix[:2, 2] - output_camera_matrix[:2, 2]))
    return (left, top, left + out_width, top + out_height)
flow3r/utils/debug.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import debugpy
4
+ import socket
5
+ import random
6
+
7
def update_vscode_launch_file(host: str, port: int):
    """Write .vscode/launch.json with a debugpy attach configuration for the given host and port."""
    launch_file_path = ".vscode/launch.json"

    # Single attach configuration pointing at the debugpy listener
    attach_config = {
        "name": "bash_debug",
        "type": "debugpy",
        "request": "attach",
        "connect": {
            "host": host,
            "port": port
        },
        "justMyCode": False
    }
    launch_contents = {"version": "0.2.0", "configurations": [attach_config]}

    # Ensure the .vscode directory exists
    os.makedirs(".vscode", exist_ok=True)

    # Write the updated configuration to launch.json
    with open(launch_file_path, "w") as f:
        json.dump(launch_contents, f, indent=4)
    print(f"Updated {launch_file_path} with host: {host} and port: {port}")
35
+
36
def is_port_in_use(host, port):
    """Return True if a TCP connection to (host, port) succeeds, i.e. something is listening there."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 on success instead of raising
        return probe.connect_ex((host, port)) == 0
    finally:
        probe.close()
39
+
40
def setup_debug(is_main_process=True, max_retries=10, port_range=(10000, 20000)):
    """
    Start a debugpy listener and block until a VS Code debugger attaches.

    Picks a random free port in `port_range` (up to `max_retries` attempts),
    writes the matching attach configuration to .vscode/launch.json, then
    waits for the client. Non-main processes return immediately.

    NOTE(review): assumes the SLURM_NODELIST env var is set and that its
    first entry is a resolvable hostname — confirm on the target cluster.

    Raises:
        RuntimeError: if no free port could be bound after `max_retries` tries.
    """
    if is_main_process:
        host = os.environ['SLURM_NODELIST'].split(',')[0]

        for _ in range(max_retries):
            port = random.randint(*port_range)
            try:
                if is_port_in_use(host, port):
                    print(f"Port {port} is already in use, trying another...")
                    continue

                # Update launch.json so VS Code can attach to this host/port
                update_vscode_launch_file(host, port)

                print("master_addr = ", host)
                debugpy.listen((host, port))
                print(f"Waiting for debugger attach at port {port}...", flush=True)
                debugpy.wait_for_client()
                print("Debugger attached", flush=True)
                return
            except Exception as e:
                # Best effort: try another random port on any bind failure
                print(f"Failed to bind to port {port}: {e}")

        raise RuntimeError("Could not find a free port for debugpy after several attempts.")
flow3r/utils/flow_utils.py ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import os
4
+ from PIL import Image
5
+ import flow_vis
6
+ from .geometry import se3_inverse, homogenize_points
7
+ import torch.nn.functional as F
8
+ import matplotlib.pyplot as plt
9
+ import wandb
10
+
11
def warp_image_with_flow(source_image, source_mask, target_image, flow) -> np.ndarray:
    """
    Backward-warp `target_image` into the source frame using per-pixel flow.

    Each source pixel (x, y) samples the target bilinearly (via grid_sample)
    at (x, y) + flow[y, x], clamped to the target bounds. When a mask is
    given, pixels with mask <= 0.5 are zeroed in the result.

    Args:
        source_image: np.ndarray of shape (H, W, ...), used for its spatial size
        source_mask: non-occluded mask in the source frame (H, W), or None
        target_image: np.ndarray of shape (Ht, Wt, C) to sample from
        flow: np.ndarray of shape (H, W, 2), source-to-target displacement

    Returns:
        np.ndarray of shape (H, W, C): target warped into the source frame
    """
    assert flow.shape[-1] == 2

    src_h, src_w = source_image.shape[:2]
    tgt_h, tgt_w = target_image.shape[:2]

    grid_x, grid_y = np.meshgrid(np.arange(src_w), np.arange(src_h))

    # Absolute sampling positions in the target, clamped to its bounds, then
    # shifted by half a pixel and mapped to [-1, 1] for grid_sample
    sample_x = np.clip(grid_x + flow[..., 0], 0, tgt_w - 1) + 0.5
    sample_y = np.clip(grid_y + flow[..., 1], 0, tgt_h - 1) + 0.5
    sample_x = (sample_x / target_image.shape[1]) * 2 - 1
    sample_y = (sample_y / target_image.shape[0]) * 2 - 1

    target_t = torch.from_numpy(target_image).permute(2, 0, 1)[None, ...].float()
    grid_t = torch.from_numpy(np.stack([sample_x, sample_y], axis=-1)).float()[None, ...]
    warped = F.grid_sample(
        target_t,
        grid_t,
        mode="bilinear",
        align_corners=False,
    )
    warped = warped[0].permute(1, 2, 0).numpy()

    if source_mask is not None:
        warped = warped * (source_mask > 0.5)[..., None]

    return warped
60
+
61
def ndc_to_pixel_coords(coords_ndc: torch.Tensor, H: int, W: int) -> torch.Tensor:
    """
    Map NDC coordinates back to pixel coordinates.

    Inverse of the convention where, per axis, NDC +1 corresponds to pixel 0
    and NDC -1 to pixel (size - 1).

    Args:
        coords_ndc: [..., H, W, 2], coordinates in NDC space (x_ndc, y_ndc)
        H, W: image dimensions

    Returns:
        coords_px: [..., H, W, 2], coordinates in pixel space (x_pix, y_pix);
        the input tensor is left untouched.
    """
    coords_px = coords_ndc.clone()

    half_w = max(W - 1, 1) / 2.0
    half_h = max(H - 1, 1) / 2.0

    # x: NDC [1, -1] -> pixel [0, W-1]; y: NDC [1, -1] -> pixel [0, H-1]
    coords_px[..., 0] = (1.0 - coords_ndc[..., 0]) * half_w
    coords_px[..., 1] = (1.0 - coords_ndc[..., 1]) * half_h

    return coords_px
81
+
82
def coords_to_flow(coords: torch.Tensor, H: int, W: int) -> torch.Tensor:
    """
    Turn absolute target coordinates into an optical-flow field.

    The flow at a source pixel is the displacement from that pixel to its
    target coordinate, i.e. ``coords - source_grid``.

    Args:
        coords: [..., H, W, 2] target coordinates (where source pixels land).
        H, W: image dimensions.

    Returns:
        flow: [..., H, W, 2] displacement vectors.
    """
    device = coords.device

    # Build the (x, y) pixel grid of the source image.
    gy, gx = torch.meshgrid(
        torch.arange(H, device=device),
        torch.arange(W, device=device),
        indexing="ij",
    )
    base_grid = torch.stack([gx, gy], dim=-1).float()  # (H, W, 2)

    return coords - base_grid
107
+
108
def flow_to_coords(flow: torch.Tensor, H: int, W: int) -> torch.Tensor:
    """
    Turn an optical-flow field into absolute target coordinates.

    Inverse of ``coords_to_flow``: the target coordinate of a source pixel is
    its grid position plus the flow displacement.

    Args:
        flow: [..., H, W, 2] displacement vectors.
        H, W: image dimensions.

    Returns:
        coords: [..., H, W, 2] absolute pixel positions in the target image.
    """
    device = flow.device

    # Build the (x, y) pixel grid of the source image.
    gy, gx = torch.meshgrid(
        torch.arange(H, device=device),
        torch.arange(W, device=device),
        indexing="ij",
    )
    base_grid = torch.stack([gx, gy], dim=-1).float()  # (H, W, 2)

    return flow + base_grid
133
+
134
def ndc_pixels_to_flow(flow_ndc: torch.Tensor, H: int, W: int) -> torch.Tensor:
    """
    Convert an optical-flow field from NDC units to pixel units.

    PyTorch3D NDC has +x left and +y up, while screen space has +x right and
    +y down, so both axes flip sign in addition to rescaling. This inverts
    ``dx_ndc = -2/(W-1) * dx_pix`` and ``dy_ndc = -2/(H-1) * dy_pix``.

    Args:
        flow_ndc: [..., 2] flow in NDC units (dx_ndc, dy_ndc).
        H, W: image height and width.

    Returns:
        flow_px: [..., 2] flow in pixel units (dx_pix, dy_pix).
    """
    # Per-axis NDC step sizes (guard against degenerate 1-pixel axes).
    step_x = 2.0 / max(W - 1, 1)
    step_y = 2.0 / max(H - 1, 1)

    flow_px = flow_ndc.clone()
    flow_px[..., 0] = -flow_px[..., 0] / step_x  # dx_pix
    flow_px[..., 1] = -flow_px[..., 1] / step_y  # dy_pix
    return flow_px
155
+
156
def coords_pixels_to_ndc(coords_px: torch.Tensor, H: int, W: int) -> torch.Tensor:
    """
    Convert pixel coordinates to PyTorch3D NDC coordinates.

    PyTorch3D convention:
      - NDC: x in [1, -1] (+x left), y in [1, -1] (+y up), origin at center.
      - Pixel: x in [0, W-1] (+x right), y in [0, H-1] (+y down), origin top-left.
    """
    # Pixel 0 maps to NDC 1, pixel W-1 (resp. H-1) maps to NDC -1.
    x_ndc = 1.0 - (coords_px[..., 0] / max(W - 1, 1)) * 2.0
    y_ndc = 1.0 - (coords_px[..., 1] / max(H - 1, 1)) * 2.0
    return torch.stack([x_ndc, y_ndc], dim=-1)
171
+
172
+
173
def batched_pi3_motion_flow(world_points, camera_poses, camera_intrinsics, sampled_pairs, image_size):
    """
    Compute batched motion flow from img1 to img2 using world points and camera poses.

    For each sampled (src, tgt) pair, the source frame's predicted world points
    are reprojected into the target camera, and the flow is that reprojection
    minus the source pixel grid.

    Args:
        world_points: (B, N, H, W, 3) predicted world points per image.
        camera_poses: (B, N, 4, 4) extrinsics for each frame, camera-to-world.
        camera_intrinsics: (B, N, 3, 3) camera intrinsics for each frame.
        sampled_pairs: (B, P, 2) image pairs to compute flow between.
        image_size: indexable; image_size[0] is the (H_img, W_img) of the
            full-resolution image the intrinsics refer to (may differ from the
            H, W of world_points). NOTE(review): the old docstring called this
            an int, but the code unpacks image_size[0] into two values.

    Returns:
        flow: (B, P, H, W, 2) motion flows, (x, y) in pixel coordinates
    """
    B, N, H, W, _ = world_points.shape
    P = sampled_pairs.shape[1]
    device = world_points.device

    # Gather source points: expand the (B, P) source indices so torch.gather
    # selects full (H, W, 3) point maps along the frame dimension.
    src_idx = sampled_pairs[..., 0]  # (B, P)
    src_idx_exp = src_idx.view(B, P, 1, 1, 1).expand(B, P, H, W, 3)
    src_points = torch.gather(world_points, 1, src_idx_exp)

    # Gather target poses and intrinsics the same way.
    tgt_idx = sampled_pairs[..., 1]  # (B, P)

    tgt_poses = torch.gather(camera_poses, 1, tgt_idx.view(B, P, 1, 1).expand(B, P, 4, 4))
    tgt_intrinsics = torch.gather(camera_intrinsics, 1, tgt_idx.view(B, P, 1, 1).expand(B, P, 3, 3))

    # Transform world points into the target camera frame: P_cam = T_w2c @ P_world.
    w2c_tgt = se3_inverse(tgt_poses)
    src_points_homo = homogenize_points(src_points)

    # (B, P, 4, 4) @ (B, P, H, W, 4) -> (B, P, H, W, 4); keep xyz only.
    pts_cam = torch.einsum('bpij,bphwj->bphwi', w2c_tgt, src_points_homo)[..., :3]

    # Project onto the image plane: P_img = K @ P_cam.
    # (B, P, 3, 3) @ (B, P, H, W, 3) -> (B, P, H, W, 3)
    pts_img = torch.einsum('bpij,bphwj->bphwi', tgt_intrinsics, pts_cam)

    # Perspective divide (epsilon guards against zero depth).
    uv_tgt = pts_img[..., :2] / (pts_img[..., 2:3] + 1e-6)

    # Source pixel coordinates: the (H, W) grid may be coarser than the image
    # the intrinsics refer to, so scale grid centers into image coordinates.
    H_img, W_img = image_size[0]

    scale_h = H_img / H
    scale_w = W_img / W

    y, x = torch.meshgrid(
        torch.arange(H, device=device, dtype=torch.float32),
        torch.arange(W, device=device, dtype=torch.float32),
        indexing='ij'
    )

    # Map grid to image coordinates (assuming center of pixels/patches).
    uv_src = torch.stack([
        (x + 0.5) * scale_w - 0.5,
        (y + 0.5) * scale_h - 0.5
    ], dim=-1)  # (H, W, 2)

    uv_src = uv_src.view(1, 1, H, W, 2).expand(B, P, -1, -1, -1)

    return uv_tgt - uv_src
244
+
245
+
246
def visualize_flow(pred_motion_coords, motion_coords, covis_masks, sampled_pairs, images, pred_pi3_flow, iteration, accelerator, dataset_names):
    """
    Render side-by-side flow visualizations (GT vs predicted vs pi3) with
    AEPE metrics, save them to disk, and log them to wandb.

    Only the first two pairs of each batch element are visualized.

    Args:
        pred_motion_coords: (B, P, H, W, 2) predicted target coords in NDC.
        motion_coords: (B, P, H, W, 2) ground-truth target coords in NDC.
        covis_masks: (B, P, H, W) covisibility masks.
        sampled_pairs: (B, P, 2) frame-index pairs.
        images: (B, N, 3, H, W) input images in [0, 1].
        pred_pi3_flow: (B, P, H, W, 2) flow derived from pi3 cameras/points, in pixels.
        iteration: training step (used in filenames and wandb step).
        accelerator: accelerate object used for wandb logging.
        dataset_names: per-batch-element dataset labels.
    """
    # NOTE(review): hard-coded cluster output path — parameterize before reuse.
    path = f"/ocean/projects/cis250013p/zcong/pi3/outputs/flow_vis/{iteration}"
    if not os.path.exists(path):
        os.makedirs(path)

    with torch.no_grad():
        # Get dimensions
        B, num_pairs = sampled_pairs.shape[0], sampled_pairs.shape[1]
        H, W = motion_coords[0, 0].shape[0], motion_coords[0, 0].shape[1]

        # Process all pairs for all batches
        for batch_idx in range(B):
            dataset_name = dataset_names[batch_idx]
            for pair_idx in range(num_pairs):
                # Only the first two pairs are visualized to bound the cost.
                if pair_idx > 1: break
                # Get pair indices
                pairs = sampled_pairs[batch_idx, pair_idx].cpu().numpy()  # (2,)
                img1 = images[batch_idx, pairs[0]].cpu().numpy()
                img2 = images[batch_idx, pairs[1]].cpu().numpy()

                # Convert ground truth coordinates to flow (NDC -> pixels -> flow).
                gt_coords_ndc = motion_coords[batch_idx, pair_idx]  # NDC coordinates
                gt_coords_pixel = ndc_to_pixel_coords(gt_coords_ndc, H, W)  # Convert to pixel coordinates
                flow_tensor = coords_to_flow(gt_coords_pixel, H, W).float().cpu()  # (H, W, 2)
                flow = flow_tensor.numpy()  # (H, W, 2)

                covis_mask = covis_masks[batch_idx, pair_idx].float().cpu().numpy()  # (H, W)
                masked_flow = flow * covis_mask[..., None]

                # Convert predicted coordinates to flow the same way.
                pred_coords_ndc = pred_motion_coords[batch_idx, pair_idx]  # NDC coordinates
                pred_coords_pixel = ndc_to_pixel_coords(pred_coords_ndc, H, W)  # Convert to pixel coordinates
                pred_flow = coords_to_flow(pred_coords_pixel, H, W).float().cpu().numpy()  # (H, W, 2)
                masked_pred_flow = pred_flow * covis_mask[..., None]

                # pi3 flow is already in pixel units.
                pi3_flow = pred_pi3_flow[batch_idx, pair_idx].float().cpu().numpy()  # (H, W, 2)
                masked_pi3_flow = pi3_flow * covis_mask[..., None]

                # Warp img1 to img2 using each flow (GT, predicted, pi3).
                img1_np = np.transpose(img1, (1, 2, 0))  # [H, W, 3]
                img2_np = np.transpose(img2, (1, 2, 0))  # [H, W, 3]
                warped_img_gt = warp_image_with_flow(img1_np, covis_mask, img2_np, flow)
                warped_img_gt = warped_img_gt.clip(0, 1)
                warped_img_gt = Image.fromarray((warped_img_gt * 255).astype(np.uint8))
                # compute prediction warping
                warped_img_pred = warp_image_with_flow(img1_np, covis_mask, img2_np, pred_flow)
                warped_img_pred = warped_img_pred.clip(0, 1)
                warped_img_pred = Image.fromarray((warped_img_pred * 255).astype(np.uint8))
                # compute pi3 warping
                warped_img_pi3 = warp_image_with_flow(img1_np, covis_mask, img2_np, pi3_flow)
                warped_img_pi3 = warped_img_pi3.clip(0, 1)
                warped_img_pi3 = Image.fromarray((warped_img_pi3 * 255).astype(np.uint8))

                # Input images as PIL for the grid figure.
                img_array1 = np.transpose(img1, (1, 2, 0))
                img1_pil = Image.fromarray((img_array1 * 255).astype(np.uint8))
                img_array2 = np.transpose(img2, (1, 2, 0))
                img2_pil = Image.fromarray((img_array2 * 255).astype(np.uint8))

                # Calculate AEPE metrics on covisible pixels only.
                valid_mask = covis_mask > 0
                if np.sum(valid_mask) > 0:
                    # AEPE for predicted flow vs GT flow
                    flow_diff_pred = np.sqrt(np.sum((masked_pred_flow - masked_flow) ** 2, axis=-1))
                    aepe_pred = np.mean(flow_diff_pred[valid_mask])
                    aepe_5px_pred = np.mean(flow_diff_pred[valid_mask] < 5.0) * 100  # percentage
                    # AEPE for pi3 flow vs GT flow
                    flow_diff_pi3 = np.sqrt(np.sum((masked_pi3_flow - masked_flow) ** 2, axis=-1))
                    aepe_pi3 = np.mean(flow_diff_pi3[valid_mask])
                    aepe_5px_pi3 = np.mean(flow_diff_pi3[valid_mask] < 5.0) * 100  # percentage
                else:
                    # No covisible pixels: report degenerate metrics.
                    aepe_pred = float('inf')
                    aepe_5px_pred = 0.0
                    aepe_pi3 = float('inf')
                    aepe_5px_pi3 = 0.0

                # Color-code the three flow fields.
                flow_vis_image_gt = flow_vis.flow_to_color(masked_flow)
                flow_pil = Image.fromarray(flow_vis_image_gt.astype(np.uint8))
                flow_vis_image_pred = flow_vis.flow_to_color(masked_pred_flow)
                flow_pred_pil = Image.fromarray(flow_vis_image_pred.astype(np.uint8))
                flow_vis_image_pi3 = flow_vis.flow_to_color(masked_pi3_flow)
                flow_pi3_pil = Image.fromarray(flow_vis_image_pi3.astype(np.uint8))

                # Bundle metrics for the grid renderer.
                metrics_text = {
                    'pred_aepe': aepe_pred,
                    'pred_5px_pct': aepe_5px_pred,
                    'pi3_aepe': aepe_pi3,
                    'pi3_5px_pct': aepe_5px_pi3,
                    'covis_ratio': float(np.mean(covis_mask)) * 100,
                    'pairs': pairs,
                    'dataset': dataset_name,
                }

                # Save individual visualization and log to wandb
                save_path = os.path.join(path, f"motion_flow_grid_batch_{batch_idx}_pair_{pair_idx}_imgs_{pairs[0]}_{pairs[1]}_iter_{iteration:08d}.png")
                visualize_motion_grid_nodepth_with_metrics(
                    img1_pil, img2_pil, flow_pil, flow_pred_pil, flow_pi3_pil,
                    warped_img_gt, warped_img_pred, warped_img_pi3,
                    metrics_text,
                    save_path=save_path,
                    pair_idx = pair_idx,
                    step=iteration,
                    log_to_wandb=True,  # We'll handle wandb logging separately
                    accelerator=accelerator,
                    dataset_name=dataset_name
                )
358
+
359
def visualize_motion_grid_nodepth_with_metrics(img1, img2, flow_pil, flow_pred_pil, flow_pi3_pil, warped_img_gt, warped_img_pred, warped_img_pi3, metrics_text, pair_idx, save_path="motion_flow_grid.png", step=None, log_to_wandb=True, accelerator=None, dataset_name=None):
    """
    Lay out a 3x3 figure: input images + metric text (row 0), GT/pred/pi3 flow
    colorings (row 1), GT/pred/pi3 warped images (row 2). Saves to save_path
    and optionally logs the saved PNG to wandb through `accelerator`.

    Args:
        img1, img2: PIL input images.
        flow_pil, flow_pred_pil, flow_pi3_pil: PIL flow colorings.
        warped_img_gt, warped_img_pred, warped_img_pi3: PIL warped images.
        metrics_text: dict with keys 'pred_aepe', 'pred_5px_pct', 'pi3_aepe',
            'pi3_5px_pct', 'covis_ratio', 'pairs', 'dataset'.
        pair_idx: index used in the wandb key.
        save_path: output PNG path.
        step: wandb step.
        log_to_wandb: if True, requires a non-None accelerator.
        accelerator: accelerate object with a .log method.
        dataset_name: unused here (kept for interface compatibility).
    """
    fig, axes = plt.subplots(3, 3, figsize=(20, 16))

    # Row 0: the two input images.
    axes[0, 0].imshow(img1)
    axes[0, 0].set_title(f"Image {metrics_text['pairs'][0]}")
    axes[0, 0].axis("off")

    axes[0, 1].imshow(img2)
    axes[0, 1].set_title(f"Image {metrics_text['pairs'][1]}")
    axes[0, 1].axis("off")

    # Row 0, col 2: metrics rendered as text (blue = pred, red = pi3).
    axes[0, 2].text(0.1, 0.9, f"{metrics_text['dataset']} Pair: {metrics_text['pairs'][0]} → {metrics_text['pairs'][1]}",
                    fontsize=14, fontweight='bold', transform=axes[0, 2].transAxes)
    axes[0, 2].text(0.1, 0.8, f"Covis Ratio: {metrics_text['covis_ratio']:.1f}%",
                    fontsize=12, transform=axes[0, 2].transAxes)
    axes[0, 2].text(0.1, 0.7, "Pred Flow Metrics:",
                    fontsize=12, fontweight='bold', color='blue', transform=axes[0, 2].transAxes)
    axes[0, 2].text(0.1, 0.6, f"AEPE: {metrics_text['pred_aepe']:.3f}",
                    fontsize=11, color='blue', transform=axes[0, 2].transAxes)
    axes[0, 2].text(0.1, 0.5, f"<5px: {metrics_text['pred_5px_pct']:.1f}%",
                    fontsize=11, color='blue', transform=axes[0, 2].transAxes)
    axes[0, 2].text(0.1, 0.4, "Pi3 Flow Metrics:",
                    fontsize=12, fontweight='bold', color='red', transform=axes[0, 2].transAxes)
    axes[0, 2].text(0.1, 0.3, f"AEPE: {metrics_text['pi3_aepe']:.3f}",
                    fontsize=11, color='red', transform=axes[0, 2].transAxes)
    axes[0, 2].text(0.1, 0.2, f"<5px: {metrics_text['pi3_5px_pct']:.1f}%",
                    fontsize=11, color='red', transform=axes[0, 2].transAxes)
    axes[0, 2].set_xlim(0, 1)
    axes[0, 2].set_ylim(0, 1)
    axes[0, 2].axis("off")

    # Row 1: GT flow, predicted flow, pi3 flow.
    axes[1, 0].imshow(flow_pil)
    axes[1, 0].set_title("GT Motion Flow")
    axes[1, 0].axis("off")

    axes[1, 1].imshow(flow_pred_pil)
    axes[1, 1].set_title(f"Predicted Flow\nAEPE: {metrics_text['pred_aepe']:.3f}, <5px: {metrics_text['pred_5px_pct']:.1f}%")
    axes[1, 1].axis("off")

    axes[1, 2].imshow(flow_pi3_pil)
    axes[1, 2].set_title(f"Pi3 Flow\nAEPE: {metrics_text['pi3_aepe']:.3f}, <5px: {metrics_text['pi3_5px_pct']:.1f}%")
    axes[1, 2].axis("off")

    # Row 2: warped images under each flow.
    axes[2, 0].imshow(warped_img_gt)
    axes[2, 0].set_title("GT Warped Image")
    axes[2, 0].axis("off")

    axes[2, 1].imshow(warped_img_pred)
    axes[2, 1].set_title("Pred Warped Image")
    axes[2, 1].axis("off")

    axes[2, 2].imshow(warped_img_pi3)
    axes[2, 2].set_title("PI3 Warped Image")
    axes[2, 2].axis("off")

    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    if log_to_wandb:
        accelerator.log({f"Visualization_{pair_idx}": wandb.Image(save_path)}, step=step)
    plt.close()
423
+
424
def calculate_flow_metrics(pred_motion_coords, motion_coords, covis_masks, sampled_pairs, pred_pi3_flow):
    """
    Average end-point-error statistics over every (batch, pair) flow.

    Args:
        pred_motion_coords: (B, P, H, W, 2) predicted target coords in NDC.
        motion_coords: (B, P, H, W, 2) ground-truth target coords in NDC.
        covis_masks: (B, P, H, W) covisibility masks.
        sampled_pairs: (B, P, 2) frame-index pairs (only shape is used here).
        pred_pi3_flow: (B, P, H, W, 2) pi3-derived flow in pixel units.

    Returns:
        Tuple of (pred AEPE, pred <5px %, pi3 AEPE, pi3 <5px %), each averaged
        over all pairs. Pairs with no covisible pixels contribute inf / 0.
    """
    with torch.no_grad():
        B, num_pairs = sampled_pairs.shape[0], sampled_pairs.shape[1]
        H, W = motion_coords[0, 0].shape[0], motion_coords[0, 0].shape[1]
        pred_aepes, pred_inliers, pi3_aepes, pi3_inliers = [], [], [], []

        for b in range(B):
            for p in range(num_pairs):
                # GT: NDC coords -> pixel coords -> flow field.
                gt_px = ndc_to_pixel_coords(motion_coords[b, p], H, W)
                gt_flow = coords_to_flow(gt_px, H, W).float().cpu().numpy()

                mask = covis_masks[b, p].float().cpu().numpy()  # (H, W)
                weight = mask[..., None]

                # Predicted: same NDC -> pixel -> flow conversion.
                pred_px = ndc_to_pixel_coords(pred_motion_coords[b, p], H, W)
                pred_flow = coords_to_flow(pred_px, H, W).float().cpu().numpy()

                # pi3 flow is already in pixel units.
                pi3_flow = pred_pi3_flow[b, p].float().cpu().numpy()

                valid = mask > 0
                if np.sum(valid) == 0:
                    # No covisible pixels: degenerate metrics for this pair.
                    pred_aepes.append(float('inf'))
                    pred_inliers.append(0.0)
                    pi3_aepes.append(float('inf'))
                    pi3_inliers.append(0.0)
                    continue

                # Per-pixel end-point error on masked flows, then restrict to
                # covisible pixels before averaging.
                epe_pred = np.linalg.norm(pred_flow * weight - gt_flow * weight, axis=-1)[valid]
                epe_pi3 = np.linalg.norm(pi3_flow * weight - gt_flow * weight, axis=-1)[valid]

                pred_aepes.append(np.mean(epe_pred))
                pred_inliers.append(np.mean(epe_pred < 5.0) * 100)
                pi3_aepes.append(np.mean(epe_pi3))
                pi3_inliers.append(np.mean(epe_pi3 < 5.0) * 100)

        return np.mean(pred_aepes), np.mean(pred_inliers), np.mean(pi3_aepes), np.mean(pi3_inliers)
flow3r/utils/geometry.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.functional as F
4
+
5
def se3_inverse(T):
    """
    Invert (batched) SE(3) transforms.

    Accepts torch tensors or numpy arrays of shape (..., 4, 4); the inverse of
    [R | t] is [R^T | -R^T t], computed without a general matrix inverse.
    """
    if torch.is_tensor(T):
        rot_t = T[..., :3, :3].transpose(-2, -1)
        trans = T[..., :3, 3].unsqueeze(-1)
        top = torch.cat([rot_t, -torch.matmul(rot_t, trans)], dim=-1)
        # Constant [0, 0, 0, 1] row, broadcast over the batch dims.
        bottom = torch.tensor([0, 0, 0, 1], device=T.device, dtype=T.dtype).repeat(*T.shape[:-2], 1, 1)
        return torch.cat([top, bottom], dim=-2)

    rot_t = np.swapaxes(T[..., :3, :3], -2, -1)
    trans = T[..., :3, 3, np.newaxis]
    top = np.concatenate([rot_t, -rot_t @ trans], axis=-1)

    bottom = np.zeros((*T.shape[:-2], 1, 4), dtype=T.dtype)
    bottom[..., :, 3] = 1
    return np.concatenate([top, bottom], axis=-2)
33
+
34
def get_pixel(H, W):
    """
    Homogeneous pixel-center coordinates for an H x W image.

    Returns:
        (3, H*W) array whose rows are (u + 0.5, v + 0.5, 1), pixels in
        row-major order.
    """
    cols, rows = np.meshgrid(np.arange(W), np.arange(H))
    u = cols.flatten() + 0.5
    v = rows.flatten() + 0.5
    return np.stack([u, v, np.ones_like(cols.flatten())], axis=0)
46
+
47
def depthmap_to_absolute_camera_coordinates(depthmap, camera_intrinsics, camera_pose, z_far=0, **kw):
    """
    Back-project a depth map into world coordinates.

    Args:
        depthmap: (H, W) array of depths.
        camera_intrinsics: 3x3 intrinsics matrix.
        camera_pose: 4x4 (or 4x3) cam2world matrix, or None to stay in the
            camera frame.
        z_far: if > 0, depths >= z_far are marked invalid.

    Returns:
        (X_world, valid_mask): (H, W, 3) point map and (H, W) bool mask of
        valid pixels.
    """
    X_cam, valid_mask = depthmap_to_camera_coordinates(depthmap, camera_intrinsics)
    if z_far > 0:
        valid_mask = valid_mask & (depthmap < z_far)

    if camera_pose is None:
        return X_cam, valid_mask

    # Rotate + translate camera-frame points into world coordinates.
    R_cam2world = camera_pose[:3, :3]
    t_cam2world = camera_pose[:3, 3]
    X_world = np.einsum("ik, vuk -> vui", R_cam2world, X_cam) + t_cam2world[None, None, :]
    return X_world, valid_mask
70
+
71
+
72
def depthmap_to_camera_coordinates(depthmap, camera_intrinsics, pseudo_focal=None):
    """
    Back-project a depth map into the camera frame (pinhole model, skew
    terms assumed zero).

    Args:
        depthmap: (H, W) depth values.
        camera_intrinsics: 3x3 intrinsics matrix.
        pseudo_focal: optional (H, W) per-pixel focal length overriding the
            intrinsics' focal entries.

    Returns:
        (X_cam, valid_mask): (H, W, 3) float32 point map and (H, W) bool mask
        marking pixels with depth > 0.
    """
    K = np.float32(camera_intrinsics)
    H, W = depthmap.shape

    if pseudo_focal is None:
        fu, fv = K[0, 0], K[1, 1]
    else:
        assert pseudo_focal.shape == (H, W)
        fu = fv = pseudo_focal
    cu, cv = K[0, 2], K[1, 2]

    # Ray through each pixel scaled by its depth.
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    pts = np.empty((H, W, 3), dtype=np.float32)
    pts[..., 0] = (u - cu) * depthmap / fu
    pts[..., 1] = (v - cv) * depthmap / fv
    pts[..., 2] = depthmap

    return pts, depthmap > 0.0
107
+
108
def homogenize_points(points):
    """Append a homogeneous coordinate: (..., xyz) -> (..., xyz1)."""
    ones = torch.ones_like(points[..., :1])
    return torch.cat((points, ones), dim=-1)
113
+
114
+
115
def get_gt_warp(depth1, depth2, T_1to2, K1, K2, depth_interpolation_mode = 'bilinear', relative_depth_error_threshold = 0.05, H = None, W = None):
    """
    Compute the ground-truth warp field from image 1 to image 2 by
    depth-reprojecting a normalized pixel grid through warp_kpts.

    Args:
        depth1, depth2: (B, H, W) depth maps.
        T_1to2: (B, 4, 4) or (B, 3, 4) relative transform from cam 1 to cam 2
            (warp_kpts only reads the top 3x4 block).
        K1, K2: (B, 3, 3) intrinsics.
        depth_interpolation_mode: grid_sample mode forwarded to warp_kpts.
        relative_depth_error_threshold: depth-consistency tolerance.
        H, W: output grid size; defaults to depth1's resolution.

    Returns:
        x2: (B, H, W, 2) warped coordinates in normalized [-1, 1] space.
        prob: (B, H, W) float validity (covisible + depth-consistent) mask.
    """
    if H is None:
        B,H,W = depth1.shape
    else:
        B = depth1.shape[0]
    with torch.no_grad():
        # Pixel-center grid in [-1+1/n, 1-1/n]; the linspace over the batch
        # dimension is discarded — only the y (index 1) and x (index 2) grids
        # are kept below.
        x1_n = torch.meshgrid(
            *[
                torch.linspace(
                    -1 + 1 / n, 1 - 1 / n, n, device=depth1.device
                )
                for n in (B, H, W)
            ],
            indexing = 'ij'
        )
        x1_n = torch.stack((x1_n[2], x1_n[1]), dim=-1).reshape(B, H * W, 2)
        # Warp in double precision for numerical stability.
        mask, x2 = warp_kpts(
            x1_n.double(),
            depth1.double(),
            depth2.double(),
            T_1to2.double(),
            K1.double(),
            K2.double(),
            depth_interpolation_mode = depth_interpolation_mode,
            relative_depth_error_threshold = relative_depth_error_threshold,
        )
        prob = mask.float().reshape(B, H, W)
        x2 = x2.reshape(B, H, W, 2)
    return x2, prob
145
+
146
@torch.no_grad()
def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1, smooth_mask = False, return_relative_depth_error = False, depth_interpolation_mode = "bilinear", relative_depth_error_threshold = 0.05):
    """Warp kpts0 from I0 to I1 with depth, K and Rt.

    Also check covisibility and depth consistency (relative depth error below
    `relative_depth_error_threshold`).
    Adapted from LoFTR:
    https://github.com/zju3dv/LoFTR/blob/94e98b695be18acb43d5d3250f52226a8e36f839/src/loftr/utils/geometry.py

    Args:
        kpts0 (torch.Tensor): [N, L, 2] - <x, y>, normalized in (-1, 1)
        depth0 (torch.Tensor): [N, H, W]
        depth1 (torch.Tensor): [N, H, W]
        T_0to1 (torch.Tensor): [N, 3, 4] (or [N, 4, 4]; only :3 rows used)
        K0 (torch.Tensor): [N, 3, 3]
        K1 (torch.Tensor): [N, 3, 3]
        smooth_mask: if truthy, return a soft exp(-err/smooth_mask) mask
            instead of a hard threshold.
        return_relative_depth_error: return the raw error instead of the mask.
        depth_interpolation_mode: "bilinear", "nearest-exact", or "combined".
    Returns:
        calculable_mask (torch.Tensor): [N, L]
        warped_keypoints0 (torch.Tensor): [N, L, 2] <x0_hat, y1_hat>, normalized
    """
    (
        n,
        h,
        w,
    ) = depth0.shape
    if depth_interpolation_mode == "combined":
        # Fill holes from bilinear interpolation with nearest-neighbour
        # interpolation (inloc-inspired): run both modes and merge.
        if smooth_mask:
            raise NotImplementedError("Combined bilinear and NN warp not implemented")
        valid_bilinear, warp_bilinear = warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1,
                      smooth_mask = smooth_mask,
                      return_relative_depth_error = return_relative_depth_error,
                      depth_interpolation_mode = "bilinear",
                      relative_depth_error_threshold = relative_depth_error_threshold)
        valid_nearest, warp_nearest = warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1,
                      smooth_mask = smooth_mask,
                      return_relative_depth_error = return_relative_depth_error,
                      depth_interpolation_mode = "nearest-exact",
                      relative_depth_error_threshold = relative_depth_error_threshold)
        nearest_valid_bilinear_invalid = (~valid_bilinear).logical_and(valid_nearest)
        warp = warp_bilinear.clone()
        warp[nearest_valid_bilinear_invalid] = warp_nearest[nearest_valid_bilinear_invalid]
        valid = valid_bilinear | valid_nearest
        return valid, warp

    # Sample source depth at the (normalized) keypoint locations.
    kpts0_depth = F.grid_sample(depth0[:, None], kpts0[:, :, None], mode = depth_interpolation_mode, align_corners=False)[
        :, 0, :, 0
    ]
    kpts0 = torch.stack(
        (w * (kpts0[..., 0] + 1) / 2, h * (kpts0[..., 1] + 1) / 2), dim=-1
    )  # [-1+1/h, 1-1/h] -> [0.5, h-0.5]
    # Keypoints with non-positive depth cannot be warped.
    nonzero_mask = kpts0_depth > 0

    # Unproject to camera-0 coordinates.
    kpts0_h = (
        torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], dim=-1)
        * kpts0_depth[..., None]
    )  # (N, L, 3)
    kpts0_n = K0.inverse() @ kpts0_h.transpose(2, 1)  # (N, 3, L)
    kpts0_cam = kpts0_n

    # Rigid transform into camera-1 coordinates.
    w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, [3]]  # (N, 3, L)
    w_kpts0_depth_computed = w_kpts0_cam[:, 2, :]

    # Project onto image 1.
    w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1)  # (N, L, 3)
    w_kpts0 = w_kpts0_h[:, :, :2] / (
        w_kpts0_h[:, :, [2]] + 1e-4
    )  # (N, L, 2), +1e-4 to avoid zero depth

    # Covisibility: projected point must land strictly inside image 1.
    h, w = depth1.shape[1:3]
    covisible_mask = (
        (w_kpts0[:, :, 0] > 0)
        * (w_kpts0[:, :, 0] < w - 1)
        * (w_kpts0[:, :, 1] > 0)
        * (w_kpts0[:, :, 1] < h - 1)
    )
    w_kpts0 = torch.stack(
        (2 * w_kpts0[..., 0] / w - 1, 2 * w_kpts0[..., 1] / h - 1), dim=-1
    )  # from [0.5, h-0.5] -> [-1+1/h, 1-1/h]

    # Depth consistency: compare projected depth against depth1 at the
    # warped location.
    w_kpts0_depth = F.grid_sample(
        depth1[:, None], w_kpts0[:, :, None], mode=depth_interpolation_mode, align_corners=False
    )[:, 0, :, 0]

    relative_depth_error = (
        (w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth
    ).abs()
    if not smooth_mask:
        consistent_mask = relative_depth_error < relative_depth_error_threshold
    else:
        consistent_mask = (-relative_depth_error/smooth_mask).exp()
    valid_mask = nonzero_mask * covisible_mask * consistent_mask
    if return_relative_depth_error:
        return relative_depth_error, w_kpts0
    else:
        return valid_mask, w_kpts0
247
+
248
+
249
def geotrf(Trf, pts, ncol=None, norm=False):
    """ Apply a geometric transformation to a list of 3-D points.

    Trf: 3x3 or 4x4 projection matrix (typically a Homography), possibly batched.
    pts: numpy/torch/tuple of coordinates. Shape must be (..., 2) or (..., 3).

    ncol: int. number of columns of the result (2 or 3).
    norm: float. if != 0, the result is projected on the z=norm plane.

    Returns an array of projected points with ncol columns, same leading
    shape as pts.
    """
    assert Trf.ndim >= 2
    # Coerce pts to match Trf's array library / dtype.
    if isinstance(Trf, np.ndarray):
        pts = np.asarray(pts)
    elif isinstance(Trf, torch.Tensor):
        pts = torch.as_tensor(pts, dtype=Trf.dtype)

    # adapt shape if necessary
    output_reshape = pts.shape[:-1]
    ncol = ncol or pts.shape[-1]

    # Fast path: batched torch Trf (B, d, d) applied to image-shaped points
    # (B, H, W, d) via a single einsum.
    if (isinstance(Trf, torch.Tensor) and isinstance(pts, torch.Tensor) and
            Trf.ndim == 3 and pts.ndim == 4):
        d = pts.shape[3]
        if Trf.shape[-1] == d:
            pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts)
        elif Trf.shape[-1] == d + 1:
            # Homogeneous transform: rotate/scale then add translation column.
            pts = torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts) + Trf[:, None, None, :d, d]
        else:
            raise ValueError(f'bad shape, not ending with 3 or 4, for {pts.shape=}')
    else:
        # Generic path: flatten batch dims and dispatch on shape compatibility.
        if Trf.ndim >= 3:
            n = Trf.ndim - 2
            assert Trf.shape[:n] == pts.shape[:n], 'batch size does not match'
            Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])

            if pts.ndim > Trf.ndim:
                # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
                pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
            elif pts.ndim == 2:
                # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
                pts = pts[:, None, :]

        if pts.shape[-1] + 1 == Trf.shape[-1]:
            # Homogeneous case: multiply by the transposed linear part and add
            # the translation row.
            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
            pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
        elif pts.shape[-1] == Trf.shape[-1]:
            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
            pts = pts @ Trf
        else:
            pts = Trf @ pts.T
            if pts.ndim >= 2:
                pts = pts.swapaxes(-1, -2)

    if norm:
        pts = pts / pts[..., -1:]  # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
        if norm != 1:
            pts *= norm

    res = pts[..., :ncol].reshape(*output_reshape, ncol)
    return res
311
+
312
+
313
def inv(mat):
    """Invert a square matrix given as a torch tensor or a numpy array."""
    if isinstance(mat, np.ndarray):
        return np.linalg.inv(mat)
    if isinstance(mat, torch.Tensor):
        return torch.linalg.inv(mat)
    raise ValueError(f'bad matrix type = {type(mat)}')
321
+
322
def opencv_camera_to_plucker(poses, K, H, W):
    """
    Build per-pixel Plücker ray encodings for OpenCV-convention cameras.

    Args:
        poses: (B, 4, 4) camera-to-world extrinsics.
        K: (B, 3, 3) intrinsics.
        H, W: image dimensions.

    Returns:
        (B, H, W, 6) tensor of [unit ray direction, origin x direction].
    """
    device = poses.device
    B = poses.shape[0]

    # Homogeneous pixel centers, unprojected through K^-1 and rotated into
    # the world frame to get per-pixel ray directions.
    pixel = torch.from_numpy(get_pixel(H, W).astype(np.float32)).to(device).T.reshape(H, W, 3)[None].repeat(B, 1, 1, 1)  # (B, H, W, 3)
    pixel = torch.einsum('bij, bhwj -> bhwi', torch.inverse(K), pixel)
    ray_directions = torch.einsum('bij, bhwj -> bhwi', poses[..., :3, :3], pixel)

    # All rays of a camera share its world-space center as origin.
    ray_origins = poses[..., :3, 3][:, None, None].repeat(1, H, W, 1)

    # Normalize directions, then form the Plücker moment o x d.
    ray_directions = ray_directions / ray_directions.norm(dim=-1, keepdim=True)
    plucker_normal = torch.cross(ray_origins, ray_directions, dim=-1)
    plucker_ray = torch.cat([ray_directions, plucker_normal], dim=-1)

    return plucker_ray
337
+
338
+
339
def depth_edge(depth: torch.Tensor, atol: float = None, rtol: float = None, kernel_size: int = 3, mask: torch.Tensor = None) -> torch.BoolTensor:
    """
    Detect depth discontinuities: pixels whose neighborhood spans a large
    depth range (local max - local min exceeds the tolerance).

    Args:
        depth (torch.Tensor): shape (..., height, width), linear depth map
        atol (float): absolute tolerance on the local depth range
        rtol (float): relative tolerance (range / depth)
        kernel_size (int): neighborhood window size
        mask (torch.Tensor): optional (..., height, width) validity mask

    Returns:
        edge (torch.Tensor): shape (..., height, width) of dtype torch.bool
    """
    orig_shape = depth.shape
    pad = kernel_size // 2
    flat = depth.reshape(-1, 1, *orig_shape[-2:])

    # Local max and min via max-pooling (min = -maxpool(-x)); invalid pixels
    # are filled with -inf so they never win either pool.
    if mask is None:
        local_max = F.max_pool2d(flat, kernel_size, stride=1, padding=pad)
        local_min = -F.max_pool2d(-flat, kernel_size, stride=1, padding=pad)
    else:
        valid = mask.reshape(-1, 1, *orig_shape[-2:])
        local_max = F.max_pool2d(torch.where(valid, flat, -torch.inf), kernel_size, stride=1, padding=pad)
        local_min = -F.max_pool2d(torch.where(valid, -flat, -torch.inf), kernel_size, stride=1, padding=pad)
    spread = local_max - local_min

    edge = torch.zeros_like(flat, dtype=torch.bool)
    if atol is not None:
        edge |= spread > atol
    if rtol is not None:
        edge |= (spread / flat).nan_to_num_() > rtol
    return edge.reshape(*orig_shape)
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.5.1
2
+ torchvision==0.20.1
3
+ numpy==1.26.4
4
+ pillow
5
+ opencv-python
6
+ plyfile
7
+ huggingface_hub
8
+ safetensors
9
+
10
+ # below for gradio
11
+ gradio
12
+ trimesh
13
+ matplotlib
14
+ scipy
15
+ spaces