diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e6bcb5642a2cfb6fa7eaffb79ee6188aa2161a41 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +private/ +__pycache__/ +test.ipynb +build/ \ No newline at end of file diff --git a/app.py b/app.py index 04cc31aa8d0e06aeaac3b59bb361ed71d831e43f..77700b088ea0660fbcc83e45695c542df0df1522 100644 --- a/app.py +++ b/app.py @@ -1,7 +1,504 @@ +import spaces import gradio as gr +import torch +import numpy as np +from PIL import Image, ImageDraw +import cv2 +import os +import copy +import json +import logging +import sys -def greet(name): - return "Hello " + name + "!!" +# --- Logging Configuration --- +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + handlers=[ + logging.StreamHandler(sys.stdout) + ] +) +logger = logging.getLogger(__name__) -demo = gr.Interface(fn=greet, inputs="text", outputs="text") -demo.launch() +# Import functions from your engine.py +from engine import ( + get_predictors, + get_views, + prepare_sam2_inputs, + must3r_features_and_output, + get_single_frame_mask, + get_tracked_masks +) + +# --- Global Configuration & Model Loading --- + +PREDICTOR_ORIGINAL = None +PREDICTOR = None +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +def load_models(): + global PREDICTOR_ORIGINAL, PREDICTOR + if PREDICTOR is None or PREDICTOR_ORIGINAL is None: + logger.info(f"Initializing models on device: {DEVICE}...") + try: + PREDICTOR_ORIGINAL, PREDICTOR = get_predictors(device=DEVICE) + logger.info("Models loaded successfully.") + except Exception as e: + logger.error(f"Failed to load models: {e}") + raise e + return PREDICTOR_ORIGINAL, PREDICTOR + +# --- Helper Functions --- + +def video_to_frames(video_path, interval=1): + """ + Extract frames from video path to a list of PIL Images. + Respects the frame interval (e.g., interval=5 takes every 5th frame). 
+ """ + logger.info(f"Extracting frames from video: {video_path} with interval {interval}") + cap = cv2.VideoCapture(video_path) + frames = [] + count = 0 + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + # Only keep frame if it matches the interval + if count % interval == 0: + # Convert BGR to RGB + frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + frames.append(Image.fromarray(frame_rgb)) + + count += 1 + + cap.release() + logger.info(f"Extracted {len(frames)} frames (sampled from {count} total frames).") + return frames + +def draw_points(image_pil, points, labels): + """Draws visual markers for clicks on the image.""" + img_draw = image_pil.copy() + draw = ImageDraw.Draw(img_draw) + + # Radius of points + r = 5 + + for pt, lbl in zip(points, labels): + x, y = pt + if lbl == 1: # Positive + color = "green" + elif lbl == 0: # Negative + color = "red" + elif lbl == 2: # Box Top-Left + color = "blue" + elif lbl == 3: # Box Bottom-Right + color = "cyan" + else: + color = "yellow" + + draw.ellipse((x-r, y-r, x+r, y+r), fill=color, outline="white") + + return img_draw + +def overlay_mask(image_pil, mask, color=(0, 255, 0), alpha=0.5): + """Overlay a binary mask on a PIL image.""" + if mask is None: + return image_pil + + # Ensure mask is bool or 0/1 + mask = mask > 0 + + img_np = np.array(image_pil) + h, w = img_np.shape[:2] + + # Resize mask to image size if necessary + if mask.shape[0] != h or mask.shape[1] != w: + logger.debug(f"Resizing mask from {mask.shape} to {(h, w)}") + mask = cv2.resize(mask.astype(np.uint8), (w, h), interpolation=cv2.INTER_NEAREST).astype(bool) + + overlay = img_np.copy() + overlay[mask] = np.array(color, dtype=np.uint8) + + combined = cv2.addWeighted(overlay, alpha, img_np, 1 - alpha, 0) + return Image.fromarray(combined) + +def create_video_from_masks(frames, masks_dict, output_path="output_tracking.mp4", fps=24): + """Combine original frames and tracking masks into a video.""" + logger.info(f"Creating video output at {output_path} with {len(frames)} frames.") + if not frames: + logger.warning("No frames to create video.") + return None + + h, w = np.array(frames[0]).shape[:2] + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + out = cv2.VideoWriter(output_path, fourcc, fps, (w, h)) + + for idx, frame in enumerate(frames): + mask = masks_dict.get(idx) + if mask is not None: + pil_out = overlay_mask(frame, mask, color=(255, 100, 0), alpha=0.6) + frame_np = np.array(pil_out) + else: + frame_np = np.array(frame) + + frame_bgr = cv2.cvtColor(frame_np, cv2.COLOR_RGB2BGR) + out.write(frame_bgr) + + out.release() + logger.info("Video creation complete.") + return output_path + +# --- GPU Wrapped Functions --- + +@spaces.GPU +def process_video_and_features(video_path, interval): + """Load video, subsample frames, get views, MUSt3R features, SAM2 inputs.""" + logger.info(f"Starting GPU process: Video feature extraction (Interval: {interval})") + load_models() + + # Pass interval to subsample frames immediately + pil_imgs = video_to_frames(video_path, interval=interval) + if not pil_imgs: + raise ValueError("Could not extract frames from video.") + + logger.info("Step 1/3: Getting views...") + views, resize_funcs = get_views(pil_imgs) + # Ensure consistent resizing + pil_imgs_resized = [resize_funcs[0].transforms[0](p) for p in pil_imgs] + + logger.info("Step 2/3: Extracting MUSt3R features...") + must3r_feats, must3r_outputs = must3r_features_and_output(views, device=DEVICE) + logger.debug(f"MUSt3R features extracted. 
Output keys: {must3r_outputs.keys()}") + + logger.info("Step 3/3: Preparing SAM2 inputs...") + sam2_input_images, images_tensor = prepare_sam2_inputs(views, pil_imgs) + logger.debug(f"SAM2 input shape: {sam2_input_images.shape}") + + logger.info("Feature extraction complete.") + return pil_imgs, views, resize_funcs, must3r_feats, must3r_outputs, sam2_input_images, images_tensor + +@spaces.GPU +def generate_frame_mask(image_tensor, points, labels, original_size): + """Generate mask for a single frame based on clicks.""" + logger.info(f"Generating mask for single frame. Points: {len(points)}") + load_models() + + pts_tensor = torch.tensor(points, dtype=torch.float32).unsqueeze(0).to(DEVICE) + lbl_tensor = torch.tensor(labels, dtype=torch.int32).unsqueeze(0).to(DEVICE) + + w, h = original_size + # Normalize points + pts_tensor[..., 0] /= (w / 1024.0) + pts_tensor[..., 1] /= (h / 1024.0) + + try: + mask = get_single_frame_mask( + image=image_tensor, + predictor_original=PREDICTOR_ORIGINAL, + points=pts_tensor, + labels=lbl_tensor, + device=DEVICE + ) + logger.info("Mask generation successful.") + mask_np = mask.squeeze().cpu().numpy() + return mask_np + except Exception as e: + logger.error(f"Error during mask generation: {e}") + raise e + +@spaces.GPU +def run_tracking(sam2_input_images, must3r_feats, must3r_outputs, start_idx, first_frame_mask): + """Track the mask across the video.""" + logger.info(f"Starting tracking from frame index {start_idx}...") + load_models() + + mask_tensor = torch.tensor(first_frame_mask, dtype=torch.bool).to(DEVICE) + + try: + tracked_masks = get_tracked_masks( + sam2_input_images=sam2_input_images, + must3r_feats=must3r_feats, + must3r_outputs=must3r_outputs, + start_idx=start_idx, + first_frame_mask=mask_tensor, + predictor=PREDICTOR, + predictor_original=PREDICTOR_ORIGINAL, + device=DEVICE + ) + logger.info(f"Tracking complete. 
Generated masks for {len(tracked_masks)} frames.") + return tracked_masks + except Exception as e: + logger.error(f"Error during tracking: {e}") + raise e + +# --- Gradio Callbacks --- + +def on_video_upload(video_path, interval): + logger.info(f"User uploaded video: {video_path}, Interval: {interval}") + if video_path is None: + return None, None, gr.Slider(value=0, maximum=0), None + + try: + pil_imgs, views, resize_funcs, must3r_feats, must3r_outputs, sam2_input_images, images_tensor = process_video_and_features(video_path, int(interval)) + except Exception as e: + logger.error(f"Failed to process video: {e}") + raise gr.Error(f"Processing failed: {str(e)}") + + # Initialize state + state = { + "pil_imgs": pil_imgs, + "views": views, + "resize_funcs": resize_funcs, + "must3r_feats": must3r_feats, + "must3r_outputs": must3r_outputs, + "sam2_input_images": sam2_input_images, + "images_tensor": images_tensor, + "current_points": [], + "current_labels": [], + "current_mask": None, + "frame_idx": 0 + } + + first_frame = pil_imgs[0] + new_slider = gr.Slider(value=0, maximum=len(pil_imgs)-1, step=1, interactive=True) + return first_frame, state, new_slider, gr.Image(value=first_frame) + +def on_slider_change(state, frame_idx): + if not state: + return None + + if frame_idx >= len(state["pil_imgs"]): + frame_idx = len(state["pil_imgs"]) - 1 + + state["frame_idx"] = frame_idx + state["current_points"] = [] + state["current_labels"] = [] + state["current_mask"] = None + + frame = state["pil_imgs"][frame_idx] + return frame + +def on_image_click(state, evt: gr.SelectData, mode): + """ + Registers the click, updates state, and draws the point/box corner. + Does NOT generate the mask. + """ + if not state: + return None + + x, y = evt.index + logger.info(f"User clicked at ({x}, {y}) with mode: {mode}") + + label_map = { + "Positive Point": 1, + "Negative Point": 0, + "Box Top-Left": 2, + "Box Bottom-Right": 3 + } + label = label_map[mode] + + # Update State + state["current_points"].append([x, y]) + state["current_labels"].append(label) + + # Visual Feedback Only (Draw points) + frame_pil = state["pil_imgs"][state["frame_idx"]] + vis_img = draw_points(frame_pil, state["current_points"], state["current_labels"]) + + # Keep old mask visible if it exists, but don't update it yet + if state["current_mask"] is not None: + vis_img = overlay_mask(vis_img, state["current_mask"]) + + return vis_img + +def on_generate_mask_click(state): + """ + Called when 'Generate Mask' button is clicked. + Validates inputs (box completion) and triggers GPU mask generation. + """ + if not state: + return None + + logger.info("Generate Mask button clicked.") + + if not state["current_points"]: + raise gr.Error("No points or boxes annotated.") + + # --- BOX VALIDATION LOGIC --- + num_tl = state["current_labels"].count(2) + num_br = state["current_labels"].count(3) + + if num_tl != num_br or num_tl > 1: + logger.warning(f"Box mismatch: TL={num_tl}, BR={num_br}") + raise gr.Error(f"Incomplete box detected! You have {num_tl} top-left(s) and {num_br} bottom-right(s). 
They must match and be <= 1.") + + # Proceed to inference + frame_idx = state["frame_idx"] + full_tensor = state["sam2_input_images"] + frame_tensor = full_tensor[frame_idx].unsqueeze(0) + original_size = state["pil_imgs"][frame_idx].size + + try: + mask = generate_frame_mask( + frame_tensor, + state["current_points"], + state["current_labels"], + original_size + ) + except Exception as e: + logger.error(f"Mask generation failed: {e}") + raise gr.Error("Failed to generate mask.") + + state["current_mask"] = mask + + # Visualization: Draw Mask AND Points + frame_pil = state["pil_imgs"][frame_idx] + vis_img = overlay_mask(frame_pil, mask) + vis_img = draw_points(vis_img, state["current_points"], state["current_labels"]) + + return vis_img + +def on_track_click(state): + logger.info("Track button clicked.") + if not state or state["current_mask"] is None: + logger.warning("Track attempted without mask/state.") + raise gr.Error("Please annotate a frame and generate a mask first.") + + # Double check box consistency just in case + num_tl = state["current_labels"].count(2) + num_br = state["current_labels"].count(3) + if num_tl != num_br: + raise gr.Error("Incomplete box annotations.") + + start_idx = state["frame_idx"] + first_frame_mask = state["current_mask"] + + try: + tracked_masks_dict = run_tracking( + state["sam2_input_images"], + state["must3r_feats"], + state["must3r_outputs"], + start_idx, + first_frame_mask + ) + + output_path = create_video_from_masks(state["pil_imgs"], tracked_masks_dict) + return output_path + except Exception as e: + logger.error(f"Tracking failed in UI callback: {e}") + raise gr.Error(f"Tracking failed: {str(e)}") + +def reset_annotations(state): + if not state: + return None + logger.info("Resetting annotations for current frame.") + state["current_points"] = [] + state["current_labels"] = [] + state["current_mask"] = None + frame_idx = state["frame_idx"] + return state["pil_imgs"][frame_idx] + +# --- App Layout --- + +description = """ +
+<div style="text-align: center;">
+    <h1>3AM: 3egment Anything with Geometric Consistency in Videos</h1>
+    <p>Upload a video and geometric features are extracted automatically. Select a frame, click to annotate objects, and track them in 3D-consistent space.</p>
+</div>
+""" + +with gr.Blocks(title="3AM: 3egment Anything") as app: + gr.HTML(description) + + app_state = gr.State() + + with gr.Row(): + with gr.Column(scale=1): + with gr.Group(): + video_input = gr.Video(label="Upload Video", sources=["upload"]) + interval_slider = gr.Slider( + label="Frame Interval (Applied to entire pipeline)", + minimum=1, + maximum=30, + step=1, + value=1, + info="Process every N-th frame. Higher values = faster processing but lower temporal resolution." + ) + process_status = gr.Textbox(label="Status", value="Waiting for upload...", interactive=False) + + with gr.Column(scale=2): + img_display = gr.Image(label="Annotate Frame", interactive=True) + frame_slider = gr.Slider(label="Select Frame", minimum=0, maximum=100, step=1, value=0) + + with gr.Row(): + mode_radio = gr.Radio( + choices=["Positive Point", "Negative Point", "Box Top-Left", "Box Bottom-Right"], + value="Positive Point", + label="Annotation Mode" + ) + with gr.Column(): + gen_mask_btn = gr.Button("Generate Mask", variant="primary") + reset_btn = gr.Button("Reset Annotations") + + with gr.Row(): + track_btn = gr.Button("Start Tracking", variant="primary", scale=1) + + with gr.Row(): + video_output = gr.Video(label="Tracking Output", autoplay=True) + + # --- Events --- + + video_input.upload( + fn=lambda: "Processing video (MUSt3R + SAM2)...", + outputs=process_status + ).then( + fn=on_video_upload, + inputs=[video_input, interval_slider], + outputs=[img_display, app_state, frame_slider, img_display] + ).then( + fn=lambda: "Ready to annotate.", + outputs=process_status + ) + + frame_slider.change( + fn=on_slider_change, + inputs=[app_state, frame_slider], + outputs=[img_display] + ) + + # 1. Click on image -> Draw point (no mask gen) + img_display.select( + fn=on_image_click, + inputs=[app_state, mode_radio], + outputs=[img_display] + ) + + # 2. 
Click Generate -> Check box consistency & Gen Mask + gen_mask_btn.click( + fn=on_generate_mask_click, + inputs=[app_state], + outputs=[img_display] + ) + + reset_btn.click( + fn=reset_annotations, + inputs=[app_state], + outputs=[img_display] + ) + + track_btn.click( + fn=lambda: "Tracking in progress...", + outputs=process_status + ).then( + fn=on_track_click, + inputs=[app_state], + outputs=[video_output] + ).then( + fn=lambda: "Tracking complete!", + outputs=process_status + ) + +if __name__ == "__main__": + logger.info("Starting Gradio app...") + app.launch() \ No newline at end of file diff --git a/data.py b/data.py new file mode 100644 index 0000000000000000000000000000000000000000..4861a9970092b5ed788e61d000b4689ea0c57a21 --- /dev/null +++ b/data.py @@ -0,0 +1,938 @@ +import json +import torchvision.transforms as T +import torchvision.transforms.functional as TF +import torch +from tqdm import tqdm +import os +from glob import glob +from torch.utils.data import Dataset +from must3r.tools.image import get_resize_function +from PIL import Image +import numpy as np +from einops import rearrange +from typing import List, Dict, Optional, Tuple +from pycocotools import mask as mask_utils +import random, cv2 +from scipy.spatial.transform import Rotation +SAV_ANNOT_RATE = 4 # SA-V: annotations at 6 fps, video at 24 fps + +def load_images(folder_content, size, patch_size = 16, verbose = True): + imgs = [] + transform = ImgNorm = T.Compose([T.ToTensor(), T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + resize_funcs = [] + for content in folder_content: + if isinstance(content, str): + if verbose: + print(f'Loading image from {content} ', end = '') + rgb_image = Image.open(content).convert('RGB') + elif isinstance(content, Image.Image): + rgb_image = content + else: + raise ValueError(f'Unknown content type: {type(content)}') + rgb_image.load() + W, H = rgb_image.size + resize_func, _, to_orig = get_resize_function(size, patch_size, H, W) + resize_funcs.append(resize_func) + rgb_tensor = resize_func(transform(rgb_image)) + imgs.append(dict(img=rgb_tensor, true_shape=np.int32([rgb_tensor.shape[-2], rgb_tensor.shape[-1]]))) + if verbose: + print(f'with resolution {W}x{H} --> {rgb_tensor.shape[-1]}x{rgb_tensor.shape[-2]}') + return imgs, resize_funcs + + +def _decode_rle(rle: Dict, h: int, w: int) -> np.ndarray: + if not rle or "counts" not in rle: + return np.zeros((h, w), dtype=np.uint8) + counts = rle["counts"] + if isinstance(counts, str): + counts = counts.encode("utf-8") + m = mask_utils.decode({"size": [h, w], "counts": counts}) + return (np.asarray(m).squeeze() > 0) + +def _read_frame_rgb(cap: cv2.VideoCapture, idx: int, fallback_hw: Optional[Tuple[int,int]]=None) -> np.ndarray: + ok = cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx)) + if not ok: + raise RuntimeError(f"cv2.VideoCapture.set({idx}) failed") + else: + ok, bgr = cap.read() + return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB) + +class SAVTrainDataset(Dataset): + """ + SA-V train Dataset (mp4 + {video_id}_{manual|auto}.json). + Scans JSON with pattern: root/*/*.json (non-recursive). + __getitem__ follows the requested 1–5 procedure. 
+ """ + def __init__( + self, + data_root: str, + mask_type: Optional[str] = None, # None | "manual" | "auto" + img_mean = (0.485, 0.456, 0.406), + img_std = (0.229, 0.224, 0.225), + N: int = 8, + image_size: int = 1024, + verbose: bool = False, + max_stride: int = 1, # kept for parity, not used in this flow + dataset_scale: int = 32, + area_thresh: float = 0.01, # area ratio threshold at original HxW + valid_must3r_sizes = [224, 512] + ): + assert mask_type in (None, "manual", "auto") + assert N >= 1 + self.verbose = verbose + self.data_root = data_root + self.dataset_scale = int(dataset_scale) + self.N = int(N) + self.mask_type = mask_type + self.area_thresh = float(area_thresh) + self.max_stride = int(max_stride) + self.valid_must3r_sizes = valid_must3r_sizes + self.image_transform = T.Compose([ + T.Resize((image_size, image_size), interpolation=T.InterpolationMode.NEAREST_EXACT), + T.Normalize(mean=img_mean, std=img_std), + ]) + self.instance_transform = T.Compose([ + T.Resize((image_size, image_size), interpolation=T.InterpolationMode.NEAREST_EXACT), + ]) + + # --- collect through JSONs (non-recursive) --- + json_paths = glob(os.path.join(data_root, "*", "*.json")) + self.items: List[Tuple[str, str]] = [] # (vpath, jpath) + + for jpath in tqdm(json_paths, desc="scanning jsons"): + base = os.path.splitext(os.path.basename(jpath))[0] + # filter by mask_type if specified + if self.mask_type is not None and not base.endswith(f"_{self.mask_type}"): + continue + if base.endswith("_manual"): + vid = base[:-7] + elif base.endswith("_auto"): + vid = base[:-5] + else: + # strictly require suffix + continue + vpath = os.path.join(os.path.dirname(jpath), f"{vid}.mp4") + if os.path.isfile(vpath): + self.items.append((vpath, jpath)) + + print(f"Collected {len(self.items)} video-json pairs") + + self._log_path = "./sav_dataset_resample.log" + + def __len__(self): + return self.dataset_scale * len(self.items) + + def _resample(self): + return self[random.randrange(len(self))] + + def _log(self, msg: str): + try: + with open(self._log_path, "a") as f: + f.write(msg.rstrip() + "\n") + except Exception: + pass + + def __getitem__(self, idx: int): + + vpath, jpath = self.items[idx % len(self.items)] + + # 1) load json + with open(jpath, "r") as f: + meta = json.load(f) + + masklet: List[List[Dict]] = meta.get("masklet", []) + if not isinstance(masklet, list) or len(masklet) < self.N: + self._log(f"[short_json] {jpath}: len(masklet)={len(masklet)} < N={self.N}") + return self._resample() + + H, W = int(meta["video_height"]), int(meta["video_width"]) + + # 2) randomly sample a center frame idx in masklet, build sample_indices = [idx-N, idx+N] + center = random.randrange(len(masklet)) + left = max(0, center - self.N * self.max_stride) + right = min(len(masklet), center + self.N * self.max_stride) + sample_indices = list(range(left, right)) + + if len(sample_indices) < self.N: + self._log(f"[short_span] {jpath}: span={len(sample_indices)} < N={self.N}") + return self._resample() + + obj_order = None + while True: + if len(sample_indices) < self.N: + self._log(f"[exhausted_span] {jpath}: remaining span < N; resample") + return self._resample() + + f0 = sample_indices[0] + rles = masklet[f0] if isinstance(masklet[f0], list) else [] + if len(rles) == 0: + # no objects at this frame, pop and continue + sample_indices.pop(0) + continue + + obj_order = list(range(len(rles))) + random.shuffle(obj_order) + + has_valid_id = False + for oid in obj_order: + m = _decode_rle(rles[oid], H, W) + area = int(m.sum()) + if 
area <= 0: + continue + ratio = area / float(H * W + 1e-6) + if ratio >= self.area_thresh: + has_valid_id = True + break + if has_valid_id: + break + else: + # tried all object indices, none passed; pop first frame and continue + sample_indices.pop(0) + + # downsample sample_indices to exactly N + sample_indices = sample_indices[::min(len(sample_indices) // self.N, self.max_stride)][:self.N] + assert len(sample_indices) == self.N + + # 5) similar to MOSE dataset: read frames, build masks only at anchor frame + cap = cv2.VideoCapture(vpath) + frames_rgb = [] + frame_indices_24 = [] + for f_annot in sample_indices: + f24 = int(f_annot * SAV_ANNOT_RATE) + frames_rgb.append(_read_frame_rgb(cap, f24, fallback_hw=(H, W))) + frame_indices_24.append(f24) + cap.release() + + # build original_images tensor [N, 3, H, W] + original_imgs_pil = [Image.fromarray(fr) for fr in frames_rgb] + # must3r parity fields + must3r_size = np.random.choice(self.valid_must3r_sizes).item() + views, resize_funcs = load_images(original_imgs_pil, size = must3r_size, patch_size = 16, verbose = self.verbose) + original_instances = [] + original_imgs = [] + for frame_idx, (resize_func, sample_idx) in enumerate(zip(resize_funcs, sample_indices)): + assert len(resize_func.transforms) == 2, f'Expected 2 transforms, got {len(resize_func.transforms)}' + # assert resize_func.transforms[0].size[0] > resize_func.transforms[1].size[0], f'Expected first transform to be larger than second, got {resize_func.transforms[0].size} and {resize_func.transforms[1].size}' + # assert resize_func.transforms[0].size[1] / resize_func.transforms[1].size[1] == resize_func.transforms[0].size[0] / resize_func.transforms[1].size[0], f'Expected aspect ratio to be preserved, got {resize_func.transforms[0].size} and {resize_func.transforms[1].size}' + if frame_idx == 0: + for instance_id in obj_order + [None]: + if instance_id is None: + return self._resample() + if (resize_func.transforms[0](torch.from_numpy(_decode_rle(masklet[sample_idx][instance_id], H, W))).sum() > (resize_func.transforms[0].size[0] * resize_func.transforms[0].size[1] * self.area_thresh)): + break + + original_instances.append(resize_func.transforms[0](torch.from_numpy(_decode_rle(masklet[sample_idx][instance_id], H, W)))) + original_imgs.append(resize_func.transforms[0](TF.to_tensor(original_imgs_pil[frame_idx]))) + + original_instances = torch.stack(original_instances).squeeze()[:, None] + instances = self.instance_transform(original_instances) + assert instances[0].sum() > 0 and instances.ndim == 4, f'{instances.shape=}, {instances[0].sum()=}' + original_imgs = torch.stack(original_imgs) + imgs = self.image_transform(original_imgs) + + return { + "original_images": original_imgs, # [N,3,H,W] + "images": imgs, # [N,3,S,S] + "original_masks": original_instances, # [N,1,H,W] + "masks": instances, # [N,1,S,S] + "filelist": sample_indices, + "must3r_views": views, + "video": os.path.splitext(os.path.basename(vpath))[0], + "instance_id": int(instance_id), + "dataset": "sav", + "valid_masks": torch.ones_like(instances), # [N,1,S,S] + "must3r_size": must3r_size + } + + +class MOSEDataset(Dataset): + def __init__( + self, + data_root: str, + img_mean = (0.485, 0.456, 0.406), + img_std = (0.229, 0.224, 0.225), + N: int = 8, + image_size: int = 1024, + verbose = False, + max_stride = 2, + dataset_scale = 1, + valid_must3r_sizes = [224, 512] + ): + + self.verbose = verbose + self.data_root = data_root + self.dataset_scale = dataset_scale + self.N = N + self.max_stride = max_stride + 
self.image_transform = T.Compose([ + T.Resize((image_size, image_size), interpolation = T.InterpolationMode.NEAREST_EXACT), + T.Normalize(mean = img_mean, std = img_std) + ]) + self.instance_transform = T.Compose([ + T.Resize((image_size, image_size), interpolation = T.InterpolationMode.NEAREST_EXACT), + ]) + self.valid_must3r_sizes = valid_must3r_sizes + self.videos = os.listdir(os.path.join(data_root, 'JPEGImages')) + self.frames = {} + self.masks = {} + self.indices = [] + for video in tqdm(self.videos): + if not os.path.isdir(os.path.join(data_root, 'JPEGImages', video)): + continue + frames = sorted(glob(os.path.join(data_root, 'JPEGImages', video, '*.jpg')), key = lambda x: int(os.path.basename(x).split('.')[0])) + masks = sorted(glob(os.path.join(data_root, 'Annotations', video, '*.png')), key = lambda x: int(os.path.basename(x).split('.')[0])) + if len(frames) < self.N: + if self.verbose: + print(f"skip video {video} as not enough frames") + continue + assert len(frames) == len(masks) and len(frames) >= self.N, f'{len(frames)=}, {len(masks)=} in {video}' + self.frames[video] = frames + self.masks[video] = masks + self.indices += [(video, idx) for idx in range(len(frames))] + + print(f'Found {len(self.indices)} frames, and {len(self.frames)} videos, with min length {min([len(self.frames[video]) for video in self.frames])} and max length {max([len(self.frames[video]) for video in self.frames])}') + + def __len__(self): + return len(self.indices) * self.dataset_scale + + + def __getitem__(self, idx): + + idx = idx % len(self.indices) + video, idx = self.indices[idx] + sampled_indices = np.arange(max(0, idx - self.N), idx).tolist() + np.arange(idx, min(len(self.frames[video]), idx + self.N * self.max_stride)).tolist() + unique_ids = None + + while unique_ids is None or len(unique_ids) == 0: + if unique_ids is not None: + sampled_indices.pop(0) + if len(sampled_indices) < self.N: + return self[np.random.randint(len(self))] + unique_ids, counts = np.unique(np.array(Image.open(self.masks[video][sampled_indices[0]])), return_counts = True) + unique_ids = unique_ids[(unique_ids != 0) & (counts > counts.sum() * 0.01)] + + sampled_indices = sampled_indices[::len(sampled_indices) // self.N][:self.N] + assert len(unique_ids) > 0 and len(sampled_indices) == self.N + + filelist = [self.frames[video][idx] for idx in sampled_indices] + must3r_size = np.random.choice(self.valid_must3r_sizes).item() + views, resize_funcs = load_images(filelist, size = must3r_size, patch_size = 16, verbose = self.verbose) + original_instances = [] + original_imgs = [] + for frame_idx, (resize_func, sample_idx) in enumerate(zip(resize_funcs, sampled_indices)): + assert len(resize_func.transforms) == 2, f'Expected 2 transforms, got {len(resize_func.transforms)}' + # assert resize_func.transforms[0].size[0] > resize_func.transforms[1].size[0], f'Expected first transform to be larger than second, got {resize_func.transforms[0].size} and {resize_func.transforms[1].size}' + # assert resize_func.transforms[0].size[1] / resize_func.transforms[1].size[1] == resize_func.transforms[0].size[0] / resize_func.transforms[1].size[0], f'Expected aspect ratio to be preserved, got {resize_func.transforms[0].size} and {resize_func.transforms[1].size}' + if frame_idx == 0: + for instance_id in np.random.permutation(unique_ids).tolist() + [None]: + if instance_id is None: + return self[np.random.randint(len(self))] + if (resize_func.transforms[0](torch.from_numpy(np.array(Image.open(self.masks[video][sample_idx]))) == instance_id)).sum() > 
(resize_func.transforms[0].size[0] * resize_func.transforms[0].size[1] * 0.01): + break + + original_instances.append(resize_func.transforms[0](torch.from_numpy(np.array(Image.open(self.masks[video][sample_idx]))) == instance_id)) + original_imgs.append(resize_func.transforms[0](TF.to_tensor(Image.open(self.frames[video][sample_idx])))) + + original_instances = torch.stack(original_instances).squeeze()[:, None] + instances = self.instance_transform(original_instances) + assert instances[0].sum() > 0 and instances.ndim == 4, f'{instances.shape=}, {instances[0].sum()=}' + original_imgs = torch.stack(original_imgs) + imgs = self.image_transform(original_imgs) + + return { + 'original_images': original_imgs, + 'images': imgs, + 'original_masks': original_instances, + 'masks': instances, + 'filelist': filelist, + 'must3r_views': views, + 'video': video, + 'instance_id': instance_id, + 'dataset': 'mose', + 'valid_masks': torch.ones_like(instances), + 'must3r_size': must3r_size, + } + +# Reads a Ground truth trajectory file +def read_trajectory_file(filepath): + def _transform_from_Rt(R, t): + M = np.identity(4) + M[:3, :3] = R + M[:3, 3] = t + return M + # Reads a Ground truth trajectory line + def _read_trajectory_line(line): + line = line.rstrip().split(",") + pose = {} + pose["timestamp"] = int(line[1]) + translation = np.array([float(p) for p in line[3:6]]) + quat_xyzw = np.array([float(o) for o in line[6:10]]) + rot_matrix = Rotation.from_quat(quat_xyzw).as_matrix() + rot_matrix = np.array(rot_matrix) + pose["position"] = translation + pose["rotation"] = rot_matrix + pose["transform"] = _transform_from_Rt(rot_matrix, translation) + + return pose + + assert os.path.exists(filepath), f"Could not find trajectory file: {filepath}" + with open(filepath, "r") as f: + _ = f.readline() # header + positions = [] + rotations = [] + transforms = [] + timestamps = [] + for line in f.readlines(): + pose = _read_trajectory_line(line) + positions.append(pose["position"]) + rotations.append(pose["rotation"]) + transforms.append(pose["transform"]) + timestamps.append(pose["timestamp"]) + positions = np.stack(positions) + rotations = np.stack(rotations) + transforms = np.stack(transforms) + timestamps = np.array(timestamps) + + return { + "ts": positions, + "Rs": rotations, + "Ts_world_from_device": transforms, + "timestamps": timestamps, + } + +from projectaria_tools.core import calibration +from projectaria_tools.core.image import InterpolationMethod + +class ASEDataset(Dataset): + def __init__( + self, + data_root: str, + img_mean = (0.485, 0.456, 0.406), + img_std = (0.229, 0.224, 0.225), + N: int = 8, + image_size: int = 1024, + verbose = False, + dataset_scale = 1, + continuous_prob = 0, + invalid_classes = ['ceiling', 'wall', 'empty_space', 'background', 'floor', 'window'], + valid_must3r_sizes = [224, 512] + ): + + self.verbose = verbose + self.data_root = data_root + self.dataset_scale = dataset_scale + self.continuous_prob = continuous_prob + self.N = N + self.image_transform = T.Compose([ + T.Resize((image_size, image_size), interpolation = T.InterpolationMode.NEAREST_EXACT), + T.Normalize(mean = img_mean, std = img_std) + ]) + self.instance_transform = T.Compose([ + T.Resize((image_size, image_size), interpolation = T.InterpolationMode.NEAREST_EXACT), + ]) + self.valid_must3r_sizes = valid_must3r_sizes + from projectaria_tools.projects import ase + from projectaria_tools.core import calibration + + self.ase_device = ase.get_ase_rgb_calibration() + self.ase_width, self.ase_height = 
self.ase_device.get_image_size() + assert self.ase_width == self.ase_height, f"Expected square images, got {self.ase_width}x{self.ase_height}" + self.ase_pinhole = calibration.get_linear_camera_calibration( + self.ase_width, self.ase_height, 320, "camera-rgb", self.ase_device.get_transform_device_camera() + ) + self.fx, self.fy = self.ase_pinhole.get_focal_lengths() + self.cx, self.cy = self.ase_pinhole.get_principal_point() + self.K = np.array([[self.fx, 0, self.cx], + [0, self.fy, self.cy], + [0, 0, 1 ]], dtype = np.float32) + self.videos = os.listdir(os.path.join(data_root)) + self.frames = {} + self.masks = {} + self.must3r_feats = {} + self.appearances = {} + self.mask2indices = {} + self.validindices = {} + self.indices = [] + for video in tqdm(self.videos, desc='Loading ASE videos'): + if not os.path.isdir(os.path.join(data_root, video)): + print(f"skip {video} as not a directory") + continue + frames = sorted(glob(os.path.join(data_root, video, 'undistorted', '*.jpg'))) + masks = sorted(glob(os.path.join(data_root, video, 'undistorted-instances', '*.png'))) + must3r_feats = sorted(glob(os.path.join(data_root, video, 'must3r-features', '*.pt'))) + if not (len(must3r_feats) == len(frames) == len(masks)): + if self.verbose: + print(f"skip {video} as {len(must3r_feats)=}, {len(frames)=}, {len(masks)=} in {video}") + continue + assert all([os.path.splitext(os.path.basename(must3r_feat))[0] == os.path.splitext(os.path.basename(frame))[0] for must3r_feat, frame in zip(must3r_feats, frames)]), f'Must3r features and frames do not match in {video}' + if len(frames) < self.N: + if self.verbose: + print(f"skip video {video} as not enough frames") + continue + self.frames[video] = frames + self.masks[video] = masks + self.must3r_feats[video] = must3r_feats + self.appearances[video] = json.load(open(os.path.join(data_root, video, 'instances-appearances.json'))) + self.mask2indices[video] = { + os.path.basename(m): i for i, m in enumerate(masks) + } + self.indices += [(video, idx) for idx in range(len(frames) - self.N + 1)] + self.validindices[video] = [int(instance_id) for instance_id, class_name in json.load(open(os.path.join(data_root, video, 'object_instances_to_classes.json'))).items() if class_name not in invalid_classes] # if os.path.exists(os.path.join(data_root, video, 'object_instances_to_classes.json')) else None + print(f'Found {len(self.indices)} frames, and {len(self.frames)} videos, with min length {min([len(self.frames[video]) for video in self.frames])} and max length {max([len(self.frames[video]) for video in self.frames])} and {sum([(len(ids) if ids is not None else 0) for ids in self.validindices.values()])} valid instances') + self._log_path = "./ase_dataset_resample.log" + + def __len__(self): + return len(self.indices) * self.dataset_scale + + + def __getitem__(self, idx): + + idx = idx % len(self.indices) + video, idx = self.indices[idx] + ## 1. Randomly shuffle frames + choices = np.delete(np.arange(len(self.frames[video]) - self.N + 1), idx) + sampled_indices = [idx] + np.random.choice(choices, size = len(choices), replace = False).tolist() + ## 2. 
Find unique instance IDs in the first frame + unique_ids = None + while unique_ids is None or len(unique_ids) == 0: + if unique_ids is not None: + sampled_indices.pop(0) + if len(sampled_indices) < self.N: + return self[np.random.randint(len(self))] + unique_ids = np.unique(np.array(Image.open(self.masks[video][sampled_indices[0]])), return_counts = False) + unique_ids = unique_ids[(unique_ids != 0) & np.array([class_id in self.validindices[video] for class_id in unique_ids])] # if self.validindices[video] is not None else True + + first_frame_idx = sampled_indices[0] + assert len(unique_ids) > 0 + ## 3. Load the resize funcs of the first frame + feat_len = torch.load(self.must3r_feats[video][first_frame_idx], map_location = 'cpu')[-1].shape[-2] + must3r_size = original_must3r_size = (224 if feat_len == 196 else 512) + is_continuous = (np.random.rand() < self.continuous_prob) or original_must3r_size not in self.valid_must3r_sizes + if is_continuous: + must3r_size = np.random.choice(self.valid_must3r_sizes).item() + + _, [resize_func] = load_images([self.frames[video][first_frame_idx]], size = must3r_size, patch_size = 16, verbose = self.verbose) + assert len(resize_func.transforms) == 2, f'Expected 2 transforms, got {len(resize_func.transforms)}' + assert must3r_size != original_must3r_size or resize_func.transforms[1].size[0] * resize_func.transforms[1].size[1] == feat_len * 256, f'Expected {resize_func.transforms[1].size[0]}x{resize_func.transforms[1].size[1]} to be {feat_len * 256}, got {feat_len}' + for instance_id in np.random.permutation(unique_ids).tolist() + [None]: + if instance_id is None: + return self[np.random.randint(len(self))] + if (resize_func.transforms[0].size[0] * resize_func.transforms[0].size[1] * 0.2) > (resize_func.transforms[0](torch.from_numpy(np.array(Image.open(self.masks[video][first_frame_idx]))) == instance_id)).sum() > (resize_func.transforms[0].size[0] * resize_func.transforms[0].size[1] * 0.01): + break + if is_continuous: + sampled_indices = np.arange(first_frame_idx, min(len(self.frames[video]), first_frame_idx + self.N)).tolist() + # sampled_indices += np.random.choice(first_frame_idx, size = first_frame_idx, replace = False).tolist() + sampled_indices = sampled_indices[:self.N] + assert len(sampled_indices) == self.N and sampled_indices[0] == first_frame_idx, f'Expected {self.N} sampled indices and first index {first_frame_idx}, got {len(sampled_indices)} with first index {sampled_indices[0]}' + else: + sampled_indices = np.arange(first_frame_idx, len(self.frames[video])).tolist()[:2] + sampled_indices = sorted(sampled_indices, key = lambda sample_idx: resize_func.transforms[0](torch.from_numpy(np.array(Image.open(self.masks[video][sample_idx]))) == instance_id).sum(), reverse = True) ## prioritize frames with larger masks + first_frame_idx = sampled_indices[0] + + views, original_instances, original_imgs, filelist, extrinsics, depths, point_maps, fov_ratios = [], [], [], [], [], [], [], {} + pre_sampled_len = len(sampled_indices) + if len(sampled_indices) < self.N: + instance_appearance_candidates = set([self.mask2indices[video][p] for p in self.appearances[video][str(instance_id)]]) - set(sampled_indices) + sampled_indices += np.random.permutation(list(instance_appearance_candidates)).tolist() + sampled_indices += np.random.permutation(list(set(np.arange(len(self.frames[video])).tolist()) - set(instance_appearance_candidates) - set(sampled_indices))).tolist() + + trajectory = read_trajectory_file(os.path.join(self.data_root, video, 'trajectory.csv')) 
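+        # Greedily fill up to N views: frames from the initial continuous span are always kept;
+        # extra candidates are kept only when the instance covers roughly 0.5%-25% of the frame
+        # and its 3D points re-project into the first view's cropped field of view (in_fov_ratio > 0.25),
+        # otherwise the candidate index is dropped and the next one is tried.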
+ while len(views) < self.N and len(sampled_indices) >= self.N: + sample_idx = sampled_indices[len(views)] + [view], [resize_func] = load_images([self.frames[video][sample_idx]], size = must3r_size, patch_size = 16, verbose = self.verbose) + instance_map = resize_func.transforms[0](torch.from_numpy(np.array(Image.open(self.masks[video][sample_idx])) == instance_id)) + if len(views) >= pre_sampled_len and not (instance_map.shape[-1] * instance_map.shape[-2] * 0.005 < instance_map.sum() < instance_map.shape[-1] * instance_map.shape[-2] * 0.25): + sampled_indices.pop(len(views)) + continue + extrinsic = trajectory['Ts_world_from_device'][sample_idx] @ self.ase_pinhole.get_transform_device_camera().to_matrix() + depth = calibration.distort_by_calibration( + np.array(Image.open(self.frames[video][sample_idx].replace('undistorted', 'depth').replace('vignette', 'depth').replace('.jpg', '.png'))), self.ase_pinhole, self.ase_device, InterpolationMethod.NEAREST_NEIGHBOR + ).astype(np.float32) / 1000.0 + point_map = resize_func.transforms[0](torch.rot90(torch.from_numpy(depth_to_world_pointmap(depth, extrinsic, self.K).astype(np.float32)).permute(2, 0, 1), k = -1, dims = (1, 2))) + assert point_map.shape[-2] == instance_map.shape[-2], f"Expected height {instance_map.shape[-2]}, got {point_map.shape[-2]}" + fov_ratio = None + if len(views) < pre_sampled_len or instance_map.sum().item() == 0 or \ + (fov_ratio := (in_fov_ratio(point_map[:, instance_map].permute(1, 0), extrinsics[0], K = self.K, W = self.ase_height, H = self.ase_width, ## for rot -90 + W_crop = abs(int(self.ase_height) - original_instances[0].shape[-2]) // 2, + H_crop = abs(int(self.ase_width) - original_instances[0].shape[-1]) // 2)[0])) > 0.25: + views.append(view) + original_instances.append(instance_map) + original_imgs.append(resize_func.transforms[0](TF.to_tensor(Image.open(self.frames[video][sample_idx])))) + filelist.append(self.frames[video][sample_idx]) + extrinsics.append(extrinsic) + depths.append(resize_func.transforms[0](torch.rot90(torch.from_numpy(depth), k = -1, dims = (0, 1)))) + point_maps.append(point_map) + fov_ratios[self.frames[video][sample_idx]] = fov_ratio if fov_ratio is not None else -1 + else: + sampled_indices.pop(len(views)) + continue + sampled_indices = sampled_indices[:len(views)] + if len(sampled_indices) < self.N: + open(self._log_path, "a").write(f"[short_span] {video}: span={len(sampled_indices)} < N={self.N}\n") + return self[np.random.randint(len(self))] + + assert len(sampled_indices) == self.N and sampled_indices[0] == first_frame_idx, f'Expected {self.N} sampled indices and first index {first_frame_idx}, got {len(sampled_indices)} with first index {sampled_indices[0]}' + if not is_continuous or (np.random.rand() < 0.8 and must3r_size == original_must3r_size): + assert original_must3r_size == must3r_size, f'If not continuous, must3r size should not change, got {must3r_size} and {original_must3r_size}' + must3r_feats_filelist = [self.must3r_feats[video][idx] for idx in sampled_indices] + must3r_feats = [torch.load(must3r_filepath, map_location = 'cpu') for must3r_filepath in must3r_feats_filelist] + must3r_feats_head = torch.cat([f[-1] for f in must3r_feats], dim = 0) + must3r_feats = [f[:-1] for f in must3r_feats] + must3r_feats = [torch.cat(f, dim = 0) for f in zip(*must3r_feats)] + must3r_feats = [ + rearrange(f, 'b (h w) c -> b c h w', h = views[0]['true_shape'][0] // 16, w = views[0]['true_shape'][1] // 16) + for f in must3r_feats + ] + else: + assert is_continuous, f'If must3r size changed, 
should be continuous sampling, got {must3r_size} and {original_must3r_size}' + must3r_feats = None + must3r_feats_head = None + + original_instances = torch.stack(original_instances).squeeze()[:, None] + instances = self.instance_transform(original_instances) + assert instances[0].sum() > 0 and instances.ndim == 4, f'{instances.shape=}, {instances[0].sum()=}' + original_imgs = torch.stack(original_imgs) + imgs = self.image_transform(original_imgs) + + # if is_continuous: + # permutation = torch.arange(len(instances)) + # else: + # permutation = torch.argsort(instances.squeeze().sum(dim = (1, 2)), descending = True) + permutation = torch.arange(len(instances)) + permutation[pre_sampled_len:] = torch.randperm(len(instances) - pre_sampled_len) + pre_sampled_len + return { + 'original_images': original_imgs[permutation], + 'images': imgs[permutation], + 'original_masks': original_instances[permutation], + 'masks': instances[permutation], + 'filelist': [filelist[idx] for idx in permutation], + 'must3r_views': [views[idx] for idx in permutation], + 'must3r_size': must3r_size, + 'video': video, + 'instance_id': instance_id, + 'dataset': 'scannetpp', + 'valid_masks': torch.ones_like(instances), + 'intrinsics': torch.from_numpy(self.K).unsqueeze(0).repeat(self.N, 1, 1)[permutation], + 'extrinsics': torch.from_numpy(np.stack(extrinsics, axis = 0))[permutation], + 'depths': torch.from_numpy(np.stack(depths, axis = 0))[permutation], + 'point_maps': torch.from_numpy(np.stack(point_maps, axis = 0))[permutation], + 'fov_ratios': fov_ratios, + 'is_continuous': is_continuous + } | ( + { + 'must3r_feats': [f[permutation] for f in must3r_feats], + 'must3r_feats_head': must3r_feats_head[permutation], + 'must3r_feats_filelist': [must3r_feats_filelist[idx] for idx in permutation], + } if must3r_feats is not None else {} + ) + +def pose_from_qwxyz_txyz(elems): + qw, qx, qy, qz, tx, ty, tz = map(float, elems) + pose = np.eye(4) + pose[:3, :3] = Rotation.from_quat((qx, qy, qz, qw)).as_matrix() + pose[:3, 3] = (tx, ty, tz) + return np.linalg.inv(pose) # returns cam2world + +def depth_to_world_pointmap(depth, c2w, K, depth_type = 'range'): + """ + depth: (H,W) depth in meters, camera-Z + c2w: (4,4) camera-to-world transform + K: (3,3) camera intrinsics + Returns: (H,W,3) world xyz (NaN for invalid depth) + """ + Kinv = np.linalg.inv(K) + H_, W_ = depth.shape + ys, xs = np.meshgrid(np.arange(H_), np.arange(W_), indexing='ij') + ones = np.ones_like(xs, dtype=np.float64) + pix = np.stack([xs, ys, ones], axis=-1).reshape(-1, 3).T # (3,N) + rays_cam = Kinv @ pix # (3,N) + + z = depth.reshape(-1) # (N,) + if depth_type == 'range': + rays_cam = rays_cam / np.linalg.norm(rays_cam, axis = 0, keepdims = True) # (3,N) + elif depth_type == 'z-buf': + pass + else: + raise ValueError(f'Unknown depth_type {depth_type}') + + xyz_cam = rays_cam * z # scale each ray by depth + + xyz_cam_h = np.vstack([xyz_cam, np.ones_like(z)]) # (4,N) + xyz_w_h = c2w @ xyz_cam_h # (4,N) + xyz_w = xyz_w_h[:3].T.reshape(H_, W_, 3) + + mask = (depth <= 0) | ~np.isfinite(depth) + xyz_w[mask] = np.nan + return xyz_w + +def in_fov_ratio(points, c2w, K, H, W, H_crop, W_crop): + """ + points: (N,3) world coords, torch tensor + c2w: (4,4) camera-to-world, torch tensor + K: (3,3) intrinsics, torch tensor + H,W: image size + """ + # device = points.device + K = K # .to(device) + # world -> camera + w2c = np.linalg.inv(c2w) # .to(device) + Pc = (points @ w2c[:3, :3].T) + w2c[:3, 3] + + X, Y, Z = Pc[:,0], Pc[:,1], Pc[:,2] + + # projection + u = K[0, 0] * (X / Z) 
+ K[0, 2] + v = K[1, 1] * (Y / Z) + K[1, 2] + + mask = (Z > 0) & (u >= W_crop) & (u < W - W_crop) & (v >= H_crop) & (v < H - H_crop) + + return mask.float().mean(), mask + +class ScanNetPPV2Dataset(Dataset): + def __init__( + self, + data_root: str, + must3r_data_root: str = None, + img_mean = (0.485, 0.456, 0.406), + img_std = (0.229, 0.224, 0.225), + N: int = 8, + image_size: int = 1024, + verbose = False, + dataset_scale = 1, + continuous_prob = 0, + instance_classes_file = '/metadata/semantic_benchmark/top100_instance.txt', + split_file: str = '/splits/nvs_sem_train.txt', + excluding_scenes = ["09d6e808b4", "0f69aefe3d", "1b379f1114", "1cbb105c6a", "2c7c10379b", "46638cfd0f", "4f341f3af0", "6ef2ac745a", "898a7dfd0c", "aa852f7871", "eea4ad9c04", 'd27235711b'], ## horizontal / vertical flip issues + valid_must3r_sizes = [224, 512] + ): + + self.verbose = verbose + self.data_root = data_root + self.must3r_data_root = must3r_data_root if must3r_data_root is not None else data_root + self.dataset_scale = dataset_scale + self.excluding_scenes = excluding_scenes + self.instance_classes = open(instance_classes_file).read().splitlines() + self.valid_scene_names = open(split_file).read().splitlines() + self.continuous_prob = continuous_prob + self.N = N + self.image_transform = T.Compose([ + T.Resize((image_size, image_size), interpolation = T.InterpolationMode.NEAREST_EXACT), + T.Normalize(mean = img_mean, std = img_std) + ]) + self.instance_transform = T.Compose([ + T.Resize((image_size, image_size), interpolation = T.InterpolationMode.NEAREST_EXACT), + ]) + self.valid_must3r_sizes = valid_must3r_sizes + self.videos = os.listdir(os.path.join(data_root)) + self.frames = {} + self.masks = {} + self.must3r_feats = {} + self.appearances = {} + self.id2label_name = {} + self.intrinsics = {} + self.extrinsics = {} + self.indices = [] + self._log_path = "./scannetppv2_dataset_resample.log" + + for video in tqdm(self.videos, desc = 'Loading ScanNet++V2 videos'): + if video not in self.valid_scene_names or video in self.excluding_scenes: + if self.verbose: + print(f"skip {video} as not in split or excluded") + continue + if not os.path.isdir(os.path.join(data_root, video)): + print(f"skip {video} as not a directory") + continue + if video in ['46638cfd0f']: + if self.verbose: + print(f"skip {video} as broken") + continue + masks = sorted(glob(os.path.join(self.data_root, video, 'iphone', 'render_instance', '*.png'))) + if len(masks) == 0: + if self.verbose: + print(f"skip {video} as no masks found") + continue + frames = [m.replace('render_instance', 'rgb').replace('.png', '.jpg') for m in masks] + must3r_feats = [m.replace(self.data_root, self.must3r_data_root).replace('iphone/render_instance', 'must3r-features').replace('.png', '.pt') for m in masks] + if not all([os.path.exists(p) for p in must3r_feats[:1]]): + if self.verbose: + print(f"skip {video} as not all must3r features or frames exist") + continue + # assert all([os.path.exists(p) for p in frames]), f'Not all frames exist in {video}' + self.frames[video] = frames + self.masks[video] = masks + self.must3r_feats[video] = must3r_feats + self.appearances[video] = json.loads(open(os.path.join(data_root, video, 'scans/instance-appearances.json')).read()) + self.intrinsics[video] = self.load_intrinsics(os.path.join(data_root, video, 'iphone', 'colmap', 'cameras.txt')) + assert len(self.intrinsics[video]) == 1, f'Expected 1 camera, got {len(self.intrinsics[video])} in {video}' + self.extrinsics[video] = os.path.join(data_root, video, 'iphone', 
'colmap', 'images.txt') + assert all([f_name == os.path.basename(m) for f_name, m in zip(self.appearances[video]['framenames'], self.masks[video])]), f'Frame names in appearances do not match masks in {video}' + self.id2label_name[video] = json.loads(open(os.path.join(data_root, video, 'scans/instance_id2label_name.json')).read()) + self.indices += [(video, idx) for idx in range(len(frames) - self.N + 1)] + + print(f'Found {len(self.indices)} frames, and {len(self.frames)} videos, with min length {min([len(self.frames[video]) for video in self.frames])} and max length {max([len(self.frames[video]) for video in self.frames])}') + + def load_intrinsics(self, path): + with open(path, 'r') as f: + raw = f.read().splitlines()[3:] # skip header + intrinsics = {} + for camera in tqdm(raw, position = 1, leave = False): + camera = camera.split(' ') + intrinsics[int(camera[0])] = [camera[1]] + [float(cam) for cam in camera[2:]] + return intrinsics + + def __len__(self): + return len(self.indices) * self.dataset_scale + + def __getitem__(self, idx): + + idx = idx % len(self.indices) + video, idx = self.indices[idx] + if len(glob(os.path.join(self.data_root, video, 'iphone/depth/*.png'))) == 0: + return self[np.random.randint(len(self))] + + ## 1. Randomly shuffle frames + choices = np.delete(np.arange(len(self.frames[video]) - self.N + 1), idx) + sampled_indices = [idx] + np.random.choice(choices, size = len(choices), replace = False).tolist() + ## 2. Find unique instance IDs in the first frame + unique_ids = None + while unique_ids is None or len(unique_ids) == 0: + if unique_ids is not None: + sampled_indices.pop(0) + if len(sampled_indices) == 0: + return self[np.random.randint(len(self))] + unique_ids, _ = np.unique(np.array(Image.open(self.masks[video][sampled_indices[0]])), return_counts = True) + unique_ids = unique_ids[np.array([class_id not in [0, 65535] and self.id2label_name[video][str(class_id)] in self.instance_classes and all([s not in self.id2label_name[video][str(class_id)].lower() for s in ['wall', 'floor', 'ceiling', 'window', 'curtain', 'blind', 'table']]) for class_id in unique_ids])] + + first_frame_idx = sampled_indices[0] + assert len(unique_ids) > 0 + ## 3. 
Load the resize funcs of the first frame + feat_len = torch.load(self.must3r_feats[video][first_frame_idx], map_location = 'cpu')[-1].shape[-2] + must3r_size = original_must3r_size = (224 if feat_len == 196 else 512) + is_continuous = (np.random.rand() < self.continuous_prob) or original_must3r_size not in self.valid_must3r_sizes + if is_continuous: + must3r_size = np.random.choice(self.valid_must3r_sizes).item() + + _, [resize_func] = load_images([self.frames[video][first_frame_idx]], size = must3r_size, patch_size = 16, verbose = self.verbose) + assert len(resize_func.transforms) == 2, f'Expected 2 transforms, got {len(resize_func.transforms)}' + # assert resize_func.transforms[0].size[0] > resize_func.transforms[1].size[0], f'Expected first transform to be larger than second, got {resize_func.transforms[0].size} and {resize_func.transforms[1].size}' + # assert resize_func.transforms[0].size[1] / resize_func.transforms[1].size[1] == resize_func.transforms[0].size[0] / resize_func.transforms[1].size[0], f'Expected aspect ratio to be preserved, got {resize_func.transforms[0].size} and {resize_func.transforms[1].size}' + assert must3r_size != original_must3r_size or resize_func.transforms[1].size[0] * resize_func.transforms[1].size[1] == feat_len * 256, f'Expected {resize_func.transforms[1].size[0]}x{resize_func.transforms[1].size[1]} to be {feat_len * 256}, got {feat_len}' + for instance_id in np.random.permutation(unique_ids).tolist() + [None]: + if instance_id is None: + return self[np.random.randint(len(self))] + if (resize_func.transforms[0](torch.from_numpy(np.array(Image.open(self.masks[video][first_frame_idx]))) == instance_id)).sum() > (resize_func.transforms[0].size[0] * resize_func.transforms[0].size[1] * 0.01): + break + + if is_continuous: + sampled_indices = np.arange(first_frame_idx, len(self.frames[video])).tolist() + # sampled_indices += np.random.permutation(list(set(np.arange(len(self.frames[video])).tolist()) - set(self.appearances[video][str(instance_id)]) - set(sampled_indices))).tolist() + sampled_indices = sampled_indices[:self.N] + assert len(sampled_indices) == self.N and sampled_indices[0] == first_frame_idx, f'Expected {self.N} sampled indices and first index {first_frame_idx}, got {len(sampled_indices)} with first index {sampled_indices[0]}' + else: + sampled_indices = np.arange(first_frame_idx, len(self.frames[video])).tolist()[:2] + sampled_indices = sorted(sampled_indices, key = lambda sample_idx: resize_func.transforms[0](torch.from_numpy(np.array(Image.open(self.masks[video][sample_idx]))) == instance_id).sum(), reverse = True) ## prioritize frames with larger masks + first_frame_idx = sampled_indices[0] + + raw_poses = { + raw.split()[-1].split('iphone/')[-1].split('video/')[-1]: raw.split()[1:-1] + for raw in open(self.extrinsics[video], 'r').read().splitlines() if (not raw.startswith('#')) and len(raw.split()) > 0 + } + views, original_instances, original_imgs, filelist, extrinsics, raw_intrinsics, intrinsics, depths, point_maps, fov_ratios = [], [], [], [], [], [], [], [], [], {} + pre_sampled_len = len(sampled_indices) + if len(sampled_indices) < self.N: + sampled_indices = sampled_indices + np.random.permutation(list(set(self.appearances[video][self.id2label_name[video][str(instance_id)]]) - set(sampled_indices))).tolist() + \ + np.random.permutation(list(set(np.arange(len(self.frames[video])).tolist()) - set(self.appearances[video][self.id2label_name[video][str(instance_id)]]) - set(sampled_indices))).tolist() + while len(views) < self.N and 
len(sampled_indices) >= self.N: + sample_idx = sampled_indices[len(views)] + [view], [resize_func] = load_images([self.frames[video][sample_idx]], size = must3r_size, patch_size = 16, verbose = self.verbose) + instance_map = resize_func.transforms[0](torch.from_numpy(np.array(Image.open(self.masks[video][sample_idx])) == instance_id)) + if len(views) >= pre_sampled_len and (0 < instance_map.sum() < instance_map.shape[-1] * instance_map.shape[-2] * 0.01): + sampled_indices.pop(len(views)) + continue + f_name = os.path.basename(self.frames[video][sample_idx]) + extrinsic = pose_from_qwxyz_txyz(raw_poses[f_name][:-1]) + raw_intrinsic = self.intrinsics[video][int(raw_poses[f_name][-1])] + intrinsic = np.array([[raw_intrinsic[3], 0, raw_intrinsic[5]], + [0, raw_intrinsic[4], raw_intrinsic[6]], + [0, 0, 1 ]], dtype = np.float32) + depth = np.array(Image.open(self.frames[video][sample_idx].replace('rgb', 'depth').replace('.jpg', '.png')).resize((int(raw_intrinsic[1]), int(raw_intrinsic[2]))), dtype = np.float32) / 1000.0 + point_map = resize_func.transforms[0](torch.from_numpy(depth_to_world_pointmap(depth, extrinsic, intrinsic).astype(np.float32)).permute(2, 0, 1)) + assert point_map.shape[-2] == instance_map.shape[-2] == int(raw_intrinsic[2]), f'Expected height {int(raw_intrinsic[2])}, got {point_map.shape[-2]} and {instance_map.shape[-2]}' + fov_ratio = None + if len(views) < pre_sampled_len or instance_map.sum().item() == 0 or \ + (fov_ratio := (in_fov_ratio(point_map[:, instance_map].permute(1, 0), extrinsics[0], K = intrinsics[0], H = int(raw_intrinsics[0][2]), W = int(raw_intrinsics[0][1]), + H_crop = abs(int(raw_intrinsics[0][2]) - original_instances[0].shape[-2]) // 2, + W_crop = abs(int(raw_intrinsics[0][1]) - original_instances[0].shape[-1]) // 2)[0])) > 0.25: + views.append(view) + original_instances.append(instance_map) + original_imgs.append(resize_func.transforms[0](TF.to_tensor(Image.open(self.frames[video][sample_idx])))) + filelist.append(self.frames[video][sample_idx]) + extrinsics.append(extrinsic) + raw_intrinsics.append(raw_intrinsic) + intrinsics.append(intrinsic) + depths.append(resize_func.transforms[0](torch.from_numpy(depth))) + point_maps.append(point_map) + fov_ratios[self.frames[video][sample_idx]] = fov_ratio if fov_ratio is not None else -1 + else: + sampled_indices.pop(len(views)) + continue + + sampled_indices = sampled_indices[:len(views)] + if len(sampled_indices) < self.N: + open(self._log_path, "a").write(f"[short_span] {video}: span={len(sampled_indices)} < N={self.N}\n") + return self[np.random.randint(len(self))] + assert len(sampled_indices) == self.N and sampled_indices[0] == first_frame_idx, f'Expected {self.N} sampled indices and first index {first_frame_idx}, got {len(sampled_indices)} with first index {sampled_indices[0]}' + if not is_continuous or (np.random.rand() < 0.8 and must3r_size == original_must3r_size): + assert original_must3r_size == must3r_size, f'If not continuous, must3r size should not change, got {must3r_size} and {original_must3r_size}' + must3r_feats_filelist = [self.must3r_feats[video][idx] for idx in sampled_indices] + must3r_feats = [torch.load(must3r_filepath, map_location = 'cpu') for must3r_filepath in must3r_feats_filelist] + must3r_feats_head = torch.cat([f[-1] for f in must3r_feats], dim = 0) + must3r_feats = [f[:-1] for f in must3r_feats] + must3r_feats = [torch.cat(f, dim = 0) for f in zip(*must3r_feats)] + must3r_feats = [ + rearrange(f, 'b (h w) c -> b c h w', h = views[0]['true_shape'][0] // 16, w = 
views[0]['true_shape'][1] // 16) + for f in must3r_feats + ] + else: + assert is_continuous, f'If must3r size changed, should be continuous sampling, got {must3r_size} and {original_must3r_size}' + must3r_feats = None + must3r_feats_head = None + + original_instances = torch.stack(original_instances).squeeze()[:, None] + instances = self.instance_transform(original_instances) + assert instances[0].sum() > 0 and instances.ndim == 4, f'{instances.shape=}, {instances[0].sum()=}' + # assert instances[1:].sum() == 0, f"Only first frame should have the instance, got {instances.sum()=}" + original_imgs = torch.stack(original_imgs) + imgs = self.image_transform(original_imgs) + + # if is_continuous: + # permutation = torch.arange(len(instances)) + # else: + # permutation = torch.argsort(instances.squeeze().sum(dim = (1, 2)), descending = True) + permutation = torch.arange(len(instances)) + permutation[pre_sampled_len:] = torch.randperm(len(instances) - pre_sampled_len) + pre_sampled_len + return { + 'original_images': original_imgs[permutation], + 'images': imgs[permutation], + 'original_masks': original_instances[permutation], + 'masks': instances[permutation], + 'filelist': [filelist[idx] for idx in permutation], + 'must3r_views': [views[idx] for idx in permutation], + 'must3r_size': must3r_size, + 'video': video, + 'instance_id': instance_id, + 'dataset': 'scannetpp', + 'valid_masks': torch.ones_like(instances), + 'intrinsics': torch.from_numpy(np.stack(intrinsics, axis = 0))[permutation], + 'extrinsics': torch.from_numpy(np.stack(extrinsics, axis = 0))[permutation], + 'depths': torch.from_numpy(np.stack(depths, axis = 0))[permutation], + 'point_maps': torch.from_numpy(np.stack(point_maps, axis = 0))[permutation], + 'fov_ratios': fov_ratios, + 'is_continuous': is_continuous, + } | ( + { + 'must3r_feats': [f[permutation] for f in must3r_feats], + 'must3r_feats_head': must3r_feats_head[permutation], + 'must3r_feats_filelist': [must3r_feats_filelist[idx] for idx in permutation], + } if must3r_feats is not None else {} + ) \ No newline at end of file diff --git a/dust3r/.gitignore b/dust3r/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..194e236cbd708160926c3513b4232285eb47b029 --- /dev/null +++ b/dust3r/.gitignore @@ -0,0 +1,132 @@ +data/ +checkpoints/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/dust3r/.gitmodules b/dust3r/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..c950ef981a8d2e47599dd7acbbe1bf8de9a42aca --- /dev/null +++ b/dust3r/.gitmodules @@ -0,0 +1,3 @@ +[submodule "croco"] + path = croco + url = https://github.com/naver/croco diff --git a/dust3r/LICENSE b/dust3r/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..a97986e3a8ddd49973959f6c748dfa8b881b64d3 --- /dev/null +++ b/dust3r/LICENSE @@ -0,0 +1,7 @@ +DUSt3R, Copyright (c) 2024-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license. + +A summary of the CC BY-NC-SA 4.0 license is located here: + https://creativecommons.org/licenses/by-nc-sa/4.0/ + +The CC BY-NC-SA 4.0 license is located here: + https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode diff --git a/dust3r/NOTICE b/dust3r/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..81da544dd534c5465361f35cf6a5a0cfff7c1d3f --- /dev/null +++ b/dust3r/NOTICE @@ -0,0 +1,12 @@ +DUSt3R +Copyright 2024-present NAVER Corp. + +This project contains subcomponents with separate copyright notices and license terms. +Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses. + +==== + +naver/croco +https://github.com/naver/croco/ + +Creative Commons Attribution-NonCommercial-ShareAlike 4.0 diff --git a/dust3r/README.md b/dust3r/README.md new file mode 100644 index 0000000000000000000000000000000000000000..858d00b9dad1289f30c3e3533875a6a9808b92f8 --- /dev/null +++ b/dust3r/README.md @@ -0,0 +1,394 @@ +![demo](assets/dust3r.jpg) + +Official implementation of `DUSt3R: Geometric 3D Vision Made Easy` +[[Project page](https://dust3r.europe.naverlabs.com/)], [[DUSt3R arxiv](https://arxiv.org/abs/2312.14132)] + +> Make sure to also check our other works: +> [Grounding Image Matching in 3D with MASt3R](https://github.com/naver/mast3r): DUSt3R with a local feature head, metric pointmaps, and a more scalable global alignment! +> [Pow3R: Empowering Unconstrained 3D Reconstruction with Camera and Scene Priors](https://github.com/naver/pow3r): DUSt3R with known depth / focal length / poses. +> [MUSt3R: Multi-view Network for Stereo 3D Reconstruction](https://github.com/naver/must3r): Multi-view predictions (RGB SLAM/SfM) without any global alignment. 
+ +![Example of reconstruction from two images](assets/pipeline1.jpg) + +![High level overview of DUSt3R capabilities](assets/dust3r_archi.jpg) + +```bibtex +@inproceedings{dust3r_cvpr24, + title={DUSt3R: Geometric 3D Vision Made Easy}, + author={Shuzhe Wang and Vincent Leroy and Yohann Cabon and Boris Chidlovskii and Jerome Revaud}, + booktitle = {CVPR}, + year = {2024} +} + +@misc{dust3r_arxiv23, + title={DUSt3R: Geometric 3D Vision Made Easy}, + author={Shuzhe Wang and Vincent Leroy and Yohann Cabon and Boris Chidlovskii and Jerome Revaud}, + year={2023}, + eprint={2312.14132}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +## Table of Contents + +- [Table of Contents](#table-of-contents) +- [License](#license) +- [Get Started](#get-started) + - [Installation](#installation) + - [Checkpoints](#checkpoints) + - [Interactive demo](#interactive-demo) + - [Interactive demo with docker](#interactive-demo-with-docker) +- [Usage](#usage) +- [Training](#training) + - [Datasets](#datasets) + - [Demo](#demo) + - [Our Hyperparameters](#our-hyperparameters) + +## License + +The code is distributed under the CC BY-NC-SA 4.0 License. +See [LICENSE](LICENSE) for more information. + +```python +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +``` + +## Get Started + +### Installation + +1. Clone DUSt3R. +```bash +git clone --recursive https://github.com/naver/dust3r +cd dust3r +# if you have already cloned dust3r: +# git submodule update --init --recursive +``` + +2. Create the environment, here we show an example using conda. +```bash +conda create -n dust3r python=3.11 cmake=3.14.0 +conda activate dust3r +conda install pytorch torchvision pytorch-cuda=12.1 -c pytorch -c nvidia # use the correct version of cuda for your system +pip install -r requirements.txt +# Optional: you can also install additional packages to: +# - add support for HEIC images +# - add pyrender, used to render depthmap in some datasets preprocessing +# - add required packages for visloc.py +pip install -r requirements_optional.txt +``` + +3. Optional, compile the cuda kernels for RoPE (as in CroCo v2). +```bash +# DUST3R relies on RoPE positional embeddings for which you can compile some cuda kernels for faster runtime. +cd croco/models/curope/ +python setup.py build_ext --inplace +cd ../../../ +``` + +### Checkpoints + +You can obtain the checkpoints by two ways: + +1) You can use our huggingface_hub integration: the models will be downloaded automatically. 
+ +2) Otherwise, We provide several pre-trained models: + +| Modelname | Training resolutions | Head | Encoder | Decoder | +|-------------|----------------------|------|---------|---------| +| [`DUSt3R_ViTLarge_BaseDecoder_224_linear.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_224_linear.pth) | 224x224 | Linear | ViT-L | ViT-B | +| [`DUSt3R_ViTLarge_BaseDecoder_512_linear.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_linear.pth) | 512x384, 512x336, 512x288, 512x256, 512x160 | Linear | ViT-L | ViT-B | +| [`DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth`](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth) | 512x384, 512x336, 512x288, 512x256, 512x160 | DPT | ViT-L | ViT-B | + +You can check the hyperparameters we used to train these models in the [section: Our Hyperparameters](#our-hyperparameters) + +To download a specific model, for example `DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth`: +```bash +mkdir -p checkpoints/ +wget https://download.europe.naverlabs.com/ComputerVision/DUSt3R/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth -P checkpoints/ +``` + +For the checkpoints, make sure to agree to the license of all the public training datasets and base checkpoints we used, in addition to CC-BY-NC-SA 4.0. Again, see [section: Our Hyperparameters](#our-hyperparameters) for details. + +### Interactive demo + +In this demo, you should be able run DUSt3R on your machine to reconstruct a scene. +First select images that depicts the same scene. + +You can adjust the global alignment schedule and its number of iterations. + +> [!NOTE] +> If you selected one or two images, the global alignment procedure will be skipped (mode=GlobalAlignerMode.PairViewer) + +Hit "Run" and wait. +When the global alignment ends, the reconstruction appears. +Use the slider "min_conf_thr" to show or remove low confidence areas. + +```bash +python3 demo.py --model_name DUSt3R_ViTLarge_BaseDecoder_512_dpt + +# Use --weights to load a checkpoint from a local file, eg --weights checkpoints/DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth +# Use --image_size to select the correct resolution for the selected checkpoint. 512 (default) or 224 +# Use --local_network to make it accessible on the local network, or --server_name to specify the url manually +# Use --server_port to change the port, by default it will search for an available port starting at 7860 +# Use --device to use a different device, by default it's "cuda" +``` + +### Interactive demo with docker + +To run DUSt3R using Docker, including with NVIDIA CUDA support, follow these instructions: + +1. **Install Docker**: If not already installed, download and install `docker` and `docker compose` from the [Docker website](https://www.docker.com/get-started). + +2. **Install NVIDIA Docker Toolkit**: For GPU support, install the NVIDIA Docker toolkit from the [Nvidia website](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). + +3. **Build the Docker image and run it**: `cd` into the `./docker` directory and run the following commands: + +```bash +cd docker +bash run.sh --with-cuda --model_name="DUSt3R_ViTLarge_BaseDecoder_512_dpt" +``` + +Or if you want to run the demo without CUDA support, run the following command: + +```bash +cd docker +bash run.sh --model_name="DUSt3R_ViTLarge_BaseDecoder_512_dpt" +``` + +By default, `demo.py` is lanched with the option `--local_network`. 
+Visit `http://localhost:7860/` to access the web UI (or replace `localhost` with the machine's name to access it from the network). + +`run.sh` will launch docker-compose using either the [docker-compose-cuda.yml](docker/docker-compose-cuda.yml) or [docker-compose-cpu.ym](docker/docker-compose-cpu.yml) config file, then it starts the demo using [entrypoint.sh](docker/files/entrypoint.sh). + + +![demo](assets/demo.jpg) + +## Usage + +```python +from dust3r.inference import inference +from dust3r.model import AsymmetricCroCo3DStereo +from dust3r.utils.image import load_images +from dust3r.image_pairs import make_pairs +from dust3r.cloud_opt import global_aligner, GlobalAlignerMode + +if __name__ == '__main__': + device = 'cuda' + batch_size = 1 + schedule = 'cosine' + lr = 0.01 + niter = 300 + + model_name = "naver/DUSt3R_ViTLarge_BaseDecoder_512_dpt" + # you can put the path to a local checkpoint in model_name if needed + model = AsymmetricCroCo3DStereo.from_pretrained(model_name).to(device) + # load_images can take a list of images or a directory + images = load_images(['croco/assets/Chateau1.png', 'croco/assets/Chateau2.png'], size=512) + pairs = make_pairs(images, scene_graph='complete', prefilter=None, symmetrize=True) + output = inference(pairs, model, device, batch_size=batch_size) + + # at this stage, you have the raw dust3r predictions + view1, pred1 = output['view1'], output['pred1'] + view2, pred2 = output['view2'], output['pred2'] + # here, view1, pred1, view2, pred2 are dicts of lists of len(2) + # -> because we symmetrize we have (im1, im2) and (im2, im1) pairs + # in each view you have: + # an integer image identifier: view1['idx'] and view2['idx'] + # the img: view1['img'] and view2['img'] + # the image shape: view1['true_shape'] and view2['true_shape'] + # an instance string output by the dataloader: view1['instance'] and view2['instance'] + # pred1 and pred2 contains the confidence values: pred1['conf'] and pred2['conf'] + # pred1 contains 3D points for view1['img'] in view1['img'] space: pred1['pts3d'] + # pred2 contains 3D points for view2['img'] in view1['img'] space: pred2['pts3d_in_other_view'] + + # next we'll use the global_aligner to align the predictions + # depending on your task, you may be fine with the raw output and not need it + # with only two input images, you could use GlobalAlignerMode.PairViewer: it would just convert the output + # if using GlobalAlignerMode.PairViewer, no need to run compute_global_alignment + scene = global_aligner(output, device=device, mode=GlobalAlignerMode.PointCloudOptimizer) + loss = scene.compute_global_alignment(init="mst", niter=niter, schedule=schedule, lr=lr) + + # retrieve useful values from scene: + imgs = scene.imgs + focals = scene.get_focals() + poses = scene.get_im_poses() + pts3d = scene.get_pts3d() + confidence_masks = scene.get_masks() + + # visualize reconstruction + scene.show() + + # find 2D-2D matches between the two images + from dust3r.utils.geometry import find_reciprocal_matches, xy_grid + pts2d_list, pts3d_list = [], [] + for i in range(2): + conf_i = confidence_masks[i].cpu().numpy() + pts2d_list.append(xy_grid(*imgs[i].shape[:2][::-1])[conf_i]) # imgs[i].shape[:2] = (H, W) + pts3d_list.append(pts3d[i].detach().cpu().numpy()[conf_i]) + reciprocal_in_P2, nn2_in_P1, num_matches = find_reciprocal_matches(*pts3d_list) + print(f'found {num_matches} matches') + matches_im1 = pts2d_list[1][reciprocal_in_P2] + matches_im0 = pts2d_list[0][nn2_in_P1][reciprocal_in_P2] + + # visualize a few matches + import numpy 
as np + from matplotlib import pyplot as pl + n_viz = 10 + match_idx_to_viz = np.round(np.linspace(0, num_matches-1, n_viz)).astype(int) + viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz] + + H0, W0, H1, W1 = *imgs[0].shape[:2], *imgs[1].shape[:2] + img0 = np.pad(imgs[0], ((0, max(H1 - H0, 0)), (0, 0), (0, 0)), 'constant', constant_values=0) + img1 = np.pad(imgs[1], ((0, max(H0 - H1, 0)), (0, 0), (0, 0)), 'constant', constant_values=0) + img = np.concatenate((img0, img1), axis=1) + pl.figure() + pl.imshow(img) + cmap = pl.get_cmap('jet') + for i in range(n_viz): + (x0, y0), (x1, y1) = viz_matches_im0[i].T, viz_matches_im1[i].T + pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(i / (n_viz - 1)), scalex=False, scaley=False) + pl.show(block=True) + +``` +![matching example on croco pair](assets/matching.jpg) + +## Training + +In this section, we present a short demonstration to get started with training DUSt3R. + +### Datasets +At this moment, we have added the following training datasets: + - [CO3Dv2](https://github.com/facebookresearch/co3d) - [Creative Commons Attribution-NonCommercial 4.0 International](https://github.com/facebookresearch/co3d/blob/main/LICENSE) + - [ARKitScenes](https://github.com/apple/ARKitScenes) - [Creative Commons Attribution-NonCommercial-ShareAlike 4.0](https://github.com/apple/ARKitScenes/tree/main?tab=readme-ov-file#license) + - [ScanNet++](https://kaldir.vc.in.tum.de/scannetpp/) - [non-commercial research and educational purposes](https://kaldir.vc.in.tum.de/scannetpp/static/scannetpp-terms-of-use.pdf) + - [BlendedMVS](https://github.com/YoYo000/BlendedMVS) - [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/) + - [WayMo Open dataset](https://github.com/waymo-research/waymo-open-dataset) - [Non-Commercial Use](https://waymo.com/open/terms/) + - [Habitat-Sim](https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md) + - [MegaDepth](https://www.cs.cornell.edu/projects/megadepth/) + - [StaticThings3D](https://github.com/lmb-freiburg/robustmvd/blob/master/rmvd/data/README.md#staticthings3d) + - [WildRGB-D](https://github.com/wildrgbd/wildrgbd/) + +For each dataset, we provide a preprocessing script in the `datasets_preprocess` directory and an archive containing the list of pairs when needed. +You have to download the datasets yourself from their official sources, agree to their license, download our list of pairs, and run the preprocessing script. + +Links: + +[ARKitScenes pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/arkitscenes_pairs.zip) +[ScanNet++ v1 pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/scannetpp_pairs.zip) +[ScanNet++ v2 pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/scannetpp_v2_pairs.zip) +[BlendedMVS pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/blendedmvs_pairs.npy) +[WayMo Open dataset pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/waymo_pairs.npz) +[Habitat metadata](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/habitat_5views_v1_512x512_metadata.tar.gz) +[MegaDepth pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/megadepth_pairs.npz) +[StaticThings3D pairs](https://download.europe.naverlabs.com/ComputerVision/DUSt3R/staticthings_pairs.npy) + +> [!NOTE] +> They are not strictly equivalent to what was used to train DUSt3R, but they should be close enough. 
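
One quick aside before the training demo, on the matching snippet shown in the Usage section above: `find_reciprocal_matches` keeps only pairs of 3D points that are mutual nearest neighbours across the two predicted pointmaps. The sketch below illustrates that idea with SciPy on generic point arrays; it is *not* the repository's implementation (see `dust3r.utils.geometry`), and the real function's return convention may differ.

```python
# Illustrative sketch of reciprocal (mutual nearest-neighbour) matching between
# two 3D point sets expressed in a common frame. Assumes SciPy is available.
import numpy as np
from scipy.spatial import cKDTree

def mutual_nearest_neighbours(pts_a, pts_b):
    """pts_a: (N, 3), pts_b: (M, 3). Returns index arrays (ia, ib) of matched pairs."""
    tree_a, tree_b = cKDTree(pts_a), cKDTree(pts_b)
    _, nn_b_of_a = tree_b.query(pts_a)       # for each point in A, its nearest point in B
    _, nn_a_of_b = tree_a.query(pts_b)       # for each point in B, its nearest point in A
    idx_a = np.arange(len(pts_a))
    mutual = nn_a_of_b[nn_b_of_a] == idx_a   # keep (i, j) only if i -> j and j -> i
    return idx_a[mutual], nn_b_of_a[mutual]

# Toy usage; in practice pts_a / pts_b would be the confidence-masked pts3d arrays above.
ia, ib = mutual_nearest_neighbours(np.random.rand(1000, 3), np.random.rand(1200, 3))
print(f'found {len(ia)} reciprocal matches')
```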
+ +### Demo +For this training demo, we're going to download and prepare a subset of [CO3Dv2](https://github.com/facebookresearch/co3d) - [Creative Commons Attribution-NonCommercial 4.0 International](https://github.com/facebookresearch/co3d/blob/main/LICENSE) and launch the training code on it. +The demo model will be trained for a few epochs on a very small dataset. +It will not be very good. + +```bash +# download and prepare the co3d subset +mkdir -p data/co3d_subset +cd data/co3d_subset +git clone https://github.com/facebookresearch/co3d +cd co3d +python3 ./co3d/download_dataset.py --download_folder ../ --single_sequence_subset +rm ../*.zip +cd ../../.. + +python3 datasets_preprocess/preprocess_co3d.py --co3d_dir data/co3d_subset --output_dir data/co3d_subset_processed --single_sequence_subset + +# download the pretrained croco v2 checkpoint +mkdir -p checkpoints/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth -P checkpoints/ + +# the training of dust3r is done in 3 steps. +# for this example we'll do fewer epochs, for the actual hyperparameters we used in the paper, see the next section: "Our Hyperparameters" +# step 1 - train dust3r for 224 resolution +torchrun --nproc_per_node=4 train.py \ + --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=224, transform=ColorJitter)" \ + --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=224, seed=777)" \ + --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', img_size=(224, 224), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --pretrained "checkpoints/CroCo_V2_ViTLarge_BaseDecoder.pth" \ + --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 16 --accum_iter 1 \ + --save_freq 1 --keep_freq 5 --eval_freq 1 \ + --output_dir "checkpoints/dust3r_demo_224" + +# step 2 - train dust3r for 512 resolution +torchrun --nproc_per_node=4 train.py \ + --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)" \ + --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=(512,384), seed=777)" \ + --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --pretrained "checkpoints/dust3r_demo_224/checkpoint-best.pth" \ + --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 4 --accum_iter 4 \ + --save_freq 1 --keep_freq 5 --eval_freq 1 \ + --output_dir "checkpoints/dust3r_demo_512" + +# step 3 - train dust3r for 512 resolution with dpt +torchrun --nproc_per_node=4 train.py \ + --train_dataset "1000 @ Co3d(split='train', ROOT='data/co3d_subset_processed', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 
336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)" \ + --test_dataset "100 @ Co3d(split='test', ROOT='data/co3d_subset_processed', resolution=(512,384), seed=777)" \ + --model "AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --train_criterion "ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion "Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --pretrained "checkpoints/dust3r_demo_512/checkpoint-best.pth" \ + --lr 0.0001 --min_lr 1e-06 --warmup_epochs 1 --epochs 10 --batch_size 2 --accum_iter 8 \ + --save_freq 1 --keep_freq 5 --eval_freq 1 --disable_cudnn_benchmark \ + --output_dir "checkpoints/dust3r_demo_512dpt" + +``` + +### Our Hyperparameters + +Here are the commands we used for training the models: + +```bash +# NOTE: ROOT path omitted for datasets +# 224 linear +torchrun --nproc_per_node 8 train.py \ + --train_dataset=" + 100_000 @ Habitat(1_000_000, split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ BlendedMVS(split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ MegaDepth(split='train', aug_crop=16, resolution=224, transform=ColorJitter) + 100_000 @ ARKitScenes(aug_crop=256, resolution=224, transform=ColorJitter) + 100_000 @ Co3d(split='train', aug_crop=16, mask_bg='rand', resolution=224, transform=ColorJitter) + 100_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=224, transform=ColorJitter) + 100_000 @ ScanNetpp(split='train', aug_crop=256, resolution=224, transform=ColorJitter) + 100_000 @ InternalUnreleasedDataset(aug_crop=128, resolution=224, transform=ColorJitter) " \ + --test_dataset=" Habitat(1_000, split='val', resolution=224, seed=777) + 1_000 @ BlendedMVS(split='val', resolution=224, seed=777) + 1_000 @ MegaDepth(split='val', resolution=224, seed=777) + 1_000 @ Co3d(split='test', mask_bg='rand', resolution=224, seed=777) " \ + --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', img_size=(224, 224), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --pretrained="checkpoints/CroCo_V2_ViTLarge_BaseDecoder.pth" \ + --lr=0.0001 --min_lr=1e-06 --warmup_epochs=10 --epochs=100 --batch_size=16 --accum_iter=1 \ + --save_freq=5 --keep_freq=10 --eval_freq=1 \ + --output_dir="checkpoints/dust3r_224" + +# 512 linear +torchrun --nproc_per_node 8 train.py \ + --train_dataset=" + 10_000 @ Habitat(1_000_000, split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ BlendedMVS(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ MegaDepth(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ARKitScenes(aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ Co3d(split='train', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 
336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ScanNetpp(split='train', aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ InternalUnreleasedDataset(aug_crop=128, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) " \ + --test_dataset=" Habitat(1_000, split='val', resolution=(512,384), seed=777) + 1_000 @ BlendedMVS(split='val', resolution=(512,384), seed=777) + 1_000 @ MegaDepth(split='val', resolution=(512,336), seed=777) + 1_000 @ Co3d(split='test', resolution=(512,384), seed=777) " \ + --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='linear', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + --pretrained="checkpoints/dust3r_224/checkpoint-best.pth" \ + --lr=0.0001 --min_lr=1e-06 --warmup_epochs=20 --epochs=100 --batch_size=4 --accum_iter=2 \ + --save_freq=10 --keep_freq=10 --eval_freq=1 --print_freq=10 \ + --output_dir="checkpoints/dust3r_512" + +# 512 dpt +torchrun --nproc_per_node 8 train.py \ + --train_dataset=" + 10_000 @ Habitat(1_000_000, split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ BlendedMVS(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ MegaDepth(split='train', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ARKitScenes(aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ Co3d(split='train', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ StaticThings3D(aug_crop=256, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ ScanNetpp(split='train', aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) + 10_000 @ InternalUnreleasedDataset(aug_crop=128, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter) " \ + --test_dataset=" Habitat(1_000, split='val', resolution=(512,384), seed=777) + 1_000 @ BlendedMVS(split='val', resolution=(512,384), seed=777) + 1_000 @ MegaDepth(split='val', resolution=(512,336), seed=777) + 1_000 @ Co3d(split='test', resolution=(512,384), seed=777) " \ + --train_criterion="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)" \ + --test_criterion="Regr3D_ScaleShiftInv(L21, gt_scale=True)" \ + --model="AsymmetricCroCo3DStereo(pos_embed='RoPE100', patch_embed_cls='ManyAR_PatchEmbed', img_size=(512, 512), head_type='dpt', output_mode='pts3d', depth_mode=('exp', -inf, inf), conf_mode=('exp', 1, inf), enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_depth=12, dec_num_heads=12)" \ + 
--pretrained="checkpoints/dust3r_512/checkpoint-best.pth" \ + --lr=0.0001 --min_lr=1e-06 --warmup_epochs=15 --epochs=90 --batch_size=4 --accum_iter=2 \ + --save_freq=5 --keep_freq=10 --eval_freq=1 --print_freq=10 --disable_cudnn_benchmark \ + --output_dir="checkpoints/dust3r_512dpt" + +``` diff --git a/dust3r/__init__.py b/dust3r/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..95a6c4443436adedeba058eafe95cc66e1bc98bc --- /dev/null +++ b/dust3r/__init__.py @@ -0,0 +1,3 @@ +import os +__path__.append(os.path.join(os.path.dirname(__file__), 'dust3r')) +__path__.append(os.path.join(os.path.dirname(__file__), 'croco')) \ No newline at end of file diff --git a/dust3r/croco/LICENSE b/dust3r/croco/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d9b84b1a65f9db6d8920a9048d162f52ba3ea56d --- /dev/null +++ b/dust3r/croco/LICENSE @@ -0,0 +1,52 @@ +CroCo, Copyright (c) 2022-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license. + +A summary of the CC BY-NC-SA 4.0 license is located here: + https://creativecommons.org/licenses/by-nc-sa/4.0/ + +The CC BY-NC-SA 4.0 license is located here: + https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode + + +SEE NOTICE BELOW WITH RESPECT TO THE FILE: models/pos_embed.py, models/blocks.py + +*************************** + +NOTICE WITH RESPECT TO THE FILE: models/pos_embed.py + +This software is being redistributed in a modifiled form. The original form is available here: + +https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py + +This software in this file incorporates parts of the following software available here: + +Transformer: https://github.com/tensorflow/models/blob/master/official/legacy/transformer/model_utils.py +available under the following license: https://github.com/tensorflow/models/blob/master/LICENSE + +MoCo v3: https://github.com/facebookresearch/moco-v3 +available under the following license: https://github.com/facebookresearch/moco-v3/blob/main/LICENSE + +DeiT: https://github.com/facebookresearch/deit +available under the following license: https://github.com/facebookresearch/deit/blob/main/LICENSE + + +ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCE BELOW: + +https://github.com/facebookresearch/mae/blob/main/LICENSE + +Attribution-NonCommercial 4.0 International + +*************************** + +NOTICE WITH RESPECT TO THE FILE: models/blocks.py + +This software is being redistributed in a modifiled form. The original form is available here: + +https://github.com/rwightman/pytorch-image-models + +ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCE BELOW: + +https://github.com/rwightman/pytorch-image-models/blob/master/LICENSE + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ \ No newline at end of file diff --git a/dust3r/croco/NOTICE b/dust3r/croco/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..d51bb365036c12d428d6e3a4fd00885756d5261c --- /dev/null +++ b/dust3r/croco/NOTICE @@ -0,0 +1,21 @@ +CroCo +Copyright 2022-present NAVER Corp. + +This project contains subcomponents with separate copyright notices and license terms. +Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses. 
+ +==== + +facebookresearch/mae +https://github.com/facebookresearch/mae + +Attribution-NonCommercial 4.0 International + +==== + +rwightman/pytorch-image-models +https://github.com/rwightman/pytorch-image-models + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ \ No newline at end of file diff --git a/dust3r/croco/README.MD b/dust3r/croco/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..38e33b001a60bd16749317fb297acd60f28a6f1b --- /dev/null +++ b/dust3r/croco/README.MD @@ -0,0 +1,124 @@ +# CroCo + CroCo v2 / CroCo-Stereo / CroCo-Flow + +[[`CroCo arXiv`](https://arxiv.org/abs/2210.10716)] [[`CroCo v2 arXiv`](https://arxiv.org/abs/2211.10408)] [[`project page and demo`](https://croco.europe.naverlabs.com/)] + +This repository contains the code for our CroCo model presented in our NeurIPS'22 paper [CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion](https://openreview.net/pdf?id=wZEfHUM5ri) and its follow-up extension published at ICCV'23 [Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow](https://openaccess.thecvf.com/content/ICCV2023/html/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.html), refered to as CroCo v2: + +![image](assets/arch.jpg) + +```bibtex +@inproceedings{croco, + title={{CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion}}, + author={{Weinzaepfel, Philippe and Leroy, Vincent and Lucas, Thomas and Br\'egier, Romain and Cabon, Yohann and Arora, Vaibhav and Antsfeld, Leonid and Chidlovskii, Boris and Csurka, Gabriela and Revaud J\'er\^ome}}, + booktitle={{NeurIPS}}, + year={2022} +} + +@inproceedings{croco_v2, + title={{CroCo v2: Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow}}, + author={Weinzaepfel, Philippe and Lucas, Thomas and Leroy, Vincent and Cabon, Yohann and Arora, Vaibhav and Br{\'e}gier, Romain and Csurka, Gabriela and Antsfeld, Leonid and Chidlovskii, Boris and Revaud, J{\'e}r{\^o}me}, + booktitle={ICCV}, + year={2023} +} +``` + +## License + +The code is distributed under the CC BY-NC-SA 4.0 License. See [LICENSE](LICENSE) for more information. +Some components are based on code from [MAE](https://github.com/facebookresearch/mae) released under the CC BY-NC-SA 4.0 License and [timm](https://github.com/rwightman/pytorch-image-models) released under the Apache 2.0 License. +Some components for stereo matching and optical flow are based on code from [unimatch](https://github.com/autonomousvision/unimatch) released under the MIT license. + +## Preparation + +1. Install dependencies on a machine with a NVidia GPU using e.g. conda. Note that `habitat-sim` is required only for the interactive demo and the synthetic pre-training data generation. If you don't plan to use it, you can ignore the line installing it and use a more recent python version. + +```bash +conda create -n croco python=3.7 cmake=3.14.0 +conda activate croco +conda install habitat-sim headless -c conda-forge -c aihabitat +conda install pytorch torchvision -c pytorch +conda install notebook ipykernel matplotlib +conda install ipywidgets widgetsnbextension +conda install scikit-learn tqdm quaternion opencv # only for pretraining / habitat data generation + +``` + +2. Compile cuda kernels for RoPE + +CroCo v2 relies on RoPE positional embeddings for which you need to compile some cuda kernels. 
+```bash +cd models/curope/ +python setup.py build_ext --inplace +cd ../../ +``` + +This can be a bit long as we compile for all cuda architectures, feel free to update L9 of `models/curope/setup.py` to compile for specific architectures only. +You might also need to set the environment `CUDA_HOME` in case you use a custom cuda installation. + +In case you cannot provide, we also provide a slow pytorch version, which will be automatically loaded. + +3. Download pre-trained model + +We provide several pre-trained models: + +| modelname | pre-training data | pos. embed. | Encoder | Decoder | +|------------------------------------------------------------------------------------------------------------------------------------|-------------------|-------------|---------|---------| +| [`CroCo.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth) | Habitat | cosine | ViT-B | Small | +| [`CroCo_V2_ViTBase_SmallDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_SmallDecoder.pth) | Habitat + real | RoPE | ViT-B | Small | +| [`CroCo_V2_ViTBase_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_BaseDecoder.pth) | Habitat + real | RoPE | ViT-B | Base | +| [`CroCo_V2_ViTLarge_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth) | Habitat + real | RoPE | ViT-L | Base | + +To download a specific model, i.e., the first one (`CroCo.pth`) +```bash +mkdir -p pretrained_models/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth -P pretrained_models/ +``` + +## Reconstruction example + +Simply run after downloading the `CroCo_V2_ViTLarge_BaseDecoder` pretrained model (or update the corresponding line in `demo.py`) +```bash +python demo.py +``` + +## Interactive demonstration of cross-view completion reconstruction on the Habitat simulator + +First download the test scene from Habitat: +```bash +python -m habitat_sim.utils.datasets_download --uids habitat_test_scenes --data-path habitat-sim-data/ +``` + +Then, run the Notebook demo `interactive_demo.ipynb`. + +In this demo, you should be able to sample a random reference viewpoint from an [Habitat](https://github.com/facebookresearch/habitat-sim) test scene. Use the sliders to change viewpoint and select a masked target view to reconstruct using CroCo. +![croco_interactive_demo](https://user-images.githubusercontent.com/1822210/200516576-7937bc6a-55f8-49ed-8618-3ddf89433ea4.jpg) + +## Pre-training + +### CroCo + +To pre-train CroCo, please first generate the pre-training data from the Habitat simulator, following the instructions in [datasets/habitat_sim/README.MD](datasets/habitat_sim/README.MD) and then run the following command: +``` +torchrun --nproc_per_node=4 pretrain.py --output_dir ./output/pretraining/ +``` + +Our CroCo pre-training was launched on a single server with 4 GPUs. +It should take around 10 days with A100 or 15 days with V100 to do the 400 pre-training epochs, but decent performances are obtained earlier in training. +Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experimented if it is valid in our case. +The first run can take a few minutes to start, to parse all available pre-training pairs. 
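
The learning-rate scaling rule mentioned above is MAE's linear rule: the learning rate grows proportionally with the effective batch size. A minimal sketch, assuming the conventional base-learning-rate-per-256-samples formulation (the base value of 1.5e-4 below is MAE's default, used purely as an example; check the pre-training script for the constants actually applied here):

```python
# Hedged sketch of MAE-style linear learning-rate scaling with effective batch size.
def scaled_lr(base_lr: float, batch_per_gpu: int, num_gpus: int, accum_iter: int = 1) -> float:
    effective_batch_size = batch_per_gpu * num_gpus * accum_iter
    return base_lr * effective_batch_size / 256.0

# Example: 4 GPUs with 64 images each -> effective batch size 256, so the lr is unchanged.
print(scaled_lr(base_lr=1.5e-4, batch_per_gpu=64, num_gpus=4))   # 0.00015
```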
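
One more aside before moving on to CroCo v2: the CUDA kernels compiled in the Preparation section implement the RoPE positional embedding (`pos_embed='RoPE100'`, where the 100 presumably sets the frequency base). For intuition only, here is the standard 1D rotary-embedding formulation in plain PyTorch; the actual `curope` code applies an optimized 2D variant over patch coordinates, so treat this as a conceptual sketch rather than the project's implementation:

```python
import torch

def rope_1d(x: torch.Tensor, pos: torch.Tensor, base: float = 100.0) -> torch.Tensor:
    """Rotate channel pairs of x by position-dependent angles.
    x: (..., n_tokens, dim) with even dim; pos: (n_tokens,) token positions."""
    dim = x.shape[-1]
    # one rotation frequency per pair of channels
    freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
    angles = pos.float()[:, None] * freqs[None, :]        # (n_tokens, dim // 2)
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[..., 0::2], x[..., 1::2]                   # interleaved channel pairs
    out = torch.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out

# Toy usage: 8 tokens of dimension 16 at positions 0..7
q = torch.randn(8, 16)
q_rot = rope_1d(q, torch.arange(8))
```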
+ +### CroCo v2 + +For CroCo v2 pre-training, in addition to the generation of the pre-training data from the Habitat simulator above, please pre-extract the crops from the real datasets following the instructions in [datasets/crops/README.MD](datasets/crops/README.MD). +Then, run the following command for the largest model (ViT-L encoder, Base decoder): +``` +torchrun --nproc_per_node=8 pretrain.py --model "CroCoNet(enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_num_heads=12, dec_depth=12, pos_embed='RoPE100')" --dataset "habitat_release+ARKitScenes+MegaDepth+3DStreetView+IndoorVL" --warmup_epochs 12 --max_epoch 125 --epochs 250 --amp 0 --keep_freq 5 --output_dir ./output/pretraining_crocov2/ +``` + +Our CroCo v2 pre-training was launched on a single server with 8 GPUs for the largest model, and on a single server with 4 GPUs for the smaller ones, keeping a batch size of 64 per gpu in all cases. +The largest model should take around 12 days on A100. +Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not experimented if it is valid in our case. + +## Stereo matching and Optical flow downstream tasks + +For CroCo-Stereo and CroCo-Flow, please refer to [stereoflow/README.MD](stereoflow/README.MD). diff --git a/dust3r/croco/croco-stereo-flow-demo.ipynb b/dust3r/croco/croco-stereo-flow-demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2b00a7607ab5f82d1857041969bfec977e56b3e0 --- /dev/null +++ b/dust3r/croco/croco-stereo-flow-demo.ipynb @@ -0,0 +1,191 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9bca0f41", + "metadata": {}, + "source": [ + "# Simple inference example with CroCo-Stereo or CroCo-Flow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80653ef7", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n", + "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "4f033862", + "metadata": {}, + "source": [ + "First download the model(s) of your choice by running\n", + "```\n", + "bash stereoflow/download_model.sh crocostereo.pth\n", + "bash stereoflow/download_model.sh crocoflow.pth\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb2e392", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n", + "device = torch.device('cuda:0' if use_gpu else 'cpu')\n", + "import matplotlib.pylab as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0e25d77", + "metadata": {}, + "outputs": [], + "source": [ + "from stereoflow.test import _load_model_and_criterion\n", + "from stereoflow.engine import tiled_pred\n", + "from stereoflow.datasets_stereo import img_to_tensor, vis_disparity\n", + "from stereoflow.datasets_flow import flowToColor\n", + "tile_overlap=0.7 # recommended value, higher value can be slightly better but slower" + ] + }, + { + "cell_type": "markdown", + "id": "86a921f5", + "metadata": {}, + "source": [ + "### CroCo-Stereo example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64e483cb", + "metadata": {}, + "outputs": [], + "source": [ + "image1 = np.asarray(Image.open(''))\n", + "image2 = np.asarray(Image.open(''))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0d04303", + "metadata": {}, + "outputs": [], + "source": [ + "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocostereo.pth', None, device)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47dc14b5", + "metadata": {}, + "outputs": [], + "source": [ + "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n", + "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n", + "with torch.inference_mode():\n", + " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n", + "pred = pred.squeeze(0).squeeze(0).cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "583b9f16", + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(vis_disparity(pred))\n", + "plt.axis('off')" + ] + }, + { + "cell_type": "markdown", + "id": "d2df5d70", + "metadata": {}, + "source": [ + "### CroCo-Flow example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee257a7", + "metadata": {}, + "outputs": [], + "source": [ + "image1 = np.asarray(Image.open(''))\n", + "image2 = np.asarray(Image.open(''))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5edccf0", + "metadata": {}, + "outputs": [], + "source": [ + "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocoflow.pth', None, device)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b19692c3", + "metadata": {}, + "outputs": [], + "source": [ + "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n", + "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n", + "with torch.inference_mode():\n", + " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n", + "pred = pred.squeeze(0).permute(1,2,0).cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"26f79db3", + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(flowToColor(pred))\n", + "plt.axis('off')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/dust3r/croco/datasets/__init__.py b/dust3r/croco/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dust3r/croco/datasets/crops/README.MD b/dust3r/croco/datasets/crops/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..47ddabebb177644694ee247ae878173a3a16644f --- /dev/null +++ b/dust3r/croco/datasets/crops/README.MD @@ -0,0 +1,104 @@ +## Generation of crops from the real datasets + +The instructions below allow to generate the crops used for pre-training CroCo v2 from the following real-world datasets: ARKitScenes, MegaDepth, 3DStreetView and IndoorVL. + +### Download the metadata of the crops to generate + +First, download the metadata and put them in `./data/`: +``` +mkdir -p data +cd data/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/crop_metadata.zip +unzip crop_metadata.zip +rm crop_metadata.zip +cd .. +``` + +### Prepare the original datasets + +Second, download the original datasets in `./data/original_datasets/`. +``` +mkdir -p data/original_datasets +``` + +##### ARKitScenes + +Download the `raw` dataset from https://github.com/apple/ARKitScenes/blob/main/DATA.md and put it in `./data/original_datasets/ARKitScenes/`. +The resulting file structure should be like: +``` +./data/original_datasets/ARKitScenes/ +└───Training + └───40753679 + │ │ ultrawide + │ │ ... + └───40753686 + │ + ... +``` + +##### MegaDepth + +Download `MegaDepth v1 Dataset` from https://www.cs.cornell.edu/projects/megadepth/ and put it in `./data/original_datasets/MegaDepth/`. +The resulting file structure should be like: + +``` +./data/original_datasets/MegaDepth/ +└───0000 +│ └───images +│ │ │ 1000557903_87fa96b8a4_o.jpg +│ │ └ ... +│ └─── ... +└───0001 +│ │ +│ └ ... +└─── ... +``` + +##### 3DStreetView + +Download `3D_Street_View` dataset from https://github.com/amir32002/3D_Street_View and put it in `./data/original_datasets/3DStreetView/`. +The resulting file structure should be like: + +``` +./data/original_datasets/3DStreetView/ +└───dataset_aligned +│ └───0002 +│ │ │ 0000002_0000001_0000002_0000001.jpg +│ │ └ ... +│ └─── ... +└───dataset_unaligned +│ └───0003 +│ │ │ 0000003_0000001_0000002_0000001.jpg +│ │ └ ... +│ └─── ... +``` + +##### IndoorVL + +Download the `IndoorVL` datasets using [Kapture](https://github.com/naver/kapture). 
+ +``` +pip install kapture +mkdir -p ./data/original_datasets/IndoorVL +cd ./data/original_datasets/IndoorVL +kapture_download_dataset.py update +kapture_download_dataset.py install "HyundaiDepartmentStore_*" +kapture_download_dataset.py install "GangnamStation_*" +cd - +``` + +### Extract the crops + +Now, extract the crops for each of the dataset: +``` +for dataset in ARKitScenes MegaDepth 3DStreetView IndoorVL; +do + python3 datasets/crops/extract_crops_from_images.py --crops ./data/crop_metadata/${dataset}/crops_release.txt --root-dir ./data/original_datasets/${dataset}/ --output-dir ./data/${dataset}_crops/ --imsize 256 --nthread 8 --max-subdir-levels 5 --ideal-number-pairs-in-dir 500; +done +``` + +##### Note for IndoorVL + +Due to some legal issues, we can only release 144,228 pairs out of the 1,593,689 pairs used in the paper. +To account for it in terms of number of pre-training iterations, the pre-training command in this repository uses 125 training epochs including 12 warm-up epochs and learning rate cosine schedule of 250, instead of 100, 10 and 200 respectively. +The impact on the performance is negligible. diff --git a/dust3r/croco/datasets/crops/extract_crops_from_images.py b/dust3r/croco/datasets/crops/extract_crops_from_images.py new file mode 100644 index 0000000000000000000000000000000000000000..eb66a0474ce44b54c44c08887cbafdb045b11ff3 --- /dev/null +++ b/dust3r/croco/datasets/crops/extract_crops_from_images.py @@ -0,0 +1,159 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Extracting crops for pre-training +# -------------------------------------------------------- + +import os +import argparse +from tqdm import tqdm +from PIL import Image +import functools +from multiprocessing import Pool +import math + + +def arg_parser(): + parser = argparse.ArgumentParser('Generate cropped image pairs from image crop list') + + parser.add_argument('--crops', type=str, required=True, help='crop file') + parser.add_argument('--root-dir', type=str, required=True, help='root directory') + parser.add_argument('--output-dir', type=str, required=True, help='output directory') + parser.add_argument('--imsize', type=int, default=256, help='size of the crops') + parser.add_argument('--nthread', type=int, required=True, help='number of simultaneous threads') + parser.add_argument('--max-subdir-levels', type=int, default=5, help='maximum number of subdirectories') + parser.add_argument('--ideal-number-pairs-in-dir', type=int, default=500, help='number of pairs stored in a dir') + return parser + + +def main(args): + listing_path = os.path.join(args.output_dir, 'listing.txt') + + print(f'Loading list of crops ... 
({args.nthread} threads)') + crops, num_crops_to_generate = load_crop_file(args.crops) + + print(f'Preparing jobs ({len(crops)} candidate image pairs)...') + num_levels = min(math.ceil(math.log(num_crops_to_generate, args.ideal_number_pairs_in_dir)), args.max_subdir_levels) + num_pairs_in_dir = math.ceil(num_crops_to_generate ** (1/num_levels)) + + jobs = prepare_jobs(crops, num_levels, num_pairs_in_dir) + del crops + + os.makedirs(args.output_dir, exist_ok=True) + mmap = Pool(args.nthread).imap_unordered if args.nthread > 1 else map + call = functools.partial(save_image_crops, args) + + print(f"Generating cropped images to {args.output_dir} ...") + with open(listing_path, 'w') as listing: + listing.write('# pair_path\n') + for results in tqdm(mmap(call, jobs), total=len(jobs)): + for path in results: + listing.write(f'{path}\n') + print('Finished writing listing to', listing_path) + + +def load_crop_file(path): + data = open(path).read().splitlines() + pairs = [] + num_crops_to_generate = 0 + for line in tqdm(data): + if line.startswith('#'): + continue + line = line.split(', ') + if len(line) < 8: + img1, img2, rotation = line + pairs.append((img1, img2, int(rotation), [])) + else: + l1, r1, t1, b1, l2, r2, t2, b2 = map(int, line) + rect1, rect2 = (l1, t1, r1, b1), (l2, t2, r2, b2) + pairs[-1][-1].append((rect1, rect2)) + num_crops_to_generate += 1 + return pairs, num_crops_to_generate + + +def prepare_jobs(pairs, num_levels, num_pairs_in_dir): + jobs = [] + powers = [num_pairs_in_dir**level for level in reversed(range(num_levels))] + + def get_path(idx): + idx_array = [] + d = idx + for level in range(num_levels - 1): + idx_array.append(idx // powers[level]) + idx = idx % powers[level] + idx_array.append(d) + return '/'.join(map(lambda x: hex(x)[2:], idx_array)) + + idx = 0 + for pair_data in tqdm(pairs): + img1, img2, rotation, crops = pair_data + if -60 <= rotation and rotation <= 60: + rotation = 0 # most likely not a true rotation + paths = [get_path(idx + k) for k in range(len(crops))] + idx += len(crops) + jobs.append(((img1, img2), rotation, crops, paths)) + return jobs + + +def load_image(path): + try: + return Image.open(path).convert('RGB') + except Exception as e: + print('skipping', path, e) + raise OSError() + + +def save_image_crops(args, data): + # load images + img_pair, rot, crops, paths = data + try: + img1, img2 = [load_image(os.path.join(args.root_dir, impath)) for impath in img_pair] + except OSError as e: + return [] + + def area(sz): + return sz[0] * sz[1] + + tgt_size = (args.imsize, args.imsize) + + def prepare_crop(img, rect, rot=0): + # actual crop + img = img.crop(rect) + + # resize to desired size + interp = Image.Resampling.LANCZOS if area(img.size) > 4*area(tgt_size) else Image.Resampling.BICUBIC + img = img.resize(tgt_size, resample=interp) + + # rotate the image + rot90 = (round(rot/90) % 4) * 90 + if rot90 == 90: + img = img.transpose(Image.Transpose.ROTATE_90) + elif rot90 == 180: + img = img.transpose(Image.Transpose.ROTATE_180) + elif rot90 == 270: + img = img.transpose(Image.Transpose.ROTATE_270) + return img + + results = [] + for (rect1, rect2), path in zip(crops, paths): + crop1 = prepare_crop(img1, rect1) + crop2 = prepare_crop(img2, rect2, rot) + + fullpath1 = os.path.join(args.output_dir, path+'_1.jpg') + fullpath2 = os.path.join(args.output_dir, path+'_2.jpg') + os.makedirs(os.path.dirname(fullpath1), exist_ok=True) + + assert not os.path.isfile(fullpath1), fullpath1 + assert not os.path.isfile(fullpath2), fullpath2 + crop1.save(fullpath1) + 
crop2.save(fullpath2) + results.append(path) + + return results + + +if __name__ == '__main__': + args = arg_parser().parse_args() + main(args) + diff --git a/dust3r/croco/datasets/habitat_sim/README.MD b/dust3r/croco/datasets/habitat_sim/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..a505781ff9eb91bce7f1d189e848f8ba1c560940 --- /dev/null +++ b/dust3r/croco/datasets/habitat_sim/README.MD @@ -0,0 +1,76 @@ +## Generation of synthetic image pairs using Habitat-Sim + +These instructions allow to generate pre-training pairs from the Habitat simulator. +As we did not save metadata of the pairs used in the original paper, they are not strictly the same, but these data use the same setting and are equivalent. + +### Download Habitat-Sim scenes +Download Habitat-Sim scenes: +- Download links can be found here: https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md +- We used scenes from the HM3D, habitat-test-scenes, Replica, ReplicaCad and ScanNet datasets. +- Please put the scenes under `./data/habitat-sim-data/scene_datasets/` following the structure below, or update manually paths in `paths.py`. +``` +./data/ +└──habitat-sim-data/ + └──scene_datasets/ + ├──hm3d/ + ├──gibson/ + ├──habitat-test-scenes/ + ├──replica_cad_baked_lighting/ + ├──replica_cad/ + ├──ReplicaDataset/ + └──scannet/ +``` + +### Image pairs generation +We provide metadata to generate reproducible images pairs for pretraining and validation. +Experiments described in the paper used similar data, but whose generation was not reproducible at the time. + +Specifications: +- 256x256 resolution images, with 60 degrees field of view . +- Up to 1000 image pairs per scene. +- Number of scenes considered/number of images pairs per dataset: + - Scannet: 1097 scenes / 985 209 pairs + - HM3D: + - hm3d/train: 800 / 800k pairs + - hm3d/val: 100 scenes / 100k pairs + - hm3d/minival: 10 scenes / 10k pairs + - habitat-test-scenes: 3 scenes / 3k pairs + - replica_cad_baked_lighting: 13 scenes / 13k pairs + +- Scenes from hm3d/val and hm3d/minival pairs were not used for the pre-training but kept for validation purposes. + +Download metadata and extract it: +```bash +mkdir -p data/habitat_release_metadata/ +cd data/habitat_release_metadata/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/habitat_release_metadata/multiview_habitat_metadata.tar.gz +tar -xvf multiview_habitat_metadata.tar.gz +cd ../.. +# Location of the metadata +METADATA_DIR="./data/habitat_release_metadata/multiview_habitat_metadata" +``` + +Generate image pairs from metadata: +- The following command will print a list of commandlines to generate image pairs for each scene: +```bash +# Target output directory +PAIRS_DATASET_DIR="./data/habitat_release/" +python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR +``` +- One can launch multiple of such commands in parallel e.g. using GNU Parallel: +```bash +python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR | parallel -j 16 +``` + +## Metadata generation + +Image pairs were randomly sampled using the following commands, whose outputs contain randomness and are thus not exactly reproducible: +```bash +# Print commandlines to generate image pairs from the different scenes available. 
+PAIRS_DATASET_DIR=MY_CUSTOM_PATH +python datasets/habitat_sim/generate_multiview_images.py --list_commands --output_dir=$PAIRS_DATASET_DIR + +# Once a dataset is generated, pack metadata files for reproducibility. +METADATA_DIR=MY_CUSTON_PATH +python datasets/habitat_sim/pack_metadata_files.py $PAIRS_DATASET_DIR $METADATA_DIR +``` diff --git a/dust3r/croco/datasets/habitat_sim/__init__.py b/dust3r/croco/datasets/habitat_sim/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/dust3r/croco/datasets/habitat_sim/generate_from_metadata.py b/dust3r/croco/datasets/habitat_sim/generate_from_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..fbe0d399084359495250dc8184671ff498adfbf2 --- /dev/null +++ b/dust3r/croco/datasets/habitat_sim/generate_from_metadata.py @@ -0,0 +1,92 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +""" +Script to generate image pairs for a given scene reproducing poses provided in a metadata file. +""" +import os +from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator +from datasets.habitat_sim.paths import SCENES_DATASET +import argparse +import quaternion +import PIL.Image +import cv2 +import json +from tqdm import tqdm + +def generate_multiview_images_from_metadata(metadata_filename, + output_dir, + overload_params = dict(), + scene_datasets_paths=None, + exist_ok=False): + """ + Generate images from a metadata file for reproducibility purposes. + """ + # Reorder paths by decreasing label length, to avoid collisions when testing if a string by such label + if scene_datasets_paths is not None: + scene_datasets_paths = dict(sorted(scene_datasets_paths.items(), key= lambda x: len(x[0]), reverse=True)) + + with open(metadata_filename, 'r') as f: + input_metadata = json.load(f) + metadata = dict() + for key, value in input_metadata.items(): + # Optionally replace some paths + if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "": + if scene_datasets_paths is not None: + for dataset_label, dataset_path in scene_datasets_paths.items(): + if value.startswith(dataset_label): + value = os.path.normpath(os.path.join(dataset_path, os.path.relpath(value, dataset_label))) + break + metadata[key] = value + + # Overload some parameters + for key, value in overload_params.items(): + metadata[key] = value + + generation_entries = dict([(key, value) for key, value in metadata.items() if not (key in ('multiviews', 'output_dir', 'generate_depth'))]) + generate_depth = metadata["generate_depth"] + + os.makedirs(output_dir, exist_ok=exist_ok) + + generator = MultiviewHabitatSimGenerator(**generation_entries) + + # Generate views + for idx_label, data in tqdm(metadata['multiviews'].items()): + positions = data["positions"] + orientations = data["orientations"] + n = len(positions) + for oidx in range(n): + observation = generator.render_viewpoint(positions[oidx], quaternion.from_float_array(orientations[oidx])) + observation_label = f"{oidx + 1}" # Leonid is indexing starting from 1 + # Color image saved using PIL + img = PIL.Image.fromarray(observation['color'][:,:,:3]) + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg") + img.save(filename) + if generate_depth: + # Depth image as EXR file + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr") + cv2.imwrite(filename, 
observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF]) + # Camera parameters + camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")]) + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json") + with open(filename, "w") as f: + json.dump(camera_params, f) + # Save metadata + with open(os.path.join(output_dir, "metadata.json"), "w") as f: + json.dump(metadata, f) + + generator.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--metadata_filename", required=True) + parser.add_argument("--output_dir", required=True) + args = parser.parse_args() + + generate_multiview_images_from_metadata(metadata_filename=args.metadata_filename, + output_dir=args.output_dir, + scene_datasets_paths=SCENES_DATASET, + overload_params=dict(), + exist_ok=True) + + \ No newline at end of file diff --git a/dust3r/croco/datasets/habitat_sim/generate_from_metadata_files.py b/dust3r/croco/datasets/habitat_sim/generate_from_metadata_files.py new file mode 100644 index 0000000000000000000000000000000000000000..962ef849d8c31397b8622df4f2d9140175d78873 --- /dev/null +++ b/dust3r/croco/datasets/habitat_sim/generate_from_metadata_files.py @@ -0,0 +1,27 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +""" +Script generating commandlines to generate image pairs from metadata files. +""" +import os +import glob +from tqdm import tqdm +import argparse + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input_dir", required=True) + parser.add_argument("--output_dir", required=True) + parser.add_argument("--prefix", default="", help="Commanline prefix, useful e.g. to setup environment.") + args = parser.parse_args() + + input_metadata_filenames = glob.iglob(f"{args.input_dir}/**/metadata.json", recursive=True) + + for metadata_filename in tqdm(input_metadata_filenames): + output_dir = os.path.join(args.output_dir, os.path.relpath(os.path.dirname(metadata_filename), args.input_dir)) + # Do not process the scene if the metadata file already exists + if os.path.exists(os.path.join(output_dir, "metadata.json")): + continue + commandline = f"{args.prefix}python datasets/habitat_sim/generate_from_metadata.py --metadata_filename={metadata_filename} --output_dir={output_dir}" + print(commandline) diff --git a/dust3r/croco/datasets/habitat_sim/generate_multiview_images.py b/dust3r/croco/datasets/habitat_sim/generate_multiview_images.py new file mode 100644 index 0000000000000000000000000000000000000000..421d49a1696474415940493296b3f2d982398850 --- /dev/null +++ b/dust3r/croco/datasets/habitat_sim/generate_multiview_images.py @@ -0,0 +1,177 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +import os +from tqdm import tqdm +import argparse +import PIL.Image +import numpy as np +import json +from datasets.habitat_sim.multiview_habitat_sim_generator import MultiviewHabitatSimGenerator, NoNaviguableSpaceError +from datasets.habitat_sim.paths import list_scenes_available +import cv2 +import quaternion +import shutil + +def generate_multiview_images_for_scene(scene_dataset_config_file, + scene, + navmesh, + output_dir, + views_count, + size, + exist_ok=False, + generate_depth=False, + **kwargs): + """ + Generate tuples of overlapping views for a given scene. 
+ generate_depth: generate depth images and camera parameters. + """ + if os.path.exists(output_dir) and not exist_ok: + print(f"Scene {scene}: data already generated. Ignoring generation.") + return + try: + print(f"Scene {scene}: {size} multiview acquisitions to generate...") + os.makedirs(output_dir, exist_ok=exist_ok) + + metadata_filename = os.path.join(output_dir, "metadata.json") + + metadata_template = dict(scene_dataset_config_file=scene_dataset_config_file, + scene=scene, + navmesh=navmesh, + views_count=views_count, + size=size, + generate_depth=generate_depth, + **kwargs) + metadata_template["multiviews"] = dict() + + if os.path.exists(metadata_filename): + print("Metadata file already exists:", metadata_filename) + print("Loading already generated metadata file...") + with open(metadata_filename, "r") as f: + metadata = json.load(f) + + for key in metadata_template.keys(): + if key != "multiviews": + assert metadata_template[key] == metadata[key], f"existing file is inconsistent with the input parameters:\nKey: {key}\nmetadata: {metadata[key]}\ntemplate: {metadata_template[key]}." + else: + print("No temporary file found. Starting generation from scratch...") + metadata = metadata_template + + starting_id = len(metadata["multiviews"]) + print(f"Starting generation from index {starting_id}/{size}...") + if starting_id >= size: + print("Generation already done.") + return + + generator = MultiviewHabitatSimGenerator(scene_dataset_config_file=scene_dataset_config_file, + scene=scene, + navmesh=navmesh, + views_count = views_count, + size = size, + **kwargs) + + for idx in tqdm(range(starting_id, size)): + # Generate / re-generate the observations + try: + data = generator[idx] + observations = data["observations"] + positions = data["positions"] + orientations = data["orientations"] + + idx_label = f"{idx:08}" + for oidx, observation in enumerate(observations): + observation_label = f"{oidx + 1}" # Leonid is indexing starting from 1 + # Color image saved using PIL + img = PIL.Image.fromarray(observation['color'][:,:,:3]) + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}.jpeg") + img.save(filename) + if generate_depth: + # Depth image as EXR file + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_depth.exr") + cv2.imwrite(filename, observation['depth'], [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF]) + # Camera parameters + camera_params = dict([(key, observation[key].tolist()) for key in ("camera_intrinsics", "R_cam2world", "t_cam2world")]) + filename = os.path.join(output_dir, f"{idx_label}_{observation_label}_camera_params.json") + with open(filename, "w") as f: + json.dump(camera_params, f) + metadata["multiviews"][idx_label] = {"positions": positions.tolist(), + "orientations": orientations.tolist(), + "covisibility_ratios": data["covisibility_ratios"].tolist(), + "valid_fractions": data["valid_fractions"].tolist(), + "pairwise_visibility_ratios": data["pairwise_visibility_ratios"].tolist()} + except RecursionError: + print("Recursion error: unable to sample observations for this scene. 
We will stop there.") + break + + # Regularly save a temporary metadata file, in case we need to restart the generation + if idx % 10 == 0: + with open(metadata_filename, "w") as f: + json.dump(metadata, f) + + # Save metadata + with open(metadata_filename, "w") as f: + json.dump(metadata, f) + + generator.close() + except NoNaviguableSpaceError: + pass + +def create_commandline(scene_data, generate_depth, exist_ok=False): + """ + Create a commandline string to generate a scene. + """ + def my_formatting(val): + if val is None or val == "": + return '""' + else: + return val + commandline = f"""python {__file__} --scene {my_formatting(scene_data.scene)} + --scene_dataset_config_file {my_formatting(scene_data.scene_dataset_config_file)} + --navmesh {my_formatting(scene_data.navmesh)} + --output_dir {my_formatting(scene_data.output_dir)} + --generate_depth {int(generate_depth)} + --exist_ok {int(exist_ok)} + """ + commandline = " ".join(commandline.split()) + return commandline + +if __name__ == "__main__": + os.umask(2) + + parser = argparse.ArgumentParser(description="""Example of use -- listing commands to generate data for scenes available: + > python datasets/habitat_sim/generate_multiview_habitat_images.py --list_commands + """) + + parser.add_argument("--output_dir", type=str, required=True) + parser.add_argument("--list_commands", action='store_true', help="list commandlines to run if true") + parser.add_argument("--scene", type=str, default="") + parser.add_argument("--scene_dataset_config_file", type=str, default="") + parser.add_argument("--navmesh", type=str, default="") + + parser.add_argument("--generate_depth", type=int, default=1) + parser.add_argument("--exist_ok", type=int, default=0) + + kwargs = dict(resolution=(256,256), hfov=60, views_count = 2, size=1000) + + args = parser.parse_args() + generate_depth=bool(args.generate_depth) + exist_ok = bool(args.exist_ok) + + if args.list_commands: + # Listing scenes available... + scenes_data = list_scenes_available(base_output_dir=args.output_dir) + + for scene_data in scenes_data: + print(create_commandline(scene_data, generate_depth=generate_depth, exist_ok=exist_ok)) + else: + if args.scene == "" or args.output_dir == "": + print("Missing scene or output dir argument!") + print(parser.format_help()) + else: + generate_multiview_images_for_scene(scene=args.scene, + scene_dataset_config_file = args.scene_dataset_config_file, + navmesh = args.navmesh, + output_dir = args.output_dir, + exist_ok=exist_ok, + generate_depth=generate_depth, + **kwargs) \ No newline at end of file diff --git a/dust3r/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py b/dust3r/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..91e5f923b836a645caf5d8e4aacc425047e3c144 --- /dev/null +++ b/dust3r/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py @@ -0,0 +1,390 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
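+
+# Usage sketch (illustrative only -- the scene/navmesh paths are placeholders; see
+# paths.py / list_scenes_available() for the scene datasets actually supported):
+#
+#   generator = MultiviewHabitatSimGenerator(
+#       scene="scene.glb", navmesh="scene.navmesh", scene_dataset_config_file="",
+#       resolution=(256, 256), hfov=60, views_count=2, size=1000,
+#       minimum_covisibility=0.5)
+#   data = generator[0]   # dict with "observations", "positions", "orientations",
+#                         # "covisibility_ratios", "valid_fractions",
+#                         # "pairwise_visibility_ratios"
+#   generator.close()
+#
+# Cameras follow a pinhole model with the principal point at the image centre and
+# focal length f = width/2 / tan(hfov/2), e.g. f ~= 221.7 px for 256x256 at hfov=60.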
+ +import os +import numpy as np +import quaternion +import habitat_sim +import json +from sklearn.neighbors import NearestNeighbors +import cv2 + +# OpenCV to habitat camera convention transformation +R_OPENCV2HABITAT = np.stack((habitat_sim.geo.RIGHT, -habitat_sim.geo.UP, habitat_sim.geo.FRONT), axis=0) +R_HABITAT2OPENCV = R_OPENCV2HABITAT.T +DEG2RAD = np.pi / 180 + +def compute_camera_intrinsics(height, width, hfov): + f = width/2 / np.tan(hfov/2 * np.pi/180) + cu, cv = width/2, height/2 + return f, cu, cv + +def compute_camera_pose_opencv_convention(camera_position, camera_orientation): + R_cam2world = quaternion.as_rotation_matrix(camera_orientation) @ R_OPENCV2HABITAT + t_cam2world = np.asarray(camera_position) + return R_cam2world, t_cam2world + +def compute_pointmap(depthmap, hfov): + """ Compute a HxWx3 pointmap in camera frame from a HxW depth map.""" + height, width = depthmap.shape + f, cu, cv = compute_camera_intrinsics(height, width, hfov) + # Cast depth map to point + z_cam = depthmap + u, v = np.meshgrid(range(width), range(height)) + x_cam = (u - cu) / f * z_cam + y_cam = (v - cv) / f * z_cam + X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1) + return X_cam + +def compute_pointcloud(depthmap, hfov, camera_position, camera_rotation): + """Return a 3D point cloud corresponding to valid pixels of the depth map""" + R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_position, camera_rotation) + + X_cam = compute_pointmap(depthmap=depthmap, hfov=hfov) + valid_mask = (X_cam[:,:,2] != 0.0) + + X_cam = X_cam.reshape(-1, 3)[valid_mask.flatten()] + X_world = X_cam @ R_cam2world.T + t_cam2world.reshape(1, 3) + return X_world + +def compute_pointcloud_overlaps_scikit(pointcloud1, pointcloud2, distance_threshold, compute_symmetric=False): + """ + Compute 'overlapping' metrics based on a distance threshold between two point clouds. + """ + nbrs = NearestNeighbors(n_neighbors=1, algorithm = 'kd_tree').fit(pointcloud2) + distances, indices = nbrs.kneighbors(pointcloud1) + intersection1 = np.count_nonzero(distances.flatten() < distance_threshold) + + data = {"intersection1": intersection1, + "size1": len(pointcloud1)} + if compute_symmetric: + nbrs = NearestNeighbors(n_neighbors=1, algorithm = 'kd_tree').fit(pointcloud1) + distances, indices = nbrs.kneighbors(pointcloud2) + intersection2 = np.count_nonzero(distances.flatten() < distance_threshold) + data["intersection2"] = intersection2 + data["size2"] = len(pointcloud2) + + return data + +def _append_camera_parameters(observation, hfov, camera_location, camera_rotation): + """ + Add camera parameters to the observation dictionnary produced by Habitat-Sim + In-place modifications. + """ + R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(camera_location, camera_rotation) + height, width = observation['depth'].shape + f, cu, cv = compute_camera_intrinsics(height, width, hfov) + K = np.asarray([[f, 0, cu], + [0, f, cv], + [0, 0, 1.0]]) + observation["camera_intrinsics"] = K + observation["t_cam2world"] = t_cam2world + observation["R_cam2world"] = R_cam2world + +def look_at(eye, center, up, return_cam2world=True): + """ + Return camera pose looking at a given center point. + Analogous of gluLookAt function, using OpenCV camera convention. 
+ """ + z = center - eye + z /= np.linalg.norm(z, axis=-1, keepdims=True) + y = -up + y = y - np.sum(y * z, axis=-1, keepdims=True) * z + y /= np.linalg.norm(y, axis=-1, keepdims=True) + x = np.cross(y, z, axis=-1) + + if return_cam2world: + R = np.stack((x, y, z), axis=-1) + t = eye + else: + # World to camera transformation + # Transposed matrix + R = np.stack((x, y, z), axis=-2) + t = - np.einsum('...ij, ...j', R, eye) + return R, t + +def look_at_for_habitat(eye, center, up, return_cam2world=True): + R, t = look_at(eye, center, up) + orientation = quaternion.from_rotation_matrix(R @ R_OPENCV2HABITAT.T) + return orientation, t + +def generate_orientation_noise(pan_range, tilt_range, roll_range): + return (quaternion.from_rotation_vector(np.random.uniform(*pan_range) * DEG2RAD * habitat_sim.geo.UP) + * quaternion.from_rotation_vector(np.random.uniform(*tilt_range) * DEG2RAD * habitat_sim.geo.RIGHT) + * quaternion.from_rotation_vector(np.random.uniform(*roll_range) * DEG2RAD * habitat_sim.geo.FRONT)) + + +class NoNaviguableSpaceError(RuntimeError): + def __init__(self, *args): + super().__init__(*args) + +class MultiviewHabitatSimGenerator: + def __init__(self, + scene, + navmesh, + scene_dataset_config_file, + resolution = (240, 320), + views_count=2, + hfov = 60, + gpu_id = 0, + size = 10000, + minimum_covisibility = 0.5, + transform = None): + self.scene = scene + self.navmesh = navmesh + self.scene_dataset_config_file = scene_dataset_config_file + self.resolution = resolution + self.views_count = views_count + assert(self.views_count >= 1) + self.hfov = hfov + self.gpu_id = gpu_id + self.size = size + self.transform = transform + + # Noise added to camera orientation + self.pan_range = (-3, 3) + self.tilt_range = (-10, 10) + self.roll_range = (-5, 5) + + # Height range to sample cameras + self.height_range = (1.2, 1.8) + + # Random steps between the camera views + self.random_steps_count = 5 + self.random_step_variance = 2.0 + + # Minimum fraction of the scene which should be valid (well defined depth) + self.minimum_valid_fraction = 0.7 + + # Distance threshold to see to select pairs + self.distance_threshold = 0.05 + # Minimum IoU of a view point cloud with respect to the reference view to be kept. + self.minimum_covisibility = minimum_covisibility + + # Maximum number of retries. 
+ self.max_attempts_count = 100 + + self.seed = None + self._lazy_initialization() + + def _lazy_initialization(self): + # Lazy random seeding and instantiation of the simulator to deal with multiprocessing properly + if self.seed == None: + # Re-seed numpy generator + np.random.seed() + self.seed = np.random.randint(2**32-1) + sim_cfg = habitat_sim.SimulatorConfiguration() + sim_cfg.scene_id = self.scene + if self.scene_dataset_config_file is not None and self.scene_dataset_config_file != "": + sim_cfg.scene_dataset_config_file = self.scene_dataset_config_file + sim_cfg.random_seed = self.seed + sim_cfg.load_semantic_mesh = False + sim_cfg.gpu_device_id = self.gpu_id + + depth_sensor_spec = habitat_sim.CameraSensorSpec() + depth_sensor_spec.uuid = "depth" + depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH + depth_sensor_spec.resolution = self.resolution + depth_sensor_spec.hfov = self.hfov + depth_sensor_spec.position = [0.0, 0.0, 0] + depth_sensor_spec.orientation + + rgb_sensor_spec = habitat_sim.CameraSensorSpec() + rgb_sensor_spec.uuid = "color" + rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR + rgb_sensor_spec.resolution = self.resolution + rgb_sensor_spec.hfov = self.hfov + rgb_sensor_spec.position = [0.0, 0.0, 0] + agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec, depth_sensor_spec]) + + cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg]) + self.sim = habitat_sim.Simulator(cfg) + if self.navmesh is not None and self.navmesh != "": + # Use pre-computed navmesh when available (usually better than those generated automatically) + self.sim.pathfinder.load_nav_mesh(self.navmesh) + + if not self.sim.pathfinder.is_loaded: + # Try to compute a navmesh + navmesh_settings = habitat_sim.NavMeshSettings() + navmesh_settings.set_defaults() + self.sim.recompute_navmesh(self.sim.pathfinder, navmesh_settings, True) + + # Ensure that the navmesh is not empty + if not self.sim.pathfinder.is_loaded: + raise NoNaviguableSpaceError(f"No naviguable location (scene: {self.scene} -- navmesh: {self.navmesh})") + + self.agent = self.sim.initialize_agent(agent_id=0) + + def close(self): + self.sim.close() + + def __del__(self): + self.sim.close() + + def __len__(self): + return self.size + + def sample_random_viewpoint(self): + """ Sample a random viewpoint using the navmesh """ + nav_point = self.sim.pathfinder.get_random_navigable_point() + + # Sample a random viewpoint height + viewpoint_height = np.random.uniform(*self.height_range) + viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP + viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(0, 2 * np.pi) * habitat_sim.geo.UP) * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range) + return viewpoint_position, viewpoint_orientation, nav_point + + def sample_other_random_viewpoint(self, observed_point, nav_point): + """ Sample a random viewpoint close to an existing one, using the navmesh and a reference observed point.""" + other_nav_point = nav_point + + walk_directions = self.random_step_variance * np.asarray([1,0,1]) + for i in range(self.random_steps_count): + temp = self.sim.pathfinder.snap_point(other_nav_point + walk_directions * np.random.normal(size=3)) + # Snapping may return nan when it fails + if not np.isnan(temp[0]): + other_nav_point = temp + + other_viewpoint_height = np.random.uniform(*self.height_range) + other_viewpoint_position = other_nav_point + other_viewpoint_height * habitat_sim.geo.UP + + # Set viewing 
direction towards the central point + rotation, position = look_at_for_habitat(eye=other_viewpoint_position, center=observed_point, up=habitat_sim.geo.UP, return_cam2world=True) + rotation = rotation * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range) + return position, rotation, other_nav_point + + def is_other_pointcloud_overlapping(self, ref_pointcloud, other_pointcloud): + """ Check if a viewpoint is valid and overlaps significantly with a reference one. """ + # Observation + pixels_count = self.resolution[0] * self.resolution[1] + valid_fraction = len(other_pointcloud) / pixels_count + assert valid_fraction <= 1.0 and valid_fraction >= 0.0 + overlap = compute_pointcloud_overlaps_scikit(ref_pointcloud, other_pointcloud, self.distance_threshold, compute_symmetric=True) + covisibility = min(overlap["intersection1"] / pixels_count, overlap["intersection2"] / pixels_count) + is_valid = (valid_fraction >= self.minimum_valid_fraction) and (covisibility >= self.minimum_covisibility) + return is_valid, valid_fraction, covisibility + + def is_other_viewpoint_overlapping(self, ref_pointcloud, observation, position, rotation): + """ Check if a viewpoint is valid and overlaps significantly with a reference one. """ + # Observation + other_pointcloud = compute_pointcloud(observation['depth'], self.hfov, position, rotation) + return self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud) + + def render_viewpoint(self, viewpoint_position, viewpoint_orientation): + agent_state = habitat_sim.AgentState() + agent_state.position = viewpoint_position + agent_state.rotation = viewpoint_orientation + self.agent.set_state(agent_state) + viewpoint_observations = self.sim.get_sensor_observations(agent_ids=0) + _append_camera_parameters(viewpoint_observations, self.hfov, viewpoint_position, viewpoint_orientation) + return viewpoint_observations + + def __getitem__(self, useless_idx): + ref_position, ref_orientation, nav_point = self.sample_random_viewpoint() + ref_observations = self.render_viewpoint(ref_position, ref_orientation) + # Extract point cloud + ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov, + camera_position=ref_position, camera_rotation=ref_orientation) + + pixels_count = self.resolution[0] * self.resolution[1] + ref_valid_fraction = len(ref_pointcloud) / pixels_count + assert ref_valid_fraction <= 1.0 and ref_valid_fraction >= 0.0 + if ref_valid_fraction < self.minimum_valid_fraction: + # This should produce a recursion error at some point when something is very wrong. 
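+            # (resample a fresh reference viewpoint from scratch; repeated failures eventually raise RecursionError)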
+ return self[0] + # Pick an reference observed point in the point cloud + observed_point = np.mean(ref_pointcloud, axis=0) + + # Add the first image as reference + viewpoints_observations = [ref_observations] + viewpoints_covisibility = [ref_valid_fraction] + viewpoints_positions = [ref_position] + viewpoints_orientations = [quaternion.as_float_array(ref_orientation)] + viewpoints_clouds = [ref_pointcloud] + viewpoints_valid_fractions = [ref_valid_fraction] + + for _ in range(self.views_count - 1): + # Generate an other viewpoint using some dummy random walk + successful_sampling = False + for sampling_attempt in range(self.max_attempts_count): + position, rotation, _ = self.sample_other_random_viewpoint(observed_point, nav_point) + # Observation + other_viewpoint_observations = self.render_viewpoint(position, rotation) + other_pointcloud = compute_pointcloud(other_viewpoint_observations['depth'], self.hfov, position, rotation) + + is_valid, valid_fraction, covisibility = self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud) + if is_valid: + successful_sampling = True + break + if not successful_sampling: + print("WARNING: Maximum number of attempts reached.") + # Dirty hack, try using a novel original viewpoint + return self[0] + viewpoints_observations.append(other_viewpoint_observations) + viewpoints_covisibility.append(covisibility) + viewpoints_positions.append(position) + viewpoints_orientations.append(quaternion.as_float_array(rotation)) # WXYZ convention for the quaternion encoding. + viewpoints_clouds.append(other_pointcloud) + viewpoints_valid_fractions.append(valid_fraction) + + # Estimate relations between all pairs of images + pairwise_visibility_ratios = np.ones((len(viewpoints_observations), len(viewpoints_observations))) + for i in range(len(viewpoints_observations)): + pairwise_visibility_ratios[i,i] = viewpoints_valid_fractions[i] + for j in range(i+1, len(viewpoints_observations)): + overlap = compute_pointcloud_overlaps_scikit(viewpoints_clouds[i], viewpoints_clouds[j], self.distance_threshold, compute_symmetric=True) + pairwise_visibility_ratios[i,j] = overlap['intersection1'] / pixels_count + pairwise_visibility_ratios[j,i] = overlap['intersection2'] / pixels_count + + # IoU is relative to the image 0 + data = {"observations": viewpoints_observations, + "positions": np.asarray(viewpoints_positions), + "orientations": np.asarray(viewpoints_orientations), + "covisibility_ratios": np.asarray(viewpoints_covisibility), + "valid_fractions": np.asarray(viewpoints_valid_fractions, dtype=float), + "pairwise_visibility_ratios": np.asarray(pairwise_visibility_ratios, dtype=float), + } + + if self.transform is not None: + data = self.transform(data) + return data + + def generate_random_spiral_trajectory(self, images_count = 100, max_radius=0.5, half_turns=5, use_constant_orientation=False): + """ + Return a list of images corresponding to a spiral trajectory from a random starting point. + Useful to generate nice visualisations. 
+ Use an even number of half turns to get a nice "C1-continuous" loop effect + """ + ref_position, ref_orientation, navpoint = self.sample_random_viewpoint() + ref_observations = self.render_viewpoint(ref_position, ref_orientation) + ref_pointcloud = compute_pointcloud(depthmap=ref_observations['depth'], hfov=self.hfov, + camera_position=ref_position, camera_rotation=ref_orientation) + pixels_count = self.resolution[0] * self.resolution[1] + if len(ref_pointcloud) / pixels_count < self.minimum_valid_fraction: + # Dirty hack: ensure that the valid part of the image is significant + return self.generate_random_spiral_trajectory(images_count, max_radius, half_turns, use_constant_orientation) + + # Pick an observed point in the point cloud + observed_point = np.mean(ref_pointcloud, axis=0) + ref_R, ref_t = compute_camera_pose_opencv_convention(ref_position, ref_orientation) + + images = [] + is_valid = [] + # Spiral trajectory, use_constant orientation + for i, alpha in enumerate(np.linspace(0, 1, images_count)): + r = max_radius * np.abs(np.sin(alpha * np.pi)) # Increase then decrease the radius + theta = alpha * half_turns * np.pi + x = r * np.cos(theta) + y = r * np.sin(theta) + z = 0.0 + position = ref_position + (ref_R @ np.asarray([x, y, z]).reshape(3,1)).flatten() + if use_constant_orientation: + orientation = ref_orientation + else: + # trajectory looking at a mean point in front of the ref observation + orientation, position = look_at_for_habitat(eye=position, center=observed_point, up=habitat_sim.geo.UP) + observations = self.render_viewpoint(position, orientation) + images.append(observations['color'][...,:3]) + _is_valid, valid_fraction, iou = self.is_other_viewpoint_overlapping(ref_pointcloud, observations, position, orientation) + is_valid.append(_is_valid) + return images, np.all(is_valid) \ No newline at end of file diff --git a/dust3r/croco/datasets/habitat_sim/pack_metadata_files.py b/dust3r/croco/datasets/habitat_sim/pack_metadata_files.py new file mode 100644 index 0000000000000000000000000000000000000000..10672a01f7dd615d3b4df37781f7f6f97e753ba6 --- /dev/null +++ b/dust3r/croco/datasets/habitat_sim/pack_metadata_files.py @@ -0,0 +1,69 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +""" +Utility script to pack metadata files of the dataset in order to be able to re-generate it elsewhere. +""" +import os +import glob +from tqdm import tqdm +import shutil +import json +from datasets.habitat_sim.paths import * +import argparse +import collections + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("input_dir") + parser.add_argument("output_dir") + args = parser.parse_args() + + input_dirname = args.input_dir + output_dirname = args.output_dir + + input_metadata_filenames = glob.iglob(f"{input_dirname}/**/metadata.json", recursive=True) + + images_count = collections.defaultdict(lambda : 0) + + os.makedirs(output_dirname) + for input_filename in tqdm(input_metadata_filenames): + # Ignore empty files + with open(input_filename, "r") as f: + original_metadata = json.load(f) + if "multiviews" not in original_metadata or len(original_metadata["multiviews"]) == 0: + print("No views in", input_filename) + continue + + relpath = os.path.relpath(input_filename, input_dirname) + print(relpath) + + # Copy metadata, while replacing scene paths by generic keys depending on the dataset, for portability. 
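+        # e.g. "./data/habitat-sim-data/scene_datasets/hm3d/train/xxx.glb" becomes "hm3d/train/xxx.glb" (illustrative path)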
+ # Data paths are sorted by decreasing length to avoid potential bugs due to paths starting by the same string pattern. + scenes_dataset_paths = dict(sorted(SCENES_DATASET.items(), key=lambda x: len(x[1]), reverse=True)) + metadata = dict() + for key, value in original_metadata.items(): + if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "": + known_path = False + for dataset, dataset_path in scenes_dataset_paths.items(): + if value.startswith(dataset_path): + value = os.path.join(dataset, os.path.relpath(value, dataset_path)) + known_path = True + break + if not known_path: + raise KeyError("Unknown path:" + value) + metadata[key] = value + + # Compile some general statistics while packing data + scene_split = metadata["scene"].split("/") + upper_level = "/".join(scene_split[:2]) if scene_split[0] == "hm3d" else scene_split[0] + images_count[upper_level] += len(metadata["multiviews"]) + + output_filename = os.path.join(output_dirname, relpath) + os.makedirs(os.path.dirname(output_filename), exist_ok=True) + with open(output_filename, "w") as f: + json.dump(metadata, f) + + # Print statistics + print("Images count:") + for upper_level, count in images_count.items(): + print(f"- {upper_level}: {count}") \ No newline at end of file diff --git a/dust3r/croco/datasets/habitat_sim/paths.py b/dust3r/croco/datasets/habitat_sim/paths.py new file mode 100644 index 0000000000000000000000000000000000000000..4d63b5fa29c274ddfeae084734a35ba66d7edee8 --- /dev/null +++ b/dust3r/croco/datasets/habitat_sim/paths.py @@ -0,0 +1,129 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +""" +Paths to Habitat-Sim scenes +""" + +import os +import json +import collections +from tqdm import tqdm + + +# Hardcoded path to the different scene datasets +SCENES_DATASET = { + "hm3d": "./data/habitat-sim-data/scene_datasets/hm3d/", + "gibson": "./data/habitat-sim-data/scene_datasets/gibson/", + "habitat-test-scenes": "./data/habitat-sim/scene_datasets/habitat-test-scenes/", + "replica_cad_baked_lighting": "./data/habitat-sim/scene_datasets/replica_cad_baked_lighting/", + "replica_cad": "./data/habitat-sim/scene_datasets/replica_cad/", + "replica": "./data/habitat-sim/scene_datasets/ReplicaDataset/", + "scannet": "./data/habitat-sim/scene_datasets/scannet/" +} + +SceneData = collections.namedtuple("SceneData", ["scene_dataset_config_file", "scene", "navmesh", "output_dir"]) + +def list_replicacad_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad"]): + scene_dataset_config_file = os.path.join(base_path, "replicaCAD.scene_dataset_config.json") + scenes = [f"apt_{i}" for i in range(6)] + ["empty_stage"] + navmeshes = [f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"] + scenes_data = [] + for idx in range(len(scenes)): + output_dir = os.path.join(base_output_dir, "ReplicaCAD", scenes[idx]) + # Add scene + data = SceneData(scene_dataset_config_file=scene_dataset_config_file, + scene = scenes[idx] + ".scene_instance.json", + navmesh = os.path.join(base_path, navmeshes[idx]), + output_dir = output_dir) + scenes_data.append(data) + return scenes_data + +def list_replica_cad_baked_lighting_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad_baked_lighting"]): + scene_dataset_config_file = os.path.join(base_path, "replicaCAD_baked.scene_dataset_config.json") + scenes = sum([[f"Baked_sc{i}_staging_{j:02}" for i in range(5)] for j in range(21)], []) + navmeshes = 
""#[f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"] + scenes_data = [] + for idx in range(len(scenes)): + output_dir = os.path.join(base_output_dir, "replica_cad_baked_lighting", scenes[idx]) + data = SceneData(scene_dataset_config_file=scene_dataset_config_file, + scene = scenes[idx], + navmesh = "", + output_dir = output_dir) + scenes_data.append(data) + return scenes_data + +def list_replica_scenes(base_output_dir, base_path): + scenes_data = [] + for scene_id in os.listdir(base_path): + scene = os.path.join(base_path, scene_id, "mesh.ply") + navmesh = os.path.join(base_path, scene_id, "habitat/mesh_preseg_semantic.navmesh") # Not sure if I should use it + scene_dataset_config_file = "" + output_dir = os.path.join(base_output_dir, scene_id) + # Add scene only if it does not exist already, or if exist_ok + data = SceneData(scene_dataset_config_file = scene_dataset_config_file, + scene = scene, + navmesh = navmesh, + output_dir = output_dir) + scenes_data.append(data) + return scenes_data + + +def list_scenes(base_output_dir, base_path): + """ + Generic method iterating through a base_path folder to find scenes. + """ + scenes_data = [] + for root, dirs, files in os.walk(base_path, followlinks=True): + folder_scenes_data = [] + for file in files: + name, ext = os.path.splitext(file) + if ext == ".glb": + scene = os.path.join(root, name + ".glb") + navmesh = os.path.join(root, name + ".navmesh") + if not os.path.exists(navmesh): + navmesh = "" + relpath = os.path.relpath(root, base_path) + output_dir = os.path.abspath(os.path.join(base_output_dir, relpath, name)) + data = SceneData(scene_dataset_config_file="", + scene = scene, + navmesh = navmesh, + output_dir = output_dir) + folder_scenes_data.append(data) + + # Specific check for HM3D: + # When two meshesxxxx.basis.glb and xxxx.glb are present, use the 'basis' version. 
+ basis_scenes = [data.scene[:-len(".basis.glb")] for data in folder_scenes_data if data.scene.endswith(".basis.glb")] + if len(basis_scenes) != 0: + folder_scenes_data = [data for data in folder_scenes_data if not (data.scene[:-len(".glb")] in basis_scenes)] + + scenes_data.extend(folder_scenes_data) + return scenes_data + +def list_scenes_available(base_output_dir, scenes_dataset_paths=SCENES_DATASET): + scenes_data = [] + + # HM3D + for split in ("minival", "train", "val", "examples"): + scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, f"hm3d/{split}/"), + base_path=f"{scenes_dataset_paths['hm3d']}/{split}") + + # Gibson + scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "gibson"), + base_path=scenes_dataset_paths["gibson"]) + + # Habitat test scenes (just a few) + scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "habitat-test-scenes"), + base_path=scenes_dataset_paths["habitat-test-scenes"]) + + # ReplicaCAD (baked lightning) + scenes_data += list_replica_cad_baked_lighting_scenes(base_output_dir=base_output_dir) + + # ScanNet + scenes_data += list_scenes(base_output_dir=os.path.join(base_output_dir, "scannet"), + base_path=scenes_dataset_paths["scannet"]) + + # Replica + list_replica_scenes(base_output_dir=os.path.join(base_output_dir, "replica"), + base_path=scenes_dataset_paths["replica"]) + return scenes_data diff --git a/dust3r/croco/datasets/pairs_dataset.py b/dust3r/croco/datasets/pairs_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..9f107526b34e154d9013a9a7a0bde3d5ff6f581c --- /dev/null +++ b/dust3r/croco/datasets/pairs_dataset.py @@ -0,0 +1,109 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
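+
+# Usage sketch (illustrative only -- it assumes the pair listings/caches have already
+# been generated under ./data/ as described in the dataset instructions):
+#
+#   # one-off caching of the habitat_release pairs list:
+#   #   python datasets/pairs_dataset.py --dataset habitat_release --data_dir ./data/
+#   dataset = PairsDataset("habitat_release+ARKitScenes", trfs="crop224+acolor")
+#   im1, im2 = dataset[0]   # a pair of augmented, ImageNet-normalized tensors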
+ +import os +from torch.utils.data import Dataset +from PIL import Image + +from datasets.transforms import get_pair_transforms + +def load_image(impath): + return Image.open(impath) + +def load_pairs_from_cache_file(fname, root=''): + assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname) + with open(fname, 'r') as fid: + lines = fid.read().strip().splitlines() + pairs = [ (os.path.join(root,l.split()[0]), os.path.join(root,l.split()[1])) for l in lines] + return pairs + +def load_pairs_from_list_file(fname, root=''): + assert os.path.isfile(fname), "cannot parse pairs from {:s}, file does not exist".format(fname) + with open(fname, 'r') as fid: + lines = fid.read().strip().splitlines() + pairs = [ (os.path.join(root,l+'_1.jpg'), os.path.join(root,l+'_2.jpg')) for l in lines if not l.startswith('#')] + return pairs + + +def write_cache_file(fname, pairs, root=''): + if len(root)>0: + if not root.endswith('/'): root+='/' + assert os.path.isdir(root) + s = '' + for im1, im2 in pairs: + if len(root)>0: + assert im1.startswith(root), im1 + assert im2.startswith(root), im2 + s += '{:s} {:s}\n'.format(im1[len(root):], im2[len(root):]) + with open(fname, 'w') as fid: + fid.write(s[:-1]) + +def parse_and_cache_all_pairs(dname, data_dir='./data/'): + if dname=='habitat_release': + dirname = os.path.join(data_dir, 'habitat_release') + assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname + cache_file = os.path.join(dirname, 'pairs.txt') + assert not os.path.isfile(cache_file), "cache file already exists: "+cache_file + + print('Parsing pairs for dataset: '+dname) + pairs = [] + for root, dirs, files in os.walk(dirname): + if 'val' in root: continue + dirs.sort() + pairs += [ (os.path.join(root,f), os.path.join(root,f[:-len('_1.jpeg')]+'_2.jpeg')) for f in sorted(files) if f.endswith('_1.jpeg')] + print('Found {:,} pairs'.format(len(pairs))) + print('Writing cache to: '+cache_file) + write_cache_file(cache_file, pairs, root=dirname) + + else: + raise NotImplementedError('Unknown dataset: '+dname) + +def dnames_to_image_pairs(dnames, data_dir='./data/'): + """ + dnames: list of datasets with image pairs, separated by + + """ + all_pairs = [] + for dname in dnames.split('+'): + if dname=='habitat_release': + dirname = os.path.join(data_dir, 'habitat_release') + assert os.path.isdir(dirname), "cannot find folder for habitat_release pairs: "+dirname + cache_file = os.path.join(dirname, 'pairs.txt') + assert os.path.isfile(cache_file), "cannot find cache file for habitat_release pairs, please first create the cache file, see instructions. "+cache_file + pairs = load_pairs_from_cache_file(cache_file, root=dirname) + elif dname in ['ARKitScenes', 'MegaDepth', '3DStreetView', 'IndoorVL']: + dirname = os.path.join(data_dir, dname+'_crops') + assert os.path.isdir(dirname), "cannot find folder for {:s} pairs: {:s}".format(dname, dirname) + list_file = os.path.join(dirname, 'listing.txt') + assert os.path.isfile(list_file), "cannot find list file for {:s} pairs, see instructions. 
{:s}".format(dname, list_file) + pairs = load_pairs_from_list_file(list_file, root=dirname) + print(' {:s}: {:,} pairs'.format(dname, len(pairs))) + all_pairs += pairs + if '+' in dnames: print(' Total: {:,} pairs'.format(len(all_pairs))) + return all_pairs + + +class PairsDataset(Dataset): + + def __init__(self, dnames, trfs='', totensor=True, normalize=True, data_dir='./data/'): + super().__init__() + self.image_pairs = dnames_to_image_pairs(dnames, data_dir=data_dir) + self.transforms = get_pair_transforms(transform_str=trfs, totensor=totensor, normalize=normalize) + + def __len__(self): + return len(self.image_pairs) + + def __getitem__(self, index): + im1path, im2path = self.image_pairs[index] + im1 = load_image(im1path) + im2 = load_image(im2path) + if self.transforms is not None: im1, im2 = self.transforms(im1, im2) + return im1, im2 + + +if __name__=="__main__": + import argparse + parser = argparse.ArgumentParser(prog="Computing and caching list of pairs for a given dataset") + parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored") + parser.add_argument('--dataset', default='habitat_release', type=str, help="name of the dataset") + args = parser.parse_args() + parse_and_cache_all_pairs(dname=args.dataset, data_dir=args.data_dir) diff --git a/dust3r/croco/datasets/transforms.py b/dust3r/croco/datasets/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..216bac61f8254fd50e7f269ee80301f250a2d11e --- /dev/null +++ b/dust3r/croco/datasets/transforms.py @@ -0,0 +1,95 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +import torch +import torchvision.transforms +import torchvision.transforms.functional as F + +# "Pair": apply a transform on a pair +# "Both": apply the exact same transform to both images + +class ComposePair(torchvision.transforms.Compose): + def __call__(self, img1, img2): + for t in self.transforms: + img1, img2 = t(img1, img2) + return img1, img2 + +class NormalizeBoth(torchvision.transforms.Normalize): + def forward(self, img1, img2): + img1 = super().forward(img1) + img2 = super().forward(img2) + return img1, img2 + +class ToTensorBoth(torchvision.transforms.ToTensor): + def __call__(self, img1, img2): + img1 = super().__call__(img1) + img2 = super().__call__(img2) + return img1, img2 + +class RandomCropPair(torchvision.transforms.RandomCrop): + # the crop will be intentionally different for the two images with this class + def forward(self, img1, img2): + img1 = super().forward(img1) + img2 = super().forward(img2) + return img1, img2 + +class ColorJitterPair(torchvision.transforms.ColorJitter): + # can be symmetric (same for both images) or assymetric (different jitter params for each image) depending on assymetric_prob + def __init__(self, assymetric_prob, **kwargs): + super().__init__(**kwargs) + self.assymetric_prob = assymetric_prob + def jitter_one(self, img, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor): + for fn_id in fn_idx: + if fn_id == 0 and brightness_factor is not None: + img = F.adjust_brightness(img, brightness_factor) + elif fn_id == 1 and contrast_factor is not None: + img = F.adjust_contrast(img, contrast_factor) + elif fn_id == 2 and saturation_factor is not None: + img = F.adjust_saturation(img, saturation_factor) + elif fn_id == 3 and hue_factor is not None: + img = F.adjust_hue(img, hue_factor) + return img + + def forward(self, img1, img2): + + fn_idx, 
brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params( + self.brightness, self.contrast, self.saturation, self.hue + ) + img1 = self.jitter_one(img1, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor) + if torch.rand(1) < self.assymetric_prob: # assymetric: + fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor = self.get_params( + self.brightness, self.contrast, self.saturation, self.hue + ) + img2 = self.jitter_one(img2, fn_idx, brightness_factor, contrast_factor, saturation_factor, hue_factor) + return img1, img2 + +def get_pair_transforms(transform_str, totensor=True, normalize=True): + # transform_str is eg crop224+color + trfs = [] + for s in transform_str.split('+'): + if s.startswith('crop'): + size = int(s[len('crop'):]) + trfs.append(RandomCropPair(size)) + elif s=='acolor': + trfs.append(ColorJitterPair(assymetric_prob=1.0, brightness=(0.6, 1.4), contrast=(0.6, 1.4), saturation=(0.6, 1.4), hue=0.0)) + elif s=='': # if transform_str was "" + pass + else: + raise NotImplementedError('Unknown augmentation: '+s) + + if totensor: + trfs.append( ToTensorBoth() ) + if normalize: + trfs.append( NormalizeBoth(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ) + + if len(trfs)==0: + return None + elif len(trfs)==1: + return trfs + else: + return ComposePair(trfs) + + + + + diff --git a/dust3r/croco/demo.py b/dust3r/croco/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..91b80ccc5c98c18e20d1ce782511aa824ef28f77 --- /dev/null +++ b/dust3r/croco/demo.py @@ -0,0 +1,55 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +import torch +from models.croco import CroCoNet +from PIL import Image +import torchvision.transforms +from torchvision.transforms import ToTensor, Normalize, Compose + +def main(): + device = torch.device('cuda:0' if torch.cuda.is_available() and torch.cuda.device_count()>0 else 'cpu') + + # load 224x224 images and transform them to tensor + imagenet_mean = [0.485, 0.456, 0.406] + imagenet_mean_tensor = torch.tensor(imagenet_mean).view(1,3,1,1).to(device, non_blocking=True) + imagenet_std = [0.229, 0.224, 0.225] + imagenet_std_tensor = torch.tensor(imagenet_std).view(1,3,1,1).to(device, non_blocking=True) + trfs = Compose([ToTensor(), Normalize(mean=imagenet_mean, std=imagenet_std)]) + image1 = trfs(Image.open('assets/Chateau1.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0) + image2 = trfs(Image.open('assets/Chateau2.png').convert('RGB')).to(device, non_blocking=True).unsqueeze(0) + + # load model + ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu') + model = CroCoNet( **ckpt.get('croco_kwargs',{})).to(device) + model.eval() + msg = model.load_state_dict(ckpt['model'], strict=True) + + # forward + with torch.inference_mode(): + out, mask, target = model(image1, image2) + + # the output is normalized, thus use the mean/std of the actual image to go back to RGB space + patchified = model.patchify(image1) + mean = patchified.mean(dim=-1, keepdim=True) + var = patchified.var(dim=-1, keepdim=True) + decoded_image = model.unpatchify(out * (var + 1.e-6)**.5 + mean) + # undo imagenet normalization, prepare masked image + decoded_image = decoded_image * imagenet_std_tensor + imagenet_mean_tensor + input_image = image1 * imagenet_std_tensor + imagenet_mean_tensor + ref_image = image2 * imagenet_std_tensor + imagenet_mean_tensor + image_masks = 
model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None]) + masked_input_image = ((1 - image_masks) * input_image) + + # make visualization + visualization = torch.cat((ref_image, masked_input_image, decoded_image, input_image), dim=3) # 4*(B, 3, H, W) -> B, 3, H, W*4 + B, C, H, W = visualization.shape + visualization = visualization.permute(1, 0, 2, 3).reshape(C, B*H, W) + visualization = torchvision.transforms.functional.to_pil_image(torch.clamp(visualization, 0, 1)) + fname = "demo_output.png" + visualization.save(fname) + print('Visualization save in '+fname) + + +if __name__=="__main__": + main() diff --git a/dust3r/croco/interactive_demo.ipynb b/dust3r/croco/interactive_demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..6cfc960af5baac9a69029c29a16eea4e24123a71 --- /dev/null +++ b/dust3r/croco/interactive_demo.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Interactive demo of Cross-view Completion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n", + "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import numpy as np\n", + "from models.croco import CroCoNet\n", + "from ipywidgets import interact, interactive, fixed, interact_manual\n", + "import ipywidgets as widgets\n", + "import matplotlib.pyplot as plt\n", + "import quaternion\n", + "import models.masking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load CroCo model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')\n", + "model = CroCoNet( **ckpt.get('croco_kwargs',{}))\n", + "msg = model.load_state_dict(ckpt['model'], strict=True)\n", + "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n", + "device = torch.device('cuda:0' if use_gpu else 'cpu')\n", + "model = model.eval()\n", + "model = model.to(device=device)\n", + "print(msg)\n", + "\n", + "def process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches=False):\n", + " \"\"\"\n", + " Perform Cross-View completion using two input images, specified using Numpy arrays.\n", + " \"\"\"\n", + " # Replace the mask generator\n", + " model.mask_generator = models.masking.RandomMask(model.patch_embed.num_patches, masking_ratio)\n", + "\n", + " # ImageNet-1k color normalization\n", + " imagenet_mean = torch.as_tensor([0.485, 0.456, 0.406]).reshape(1,3,1,1).to(device)\n", + " imagenet_std = torch.as_tensor([0.229, 0.224, 0.225]).reshape(1,3,1,1).to(device)\n", + "\n", + " normalize_input_colors = True\n", + " is_output_normalized = True\n", + " with torch.no_grad():\n", + " # Cast data to torch\n", + " target_image = (torch.as_tensor(target_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n", + " ref_image = (torch.as_tensor(ref_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n", + "\n", + " if normalize_input_colors:\n", + " ref_image = (ref_image - imagenet_mean) / imagenet_std\n", + " target_image = (target_image - imagenet_mean) / imagenet_std\n", + "\n", + " out, mask, _ = model(target_image, ref_image)\n", + " # # get target\n", + " if 
not is_output_normalized:\n", + " predicted_image = model.unpatchify(out)\n", + " else:\n", + " # The output only contains higher order information,\n", + " # we retrieve mean and standard deviation from the actual target image\n", + " patchified = model.patchify(target_image)\n", + " mean = patchified.mean(dim=-1, keepdim=True)\n", + " var = patchified.var(dim=-1, keepdim=True)\n", + " pred_renorm = out * (var + 1.e-6)**.5 + mean\n", + " predicted_image = model.unpatchify(pred_renorm)\n", + "\n", + " image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])\n", + " masked_target_image = (1 - image_masks) * target_image\n", + " \n", + " if not reconstruct_unmasked_patches:\n", + " # Replace unmasked patches by their actual values\n", + " predicted_image = predicted_image * image_masks + masked_target_image\n", + "\n", + " # Unapply color normalization\n", + " if normalize_input_colors:\n", + " predicted_image = predicted_image * imagenet_std + imagenet_mean\n", + " masked_target_image = masked_target_image * imagenet_std + imagenet_mean\n", + " \n", + " # Cast to Numpy\n", + " masked_target_image = np.asarray(torch.clamp(masked_target_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n", + " predicted_image = np.asarray(torch.clamp(predicted_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n", + " return masked_target_image, predicted_image" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use the Habitat simulator to render images from arbitrary viewpoints (requires habitat_sim to be installed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"MAGNUM_LOG\"]=\"quiet\"\n", + "os.environ[\"HABITAT_SIM_LOG\"]=\"quiet\"\n", + "import habitat_sim\n", + "\n", + "scene = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.glb\"\n", + "navmesh = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh\"\n", + "\n", + "sim_cfg = habitat_sim.SimulatorConfiguration()\n", + "if use_gpu: sim_cfg.gpu_device_id = 0\n", + "sim_cfg.scene_id = scene\n", + "sim_cfg.load_semantic_mesh = False\n", + "rgb_sensor_spec = habitat_sim.CameraSensorSpec()\n", + "rgb_sensor_spec.uuid = \"color\"\n", + "rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n", + "rgb_sensor_spec.resolution = (224,224)\n", + "rgb_sensor_spec.hfov = 56.56\n", + "rgb_sensor_spec.position = [0.0, 0.0, 0.0]\n", + "rgb_sensor_spec.orientation = [0, 0, 0]\n", + "agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec])\n", + "\n", + "\n", + "cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])\n", + "sim = habitat_sim.Simulator(cfg)\n", + "if navmesh is not None:\n", + " sim.pathfinder.load_nav_mesh(navmesh)\n", + "agent = sim.initialize_agent(agent_id=0)\n", + "\n", + "def sample_random_viewpoint():\n", + " \"\"\" Sample a random viewpoint using the navmesh \"\"\"\n", + " nav_point = sim.pathfinder.get_random_navigable_point()\n", + " # Sample a random viewpoint height\n", + " viewpoint_height = np.random.uniform(1.0, 1.6)\n", + " viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP\n", + " viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(-np.pi, np.pi) * habitat_sim.geo.UP)\n", + " return viewpoint_position, viewpoint_orientation\n", + "\n", + "def render_viewpoint(position, orientation):\n", + " 
agent_state = habitat_sim.AgentState()\n", + " agent_state.position = position\n", + " agent_state.rotation = orientation\n", + " agent.set_state(agent_state)\n", + " viewpoint_observations = sim.get_sensor_observations(agent_ids=0)\n", + " image = viewpoint_observations['color'][:,:,:3]\n", + " image = np.asarray(np.clip(1.5 * np.asarray(image, dtype=float), 0, 255), dtype=np.uint8)\n", + " return image" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample a random reference view" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ref_position, ref_orientation = sample_random_viewpoint()\n", + "ref_image = render_viewpoint(ref_position, ref_orientation)\n", + "plt.clf()\n", + "fig, axes = plt.subplots(1,1, squeeze=False, num=1)\n", + "axes[0,0].imshow(ref_image)\n", + "for ax in axes.flatten():\n", + " ax.set_xticks([])\n", + " ax.set_yticks([])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interactive cross-view completion using CroCo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reconstruct_unmasked_patches = False\n", + "\n", + "def show_demo(masking_ratio, x, y, z, panorama, elevation):\n", + " R = quaternion.as_rotation_matrix(ref_orientation)\n", + " target_position = ref_position + x * R[:,0] + y * R[:,1] + z * R[:,2]\n", + " target_orientation = (ref_orientation\n", + " * quaternion.from_rotation_vector(-elevation * np.pi/180 * habitat_sim.geo.LEFT) \n", + " * quaternion.from_rotation_vector(-panorama * np.pi/180 * habitat_sim.geo.UP))\n", + " \n", + " ref_image = render_viewpoint(ref_position, ref_orientation)\n", + " target_image = render_viewpoint(target_position, target_orientation)\n", + "\n", + " masked_target_image, predicted_image = process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches)\n", + "\n", + " fig, axes = plt.subplots(1,4, squeeze=True, dpi=300)\n", + " axes[0].imshow(ref_image)\n", + " axes[0].set_xlabel(\"Reference\")\n", + " axes[1].imshow(masked_target_image)\n", + " axes[1].set_xlabel(\"Masked target\")\n", + " axes[2].imshow(predicted_image)\n", + " axes[2].set_xlabel(\"Reconstruction\") \n", + " axes[3].imshow(target_image)\n", + " axes[3].set_xlabel(\"Target\")\n", + " for ax in axes.flatten():\n", + " ax.set_xticks([])\n", + " ax.set_yticks([])\n", + "\n", + "interact(show_demo,\n", + " masking_ratio=widgets.FloatSlider(description='masking', value=0.9, min=0.0, max=1.0),\n", + " x=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", + " y=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", + " z=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", + " panorama=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5),\n", + " elevation=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5));" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + }, + "vscode": { + "interpreter": { + "hash": "f9237820cd248d7e07cb4fb9f0e4508a85d642f19d831560c0a4b61f3e907e67" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/dust3r/croco/models/blocks.py 
b/dust3r/croco/models/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..18133524f0ae265b0bd8d062d7c9eeaa63858a9b --- /dev/null +++ b/dust3r/croco/models/blocks.py @@ -0,0 +1,241 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + + +# -------------------------------------------------------- +# Main encoder/decoder blocks +# -------------------------------------------------------- +# References: +# timm +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/helpers.py +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/mlp.py +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/patch_embed.py + + +import torch +import torch.nn as nn + +from itertools import repeat +import collections.abc + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return x + return tuple(repeat(x, n)) + return parse +to_2tuple = _ntuple(2) + +def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
+ """ + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob,3):0.3f}' + +class Mlp(nn.Module): + """ MLP as used in Vision Transformer, MLP-Mixer and related networks""" + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = to_2tuple(bias) + drop_probs = to_2tuple(drop) + + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.fc2(x) + x = self.drop2(x) + return x + +class Attention(nn.Module): + + def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.rope = rope + + def forward(self, x, xpos): + B, N, C = x.shape + + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1,3) + q, k, v = [qkv[:,:,i] for i in range(3)] + # q,k,v = qkv.unbind(2) # make torchscript happy (cannot use tensor as tuple) + + if self.rope is not None: + q = self.rope(q, xpos) + k = self.rope(k, xpos) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, rope=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x, xpos): + x = x + self.drop_path(self.attn(self.norm1(x), xpos)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + +class CrossAttention(nn.Module): + + def __init__(self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + self.projq = nn.Linear(dim, dim, bias=qkv_bias) + self.projk = nn.Linear(dim, dim, bias=qkv_bias) + self.projv = nn.Linear(dim, dim, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.rope = rope + + def forward(self, query, key, value, qpos, kpos): + B, Nq, C = query.shape + Nk = key.shape[1] + Nv = value.shape[1] + + q = self.projq(query).reshape(B,Nq,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3) + k = self.projk(key).reshape(B,Nk,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3) + v = self.projv(value).reshape(B,Nv,self.num_heads, C// self.num_heads).permute(0, 2, 1, 3) + + if self.rope is not None: + q = self.rope(q, qpos) + k = self.rope(k, kpos) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, Nq, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + +class DecoderBlock(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, norm_mem=True, rope=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.cross_attn = CrossAttention(dim, rope=rope, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + self.norm3 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + self.norm_y = norm_layer(dim) if norm_mem else nn.Identity() + + def forward(self, x, y, xpos, ypos): + x = x + self.drop_path(self.attn(self.norm1(x), xpos)) + y_ = self.norm_y(y) + x = x + self.drop_path(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos)) + x = x + self.drop_path(self.mlp(self.norm3(x))) + return x, y + + +# patch embedding +class PositionGetter(object): + """ return positions of patches """ + + def __init__(self): + self.cache_positions = {} + + def __call__(self, b, h, w, device): + if not (h,w) in self.cache_positions: + x = torch.arange(w, device=device) + y = torch.arange(h, device=device) + self.cache_positions[h,w] = torch.cartesian_prod(y, x) # (h, w, 2) + pos = self.cache_positions[h,w].view(1, h*w, 2).expand(b, -1, 2).clone() + return pos + +class PatchEmbed(nn.Module): + """ just adding _init_weights + position getter compared to timm.models.layers.patch_embed.PatchEmbed""" + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + self.position_getter = PositionGetter() + + def forward(self, x): + B, C, H, W = x.shape + torch._assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).") + torch._assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).") + x = self.proj(x) + pos = self.position_getter(B, x.size(2), x.size(3), x.device) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x, pos + + def _init_weights(self): + w = self.proj.weight.data + torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) + diff --git a/dust3r/croco/models/criterion.py b/dust3r/croco/models/criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..11696c40865344490f23796ea45e8fbd5e654731 --- /dev/null +++ b/dust3r/croco/models/criterion.py @@ -0,0 +1,37 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Criterion to train CroCo +# -------------------------------------------------------- +# References: +# MAE: https://github.com/facebookresearch/mae +# -------------------------------------------------------- + +import torch + +class MaskedMSE(torch.nn.Module): + + def __init__(self, norm_pix_loss=False, masked=True): + """ + norm_pix_loss: normalize each patch by their pixel mean and variance + masked: compute loss over the masked patches only + """ + super().__init__() + self.norm_pix_loss = norm_pix_loss + self.masked = masked + + def forward(self, pred, mask, target): + + if self.norm_pix_loss: + mean = target.mean(dim=-1, keepdim=True) + var = target.var(dim=-1, keepdim=True) + target = (target - mean) / (var + 1.e-6)**.5 + + loss = (pred - target) ** 2 + loss = loss.mean(dim=-1) # [N, L], mean loss per patch + if self.masked: + loss = (loss * mask).sum() / mask.sum() # mean loss on masked patches + else: + loss = loss.mean() # mean loss + return loss diff --git a/dust3r/croco/models/croco.py b/dust3r/croco/models/croco.py new file mode 100644 index 0000000000000000000000000000000000000000..a4f6fe7782e1994db89d15d3c0763e3fbc543a1a --- /dev/null +++ b/dust3r/croco/models/croco.py @@ -0,0 +1,249 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + + +# -------------------------------------------------------- +# CroCo model during pretraining +# -------------------------------------------------------- + + + +import torch +import torch.nn as nn +torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12 +from functools import partial + +from models.blocks import Block, DecoderBlock, PatchEmbed +from models.pos_embed import get_2d_sincos_pos_embed, RoPE2D +from models.masking import RandomMask + + +class CroCoNet(nn.Module): + + def __init__(self, + img_size=224, # input image size + patch_size=16, # patch_size + mask_ratio=0.9, # ratios of masked tokens + enc_embed_dim=768, # encoder feature dimension + enc_depth=12, # encoder depth + enc_num_heads=12, # encoder number of heads in the transformer block + dec_embed_dim=512, # decoder feature dimension + dec_depth=8, # decoder depth + dec_num_heads=16, # decoder number of heads in the transformer block + mlp_ratio=4, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + norm_im2_in_dec=True, # whether to apply normalization of the 'memory' = (second image) in the decoder + pos_embed='cosine', # positional embedding (either cosine or RoPE100) + ): + + super(CroCoNet, self).__init__() + + # patch embeddings (with initialization done as in MAE) + self._set_patch_embed(img_size, patch_size, enc_embed_dim) + + # mask generations + self._set_mask_generator(self.patch_embed.num_patches, mask_ratio) + + self.pos_embed = pos_embed + if pos_embed=='cosine': + # positional embedding of the encoder + enc_pos_embed = get_2d_sincos_pos_embed(enc_embed_dim, self.patch_embed.grid_size, n_cls_token=0) + self.register_buffer('enc_pos_embed', torch.from_numpy(enc_pos_embed).float()) + # positional embedding of the decoder + dec_pos_embed = get_2d_sincos_pos_embed(dec_embed_dim, self.patch_embed.grid_size, n_cls_token=0) + self.register_buffer('dec_pos_embed', torch.from_numpy(dec_pos_embed).float()) + # pos embedding in each block + self.rope = None # nothing for cosine + elif pos_embed.startswith('RoPE'): # eg RoPE100 + self.enc_pos_embed = None # nothing to add in the encoder with RoPE + 
self.dec_pos_embed = None # nothing to add in the decoder with RoPE + if RoPE2D is None: raise ImportError("Cannot find cuRoPE2D, please install it following the README instructions") + freq = float(pos_embed[len('RoPE'):]) + self.rope = RoPE2D(freq=freq) + else: + raise NotImplementedError('Unknown pos_embed '+pos_embed) + + # transformer for the encoder + self.enc_depth = enc_depth + self.enc_embed_dim = enc_embed_dim + self.enc_blocks = nn.ModuleList([ + Block(enc_embed_dim, enc_num_heads, mlp_ratio, qkv_bias=True, norm_layer=norm_layer, rope=self.rope) + for i in range(enc_depth)]) + self.enc_norm = norm_layer(enc_embed_dim) + + # masked tokens + self._set_mask_token(dec_embed_dim) + + # decoder + self._set_decoder(enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec) + + # prediction head + self._set_prediction_head(dec_embed_dim, patch_size) + + # initializer weights + self.initialize_weights() + + def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768): + self.patch_embed = PatchEmbed(img_size, patch_size, 3, enc_embed_dim) + + def _set_mask_generator(self, num_patches, mask_ratio): + self.mask_generator = RandomMask(num_patches, mask_ratio) + + def _set_mask_token(self, dec_embed_dim): + self.mask_token = nn.Parameter(torch.zeros(1, 1, dec_embed_dim)) + + def _set_decoder(self, enc_embed_dim, dec_embed_dim, dec_num_heads, dec_depth, mlp_ratio, norm_layer, norm_im2_in_dec): + self.dec_depth = dec_depth + self.dec_embed_dim = dec_embed_dim + # transfer from encoder to decoder + self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True) + # transformer for the decoder + self.dec_blocks = nn.ModuleList([ + DecoderBlock(dec_embed_dim, dec_num_heads, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer, norm_mem=norm_im2_in_dec, rope=self.rope) + for i in range(dec_depth)]) + # final norm layer + self.dec_norm = norm_layer(dec_embed_dim) + + def _set_prediction_head(self, dec_embed_dim, patch_size): + self.prediction_head = nn.Linear(dec_embed_dim, patch_size**2 * 3, bias=True) + + + def initialize_weights(self): + # patch embed + self.patch_embed._init_weights() + # mask tokens + if self.mask_token is not None: torch.nn.init.normal_(self.mask_token, std=.02) + # linears and layer norms + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + # we use xavier_uniform following official JAX ViT: + torch.nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def _encode_image(self, image, do_mask=False, return_all_blocks=False): + """ + image has B x 3 x img_size x img_size + do_mask: whether to perform masking or not + return_all_blocks: if True, return the features at the end of every block + instead of just the features from the last block (eg for some prediction heads) + """ + # embed the image into patches (x has size B x Npatches x C) + # and get position if each return patch (pos has size B x Npatches x 2) + x, pos = self.patch_embed(image) + # add positional embedding without cls token + if self.enc_pos_embed is not None: + x = x + self.enc_pos_embed[None,...] 
+ # apply masking + B,N,C = x.size() + if do_mask: + masks = self.mask_generator(x) + x = x[~masks].view(B, -1, C) + posvis = pos[~masks].view(B, -1, 2) + else: + B,N,C = x.size() + masks = torch.zeros((B,N), dtype=bool) + posvis = pos + # now apply the transformer encoder and normalization + if return_all_blocks: + out = [] + for blk in self.enc_blocks: + x = blk(x, posvis) + out.append(x) + out[-1] = self.enc_norm(out[-1]) + return out, pos, masks + else: + for blk in self.enc_blocks: + x = blk(x, posvis) + x = self.enc_norm(x) + return x, pos, masks + + def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False): + """ + return_all_blocks: if True, return the features at the end of every block + instead of just the features from the last block (eg for some prediction heads) + + masks1 can be None => assume image1 fully visible + """ + # encoder to decoder layer + visf1 = self.decoder_embed(feat1) + f2 = self.decoder_embed(feat2) + # append masked tokens to the sequence + B,Nenc,C = visf1.size() + if masks1 is None: # downstreams + f1_ = visf1 + else: # pretraining + Ntotal = masks1.size(1) + f1_ = self.mask_token.repeat(B, Ntotal, 1).to(dtype=visf1.dtype) + f1_[~masks1] = visf1.view(B * Nenc, C) + # add positional embedding + if self.dec_pos_embed is not None: + f1_ = f1_ + self.dec_pos_embed + f2 = f2 + self.dec_pos_embed + # apply Transformer blocks + out = f1_ + out2 = f2 + if return_all_blocks: + _out, out = out, [] + for blk in self.dec_blocks: + _out, out2 = blk(_out, out2, pos1, pos2) + out.append(_out) + out[-1] = self.dec_norm(out[-1]) + else: + for blk in self.dec_blocks: + out, out2 = blk(out, out2, pos1, pos2) + out = self.dec_norm(out) + return out + + def patchify(self, imgs): + """ + imgs: (B, 3, H, W) + x: (B, L, patch_size**2 *3) + """ + p = self.patch_embed.patch_size[0] + assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0 + + h = w = imgs.shape[2] // p + x = imgs.reshape(shape=(imgs.shape[0], 3, h, p, w, p)) + x = torch.einsum('nchpwq->nhwpqc', x) + x = x.reshape(shape=(imgs.shape[0], h * w, p**2 * 3)) + + return x + + def unpatchify(self, x, channels=3): + """ + x: (N, L, patch_size**2 *channels) + imgs: (N, 3, H, W) + """ + patch_size = self.patch_embed.patch_size[0] + h = w = int(x.shape[1]**.5) + assert h * w == x.shape[1] + x = x.reshape(shape=(x.shape[0], h, w, patch_size, patch_size, channels)) + x = torch.einsum('nhwpqc->nchpwq', x) + imgs = x.reshape(shape=(x.shape[0], channels, h * patch_size, h * patch_size)) + return imgs + + def forward(self, img1, img2): + """ + img1: tensor of size B x 3 x img_size x img_size + img2: tensor of size B x 3 x img_size x img_size + + out will be B x N x (3*patch_size*patch_size) + masks are also returned as B x N just in case + """ + # encoder of the masked first image + feat1, pos1, mask1 = self._encode_image(img1, do_mask=True) + # encoder of the second image + feat2, pos2, _ = self._encode_image(img2, do_mask=False) + # decoder + decfeat = self._decoder(feat1, pos1, mask1, feat2, pos2) + # prediction head + out = self.prediction_head(decfeat) + # get target + target = self.patchify(img1) + return out, mask1, target diff --git a/dust3r/croco/models/croco_downstream.py b/dust3r/croco/models/croco_downstream.py new file mode 100644 index 0000000000000000000000000000000000000000..159dfff4d2c1461bc235e21441b57ce1e2088f76 --- /dev/null +++ b/dust3r/croco/models/croco_downstream.py @@ -0,0 +1,122 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. 
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# CroCo model for downstream tasks +# -------------------------------------------------------- + +import torch + +from .croco import CroCoNet + + +def croco_args_from_ckpt(ckpt): + if 'croco_kwargs' in ckpt: # CroCo v2 released models + return ckpt['croco_kwargs'] + elif 'args' in ckpt and hasattr(ckpt['args'], 'model'): # pretrained using the official code release + s = ckpt['args'].model # eg "CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)" + assert s.startswith('CroCoNet(') + return eval('dict'+s[len('CroCoNet'):]) # transform it into the string of a dictionary and evaluate it + else: # CroCo v1 released models + return dict() + +class CroCoDownstreamMonocularEncoder(CroCoNet): + + def __init__(self, + head, + **kwargs): + """ Build network for monocular downstream task, only using the encoder. + It takes an extra argument head, that is called with the features + and a dictionary img_info containing 'width' and 'height' keys + The head is setup with the croconet arguments in this init function + NOTE: It works by *calling super().__init__() but with redefined setters + + """ + super(CroCoDownstreamMonocularEncoder, self).__init__(**kwargs) + head.setup(self) + self.head = head + + def _set_mask_generator(self, *args, **kwargs): + """ No mask generator """ + return + + def _set_mask_token(self, *args, **kwargs): + """ No mask token """ + self.mask_token = None + return + + def _set_decoder(self, *args, **kwargs): + """ No decoder """ + return + + def _set_prediction_head(self, *args, **kwargs): + """ No 'prediction head' for downstream tasks.""" + return + + def forward(self, img): + """ + img if of size batch_size x 3 x h x w + """ + B, C, H, W = img.size() + img_info = {'height': H, 'width': W} + need_all_layers = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks + out, _, _ = self._encode_image(img, do_mask=False, return_all_blocks=need_all_layers) + return self.head(out, img_info) + + +class CroCoDownstreamBinocular(CroCoNet): + + def __init__(self, + head, + **kwargs): + """ Build network for binocular downstream task + It takes an extra argument head, that is called with the features + and a dictionary img_info containing 'width' and 'height' keys + The head is setup with the croconet arguments in this init function + """ + super(CroCoDownstreamBinocular, self).__init__(**kwargs) + head.setup(self) + self.head = head + + def _set_mask_generator(self, *args, **kwargs): + """ No mask generator """ + return + + def _set_mask_token(self, *args, **kwargs): + """ No mask token """ + self.mask_token = None + return + + def _set_prediction_head(self, *args, **kwargs): + """ No prediction head for downstream tasks, define your own head """ + return + + def encode_image_pairs(self, img1, img2, return_all_blocks=False): + """ run encoder for a pair of images + it is actually ~5% faster to concatenate the images along the batch dimension + than to encode them separately + """ + ## the two commented lines below is the naive version with separate encoding + #out, pos, _ = self._encode_image(img1, do_mask=False, return_all_blocks=return_all_blocks) + #out2, pos2, _ = self._encode_image(img2, do_mask=False, return_all_blocks=False) + ## and now the faster version + out, pos, _ = self._encode_image( torch.cat( (img1,img2), dim=0), do_mask=False, return_all_blocks=return_all_blocks ) + if return_all_blocks: + out,out2 = list(map(list, 
zip(*[o.chunk(2, dim=0) for o in out]))) + out2 = out2[-1] + else: + out,out2 = out.chunk(2, dim=0) + pos,pos2 = pos.chunk(2, dim=0) + return out, out2, pos, pos2 + + def forward(self, img1, img2): + B, C, H, W = img1.size() + img_info = {'height': H, 'width': W} + return_all_blocks = hasattr(self.head, 'return_all_blocks') and self.head.return_all_blocks + out, out2, pos, pos2 = self.encode_image_pairs(img1, img2, return_all_blocks=return_all_blocks) + if return_all_blocks: + decout = self._decoder(out[-1], pos, None, out2, pos2, return_all_blocks=return_all_blocks) + decout = out+decout + else: + decout = self._decoder(out, pos, None, out2, pos2, return_all_blocks=return_all_blocks) + return self.head(decout, img_info) \ No newline at end of file diff --git a/dust3r/croco/models/curope/__init__.py b/dust3r/croco/models/curope/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..25e3d48a162760260826080f6366838e83e26878 --- /dev/null +++ b/dust3r/croco/models/curope/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +from .curope2d import cuRoPE2D diff --git a/dust3r/croco/models/curope/curope.cpp b/dust3r/croco/models/curope/curope.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8fe9058e05aa1bf3f37b0d970edc7312bc68455b --- /dev/null +++ b/dust3r/croco/models/curope/curope.cpp @@ -0,0 +1,69 @@ +/* + Copyright (C) 2022-present Naver Corporation. All rights reserved. + Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +*/ + +#include + +// forward declaration +void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd ); + +void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd ) +{ + const int B = tokens.size(0); + const int N = tokens.size(1); + const int H = tokens.size(2); + const int D = tokens.size(3) / 4; + + auto tok = tokens.accessor(); + auto pos = positions.accessor(); + + for (int b = 0; b < B; b++) { + for (int x = 0; x < 2; x++) { // y and then x (2d) + for (int n = 0; n < N; n++) { + + // grab the token position + const int p = pos[b][n][x]; + + for (int h = 0; h < H; h++) { + for (int d = 0; d < D; d++) { + // grab the two values + float u = tok[b][n][h][d+0+x*2*D]; + float v = tok[b][n][h][d+D+x*2*D]; + + // grab the cos,sin + const float inv_freq = fwd * p / powf(base, d/float(D)); + float c = cosf(inv_freq); + float s = sinf(inv_freq); + + // write the result + tok[b][n][h][d+0+x*2*D] = u*c - v*s; + tok[b][n][h][d+D+x*2*D] = v*c + u*s; + } + } + } + } + } +} + +void rope_2d( torch::Tensor tokens, // B,N,H,D + const torch::Tensor positions, // B,N,2 + const float base, + const float fwd ) +{ + TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions"); + TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions"); + TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions"); + TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions"); + TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2"); + TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" ); + + if (tokens.is_cuda()) + rope_2d_cuda( tokens, positions, base, fwd ); + else + rope_2d_cpu( tokens, positions, base, fwd ); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("rope_2d", &rope_2d, 
"RoPE 2d forward/backward"); +} diff --git a/dust3r/croco/models/curope/curope2d.py b/dust3r/croco/models/curope/curope2d.py new file mode 100644 index 0000000000000000000000000000000000000000..a49c12f8c529e9a889b5ac20c5767158f238e17d --- /dev/null +++ b/dust3r/croco/models/curope/curope2d.py @@ -0,0 +1,40 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +import torch + +try: + import curope as _kernels # run `python setup.py install` +except ModuleNotFoundError: + from . import curope as _kernels # run `python setup.py build_ext --inplace` + + +class cuRoPE2D_func (torch.autograd.Function): + + @staticmethod + def forward(ctx, tokens, positions, base, F0=1): + ctx.save_for_backward(positions) + ctx.saved_base = base + ctx.saved_F0 = F0 + # tokens = tokens.clone() # uncomment this if inplace doesn't work + _kernels.rope_2d( tokens, positions, base, F0 ) + ctx.mark_dirty(tokens) + return tokens + + @staticmethod + def backward(ctx, grad_res): + positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0 + _kernels.rope_2d( grad_res, positions, base, -F0 ) + ctx.mark_dirty(grad_res) + return grad_res, None, None, None + + +class cuRoPE2D(torch.nn.Module): + def __init__(self, freq=100.0, F0=1.0): + super().__init__() + self.base = freq + self.F0 = F0 + + def forward(self, tokens, positions): + cuRoPE2D_func.apply( tokens.transpose(1,2), positions, self.base, self.F0 ) + return tokens \ No newline at end of file diff --git a/dust3r/croco/models/curope/kernels.cu b/dust3r/croco/models/curope/kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..fd434f51b66c5d64b7b5258a9aad70ad7003a15d --- /dev/null +++ b/dust3r/croco/models/curope/kernels.cu @@ -0,0 +1,108 @@ +/* + Copyright (C) 2022-present Naver Corporation. All rights reserved. + Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +*/ + +#include +#include +#include +#include + +#define CHECK_CUDA(tensor) {\ + TORCH_CHECK((tensor).is_cuda(), #tensor " is not in cuda memory"); \ + TORCH_CHECK((tensor).is_contiguous(), #tensor " is not contiguous"); } +void CHECK_KERNEL() {auto error = cudaGetLastError(); TORCH_CHECK( error == cudaSuccess, cudaGetErrorString(error));} + + +template < typename scalar_t > +__global__ void rope_2d_cuda_kernel( + //scalar_t* __restrict__ tokens, + torch::PackedTensorAccessor32 tokens, + const int64_t* __restrict__ pos, + const float base, + const float fwd ) + // const int N, const int H, const int D ) +{ + // tokens shape = (B, N, H, D) + const int N = tokens.size(1); + const int H = tokens.size(2); + const int D = tokens.size(3); + + // each block update a single token, for all heads + // each thread takes care of a single output + extern __shared__ float shared[]; + float* shared_inv_freq = shared + D; + + const int b = blockIdx.x / N; + const int n = blockIdx.x % N; + + const int Q = D / 4; + // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D] + // u_Y v_Y u_X v_X + + // shared memory: first, compute inv_freq + if (threadIdx.x < Q) + shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q)); + __syncthreads(); + + // start of X or Y part + const int X = threadIdx.x < D/2 ? 
0 : 1; + const int m = (X*D/2) + (threadIdx.x % Q); // index of u_Y or u_X + + // grab the cos,sin appropriate for me + const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q]; + const float cos = cosf(freq); + const float sin = sinf(freq); + /* + float* shared_cos_sin = shared + D + D/4; + if ((threadIdx.x % (D/2)) < Q) + shared_cos_sin[m+0] = cosf(freq); + else + shared_cos_sin[m+Q] = sinf(freq); + __syncthreads(); + const float cos = shared_cos_sin[m+0]; + const float sin = shared_cos_sin[m+Q]; + */ + + for (int h = 0; h < H; h++) + { + // then, load all the token for this head in shared memory + shared[threadIdx.x] = tokens[b][n][h][threadIdx.x]; + __syncthreads(); + + const float u = shared[m]; + const float v = shared[m+Q]; + + // write output + if ((threadIdx.x % (D/2)) < Q) + tokens[b][n][h][threadIdx.x] = u*cos - v*sin; + else + tokens[b][n][h][threadIdx.x] = v*cos + u*sin; + } +} + +void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd ) +{ + const int B = tokens.size(0); // batch size + const int N = tokens.size(1); // sequence length + const int H = tokens.size(2); // number of heads + const int D = tokens.size(3); // dimension per head + + TORCH_CHECK(tokens.stride(3) == 1 && tokens.stride(2) == D, "tokens are not contiguous"); + TORCH_CHECK(pos.is_contiguous(), "positions are not contiguous"); + TORCH_CHECK(pos.size(0) == B && pos.size(1) == N && pos.size(2) == 2, "bad pos.shape"); + TORCH_CHECK(D % 4 == 0, "token dim must be multiple of 4"); + + // one block for each layer, one thread per local-max + const int THREADS_PER_BLOCK = D; + const int N_BLOCKS = B * N; // each block takes care of H*D values + const int SHARED_MEM = sizeof(float) * (D + D/4); + + AT_DISPATCH_FLOATING_TYPES_AND2(at::kHalf, at::kBFloat16, tokens.scalar_type(), "rope_2d_cuda", ([&] { + rope_2d_cuda_kernel <<>> ( + //tokens.data_ptr(), + tokens.packed_accessor32(), + pos.data_ptr(), + base, fwd); //, N, H, D ); + })); +} diff --git a/dust3r/croco/models/curope/setup.py b/dust3r/croco/models/curope/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..230632ed05e309200e8f93a3a852072333975009 --- /dev/null +++ b/dust3r/croco/models/curope/setup.py @@ -0,0 +1,34 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +from setuptools import setup +from torch import cuda +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +# compile for all possible CUDA architectures +all_cuda_archs = cuda.get_gencode_flags().replace('compute=','arch=').split() +# alternatively, you can list cuda archs that you want, eg: +# all_cuda_archs = [ + # '-gencode', 'arch=compute_70,code=sm_70', + # '-gencode', 'arch=compute_75,code=sm_75', + # '-gencode', 'arch=compute_80,code=sm_80', + # '-gencode', 'arch=compute_86,code=sm_86' +# ] + +setup( + name = 'curope', + ext_modules = [ + CUDAExtension( + name='curope', + sources=[ + "curope.cpp", + "kernels.cu", + ], + extra_compile_args = dict( + nvcc=['-O3','--ptxas-options=-v',"--use_fast_math"]+all_cuda_archs, + cxx=['-O3']) + ) + ], + cmdclass = { + 'build_ext': BuildExtension + }) diff --git a/dust3r/croco/models/dpt_block.py b/dust3r/croco/models/dpt_block.py new file mode 100644 index 0000000000000000000000000000000000000000..d4ddfb74e2769ceca88720d4c730e00afd71c763 --- /dev/null +++ b/dust3r/croco/models/dpt_block.py @@ -0,0 +1,450 @@ +# Copyright (C) 2022-present Naver Corporation. 
All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# DPT head for ViTs +# -------------------------------------------------------- +# References: +# https://github.com/isl-org/DPT +# https://github.com/EPFL-VILAB/MultiMAE/blob/main/multimae/output_adapters.py + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat +from typing import Union, Tuple, Iterable, List, Optional, Dict + +def pair(t): + return t if isinstance(t, tuple) else (t, t) + +def make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + out_shape4 = out_shape + if expand == True: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], + out_shape1, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], + out_shape2, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], + out_shape3, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + scratch.layer4_rn = nn.Conv2d( + in_shape[3], + out_shape4, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups, + ) + + scratch.layer_rn = nn.ModuleList([ + scratch.layer1_rn, + scratch.layer2_rn, + scratch.layer3_rn, + scratch.layer4_rn, + ]) + + return scratch + +class ResidualConvUnit_custom(nn.Module): + """Residual convolution module.""" + + def __init__(self, features, activation, bn): + """Init. + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups = 1 + + self.conv1 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=not self.bn, + groups=self.groups, + ) + + self.conv2 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=not self.bn, + groups=self.groups, + ) + + if self.bn == True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + Args: + x (tensor): input + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn == True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn == True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + +class FeatureFusionBlock_custom(nn.Module): + """Feature fusion block.""" + + def __init__( + self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True, + width_ratio=1, + ): + """Init. 
+ Args: + features (int): number of features + """ + super(FeatureFusionBlock_custom, self).__init__() + self.width_ratio = width_ratio + + self.deconv = deconv + self.align_corners = align_corners + + self.groups = 1 + + self.expand = expand + out_features = features + if self.expand == True: + out_features = features // 2 + + self.out_conv = nn.Conv2d( + features, + out_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + groups=1, + ) + + self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, *xs): + """Forward pass. + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + if self.width_ratio != 1: + res = F.interpolate(res, size=(output.shape[2], output.shape[3]), mode='bilinear') + + output = self.skip_add.add(output, res) + # output += res + + output = self.resConfUnit2(output) + + if self.width_ratio != 1: + # and output.shape[3] < self.width_ratio * output.shape[2] + #size=(image.shape[]) + if (output.shape[3] / output.shape[2]) < (2 / 3) * self.width_ratio: + shape = 3 * output.shape[3] + else: + shape = int(self.width_ratio * 2 * output.shape[2]) + output = F.interpolate(output, size=(2* output.shape[2], shape), mode='bilinear') + else: + output = nn.functional.interpolate(output, scale_factor=2, + mode="bilinear", align_corners=self.align_corners) + output = self.out_conv(output) + return output + +def make_fusion_block(features, use_bn, width_ratio=1): + return FeatureFusionBlock_custom( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + width_ratio=width_ratio, + ) + +class Interpolate(nn.Module): + """Interpolation module.""" + + def __init__(self, scale_factor, mode, align_corners=False): + """Init. + Args: + scale_factor (float): scaling + mode (str): interpolation mode + """ + super(Interpolate, self).__init__() + + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + """Forward pass. + Args: + x (tensor): input + Returns: + tensor: interpolated data + """ + + x = self.interp( + x, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corners, + ) + + return x + +class DPTOutputAdapter(nn.Module): + """DPT output adapter. + + :param num_cahnnels: Number of output channels + :param stride_level: tride level compared to the full-sized image. + E.g. 4 for 1/4th the size of the image. + :param patch_size_full: Int or tuple of the patch size over the full image size. + Patch size for smaller inputs will be computed accordingly. 
+ :param hooks: Index of intermediate layers + :param layer_dims: Dimension of intermediate layers + :param feature_dim: Feature dimension + :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression + :param use_bn: If set to True, activates batch norm + :param dim_tokens_enc: Dimension of tokens coming from encoder + """ + + def __init__(self, + num_channels: int = 1, + stride_level: int = 1, + patch_size: Union[int, Tuple[int, int]] = 16, + main_tasks: Iterable[str] = ('rgb',), + hooks: List[int] = [2, 5, 8, 11], + layer_dims: List[int] = [96, 192, 384, 768], + feature_dim: int = 256, + last_dim: int = 32, + use_bn: bool = False, + dim_tokens_enc: Optional[int] = None, + head_type: str = 'regression', + output_width_ratio=1, + **kwargs): + super().__init__() + self.num_channels = num_channels + self.stride_level = stride_level + self.patch_size = pair(patch_size) + self.main_tasks = main_tasks + self.hooks = hooks + self.layer_dims = layer_dims + self.feature_dim = feature_dim + self.dim_tokens_enc = dim_tokens_enc * len(self.main_tasks) if dim_tokens_enc is not None else None + self.head_type = head_type + + # Actual patch height and width, taking into account stride of input + self.P_H = max(1, self.patch_size[0] // stride_level) + self.P_W = max(1, self.patch_size[1] // stride_level) + + self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False) + + self.scratch.refinenet1 = make_fusion_block(feature_dim, use_bn, output_width_ratio) + self.scratch.refinenet2 = make_fusion_block(feature_dim, use_bn, output_width_ratio) + self.scratch.refinenet3 = make_fusion_block(feature_dim, use_bn, output_width_ratio) + self.scratch.refinenet4 = make_fusion_block(feature_dim, use_bn, output_width_ratio) + + if self.head_type == 'regression': + # The "DPTDepthModel" head + self.head = nn.Sequential( + nn.Conv2d(feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2d(feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(last_dim, self.num_channels, kernel_size=1, stride=1, padding=0) + ) + elif self.head_type == 'semseg': + # The "DPTSegmentationModel" head + self.head = nn.Sequential( + nn.Conv2d(feature_dim, feature_dim, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(), + nn.ReLU(True), + nn.Dropout(0.1, False), + nn.Conv2d(feature_dim, self.num_channels, kernel_size=1), + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + ) + else: + raise ValueError('DPT head_type must be "regression" or "semseg".') + + if self.dim_tokens_enc is not None: + self.init(dim_tokens_enc=dim_tokens_enc) + + def init(self, dim_tokens_enc=768): + """ + Initialize parts of decoder that are dependent on dimension of encoder tokens. + Should be called when setting up MultiMAE. 
+ + :param dim_tokens_enc: Dimension of tokens coming from encoder + """ + #print(dim_tokens_enc) + + # Set up activation postprocessing layers + if isinstance(dim_tokens_enc, int): + dim_tokens_enc = 4 * [dim_tokens_enc] + + self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc] + + self.act_1_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[0], + out_channels=self.layer_dims[0], + kernel_size=1, stride=1, padding=0, + ), + nn.ConvTranspose2d( + in_channels=self.layer_dims[0], + out_channels=self.layer_dims[0], + kernel_size=4, stride=4, padding=0, + bias=True, dilation=1, groups=1, + ) + ) + + self.act_2_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[1], + out_channels=self.layer_dims[1], + kernel_size=1, stride=1, padding=0, + ), + nn.ConvTranspose2d( + in_channels=self.layer_dims[1], + out_channels=self.layer_dims[1], + kernel_size=2, stride=2, padding=0, + bias=True, dilation=1, groups=1, + ) + ) + + self.act_3_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[2], + out_channels=self.layer_dims[2], + kernel_size=1, stride=1, padding=0, + ) + ) + + self.act_4_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[3], + out_channels=self.layer_dims[3], + kernel_size=1, stride=1, padding=0, + ), + nn.Conv2d( + in_channels=self.layer_dims[3], + out_channels=self.layer_dims[3], + kernel_size=3, stride=2, padding=1, + ) + ) + + self.act_postprocess = nn.ModuleList([ + self.act_1_postprocess, + self.act_2_postprocess, + self.act_3_postprocess, + self.act_4_postprocess + ]) + + def adapt_tokens(self, encoder_tokens): + # Adapt tokens + x = [] + x.append(encoder_tokens[:, :]) + x = torch.cat(x, dim=-1) + return x + + def forward(self, encoder_tokens: List[torch.Tensor], image_size): + #input_info: Dict): + assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first' + H, W = image_size + + # Number of patches in height and width + N_H = H // (self.stride_level * self.P_H) + N_W = W // (self.stride_level * self.P_W) + + # Hook decoder onto 4 layers from specified ViT layers + layers = [encoder_tokens[hook] for hook in self.hooks] + + # Extract only task-relevant tokens and ignore global tokens. + layers = [self.adapt_tokens(l) for l in layers] + + # Reshape tokens to spatial representation + layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers] + + layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)] + # Project layers to chosen feature dim + layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)] + + # Fuse layers using refinement stages + path_4 = self.scratch.refinenet4(layers[3]) + path_3 = self.scratch.refinenet3(path_4, layers[2]) + path_2 = self.scratch.refinenet2(path_3, layers[1]) + path_1 = self.scratch.refinenet1(path_2, layers[0]) + + # Output head + out = self.head(path_1) + + return out diff --git a/dust3r/croco/models/head_downstream.py b/dust3r/croco/models/head_downstream.py new file mode 100644 index 0000000000000000000000000000000000000000..bd40c91ba244d6c3522c6efd4ed4d724b7bdc650 --- /dev/null +++ b/dust3r/croco/models/head_downstream.py @@ -0,0 +1,58 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +# -------------------------------------------------------- +# Heads for downstream tasks +# -------------------------------------------------------- + +""" +A head is a module where the __init__ defines only the head hyperparameters. +A method setup(croconet) takes a CroCoNet and set all layers according to the head and croconet attributes. +The forward takes the features as well as a dictionary img_info containing the keys 'width' and 'height' +""" + +import torch +import torch.nn as nn +from .dpt_block import DPTOutputAdapter + + +class PixelwiseTaskWithDPT(nn.Module): + """ DPT module for CroCo. + by default, hooks_idx will be equal to: + * for encoder-only: 4 equally spread layers + * for encoder+decoder: last encoder + 3 equally spread layers of the decoder + """ + + def __init__(self, *, hooks_idx=None, layer_dims=[96,192,384,768], + output_width_ratio=1, num_channels=1, postprocess=None, **kwargs): + super(PixelwiseTaskWithDPT, self).__init__() + self.return_all_blocks = True # backbone needs to return all layers + self.postprocess = postprocess + self.output_width_ratio = output_width_ratio + self.num_channels = num_channels + self.hooks_idx = hooks_idx + self.layer_dims = layer_dims + + def setup(self, croconet): + dpt_args = {'output_width_ratio': self.output_width_ratio, 'num_channels': self.num_channels} + if self.hooks_idx is None: + if hasattr(croconet, 'dec_blocks'): # encoder + decoder + step = {8: 3, 12: 4, 24: 8}[croconet.dec_depth] + hooks_idx = [croconet.dec_depth+croconet.enc_depth-1-i*step for i in range(3,-1,-1)] + else: # encoder only + step = croconet.enc_depth//4 + hooks_idx = [croconet.enc_depth-1-i*step for i in range(3,-1,-1)] + self.hooks_idx = hooks_idx + print(f' PixelwiseTaskWithDPT: automatically setting hook_idxs={self.hooks_idx}') + dpt_args['hooks'] = self.hooks_idx + dpt_args['layer_dims'] = self.layer_dims + self.dpt = DPTOutputAdapter(**dpt_args) + dim_tokens = [croconet.enc_embed_dim if hook0: + pos_embed = np.concatenate([np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=float) + omega /= embed_dim / 2. + omega = 1. 
/ 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +# -------------------------------------------------------- +# Interpolate position embeddings for high-resolution +# References: +# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- +def interpolate_pos_embed(model, checkpoint_model): + keys = ['enc_pos_embed']+(['dec_pos_embed'] if hasattr(model,'dec_blocks') else []) + img_size = model.patch_embed.img_size + if isinstance(img_size,int): img_size = (img_size,img_size) + for k in keys: + if not k in checkpoint_model: continue + pos_embed_checkpoint = checkpoint_model[k] + embedding_size = pos_embed_checkpoint.shape[-1] + num_extra_tokens = 0 # no cls token + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + new_size = (img_size[0]//model.patch_embed.patch_size[0],img_size[1]//model.patch_embed.patch_size[1]) + if orig_size != new_size[0] or orig_size != new_size[1]: + print("Position interpolate %s from %dx%d to %dx%d" % (k, orig_size, orig_size, new_size[0], new_size[1])) + extra_tokens = pos_embed_checkpoint[:num_extra_tokens,:] + pos_tokens = pos_embed_checkpoint[num_extra_tokens:,:] + pos_tokens = pos_tokens.reshape(1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate(pos_tokens, size=(new_size[0], new_size[1]), mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2).squeeze(0) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=0) + checkpoint_model[k] = new_pos_embed.squeeze(0) + +#---------------------------------------------------------- +# RoPE2D: RoPE implementation in 2D +#---------------------------------------------------------- + +try: + from models.curope import cuRoPE2D + RoPE2D = cuRoPE2D +except ImportError: + print('Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead') + + class RoPE2D(torch.nn.Module): + + def __init__(self, freq=100.0, F0=1.0): + super().__init__() + self.base = freq + self.F0 = F0 + self.cache = {} + + def get_cos_sin(self, D, seq_len, device, dtype): + if (D,seq_len,device,dtype) not in self.cache: + inv_freq = self.F0 / (self.base ** (torch.arange(0, D, 2).float().to(device) / D)) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype) + freqs = torch.cat((freqs, freqs), dim=-1) + cos = freqs.cos() # (Seq, Dim) + sin = freqs.sin() + self.cache[D,seq_len,device,dtype] = (cos,sin) + return self.cache[D,seq_len,device,dtype] + + @staticmethod + def rotate_half(x): + x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rope1d(self, tokens, pos1d, cos, sin): + assert pos1d.ndim==2 + cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :] + sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :] + return (tokens * cos) + (self.rotate_half(tokens) * sin) + + def forward(self, tokens, positions): + """ + input: + * tokens: batch_size x nheads x ntokens x dim + * positions: batch_size x ntokens x 2 (y and x position of each token) + output: 
+ * tokens after applying RoPE2D (batch_size x nheads x ntokens x dim) + """ + assert tokens.size(3)%2==0, "number of dimensions should be a multiple of two" + D = tokens.size(3) // 2 + assert positions.ndim==3 and positions.shape[-1] == 2 # Batch, Seq, 2 + cos, sin = self.get_cos_sin(D, int(positions.max())+1, tokens.device, tokens.dtype) + # split features into two along the feature dimension, and apply rope1d on each half + y, x = tokens.chunk(2, dim=-1) + y = self.apply_rope1d(y, positions[:,:,0], cos, sin) + x = self.apply_rope1d(x, positions[:,:,1], cos, sin) + tokens = torch.cat((y, x), dim=-1) + return tokens \ No newline at end of file diff --git a/dust3r/croco/pretrain.py b/dust3r/croco/pretrain.py new file mode 100644 index 0000000000000000000000000000000000000000..2c45e488015ef5380c71d0381ff453fdb860759e --- /dev/null +++ b/dust3r/croco/pretrain.py @@ -0,0 +1,254 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Pre-training CroCo +# -------------------------------------------------------- +# References: +# MAE: https://github.com/facebookresearch/mae +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import argparse +import datetime +import json +import numpy as np +import os +import sys +import time +import math +from pathlib import Path +from typing import Iterable + +import torch +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import utils.misc as misc +from utils.misc import NativeScalerWithGradNormCount as NativeScaler +from models.croco import CroCoNet +from models.criterion import MaskedMSE +from datasets.pairs_dataset import PairsDataset + + +def get_args_parser(): + parser = argparse.ArgumentParser('CroCo pre-training', add_help=False) + # model and criterion + parser.add_argument('--model', default='CroCoNet()', type=str, help="string containing the model to build") + parser.add_argument('--norm_pix_loss', default=1, choices=[0,1], help="apply per-patch mean/std normalization before applying the loss") + # dataset + parser.add_argument('--dataset', default='habitat_release', type=str, help="training set") + parser.add_argument('--transforms', default='crop224+acolor', type=str, help="transforms to apply") # in the paper, we also use some homography and rotation, but found later that they were not useful or even harmful + # training + parser.add_argument('--seed', default=0, type=int, help="Random seed") + parser.add_argument('--batch_size', default=64, type=int, help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus)") + parser.add_argument('--epochs', default=800, type=int, help="Maximum number of epochs for the scheduler") + parser.add_argument('--max_epoch', default=400, type=int, help="Stop training at this epoch") + parser.add_argument('--accum_iter', default=1, type=int, help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)") + parser.add_argument('--weight_decay', type=float, default=0.05, help="weight decay (default: 0.05)") + parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate (absolute lr)') +
parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR', help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', help='lower lr bound for cyclic schedulers that hit 0') + parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', help='epochs to warmup LR') + parser.add_argument('--amp', type=int, default=1, choices=[0,1], help="Use Automatic Mixed Precision for pretraining") + # others + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') + parser.add_argument('--save_freq', default=1, type=int, help='frequency (number of epochs) to save checkpoint in checkpoint-last.pth') + parser.add_argument('--keep_freq', default=20, type=int, help='frequency (number of epochs) to save checkpoint in checkpoint-%d.pth') + parser.add_argument('--print_freq', default=20, type=int, help='frequency (number of iterations) to print info while training') + # paths + parser.add_argument('--output_dir', default='./output/', type=str, help="path where to save the output") + parser.add_argument('--data_dir', default='./data/', type=str, help="path where data are stored") + return parser + + + + +def main(args): + misc.init_distributed_mode(args) + global_rank = misc.get_rank() + world_size = misc.get_world_size() + + print("output_dir: "+args.output_dir) + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + + # auto resume + last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth') + args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device(device) + + # fix the seed + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + ## training dataset and loader + print('Building dataset for {:s} with transforms {:s}'.format(args.dataset, args.transforms)) + dataset = PairsDataset(args.dataset, trfs=args.transforms, data_dir=args.data_dir) + if world_size>1: + sampler_train = torch.utils.data.DistributedSampler( + dataset, num_replicas=world_size, rank=global_rank, shuffle=True + ) + print("Sampler_train = %s" % str(sampler_train)) + else: + sampler_train = torch.utils.data.RandomSampler(dataset) + data_loader_train = torch.utils.data.DataLoader( + dataset, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=True, + drop_last=True, + ) + + ## model + print('Loading model: {:s}'.format(args.model)) + model = eval(args.model) + print('Loading criterion: MaskedMSE(norm_pix_loss={:s})'.format(str(bool(args.norm_pix_loss)))) + criterion = MaskedMSE(norm_pix_loss=bool(args.norm_pix_loss)) + + model.to(device) + model_without_ddp = model + print("Model = %s" % str(model_without_ddp)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + print("accumulate grad
iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True, static_graph=True) + model_without_ddp = model.module + + param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) # following timm: set wd as 0 for bias and norm layers + optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) + print(optimizer) + loss_scaler = NativeScaler() + + misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + if global_rank == 0 and args.output_dir is not None: + log_writer = SummaryWriter(log_dir=args.output_dir) + else: + log_writer = None + + print(f"Start training until {args.max_epoch} epochs") + start_time = time.time() + for epoch in range(args.start_epoch, args.max_epoch): + if world_size>1: + data_loader_train.sampler.set_epoch(epoch) + + train_stats = train_one_epoch( + model, criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + log_writer=log_writer, + args=args + ) + + if args.output_dir and epoch % args.save_freq == 0 : + misc.save_model( + args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch, fname='last') + + if args.output_dir and (epoch % args.keep_freq == 0 or epoch + 1 == args.max_epoch) and (epoch>0 or args.max_epoch==1): + misc.save_model( + args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch) + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch,} + + if args.output_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + + + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, + log_writer=None, + args=None): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + accum_iter = args.accum_iter + + optimizer.zero_grad() + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + for data_iter_step, (image1, image2) in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)): + + # we use a per iteration lr scheduler + if data_iter_step % accum_iter == 0: + misc.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) + + image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + with torch.cuda.amp.autocast(enabled=bool(args.amp)): + out, mask, target = model(image1, image2) + loss = criterion(out, mask, target) + + loss_value = loss.item() + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + loss /= accum_iter + loss_scaler(loss, optimizer, parameters=model.parameters(), + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + 
metric_logger.update(loss=loss_value) + + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(lr=lr) + + loss_value_reduce = misc.all_reduce_mean(loss_value) + if log_writer is not None and ((data_iter_step + 1) % (accum_iter*args.print_freq)) == 0: + # x-axis is based on epoch_1000x in the tensorboard, calibrating differences curves when batch size changes + epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) + log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('lr', lr, epoch_1000x) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + main(args) diff --git a/dust3r/croco/stereoflow/README.MD b/dust3r/croco/stereoflow/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..81595380fadd274b523e0cf77921b1b65cbedb34 --- /dev/null +++ b/dust3r/croco/stereoflow/README.MD @@ -0,0 +1,318 @@ +## CroCo-Stereo and CroCo-Flow + +This README explains how to use CroCo-Stereo and CroCo-Flow as well as how they were trained. +All commands should be launched from the root directory. + +### Simple inference example + +We provide a simple inference exemple for CroCo-Stereo and CroCo-Flow in the Totebook `croco-stereo-flow-demo.ipynb`. +Before running it, please download the trained models with: +``` +bash stereoflow/download_model.sh crocostereo.pth +bash stereoflow/download_model.sh crocoflow.pth +``` + +### Prepare data for training or evaluation + +Put the datasets used for training/evaluation in `./data/stereoflow` (or update the paths at the top of `stereoflow/datasets_stereo.py` and `stereoflow/datasets_flow.py`). +Please find below on the file structure should look for each dataset: +
+FlyingChairs + +``` +./data/stereoflow/FlyingChairs/ +└───chairs_split.txt +└───data/ + └─── ... +``` +
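For reference, `chairs_split.txt` above is what defines the train/val split: `FlyingChairsDataset` in `stereoflow/datasets_flow.py` loads it with `np.loadtxt` and keeps the pairs marked 1 for training and 2 for validation. A minimal sketch of that logic (path taken from the layout above):

```
import numpy as np

# 1 = training pair, 2 = validation pair (see FlyingChairsDataset._build_cache)
split = np.loadtxt('./data/stereoflow/FlyingChairs/chairs_split.txt', dtype=np.int32)
train_ids = ['{:05d}'.format(i) for i in np.where(split == 1)[0] + 1]
val_ids = ['{:05d}'.format(i) for i in np.where(split == 2)[0] + 1]
print(len(train_ids), len(val_ids))  # expected: 22232 640
```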
+ +
+MPI-Sintel + +``` +./data/stereoflow/MPI-Sintel/ +└───training/ +│ └───clean/ +│ └───final/ +│ └───flow/ +└───test/ + └───clean/ + └───final/ +``` +
+ +
+SceneFlow (including FlyingThings) + +``` +./data/stereoflow/SceneFlow/ +└───Driving/ +│ └───disparity/ +│ └───frames_cleanpass/ +│ └───frames_finalpass/ +└───FlyingThings/ +│ └───disparity/ +│ └───frames_cleanpass/ +│ └───frames_finalpass/ +│ └───optical_flow/ +└───Monkaa/ + └───disparity/ + └───frames_cleanpass/ + └───frames_finalpass/ +``` +
+ +
+TartanAir + +``` +./data/stereoflow/TartanAir/ +└───abandonedfactory/ +│ └───.../ +└───abandonedfactory_night/ +│ └───.../ +└───.../ +``` +
+ +
+Booster + +``` +./data/stereoflow/booster_gt/ +└───train/ + └───balanced/ + └───Bathroom/ + └───Bedroom/ + └───... +``` +
+ +
+CREStereo + +``` +./data/stereoflow/crenet_stereo_trainset/ +└───stereo_trainset/ + └───crestereo/ + └───hole/ + └───reflective/ + └───shapenet/ + └───tree/ +``` +
+ +
+ETH3D Two-view Low-res + +``` +./data/stereoflow/eth3d_lowres/ +└───test/ +│ └───lakeside_1l/ +│ └───... +└───train/ +│ └───delivery_area_1l/ +│ └───... +└───train_gt/ + └───delivery_area_1l/ + └───... +``` +
+ +
+KITTI 2012 + +``` +./data/stereoflow/kitti-stereo-2012/ +└───testing/ +│ └───colored_0/ +│ └───colored_1/ +└───training/ + └───colored_0/ + └───colored_1/ + └───disp_occ/ + └───flow_occ/ +``` +
+ +
+KITTI 2015 + +``` +./data/stereoflow/kitti-stereo-2015/ +└───testing/ +│ └───image_2/ +│ └───image_3/ +└───training/ + └───image_2/ + └───image_3/ + └───disp_occ_0/ + └───flow_occ/ +``` +
+ +
+Middlebury + +``` +./data/stereoflow/middlebury +└───2005/ +│ └───train/ +│ └───Art/ +│ └───... +└───2006/ +│ └───Aloe/ +│ └───Baby1/ +│ └───... +└───2014/ +│ └───Adirondack-imperfect/ +│ └───Adirondack-perfect/ +│ └───... +└───2021/ +│ └───data/ +│ └───artroom1/ +│ └───artroom2/ +│ └───... +└───MiddEval3_F/ + └───test/ + │ └───Australia/ + │ └───... + └───train/ + └───Adirondack/ + └───... +``` +
+ +
+Spring + +``` +./data/stereoflow/spring/ +└───test/ +│ └───0003/ +│ └───... +└───train/ + └───0001/ + └───... +``` +
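Before launching any training, it can help to check that the roots above are actually in place, since each dataset class asserts on its root directory at construction time. A small sketch (paths copied from the layouts above; adjust them if you changed the defaults at the top of `stereoflow/datasets_stereo.py` / `stereoflow/datasets_flow.py`):

```
import os

expected_roots = [
    './data/stereoflow/FlyingChairs',
    './data/stereoflow/MPI-Sintel',
    './data/stereoflow/SceneFlow',
    './data/stereoflow/TartanAir',
    './data/stereoflow/booster_gt',
    './data/stereoflow/crenet_stereo_trainset',
    './data/stereoflow/eth3d_lowres',
    './data/stereoflow/kitti-stereo-2012',
    './data/stereoflow/kitti-stereo-2015',
    './data/stereoflow/middlebury',
    './data/stereoflow/spring',
]
for root in expected_roots:
    print('ok     ' if os.path.isdir(root) else 'MISSING', root)
```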
+ + +### CroCo-Stereo + +##### Main model + +The main training of CroCo-Stereo was performed on a series of datasets, and it was used as it for Middlebury v3 benchmark. + +``` +# Download the model +bash stereoflow/download_model.sh crocostereo.pth +# Middlebury v3 submission +python stereoflow/test.py --model stereoflow_models/crocostereo.pth --dataset "MdEval3('all_full')" --save submission --tile_overlap 0.9 +# Training command that was used, using checkpoint-last.pth +python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/ +# or it can be launched on multiple gpus (while maintaining the effective batch size), e.g. on 3 gpus: +torchrun --nproc_per_node 3 stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 2 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/ +``` + +For evaluation of validation set, we also provide the model trained on the `subtrain` subset of the training sets. + +``` +# Download the model +bash stereoflow/download_model.sh crocostereo_subtrain.pth +# Evaluation on validation sets +python stereoflow/test.py --model stereoflow_models/crocostereo_subtrain.pth --dataset "MdEval3('subval_full')+ETH3DLowRes('subval')+SceneFlow('test_finalpass')+SceneFlow('test_cleanpass')" --save metrics --tile_overlap 0.9 +# Training command that was used (same as above but on subtrain, using checkpoint-best.pth), can also be launched on multiple gpus +python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('subtrain')+50*Md05('subtrain')+50*Md06('subtrain')+50*Md14('subtrain')+50*Md21('subtrain')+50*MdEval3('subtrain_full')+Booster('subtrain_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_subtrain/ +``` + +##### Other models + +
+ Model for ETH3D + The model used for the submission on ETH3D is trained with the same command but using an unbounded Laplacian loss. + + # Download the model + bash stereoflow/download_model.sh crocostereo_eth3d.pth + # ETH3D submission + python stereoflow/test.py --model stereoflow_models/crocostereo_eth3d.pth --dataset "ETH3DLowRes('all')" --save submission --tile_overlap 0.9 + # Training command that was used + python -u stereoflow/train.py stereo --criterion "LaplacianLoss()" --tile_conf_mode conf_expbeta3 --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_eth3d/ + +
+ +
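As a reminder of what these criteria optimize, the `LaplacianLoss*` variants are all negative log-likelihoods of a Laplacian distribution whose scale is predicted per pixel (the confidence); the bounded variants additionally constrain that scale. A rough, unofficial sketch of the unbounded case (the exact implementations, including the bounds and the `max_gtnorm` masking, live in `stereoflow/criterion.py`):

```
import torch

def laplacian_nll(pred, gt, scale):
    # Negative log-likelihood of Laplace(gt; pred, scale), averaged over valid pixels.
    # `scale` is the per-pixel confidence predicted alongside the disparity/flow.
    valid = torch.isfinite(gt)
    err = torch.abs(gt - pred)[valid]
    b = scale[valid]
    return (err / b + torch.log(2 * b)).mean()
```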
+ Main model finetuned on Kitti + + # Download the model + bash stereoflow/download_model.sh crocostereo_finetune_kitti.pth + # Kitti submission + python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.9 + # Training that was used + python -u stereoflow/train.py stereo --crop 352 1216 --criterion "LaplacianLossBounded2()" --dataset "Kitti12('train')+Kitti15('train')" --lr 3e-5 --batch_size 1 --accum_iter 6 --epochs 20 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_kitti/ --save_every 5 +
+ +
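Note the `--batch_size 1 --accum_iter 6` combination in the command above: gradient accumulation keeps the effective batch size at 6 on a single GPU. Assuming the same convention as in `pretrain.py`:

```
# effective batch size = batch_size * accum_iter * number of processes (GPUs)
batch_size, accum_iter, world_size = 1, 6, 1
print(batch_size * accum_iter * world_size)  # -> 6
```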
+ Main model finetuned on Spring + + # Download the model + bash stereoflow/download_model.sh crocostereo_finetune_spring.pth + # Spring submission + python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9 + # Training command that was used + python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "Spring('train')" --lr 3e-5 --batch_size 6 --epochs 8 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_spring/ +
+ +
+ Smaller models + To train CroCo-Stereo with smaller CroCo pretrained models, simply replace the --pretrained argument. To download the smaller CroCo-Stereo models based on CroCo v2 pretraining: for the model with a ViT-Base encoder and a Small decoder, use bash stereoflow/download_model.sh crocostereo_subtrain_vitb_smalldecoder.pth, and for the model with a ViT-Base encoder and a Base decoder, use bash stereoflow/download_model.sh crocostereo_subtrain_vitb_basedecoder.pth. +
+ + +### CroCo-Flow + +##### Main model + +The main training of CroCo-Flow was performed on the FlyingThings, FlyingChairs, MPI-Sintel and TartanAir datasets. +It was used for our submission to the MPI-Sintel benchmark. + +``` +# Download the model +bash stereoflow/download_model.sh crocoflow.pth +# Evaluation +python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --save metrics --tile_overlap 0.9 +# Sintel submission +python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('test_allpass')" --save submission --tile_overlap 0.9 +# Training command that was used, with checkpoint-best.pth +python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "40*MPISintel('subtrain_cleanpass')+40*MPISintel('subtrain_finalpass')+4*FlyingThings('train_allpass')+4*FlyingChairs('train')+TartanAir('train')" --val_dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --lr 2e-5 --batch_size 8 --epochs 240 --img_per_epoch 30000 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocoflow/main/ +``` + +##### Other models + +
+ Main model finetuned on Kitti + + # Download the model + bash stereoflow/download_model.sh crocoflow_finetune_kitti.pth + # Kitti submission + python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.99 + # Training that was used, with checkpoint-last.pth + python -u stereoflow/train.py flow --crop 352 1216 --criterion "LaplacianLossBounded()" --dataset "Kitti15('train')+Kitti12('train')" --lr 2e-5 --batch_size 1 --accum_iter 8 --epochs 150 --save_every 5 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_kitti/ +
+ +
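The KITTI submissions above are written as 16-bit PNGs. As a reference for the encoding used by `writeFlowKitti` / `_read_kitti_flow` in `stereoflow/datasets_flow.py` (channels in RGB order: u, v, valid; values stored as `64 * flow + 2**15`), here is an approximate standalone version:

```
import numpy as np

def encode_kitti_flow(uv):
    # uv: (H, W, 2) float flow -> (H, W, 3) uint16 array (u, v, valid), RGB order
    data = 64.0 * uv + 2 ** 15
    valid = np.ones(uv.shape[:2] + (1,))
    return np.concatenate([data, valid], axis=-1).astype(np.uint16)

def decode_kitti_flow(png):
    # png: (H, W, 3) uint16 array in RGB order -> float flow, +inf at invalid pixels
    valid = png[:, :, 2] > 0
    flow = (png[:, :, :2].astype(np.float32) - 2 ** 15) / 64.0
    flow[~valid] = np.inf
    return flow
```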
+ Main model finetuned on Spring + + # Download the model + bash stereoflow/download_model.sh crocoflow_finetune_spring.pth + # Spring submission + python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9 + # Training command that was used, with checkpoint-last.pth + python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "Spring('train')" --lr 2e-5 --batch_size 8 --epochs 12 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_spring/ +
+ +
+ Smaller models + To train CroCo-Flow with smaller CroCo pretrained models, simply replace the --pretrained argument. To download the smaller CroCo-Flow models based on CroCo v2 pretraining: for the model with a ViT-Base encoder and a Small decoder, use bash stereoflow/download_model.sh crocoflow_vitb_smalldecoder.pth, and for the model with a ViT-Base encoder and a Base decoder, use bash stereoflow/download_model.sh crocoflow_vitb_basedecoder.pth. +
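Finally, a note on the `--dataset` strings used throughout the commands above: they are parsed by `get_train_dataset_flow` / `get_test_datasets_flow` (and the stereo counterparts), which append `Dataset` to each name and `eval` the resulting expression, so `N*` repeats a dataset's pair list N times and `+` concatenates datasets. A rough illustration (augmentation and crop arguments, normally injected by the parser, are omitted, and the import path is assumed):

```
from stereoflow.datasets_flow import MPISintelDataset, FlyingChairsDataset

# Roughly what --dataset "40*MPISintel('subtrain_cleanpass')+4*FlyingChairs('train')" expands to:
train_set = 40 * MPISintelDataset('subtrain_cleanpass') + 4 * FlyingChairsDataset('train')
```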
diff --git a/dust3r/croco/stereoflow/augmentor.py b/dust3r/croco/stereoflow/augmentor.py new file mode 100644 index 0000000000000000000000000000000000000000..69e6117151988d94cbc4b385e0d88e982133bf10 --- /dev/null +++ b/dust3r/croco/stereoflow/augmentor.py @@ -0,0 +1,290 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# Data augmentation for training stereo and flow +# -------------------------------------------------------- + +# References +# https://github.com/autonomousvision/unimatch/blob/master/dataloader/stereo/transforms.py +# https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/transforms.py + + +import numpy as np +import random +from PIL import Image + +import cv2 +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + +import torch +from torchvision.transforms import ColorJitter +import torchvision.transforms.functional as FF + +class StereoAugmentor(object): + + def __init__(self, crop_size, scale_prob=0.5, scale_xonly=True, lhth=800., lminscale=0.0, lmaxscale=1.0, hminscale=-0.2, hmaxscale=0.4, scale_interp_nearest=True, rightjitterprob=0.5, v_flip_prob=0.5, color_aug_asym=True, color_choice_prob=0.5): + self.crop_size = crop_size + self.scale_prob = scale_prob + self.scale_xonly = scale_xonly + self.lhth = lhth + self.lminscale = lminscale + self.lmaxscale = lmaxscale + self.hminscale = hminscale + self.hmaxscale = hmaxscale + self.scale_interp_nearest = scale_interp_nearest + self.rightjitterprob = rightjitterprob + self.v_flip_prob = v_flip_prob + self.color_aug_asym = color_aug_asym + self.color_choice_prob = color_choice_prob + + def _random_scale(self, img1, img2, disp): + ch,cw = self.crop_size + h,w = img1.shape[:2] + if self.scale_prob>0. and np.random.rand()1.: + scale_x = clip_scale + scale_y = scale_x if not self.scale_xonly else 1.0 + img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + disp = cv2.resize(disp, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR if not self.scale_interp_nearest else cv2.INTER_NEAREST) * scale_x + return img1, img2, disp + + def _random_crop(self, img1, img2, disp): + h,w = img1.shape[:2] + ch,cw = self.crop_size + assert ch<=h and cw<=w, (img1.shape, h,w,ch,cw) + offset_x = np.random.randint(w - cw + 1) + offset_y = np.random.randint(h - ch + 1) + img1 = img1[offset_y:offset_y+ch,offset_x:offset_x+cw] + img2 = img2[offset_y:offset_y+ch,offset_x:offset_x+cw] + disp = disp[offset_y:offset_y+ch,offset_x:offset_x+cw] + return img1, img2, disp + + def _random_vflip(self, img1, img2, disp): + # vertical flip + if self.v_flip_prob>0 and np.random.rand() < self.v_flip_prob: + img1 = np.copy(np.flipud(img1)) + img2 = np.copy(np.flipud(img2)) + disp = np.copy(np.flipud(disp)) + return img1, img2, disp + + def _random_rotate_shift_right(self, img2): + if self.rightjitterprob>0. 
and np.random.rand() 0) & (xx < wd1) & (yy > 0) & (yy < ht1) + xx = xx[v] + yy = yy[v] + flow1 = flow1[v] + + flow = np.inf * np.ones([ht1, wd1, 2], dtype=np.float32) # invalid value every where, before we fill it with the correct ones + flow[yy, xx] = flow1 + return flow + + def spatial_transform(self, img1, img2, flow, dname): + + if np.random.rand() < self.spatial_aug_prob: + # randomly sample scale + ht, wd = img1.shape[:2] + clip_min_scale = np.maximum( + (self.crop_size[0] + 8) / float(ht), + (self.crop_size[1] + 8) / float(wd)) + min_scale, max_scale = self.min_scale, self.max_scale + scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) + scale_x = scale + scale_y = scale + if np.random.rand() < self.stretch_prob: + scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + scale_x = np.clip(scale_x, clip_min_scale, None) + scale_y = np.clip(scale_y, clip_min_scale, None) + # rescale the images + img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow = self._resize_flow(flow, scale_x, scale_y, factor=2.0 if dname=='Spring' else 1.0) + elif dname=="Spring": + flow = self._resize_flow(flow, 1.0, 1.0, factor=2.0) + + if self.h_flip_prob>0. and np.random.rand() < self.h_flip_prob: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + + if self.v_flip_prob>0. and np.random.rand() < self.v_flip_prob: # v-flip + img1 = img1[::-1, :] + img2 = img2[::-1, :] + flow = flow[::-1, :] * [1.0, -1.0] + + # In case no cropping + if img1.shape[0] - self.crop_size[0] > 0: + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) + else: + y0 = 0 + if img1.shape[1] - self.crop_size[1] > 0: + x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) + else: + x0 = 0 + + img1 = img1[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + img2 = img2[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + flow = flow[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + + return img1, img2, flow + + def __call__(self, img1, img2, flow, dname): + img1, img2, flow = self.spatial_transform(img1, img2, flow, dname) + img1, img2 = self.color_transform(img1, img2) + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + return img1, img2, flow \ No newline at end of file diff --git a/dust3r/croco/stereoflow/criterion.py b/dust3r/croco/stereoflow/criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..57792ebeeee34827b317a4d32b7445837bb33f17 --- /dev/null +++ b/dust3r/croco/stereoflow/criterion.py @@ -0,0 +1,251 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +# -------------------------------------------------------- +# Losses, metrics per batch, metrics per dataset +# -------------------------------------------------------- + +import torch +from torch import nn +import torch.nn.functional as F + +def _get_gtnorm(gt): + if gt.size(1)==1: # stereo + return gt + # flow + return torch.sqrt(torch.sum(gt**2, dim=1, keepdims=True)) # Bx1xHxW + +############ losses without confidence + +class L1Loss(nn.Module): + + def __init__(self, max_gtnorm=None): + super().__init__() + self.max_gtnorm = max_gtnorm + self.with_conf = False + + def _error(self, gt, predictions): + return torch.abs(gt-predictions) + + def forward(self, predictions, gt, inspect=False): + mask = torch.isfinite(gt) + if self.max_gtnorm is not None: + mask *= _get_gtnorm(gt).expand(-1,gt.size(1),-1,-1) which is a constant + + +class LaplacianLossBounded(nn.Module): # used for CroCo-Flow ; in the equation of the paper, we have a=1/b + def __init__(self, max_gtnorm=10000., a=0.25, b=4.): + super().__init__() + self.max_gtnorm = max_gtnorm + self.with_conf = True + self.a, self.b = a, b + + def forward(self, predictions, gt, conf): + mask = torch.isfinite(gt) + mask = mask[:,0,:,:] + if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant + +class LaplacianLossBounded2(nn.Module): # used for CroCo-Stereo (except for ETH3D) ; in the equation of the paper, we have a=b + def __init__(self, max_gtnorm=None, a=3.0, b=3.0): + super().__init__() + self.max_gtnorm = max_gtnorm + self.with_conf = True + self.a, self.b = a, b + + def forward(self, predictions, gt, conf): + mask = torch.isfinite(gt) + mask = mask[:,0,:,:] + if self.max_gtnorm is not None: mask *= _get_gtnorm(gt)[:,0,:,:] which is a constant + +############## metrics per batch + +class StereoMetrics(nn.Module): + + def __init__(self, do_quantile=False): + super().__init__() + self.bad_ths = [0.5,1,2,3] + self.do_quantile = do_quantile + + def forward(self, predictions, gt): + B = predictions.size(0) + metrics = {} + gtcopy = gt.clone() + mask = torch.isfinite(gtcopy) + gtcopy[~mask] = 999999.0 # we make a copy and put a non-infinite value, such that it does not become nan once multiplied by the mask value 0 + Npx = mask.view(B,-1).sum(dim=1) + L1error = (torch.abs(gtcopy-predictions)*mask).view(B,-1) + L2error = (torch.square(gtcopy-predictions)*mask).view(B,-1) + # avgerr + metrics['avgerr'] = torch.mean(L1error.sum(dim=1)/Npx ) + # rmse + metrics['rmse'] = torch.sqrt(L2error.sum(dim=1)/Npx).mean(dim=0) + # err > t for t in [0.5,1,2,3] + for ths in self.bad_ths: + metrics['bad@{:.1f}'.format(ths)] = (((L1error>ths)* mask.view(B,-1)).sum(dim=1)/Npx).mean(dim=0) * 100 + return metrics + +class FlowMetrics(nn.Module): + def __init__(self): + super().__init__() + self.bad_ths = [1,3,5] + + def forward(self, predictions, gt): + B = predictions.size(0) + metrics = {} + mask = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite + Npx = mask.view(B,-1).sum(dim=1) + gtcopy = gt.clone() # to compute L1/L2 error, we need to have non-infinite value, the error computed at this locations will be ignored + gtcopy[:,0,:,:][~mask] = 999999.0 + gtcopy[:,1,:,:][~mask] = 999999.0 + L1error = (torch.abs(gtcopy-predictions).sum(dim=1)*mask).view(B,-1) + L2error = (torch.sqrt(torch.sum(torch.square(gtcopy-predictions),dim=1))*mask).view(B,-1) + metrics['L1err'] = torch.mean(L1error.sum(dim=1)/Npx ) + metrics['EPE'] = torch.mean(L2error.sum(dim=1)/Npx ) + for ths in self.bad_ths: + metrics['bad@{:.1f}'.format(ths)] = 
(((L2error>ths)* mask.view(B,-1)).sum(dim=1)/Npx).mean(dim=0) * 100 + return metrics + +############## metrics per dataset +## we update the average and maintain the number of pixels while adding data batch per batch +## at the beggining, call reset() +## after each batch, call add_batch(...) +## at the end: call get_results() + +class StereoDatasetMetrics(nn.Module): + + def __init__(self): + super().__init__() + self.bad_ths = [0.5,1,2,3] + + def reset(self): + self.agg_N = 0 # number of pixels so far + self.agg_L1err = torch.tensor(0.0) # L1 error so far + self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels + self._metrics = None + + def add_batch(self, predictions, gt): + assert predictions.size(1)==1, predictions.size() + assert gt.size(1)==1, gt.size() + if gt.size(2)==predictions.size(2)*2 and gt.size(3)==predictions.size(3)*2: # special case for Spring ... + L1err = torch.minimum( torch.minimum( torch.minimum( + torch.sum(torch.abs(gt[:,:,0::2,0::2]-predictions),dim=1), + torch.sum(torch.abs(gt[:,:,1::2,0::2]-predictions),dim=1)), + torch.sum(torch.abs(gt[:,:,0::2,1::2]-predictions),dim=1)), + torch.sum(torch.abs(gt[:,:,1::2,1::2]-predictions),dim=1)) + valid = torch.isfinite(L1err) + else: + valid = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite + L1err = torch.sum(torch.abs(gt-predictions),dim=1) + N = valid.sum() + Nnew = self.agg_N + N + self.agg_L1err = float(self.agg_N)/Nnew * self.agg_L1err + L1err[valid].mean().cpu() * float(N)/Nnew + self.agg_N = Nnew + for i,th in enumerate(self.bad_ths): + self.agg_Nbad[i] += (L1err[valid]>th).sum().cpu() + + def _compute_metrics(self): + if self._metrics is not None: return + out = {} + out['L1err'] = self.agg_L1err.item() + for i,th in enumerate(self.bad_ths): + out['bad@{:.1f}'.format(th)] = (float(self.agg_Nbad[i]) / self.agg_N).item() * 100.0 + self._metrics = out + + def get_results(self): + self._compute_metrics() # to avoid recompute them multiple times + return self._metrics + +class FlowDatasetMetrics(nn.Module): + + def __init__(self): + super().__init__() + self.bad_ths = [0.5,1,3,5] + self.speed_ths = [(0,10),(10,40),(40,torch.inf)] + + def reset(self): + self.agg_N = 0 # number of pixels so far + self.agg_L1err = torch.tensor(0.0) # L1 error so far + self.agg_L2err = torch.tensor(0.0) # L2 (=EPE) error so far + self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels + self.agg_EPEspeed = [torch.tensor(0.0) for _ in self.speed_ths] # EPE per speed bin so far + self.agg_Nspeed = [0 for _ in self.speed_ths] # N pixels per speed bin so far + self._metrics = None + self.pairname_results = {} + + def add_batch(self, predictions, gt): + assert predictions.size(1)==2, predictions.size() + assert gt.size(1)==2, gt.size() + if gt.size(2)==predictions.size(2)*2 and gt.size(3)==predictions.size(3)*2: # special case for Spring ... 
+ L1err = torch.minimum( torch.minimum( torch.minimum( + torch.sum(torch.abs(gt[:,:,0::2,0::2]-predictions),dim=1), + torch.sum(torch.abs(gt[:,:,1::2,0::2]-predictions),dim=1)), + torch.sum(torch.abs(gt[:,:,0::2,1::2]-predictions),dim=1)), + torch.sum(torch.abs(gt[:,:,1::2,1::2]-predictions),dim=1)) + L2err = torch.minimum( torch.minimum( torch.minimum( + torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]-predictions),dim=1)), + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]-predictions),dim=1))), + torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]-predictions),dim=1))), + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]-predictions),dim=1))) + valid = torch.isfinite(L1err) + gtspeed = (torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,0::2,1::2]),dim=1)) +\ + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,0::2]),dim=1)) + torch.sqrt(torch.sum(torch.square(gt[:,:,1::2,1::2]),dim=1)) ) / 4.0 # let's just average them + else: + valid = torch.isfinite(gt[:,0,:,:]) # both x and y would be infinite + L1err = torch.sum(torch.abs(gt-predictions),dim=1) + L2err = torch.sqrt(torch.sum(torch.square(gt-predictions),dim=1)) + gtspeed = torch.sqrt(torch.sum(torch.square(gt),dim=1)) + N = valid.sum() + Nnew = self.agg_N + N + self.agg_L1err = float(self.agg_N)/Nnew * self.agg_L1err + L1err[valid].mean().cpu() * float(N)/Nnew + self.agg_L2err = float(self.agg_N)/Nnew * self.agg_L2err + L2err[valid].mean().cpu() * float(N)/Nnew + self.agg_N = Nnew + for i,th in enumerate(self.bad_ths): + self.agg_Nbad[i] += (L2err[valid]>th).sum().cpu() + for i,(th1,th2) in enumerate(self.speed_ths): + vv = (gtspeed[valid]>=th1) * (gtspeed[valid] don't use batch_size>1 at test time) + self._prepare_data() + self._load_or_build_cache() + + def prepare_data(self): + """ + to be defined for each dataset + """ + raise NotImplementedError + + def __len__(self): + return len(self.pairnames) # each pairname is typically of the form (str, int1, int2) + + def __getitem__(self, index): + pairname = self.pairnames[index] + + # get filenames + img1name = self.pairname_to_img1name(pairname) + img2name = self.pairname_to_img2name(pairname) + flowname = self.pairname_to_flowname(pairname) if self.pairname_to_flowname is not None else None + + # load images and disparities + img1 = _read_img(img1name) + img2 = _read_img(img2name) + flow = self.load_flow(flowname) if flowname is not None else None + + # apply augmentations + if self.augmentor is not None: + img1, img2, flow = self.augmentor(img1, img2, flow, self.name) + + if self.totensor: + img1 = img_to_tensor(img1) + img2 = img_to_tensor(img2) + if flow is not None: + flow = flow_to_tensor(flow) + else: + flow = torch.tensor([]) # to allow dataloader batching with default collate_gn + pairname = str(pairname) # transform potential tuple to str to be able to batch it + + return img1, img2, flow, pairname + + def __rmul__(self, v): + self.rmul *= v + self.pairnames = v * self.pairnames + return self + + def __str__(self): + return f'{self.__class__.__name__}_{self.split}' + + def __repr__(self): + s = f'{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})' + if self.rmul==1: + s+=f'\n\tnum pairs: {len(self.pairnames)}' + else: + s+=f'\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})' + return s + + def _set_root(self): + self.root = dataset_to_root[self.name] + assert os.path.isdir(self.root), f"could not 
find root directory for dataset {self.name}: {self.root}" + + def _load_or_build_cache(self): + cache_file = osp.join(cache_dir, self.name+'.pkl') + if osp.isfile(cache_file): + with open(cache_file, 'rb') as fid: + self.pairnames = pickle.load(fid)[self.split] + else: + tosave = self._build_cache() + os.makedirs(cache_dir, exist_ok=True) + with open(cache_file, 'wb') as fid: + pickle.dump(tosave, fid) + self.pairnames = tosave[self.split] + +class TartanAirDataset(FlowDataset): + + def _prepare_data(self): + self.name = "TartanAir" + self._set_root() + assert self.split in ['train'] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], 'image_left/{:06d}_left.png'.format(pairname[1])) + self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], 'image_left/{:06d}_left.png'.format(pairname[2])) + self.pairname_to_flowname = lambda pairname: osp.join(self.root, pairname[0], 'flow/{:06d}_{:06d}_flow.npy'.format(pairname[1],pairname[2])) + self.pairname_to_str = lambda pairname: os.path.join(pairname[0][pairname[0].find('/')+1:], '{:06d}_{:06d}'.format(pairname[1], pairname[2])) + self.load_flow = _read_numpy_flow + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + pairs = [(osp.join(s,s,difficulty,Pxxx),int(a[:6]),int(a[:6])+1) for s in seqs for difficulty in ['Easy','Hard'] for Pxxx in sorted(os.listdir(osp.join(self.root,s,s,difficulty))) for a in sorted(os.listdir(osp.join(self.root,s,s,difficulty,Pxxx,'image_left/')))[:-1]] + assert len(pairs)==306268, "incorrect parsing of pairs in TartanAir" + tosave = {'train': pairs} + return tosave + +class FlyingChairsDataset(FlowDataset): + + def _prepare_data(self): + self.name = "FlyingChairs" + self._set_root() + assert self.split in ['train','val'] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, 'data', pairname+'_img1.ppm') + self.pairname_to_img2name = lambda pairname: osp.join(self.root, 'data', pairname+'_img2.ppm') + self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'data', pairname+'_flow.flo') + self.pairname_to_str = lambda pairname: pairname + self.load_flow = _read_flo_file + + def _build_cache(self): + split_file = osp.join(self.root, 'chairs_split.txt') + split_list = np.loadtxt(split_file, dtype=np.int32) + trainpairs = ['{:05d}'.format(i) for i in np.where(split_list==1)[0]+1] + valpairs = ['{:05d}'.format(i) for i in np.where(split_list==2)[0]+1] + assert len(trainpairs)==22232 and len(valpairs)==640, "incorrect parsing of pairs in MPI-Sintel" + tosave = {'train': trainpairs, 'val': valpairs} + return tosave + +class FlyingThingsDataset(FlowDataset): + + def _prepare_data(self): + self.name = "FlyingThings" + self._set_root() + assert self.split in [f'{set_}_{pass_}pass{camstr}' for set_ in ['train','test','test1024'] for camstr in ['','_rightcam'] for pass_ in ['clean','final','all']] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[1])) + self.pairname_to_img2name = lambda pairname: osp.join(self.root, f'frames_{pairname[3]}pass', pairname[0].replace('into_future','').replace('into_past',''), '{:04d}.png'.format(pairname[2])) + self.pairname_to_flowname = lambda pairname: osp.join(self.root, 'optical_flow', pairname[0], 'OpticalFlowInto{f:s}_{i:04d}_{c:s}.pfm'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' )) + self.pairname_to_str 
= lambda pairname: os.path.join(pairname[3]+'pass', pairname[0], 'Into{f:s}_{i:04d}_{c:s}'.format(f='Future' if 'future' in pairname[0] else 'Past', i=pairname[1], c='L' if 'left' in pairname[0] else 'R' )) + self.load_flow = _read_pfm_flow + + def _build_cache(self): + tosave = {} + # train and test splits for the different passes + for set_ in ['train', 'test']: + sroot = osp.join(self.root, 'optical_flow', set_.upper()) + fname_to_i = lambda f: int(f[len('OpticalFlowIntoFuture_'):-len('_L.pfm')]) + pp = [(osp.join(set_.upper(), d, s, 'into_future/left'),fname_to_i(fname)) for d in sorted(os.listdir(sroot)) for s in sorted(os.listdir(osp.join(sroot,d))) for fname in sorted(os.listdir(osp.join(sroot,d, s, 'into_future/left')))[:-1]] + pairs = [(a,i,i+1) for a,i in pp] + pairs += [(a.replace('into_future','into_past'),i+1,i) for a,i in pp] + assert len(pairs)=={'train': 40302, 'test': 7866}[set_], "incorrect parsing of pairs Flying Things" + for cam in ['left','right']: + camstr = '' if cam=='left' else f'_{cam}cam' + for pass_ in ['final', 'clean']: + tosave[f'{set_}_{pass_}pass{camstr}'] = [(a.replace('left',cam),i,j,pass_) for a,i,j in pairs] + tosave[f'{set_}_allpass{camstr}'] = tosave[f'{set_}_cleanpass{camstr}'] + tosave[f'{set_}_finalpass{camstr}'] + # test1024: this is the same split as unimatch 'validation' split + # see https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/datasets.py#L229 + test1024_nsamples = 1024 + alltest_nsamples = len(tosave['test_cleanpass']) # 7866 + stride = alltest_nsamples // test1024_nsamples + remove = alltest_nsamples % test1024_nsamples + for cam in ['left','right']: + camstr = '' if cam=='left' else f'_{cam}cam' + for pass_ in ['final','clean']: + tosave[f'test1024_{pass_}pass{camstr}'] = sorted(tosave[f'test_{pass_}pass{camstr}'])[:-remove][::stride] # warning, it was not sorted before + assert len(tosave['test1024_cleanpass'])==1024, "incorrect parsing of pairs in Flying Things" + tosave[f'test1024_allpass{camstr}'] = tosave[f'test1024_cleanpass{camstr}'] + tosave[f'test1024_finalpass{camstr}'] + return tosave + + +class MPISintelDataset(FlowDataset): + + def _prepare_data(self): + self.name = "MPISintel" + self._set_root() + assert self.split in [s+'_'+p for s in ['train','test','subval','subtrain'] for p in ['cleanpass','finalpass','allpass']] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1])) + self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], 'frame_{:04d}.png'.format(pairname[1]+1)) + self.pairname_to_flowname = lambda pairname: None if pairname[0].startswith('test/') else osp.join(self.root, pairname[0].replace('/clean/','/flow/').replace('/final/','/flow/'), 'frame_{:04d}.flo'.format(pairname[1])) + self.pairname_to_str = lambda pairname: osp.join(pairname[0], 'frame_{:04d}'.format(pairname[1])) + self.load_flow = _read_flo_file + + def _build_cache(self): + trainseqs = sorted(os.listdir(self.root+'training/clean')) + trainpairs = [ (osp.join('training/clean', s),i) for s in trainseqs for i in range(1, len(os.listdir(self.root+'training/clean/'+s)))] + subvalseqs = ['temple_2','temple_3'] + subtrainseqs = [s for s in trainseqs if s not in subvalseqs] + subvalpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subvalseqs)] + subtrainpairs = [ (p,i) for p,i in trainpairs if any(s in p for s in subtrainseqs)] + testseqs = sorted(os.listdir(self.root+'test/clean')) + testpairs = [ (osp.join('test/clean', s),i) for s in 
testseqs for i in range(1, len(os.listdir(self.root+'test/clean/'+s)))] + assert len(trainpairs)==1041 and len(testpairs)==552 and len(subvalpairs)==98 and len(subtrainpairs)==943, "incorrect parsing of pairs in MPI-Sintel" + tosave = {} + tosave['train_cleanpass'] = trainpairs + tosave['test_cleanpass'] = testpairs + tosave['subval_cleanpass'] = subvalpairs + tosave['subtrain_cleanpass'] = subtrainpairs + for t in ['train','test','subval','subtrain']: + tosave[t+'_finalpass'] = [(p.replace('/clean/','/final/'),i) for p,i in tosave[t+'_cleanpass']] + tosave[t+'_allpass'] = tosave[t+'_cleanpass'] + tosave[t+'_finalpass'] + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, _time): + assert prediction.shape[2]==2 + outfile = os.path.join(outdir, 'submission', self.pairname_to_str(pairname)+'.flo') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writeFlowFile(prediction, outfile) + + def finalize_submission(self, outdir): + assert self.split == 'test_allpass' + bundle_exe = "/nfs/data/ffs-3d/datasets/StereoFlow/MPI-Sintel/bundler/linux-x64/bundler" # eg + if os.path.isfile(bundle_exe): + cmd = f'{bundle_exe} "{outdir}/submission/test/clean/" "{outdir}/submission/test/final" "{outdir}/submission/bundled.lzma"' + print(cmd) + os.system(cmd) + print(f'Done. Submission file at: "{outdir}/submission/bundled.lzma"') + else: + print('Could not find bundler executable for submission.') + print('Please download it and run:') + print(f' "{outdir}/submission/test/clean/" "{outdir}/submission/test/final" "{outdir}/submission/bundled.lzma"') + +class SpringDataset(FlowDataset): + + def _prepare_data(self): + self.name = "Spring" + self._set_root() + assert self.split in ['train','test','subtrain','subval'] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname[0], pairname[1], 'frame_'+pairname[3], 'frame_{:s}_{:04d}.png'.format(pairname[3], pairname[4])) + self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname[0], pairname[1], 'frame_'+pairname[3], 'frame_{:s}_{:04d}.png'.format(pairname[3], pairname[4]+(1 if pairname[2]=='FW' else -1))) + self.pairname_to_flowname = lambda pairname: None if pairname[0]=='test' else osp.join(self.root, pairname[0], pairname[1], f'flow_{pairname[2]}_{pairname[3]}', f'flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}.flo5') + self.pairname_to_str = lambda pairname: osp.join(pairname[0], pairname[1], f'flow_{pairname[2]}_{pairname[3]}', f'flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}') + self.load_flow = _read_hdf5_flow + + def _build_cache(self): + # train + trainseqs = sorted(os.listdir( osp.join(self.root,'train'))) + trainpairs = [] + for leftright in ['left','right']: + for fwbw in ['FW','BW']: + trainpairs += [('train',s,fwbw,leftright,int(f[len(f'flow_{fwbw}_{leftright}_'):-len('.flo5')])) for s in trainseqs for f in sorted(os.listdir(osp.join(self.root,'train',s,f'flow_{fwbw}_{leftright}')))] + # test + testseqs = sorted(os.listdir( osp.join(self.root,'test'))) + testpairs = [] + for leftright in ['left','right']: + testpairs += [('test',s,'FW',leftright,int(f[len(f'frame_{leftright}_'):-len('.png')])) for s in testseqs for f in sorted(os.listdir(osp.join(self.root,'test',s,f'frame_{leftright}')))[:-1]] + testpairs += [('test',s,'BW',leftright,int(f[len(f'frame_{leftright}_'):-len('.png')])+1) for s in testseqs for f in sorted(os.listdir(osp.join(self.root,'test',s,f'frame_{leftright}')))[:-1]] + # subtrain / subval + subtrainpairs = [p for p in trainpairs if p[1]!='0041'] + 
subvalpairs = [p for p in trainpairs if p[1]=='0041'] + assert len(trainpairs)==19852 and len(testpairs)==3960 and len(subtrainpairs)==19472 and len(subvalpairs)==380, "incorrect parsing of pairs in Spring" + tosave = {'train': trainpairs, 'test': testpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==3 + assert prediction.shape[2]==2 + assert prediction.dtype==np.float32 + outfile = osp.join(outdir, pairname[0], pairname[1], f'flow_{pairname[2]}_{pairname[3]}', f'flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}.flo5') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writeFlo5File(prediction, outfile) + + def finalize_submission(self, outdir): + assert self.split=='test' + exe = "{self.root}/flow_subsampling" + if os.path.isfile(exe): + cmd = f'cd "{outdir}/test"; {exe} .' + print(cmd) + os.system(cmd) + print(f'Done. Submission file at {outdir}/test/flow_submission.hdf5') + else: + print('Could not find flow_subsampling executable for submission.') + print('Please download it and run:') + print(f'cd "{outdir}/test"; .') + + +class Kitti12Dataset(FlowDataset): + + def _prepare_data(self): + self.name = "Kitti12" + self._set_root() + assert self.split in ['train','test'] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname+'_10.png') + self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname+'_11.png') + self.pairname_to_flowname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/colored_0/','/flow_occ/')+'_10.png') + self.pairname_to_str = lambda pairname: pairname.replace('/colored_0/','/') + self.load_flow = _read_kitti_flow + + def _build_cache(self): + trainseqs = ["training/colored_0/%06d"%(i) for i in range(194)] + testseqs = ["testing/colored_0/%06d"%(i) for i in range(195)] + assert len(trainseqs)==194 and len(testseqs)==195, "incorrect parsing of pairs in Kitti12" + tosave = {'train': trainseqs, 'test': testseqs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==3 + assert prediction.shape[2]==2 + outfile = os.path.join(outdir, pairname.split('/')[-1]+'_10.png') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writeFlowKitti(outfile, prediction) + + def finalize_submission(self, outdir): + assert self.split=='test' + cmd = f'cd {outdir}/; zip -r "kitti12_flow_results.zip" .' + print(cmd) + os.system(cmd) + print(f'Done. 
Submission file at {outdir}/kitti12_flow_results.zip') + + +class Kitti15Dataset(FlowDataset): + + def _prepare_data(self): + self.name = "Kitti15" + self._set_root() + assert self.split in ['train','subtrain','subval','test'] + self.pairname_to_img1name = lambda pairname: osp.join(self.root, pairname+'_10.png') + self.pairname_to_img2name = lambda pairname: osp.join(self.root, pairname+'_11.png') + self.pairname_to_flowname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/image_2/','/flow_occ/')+'_10.png') + self.pairname_to_str = lambda pairname: pairname.replace('/image_2/','/') + self.load_flow = _read_kitti_flow + + def _build_cache(self): + trainseqs = ["training/image_2/%06d"%(i) for i in range(200)] + subtrainseqs = trainseqs[:-10] + subvalseqs = trainseqs[-10:] + testseqs = ["testing/image_2/%06d"%(i) for i in range(200)] + assert len(trainseqs)==200 and len(subtrainseqs)==190 and len(subvalseqs)==10 and len(testseqs)==200, "incorrect parsing of pairs in Kitti15" + tosave = {'train': trainseqs, 'subtrain': subtrainseqs, 'subval': subvalseqs, 'test': testseqs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==3 + assert prediction.shape[2]==2 + outfile = os.path.join(outdir, 'flow', pairname.split('/')[-1]+'_10.png') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writeFlowKitti(outfile, prediction) + + def finalize_submission(self, outdir): + assert self.split=='test' + cmd = f'cd {outdir}/; zip -r "kitti15_flow_results.zip" flow' + print(cmd) + os.system(cmd) + print(f'Done. Submission file at {outdir}/kitti15_flow_results.zip') + + +import cv2 +def _read_numpy_flow(filename): + return np.load(filename) + +def _read_pfm_flow(filename): + f, _ = _read_pfm(filename) + assert np.all(f[:,:,2]==0.0) + return np.ascontiguousarray(f[:,:,:2]) + +TAG_FLOAT = 202021.25 # tag to check the sanity of the file +TAG_STRING = 'PIEH' # string containing the tag +MIN_WIDTH = 1 +MAX_WIDTH = 99999 +MIN_HEIGHT = 1 +MAX_HEIGHT = 99999 +def readFlowFile(filename): + """ + readFlowFile() reads a flow file into a 2-band np.array. + if does not exist, an IOError is raised. + if does not finish by '.flo' or the tag, the width, the height or the file's size is illegal, an Expcetion is raised. + ---- PARAMETERS ---- + filename: string containg the name of the file to read a flow + ---- OUTPUTS ---- + a np.array of dimension (height x width x 2) containing the flow of type 'float32' + """ + + # check filename + if not filename.endswith(".flo"): + raise Exception("readFlowFile({:s}): filename must finish with '.flo'".format(filename)) + + # open the file and read it + with open(filename,'rb') as f: + # check tag + tag = struct.unpack('f',f.read(4))[0] + if tag != TAG_FLOAT: + raise Exception("flow_utils.readFlowFile({:s}): wrong tag".format(filename)) + # read dimension + w,h = struct.unpack('ii',f.read(8)) + if w < MIN_WIDTH or w > MAX_WIDTH: + raise Exception("flow_utils.readFlowFile({:s}: illegal width {:d}".format(filename,w)) + if h < MIN_HEIGHT or h > MAX_HEIGHT: + raise Exception("flow_utils.readFlowFile({:s}: illegal height {:d}".format(filename,h)) + flow = np.fromfile(f,'float32') + if not flow.shape == (h*w*2,): + raise Exception("flow_utils.readFlowFile({:s}: illegal size of the file".format(filename)) + flow.shape = (h,w,2) + return flow + +def writeFlowFile(flow,filename): + """ + writeFlowFile(flow,) write flow to the file . + if does not exist, an IOError is raised. 
+ if does not finish with '.flo' or the flow has not 2 bands, an Exception is raised. + ---- PARAMETERS ---- + flow: np.array of dimension (height x width x 2) containing the flow to write + filename: string containg the name of the file to write a flow + """ + + # check filename + if not filename.endswith(".flo"): + raise Exception("flow_utils.writeFlowFile(,{:s}): filename must finish with '.flo'".format(filename)) + + if not flow.shape[2:] == (2,): + raise Exception("flow_utils.writeFlowFile(,{:s}): must have 2 bands".format(filename)) + + + # open the file and write it + with open(filename,'wb') as f: + # write TAG + f.write( TAG_STRING.encode('utf-8') ) + # write dimension + f.write( struct.pack('ii',flow.shape[1],flow.shape[0]) ) + # write the flow + + flow.astype(np.float32).tofile(f) + +_read_flo_file = readFlowFile + +def _read_kitti_flow(filename): + flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR) + flow = flow[:, :, ::-1].astype(np.float32) + valid = flow[:, :, 2]>0 + flow = flow[:, :, :2] + flow = (flow - 2 ** 15) / 64.0 + flow[~valid,0] = np.inf + flow[~valid,1] = np.inf + return flow +_read_hd1k_flow = _read_kitti_flow + + +def writeFlowKitti(filename, uv): + uv = 64.0 * uv + 2 ** 15 + valid = np.ones([uv.shape[0], uv.shape[1], 1]) + uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) + cv2.imwrite(filename, uv[..., ::-1]) + +def writeFlo5File(flow, filename): + with h5py.File(filename, "w") as f: + f.create_dataset("flow", data=flow, compression="gzip", compression_opts=5) + +def _read_hdf5_flow(filename): + flow = np.asarray(h5py.File(filename)['flow']) + flow[np.isnan(flow)] = np.inf # make invalid values as +inf + return flow.astype(np.float32) + +# flow visualization +RY = 15 +YG = 6 +GC = 4 +CB = 11 +BM = 13 +MR = 6 +UNKNOWN_THRESH = 1e9 + +def colorTest(): + """ + flow_utils.colorTest(): display an example of image showing the color encoding scheme + """ + import matplotlib.pylab as plt + truerange = 1 + h,w = 151,151 + trange = truerange*1.04 + s2 = round(h/2) + x,y = np.meshgrid(range(w),range(h)) + u = x*trange/s2-trange + v = y*trange/s2-trange + img = _computeColor(np.concatenate((u[:,:,np.newaxis],v[:,:,np.newaxis]),2)/trange/np.sqrt(2)) + plt.imshow(img) + plt.axis('off') + plt.axhline(round(h/2),color='k') + plt.axvline(round(w/2),color='k') + +def flowToColor(flow, maxflow=None, maxmaxflow=None, saturate=False): + """ + flow_utils.flowToColor(flow): return a color code flow field, normalized based on the maximum l2-norm of the flow + flow_utils.flowToColor(flow,maxflow): return a color code flow field, normalized by maxflow + ---- PARAMETERS ---- + flow: flow to display of shape (height x width x 2) + maxflow (default:None): if given, normalize the flow by its value, otherwise by the flow norm + maxmaxflow (default:None): if given, normalize the flow by the max of its value and the flow norm + ---- OUTPUT ---- + an np.array of shape (height x width x 3) of type uint8 containing a color code of the flow + """ + h,w,n = flow.shape + # check size of flow + assert n == 2, "flow_utils.flowToColor(flow): flow must have 2 bands" + # fix unknown flow + unknown_idx = np.max(np.abs(flow),2)>UNKNOWN_THRESH + flow[unknown_idx] = 0.0 + # compute max flow if needed + if maxflow is None: + maxflow = flowMaxNorm(flow) + if maxmaxflow is not None: + maxflow = min(maxmaxflow, maxflow) + # normalize flow + eps = np.spacing(1) # minimum positive float value to avoid division by 0 + # compute the flow + img = _computeColor(flow/(maxflow+eps), 
saturate=saturate) + # put black pixels in unknown location + img[ np.tile( unknown_idx[:,:,np.newaxis],[1,1,3]) ] = 0.0 + return img + +def flowMaxNorm(flow): + """ + flow_utils.flowMaxNorm(flow): return the maximum of the l2-norm of the given flow + ---- PARAMETERS ---- + flow: the flow + + ---- OUTPUT ---- + a float containing the maximum of the l2-norm of the flow + """ + return np.max( np.sqrt( np.sum( np.square( flow ) , 2) ) ) + +def _computeColor(flow, saturate=True): + """ + flow_utils._computeColor(flow): compute color codes for the flow field flow + + ---- PARAMETERS ---- + flow: np.array of dimension (height x width x 2) containing the flow to display + ---- OUTPUTS ---- + an np.array of dimension (height x width x 3) containing the color conversion of the flow + """ + # set nan to 0 + nanidx = np.isnan(flow[:,:,0]) + flow[nanidx] = 0.0 + + # colorwheel + ncols = RY + YG + GC + CB + BM + MR + nchans = 3 + colorwheel = np.zeros((ncols,nchans),'uint8') + col = 0; + #RY + colorwheel[:RY,0] = 255 + colorwheel[:RY,1] = [(255*i) // RY for i in range(RY)] + col += RY + # YG + colorwheel[col:col+YG,0] = [255 - (255*i) // YG for i in range(YG)] + colorwheel[col:col+YG,1] = 255 + col += YG + # GC + colorwheel[col:col+GC,1] = 255 + colorwheel[col:col+GC,2] = [(255*i) // GC for i in range(GC)] + col += GC + # CB + colorwheel[col:col+CB,1] = [255 - (255*i) // CB for i in range(CB)] + colorwheel[col:col+CB,2] = 255 + col += CB + # BM + colorwheel[col:col+BM,0] = [(255*i) // BM for i in range(BM)] + colorwheel[col:col+BM,2] = 255 + col += BM + # MR + colorwheel[col:col+MR,0] = 255 + colorwheel[col:col+MR,2] = [255 - (255*i) // MR for i in range(MR)] + + # compute utility variables + rad = np.sqrt( np.sum( np.square(flow) , 2) ) # magnitude + a = np.arctan2( -flow[:,:,1] , -flow[:,:,0]) / np.pi # angle + fk = (a+1)/2 * (ncols-1) # map [-1,1] to [0,ncols-1] + k0 = np.floor(fk).astype('int') + k1 = k0+1 + k1[k1==ncols] = 0 + f = fk-k0 + + if not saturate: + rad = np.minimum(rad,1) + + # compute the image + img = np.zeros( (flow.shape[0],flow.shape[1],nchans), 'uint8' ) + for i in range(nchans): + tmp = colorwheel[:,i].astype('float') + col0 = tmp[k0]/255 + col1 = tmp[k1]/255 + col = (1-f)*col0 + f*col1 + idx = (rad <= 1) + col[idx] = 1-rad[idx]*(1-col[idx]) # increase saturation with radius + col[~idx] *= 0.75 # out of range + img[:,:,i] = (255*col*(1-nanidx.astype('float'))).astype('uint8') + + return img + +# flow dataset getter + +def get_train_dataset_flow(dataset_str, augmentor=True, crop_size=None): + dataset_str = dataset_str.replace('(','Dataset(') + if augmentor: + dataset_str = dataset_str.replace(')',', augmentor=True)') + if crop_size is not None: + dataset_str = dataset_str.replace(')',', crop_size={:s})'.format(str(crop_size))) + return eval(dataset_str) + +def get_test_datasets_flow(dataset_str): + dataset_str = dataset_str.replace('(','Dataset(') + return [eval(s) for s in dataset_str.split('+')] \ No newline at end of file diff --git a/dust3r/croco/stereoflow/datasets_stereo.py b/dust3r/croco/stereoflow/datasets_stereo.py new file mode 100644 index 0000000000000000000000000000000000000000..dbdf841a6650afa71ae5782702902c79eba31a5c --- /dev/null +++ b/dust3r/croco/stereoflow/datasets_stereo.py @@ -0,0 +1,674 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +# -------------------------------------------------------- +# Dataset structure for stereo +# -------------------------------------------------------- + +import sys, os +import os.path as osp +import pickle +import numpy as np +from PIL import Image +import json +import h5py +from glob import glob +import cv2 + +import torch +from torch.utils import data + +from .augmentor import StereoAugmentor + + + +dataset_to_root = { + 'CREStereo': './data/stereoflow//crenet_stereo_trainset/stereo_trainset/crestereo/', + 'SceneFlow': './data/stereoflow//SceneFlow/', + 'ETH3DLowRes': './data/stereoflow/eth3d_lowres/', + 'Booster': './data/stereoflow/booster_gt/', + 'Middlebury2021': './data/stereoflow/middlebury/2021/data/', + 'Middlebury2014': './data/stereoflow/middlebury/2014/', + 'Middlebury2006': './data/stereoflow/middlebury/2006/', + 'Middlebury2005': './data/stereoflow/middlebury/2005/train/', + 'MiddleburyEval3': './data/stereoflow/middlebury/MiddEval3/', + 'Spring': './data/stereoflow/spring/', + 'Kitti15': './data/stereoflow/kitti-stereo-2015/', + 'Kitti12': './data/stereoflow/kitti-stereo-2012/', +} +cache_dir = "./data/stereoflow/datasets_stereo_cache/" + + +in1k_mean = torch.tensor([0.485, 0.456, 0.406]).view(3,1,1) +in1k_std = torch.tensor([0.229, 0.224, 0.225]).view(3,1,1) +def img_to_tensor(img): + img = torch.from_numpy(img).permute(2, 0, 1).float() / 255. + img = (img-in1k_mean)/in1k_std + return img +def disp_to_tensor(disp): + return torch.from_numpy(disp)[None,:,:] + +class StereoDataset(data.Dataset): + + def __init__(self, split, augmentor=False, crop_size=None, totensor=True): + self.split = split + if not augmentor: assert crop_size is None + if crop_size: assert augmentor + self.crop_size = crop_size + self.augmentor_str = augmentor + self.augmentor = StereoAugmentor(crop_size) if augmentor else None + self.totensor = totensor + self.rmul = 1 # keep track of rmul + self.has_constant_resolution = True # whether the dataset has constant resolution or not (=> don't use batch_size>1 at test time) + self._prepare_data() + self._load_or_build_cache() + + def prepare_data(self): + """ + to be defined for each dataset + """ + raise NotImplementedError + + def __len__(self): + return len(self.pairnames) + + def __getitem__(self, index): + pairname = self.pairnames[index] + + # get filenames + Limgname = self.pairname_to_Limgname(pairname) + Rimgname = self.pairname_to_Rimgname(pairname) + Ldispname = self.pairname_to_Ldispname(pairname) if self.pairname_to_Ldispname is not None else None + + # load images and disparities + Limg = _read_img(Limgname) + Rimg = _read_img(Rimgname) + disp = self.load_disparity(Ldispname) if Ldispname is not None else None + + # sanity check + if disp is not None: assert np.all(disp>0) or self.name=="Spring", (self.name, pairname, Ldispname) + + # apply augmentations + if self.augmentor is not None: + Limg, Rimg, disp = self.augmentor(Limg, Rimg, disp, self.name) + + if self.totensor: + Limg = img_to_tensor(Limg) + Rimg = img_to_tensor(Rimg) + if disp is None: + disp = torch.tensor([]) # to allow dataloader batching with default collate_gn + else: + disp = disp_to_tensor(disp) + + return Limg, Rimg, disp, str(pairname) + + def __rmul__(self, v): + self.rmul *= v + self.pairnames = v * self.pairnames + return self + + def __str__(self): + return f'{self.__class__.__name__}_{self.split}' + + def __repr__(self): + s = f'{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})' 
+ if self.rmul==1: + s+=f'\n\tnum pairs: {len(self.pairnames)}' + else: + s+=f'\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})' + return s + + def _set_root(self): + self.root = dataset_to_root[self.name] + assert os.path.isdir(self.root), f"could not find root directory for dataset {self.name}: {self.root}" + + def _load_or_build_cache(self): + cache_file = osp.join(cache_dir, self.name+'.pkl') + if osp.isfile(cache_file): + with open(cache_file, 'rb') as fid: + self.pairnames = pickle.load(fid)[self.split] + else: + tosave = self._build_cache() + os.makedirs(cache_dir, exist_ok=True) + with open(cache_file, 'wb') as fid: + pickle.dump(tosave, fid) + self.pairnames = tosave[self.split] + +class CREStereoDataset(StereoDataset): + + def _prepare_data(self): + self.name = 'CREStereo' + self._set_root() + assert self.split in ['train'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'_left.jpg') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname+'_right.jpg') + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname+'_left.disp.png') + self.pairname_to_str = lambda pairname: pairname + self.load_disparity = _read_crestereo_disp + + + def _build_cache(self): + allpairs = [s+'/'+f[:-len('_left.jpg')] for s in sorted(os.listdir(self.root)) for f in sorted(os.listdir(self.root+'/'+s)) if f.endswith('_left.jpg')] + assert len(allpairs)==200000, "incorrect parsing of pairs in CreStereo" + tosave = {'train': allpairs} + return tosave + +class SceneFlowDataset(StereoDataset): + + def _prepare_data(self): + self.name = "SceneFlow" + self._set_root() + assert self.split in ['train_finalpass','train_cleanpass','train_allpass','test_finalpass','test_cleanpass','test_allpass','test1of100_cleanpass','test1of100_finalpass'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname).replace('/left/','/right/') + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname).replace('/frames_finalpass/','/disparity/').replace('/frames_cleanpass/','/disparity/')[:-4]+'.pfm' + self.pairname_to_str = lambda pairname: pairname[:-4] + self.load_disparity = _read_sceneflow_disp + + def _build_cache(self): + trainpairs = [] + # driving + pairs = sorted(glob(self.root+'Driving/frames_finalpass/*/*/*/left/*.png')) + pairs = list(map(lambda x: x[len(self.root):], pairs)) + assert len(pairs) == 4400, "incorrect parsing of pairs in SceneFlow" + trainpairs += pairs + # monkaa + pairs = sorted(glob(self.root+'Monkaa/frames_finalpass/*/left/*.png')) + pairs = list(map(lambda x: x[len(self.root):], pairs)) + assert len(pairs) == 8664, "incorrect parsing of pairs in SceneFlow" + trainpairs += pairs + # flyingthings + pairs = sorted(glob(self.root+'FlyingThings/frames_finalpass/TRAIN/*/*/left/*.png')) + pairs = list(map(lambda x: x[len(self.root):], pairs)) + assert len(pairs) == 22390, "incorrect parsing of pairs in SceneFlow" + trainpairs += pairs + assert len(trainpairs) == 35454, "incorrect parsing of pairs in SceneFlow" + testpairs = sorted(glob(self.root+'FlyingThings/frames_finalpass/TEST/*/*/left/*.png')) + testpairs = list(map(lambda x: x[len(self.root):], testpairs)) + assert len(testpairs) == 4370, "incorrect parsing of pairs in SceneFlow" + test1of100pairs = testpairs[::100] + assert len(test1of100pairs) == 44, "incorrect parsing of pairs in SceneFlow" + # all + tosave = {'train_finalpass': trainpairs, 
+ 'train_cleanpass': list(map(lambda x: x.replace('frames_finalpass','frames_cleanpass'), trainpairs)), + 'test_finalpass': testpairs, + 'test_cleanpass': list(map(lambda x: x.replace('frames_finalpass','frames_cleanpass'), testpairs)), + 'test1of100_finalpass': test1of100pairs, + 'test1of100_cleanpass': list(map(lambda x: x.replace('frames_finalpass','frames_cleanpass'), test1of100pairs)), + } + tosave['train_allpass'] = tosave['train_finalpass']+tosave['train_cleanpass'] + tosave['test_allpass'] = tosave['test_finalpass']+tosave['test_cleanpass'] + return tosave + +class Md21Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Middlebury2021" + self._set_root() + assert self.split in ['train','subtrain','subval'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname.replace('/im0','/im1')) + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname.split('/')[0], 'disp0.pfm') + self.pairname_to_str = lambda pairname: pairname[:-4] + self.load_disparity = _read_middlebury_disp + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + trainpairs = [] + for s in seqs: + #trainpairs += [s+'/im0.png'] # we should remove it, it is included as such in other lightings + trainpairs += [s+'/ambient/'+b+'/'+a for b in sorted(os.listdir(osp.join(self.root,s,'ambient'))) for a in sorted(os.listdir(osp.join(self.root,s,'ambient',b))) if a.startswith('im0')] + assert len(trainpairs)==355 + subtrainpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in seqs[:-2])] + subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in seqs[-2:])] + assert len(subtrainpairs)==335 and len(subvalpairs)==20, "incorrect parsing of pairs in Middlebury 2021" + tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + +class Md14Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Middlebury2014" + self._set_root() + assert self.split in ['train','subtrain','subval'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'im0.png') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'disp0.pfm') + self.pairname_to_str = lambda pairname: pairname[:-4] + self.load_disparity = _read_middlebury_disp + self.has_constant_resolution = False + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + trainpairs = [] + for s in seqs: + trainpairs += [s+'/im1.png',s+'/im1E.png',s+'/im1L.png'] + assert len(trainpairs)==138 + valseqs = ['Umbrella-imperfect','Vintage-perfect'] + assert all(s in seqs for s in valseqs) + subtrainpairs = [p for p in trainpairs if not any(p.startswith(s+'/') for s in valseqs)] + subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in valseqs)] + assert len(subtrainpairs)==132 and len(subvalpairs)==6, "incorrect parsing of pairs in Middlebury 2014" + tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + +class Md06Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Middlebury2006" + self._set_root() + assert self.split in ['train','subtrain','subval'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'view5.png') + 
self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname.split('/')[0], 'disp1.png') + self.load_disparity = _read_middlebury20052006_disp + self.has_constant_resolution = False + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + trainpairs = [] + for s in seqs: + for i in ['Illum1','Illum2','Illum3']: + for e in ['Exp0','Exp1','Exp2']: + trainpairs.append(osp.join(s,i,e,'view1.png')) + assert len(trainpairs)==189 + valseqs = ['Rocks1','Wood2'] + assert all(s in seqs for s in valseqs) + subtrainpairs = [p for p in trainpairs if not any(p.startswith(s+'/') for s in valseqs)] + subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in valseqs)] + assert len(subtrainpairs)==171 and len(subvalpairs)==18, "incorrect parsing of pairs in Middlebury 2006" + tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + +class Md05Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Middlebury2005" + self._set_root() + assert self.split in ['train','subtrain','subval'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, osp.dirname(pairname), 'view5.png') + self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, pairname.split('/')[0], 'disp1.png') + self.pairname_to_str = lambda pairname: pairname[:-4] + self.load_disparity = _read_middlebury20052006_disp + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + trainpairs = [] + for s in seqs: + for i in ['Illum1','Illum2','Illum3']: + for e in ['Exp0','Exp1','Exp2']: + trainpairs.append(osp.join(s,i,e,'view1.png')) + assert len(trainpairs)==54, "incorrect parsing of pairs in Middlebury 2005" + valseqs = ['Reindeer'] + assert all(s in seqs for s in valseqs) + subtrainpairs = [p for p in trainpairs if not any(p.startswith(s+'/') for s in valseqs)] + subvalpairs = [p for p in trainpairs if any(p.startswith(s+'/') for s in valseqs)] + assert len(subtrainpairs)==45 and len(subvalpairs)==9, "incorrect parsing of pairs in Middlebury 2005" + tosave = {'train': trainpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs} + return tosave + +class MdEval3Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "MiddleburyEval3" + self._set_root() + assert self.split in [s+'_'+r for s in ['train','subtrain','subval','test','all'] for r in ['full','half','quarter']] + if self.split.endswith('_full'): + self.root = self.root.replace('/MiddEval3','/MiddEval3_F') + elif self.split.endswith('_half'): + self.root = self.root.replace('/MiddEval3','/MiddEval3_H') + else: + assert self.split.endswith('_quarter') + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname, 'im0.png') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname, 'im1.png') + self.pairname_to_Ldispname = lambda pairname: None if pairname.startswith('test') else osp.join(self.root, pairname, 'disp0GT.pfm') + self.pairname_to_str = lambda pairname: pairname + self.load_disparity = _read_middlebury_disp + # for submission only + self.submission_methodname = "CroCo-Stereo" + self.submission_sresolution = 'F' if self.split.endswith('_full') else ('H' if self.split.endswith('_half') else 'Q') + + def _build_cache(self): + trainpairs = ['train/'+s for s in sorted(os.listdir(self.root+'train/'))] + testpairs = ['test/'+s for s in sorted(os.listdir(self.root+'test/'))] + subvalpairs = trainpairs[-1:] + subtrainpairs = trainpairs[:-1] + 
allpairs = trainpairs+testpairs + assert len(trainpairs)==15 and len(testpairs)==15 and len(subvalpairs)==1 and len(subtrainpairs)==14 and len(allpairs)==30, "incorrect parsing of pairs in Middlebury Eval v3" + tosave = {} + for r in ['full','half','quarter']: + tosave.update(**{'train_'+r: trainpairs, 'subtrain_'+r: subtrainpairs, 'subval_'+r: subvalpairs, 'test_'+r: testpairs, 'all_'+r: allpairs}) + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==2 + assert prediction.dtype==np.float32 + outfile = os.path.join(outdir, pairname.split('/')[0].replace('train','training')+self.submission_sresolution, pairname.split('/')[1], 'disp0'+self.submission_methodname+'.pfm') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writePFM(outfile, prediction) + timefile = os.path.join( os.path.dirname(outfile), "time"+self.submission_methodname+'.txt') + with open(timefile, 'w') as fid: + fid.write(str(time)) + + def finalize_submission(self, outdir): + cmd = f'cd {outdir}/; zip -r "{self.submission_methodname}.zip" .' + print(cmd) + os.system(cmd) + print(f'Done. Submission file at {outdir}/{self.submission_methodname}.zip') + +class ETH3DLowResDataset(StereoDataset): + + def _prepare_data(self): + self.name = "ETH3DLowRes" + self._set_root() + assert self.split in ['train','test','subtrain','subval','all'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname, 'im0.png') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname, 'im1.png') + self.pairname_to_Ldispname = None if self.split=='test' else lambda pairname: None if pairname.startswith('test/') else osp.join(self.root, pairname.replace('train/','train_gt/'), 'disp0GT.pfm') + self.pairname_to_str = lambda pairname: pairname + self.load_disparity = _read_eth3d_disp + self.has_constant_resolution = False + + def _build_cache(self): + trainpairs = ['train/' + s for s in sorted(os.listdir(self.root+'train/'))] + testpairs = ['test/' + s for s in sorted(os.listdir(self.root+'test/'))] + assert len(trainpairs) == 27 and len(testpairs) == 20, "incorrect parsing of pairs in ETH3D Low Res" + subvalpairs = ['train/delivery_area_3s','train/electro_3l','train/playground_3l'] + assert all(p in trainpairs for p in subvalpairs) + subtrainpairs = [p for p in trainpairs if not p in subvalpairs] + assert len(subvalpairs)==3 and len(subtrainpairs)==24, "incorrect parsing of pairs in ETH3D Low Res" + tosave = {'train': trainpairs, 'test': testpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs, 'all': trainpairs+testpairs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==2 + assert prediction.dtype==np.float32 + outfile = os.path.join(outdir, 'low_res_two_view', pairname.split('/')[1]+'.pfm') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + writePFM(outfile, prediction) + timefile = outfile[:-4]+'.txt' + with open(timefile, 'w') as fid: + fid.write('runtime '+str(time)) + + def finalize_submission(self, outdir): + cmd = f'cd {outdir}/; zip -r "eth3d_low_res_two_view_results.zip" low_res_two_view' + print(cmd) + os.system(cmd) + print(f'Done. 
Submission file at {outdir}/eth3d_low_res_two_view_results.zip')
+
+class BoosterDataset(StereoDataset):
+
+    def _prepare_data(self):
+        self.name = "Booster"
+        self._set_root()
+        assert self.split in ['train_balanced','test_balanced','subtrain_balanced','subval_balanced'] # we use only the balanced version
+        self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname)
+        self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname).replace('/camera_00/','/camera_02/')
+        self.pairname_to_Ldispname = lambda pairname: osp.join(self.root, osp.dirname(pairname), '../disp_00.npy') # same images with different colors, same gt per sequence
+        self.pairname_to_str = lambda pairname: pairname[:-4].replace('/camera_00/','/')
+        self.load_disparity = _read_booster_disp
+
+
+    def _build_cache(self):
+        trainseqs = sorted(os.listdir(self.root+'train/balanced'))
+        trainpairs = ['train/balanced/'+s+'/camera_00/'+imname for s in trainseqs for imname in sorted(os.listdir(self.root+'train/balanced/'+s+'/camera_00/'))]
+        testpairs = ['test/balanced/'+s+'/camera_00/'+imname for s in sorted(os.listdir(self.root+'test/balanced')) for imname in sorted(os.listdir(self.root+'test/balanced/'+s+'/camera_00/'))]
+        assert len(trainpairs) == 228 and len(testpairs) == 191
+        subtrainpairs = [p for p in trainpairs if any(s in p for s in trainseqs[:-2])]
+        subvalpairs = [p for p in trainpairs if any(s in p for s in trainseqs[-2:])]
+        # warning: if we do validation split, we should split scenes!!!
+        tosave = {'train_balanced': trainpairs, 'test_balanced': testpairs, 'subtrain_balanced': subtrainpairs, 'subval_balanced': subvalpairs,}
+        return tosave
+
+class SpringDataset(StereoDataset):
+
+    def _prepare_data(self):
+        self.name = "Spring"
+        self._set_root()
+        assert self.split in ['train', 'test', 'subtrain', 'subval']
+        self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'.png')
+        # swap frame_left <-> frame_right via a temporary token to get the right-view image name
+        self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname+'.png').replace('frame_right','frame_right_tmp').replace('frame_left','frame_right').replace('frame_right_tmp','frame_left')
+        self.pairname_to_Ldispname = lambda pairname: None if pairname.startswith('test') else osp.join(self.root, pairname+'.dsp5').replace('frame_left','disp1_left').replace('frame_right','disp1_right')
+        self.pairname_to_str = lambda pairname: pairname
+        self.load_disparity = _read_hdf5_disp
+
+    def _build_cache(self):
+        trainseqs = sorted(os.listdir( osp.join(self.root,'train')))
+        trainpairs = [osp.join('train',s,'frame_left',f[:-4]) for s in trainseqs for f in sorted(os.listdir(osp.join(self.root,'train',s,'frame_left')))]
+        testseqs = sorted(os.listdir( osp.join(self.root,'test')))
+        testpairs = [osp.join('test',s,'frame_left',f[:-4]) for s in testseqs for f in sorted(os.listdir(osp.join(self.root,'test',s,'frame_left')))]
+        testpairs += [p.replace('frame_left','frame_right') for p in testpairs]
+        """maxnorm = {'0001': 32.88, '0002': 228.5, '0004': 298.2, '0005': 142.5, '0006': 113.6, '0007': 27.3, '0008': 554.5, '0009': 155.6, '0010': 126.1, '0011': 87.6, '0012': 303.2, '0013': 24.14, '0014': 82.56, '0015': 98.44, '0016': 156.9, '0017': 28.17, '0018': 21.03, '0020': 178.0, '0021': 58.06, '0022': 354.2, '0023': 8.79, '0024': 97.06, '0025': 55.16, '0026': 91.9, '0027': 156.6, '0030': 200.4, '0032': 58.66, '0033': 373.5, '0036': 149.4, '0037': 5.625, '0038': 37.0, '0039': 12.2, '0041': 453.5, '0043': 457.0, '0044': 379.5, '0045': 161.8, '0047': 105.44} # => let's use 0041"""
+        subtrainpairs = [p for p in trainpairs if p.split('/')[1]!='0041']
+        subvalpairs = [p for p in trainpairs if p.split('/')[1]=='0041']
+        assert len(trainpairs)==5000 and len(testpairs)==2000 and len(subtrainpairs)==4904 and len(subvalpairs)==96, "incorrect parsing of pairs in Spring"
+        tosave = {'train': trainpairs, 'test': testpairs, 'subtrain': subtrainpairs, 'subval': subvalpairs}
+        return tosave
+
+    def submission_save_pairname(self, pairname, prediction, outdir, time):
+        assert prediction.ndim==2
+        assert prediction.dtype==np.float32
+        outfile = os.path.join(outdir, pairname+'.dsp5').replace('frame_left','disp1_left').replace('frame_right','disp1_right')
+        os.makedirs( os.path.dirname(outfile), exist_ok=True)
+        writeDsp5File(prediction, outfile)
+
+    def finalize_submission(self, outdir):
+        assert self.split=='test'
+        exe = f"{self.root}/disp1_subsampling"
+        if os.path.isfile(exe):
+            cmd = f'cd "{outdir}/test"; {exe} .'
+            print(cmd)
+            os.system(cmd)
+        else:
+            print('Could not find disp1_subsampling executable for submission.')
+            print('Please download it and run:')
+            print(f'cd "{outdir}/test"; {exe} .')
+
+class Kitti12Dataset(StereoDataset):
+
+    def _prepare_data(self):
+        self.name = "Kitti12"
+        self._set_root()
+        assert self.split in ['train','test']
+        self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'_10.png')
+        self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname.replace('/colored_0/','/colored_1/')+'_10.png')
+        self.pairname_to_Ldispname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/colored_0/','/disp_occ/')+'_10.png')
+        self.pairname_to_str = lambda pairname: pairname.replace('/colored_0/','/')
+        self.load_disparity = _read_kitti_disp
+
+    def _build_cache(self):
+        trainseqs = ["training/colored_0/%06d"%(i) for i in range(194)]
+        testseqs = ["testing/colored_0/%06d"%(i) for i in range(195)]
+        assert len(trainseqs)==194 and len(testseqs)==195, "incorrect parsing of pairs in Kitti12"
+        tosave = {'train': trainseqs, 'test': testseqs}
+        return tosave
+
+    def submission_save_pairname(self, pairname, prediction, outdir, time):
+        assert prediction.ndim==2
+        assert prediction.dtype==np.float32
+        outfile = os.path.join(outdir, pairname.split('/')[-1]+'_10.png')
+        os.makedirs( os.path.dirname(outfile), exist_ok=True)
+        img = (prediction * 256).astype('uint16')
+        Image.fromarray(img).save(outfile)
+
+    def finalize_submission(self, outdir):
+        assert self.split=='test'
+        cmd = f'cd {outdir}/; zip -r "kitti12_results.zip" .'
+        print(cmd)
+        os.system(cmd)
+        print(f'Done. 
Submission file at {outdir}/kitti12_results.zip') + +class Kitti15Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Kitti15" + self._set_root() + assert self.split in ['train','subtrain','subval','test'] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname+'_10.png') + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname.replace('/image_2/','/image_3/')+'_10.png') + self.pairname_to_Ldispname = None if self.split=='test' else lambda pairname: osp.join(self.root, pairname.replace('/image_2/','/disp_occ_0/')+'_10.png') + self.pairname_to_str = lambda pairname: pairname.replace('/image_2/','/') + self.load_disparity = _read_kitti_disp + + def _build_cache(self): + trainseqs = ["training/image_2/%06d"%(i) for i in range(200)] + subtrainseqs = trainseqs[:-5] + subvalseqs = trainseqs[-5:] + testseqs = ["testing/image_2/%06d"%(i) for i in range(200)] + assert len(trainseqs)==200 and len(subtrainseqs)==195 and len(subvalseqs)==5 and len(testseqs)==200, "incorrect parsing of pairs in Kitti15" + tosave = {'train': trainseqs, 'subtrain': subtrainseqs, 'subval': subvalseqs, 'test': testseqs} + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim==2 + assert prediction.dtype==np.float32 + outfile = os.path.join(outdir, 'disp_0', pairname.split('/')[-1]+'_10.png') + os.makedirs( os.path.dirname(outfile), exist_ok=True) + img = (prediction * 256).astype('uint16') + Image.fromarray(img).save(outfile) + + def finalize_submission(self, outdir): + assert self.split=='test' + cmd = f'cd {outdir}/; zip -r "kitti15_results.zip" disp_0' + print(cmd) + os.system(cmd) + print(f'Done. Submission file at {outdir}/kitti15_results.zip') + + +### auxiliary functions + +def _read_img(filename): + # convert to RGB for scene flow finalpass data + img = np.asarray(Image.open(filename).convert('RGB')) + return img + +def _read_booster_disp(filename): + disp = np.load(filename) + disp[disp==0.0] = np.inf + return disp + +def _read_png_disp(filename, coef=1.0): + disp = np.asarray(Image.open(filename)) + disp = disp.astype(np.float32) / coef + disp[disp==0.0] = np.inf + return disp + +def _read_pfm_disp(filename): + disp = np.ascontiguousarray(_read_pfm(filename)[0]) + disp[disp<=0] = np.inf # eg /nfs/data/ffs-3d/datasets/middlebury/2014/Shopvac-imperfect/disp0.pfm + return disp + +def _read_npy_disp(filename): + return np.load(filename) + +def _read_crestereo_disp(filename): return _read_png_disp(filename, coef=32.0) +def _read_middlebury20052006_disp(filename): return _read_png_disp(filename, coef=1.0) +def _read_kitti_disp(filename): return _read_png_disp(filename, coef=256.0) +_read_sceneflow_disp = _read_pfm_disp +_read_eth3d_disp = _read_pfm_disp +_read_middlebury_disp = _read_pfm_disp +_read_carla_disp = _read_pfm_disp +_read_tartanair_disp = _read_npy_disp + +def _read_hdf5_disp(filename): + disp = np.asarray(h5py.File(filename)['disparity']) + disp[np.isnan(disp)] = np.inf # make invalid values as +inf + #disp[disp==0.0] = np.inf # make invalid values as +inf + return disp.astype(np.float32) + +import re +def _read_pfm(file): + file = open(file, 'rb') + + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().rstrip() + if header.decode("ascii") == 'PF': + color = True + elif header.decode("ascii") == 'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode("ascii")) + 
if dim_match: + width, height = list(map(int, dim_match.groups())) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().decode("ascii").rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + return data, scale + +def writePFM(file, image, scale=1): + file = open(file, 'wb') + + color = None + + if image.dtype.name != 'float32': + raise Exception('Image dtype must be float32.') + + image = np.flipud(image) + + if len(image.shape) == 3 and image.shape[2] == 3: # color image + color = True + elif len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1: # greyscale + color = False + else: + raise Exception('Image must have H x W x 3, H x W x 1 or H x W dimensions.') + + file.write('PF\n' if color else 'Pf\n'.encode()) + file.write('%d %d\n'.encode() % (image.shape[1], image.shape[0])) + + endian = image.dtype.byteorder + + if endian == '<' or endian == '=' and sys.byteorder == 'little': + scale = -scale + + file.write('%f\n'.encode() % scale) + + image.tofile(file) + +def writeDsp5File(disp, filename): + with h5py.File(filename, "w") as f: + f.create_dataset("disparity", data=disp, compression="gzip", compression_opts=5) + + +# disp visualization + +def vis_disparity(disp, m=None, M=None): + if m is None: m = disp.min() + if M is None: M = disp.max() + disp_vis = (disp - m) / (M-m) * 255.0 + disp_vis = disp_vis.astype("uint8") + disp_vis = cv2.applyColorMap(disp_vis, cv2.COLORMAP_INFERNO) + return disp_vis + +# dataset getter + +def get_train_dataset_stereo(dataset_str, augmentor=True, crop_size=None): + dataset_str = dataset_str.replace('(','Dataset(') + if augmentor: + dataset_str = dataset_str.replace(')',', augmentor=True)') + if crop_size is not None: + dataset_str = dataset_str.replace(')',', crop_size={:s})'.format(str(crop_size))) + return eval(dataset_str) + +def get_test_datasets_stereo(dataset_str): + dataset_str = dataset_str.replace('(','Dataset(') + return [eval(s) for s in dataset_str.split('+')] \ No newline at end of file diff --git a/dust3r/croco/stereoflow/download_model.sh b/dust3r/croco/stereoflow/download_model.sh new file mode 100644 index 0000000000000000000000000000000000000000..533119609108c5ec3c22ff79b10e9215c1ac5098 --- /dev/null +++ b/dust3r/croco/stereoflow/download_model.sh @@ -0,0 +1,12 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +model=$1 +outfile="stereoflow_models/${model}" +if [[ ! -f $outfile ]] +then + mkdir -p stereoflow_models/; + wget https://download.europe.naverlabs.com/ComputerVision/CroCo/StereoFlow_models/$1 -P stereoflow_models/; +else + echo "Model ${model} already downloaded in ${outfile}." +fi \ No newline at end of file diff --git a/dust3r/croco/stereoflow/engine.py b/dust3r/croco/stereoflow/engine.py new file mode 100644 index 0000000000000000000000000000000000000000..c057346b99143bf6b9c4666a58215b2b91aca7a6 --- /dev/null +++ b/dust3r/croco/stereoflow/engine.py @@ -0,0 +1,280 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
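The engine below evaluates full images by sliding overlapping crops over them and weight-averaging the per-crop predictions (see tiled_pred). A minimal, self-contained sketch of the window layout it relies on; the helper name and the example sizes here are illustrative and not part of the file:

    import numpy as np

    def overlapping_slices(total, window, overlap=0.5):
        # evenly spaced windows of size `window` covering [0, total) with at least `overlap` overlap
        num_windows = 1 + int(np.ceil((total - window) / ((1 - overlap) * window)))
        offsets = np.linspace(0, total - window, num_windows).round().astype(int)
        return [slice(x, x + window) for x in offsets]

    # e.g. a 1024-pixel dimension tiled with 512-pixel windows and 50% overlap
    # yields crops starting at 0, 256 and 512.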
+ +# -------------------------------------------------------- +# Main function for training one epoch or testing +# -------------------------------------------------------- + +import math +import sys +from typing import Iterable +import numpy as np +import torch +import torchvision + +from utils import misc as misc + + +def split_prediction_conf(predictions, with_conf=False): + if not with_conf: + return predictions, None + conf = predictions[:,-1:,:,:] + predictions = predictions[:,:-1,:,:] + return predictions, conf + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, metrics: torch.nn.Module, + data_loader: Iterable, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, + log_writer=None, print_freq = 20, + args=None): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + + accum_iter = args.accum_iter + + optimizer.zero_grad() + + details = {} + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + if args.img_per_epoch: + iter_per_epoch = args.img_per_epoch // args.batch_size + int(args.img_per_epoch % args.batch_size > 0) + assert len(data_loader) >= iter_per_epoch, 'Dataset is too small for so many iterations' + len_data_loader = iter_per_epoch + else: + len_data_loader, iter_per_epoch = len(data_loader), None + + for data_iter_step, (image1, image2, gt, pairname) in enumerate(metric_logger.log_every(data_loader, print_freq, header, max_iter=iter_per_epoch)): + + image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + gt = gt.to(device, non_blocking=True) + + # we use a per iteration (instead of per epoch) lr scheduler + if data_iter_step % accum_iter == 0: + misc.adjust_learning_rate(optimizer, data_iter_step / len_data_loader + epoch, args) + + with torch.cuda.amp.autocast(enabled=bool(args.amp)): + prediction = model(image1, image2) + prediction, conf = split_prediction_conf(prediction, criterion.with_conf) + batch_metrics = metrics(prediction.detach(), gt) + loss = criterion(prediction, gt) if conf is None else criterion(prediction, gt, conf) + + loss_value = loss.item() + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + loss /= accum_iter + loss_scaler(loss, optimizer, parameters=model.parameters(), + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + for k,v in batch_metrics.items(): + metric_logger.update(**{k: v.item()}) + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(lr=lr) + + #if args.dsitributed: loss_value_reduce = misc.all_reduce_mean(loss_value) + time_to_log = ((data_iter_step + 1) % (args.tboard_log_step * accum_iter) == 0 or data_iter_step == len_data_loader-1) + loss_value_reduce = misc.all_reduce_mean(loss_value) + if log_writer is not None and time_to_log: + epoch_1000x = int((data_iter_step / len_data_loader + epoch) * 1000) + # We use epoch_1000x as the x-axis in tensorboard. This calibrates different curves when batch size changes. 
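+            # e.g. with len_data_loader=1000, data_iter_step=499 and epoch=2,
+            # epoch_1000x = int((499/1000 + 2) * 1000) = 2499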
+ log_writer.add_scalar('train/loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('lr', lr, epoch_1000x) + for k,v in batch_metrics.items(): + log_writer.add_scalar('train/'+k, v.item(), epoch_1000x) + + # gather the stats from all processes + #if args.distributed: metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def validate_one_epoch(model: torch.nn.Module, + criterion: torch.nn.Module, + metrics: torch.nn.Module, + data_loaders: list[Iterable], + device: torch.device, + epoch: int, + log_writer=None, + args=None): + + model.eval() + metric_loggers = [] + header = 'Epoch: [{}]'.format(epoch) + print_freq = 20 + + conf_mode = args.tile_conf_mode + crop = args.crop + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + results = {} + dnames = [] + image1, image2, gt, prediction = None, None, None, None + for didx, data_loader in enumerate(data_loaders): + dname = str(data_loader.dataset) + dnames.append(dname) + metric_loggers.append(misc.MetricLogger(delimiter=" ")) + for data_iter_step, (image1, image2, gt, pairname) in enumerate(metric_loggers[didx].log_every(data_loader, print_freq, header)): + image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + gt = gt.to(device, non_blocking=True) + if dname.startswith('Spring'): + assert gt.size(2)==image1.size(2)*2 and gt.size(3)==image1.size(3)*2 + gt = (gt[:,:,0::2,0::2] + gt[:,:,0::2,1::2] + gt[:,:,1::2,0::2] + gt[:,:,1::2,1::2] ) / 4.0 # we approximate the gt based on the 2x upsampled ones + + with torch.inference_mode(): + prediction, tiled_loss, c = tiled_pred(model, criterion, image1, image2, gt, conf_mode=conf_mode, overlap=args.val_overlap, crop=crop, with_conf=criterion.with_conf) + batch_metrics = metrics(prediction.detach(), gt) + loss = criterion(prediction.detach(), gt) if not criterion.with_conf else criterion(prediction.detach(), gt, c) + loss_value = loss.item() + metric_loggers[didx].update(loss_tiled=tiled_loss.item()) + metric_loggers[didx].update(**{f'loss': loss_value}) + for k,v in batch_metrics.items(): + metric_loggers[didx].update(**{dname+'_' + k: v.item()}) + + results = {k: meter.global_avg for ml in metric_loggers for k, meter in ml.meters.items()} + if len(dnames)>1: + for k in batch_metrics.keys(): + results['AVG_'+k] = sum(results[dname+'_'+k] for dname in dnames) / len(dnames) + + if log_writer is not None : + epoch_1000x = int((1 + epoch) * 1000) + for k,v in results.items(): + log_writer.add_scalar('val/'+k, v, epoch_1000x) + + print("Averaged stats:", results) + return results + +import torch.nn.functional as F +def _resize_img(img, new_size): + return F.interpolate(img, size=new_size, mode='bicubic', align_corners=False) +def _resize_stereo_or_flow(data, new_size): + assert data.ndim==4 + assert data.size(1) in [1,2] + scale_x = new_size[1]/float(data.size(3)) + out = F.interpolate(data, size=new_size, mode='bicubic', align_corners=False) + out[:,0,:,:] *= scale_x + if out.size(1)==2: + scale_y = new_size[0]/float(data.size(2)) + out[:,1,:,:] *= scale_y + print(scale_x, new_size, data.shape) + return out + + +@torch.no_grad() +def tiled_pred(model, criterion, img1, img2, gt, + overlap=0.5, bad_crop_thr=0.05, + downscale=False, crop=512, ret='loss', + conf_mode='conf_expsigmoid_10_5', with_conf=False, + return_time=False): + + # for each image, we are going to run inference on many 
overlapping patches + # then, all predictions will be weighted-averaged + if gt is not None: + B, C, H, W = gt.shape + else: + B, _, H, W = img1.shape + C = model.head.num_channels-int(with_conf) + win_height, win_width = crop[0], crop[1] + + # upscale to be larger than the crop + do_change_scale = H= window and 0 <= overlap < 1, (total, window, overlap) + num_windows = 1 + int(np.ceil( (total - window) / ((1-overlap) * window) )) + offsets = np.linspace(0, total-window, num_windows).round().astype(int) + yield from (slice(x, x+window) for x in offsets) + +def _crop(img, sy, sx): + B, THREE, H, W = img.shape + if 0 <= sy.start and sy.stop <= H and 0 <= sx.start and sx.stop <= W: + return img[:,:,sy,sx] + l, r = max(0,-sx.start), max(0,sx.stop-W) + t, b = max(0,-sy.start), max(0,sy.stop-H) + img = torch.nn.functional.pad(img, (l,r,t,b), mode='constant') + return img[:, :, slice(sy.start+t,sy.stop+t), slice(sx.start+l,sx.stop+l)] \ No newline at end of file diff --git a/dust3r/croco/stereoflow/test.py b/dust3r/croco/stereoflow/test.py new file mode 100644 index 0000000000000000000000000000000000000000..0248e56664c769752595af251e1eadcfa3a479d9 --- /dev/null +++ b/dust3r/croco/stereoflow/test.py @@ -0,0 +1,216 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# Main test function +# -------------------------------------------------------- + +import os +import argparse +import pickle +from PIL import Image +import numpy as np +from tqdm import tqdm + +import torch +from torch.utils.data import DataLoader + +import utils.misc as misc +from models.croco_downstream import CroCoDownstreamBinocular +from models.head_downstream import PixelwiseTaskWithDPT + +from stereoflow.criterion import * +from stereoflow.datasets_stereo import get_test_datasets_stereo +from stereoflow.datasets_flow import get_test_datasets_flow +from stereoflow.engine import tiled_pred + +from stereoflow.datasets_stereo import vis_disparity +from stereoflow.datasets_flow import flowToColor + +def get_args_parser(): + parser = argparse.ArgumentParser('Test CroCo models on stereo/flow', add_help=False) + # important argument + parser.add_argument('--model', required=True, type=str, help='Path to the model to evaluate') + parser.add_argument('--dataset', required=True, type=str, help="test dataset (there can be multiple dataset separated by a +)") + # tiling + parser.add_argument('--tile_conf_mode', type=str, default='', help='Weights for the tiling aggregation based on confidence (empty means use the formula from the loaded checkpoint') + parser.add_argument('--tile_overlap', type=float, default=0.7, help='overlap between tiles') + # save (it will automatically go to _/_) + parser.add_argument('--save', type=str, nargs='+', default=[], + help='what to save: \ + metrics (pickle file), \ + pred (raw prediction save as torch tensor), \ + visu (visualization in png of each prediction), \ + err10 (visualization in png of the error clamp at 10 for each prediction), \ + submission (submission file)') + # other (no impact) + parser.add_argument('--num_workers', default=4, type=int) + return parser + + +def _load_model_and_criterion(model_path, do_load_metrics, device): + print('loading model from', model_path) + assert os.path.isfile(model_path) + ckpt = torch.load(model_path, 'cpu') + + ckpt_args = ckpt['args'] + task = ckpt_args.task + tile_conf_mode = ckpt_args.tile_conf_mode + num_channels 
= {'stereo': 1, 'flow': 2}[task]
+    with_conf = eval(ckpt_args.criterion).with_conf
+    if with_conf: num_channels += 1
+    print('head: PixelwiseTaskWithDPT()')
+    head = PixelwiseTaskWithDPT()
+    head.num_channels = num_channels
+    print('croco_args:', ckpt_args.croco_args)
+    model = CroCoDownstreamBinocular(head, **ckpt_args.croco_args)
+    msg = model.load_state_dict(ckpt['model'], strict=True)
+    model.eval()
+    model = model.to(device)
+
+    if do_load_metrics:
+        if task=='stereo':
+            metrics = StereoDatasetMetrics().to(device)
+        else:
+            metrics = FlowDatasetMetrics().to(device)
+    else:
+        metrics = None
+
+    return model, metrics, ckpt_args.crop, with_conf, task, tile_conf_mode
+
+
+def _save_batch(pred, gt, pairnames, dataset, task, save, outdir, time, submission_dir=None):
+
+    for i in range(len(pairnames)):
+
+        pairname = eval(pairnames[i]) if pairnames[i].startswith('(') else pairnames[i] # unbatch pairname
+        fname = os.path.join(outdir, dataset.pairname_to_str(pairname))
+        os.makedirs(os.path.dirname(fname), exist_ok=True)
+
+        predi = pred[i,...]
+        if gt is not None: gti = gt[i,...]
+
+        if 'pred' in save:
+            torch.save(predi.squeeze(0).cpu(), fname+'_pred.pth')
+
+        if 'visu' in save:
+            if task=='stereo':
+                disparity = predi.permute((1,2,0)).squeeze(2).cpu().numpy()
+                m, M = None, None
+                if gt is not None:
+                    mask = torch.isfinite(gti)
+                    m = gt[mask].min()
+                    M = gt[mask].max()
+                img_disparity = vis_disparity(disparity, m=m, M=M)
+                Image.fromarray(img_disparity).save(fname+'_pred.png')
+            else:
+                # normalize flowToColor according to the maxnorm of gt (or prediction if not available)
+                flowNorm = torch.sqrt(torch.sum( (gti if gt is not None else predi)**2, dim=0)).max().item()
+                imgflow = flowToColor(predi.permute((1,2,0)).cpu().numpy(), maxflow=flowNorm)
+                Image.fromarray(imgflow).save(fname+'_pred.png')
+
+        if 'err10' in save:
+            assert gt is not None
+            L2err = torch.sqrt(torch.sum( (gti-predi)**2, dim=0))
+            valid = torch.isfinite(gti[0,:,:])
+            L2err[~valid] = 0.0
+            L2err = torch.clamp(L2err, max=10.0)
+            red = (L2err*255.0/10.0).to(dtype=torch.uint8)[:,:,None]
+            zer = torch.zeros_like(red)
+            imgerr = torch.cat( (red,zer,zer), dim=2).cpu().numpy()
+            Image.fromarray(imgerr).save(fname+'_err10.png')
+
+        if 'submission' in save:
+            assert submission_dir is not None
+            predi_np = predi.permute(1,2,0).squeeze(2).cpu().numpy() # transform into HxWx2 for flow or HxW for stereo
+            dataset.submission_save_pairname(pairname, predi_np, submission_dir, time)
+
+def main(args):
+
+    # load the pretrained model and metrics
+    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+    model, metrics, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion(args.model, 'metrics' in args.save, device)
+    if args.tile_conf_mode=='': args.tile_conf_mode = tile_conf_mode
+
+    # load the datasets
+    datasets = (get_test_datasets_stereo if task=='stereo' else get_test_datasets_flow)(args.dataset)
+    dataloaders = [DataLoader(dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, pin_memory=True, drop_last=False) for dataset in datasets]
+
+    # run
+    for i,dataloader in enumerate(dataloaders):
+        dataset = datasets[i]
+        dstr = args.dataset.split('+')[i]
+
+        outdir = args.model+'_'+misc.filename(dstr)
+        if 'metrics' in args.save: # define fname whenever metrics are requested (also used when saving them below)
+            fname = os.path.join(outdir, f'conf_{args.tile_conf_mode}_overlap_{args.tile_overlap}.pkl')
+            if os.path.isfile(fname) and len(args.save)==1:
+                print(' metrics already computed in '+fname)
+                with open(fname, 'rb') as fid:
+                    results = 
pickle.load(fid) + for k,v in results.items(): + print('{:s}: {:.3f}'.format(k, v)) + continue + + if 'submission' in args.save: + dirname = f'submission_conf_{args.tile_conf_mode}_overlap_{args.tile_overlap}' + submission_dir = os.path.join(outdir, dirname) + else: + submission_dir = None + + print('') + print('saving {:s} in {:s}'.format('+'.join(args.save), outdir)) + print(repr(dataset)) + + if metrics is not None: + metrics.reset() + + for data_iter_step, (image1, image2, gt, pairnames) in enumerate(tqdm(dataloader)): + + do_flip = (task=='stereo' and dstr.startswith('Spring') and any("right" in p for p in pairnames)) # we flip the images and will flip the prediction after as we assume img1 is on the left + + image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + gt = gt.to(device, non_blocking=True) if gt.numel()>0 else None # special case for test time + if do_flip: + assert all("right" in p for p in pairnames) + image1 = image1.flip(dims=[3]) # this is already the right frame, let's flip it + image2 = image2.flip(dims=[3]) + gt = gt # that is ok + + with torch.inference_mode(): + pred, _, _, time = tiled_pred(model, None, image1, image2, None if dataset.name=='Spring' else gt, conf_mode=args.tile_conf_mode, overlap=args.tile_overlap, crop=cropsize, with_conf=with_conf, return_time=True) + + if do_flip: + pred = pred.flip(dims=[3]) + + if metrics is not None: + metrics.add_batch(pred, gt) + + if any(k in args.save for k in ['pred','visu','err10','submission']): + _save_batch(pred, gt, pairnames, dataset, task, args.save, outdir, time, submission_dir=submission_dir) + + + # print + if metrics is not None: + results = metrics.get_results() + for k,v in results.items(): + print('{:s}: {:.3f}'.format(k, v)) + + # save if needed + if 'metrics' in args.save: + os.makedirs(os.path.dirname(fname), exist_ok=True) + with open(fname, 'wb') as fid: + pickle.dump(results, fid) + print('metrics saved in', fname) + + # finalize submission if needed + if 'submission' in args.save: + dataset.finalize_submission(submission_dir) + + + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + main(args) \ No newline at end of file diff --git a/dust3r/croco/stereoflow/train.py b/dust3r/croco/stereoflow/train.py new file mode 100644 index 0000000000000000000000000000000000000000..91f2414ffbe5ecd547d31c0e2455478d402719d6 --- /dev/null +++ b/dust3r/croco/stereoflow/train.py @@ -0,0 +1,253 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
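For reference, the --dataset string consumed by this training script is rewritten and eval'ed by get_train_dataset_stereo / get_train_dataset_flow (see datasets_stereo.py above). A small illustrative trace of the stereo case, using placeholder dataset names and crop size:

    s = "2*CREStereo('train')+SceneFlow('train_allpass')"
    s = s.replace('(', 'Dataset(')                  # 2*CREStereoDataset('train')+SceneFlowDataset('train_allpass')
    s = s.replace(')', ', augmentor=True)')         # enable the training-time StereoAugmentor
    s = s.replace(')', ', crop_size=[352, 704])')   # append the --crop size to every constructor
    # eval(s) then builds a torch ConcatDataset (via Dataset.__add__), with the CREStereo
    # pairs repeated twice via StereoDataset.__rmul__.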
+ +# -------------------------------------------------------- +# Main training function +# -------------------------------------------------------- + +import argparse +import datetime +import json +import numpy as np +import os +import sys +import time + +import torch +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets +from torch.utils.data import DataLoader + +import utils +import utils.misc as misc +from utils.misc import NativeScalerWithGradNormCount as NativeScaler +from models.croco_downstream import CroCoDownstreamBinocular, croco_args_from_ckpt +from models.pos_embed import interpolate_pos_embed +from models.head_downstream import PixelwiseTaskWithDPT + +from stereoflow.datasets_stereo import get_train_dataset_stereo, get_test_datasets_stereo +from stereoflow.datasets_flow import get_train_dataset_flow, get_test_datasets_flow +from stereoflow.engine import train_one_epoch, validate_one_epoch +from stereoflow.criterion import * + + +def get_args_parser(): + # prepare subparsers + parser = argparse.ArgumentParser('Finetuning CroCo models on stereo or flow', add_help=False) + subparsers = parser.add_subparsers(title="Task (stereo or flow)", dest="task", required=True) + parser_stereo = subparsers.add_parser('stereo', help='Training stereo model') + parser_flow = subparsers.add_parser('flow', help='Training flow model') + def add_arg(name_or_flags, default=None, default_stereo=None, default_flow=None, **kwargs): + if default is not None: assert default_stereo is None and default_flow is None, "setting default makes default_stereo and default_flow disabled" + parser_stereo.add_argument(name_or_flags, default=default if default is not None else default_stereo, **kwargs) + parser_flow.add_argument(name_or_flags, default=default if default is not None else default_flow, **kwargs) + # output dir + add_arg('--output_dir', required=True, type=str, help='path where to save, if empty, automatically created') + # model + add_arg('--crop', type=int, nargs = '+', default_stereo=[352, 704], default_flow=[320, 384], help = "size of the random image crops used during training.") + add_arg('--pretrained', required=True, type=str, help="Load pretrained model (required as croco arguments come from there)") + # criterion + add_arg('--criterion', default_stereo='LaplacianLossBounded2()', default_flow='LaplacianLossBounded()', type=str, help='string to evaluate to get criterion') + add_arg('--bestmetric', default_stereo='avgerr', default_flow='EPE', type=str) + # dataset + add_arg('--dataset', type=str, required=True, help="training set") + # training + add_arg('--seed', default=0, type=int, help='seed') + add_arg('--batch_size', default_stereo=6, default_flow=8, type=int, help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') + add_arg('--epochs', default=32, type=int, help='number of training epochs') + add_arg('--img_per_epoch', type=int, default=None, help='Fix the number of images seen in an epoch (None means use all training pairs)') + add_arg('--accum_iter', default=1, type=int, help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') + add_arg('--weight_decay', type=float, default=0.05, help='weight decay (default: 0.05)') + add_arg('--lr', type=float, default_stereo=3e-5, default_flow=2e-5, metavar='LR', help='learning rate (absolute lr)') + add_arg('--min_lr', 
type=float, default=0., metavar='LR', help='lower lr bound for cyclic schedulers that hit 0') + add_arg('--warmup_epochs', type=int, default=1, metavar='N', help='epochs to warmup LR') + add_arg('--optimizer', default='AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))', type=str, + help="Optimizer from torch.optim [ default: AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) ]") + add_arg('--amp', default=0, type=int, choices=[0,1], help='enable automatic mixed precision training') + # validation + add_arg('--val_dataset', type=str, default='', help="Validation sets, multiple separated by + (empty string means that no validation is performed)") + add_arg('--tile_conf_mode', type=str, default_stereo='conf_expsigmoid_15_3', default_flow='conf_expsigmoid_10_5', help='Weights for tile aggregation') + add_arg('--val_overlap', default=0.7, type=float, help='Overlap value for the tiling') + # others + add_arg('--num_workers', default=8, type=int) + add_arg('--eval_every', type=int, default=1, help='Val loss evaluation frequency') + add_arg('--save_every', type=int, default=1, help='Save checkpoint frequency') + add_arg('--start_from', type=str, default=None, help='Start training using weights from an other model (eg for finetuning)') + add_arg('--tboard_log_step', type=int, default=100, help='Log to tboard every so many steps') + add_arg('--dist_url', default='env://', help='url used to set up distributed training') + + return parser + + +def main(args): + misc.init_distributed_mode(args) + global_rank = misc.get_rank() + num_tasks = misc.get_world_size() + + assert os.path.isfile(args.pretrained) + print("output_dir: "+args.output_dir) + os.makedirs(args.output_dir, exist_ok=True) + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + cudnn.benchmark = True + + # Metrics / criterion + device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + metrics = (StereoMetrics if args.task=='stereo' else FlowMetrics)().to(device) + criterion = eval(args.criterion).to(device) + print('Criterion: ', args.criterion) + + # Prepare model + assert os.path.isfile(args.pretrained) + ckpt = torch.load(args.pretrained, 'cpu') + croco_args = croco_args_from_ckpt(ckpt) + croco_args['img_size'] = (args.crop[0], args.crop[1]) + print('Croco args: '+str(croco_args)) + args.croco_args = croco_args # saved for test time + # prepare head + num_channels = {'stereo': 1, 'flow': 2}[args.task] + if criterion.with_conf: num_channels += 1 + print(f'Building head PixelwiseTaskWithDPT() with {num_channels} channel(s)') + head = PixelwiseTaskWithDPT() + head.num_channels = num_channels + # build model and load pretrained weights + model = CroCoDownstreamBinocular(head, **croco_args) + interpolate_pos_embed(model, ckpt['model']) + msg = model.load_state_dict(ckpt['model'], strict=False) + print(msg) + + total_params = sum(p.numel() for p in model.parameters()) + total_params_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) + print(f"Total params: {total_params}") + print(f"Total params trainable: {total_params_trainable}") + model_without_ddp = model.to(device) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + print("lr: %.2e" % args.lr) + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], static_graph=True) + 
model_without_ddp = model.module + + # following timm: set wd as 0 for bias and norm layers + param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) + optimizer = eval(f"torch.optim.{args.optimizer}") + print(optimizer) + loss_scaler = NativeScaler() + + # automatic restart + last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth') + args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None + + if not args.resume and args.start_from: + print(f"Starting from an other model's weights: {args.start_from}") + best_so_far = None + args.start_epoch = 0 + ckpt = torch.load(args.start_from, 'cpu') + msg = model_without_ddp.load_state_dict(ckpt['model'], strict=False) + print(msg) + else: + best_so_far = misc.load_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler) + + if best_so_far is None: best_so_far = np.inf + + # tensorboard + log_writer = None + if global_rank == 0 and args.output_dir is not None: + log_writer = SummaryWriter(log_dir=args.output_dir, purge_step=args.start_epoch*1000) + + # dataset and loader + print('Building Train Data loader for dataset: ', args.dataset) + train_dataset = (get_train_dataset_stereo if args.task=='stereo' else get_train_dataset_flow)(args.dataset, crop_size=args.crop) + def _print_repr_dataset(d): + if isinstance(d, torch.utils.data.dataset.ConcatDataset): + for dd in d.datasets: + _print_repr_dataset(dd) + else: + print(repr(d)) + _print_repr_dataset(train_dataset) + print(' total length:', len(train_dataset)) + if args.distributed: + sampler_train = torch.utils.data.DistributedSampler( + train_dataset, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + else: + sampler_train = torch.utils.data.RandomSampler(train_dataset) + data_loader_train = torch.utils.data.DataLoader( + train_dataset, sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=True, + drop_last=True, + ) + if args.val_dataset=='': + data_loaders_val = None + else: + print('Building Val Data loader for datasets: ', args.val_dataset) + val_datasets = (get_test_datasets_stereo if args.task=='stereo' else get_test_datasets_flow)(args.val_dataset) + for val_dataset in val_datasets: print(repr(val_dataset)) + data_loaders_val = [DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, pin_memory=True, drop_last=False) for val_dataset in val_datasets] + bestmetric = ("AVG_" if len(data_loaders_val)>1 else str(data_loaders_val[0].dataset)+'_')+args.bestmetric + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + # Training Loop + for epoch in range(args.start_epoch, args.epochs): + + if args.distributed: data_loader_train.sampler.set_epoch(epoch) + + # Train + epoch_start = time.time() + train_stats = train_one_epoch(model, criterion, metrics, data_loader_train, optimizer, device, epoch, loss_scaler, log_writer=log_writer, args=args) + epoch_time = time.time() - epoch_start + + if args.distributed: dist.barrier() + + # Validation (current naive implementation runs the validation on every gpu ... not smart ...) 
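+        # validation below runs every args.eval_every epochs; the 'best' checkpoint is refreshed
+        # whenever the tracked metric improves (lower is better, since best_so_far starts at +inf)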
+ if data_loaders_val is not None and args.eval_every > 0 and (epoch+1) % args.eval_every == 0: + val_epoch_start = time.time() + val_stats = validate_one_epoch(model, criterion, metrics, data_loaders_val, device, epoch, log_writer=log_writer, args=args) + val_epoch_time = time.time() - val_epoch_start + + val_best = val_stats[bestmetric] + + # Save best of all + if val_best <= best_so_far: + best_so_far = val_best + misc.save_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler, epoch=epoch, best_so_far=best_so_far, fname='best') + + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch, + **{f'val_{k}': v for k, v in val_stats.items()}} + else: + log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, + 'epoch': epoch,} + + if args.distributed: dist.barrier() + + # Save stuff + if args.output_dir and ((epoch+1) % args.save_every == 0 or epoch + 1 == args.epochs): + misc.save_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler, epoch=epoch, best_so_far=best_so_far, fname='last') + + if args.output_dir: + if log_writer is not None: + log_writer.flush() + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + main(args) \ No newline at end of file diff --git a/dust3r/croco/utils/misc.py b/dust3r/croco/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..132e102a662c987dce5282633cb8730b0e0d5c2d --- /dev/null +++ b/dust3r/croco/utils/misc.py @@ -0,0 +1,463 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilitary functions for CroCo +# -------------------------------------------------------- +# References: +# MAE: https://github.com/facebookresearch/mae +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import builtins +import datetime +import os +import time +import math +import json +from collections import defaultdict, deque +from pathlib import Path +import numpy as np + +import torch +import torch.distributed as dist +from torch import inf + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
+ """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if v is None: + continue + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None, max_iter=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + len_iterable = min(len(iterable), max_iter) if max_iter else len(iterable) + space_fmt = ':' + str(len(str(len_iterable))) + 'd' + log_msg = [ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ] + if torch.cuda.is_available(): + log_msg.append('max mem: {memory:.0f}') + log_msg = self.delimiter.join(log_msg) + MB = 1024.0 * 1024.0 + for it,obj in enumerate(iterable): + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len_iterable - 1: + eta_seconds = iter_time.global_avg * (len_iterable - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print(log_msg.format( + i, len_iterable, eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB)) + else: + print(log_msg.format( + i, len_iterable, eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + if max_iter and it >= max_iter: + break + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len_iterable)) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + builtin_print = builtins.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + force = force or 
(get_world_size() > 8) + if is_master or force: + now = datetime.datetime.now().time() + builtin_print('[{}] '.format(now), end='') # print with time stamp + builtin_print(*args, **kwargs) + + builtins.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + nodist = args.nodist if hasattr(args,'nodist') else False + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ and not nodist: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + else: + print('Not using distributed mode') + setup_for_distributed(is_master=True) # hack + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}, gpu {}'.format( + args.rank, args.dist_url, args.gpu), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) + + +class NativeScalerWithGradNormCount: + state_dict_key = "amp_scaler" + + def __init__(self, enabled=True): + self._scaler = torch.cuda.amp.GradScaler(enabled=enabled) + + def __call__(self, loss, optimizer, clip_grad=None, parameters=None, create_graph=False, update_grad=True): + self._scaler.scale(loss).backward(create_graph=create_graph) + if update_grad: + if clip_grad is not None: + assert parameters is not None + self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place + norm = torch.nn.utils.clip_grad_norm_(parameters, clip_grad) + else: + self._scaler.unscale_(optimizer) + norm = get_grad_norm_(parameters) + self._scaler.step(optimizer) + self._scaler.update() + else: + norm = None + return norm + + def state_dict(self): + return self._scaler.state_dict() + + def load_state_dict(self, state_dict): + self._scaler.load_state_dict(state_dict) + + +def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor: + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + norm_type = float(norm_type) + if len(parameters) == 0: + return torch.tensor(0.) 
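+ # Per-parameter gradient norms are gathered on the device of the first gradient before being reduced into a single norm.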
+ device = parameters[0].grad.device + if norm_type == inf: + total_norm = max(p.grad.detach().abs().max().to(device) for p in parameters) + else: + total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) + return total_norm + + + + +def save_model(args, epoch, model_without_ddp, optimizer, loss_scaler, fname=None, best_so_far=None): + output_dir = Path(args.output_dir) + if fname is None: fname = str(epoch) + checkpoint_path = output_dir / ('checkpoint-%s.pth' % fname) + to_save = { + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'scaler': loss_scaler.state_dict(), + 'args': args, + 'epoch': epoch, + } + if best_so_far is not None: to_save['best_so_far'] = best_so_far + print(f'>> Saving model to {checkpoint_path} ...') + save_on_master(to_save, checkpoint_path) + + +def load_model(args, model_without_ddp, optimizer, loss_scaler): + args.start_epoch = 0 + best_so_far = None + if args.resume is not None: + if args.resume.startswith('https'): + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location='cpu', check_hash=True) + else: + checkpoint = torch.load(args.resume, map_location='cpu') + print("Resume checkpoint %s" % args.resume) + model_without_ddp.load_state_dict(checkpoint['model'], strict=False) + args.start_epoch = checkpoint['epoch'] + 1 + optimizer.load_state_dict(checkpoint['optimizer']) + if 'scaler' in checkpoint: + loss_scaler.load_state_dict(checkpoint['scaler']) + if 'best_so_far' in checkpoint: + best_so_far = checkpoint['best_so_far'] + print(" & best_so_far={:g}".format(best_so_far)) + else: + print("") + print("With optim & sched! start_epoch={:d}".format(args.start_epoch), end='') + return best_so_far + +def all_reduce_mean(x): + world_size = get_world_size() + if world_size > 1: + x_reduce = torch.tensor(x).cuda() + dist.all_reduce(x_reduce) + x_reduce /= world_size + return x_reduce.item() + else: + return x + +def _replace(text, src, tgt, rm=''): + """ Advanced string replacement. + Given a text: + - replace all elements in src by the corresponding element in tgt + - remove all elements in rm + """ + if len(tgt) == 1: + tgt = tgt * len(src) + assert len(src) == len(tgt), f"'{src}' and '{tgt}' should have the same len" + for s,t in zip(src, tgt): + text = text.replace(s,t) + for c in rm: + text = text.replace(c,'') + return text + +def filename( obj ): + """ transform a python obj or cmd into a proper filename. 
+ - \1 gets replaced by slash '/' + - \2 gets replaced by comma ',' + """ + if not isinstance(obj, str): + obj = repr(obj) + obj = str(obj).replace('()','') + obj = _replace(obj, '_,(*/\1\2','-__x%/,', rm=' )\'"') + assert all(len(s) < 256 for s in obj.split(os.sep)), 'filename too long (>256 characters):\n'+obj + return obj + +def _get_num_layer_for_vit(var_name, enc_depth, dec_depth): + if var_name in ("cls_token", "mask_token", "pos_embed", "global_tokens"): + return 0 + elif var_name.startswith("patch_embed"): + return 0 + elif var_name.startswith("enc_blocks"): + layer_id = int(var_name.split('.')[1]) + return layer_id + 1 + elif var_name.startswith('decoder_embed') or var_name.startswith('enc_norm'): # part of the last black + return enc_depth + elif var_name.startswith('dec_blocks'): + layer_id = int(var_name.split('.')[1]) + return enc_depth + layer_id + 1 + elif var_name.startswith('dec_norm'): # part of the last block + return enc_depth + dec_depth + elif any(var_name.startswith(k) for k in ['head','prediction_head']): + return enc_depth + dec_depth + 1 + else: + raise NotImplementedError(var_name) + +def get_parameter_groups(model, weight_decay, layer_decay=1.0, skip_list=(), no_lr_scale_list=[]): + parameter_group_names = {} + parameter_group_vars = {} + enc_depth, dec_depth = None, None + # prepare layer decay values + assert layer_decay==1.0 or 0.> wrote {fpath}') + + print(f'Loaded {len(list_subscenes)} sub-scenes') + + # separate scenes + list_scenes = defaultdict(list) + for scene in list_subscenes: + scene, id = os.path.split(scene) + list_scenes[scene].append(id) + + list_scenes = list(list_scenes.items()) + print(f'from {len(list_scenes)} scenes in total') + + np.random.shuffle(list_scenes) + train_scenes = list_scenes[len(list_scenes) // 10:] + val_scenes = list_scenes[:len(list_scenes) // 10] + + def write_scene_list(scenes, n, fpath): + sub_scenes = [os.path.join(scene, id) for scene, ids in scenes for id in ids] + np.random.shuffle(sub_scenes) + + if len(sub_scenes) < n: + return + + with open(fpath, 'w') as f: + f.write('\n'.join(sub_scenes[:n])) + print(f'>> wrote {fpath}') + + for n in n_scenes: + write_scene_list(train_scenes, n, os.path.join(habitat_root, f'Habitat_{n}_scenes_train.txt')) + write_scene_list(val_scenes, n // 10, os.path.join(habitat_root, f'Habitat_{n//10}_scenes_val.txt')) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--root", required=True) + parser.add_argument("--n_scenes", nargs='+', default=[1_000, 10_000, 100_000, 1_000_000], type=int) + + args = parser.parse_args() + find_all_scenes(args.root, args.n_scenes) diff --git a/dust3r/datasets_preprocess/habitat/habitat_renderer/__init__.py b/dust3r/datasets_preprocess/habitat/habitat_renderer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e --- /dev/null +++ b/dust3r/datasets_preprocess/habitat/habitat_renderer/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
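For reference, a minimal sketch (not part of the patch) of how the scene-list generation above turns (scene, sub-scene id) pairs into the flat paths that get shuffled and truncated to n entries; sceneA and sceneB are hypothetical names:

import os

scenes = [("sceneA", ["0", "1"]), ("sceneB", ["0", "1"])]   # hypothetical (scene, [sub-scene ids])
sub_scenes = [os.path.join(scene, i) for scene, ids in scenes for i in ids]
print(sub_scenes)   # ['sceneA/0', 'sceneA/1', 'sceneB/0', 'sceneB/1']

write_scene_list then shuffles this list and writes the first n paths, one per line (skipping any n for which there are not enough sub-scenes); with the default --n_scenes values, each n produces a Habitat_{n}_scenes_train.txt and a matching Habitat_{n//10}_scenes_val.txt drawn from the held-out 10% of scenes.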
diff --git a/dust3r/datasets_preprocess/habitat/habitat_renderer/habitat_sim_envmaps_renderer.py b/dust3r/datasets_preprocess/habitat/habitat_renderer/habitat_sim_envmaps_renderer.py new file mode 100644 index 0000000000000000000000000000000000000000..4a31f1174a234b900ecaa76705fa271baf8a5669 --- /dev/null +++ b/dust3r/datasets_preprocess/habitat/habitat_renderer/habitat_sim_envmaps_renderer.py @@ -0,0 +1,170 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Render environment maps from 3D meshes using the Habitat Sim simulator. +# -------------------------------------------------------- +import numpy as np +import habitat_sim +import math +from habitat_renderer import projections + +# OpenCV to habitat camera convention transformation +R_OPENCV2HABITAT = np.stack((habitat_sim.geo.RIGHT, -habitat_sim.geo.UP, habitat_sim.geo.FRONT), axis=0) + +CUBEMAP_FACE_LABELS = ["left", "front", "right", "back", "up", "down"] +# Expressed while considering Habitat coordinates systems +CUBEMAP_FACE_ORIENTATIONS_ROTVEC = [ + [0, math.pi / 2, 0], # Left + [0, 0, 0], # Front + [0, - math.pi / 2, 0], # Right + [0, math.pi, 0], # Back + [math.pi / 2, 0, 0], # Up + [-math.pi / 2, 0, 0],] # Down + +class NoNaviguableSpaceError(RuntimeError): + def __init__(self, *args): + super().__init__(*args) + +class HabitatEnvironmentMapRenderer: + def __init__(self, + scene, + navmesh, + scene_dataset_config_file, + render_equirectangular=False, + equirectangular_resolution=(512, 1024), + render_cubemap=False, + cubemap_resolution=(512, 512), + render_depth=False, + gpu_id=0): + self.scene = scene + self.navmesh = navmesh + self.scene_dataset_config_file = scene_dataset_config_file + self.gpu_id = gpu_id + + self.render_equirectangular = render_equirectangular + self.equirectangular_resolution = equirectangular_resolution + self.equirectangular_projection = projections.EquirectangularProjection(*equirectangular_resolution) + # 3D unit ray associated to each pixel of the equirectangular map + equirectangular_rays = projections.get_projection_rays(self.equirectangular_projection) + # Not needed, but just in case. + equirectangular_rays /= np.linalg.norm(equirectangular_rays, axis=-1, keepdims=True) + # Depth map created by Habitat are produced by warping a cubemap, + # so the values do not correspond to distance to the center and need some scaling. 
+ self.equirectangular_depth_scale_factors = 1.0 / np.max(np.abs(equirectangular_rays), axis=-1) + + self.render_cubemap = render_cubemap + self.cubemap_resolution = cubemap_resolution + + self.render_depth = render_depth + + self.seed = None + self._lazy_initialization() + + def _lazy_initialization(self): + # Lazy random seeding and instantiation of the simulator to deal with multiprocessing properly + if self.seed == None: + # Re-seed numpy generator + np.random.seed() + self.seed = np.random.randint(2**32-1) + sim_cfg = habitat_sim.SimulatorConfiguration() + sim_cfg.scene_id = self.scene + if self.scene_dataset_config_file is not None and self.scene_dataset_config_file != "": + sim_cfg.scene_dataset_config_file = self.scene_dataset_config_file + sim_cfg.random_seed = self.seed + sim_cfg.load_semantic_mesh = False + sim_cfg.gpu_device_id = self.gpu_id + + sensor_specifications = [] + + # Add cubemaps + if self.render_cubemap: + for face_id, orientation in enumerate(CUBEMAP_FACE_ORIENTATIONS_ROTVEC): + rgb_sensor_spec = habitat_sim.CameraSensorSpec() + rgb_sensor_spec.uuid = f"color_cubemap_{CUBEMAP_FACE_LABELS[face_id]}" + rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR + rgb_sensor_spec.resolution = self.cubemap_resolution + rgb_sensor_spec.hfov = 90 + rgb_sensor_spec.position = [0.0, 0.0, 0.0] + rgb_sensor_spec.orientation = orientation + sensor_specifications.append(rgb_sensor_spec) + + if self.render_depth: + depth_sensor_spec = habitat_sim.CameraSensorSpec() + depth_sensor_spec.uuid = f"depth_cubemap_{CUBEMAP_FACE_LABELS[face_id]}" + depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH + depth_sensor_spec.resolution = self.cubemap_resolution + depth_sensor_spec.hfov = 90 + depth_sensor_spec.position = [0.0, 0.0, 0.0] + depth_sensor_spec.orientation = orientation + sensor_specifications.append(depth_sensor_spec) + + # Add equirectangular map + if self.render_equirectangular: + rgb_sensor_spec = habitat_sim.bindings.EquirectangularSensorSpec() + rgb_sensor_spec.uuid = "color_equirectangular" + rgb_sensor_spec.resolution = self.equirectangular_resolution + rgb_sensor_spec.position = [0.0, 0.0, 0.0] + sensor_specifications.append(rgb_sensor_spec) + + if self.render_depth: + depth_sensor_spec = habitat_sim.bindings.EquirectangularSensorSpec() + depth_sensor_spec.uuid = "depth_equirectangular" + depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH + depth_sensor_spec.resolution = self.equirectangular_resolution + depth_sensor_spec.position = [0.0, 0.0, 0.0] + depth_sensor_spec.orientation + sensor_specifications.append(depth_sensor_spec) + + agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=sensor_specifications) + + cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg]) + self.sim = habitat_sim.Simulator(cfg) + if self.navmesh is not None and self.navmesh != "": + # Use pre-computed navmesh (the one generated automatically does some weird stuffs like going on top of the roof) + # See https://youtu.be/kunFMRJAu2U?t=1522 regarding navmeshes + self.sim.pathfinder.load_nav_mesh(self.navmesh) + + # Check that the navmesh is not empty + if not self.sim.pathfinder.is_loaded: + # Try to compute a navmesh + navmesh_settings = habitat_sim.NavMeshSettings() + navmesh_settings.set_defaults() + self.sim.recompute_navmesh(self.sim.pathfinder, navmesh_settings, True) + + # Check that the navmesh is not empty + if not self.sim.pathfinder.is_loaded: + raise NoNaviguableSpaceError(f"No naviguable location (scene: {self.scene} -- navmesh: {self.navmesh})") + 
+ self.agent = self.sim.initialize_agent(agent_id=0) + + def close(self): + if hasattr(self, 'sim'): + self.sim.close() + + def __del__(self): + self.close() + + def render_viewpoint(self, viewpoint_position): + agent_state = habitat_sim.AgentState() + agent_state.position = viewpoint_position + # agent_state.rotation = viewpoint_orientation + self.agent.set_state(agent_state) + viewpoint_observations = self.sim.get_sensor_observations(agent_ids=0) + + try: + # Depth map values have been obtained using cubemap rendering internally, + # so they do not really correspond to distance to the viewpoint in practice + # and they need some scaling + viewpoint_observations["depth_equirectangular"] *= self.equirectangular_depth_scale_factors + except KeyError: + pass + + data = dict(observations=viewpoint_observations, position=viewpoint_position) + return data + + def up_direction(self): + return np.asarray(habitat_sim.geo.UP).tolist() + + def R_cam_to_world(self): + return R_OPENCV2HABITAT.tolist() diff --git a/dust3r/datasets_preprocess/habitat/habitat_renderer/multiview_crop_generator.py b/dust3r/datasets_preprocess/habitat/habitat_renderer/multiview_crop_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..b86238b44a5cdd7a2e30b9d64773c2388f9711c3 --- /dev/null +++ b/dust3r/datasets_preprocess/habitat/habitat_renderer/multiview_crop_generator.py @@ -0,0 +1,93 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Generate pairs of crops from a dataset of environment maps. +# -------------------------------------------------------- +import os +import numpy as np +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" # noqa +import cv2 +import collections +from habitat_renderer import projections, projections_conversions +from habitat_renderer.habitat_sim_envmaps_renderer import HabitatEnvironmentMapRenderer + +ViewpointData = collections.namedtuple("ViewpointData", ["colormap", "distancemap", "pointmap", "position"]) + +class HabitatMultiviewCrops: + def __init__(self, + scene, + navmesh, + scene_dataset_config_file, + equirectangular_resolution=(400, 800), + crop_resolution=(240, 320), + pixel_jittering_iterations=5, + jittering_noise_level=1.0): + self.crop_resolution = crop_resolution + + self.pixel_jittering_iterations = pixel_jittering_iterations + self.jittering_noise_level = jittering_noise_level + + # Instanciate the low resolution habitat sim renderer + self.lowres_envmap_renderer = HabitatEnvironmentMapRenderer(scene=scene, + navmesh=navmesh, + scene_dataset_config_file=scene_dataset_config_file, + equirectangular_resolution=equirectangular_resolution, + render_depth=True, + render_equirectangular=True) + self.R_cam_to_world = np.asarray(self.lowres_envmap_renderer.R_cam_to_world()) + self.up_direction = np.asarray(self.lowres_envmap_renderer.up_direction()) + + # Projection applied by each environment map + self.envmap_height, self.envmap_width = self.lowres_envmap_renderer.equirectangular_resolution + base_projection = projections.EquirectangularProjection(self.envmap_height, self.envmap_width) + self.envmap_projection = projections.RotatedProjection(base_projection, self.R_cam_to_world.T) + # 3D Rays map associated to each envmap + self.envmap_rays = projections.get_projection_rays(self.envmap_projection) + + def compute_pointmap(self, distancemap, position): + # Point cloud associated to each ray + return self.envmap_rays 
* distancemap[:, :, None] + position + + def render_viewpoint_data(self, position): + data = self.lowres_envmap_renderer.render_viewpoint(np.asarray(position)) + colormap = data['observations']['color_equirectangular'][..., :3] # Ignore the alpha channel + distancemap = data['observations']['depth_equirectangular'] + pointmap = self.compute_pointmap(distancemap, position) + return ViewpointData(colormap=colormap, distancemap=distancemap, pointmap=pointmap, position=position) + + def extract_cropped_camera(self, projection, color_image, distancemap, pointmap, voxelmap=None): + remapper = projections_conversions.RemapProjection(input_projection=self.envmap_projection, output_projection=projection, + pixel_jittering_iterations=self.pixel_jittering_iterations, jittering_noise_level=self.jittering_noise_level) + cropped_color_image = remapper.convert( + color_image, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_WRAP, single_map=False) + cropped_distancemap = remapper.convert( + distancemap, interpolation=cv2.INTER_NEAREST, borderMode=cv2.BORDER_WRAP, single_map=True) + cropped_pointmap = remapper.convert(pointmap, interpolation=cv2.INTER_NEAREST, + borderMode=cv2.BORDER_WRAP, single_map=True) + cropped_voxelmap = (None if voxelmap is None else + remapper.convert(voxelmap, interpolation=cv2.INTER_NEAREST, borderMode=cv2.BORDER_WRAP, single_map=True)) + # Convert the distance map into a depth map + cropped_depthmap = np.asarray( + cropped_distancemap / np.linalg.norm(remapper.output_rays, axis=-1), dtype=cropped_distancemap.dtype) + + return cropped_color_image, cropped_depthmap, cropped_pointmap, cropped_voxelmap + +def perspective_projection_to_dict(persp_projection, position): + """ + Serialization-like function.""" + camera_params = dict(camera_intrinsics=projections.colmap_to_opencv_intrinsics(persp_projection.base_projection.K).tolist(), + size=(persp_projection.base_projection.width, persp_projection.base_projection.height), + R_cam2world=persp_projection.R_to_base_projection.T.tolist(), + t_cam2world=position) + return camera_params + + +def dict_to_perspective_projection(camera_params): + K = projections.opencv_to_colmap_intrinsics(np.asarray(camera_params["camera_intrinsics"])) + size = camera_params["size"] + R_cam2world = np.asarray(camera_params["R_cam2world"]) + projection = projections.PerspectiveProjection(K, height=size[1], width=size[0]) + projection = projections.RotatedProjection(projection, R_to_base_projection=R_cam2world.T) + position = camera_params["t_cam2world"] + return projection, position \ No newline at end of file diff --git a/dust3r/datasets_preprocess/habitat/habitat_renderer/projections.py b/dust3r/datasets_preprocess/habitat/habitat_renderer/projections.py new file mode 100644 index 0000000000000000000000000000000000000000..4db1f79d23e23a8ba144b4357c4d4daf10cf8fab --- /dev/null +++ b/dust3r/datasets_preprocess/habitat/habitat_renderer/projections.py @@ -0,0 +1,151 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Various 3D/2D projection utils, useful to sample virtual cameras. 
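+# Each projection class exposes project() (3D rays -> pixel coordinates) and unproject() (pixel coordinates -> 3D rays).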
+# -------------------------------------------------------- +import numpy as np + +class EquirectangularProjection: + """ + Convention for the central pixel of the equirectangular map similar to OpenCV perspective model: + +X from left to right + +Y from top to bottom + +Z going outside the camera + EXCEPT that the top left corner of the image is assumed to have (0,0) coordinates (OpenCV assumes (-0.5,-0.5)) + """ + + def __init__(self, height, width): + self.height = height + self.width = width + self.u_scaling = (2 * np.pi) / self.width + self.v_scaling = np.pi / self.height + + def unproject(self, u, v): + """ + Args: + u, v: 2D coordinates + Returns: + unnormalized 3D rays. + """ + longitude = self.u_scaling * u - np.pi + minus_latitude = self.v_scaling * v - np.pi/2 + + cos_latitude = np.cos(minus_latitude) + x, z = np.sin(longitude) * cos_latitude, np.cos(longitude) * cos_latitude + y = np.sin(minus_latitude) + + rays = np.stack([x, y, z], axis=-1) + return rays + + def project(self, rays): + """ + Args: + rays: Bx3 array of 3D rays. + Returns: + u, v: tuple of 2D coordinates. + """ + rays = rays / np.linalg.norm(rays, axis=-1, keepdims=True) + x, y, z = [rays[..., i] for i in range(3)] + + longitude = np.arctan2(x, z) + minus_latitude = np.arcsin(y) + + u = (longitude + np.pi) * (1.0 / self.u_scaling) + v = (minus_latitude + np.pi/2) * (1.0 / self.v_scaling) + return u, v + + +class PerspectiveProjection: + """ + OpenCV convention: + World space: + +X from left to right + +Y from top to bottom + +Z going outside the camera + Pixel space: + +u from left to right + +v from top to bottom + EXCEPT that the top left corner of the image is assumed to have (0,0) coordinates (OpenCV assumes (-0.5,-0.5)). + """ + + def __init__(self, K, height, width): + self.height = height + self.width = width + self.K = K + self.Kinv = np.linalg.inv(K) + + def project(self, rays): + uv_homogeneous = np.einsum("ik, ...k -> ...i", self.K, rays) + uv = uv_homogeneous[..., :2] / uv_homogeneous[..., 2, None] + return uv[..., 0], uv[..., 1] + + def unproject(self, u, v): + uv_homogeneous = np.stack((u, v, np.ones_like(u)), axis=-1) + rays = np.einsum("ik, ...k -> ...i", self.Kinv, uv_homogeneous) + return rays + + +class RotatedProjection: + def __init__(self, base_projection, R_to_base_projection): + self.base_projection = base_projection + self.R_to_base_projection = R_to_base_projection + + @property + def width(self): + return self.base_projection.width + + @property + def height(self): + return self.base_projection.height + + def project(self, rays): + if self.R_to_base_projection is not None: + rays = np.einsum("ik, ...k -> ...i", self.R_to_base_projection, rays) + return self.base_projection.project(rays) + + def unproject(self, u, v): + rays = self.base_projection.unproject(u, v) + if self.R_to_base_projection is not None: + rays = np.einsum("ik, ...k -> ...i", self.R_to_base_projection.T, rays) + return rays + +def get_projection_rays(projection, noise_level=0): + """ + Return a 2D map of 3D rays corresponding to the projection. + If noise_level > 0, add some jittering noise to these rays. 
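+ The returned array has shape (height, width, 3).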
+ """ + grid_u, grid_v = np.meshgrid(0.5 + np.arange(projection.width), 0.5 + np.arange(projection.height)) + if noise_level > 0: + grid_u += np.clip(0, noise_level * np.random.uniform(-0.5, 0.5, size=grid_u.shape), projection.width) + grid_v += np.clip(0, noise_level * np.random.uniform(-0.5, 0.5, size=grid_v.shape), projection.height) + return projection.unproject(grid_u, grid_v) + +def compute_camera_intrinsics(height, width, hfov): + f = width/2 / np.tan(hfov/2 * np.pi/180) + cu, cv = width/2, height/2 + return f, cu, cv + +def colmap_to_opencv_intrinsics(K): + """ + Modify camera intrinsics to follow a different convention. + Coordinates of the center of the top-left pixels are by default: + - (0.5, 0.5) in Colmap + - (0,0) in OpenCV + """ + K = K.copy() + K[0, 2] -= 0.5 + K[1, 2] -= 0.5 + return K + +def opencv_to_colmap_intrinsics(K): + """ + Modify camera intrinsics to follow a different convention. + Coordinates of the center of the top-left pixels are by default: + - (0.5, 0.5) in Colmap + - (0,0) in OpenCV + """ + K = K.copy() + K[0, 2] += 0.5 + K[1, 2] += 0.5 + return K \ No newline at end of file diff --git a/dust3r/datasets_preprocess/habitat/habitat_renderer/projections_conversions.py b/dust3r/datasets_preprocess/habitat/habitat_renderer/projections_conversions.py new file mode 100644 index 0000000000000000000000000000000000000000..4bcfed4066bbac62fa4254ea6417bf429b098b75 --- /dev/null +++ b/dust3r/datasets_preprocess/habitat/habitat_renderer/projections_conversions.py @@ -0,0 +1,45 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Remap data from one projection to an other +# -------------------------------------------------------- +import numpy as np +import cv2 +from habitat_renderer import projections + +class RemapProjection: + def __init__(self, input_projection, output_projection, pixel_jittering_iterations=0, jittering_noise_level=0): + """ + Some naive random jittering can be introduced in the remapping to mitigate aliasing artecfacts. 
+ """ + assert jittering_noise_level >= 0 + assert pixel_jittering_iterations >= 0 + + maps = [] + # Initial map + self.output_rays = projections.get_projection_rays(output_projection) + map_u, map_v = input_projection.project(self.output_rays) + map_u, map_v = np.asarray(map_u, dtype=np.float32), np.asarray(map_v, dtype=np.float32) + maps.append((map_u, map_v)) + + for _ in range(pixel_jittering_iterations): + # Define multiple mappings using some coordinates jittering to mitigate aliasing effects + crop_rays = projections.get_projection_rays(output_projection, jittering_noise_level) + map_u, map_v = input_projection.project(crop_rays) + map_u, map_v = np.asarray(map_u, dtype=np.float32), np.asarray(map_v, dtype=np.float32) + maps.append((map_u, map_v)) + self.maps = maps + + def convert(self, img, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_WRAP, single_map=False): + remapped = [] + for map_u, map_v in self.maps: + res = cv2.remap(img, map_u, map_v, interpolation=interpolation, borderMode=borderMode) + remapped.append(res) + if single_map: + break + if len(remapped) == 1: + res = remapped[0] + else: + res = np.asarray(np.mean(remapped, axis=0), dtype=img.dtype) + return res diff --git a/dust3r/datasets_preprocess/habitat/preprocess_habitat.py b/dust3r/datasets_preprocess/habitat/preprocess_habitat.py new file mode 100644 index 0000000000000000000000000000000000000000..cacbe2467a8e9629c2472b0e05fc0cf8326367e2 --- /dev/null +++ b/dust3r/datasets_preprocess/habitat/preprocess_habitat.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# main executable for preprocessing habitat +# export METADATA_DIR="/path/to/habitat/5views_v1_512x512_metadata" +# export SCENES_DIR="/path/to/habitat/data/scene_datasets/" +# export OUTPUT_DIR="data/habitat_processed" +# export PYTHONPATH=$(pwd) +# python preprocess_habitat.py --scenes_dir=$SCENES_DIR --metadata_dir=$METADATA_DIR --output_dir=$OUTPUT_DIR | parallel -j 16 +# -------------------------------------------------------- +import os +import glob +import json +import os + +import PIL.Image +import json +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" # noqa +import cv2 +from habitat_renderer import multiview_crop_generator +from tqdm import tqdm + + +def preprocess_metadata(metadata_filename, + scenes_dir, + output_dir, + crop_resolution=[512, 512], + equirectangular_resolution=None, + fix_existing_dataset=False): + # Load data + with open(metadata_filename, "r") as f: + metadata = json.load(f) + + if metadata["scene_dataset_config_file"] == "": + scene = os.path.join(scenes_dir, metadata["scene"]) + scene_dataset_config_file = "" + else: + scene = metadata["scene"] + scene_dataset_config_file = os.path.join(scenes_dir, metadata["scene_dataset_config_file"]) + navmesh = None + + # Use 4 times the crop size as resolution for rendering the environment map. + max_res = max(crop_resolution) + + if equirectangular_resolution == None: + # Use 4 times the crop size as resolution for rendering the environment map. 
+ max_res = max(crop_resolution) + equirectangular_resolution = (4*max_res, 8*max_res) + + print("equirectangular_resolution:", equirectangular_resolution) + + if os.path.exists(output_dir) and not fix_existing_dataset: + raise FileExistsError(output_dir) + + # Lazy initialization + highres_dataset = None + + for batch_label, batch in tqdm(metadata["view_batches"].items()): + for view_label, view_params in batch.items(): + + assert view_params["size"] == crop_resolution + label = f"{batch_label}_{view_label}" + + output_camera_params_filename = os.path.join(output_dir, f"{label}_camera_params.json") + if fix_existing_dataset and os.path.isfile(output_camera_params_filename): + # Skip generation if we are fixing a dataset and the corresponding output file already exists + continue + + # Lazy initialization + if highres_dataset is None: + highres_dataset = multiview_crop_generator.HabitatMultiviewCrops(scene=scene, + navmesh=navmesh, + scene_dataset_config_file=scene_dataset_config_file, + equirectangular_resolution=equirectangular_resolution, + crop_resolution=crop_resolution,) + os.makedirs(output_dir, exist_ok=bool(fix_existing_dataset)) + + # Generate a higher resolution crop + original_projection, position = multiview_crop_generator.dict_to_perspective_projection(view_params) + # Render an envmap at the given position + viewpoint_data = highres_dataset.render_viewpoint_data(position) + + projection = original_projection + colormap, depthmap, pointmap, _ = highres_dataset.extract_cropped_camera( + projection, viewpoint_data.colormap, viewpoint_data.distancemap, viewpoint_data.pointmap) + + camera_params = multiview_crop_generator.perspective_projection_to_dict(projection, position) + + # Color image + PIL.Image.fromarray(colormap).save(os.path.join(output_dir, f"{label}.jpeg")) + # Depth image + cv2.imwrite(os.path.join(output_dir, f"{label}_depth.exr"), + depthmap, [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF]) + # Camera parameters + with open(output_camera_params_filename, "w") as f: + json.dump(camera_params, f) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--metadata_dir", required=True) + parser.add_argument("--scenes_dir", required=True) + parser.add_argument("--output_dir", required=True) + parser.add_argument("--metadata_filename", default="") + + args = parser.parse_args() + + if args.metadata_filename == "": + # Walk through the metadata dir to generate commandlines + for filename in glob.iglob(os.path.join(args.metadata_dir, "**/metadata.json"), recursive=True): + output_dir = os.path.join(args.output_dir, os.path.relpath(os.path.dirname(filename), args.metadata_dir)) + if not os.path.exists(output_dir): + commandline = f"python {__file__} --metadata_filename={filename} --metadata_dir={args.metadata_dir} --scenes_dir={args.scenes_dir} --output_dir={output_dir}" + print(commandline) + else: + preprocess_metadata(metadata_filename=args.metadata_filename, + scenes_dir=args.scenes_dir, + output_dir=args.output_dir) diff --git a/dust3r/datasets_preprocess/path_to_root.py b/dust3r/datasets_preprocess/path_to_root.py new file mode 100644 index 0000000000000000000000000000000000000000..6e076a17a408d0a9e043fbda2d73f1592e7cb71a --- /dev/null +++ b/dust3r/datasets_preprocess/path_to_root.py @@ -0,0 +1,13 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# DUSt3R repo root import +# -------------------------------------------------------- + +import sys +import os.path as path +HERE_PATH = path.normpath(path.dirname(__file__)) +DUST3R_REPO_PATH = path.normpath(path.join(HERE_PATH, '../')) +# workaround for sibling import +sys.path.insert(0, DUST3R_REPO_PATH) diff --git a/dust3r/datasets_preprocess/preprocess_arkitscenes.py b/dust3r/datasets_preprocess/preprocess_arkitscenes.py new file mode 100644 index 0000000000000000000000000000000000000000..5dbc103a82d646293e1d81f5132683e2b08cd879 --- /dev/null +++ b/dust3r/datasets_preprocess/preprocess_arkitscenes.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Script to pre-process the arkitscenes dataset. +# Usage: +# python3 datasets_preprocess/preprocess_arkitscenes.py --arkitscenes_dir /path/to/arkitscenes --precomputed_pairs /path/to/arkitscenes_pairs +# -------------------------------------------------------- +import os +import json +import os.path as osp +import decimal +import argparse +import math +from bisect import bisect_left +from PIL import Image +import numpy as np +import quaternion +from scipy import interpolate +import cv2 + + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('--arkitscenes_dir', required=True) + parser.add_argument('--precomputed_pairs', required=True) + parser.add_argument('--output_dir', default='data/arkitscenes_processed') + return parser + + +def value_to_decimal(value, decimal_places): + decimal.getcontext().rounding = decimal.ROUND_HALF_UP # define rounding method + return decimal.Decimal(str(float(value))).quantize(decimal.Decimal('1e-{}'.format(decimal_places))) + + +def closest(value, sorted_list): + index = bisect_left(sorted_list, value) + if index == 0: + return sorted_list[0] + elif index == len(sorted_list): + return sorted_list[-1] + else: + value_before = sorted_list[index - 1] + value_after = sorted_list[index] + if value_after - value < value - value_before: + return value_after + else: + return value_before + + +def get_up_vectors(pose_device_to_world): + return np.matmul(pose_device_to_world, np.array([[0.0], [-1.0], [0.0], [0.0]])) + + +def get_right_vectors(pose_device_to_world): + return np.matmul(pose_device_to_world, np.array([[1.0], [0.0], [0.0], [0.0]])) + + +def read_traj(traj_path): + quaternions = [] + poses = [] + timestamps = [] + poses_p_to_w = [] + with open(traj_path) as f: + traj_lines = f.readlines() + for line in traj_lines: + tokens = line.split() + assert len(tokens) == 7 + traj_timestamp = float(tokens[0]) + + timestamps_decimal_value = value_to_decimal(traj_timestamp, 3) + timestamps.append(float(timestamps_decimal_value)) # for spline interpolation + + angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])] + r_w_to_p, _ = cv2.Rodrigues(np.asarray(angle_axis)) + t_w_to_p = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])]) + + pose_w_to_p = np.eye(4) + pose_w_to_p[:3, :3] = r_w_to_p + pose_w_to_p[:3, 3] = t_w_to_p + + pose_p_to_w = np.linalg.inv(pose_w_to_p) + + r_p_to_w_as_quat = quaternion.from_rotation_matrix(pose_p_to_w[:3, :3]) + t_p_to_w = pose_p_to_w[:3, 3] + poses_p_to_w.append(pose_p_to_w) + poses.append(t_p_to_w) + quaternions.append(r_p_to_w_as_quat) + return timestamps, poses, quaternions, poses_p_to_w 
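+
+# For example, value_to_decimal(1.23456, 3) yields Decimal('1.235') with ROUND_HALF_UP,
+# and closest(0.8, [0.0, 0.5, 1.0, 1.5]) returns 1.0, clamping out-of-range queries to
+# the nearest endpoint (closest(2.3, [0.0, 0.5, 1.0, 1.5]) returns 1.5).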
+ + +def main(rootdir, pairsdir, outdir): + os.makedirs(outdir, exist_ok=True) + + subdirs = ['Test', 'Training'] + for subdir in subdirs: + if not osp.isdir(osp.join(rootdir, subdir)): + continue + # STEP 1: list all scenes + outsubdir = osp.join(outdir, subdir) + os.makedirs(outsubdir, exist_ok=True) + listfile = osp.join(pairsdir, subdir, 'scene_list.json') + with open(listfile, 'r') as f: + scene_dirs = json.load(f) + + valid_scenes = [] + for scene_subdir in scene_dirs: + out_scene_subdir = osp.join(outsubdir, scene_subdir) + os.makedirs(out_scene_subdir, exist_ok=True) + + scene_dir = osp.join(rootdir, subdir, scene_subdir) + depth_dir = osp.join(scene_dir, 'lowres_depth') + rgb_dir = osp.join(scene_dir, 'vga_wide') + intrinsics_dir = osp.join(scene_dir, 'vga_wide_intrinsics') + traj_path = osp.join(scene_dir, 'lowres_wide.traj') + + # STEP 2: read selected_pairs.npz + selected_pairs_path = osp.join(pairsdir, subdir, scene_subdir, 'selected_pairs.npz') + selected_npz = np.load(selected_pairs_path) + selection, pairs = selected_npz['selection'], selected_npz['pairs'] + selected_sky_direction_scene = str(selected_npz['sky_direction_scene'][0]) + if len(selection) == 0 or len(pairs) == 0: + # not a valid scene + continue + valid_scenes.append(scene_subdir) + + # STEP 3: parse the scene and export the list of valid (K, pose, rgb, depth) and convert images + scene_metadata_path = osp.join(out_scene_subdir, 'scene_metadata.npz') + if osp.isfile(scene_metadata_path): + continue + else: + print(f'parsing {scene_subdir}') + # loads traj + timestamps, poses, quaternions, poses_cam_to_world = read_traj(traj_path) + + poses = np.array(poses) + quaternions = np.array(quaternions, dtype=np.quaternion) + quaternions = quaternion.unflip_rotors(quaternions) + timestamps = np.array(timestamps) + + selected_images = [(basename, basename.split(".png")[0].split("_")[1]) for basename in selection] + timestamps_selected = [float(frame_id) for _, frame_id in selected_images] + + sky_direction_scene, trajectories, intrinsics, images = convert_scene_metadata(scene_subdir, + intrinsics_dir, + timestamps, + quaternions, + poses, + poses_cam_to_world, + selected_images, + timestamps_selected) + assert selected_sky_direction_scene == sky_direction_scene + + os.makedirs(os.path.join(out_scene_subdir, 'vga_wide'), exist_ok=True) + os.makedirs(os.path.join(out_scene_subdir, 'lowres_depth'), exist_ok=True) + assert isinstance(sky_direction_scene, str) + for basename in images: + img_out = os.path.join(out_scene_subdir, 'vga_wide', basename.replace('.png', '.jpg')) + depth_out = os.path.join(out_scene_subdir, 'lowres_depth', basename) + if osp.isfile(img_out) and osp.isfile(depth_out): + continue + + vga_wide_path = osp.join(rgb_dir, basename) + depth_path = osp.join(depth_dir, basename) + + img = Image.open(vga_wide_path) + depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED) + + # rotate the image + if sky_direction_scene == 'RIGHT': + try: + img = img.transpose(Image.Transpose.ROTATE_90) + except Exception: + img = img.transpose(Image.ROTATE_90) + depth = cv2.rotate(depth, cv2.ROTATE_90_COUNTERCLOCKWISE) + elif sky_direction_scene == 'LEFT': + try: + img = img.transpose(Image.Transpose.ROTATE_270) + except Exception: + img = img.transpose(Image.ROTATE_270) + depth = cv2.rotate(depth, cv2.ROTATE_90_CLOCKWISE) + elif sky_direction_scene == 'DOWN': + try: + img = img.transpose(Image.Transpose.ROTATE_180) + except Exception: + img = img.transpose(Image.ROTATE_180) + depth = cv2.rotate(depth, cv2.ROTATE_180) + + W, H = 
img.size + if not osp.isfile(img_out): + img.save(img_out) + + depth = cv2.resize(depth, (W, H), interpolation=cv2.INTER_NEAREST_EXACT) + if not osp.isfile(depth_out): # avoid destroying the base dataset when you mess up the paths + cv2.imwrite(depth_out, depth) + + # save at the end + np.savez(scene_metadata_path, + trajectories=trajectories, + intrinsics=intrinsics, + images=images, + pairs=pairs) + + outlistfile = osp.join(outsubdir, 'scene_list.json') + with open(outlistfile, 'w') as f: + json.dump(valid_scenes, f) + + # STEP 5: concat all scene_metadata.npz into a single file + scene_data = {} + for scene_subdir in valid_scenes: + scene_metadata_path = osp.join(outsubdir, scene_subdir, 'scene_metadata.npz') + with np.load(scene_metadata_path) as data: + trajectories = data['trajectories'] + intrinsics = data['intrinsics'] + images = data['images'] + pairs = data['pairs'] + scene_data[scene_subdir] = {'trajectories': trajectories, + 'intrinsics': intrinsics, + 'images': images, + 'pairs': pairs} + offset = 0 + counts = [] + scenes = [] + sceneids = [] + images = [] + intrinsics = [] + trajectories = [] + pairs = [] + for scene_idx, (scene_subdir, data) in enumerate(scene_data.items()): + num_imgs = data['images'].shape[0] + img_pairs = data['pairs'] + + scenes.append(scene_subdir) + sceneids.extend([scene_idx] * num_imgs) + + images.append(data['images']) + + K = np.expand_dims(np.eye(3), 0).repeat(num_imgs, 0) + K[:, 0, 0] = [fx for _, _, fx, _, _, _ in data['intrinsics']] + K[:, 1, 1] = [fy for _, _, _, fy, _, _ in data['intrinsics']] + K[:, 0, 2] = [hw for _, _, _, _, hw, _ in data['intrinsics']] + K[:, 1, 2] = [hh for _, _, _, _, _, hh in data['intrinsics']] + + intrinsics.append(K) + trajectories.append(data['trajectories']) + + # offset pairs + img_pairs[:, 0:2] += offset + pairs.append(img_pairs) + counts.append(offset) + + offset += num_imgs + + images = np.concatenate(images, axis=0) + intrinsics = np.concatenate(intrinsics, axis=0) + trajectories = np.concatenate(trajectories, axis=0) + pairs = np.concatenate(pairs, axis=0) + np.savez(osp.join(outsubdir, 'all_metadata.npz'), + counts=counts, + scenes=scenes, + sceneids=sceneids, + images=images, + intrinsics=intrinsics, + trajectories=trajectories, + pairs=pairs) + + +def convert_scene_metadata(scene_subdir, intrinsics_dir, + timestamps, quaternions, poses, poses_cam_to_world, + selected_images, timestamps_selected): + # find scene orientation + sky_direction_scene, rotated_to_cam = find_scene_orientation(poses_cam_to_world) + + # find/compute pose for selected timestamps + # most images have a valid timestamp / exact pose associated + timestamps_selected = np.array(timestamps_selected) + spline = interpolate.interp1d(timestamps, poses, kind='linear', axis=0) + interpolated_rotations = quaternion.squad(quaternions, timestamps, timestamps_selected) + interpolated_positions = spline(timestamps_selected) + + trajectories = [] + intrinsics = [] + images = [] + for i, (basename, frame_id) in enumerate(selected_images): + intrinsic_fn = osp.join(intrinsics_dir, f"{scene_subdir}_{frame_id}.pincam") + if not osp.exists(intrinsic_fn): + intrinsic_fn = osp.join(intrinsics_dir, f"{scene_subdir}_{float(frame_id) - 0.001:.3f}.pincam") + if not osp.exists(intrinsic_fn): + intrinsic_fn = osp.join(intrinsics_dir, f"{scene_subdir}_{float(frame_id) + 0.001:.3f}.pincam") + assert osp.exists(intrinsic_fn) + w, h, fx, fy, hw, hh = np.loadtxt(intrinsic_fn) # PINHOLE + + pose = np.eye(4) + pose[:3, :3] = 
quaternion.as_rotation_matrix(interpolated_rotations[i]) + pose[:3, 3] = interpolated_positions[i] + + images.append(basename) + if sky_direction_scene == 'RIGHT' or sky_direction_scene == 'LEFT': + intrinsics.append([h, w, fy, fx, hh, hw]) # swapped intrinsics + else: + intrinsics.append([w, h, fx, fy, hw, hh]) + trajectories.append(pose @ rotated_to_cam) # pose_cam_to_world @ rotated_to_cam = rotated(cam) to world + + return sky_direction_scene, trajectories, intrinsics, images + + +def find_scene_orientation(poses_cam_to_world): + if len(poses_cam_to_world) > 0: + up_vector = sum(get_up_vectors(p) for p in poses_cam_to_world) / len(poses_cam_to_world) + right_vector = sum(get_right_vectors(p) for p in poses_cam_to_world) / len(poses_cam_to_world) + up_world = np.array([[0.0], [0.0], [1.0], [0.0]]) + else: + up_vector = np.array([[0.0], [-1.0], [0.0], [0.0]]) + right_vector = np.array([[1.0], [0.0], [0.0], [0.0]]) + up_world = np.array([[0.0], [0.0], [1.0], [0.0]]) + + # value between 0, 180 + device_up_to_world_up_angle = np.arccos(np.clip(np.dot(np.transpose(up_world), + up_vector), -1.0, 1.0)).item() * 180.0 / np.pi + device_right_to_world_up_angle = np.arccos(np.clip(np.dot(np.transpose(up_world), + right_vector), -1.0, 1.0)).item() * 180.0 / np.pi + + up_closest_to_90 = abs(device_up_to_world_up_angle - 90.0) < abs(device_right_to_world_up_angle - 90.0) + if up_closest_to_90: + assert abs(device_up_to_world_up_angle - 90.0) < 45.0 + # LEFT + if device_right_to_world_up_angle > 90.0: + sky_direction_scene = 'LEFT' + cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, math.pi / 2.0]) + else: + # note that in metadata.csv RIGHT does not exist, but again it's not accurate... + # well, turns out there are scenes oriented like this + # for example Training/41124801 + sky_direction_scene = 'RIGHT' + cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, -math.pi / 2.0]) + else: + # right is close to 90 + assert abs(device_right_to_world_up_angle - 90.0) < 45.0 + if device_up_to_world_up_angle > 90.0: + sky_direction_scene = 'DOWN' + cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, math.pi]) + else: + sky_direction_scene = 'UP' + cam_to_rotated_q = quaternion.quaternion(1, 0, 0, 0) + cam_to_rotated = np.eye(4) + cam_to_rotated[:3, :3] = quaternion.as_rotation_matrix(cam_to_rotated_q) + rotated_to_cam = np.linalg.inv(cam_to_rotated) + return sky_direction_scene, rotated_to_cam + + +if __name__ == '__main__': + parser = get_parser() + args = parser.parse_args() + main(args.arkitscenes_dir, args.precomputed_pairs, args.output_dir) diff --git a/dust3r/datasets_preprocess/preprocess_blendedMVS.py b/dust3r/datasets_preprocess/preprocess_blendedMVS.py new file mode 100644 index 0000000000000000000000000000000000000000..d22793793c1219ebb1b3ba8eff51226c2b13f657 --- /dev/null +++ b/dust3r/datasets_preprocess/preprocess_blendedMVS.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Preprocessing code for the BlendedMVS dataset +# dataset at https://github.com/YoYo000/BlendedMVS +# 1) Download BlendedMVS.zip +# 2) Download BlendedMVS+.zip +# 3) Download BlendedMVS++.zip +# 4) Unzip everything in the same /path/to/tmp/blendedMVS/ directory +# 5) python datasets_preprocess/preprocess_blendedMVS.py --blendedmvs_dir /path/to/tmp/blendedMVS/ +# -------------------------------------------------------- +import os +import os.path as osp +import re +from tqdm import tqdm +import numpy as np +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" +import cv2 + +import path_to_root # noqa +from dust3r.utils.parallel import parallel_threads +from dust3r.datasets.utils import cropping # noqa + + +def get_parser(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--blendedmvs_dir', required=True) + parser.add_argument('--precomputed_pairs', required=True) + parser.add_argument('--output_dir', default='data/blendedmvs_processed') + return parser + + +def main(db_root, pairs_path, output_dir): + print('>> Listing all sequences') + sequences = [f for f in os.listdir(db_root) if len(f) == 24] + # should find 502 scenes + assert sequences, f'did not found any sequences at {db_root}' + print(f' (found {len(sequences)} sequences)') + + for i, seq in enumerate(tqdm(sequences)): + out_dir = osp.join(output_dir, seq) + os.makedirs(out_dir, exist_ok=True) + + # generate the crops + root = osp.join(db_root, seq) + cam_dir = osp.join(root, 'cams') + func_args = [(root, f[:-8], out_dir) for f in os.listdir(cam_dir) if not f.startswith('pair')] + parallel_threads(load_crop_and_save, func_args, star_args=True, leave=False) + + # verify that all pairs are there + pairs = np.load(pairs_path) + for seqh, seql, img1, img2, score in tqdm(pairs): + for view_index in [img1, img2]: + impath = osp.join(output_dir, f"{seqh:08x}{seql:016x}", f"{view_index:08n}.jpg") + assert osp.isfile(impath), f'missing image at {impath=}' + + print(f'>> Done, saved everything in {output_dir}/') + + +def load_crop_and_save(root, img, out_dir): + if osp.isfile(osp.join(out_dir, img + '.npz')): + return # already done + + # load everything + intrinsics_in, R_camin2world, t_camin2world = _load_pose(osp.join(root, 'cams', img + '_cam.txt')) + color_image_in = cv2.cvtColor(cv2.imread(osp.join(root, 'blended_images', img + + '.jpg'), cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB) + depthmap_in = load_pfm_file(osp.join(root, 'rendered_depth_maps', img + '.pfm')) + + # do the crop + H, W = color_image_in.shape[:2] + assert H * 4 == W * 3 + image, depthmap, intrinsics_out, R_in2out = _crop_image(intrinsics_in, color_image_in, depthmap_in, (512, 384)) + + # write everything + image.save(osp.join(out_dir, img + '.jpg'), quality=80) + cv2.imwrite(osp.join(out_dir, img + '.exr'), depthmap) + + # New camera parameters + R_camout2world = R_camin2world @ R_in2out.T + t_camout2world = t_camin2world + np.savez(osp.join(out_dir, img + '.npz'), intrinsics=intrinsics_out, + R_cam2world=R_camout2world, t_cam2world=t_camout2world) + + +def _crop_image(intrinsics_in, color_image_in, depthmap_in, resolution_out=(800, 800)): + image, depthmap, intrinsics_out = cropping.rescale_image_depthmap( + color_image_in, depthmap_in, intrinsics_in, resolution_out) + R_in2out = np.eye(3) + return image, depthmap, intrinsics_out, R_in2out + + +def _load_pose(path, ret_44=False): + f = open(path) + RT = np.loadtxt(f, skiprows=1, max_rows=4, dtype=np.float32) + assert RT.shape == (4, 4) + 
RT = np.linalg.inv(RT) # world2cam to cam2world + + K = np.loadtxt(f, skiprows=2, max_rows=3, dtype=np.float32) + assert K.shape == (3, 3) + + if ret_44: + return K, RT + return K, RT[:3, :3], RT[:3, 3] # , depth_uint8_to_f32 + + +def load_pfm_file(file_path): + with open(file_path, 'rb') as file: + header = file.readline().decode('UTF-8').strip() + + if header == 'PF': + is_color = True + elif header == 'Pf': + is_color = False + else: + raise ValueError('The provided file is not a valid PFM file.') + + dimensions = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode('UTF-8')) + if dimensions: + img_width, img_height = map(int, dimensions.groups()) + else: + raise ValueError('Invalid PFM header format.') + + endian_scale = float(file.readline().decode('UTF-8').strip()) + if endian_scale < 0: + dtype = '= img_size * 3/4, and max dimension will be >= img_size")) + return parser + + +def convert_ndc_to_pinhole(focal_length, principal_point, image_size): + focal_length = np.array(focal_length) + principal_point = np.array(principal_point) + image_size_wh = np.array([image_size[1], image_size[0]]) + half_image_size = image_size_wh / 2 + rescale = half_image_size.min() + principal_point_px = half_image_size - principal_point * rescale + focal_length_px = focal_length * rescale + fx, fy = focal_length_px[0], focal_length_px[1] + cx, cy = principal_point_px[0], principal_point_px[1] + K = np.array([[fx, 0.0, cx], [0.0, fy, cy], [0.0, 0.0, 1.0]], dtype=np.float32) + return K + + +def opencv_from_cameras_projection(R, T, focal, p0, image_size): + R = torch.from_numpy(R)[None, :, :] + T = torch.from_numpy(T)[None, :] + focal = torch.from_numpy(focal)[None, :] + p0 = torch.from_numpy(p0)[None, :] + image_size = torch.from_numpy(image_size)[None, :] + + R_pytorch3d = R.clone() + T_pytorch3d = T.clone() + focal_pytorch3d = focal + p0_pytorch3d = p0 + T_pytorch3d[:, :2] *= -1 + R_pytorch3d[:, :, :2] *= -1 + tvec = T_pytorch3d + R = R_pytorch3d.permute(0, 2, 1) + + # Retype the image_size correctly and flip to width, height. + image_size_wh = image_size.to(R).flip(dims=(1,)) + + # NDC to screen conversion. 
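+ # Focal length and principal point are scaled by half the smaller image dimension; the principal point is additionally offset to the image center.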
+ scale = image_size_wh.to(R).min(dim=1, keepdim=True)[0] / 2.0 + scale = scale.expand(-1, 2) + c0 = image_size_wh / 2.0 + + principal_point = -p0_pytorch3d * scale + c0 + focal_length = focal_pytorch3d * scale + + camera_matrix = torch.zeros_like(R) + camera_matrix[:, :2, 2] = principal_point + camera_matrix[:, 2, 2] = 1.0 + camera_matrix[:, 0, 0] = focal_length[:, 0] + camera_matrix[:, 1, 1] = focal_length[:, 1] + return R[0], tvec[0], camera_matrix[0] + + +def get_set_list(category_dir, split, is_single_sequence_subset=False): + listfiles = os.listdir(osp.join(category_dir, "set_lists")) + if is_single_sequence_subset: + # not all objects have manyview_dev + subset_list_files = [f for f in listfiles if "manyview_dev" in f] + else: + subset_list_files = [f for f in listfiles if f"fewview_train" in f] + + sequences_all = [] + for subset_list_file in subset_list_files: + with open(osp.join(category_dir, "set_lists", subset_list_file)) as f: + subset_lists_data = json.load(f) + sequences_all.extend(subset_lists_data[split]) + + return sequences_all + + +def prepare_sequences(category, co3d_dir, output_dir, img_size, split, min_quality, max_num_sequences_per_object, + seed, is_single_sequence_subset=False): + random.seed(seed) + category_dir = osp.join(co3d_dir, category) + category_output_dir = osp.join(output_dir, category) + sequences_all = get_set_list(category_dir, split, is_single_sequence_subset) + sequences_numbers = sorted(set(seq_name for seq_name, _, _ in sequences_all)) + + frame_file = osp.join(category_dir, "frame_annotations.jgz") + sequence_file = osp.join(category_dir, "sequence_annotations.jgz") + + with gzip.open(frame_file, "r") as fin: + frame_data = json.loads(fin.read()) + with gzip.open(sequence_file, "r") as fin: + sequence_data = json.loads(fin.read()) + + frame_data_processed = {} + for f_data in frame_data: + sequence_name = f_data["sequence_name"] + frame_data_processed.setdefault(sequence_name, {})[f_data["frame_number"]] = f_data + + good_quality_sequences = set() + for seq_data in sequence_data: + if seq_data["viewpoint_quality_score"] > min_quality: + good_quality_sequences.add(seq_data["sequence_name"]) + + sequences_numbers = [seq_name for seq_name in sequences_numbers if seq_name in good_quality_sequences] + if len(sequences_numbers) < max_num_sequences_per_object: + selected_sequences_numbers = sequences_numbers + else: + selected_sequences_numbers = random.sample(sequences_numbers, max_num_sequences_per_object) + + selected_sequences_numbers_dict = {seq_name: [] for seq_name in selected_sequences_numbers} + sequences_all = [(seq_name, frame_number, filepath) + for seq_name, frame_number, filepath in sequences_all + if seq_name in selected_sequences_numbers_dict] + + for seq_name, frame_number, filepath in tqdm(sequences_all): + frame_idx = int(filepath.split('/')[-1][5:-4]) + selected_sequences_numbers_dict[seq_name].append(frame_idx) + mask_path = filepath.replace("images", "masks").replace(".jpg", ".png") + frame_data = frame_data_processed[seq_name][frame_number] + focal_length = frame_data["viewpoint"]["focal_length"] + principal_point = frame_data["viewpoint"]["principal_point"] + image_size = frame_data["image"]["size"] + K = convert_ndc_to_pinhole(focal_length, principal_point, image_size) + R, tvec, camera_intrinsics = opencv_from_cameras_projection(np.array(frame_data["viewpoint"]["R"]), + np.array(frame_data["viewpoint"]["T"]), + np.array(focal_length), + np.array(principal_point), + np.array(image_size)) + + frame_data = 
frame_data_processed[seq_name][frame_number] + depth_path = os.path.join(co3d_dir, frame_data["depth"]["path"]) + assert frame_data["depth"]["scale_adjustment"] == 1.0 + image_path = os.path.join(co3d_dir, filepath) + mask_path_full = os.path.join(co3d_dir, mask_path) + + input_rgb_image = PIL.Image.open(image_path).convert('RGB') + input_mask = plt.imread(mask_path_full) + + with PIL.Image.open(depth_path) as depth_pil: + # the image is stored with 16-bit depth but PIL reads it as I (32 bit). + # we cast it to uint16, then reinterpret as float16, then cast to float32 + input_depthmap = ( + np.frombuffer(np.array(depth_pil, dtype=np.uint16), dtype=np.float16) + .astype(np.float32) + .reshape((depth_pil.size[1], depth_pil.size[0]))) + depth_mask = np.stack((input_depthmap, input_mask), axis=-1) + H, W = input_depthmap.shape + + camera_intrinsics = camera_intrinsics.numpy() + cx, cy = camera_intrinsics[:2, 2].round().astype(int) + min_margin_x = min(cx, W - cx) + min_margin_y = min(cy, H - cy) + + # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy) + l, t = cx - min_margin_x, cy - min_margin_y + r, b = cx + min_margin_x, cy + min_margin_y + crop_bbox = (l, t, r, b) + input_rgb_image, depth_mask, input_camera_intrinsics = cropping.crop_image_depthmap( + input_rgb_image, depth_mask, camera_intrinsics, crop_bbox) + + # try to set the lower dimension to img_size * 3/4 -> img_size=512 => 384 + scale_final = ((img_size * 3 // 4) / min(H, W)) + 1e-8 + output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int) + if max(output_resolution) < img_size: + # let's put the max dimension to img_size + scale_final = (img_size / max(H, W)) + 1e-8 + output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int) + + input_rgb_image, depth_mask, input_camera_intrinsics = cropping.rescale_image_depthmap( + input_rgb_image, depth_mask, input_camera_intrinsics, output_resolution) + input_depthmap = depth_mask[:, :, 0] + input_mask = depth_mask[:, :, 1] + + # generate and adjust camera pose + camera_pose = np.eye(4, dtype=np.float32) + camera_pose[:3, :3] = R + camera_pose[:3, 3] = tvec + camera_pose = np.linalg.inv(camera_pose) + + # save crop images and depth, metadata + save_img_path = os.path.join(output_dir, filepath) + save_depth_path = os.path.join(output_dir, frame_data["depth"]["path"]) + save_mask_path = os.path.join(output_dir, mask_path) + os.makedirs(os.path.split(save_img_path)[0], exist_ok=True) + os.makedirs(os.path.split(save_depth_path)[0], exist_ok=True) + os.makedirs(os.path.split(save_mask_path)[0], exist_ok=True) + + input_rgb_image.save(save_img_path) + scaled_depth_map = (input_depthmap / np.max(input_depthmap) * 65535).astype(np.uint16) + cv2.imwrite(save_depth_path, scaled_depth_map) + cv2.imwrite(save_mask_path, (input_mask * 255).astype(np.uint8)) + + save_meta_path = save_img_path.replace('jpg', 'npz') + np.savez(save_meta_path, camera_intrinsics=input_camera_intrinsics, + camera_pose=camera_pose, maximum_depth=np.max(input_depthmap)) + + return selected_sequences_numbers_dict + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + assert args.co3d_dir != args.output_dir + if args.category is None: + if args.single_sequence_subset: + categories = SINGLE_SEQUENCE_CATEGORIES + else: + categories = CATEGORIES + else: + categories = [args.category] + os.makedirs(args.output_dir, exist_ok=True) + + for split in ['train', 'test']: + selected_sequences_path = os.path.join(args.output_dir, 
f'selected_seqs_{split}.json') + if os.path.isfile(selected_sequences_path): + continue + + all_selected_sequences = {} + for category in categories: + category_output_dir = osp.join(args.output_dir, category) + os.makedirs(category_output_dir, exist_ok=True) + category_selected_sequences_path = os.path.join(category_output_dir, f'selected_seqs_{split}.json') + if os.path.isfile(category_selected_sequences_path): + with open(category_selected_sequences_path, 'r') as fid: + category_selected_sequences = json.load(fid) + else: + print(f"Processing {split} - category = {category}") + category_selected_sequences = prepare_sequences( + category=category, + co3d_dir=args.co3d_dir, + output_dir=args.output_dir, + img_size=args.img_size, + split=split, + min_quality=args.min_quality, + max_num_sequences_per_object=args.num_sequences_per_object, + seed=args.seed + CATEGORIES_IDX[category], + is_single_sequence_subset=args.single_sequence_subset + ) + with open(category_selected_sequences_path, 'w') as file: + json.dump(category_selected_sequences, file) + + all_selected_sequences[category] = category_selected_sequences + with open(selected_sequences_path, 'w') as file: + json.dump(all_selected_sequences, file) diff --git a/dust3r/datasets_preprocess/preprocess_megadepth.py b/dust3r/datasets_preprocess/preprocess_megadepth.py new file mode 100644 index 0000000000000000000000000000000000000000..b07c0c5dff0cfd828f9ce4fd204cf2eaa22487f1 --- /dev/null +++ b/dust3r/datasets_preprocess/preprocess_megadepth.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Preprocessing code for the MegaDepth dataset +# dataset at https://www.cs.cornell.edu/projects/megadepth/ +# -------------------------------------------------------- +import os +import os.path as osp +import collections +from tqdm import tqdm +import numpy as np +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" +import cv2 +import h5py + +import path_to_root # noqa +from dust3r.utils.parallel import parallel_threads +from dust3r.datasets.utils import cropping # noqa + + +def get_parser(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--megadepth_dir', required=True) + parser.add_argument('--precomputed_pairs', required=True) + parser.add_argument('--output_dir', default='data/megadepth_processed') + return parser + + +def main(db_root, pairs_path, output_dir): + os.makedirs(output_dir, exist_ok=True) + + # load all pairs + data = np.load(pairs_path, allow_pickle=True) + scenes = data['scenes'] + images = data['images'] + pairs = data['pairs'] + + # enumerate all unique images + todo = collections.defaultdict(set) + for scene, im1, im2, score in pairs: + todo[scene].add(im1) + todo[scene].add(im2) + + # for each scene, load intrinsics and then parallel crops + for scene, im_idxs in tqdm(todo.items(), desc='Overall'): + scene, subscene = scenes[scene].split() + out_dir = osp.join(output_dir, scene, subscene) + os.makedirs(out_dir, exist_ok=True) + + # load all camera params + _, pose_w2cam, intrinsics = _load_kpts_and_poses(db_root, scene, subscene, intrinsics=True) + + in_dir = osp.join(db_root, scene, 'dense' + subscene) + args = [(in_dir, img, intrinsics[img], pose_w2cam[img], out_dir) + for img in [images[im_id] for im_id in im_idxs]] + parallel_threads(resize_one_image, args, star_args=True, front_num=0, leave=False, 
desc=f'{scene}/{subscene}') + + # save pairs + print('Done! prepared all pairs in', output_dir) + + +def resize_one_image(root, tag, K_pre_rectif, pose_w2cam, out_dir): + if osp.isfile(osp.join(out_dir, tag + '.npz')): + return + + # load image + img = cv2.cvtColor(cv2.imread(osp.join(root, 'imgs', tag), cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB) + H, W = img.shape[:2] + + # load depth + with h5py.File(osp.join(root, 'depths', osp.splitext(tag)[0] + '.h5'), 'r') as hd5: + depthmap = np.asarray(hd5['depth']) + + # rectify = undistort the intrinsics + imsize_pre, K_pre, distortion = K_pre_rectif + imsize_post = img.shape[1::-1] + K_post = cv2.getOptimalNewCameraMatrix(K_pre, distortion, imsize_pre, alpha=0, + newImgSize=imsize_post, centerPrincipalPoint=True)[0] + + # downscale + img_out, depthmap_out, intrinsics_out, R_in2out = _downscale_image(K_post, img, depthmap, resolution_out=(800, 600)) + + # write everything + img_out.save(osp.join(out_dir, tag + '.jpg'), quality=90) + cv2.imwrite(osp.join(out_dir, tag + '.exr'), depthmap_out) + + camout2world = np.linalg.inv(pose_w2cam) + camout2world[:3, :3] = camout2world[:3, :3] @ R_in2out.T + np.savez(osp.join(out_dir, tag + '.npz'), intrinsics=intrinsics_out, cam2world=camout2world) + + +def _downscale_image(camera_intrinsics, image, depthmap, resolution_out=(512, 384)): + H, W = image.shape[:2] + resolution_out = sorted(resolution_out)[::+1 if W < H else -1] + + image, depthmap, intrinsics_out = cropping.rescale_image_depthmap( + image, depthmap, camera_intrinsics, resolution_out, force=False) + R_in2out = np.eye(3) + + return image, depthmap, intrinsics_out, R_in2out + + +def _load_kpts_and_poses(root, scene_id, subscene, z_only=False, intrinsics=False): + if intrinsics: + with open(os.path.join(root, scene_id, 'sparse', 'manhattan', subscene, 'cameras.txt'), 'r') as f: + raw = f.readlines()[3:] # skip the header + + camera_intrinsics = {} + for camera in raw: + camera = camera.split(' ') + width, height, focal, cx, cy, k0 = [float(elem) for elem in camera[2:]] + K = np.eye(3) + K[0, 0] = focal + K[1, 1] = focal + K[0, 2] = cx + K[1, 2] = cy + camera_intrinsics[int(camera[0])] = ((int(width), int(height)), K, (k0, 0, 0, 0)) + + with open(os.path.join(root, scene_id, 'sparse', 'manhattan', subscene, 'images.txt'), 'r') as f: + raw = f.read().splitlines()[4:] # skip the header + + extract_pose = colmap_raw_pose_to_principal_axis if z_only else colmap_raw_pose_to_RT + + poses = {} + points3D_idxs = {} + camera = [] + + for image, points in zip(raw[:: 2], raw[1:: 2]): + image = image.split(' ') + points = points.split(' ') + + image_id = image[-1] + camera.append(int(image[-2])) + + # find the principal axis + raw_pose = [float(elem) for elem in image[1: -2]] + poses[image_id] = extract_pose(raw_pose) + + current_points3D_idxs = {int(i) for i in points[2:: 3] if i != '-1'} + assert -1 not in current_points3D_idxs, bb() + points3D_idxs[image_id] = current_points3D_idxs + + if intrinsics: + image_intrinsics = {im_id: camera_intrinsics[cam] for im_id, cam in zip(poses, camera)} + return points3D_idxs, poses, image_intrinsics + else: + return points3D_idxs, poses + + +def colmap_raw_pose_to_principal_axis(image_pose): + qvec = image_pose[: 4] + qvec = qvec / np.linalg.norm(qvec) + w, x, y, z = qvec + z_axis = np.float32([ + 2 * x * z - 2 * y * w, + 2 * y * z + 2 * x * w, + 1 - 2 * x * x - 2 * y * y + ]) + return z_axis + + +def colmap_raw_pose_to_RT(image_pose): + qvec = image_pose[: 4] + qvec = qvec / np.linalg.norm(qvec) + w, x, y, z = qvec + R = 
np.array([ + [ + 1 - 2 * y * y - 2 * z * z, + 2 * x * y - 2 * z * w, + 2 * x * z + 2 * y * w + ], + [ + 2 * x * y + 2 * z * w, + 1 - 2 * x * x - 2 * z * z, + 2 * y * z - 2 * x * w + ], + [ + 2 * x * z - 2 * y * w, + 2 * y * z + 2 * x * w, + 1 - 2 * x * x - 2 * y * y + ] + ]) + # principal_axis.append(R[2, :]) + t = image_pose[4: 7] + # World-to-Camera pose + current_pose = np.eye(4) + current_pose[: 3, : 3] = R + current_pose[: 3, 3] = t + return current_pose + + +if __name__ == '__main__': + parser = get_parser() + args = parser.parse_args() + main(args.megadepth_dir, args.precomputed_pairs, args.output_dir) diff --git a/dust3r/datasets_preprocess/preprocess_scannetpp.py b/dust3r/datasets_preprocess/preprocess_scannetpp.py new file mode 100644 index 0000000000000000000000000000000000000000..798b128f5312864a15362cd883926de1bc4e276f --- /dev/null +++ b/dust3r/datasets_preprocess/preprocess_scannetpp.py @@ -0,0 +1,400 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Script to pre-process the scannet++ dataset. +# Usage: +# python3 datasets_preprocess/preprocess_scannetpp.py --scannetpp_dir /path/to/scannetpp --precomputed_pairs /path/to/scannetpp_pairs --pyopengl-platform egl +# -------------------------------------------------------- +import os +import argparse +import os.path as osp +import re +from tqdm import tqdm +import json +from scipy.spatial.transform import Rotation +import pyrender +import trimesh +import trimesh.exchange.ply +import numpy as np +import cv2 +import PIL.Image as Image + +from dust3r.datasets.utils.cropping import rescale_image_depthmap +import dust3r.utils.geometry as geometry + +inv = np.linalg.inv +norm = np.linalg.norm +REGEXPR_DSLR = re.compile(r'^.*DSC(?P\d+).JPG$') +REGEXPR_IPHONE = re.compile(r'.*frame_(?P\d+).jpg$') + +DEBUG_VIZ = None # 'iou' +if DEBUG_VIZ is not None: + import matplotlib.pyplot as plt # noqa + + +OPENGL_TO_OPENCV = np.float32([[1, 0, 0, 0], + [0, -1, 0, 0], + [0, 0, -1, 0], + [0, 0, 0, 1]]) + + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('--scannetpp_dir', required=True) + parser.add_argument('--precomputed_pairs', required=True) + parser.add_argument('--output_dir', default='data/scannetpp_processed') + parser.add_argument('--target_resolution', default=920, type=int, help="images resolution") + parser.add_argument('--pyopengl-platform', type=str, default='', help='PyOpenGL env variable') + return parser + + +def pose_from_qwxyz_txyz(elems): + qw, qx, qy, qz, tx, ty, tz = map(float, elems) + pose = np.eye(4) + pose[:3, :3] = Rotation.from_quat((qx, qy, qz, qw)).as_matrix() + pose[:3, 3] = (tx, ty, tz) + return np.linalg.inv(pose) # returns cam2world + + +def get_frame_number(name, cam_type='dslr'): + if cam_type == 'dslr': + regex_expr = REGEXPR_DSLR + elif cam_type == 'iphone': + regex_expr = REGEXPR_IPHONE + else: + raise NotImplementedError(f'wrong {cam_type=} for get_frame_number') + try: + matches = re.match(regex_expr, name) + return matches['frameid'] + except Exception as e: + print(f'Error when parsing {name}') + raise ValueError(f'Invalid name {name}') + + +def load_sfm(sfm_dir, cam_type='dslr'): + # load cameras + with open(osp.join(sfm_dir, 'cameras.txt'), 'r') as f: + raw = f.read().splitlines()[3:] # skip header + + intrinsics = {} + for camera in tqdm(raw, position=1, leave=False): + camera = camera.split(' ') + 
intrinsics[int(camera[0])] = [camera[1]] + [float(cam) for cam in camera[2:]] + + # load images + with open(os.path.join(sfm_dir, 'images.txt'), 'r') as f: + raw = f.read().splitlines() + raw = [line for line in raw if not line.startswith('#')] # skip header + + img_idx = {} + img_infos = {} + for image, points in tqdm(zip(raw[0::2], raw[1::2]), total=len(raw) // 2, position=1, leave=False): + image = image.split(' ') + points = points.split(' ') + + idx = image[0] + img_name = image[-1] + prefixes = ['iphone/', 'video/'] + for prefix in prefixes: + if img_name.startswith(prefix): + img_name = img_name[len(prefix):] + assert img_name not in img_idx, 'duplicate db image: ' + img_name + img_idx[img_name] = idx # register image name + + current_points2D = {int(i): (float(x), float(y)) + for i, x, y in zip(points[2::3], points[0::3], points[1::3]) if i != '-1'} + img_infos[idx] = dict(intrinsics=intrinsics[int(image[-2])], + path=img_name, + frame_id=get_frame_number(img_name, cam_type), + cam_to_world=pose_from_qwxyz_txyz(image[1: -2]), + sparse_pts2d=current_points2D) + + # load 3D points + with open(os.path.join(sfm_dir, 'points3D.txt'), 'r') as f: + raw = f.read().splitlines() + raw = [line for line in raw if not line.startswith('#')] # skip header + + points3D = {} + observations = {idx: [] for idx in img_infos.keys()} + for point in tqdm(raw, position=1, leave=False): + point = point.split() + point_3d_idx = int(point[0]) + points3D[point_3d_idx] = tuple(map(float, point[1:4])) + if len(point) > 8: + for idx, point_2d_idx in zip(point[8::2], point[9::2]): + if idx not in observations: + continue + observations[idx].append((point_3d_idx, int(point_2d_idx))) + + return img_idx, img_infos, points3D, observations + + +def subsample_img_infos(img_infos, num_images, allowed_name_subset=None): + img_infos_val = [(idx, val) for idx, val in img_infos.items()] + if allowed_name_subset is not None: + img_infos_val = [(idx, val) for idx, val in img_infos_val if val['path'] in allowed_name_subset] + + if len(img_infos_val) > num_images: + img_infos_val = sorted(img_infos_val, key=lambda x: x[1]['frame_id']) + kept_idx = np.round(np.linspace(0, len(img_infos_val) - 1, num_images)).astype(int).tolist() + img_infos_val = [img_infos_val[idx] for idx in kept_idx] + return {idx: val for idx, val in img_infos_val} + + +def undistort_images(intrinsics, rgb, mask): + camera_type = intrinsics[0] + + width = int(intrinsics[1]) + height = int(intrinsics[2]) + fx = intrinsics[3] + fy = intrinsics[4] + cx = intrinsics[5] + cy = intrinsics[6] + distortion = np.array(intrinsics[7:]) + + K = np.zeros([3, 3]) + K[0, 0] = fx + K[0, 2] = cx + K[1, 1] = fy + K[1, 2] = cy + K[2, 2] = 1 + + K = geometry.colmap_to_opencv_intrinsics(K) + if camera_type == "OPENCV_FISHEYE": + assert len(distortion) == 4 + + new_K = cv2.fisheye.estimateNewCameraMatrixForUndistortRectify( + K, + distortion, + (width, height), + np.eye(3), + balance=0.0, + ) + # Make the cx and cy to be the center of the image + new_K[0, 2] = width / 2.0 + new_K[1, 2] = height / 2.0 + + map1, map2 = cv2.fisheye.initUndistortRectifyMap(K, distortion, np.eye(3), new_K, (width, height), cv2.CV_32FC1) + else: + new_K, _ = cv2.getOptimalNewCameraMatrix(K, distortion, (width, height), 1, (width, height), True) + map1, map2 = cv2.initUndistortRectifyMap(K, distortion, np.eye(3), new_K, (width, height), cv2.CV_32FC1) + + undistorted_image = cv2.remap(rgb, map1, map2, interpolation=cv2.INTER_LINEAR, borderMode=cv2.BORDER_REFLECT_101) + undistorted_mask = cv2.remap(mask, 
map1, map2, interpolation=cv2.INTER_LINEAR, + borderMode=cv2.BORDER_CONSTANT, borderValue=255) + new_K = geometry.opencv_to_colmap_intrinsics(new_K) + return width, height, new_K, undistorted_image, undistorted_mask + + +def process_scenes(root, pairsdir, output_dir, target_resolution): + os.makedirs(output_dir, exist_ok=True) + + # default values from + # https://github.com/scannetpp/scannetpp/blob/main/common/configs/render.yml + znear = 0.05 + zfar = 20.0 + + listfile = osp.join(pairsdir, 'scene_list.json') + with open(listfile, 'r') as f: + scenes = json.load(f) + + # for each of these, we will select some dslr images and some iphone images + # we will undistort them and render their depth + renderer = pyrender.OffscreenRenderer(0, 0) + for scene in tqdm(scenes, position=0, leave=True): + data_dir = os.path.join(root, 'data', scene) + dir_dslr = os.path.join(data_dir, 'dslr') + dir_iphone = os.path.join(data_dir, 'iphone') + dir_scans = os.path.join(data_dir, 'scans') + + assert os.path.isdir(data_dir) and os.path.isdir(dir_dslr) \ + and os.path.isdir(dir_iphone) and os.path.isdir(dir_scans) + + output_dir_scene = os.path.join(output_dir, scene) + scene_metadata_path = osp.join(output_dir_scene, 'scene_metadata.npz') + if osp.isfile(scene_metadata_path): + continue + + pairs_dir_scene = os.path.join(pairsdir, scene) + pairs_dir_scene_selected_pairs = os.path.join(pairs_dir_scene, 'selected_pairs.npz') + assert osp.isfile(pairs_dir_scene_selected_pairs) + selected_npz = np.load(pairs_dir_scene_selected_pairs) + selection, pairs = selected_npz['selection'], selected_npz['pairs'] + + # set up the output paths + output_dir_scene_rgb = os.path.join(output_dir_scene, 'images') + output_dir_scene_depth = os.path.join(output_dir_scene, 'depth') + os.makedirs(output_dir_scene_rgb, exist_ok=True) + os.makedirs(output_dir_scene_depth, exist_ok=True) + + ply_path = os.path.join(dir_scans, 'mesh_aligned_0.05.ply') + + sfm_dir_dslr = os.path.join(dir_dslr, 'colmap') + rgb_dir_dslr = os.path.join(dir_dslr, 'resized_images') + mask_dir_dslr = os.path.join(dir_dslr, 'resized_anon_masks') + + sfm_dir_iphone = os.path.join(dir_iphone, 'colmap') + rgb_dir_iphone = os.path.join(dir_iphone, 'rgb') + mask_dir_iphone = os.path.join(dir_iphone, 'rgb_masks') + + # load the mesh + with open(ply_path, 'rb') as f: + mesh_kwargs = trimesh.exchange.ply.load_ply(f) + mesh_scene = trimesh.Trimesh(**mesh_kwargs) + + # read colmap reconstruction, we will only use the intrinsics and pose here + img_idx_dslr, img_infos_dslr, points3D_dslr, observations_dslr = load_sfm(sfm_dir_dslr, cam_type='dslr') + dslr_paths = { + "in_colmap": sfm_dir_dslr, + "in_rgb": rgb_dir_dslr, + "in_mask": mask_dir_dslr, + } + + img_idx_iphone, img_infos_iphone, points3D_iphone, observations_iphone = load_sfm( + sfm_dir_iphone, cam_type='iphone') + iphone_paths = { + "in_colmap": sfm_dir_iphone, + "in_rgb": rgb_dir_iphone, + "in_mask": mask_dir_iphone, + } + + mesh = pyrender.Mesh.from_trimesh(mesh_scene, smooth=False) + pyrender_scene = pyrender.Scene() + pyrender_scene.add(mesh) + + selection_iphone = [imgname + '.jpg' for imgname in selection if 'frame_' in imgname] + selection_dslr = [imgname + '.JPG' for imgname in selection if not 'frame_' in imgname] + + # resize the image to a more manageable size and render depth + for selection_cam, img_idx, img_infos, paths_data in [(selection_dslr, img_idx_dslr, img_infos_dslr, dslr_paths), + (selection_iphone, img_idx_iphone, img_infos_iphone, iphone_paths)]: + rgb_dir = paths_data['in_rgb'] + 
mask_dir = paths_data['in_mask'] + for imgname in tqdm(selection_cam, position=1, leave=False): + imgidx = img_idx[imgname] + img_infos_idx = img_infos[imgidx] + rgb = np.array(Image.open(os.path.join(rgb_dir, img_infos_idx['path']))) + mask = np.array(Image.open(os.path.join(mask_dir, img_infos_idx['path'][:-3] + 'png'))) + + _, _, K, rgb, mask = undistort_images(img_infos_idx['intrinsics'], rgb, mask) + + # rescale_image_depthmap assumes opencv intrinsics + intrinsics = geometry.colmap_to_opencv_intrinsics(K) + image, mask, intrinsics = rescale_image_depthmap( + rgb, mask, intrinsics, (target_resolution, target_resolution * 3.0 / 4)) + + W, H = image.size + intrinsics = geometry.opencv_to_colmap_intrinsics(intrinsics) + + # update inpace img_infos_idx + img_infos_idx['intrinsics'] = intrinsics + rgb_outpath = os.path.join(output_dir_scene_rgb, img_infos_idx['path'][:-3] + 'jpg') + image.save(rgb_outpath) + + depth_outpath = os.path.join(output_dir_scene_depth, img_infos_idx['path'][:-3] + 'png') + # render depth image + renderer.viewport_width, renderer.viewport_height = W, H + fx, fy, cx, cy = intrinsics[0, 0], intrinsics[1, 1], intrinsics[0, 2], intrinsics[1, 2] + camera = pyrender.camera.IntrinsicsCamera(fx, fy, cx, cy, znear=znear, zfar=zfar) + camera_node = pyrender_scene.add(camera, pose=img_infos_idx['cam_to_world'] @ OPENGL_TO_OPENCV) + + _, depth = renderer.render(pyrender_scene, flags=pyrender.RenderFlags.SKIP_CULL_FACES) + pyrender_scene.remove_node(camera_node) # dont forget to remove camera + + depth = (depth * 1000).astype('uint16') + # invalidate depth from mask before saving + depth_mask = (mask < 255) + depth[depth_mask] = 0 + Image.fromarray(depth).save(depth_outpath) + + trajectories = [] + intrinsics = [] + for imgname in selection: + if 'frame_' in imgname: + imgidx = img_idx_iphone[imgname + '.jpg'] + img_infos_idx = img_infos_iphone[imgidx] + elif 'DSC' in imgname: + imgidx = img_idx_dslr[imgname + '.JPG'] + img_infos_idx = img_infos_dslr[imgidx] + else: + raise ValueError(f'invalid image name {imgname}') + + intrinsics.append(img_infos_idx['intrinsics']) + trajectories.append(img_infos_idx['cam_to_world']) + + intrinsics = np.stack(intrinsics, axis=0) + trajectories = np.stack(trajectories, axis=0) + # save metadata for this scene + np.savez(scene_metadata_path, + trajectories=trajectories, + intrinsics=intrinsics, + images=selection, + pairs=pairs) + + del img_infos + del pyrender_scene + + # concat all scene_metadata.npz into a single file + scene_data = {} + for scene_subdir in scenes: + scene_metadata_path = osp.join(output_dir, scene_subdir, 'scene_metadata.npz') + with np.load(scene_metadata_path) as data: + trajectories = data['trajectories'] + intrinsics = data['intrinsics'] + images = data['images'] + pairs = data['pairs'] + scene_data[scene_subdir] = {'trajectories': trajectories, + 'intrinsics': intrinsics, + 'images': images, + 'pairs': pairs} + + offset = 0 + counts = [] + scenes = [] + sceneids = [] + images = [] + intrinsics = [] + trajectories = [] + pairs = [] + for scene_idx, (scene_subdir, data) in enumerate(scene_data.items()): + num_imgs = data['images'].shape[0] + img_pairs = data['pairs'] + + scenes.append(scene_subdir) + sceneids.extend([scene_idx] * num_imgs) + + images.append(data['images']) + + intrinsics.append(data['intrinsics']) + trajectories.append(data['trajectories']) + + # offset pairs + img_pairs[:, 0:2] += offset + pairs.append(img_pairs) + counts.append(offset) + + offset += num_imgs + + images = np.concatenate(images, axis=0) 
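+ # the concatenated arrays are indexed globally: each scene's pairs were shifted by
+ # `offset` above, and counts[k] is the global index of the first image of scene k
+ # (so scene k's images are images[counts[k]:counts[k + 1]], or images[counts[-1]:]
+ # for the last scene)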
+ intrinsics = np.concatenate(intrinsics, axis=0) + trajectories = np.concatenate(trajectories, axis=0) + pairs = np.concatenate(pairs, axis=0) + np.savez(osp.join(output_dir, 'all_metadata.npz'), + counts=counts, + scenes=scenes, + sceneids=sceneids, + images=images, + intrinsics=intrinsics, + trajectories=trajectories, + pairs=pairs) + print('all done') + + +if __name__ == '__main__': + parser = get_parser() + args = parser.parse_args() + if args.pyopengl_platform.strip(): + os.environ['PYOPENGL_PLATFORM'] = args.pyopengl_platform + process_scenes(args.scannetpp_dir, args.precomputed_pairs, args.output_dir, args.target_resolution) diff --git a/dust3r/datasets_preprocess/preprocess_staticthings3d.py b/dust3r/datasets_preprocess/preprocess_staticthings3d.py new file mode 100644 index 0000000000000000000000000000000000000000..ee3eec16321c14b12291699f1fee492b5a7d8b1c --- /dev/null +++ b/dust3r/datasets_preprocess/preprocess_staticthings3d.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Preprocessing code for the StaticThings3D dataset +# dataset at https://github.com/lmb-freiburg/robustmvd/blob/master/rmvd/data/README.md#staticthings3d +# 1) Download StaticThings3D in /path/to/StaticThings3D/ +# with the script at https://github.com/lmb-freiburg/robustmvd/blob/master/rmvd/data/scripts/download_staticthings3d.sh +# --> depths.tar.bz2 frames_finalpass.tar.bz2 poses.tar.bz2 frames_cleanpass.tar.bz2 intrinsics.tar.bz2 +# 2) unzip everything in the same /path/to/StaticThings3D/ directory +# 5) python datasets_preprocess/preprocess_staticthings3d.py --StaticThings3D_dir /path/to/tmp/StaticThings3D/ +# -------------------------------------------------------- +import os +import os.path as osp +import re +from tqdm import tqdm +import numpy as np +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" +import cv2 + +import path_to_root # noqa +from dust3r.utils.parallel import parallel_threads +from dust3r.datasets.utils import cropping # noqa + + +def get_parser(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--StaticThings3D_dir', required=True) + parser.add_argument('--precomputed_pairs', required=True) + parser.add_argument('--output_dir', default='data/staticthings3d_processed') + return parser + + +def main(db_root, pairs_path, output_dir): + all_scenes = _list_all_scenes(db_root) + + # crop images + args = [(db_root, osp.join(split, subsplit, seq), camera, f'{n:04d}', output_dir) + for split, subsplit, seq in all_scenes for camera in ['left', 'right'] for n in range(6, 16)] + parallel_threads(load_crop_and_save, args, star_args=True, front_num=1) + + # verify that all images are there + CAM = {b'l': 'left', b'r': 'right'} + pairs = np.load(pairs_path) + for scene, seq, cam1, im1, cam2, im2 in tqdm(pairs): + seq_path = osp.join('TRAIN', scene.decode('ascii'), f'{seq:04d}') + for cam, idx in [(CAM[cam1], im1), (CAM[cam2], im2)]: + for ext in ['clean', 'final']: + impath = osp.join(output_dir, seq_path, cam, f"{idx:04n}_{ext}.jpg") + assert osp.isfile(impath), f'missing an image at {impath=}' + + print(f'>> Saved all data to {output_dir}!') + + +def load_crop_and_save(db_root, relpath_, camera, num, out_dir): + relpath = osp.join(relpath_, camera, num) + if osp.isfile(osp.join(out_dir, relpath + '.npz')): + return + os.makedirs(osp.join(out_dir, relpath_, camera), exist_ok=True) + + 
# load everything + intrinsics_in = readFloat(osp.join(db_root, 'intrinsics', relpath_, num + '.float3')) + cam2world = np.linalg.inv(readFloat(osp.join(db_root, 'poses', relpath + '.float3'))) + depthmap_in = readFloat(osp.join(db_root, 'depths', relpath + '.float3')) + img_clean = cv2.cvtColor(cv2.imread(osp.join(db_root, 'frames_cleanpass', + relpath + '.png'), cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB) + img_final = cv2.cvtColor(cv2.imread(osp.join(db_root, 'frames_finalpass', + relpath + '.png'), cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB) + + # do the crop + assert img_clean.shape[:2] == (540, 960) + assert img_final.shape[:2] == (540, 960) + (clean_out, final_out), depthmap, intrinsics_out, R_in2out = _crop_image( + intrinsics_in, (img_clean, img_final), depthmap_in, (512, 384)) + + # write everything + clean_out.save(osp.join(out_dir, relpath + '_clean.jpg'), quality=80) + final_out.save(osp.join(out_dir, relpath + '_final.jpg'), quality=80) + cv2.imwrite(osp.join(out_dir, relpath + '.exr'), depthmap) + + # New camera parameters + cam2world[:3, :3] = cam2world[:3, :3] @ R_in2out.T + np.savez(osp.join(out_dir, relpath + '.npz'), intrinsics=intrinsics_out, cam2world=cam2world) + + +def _crop_image(intrinsics_in, color_image_in, depthmap_in, resolution_out=(512, 512)): + image, depthmap, intrinsics_out = cropping.rescale_image_depthmap( + color_image_in, depthmap_in, intrinsics_in, resolution_out) + R_in2out = np.eye(3) + return image, depthmap, intrinsics_out, R_in2out + + +def _list_all_scenes(path): + print('>> Listing all scenes') + + res = [] + for split in ['TRAIN']: + for subsplit in 'ABC': + for seq in os.listdir(osp.join(path, 'intrinsics', split, subsplit)): + res.append((split, subsplit, seq)) + print(f' (found ({len(res)}) scenes)') + assert res, f'Did not find anything at {path=}' + return res + + +def readFloat(name): + with open(name, 'rb') as f: + if (f.readline().decode("utf-8")) != 'float\n': + raise Exception('float file %s did not contain keyword' % name) + + dim = int(f.readline()) + + dims = [] + count = 1 + for i in range(0, dim): + d = int(f.readline()) + dims.append(d) + count *= d + + dims = list(reversed(dims)) + data = np.fromfile(f, np.float32, count).reshape(dims) + return data # Hxw or CxHxW NxCxHxW + + +if __name__ == '__main__': + parser = get_parser() + args = parser.parse_args() + main(args.StaticThings3D_dir, args.precomputed_pairs, args.output_dir) diff --git a/dust3r/datasets_preprocess/preprocess_waymo.py b/dust3r/datasets_preprocess/preprocess_waymo.py new file mode 100644 index 0000000000000000000000000000000000000000..203f337330a7e06e61d2fb9dd99647063967922d --- /dev/null +++ b/dust3r/datasets_preprocess/preprocess_waymo.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Preprocessing code for the WayMo Open dataset +# dataset at https://github.com/waymo-research/waymo-open-dataset +# 1) Accept the license +# 2) download all training/*.tfrecord files from Perception Dataset, version 1.4.2 +# 3) put all .tfrecord files in '/path/to/waymo_dir' +# 4) install the waymo_open_dataset package with +# `python3 -m pip install gcsfs waymo-open-dataset-tf-2-12-0==1.6.4` +# 5) execute this script as `python preprocess_waymo.py --waymo_dir /path/to/waymo_dir` +# -------------------------------------------------------- +import sys +import os +import os.path as osp +import shutil +import json +from tqdm import tqdm +import PIL.Image +import numpy as np +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" +import cv2 + +import tensorflow.compat.v1 as tf +tf.enable_eager_execution() + +import path_to_root # noqa +from dust3r.utils.geometry import geotrf, inv +from dust3r.utils.image import imread_cv2 +from dust3r.utils.parallel import parallel_processes as parallel_map +from dust3r.datasets.utils import cropping +from dust3r.viz import show_raw_pointcloud + + +def get_parser(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--waymo_dir', required=True) + parser.add_argument('--precomputed_pairs', required=True) + parser.add_argument('--output_dir', default='data/waymo_processed') + parser.add_argument('--workers', type=int, default=1) + return parser + + +def main(waymo_root, pairs_path, output_dir, workers=1): + extract_frames(waymo_root, output_dir, workers=workers) + make_crops(output_dir, workers=args.workers) + + # make sure all pairs are there + with np.load(pairs_path) as data: + scenes = data['scenes'] + frames = data['frames'] + pairs = data['pairs'] # (array of (scene_id, img1_id, img2_id) + + for scene_id, im1_id, im2_id in pairs: + for im_id in (im1_id, im2_id): + path = osp.join(output_dir, scenes[scene_id], frames[im_id] + '.jpg') + assert osp.isfile(path), f'Missing a file at {path=}\nDid you download all .tfrecord files?' + + shutil.rmtree(osp.join(output_dir, 'tmp')) + print('Done! 
all data generated at', output_dir) + + +def _list_sequences(db_root): + print('>> Looking for sequences in', db_root) + res = sorted(f for f in os.listdir(db_root) if f.endswith('.tfrecord')) + print(f' found {len(res)} sequences') + return res + + +def extract_frames(db_root, output_dir, workers=8): + sequences = _list_sequences(db_root) + output_dir = osp.join(output_dir, 'tmp') + print('>> outputing result to', output_dir) + args = [(db_root, output_dir, seq) for seq in sequences] + parallel_map(process_one_seq, args, star_args=True, workers=workers) + + +def process_one_seq(db_root, output_dir, seq): + out_dir = osp.join(output_dir, seq) + os.makedirs(out_dir, exist_ok=True) + calib_path = osp.join(out_dir, 'calib.json') + if osp.isfile(calib_path): + return + + try: + with tf.device('/CPU:0'): + calib, frames = extract_frames_one_seq(osp.join(db_root, seq)) + except RuntimeError: + print(f'/!\\ Error with sequence {seq} /!\\', file=sys.stderr) + return # nothing is saved + + for f, (frame_name, views) in enumerate(tqdm(frames, leave=False)): + for cam_idx, view in views.items(): + img = PIL.Image.fromarray(view.pop('img')) + img.save(osp.join(out_dir, f'{f:05d}_{cam_idx}.jpg')) + np.savez(osp.join(out_dir, f'{f:05d}_{cam_idx}.npz'), **view) + + with open(calib_path, 'w') as f: + json.dump(calib, f) + + +def extract_frames_one_seq(filename): + from waymo_open_dataset import dataset_pb2 as open_dataset + from waymo_open_dataset.utils import frame_utils + + print('>> Opening', filename) + dataset = tf.data.TFRecordDataset(filename, compression_type='') + + calib = None + frames = [] + + for data in tqdm(dataset, leave=False): + frame = open_dataset.Frame() + frame.ParseFromString(bytearray(data.numpy())) + + content = frame_utils.parse_range_image_and_camera_projection(frame) + range_images, camera_projections, _, range_image_top_pose = content + + views = {} + frames.append((frame.context.name, views)) + + # once in a sequence, read camera calibration info + if calib is None: + calib = [] + for cam in frame.context.camera_calibrations: + calib.append((cam.name, + dict(width=cam.width, + height=cam.height, + intrinsics=list(cam.intrinsic), + extrinsics=list(cam.extrinsic.transform)))) + + # convert LIDAR to pointcloud + points, cp_points = frame_utils.convert_range_image_to_point_cloud( + frame, + range_images, + camera_projections, + range_image_top_pose) + + # 3d points in vehicle frame. + points_all = np.concatenate(points, axis=0) + cp_points_all = np.concatenate(cp_points, axis=0) + + # The distance between lidar points and vehicle frame origin. 
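+ # column 0 of cp_points_all is the name of the camera each lidar point projects
+ # into and columns 1-2 are its pixel coordinates in that camera; the loop below
+ # filters it per image by matching the camera name with tf.equal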
+ cp_points_all_tensor = tf.constant(cp_points_all, dtype=tf.int32) + + for i, image in enumerate(frame.images): + # select relevant 3D points for this view + mask = tf.equal(cp_points_all_tensor[..., 0], image.name) + cp_points_msk_tensor = tf.cast(tf.gather_nd(cp_points_all_tensor, tf.where(mask)), dtype=tf.float32) + + pose = np.asarray(image.pose.transform).reshape(4, 4) + timestamp = image.pose_timestamp + + rgb = tf.image.decode_jpeg(image.image).numpy() + + pix = cp_points_msk_tensor[..., 1:3].numpy().round().astype(np.int16) + pts3d = points_all[mask.numpy()] + + views[image.name] = dict(img=rgb, pose=pose, pixels=pix, pts3d=pts3d, timestamp=timestamp) + + if not 'show full point cloud': + show_raw_pointcloud([v['pts3d'] for v in views.values()], [v['img'] for v in views.values()]) + + return calib, frames + + +def make_crops(output_dir, workers=16, **kw): + tmp_dir = osp.join(output_dir, 'tmp') + sequences = _list_sequences(tmp_dir) + args = [(tmp_dir, output_dir, seq) for seq in sequences] + parallel_map(crop_one_seq, args, star_args=True, workers=workers, front_num=0) + + +def crop_one_seq(input_dir, output_dir, seq, resolution=512): + seq_dir = osp.join(input_dir, seq) + out_dir = osp.join(output_dir, seq) + if osp.isfile(osp.join(out_dir, '00100_1.jpg')): + return + os.makedirs(out_dir, exist_ok=True) + + # load calibration file + try: + with open(osp.join(seq_dir, 'calib.json')) as f: + calib = json.load(f) + except IOError: + print(f'/!\\ Error: Missing calib.json in sequence {seq} /!\\', file=sys.stderr) + return + + axes_transformation = np.array([ + [0, -1, 0, 0], + [0, 0, -1, 0], + [1, 0, 0, 0], + [0, 0, 0, 1]]) + + cam_K = {} + cam_distortion = {} + cam_res = {} + cam_to_car = {} + for cam_idx, cam_info in calib: + cam_idx = str(cam_idx) + cam_res[cam_idx] = (W, H) = (cam_info['width'], cam_info['height']) + f1, f2, cx, cy, k1, k2, p1, p2, k3 = cam_info['intrinsics'] + cam_K[cam_idx] = np.asarray([(f1, 0, cx), (0, f2, cy), (0, 0, 1)]) + cam_distortion[cam_idx] = np.asarray([k1, k2, p1, p2, k3]) + cam_to_car[cam_idx] = np.asarray(cam_info['extrinsics']).reshape(4, 4) # cam-to-vehicle + + frames = sorted(f[:-3] for f in os.listdir(seq_dir) if f.endswith('.jpg')) + + # from dust3r.viz import SceneViz + # viz = SceneViz() + + for frame in tqdm(frames, leave=False): + cam_idx = frame[-2] # cam index + assert cam_idx in '12345', f'bad {cam_idx=} in {frame=}' + data = np.load(osp.join(seq_dir, frame + 'npz')) + car_to_world = data['pose'] + W, H = cam_res[cam_idx] + + # load depthmap + pos2d = data['pixels'].round().astype(np.uint16) + x, y = pos2d.T + pts3d = data['pts3d'] # already in the car frame + pts3d = geotrf(axes_transformation @ inv(cam_to_car[cam_idx]), pts3d) + # X=LEFT_RIGHT y=ALTITUDE z=DEPTH + + # load image + image = imread_cv2(osp.join(seq_dir, frame + 'jpg')) + + # downscale image + output_resolution = (resolution, 1) if W > H else (1, resolution) + image, _, intrinsics2 = cropping.rescale_image_depthmap(image, None, cam_K[cam_idx], output_resolution) + image.save(osp.join(out_dir, frame + 'jpg'), quality=80) + + # save as an EXR file? 
yes it's smaller (and easier to load) + W, H = image.size + depthmap = np.zeros((H, W), dtype=np.float32) + pos2d = geotrf(intrinsics2 @ inv(cam_K[cam_idx]), pos2d).round().astype(np.int16) + x, y = pos2d.T + depthmap[y.clip(min=0, max=H - 1), x.clip(min=0, max=W - 1)] = pts3d[:, 2] + cv2.imwrite(osp.join(out_dir, frame + 'exr'), depthmap) + + # save camera parametes + cam2world = car_to_world @ cam_to_car[cam_idx] @ inv(axes_transformation) + np.savez(osp.join(out_dir, frame + 'npz'), intrinsics=intrinsics2, + cam2world=cam2world, distortion=cam_distortion[cam_idx]) + + # viz.add_rgbd(np.asarray(image), depthmap, intrinsics2, cam2world) + # viz.show() + + +if __name__ == '__main__': + parser = get_parser() + args = parser.parse_args() + main(args.waymo_dir, args.precomputed_pairs, args.output_dir, workers=args.workers) diff --git a/dust3r/datasets_preprocess/preprocess_wildrgbd.py b/dust3r/datasets_preprocess/preprocess_wildrgbd.py new file mode 100644 index 0000000000000000000000000000000000000000..ff3f0f7abb7d9ef43bba6a7c6cd6f4e652a8f510 --- /dev/null +++ b/dust3r/datasets_preprocess/preprocess_wildrgbd.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Script to pre-process the WildRGB-D dataset. +# Usage: +# python3 datasets_preprocess/preprocess_wildrgbd.py --wildrgbd_dir /path/to/wildrgbd +# -------------------------------------------------------- + +import argparse +import random +import json +import os +import os.path as osp + +import PIL.Image +import numpy as np +import cv2 + +from tqdm.auto import tqdm +import matplotlib.pyplot as plt + +import path_to_root # noqa +import dust3r.datasets.utils.cropping as cropping # noqa +from dust3r.utils.image import imread_cv2 + + +def get_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--output_dir", type=str, default="data/wildrgbd_processed") + parser.add_argument("--wildrgbd_dir", type=str, required=True) + parser.add_argument("--train_num_sequences_per_object", type=int, default=50) + parser.add_argument("--test_num_sequences_per_object", type=int, default=10) + parser.add_argument("--num_frames", type=int, default=100) + parser.add_argument("--seed", type=int, default=42) + + parser.add_argument("--img_size", type=int, default=512, + help=("lower dimension will be >= img_size * 3/4, and max dimension will be >= img_size")) + return parser + + +def get_set_list(category_dir, split): + listfiles = ["camera_eval_list.json", "nvs_list.json"] + + sequences_all = {s: {k: set() for k in listfiles} for s in ['train', 'val']} + for listfile in listfiles: + with open(osp.join(category_dir, listfile)) as f: + subset_lists_data = json.load(f) + for s in ['train', 'val']: + sequences_all[s][listfile].update(subset_lists_data[s]) + train_intersection = set.intersection(*list(sequences_all['train'].values())) + if split == "train": + return train_intersection + else: + all_seqs = set.union(*list(sequences_all['train'].values()), *list(sequences_all['val'].values())) + return all_seqs.difference(train_intersection) + + +def prepare_sequences(category, wildrgbd_dir, output_dir, img_size, split, max_num_sequences_per_object, + output_num_frames, seed): + random.seed(seed) + category_dir = osp.join(wildrgbd_dir, category) + category_output_dir = osp.join(output_dir, category) + sequences_all = get_set_list(category_dir, split) + sequences_all = 
sorted(sequences_all) + + sequences_all_tmp = [] + for seq_name in sequences_all: + scene_dir = osp.join(wildrgbd_dir, category_dir, seq_name) + if not os.path.isdir(scene_dir): + print(f'{scene_dir} does not exist, skipped') + continue + sequences_all_tmp.append(seq_name) + sequences_all = sequences_all_tmp + if len(sequences_all) <= max_num_sequences_per_object: + selected_sequences = sequences_all + else: + selected_sequences = random.sample(sequences_all, max_num_sequences_per_object) + + selected_sequences_numbers_dict = {} + for seq_name in tqdm(selected_sequences, leave=False): + scene_dir = osp.join(category_dir, seq_name) + scene_output_dir = osp.join(category_output_dir, seq_name) + with open(osp.join(scene_dir, 'metadata'), 'r') as f: + metadata = json.load(f) + + K = np.array(metadata["K"]).reshape(3, 3).T + fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2] + w, h = metadata["w"], metadata["h"] + + camera_intrinsics = np.array( + [[fx, 0, cx], + [0, fy, cy], + [0, 0, 1]] + ) + camera_to_world_path = os.path.join(scene_dir, 'cam_poses.txt') + camera_to_world_content = np.genfromtxt(camera_to_world_path) + camera_to_world = camera_to_world_content[:, 1:].reshape(-1, 4, 4) + + frame_idx = camera_to_world_content[:, 0] + num_frames = frame_idx.shape[0] + assert num_frames >= output_num_frames + assert np.all(frame_idx == np.arange(num_frames)) + + # selected_sequences_numbers_dict[seq_name] = num_frames + + selected_frames = np.round(np.linspace(0, num_frames - 1, output_num_frames)).astype(int).tolist() + selected_sequences_numbers_dict[seq_name] = selected_frames + + for frame_id in tqdm(selected_frames): + depth_path = os.path.join(scene_dir, 'depth', f'{frame_id:0>5d}.png') + masks_path = os.path.join(scene_dir, 'masks', f'{frame_id:0>5d}.png') + rgb_path = os.path.join(scene_dir, 'rgb', f'{frame_id:0>5d}.png') + + input_rgb_image = PIL.Image.open(rgb_path).convert('RGB') + input_mask = plt.imread(masks_path) + input_depthmap = imread_cv2(depth_path, cv2.IMREAD_UNCHANGED).astype(np.float64) + depth_mask = np.stack((input_depthmap, input_mask), axis=-1) + H, W = input_depthmap.shape + + min_margin_x = min(cx, W - cx) + min_margin_y = min(cy, H - cy) + + # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy) + l, t = int(cx - min_margin_x), int(cy - min_margin_y) + r, b = int(cx + min_margin_x), int(cy + min_margin_y) + crop_bbox = (l, t, r, b) + input_rgb_image, depth_mask, input_camera_intrinsics = cropping.crop_image_depthmap( + input_rgb_image, depth_mask, camera_intrinsics, crop_bbox) + + # try to set the lower dimension to img_size * 3/4 -> img_size=512 => 384 + scale_final = ((img_size * 3 // 4) / min(H, W)) + 1e-8 + output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int) + if max(output_resolution) < img_size: + # let's put the max dimension to img_size + scale_final = (img_size / max(H, W)) + 1e-8 + output_resolution = np.floor(np.array([W, H]) * scale_final).astype(int) + + input_rgb_image, depth_mask, input_camera_intrinsics = cropping.rescale_image_depthmap( + input_rgb_image, depth_mask, input_camera_intrinsics, output_resolution) + input_depthmap = depth_mask[:, :, 0] + input_mask = depth_mask[:, :, 1] + + camera_pose = camera_to_world[frame_id] + + # save crop images and depth, metadata + save_img_path = os.path.join(scene_output_dir, 'rgb', f'{frame_id:0>5d}.jpg') + save_depth_path = os.path.join(scene_output_dir, 'depth', f'{frame_id:0>5d}.png') + save_mask_path = os.path.join(scene_output_dir, 
'masks', f'{frame_id:0>5d}.png') + os.makedirs(os.path.split(save_img_path)[0], exist_ok=True) + os.makedirs(os.path.split(save_depth_path)[0], exist_ok=True) + os.makedirs(os.path.split(save_mask_path)[0], exist_ok=True) + + input_rgb_image.save(save_img_path) + cv2.imwrite(save_depth_path, input_depthmap.astype(np.uint16)) + cv2.imwrite(save_mask_path, (input_mask * 255).astype(np.uint8)) + + save_meta_path = os.path.join(scene_output_dir, 'metadata', f'{frame_id:0>5d}.npz') + os.makedirs(os.path.split(save_meta_path)[0], exist_ok=True) + np.savez(save_meta_path, camera_intrinsics=input_camera_intrinsics, + camera_pose=camera_pose) + + return selected_sequences_numbers_dict + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + assert args.wildrgbd_dir != args.output_dir + + categories = sorted([ + dirname for dirname in os.listdir(args.wildrgbd_dir) + if os.path.isdir(os.path.join(args.wildrgbd_dir, dirname, 'scenes')) + ]) + + os.makedirs(args.output_dir, exist_ok=True) + + splits_num_sequences_per_object = [args.train_num_sequences_per_object, args.test_num_sequences_per_object] + for split, num_sequences_per_object in zip(['train', 'test'], splits_num_sequences_per_object): + selected_sequences_path = os.path.join(args.output_dir, f'selected_seqs_{split}.json') + if os.path.isfile(selected_sequences_path): + continue + all_selected_sequences = {} + for category in categories: + category_output_dir = osp.join(args.output_dir, category) + os.makedirs(category_output_dir, exist_ok=True) + category_selected_sequences_path = os.path.join(category_output_dir, f'selected_seqs_{split}.json') + if os.path.isfile(category_selected_sequences_path): + with open(category_selected_sequences_path, 'r') as fid: + category_selected_sequences = json.load(fid) + else: + print(f"Processing {split} - category = {category}") + category_selected_sequences = prepare_sequences( + category=category, + wildrgbd_dir=args.wildrgbd_dir, + output_dir=args.output_dir, + img_size=args.img_size, + split=split, + max_num_sequences_per_object=num_sequences_per_object, + output_num_frames=args.num_frames, + seed=args.seed + int("category".encode('ascii').hex(), 16), + ) + with open(category_selected_sequences_path, 'w') as file: + json.dump(category_selected_sequences, file) + + all_selected_sequences[category] = category_selected_sequences + with open(selected_sequences_path, 'w') as file: + json.dump(all_selected_sequences, file) diff --git a/dust3r/demo.py b/dust3r/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..35a5ff677f50c3db45fd49dabc8af03a688aebb8 --- /dev/null +++ b/dust3r/demo.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# dust3r gradio demo executable +# -------------------------------------------------------- +import torch + +from dust3r.demo import main + +import matplotlib.pyplot as pl +pl.ion() + +torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12 + + +if __name__ == '__main__': + main() diff --git a/dust3r/docker/docker-compose-cpu.yml b/dust3r/docker/docker-compose-cpu.yml new file mode 100644 index 0000000000000000000000000000000000000000..2015fd771e8b6246d288c03a38f6fbb3f17dff20 --- /dev/null +++ b/dust3r/docker/docker-compose-cpu.yml @@ -0,0 +1,16 @@ +version: '3.8' +services: + dust3r-demo: + build: + context: ./files + dockerfile: cpu.Dockerfile + ports: + - "7860:7860" + volumes: + - ./files/checkpoints:/dust3r/checkpoints + environment: + - DEVICE=cpu + - MODEL=${MODEL:-DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth} + cap_add: + - IPC_LOCK + - SYS_RESOURCE diff --git a/dust3r/docker/docker-compose-cuda.yml b/dust3r/docker/docker-compose-cuda.yml new file mode 100644 index 0000000000000000000000000000000000000000..85710af953d669fe618273de6ce3a062a7a84cca --- /dev/null +++ b/dust3r/docker/docker-compose-cuda.yml @@ -0,0 +1,23 @@ +version: '3.8' +services: + dust3r-demo: + build: + context: ./files + dockerfile: cuda.Dockerfile + ports: + - "7860:7860" + environment: + - DEVICE=cuda + - MODEL=${MODEL:-DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth} + volumes: + - ./files/checkpoints:/dust3r/checkpoints + cap_add: + - IPC_LOCK + - SYS_RESOURCE + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] diff --git a/dust3r/docker/files/cpu.Dockerfile b/dust3r/docker/files/cpu.Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..c9ccc39682dd7c7723f447ff47f12531a593446f --- /dev/null +++ b/dust3r/docker/files/cpu.Dockerfile @@ -0,0 +1,38 @@ +FROM python:3.11-slim + +LABEL description="Docker container for DUSt3R with dependencies installed. CPU VERSION" + +ENV DEVICE="cpu" +ENV MODEL="DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y \ + git \ + libgl1-mesa-glx \ + libegl1-mesa \ + libxrandr2 \ + libxrandr2 \ + libxss1 \ + libxcursor1 \ + libxcomposite1 \ + libasound2 \ + libxi6 \ + libxtst6 \ + libglib2.0-0 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN git clone --recursive https://github.com/naver/dust3r /dust3r +WORKDIR /dust3r + +RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu +RUN pip install -r requirements.txt +RUN pip install -r requirements_optional.txt +RUN pip install opencv-python==4.8.0.74 + +WORKDIR /dust3r + +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/dust3r/docker/files/cuda.Dockerfile b/dust3r/docker/files/cuda.Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..a1d2edce1a5e7cee2fa3d66faf4f6ee019595267 --- /dev/null +++ b/dust3r/docker/files/cuda.Dockerfile @@ -0,0 +1,27 @@ +FROM nvcr.io/nvidia/pytorch:24.01-py3 + +LABEL description="Docker container for DUSt3R with dependencies installed. 
CUDA VERSION" +ENV DEVICE="cuda" +ENV MODEL="DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" +ARG DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y \ + git=1:2.34.1-1ubuntu1.10 \ + libglib2.0-0=2.72.4-0ubuntu2.2 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN git clone --recursive https://github.com/naver/dust3r /dust3r +WORKDIR /dust3r +RUN pip install -r requirements.txt +RUN pip install -r requirements_optional.txt +RUN pip install opencv-python==4.8.0.74 + +WORKDIR /dust3r/croco/models/curope/ +RUN python setup.py build_ext --inplace + +WORKDIR /dust3r +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/dust3r/docker/files/entrypoint.sh b/dust3r/docker/files/entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..9637072a0af071f927ca0481bcaa4b600644b8b5 --- /dev/null +++ b/dust3r/docker/files/entrypoint.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +set -eux + +DEVICE=${DEVICE:-cuda} +MODEL=${MODEL:-DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth} + +exec python3 demo.py --weights "checkpoints/$MODEL" --device "$DEVICE" --local_network "$@" diff --git a/dust3r/docker/run.sh b/dust3r/docker/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..6c920363d607fc6019f10780d072edf49bee3046 --- /dev/null +++ b/dust3r/docker/run.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +set -eux + +# Default model name +model_name="DUSt3R_ViTLarge_BaseDecoder_512_dpt.pth" + +check_docker() { + if ! command -v docker &>/dev/null; then + echo "Docker could not be found. Please install Docker and try again." + exit 1 + fi +} + +download_model_checkpoint() { + if [ -f "./files/checkpoints/${model_name}" ]; then + echo "Model checkpoint ${model_name} already exists. Skipping download." + return + fi + echo "Downloading model checkpoint ${model_name}..." + wget "https://download.europe.naverlabs.com/ComputerVision/DUSt3R/${model_name}" -P ./files/checkpoints +} + +set_dcomp() { + if command -v docker-compose &>/dev/null; then + dcomp="docker-compose" + elif command -v docker &>/dev/null && docker compose version &>/dev/null; then + dcomp="docker compose" + else + echo "Docker Compose could not be found. Please install Docker Compose and try again." + exit 1 + fi +} + +run_docker() { + export MODEL=${model_name} + if [ "$with_cuda" -eq 1 ]; then + $dcomp -f docker-compose-cuda.yml up --build + else + $dcomp -f docker-compose-cpu.yml up --build + fi +} + +with_cuda=0 +for arg in "$@"; do + case $arg in + --with-cuda) + with_cuda=1 + ;; + --model_name=*) + model_name="${arg#*=}.pth" + ;; + *) + echo "Unknown parameter passed: $arg" + exit 1 + ;; + esac +done + + +main() { + check_docker + download_model_checkpoint + set_dcomp + run_docker +} + +main diff --git a/dust3r/dust3r/__init__.py b/dust3r/dust3r/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e --- /dev/null +++ b/dust3r/dust3r/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). diff --git a/dust3r/dust3r/cloud_opt/__init__.py b/dust3r/dust3r/cloud_opt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..faf5cd279a317c1efb9ba947682992c0949c1bdc --- /dev/null +++ b/dust3r/dust3r/cloud_opt/__init__.py @@ -0,0 +1,33 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# global alignment optimization wrapper function +# -------------------------------------------------------- +from enum import Enum + +from .optimizer import PointCloudOptimizer +from .modular_optimizer import ModularPointCloudOptimizer +from .pair_viewer import PairViewer + + +class GlobalAlignerMode(Enum): + PointCloudOptimizer = "PointCloudOptimizer" + ModularPointCloudOptimizer = "ModularPointCloudOptimizer" + PairViewer = "PairViewer" + + +def global_aligner(dust3r_output, device, mode=GlobalAlignerMode.PointCloudOptimizer, **optim_kw): + # extract all inputs + view1, view2, pred1, pred2 = [dust3r_output[k] for k in 'view1 view2 pred1 pred2'.split()] + # build the optimizer + if mode == GlobalAlignerMode.PointCloudOptimizer: + net = PointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(device) + elif mode == GlobalAlignerMode.ModularPointCloudOptimizer: + net = ModularPointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(device) + elif mode == GlobalAlignerMode.PairViewer: + net = PairViewer(view1, view2, pred1, pred2, **optim_kw).to(device) + else: + raise NotImplementedError(f'Unknown mode {mode}') + + return net diff --git a/dust3r/dust3r/cloud_opt/base_opt.py b/dust3r/dust3r/cloud_opt/base_opt.py new file mode 100644 index 0000000000000000000000000000000000000000..4d36e05bfca80509bced20add7c067987d538951 --- /dev/null +++ b/dust3r/dust3r/cloud_opt/base_opt.py @@ -0,0 +1,405 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Base class for the global alignement procedure +# -------------------------------------------------------- +from copy import deepcopy + +import numpy as np +import torch +import torch.nn as nn +import roma +from copy import deepcopy +import tqdm + +from dust3r.utils.geometry import inv, geotrf +from dust3r.utils.device import to_numpy +from dust3r.utils.image import rgb +from dust3r.viz import SceneViz, segment_sky, auto_cam_size +from dust3r.optim_factory import adjust_learning_rate_by_lr + +from dust3r.cloud_opt.commons import (edge_str, ALL_DISTS, NoGradParamDict, get_imshapes, signed_expm1, signed_log1p, + cosine_schedule, linear_schedule, get_conf_trf) +import dust3r.cloud_opt.init_im_poses as init_fun + + +class BasePCOptimizer (nn.Module): + """ Optimize a global scene, given a list of pairwise observations. 
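+ The scene is modelled as a graph: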
+ Graph node: images + Graph edges: observations = (pred1, pred2) + """ + + def __init__(self, *args, **kwargs): + if len(args) == 1 and len(kwargs) == 0: + other = deepcopy(args[0]) + attrs = '''edges is_symmetrized dist n_imgs pred_i pred_j imshapes + min_conf_thr conf_thr conf_i conf_j im_conf + base_scale norm_pw_scale POSE_DIM pw_poses + pw_adaptors pw_adaptors has_im_poses rand_pose imgs verbose'''.split() + self.__dict__.update({k: other[k] for k in attrs}) + else: + self._init_from_views(*args, **kwargs) + + def _init_from_views(self, view1, view2, pred1, pred2, + dist='l1', + conf='log', + min_conf_thr=3, + base_scale=0.5, + allow_pw_adaptors=False, + pw_break=20, + rand_pose=torch.randn, + iterationsCount=None, + verbose=True): + super().__init__() + if not isinstance(view1['idx'], list): + view1['idx'] = view1['idx'].tolist() + if not isinstance(view2['idx'], list): + view2['idx'] = view2['idx'].tolist() + self.edges = [(int(i), int(j)) for i, j in zip(view1['idx'], view2['idx'])] + self.is_symmetrized = set(self.edges) == {(j, i) for i, j in self.edges} + self.dist = ALL_DISTS[dist] + self.verbose = verbose + + self.n_imgs = self._check_edges() + + # input data + pred1_pts = pred1['pts3d'] + pred2_pts = pred2['pts3d_in_other_view'] + self.pred_i = NoGradParamDict({ij: pred1_pts[n] for n, ij in enumerate(self.str_edges)}) + self.pred_j = NoGradParamDict({ij: pred2_pts[n] for n, ij in enumerate(self.str_edges)}) + self.imshapes = get_imshapes(self.edges, pred1_pts, pred2_pts) + + # work in log-scale with conf + pred1_conf = pred1['conf'] + pred2_conf = pred2['conf'] + self.min_conf_thr = min_conf_thr + self.conf_trf = get_conf_trf(conf) + + self.conf_i = NoGradParamDict({ij: pred1_conf[n] for n, ij in enumerate(self.str_edges)}) + self.conf_j = NoGradParamDict({ij: pred2_conf[n] for n, ij in enumerate(self.str_edges)}) + self.im_conf = self._compute_img_conf(pred1_conf, pred2_conf) + for i in range(len(self.im_conf)): + self.im_conf[i].requires_grad = False + + # pairwise pose parameters + self.base_scale = base_scale + self.norm_pw_scale = True + self.pw_break = pw_break + self.POSE_DIM = 7 + self.pw_poses = nn.Parameter(rand_pose((self.n_edges, 1+self.POSE_DIM))) # pairwise poses + self.pw_adaptors = nn.Parameter(torch.zeros((self.n_edges, 2))) # slight xy/z adaptation + self.pw_adaptors.requires_grad_(allow_pw_adaptors) + self.has_im_poses = False + self.rand_pose = rand_pose + + # possibly store images for show_pointcloud + self.imgs = None + if 'img' in view1 and 'img' in view2: + imgs = [torch.zeros((3,)+hw) for hw in self.imshapes] + for v in range(len(self.edges)): + idx = view1['idx'][v] + imgs[idx] = view1['img'][v] + idx = view2['idx'][v] + imgs[idx] = view2['img'][v] + self.imgs = rgb(imgs) + + @property + def n_edges(self): + return len(self.edges) + + @property + def str_edges(self): + return [edge_str(i, j) for i, j in self.edges] + + @property + def imsizes(self): + return [(w, h) for h, w in self.imshapes] + + @property + def device(self): + return next(iter(self.parameters())).device + + def state_dict(self, trainable=True): + all_params = super().state_dict() + return {k: v for k, v in all_params.items() if k.startswith(('_', 'pred_i.', 'pred_j.', 'conf_i.', 'conf_j.')) != trainable} + + def load_state_dict(self, data): + return super().load_state_dict(self.state_dict(trainable=False) | data) + + def _check_edges(self): + indices = sorted({i for edge in self.edges for i in edge}) + assert indices == list(range(len(indices))), 'bad pair indices: missing values ' 
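+ # edge indices form the contiguous range 0..N-1, so the number of images is len(indices)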
+ return len(indices) + + @torch.no_grad() + def _compute_img_conf(self, pred1_conf, pred2_conf): + im_conf = nn.ParameterList([torch.zeros(hw, device=self.device) for hw in self.imshapes]) + for e, (i, j) in enumerate(self.edges): + im_conf[i] = torch.maximum(im_conf[i], pred1_conf[e]) + im_conf[j] = torch.maximum(im_conf[j], pred2_conf[e]) + return im_conf + + def get_adaptors(self): + adapt = self.pw_adaptors + adapt = torch.cat((adapt[:, 0:1], adapt), dim=-1) # (scale_xy, scale_xy, scale_z) + if self.norm_pw_scale: # normalize so that the product == 1 + adapt = adapt - adapt.mean(dim=1, keepdim=True) + return (adapt / self.pw_break).exp() + + def _get_poses(self, poses): + # normalize rotation + Q = poses[:, :4] + T = signed_expm1(poses[:, 4:7]) + RT = roma.RigidUnitQuat(Q, T).normalize().to_homogeneous() + return RT + + def _set_pose(self, poses, idx, R, T=None, scale=None, force=False): + # all poses == cam-to-world + pose = poses[idx] + if not (pose.requires_grad or force): + return pose + + if R.shape == (4, 4): + assert T is None + T = R[:3, 3] + R = R[:3, :3] + + if R is not None: + pose.data[0:4] = roma.rotmat_to_unitquat(R) + if T is not None: + pose.data[4:7] = signed_log1p(T / (scale or 1)) # translation is function of scale + + if scale is not None: + assert poses.shape[-1] in (8, 13) + pose.data[-1] = np.log(float(scale)) + return pose + + def get_pw_norm_scale_factor(self): + if self.norm_pw_scale: + # normalize scales so that things cannot go south + # we want that exp(scale) ~= self.base_scale + return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp() + else: + return 1 # don't norm scale for known poses + + def get_pw_scale(self): + scale = self.pw_poses[:, -1].exp() # (n_edges,) + scale = scale * self.get_pw_norm_scale_factor() + return scale + + def get_pw_poses(self): # cam to world + RT = self._get_poses(self.pw_poses) + scaled_RT = RT.clone() + scaled_RT[:, :3] *= self.get_pw_scale().view(-1, 1, 1) # scale the rotation AND translation + return scaled_RT + + def get_masks(self): + return [(conf > self.min_conf_thr) for conf in self.im_conf] + + def depth_to_pts3d(self): + raise NotImplementedError() + + def get_pts3d(self, raw=False): + res = self.depth_to_pts3d() + if not raw: + res = [dm[:h*w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)] + return res + + def _set_focal(self, idx, focal, force=False): + raise NotImplementedError() + + def get_focals(self): + raise NotImplementedError() + + def get_known_focal_mask(self): + raise NotImplementedError() + + def get_principal_points(self): + raise NotImplementedError() + + def get_conf(self, mode=None): + trf = self.conf_trf if mode is None else get_conf_trf(mode) + return [trf(c) for c in self.im_conf] + + def get_im_poses(self): + raise NotImplementedError() + + def _set_depthmap(self, idx, depth, force=False): + raise NotImplementedError() + + def get_depthmaps(self, raw=False): + raise NotImplementedError() + + def clean_pointcloud(self, **kw): + cams = inv(self.get_im_poses()) + K = self.get_intrinsics() + depthmaps = self.get_depthmaps() + all_pts3d = self.get_pts3d() + + new_im_confs = clean_pointcloud(self.im_conf, K, cams, depthmaps, all_pts3d, **kw) + + for i, new_conf in enumerate(new_im_confs): + self.im_conf[i].data[:] = new_conf + return self + + def forward(self, ret_details=False): + pw_poses = self.get_pw_poses() # cam-to-world + pw_adapt = self.get_adaptors() + proj_pts3d = self.get_pts3d() + # pre-compute pixel weights + weight_i = {i_j: self.conf_trf(c) for i_j, c in 
self.conf_i.items()} + weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()} + + loss = 0 + if ret_details: + details = -torch.ones((self.n_imgs, self.n_imgs)) + + for e, (i, j) in enumerate(self.edges): + i_j = edge_str(i, j) + # distance in image i and j + aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j]) + aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j]) + li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean() + lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean() + loss = loss + li + lj + + if ret_details: + details[i, j] = li + lj + loss /= self.n_edges # average over all pairs + + if ret_details: + return loss, details + return loss + + @torch.cuda.amp.autocast(enabled=False) + def compute_global_alignment(self, init=None, niter_PnP=10, **kw): + if init is None: + pass + elif init == 'msp' or init == 'mst': + init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP) + elif init == 'known_poses': + init_fun.init_from_known_poses(self, min_conf_thr=self.min_conf_thr, + niter_PnP=niter_PnP) + else: + raise ValueError(f'bad value for {init=}') + + return global_alignment_loop(self, **kw) + + @torch.no_grad() + def mask_sky(self): + res = deepcopy(self) + for i in range(self.n_imgs): + sky = segment_sky(self.imgs[i]) + res.im_conf[i][sky] = 0 + return res + + def show(self, show_pw_cams=False, show_pw_pts3d=False, cam_size=None, **kw): + viz = SceneViz() + if self.imgs is None: + colors = np.random.randint(0, 256, size=(self.n_imgs, 3)) + colors = list(map(tuple, colors.tolist())) + for n in range(self.n_imgs): + viz.add_pointcloud(self.get_pts3d()[n], colors[n], self.get_masks()[n]) + else: + viz.add_pointcloud(self.get_pts3d(), self.imgs, self.get_masks()) + colors = np.random.randint(256, size=(self.n_imgs, 3)) + + # camera poses + im_poses = to_numpy(self.get_im_poses()) + if cam_size is None: + cam_size = auto_cam_size(im_poses) + viz.add_cameras(im_poses, self.get_focals(), colors=colors, + images=self.imgs, imsizes=self.imsizes, cam_size=cam_size) + if show_pw_cams: + pw_poses = self.get_pw_poses() + viz.add_cameras(pw_poses, color=(192, 0, 192), cam_size=cam_size) + + if show_pw_pts3d: + pts = [geotrf(pw_poses[e], self.pred_i[edge_str(i, j)]) for e, (i, j) in enumerate(self.edges)] + viz.add_pointcloud(pts, (128, 0, 128)) + + viz.show(**kw) + return viz + + +def global_alignment_loop(net, lr=0.01, niter=300, schedule='cosine', lr_min=1e-6): + params = [p for p in net.parameters() if p.requires_grad] + if not params: + return net + + verbose = net.verbose + if verbose: + print('Global alignement - optimizing for:') + print([name for name, value in net.named_parameters() if value.requires_grad]) + + lr_base = lr + optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.9)) + + loss = float('inf') + if verbose: + with tqdm.tqdm(total=niter) as bar: + while bar.n < bar.total: + loss, lr = global_alignment_iter(net, bar.n, niter, lr_base, lr_min, optimizer, schedule) + bar.set_postfix_str(f'{lr=:g} loss={loss:g}') + bar.update() + else: + for n in range(niter): + loss, _ = global_alignment_iter(net, n, niter, lr_base, lr_min, optimizer, schedule) + return loss + + +def global_alignment_iter(net, cur_iter, niter, lr_base, lr_min, optimizer, schedule): + t = cur_iter / niter + if schedule == 'cosine': + lr = cosine_schedule(t, lr_base, lr_min) + elif schedule == 'linear': + lr = linear_schedule(t, lr_base, lr_min) + else: + raise ValueError(f'bad lr {schedule=}') + 
adjust_learning_rate_by_lr(optimizer, lr) + optimizer.zero_grad() + loss = net() + loss.backward() + optimizer.step() + + return float(loss), lr + + +@torch.no_grad() +def clean_pointcloud( im_confs, K, cams, depthmaps, all_pts3d, + tol=0.001, bad_conf=0, dbg=()): + """ Method: + 1) express all 3d points in each camera coordinate frame + 2) if they're in front of a depthmap --> then lower their confidence + """ + assert len(im_confs) == len(cams) == len(K) == len(depthmaps) == len(all_pts3d) + assert 0 <= tol < 1 + res = [c.clone() for c in im_confs] + + # reshape appropriately + all_pts3d = [p.view(*c.shape,3) for p,c in zip(all_pts3d, im_confs)] + depthmaps = [d.view(*c.shape) for d,c in zip(depthmaps, im_confs)] + + for i, pts3d in enumerate(all_pts3d): + for j in range(len(all_pts3d)): + if i == j: continue + + # project 3dpts in other view + proj = geotrf(cams[j], pts3d) + proj_depth = proj[:,:,2] + u,v = geotrf(K[j], proj, norm=1, ncol=2).round().long().unbind(-1) + + # check which points are actually in the visible cone + H, W = im_confs[j].shape + msk_i = (proj_depth > 0) & (0 <= u) & (u < W) & (0 <= v) & (v < H) + msk_j = v[msk_i], u[msk_i] + + # find bad points = those in front but less confident + bad_points = (proj_depth[msk_i] < (1-tol) * depthmaps[j][msk_j]) & (res[i][msk_i] < res[j][msk_j]) + + bad_msk_i = msk_i.clone() + bad_msk_i[msk_i] = bad_points + res[i][bad_msk_i] = res[i][bad_msk_i].clip_(max=bad_conf) + + return res diff --git a/dust3r/dust3r/cloud_opt/commons.py b/dust3r/dust3r/cloud_opt/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..3be9f855a69ea18c82dcc8e5769e0149a59649bd --- /dev/null +++ b/dust3r/dust3r/cloud_opt/commons.py @@ -0,0 +1,90 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
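+#
+# A minimal illustrative sketch (not part of the upstream file; the tensor
+# values are assumptions) of the helpers defined below: signed_log1p and
+# signed_expm1 form an inverse pair (used to parameterize translations in
+# log-space), and the schedules interpolate the learning rate from lr_start
+# at t=0 to lr_end at t=1:
+#
+#   import torch
+#   t = torch.tensor([-3.0, 0.0, 3.0])
+#   assert torch.allclose(signed_expm1(signed_log1p(t)), t)
+#   cosine_schedule(0.0, 0.01, 1e-6)   # ~= 0.01 (lr_start)
+#   cosine_schedule(1.0, 0.01, 1e-6)   # ~= 1e-6 (lr_end)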
+# +# -------------------------------------------------------- +# utility functions for global alignment +# -------------------------------------------------------- +import torch +import torch.nn as nn +import numpy as np + + +def edge_str(i, j): + return f'{i}_{j}' + + +def i_j_ij(ij): + return edge_str(*ij), ij + + +def edge_conf(conf_i, conf_j, edge): + return float(conf_i[edge].mean() * conf_j[edge].mean()) + + +def compute_edge_scores(edges, conf_i, conf_j): + return {(i, j): edge_conf(conf_i, conf_j, e) for e, (i, j) in edges} + + +def NoGradParamDict(x): + assert isinstance(x, dict) + return nn.ParameterDict(x).requires_grad_(False) + + +def get_imshapes(edges, pred_i, pred_j): + n_imgs = max(max(e) for e in edges) + 1 + imshapes = [None] * n_imgs + for e, (i, j) in enumerate(edges): + shape_i = tuple(pred_i[e].shape[0:2]) + shape_j = tuple(pred_j[e].shape[0:2]) + if imshapes[i]: + assert imshapes[i] == shape_i, f'incorrect shape for image {i}' + if imshapes[j]: + assert imshapes[j] == shape_j, f'incorrect shape for image {j}' + imshapes[i] = shape_i + imshapes[j] = shape_j + return imshapes + + +def get_conf_trf(mode): + if mode == 'log': + def conf_trf(x): return x.log() + elif mode == 'sqrt': + def conf_trf(x): return x.sqrt() + elif mode == 'm1': + def conf_trf(x): return x-1 + elif mode in ('id', 'none'): + def conf_trf(x): return x + else: + raise ValueError(f'bad mode for {mode=}') + return conf_trf + + +def l2_dist(a, b, weight): + return ((a - b).square().sum(dim=-1) * weight) + + +def l1_dist(a, b, weight): + return ((a - b).norm(dim=-1) * weight) + + +ALL_DISTS = dict(l1=l1_dist, l2=l2_dist) + + +def signed_log1p(x): + sign = torch.sign(x) + return sign * torch.log1p(torch.abs(x)) + + +def signed_expm1(x): + sign = torch.sign(x) + return sign * torch.expm1(torch.abs(x)) + + +def cosine_schedule(t, lr_start, lr_end): + assert 0 <= t <= 1 + return lr_end + (lr_start - lr_end) * (1+np.cos(t * np.pi))/2 + + +def linear_schedule(t, lr_start, lr_end): + assert 0 <= t <= 1 + return lr_start + (lr_end - lr_start) * t diff --git a/dust3r/dust3r/cloud_opt/init_im_poses.py b/dust3r/dust3r/cloud_opt/init_im_poses.py new file mode 100644 index 0000000000000000000000000000000000000000..7887c5cde27115273601e704b81ca0b0301f3715 --- /dev/null +++ b/dust3r/dust3r/cloud_opt/init_im_poses.py @@ -0,0 +1,316 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
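+#
+# Rough flow of the MST initialization implemented below (a descriptive
+# sketch using the symbols from this file, not runnable on its own):
+#
+#   scores = compute_edge_scores(map(i_j_ij, edges), conf_i, conf_j)  # pairwise confidence
+#   msp    = sp.csgraph.minimum_spanning_tree(-dict_to_sparse_graph(scores)).tocoo()
+#   # starting from the most confident pair, each new image is registered to the
+#   # growing point cloud with rigid_points_registration(); any camera still
+#   # missing afterwards falls back to fast_pnp() (RANSAC-PnP over focal guesses).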
+# +# -------------------------------------------------------- +# Initialization functions for global alignment +# -------------------------------------------------------- +from functools import cache + +import numpy as np +import scipy.sparse as sp +import torch +import cv2 +import roma +from tqdm import tqdm + +from dust3r.utils.geometry import geotrf, inv, get_med_dist_between_poses +from dust3r.post_process import estimate_focal_knowing_depth +from dust3r.viz import to_numpy + +from dust3r.cloud_opt.commons import edge_str, i_j_ij, compute_edge_scores + + +@torch.no_grad() +def init_from_known_poses(self, niter_PnP=10, min_conf_thr=3): + device = self.device + + # indices of known poses + nkp, known_poses_msk, known_poses = get_known_poses(self) + assert nkp == self.n_imgs, 'not all poses are known' + + # get all focals + nkf, _, im_focals = get_known_focals(self) + assert nkf == self.n_imgs + im_pp = self.get_principal_points() + + best_depthmaps = {} + # init all pairwise poses + for e, (i, j) in enumerate(tqdm(self.edges, disable=not self.verbose)): + i_j = edge_str(i, j) + + # find relative pose for this pair + P1 = torch.eye(4, device=device) + msk = self.conf_i[i_j] > min(min_conf_thr, self.conf_i[i_j].min() - 0.1) + _, P2 = fast_pnp(self.pred_j[i_j], float(im_focals[i].mean()), + pp=im_pp[i], msk=msk, device=device, niter_PnP=niter_PnP) + + # align the two predicted camera with the two gt cameras + s, R, T = align_multiple_poses(torch.stack((P1, P2)), known_poses[[i, j]]) + # normally we have known_poses[i] ~= sRT_to_4x4(s,R,T,device) @ P1 + # and geotrf(sRT_to_4x4(1,R,T,device), s*P2[:3,3]) + self._set_pose(self.pw_poses, e, R, T, scale=s) + + # remember if this is a good depthmap + score = float(self.conf_i[i_j].mean()) + if score > best_depthmaps.get(i, (0,))[0]: + best_depthmaps[i] = score, i_j, s + + # init all image poses + for n in range(self.n_imgs): + assert known_poses_msk[n] + _, i_j, scale = best_depthmaps[n] + depth = self.pred_i[i_j][:, :, 2] + self._set_depthmap(n, depth * scale) + + +@torch.no_grad() +def init_minimum_spanning_tree(self, **kw): + """ Init all camera poses (image-wise and pairwise poses) given + an initial set of pairwise estimations. 
+ """ + device = self.device + pts3d, _, im_focals, im_poses = minimum_spanning_tree(self.imshapes, self.edges, + self.pred_i, self.pred_j, self.conf_i, self.conf_j, self.im_conf, self.min_conf_thr, + device, has_im_poses=self.has_im_poses, verbose=self.verbose, + **kw) + + return init_from_pts3d(self, pts3d, im_focals, im_poses) + + +def init_from_pts3d(self, pts3d, im_focals, im_poses): + # init poses + nkp, known_poses_msk, known_poses = get_known_poses(self) + if nkp == 1: + raise NotImplementedError("Would be simpler to just align everything afterwards on the single known pose") + elif nkp > 1: + # global rigid SE3 alignment + s, R, T = align_multiple_poses(im_poses[known_poses_msk], known_poses[known_poses_msk]) + trf = sRT_to_4x4(s, R, T, device=known_poses.device) + + # rotate everything + im_poses = trf @ im_poses + im_poses[:, :3, :3] /= s # undo scaling on the rotation part + for img_pts3d in pts3d: + img_pts3d[:] = geotrf(trf, img_pts3d) + + # set all pairwise poses + for e, (i, j) in enumerate(self.edges): + i_j = edge_str(i, j) + # compute transform that goes from cam to world + s, R, T = rigid_points_registration(self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j]) + self._set_pose(self.pw_poses, e, R, T, scale=s) + + # take into account the scale normalization + s_factor = self.get_pw_norm_scale_factor() + im_poses[:, :3, 3] *= s_factor # apply downscaling factor + for img_pts3d in pts3d: + img_pts3d *= s_factor + + # init all image poses + if self.has_im_poses: + for i in range(self.n_imgs): + cam2world = im_poses[i] + depth = geotrf(inv(cam2world), pts3d[i])[..., 2] + self._set_depthmap(i, depth) + self._set_pose(self.im_poses, i, cam2world) + if im_focals[i] is not None: + self._set_focal(i, im_focals[i]) + + if self.verbose: + print(' init loss =', float(self())) + + +def minimum_spanning_tree(imshapes, edges, pred_i, pred_j, conf_i, conf_j, im_conf, min_conf_thr, + device, has_im_poses=True, niter_PnP=10, verbose=True): + n_imgs = len(imshapes) + sparse_graph = -dict_to_sparse_graph(compute_edge_scores(map(i_j_ij, edges), conf_i, conf_j)) + msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo() + + # temp variable to store 3d points + pts3d = [None] * len(imshapes) + + todo = sorted(zip(-msp.data, msp.row, msp.col)) # sorted edges + im_poses = [None] * n_imgs + im_focals = [None] * n_imgs + + # init with strongest edge + score, i, j = todo.pop() + if verbose: + print(f' init edge ({i}*,{j}*) {score=}') + i_j = edge_str(i, j) + pts3d[i] = pred_i[i_j].clone() + pts3d[j] = pred_j[i_j].clone() + done = {i, j} + if has_im_poses: + im_poses[i] = torch.eye(4, device=device) + im_focals[i] = estimate_focal(pred_i[i_j]) + + # set initial pointcloud based on pairwise graph + msp_edges = [(i, j)] + while todo: + # each time, predict the next one + score, i, j = todo.pop() + + if im_focals[i] is None: + im_focals[i] = estimate_focal(pred_i[i_j]) + + if i in done: + if verbose: + print(f' init edge ({i},{j}*) {score=}') + assert j not in done + # align pred[i] with pts3d[i], and then set j accordingly + i_j = edge_str(i, j) + s, R, T = rigid_points_registration(pred_i[i_j], pts3d[i], conf=conf_i[i_j]) + trf = sRT_to_4x4(s, R, T, device) + pts3d[j] = geotrf(trf, pred_j[i_j]) + done.add(j) + msp_edges.append((i, j)) + + if has_im_poses and im_poses[i] is None: + im_poses[i] = sRT_to_4x4(1, R, T, device) + + elif j in done: + if verbose: + print(f' init edge ({i}*,{j}) {score=}') + assert i not in done + i_j = edge_str(i, j) + s, R, T = rigid_points_registration(pred_j[i_j], 
pts3d[j], conf=conf_j[i_j]) + trf = sRT_to_4x4(s, R, T, device) + pts3d[i] = geotrf(trf, pred_i[i_j]) + done.add(i) + msp_edges.append((i, j)) + + if has_im_poses and im_poses[i] is None: + im_poses[i] = sRT_to_4x4(1, R, T, device) + else: + # let's try again later + todo.insert(0, (score, i, j)) + + if has_im_poses: + # complete all missing informations + pair_scores = list(sparse_graph.values()) # already negative scores: less is best + edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[np.argsort(pair_scores)] + for i, j in edges_from_best_to_worse.tolist(): + if im_focals[i] is None: + im_focals[i] = estimate_focal(pred_i[edge_str(i, j)]) + + for i in range(n_imgs): + if im_poses[i] is None: + msk = im_conf[i] > min_conf_thr + res = fast_pnp(pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP) + if res: + im_focals[i], im_poses[i] = res + if im_poses[i] is None: + im_poses[i] = torch.eye(4, device=device) + im_poses = torch.stack(im_poses) + else: + im_poses = im_focals = None + + return pts3d, msp_edges, im_focals, im_poses + + +def dict_to_sparse_graph(dic): + n_imgs = max(max(e) for e in dic) + 1 + res = sp.dok_array((n_imgs, n_imgs)) + for edge, value in dic.items(): + res[edge] = value + return res + + +def rigid_points_registration(pts1, pts2, conf): + R, T, s = roma.rigid_points_registration( + pts1.reshape(-1, 3), pts2.reshape(-1, 3), weights=conf.ravel(), compute_scaling=True) + return s, R, T # return un-scaled (R, T) + + +def sRT_to_4x4(scale, R, T, device): + trf = torch.eye(4, device=device) + trf[:3, :3] = R * scale + trf[:3, 3] = T.ravel() # doesn't need scaling + return trf + + +def estimate_focal(pts3d_i, pp=None): + if pp is None: + H, W, THREE = pts3d_i.shape + assert THREE == 3 + pp = torch.tensor((W/2, H/2), device=pts3d_i.device) + focal = estimate_focal_knowing_depth(pts3d_i.unsqueeze(0), pp.unsqueeze(0), focal_mode='weiszfeld').ravel() + return float(focal) + + +@cache +def pixel_grid(H, W): + return np.mgrid[:W, :H].T.astype(np.float32) + + +def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10): + # extract camera poses and focals with RANSAC-PnP + if msk.sum() < 4: + return None # we need at least 4 points for PnP + pts3d, msk = map(to_numpy, (pts3d, msk)) + + H, W, THREE = pts3d.shape + assert THREE == 3 + pixels = pixel_grid(H, W) + + if focal is None: + S = max(W, H) + tentative_focals = np.geomspace(S/2, S*3, 21) + else: + tentative_focals = [focal] + + if pp is None: + pp = (W/2, H/2) + else: + pp = to_numpy(pp) + + best = 0, + for focal in tentative_focals: + K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)]) + + success, R, T, inliers = cv2.solvePnPRansac(pts3d[msk], pixels[msk], K, None, + iterationsCount=niter_PnP, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP) + if not success: + continue + + score = len(inliers) + if success and score > best[0]: + best = score, R, T, focal + + if not best[0]: + return None + + _, R, T, best_focal = best + R = cv2.Rodrigues(R)[0] # world to cam + R, T = map(torch.from_numpy, (R, T)) + return best_focal, inv(sRT_to_4x4(1, R, T, device)) # cam to world + + +def get_known_poses(self): + if self.has_im_poses: + known_poses_msk = torch.tensor([not (p.requires_grad) for p in self.im_poses]) + known_poses = self.get_im_poses() + return known_poses_msk.sum(), known_poses_msk, known_poses + else: + return 0, None, None + + +def get_known_focals(self): + if self.has_im_poses: + known_focal_msk = self.get_known_focal_mask() + known_focals = self.get_focals() + return 
known_focal_msk.sum(), known_focal_msk, known_focals + else: + return 0, None, None + + +def align_multiple_poses(src_poses, target_poses): + N = len(src_poses) + assert src_poses.shape == target_poses.shape == (N, 4, 4) + + def center_and_z(poses): + eps = get_med_dist_between_poses(poses) / 100 + return torch.cat((poses[:, :3, 3], poses[:, :3, 3] + eps*poses[:, :3, 2])) + R, T, s = roma.rigid_points_registration(center_and_z(src_poses), center_and_z(target_poses), compute_scaling=True) + return s, R, T diff --git a/dust3r/dust3r/cloud_opt/modular_optimizer.py b/dust3r/dust3r/cloud_opt/modular_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..d06464b40276684385c18b9195be1491c6f47f07 --- /dev/null +++ b/dust3r/dust3r/cloud_opt/modular_optimizer.py @@ -0,0 +1,145 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Slower implementation of the global alignment that allows to freeze partial poses/intrinsics +# -------------------------------------------------------- +import numpy as np +import torch +import torch.nn as nn + +from dust3r.cloud_opt.base_opt import BasePCOptimizer +from dust3r.utils.geometry import geotrf +from dust3r.utils.device import to_cpu, to_numpy +from dust3r.utils.geometry import depthmap_to_pts3d + + +class ModularPointCloudOptimizer (BasePCOptimizer): + """ Optimize a global scene, given a list of pairwise observations. + Unlike PointCloudOptimizer, you can fix parts of the optimization process (partial poses/intrinsics) + Graph node: images + Graph edges: observations = (pred1, pred2) + """ + + def __init__(self, *args, optimize_pp=False, fx_and_fy=False, focal_brake=20, **kwargs): + super().__init__(*args, **kwargs) + self.has_im_poses = True # by definition of this class + self.focal_brake = focal_brake + + # adding thing to optimize + self.im_depthmaps = nn.ParameterList(torch.randn(H, W)/10-3 for H, W in self.imshapes) # log(depth) + self.im_poses = nn.ParameterList(self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)) # camera poses + default_focals = [self.focal_brake * np.log(max(H, W)) for H, W in self.imshapes] + self.im_focals = nn.ParameterList(torch.FloatTensor([f, f] if fx_and_fy else [ + f]) for f in default_focals) # camera intrinsics + self.im_pp = nn.ParameterList(torch.zeros((2,)) for _ in range(self.n_imgs)) # camera intrinsics + self.im_pp.requires_grad_(optimize_pp) + + def preset_pose(self, known_poses, pose_msk=None): # cam-to-world + if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2: + known_poses = [known_poses] + for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses): + if self.verbose: + print(f' (setting pose #{idx} = {pose[:3,3]})') + self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose), force=True)) + + # normalize scale if there's less than 1 known pose + n_known_poses = sum((p.requires_grad is False) for p in self.im_poses) + self.norm_pw_scale = (n_known_poses <= 1) + + def preset_intrinsics(self, known_intrinsics, msk=None): + if isinstance(known_intrinsics, torch.Tensor) and known_intrinsics.ndim == 2: + known_intrinsics = [known_intrinsics] + for K in known_intrinsics: + assert K.shape == (3, 3) + self.preset_focal([K.diagonal()[:2].mean() for K in known_intrinsics], msk) + self.preset_principal_point([K[:2, 2] for K in known_intrinsics], msk) + + def preset_focal(self, known_focals, msk=None): + for idx, 
focal in zip(self._get_msk_indices(msk), known_focals): + if self.verbose: + print(f' (setting focal #{idx} = {focal})') + self._no_grad(self._set_focal(idx, focal, force=True)) + + def preset_principal_point(self, known_pp, msk=None): + for idx, pp in zip(self._get_msk_indices(msk), known_pp): + if self.verbose: + print(f' (setting principal point #{idx} = {pp})') + self._no_grad(self._set_principal_point(idx, pp, force=True)) + + def _no_grad(self, tensor): + return tensor.requires_grad_(False) + + def _get_msk_indices(self, msk): + if msk is None: + return range(self.n_imgs) + elif isinstance(msk, int): + return [msk] + elif isinstance(msk, (tuple, list)): + return self._get_msk_indices(np.array(msk)) + elif msk.dtype in (bool, torch.bool, np.bool_): + assert len(msk) == self.n_imgs + return np.where(msk)[0] + elif np.issubdtype(msk.dtype, np.integer): + return msk + else: + raise ValueError(f'bad {msk=}') + + def _set_focal(self, idx, focal, force=False): + param = self.im_focals[idx] + if param.requires_grad or force: # can only init a parameter not already initialized + param.data[:] = self.focal_brake * np.log(focal) + return param + + def get_focals(self): + log_focals = torch.stack(list(self.im_focals), dim=0) + return (log_focals / self.focal_brake).exp() + + def _set_principal_point(self, idx, pp, force=False): + param = self.im_pp[idx] + H, W = self.imshapes[idx] + if param.requires_grad or force: # can only init a parameter not already initialized + param.data[:] = to_cpu(to_numpy(pp) - (W/2, H/2)) / 10 + return param + + def get_principal_points(self): + return torch.stack([pp.new((W/2, H/2))+10*pp for pp, (H, W) in zip(self.im_pp, self.imshapes)]) + + def get_intrinsics(self): + K = torch.zeros((self.n_imgs, 3, 3), device=self.device) + focals = self.get_focals().view(self.n_imgs, -1) + K[:, 0, 0] = focals[:, 0] + K[:, 1, 1] = focals[:, -1] + K[:, :2, 2] = self.get_principal_points() + K[:, 2, 2] = 1 + return K + + def get_im_poses(self): # cam to world + cam2world = self._get_poses(torch.stack(list(self.im_poses))) + return cam2world + + def _set_depthmap(self, idx, depth, force=False): + param = self.im_depthmaps[idx] + if param.requires_grad or force: # can only init a parameter not already initialized + param.data[:] = depth.log().nan_to_num(neginf=0) + return param + + def get_depthmaps(self): + return [d.exp() for d in self.im_depthmaps] + + def depth_to_pts3d(self): + # Get depths and projection params if not provided + focals = self.get_focals() + pp = self.get_principal_points() + im_poses = self.get_im_poses() + depth = self.get_depthmaps() + + # convert focal to (1,2,H,W) constant field + def focal_ex(i): return focals[i][..., None, None].expand(1, *focals[i].shape, *self.imshapes[i]) + # get pointmaps in camera frame + rel_ptmaps = [depthmap_to_pts3d(depth[i][None], focal_ex(i), pp=pp[i:i+1])[0] for i in range(im_poses.shape[0])] + # project to world frame + return [geotrf(pose, ptmap) for pose, ptmap in zip(im_poses, rel_ptmaps)] + + def get_pts3d(self): + return self.depth_to_pts3d() diff --git a/dust3r/dust3r/cloud_opt/optimizer.py b/dust3r/dust3r/cloud_opt/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..42e48613e55faa4ede5a366d1c0bfc4d18ffae4f --- /dev/null +++ b/dust3r/dust3r/cloud_opt/optimizer.py @@ -0,0 +1,248 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
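+#
+# A minimal usage sketch (variable names are assumptions, not part of the
+# upstream file): given symmetrized pairwise predictions of the network, the
+# optimizer defined below is typically built and run as
+#
+#   scene = PointCloudOptimizer(view1, view2, pred1, pred2, optimize_pp=False)
+#   scene.compute_global_alignment(init='mst', niter=300, schedule='cosine', lr=0.01)
+#   pts3d  = scene.get_pts3d()      # list of per-image (H, W, 3) point maps in world frame
+#   poses  = scene.get_im_poses()   # (n_imgs, 4, 4) cam-to-world
+#   focals = scene.get_focals()     # (n_imgs, 1)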
+# +# -------------------------------------------------------- +# Main class for the implementation of the global alignment +# -------------------------------------------------------- +import numpy as np +import torch +import torch.nn as nn + +from dust3r.cloud_opt.base_opt import BasePCOptimizer +from dust3r.utils.geometry import xy_grid, geotrf +from dust3r.utils.device import to_cpu, to_numpy + + +class PointCloudOptimizer(BasePCOptimizer): + """ Optimize a global scene, given a list of pairwise observations. + Graph node: images + Graph edges: observations = (pred1, pred2) + """ + + def __init__(self, *args, optimize_pp=False, focal_break=20, **kwargs): + super().__init__(*args, **kwargs) + + self.has_im_poses = True # by definition of this class + self.focal_break = focal_break + + # adding thing to optimize + self.im_depthmaps = nn.ParameterList(torch.randn(H, W)/10-3 for H, W in self.imshapes) # log(depth) + self.im_poses = nn.ParameterList(self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)) # camera poses + self.im_focals = nn.ParameterList(torch.FloatTensor( + [self.focal_break*np.log(max(H, W))]) for H, W in self.imshapes) # camera intrinsics + self.im_pp = nn.ParameterList(torch.zeros((2,)) for _ in range(self.n_imgs)) # camera intrinsics + self.im_pp.requires_grad_(optimize_pp) + + self.imshape = self.imshapes[0] + im_areas = [h*w for h, w in self.imshapes] + self.max_area = max(im_areas) + + # adding thing to optimize + self.im_depthmaps = ParameterStack(self.im_depthmaps, is_param=True, fill=self.max_area) + self.im_poses = ParameterStack(self.im_poses, is_param=True) + self.im_focals = ParameterStack(self.im_focals, is_param=True) + self.im_pp = ParameterStack(self.im_pp, is_param=True) + self.register_buffer('_pp', torch.tensor([(w/2, h/2) for h, w in self.imshapes])) + self.register_buffer('_grid', ParameterStack( + [xy_grid(W, H, device=self.device) for H, W in self.imshapes], fill=self.max_area)) + + # pre-compute pixel weights + self.register_buffer('_weight_i', ParameterStack( + [self.conf_trf(self.conf_i[i_j]) for i_j in self.str_edges], fill=self.max_area)) + self.register_buffer('_weight_j', ParameterStack( + [self.conf_trf(self.conf_j[i_j]) for i_j in self.str_edges], fill=self.max_area)) + + # precompute aa + self.register_buffer('_stacked_pred_i', ParameterStack(self.pred_i, self.str_edges, fill=self.max_area)) + self.register_buffer('_stacked_pred_j', ParameterStack(self.pred_j, self.str_edges, fill=self.max_area)) + self.register_buffer('_ei', torch.tensor([i for i, j in self.edges])) + self.register_buffer('_ej', torch.tensor([j for i, j in self.edges])) + self.total_area_i = sum([im_areas[i] for i, j in self.edges]) + self.total_area_j = sum([im_areas[j] for i, j in self.edges]) + + def _check_all_imgs_are_selected(self, msk): + assert np.all(self._get_msk_indices(msk) == np.arange(self.n_imgs)), 'incomplete mask!' 
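+
+    # Note on the stacked buffers built in __init__ above (illustrative, with
+    # assumed image sizes): every per-image / per-edge map is ravelled over
+    # H*W and zero-padded to max_area so that it can be stacked into one tensor.
+    #   imshapes = [(384, 512), (288, 384)]  ->  max_area = 384*512 = 196608
+    #   _stacked_pred_i.shape == (n_edges, 196608, 3)
+    #   _weight_i.shape       == (n_edges, 196608)
+    #   im_depthmaps.shape    == (n_imgs, 196608)    # log-depth, zero padding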
+ + def preset_pose(self, known_poses, pose_msk=None): # cam-to-world + self._check_all_imgs_are_selected(pose_msk) + + if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2: + known_poses = [known_poses] + for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses): + if self.verbose: + print(f' (setting pose #{idx} = {pose[:3,3]})') + self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose))) + + # normalize scale if there's less than 1 known pose + n_known_poses = sum((p.requires_grad is False) for p in self.im_poses) + self.norm_pw_scale = (n_known_poses <= 1) + + self.im_poses.requires_grad_(False) + self.norm_pw_scale = False + + def preset_focal(self, known_focals, msk=None): + self._check_all_imgs_are_selected(msk) + + for idx, focal in zip(self._get_msk_indices(msk), known_focals): + if self.verbose: + print(f' (setting focal #{idx} = {focal})') + self._no_grad(self._set_focal(idx, focal)) + + self.im_focals.requires_grad_(False) + + def preset_principal_point(self, known_pp, msk=None): + self._check_all_imgs_are_selected(msk) + + for idx, pp in zip(self._get_msk_indices(msk), known_pp): + if self.verbose: + print(f' (setting principal point #{idx} = {pp})') + self._no_grad(self._set_principal_point(idx, pp)) + + self.im_pp.requires_grad_(False) + + def _get_msk_indices(self, msk): + if msk is None: + return range(self.n_imgs) + elif isinstance(msk, int): + return [msk] + elif isinstance(msk, (tuple, list)): + return self._get_msk_indices(np.array(msk)) + elif msk.dtype in (bool, torch.bool, np.bool_): + assert len(msk) == self.n_imgs + return np.where(msk)[0] + elif np.issubdtype(msk.dtype, np.integer): + return msk + else: + raise ValueError(f'bad {msk=}') + + def _no_grad(self, tensor): + assert tensor.requires_grad, 'it must be True at this point, otherwise no modification occurs' + + def _set_focal(self, idx, focal, force=False): + param = self.im_focals[idx] + if param.requires_grad or force: # can only init a parameter not already initialized + param.data[:] = self.focal_break * np.log(focal) + return param + + def get_focals(self): + log_focals = torch.stack(list(self.im_focals), dim=0) + return (log_focals / self.focal_break).exp() + + def get_known_focal_mask(self): + return torch.tensor([not (p.requires_grad) for p in self.im_focals]) + + def _set_principal_point(self, idx, pp, force=False): + param = self.im_pp[idx] + H, W = self.imshapes[idx] + if param.requires_grad or force: # can only init a parameter not already initialized + param.data[:] = to_cpu(to_numpy(pp) - (W/2, H/2)) / 10 + return param + + def get_principal_points(self): + return self._pp + 10 * self.im_pp + + def get_intrinsics(self): + K = torch.zeros((self.n_imgs, 3, 3), device=self.device) + focals = self.get_focals().flatten() + K[:, 0, 0] = K[:, 1, 1] = focals + K[:, :2, 2] = self.get_principal_points() + K[:, 2, 2] = 1 + return K + + def get_im_poses(self): # cam to world + cam2world = self._get_poses(self.im_poses) + return cam2world + + def _set_depthmap(self, idx, depth, force=False): + depth = _ravel_hw(depth, self.max_area) + + param = self.im_depthmaps[idx] + if param.requires_grad or force: # can only init a parameter not already initialized + param.data[:] = depth.log().nan_to_num(neginf=0) + return param + + def get_depthmaps(self, raw=False): + res = self.im_depthmaps.exp() + if not raw: + res = [dm[:h*w].view(h, w) for dm, (h, w) in zip(res, self.imshapes)] + return res + + def depth_to_pts3d(self): + # Get depths and projection params if not provided + 
focals = self.get_focals() + pp = self.get_principal_points() + im_poses = self.get_im_poses() + depth = self.get_depthmaps(raw=True) + + # get pointmaps in camera frame + rel_ptmaps = _fast_depthmap_to_pts3d(depth, self._grid, focals, pp=pp) + # project to world frame + return geotrf(im_poses, rel_ptmaps) + + def get_pts3d(self, raw=False): + res = self.depth_to_pts3d() + if not raw: + res = [dm[:h*w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)] + return res + + def forward(self): + pw_poses = self.get_pw_poses() # cam-to-world + pw_adapt = self.get_adaptors().unsqueeze(1) + proj_pts3d = self.get_pts3d(raw=True) + + # rotate pairwise prediction according to pw_poses + aligned_pred_i = geotrf(pw_poses, pw_adapt * self._stacked_pred_i) + aligned_pred_j = geotrf(pw_poses, pw_adapt * self._stacked_pred_j) + + # compute the less + li = self.dist(proj_pts3d[self._ei], aligned_pred_i, weight=self._weight_i).sum() / self.total_area_i + lj = self.dist(proj_pts3d[self._ej], aligned_pred_j, weight=self._weight_j).sum() / self.total_area_j + + return li + lj + + +def _fast_depthmap_to_pts3d(depth, pixel_grid, focal, pp): + pp = pp.unsqueeze(1) + focal = focal.unsqueeze(1) + assert focal.shape == (len(depth), 1, 1) + assert pp.shape == (len(depth), 1, 2) + assert pixel_grid.shape == depth.shape + (2,) + depth = depth.unsqueeze(-1) + return torch.cat((depth * (pixel_grid - pp) / focal, depth), dim=-1) + + +def ParameterStack(params, keys=None, is_param=None, fill=0): + if keys is not None: + params = [params[k] for k in keys] + + if fill > 0: + params = [_ravel_hw(p, fill) for p in params] + + requires_grad = params[0].requires_grad + assert all(p.requires_grad == requires_grad for p in params) + + params = torch.stack(list(params)).float().detach() + if is_param or requires_grad: + params = nn.Parameter(params) + params.requires_grad_(requires_grad) + return params + + +def _ravel_hw(tensor, fill=0): + # ravel H,W + tensor = tensor.view((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:]) + + if len(tensor) < fill: + tensor = torch.cat((tensor, tensor.new_zeros((fill - len(tensor),)+tensor.shape[1:]))) + return tensor + + +def acceptable_focal_range(H, W, minf=0.5, maxf=3.5): + focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2)) # size / 1.1547005383792515 + return minf*focal_base, maxf*focal_base + + +def apply_mask(img, msk): + img = img.copy() + img[msk] = 0 + return img diff --git a/dust3r/dust3r/cloud_opt/pair_viewer.py b/dust3r/dust3r/cloud_opt/pair_viewer.py new file mode 100644 index 0000000000000000000000000000000000000000..62ae3b9a5fbca8b96711de051d9d6597830bd488 --- /dev/null +++ b/dust3r/dust3r/cloud_opt/pair_viewer.py @@ -0,0 +1,127 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Dummy optimizer for visualizing pairs +# -------------------------------------------------------- +import numpy as np +import torch +import torch.nn as nn +import cv2 + +from dust3r.cloud_opt.base_opt import BasePCOptimizer +from dust3r.utils.geometry import inv, geotrf, depthmap_to_absolute_camera_coordinates +from dust3r.cloud_opt.commons import edge_str +from dust3r.post_process import estimate_focal_knowing_depth + + +class PairViewer (BasePCOptimizer): + """ + This a Dummy Optimizer. 
+ To use only when the goal is to visualize the results for a pair of images (with is_symmetrized) + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.is_symmetrized and self.n_edges == 2 + self.has_im_poses = True + + # compute all parameters directly from raw input + self.focals = [] + self.pp = [] + rel_poses = [] + confs = [] + for i in range(self.n_imgs): + conf = float(self.conf_i[edge_str(i, 1-i)].mean() * self.conf_j[edge_str(i, 1-i)].mean()) + if self.verbose: + print(f' - {conf=:.3} for edge {i}-{1-i}') + confs.append(conf) + + H, W = self.imshapes[i] + pts3d = self.pred_i[edge_str(i, 1-i)] + pp = torch.tensor((W/2, H/2)) + focal = float(estimate_focal_knowing_depth(pts3d[None], pp, focal_mode='weiszfeld')) + self.focals.append(focal) + self.pp.append(pp) + + # estimate the pose of pts1 in image 2 + pixels = np.mgrid[:W, :H].T.astype(np.float32) + pts3d = self.pred_j[edge_str(1-i, i)].numpy() + assert pts3d.shape[:2] == (H, W) + msk = self.get_masks()[i].numpy() + K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)]) + + try: + res = cv2.solvePnPRansac(pts3d[msk], pixels[msk], K, None, + iterationsCount=100, reprojectionError=5, flags=cv2.SOLVEPNP_SQPNP) + success, R, T, inliers = res + assert success + + R = cv2.Rodrigues(R)[0] # world to cam + pose = inv(np.r_[np.c_[R, T], [(0, 0, 0, 1)]]) # cam to world + except: + pose = np.eye(4) + rel_poses.append(torch.from_numpy(pose.astype(np.float32))) + + # let's use the pair with the most confidence + if confs[0] > confs[1]: + # ptcloud is expressed in camera1 + self.im_poses = [torch.eye(4), rel_poses[1]] # I, cam2-to-cam1 + self.depth = [self.pred_i['0_1'][..., 2], geotrf(inv(rel_poses[1]), self.pred_j['0_1'])[..., 2]] + else: + # ptcloud is expressed in camera2 + self.im_poses = [rel_poses[0], torch.eye(4)] # I, cam1-to-cam2 + self.depth = [geotrf(inv(rel_poses[0]), self.pred_j['1_0'])[..., 2], self.pred_i['1_0'][..., 2]] + + self.im_poses = nn.Parameter(torch.stack(self.im_poses, dim=0), requires_grad=False) + self.focals = nn.Parameter(torch.tensor(self.focals), requires_grad=False) + self.pp = nn.Parameter(torch.stack(self.pp, dim=0), requires_grad=False) + self.depth = nn.ParameterList(self.depth) + for p in self.parameters(): + p.requires_grad = False + + def _set_depthmap(self, idx, depth, force=False): + if self.verbose: + print('_set_depthmap is ignored in PairViewer') + return + + def get_depthmaps(self, raw=False): + depth = [d.to(self.device) for d in self.depth] + return depth + + def _set_focal(self, idx, focal, force=False): + self.focals[idx] = focal + + def get_focals(self): + return self.focals + + def get_known_focal_mask(self): + return torch.tensor([not (p.requires_grad) for p in self.focals]) + + def get_principal_points(self): + return self.pp + + def get_intrinsics(self): + focals = self.get_focals() + pps = self.get_principal_points() + K = torch.zeros((len(focals), 3, 3), device=self.device) + for i in range(len(focals)): + K[i, 0, 0] = K[i, 1, 1] = focals[i] + K[i, :2, 2] = pps[i] + K[i, 2, 2] = 1 + return K + + def get_im_poses(self): + return self.im_poses + + def depth_to_pts3d(self): + pts3d = [] + for d, intrinsics, im_pose in zip(self.depth, self.get_intrinsics(), self.get_im_poses()): + pts, _ = depthmap_to_absolute_camera_coordinates(d.cpu().numpy(), + intrinsics.cpu().numpy(), + im_pose.cpu().numpy()) + pts3d.append(torch.from_numpy(pts).to(device=self.device)) + return pts3d + + def forward(self): + return float('nan') diff --git 
a/dust3r/dust3r/datasets/__init__.py b/dust3r/dust3r/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2123d09ec2840ab5ee9ca43057c35f93233bde89 --- /dev/null +++ b/dust3r/dust3r/datasets/__init__.py @@ -0,0 +1,50 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +from .utils.transforms import * +from .base.batched_sampler import BatchedRandomSampler # noqa +from .arkitscenes import ARKitScenes # noqa +from .blendedmvs import BlendedMVS # noqa +from .co3d import Co3d # noqa +from .habitat import Habitat # noqa +from .megadepth import MegaDepth # noqa +from .scannetpp import ScanNetpp # noqa +from .staticthings3d import StaticThings3D # noqa +from .waymo import Waymo # noqa +from .wildrgbd import WildRGBD # noqa + + +def get_data_loader(dataset, batch_size, num_workers=8, shuffle=True, drop_last=True, pin_mem=True): + import torch + from croco.utils.misc import get_world_size, get_rank + + # pytorch dataset + if isinstance(dataset, str): + dataset = eval(dataset) + + world_size = get_world_size() + rank = get_rank() + + try: + sampler = dataset.make_sampler(batch_size, shuffle=shuffle, world_size=world_size, + rank=rank, drop_last=drop_last) + except (AttributeError, NotImplementedError): + # not avail for this dataset + if torch.distributed.is_initialized(): + sampler = torch.utils.data.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, drop_last=drop_last + ) + elif shuffle: + sampler = torch.utils.data.RandomSampler(dataset) + else: + sampler = torch.utils.data.SequentialSampler(dataset) + + data_loader = torch.utils.data.DataLoader( + dataset, + sampler=sampler, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=pin_mem, + drop_last=drop_last, + ) + + return data_loader diff --git a/dust3r/dust3r/datasets/arkitscenes.py b/dust3r/dust3r/datasets/arkitscenes.py new file mode 100644 index 0000000000000000000000000000000000000000..4fad51acdc18b82cd6a4d227de0dac3b25783e33 --- /dev/null +++ b/dust3r/dust3r/datasets/arkitscenes.py @@ -0,0 +1,102 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
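+#
+# Illustrative construction sketch (the ROOT path and loader sizes are
+# assumptions) combining the dataset class below with get_data_loader from
+# dust3r.datasets:
+#
+#   from dust3r.datasets import ARKitScenes, get_data_loader
+#   dataset = ARKitScenes(split='train', ROOT='data/arkitscenes_processed',
+#                         resolution=224, aug_crop=16)
+#   loader  = get_data_loader(dataset, batch_size=4, num_workers=4, shuffle=True)
+#   view1, view2 = next(iter(loader))   # two dicts with batched img / pts3d / camera_pose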
+# +# -------------------------------------------------------- +# Dataloader for preprocessed arkitscenes +# dataset at https://github.com/apple/ARKitScenes - Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License https://github.com/apple/ARKitScenes/tree/main?tab=readme-ov-file#license +# See datasets_preprocess/preprocess_arkitscenes.py +# -------------------------------------------------------- +import os.path as osp +import cv2 +import numpy as np + +from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset +from dust3r.utils.image import imread_cv2 + + +class ARKitScenes(BaseStereoViewDataset): + def __init__(self, *args, split, ROOT, **kwargs): + self.ROOT = ROOT + super().__init__(*args, **kwargs) + if split == "train": + self.split = "Training" + elif split == "test": + self.split = "Test" + else: + raise ValueError("") + + self.loaded_data = self._load_data(self.split) + + def _load_data(self, split): + with np.load(osp.join(self.ROOT, split, 'all_metadata.npz')) as data: + self.scenes = data['scenes'] + self.sceneids = data['sceneids'] + self.images = data['images'] + self.intrinsics = data['intrinsics'].astype(np.float32) + self.trajectories = data['trajectories'].astype(np.float32) + self.pairs = data['pairs'][:, :2].astype(int) + + def __len__(self): + return len(self.pairs) + + def _get_views(self, idx, resolution, rng): + + image_idx1, image_idx2 = self.pairs[idx] + + views = [] + for view_idx in [image_idx1, image_idx2]: + scene_id = self.sceneids[view_idx] + scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id]) + + intrinsics = self.intrinsics[view_idx] + camera_pose = self.trajectories[view_idx] + basename = self.images[view_idx] + + # Load RGB image + rgb_image = imread_cv2(osp.join(scene_dir, 'vga_wide', basename.replace('.png', '.jpg'))) + # Load depthmap + depthmap = imread_cv2(osp.join(scene_dir, 'lowres_depth', basename), cv2.IMREAD_UNCHANGED) + depthmap = depthmap.astype(np.float32) / 1000 + depthmap[~np.isfinite(depthmap)] = 0 # invalid + + rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary( + rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx) + + views.append(dict( + img=rgb_image, + depthmap=depthmap.astype(np.float32), + camera_pose=camera_pose.astype(np.float32), + camera_intrinsics=intrinsics.astype(np.float32), + dataset='arkitscenes', + label=self.scenes[scene_id] + '_' + basename, + instance=f'{str(idx)}_{str(view_idx)}', + )) + + return views + + +if __name__ == "__main__": + from dust3r.datasets.base.base_stereo_view_dataset import view_name + from dust3r.viz import SceneViz, auto_cam_size + from dust3r.utils.image import rgb + + dataset = ARKitScenes(split='train', ROOT="data/arkitscenes_processed", resolution=224, aug_crop=16) + + for idx in np.random.permutation(len(dataset)): + views = dataset[idx] + assert len(views) == 2 + print(view_name(views[0]), view_name(views[1])) + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]] + cam_size = max(auto_cam_size(poses), 0.001) + for view_idx in [0, 1]: + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=(idx * 255, (1 - idx) * 255, 0), + image=colors, + cam_size=cam_size) + viz.show() diff --git a/dust3r/dust3r/datasets/base/__init__.py 
b/dust3r/dust3r/datasets/base/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e --- /dev/null +++ b/dust3r/dust3r/datasets/base/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). diff --git a/dust3r/dust3r/datasets/base/base_stereo_view_dataset.py b/dust3r/dust3r/datasets/base/base_stereo_view_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..e02c38ede8d8461471057b4294fa747987409910 --- /dev/null +++ b/dust3r/dust3r/datasets/base/base_stereo_view_dataset.py @@ -0,0 +1,220 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# base class for implementing datasets +# -------------------------------------------------------- +import PIL +import numpy as np +import torch + +from dust3r.datasets.base.easy_dataset import EasyDataset +from dust3r.datasets.utils.transforms import ImgNorm +from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates +import dust3r.datasets.utils.cropping as cropping + + +class BaseStereoViewDataset (EasyDataset): + """ Define all basic options. + + Usage: + class MyDataset (BaseStereoViewDataset): + def _get_views(self, idx, rng): + # overload here + views = [] + views.append(dict(img=, ...)) + return views + """ + + def __init__(self, *, # only keyword arguments + split=None, + resolution=None, # square_size or (width, height) or list of [(width,height), ...] + transform=ImgNorm, + aug_crop=False, + seed=None): + self.num_views = 2 + self.split = split + self._set_resolutions(resolution) + + if isinstance(transform, str): + transform = eval(transform) + self.transform = transform + + self.aug_crop = aug_crop + self.seed = seed + + def __len__(self): + return len(self.scenes) + + def get_stats(self): + return f"{len(self)} pairs" + + def __repr__(self): + resolutions_str = '[' + ';'.join(f'{w}x{h}' for w, h in self._resolutions) + ']' + return f"""{type(self).__name__}({self.get_stats()}, + {self.split=}, + {self.seed=}, + resolutions={resolutions_str}, + {self.transform=})""".replace('self.', '').replace('\n', '').replace(' ', '') + + def _get_views(self, idx, resolution, rng): + raise NotImplementedError() + + def __getitem__(self, idx): + if isinstance(idx, tuple): + # the idx is specifying the aspect-ratio + idx, ar_idx = idx + else: + assert len(self._resolutions) == 1 + ar_idx = 0 + + # set-up the rng + if self.seed: # reseed for each __getitem__ + self._rng = np.random.default_rng(seed=self.seed + idx) + elif not hasattr(self, '_rng'): + seed = torch.initial_seed() # this is different for each dataloader process + self._rng = np.random.default_rng(seed=seed) + + # over-loaded code + resolution = self._resolutions[ar_idx] # DO NOT CHANGE THIS (compatible with BatchedRandomSampler) + views = self._get_views(idx, resolution, self._rng) + assert len(views) == self.num_views + + # check data-types + for v, view in enumerate(views): + assert 'pts3d' not in view, f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}" + view['idx'] = (idx, ar_idx, v) + + # encode the image + width, height = view['img'].size + view['true_shape'] = np.int32((height, width)) + view['img'] = self.transform(view['img']) + + assert 'camera_intrinsics' in view + if 
'camera_pose' not in view: + view['camera_pose'] = np.full((4, 4), np.nan, dtype=np.float32) + else: + assert np.isfinite(view['camera_pose']).all(), f'NaN in camera pose for view {view_name(view)}' + assert 'pts3d' not in view + assert 'valid_mask' not in view + assert np.isfinite(view['depthmap']).all(), f'NaN in depthmap for view {view_name(view)}' + pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view) + + view['pts3d'] = pts3d + view['valid_mask'] = valid_mask & np.isfinite(pts3d).all(axis=-1) + + # check all datatypes + for key, val in view.items(): + res, err_msg = is_good_type(key, val) + assert res, f"{err_msg} with {key}={val} for view {view_name(view)}" + K = view['camera_intrinsics'] + + # last thing done! + for view in views: + # transpose to make sure all views are the same size + transpose_to_landscape(view) + # this allows to check whether the RNG is is the same state each time + view['rng'] = int.from_bytes(self._rng.bytes(4), 'big') + return views + + def _set_resolutions(self, resolutions): + assert resolutions is not None, 'undefined resolution' + + if not isinstance(resolutions, list): + resolutions = [resolutions] + + self._resolutions = [] + for resolution in resolutions: + if isinstance(resolution, int): + width = height = resolution + else: + width, height = resolution + assert isinstance(width, int), f'Bad type for {width=} {type(width)=}, should be int' + assert isinstance(height, int), f'Bad type for {height=} {type(height)=}, should be int' + assert width >= height + self._resolutions.append((width, height)) + + def _crop_resize_if_necessary(self, image, depthmap, intrinsics, resolution, rng=None, info=None): + """ This function: + - first downsizes the image with LANCZOS inteprolation, + which is better than bilinear interpolation in + """ + if not isinstance(image, PIL.Image.Image): + image = PIL.Image.fromarray(image) + + # downscale with lanczos interpolation so that image.size == resolution + # cropping centered on the principal point + W, H = image.size + cx, cy = intrinsics[:2, 2].round().astype(int) + min_margin_x = min(cx, W - cx) + min_margin_y = min(cy, H - cy) + # assert min_margin_x > W/5, f'Bad principal point in view={info}' + # assert min_margin_y > H/5, f'Bad principal point in view={info}' + # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy) + l, t = cx - min_margin_x, cy - min_margin_y + r, b = cx + min_margin_x, cy + min_margin_y + crop_bbox = (l, t, r, b) + image, depthmap, intrinsics = cropping.crop_image_depthmap(image, depthmap, intrinsics, crop_bbox) + + # transpose the resolution if necessary + W, H = image.size # new size + assert resolution[0] >= resolution[1] + if H > 1.1 * W: + # image is portrait mode + resolution = resolution[::-1] + elif 0.9 < H / W < 1.1 and resolution[0] != resolution[1]: + # image is square, so we chose (portrait, landscape) randomly + if rng.integers(2): + resolution = resolution[::-1] + + # high-quality Lanczos down-scaling + target_resolution = np.array(resolution) + if self.aug_crop > 1: + target_resolution += rng.integers(0, self.aug_crop) + image, depthmap, intrinsics = cropping.rescale_image_depthmap(image, depthmap, intrinsics, target_resolution) + + # actual cropping (if necessary) with bilinear interpolation + intrinsics2 = cropping.camera_matrix_of_crop(intrinsics, image.size, resolution, offset_factor=0.5) + crop_bbox = cropping.bbox_from_intrinsics_in_out(intrinsics, intrinsics2, resolution) + image, depthmap, intrinsics2 = 
cropping.crop_image_depthmap(image, depthmap, intrinsics, crop_bbox) + + return image, depthmap, intrinsics2 + + +def is_good_type(key, v): + """ returns (is_good, err_msg) + """ + if isinstance(v, (str, int, tuple)): + return True, None + if v.dtype not in (np.float32, torch.float32, bool, np.int32, np.int64, np.uint8): + return False, f"bad {v.dtype=}" + return True, None + + +def view_name(view, batch_index=None): + def sel(x): return x[batch_index] if batch_index not in (None, slice(None)) else x + db = sel(view['dataset']) + label = sel(view['label']) + instance = sel(view['instance']) + return f"{db}/{label}/{instance}" + + +def transpose_to_landscape(view): + height, width = view['true_shape'] + + if width < height: + # rectify portrait to landscape + assert view['img'].shape == (3, height, width) + view['img'] = view['img'].swapaxes(1, 2) + + assert view['valid_mask'].shape == (height, width) + view['valid_mask'] = view['valid_mask'].swapaxes(0, 1) + + assert view['depthmap'].shape == (height, width) + view['depthmap'] = view['depthmap'].swapaxes(0, 1) + + assert view['pts3d'].shape == (height, width, 3) + view['pts3d'] = view['pts3d'].swapaxes(0, 1) + + # transpose x and y pixels + view['camera_intrinsics'] = view['camera_intrinsics'][[1, 0, 2]] diff --git a/dust3r/dust3r/datasets/base/batched_sampler.py b/dust3r/dust3r/datasets/base/batched_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..85f58a65d41bb8101159e032d5b0aac26a7cf1a1 --- /dev/null +++ b/dust3r/dust3r/datasets/base/batched_sampler.py @@ -0,0 +1,74 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Random sampling under a constraint +# -------------------------------------------------------- +import numpy as np +import torch + + +class BatchedRandomSampler: + """ Random sampling under a constraint: each sample in the batch has the same feature, + which is chosen randomly from a known pool of 'features' for each batch. + + For instance, the 'feature' could be the image aspect-ratio. + + The index returned is a tuple (sample_idx, feat_idx). + This sampler ensures that each series of `batch_size` indices has the same `feat_idx`. 
+ """ + + def __init__(self, dataset, batch_size, pool_size, world_size=1, rank=0, drop_last=True): + self.batch_size = batch_size + self.pool_size = pool_size + + self.len_dataset = N = len(dataset) + self.total_size = round_by(N, batch_size*world_size) if drop_last else N + assert world_size == 1 or drop_last, 'must drop the last batch in distributed mode' + + # distributed sampler + self.world_size = world_size + self.rank = rank + self.epoch = None + + def __len__(self): + return self.total_size // self.world_size + + def set_epoch(self, epoch): + self.epoch = epoch + + def __iter__(self): + # prepare RNG + if self.epoch is None: + assert self.world_size == 1 and self.rank == 0, 'use set_epoch() if distributed mode is used' + seed = int(torch.empty((), dtype=torch.int64).random_().item()) + else: + seed = self.epoch + 777 + rng = np.random.default_rng(seed=seed) + + # random indices (will restart from 0 if not drop_last) + sample_idxs = np.arange(self.total_size) + rng.shuffle(sample_idxs) + + # random feat_idxs (same across each batch) + n_batches = (self.total_size+self.batch_size-1) // self.batch_size + feat_idxs = rng.integers(self.pool_size, size=n_batches) + feat_idxs = np.broadcast_to(feat_idxs[:, None], (n_batches, self.batch_size)) + feat_idxs = feat_idxs.ravel()[:self.total_size] + + # put them together + idxs = np.c_[sample_idxs, feat_idxs] # shape = (total_size, 2) + + # Distributed sampler: we select a subset of batches + # make sure the slice for each node is aligned with batch_size + size_per_proc = self.batch_size * ((self.total_size + self.world_size * + self.batch_size-1) // (self.world_size * self.batch_size)) + idxs = idxs[self.rank*size_per_proc: (self.rank+1)*size_per_proc] + + yield from (tuple(idx) for idx in idxs) + + +def round_by(total, multiple, up=False): + if up: + total = total + multiple-1 + return (total//multiple) * multiple diff --git a/dust3r/dust3r/datasets/base/easy_dataset.py b/dust3r/dust3r/datasets/base/easy_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..4939a88f02715a1f80be943ddb6d808e1be84db7 --- /dev/null +++ b/dust3r/dust3r/datasets/base/easy_dataset.py @@ -0,0 +1,157 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# A dataset base class that you can easily resize and combine. +# -------------------------------------------------------- +import numpy as np +from dust3r.datasets.base.batched_sampler import BatchedRandomSampler + + +class EasyDataset: + """ a dataset that you can easily resize and combine. 
+ Examples: + --------- + 2 * dataset ==> duplicate each element 2x + + 10 @ dataset ==> set the size to 10 (random sampling, duplicates if necessary) + + dataset1 + dataset2 ==> concatenate datasets + """ + + def __add__(self, other): + return CatDataset([self, other]) + + def __rmul__(self, factor): + return MulDataset(factor, self) + + def __rmatmul__(self, factor): + return ResizedDataset(factor, self) + + def set_epoch(self, epoch): + pass # nothing to do by default + + def make_sampler(self, batch_size, shuffle=True, world_size=1, rank=0, drop_last=True): + if not (shuffle): + raise NotImplementedError() # cannot deal yet + num_of_aspect_ratios = len(self._resolutions) + return BatchedRandomSampler(self, batch_size, num_of_aspect_ratios, world_size=world_size, rank=rank, drop_last=drop_last) + + +class MulDataset (EasyDataset): + """ Artifically augmenting the size of a dataset. + """ + multiplicator: int + + def __init__(self, multiplicator, dataset): + assert isinstance(multiplicator, int) and multiplicator > 0 + self.multiplicator = multiplicator + self.dataset = dataset + + def __len__(self): + return self.multiplicator * len(self.dataset) + + def __repr__(self): + return f'{self.multiplicator}*{repr(self.dataset)}' + + def __getitem__(self, idx): + if isinstance(idx, tuple): + idx, other = idx + return self.dataset[idx // self.multiplicator, other] + else: + return self.dataset[idx // self.multiplicator] + + @property + def _resolutions(self): + return self.dataset._resolutions + + +class ResizedDataset (EasyDataset): + """ Artifically changing the size of a dataset. + """ + new_size: int + + def __init__(self, new_size, dataset): + assert isinstance(new_size, int) and new_size > 0 + self.new_size = new_size + self.dataset = dataset + + def __len__(self): + return self.new_size + + def __repr__(self): + size_str = str(self.new_size) + for i in range((len(size_str)-1) // 3): + sep = -4*i-3 + size_str = size_str[:sep] + '_' + size_str[sep:] + return f'{size_str} @ {repr(self.dataset)}' + + def set_epoch(self, epoch): + # this random shuffle only depends on the epoch + rng = np.random.default_rng(seed=epoch+777) + + # shuffle all indices + perm = rng.permutation(len(self.dataset)) + + # rotary extension until target size is met + shuffled_idxs = np.concatenate([perm] * (1 + (len(self)-1) // len(self.dataset))) + self._idxs_mapping = shuffled_idxs[:self.new_size] + + assert len(self._idxs_mapping) == self.new_size + + def __getitem__(self, idx): + assert hasattr(self, '_idxs_mapping'), 'You need to call dataset.set_epoch() to use ResizedDataset.__getitem__()' + if isinstance(idx, tuple): + idx, other = idx + return self.dataset[self._idxs_mapping[idx], other] + else: + return self.dataset[self._idxs_mapping[idx]] + + @property + def _resolutions(self): + return self.dataset._resolutions + + +class CatDataset (EasyDataset): + """ Concatenation of several datasets + """ + + def __init__(self, datasets): + for dataset in datasets: + assert isinstance(dataset, EasyDataset) + self.datasets = datasets + self._cum_sizes = np.cumsum([len(dataset) for dataset in datasets]) + + def __len__(self): + return self._cum_sizes[-1] + + def __repr__(self): + # remove uselessly long transform + return ' + '.join(repr(dataset).replace(',transform=Compose( ToTensor() Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))', '') for dataset in self.datasets) + + def set_epoch(self, epoch): + for dataset in self.datasets: + dataset.set_epoch(epoch) + + def __getitem__(self, idx): + other = None + if 
isinstance(idx, tuple): + idx, other = idx + + if not (0 <= idx < len(self)): + raise IndexError() + + db_idx = np.searchsorted(self._cum_sizes, idx, 'right') + dataset = self.datasets[db_idx] + new_idx = idx - (self._cum_sizes[db_idx - 1] if db_idx > 0 else 0) + + if other is not None: + new_idx = (new_idx, other) + return dataset[new_idx] + + @property + def _resolutions(self): + resolutions = self.datasets[0]._resolutions + for dataset in self.datasets[1:]: + assert tuple(dataset._resolutions) == tuple(resolutions) + return resolutions diff --git a/dust3r/dust3r/datasets/blendedmvs.py b/dust3r/dust3r/datasets/blendedmvs.py new file mode 100644 index 0000000000000000000000000000000000000000..93e68c28620cc47a7b1743834e45f82d576126d0 --- /dev/null +++ b/dust3r/dust3r/datasets/blendedmvs.py @@ -0,0 +1,104 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Dataloader for preprocessed BlendedMVS +# dataset at https://github.com/YoYo000/BlendedMVS +# See datasets_preprocess/preprocess_blendedmvs.py +# -------------------------------------------------------- +import os.path as osp +import numpy as np + +from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset +from dust3r.utils.image import imread_cv2 + + +class BlendedMVS (BaseStereoViewDataset): + """ Dataset of outdoor street scenes, 5 images each time + """ + + def __init__(self, *args, ROOT, split=None, **kwargs): + self.ROOT = ROOT + super().__init__(*args, **kwargs) + self._load_data(split) + + def _load_data(self, split): + pairs = np.load(osp.join(self.ROOT, 'blendedmvs_pairs.npy')) + if split is None: + selection = slice(None) + if split == 'train': + # select 90% of all scenes + selection = (pairs['seq_low'] % 10) > 0 + if split == 'val': + # select 10% of all scenes + selection = (pairs['seq_low'] % 10) == 0 + self.pairs = pairs[selection] + + # list of all scenes + self.scenes = np.unique(self.pairs['seq_low']) # low is unique enough + + def __len__(self): + return len(self.pairs) + + def get_stats(self): + return f'{len(self)} pairs from {len(self.scenes)} scenes' + + def _get_views(self, pair_idx, resolution, rng): + seqh, seql, img1, img2, score = self.pairs[pair_idx] + + seq = f"{seqh:08x}{seql:016x}" + seq_path = osp.join(self.ROOT, seq) + + views = [] + + for view_index in [img1, img2]: + impath = f"{view_index:08n}" + image = imread_cv2(osp.join(seq_path, impath + ".jpg")) + depthmap = imread_cv2(osp.join(seq_path, impath + ".exr")) + camera_params = np.load(osp.join(seq_path, impath + ".npz")) + + intrinsics = np.float32(camera_params['intrinsics']) + camera_pose = np.eye(4, dtype=np.float32) + camera_pose[:3, :3] = camera_params['R_cam2world'] + camera_pose[:3, 3] = camera_params['t_cam2world'] + + image, depthmap, intrinsics = self._crop_resize_if_necessary( + image, depthmap, intrinsics, resolution, rng, info=(seq_path, impath)) + + views.append(dict( + img=image, + depthmap=depthmap, + camera_pose=camera_pose, # cam2world + camera_intrinsics=intrinsics, + dataset='BlendedMVS', + label=osp.relpath(seq_path, self.ROOT), + instance=impath)) + + return views + + +if __name__ == '__main__': + from dust3r.datasets.base.base_stereo_view_dataset import view_name + from dust3r.viz import SceneViz, auto_cam_size + from dust3r.utils.image import rgb + + dataset = BlendedMVS(split='train', ROOT="data/blendedmvs_processed", resolution=224, aug_crop=16) + + for idx 
in np.random.permutation(len(dataset)): + views = dataset[idx] + assert len(views) == 2 + print(idx, view_name(views[0]), view_name(views[1])) + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]] + cam_size = max(auto_cam_size(poses), 0.001) + for view_idx in [0, 1]: + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=(idx * 255, (1 - idx) * 255, 0), + image=colors, + cam_size=cam_size) + viz.show() diff --git a/dust3r/dust3r/datasets/co3d.py b/dust3r/dust3r/datasets/co3d.py new file mode 100644 index 0000000000000000000000000000000000000000..2ea5c8555d34b776e7a48396dcd0eecece713e34 --- /dev/null +++ b/dust3r/dust3r/datasets/co3d.py @@ -0,0 +1,165 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Dataloader for preprocessed Co3d_v2 +# dataset at https://github.com/facebookresearch/co3d - Creative Commons Attribution-NonCommercial 4.0 International +# See datasets_preprocess/preprocess_co3d.py +# -------------------------------------------------------- +import os.path as osp +import json +import itertools +from collections import deque + +import cv2 +import numpy as np + +from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset +from dust3r.utils.image import imread_cv2 + + +class Co3d(BaseStereoViewDataset): + def __init__(self, mask_bg=True, *args, ROOT, **kwargs): + self.ROOT = ROOT + super().__init__(*args, **kwargs) + assert mask_bg in (True, False, 'rand') + self.mask_bg = mask_bg + self.dataset_label = 'Co3d_v2' + + # load all scenes + with open(osp.join(self.ROOT, f'selected_seqs_{self.split}.json'), 'r') as f: + self.scenes = json.load(f) + self.scenes = {k: v for k, v in self.scenes.items() if len(v) > 0} + self.scenes = {(k, k2): v2 for k, v in self.scenes.items() + for k2, v2 in v.items()} + self.scene_list = list(self.scenes.keys()) + + # for each scene, we have 100 images ==> 360 degrees (so 25 frames ~= 90 degrees) + # we prepare all combinations such that i-j = +/- [5, 10, .., 90] degrees + self.combinations = [(i, j) + for i, j in itertools.combinations(range(100), 2) + if 0 < abs(i - j) <= 30 and abs(i - j) % 5 == 0] + + self.invalidate = {scene: {} for scene in self.scene_list} + + def __len__(self): + return len(self.scene_list) * len(self.combinations) + + def _get_metadatapath(self, obj, instance, view_idx): + return osp.join(self.ROOT, obj, instance, 'images', f'frame{view_idx:06n}.npz') + + def _get_impath(self, obj, instance, view_idx): + return osp.join(self.ROOT, obj, instance, 'images', f'frame{view_idx:06n}.jpg') + + def _get_depthpath(self, obj, instance, view_idx): + return osp.join(self.ROOT, obj, instance, 'depths', f'frame{view_idx:06n}.jpg.geometric.png') + + def _get_maskpath(self, obj, instance, view_idx): + return osp.join(self.ROOT, obj, instance, 'masks', f'frame{view_idx:06n}.png') + + def _read_depthmap(self, depthpath, input_metadata): + depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED) + depthmap = (depthmap.astype(np.float32) / 65535) * np.nan_to_num(input_metadata['maximum_depth']) + return depthmap + + def _get_views(self, idx, resolution, rng): + # choose a scene + obj, instance = self.scene_list[idx // 
len(self.combinations)] + image_pool = self.scenes[obj, instance] + im1_idx, im2_idx = self.combinations[idx % len(self.combinations)] + + # add a bit of randomness + last = len(image_pool) - 1 + + if resolution not in self.invalidate[obj, instance]: # flag invalid images + self.invalidate[obj, instance][resolution] = [False for _ in range(len(image_pool))] + + # decide now if we mask the bg + mask_bg = (self.mask_bg == True) or (self.mask_bg == 'rand' and rng.choice(2)) + + views = [] + imgs_idxs = [max(0, min(im_idx + rng.integers(-4, 5), last)) for im_idx in [im2_idx, im1_idx]] + imgs_idxs = deque(imgs_idxs) + while len(imgs_idxs) > 0: # some images (few) have zero depth + im_idx = imgs_idxs.pop() + + if self.invalidate[obj, instance][resolution][im_idx]: + # search for a valid image + random_direction = 2 * rng.choice(2) - 1 + for offset in range(1, len(image_pool)): + tentative_im_idx = (im_idx + (random_direction * offset)) % len(image_pool) + if not self.invalidate[obj, instance][resolution][tentative_im_idx]: + im_idx = tentative_im_idx + break + + view_idx = image_pool[im_idx] + + impath = self._get_impath(obj, instance, view_idx) + depthpath = self._get_depthpath(obj, instance, view_idx) + + # load camera params + metadata_path = self._get_metadatapath(obj, instance, view_idx) + input_metadata = np.load(metadata_path) + camera_pose = input_metadata['camera_pose'].astype(np.float32) + intrinsics = input_metadata['camera_intrinsics'].astype(np.float32) + + # load image and depth + rgb_image = imread_cv2(impath) + depthmap = self._read_depthmap(depthpath, input_metadata) + + if mask_bg: + # load object mask + maskpath = self._get_maskpath(obj, instance, view_idx) + maskmap = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED).astype(np.float32) + maskmap = (maskmap / 255.0) > 0.1 + + # update the depthmap with mask + depthmap *= maskmap + + rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary( + rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath) + + num_valid = (depthmap > 0.0).sum() + if num_valid == 0: + # problem, invalidate image and retry + self.invalidate[obj, instance][resolution][im_idx] = True + imgs_idxs.append(im_idx) + continue + + views.append(dict( + img=rgb_image, + depthmap=depthmap, + camera_pose=camera_pose, + camera_intrinsics=intrinsics, + dataset=self.dataset_label, + label=osp.join(obj, instance), + instance=osp.split(impath)[1], + )) + return views + + +if __name__ == "__main__": + from dust3r.datasets.base.base_stereo_view_dataset import view_name + from dust3r.viz import SceneViz, auto_cam_size + from dust3r.utils.image import rgb + + dataset = Co3d(split='train', ROOT="data/co3d_subset_processed", resolution=224, aug_crop=16) + + for idx in np.random.permutation(len(dataset)): + views = dataset[idx] + assert len(views) == 2 + print(view_name(views[0]), view_name(views[1])) + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]] + cam_size = max(auto_cam_size(poses), 0.001) + for view_idx in [0, 1]: + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=(idx * 255, (1 - idx) * 255, 0), + image=colors, + cam_size=cam_size) + viz.show() diff --git a/dust3r/dust3r/datasets/habitat.py b/dust3r/dust3r/datasets/habitat.py new file mode 100644 index 
0000000000000000000000000000000000000000..11ce8a0ffb2134387d5fb794df89834db3ea8c9f --- /dev/null +++ b/dust3r/dust3r/datasets/habitat.py @@ -0,0 +1,107 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Dataloader for preprocessed habitat +# dataset at https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md +# See datasets_preprocess/habitat for more details +# -------------------------------------------------------- +import os.path as osp +import os +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" # noqa +import cv2 # noqa +import numpy as np +from PIL import Image +import json + +from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset + + +class Habitat(BaseStereoViewDataset): + def __init__(self, size, *args, ROOT, **kwargs): + self.ROOT = ROOT + super().__init__(*args, **kwargs) + assert self.split is not None + # loading list of scenes + with open(osp.join(self.ROOT, f'Habitat_{size}_scenes_{self.split}.txt')) as f: + self.scenes = f.read().splitlines() + self.instances = list(range(1, 5)) + + def filter_scene(self, label, instance=None): + if instance: + subscene, instance = instance.split('_') + label += '/' + subscene + self.instances = [int(instance) - 1] + valid = np.bool_([scene.startswith(label) for scene in self.scenes]) + assert sum(valid), 'no scene was selected for {label=} {instance=}' + self.scenes = [scene for i, scene in enumerate(self.scenes) if valid[i]] + + def _get_views(self, idx, resolution, rng): + scene = self.scenes[idx] + data_path, key = osp.split(osp.join(self.ROOT, scene)) + views = [] + two_random_views = [0, rng.choice(self.instances)] # view 0 is connected with all other views + for view_index in two_random_views: + # load the view (and use the next one if this one's broken) + for ii in range(view_index, view_index + 5): + image, depthmap, intrinsics, camera_pose = self._load_one_view(data_path, key, ii % 5, resolution, rng) + if np.isfinite(camera_pose).all(): + break + views.append(dict( + img=image, + depthmap=depthmap, + camera_pose=camera_pose, # cam2world + camera_intrinsics=intrinsics, + dataset='Habitat', + label=osp.relpath(data_path, self.ROOT), + instance=f"{key}_{view_index}")) + return views + + def _load_one_view(self, data_path, key, view_index, resolution, rng): + view_index += 1 # file indices starts at 1 + impath = osp.join(data_path, f"{key}_{view_index}.jpeg") + image = Image.open(impath) + + depthmap_filename = osp.join(data_path, f"{key}_{view_index}_depth.exr") + depthmap = cv2.imread(depthmap_filename, cv2.IMREAD_GRAYSCALE | cv2.IMREAD_ANYDEPTH) + + camera_params_filename = osp.join(data_path, f"{key}_{view_index}_camera_params.json") + with open(camera_params_filename, 'r') as f: + camera_params = json.load(f) + + intrinsics = np.float32(camera_params['camera_intrinsics']) + camera_pose = np.eye(4, dtype=np.float32) + camera_pose[:3, :3] = camera_params['R_cam2world'] + camera_pose[:3, 3] = camera_params['t_cam2world'] + + image, depthmap, intrinsics = self._crop_resize_if_necessary( + image, depthmap, intrinsics, resolution, rng, info=impath) + return image, depthmap, intrinsics, camera_pose + + +if __name__ == "__main__": + from dust3r.datasets.base.base_stereo_view_dataset import view_name + from dust3r.viz import SceneViz, auto_cam_size + from dust3r.utils.image import rgb + + dataset = Habitat(1_000_000, split='train', 
ROOT="data/habitat_processed", + resolution=224, aug_crop=16) + + for idx in np.random.permutation(len(dataset)): + views = dataset[idx] + assert len(views) == 2 + print(view_name(views[0]), view_name(views[1])) + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]] + cam_size = max(auto_cam_size(poses), 0.001) + for view_idx in [0, 1]: + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=(idx * 255, (1 - idx) * 255, 0), + image=colors, + cam_size=cam_size) + viz.show() diff --git a/dust3r/dust3r/datasets/megadepth.py b/dust3r/dust3r/datasets/megadepth.py new file mode 100644 index 0000000000000000000000000000000000000000..8131498b76d855e5293fe79b3686fc42bf87eea8 --- /dev/null +++ b/dust3r/dust3r/datasets/megadepth.py @@ -0,0 +1,123 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Dataloader for preprocessed MegaDepth +# dataset at https://www.cs.cornell.edu/projects/megadepth/ +# See datasets_preprocess/preprocess_megadepth.py +# -------------------------------------------------------- +import os.path as osp +import numpy as np + +from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset +from dust3r.utils.image import imread_cv2 + + +class MegaDepth(BaseStereoViewDataset): + def __init__(self, *args, split, ROOT, **kwargs): + self.ROOT = ROOT + super().__init__(*args, **kwargs) + self.loaded_data = self._load_data(self.split) + + if self.split is None: + pass + elif self.split == 'train': + self.select_scene(('0015', '0022'), opposite=True) + elif self.split == 'val': + self.select_scene(('0015', '0022')) + else: + raise ValueError(f'bad {self.split=}') + + def _load_data(self, split): + with np.load(osp.join(self.ROOT, 'all_metadata.npz')) as data: + self.all_scenes = data['scenes'] + self.all_images = data['images'] + self.pairs = data['pairs'] + + def __len__(self): + return len(self.pairs) + + def get_stats(self): + return f'{len(self)} pairs from {len(self.all_scenes)} scenes' + + def select_scene(self, scene, *instances, opposite=False): + scenes = (scene,) if isinstance(scene, str) else tuple(scene) + scene_id = [s.startswith(scenes) for s in self.all_scenes] + assert any(scene_id), 'no scene found' + + valid = np.in1d(self.pairs['scene_id'], np.nonzero(scene_id)[0]) + if instances: + image_id = [i.startswith(instances) for i in self.all_images] + image_id = np.nonzero(image_id)[0] + assert len(image_id), 'no instance found' + # both together? 
+ if len(instances) == 2: + valid &= np.in1d(self.pairs['im1_id'], image_id) & np.in1d(self.pairs['im2_id'], image_id) + else: + valid &= np.in1d(self.pairs['im1_id'], image_id) | np.in1d(self.pairs['im2_id'], image_id) + + if opposite: + valid = ~valid + assert valid.any() + self.pairs = self.pairs[valid] + + def _get_views(self, pair_idx, resolution, rng): + scene_id, im1_id, im2_id, score = self.pairs[pair_idx] + + scene, subscene = self.all_scenes[scene_id].split() + seq_path = osp.join(self.ROOT, scene, subscene) + + views = [] + + for im_id in [im1_id, im2_id]: + img = self.all_images[im_id] + try: + image = imread_cv2(osp.join(seq_path, img + '.jpg')) + depthmap = imread_cv2(osp.join(seq_path, img + ".exr")) + camera_params = np.load(osp.join(seq_path, img + ".npz")) + except Exception as e: + raise OSError(f'cannot load {img}, got exception {e}') + + intrinsics = np.float32(camera_params['intrinsics']) + camera_pose = np.float32(camera_params['cam2world']) + + image, depthmap, intrinsics = self._crop_resize_if_necessary( + image, depthmap, intrinsics, resolution, rng, info=(seq_path, img)) + + views.append(dict( + img=image, + depthmap=depthmap, + camera_pose=camera_pose, # cam2world + camera_intrinsics=intrinsics, + dataset='MegaDepth', + label=osp.relpath(seq_path, self.ROOT), + instance=img)) + + return views + + +if __name__ == "__main__": + from dust3r.datasets.base.base_stereo_view_dataset import view_name + from dust3r.viz import SceneViz, auto_cam_size + from dust3r.utils.image import rgb + + dataset = MegaDepth(split='train', ROOT="data/megadepth_processed", resolution=224, aug_crop=16) + + for idx in np.random.permutation(len(dataset)): + views = dataset[idx] + assert len(views) == 2 + print(idx, view_name(views[0]), view_name(views[1])) + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]] + cam_size = max(auto_cam_size(poses), 0.001) + for view_idx in [0, 1]: + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=(idx * 255, (1 - idx) * 255, 0), + image=colors, + cam_size=cam_size) + viz.show() diff --git a/dust3r/dust3r/datasets/scannetpp.py b/dust3r/dust3r/datasets/scannetpp.py new file mode 100644 index 0000000000000000000000000000000000000000..520deedd0eb8cba8663af941731d89e0b2e71a80 --- /dev/null +++ b/dust3r/dust3r/datasets/scannetpp.py @@ -0,0 +1,96 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Dataloader for preprocessed scannet++ +# dataset at https://github.com/scannetpp/scannetpp - non-commercial research and educational purposes +# https://kaldir.vc.in.tum.de/scannetpp/static/scannetpp-terms-of-use.pdf +# See datasets_preprocess/preprocess_scannetpp.py +# -------------------------------------------------------- +import os.path as osp +import cv2 +import numpy as np + +from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset +from dust3r.utils.image import imread_cv2 + + +class ScanNetpp(BaseStereoViewDataset): + def __init__(self, *args, ROOT, **kwargs): + self.ROOT = ROOT + super().__init__(*args, **kwargs) + assert self.split == 'train' + self.loaded_data = self._load_data() + + def _load_data(self): + with np.load(osp.join(self.ROOT, 'all_metadata.npz')) as data: + self.scenes = data['scenes'] + self.sceneids = data['sceneids'] + self.images = data['images'] + self.intrinsics = data['intrinsics'].astype(np.float32) + self.trajectories = data['trajectories'].astype(np.float32) + self.pairs = data['pairs'][:, :2].astype(int) + + def __len__(self): + return len(self.pairs) + + def _get_views(self, idx, resolution, rng): + + image_idx1, image_idx2 = self.pairs[idx] + + views = [] + for view_idx in [image_idx1, image_idx2]: + scene_id = self.sceneids[view_idx] + scene_dir = osp.join(self.ROOT, self.scenes[scene_id]) + + intrinsics = self.intrinsics[view_idx] + camera_pose = self.trajectories[view_idx] + basename = self.images[view_idx] + + # Load RGB image + rgb_image = imread_cv2(osp.join(scene_dir, 'images', basename + '.jpg')) + # Load depthmap + depthmap = imread_cv2(osp.join(scene_dir, 'depth', basename + '.png'), cv2.IMREAD_UNCHANGED) + depthmap = depthmap.astype(np.float32) / 1000 + depthmap[~np.isfinite(depthmap)] = 0 # invalid + + rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary( + rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx) + + views.append(dict( + img=rgb_image, + depthmap=depthmap.astype(np.float32), + camera_pose=camera_pose.astype(np.float32), + camera_intrinsics=intrinsics.astype(np.float32), + dataset='ScanNet++', + label=self.scenes[scene_id] + '_' + basename, + instance=f'{str(idx)}_{str(view_idx)}', + )) + return views + + +if __name__ == "__main__": + from dust3r.datasets.base.base_stereo_view_dataset import view_name + from dust3r.viz import SceneViz, auto_cam_size + from dust3r.utils.image import rgb + + dataset = ScanNetpp(split='train', ROOT="data/scannetpp_processed", resolution=224, aug_crop=16) + + for idx in np.random.permutation(len(dataset)): + views = dataset[idx] + assert len(views) == 2 + print(view_name(views[0]), view_name(views[1])) + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]] + cam_size = max(auto_cam_size(poses), 0.001) + for view_idx in [0, 1]: + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=(idx*255, (1 - idx)*255, 0), + image=colors, + cam_size=cam_size) + viz.show() diff --git a/dust3r/dust3r/datasets/staticthings3d.py b/dust3r/dust3r/datasets/staticthings3d.py new file mode 100644 index 0000000000000000000000000000000000000000..e7f70f0ee7bf8c8ab6bb1702aa2481f3d16df413 --- /dev/null +++ b/dust3r/dust3r/datasets/staticthings3d.py @@ -0,0 
+1,96 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Dataloader for preprocessed StaticThings3D +# dataset at https://github.com/lmb-freiburg/robustmvd/ +# See datasets_preprocess/preprocess_staticthings3d.py +# -------------------------------------------------------- +import os.path as osp +import numpy as np + +from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset +from dust3r.utils.image import imread_cv2 + + +class StaticThings3D (BaseStereoViewDataset): + """ Dataset of indoor scenes, 5 images each time + """ + def __init__(self, ROOT, *args, mask_bg='rand', **kwargs): + self.ROOT = ROOT + super().__init__(*args, **kwargs) + + assert mask_bg in (True, False, 'rand') + self.mask_bg = mask_bg + + # loading all pairs + assert self.split is None + self.pairs = np.load(osp.join(ROOT, 'staticthings_pairs.npy')) + + def __len__(self): + return len(self.pairs) + + def get_stats(self): + return f'{len(self)} pairs' + + def _get_views(self, pair_idx, resolution, rng): + scene, seq, cam1, im1, cam2, im2 = self.pairs[pair_idx] + seq_path = osp.join('TRAIN', scene.decode('ascii'), f'{seq:04d}') + + views = [] + + mask_bg = (self.mask_bg == True) or (self.mask_bg == 'rand' and rng.choice(2)) + + CAM = {b'l':'left', b'r':'right'} + for cam, idx in [(CAM[cam1], im1), (CAM[cam2], im2)]: + num = f"{idx:04n}" + img = num+"_clean.jpg" if rng.choice(2) else num+"_final.jpg" + image = imread_cv2(osp.join(self.ROOT, seq_path, cam, img)) + depthmap = imread_cv2(osp.join(self.ROOT, seq_path, cam, num+".exr")) + camera_params = np.load(osp.join(self.ROOT, seq_path, cam, num+".npz")) + + intrinsics = camera_params['intrinsics'] + camera_pose = camera_params['cam2world'] + + if mask_bg: + depthmap[depthmap > 200] = 0 + + image, depthmap, intrinsics = self._crop_resize_if_necessary(image, depthmap, intrinsics, resolution, rng, info=(seq_path,cam,img)) + + views.append(dict( + img = image, + depthmap = depthmap, + camera_pose = camera_pose, # cam2world + camera_intrinsics = intrinsics, + dataset = 'StaticThings3D', + label = seq_path, + instance = cam+'_'+img)) + + return views + + +if __name__ == '__main__': + from dust3r.datasets.base.base_stereo_view_dataset import view_name + from dust3r.viz import SceneViz, auto_cam_size + from dust3r.utils.image import rgb + + dataset = StaticThings3D(ROOT="data/staticthings3d_processed", resolution=224, aug_crop=16) + + for idx in np.random.permutation(len(dataset)): + views = dataset[idx] + assert len(views) == 2 + print(idx, view_name(views[0]), view_name(views[1])) + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]] + cam_size = max(auto_cam_size(poses), 0.001) + for view_idx in [0, 1]: + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=(idx*255, (1 - idx)*255, 0), + image=colors, + cam_size=cam_size) + viz.show() diff --git a/dust3r/dust3r/datasets/utils/__init__.py b/dust3r/dust3r/datasets/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e --- /dev/null +++ b/dust3r/dust3r/datasets/utils/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024-present Naver 
Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). diff --git a/dust3r/dust3r/datasets/utils/cropping.py b/dust3r/dust3r/datasets/utils/cropping.py new file mode 100644 index 0000000000000000000000000000000000000000..07a331847cb8df997b3012790f5a96f69f21464d --- /dev/null +++ b/dust3r/dust3r/datasets/utils/cropping.py @@ -0,0 +1,124 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# croppping utilities +# -------------------------------------------------------- +import PIL.Image +import os +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" +import cv2 # noqa +import numpy as np # noqa +from dust3r.utils.geometry import colmap_to_opencv_intrinsics, opencv_to_colmap_intrinsics # noqa +try: + lanczos = PIL.Image.Resampling.LANCZOS + bicubic = PIL.Image.Resampling.BICUBIC +except AttributeError: + lanczos = PIL.Image.LANCZOS + bicubic = PIL.Image.BICUBIC + + +class ImageList: + """ Convenience class to aply the same operation to a whole set of images. + """ + + def __init__(self, images): + if not isinstance(images, (tuple, list, set)): + images = [images] + self.images = [] + for image in images: + if not isinstance(image, PIL.Image.Image): + image = PIL.Image.fromarray(image) + self.images.append(image) + + def __len__(self): + return len(self.images) + + def to_pil(self): + return tuple(self.images) if len(self.images) > 1 else self.images[0] + + @property + def size(self): + sizes = [im.size for im in self.images] + assert all(sizes[0] == s for s in sizes) + return sizes[0] + + def resize(self, *args, **kwargs): + return ImageList(self._dispatch('resize', *args, **kwargs)) + + def crop(self, *args, **kwargs): + return ImageList(self._dispatch('crop', *args, **kwargs)) + + def _dispatch(self, func, *args, **kwargs): + return [getattr(im, func)(*args, **kwargs) for im in self.images] + + +def rescale_image_depthmap(image, depthmap, camera_intrinsics, output_resolution, force=True): + """ Jointly rescale a (image, depthmap) + so that (out_width, out_height) >= output_res + """ + image = ImageList(image) + input_resolution = np.array(image.size) # (W,H) + output_resolution = np.array(output_resolution) + if depthmap is not None: + # can also use this with masks instead of depthmaps + assert tuple(depthmap.shape[:2]) == image.size[::-1] + + # define output resolution + assert output_resolution.shape == (2,) + scale_final = max(output_resolution / image.size) + 1e-8 + if scale_final >= 1 and not force: # image is already smaller than what is asked + return (image.to_pil(), depthmap, camera_intrinsics) + output_resolution = np.floor(input_resolution * scale_final).astype(int) + + # first rescale the image so that it contains the crop + image = image.resize(tuple(output_resolution), resample=lanczos if scale_final < 1 else bicubic) + if depthmap is not None: + depthmap = cv2.resize(depthmap, output_resolution, fx=scale_final, + fy=scale_final, interpolation=cv2.INTER_NEAREST) + + # no offset here; simple rescaling + camera_intrinsics = camera_matrix_of_crop( + camera_intrinsics, input_resolution, output_resolution, scaling=scale_final) + + return image.to_pil(), depthmap, camera_intrinsics + + +def camera_matrix_of_crop(input_camera_matrix, input_resolution, output_resolution, scaling=1, offset_factor=0.5, offset=None): + # Margins to offset the origin + margins = np.asarray(input_resolution) * scaling - 
output_resolution + assert np.all(margins >= 0.0) + if offset is None: + offset = offset_factor * margins + + # Generate new camera parameters + output_camera_matrix_colmap = opencv_to_colmap_intrinsics(input_camera_matrix) + output_camera_matrix_colmap[:2, :] *= scaling + output_camera_matrix_colmap[:2, 2] -= offset + output_camera_matrix = colmap_to_opencv_intrinsics(output_camera_matrix_colmap) + + return output_camera_matrix + + +def crop_image_depthmap(image, depthmap, camera_intrinsics, crop_bbox): + """ + Return a crop of the input view. + """ + image = ImageList(image) + l, t, r, b = crop_bbox + + image = image.crop((l, t, r, b)) + depthmap = depthmap[t:b, l:r] + + camera_intrinsics = camera_intrinsics.copy() + camera_intrinsics[0, 2] -= l + camera_intrinsics[1, 2] -= t + + return image.to_pil(), depthmap, camera_intrinsics + + +def bbox_from_intrinsics_in_out(input_camera_matrix, output_camera_matrix, output_resolution): + out_width, out_height = output_resolution + l, t = np.int32(np.round(input_camera_matrix[:2, 2] - output_camera_matrix[:2, 2])) + crop_bbox = (l, t, l + out_width, t + out_height) + return crop_bbox diff --git a/dust3r/dust3r/datasets/utils/transforms.py b/dust3r/dust3r/datasets/utils/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..eb34f2f01d3f8f829ba71a7e03e181bf18f72c25 --- /dev/null +++ b/dust3r/dust3r/datasets/utils/transforms.py @@ -0,0 +1,11 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# DUST3R default transforms +# -------------------------------------------------------- +import torchvision.transforms as tvf +from dust3r.utils.image import ImgNorm + +# define the standard image transforms +ColorJitter = tvf.Compose([tvf.ColorJitter(0.5, 0.5, 0.5, 0.1), ImgNorm]) diff --git a/dust3r/dust3r/datasets/waymo.py b/dust3r/dust3r/datasets/waymo.py new file mode 100644 index 0000000000000000000000000000000000000000..b9a135152cd8973532405b491450c22942dcd6ca --- /dev/null +++ b/dust3r/dust3r/datasets/waymo.py @@ -0,0 +1,93 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Dataloader for preprocessed WayMo +# dataset at https://github.com/waymo-research/waymo-open-dataset +# See datasets_preprocess/preprocess_waymo.py +# -------------------------------------------------------- +import os.path as osp +import numpy as np + +from dust3r.datasets.base.base_stereo_view_dataset import BaseStereoViewDataset +from dust3r.utils.image import imread_cv2 + + +class Waymo (BaseStereoViewDataset): + """ Dataset of outdoor street scenes, 5 images each time + """ + + def __init__(self, *args, ROOT, **kwargs): + self.ROOT = ROOT + super().__init__(*args, **kwargs) + self._load_data() + + def _load_data(self): + with np.load(osp.join(self.ROOT, 'waymo_pairs.npz')) as data: + self.scenes = data['scenes'] + self.frames = data['frames'] + self.inv_frames = {frame: i for i, frame in enumerate(data['frames'])} + self.pairs = data['pairs'] # (array of (scene_id, img1_id, img2_id) + assert self.pairs[:, 0].max() == len(self.scenes) - 1 + + def __len__(self): + return len(self.pairs) + + def get_stats(self): + return f'{len(self)} pairs from {len(self.scenes)} scenes' + + def _get_views(self, pair_idx, resolution, rng): + seq, img1, img2 = self.pairs[pair_idx] + seq_path = osp.join(self.ROOT, self.scenes[seq]) + + views = [] + + for view_index in [img1, img2]: + impath = self.frames[view_index] + image = imread_cv2(osp.join(seq_path, impath + ".jpg")) + depthmap = imread_cv2(osp.join(seq_path, impath + ".exr")) + camera_params = np.load(osp.join(seq_path, impath + ".npz")) + + intrinsics = np.float32(camera_params['intrinsics']) + camera_pose = np.float32(camera_params['cam2world']) + + image, depthmap, intrinsics = self._crop_resize_if_necessary( + image, depthmap, intrinsics, resolution, rng, info=(seq_path, impath)) + + views.append(dict( + img=image, + depthmap=depthmap, + camera_pose=camera_pose, # cam2world + camera_intrinsics=intrinsics, + dataset='Waymo', + label=osp.relpath(seq_path, self.ROOT), + instance=impath)) + + return views + + +if __name__ == '__main__': + from dust3r.datasets.base.base_stereo_view_dataset import view_name + from dust3r.viz import SceneViz, auto_cam_size + from dust3r.utils.image import rgb + + dataset = Waymo(split='train', ROOT="data/megadepth_processed", resolution=224, aug_crop=16) + + for idx in np.random.permutation(len(dataset)): + views = dataset[idx] + assert len(views) == 2 + print(idx, view_name(views[0]), view_name(views[1])) + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]] + cam_size = max(auto_cam_size(poses), 0.001) + for view_idx in [0, 1]: + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=(idx * 255, (1 - idx) * 255, 0), + image=colors, + cam_size=cam_size) + viz.show() diff --git a/dust3r/dust3r/datasets/wildrgbd.py b/dust3r/dust3r/datasets/wildrgbd.py new file mode 100644 index 0000000000000000000000000000000000000000..c41dd0b78402bf8ff1e62c6a50de338aa916e0af --- /dev/null +++ b/dust3r/dust3r/datasets/wildrgbd.py @@ -0,0 +1,67 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Dataloader for preprocessed WildRGB-D +# dataset at https://github.com/wildrgbd/wildrgbd/ +# See datasets_preprocess/preprocess_wildrgbd.py +# -------------------------------------------------------- +import os.path as osp + +import cv2 +import numpy as np + +from dust3r.datasets.co3d import Co3d +from dust3r.utils.image import imread_cv2 + + +class WildRGBD(Co3d): + def __init__(self, mask_bg=True, *args, ROOT, **kwargs): + super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs) + self.dataset_label = 'WildRGBD' + + def _get_metadatapath(self, obj, instance, view_idx): + return osp.join(self.ROOT, obj, instance, 'metadata', f'{view_idx:0>5d}.npz') + + def _get_impath(self, obj, instance, view_idx): + return osp.join(self.ROOT, obj, instance, 'rgb', f'{view_idx:0>5d}.jpg') + + def _get_depthpath(self, obj, instance, view_idx): + return osp.join(self.ROOT, obj, instance, 'depth', f'{view_idx:0>5d}.png') + + def _get_maskpath(self, obj, instance, view_idx): + return osp.join(self.ROOT, obj, instance, 'masks', f'{view_idx:0>5d}.png') + + def _read_depthmap(self, depthpath, input_metadata): + # We store depths in the depth scale of 1000. + # That is, when we load depth image and divide by 1000, we could get depth in meters. + depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED) + depthmap = depthmap.astype(np.float32) / 1000.0 + return depthmap + + +if __name__ == "__main__": + from dust3r.datasets.base.base_stereo_view_dataset import view_name + from dust3r.viz import SceneViz, auto_cam_size + from dust3r.utils.image import rgb + + dataset = WildRGBD(split='train', ROOT="data/wildrgbd_processed", resolution=224, aug_crop=16) + + for idx in np.random.permutation(len(dataset)): + views = dataset[idx] + assert len(views) == 2 + print(view_name(views[0]), view_name(views[1])) + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in [0, 1]] + cam_size = max(auto_cam_size(poses), 0.001) + for view_idx in [0, 1]: + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=(idx * 255, (1 - idx) * 255, 0), + image=colors, + cam_size=cam_size) + viz.show() diff --git a/dust3r/dust3r/demo.py b/dust3r/dust3r/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..54f744dded35442947aba57762e3f4e6c83e28dc --- /dev/null +++ b/dust3r/dust3r/demo.py @@ -0,0 +1,320 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# gradio demo +# -------------------------------------------------------- +import argparse +import math +import builtins +import datetime +import gradio +import os +import torch +import numpy as np +import functools +import trimesh +import copy +import tempfile +from scipy.spatial.transform import Rotation + +from dust3r.model import AsymmetricCroCo3DStereo +from dust3r.inference import inference +from dust3r.image_pairs import make_pairs +from dust3r.utils.image import load_images, rgb +from dust3r.utils.device import to_numpy +from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes +from dust3r.cloud_opt import global_aligner, GlobalAlignerMode + +import matplotlib.pyplot as pl +pl.ion() + + +def get_args_parser(): + parser = argparse.ArgumentParser() + parser_url = parser.add_mutually_exclusive_group() + parser_url.add_argument("--local_network", action='store_true', default=False, + help="make app accessible on local network: address will be set to 0.0.0.0") + parser_url.add_argument("--server_name", type=str, default=None, help="server url, default is 127.0.0.1") + parser.add_argument("--image_size", type=int, default=512, choices=[512, 224], help="image size") + parser.add_argument("--server_port", type=int, help=("will start gradio app on this port (if available). " + "If None, will search for an available port starting at 7860."), + default=None) + parser_weights = parser.add_mutually_exclusive_group(required=True) + parser_weights.add_argument("--weights", type=str, help="path to the model weights", default=None) + parser_weights.add_argument("--model_name", type=str, help="name of the model weights", + choices=["DUSt3R_ViTLarge_BaseDecoder_512_dpt", + "DUSt3R_ViTLarge_BaseDecoder_512_linear", + "DUSt3R_ViTLarge_BaseDecoder_224_linear"]) + parser.add_argument("--device", type=str, default='cuda', help="pytorch device") + parser.add_argument("--tmp_dir", type=str, default=None, help="value for tempfile.tempdir") + parser.add_argument("--silent", action='store_true', default=False, + help="silence logs") + return parser + + +def set_print_with_timestamp(time_format="%Y-%m-%d %H:%M:%S"): + builtin_print = builtins.print + + def print_with_timestamp(*args, **kwargs): + now = datetime.datetime.now() + formatted_date_time = now.strftime(time_format) + + builtin_print(f'[{formatted_date_time}] ', end='') # print with time stamp + builtin_print(*args, **kwargs) + + builtins.print = print_with_timestamp + + +def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, cam_size=0.05, + cam_color=None, as_pointcloud=False, + transparent_cams=False, silent=False): + assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals) + pts3d = to_numpy(pts3d) + imgs = to_numpy(imgs) + focals = to_numpy(focals) + cams2world = to_numpy(cams2world) + + scene = trimesh.Scene() + + # full pointcloud + if as_pointcloud: + pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)]) + col = np.concatenate([p[m] for p, m in zip(imgs, mask)]) + pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3)) + scene.add_geometry(pct) + else: + meshes = [] + for i in range(len(imgs)): + meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i], mask[i])) + mesh = trimesh.Trimesh(**cat_meshes(meshes)) + scene.add_geometry(mesh) + + # add each camera + for i, pose_c2w in enumerate(cams2world): + if isinstance(cam_color, list): + camera_edge_color = cam_color[i] + else: + camera_edge_color 
= cam_color or CAM_COLORS[i % len(CAM_COLORS)] + add_scene_cam(scene, pose_c2w, camera_edge_color, + None if transparent_cams else imgs[i], focals[i], + imsize=imgs[i].shape[1::-1], screen_width=cam_size) + + rot = np.eye(4) + rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix() + scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot)) + outfile = os.path.join(outdir, 'scene.glb') + if not silent: + print('(exporting 3D scene to', outfile, ')') + scene.export(file_obj=outfile) + return outfile + + +def get_3D_model_from_scene(outdir, silent, scene, min_conf_thr=3, as_pointcloud=False, mask_sky=False, + clean_depth=False, transparent_cams=False, cam_size=0.05): + """ + extract 3D_model (glb file) from a reconstructed scene + """ + if scene is None: + return None + # post processes + if clean_depth: + scene = scene.clean_pointcloud() + if mask_sky: + scene = scene.mask_sky() + + # get optimized values from scene + rgbimg = scene.imgs + focals = scene.get_focals().cpu() + cams2world = scene.get_im_poses().cpu() + # 3D pointcloud from depthmap, poses and intrinsics + pts3d = to_numpy(scene.get_pts3d()) + scene.min_conf_thr = float(scene.conf_trf(torch.tensor(min_conf_thr))) + msk = to_numpy(scene.get_masks()) + return _convert_scene_output_to_glb(outdir, rgbimg, pts3d, msk, focals, cams2world, as_pointcloud=as_pointcloud, + transparent_cams=transparent_cams, cam_size=cam_size, silent=silent) + + +def get_reconstructed_scene(outdir, model, device, silent, image_size, filelist, schedule, niter, min_conf_thr, + as_pointcloud, mask_sky, clean_depth, transparent_cams, cam_size, + scenegraph_type, winsize, refid): + """ + from a list of images, run dust3r inference, global aligner. + then run get_3D_model_from_scene + """ + try: + square_ok = model.square_ok + except Exception as e: + square_ok = False + imgs = load_images(filelist, size=image_size, verbose=not silent, patch_size=model.patch_size, square_ok=square_ok) + if len(imgs) == 1: + imgs = [imgs[0], copy.deepcopy(imgs[0])] + imgs[1]['idx'] = 1 + if scenegraph_type == "swin": + scenegraph_type = scenegraph_type + "-" + str(winsize) + elif scenegraph_type == "oneref": + scenegraph_type = scenegraph_type + "-" + str(refid) + + pairs = make_pairs(imgs, scene_graph=scenegraph_type, prefilter=None, symmetrize=True) + output = inference(pairs, model, device, batch_size=1, verbose=not silent) + + mode = GlobalAlignerMode.PointCloudOptimizer if len(imgs) > 2 else GlobalAlignerMode.PairViewer + scene = global_aligner(output, device=device, mode=mode, verbose=not silent) + lr = 0.01 + + if mode == GlobalAlignerMode.PointCloudOptimizer: + loss = scene.compute_global_alignment(init='mst', niter=niter, schedule=schedule, lr=lr) + + outfile = get_3D_model_from_scene(outdir, silent, scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size) + + # also return rgb, depth and confidence imgs + # depth is normalized with the max value for all images + # we apply the jet colormap on the confidence maps + rgbimg = scene.imgs + depths = to_numpy(scene.get_depthmaps()) + confs = to_numpy([c for c in scene.im_conf]) + cmap = pl.get_cmap('jet') + depths_max = max([d.max() for d in depths]) + depths = [d / depths_max for d in depths] + confs_max = max([d.max() for d in confs]) + confs = [cmap(d / confs_max) for d in confs] + + imgs = [] + for i in range(len(rgbimg)): + imgs.append(rgbimg[i]) + imgs.append(rgb(depths[i])) + imgs.append(rgb(confs[i])) + + return scene, outfile, imgs + + +def 
set_scenegraph_options(inputfiles, winsize, refid, scenegraph_type): + num_files = len(inputfiles) if inputfiles is not None else 1 + max_winsize = max(1, math.ceil((num_files - 1) / 2)) + if scenegraph_type == "swin": + winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize, + minimum=1, maximum=max_winsize, step=1, visible=True) + refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, + maximum=num_files - 1, step=1, visible=False) + elif scenegraph_type == "oneref": + winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize, + minimum=1, maximum=max_winsize, step=1, visible=False) + refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, + maximum=num_files - 1, step=1, visible=True) + else: + winsize = gradio.Slider(label="Scene Graph: Window Size", value=max_winsize, + minimum=1, maximum=max_winsize, step=1, visible=False) + refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, + maximum=num_files - 1, step=1, visible=False) + return winsize, refid + + +def main_demo(tmpdirname, model, device, image_size, server_name, server_port, silent=False): + recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, model, device, silent, image_size) + model_from_scene_fun = functools.partial(get_3D_model_from_scene, tmpdirname, silent) + with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%};""", title="DUSt3R Demo") as demo: + # scene state is save so that you can change conf_thr, cam_size... without rerunning the inference + scene = gradio.State(None) + gradio.HTML('

<h2 style="text-align: center;">DUSt3R Demo</h2>

') + with gradio.Column(): + inputfiles = gradio.File(file_count="multiple") + with gradio.Row(): + schedule = gradio.Dropdown(["linear", "cosine"], + value='linear', label="schedule", info="For global alignment!") + niter = gradio.Number(value=300, precision=0, minimum=0, maximum=5000, + label="num_iterations", info="For global alignment!") + scenegraph_type = gradio.Dropdown([("complete: all possible image pairs", "complete"), + ("swin: sliding window", "swin"), + ("oneref: match one image with all", "oneref")], + value='complete', label="Scenegraph", + info="Define how to make pairs", + interactive=True) + winsize = gradio.Slider(label="Scene Graph: Window Size", value=1, + minimum=1, maximum=1, step=1, visible=False) + refid = gradio.Slider(label="Scene Graph: Id", value=0, minimum=0, maximum=0, step=1, visible=False) + + run_btn = gradio.Button("Run") + + with gradio.Row(): + # adjust the confidence threshold + min_conf_thr = gradio.Slider(label="min_conf_thr", value=3.0, minimum=1.0, maximum=20, step=0.1) + # adjust the camera size in the output pointcloud + cam_size = gradio.Slider(label="cam_size", value=0.05, minimum=0.001, maximum=0.1, step=0.001) + with gradio.Row(): + as_pointcloud = gradio.Checkbox(value=False, label="As pointcloud") + # two post process implemented + mask_sky = gradio.Checkbox(value=False, label="Mask sky") + clean_depth = gradio.Checkbox(value=True, label="Clean-up depthmaps") + transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras") + + outmodel = gradio.Model3D() + outgallery = gradio.Gallery(label='rgb,depth,confidence', columns=3, height="100%") + + # events + scenegraph_type.change(set_scenegraph_options, + inputs=[inputfiles, winsize, refid, scenegraph_type], + outputs=[winsize, refid]) + inputfiles.change(set_scenegraph_options, + inputs=[inputfiles, winsize, refid, scenegraph_type], + outputs=[winsize, refid]) + run_btn.click(fn=recon_fun, + inputs=[inputfiles, schedule, niter, min_conf_thr, as_pointcloud, + mask_sky, clean_depth, transparent_cams, cam_size, + scenegraph_type, winsize, refid], + outputs=[scene, outmodel, outgallery]) + min_conf_thr.release(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + cam_size.change(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + as_pointcloud.change(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + mask_sky.change(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + clean_depth.change(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + transparent_cams.change(model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, mask_sky, + clean_depth, transparent_cams, cam_size], + outputs=outmodel) + demo.launch(share=False, server_name=server_name, server_port=server_port) + + +def main(): + torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12 + + parser = get_args_parser() + args = parser.parse_args() + set_print_with_timestamp() + + if args.tmp_dir is not None: + tmp_path = args.tmp_dir + os.makedirs(tmp_path, exist_ok=True) + tempfile.tempdir = tmp_path + + if args.server_name is 
not None: + server_name = args.server_name + else: + server_name = '0.0.0.0' if args.local_network else '127.0.0.1' + + if args.weights is not None: + weights_path = args.weights + else: + weights_path = "naver/" + args.model_name + model = AsymmetricCroCo3DStereo.from_pretrained(weights_path).to(args.device) + + # dust3r will write the 3D model inside tmpdirname + with tempfile.TemporaryDirectory(suffix='dust3r_gradio_demo') as tmpdirname: + if not args.silent: + print('Outputing stuff in', tmpdirname) + main_demo(tmpdirname, model, args.device, args.image_size, server_name, args.server_port, silent=args.silent) diff --git a/dust3r/dust3r/heads/__init__.py b/dust3r/dust3r/heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53d0aa5610cae95f34f96bdb3ff9e835a2d6208e --- /dev/null +++ b/dust3r/dust3r/heads/__init__.py @@ -0,0 +1,19 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# head factory +# -------------------------------------------------------- +from .linear_head import LinearPts3d +from .dpt_head import create_dpt_head + + +def head_factory(head_type, output_mode, net, has_conf=False): + """" build a prediction head for the decoder + """ + if head_type == 'linear' and output_mode == 'pts3d': + return LinearPts3d(net, has_conf) + elif head_type == 'dpt' and output_mode == 'pts3d': + return create_dpt_head(net, has_conf=has_conf) + else: + raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}") diff --git a/dust3r/dust3r/heads/dpt_head.py b/dust3r/dust3r/heads/dpt_head.py new file mode 100644 index 0000000000000000000000000000000000000000..0107afe6d51604fb40d13e82535833e0f4df9ef1 --- /dev/null +++ b/dust3r/dust3r/heads/dpt_head.py @@ -0,0 +1,115 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# dpt head implementation for DUST3R +# Downstream heads assume inputs of size B x N x C (where N is the number of tokens) ; +# or if it takes as input the output at every layer, the attribute return_all_layers should be set to True +# the forward function also takes as input a dictionnary img_info with key "height" and "width" +# for PixelwiseTask, the output will be of dimension B x num_channels x H x W +# -------------------------------------------------------- +from einops import rearrange +from typing import List +import torch +import torch.nn as nn +from dust3r.heads.postprocess import postprocess +import dust3r.utils.path_to_croco # noqa: F401 +from croco.models.dpt_block import DPTOutputAdapter # noqa + + +class DPTOutputAdapter_fix(DPTOutputAdapter): + """ + Adapt croco's DPTOutputAdapter implementation for dust3r: + remove duplicated weigths, and fix forward for dust3r + """ + + def init(self, dim_tokens_enc=768): + super().init(dim_tokens_enc) + # these are duplicated weights + del self.act_1_postprocess + del self.act_2_postprocess + del self.act_3_postprocess + del self.act_4_postprocess + + def forward(self, encoder_tokens: List[torch.Tensor], image_size=None): + assert self.dim_tokens_enc is not None, 'Need to call init(dim_tokens_enc) function first' + # H, W = input_info['image_size'] + image_size = self.image_size if image_size is None else image_size + H, W = image_size + # Number of patches in height and width + N_H = H // (self.stride_level * self.P_H) + N_W = W // (self.stride_level * self.P_W) + + # Hook decoder onto 4 layers from specified ViT layers + layers = [encoder_tokens[hook] for hook in self.hooks] + + # Extract only task-relevant tokens and ignore global tokens. 
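+ # (adapt_tokens is assumed to strip any prepended global/cls tokens, leaving exactly N_H * N_W patch tokens so the spatial reshape below is valid)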
+ layers = [self.adapt_tokens(l) for l in layers] + + # Reshape tokens to spatial representation + layers = [rearrange(l, 'b (nh nw) c -> b c nh nw', nh=N_H, nw=N_W) for l in layers] + + layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)] + # Project layers to chosen feature dim + layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)] + + # Fuse layers using refinement stages + path_4 = self.scratch.refinenet4(layers[3])[:, :, :layers[2].shape[2], :layers[2].shape[3]] + path_3 = self.scratch.refinenet3(path_4, layers[2]) + path_2 = self.scratch.refinenet2(path_3, layers[1]) + path_1 = self.scratch.refinenet1(path_2, layers[0]) + + # Output head + out = self.head(path_1) + + return out + + +class PixelwiseTaskWithDPT(nn.Module): + """ DPT module for dust3r, can return 3D points + confidence for all pixels""" + + def __init__(self, *, n_cls_token=0, hooks_idx=None, dim_tokens=None, + output_width_ratio=1, num_channels=1, postprocess=None, depth_mode=None, conf_mode=None, **kwargs): + super(PixelwiseTaskWithDPT, self).__init__() + self.return_all_layers = True # backbone needs to return all layers + self.postprocess = postprocess + self.depth_mode = depth_mode + self.conf_mode = conf_mode + + assert n_cls_token == 0, "Not implemented" + dpt_args = dict(output_width_ratio=output_width_ratio, + num_channels=num_channels, + **kwargs) + if hooks_idx is not None: + dpt_args.update(hooks=hooks_idx) + self.dpt = DPTOutputAdapter_fix(**dpt_args) + dpt_init_args = {} if dim_tokens is None else {'dim_tokens_enc': dim_tokens} + self.dpt.init(**dpt_init_args) + + def forward(self, x, img_info): + out = self.dpt(x, image_size=(img_info[0], img_info[1])) + if self.postprocess: + out = self.postprocess(out, self.depth_mode, self.conf_mode) + return out + + +def create_dpt_head(net, has_conf=False): + """ + return PixelwiseTaskWithDPT for given net params + """ + assert net.dec_depth > 9 + l2 = net.dec_depth + feature_dim = 256 + last_dim = feature_dim//2 + out_nchan = 3 + ed = net.enc_embed_dim + dd = net.dec_embed_dim + return PixelwiseTaskWithDPT(num_channels=out_nchan + has_conf, + feature_dim=feature_dim, + last_dim=last_dim, + hooks_idx=[0, l2*2//4, l2*3//4, l2], + dim_tokens=[ed, dd, dd, dd], + postprocess=postprocess, + depth_mode=net.depth_mode, + conf_mode=net.conf_mode, + head_type='regression') diff --git a/dust3r/dust3r/heads/linear_head.py b/dust3r/dust3r/heads/linear_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6b697f29eaa6f43fad0a3e27a8d9b8f1a602a833 --- /dev/null +++ b/dust3r/dust3r/heads/linear_head.py @@ -0,0 +1,41 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# linear head implementation for DUST3R +# -------------------------------------------------------- +import torch.nn as nn +import torch.nn.functional as F +from dust3r.heads.postprocess import postprocess + + +class LinearPts3d (nn.Module): + """ + Linear head for dust3r + Each token outputs: - 16x16 3D points (+ confidence) + """ + + def __init__(self, net, has_conf=False): + super().__init__() + self.patch_size = net.patch_embed.patch_size[0] + self.depth_mode = net.depth_mode + self.conf_mode = net.conf_mode + self.has_conf = has_conf + + self.proj = nn.Linear(net.dec_embed_dim, (3 + has_conf)*self.patch_size**2) + + def setup(self, croconet): + pass + + def forward(self, decout, img_shape): + H, W = img_shape + tokens = decout[-1] + B, S, D = tokens.shape + + # extract 3D points + feat = self.proj(tokens) # B,S,D + feat = feat.transpose(-1, -2).view(B, -1, H//self.patch_size, W//self.patch_size) + feat = F.pixel_shuffle(feat, self.patch_size) # B,3,H,W + + # permute + norm depth + return postprocess(feat, self.depth_mode, self.conf_mode) diff --git a/dust3r/dust3r/heads/postprocess.py b/dust3r/dust3r/heads/postprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..cd68a90d89b8dcd7d8a4b4ea06ef8b17eb5da093 --- /dev/null +++ b/dust3r/dust3r/heads/postprocess.py @@ -0,0 +1,58 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# post process function for all heads: extract 3D points/confidence from output +# -------------------------------------------------------- +import torch + + +def postprocess(out, depth_mode, conf_mode): + """ + extract 3D points/confidence from prediction head output + """ + fmap = out.permute(0, 2, 3, 1) # B,H,W,3 + res = dict(pts3d=reg_dense_depth(fmap[:, :, :, 0:3], mode=depth_mode)) + + if conf_mode is not None: + res['conf'] = reg_dense_conf(fmap[:, :, :, 3], mode=conf_mode) + return res + + +def reg_dense_depth(xyz, mode): + """ + extract 3D points from prediction head output + """ + mode, vmin, vmax = mode + + no_bounds = (vmin == -float('inf')) and (vmax == float('inf')) + assert no_bounds + + if mode == 'linear': + if no_bounds: + return xyz # [-inf, +inf] + return xyz.clip(min=vmin, max=vmax) + + # distance to origin + d = xyz.norm(dim=-1, keepdim=True) + xyz = xyz / d.clip(min=1e-8) + + if mode == 'square': + return xyz * d.square() + + if mode == 'exp': + return xyz * torch.expm1(d) + + raise ValueError(f'bad {mode=}') + + +def reg_dense_conf(x, mode): + """ + extract confidence from prediction head output + """ + mode, vmin, vmax = mode + if mode == 'exp': + return vmin + x.exp().clip(max=vmax-vmin) + if mode == 'sigmoid': + return (vmax - vmin) * torch.sigmoid(x) + vmin + raise ValueError(f'bad {mode=}') diff --git a/dust3r/dust3r/image_pairs.py b/dust3r/dust3r/image_pairs.py new file mode 100644 index 0000000000000000000000000000000000000000..ebcf902b4d07b83fe83ffceba3f45ca0d74dfcf7 --- /dev/null +++ b/dust3r/dust3r/image_pairs.py @@ -0,0 +1,104 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# utilities needed to load image pairs +# -------------------------------------------------------- +import numpy as np +import torch + + +def make_pairs(imgs, scene_graph='complete', prefilter=None, symmetrize=True): + pairs = [] + if scene_graph == 'complete': # complete graph + for i in range(len(imgs)): + for j in range(i): + pairs.append((imgs[i], imgs[j])) + elif scene_graph.startswith('swin'): + iscyclic = not scene_graph.endswith('noncyclic') + try: + winsize = int(scene_graph.split('-')[1]) + except Exception as e: + winsize = 3 + pairsid = set() + for i in range(len(imgs)): + for j in range(1, winsize + 1): + idx = (i + j) + if iscyclic: + idx = idx % len(imgs) # explicit loop closure + if idx >= len(imgs): + continue + pairsid.add((i, idx) if i < idx else (idx, i)) + for i, j in pairsid: + pairs.append((imgs[i], imgs[j])) + elif scene_graph.startswith('logwin'): + iscyclic = not scene_graph.endswith('noncyclic') + try: + winsize = int(scene_graph.split('-')[1]) + except Exception as e: + winsize = 3 + offsets = [2**i for i in range(winsize)] + pairsid = set() + for i in range(len(imgs)): + ixs_l = [i - off for off in offsets] + ixs_r = [i + off for off in offsets] + for j in ixs_l + ixs_r: + if iscyclic: + j = j % len(imgs) # Explicit loop closure + if j < 0 or j >= len(imgs) or j == i: + continue + pairsid.add((i, j) if i < j else (j, i)) + for i, j in pairsid: + pairs.append((imgs[i], imgs[j])) + elif scene_graph.startswith('oneref'): + refid = int(scene_graph.split('-')[1]) if '-' in scene_graph else 0 + for j in range(len(imgs)): + if j != refid: + pairs.append((imgs[refid], imgs[j])) + if symmetrize: + pairs += [(img2, img1) for img1, img2 in pairs] + + # now, remove edges + if isinstance(prefilter, str) and prefilter.startswith('seq'): + pairs = filter_pairs_seq(pairs, int(prefilter[3:])) + + if isinstance(prefilter, str) and prefilter.startswith('cyc'): + pairs = filter_pairs_seq(pairs, int(prefilter[3:]), cyclic=True) + + return pairs + + +def sel(x, kept): + if isinstance(x, dict): + return {k: sel(v, kept) for k, v in x.items()} + if isinstance(x, (torch.Tensor, np.ndarray)): + return x[kept] + if isinstance(x, (tuple, list)): + return type(x)([x[k] for k in kept]) + + +def _filter_edges_seq(edges, seq_dis_thr, cyclic=False): + # number of images + n = max(max(e) for e in edges) + 1 + + kept = [] + for e, (i, j) in enumerate(edges): + dis = abs(i - j) + if cyclic: + dis = min(dis, abs(i + n - j), abs(i - n - j)) + if dis <= seq_dis_thr: + kept.append(e) + return kept + + +def filter_pairs_seq(pairs, seq_dis_thr, cyclic=False): + edges = [(img1['idx'], img2['idx']) for img1, img2 in pairs] + kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic) + return [pairs[i] for i in kept] + + +def filter_edges_seq(view1, view2, pred1, pred2, seq_dis_thr, cyclic=False): + edges = [(int(i), int(j)) for i, j in zip(view1['idx'], view2['idx'])] + kept = _filter_edges_seq(edges, seq_dis_thr, cyclic=cyclic) + print(f'>> Filtering edges more than {seq_dis_thr} frames apart: kept {len(kept)}/{len(edges)} edges') + return sel(view1, kept), sel(view2, kept), sel(pred1, kept), sel(pred2, kept) diff --git a/dust3r/dust3r/inference.py b/dust3r/dust3r/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..90540486b077add90ca50f62a5072e082cb2f2d7 --- /dev/null +++ b/dust3r/dust3r/inference.py @@ -0,0 +1,150 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. 
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilities needed for the inference +# -------------------------------------------------------- +import tqdm +import torch +from dust3r.utils.device import to_cpu, collate_with_cat +from dust3r.utils.misc import invalid_to_nans +from dust3r.utils.geometry import depthmap_to_pts3d, geotrf + + +def _interleave_imgs(img1, img2): + res = {} + for key, value1 in img1.items(): + value2 = img2[key] + if isinstance(value1, torch.Tensor): + value = torch.stack((value1, value2), dim=1).flatten(0, 1) + else: + value = [x for pair in zip(value1, value2) for x in pair] + res[key] = value + return res + + +def make_batch_symmetric(batch): + view1, view2 = batch + view1, view2 = (_interleave_imgs(view1, view2), _interleave_imgs(view2, view1)) + return view1, view2 + + +def loss_of_one_batch(batch, model, criterion, device, symmetrize_batch=False, use_amp=False, ret=None): + view1, view2 = batch + ignore_keys = set(['depthmap', 'dataset', 'label', 'instance', 'idx', 'true_shape', 'rng']) + for view in batch: + for name in view.keys(): # pseudo_focal + if name in ignore_keys: + continue + view[name] = view[name].to(device, non_blocking=True) + + if symmetrize_batch: + view1, view2 = make_batch_symmetric(batch) + + with torch.cuda.amp.autocast(enabled=bool(use_amp)): + pred1, pred2 = model(view1, view2) + + # loss is supposed to be symmetric + with torch.cuda.amp.autocast(enabled=False): + loss = criterion(view1, view2, pred1, pred2) if criterion is not None else None + + result = dict(view1=view1, view2=view2, pred1=pred1, pred2=pred2, loss=loss) + return result[ret] if ret else result + + +@torch.no_grad() +def inference(pairs, model, device, batch_size=8, verbose=True): + if verbose: + print(f'>> Inference with model on {len(pairs)} image pairs') + result = [] + + # first, check if all images have the same size + multiple_shapes = not (check_if_same_size(pairs)) + if multiple_shapes: # force bs=1 + batch_size = 1 + + for i in tqdm.trange(0, len(pairs), batch_size, disable=not verbose): + res = loss_of_one_batch(collate_with_cat(pairs[i:i + batch_size]), model, None, device) + result.append(to_cpu(res)) + + result = collate_with_cat(result, lists=multiple_shapes) + + return result + + +def check_if_same_size(pairs): + shapes1 = [img1['img'].shape[-2:] for img1, img2 in pairs] + shapes2 = [img2['img'].shape[-2:] for img1, img2 in pairs] + return all(shapes1[0] == s for s in shapes1) and all(shapes2[0] == s for s in shapes2) + + +def get_pred_pts3d(gt, pred, use_pose=False): + if 'depth' in pred and 'pseudo_focal' in pred: + try: + pp = gt['camera_intrinsics'][..., :2, 2] + except KeyError: + pp = None + pts3d = depthmap_to_pts3d(**pred, pp=pp) + + elif 'pts3d' in pred: + # pts3d from my camera + pts3d = pred['pts3d'] + + elif 'pts3d_in_other_view' in pred: + # pts3d from the other camera, already transformed + assert use_pose is True + return pred['pts3d_in_other_view'] # return! 
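+ # In the remaining branches the points are expressed in the prediction's own camera
+ # frame; when use_pose is set, they are transformed with the predicted camera pose below.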
+ + if use_pose: + camera_pose = pred.get('camera_pose') + assert camera_pose is not None + pts3d = geotrf(camera_pose, pts3d) + + return pts3d + + +def find_opt_scaling(gt_pts1, gt_pts2, pr_pts1, pr_pts2=None, fit_mode='weiszfeld_stop_grad', valid1=None, valid2=None): + assert gt_pts1.ndim == pr_pts1.ndim == 4 + assert gt_pts1.shape == pr_pts1.shape + if gt_pts2 is not None: + assert gt_pts2.ndim == pr_pts2.ndim == 4 + assert gt_pts2.shape == pr_pts2.shape + + # concat the pointcloud + nan_gt_pts1 = invalid_to_nans(gt_pts1, valid1).flatten(1, 2) + nan_gt_pts2 = invalid_to_nans(gt_pts2, valid2).flatten(1, 2) if gt_pts2 is not None else None + + pr_pts1 = invalid_to_nans(pr_pts1, valid1).flatten(1, 2) + pr_pts2 = invalid_to_nans(pr_pts2, valid2).flatten(1, 2) if pr_pts2 is not None else None + + all_gt = torch.cat((nan_gt_pts1, nan_gt_pts2), dim=1) if gt_pts2 is not None else nan_gt_pts1 + all_pr = torch.cat((pr_pts1, pr_pts2), dim=1) if pr_pts2 is not None else pr_pts1 + + dot_gt_pr = (all_pr * all_gt).sum(dim=-1) + dot_gt_gt = all_gt.square().sum(dim=-1) + + if fit_mode.startswith('avg'): + # scaling = (all_pr / all_gt).view(B, -1).mean(dim=1) + scaling = dot_gt_pr.nanmean(dim=1) / dot_gt_gt.nanmean(dim=1) + elif fit_mode.startswith('median'): + scaling = (dot_gt_pr / dot_gt_gt).nanmedian(dim=1).values + elif fit_mode.startswith('weiszfeld'): + # init scaling with l2 closed form + scaling = dot_gt_pr.nanmean(dim=1) / dot_gt_gt.nanmean(dim=1) + # iterative re-weighted least-squares + for iter in range(10): + # re-weighting by inverse of distance + dis = (all_pr - scaling.view(-1, 1, 1) * all_gt).norm(dim=-1) + # print(dis.nanmean(-1)) + w = dis.clip_(min=1e-8).reciprocal() + # update the scaling with the new weights + scaling = (w * dot_gt_pr).nanmean(dim=1) / (w * dot_gt_gt).nanmean(dim=1) + else: + raise ValueError(f'bad {fit_mode=}') + + if fit_mode.endswith('stop_grad'): + scaling = scaling.detach() + + scaling = scaling.clip(min=1e-3) + # assert scaling.isfinite().all(), bb() + return scaling diff --git a/dust3r/dust3r/losses.py b/dust3r/dust3r/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..4f8febff1a2dd674e759bcf83d023099a59cc934 --- /dev/null +++ b/dust3r/dust3r/losses.py @@ -0,0 +1,299 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# Implementation of DUSt3R training losses +# -------------------------------------------------------- +from copy import copy, deepcopy +import torch +import torch.nn as nn + +from dust3r.inference import get_pred_pts3d, find_opt_scaling +from dust3r.utils.geometry import inv, geotrf, normalize_pointcloud +from dust3r.utils.geometry import get_joint_pointcloud_depth, get_joint_pointcloud_center_scale + + +def Sum(*losses_and_masks): + loss, mask = losses_and_masks[0] + if loss.ndim > 0: + # we are actually returning the loss for every pixels + return losses_and_masks + else: + # we are returning the global loss + for loss2, mask2 in losses_and_masks[1:]: + loss = loss + loss2 + return loss + + +class BaseCriterion(nn.Module): + def __init__(self, reduction='mean'): + super().__init__() + self.reduction = reduction + + +class LLoss (BaseCriterion): + """ L-norm loss + """ + + def forward(self, a, b): + assert a.shape == b.shape and a.ndim >= 2 and 1 <= a.shape[-1] <= 3, f'Bad shape = {a.shape}' + dist = self.distance(a, b) + assert dist.ndim == a.ndim - 1 # one dimension less + if self.reduction == 'none': + return dist + if self.reduction == 'sum': + return dist.sum() + if self.reduction == 'mean': + return dist.mean() if dist.numel() > 0 else dist.new_zeros(()) + raise ValueError(f'bad {self.reduction=} mode') + + def distance(self, a, b): + raise NotImplementedError() + + +class L21Loss (LLoss): + """ Euclidean distance between 3d points """ + + def distance(self, a, b): + return torch.norm(a - b, dim=-1) # normalized L2 distance + + +L21 = L21Loss() + + +class Criterion (nn.Module): + def __init__(self, criterion=None): + super().__init__() + assert isinstance(criterion, BaseCriterion), f'{criterion} is not a proper criterion!' 
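+ # the criterion is copied below so that later changes to its reduction mode
+ # do not modify the instance supplied by the caller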
+ self.criterion = copy(criterion) + + def get_name(self): + return f'{type(self).__name__}({self.criterion})' + + def with_reduction(self, mode='none'): + res = loss = deepcopy(self) + while loss is not None: + assert isinstance(loss, Criterion) + loss.criterion.reduction = mode # make it return the loss for each sample + loss = loss._loss2 # we assume loss is a Multiloss + return res + + +class MultiLoss (nn.Module): + """ Easily combinable losses (also keep track of individual loss values): + loss = MyLoss1() + 0.1*MyLoss2() + Usage: + Inherit from this class and override get_name() and compute_loss() + """ + + def __init__(self): + super().__init__() + self._alpha = 1 + self._loss2 = None + + def compute_loss(self, *args, **kwargs): + raise NotImplementedError() + + def get_name(self): + raise NotImplementedError() + + def __mul__(self, alpha): + assert isinstance(alpha, (int, float)) + res = copy(self) + res._alpha = alpha + return res + __rmul__ = __mul__ # same + + def __add__(self, loss2): + assert isinstance(loss2, MultiLoss) + res = cur = copy(self) + # find the end of the chain + while cur._loss2 is not None: + cur = cur._loss2 + cur._loss2 = loss2 + return res + + def __repr__(self): + name = self.get_name() + if self._alpha != 1: + name = f'{self._alpha:g}*{name}' + if self._loss2: + name = f'{name} + {self._loss2}' + return name + + def forward(self, *args, **kwargs): + loss = self.compute_loss(*args, **kwargs) + if isinstance(loss, tuple): + loss, details = loss + elif loss.ndim == 0: + details = {self.get_name(): float(loss)} + else: + details = {} + loss = loss * self._alpha + + if self._loss2: + loss2, details2 = self._loss2(*args, **kwargs) + loss = loss + loss2 + details |= details2 + + return loss, details + + +class Regr3D (Criterion, MultiLoss): + """ Ensure that all 3D points are correct. + Asymmetric loss: view1 is supposed to be the anchor. + + P1 = RT1 @ D1 + P2 = RT2 @ D2 + loss1 = (I @ pred_D1) - (RT1^-1 @ RT1 @ D1) + loss2 = (RT21 @ pred_D2) - (RT1^-1 @ P2) + = (RT21 @ pred_D2) - (RT1^-1 @ RT2 @ D2) + """ + + def __init__(self, criterion, norm_mode='avg_dis', gt_scale=False): + super().__init__(criterion) + self.norm_mode = norm_mode + self.gt_scale = gt_scale + + def get_all_pts3d(self, gt1, gt2, pred1, pred2, dist_clip=None): + # everything is normalized w.r.t. 
camera of view1 + in_camera1 = inv(gt1['camera_pose']) + gt_pts1 = geotrf(in_camera1, gt1['pts3d']) # B,H,W,3 + gt_pts2 = geotrf(in_camera1, gt2['pts3d']) # B,H,W,3 + + valid1 = gt1['valid_mask'].clone() + valid2 = gt2['valid_mask'].clone() + + if dist_clip is not None: + # points that are too far-away == invalid + dis1 = gt_pts1.norm(dim=-1) # (B, H, W) + dis2 = gt_pts2.norm(dim=-1) # (B, H, W) + valid1 = valid1 & (dis1 <= dist_clip) + valid2 = valid2 & (dis2 <= dist_clip) + + pr_pts1 = get_pred_pts3d(gt1, pred1, use_pose=False) + pr_pts2 = get_pred_pts3d(gt2, pred2, use_pose=True) + + # normalize 3d points + if self.norm_mode: + pr_pts1, pr_pts2 = normalize_pointcloud(pr_pts1, pr_pts2, self.norm_mode, valid1, valid2) + if self.norm_mode and not self.gt_scale: + gt_pts1, gt_pts2 = normalize_pointcloud(gt_pts1, gt_pts2, self.norm_mode, valid1, valid2) + + return gt_pts1, gt_pts2, pr_pts1, pr_pts2, valid1, valid2, {} + + def compute_loss(self, gt1, gt2, pred1, pred2, **kw): + gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, monitoring = \ + self.get_all_pts3d(gt1, gt2, pred1, pred2, **kw) + # loss on img1 side + l1 = self.criterion(pred_pts1[mask1], gt_pts1[mask1]) + # loss on gt2 side + l2 = self.criterion(pred_pts2[mask2], gt_pts2[mask2]) + self_name = type(self).__name__ + details = {self_name + '_pts3d_1': float(l1.mean()), self_name + '_pts3d_2': float(l2.mean())} + return Sum((l1, mask1), (l2, mask2)), (details | monitoring) + + +class ConfLoss (MultiLoss): + """ Weighted regression by learned confidence. + Assuming the input pixel_loss is a pixel-level regression loss. + + Principle: + high-confidence means high conf = 0.1 ==> conf_loss = x / 10 + alpha*log(10) + low confidence means low conf = 10 ==> conf_loss = x * 10 - alpha*log(10) + + alpha: hyperparameter + """ + + def __init__(self, pixel_loss, alpha=1): + super().__init__() + assert alpha > 0 + self.alpha = alpha + self.pixel_loss = pixel_loss.with_reduction('none') + + def get_name(self): + return f'ConfLoss({self.pixel_loss})' + + def get_conf_log(self, x): + return x, torch.log(x) + + def compute_loss(self, gt1, gt2, pred1, pred2, **kw): + # compute per-pixel loss + ((loss1, msk1), (loss2, msk2)), details = self.pixel_loss(gt1, gt2, pred1, pred2, **kw) + if loss1.numel() == 0: + print('NO VALID POINTS in img1', force=True) + if loss2.numel() == 0: + print('NO VALID POINTS in img2', force=True) + + # weight by confidence + conf1, log_conf1 = self.get_conf_log(pred1['conf'][msk1]) + conf2, log_conf2 = self.get_conf_log(pred2['conf'][msk2]) + conf_loss1 = loss1 * conf1 - self.alpha * log_conf1 + conf_loss2 = loss2 * conf2 - self.alpha * log_conf2 + + # average + nan protection (in case of no valid pixels at all) + conf_loss1 = conf_loss1.mean() if conf_loss1.numel() > 0 else 0 + conf_loss2 = conf_loss2.mean() if conf_loss2.numel() > 0 else 0 + + return conf_loss1 + conf_loss2, dict(conf_loss_1=float(conf_loss1), conf_loss2=float(conf_loss2), **details) + + +class Regr3D_ShiftInv (Regr3D): + """ Same than Regr3D but invariant to depth shift. 
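+ The joint median depth of the ground-truth and predicted point clouds is subtracted
+ from their z coordinates before the regression loss is computed, so a constant
+ depth offset does not change the loss.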
+ """ + + def get_all_pts3d(self, gt1, gt2, pred1, pred2): + # compute unnormalized points + gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, monitoring = \ + super().get_all_pts3d(gt1, gt2, pred1, pred2) + + # compute median depth + gt_z1, gt_z2 = gt_pts1[..., 2], gt_pts2[..., 2] + pred_z1, pred_z2 = pred_pts1[..., 2], pred_pts2[..., 2] + gt_shift_z = get_joint_pointcloud_depth(gt_z1, gt_z2, mask1, mask2)[:, None, None] + pred_shift_z = get_joint_pointcloud_depth(pred_z1, pred_z2, mask1, mask2)[:, None, None] + + # subtract the median depth + gt_z1 -= gt_shift_z + gt_z2 -= gt_shift_z + pred_z1 -= pred_shift_z + pred_z2 -= pred_shift_z + + # monitoring = dict(monitoring, gt_shift_z=gt_shift_z.mean().detach(), pred_shift_z=pred_shift_z.mean().detach()) + return gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, monitoring + + +class Regr3D_ScaleInv (Regr3D): + """ Same as Regr3D but invariant to depth scale. + if gt_scale == True: enforce the prediction to take the same scale as GT + """ + + def get_all_pts3d(self, gt1, gt2, pred1, pred2): + # compute depth-normalized points + gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, monitoring = super().get_all_pts3d(gt1, gt2, pred1, pred2) + + # measure scene scale + _, gt_scale = get_joint_pointcloud_center_scale(gt_pts1, gt_pts2, mask1, mask2) + _, pred_scale = get_joint_pointcloud_center_scale(pred_pts1, pred_pts2, mask1, mask2) + + # prevent predictions from being in a ridiculous range + pred_scale = pred_scale.clip(min=1e-3, max=1e3) + + # normalize the scales + if self.gt_scale: + pred_pts1 *= gt_scale / pred_scale + pred_pts2 *= gt_scale / pred_scale + # monitoring = dict(monitoring, pred_scale=(pred_scale/gt_scale).mean()) + else: + gt_pts1 /= gt_scale + gt_pts2 /= gt_scale + pred_pts1 /= pred_scale + pred_pts2 /= pred_scale + # monitoring = dict(monitoring, gt_scale=gt_scale.mean(), pred_scale=pred_scale.mean().detach()) + + return gt_pts1, gt_pts2, pred_pts1, pred_pts2, mask1, mask2, monitoring + + +class Regr3D_ScaleShiftInv (Regr3D_ScaleInv, Regr3D_ShiftInv): + # calls Regr3D_ShiftInv first, then Regr3D_ScaleInv + pass diff --git a/dust3r/dust3r/model.py b/dust3r/dust3r/model.py new file mode 100644 index 0000000000000000000000000000000000000000..5d2a901652ee6972744d4f07a013e6c7d45872e2 --- /dev/null +++ b/dust3r/dust3r/model.py @@ -0,0 +1,211 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# DUSt3R model class +# -------------------------------------------------------- +from copy import deepcopy +import torch +import os +from packaging import version +import huggingface_hub + +from .utils.misc import fill_default_args, freeze_all_params, is_symmetrized, interleave, transpose_to_landscape +from .heads import head_factory +from dust3r.patch_embed import get_patch_embed + +import dust3r.utils.path_to_croco # noqa: F401 +from croco.models.croco import CroCoNet # noqa + +inf = float('inf') + +hf_version_number = huggingface_hub.__version__ +assert version.parse(hf_version_number) >= version.parse("0.22.0"), ("Outdated huggingface_hub version, " + "please reinstall requirements.txt") + + +def load_model(model_path, device, verbose=True): + if verbose: + print('...
loading model from', model_path) + ckpt = torch.load(model_path, map_location='cpu') + args = ckpt['args'].model.replace("ManyAR_PatchEmbed", "PatchEmbedDust3R") + if 'landscape_only' not in args: + args = args[:-1] + ', landscape_only=False)' + else: + args = args.replace(" ", "").replace('landscape_only=True', 'landscape_only=False') + assert "landscape_only=False" in args + if verbose: + print(f"instantiating : {args}") + net = eval(args) + s = net.load_state_dict(ckpt['model'], strict=False) + if verbose: + print(s) + return net.to(device) + + +class AsymmetricCroCo3DStereo ( + CroCoNet, + huggingface_hub.PyTorchModelHubMixin, + library_name="dust3r", + repo_url="https://github.com/naver/dust3r", + tags=["image-to-3d"], +): + """ Two siamese encoders, followed by two decoders. + The goal is to output 3d points directly, both images in view1's frame + (hence the asymmetry). + """ + + def __init__(self, + output_mode='pts3d', + head_type='linear', + depth_mode=('exp', -inf, inf), + conf_mode=('exp', 1, inf), + freeze='none', + landscape_only=True, + patch_embed_cls='PatchEmbedDust3R', # PatchEmbedDust3R or ManyAR_PatchEmbed + **croco_kwargs): + self.patch_embed_cls = patch_embed_cls + self.croco_args = fill_default_args(croco_kwargs, super().__init__) + super().__init__(**croco_kwargs) + + # dust3r specific initialization + self.dec_blocks2 = deepcopy(self.dec_blocks) + self.set_downstream_head(output_mode, head_type, landscape_only, depth_mode, conf_mode, **croco_kwargs) + self.set_freeze(freeze) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kw): + if os.path.isfile(pretrained_model_name_or_path): + return load_model(pretrained_model_name_or_path, device='cpu') + else: + try: + model = super(AsymmetricCroCo3DStereo, cls).from_pretrained(pretrained_model_name_or_path, **kw) + except TypeError as e: + raise Exception(f'tried to load {pretrained_model_name_or_path} from huggingface, but failed') + return model + + def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768): + self.patch_size = patch_size + self.patch_embed = get_patch_embed(self.patch_embed_cls, img_size, patch_size, enc_embed_dim) + + def load_state_dict(self, ckpt, **kw): + # duplicate all weights for the second decoder if not present + new_ckpt = dict(ckpt) + if not any(k.startswith('dec_blocks2') for k in ckpt): + for key, value in ckpt.items(): + if key.startswith('dec_blocks'): + new_ckpt[key.replace('dec_blocks', 'dec_blocks2')] = value + return super().load_state_dict(new_ckpt, **kw) + + def set_freeze(self, freeze): # this is for use by downstream models + self.freeze = freeze + to_be_frozen = { + 'none': [], + 'mask': [self.mask_token], + 'encoder': [self.mask_token, self.patch_embed, self.enc_blocks], + } + freeze_all_params(to_be_frozen[freeze]) + + def _set_prediction_head(self, *args, **kwargs): + """ No prediction head """ + return + + def set_downstream_head(self, output_mode, head_type, landscape_only, depth_mode, conf_mode, patch_size, img_size, + **kw): + assert img_size[0] % patch_size == 0 and img_size[1] % patch_size == 0, \ + f'{img_size=} must be multiple of {patch_size=}' + self.output_mode = output_mode + self.head_type = head_type + self.depth_mode = depth_mode + self.conf_mode = conf_mode + # allocate heads + self.downstream_head1 = head_factory(head_type, output_mode, self, has_conf=bool(conf_mode)) + self.downstream_head2 = head_factory(head_type, output_mode, self, has_conf=bool(conf_mode)) + # magic wrapper + self.head1 = 
transpose_to_landscape(self.downstream_head1, activate=landscape_only) + self.head2 = transpose_to_landscape(self.downstream_head2, activate=landscape_only) + + def _encode_image(self, image, true_shape): + # embed the image into patches (x has size B x Npatches x C) + x, pos = self.patch_embed(image, true_shape=true_shape) + + # add positional embedding without cls token + assert self.enc_pos_embed is None + + # now apply the transformer encoder and normalization + for blk in self.enc_blocks: + x = blk(x, pos) + + x = self.enc_norm(x) + return x, pos, None + + def _encode_image_pairs(self, img1, img2, true_shape1, true_shape2): + if img1.shape[-2:] == img2.shape[-2:]: + out, pos, _ = self._encode_image(torch.cat((img1, img2), dim=0), + torch.cat((true_shape1, true_shape2), dim=0)) + out, out2 = out.chunk(2, dim=0) + pos, pos2 = pos.chunk(2, dim=0) + else: + out, pos, _ = self._encode_image(img1, true_shape1) + out2, pos2, _ = self._encode_image(img2, true_shape2) + return out, out2, pos, pos2 + + def _encode_symmetrized(self, view1, view2): + img1 = view1['img'] + img2 = view2['img'] + B = img1.shape[0] + # Recover true_shape when available, otherwise assume that the img shape is the true one + shape1 = view1.get('true_shape', torch.tensor(img1.shape[-2:])[None].repeat(B, 1)) + shape2 = view2.get('true_shape', torch.tensor(img2.shape[-2:])[None].repeat(B, 1)) + # warning! maybe the images have different portrait/landscape orientations + + if is_symmetrized(view1, view2): + # computing half of forward pass!' + feat1, feat2, pos1, pos2 = self._encode_image_pairs(img1[::2], img2[::2], shape1[::2], shape2[::2]) + feat1, feat2 = interleave(feat1, feat2) + pos1, pos2 = interleave(pos1, pos2) + else: + feat1, feat2, pos1, pos2 = self._encode_image_pairs(img1, img2, shape1, shape2) + + return (shape1, shape2), (feat1, feat2), (pos1, pos2) + + def _decoder(self, f1, pos1, f2, pos2): + final_output = [(f1, f2)] # before projection + + # project to decoder dim + f1 = self.decoder_embed(f1) + f2 = self.decoder_embed(f2) + + final_output.append((f1, f2)) + for blk1, blk2 in zip(self.dec_blocks, self.dec_blocks2): + # img1 side + f1, _ = blk1(*final_output[-1][::+1], pos1, pos2) + # img2 side + f2, _ = blk2(*final_output[-1][::-1], pos2, pos1) + # store the result + final_output.append((f1, f2)) + + # normalize last output + del final_output[1] # duplicate with final_output[0] + final_output[-1] = tuple(map(self.dec_norm, final_output[-1])) + return zip(*final_output) + + def _downstream_head(self, head_num, decout, img_shape): + B, S, D = decout[-1].shape + # img_shape = tuple(map(int, img_shape)) + head = getattr(self, f'head{head_num}') + return head(decout, img_shape) + + def forward(self, view1, view2): + # encode the two images --> B,S,D + (shape1, shape2), (feat1, feat2), (pos1, pos2) = self._encode_symmetrized(view1, view2) + + # combine all ref images into object-centric representation + dec1, dec2 = self._decoder(feat1, pos1, feat2, pos2) + + with torch.cuda.amp.autocast(enabled=False): + res1 = self._downstream_head(1, [tok.float() for tok in dec1], shape1) + res2 = self._downstream_head(2, [tok.float() for tok in dec2], shape2) + + res2['pts3d_in_other_view'] = res2.pop('pts3d') # predict view2's pts3d in view1's frame + return res1, res2 diff --git a/dust3r/dust3r/optim_factory.py b/dust3r/dust3r/optim_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..9b9c16e0e0fda3fd03c3def61abc1f354f75c584 --- /dev/null +++ b/dust3r/dust3r/optim_factory.py @@ -0,0 +1,14 @@ 
+# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# optimization functions +# -------------------------------------------------------- + + +def adjust_learning_rate_by_lr(optimizer, lr): + for param_group in optimizer.param_groups: + if "lr_scale" in param_group: + param_group["lr"] = lr * param_group["lr_scale"] + else: + param_group["lr"] = lr diff --git a/dust3r/dust3r/patch_embed.py b/dust3r/dust3r/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..f5a101517559deef574135f6a88d6fec1ce3e054 --- /dev/null +++ b/dust3r/dust3r/patch_embed.py @@ -0,0 +1,70 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# PatchEmbed implementation for DUST3R, +# in particular ManyAR_PatchEmbed that Handle images with non-square aspect ratio +# -------------------------------------------------------- +import torch +import dust3r.utils.path_to_croco # noqa: F401 +from croco.models.blocks import PatchEmbed # noqa + + +def get_patch_embed(patch_embed_cls, img_size, patch_size, enc_embed_dim): + assert patch_embed_cls in ['PatchEmbedDust3R', 'ManyAR_PatchEmbed'] + patch_embed = eval(patch_embed_cls)(img_size, patch_size, 3, enc_embed_dim) + return patch_embed + + +class PatchEmbedDust3R(PatchEmbed): + def forward(self, x, **kw): + B, C, H, W = x.shape + assert H % self.patch_size[0] == 0, f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})." + assert W % self.patch_size[1] == 0, f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})." + x = self.proj(x) + pos = self.position_getter(B, x.size(2), x.size(3), x.device) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x, pos + + +class ManyAR_PatchEmbed (PatchEmbed): + """ Handle images with non-square aspect ratio. + All images in the same batch have the same aspect ratio. + true_shape = [(height, width) ...] indicates the actual shape of each image. + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None, flatten=True): + self.embed_dim = embed_dim + super().__init__(img_size, patch_size, in_chans, embed_dim, norm_layer, flatten) + + def forward(self, img, true_shape): + B, C, H, W = img.shape + assert W >= H, f'img should be in landscape mode, but got {W=} {H=}' + assert H % self.patch_size[0] == 0, f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})." + assert W % self.patch_size[1] == 0, f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})." 
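+ # images arrive in landscape orientation (W >= H, asserted above); true_shape records
+ # each image's real (height, width) so that portrait images are projected transposed
+ # and receive position ids in the matching orientation below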
+ assert true_shape.shape == (B, 2), f"true_shape has the wrong shape={true_shape.shape}" + + # size expressed in tokens + W //= self.patch_size[0] + H //= self.patch_size[1] + n_tokens = H * W + + height, width = true_shape.T + is_landscape = (width >= height) + is_portrait = ~is_landscape + + # allocate result + x = img.new_zeros((B, n_tokens, self.embed_dim)) + pos = img.new_zeros((B, n_tokens, 2), dtype=torch.int64) + + # linear projection, transposed if necessary + x[is_landscape] = self.proj(img[is_landscape]).permute(0, 2, 3, 1).flatten(1, 2).float() + x[is_portrait] = self.proj(img[is_portrait].swapaxes(-1, -2)).permute(0, 2, 3, 1).flatten(1, 2).float() + + pos[is_landscape] = self.position_getter(1, H, W, pos.device) + pos[is_portrait] = self.position_getter(1, W, H, pos.device) + + x = self.norm(x) + return x, pos diff --git a/dust3r/dust3r/post_process.py b/dust3r/dust3r/post_process.py new file mode 100644 index 0000000000000000000000000000000000000000..550a9b41025ad003228ef16f97d045fc238746e4 --- /dev/null +++ b/dust3r/dust3r/post_process.py @@ -0,0 +1,60 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilities for interpreting the DUST3R output +# -------------------------------------------------------- +import numpy as np +import torch +from dust3r.utils.geometry import xy_grid + + +def estimate_focal_knowing_depth(pts3d, pp, focal_mode='median', min_focal=0., max_focal=np.inf): + """ Reprojection method, for when the absolute depth is known: + 1) estimate the camera focal using a robust estimator + 2) reproject points onto true rays, minimizing a certain error + """ + B, H, W, THREE = pts3d.shape + assert THREE == 3 + + # centered pixel grid + pixels = xy_grid(W, H, device=pts3d.device).view(1, -1, 2) - pp.view(-1, 1, 2) # B,HW,2 + pts3d = pts3d.flatten(1, 2) # (B, HW, 3) + + if focal_mode == 'median': + with torch.no_grad(): + # direct estimation of focal + u, v = pixels.unbind(dim=-1) + x, y, z = pts3d.unbind(dim=-1) + fx_votes = (u * z) / x + fy_votes = (v * z) / y + + # assume square pixels, hence same focal for X and Y + f_votes = torch.cat((fx_votes.view(B, -1), fy_votes.view(B, -1)), dim=-1) + focal = torch.nanmedian(f_votes, dim=-1).values + + elif focal_mode == 'weiszfeld': + # init focal with l2 closed form + # we try to find focal = argmin Sum | pixel - focal * (x,y)/z| + xy_over_z = (pts3d[..., :2] / pts3d[..., 2:3]).nan_to_num(posinf=0, neginf=0) # homogeneous (x,y,1) + + dot_xy_px = (xy_over_z * pixels).sum(dim=-1) + dot_xy_xy = xy_over_z.square().sum(dim=-1) + + focal = dot_xy_px.mean(dim=1) / dot_xy_xy.mean(dim=1) + + # iterative re-weighted least-squares + for iter in range(10): + # re-weighting by inverse of distance + dis = (pixels - focal.view(-1, 1, 1) * xy_over_z).norm(dim=-1) + # print(dis.nanmean(-1)) + w = dis.clip(min=1e-8).reciprocal() + # update the scaling with the new weights + focal = (w * dot_xy_px).mean(dim=1) / (w * dot_xy_xy).mean(dim=1) + else: + raise ValueError(f'bad {focal_mode=}') + + focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2)) # size / 1.1547005383792515 + focal = focal.clip(min=min_focal*focal_base, max=max_focal*focal_base) + # print(focal) + return focal diff --git a/dust3r/dust3r/training.py b/dust3r/dust3r/training.py new file mode 100644 index 0000000000000000000000000000000000000000..53af9764ebb03a0083c22294298ed674e9164edc --- /dev/null +++ 
b/dust3r/dust3r/training.py @@ -0,0 +1,377 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# training code for DUSt3R +# -------------------------------------------------------- +# References: +# MAE: https://github.com/facebookresearch/mae +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import argparse +import datetime +import json +import numpy as np +import os +import sys +import time +import math +from collections import defaultdict +from pathlib import Path +from typing import Sized + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12 + +from dust3r.model import AsymmetricCroCo3DStereo, inf # noqa: F401, needed when loading the model +from dust3r.datasets import get_data_loader # noqa +from dust3r.losses import * # noqa: F401, needed when loading the model +from dust3r.inference import loss_of_one_batch # noqa + +import dust3r.utils.path_to_croco # noqa: F401 +import croco.utils.misc as misc # noqa +from croco.utils.misc import NativeScalerWithGradNormCount as NativeScaler # noqa + + +def get_args_parser(): + parser = argparse.ArgumentParser('DUST3R training', add_help=False) + # model and criterion + parser.add_argument('--model', default="AsymmetricCroCo3DStereo(patch_embed_cls='ManyAR_PatchEmbed')", + type=str, help="string containing the model to build") + parser.add_argument('--pretrained', default=None, help='path of a starting checkpoint') + parser.add_argument('--train_criterion', default="ConfLoss(Regr3D(L21, norm_mode='avg_dis'), alpha=0.2)", + type=str, help="train criterion") + parser.add_argument('--test_criterion', default=None, type=str, help="test criterion") + + # dataset + parser.add_argument('--train_dataset', required=True, type=str, help="training set") + parser.add_argument('--test_dataset', default='[None]', type=str, help="testing set") + + # training + parser.add_argument('--seed', default=0, type=int, help="Random seed") + parser.add_argument('--batch_size', default=64, type=int, + help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus") + parser.add_argument('--accum_iter', default=1, type=int, + help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)") + parser.add_argument('--epochs', default=800, type=int, help="Maximum number of epochs for the scheduler") + + parser.add_argument('--weight_decay', type=float, default=0.05, help="weight decay (default: 0.05)") + parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR', + help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', help='epochs to warmup LR') + + parser.add_argument('--amp', type=int, default=0, + choices=[0, 1], help="Use Automatic Mixed Precision for pretraining") + parser.add_argument("--disable_cudnn_benchmark", action='store_true', default=False, + help="set cudnn.benchmark 
= False") + # others + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') + + parser.add_argument('--eval_freq', type=int, default=1, help='Test loss evaluation frequency') + parser.add_argument('--save_freq', default=1, type=int, + help='frequence (number of epochs) to save checkpoint in checkpoint-last.pth') + parser.add_argument('--keep_freq', default=20, type=int, + help='frequence (number of epochs) to save checkpoint in checkpoint-%d.pth') + parser.add_argument('--print_freq', default=20, type=int, + help='frequence (number of iterations) to print infos while training') + + # output dir + parser.add_argument('--output_dir', default='./output/', type=str, help="path where to save the output") + return parser + + +def train(args): + misc.init_distributed_mode(args) + global_rank = misc.get_rank() + world_size = misc.get_world_size() + + print("output_dir: " + args.output_dir) + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + + # auto resume + last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth') + args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device(device) + + # fix the seed + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = not args.disable_cudnn_benchmark + + # training dataset and loader + print('Building train dataset {:s}'.format(args.train_dataset)) + # dataset and loader + data_loader_train = build_dataset(args.train_dataset, args.batch_size, args.num_workers, test=False) + print('Building test dataset {:s}'.format(args.train_dataset)) + data_loader_test = {dataset.split('(')[0]: build_dataset(dataset, args.batch_size, args.num_workers, test=True) + for dataset in args.test_dataset.split('+')} + + # model + print('Loading model: {:s}'.format(args.model)) + model = eval(args.model) + print(f'>> Creating train criterion = {args.train_criterion}') + train_criterion = eval(args.train_criterion).to(device) + print(f'>> Creating test criterion = {args.test_criterion or args.train_criterion}') + test_criterion = eval(args.test_criterion or args.criterion).to(device) + + model.to(device) + model_without_ddp = model + print("Model = %s" % str(model_without_ddp)) + + if args.pretrained and not args.resume: + print('Loading pretrained: ', args.pretrained) + ckpt = torch.load(args.pretrained, map_location=device) + print(model.load_state_dict(ckpt['model'], strict=False)) + del ckpt # in case it occupies memory + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.gpu], find_unused_parameters=True, static_graph=True) + model_without_ddp = model.module + + # following 
timm: set wd as 0 for bias and norm layers + param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) + optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) + print(optimizer) + loss_scaler = NativeScaler() + + def write_log_stats(epoch, train_stats, test_stats): + if misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + + log_stats = dict(epoch=epoch, **{f'train_{k}': v for k, v in train_stats.items()}) + for test_name in data_loader_test: + if test_name not in test_stats: + continue + log_stats.update({test_name + '_' + k: v for k, v in test_stats[test_name].items()}) + + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + def save_model(epoch, fname, best_so_far): + misc.save_model(args=args, model_without_ddp=model_without_ddp, optimizer=optimizer, + loss_scaler=loss_scaler, epoch=epoch, fname=fname, best_so_far=best_so_far) + + best_so_far = misc.load_model(args=args, model_without_ddp=model_without_ddp, + optimizer=optimizer, loss_scaler=loss_scaler) + if best_so_far is None: + best_so_far = float('inf') + if global_rank == 0 and args.output_dir is not None: + log_writer = SummaryWriter(log_dir=args.output_dir) + else: + log_writer = None + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + train_stats = test_stats = {} + for epoch in range(args.start_epoch, args.epochs + 1): + + # Save immediately the last checkpoint + if epoch > args.start_epoch: + if args.save_freq and epoch % args.save_freq == 0 or epoch == args.epochs: + save_model(epoch - 1, 'last', best_so_far) + + # Test on multiple datasets + new_best = False + if (epoch > 0 and args.eval_freq > 0 and epoch % args.eval_freq == 0): + test_stats = {} + for test_name, testset in data_loader_test.items(): + stats = test_one_epoch(model, test_criterion, testset, + device, epoch, log_writer=log_writer, args=args, prefix=test_name) + test_stats[test_name] = stats + + # Save best of all + if stats['loss_med'] < best_so_far: + best_so_far = stats['loss_med'] + new_best = True + + # Save more stuff + write_log_stats(epoch, train_stats, test_stats) + + if epoch > args.start_epoch: + if args.keep_freq and epoch % args.keep_freq == 0: + save_model(epoch - 1, str(epoch), best_so_far) + if new_best: + save_model(epoch - 1, 'best', best_so_far) + if epoch >= args.epochs: + break # exit after writing last test to disk + + # Train + train_stats = train_one_epoch( + model, train_criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + log_writer=log_writer, + args=args) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + save_final_model(args, args.epochs, model_without_ddp, best_so_far=best_so_far) + + +def save_final_model(args, epoch, model_without_ddp, best_so_far=None): + output_dir = Path(args.output_dir) + checkpoint_path = output_dir / 'checkpoint-final.pth' + to_save = { + 'args': args, + 'model': model_without_ddp if isinstance(model_without_ddp, dict) else model_without_ddp.cpu().state_dict(), + 'epoch': epoch + } + if best_so_far is not None: + to_save['best_so_far'] = best_so_far + print(f'>> Saving model to {checkpoint_path} ...') + misc.save_on_master(to_save, checkpoint_path) + + +def build_dataset(dataset, batch_size, num_workers, test=False): + split = ['Train', 'Test'][test] + print(f'Building {split} Data loader for dataset: 
', dataset) + loader = get_data_loader(dataset, + batch_size=batch_size, + num_workers=num_workers, + pin_mem=True, + shuffle=not (test), + drop_last=not (test)) + + print(f"{split} dataset length: ", len(loader)) + return loader + + +def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Sized, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, + args, + log_writer=None): + assert torch.backends.cuda.matmul.allow_tf32 == True + + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + accum_iter = args.accum_iter + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + if hasattr(data_loader, 'dataset') and hasattr(data_loader.dataset, 'set_epoch'): + data_loader.dataset.set_epoch(epoch) + if hasattr(data_loader, 'sampler') and hasattr(data_loader.sampler, 'set_epoch'): + data_loader.sampler.set_epoch(epoch) + + optimizer.zero_grad() + + for data_iter_step, batch in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)): + epoch_f = epoch + data_iter_step / len(data_loader) + + # we use a per iteration (instead of per epoch) lr scheduler + if data_iter_step % accum_iter == 0: + misc.adjust_learning_rate(optimizer, epoch_f, args) + + loss_tuple = loss_of_one_batch(batch, model, criterion, device, + symmetrize_batch=True, + use_amp=bool(args.amp), ret='loss') + loss, loss_details = loss_tuple # criterion returns two values + loss_value = float(loss) + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value), force=True) + sys.exit(1) + + loss /= accum_iter + loss_scaler(loss, optimizer, parameters=model.parameters(), + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + del loss + del batch + + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(epoch=epoch_f) + metric_logger.update(lr=lr) + metric_logger.update(loss=loss_value, **loss_details) + + if (data_iter_step + 1) % accum_iter == 0 and ((data_iter_step + 1) % (accum_iter * args.print_freq)) == 0: + loss_value_reduce = misc.all_reduce_mean(loss_value) # MUST BE EXECUTED BY ALL NODES + if log_writer is None: + continue + """ We use epoch_1000x as the x-axis in tensorboard. + This calibrates different curves when batch size changes. 
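+ For example, one quarter of the way through epoch 3 gives epoch_f = 3.25,
+ which is logged at step int(3.25 * 1000) = 3250, independently of the number
+ of iterations per epoch.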
+ """ + epoch_1000x = int(epoch_f * 1000) + log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('train_lr', lr, epoch_1000x) + log_writer.add_scalar('train_iter', epoch_1000x, epoch_1000x) + for name, val in loss_details.items(): + log_writer.add_scalar('train_' + name, val, epoch_1000x) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def test_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, + data_loader: Sized, device: torch.device, epoch: int, + args, log_writer=None, prefix='test'): + + model.eval() + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.meters = defaultdict(lambda: misc.SmoothedValue(window_size=9**9)) + header = 'Test Epoch: [{}]'.format(epoch) + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + if hasattr(data_loader, 'dataset') and hasattr(data_loader.dataset, 'set_epoch'): + data_loader.dataset.set_epoch(epoch) + if hasattr(data_loader, 'sampler') and hasattr(data_loader.sampler, 'set_epoch'): + data_loader.sampler.set_epoch(epoch) + + for _, batch in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)): + loss_tuple = loss_of_one_batch(batch, model, criterion, device, + symmetrize_batch=True, + use_amp=bool(args.amp), ret='loss') + loss_value, loss_details = loss_tuple # criterion returns two values + metric_logger.update(loss=float(loss_value), **loss_details) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + + aggs = [('avg', 'global_avg'), ('med', 'median')] + results = {f'{k}_{tag}': getattr(meter, attr) for k, meter in metric_logger.meters.items() for tag, attr in aggs} + + if log_writer is not None: + for name, val in results.items(): + log_writer.add_scalar(prefix + '_' + name, val, 1000 * epoch) + + return results diff --git a/dust3r/dust3r/utils/__init__.py b/dust3r/dust3r/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e --- /dev/null +++ b/dust3r/dust3r/utils/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). diff --git a/dust3r/dust3r/utils/device.py b/dust3r/dust3r/utils/device.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b6a74dac05a2e1ba3a2b2f0faa8cea08ece745 --- /dev/null +++ b/dust3r/dust3r/utils/device.py @@ -0,0 +1,76 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilitary functions for DUSt3R +# -------------------------------------------------------- +import numpy as np +import torch + + +def todevice(batch, device, callback=None, non_blocking=False): + ''' Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy). + + batch: list, tuple, dict of tensors or other things + device: pytorch device or 'numpy' + callback: function that would be called on every sub-elements. 
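+ e.g. todevice(batch, 'numpy') detaches tensors and converts them to numpy arrays,
+ while todevice(batch, 'cuda') moves numpy arrays and tensors onto the GPU.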
+ ''' + if callback: + batch = callback(batch) + + if isinstance(batch, dict): + return {k: todevice(v, device) for k, v in batch.items()} + + if isinstance(batch, (tuple, list)): + return type(batch)(todevice(x, device) for x in batch) + + x = batch + if device == 'numpy': + if isinstance(x, torch.Tensor): + x = x.detach().cpu().numpy() + elif x is not None: + if isinstance(x, np.ndarray): + x = torch.from_numpy(x) + if torch.is_tensor(x): + x = x.to(device, non_blocking=non_blocking) + return x + + +to_device = todevice # alias + + +def to_numpy(x): return todevice(x, 'numpy') +def to_cpu(x): return todevice(x, 'cpu') +def to_cuda(x): return todevice(x, 'cuda') + + +def collate_with_cat(whatever, lists=False): + if isinstance(whatever, dict): + return {k: collate_with_cat(vals, lists=lists) for k, vals in whatever.items()} + + elif isinstance(whatever, (tuple, list)): + if len(whatever) == 0: + return whatever + elem = whatever[0] + T = type(whatever) + + if elem is None: + return None + if isinstance(elem, (bool, float, int, str)): + return whatever + if isinstance(elem, tuple): + return T(collate_with_cat(x, lists=lists) for x in zip(*whatever)) + if isinstance(elem, dict): + return {k: collate_with_cat([e[k] for e in whatever], lists=lists) for k in elem} + + if isinstance(elem, torch.Tensor): + return listify(whatever) if lists else torch.cat(whatever) + if isinstance(elem, np.ndarray): + return listify(whatever) if lists else torch.cat([torch.from_numpy(x) for x in whatever]) + + # otherwise, we just chain lists + return sum(whatever, T()) + + +def listify(elems): + return [x for e in elems for x in e] diff --git a/dust3r/dust3r/utils/geometry.py b/dust3r/dust3r/utils/geometry.py new file mode 100644 index 0000000000000000000000000000000000000000..ce365faf2acb97ffaafa1b80cb8ee0c28de0b6d6 --- /dev/null +++ b/dust3r/dust3r/utils/geometry.py @@ -0,0 +1,366 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# geometry utilitary functions +# -------------------------------------------------------- +import torch +import numpy as np +from scipy.spatial import cKDTree as KDTree + +from dust3r.utils.misc import invalid_to_zeros, invalid_to_nans +from dust3r.utils.device import to_numpy + + +def xy_grid(W, H, device=None, origin=(0, 0), unsqueeze=None, cat_dim=-1, homogeneous=False, **arange_kw): + """ Output a (H,W,2) array of int32 + with output[j,i,0] = i + origin[0] + output[j,i,1] = j + origin[1] + """ + if device is None: + # numpy + arange, meshgrid, stack, ones = np.arange, np.meshgrid, np.stack, np.ones + else: + # torch + arange = lambda *a, **kw: torch.arange(*a, device=device, **kw) + meshgrid, stack = torch.meshgrid, torch.stack + ones = lambda *a: torch.ones(*a, device=device) + + tw, th = [arange(o, o + s, **arange_kw) for s, o in zip((W, H), origin)] + grid = meshgrid(tw, th, indexing='xy') + if homogeneous: + grid = grid + (ones((H, W)),) + if unsqueeze is not None: + grid = (grid[0].unsqueeze(unsqueeze), grid[1].unsqueeze(unsqueeze)) + if cat_dim is not None: + grid = stack(grid, cat_dim) + return grid + + +def geotrf(Trf, pts, ncol=None, norm=False): + """ Apply a geometric transformation to a list of 3-D points. + + H: 3x3 or 4x4 projection matrix (typically a Homography) + p: numpy/torch/tuple of coordinates. Shape must be (...,2) or (...,3) + + ncol: int. number of columns of the result (2 or 3) + norm: float. 
if != 0, the resut is projected on the z=norm plane. + + Returns an array of projected 2d points. + """ + assert Trf.ndim >= 2 + if isinstance(Trf, np.ndarray): + pts = np.asarray(pts) + elif isinstance(Trf, torch.Tensor): + pts = torch.as_tensor(pts, dtype=Trf.dtype) + + # adapt shape if necessary + output_reshape = pts.shape[:-1] + ncol = ncol or pts.shape[-1] + + # optimized code + if (isinstance(Trf, torch.Tensor) and isinstance(pts, torch.Tensor) and + Trf.ndim == 3 and pts.ndim == 4): + d = pts.shape[3] + if Trf.shape[-1] == d: + pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts) + elif Trf.shape[-1] == d + 1: + pts = torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts) + Trf[:, None, None, :d, d] + else: + raise ValueError(f'bad shape, not ending with 3 or 4, for {pts.shape=}') + else: + if Trf.ndim >= 3: + n = Trf.ndim - 2 + assert Trf.shape[:n] == pts.shape[:n], 'batch size does not match' + Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1]) + + if pts.ndim > Trf.ndim: + # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d) + pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1]) + elif pts.ndim == 2: + # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d) + pts = pts[:, None, :] + + if pts.shape[-1] + 1 == Trf.shape[-1]: + Trf = Trf.swapaxes(-1, -2) # transpose Trf + pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :] + elif pts.shape[-1] == Trf.shape[-1]: + Trf = Trf.swapaxes(-1, -2) # transpose Trf + pts = pts @ Trf + else: + pts = Trf @ pts.T + if pts.ndim >= 2: + pts = pts.swapaxes(-1, -2) + + if norm: + pts = pts / pts[..., -1:] # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG + if norm != 1: + pts *= norm + + res = pts[..., :ncol].reshape(*output_reshape, ncol) + return res + + +def inv(mat): + """ Invert a torch or numpy matrix + """ + if isinstance(mat, torch.Tensor): + return torch.linalg.inv(mat) + if isinstance(mat, np.ndarray): + return np.linalg.inv(mat) + raise ValueError(f'bad matrix type = {type(mat)}') + + +def depthmap_to_pts3d(depth, pseudo_focal, pp=None, **_): + """ + Args: + - depthmap (BxHxW array): + - pseudo_focal: [B,H,W] ; [B,2,H,W] or [B,1,H,W] + Returns: + pointmap of absolute coordinates (BxHxWx3 array) + """ + + if len(depth.shape) == 4: + B, H, W, n = depth.shape + else: + B, H, W = depth.shape + n = None + + if len(pseudo_focal.shape) == 3: # [B,H,W] + pseudo_focalx = pseudo_focaly = pseudo_focal + elif len(pseudo_focal.shape) == 4: # [B,2,H,W] or [B,1,H,W] + pseudo_focalx = pseudo_focal[:, 0] + if pseudo_focal.shape[1] == 2: + pseudo_focaly = pseudo_focal[:, 1] + else: + pseudo_focaly = pseudo_focalx + else: + raise NotImplementedError("Error, unknown input focal shape format.") + + assert pseudo_focalx.shape == depth.shape[:3] + assert pseudo_focaly.shape == depth.shape[:3] + grid_x, grid_y = xy_grid(W, H, cat_dim=0, device=depth.device)[:, None] + + # set principal point + if pp is None: + grid_x = grid_x - (W - 1) / 2 + grid_y = grid_y - (H - 1) / 2 + else: + grid_x = grid_x.expand(B, -1, -1) - pp[:, 0, None, None] + grid_y = grid_y.expand(B, -1, -1) - pp[:, 1, None, None] + + if n is None: + pts3d = torch.empty((B, H, W, 3), device=depth.device) + pts3d[..., 0] = depth * grid_x / pseudo_focalx + pts3d[..., 1] = depth * grid_y / pseudo_focaly + pts3d[..., 2] = depth + else: + pts3d = torch.empty((B, H, W, 3, n), device=depth.device) + pts3d[..., 0, :] = depth * (grid_x / pseudo_focalx)[..., None] + pts3d[..., 1, :] = depth * (grid_y / pseudo_focaly)[..., None] + pts3d[..., 2, :] = depth + return pts3d + + +def depthmap_to_camera_coordinates(depthmap, 
camera_intrinsics, pseudo_focal=None): + """ + Args: + - depthmap (HxW array): + - camera_intrinsics: a 3x3 matrix + Returns: + pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels. + """ + camera_intrinsics = np.float32(camera_intrinsics) + H, W = depthmap.shape + + # Compute 3D ray associated with each pixel + # Strong assumption: there are no skew terms + assert camera_intrinsics[0, 1] == 0.0 + assert camera_intrinsics[1, 0] == 0.0 + if pseudo_focal is None: + fu = camera_intrinsics[0, 0] + fv = camera_intrinsics[1, 1] + else: + assert pseudo_focal.shape == (H, W) + fu = fv = pseudo_focal + cu = camera_intrinsics[0, 2] + cv = camera_intrinsics[1, 2] + + u, v = np.meshgrid(np.arange(W), np.arange(H)) + z_cam = depthmap + x_cam = (u - cu) * z_cam / fu + y_cam = (v - cv) * z_cam / fv + X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32) + + # Mask for valid coordinates + valid_mask = (depthmap > 0.0) + return X_cam, valid_mask + + +def depthmap_to_absolute_camera_coordinates(depthmap, camera_intrinsics, camera_pose, **kw): + """ + Args: + - depthmap (HxW array): + - camera_intrinsics: a 3x3 matrix + - camera_pose: a 4x3 or 4x4 cam2world matrix + Returns: + pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels.""" + X_cam, valid_mask = depthmap_to_camera_coordinates(depthmap, camera_intrinsics) + + X_world = X_cam # default + if camera_pose is not None: + # R_cam2world = np.float32(camera_params["R_cam2world"]) + # t_cam2world = np.float32(camera_params["t_cam2world"]).squeeze() + R_cam2world = camera_pose[:3, :3] + t_cam2world = camera_pose[:3, 3] + + # Express in absolute coordinates (invalid depth values) + X_world = np.einsum("ik, vuk -> vui", R_cam2world, X_cam) + t_cam2world[None, None, :] + + return X_world, valid_mask + + +def colmap_to_opencv_intrinsics(K): + """ + Modify camera intrinsics to follow a different convention. + Coordinates of the center of the top-left pixels are by default: + - (0.5, 0.5) in Colmap + - (0,0) in OpenCV + """ + K = K.copy() + K[0, 2] -= 0.5 + K[1, 2] -= 0.5 + return K + + +def opencv_to_colmap_intrinsics(K): + """ + Modify camera intrinsics to follow a different convention. 
+ Coordinates of the center of the top-left pixels are by default: + - (0.5, 0.5) in Colmap + - (0,0) in OpenCV + """ + K = K.copy() + K[0, 2] += 0.5 + K[1, 2] += 0.5 + return K + + +def normalize_pointcloud(pts1, pts2, norm_mode='avg_dis', valid1=None, valid2=None, ret_factor=False): + """ renorm pointmaps pts1, pts2 with norm_mode + """ + assert pts1.ndim >= 3 and pts1.shape[-1] == 3 + assert pts2 is None or (pts2.ndim >= 3 and pts2.shape[-1] == 3) + norm_mode, dis_mode = norm_mode.split('_') + + if norm_mode == 'avg': + # gather all points together (joint normalization) + nan_pts1, nnz1 = invalid_to_zeros(pts1, valid1, ndim=3) + nan_pts2, nnz2 = invalid_to_zeros(pts2, valid2, ndim=3) if pts2 is not None else (None, 0) + all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1 + + # compute distance to origin + all_dis = all_pts.norm(dim=-1) + if dis_mode == 'dis': + pass # do nothing + elif dis_mode == 'log1p': + all_dis = torch.log1p(all_dis) + elif dis_mode == 'warp-log1p': + # actually warp input points before normalizing them + log_dis = torch.log1p(all_dis) + warp_factor = log_dis / all_dis.clip(min=1e-8) + H1, W1 = pts1.shape[1:-1] + pts1 = pts1 * warp_factor[:, :W1 * H1].view(-1, H1, W1, 1) + if pts2 is not None: + H2, W2 = pts2.shape[1:-1] + pts2 = pts2 * warp_factor[:, W1 * H1:].view(-1, H2, W2, 1) + all_dis = log_dis # this is their true distance afterwards + else: + raise ValueError(f'bad {dis_mode=}') + + norm_factor = all_dis.sum(dim=1) / (nnz1 + nnz2 + 1e-8) + else: + # gather all points together (joint normalization) + nan_pts1 = invalid_to_nans(pts1, valid1, ndim=3) + nan_pts2 = invalid_to_nans(pts2, valid2, ndim=3) if pts2 is not None else None + all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1 + + # compute distance to origin + all_dis = all_pts.norm(dim=-1) + + if norm_mode == 'avg': + norm_factor = all_dis.nanmean(dim=1) + elif norm_mode == 'median': + norm_factor = all_dis.nanmedian(dim=1).values.detach() + elif norm_mode == 'sqrt': + norm_factor = all_dis.sqrt().nanmean(dim=1)**2 + else: + raise ValueError(f'bad {norm_mode=}') + + norm_factor = norm_factor.clip(min=1e-8) + while norm_factor.ndim < pts1.ndim: + norm_factor.unsqueeze_(-1) + + res = pts1 / norm_factor + if pts2 is not None: + res = (res, pts2 / norm_factor) + if ret_factor: + res = res + (norm_factor,) + return res + + +@torch.no_grad() +def get_joint_pointcloud_depth(z1, z2, valid_mask1, valid_mask2=None, quantile=0.5): + # set invalid points to NaN + _z1 = invalid_to_nans(z1, valid_mask1).reshape(len(z1), -1) + _z2 = invalid_to_nans(z2, valid_mask2).reshape(len(z2), -1) if z2 is not None else None + _z = torch.cat((_z1, _z2), dim=-1) if z2 is not None else _z1 + + # compute median depth overall (ignoring nans) + if quantile == 0.5: + shift_z = torch.nanmedian(_z, dim=-1).values + else: + shift_z = torch.nanquantile(_z, quantile, dim=-1) + return shift_z # (B,) + + +@torch.no_grad() +def get_joint_pointcloud_center_scale(pts1, pts2, valid_mask1=None, valid_mask2=None, z_only=False, center=True): + # set invalid points to NaN + _pts1 = invalid_to_nans(pts1, valid_mask1).reshape(len(pts1), -1, 3) + _pts2 = invalid_to_nans(pts2, valid_mask2).reshape(len(pts2), -1, 3) if pts2 is not None else None + _pts = torch.cat((_pts1, _pts2), dim=1) if pts2 is not None else _pts1 + + # compute median center + _center = torch.nanmedian(_pts, dim=1, keepdim=True).values # (B,1,3) + if z_only: + _center[..., :2] = 0 # do not center X and Y + + # compute 
median norm + _norm = ((_pts - _center) if center else _pts).norm(dim=-1) + scale = torch.nanmedian(_norm, dim=1).values + return _center[:, None, :, :], scale[:, None, None, None] + + +def find_reciprocal_matches(P1, P2): + """ + returns 3 values: + 1 - reciprocal_in_P2: a boolean array of size P2.shape[0], a "True" value indicates a match + 2 - nn2_in_P1: a int array of size P2.shape[0], it contains the indexes of the closest points in P1 + 3 - reciprocal_in_P2.sum(): the number of matches + """ + tree1 = KDTree(P1) + tree2 = KDTree(P2) + + _, nn1_in_P2 = tree2.query(P1, workers=8) + _, nn2_in_P1 = tree1.query(P2, workers=8) + + reciprocal_in_P1 = (nn2_in_P1[nn1_in_P2] == np.arange(len(nn1_in_P2))) + reciprocal_in_P2 = (nn1_in_P2[nn2_in_P1] == np.arange(len(nn2_in_P1))) + assert reciprocal_in_P1.sum() == reciprocal_in_P2.sum() + return reciprocal_in_P2, nn2_in_P1, reciprocal_in_P2.sum() + + +def get_med_dist_between_poses(poses): + from scipy.spatial.distance import pdist + return np.median(pdist([to_numpy(p[:3, 3]) for p in poses])) diff --git a/dust3r/dust3r/utils/image.py b/dust3r/dust3r/utils/image.py new file mode 100644 index 0000000000000000000000000000000000000000..ca1a5f9c97ccfb013d680735ed834853907906ec --- /dev/null +++ b/dust3r/dust3r/utils/image.py @@ -0,0 +1,128 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilitary functions about images (loading/converting...) +# -------------------------------------------------------- +import os +import torch +import numpy as np +import PIL.Image +from PIL.ImageOps import exif_transpose +import torchvision.transforms as tvf +os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1" +import cv2 # noqa + +try: + from pillow_heif import register_heif_opener # noqa + register_heif_opener() + heif_support_enabled = True +except ImportError: + heif_support_enabled = False + +ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) + + +def img_to_arr(img): + if isinstance(img, str): + img = imread_cv2(img) + return img + + +def imread_cv2(path, options=cv2.IMREAD_COLOR): + """ Open an image or a depthmap with opencv-python. 
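+
+    Example (illustrative sketch; the file paths are hypothetical):
+        img = imread_cv2('photo.jpg')      # HxWx3 uint8, converted from BGR to RGB
+        depth = imread_cv2('depth.exr')    # .exr files are loaded with cv2.IMREAD_ANYDEPTH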
+ """ + if path.endswith(('.exr', 'EXR')): + options = cv2.IMREAD_ANYDEPTH + img = cv2.imread(path, options) + if img is None: + raise IOError(f'Could not load image={path} with {options=}') + if img.ndim == 3: + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + return img + + +def rgb(ftensor, true_shape=None): + if isinstance(ftensor, list): + return [rgb(x, true_shape=true_shape) for x in ftensor] + if isinstance(ftensor, torch.Tensor): + ftensor = ftensor.detach().cpu().numpy() # H,W,3 + if ftensor.ndim == 3 and ftensor.shape[0] == 3: + ftensor = ftensor.transpose(1, 2, 0) + elif ftensor.ndim == 4 and ftensor.shape[1] == 3: + ftensor = ftensor.transpose(0, 2, 3, 1) + if true_shape is not None: + H, W = true_shape + ftensor = ftensor[:H, :W] + if ftensor.dtype == np.uint8: + img = np.float32(ftensor) / 255 + else: + img = (ftensor * 0.5) + 0.5 + return img.clip(min=0, max=1) + + +def _resize_pil_image(img, long_edge_size): + S = max(img.size) + if S > long_edge_size: + interp = PIL.Image.LANCZOS + elif S <= long_edge_size: + interp = PIL.Image.BICUBIC + new_size = tuple(int(round(x*long_edge_size/S)) for x in img.size) + return img.resize(new_size, interp) + + +def load_images(folder_or_list, size, square_ok=False, verbose=True, patch_size=16): + """ open and convert all images in a list or folder to proper input format for DUSt3R + """ + if isinstance(folder_or_list, str): + if verbose: + print(f'>> Loading images from {folder_or_list}') + root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list)) + + elif isinstance(folder_or_list, list): + if verbose: + print(f'>> Loading a list of {len(folder_or_list)} images') + root, folder_content = '', folder_or_list + + else: + raise ValueError(f'bad {folder_or_list=} ({type(folder_or_list)})') + + supported_images_extensions = ['.jpg', '.jpeg', '.png'] + if heif_support_enabled: + supported_images_extensions += ['.heic', '.heif'] + supported_images_extensions = tuple(supported_images_extensions) + + imgs = [] + for path in folder_content: + if not path.lower().endswith(supported_images_extensions): + continue + img = exif_transpose(PIL.Image.open(os.path.join(root, path))).convert('RGB') + W1, H1 = img.size + if size == 224: + # resize short side to 224 (then crop) + img = _resize_pil_image(img, round(size * max(W1/H1, H1/W1))) + else: + # resize long side to 512 + img = _resize_pil_image(img, size) + W, H = img.size + cx, cy = W//2, H//2 + if size == 224: + half = min(cx, cy) + img = img.crop((cx-half, cy-half, cx+half, cy+half)) + else: + halfw = ((2 * cx) // patch_size) * patch_size / 2 + halfh = ((2 * cy) // patch_size) * patch_size / 2 + if not (square_ok) and W == H: + halfh = 3*halfw/4 + img = img.crop((cx-halfw, cy-halfh, cx+halfw, cy+halfh)) + + W2, H2 = img.size + if verbose: + print(f' - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}') + imgs.append(dict(img=ImgNorm(img)[None], true_shape=np.int32( + [img.size[::-1]]), idx=len(imgs), instance=str(len(imgs)))) + + assert imgs, 'no images foud at '+root + if verbose: + print(f' (Found {len(imgs)} images)') + return imgs diff --git a/dust3r/dust3r/utils/misc.py b/dust3r/dust3r/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..88c4d2dab6d5c14021ed9ed6646c3159a3a4637b --- /dev/null +++ b/dust3r/dust3r/utils/misc.py @@ -0,0 +1,121 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# utilitary functions for DUSt3R +# -------------------------------------------------------- +import torch + + +def fill_default_args(kwargs, func): + import inspect # a bit hacky but it works reliably + signature = inspect.signature(func) + + for k, v in signature.parameters.items(): + if v.default is inspect.Parameter.empty: + continue + kwargs.setdefault(k, v.default) + + return kwargs + + +def freeze_all_params(modules): + for module in modules: + try: + for n, param in module.named_parameters(): + param.requires_grad = False + except AttributeError: + # module is directly a parameter + module.requires_grad = False + + +def is_symmetrized(gt1, gt2): + x = gt1['instance'] + y = gt2['instance'] + if len(x) == len(y) and len(x) == 1: + return False # special case of batchsize 1 + ok = True + for i in range(0, len(x), 2): + ok = ok and (x[i] == y[i + 1]) and (x[i + 1] == y[i]) + return ok + + +def flip(tensor): + """ flip so that tensor[0::2] <=> tensor[1::2] """ + return torch.stack((tensor[1::2], tensor[0::2]), dim=1).flatten(0, 1) + + +def interleave(tensor1, tensor2): + res1 = torch.stack((tensor1, tensor2), dim=1).flatten(0, 1) + res2 = torch.stack((tensor2, tensor1), dim=1).flatten(0, 1) + return res1, res2 + + +def transpose_to_landscape(head, activate=True): + """ Predict in the correct aspect-ratio, + then transpose the result in landscape + and stack everything back together. + """ + def wrapper_no(decout, true_shape): + B = len(true_shape) + assert true_shape[0:1].allclose(true_shape), 'true_shape must be all identical' + H, W = true_shape[0].cpu().tolist() + res = head(decout, (H, W)) + return res + + def wrapper_yes(decout, true_shape): + B = len(true_shape) + # by definition, the batch is in landscape mode so W >= H + H, W = int(true_shape.min()), int(true_shape.max()) + + height, width = true_shape.T + is_landscape = (width >= height) + is_portrait = ~is_landscape + + # true_shape = true_shape.cpu() + if is_landscape.all(): + return head(decout, (H, W)) + if is_portrait.all(): + return transposed(head(decout, (W, H))) + + # batch is a mix of both portraint & landscape + def selout(ar): return [d[ar] for d in decout] + l_result = head(selout(is_landscape), (H, W)) + p_result = transposed(head(selout(is_portrait), (W, H))) + + # allocate full result + result = {} + for k in l_result | p_result: + x = l_result[k].new(B, *l_result[k].shape[1:]) + x[is_landscape] = l_result[k] + x[is_portrait] = p_result[k] + result[k] = x + + return result + + return wrapper_yes if activate else wrapper_no + + +def transposed(dic): + return {k: v.swapaxes(1, 2) for k, v in dic.items()} + + +def invalid_to_nans(arr, valid_mask, ndim=999): + if valid_mask is not None: + arr = arr.clone() + arr[~valid_mask] = float('nan') + if arr.ndim > ndim: + arr = arr.flatten(-2 - (arr.ndim - ndim), -2) + return arr + + +def invalid_to_zeros(arr, valid_mask, ndim=999): + if valid_mask is not None: + arr = arr.clone() + arr[~valid_mask] = 0 + nnz = valid_mask.view(len(valid_mask), -1).sum(1) + else: + nnz = arr.numel() // len(arr) if len(arr) else 0 # number of point per image + if arr.ndim > ndim: + arr = arr.flatten(-2 - (arr.ndim - ndim), -2) + return arr, nnz diff --git a/dust3r/dust3r/utils/parallel.py b/dust3r/dust3r/utils/parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..06ae7fefdb9d2298929f0cbc20dfbc57eb7d7f7b --- /dev/null +++ b/dust3r/dust3r/utils/parallel.py @@ -0,0 +1,79 @@ +# Copyright (C) 2024-present 
Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilitary functions for multiprocessing +# -------------------------------------------------------- +from tqdm import tqdm +from multiprocessing.dummy import Pool as ThreadPool +from multiprocessing import cpu_count + + +def parallel_threads(function, args, workers=0, star_args=False, kw_args=False, front_num=1, Pool=ThreadPool, **tqdm_kw): + """ tqdm but with parallel execution. + + Will essentially return + res = [ function(arg) # default + function(*arg) # if star_args is True + function(**arg) # if kw_args is True + for arg in args] + + Note: + the first elements of args will not be parallelized. + This can be useful for debugging. + """ + while workers <= 0: + workers += cpu_count() + if workers == 1: + front_num = float('inf') + + # convert into an iterable + try: + n_args_parallel = len(args) - front_num + except TypeError: + n_args_parallel = None + args = iter(args) + + # sequential execution first + front = [] + while len(front) < front_num: + try: + a = next(args) + except StopIteration: + return front # end of the iterable + front.append(function(*a) if star_args else function(**a) if kw_args else function(a)) + + # then parallel execution + out = [] + with Pool(workers) as pool: + # Pass the elements of args into function + if star_args: + futures = pool.imap(starcall, [(function, a) for a in args]) + elif kw_args: + futures = pool.imap(starstarcall, [(function, a) for a in args]) + else: + futures = pool.imap(function, args) + # Print out the progress as tasks complete + for f in tqdm(futures, total=n_args_parallel, **tqdm_kw): + out.append(f) + return front + out + + +def parallel_processes(*args, **kwargs): + """ Same as parallel_threads, with processes + """ + import multiprocessing as mp + kwargs['Pool'] = mp.Pool + return parallel_threads(*args, **kwargs) + + +def starcall(args): + """ convenient wrapper for Process.Pool """ + function, args = args + return function(*args) + + +def starstarcall(args): + """ convenient wrapper for Process.Pool """ + function, args = args + return function(**args) diff --git a/dust3r/dust3r/utils/path_to_croco.py b/dust3r/dust3r/utils/path_to_croco.py new file mode 100644 index 0000000000000000000000000000000000000000..bfdf39dd678aac8d6414b917b2e8288b2afbb6a7 --- /dev/null +++ b/dust3r/dust3r/utils/path_to_croco.py @@ -0,0 +1,42 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
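+
+# Illustrative sketch of parallel_threads from parallel.py above; the worker below is made up
+# for the example. The first `front_num` items run sequentially (easier debugging), the rest
+# go through a thread pool while tqdm reports progress.
+def _example_parallel_threads():
+    from dust3r.utils.parallel import parallel_threads
+    squares = parallel_threads(lambda x: x * x, range(8), workers=4, front_num=1)
+    return squares  # -> [0, 1, 4, 9, 16, 25, 36, 49]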
+# +# -------------------------------------------------------- +# CroCo submodule import +# -------------------------------------------------------- +import importlib +import sys + + +import sys +import os.path as path +HERE_PATH = path.normpath(path.dirname(__file__)) +CROCO_REPO_PATH = path.normpath(path.join(HERE_PATH, '../../croco')) +CROCO_MODELS_PATH = path.join(CROCO_REPO_PATH, 'models') +CROCO_INIT_PATH = path.join(CROCO_REPO_PATH, '__init__.py') + +# check the presence of models directory in repo to be sure its cloned +if path.isdir(CROCO_MODELS_PATH) and not path.isfile(CROCO_INIT_PATH): # croco is a submodule (main branch) + # workaround for sibling import + sys.path.insert(0, CROCO_REPO_PATH) + + # rewrite croco submodule imports to look like the package + def _alias(old: str, new: str): + mod = importlib.import_module(new) + sys.modules.setdefault(old, mod) + _alias("croco.models", "models") + _alias("croco.utils", "utils") +else: + try: + from croco.models.croco import CroCoNet # croco installed as a module + except ImportError as e: + raise ImportError(f"croco is not initialized, could not find: {CROCO_MODELS_PATH}.\n " + "Did you forget to run 'git submodule update --init --recursive' ?") + +# patch curope submodule when installed +try: + import curope # curope installed + import models.curope + models.curope.cuRoPE2D = curope.cuRoPE2D +except Exception as e: + pass diff --git a/dust3r/dust3r/viz.py b/dust3r/dust3r/viz.py new file mode 100644 index 0000000000000000000000000000000000000000..9150e8b850d9f1e6bf9ddf6e865d34fc743e276a --- /dev/null +++ b/dust3r/dust3r/viz.py @@ -0,0 +1,381 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Visualization utilities using trimesh +# -------------------------------------------------------- +import PIL.Image +import numpy as np +from scipy.spatial.transform import Rotation +import torch + +from dust3r.utils.geometry import geotrf, get_med_dist_between_poses, depthmap_to_absolute_camera_coordinates +from dust3r.utils.device import to_numpy +from dust3r.utils.image import rgb, img_to_arr + +try: + import trimesh +except ImportError: + print('/!\\ module trimesh is not installed, cannot visualize results /!\\') + + + +def cat_3d(vecs): + if isinstance(vecs, (np.ndarray, torch.Tensor)): + vecs = [vecs] + return np.concatenate([p.reshape(-1, 3) for p in to_numpy(vecs)]) + + +def show_raw_pointcloud(pts3d, colors, point_size=2): + scene = trimesh.Scene() + + pct = trimesh.PointCloud(cat_3d(pts3d), colors=cat_3d(colors)) + scene.add_geometry(pct) + + scene.show(line_settings={'point_size': point_size}) + + +def pts3d_to_trimesh(img, pts3d, valid=None): + H, W, THREE = img.shape + assert THREE == 3 + assert img.shape == pts3d.shape + + vertices = pts3d.reshape(-1, 3) + + # make squares: each pixel == 2 triangles + idx = np.arange(len(vertices)).reshape(H, W) + idx1 = idx[:-1, :-1].ravel() # top-left corner + idx2 = idx[:-1, +1:].ravel() # right-left corner + idx3 = idx[+1:, :-1].ravel() # bottom-left corner + idx4 = idx[+1:, +1:].ravel() # bottom-right corner + faces = np.concatenate(( + np.c_[idx1, idx2, idx3], + np.c_[idx3, idx2, idx1], # same triangle, but backward (cheap solution to cancel face culling) + np.c_[idx2, idx3, idx4], + np.c_[idx4, idx3, idx2], # same triangle, but backward (cheap solution to cancel face culling) + ), axis=0) + + # prepare triangle colors + face_colors = 
np.concatenate(( + img[:-1, :-1].reshape(-1, 3), + img[:-1, :-1].reshape(-1, 3), + img[+1:, +1:].reshape(-1, 3), + img[+1:, +1:].reshape(-1, 3) + ), axis=0) + + # remove invalid faces + if valid is not None: + assert valid.shape == (H, W) + valid_idxs = valid.ravel() + valid_faces = valid_idxs[faces].all(axis=-1) + faces = faces[valid_faces] + face_colors = face_colors[valid_faces] + + assert len(faces) == len(face_colors) + return dict(vertices=vertices, face_colors=face_colors, faces=faces) + + +def cat_meshes(meshes): + vertices, faces, colors = zip(*[(m['vertices'], m['faces'], m['face_colors']) for m in meshes]) + n_vertices = np.cumsum([0]+[len(v) for v in vertices]) + for i in range(len(faces)): + faces[i][:] += n_vertices[i] + + vertices = np.concatenate(vertices) + colors = np.concatenate(colors) + faces = np.concatenate(faces) + return dict(vertices=vertices, face_colors=colors, faces=faces) + + +def show_duster_pairs(view1, view2, pred1, pred2): + import matplotlib.pyplot as pl + pl.ion() + + for e in range(len(view1['instance'])): + i = view1['idx'][e] + j = view2['idx'][e] + img1 = rgb(view1['img'][e]) + img2 = rgb(view2['img'][e]) + conf1 = pred1['conf'][e].squeeze() + conf2 = pred2['conf'][e].squeeze() + score = conf1.mean()*conf2.mean() + print(f">> Showing pair #{e} {i}-{j} {score=:g}") + pl.clf() + pl.subplot(221).imshow(img1) + pl.subplot(223).imshow(img2) + pl.subplot(222).imshow(conf1, vmin=1, vmax=30) + pl.subplot(224).imshow(conf2, vmin=1, vmax=30) + pts1 = pred1['pts3d'][e] + pts2 = pred2['pts3d_in_other_view'][e] + pl.subplots_adjust(0, 0, 1, 1, 0, 0) + if input('show pointcloud? (y/n) ') == 'y': + show_raw_pointcloud(cat(pts1, pts2), cat(img1, img2), point_size=5) + + +def auto_cam_size(im_poses): + return 0.1 * get_med_dist_between_poses(im_poses) + + +class SceneViz: + def __init__(self): + self.scene = trimesh.Scene() + + def add_rgbd(self, image, depth, intrinsics=None, cam2world=None, zfar=np.inf, mask=None): + image = img_to_arr(image) + + # make up some intrinsics + if intrinsics is None: + H, W, THREE = image.shape + focal = max(H, W) + intrinsics = np.float32([[focal, 0, W/2], [0, focal, H/2], [0, 0, 1]]) + + # compute 3d points + pts3d = depthmap_to_pts3d(depth, intrinsics, cam2world=cam2world) + + return self.add_pointcloud(pts3d, image, mask=(depth 150) + mask |= (hsv[:, :, 1] < 30) & (hsv[:, :, 2] > 180) + mask |= (hsv[:, :, 1] < 50) & (hsv[:, :, 2] > 220) + + # Morphological operations + kernel = np.ones((5, 5), np.uint8) + mask2 = ndimage.binary_opening(mask, structure=kernel) + + # keep only largest CC + _, labels, stats, _ = cv2.connectedComponentsWithStats(mask2.view(np.uint8), connectivity=8) + cc_sizes = stats[1:, cv2.CC_STAT_AREA] + order = cc_sizes.argsort()[::-1] # bigger first + i = 0 + selection = [] + while i < len(order) and cc_sizes[order[i]] > cc_sizes[order[0]] / 2: + selection.append(1 + order[i]) + i += 1 + mask3 = np.in1d(labels, selection).reshape(labels.shape) + + # Apply mask + return torch.from_numpy(mask3) diff --git a/dust3r/dust3r_visloc/README.md b/dust3r/dust3r_visloc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..37754df8fa5f04840457883354da7856ff843957 --- /dev/null +++ b/dust3r/dust3r_visloc/README.md @@ -0,0 +1,93 @@ +# Visual Localization with DUSt3R + +## Dataset preparation + +### CambridgeLandmarks + +Each subscene should look like this: + +``` +Cambridge_Landmarks +├─ mapping +│ ├─ GreatCourt +│ │ └─ colmap/reconstruction +│ │ ├─ cameras.txt +│ │ ├─ images.txt +│ │ └─ points3D.txt 
+├─ kapture +│ ├─ GreatCourt +│ │ └─ query # https://github.com/naver/kapture/blob/main/doc/datasets.adoc#cambridge-landmarks +│ ... +├─ GreatCourt +│ ├─ pairsfile/query +│ │ └─ AP-GeM-LM18_top50.txt # https://github.com/naver/deep-image-retrieval/blob/master/dirtorch/extract_kapture.py followed by https://github.com/naver/kapture-localization/blob/main/tools/kapture_compute_image_pairs.py +│ ├─ seq1 +│ ... +... +``` + +### 7Scenes +Each subscene should look like this: + +``` +7-scenes +├─ chess +│ ├─ mapping/ # https://github.com/naver/kapture/blob/main/doc/datasets.adoc#1-7-scenes +│ ├─ query/ # https://github.com/naver/kapture/blob/main/doc/datasets.adoc#1-7-scenes +│ └─ pairsfile/query/ +│ └─ APGeM-LM18_top20.txt # https://github.com/naver/deep-image-retrieval/blob/master/dirtorch/extract_kapture.py followed by https://github.com/naver/kapture-localization/blob/main/tools/kapture_compute_image_pairs.py +... +``` + +### Aachen-Day-Night + +``` +Aachen-Day-Night-v1.1 +├─ mapping +│ ├─ colmap/reconstruction +│ │ ├─ cameras.txt +│ │ ├─ images.txt +│ │ └─ points3D.txt +├─ kapture +│ └─ query # https://github.com/naver/kapture/blob/main/doc/datasets.adoc#2-aachen-day-night-v11 +├─ images +│ ├─ db +│ ├─ query +│ └─ sequences +└─ pairsfile/query + └─ fire_top50.txt # https://github.com/naver/fire/blob/main/kapture_compute_pairs.py +``` + +### InLoc + +``` +InLoc +├─ mapping # https://github.com/naver/kapture/blob/main/doc/datasets.adoc#6-inloc +├─ query # https://github.com/naver/kapture/blob/main/doc/datasets.adoc#6-inloc +└─ pairsfile/query + └─ pairs-query-netvlad40-temporal.txt # https://github.com/cvg/Hierarchical-Localization/blob/master/pairs/inloc/pairs-query-netvlad40-temporal.txt +``` + +## Example Commands + +With `visloc.py` you can run our visual localization experiments on Aachen-Day-Night, InLoc, Cambridge Landmarks and 7 Scenes. 
+ +```bash +# Aachen-Day-Night-v1.1: +# scene in 'day' 'night' +# scene can also be 'all' +python3 visloc.py --model_name DUSt3R_ViTLarge_BaseDecoder_512_dpt --dataset "VislocAachenDayNight('/path/to/prepared/Aachen-Day-Night-v1.1/', subscene='${scene}', pairsfile='fire_top50', topk=20)" --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/Aachen-Day-Night-v1.1/${scene}/loc + +# InLoc +python3 visloc.py --model_name DUSt3R_ViTLarge_BaseDecoder_512_dpt --dataset "VislocInLoc('/path/to/prepared/InLoc/', pairsfile='pairs-query-netvlad40-temporal', topk=20)" --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/InLoc/loc + + +# 7-scenes: +# scene in 'chess' 'fire' 'heads' 'office' 'pumpkin' 'redkitchen' 'stairs' +python3 visloc.py --model_name DUSt3R_ViTLarge_BaseDecoder_512_dpt --dataset "VislocSevenScenes('/path/to/prepared/7-scenes/', subscene='${scene}', pairsfile='APGeM-LM18_top20', topk=1)" --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/7-scenes/${scene}/loc + +# Cambridge Landmarks: +# scene in 'ShopFacade' 'GreatCourt' 'KingsCollege' 'OldHospital' 'StMarysChurch' +python3 visloc.py --model_name DUSt3R_ViTLarge_BaseDecoder_512_dpt --dataset "VislocCambridgeLandmarks('/path/to/prepared/Cambridge_Landmarks/', subscene='${scene}', pairsfile='APGeM-LM18_top50', topk=20)" --pnp_mode poselib --reprojection_error_diag_ratio 0.008 --output_dir /path/to/output/Cambridge_Landmarks/${scene}/loc + +``` diff --git a/dust3r/dust3r_visloc/__init__.py b/dust3r/dust3r_visloc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a32692113d830ddc4af4e6ed608f222fbe062e6e --- /dev/null +++ b/dust3r/dust3r_visloc/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). diff --git a/dust3r/dust3r_visloc/datasets/__init__.py b/dust3r/dust3r_visloc/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..566926b1e248e4b64fc5182031af634435bb8601 --- /dev/null +++ b/dust3r/dust3r_visloc/datasets/__init__.py @@ -0,0 +1,6 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +from .sevenscenes import VislocSevenScenes +from .cambridge_landmarks import VislocCambridgeLandmarks +from .aachen_day_night import VislocAachenDayNight +from .inloc import VislocInLoc diff --git a/dust3r/dust3r_visloc/datasets/aachen_day_night.py b/dust3r/dust3r_visloc/datasets/aachen_day_night.py new file mode 100644 index 0000000000000000000000000000000000000000..159548e8b51a1b5872a2392cd9107ff96e40e801 --- /dev/null +++ b/dust3r/dust3r_visloc/datasets/aachen_day_night.py @@ -0,0 +1,24 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
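+
+# Usage sketch tying the README commands above to the Python API (illustrative: the root path
+# is a placeholder and `model` stands for any loaded DUSt3R-style model exposing patch_embed).
+# Each dataset item is a list of views: the query view first, then the top-k retrieved map views.
+def _example_visloc_dataset(model, root='/path/to/prepared/Aachen-Day-Night-v1.1/'):
+    from dust3r_visloc.datasets import VislocAachenDayNight
+    dataset = VislocAachenDayNight(root, subscene='day', pairsfile='fire_top50', topk=20)
+    dataset.set_resolution(model)   # sets maxdim / patch_size from the model's patch embedding
+    views = dataset[0]
+    return views[0]['image_name'], len(views) - 1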
+# +# -------------------------------------------------------- +# AachenDayNight dataloader +# -------------------------------------------------------- +import os +from dust3r_visloc.datasets.base_colmap import BaseVislocColmapDataset + + +class VislocAachenDayNight(BaseVislocColmapDataset): + def __init__(self, root, subscene, pairsfile, topk=1, cache_sfm=False): + assert subscene in [None, '', 'day', 'night', 'all'] + self.subscene = subscene + image_path = os.path.join(root, 'images') + map_path = os.path.join(root, 'mapping/colmap/reconstruction') + query_path = os.path.join(root, 'kapture', 'query') + pairsfile_path = os.path.join(root, 'pairsfile/query', pairsfile + '.txt') + super().__init__(image_path=image_path, map_path=map_path, + query_path=query_path, pairsfile_path=pairsfile_path, + topk=topk, cache_sfm=cache_sfm) + self.scenes = [filename for filename in self.scenes if filename in self.pairs] + if self.subscene == 'day' or self.subscene == 'night': + self.scenes = [filename for filename in self.scenes if self.subscene in filename] diff --git a/dust3r/dust3r_visloc/datasets/base_colmap.py b/dust3r/dust3r_visloc/datasets/base_colmap.py new file mode 100644 index 0000000000000000000000000000000000000000..def1da61b5d3b416db5845c2016082348df944a6 --- /dev/null +++ b/dust3r/dust3r_visloc/datasets/base_colmap.py @@ -0,0 +1,282 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Base class for colmap / kapture +# -------------------------------------------------------- +import os +import numpy as np +from tqdm import tqdm +import collections +import pickle +import PIL.Image +import torch +from scipy.spatial.transform import Rotation +import torchvision.transforms as tvf + +from kapture.core import CameraType +from kapture.io.csv import kapture_from_dir +from kapture_localization.utils.pairsfile import get_ordered_pairs_from_file + +from dust3r_visloc.datasets.utils import cam_to_world_from_kapture, get_resize_function, rescale_points3d +from dust3r_visloc.datasets.base_dataset import BaseVislocDataset +from dust3r.datasets.utils.transforms import ImgNorm +from dust3r.utils.geometry import colmap_to_opencv_intrinsics + +KaptureSensor = collections.namedtuple('Sensor', 'sensor_params camera_params') + + +def kapture_to_opencv_intrinsics(sensor): + """ + Convert from Kapture to OpenCV parameters. + Warning: we assume that the camera and pixel coordinates follow Colmap conventions here. + Args: + sensor: Kapture sensor + """ + sensor_type = sensor.sensor_params[0] + if sensor_type == "SIMPLE_PINHOLE": + # Simple pinhole model. + # We still call OpenCV undistorsion however for code simplicity. 
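+        # SIMPLE_PINHOLE packs (w, h, f, cx, cy): one focal length, a principal point, no distortion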
+ w, h, f, cx, cy = sensor.camera_params + k1 = 0 + k2 = 0 + p1 = 0 + p2 = 0 + fx = fy = f + elif sensor_type == "PINHOLE": + w, h, fx, fy, cx, cy = sensor.camera_params + k1 = 0 + k2 = 0 + p1 = 0 + p2 = 0 + elif sensor_type == "SIMPLE_RADIAL": + w, h, f, cx, cy, k1 = sensor.camera_params + k2 = 0 + p1 = 0 + p2 = 0 + fx = fy = f + elif sensor_type == "RADIAL": + w, h, f, cx, cy, k1, k2 = sensor.camera_params + p1 = 0 + p2 = 0 + fx = fy = f + elif sensor_type == "OPENCV": + w, h, fx, fy, cx, cy, k1, k2, p1, p2 = sensor.camera_params + else: + raise NotImplementedError(f"Sensor type {sensor_type} is not supported yet.") + + cameraMatrix = np.asarray([[fx, 0, cx], + [0, fy, cy], + [0, 0, 1]], dtype=np.float32) + + # We assume that Kapture data comes from Colmap: the origin is different. + cameraMatrix = colmap_to_opencv_intrinsics(cameraMatrix) + + distCoeffs = np.asarray([k1, k2, p1, p2], dtype=np.float32) + return cameraMatrix, distCoeffs, (w, h) + + +def K_from_colmap(elems): + sensor = KaptureSensor(elems, tuple(map(float, elems[1:]))) + cameraMatrix, distCoeffs, (w, h) = kapture_to_opencv_intrinsics(sensor) + res = dict(resolution=(w, h), + intrinsics=cameraMatrix, + distortion=distCoeffs) + return res + + +def pose_from_qwxyz_txyz(elems): + qw, qx, qy, qz, tx, ty, tz = map(float, elems) + pose = np.eye(4) + pose[:3, :3] = Rotation.from_quat((qx, qy, qz, qw)).as_matrix() + pose[:3, 3] = (tx, ty, tz) + return np.linalg.inv(pose) # returns cam2world + + +class BaseVislocColmapDataset(BaseVislocDataset): + def __init__(self, image_path, map_path, query_path, pairsfile_path, topk=1, cache_sfm=False): + super().__init__() + self.topk = topk + self.num_views = self.topk + 1 + self.image_path = image_path + self.cache_sfm = cache_sfm + + self._load_sfm(map_path) + + kdata_query = kapture_from_dir(query_path) + assert kdata_query.records_camera is not None and kdata_query.trajectories is not None + + kdata_query_searchindex = {kdata_query.records_camera[(timestamp, sensor_id)]: (timestamp, sensor_id) + for timestamp, sensor_id in kdata_query.records_camera.key_pairs()} + self.query_data = {'kdata': kdata_query, 'searchindex': kdata_query_searchindex} + + self.pairs = get_ordered_pairs_from_file(pairsfile_path) + self.scenes = kdata_query.records_camera.data_list() + + def _load_sfm(self, sfm_dir): + sfm_cache_path = os.path.join(sfm_dir, 'dust3r_cache.pkl') + if os.path.isfile(sfm_cache_path) and self.cache_sfm: + with open(sfm_cache_path, "rb") as f: + data = pickle.load(f) + self.img_infos = data['img_infos'] + self.points3D = data['points3D'] + return + + # load cameras + with open(os.path.join(sfm_dir, 'cameras.txt'), 'r') as f: + raw = f.read().splitlines()[3:] # skip header + + intrinsics = {} + for camera in tqdm(raw): + camera = camera.split(' ') + intrinsics[int(camera[0])] = K_from_colmap(camera[1:]) + + # load images + with open(os.path.join(sfm_dir, 'images.txt'), 'r') as f: + raw = f.read().splitlines() + raw = [line for line in raw if not line.startswith('#')] # skip header + + self.img_infos = {} + for image, points in tqdm(zip(raw[0::2], raw[1::2]), total=len(raw) // 2): + image = image.split(' ') + points = points.split(' ') + + img_name = image[-1] + current_points2D = {int(i): (float(x), float(y)) + for i, x, y in zip(points[2::3], points[0::3], points[1::3]) if i != '-1'} + self.img_infos[img_name] = dict(intrinsics[int(image[-2])], + path=img_name, + camera_pose=pose_from_qwxyz_txyz(image[1: -2]), + sparse_pts2d=current_points2D) + + # load 3D points + with 
open(os.path.join(sfm_dir, 'points3D.txt'), 'r') as f: + raw = f.read().splitlines() + raw = [line for line in raw if not line.startswith('#')] # skip header + + self.points3D = {} + for point in tqdm(raw): + point = point.split() + self.points3D[int(point[0])] = tuple(map(float, point[1:4])) + + if self.cache_sfm: + to_save = \ + { + 'img_infos': self.img_infos, + 'points3D': self.points3D + } + with open(sfm_cache_path, "wb") as f: + pickle.dump(to_save, f) + + def __len__(self): + return len(self.scenes) + + def _get_view_query(self, imgname): + kdata, searchindex = map(self.query_data.get, ['kdata', 'searchindex']) + + timestamp, camera_id = searchindex[imgname] + + camera_params = kdata.sensors[camera_id].camera_params + if kdata.sensors[camera_id].camera_type == CameraType.SIMPLE_PINHOLE: + W, H, f, cx, cy = camera_params + k1 = 0 + fx = fy = f + elif kdata.sensors[camera_id].camera_type == CameraType.SIMPLE_RADIAL: + W, H, f, cx, cy, k1 = camera_params + fx = fy = f + else: + raise NotImplementedError('not implemented') + + W, H = int(W), int(H) + intrinsics = np.float32([(fx, 0, cx), + (0, fy, cy), + (0, 0, 1)]) + intrinsics = colmap_to_opencv_intrinsics(intrinsics) + distortion = [k1, 0, 0, 0] + + if kdata.trajectories is not None and (timestamp, camera_id) in kdata.trajectories: + cam_to_world = cam_to_world_from_kapture(kdata, timestamp, camera_id) + else: + cam_to_world = np.eye(4, dtype=np.float32) + + # Load RGB image + rgb_image = PIL.Image.open(os.path.join(self.image_path, imgname)).convert('RGB') + rgb_image.load() + resize_func, _, to_orig = get_resize_function(self.maxdim, self.patch_size, H, W) + rgb_tensor = resize_func(ImgNorm(rgb_image)) + + view = { + 'intrinsics': intrinsics, + 'distortion': distortion, + 'cam_to_world': cam_to_world, + 'rgb': rgb_image, + 'rgb_rescaled': rgb_tensor, + 'to_orig': to_orig, + 'idx': 0, + 'image_name': imgname + } + return view + + def _get_view_map(self, imgname, idx): + infos = self.img_infos[imgname] + + rgb_image = PIL.Image.open(os.path.join(self.image_path, infos['path'])).convert('RGB') + rgb_image.load() + W, H = rgb_image.size + intrinsics = infos['intrinsics'] + intrinsics = colmap_to_opencv_intrinsics(intrinsics) + distortion_coefs = infos['distortion'] + + pts2d = infos['sparse_pts2d'] + sparse_pos2d = np.float32(list(pts2d.values())).reshape((-1, 2)) # pts2d from colmap + sparse_pts3d = np.float32([self.points3D[i] for i in pts2d]).reshape((-1, 3)) + + # store full resolution 2D->3D + sparse_pos2d_cv2 = sparse_pos2d.copy() + sparse_pos2d_cv2[:, 0] -= 0.5 + sparse_pos2d_cv2[:, 1] -= 0.5 + sparse_pos2d_int = sparse_pos2d_cv2.round().astype(np.int64) + valid = (sparse_pos2d_int[:, 0] >= 0) & (sparse_pos2d_int[:, 0] < W) & ( + sparse_pos2d_int[:, 1] >= 0) & (sparse_pos2d_int[:, 1] < H) + sparse_pos2d_int = sparse_pos2d_int[valid] + # nan => invalid + pts3d = np.full((H, W, 3), np.nan, dtype=np.float32) + pts3d[sparse_pos2d_int[:, 1], sparse_pos2d_int[:, 0]] = sparse_pts3d[valid] + pts3d = torch.from_numpy(pts3d) + + cam_to_world = infos['camera_pose'] # cam2world + + # also store resized resolution 2D->3D + resize_func, to_resize, to_orig = get_resize_function(self.maxdim, self.patch_size, H, W) + rgb_tensor = resize_func(ImgNorm(rgb_image)) + + HR, WR = rgb_tensor.shape[1:] + _, _, pts3d_rescaled, valid_rescaled = rescale_points3d(sparse_pos2d_cv2, sparse_pts3d, to_resize, HR, WR) + pts3d_rescaled = torch.from_numpy(pts3d_rescaled) + valid_rescaled = torch.from_numpy(valid_rescaled) + + view = { + 'intrinsics': intrinsics, 
+ 'distortion': distortion_coefs, + 'cam_to_world': cam_to_world, + 'rgb': rgb_image, + "pts3d": pts3d, + "valid": pts3d.sum(dim=-1).isfinite(), + 'rgb_rescaled': rgb_tensor, + "pts3d_rescaled": pts3d_rescaled, + "valid_rescaled": valid_rescaled, + 'to_orig': to_orig, + 'idx': idx, + 'image_name': imgname + } + return view + + def __getitem__(self, idx): + assert self.maxdim is not None and self.patch_size is not None + query_image = self.scenes[idx] + map_images = [p[0] for p in self.pairs[query_image][:self.topk]] + views = [] + views.append(self._get_view_query(query_image)) + for idx, map_image in enumerate(map_images): + views.append(self._get_view_map(map_image, idx + 1)) + return views diff --git a/dust3r/dust3r_visloc/datasets/base_dataset.py b/dust3r/dust3r_visloc/datasets/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..cda3774c5ab5b668be5eecf89681abc96df5fe17 --- /dev/null +++ b/dust3r/dust3r_visloc/datasets/base_dataset.py @@ -0,0 +1,19 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Base class +# -------------------------------------------------------- +class BaseVislocDataset: + def __init__(self): + pass + + def set_resolution(self, model): + self.maxdim = max(model.patch_embed.img_size) + self.patch_size = model.patch_embed.patch_size + + def __len__(self): + raise NotImplementedError() + + def __getitem__(self, idx): + raise NotImplementedError() \ No newline at end of file diff --git a/dust3r/dust3r_visloc/datasets/cambridge_landmarks.py b/dust3r/dust3r_visloc/datasets/cambridge_landmarks.py new file mode 100644 index 0000000000000000000000000000000000000000..ca3e131941bf444d86a709d23e518e7b93d3d0f6 --- /dev/null +++ b/dust3r/dust3r_visloc/datasets/cambridge_landmarks.py @@ -0,0 +1,19 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Cambridge Landmarks dataloader +# -------------------------------------------------------- +import os +from dust3r_visloc.datasets.base_colmap import BaseVislocColmapDataset + + +class VislocCambridgeLandmarks (BaseVislocColmapDataset): + def __init__(self, root, subscene, pairsfile, topk=1, cache_sfm=False): + image_path = os.path.join(root, subscene) + map_path = os.path.join(root, 'mapping', subscene, 'colmap/reconstruction') + query_path = os.path.join(root, 'kapture', subscene, 'query') + pairsfile_path = os.path.join(root, subscene, 'pairsfile/query', pairsfile + '.txt') + super().__init__(image_path=image_path, map_path=map_path, + query_path=query_path, pairsfile_path=pairsfile_path, + topk=topk, cache_sfm=cache_sfm) \ No newline at end of file diff --git a/dust3r/dust3r_visloc/datasets/inloc.py b/dust3r/dust3r_visloc/datasets/inloc.py new file mode 100644 index 0000000000000000000000000000000000000000..99ed11f554203d353d0559d0589f40ec1ffbf66e --- /dev/null +++ b/dust3r/dust3r_visloc/datasets/inloc.py @@ -0,0 +1,167 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
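+
+# Small sketch of pose_from_qwxyz_txyz from base_colmap.py above (illustrative): COLMAP's
+# images.txt stores a world-to-camera quaternion (qw qx qy qz) plus translation, and the helper
+# inverts it to return a cam-to-world matrix.
+def _example_pose_from_colmap_line():
+    import numpy as np
+    from dust3r_visloc.datasets.base_colmap import pose_from_qwxyz_txyz
+    cam2world = pose_from_qwxyz_txyz(['1', '0', '0', '0', '0', '0', '0'])  # identity pose
+    assert np.allclose(cam2world, np.eye(4))
+    return cam2world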
+# +# -------------------------------------------------------- +# InLoc dataloader +# -------------------------------------------------------- +import os +import numpy as np +import torch +import PIL.Image +import scipy.io + +import kapture +from kapture.io.csv import kapture_from_dir +from kapture_localization.utils.pairsfile import get_ordered_pairs_from_file + +from dust3r_visloc.datasets.utils import cam_to_world_from_kapture, get_resize_function, rescale_points3d +from dust3r_visloc.datasets.base_dataset import BaseVislocDataset +from dust3r.datasets.utils.transforms import ImgNorm +from dust3r.utils.geometry import xy_grid, geotrf + + +def read_alignments(path_to_alignment): + aligns = {} + with open(path_to_alignment, "r") as fid: + while True: + line = fid.readline() + if not line: + break + if len(line) == 4: + trans_nr = line[:-1] + while line != 'After general icp:\n': + line = fid.readline() + line = fid.readline() + p = [] + for i in range(4): + elems = line.split(' ') + line = fid.readline() + for e in elems: + if len(e) != 0: + p.append(float(e)) + P = np.array(p).reshape(4, 4) + aligns[trans_nr] = P + return aligns + + +class VislocInLoc(BaseVislocDataset): + def __init__(self, root, pairsfile, topk=1): + super().__init__() + self.root = root + self.topk = topk + self.num_views = self.topk + 1 + self.maxdim = None + self.patch_size = None + + query_path = os.path.join(self.root, 'query') + kdata_query = kapture_from_dir(query_path) + assert kdata_query.records_camera is not None + kdata_query_searchindex = {kdata_query.records_camera[(timestamp, sensor_id)]: (timestamp, sensor_id) + for timestamp, sensor_id in kdata_query.records_camera.key_pairs()} + self.query_data = {'path': query_path, 'kdata': kdata_query, 'searchindex': kdata_query_searchindex} + + map_path = os.path.join(self.root, 'mapping') + kdata_map = kapture_from_dir(map_path) + assert kdata_map.records_camera is not None and kdata_map.trajectories is not None + kdata_map_searchindex = {kdata_map.records_camera[(timestamp, sensor_id)]: (timestamp, sensor_id) + for timestamp, sensor_id in kdata_map.records_camera.key_pairs()} + self.map_data = {'path': map_path, 'kdata': kdata_map, 'searchindex': kdata_map_searchindex} + + try: + self.pairs = get_ordered_pairs_from_file(os.path.join(self.root, 'pairfiles/query', pairsfile + '.txt')) + except Exception as e: + # if using pairs from hloc + self.pairs = {} + with open(os.path.join(self.root, 'pairfiles/query', pairsfile + '.txt'), 'r') as fid: + lines = fid.readlines() + for line in lines: + splits = line.rstrip("\n\r").split(" ") + self.pairs.setdefault(splits[0].replace('query/', ''), []).append( + (splits[1].replace('database/cutouts/', ''), 1.0) + ) + + self.scenes = kdata_query.records_camera.data_list() + + self.aligns_DUC1 = read_alignments(os.path.join(self.root, 'mapping/DUC1_alignment/all_transformations.txt')) + self.aligns_DUC2 = read_alignments(os.path.join(self.root, 'mapping/DUC2_alignment/all_transformations.txt')) + + def __len__(self): + return len(self.scenes) + + def __getitem__(self, idx): + assert self.maxdim is not None and self.patch_size is not None + query_image = self.scenes[idx] + map_images = [p[0] for p in self.pairs[query_image][:self.topk]] + views = [] + dataarray = [(query_image, self.query_data, False)] + [(map_image, self.map_data, True) + for map_image in map_images] + for idx, (imgname, data, should_load_depth) in enumerate(dataarray): + imgpath, kdata, searchindex = map(data.get, ['path', 'kdata', 'searchindex']) + + 
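+            # look up the kapture (timestamp, sensor_id) key recorded for this image name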
timestamp, camera_id = searchindex[imgname] + + # for InLoc, SIMPLE_PINHOLE + camera_params = kdata.sensors[camera_id].camera_params + W, H, f, cx, cy = camera_params + distortion = [0, 0, 0, 0] + intrinsics = np.float32([(f, 0, cx), + (0, f, cy), + (0, 0, 1)]) + + if kdata.trajectories is not None and (timestamp, camera_id) in kdata.trajectories: + cam_to_world = cam_to_world_from_kapture(kdata, timestamp, camera_id) + else: + cam_to_world = np.eye(4, dtype=np.float32) + + # Load RGB image + rgb_image = PIL.Image.open(os.path.join(imgpath, 'sensors/records_data', imgname)).convert('RGB') + rgb_image.load() + + W, H = rgb_image.size + resize_func, to_resize, to_orig = get_resize_function(self.maxdim, self.patch_size, H, W) + + rgb_tensor = resize_func(ImgNorm(rgb_image)) + + view = { + 'intrinsics': intrinsics, + 'distortion': distortion, + 'cam_to_world': cam_to_world, + 'rgb': rgb_image, + 'rgb_rescaled': rgb_tensor, + 'to_orig': to_orig, + 'idx': idx, + 'image_name': imgname + } + + # Load depthmap + if should_load_depth: + depthmap_filename = os.path.join(imgpath, 'sensors/records_data', imgname + '.mat') + depthmap = scipy.io.loadmat(depthmap_filename) + + pt3d_cut = depthmap['XYZcut'] + scene_id = imgname.replace('\\', '/').split('/')[1] + if imgname.startswith('DUC1'): + pts3d_full = geotrf(self.aligns_DUC1[scene_id], pt3d_cut) + else: + pts3d_full = geotrf(self.aligns_DUC2[scene_id], pt3d_cut) + + pts3d_valid = np.isfinite(pts3d_full.sum(axis=-1)) + + pts3d = pts3d_full[pts3d_valid] + pts2d_int = xy_grid(W, H)[pts3d_valid] + pts2d = pts2d_int.astype(np.float64) + + # nan => invalid + pts3d_full[~pts3d_valid] = np.nan + pts3d_full = torch.from_numpy(pts3d_full) + view['pts3d'] = pts3d_full + view["valid"] = pts3d_full.sum(dim=-1).isfinite() + + HR, WR = rgb_tensor.shape[1:] + _, _, pts3d_rescaled, valid_rescaled = rescale_points3d(pts2d, pts3d, to_resize, HR, WR) + pts3d_rescaled = torch.from_numpy(pts3d_rescaled) + valid_rescaled = torch.from_numpy(valid_rescaled) + view['pts3d_rescaled'] = pts3d_rescaled + view["valid_rescaled"] = valid_rescaled + views.append(view) + return views diff --git a/dust3r/dust3r_visloc/datasets/sevenscenes.py b/dust3r/dust3r_visloc/datasets/sevenscenes.py new file mode 100644 index 0000000000000000000000000000000000000000..c15e851d262f0d7ba7071c933d8fe8f0a6b1c49d --- /dev/null +++ b/dust3r/dust3r_visloc/datasets/sevenscenes.py @@ -0,0 +1,123 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
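+
+# Sketch of how the InLoc loader above maps depth cutouts into the global frame with geotrf
+# (illustrative values): a 4x4 alignment applied to an (H, W, 3) array of points.
+def _example_apply_alignment():
+    import numpy as np
+    from dust3r.utils.geometry import geotrf
+    T = np.eye(4, dtype=np.float32)
+    T[:3, 3] = (1.0, 0.0, 0.0)                   # pure translation along x
+    pts = np.zeros((2, 2, 3), dtype=np.float32)  # tiny stand-in for an XYZcut
+    moved = geotrf(T, pts)                       # every point shifted by (1, 0, 0)
+    assert np.allclose(moved[..., 0], 1.0)
+    return moved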
+# +# -------------------------------------------------------- +# 7 Scenes dataloader +# -------------------------------------------------------- +import os +import numpy as np +import torch +import PIL.Image + +import kapture +from kapture.io.csv import kapture_from_dir +from kapture_localization.utils.pairsfile import get_ordered_pairs_from_file +from kapture.io.records import depth_map_from_file + +from dust3r_visloc.datasets.utils import cam_to_world_from_kapture, get_resize_function, rescale_points3d +from dust3r_visloc.datasets.base_dataset import BaseVislocDataset +from dust3r.datasets.utils.transforms import ImgNorm +from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates, xy_grid, geotrf + + +class VislocSevenScenes(BaseVislocDataset): + def __init__(self, root, subscene, pairsfile, topk=1): + super().__init__() + self.root = root + self.subscene = subscene + self.topk = topk + self.num_views = self.topk + 1 + self.maxdim = None + self.patch_size = None + + query_path = os.path.join(self.root, subscene, 'query') + kdata_query = kapture_from_dir(query_path) + assert kdata_query.records_camera is not None and kdata_query.trajectories is not None and kdata_query.rigs is not None + kapture.rigs_remove_inplace(kdata_query.trajectories, kdata_query.rigs) + kdata_query_searchindex = {kdata_query.records_camera[(timestamp, sensor_id)]: (timestamp, sensor_id) + for timestamp, sensor_id in kdata_query.records_camera.key_pairs()} + self.query_data = {'path': query_path, 'kdata': kdata_query, 'searchindex': kdata_query_searchindex} + + map_path = os.path.join(self.root, subscene, 'mapping') + kdata_map = kapture_from_dir(map_path) + assert kdata_map.records_camera is not None and kdata_map.trajectories is not None and kdata_map.rigs is not None + kapture.rigs_remove_inplace(kdata_map.trajectories, kdata_map.rigs) + kdata_map_searchindex = {kdata_map.records_camera[(timestamp, sensor_id)]: (timestamp, sensor_id) + for timestamp, sensor_id in kdata_map.records_camera.key_pairs()} + self.map_data = {'path': map_path, 'kdata': kdata_map, 'searchindex': kdata_map_searchindex} + + self.pairs = get_ordered_pairs_from_file(os.path.join(self.root, subscene, + 'pairfiles/query', + pairsfile + '.txt')) + self.scenes = kdata_query.records_camera.data_list() + + def __len__(self): + return len(self.scenes) + + def __getitem__(self, idx): + assert self.maxdim is not None and self.patch_size is not None + query_image = self.scenes[idx] + map_images = [p[0] for p in self.pairs[query_image][:self.topk]] + views = [] + dataarray = [(query_image, self.query_data, False)] + [(map_image, self.map_data, True) + for map_image in map_images] + for idx, (imgname, data, should_load_depth) in enumerate(dataarray): + imgpath, kdata, searchindex = map(data.get, ['path', 'kdata', 'searchindex']) + + timestamp, camera_id = searchindex[imgname] + + # for 7scenes, SIMPLE_PINHOLE + camera_params = kdata.sensors[camera_id].camera_params + W, H, f, cx, cy = camera_params + distortion = [0, 0, 0, 0] + intrinsics = np.float32([(f, 0, cx), + (0, f, cy), + (0, 0, 1)]) + + cam_to_world = cam_to_world_from_kapture(kdata, timestamp, camera_id) + + # Load RGB image + rgb_image = PIL.Image.open(os.path.join(imgpath, 'sensors/records_data', imgname)).convert('RGB') + rgb_image.load() + + W, H = rgb_image.size + resize_func, to_resize, to_orig = get_resize_function(self.maxdim, self.patch_size, H, W) + + rgb_tensor = resize_func(ImgNorm(rgb_image)) + + view = { + 'intrinsics': intrinsics, + 'distortion': distortion, + 
'cam_to_world': cam_to_world, + 'rgb': rgb_image, + 'rgb_rescaled': rgb_tensor, + 'to_orig': to_orig, + 'idx': idx, + 'image_name': imgname + } + + # Load depthmap + if should_load_depth: + depthmap_filename = os.path.join(imgpath, 'sensors/records_data', + imgname.replace('color.png', 'depth.reg')) + depthmap = depth_map_from_file(depthmap_filename, (int(W), int(H))).astype(np.float32) + pts3d_full, pts3d_valid = depthmap_to_absolute_camera_coordinates(depthmap, intrinsics, cam_to_world) + + pts3d = pts3d_full[pts3d_valid] + pts2d_int = xy_grid(W, H)[pts3d_valid] + pts2d = pts2d_int.astype(np.float64) + + # nan => invalid + pts3d_full[~pts3d_valid] = np.nan + pts3d_full = torch.from_numpy(pts3d_full) + view['pts3d'] = pts3d_full + view["valid"] = pts3d_full.sum(dim=-1).isfinite() + + HR, WR = rgb_tensor.shape[1:] + _, _, pts3d_rescaled, valid_rescaled = rescale_points3d(pts2d, pts3d, to_resize, HR, WR) + pts3d_rescaled = torch.from_numpy(pts3d_rescaled) + valid_rescaled = torch.from_numpy(valid_rescaled) + view['pts3d_rescaled'] = pts3d_rescaled + view["valid_rescaled"] = valid_rescaled + views.append(view) + return views diff --git a/dust3r/dust3r_visloc/datasets/utils.py b/dust3r/dust3r_visloc/datasets/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6053ae2e5ba6c0b0f5f014161b666623d6e0f3f5 --- /dev/null +++ b/dust3r/dust3r_visloc/datasets/utils.py @@ -0,0 +1,118 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# dataset utilities +# -------------------------------------------------------- +import numpy as np +import quaternion +import torchvision.transforms as tvf +from dust3r.utils.geometry import geotrf + + +def cam_to_world_from_kapture(kdata, timestamp, camera_id): + camera_to_world = kdata.trajectories[timestamp, camera_id].inverse() + camera_pose = np.eye(4, dtype=np.float32) + camera_pose[:3, :3] = quaternion.as_rotation_matrix(camera_to_world.r) + camera_pose[:3, 3] = camera_to_world.t_raw + return camera_pose + + +ratios_resolutions = { + 224: {1.0: [224, 224]}, + 512: {4 / 3: [512, 384], 32 / 21: [512, 336], 16 / 9: [512, 288], 2 / 1: [512, 256], 16 / 5: [512, 160]} +} + + +def get_HW_resolution(H, W, maxdim, patchsize=16): + assert maxdim in ratios_resolutions, "Error, maxdim can only be 224 or 512 for now. Other maxdims not implemented yet." + ratios_resolutions_maxdim = ratios_resolutions[maxdim] + mindims = set([min(res) for res in ratios_resolutions_maxdim.values()]) + ratio = W / H + ref_ratios = np.array([*(ratios_resolutions_maxdim.keys())]) + islandscape = (W >= H) + if islandscape: + diff = np.abs(ratio - ref_ratios) + else: + diff = np.abs(ratio - (1 / ref_ratios)) + selkey = ref_ratios[np.argmin(diff)] + res = ratios_resolutions_maxdim[selkey] + # check patchsize and make sure output resolution is a multiple of patchsize + if isinstance(patchsize, tuple): + assert len(patchsize) == 2 and isinstance(patchsize[0], int) and isinstance( + patchsize[1], int), "What is your patchsize format? Expected a single int or a tuple of two ints." 
+ assert patchsize[0] == patchsize[1], "Error, non square patches not managed" + patchsize = patchsize[0] + assert max(res) == maxdim + assert min(res) in mindims + return res[::-1] if islandscape else res # return HW + + +def get_resize_function(maxdim, patch_size, H, W, is_mask=False): + if [max(H, W), min(H, W)] in ratios_resolutions[maxdim].values(): + return lambda x: x, np.eye(3), np.eye(3) + else: + target_HW = get_HW_resolution(H, W, maxdim=maxdim, patchsize=patch_size) + + ratio = W / H + target_ratio = target_HW[1] / target_HW[0] + to_orig_crop = np.eye(3) + to_rescaled_crop = np.eye(3) + if abs(ratio - target_ratio) < np.finfo(np.float32).eps: + crop_W = W + crop_H = H + elif ratio - target_ratio < 0: + crop_W = W + crop_H = int(W / target_ratio) + to_orig_crop[1, 2] = (H - crop_H) / 2.0 + to_rescaled_crop[1, 2] = -(H - crop_H) / 2.0 + else: + crop_W = int(H * target_ratio) + crop_H = H + to_orig_crop[0, 2] = (W - crop_W) / 2.0 + to_rescaled_crop[0, 2] = - (W - crop_W) / 2.0 + + crop_op = tvf.CenterCrop([crop_H, crop_W]) + + if is_mask: + resize_op = tvf.Resize(size=target_HW, interpolation=tvf.InterpolationMode.NEAREST_EXACT) + else: + resize_op = tvf.Resize(size=target_HW) + to_orig_resize = np.array([[crop_W / target_HW[1], 0, 0], + [0, crop_H / target_HW[0], 0], + [0, 0, 1]]) + to_rescaled_resize = np.array([[target_HW[1] / crop_W, 0, 0], + [0, target_HW[0] / crop_H, 0], + [0, 0, 1]]) + + op = tvf.Compose([crop_op, resize_op]) + + return op, to_rescaled_resize @ to_rescaled_crop, to_orig_crop @ to_orig_resize + + +def rescale_points3d(pts2d, pts3d, to_resize, HR, WR): + # rescale pts2d as floats + # to colmap, so that the image is in [0, D] -> [0, NewD] + pts2d = pts2d.copy() + pts2d[:, 0] += 0.5 + pts2d[:, 1] += 0.5 + + pts2d_rescaled = geotrf(to_resize, pts2d, norm=True) + + pts2d_rescaled_int = pts2d_rescaled.copy() + # convert back to cv2 before round [-0.5, 0.5] -> pixel 0 + pts2d_rescaled_int[:, 0] -= 0.5 + pts2d_rescaled_int[:, 1] -= 0.5 + pts2d_rescaled_int = pts2d_rescaled_int.round().astype(np.int64) + + # update valid (remove cropped regions) + valid_rescaled = (pts2d_rescaled_int[:, 0] >= 0) & (pts2d_rescaled_int[:, 0] < WR) & ( + pts2d_rescaled_int[:, 1] >= 0) & (pts2d_rescaled_int[:, 1] < HR) + + pts2d_rescaled_int = pts2d_rescaled_int[valid_rescaled] + + # rebuild pts3d from rescaled ps2d poses + pts3d_rescaled = np.full((HR, WR, 3), np.nan, dtype=np.float32) # pts3d in 512 x something + pts3d_rescaled[pts2d_rescaled_int[:, 1], pts2d_rescaled_int[:, 0]] = pts3d[valid_rescaled] + + return pts2d_rescaled, pts2d_rescaled_int, pts3d_rescaled, np.isfinite(pts3d_rescaled.sum(axis=-1)) diff --git a/dust3r/dust3r_visloc/evaluation.py b/dust3r/dust3r_visloc/evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..027179f2b1007db558f57d3d67f48a6d7aa1ab9d --- /dev/null +++ b/dust3r/dust3r_visloc/evaluation.py @@ -0,0 +1,65 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# evaluation utilities +# -------------------------------------------------------- +import numpy as np +import quaternion +import torch +import roma +import collections +import os + + +def aggregate_stats(info_str, pose_errors, angular_errors): + stats = collections.Counter() + median_pos_error = np.median(pose_errors) + median_angular_error = np.median(angular_errors) + out_str = f'{info_str}: {len(pose_errors)} images - {median_pos_error=}, {median_angular_error=}' + + for trl_thr, ang_thr in [(0.1, 1), (0.25, 2), (0.5, 5), (5, 10)]: + for pose_error, angular_error in zip(pose_errors, angular_errors): + correct_for_this_threshold = (pose_error < trl_thr) and (angular_error < ang_thr) + stats[trl_thr, ang_thr] += correct_for_this_threshold + stats = {f'acc@{key[0]:g}m,{key[1]}deg': 100 * val / len(pose_errors) for key, val in stats.items()} + for metric, perf in stats.items(): + out_str += f' - {metric:12s}={float(perf):.3f}' + return out_str + + +def get_pose_error(pr_camtoworld, gt_cam_to_world): + abs_transl_error = torch.linalg.norm(torch.tensor(pr_camtoworld[:3, 3]) - torch.tensor(gt_cam_to_world[:3, 3])) + abs_angular_error = roma.rotmat_geodesic_distance(torch.tensor(pr_camtoworld[:3, :3]), + torch.tensor(gt_cam_to_world[:3, :3])) * 180 / np.pi + return abs_transl_error, abs_angular_error + + +def export_results(output_dir, xp_label, query_names, poses_pred): + if output_dir is not None: + os.makedirs(output_dir, exist_ok=True) + + lines = "" + lines_ltvl = "" + for query_name, pr_querycam_to_world in zip(query_names, poses_pred): + if pr_querycam_to_world is None: + pr_world_to_querycam = np.eye(4) + else: + pr_world_to_querycam = np.linalg.inv(pr_querycam_to_world) + query_shortname = os.path.basename(query_name) + pr_world_to_querycam_q = quaternion.from_rotation_matrix(pr_world_to_querycam[:3, :3]) + pr_world_to_querycam_t = pr_world_to_querycam[:3, 3] + + line_pose = quaternion.as_float_array(pr_world_to_querycam_q).tolist() + \ + pr_world_to_querycam_t.flatten().tolist() + + line_content = [query_name] + line_pose + lines += ' '.join(str(v) for v in line_content) + '\n' + + line_content_ltvl = [query_shortname] + line_pose + lines_ltvl += ' '.join(str(v) for v in line_content_ltvl) + '\n' + + with open(os.path.join(output_dir, xp_label + '_results.txt'), 'wt') as f: + f.write(lines) + with open(os.path.join(output_dir, xp_label + '_ltvl.txt'), 'wt') as f: + f.write(lines_ltvl) diff --git a/dust3r/dust3r_visloc/localization.py b/dust3r/dust3r_visloc/localization.py new file mode 100644 index 0000000000000000000000000000000000000000..ac8ae198dc3479f12a976bab0bda692328880710 --- /dev/null +++ b/dust3r/dust3r_visloc/localization.py @@ -0,0 +1,140 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# main pnp code +# -------------------------------------------------------- +import numpy as np +import quaternion +import cv2 +from packaging import version + +from dust3r.utils.geometry import opencv_to_colmap_intrinsics + +try: + import poselib # noqa + HAS_POSELIB = True +except Exception as e: + HAS_POSELIB = False + +try: + import pycolmap # noqa + version_number = pycolmap.__version__ + if version.parse(version_number) < version.parse("0.5.0"): + HAS_PYCOLMAP = False + else: + HAS_PYCOLMAP = True +except Exception as e: + HAS_PYCOLMAP = False + +def run_pnp(pts2D, pts3D, K, distortion = None, mode='cv2', reprojectionError=5, img_size = None): + """ + use OPENCV model for distortion (4 values) + """ + assert mode in ['cv2', 'poselib', 'pycolmap'] + try: + if len(pts2D) > 4 and mode == "cv2": + confidence = 0.9999 + iterationsCount = 10_000 + if distortion is not None: + cv2_pts2ds = np.copy(pts2D) + cv2_pts2ds = cv2.undistortPoints(cv2_pts2ds, K, np.array(distortion), R=None, P=K) + pts2D = cv2_pts2ds.reshape((-1, 2)) + + success, r_pose, t_pose, _ = cv2.solvePnPRansac(pts3D, pts2D, K, None, flags=cv2.SOLVEPNP_SQPNP, + iterationsCount=iterationsCount, + reprojectionError=reprojectionError, + confidence=confidence) + if not success: + return False, None + r_pose = cv2.Rodrigues(r_pose)[0] # world2cam == world2cam2 + RT = np.r_[np.c_[r_pose, t_pose], [(0,0,0,1)]] # world2cam2 + return True, np.linalg.inv(RT) # cam2toworld + elif len(pts2D) > 4 and mode == "poselib": + assert HAS_POSELIB + confidence = 0.9999 + iterationsCount = 10_000 + # NOTE: `Camera` struct currently contains `width`/`height` fields, + # however these are not used anywhere in the code-base and are provided simply to be consistent with COLMAP. 
+ # so we put garbage in there + colmap_intrinsics = opencv_to_colmap_intrinsics(K) + fx = colmap_intrinsics[0, 0] + fy = colmap_intrinsics[1, 1] + cx = colmap_intrinsics[0, 2] + cy = colmap_intrinsics[1, 2] + width = img_size[0] if img_size is not None else int(cx*2) + height = img_size[1] if img_size is not None else int(cy*2) + + if distortion is None: + camera = {'model': 'PINHOLE', 'width': width, 'height': height, 'params': [fx, fy, cx, cy]} + else: + camera = {'model': 'OPENCV', 'width': width, 'height': height, + 'params': [fx, fy, cx, cy] + distortion} + + pts2D = np.copy(pts2D) + pts2D[:, 0] += 0.5 + pts2D[:, 1] += 0.5 + pose, _ = poselib.estimate_absolute_pose(pts2D, pts3D, camera, + {'max_reproj_error': reprojectionError, + 'max_iterations': iterationsCount, + 'success_prob': confidence}, {}) + if pose is None: + return False, None + RT = pose.Rt # (3x4) + RT = np.r_[RT, [(0,0,0,1)]] # world2cam + return True, np.linalg.inv(RT) # cam2toworld + elif len(pts2D) > 4 and mode == "pycolmap": + assert HAS_PYCOLMAP + assert img_size is not None + + pts2D = np.copy(pts2D) + pts2D[:, 0] += 0.5 + pts2D[:, 1] += 0.5 + colmap_intrinsics = opencv_to_colmap_intrinsics(K) + fx = colmap_intrinsics[0, 0] + fy = colmap_intrinsics[1, 1] + cx = colmap_intrinsics[0, 2] + cy = colmap_intrinsics[1, 2] + width = img_size[0] + height = img_size[1] + if distortion is None: + camera_dict = {'model': 'PINHOLE', 'width': width, 'height': height, 'params': [fx, fy, cx, cy]} + else: + camera_dict = {'model': 'OPENCV', 'width': width, 'height': height, + 'params': [fx, fy, cx, cy] + distortion} + + pycolmap_camera = pycolmap.Camera( + model=camera_dict['model'], width=camera_dict['width'], height=camera_dict['height'], + params=camera_dict['params']) + + pycolmap_estimation_options = dict(ransac=dict(max_error=reprojectionError, min_inlier_ratio=0.01, + min_num_trials=1000, max_num_trials=100000, + confidence=0.9999)) + pycolmap_refinement_options=dict(refine_focal_length=False, refine_extra_params=False) + ret = pycolmap.absolute_pose_estimation(pts2D, pts3D, pycolmap_camera, + estimation_options=pycolmap_estimation_options, + refinement_options=pycolmap_refinement_options) + if ret is None: + ret = {'success': False} + else: + ret['success'] = True + if callable(ret['cam_from_world'].matrix): + retmat = ret['cam_from_world'].matrix() + else: + retmat = ret['cam_from_world'].matrix + ret['qvec'] = quaternion.from_rotation_matrix(retmat[:3, :3]) + ret['tvec'] = retmat[:3, 3] + + if not (ret['success'] and ret['num_inliers'] > 0): + success = False + pose = None + else: + success = True + pr_world_to_querycam = np.r_[ret['cam_from_world'].matrix(), [(0,0,0,1)]] + pose = np.linalg.inv(pr_world_to_querycam) + return success, pose + else: + return False, None + except Exception as e: + print(f'error during pnp: {e}') + return False, None \ No newline at end of file diff --git a/dust3r/dust3r_visloc/setup.py b/dust3r/dust3r_visloc/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..8c4b883c4157845648f4c053510f83f4a173a4ed --- /dev/null +++ b/dust3r/dust3r_visloc/setup.py @@ -0,0 +1,32 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
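+# Installs the dust3r_visloc package. If a dust3r checkout is present in the
+# parent directory it is installed from that local path, otherwise it is
+# fetched from GitHub (see the dust3r_dep selection below).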
+import os +from setuptools import setup, find_packages + +ROOT = os.path.abspath(os.path.dirname(__file__)) +ROOT = os.path.normpath(os.path.join(ROOT, "../")) +LOCAL_DUST3R = os.path.isdir(os.path.join(ROOT, "dust3r")) + +if LOCAL_DUST3R: + # Use a file:// URL to install dust3r + dust3r_dep = f"dust3r @ file://{ROOT}" +else: + # Fallback to fetching dust3r from URL + dust3r_dep = ( + "dust3r @ git+https://github.com/naver/dust3r.git@dust3r_setup" + ) + + +setup( + name="dust3r_visloc", + version="1.0.0", + packages=find_packages(include=["dust3r_visloc", "dust3r_visloc.*"]), + install_requires=[ + dust3r_dep, + 'kapture', + 'kapture-localization', + 'numpy-quaternion', + 'pycolmap', + 'poselib', + ], + python_requires=">=3.11", +) diff --git a/dust3r/requirements.txt b/dust3r/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2bf20ed439b43b0604f12985288d8b8d6b55f8f --- /dev/null +++ b/dust3r/requirements.txt @@ -0,0 +1,13 @@ +torch +torchvision +roma +gradio +matplotlib +tqdm +opencv-python +scipy +einops +trimesh +tensorboard +pyglet<2 +huggingface-hub[torch]>=0.22 \ No newline at end of file diff --git a/dust3r/requirements_optional.txt b/dust3r/requirements_optional.txt new file mode 100644 index 0000000000000000000000000000000000000000..d42662c0e87c6ce4ac990f2afedecc96cdea7f06 --- /dev/null +++ b/dust3r/requirements_optional.txt @@ -0,0 +1,7 @@ +pillow-heif # add heif/heic image support +pyrender # for rendering depths in scannetpp +kapture # for visloc data loading +kapture-localization +numpy-quaternion +pycolmap # for pnp +poselib # for pnp diff --git a/dust3r/setup.py b/dust3r/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc5ed2a1e84614f6143ca1ffc72565039cc1330 --- /dev/null +++ b/dust3r/setup.py @@ -0,0 +1,48 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import os +from setuptools import setup, find_packages + +curope_dep = ['curope @ git+https://github.com/naver/croco.git@croco_module#subdirectory=curope'] +optional_dep = [ + 'pillow-heif', + 'pyrender', +] + +setup( + name="dust3r", + version="1.0.0", + packages=find_packages(include=["dust3r", "dust3r.*"]), + install_requires=[ + 'torch', + 'torchvision', + 'matplotlib', + 'scikit-learn', + 'tqdm', + 'numpy', + 'numpy-quaternion', + 'opencv-python', + 'einops', + 'tensorboard', + 'h5py', + 'pillow', + 'roma', + 'gradio', + 'scipy', + 'trimesh', + 'pyglet<2', + 'huggingface-hub[torch]>=0.22', + 'croco @ git+https://github.com/naver/croco.git@croco_module#egg=croco' + ], + python_requires=">=3.11", + extras_require={ + "curope": curope_dep, + "optional": optional_dep, + "all": curope_dep + optional_dep + + }, + entry_points={ + 'console_scripts': [ + 'dust3r_demo=dust3r.demo:main' + ] + } +) diff --git a/dust3r/train.py b/dust3r/train.py new file mode 100644 index 0000000000000000000000000000000000000000..503e63572376c259e6b259850e19c3f6036aa535 --- /dev/null +++ b/dust3r/train.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+# +# -------------------------------------------------------- +# training executable for DUSt3R +# -------------------------------------------------------- +from dust3r.training import get_args_parser, train + +if __name__ == '__main__': + args = get_args_parser() + args = args.parse_args() + train(args) diff --git a/dust3r/visloc.py b/dust3r/visloc.py new file mode 100644 index 0000000000000000000000000000000000000000..6411b3eaf96dea961f9524e887a12d92f2012c6b --- /dev/null +++ b/dust3r/visloc.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Simple visloc script +# -------------------------------------------------------- +import numpy as np +import random +import argparse +from tqdm import tqdm +import math + +from dust3r.inference import inference +from dust3r.model import AsymmetricCroCo3DStereo +from dust3r.utils.geometry import find_reciprocal_matches, xy_grid, geotrf + +from dust3r_visloc.datasets import * +from dust3r_visloc.localization import run_pnp +from dust3r_visloc.evaluation import get_pose_error, aggregate_stats, export_results + + +def get_args_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--dataset", type=str, required=True, help="visloc dataset to eval") + parser_weights = parser.add_mutually_exclusive_group(required=True) + parser_weights.add_argument("--weights", type=str, help="path to the model weights", default=None) + parser_weights.add_argument("--model_name", type=str, help="name of the model weights", + choices=["DUSt3R_ViTLarge_BaseDecoder_512_dpt", + "DUSt3R_ViTLarge_BaseDecoder_512_linear", + "DUSt3R_ViTLarge_BaseDecoder_224_linear"]) + parser.add_argument("--confidence_threshold", type=float, default=3.0, + help="confidence values higher than threshold are invalid") + parser.add_argument("--device", type=str, default='cuda', help="pytorch device") + parser.add_argument("--pnp_mode", type=str, default="cv2", choices=['cv2', 'poselib', 'pycolmap'], + help="pnp lib to use") + parser_reproj = parser.add_mutually_exclusive_group() + parser_reproj.add_argument("--reprojection_error", type=float, default=5.0, help="pnp reprojection error") + parser_reproj.add_argument("--reprojection_error_diag_ratio", type=float, default=None, + help="pnp reprojection error as a ratio of the diagonal of the image") + + parser.add_argument("--pnp_max_points", type=int, default=100_000, help="pnp maximum number of points kept") + parser.add_argument("--viz_matches", type=int, default=0, help="debug matches") + + parser.add_argument("--output_dir", type=str, default=None, help="output path") + parser.add_argument("--output_label", type=str, default='', help="prefix for results files") + return parser + + +if __name__ == '__main__': + parser = get_args_parser() + args = parser.parse_args() + conf_thr = args.confidence_threshold + device = args.device + pnp_mode = args.pnp_mode + reprojection_error = args.reprojection_error + reprojection_error_diag_ratio = args.reprojection_error_diag_ratio + pnp_max_points = args.pnp_max_points + viz_matches = args.viz_matches + + if args.weights is not None: + weights_path = args.weights + else: + weights_path = "naver/" + args.model_name + model = AsymmetricCroCo3DStereo.from_pretrained(weights_path).to(args.device) + + dataset = eval(args.dataset) + dataset.set_resolution(model) + + query_names = [] + poses_pred = [] + pose_errors = 
[] + angular_errors = [] + for idx in tqdm(range(len(dataset))): + views = dataset[(idx)] # 0 is the query + query_view = views[0] + map_views = views[1:] + query_names.append(query_view['image_name']) + + query_pts2d = [] + query_pts3d = [] + for map_view in map_views: + # prepare batch + imgs = [] + for idx, img in enumerate([query_view['rgb_rescaled'], map_view['rgb_rescaled']]): + imgs.append(dict(img=img.unsqueeze(0), true_shape=np.int32([img.shape[1:]]), + idx=idx, instance=str(idx))) + output = inference([tuple(imgs)], model, device, batch_size=1, verbose=False) + pred1, pred2 = output['pred1'], output['pred2'] + confidence_masks = [pred1['conf'].squeeze(0) >= conf_thr, + (pred2['conf'].squeeze(0) >= conf_thr) & map_view['valid_rescaled']] + pts3d = [pred1['pts3d'].squeeze(0), pred2['pts3d_in_other_view'].squeeze(0)] + + # find 2D-2D matches between the two images + pts2d_list, pts3d_list = [], [] + for i in range(2): + conf_i = confidence_masks[i].cpu().numpy() + true_shape_i = imgs[i]['true_shape'][0] + pts2d_list.append(xy_grid(true_shape_i[1], true_shape_i[0])[conf_i]) + pts3d_list.append(pts3d[i].detach().cpu().numpy()[conf_i]) + + PQ, PM = pts3d_list[0], pts3d_list[1] + if len(PQ) == 0 or len(PM) == 0: + continue + reciprocal_in_PM, nnM_in_PQ, num_matches = find_reciprocal_matches(PQ, PM) + if viz_matches > 0: + print(f'found {num_matches} matches') + matches_im1 = pts2d_list[1][reciprocal_in_PM] + matches_im0 = pts2d_list[0][nnM_in_PQ][reciprocal_in_PM] + valid_pts3d = map_view['pts3d_rescaled'][matches_im1[:, 1], matches_im1[:, 0]] + + # from cv2 to colmap + matches_im0 = matches_im0.astype(np.float64) + matches_im1 = matches_im1.astype(np.float64) + matches_im0[:, 0] += 0.5 + matches_im0[:, 1] += 0.5 + matches_im1[:, 0] += 0.5 + matches_im1[:, 1] += 0.5 + # rescale coordinates + matches_im0 = geotrf(query_view['to_orig'], matches_im0, norm=True) + matches_im1 = geotrf(query_view['to_orig'], matches_im1, norm=True) + # from colmap back to cv2 + matches_im0[:, 0] -= 0.5 + matches_im0[:, 1] -= 0.5 + matches_im1[:, 0] -= 0.5 + matches_im1[:, 1] -= 0.5 + + # visualize a few matches + if viz_matches > 0: + viz_imgs = [np.array(query_view['rgb']), np.array(map_view['rgb'])] + from matplotlib import pyplot as pl + n_viz = viz_matches + match_idx_to_viz = np.round(np.linspace(0, num_matches - 1, n_viz)).astype(int) + viz_matches_im0, viz_matches_im1 = matches_im0[match_idx_to_viz], matches_im1[match_idx_to_viz] + + H0, W0, H1, W1 = *viz_imgs[0].shape[:2], *viz_imgs[1].shape[:2] + img0 = np.pad(viz_imgs[0], ((0, max(H1 - H0, 0)), (0, 0), (0, 0)), 'constant', constant_values=0) + img1 = np.pad(viz_imgs[1], ((0, max(H0 - H1, 0)), (0, 0), (0, 0)), 'constant', constant_values=0) + img = np.concatenate((img0, img1), axis=1) + pl.figure() + pl.imshow(img) + cmap = pl.get_cmap('jet') + for i in range(n_viz): + (x0, y0), (x1, y1) = viz_matches_im0[i].T, viz_matches_im1[i].T + pl.plot([x0, x1 + W0], [y0, y1], '-+', color=cmap(i / (n_viz - 1)), scalex=False, scaley=False) + pl.show(block=True) + + if len(valid_pts3d) == 0: + pass + else: + query_pts3d.append(valid_pts3d.cpu().numpy()) + query_pts2d.append(matches_im0) + + if len(query_pts2d) == 0: + success = False + pr_querycam_to_world = None + else: + query_pts2d = np.concatenate(query_pts2d, axis=0).astype(np.float32) + query_pts3d = np.concatenate(query_pts3d, axis=0) + if len(query_pts2d) > pnp_max_points: + idxs = random.sample(range(len(query_pts2d)), pnp_max_points) + query_pts3d = query_pts3d[idxs] + query_pts2d = query_pts2d[idxs] + 
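+            # Solve PnP-RANSAC on the aggregated 2D-3D correspondences; the reprojection
+            # error threshold is either an absolute pixel value or a ratio of the image diagonal.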
+            W, H = query_view['rgb'].size
+            if reprojection_error_diag_ratio is not None:
+                reprojection_error_img = reprojection_error_diag_ratio * math.sqrt(W**2 + H**2)
+            else:
+                reprojection_error_img = reprojection_error
+            success, pr_querycam_to_world = run_pnp(query_pts2d, query_pts3d,
+                                                    query_view['intrinsics'], query_view['distortion'],
+                                                    pnp_mode, reprojection_error_img, img_size=[W, H])
+
+        if not success:
+            abs_transl_error = float('inf')
+            abs_angular_error = float('inf')
+        else:
+            abs_transl_error, abs_angular_error = get_pose_error(pr_querycam_to_world, query_view['cam_to_world'])
+
+        pose_errors.append(abs_transl_error)
+        angular_errors.append(abs_angular_error)
+        poses_pred.append(pr_querycam_to_world)
+
+    xp_label = f'tol_conf_{conf_thr}'
+    if args.output_label:
+        xp_label = args.output_label + '_' + xp_label
+    if reprojection_error_diag_ratio is not None:
+        xp_label = xp_label + f'_reproj_diag_{reprojection_error_diag_ratio}'
+    else:
+        xp_label = xp_label + f'_reproj_err_{reprojection_error}'
+    export_results(args.output_dir, xp_label, query_names, poses_pred)
+    out_string = aggregate_stats(f'{args.dataset}', pose_errors, angular_errors)
+    print(out_string)
diff --git a/engine.py b/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..0042ab68f21fee95184e9bafb2ba716808dc0fd0
--- /dev/null
+++ b/engine.py
@@ -0,0 +1,444 @@
+import spaces
+from dust3r.models.blocks import PositionGetter
+from dust3r.post_process import estimate_focal_knowing_depth
+from must3r.model.blocks.attention import has_xformers, toggle_memory_efficient_attention
+toggle_memory_efficient_attention(enabled = has_xformers)
+from hydra import compose
+from hydra.utils import instantiate
+from sam2.build_sam import build_sam2_video_predictor
+from einops import rearrange, repeat
+from collections import OrderedDict
+import copy
+import torch
+from tqdm import tqdm
+import json
+from sam2.modeling.sam2_base import NO_OBJ_SCORE, SAM2Base
+from sam2.utils.misc import concat_points, fill_holes_in_mask_scores, load_video_frames
+from torchvision.transforms import functional as TF
+import torchvision.transforms as T
+import numpy as np
+from torch import nn
+from training_utils import load_checkpoint, BatchedVideoDatapoint, positional_encoding, postprocess_must3r_output
+from sam2.modeling.sam2_utils import LayerNorm2d
+
+
+def get_views(pil_imgs):
+    from data import load_images
+    must3r_size = 512
+    # pil_imgs: a list of PIL Images
+    views, resize_funcs = load_images(pil_imgs, size = must3r_size, patch_size = 16)
+    return views, resize_funcs
+
+def prepare_sam2_inputs(views, pil_imgs):
+    image_transform = T.Compose([
+        T.Resize((1024, 1024), interpolation = T.InterpolationMode.BILINEAR),
+        T.Normalize(mean = (0.485, 0.456, 0.406), std = (0.229, 0.224, 0.225))
+    ])
+    images = resize_funcs[0].transforms[0](torch.stack([TF.to_tensor(p) for p in pil_imgs], dim = 0).cpu())
+    sam2_input_images = image_transform(images) # resize to 1024x1024 and normalize with ImageNet stats
+    return sam2_input_images, images
+
+def must3r_features_and_output(views, device = 'cuda'):
+
+    import functools
+    from must3r.model import load_model, get_pointmaps_activation
+    from must3r.demo.gradio import get_args_parser, main_demo, get_reconstructed_scene
+    from must3r.demo.inference import must3r_inference_video, slam_is_keyframe, slam_update_scene_state, must3r_inference
+    from must3r.slam.model import get_searcher
+    from must3r.model import ActivationType
+    from must3r.demo.inference import get_pointmaps_activation
+ from must3r.tools.geometry import apply_exp_to_norm + + cmd_params = ["--weights", "/home/user/app/private/MUSt3R_512.pth", "--retrieval", "/home/user/app/private/MUSt3R_512_retrieval_trainingfree.pth", "--image_size", "512", "--amp", "bf16", "--viser", "--allow_local_files", "--device", device] + parser = get_args_parser() + args = parser.parse_args(cmd_params) + weights_path = args.weights + model = load_model(weights_path, encoder=args.encoder, decoder=args.decoder, device=args.device, + img_size=args.image_size, memory_mode=args.memory_mode, verbose=args.verbose) + model = [m.eval() for m in model] + assert model[0].patch_size == 16 + assert get_pointmaps_activation(model[1]) == ActivationType.NORM_EXP + # model_224 = load_model("/home/user/app/private/MUSt3R_224.pth", encoder=args.encoder, decoder=args.decoder, device=args.device, + # img_size = 224, memory_mode=args.memory_mode, verbose=args.verbose) + # model_224 = [m.eval() for m in model_224] + assert get_pointmaps_activation(model_224[1]) == ActivationType.NORM_EXP + retrieval = "/home/user/app/private/MUSt3R_512_retrieval_trainingfree.pth" + retrieval_224 = "/home/user/app/private/MUSt3R_224_retrieval_trainingfree.pth" + verbose = False + image_size = 512 + image_size_224 = 224 + amp = "bf16" + amp_224 = "fp16" + max_bs = 1 + num_refinements_iterations = 0 + execution_mode = "vidslam" + num_mem_images = 0 + render_once = False + vidseq_local_context_size = 0 + keyframe_interval = 0 + slam_local_context_size = 0 + subsample = 2 + min_conf_keyframe = 1.5 + keyframe_overlap_thr = 0.05 + overlap_percentile = 85 + min_conf_thr = 3 + as_pointcloud = True + transparent_cams = False + local_pointmaps = False + cam_size = 0.05 + camera_conf_thr = 1.5 + local_context_size = slam_local_context_size + overlap_mode = "nn-norm" + + assert must3r_size == 512 + model[1].recorded_feats = [] + model[1].all_feats = [] + is_keyframe_function = functools.partial(slam_is_keyframe, subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile, overlap_mode) + scene_state = get_searcher("kdtree-scipy-quadrant_x2") + scene_state_update_function = functools.partial(slam_update_scene_state, subsample, min_conf_keyframe) + must3r_inference_video((model), device, image_size, amp, filelist = None, max_bs = max_bs, init_num_images = 2, batch_num_views = 1, + viser_server = None, num_refinements_iterations = num_refinements_iterations, + local_context_size = local_context_size, is_keyframe_function = is_keyframe_function, + scene_state = scene_state, scene_state_update_function = scene_state_update_function, + verbose = True, views = views) + must3r_feats = torch.cat(model[1].recorded_feats, dim = 0).to(device) + must3r_outputs = model[1]._compute_prediction_head( + torch.stack([torch.from_numpy(view['true_shape']).squeeze() for view in views]).to(device)[:, None], + len(views), + 1, + [must3r_feats], + norm = False + ).squeeze() + must3r_feats = [[f[0], f[4], f[7], f[11]] for f in model[1].all_feats] + must3r_feats = [torch.cat(f, dim = 0).to(device) for f in zip(*must3r_feats)] + from einops import rearrange + must3r_feats = [ + rearrange(f, 'b (h w) c -> b c h w', h = views[0]['true_shape'][0] // 16, w = views[0]['true_shape'][1] // 16).cpu() + for f in must3r_feats + ] + from training_utils import load_checkpoint, BatchedVideoDatapoint, positional_encoding, postprocess_must3r_output + from must3r.model import ActivationType, apply_activation + # must3r_outputs = postprocess_must3r_output(must3r_outputs, pointmaps_activation = 
ActivationType.NORM_EXP, compute_cam = True) + must3r_output_all = [] + for f in tqdm(must3r_outputs): + must3r_output_all.append(postprocess_must3r_output(f.cpu()[None], pointmaps_activation = ActivationType.NORM_EXP, compute_cam = True)) + must3r_outputs = {'pts3d': torch.cat([c['pts3d'] for c in must3r_output_all], dim = 0).squeeze(), + 'ray_plucker': torch.cat([c['ray_plucker'] for c in must3r_output_all], dim = 0).squeeze()} + must3r_outputs = {k: v.cpu() for k, v in must3r_outputs.items()} + + return must3r_feats, must3r_outputs + + +class FeatureFusion(nn.Module): + def __init__(self, cross_attn_blocks_3d, in_channels_2d = 256, in_channels_3d = 768): + super().__init__() + from einops.layers.torch import Rearrange + import copy + self.freqs = 6 + self.position_getter = PositionGetter() + self.feat_conv_3d_224 = nn.ModuleList([ + copy.deepcopy(block) for block in cross_attn_blocks_3d + ] + [nn.Linear(in_features = 1024, out_features = 768)]) + self.feat_conv_3d_512 = nn.ModuleList([ + copy.deepcopy(block) for block in cross_attn_blocks_3d + ] + [nn.Linear(in_features = 1024, out_features = 768)]) + self.out = nn.Conv2d(in_channels = 768, out_channels = in_channels_2d, kernel_size = 3, padding = 1) + self.merge = nn.Conv2d(in_channels = in_channels_2d * 2, out_channels = in_channels_2d, kernel_size = 1, padding = 0) + self.explicit_3d_embedding = nn.Conv2d(in_channels = 3 * (2 * self.freqs + 1) + 6, out_channels = 768, kernel_size = 16, padding = 0, stride = 16) + + def forward(self, feat_2d, feat_3d, explicit_3d = None, must3r_size = 512): + refinenets_3d = self.feat_conv_3d_224 if must3r_size == 224 else self.feat_conv_3d_512 + assert len(feat_3d) == 4, f'Expected 4 levels of 3D features, got {len(feat_3d)}' + explicit_3d = torch.cat((positional_encoding(explicit_3d[:, :3], self.freqs, dim = 1), explicit_3d[:, 3:]), dim = 1) + explicit_3d = self.explicit_3d_embedding(explicit_3d) + pe_3d = rearrange(explicit_3d, 'b c h w -> b (h w) c') + B = pe_3d.shape[0] + assert B == 1 + pe_2d = self.position_getter(B, explicit_3d.shape[2], explicit_3d.shape[3], device = explicit_3d.device) + feat_3d = [rearrange(f, 'b c h w -> b (h w) c') for f in feat_3d] + N = feat_3d[0].shape[1] + ca_attn_mask = torch.ones((B, 1, N, N * B), dtype = torch.bool, device = feat_3d[0].device) + for i in range(B): + ca_attn_mask[i, :, :, :(i + 1) * N] = False + feat_3d_post = feat_3d[0] + for i in range(len(feat_3d)): + if i == 0: + feat_3d_post = refinenets_3d[-1](feat_3d_post) + pe_3d + feat_3d_post = refinenets_3d[i](x = feat_3d_post, y = feat_3d_post, xpos = pe_2d) + else: + feat_3d_post = refinenets_3d[i](x = feat_3d_post + pe_3d, y = repeat(feat_3d[i] + pe_3d, 'b n c -> k (b n) c', k = B), xpos = pe_2d, ca_attn_mask = ca_attn_mask) + feat_3d_post = self.out(F.interpolate(rearrange(feat_3d_post, 'b (h w) c -> b c h w', b = B, h = explicit_3d.shape[2], w = explicit_3d.shape[3]), size = feat_2d.shape[-2:], mode = 'bilinear', align_corners = False)) + feat_merged = self.merge(torch.cat([feat_3d_post, feat_2d], dim = 1)) + return feat_merged + +def get_must3r_cross_attn_layers(device = 'cuda'): + from must3r.model import load_model + from must3r.demo.gradio import get_args_parser + cmd_params = ["--weights", "/home/user/app/private/MUSt3R_512.pth", "--retrieval", "/home/user/app/private/MUSt3R_512_retrieval_trainingfree.pth", "--image_size", "512", "--amp", "bf16", "--viser", "--allow_local_files", "--device", device] + parser = get_args_parser() + args = parser.parse_args(cmd_params) + model = 
load_model(args.weights, encoder=args.encoder, decoder=args.decoder, device=args.device, + img_size=args.image_size, memory_mode=args.memory_mode, verbose=args.verbose) + return model[1].blocks_dec + +def get_predictors(device = 'cuda'): + predictor_original = build_sam2_video_predictor("configs/sam2.1/sam2.1_hiera_l.yaml", "/home/user/app/sam2-src/checkpoints/sam2.1_hiera_large.pt").to(device).eval() + predictor = build_sam2_video_predictor("configs/sam2.1/sam2.1_hiera_l_3d.yaml").to(device).eval() + cross_attn_blocks_3d = get_must3r_cross_attn_layers(device = device) + predictor.fusion_3d = FeatureFusion(cross_attn_blocks_3d = [copy.deepcopy(cross_attn_blocks_3d[i]) for i in [0, 4, 7, 11]]) + predictor = load_checkpoint(predictor, torch.load('/home/user/app/private/sam2.1-must3r-fixed-vision-v1-decomp-standalone-regional-best-2.7851.pt', map_location = 'cpu')) + return predictor_original.cpu(), predictor.cpu() + + +def get_image_feature( + predictor: SAM2Base, + images: torch.Tensor, +): + backbone_out = predictor.forward_image(images) + backbone_out = { + "backbone_fpn": backbone_out["backbone_fpn"].copy(), + "vision_pos_enc": backbone_out["vision_pos_enc"].copy(), + } + backbone_out, vision_feats, vision_pos_embeds, feat_sizes = predictor._prepare_backbone_features(backbone_out) + return backbone_out, vision_feats, vision_pos_embeds, feat_sizes + +class Tracker(nn.Module): + + def __init__(self, predictor, predictor_original = None, device = 'cuda'): + super().__init__() + self.predictor = predictor.to(device) + self.predictor_original = predictor_original.to(device) if predictor_original is not None else None + self.device = device + + def init(self, images, processing_order, points = None, labels = None, mask_inputs = None, must3r_feats = None, explicit_3d = None, image_features = None): + + self.images = images + self.point_inputs = {"point_coords": points.to(self.device), "point_labels": labels.to(self.device)} if points is not None and labels is not None else None + self.mask_inputs = mask_inputs + self.processing_order = processing_order + self.output_dict = {'cond_frame_outputs': {}, 'non_cond_frame_outputs': {}} + self.pred_maskses = [] + self.num_frames = len(processing_order) + self.current_idx = 0 + self.image_features = image_features + self.must3r_feats = must3r_feats + self.explicit_3d = explicit_3d + + @torch.no_grad() + @torch.autocast(device_type = 'cuda', dtype = torch.bfloat16) + def step(self, mask_inputs = None, point_inputs = None): + assert (mask_inputs is None or self.current_idx > 0) and (point_inputs is None or self.current_idx > 0), f"mask_inputs: {mask_inputs}, point_inputs: {point_inputs}" + frame_idx = self.processing_order[self.current_idx] + ( + _, + current_vision_feats, + current_vision_pos_embeds, + feat_sizes, + ) = get_image_feature(self.predictor, images = self.images[:, frame_idx, :3].to(self.device)) + if self.must3r_feats is not None: + feat_2d_original = rearrange(current_vision_feats[-1], '(x y) b c -> b c x y', x = 64, y = 64) + _, feat_2d, _ = self.predictor.fusion_3d( + feat_2d = feat_2d_original, + feat_3d = [f[frame_idx].to(self.device).squeeze()[None] for f in self.must3r_feats], + explicit_3d = self.explicit_3d[frame_idx].to(self.device).squeeze()[None], + must3r_size = 224 if self.must3r_feats[0][frame_idx].shape[-1] == 14 and self.must3r_feats[0][frame_idx].shape[-2] == 14 else 512 + ) + current_vision_feats[-1] = rearrange(feat_2d, 'b c h w -> (h w) b c') + assert not torch.allclose(feat_2d.float(), feat_2d_original.float(), 1e-4), 
'Feature fusion did not change features' + self.current_vision_feats = current_vision_feats + self.feat_sizes = feat_sizes + memory_dict = { + 'cond_frame_outputs': self.output_dict['cond_frame_outputs'], + 'non_cond_frame_outputs': {k: v for k, v in self.output_dict['non_cond_frame_outputs'].items() if (v['pred_masks'] > 0).any()} | ({self.current_idx - 1: d} if (d := self.output_dict['non_cond_frame_outputs'].get(self.current_idx - 1)) else {}) + } + if len(memory_dict['non_cond_frame_outputs']) > 32: + memory_dict['non_cond_frame_outputs'] = {self.current_idx - i: v for i, (k, v) in enumerate(sorted(memory_dict['non_cond_frame_outputs'].items(), key = lambda x: abs(x[0] - self.current_idx))[:32])} + if len(memory_dict['cond_frame_outputs']) > 32: + memory_dict['cond_frame_outputs'] = {self.current_idx - i: v for i, (k, v) in enumerate(sorted(memory_dict['cond_frame_outputs'].items(), key = lambda x: abs(x[0] - self.current_idx))[:32])} + + current_out = self.predictor.track_step( + frame_idx = self.current_idx, + is_init_cond_frame = self.current_idx == 0, + current_vision_feats = current_vision_feats, + current_vision_pos_embeds = current_vision_pos_embeds, + feat_sizes = feat_sizes, + point_inputs = self.point_inputs if self.current_idx == 0 else point_inputs, + mask_inputs = self.mask_inputs.to(self.device) if self.current_idx == 0 else mask_inputs, + output_dict = memory_dict, + num_frames = self.num_frames, + track_in_reverse = False, + run_mem_encoder = False, + prev_sam_mask_logits = None, + ) + current_out["pred_masks"] = fill_holes_in_mask_scores( + current_out["pred_masks"], self.predictor.fill_hole_area + ) + current_out["pred_masks_high_res"] = torch.nn.functional.interpolate( + current_out["pred_masks"], + size = (self.predictor.image_size, self.predictor.image_size), + mode = "bilinear", + align_corners = False, + ) + + # if self.predictor_original is not None and self.current_idx != 0: + # current_out['pred_masks_high_res_lq'] = current_out['pred_masks_high_res'].clone() + # self.predictor_original.use_mask_input_as_output_without_sam = False + # current_vision_feats_original = current_vision_feats.copy() + # current_vision_feats_original[-1] = rearrange(feat_2d_original, 'b c h w -> (h w) b c') + # current_out_original = self.predictor_original.track_step( + # frame_idx = 0, + # is_init_cond_frame = True, + # current_vision_feats = current_vision_feats_original, + # current_vision_pos_embeds = current_vision_pos_embeds, + # feat_sizes = feat_sizes, + # point_inputs = None, + # mask_inputs = current_out["pred_masks_high_res"].to(self.device).squeeze()[None, None], + # output_dict = {}, + # num_frames = self.num_frames, + # track_in_reverse = False, + # run_mem_encoder = False, + # prev_sam_mask_logits = None, + # ) + # # if (current_out['pred_masks_high_res'] > 0).sum() > 0: assert (current_out_original['pred_masks'] > 0).sum() > 0, 'Original predictor produced empty mask' + # current_out["pred_masks"] = fill_holes_in_mask_scores( + # current_out_original["pred_masks"], self.predictor.fill_hole_area + # ) + # current_out["pred_masks_high_res"] = torch.nn.functional.interpolate( + # current_out["pred_masks"], + # size = (self.predictor.image_size, self.predictor.image_size), + # mode = "bilinear", + # align_corners = False, + # ) + + return current_out + + @torch.no_grad() + @torch.autocast(device_type = 'cuda', dtype = torch.bfloat16) + def postprocess(self, current_out): + maskmem_features, maskmem_pos_enc = self.predictor._encode_new_memory( + current_vision_feats = 
self.current_vision_feats, + feat_sizes = self.feat_sizes, + pred_masks_high_res = current_out["pred_masks_high_res"], + object_score_logits = current_out['object_score_logits'], + is_mask_from_pts = False + ) + current_out["maskmem_features"] = maskmem_features.to(torch.bfloat16) + current_out["maskmem_pos_enc"] = maskmem_pos_enc + self.pred_maskses.append(current_out['pred_masks_high_res'].cpu()) + self.output_dict['cond_frame_outputs'if self.current_idx == 0 else 'non_cond_frame_outputs'][self.current_idx] = current_out + if len(self.output_dict['non_cond_frame_outputs']) > 256: + self.output_dict['non_cond_frame_outputs'] = {k: v for k, v in self.output_dict['non_cond_frame_outputs'].items() if k >= self.current_idx - 256} + if len(self.output_dict['cond_frame_outputs']) > 256: + self.output_dict['cond_frame_outputs'] = {k: v for k, v in self.output_dict['cond_frame_outputs'].items() if k >= self.current_idx - 256} + + self.current_idx += 1 + +@torch.no_grad() +@torch.autocast(device_type = 'cuda', dtype = torch.bfloat16) +def forward_original(predictor: SAM2Base, images, points = None, labels = None, mask_inputs = None, processing_order = None, mask_3d = None): + B, T, _, H, W = images.shape + point_inputs = {"point_coords": points, "point_labels": labels} if points is not None and labels is not None else None + assert (mask_inputs is None) ^ (point_inputs is None), f"mask_inputs: {mask_inputs}, point_inputs: {point_inputs}" + processing_order = list(range(images.shape[1])) if processing_order is None else processing_order + num_frames = len(processing_order) + pred_maskses = [] + ious = [] + output_dict = {'cond_frame_outputs': {}, 'non_cond_frame_outputs': {}} + assert mask_3d is None or mask_3d.shape[1] == len(processing_order) + for idx, frame_idx in enumerate(tqdm(processing_order)): + ( + _, + current_vision_feats, + current_vision_pos_embeds, + feat_sizes, + ) = get_image_feature(predictor, images = images[:, frame_idx, :3].to(device)) + memory_dict = {'cond_frame_outputs': output_dict['cond_frame_outputs'], 'non_cond_frame_outputs': {k: v for k, v in output_dict['non_cond_frame_outputs'].items() if (v['pred_masks'] > 0).any()}} + if len(memory_dict['non_cond_frame_outputs']) > 0: + memory_dict['non_cond_frame_outputs'] = {idx - i: v for i, (k, v) in enumerate(sorted(memory_dict['non_cond_frame_outputs'].items(), key = lambda x: abs(x[0] - idx))[:24])} + current_out = predictor.track_step( + frame_idx = idx, + is_init_cond_frame = idx == 0, + current_vision_feats = current_vision_feats, + current_vision_pos_embeds = current_vision_pos_embeds, + feat_sizes = feat_sizes, + point_inputs = point_inputs if idx == 0 else None, + mask_inputs = mask_inputs if idx == 0 else None, + output_dict = memory_dict, + num_frames = num_frames, + track_in_reverse = False, + run_mem_encoder = False, + prev_sam_mask_logits = None, + ) + current_out['ppred_masks_high_res_lq'] = current_out['pred_masks_high_res'] + current_out["pred_masks"] = fill_holes_in_mask_scores( + current_out["pred_masks"], predictor.fill_hole_area + ) + current_out["pred_masks_high_res"] = torch.nn.functional.interpolate( + current_out["pred_masks"], + size = (predictor.image_size, predictor.image_size), + mode = "bilinear", + align_corners = False, + ) + maskmem_features, maskmem_pos_enc = predictor._encode_new_memory( + current_vision_feats = current_vision_feats, + feat_sizes = feat_sizes, + pred_masks_high_res = current_out["pred_masks_high_res"], + object_score_logits = current_out['object_score_logits'], + 
is_mask_from_pts = True
+        )
+        current_out["maskmem_features"] = maskmem_features.to(torch.bfloat16)
+        current_out["maskmem_pos_enc"] = maskmem_pos_enc
+
+        pred_maskses.append(current_out['pred_masks_high_res'].cpu())
+        output_dict['cond_frame_outputs' if idx == 0 else 'non_cond_frame_outputs'][idx] = current_out
+        if len(output_dict['non_cond_frame_outputs']) > 256:
+            output_dict['non_cond_frame_outputs'] = {k: v for k, v in output_dict['non_cond_frame_outputs'].items() if k >= idx - 256}
+    pred_maskses = torch.stack(pred_maskses, dim = 1).squeeze(2) # (B, T, H, W)
+    assert pred_maskses.shape == (B, len(processing_order), H, W)
+    return pred_maskses
+
+@spaces.GPU
+def get_single_frame_mask(image: torch.Tensor, predictor_original, points, labels, device = 'cuda'):
+    '''
+    points: 1 x N x 2
+    labels: 1 x N (positive 1, negative 0, box (top left 2, low right 3))
+    '''
+    return forward_original(
+        predictor_original.to(device),
+        images = image.squeeze()[None, None],
+        points = points,
+        labels = labels,
+        processing_order = [0],
+        mask_3d = None
+    )
+
+@spaces.GPU
+def get_tracked_masks(sam2_input_images, must3r_feats, must3r_outputs, start_idx, first_frame_mask, predictor, predictor_original, device = 'cuda'):
+    tracker = Tracker(predictor, predictor_original = predictor_original, device = device)
+    # track forward, from the annotated frame to the end of the sequence
+    tracker.init(
+        images = sam2_input_images.squeeze()[None],
+        processing_order = range(start_idx, sam2_input_images.shape[0]),
+        mask_inputs = first_frame_mask.squeeze()[None, None] > 0,
+        must3r_feats = must3r_feats,
+        explicit_3d = torch.cat((must3r_outputs['pts3d'], must3r_outputs['ray_plucker']), dim = -1).permute(0, 3, 1, 2)
+    )
+    output_masks = {}
+    for idx, frame_idx in enumerate(tqdm(tracker.processing_order)):
+        current_out = tracker.step()
+        output_masks[frame_idx] = current_out['pred_masks_high_res'].squeeze().cpu().numpy() > 0
+        tracker.postprocess(current_out)
+
+    # track backward, from the annotated frame to the start of the sequence
+    tracker.init(
+        images = sam2_input_images.squeeze()[None],
+        processing_order = range(start_idx, -1, -1),
+        mask_inputs = first_frame_mask.squeeze()[None, None] > 0,
+        must3r_feats = must3r_feats,
+        explicit_3d = torch.cat((must3r_outputs['pts3d'], must3r_outputs['ray_plucker']), dim = -1).permute(0, 3, 1, 2)
+    )
+
+    for idx, frame_idx in enumerate(tqdm(tracker.processing_order)):
+        current_out = tracker.step()
+        output_masks[frame_idx] = current_out['pred_masks_high_res'].squeeze().cpu().numpy() > 0
+        tracker.postprocess(current_out)
+
+    return output_masks
\ No newline at end of file
diff --git a/install.sh b/install.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e418faa050bfdb372bd42e5e24753ddf007aff99
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,23 @@
+cd ./sam2-src
+    cd checkpoints &&
+    ./download_ckpts.sh &&
+    cd ..
+    pip install -e . &&
+    pip install -e ".[notebooks]" &&
+cd ..
+pip3 install -U xformers &&
+pip install -r dust3r/requirements.txt &&
+pip install -r dust3r/requirements_optional.txt &&
+pip install -r requirements.txt &&
+pip install faiss-cpu &&
+mkdir build
+cd build
+    git clone https://github.com/jenicek/asmk.git &&
+    cd asmk/
+    cd cython/
+    cythonize *.pyx &&
+    cd ..
+    pip install . --no-build-isolation &&
+    cd ..
+cd ..
+pip install tensordict \ No newline at end of file diff --git a/must3r-scripts b/must3r-scripts new file mode 160000 index 0000000000000000000000000000000000000000..6e2b1eceb7e1fe56712e9130c88a32aba94adca3 --- /dev/null +++ b/must3r-scripts @@ -0,0 +1 @@ +Subproject commit 6e2b1eceb7e1fe56712e9130c88a32aba94adca3 diff --git a/must3r/__init__.py b/must3r/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..345ede2d5437f28f4929911c3614d5fe2caf828e --- /dev/null +++ b/must3r/__init__.py @@ -0,0 +1 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. diff --git a/must3r/datasets/__init__.py b/must3r/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..565731351ba14fc9bd682a6d0288d707f90439fd --- /dev/null +++ b/must3r/datasets/__init__.py @@ -0,0 +1,42 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import must3r.tools.path_to_dust3r # noqa +from dust3r.datasets.utils.transforms import * +from .co3d import Co3d +from .scannetpp import ScanNetpp + + +def get_data_loader(dataset, batch_size, num_workers=8, shuffle=True, drop_last=True, pin_mem=True): + import torch + from croco.utils.misc import get_world_size, get_rank + + # pytorch dataset + if isinstance(dataset, str): + dataset = eval(dataset) + + world_size = get_world_size() + rank = get_rank() + + try: + sampler = dataset.make_sampler(batch_size, shuffle=shuffle, world_size=world_size, + rank=rank, drop_last=drop_last) + except (AttributeError, NotImplementedError): + # not avail for this dataset + if torch.distributed.is_initialized(): + sampler = torch.utils.data.DistributedSampler( + dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, drop_last=drop_last + ) + elif shuffle: + sampler = torch.utils.data.RandomSampler(dataset) + else: + sampler = torch.utils.data.SequentialSampler(dataset) + + data_loader = torch.utils.data.DataLoader( + dataset, + sampler=sampler, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=pin_mem, + drop_last=drop_last, + ) + + return data_loader diff --git a/must3r/datasets/base/__init__.py b/must3r/datasets/base/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..345ede2d5437f28f4929911c3614d5fe2caf828e --- /dev/null +++ b/must3r/datasets/base/__init__.py @@ -0,0 +1 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. diff --git a/must3r/datasets/base/must3r_base_dataset.py b/must3r/datasets/base/must3r_base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2853b6edde1cf7565c57c26c988be47288223094 --- /dev/null +++ b/must3r/datasets/base/must3r_base_dataset.py @@ -0,0 +1,206 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
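+#
+# --------------------------------------------------------
+# MUSt3R base dataset: a batched random sampler that also draws a per-batch
+# number of memory views, EasyDataset wrappers, and the base multi-view dataset
+# --------------------------------------------------------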
+import numpy as np +import torch + +import must3r.tools.path_to_dust3r # noqa +from dust3r.datasets.base.base_stereo_view_dataset import (BaseStereoViewDataset, is_good_type, transpose_to_landscape, + view_name) +from dust3r.datasets.base.easy_dataset import EasyDataset, CatDataset, MulDataset, ResizedDataset +from dust3r.datasets.base.batched_sampler import BatchedRandomSampler as DUSt3R_BatchedRandomSampler +from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates + + +class BatchedRandomSampler(DUSt3R_BatchedRandomSampler): + + def __iter__(self): + # prepare RNG + if self.epoch is None: + assert self.world_size == 1 and self.rank == 0, 'use set_epoch() if distributed mode is used' + seed = int(torch.empty((), dtype=torch.int64).random_().item()) + else: + seed = self.epoch + 777 + rng = np.random.default_rng(seed=seed) + + # random indices (will restart from 0 if not drop_last) + sample_idxs = np.arange(self.total_size) + rng.shuffle(sample_idxs) + + # random feat_idxs (same across each batch) + pool_size = self.pool_size if isinstance(self.pool_size, list) else [self.pool_size] + idxs = [] + for pool_size in pool_size: + n_batches = (self.total_size + self.batch_size - 1) // self.batch_size + if isinstance(pool_size, tuple): + feat_idxs = rng.integers(*pool_size, size=n_batches) + else: + feat_idxs = rng.integers(pool_size, size=n_batches) + feat_idxs = np.broadcast_to(feat_idxs[:, None], (n_batches, self.batch_size)) + feat_idxs = feat_idxs.ravel()[:self.total_size] + idxs.append(feat_idxs) + + # put them together + idxs = np.c_[sample_idxs, *idxs] # shape = (total_size, n_feats) + + # Distributed sampler: we select a subset of batches + # make sure the slice for each node is aligned with batch_size + size_per_proc = self.batch_size * ((self.total_size + self.world_size * + self.batch_size - 1) // (self.world_size * self.batch_size)) + idxs = idxs[self.rank * size_per_proc: (self.rank + 1) * size_per_proc] + + yield from (tuple(idx) for idx in idxs) + + +class EasyDataset_MUSt3R(EasyDataset): + def __add__(self, other): + return CatDataset_MUSt3R([self, other]) + + def __rmul__(self, factor): + return MulDataset_MUSt3R(factor, self) + + def __rmatmul__(self, factor): + return ResizedDataset_MUSt3R(factor, self) + + def make_sampler(self, batch_size, shuffle=True, world_size=1, rank=0, drop_last=True): + if not (shuffle): + raise NotImplementedError() # cannot deal yet + num_of_aspect_ratios = len(self._resolutions) + min_memory_num_views = self.min_memory_num_views + max_memory_num_views = self.max_memory_num_views + return BatchedRandomSampler(self, batch_size, [num_of_aspect_ratios, (min_memory_num_views, max_memory_num_views + 1)], world_size=world_size, rank=rank, drop_last=drop_last) + + +class CatDataset_MUSt3R(CatDataset, EasyDataset_MUSt3R): + + @property + def min_memory_num_views(self): + return self.datasets[0].min_memory_num_views + + @property + def max_memory_num_views(self): + return self.datasets[0].max_memory_num_views + + def __getitem__(self, idx): + other = None + if isinstance(idx, tuple): + other = idx[1:] + idx = idx[0] + + if not (0 <= idx < len(self)): + raise IndexError() + + db_idx = np.searchsorted(self._cum_sizes, idx, 'right') + dataset = self.datasets[db_idx] + new_idx = idx - (self._cum_sizes[db_idx - 1] if db_idx > 0 else 0) + + if other is not None: + new_idx = (new_idx, *other) + return dataset[new_idx] + + +class MulDataset_MUSt3R(MulDataset, EasyDataset_MUSt3R): + + @property + def min_memory_num_views(self): + return 
self.dataset.min_memory_num_views + + @property + def max_memory_num_views(self): + return self.dataset.max_memory_num_views + + def __getitem__(self, idx): + if isinstance(idx, tuple): + return self.dataset[idx[0] // self.multiplicator, *idx[1:]] + else: + return self.dataset[idx // self.multiplicator] + + +class ResizedDataset_MUSt3R(ResizedDataset, EasyDataset_MUSt3R): + + @property + def min_memory_num_views(self): + return self.dataset.min_memory_num_views + + @property + def max_memory_num_views(self): + return self.dataset.max_memory_num_views + + def __getitem__(self, idx): + assert hasattr(self, '_idxs_mapping'), 'You need to call dataset.set_epoch() to use ResizedDataset.__getitem__()' + if isinstance(idx, tuple): + return self.dataset[self._idxs_mapping[idx[0]], *idx[1:]] + else: + return self.dataset[self._idxs_mapping[idx]] + + +class MUSt3RBaseDataset(BaseStereoViewDataset, EasyDataset_MUSt3R): + def __init__(self, *args, num_views, min_memory_num_views, max_memory_num_views, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.is_metric_scale = False # by default a dataset is not metric scale, subclasses can overwrite this + self.num_views = num_views + self.min_memory_num_views = min_memory_num_views + self.max_memory_num_views = max_memory_num_views + + def _get_views(self, idx, resolution, memory_num_views, rng): + raise NotImplementedError() + + def __getitem__(self, idx): + if isinstance(idx, tuple): + # the idx is specifying the aspect-ratio + idx, ar_idx, memory_num_views = idx + else: + assert len(self._resolutions) == 1 + ar_idx = 0 + memory_num_views = self.num_views + + # set-up the rng + if self.seed: # reseed for each __getitem__ + self._rng = np.random.default_rng(seed=self.seed + idx) + elif not hasattr(self, '_rng'): + seed = torch.initial_seed() # this is different for each dataloader process + self._rng = np.random.default_rng(seed=seed) + + # over-loaded code + resolution = self._resolutions[ar_idx] # DO NOT CHANGE THIS (compatible with BatchedRandomSampler) + views = self._get_views(idx, resolution, memory_num_views, self._rng) + assert len(views) == self.num_views + + # check data-types + for v, view in enumerate(views): + assert 'pts3d' not in view, f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}" + view['idx'] = (idx, ar_idx, v) + + # encode the image + width, height = view['img'].size + view['true_shape'] = np.int32((height, width)) + view['img'] = self.transform(view['img']) + + assert 'camera_intrinsics' in view + if 'camera_pose' not in view: + view['camera_pose'] = np.full((4, 4), np.nan, dtype=np.float32) + else: + assert np.isfinite(view['camera_pose']).all(), f'NaN in camera pose for view {view_name(view)}' + assert 'pts3d' not in view + assert 'valid_mask' not in view + assert np.isfinite(view['depthmap']).all(), f'NaN in depthmap for view {view_name(view)}' + pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view) + + view['pts3d'] = pts3d + view['valid_mask'] = valid_mask & np.isfinite(pts3d).all(axis=-1) + + # check all datatypes + for key, val in view.items(): + res, err_msg = is_good_type(key, val) + assert res, f"{err_msg} with {key}={val} for view {view_name(view)}" + K = view['camera_intrinsics'] + view['memory_num_views'] = memory_num_views + view['is_metric_scale'] = self.is_metric_scale + # Pixels for which depth is fundamentally undefined + view['sky_mask'] = (view['depthmap'] < 0) + + # last thing done! 
+ for view in views: + # transpose to make sure all views are the same size + transpose_to_landscape(view) + # this allows to check whether the RNG is is the same state each time + view['rng'] = int.from_bytes(self._rng.bytes(4), 'big') + return views diff --git a/must3r/datasets/base/random.py b/must3r/datasets/base/random.py new file mode 100644 index 0000000000000000000000000000000000000000..be7c5bf7ac29dd5c0dd6fa0369088bd584b3b729 --- /dev/null +++ b/must3r/datasets/base/random.py @@ -0,0 +1,7 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. + +def get_random_choice(rng, possibilities): + if len(possibilities) > 0: + return rng.choice(sorted(possibilities)) + else: + return None diff --git a/must3r/datasets/base/tuple_maker.py b/must3r/datasets/base/tuple_maker.py new file mode 100644 index 0000000000000000000000000000000000000000..a98213441c3a7916d0e74008c2ef8c57c0a906f0 --- /dev/null +++ b/must3r/datasets/base/tuple_maker.py @@ -0,0 +1,58 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import copy +from must3r.datasets.base.random import get_random_choice + + +def select_tuple_from_pairs(pairs_getter, view_getter, num_views, memory_num_views, rng, idx1, idx2): + selected_idx = [idx1, idx2] + selected_idx_set = set(selected_idx) + + possibilities = pairs_getter(idx1).union(pairs_getter(idx2)).difference(selected_idx_set) + + for _ in range(2, num_views): + if len(possibilities) == 0: + break + # pick a random value + new_idx = rng.choice(sorted(possibilities)) + assert new_idx not in selected_idx_set + selected_idx.append(new_idx) + selected_idx_set.add(new_idx) + if len(selected_idx) <= memory_num_views: + possibilities = possibilities.union(pairs_getter(new_idx)) + possibilities = possibilities.difference(selected_idx_set) + + views = [] + for view_idx in selected_idx: + views.append(view_getter(view_idx, rng)) + + return fill_views(views, num_views) + + +def select_tuple_from_360_scene(is_valid_getter, is_valid_check, view_getter, + nimg_per_scene, num_views, rng, idx): + views = [] + possibilities = set(range(nimg_per_scene)) + img_idx = idx + if img_idx not in possibilities: + img_idx = get_random_choice(rng, possibilities) + while len(views) < num_views and img_idx is not None: # some images (few) have zero depth + possibilities.remove(img_idx) + if not is_valid_getter(img_idx): # make sure that img_idx is valid + img_idx = get_random_choice(rng, possibilities) + continue + view = view_getter(img_idx, rng) # get the view + if not is_valid_check(view, img_idx): + img_idx = get_random_choice(rng, possibilities) + continue + views.append(view) + img_idx = get_random_choice(rng, possibilities) # select new idx for the next loop + return fill_views(views, num_views) + + +def fill_views(views, num_views): + if len(views) < num_views: + # somehow failed to add all views: there wasn't enough valid + while len(views) != num_views: + views = views + copy.deepcopy(views) + views = views[:num_views] + return views diff --git a/must3r/datasets/base/viz.py b/must3r/datasets/base/viz.py new file mode 100644 index 0000000000000000000000000000000000000000..8afda66a5d4f03416c1339828ea7c23aa8e9a73d --- /dev/null +++ b/must3r/datasets/base/viz.py @@ -0,0 +1,39 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
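+#
+# --------------------------------------------------------
+# visualization of dataset tuples (point clouds + memory/query cameras)
+# --------------------------------------------------------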
+import numpy as np +import must3r.tools.path_to_dust3r # noqa +from dust3r.datasets.base.base_stereo_view_dataset import view_name +from dust3r.viz import SceneViz, rgb, auto_cam_size + + +def viz_dataset(dataset, sampler=None): + iterable = np.random.permutation(len(dataset)) if sampler is None else sampler + for idx in iterable: + views = dataset[idx] + assert len(views) == dataset.num_views + unique_views = set() + for view_idx in range(dataset.num_views): + v_name = view_name(views[view_idx]) + print(v_name) + unique_views.add(v_name) + print(f'{len(unique_views)} unique views') + viz = SceneViz() + poses = [views[view_idx]['camera_pose'] for view_idx in range(dataset.num_views)] + cam_size = max(auto_cam_size(poses), 0.001) + memory_num_views = views[0]['memory_num_views'] + print(f'{memory_num_views} memory views') + for view_idx in range(dataset.num_views): + v_idx = view_idx / (dataset.num_views - 1) + pts3d = views[view_idx]['pts3d'] + valid_mask = views[view_idx]['valid_mask'] + colors = rgb(views[view_idx]['img']) + viz.add_pointcloud(pts3d, colors, valid_mask) + if view_idx < memory_num_views: + color = (0, 0, v_idx * 255) + else: + color = (v_idx * 255, (1 - v_idx) * 255, 0) + viz.add_camera(pose_c2w=views[view_idx]['camera_pose'], + focal=views[view_idx]['camera_intrinsics'][0, 0], + color=color, + image=colors, + cam_size=cam_size) + viz.show() diff --git a/must3r/datasets/co3d.py b/must3r/datasets/co3d.py new file mode 100644 index 0000000000000000000000000000000000000000..b80282e37ab63f37f37fd99a9aa73a2d9e585335 --- /dev/null +++ b/must3r/datasets/co3d.py @@ -0,0 +1,86 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import os.path as osp +import cv2 +import numpy as np + +from dust3r.utils.image import imread_cv2 + +from must3r.datasets.base.must3r_base_dataset import MUSt3RBaseDataset +from must3r.datasets.base.tuple_maker import select_tuple_from_360_scene + +import must3r.tools.path_to_dust3r # noqa +from dust3r.datasets.co3d import Co3d as DUSt3R_Co3d # noqa + + +class Co3d(DUSt3R_Co3d, MUSt3RBaseDataset): + def __init__(self, *args, num_views, **kwargs): + super().__init__(*args, **kwargs) + self.num_views = num_views + self.num_images_per_object = 100 + self.invalidate = {scene: [False for _ in range(self.num_images_per_object)] for scene in self.scene_list} + + def __len__(self): + return len(self.scenes) * self.num_images_per_object + + def _load_view(self, view_idx, obj, instance, resolution, rng, mask_bg): + impath = self._get_impath(obj, instance, view_idx) + depthpath = self._get_depthpath(obj, instance, view_idx) + + # load camera params + metadata_path = self._get_metadatapath(obj, instance, view_idx) + input_metadata = np.load(metadata_path) + camera_pose = input_metadata['camera_pose'].astype(np.float32) + intrinsics = input_metadata['camera_intrinsics'].astype(np.float32) + + # load image and depth + rgb_image = imread_cv2(impath) + depthmap = self._read_depthmap(depthpath, input_metadata) + + if mask_bg: + # load object mask + maskpath = self._get_maskpath(obj, instance, view_idx) + maskmap = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED).astype(np.float32) + maskmap = (maskmap / 255.0) > 0.1 + + # update the depthmap with mask + depthmap *= maskmap + + rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary( + rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath) + + view = dict( + img=rgb_image, + depthmap=depthmap, + camera_pose=camera_pose, + camera_intrinsics=intrinsics, + dataset=self.dataset_label, + 
label=osp.join(obj, instance), + instance=osp.split(impath)[1], + ) + return view + + def _get_views(self, idx, resolution, memory_num_views, rng): + # choose a scene + obj, instance = self.scene_list[idx // len(self.combinations)] + image_pool = self.scenes[obj, instance] + im1_idx = idx % self.num_images_per_object + + # decide now if we mask the bg + mask_bg = (self.mask_bg == True) or (self.mask_bg == 'rand' and rng.choice(2)) + nimg_per_scene = min(len(image_pool), self.num_images_per_object) + + def is_valid_getter(view_idx): + return view_idx < len(image_pool) and not self.invalidate[obj, instance][view_idx] + + def is_valid_check(view, view_idx): + view_valid = (view['depthmap'] > 0.0).sum() > 0 + if not view_valid: + # problem, invalidate image + self.invalidate[obj, instance][view_idx] = True + return view_valid + + def get_view(view_idx, rng): return self._load_view(image_pool[view_idx], obj, instance, resolution, rng, + mask_bg) + views = select_tuple_from_360_scene(is_valid_getter, is_valid_check, get_view, + nimg_per_scene, self.num_views, rng, im1_idx) + return views diff --git a/must3r/datasets/scannetpp.py b/must3r/datasets/scannetpp.py new file mode 100644 index 0000000000000000000000000000000000000000..c71000b7ff194ac77bb35d0b5d7bd45df9be79ef --- /dev/null +++ b/must3r/datasets/scannetpp.py @@ -0,0 +1,57 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import os.path as osp +import cv2 +import numpy as np + +from dust3r.utils.image import imread_cv2 + +from must3r.datasets.base.must3r_base_dataset import MUSt3RBaseDataset +from must3r.datasets.base.tuple_maker import select_tuple_from_pairs + +import must3r.tools.path_to_dust3r # noqa +from dust3r.datasets.scannetpp import ScanNetpp as DUSt3R_ScanNetpp # noqa + + +class ScanNetpp(DUSt3R_ScanNetpp, MUSt3RBaseDataset): + def __init__(self, *args, **kwargs): + super().__init__(*args, split='train', **kwargs) + self.is_metric_scale = True + self.pairs_per_image = [set() for _ in range(len(self.images))] + for idx1, idx2 in self.pairs: + self.pairs_per_image[idx1].add(idx2) + self.pairs_per_image[idx2].add(idx1) + + def _load_view(self, idx, view_idx, resolution, rng): + scene_id = self.sceneids[view_idx] + scene_dir = osp.join(self.ROOT, self.scenes[scene_id]) + + intrinsics = self.intrinsics[view_idx] + camera_pose = self.trajectories[view_idx] + basename = self.images[view_idx] + + # Load RGB image + rgb_image = imread_cv2(osp.join(scene_dir, 'images', basename + '.jpg')) + # Load depthmap + depthmap = imread_cv2(osp.join(scene_dir, 'depth', basename + '.png'), cv2.IMREAD_UNCHANGED) + depthmap = depthmap.astype(np.float32) / 1000 + depthmap[~np.isfinite(depthmap)] = 0 # invalid + + rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary( + rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx) + + return dict( + img=rgb_image, + depthmap=depthmap.astype(np.float32), + camera_pose=camera_pose.astype(np.float32), + camera_intrinsics=intrinsics.astype(np.float32), + dataset='ScanNet++', + label=self.scenes[scene_id] + '_' + basename, + instance=f'{str(idx)}_{str(view_idx)}', + ) + + def _get_views(self, idx, resolution, memory_num_views, rng): + idx1, idx2 = self.pairs[idx] + def get_pairs(view_idx): return self.pairs_per_image[view_idx] + def get_view(view_idx, rng): return self._load_view(idx, view_idx, resolution, rng) + views = select_tuple_from_pairs(get_pairs, get_view, self.num_views, memory_num_views, rng, idx1, idx2) + return views diff --git a/must3r/demo/__init__.py 
b/must3r/demo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..345ede2d5437f28f4929911c3614d5fe2caf828e --- /dev/null +++ b/must3r/demo/__init__.py @@ -0,0 +1 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. diff --git a/must3r/demo/gradio.py b/must3r/demo/gradio.py new file mode 100644 index 0000000000000000000000000000000000000000..324f31c0a0faadd7ea1e724ebc6b3cf5929dc460 --- /dev/null +++ b/must3r/demo/gradio.py @@ -0,0 +1,531 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +# +# -------------------------------------------------------- +# gradio demo +# -------------------------------------------------------- +import argparse +import gradio +import os +import torch +import numpy as np +import functools +import trimesh +import datetime +import tempfile +from scipy.spatial.transform import Rotation +import matplotlib.pyplot as pl + +from must3r.demo.viser import ViserWrapper +from must3r.demo.inference import * +from must3r.tools.image import is_valid_pil_image_file +from must3r.slam.model import get_searcher + +import must3r.tools.path_to_dust3r # noqa +from dust3r.utils.device import to_numpy +from dust3r.utils.geometry import geotrf +from dust3r.viz import add_scene_cam, CAM_COLORS, OPENGL, pts3d_to_trimesh, cat_meshes + +from must3r.model import * +from must3r.model.blocks.layers import MEMORY_MODES +from must3r.model.blocks.attention import has_xformers, toggle_memory_efficient_attention + +try: + from pillow_heif import register_heif_opener # noqa + register_heif_opener() +except ImportError: + pass + + +pl.ion() + + +def get_args_parser(): + parser = argparse.ArgumentParser() + + parser_url = parser.add_mutually_exclusive_group() + parser_url.add_argument("--local_network", action='store_true', default=False, + help="make app accessible on local network: address will be set to 0.0.0.0") + parser_url.add_argument("--server_name", type=str, default=None, help="server url, default is 127.0.0.1") + parser.add_argument("--image_size", type=int, default=512, choices=[512, 384, 224, 336, 448, 768], + help="image size: 224, 336, 448 are square images and others support multiple aspect ratios") + parser.add_argument("--server_port", type=int, help=("will start gradio app on this port (if available). 
" + "If None, will search for an available port starting at 7860."), + default=None) + parser.add_argument("--weights", type=str, help="path to the model weights", required=True) + + parser.add_argument("--encoder", type=str, default=None, help="encoder class instantiation") + parser.add_argument("--decoder", type=str, default=None, help="decoder class instantiation") + parser.add_argument("--memory_mode", type=str, default=None, choices=MEMORY_MODES, + help="decoder memory_mode override") + + parser.add_argument("--retrieval", type=str, help="path to the retrieval weights", default=None) + + parser.add_argument("--device", type=str, default='cuda', help="pytorch device") + parser.add_argument("--tmp_dir", type=str, default=None, help="value for tempfile.tempdir") + parser.add_argument('-q', '--silent', '--quiet', action='store_false', dest='verbose') + + parser.add_argument("--viser", action='store_true', default=False) + parser.add_argument('--amp', choices=[False, "bf16", "fp16"], default=False, + help="Use Automatic Mixed Precision, fp16 might be unstable") + parser.add_argument("--allow_local_files", action='store_true', default=False) + parser.add_argument("--embed_viser", action='store_true', default=False) + return parser + + +def _convert_scene_output_to_glb(outdir, imgs, pts3d, mask, focals, cams2world, + cam_size=0.05, cam_color=None, as_pointcloud=False, + transparent_cams=False, verbose=True, + filename='scene.glb', camera_mask=None): + assert len(pts3d) == len(mask) <= len(imgs) <= len(cams2world) == len(focals) + if camera_mask is not None: + assert len(imgs) == len(camera_mask) + pts3d = to_numpy(pts3d) + imgs = to_numpy(imgs) + focals = to_numpy(focals) + cams2world = to_numpy(cams2world) + + scene = trimesh.Scene() + + # full pointcloud + if as_pointcloud: + pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)]) + col = np.concatenate([p[m] for p, m in zip(imgs, mask)]) + pct = trimesh.PointCloud(pts.reshape(-1, 3), colors=col.reshape(-1, 3)) + scene.add_geometry(pct) + else: + meshes = [] + for i in range(len(imgs)): + meshes.append(pts3d_to_trimesh(imgs[i], pts3d[i], mask[i])) + mesh = trimesh.Trimesh(**cat_meshes(meshes)) + scene.add_geometry(mesh) + + # add each camera + for i, pose_c2w in enumerate(cams2world): + if camera_mask is not None and not camera_mask[i]: + continue + if isinstance(cam_color, list): + camera_edge_color = cam_color[i] + else: + camera_edge_color = cam_color or CAM_COLORS[i % len(CAM_COLORS)] + add_scene_cam(scene, pose_c2w, camera_edge_color, + None if transparent_cams else imgs[i], focals[i], + imsize=imgs[i].shape[1::-1], screen_width=cam_size) + + rot = np.eye(4) + rot[:3, :3] = Rotation.from_euler('y', np.deg2rad(180)).as_matrix() + scene.apply_transform(np.linalg.inv(cams2world[0] @ OPENGL @ rot)) + outfile = os.path.join(outdir, filename) + if verbose: + print('(exporting 3D scene to', outfile, ')') + assert as_pointcloud + + if filename.endswith('ply'): + if verbose: + print('WARNING: export to ply - cameras will be ignore') + pct.export(file_obj=outfile, file_type='ply') + else: + scene.export(file_obj=outfile) + return outfile + + +@torch.no_grad() +def get_3D_model_from_scene(outdir, verbose, scene, min_conf_thr=3.0, as_pointcloud=False, + transparent_cams=False, local_pointmaps=False, cam_size=0.05, camera_conf_thr=0.0, + filename='scene.glb'): + """ + extract 3D_model (glb file) from a reconstructed scene + """ + if scene is None: + return None + + # get optimized values from scene + x_out, imgs = scene.x_out, scene.imgs + 
focals, cams2world = scene.focals, scene.cams2world + nimgs = len(imgs) + + # 3D pointcloud from depthmap, poses and intrinsics + if local_pointmaps: + pts3d = [geotrf(cams2world[i], x_out[i]['pts3d_local'].cpu()) for i in range(nimgs)] + else: + pts3d = [x_out[i]['pts3d'].cpu() for i in range(nimgs)] + msk = [(x_out[i]['conf'] >= min_conf_thr).cpu() for i in range(nimgs)] + camera_mask = [(x_out[i]['conf'].median() >= camera_conf_thr).cpu() for i in range(nimgs)] + return _convert_scene_output_to_glb(outdir, imgs, pts3d, msk, focals, cams2world, + as_pointcloud=as_pointcloud, + transparent_cams=transparent_cams, cam_size=cam_size, verbose=verbose, + filename=filename, camera_mask=camera_mask) + + +@torch.no_grad() +def get_reconstructed_scene(outdir, viser_server, should_save_glb, model, retrieval, device, verbose, image_size, amp, + filelist, max_bs, num_refinements_iterations, # main params + execution_mode, num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile, # execution params + min_conf_thr, as_pointcloud, transparent_cams, local_pointmaps, cam_size, camera_conf_thr=0.0, # output params + loaded_files="" + ): + """ + from a list of images, run dust3r inference, global aligner. + then run get_3D_model_from_scene + """ + if filelist: + image_list = filelist + elif loaded_files: + image_list = loaded_files.split("\n") + else: + return None, None + + if execution_mode == "vidseq" or execution_mode == "vidslam": + if execution_mode == "vidseq": + local_context_size = vidseq_local_context_size + def is_keyframe_function(id, res, scene_state): return (id % keyframe_interval == 0) + scene_state = None + def scene_state_update_function(res, scene_state): return scene_state + elif execution_mode == "vidslam": + local_context_size = slam_local_context_size + overlap_mode = "nn-norm" + is_keyframe_function = functools.partial( + slam_is_keyframe, subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile, overlap_mode) + scene_state = get_searcher("kdtree-scipy-quadrant_x2") + scene_state_update_function = functools.partial(slam_update_scene_state, subsample, min_conf_keyframe) + else: + raise ValueError(f"Invalid {execution_mode=}") + scene = must3r_inference_video(model, device, image_size, amp, image_list, max_bs, init_num_images=2, batch_num_views=1, + viser_server=viser_server, num_refinements_iterations=num_refinements_iterations, + local_context_size=local_context_size, is_keyframe_function=is_keyframe_function, + scene_state=scene_state, scene_state_update_function=scene_state_update_function, + verbose=verbose) + else: + is_sequence = (execution_mode == "linseq") + scene = must3r_inference(model, retrieval, device, image_size, amp, image_list, + num_mem_images, max_bs, init_num_images=2, batch_num_views=1, render_once=render_once, + is_sequence=is_sequence, viser_server=viser_server, + num_refinements_iterations=num_refinements_iterations, + verbose=verbose) + if verbose: + print('preparing pointcloud') + time_start = datetime.datetime.now() + if should_save_glb: + outfile = get_3D_model_from_scene(outdir, verbose, scene, min_conf_thr, as_pointcloud, transparent_cams, + local_pointmaps, cam_size, camera_conf_thr=camera_conf_thr) + else: + outfile = None + + ellapsed = (datetime.datetime.now() - time_start) + if verbose: + print(f'pointcloud prepared in {ellapsed}') + + return scene, outfile + + +def load_local_files(inputfiles, textinput, execution_mode, + 
num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile): + + if textinput is not None and textinput: + files = os.listdir(textinput) + files = [os.path.join(textinput, f) for f in files] + files = [f for f in files if is_valid_pil_image_file(f)] + files = sorted(files) + inputfiles = gradio.File(value=None, file_count="multiple", + file_types=list(PIL.Image.registered_extensions().keys())) + loaded_files = gradio.TextArea(interactive=False, value="\n".join(files), visible=True) + + return inputfiles, loaded_files, *set_execution_params(files, execution_mode, + num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile) + + +def upload_files(inputfiles, loaded_files, execution_mode, + num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile): + if inputfiles is not None: + loaded_files = gradio.TextArea(value="", interactive=False, visible=False) + valid_files = [f for f in inputfiles if is_valid_pil_image_file(f)] + inputfiles_component = gradio.File(value=valid_files, file_count="multiple", + file_types=list(PIL.Image.registered_extensions().keys())) + elif loaded_files: + inputfiles = loaded_files.split("\n") + loaded_files = gradio.TextArea(interactive=False, value=loaded_files, visible=True) + inputfiles_component = gradio.File(value=None, file_count="multiple", + file_types=list(PIL.Image.registered_extensions().keys())) + else: + loaded_files = gradio.TextArea(value="", interactive=False, visible=False) + inputfiles_component = gradio.File(value=None, file_count="multiple", + file_types=list(PIL.Image.registered_extensions().keys())) + + return inputfiles_component, loaded_files, *set_execution_params(inputfiles, execution_mode, + num_mem_images, render_once, vidseq_local_context_size, + keyframe_interval, slam_local_context_size, slam_subsample, + min_conf_keyframe, keyframe_overlap_thr, overlap_percentile) + + +def change_execution_mode(inputfiles, loaded_files, execution_mode, num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile): + if inputfiles is not None: + files = inputfiles + elif loaded_files: + files = loaded_files.split("\n") + else: + files = None + return set_execution_mode(files, execution_mode, num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile) + + +def set_execution_params(inputfiles, execution_mode, + num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile): + num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile = set_execution_mode( + inputfiles, execution_mode, num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile) + return num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, 
slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile + + +def set_execution_mode(inputfiles, execution_mode, num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile): + # linseq or retrieval + num_mem_images = gradio.Slider(label="Number of memory images", value=num_mem_images, + minimum=num_mem_images, maximum=num_mem_images, step=1, visible=False) + render_once = gradio.Checkbox(value=render_once, label="Render once", visible=False) + + # vidseq + vidseq_local_context_size = gradio.Slider(label="Local context size", value=vidseq_local_context_size, + minimum=vidseq_local_context_size, maximum=vidseq_local_context_size, step=1, visible=False) + keyframe_interval = gradio.Slider(label="Keyframe Interval", value=keyframe_interval, + minimum=keyframe_interval, maximum=keyframe_interval, step=1, visible=False) + + # vidslam + slam_local_context_size = gradio.Slider(label="Local context size", value=slam_local_context_size, + minimum=slam_local_context_size, maximum=slam_local_context_size, step=1, visible=False) + slam_subsample = gradio.Slider(label="subsample", value=slam_subsample, + minimum=1, maximum=8, step=1, visible=False) + min_conf_keyframe = gradio.Slider(label="min conf keyframe", value=min_conf_keyframe, + minimum=1.0, maximum=3.0, step=0.1, visible=False) + keyframe_overlap_thr = gradio.Slider(label="keyframe overlap thr", value=keyframe_overlap_thr, + minimum=0.01, maximum=0.3, step=0.01, visible=False) + overlap_percentile = gradio.Slider(label="overlap percentile", value=overlap_percentile, + minimum=10, maximum=100, step=1, visible=False) + + if inputfiles is None or len(inputfiles) == 0: + return num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile + + num_files = len(inputfiles) + + if execution_mode in ["linseq", "retrieval"]: + current_num_mem_images = num_mem_images.constructor_args["value"] \ + if num_mem_images.constructor_args["value"] > 0 else min(num_files, 50) + current_num_mem_images = min(num_files, current_num_mem_images) + + num_mem_images = gradio.Slider(label="Number of memory images", value=current_num_mem_images, + minimum=1, maximum=num_files, step=1, visible=True) + render_once = gradio.Checkbox(value=render_once.constructor_args["value"], label="Render once", visible=True) + elif execution_mode == "vidseq": + curr_vidseq_local_context_size = vidseq_local_context_size.constructor_args["value"] \ + if vidseq_local_context_size.constructor_args["value"] > 0 else min(num_files, 25) + curr_vidseq_local_context_size = min(num_files, curr_vidseq_local_context_size) + vidseq_local_context_size = gradio.Slider(label="Local context size", value=curr_vidseq_local_context_size, + minimum=0, maximum=num_files, step=1, visible=True) + + curr_keyframe_interval = keyframe_interval.constructor_args["value"] \ + if keyframe_interval.constructor_args["value"] > 0 else min(num_files, 3) + curr_keyframe_interval = min(num_files, curr_keyframe_interval) + keyframe_interval = gradio.Slider(label="Keyframe Interval", value=curr_keyframe_interval, + minimum=1, maximum=num_files, step=1, visible=True) + else: + # vidslam + curr_slam_local_context_size = slam_local_context_size.constructor_args["value"] \ + if slam_local_context_size.constructor_args["value"] > 0 else 0 + 
curr_slam_local_context_size = min(num_files, curr_slam_local_context_size) + slam_local_context_size = gradio.Slider(label="Local context size", value=curr_slam_local_context_size, + minimum=0, maximum=num_files, step=1, visible=True) + + slam_subsample = gradio.Slider(label="subsample", value=slam_subsample.constructor_args["value"], + minimum=1, maximum=8, step=1, visible=True) + min_conf_keyframe = gradio.Slider(label="min conf keyframe", value=min_conf_keyframe.constructor_args["value"], + minimum=1.0, maximum=3.0, step=0.1, visible=True) + keyframe_overlap_thr = gradio.Slider(label="keyframe overlap thr", value=keyframe_overlap_thr.constructor_args["value"], + minimum=0.01, maximum=0.3, step=0.01, visible=True) + overlap_percentile = gradio.Slider(label="overlap percentile", value=overlap_percentile.constructor_args["value"], + minimum=10, maximum=100, step=1, visible=True) + + return num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile + + +def main_demo(tmpdirname, model, retrieval, device, image_size, server_name, server_port, + verbose=True, amp=False, with_viser=False, allow_local_files=False, embed_viser=False): + with_viser = with_viser or embed_viser + if with_viser: + viser_server = ViserWrapper(host=server_name) + else: + viser_server = None + + recon_fun = functools.partial(get_reconstructed_scene, tmpdirname, viser_server, not embed_viser, model, + retrieval, device, verbose, image_size, amp) + model_from_scene_fun = functools.partial(get_3D_model_from_scene, tmpdirname, verbose) + with gradio.Blocks(css=""".gradio-container {margin: 0 !important; min-width: 100%};""", title="MUSt3R Demo") as demo: + # scene state is save so that you can change conf_thr, cam_size... without rerunning the inference + scene = gradio.State(None) + + available_modes = [("sequence: linspace", "linseq"), + ("sequence: slam keyframes", "vidslam"), + ("sequence: local context and linspace keyframes", "vidseq"),] + if retrieval: + available_modes.append(("unordered: retrieval", "retrieval")) + + gradio.HTML('

<h2 style="text-align: center;">MUSt3R Demo</h2>
') + with gradio.Column(): + with gradio.Tab("upload"): + inputfiles = gradio.File(file_count="multiple", + file_types=list(PIL.Image.registered_extensions().keys())) + with gradio.Tab("local_path", visible=allow_local_files): + textinput = gradio.Textbox(label="Path to a local directory") + load_files = gradio.Button("Load") + loaded_files = gradio.TextArea(value="", interactive=False, visible=False) + + # inference options + with gradio.Row(): + with gradio.Column(): + num_refinements_iterations = gradio.Slider(label="Number of refinement iterations", value=0, + minimum=0, maximum=100, step=1, visible=True) + max_bs = gradio.Number(value=1, minimum=0, maximum=100_000, step=1, + label="Maximum batch size", visible=True) + with gradio.Column(): + execution_mode = gradio.Dropdown(available_modes, + value='vidslam', label="Mode", + info="Define how to run MUSt3R", + interactive=True) + + # linseq or retrieval + num_mem_images = gradio.Slider(label="Number of memory images", value=0, + minimum=0, maximum=0, step=1, visible=False) + render_once = gradio.Checkbox(value=False, label="Render once", visible=False) + + # vidseq + vidseq_local_context_size = gradio.Slider(label="Local context size", value=0, + minimum=0, maximum=0, step=1, visible=False) + keyframe_interval = gradio.Slider(label="Keyframe Interval", value=0, + minimum=0, maximum=0, step=1, visible=False) + + # vidslam + # also uses local_context_size + slam_local_context_size = gradio.Slider(label="Local context size", value=0, + minimum=0, maximum=0, step=1, visible=False) + slam_subsample = gradio.Slider(label="subsample", value=2, + minimum=1, maximum=8, step=1, visible=False) + min_conf_keyframe = gradio.Slider(label="min conf keyframe", value=1.5, + minimum=1.0, maximum=3.0, step=0.1, visible=False) + keyframe_overlap_thr = gradio.Slider(label="keyframe overlap thr", value=0.05, + minimum=0.01, maximum=0.3, step=0.01, visible=False) + overlap_percentile = gradio.Slider(label="overlap percentile", value=85, + minimum=10, maximum=100, step=1, visible=False) + run_btn = gradio.Button("Run") + + # visualization options + with gradio.Row(visible=not embed_viser): + with gradio.Column(): + # adjust the confidence threshold + min_conf_thr = gradio.Slider(label="min_conf_thr", value=3.0, minimum=1.0, maximum=20, step=0.1) + camera_conf_thr = gradio.Slider(label="camera_conf_thr", value=1.5, + minimum=1.0, maximum=20, step=0.1) + # adjust the camera size in the output pointcloud + cam_size = gradio.Slider(label="cam_size", value=0.05, minimum=0.001, maximum=0.1, step=0.001) + + with gradio.Column(): + as_pointcloud = gradio.Checkbox(value=True, label="As pointcloud") + transparent_cams = gradio.Checkbox(value=False, label="Transparent cameras") + local_pointmaps = gradio.Checkbox(value=False, label="viz local pointmaps pointcloud") + + if embed_viser: + assert viser_server is not None + viser_html = gradio.HTML(f"""
+                <!-- embedded viser viewer (iframe markup assumed) -->
+                Visualization <a href="http://{viser_server.address}" target="_blank">Full screen</a>
+                <iframe src="http://{viser_server.address}" style="width: 100%; height: 70vh; border: 0;"></iframe>
+
""") + outmodel = gradio.Model3D(visible=False, render=False) + else: + outmodel = gradio.Model3D() + + # events + inputfiles.upload(upload_files, + inputs=[inputfiles, loaded_files, execution_mode, num_mem_images, + render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile], + outputs=[inputfiles, loaded_files, num_mem_images, render_once, + vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile]) + + inputfiles.delete(upload_files, + inputs=[inputfiles, loaded_files, execution_mode, num_mem_images, + render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile], + outputs=[inputfiles, loaded_files, num_mem_images, render_once, + vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile]) + inputfiles.clear(upload_files, + inputs=[inputfiles, loaded_files, execution_mode, num_mem_images, + render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile], + outputs=[inputfiles, loaded_files, num_mem_images, render_once, + vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile]) + + if allow_local_files: + load_files.click(fn=load_local_files, + inputs=[inputfiles, textinput, execution_mode, num_mem_images, + render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile], + outputs=[inputfiles, loaded_files, num_mem_images, render_once, + vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile]) + execution_mode.change(change_execution_mode, + inputs=[inputfiles, loaded_files, execution_mode, num_mem_images, render_once, + vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile], + outputs=[num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile]) + + run_btn.click(fn=recon_fun, + inputs=[inputfiles, max_bs, num_refinements_iterations, + execution_mode, num_mem_images, render_once, vidseq_local_context_size, keyframe_interval, slam_local_context_size, slam_subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile, + min_conf_thr, as_pointcloud, transparent_cams, local_pointmaps, cam_size, camera_conf_thr, loaded_files], + outputs=[scene, outmodel]) + + min_conf_thr.release(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, + transparent_cams, local_pointmaps, cam_size, camera_conf_thr], + outputs=outmodel) + camera_conf_thr.release(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, transparent_cams, local_pointmaps, + cam_size, camera_conf_thr], + outputs=outmodel) + cam_size.change(fn=model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, transparent_cams, local_pointmaps, + cam_size, camera_conf_thr], + outputs=outmodel) + as_pointcloud.change(fn=model_from_scene_fun, + 
inputs=[scene, min_conf_thr, as_pointcloud, transparent_cams, local_pointmaps, + cam_size, camera_conf_thr], + outputs=outmodel) + transparent_cams.change(model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, transparent_cams, + local_pointmaps, cam_size, camera_conf_thr], + outputs=outmodel) + local_pointmaps.change(model_from_scene_fun, + inputs=[scene, min_conf_thr, as_pointcloud, transparent_cams, + local_pointmaps, cam_size, camera_conf_thr], + outputs=outmodel) + demo.launch(share=False, server_name=server_name, server_port=server_port) + + +def main(): + torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12 + parser = get_args_parser() + args = parser.parse_args() + + toggle_memory_efficient_attention(enabled=has_xformers) + + if args.tmp_dir is not None: + tmp_path = args.tmp_dir + os.makedirs(tmp_path, exist_ok=True) + tempfile.tempdir = tmp_path + + if args.server_name is not None: + server_name = args.server_name + else: + server_name = '0.0.0.0' if args.local_network else '127.0.0.1' + + weights_path = args.weights + model = load_model(weights_path, encoder=args.encoder, decoder=args.decoder, device=args.device, + img_size=args.image_size, memory_mode=args.memory_mode, verbose=args.verbose) + + # must3r will write the 3D model inside tmpdirname + with tempfile.TemporaryDirectory(suffix='dust3r_gradio_demo') as tmpdirname: + if args.verbose: + print('Outputing stuff in', tmpdirname) + main_demo(tmpdirname, model, args.retrieval, args.device, args.image_size, + server_name, args.server_port, verbose=args.verbose, amp=args.amp, with_viser=args.viser, + allow_local_files=args.allow_local_files, embed_viser=args.embed_viser) diff --git a/must3r/demo/inference.py b/must3r/demo/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..e60b3c5802a1d2700faec203e8e3c9c66e68befc --- /dev/null +++ b/must3r/demo/inference.py @@ -0,0 +1,331 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
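+# Demo inference helpers: image loading/resizing, keyframe selection (linear spacing, retrieval-based farthest point sampling, or SLAM-style overlap tests) and the SceneState container returned to the gradio UI.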
+import torch +import numpy as np +import datetime +import PIL.Image + +from must3r.retrieval.processor import Retriever +from must3r.retrieval.graph import farthest_point_sampling + +from must3r.engine.inference import encoder_multi_ar, inference_multi_ar, postprocess, inference_video_multi_ar +from must3r.model import get_pointmaps_activation, get_dtype +from must3r.tools.image import get_resize_function + +import must3r.tools.path_to_dust3r # noqa +from dust3r.viz import rgb +from dust3r.datasets import ImgNorm + +from must3r.slam.model import get_overlap_score, choose_keyframe_from_overlap + + +class SceneState: + def __init__(self, x_out, imgs, true_shape, focals, cams2world, image_list): + self.x_out = x_out + self.imgs = imgs + self.true_shape = true_shape + self.focals = focals + self.cams2world = cams2world + self.image_list = image_list + + +class MUSt3R_Retriever(Retriever): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def _preproc(self, enc_ims, device): + imids = [] + features = [] + with torch.no_grad(): + for i, enc_im in enumerate(enc_ims): + feat, _, _ = self.model.forward_local(enc_im.to(device)) + feat = feat.flatten(0, 1).cpu() + imids.append(i * torch.ones_like(feat[:, 0]).to(dtype=torch.int64)) + features.append(feat) + features = torch.cat(features, dim=0) + imids = torch.cat(imids, dim=0) + return features, imids + + def __call__(self, enc, device): + # build the database + feat, ids = self._preproc(enc, device) # turn encoded image into feats for retrieval + feat = feat.cpu().numpy() + ids = ids.cpu().numpy() + + asmk_dataset = self.asmk.build_ivf(feat, ids) + metadata, query_ids, ranks, ranked_scores = asmk_dataset.query_ivf(feat, ids) + + scores = np.empty_like(ranked_scores) + scores[np.arange(ranked_scores.shape[0])[:, None], ranks] = ranked_scores + + return scores + + +def load_images(folder_content, size, patch_size=16, verbose=True): + imgs = [] + transform = ImgNorm + + for path in folder_content: + rgb_image = PIL.Image.open(path).convert('RGB') + rgb_image.load() + W, H = rgb_image.size + resize_func, _, to_orig = get_resize_function(size, patch_size, H, W) + rgb_tensor = resize_func(transform(rgb_image)) + imgs.append(dict(img=rgb_tensor, true_shape=np.int32([rgb_tensor.shape[-2], rgb_tensor.shape[-1]]))) + if verbose: + print(f' - adding {path} with resolution {W}x{H} --> {rgb_tensor.shape[-1]}x{rgb_tensor.shape[-2]}') + return imgs + + +def slam_is_keyframe(subsample, min_conf_keyframe, keyframe_overlap_thr, overlap_percentile, overlap_mode, id, res, scene_state): + cam_center = res['c2w'][:3, -1] + + res_unsqueeze = {k: v.unsqueeze(0).unsqueeze(0) for k, v in res.items()} + overlap_score = get_overlap_score(res_unsqueeze, + scene_state, + cam_center=cam_center, + mode=overlap_mode, + kf_x_subsamp=subsample, + min_conf_keyframe=min_conf_keyframe, + percentile=overlap_percentile, + ) + assert not np.isnan(overlap_score) + return choose_keyframe_from_overlap(overlap_score, keyframe_overlap_thr, overlap_mode) + + +def slam_update_scene_state(subsample, min_conf_keyframe, res, scene_state): + cam_center = res['c2w'][:3, -1] + msk = res['conf'] > min_conf_keyframe + + if subsample: + msk = msk[::subsample, ::subsample] + pts = res['pts3d'][::subsample, ::subsample][msk] + else: + pts = res['pts3d'][msk] + + scene_state.add_pts(pts, cam_center=cam_center) + return scene_state + + +def must3r_inference(model, retrieval, device, image_size, amp, + filelist, num_mem_images, max_bs, init_num_images, batch_num_views, 
render_once, + is_sequence, viser_server=None, num_refinements_iterations=0, verbose=True): + dtype = get_dtype(amp) + + max_bs = None if max_bs == 0 else max_bs + encoder, decoder = model + pointmaps_activation = get_pointmaps_activation(decoder, verbose=verbose) + def post_process_function(x): return postprocess(x, pointmaps_activation=pointmaps_activation, compute_cam=True) + + if verbose: + print('loading images') + time_start = datetime.datetime.now() + views = load_images(filelist, size=image_size, patch_size=encoder.patch_size, verbose=verbose) + nimgs = len(views) + + ellapsed = (datetime.datetime.now() - time_start) + if verbose: + print(f'loaded in {ellapsed}') + print('running inference') + time_start = datetime.datetime.now() + if viser_server is not None: + viser_server.reset(nimgs) + + imgs = [b['img'].to('cpu') for b in views] + true_shape = [torch.from_numpy(b['true_shape']).to('cpu') for b in views] + true_shape = torch.stack(true_shape, dim=0) + nimgs = true_shape.shape[0] + + # select keyframes + if is_sequence or retrieval is None: + keyframes = np.linspace(0, len(imgs) - 1, num_mem_images, dtype=int).tolist() + encoder_precomputed_features = None + else: + # run encoder + with torch.autocast("cuda", dtype=dtype): + x_start, pos_start = encoder_multi_ar(encoder, imgs, true_shape, verbose=verbose, max_bs=max_bs, + device=device, preserve_gpu_mem=True) + encoder_precomputed_features = (x_start, pos_start) + + retriever = MUSt3R_Retriever(retrieval, backbone=encoder, verbose=verbose) + sim_matrix = retriever([xi.unsqueeze(0).float() for xi in x_start], device=device) + # Cleanup + del retriever + torch.cuda.empty_cache() + anchor_idx, _ = farthest_point_sampling(1 - sim_matrix, N=num_mem_images, dist_thresh=None) + sim_matrix = sim_matrix[anchor_idx, :][:, anchor_idx] + + diag = np.diag_indices(num_mem_images) + sim_matrix[diag[0], diag[1]] = 0 + sim_sum = np.sum(sim_matrix, axis=-1) + + keyframes = [np.argmax(sim_sum)] # start with image that has the highest overlap + sim_matrix[:, keyframes[0]] = 0 # invalidate column + while len(keyframes) != num_mem_images: + # last_keyframe = keyframes[-1] + # best_next_image = np.argmax(sim_matrix[last_keyframe]) + sim_matrix_sel = sim_matrix[np.array(keyframes)] + best_next_image = np.unravel_index(np.argmax(sim_matrix_sel), + sim_matrix_sel.shape)[1] # we need the column index + keyframes.append(best_next_image) + sim_matrix[:, best_next_image] = 0 + keyframes = [anchor_idx[k] for k in keyframes] + + not_keyframes = sorted(set(range(nimgs)).difference(set(keyframes))) + assert (len(keyframes) + len(not_keyframes)) == nimgs + # reorder images + views = [views[i] for i in keyframes] + [views[i] for i in not_keyframes] + imgs = [b['img'].to(device) for b in views] + true_shape = [torch.from_numpy(b['true_shape']).to(device) for b in views] + filenames = [filelist[i] for i in keyframes + not_keyframes] + img_ids = [torch.tensor(v) for v in keyframes + not_keyframes] + + if encoder_precomputed_features is not None: + x_start, pos_start = encoder_precomputed_features + x = [x_start[i] for i in keyframes] + [x_start[i] for i in not_keyframes] + pos = [pos_start[i] for i in keyframes] + [pos_start[i] for i in not_keyframes] + encoder_precomputed_features = (x, pos) + + mem_batches = [min(init_num_images, nimgs)] + while (sum_b := sum(mem_batches)) != max(num_mem_images, init_num_images): + size_b = min(batch_num_views, num_mem_images - sum_b) + mem_batches.append(size_b) + + if render_once: + to_render = list(range(num_mem_images, nimgs)) 
+ else: + to_render = None + + with torch.autocast("cuda", dtype=dtype): + x_out_0, x_out = inference_multi_ar(encoder, decoder, imgs, img_ids, true_shape, mem_batches, + max_bs=max_bs, verbose=verbose, to_render=to_render, + encoder_precomputed_features=encoder_precomputed_features, + device=device, preserve_gpu_mem=True, + post_process_function=post_process_function, + viser_server=viser_server, + num_refinements_iterations=num_refinements_iterations) + if to_render is not None: + x_out = x_out_0 + x_out + + ellapsed = (datetime.datetime.now() - time_start) + if verbose: + print(f'inference in {ellapsed}') + try: + print(str(int(torch.cuda.max_memory_reserved(device) / (1024 ** 2))) + " MB") + except Exception as e: + pass + + if viser_server is not None: + viser_server.reset_cam_visility() + viser_server.send_message("Finished") + + if verbose: + print('preparing pointcloud') + time_start = datetime.datetime.now() + focals = [] + cams2world = [] + for i in range(nimgs): + focals.append(float(x_out[i]['focal'].cpu())) + cams2world.append(x_out[i]['c2w'].cpu()) + + # x_out to cpu + for i in range(len(x_out)): + for k in x_out[i].keys(): + x_out[i][k] = x_out[i][k].cpu() + + rgbimg = [rgb(imgs[i], true_shape[i]) for i in range(nimgs)] + scene = SceneState(x_out, rgbimg, true_shape, focals, cams2world, filenames) + + ellapsed = (datetime.datetime.now() - time_start) + if verbose: + print(f'pointcloud prepared in {ellapsed}') + return scene + + +def must3r_inference_video(model, device, image_size, amp, + filelist, max_bs, init_num_images, batch_num_views, + viser_server=None, num_refinements_iterations=0, local_context_size: int = 25, + is_keyframe_function=lambda id, res, scene_state: (id % 3 == 0), + scene_state=None, + scene_state_update_function=lambda res, scene_state: scene_state, + verbose=True): + dtype = get_dtype(amp) + + max_bs = None if max_bs == 0 else max_bs + encoder, decoder = model + pointmaps_activation = get_pointmaps_activation(decoder, verbose=verbose) + def post_process_function(x): return postprocess(x, pointmaps_activation=pointmaps_activation, compute_cam=True) + + if verbose: + print('loading images') + time_start = datetime.datetime.now() + views = load_images(filelist, size=image_size, patch_size=encoder.patch_size, verbose=verbose) + nimgs = len(views) + + ellapsed = (datetime.datetime.now() - time_start) + if verbose: + print(f'loaded in {ellapsed}') + print('running inference') + time_start = datetime.datetime.now() + if viser_server is not None: + viser_server.reset(nimgs) + + imgs = [b['img'].to('cpu') for b in views] + true_shape = [torch.from_numpy(b['true_shape']).to('cpu') for b in views] + true_shape = torch.stack(true_shape, dim=0) + nimgs = true_shape.shape[0] + + imgs = [b['img'].to(device) for b in views] + true_shape = [torch.from_numpy(b['true_shape']).to(device) for b in views] + filenames = filelist + # img_ids = [torch.tensor(v) for v in range(nimgs)] + + mem_batches = [min(init_num_images, nimgs)] + while (sum_b := sum(mem_batches)) != nimgs: + size_b = min(batch_num_views, nimgs - sum_b) + mem_batches.append(size_b) + + with torch.autocast("cuda", dtype=dtype): + x_out = inference_video_multi_ar(encoder, decoder, imgs, true_shape, mem_batches, + max_bs=max_bs, verbose=verbose, + device=device, preserve_gpu_mem=True, + post_process_function=post_process_function, + viser_server=viser_server, + num_refinements_iterations=num_refinements_iterations, + local_context_size=local_context_size, + is_keyframe_function=is_keyframe_function, + 
scene_state=scene_state, + scene_state_update_function=scene_state_update_function) + + ellapsed = (datetime.datetime.now() - time_start) + if verbose: + print(f'inference in {ellapsed}') + try: + print(str(int(torch.cuda.max_memory_reserved(device) / (1024 ** 2))) + " MB") + except Exception as e: + pass + + if viser_server is not None: + viser_server.reset_cam_visility() + viser_server.send_message("Finished") + + if verbose: + print('preparing pointcloud') + time_start = datetime.datetime.now() + focals = [] + cams2world = [] + for i in range(nimgs): + focals.append(float(x_out[i]['focal'].cpu())) + cams2world.append(x_out[i]['c2w'].cpu()) + + # x_out to cpu + for i in range(len(x_out)): + for k in x_out[i].keys(): + x_out[i][k] = x_out[i][k].cpu() + + rgbimg = [rgb(imgs[i], true_shape[i]) for i in range(nimgs)] + scene = SceneState(x_out, rgbimg, true_shape, focals, cams2world, filenames) + + ellapsed = (datetime.datetime.now() - time_start) + if verbose: + print(f'pointcloud prepared in {ellapsed}') + return scene diff --git a/must3r/demo/viser.py b/must3r/demo/viser.py new file mode 100644 index 0000000000000000000000000000000000000000..d01d3ece2ab71caddf412538a05831cc628094a1 --- /dev/null +++ b/must3r/demo/viser.py @@ -0,0 +1,276 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import numpy as np + +import must3r.tools.path_to_dust3r # noqa +from dust3r.utils.geometry import geotrf +try: + import viser + import viser.transforms as viser_tf + viser_enabled = True +except ImportError: + viser_enabled = False + + +def get_pointcloud_key(frame_id): + return f"/frames/t{frame_id}/point_cloud" + + +def get_camera_key(frame_id): + return f"/frames/t{frame_id}/frustum" + + +def colorize_grayscale(depth: np.ndarray): + mind, maxd = depth.min(), depth.max() + depth = (depth - mind) / (maxd - mind + 1e-9) + return np.stack([depth, depth, depth], axis=-1) + + +class ViserWrapper(): + def __init__(self, host: str = "0.0.0.0", port: int = 8080, label: str | None = None, verbose: bool = True) -> None: + self.server = viser.ViserServer(host, port, label, verbose) + + self.server.scene.set_up_direction('-y') + + self.num_imgs = 0 + self.progress_bar = self.server.gui.add_progress_bar( + value=100 + ) + + @self.server.on_client_connect + def _(client: viser.ClientHandle) -> None: + """For each client that connects, create GUI elements for adjusting the + near/far clipping planes.""" + client.camera.near = 0.01 + client.camera.far = 100.0 + camera_slider = client.gui.add_multi_slider( + "Camera Near/far", min=0.01, max=1000.0, step=0.001, initial_value=(client.camera.near, client.camera.far), order=0 + ) + + @camera_slider.on_update + def _(_) -> None: + near, far = camera_slider.value + client.camera.near = near + client.camera.far = far + + self.gui_point_size = self.server.gui.add_slider( + "Point size", min=0.001, max=0.1, step=0.001, initial_value=0.01 + ) + self.frustum_scale = self.server.gui.add_slider( + "Camera size", min=0.01, max=1.0, step=0.01, initial_value=0.05 + ) + self.confidence_threshold = self.server.gui.add_slider( + "Confidence", min=1.0, max=10.0, step=0.1, initial_value=3.0 + ) + self.max_points_per_frame = self.server.gui.add_slider( + "Max Points", min=0, max=250_000, step=1000, initial_value=20_000 + ) + self.local_pointmap = self.server.gui.add_checkbox( + "Local pointmaps", initial_value=True + ) + self.follow_cam = self.server.gui.add_checkbox( + "Follow Cam", initial_value=False + ) + self.keyframes_only = self.server.gui.add_checkbox( + 
"Keyframes Only", initial_value=True + ) + self.hide_images = self.server.gui.add_checkbox( + "Hide Images", initial_value=False, hint="Hide the images in the camera frustum in the scene" + ) + self.hide_images_gui = self.server.gui.add_checkbox( + "Hide Predictions", initial_value=False, hint="Hide the rgb,depth,conf images" + ) + + empty_img = np.array([[[0, 0, 0]]]) + self.rgb = self.server.gui.add_image(empty_img, label="RGB", jpeg_quality=80, visible=False) + self.depth = self.server.gui.add_image(empty_img, label="Depth", jpeg_quality=80, visible=False) + self.conf = self.server.gui.add_image(empty_img, label="Confidence", jpeg_quality=80, visible=False) + + self.point_nodes: dict[str, viser.PointCloudHandle] = {} + self.camera_nodes: dict[str, viser.CameraFrustumHandle] = {} + self.pointmaps: dict[str, dict] = {} + self.images: dict[str, dict] = {} + + @self.gui_point_size.on_update + def _(_) -> None: + for frame_id in list(self.point_nodes.keys()): + self.point_nodes[frame_id].point_size = self.gui_point_size.value + + @self.frustum_scale.on_update + def _(_) -> None: + for frame_id in list(self.camera_nodes.keys()): + self.camera_nodes[frame_id].scale = self.frustum_scale.value + + @self.confidence_threshold.on_update + def _(_) -> None: + for frame_id in list(self.point_nodes.keys()): + self.make_point_cloud(frame_id) + + @self.local_pointmap.on_update + def _(_) -> None: + for frame_id in list(self.point_nodes.keys()): + self.make_point_cloud(frame_id) + + @self.follow_cam.on_update + def _(_) -> None: + self.reset_cam_visility() + + @self.keyframes_only.on_update + def _(_) -> None: + self.reset_point_cloud_visility() + + @self.hide_images.on_update + def _(_) -> None: + for frame_id in list(self.camera_nodes.keys()): + self.make_camera_frustum(frame_id) + + @self.hide_images_gui.on_update + def _(_) -> None: + self.set_images_gui_visibility() + + @self.max_points_per_frame.on_update + def _(_) -> None: + for frame_id in list(self.point_nodes.keys()): + self.make_point_cloud(frame_id) + + @property + def address(self): + return f"{self.server.get_host()}:{self.server.get_port()}" + + def set_images_gui_visibility(self): + if len(self.rgb.image) > 0: + self.rgb.visible = not self.hide_images_gui.value + if len(self.depth.image) > 0: + self.depth.visible = not self.hide_images_gui.value + if len(self.conf.image) > 0: + self.conf.visible = not self.hide_images_gui.value + + def reset_cam_visility(self): + for frame_id in list(self.camera_nodes.keys()): + if not self.camera_nodes[frame_id].visible: + self.camera_nodes[frame_id].visible = True + + def reset_point_cloud_visility(self): + for frame_id in list(self.point_nodes.keys()): + self.point_nodes[frame_id].visible = self.pointmaps[frame_id]['is_keyframe'] or not self.keyframes_only.value + + def reset(self, num_imgs): + self.progress_bar.value = 0 + self.num_imgs = num_imgs + + for frame_id in self.point_nodes.keys(): + self.server.scene.remove_by_name(get_pointcloud_key(frame_id)) + self.point_nodes = {} + + for frame_id in self.camera_nodes.keys(): + self.server.scene.remove_by_name(get_camera_key(frame_id)) + self.camera_nodes = {} + + self.pointmaps = {} + + def make_point_cloud(self, frame_id): + mask = self.pointmaps[frame_id]['conf'] >= self.confidence_threshold.value + points = self.pointmaps[frame_id]['pts3d_local'] if self.local_pointmap.value else self.pointmaps[frame_id]['pts3d'] + points = points[mask] + colors = self.pointmaps[frame_id]['rgb'][mask] + is_keyframe = self.pointmaps[frame_id]['is_keyframe'] + + 
num_point = points.shape[0] + if self.max_points_per_frame.value > 0 and num_point > self.max_points_per_frame.value: + submask = np.linspace(0, num_point - 1, self.max_points_per_frame.value, dtype=np.int64) + points = points[submask] + colors = colors[submask] + + self.point_nodes[frame_id] = \ + self.server.scene.add_point_cloud( + name=get_pointcloud_key(frame_id), + points=points, + colors=colors, + point_size=self.gui_point_size.value, + point_shape="rounded", + visible=is_keyframe or not self.keyframes_only.value + ) + + def make_camera_frustum(self, frame_id): + fov = self.images[frame_id]['fov'] + aspect = self.images[frame_id]['aspect'] + c2w = self.images[frame_id]['c2w'] + color = self.images[frame_id]['color'] + img = self.images[frame_id]['img'] if not self.hide_images.value else None + + self.camera_nodes[frame_id] = self.server.scene.add_camera_frustum( + get_camera_key(frame_id), + fov=fov, + aspect=aspect, + scale=self.frustum_scale.value, + image=img, + wxyz=viser_tf.SO3.from_matrix(c2w[:3, :3]).wxyz, + position=c2w[:3, 3], + color=color + ) + + def set_views(self, frame_ids, rgbs, pointmaps, is_keyframe=None): + if len(frame_ids) == 0: + return + + for i, frame_id in enumerate(frame_ids): + frame_id = str(frame_id) + img = rgbs[i] + img = (img * 0.5) + 0.5 # unnormalize image + img = img.cpu().numpy().transpose(1, 2, 0) + + if is_keyframe is None and frame_id in self.pointmaps: + is_keyframe_i = self.pointmaps[frame_id]['is_keyframe'] + elif is_keyframe is None: + is_keyframe_i = False + else: + is_keyframe_i = is_keyframe[i] + c2w = pointmaps[i]['c2w'].cpu().numpy() + self.pointmaps[frame_id] = { + 'pts3d': pointmaps[i]['pts3d'].cpu().numpy().reshape(-1, 3), + 'pts3d_local': geotrf(c2w, pointmaps[i]['pts3d_local'].cpu().numpy().reshape(-1, 3)), + 'conf': pointmaps[i]['conf'].cpu().numpy().ravel(), + 'rgb': img.reshape(-1, 3), + 'is_keyframe': bool(is_keyframe_i) + } + self.make_point_cloud(frame_id) + + focal = float(pointmaps[i]['focal'].cpu()) + H, W = img.shape[:2] + fov = 2 * np.arctan2(H / 2, focal) + aspect = W / H + color = (20, 20, 20) if not is_keyframe_i else (20, 200, 20) + + self.images[frame_id] = { + 'fov': fov, + 'aspect': aspect, + 'c2w': c2w, + 'color': color, + 'img': img + } + self.make_camera_frustum(frame_id) + + self.progress_bar.value = int(100 * len(self.pointmaps) / self.num_imgs) + + # only do this for the last one, we guarantee that c2w has a value with the early exit check + self.set_images_gui_visibility() + if not self.hide_images_gui.value: + self.rgb.image = img + self.depth.image = colorize_grayscale(pointmaps[-1]['pts3d_local'].cpu().numpy()[..., 2]) + self.conf.image = colorize_grayscale(pointmaps[-1]['conf'].cpu().numpy()) + if self.follow_cam.value: + self.reset_cam_visility() + self.camera_nodes[frame_id].visible = False + for client in self.server.get_clients().values(): + with client.atomic(): + client.camera.wxyz = viser_tf.SO3.from_matrix(c2w[:3, :3]).wxyz + client.camera.position = c2w[:3, 3] + + def send_message(self, message): + for client in self.server.get_clients().values(): + client.add_notification( + title="Gradio Update", + body=message, + loading=False, + with_close_button=True, + auto_close=False, + ) diff --git a/must3r/engine/__init__.py b/must3r/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..345ede2d5437f28f4929911c3614d5fe2caf828e --- /dev/null +++ b/must3r/engine/__init__.py @@ -0,0 +1 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
diff --git a/must3r/engine/inference.py b/must3r/engine/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..a85340a28bebc727efc69c04fd2db50a30ed1e88 --- /dev/null +++ b/must3r/engine/inference.py @@ -0,0 +1,694 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import torch +from contextlib import nullcontext +import numpy as np +import itertools +from tqdm import tqdm +import roma +from collections import deque +import math + +from must3r.model import ActivationType, apply_activation +import must3r.tools.path_to_dust3r # noqa +from dust3r.post_process import estimate_focal_knowing_depth + + +@torch.autocast("cuda", dtype=torch.float32) +def postprocess(pointmaps, pointmaps_activation=ActivationType.NORM_EXP, compute_cam=False): + out = {} + channels = pointmaps.shape[-1] + out['pts3d'] = pointmaps[..., :3] + out['pts3d'] = apply_activation(out['pts3d'], activation=pointmaps_activation) + if channels >= 6: + out['pts3d_local'] = pointmaps[..., 3:6] + out['pts3d_local'] = apply_activation(out['pts3d_local'], activation=pointmaps_activation) + if channels == 4 or channels == 7: + out['conf'] = 1.0 + pointmaps[..., -1].exp() + + if compute_cam: + batch_dims = out['pts3d'].shape[:-3] + num_batch_dims = len(batch_dims) + H, W = out['conf'].shape[-2:] + pp = torch.tensor((W / 2, H / 2), device=out['pts3d'].device) + focal = estimate_focal_knowing_depth(out['pts3d_local'].reshape(math.prod(batch_dims), H, W, 3), pp, + focal_mode='weiszfeld') + out['focal'] = focal.reshape(*batch_dims) + + R, T = roma.rigid_points_registration( + out['pts3d_local'].reshape(*batch_dims, -1, 3), + out['pts3d'].reshape(*batch_dims, -1, 3), + weights=out['conf'].reshape(*batch_dims, -1) - 1.0, compute_scaling=False) + + c2w = torch.eye(4, device=out['pts3d'].device) + c2w = c2w.view(*([1] * num_batch_dims), 4, 4).repeat(*batch_dims, 1, 1) + c2w[..., :3, :3] = R + c2w[..., :3, 3] = T.view(*batch_dims, 3) + out['c2w'] = c2w + return out + + +def split_list(lst, split_size): + return [lst[i:i + split_size] for i in range(0, len(lst), split_size)] + + +def split_list_of_tensors(tensor, max_bs): + tensor_splits = [] + for s in tensor: + if isinstance(s, list): + tensor_splits.extend(split_list(s, max_bs)) + else: + tensor_splits.extend(torch.split(s, max_bs)) + return tensor_splits + + +def stack_views(true_shape, values, max_bs=None): + # first figure out what the unique aspect ratios are + unique_true_shape, inverse_indices = torch.unique(true_shape, dim=0, return_inverse=True) + + # we group the values that share the same AR + true_shape_stacks = [[] for _ in range(unique_true_shape.shape[0])] + index_stacks = [[] for _ in range(unique_true_shape.shape[0])] + value_stacks = [ + [[] for _ in range(unique_true_shape.shape[0])] + for _ in range(len(values)) + ] + + for i in range(true_shape.shape[0]): + true_shape_stacks[inverse_indices[i]].append(true_shape[i]) + index_stacks[inverse_indices[i]].append(i) + + for j in range(len(values)): + value_stacks[j][inverse_indices[i]].append(values[j][i]) + + # regroup all None values together (these typically are missing encoder features that'll be recomputed later) + for i in range(len(true_shape_stacks)): + # get a mask for each type of value + none_mask = [[vl == None for vl in v[i]] + for v in value_stacks + ] + # apply "or" on all the different types of values + none_mask = [any([v[j] for v in none_mask]) for j in range(len(true_shape_stacks[i]))] + if not any(none_mask) or all(none_mask): + # there was no None or all were 
None skip + continue + not_none_mask = [not x for x in none_mask] + + def get_filtered_list(l, local_mask): + return [v for v, m in zip(l, local_mask) if m] + true_shape_stacks.append(get_filtered_list(true_shape_stacks[i], none_mask)) + true_shape_stacks[i] = get_filtered_list(true_shape_stacks[i], not_none_mask) + + index_stacks.append(get_filtered_list(index_stacks[i], none_mask)) + index_stacks[i] = get_filtered_list(index_stacks[i], not_none_mask) + + for j in range(len(value_stacks)): + value_stacks[j].append(get_filtered_list(value_stacks[j][i], none_mask)) + value_stacks[j][i] = get_filtered_list(value_stacks[j][i], not_none_mask) + + # stack tensors + true_shape_stacks = [torch.stack(true_shape_stack, dim=0) for true_shape_stack in true_shape_stacks] + value_stacks = [ + [torch.stack(v, dim=0) if None not in v else v for v in value_stack] + for value_stack in value_stacks + ] + + # split all sub-tensors in blocks of max_size = max_bs + if max_bs is not None: + true_shape_stacks = split_list_of_tensors(true_shape_stacks, max_bs) + + index_stacks = [torch.tensor(s) for s in index_stacks] + index_stacks = split_list_of_tensors(index_stacks, max_bs) + index_stacks = [s.tolist() for s in index_stacks] + + value_stacks = [ + split_list_of_tensors(value_stack, max_bs) + for value_stack in value_stacks + ] + + # some cleaning, replace list of None by a single None + for value_stack in value_stacks: + for j in range(len(value_stack)): + if isinstance(value_stack[j], list): + if None in value_stack[j]: + value_stack[j] = None + + return true_shape_stacks, index_stacks, *value_stacks + + +@torch.no_grad() +def encoder_multi_ar(encoder, imgs, true_shape, verbose=False, max_bs=None, device=None, preserve_gpu_mem=False): + # forward through dust3r encoder + if verbose: + print(f'running encoder') + nimgs = true_shape.shape[0] + device = device or true_shape.device + outdevice = device if not preserve_gpu_mem else "cpu" + true_shape_stacks, index_stacks, imgs_stacks = stack_views(true_shape, [imgs], max_bs=max_bs) + x, pos = [None for _ in range(nimgs)], [None for _ in range(nimgs)] + + pbar = tqdm(zip(imgs_stacks, true_shape_stacks, index_stacks), + disable=not verbose, total=len(imgs_stacks)) + for imgs_stack, true_shape_stack, index_stack in pbar: + nimgs_stack = imgs_stack.shape[0] + # encode all images (concat them in the batch dimension for efficiency) + x_stack, pos_stack = encoder(imgs_stack.to(device), true_shape_stack.to(device)) + for i in range(nimgs_stack): + x[index_stack[i]] = x_stack[i].to(outdevice) + pos[index_stack[i]] = pos_stack[i].to(outdevice) + + try: + pbar.set_postfix({'Mem_r': str(int(torch.cuda.max_memory_reserved(device) / (1024 ** 2))) + " MB", + 'Mem_a': str(int(torch.cuda.max_memory_allocated(device) / (1024 ** 2))) + " MB"}) + except Exception as e: + pass + return x, pos + + +@torch.no_grad() +def inference_multi_ar_batch(encoder, decoder, imgs, true_shape, mem=None, verbose=False, + encoder_precomputed_features=None, + preserve_gpu_mem=False, post_process_function=lambda x: {'pts3d': x}, device=None, + render=False, viser_server=None): + device = device or true_shape.device + outdevice = device if not preserve_gpu_mem else "cpu" + if encoder_precomputed_features is None: + # already stacked + x, pos = [], [] + for i in range(len(imgs)): + xi, posi = encoder(imgs[i].to(device), true_shape[i].to(device)) + x.append(xi) + pos.append(posi) + else: + x, pos = encoder_precomputed_features + + x = [v.unsqueeze(0).to(device) for v in x] + pos = 
[v.unsqueeze(0).to(device) for v in pos] + true_shape = [v.unsqueeze(0).to(device) for v in true_shape] + imgs = [v.unsqueeze(0).to(device) for v in imgs] + + mem, pointmaps_0 = decoder(x, pos, true_shape, mem, render=render) + pointmaps_0_pp = [] + for pointmaps_0_i in pointmaps_0: + pointmaps_0_i = pointmaps_0_i.squeeze(0) + if post_process_function is not None: + pointmaps_0_i = post_process_function(pointmaps_0_i) + pointmaps_0_i = {k: v.to(outdevice) for k, v in pointmaps_0_i.items()} + else: + pointmaps_0_i = pointmaps_0_i.to(outdevice) + + pointmaps_0_pp.append(pointmaps_0_i) + + return mem, pointmaps_0_pp + + +def _remove_from_mem(mem_values, mem_labels, idx): + to_keep_mask = mem_labels != idx + B, _, D = mem_values[0].shape + mem_values = [ + mem_value[to_keep_mask].view(B, -1, D) + for mem_value in mem_values + ] + mem_labels = mem_labels[to_keep_mask].view(B, -1) + return mem_values, mem_labels + + +def _restore_label_in_mem(mem_labels, old_idx_to_restore, new_idx_to_remove): + mask = mem_labels == new_idx_to_remove + mem_labels[mask] = old_idx_to_restore + return mem_labels + + +def _update_in_mem(old_values, new_values, old_labels, new_labels, old_idx, new_idx): + old_mask = old_labels == old_idx + new_mask = new_labels == new_idx + + for k in range(len(old_values)): # iterate over mem_vals + old_values[k][old_mask] = new_values[k][new_mask] + return old_values + + +@torch.no_grad() +def inference_video_multi_ar(encoder, decoder, imgs, true_shape, mem_batches, + verbose=False, max_bs=None, encoder_precomputed_features=None, + preserve_gpu_mem=False, post_process_function=lambda x: {'pts3d': x}, device=None, + return_mem=False, viser_server=None, num_refinements_iterations=0, local_context_size=25, + is_keyframe_function=lambda id, res, scene_state: (id % 3 == 0), + scene_state=None, scene_state_update_function=lambda res, scene_state: scene_state): + true_shape = torch.stack(true_shape, dim=0) + nimgs = true_shape.shape[0] + device = device or true_shape.device + if encoder_precomputed_features is None: + x = [None for _ in range(nimgs)] + pos = [None for _ in range(nimgs)] + else: + x, pos = encoder_precomputed_features + + # use the decoder to update the memory + # we'll also get first pass pointmaps in pointmaps_0 + # not all images have to update the memory + if verbose: + print(f'updating memory') + mem = None + mem_batches = [0] + np.cumsum(mem_batches).tolist() + pointmaps_0 = [None for _ in range(mem_batches[-1])] + img_labels = {} + keyframes = set() + img_ids = [torch.tensor(v) for v in range(nimgs)] + for _ in range(num_refinements_iterations + 1): + pbar = tqdm(range(len(mem_batches) - 1), disable=not verbose, total=len(mem_batches) - 1) + working_memory_idx = deque() + for i in pbar: + true_shape_i = true_shape[mem_batches[i]:mem_batches[i + 1]] + imgs_i = imgs[mem_batches[i]:mem_batches[i + 1]] + img_ids_i = img_ids[mem_batches[i]:mem_batches[i + 1]] + + # find out if we need to compute some encoder features + x_i = x[mem_batches[i]:mem_batches[i + 1]] + pos_i = pos[mem_batches[i]:mem_batches[i + 1]] + if None in x_i or None in pos_i: + x_i, pos_i = encoder_multi_ar(encoder, imgs_i, true_shape_i, + verbose=False, max_bs=max_bs, device=device) + x[mem_batches[i]:mem_batches[i + 1]] = x_i + pos[mem_batches[i]:mem_batches[i + 1]] = pos_i + + true_shape_stacks_i, index_stacks_i, x_stacks_i, pos_stacks_i, imgs_stacks_i = stack_views(true_shape_i, [x_i, pos_i, imgs_i], + max_bs=max_bs) + + Nmem_before = get_Nmem(mem) + new_mem, pointmaps_0_i = 
inference_multi_ar_batch( + encoder, decoder, imgs_stacks_i, true_shape_stacks_i, mem, verbose=verbose, + encoder_precomputed_features=(x_stacks_i, pos_stacks_i), + preserve_gpu_mem=preserve_gpu_mem, post_process_function=post_process_function, device=device, + viser_server=viser_server + ) + # unstack + pointmaps_0_i = unstack_pointmaps(index_stacks_i, pointmaps_0_i) + pointmaps_0[mem_batches[i]:mem_batches[i + 1]] = pointmaps_0_i + + new_mem = list(new_mem) # cast tuple to list + new_labels = sorted(torch.unique(new_mem[1][:, Nmem_before:])) + new_labels = [int(v) for v in new_labels] + mem = new_mem + local_keyframes = [] + if len(img_labels) == 0: # at initialization, all keyframes (to simplify things a bit) + for j, img_id_i in enumerate(img_ids_i): + img_id_i = int(img_id_i) + img_labels[img_id_i] = new_labels[j] + working_memory_idx.append(img_id_i) + keyframes.add(img_id_i) + local_keyframes.append(True) + scene_state = scene_state_update_function(pointmaps_0_i[j], scene_state) + else: + # for each image, we will run some checks + for j, img_id_i in enumerate(img_ids_i): + img_id_i = int(img_id_i) + + if img_id_i in img_labels: # seen before + # do not check again + # maybe we want to re-check for non_keyframes (it might be slow though ?) + is_keyframe = img_id_i in keyframes + else: + is_keyframe = is_keyframe_function(img_id_i, pointmaps_0_i[j], scene_state) + working_memory_idx.append(img_id_i) + local_keyframes.append(is_keyframe) + if is_keyframe and img_id_i in img_labels: + # if keyframe and seen before, it means we should update it (and remove the tokens) + old_label_j = img_labels[img_id_i] + if old_label_j != 0: # for now ref img is not updated + mem[0] = _update_in_mem(mem[0], mem[0], mem[1], mem[1], old_label_j, new_labels[j]) + mem[0], mem[1] = _remove_from_mem(mem[0], mem[1], new_labels[j]) + elif img_id_i in img_labels: + # not a keyframe, has been seen before but doesn't stay in memory + # let's just relabel it + mem[1] = _restore_label_in_mem(mem[1], img_labels[img_id_i], new_labels[j]) + else: + # never seen before, do nothing + img_labels[img_id_i] = new_labels[j] + if is_keyframe: + keyframes.add(img_id_i) + scene_state = scene_state_update_function(pointmaps_0_i[j], scene_state) + + if viser_server is not None: + viser_server.set_views(img_ids_i, imgs_i, pointmaps_0_i, local_keyframes) + # cleaning + # remove local frames that are out of the local window + while len(working_memory_idx) > local_context_size: + to_remove_id = working_memory_idx.popleft() + if to_remove_id not in keyframes: + mem[0], mem[1] = _remove_from_mem(mem[0], mem[1], img_labels[to_remove_id]) + + # restore mem_nimgs + mem[2] = len(img_labels) + + try: + pbar.set_postfix({'Mem_r': str(int(torch.cuda.max_memory_reserved(device) / (1024 ** 2))) + " MB", + 'Mem_a': str(int(torch.cuda.max_memory_allocated(device) / (1024 ** 2))) + " MB", + "keyframe": len(keyframes), + "Nmem": get_Nmem(mem)}) + if preserve_gpu_mem: + torch.cuda.empty_cache() + except Exception as e: + pass + + # remove all non keyframes from memory to prepare for the new pass + assert mem is not None + while len(working_memory_idx) > 0: + to_remove_id = working_memory_idx.popleft() + if to_remove_id not in keyframes: + mem[0], mem[1] = _remove_from_mem(mem[0], mem[1], img_labels[to_remove_id]) + + pbar.close() + + if return_mem: + return mem, pointmaps_0 + else: + return pointmaps_0 + + +@torch.no_grad() +def inference_multi_ar(encoder, decoder, imgs, img_ids, true_shape, mem_batches, + verbose=False, max_bs=None, 
to_render=None, encoder_precomputed_features=None, + precomputed_mem=None, preserve_gpu_mem=False, post_process_function=lambda x: {'pts3d': x}, + device=None, return_mem=False, viser_server=None, num_refinements_iterations=0): + true_shape = torch.stack(true_shape, dim=0) + nimgs = true_shape.shape[0] + device = device or true_shape.device + if encoder_precomputed_features is None: + x = [None for _ in range(nimgs)] + pos = [None for _ in range(nimgs)] + else: + x, pos = encoder_precomputed_features + + if precomputed_mem is None: + # use the decoder to update the memory + # we'll also get first pass pointmaps in pointmaps_0 + # not all images have to update the memory + if verbose: + print(f'updating memory') + mem = None + mem_batches = [0] + np.cumsum(mem_batches).tolist() + pointmaps_0 = [None for _ in range(mem_batches[-1])] + img_labels = {} + + for _ in range(num_refinements_iterations + 1): + pbar = tqdm(range(len(mem_batches) - 1), disable=not verbose, total=len(mem_batches) - 1) + for i in pbar: + true_shape_i = true_shape[mem_batches[i]:mem_batches[i + 1]] + imgs_i = imgs[mem_batches[i]:mem_batches[i + 1]] + img_ids_i = img_ids[mem_batches[i]:mem_batches[i + 1]] + + # find out if we need to compute some encoder features + x_i = x[mem_batches[i]:mem_batches[i + 1]] + pos_i = pos[mem_batches[i]:mem_batches[i + 1]] + if None in x_i or None in pos_i: + x_i, pos_i = encoder_multi_ar(encoder, imgs_i, true_shape_i, + verbose=False, max_bs=max_bs, device=device) + x[mem_batches[i]:mem_batches[i + 1]] = x_i + pos[mem_batches[i]:mem_batches[i + 1]] = pos_i + + true_shape_stacks_i, index_stacks_i, x_stacks_i, pos_stacks_i, imgs_stacks_i = stack_views(true_shape_i, [x_i, pos_i, imgs_i], + max_bs=max_bs) + + if all([int(img_ids_ij) in img_labels for img_ids_ij in img_ids_i]): + update_mem = True + else: + update_mem = False + + new_mem, pointmaps_0_i = inference_multi_ar_batch( + encoder, decoder, imgs_stacks_i, true_shape_stacks_i, mem, verbose=verbose, + encoder_precomputed_features=(x_stacks_i, pos_stacks_i), + preserve_gpu_mem=preserve_gpu_mem, post_process_function=post_process_function, device=device, + viser_server=viser_server + ) + + Nmem_before = get_Nmem(mem) + new_labels = sorted(torch.unique(new_mem[1][:, Nmem_before:])) + if update_mem: + # here we update the tokens of the image + assert mem is not None + for j, img_id_i in enumerate(img_ids_i): + old_label_j = img_labels[int(img_id_i)] + if old_label_j == 0: + continue # for now ignore ref img + old_mask_j = mem[1] == old_label_j # old mem_labels correspond to this image + new_mask_j = new_mem[1] == new_labels[j] # new mem_labels correspond to this image + # assert torch.sum(old_mask_j) > 0 + # assert torch.sum(new_mask_j) == torch.sum(old_mask_j) + for k in range(len(mem[0])): # iterate over mem_vals + mem[0][k][old_mask_j] = new_mem[0][k][new_mask_j] + del new_mem + else: + mem = new_mem + for j, img_id_i in enumerate(img_ids_i): + img_labels[int(img_id_i)] = int(new_labels[j]) + + # unstack + pointmaps_0_i = unstack_pointmaps(index_stacks_i, pointmaps_0_i) + pointmaps_0[mem_batches[i]:mem_batches[i + 1]] = pointmaps_0_i + if viser_server is not None: + viser_server.set_views(img_ids_i, imgs_i, pointmaps_0_i, [True for _ in range(len(imgs_i))]) + + try: + pbar.set_postfix({'Mem_r': str(int(torch.cuda.max_memory_reserved(device) / (1024 ** 2))) + " MB", + 'Mem_a': str(int(torch.cuda.max_memory_allocated(device) / (1024 ** 2))) + " MB", + "Nmem": get_Nmem(mem)}) + if preserve_gpu_mem: + torch.cuda.empty_cache() + except 
Exception as e: + pass + pbar.close() + else: + pointmaps_0 = None + mem = precomputed_mem + + if to_render is not None: + # with to_render, you can select a list of images to render, instead of rendering all of them + x = [x[v] for v in to_render] + pos = [pos[v] for v in to_render] + true_shape = true_shape[to_render].contiguous() + imgs = [imgs[v] for v in to_render] + img_ids = [img_ids[v] for v in to_render] + nimgs = len(x) + + # render pointmaps using the accumulated memory + assert mem is not None + Nmem = get_Nmem(mem) + if verbose: + print(f"Nmem={Nmem}") + + if nimgs == 0: + pointmaps = [] + if return_mem: + return mem, pointmaps_0, pointmaps + else: + return pointmaps_0, pointmaps + + if verbose: + print(f'rendering {nimgs} extra images') + true_shape_stacks, index_stacks, x_stacks, pos_stacks, imgs_stacks, img_ids_stacks = stack_views(true_shape, + [x, pos, imgs, + img_ids], + max_bs=max_bs) + pbar = tqdm(zip(x_stacks, pos_stacks, true_shape_stacks, imgs_stacks, img_ids_stacks), + disable=not verbose, total=len(x_stacks)) + + pointmaps_stacks = [] + for x_stack, pos_stack, true_shape_stack, imgs_stack, img_ids_stack in pbar: + if x_stack is None or pos_stack is None: + encoder_precomputed_features = None + else: + encoder_precomputed_features = ([x_stack], [pos_stack]) + + _, pointmaps_stack = inference_multi_ar_batch( + encoder, decoder, [imgs_stack], [true_shape_stack], mem, verbose=verbose, + encoder_precomputed_features=encoder_precomputed_features, + preserve_gpu_mem=preserve_gpu_mem, post_process_function=post_process_function, device=device, + render=True, viser_server=viser_server + ) + + pointmaps_stacks.append(pointmaps_stack[0]) + if viser_server is not None: + tmp_pointmaps_unstack = unstack_pointmaps([torch.arange(img_ids_stack.shape[0])], pointmaps_stack) + for i in range(img_ids_stack.shape[0]): + viser_server.set_views([img_ids_stack[i]], [imgs_stack[i]], [tmp_pointmaps_unstack[i]]) + + try: + pbar.set_postfix({'Mem_r': str(int(torch.cuda.max_memory_reserved(device) / (1024 ** 2))) + " MB", + 'Mem_a': str(int(torch.cuda.max_memory_allocated(device) / (1024 ** 2))) + " MB"}) + except Exception as e: + pass + + pointmaps = unstack_pointmaps(index_stacks, pointmaps_stacks) + + if return_mem: + return mem, pointmaps_0, pointmaps + else: + return pointmaps_0, pointmaps + + +def get_Nmem(mem): + if mem is None: + return 0 + mem_labels = mem[1] + _, Nmem = mem_labels.shape + return Nmem + + +def unstack_pointmaps(index_stacks_i, pointmaps_0_i): + num_elements = max([max(index_stack_i) for index_stack_i in index_stacks_i]) + 1 + pointmaps_0 = [None for _ in range(num_elements)] + for pointmaps_0_i_stack, index_stack_i in zip(pointmaps_0_i, index_stacks_i): + out_pointmaps_0_i = {} + for k, v in pointmaps_0_i_stack.items(): + for j in range(v.shape[0]): + if j not in out_pointmaps_0_i: + out_pointmaps_0_i[j] = {} + out_pointmaps_0_i[j][k] = v[j] + + for j in out_pointmaps_0_i.keys(): + pointmaps_0[index_stack_i[j]] = out_pointmaps_0_i[j] + return pointmaps_0 + + +def groupby_consecutive(data): + """ + identify groups of consecutive numbers + """ + if not data: + return [] + # Sort the data to ensure consecutive numbers are adjacent + data = sorted(data) + result = [] + # consecutive numbers have the same (value - index) + for k, g in itertools.groupby(enumerate(data), lambda x: x[1] - x[0]): + group = list(map(lambda x: x[1], g)) + result.append((group[0], group[-1])) + return result + + +def inference_encoder(encoder, imgs, true_shape_view, max_bs=None, 
requires_grad=False): + def encoder_get_context(): + return torch.no_grad() if not requires_grad \ + else nullcontext() + + with encoder_get_context(): + # x, pos = encoder_blk(imgs) + B, nimgs = imgs.shape[:2] + if max_bs is None or B * nimgs <= max_bs: + # encode all images (concat them in the batch dimension for efficiency) + x, pos = encoder(imgs.view(B * nimgs, *imgs.shape[2:]), true_shape_view) + else: + # can also do it slice by slice in case all images don't fit at once + imgs_view = imgs.view(B * nimgs, *imgs.shape[2:]) + x, pos = [], [] + for imgs_view_slice, true_shape_slice in zip(torch.split(imgs_view, max_bs), torch.split(true_shape_view, max_bs)): + xi, posi = encoder(imgs_view_slice, true_shape_slice) + x.append(xi) + pos.append(posi) + x = torch.concatenate(x) + pos = torch.concatenate(pos) + return x.view(B, nimgs, *x.shape[1:]), pos.view(B, nimgs, *pos.shape[1:]) + return x, pos + + +def inference(encoder, decoder, imgs, true_shape, mem_batches, verbose=False, max_bs=None, + train_decoder_skip=0, to_render=None, encoder_requires_grad=False): + # forward through dust3r encoder + B, nimgs = imgs.shape[:2] + true_shape_view = true_shape.view(B * nimgs, 2) + x, pos = inference_encoder(encoder, imgs, true_shape_view, max_bs, encoder_requires_grad) + _, _, N, D = x.shape + + # use the decoder to update the memory + # we'll also get first pass pointmaps in pointmaps_0 + # not all images have to update the memory + mem = None + mem_batches = [0] + np.cumsum(mem_batches).tolist() + # when training for a large number of views, we may want to freeze the decoder for the first views + outshape = None + for i in range(train_decoder_skip): + with torch.no_grad(): + xi = x[:, mem_batches[i]:mem_batches[i + 1]].contiguous() + posi = pos[:, mem_batches[i]:mem_batches[i + 1]].contiguous() + true_shapei = true_shape[:, mem_batches[i]:mem_batches[i + 1]].contiguous() + mem, pt_tmp = decoder(xi, posi, true_shapei, mem, render=False) + if outshape is None: + outshape = pt_tmp.shape + + pointmaps_0 = [] + for i in range(train_decoder_skip, len(mem_batches) - 1): + xi = x[:, mem_batches[i]:mem_batches[i + 1]].contiguous() + posi = pos[:, mem_batches[i]:mem_batches[i + 1]].contiguous() + true_shapei = true_shape[:, mem_batches[i]:mem_batches[i + 1]].contiguous() + mem, pointmaps_0i = decoder(xi, posi, true_shapei, mem, render=False) + if outshape is None: + outshape = pointmaps_0i.shape + + pointmaps_0.append(pointmaps_0i) + + # concatenate the first pass pointmaps together + if len(pointmaps_0) > 0: + # B, mem_batches[-1] - mem_batches[train_decoder_skip], N, D + pointmaps_0 = torch.concatenate(pointmaps_0, dim=1) + else: + pointmaps_0 = torch.empty((B, 0, *outshape[2:]), dtype=x.dtype, device=x.device) + + if to_render is not None: + # with to_render, you can select a list of images to render, instead of rendering all of them + x = x[:, to_render].contiguous() + pos = pos[:, to_render].contiguous() + true_shape = true_shape[:, to_render].contiguous() + imgs = imgs[:, to_render].contiguous() + nimgs = x.shape[1] + + # render pointmaps using the accumulated memory + assert mem is not None + mem_vals, mem_labels, mem_nimgs, mem_protected_imgs, mem_protected_tokens = mem + try: + _, Nmem, Dmem = mem_vals[-1].shape + except Exception as e: + _, Nmem, Dmem = mem_vals[0][-1].shape + if verbose: + print(f"Nmem={Nmem}") + + if nimgs == 0: + pointmaps = torch.empty((B, 0, *pointmaps_0.shape[2:]), dtype=x.dtype, device=x.device) + return pointmaps_0, pointmaps + elif max_bs is None or B * nimgs <= max_bs: 
+ # render all images (concat them in the batch dimension for efficiency) + _, pointmaps = decoder(x, pos, true_shape, mem, render=True) + else: + # can also do it slice by slice in case all images don't fit at once + x_view = x.view(B * nimgs, N, D) + pos_view = pos.view(B * nimgs, N, 2) + true_shape_view = true_shape.view(B * nimgs, 2) + pointmaps = [] + + mem_vals = [mem_vals[i].unsqueeze(1).expand(B, nimgs, Nmem, Dmem).reshape(B * nimgs, Nmem, Dmem) + for i in range(len(mem_vals))] + mem_vals_splits = [torch.split(mem_vals[i], max_bs) for i in range(len(mem_vals))] + mem_labels = mem_labels.unsqueeze(1).expand(B, nimgs, Nmem).reshape(B * nimgs, Nmem) + mem_labels_splits = torch.split(mem_labels, max_bs) + + for lidx, (x_view_slice, pos_view_slice, true_shape_view_slice) in enumerate( + zip(torch.split(x_view, max_bs), + torch.split(pos_view, max_bs), + torch.split(true_shape_view, max_bs))): + memi = [m[lidx] for m in mem_vals_splits] + mem_labelsi = mem_labels_splits[lidx] + _, xi_out = decoder(x_view_slice.unsqueeze(1), + pos_view_slice.unsqueeze(1), + true_shape_view_slice.unsqueeze(1), + (memi, mem_labelsi, mem_nimgs, mem_protected_imgs, mem_protected_tokens), render=True) + pointmaps.append(xi_out.squeeze(1)) + pointmaps = torch.concatenate(pointmaps) + pointmaps = pointmaps.view(B, nimgs, *pointmaps.shape[1:]) + + return pointmaps_0, pointmaps + + +def concat_preds(out0, out): + for k in out.keys(): + if k in out0: + out[k] = torch.concatenate([out0[k], out[k]], dim=1) + return out diff --git a/must3r/engine/io.py b/must3r/engine/io.py new file mode 100644 index 0000000000000000000000000000000000000000..5b920f9330706ebd57e1fed3b91f58586cbe1eea --- /dev/null +++ b/must3r/engine/io.py @@ -0,0 +1,43 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import torch +from pathlib import Path + +import must3r.tools.path_to_dust3r # noqa +import dust3r.utils.path_to_croco # noqa +from croco.utils.misc import save_on_master + + +def save_model(args, epoch, encoder, decoder, optimizer, loss_scaler, fname=None): + output_dir = Path(args.output_dir) + if fname is None: + fname = str(epoch) + checkpoint_path = output_dir / ('checkpoint-%s.pth' % fname) + optim_state_dict = optimizer.state_dict() + to_save = { + 'encoder': encoder.state_dict(), + 'decoder': decoder.state_dict(), + 'optimizer': optim_state_dict, + 'scaler': loss_scaler.state_dict(), + 'args': args, + 'epoch': epoch, + } + print(f'>> Saving model to {checkpoint_path} ...') + save_on_master(to_save, checkpoint_path) + + +def load_model(args, chkpt_path, encoder, decoder, optimizer, loss_scaler): + args.start_epoch = 0 + if chkpt_path is not None: + checkpoint = torch.load(chkpt_path, map_location='cpu', weights_only=False) + + print("Resume checkpoint %s" % chkpt_path) + encoder.load_state_dict(checkpoint['encoder'], strict=False) + decoder.load_state_dict(checkpoint['decoder'], strict=False) + args.start_epoch = checkpoint['epoch'] + 1 + optim_state_dict = checkpoint['optimizer'] + optimizer.load_state_dict(optim_state_dict) + if 'scaler' in checkpoint: + loss_scaler.load_state_dict(checkpoint['scaler']) + else: + print("") + print("With optim & sched! start_epoch={:d}".format(args.start_epoch), end='') diff --git a/must3r/engine/losses.py b/must3r/engine/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..00c6e0251757a06f0f615af164ba5b8d57a5064e --- /dev/null +++ b/must3r/engine/losses.py @@ -0,0 +1,186 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
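+# Note: the default training criterion in must3r/engine/train.py is the string
+# "ConfLoss(Regr3D(L21, norm_mode='?avg_dis', sky_loss_value=2, loss_in_log=args.loss_in_log), alpha=0.2)".
+# Regr3D regresses the global pointmaps ('pts3d') and, when present, the camera-local pointmaps
+# ('pts3d_local'); ConfLoss then reweights the per-pixel error by the predicted confidence ('conf').
+# These keys match the prediction dict produced by must3r.engine.inference.postprocess.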
+import torch +from must3r.tools.geometry import apply_log_to_norm, normalize_pointcloud +import must3r.tools.path_to_dust3r # noqa +from dust3r.utils.geometry import geotrf +from dust3r.losses import Criterion, L21, MultiLoss, Sum # noqa + + +class Regr3D (Criterion, MultiLoss): + def __init__(self, criterion, norm_mode='?avg_dis', sky_loss_value=2, loss_in_log=False): + super().__init__(criterion) + self.loss_in_log = loss_in_log + if norm_mode.startswith('?'): + # use the same scale factor as ground-truth for predictions in metric scale datasets + self.norm_all = False + self.norm_mode = norm_mode[1:] + else: + self.norm_all = True + self.norm_mode = norm_mode + self.sky_loss_value = sky_loss_value + + def get_all_pts3d(self, gt, pred, dist_clip=None): + # everything is normalized w.r.t. camera of view1 + device = pred['pts3d'].device + + gt_c2w = [b['camera_pose'] for b in gt] + gt_c2w = torch.stack(gt_c2w, dim=1).to(device) # B, nimgs, 4, 4 + gt_w2c = torch.linalg.inv(gt_c2w) + + in_camera0 = gt_w2c[:, 0] + + gt_pts3d = [b['pts3d'] for b in gt] + gt_pts3d = torch.stack(gt_pts3d, dim=1).to(device) # B, nimgs, H, W, 3 + + gt_pts3d_local = geotrf(gt_w2c, gt_pts3d) # B, nimgs, H, W, 3 + gt_pts = geotrf(in_camera0, gt_pts3d) # B, nimgs, H, W, 3 + + valid = [b['valid_mask'] for b in gt] + valid = torch.stack(valid, dim=1).to(device).clone() + + is_metric_scale = gt[0]['is_metric_scale'].to(device).clone() + + sky_mask = [b['sky_mask'] for b in gt] + sky_mask = torch.stack(sky_mask, dim=1).to(device).clone() + + if dist_clip is not None: + # points that are too far-away == invalid + dis_g = gt_pts.norm(dim=-1) # (B, nimgs, H, W) + dis_l = gt_pts3d_local.norm(dim=-1) # (B, nimgs, H, W) + valid_g = valid & (dis_g <= dist_clip) + valid_l = valid & (dis_l <= dist_clip) + else: + valid_g = valid + valid_l = valid + + pr_pts = pred['pts3d'].clone() + if 'pts3d_local' in pred: + pr_pts_local = pred['pts3d_local'].clone() + else: + pr_pts_local = None + + if not self.norm_all: + mask = ~is_metric_scale + else: + mask = torch.ones_like(is_metric_scale) + + # normalize 3d points + if self.norm_mode and mask.any(): + pr_pts[mask], norm_factor_pred = normalize_pointcloud(pr_pts[mask], None, self.norm_mode, valid[mask], None, + ret_factor=True) + if pr_pts_local is not None: + pr_pts_local[mask] = pr_pts_local[mask] / norm_factor_pred + + if self.norm_mode: + gt_pts, norm_factor = normalize_pointcloud(gt_pts, None, self.norm_mode, valid, None, ret_factor=True) + gt_pts3d_local = gt_pts3d_local / norm_factor + pr_pts[~mask] = pr_pts[~mask] / norm_factor[~mask] + if pr_pts_local is not None: + pr_pts_local[~mask] = pr_pts_local[~mask] / norm_factor[~mask] + + # return sky segmentation, making sure they don't include any labelled 3d points + sky_g = sky_mask & (~valid_g) + sky_l = sky_mask & (~valid_l) + return gt_pts, gt_pts3d_local, pr_pts, pr_pts_local, valid_g, valid_l, sky_g, sky_l, {} + + def compute_loss(self, gt, pred, **kw): + gt_pts, gt_pts3d_local, pred_pts, pred_pts_local, mask_g, mask_l, sky_g, sky_l, monitoring = \ + self.get_all_pts3d(gt, pred, **kw) + + if self.sky_loss_value > 0: + assert self.criterion.reduction == 'none', 'sky_loss_value should be 0 if no conf loss' + # add the sky pixel as "valid" pixels... 
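+            # (merging sky pixels into the valid masks gives them a per-pixel loss entry; those
+            # entries are overwritten with the constant sky_loss_value further down, which is why
+            # criterion.reduction must be 'none' here, as checked by the assert above)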
+ mask_g = mask_g | sky_g + mask_l = mask_l | sky_l + + # loss on pts3d global + gt_pts = gt_pts[mask_g] + if self.loss_in_log: + gt_pts = apply_log_to_norm(gt_pts, dim=-1) + pred_pts = apply_log_to_norm(pred_pts, dim=-1) + pred_pts_m = pred_pts[mask_g] + + l1 = self.criterion(pred_pts_m, gt_pts) + + # loss on pts3d local + if pred_pts_local is not None: + pred_pts_local = pred_pts_local[mask_l] + gt_pts3d_local = gt_pts3d_local[mask_l] + if self.loss_in_log and self.loss_in_log != 'before': + gt_pts3d_local = apply_log_to_norm(gt_pts3d_local, dim=-1) + pred_pts_local = apply_log_to_norm(pred_pts_local, dim=-1) + l2 = self.criterion(pred_pts_local, gt_pts3d_local) + else: + l2 = None + + if self.sky_loss_value > 0: + assert self.criterion.reduction == 'none', 'sky_loss_value should be 0 if no conf loss' + # ... but force the loss to be high there + l1 = torch.where(sky_g[mask_g], self.sky_loss_value, l1) + if l2 is not None: + l2 = torch.where(sky_l[mask_l], self.sky_loss_value, l2) + + self_name = type(self).__name__ + details = {self_name + '_pts3d': float(l1.mean())} + if l2 is not None: + details[self_name + '_pts3d_local'] = float(l2.mean()) + return Sum((l1, mask_g), (l2, mask_l)), (details | monitoring) + + +class ConfLoss (MultiLoss): + """ Weighted regression by learned confidence. + Assuming the input pixel_loss is a pixel-level regression loss. + + Principle: + high-confidence means high conf = 0.1 ==> conf_loss = x / 10 + alpha*log(10) + low confidence means low conf = 10 ==> conf_loss = x * 10 - alpha*log(10) + + alpha: low impact parameter? + """ + + def __init__(self, pixel_loss, alpha=1): + super().__init__() + assert alpha > 0 + self.alpha = alpha + self.pixel_loss = pixel_loss.with_reduction('none') + + def get_name(self): + return f'ConfLoss({self.pixel_loss})' + + def get_conf_log(self, x): + return x, torch.log(x) + + def compute_loss(self, gt, pred, **kw): + # compute per-pixel loss + ((loss_g, msk_g), (loss_l, msk_l)), details = self.pixel_loss(gt, pred, **kw) + + # weight by confidence + if 'conf' not in pred: + # not an actual conf loss, so do nothing + conf_loss_g = loss_g.mean() if loss_g.numel() > 0 else 0 + if loss_l is not None: + conf_loss_l = loss_l.mean() if loss_l.numel() > 0 else 0 + else: + conf_loss_l = 0 + details_conf = dict(conf_loss_g=float(conf_loss_g), **details) + if loss_l is not None: + details_conf['conf_loss_l'] = float(conf_loss_l) + return conf_loss_g + conf_loss_l, details_conf + else: + # compute conf loss for global point and local pointmap separately, then sum + conf_pred = pred['conf'][msk_g] + conf_g, log_conf_g = self.get_conf_log(conf_pred) + conf_loss_g = loss_g * conf_g - self.alpha * log_conf_g + # average + nan protection (in case of no valid pixels at all) + conf_loss_g = conf_loss_g.mean() if conf_loss_g.numel() > 0 else 0 + + if loss_l is not None: + conf_l, log_conf_l = self.get_conf_log(pred['conf'][msk_l]) + conf_loss_l = loss_l * conf_l - self.alpha * log_conf_l + conf_loss_l = conf_loss_l.mean() if conf_loss_l.numel() > 0 else 0 + else: + conf_loss_l = 0 + details_conf = dict(conf_loss_g=float(conf_loss_g), **details) + if loss_l is not None: + details_conf['conf_loss_l'] = float(conf_loss_l) + return conf_loss_g + conf_loss_l, details_conf diff --git a/must3r/engine/optimizer.py b/must3r/engine/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..7d4084b204669078e16c584e53e720bc404878e2 --- /dev/null +++ b/must3r/engine/optimizer.py @@ -0,0 +1,74 @@ +# Copyright (C) 2025-present Naver 
Corporation. All rights reserved. +import json + + +def _get_num_layer_for_vit(var_name, depth, offset=0): + if var_name.startswith('feat_embed') or var_name.startswith("patch_embed"): + return 0 + offset + elif var_name.startswith("blocks"): + layer_id = int(var_name.split('.')[1]) + return layer_id + 1 + offset + elif var_name.startswith('norm'): # part of the last block + return depth + offset + elif any(var_name.startswith(k) for k in ['head', 'prediction_head']): + return depth + 1 + offset + else: + raise NotImplementedError(var_name) + + +def get_parameter_groups(model, offset, weight_decay, layer_decay=1.0, skip_list=(), no_lr_scale_list=[]): + parameter_group_names = {} + parameter_group_vars = {} + depth = None + # prepare layer decay values + assert layer_decay == 1.0 or 0. < layer_decay < 1. + if layer_decay < 1.: + depth = model.depth + num_layers = depth + offset + layer_decay_values = list(layer_decay ** (num_layers + 1 - i) for i in range(num_layers + 2)) + + for name, param in model.named_parameters(): + if not param.requires_grad: + continue # frozen weights + + # Assign weight decay values + if name.endswith(".bias") or 'norm' in name or name in skip_list: + group_name = "no_decay" + this_weight_decay = 0. + else: + group_name = "decay" + this_weight_decay = weight_decay + + # Assign layer ID for LR scaling + if layer_decay < 1.: + skip_scale = False + layer_id = _get_num_layer_for_vit(name, depth, offset) + group_name = "layer_%d_%s" % (layer_id, group_name) + if name in no_lr_scale_list: + skip_scale = True + group_name = f'{group_name}_no_lr_scale' + else: + layer_id = 0 + skip_scale = True + + if group_name not in parameter_group_names: + if not skip_scale: + scale = layer_decay_values[layer_id] + else: + scale = 1. + + parameter_group_names[group_name] = { + "weight_decay": this_weight_decay, + "params": [], + "lr_scale": scale + } + parameter_group_vars[group_name] = { + "weight_decay": this_weight_decay, + "params": [], + "lr_scale": scale + } + + parameter_group_vars[group_name]["params"].append(param) + parameter_group_names[group_name]["params"].append(name) + print("Param groups = %s" % json.dumps(parameter_group_names, indent=2)) + return list(parameter_group_vars.values()) diff --git a/must3r/engine/train.py b/must3r/engine/train.py new file mode 100644 index 0000000000000000000000000000000000000000..86f77bd2e2f5c9b74893af6c2d52c009be430e2a --- /dev/null +++ b/must3r/engine/train.py @@ -0,0 +1,510 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
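+# Launch sketch (assumptions, not part of this file: a small wrapper script that builds
+# get_args_parser() and calls train(args), and torchrun as the launcher; paths, dataset expression
+# and process count below are placeholders). train() asserts that MKL_NUM_THREADS,
+# NUMEXPR_NUM_THREADS and OMP_NUM_THREADS are all set to '1' before doing anything else:
+#   OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 NUMEXPR_NUM_THREADS=1 torchrun --nproc_per_node=8 \
+#     <wrapper.py> --dataset "<dataset expr>" --dust3r_chkpt <dust3r.pth> --output_dir out/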
+ +import argparse +import datetime +import json +import numpy as np +import os +import sys +import time +import math +from pathlib import Path +from typing import Sized +from itertools import chain + +import torch +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter + +from must3r.datasets import * +from must3r.model import * # noqa: F401, needed when loading the model +from must3r.model.blocks.attention import toggle_memory_efficient_attention +import must3r.engine.optimizer as optim +from must3r.engine.inference import inference, concat_preds, postprocess +from must3r.engine.losses import * +import must3r.engine.io as checkpoints + +import must3r.tools.path_to_dust3r # noqa +import dust3r.utils.path_to_croco # noqa: F401 +from croco.utils.misc import NativeScalerWithGradNormCount as NativeScaler # noqa +import croco.utils.misc as dist +from croco.utils.misc import MetricLogger, SmoothedValue + + +def get_args_parser(): + parser = argparse.ArgumentParser('DUST3R training', add_help=False) + # model and criterion + + parser.add_argument('--encoder', default="Dust3rEncoder()", type=str, help="dust3r encoder init") + parser.add_argument('--decoder', default="CausalMUSt3R()", help='decoder init') + + parser.add_argument('--memory_num_views', default=10, type=int, + help="max number of views to use when updating the memory") + parser.add_argument('--memory_batch_views', default=None, type=int, + help="max number of views to use when updating the memory") + parser.add_argument('--min_memory_num_views', default=2, type=int, + help="min number of views to use when updating the memory") + + parser.add_argument('--causal', action='store_true', default=False, help="update the memory in a single forward") + parser.add_argument('--ignore_dataloader_memory_num_views', action='store_true', default=False) + + parser_render = parser.add_mutually_exclusive_group() + parser_render.add_argument('--render_once', action='store_true', default=False) + parser_render.add_argument('--disable_render', action='store_true', default=False) + parser.add_argument('--max_render_count', default=None, type=int) + + parser.add_argument('--finetune_encoder', default=False, action='store_true', help="Also finetune dust3r's encoder") + parser.add_argument('--loss_in_log', action='store_true', default=False) + parser.add_argument('--criterion', + default="ConfLoss(Regr3D(L21, norm_mode='?avg_dis', sky_loss_value=2, loss_in_log=args.loss_in_log), alpha=0.2)", + type=str, help="loss") + + parser_chkpt = parser.add_mutually_exclusive_group() + parser_chkpt.add_argument('--dust3r_chkpt', default=None, type=str, help="path to dust3r encoder weights") + parser_chkpt.add_argument('--croco_chkpt', default=None, type=str, help="path to croco decoder weights") + parser_chkpt.add_argument('--chkpt', default=None, type=str, help="optional path to decoder weights") + + # dataset + parser.add_argument('--dataset', required=True, type=str, help="training set") + + # training + parser.add_argument('--seed', default=777, type=int, help="Random seed") + parser.add_argument('--batch_size', default=2, type=int, + help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus") + parser.add_argument('--accum_iter', default=2, type=int, + help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)") + parser.add_argument('--max_batch_size', default=None, type=int) + + parser.add_argument('--epochs', default=20, type=int, help="Maximum number of 
epochs for the scheduler") + + parser.add_argument('--weight_decay', type=float, default=0.05, help="weight decay (default: 0.05)") + parser.add_argument('--lr', type=float, default=None, metavar='LR', help='learning rate (absolute lr)') + parser.add_argument('--blr', type=float, default=1.5e-4, metavar='LR', + help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') + parser.add_argument('--min_lr', type=float, default=0., metavar='LR', + help='lower lr bound for cyclic schedulers that hit 0') + parser.add_argument('--warmup_epochs', type=int, default=6, metavar='N', help='epochs to warmup LR') + parser.add_argument('--warmup_lr', type=float, default=0., help='lr at the start of warm-up') + + parser.add_argument('--amp', choices=[False, "bf16", "fp16"], default=False, + help="Use Automatic Mixed Precision for pretraining") + parser.add_argument('--use_memory_efficient_attention', action='store_true', + help='use flash attention or xformers mem_eff_attention.') + parser.add_argument("--disable_cudnn_benchmark", action='store_true', default=False, + help="set cudnn.benchmark = False") + parser.add_argument("--disable_tf32", action='store_true', default=False, + help="set cudnn.benchmark = False") + + # others + parser.add_argument('--num_workers', default=8, type=int) + parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') + parser.add_argument('--local_rank', default=-1, type=int) + parser.add_argument('--dist_on_itp', action='store_true') + parser.add_argument('--nodist', action='store_true') + parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') + + parser.add_argument('--keep_freq', default=5, type=int, + help='frequence (number of epochs) to save checkpoint in checkpoint-%d.pth') + parser.add_argument('--print_freq', default=20, type=int, + help='frequence (number of iterations) to print infos while training') + + # output dir + parser.add_argument('--output_dir', default='./output/', type=str, help="path where to save the output") + return parser + + +def build_dataset(args, dataset=None): + if dataset is None: + dataset = getattr(args, 'dataset') + + print(f'Building Data loader for dataset: ', dataset) + loader = get_data_loader(dataset, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_mem=True, + shuffle=True, + drop_last=True) + + print(f"dataset length: ", len(loader)) + return loader + + +def select_batch(device, args, rng, memory_num_views, progress, imgs, true_shape, nimgs): + to_skip = 0 + to_render = None + + if args.memory_num_views < nimgs: + # in this scenario, we will update part of the memory in no_grad + # we allow more and more images to be no_grad in a curriculum way + memory_num_views = 1 + max_views = math.ceil(args.memory_num_views + progress * (nimgs - args.memory_num_views)) + max_views = min(max_views, nimgs) + # choose how many images to no_grad + to_skip = rng.choice(max_views - args.min_memory_num_views + 1) + if to_skip < args.min_memory_num_views: + # let's not split the intialization + to_skip = 0 + memory_num_views = args.min_memory_num_views + + max_n_imgs = min(to_skip + memory_num_views + args.memory_num_views, max_views) + imgs = imgs[:, :max_n_imgs].contiguous() + true_shape = true_shape[:, :max_n_imgs].contiguous() + + number_unseen = max_n_imgs - (to_skip + memory_num_views) + if args.render_once: + # render only unseen images + if number_unseen > 0: + to_render = torch.randperm(number_unseen, device=device) + to_skip + 
memory_num_views + else: + to_render = [] + else: + # render half unseen, half random images + to_render = torch.randperm(number_unseen, device=device) + to_skip + memory_num_views + to_render = to_render[:math.ceil(args.memory_num_views / 2)] + + n_selected = len(to_render) + to_render = torch.concatenate([to_render, + torch.randperm((to_skip + memory_num_views), device=device)[:(args.memory_num_views - n_selected)]]) + elif args.render_once: + # render only unseen images + to_render = list(range(memory_num_views, nimgs)) + + to_skip_batches = [] + mem_batches = [] + if args.memory_batch_views is not None: + if not args.causal: + # will process multiple images at once + # will pick a random number of images to process each time + if to_skip > 0: + assert to_skip >= args.min_memory_num_views + while (sum_b := sum(to_skip_batches)) != to_skip: + size_b = rng.choice(min(args.memory_batch_views, to_skip)) + 1 + size_b = min(size_b, to_skip - sum_b) + to_skip_batches.append(size_b) + while (sum_b := sum(mem_batches)) != memory_num_views: + size_b = rng.choice(min(args.memory_batch_views, memory_num_views)) + 1 + size_b = min(size_b, memory_num_views - sum_b) + mem_batches.append(size_b) + else: + # will process multiple images at once, maximum memory_batch_views + if to_skip > 0: + assert to_skip >= args.min_memory_num_views + while (sum_b := sum(to_skip_batches)) != to_skip: + size_b = min(args.memory_batch_views, to_skip - sum_b) + to_skip_batches.append(size_b) + while (sum_b := sum(mem_batches)) != memory_num_views: + size_b = min(args.memory_batch_views, memory_num_views - sum_b) + mem_batches.append(size_b) + else: + # process it dust3r like, one image at a time, except for initialization + if not args.causal: + if to_skip > 0: + assert to_skip >= args.min_memory_num_views + to_skip_batches = [args.min_memory_num_views] + \ + [1 for _ in range(to_skip - args.min_memory_num_views)] + mem_batches = [1 for _ in range(memory_num_views)] + else: + mem_batches = [args.min_memory_num_views] + \ + [1 for _ in range(memory_num_views - args.min_memory_num_views)] + else: + if to_skip > 0: + assert to_skip >= args.min_memory_num_views + to_skip_batches = [to_skip] + else: + mem_batches = [memory_num_views] + + return imgs, true_shape, memory_num_views, to_skip, to_render, to_skip_batches, mem_batches + + +def train(args): + assert os.environ.get('MKL_NUM_THREADS') == '1', 'otherwise inefficient' + assert os.environ.get('NUMEXPR_NUM_THREADS') == '1', 'otherwise inefficient' + assert os.environ.get('OMP_NUM_THREADS') == '1', 'otherwise inefficient' + + dist.init_distributed_mode(args) + global_rank = dist.get_rank() + world_size = dist.get_world_size() + + torch.backends.cuda.matmul.allow_tf32 = not args.disable_tf32 + torch.backends.cudnn.allow_tf32 = not args.disable_tf32 + + toggle_memory_efficient_attention(enabled=args.use_memory_efficient_attention) + + seed = args.seed + dist.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + print("output_dir: " + args.output_dir) + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + + # auto resume + last_ckpt_fname = os.path.join(args.output_dir, f'checkpoint-last.pth') + last_ckpt_fname = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None + + print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(', ', ',\n')) + + device = "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device(device) + + cudnn.benchmark = not args.disable_cudnn_benchmark 
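+    # What follows: build the training data loader, instantiate the encoder/decoder by eval'ing the
+    # --encoder/--decoder constructor strings, and load pretrained weights from exactly one of the
+    # mutually exclusive --chkpt / --dust3r_chkpt / --croco_chkpt options (otherwise training starts
+    # from scratch). If a checkpoint-last.pth already exists in output_dir, it is auto-resumed
+    # instead, via checkpoints.load_model further down.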
+ + # training dataset and loader + print('Building train dataset {:s}'.format(args.dataset)) + start_time = time.time() + data_loader_train = build_dataset(args) + + # model + print('Loading encoder: {:s}'.format(args.encoder)) + encoder = eval(args.encoder) + print('Loading decoder: {:s}'.format(args.decoder)) + decoder = eval(args.decoder) + + print(f'>> Creating criterion') + criterion = eval(args.criterion) + + encoder.to(device) + decoder.to(device) + encoder_without_ddp = encoder + decoder_without_ddp = decoder + print("encoder = %s" % str(encoder_without_ddp)) + print("decoder = %s" % str(decoder_without_ddp)) + + if args.chkpt and last_ckpt_fname is None: + print('Loading pretrained: ', args.chkpt) + ckpt = torch.load(args.chkpt, map_location=device, weights_only=False) + print(encoder.load_state_dict(ckpt['encoder'], strict=False)) + print(decoder.load_state_dict(ckpt['decoder'], strict=False)) + del ckpt # in case it occupies memory + elif args.dust3r_chkpt is not None and last_ckpt_fname is None: + # load dust3r encoder + print('Loading pretrained: ', args.dust3r_chkpt) + ckpt = torch.load(args.dust3r_chkpt, map_location=device, weights_only=False) + encoder.from_dust3r(ckpt['model']) + decoder.from_dust3r(ckpt['model']) + elif args.croco_chkpt is not None and last_ckpt_fname is None: + # load croco decoder + print('Loading pretrained: ', args.croco_chkpt) + ckpt = torch.load(args.croco_chkpt, map_location=device, weights_only=False) + encoder.from_croco(ckpt['model']) + decoder.from_croco(ckpt['model']) + elif last_ckpt_fname is None: + print('from scratch') + + eff_batch_size = args.batch_size * args.accum_iter * dist.get_world_size() + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + encoder = torch.nn.parallel.DistributedDataParallel( + encoder, device_ids=[args.gpu], find_unused_parameters=False, static_graph=False, broadcast_buffers=True) + encoder_without_ddp = encoder.module + + decoder = torch.nn.parallel.DistributedDataParallel( + decoder, device_ids=[args.gpu], find_unused_parameters=False, static_graph=False, broadcast_buffers=True) + decoder_without_ddp = decoder.module + + # following timm: set wd as 0 for bias and norm layers* + param_groups = [] + if args.finetune_encoder: + param_groups += optim.get_parameter_groups(encoder_without_ddp, 0, args.weight_decay) + param_groups += optim.get_parameter_groups(decoder_without_ddp, encoder_without_ddp.depth, args.weight_decay) + + optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) + print(optimizer) + loss_scaler = NativeScaler() + + def write_log_stats(epoch, train_stats): + if dist.is_main_process(): + if log_writer is not None: + log_writer.flush() + + log_stats = dict(epoch=epoch, **{f'train_{k}': v for k, v in train_stats.items()}) + + with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: + f.write(json.dumps(log_stats) + "\n") + + def save_model(epoch, fname): + checkpoints.save_model(args=args, encoder=encoder_without_ddp, decoder=decoder_without_ddp, + optimizer=optimizer, loss_scaler=loss_scaler, + epoch=epoch, fname=fname) + + checkpoints.load_model(args=args, chkpt_path=last_ckpt_fname, encoder=encoder_without_ddp, + decoder=decoder_without_ddp, optimizer=optimizer, + 
loss_scaler=loss_scaler) + if global_rank == 0 and args.output_dir is not None: + log_writer = SummaryWriter(log_dir=args.output_dir) + else: + log_writer = None + + args.pointmaps_activation = get_pointmaps_activation(decoder_without_ddp) + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + for epoch in range(args.start_epoch, args.epochs): + # Train + train_stats = train_one_epoch( + encoder, decoder, criterion, data_loader_train, + optimizer, device, epoch, loss_scaler, + log_writer=log_writer, + args=args) + + write_log_stats(epoch, train_stats) + + # Save the 'last' checkpoint + if epoch >= args.start_epoch: + save_model(epoch, 'last') + if args.keep_freq and epoch % args.keep_freq == 0: + save_model(epoch, str(epoch)) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + save_final_model(args, args.epochs, encoder=encoder_without_ddp, decoder=decoder_without_ddp) + + +def save_final_model(args, epoch, encoder, decoder): + output_dir = Path(args.output_dir) + checkpoint_path = output_dir / 'checkpoint-final.pth' + + to_save = { + 'args': args, + 'encoder': encoder if isinstance(encoder, dict) else encoder.state_dict(), + 'decoder': decoder if isinstance(decoder, dict) else decoder.state_dict(), + 'epoch': epoch + } + print(f'>> Saving model to {checkpoint_path} ...') + dist.save_on_master(to_save, checkpoint_path) + + +def train_one_epoch(encoder: torch.nn.Module, decoder: torch.nn.Module, + criterion: torch.nn.Module, + data_loader: Sized, optimizer: torch.optim.Optimizer, + device: torch.device, epoch: int, loss_scaler, + args, + log_writer=None): + assert torch.backends.cuda.matmul.allow_tf32 == (not args.disable_tf32) + + # torch.set_anomaly_enabled(True) + encoder.train(args.finetune_encoder) + decoder.train(True) + + metric_logger = MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + accum_iter = args.accum_iter + + if log_writer is not None: + print('log_dir: {}'.format(log_writer.log_dir)) + + if hasattr(data_loader, 'dataset') and hasattr(data_loader.dataset, 'set_epoch'): + data_loader.dataset.set_epoch(epoch) + if hasattr(data_loader, 'sampler') and hasattr(data_loader.sampler, 'set_epoch'): + data_loader.sampler.set_epoch(epoch) + + optimizer.zero_grad() + + # fix the seed + seed = args.seed + epoch * dist.get_world_size() + dist.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + rng = np.random.default_rng(seed=args.seed + epoch) + + dtype = get_dtype(args) + for data_iter_step, batch in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)): + epoch_f = epoch + data_iter_step / len(data_loader) + progress = epoch_f / args.epochs + + # we use a per iteration (instead of per epoch) lr scheduler + if data_iter_step % accum_iter == 0: + dist.adjust_learning_rate(optimizer, epoch_f, args) + + imgs = [b['img'] for b in batch] + imgs = torch.stack(imgs, dim=1).to(device) # B, nimgs, 3, H, W + B, nimgs, three, H, W = imgs.shape + + true_shape = [b['true_shape'] for b in batch] + true_shape = torch.stack(true_shape, dim=1).to(device) # B, nimgs, 3, H, W + + if args.ignore_dataloader_memory_num_views: # similar to the CVPR implementation: extra images may not overlap with the keyframes + memory_num_views = rng.choice(args.memory_num_views - args.min_memory_num_views + 1) \ + + args.min_memory_num_views + else: + 
memory_num_views = int(batch[0]['memory_num_views'][0]) + imgs, true_shape, memory_num_views, to_skip, to_render, to_skip_batches, mem_batches = select_batch( + device, args, rng, memory_num_views, progress, imgs, true_shape, nimgs) + + mem_batches = to_skip_batches + mem_batches + + finetune_encoder = args.finetune_encoder + if args.max_render_count is not None: + if to_render is None: + to_render = list(range(nimgs)) + to_render = rng.choice(to_render, size=args.max_render_count, replace=False) + if args.disable_render: + to_render = [] + with torch.autocast("cuda", dtype=dtype): + x_out_0, x_out = inference(encoder, decoder, imgs, true_shape, mem_batches, + train_decoder_skip=len(to_skip_batches), + max_bs=args.max_batch_size, + to_render=to_render, encoder_requires_grad=finetune_encoder) + with torch.autocast("cuda", dtype=torch.float32): + x_out_0 = postprocess(x_out_0, pointmaps_activation=args.pointmaps_activation) + x_out = postprocess(x_out, pointmaps_activation=args.pointmaps_activation) + + b0 = batch[to_skip:(to_skip + memory_num_views)] + if to_render is None: + br = batch + else: + br = [batch[i] for i in to_render] + gt = b0 + br + x_out = concat_preds(x_out_0, x_out) + + loss, loss_details = criterion(gt, x_out) + loss_value = float(loss) + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value), force=True) + sys.exit(1) + + loss /= accum_iter + if args.finetune_encoder: + parameters_chain = chain(encoder.parameters(), decoder.parameters()) + else: + parameters_chain = decoder.parameters() + loss_scaler(loss, optimizer, parameters=parameters_chain, + update_grad=(data_iter_step + 1) % accum_iter == 0) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + del loss + del batch + + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(epoch=epoch_f) + metric_logger.update(lr=lr) + metric_logger.update(loss=loss_value, **loss_details) + + if (data_iter_step + 1) % accum_iter == 0 and ((data_iter_step + 1) % (accum_iter * args.print_freq)) == 0: + loss_value_reduce = dist.all_reduce_mean(loss_value) # MUST BE EXECUTED BY ALL NODES + if log_writer is None: + continue + """ We use epoch_1000x as the x-axis in tensorboard. + This calibrates different curves when batch size changes. + """ + epoch_1000x = int(epoch_f * 1000) + log_writer.add_scalar('train_loss', loss_value_reduce, epoch_1000x) + log_writer.add_scalar('train_lr', lr, epoch_1000x) + log_writer.add_scalar('train_iter', epoch_1000x, epoch_1000x) + for name, val in loss_details.items(): + log_writer.add_scalar('train_' + name, val, epoch_1000x) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} diff --git a/must3r/model/__init__.py b/must3r/model/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..57df27f8f175d2ed3316a3b3451fdeddb4e22585 --- /dev/null +++ b/must3r/model/__init__.py @@ -0,0 +1,108 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
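+# Sketch of typical use (the checkpoint path and img_size value are placeholders):
+#   encoder, decoder = load_model('checkpoints/checkpoint-final.pth', device='cuda', img_size=512)
+# load_model re-instantiates the encoder/decoder by eval'ing the constructor strings stored in the
+# checkpoint's 'args' (converting CausalMUSt3R -> MUSt3R and forcing landscape_only=False), and can
+# rewrite img_size and the RoPE pos_embed specification; see convert_decoder_args and
+# set_image_size_in_args below.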
+import re +from .encoder import Dust3rEncoder # noqa +from .decoder import * # noqa +from .blocks.head import ActivationType, apply_activation # noqa + + +def get_pointmaps_activation(decoder, verbose=True): + try: + pointmaps_activation = decoder.pointmaps_activation + except Exception as e: + pointmaps_activation = ActivationType.NORM_EXP + if verbose: + print(f'pointmaps_activation set to {pointmaps_activation}') + return pointmaps_activation + + +def get_dtype(amp): + if amp == "fp16": + dtype = torch.float16 + elif amp == "bf16": + assert torch.cuda.is_bf16_supported() + dtype = torch.bfloat16 + else: + assert not amp + dtype = torch.float32 + return dtype + + +def load_model(chkpt_path, encoder=None, decoder=None, device='cuda', img_size=None, memory_mode=None, verbose=True): + ckpt = torch.load(chkpt_path, map_location='cpu', weights_only=False) + + encoder_args = encoder or ckpt['args'].encoder + decoder_args = decoder or convert_decoder_args(ckpt['args'].decoder) + if img_size is not None: + encoder_args = set_image_size_in_args(encoder_args, img_size, verbose=verbose) + decoder_args = set_image_size_in_args(decoder_args, img_size, verbose=verbose) + encoder = eval(encoder_args) + decoder = eval(decoder_args) + if memory_mode is not None: + decoder.change_memory_mode(memory_mode) + + encoder.load_state_dict(ckpt['encoder'], strict=True) + decoder.load_state_dict(ckpt['decoder'], strict=True) + encoder.to(device) + decoder.to(device) + encoder.eval() + decoder.eval() + + return encoder, decoder + + +def convert_decoder_args(decoder_args): + dec_corresp_dict = {'CausalMUSt3R': 'MUSt3R', + 'landscape_only=True': "landscape_only=False", + } + + decoder_args = decoder_args.replace(' ', '') + for k, v in dec_corresp_dict.items(): + decoder_args = decoder_args.replace(k, v) + if 'landscape_only=False' not in decoder_args: + decoder_args = decoder_args[:-1] + ",landscape_only=False)" + return decoder_args + + +def set_image_size_in_args(model_args, img_size, verbose=True): + model_args = model_args.replace(' ', '') + + match_size = re.search(r'img_size=\((\d+),(\d+)\)', model_args) + if not match_size: + raise ValueError("No image_size tuple found in model args") + h, w = map(int, match_size.groups()) + assert h == w + if verbose: + print(f"image_size {h} -> {img_size}") + + match_adaptative_pos_embed = re.search(r"pos_embed='([A-Za-z]+)(\d+)\_(\d+):(\d+)'", model_args) + if match_adaptative_pos_embed: + prefix, freq, base_size, new_size = match_adaptative_pos_embed.groups() + freq, base_size, new_size = map(int, (freq, base_size, new_size)) + pos_embed_is_arg = True + else: + match_bare_pos_embed = re.search(r"pos_embed='([A-Za-z]+)(\d+)'", model_args) + if match_bare_pos_embed: + prefix, freq = match_bare_pos_embed.groups() + freq = int(freq) + pos_embed_is_arg = True + else: + # default value + prefix, freq = "RoPE", 100 + pos_embed_is_arg = False + base_size = new_size = h + + if verbose: + print(f"Parsed pos_embed: {prefix}{freq}, base size = {base_size}") + + if img_size != h: + model_args = model_args.replace(f'img_size=({h},{h})', f'img_size=({img_size},{img_size})') + if img_size != new_size: + new_pos_embed = f"{prefix}{freq}_{base_size}:{img_size}" + if pos_embed_is_arg: + model_args = re.sub( + r"(pos_embed=')(?:[A-Za-z]+\d+(?:_\d+:\d+)?)(')", + rf"\1{new_pos_embed}\2", + model_args) + else: + model_args = model_args[:-1] + ",pos_embed='" + new_pos_embed + "')" + return model_args diff --git a/must3r/model/blocks/__init__.py b/must3r/model/blocks/__init__.py new file mode 
100644 index 0000000000000000000000000000000000000000..ed5103edfcc1f403899e6323d3b9690e9a0278ee --- /dev/null +++ b/must3r/model/blocks/__init__.py @@ -0,0 +1,16 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import torch + + +def get_current_dtype(default_dtype, verbose=False): + current_dtype = default_dtype + try: + if torch.is_autocast_cpu_enabled(): + current_dtype = torch.get_autocast_cpu_dtype() + elif torch.is_autocast_enabled(): + current_dtype = torch.get_autocast_gpu_dtype() + except Exception as e: + pass + if verbose: + print(current_dtype) + return current_dtype diff --git a/must3r/model/blocks/attention.py b/must3r/model/blocks/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..95b0615799c0fdd75716012f6dc6cb5e8898ec1e --- /dev/null +++ b/must3r/model/blocks/attention.py @@ -0,0 +1,149 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import torch +import torch.nn as nn + +try: + import xformers.ops + has_xformers = True +except Exception as e: + has_xformers = False + +try: + from torch.nn.functional import scaled_dot_product_attention # noqa + has_scaled_dot_product_attention = True +except Exception as e: + has_scaled_dot_product_attention = False + + +_use_memory_efficient_attention = False + + +def toggle_memory_efficient_attention(enabled: bool = True): + global _use_memory_efficient_attention + _use_memory_efficient_attention = enabled + + +def is_memory_efficient_attention_enabled(): + return _use_memory_efficient_attention + + +class CoreAttention (nn.Module): + def __init__(self, pos_embed=None, attn_drop=0.): + super().__init__() + self.pos_embed = pos_embed + self.attn_drop = nn.Dropout(attn_drop) + self.attn_drop_val = attn_drop + + def attention(self, q, k, v, qpos=None, kpos=None, attn_mask=None): + B, H, Nq, D = q.shape + C = D * H + assert H == self.num_heads + + if self.pos_embed is not None: + q = self.pos_embed(q, qpos) + k = self.pos_embed(k, kpos) + + if is_memory_efficient_attention_enabled() and (attn_mask is None or attn_mask.dtype != torch.bool): + assert has_xformers + # print('using xformers') + # q, k, v are batch, num_heads, seqlen, K + # Supported formats for inputs/outputs: + # [batch, seqlen, num_heads, K] + # [batch, seqlen, K] (Legacy format) + # with (batch, seqlen, num_heads, K), need to use contiguous() or something's wrong with the stride for bwd + # q, k, v = map(lambda val: val.transpose(1, 2).contiguous(), (q, k, v)) + # x = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None).reshape(B, Nq, C) + # the second format is more natural for croco + if q.dtype != v.dtype: + q = q.to(v.dtype) + if k.dtype != v.dtype: + k = k.to(v.dtype) + assert attn_mask is None or attn_mask.dtype == v.dtype # because casting it here will mess up stride + q, k, v = map(lambda val: val.reshape(B * self.num_heads, -1, D), (q, k, v)) + x = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=attn_mask, p=self.attn_drop_val) + x = x.reshape(B, self.num_heads, -1, D).transpose(1, 2).reshape(B, -1, C) + elif has_scaled_dot_product_attention and (attn_mask is None or attn_mask.dtype == torch.bool): + if q.dtype != v.dtype: + q = q.to(v.dtype) + if k.dtype != v.dtype: + k = k.to(v.dtype) + x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, + dropout_p=self.attn_drop_val) + x = x.transpose(1, 2).reshape(B, Nq, C) + else: + assert attn_mask is None + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = 
self.attn_drop(attn) + x = (attn @ v).transpose(1, 2).reshape(B, Nq, C) + return x + + +class Attention(CoreAttention): + def __init__(self, dim, pos_embed=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., qkln=False): + super().__init__(pos_embed=pos_embed, attn_drop=attn_drop) + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, xpos): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).transpose(1, 3) + q, k, v = [qkv.select(2, i) for i in range(3)] + x = self.attention(q, k, v, xpos, xpos) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class CrossAttention(CoreAttention): + def __init__(self, dim, pos_embed=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__(pos_embed=pos_embed, attn_drop=attn_drop) + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + self.projq = nn.Linear(dim, dim, bias=qkv_bias) + self.projk = nn.Linear(dim, dim, bias=qkv_bias) + self.projv = nn.Linear(dim, dim, bias=qkv_bias) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, query, key, value, qpos, kpos, attn_mask=None): + B, Nq, C = query.shape + Nk = key.shape[1] + Nv = value.shape[1] + + q = self.projq(query).reshape(B, Nq, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + k = self.projk(key).reshape(B, Nk, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + v = self.projv(value).reshape(B, Nv, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + x = self.attention(q, k, v, qpos, kpos, attn_mask=attn_mask) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class CachedCrossAttention(CrossAttention): + def __init__(self, dim, pos_embed=None, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__(dim=dim, pos_embed=pos_embed, num_heads=num_heads, qkv_bias=qkv_bias, + attn_drop=attn_drop, proj_drop=proj_drop) + + def prepare_kv(self, key, value): + k = self.projk(key) + v = self.projv(value) + return k, v + + def forward(self, query, key, value, qpos, kpos, attn_mask=None): + B, Nq, C = query.shape + Nk = key.shape[1] + Nv = value.shape[1] + q = self.projq(query).reshape(B, Nq, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + k = key.reshape(B, Nk, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + v = value.reshape(B, Nv, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + x = self.attention(q, k, v, qpos, kpos, attn_mask=attn_mask) + x = self.proj(x) + x = self.proj_drop(x) + return x diff --git a/must3r/model/blocks/dropout.py b/must3r/model/blocks/dropout.py new file mode 100644 index 0000000000000000000000000000000000000000..5c89468ad34a36facfe68479c231aa283423a9bf --- /dev/null +++ b/must3r/model/blocks/dropout.py @@ -0,0 +1,84 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
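For illustration, a small sketch exercising the attention blocks above in isolation (dimensions, batch size and token counts are arbitrary; with `pos_embed=None` the RoPE rotation is simply skipped):

import torch
from must3r.model.blocks.attention import (
    Attention, CrossAttention, toggle_memory_efficient_attention)

toggle_memory_efficient_attention(enabled=False)  # fall back to torch SDPA / naive matmul

sa = Attention(dim=64, pos_embed=None, num_heads=4)
ca = CrossAttention(dim=64, pos_embed=None, num_heads=4)

x = torch.randn(2, 10, 64)   # (batch, query tokens, dim)
y = torch.randn(2, 20, 64)   # (batch, memory tokens, dim)

out_sa = sa(x, None)               # self-attention over x
out_ca = ca(x, y, y, None, None)   # cross-attention from x to the memory y
print(out_sa.shape, out_ca.shape)  # torch.Size([2, 10, 64]) for both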
+import torch +import torch.nn as nn + + +class MemoryDropoutSelector(nn.Module): + def __init__(self, p=0.0) -> None: + super().__init__() + self.p = p + + def sel(self, N, protected=0, device='cuda', p=None): + p = self.p if p is None else p + N_x = N - protected + if N_x > 0: + if p < 1: + tokens_to_drop = torch.sum(torch.rand(N_x, device=device) < p) + else: + tokens_to_drop = max(0, min(N - p, N_x)) + + sel = torch.randperm(N_x, device=device) + sel = torch.sort(sel[tokens_to_drop:]).values + not_sel = torch.sort(sel[:tokens_to_drop]).values + if protected > 0: + sel = sel + protected + not_sel = not_sel + protected + protected_labels = torch.arange(protected, device=device) + sel = torch.cat([protected_labels, sel], dim=-1) + else: + sel = torch.arange(N, device=device) + not_sel = torch.zeros((0, ), device=device, dtype=torch.int) + return sel, not_sel + + def forward(self, Nm, nimgs, N, protected=0, device='cuda', p=None): + p = self.p if p is None else p + if p == 0.0: + return None, None + assert nimgs > 0 + sel, not_sel = [], [] + sel0 = torch.arange(Nm, device=device) # initialization or already dropped out at the previous iteration + not_sel0 = torch.arange(0, device=device) + sel.append(sel0) + not_sel.append(not_sel0) + + for i in range(nimgs): + sel_prev = sel[-1] + not_sel_prev = not_sel[-1] + + N_prev = len(sel_prev) + + seli, not_seli = self.sel(N_prev + N, protected, device, p=p) + keep_new_vals = seli >= N_prev + discard_new_vals = not_seli >= N_prev + + old_keep = sel_prev[seli[~keep_new_vals]] + old_discard = sel_prev[not_seli[~discard_new_vals]] + + offset = (Nm + i * N) - N_prev + seli = torch.concatenate([old_keep, seli[keep_new_vals] + offset]) + not_seli = torch.concatenate( + [not_sel_prev, old_discard, not_seli[discard_new_vals] + offset] + ) + sel.append(seli) + not_sel.append(not_seli) + return sel, not_sel + + +class TemporaryMemoryDropoutSelector(MemoryDropoutSelector): + def __init__(self, p=0.0) -> None: + super().__init__() + self.p = p + + def forward(self, Nm, nimgs, N, protected=0, device='cuda', p=None): + p = self.p if p is None else p + if p == 0.0: + return None, None + sel, not_sel = [], [] + for i in range(nimgs): + mem_cnt = Nm + (i * N) + seli, not_seli = self.sel(mem_cnt, protected, device, p=p) + new_vals = torch.arange(N, device=device) + mem_cnt + seli = torch.concatenate([seli, new_vals]) + sel.append(seli) + not_sel.append(not_seli) + return sel, not_sel diff --git a/must3r/model/blocks/head.py b/must3r/model/blocks/head.py new file mode 100644 index 0000000000000000000000000000000000000000..f1135f71ae3bf47226becea3792609ae99931729 --- /dev/null +++ b/must3r/model/blocks/head.py @@ -0,0 +1,72 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import torch.nn as nn +from enum import Enum +from must3r.tools.image import unpatchify +from must3r.tools.geometry import apply_exp_to_norm + + +class ActivationType(Enum): + NORM_EXP = "norm_exp" + LINEAR = "linear" + + +def apply_activation(xyz, activation): + if isinstance(activation, str): + activation = ActivationType(activation) + if activation == ActivationType.NORM_EXP: + return apply_exp_to_norm(xyz, dim=-1) + elif activation == ActivationType.LINEAR: + return xyz + else: + raise ValueError(f"Unknown activation: {activation}") + + +def transpose_to_landscape(head, activate=True): + """ Predict in the correct aspect-ratio, + then transpose the result in landscape + and stack everything back together. 
+ """ + def wrapper_no(decout, true_shape): + B = len(true_shape) + assert true_shape[0:1].allclose(true_shape), 'true_shape must be all identical' + H, W = true_shape[0].cpu().tolist() + x = head(decout, (H, W)) + return x + + def wrapper_yes(decout, true_shape): + B = len(true_shape) + # by definition, the batch is in landscape mode so W >= H + H, W = int(true_shape.min()), int(true_shape.max()) + + height, width = true_shape.T + is_landscape = (width >= height) + is_portrait = ~is_landscape + + if is_landscape.all(): + return head(decout, (H, W)) + if is_portrait.all(): + return head(decout, (W, H)).swapaxes(1, 2) + + # batch is a mix of both portrait & landscape + def selout(ar): return [d[ar] for d in decout] + l_result = head(selout(is_landscape), (H, W)) + p_result = head(selout(is_portrait), (W, H)).swapaxes(1, 2) + + x = l_result.new(B, *l_result.shape[1:]) + x[is_landscape] = l_result + x[is_portrait] = p_result + return x + + return wrapper_yes if activate else wrapper_no + + +class LinearHead(nn.Module): + def __init__(self, embed_dim, output_dim, patch_size): + super().__init__() + self.patch_size = patch_size + self.proj = nn.Linear(embed_dim, output_dim, bias=True) + + def forward(self, feats, img_shape): + x = self.proj(feats[-1]) + x = unpatchify(x, self.patch_size, img_shape).permute(0, 2, 3, 1) + return x diff --git a/must3r/model/blocks/layers.py b/must3r/model/blocks/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..5eadf5ca41aaf6def1986a0ab730a2cb9e3cab81 --- /dev/null +++ b/must3r/model/blocks/layers.py @@ -0,0 +1,99 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import torch +import torch.nn as nn +from must3r.model.blocks.attention import Attention, CachedCrossAttention +import must3r.tools.path_to_dust3r # noqa +import dust3r.utils.path_to_croco # noqa +from croco.models.blocks import Mlp, DropPath + +MEMORY_MODES = ['norm_y', 'kv', 'raw'] + + +class BaseTransformer(nn.Module): + def initialize_weights(self): + # linears and layer norms + self.apply(self._init_weights) + self.apply(self._init_override) + + def _init_override(self, m): + init_weight_override_fun = getattr(m, "_init_weight_override", None) + if callable(init_weight_override_fun): + init_weight_override_fun() + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + # we use xavier_uniform following official JAX ViT: + torch.nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + if m.bias is not None: + nn.init.constant_(m.bias, 0) + if m.weight is not None: + nn.init.constant_(m.weight, 1.0) + + +class Block(nn.Module): + def __init__(self, dim, num_heads, pos_embed=None, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + + # SA + self.norm1 = norm_layer(dim) + self.attn = Attention(dim, pos_embed=pos_embed, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, + proj_drop=drop) + # MLP + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, x, xpos=None): + x = x + self.drop_path(self.attn(self.norm1(x), xpos)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class CachedDecoderBlock(nn.Module): + def __init__(self, dim, num_heads, pos_embed=None, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, memory_mode="norm_y"): + super().__init__() + assert memory_mode in MEMORY_MODES + self.memory_mode = memory_mode + + # SA + self.norm1 = norm_layer(dim) + self.attn = Attention(dim, pos_embed=pos_embed, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, + proj_drop=drop) + + # CA + self.norm2 = norm_layer(dim) + self.norm_y = norm_layer(dim) + self.cross_attn = CachedCrossAttention(dim, pos_embed=None, num_heads=num_heads, qkv_bias=qkv_bias, + attn_drop=attn_drop, proj_drop=drop) + + # MLP + self.norm3 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def prepare_y(self, y): + if self.memory_mode == 'raw': + return y + y_ = self.norm_y(y) + if self.memory_mode == 'norm_y': + return y_.to(y.dtype) + k, v = self.cross_attn.prepare_kv(y_, y_) + return torch.concatenate([k, v], dim=-1) + + def forward(self, x, y, xpos=None, ypos=None, ca_attn_mask=None): + x = x + self.drop_path(self.attn(self.norm1(x), xpos)) + y_ = self.norm_y(y) if self.memory_mode == 'raw' else y + if self.memory_mode == 'kv': + key, value = torch.split(y_, x.shape[-1], dim=-1) + else: + key, value = self.cross_attn.prepare_kv(y_, y_) + x = x + self.drop_path(self.cross_attn(self.norm2(x), key, value, xpos, ypos, ca_attn_mask)) + x = x + self.drop_path(self.mlp(self.norm3(x))) + return x diff --git a/must3r/model/blocks/pos_embed.py b/must3r/model/blocks/pos_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..c87294d111ebf72b76b0978489e7ec28cdff0dd3 --- /dev/null +++ b/must3r/model/blocks/pos_embed.py @@ -0,0 +1,22 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import must3r.tools.path_to_dust3r # noqa +import dust3r.utils.path_to_croco # noqa +from croco.models.pos_embed import RoPE2D # noqa + + +def get_pos_embed(pos_embed_name): + # adaptative frequencies + F0 = 1.0 # default + assert pos_embed_name.startswith('RoPE') + if '_' in pos_embed_name: + """ Adapting pose embeddings for higher-resolution. + if pos_embed_name == 'RoPE100_224:512': + => frequencies are now going to behave in [0,512] like they behaved in [0,224] before + """ + pos_embed_name, resolutions = pos_embed_name.split('_') + old_grid, new_grid = resolutions.split(':') + F0 = float(old_grid) / float(new_grid) + print(f'>> Using adaptive frequencies: {F0=}={old_grid}/{new_grid}') + freq = float(pos_embed_name[len('RoPE'):]) + block_pos_embed = RoPE2D(freq, F0=F0) + return block_pos_embed diff --git a/must3r/model/decoder.py b/must3r/model/decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..67aa6643c3da3129b6afa2e513d916152161865a --- /dev/null +++ b/must3r/model/decoder.py @@ -0,0 +1,610 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
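To make the adaptive-frequency convention concrete, here is how an args string for a 224 px checkpoint could be rewritten for 512 px inference, and how the matching RoPE is built (example values only; `set_image_size_in_args` is the helper defined in `must3r/model/__init__.py` above):

from must3r.model import set_image_size_in_args
from must3r.model.blocks.pos_embed import get_pos_embed

args = "MUSt3R(img_size=(224,224),pos_embed='RoPE100')"
print(set_image_size_in_args(args, 512))
# expected: MUSt3R(img_size=(512,512),pos_embed='RoPE100_224:512')

rope = get_pos_embed('RoPE100_224:512')  # F0 = 224/512: frequencies behave on [0,512] as they did on [0,224]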
+import torch +import torch.nn as nn +from functools import partial + +from must3r.model.blocks import get_current_dtype +from must3r.model.blocks.layers import BaseTransformer, CachedDecoderBlock, MEMORY_MODES +from must3r.model.blocks.head import ActivationType, LinearHead, transpose_to_landscape +from must3r.model.blocks.dropout import MemoryDropoutSelector, TemporaryMemoryDropoutSelector +from must3r.model.blocks.pos_embed import get_pos_embed +from must3r.model.feedback_mechanism import create_feedback_layers, init_feedback_layers, run_feedback_layers + + +class MUSt3R(BaseTransformer): + """ + inference class + """ + + def __init__(self, + img_size=(224, 224), # input image size + enc_embed_dim=1024, # encoder feature dimension + patch_size=16, # encoder patch_size + embed_dim=768, # decoder feature dimension + output_dim=1792, # 16*16*7 + depth=12, # decoder depth + num_heads=12, # decoder number of heads in the transformer block + mlp_ratio=4, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + act_layer=nn.GELU, # activation layer in the mlp + pos_embed='RoPE100', + landscape_only=True, + head='Linear', + feedback_type=None, + memory_mode="norm_y", # 3 choices, norm_y, kv and raw + pointmaps_activation=ActivationType.NORM_EXP, + block_type=CachedDecoderBlock, + ** kv): + super(MUSt3R, self).__init__() + self.pointmaps_activation = pointmaps_activation + self._init_projector(enc_embed_dim, embed_dim) + self._init_pos_embed(img_size, patch_size, embed_dim, num_heads, pos_embed) + self._init_blocks(block_type, embed_dim, depth, num_heads, mlp_ratio, norm_layer, act_layer, + memory_mode=memory_mode) + self._init_feedback_mechanism(embed_dim, depth, feedback_type) + self._init_head(enc_embed_dim, patch_size, embed_dim, output_dim, depth, norm_layer, landscape_only, head) + self.initialize_weights() + init_feedback_layers(self.feedback_type, self.feedback_layer) + + def _init_projector(self, enc_embed_dim, embed_dim): + self.feat_embed_enc_to_dec = nn.Linear(enc_embed_dim, embed_dim, bias=True) + self.image2_embed = nn.Parameter(torch.zeros(1, 1, embed_dim)) + torch.nn.init.normal_(self.image2_embed, std=.02) + + def _init_pos_embed(self, img_size, patch_size, embed_dim, num_heads, pos_embed): + self.max_seq_len = max(img_size) // patch_size + self.grid_size = (img_size[0] // patch_size, img_size[1] // patch_size) + self.rope = get_pos_embed(pos_embed) + + def _init_blocks(self, block_type, embed_dim, depth, num_heads, mlp_ratio, norm_layer, act_layer, memory_mode): + if isinstance(block_type, str): + block_type = eval(block_type) + self.depth = depth + self.embed_dim = embed_dim + self.memory_mode = memory_mode + self.attn_num_heads = num_heads + self.blocks_dec = nn.ModuleList([ + block_type(embed_dim, num_heads, self.rope, mlp_ratio, qkv_bias=True, + norm_layer=norm_layer, act_layer=act_layer, memory_mode=memory_mode) + for i in range(depth)]) + + def _init_feedback_mechanism(self, embed_dim, depth, feedback_type): + self.feedback_type = feedback_type + self.feedback_layer, self.feedback_norm = create_feedback_layers(embed_dim, depth, feedback_type) + + def _init_head(self, enc_embed_dim, patch_size, embed_dim, output_dim, depth, norm_layer, landscape_only, head): + self.norm_dec = norm_layer(embed_dim) + if head == 'Linear': + self.head_dec = LinearHead(embed_dim, output_dim, patch_size) + else: + raise ValueError(f'invalid head {head}') + self._head_wrapper = transpose_to_landscape(self.head_dec, activate=landscape_only) + + def from_dust3r(self, state_dict, verbose=True, 
load_head=False): + state_dict = {k.replace('dec_blocks.', 'blocks_dec.').replace( + 'decoder_embed.', 'feat_embed_enc_to_dec.').replace( + 'dec_norm.', 'norm_dec.'): v for k, v in state_dict.items()} + if load_head: + state_dict = {k.replace('downstream_head.proj.', 'head_dec.proj.'): v for k, v in state_dict.items()} + incompatible_keys = self.load_state_dict(state_dict, strict=False) + if verbose: + print(incompatible_keys) + return incompatible_keys + + def from_croco(self, state_dict, verbose=True): + # same format + return self.from_dust3r(state_dict, verbose=verbose) + + def set_freeze(self, freeze='none'): # this is for use by downstream models + self.freeze = freeze + to_be_frozen = { + 'none': [], + 'not_head': [self.feat_embed_enc_to_dec, self.image2_embed, self.blocks_dec, + self.feedback_layer, self.feedback_norm], + } + for module in to_be_frozen[freeze]: + try: + for n, param in module.named_parameters(): + param.requires_grad = False + except AttributeError: + # module is directly a parameter + module.requires_grad = False + + def change_memory_mode(self, memory_mode="norm_y"): + assert memory_mode in MEMORY_MODES + for blk in self.blocks_dec: + blk.memory_mode = memory_mode + self.memory_mode = memory_mode + + def make_mem_mask(self, nimgs, N, Nm, device): + if isinstance(nimgs, list): + assert isinstance(N, list) + tokens_images = [nimg * Ni for nimg, Ni in zip(nimgs, N)] + + Nt = sum(tokens_images) + mem_masks = [torch.ones((nimg, Nm + Nt), dtype=torch.bool, device=device) for nimg in nimgs] + offset = 0 + for i, (nimg, Ni) in enumerate(zip(nimgs, N)): + for j in range(nimg): + mem_masks[i][j, Nm + offset + (j * Ni):Nm + offset + ((j + 1) * Ni)] = 0 + offset += nimg * Ni + return mem_masks + else: + mem_mask = torch.ones((1, N), dtype=torch.bool, device=device) + mem_mask = mem_mask.repeat(nimgs, 1) # nimgs, N + mem_mask = torch.block_diag(*mem_mask).view(nimgs, -1) # nimgs, nimgs * N + mem_mask = torch.concatenate([torch.zeros((nimgs, Nm), dtype=mem_mask.dtype, device=device), + mem_mask], dim=1) # nimgs, Nm + nimgs * N + mem_mask = ~mem_mask + return mem_mask + + def _get_empty_memory(self, device, current_dtype, B, mem_D): + current_mem = [torch.zeros((B, 0, mem_D), dtype=current_dtype, device=device) for _ in range(self.depth)] + current_mem_labels = torch.zeros((B, 0), dtype=torch.int64, device=device) + mem_nimgs = 0 + mem_protected_imgs = 0 + mem_protected_tokens = 0 + return current_mem, current_mem_labels, mem_nimgs, mem_protected_imgs, mem_protected_tokens + + def _compute_prediction_head(self, true_shape, B, nimgs, feats): + feats[-1] = self.norm_dec(feats[-1]) + decout = feats + with torch.autocast("cuda", dtype=torch.float32): + decout = [tok.float() for tok in decout] + x = self._head_wrapper(decout, true_shape.view(B * nimgs, *true_shape.shape[2:])) + x = x.view(B, nimgs, *x.shape[1:]) + return x + + def forward_list(self, x, pos, true_shape, current_mem=None, render=False, return_feats=False): + # forward_list is called at inference when dealing with multiple aspect ratios or limited batch size + x = x.copy() # to be able to make views without changing the parent list + pos = pos.copy() + true_shape.copy() + + x_shapes = [] + device = x[0].device + xdtype = x[0].dtype + + current_dtype = get_current_dtype(xdtype) + feats = [] + for i in range(len(x)): + B, nimg, Ni, Denc = x[i].shape + xi_v = x[i].view(B * nimg, Ni, Denc) + feats.append([xi_v]) + x[i] = self.feat_embed_enc_to_dec(xi_v).view(B, nimg, Ni, -1) + if current_mem is None and i == 0: + # 
initialization + x[i][:, 1:] = x[i][:, 1:] + self.image2_embed.to(current_dtype) + else: + x[i] = x[i] + self.image2_embed.to(current_dtype) # not the reference image / memory + + x_shapes.append(x[i].shape) + + D = x[i].shape[-1] + x[i] = x[i].view(B * nimg, Ni, D) + pos[i] = pos[i].view(B * nimg, Ni, 2) + + B = x_shapes[0][0] + D = x_shapes[0][-1] + mem_D = 2 * D if self.memory_mode == "kv" else D + nimgs = [x_shapesi[1] for x_shapesi in x_shapes] + N = [x_shapesi[2] for x_shapesi in x_shapes] + if current_mem is None: + current_mem, current_mem_labels, mem_nimgs, mem_protected_imgs, mem_protected_tokens = \ + self._get_empty_memory(device, current_dtype, B, mem_D) + else: + current_mem, current_mem_labels, mem_nimgs, mem_protected_imgs, mem_protected_tokens = current_mem + + mem = [] + Nm = current_mem[0].shape[1] + if not render and (Nm > 0 or sum(nimgs) > 1): + # when updating the memory, do not let an image do CA with its own tokens + # ignore this rule when initializing from only one image + mem_mask = self.make_mem_mask(nimgs, N, Nm, device) + else: + mem_mask = None + + new_mem = [] + for blk, current_mem_blk in zip(self.blocks_dec, current_mem): + if not render: + # update the memory for this layer + x_cat = [xi.view(B, -1, D) for xi in x] + x_cat = torch.concatenate(x_cat, dim=1) + new_mem.append(x_cat) + mem_i = torch.concatenate([current_mem_blk, blk.prepare_y(x_cat)], dim=1) + else: + mem_i = current_mem_blk + + # mem is B, Nmi, D + # we need B*nimgs, Nmi, D for CA + if mem_mask is not None: + mem_l = [mem_i.unsqueeze(1).expand(-1, nimgs[i], -1, -1)[:, mem_mask[i]].reshape(B * nimgs[i], -1, mem_D) + for i in range(len(nimgs))] + else: + Nmi = mem_i.shape[1] + mem_l = [mem_i.unsqueeze(1).expand(-1, nimgs[i], -1, -1).reshape(B * nimgs[i], Nmi, mem_D) + for i in range(len(nimgs))] + + # apply decoder + for i in range(len(x)): + x[i] = blk(x[i], mem_l[i], pos[i], None) + feats[i].append(x[i]) + + if not render: + new_mem = run_feedback_layers(self.feedback_layer, self.feedback_norm, new_mem) + mem = [] + for i in range(len(new_mem)): + new_mem_i = self.blocks_dec[i].prepare_y(new_mem[i]) + mem.append(torch.concatenate([current_mem[i], new_mem_i], dim=1)) + + new_labels = [] + offset = 0 + for i, (nimg, Ni) in enumerate(zip(nimgs, N)): + new_labels_i = torch.arange(nimg, dtype=current_mem_labels.dtype, device=current_mem_labels.device) + new_labels_i = new_labels_i.view(1, nimg, 1).repeat(B, 1, Ni).view(B, nimg * Ni) + new_labels_i = new_labels_i + mem_nimgs + offset + new_labels.append(new_labels_i) + offset += nimg + new_labels = torch.concatenate(new_labels, dim=1) + mem_labels = torch.concatenate([current_mem_labels, new_labels], dim=1) + mem_nimgs = mem_nimgs + sum(nimgs) + out = (mem, mem_labels, mem_nimgs, mem_nimgs, mem_labels.shape[1]) + else: + out = (current_mem, current_mem_labels, mem_nimgs, mem_protected_imgs, mem_protected_tokens) + + # apply prediction head + for i in range(len(x)): + if hasattr(self, 'all_feats'): + self.all_feats.append([f.cpu() for f in feats[i]]) + x[i] = self._compute_prediction_head(true_shape[i], B, nimgs[i], feats[i]) + if hasattr(self, 'recorded_feats'): + self.recorded_feats.append(feats[i][-1].cpu()) + if return_feats: + # return memory, pointmaps, feats + feats = [[feats[i][j].view(B, nimgs[i], *feats[i][j].shape[1:]) for j in range(len(feats[i]))] + for i in range(len(feats))] + return out, x, feats + else: + # return memory, pointmaps + return out, x + + def forward(self, x, pos, true_shape, current_mem=None, render=False, 
return_feats=False): + if isinstance(x, list): + # multiple ar in this batch + return self.forward_list(x, pos, true_shape, current_mem, render) + + current_dtype = get_current_dtype(x.dtype) + B, nimgs, N, Denc = x.shape + feats = [x.view(B * nimgs, N, Denc)] + x = self.feat_embed_enc_to_dec(feats[0]).view(B, nimgs, N, -1) + B, nimgs, N, D = x.shape + mem_D = 2 * D if self.memory_mode == "kv" else D + assert not render or current_mem is not None + + if current_mem is None: + # initialization + x[:, 1:] = x[:, 1:] + self.image2_embed.to(current_dtype) + current_mem, current_mem_labels, mem_nimgs, mem_protected_imgs, mem_protected_tokens = \ + self._get_empty_memory(x.device, current_dtype, B, mem_D) + else: + current_mem, current_mem_labels, mem_nimgs, mem_protected_imgs, mem_protected_tokens = current_mem + x = x + self.image2_embed.to(current_dtype) # not the reference image / memory + x = x.view(B * nimgs, N, D) + pos = pos.view(B * nimgs, N, 2) + + mem = [] + Nm = current_mem[0].shape[1] + if not render and (Nm > 0 or nimgs > 1): + # when updating the memory, do not let an image do CA with its own tokens + # ignore this rule when initializing from only one image + mem_mask = self.make_mem_mask(nimgs, N, Nm, x.device) + else: + mem_mask = None + + new_mem = [] + for blk, current_mem_blk in zip(self.blocks_dec, current_mem): + if not render: + # update the memory for this layer + xmem = x.view(B, nimgs * N, D) + new_mem.append(xmem) + mem_i = torch.concatenate([current_mem_blk, blk.prepare_y(xmem)], dim=1) + else: + mem_i = current_mem_blk + + # mem is B, Nmi, D + # we need B*nimgs, Nmi, D for CA + if mem_mask is not None: + mem_i = mem_i.unsqueeze(1).expand(-1, nimgs, -1, -1)[:, mem_mask].reshape( + B * nimgs, Nm + ((nimgs - 1)) * N, mem_D) + else: + Nmi = mem_i.shape[1] + mem_i = mem_i.unsqueeze(1).expand(-1, nimgs, -1, -1).reshape(B * nimgs, Nmi, mem_D) + + # apply decoder + x = blk(x, mem_i, pos, None) + feats.append(x) + + if not render: + # assert (Nm + nimgs * N) == mem[0].shape[1] + new_mem = run_feedback_layers(self.feedback_layer, self.feedback_norm, new_mem) + + mem = [] + for i in range(len(new_mem)): + new_mem_i = self.blocks_dec[i].prepare_y(new_mem[i]) + mem.append(torch.concatenate([current_mem[i], new_mem_i], dim=1)) + + new_labels = torch.arange(nimgs, dtype=current_mem_labels.dtype, device=current_mem_labels.device).view( + 1, nimgs, 1).repeat(B, 1, N).view(B, N * nimgs) + mem_nimgs + mem_labels = torch.concatenate([current_mem_labels, new_labels], dim=1) + + mem_nimgs = mem_nimgs + nimgs + out = (mem, mem_labels, mem_nimgs, mem_nimgs, mem_labels.shape[1]) + else: + out = (current_mem, current_mem_labels, mem_nimgs, mem_protected_imgs, mem_protected_tokens) + + # apply prediction head + x = self._compute_prediction_head(true_shape, B, nimgs, feats) + + if return_feats: + # return memory, pointmaps, feats + feats = [feats[i].view(B, nimgs, *feats[i].shape[1:]) for i in range(len(feats))] + return out, x, feats + else: + # return memory, pointmaps + return out, x + + +class CausalMUSt3R(MUSt3R): + """ + Training class + """ + + def __init__(self, + protected_imgs=1, + mem_dropout=0.0, + dropout_mode='temporary', + use_xformers_mask=False, + use_mem_mask=False, **kv): + super().__init__(**kv) + self._init_dropout(protected_imgs, mem_dropout, dropout_mode) + self.use_xformers_mask = use_xformers_mask + self.use_mem_mask = use_mem_mask + + def _init_dropout(self, protected_imgs, mem_dropout, dropout_mode): + self.protected_imgs = protected_imgs + self.dropout_mode = 
dropout_mode + if dropout_mode == 'permanent': + self.mem_dropout = MemoryDropoutSelector(mem_dropout) + elif dropout_mode == 'temporary': + self.mem_dropout = TemporaryMemoryDropoutSelector(mem_dropout) + else: + raise ValueError(f'Invalid dropout mode = {dropout_mode}') + + def make_mem_mask(self, nimgs, N, Nm, device): + mem_mask = torch.ones((1, N), dtype=torch.bool, device=device) + mem_mask = mem_mask.repeat(nimgs, 1) # nimgs, N + mem_mask = torch.block_diag(*mem_mask).view(nimgs, -1) # nimgs, nimgs * N + mem_mask = torch.concatenate([torch.zeros((nimgs, Nm), dtype=mem_mask.dtype, device=device), + mem_mask], dim=1) # nimgs, Nm + nimgs * N + mem_mask = ~mem_mask + return mem_mask + + def make_attn_mask(self, x, B, nimgs, N, mem_nimgs, Nm, mem_not_sel, mem_labels, mem_mask): + idx = torch.arange(nimgs, device=x.device).view(1, nimgs, 1) + mem_nimgs + idx = idx.expand(B, -1, mem_labels.shape[-1]) # B, nimgs, Nmem + + mem_labels_view = mem_labels.view(B, 1, -1).expand(-1, nimgs, -1) # B, nimgs, Nmem + # do not attend tokens from the same image + attn_mask = mem_labels_view != idx # B, nimgs, Nmem + + # only attend tokens of the previous images + if Nm == 0: # exception for initialization, let the first image do CA with the second image + idx = idx.clone() + idx[:, 0] = idx[:, 0] + 2 # idx for img 0 will become 2 + attn_mask = attn_mask & (mem_labels_view < idx) + + if mem_not_sel is not None: + # mask dropped out tokens + for i in range(len(mem_not_sel) - 1): + mem_not_sel_c = mem_not_sel[i] # Nmem_out + mem_not_sel_c = mem_not_sel_c.unsqueeze(0).expand(B, -1) + attn_mask[:, i] = attn_mask[:, i].scatter( + dim=-1, index=mem_not_sel_c, src=torch.zeros_like(mem_not_sel_c, dtype=torch.bool)) + + if mem_mask is not None: + # use mem_mask on attn_mask + mem_mask_attn = mem_mask.view(1, nimgs, Nm + nimgs * N) + mem_mask_attn = mem_mask_attn.expand(B, -1, -1) + attn_mask = attn_mask[mem_mask_attn] + + attn_mask = attn_mask.view(B, nimgs, 1, 1, -1) + attn_mask = attn_mask.repeat(1, 1, self.attn_num_heads, N, 1) + attn_mask = attn_mask.reshape(B * nimgs, self.attn_num_heads, N, -1) + + if self.use_xformers_mask: + current_dtype = get_current_dtype(x.dtype) + # xformers mask is in an additive mask in float + # -torch.inf for ignored values, 0 for values we keep + # you need to ensure memory is aligned by slicing a bigger tensor + attn_mask = attn_mask.reshape(B * nimgs * self.attn_num_heads, N, -1) + last_dim = attn_mask.shape[-1] + last_dim = (last_dim + 7) // 8 * 8 + attn_mask_float = torch.full((B * nimgs * self.attn_num_heads, N, last_dim), + -torch.inf, dtype=current_dtype, device=x.device + )[:, :, :attn_mask.shape[-1]] + attn_mask_float[attn_mask] = 0 + attn_mask = attn_mask_float + return attn_mask + + def forward(self, x, pos, true_shape, current_mem=None, render=False, return_feats=False): + current_dtype = get_current_dtype(x.dtype) + # project encoder features to the correct dimension + B, nimgs, N, Denc = x.shape + feats = [x.view(B * nimgs, N, Denc)] + x = self.feat_embed_enc_to_dec(feats[0]).view(B, nimgs, N, -1) + B, nimgs, N, D = x.shape + mem_D = 2 * D if self.memory_mode == "kv" else D + # render=True means we do not update the memory + assert not render or current_mem is not None + + if current_mem is None: + # initialization + x[:, 1:] = x[:, 1:] + self.image2_embed.to(current_dtype) + current_mem, current_mem_labels, mem_nimgs, mem_protected_imgs, mem_protected_tokens = \ + self._get_empty_memory(x.device, current_dtype, B, mem_D) + else: + current_mem, current_mem_labels, 
mem_nimgs, mem_protected_imgs, mem_protected_tokens = current_mem + x = x + self.image2_embed.to(current_dtype) # not the reference image / memory + + # protected tokens will not be dropped out + if not render: + current_mem_protected_imgs = mem_protected_imgs + mem_protected_imgs = min(self.protected_imgs, current_mem_protected_imgs + nimgs) + mem_protected_tokens = mem_protected_tokens + (mem_protected_imgs - current_mem_protected_imgs) * N + + x = x.view(B * nimgs, N, D) + pos = pos.view(B * nimgs, N, 2) + + Nm = current_mem[0].shape[1] # number of memory tokens at the previous step + + mem_sel = None + mem_not_sel = None + active_mem = current_mem + if not render and self.mem_dropout.p > 0.0: + # random token dropout, efficient for training + mem_sel, mem_not_sel = self.mem_dropout(Nm, nimgs, N, protected=mem_protected_tokens, device=x.device) + elif render and self.mem_dropout.p > 0.0 and self.dropout_mode == 'temporary': + new_mem_tokens = 0 + mem_sel, mem_not_sel = self.mem_dropout(Nm, 1, new_mem_tokens, protected=mem_protected_tokens, + device=x.device) + + # dropout mem here + active_mem = [mem_i[:, mem_sel[0]] for mem_i in current_mem] + mem_sel, mem_not_sel = None, None + Nm = active_mem[0].shape[1] # number of memory tokens at the previous step + + if not render: + # prepare labels for the new memory tokens + new_labels = torch.arange(nimgs, dtype=current_mem_labels.dtype, device=current_mem_labels.device).view( + 1, nimgs, 1).repeat(B, 1, N).view(B, N * nimgs) + mem_nimgs + mem_labels = torch.concatenate([current_mem_labels, new_labels], dim=1) + else: + mem_labels = current_mem_labels + + if mem_sel is not None and self.dropout_mode == 'permanent': + # select the new memory labels after dropout + mem_labels_out = mem_labels[:, mem_sel[-1]] + else: + mem_labels_out = mem_labels + + mem_mask = None + attn_mask = None + if not render and (Nm > 0 or nimgs > 1): + # when updating the memory, do not let an image do CA with its own tokens + # ignore this rule when initializing from only one image + if self.use_mem_mask: + # physically remove the self attending memory tokens + mem_mask = self.make_mem_mask(nimgs, N, Nm, x.device) + # create mask for the cross attention + attn_mask = self.make_attn_mask(x, B, nimgs, N, mem_nimgs, Nm, mem_not_sel, mem_labels, mem_mask) + + new_mem = [] + for blk, current_mem_blk in zip(self.blocks_dec, active_mem): + if not render: + # update the memory for this layer + xmem = x.view(B, nimgs * N, D) + new_mem.append(xmem) + mem_i = torch.concatenate([current_mem_blk, blk.prepare_y(xmem)], dim=1) + else: + mem_i = current_mem_blk + + # mem is B, Nmi, D + # we need B*nimgs, Nmi, D for CA + if mem_mask is not None: + mem_i = mem_i.unsqueeze(1).expand(-1, nimgs, -1, -1) + mem_i = mem_i[:, mem_mask] + mem_i = mem_i.reshape(B * nimgs, Nm + ((nimgs - 1)) * N, mem_D) + else: + Nmi = mem_i.shape[1] + mem_i = mem_i.unsqueeze(1).expand(-1, nimgs, -1, -1).reshape(B * nimgs, Nmi, mem_D) + + # apply decoder + x = blk(x, mem_i, pos, None, ca_attn_mask=attn_mask) + feats.append(x) + + if not render: + new_mem = run_feedback_layers(self.feedback_layer, self.feedback_norm, new_mem) + mem = [] + for i in range(len(new_mem)): + new_mem_i = self.blocks_dec[i].prepare_y(new_mem[i]) + mem.append(torch.concatenate([current_mem[i], new_mem_i], dim=1)) + if mem_sel is not None and self.dropout_mode == 'permanent': + mem = [mem_i[:, mem_sel[-1]] for mem_i in mem] + mem_nimgs = mem_nimgs + nimgs + out = (mem, mem_labels_out, mem_nimgs, mem_protected_imgs, mem_protected_tokens) + 
else: + out = (current_mem, current_mem_labels, mem_nimgs, mem_protected_imgs, mem_protected_tokens) + + # apply prediction head + x = self._compute_prediction_head(true_shape, B, nimgs, feats) + + if return_feats: + # return memory, pointmaps, feats + feats = [feats[i].view(B, nimgs, *feats[i].shape[1:]) for i in range(len(feats))] + return out, x, feats + else: + # return memory, pointmaps + return out, x + + +if __name__ == '__main__': + from must3r.model.blocks.attention import toggle_memory_efficient_attention + from must3r.model.encoder import Dust3rEncoder + import must3r.tools.path_to_dust3r # noqa + import dust3r.utils.path_to_croco # noqa + from croco.models.blocks import PositionGetter + toggle_memory_efficient_attention(enabled=True) + + enc = Dust3rEncoder(img_size=(224, 224), patch_embed='PatchEmbedDust3R').to('cuda') + + dec = CausalMUSt3R(img_size=(224, 224), mem_dropout=0.00, feedback_type='single_mlp', use_xformers_mask=False, + dropout_mode='temporary', memory_mode='norm_y', use_mem_mask=True).to('cuda') + # dec = MUSt3R(img_size=(224, 224), feedback_type='single_mlp').to('cuda') + MB = 1024.0 * 1024.0 + + BS = 2 + device = 'cuda' + + # true_shape = [[[512, 384], [384, 512]], [[512, 336]]] + # x = [torch.randn((BS, 2, 3, 384, 512)).to('cuda'), torch.randn((BS, 1, 3, 336, 512)).to(device)] + # true_shape = [torch.tensor(true_shape[0], dtype=torch.int64, device=device).repeat(BS, 1, 1), + # torch.tensor(true_shape[1], dtype=torch.int64, device=device).repeat(BS, 1, 1)] + true_shape = [[224, 224], [224, 224], [224, 224]] + true_shape = torch.tensor(true_shape, dtype=torch.int64, device=device).repeat(BS, 1, 1) + x = torch.randn((BS, 3, 3, 224, 224)).to(device) + nimg = 3 + from contextlib import nullcontext + with torch.cuda.amp.autocast(dtype=torch.float16): # nullcontext(): + with torch.no_grad(): + x, pos = enc(x.view(BS * nimg, 3, 224, 224), true_shape.view(BS * nimg, 2)) + x = x.view(BS, nimg, *x.shape[1:]) + pos = pos.view(BS, nimg, *pos.shape[1:]) + true_shape = true_shape.view(BS, nimg, 2) + o1 = None + for i in range(10): + o1, _ = dec(x, pos, true_shape, o1) + try: + print(f'{i+1} - {o1[0][0].shape}') + except Exception as e: + print(f'{i+1} - {o1[0][0][0].shape}') + memory = torch.cuda.max_memory_allocated() / MB + print(memory) + + for i in range(10): + o1, _ = dec(x, pos, true_shape, o1, render=True) + try: + print(f'{i+1} - {o1[0][0].shape}') + except Exception as e: + print(f'{i+1} - {o1[0][0][0].shape}') + memory = torch.cuda.max_memory_allocated() / MB + print(memory) diff --git a/must3r/model/encoder.py b/must3r/model/encoder.py new file mode 100644 index 0000000000000000000000000000000000000000..4a85cd8001a14bcde26542391597e4959a4f6d12 --- /dev/null +++ b/must3r/model/encoder.py @@ -0,0 +1,65 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
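A typical entry point is `load_model` from `must3r/model/__init__.py` above, which rebuilds both networks from the args strings stored in a checkpoint; the checkpoint path below is a hypothetical placeholder:

from must3r.model import load_model, get_dtype, get_pointmaps_activation

# hypothetical checkpoint path
encoder, decoder = load_model('checkpoints/must3r.pth', device='cuda',
                              img_size=512, memory_mode='norm_y')
dtype = get_dtype('bf16')                        # torch.bfloat16 (asserts hardware support)
activation = get_pointmaps_activation(decoder)   # e.g. ActivationType.NORM_EXP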
+import torch +import torch.nn as nn +from functools import partial + +from must3r.model.blocks.pos_embed import get_pos_embed +from must3r.model.blocks.layers import BaseTransformer, Block + +import must3r.tools.path_to_dust3r # noqa +from dust3r.patch_embed import get_patch_embed + + +class Dust3rEncoder(BaseTransformer): + def __init__(self, + img_size=(224, 224), # input image size + patch_size=16, # patch_size + embed_dim=1024, # encoder feature dimension + depth=24, # encoder depth + num_heads=16, # encoder number of heads in the transformer block + mlp_ratio=4, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + patch_embed='PatchEmbedDust3R', + pos_embed='RoPE100'): + super(Dust3rEncoder, self).__init__() + self.embed_dim = embed_dim + self.depth = depth + + self.set_patch_embed(patch_embed, img_size, patch_size, embed_dim) + + self.max_seq_len = max(img_size) // patch_size + self.grid_size = self.patch_embed.grid_size + self.rope = get_pos_embed(pos_embed) + + self.blocks_enc = nn.ModuleList([ + Block(embed_dim, num_heads, pos_embed=self.rope, mlp_ratio=mlp_ratio, qkv_bias=True, norm_layer=norm_layer) + for i in range(depth)]) + self.norm_enc = norm_layer(embed_dim) + self.initialize_weights() + + def set_patch_embed(self, patch_embed_name='PatchEmbedDust3R', img_size=224, patch_size=16, patch_embed_dim=768): + self.patch_size = patch_size + assert self.embed_dim == patch_embed_dim + self.patch_embed = get_patch_embed(patch_embed_name, img_size, patch_size, patch_embed_dim) + self.grid_size = self.patch_embed.grid_size + + @torch.autocast("cuda", dtype=torch.float32) + def forward(self, img, true_shape): + x, pos = self.patch_embed(img, true_shape=true_shape) + for blk in self.blocks_enc: + x = blk(x, pos) + x = self.norm_enc(x) + return x, pos + + def from_dust3r(self, state_dict, verbose=True): + state_dict = {k.replace('enc_blocks', 'blocks_enc').replace( + 'enc_norm', 'norm_enc'): v for k, v in state_dict.items()} + incompatible_keys = self.load_state_dict(state_dict, strict=False) + if verbose: + print(incompatible_keys) + assert len(incompatible_keys.missing_keys) == 0 + return incompatible_keys + + def from_croco(self, state_dict, verbose=True): + # same format + return self.from_dust3r(state_dict, verbose=verbose) diff --git a/must3r/model/feedback_mechanism.py b/must3r/model/feedback_mechanism.py new file mode 100644 index 0000000000000000000000000000000000000000..87827297ad4d5873db3b1932542543287f6d76b7 --- /dev/null +++ b/must3r/model/feedback_mechanism.py @@ -0,0 +1,53 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
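For orientation, a standalone forward pass through the encoder above (random tensors; with a 16 px patch size, a 224x224 crop yields 14x14 = 196 tokens of dimension 1024):

import torch
from must3r.model.encoder import Dust3rEncoder

enc = Dust3rEncoder(img_size=(224, 224), patch_embed='PatchEmbedDust3R').cuda().eval()
imgs = torch.randn(4, 3, 224, 224, device='cuda')
true_shape = torch.tensor([[224, 224]] * 4, dtype=torch.int64, device='cuda')
with torch.no_grad():
    tokens, pos = enc(imgs, true_shape)   # forward runs under a float32 autocast
print(tokens.shape, pos.shape)            # (4, 196, 1024), (4, 196, 2)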
+import torch +import torch.nn as nn +from typing import Optional, List, Callable + +import must3r.tools.path_to_dust3r # noqa +import dust3r.utils.path_to_croco # noqa +from croco.models.blocks import Mlp + + +def create_feedback_layers(embed_dim, depth, feedback_type): + if feedback_type == 'single_mlp': + feedback_layer = Mlp(embed_dim, hidden_features=4 * embed_dim, out_features=embed_dim) + feedback_norm = nn.LayerNorm(embed_dim) + elif feedback_type == 'single_linear': + feedback_layer = nn.Linear(embed_dim, out_features=embed_dim) + feedback_norm = nn.LayerNorm(embed_dim) + else: + assert not feedback_type + feedback_layer = None + feedback_norm = None + + return feedback_layer, feedback_norm + + +def init_feedback_layers(feedback_type, feedback_layer): + # init as zeros so that it's inactive at the start + if feedback_layer is not None: + if feedback_type == 'single_mlp': + nn.init.constant_(feedback_layer.fc2.bias, 0) + nn.init.constant_(feedback_layer.fc2.weight, 0) + elif feedback_type == 'single_linear': + nn.init.constant_(feedback_layer.bias, 0) + nn.init.constant_(feedback_layer.weight, 0) + else: + raise ValueError(f"Unknown {feedback_type=}") + + +def run_feedback_layers( + feedback_layer: Optional[Callable], + feedback_norm: Optional[Callable], + mem: List[torch.Tensor] +) -> List[torch.Tensor]: + # nothing to do ? + if feedback_layer is None: + return mem + blk, blk_ln = feedback_layer, feedback_norm + offset = blk(blk_ln(mem[-1])) + new_mem = [ + memi + offset for memi in mem[:-1] + ] + new_mem.append(mem[-1]) + return new_mem diff --git a/must3r/retrieval/__init__.py b/must3r/retrieval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..345ede2d5437f28f4929911c3614d5fe2caf828e --- /dev/null +++ b/must3r/retrieval/__init__.py @@ -0,0 +1 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. diff --git a/must3r/retrieval/graph.py b/must3r/retrieval/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..b45e86e20f8b638216441ef0dd56cc13d2b1ac34 --- /dev/null +++ b/must3r/retrieval/graph.py @@ -0,0 +1,76 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# +# -------------------------------------------------------- +# Building the graph based on retrieval results. +# -------------------------------------------------------- +import numpy as np + + +def farthest_point_sampling(dist, N=None, dist_thresh=None): + """Farthest point sampling. + + Args: + dist: NxN distance matrix. + N: Number of points to sample. + dist_thresh: Distance threshold. Point sampling terminates once the + maximum distance is below this threshold. + + Returns: + indices: Indices of the sampled points. + """ + + assert N is not None or dist_thresh is not None, "Either N or min_dist must be provided." + + if N is None: + N = dist.shape[0] + + indices = [] + distances = [0] + indices.append(np.random.choice(dist.shape[0])) + for i in range(1, N): + d = dist[indices].min(axis=0) + bst = d.argmax() + bst_dist = d[bst] + if dist_thresh is not None and bst_dist < dist_thresh: + break + indices.append(bst) + distances.append(bst_dist) + return np.array(indices), np.array(distances) + + +def make_pairs_fps(sim_mat, Na=20, tokK=1, dist_thresh=None): + dist_mat = 1 - sim_mat + + pairs = set() + keyimgs_idx = np.array([]) + if Na != 0: + keyimgs_idx, _ = farthest_point_sampling(dist_mat, N=Na, dist_thresh=dist_thresh) + + # 1. 
Complete graph between key images + for i in range(len(keyimgs_idx)): + for j in range(i + 1, len(keyimgs_idx)): + idx_i, idx_j = keyimgs_idx[i], keyimgs_idx[j] + pairs.add((idx_i, idx_j)) + + # 2. Connect non-key images to the earest key image + keyimg_dist_mat = dist_mat[:, keyimgs_idx] + for i in range(keyimg_dist_mat.shape[0]): + if i in keyimgs_idx: + continue + j = keyimg_dist_mat[i].argmin() + i1, i2 = min(i, keyimgs_idx[j]), max(i, keyimgs_idx[j]) + if i1 != i2 and (i1, i2) not in pairs: + pairs.add((i1, i2)) + + # 3. Add some local connections (k-NN) for each view + if tokK > 0: + for i in range(dist_mat.shape[0]): + idx = dist_mat[i].argsort()[:tokK] + for j in idx: + i1, i2 = min(i, j), max(i, j) + if i1 != i2 and (i1, i2) not in pairs: + pairs.add((i1, i2)) + + pairs = list(pairs) + + return pairs, keyimgs_idx diff --git a/must3r/retrieval/model.py b/must3r/retrieval/model.py new file mode 100644 index 0000000000000000000000000000000000000000..a81dd1f9803b746bda84e0c799e02ca237d5c299 --- /dev/null +++ b/must3r/retrieval/model.py @@ -0,0 +1,216 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# +# -------------------------------------------------------- +# Whitener and RetrievalModel +# -------------------------------------------------------- +import numpy as np +from tqdm import tqdm +import torch +import torch.nn as nn + +import must3r.tools.path_to_dust3r # noqa +from dust3r.utils.image import load_images + +default_device = torch.device('cuda' if torch.cuda.is_available() and torch.cuda.device_count() > 0 else 'cpu') + + +# from https://github.com/gtolias/how/blob/4d73c88e0ffb55506e2ce6249e2a015ef6ccf79f/how/utils/whitening.py#L20 +def pcawhitenlearn_shrinkage(X, s=1.0): + """Learn PCA whitening with shrinkage from given descriptors""" + N = X.shape[0] + + # Learning PCA w/o annotations + m = X.mean(axis=0, keepdims=True) + Xc = X - m + Xcov = np.dot(Xc.T, Xc) + Xcov = (Xcov + Xcov.T) / (2 * N) + eigval, eigvec = np.linalg.eig(Xcov) + order = eigval.argsort()[::-1] + eigval = eigval[order] + eigvec = eigvec[:, order] + + eigval = np.clip(eigval, a_min=1e-14, a_max=None) + P = np.dot(np.linalg.inv(np.diag(np.power(eigval, 0.5 * s))), eigvec.T) + + return m, P.T + + +class DusterInputFromImageList(torch.utils.data.Dataset): + def __init__(self, image_list, imsize=512): + super().__init__() + self.image_list = image_list + assert imsize == 512 + self.imsize = imsize + + def __len__(self): + return len(self.image_list) + + def __getitem__(self, index): + return load_images([self.image_list[index]], size=self.imsize, verbose=False)[0] + + +class Whitener(nn.Module): + def __init__(self, dim, l2norm=None): + super().__init__() + self.m = torch.nn.Parameter(torch.zeros((1, dim)).double()) + self.p = torch.nn.Parameter(torch.eye(dim, dim).double()) + self.l2norm = l2norm # if not None, apply l2 norm along a given dimension + + def forward(self, x): + with torch.autocast(self.m.device.type, enabled=False): + shape = x.size() + input_type = x.dtype + x_reshaped = x.view(-1, shape[-1]).to(dtype=self.m.dtype) + # Center the input data + x_centered = x_reshaped - self.m + # Apply PCA transformation + pca_output = torch.matmul(x_centered, self.p) + # reshape back + pca_output_shape = shape # list(shape[:-1]) + [shape[-1]] + pca_output = pca_output.view(pca_output_shape) + if self.l2norm is not None: + return torch.nn.functional.normalize(pca_output, dim=self.l2norm).to(dtype=input_type) + return pca_output.to(dtype=input_type) + + +def weighted_spoc(feat, attn): + 
""" + feat: BxNxC + attn: BxN + output: BxC L2-normalization weighted-sum-pooling of features + """ + return torch.nn.functional.normalize((feat * attn[:, :, None]).sum(dim=1), dim=1) + + +def how_select_local(feat, attn, nfeat): + """ + feat: BxNxC + attn: BxN + nfeat: nfeat to keep + """ + # get nfeat + if nfeat < 0: + assert nfeat >= -1.0 + nfeat = int(-nfeat * feat.size(1)) + else: + nfeat = int(nfeat) + # asort + topk_attn, topk_indices = torch.topk(attn, min(nfeat, attn.size(1)), dim=1) + topk_indices_expanded = topk_indices.unsqueeze(-1).expand(-1, -1, feat.size(2)) + topk_features = torch.gather(feat, 1, topk_indices_expanded) + return topk_features, topk_attn, topk_indices + + +class RetrievalModel(nn.Module): + def __init__(self, backbone, freeze_backbone=1, prewhiten=None, hdims=[1024], residual=False, postwhiten=None, featweights='l2norm', nfeat=300, pretrained_retrieval=None): + super().__init__() + self.backbone = backbone + self.freeze_backbone = freeze_backbone + if freeze_backbone: + for p in self.backbone.parameters(): + p.requires_grad = False + try: + self.backbone_dim = backbone.enc_embed_dim + except Exception as e: + self.backbone_dim = backbone.embed_dim + + self.prewhiten = nn.Identity() if prewhiten is None else Whitener(self.backbone_dim) + self.prewhiten_freq = prewhiten + if prewhiten is not None and prewhiten != -1: + for p in self.prewhiten.parameters(): + p.requires_grad = False + self.residual = residual + self.projector = self.build_projector(hdims, residual) + self.dim = hdims[-1] if len(hdims) > 0 else self.backbone_dim + self.postwhiten_freq = postwhiten + self.postwhiten = nn.Identity() if postwhiten is None else Whitener(self.dim) + if postwhiten is not None and postwhiten != -1: + assert len(hdims) > 0 + for p in self.postwhiten.parameters(): + p.requires_grad = False + self.featweights = featweights + if featweights == 'l2norm': + self.attention = lambda x: x.norm(dim=-1) + else: + raise NotImplementedError(featweights) + self.nfeat = nfeat + self.pretrained_retrieval = pretrained_retrieval + if self.pretrained_retrieval is not None: + ckpt = torch.load(pretrained_retrieval, 'cpu', weights_only=False) + msg = self.load_state_dict(ckpt['model'], strict=False) + assert len(msg.unexpected_keys) == 0 and all(k.startswith('backbone') + or k.startswith('postwhiten') for k in msg.missing_keys) + + def build_projector(self, hdims, residual): + if self.residual: + assert hdims[-1] == self.backbone_dim + d = self.backbone_dim + if len(hdims) == 0: + return nn.Identity() + layers = [] + for i in range(len(hdims) - 1): + layers.append(nn.Linear(d, hdims[i])) + d = hdims[i] + layers.append(nn.LayerNorm(d)) + layers.append(nn.GELU()) + layers.append(nn.Linear(d, hdims[-1])) + return nn.Sequential(*layers) + + def state_dict(self, *args, destination=None, prefix='', keep_vars=False): + ss = super().state_dict(*args, destination=destination, prefix=prefix, keep_vars=keep_vars) + if self.freeze_backbone: + ss = {k: v for k, v in ss.items() if not k.startswith('backbone')} + return ss + + def extract_features_and_attention(self, x): + backbone_feat = x + backbone_feat_prewhitened = self.prewhiten(backbone_feat) + proj_feat = self.projector(backbone_feat_prewhitened) + \ + (0.0 if not self.residual else backbone_feat_prewhitened) + attention = self.attention(proj_feat) + proj_feat_whitened = self.postwhiten(proj_feat) + return proj_feat_whitened, attention + + def forward_local(self, x): + feat, attn = self.extract_features_and_attention(x) + return 
how_select_local(feat, attn, self.nfeat) + + def forward_global(self, x): + feat, attn = self.extract_features_and_attention(x) + return weighted_spoc(feat, attn) + + def forward(self, x): + return self.forward_global(x) + + +def identity(x): # to avoid Can't pickle local object 'extract_local_features..' + return x + + +@torch.no_grad() +def extract_local_features(model, images, imsize, seed=0, tocpu=False, max_nfeat_per_image=None, + max_nfeat_per_image2=None, device=default_device): + model.eval() + imdataset = DusterInputFromImageList(images, imsize=imsize) if isinstance(images, list) else images + + loader = torch.utils.data.DataLoader(imdataset, batch_size=1, shuffle=False, + num_workers=8, pin_memory=True, collate_fn=identity) + with torch.no_grad(): + features = [] + imids = [] + for i, d in enumerate(tqdm(loader)): + dd = d[0] + dd['img'] = dd['img'].to(device, non_blocking=True) + feat, _, _ = model.forward_local(dd) + feat = feat.flatten(0, 1) + if max_nfeat_per_image is not None and feat.size(0) > max_nfeat_per_image: + feat = feat[torch.randperm(feat.size(0))[:max_nfeat_per_image], :] + if max_nfeat_per_image2 is not None and feat.size(0) > max_nfeat_per_image2: + feat = feat[:max_nfeat_per_image2, :] + features.append(feat) + if tocpu: + features[-1] = features[-1].cpu() + imids.append(i * torch.ones_like(features[-1][:, 0]).to(dtype=torch.int64)) + features = torch.cat(features, dim=0) + imids = torch.cat(imids, dim=0) + return features, imids diff --git a/must3r/retrieval/processor.py b/must3r/retrieval/processor.py new file mode 100644 index 0000000000000000000000000000000000000000..e8b35a15da6f880cf699c60165a48d8345eac549 --- /dev/null +++ b/must3r/retrieval/processor.py @@ -0,0 +1,126 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. 
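A plausible way to fit the PCA whitening above offline and load it into a `Whitener` module (random descriptors stand in for real backbone features; `np.real` guards against negligible imaginary parts returned by `np.linalg.eig`):

import numpy as np
import torch
from must3r.retrieval.model import Whitener, pcawhitenlearn_shrinkage

X = np.random.randn(1000, 256)             # (num_descriptors, dim)
m, P = pcawhitenlearn_shrinkage(X, s=0.5)   # mean (1, dim) and projection (dim, dim)

whitener = Whitener(256, l2norm=-1)         # l2-normalize along the last dimension
with torch.no_grad():
    whitener.m.copy_(torch.from_numpy(np.real(m)))
    whitener.p.copy_(torch.from_numpy(np.real(P)))

desc = torch.randn(8, 256)
print(whitener(desc).norm(dim=-1))          # ~1.0 per descriptor after whitening + l2norm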
+# +# -------------------------------------------------------- +# Main Retriever class +# -------------------------------------------------------- +import os +import argparse +import numpy as np +import torch +from must3r.retrieval.model import RetrievalModel, extract_local_features + +try: + import faiss + faiss.StandardGpuResources() # when loading the checkpoint, it will try to instantiate FaissGpuL2Index +except AttributeError as e: + import asmk.index + + class FaissCpuL2Index(asmk.index.FaissL2Index): + def __init__(self, gpu_id): + super().__init__() + self.gpu_id = gpu_id + + def _faiss_index_flat(self, dim): + """Return initialized faiss.IndexFlatL2""" + return faiss.IndexFlatL2(dim) + + asmk.index.FaissGpuL2Index = FaissCpuL2Index + +from asmk import asmk_method # noqa + + +def get_args_parser(): + parser = argparse.ArgumentParser('Retrieval scores from a set of images', add_help=False, allow_abbrev=False) + parser.add_argument('--model', type=str, required=True, + help="shortname of a retrieval model or path to the corresponding .pth") + parser.add_argument('--input', type=str, required=True, + help="directory containing images or a file containing a list of image paths") + parser.add_argument('--outfile', type=str, required=True, help="numpy file where to store the matrix score") + return parser + + +def get_impaths(imlistfile): + with open(imlistfile, 'r') as fid: + impaths = [f for f in fid.read().splitlines() if not f.startswith('#') + and len(f) > 0] # ignore comments and empty lines + return impaths + + +def get_impaths_from_imdir(imdir, extensions=['png', 'jpg', 'PNG', 'JPG']): + assert os.path.isdir(imdir) + impaths = [os.path.join(imdir, f) for f in sorted(os.listdir(imdir)) if any(f.endswith(ext) for ext in extensions)] + return impaths + + +def get_impaths_from_imdir_or_imlistfile(input_imdir_or_imlistfile): + if os.path.isfile(input_imdir_or_imlistfile): + return get_impaths(input_imdir_or_imlistfile) + else: + return get_impaths_from_imdir(input_imdir_or_imlistfile) + + +class Retriever(object): + def __init__(self, modelname, backbone, device='cuda', verbose=True): + # load the model + assert os.path.isfile(modelname), modelname + assert backbone is not None + if verbose: + print(f'Loading retrieval model from {modelname}') + ckpt = torch.load(modelname, 'cpu', weights_only=False) + ckpt_args = ckpt['args'] + self.model = RetrievalModel( + backbone, freeze_backbone=ckpt_args.freeze_backbone, prewhiten=ckpt_args.prewhiten, + hdims=list(map(int, ckpt_args.hdims.split('_'))) if len(ckpt_args.hdims) > 0 else "", + residual=getattr(ckpt_args, 'residual', False), postwhiten=ckpt_args.postwhiten, + featweights=ckpt_args.featweights, nfeat=ckpt_args.nfeat + ).to(device) + self.device = device + msg = self.model.load_state_dict(ckpt['model'], strict=False) + assert all(k.startswith('backbone') for k in msg.missing_keys) + assert len(msg.unexpected_keys) == 0 + self.imsize = ckpt_args.imsize + + # load the asmk codebook + dname, bname = os.path.split(modelname) + bname_splits = bname.split('_') + cache_codebook_fname = os.path.join(dname, '_'.join(bname_splits[:-1]) + '_codebook.pkl') + assert os.path.isfile(cache_codebook_fname), cache_codebook_fname + asmk_params = {'index': {'gpu_id': 0}, 'train_codebook': {'codebook': {'size': '64k'}}, + 'build_ivf': {'kernel': {'binary': True}, 'ivf': {'use_idf': False}, + 'quantize': {'multiple_assignment': 1}, 'aggregate': {}}, + 'query_ivf': {'quantize': {'multiple_assignment': 5}, 'aggregate': {}, + 'search': {'topk': 
None}, + 'similarity': {'similarity_threshold': 0.0, 'alpha': 3.0}}} + asmk_params['train_codebook']['codebook']['size'] = ckpt_args.nclusters + self.asmk = asmk_method.ASMKMethod.initialize_untrained(asmk_params) + self.asmk = self.asmk.train_codebook(None, cache_path=cache_codebook_fname) + + def __call__(self, input_imdir_or_imlistfile, outfile=None): + # get impaths + if isinstance(input_imdir_or_imlistfile, str): + impaths = get_impaths_from_imdir_or_imlistfile(input_imdir_or_imlistfile) + else: + impaths = input_imdir_or_imlistfile # we're assuming a list has been passed + print(f'Found {len(impaths)} images') + + # build the database + feat, ids = extract_local_features(self.model, impaths, self.imsize, tocpu=True, device=self.device) + feat = feat.cpu().numpy() + ids = ids.cpu().numpy() + asmk_dataset = self.asmk.build_ivf(feat, ids) + + # we actually retrieve the same set of images + metadata, query_ids, ranks, ranked_scores = asmk_dataset.query_ivf(feat, ids) + + # well ... scores are actually reordered according to ranks ... + # so we redo it the other way around... + scores = np.empty_like(ranked_scores) + scores[np.arange(ranked_scores.shape[0])[:, None], ranks] = ranked_scores + + # save + if outfile is not None: + if os.path.isdir(os.path.dirname(outfile)): + os.makedirs(os.path.dirname(outfile), exist_ok=True) + np.save(outfile, scores) + print(f'Scores matrix saved in {outfile}') + return scores diff --git a/must3r/slam/__init__.py b/must3r/slam/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..345ede2d5437f28f4929911c3614d5fe2caf828e --- /dev/null +++ b/must3r/slam/__init__.py @@ -0,0 +1 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. diff --git a/must3r/slam/data.py b/must3r/slam/data.py new file mode 100644 index 0000000000000000000000000000000000000000..10deba6dce4684ef0424eec844a7c2d92f562da0 --- /dev/null +++ b/must3r/slam/data.py @@ -0,0 +1,125 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import numpy as np +import cv2 +import os + + +cv2_im_formats = ['jpg', 'jpeg', 'jpe', 'png', 'tiff', 'tif', 'bmp', 'dib', 'jp2', + 'webp', 'pbm', 'pgm', 'ppm', 'pxm', 'pnm', 'pfm', 'sr', 'ras', 'exr', 'hdr', 'pic'] + + +# Data management +class BaseLoader(): + """ + Frame loading. 
Supported sources are + - Camera stream + - Video file + - Image folder (you can specify a string to match image names) + """ + + def __init__(self, inp, image_string=None): + if 'cam:' in inp: + streamid = int(inp.split(':')[-1]) + self.CAMERA = cv2.VideoCapture(streamid) + elif os.path.isdir(inp): + self.CAMERA = ImageCollection(inp, image_string) + elif os.path.isfile(inp): + self.CAMERA = VideoFile(inp) + else: + raise ValueError(f"Incorrect input {inp} for BaseLoader") + + def __len__(self): + return len(self.CAMERA) + + def set(self, target=cv2.CAP_PROP_POS_FRAMES, value=0): + self.CAMERA.set(target, value) + + def grab(self): + self.CAMERA.grab() + + def read(self): + return self.CAMERA.read() + + +class AutoMultiLoader(BaseLoader): + """ + MultiLoader: load frames alternatively from a list of sources + """ + + def __init__(self, inp, image_string=None): + self.CAMERAS = [BaseLoader(cam, image_string) for cam in inp] + self.whos_turn = 0 + + def __len__(self): + return np.sum([len(cam) for cam in self.CAMERAS]) + + def set(self, target=cv2.CAP_PROP_POS_FRAMES, value=0): + for i in range(len(self.CAMERAS)): + self.CAMERAS[i].set(target, value) + + def next_agent(self): + self.whos_turn = (self.whos_turn + 1) % len(self.CAMERAS) + + def grab(self): + for _ in range(len(self.CAMERAS)): + self.CAMERAS[self.whos_turn].grab() + self.next_agent() + + def read(self): + frame = None + loop_c = 0 + while frame is None and loop_c < len(self.CAMERAS): + ret, frame = self.CAMERAS[self.whos_turn].read() + camid = self.whos_turn + self.next_agent() + loop_c += 1 + # TODO: batched forward for multiple agents to decrease impact in framerate + return ret, frame, camid + +class VideoFile(cv2.VideoCapture): + def __len__(self): + return int(self.get(cv2.CAP_PROP_FRAME_COUNT)) + +class ImageCollection(): + def __init__(self, rootdir, image_string=None, preload=True): + self.rootdir = rootdir + + def sel_file(ff): return (ff.split('.')[-1].lower() in cv2_im_formats + and (image_string is None + or image_string in ff) + ) + self.frames = [ff for ff in sorted(os.listdir(self.rootdir)) if sel_file(ff)] + assert len(self) != 0, "" + print(f"Found {len(self)} frames in {rootdir}") + + self.current_frame = 0 + self.all_images = None + + if preload: + print("Preloading frames") + self.all_images = [cv2.imread(os.path.join(self.rootdir, frame)) for frame in self.frames] + + def __len__(self): + return len(self.frames) + + def set(self, target=cv2.CAP_PROP_POS_FRAMES, value=0): + if target == cv2.CAP_PROP_POS_FRAMES: + self.current_frame = value + else: + raise NotImplementedError(f"implement what you want to do with {target}") + + def next_frame(self): + self.current_frame += 1 + + def grab(self): + self.next_frame() + + def read(self): + im = None + if self.current_frame < len(self): + if self.all_images is not None: + im = self.all_images[self.current_frame] + else: + im = cv2.imread(os.path.join(self.rootdir, self.frames[self.current_frame])) + self.next_frame() + return None, im # match cv2.VideoCapture signature diff --git a/must3r/slam/model.py b/must3r/slam/model.py new file mode 100644 index 0000000000000000000000000000000000000000..650b8a4f7e6a0bc919fa0d34a44bba85aee2f917 --- /dev/null +++ b/must3r/slam/model.py @@ -0,0 +1,528 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
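For orientation, a minimal usage sketch of the frame loaders defined in must3r/slam/data.py above; the paths are placeholders. BaseLoader dispatches to a webcam stream ('cam:<id>'), a video file, or an image folder, while AutoMultiLoader alternates between several such sources:

# Illustrative only; paths are hypothetical.
from must3r.slam.data import BaseLoader, AutoMultiLoader

loader = BaseLoader('path/to/image_folder')       # or 'path/to/video.mp4', or 'cam:0'
_, frame = loader.read()                          # BGR numpy array, or None when exhausted

multi = AutoMultiLoader(['path/to/video_a.mp4', 'cam:0'])
ret, frame, camid = multi.read()                  # camid tells which source produced the frame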
+import numpy as np +import torch +import roma +from PIL import Image +from tqdm import tqdm +import pickle as pkl + +from must3r.model import * +from must3r.model import load_model +from must3r.engine.inference import postprocess + +import must3r.tools.path_to_dust3r # noqa +from dust3r.post_process import estimate_focal_knowing_depth +from dust3r.datasets.utils.transforms import ImgNorm +from dust3r.utils.image import _resize_pil_image + +from .nns import get_searcher +from .tools import laplacian_smoothing, laplacian_smoothing_with_confidence + +# Forward and processing +@torch.no_grad() +def forward_must3r(model, + input_views, + memory, + render=False, + device='cuda:0'): + + encoder, decoder = model + + true_shapes = [] + encoded_inputs = [] + pos = [] + for input_view in input_views: + true_shapei = input_view['true_shape'][None] + encoded_input, posi = encoder(input_view['img'].to(device), + torch.as_tensor(true_shapei).to(device).view(-1, 2)) + + true_shapes.append(torch.tensor(true_shapei, device=device)) + encoded_inputs.append(encoded_input[None]) + pos.append(posi[None]) + + temp_memory = memory + + # get pred and updated memory + new_memory, preds = decoder(encoded_inputs, + pos, + true_shapes, + temp_memory, + render=render) + + torch.cuda.empty_cache() + + pointmaps_activation = get_pointmaps_activation(decoder, verbose=False) + out = [] + for pred in preds: + out.append(postprocess(pred, pointmaps_activation=pointmaps_activation)) + + return out, new_memory + + +def get_overlap_score(res, + overlap_tree, + cam_center, # if needed for ori in NN + mode='nn', + kf_x_subsamp=None, + min_conf_keyframe=1.5, + percentile=70, # 50, + eps=1e-9, + ): + outscore = 0. + if mode == 'meanconf': + outscore = res['conf'].mean() + elif mode == 'medianconf': + outscore = res['conf'].median() + elif 'nn' in mode: + pts3d = res['pts3d'][0, 0, ::kf_x_subsamp, ::kf_x_subsamp] if kf_x_subsamp else res['pts3d'] + msk = res['conf'][0, 0, ::kf_x_subsamp, ::kf_x_subsamp] if kf_x_subsamp else res['conf'] + msk = msk > min_conf_keyframe + outscore = 0. 
+ if msk.sum() > 0: + dists = overlap_tree.query(pts3d[msk], cam_center=cam_center) + if 'norm' in mode: + depths = res['pts3d_local'][0, 0, ::kf_x_subsamp, ::kf_x_subsamp, -1] + dists /= depths[msk].cpu().numpy() + eps + # if ended up in an unseen quadrant, put a number to avoid getting a nan from np.percentile + dists[np.isposinf(dists)] = np.finfo(dists.dtype).max + outscore = np.percentile(dists, percentile) + else: + raise ValueError(f"Unknown overlap score method {mode}") + return outscore + + +def prep_imgs(cimg, img_mean, img_std): + cimg = cimg.permute(0, 2, 3, 1) # [B, H, W, 3] color is float3 \in [0,1] + return (cimg * torch.tensor(img_std, device=cimg.device)) + torch.tensor(img_mean, device=cimg.device) + + +def preproc_frame(img, idx, res=512, transform=ImgNorm): + img = Image.fromarray(img) + W1, H1 = img.size + halfw, halfh = cx, cy = W1 // 2, H1 // 2 + longsize = res + if res in [224, 336, 448]: + longsize = max(W1, H1) / min(W1, H1) * res # mindim has to be at least 224 + # resize long side to given size + img = _resize_pil_image(img, longsize) + # update size after resize + W, H = img.size + cx, cy = W // 2, H // 2 + + to_orig_focal = W1 / W + + if res in [224, 336, 448]: # hardcoded from specific training runs, could be automatically detected + halfw = halfh = res // 2 # square crop + else: + # make sure we have multiple of 16 + halfw, halfh = ((2 * cx) // 16) * 8, ((2 * cy) // 16) * 8 + img = img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh)) + return dict(img=transform(img)[None], true_shape=np.int32([img.size[::-1]]), idx=idx, instance=str(idx), offset=np.int32([[cx - halfw, cy - halfh]])), to_orig_focal + + +def choose_keyframe_from_overlap(overlap_score, thr, overlap_mode): + if 'nn' in overlap_mode: + outchoice = overlap_score > thr + else: + outchoice = overlap_score < thr + return outchoice + + +def mean_focal(seq_focals): # wavg of seq focals + out = None + if len(seq_focals['f']): + focals = np.array(seq_focals['f']) + confs = np.array(seq_focals['conf']) + out = (focals * confs / confs.sum()).sum() + return out + + +def build_intr(focal, W, H, device, dtype): + out = torch.eye(3, device=device, dtype=dtype) + out[0, 0] = out[1, 1] = float(focal) + out[:2, -1] = torch.tensor([W / 2, H / 2], device=device, dtype=dtype) + return out + + +def get_camera_pose(res, seq_focal, HW, is_first_frame=False, rectify=True): + device = res['pts3d'].device + B = res['pts3d'].shape[1] + + H, W = HW + pp = torch.tensor((W / 2, H / 2), device=device) + + focal = estimate_focal_knowing_depth(res['pts3d_local'][0], pp, focal_mode='weiszfeld') + focal_ratio = 1. 
+ if seq_focal is not None and rectify: + focal_ratio = seq_focal / focal[:, None] + + if is_first_frame: # first frame defines the origin of the coordinate system + R = torch.eye(3, device=device).repeat(B, 1, 1) + T = torch.zeros(3, device=device).repeat(B, 1) + else: + pts3d_local = res['pts3d_local'][0].view(B, -1, 3) + pts3d_local[..., -1] *= focal_ratio + + R, T = roma.rigid_points_registration(pts3d_local, res['pts3d'][0].view( + B, -1, 3), weights=res['conf'][0].view(B, -1) - 1., compute_scaling=False) + + c2w = torch.eye(4, device=device).repeat(B, 1, 1) + c2w[:, :3, :3] = R + c2w[:, :3, 3] = T + return c2w, focal + + +def get_map(ptscolsconfs, confthr): + allpts = [] + allcols = [] + for pts, cols, confs in ptscolsconfs: + msk = confs > confthr + allpts.append(pts[msk]) + allcols.append(cols[0, 0, msk]) + return torch.cat(allpts), torch.cat(allcols) + + +def postproc_pred(img, + res, + is_first_frame, + seq_focals, + fixed_focal=True, + overlap_mode='nn-norm', + overlap_tree=None, + kf_x_subsamp=None, + keyframe_overlap_thr=.15, + min_conf_keyframe=1.5, + overlap_percentile=70, + img_mean=[0.5, 0.5, 0.5], + img_std=[0.5, 0.5, 0.5]): + + assert res['pts3d'].shape[0] == 1 and res['pts3d'].shape[1] == 1, "Need to implement batching if ever needed, frames should come 1 by 1 here." + + # assumes frames come in 1 by 1 + # recover depth from local pointmap + depth = res['pts3d_local'][0, 0, ..., -1] # query view depth + conf = res['conf'][0, 0] + + # Mask pointmap and colors + msk = res['conf'] > min_conf_keyframe + + if kf_x_subsamp: # view subsampling to increase frame rate + msk = msk[0, 0, ::kf_x_subsamp, ::kf_x_subsamp] + pts = res['pts3d'][0, 0, ::kf_x_subsamp, ::kf_x_subsamp][msk] + else: + pts = res['pts3d'][msk] + + cols = prep_imgs(img['img'], img_mean=img_mean, img_std=img_std)[None] + + c2w = None + seq_focal = mean_focal(seq_focals) if fixed_focal else None + c2w, focal = get_camera_pose(res, seq_focal, HW=img['true_shape'][0], is_first_frame=is_first_frame) + c2w = c2w[0] + cam_center = c2w[:3, -1] + + res['overlap_score'] = get_overlap_score(res, + overlap_tree, + cam_center=cam_center, + mode=overlap_mode, + kf_x_subsamp=kf_x_subsamp, + min_conf_keyframe=min_conf_keyframe, + percentile=overlap_percentile) + + # Check if memory frame + iskeyframe = is_first_frame or ( + choose_keyframe_from_overlap(res['overlap_score'], keyframe_overlap_thr, overlap_mode) + and conf.median() > min_conf_keyframe + ) + + allpts = res['pts3d'][0, 0] + + out = (pts, + allpts, + cols.to(torch.float32), + depth, + conf, + focal, + torch.inverse(c2w), + cam_center, + iskeyframe) + return out + + +class MUSt3R_Agent(): + """ + Manage focal length, and smoothing operations for each agent independently + """ + + def __init__(self, + # a single focal for all sequence (we should be able to handle zoom but I have not GT to check that) + fixed_focal=True, + # Smoothing terms + smooth_focal_changes=False, # add smoothing term to focal length changes (only when non fixed_focal) + img_mean=[0.5, 0.5, 0.5], + img_std=[0.5, 0.5, 0.5] + ): + assert fixed_focal or not smooth_focal_changes, "TODO maybe: online focal smoothing when varying focals" + self.fixed_focal = fixed_focal + self.smooth_focal_changes = smooth_focal_changes + self.img_mean = img_mean + self.img_std = img_std + self.reset() + + def reset(self): + self.seq_focals = {'f': [], 'conf': [], 'to_orig': []} + + def get_true_focal(self): + out = None + if len(self.seq_focals['f']) != 0: + if self.fixed_focal: + assert 
np.all(np.array(self.seq_focals['to_orig']) == self.seq_focals['to_orig'] + [0]), "To orig should be constant for a single true focal" + out = mean_focal(self.seq_focals) * self.seq_focals['to_orig'][0] + else: + out = [ff * tt for ff, tt in zip(self.seq_focals['f'], self.seq_focals['to_orig'])] + return out + + @torch.no_grad() + def update(self, + inp, + pred, + is_first_frame, + overlap_mode, + overlap_tree, + kf_x_subsamp, + keyframe_overlap_thr, + min_conf_keyframe, + overlap_percentile, + to_orig_focal): + + selpts3d, pts3d, colors, depth, conf, focal, w2c, cam_center, iskeyframe = postproc_pred(img=inp, + res=pred, + is_first_frame=is_first_frame, + seq_focals=self.seq_focals, + fixed_focal=self.fixed_focal, + overlap_mode=overlap_mode, + overlap_tree=overlap_tree, + kf_x_subsamp=kf_x_subsamp, + keyframe_overlap_thr=keyframe_overlap_thr, + min_conf_keyframe=min_conf_keyframe, + overlap_percentile=overlap_percentile, + img_mean=self.img_mean, + img_std=self.img_std + ) + + self.seq_focals['f'].append(focal[0].cpu().numpy()) + self.seq_focals['to_orig'].append(to_orig_focal) + self.seq_focals['conf'].append(conf.mean().cpu().numpy() - 1.) + outfocal = mean_focal(self.seq_focals) if self.fixed_focal else self.seq_focals['f'][-1] + return selpts3d, pts3d, colors, depth, conf, outfocal, w2c, cam_center, iskeyframe + + +class SLAM_MUSt3R(): + """ + Main memory manager, will take care of redistributing input images to respective agent, will gather agent's response and update memory accordingly + You can save/load memory for/from other runs + """ + + def __init__(self, chkpt, + res=512, + searcher='kdtree-scipy-quadrant_x2', # 'kdtree-scipy', + overlap_mode='nn-norm', # 'nn', + kf_x_subsamp=4, + keyframe_overlap_thr=.15, + min_conf_keyframe=1.5, + overlap_percentile=70., + rerender=False, + fixed_focal=True, + keep_memory=False, + load_memory=None, + num_agents=1, + device='cuda:0', + num_init_frames=2, + ): + + self.agents = [MUSt3R_Agent(fixed_focal) for _ in range(num_agents)] + self.num_init_frames = num_init_frames + + # inference resolution + self.res = res + self.device = device + + # Load Model + self.transform = ImgNorm + self.model = load_model(chkpt, device=device) + + # params + self.searcher = searcher + self.kf_x_subsamp = kf_x_subsamp + self.keyframe_overlap_thr = keyframe_overlap_thr + self.min_conf_keyframe = min_conf_keyframe + self.overlap_percentile = overlap_percentile + self.overlap_mode = overlap_mode + self.rerender = rerender # once sequence is processed, repredict everything from full memory + self.keep_memory = keep_memory # save latent_kf+pointmaps for export + + # Loaded Memory + self.memory_memory = None + self.memory_map = None + self.memory_data = [] + self.memory_overlap_tree = None + + if load_memory is not None: + self.load_memory(load_memory) + + self.reset() + + + def reset(self): + # Reset data structures to loaded memory if available else full reinit + # Sequence data + self.all_poses = [] + self.all_confs = [] + self.all_timestamps = [] + # Memory and keyframes + self.memory = self.memory_memory + self.keyframe_pointmaps = self.memory_data + self.keyframes = [] + # params for overlap, keyframe selection + if self.memory_overlap_tree is None: + self.overlap_tree = get_searcher(self.searcher if 'nn' in self.overlap_mode else 'none') + else: + self.overlap_tree = self.memory_overlap_tree + # Re-rendering + self.all_images = [] + self.all_pts3d = None + + self.mem_was_loaded = self.memory_memory is not None # reset to loaded memory + # Reset all agents + 
for i in range(len(self.agents)): + self.agents[i].reset() + + @property + def num_mem_frames(self): + return len(self.keyframes) + + def get_true_focals(self): + # would be better if agid was a tag instead of an int + return {agid: agent.get_true_focal() for agid, agent in enumerate(self.agents)} + + def write_all_poses(self, path, filtering_mode=None, filtering_steps=5, filtering_alpha=.5, **tolog): + print(f"Writing full trajectory in {path}") + all_poses = torch.stack(self.all_poses).cpu().numpy() + timestamps = np.stack(self.all_timestamps).astype(int) + conf = torch.stack(self.all_confs).cpu().numpy() + focals = self.get_true_focals() + + if filtering_mode is not None: + if 'laplacian' in filtering_mode: + trajectory = all_poses[:, :3, -1] + if 'conf' in filtering_mode: + conf_remap = (conf-conf.min())/(conf.max()-conf.min()) # remap [1,inf] in between [0,1] + smoothed_trajectory = laplacian_smoothing_with_confidence( + trajectory, conf_remap, alpha=filtering_alpha, iterations=filtering_steps) + else: + smoothed_trajectory = laplacian_smoothing( + trajectory, alpha=filtering_alpha, iterations=filtering_steps) + all_poses[:, :3, -1] = smoothed_trajectory + else: + raise ValueError(f"Unknown filtering mode {filtering_mode}") + + np.savez(path, poses=all_poses, timestamps=timestamps, confs=conf, focal=focals, **tolog) + + def save_memory(self, output): + mem = (self.memory, self.keyframe_pointmaps, self.overlap_tree) + pkl.dump(mem, open(output, 'wb')) + + def load_memory(self, mem_file): + self.memory_memory, self.memory_data, self.memory_overlap_tree = pkl.load(open(mem_file, 'rb')) + self.memory = self.memory_memory + self.keyframe_pointmaps = self.memory_data + self.overlap_tree = self.memory_overlap_tree + self.mem_was_loaded = True + + def fetch_memory_map(self, conf_thr): + if self.mem_was_loaded: + self.memory_map = get_map(self.memory_data, conf_thr) + self.mem_was_loaded = False + return self.memory_map + + @torch.no_grad() + def rerender_all_frames(self, maxbs=64): + assert len(self.agents) == 1, "Multiagent rerender to be managed (different focal lengths)" + if self.rerender: + B = len(self.all_images) + all_imgs = {} + all_preds = [] + keys_of_interest = ['pts3d', 'pts3d_local', 'conf'] + def keys_of_interest_to_cpu(dd): return {k: dd[k].cpu() for k in keys_of_interest} + for i in tqdm(range(B // maxbs + 1)): + sel = self.all_images[slice(i * maxbs, (i + 1) * maxbs)] + if sel == []: + continue + all_imgs['img'] = torch.cat([im['img'] for im in sel]) # [1,B,3,H,W] + all_imgs['true_shape'] = np.concatenate([im['true_shape'] for im in sel]) # [1,B,3,H,W] + pred, _ = forward_must3r(self.model, + [all_imgs], + self.memory, + render=True, + device=self.device) + all_preds.append(keys_of_interest_to_cpu(pred[0])) + res = {} + def cat_pred(k): return torch.cat([pp[k] for pp in all_preds], dim=1) + for kk in keys_of_interest: + res[kk] = cat_pred(kk) + + focal = mean_focal(self.agents[0].seq_focals) + c2w, _ = get_camera_pose(res, focal, HW=all_imgs['true_shape'][0], is_first_frame=False) + + self.all_pts3d = res['pts3d'] + self.all_poses = [cc for cc in c2w] + + @torch.no_grad() + def __call__(self, img, frame_id, cam_id): + query_view_prep, to_orig_focal = preproc_frame(img, frame_id, res=self.res, transform=self.transform) + + if self.memory is not None and len(self.all_images) < self.num_init_frames: + # we have not reached the correct initialization, reset memory + other_init_images = self.all_images + frame_ids = self.all_timestamps + self.reset() + self.all_images = 
other_init_images.copy() + else: + other_init_images = [] + frame_ids = [] + + if self.rerender or (len(self.all_images) < self.num_init_frames): + self.all_images.append(query_view_prep) + + query_images = other_init_images + [query_view_prep] + frame_ids += [frame_id] + preds, newmem = forward_must3r(self.model, + query_images, + self.memory, + device=self.device) + + for query_view_prep, pred, frame_id in zip(query_images, preds, frame_ids): + HW = query_view_prep['true_shape'][0] + selpts3d, pts3d, colors, depth, conf, focal, w2c, cam_center, iskeyframe = self.agents[cam_id].update(query_view_prep, + pred, + self.memory is None, # is first frame + overlap_mode=self.overlap_mode, + overlap_tree=self.overlap_tree, + kf_x_subsamp=self.kf_x_subsamp, + keyframe_overlap_thr=self.keyframe_overlap_thr, + min_conf_keyframe=self.min_conf_keyframe, + overlap_percentile=self.overlap_percentile, + to_orig_focal=to_orig_focal) + self.all_timestamps.append(frame_id) + self.all_poses.append(w2c.inverse()) + self.all_confs.append(conf.mean()) + + if iskeyframe: + self.memory = newmem + self.keyframes.append(frame_id) + if self.overlap_tree is not None: + self.overlap_tree.add_pts(selpts3d, cam_center=cam_center) + if self.keep_memory: + self.keyframe_pointmaps.append([pts3d.cpu(), colors.cpu(), conf.cpu()]) + + return pts3d.cpu().numpy(), colors.cpu().numpy(), depth, conf, focal, w2c, HW, iskeyframe diff --git a/must3r/slam/nns.py b/must3r/slam/nns.py new file mode 100644 index 0000000000000000000000000000000000000000..d969c41a9cfcf1802cbfbe5e015a38977de61042 --- /dev/null +++ b/must3r/slam/nns.py @@ -0,0 +1,92 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +from scipy.spatial import KDTree +import numpy as np +from functools import partial + +from must3r.slam.tools import get_quadrant_id, ravel3d, to_np + + +def get_searcher(method, isquadrant=False): + if 'quadrant_x' in method and not isquadrant: + out = QuandrantSearcher(method) + elif "kdtree-scipy" in method: + out = KDTree_scipy() + elif method == 'none': + out = None + else: + raise ValueError(f"Unknown searcher method {method}") + + return out + + +class Base_NN(): + """ + Base NN class, that can add observations to the search struct and query points from it + IO is torch Tensors + """ + + def __init__(self, subsamp=None): + self.subsamp = subsamp + + def add_pts(self, pts, **kw): # add [N, 3] db pts + # Add points to the existing search structure + raise NotImplementedError("Overload this function for your needs") + + def query(self, pts, **kw): # [N, 3] query pts + # Query 3D points + raise NotImplementedError("Overload this function for your needs") + + +class KDTree_scipy(Base_NN): + """ + Simple KDTree from scipy + """ + + def __init__(self): + super().__init__() + self.all_points = [] + self.kdtree = None + + def add_pts(self, pts, **kw): + if len(self.all_points) == 0: + self.all_points = ravel3d(pts) + else: + self.all_points = np.concatenate([self.all_points, ravel3d(pts)]) + self.kdtree = KDTree(self.all_points) + + def query(self, pts, **kw): + if self.kdtree is None: + distances = np.full(pts.shape[0], np.inf) + else: + distances, indices = self.kdtree.query(ravel3d(pts), k=1, workers=4) + return distances + + +class QuandrantSearcher(Base_NN): + """ + Split the view direction space in quadrants to account for ray directions (e.g. 
visibility) while querying + Simply one query struct per quadrant + """ + + def __init__(self, method): + super().__init__() + # split the rotation sphere into 2N**2 regular quadrants + self.quadrant_divider = int(method.split('quadrant_x')[-1].split('-')[0]) + + self.all_points = [[] for _ in range(2 * self.quadrant_divider**2)] + self.search_structs = [get_searcher(method, isquadrant=True) for _ in range(2 * self.quadrant_divider**2)] + self.get_quadrant_id = partial(get_quadrant_id, quadrant_divider=self.quadrant_divider) + + def add_pts(self, pts, cam_center, **kw): + quadrant_id = self.get_quadrant_id(to_np(pts - cam_center[None])) + for quad in np.unique(quadrant_id): + idx = quadrant_id == quad + self.search_structs[quad].add_pts(pts[idx]) + + def query(self, pts, cam_center, **kw): + quadrant_id = self.get_quadrant_id(to_np(pts - cam_center[None])) + dists = np.zeros(pts.shape[0]) + for quad in np.unique(quadrant_id): + idx = quadrant_id == quad + dists[idx] = self.search_structs[quad].query(pts[idx]) + return dists diff --git a/must3r/slam/slam.py b/must3r/slam/slam.py new file mode 100644 index 0000000000000000000000000000000000000000..e1fec65786f26bd7d3582144869921a59776ad26 --- /dev/null +++ b/must3r/slam/slam.py @@ -0,0 +1,693 @@ +#!/usr/bin/env python3 +# Copyright (C) 2025-present Naver Corporation. All rights reserved. +import numpy as np +import torch +import cv2 +import time +import os +from tqdm import tqdm +import logging as log +import argparse +import threading +from concurrent.futures import ThreadPoolExecutor +import open3d as o3d +import open3d.visualization.gui as gui +import open3d.visualization.rendering as rendering + +from must3r.model.blocks.attention import has_xformers, toggle_memory_efficient_attention +from must3r.slam.data import AutoMultiLoader +from must3r.slam.model import SLAM_MUSt3R + +try: + o3d.cuda +except AttributeError: + print('Fallback to open3d.cpu') + o3d.cuda = o3d.cpu # workaround for module open3d has no attribute cuda + + +MB = 1024.**2 +camcols = [ # different frustrum colors for each agent + [.1, .1, .9], # blue + [1., .5, 0.], # orange + [.5, 0., .5], # purple + [0., 1., 1.], # cyan +] + +SKIP_EVERY = 1 + +def grab_frame(camera): + read = camera.read() + frame = read[1] + camid = 0 if len(read) != 3 else read[2] + + for _ in range(SKIP_EVERY - 1): camera.grab() + + img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) if frame is not None else None + return img, camid + +def img2o3d(im): + res = o3d.cuda.pybind.geometry.Image(im.astype(np.uint8)) + return res + +def colorize_depth(depth, mode='grayscale'): + if depth is None: + return depth + colored_depth = None + if mode == 'grayscale': + mind, maxd = depth.min(), depth.max() + depth = 255. * (depth - mind) / (maxd - mind + 1e-9) + colored_depth = torch.stack([depth, depth, depth], dim=-1) + elif mode == 'conf': + colored_depth = depth - 1.0 + else: + raise ValueError(f"Unknown colorization mode {mode}.") + return colored_depth.cpu().numpy() + +# Open3D classes +# Processing +class PipelineModel: + """Controls IO. Methods run + in worker threads.""" + + def __init__(self, + model, + camera, + update_view, + device=None, + res=512, + show_cameras=True, + chunk=-1, # -1 means no chunking + chunking_overlap=4, + viz_conf=2.5, # conf thresh for pts3d viz + ): + """Initialize. + Args: + update_view (callback): Callback to update display elements for a + frame. + device (str): Compute device (e.g.: 'cpu:0' or 'cuda:0'). 
+ res: maxdim of the images in pixels + show_camera: display camera locations with the 3D model + chunk: chunk size for keyframe chunking (split sequence memory to + redefine origin as the frame number augments since MUSt3R can hardly go above 50 keyframes) + chunking_overlap : when creating a new memory chunk, how many images of the previous one should be used + """ + self.chunk = chunk + self.chunking_overlap = chunking_overlap + self.res = res + self.show_cameras = show_cameras + self.viz_conf = viz_conf + self.update_view = update_view + if device: + self.device = device.lower() + else: + self.device = 'cuda:0' if o3d.core.cuda.is_available() else 'cpu:0' + self.o3d_device = o3d.core.Device(self.device) + + self.cv_capture = threading.Condition() # condition variable + self.query_view = None + self.must3r = model + self.camera = camera + self.depth_in_color = None + + self.pcd_stride = 2 # downsample point cloud, may increase frame rate + self.flag_start = False + + self.keyframes_data = [] + self.keyframe_focals = [] + self.keyframe_confs = [] + + self.pcd_frame = None + self.rgbd_frame = None + self.executor = ThreadPoolExecutor(max_workers=3, + thread_name_prefix='Process') + self.flag_exit = False + + self.cache = {} + + @property + def max_points(self): + return 10 * self.res**2 + + def run(self): + """Run pipeline.""" + frame_id = 0 + t1 = time.perf_counter() + cam_centers = [] + memory_map=None + while not self.flag_exit: + if not self.flag_start: + if self.query_view is not None: + # Reset camera and memory + self.query_view = None + self.keyframes_data = [] + self.must3r.reset() + frame_id = 0 + cam_centers = [] + self.camera.set(cv2.CAP_PROP_POS_FRAMES, 0) + else: + self.query_view, camid = grab_frame(self.camera) + if self.query_view is None: + # print("End of video file, waiting...") + continue + + # Preproc, Forward, Postproc + pts3d, colors, depth, conf, focal, w2c, HW, iskeyframe = self.must3r(self.query_view, frame_id, camid) + c2w = w2c.inverse() + cam_centers.append(c2w[:3, -1]) + # Conf thr + msk = conf > self.viz_conf + pts3d = pts3d[msk.cpu()] + colors = colors[0, 0, msk.cpu()] + if iskeyframe: + self.keyframe_focals.append(focal) + self.keyframe_confs.append(conf.mean().cpu()) + self.depth_in_color = colorize_depth(depth) + self.conf_in_color = colorize_depth(conf) + dtype = o3d.core.float32 + self.pcd_frame = None + self.frustrum = None + if pts3d.shape[0] != 0: + self.pcd_frame = o3d.cuda.pybind.t.geometry.PointCloud() + self.pcd_frame.point.positions = o3d.cuda.pybind.core.Tensor(pts3d, dtype=dtype) + self.pcd_frame.point.colors = o3d.cuda.pybind.core.Tensor(colors, dtype=dtype) + + if self.show_cameras: + H, W = HW + K = np.eye(3) + K[0, 0] = K[1, 1] = focal + K[0, -1] = W / 2 + K[1, -1] = H / 2 + self.frustrum = o3d.geometry.LineSet.create_camera_visualization( + W, H, intrinsic=K, extrinsic=w2c.cpu().numpy(), scale=0.075) + self.frustrum.paint_uniform_color([0.1, 0.9, 0.1] if iskeyframe else camcols[camid % len(camcols)]) + + if iskeyframe: + # Move Pointmap and camera to keyframes data + self.keyframes_data.append([f'{frame_id}_kpcd', self.pcd_frame]) + self.keyframes_data.append([f'{frame_id}_kfrustrum', self.frustrum]) + self.pcd_frame = None + self.frustrum = None + + t0, t1 = t1, time.perf_counter() + ms_per_frame = (t1 - t0) * 1000. 
+ fps = 1000 / ms_per_frame + max_mem = torch.cuda.max_memory_allocated() / MB + if frame_id % 60 == 0 and frame_id > 0: + t0, t1 = t1, time.perf_counter() + print(f"frame_id = {frame_id}," + f"\t{fps:0.2f} fps" + f"\t{ms_per_frame:0.2f} ms/frame") + + # Prepare camera centers to display trajectory + tempcamc = torch.stack(cam_centers).cpu().numpy() + camc_frame = o3d.cuda.pybind.t.geometry.PointCloud() + camc_frame.point.positions = o3d.cuda.pybind.core.Tensor(tempcamc, dtype=dtype) + camc_frame.point.colors = o3d.cuda.pybind.core.Tensor(np.zeros_like(tempcamc), dtype=dtype) + + # Prepare memory map if needed + if frame_id == 0: + mmap = self.must3r.fetch_memory_map(self.viz_conf) + if mmap is not None : # only load memory map at first frame + mempts, memcols = mmap + memory_map = o3d.cuda.pybind.t.geometry.PointCloud() + memory_map.point.positions = o3d.cuda.pybind.core.Tensor(mempts.cpu().numpy(), dtype=dtype) + memory_map.point.colors = o3d.cuda.pybind.core.Tensor(memcols.cpu().numpy(), dtype=dtype) + else: + memory_map = None + + focal_el = self.must3r.get_true_focals()[camid] + if isinstance(focal_el, list): + focal_el = focal_el[-1] + frame_elements = { # creating the entire window (3 images on the right and pcd on main left) + 'color': self.query_view, + 'depth': self.depth_in_color, + 'conf': self.conf_in_color, + 'pcd': self.pcd_frame, + 'cam_centers': camc_frame, + f'frustrum_{camid}': self.frustrum, + 'keyframes_data': self.keyframes_data, + 'c2w': c2w.cpu().numpy(), + 'mem': max_mem, + 'fps': fps, + 'focal': focal_el, + 'num_mem_frames': self.must3r.num_mem_frames, + 'memory_map': memory_map, + } + self.update_view(frame_elements) + + frame_id += 1 + + self.executor.shutdown() + print("Shutdown") + +# GUI and render + + +class PipelineView: + """Controls display and user interface. All methods must run in the main thread.""" + + def __init__(self, vfov=60, max_pcd_vertices=1 << 20, num_sources=1, **callbacks): + """Initialize. + Args: + vfov (float): Vertical field of view for the 3D scene. + max_pcd_vertices (int): Maximum point cloud vertices for which memory + is allocated. + callbacks (dict of kwargs): Callbacks provided by the controller + for various operations. 
+ """ + + self.vfov = vfov + self.max_pcd_vertices = max_pcd_vertices + + gui.Application.instance.initialize() + self.window = gui.Application.instance.create_window( + "MUSt3R || Online RGB Video Processing", 1620, 1080) + # Called on window layout (eg: resize) + self.window.set_on_layout(self.on_layout) + self.window.set_on_close(callbacks['on_window_close']) + + self.pcd_material = o3d.visualization.rendering.MaterialRecord() + self.pcd_material.shader = "defaultUnlit" # "defaultLit" + # Set n_pixels displayed for each 3D point, accounting for HiDPI scaling + self.pcd_material.point_size = int(4 * self.window.scaling) + + self.cam_material = o3d.visualization.rendering.MaterialRecord() + self.cam_material.shader = "unlitLine" + self.cam_material.line_width = 4 + + # 3D scene + self.pcdview = gui.SceneWidget() + self.window.add_child(self.pcdview) + self.pcdview.enable_scene_caching( + True) # makes UI _much_ more responsive + self.pcdview.scene = rendering.Open3DScene(self.window.renderer) + self.pcdview.scene.set_background([1, 1, 1, 1]) # White background + self.pcdview.scene.set_lighting( + rendering.Open3DScene.LightingProfile.SOFT_SHADOWS, [0, -6, 0]) + # Point cloud bounds, depends on the sensor range + self.pcd_bounds = o3d.geometry.AxisAlignedBoundingBox([-30, -30, -30], + [30, 30, 30]) + self.reset_view() # Initial view + em = self.window.theme.font_size / 2 + self.fps_panel = gui.Vert(em, gui.Margins(em, em, em, em)) + self.fps_panel.preferred_width = int(200 * self.window.scaling) + self.window.add_child(self.fps_panel) + self.fps = gui.Label("FPS: N/A") + self.fps_panel.add_child(self.fps) + self.mem = gui.Label("Mem: N/A") + self.fps_panel.add_child(self.mem) + self.focal = gui.Label("Focal: N/A") + self.fps_panel.add_child(self.focal) + self.num_mem_frames = gui.Label("Mem frames: N/A") + self.fps_panel.add_child(self.num_mem_frames) + + # Options panel + self.panel = gui.Vert(em, gui.Margins(em, em, em, em)) + self.panel.preferred_width = int(400 * self.window.scaling) + self.window.add_child(self.panel) + toggles = gui.Horiz(em) + self.panel.add_child(toggles) + + self.flag_followcam = True + self.toggle_followcam = gui.ToggleSwitch("Follow Cam") + self.toggle_followcam.is_on = True + self.toggle_followcam.set_on_clicked( + callbacks['on_toggle_followcam']) # callback + toggles.add_child(self.toggle_followcam) + + self.flag_start = False + self.toggle_start = gui.ToggleSwitch("Start/Stop") + self.toggle_start.is_on = False + self.toggle_start.set_on_clicked( + callbacks['on_toggle_start']) # callback + toggles.add_child(self.toggle_start) + + view_buttons = gui.Horiz(em) + self.panel.add_child(view_buttons) + view_buttons.add_stretch() # for centering + reset_view = gui.Button("Reset View") + reset_view.set_on_clicked(self.reset_view) # callback + view_buttons.add_child(reset_view) + + self.current_view_viz = 0 + self.num_sources = num_sources + if self.num_sources > 1: + self.current_view = gui.Button("Next agent") + self.current_view.set_on_clicked( + self.next_view) # callback + view_buttons.add_child(self.current_view) + + view_buttons.add_stretch() # for centering + self.video_size = (int(240 * self.window.scaling), + int(320 * self.window.scaling), 3) + + # Camera stream + self.show_color = gui.CollapsableVert("Video stream") + self.show_color.set_is_open(True) + self.panel.add_child(self.show_color) + self.color_video = gui.ImageWidget( + o3d.geometry.Image(np.zeros(self.video_size, dtype=np.uint8))) + self.show_color.add_child(self.color_video) + + # Show pred 
depth + self.show_depth = gui.CollapsableVert("Predicted Depth") + self.show_depth.set_is_open(True) + self.panel.add_child(self.show_depth) + self.depth_video = gui.ImageWidget( + o3d.geometry.Image(np.zeros(self.video_size, dtype=np.uint8))) + self.show_depth.add_child(self.depth_video) + + # Show pred depth + self.show_conf = gui.CollapsableVert("Predicted Confidence") + self.show_conf.set_is_open(True) + self.panel.add_child(self.show_conf) + self.conf_video = gui.ImageWidget( + o3d.geometry.Image(np.zeros(self.video_size, dtype=np.uint8))) + self.show_conf.add_child(self.conf_video) + + self.status_message = gui.Label("") + self.panel.add_child(self.status_message) + + self.flag_exit = False + self.flag_gui_init = False + self.flag_normals = False + + def next_view(self): + self.current_view_viz = (self.current_view_viz + 1) % self.num_sources + + def update(self, frame_elements): + """Update visualization with point cloud and images. Must run in main + thread since this makes GUI calls. + Args: + frame_elements: dict {element_type: geometry element}. + Dictionary of element types to geometry elements to be updated + in the GUI: + 'pcd': point cloud, + 'color': rgb image (3 channel, uint8), + 'depth': depth image (uint8), + 'status_message': message + """ + if not self.flag_gui_init: + self.pcdview.scene.clear_geometry() + # Set dummy point cloud to allocate graphics memory + dummy_pcd = o3d.t.geometry.PointCloud({ + 'positions': + o3d.core.Tensor.zeros((self.max_pcd_vertices, 3), + o3d.core.Dtype.Float32), + 'colors': + o3d.core.Tensor.zeros((self.max_pcd_vertices, 3), + o3d.core.Dtype.Float32), + 'normals': + o3d.core.Tensor.zeros((self.max_pcd_vertices, 3), + o3d.core.Dtype.Float32) + }) + + # initialize camera + self.pcd_material.shader = "normals" if self.flag_normals else "defaultUnlit" # "defaultLit" + self.pcdview.scene.add_geometry('pcd', dummy_pcd, self.pcd_material) + self.pcdview.scene.add_geometry('cam_centers', dummy_pcd, self.pcd_material) + self.pcdview.scene.add_geometry('memory_map', dummy_pcd, self.pcd_material) + + self.flag_gui_init = True + + update_flags = (rendering.Scene.UPDATE_POINTS_FLAG | + rendering.Scene.UPDATE_COLORS_FLAG | + (rendering.Scene.UPDATE_NORMALS_FLAG + if self.flag_normals else 0)) + + def add_or_update_if_needed(tag, data): + if data is not None: + always_remove = ['frustrum', 'cam_centers', 'memory_map'] + for toremove in always_remove: + if toremove in tag and self.pcdview.scene.has_geometry(tag): + self.pcdview.scene.remove_geometry(tag) + if self.pcdview.scene.has_geometry(tag): + + self.pcdview.scene.scene.update_geometry(tag, data, update_flags) + else: + material = self.cam_material if 'frustrum' in tag else self.pcd_material + self.pcdview.scene.add_geometry(tag, data, material) + + # Load memory map if present + if frame_elements.get('memory_map', None) is not None: + add_or_update_if_needed('memory_map', frame_elements['memory_map']) + + # update scene and cameras + + update_cam = False + for kk in frame_elements: + if 'frustrum' in kk: + update_cam = int(kk.split('_')[1]) == self.current_view_viz + add_or_update_if_needed(kk, frame_elements[kk]) + + add_or_update_if_needed('cam_centers', frame_elements['cam_centers']) + + for kf_key, kf_data in frame_elements['keyframes_data']: + add_or_update_if_needed(kf_key, kf_data) + # remove item from rendering queue + frame_elements['keyframes_data'] = None + + if update_cam: + # Update current pointcloud, color and depth images + add_or_update_if_needed('pcd', frame_elements['pcd']) + if 
self.show_color.get_is_open() and 'color' in frame_elements: + self.color_video.update_image(img2o3d(frame_elements['color'])) + if self.show_depth.get_is_open() and frame_elements.get('depth', None) is not None: + self.depth_video.update_image(img2o3d(frame_elements['depth'])) + if self.show_conf.get_is_open() and frame_elements.get('conf', None) is not None: + self.conf_video.update_image(img2o3d(frame_elements['conf'])) + if 'focal' in frame_elements: + self.focal.text = "Focal: " + f"{frame_elements['focal']:0.2f}" + + if self.flag_followcam: + self.reset_view(pose=frame_elements['c2w']) + + if 'status_message' in frame_elements: + self.status_message.text = frame_elements["status_message"] + if 'fps' in frame_elements: + self.fps.text = "FPS: " + f"{frame_elements['fps']:0.2f}" + if 'mem' in frame_elements: + self.mem.text = "Mem: " + str(int(frame_elements["mem"])) + " MB" + if 'num_mem_frames' in frame_elements: + self.num_mem_frames.text = f"Mem frames: {frame_elements['num_mem_frames']}" + + self.pcdview.force_redraw() + + def reset_view(self, pose=None): + """Callback to reset point cloud view to init or cam pose if given""" + if pose is None: + self.pcdview.setup_camera(self.vfov, self.pcd_bounds, [0, 0, 0]) + self.pcdview.scene.camera.look_at([0, 0, 1.5], [0, 0, -2.], [0, -1, 0]) + else: + Rp = pose[:3, :3].T + center = pose[:3, -1] # look at the view camera center + eye = center + np.array([[0, -.6, -1.5]]) @ Rp # put GUI camera behind view and slightly above + up = np.array([[0, -1, 0]]) @ Rp # same orientation as input image + self.pcdview.scene.camera.look_at(center, eye[0], up[0]) + + def on_layout(self, layout_context): + # The on_layout callback should set the frame (position + size) of every + # child correctly. After the callback is done the window will layout + # the grandchildren. + """Callback on window initialize / resize""" + frame = self.window.content_rect + self.pcdview.frame = frame + panel_size = self.panel.calc_preferred_size(layout_context, + self.panel.Constraints()) + self.panel.frame = gui.Rect(frame.get_right() - panel_size.width, + frame.y, panel_size.width, + panel_size.height) + + fps_size = self.fps_panel.calc_preferred_size(layout_context, + self.fps_panel.Constraints()) + self.fps_panel.frame = gui.Rect(0, + frame.y, fps_size.width, + fps_size.height) + + +# Overall Controller +class PipelineController: + """Entry point for the app. Controls the PipelineModel object for IO and + processing and the PipelineView object for display and UI. All methods + operate on the main thread. + """ + + def __init__(self, args, camera): + self.pipeline_model = PipelineModel(args.model, + camera, + self.update_view, + device=args.device, + res=args.res, + show_cameras=not args.hide_cameras, + viz_conf=args.viz_conf, + # chunk = args.chunk, + # chunking_overlap = args.chunking_overlap, + ) + self.pipeline_view = PipelineView( + max_pcd_vertices=self.pipeline_model.max_points, + num_sources=len(args.input), + on_window_close=self.on_window_close, + on_toggle_followcam=self.on_toggle_followcam, + on_toggle_start=self.on_toggle_start) + + threading.Thread(name='PipelineModel', + target=self.pipeline_model.run).start() + + time.sleep(1) + gui.Application.instance.run() + + def update_view(self, frame_elements): + """Updates view with new data. May be called from any thread. + Args: + frame_elements (dict): Display elements (point cloud and images) + from the new frame to be shown. 
+ """ + gui.Application.instance.post_to_main_thread( + self.pipeline_view.window, + lambda: self.pipeline_view.update(frame_elements)) + + def on_toggle_followcam(self, is_enabled): + """Callback to toggle display of normals""" + self.pipeline_view.flag_followcam = is_enabled + + def on_toggle_start(self, is_enabled): + """Callback to start/stop MUSt3r""" + self.pipeline_model.flag_start = is_enabled + self.pipeline_view.flag_start = is_enabled + self.pipeline_view.flag_gui_init = False + + def on_window_close(self): + """Callback when the user closes the application window.""" + self.pipeline_model.flag_exit = True + with self.pipeline_model.cv_capture: + self.pipeline_model.cv_capture.notify_all() + return True # OK to close window + + +# MAIN +def main(): + log.basicConfig(level=log.INFO) + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('--chkpt', required=True, help='Path to checkpoint.') + parser.add_argument('--device', default='cuda:0', help='Device to run on (e.g. "cpu" or "cuda:0").') + parser.add_argument('--input', default='cam:0', nargs='+', + help="Media to load (can be paths to videos or webcam indices like 'cam:0 cam:1').") + parser.add_argument('--image_string', default=None, type=str, + help="In the case of an image collection, string to identify image files.") + parser.add_argument('--load_memory', default=None, type=str, help="Load memory from another run.") + parser.add_argument('--output', default=None, type=str, help="Output directory to write predictions") + + # Processing related opts + parser.add_argument('--res', default=224, choices=[224, 512], + type=int, help="Image resolution that works for the model used.") + parser.add_argument('--skip_every', default=1, type=int, help="Subsample input by skipping frames.") + parser.add_argument('--rerender', action='store_true', default=False, help="Rerender all frames at the end.") + parser.add_argument('--rerender_bs', default=64, type=int, help="Re-rendering batch size") + parser.add_argument('--filter', action='store_true', default=False, help="Minimal Laplacian filtering after rerender.") + + # Hyperparams + parser.add_argument('--searcher', default="kdtree-scipy-quadrant_x2", type=str, + help="Method for overlap prediction") # searcher='kdtree-scipy-quadrant_x2', # 'kdtree-scipy', + parser.add_argument('--overlap_mode', default="nn-norm", type=str, + help="How to estimate overlap") # overlap_mode='nn-norm', #'nn', + parser.add_argument('--subsamp', default=2, type=int) + # .15, # 3., # .15, ##3., #.05 for retrieval, # 2. - 3. for meanconf / median conf + parser.add_argument('--keyframe_overlap_thr', default=.1, type=float, + help="At least this overlap to add incoming image in memory") + parser.add_argument('--min_conf_keyframe', default=1.2, type=float, help="Ignore 3D points below this confidence.") + parser.add_argument('--overlap_percentile', default=85., type=float, + help="Percentile of image distances to compute overlap") + parser.add_argument('--varying_focals', action='store_true', default=False, + help="Focals may vary along sequence (e.g. 
zoom-in/out).") + + parser.add_argument('--force_first_keyframes', default=None, type=int) + parser.add_argument('--num_init_frames', default=2, type=int) + + # GUI related opts + parser.add_argument('--viz_conf', default=4., type=float, help="Conf threshold for pts3d vizu") + parser.add_argument('--gui', action='store_true', default=False, help="Show predictions in GUI") + parser.add_argument('--hide_cameras', action='store_true', default=False) + + args = parser.parse_args() + + toggle_memory_efficient_attention(has_xformers) + SKIP_EVERY = args.skip_every + args.model = SLAM_MUSt3R(chkpt=args.chkpt, + res=args.res, + kf_x_subsamp=args.subsamp, + searcher=args.searcher, + overlap_mode=args.overlap_mode, + keyframe_overlap_thr=args.keyframe_overlap_thr, + min_conf_keyframe=args.min_conf_keyframe, + overlap_percentile=args.overlap_percentile, + rerender=args.rerender, + keep_memory=args.output is not None, + load_memory=args.load_memory, + fixed_focal=not args.varying_focals, + num_agents=len(args.input), + device=args.device, + num_init_frames=args.num_init_frames, + ) + + # Prepare Camera Stream + CAMERA = AutoMultiLoader(args.input, args.image_string) + + # prepare output + if args.output is not None: + os.makedirs(args.output, exist_ok=True) + + if args.gui: + # Main GUI + PipelineController(args, CAMERA) + tolog = {} + else: + # Only write output + assert args.output is not None, "You should define an output folder" + print(f"Start processing sequence of {len(CAMERA)} frames") + frame, cam_id = grab_frame(CAMERA) + start = time.time() + imgHWs = [frame.shape[:2]] + for frame_id in tqdm(range(len(CAMERA) // SKIP_EVERY)): + out = args.model(frame, frame_id * SKIP_EVERY, cam_id) + frame, cam_id = grab_frame(CAMERA) + if frame is not None: + imgHWs.append(frame.shape[:2]) + + # Re-render if activated + if args.rerender: + args.model.rerender_all_frames(maxbs=args.rerender_bs) + + # Logging FPS and GPU mem usage + wallclock_time = time.time() - start + fps = (len(CAMERA) // SKIP_EVERY) / wallclock_time + gpumem = torch.cuda.max_memory_allocated() / MB + print(f"Done @{fps}fps on average using {gpumem}MB GPU Memory") + tolog = {'fps': fps, + 'gpumem': gpumem, + 'imgHWs': imgHWs, + } + + if args.output is not None: + # Write full trajectory + if not args.filter: + args.model.write_all_poses(os.path.join(args.output, 'all_poses.npz'), **tolog) + else: + # Postprocessing + filtering_mode = 'laplacian' + filtering_alpha = .1 + filtering_steps = 256 + outfile = os.path.join(args.output, f"all_poses{filtering_mode}_{filtering_steps}-steps_{filtering_alpha}-alpha.npz") + args.model.write_all_poses(outfile, + filtering_mode=filtering_mode, + filtering_steps=filtering_steps, + filtering_alpha=filtering_alpha, **tolog) + + # Export memory for later use + outname = os.path.join(args.output, "memory.pkl") + count = 0 + while args.load_memory == outname: # make sure you do not overwrite loaded memory file + outname = os.path.join(args.output, f"memory_{count}.pkl") + print(f"Dumping memory as {outname}") + args.model.save_memory(outname) diff --git a/must3r/slam/tools.py b/must3r/slam/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..f7b3a923b23d1b0f6d6745f5cd5b7a87396091eb --- /dev/null +++ b/must3r/slam/tools.py @@ -0,0 +1,63 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
+import numpy as np + + +def ravel3d(x): return x.view(-1, 3).cpu().numpy() +def to_np(x): return x.cpu().numpy() + + +def get_quadrant_id(rays, quadrant_divider=4, eps=1e-5): + # Simply split the rotation sphere into 2N**2 regularly spaced quadrants + # Input is ray directions in 3D + # turn them into spherical coordinates and quantize the theta/phis to get a 2D coord + # raveled into a single output index value + rays /= np.linalg.norm(rays, axis=-1, keepdims=True).clip(eps) + + # Spherical coordinates (r=1) + thetas = np.arccos(rays[:, -1]) / np.pi # acos(z)/pi in [0,1] + phis = np.arctan2(rays[:, 1], rays[:, 0]) / np.pi # atan(y,x)/pi in [-1,1] + + # Clip to prevent floating point errors + thetas = thetas.clip(eps, 1 - eps) + phis = phis.clip(-1 + eps, 1 - eps) + + # Quantize + theta_idx = np.floor(thetas * quadrant_divider).astype(int) # in [0, quadrant_divider] + phis_idx = np.floor(phis * quadrant_divider).astype(int) + quadrant_divider # in [0, 2*quadrant_divider] + + # turn the 2D quadrant coordinates into a 1D index + quadrant_index = theta_idx + phis_idx * quadrant_divider + + return quadrant_index.astype(int) + + +def laplacian_smoothing(trajectory, alpha=0.5, iterations=10): + smoothed_trajectory = trajectory.copy() + N = len(trajectory) + for _ in range(iterations): + for i in range(1, N - 1): # Exclude endpoints from smoothing + smoothed_trajectory[i] = (1 - alpha) * smoothed_trajectory[i] + (alpha / 2) * \ + (smoothed_trajectory[i - 1] + smoothed_trajectory[i + 1]) + return smoothed_trajectory + + +def laplacian_smoothing_with_confidence(trajectory, confidence, alpha=0.5, iterations=10): + smoothed_trajectory = trajectory.copy() + N = len(trajectory) + for _ in range(iterations): + for i in range(1, N - 1): # Exclude endpoints from smoothing + # Weighted average based on confidence + weight_self = (1 - alpha) * confidence[i] + weight_previous = alpha * (1 - confidence[i - 1]) / 2 + weight_next = alpha * (1 - confidence[i + 1]) / 2 + sumw = weight_self + weight_previous + weight_next + # normalize + weight_self /= sumw + weight_previous /= sumw + weight_next /= sumw + + smoothed_trajectory[i] = ( + weight_self * smoothed_trajectory[i] + + weight_previous * smoothed_trajectory[i - 1] + + weight_next * smoothed_trajectory[i + 1]) + return smoothed_trajectory \ No newline at end of file diff --git a/must3r/tools/__init__.py b/must3r/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..345ede2d5437f28f4929911c3614d5fe2caf828e --- /dev/null +++ b/must3r/tools/__init__.py @@ -0,0 +1 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. diff --git a/must3r/tools/geometry.py b/must3r/tools/geometry.py new file mode 100644 index 0000000000000000000000000000000000000000..0ec1259089f680a27c9ae0737d11739b87aa0a86 --- /dev/null +++ b/must3r/tools/geometry.py @@ -0,0 +1,84 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
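To make the trajectory smoothing in must3r/slam/tools.py above concrete: with alpha=0.5 and a single iteration, each interior point keeps half of its own position and takes a quarter from each neighbour, so a noisy middle point is pulled halfway towards the midpoint of its neighbours. A tiny worked example, illustrative only:

import numpy as np
from must3r.slam.tools import laplacian_smoothing

traj = np.array([[0., 0., 0.],
                 [1., 1., 0.],   # noisy middle point
                 [2., 0., 0.]])
out = laplacian_smoothing(traj, alpha=0.5, iterations=1)
# endpoints are untouched; out[1] = 0.5*[1,1,0] + 0.25*([0,0,0] + [2,0,0]) = [1.0, 0.5, 0.0]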
+import torch +import must3r.tools.path_to_dust3r # noqa +from dust3r.utils.misc import invalid_to_zeros, invalid_to_nans + + +def apply_log_to_norm(xyz, dim=-1): + d = xyz.norm(dim=dim, keepdim=True) + xyz = xyz / d.clip(min=1e-8) + xyz = xyz * torch.log1p(d) + return xyz + + +def apply_exp_to_norm(xyz, dim=-1): + d = xyz.norm(dim=dim, keepdim=True) + xyz = xyz / d.clip(min=1e-8) + xyz = xyz * torch.expm1(d) + return xyz + + +def normalize_pointcloud(pts1, pts2, norm_mode='avg_dis', valid1=None, valid2=None, ret_factor=False): + """ renorm pointmaps pts1, pts2 with norm_mode, same as in dust3r but with a small fix + """ + assert pts1.ndim >= 3 and pts1.shape[-1] == 3 + assert pts2 is None or (pts2.ndim >= 3 and pts2.shape[-1] == 3) + norm_mode, dis_mode = norm_mode.split('_') + + if norm_mode == 'avg': + # gather all points together (joint normalization) + nan_pts1, nnz1 = invalid_to_zeros(pts1, valid1, ndim=3) + nan_pts2, nnz2 = invalid_to_zeros(pts2, valid2, ndim=3) if pts2 is not None else (None, 0) + all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1 + + # compute distance to origin + all_dis = all_pts.norm(dim=-1) + if dis_mode == 'dis': + pass # do nothing + elif dis_mode == 'log1p': + all_dis = torch.log1p(all_dis) + elif dis_mode == 'warp-log1p': + # actually warp input points before normalizing them + log_dis = torch.log1p(all_dis) + warp_factor = log_dis / all_dis.clip(min=1e-8) + H1, W1 = pts1.shape[1:-1] + pts1 = pts1 * warp_factor[:, :W1 * H1].view(-1, H1, W1, 1) + if pts2 is not None: + H2, W2 = pts2.shape[1:-1] + pts2 = pts2 * warp_factor[:, W1 * H1:].view(-1, H2, W2, 1) + all_dis = log_dis # this is their true distance afterwards + else: + raise ValueError(f'bad {dis_mode=}') + + norm_factor = all_dis.sum(dim=1) / (nnz1 + nnz2 + 1e-8) + else: + # gather all points together (joint normalization) + nan_pts1 = invalid_to_nans(pts1, valid1, ndim=3) + nan_pts2 = invalid_to_nans(pts2, valid2, ndim=3) if pts2 is not None else None + all_pts = torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1 + + # compute distance to origin + all_dis = all_pts.norm(dim=-1) + + if norm_mode == 'avg': + norm_factor = all_dis.nanmean(dim=1) + elif norm_mode == 'median': + norm_factor = all_dis.nanmedian(dim=1).values.detach() + elif norm_mode == 'sqrt': + norm_factor = all_dis.sqrt().nanmean(dim=1)**2 + else: + raise ValueError(f'bad {norm_mode=}') + + norm_factor = norm_factor.clip(min=1e-8) + while norm_factor.ndim < pts1.ndim: + norm_factor.unsqueeze_(-1) + + res = pts1 / norm_factor + if pts2 is not None: + res = (res, pts2 / norm_factor) + if ret_factor: + if not isinstance(res, tuple): + res = (res, norm_factor) + else: + res = res + (norm_factor,) + return res diff --git a/must3r/tools/image.py b/must3r/tools/image.py new file mode 100644 index 0000000000000000000000000000000000000000..68022c3bda5fbead1caf2bf2f88e6ff54e199a0b --- /dev/null +++ b/must3r/tools/image.py @@ -0,0 +1,118 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
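A small sanity check for the two norm-warping helpers at the top of must3r/tools/geometry.py above, illustrative only: apply_exp_to_norm inverts apply_log_to_norm, since both preserve direction and only remap the vector norm through log1p / expm1.

import torch
from must3r.tools.geometry import apply_log_to_norm, apply_exp_to_norm

xyz = torch.randn(2, 4, 3)
warped = apply_log_to_norm(xyz)       # same directions, norms become log1p(|xyz|)
restored = apply_exp_to_norm(warped)  # norms mapped back with expm1
assert torch.allclose(restored, xyz, atol=1e-5)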
+import os +import PIL.Image +import torch.nn.functional as F +import numpy as np +import torchvision.transforms as tvf + + +def unpatchify(x, patch_size, true_shape): + B = x.shape[0] + H, W = true_shape + x = x.transpose(-1, -2).view(B, -1, H // patch_size, W // patch_size) + x = F.pixel_shuffle(x, patch_size) # B,channels,H,W + return x + + +ratios_resolutions = { + 224: {1.0: [224, 224]}, + 336: {1.0: [336, 336]}, + 384: {4 / 3: [384, 288], 3 / 2: [384, 256], 2 / 1: [384, 192], 3 / 1: [384, 128]}, + 448: {1.0: [448, 448]}, + 512: {4 / 3: [512, 384], 32 / 21: [512, 336], 16 / 9: [512, 288], 2 / 1: [512, 256], 16 / 5: [512, 160]}, + 768: {4 / 3: [768, 576], 3 / 2: [768, 512], 16 / 9: [768, 432], 2 / 1: [768, 384], 16 / 5: [768, 240]}, +} + + +def get_HW_resolution(H, W, maxdim, patchsize=16): + if isinstance(maxdim, int): + assert maxdim in ratios_resolutions, f"Error, {maxdim=} not implemented yet." + ratios_resolutions_maxdim = maxdim if isinstance(maxdim, dict) else ratios_resolutions[maxdim] + mindims = set([min(res) for res in ratios_resolutions_maxdim.values()]) + ratio = W / H + ref_ratios = np.array([*(ratios_resolutions_maxdim.keys())]) + islandscape = (W >= H) + if islandscape: + diff = np.abs(ratio - ref_ratios) + else: + diff = np.abs(ratio - (1 / ref_ratios)) + selkey = ref_ratios[np.argmin(diff)] + res = ratios_resolutions_maxdim[selkey] + # check patchsize and make sure output resolution is a multiple of patchsize + if isinstance(patchsize, tuple): + assert len(patchsize) == 2 and isinstance(patchsize[0], int) and isinstance( + patchsize[1], int), "What is your patchsize format? Expected a single int or a tuple of two ints." + assert patchsize[0] == patchsize[1], "Error, non square patches not managed" + patchsize = patchsize[0] + if isinstance(maxdim, int): + assert max(res) == maxdim + assert min(res) in mindims + return res[::-1] if islandscape else res # return HW + + +def get_resize_function(maxdim, patch_size, H, W, is_mask=False): + resolutions_dict = maxdim if isinstance(maxdim, dict) else ratios_resolutions[maxdim] + if [max(H, W), min(H, W)] in resolutions_dict.values(): + return lambda x: x, np.eye(3), np.eye(3) + else: + target_HW = get_HW_resolution(H, W, maxdim=maxdim, patchsize=patch_size) + + ratio = W / H + target_ratio = target_HW[1] / target_HW[0] + to_orig_crop = np.eye(3) + to_rescaled_crop = np.eye(3) + if abs(ratio - target_ratio) < np.finfo(np.float32).eps: + crop_W = W + crop_H = H + elif ratio - target_ratio < 0: + crop_W = W + crop_H = int(W / target_ratio) + to_orig_crop[1, 2] = (H - crop_H) / 2.0 + to_rescaled_crop[1, 2] = -(H - crop_H) / 2.0 + else: + crop_W = int(H * target_ratio) + crop_H = H + to_orig_crop[0, 2] = (W - crop_W) / 2.0 + to_rescaled_crop[0, 2] = - (W - crop_W) / 2.0 + + crop_op = tvf.CenterCrop([crop_H, crop_W]) + + if is_mask: + resize_op = tvf.Resize(size=target_HW, interpolation=tvf.InterpolationMode.NEAREST_EXACT) + else: + resize_op = tvf.Resize(size=target_HW) + to_orig_resize = np.array([[crop_W / target_HW[1], 0, 0], + [0, crop_H / target_HW[0], 0], + [0, 0, 1]]) + to_rescaled_resize = np.array([[target_HW[1] / crop_W, 0, 0], + [0, target_HW[0] / crop_H, 0], + [0, 0, 1]]) + + op = tvf.Compose([crop_op, resize_op]) + + return op, to_rescaled_resize @ to_rescaled_crop, to_orig_crop @ to_orig_resize + + +def is_image_extension_known_by_pil(file_path): + """ + Returns True if the file has a “known” image extension according to PIL. + Does NOT open the file—it only inspects the extension. 
+ """ + _, ext = os.path.splitext(file_path) + valids_exts = PIL.Image.registered_extensions() + return ext.lower() in valids_exts + + +def is_valid_pil_image_file(file_path): + """ + First checks extension, then tries to open/verify the file. + """ + if not is_image_extension_known_by_pil(file_path): + return False + + try: + with PIL.Image.open(file_path) as img: + img.verify() # Verify that it’s not truncated/corrupt + return True + except (PIL.UnidentifiedImageError, IOError): + return False diff --git a/must3r/tools/path_to_dust3r.py b/must3r/tools/path_to_dust3r.py new file mode 100644 index 0000000000000000000000000000000000000000..dde9ccd651dc2606b3fcce1f718881e6e4d56636 --- /dev/null +++ b/must3r/tools/path_to_dust3r.py @@ -0,0 +1,21 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# +# -------------------------------------------------------- +# dust3r submodule import +# -------------------------------------------------------- + +import sys +import os.path as path +HERE_PATH = path.normpath(path.dirname(__file__)) +DUSt3R_REPO_PATH = path.normpath(path.join(HERE_PATH, '../../dust3r')) +DUSt3R_LIB_PATH = path.join(DUSt3R_REPO_PATH, 'dust3r') +# check the presence of models directory in repo to be sure its cloned +if path.isdir(DUSt3R_LIB_PATH): + # workaround for sibling import + sys.path.insert(0, DUSt3R_REPO_PATH) +else: + try: + from dust3r.utils.geometry import geotrf # check if dust3r is installed as a module + except ImportError as e: + raise ImportError(f"dust3r is not initialized, could not find: {DUSt3R_LIB_PATH}.\n " + "Did you forget to run 'git submodule update --init --recursive' ?") diff --git a/must3r/tools/ply_io.py b/must3r/tools/ply_io.py new file mode 100644 index 0000000000000000000000000000000000000000..07685fe0c3ebbc848b86dc4ee6718f8a421f57a8 --- /dev/null +++ b/must3r/tools/ply_io.py @@ -0,0 +1,155 @@ +# Copyright (C) 2025-present Naver Corporation. All rights reserved. 
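+#
+# Minimal PLY point-cloud I/O helpers built on the optional `plyfile` dependency
+# (the import is wrapped in try/except, so these functions require `plyfile` at call time).
+#
+# Illustrative usage sketch (file names and shapes are assumptions for the example):
+#   pts, colors = ply_read("scene.ply")               # [N, 3] xyz and [N, 3] RGB (0..255)
+#   exportPointsPLY(pts, outname="/tmp/points.ply")   # geometry only
+#   exportColoredPointsPLY(np.concatenate([pts, colors], axis=-1), "/tmp/colored.ply")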
+import numpy as np +import os +import torch +try: + from plyfile import PlyData, PlyElement +except ImportError: + pass + + +def ply_read(ply_file): + filedata = PlyData.read(ply_file) + # Recover points 3D coordinates and colors from ply file + return elements2colorpoints(filedata) + + +def elements2points(filedata): + # Recover points 3D coordinates from elements (extracted from ply file) + points = np.stack(filedata['vertex']) + x_coords = points['x'] + y_coords = points['y'] + z_coords = points['z'] + + # return 3D points format + return np.transpose(np.squeeze(np.stack([x_coords, y_coords, z_coords]))) + + +def elements2colorpoints(filedata): + # Recover points 3D coordinates from elements (extracted from ply file) + points = np.stack(filedata['vertex']) + x_coords = points['x'] + y_coords = points['y'] + z_coords = points['z'] + + try: + r_colors = points['red'] + except: + r_colors = 255 * np.ones_like(x_coords) + + try: + g_colors = points['green'] + except: + g_colors = 255 * np.ones_like(x_coords) + + try: + b_colors = points['blue'] + except: + b_colors = 255 * np.ones_like(x_coords) + + # return 3D points format + return [np.transpose(np.squeeze(np.stack([x_coords, y_coords, z_coords]))), np.transpose(np.squeeze(np.stack([r_colors, g_colors, b_colors])))] + + +def elements2pointsfaces(filedata): + # Recover points 3D coordinates from elements (extracted from ply file) + points = np.stack(filedata['vertex']) + x_coords = points['x'] + y_coords = points['y'] + z_coords = points['z'] + + faces = np.stack(filedata['face']['vertex_indices']) + + # return 3D points format + faces (containing points indices) + return [np.transpose(np.squeeze(np.stack([x_coords, y_coords, z_coords]))), faces] + + +def exportPointsPLY(vertices, outname="/tmp/test.ply"): # color=[255,255,255], + """ Export 3D point cloud in ply format. Input: [N,3]""" + auto_create_f(outname) + vertices = vertices.detach().cpu() if isinstance(vertices, torch.Tensor) else torch.tensor(vertices) + B = vertices.shape[0] + x, y, z = vertices.T.numpy() + + # connect the proper data structures + vertices = np.empty(B, dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')]) + vertices['x'] = x.astype('f4') + vertices['y'] = y.astype('f4') + vertices['z'] = z.astype('f4') + + final_elements = PlyElement.describe(vertices, 'vertex') + PlyData([final_elements]).write(outname) + # reformatted_vs = [(r[0],r[1],r[2],0,0,-1.0,int(color[0]),int(color[1]),int(color[2])) for r in vertices] + # desc_vertices = np.asarray(reformatted_vs,dtype=[('x', 'f4'), ('y', 'f4'),('z', 'f4'), + # ('nx', 'f4'), ('ny', 'f4'),('nz', 'f4'), + # ('red', 'u1'),('green', 'u1'),('blue', 'u1')]) + # final_elements = PlyElement.describe(desc_vertices, 'vertex') + # PlyData([final_elements]).write(outname) + + +def exportColoredPointsPLY(colored_vertices, outname="/tmp/test.ply", autocolors=False): + """ Export 3D point cloud in ply format. Input: [N,6]""" + + auto_create_f(outname) + + if autocolors: + if colored_vertices.shape[-1] != 3: + print(f"Warning: AutoColor overwriting input colors in exportColoredPointsPLY(...) 
") + colored_vertices = autoselfcolor(colored_vertices) + + colored_vertices = colored_vertices.detach().cpu() if isinstance( + colored_vertices, torch.Tensor) else torch.tensor(colored_vertices) + B = colored_vertices.shape[0] + x, y, z, red, green, blue = colored_vertices.T.numpy() + + # connect the proper data structures + vertices = np.empty(B, dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1')]) + vertices['x'] = x.astype('f4') + vertices['y'] = y.astype('f4') + vertices['z'] = z.astype('f4') + vertices['red'] = red.astype('u1') + vertices['green'] = green.astype('u1') + vertices['blue'] = blue.astype('u1') + final_elements = PlyElement.describe(vertices, 'vertex') + PlyData([final_elements]).write(outname) + + # reformatted_vs = [(r[0],r[1],r[2],0,0,-1.0,int(r[3]),int(r[4]),int(r[5])) for r in colored_vertices] + # desc_vertices = np.asarray(reformatted_vs,dtype=[('x', 'f4'), ('y', 'f4'),('z', 'f4'), + # ('nx', 'f4'), ('ny', 'f4'),('nz', 'f4'), + # ('red', 'u1'),('green', 'u1'),('blue', 'u1')]) + # final_elements = PlyElement.describe(desc_vertices, 'vertex') + # PlyData([final_elements]).write(outname) + + +def exportRaysPLY(in_rays, in_centers, outfile, n_interpol=50): # vizualisation debug + auto_create_f(outfile) + if np.shape(in_rays) != np.shape(in_centers): + print("Error, shape mismatch") + return + out_points = [] + for r_count, ray in enumerate(in_rays): + for i in range(n_interpol): + t_r = np.array(ray) + c_r = np.array(in_centers[r_count]) + out_points.append(float(i / n_interpol) * t_r + float((n_interpol - i) / n_interpol) * c_r) + exportPointsPLY(out_points, outfile) + + +def debugExportRays(rays, centers, name, interpols=20): + auto_create_f(name) + outpoints = [[0.0, 0.0, 0.0] for _ in range(interpols * np.shape(rays)[0])] + for ii, ray in enumerate(rays): + for interp in range(interpols): + outpoints[ii * interpols + interp] = interp / interpols * centers[ii] + (1.0 - interp / interpols) * ray + print("outpoints: ", np.shape(outpoints)) + exportPointsPLY(outpoints, name) + + +# Utils funcs +def autoselfcolor(verts): + return np.concatenate([verts, verts.clip(0, 1) * 255], axis=-1) + + +def auto_create_f(path): + os.makedirs(os.path.dirname(path), exist_ok=True) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..89eae1c3f9385ca256e516d84b9e7baf9df52ef3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +cython +pyaml +scikit-learn +open3d +viser>=1.0.0 +gradio>=5.0.0 \ No newline at end of file diff --git a/sam2-src b/sam2-src new file mode 160000 index 0000000000000000000000000000000000000000..2b90b9f5ceec907a1c18123530e92e794ad901a4 --- /dev/null +++ b/sam2-src @@ -0,0 +1 @@ +Subproject commit 2b90b9f5ceec907a1c18123530e92e794ad901a4 diff --git a/training_utils.py b/training_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c95a4f31d9043c37fa16a051e62c73d9a311f780 --- /dev/null +++ b/training_utils.py @@ -0,0 +1,256 @@ +from must3r.model import ActivationType, apply_activation +from dust3r.post_process import estimate_focal_knowing_depth +import torch +import random, math, roma +import torchvision.transforms.functional as TF +from tensordict import tensorclass +import torch.nn.functional as F + +def save_checkpoint(model: torch.nn.Module, path: str) -> None: + while True: + try: + torch.save(model.state_dict(), path) + break + except Exception as e: + print(e) + continue + +def load_checkpoint(model: torch.nn.Module, 
ckpt_state_dict_raw: dict, strict = False) -> torch.nn.Module:
+
+    try:
+        if strict:
+            model.load_state_dict(ckpt_state_dict_raw)
+        else:
+            model_dict = model.state_dict()
+            ckpt_state_dict = {k: v for k, v in ckpt_state_dict_raw.items() if k in model_dict and v.shape == model_dict[k].shape}
+            model_dict.update(ckpt_state_dict)
+            model.load_state_dict(model_dict)
+            print(f'The following keys are in ckpt but were not loaded: {set(ckpt_state_dict_raw.keys()) - set(ckpt_state_dict.keys())}')
+    except Exception as e:
+        print(e)
+    finally:
+        return model
+
+
+def random_color_jitter(vid, brightness, contrast, saturation, hue = None):
+    '''
+    vid of shape [num_frames, num_channels, height, width]
+    '''
+    assert vid.ndim == 4
+
+    if brightness > 0:
+        brightness_factor = random.uniform(1, 1 + brightness)
+    else:
+        brightness_factor = None
+    if contrast > 0:
+        contrast_factor = random.uniform(1, 1 + contrast)
+    else:
+        contrast_factor = None
+    if saturation > 0:
+        saturation_factor = random.uniform(1, 1 + saturation)
+    else:
+        saturation_factor = None
+    if hue is not None and hue > 0:
+        hue_factor = random.uniform(0, hue)
+    else:
+        hue_factor = None
+    vid_transforms = []
+    if brightness_factor is not None:
+        vid_transforms.append(lambda img: TF.adjust_brightness(img, brightness_factor))
+    if saturation_factor is not None:
+        vid_transforms.append(lambda img: TF.adjust_saturation(img, saturation_factor))
+    # if hue_factor is not None:
+    #     vid_transforms.append(lambda img: TF.adjust_hue(img, hue_factor))
+    if contrast_factor is not None:
+        vid_transforms.append(lambda img: TF.adjust_contrast(img, contrast_factor))
+    random.shuffle(vid_transforms)
+    for transform in vid_transforms:
+        vid = transform(vid)
+    return vid
+
+
+@tensorclass
+class BatchedVideoDatapoint:
+    """
+    This class represents a batch of videos with associated annotations and metadata.
+    Attributes:
+        img_batch: A [TxBxCxHxW] tensor containing the image data for each frame in the batch, where T is the number of frames per video and B is the number of videos in the batch.
+        masks: A [TxOxHxW] tensor containing binary masks for each object in the batch, where O is the number of objects in the batch.
+        flat_obj_to_img_idx: A [TxOx2] tensor containing the img_batch index that each object belongs to.
+        features_3d: An optional tensor of per-frame 3D features, laid out like img_batch along the first two dimensions (or None).
+    """
+
+    img_batch: torch.FloatTensor
+    masks: torch.BoolTensor
+    flat_obj_to_img_idx: torch.IntTensor
+    features_3d: torch.FloatTensor = None
+
+    def pin_memory(self, device=None):
+        return self.apply(torch.Tensor.pin_memory, device=device)
+
+    @property
+    def num_frames(self) -> int:
+        """
+        Returns the number of frames per video.
+        """
+        return self.img_batch.shape[0]
+
+    @property
+    def num_videos(self) -> int:
+        """
+        Returns the number of videos in the batch.
+        """
+        return self.img_batch.shape[1]
+
+    @property
+    def flat_img_batch(self) -> torch.FloatTensor:
+        """
+        Returns a flattened img_batch tensor of shape [(B*T)xCxHxW]
+        """
+        return self.img_batch.transpose(0, 1).flatten(0, 1)
+
+    @property
+    def flat_features_3d(self) -> torch.FloatTensor:
+        """
+        Returns a flattened features_3d tensor of shape [(B*T)xCxHxW]
+        """
+        return self.features_3d.transpose(0, 1).flatten(0, 1)
+
+
+def sigmoid_focal_loss(
+    inputs,
+    targets,
+    alpha: float = 0.5,
+    gamma: float = 2,
+):
+    """
+    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs.
Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + focal loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction = "none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss + + +def positional_encoding(positions, freqs, dim = 1): + """ + Applies positional encoding along a specified dimension, expanding the + dimension size based on the number of frequency bands. + + Args: + positions (torch.Tensor): Input tensor representing positions (e.g., shape (1, 3, 256, 256)). + freqs (int): Number of frequency bands for encoding. + dim (int): Dimension along which to apply encoding. Default is 1. + + Returns: + torch.Tensor: Tensor with positional encoding applied along the specified dimension. + """ + # Ensure that the specified dimension is valid + assert dim >= 0 and dim < positions.ndim, "Invalid dimension specified." + # Generate frequency bands + freq_bands = (2 ** torch.arange(freqs, dtype=positions.dtype, device=positions.device)) + # Apply frequency bands to positions at the specified dimension + expanded_positions = positions.unsqueeze(dim + 1) * freq_bands.view(-1, *([1] * (positions.ndim - dim - 1))) + + # Reshape to combine the new frequency dimension with the specified dim + encoded_positions = expanded_positions.reshape( + *positions.shape[:dim], -1, *positions.shape[dim + 1:] + ) + # Concatenate sine and cosine encodings + positional_encoded = torch.cat([torch.sin(encoded_positions), torch.cos(encoded_positions), positions], dim = dim) + + return positional_encoded + + +@torch.autocast("cuda", dtype=torch.float32) +def postprocess_must3r_output(pointmaps, pointmaps_activation = ActivationType.NORM_EXP, compute_cam = True): + out = {} + channels = pointmaps.shape[-1] + out['pts3d'] = pointmaps[..., :3] + out['pts3d'] = apply_activation(out['pts3d'], activation = pointmaps_activation) + if channels >= 6: + out['pts3d_local'] = pointmaps[..., 3:6] + out['pts3d_local'] = apply_activation(out['pts3d_local'], activation = pointmaps_activation) + if channels == 4 or channels == 7: + out['conf'] = 1.0 + pointmaps[..., -1].exp() + + if compute_cam: + batch_dims = out['pts3d'].shape[:-3] + num_batch_dims = len(batch_dims) + H, W = out['conf'].shape[-2:] + pp = torch.tensor((W / 2, H / 2), device = out['pts3d'].device) + focal = estimate_focal_knowing_depth(out['pts3d_local'].reshape(math.prod(batch_dims), H, W, 3), pp, + focal_mode='weiszfeld') + out['focal'] = focal.reshape(*batch_dims) + + R, T = roma.rigid_points_registration( + out['pts3d_local'].reshape(*batch_dims, -1, 3), + out['pts3d'].reshape(*batch_dims, -1, 3), + weights = out['conf'].reshape(*batch_dims, -1) - 1.0, compute_scaling = False) + + c2w = torch.eye(4, device=out['pts3d'].device) + c2w = c2w.view(*([1] * num_batch_dims), 4, 4).repeat(*batch_dims, 1, 1) + c2w[..., :3, :3] = R + c2w[..., :3, 3] = T.view(*batch_dims, 3) + out['c2w'] = c2w + + # pixel grid + ys, xs = torch.meshgrid( + torch.arange(H, device = out['pts3d'].device), + torch.arange(W, device = out['pts3d'].device), + indexing = 'ij' + ) + # 
broadcast to batch + f = out['focal'].reshape(*batch_dims, 1, 1) # assume fx = fy = focal + x = (xs - pp[0]) / f + y = (ys - pp[1]) / f + + # directions in camera frame + d_cam = torch.stack([x, y, torch.ones_like(x)], dim=-1) + d_cam = F.normalize(d_cam, dim=-1) + + # rotate to world frame + d_world = torch.einsum('...ij,...hwj->...hwi', R, d_cam) + + # camera center in world frame + o_world = c2w[..., :3, 3].view(*batch_dims, 1, 1, 3).expand(*batch_dims, H, W, 3) + + # Plücker coordinates: (m, d) with m = o × d + m_world = torch.cross(o_world, d_world, dim = -1) + plucker = torch.cat([m_world, d_world], dim = -1) # shape: (*batch, H, W, 6) + + out['ray_origin'] = o_world + out['ray_dir'] = d_world + out['ray_plucker'] = plucker + + return out + + +def to_device(x, device = 'cuda'): + if isinstance(x, torch.Tensor): + return x.to(device) + elif isinstance(x, dict): + return {k: to_device(v, device) for k, v in x.items()} + elif isinstance(x, list): + return [to_device(v, device) for v in x] + elif isinstance(x, tuple): + return tuple(to_device(v, device) for v in x) + elif isinstance(x, int) or isinstance(x, float) or isinstance(x, str) or x is None: + return x + else: + raise ValueError(f'Unsupported type {type(x)}') \ No newline at end of file
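+
+
+# Illustrative usage sketch of the post-processing above (shapes are assumptions: a MUSt3R
+# pointmap tensor of shape [B, H, W, 7] = global xyz + local xyz + confidence):
+#   out = postprocess_must3r_output(pointmaps)      # pts3d, pts3d_local, conf, ...
+#   c2w, focal = out['c2w'], out['focal']           # per-view camera-to-world pose and focal
+#   rays = out['ray_plucker']                       # [B, H, W, 6] Plücker ray coordinates
+#   batch = to_device({'pose': c2w, 'rays': rays}, device='cuda')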