qizhangslam committed 30 days ago

Commit

ae2def3

verified ·

1 Parent(s): 69db92d

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

outdoor_v48_16gpu_v2/.hydra/config.yaml +68 -0
outdoor_v48_16gpu_v2/.hydra/hydra.yaml +156 -0
outdoor_v48_16gpu_v2/.hydra/overrides.yaml +2 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/base/__init__.py +0 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/base/base_multiview_dataset.py +576 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/base/batched_sampler.py +93 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/base/easy_dataset.py +212 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/dynamic_replica.py +137 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/habitat_hm3d.py +174 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/hoi4d.py +84 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/mapfree.py +282 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/mvs_synth.py +144 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/omniobject3d.py +146 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/pointodyssey.py +178 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/realestate10k.py +139 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/scannet.py +149 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/scannetpp.py +211 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/smartportraits.py +85 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/threedkb.py +111 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/unreal4k.py +159 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/utils/__init__.py +2 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/utils/corr.py +129 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/utils/cropping.py +147 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/utils/transforms.py +80 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/waymo.py +178 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/wildrgbd.py +56 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/__init__.py +1 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/camera.py +463 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/device.py +88 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/geometry.py +554 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/image.py +271 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/misc.py +127 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/parallel.py +87 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/path_to_croco.py +47 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/render.py +75 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/__init__.py +6 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/hub/__init__.py +4 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/hub/backbones.py +156 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/hub/utils.py +39 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/__init__.py +11 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/attention.py +89 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/block.py +259 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/dino_head.py +58 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/drop_path.py +34 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/layer_scale.py +27 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/mlp.py +40 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/patch_embed.py +88 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/swiglu_ffn.py +72 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/models/__init__.py +43 -0
outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/models/vision_transformer.py +404 -0

outdoor_v48_16gpu_v2/.hydra/config.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+teacher: /gpfs/work2/0/prjs0824/qi_proj/ckpt/checkpoint-10.pth.model
+pretrained: /gpfs/work2/0/prjs0824/qi_proj/ckpt/checkpoint-10.pth.model
+load_only_encoder: false
+long_context: false
+fixed_length: true
+resume: /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_4gpu_v2/checkpoint-last.pth
+benchmark: false
+num_views: 64
+num_test_views: 4
+n_corres_train: 0
+n_corres_test: 0
+train_criterion: DistillLoss()
+test_criterion: DistillLoss()
+allow_repeat: false
+root_vkitti2: /scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti
+root_kitti: /scratch-shared/wwei2/eval/kitti_odometry/dataset
+root_kitti_velo: /gpfs/work2/0/prjs0824/semantickitti/dataset
+root_kitti360: /scratch-shared/wwei2/downloads/kitti360/KITTI-360
+root_kitti360_velo: /scratch-shared/wwei2/downloads/kitti360/KITTI-360
+root_waymo: /scratch-shared/wwei2/waymo_v2
+root_waymo_lidar: /scratch-shared/wwei2/waymo_v2
+dataset_vkitti2: VirtualKITTI2_Multi(allow_repeat=${allow_repeat}, split='train',
+  ROOT="${root_vkitti2}", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294),
+  (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=${num_views},
+  n_corres=${n_corres_train})
+dataset_kitti360: KITTI360_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_kitti360}",
+  velodyne_root="${root_kitti360_velo}", aug_crop=16, resolution=[(518, 392), (518,
+  336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter,
+  num_views=${num_views}, n_corres=${n_corres_train})
+dataset_waymo: Waymo_v2_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_waymo}",
+  lidar_root="${root_waymo_lidar}", aug_crop=16, resolution=[(518, 392), (518, 336),
+  (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=${num_views},
+  n_corres=${n_corres_train})
+train_dataset: 6000 @ ${dataset_vkitti2} + 6000 @ ${dataset_kitti360} + 5400 @ ${dataset_waymo}
+test_dataset: 200 @ VirtualKITTI2_Multi(split='train', ROOT="${root_vkitti2}", resolution=(518,
+  154), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
+seed: 0
+batch_size: 1
+accum_iter: 1
+gradient_checkpointing: false
+epochs: 10
+start_epoch: 0
+start_step: 0
+weight_decay: 0.05
+lr: 1.0e-05
+min_lr: 1.0e-08
+warmup_epochs: 0.5
+amp: 1
+num_workers: 4
+world_size: 1
+local-rank: -1
+dist_url: env://
+rank: 0
+gpu: 0
+distributed: false
+dist_backend: nccl
+eval_freq: 1
+save_freq: 0.1
+max_checkpoints: 10
+keep_freq: 1
+print_freq: 10
+print_img_freq: 50000000
+num_imgs_vis: 4
+save_dir: /scratch-shared/wwei2/training_upstream/checkpoints
+exp_name: outdoor_v48_16gpu_v2
+task: StreamVGGT
+logdir: ${save_dir}/${exp_name}/logs
+output_dir: ${save_dir}/${exp_name}/

outdoor_v48_16gpu_v2/.hydra/hydra.yaml ADDED Viewed

	@@ -0,0 +1,156 @@

+hydra:
+  run:
+    dir: ${save_dir}/${exp_name}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+      Use --hydra-help to view Hydra specific help
+      '
+    template: '${hydra.help.header}
+      == Configuration groups ==
+      Compose your configuration from those groups (group=option)
+      $APP_CONFIG_GROUPS
+      == Config ==
+      Override anything in the config (foo.bar=value)
+      $CONFIG
+      ${hydra.help.footer}
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+      See https://hydra.cc for more info.
+      == Flags ==
+      $FLAGS_HELP
+      == Configuration groups ==
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+      $HYDRA_CONFIG_GROUPS
+      Use ''--cfg hydra'' to Show the Hydra config.
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task:
+    - exp_name=outdoor_v48_16gpu_v2
+    - resume=/scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_4gpu_v2/checkpoint-last.pth
+  job:
+    name: mytrain
+    chdir: null
+    override_dirname: exp_name=outdoor_v48_16gpu_v2,resume=/scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_4gpu_v2/checkpoint-last.pth
+    id: ???
+    num: ???
+    config_name: outdoor_v48
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.3'
+    cwd: /gpfs/work2/0/prjs0824/qi_proj/slamformer_upstream/src
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /gpfs/work2/0/prjs0824/qi_proj/slamformer_upstream/config
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_16gpu_v2
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: true

outdoor_v48_16gpu_v2/.hydra/overrides.yaml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ - exp_name=outdoor_v48_16gpu_v2
2	+ - resume=/scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_4gpu_v2/checkpoint-last.pth

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/base/__init__.py ADDED Viewed

File without changes

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/base/base_multiview_dataset.py ADDED Viewed

	@@ -0,0 +1,576 @@

+import PIL
+import numpy as np
+import torch
+import random
+import itertools
+from dust3r.datasets.base.easy_dataset import EasyDataset
+from dust3r.datasets.utils.transforms import ImgNorm, SeqColorJitter
+from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates
+import dust3r.datasets.utils.cropping as cropping
+from dust3r.datasets.utils.corr import extract_correspondences_from_pts3d
+from vggt.train_utils.augmentation import get_image_augmentation
def get_ray_map(c2w1, c2w2, intrinsics, h, w):
    """Build an (h, w, 6) ray map for view 2 expressed in the frame of view 1.

    The relative pose is ``inv(c2w1) @ c2w2``. For every pixel ``(x, y)`` the
    homogeneous pixel ``(x, y, 1)`` is back-projected with ``inv(intrinsics)``,
    transformed by the relative pose and normalised to unit length.

    Args:
        c2w1: 4x4 pose of the reference view.
        c2w2: 4x4 pose of the current view.
        intrinsics: 3x3 camera matrix of the current view.
        h: image height in pixels.
        w: image width in pixels.

    Returns:
        Array of shape (h, w, 6): channels 0-2 hold the relative translation
        (identical for all pixels), channels 3-5 the normalised transformed
        back-projected point.

    NOTE(review): the back-projected vectors are transformed with a
    homogeneous coordinate of 1, so the relative translation is added before
    normalisation. This matches the existing behaviour and is kept as is.
    """
    rel_pose = np.linalg.inv(c2w1) @ c2w2

    # Pixel grid in homogeneous image coordinates, flattened to (h*w, 3).
    xs, ys = np.meshgrid(np.arange(w), np.arange(h), indexing="xy")
    pixels_h = np.stack([xs, ys, np.ones_like(xs)], axis=-1).reshape(-1, 3)

    # Back-project to the camera frame, then lift to 4-vectors (last row = 1).
    cam_pts = np.linalg.inv(intrinsics) @ pixels_h.T
    cam_pts_h = np.vstack([cam_pts, np.ones_like(cam_pts[0])])

    directions = (rel_pose @ cam_pts_h).T[:, :3].reshape(h, w, 3)
    directions = directions / np.linalg.norm(directions, axis=-1, keepdims=True)

    origins = np.broadcast_to(rel_pose[:3, 3], (h, w, 3))
    return np.concatenate([origins, directions], axis=-1)
class BaseMultiViewDataset(EasyDataset):
    """Base class for multi-view datasets; defines all basic options.

    Subclasses implement ``_get_views`` and are expected to provide
    ``self.scenes`` (``__len__`` returns ``len(self.scenes)``).

    Usage:
        class MyDataset (BaseMultiViewDataset):
            def _get_views(self, idx, resolution, rng, num_views):
                # overload here
                views = []
                views.append(dict(img=, ...))
                return views
    """

    def __init__(
        self,
        *,  # only keyword arguments
        num_views=None,
        split=None,
        resolution=None,  # square_size or (width, height) or list of [(width,height), ...]
        transform=ImgNorm,
        aug_crop=False,
        n_corres=0,
        nneg=0,
        seed=None,
        allow_repeat=False,
        seq_aug_crop=False,
    ):
        """Store the sampling / augmentation options.

        Args:
            num_views: maximum number of views per sample (required).
            split: dataset split identifier, stored as given.
            resolution: int (square), (width, height), or a list of those.
            transform: image transform, or a string naming one. Passing the
                ``SeqColorJitter`` class itself makes ``__getitem__`` create a
                fresh ``SeqColorJitter()`` for every sample.
            aug_crop: when > 1, upper bound (exclusive) of the random number
                of pixels added to the target resolution before cropping.
            n_corres: 'all', an int, or a list with one entry per view.
            nneg: forwarded to the correspondence extractor; must be 0 when
                ``n_corres == 'all'``.
            seed: when truthy, the RNG is re-seeded with ``seed + idx`` on
                every ``__getitem__`` call.
            allow_repeat: allow repeated frames in timestamp-based sampling.
            seq_aug_crop: draw the ``aug_crop`` offset once per sample
                instead of once per crop.
        """
        assert num_views is not None, "undefined num_views"
        self.num_views = num_views
        self.split = split
        self._set_resolutions(resolution)

        self.n_corres = n_corres
        self.nneg = nneg
        assert (
            self.n_corres == "all"
            or isinstance(self.n_corres, int)
            or (
                isinstance(self.n_corres, list) and len(self.n_corres) == self.num_views
            )
        ), f"Error, n_corres should either be 'all', a single integer or a list of length {self.num_views}"
        assert (
            self.nneg == 0 or self.n_corres != "all"
        ), "nneg should be 0 if n_corres is all"

        self.is_seq_color_jitter = False
        if isinstance(transform, str):
            # NOTE(review): eval() on a config-supplied string executes
            # arbitrary code; only safe while configs are trusted.
            transform = eval(transform)
        if transform == SeqColorJitter:
            transform = SeqColorJitter()
            self.is_seq_color_jitter = True
        self.transform = transform

        # Extra augmentation pipeline; only referenced by the disabled
        # (`if False:`) branch in __getitem__.
        self.image_aug = get_image_augmentation(
            color_jitter={
                "brightness": 0.5,
                "contrast": 0.5,
                "saturation": 0.5,
                "hue": 0.1,
                "p": 0.9,
            },  # common_config.augs.color_jitter
            gray_scale=True,  # common_config.augs.gray_scale
            gau_blur=False,  # common_config.augs.gau_blur
        )

        self.aug_crop = aug_crop
        self.seed = seed
        self.allow_repeat = allow_repeat
        self.seq_aug_crop = seq_aug_crop

    def __len__(self):
        """Return the number of scenes (``self.scenes`` is set by subclasses)."""
        return len(self.scenes)

    @staticmethod
    def efficient_random_intervals(
        start,
        num_elements,
        interval_range,
        fixed_interval_prob=0.8,
        weights=None,
        seed=42,
    ):
        """Return ``num_elements`` cumulative positions starting at ``start``.

        With probability ``fixed_interval_prob`` a single interval is drawn
        from ``interval_range`` and repeated; otherwise each of the
        ``num_elements - 1`` intervals is drawn independently.

        Uses the global ``random`` module. ``seed`` is accepted for interface
        compatibility but is not used.
        """
        if random.random() < fixed_interval_prob:
            intervals = random.choices(interval_range, weights=weights) * (
                num_elements - 1
            )
        else:
            intervals = [
                random.choices(interval_range, weights=weights)[0]
                for _ in range(num_elements - 1)
            ]
        return list(itertools.accumulate([start] + intervals))

    def sample_based_on_timestamps(self, i, timestamps, num_views, interval=1):
        """Enumerate groups of ``num_views`` frame ids temporally close to frame ``i``.

        Candidates are all indices whose timestamp differs from
        ``timestamps[i]`` by less than ``interval``. Uses the global
        ``np.random`` state.

        Returns:
            A list of sorted id lists (possibly empty).
        """
        time_diffs = np.abs(timestamps - timestamps[i])
        ids_candidate = np.where(time_diffs < interval)[0]
        ids_candidate = np.sort(ids_candidate)
        # NOTE(review): since num_views // 3 <= num_views, this condition is
        # equivalent to `len(ids_candidate) < num_views`; the allow_repeat
        # clause currently has no effect. Kept as is - confirm intent.
        if (self.allow_repeat and len(ids_candidate) < num_views // 3) or (
            len(ids_candidate) < num_views
        ):
            return []

        ids_sel_list = []
        ids_candidate_left = ids_candidate.copy()
        # Partition the candidates into disjoint random groups.
        while len(ids_candidate_left) >= num_views:
            ids_sel = np.random.choice(ids_candidate_left, num_views, replace=False)
            ids_sel_list.append(sorted(ids_sel))
            ids_candidate_left = np.setdiff1d(ids_candidate_left, ids_sel)

        # Complete the leftover ids with already-used candidates.
        if len(ids_candidate_left) > 0 and len(ids_candidate) >= num_views:
            ids_sel = np.concatenate(
                [
                    ids_candidate_left,
                    np.random.choice(
                        np.setdiff1d(ids_candidate, ids_candidate_left),
                        num_views - len(ids_candidate_left),
                        replace=False,
                    ),
                ]
            )
            ids_sel_list.append(sorted(ids_sel))

        if self.allow_repeat:
            ids_sel_list.append(
                sorted(np.random.choice(ids_candidate, num_views, replace=True))
            )

        # add sequences with fixed intervals (all possible intervals)
        pos_i = np.where(ids_candidate == i)[0][0]
        curr_interval = 1
        stop = len(ids_candidate) < num_views
        while not stop:
            pos_sel = [pos_i]
            count = 0
            # Grow the window alternately to the right and to the left.
            while len(pos_sel) < num_views:
                if count % 2 == 0:
                    curr_pos_i = pos_sel[-1] + curr_interval
                    if curr_pos_i >= len(ids_candidate):
                        stop = True
                        break
                    pos_sel.append(curr_pos_i)
                else:
                    curr_pos_i = pos_sel[0] - curr_interval
                    if curr_pos_i < 0:
                        stop = True
                        break
                    pos_sel.insert(0, curr_pos_i)
                count += 1
            if not stop and len(pos_sel) == num_views:
                ids_sel = sorted([ids_candidate[pos] for pos in pos_sel])
                if ids_sel not in ids_sel_list:
                    ids_sel_list.append(ids_sel)
            curr_interval += 1
        return ids_sel_list

    @staticmethod
    def blockwise_shuffle(x, rng, block_shuffle):
        """Shuffle ``x`` globally, or within consecutive blocks of ``block_shuffle`` items.

        Returns a new list; ``x`` is not modified.
        """
        if block_shuffle is None:
            return rng.permutation(x).tolist()
        else:
            assert block_shuffle > 0
            blocks = [x[i : i + block_shuffle] for i in range(0, len(x), block_shuffle)]
            shuffled_blocks = [rng.permutation(block).tolist() for block in blocks]
            shuffled_list = [item for block in shuffled_blocks for item in block]
            return shuffled_list

    def get_seq_from_start_id(
        self,
        num_views,
        id_ref,
        ids_all,
        rng,
        min_interval=1,
        max_interval=25,
        video_prob=0.5,
        fix_interval_prob=0.5,
        block_shuffle=None,
    ):
        """
        args:
            num_views: number of views to return
            id_ref: the reference id (first id)
            ids_all: all the ids
            rng: random number generator
            min_interval: minimum interval between two views (> 0)
            max_interval: maximum interval between two views
            video_prob: probability of producing an ordered (video) sequence
            fix_interval_prob: probability of using one fixed interval
            block_shuffle: block size forwarded to ``blockwise_shuffle``
        returns:
            pos: list of positions of the views in ids_all, i.e., index for ids_all
            is_video: True if the views are consecutive
        """
        assert min_interval > 0, f"min_interval should be > 0, got {min_interval}"
        assert (
            min_interval <= max_interval
        ), f"min_interval should be <= max_interval, got {min_interval} and {max_interval}"
        assert id_ref in ids_all
        pos_ref = ids_all.index(id_ref)

        if num_views == 1:
            # A single view is just the reference. Without this guard the
            # interval computation below divides by (num_views - 1) == 0.
            return [pos_ref], True

        all_possible_pos = np.arange(pos_ref, len(ids_all))
        remaining_sum = len(ids_all) - 1 - pos_ref

        if remaining_sum >= num_views - 1:
            if remaining_sum == num_views - 1:
                # Exactly enough frames left: take them all, in order.
                assert ids_all[-num_views] == id_ref
                return [pos_ref + i for i in range(num_views)], True
            max_interval = min(max_interval, 2 * remaining_sum // (num_views - 1))
            intervals = [
                rng.choice(range(min_interval, max_interval + 1))
                for _ in range(num_views - 1)
            ]

            # if video or collection
            if rng.random() < video_prob:
                # if fixed interval or random
                if rng.random() < fix_interval_prob:
                    # regular interval
                    fixed_interval = rng.choice(
                        range(
                            1,
                            min(remaining_sum // (num_views - 1) + 1, max_interval + 1),
                        )
                    )
                    intervals = [fixed_interval for _ in range(num_views - 1)]
                is_video = True
            else:
                is_video = False

            pos = list(itertools.accumulate([pos_ref] + intervals))
            # Drop positions past the end and refill from unused positions.
            pos = [p for p in pos if p < len(ids_all)]
            pos_candidates = [p for p in all_possible_pos if p not in pos]
            pos = (
                pos
                + rng.choice(
                    pos_candidates, num_views - len(pos), replace=False
                ).tolist()
            )

            pos = (
                sorted(pos)
                if is_video
                else self.blockwise_shuffle(pos, rng, block_shuffle)
            )
        # elif remaining_sum>1:
        else:
            # Not enough frames after the reference: frames are revisited.
            # assert self.allow_repeat
            # NOTE(review): uniq_num == 1 divides by zero below and
            # uniq_num == 0 can fail later on; behaviour is left unchanged
            # because the intended handling is unclear.
            uniq_num = remaining_sum
            new_pos_ref = rng.choice(np.arange(pos_ref + 1))
            new_remaining_sum = len(ids_all) - 1 - new_pos_ref
            new_max_interval = min(max_interval, new_remaining_sum // (uniq_num - 1))
            new_intervals = [
                rng.choice(range(1, new_max_interval + 1)) for _ in range(uniq_num - 1)
            ]

            revisit_random = rng.random()
            video_random = rng.random()

            if rng.random() < fix_interval_prob and video_random < video_prob:
                # regular interval
                fixed_interval = rng.choice(range(1, new_max_interval + 1))
                new_intervals = [fixed_interval for _ in range(uniq_num - 1)]
            pos = list(itertools.accumulate([new_pos_ref] + new_intervals))

            is_video = False
            if revisit_random < 0.5 or video_prob == 1.0:  # revisit, video / collection
                is_video = video_random < video_prob
                pos = (
                    self.blockwise_shuffle(pos, rng, block_shuffle)
                    if not is_video
                    else pos
                )
                num_full_repeat = num_views // uniq_num
                pos = (
                    pos * num_full_repeat
                    + pos[: num_views - len(pos) * num_full_repeat]
                )
            elif revisit_random < 0.9:  # random
                pos = rng.choice(pos, num_views, replace=True)
            else:  # ordered
                pos = sorted(rng.choice(pos, num_views, replace=True))
        assert len(pos) == num_views
        return pos, is_video

    def get_img_and_ray_masks(self, is_metric, v, rng, p=(0.8, 0.15, 0.05)):
        """Draw the (img_mask, raymap_mask) pair for view index ``v``.

        The first view and every view of a non-metric sample always get
        ``(True, False)``. Otherwise one uniform draw selects
        ``(True, False)`` below ``p[0]``, ``(False, True)`` below
        ``p[0] + p[1]``, and ``(True, True)`` for the rest.
        """
        # generate img mask and raymap mask
        if v == 0 or (not is_metric):
            img_mask = True
            raymap_mask = False
        else:
            rand_val = rng.random()
            if rand_val < p[0]:
                img_mask = True
                raymap_mask = False
            elif rand_val < p[0] + p[1]:
                img_mask = False
                raymap_mask = True
            else:
                img_mask = True
                raymap_mask = True
        return img_mask, raymap_mask

    def get_stats(self):
        """Return a short human-readable size summary."""
        return f"{len(self)} groups of views"

    def __repr__(self):
        resolutions_str = "[" + ";".join(f"{w}x{h}" for w, h in self._resolutions) + "]"
        # The literal below is flattened by stripping newlines and runs of
        # three spaces, so its 12-space continuation indent must be kept.
        return (
            f"""{type(self).__name__}({self.get_stats()},
            {self.num_views=},
            {self.split=},
            {self.seed=},
            resolutions={resolutions_str},
            {self.transform=})""".replace(
                "self.", ""
            )
            .replace("\n", "")
            .replace("   ", "")
        )

    def _get_views(self, idx, resolution, rng, num_views):
        """Return ``num_views`` view dicts for sample ``idx``; implemented by subclasses."""
        raise NotImplementedError()

    def __getitem__(self, idx):
        """Load one multi-view sample and derive per-view geometry.

        Args:
            idx: either a plain index (only valid when a single resolution is
                configured) or a triple ``(idx, ar_idx, nview)`` selecting the
                resolution and the number of views.

        Returns:
            The list of view dicts from ``_get_views``, augmented with
            ``idx``, ``true_shape``, transformed ``img``, ``sky_mask``,
            ``ray_map``, ``pts3d``, ``pts3d_local``, ``valid_mask``, ``rng``
            and (when ``n_corres > 0``) ``corres`` / ``valid_corres``.
        """
        # print("Receiving:" , idx)
        if isinstance(idx, (tuple, list, np.ndarray)):
            # the idx is specifying the aspect-ratio
            idx, ar_idx, nview = idx
        else:
            assert len(self._resolutions) == 1
            ar_idx = 0
            nview = self.num_views

        assert nview >= 1 and nview <= self.num_views
        # set-up the rng
        # NOTE(review): a seed of 0 is falsy and therefore treated like
        # "no seed"; kept as is - confirm whether that is intended.
        if self.seed:  # reseed for each __getitem__
            self._rng = np.random.default_rng(seed=self.seed + idx)
        elif not hasattr(self, "_rng"):
            seed = torch.randint(0, 2**32, (1,)).item()
            self._rng = np.random.default_rng(seed=seed)

        if self.aug_crop > 1 and self.seq_aug_crop:
            # One crop offset shared by every view of this sample.
            self.delta_target_resolution = self._rng.integers(0, self.aug_crop)

        # over-loaded code
        resolution = self._resolutions[
            ar_idx
        ]  # DO NOT CHANGE THIS (compatible with BatchedRandomSampler)
        views = self._get_views(idx, resolution, self._rng, nview)
        assert len(views) == nview

        if "camera_pose" not in views[0]:
            # Fall back to the identity pose. The previous all-ones matrix is
            # singular, so np.linalg.inv() in get_ray_map raised LinAlgError.
            views[0]["camera_pose"] = np.eye(4, dtype=np.float32)
        first_view_camera_pose = views[0]["camera_pose"]
        # A fresh SeqColorJitter per sample; it is applied to every view below.
        transform = SeqColorJitter() if self.is_seq_color_jitter else self.transform

        for v, view in enumerate(views):
            assert (
                "pts3d" not in view
            ), f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}"
            view["idx"] = (idx, ar_idx, v)

            # encode the image
            width, height = view["img"].size
            view["true_shape"] = np.int32((height, width))
            view["img"] = transform(view["img"])
            # Negative depth marks sky pixels.
            view["sky_mask"] = view["depthmap"] < 0

            assert "camera_intrinsics" in view
            if "camera_pose" not in view:
                view["camera_pose"] = np.full((4, 4), np.nan, dtype=np.float32)
            else:
                assert np.isfinite(
                    view["camera_pose"]
                ).all(), f"NaN in camera pose for view {view_name(view)}"

            ray_map = get_ray_map(
                first_view_camera_pose,
                view["camera_pose"],
                view["camera_intrinsics"],
                height,
                width,
            )
            view["ray_map"] = ray_map.astype(np.float32)

            assert "pts3d" not in view
            assert "valid_mask" not in view
            assert np.isfinite(
                view["depthmap"]
            ).all(), f"NaN in depthmap for view {view_name(view)}"
            pts3d, pts3d_local, valid_mask = depthmap_to_absolute_camera_coordinates(**view)

            view["pts3d"] = pts3d
            view["pts3d_local"] = pts3d_local
            view["valid_mask"] = valid_mask & np.isfinite(pts3d).all(axis=-1)

            # check all datatypes
            for key, val in view.items():
                res, err_msg = is_good_type(key, val)
                assert res, f"{err_msg} with {key}={val} for view {view_name(view)}"

        # Disabled co-jitter augmentation, kept for reference.
        if False:
            if random.random() > 0.3:  # self.cojitter_ratio
                images = torch.stack([view['img'] for view in views], axis=0)
                images = self.image_aug(images)
                for v, view in enumerate(views):
                    view['img'] = images[v]
            else:
                for view in views:
                    view['img'] = self.image_aug(view['img'][None])[0]

        # NOTE(review): this comparison raises TypeError when n_corres is
        # 'all' or a list, although __init__ accepts both - confirm how those
        # values should be handled.
        if self.n_corres > 0:
            ref_view = views[0]
            for view in views:
                corres1, corres2, valid = extract_correspondences_from_pts3d(
                    ref_view, view, self.n_corres, self._rng, nneg=self.nneg
                )
                view["corres"] = (corres1, corres2)
                view["valid_corres"] = valid

        # last thing done!
        for view in views:
            view["rng"] = int.from_bytes(self._rng.bytes(4), "big")
        return views

    def _set_resolutions(self, resolutions):
        """Normalise ``resolutions`` into ``self._resolutions``, a list of (width, height) ints."""
        assert resolutions is not None, "undefined resolution"

        if not isinstance(resolutions, list):
            resolutions = [resolutions]

        self._resolutions = []
        for resolution in resolutions:
            if isinstance(resolution, int):
                width = height = resolution
            else:
                width, height = resolution
            assert isinstance(
                width, int
            ), f"Bad type for {width=} {type(width)=}, should be int"
            assert isinstance(
                height, int
            ), f"Bad type for {height=} {type(height)=}, should be int"
            self._resolutions.append((width, height))

    def _crop_resize_if_necessary(
        self, image, depthmap, intrinsics, resolution, rng=None, info=None
    ):
        """Crop around the principal point, rescale, then crop to ``resolution``.

        Steps:
        - crop the largest window centred on the principal point;
        - rescale image, depthmap and intrinsics to the target resolution
          (enlarged by a random offset when ``self.aug_crop > 1``);
        - crop to exactly ``resolution``.

        Args:
            image: PIL image or array (converted with ``PIL.Image.fromarray``).
            depthmap: depth map matching ``image``.
            intrinsics: 3x3 camera matrix.
            resolution: target (width, height).
            rng: random generator; required when ``self.aug_crop > 1`` and
                ``self.seq_aug_crop`` is false.
            info: free-form description used in assertion messages.

        Returns:
            ``(image, depthmap, intrinsics)`` after cropping and rescaling.
        """
        if not isinstance(image, PIL.Image.Image):
            image = PIL.Image.fromarray(image)

        # downscale with lanczos interpolation so that image.size == resolution
        # cropping centered on the principal point
        W, H = image.size
        cx, cy = intrinsics[:2, 2].round().astype(int)
        min_margin_x = min(cx, W - cx)
        min_margin_y = min(cy, H - cy)
        assert min_margin_x > W / 5, f"Bad principal point in view={info}"
        assert min_margin_y > H / 5, f"Bad principal point in view={info}"
        # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy)
        l, t = cx - min_margin_x, cy - min_margin_y
        r, b = cx + min_margin_x, cy + min_margin_y
        crop_bbox = (l, t, r, b)
        image, depthmap, intrinsics = cropping.crop_image_depthmap(
            image, depthmap, intrinsics, crop_bbox
        )

        # transpose the resolution if necessary
        W, H = image.size  # new size

        # high-quality Lanczos down-scaling
        target_resolution = np.array(resolution)
        if self.aug_crop > 1:
            # Either a fresh offset per call, or the per-sample offset drawn
            # in __getitem__ when seq_aug_crop is enabled.
            target_resolution += (
                rng.integers(0, self.aug_crop)
                if not self.seq_aug_crop
                else self.delta_target_resolution
            )
        image, depthmap, intrinsics = cropping.rescale_image_depthmap(
            image, depthmap, intrinsics, target_resolution
        )

        # actual cropping (if necessary) with bilinear interpolation
        intrinsics2 = cropping.camera_matrix_of_crop(
            intrinsics, image.size, resolution, offset_factor=0.5
        )
        crop_bbox = cropping.bbox_from_intrinsics_in_out(
            intrinsics, intrinsics2, resolution
        )
        image, depthmap, intrinsics2 = cropping.crop_image_depthmap(
            image, depthmap, intrinsics, crop_bbox
        )

        return image, depthmap, intrinsics2
def is_good_type(key, v):
    """Check that a view entry has an accepted type.

    Strings, ints (including bools) and tuples are always accepted. Any other
    value must expose a ``dtype`` attribute that is one of float32 (NumPy or
    torch), bool, int32, int64 or uint8.

    Args:
        key: name of the entry (unused; kept for the caller's interface).
        v: value to check.

    Returns:
        ``(is_good, err_msg)`` where ``err_msg`` is ``None`` on success.
    """
    if isinstance(v, (str, int, tuple)):
        return True, None
    if not hasattr(v, "dtype"):
        # Plain floats, None, lists, dicts... used to raise AttributeError
        # here; report them through the normal (False, message) channel.
        return False, f"bad {type(v)=}"
    if v.dtype not in (np.float32, torch.float32, bool, np.int32, np.int64, np.uint8):
        return False, f"bad {v.dtype=}"
    return True, None
def view_name(view, batch_index=None):
    """Return the ``dataset/label/instance`` identifier of a view.

    When ``batch_index`` is neither ``None`` nor ``slice(None)``, each of the
    three fields is indexed with it first (for batched views).
    """
    take_whole = batch_index in (None, slice(None))
    parts = []
    for field in ("dataset", "label", "instance"):
        value = view[field]
        parts.append(value if take_whole else value[batch_index])
    db, label, instance = parts
    return f"{db}/{label}/{instance}"
def transpose_to_landscape(view):
    """Rectify a portrait view to landscape, in place.

    When ``true_shape`` (height, width) has ``width < height``, the spatial
    axes of ``img``, ``valid_mask``, ``depthmap``, ``pts3d``, ``ray_map`` and
    ``sky_mask`` are swapped, the first two rows of ``camera_intrinsics`` are
    exchanged, and the x/y columns of ``corres`` (when present) are swapped.
    Landscape views are left untouched. Returns ``None``.

    NOTE(review): ``true_shape`` and ``pts3d_local`` are not updated here;
    confirm whether callers rely on that.
    """
    height, width = view["true_shape"]

    if width < height:
        # rectify portrait to landscape
        assert view["img"].shape == (3, height, width)
        view["img"] = view["img"].swapaxes(1, 2)

        assert view["valid_mask"].shape == (height, width)
        view["valid_mask"] = view["valid_mask"].swapaxes(0, 1)

        assert view["depthmap"].shape == (height, width)
        view["depthmap"] = view["depthmap"].swapaxes(0, 1)

        assert view["pts3d"].shape == (height, width, 3)
        view["pts3d"] = view["pts3d"].swapaxes(0, 1)

        # transpose x and y pixels
        view["camera_intrinsics"] = view["camera_intrinsics"][[1, 0, 2]]

        assert view["ray_map"].shape == (height, width, 6)
        view["ray_map"] = view["ray_map"].swapaxes(0, 1)

        assert view["sky_mask"].shape == (height, width)
        view["sky_mask"] = view["sky_mask"].swapaxes(0, 1)

        if "corres" in view:
            # transpose correspondences x and y
            corres = view["corres"]
            swapped_0 = corres[0][:, [1, 0]]
            swapped_1 = corres[1][:, [1, 0]]
            if isinstance(corres, list):
                # Lists keep the original in-place update.
                corres[0] = swapped_0
                corres[1] = swapped_1
            else:
                # The dataset stores `corres` as a tuple, which does not
                # support item assignment, so the pair is rebuilt.
                view["corres"] = (swapped_0, swapped_1)

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/base/batched_sampler.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import numpy as np
+import torch
+from accelerate import Accelerator
+import torch.utils
+from torch.utils.data import BatchSampler, Sampler
+import torch.utils.data
+class CustomRandomSampler(Sampler):
+    """Random sampling under a constraint: each sample in the batch has the same feature,
+    which is chosen randomly from a known pool of 'features' for each batch.
+    For instance, the 'feature' could be the image aspect-ratio.
+    The index returned is a tuple (sample_idx, feat_idx).
+    This sampler ensures that each series of `batch_size` indices has the same `feat_idx`.
+    """
+    def __init__(
+        self,
+        dataset,
+        batch_size,
+        pool_size,
+        min_view_size,
+        max_view_size,
+        world_size,
+        warmup=1,
+        drop_last=True,
+    ):
+        self.batch_size = batch_size
+        self.pool_size = pool_size
+        self.min_view_size = min_view_size
+        self.max_view_size = max_view_size
+        self.drop_last = drop_last
+        self.len_dataset = N = len(dataset)
+        self.total_size = N
+        self.epoch = None
+        self.epochf = 0.0
+    def __len__(self):
+        return self.total_size
+    def set_epoch(self, epoch):
+        self.epoch = epoch
+    def __iter__(self):
+        if self.epoch is None:
+            raise ValueError(
+                "Epoch number not set. Please call 'set_epoch(epoch)' before iterating."
+            )
+        seed = self.epoch + 788
+        rng = np.random.default_rng(seed=seed)
+        # random indices (will restart from 0 if not drop_last)
+        sample_idxs = np.arange(self.total_size)
+        rng.shuffle(sample_idxs)
+        # random feat_idxs (same across each batch)
+        n_batches = (self.total_size + self.batch_size - 1) // self.batch_size
+        if self.pool_size > 1:
+            p = np.ones(self.pool_size)
+            p[: self.pool_size // 2] *= 2
+            p = p / p.sum()
+            _feat_idxs = rng.choice(self.pool_size, size=n_batches, p=p)
+        else:
+            _feat_idxs = rng.integers(self.pool_size, size=n_batches)
+        _feat_idxs = np.broadcast_to(_feat_idxs[:, None], (n_batches, self.batch_size))
+        _feat_idxs = _feat_idxs.ravel()[: self.total_size]
+        _view_idxs = rng.integers(
+            self.min_view_size, self.max_view_size + 1, size=n_batches
+        )
+        _view_idxs = np.broadcast_to(_view_idxs[:, None], (n_batches, self.batch_size))
+        _view_idxs = _view_idxs.ravel()[: self.total_size]
+        idxs = np.c_[sample_idxs, _feat_idxs, _view_idxs]
+        yield from (tuple(idx) for idx in idxs)
+class BatchedRandomSampler(BatchSampler):
+    """Batch sampler that groups indices from RandomSampler into batches."""
+    def __init__(self, sampler: CustomRandomSampler, batch_size, drop_last=True):
+        self.sampler = sampler  # An instance of RandomSampler
+        self.batch_size = batch_size
+        self.drop_last = drop_last
+    def set_epoch(self, epoch):
+        self.sampler.set_epoch(epoch)
+def round_by(total, multiple, up=False):
+    if up:
+        total = total + multiple - 1
+    return (total // multiple) * multiple

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/base/easy_dataset.py ADDED Viewed

	@@ -0,0 +1,212 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+import numpy as np
+from dust3r.datasets.base.batched_sampler import (
+    BatchedRandomSampler,
+    CustomRandomSampler,
+)
+import torch
+class EasyDataset:
+    """a dataset that you can easily resize and combine.
+    Examples:
+    ---------
+        2 * dataset ==> duplicate each element 2x
+        10 @ dataset ==> set the size to 10 (random sampling, duplicates if necessary)
+        dataset1 + dataset2 ==> concatenate datasets
+    """
+    def __add__(self, other):
+        return CatDataset([self, other])
+    def __rmul__(self, factor):
+        return MulDataset(factor, self)
+    def __rmatmul__(self, factor):
+        return ResizedDataset(factor, self)
+    def set_epoch(self, epoch):
+        pass  # nothing to do by default
+    def make_sampler(
+        self, batch_size, shuffle=True, drop_last=True, world_size=1, rank=0, fixed_length=False
+    ):
+        if not (shuffle):
+            raise NotImplementedError()  # cannot deal yet
+        num_of_aspect_ratios = len(self._resolutions)
+        num_of_views = self.num_views
+        sampler = CustomRandomSampler(
+            self,
+            batch_size,
+            num_of_aspect_ratios,
+            4 if not fixed_length else num_of_views,
+            num_of_views,
+            world_size,
+            warmup=1,
+            drop_last=drop_last,
+        )
+        return BatchedRandomSampler(sampler, batch_size, drop_last)
+class MulDataset(EasyDataset):
+    """Artifically augmenting the size of a dataset."""
+    multiplicator: int
+    def __init__(self, multiplicator, dataset):
+        assert isinstance(multiplicator, int) and multiplicator > 0
+        self.multiplicator = multiplicator
+        self.dataset = dataset
+    def __len__(self):
+        return self.multiplicator * len(self.dataset)
+    def __repr__(self):
+        return f"{self.multiplicator}*{repr(self.dataset)}"
+    def __getitem__(self, idx):
+        if isinstance(idx, tuple):
+            idx, other, another = idx
+            return self.dataset[idx // self.multiplicator, other, another]
+        else:
+            return self.dataset[idx // self.multiplicator]
+    @property
+    def _resolutions(self):
+        return self.dataset._resolutions
+    @property
+    def num_views(self):
+        return self.dataset.num_views
+class ResizedDataset(EasyDataset):
+    """Artifically changing the size of a dataset."""
+    new_size: int
+    def __init__(self, new_size, dataset):
+        assert isinstance(new_size, int) and new_size > 0
+        self.new_size = new_size
+        self.dataset = dataset
+    def __len__(self):
+        return self.new_size
+    def __repr__(self):
+        size_str = str(self.new_size)
+        for i in range((len(size_str) - 1) // 3):
+            sep = -4 * i - 3
+            size_str = size_str[:sep] + "_" + size_str[sep:]
+        return f"{size_str} @ {repr(self.dataset)}"
+    def set_epoch(self, epoch):
+        # this random shuffle only depends on the epoch
+        rng = np.random.default_rng(seed=epoch + 777)
+        # shuffle all indices
+        perm = rng.permutation(len(self.dataset))
+        # rotary extension until target size is met
+        shuffled_idxs = np.concatenate(
+            [perm] * (1 + (len(self) - 1) // len(self.dataset))
+        )
+        self._idxs_mapping = shuffled_idxs[: self.new_size]
+        assert len(self._idxs_mapping) == self.new_size
+    def __getitem__(self, idx):
+        assert hasattr(
+            self, "_idxs_mapping"
+        ), "You need to call dataset.set_epoch() to use ResizedDataset.__getitem__()"
+        if isinstance(idx, tuple):
+            idx, other, another = idx
+            return self.dataset[self._idxs_mapping[idx], other, another]
+        else:
+            return self.dataset[self._idxs_mapping[idx]]
+    @property
+    def _resolutions(self):
+        return self.dataset._resolutions
+    @property
+    def num_views(self):
+        return self.dataset.num_views
+class CatDataset(EasyDataset):
+    """Concatenation of several datasets"""
+    def __init__(self, datasets):
+        for dataset in datasets:
+            assert isinstance(dataset, EasyDataset)
+        self.datasets = datasets
+        self._cum_sizes = np.cumsum([len(dataset) for dataset in datasets])
+    def __len__(self):
+        return self._cum_sizes[-1]
+    def __repr__(self):
+        # remove uselessly long transform
+        return " + ".join(
+            repr(dataset).replace(
+                ",transform=Compose( ToTensor() Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))",
+                "",
+            )
+            for dataset in self.datasets
+        )
+    def set_epoch(self, epoch):
+        for dataset in self.datasets:
+            dataset.set_epoch(epoch)
+    def __getitem__(self, idx):
+        other = None
+        if isinstance(idx, tuple):
+            idx, other, another = idx
+        cause_error = False
+        while True:
+            if not (0 <= idx < len(self)):
+                raise IndexError()
+            db_idx = np.searchsorted(self._cum_sizes, idx, "right")
+            dataset = self.datasets[db_idx]
+            new_idx = idx - (self._cum_sizes[db_idx - 1] if db_idx > 0 else 0)
+            if other is not None and another is not None:
+                new_idx = (new_idx, other, another)
+            try:
+                res_data = dataset[new_idx]
+            except Exception as e:
+                print(e)
+                print("DATA ERROR", new_idx)
+                idx += 1
+                idx = idx % len(self)
+                continue
+            break
+        return res_data
+    @property
+    def _resolutions(self):
+        resolutions = self.datasets[0]._resolutions
+        for dataset in self.datasets[1:]:
+            assert tuple(dataset._resolutions) == tuple(resolutions)
+        return resolutions
+    @property
+    def num_views(self):
+        num_views = self.datasets[0].num_views
+        for dataset in self.datasets[1:]:
+            assert dataset.num_views == num_views
+        return num_views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/dynamic_replica.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+class DynamicReplica(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 16
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data(self.split)
+    def _load_data(self, split):
+        self.scenes = os.listdir(os.path.join(self.ROOT, split))
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+        j = 0
+        for scene in tqdm(self.scenes):
+            scene_dir = osp.join(self.ROOT, self.split, scene, "left")
+            rgb_dir = osp.join(scene_dir, "rgb")
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")],
+                key=lambda x: float(x),
+            )
+            num_imgs = len(basenames)
+            img_ids = list(np.arange(num_imgs) + offset)
+            cut_off = (
+                self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+            )
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+            start_img_ids.extend(start_img_ids_)
+            sceneids.extend([j] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+            # offset groups
+            offset += num_imgs
+            j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+    def __len__(self):
+        return len(self.start_img_ids)
+    def get_image_num(self):
+        return len(self.images)
+    def _get_views(self, idx, resolution, rng, num_views):
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=1.0,
+            fix_interval_prob=1.0,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id], "left")
+            rgb_dir = osp.join(scene_dir, "rgb")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(scene_dir, "cam")
+            basename = self.images[view_idx]
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png"))
+            # Load depthmap
+            depthmap = np.load(osp.join(depth_dir, basename + ".npy"))
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            cam = np.load(osp.join(cam_dir, basename + ".npz"))
+            camera_pose = cam["pose"]
+            intrinsics = cam["intrinsics"]
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.85, 0.10, 0.05]
+            )
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="dynamic_replica",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{str(idx)}_{str(view_idx)}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/habitat_hm3d.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+class HabitatHM3D_Multi(BaseMultiViewDataset):
+    """Video multi-view dataset read from `ROOT/<scene>/`.
+    For each frame basename the scene directory holds `<basename>.npz` (camera
+    parameters), `image_<basename>.png` and `depth_<basename>.png`.
+    Scenes in which a frame fails to load are flagged in `self.invalid_scenes`
+    and replaced by a randomly drawn start index on later attempts.
+    """
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = False
+        self.max_interval = 8
+        super().__init__(*args, **kwargs)
+        # `_load_data` fills the index attributes in place and returns None
+        self.loaded_data = self._load_data()
+    def _load_data(self):
+        """Index all frames of all scenes and record the valid sequence starts.
+        Frames get consecutive global ids across the kept scenes; scenes with
+        fewer frames than the cut-off are skipped. Each entry of
+        `self.start_img_ids` is a `(scene, global_id)` pair.
+        """
+        self.scenes = os.listdir(self.ROOT)
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+        j = 0
+        for scene in tqdm(self.scenes):
+            scene_dir = osp.join(self.ROOT, scene)
+            # frame basenames are taken from the `.npz` files and sorted numerically
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(scene_dir) if f.endswith(".npz")],
+                key=lambda x: int(x),
+            )
+            num_imgs = len(basenames)
+            # TODO: because current minghui's training data is backward moving, now use seq from -1 to 0
+            img_ids = list(np.arange(num_imgs) + offset)
+            cut_off = (
+                self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+            )
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+            # a sequence may only start where at least `cut_off` frames remain
+            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+            start_img_ids.extend([(scene, id) for id in start_img_ids_])
+            sceneids.extend([j] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+            # offset groups
+            offset += num_imgs
+            j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+        # no scene is considered invalid until one of its frames fails to load
+        self.invalid_scenes = {scene: False for scene in self.scenes}
+    def __len__(self):
+        """Number of valid sequence starts."""
+        return len(self.start_img_ids)
+    def get_image_num(self):
+        """Total number of indexed frames."""
+        return len(self.images)
+    def _get_views(self, idx, resolution, rng, num_views):
+        """Load `num_views` views, retrying until a full sequence is obtained.
+        If the scene of `idx` is flagged invalid, `idx` is replaced by a random
+        start index (this also changes the `instance` string of the views).
+        Raises:
+            RuntimeError: after more than 100 attempts, or when no valid scene
+                is found within `len(self.start_img_ids)` random draws.
+        """
+        invalid_seq = True
+        scene, start_id = self.start_img_ids[idx]  # scene name and start frame id for this index
+        # cap the number of retries so an endless loop cannot stall distributed training
+        max_retries = 100
+        retry_count = 0
+        while invalid_seq:
+            retry_count += 1
+            # give up with an exception once the retry limit is exceeded
+            if retry_count > max_retries:
+                raise RuntimeError(
+                    f"[HabitatHM3D] Failed to get valid views after {max_retries} retries. "
+                    f"idx={idx}, scene={scene}, num_views={num_views}. "
+                    f"This may indicate insufficient valid frames in the dataset."
+                )
+            # print a warning when the 50th attempt is reached
+            if retry_count == 50:
+                print(f"[HabitatHM3D WARNING] Already retried {retry_count} times for idx={idx}, scene={scene}")
+            # if the current scene is flagged invalid, draw a new random scene and start frame id
+            scene_retry = 0
+            while self.invalid_scenes[scene]:
+                scene_retry += 1
+                if scene_retry > len(self.start_img_ids):
+                    raise RuntimeError(
+                        f"[HabitatHM3D] All scenes are invalid! Cannot find valid scene after {scene_retry} attempts."
+                    )
+                idx = rng.integers(low=0, high=len(self.start_img_ids))
+                scene, start_id = self.start_img_ids[idx]
+            all_image_ids = self.scene_img_list[self.sceneids[start_id]]  # all frame ids of the current scene
+            pos, ordered_video = self.get_seq_from_start_id(
+                num_views, start_id, all_image_ids, rng, max_interval=self.max_interval
+            )  # positions `pos` of the sequence from the start id; also returns the ordered-video flag
+            image_idxs = np.array(all_image_ids)[pos]  # frame ids of the sequence, taken from all_image_ids
+            views = []
+            load_failed = False
+            for view_idx in image_idxs:
+                scene_id = self.sceneids[view_idx]
+                scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+                basename = self.images[view_idx]
+                try:
+                    # Load RGB image
+                    rgb_image = imread_cv2(osp.join(scene_dir, "image_" + basename + ".png"))
+                    # Load depthmap
+                    depthmap = imread_cv2(
+                        osp.join(scene_dir, "depth_" + basename + ".png"), cv2.IMREAD_UNCHANGED
+                    )
+                    # NOTE(review): presumably millimetres -> metres; confirm against the data export
+                    depthmap = depthmap.astype(np.float32) / 1000
+                    depthmap[~np.isfinite(depthmap)] = 0  # invalid
+                    camera_params = np.load(osp.join(scene_dir, basename + ".npz"))
+                    intrinsics = np.float32(camera_params["intrinsics"])
+                    # assemble the 4x4 pose from the stored rotation and translation
+                    camera_pose = np.eye(4, dtype=np.float32)
+                    camera_pose[:3, :3] = camera_params["R_cam2world"]
+                    camera_pose[:3, 3] = camera_params["t_cam2world"]
+                except Exception as e:
+                    print(f"[HabitatHM3D] Error loading {scene} {basename}: {e}, skipping scene")
+                    # flag the scene so the next attempt draws a different one
+                    self.invalid_scenes[scene] = True
+                    load_failed = True
+                    break
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+                )
+                views.append(
+                    dict(
+                        img=rgb_image,
+                        depthmap=depthmap.astype(np.float32),
+                        camera_pose=camera_pose.astype(np.float32),
+                        camera_intrinsics=intrinsics.astype(np.float32),
+                        dataset="habitatHM3D",
+                        label=self.scenes[scene_id] + "_" + basename,
+                        instance=f"{str(idx)}_{str(view_idx)}",
+                        is_metric=self.is_metric,
+                        is_video=ordered_video,
+                        quantile=np.array(0.98, dtype=np.float32),
+                        img_mask=True,
+                        ray_mask=False,
+                        camera_only=True,
+                        depth_only=False,
+                        single_view=False,
+                        reset=False,
+                    )
+                )
+            # leave the loop only when every view was loaded successfully
+            if not load_failed and len(views) == num_views:
+                invalid_seq = False
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/hoi4d.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), '..','..'))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+class HOI4D_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data()
+    def _load_data(self):
+        scenes = os.listdir(self.ROOT)
+        img_names = []
+        for scene in scenes:
+            scene_dir = osp.join(self.ROOT, scene)
+            rgb_dir = osp.join(scene_dir, 'rgb')
+            basenames = sorted([f[:-4] for f in os.listdir(rgb_dir) if f.endswith('.png')])
+            img_names.extend([(scene, basename) for basename in basenames])
+        self.img_names = img_names
+    def __len__(self):
+        return len(self.img_names)
+    def get_image_num(self):
+        return len(self.img_names)
+    def _get_views(self, idx, resolution, rng, num_views):
+        new_seed = rng.integers(0, 2**32) + idx
+        new_rng = np.random.default_rng(new_seed)
+        invalid_seq = True
+        while invalid_seq:
+            img_names = new_rng.choice(self.img_names, num_views, replace=False)
+            views = []
+            for v, img_name in enumerate(img_names):
+                # Load RGB image
+                scene, img_name = img_name
+                try:
+                    rgb_image = imread_cv2(osp.join(self.ROOT, scene, "rgb", f"{img_name}.png"))
+                    depthmap = np.load(osp.join(self.ROOT, scene, "depth", f"{img_name}.npy"))
+                    depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+                    intrinsics = np.load(osp.join(self.ROOT, scene, "cam", f"{img_name}.npz"))["intrinsics"]
+                except:
+                    print(f"Error loading {scene} {img_name}, skipping")
+                    break
+                # camera pose is not provided, placeholder
+                camera_pose = np.eye(4)
+                rgb_image, depthmap, intrinsics= self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name)
+                views.append(dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset='HOI4D',
+                    label=img_name,
+                    instance=osp.join(self.ROOT, scene, "rgb", f"{img_name}.png"),
+                    is_metric=self.is_metric,
+                    is_video=False,
+                    quantile=np.array(0.99, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=True,
+                    reset=True,
+                ))
+            if len(views) == num_views:
+                invalid_seq = False
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/mapfree.py ADDED Viewed

	@@ -0,0 +1,282 @@

+import os.path as osp
+import numpy as np
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+import pickle
+import h5py
+from tqdm import tqdm
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+class MapFree_Multi(BaseMultiViewDataset):
+    def __init__(self, ROOT, *args, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 30
+        super().__init__(*args, **kwargs)
+        self._load_data()
+    def imgid2path(self, img_id, scene):
+        first_seq_id, first_frame_id = img_id
+        return os.path.join(
+            self.ROOT,
+            scene,
+            f"dense{first_seq_id}",
+            "rgb",
+            f"frame_{first_frame_id:05d}.jpg",
+        )
+    def path2imgid(self, subscene, filename):
+        first_seq_id = int(subscene[5:])
+        first_frame_id = int(filename[6:-4])
+        return [first_seq_id, first_frame_id]
+    def _load_data(self):
+        cache_file = f"{self.ROOT}/cached_metadata_50_col_only.h5"
+        if os.path.exists(cache_file):
+            print(f"Loading cached metadata from {cache_file}")
+            with h5py.File(cache_file, "r") as hf:
+                self.scenes = list(map(lambda x: x.decode("utf-8"), hf["scenes"][:]))
+                self.sceneids = hf["sceneids"][:]
+                self.scope = hf["scope"][:]
+                self.video_flags = hf["video_flags"][:]
+                self.groups = hf["groups"][:]
+                self.id_ranges = hf["id_ranges"][:]
+                self.images = hf["images"][:]
+        else:
+            scene_dirs = sorted(
+                [
+                    d
+                    for d in os.listdir(self.ROOT)
+                    if os.path.isdir(os.path.join(self.ROOT, d))
+                ]
+            )
+            scenes = []
+            sceneids = []
+            groups = []
+            scope = []
+            images = []
+            id_ranges = []
+            is_video = []
+            start = 0
+            j = 0
+            offset = 0
+            for scene in tqdm(scene_dirs):
+                scenes.append(scene)
+                # video sequences
+                subscenes = sorted(
+                    [
+                        d
+                        for d in os.listdir(os.path.join(self.ROOT, scene))
+                        if d.startswith("dense")
+                    ]
+                )
+                id_range_subscenes = []
+                for subscene in subscenes:
+                    rgb_paths = sorted(
+                        [
+                            d
+                            for d in os.listdir(
+                                os.path.join(self.ROOT, scene, subscene, "rgb")
+                            )
+                            if d.endswith(".jpg")
+                        ]
+                    )
+                    assert (
+                        len(rgb_paths) > 0
+                    ), f"{os.path.join(self.ROOT, scene, subscene)} is empty."
+                    num_imgs = len(rgb_paths)
+                    images.extend(
+                        [self.path2imgid(subscene, rgb_path) for rgb_path in rgb_paths]
+                    )
+                    id_range_subscenes.append((offset, offset + num_imgs))
+                    offset += num_imgs
+                # image collections
+                metadata = pickle.load(
+                    open(os.path.join(self.ROOT, scene, "metadata.pkl"), "rb")
+                )
+                ref_imgs = list(metadata.keys())
+                img_groups = []
+                for ref_img in ref_imgs:
+                    other_imgs = metadata[ref_img]
+                    if len(other_imgs) + 1 < self.num_views:
+                        continue
+                    group = [(*other_img[0], other_img[1]) for other_img in other_imgs]
+                    group.insert(0, (*ref_img, 1))
+                    img_groups.append(np.array(group))
+                    id_ranges.append(id_range_subscenes[ref_img[0]])
+                    scope.append(start)
+                    start = start + len(group)
+                num_groups = len(img_groups)
+                sceneids.extend([j] * num_groups)
+                groups.extend(img_groups)
+                is_video.extend([False] * num_groups)
+                j += 1
+            self.scenes = np.array(scenes)
+            self.sceneids = np.array(sceneids)
+            self.scope = np.array(scope)
+            self.video_flags = np.array(is_video)
+            self.groups = np.concatenate(groups, 0)
+            self.id_ranges = np.array(id_ranges)
+            self.images = np.array(images)
+            data = dict(
+                scenes=self.scenes,
+                sceneids=self.sceneids,
+                scope=self.scope,
+                video_flags=self.video_flags,
+                groups=self.groups,
+                id_ranges=self.id_ranges,
+                images=self.images,
+            )
+            with h5py.File(cache_file, "w") as h5f:
+                h5f.create_dataset(
+                    "scenes",
+                    data=data["scenes"].astype(object),
+                    dtype=h5py.string_dtype(encoding="utf-8"),
+                    compression="lzf",
+                    chunks=True,
+                )
+                h5f.create_dataset(
+                    "sceneids", data=data["sceneids"], compression="lzf", chunks=True
+                )
+                h5f.create_dataset(
+                    "scope", data=data["scope"], compression="lzf", chunks=True
+                )
+                h5f.create_dataset(
+                    "video_flags",
+                    data=data["video_flags"],
+                    compression="lzf",
+                    chunks=True,
+                )
+                h5f.create_dataset(
+                    "groups", data=data["groups"], compression="lzf", chunks=True
+                )
+                h5f.create_dataset(
+                    "id_ranges", data=data["id_ranges"], compression="lzf", chunks=True
+                )
+                h5f.create_dataset(
+                    "images", data=data["images"], compression="lzf", chunks=True
+                )
+    def __len__(self):
+        return len(self.scope)
+    def get_image_num(self):
+        return len(self.images)
+    def get_stats(self):
+        return f"{len(self)} groups of views"
+    def _get_views(self, idx, resolution, rng, num_views):
+        scene = self.scenes[self.sceneids[idx]]
+        if rng.random() < 0.6:
+            ids = np.arange(self.id_ranges[idx][0], self.id_ranges[idx][1])
+            cut_off = num_views if not self.allow_repeat else max(num_views // 3, 3)
+            start_ids = ids[: len(ids) - cut_off + 1]
+            start_id = rng.choice(start_ids)
+            pos, ordered_video = self.get_seq_from_start_id(
+                num_views,
+                start_id,
+                ids.tolist(),
+                rng,
+                max_interval=self.max_interval,
+                video_prob=0.8,
+                fix_interval_prob=0.5,
+                block_shuffle=16,
+            )
+            ids = np.array(ids)[pos]
+            image_idxs = self.images[ids]
+        else:
+            ordered_video = False
+            seq_start_index = self.scope[idx]
+            seq_end_index = self.scope[idx + 1] if idx < len(self.scope) - 1 else None
+            image_idxs = (
+                self.groups[seq_start_index:seq_end_index]
+                if seq_end_index is not None
+                else self.groups[seq_start_index:]
+            )
+            image_idxs, overlap_scores = image_idxs[:, :2], image_idxs[:, 2]
+            replace = (
+                True
+                if self.allow_repeat
+                or len(overlap_scores[overlap_scores > 0]) < num_views
+                else False
+            )
+            image_idxs = rng.choice(
+                image_idxs,
+                num_views,
+                replace=replace,
+                p=overlap_scores / np.sum(overlap_scores),
+            )
+            image_idxs = image_idxs.astype(np.int64)
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            img_path = self.imgid2path(view_idx, scene)
+            depth_path = img_path.replace("rgb", "depth").replace(".jpg", ".npy")
+            cam_path = img_path.replace("rgb", "cam").replace(".jpg", ".npz")
+            sky_mask_path = img_path.replace("rgb", "sky_mask")
+            image = imread_cv2(img_path)
+            depthmap = np.load(depth_path)
+            camera_params = np.load(cam_path)
+            sky_mask = cv2.imread(sky_mask_path, cv2.IMREAD_UNCHANGED) >= 127
+            intrinsics = camera_params["intrinsic"].astype(np.float32)
+            camera_pose = camera_params["pose"].astype(np.float32)
+            depthmap[sky_mask] = -1.0
+            depthmap[depthmap > 400.0] = 0.0
+            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+            threshold = (
+                np.percentile(depthmap[depthmap > 0], 98)
+                if depthmap[depthmap > 0].size > 0
+                else 0
+            )
+            depthmap[depthmap > threshold] = 0.0
+            image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                image, depthmap, intrinsics, resolution, rng, info=(img_path)
+            )
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+            views.append(
+                dict(
+                    img=image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,  # cam2world
+                    camera_intrinsics=intrinsics,
+                    dataset="MapFree",
+                    label=img_path,
+                    is_metric=self.is_metric,
+                    instance=img_path,
+                    is_video=ordered_video,
+                    quantile=np.array(0.96, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/mvs_synth.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_pil
+class MVS_Synth_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = False
+        self.max_interval = 4
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data()
+        print('DATA: mvs_synth', len(self))
+    def _load_data(self):
+        self.scenes = os.listdir(self.ROOT)
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+        j = 0
+        for scene in tqdm(self.scenes):
+            scene_dir = osp.join(self.ROOT, scene)
+            rgb_dir = osp.join(scene_dir, "rgb")
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".jpg")]
+            )
+            num_imgs = len(basenames)
+            cut_off = (
+                self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+            )
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+            img_ids = list(np.arange(num_imgs) + offset)
+            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+            start_img_ids.extend(start_img_ids_)
+            sceneids.extend([j] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+            # offset groups
+            offset += num_imgs
+            j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+    def __len__(self):
+        return len(self.start_img_ids)
+    def get_image_num(self):
+        return len(self.images)
+    def _get_views(self, idx, resolution, rng, num_views):
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=1.0,
+            fix_interval_prob=1.0,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "rgb")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(scene_dir, "cam")
+            basename = self.images[view_idx]
+            # Load RGB image
+            rgb_image = imread_pil(osp.join(rgb_dir, basename + ".jpg"))
+            # Load depthmap
+            depthmap = np.load(osp.join(depth_dir, basename + ".npy"))
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            threshold = (
+                np.percentile(depthmap[depthmap > 0], 98)
+                if depthmap[depthmap > 0].size > 0
+                else 0
+            )
+            depthmap[depthmap > threshold] = 0.0
+            depthmap[depthmap > 1000] = 0.0
+            cam = np.load(osp.join(cam_dir, basename + ".npz"))
+            camera_pose = cam["pose"]
+            intrinsics = cam["intrinsics"]
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.8, 0.15, 0.05]
+            )
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="MVS_Synth",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=osp.join(rgb_dir, basename + ".jpg"),
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/omniobject3d.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+import json
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+import re
+def extract_number(filename):
+    match = re.search(r"\d+", filename)
+    if match:
+        return int(match.group())
+    return 0
+class OmniObject3D_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = False
+        self.is_metric = False  # True
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data()
+    def _load_data(self):
+        self.scenes = [
+            d
+            for d in os.listdir(self.ROOT)
+            if os.path.isdir(os.path.join(self.ROOT, d)) and not d.startswith('.')
+        ]
+        with open(os.path.join(self.ROOT, "scale.json"), "r") as f:
+            self.scales = json.load(f)
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+        j = 0
+        for scene in tqdm(self.scenes):
+            scene_dir = osp.join(self.ROOT, scene)
+            rgb_dir = osp.join(scene_dir, "rgb")
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")],
+                key=extract_number,
+            )
+            num_imgs = len(basenames)
+            cut_off = (
+                self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+            )
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+            img_ids = list(np.arange(num_imgs) + offset)
+            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+            start_img_ids.extend([(scene, id) for id in start_img_ids_])
+            sceneids.extend([j] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+            # offset groups
+            offset += num_imgs
+            j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+    def __len__(self):
+        return len(self.start_img_ids)
+    def get_image_num(self):
+        return len(self.images)
+    def _get_views(self, idx, resolution, rng, num_views):
+        scene, start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views, start_id, all_image_ids, rng, max_interval=100, video_prob=0.0
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "rgb")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(scene_dir, "cam")
+            basename = self.images[view_idx]
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png"))
+            depthmap = np.load(osp.join(depth_dir, basename + ".npy"))
+            cam = np.load(osp.join(cam_dir, basename + ".npz"))
+            camera_pose = cam["pose"]
+            intrinsics = cam["intrinsics"]
+            scale = self.scales[self.scenes[scene_id]]
+            depthmap = depthmap / scale / 1000.0
+            camera_pose[:3, 3] = camera_pose[:3, 3] / scale / 1000.0
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.8, 0.15, 0.05]
+            )
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="OmniObject3D",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{str(idx)}_{str(view_idx)}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/pointodyssey.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+class PointOdyssey_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 4
+        super().__init__(*args, **kwargs)
+        assert self.split in ["train", "test", "val"]
+        self.scenes_to_use = [
+            # 'cab_h_bench_3rd', 'cab_h_bench_ego1', 'cab_h_bench_ego2',
+            "cnb_dlab_0215_3rd",
+            "cnb_dlab_0215_ego1",
+            "cnb_dlab_0225_3rd",
+            "cnb_dlab_0225_ego1",
+            "dancing",
+            "dancingroom0_3rd",
+            "footlab_3rd",
+            "footlab_ego1",
+            "footlab_ego2",
+            "girl",
+            "girl_egocentric",
+            "human_egocentric",
+            "human_in_scene",
+            "human_in_scene1",
+            "kg",
+            "kg_ego1",
+            "kg_ego2",
+            "kitchen_gfloor",
+            "kitchen_gfloor_ego1",
+            "kitchen_gfloor_ego2",
+            "scene_carb_h_tables",
+            "scene_carb_h_tables_ego1",
+            "scene_carb_h_tables_ego2",
+            "scene_j716_3rd",
+            "scene_j716_ego1",
+            "scene_j716_ego2",
+            "scene_recording_20210910_S05_S06_0_3rd",
+            "scene_recording_20210910_S05_S06_0_ego2",
+            "scene1_0129",
+            "scene1_0129_ego",
+            "seminar_h52_3rd",
+            "seminar_h52_ego1",
+            "seminar_h52_ego2",
+        ]
+        self.loaded_data = self._load_data(self.split)
+    def _load_data(self, split):
+        root = os.path.join(self.ROOT, split)
+        self.scenes = []
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+        j = 0
+        for scene in tqdm(os.listdir(root)):
+            if scene not in self.scenes_to_use:
+                continue
+            scene_dir = osp.join(root, scene)
+            rgb_dir = osp.join(scene_dir, "rgb")
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".jpg")]
+            )
+            num_imgs = len(basenames)
+            img_ids = list(np.arange(num_imgs) + offset)
+            cut_off = (
+                self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+            )
+            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+            # start_img_ids_ = img_ids[:-self.num_views+1]
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+            start_img_ids.extend(start_img_ids_)
+            sceneids.extend([j] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+            # offset groups
+            offset += num_imgs
+            j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+    def __len__(self):
+        return len(self.start_img_ids)
+    def get_image_num(self):
+        return len(self.images)
+    def _get_views(self, idx, resolution, rng, num_views):
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=1.0,
+            fix_interval_prob=1.0,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "rgb")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(scene_dir, "cam")
+            basename = self.images[view_idx]
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".jpg"))
+            # Load depthmap
+            depthmap = np.load(osp.join(depth_dir, basename + ".npy"))
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            depthmap[depthmap > 1000] = 0.0
+            cam = np.load(osp.join(cam_dir, basename + ".npz"))
+            camera_pose = cam["pose"]
+            intrinsics = cam["intrinsics"]
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.9, 0.05, 0.05]
+            )
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="PointOdyssey",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=osp.join(rgb_dir, basename + ".jpg"),
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/realestate10k.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+class RE10K_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = False
+        self.max_interval = 128
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data()
+    def _load_data(self):
+        self.scenes = os.listdir(self.ROOT)
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+        j = 0
+        for scene in tqdm(self.scenes):
+            scene_dir = osp.join(self.ROOT, scene)
+            rgb_dir = osp.join(scene_dir, "rgb")
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")],
+                key=lambda x: int(x),
+            )
+            num_imgs = len(basenames)
+            img_ids = list(np.arange(num_imgs) + offset)
+            cut_off = (
+                self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+            )
+            if num_imgs < cut_off:
+                print(f"Skipping {scene}")
+                continue
+            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+            start_img_ids.extend([(scene, id) for id in start_img_ids_])
+            sceneids.extend([j] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            scene_img_list.append(img_ids)
+            # offset groups
+            offset += num_imgs
+            j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+        self.invalid_scenes = {scene: False for scene in self.scenes}
+    def __len__(self):
+        return len(self.start_img_ids)
+    def get_image_num(self):
+        return len(self.images)
+    def _get_views(self, idx, resolution, rng, num_views):
+        invalid_seq = True
+        scene, start_id = self.start_img_ids[idx]
+        while invalid_seq:
+            while self.invalid_scenes[scene]:
+                idx = rng.integers(low=0, high=len(self.start_img_ids))
+                scene, start_id = self.start_img_ids[idx]
+            all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+            pos, ordered_video = self.get_seq_from_start_id(
+                num_views, start_id, all_image_ids, rng, max_interval=self.max_interval
+            )
+            image_idxs = np.array(all_image_ids)[pos]
+            views = []
+            for view_idx in image_idxs:
+                scene_id = self.sceneids[view_idx]
+                scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+                rgb_dir = osp.join(scene_dir, "rgb")
+                cam_dir = osp.join(scene_dir, "cam")
+                basename = self.images[view_idx]
+                try:
+                    # Load RGB image
+                    rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png"))
+                    # Load depthmap, no depth, set to all ones
+                    depthmap = np.ones_like(rgb_image[..., 0], dtype=np.float32)
+                    cam = np.load(osp.join(cam_dir, basename + ".npz"))
+                    intrinsics = cam["intrinsics"]
+                    camera_pose = cam["pose"]
+                except:
+                    print(f"Error loading {scene} {basename}, skipping")
+                    self.invalid_scenes[scene] = True
+                    break
+                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+                )
+                views.append(
+                    dict(
+                        img=rgb_image,
+                        depthmap=depthmap.astype(np.float32),
+                        camera_pose=camera_pose.astype(np.float32),
+                        camera_intrinsics=intrinsics.astype(np.float32),
+                        dataset="realestate10k",
+                        label=self.scenes[scene_id] + "_" + basename,
+                        instance=f"{str(idx)}_{str(view_idx)}",
+                        is_metric=self.is_metric,
+                        is_video=ordered_video,
+                        quantile=np.array(0.98, dtype=np.float32),
+                        img_mask=True,
+                        ray_mask=False,
+                        camera_only=True,
+                        depth_only=False,
+                        single_view=False,
+                        reset=False,
+                    )
+                )
+            if len(views) == num_views:
+                invalid_seq = False
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/scannet.py ADDED Viewed

	@@ -0,0 +1,149 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2, imread_pil
+class ScanNet_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 30
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data(self.split)
+        print('DATA: scannet', len(self))
+    def _load_data(self, split):
+        self.scene_root = osp.join(
+            self.ROOT, "scans_train" if split == "train" else "scans_test"
+        )
+        self.scenes = [
+            scene for scene in os.listdir(self.scene_root) if scene.startswith("scene")
+        ]
+        offset = 0
+        scenes = []
+        sceneids = []
+        scene_img_list = []
+        images = []
+        start_img_ids = []
+        j = 0
+        for scene in tqdm(self.scenes):
+            scene_dir = osp.join(self.scene_root, scene)
+            with np.load(
+                osp.join(scene_dir, "new_scene_metadata.npz"), allow_pickle=True
+            ) as data:
+                basenames = data["images"]
+                num_imgs = len(basenames)
+                img_ids = list(np.arange(num_imgs) + offset)
+                cut_off = (
+                    self.num_views
+                    if not self.allow_repeat
+                    else max(self.num_views // 3, 3)
+                )
+                start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+                if num_imgs < cut_off:
+                    print(f"Skipping {scene}")
+                    continue
+                start_img_ids.extend(start_img_ids_)
+                sceneids.extend([j] * num_imgs)
+                images.extend(basenames)
+                scenes.append(scene)
+                scene_img_list.append(img_ids)
+                # offset groups
+                offset += num_imgs
+                j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+    def __len__(self):
+        return len(self.start_img_ids)
+    def get_image_num(self):
+        return len(self.images)
+    def _get_views(self, idx, resolution, rng, num_views):
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=self.max_interval,
+            video_prob=0.6,
+            fix_interval_prob=0.6,
+            block_shuffle=16,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.scene_root, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "color")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(scene_dir, "cam")
+            basename = self.images[view_idx]
+            # Load RGB image
+            rgb_image = imread_pil(osp.join(rgb_dir, basename + ".jpg"))
+            # Load depthmap
+            depthmap = imread_cv2(
+                osp.join(depth_dir, basename + ".png"), cv2.IMREAD_UNCHANGED
+            )
+            depthmap = depthmap.astype(np.float32) / 1000
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            cam = np.load(osp.join(cam_dir, basename + ".npz"))
+            camera_pose = cam["pose"]
+            intrinsics = cam["intrinsics"]
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="ScanNet",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{str(idx)}_{str(view_idx)}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(0.98, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/scannetpp.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2, imread_pil
+class ScanNetpp_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        self.max_interval = 3
+        super().__init__(*args, **kwargs)
+        assert self.split == "train"
+        self.loaded_data = self._load_data()
+    def _load_data(self):
+        with np.load(osp.join(self.ROOT, "all_metadata.npz")) as data:
+            self.scenes = data["scenes"]
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        intrinsics = []
+        trajectories = []
+        groups = []
+        id_ranges = []
+        j = 0
+        self.image_num = 0
+        for scene in self.scenes:
+            scene_dir = osp.join(self.ROOT, scene)
+            with np.load(
+                osp.join(scene_dir, "new_scene_metadata.npz"), allow_pickle=True
+            ) as data:
+                imgs = data["images"]
+                self.image_num += len(imgs)
+                img_ids = np.arange(len(imgs)).tolist()
+                intrins = data["intrinsics"]
+                traj = data["trajectories"]
+                imgs_on_disk = sorted(os.listdir(osp.join(scene_dir, "images")))
+                imgs_on_disk = list(map(lambda x: x[:-4], imgs_on_disk))
+                dslr_ids = [
+                    i + offset
+                    for i in img_ids
+                    if imgs[i].startswith("DSC") and imgs[i] in imgs_on_disk
+                ]
+                iphone_ids = [
+                    i + offset
+                    for i in img_ids
+                    if imgs[i].startswith("frame") and imgs[i] in imgs_on_disk
+                ]
+                num_imgs = len(imgs)
+                assert max(dslr_ids) < min(iphone_ids)
+                assert "image_collection" in data
+                img_groups = []
+                img_id_ranges = []
+                # 使用与其他数据集一致的 cut_off 逻辑
+                min_group_len = (
+                    self.num_views
+                    if not self.allow_repeat
+                    else max(self.num_views // 3, 3)
+                )
+                for ref_id, group in data["image_collection"].item().items():
+                    if len(group) + 1 < min_group_len:
+                        continue
+                    group.insert(0, (ref_id, 1.0))
+                    sorted_group = sorted(group, key=lambda x: x[1], reverse=True)
+                    group = [int(x[0] + offset) for x in sorted_group]
+                    # 确定对应的视频帧列表
+                    if imgs[ref_id].startswith("frame"):
+                        video_ids = dslr_ids
+                    else:
+                        video_ids = iphone_ids
+                    # 只有当视频帧列表足够长时才添加
+                    if len(video_ids) >= min_group_len:
+                        img_groups.append(sorted(group))
+                        img_id_ranges.append(video_ids)
+                if len(img_groups) == 0:
+                    print(f"Skipping {scene}")
+                    continue
+                scenes.append(scene)
+                sceneids.extend([j] * num_imgs)
+                images.extend(imgs)
+                intrinsics.append(intrins)
+                trajectories.append(traj)
+                # offset groups
+                groups.extend(img_groups)
+                id_ranges.extend(img_id_ranges)
+                offset += num_imgs
+                j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.intrinsics = np.concatenate(intrinsics, axis=0)
+        self.trajectories = np.concatenate(trajectories, axis=0)
+        self.id_ranges = id_ranges
+        self.groups = groups
+    def __len__(self):
+        return len(self.groups) * 10
+    def get_image_num(self):
+        return self.image_num
+    def _get_views(self, idx, resolution, rng, num_views):
+        idx = idx // 10
+        image_idxs = self.groups[idx]
+        rand_val = rng.random()
+        image_idxs_video = self.id_ranges[idx]
+        cut_off = num_views if not self.allow_repeat else max(num_views // 3, 3)
+        start_image_idxs = image_idxs_video[: len(image_idxs_video) - cut_off + 1]
+        if rand_val < 0.7 and len(start_image_idxs) > 0:
+            start_id = rng.choice(start_image_idxs)
+            pos, ordered_video = self.get_seq_from_start_id(
+                num_views,
+                start_id,
+                image_idxs_video,
+                rng,
+                max_interval=self.max_interval,
+                video_prob=0.8,
+                fix_interval_prob=0.5,
+                block_shuffle=16,
+            )
+            image_idxs = np.array(image_idxs_video)[pos]
+        else:
+            ordered_video = True
+            # ordered video with varying intervals
+            num_candidates = len(image_idxs)
+            max_id = min(num_candidates, int(num_views * (2 + 2 * rng.random())))
+            # 确保有足够的候选帧
+            if num_candidates < num_views:
+                # 如果候选帧不足，使用重复采样
+                image_idxs = sorted(rng.choice(image_idxs, size=num_views, replace=True))
+            else:
+                image_idxs = sorted(rng.permutation(image_idxs[:max_id])[:num_views])
+            if rand_val > 0.75:
+                ordered_video = False
+                image_idxs = rng.permutation(image_idxs)
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            intrinsics = self.intrinsics[view_idx]
+            camera_pose = self.trajectories[view_idx]
+            basename = self.images[view_idx]
+            # Load RGB image
+            rgb_image = imread_pil(osp.join(scene_dir, "images", basename + ".jpg"))
+            # Load depthmap
+            depthmap = imread_cv2(
+                osp.join(scene_dir, "depth", basename + ".png"), cv2.IMREAD_UNCHANGED
+            )
+            depthmap = depthmap.astype(np.float32) / 1000
+            depthmap[~np.isfinite(depthmap)] = 0  # invalid
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="ScanNet++",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{str(idx)}_{str(view_idx)}",
+                    is_metric=self.is_metric,
+                    is_video=ordered_video,
+                    quantile=np.array(0.99, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/smartportraits.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+class SmartPortraits_Multi(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = True
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data()
+    def _load_data(self):
+        scenes = os.listdir(self.ROOT)
+        img_names = []
+        for scene in scenes:
+            scene_dir = osp.join(self.ROOT, scene)
+            rgb_dir = osp.join(scene_dir, "rgb")
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")]
+            )
+            img_names.extend([(scene, basename) for basename in basenames])
+        self.img_names = img_names
+    def __len__(self):
+        return len(self.img_names)
+    def get_image_num(self):
+        return len(self.img_names)
+    def _get_views(self, idx, resolution, rng, num_views):
+        new_seed = rng.integers(0, 2**32) + idx
+        new_rng = np.random.default_rng(new_seed)
+        img_names = new_rng.choice(self.img_names, num_views, replace=False)
+        views = []
+        for v, img_name in enumerate(img_names):
+            # Load RGB image
+            scene, img_name = img_name
+            rgb_image = imread_cv2(osp.join(self.ROOT, scene, "rgb", f"{img_name}.png"))
+            depthmap = np.load(osp.join(self.ROOT, scene, "depth", f"{img_name}.npy"))
+            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
+            intrinsics = np.load(osp.join(self.ROOT, scene, "cam", f"{img_name}.npz"))[
+                "intrinsics"
+            ]
+            # camera pose is not provided, placeholder
+            camera_pose = np.eye(4)
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name
+            )
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="SmartPortraits",
+                    label=img_name,
+                    instance=osp.join(self.ROOT, scene, "rgb", f"{img_name}.png"),
+                    is_metric=self.is_metric,
+                    is_video=False,
+                    quantile=np.array(0.98, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=True,
+                    reset=True,
+                )
+            )
+        assert len(views) == num_views
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/threedkb.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import os.path as osp
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from tqdm import tqdm
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+class ThreeDKenBurns(BaseMultiViewDataset):
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.video = False
+        self.is_metric = False
+        super().__init__(*args, **kwargs)
+        self.loaded_data = self._load_data()
+    def _load_data(self):
+        self.scenes = os.listdir(self.ROOT)
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        img_ids = []
+        j = 0
+        for scene in tqdm(self.scenes):
+            scene_dir = osp.join(self.ROOT, scene)
+            rgb_dir = osp.join(scene_dir, "rgb")
+            basenames = sorted(
+                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")]
+            )
+            num_imgs = len(basenames)
+            img_ids_ = list(np.arange(num_imgs) + offset)
+            img_ids.extend(img_ids_)
+            sceneids.extend([j] * num_imgs)
+            images.extend(basenames)
+            scenes.append(scene)
+            # offset groups
+            offset += num_imgs
+            j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.img_ids = img_ids
+    def __len__(self):
+        return len(self.img_ids)
+    def get_image_num(self):
+        return len(self.images)
+    def _get_views(self, idx, resolution, rng, num_views):
+        new_seed = rng.integers(0, 2**32) + idx
+        new_rng = np.random.default_rng(new_seed)
+        image_idxs = new_rng.choice(self.img_ids, num_views, replace=False)
+        views = []
+        for view_idx in image_idxs:
+            scene_id = self.sceneids[view_idx]
+            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
+            rgb_dir = osp.join(scene_dir, "rgb")
+            depth_dir = osp.join(scene_dir, "depth")
+            cam_dir = osp.join(scene_dir, "cam")
+            basename = self.images[view_idx]
+            # Load RGB image
+            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png"))
+            depthmap = imread_cv2(osp.join(depth_dir, basename + ".exr"))
+            depthmap[depthmap > 20000] = 0.0
+            depthmap = depthmap / 1000.0
+            cam = np.load(osp.join(cam_dir, basename + ".npz"))
+            intrinsics = cam["intrinsics"]
+            camera_pose = np.eye(4)
+            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
+            )
+            views.append(
+                dict(
+                    img=rgb_image,
+                    depthmap=depthmap.astype(np.float32),
+                    camera_pose=camera_pose.astype(np.float32),
+                    camera_intrinsics=intrinsics.astype(np.float32),
+                    dataset="3DKenBurns",
+                    label=self.scenes[scene_id] + "_" + basename,
+                    instance=f"{str(idx)}_{str(view_idx)}",
+                    is_metric=self.is_metric,
+                    is_video=False,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=True,
+                    ray_mask=False,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=True,
+                    reset=True,
+                )
+            )
+        assert len(views) == num_views
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/unreal4k.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import os.path as osp
+import numpy as np
+import cv2
+import numpy as np
+import itertools
+import os
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+R_conv = np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]).astype(
+    np.float32
+)
+class UnReal4K_Multi(BaseMultiViewDataset):
+    def __init__(self, ROOT, *args, **kwargs):
+        self.ROOT = ROOT
+        self.max_interval = 2
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        # loading all
+        assert self.split is None
+        self._load_data()
+    def _load_data(self):
+        scene_dirs = sorted(
+            [
+                d
+                for d in os.listdir(self.ROOT)
+                if os.path.isdir(os.path.join(self.ROOT, d))
+            ]
+        )
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        start_img_ids = []
+        scene_img_list = []
+        j = 0
+        seq_dirs = sorted(
+            [
+                os.path.join(self.ROOT, scene, mode)
+                for scene in scene_dirs
+                for mode in ["0", "1"]
+            ]
+        )
+        for seq_dir in seq_dirs:
+            basenames = sorted(
+                [f[:-8] for f in os.listdir(seq_dir) if f.endswith(".png")]
+            )
+            num_imgs = len(basenames)
+            img_ids = list(np.arange(num_imgs) + offset)
+            # start_img_ids_ = img_ids[:-self.num_views+1]
+            cut_off = (
+                self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+            )
+            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+            if num_imgs < cut_off:
+                print(f"Skipping {seq_dir}")
+                continue
+            start_img_ids.extend(start_img_ids_)
+            sceneids.extend([j] * num_imgs)
+            images.extend(basenames)
+            scenes.append(seq_dir)
+            scene_img_list.append(img_ids)
+            # offset groups
+            offset += num_imgs
+            j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+    def __len__(self):
+        return len(self.start_img_ids) * 10
+    def get_image_num(self):
+        return len(self.images)
+    def get_stats(self):
+        return f"{len(self)//10} groups of views"
+    def _get_views(self, idx, resolution, rng, num_views):
+        idx = idx // 10
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views, start_id, all_image_ids, rng, max_interval=self.max_interval
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir = self.scenes[scene_id]
+            basename = self.images[view_idx]
+            img = basename + "_rgb.png"
+            image = imread_cv2(osp.join(scene_dir, img))
+            depthmap = np.load(osp.join(scene_dir, basename + "_depth.npy"))
+            camera_params = np.load(osp.join(scene_dir, basename + ".npz"))
+            intrinsics = camera_params["intrinsics"].astype(np.float32)
+            camera_pose = camera_params["cam2world"].astype(np.float32)
+            camera_pose = R_conv @ camera_pose
+            sky_mask = depthmap >= 1000
+            depthmap[sky_mask] = -1.0  # sky
+            threshold = (
+                np.percentile(depthmap[depthmap > 0], 98)
+                if depthmap[depthmap > 0].size > 0
+                else 0
+            )
+            depthmap[depthmap > threshold] = 0.0
+            image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                image, depthmap, intrinsics, resolution, rng, info=(scene_dir, img)
+            )
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
+            )
+            views.append(
+                dict(
+                    img=image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,  # cam2world
+                    camera_intrinsics=intrinsics,
+                    dataset="UnReal4K",
+                    label=scene_dir,
+                    is_metric=self.is_metric,
+                    instance=scene_dir + "_" + img,
+                    is_video=ordered_video,
+                    quantile=np.array(1.0, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        assert len(views) == num_views
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/utils/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2	+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/utils/corr.py ADDED Viewed

	@@ -0,0 +1,129 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+import numpy as np
+from dust3r.utils.device import to_numpy
+from dust3r.utils.geometry import inv, geotrf
+def reproject_view(pts3d, view2):
+    shape = view2["pts3d"].shape[:2]
+    return reproject(
+        pts3d, view2["camera_intrinsics"], inv(view2["camera_pose"]), shape
+    )
+def reproject(pts3d, K, world2cam, shape):
+    H, W, THREE = pts3d.shape
+    assert THREE == 3
+    # reproject in camera2 space
+    with np.errstate(divide="ignore", invalid="ignore"):
+        pos = geotrf(K @ world2cam[:3], pts3d, norm=1, ncol=2)
+    # quantize to pixel positions
+    return (H, W), ravel_xy(pos, shape)
+def ravel_xy(pos, shape):
+    H, W = shape
+    with np.errstate(invalid="ignore"):
+        qx, qy = pos.reshape(-1, 2).round().astype(np.int32).T
+    quantized_pos = qx.clip(min=0, max=W - 1, out=qx) + W * qy.clip(
+        min=0, max=H - 1, out=qy
+    )
+    return quantized_pos
+def unravel_xy(pos, shape):
+    # convert (x+W*y) back to 2d (x,y) coordinates
+    return np.unravel_index(pos, shape)[0].base[:, ::-1].copy()
+def reciprocal_1d(corres_1_to_2, corres_2_to_1, ret_recip=False):
+    is_reciprocal1 = corres_2_to_1[corres_1_to_2] == np.arange(len(corres_1_to_2))
+    pos1 = is_reciprocal1.nonzero()[0]
+    pos2 = corres_1_to_2[pos1]
+    if ret_recip:
+        return is_reciprocal1, pos1, pos2
+    return pos1, pos2
+def extract_correspondences_from_pts3d(
+    view1, view2, target_n_corres, rng=np.random, ret_xy=True, nneg=0
+):
+    """Extract pixel correspondences between two views from their 3D points.
+    Each view's ``pts3d`` is projected into the other view; pixels whose
+    round trip returns to themselves are the positive correspondences.
+    Args:
+        view1, view2: views providing ``pts3d``, ``camera_intrinsics`` and
+            ``camera_pose`` (converted with ``to_numpy`` first).
+        target_n_corres: number of correspondences to return, or None to
+            return every reciprocal pair.
+        rng: random source providing ``permutation`` and ``choice``.
+        ret_xy: if True, positions are returned as (x, y) pairs instead of
+            flattened ``x + W * y`` indices.
+        nneg: fraction of ``target_n_corres`` to fill with negatives
+            (non-reciprocal pixels drawn independently in each image).
+    Returns:
+        ``(pos1, pos2)`` when ``target_n_corres`` is None, otherwise
+        ``(pos1, pos2, valid)`` where ``valid`` is True for positives and
+        False for negatives.
+    """
+    view1, view2 = to_numpy((view1, view2))
+    # project pixels from image1 --> 3d points --> image2 pixels
+    shape1, corres1_to_2 = reproject_view(view1["pts3d"], view2)
+    shape2, corres2_to_1 = reproject_view(view2["pts3d"], view1)
+    # compute reciprocal correspondences:
+    # pos1 == valid pixels (correspondences) in image1
+    is_reciprocal1, pos1, pos2 = reciprocal_1d(
+        corres1_to_2, corres2_to_1, ret_recip=True
+    )
+    # same reciprocity test seen from image2; only used to pick negatives
+    is_reciprocal2 = corres1_to_2[corres2_to_1] == np.arange(len(corres2_to_1))
+    if target_n_corres is None:
+        # no target count: return every reciprocal pair (note: 2-tuple)
+        if ret_xy:
+            pos1 = unravel_xy(pos1, shape1)
+            pos2 = unravel_xy(pos2, shape2)
+        return pos1, pos2
+    # negatives are drawn without replacement in both images, so the budget
+    # is bounded by the image with fewer non-reciprocal pixels
+    available_negatives = min((~is_reciprocal1).sum(), (~is_reciprocal2).sum())
+    target_n_positives = int(target_n_corres * (1 - nneg))
+    n_positives = min(len(pos1), target_n_positives)
+    n_negatives = min(target_n_corres - n_positives, available_negatives)
+    if n_negatives + n_positives != target_n_corres:
+        # should be really rare => when there are not enough negatives
+        # in that case, break nneg and add a few more positives ?
+        n_positives = target_n_corres - n_negatives
+        assert n_positives <= len(pos1)
+    # sanity checks: the requested counts must be satisfiable
+    assert n_positives <= len(pos1)
+    assert n_positives <= len(pos2)
+    assert n_negatives <= (~is_reciprocal1).sum()
+    assert n_negatives <= (~is_reciprocal2).sum()
+    assert n_positives + n_negatives == target_n_corres
+    valid = np.ones(n_positives, dtype=bool)
+    if n_positives < len(pos1):
+        # random sub-sampling of valid correspondences
+        perm = rng.permutation(len(pos1))[:n_positives]
+        pos1 = pos1[perm]
+        pos2 = pos2[perm]
+    if n_negatives > 0:
+        # add false correspondences if not enough
+        def norm(p):
+            # turn a boolean mask into a probability vector for rng.choice
+            return p / p.sum()
+        pos1 = np.r_[
+            pos1,
+            rng.choice(
+                shape1[0] * shape1[1],
+                size=n_negatives,
+                replace=False,
+                p=norm(~is_reciprocal1),
+            ),
+        ]
+        pos2 = np.r_[
+            pos2,
+            rng.choice(
+                shape2[0] * shape2[1],
+                size=n_negatives,
+                replace=False,
+                p=norm(~is_reciprocal2),
+            ),
+        ]
+        # negatives are appended after the positives and flagged invalid
+        valid = np.r_[valid, np.zeros(n_negatives, dtype=bool)]
+    # convert (x+W*y) back to 2d (x,y) coordinates
+    if ret_xy:
+        pos1 = unravel_xy(pos1, shape1)
+        pos2 = unravel_xy(pos2, shape2)
+    return pos1, pos2, valid

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/utils/cropping.py ADDED Viewed

	@@ -0,0 +1,147 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# cropping utilities
+# --------------------------------------------------------
+import PIL.Image
+import os
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+import cv2  # noqa
+import numpy as np  # noqa
+from dust3r.utils.geometry import (
+    colmap_to_opencv_intrinsics,
+    opencv_to_colmap_intrinsics,
+)  # noqa
+try:
+    lanczos = PIL.Image.Resampling.LANCZOS
+    bicubic = PIL.Image.Resampling.BICUBIC
+except AttributeError:
+    lanczos = PIL.Image.LANCZOS
+    bicubic = PIL.Image.BICUBIC
+class ImageList:
+    """Convenience class to aply the same operation to a whole set of images."""
+    def __init__(self, images):
+        if not isinstance(images, (tuple, list, set)):
+            images = [images]
+        self.images = []
+        for image in images:
+            if not isinstance(image, PIL.Image.Image):
+                image = PIL.Image.fromarray(image)
+            self.images.append(image)
+    def __len__(self):
+        return len(self.images)
+    def to_pil(self):
+        return tuple(self.images) if len(self.images) > 1 else self.images[0]
+    @property
+    def size(self):
+        sizes = [im.size for im in self.images]
+        assert all(sizes[0] == s for s in sizes)
+        return sizes[0]
+    def resize(self, *args, **kwargs):
+        return ImageList(self._dispatch("resize", *args, **kwargs))
+    def crop(self, *args, **kwargs):
+        return ImageList(self._dispatch("crop", *args, **kwargs))
+    def _dispatch(self, func, *args, **kwargs):
+        return [getattr(im, func)(*args, **kwargs) for im in self.images]
+def rescale_image_depthmap(
+    image, depthmap, camera_intrinsics, output_resolution, force=True
+):
+    """Jointly rescale a (image, depthmap)
+    so that (out_width, out_height) >= output_res
+    """
+    image = ImageList(image)
+    input_resolution = np.array(image.size)  # (W,H)
+    output_resolution = np.array(output_resolution)
+    if depthmap is not None:
+        # can also use this with masks instead of depthmaps
+        assert tuple(depthmap.shape[:2]) == image.size[::-1]
+    # define output resolution
+    assert output_resolution.shape == (2,)
+    scale_final = max(output_resolution / image.size) + 1e-8
+    if scale_final >= 1 and not force:  # image is already smaller than what is asked
+        return (image.to_pil(), depthmap, camera_intrinsics)
+    output_resolution = np.floor(input_resolution * scale_final).astype(int)
+    # first rescale the image so that it contains the crop
+    image = image.resize(
+        output_resolution, resample=lanczos if scale_final < 1 else bicubic
+    )
+    if depthmap is not None:
+        depthmap = cv2.resize(
+            depthmap,
+            output_resolution,
+            fx=scale_final,
+            fy=scale_final,
+            interpolation=cv2.INTER_NEAREST,
+        )
+    # no offset here; simple rescaling
+    camera_intrinsics = camera_matrix_of_crop(
+        camera_intrinsics, input_resolution, output_resolution, scaling=scale_final
+    )
+    return image.to_pil(), depthmap, camera_intrinsics
+def camera_matrix_of_crop(
+    input_camera_matrix,
+    input_resolution,
+    output_resolution,
+    scaling=1,
+    offset_factor=0.5,
+    offset=None,
+):
+    # Margins to offset the origin
+    margins = np.asarray(input_resolution) * scaling - output_resolution
+    assert np.all(margins >= 0.0)
+    if offset is None:
+        offset = offset_factor * margins
+    # Generate new camera parameters
+    output_camera_matrix_colmap = opencv_to_colmap_intrinsics(input_camera_matrix)
+    output_camera_matrix_colmap[:2, :] *= scaling
+    output_camera_matrix_colmap[:2, 2] -= offset
+    output_camera_matrix = colmap_to_opencv_intrinsics(output_camera_matrix_colmap)
+    return output_camera_matrix
+def crop_image_depthmap(image, depthmap, camera_intrinsics, crop_bbox):
+    """
+    Return a crop of the input view.
+    """
+    image = ImageList(image)
+    l, t, r, b = crop_bbox
+    image = image.crop((l, t, r, b))
+    depthmap = depthmap[t:b, l:r]
+    camera_intrinsics = camera_intrinsics.copy()
+    camera_intrinsics[0, 2] -= l
+    camera_intrinsics[1, 2] -= t
+    return image.to_pil(), depthmap, camera_intrinsics
+def bbox_from_intrinsics_in_out(
+    input_camera_matrix, output_camera_matrix, output_resolution
+):
+    out_width, out_height = output_resolution
+    l, t = np.int32(np.round(input_camera_matrix[:2, 2] - output_camera_matrix[:2, 2]))
+    crop_bbox = (l, t, l + out_width, t + out_height)
+    return crop_bbox

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/utils/transforms.py ADDED Viewed

	@@ -0,0 +1,80 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# DUST3R default transforms
+# --------------------------------------------------------
+import torchvision.transforms as tvf
+from dust3r.utils.image import ImgNorm
+# define the standard image transforms
+ColorJitter = tvf.Compose([tvf.ColorJitter(0.5, 0.5, 0.5, 0.1), ImgNorm])
+def _check_input(value, center=1, bound=(0, float("inf")), clip_first_on_zero=True):
+    if isinstance(value, (int, float)):
+        if value < 0:
+            raise ValueError(f"If  is a single number, it must be non negative.")
+        value = [center - float(value), center + float(value)]
+        if clip_first_on_zero:
+            value[0] = max(value[0], 0.0)
+    elif isinstance(value, (tuple, list)) and len(value) == 2:
+        value = [float(value[0]), float(value[1])]
+    else:
+        raise TypeError(f"should be a single number or a list/tuple with length 2.")
+    if not bound[0] <= value[0] <= value[1] <= bound[1]:
+        raise ValueError(f"values should be between {bound}, but got {value}.")
+    # if value is 0 or (1., 1.) for brightness/contrast/saturation
+    # or (0., 0.) for hue, do nothing
+    if value[0] == value[1] == center:
+        return None
+    else:
+        return tuple(value)
+import torch
+import torchvision.transforms.functional as F
+def SeqColorJitter():
+    """
+    Return a color jitter transform with same random parameters
+    """
+    brightness = _check_input(0.5)
+    contrast = _check_input(0.5)
+    saturation = _check_input(0.5)
+    hue = _check_input(0.1, center=0, bound=(-0.5, 0.5), clip_first_on_zero=False)
+    fn_idx = torch.randperm(4)
+    brightness_factor = (
+        None
+        if brightness is None
+        else float(torch.empty(1).uniform_(brightness[0], brightness[1]))
+    )
+    contrast_factor = (
+        None
+        if contrast is None
+        else float(torch.empty(1).uniform_(contrast[0], contrast[1]))
+    )
+    saturation_factor = (
+        None
+        if saturation is None
+        else float(torch.empty(1).uniform_(saturation[0], saturation[1]))
+    )
+    hue_factor = None if hue is None else float(torch.empty(1).uniform_(hue[0], hue[1]))
+    def _color_jitter(img):
+        for fn_id in fn_idx:
+            if fn_id == 0 and brightness_factor is not None:
+                img = F.adjust_brightness(img, brightness_factor)
+            elif fn_id == 1 and contrast_factor is not None:
+                img = F.adjust_contrast(img, contrast_factor)
+            elif fn_id == 2 and saturation_factor is not None:
+                img = F.adjust_saturation(img, saturation_factor)
+            elif fn_id == 3 and hue_factor is not None:
+                img = F.adjust_hue(img, hue_factor)
+        return ImgNorm(img)
+    return _color_jitter

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/waymo.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import os.path as osp
+import os
+import numpy as np
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import h5py
+from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
+from dust3r.utils.image import imread_cv2
+class Waymo_Multi(BaseMultiViewDataset):
+    """Dataset of outdoor street scenes, 5 images each time"""
+    def __init__(self, *args, ROOT, **kwargs):
+        self.ROOT = ROOT
+        self.max_interval = 8
+        self.video = True
+        self.is_metric = True
+        super().__init__(*args, **kwargs)
+        assert self.split is None
+        self._load_data()
+    def load_invalid_dict(self, h5_file_path):
+        invalid_dict = {}
+        with h5py.File(h5_file_path, "r") as h5f:
+            for scene in h5f:
+                data = h5f[scene]["invalid_pairs"][:]
+                invalid_pairs = set(
+                    tuple(pair.decode("utf-8").split("_")) for pair in data
+                )
+                invalid_dict[scene] = invalid_pairs
+        return invalid_dict
+    def _load_data(self):
+        invalid_dict = self.load_invalid_dict(
+            os.path.join(self.ROOT, "invalid_files.h5")
+        )
+        scene_dirs = sorted(
+            [
+                d
+                for d in os.listdir(self.ROOT)
+                if os.path.isdir(os.path.join(self.ROOT, d))
+            ]
+        )
+        offset = 0
+        scenes = []
+        sceneids = []
+        images = []
+        start_img_ids = []
+        scene_img_list = []
+        is_video = []
+        j = 0
+        for scene in scene_dirs:
+            scene_dir = osp.join(self.ROOT, scene)
+            invalid_pairs = invalid_dict.get(scene, set())
+            seq2frames = {}
+            for f in os.listdir(scene_dir):
+                if not f.endswith(".jpg"):
+                    continue
+                basename = f[:-4]
+                frame_id = basename.split("_")[0]
+                seq_id = basename.split("_")[1]
+                if seq_id == "5":
+                    continue
+                if (seq_id, frame_id) in invalid_pairs:
+                    continue  # Skip invalid files
+                if seq_id not in seq2frames:
+                    seq2frames[seq_id] = []
+                seq2frames[seq_id].append(frame_id)
+            for seq_id, frame_ids in seq2frames.items():
+                frame_ids = sorted(frame_ids)
+                num_imgs = len(frame_ids)
+                img_ids = list(np.arange(num_imgs) + offset)
+                cut_off = (
+                    self.num_views
+                    if not self.allow_repeat
+                    else max(self.num_views // 3, 3)
+                )
+                start_img_ids_ = img_ids[: num_imgs - cut_off + 1]
+                if num_imgs < cut_off:
+                    print(f"Skipping {scene}_{seq_id}")
+                    continue
+                scenes.append((scene, seq_id))
+                sceneids.extend([j] * num_imgs)
+                images.extend(frame_ids)
+                start_img_ids.extend(start_img_ids_)
+                scene_img_list.append(img_ids)
+                offset += num_imgs
+                j += 1
+        self.scenes = scenes
+        self.sceneids = sceneids
+        self.images = images
+        self.start_img_ids = start_img_ids
+        self.scene_img_list = scene_img_list
+        self.is_video = is_video
+    def __len__(self):
+        return len(self.start_img_ids)
+    def get_image_num(self):
+        return len(self.images)
+    def get_stats(self):
+        return f"{len(self)} groups of views"
+    def _get_views(self, idx, resolution, rng, num_views):
+        start_id = self.start_img_ids[idx]
+        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
+        _, seq_id = self.scenes[self.sceneids[start_id]]
+        max_interval = self.max_interval // 2 if seq_id == "4" else self.max_interval
+        pos, ordered_video = self.get_seq_from_start_id(
+            num_views,
+            start_id,
+            all_image_ids,
+            rng,
+            max_interval=max_interval,
+            video_prob=0.9,
+            fix_interval_prob=0.9,
+            block_shuffle=16,
+        )
+        image_idxs = np.array(all_image_ids)[pos]
+        views = []
+        ordered_video = True
+        views = []
+        for v, view_idx in enumerate(image_idxs):
+            scene_id = self.sceneids[view_idx]
+            scene_dir, seq_id = self.scenes[scene_id]
+            scene_dir = osp.join(self.ROOT, scene_dir)
+            frame_id = self.images[view_idx]
+            impath = f"{frame_id}_{seq_id}"
+            image = imread_cv2(osp.join(scene_dir, impath + ".jpg"))
+            depthmap = imread_cv2(osp.join(scene_dir, impath + ".exr"))
+            camera_params = np.load(osp.join(scene_dir, impath + ".npz"))
+            intrinsics = np.float32(camera_params["intrinsics"])
+            camera_pose = np.float32(camera_params["cam2world"])
+            image, depthmap, intrinsics = self._crop_resize_if_necessary(
+                image, depthmap, intrinsics, resolution, rng, info=(scene_dir, impath)
+            )
+            # generate img mask and raymap mask
+            img_mask, ray_mask = self.get_img_and_ray_masks(
+                self.is_metric, v, rng, p=[0.85, 0.10, 0.05]
+            )
+            views.append(
+                dict(
+                    img=image,
+                    depthmap=depthmap,
+                    camera_pose=camera_pose,  # cam2world
+                    camera_intrinsics=intrinsics,
+                    dataset="Waymo",
+                    label=osp.relpath(scene_dir, self.ROOT),
+                    is_metric=self.is_metric,
+                    instance=osp.join(scene_dir, impath + ".jpg"),
+                    is_video=ordered_video,
+                    quantile=np.array(0.98, dtype=np.float32),
+                    img_mask=img_mask,
+                    ray_mask=ray_mask,
+                    camera_only=False,
+                    depth_only=False,
+                    single_view=False,
+                    reset=False,
+                )
+            )
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/wildrgbd.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import os.path as osp
+import sys
+sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
+import cv2
+import numpy as np
+from dust3r.datasets.co3d import Co3d_Multi
+from dust3r.utils.image import imread_cv2
+class WildRGBD_Multi(Co3d_Multi):
+    def __init__(self, mask_bg="rand", *args, ROOT, **kwargs):
+        super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs)
+        self.dataset_label = "WildRGBD"
+        self.is_metric = True
+        # load all scenes
+        self.scenes.pop(("box", "scenes/scene_257"), None)
+        self.scene_list = list(self.scenes.keys())
+        cut_off = (
+            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
+        )
+        self.cut_off = cut_off
+        self.all_ref_imgs = [
+            (key, value)
+            for key, values in self.scenes.items()
+            for value in values[: len(values) - cut_off + 1]
+        ]
+        self.invalidate = {scene: {} for scene in self.scene_list}
+        self.invalid_scenes = {scene: False for scene in self.scene_list}
+    def _get_metadatapath(self, obj, instance, view_idx):
+        return osp.join(self.ROOT, obj, instance, "metadata", f"{view_idx:0>5d}.npz")
+    def _get_impath(self, obj, instance, view_idx):
+        return osp.join(self.ROOT, obj, instance, "rgb", f"{view_idx:0>5d}.jpg")
+    def _get_depthpath(self, obj, instance, view_idx):
+        return osp.join(self.ROOT, obj, instance, "depth", f"{view_idx:0>5d}.png")
+    def _get_maskpath(self, obj, instance, view_idx):
+        return osp.join(self.ROOT, obj, instance, "masks", f"{view_idx:0>5d}.png")
+    def _read_depthmap(self, depthpath, input_metadata):
+        # We store depths in the depth scale of 1000.
+        # That is, when we load depth image and divide by 1000, we could get depth in meters.
+        depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
+        depthmap = depthmap.astype(np.float32) / 1000.0
+        return depthmap
+    def _get_views(self, idx, resolution, rng, num_views):
+        views = super()._get_views(idx, resolution, rng, num_views)
+        for view in views:
+            assert view["is_metric"]
+            view["quantile"] = np.array(0.96, dtype=np.float32)
+        return views

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/camera.py ADDED Viewed

	@@ -0,0 +1,463 @@

+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from croco.models.blocks import Mlp
+inf = float("inf")
+class PoseDecoder(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        mlp_ratio=4,
+        pose_encoding_type="absT_quaR",
+    ):
+        super().__init__()
+        self.pose_encoding_type = pose_encoding_type
+        if self.pose_encoding_type == "absT_quaR":
+            self.target_dim = 7
+        self.mlp = Mlp(
+            in_features=hidden_size,
+            hidden_features=int(hidden_size * mlp_ratio),
+            out_features=self.target_dim,
+            drop=0,
+        )
+    def forward(
+        self,
+        pose_feat,
+    ):
+        """
+        pose_feat: BxC
+        preliminary_cameras: cameras in opencv coordinate.
+        """
+        pred_cameras = self.mlp(pose_feat)  # Bx7, 3 for absT, 4 for quaR
+        return pred_cameras
+class PoseEncoder(nn.Module):
+    def __init__(
+        self,
+        hidden_size=768,
+        mlp_ratio=4,
+        pose_mode=("exp", -inf, inf),
+        pose_encoding_type="absT_quaR",
+    ):
+        super().__init__()
+        self.pose_encoding_type = pose_encoding_type
+        self.pose_mode = pose_mode
+        if self.pose_encoding_type == "absT_quaR":
+            self.target_dim = 7
+        self.embed_pose = PoseEmbedding(
+            target_dim=self.target_dim,
+            out_dim=hidden_size,
+            n_harmonic_functions=10,
+            append_input=True,
+        )
+        self.pose_encoder = Mlp(
+            in_features=self.embed_pose.out_dim,
+            hidden_features=int(hidden_size * mlp_ratio),
+            out_features=hidden_size,
+            drop=0,
+        )
+    def forward(self, camera):
+        from dust3r.heads.postprocess import postprocess_pose
+        pose_enc = camera_to_pose_encoding(
+            camera,
+            pose_encoding_type=self.pose_encoding_type,
+        ).to(camera.dtype)
+        pose_enc = postprocess_pose(pose_enc, self.pose_mode, inverse=True)
+        pose_feat = self.embed_pose(pose_enc)
+        pose_feat = self.pose_encoder(pose_feat)
+        return pose_feat
+class HarmonicEmbedding(torch.nn.Module):
+    def __init__(
+        self,
+        n_harmonic_functions: int = 6,
+        omega_0: float = 1.0,
+        logspace: bool = True,
+        append_input: bool = True,
+    ) -> None:
+        """
+        The harmonic embedding layer supports the classical
+        Nerf positional encoding described in
+        `NeRF <https://arxiv.org/abs/2003.08934>`_
+        and the integrated position encoding in
+        `MIP-NeRF <https://arxiv.org/abs/2103.13415>`_.
+        During the inference you can provide the extra argument `diag_cov`.
+        If `diag_cov is None`, it converts
+        rays parametrized with a `ray_bundle` to 3D points by
+        extending each ray according to the corresponding length.
+        Then it converts each feature
+        (i.e. vector along the last dimension) in `x`
+        into a series of harmonic features `embedding`,
+        where for each i in range(dim) the following are present
+        in embedding[...]::
+            [
+                sin(f_1*x[..., i]),
+                sin(f_2*x[..., i]),
+                ...
+                sin(f_N * x[..., i]),
+                cos(f_1*x[..., i]),
+                cos(f_2*x[..., i]),
+                ...
+                cos(f_N * x[..., i]),
+                x[..., i],              # only present if append_input is True.
+            ]
+        where N corresponds to `n_harmonic_functions-1`, and f_i is a scalar
+        denoting the i-th frequency of the harmonic embedding.
+        If `diag_cov is not None`, it approximates
+        conical frustums following a ray bundle as gaussians,
+        defined by x, the means of the gaussians and diag_cov,
+        the diagonal covariances.
+        Then it converts each gaussian
+        into a series of harmonic features `embedding`,
+        where for each i in range(dim) the following are present
+        in embedding[...]::
+            [
+                sin(f_1*x[..., i]) * exp(0.5 * f_1**2 * diag_cov[..., i,]),
+                sin(f_2*x[..., i]) * exp(0.5 * f_2**2 * diag_cov[..., i,]),
+                ...
+                sin(f_N * x[..., i]) * exp(0.5 * f_N**2 * diag_cov[..., i,]),
+                cos(f_1*x[..., i]) * exp(0.5 * f_1**2 * diag_cov[..., i,]),
+                cos(f_2*x[..., i]) * exp(0.5 * f_2**2 * diag_cov[..., i,]),,
+                ...
+                cos(f_N * x[..., i]) * exp(0.5 * f_N**2 * diag_cov[..., i,]),
+                x[..., i],              # only present if append_input is True.
+            ]
+        where N equals `n_harmonic_functions-1`, and f_i is a scalar
+        denoting the i-th frequency of the harmonic embedding.
+        If `logspace==True`, the frequencies `[f_1, ..., f_N]` are
+        powers of 2:
+            `f_1, ..., f_N = 2**torch.arange(n_harmonic_functions)`
+        If `logspace==False`, frequencies are linearly spaced between
+        `1.0` and `2**(n_harmonic_functions-1)`:
+            `f_1, ..., f_N = torch.linspace(
+                1.0, 2**(n_harmonic_functions-1), n_harmonic_functions
+            )`
+        Note that `x` is also premultiplied by the base frequency `omega_0`
+        before evaluating the harmonic functions.
+        Args:
+            n_harmonic_functions: int, number of harmonic
+                features
+            omega_0: float, base frequency
+            logspace: bool, Whether to space the frequencies in
+                logspace or linear space
+            append_input: bool, whether to concat the original
+                input to the harmonic embedding. If true the
+                output is of the form (embed.sin(), embed.cos(), x)
+        """
+        super().__init__()
+        if logspace:
+            frequencies = 2.0 ** torch.arange(n_harmonic_functions, dtype=torch.float32)
+        else:
+            frequencies = torch.linspace(
+                1.0,
+                2.0 ** (n_harmonic_functions - 1),
+                n_harmonic_functions,
+                dtype=torch.float32,
+            )
+        self.register_buffer("_frequencies", frequencies * omega_0, persistent=False)
+        self.register_buffer(
+            "_zero_half_pi",
+            torch.tensor([0.0, 0.5 * torch.pi]),
+            persistent=False,
+        )
+        self.append_input = append_input
+    def forward(
+        self, x: torch.Tensor, diag_cov: Optional[torch.Tensor] = None, **kwargs
+    ) -> torch.Tensor:
+        """
+        Args:
+            x: tensor of shape [..., dim]
+            diag_cov: An optional tensor of shape `(..., dim)`
+                representing the diagonal covariance matrices of our Gaussians, joined with x
+                as means of the Gaussians.
+        Returns:
+            embedding: a harmonic embedding of `x` of shape
+            [..., (n_harmonic_functions * 2 + int(append_input)) * num_points_per_ray]
+        """
+        embed = x[..., None] * self._frequencies
+        embed = embed[..., None, :, :] + self._zero_half_pi[..., None, None]
+        embed = embed.sin()
+        if diag_cov is not None:
+            x_var = diag_cov[..., None] * torch.pow(self._frequencies, 2)
+            exp_var = torch.exp(-0.5 * x_var)
+            embed = embed * exp_var[..., None, :, :]
+        embed = embed.reshape(*x.shape[:-1], -1)
+        if self.append_input:
+            return torch.cat([embed, x], dim=-1)
+        return embed
+    @staticmethod
+    def get_output_dim_static(
+        input_dims: int, n_harmonic_functions: int, append_input: bool
+    ) -> int:
+        """
+        Utility to help predict the shape of the output of `forward`.
+        Args:
+            input_dims: length of the last dimension of the input tensor
+            n_harmonic_functions: number of embedding frequencies
+            append_input: whether or not to concat the original
+                input to the harmonic embedding
+        Returns:
+            int: the length of the last dimension of the output tensor
+        """
+        return input_dims * (2 * n_harmonic_functions + int(append_input))
+    def get_output_dim(self, input_dims: int = 3) -> int:
+        """
+        Same as above. The default for input_dims is 3 for 3D applications
+        which use harmonic embedding for positional encoding,
+        so the input might be xyz.
+        """
+        return self.get_output_dim_static(
+            input_dims, len(self._frequencies), self.append_input
+        )
+class PoseEmbedding(nn.Module):
+    def __init__(self, target_dim, out_dim, n_harmonic_functions=10, append_input=True):
+        super().__init__()
+        self._emb_pose = HarmonicEmbedding(
+            n_harmonic_functions=n_harmonic_functions, append_input=append_input
+        )
+        self.out_dim = self._emb_pose.get_output_dim(target_dim)
+    def forward(self, pose_encoding):
+        e_pose_encoding = self._emb_pose(pose_encoding)
+        return e_pose_encoding
+def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
+    """
+    Returns torch.sqrt(torch.max(0, x))
+    but with a zero subgradient where x is 0.
+    """
+    ret = torch.zeros_like(x)
+    positive_mask = x > 0
+    ret[positive_mask] = torch.sqrt(x[positive_mask])
+    return ret
+def matrix_to_quaternion(matrix: torch.Tensor) -> torch.Tensor:
+    """
+    Convert rotations given as rotation matrices to quaternions.
+    Args:
+        matrix: Rotation matrices as tensor of shape (..., 3, 3).
+    Returns:
+        quaternions with real part first, as tensor of shape (..., 4).
+    """
+    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
+        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")
+    batch_dim = matrix.shape[:-2]
+    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(
+        matrix.reshape(batch_dim + (9,)), dim=-1
+    )
+    q_abs = _sqrt_positive_part(
+        torch.stack(
+            [
+                1.0 + m00 + m11 + m22,
+                1.0 + m00 - m11 - m22,
+                1.0 - m00 + m11 - m22,
+                1.0 - m00 - m11 + m22,
+            ],
+            dim=-1,
+        )
+    )
+    quat_by_rijk = torch.stack(
+        [
+            torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1),
+            torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1),
+            torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1),
+            torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1),
+        ],
+        dim=-2,
+    )
+    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
+    quat_candidates = quat_by_rijk / (2.0 * q_abs[..., None].max(flr))
+    out = quat_candidates[
+        F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5, :
+    ].reshape(batch_dim + (4,))
+    return standardize_quaternion(out)
+def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
+    """
+    Convert a unit quaternion to a standard form: one in which the real
+    part is non negative.
+    Args:
+        quaternions: Quaternions with real part first,
+            as tensor of shape (..., 4).
+    Returns:
+        Standardized quaternions as tensor of shape (..., 4).
+    """
+    quaternions = F.normalize(quaternions, p=2, dim=-1)
+    return torch.where(quaternions[..., 0:1] < 0, -quaternions, quaternions)
+def camera_to_pose_encoding(
+    camera,
+    pose_encoding_type="absT_quaR",
+):
+    """
+    Inverse to pose_encoding_to_camera
+    camera: opencv, cam2world
+    """
+    if pose_encoding_type == "absT_quaR":
+        quaternion_R = matrix_to_quaternion(camera[:, :3, :3])
+        pose_encoding = torch.cat([camera[:, :3, 3], quaternion_R], dim=-1)
+    else:
+        raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
+    return pose_encoding
+def quaternion_to_matrix(quaternions: torch.Tensor) -> torch.Tensor:
+    """
+    Convert rotations given as quaternions to rotation matrices.
+    Args:
+        quaternions: quaternions with real part first,
+            as tensor of shape (..., 4).
+    Returns:
+        Rotation matrices as tensor of shape (..., 3, 3).
+    """
+    r, i, j, k = torch.unbind(quaternions, -1)
+    two_s = 2.0 / (quaternions * quaternions).sum(-1)
+    o = torch.stack(
+        (
+            1 - two_s * (j * j + k * k),
+            two_s * (i * j - k * r),
+            two_s * (i * k + j * r),
+            two_s * (i * j + k * r),
+            1 - two_s * (i * i + k * k),
+            two_s * (j * k - i * r),
+            two_s * (i * k - j * r),
+            two_s * (j * k + i * r),
+            1 - two_s * (i * i + j * j),
+        ),
+        -1,
+    )
+    return o.reshape(quaternions.shape[:-1] + (3, 3))
+def pose_encoding_to_camera(
+    pose_encoding,
+    pose_encoding_type="absT_quaR",
+):
+    """
+    Args:
+        pose_encoding: A tensor of shape `BxC`, containing a batch of
+                        `B` `C`-dimensional pose encodings.
+        pose_encoding_type: The type of pose encoding,
+    """
+    if pose_encoding_type == "absT_quaR":
+        abs_T = pose_encoding[:, :3]
+        quaternion_R = pose_encoding[:, 3:7]
+        R = quaternion_to_matrix(quaternion_R)
+    else:
+        raise ValueError(f"Unknown pose encoding {pose_encoding_type}")
+    c2w_mats = torch.eye(4, 4).to(R.dtype).to(R.device)
+    c2w_mats = c2w_mats[None].repeat(len(R), 1, 1)
+    c2w_mats[:, :3, :3] = R
+    c2w_mats[:, :3, 3] = abs_T
+    return c2w_mats
+def quaternion_conjugate(q):
+    """Compute the conjugate of quaternion q (w, x, y, z)."""
+    q_conj = torch.cat([q[..., :1], -q[..., 1:]], dim=-1)
+    return q_conj
+def quaternion_multiply(q1, q2):
+    """Multiply two quaternions q1 and q2."""
+    w1, x1, y1, z1 = q1.unbind(dim=-1)
+    w2, x2, y2, z2 = q2.unbind(dim=-1)
+    w = w1 * w2 - x1 * x2 - y1 * y2 - z1 * z2
+    x = w1 * x2 + x1 * w2 + y1 * z2 - z1 * y2
+    y = w1 * y2 - x1 * z2 + y1 * w2 + z1 * x2
+    z = w1 * z2 + x1 * y2 - y1 * x2 + z1 * w2
+    return torch.stack((w, x, y, z), dim=-1)
+def rotate_vector(q, v):
+    """Rotate vector v by quaternion q."""
+    q_vec = q[..., 1:]
+    q_w = q[..., :1]
+    t = 2.0 * torch.cross(q_vec, v, dim=-1)
+    v_rot = v + q_w * t + torch.cross(q_vec, t, dim=-1)
+    return v_rot
+def relative_pose_absT_quatR(t1, q1, t2, q2):
+    """Compute the relative translation and quaternion between two poses."""
+    q1_inv = quaternion_conjugate(q1)
+    q_rel = quaternion_multiply(q1_inv, q2)
+    delta_t = t2 - t1
+    t_rel = rotate_vector(q1_inv, delta_t)
+    return t_rel, q_rel

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/device.py ADDED Viewed

	@@ -0,0 +1,88 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+import numpy as np
+import torch
+def todevice(batch, device, callback=None, non_blocking=False):
+    """Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy).
+    batch: list, tuple, dict of tensors or other things
+    device: pytorch device or 'numpy'
+    callback: function that would be called on every sub-elements.
+    """
+    if callback:
+        batch = callback(batch)
+    if isinstance(batch, dict):
+        return {k: todevice(v, device) for k, v in batch.items()}
+    if isinstance(batch, (tuple, list)):
+        return type(batch)(todevice(x, device) for x in batch)
+    x = batch
+    if device == "numpy":
+        if isinstance(x, torch.Tensor):
+            x = x.detach().cpu().numpy()
+    elif x is not None:
+        if isinstance(x, np.ndarray):
+            x = torch.from_numpy(x)
+        if torch.is_tensor(x):
+            x = x.to(device, non_blocking=non_blocking)
+    return x
+to_device = todevice  # alias
+def to_numpy(x):
+    return todevice(x, "numpy")
+def to_cpu(x):
+    return todevice(x, "cpu")
+def to_cuda(x):
+    return todevice(x, "cuda")
+def collate_with_cat(whatever, lists=False):
+    if isinstance(whatever, dict):
+        return {k: collate_with_cat(vals, lists=lists) for k, vals in whatever.items()}
+    elif isinstance(whatever, (tuple, list)):
+        if len(whatever) == 0:
+            return whatever
+        elem = whatever[0]
+        T = type(whatever)
+        if elem is None:
+            return None
+        if isinstance(elem, (bool, float, int, str)):
+            return whatever
+        if isinstance(elem, tuple):
+            return T(collate_with_cat(x, lists=lists) for x in zip(*whatever))
+        if isinstance(elem, dict):
+            return {
+                k: collate_with_cat([e[k] for e in whatever], lists=lists) for k in elem
+            }
+        if isinstance(elem, torch.Tensor):
+            return listify(whatever) if lists else torch.cat(whatever)
+        if isinstance(elem, np.ndarray):
+            return (
+                listify(whatever)
+                if lists
+                else torch.cat([torch.from_numpy(x) for x in whatever])
+            )
+        return sum(whatever, T())
+def listify(elems):
+    return [x for e in elems for x in e]

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/geometry.py ADDED Viewed

	@@ -0,0 +1,554 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+import torch
+import numpy as np
+from scipy.spatial import cKDTree as KDTree
+from dust3r.utils.misc import invalid_to_zeros, invalid_to_nans
+from dust3r.utils.device import to_numpy
+def xy_grid(
+    W,
+    H,
+    device=None,
+    origin=(0, 0),
+    unsqueeze=None,
+    cat_dim=-1,
+    homogeneous=False,
+    **arange_kw,
+):
+    """Output a (H,W,2) array of int32
+    with output[j,i,0] = i + origin[0]
+         output[j,i,1] = j + origin[1]
+    """
+    if device is None:
+        arange, meshgrid, stack, ones = np.arange, np.meshgrid, np.stack, np.ones
+    else:
+        arange = lambda *a, **kw: torch.arange(*a, device=device, **kw)
+        meshgrid, stack = torch.meshgrid, torch.stack
+        ones = lambda *a: torch.ones(*a, device=device)
+    tw, th = [arange(o, o + s, **arange_kw) for s, o in zip((W, H), origin)]
+    grid = meshgrid(tw, th, indexing="xy")
+    if homogeneous:
+        grid = grid + (ones((H, W)),)
+    if unsqueeze is not None:
+        grid = (grid[0].unsqueeze(unsqueeze), grid[1].unsqueeze(unsqueeze))
+    if cat_dim is not None:
+        grid = stack(grid, cat_dim)
+    return grid
+def geotrf(Trf, pts, ncol=None, norm=False):
+    """Apply a geometric transformation to a set of points.
+    Trf: transformation matrix (numpy array or torch tensor, typically 3x3 or
+        4x4), optionally with leading batch dimensions.
+    pts: numpy/torch/tuple of coordinates; the last dimension holds the point
+        components. It is converted to the same array type as Trf.
+    ncol: int. number of columns kept in the result; defaults to pts.shape[-1].
+    norm: float. if != 0, the result is divided by its last coordinate and,
+        when norm != 1, multiplied by norm.
+    Returns the transformed points, of shape pts.shape[:-1] + (ncol,).
+    When Trf has one more column than the points have components, the extra
+    column is applied as a translation.
+    """
+    assert Trf.ndim >= 2
+    if isinstance(Trf, np.ndarray):
+        pts = np.asarray(pts)
+    elif isinstance(Trf, torch.Tensor):
+        pts = torch.as_tensor(pts, dtype=Trf.dtype)
+    # Remember the leading shape: pts may be flattened below.
+    output_reshape = pts.shape[:-1]
+    ncol = ncol or pts.shape[-1]
+    # Dedicated path for a batch of matrices (B,d,d) or (B,d+1,d+1) applied
+    # to a batch of point maps (B,H,W,d).
+    if (
+        isinstance(Trf, torch.Tensor)
+        and isinstance(pts, torch.Tensor)
+        and Trf.ndim == 3
+        and pts.ndim == 4
+    ):
+        d = pts.shape[3]
+        if Trf.shape[-1] == d:
+            pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts)
+        elif Trf.shape[-1] == d + 1:
+            pts = (
+                torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts)
+                + Trf[:, None, None, :d, d]
+            )
+        else:
+            raise ValueError(f"bad shape, not ending with 3 or 4, for {pts.shape=}")
+    else:
+        if Trf.ndim >= 3:
+            # Flatten the batch dimensions of Trf and reshape pts to match.
+            n = Trf.ndim - 2
+            assert Trf.shape[:n] == pts.shape[:n], "batch size does not match"
+            Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])
+            if pts.ndim > Trf.ndim:
+                pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
+            elif pts.ndim == 2:
+                pts = pts[:, None, :]
+        if pts.shape[-1] + 1 == Trf.shape[-1]:
+            # Points have one component fewer than Trf: linear part plus
+            # translation row of the transposed matrix.
+            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
+            pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
+        elif pts.shape[-1] == Trf.shape[-1]:
+            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
+            pts = pts @ Trf
+        else:
+            pts = Trf @ pts.T
+            if pts.ndim >= 2:
+                pts = pts.swapaxes(-1, -2)
+    if norm:
+        pts = pts / pts[..., -1:]  # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
+        if norm != 1:
+            pts *= norm
+    res = pts[..., :ncol].reshape(*output_reshape, ncol)
+    return res
+def inv(mat):
+    """Invert a torch or numpy matrix"""
+    if isinstance(mat, torch.Tensor):
+        return torch.linalg.inv(mat)
+    if isinstance(mat, np.ndarray):
+        return np.linalg.inv(mat)
+    raise ValueError(f"bad matrix type = {type(mat)}")
+def depthmap_to_pts3d(depth, pseudo_focal, pp=None, **_):
+    """
+    Args:
+        - depthmap (BxHxW array):
+        - pseudo_focal: [B,H,W] ; [B,2,H,W] or [B,1,H,W]
+    Returns:
+        pointmap of absolute coordinates (BxHxWx3 array)
+    """
+    if len(depth.shape) == 4:
+        B, H, W, n = depth.shape
+    else:
+        B, H, W = depth.shape
+        n = None
+    if len(pseudo_focal.shape) == 3:  # [B,H,W]
+        pseudo_focalx = pseudo_focaly = pseudo_focal
+    elif len(pseudo_focal.shape) == 4:  # [B,2,H,W] or [B,1,H,W]
+        pseudo_focalx = pseudo_focal[:, 0]
+        if pseudo_focal.shape[1] == 2:
+            pseudo_focaly = pseudo_focal[:, 1]
+        else:
+            pseudo_focaly = pseudo_focalx
+    else:
+        raise NotImplementedError("Error, unknown input focal shape format.")
+    assert pseudo_focalx.shape == depth.shape[:3]
+    assert pseudo_focaly.shape == depth.shape[:3]
+    grid_x, grid_y = xy_grid(W, H, cat_dim=0, device=depth.device)[:, None]
+    if pp is None:
+        grid_x = grid_x - (W - 1) / 2
+        grid_y = grid_y - (H - 1) / 2
+    else:
+        grid_x = grid_x.expand(B, -1, -1) - pp[:, 0, None, None]
+        grid_y = grid_y.expand(B, -1, -1) - pp[:, 1, None, None]
+    if n is None:
+        pts3d = torch.empty((B, H, W, 3), device=depth.device)
+        pts3d[..., 0] = depth * grid_x / pseudo_focalx
+        pts3d[..., 1] = depth * grid_y / pseudo_focaly
+        pts3d[..., 2] = depth
+    else:
+        pts3d = torch.empty((B, H, W, 3, n), device=depth.device)
+        pts3d[..., 0, :] = depth * (grid_x / pseudo_focalx)[..., None]
+        pts3d[..., 1, :] = depth * (grid_y / pseudo_focaly)[..., None]
+        pts3d[..., 2, :] = depth
+    return pts3d
+def depthmap_to_camera_coordinates(depthmap, camera_intrinsics, pseudo_focal=None):
+    """
+    Args:
+        - depthmap (HxW array):
+        - camera_intrinsics: a 3x3 matrix
+    Returns:
+        pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels.
+    """
+    camera_intrinsics = np.float32(camera_intrinsics)
+    H, W = depthmap.shape
+    assert camera_intrinsics[0, 1] == 0.0
+    assert camera_intrinsics[1, 0] == 0.0
+    if pseudo_focal is None:
+        fu = camera_intrinsics[0, 0]
+        fv = camera_intrinsics[1, 1]
+    else:
+        assert pseudo_focal.shape == (H, W)
+        fu = fv = pseudo_focal
+    cu = camera_intrinsics[0, 2]
+    cv = camera_intrinsics[1, 2]
+    u, v = np.meshgrid(np.arange(W), np.arange(H))
+    z_cam = depthmap
+    x_cam = (u - cu) * z_cam / fu
+    y_cam = (v - cv) * z_cam / fv
+    X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32)
+    valid_mask = depthmap > 0.0
+    return X_cam, valid_mask
+def depthmap_to_absolute_camera_coordinates(
+    depthmap, camera_intrinsics, camera_pose, **kw
+):
+    """
+    Args:
+        - depthmap (HxW array):
+        - camera_intrinsics: a 3x3 matrix
+        - camera_pose: a 4x3 or 4x4 cam2world matrix
+    Returns:
+        pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels.
+    """
+    X_cam, valid_mask = depthmap_to_camera_coordinates(depthmap, camera_intrinsics)
+    X_world = X_cam  # default
+    if camera_pose is not None:
+        R_cam2world = camera_pose[:3, :3]
+        t_cam2world = camera_pose[:3, 3]
+        X_world = (
+            np.einsum("ik, vuk -> vui", R_cam2world, X_cam) + t_cam2world[None, None, :]
+        )
+    return X_world, X_cam, valid_mask
+def colmap_to_opencv_intrinsics(K):
+    """
+    Modify camera intrinsics to follow a different convention.
+    Coordinates of the center of the top-left pixels are by default:
+    - (0.5, 0.5) in Colmap
+    - (0,0) in OpenCV
+    """
+    K = K.copy()
+    K[0, 2] -= 0.5
+    K[1, 2] -= 0.5
+    return K
+def opencv_to_colmap_intrinsics(K):
+    """
+    Modify camera intrinsics to follow a different convention.
+    Coordinates of the center of the top-left pixels are by default:
+    - (0.5, 0.5) in Colmap
+    - (0,0) in OpenCV
+    """
+    K = K.copy()
+    K[0, 2] += 0.5
+    K[1, 2] += 0.5
+    return K
+def normalize_pointcloud(
+    pts1, pts2, norm_mode="avg_dis", valid1=None, valid2=None, ret_factor=False
+):
+    """renorm pointmaps pts1, pts2 with norm_mode"""
+    assert pts1.ndim >= 3 and pts1.shape[-1] == 3
+    assert pts2 is None or (pts2.ndim >= 3 and pts2.shape[-1] == 3)
+    norm_mode, dis_mode = norm_mode.split("_")
+    if norm_mode == "avg":
+        nan_pts1, nnz1 = invalid_to_zeros(pts1, valid1, ndim=3)
+        nan_pts2, nnz2 = (
+            invalid_to_zeros(pts2, valid2, ndim=3) if pts2 is not None else (None, 0)
+        )
+        all_pts = (
+            torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1
+        )
+        all_dis = all_pts.norm(dim=-1)
+        if dis_mode == "dis":
+            pass  # do nothing
+        elif dis_mode == "log1p":
+            all_dis = torch.log1p(all_dis)
+        elif dis_mode == "warp-log1p":
+            log_dis = torch.log1p(all_dis)
+            warp_factor = log_dis / all_dis.clip(min=1e-8)
+            H1, W1 = pts1.shape[1:-1]
+            pts1 = pts1 * warp_factor[:, : W1 * H1].view(-1, H1, W1, 1)
+            if pts2 is not None:
+                H2, W2 = pts2.shape[1:-1]
+                pts2 = pts2 * warp_factor[:, W1 * H1 :].view(-1, H2, W2, 1)
+            all_dis = log_dis  # this is their true distance afterwards
+        else:
+            raise ValueError(f"bad {dis_mode=}")
+        norm_factor = all_dis.sum(dim=1) / (nnz1 + nnz2 + 1e-8)
+    else:
+        nan_pts1 = invalid_to_nans(pts1, valid1, ndim=3)
+        nan_pts2 = invalid_to_nans(pts2, valid2, ndim=3) if pts2 is not None else None
+        all_pts = (
+            torch.cat((nan_pts1, nan_pts2), dim=1) if pts2 is not None else nan_pts1
+        )
+        all_dis = all_pts.norm(dim=-1)
+        if norm_mode == "avg":
+            norm_factor = all_dis.nanmean(dim=1)
+        elif norm_mode == "median":
+            norm_factor = all_dis.nanmedian(dim=1).values.detach()
+        elif norm_mode == "sqrt":
+            norm_factor = all_dis.sqrt().nanmean(dim=1) ** 2
+        else:
+            raise ValueError(f"bad {norm_mode=}")
+    norm_factor = norm_factor.clip(min=1e-8)
+    while norm_factor.ndim < pts1.ndim:
+        norm_factor.unsqueeze_(-1)
+    res = pts1 / norm_factor
+    if pts2 is not None:
+        res = (res, pts2 / norm_factor)
+    if ret_factor:
+        res = res + (norm_factor,)
+    return res
+def normalize_pointcloud_group(
+    pts_list,
+    norm_mode="avg_dis",
+    valid_list=None,
+    conf_list=None,
+    ret_factor=False,
+    ret_factor_only=False,
+):
+    """renorm pointmaps pts1, pts2 with norm_mode"""
+    for pts in pts_list:
+        assert pts.ndim >= 3 and pts.shape[-1] == 3
+    norm_mode, dis_mode = norm_mode.split("_")
+    if norm_mode == "avg":
+        nan_pts_list, nnz_list = zip(
+            *[
+                invalid_to_zeros(pts1, valid1, ndim=3)
+                for pts1, valid1 in zip(pts_list, valid_list)
+            ]
+        )
+        all_pts = torch.cat(nan_pts_list, dim=1)
+        if conf_list is not None:
+            nan_conf_list = [
+                invalid_to_zeros(conf1[..., None], valid1, ndim=3)[0]
+                for conf1, valid1 in zip(conf_list, valid_list)
+            ]
+            all_conf = torch.cat(nan_conf_list, dim=1)[..., 0]
+        else:
+            all_conf = torch.ones_like(all_pts[..., 0])
+        all_dis = all_pts.norm(dim=-1)
+        if dis_mode == "dis":
+            pass  # do nothing
+        elif dis_mode == "log1p":
+            all_dis = torch.log1p(all_dis)
+        elif dis_mode == "warp-log1p":
+            log_dis = torch.log1p(all_dis)
+            warp_factor = log_dis / all_dis.clip(min=1e-8)
+            H_W_list = [pts.shape[1:-1] for pts in pts_list]
+            pts_list = [
+                pts
+                * warp_factor[:, sum(H_W_list[:i]) : sum(H_W_list[: i + 1])].view(
+                    -1, H, W, 1
+                )
+                for i, (pts, (H, W)) in enumerate(zip(pts_list, H_W_list))
+            ]
+            all_dis = log_dis  # this is their true distance afterwards
+        else:
+            raise ValueError(f"bad {dis_mode=}")
+        norm_factor = (all_conf * all_dis).sum(dim=1) / (all_conf.sum(dim=1) + 1e-8)
+    else:
+        nan_pts_list = [
+            invalid_to_nans(pts1, valid1, ndim=3)
+            for pts1, valid1 in zip(pts_list, valid_list)
+        ]
+        all_pts = torch.cat(nan_pts_list, dim=1)
+        all_dis = all_pts.norm(dim=-1)
+        if norm_mode == "avg":
+            norm_factor = all_dis.nanmean(dim=1)
+        elif norm_mode == "median":
+            norm_factor = all_dis.nanmedian(dim=1).values.detach()
+        elif norm_mode == "sqrt":
+            norm_factor = all_dis.sqrt().nanmean(dim=1) ** 2
+        else:
+            raise ValueError(f"bad {norm_mode=}")
+    norm_factor = norm_factor.clip(min=1e-8)
+    while norm_factor.ndim < pts_list[0].ndim:
+        norm_factor.unsqueeze_(-1)
+    if ret_factor_only:
+        return norm_factor
+    res = [pts / norm_factor for pts in pts_list]
+    if ret_factor:
+        return res, norm_factor
+    return res
+@torch.no_grad()
+def get_joint_pointcloud_depth(z1, z2, valid_mask1, valid_mask2=None, quantile=0.5):
+    _z1 = invalid_to_nans(z1, valid_mask1).reshape(len(z1), -1)
+    _z2 = (
+        invalid_to_nans(z2, valid_mask2).reshape(len(z2), -1)
+        if z2 is not None
+        else None
+    )
+    _z = torch.cat((_z1, _z2), dim=-1) if z2 is not None else _z1
+    if quantile == 0.5:
+        shift_z = torch.nanmedian(_z, dim=-1).values
+    else:
+        shift_z = torch.nanquantile(_z, quantile, dim=-1)
+    return shift_z  # (B,)
+@torch.no_grad()
+def get_group_pointcloud_depth(zs, valid_masks, quantile=0.5):
+    _zs = [
+        invalid_to_nans(z1, valid_mask1).reshape(len(z1), -1)
+        for z1, valid_mask1 in zip(zs, valid_masks)
+    ]
+    _z = torch.cat(_zs, dim=-1)
+    if quantile == 0.5:
+        shift_z = torch.nanmedian(_z, dim=-1).values
+    else:
+        shift_z = torch.nanquantile(_z, quantile, dim=-1)
+    return shift_z  # (B,)
+@torch.no_grad()
+def get_joint_pointcloud_center_scale(
+    pts1, pts2, valid_mask1=None, valid_mask2=None, z_only=False, center=True
+):
+    _pts1 = invalid_to_nans(pts1, valid_mask1).reshape(len(pts1), -1, 3)
+    _pts2 = (
+        invalid_to_nans(pts2, valid_mask2).reshape(len(pts2), -1, 3)
+        if pts2 is not None
+        else None
+    )
+    _pts = torch.cat((_pts1, _pts2), dim=1) if pts2 is not None else _pts1
+    _center = torch.nanmedian(_pts, dim=1, keepdim=True).values  # (B,1,3)
+    if z_only:
+        _center[..., :2] = 0  # do not center X and Y
+    _norm = ((_pts - _center) if center else _pts).norm(dim=-1)
+    scale = torch.nanmedian(_norm, dim=1).values
+    return _center[:, None, :, :], scale[:, None, None, None]
+@torch.no_grad()
+def get_group_pointcloud_center_scale(pts, valid_masks=None, z_only=False, center=True):
+    _pts = [
+        invalid_to_nans(pts1, valid_mask1).reshape(len(pts1), -1, 3)
+        for pts1, valid_mask1 in zip(pts, valid_masks)
+    ]
+    _pts = torch.cat(_pts, dim=1)
+    _center = torch.nanmedian(_pts, dim=1, keepdim=True).values  # (B,1,3)
+    if z_only:
+        _center[..., :2] = 0  # do not center X and Y
+    _norm = ((_pts - _center) if center else _pts).norm(dim=-1)
+    scale = torch.nanmedian(_norm, dim=1).values
+    return _center[:, None, :, :], scale[:, None, None, None]
+def find_reciprocal_matches(P1, P2):
+    """
+    returns 3 values:
+    1 - reciprocal_in_P2: a boolean array of size P2.shape[0], a "True" value indicates a match
+    2 - nn2_in_P1: a int array of size P2.shape[0], it contains the indexes of the closest points in P1
+    3 - reciprocal_in_P2.sum(): the number of matches
+    """
+    tree1 = KDTree(P1)
+    tree2 = KDTree(P2)
+    _, nn1_in_P2 = tree2.query(P1, workers=8)
+    _, nn2_in_P1 = tree1.query(P2, workers=8)
+    reciprocal_in_P1 = nn2_in_P1[nn1_in_P2] == np.arange(len(nn1_in_P2))
+    reciprocal_in_P2 = nn1_in_P2[nn2_in_P1] == np.arange(len(nn2_in_P1))
+    assert reciprocal_in_P1.sum() == reciprocal_in_P2.sum()
+    return reciprocal_in_P2, nn2_in_P1, reciprocal_in_P2.sum()
+def get_med_dist_between_poses(poses):
+    from scipy.spatial.distance import pdist
+    return np.median(pdist([to_numpy(p[:3, 3]) for p in poses]))
+def weighted_procrustes(A, B, w, use_weights=True, eps=1e-16, return_T=False):
+    """
+    X: torch tensor B x N x 3
+    Y: torch tensor B x N x 3
+    w: torch tensor B x N
+    """
+    assert len(A) == len(B)
+    if use_weights:
+        W1 = torch.abs(w).sum(1, keepdim=True)
+        w_norm = (w / (W1 + eps)).unsqueeze(-1)
+        a_mean = (w_norm * A).sum(dim=1, keepdim=True)
+        b_mean = (w_norm * B).sum(dim=1, keepdim=True)
+        A_c = A - a_mean
+        B_c = B - b_mean
+        H = torch.einsum("bni,bnj->bij", A_c, w_norm * B_c)
+    else:
+        a_mean = A.mean(axis=1, keepdim=True)
+        b_mean = B.mean(axis=1, keepdim=True)
+        A_c = A - a_mean
+        B_c = B - b_mean
+        H = torch.einsum("bij,bik->bjk", A_c, B_c)
+    U, S, V = torch.svd(H)  # U: B x 3 x 3, S: B x 3, V: B x 3 x 3
+    Z = torch.eye(3).unsqueeze(0).repeat(A.shape[0], 1, 1).to(A.device)
+    Z[:, -1, -1] = torch.sign(torch.linalg.det(U @ V.transpose(1, 2)))  # B x 3 x 3
+    R = V @ Z @ U.transpose(1, 2)  # B x 3 x 3
+    t = b_mean - torch.einsum("bij,bjk->bik", R, a_mean.transpose(-2, -1)).transpose(
+        -2, -1
+    )
+    if return_T:
+        T = torch.eye(4).unsqueeze(0).repeat(A.shape[0], 1, 1).to(A.device)
+        T[:, :3, :3] = R
+        T[:, :3, 3] = t.squeeze()
+        return T
+    return R, t.squeeze()

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/image.py ADDED Viewed

	@@ -0,0 +1,271 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+import os
+import torch
+import numpy as np
+import PIL.Image
+from PIL.ImageOps import exif_transpose
+import torchvision.transforms as tvf
+os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
+import cv2  # noqa
+try:
+    from pillow_heif import register_heif_opener  # noqa
+    register_heif_opener()
+    heif_support_enabled = True
+except ImportError:
+    heif_support_enabled = False
+ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
+def img_to_arr(img):
+    if isinstance(img, str):
+        img = imread_cv2(img)
+    return img
+def imread_cv2(path, options=cv2.IMREAD_COLOR):
+    """Open an image or a depthmap with opencv-python."""
+    if path.endswith((".exr", "EXR")):
+        options = cv2.IMREAD_ANYDEPTH
+    img = cv2.imread(path, options)
+    if img is None:
+        raise IOError(f"Could not load image={path} with {options=}")
+    if img.ndim == 3:
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+    return img
+def imread_pil(path):
+    """Open an RGB image using PIL and return as numpy array."""
+    img = PIL.Image.open(path)
+    img = exif_transpose(img)
+    img = img.convert("RGB")
+    return np.array(img)
+def rgb(ftensor, true_shape=None):
+    if isinstance(ftensor, list):
+        return [rgb(x, true_shape=true_shape) for x in ftensor]
+    if isinstance(ftensor, torch.Tensor):
+        ftensor = ftensor.detach().cpu().numpy()  # H,W,3
+    if ftensor.ndim == 3 and ftensor.shape[0] == 3:
+        ftensor = ftensor.transpose(1, 2, 0)
+    elif ftensor.ndim == 4 and ftensor.shape[1] == 3:
+        ftensor = ftensor.transpose(0, 2, 3, 1)
+    if true_shape is not None:
+        H, W = true_shape
+        ftensor = ftensor[:H, :W]
+    if ftensor.dtype == np.uint8:
+        img = np.float32(ftensor) / 255
+    else:
+        img = (ftensor * 0.5) + 0.5
+    return img.clip(min=0, max=1)
+def _resize_pil_image(img, long_edge_size):
+    S = max(img.size)
+    if S > long_edge_size:
+        interp = PIL.Image.LANCZOS
+    elif S <= long_edge_size:
+        interp = PIL.Image.BICUBIC
+    new_size = tuple(int(round(x * long_edge_size / S)) for x in img.size)
+    return img.resize(new_size, interp)
+def load_images(folder_or_list, size, square_ok=False, verbose=True):
+    """open and convert all images in a list or folder to proper input format for DUSt3R
+    Args:
+        folder_or_list: a directory path (its sorted listing is used) or a list of image paths.
+        size: target size; 224 triggers the resize-then-square-crop path, any
+            other value resizes the long edge to `size` and crops to multiples of 16.
+        square_ok: when False, square images are cropped to a 4:3 aspect ratio.
+        verbose: print progress information.
+    Returns:
+        a list of dicts with keys "img" (ImgNorm output with a leading batch
+        axis), "true_shape" (int32 array [[H, W]]), "idx" and "instance".
+    Raises:
+        ValueError: if folder_or_list is neither a str nor a list.
+        AssertionError: if no file with a supported extension was found.
+    """
+    if isinstance(folder_or_list, str):
+        if verbose:
+            print(f">> Loading images from {folder_or_list}")
+        root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))
+    elif isinstance(folder_or_list, list):
+        if verbose:
+            print(f">> Loading a list of {len(folder_or_list)} images")
+        root, folder_content = "", folder_or_list
+    else:
+        raise ValueError(f"bad {folder_or_list=} ({type(folder_or_list)})")
+    supported_images_extensions = [".jpg", ".jpeg", ".png", ".bmp"]
+    if heif_support_enabled:
+        supported_images_extensions += [".heic", ".heif"]
+    supported_images_extensions = tuple(supported_images_extensions)
+    imgs = []
+    for path in folder_content:
+        # silently skip anything that is not a supported image file
+        if not path.lower().endswith(supported_images_extensions):
+            continue
+        img = exif_transpose(PIL.Image.open(os.path.join(root, path))).convert("RGB")
+        W1, H1 = img.size
+        if size == 224:
+            # long edge = size * aspect ratio, i.e. the short edge becomes 224
+            img = _resize_pil_image(img, round(size * max(W1 / H1, H1 / W1)))
+        else:
+            # long edge becomes `size`
+            img = _resize_pil_image(img, size)
+        W, H = img.size
+        cx, cy = W // 2, H // 2
+        if size == 224:
+            # centered square crop
+            half = min(cx, cy)
+            img = img.crop((cx - half, cy - half, cx + half, cy + half))
+        else:
+            # half extents are multiples of 8, so the cropped sides are multiples of 16
+            halfw, halfh = ((2 * cx) // 16) * 8, ((2 * cy) // 16) * 8
+            if not (square_ok) and W == H:
+                # NOTE: true division, so halfh becomes a float here and the crop
+                # box below may hold non-integer coordinates
+                halfh = 3 * halfw / 4
+            img = img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh))
+        W2, H2 = img.size
+        if verbose:
+            print(f" - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}")
+        imgs.append(
+            dict(
+                img=ImgNorm(img)[None],
+                true_shape=np.int32([img.size[::-1]]),
+                idx=len(imgs),
+                instance=str(len(imgs)),
+            )
+        )
+    assert imgs, "no images foud at " + root
+    if verbose:
+        print(f" (Found {len(imgs)} images)")
+    return imgs
+def load_images_for_eval(
+    folder_or_list, size, square_ok=False, verbose=True, crop=True
+):
+    """open and convert all images in a list or folder to proper input format for DUSt3R"""
+    if isinstance(folder_or_list, str):
+        if verbose:
+            print(f">> Loading images from {folder_or_list}")
+        root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))
+    elif isinstance(folder_or_list, list):
+        if verbose:
+            print(f">> Loading a list of {len(folder_or_list)} images")
+        root, folder_content = "", folder_or_list
+    else:
+        raise ValueError(f"bad {folder_or_list=} ({type(folder_or_list)})")
+    supported_images_extensions = [".jpg", ".jpeg", ".png"]
+    if heif_support_enabled:
+        supported_images_extensions += [".heic", ".heif"]
+    supported_images_extensions = tuple(supported_images_extensions)
+    imgs = []
+    for path in folder_content:
+        if not path.lower().endswith(supported_images_extensions):
+            continue
+        img = exif_transpose(PIL.Image.open(os.path.join(root, path))).convert("RGB")
+        W1, H1 = img.size
+        if size == 224:
+            # resize short side to 224 (then crop)
+            img = _resize_pil_image(img, round(size * max(W1 / H1, H1 / W1)))
+        else:
+            # resize long side to 512
+            img = _resize_pil_image(img, size)
+        W, H = img.size
+        cx, cy = W // 2, H // 2
+        if size == 224:
+            half = min(cx, cy)
+            if crop:
+                img = img.crop((cx - half, cy - half, cx + half, cy + half))
+            else:  # resize
+                img = img.resize((2 * half, 2 * half), PIL.Image.LANCZOS)
+        else:
+            halfw, halfh = ((2 * cx) // 14) * 7, ((2 * cy) // 14) * 7
+            if not (square_ok) and W == H:
+                halfh = 3 * halfw / 4
+            if crop:
+                img = img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh))
+            else:  # resize
+                img = img.resize((2 * halfw, 2 * halfh), PIL.Image.LANCZOS)
+        W2, H2 = img.size
+        if verbose:
+            print(f" - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}")
+        imgs.append(
+            dict(
+                img=ImgNorm(img)[None],
+                true_shape=np.int32([img.size[::-1]]),
+                idx=len(imgs),
+                instance=str(len(imgs)),
+            )
+        )
+    assert imgs, "no images foud at " + root
+    if verbose:
+        print(f" (Found {len(imgs)} images)")
+    return imgs
+def load_images_512(folder_or_list, size, square_ok=False, verbose=True):
+    """open and convert all images in a list or folder to proper input format for DUSt3R
+    Same pipeline as the generic loader, except that every image is first
+    force-resized to 512x384 (ignoring its aspect ratio) before the size-based
+    resize and crop.
+    Args:
+        folder_or_list: a directory path (its sorted listing is used) or a list of image paths.
+        size: target size; 224 triggers the resize-then-square-crop path, any
+            other value resizes the long edge to `size` and crops to multiples of 16.
+        square_ok: when False, square images are cropped to a 4:3 aspect ratio.
+        verbose: print progress information.
+    Returns:
+        a list of dicts with keys "img" (ImgNorm output with a leading batch
+        axis), "true_shape" (int32 array [[H, W]]), "idx" and "instance".
+    Raises:
+        ValueError: if folder_or_list is neither a str nor a list.
+        AssertionError: if no file with a supported extension was found.
+    """
+    if isinstance(folder_or_list, str):
+        if verbose:
+            print(f">> Loading images from {folder_or_list}")
+        root, folder_content = folder_or_list, sorted(os.listdir(folder_or_list))
+    elif isinstance(folder_or_list, list):
+        if verbose:
+            print(f">> Loading a list of {len(folder_or_list)} images")
+        root, folder_content = "", folder_or_list
+    else:
+        raise ValueError(f"bad {folder_or_list=} ({type(folder_or_list)})")
+    supported_images_extensions = [".jpg", ".jpeg", ".png", ".bmp"]
+    if heif_support_enabled:
+        supported_images_extensions += [".heic", ".heif"]
+    supported_images_extensions = tuple(supported_images_extensions)
+    imgs = []
+    for path in folder_content:
+        # silently skip anything that is not a supported image file
+        if not path.lower().endswith(supported_images_extensions):
+            continue
+        img = exif_transpose(PIL.Image.open(os.path.join(root, path))).convert("RGB")
+        # unconditional resize to 512x384; W1/H1 below (and the verbose message)
+        # therefore report 512x384, not the size of the file on disk
+        img = img.resize((512, 384))
+        W1, H1 = img.size
+        if size == 224:
+            # long edge = size * aspect ratio, i.e. the short edge becomes 224
+            img = _resize_pil_image(img, round(size * max(W1 / H1, H1 / W1)))
+        else:
+            # long edge becomes `size`
+            img = _resize_pil_image(img, size)
+        W, H = img.size
+        cx, cy = W // 2, H // 2
+        if size == 224:
+            # centered square crop
+            half = min(cx, cy)
+            img = img.crop((cx - half, cy - half, cx + half, cy + half))
+        else:
+            # half extents are multiples of 8, so the cropped sides are multiples of 16
+            halfw, halfh = ((2 * cx) // 16) * 8, ((2 * cy) // 16) * 8
+            if not (square_ok) and W == H:
+                # NOTE: true division, so halfh becomes a float here and the crop
+                # box below may hold non-integer coordinates
+                halfh = 3 * halfw / 4
+            img = img.crop((cx - halfw, cy - halfh, cx + halfw, cy + halfh))
+        W2, H2 = img.size
+        if verbose:
+            print(f" - adding {path} with resolution {W1}x{H1} --> {W2}x{H2}")
+        imgs.append(
+            dict(
+                img=ImgNorm(img)[None],
+                true_shape=np.int32([img.size[::-1]]),
+                idx=len(imgs),
+                instance=str(len(imgs)),
+            )
+        )
+    assert imgs, "no images foud at " + root
+    if verbose:
+        print(f" (Found {len(imgs)} images)")
+    return imgs

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/misc.py ADDED Viewed

	@@ -0,0 +1,127 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+import torch
+def fill_default_args(kwargs, func):
+    import inspect  # a bit hacky but it works reliably
+    signature = inspect.signature(func)
+    for k, v in signature.parameters.items():
+        if v.default is inspect.Parameter.empty:
+            continue
+        kwargs.setdefault(k, v.default)
+    return kwargs
+def freeze_all_params(modules):
+    for module in modules:
+        try:
+            for n, param in module.named_parameters():
+                param.requires_grad = False
+        except AttributeError:
+            module.requires_grad = False
+def is_symmetrized(gt1, gt2):
+    x = gt1["instance"]
+    y = gt2["instance"]
+    if len(x) == len(y) and len(x) == 1:
+        return False  # special case of batchsize 1
+    ok = True
+    for i in range(0, len(x), 2):
+        ok = ok and (x[i] == y[i + 1]) and (x[i + 1] == y[i])
+    return ok
+def flip(tensor):
+    """flip so that tensor[0::2] <=> tensor[1::2]"""
+    return torch.stack((tensor[1::2], tensor[0::2]), dim=1).flatten(0, 1)
+def interleave(tensor1, tensor2):
+    res1 = torch.stack((tensor1, tensor2), dim=1).flatten(0, 1)
+    res2 = torch.stack((tensor2, tensor1), dim=1).flatten(0, 1)
+    return res1, res2
+def transpose_to_landscape(head, activate=True):
+    """Predict in the correct aspect-ratio,
+    then transpose the result in landscape
+    and stack everything back together.
+    """
+    def wrapper_no(decout, true_shape, **kwargs):
+        B = len(true_shape)
+        assert true_shape[0:1].allclose(true_shape), "true_shape must be all identical"
+        H, W = true_shape[0].cpu().tolist()
+        res = head(decout, (H, W), **kwargs)
+        return res
+    def wrapper_yes(decout, true_shape, **kwargs):
+        B = len(true_shape)
+        H, W = int(true_shape.min()), int(true_shape.max())
+        height, width = true_shape.T
+        is_landscape = width >= height
+        is_portrait = ~is_landscape
+        if is_landscape.all():
+            return head(decout, (H, W), **kwargs)
+        if is_portrait.all():
+            return transposed(head(decout, (W, H), **kwargs))
+        def selout(ar):
+            return [d[ar] for d in decout]
+        if "pos" in kwargs:
+            kwargs_landscape = kwargs.copy()
+            kwargs_landscape["pos"] = kwargs["pos"][is_landscape]
+            kwargs_portrait = kwargs.copy()
+            kwargs_portrait["pos"] = kwargs["pos"][is_portrait]
+        l_result = head(selout(is_landscape), (H, W), **kwargs_landscape)
+        p_result = transposed(head(selout(is_portrait), (W, H), **kwargs_portrait))
+        result = {}
+        for k in l_result | p_result:
+            x = l_result[k].new(B, *l_result[k].shape[1:])
+            x[is_landscape] = l_result[k]
+            x[is_portrait] = p_result[k]
+            result[k] = x
+        return result
+    return wrapper_yes if activate else wrapper_no
+def transposed(dic):
+    return {k: v.swapaxes(1, 2) if v.ndim > 2 else v for k, v in dic.items()}
+def invalid_to_nans(arr, valid_mask, ndim=999):
+    if valid_mask is not None:
+        arr = arr.clone()
+        arr[~valid_mask] = float("nan")
+    if arr.ndim > ndim:
+        arr = arr.flatten(-2 - (arr.ndim - ndim), -2)
+    return arr
+def invalid_to_zeros(arr, valid_mask, ndim=999):
+    if valid_mask is not None:
+        arr = arr.clone()
+        arr[~valid_mask] = 0
+        nnz = valid_mask.view(len(valid_mask), -1).sum(1)
+    else:
+        nnz = arr.numel() // len(arr) if len(arr) else 0  # number of point per image
+    if arr.ndim > ndim:
+        arr = arr.flatten(-2 - (arr.ndim - ndim), -2)
+    return arr, nnz

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/parallel.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+from tqdm import tqdm
+from multiprocessing.dummy import Pool as ThreadPool
+from multiprocessing import cpu_count
+def parallel_threads(
+    function,
+    args,
+    workers=0,
+    star_args=False,
+    kw_args=False,
+    front_num=1,
+    Pool=ThreadPool,
+    **tqdm_kw
+):
+    """tqdm but with parallel execution.
+    Will essentially return
+      res = [ function(arg) # default
+              function(*arg) # if star_args is True
+              function(**arg) # if kw_args is True
+              for arg in args]
+    Note:
+        the <front_num> first elements of args will not be parallelized.
+        This can be useful for debugging.
+    """
+    while workers <= 0:
+        workers += cpu_count()
+    if workers == 1:
+        front_num = float("inf")
+    try:
+        n_args_parallel = len(args) - front_num
+    except TypeError:
+        n_args_parallel = None
+    args = iter(args)
+    front = []
+    while len(front) < front_num:
+        try:
+            a = next(args)
+        except StopIteration:
+            return front  # end of the iterable
+        front.append(
+            function(*a) if star_args else function(**a) if kw_args else function(a)
+        )
+    out = []
+    with Pool(workers) as pool:
+        if star_args:
+            futures = pool.imap(starcall, [(function, a) for a in args])
+        elif kw_args:
+            futures = pool.imap(starstarcall, [(function, a) for a in args])
+        else:
+            futures = pool.imap(function, args)
+        for f in tqdm(futures, total=n_args_parallel, **tqdm_kw):
+            out.append(f)
+    return front + out
+def parallel_processes(*args, **kwargs):
+    """Same as parallel_threads, with processes"""
+    import multiprocessing as mp
+    kwargs["Pool"] = mp.Pool
+    return parallel_threads(*args, **kwargs)
+def starcall(args):
+    """convenient wrapper for Process.Pool"""
+    function, args = args
+    return function(*args)
+def starstarcall(args):
+    """convenient wrapper for Process.Pool"""
+    function, args = args
+    return function(**args)

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/path_to_croco.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# Copyright (C) 2024-present Naver Corporation. All rights reserved.
+# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+#
+# --------------------------------------------------------
+# modified from DUSt3R
+import sys
+import os.path as path
+import importlib
+# Import-time setup: locate the croco checkout relative to this file and make
+# its `models` package importable. Raises ImportError when it cannot be found.
+HERE_PATH = path.normpath(path.dirname(__file__))
+CROCO_REPO_PATH = path.normpath(path.join(HERE_PATH, "../../croco"))
+CROCO_MODELS_PATH = path.join(CROCO_REPO_PATH, "models")
+# IMPORTANT:
+# Do NOT add `.../src/croco` directly to sys.path, otherwise subfolders like
+# `croco/datasets` become a top-level module named `datasets`, which will shadow
+# HuggingFace `datasets` and break `accelerate` (and others).
+# Instead, add `.../src` so we import as `croco.*`.
+# NOTE(review): CROCO_REPO_PATH is two levels above this file plus "croco", so
+# its parent directory is HERE_PATH/../.. ; SRC_PATH below goes three levels up.
+# If SRC_PATH is not the directory that contains `croco`, the
+# `import croco.models` below fails and is silently ignored -- confirm against
+# the actual repository layout.
+SRC_PATH = path.normpath(path.join(HERE_PATH, "../../.."))
+if path.isdir(CROCO_MODELS_PATH):
+    # Prefer adding the `src` directory; this enables `import croco...` without
+    # polluting top-level module names.
+    if SRC_PATH not in sys.path:
+        sys.path.insert(0, SRC_PATH)
+    # In case an old run already inserted CROCO_REPO_PATH, remove it to avoid
+    # shadowing top-level modules (e.g., `datasets`).
+    while CROCO_REPO_PATH in sys.path:
+        sys.path.remove(CROCO_REPO_PATH)
+    # Backward-compat: DUSt3R code expects `models.*` to exist as a top-level package
+    # (historically achieved by adding CROCO_REPO_PATH to sys.path). We keep that
+    # import path working by aliasing `croco.models` to `models` without exposing
+    # other top-level names like `datasets`.
+    try:
+        _croco_models = importlib.import_module("croco.models")
+        # setdefault: an already-registered `models` module is left in place
+        sys.modules.setdefault("models", _croco_models)
+    except Exception:
+        # If croco isn't importable yet, downstream import will raise a clearer error.
+        pass
+else:
+    raise ImportError(
+        f"croco is not initialized, could not find: {CROCO_MODELS_PATH}.\n "
+        "Did you forget to run 'git submodule update --init --recursive' ?"
+    )

outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/utils/render.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import torch
+from gsplat import rasterization
+from dust3r.utils.geometry import inv, geotrf
+def render(
+    intrinsics: torch.Tensor,
+    pts3d: torch.Tensor,
+    rgbs: torch.Tensor | None = None,
+    scale: float = 0.002,
+    opacity: float = 0.95,
+):
+    """Splat each view's point map with ``gsplat.rasterization`` and collect the depth channel.
+    Every point becomes one Gaussian with a random unit quaternion, the same
+    ``scale`` on all three axes and a constant ``opacity``. An identity 4x4
+    matrix is passed as the view matrix, so ``pts3d`` is rendered as given.
+    Args:
+        intrinsics: per-view camera intrinsics; ``len(intrinsics)`` is the batch size.
+        pts3d: point maps; dims 1 and 2 are used as (height, width) and the
+            tensor is flattened to ``(batch, -1, 3)``.
+        rgbs: optional colors whose dim 1 must have size 3; all-ones colors are
+            used when omitted.
+        scale: value assigned to every Gaussian scale component.
+        opacity: value assigned to every Gaussian opacity.
+    Returns:
+        ``(rendered_rgbs, rendered_depths, accs)``. Only ``rendered_depths`` is
+        populated (channel 3 of each "RGB+D" render, concatenated on dim 0);
+        the first and last items are always empty lists.
+    """
+    device = pts3d.device
+    batch_size = len(intrinsics)
+    img_size = pts3d.shape[1:3]
+    points = pts3d.reshape(batch_size, -1, 3)
+    num_pts = points.shape[1]
+    # Splat parameters are shared by every view in the batch.
+    quats = torch.randn((num_pts, 4), device=device)
+    quats = quats / quats.norm(dim=-1, keepdim=True)
+    scales = scale * torch.ones((num_pts, 3), device=device)
+    opacities = opacity * torch.ones((num_pts), device=device)
+    if rgbs is None:
+        colors = torch.ones_like(points[:, :, :3])
+    else:
+        assert rgbs.shape[1] == 3
+        # Channel-first colors -> one RGB triple per point.
+        colors = rgbs.reshape(batch_size, 3, -1).transpose(1, 2)
+    view_matrix = torch.eye(4, device=device)[None]
+    depth_maps = []
+    for view_idx in range(batch_size):
+        rgbd, _acc, _ = rasterization(
+            points[view_idx],
+            quats,
+            scales,
+            opacities,
+            colors[view_idx],
+            view_matrix,
+            intrinsics[[view_idx]],
+            width=img_size[1],
+            height=img_size[0],
+            packed=False,
+            render_mode="RGB+D",
+        )
+        # Only the depth channel of the RGB+D output is kept.
+        depth_maps.append(rgbd[..., 3])
+    rendered_depths = torch.cat(depth_maps, dim=0)
+    return [], rendered_depths, []
+def get_render_results(gts, preds, self_view=False):
+    """Render predicted and ground-truth point maps to depth images, view by view.
+    Args:
+        gts: sequence of per-view dicts; the keys "camera_pose",
+            "camera_intrinsics", "img" and "pts3d" are read.
+        preds: sequence of per-view dicts; the key "pts3d_in_other_view" is read.
+        self_view: when true, each view uses its own pose and intrinsics;
+            otherwise those of ``gts[0]`` are used for every view.
+    Returns:
+        ``(depths, gt_depths)``: two lists holding one rendered depth tensor per view.
+    """
+    device = preds[0]["pts3d_in_other_view"].device
+    depths, gt_depths = [], []
+    with torch.no_grad():
+        for gt, pred in zip(gts, preds):
+            # Choose which view supplies the camera for this render.
+            cam_src = gt if self_view else gts[0]
+            camera = inv(cam_src["camera_pose"]).to(device)
+            intrinsics = cam_src["camera_intrinsics"].to(device)
+            pred_pts3d = pred["pts3d_in_other_view"]
+            gt_img = gt["img"].to(device)
+            gt_pts3d = gt["pts3d"].to(device)
+            # The prediction is rendered as-is; ground truth is first moved
+            # into the chosen camera's frame with the inverted pose.
+            depths.append(render(intrinsics, pred_pts3d, gt_img)[1])
+            gt_depths.append(render(intrinsics, geotrf(camera, gt_pts3d), gt_img)[1])
+    return depths, gt_depths

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+__version__ = "0.0.1"

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/hub/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/hub/backbones.py ADDED Viewed

	@@ -0,0 +1,156 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+from enum import Enum
+from typing import Union
+import torch
+from .utils import _DINOV2_BASE_URL, _make_dinov2_model_name
+class Weights(Enum):
+    """Names of the pretrained weight sets accepted by the model builders in this module."""
+    LVD142M = "LVD142M"
+def _make_dinov2_model(
+    *,
+    arch_name: str = "vit_large",
+    img_size: int = 518,
+    patch_size: int = 14,
+    init_values: float = 1.0,
+    ffn_layer: str = "mlp",
+    block_chunks: int = 0,
+    num_register_tokens: int = 0,
+    interpolate_antialias: bool = False,
+    interpolate_offset: float = 0.1,
+    pretrained: bool = True,
+    weights: Union[Weights, str] = Weights.LVD142M,
+    **kwargs,
+):
+    """Instantiate a DINOv2 vision transformer and optionally load its checkpoint.
+    Args:
+        arch_name: name of the factory looked up in the ``vision_transformer`` module.
+        img_size, patch_size, init_values, ffn_layer, block_chunks,
+        num_register_tokens, interpolate_antialias, interpolate_offset:
+            forwarded to that factory as keyword arguments.
+        pretrained: when true, download and load the matching ``*_pretrain.pth``.
+        weights: a ``Weights`` member or its name. It is only validated here;
+            the download URL does not depend on it.
+        **kwargs: extra factory arguments; they override the named ones above.
+    Returns:
+        The constructed model.
+    Raises:
+        AssertionError: if ``weights`` is a string that names no ``Weights`` member.
+    """
+    from ..models import vision_transformer as vits
+    # Accept the enum member's name as a plain string.
+    if isinstance(weights, str):
+        try:
+            weights = Weights[weights]
+        except KeyError:
+            raise AssertionError(f"Unsupported weights: {weights}")
+    model_base_name = _make_dinov2_model_name(arch_name, patch_size)
+    vit_kwargs = {
+        "img_size": img_size,
+        "patch_size": patch_size,
+        "init_values": init_values,
+        "ffn_layer": ffn_layer,
+        "block_chunks": block_chunks,
+        "num_register_tokens": num_register_tokens,
+        "interpolate_antialias": interpolate_antialias,
+        "interpolate_offset": interpolate_offset,
+        **kwargs,
+    }
+    factory = vits.__dict__[arch_name]
+    model = factory(**vit_kwargs)
+    if not pretrained:
+        return model
+    # The checkpoint lives under the register-free base name, while the file
+    # itself carries the register suffix.
+    model_full_name = _make_dinov2_model_name(arch_name, patch_size, num_register_tokens)
+    url = _DINOV2_BASE_URL + f"/{model_base_name}/{model_full_name}_pretrain.pth"
+    state_dict = torch.hub.load_state_dict_from_url(url, map_location="cpu")
+    model.load_state_dict(state_dict, strict=True)
+    return model
+def dinov2_vits14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    Build the DINOv2 ViT-S/14 backbone ("vit_small"); the LVD-142M checkpoint is loaded
+    unless ``pretrained`` is false. Extra keyword arguments go to ``_make_dinov2_model``.
+    """
+    model = _make_dinov2_model(weights=weights, pretrained=pretrained, arch_name="vit_small", **kwargs)
+    return model
+def dinov2_vitb14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    Build the DINOv2 ViT-B/14 backbone ("vit_base"); the LVD-142M checkpoint is loaded
+    unless ``pretrained`` is false. Extra keyword arguments go to ``_make_dinov2_model``.
+    """
+    model = _make_dinov2_model(weights=weights, pretrained=pretrained, arch_name="vit_base", **kwargs)
+    return model
+def dinov2_vitl14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    Build the DINOv2 ViT-L/14 backbone ("vit_large"); the LVD-142M checkpoint is loaded
+    unless ``pretrained`` is false. Extra keyword arguments go to ``_make_dinov2_model``.
+    """
+    model = _make_dinov2_model(weights=weights, pretrained=pretrained, arch_name="vit_large", **kwargs)
+    return model
+def dinov2_vitg14(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    Build the DINOv2 ViT-g/14 backbone ("vit_giant2" with the "swiglufused" FFN); the
+    LVD-142M checkpoint is loaded unless ``pretrained`` is false. Extra keyword
+    arguments go to ``_make_dinov2_model``.
+    """
+    model = _make_dinov2_model(
+        pretrained=pretrained,
+        weights=weights,
+        arch_name="vit_giant2",
+        ffn_layer="swiglufused",
+        **kwargs,
+    )
+    return model
+def dinov2_vits14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    Build the DINOv2 ViT-S/14 backbone with 4 register tokens ("vit_small"); the LVD-142M
+    checkpoint is loaded unless ``pretrained`` is false. Position-embedding interpolation
+    is configured with antialiasing on and a zero offset.
+    """
+    model = _make_dinov2_model(
+        weights=weights,
+        pretrained=pretrained,
+        arch_name="vit_small",
+        interpolate_offset=0.0,
+        interpolate_antialias=True,
+        num_register_tokens=4,
+        **kwargs,
+    )
+    return model
+def dinov2_vitb14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    Build the DINOv2 ViT-B/14 backbone with 4 register tokens ("vit_base"); the LVD-142M
+    checkpoint is loaded unless ``pretrained`` is false. Position-embedding interpolation
+    is configured with antialiasing on and a zero offset.
+    """
+    model = _make_dinov2_model(
+        weights=weights,
+        pretrained=pretrained,
+        arch_name="vit_base",
+        interpolate_offset=0.0,
+        interpolate_antialias=True,
+        num_register_tokens=4,
+        **kwargs,
+    )
+    return model
+def dinov2_vitl14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    Build the DINOv2 ViT-L/14 backbone with 4 register tokens ("vit_large"); the LVD-142M
+    checkpoint is loaded unless ``pretrained`` is false. Position-embedding interpolation
+    is configured with antialiasing on and a zero offset.
+    """
+    model = _make_dinov2_model(
+        weights=weights,
+        pretrained=pretrained,
+        arch_name="vit_large",
+        interpolate_offset=0.0,
+        interpolate_antialias=True,
+        num_register_tokens=4,
+        **kwargs,
+    )
+    return model
+def dinov2_vitg14_reg(*, pretrained: bool = True, weights: Union[Weights, str] = Weights.LVD142M, **kwargs):
+    """
+    Build the DINOv2 ViT-g/14 backbone with 4 register tokens ("vit_giant2" with the
+    "swiglufused" FFN); the LVD-142M checkpoint is loaded unless ``pretrained`` is false.
+    Position-embedding interpolation is configured with antialiasing on and a zero offset.
+    """
+    model = _make_dinov2_model(
+        pretrained=pretrained,
+        weights=weights,
+        arch_name="vit_giant2",
+        ffn_layer="swiglufused",
+        interpolate_offset=0.0,
+        interpolate_antialias=True,
+        num_register_tokens=4,
+        **kwargs,
+    )
+    return model

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/hub/utils.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import itertools
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# Base URL to which checkpoint paths ("/<base name>/<full name>_pretrain.pth") are appended.
+_DINOV2_BASE_URL = "https://dl.fbaipublicfiles.com/dinov2"
+def _make_dinov2_model_name(arch_name: str, patch_size: int, num_register_tokens: int = 0) -> str:
+    """Compose a DINOv2 model name, e.g. ``("vit_large", 14, 4)`` -> ``"dinov2_vitl14_reg4"``.
+    The architecture is shortened to the first four characters of ``arch_name``
+    after removing underscores; the ``_reg<N>`` suffix is added only when
+    ``num_register_tokens`` is non-zero.
+    """
+    pieces = ["dinov2_", arch_name.replace("_", "")[:4], f"{patch_size}"]
+    if num_register_tokens:
+        pieces.append(f"_reg{num_register_tokens}")
+    return "".join(pieces)
+class CenterPadding(nn.Module):
+    """Pad every dimension from index 2 onward up to the next multiple of ``multiple``.
+    The padding is split between both sides; when it is odd, the extra element
+    goes on the trailing side.
+    """
+    def __init__(self, multiple):
+        super().__init__()
+        self.multiple = multiple
+    def _get_pad(self, size):
+        """Return ``(leading, trailing)`` padding that rounds ``size`` up to a multiple."""
+        target = math.ceil(size / self.multiple) * self.multiple
+        total = target - size
+        leading = total // 2
+        return leading, total - leading
+    @torch.inference_mode()
+    def forward(self, x):
+        # F.pad expects pairs ordered from the last dimension backwards, hence
+        # the reversed walk over the shape (stopping before dims 0 and 1).
+        pads = []
+        for dim_size in x.shape[:1:-1]:
+            pads.extend(self._get_pad(dim_size))
+        return F.pad(x, pads)

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/__init__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+from .dino_head import DINOHead
+from .mlp import Mlp
+from .patch_embed import PatchEmbed
+from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
+from .block import NestedTensorBlock
+from .attention import MemEffAttention

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/attention.py ADDED Viewed

	@@ -0,0 +1,89 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+import logging
+import os
+import warnings
+from torch import Tensor
+from torch import nn
+logger = logging.getLogger("dinov2")
+# xFormers is attempted unless the XFORMERS_DISABLED environment variable is set (to any value).
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+# XFORMERS_AVAILABLE ends up True only when xFormers is enabled and importable;
+# the "disabled" case deliberately raises ImportError to share the fallback path.
+try:
+    if XFORMERS_ENABLED:
+        from xformers.ops import memory_efficient_attention, unbind
+        XFORMERS_AVAILABLE = True
+        # warnings.warn("xFormers is available (Attention)")
+    else:
+        # warnings.warn("xFormers is disabled (Attention)")
+        raise ImportError
+except ImportError:
+    XFORMERS_AVAILABLE = False
+    # warnings.warn("xFormers is not available (Attention)")
+class Attention(nn.Module):
+    """Multi-head self-attention computed with an explicit matmul + softmax.
+    Args:
+        dim: size of the last input dimension.
+        num_heads: number of heads; each head works on ``dim // num_heads`` channels.
+        qkv_bias: whether the fused q/k/v projection has a bias.
+        proj_bias: whether the output projection has a bias.
+        attn_drop: dropout probability on the attention weights.
+        proj_drop: dropout probability on the projected output.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        # Queries are multiplied by head_dim ** -0.5 in forward().
+        self.scale = (dim // num_heads) ** -0.5
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim, bias=proj_bias)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+        """Attend over a ``(B, N, C)`` input; ``attn_bias`` is accepted but not used here."""
+        B, N, C = x.shape
+        per_head = C // self.num_heads
+        # (B, N, 3, heads, per_head) -> three tensors of shape (B, heads, N, per_head).
+        fused = self.qkv(x).reshape(B, N, 3, self.num_heads, per_head)
+        q, k, v = fused.permute(2, 0, 3, 1, 4).unbind(0)
+        scores = (q * self.scale) @ k.transpose(-2, -1)
+        weights = self.attn_drop(scores.softmax(dim=-1))
+        merged = (weights @ v).transpose(1, 2).reshape(B, N, C)
+        return self.proj_drop(self.proj(merged))
+class MemEffAttention(Attention):
+    """Attention that uses xFormers' ``memory_efficient_attention`` when it is available.
+    Without xFormers it falls back to the parent implementation, which cannot
+    honour an ``attn_bias``; supplying one in that case is an error.
+    """
+    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
+        if XFORMERS_AVAILABLE:
+            B, N, C = x.shape
+            fused = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
+            q, k, v = unbind(fused, 2)
+            attended = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
+            attended = attended.reshape([B, N, C])
+            return self.proj_drop(self.proj(attended))
+        if attn_bias is not None:
+            raise AssertionError("xFormers is required for using nested tensors")
+        return super().forward(x)

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/block.py ADDED Viewed

	@@ -0,0 +1,259 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+import logging
+import os
+from typing import Callable, List, Any, Tuple, Dict
+import warnings
+import torch
+from torch import nn, Tensor
+from .attention import Attention, MemEffAttention
+from .drop_path import DropPath
+from .layer_scale import LayerScale
+from .mlp import Mlp
+logger = logging.getLogger("dinov2")
+# xFormers is attempted unless the XFORMERS_DISABLED environment variable is set (to any value).
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+# fmha, scaled_index_add and index_select_cat exist only when this import
+# succeeds; the "disabled" case raises ImportError to share the fallback path.
+try:
+    if XFORMERS_ENABLED:
+        from xformers.ops import fmha, scaled_index_add, index_select_cat
+        XFORMERS_AVAILABLE = True
+        # warnings.warn("xFormers is available (Block)")
+    else:
+        # warnings.warn("xFormers is disabled (Block)")
+        raise ImportError
+except ImportError:
+    XFORMERS_AVAILABLE = False
+    # warnings.warn("xFormers is not available (Block)")
+class Block(nn.Module):
+    """Pre-norm transformer block with an attention branch and a feed-forward branch.
+    Each branch computes ``ls(sublayer(norm(x)))`` and is added back onto ``x``;
+    ``ls`` is a ``LayerScale`` when ``init_values`` is truthy and an identity
+    otherwise. In training mode the residuals are regularised with stochastic
+    depth controlled by ``drop_path``.
+    Args:
+        dim: feature size handed to the norms, attention and FFN.
+        num_heads: number of attention heads.
+        mlp_ratio: FFN hidden size as a multiple of ``dim``.
+        qkv_bias, proj_bias: bias switches forwarded to ``attn_class``.
+        ffn_bias: bias switch forwarded to ``ffn_layer``.
+        drop: dropout probability for the attention projection and the FFN.
+        attn_drop: dropout probability on the attention weights.
+        init_values: LayerScale initial value; falsy disables LayerScale.
+        drop_path: stochastic-depth rate.
+        act_layer, norm_layer, attn_class, ffn_layer: factories for the sublayers.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        proj_bias: bool = True,
+        ffn_bias: bool = True,
+        drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values=None,
+        drop_path: float = 0.0,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
+        attn_class: Callable[..., nn.Module] = Attention,
+        ffn_layer: Callable[..., nn.Module] = Mlp,
+    ) -> None:
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = attn_class(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            proj_bias=proj_bias,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+        )
+        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = ffn_layer(
+            in_features=dim,
+            hidden_features=mlp_hidden_dim,
+            act_layer=act_layer,
+            drop=drop,
+            bias=ffn_bias,
+        )
+        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        # Kept as a plain float so forward() can pick a stochastic-depth strategy.
+        self.sample_drop_ratio = drop_path
+    def forward(self, x: Tensor) -> Tensor:
+        """Apply both residual branches to ``x`` and return the result."""
+        def attn_residual_func(x: Tensor) -> Tensor:
+            return self.ls1(self.attn(self.norm1(x)))
+        def ffn_residual_func(x: Tensor) -> Tensor:
+            return self.ls2(self.mlp(self.norm2(x)))
+        if self.training and self.sample_drop_ratio > 0.1:
+            # the overhead is compensated only for a drop path rate larger than 0.1
+            x = drop_add_residual_stochastic_depth(
+                x,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+            )
+            x = drop_add_residual_stochastic_depth(
+                x,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+            )
+        elif self.training and self.sample_drop_ratio > 0.0:
+            # Each branch goes through its own DropPath module (the FFN branch
+            # previously reused drop_path1).
+            x = x + self.drop_path1(attn_residual_func(x))
+            x = x + self.drop_path2(ffn_residual_func(x))
+        else:
+            x = x + attn_residual_func(x)
+            x = x + ffn_residual_func(x)
+        return x
+def drop_add_residual_stochastic_depth(
+    x: Tensor,
+    residual_func: Callable[[Tensor], Tensor],
+    sample_drop_ratio: float = 0.0,
+) -> Tensor:
+    """Add a residual to a random subset of the batch, rescaled to compensate.
+    ``residual_func`` is evaluated only on the kept samples (at least one is
+    always kept). Its output is multiplied by ``batch / kept`` and added onto
+    those rows of ``x``; the other rows pass through unchanged.
+    Args:
+        x: 3-D input whose first dimension is the batch.
+        residual_func: computes the residual for the selected samples.
+        sample_drop_ratio: fraction of the batch to skip.
+    Returns:
+        A tensor shaped like ``x``.
+    """
+    batch, _, _ = x.shape
+    # 1) choose which samples keep their residual branch
+    kept = max(int(batch * (1 - sample_drop_ratio)), 1)
+    kept_rows = torch.randperm(batch, device=x.device)[:kept]
+    # 2) evaluate the residual on that subset only
+    residual = residual_func(x[kept_rows]).flatten(1)
+    # 3) scatter-add it back, scaled up by batch / kept
+    rescale = batch / kept
+    summed = torch.index_add(x.flatten(1), 0, kept_rows, residual.to(dtype=x.dtype), alpha=rescale)
+    return summed.view_as(x)
+def get_branges_scales(x, sample_drop_ratio=0.0):
+    """Pick the random batch rows to keep and the factor that compensates for dropped ones.
+    Returns ``(brange, residual_scale_factor)``: ``brange`` holds the kept row
+    indices (a prefix of a random permutation, never empty) and the factor is
+    ``batch / len(brange)``.
+    """
+    batch, _, _ = x.shape
+    kept = max(int(batch * (1 - sample_drop_ratio)), 1)
+    permutation = torch.randperm(batch, device=x.device)
+    return permutation[:kept], batch / kept
+def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
+    """Add ``residual`` (times ``residual_scale_factor``) onto the rows of ``x`` listed in ``brange``.
+    With a ``scaling_vector`` the addition is delegated to xFormers'
+    ``scaled_index_add`` on the unflattened tensors; otherwise both tensors are
+    flattened from dim 1 and the result is returned in that flattened layout.
+    """
+    residual = residual.to(dtype=x.dtype) if scaling_vector is not None else residual
+    if scaling_vector is not None:
+        return scaled_index_add(x, brange, residual, scaling=scaling_vector, alpha=residual_scale_factor)
+    flat_residual = residual.flatten(1).to(dtype=x.dtype)
+    return torch.index_add(x.flatten(1), 0, brange, flat_residual, alpha=residual_scale_factor)
+# Block-diagonal attention masks, memoised by the tuple of (batch size, sequence length) pairs.
+attn_bias_cache: Dict[Tuple, Any] = {}
+def get_attn_bias_and_cat(x_list, branges=None):
+    """
+    Concatenate the tensors of ``x_list`` into a single batch-of-one sequence and
+    return it together with the matching (cached) block-diagonal attention bias.
+    When ``branges`` is given, only the indexed rows of each tensor are gathered
+    (via ``index_select_cat``) and the batch sizes come from the index tensors.
+    """
+    if branges is None:
+        batch_sizes = [x.shape[0] for x in x_list]
+    else:
+        batch_sizes = [b.shape[0] for b in branges]
+    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
+    if all_shapes not in attn_bias_cache:
+        # One sequence-length entry per sample, in list order.
+        seqlens = [x.shape[1] for b, x in zip(batch_sizes, x_list) for _ in range(b)]
+        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
+        attn_bias._batch_sizes = batch_sizes
+        attn_bias_cache[all_shapes] = attn_bias
+    if branges is None:
+        # Fold each tensor's batch into its sequence axis, then join along it.
+        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
+        cat_tensors = torch.cat(tensors_bs1, dim=1)
+    else:
+        flattened = [x.flatten(1) for x in x_list]
+        cat_tensors = index_select_cat(flattened, branges).view(1, -1, x_list[0].shape[-1])
+    return attn_bias_cache[all_shapes], cat_tensors
+def drop_add_residual_stochastic_depth_list(
+    x_list: List[Tensor],
+    residual_func: Callable[[Tensor, Any], Tensor],
+    sample_drop_ratio: float = 0.0,
+    scaling_vector=None,
+) -> List[Tensor]:
+    """Stochastic-depth residual update for a list of tensors processed as one nested batch.
+    A random subset of rows is chosen independently for every tensor, the
+    subsets are concatenated, ``residual_func`` is run once on the result, and
+    each slice of its output is added back onto the rows it came from.
+    Args:
+        x_list: input tensors.
+        residual_func: called as ``residual_func(x_cat, attn_bias=attn_bias)``.
+        sample_drop_ratio: fraction of each tensor's batch to skip.
+        scaling_vector: optional per-channel scaling forwarded to ``add_residual``.
+    Returns:
+        A list with one updated tensor per input, each shaped like its input.
+    """
+    # 1) generate random set of indices for dropping samples in the batch
+    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
+    branges = [s[0] for s in branges_scales]
+    residual_scale_factors = [s[1] for s in branges_scales]
+    # 2) get attention bias and index+concat the tensors
+    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
+    # 3) apply residual_func to get residual, and split the result
+    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore
+    # 4) add every residual slice back onto the rows it was computed from
+    return [
+        add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)
+        for x, brange, residual, residual_scale_factor in zip(
+            x_list, branges, residual_list, residual_scale_factors
+        )
+    ]
+class NestedTensorBlock(Block):
+    """``Block`` that also accepts a list of tensors and runs them as one nested batch (needs xFormers)."""
+    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
+        """
+        x_list contains a list of tensors to nest together and run
+        """
+        assert isinstance(self.attn, MemEffAttention)
+        if self.training and self.sample_drop_ratio > 0.0:
+            # LayerScale is left out of these residual functions: its gamma is
+            # passed as scaling_vector and applied when the residual is added.
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.attn(self.norm1(x), attn_bias=attn_bias)
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.mlp(self.norm2(x))
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=attn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
+            )
+            # Guard on ls2 itself before reading ls2.gamma (this used to test ls1).
+            x_list = drop_add_residual_stochastic_depth_list(
+                x_list,
+                residual_func=ffn_residual_func,
+                sample_drop_ratio=self.sample_drop_ratio,
+                scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
+            )
+            return x_list
+        else:
+            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
+            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
+                return self.ls2(self.mlp(self.norm2(x)))
+            attn_bias, x = get_attn_bias_and_cat(x_list)
+            x = x + attn_residual_func(x, attn_bias=attn_bias)
+            x = x + ffn_residual_func(x)
+            return attn_bias.split(x)
+    def forward(self, x_or_x_list):
+        """Dispatch on input type: a tensor uses ``Block.forward``, a list uses ``forward_nested``.
+        Raises:
+            AssertionError: for a list input without xFormers, or for any other input type.
+        """
+        if isinstance(x_or_x_list, Tensor):
+            return super().forward(x_or_x_list)
+        elif isinstance(x_or_x_list, list):
+            if not XFORMERS_AVAILABLE:
+                raise AssertionError("xFormers is required for using nested tensors")
+            return self.forward_nested(x_or_x_list)
+        else:
+            raise AssertionError

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/dino_head.py ADDED Viewed

	@@ -0,0 +1,58 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import torch
+import torch.nn as nn
+from torch.nn.init import trunc_normal_
+from torch.nn.utils import weight_norm
+class DINOHead(nn.Module):
+    """Projection head: an MLP, L2 normalisation, then a weight-normalised bias-free linear layer.
+    Args:
+        in_dim: input feature size.
+        out_dim: output size of the final layer.
+        use_bn: insert ``BatchNorm1d`` after the MLP's hidden linear layers.
+        nlayers: number of linear layers in the MLP (values below 1 are treated as 1).
+        hidden_dim: width of the MLP's hidden layers.
+        bottleneck_dim: MLP output size, i.e. the input size of the final layer.
+        mlp_bias: whether the MLP's linear layers have a bias.
+    """
+    def __init__(
+        self,
+        in_dim,
+        out_dim,
+        use_bn=False,
+        nlayers=3,
+        hidden_dim=2048,
+        bottleneck_dim=256,
+        mlp_bias=True,
+    ):
+        super().__init__()
+        nlayers = max(nlayers, 1)
+        self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
+        # apply() runs before last_layer exists, so only the MLP gets the custom init.
+        self.apply(self._init_weights)
+        self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
+        self.last_layer.weight_g.data.fill_(1)
+    def _init_weights(self, m):
+        """Truncated-normal weights (std 0.02) and zero bias for ``nn.Linear``; other modules are left alone."""
+        if not isinstance(m, nn.Linear):
+            return
+        trunc_normal_(m.weight, std=0.02)
+        if m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+    def forward(self, x):
+        hidden = self.mlp(x)
+        # A larger epsilon is used for half-precision inputs.
+        eps = 1e-6 if hidden.dtype == torch.float16 else 1e-12
+        hidden = nn.functional.normalize(hidden, dim=-1, p=2, eps=eps)
+        return self.last_layer(hidden)
+def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
+    """Build the head's MLP.
+    ``nlayers == 1`` yields a single ``nn.Linear(in_dim, bottleneck_dim)``. Otherwise the
+    result is an ``nn.Sequential`` of hidden stages (Linear, optional BatchNorm1d, GELU)
+    followed by a plain Linear down to ``bottleneck_dim``; there is always at least one
+    hidden stage.
+    """
+    if nlayers == 1:
+        return nn.Linear(in_dim, bottleneck_dim, bias=bias)
+    # Input widths of the hidden stages: in_dim first, hidden_dim afterwards.
+    stage_inputs = [in_dim] + [hidden_dim] * max(nlayers - 2, 0)
+    layers = []
+    for fan_in in stage_inputs:
+        layers.append(nn.Linear(fan_in, hidden_dim, bias=bias))
+        if use_bn:
+            layers.append(nn.BatchNorm1d(hidden_dim))
+        layers.append(nn.GELU())
+    layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
+    return nn.Sequential(*layers)

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/drop_path.py ADDED Viewed

	@@ -0,0 +1,34 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
+from torch import nn
+def drop_path(x, drop_prob: float = 0.0, training: bool = False):
+    """Zero whole samples of ``x`` with probability ``drop_prob`` and rescale the survivors.
+    Outside training, or with ``drop_prob == 0.0``, ``x`` itself is returned.
+    Otherwise one Bernoulli(keep) draw is made per index of dim 0 and broadcast
+    over the remaining dims; kept samples are divided by the keep probability.
+    """
+    if not training or drop_prob == 0.0:
+        return x
+    keep_prob = 1 - drop_prob
+    # One mask entry per sample: shape (batch, 1, ..., 1), for any input rank.
+    mask_shape = (x.shape[0], *([1] * (x.ndim - 1)))
+    mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
+    if keep_prob > 0.0:
+        mask.div_(keep_prob)
+    return x * mask
+class DropPath(nn.Module):
+    """Module form of ``drop_path``: per-sample stochastic depth for the main path of residual blocks."""
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        # Stored as given and handed to drop_path() on every call.
+        self.drop_prob = drop_prob
+    def forward(self, x):
+        # Dropping is active only while the module is in training mode.
+        return drop_path(x, drop_prob=self.drop_prob, training=self.training)

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/layer_scale.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
+from typing import Union
+import torch
+from torch import Tensor
+from torch import nn
+class LayerScale(nn.Module):
+    """Multiply the input by a learnable vector ``gamma`` of length ``dim``.
+    Args:
+        dim: length of ``gamma``.
+        init_values: initial value(s) of ``gamma``, multiplied onto a ones vector.
+        inplace: when true, the input tensor is modified in place and returned.
+    """
+    def __init__(
+        self,
+        dim: int,
+        init_values: Union[float, Tensor] = 1e-5,
+        inplace: bool = False,
+    ) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+    def forward(self, x: Tensor) -> Tensor:
+        if self.inplace:
+            return x.mul_(self.gamma)
+        return x * self.gamma

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/mlp.py ADDED Viewed

	@@ -0,0 +1,40 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
+from typing import Callable, Optional
+from torch import Tensor, nn
+class Mlp(nn.Module):
+    """Two-layer feed-forward network: Linear, activation, dropout, Linear, dropout.
+    Args:
+        in_features: input size.
+        hidden_features: hidden size; falls back to ``in_features`` when falsy.
+        out_features: output size; falls back to ``in_features`` when falsy.
+        act_layer: zero-argument factory for the activation module.
+        drop: dropout probability; the same dropout module follows both linear layers.
+        bias: whether both linear layers have a bias.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = nn.GELU,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        width_out = out_features or in_features
+        width_hidden = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, width_hidden, bias=bias)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(width_hidden, width_out, bias=bias)
+        self.drop = nn.Dropout(drop)
+    def forward(self, x: Tensor) -> Tensor:
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/patch_embed.py ADDED Viewed

	@@ -0,0 +1,88 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
+from typing import Callable, Optional, Tuple, Union
+from torch import Tensor
+import torch.nn as nn
+def make_2tuple(x):
+    """Return ``x`` if it is already a 2-tuple, or ``(x, x)`` for an int; anything else fails an assertion."""
+    if not isinstance(x, tuple):
+        assert isinstance(x, int)
+        return (x, x)
+    assert len(x) == 2
+    return x
+class PatchEmbed(nn.Module):
+    """
+    2D image to patch embedding: (B,C,H,W) -> (B,N,D)
+    Args:
+        img_size: Image size.
+        patch_size: Patch token size.
+        in_chans: Number of input image channels.
+        embed_dim: Number of linear projection output channels.
+        norm_layer: Normalization layer.
+        flatten_embedding: When false, the output is reshaped to (B, H', W', D)
+            instead of being left as (B, N, D).
+    """
+    def __init__(
+        self,
+        img_size: Union[int, Tuple[int, int]] = 224,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        in_chans: int = 3,
+        embed_dim: int = 768,
+        norm_layer: Optional[Callable] = None,
+        flatten_embedding: bool = True,
+    ) -> None:
+        super().__init__()
+        image_HW = make_2tuple(img_size)
+        patch_HW = make_2tuple(patch_size)
+        patch_grid_size = (
+            image_HW[0] // patch_HW[0],
+            image_HW[1] // patch_HW[1],
+        )
+        self.img_size = image_HW
+        self.patch_size = patch_HW
+        self.patches_resolution = patch_grid_size
+        self.num_patches = patch_grid_size[0] * patch_grid_size[1]
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+        self.flatten_embedding = flatten_embedding
+        # Non-overlapping patches: the kernel and the stride both equal the patch size.
+        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
+        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
+    def forward(self, x: Tensor) -> Tensor:
+        """Embed an image batch; H and W must be multiples of the patch size."""
+        _, _, H, W = x.shape
+        patch_H, patch_W = self.patch_size
+        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
+        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
+        x = self.proj(x)  # B C H W
+        H, W = x.size(2), x.size(3)
+        x = x.flatten(2).transpose(1, 2)  # B HW C
+        x = self.norm(x)
+        if not self.flatten_embedding:
+            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
+        return x
+    def flops(self) -> float:
+        """Estimate the projection cost, plus the normalisation cost when a norm layer is configured."""
+        Ho, Wo = self.patches_resolution
+        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
+        # self.norm is never None: a missing norm_layer is stored as nn.Identity,
+        # which must not be counted as a normalisation pass.
+        if not isinstance(self.norm, nn.Identity):
+            flops += Ho * Wo * self.embed_dim
+        return flops

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/layers/swiglu_ffn.py ADDED Viewed

	@@ -0,0 +1,72 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import os
+from typing import Callable, Optional
+import warnings
+from torch import Tensor, nn
+import torch.nn.functional as F
+class SwiGLUFFN(nn.Module):
+    """SwiGLU feed-forward network: ``w3(silu(x1) * x2)`` with ``x1, x2`` the halves of ``w12(x)``.
+    Args:
+        in_features: input size.
+        hidden_features: size of each gated half; falls back to ``in_features`` when falsy.
+        out_features: output size; falls back to ``in_features`` when falsy.
+        act_layer: accepted but not used by this class.
+        drop: accepted but not used by this class.
+        bias: whether both linear layers have a bias.
+    """
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        super().__init__()
+        width_out = out_features or in_features
+        width_hidden = hidden_features or in_features
+        # A single projection produces both the gate and the value halves.
+        self.w12 = nn.Linear(in_features, 2 * width_hidden, bias=bias)
+        self.w3 = nn.Linear(width_hidden, width_out, bias=bias)
+    def forward(self, x: Tensor) -> Tensor:
+        gate, value = self.w12(x).chunk(2, dim=-1)
+        return self.w3(F.silu(gate) * value)
+XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
+try:
+    if XFORMERS_ENABLED:
+        from xformers.ops import SwiGLU
+        XFORMERS_AVAILABLE = True
+        # warnings.warn("xFormers is available (SwiGLU)")
+    else:
+        # warnings.warn("xFormers is disabled (SwiGLU)")
+        raise ImportError
+except ImportError:
+    SwiGLU = SwiGLUFFN
+    XFORMERS_AVAILABLE = False
+    # warnings.warn("xFormers is not available (SwiGLU)")
+class SwiGLUFFNFused(SwiGLU):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        act_layer: Callable[..., nn.Module] = None,
+        drop: float = 0.0,
+        bias: bool = True,
+    ) -> None:
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+        super().__init__(
+            in_features=in_features,
+            hidden_features=hidden_features,
+            out_features=out_features,
+            bias=bias,
+        )

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/models/__init__.py ADDED Viewed

	@@ -0,0 +1,43 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+import logging
+from . import vision_transformer as vits
+# Module-level logger obtained under the fixed name "dinov2".
+logger = logging.getLogger("dinov2")
+def build_model(args, only_teacher=False, img_size=224):
+    args.arch = args.arch.removesuffix("_memeff")
+    if "vit" in args.arch:
+        vit_kwargs = dict(
+            img_size=img_size,
+            patch_size=args.patch_size,
+            init_values=args.layerscale,
+            ffn_layer=args.ffn_layer,
+            block_chunks=args.block_chunks,
+            qkv_bias=args.qkv_bias,
+            proj_bias=args.proj_bias,
+            ffn_bias=args.ffn_bias,
+            num_register_tokens=args.num_register_tokens,
+            interpolate_offset=args.interpolate_offset,
+            interpolate_antialias=args.interpolate_antialias,
+        )
+        teacher = vits.__dict__[args.arch](**vit_kwargs)
+        if only_teacher:
+            return teacher, teacher.embed_dim
+        student = vits.__dict__[args.arch](
+            **vit_kwargs,
+            drop_path_rate=args.drop_path_rate,
+            drop_path_uniform=args.drop_path_uniform,
+        )
+        embed_dim = student.embed_dim
+    return student, teacher, embed_dim
+def build_model_from_cfg(cfg, only_teacher=False):
+    return build_model(cfg.student, only_teacher=only_teacher, img_size=cfg.crops.global_crops_size)

outdoor_v48_16gpu_v2/code/05_02-22:24:00/slamformer/models/dinov2/models/vision_transformer.py ADDED Viewed

	@@ -0,0 +1,404 @@

+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the Apache License, Version 2.0
+# found in the LICENSE file in the root directory of this source tree.
+# References:
+#   https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+#   https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
+from functools import partial
+import math
+import logging
+from typing import Sequence, Tuple, Union, Callable
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from torch.nn.init import trunc_normal_
+from ..layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
+from ...layers.attention import FlashAttention
+# logger = logging.getLogger("dinov2")
+def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
+    if not depth_first and include_root:
+        fn(module=module, name=name)
+    for child_name, child_module in module.named_children():
+        child_name = ".".join((name, child_name)) if name else child_name
+        named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
+    if depth_first and include_root:
+        fn(module=module, name=name)
+    return module
+class BlockChunk(nn.ModuleList):
+    def forward(self, x):
+        for b in self:
+            x = b(x)
+        return x
+class DinoVisionTransformer(nn.Module):
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        ffn_bias=True,
+        proj_bias=True,
+        drop_path_rate=0.0,
+        drop_path_uniform=False,
+        init_values=None,  # for layerscale: None or 0 => no layerscale
+        embed_layer=PatchEmbed,
+        act_layer=nn.GELU,
+        block_fn=Block,
+        ffn_layer="mlp",
+        block_chunks=1,
+        num_register_tokens=0,
+        interpolate_antialias=False,
+        interpolate_offset=0.1,
+    ):
+        """
+        Build the patch embedding, learned tokens, transformer blocks and final norm.
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size (NOTE: interpolate_pos_encoding and
+                get_intermediate_layers floor-divide by this value, which only works for an int)
+            in_chans (int): number of input channels
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (float): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            proj_bias (bool): enable bias for proj in attn if True
+            ffn_bias (bool): enable bias for ffn if True
+            drop_path_rate (float): stochastic depth rate
+            drop_path_uniform (bool): apply uniform drop rate across blocks
+            init_values (float): layer-scale init values
+            embed_layer (nn.Module): patch embedding layer
+            act_layer (nn.Module): MLP activation layer
+            block_fn (nn.Module): transformer block class
+            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
+            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
+            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
+            interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
+            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
+        Raises:
+            NotImplementedError: if ffn_layer is not one of the supported names.
+        """
+        super().__init__()
+        norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        self.num_tokens = 1
+        self.n_blocks = depth
+        self.num_heads = num_heads
+        self.patch_size = patch_size
+        self.num_register_tokens = num_register_tokens
+        self.interpolate_antialias = interpolate_antialias
+        self.interpolate_offset = interpolate_offset
+        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+        num_patches = self.patch_embed.num_patches
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        # One positional embedding per patch plus self.num_tokens (= 1) for the class token.
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
+        assert num_register_tokens >= 0
+        # Register tokens are optional: the attribute is None when none are requested.
+        self.register_tokens = (
+            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
+        )
+        if drop_path_uniform is True:
+            dpr = [drop_path_rate] * depth
+        else:
+            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        # Resolve the ffn_layer name into a callable that the blocks use to build their FFN.
+        if ffn_layer == "mlp":
+            # logger.info("using MLP layer as FFN")
+            ffn_layer = Mlp
+        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
+            # logger.info("using SwiGLU layer as FFN")
+            ffn_layer = SwiGLUFFNFused
+        elif ffn_layer == "identity":
+            # logger.info("using Identity layer as FFN")
+            # Factory that ignores whatever arguments it is given and returns a pass-through module.
+            def f(*args, **kwargs):
+                return nn.Identity()
+            ffn_layer = f
+        else:
+            raise NotImplementedError
+        # NOTE(review): attn_class is passed explicitly here. When block_fn is a functools.partial
+        # that already binds attn_class (as the vit_* factories in this file do with MemEffAttention),
+        # the call-time keyword wins, so FlashAttention is what the blocks receive.
+        blocks_list = [
+            block_fn(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                proj_bias=proj_bias,
+                ffn_bias=ffn_bias,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                act_layer=act_layer,
+                ffn_layer=ffn_layer,
+                init_values=init_values,
+                attn_class=FlashAttention
+            )
+            for i in range(depth)
+        ]
+        if block_chunks > 0:
+            self.chunked_blocks = True
+            chunked_blocks = []
+            # chunksize becomes 0 when block_chunks > depth, which makes the range() below raise ValueError.
+            chunksize = depth // block_chunks
+            for i in range(0, depth, chunksize):
+                # this is to keep the block index consistent if we chunk the block list
+                # (each chunk is left-padded with i placeholders; the list repeats a single nn.Identity instance)
+                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
+            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
+        else:
+            self.chunked_blocks = False
+            self.blocks = nn.ModuleList(blocks_list)
+        self.norm = norm_layer(embed_dim)
+        self.head = nn.Identity()
+        # Embedding substituted for masked patches in prepare_tokens_with_masks.
+        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
+        self.init_weights()
+    def init_weights(self):
+        """Initialise the positional, class and register embeddings, then the linear layers of all submodules."""
+        trunc_normal_(self.pos_embed, std=0.02)
+        nn.init.normal_(self.cls_token, std=1e-6)
+        if self.register_tokens is not None:
+            nn.init.normal_(self.register_tokens, std=1e-6)
+        # named_apply is called with its defaults: children are visited depth-first and
+        # this root module itself is not passed to init_weights_vit_timm.
+        named_apply(init_weights_vit_timm, self)
+    def interpolate_pos_encoding(self, x, w, h):
+        """Return positional embeddings sized for the current token sequence.
+        Args:
+            x: token tensor that already includes the class token; ``x.shape[1] - 1`` is taken as the patch count.
+            w, h: spatial sizes of the input image, as unpacked from dims 2 and 3 of the
+                image tensor by prepare_tokens_with_masks.
+        Returns:
+            ``self.pos_embed`` unchanged (no dtype cast) when the patch count matches the stored
+            embeddings and ``w == h``; otherwise the class embedding followed by the bicubically
+            resampled patch embeddings, cast to ``x``'s dtype.
+        """
+        previous_dtype = x.dtype
+        npatch = x.shape[1] - 1
+        N = self.pos_embed.shape[1] - 1
+        if npatch == N and w == h:
+            return self.pos_embed
+        # Interpolation is carried out in float32 and cast back at the end.
+        pos_embed = self.pos_embed.float()
+        class_pos_embed = pos_embed[:, 0]
+        patch_pos_embed = pos_embed[:, 1:]
+        dim = x.shape[-1]
+        # Target grid size in patches; floor division requires self.patch_size to be an int here.
+        w0 = w // self.patch_size
+        h0 = h // self.patch_size
+        M = int(math.sqrt(N))  # Recover the number of patches in each dimension
+        # The stored embeddings must form a square M x M grid.
+        assert N == M * M
+        kwargs = {}
+        if self.interpolate_offset:
+            # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
+            # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
+            sx = float(w0 + self.interpolate_offset) / M
+            sy = float(h0 + self.interpolate_offset) / M
+            kwargs["scale_factor"] = (sx, sy)
+        else:
+            # Simply specify an output size instead of a scale factor
+            kwargs["size"] = (w0, h0)
+        # Lay the patch embeddings out as a (1, dim, M, M) map so they can be resampled spatially.
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
+            mode="bicubic",
+            antialias=self.interpolate_antialias,
+            **kwargs,
+        )
+        # Guards against the scale-factor path rounding to a different grid size than requested.
+        assert (w0, h0) == patch_pos_embed.shape[-2:]
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
+    def prepare_tokens_with_masks(self, x, masks=None):
+        B, nc, w, h = x.shape
+        x = self.patch_embed(x)
+        if masks is not None:
+            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
+        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
+        x = x + self.interpolate_pos_encoding(x, w, h)
+        if self.register_tokens is not None:
+            x = torch.cat(
+                (
+                    x[:, :1],
+                    self.register_tokens.expand(x.shape[0], -1, -1),
+                    x[:, 1:],
+                ),
+                dim=1,
+            )
+        return x
+    def forward_features_list(self, x_list, masks_list):
+        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
+        for blk in self.blocks:
+            if self.training:
+                x = checkpoint(blk, x, use_reentrant=False)
+            else:
+                x = blk(x)
+        all_x = x
+        output = []
+        for x, masks in zip(all_x, masks_list):
+            x_norm = self.norm(x)
+            output.append(
+                {
+                    "x_norm_clstoken": x_norm[:, 0],
+                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+                    "x_prenorm": x,
+                    "masks": masks,
+                }
+            )
+        return output
+    def forward_features(self, x, masks=None):
+        if isinstance(x, list):
+            return self.forward_features_list(x, masks)
+        x = self.prepare_tokens_with_masks(x, masks)
+        for blk in self.blocks:
+            if self.training:
+                x = checkpoint(blk, x, use_reentrant=False)
+            else:
+                x = blk(x)
+        x_norm = self.norm(x)
+        return {
+            "x_norm_clstoken": x_norm[:, 0],
+            "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
+            "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
+            "x_prenorm": x,
+            "masks": masks,
+        }
+    def _get_intermediate_layers_not_chunked(self, x, n=1):
+        x = self.prepare_tokens_with_masks(x)
+        # If n is an int, take the n last blocks. If it's a list, take them
+        output, total_block_len = [], len(self.blocks)
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if i in blocks_to_take:
+                output.append(x)
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        return output
+    def _get_intermediate_layers_chunked(self, x, n=1):
+        x = self.prepare_tokens_with_masks(x)
+        output, i, total_block_len = [], 0, len(self.blocks[-1])
+        # If n is an int, take the n last blocks. If it's a list, take them
+        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
+        for block_chunk in self.blocks:
+            for blk in block_chunk[i:]:  # Passing the nn.Identity()
+                x = blk(x)
+                if i in blocks_to_take:
+                    output.append(x)
+                i += 1
+        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
+        return output
+    def get_intermediate_layers(
+        self,
+        x: torch.Tensor,
+        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
+        reshape: bool = False,
+        return_class_token: bool = False,
+        norm=True,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+        if self.chunked_blocks:
+            outputs = self._get_intermediate_layers_chunked(x, n)
+        else:
+            outputs = self._get_intermediate_layers_not_chunked(x, n)
+        if norm:
+            outputs = [self.norm(out) for out in outputs]
+        class_tokens = [out[:, 0] for out in outputs]
+        outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
+        if reshape:
+            B, _, w, h = x.shape
+            outputs = [
+                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
+                for out in outputs
+            ]
+        if return_class_token:
+            return tuple(zip(outputs, class_tokens))
+        return tuple(outputs)
+    def forward(self, *args, is_training=False, **kwargs):
+        ret = self.forward_features(*args, **kwargs)
+        if is_training:
+            return ret
+        else:
+            return self.head(ret["x_norm_clstoken"])
+def init_weights_vit_timm(module: nn.Module, name: str = ""):
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=384,
+        depth=12,
+        num_heads=6,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1024,
+        depth=24,
+        num_heads=16,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model
+def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
+    """
+    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
+    """
+    model = DinoVisionTransformer(
+        patch_size=patch_size,
+        embed_dim=1536,
+        depth=40,
+        num_heads=24,
+        mlp_ratio=4,
+        block_fn=partial(Block, attn_class=MemEffAttention),
+        num_register_tokens=num_register_tokens,
+        **kwargs,
+    )
+    return model