brian4dwell committed on
Commit 9d31508 · 1 Parent(s): 594b88c

add stream3r

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .gitattributes +3 -0
  2. LICENSE +13 -0
  3. assets/pipeline.png +3 -0
  4. assets/teaser_dynamic.gif +3 -0
  5. configs/__init__.py +7 -0
  6. configs/callbacks/default.yaml +22 -0
  7. configs/callbacks/early_stopping.yaml +15 -0
  8. configs/callbacks/model_checkpoint.yaml +17 -0
  9. configs/callbacks/model_summary.yaml +5 -0
  10. configs/callbacks/none.yaml +0 -0
  11. configs/callbacks/rich_progress_bar.yaml +18 -0
  12. configs/data/multiview_dust3r.yaml +25 -0
  13. configs/debug/ddp_debug.yaml +48 -0
  14. configs/debug/default.yaml +35 -0
  15. configs/debug/fdr.yaml +9 -0
  16. configs/debug/limit.yaml +12 -0
  17. configs/debug/overfit.yaml +13 -0
  18. configs/debug/profiler.yaml +12 -0
  19. configs/eval.yaml +19 -0
  20. configs/experiment/stream3r/stream3r.yaml +125 -0
  21. configs/extras/default.yaml +8 -0
  22. configs/hparams_search/mnist_optuna.yaml +52 -0
  23. configs/hydra/default.yaml +19 -0
  24. configs/hydra/launcher/fair_a100.yaml +43 -0
  25. configs/local/.gitkeep +0 -0
  26. configs/logger/aim.yaml +28 -0
  27. configs/logger/comet.yaml +12 -0
  28. configs/logger/csv.yaml +7 -0
  29. configs/logger/many_loggers.yaml +9 -0
  30. configs/logger/mlflow.yaml +12 -0
  31. configs/logger/neptune.yaml +9 -0
  32. configs/logger/tensorboard.yaml +10 -0
  33. configs/logger/wandb.yaml +16 -0
  34. configs/model/stream3r.yaml +42 -0
  35. configs/paths/default.yaml +21 -0
  36. configs/train.yaml +49 -0
  37. configs/trainer/cpu.yaml +5 -0
  38. configs/trainer/ddp.yaml +12 -0
  39. configs/trainer/ddp_eval.yaml +16 -0
  40. configs/trainer/ddp_sim.yaml +7 -0
  41. configs/trainer/deepspeed_stage_2.yaml +9 -0
  42. configs/trainer/default.yaml +30 -0
  43. configs/trainer/gpu.yaml +5 -0
  44. configs/trainer/mps.yaml +5 -0
  45. eval/monodepth/eval_metrics.py +211 -0
  46. eval/monodepth/launch.py +146 -0
  47. eval/monodepth/metadata.py +187 -0
  48. eval/monodepth/run.sh +20 -0
  49. eval/monodepth/tools.py +399 -0
  50. eval/mv_recon/base.py +274 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
+ *.jpg filter=lfs diff=lfs merge=lfs -text
+ *.gif filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,13 @@
+ S-Lab License 1.0
+ Copyright 2025 S-Lab
+
+ Redistribution and use for non-commercial purpose in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+ 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ In the event that redistribution and/or use for commercial purpose in source or binary forms, with or without modification is required, please contact the contributor(s) of the work.
assets/pipeline.png ADDED

Git LFS Details

  • SHA256: 099a2c82e37b04878112826abcc85b02cf86e0bd059824dfe98e4a99782d6aac
  • Pointer size: 131 Bytes
  • Size of remote file: 655 kB
assets/teaser_dynamic.gif ADDED

Git LFS Details

  • SHA256: eb25ab7cf2e3dcff862a3e8e82657dbba7fc0cbc36856f315d2b6e25f9bb9d72
  • Pointer size: 132 Bytes
  • Size of remote file: 2.33 MB
configs/__init__.py ADDED
@@ -0,0 +1,7 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ # this file is needed here to include configs when building project as a package
configs/callbacks/default.yaml ADDED
@@ -0,0 +1,22 @@
+ defaults:
+   - model_checkpoint
+   - early_stopping
+   - model_summary
+   - rich_progress_bar
+   - _self_
+
+ model_checkpoint:
+   dirpath: ${paths.output_dir}/checkpoints
+   filename: "epoch_{epoch:03d}"
+   monitor: "val/loss"
+   mode: "min"
+   save_last: True
+   auto_insert_metric_name: False
+
+ early_stopping:
+   monitor: "val/loss"
+   patience: 100
+   mode: "min"
+
+ model_summary:
+   max_depth: -1
configs/callbacks/early_stopping.yaml ADDED
@@ -0,0 +1,15 @@
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.EarlyStopping.html
+
+ early_stopping:
+   _target_: lightning.pytorch.callbacks.EarlyStopping
+   monitor: ??? # quantity to be monitored, must be specified !!!
+   min_delta: 0. # minimum change in the monitored quantity to qualify as an improvement
+   patience: 3 # number of checks with no improvement after which training will be stopped
+   verbose: False # verbosity mode
+   mode: "min" # "max" means higher metric value is better, can be also "min"
+   strict: True # whether to crash the training if monitor is not found in the validation metrics
+   check_finite: True # when set True, stops training when the monitor becomes NaN or infinite
+   stopping_threshold: null # stop training immediately once the monitored quantity reaches this threshold
+   divergence_threshold: null # stop training as soon as the monitored quantity becomes worse than this threshold
+   check_on_train_epoch_end: null # whether to run early stopping at the end of the training epoch
+   # log_rank_zero_only: False # this keyword argument isn't available in stable version
configs/callbacks/model_checkpoint.yaml ADDED
@@ -0,0 +1,17 @@
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html
+
+ model_checkpoint:
+   _target_: lightning.pytorch.callbacks.ModelCheckpoint
+   dirpath: null # directory to save the model file
+   filename: null # checkpoint filename
+   monitor: 'val/loss' # name of the logged metric which determines when model is improving
+   verbose: False # verbosity mode
+   save_last: true # additionally always save an exact copy of the last checkpoint to a file last.ckpt
+   save_top_k: 1 # save k best models (determined by above metric)
+   mode: "min" # "max" means higher metric value is better, can be also "min"
+   auto_insert_metric_name: False # when True, the checkpoints filenames will contain the metric name
+   save_weights_only: False # if True, then only the model’s weights will be saved
+   every_n_train_steps: null # number of training steps between checkpoints
+   train_time_interval: null # checkpoints are monitored at the specified time interval
+   every_n_epochs: 20 # number of epochs between checkpoints
+   save_on_train_epoch_end: null # whether to run checkpointing at the end of the training epoch or the end of validation
configs/callbacks/model_summary.yaml ADDED
@@ -0,0 +1,5 @@
+ # https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.RichModelSummary.html
+
+ model_summary:
+   _target_: lightning.pytorch.callbacks.RichModelSummary
+   max_depth: 1 # the maximum depth of layer nesting that the summary will include
configs/callbacks/none.yaml ADDED
File without changes
configs/callbacks/rich_progress_bar.yaml ADDED
@@ -0,0 +1,18 @@
+ # https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.RichProgressBar.html
+
+ rich_progress_bar:
+   _target_: lightning.pytorch.callbacks.progress.rich_progress.RichProgressBar
+   refresh_rate: 1
+   leave: false
+   theme:
+     _target_: lightning.pytorch.callbacks.progress.rich_progress.RichProgressBarTheme
+     description: green_yellow
+     progress_bar: green1
+     progress_bar_finished: green1
+     progress_bar_pulse: "#6206E0"
+     batch_progress: green_yellow
+     time: blue
+     processing_speed: cyan
+     metrics: grey82
+     metrics_text_delimiter: " "
+     metrics_format: .4g
configs/data/multiview_dust3r.yaml ADDED
@@ -0,0 +1,25 @@
+ # Define the common data root and number of views
+ data_root: /path/to/dust3r_data
+ num_views: 4
+ num_views_val: 10
+
+ data_module:
+   _target_: stream3r.data.multiview_dust3r_datamodule.MultiViewDUSt3RDataModule
+   train_datasets:
+     - 80_000 @ Co3d_Multiview(split='train', num_views=${data.num_views}, window_degree_range=360, num_samples_per_window=100, ROOT='${data.data_root}/co3d_50_seqs_per_category_subset_processed', aug_crop=16, mask_bg='rand', resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)
+     - 80_000 @ MegaDepth_Multiview(split='train', num_views=${data.num_views}, window_size=${python_eval:"${data.num_views} * 2"}, num_samples_per_window=100, ROOT='${data.data_root}/megadepth_processed', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)
+     - 80_000 @ ScanNetpp_Multiview(split='train', num_views=${data.num_views}, window_size=${python_eval:"${data.num_views} * 2"}, num_samples_per_window=100, ROOT='${data.data_root}/scannetpp_processed', aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)
+     - 80_000 @ ARKitScenes_Multiview(split='train', num_views=${data.num_views}, num_samples_per_window=10, ROOT='${data.data_root}/arkitscenes_processed', aug_crop=256, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)
+     - 80_000 @ Habitat_Multiview(1_000_000, split='train', num_views=${data.num_views}, ROOT='${data.data_root}/habitat_processed', aug_crop=16, resolution=[(512, 384), (512, 336), (512, 288), (512, 256), (512, 160)], transform=ColorJitter)
+   validation_datasets:
+     - 100 @ Co3d_Multiview(split='test', num_views=${data.num_views_val}, window_degree_range=360, num_samples_per_window=100, ROOT='${data.data_root}/co3d_50_seqs_per_category_subset_processed', resolution=(512, 384), seed=777)
+     - 100 @ MegaDepth_Multiview(split='val', num_views=${data.num_views_val}, window_size=${python_eval:"${data.num_views_val} * 2"}, num_samples_per_window=100, ROOT='${data.data_root}/megadepth_processed', resolution=(512, 336), seed=777)
+     - 100 @ ScanNetpp_Multiview(split='train', num_views=${data.num_views_val}, window_size=${python_eval:"${data.num_views_val} * 2"}, num_samples_per_window=100, ROOT='${data.data_root}/scannetpp_processed', resolution=(512, 384), seed=777)
+     - 100 @ ARKitScenes_Multiview(split='train', num_views=${data.num_views_val}, num_samples_per_window=10, ROOT='${data.data_root}/arkitscenes_processed', resolution=(512, 384), seed=777)
+     - 100 @ Habitat_Multiview(100_000, split='val', num_views=${data.num_views_val}, ROOT='${data.data_root}/habitat_processed', resolution=(512,384), seed=777)
+   batch_size_per_device: 6
+   batch_size_per_device_val: 4
+   num_workers: 6
+   pin_memory: True
configs/debug/ddp_debug.yaml ADDED
@@ -0,0 +1,48 @@
+ # @package _global_
+
+ #use a smaller dataset for faster initializations
+ defaults:
+   - override /data: multiview_dust3r_tiny
+   - override /logger:
+       - csv
+       - wandb
+
+ # overwrite task name so debugging logs are stored in separate folder
+ task_name: "debug"
+
+ logger:
+   wandb:
+     name: ${paths.run_folder_name}
+
+ # ckpt_path: /some/random/path
+
+ extras:
+   ignore_warnings: False
+   enforce_tags: False
+
+ # sets level of all command line loggers to 'DEBUG'
+ # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/
+ hydra:
+   job_logging:
+     root:
+       level: DEBUG
+
+ model:
+   net:
+     random_image_idx_embedding: true
+
+ data:
+   num_views: 4
+   data_module:
+     num_workers: 0 # debuggers don't like multiprocessing
+     pin_memory: false # disable gpu memory pin
+     batch_size_per_device: 6
+
+ trainer:
+   log_every_n_steps: 1
+   devices: auto
+   # fast_dev_run: 1
+   limit_train_batches: 1
+   limit_val_batches: 10000
+   precision: 32
configs/debug/default.yaml ADDED
@@ -0,0 +1,35 @@
+ # @package _global_
+
+ # default debugging setup, runs 1 full epoch
+ # other debugging configs can inherit from this one
+
+ # overwrite task name so debugging logs are stored in separate folder
+ task_name: "debug"
+
+ # disable callbacks and loggers during debugging
+ callbacks: null
+ logger: null
+
+ extras:
+   ignore_warnings: False
+   enforce_tags: False
+
+ # sets level of all command line loggers to 'DEBUG'
+ # https://hydra.cc/docs/tutorials/basic/running_your_app/logging/
+ hydra:
+   job_logging:
+     root:
+       level: DEBUG
+
+ # use this to also set hydra loggers to 'DEBUG'
+ # verbose: True
+
+ trainer:
+   max_epochs: 1
+   accelerator: cpu # debuggers don't like gpus
+   devices: 1 # debuggers don't like multiprocessing
+   detect_anomaly: true # raise exception if NaN or +/-inf is detected in any tensor
+
+ data:
+   num_workers: 0 # debuggers don't like multiprocessing
+   pin_memory: False # disable gpu memory pin
configs/debug/fdr.yaml ADDED
@@ -0,0 +1,9 @@
+ # @package _global_
+
+ # runs 1 train, 1 validation and 1 test step
+
+ defaults:
+   - default
+
+ trainer:
+   fast_dev_run: true
configs/debug/limit.yaml ADDED
@@ -0,0 +1,12 @@
+ # @package _global_
+
+ # uses only 1% of the training data and 5% of validation/test data
+
+ defaults:
+   - default
+
+ trainer:
+   max_epochs: 3
+   limit_train_batches: 0.01
+   limit_val_batches: 0.05
+   limit_test_batches: 0.05
configs/debug/overfit.yaml ADDED
@@ -0,0 +1,13 @@
+ # @package _global_
+
+ # overfits to 3 batches
+
+ defaults:
+   - default
+
+ trainer:
+   max_epochs: 20
+   overfit_batches: 3
+
+ # model ckpt and early stopping need to be disabled during overfitting
+ callbacks: null
configs/debug/profiler.yaml ADDED
@@ -0,0 +1,12 @@
+ # @package _global_
+
+ # runs with execution time profiling
+
+ defaults:
+   - default
+
+ trainer:
+   max_epochs: 1
+   profiler: "simple"
+   # profiler: "advanced"
+   # profiler: "pytorch"
configs/eval.yaml ADDED
@@ -0,0 +1,19 @@
+ # @package _global_
+
+ defaults:
+   - _self_
+   - data: multiview_dust3r
+   - model: stream3r
+   - logger: many_loggers
+   - trainer: ddp_eval
+   - paths: default
+   - extras: default
+   - hydra: default
+   - eval: default
+
+ task_name: "eval"
+
+ tags: ["eval"]
+
+ # passing checkpoint path is necessary for evaluation
+ ckpt_path: ???
configs/experiment/stream3r/stream3r.yaml ADDED
@@ -0,0 +1,125 @@
+ # @package _global_
+
+ defaults:
+   - override /model: stream3r
+
+ # seed for random number generators in pytorch, numpy and python.random
+ seed: 42
+
+
+ tags: ["train", "stream3r"]
+
+ task_name: stream3r
+ slurm_job_id: 99999 # must set in the command line
+
+ # ckpt_path: /path/to/resume.ckpt # uncomment to resume training from a checkpoint
+
+ paths:
+   run_folder_name: ${task_name}_${slurm_job_id}
+
+ logger:
+   wandb:
+     name: ${task_name}_${slurm_job_id}
+     project: stream3r
+
+ data:
+   data_scaling: 1.0
+   data_root: /data
+   num_views: 24
+   resolution:
+     - [518, 392]
+     - [518, 378]
+     - [518, 336]
+     - [518, 294]
+     - [518, 252]
+     - [518, 210]
+     - [518, 140]
+     - [378, 518]
+     - [336, 518]
+     - [294, 518]
+     - [252, 518]
+     - [224, 224]
+   allow_repeat: true
+   n_corres_train: 0
+   data_module:
+     _target_: stream3r.data.multiview_dust3r_datamodule.MultiViewDUSt3RDataModule
+     pin_memory: true
+     num_workers: 16
+     num_workers_val: 1 # have to be a low number when using DeepSpeed ZeRO-2
+     batch_size_per_device: 1
+     batch_size_per_device_val: 1
+     train_datasets:
+       - 44800 @ Co3d_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT='${data.data_root}/processed_co3d', aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 56000 @ WildRGBD_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_wildrgbd_mp", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 22400 @ ARKitScenesHighRes_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_arkitscene_highres", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 38400 @ ScanNet_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_scannet/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 16800 @ ScanNetpp_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_scannetpp/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 84000 @ MapFree_Multi(allow_repeat=${data.allow_repeat}, split=None, ROOT="${data.data_root}/processed_mapfree/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 20000 @ Waymo_Multi(allow_repeat=${data.allow_repeat}, split=None, ROOT="${data.data_root}/processed_waymo/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 56000 @ TartanAir_Multi(allow_repeat=${data.allow_repeat}, split=None, ROOT="${data.data_root}/processed_tartanair/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 9400 @ Spring(allow_repeat=${data.allow_repeat}, split=None, ROOT="${data.data_root}/processed_spring/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 36000 @ BEDLAM_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_bedlam/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 28800 @ MP3D_Multi(allow_repeat=${data.allow_repeat}, split=None, ROOT="${data.data_root}/processed_mp3d/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 14400 @ UASOL_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_uasol/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 1400 @ MVS_Synth_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_mvs_synth", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 7200 @ PointOdyssey_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_pointodyssey", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 11200 @ HyperSim_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_hypersim_new", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 22400 @ BlendedMVS_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_blendedmvs/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 22400 @ MegaDepth_Multi(allow_repeat=${data.allow_repeat}, split="train", ROOT="${data.data_root}/processed_megadepth", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 5600 @ VirtualKITTI2_Multi(allow_repeat=${data.allow_repeat}, split=None, ROOT="${data.data_root}/processed_vkitti", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 168 @ UnReal4K_Multi(allow_repeat=${data.allow_repeat}, split=None, ROOT="${data.data_root}/processed_unreal4k/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 74000 @ DL3DV_Multi(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_dl3dv_ours_parts/processed_dl3dv_ours", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train}) +
+       - 36000 @ DynamicReplica(allow_repeat=${data.allow_repeat}, split='train', ROOT="${data.data_root}/processed_dynamic_replica/", aug_crop=16, resolution=${data.resolution}, transform=ColorJitter, num_views=${data.num_views}, n_corres=${data.n_corres_train})
+
+ model:
+   pretrained: weights/vggt/model.pt
+   net:
+     freeze: encoder
+
+   scheduler:
+     warmup_start_lr: 1e-6
+     warmup_epochs: 1
+
+   train_criterion:
+     _target_: stream3r.loss.losses.CausalLoss
+     gradient_loss: grad
+     is_metric: false
+
+   validation_criterion:
+     _target_: stream3r.loss.losses.CausalLoss
+     gradient_loss: grad
+     is_metric: false
+
+   optimizer:
+     _target_: torch.optim.AdamW
+     _partial_: true
+     lr: 1e-5
+     betas:
+       - 0.9
+       - 0.95
+     weight_decay: 0.05
+
+ trainer:
+   devices: auto
+   max_epochs: 500
+   accumulate_grad_batches: 4
+   strategy:
+     _target_: lightning.pytorch.strategies.DeepSpeedStrategy
+     timeout:
+       _target_: datetime.timedelta
+       minutes: 80
+   plugins: null
+   limit_val_batches: 0
+   precision: bf16-mixed
+   log_every_n_steps: 20
+
+ callbacks:
+   model_checkpoint:
+     every_n_train_steps: 2000
+     every_n_epochs: null
+     save_top_k: -1
+     filename: "{epoch:03d}-{step:08d}"
+     save_last: false
+     monitor: "train/loss"
+   early_stopping:
+     monitor: "train/loss"
configs/extras/default.yaml ADDED
@@ -0,0 +1,8 @@
+ # disable python warnings if they annoy you
+ ignore_warnings: False
+
+ # ask user for tags if none are provided in the config
+ enforce_tags: True
+
+ # pretty print config tree at the start of the run using Rich library
+ print_config: True
configs/hparams_search/mnist_optuna.yaml ADDED
@@ -0,0 +1,52 @@
+ # @package _global_
+
+ # example hyperparameter optimization of some experiment with Optuna:
+ # python train.py -m hparams_search=mnist_optuna experiment=example
+
+ defaults:
+   - override /hydra/sweeper: optuna
+
+ # choose metric which will be optimized by Optuna
+ # make sure this is the correct name of some metric logged in lightning module!
+ optimized_metric: "val/acc_best"
+
+ # here we define Optuna hyperparameter search
+ # it optimizes for value returned from function with @hydra.main decorator
+ # docs: https://hydra.cc/docs/next/plugins/optuna_sweeper
+ hydra:
+   mode: "MULTIRUN" # set hydra to multirun by default if this config is attached
+
+   sweeper:
+     _target_: hydra_plugins.hydra_optuna_sweeper.optuna_sweeper.OptunaSweeper
+
+     # storage URL to persist optimization results
+     # for example, you can use SQLite if you set 'sqlite:///example.db'
+     storage: null
+
+     # name of the study to persist optimization results
+     study_name: null
+
+     # number of parallel workers
+     n_jobs: 1
+
+     # 'minimize' or 'maximize' the objective
+     direction: maximize
+
+     # total number of runs that will be executed
+     n_trials: 20
+
+     # choose Optuna hyperparameter sampler
+     # you can choose bayesian sampler (tpe), random search (without optimization), grid sampler, and others
+     # docs: https://optuna.readthedocs.io/en/stable/reference/samplers.html
+     sampler:
+       _target_: optuna.samplers.TPESampler
+       seed: 1234
+       n_startup_trials: 10 # number of random sampling runs before optimization starts
+
+     # define hyperparameter search space
+     params:
+       model.optimizer.lr: interval(0.0001, 0.1)
+       data.batch_size: choice(32, 64, 128, 256)
+       model.net.lin1_size: choice(64, 128, 256)
+       model.net.lin2_size: choice(64, 128, 256)
+       model.net.lin3_size: choice(32, 64, 128, 256)
configs/hydra/default.yaml ADDED
@@ -0,0 +1,19 @@
+ # https://hydra.cc/docs/configure_hydra/intro/
+
+ # enable color logging
+ defaults:
+   - override hydra_logging: colorlog
+   - override job_logging: colorlog
+
+ # output directory, generated dynamically on each run
+ run:
+   dir: ${paths.log_dir}/${task_name}/runs/${paths.run_folder_name}
+ sweep:
+   dir: ${paths.log_dir}/${task_name}/multiruns/${paths.run_folder_name}
+   subdir: ${hydra.job.num}
+
+ job_logging:
+   handlers:
+     file:
+       # Incorporates fix from https://github.com/facebookresearch/hydra/pull/2242
+       filename: ${hydra.runtime.output_dir}/${task_name}.log
configs/hydra/launcher/fair_a100.yaml ADDED
@@ -0,0 +1,43 @@
+ defaults:
+   - submitit_slurm
+
+ # see: https://github.com/facebookresearch/hydra/blob/main/plugins/hydra_submitit_launcher/hydra_plugins/hydra_submitit_launcher/config.py
+ _target_: hydra_plugins.hydra_submitit_launcher.submitit_launcher.SlurmLauncher
+ submitit_folder: ${hydra.sweep.dir}/.submitit/%j
+ name: ${hydra.job.name}
+ timeout_min: 20160 # 14 days : 60 * 24 * 14
+ account: cortex
+ qos: cortex_high
+ comment: "multiview_dust3r experiment"
+ nodes: 1
+ gres: "gpu:8"
+ tasks_per_node: 8
+ cpus_per_task: 12
+ signal_delay_s: 120 # USR1 signal delay (seconds) before timeout
+ max_num_timeout: 0 # number of times the job can be restarted after timeout
+ array_parallelism: 256 # Maximum number of jobs running in parallel
+
+ # Useful to add parameters which are not currently available in the plugin.
+ # Eg: {"mail-user": "blublu@fb.com", "mail-type": "BEGIN"}
+ additional_parameters:
+   mail-user: "jianingy@meta.com"
+   mail-type: "BEGIN,END"
+   output: "/path/to/slurm_out/%x-%j.out"
+
+ setup: # A list of commands to run in sbatch befure running srun
+   - echo "Begin setting up env on head node ($HOSTNAME)..."
+   - echo $(env | grep SLURM)
+   - export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+   - export MASTER_PORT=9929
+   - export RDZV_ID=$SLURM_JOBID
+   - export OMP_NUM_THREADS=12
+   - . /path/to/miniforge3/etc/profile.d/conda.sh # activate conda
+   - conda activate dust3r
+   - cd /path/to/project # cd to the project directory
+   - export NCCL_DEBUG=INFO
+   - export PYTHONFAULTHANDLER=1
+   - export TORCH_DISTRIBUTED_DEBUG=INFO
+   - echo "env setup on head node ($HOSTNAME) finished, starting srun..."
+
+ srun_args:
+   - "--cpu-bind=none" # This is critical to ensure dataloaders uses all CPUs!
configs/local/.gitkeep ADDED
File without changes
configs/logger/aim.yaml ADDED
@@ -0,0 +1,28 @@
+ # https://aimstack.io/
+
+ # example usage in lightning module:
+ # https://github.com/aimhubio/aim/blob/main/examples/pytorch_lightning_track.py
+
+ # open the Aim UI with the following command (run in the folder containing the `.aim` folder):
+ # `aim up`
+
+ aim:
+   _target_: aim.pytorch_lightning.AimLogger
+   repo: ${paths.root_dir} # .aim folder will be created here
+   # repo: "aim://ip_address:port" # can instead provide IP address pointing to Aim remote tracking server which manages the repo, see https://aimstack.readthedocs.io/en/latest/using/remote_tracking.html#
+
+   # aim allows to group runs under experiment name
+   experiment: null # any string, set to "default" if not specified
+
+   train_metric_prefix: "train/"
+   val_metric_prefix: "val/"
+   test_metric_prefix: "test/"
+
+   # sets the tracking interval in seconds for system usage metrics (CPU, GPU, memory, etc.)
+   system_tracking_interval: 10 # set to null to disable system metrics tracking
+
+   # enable/disable logging of system params such as installed packages, git info, env vars, etc.
+   log_system_params: true
+
+   # enable/disable tracking console logs (default value is true)
+   capture_terminal_logs: false # set to false to avoid infinite console log loop issue https://github.com/aimhubio/aim/issues/2550
configs/logger/comet.yaml ADDED
@@ -0,0 +1,12 @@
+ # https://www.comet.ml
+
+ comet:
+   _target_: lightning.pytorch.loggers.comet.CometLogger
+   api_key: ${oc.env:COMET_API_TOKEN} # api key is loaded from environment variable
+   save_dir: "${paths.output_dir}"
+   project_name: "lightning-hydra-template"
+   rest_api_key: null
+   # experiment_name: ""
+   experiment_key: null # set to resume experiment
+   offline: False
+   prefix: ""
configs/logger/csv.yaml ADDED
@@ -0,0 +1,7 @@
+ # csv logger built in lightning
+
+ csv:
+   _target_: lightning.pytorch.loggers.csv_logs.CSVLogger
+   save_dir: "${paths.output_dir}"
+   name: "csv/"
+   prefix: ""
configs/logger/many_loggers.yaml ADDED
@@ -0,0 +1,9 @@
+ # train with many loggers at once
+
+ defaults:
+   # - comet
+   - csv
+   # - mlflow
+   # - neptune
+   - tensorboard
+   - wandb
configs/logger/mlflow.yaml ADDED
@@ -0,0 +1,12 @@
+ # https://mlflow.org
+
+ mlflow:
+   _target_: lightning.pytorch.loggers.mlflow.MLFlowLogger
+   # experiment_name: ""
+   # run_name: ""
+   tracking_uri: ${paths.log_dir}/mlflow/mlruns # run `mlflow ui` command inside the `logs/mlflow/` dir to open the UI
+   tags: null
+   # save_dir: "./mlruns"
+   prefix: ""
+   artifact_location: null
+   # run_id: ""
configs/logger/neptune.yaml ADDED
@@ -0,0 +1,9 @@
+ # https://neptune.ai
+
+ neptune:
+   _target_: lightning.pytorch.loggers.neptune.NeptuneLogger
+   api_key: ${oc.env:NEPTUNE_API_TOKEN} # api key is loaded from environment variable
+   project: username/lightning-hydra-template
+   # name: ""
+   log_model_checkpoints: True
+   prefix: ""
configs/logger/tensorboard.yaml ADDED
@@ -0,0 +1,10 @@
+ # https://www.tensorflow.org/tensorboard/
+
+ tensorboard:
+   _target_: lightning.pytorch.loggers.tensorboard.TensorBoardLogger
+   save_dir: "${paths.output_dir}/tensorboard/"
+   name: null
+   log_graph: False
+   default_hp_metric: True
+   prefix: ""
+   # version: ""
configs/logger/wandb.yaml ADDED
@@ -0,0 +1,16 @@
+ # https://wandb.ai
+
+ wandb:
+   _target_: lightning.pytorch.loggers.wandb.WandbLogger
+   name: null # name of the run (normally generated by wandb)
+   save_dir: "${paths.output_dir}"
+   offline: False
+   id: null # pass correct id to resume experiment!
+   anonymous: null # enable anonymous logging
+   project: "stream3r"
+   log_model: False # upload lightning ckpts
+   prefix: "" # a string to put at the beginning of metric keys
+   # entity: "" # set to name of your wandb team
+   group: ""
+   tags: []
+   job_type: ""
configs/model/stream3r.yaml ADDED
@@ -0,0 +1,42 @@
+ _target_: stream3r.models.multiview_dust3r_module.MultiViewDUSt3RLitModule
+
+ pretrained: null
+ resume_from_checkpoint: ${ckpt_path}
+
+ eval_use_pts3d_from_local_head: true
+
+ train_criterion:
+   _target_: stream3r.loss.losses.CausalLoss
+
+ validation_criterion:
+   _target_: stream3r.loss.losses.CausalLoss
+
+ optimizer:
+   _target_: torch.optim.AdamW
+   _partial_: true
+   lr: 1e-4
+   betas:
+     - 0.9
+     - 0.95
+   weight_decay: 0.05
+
+ # scheduler:
+ #   _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
+ #   _partial_: true
+ #   mode: min
+ #   factor: 0.1
+ #   patience: 10
+
+ scheduler:
+   _target_: pl_bolts.optimizers.lr_scheduler.LinearWarmupCosineAnnealingLR
+   _partial_: true
+   warmup_epochs: 10
+   max_epochs: ${trainer.max_epochs}
+   eta_min: 1e-06
+
+ net:
+   _target_: stream3r.models.stream3r.STream3R
+
+
+ # compile model for faster training with pytorch 2.0
+ compile: false
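
The optimizer and scheduler blocks above use Hydra's `_partial_: true`, so instantiating them yields `functools.partial` objects that the LightningModule can later bind to the network's parameters. A minimal sketch of that standard Hydra pattern (the tiny config and `torch.nn.Linear` stand-in below are illustrative, not code from this commit):

```python
import torch
from omegaconf import OmegaConf
from hydra.utils import instantiate

# Illustrative config mirroring the optimizer block above.
cfg = OmegaConf.create({
    "optimizer": {
        "_target_": "torch.optim.AdamW",
        "_partial_": True,
        "lr": 1e-4,
        "betas": [0.9, 0.95],
        "weight_decay": 0.05,
    }
})

model = torch.nn.Linear(8, 8)  # stand-in for the STream3R network
opt_partial = instantiate(cfg.optimizer)        # functools.partial(AdamW, lr=1e-4, ...)
optimizer = opt_partial(params=model.parameters())
print(type(optimizer).__name__)                 # AdamW
```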
configs/paths/default.yaml ADDED
@@ -0,0 +1,21 @@
+ # path to root directory
+ # this requires PROJECT_ROOT environment variable to exist
+ # you can replace it with "." if you want the root to be the current working directory
+ # root_dir: R${oc.env:PROJECT_ROOT}
+ root_dir: .
+
+ # path to data directory
+ data_dir: ${paths.root_dir}/data/
+
+ # path to logging directory
+ log_dir: ${paths.root_dir}/logs/
+
+ # path to output directory, created dynamically by hydra
+ # path generation pattern is specified in `configs/hydra/default.yaml`
+ # use it to store all files generated during the run, like ckpts and metrics
+ output_dir: ${hydra:runtime.output_dir}
+
+ # path to working directory
+ work_dir: ${hydra:runtime.cwd}
+
+ run_folder_name: ${now:%Y-%m-%d}_${now:%H-%M-%S}
configs/train.yaml ADDED
@@ -0,0 +1,49 @@
+ # @package _global_
+
+ # specify here default configuration
+ # order of defaults determines the order in which configs override each other
+ defaults:
+   - _self_
+   - data: multiview_dust3r
+   - model: stream3r
+   - callbacks: default
+   - logger: many_loggers # set logger here or use command line (e.g. `python train.py logger=tensorboard`)
+   - trainer: ddp
+   - paths: default
+   - extras: default
+   - hydra: default
+
+   # experiment configs allow for version control of specific hyperparameters
+   # e.g. best hyperparameters for given model and datamodule
+   - experiment: null
+
+   # config for hyperparameter optimization
+   - hparams_search: null
+
+   # optional local config for machine/user specific settings
+   # it's optional since it doesn't need to exist and is excluded from version control
+   - optional local: default
+
+   # debugging config (enable through command line, e.g. `python train.py debug=default)
+   - debug: null
+
+ # task name, determines output directory path
+ task_name: "train"
+
+ # tags to help you identify your experiments
+ # you can overwrite this in experiment configs
+ # overwrite from command line with `python train.py tags="[first_tag, second_tag]"`
+ tags: ["dev"]
+
+ # set False to skip model training
+ train: True
+
+ # evaluate on test set, using best model weights achieved during training
+ # lightning chooses best weights based on the metric specified in checkpoint callback
+ test: True
+
+ # simply provide checkpoint path to resume training
+ ckpt_path: null
+
+ # seed for random number generators in pytorch, numpy and python.random
+ seed: 42
configs/trainer/cpu.yaml ADDED
@@ -0,0 +1,5 @@
+ defaults:
+   - default
+
+ accelerator: cpu
+ devices: 1
configs/trainer/ddp.yaml ADDED
@@ -0,0 +1,12 @@
+ defaults:
+   - default
+
+ # strategy: ddp
+ strategy: ddp_find_unused_parameters_true
+
+ accelerator: gpu
+ devices: auto
+ num_nodes: 1
+ sync_batchnorm: true
+
+ use_distributed_sampler: false
configs/trainer/ddp_eval.yaml ADDED
@@ -0,0 +1,16 @@
+ defaults:
+   - default
+
+ # strategy: ddp
+ strategy:
+   _target_: lightning.pytorch.strategies.DDPStrategy
+   timeout:
+     _target_: datetime.timedelta
+     minutes: 30
+
+ accelerator: gpu
+ devices: auto
+ num_nodes: 1
+ sync_batchnorm: true
+
+ use_distributed_sampler: false
configs/trainer/ddp_sim.yaml ADDED
@@ -0,0 +1,7 @@
+ defaults:
+   - default
+
+ # simulate DDP on CPU, useful for debugging
+ accelerator: cpu
+ devices: 2
+ strategy: ddp_spawn
configs/trainer/deepspeed_stage_2.yaml ADDED
@@ -0,0 +1,9 @@
+ defaults:
+   - default
+
+ # strategy: deepspeed_stage_2
+ strategy: deepspeed_stage_2
+
+ accelerator: gpu
+ devices: auto
+ num_nodes: 1
configs/trainer/default.yaml ADDED
@@ -0,0 +1,30 @@
+ _target_: lightning.pytorch.trainer.Trainer
+ _convert_: partial
+
+ default_root_dir: ${paths.output_dir}
+
+ min_epochs: 1 # prevents early stopping
+ max_epochs: 100
+
+ accelerator: cpu
+ devices: 1
+
+ # mixed precision for extra speed-up
+ # precision: 16
+
+ # perform a validation loop every N training epochs
+ check_val_every_n_epoch: 1
+
+ # set True to to ensure deterministic results
+ # makes training slower but gives more reproducibility than just setting seeds
+ deterministic: False
+
+ plugins:
+   - _target_: lightning.pytorch.plugins.environments.SLURMEnvironment
+     auto_requeue: true # auto-resubmit the job when it is preempted by slurm
+     requeue_signal: ${python_eval:"signal.SIGUSR1"} # singal code is platform dependent, so it has to be decided at runtime
+     # requeue_signal:
+     #   _target_: signal.Signals
+     #   _args_:
+     #     - 10 # SIGUSR1, see: https://chromium.googlesource.com/chromiumos/docs/+/master/constants/signals.md
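
The `${python_eval:...}` interpolation used above (and in the data configs) is not a built-in Hydra or OmegaConf resolver; the repo presumably registers a custom resolver with that name before composing configs. A minimal sketch of how such a resolver could be registered (the `eval`-based implementation is an assumption, not code from this commit):

```python
import signal  # imported so expressions like "signal.SIGUSR1" can resolve
from omegaconf import OmegaConf

# Hypothetical registration of the "python_eval" resolver referenced in the configs.
if not OmegaConf.has_resolver("python_eval"):
    OmegaConf.register_new_resolver("python_eval", lambda expr: eval(expr))

cfg = OmegaConf.create({"requeue_signal": '${python_eval:"signal.SIGUSR1"}'})
print(cfg.requeue_signal)  # Signals.SIGUSR1
```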
configs/trainer/gpu.yaml ADDED
@@ -0,0 +1,5 @@
+ defaults:
+   - default
+
+ accelerator: gpu
+ devices: 1
configs/trainer/mps.yaml ADDED
@@ -0,0 +1,5 @@
+ defaults:
+   - default
+
+ accelerator: mps
+ devices: 1
eval/monodepth/eval_metrics.py ADDED
@@ -0,0 +1,211 @@
+ import os
+ import sys
+
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+ from eval.monodepth.tools import depth_evaluation
+ import numpy as np
+ import json
+ from tqdm import tqdm
+ import glob
+ import cv2
+ from eval.monodepth.metadata import dataset_metadata
+ import argparse
+ from PIL import Image
+
+ TAG_FLOAT = 202021.25
+
+
+ def depth_read_sintel(filename):
+     """Read depth data from file, return as numpy array."""
+     f = open(filename, "rb")
+     check = np.fromfile(f, dtype=np.float32, count=1)[0]
+     assert (
+         check == TAG_FLOAT
+     ), " depth_read:: Wrong tag in flow file (should be: {0}, is: {1}). Big-endian machine? ".format(
+         TAG_FLOAT, check
+     )
+     width = np.fromfile(f, dtype=np.int32, count=1)[0]
+     height = np.fromfile(f, dtype=np.int32, count=1)[0]
+     size = width * height
+     assert (
+         width > 0 and height > 0 and size > 1 and size < 100000000
+     ), " depth_read:: Wrong input size (width = {0}, height = {1}).".format(
+         width, height
+     )
+     depth = np.fromfile(f, dtype=np.float32, count=-1).reshape((height, width))
+     return depth
+
+
+ def depth_read_bonn(filename):
+     # loads depth map D from png file
+     # and returns it as a numpy array
+     depth_png = np.asarray(Image.open(filename))
+     # make sure we have a proper 16bit depth map here.. not 8bit!
+     assert np.max(depth_png) > 255
+     depth = depth_png.astype(np.float64) / 5000.0
+     depth[depth_png == 0] = -1.0
+     return depth
+
+
+ def depth_read_kitti(filename):
+     # loads depth map D from png file
+     # and returns it as a numpy array,
+     # for details see readme.txt
+     img_pil = Image.open(filename)
+     depth_png = np.array(img_pil, dtype=int)
+     # make sure we have a proper 16bit depth map here.. not 8bit!
+     assert np.max(depth_png) > 255
+
+     depth = depth_png.astype(float) / 256.0
+     depth[depth_png == 0] = -1.0
+     return depth
+
+
+ def get_gt_depth(filename, dataset):
+     if dataset == "sintel":
+         return depth_read_sintel(filename)
+     elif dataset == "bonn":
+         return depth_read_bonn(filename)
+     elif dataset == "kitti":
+         return depth_read_kitti(filename)
+     elif dataset == "nyu":
+         return np.load(filename)
+     else:
+         raise NotImplementedError
+
+
+ def get_args_parser():
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default="",
+         help="value for outdir",
+     )
+     parser.add_argument(
+         "--eval_dataset", type=str, default="nyu", choices=list(dataset_metadata.keys())
+     )
+     return parser
+
+
+ def main(args):
+     if args.eval_dataset == "nyu":
+         depth_pathes = glob.glob("data/nyu-v2/val/nyu_depths/*.npy")
+         depth_pathes = sorted(depth_pathes)
+         pred_pathes = glob.glob(
+             f"{args.output_dir}/*.npy"
+         )  # TODO: update the path to your prediction
+         pred_pathes = sorted(pred_pathes)
+     elif args.eval_dataset == "sintel":
+         pred_pathes = glob.glob(
+             f"{args.output_dir}/*/*.npy"
+         )  # TODO: update the path to your prediction
+         pred_pathes = sorted(pred_pathes)
+         full = len(pred_pathes) > 643
+         if full:
+             depth_pathes = glob.glob(f"data/sintel/training/depth/*/*.dpt")
+             depth_pathes = sorted(depth_pathes)
+         else:
+             seq_list = [
+                 "alley_2",
+                 "ambush_4",
+                 "ambush_5",
+                 "ambush_6",
+                 "cave_2",
+                 "cave_4",
+                 "market_2",
+                 "market_5",
+                 "market_6",
+                 "shaman_3",
+                 "sleeping_1",
+                 "sleeping_2",
+                 "temple_2",
+                 "temple_3",
+             ]
+             depth_pathes_folder = [
+                 f"data/sintel/training/depth/{seq}" for seq in seq_list
+             ]
+             depth_pathes = []
+             for depth_pathes_folder_i in depth_pathes_folder:
+                 depth_pathes += glob.glob(depth_pathes_folder_i + "/*.dpt")
+             depth_pathes = sorted(depth_pathes)
+     elif args.eval_dataset == "bonn":
+         seq_list = ["balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous"]
+         img_pathes_folder = [
+             f"data/bonn/rgbd_bonn_dataset/rgbd_bonn_{seq}/rgb_110/*.png"
+             for seq in seq_list
+         ]
+         img_pathes = []
+         for img_pathes_folder_i in img_pathes_folder:
+             img_pathes += glob.glob(img_pathes_folder_i)
+         img_pathes = sorted(img_pathes)
+         depth_pathes_folder = [
+             f"data/bonn/rgbd_bonn_dataset/rgbd_bonn_{seq}/depth_110/*.png"
+             for seq in seq_list
+         ]
+         depth_pathes = []
+         for depth_pathes_folder_i in depth_pathes_folder:
+             depth_pathes += glob.glob(depth_pathes_folder_i)
+         depth_pathes = sorted(depth_pathes)
+         pred_pathes = glob.glob(
+             f"{args.output_dir}/*/*.npy"
+         )  # TODO: update the path to your prediction
+         pred_pathes = sorted(pred_pathes)
+     elif args.eval_dataset == "kitti":
+         depth_pathes = glob.glob(
+             "data/kitti/depth_selection/val_selection_cropped/groundtruth_depth_gathered/*/*.png"
+         )
+         depth_pathes = sorted(depth_pathes)
+         pred_pathes = glob.glob(
+             f"{args.output_dir}/*/*depth.npy"
+         )  # TODO: update the path to your prediction
+         pred_pathes = sorted(pred_pathes)
+     else:
+         raise NotImplementedError
+
+     gathered_depth_metrics = []
+     for idx in tqdm(range(len(depth_pathes))):
+         pred_depth = np.load(pred_pathes[idx])
+         gt_depth = get_gt_depth(depth_pathes[idx], args.eval_dataset)
+         pred_depth = cv2.resize(
+             pred_depth,
+             (gt_depth.shape[1], gt_depth.shape[0]),
+             interpolation=cv2.INTER_CUBIC,
+         )
+         if args.eval_dataset == "nyu":
+             depth_results, error_map, depth_predict, depth_gt = depth_evaluation(
+                 pred_depth, gt_depth, max_depth=None, lr=1e-3
+             )
+         elif args.eval_dataset == "sintel":
+             depth_results, error_map, depth_predict, depth_gt = depth_evaluation(
+                 pred_depth, gt_depth, max_depth=70, use_gpu=True, post_clip_max=70
+             )
+         elif args.eval_dataset == "bonn":
+             depth_results, error_map, depth_predict, depth_gt = depth_evaluation(
+                 pred_depth, gt_depth, max_depth=70, use_gpu=True
+             )
+         elif args.eval_dataset == "kitti":
+             depth_results, error_map, depth_predict, depth_gt = depth_evaluation(
+                 pred_depth, gt_depth, max_depth=None, use_gpu=True
+             )
+         gathered_depth_metrics.append(depth_results)
+
+     depth_log_path = os.path.join(args.output_dir, "metric.json")
+     average_metrics = {
+         key: np.average(
+             [metrics[key] for metrics in gathered_depth_metrics],
+             weights=[metrics["valid_pixels"] for metrics in gathered_depth_metrics],
+         )
+         for key in gathered_depth_metrics[0].keys()
+         if key != "valid_pixels"
+     }
+     print(f"{args.eval_dataset} - Average depth evaluation metrics:", average_metrics)
+     with open(depth_log_path, "w") as f:
+         f.write(json.dumps(average_metrics))
+
+
+ if __name__ == "__main__":
+     args = get_args_parser()
+     args = args.parse_args()
+     main(args)
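
`depth_evaluation` is imported from `eval/monodepth/tools.py`, whose body is cut off in this view. For orientation only: monodepth benchmarks of this kind typically align the prediction to the ground truth with a least-squares scale and shift over valid pixels before computing absolute relative error and the δ<1.25 accuracy. A self-contained sketch of that common procedure (an assumption, not the repo's actual implementation):

```python
import numpy as np

def align_and_score(pred: np.ndarray, gt: np.ndarray, max_depth=None):
    """Least-squares scale/shift alignment plus standard depth metrics (illustrative only)."""
    valid = gt > 0
    if max_depth is not None:
        valid &= gt < max_depth
    p, g = pred[valid].astype(np.float64), gt[valid].astype(np.float64)
    # Solve min_{s,t} || s * p + t - g ||^2 over the valid pixels.
    A = np.stack([p, np.ones_like(p)], axis=1)
    (s, t), *_ = np.linalg.lstsq(A, g, rcond=None)
    p_aligned = np.clip(s * p + t, 1e-6, None)
    abs_rel = np.mean(np.abs(p_aligned - g) / g)
    delta1 = np.mean(np.maximum(p_aligned / g, g / p_aligned) < 1.25)
    return {"Abs Rel": abs_rel, "delta < 1.25": delta1, "valid_pixels": int(valid.sum())}
```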
eval/monodepth/launch.py ADDED
@@ -0,0 +1,146 @@
+ import torch
+ import numpy as np
+ import matplotlib
+ import numpy as np
+ import cv2
+ import argparse
+ from pathlib import Path
+ from tqdm import tqdm
+ import os
+ import sys
+
+ from stream3r.models.stream3r import STream3R
+ from stream3r.dust3r.utils.device import collate_with_cat
+ from stream3r.dust3r.utils.image import load_images_for_eval as load_images
+ from stream3r.utils.utils import ImgDust3r2Stream3r
+
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
+ from eval.monodepth.metadata import dataset_metadata
+
+
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+ # avoid high cpu usage
+ os.environ["OMP_NUM_THREADS"] = "1"
+ os.environ["MKL_NUM_THREADS"] = "1"
+ os.environ["NUMEXPR_NUM_THREADS"] = "1"
+ os.environ["OPENBLAS_NUM_THREADS"] = "1"
+ torch.set_num_threads(1)
+ # ===========================================
+
+
+ def colorize_depth(depth: np.ndarray,
+                    mask: np.ndarray = None,
+                    normalize: bool = True,
+                    cmap: str = 'Spectral') -> np.ndarray:
+     if mask is None:
+         depth = np.where(depth > 0, depth, np.nan)
+     else:
+         depth = np.where((depth > 0) & mask, depth, np.nan)
+     disp = 1 / depth
+     if normalize:
+         min_disp, max_disp = np.nanquantile(disp,
+                                             0.001), np.nanquantile(disp, 0.99)
+         disp = (disp - min_disp) / (max_disp - min_disp)
+     colored = np.nan_to_num(matplotlib.colormaps[cmap](1.0 - disp)[..., :3], 0)
+     colored = np.ascontiguousarray((colored.clip(0, 1) * 255).astype(np.uint8))
+     return colored
+
+
+ def get_args_parser():
+     parser = argparse.ArgumentParser()
+
+     parser.add_argument("--device",
+                         type=str,
+                         default="cuda",
+                         help="pytorch device")
+     parser.add_argument("--output_dir",
+                         type=str,
+                         default="",
+                         help="value for outdir")
+     parser.add_argument("--no_crop",
+                         type=bool,
+                         default=True,
+                         help="whether to crop input data")
+     parser.add_argument("--full_seq",
+                         type=bool,
+                         default=False,
+                         help="whether to use all seqs")
+     parser.add_argument("--seq_list", default=None)
+
+     parser.add_argument("--eval_dataset",
+                         type=str,
+                         default="nyu",
+                         choices=list(dataset_metadata.keys()))
+     return parser
+
+
+ def eval_mono_depth_estimation(args, model, device):
+     metadata = dataset_metadata.get(args.eval_dataset)
+     if metadata is None:
+         raise ValueError(f"Unknown dataset: {args.eval_dataset}")
+
+     img_path = metadata.get("img_path")
+     if "img_path_func" in metadata:
+         img_path = metadata["img_path_func"](args)
+
+     process_func = metadata.get("process_func")
+     if process_func is None:
+         raise ValueError(
+             f"No processing function defined for dataset: {args.eval_dataset}")
+
+     for filelist, save_dir in process_func(args, img_path):
+         Path(save_dir).mkdir(parents=True, exist_ok=True)
+         eval_mono_depth(args, model, device, filelist, save_dir=save_dir)
+
+
+ def eval_mono_depth(args, model, device, filelist, save_dir=None):
+     for file in tqdm(filelist):
+         file = [file]
+         images = load_images(
+             file,
+             size=518,
+             verbose=True,
+             crop=False,
+             patch_size=14,
+         )
+
+         images = collate_with_cat([tuple(images)])
+         images = torch.stack([view["img"] for view in images], dim=1)
+         images = ImgDust3r2Stream3r(images).to(device)
+
+         with torch.no_grad():
+             predictions = model(images)
+
+         depth_map = predictions['depth'][0,0].squeeze(-1).cpu()
+
+         if save_dir is not None:
+             # save the depth map to the save_dir as npy
+             np.save(
+                 f"{save_dir}/{file[0].split('/')[-1].replace('.png','depth.npy')}",
+                 depth_map.cpu().numpy(),
+             )
+             depth_map = colorize_depth(depth_map)
+             cv2.imwrite(
+                 f"{save_dir}/{file[0].split('/')[-1].replace('.png','depth.jpg')}",
+                 depth_map,
+             )
+
+
+ def main():
+     args = get_args_parser()
+     args = args.parse_args()
+
+     if args.eval_dataset == "sintel":
+         args.full_seq = True
+     else:
+         args.full_seq = False
+
+     model = STream3R.from_pretrained("yslan/STream3R").to(args.device)
+     model.eval()
+
+     eval_mono_depth_estimation(args, model, args.device)
+
+
+ if __name__ == "__main__":
+     main()
eval/monodepth/metadata.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ from tqdm import tqdm
4
+
5
+ # Define the merged dataset metadata dictionary
6
+ dataset_metadata = {
7
+ "sun_rgbd": {
8
+ "img_path": "data/sun_rgbd/image/test",
9
+ "mask_path": None,
10
+ },
11
+ "davis": {
12
+ "img_path": "data/davis/DAVIS/JPEGImages/480p",
13
+ "mask_path": "data/davis/DAVIS/masked_images/480p",
14
+ "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
15
+ "gt_traj_func": lambda img_path, anno_path, seq: None,
16
+ "traj_format": None,
17
+ "seq_list": None,
18
+ "full_seq": True,
19
+ "mask_path_seq_func": lambda mask_path, seq: os.path.join(mask_path, seq),
20
+ "skip_condition": None,
21
+ "process_func": None, # Not used in mono depth estimation
22
+ },
23
+ "kitti": {
24
+ "img_path": "data/kitti/depth_selection/val_selection_cropped/image_gathered", # Default path
25
+ "mask_path": None,
26
+ "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
27
+ "gt_traj_func": lambda img_path, anno_path, seq: None,
28
+ "traj_format": None,
29
+ "seq_list": None,
30
+ "full_seq": True,
31
+ "mask_path_seq_func": lambda mask_path, seq: None,
32
+ "skip_condition": None,
33
+ "process_func": lambda args, img_path: process_kitti(args, img_path),
34
+ },
35
+ "bonn": {
36
+ "img_path": "data/bonn/rgbd_bonn_dataset",
37
+ "mask_path": None,
38
+ "dir_path_func": lambda img_path, seq: os.path.join(
39
+ img_path, f"rgbd_bonn_{seq}", "rgb_110"
40
+ ),
41
+ "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
42
+ img_path, f"rgbd_bonn_{seq}", "groundtruth_110.txt"
43
+ ),
44
+ "traj_format": "tum",
45
+ "seq_list": ["balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous"],
46
+ "full_seq": False,
47
+ "mask_path_seq_func": lambda mask_path, seq: None,
48
+ "skip_condition": None,
49
+ "process_func": lambda args, img_path: process_bonn(args, img_path),
50
+ },
51
+ "nyu": {
52
+ "img_path": "data/nyu-v2/val/nyu_images",
53
+ "mask_path": None,
54
+ "process_func": lambda args, img_path: process_nyu(args, img_path),
55
+ },
56
+ "scannet": {
57
+ "img_path": "data/scannetv2",
58
+ "mask_path": None,
59
+ "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "color_90"),
60
+ "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
61
+ img_path, seq, "pose_90.txt"
62
+ ),
63
+ "traj_format": "replica",
64
+ "seq_list": None,
65
+ "full_seq": True,
66
+ "mask_path_seq_func": lambda mask_path, seq: None,
67
+ "skip_condition": None, # lambda save_dir, seq: os.path.exists(os.path.join(save_dir, seq)),
68
+ "process_func": lambda args, img_path: process_scannet(args, img_path),
69
+ },
70
+ "tum": {
71
+ "img_path": "data/tum",
72
+ "mask_path": None,
73
+ "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq, "rgb_90"),
74
+ "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(
75
+ img_path, seq, "groundtruth_90.txt"
76
+ ),
77
+ "traj_format": "tum",
78
+ "seq_list": None,
79
+ "full_seq": True,
80
+ "mask_path_seq_func": lambda mask_path, seq: None,
81
+ "skip_condition": None,
82
+ "process_func": None,
83
+ },
84
+ "sintel": {
85
+ "img_path": "data/sintel/training/final",
86
+ "anno_path": "data/sintel/training/camdata_left",
87
+ "mask_path": None,
88
+ "dir_path_func": lambda img_path, seq: os.path.join(img_path, seq),
89
+ "gt_traj_func": lambda img_path, anno_path, seq: os.path.join(anno_path, seq),
90
+ "traj_format": None,
91
+ "seq_list": [
92
+ "alley_2",
93
+ "ambush_4",
94
+ "ambush_5",
95
+ "ambush_6",
96
+ "cave_2",
97
+ "cave_4",
98
+ "market_2",
99
+ "market_5",
100
+ "market_6",
101
+ "shaman_3",
102
+ "sleeping_1",
103
+ "sleeping_2",
104
+ "temple_2",
105
+ "temple_3",
106
+ ],
107
+ "full_seq": False,
108
+ "mask_path_seq_func": lambda mask_path, seq: None,
109
+ "skip_condition": None,
110
+ "process_func": lambda args, img_path: process_sintel(args, img_path),
111
+ },
112
+ }
113
+
114
+
115
+ # Define processing functions for each dataset
116
+ def process_kitti(args, img_path):
117
+ for dir in tqdm(sorted(glob.glob(f"{img_path}/*"))):
118
+ filelist = sorted(glob.glob(f"{dir}/*.png"))
119
+ save_dir = f"{args.output_dir}/{os.path.basename(dir)}"
120
+ yield filelist, save_dir
121
+
122
+
123
+ def process_bonn(args, img_path):
124
+ if args.full_seq:
125
+ for dir in tqdm(sorted(glob.glob(f"{img_path}/*/"))):
126
+ filelist = sorted(glob.glob(f"{dir}/rgb/*.png"))
127
+ save_dir = f"{args.output_dir}/{os.path.basename(os.path.dirname(dir))}"
128
+ yield filelist, save_dir
129
+ else:
130
+ seq_list = (
131
+ ["balloon2", "crowd2", "crowd3", "person_tracking2", "synchronous"]
132
+ if args.seq_list is None
133
+ else args.seq_list
134
+ )
135
+ for seq in tqdm(seq_list):
136
+ filelist = sorted(glob.glob(f"{img_path}/rgbd_bonn_{seq}/rgb_110/*.png"))
137
+ save_dir = f"{args.output_dir}/{seq}"
138
+ yield filelist, save_dir
139
+
140
+
141
+ def process_sunrgbd(args, img_path):
142
+ filelist = sorted(glob.glob(f"{img_path}/*.jpg"))
143
+ save_dir = f"{args.output_dir}"
144
+ yield filelist, save_dir
145
+
146
+
147
+ def process_nyu(args, img_path):
148
+ filelist = sorted(glob.glob(f"{img_path}/*.png"))
149
+ save_dir = f"{args.output_dir}"
150
+ yield filelist, save_dir
151
+
152
+
153
+ def process_scannet(args, img_path):
154
+ seq_list = sorted(glob.glob(f"{img_path}/*"))
155
+ for seq in tqdm(seq_list):
156
+ filelist = sorted(glob.glob(f"{seq}/color_90/*.jpg"))
157
+ save_dir = f"{args.output_dir}/{os.path.basename(seq)}"
158
+ yield filelist, save_dir
159
+
160
+
161
+ def process_sintel(args, img_path):
162
+ if args.full_seq:
163
+ for dir in tqdm(sorted(glob.glob(f"{img_path}/*/"))):
164
+ filelist = sorted(glob.glob(f"{dir}/*.png"))
165
+ save_dir = f"{args.output_dir}/{os.path.basename(os.path.dirname(dir))}"
166
+ yield filelist, save_dir
167
+ else:
168
+ seq_list = [
169
+ "alley_2",
170
+ "ambush_4",
171
+ "ambush_5",
172
+ "ambush_6",
173
+ "cave_2",
174
+ "cave_4",
175
+ "market_2",
176
+ "market_5",
177
+ "market_6",
178
+ "shaman_3",
179
+ "sleeping_1",
180
+ "sleeping_2",
181
+ "temple_2",
182
+ "temple_3",
183
+ ]
184
+ for seq in tqdm(seq_list):
185
+ filelist = sorted(glob.glob(f"{img_path}/{seq}/*.png"))
186
+ save_dir = f"{args.output_dir}/{seq}"
187
+ yield filelist, save_dir
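
Each process_* helper above is a generator that yields (filelist, save_dir) pairs, which lets a driver iterate every dataset through the same interface. A minimal consumption sketch (the args namespace and import path below are illustrative; the real CLI is defined in eval/monodepth/launch.py and may differ):

import argparse
from eval.monodepth.metadata import dataset_metadata  # the module defined above

def iter_sequences(dataset_name, args):
    # Yield (filelist, save_dir) for every sequence of the chosen dataset.
    meta = dataset_metadata[dataset_name]
    process_func = meta.get("process_func")
    if process_func is None:
        raise ValueError(f"{dataset_name} defines no per-sequence process_func")
    yield from process_func(args, meta["img_path"])

# Hypothetical arguments; launch.py defines the actual flags.
args = argparse.Namespace(
    output_dir="eval_results/monodepth/stream3r/bonn",
    full_seq=False,
    seq_list=None,
)
for filelist, save_dir in iter_sequences("bonn", args):
    print(save_dir, len(filelist), "frames")
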
eval/monodepth/run.sh ADDED
@@ -0,0 +1,20 @@
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ workdir='.'
5
+
6
+ datasets=('sintel' 'bonn' 'kitti' 'nyu')
7
+ model_name='stream3r'
8
+
9
+ for data in "${datasets[@]}"; do
10
+ output_dir="${workdir}/eval_results/monodepth/${model_name}/${data}"
11
+ echo "$output_dir"
12
+
13
+ python eval/monodepth/launch.py \
14
+ --output_dir="$output_dir" \
15
+ --eval_dataset="$data"
16
+
17
+ python eval/monodepth/eval_metrics.py \
18
+ --output_dir "$output_dir" \
19
+ --eval_dataset "$data"
20
+ done
eval/monodepth/tools.py ADDED
@@ -0,0 +1,399 @@
1
+ import torch
2
+ import numpy as np
3
+ import cv2
4
+ import glob
5
+ import argparse
6
+ from pathlib import Path
7
+ from tqdm import tqdm
8
+ from copy import deepcopy
9
+ from scipy.optimize import minimize
10
+ import os
11
+ from collections import defaultdict
12
+
13
+
14
+ def group_by_directory(pathes, idx=-1):
15
+ """
16
+ Groups file paths by one directory component of their parent path (with the default idx=-1, the immediate parent directory).
17
+
18
+ Parameters:
19
+ - pathes (list): List of file paths.
20
+
21
+ Returns:
22
+ - dict: A dictionary whose keys are the selected directory names and whose values are lists of file paths.
23
+ """
24
+ grouped_pathes = defaultdict(list)
25
+
26
+ for path in pathes:
27
+ # Select the grouping directory (the immediate parent directory when idx=-1)
28
+ dir_name = os.path.dirname(path).split("/")[idx]
29
+ grouped_pathes[dir_name].append(path)
30
+
31
+ return grouped_pathes
32
+
33
+
34
+ def depth2disparity(depth, return_mask=False):
35
+ if isinstance(depth, torch.Tensor):
36
+ disparity = torch.zeros_like(depth)
37
+ elif isinstance(depth, np.ndarray):
38
+ disparity = np.zeros_like(depth)
39
+ non_negtive_mask = depth > 0
40
+ disparity[non_negtive_mask] = 1.0 / depth[non_negtive_mask]
41
+ if return_mask:
42
+ return disparity, non_negtive_mask
43
+ else:
44
+ return disparity
45
+
46
+
47
+ def absolute_error_loss(params, predicted_depth, ground_truth_depth):
48
+ s, t = params
49
+
50
+ predicted_aligned = s * predicted_depth + t
51
+
52
+ abs_error = np.abs(predicted_aligned - ground_truth_depth)
53
+ return np.sum(abs_error)
54
+
55
+
56
+ def absolute_value_scaling(predicted_depth, ground_truth_depth, s=1, t=0):
57
+ predicted_depth_np = predicted_depth.cpu().numpy().reshape(-1)
58
+ ground_truth_depth_np = ground_truth_depth.cpu().numpy().reshape(-1)
59
+
60
+ initial_params = [s, t] # s = 1, t = 0
61
+
62
+ result = minimize(
63
+ absolute_error_loss,
64
+ initial_params,
65
+ args=(predicted_depth_np, ground_truth_depth_np),
66
+ )
67
+
68
+ s, t = result.x
69
+ return s, t
70
+
71
+
72
+ def absolute_value_scaling2(
73
+ predicted_depth,
74
+ ground_truth_depth,
75
+ s_init=1.0,
76
+ t_init=0.0,
77
+ lr=1e-4,
78
+ max_iters=1000,
79
+ tol=1e-6,
80
+ ):
81
+ # Initialize s and t as torch tensors with requires_grad=True
82
+ s = torch.tensor(
83
+ [s_init],
84
+ requires_grad=True,
85
+ device=predicted_depth.device,
86
+ dtype=predicted_depth.dtype,
87
+ )
88
+ t = torch.tensor(
89
+ [t_init],
90
+ requires_grad=True,
91
+ device=predicted_depth.device,
92
+ dtype=predicted_depth.dtype,
93
+ )
94
+
95
+ optimizer = torch.optim.Adam([s, t], lr=lr)
96
+
97
+ prev_loss = None
98
+
99
+ for i in range(max_iters):
100
+ optimizer.zero_grad()
101
+
102
+ # Compute predicted aligned depth
103
+ predicted_aligned = s * predicted_depth + t
104
+
105
+ # Compute absolute error
106
+ abs_error = torch.abs(predicted_aligned - ground_truth_depth)
107
+
108
+ # Compute loss
109
+ loss = torch.sum(abs_error)
110
+
111
+ # Backpropagate
112
+ loss.backward()
113
+
114
+ # Update parameters
115
+ optimizer.step()
116
+
117
+ # Check convergence
118
+ if prev_loss is not None and torch.abs(prev_loss - loss) < tol:
119
+ break
120
+
121
+ prev_loss = loss.item()
122
+
123
+ return s.detach().item(), t.detach().item()
124
+
125
+
126
+ def depth_evaluation(
127
+ predicted_depth_original,
128
+ ground_truth_depth_original,
129
+ max_depth=80,
130
+ custom_mask=None,
131
+ post_clip_min=None,
132
+ post_clip_max=None,
133
+ pre_clip_min=None,
134
+ pre_clip_max=None,
135
+ align_with_lstsq=False,
136
+ align_with_lad=False,
137
+ align_with_lad2=False,
138
+ metric_scale=False,
139
+ lr=1e-4,
140
+ max_iters=1000,
141
+ use_gpu=False,
142
+ align_with_scale=False,
143
+ disp_input=False,
144
+ ):
145
+ """
146
+ Evaluate the depth map using various metrics and return a depth error parity map, with an option for least squares alignment.
147
+
148
+ Args:
149
+ predicted_depth (numpy.ndarray or torch.Tensor): The predicted depth map.
150
+ ground_truth_depth (numpy.ndarray or torch.Tensor): The ground truth depth map.
151
+ max_depth (float): The maximum depth value to consider. Default is 80 meters.
152
+ align_with_lstsq (bool): If True, perform least squares alignment of the predicted depth with ground truth.
153
+
154
+ Returns:
155
+ dict: A dictionary containing the evaluation metrics.
156
+ torch.Tensor: The depth error parity map, followed by the aligned predicted depth map and the masked ground-truth depth map.
157
+ """
158
+ if isinstance(predicted_depth_original, np.ndarray):
159
+ predicted_depth_original = torch.from_numpy(predicted_depth_original)
160
+ if isinstance(ground_truth_depth_original, np.ndarray):
161
+ ground_truth_depth_original = torch.from_numpy(ground_truth_depth_original)
162
+ if custom_mask is not None and isinstance(custom_mask, np.ndarray):
163
+ custom_mask = torch.from_numpy(custom_mask)
164
+
165
+ # if the dimension is 3, flatten to 2d along the batch dimension
166
+ if predicted_depth_original.dim() == 3:
167
+ _, h, w = predicted_depth_original.shape
168
+ predicted_depth_original = predicted_depth_original.view(-1, w)
169
+ ground_truth_depth_original = ground_truth_depth_original.view(-1, w)
170
+ if custom_mask is not None:
171
+ custom_mask = custom_mask.view(-1, w)
172
+
173
+ # put to device
174
+ if use_gpu:
175
+ predicted_depth_original = predicted_depth_original.cuda()
176
+ ground_truth_depth_original = ground_truth_depth_original.cuda()
177
+
178
+ # Filter out depths greater than max_depth
179
+ if max_depth is not None:
180
+ mask = (ground_truth_depth_original > 0) & (
181
+ ground_truth_depth_original < max_depth
182
+ )
183
+ else:
184
+ mask = ground_truth_depth_original > 0
185
+ predicted_depth = predicted_depth_original[mask]
186
+ ground_truth_depth = ground_truth_depth_original[mask]
187
+
188
+ # Clip the depth values
189
+ if pre_clip_min is not None:
190
+ predicted_depth = torch.clamp(predicted_depth, min=pre_clip_min)
191
+ if pre_clip_max is not None:
192
+ predicted_depth = torch.clamp(predicted_depth, max=pre_clip_max)
193
+
194
+ if disp_input: # align the pred to gt in the disparity space
195
+ real_gt = ground_truth_depth.clone()
196
+ ground_truth_depth = 1 / (ground_truth_depth + 1e-8)
197
+
198
+ # various alignment methods
199
+ if metric_scale:
200
+ predicted_depth = predicted_depth
201
+ elif align_with_lstsq:
202
+ # Convert to numpy for lstsq
203
+ predicted_depth_np = predicted_depth.cpu().numpy().reshape(-1, 1)
204
+ ground_truth_depth_np = ground_truth_depth.cpu().numpy().reshape(-1, 1)
205
+
206
+ # Add a column of ones for the shift term
207
+ A = np.hstack([predicted_depth_np, np.ones_like(predicted_depth_np)])
208
+
209
+ # Solve for scale (s) and shift (t) using least squares
210
+ result = np.linalg.lstsq(A, ground_truth_depth_np, rcond=None)
211
+ s, t = result[0][0], result[0][1]
212
+
213
+ # convert to torch tensor
214
+ s = torch.tensor(s, device=predicted_depth_original.device)
215
+ t = torch.tensor(t, device=predicted_depth_original.device)
216
+
217
+ # Apply scale and shift
218
+ predicted_depth = s * predicted_depth + t
219
+ elif align_with_lad:
220
+ s, t = absolute_value_scaling(
221
+ predicted_depth,
222
+ ground_truth_depth,
223
+ s=torch.median(ground_truth_depth) / torch.median(predicted_depth),
224
+ )
225
+ predicted_depth = s * predicted_depth + t
226
+ elif align_with_lad2:
227
+ s_init = (
228
+ torch.median(ground_truth_depth) / torch.median(predicted_depth)
229
+ ).item()
230
+ s, t = absolute_value_scaling2(
231
+ predicted_depth,
232
+ ground_truth_depth,
233
+ s_init=s_init,
234
+ lr=lr,
235
+ max_iters=max_iters,
236
+ )
237
+ predicted_depth = s * predicted_depth + t
238
+ elif align_with_scale:
239
+ # Initialize the scale factor 's' from the ratio of mean depths; it is refined below with Weiszfeld iterations
240
+ dot_pred_gt = torch.nanmean(ground_truth_depth)
241
+ dot_pred_pred = torch.nanmean(predicted_depth)
242
+ s = dot_pred_gt / dot_pred_pred
243
+
244
+ # Iterative reweighted least squares using the Weiszfeld method
245
+ for _ in range(10):
246
+ # Compute residuals between scaled predictions and ground truth
247
+ residuals = s * predicted_depth - ground_truth_depth
248
+ abs_residuals = (
249
+ residuals.abs() + 1e-8
250
+ ) # Add small constant to avoid division by zero
251
+
252
+ # Compute weights inversely proportional to the residuals
253
+ weights = 1.0 / abs_residuals
254
+
255
+ # Update 's' using weighted sums
256
+ weighted_dot_pred_gt = torch.sum(
257
+ weights * predicted_depth * ground_truth_depth
258
+ )
259
+ weighted_dot_pred_pred = torch.sum(weights * predicted_depth**2)
260
+ s = weighted_dot_pred_gt / weighted_dot_pred_pred
261
+
262
+ # Optionally clip 's' to prevent extreme scaling
263
+ s = s.clamp(min=1e-3)
264
+
265
+ # Detach 's' if you want to stop gradients from flowing through it
266
+ s = s.detach()
267
+
268
+ # Apply the scale factor to the predicted depth
269
+ predicted_depth = s * predicted_depth
270
+
271
+ else:
272
+ # Align the predicted depth with the ground truth using median scaling
273
+ scale_factor = torch.median(ground_truth_depth) / torch.median(predicted_depth)
274
+ predicted_depth *= scale_factor
275
+
276
+ if disp_input:
277
+ # convert back to depth
278
+ ground_truth_depth = real_gt
279
+ predicted_depth = depth2disparity(predicted_depth)
280
+
281
+ # Clip the predicted depth values
282
+ if post_clip_min is not None:
283
+ predicted_depth = torch.clamp(predicted_depth, min=post_clip_min)
284
+ if post_clip_max is not None:
285
+ predicted_depth = torch.clamp(predicted_depth, max=post_clip_max)
286
+
287
+ if custom_mask is not None:
288
+ assert custom_mask.shape == ground_truth_depth_original.shape
289
+ mask_within_mask = custom_mask.cpu()[mask]
290
+ predicted_depth = predicted_depth[mask_within_mask]
291
+ ground_truth_depth = ground_truth_depth[mask_within_mask]
292
+
293
+ # Calculate the metrics
294
+ abs_rel = torch.mean(
295
+ torch.abs(predicted_depth - ground_truth_depth) / ground_truth_depth
296
+ ).item()
297
+ sq_rel = torch.mean(
298
+ ((predicted_depth - ground_truth_depth) ** 2) / ground_truth_depth
299
+ ).item()
300
+
301
+ # Correct RMSE calculation
302
+ rmse = torch.sqrt(torch.mean((predicted_depth - ground_truth_depth) ** 2)).item()
303
+
304
+ # Clip the depth values to avoid log(0)
305
+ predicted_depth = torch.clamp(predicted_depth, min=1e-5)
306
+ log_rmse = torch.sqrt(
307
+ torch.mean((torch.log(predicted_depth) - torch.log(ground_truth_depth)) ** 2)
308
+ ).item()
309
+
310
+ # Calculate the accuracy thresholds
311
+ max_ratio = torch.maximum(
312
+ predicted_depth / ground_truth_depth, ground_truth_depth / predicted_depth
313
+ )
314
+ threshold_0 = torch.mean((max_ratio < 1.0).float()).item()
315
+ threshold_1 = torch.mean((max_ratio < 1.25).float()).item()
316
+ threshold_2 = torch.mean((max_ratio < 1.25**2).float()).item()
317
+ threshold_3 = torch.mean((max_ratio < 1.25**3).float()).item()
318
+
319
+ # Compute the depth error parity map
320
+ if metric_scale:
321
+ predicted_depth_original = predicted_depth_original
322
+ if disp_input:
323
+ predicted_depth_original = depth2disparity(predicted_depth_original)
324
+ depth_error_parity_map = (
325
+ torch.abs(predicted_depth_original - ground_truth_depth_original)
326
+ / ground_truth_depth_original
327
+ )
328
+ elif align_with_lstsq or align_with_lad or align_with_lad2:
329
+ predicted_depth_original = predicted_depth_original * s + t
330
+ if disp_input:
331
+ predicted_depth_original = depth2disparity(predicted_depth_original)
332
+ depth_error_parity_map = (
333
+ torch.abs(predicted_depth_original - ground_truth_depth_original)
334
+ / ground_truth_depth_original
335
+ )
336
+ elif align_with_scale:
337
+ predicted_depth_original = predicted_depth_original * s
338
+ if disp_input:
339
+ predicted_depth_original = depth2disparity(predicted_depth_original)
340
+ depth_error_parity_map = (
341
+ torch.abs(predicted_depth_original - ground_truth_depth_original)
342
+ / ground_truth_depth_original
343
+ )
344
+ else:
345
+ predicted_depth_original = predicted_depth_original * scale_factor
346
+ if disp_input:
347
+ predicted_depth_original = depth2disparity(predicted_depth_original)
348
+ depth_error_parity_map = (
349
+ torch.abs(predicted_depth_original - ground_truth_depth_original)
350
+ / ground_truth_depth_original
351
+ )
352
+
353
+ # Reshape the depth_error_parity_map back to the original image size
354
+ depth_error_parity_map_full = torch.zeros_like(ground_truth_depth_original)
355
+ depth_error_parity_map_full = torch.where(
356
+ mask, depth_error_parity_map, depth_error_parity_map_full
357
+ )
358
+
359
+ predict_depth_map_full = predicted_depth_original
360
+ gt_depth_map_full = torch.zeros_like(ground_truth_depth_original)
361
+ gt_depth_map_full = torch.where(
362
+ mask, ground_truth_depth_original, gt_depth_map_full
363
+ )
364
+
365
+ num_valid_pixels = (
366
+ torch.sum(mask).item()
367
+ if custom_mask is None
368
+ else torch.sum(mask_within_mask).item()
369
+ )
370
+ if num_valid_pixels == 0:
371
+ (
372
+ abs_rel,
373
+ sq_rel,
374
+ rmse,
375
+ log_rmse,
376
+ threshold_0,
377
+ threshold_1,
378
+ threshold_2,
379
+ threshold_3,
380
+ ) = (0, 0, 0, 0, 0, 0, 0, 0)
381
+
382
+ results = {
383
+ "Abs Rel": abs_rel,
384
+ "Sq Rel": sq_rel,
385
+ "RMSE": rmse,
386
+ "Log RMSE": log_rmse,
387
+ "δ < 1.": threshold_0,
388
+ "δ < 1.25": threshold_1,
389
+ "δ < 1.25^2": threshold_2,
390
+ "δ < 1.25^3": threshold_3,
391
+ "valid_pixels": num_valid_pixels,
392
+ }
393
+
394
+ return (
395
+ results,
396
+ depth_error_parity_map_full,
397
+ predict_depth_map_full,
398
+ gt_depth_map_full,
399
+ )
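
A quick sanity check of the alignment logic in depth_evaluation is to pass a prediction that differs from the ground truth only by a scale and a shift; with align_with_lstsq=True the recovered metrics should then be near perfect. A self-contained sketch on synthetic tensors (assumes the repository root is on PYTHONPATH; not part of the evaluation pipeline):

import torch
from eval.monodepth.tools import depth_evaluation

# Synthetic ground-truth depth in roughly [1, 10] metres, two 64x64 frames.
gt = torch.rand(2, 64, 64) * 9.0 + 1.0
# Prediction differs from the ground truth by an affine transform: gt = 2.5 * pred + 0.3.
pred = (gt - 0.3) / 2.5

results, err_map, pred_aligned, gt_masked = depth_evaluation(
    pred, gt, max_depth=80, align_with_lstsq=True
)
print(results["Abs Rel"], results["δ < 1.25"])  # expect ~0.0 and ~1.0
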
eval/mv_recon/base.py ADDED
@@ -0,0 +1,274 @@
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # base class for implementing datasets
6
+ # --------------------------------------------------------
7
+ import PIL
8
+ import numpy as np
9
+ import torch
10
+
11
+ from stream3r.dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates
12
+
13
+ from eval.mv_recon.dataset_utils.transforms import ImgNorm
14
+ import eval.mv_recon.dataset_utils.cropping as cropping
15
+
16
+
17
+ class BaseStereoViewDataset:
18
+ """Define all basic options.
19
+
20
+ Usage:
21
+ class MyDataset (BaseStereoViewDataset):
22
+ def _get_views(self, idx, rng):
23
+ # overload here
24
+ views = []
25
+ views.append(dict(img=, ...))
26
+ return views
27
+ """
28
+
29
+ def __init__(
30
+ self,
31
+ *, # only keyword arguments
32
+ split=None,
33
+ resolution=None, # square_size or (width, height) or list of [(width,height), ...]
34
+ transform=ImgNorm,
35
+ aug_crop=False,
36
+ seed=None,
37
+ ):
38
+ self.num_views = 2
39
+ self.split = split
40
+ self._set_resolutions(resolution)
41
+
42
+ if isinstance(transform, str):
43
+ transform = eval(transform)
44
+ self.transform = transform
45
+
46
+ self.aug_crop = aug_crop
47
+ self.seed = seed
48
+
49
+ def __len__(self):
50
+ return len(self.scenes)
51
+
52
+ def get_stats(self):
53
+ return f"{len(self)} pairs"
54
+
55
+ def __repr__(self):
56
+ resolutions_str = "[" + ";".join(f"{w}x{h}" for w, h in self._resolutions) + "]"
57
+ return (
58
+ f"""{type(self).__name__}({self.get_stats()},
59
+ {self.split=},
60
+ {self.seed=},
61
+ resolutions={resolutions_str},
62
+ {self.transform=})""".replace(
63
+ "self.", ""
64
+ )
65
+ .replace("\n", "")
66
+ .replace(" ", "")
67
+ )
68
+
69
+ def _get_views(self, idx, resolution, rng):
70
+ raise NotImplementedError()
71
+
72
+ def __getitem__(self, idx):
73
+ if isinstance(idx, tuple):
74
+ # the idx is specifying the aspect-ratio
75
+ idx, ar_idx = idx
76
+ else:
77
+ assert len(self._resolutions) == 1
78
+ ar_idx = 0
79
+
80
+ # set-up the rng
81
+ if self.seed: # reseed for each __getitem__
82
+ self._rng = np.random.default_rng(seed=self.seed + idx)
83
+ elif not hasattr(self, "_rng"):
84
+ seed = torch.initial_seed() # this is different for each dataloader process
85
+ self._rng = np.random.default_rng(seed=seed)
86
+
87
+ # over-loaded code
88
+ resolution = self._resolutions[
89
+ ar_idx
90
+ ] # DO NOT CHANGE THIS (compatible with BatchedRandomSampler)
91
+ views = self._get_views(idx, resolution, self._rng)
92
+
93
+ # check data-types
94
+ for v, view in enumerate(views):
95
+ assert (
96
+ "pts3d" not in view
97
+ ), f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}"
98
+ view["idx"] = v
99
+
100
+ # encode the image
101
+ width, height = view["img"].size
102
+ view["true_shape"] = np.int32((height, width))
103
+ view["img"] = self.transform(view["img"])
104
+
105
+ assert "camera_intrinsics" in view
106
+ if "camera_pose" not in view:
107
+ view["camera_pose"] = np.full((4, 4), np.nan, dtype=np.float32)
108
+ else:
109
+ assert np.isfinite(
110
+ view["camera_pose"]
111
+ ).all(), f"NaN in camera pose for view {view_name(view)}"
112
+ assert "pts3d" not in view
113
+ assert "valid_mask" not in view
114
+ assert np.isfinite(
115
+ view["depthmap"]
116
+ ).all(), f"NaN in depthmap for view {view_name(view)}"
117
+ pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view)
118
+
119
+ view["pts3d"] = pts3d
120
+ view["valid_mask"] = valid_mask & np.isfinite(pts3d).all(axis=-1)
121
+
122
+ # check all datatypes
123
+ for key, val in view.items():
124
+ res, err_msg = is_good_type(key, val)
125
+ assert res, f"{err_msg} with {key}={val} for view {view_name(view)}"
126
+ K = view["camera_intrinsics"]
127
+ view["img_mask"] = True
128
+ view["ray_mask"] = False
129
+ view["ray_map"] = torch.full(
130
+ (6, view["img"].shape[-2], view["img"].shape[-1]), torch.nan
131
+ )
132
+ view["update"] = True
133
+ view["reset"] = False
134
+
135
+ # last thing done!
136
+ for view in views:
137
+ # transpose to make sure all views are the same size
138
+ transpose_to_landscape(view)
139
+ # this allows checking whether the RNG is in the same state each time
140
+ view["rng"] = int.from_bytes(self._rng.bytes(4), "big")
141
+ return views
142
+
143
+ def _set_resolutions(self, resolutions):
144
+ """Set the resolution(s) of the dataset.
145
+ Params:
146
+ - resolutions: int or tuple or list of tuples
147
+ """
148
+ assert resolutions is not None, "undefined resolution"
149
+
150
+ if not isinstance(resolutions, list):
151
+ resolutions = [resolutions]
152
+
153
+ self._resolutions = []
154
+ for resolution in resolutions:
155
+ if isinstance(resolution, int):
156
+ width = height = resolution
157
+ else:
158
+ width, height = resolution
159
+ assert isinstance(
160
+ width, int
161
+ ), f"Bad type for {width=} {type(width)=}, should be int"
162
+ assert isinstance(
163
+ height, int
164
+ ), f"Bad type for {height=} {type(height)=}, should be int"
165
+ assert width >= height
166
+ self._resolutions.append((width, height))
167
+
168
+ def _crop_resize_if_necessary(
169
+ self, image, depthmap, intrinsics, resolution, rng=None, info=None
170
+ ):
171
+ """This function:
172
+ - first downsizes the image with LANCZOS interpolation,
173
+ which is better than bilinear interpolation in terms of image quality, then crops it to the requested resolution.
174
+ """
175
+ if not isinstance(image, PIL.Image.Image):
176
+ image = PIL.Image.fromarray(image)
177
+
178
+ # downscale with lanczos interpolation so that image.size == resolution
179
+ # cropping centered on the principal point
180
+ W, H = image.size
181
+ cx, cy = intrinsics[:2, 2].round().astype(int)
182
+
183
+ # calculate min distance to margin
184
+ min_margin_x = min(cx, W - cx)
185
+ min_margin_y = min(cy, H - cy)
186
+ assert min_margin_x > W / 5, f"Bad principal point in view={info}"
187
+ assert min_margin_y > H / 5, f"Bad principal point in view={info}"
188
+
189
+ ## Center crop
190
+ # Crop on the principal point, make it always centered
191
+ # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy)
192
+ l, t = cx - min_margin_x, cy - min_margin_y
193
+ r, b = cx + min_margin_x, cy + min_margin_y
194
+ crop_bbox = (l, t, r, b)
195
+
196
+ image, depthmap, intrinsics = cropping.crop_image_depthmap(
197
+ image, depthmap, intrinsics, crop_bbox
198
+ )
199
+
200
+ # transpose the resolution if necessary
201
+ W, H = image.size # new size
202
+ assert resolution[0] >= resolution[1]
203
+ if H > 1.1 * W:
204
+ # image is portrait mode
205
+ resolution = resolution[::-1]
206
+ elif 0.9 < H / W < 1.1 and resolution[0] != resolution[1]:
207
+ # image is roughly square, so we choose (portrait, landscape) randomly
208
+ if rng.integers(2):
209
+ resolution = resolution[::-1]
210
+
211
+ # high-quality Lanczos down-scaling
212
+ target_resolution = np.array(resolution)
213
+ # # if self.aug_crop > 1:
214
+ # # target_resolution += rng.integers(0, self.aug_crop)
215
+ # if resolution != (224, 224):
216
+ # halfw, halfh = ((2*(W//2))//16)*8, ((2*(H//2))//16)*8
217
+ # ## Rescale with max factor, so one of width or height might be larger than target_resolution
218
+ # image, depthmap, intrinsics = cropping.rescale_image_depthmap(image, depthmap, intrinsics, (2*halfw, 2*halfh))
219
+ # else:
220
+ image, depthmap, intrinsics = cropping.rescale_image_depthmap(
221
+ image, depthmap, intrinsics, target_resolution
222
+ )
223
+ # actual cropping (if necessary) with bilinear interpolation
224
+ # if resolution == (224, 224):
225
+ intrinsics2 = cropping.camera_matrix_of_crop(
226
+ intrinsics, image.size, resolution, offset_factor=0.5
227
+ )
228
+ crop_bbox = cropping.bbox_from_intrinsics_in_out(
229
+ intrinsics, intrinsics2, resolution
230
+ )
231
+ image, depthmap, intrinsics = cropping.crop_image_depthmap(
232
+ image, depthmap, intrinsics, crop_bbox
233
+ )
234
+ return image, depthmap, intrinsics
235
+
236
+
237
+ def is_good_type(key, v):
238
+ """returns (is_good, err_msg)"""
239
+ if isinstance(v, (str, int, tuple)):
240
+ return True, None
241
+ if v.dtype not in (np.float32, torch.float32, bool, np.int32, np.int64, np.uint8):
242
+ return False, f"bad {v.dtype=}"
243
+ return True, None
244
+
245
+
246
+ def view_name(view, batch_index=None):
247
+ def sel(x):
248
+ return x[batch_index] if batch_index not in (None, slice(None)) else x
249
+
250
+ db = sel(view["dataset"])
251
+ label = sel(view["label"])
252
+ instance = sel(view["instance"])
253
+ return f"{db}/{label}/{instance}"
254
+
255
+
256
+ def transpose_to_landscape(view):
257
+ height, width = view["true_shape"]
258
+
259
+ if width < height:
260
+ # rectify portrait to landscape
261
+ assert view["img"].shape == (3, height, width)
262
+ view["img"] = view["img"].swapaxes(1, 2)
263
+
264
+ assert view["valid_mask"].shape == (height, width)
265
+ view["valid_mask"] = view["valid_mask"].swapaxes(0, 1)
266
+
267
+ assert view["depthmap"].shape == (height, width)
268
+ view["depthmap"] = view["depthmap"].swapaxes(0, 1)
269
+
270
+ assert view["pts3d"].shape == (height, width, 3)
271
+ view["pts3d"] = view["pts3d"].swapaxes(0, 1)
272
+
273
+ # transpose x and y pixels
274
+ view["camera_intrinsics"] = view["camera_intrinsics"][[1, 0, 2]]