Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- outdoor_v48_16gpu/.hydra/config.yaml +68 -0
- outdoor_v48_16gpu/.hydra/hydra.yaml +156 -0
- outdoor_v48_16gpu/.hydra/overrides.yaml +2 -0
- outdoor_v48_16gpu/mytrain.log +930 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/mytrain.py +601 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/camera_head.py +175 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/dpt_head.py +471 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/head_act.py +116 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_head.py +102 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_modules/__init__.py +0 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_modules/base_track_predictor.py +195 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_modules/blocks.py +237 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_modules/modules.py +211 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_modules/utils.py +216 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/utils.py +99 -0
- outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/utils/geometry.py +166 -0
- outdoor_v48_16gpu_v2/mytrain.log +985 -0
- outdoor_v48_4gpu/.hydra/config.yaml +68 -0
- outdoor_v48_4gpu/.hydra/hydra.yaml +155 -0
- outdoor_v48_4gpu/.hydra/overrides.yaml +1 -0
- outdoor_v48_4gpu/mytrain.log +0 -0
- outdoor_v48_4gpu_v2/.hydra/config.yaml +68 -0
- outdoor_v48_4gpu_v2/.hydra/hydra.yaml +155 -0
- outdoor_v48_4gpu_v2/.hydra/overrides.yaml +1 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/__init__.py +91 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/arkitscenes.py +246 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/arkitscenes_highres.py +175 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/base/__init__.py +0 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/base/base_multiview_dataset.py +576 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/base/batched_sampler.py +93 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/base/easy_dataset.py +212 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/blendedmvs.py +348 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/co3d.py +190 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/cop3d.py +110 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/dl3dv.py +166 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/dynamic_replica.py +137 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/habitat_hm3d.py +174 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/hoi4d.py +84 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/hypersim.py +142 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/kitti360.py +354 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/mapfree.py +282 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/megadepth.py +100 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/mvs_synth.py +144 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/omniobject3d.py +146 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/pointodyssey.py +178 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/realestate10k.py +139 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/scannet.py +149 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/scannetpp.py +211 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/smartportraits.py +85 -0
- outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/tartanair.py +164 -0
outdoor_v48_16gpu/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
teacher: /gpfs/work2/0/prjs0824/qi_proj/ckpt/checkpoint-10.pth.model
|
| 2 |
+
pretrained: /gpfs/work2/0/prjs0824/qi_proj/ckpt/checkpoint-10.pth.model
|
| 3 |
+
load_only_encoder: false
|
| 4 |
+
long_context: false
|
| 5 |
+
fixed_length: true
|
| 6 |
+
resume: /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_8gpu/checkpoint-last.pth
|
| 7 |
+
benchmark: false
|
| 8 |
+
num_views: 64
|
| 9 |
+
num_test_views: 4
|
| 10 |
+
n_corres_train: 0
|
| 11 |
+
n_corres_test: 0
|
| 12 |
+
train_criterion: DistillLoss()
|
| 13 |
+
test_criterion: DistillLoss()
|
| 14 |
+
allow_repeat: false
|
| 15 |
+
root_vkitti2: /scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti
|
| 16 |
+
root_kitti: /scratch-shared/wwei2/eval/kitti_odometry/dataset
|
| 17 |
+
root_kitti_velo: /gpfs/work2/0/prjs0824/semantickitti/dataset
|
| 18 |
+
root_kitti360: /scratch-shared/wwei2/downloads/kitti360/KITTI-360
|
| 19 |
+
root_kitti360_velo: /scratch-shared/wwei2/downloads/kitti360/KITTI-360
|
| 20 |
+
root_waymo: /scratch-shared/wwei2/waymo_v2
|
| 21 |
+
root_waymo_lidar: /scratch-shared/wwei2/waymo_v2
|
| 22 |
+
dataset_vkitti2: VirtualKITTI2_Multi(allow_repeat=${allow_repeat}, split='train',
|
| 23 |
+
ROOT="${root_vkitti2}", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294),
|
| 24 |
+
(518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=${num_views},
|
| 25 |
+
n_corres=${n_corres_train})
|
| 26 |
+
dataset_kitti360: KITTI360_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_kitti360}",
|
| 27 |
+
velodyne_root="${root_kitti360_velo}", aug_crop=16, resolution=[(518, 392), (518,
|
| 28 |
+
336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter,
|
| 29 |
+
num_views=${num_views}, n_corres=${n_corres_train})
|
| 30 |
+
dataset_waymo: Waymo_v2_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_waymo}",
|
| 31 |
+
lidar_root="${root_waymo_lidar}", aug_crop=16, resolution=[(518, 392), (518, 336),
|
| 32 |
+
(518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=${num_views},
|
| 33 |
+
n_corres=${n_corres_train})
|
| 34 |
+
train_dataset: 6000 @ ${dataset_vkitti2} + 6000 @ ${dataset_kitti360} + 5400 @ ${dataset_waymo}
|
| 35 |
+
test_dataset: 200 @ VirtualKITTI2_Multi(split='train', ROOT="${root_vkitti2}", resolution=(518,
|
| 36 |
+
154), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
|
| 37 |
+
seed: 0
|
| 38 |
+
batch_size: 1
|
| 39 |
+
accum_iter: 1
|
| 40 |
+
gradient_checkpointing: false
|
| 41 |
+
epochs: 10
|
| 42 |
+
start_epoch: 0
|
| 43 |
+
start_step: 0
|
| 44 |
+
weight_decay: 0.05
|
| 45 |
+
lr: 1.0e-05
|
| 46 |
+
min_lr: 1.0e-08
|
| 47 |
+
warmup_epochs: 0.5
|
| 48 |
+
amp: 1
|
| 49 |
+
num_workers: 4
|
| 50 |
+
world_size: 1
|
| 51 |
+
local-rank: -1
|
| 52 |
+
dist_url: env://
|
| 53 |
+
rank: 0
|
| 54 |
+
gpu: 0
|
| 55 |
+
distributed: false
|
| 56 |
+
dist_backend: nccl
|
| 57 |
+
eval_freq: 1
|
| 58 |
+
save_freq: 0.1
|
| 59 |
+
max_checkpoints: 10
|
| 60 |
+
keep_freq: 1
|
| 61 |
+
print_freq: 10
|
| 62 |
+
print_img_freq: 50000000
|
| 63 |
+
num_imgs_vis: 4
|
| 64 |
+
save_dir: /scratch-shared/wwei2/training_upstream/checkpoints
|
| 65 |
+
exp_name: outdoor_v48_16gpu
|
| 66 |
+
task: StreamVGGT
|
| 67 |
+
logdir: ${save_dir}/${exp_name}/logs
|
| 68 |
+
output_dir: ${save_dir}/${exp_name}/
|
outdoor_v48_16gpu/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ${save_dir}/${exp_name}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task:
|
| 115 |
+
- exp_name=outdoor_v48_16gpu
|
| 116 |
+
- resume=/scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_8gpu/checkpoint-last.pth
|
| 117 |
+
job:
|
| 118 |
+
name: mytrain
|
| 119 |
+
chdir: null
|
| 120 |
+
override_dirname: exp_name=outdoor_v48_16gpu,resume=/scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_8gpu/checkpoint-last.pth
|
| 121 |
+
id: ???
|
| 122 |
+
num: ???
|
| 123 |
+
config_name: outdoor_v48
|
| 124 |
+
env_set: {}
|
| 125 |
+
env_copy: []
|
| 126 |
+
config:
|
| 127 |
+
override_dirname:
|
| 128 |
+
kv_sep: '='
|
| 129 |
+
item_sep: ','
|
| 130 |
+
exclude_keys: []
|
| 131 |
+
runtime:
|
| 132 |
+
version: 1.3.2
|
| 133 |
+
version_base: '1.3'
|
| 134 |
+
cwd: /gpfs/work2/0/prjs0824/qi_proj/slamformer_upstream/src
|
| 135 |
+
config_sources:
|
| 136 |
+
- path: hydra.conf
|
| 137 |
+
schema: pkg
|
| 138 |
+
provider: hydra
|
| 139 |
+
- path: /gpfs/work2/0/prjs0824/qi_proj/slamformer_upstream/config
|
| 140 |
+
schema: file
|
| 141 |
+
provider: main
|
| 142 |
+
- path: ''
|
| 143 |
+
schema: structured
|
| 144 |
+
provider: schema
|
| 145 |
+
output_dir: /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_16gpu
|
| 146 |
+
choices:
|
| 147 |
+
hydra/env: default
|
| 148 |
+
hydra/callbacks: null
|
| 149 |
+
hydra/job_logging: default
|
| 150 |
+
hydra/hydra_logging: default
|
| 151 |
+
hydra/hydra_help: default
|
| 152 |
+
hydra/help: default
|
| 153 |
+
hydra/sweeper: basic
|
| 154 |
+
hydra/launcher: basic
|
| 155 |
+
hydra/output: default
|
| 156 |
+
verbose: true
|
outdoor_v48_16gpu/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
- exp_name=outdoor_v48_16gpu
|
| 2 |
+
- resume=/scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_8gpu/checkpoint-last.pth
|
outdoor_v48_16gpu/mytrain.log
ADDED
|
@@ -0,0 +1,930 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-05-02 09:28:25,135][__main__][INFO] - [RANK 0] output_dir: /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_16gpu/
|
| 2 |
+
[2026-05-02 09:28:25,901][__main__][INFO] - [RANK 0] Saving current code to /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_16gpu/code/05_02-09:28:25
|
| 3 |
+
[2026-05-02 09:28:25,901][__main__][INFO] - [RANK 0] job dir: /gpfs/work2/0/prjs0824/qi_proj/slamformer_upstream/src
|
| 4 |
+
[2026-05-02 09:28:25,901][__main__][INFO] - [RANK 0] Setting seed to 0 for process 0
|
| 5 |
+
[2026-05-02 09:28:25,903][__main__][INFO] - [RANK 0] Building train dataset 6000 @ VirtualKITTI2_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0) + 6000 @ KITTI360_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/downloads/kitti360/KITTI-360", velodyne_root="/scratch-shared/wwei2/downloads/kitti360/KITTI-360", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0) + 5400 @ Waymo_v2_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/waymo_v2", lidar_root="/scratch-shared/wwei2/waymo_v2", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0)
|
| 6 |
+
[2026-05-02 09:28:25,903][__main__][INFO] - [RANK 0] Building Train Data loader for dataset: 6000 @ VirtualKITTI2_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0) + 6000 @ KITTI360_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/downloads/kitti360/KITTI-360", velodyne_root="/scratch-shared/wwei2/downloads/kitti360/KITTI-360", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0) + 5400 @ Waymo_v2_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/waymo_v2", lidar_root="/scratch-shared/wwei2/waymo_v2", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0)
|
| 7 |
+
[2026-05-02 09:32:13,562][__main__][INFO] - [RANK 0] Building test dataset 200 @ VirtualKITTI2_Multi(split='train', ROOT="/scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti", resolution=(518, 154), num_views=4, seed=42, n_corres=0)
|
| 8 |
+
[2026-05-02 09:32:13,562][__main__][INFO] - [RANK 0] Building Test Data loader for dataset: 200 @ VirtualKITTI2_Multi(split='train', ROOT="/scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti", resolution=(518, 154), num_views=4, seed=42, n_corres=0)
|
| 9 |
+
[2026-05-02 09:32:13,641][__main__][INFO] - [RANK 0] Loading model
|
| 10 |
+
[2026-05-02 09:32:19,610][__main__][INFO] - [RANK 0] All model parameters: 958696732
|
| 11 |
+
[2026-05-02 09:32:19,610][__main__][INFO] - [RANK 0] >> Creating train criterion = DistillLoss()
|
| 12 |
+
[2026-05-02 09:32:19,610][__main__][INFO] - [RANK 0] >> Creating test criterion = DistillLoss()
|
| 13 |
+
[2026-05-02 09:32:20,033][__main__][INFO] - [RANK 0] Freezing patch embedding and positional encoding parameters...
|
| 14 |
+
[2026-05-02 09:32:20,038][__main__][INFO] - [RANK 0] Frozen 304,376,832 parameters out of 958,696,732 total parameters. (31.75%)
|
| 15 |
+
[2026-05-02 09:32:20,038][__main__][INFO] - [RANK 0] Trainable parameters: 654,319,900 (68.25%)
|
| 16 |
+
[2026-05-02 09:32:20,038][__main__][INFO] - [RANK 0] Example frozen parameters: register_token, encoder.cls_token, encoder.pos_embed, encoder.register_tokens, encoder.patch_embed.proj.weight...
|
| 17 |
+
[2026-05-02 09:32:20,063][croco.utils.misc][INFO] - [RANK 0] Param groups = {
|
| 18 |
+
"no_decay": {
|
| 19 |
+
"weight_decay": 0.0,
|
| 20 |
+
"params": [
|
| 21 |
+
"decoder.0.norm1.weight",
|
| 22 |
+
"decoder.0.norm1.bias",
|
| 23 |
+
"decoder.0.attn.qkv.bias",
|
| 24 |
+
"decoder.0.attn.proj.bias",
|
| 25 |
+
"decoder.0.attn.q_norm.weight",
|
| 26 |
+
"decoder.0.attn.q_norm.bias",
|
| 27 |
+
"decoder.0.attn.k_norm.weight",
|
| 28 |
+
"decoder.0.attn.k_norm.bias",
|
| 29 |
+
"decoder.0.ls1.gamma",
|
| 30 |
+
"decoder.0.norm2.weight",
|
| 31 |
+
"decoder.0.norm2.bias",
|
| 32 |
+
"decoder.0.mlp.fc1.bias",
|
| 33 |
+
"decoder.0.mlp.fc2.bias",
|
| 34 |
+
"decoder.0.ls2.gamma",
|
| 35 |
+
"decoder.1.norm1.weight",
|
| 36 |
+
"decoder.1.norm1.bias",
|
| 37 |
+
"decoder.1.attn.qkv.bias",
|
| 38 |
+
"decoder.1.attn.proj.bias",
|
| 39 |
+
"decoder.1.attn.q_norm.weight",
|
| 40 |
+
"decoder.1.attn.q_norm.bias",
|
| 41 |
+
"decoder.1.attn.k_norm.weight",
|
| 42 |
+
"decoder.1.attn.k_norm.bias",
|
| 43 |
+
"decoder.1.ls1.gamma",
|
| 44 |
+
"decoder.1.norm2.weight",
|
| 45 |
+
"decoder.1.norm2.bias",
|
| 46 |
+
"decoder.1.mlp.fc1.bias",
|
| 47 |
+
"decoder.1.mlp.fc2.bias",
|
| 48 |
+
"decoder.1.ls2.gamma",
|
| 49 |
+
"decoder.2.norm1.weight",
|
| 50 |
+
"decoder.2.norm1.bias",
|
| 51 |
+
"decoder.2.attn.qkv.bias",
|
| 52 |
+
"decoder.2.attn.proj.bias",
|
| 53 |
+
"decoder.2.attn.q_norm.weight",
|
| 54 |
+
"decoder.2.attn.q_norm.bias",
|
| 55 |
+
"decoder.2.attn.k_norm.weight",
|
| 56 |
+
"decoder.2.attn.k_norm.bias",
|
| 57 |
+
"decoder.2.ls1.gamma",
|
| 58 |
+
"decoder.2.norm2.weight",
|
| 59 |
+
"decoder.2.norm2.bias",
|
| 60 |
+
"decoder.2.mlp.fc1.bias",
|
| 61 |
+
"decoder.2.mlp.fc2.bias",
|
| 62 |
+
"decoder.2.ls2.gamma",
|
| 63 |
+
"decoder.3.norm1.weight",
|
| 64 |
+
"decoder.3.norm1.bias",
|
| 65 |
+
"decoder.3.attn.qkv.bias",
|
| 66 |
+
"decoder.3.attn.proj.bias",
|
| 67 |
+
"decoder.3.attn.q_norm.weight",
|
| 68 |
+
"decoder.3.attn.q_norm.bias",
|
| 69 |
+
"decoder.3.attn.k_norm.weight",
|
| 70 |
+
"decoder.3.attn.k_norm.bias",
|
| 71 |
+
"decoder.3.ls1.gamma",
|
| 72 |
+
"decoder.3.norm2.weight",
|
| 73 |
+
"decoder.3.norm2.bias",
|
| 74 |
+
"decoder.3.mlp.fc1.bias",
|
| 75 |
+
"decoder.3.mlp.fc2.bias",
|
| 76 |
+
"decoder.3.ls2.gamma",
|
| 77 |
+
"decoder.4.norm1.weight",
|
| 78 |
+
"decoder.4.norm1.bias",
|
| 79 |
+
"decoder.4.attn.qkv.bias",
|
| 80 |
+
"decoder.4.attn.proj.bias",
|
| 81 |
+
"decoder.4.attn.q_norm.weight",
|
| 82 |
+
"decoder.4.attn.q_norm.bias",
|
| 83 |
+
"decoder.4.attn.k_norm.weight",
|
| 84 |
+
"decoder.4.attn.k_norm.bias",
|
| 85 |
+
"decoder.4.ls1.gamma",
|
| 86 |
+
"decoder.4.norm2.weight",
|
| 87 |
+
"decoder.4.norm2.bias",
|
| 88 |
+
"decoder.4.mlp.fc1.bias",
|
| 89 |
+
"decoder.4.mlp.fc2.bias",
|
| 90 |
+
"decoder.4.ls2.gamma",
|
| 91 |
+
"decoder.5.norm1.weight",
|
| 92 |
+
"decoder.5.norm1.bias",
|
| 93 |
+
"decoder.5.attn.qkv.bias",
|
| 94 |
+
"decoder.5.attn.proj.bias",
|
| 95 |
+
"decoder.5.attn.q_norm.weight",
|
| 96 |
+
"decoder.5.attn.q_norm.bias",
|
| 97 |
+
"decoder.5.attn.k_norm.weight",
|
| 98 |
+
"decoder.5.attn.k_norm.bias",
|
| 99 |
+
"decoder.5.ls1.gamma",
|
| 100 |
+
"decoder.5.norm2.weight",
|
| 101 |
+
"decoder.5.norm2.bias",
|
| 102 |
+
"decoder.5.mlp.fc1.bias",
|
| 103 |
+
"decoder.5.mlp.fc2.bias",
|
| 104 |
+
"decoder.5.ls2.gamma",
|
| 105 |
+
"decoder.6.norm1.weight",
|
| 106 |
+
"decoder.6.norm1.bias",
|
| 107 |
+
"decoder.6.attn.qkv.bias",
|
| 108 |
+
"decoder.6.attn.proj.bias",
|
| 109 |
+
"decoder.6.attn.q_norm.weight",
|
| 110 |
+
"decoder.6.attn.q_norm.bias",
|
| 111 |
+
"decoder.6.attn.k_norm.weight",
|
| 112 |
+
"decoder.6.attn.k_norm.bias",
|
| 113 |
+
"decoder.6.ls1.gamma",
|
| 114 |
+
"decoder.6.norm2.weight",
|
| 115 |
+
"decoder.6.norm2.bias",
|
| 116 |
+
"decoder.6.mlp.fc1.bias",
|
| 117 |
+
"decoder.6.mlp.fc2.bias",
|
| 118 |
+
"decoder.6.ls2.gamma",
|
| 119 |
+
"decoder.7.norm1.weight",
|
| 120 |
+
"decoder.7.norm1.bias",
|
| 121 |
+
"decoder.7.attn.qkv.bias",
|
| 122 |
+
"decoder.7.attn.proj.bias",
|
| 123 |
+
"decoder.7.attn.q_norm.weight",
|
| 124 |
+
"decoder.7.attn.q_norm.bias",
|
| 125 |
+
"decoder.7.attn.k_norm.weight",
|
| 126 |
+
"decoder.7.attn.k_norm.bias",
|
| 127 |
+
"decoder.7.ls1.gamma",
|
| 128 |
+
"decoder.7.norm2.weight",
|
| 129 |
+
"decoder.7.norm2.bias",
|
| 130 |
+
"decoder.7.mlp.fc1.bias",
|
| 131 |
+
"decoder.7.mlp.fc2.bias",
|
| 132 |
+
"decoder.7.ls2.gamma",
|
| 133 |
+
"decoder.8.norm1.weight",
|
| 134 |
+
"decoder.8.norm1.bias",
|
| 135 |
+
"decoder.8.attn.qkv.bias",
|
| 136 |
+
"decoder.8.attn.proj.bias",
|
| 137 |
+
"decoder.8.attn.q_norm.weight",
|
| 138 |
+
"decoder.8.attn.q_norm.bias",
|
| 139 |
+
"decoder.8.attn.k_norm.weight",
|
| 140 |
+
"decoder.8.attn.k_norm.bias",
|
| 141 |
+
"decoder.8.ls1.gamma",
|
| 142 |
+
"decoder.8.norm2.weight",
|
| 143 |
+
"decoder.8.norm2.bias",
|
| 144 |
+
"decoder.8.mlp.fc1.bias",
|
| 145 |
+
"decoder.8.mlp.fc2.bias",
|
| 146 |
+
"decoder.8.ls2.gamma",
|
| 147 |
+
"decoder.9.norm1.weight",
|
| 148 |
+
"decoder.9.norm1.bias",
|
| 149 |
+
"decoder.9.attn.qkv.bias",
|
| 150 |
+
"decoder.9.attn.proj.bias",
|
| 151 |
+
"decoder.9.attn.q_norm.weight",
|
| 152 |
+
"decoder.9.attn.q_norm.bias",
|
| 153 |
+
"decoder.9.attn.k_norm.weight",
|
| 154 |
+
"decoder.9.attn.k_norm.bias",
|
| 155 |
+
"decoder.9.ls1.gamma",
|
| 156 |
+
"decoder.9.norm2.weight",
|
| 157 |
+
"decoder.9.norm2.bias",
|
| 158 |
+
"decoder.9.mlp.fc1.bias",
|
| 159 |
+
"decoder.9.mlp.fc2.bias",
|
| 160 |
+
"decoder.9.ls2.gamma",
|
| 161 |
+
"decoder.10.norm1.weight",
|
| 162 |
+
"decoder.10.norm1.bias",
|
| 163 |
+
"decoder.10.attn.qkv.bias",
|
| 164 |
+
"decoder.10.attn.proj.bias",
|
| 165 |
+
"decoder.10.attn.q_norm.weight",
|
| 166 |
+
"decoder.10.attn.q_norm.bias",
|
| 167 |
+
"decoder.10.attn.k_norm.weight",
|
| 168 |
+
"decoder.10.attn.k_norm.bias",
|
| 169 |
+
"decoder.10.ls1.gamma",
|
| 170 |
+
"decoder.10.norm2.weight",
|
| 171 |
+
"decoder.10.norm2.bias",
|
| 172 |
+
"decoder.10.mlp.fc1.bias",
|
| 173 |
+
"decoder.10.mlp.fc2.bias",
|
| 174 |
+
"decoder.10.ls2.gamma",
|
| 175 |
+
"decoder.11.norm1.weight",
|
| 176 |
+
"decoder.11.norm1.bias",
|
| 177 |
+
"decoder.11.attn.qkv.bias",
|
| 178 |
+
"decoder.11.attn.proj.bias",
|
| 179 |
+
"decoder.11.attn.q_norm.weight",
|
| 180 |
+
"decoder.11.attn.q_norm.bias",
|
| 181 |
+
"decoder.11.attn.k_norm.weight",
|
| 182 |
+
"decoder.11.attn.k_norm.bias",
|
| 183 |
+
"decoder.11.ls1.gamma",
|
| 184 |
+
"decoder.11.norm2.weight",
|
| 185 |
+
"decoder.11.norm2.bias",
|
| 186 |
+
"decoder.11.mlp.fc1.bias",
|
| 187 |
+
"decoder.11.mlp.fc2.bias",
|
| 188 |
+
"decoder.11.ls2.gamma",
|
| 189 |
+
"decoder.12.norm1.weight",
|
| 190 |
+
"decoder.12.norm1.bias",
|
| 191 |
+
"decoder.12.attn.qkv.bias",
|
| 192 |
+
"decoder.12.attn.proj.bias",
|
| 193 |
+
"decoder.12.attn.q_norm.weight",
|
| 194 |
+
"decoder.12.attn.q_norm.bias",
|
| 195 |
+
"decoder.12.attn.k_norm.weight",
|
| 196 |
+
"decoder.12.attn.k_norm.bias",
|
| 197 |
+
"decoder.12.ls1.gamma",
|
| 198 |
+
"decoder.12.norm2.weight",
|
| 199 |
+
"decoder.12.norm2.bias",
|
| 200 |
+
"decoder.12.mlp.fc1.bias",
|
| 201 |
+
"decoder.12.mlp.fc2.bias",
|
| 202 |
+
"decoder.12.ls2.gamma",
|
| 203 |
+
"decoder.13.norm1.weight",
|
| 204 |
+
"decoder.13.norm1.bias",
|
| 205 |
+
"decoder.13.attn.qkv.bias",
|
| 206 |
+
"decoder.13.attn.proj.bias",
|
| 207 |
+
"decoder.13.attn.q_norm.weight",
|
| 208 |
+
"decoder.13.attn.q_norm.bias",
|
| 209 |
+
"decoder.13.attn.k_norm.weight",
|
| 210 |
+
"decoder.13.attn.k_norm.bias",
|
| 211 |
+
"decoder.13.ls1.gamma",
|
| 212 |
+
"decoder.13.norm2.weight",
|
| 213 |
+
"decoder.13.norm2.bias",
|
| 214 |
+
"decoder.13.mlp.fc1.bias",
|
| 215 |
+
"decoder.13.mlp.fc2.bias",
|
| 216 |
+
"decoder.13.ls2.gamma",
|
| 217 |
+
"decoder.14.norm1.weight",
|
| 218 |
+
"decoder.14.norm1.bias",
|
| 219 |
+
"decoder.14.attn.qkv.bias",
|
| 220 |
+
"decoder.14.attn.proj.bias",
|
| 221 |
+
"decoder.14.attn.q_norm.weight",
|
| 222 |
+
"decoder.14.attn.q_norm.bias",
|
| 223 |
+
"decoder.14.attn.k_norm.weight",
|
| 224 |
+
"decoder.14.attn.k_norm.bias",
|
| 225 |
+
"decoder.14.ls1.gamma",
|
| 226 |
+
"decoder.14.norm2.weight",
|
| 227 |
+
"decoder.14.norm2.bias",
|
| 228 |
+
"decoder.14.mlp.fc1.bias",
|
| 229 |
+
"decoder.14.mlp.fc2.bias",
|
| 230 |
+
"decoder.14.ls2.gamma",
|
| 231 |
+
"decoder.15.norm1.weight",
|
| 232 |
+
"decoder.15.norm1.bias",
|
| 233 |
+
"decoder.15.attn.qkv.bias",
|
| 234 |
+
"decoder.15.attn.proj.bias",
|
| 235 |
+
"decoder.15.attn.q_norm.weight",
|
| 236 |
+
"decoder.15.attn.q_norm.bias",
|
| 237 |
+
"decoder.15.attn.k_norm.weight",
|
| 238 |
+
"decoder.15.attn.k_norm.bias",
|
| 239 |
+
"decoder.15.ls1.gamma",
|
| 240 |
+
"decoder.15.norm2.weight",
|
| 241 |
+
"decoder.15.norm2.bias",
|
| 242 |
+
"decoder.15.mlp.fc1.bias",
|
| 243 |
+
"decoder.15.mlp.fc2.bias",
|
| 244 |
+
"decoder.15.ls2.gamma",
|
| 245 |
+
"decoder.16.norm1.weight",
|
| 246 |
+
"decoder.16.norm1.bias",
|
| 247 |
+
"decoder.16.attn.qkv.bias",
|
| 248 |
+
"decoder.16.attn.proj.bias",
|
| 249 |
+
"decoder.16.attn.q_norm.weight",
|
| 250 |
+
"decoder.16.attn.q_norm.bias",
|
| 251 |
+
"decoder.16.attn.k_norm.weight",
|
| 252 |
+
"decoder.16.attn.k_norm.bias",
|
| 253 |
+
"decoder.16.ls1.gamma",
|
| 254 |
+
"decoder.16.norm2.weight",
|
| 255 |
+
"decoder.16.norm2.bias",
|
| 256 |
+
"decoder.16.mlp.fc1.bias",
|
| 257 |
+
"decoder.16.mlp.fc2.bias",
|
| 258 |
+
"decoder.16.ls2.gamma",
|
| 259 |
+
"decoder.17.norm1.weight",
|
| 260 |
+
"decoder.17.norm1.bias",
|
| 261 |
+
"decoder.17.attn.qkv.bias",
|
| 262 |
+
"decoder.17.attn.proj.bias",
|
| 263 |
+
"decoder.17.attn.q_norm.weight",
|
| 264 |
+
"decoder.17.attn.q_norm.bias",
|
| 265 |
+
"decoder.17.attn.k_norm.weight",
|
| 266 |
+
"decoder.17.attn.k_norm.bias",
|
| 267 |
+
"decoder.17.ls1.gamma",
|
| 268 |
+
"decoder.17.norm2.weight",
|
| 269 |
+
"decoder.17.norm2.bias",
|
| 270 |
+
"decoder.17.mlp.fc1.bias",
|
| 271 |
+
"decoder.17.mlp.fc2.bias",
|
| 272 |
+
"decoder.17.ls2.gamma",
|
| 273 |
+
"decoder.18.norm1.weight",
|
| 274 |
+
"decoder.18.norm1.bias",
|
| 275 |
+
"decoder.18.attn.qkv.bias",
|
| 276 |
+
"decoder.18.attn.proj.bias",
|
| 277 |
+
"decoder.18.attn.q_norm.weight",
|
| 278 |
+
"decoder.18.attn.q_norm.bias",
|
| 279 |
+
"decoder.18.attn.k_norm.weight",
|
| 280 |
+
"decoder.18.attn.k_norm.bias",
|
| 281 |
+
"decoder.18.ls1.gamma",
|
| 282 |
+
"decoder.18.norm2.weight",
|
| 283 |
+
"decoder.18.norm2.bias",
|
| 284 |
+
"decoder.18.mlp.fc1.bias",
|
| 285 |
+
"decoder.18.mlp.fc2.bias",
|
| 286 |
+
"decoder.18.ls2.gamma",
|
| 287 |
+
"decoder.19.norm1.weight",
|
| 288 |
+
"decoder.19.norm1.bias",
|
| 289 |
+
"decoder.19.attn.qkv.bias",
|
| 290 |
+
"decoder.19.attn.proj.bias",
|
| 291 |
+
"decoder.19.attn.q_norm.weight",
|
| 292 |
+
"decoder.19.attn.q_norm.bias",
|
| 293 |
+
"decoder.19.attn.k_norm.weight",
|
| 294 |
+
"decoder.19.attn.k_norm.bias",
|
| 295 |
+
"decoder.19.ls1.gamma",
|
| 296 |
+
"decoder.19.norm2.weight",
|
| 297 |
+
"decoder.19.norm2.bias",
|
| 298 |
+
"decoder.19.mlp.fc1.bias",
|
| 299 |
+
"decoder.19.mlp.fc2.bias",
|
| 300 |
+
"decoder.19.ls2.gamma",
|
| 301 |
+
"decoder.20.norm1.weight",
|
| 302 |
+
"decoder.20.norm1.bias",
|
| 303 |
+
"decoder.20.attn.qkv.bias",
|
| 304 |
+
"decoder.20.attn.proj.bias",
|
| 305 |
+
"decoder.20.attn.q_norm.weight",
|
| 306 |
+
"decoder.20.attn.q_norm.bias",
|
| 307 |
+
"decoder.20.attn.k_norm.weight",
|
| 308 |
+
"decoder.20.attn.k_norm.bias",
|
| 309 |
+
"decoder.20.ls1.gamma",
|
| 310 |
+
"decoder.20.norm2.weight",
|
| 311 |
+
"decoder.20.norm2.bias",
|
| 312 |
+
"decoder.20.mlp.fc1.bias",
|
| 313 |
+
"decoder.20.mlp.fc2.bias",
|
| 314 |
+
"decoder.20.ls2.gamma",
|
| 315 |
+
"decoder.21.norm1.weight",
|
| 316 |
+
"decoder.21.norm1.bias",
|
| 317 |
+
"decoder.21.attn.qkv.bias",
|
| 318 |
+
"decoder.21.attn.proj.bias",
|
| 319 |
+
"decoder.21.attn.q_norm.weight",
|
| 320 |
+
"decoder.21.attn.q_norm.bias",
|
| 321 |
+
"decoder.21.attn.k_norm.weight",
|
| 322 |
+
"decoder.21.attn.k_norm.bias",
|
| 323 |
+
"decoder.21.ls1.gamma",
|
| 324 |
+
"decoder.21.norm2.weight",
|
| 325 |
+
"decoder.21.norm2.bias",
|
| 326 |
+
"decoder.21.mlp.fc1.bias",
|
| 327 |
+
"decoder.21.mlp.fc2.bias",
|
| 328 |
+
"decoder.21.ls2.gamma",
|
| 329 |
+
"decoder.22.norm1.weight",
|
| 330 |
+
"decoder.22.norm1.bias",
|
| 331 |
+
"decoder.22.attn.qkv.bias",
|
| 332 |
+
"decoder.22.attn.proj.bias",
|
| 333 |
+
"decoder.22.attn.q_norm.weight",
|
| 334 |
+
"decoder.22.attn.q_norm.bias",
|
| 335 |
+
"decoder.22.attn.k_norm.weight",
|
| 336 |
+
"decoder.22.attn.k_norm.bias",
|
| 337 |
+
"decoder.22.ls1.gamma",
|
| 338 |
+
"decoder.22.norm2.weight",
|
| 339 |
+
"decoder.22.norm2.bias",
|
| 340 |
+
"decoder.22.mlp.fc1.bias",
|
| 341 |
+
"decoder.22.mlp.fc2.bias",
|
| 342 |
+
"decoder.22.ls2.gamma",
|
| 343 |
+
"decoder.23.norm1.weight",
|
| 344 |
+
"decoder.23.norm1.bias",
|
| 345 |
+
"decoder.23.attn.qkv.bias",
|
| 346 |
+
"decoder.23.attn.proj.bias",
|
| 347 |
+
"decoder.23.attn.q_norm.weight",
|
| 348 |
+
"decoder.23.attn.q_norm.bias",
|
| 349 |
+
"decoder.23.attn.k_norm.weight",
|
| 350 |
+
"decoder.23.attn.k_norm.bias",
|
| 351 |
+
"decoder.23.ls1.gamma",
|
| 352 |
+
"decoder.23.norm2.weight",
|
| 353 |
+
"decoder.23.norm2.bias",
|
| 354 |
+
"decoder.23.mlp.fc1.bias",
|
| 355 |
+
"decoder.23.mlp.fc2.bias",
|
| 356 |
+
"decoder.23.ls2.gamma",
|
| 357 |
+
"decoder.24.norm1.weight",
|
| 358 |
+
"decoder.24.norm1.bias",
|
| 359 |
+
"decoder.24.attn.qkv.bias",
|
| 360 |
+
"decoder.24.attn.proj.bias",
|
| 361 |
+
"decoder.24.attn.q_norm.weight",
|
| 362 |
+
"decoder.24.attn.q_norm.bias",
|
| 363 |
+
"decoder.24.attn.k_norm.weight",
|
| 364 |
+
"decoder.24.attn.k_norm.bias",
|
| 365 |
+
"decoder.24.ls1.gamma",
|
| 366 |
+
"decoder.24.norm2.weight",
|
| 367 |
+
"decoder.24.norm2.bias",
|
| 368 |
+
"decoder.24.mlp.fc1.bias",
|
| 369 |
+
"decoder.24.mlp.fc2.bias",
|
| 370 |
+
"decoder.24.ls2.gamma",
|
| 371 |
+
"decoder.25.norm1.weight",
|
| 372 |
+
"decoder.25.norm1.bias",
|
| 373 |
+
"decoder.25.attn.qkv.bias",
|
| 374 |
+
"decoder.25.attn.proj.bias",
|
| 375 |
+
"decoder.25.attn.q_norm.weight",
|
| 376 |
+
"decoder.25.attn.q_norm.bias",
|
| 377 |
+
"decoder.25.attn.k_norm.weight",
|
| 378 |
+
"decoder.25.attn.k_norm.bias",
|
| 379 |
+
"decoder.25.ls1.gamma",
|
| 380 |
+
"decoder.25.norm2.weight",
|
| 381 |
+
"decoder.25.norm2.bias",
|
| 382 |
+
"decoder.25.mlp.fc1.bias",
|
| 383 |
+
"decoder.25.mlp.fc2.bias",
|
| 384 |
+
"decoder.25.ls2.gamma",
|
| 385 |
+
"decoder.26.norm1.weight",
|
| 386 |
+
"decoder.26.norm1.bias",
|
| 387 |
+
"decoder.26.attn.qkv.bias",
|
| 388 |
+
"decoder.26.attn.proj.bias",
|
| 389 |
+
"decoder.26.attn.q_norm.weight",
|
| 390 |
+
"decoder.26.attn.q_norm.bias",
|
| 391 |
+
"decoder.26.attn.k_norm.weight",
|
| 392 |
+
"decoder.26.attn.k_norm.bias",
|
| 393 |
+
"decoder.26.ls1.gamma",
|
| 394 |
+
"decoder.26.norm2.weight",
|
| 395 |
+
"decoder.26.norm2.bias",
|
| 396 |
+
"decoder.26.mlp.fc1.bias",
|
| 397 |
+
"decoder.26.mlp.fc2.bias",
|
| 398 |
+
"decoder.26.ls2.gamma",
|
| 399 |
+
"decoder.27.norm1.weight",
|
| 400 |
+
"decoder.27.norm1.bias",
|
| 401 |
+
"decoder.27.attn.qkv.bias",
|
| 402 |
+
"decoder.27.attn.proj.bias",
|
| 403 |
+
"decoder.27.attn.q_norm.weight",
|
| 404 |
+
"decoder.27.attn.q_norm.bias",
|
| 405 |
+
"decoder.27.attn.k_norm.weight",
|
| 406 |
+
"decoder.27.attn.k_norm.bias",
|
| 407 |
+
"decoder.27.ls1.gamma",
|
| 408 |
+
"decoder.27.norm2.weight",
|
| 409 |
+
"decoder.27.norm2.bias",
|
| 410 |
+
"decoder.27.mlp.fc1.bias",
|
| 411 |
+
"decoder.27.mlp.fc2.bias",
|
| 412 |
+
"decoder.27.ls2.gamma",
|
| 413 |
+
"decoder.28.norm1.weight",
|
| 414 |
+
"decoder.28.norm1.bias",
|
| 415 |
+
"decoder.28.attn.qkv.bias",
|
| 416 |
+
"decoder.28.attn.proj.bias",
|
| 417 |
+
"decoder.28.attn.q_norm.weight",
|
| 418 |
+
"decoder.28.attn.q_norm.bias",
|
| 419 |
+
"decoder.28.attn.k_norm.weight",
|
| 420 |
+
"decoder.28.attn.k_norm.bias",
|
| 421 |
+
"decoder.28.ls1.gamma",
|
| 422 |
+
"decoder.28.norm2.weight",
|
| 423 |
+
"decoder.28.norm2.bias",
|
| 424 |
+
"decoder.28.mlp.fc1.bias",
|
| 425 |
+
"decoder.28.mlp.fc2.bias",
|
| 426 |
+
"decoder.28.ls2.gamma",
|
| 427 |
+
"decoder.29.norm1.weight",
|
| 428 |
+
"decoder.29.norm1.bias",
|
| 429 |
+
"decoder.29.attn.qkv.bias",
|
| 430 |
+
"decoder.29.attn.proj.bias",
|
| 431 |
+
"decoder.29.attn.q_norm.weight",
|
| 432 |
+
"decoder.29.attn.q_norm.bias",
|
| 433 |
+
"decoder.29.attn.k_norm.weight",
|
| 434 |
+
"decoder.29.attn.k_norm.bias",
|
| 435 |
+
"decoder.29.ls1.gamma",
|
| 436 |
+
"decoder.29.norm2.weight",
|
| 437 |
+
"decoder.29.norm2.bias",
|
| 438 |
+
"decoder.29.mlp.fc1.bias",
|
| 439 |
+
"decoder.29.mlp.fc2.bias",
|
| 440 |
+
"decoder.29.ls2.gamma",
|
| 441 |
+
"decoder.30.norm1.weight",
|
| 442 |
+
"decoder.30.norm1.bias",
|
| 443 |
+
"decoder.30.attn.qkv.bias",
|
| 444 |
+
"decoder.30.attn.proj.bias",
|
| 445 |
+
"decoder.30.attn.q_norm.weight",
|
| 446 |
+
"decoder.30.attn.q_norm.bias",
|
| 447 |
+
"decoder.30.attn.k_norm.weight",
|
| 448 |
+
"decoder.30.attn.k_norm.bias",
|
| 449 |
+
"decoder.30.ls1.gamma",
|
| 450 |
+
"decoder.30.norm2.weight",
|
| 451 |
+
"decoder.30.norm2.bias",
|
| 452 |
+
"decoder.30.mlp.fc1.bias",
|
| 453 |
+
"decoder.30.mlp.fc2.bias",
|
| 454 |
+
"decoder.30.ls2.gamma",
|
| 455 |
+
"decoder.31.norm1.weight",
|
| 456 |
+
"decoder.31.norm1.bias",
|
| 457 |
+
"decoder.31.attn.qkv.bias",
|
| 458 |
+
"decoder.31.attn.proj.bias",
|
| 459 |
+
"decoder.31.attn.q_norm.weight",
|
| 460 |
+
"decoder.31.attn.q_norm.bias",
|
| 461 |
+
"decoder.31.attn.k_norm.weight",
|
| 462 |
+
"decoder.31.attn.k_norm.bias",
|
| 463 |
+
"decoder.31.ls1.gamma",
|
| 464 |
+
"decoder.31.norm2.weight",
|
| 465 |
+
"decoder.31.norm2.bias",
|
| 466 |
+
"decoder.31.mlp.fc1.bias",
|
| 467 |
+
"decoder.31.mlp.fc2.bias",
|
| 468 |
+
"decoder.31.ls2.gamma",
|
| 469 |
+
"decoder.32.norm1.weight",
|
| 470 |
+
"decoder.32.norm1.bias",
|
| 471 |
+
"decoder.32.attn.qkv.bias",
|
| 472 |
+
"decoder.32.attn.proj.bias",
|
| 473 |
+
"decoder.32.attn.q_norm.weight",
|
| 474 |
+
"decoder.32.attn.q_norm.bias",
|
| 475 |
+
"decoder.32.attn.k_norm.weight",
|
| 476 |
+
"decoder.32.attn.k_norm.bias",
|
| 477 |
+
"decoder.32.ls1.gamma",
|
| 478 |
+
"decoder.32.norm2.weight",
|
| 479 |
+
"decoder.32.norm2.bias",
|
| 480 |
+
"decoder.32.mlp.fc1.bias",
|
| 481 |
+
"decoder.32.mlp.fc2.bias",
|
| 482 |
+
"decoder.32.ls2.gamma",
|
| 483 |
+
"decoder.33.norm1.weight",
|
| 484 |
+
"decoder.33.norm1.bias",
|
| 485 |
+
"decoder.33.attn.qkv.bias",
|
| 486 |
+
"decoder.33.attn.proj.bias",
|
| 487 |
+
"decoder.33.attn.q_norm.weight",
|
| 488 |
+
"decoder.33.attn.q_norm.bias",
|
| 489 |
+
"decoder.33.attn.k_norm.weight",
|
| 490 |
+
"decoder.33.attn.k_norm.bias",
|
| 491 |
+
"decoder.33.ls1.gamma",
|
| 492 |
+
"decoder.33.norm2.weight",
|
| 493 |
+
"decoder.33.norm2.bias",
|
| 494 |
+
"decoder.33.mlp.fc1.bias",
|
| 495 |
+
"decoder.33.mlp.fc2.bias",
|
| 496 |
+
"decoder.33.ls2.gamma",
|
| 497 |
+
"decoder.34.norm1.weight",
|
| 498 |
+
"decoder.34.norm1.bias",
|
| 499 |
+
"decoder.34.attn.qkv.bias",
|
| 500 |
+
"decoder.34.attn.proj.bias",
|
| 501 |
+
"decoder.34.attn.q_norm.weight",
|
| 502 |
+
"decoder.34.attn.q_norm.bias",
|
| 503 |
+
"decoder.34.attn.k_norm.weight",
|
| 504 |
+
"decoder.34.attn.k_norm.bias",
|
| 505 |
+
"decoder.34.ls1.gamma",
|
| 506 |
+
"decoder.34.norm2.weight",
|
| 507 |
+
"decoder.34.norm2.bias",
|
| 508 |
+
"decoder.34.mlp.fc1.bias",
|
| 509 |
+
"decoder.34.mlp.fc2.bias",
|
| 510 |
+
"decoder.34.ls2.gamma",
|
| 511 |
+
"decoder.35.norm1.weight",
|
| 512 |
+
"decoder.35.norm1.bias",
|
| 513 |
+
"decoder.35.attn.qkv.bias",
|
| 514 |
+
"decoder.35.attn.proj.bias",
|
| 515 |
+
"decoder.35.attn.q_norm.weight",
|
| 516 |
+
"decoder.35.attn.q_norm.bias",
|
| 517 |
+
"decoder.35.attn.k_norm.weight",
|
| 518 |
+
"decoder.35.attn.k_norm.bias",
|
| 519 |
+
"decoder.35.ls1.gamma",
|
| 520 |
+
"decoder.35.norm2.weight",
|
| 521 |
+
"decoder.35.norm2.bias",
|
| 522 |
+
"decoder.35.mlp.fc1.bias",
|
| 523 |
+
"decoder.35.mlp.fc2.bias",
|
| 524 |
+
"decoder.35.ls2.gamma",
|
| 525 |
+
"point_decoder.projects.bias",
|
| 526 |
+
"point_decoder.blocks.0.norm1.weight",
|
| 527 |
+
"point_decoder.blocks.0.norm1.bias",
|
| 528 |
+
"point_decoder.blocks.0.attn.qkv.bias",
|
| 529 |
+
"point_decoder.blocks.0.attn.proj.bias",
|
| 530 |
+
"point_decoder.blocks.0.norm2.weight",
|
| 531 |
+
"point_decoder.blocks.0.norm2.bias",
|
| 532 |
+
"point_decoder.blocks.0.mlp.fc1.bias",
|
| 533 |
+
"point_decoder.blocks.0.mlp.fc2.bias",
|
| 534 |
+
"point_decoder.blocks.1.norm1.weight",
|
| 535 |
+
"point_decoder.blocks.1.norm1.bias",
|
| 536 |
+
"point_decoder.blocks.1.attn.qkv.bias",
|
| 537 |
+
"point_decoder.blocks.1.attn.proj.bias",
|
| 538 |
+
"point_decoder.blocks.1.norm2.weight",
|
| 539 |
+
"point_decoder.blocks.1.norm2.bias",
|
| 540 |
+
"point_decoder.blocks.1.mlp.fc1.bias",
|
| 541 |
+
"point_decoder.blocks.1.mlp.fc2.bias",
|
| 542 |
+
"point_decoder.blocks.2.norm1.weight",
|
| 543 |
+
"point_decoder.blocks.2.norm1.bias",
|
| 544 |
+
"point_decoder.blocks.2.attn.qkv.bias",
|
| 545 |
+
"point_decoder.blocks.2.attn.proj.bias",
|
| 546 |
+
"point_decoder.blocks.2.norm2.weight",
|
| 547 |
+
"point_decoder.blocks.2.norm2.bias",
|
| 548 |
+
"point_decoder.blocks.2.mlp.fc1.bias",
|
| 549 |
+
"point_decoder.blocks.2.mlp.fc2.bias",
|
| 550 |
+
"point_decoder.blocks.3.norm1.weight",
|
| 551 |
+
"point_decoder.blocks.3.norm1.bias",
|
| 552 |
+
"point_decoder.blocks.3.attn.qkv.bias",
|
| 553 |
+
"point_decoder.blocks.3.attn.proj.bias",
|
| 554 |
+
"point_decoder.blocks.3.norm2.weight",
|
| 555 |
+
"point_decoder.blocks.3.norm2.bias",
|
| 556 |
+
"point_decoder.blocks.3.mlp.fc1.bias",
|
| 557 |
+
"point_decoder.blocks.3.mlp.fc2.bias",
|
| 558 |
+
"point_decoder.blocks.4.norm1.weight",
|
| 559 |
+
"point_decoder.blocks.4.norm1.bias",
|
| 560 |
+
"point_decoder.blocks.4.attn.qkv.bias",
|
| 561 |
+
"point_decoder.blocks.4.attn.proj.bias",
|
| 562 |
+
"point_decoder.blocks.4.norm2.weight",
|
| 563 |
+
"point_decoder.blocks.4.norm2.bias",
|
| 564 |
+
"point_decoder.blocks.4.mlp.fc1.bias",
|
| 565 |
+
"point_decoder.blocks.4.mlp.fc2.bias",
|
| 566 |
+
"point_decoder.linear_out.bias",
|
| 567 |
+
"point_head.proj.bias",
|
| 568 |
+
"conf_decoder.projects.bias",
|
| 569 |
+
"conf_decoder.blocks.0.norm1.weight",
|
| 570 |
+
"conf_decoder.blocks.0.norm1.bias",
|
| 571 |
+
"conf_decoder.blocks.0.attn.qkv.bias",
|
| 572 |
+
"conf_decoder.blocks.0.attn.proj.bias",
|
| 573 |
+
"conf_decoder.blocks.0.norm2.weight",
|
| 574 |
+
"conf_decoder.blocks.0.norm2.bias",
|
| 575 |
+
"conf_decoder.blocks.0.mlp.fc1.bias",
|
| 576 |
+
"conf_decoder.blocks.0.mlp.fc2.bias",
|
| 577 |
+
"conf_decoder.blocks.1.norm1.weight",
|
| 578 |
+
"conf_decoder.blocks.1.norm1.bias",
|
| 579 |
+
"conf_decoder.blocks.1.attn.qkv.bias",
|
| 580 |
+
"conf_decoder.blocks.1.attn.proj.bias",
|
| 581 |
+
"conf_decoder.blocks.1.norm2.weight",
|
| 582 |
+
"conf_decoder.blocks.1.norm2.bias",
|
| 583 |
+
"conf_decoder.blocks.1.mlp.fc1.bias",
|
| 584 |
+
"conf_decoder.blocks.1.mlp.fc2.bias",
|
| 585 |
+
"conf_decoder.blocks.2.norm1.weight",
|
| 586 |
+
"conf_decoder.blocks.2.norm1.bias",
|
| 587 |
+
"conf_decoder.blocks.2.attn.qkv.bias",
|
| 588 |
+
"conf_decoder.blocks.2.attn.proj.bias",
|
| 589 |
+
"conf_decoder.blocks.2.norm2.weight",
|
| 590 |
+
"conf_decoder.blocks.2.norm2.bias",
|
| 591 |
+
"conf_decoder.blocks.2.mlp.fc1.bias",
|
| 592 |
+
"conf_decoder.blocks.2.mlp.fc2.bias",
|
| 593 |
+
"conf_decoder.blocks.3.norm1.weight",
|
| 594 |
+
"conf_decoder.blocks.3.norm1.bias",
|
| 595 |
+
"conf_decoder.blocks.3.attn.qkv.bias",
|
| 596 |
+
"conf_decoder.blocks.3.attn.proj.bias",
|
| 597 |
+
"conf_decoder.blocks.3.norm2.weight",
|
| 598 |
+
"conf_decoder.blocks.3.norm2.bias",
|
| 599 |
+
"conf_decoder.blocks.3.mlp.fc1.bias",
|
| 600 |
+
"conf_decoder.blocks.3.mlp.fc2.bias",
|
| 601 |
+
"conf_decoder.blocks.4.norm1.weight",
|
| 602 |
+
"conf_decoder.blocks.4.norm1.bias",
|
| 603 |
+
"conf_decoder.blocks.4.attn.qkv.bias",
|
| 604 |
+
"conf_decoder.blocks.4.attn.proj.bias",
|
| 605 |
+
"conf_decoder.blocks.4.norm2.weight",
|
| 606 |
+
"conf_decoder.blocks.4.norm2.bias",
|
| 607 |
+
"conf_decoder.blocks.4.mlp.fc1.bias",
|
| 608 |
+
"conf_decoder.blocks.4.mlp.fc2.bias",
|
| 609 |
+
"conf_decoder.linear_out.bias",
|
| 610 |
+
"conf_head.proj.bias",
|
| 611 |
+
"camera_decoder.projects.bias",
|
| 612 |
+
"camera_decoder.blocks.0.norm1.weight",
|
| 613 |
+
"camera_decoder.blocks.0.norm1.bias",
|
| 614 |
+
"camera_decoder.blocks.0.attn.qkv.bias",
|
| 615 |
+
"camera_decoder.blocks.0.attn.proj.bias",
|
| 616 |
+
"camera_decoder.blocks.0.norm2.weight",
|
| 617 |
+
"camera_decoder.blocks.0.norm2.bias",
|
| 618 |
+
"camera_decoder.blocks.0.mlp.fc1.bias",
|
| 619 |
+
"camera_decoder.blocks.0.mlp.fc2.bias",
|
| 620 |
+
"camera_decoder.blocks.1.norm1.weight",
|
| 621 |
+
"camera_decoder.blocks.1.norm1.bias",
|
| 622 |
+
"camera_decoder.blocks.1.attn.qkv.bias",
|
| 623 |
+
"camera_decoder.blocks.1.attn.proj.bias",
|
| 624 |
+
"camera_decoder.blocks.1.norm2.weight",
|
| 625 |
+
"camera_decoder.blocks.1.norm2.bias",
|
| 626 |
+
"camera_decoder.blocks.1.mlp.fc1.bias",
|
| 627 |
+
"camera_decoder.blocks.1.mlp.fc2.bias",
|
| 628 |
+
"camera_decoder.blocks.2.norm1.weight",
|
| 629 |
+
"camera_decoder.blocks.2.norm1.bias",
|
| 630 |
+
"camera_decoder.blocks.2.attn.qkv.bias",
|
| 631 |
+
"camera_decoder.blocks.2.attn.proj.bias",
|
| 632 |
+
"camera_decoder.blocks.2.norm2.weight",
|
| 633 |
+
"camera_decoder.blocks.2.norm2.bias",
|
| 634 |
+
"camera_decoder.blocks.2.mlp.fc1.bias",
|
| 635 |
+
"camera_decoder.blocks.2.mlp.fc2.bias",
|
| 636 |
+
"camera_decoder.blocks.3.norm1.weight",
|
| 637 |
+
"camera_decoder.blocks.3.norm1.bias",
|
| 638 |
+
"camera_decoder.blocks.3.attn.qkv.bias",
|
| 639 |
+
"camera_decoder.blocks.3.attn.proj.bias",
|
| 640 |
+
"camera_decoder.blocks.3.norm2.weight",
|
| 641 |
+
"camera_decoder.blocks.3.norm2.bias",
|
| 642 |
+
"camera_decoder.blocks.3.mlp.fc1.bias",
|
| 643 |
+
"camera_decoder.blocks.3.mlp.fc2.bias",
|
| 644 |
+
"camera_decoder.blocks.4.norm1.weight",
|
| 645 |
+
"camera_decoder.blocks.4.norm1.bias",
|
| 646 |
+
"camera_decoder.blocks.4.attn.qkv.bias",
|
| 647 |
+
"camera_decoder.blocks.4.attn.proj.bias",
|
| 648 |
+
"camera_decoder.blocks.4.norm2.weight",
|
| 649 |
+
"camera_decoder.blocks.4.norm2.bias",
|
| 650 |
+
"camera_decoder.blocks.4.mlp.fc1.bias",
|
| 651 |
+
"camera_decoder.blocks.4.mlp.fc2.bias",
|
| 652 |
+
"camera_decoder.linear_out.bias",
|
| 653 |
+
"camera_head.res_conv.0.res_conv1.bias",
|
| 654 |
+
"camera_head.res_conv.0.res_conv2.bias",
|
| 655 |
+
"camera_head.res_conv.0.res_conv3.bias",
|
| 656 |
+
"camera_head.res_conv.1.res_conv1.bias",
|
| 657 |
+
"camera_head.res_conv.1.res_conv2.bias",
|
| 658 |
+
"camera_head.res_conv.1.res_conv3.bias",
|
| 659 |
+
"camera_head.more_mlps.0.bias",
|
| 660 |
+
"camera_head.more_mlps.2.bias",
|
| 661 |
+
"camera_head.fc_t.bias",
|
| 662 |
+
"camera_head.fc_rot.bias"
|
| 663 |
+
],
|
| 664 |
+
"lr_scale": 1.0
|
| 665 |
+
},
|
| 666 |
+
"decay": {
|
| 667 |
+
"weight_decay": 0.05,
|
| 668 |
+
"params": [
|
| 669 |
+
"decoder.0.attn.qkv.weight",
|
| 670 |
+
"decoder.0.attn.proj.weight",
|
| 671 |
+
"decoder.0.mlp.fc1.weight",
|
| 672 |
+
"decoder.0.mlp.fc2.weight",
|
| 673 |
+
"decoder.1.attn.qkv.weight",
|
| 674 |
+
"decoder.1.attn.proj.weight",
|
| 675 |
+
"decoder.1.mlp.fc1.weight",
|
| 676 |
+
"decoder.1.mlp.fc2.weight",
|
| 677 |
+
"decoder.2.attn.qkv.weight",
|
| 678 |
+
"decoder.2.attn.proj.weight",
|
| 679 |
+
"decoder.2.mlp.fc1.weight",
|
| 680 |
+
"decoder.2.mlp.fc2.weight",
|
| 681 |
+
"decoder.3.attn.qkv.weight",
|
| 682 |
+
"decoder.3.attn.proj.weight",
|
| 683 |
+
"decoder.3.mlp.fc1.weight",
|
| 684 |
+
"decoder.3.mlp.fc2.weight",
|
| 685 |
+
"decoder.4.attn.qkv.weight",
|
| 686 |
+
"decoder.4.attn.proj.weight",
|
| 687 |
+
"decoder.4.mlp.fc1.weight",
|
| 688 |
+
"decoder.4.mlp.fc2.weight",
|
| 689 |
+
"decoder.5.attn.qkv.weight",
|
| 690 |
+
"decoder.5.attn.proj.weight",
|
| 691 |
+
"decoder.5.mlp.fc1.weight",
|
| 692 |
+
"decoder.5.mlp.fc2.weight",
|
| 693 |
+
"decoder.6.attn.qkv.weight",
|
| 694 |
+
"decoder.6.attn.proj.weight",
|
| 695 |
+
"decoder.6.mlp.fc1.weight",
|
| 696 |
+
"decoder.6.mlp.fc2.weight",
|
| 697 |
+
"decoder.7.attn.qkv.weight",
|
| 698 |
+
"decoder.7.attn.proj.weight",
|
| 699 |
+
"decoder.7.mlp.fc1.weight",
|
| 700 |
+
"decoder.7.mlp.fc2.weight",
|
| 701 |
+
"decoder.8.attn.qkv.weight",
|
| 702 |
+
"decoder.8.attn.proj.weight",
|
| 703 |
+
"decoder.8.mlp.fc1.weight",
|
| 704 |
+
"decoder.8.mlp.fc2.weight",
|
| 705 |
+
"decoder.9.attn.qkv.weight",
|
| 706 |
+
"decoder.9.attn.proj.weight",
|
| 707 |
+
"decoder.9.mlp.fc1.weight",
|
| 708 |
+
"decoder.9.mlp.fc2.weight",
|
| 709 |
+
"decoder.10.attn.qkv.weight",
|
| 710 |
+
"decoder.10.attn.proj.weight",
|
| 711 |
+
"decoder.10.mlp.fc1.weight",
|
| 712 |
+
"decoder.10.mlp.fc2.weight",
|
| 713 |
+
"decoder.11.attn.qkv.weight",
|
| 714 |
+
"decoder.11.attn.proj.weight",
|
| 715 |
+
"decoder.11.mlp.fc1.weight",
|
| 716 |
+
"decoder.11.mlp.fc2.weight",
|
| 717 |
+
"decoder.12.attn.qkv.weight",
|
| 718 |
+
"decoder.12.attn.proj.weight",
|
| 719 |
+
"decoder.12.mlp.fc1.weight",
|
| 720 |
+
"decoder.12.mlp.fc2.weight",
|
| 721 |
+
"decoder.13.attn.qkv.weight",
|
| 722 |
+
"decoder.13.attn.proj.weight",
|
| 723 |
+
"decoder.13.mlp.fc1.weight",
|
| 724 |
+
"decoder.13.mlp.fc2.weight",
|
| 725 |
+
"decoder.14.attn.qkv.weight",
|
| 726 |
+
"decoder.14.attn.proj.weight",
|
| 727 |
+
"decoder.14.mlp.fc1.weight",
|
| 728 |
+
"decoder.14.mlp.fc2.weight",
|
| 729 |
+
"decoder.15.attn.qkv.weight",
|
| 730 |
+
"decoder.15.attn.proj.weight",
|
| 731 |
+
"decoder.15.mlp.fc1.weight",
|
| 732 |
+
"decoder.15.mlp.fc2.weight",
|
| 733 |
+
"decoder.16.attn.qkv.weight",
|
| 734 |
+
"decoder.16.attn.proj.weight",
|
| 735 |
+
"decoder.16.mlp.fc1.weight",
|
| 736 |
+
"decoder.16.mlp.fc2.weight",
|
| 737 |
+
"decoder.17.attn.qkv.weight",
|
| 738 |
+
"decoder.17.attn.proj.weight",
|
| 739 |
+
"decoder.17.mlp.fc1.weight",
|
| 740 |
+
"decoder.17.mlp.fc2.weight",
|
| 741 |
+
"decoder.18.attn.qkv.weight",
|
| 742 |
+
"decoder.18.attn.proj.weight",
|
| 743 |
+
"decoder.18.mlp.fc1.weight",
|
| 744 |
+
"decoder.18.mlp.fc2.weight",
|
| 745 |
+
"decoder.19.attn.qkv.weight",
|
| 746 |
+
"decoder.19.attn.proj.weight",
|
| 747 |
+
"decoder.19.mlp.fc1.weight",
|
| 748 |
+
"decoder.19.mlp.fc2.weight",
|
| 749 |
+
"decoder.20.attn.qkv.weight",
|
| 750 |
+
"decoder.20.attn.proj.weight",
|
| 751 |
+
"decoder.20.mlp.fc1.weight",
|
| 752 |
+
"decoder.20.mlp.fc2.weight",
|
| 753 |
+
"decoder.21.attn.qkv.weight",
|
| 754 |
+
"decoder.21.attn.proj.weight",
|
| 755 |
+
"decoder.21.mlp.fc1.weight",
|
| 756 |
+
"decoder.21.mlp.fc2.weight",
|
| 757 |
+
"decoder.22.attn.qkv.weight",
|
| 758 |
+
"decoder.22.attn.proj.weight",
|
| 759 |
+
"decoder.22.mlp.fc1.weight",
|
| 760 |
+
"decoder.22.mlp.fc2.weight",
|
| 761 |
+
"decoder.23.attn.qkv.weight",
|
| 762 |
+
"decoder.23.attn.proj.weight",
|
| 763 |
+
"decoder.23.mlp.fc1.weight",
|
| 764 |
+
"decoder.23.mlp.fc2.weight",
|
| 765 |
+
"decoder.24.attn.qkv.weight",
|
| 766 |
+
"decoder.24.attn.proj.weight",
|
| 767 |
+
"decoder.24.mlp.fc1.weight",
|
| 768 |
+
"decoder.24.mlp.fc2.weight",
|
| 769 |
+
"decoder.25.attn.qkv.weight",
|
| 770 |
+
"decoder.25.attn.proj.weight",
|
| 771 |
+
"decoder.25.mlp.fc1.weight",
|
| 772 |
+
"decoder.25.mlp.fc2.weight",
|
| 773 |
+
"decoder.26.attn.qkv.weight",
|
| 774 |
+
"decoder.26.attn.proj.weight",
|
| 775 |
+
"decoder.26.mlp.fc1.weight",
|
| 776 |
+
"decoder.26.mlp.fc2.weight",
|
| 777 |
+
"decoder.27.attn.qkv.weight",
|
| 778 |
+
"decoder.27.attn.proj.weight",
|
| 779 |
+
"decoder.27.mlp.fc1.weight",
|
| 780 |
+
"decoder.27.mlp.fc2.weight",
|
| 781 |
+
"decoder.28.attn.qkv.weight",
|
| 782 |
+
"decoder.28.attn.proj.weight",
|
| 783 |
+
"decoder.28.mlp.fc1.weight",
|
| 784 |
+
"decoder.28.mlp.fc2.weight",
|
| 785 |
+
"decoder.29.attn.qkv.weight",
|
| 786 |
+
"decoder.29.attn.proj.weight",
|
| 787 |
+
"decoder.29.mlp.fc1.weight",
|
| 788 |
+
"decoder.29.mlp.fc2.weight",
|
| 789 |
+
"decoder.30.attn.qkv.weight",
|
| 790 |
+
"decoder.30.attn.proj.weight",
|
| 791 |
+
"decoder.30.mlp.fc1.weight",
|
| 792 |
+
"decoder.30.mlp.fc2.weight",
|
| 793 |
+
"decoder.31.attn.qkv.weight",
|
| 794 |
+
"decoder.31.attn.proj.weight",
|
| 795 |
+
"decoder.31.mlp.fc1.weight",
|
| 796 |
+
"decoder.31.mlp.fc2.weight",
|
| 797 |
+
"decoder.32.attn.qkv.weight",
|
| 798 |
+
"decoder.32.attn.proj.weight",
|
| 799 |
+
"decoder.32.mlp.fc1.weight",
|
| 800 |
+
"decoder.32.mlp.fc2.weight",
|
| 801 |
+
"decoder.33.attn.qkv.weight",
|
| 802 |
+
"decoder.33.attn.proj.weight",
|
| 803 |
+
"decoder.33.mlp.fc1.weight",
|
| 804 |
+
"decoder.33.mlp.fc2.weight",
|
| 805 |
+
"decoder.34.attn.qkv.weight",
|
| 806 |
+
"decoder.34.attn.proj.weight",
|
| 807 |
+
"decoder.34.mlp.fc1.weight",
|
| 808 |
+
"decoder.34.mlp.fc2.weight",
|
| 809 |
+
"decoder.35.attn.qkv.weight",
|
| 810 |
+
"decoder.35.attn.proj.weight",
|
| 811 |
+
"decoder.35.mlp.fc1.weight",
|
| 812 |
+
"decoder.35.mlp.fc2.weight",
|
| 813 |
+
"point_decoder.projects.weight",
|
| 814 |
+
"point_decoder.blocks.0.attn.qkv.weight",
|
| 815 |
+
"point_decoder.blocks.0.attn.proj.weight",
|
| 816 |
+
"point_decoder.blocks.0.mlp.fc1.weight",
|
| 817 |
+
"point_decoder.blocks.0.mlp.fc2.weight",
|
| 818 |
+
"point_decoder.blocks.1.attn.qkv.weight",
|
| 819 |
+
"point_decoder.blocks.1.attn.proj.weight",
|
| 820 |
+
"point_decoder.blocks.1.mlp.fc1.weight",
|
| 821 |
+
"point_decoder.blocks.1.mlp.fc2.weight",
|
| 822 |
+
"point_decoder.blocks.2.attn.qkv.weight",
|
| 823 |
+
"point_decoder.blocks.2.attn.proj.weight",
|
| 824 |
+
"point_decoder.blocks.2.mlp.fc1.weight",
|
| 825 |
+
"point_decoder.blocks.2.mlp.fc2.weight",
|
| 826 |
+
"point_decoder.blocks.3.attn.qkv.weight",
|
| 827 |
+
"point_decoder.blocks.3.attn.proj.weight",
|
| 828 |
+
"point_decoder.blocks.3.mlp.fc1.weight",
|
| 829 |
+
"point_decoder.blocks.3.mlp.fc2.weight",
|
| 830 |
+
"point_decoder.blocks.4.attn.qkv.weight",
|
| 831 |
+
"point_decoder.blocks.4.attn.proj.weight",
|
| 832 |
+
"point_decoder.blocks.4.mlp.fc1.weight",
|
| 833 |
+
"point_decoder.blocks.4.mlp.fc2.weight",
|
| 834 |
+
"point_decoder.linear_out.weight",
|
| 835 |
+
"point_head.proj.weight",
|
| 836 |
+
"conf_decoder.projects.weight",
|
| 837 |
+
"conf_decoder.blocks.0.attn.qkv.weight",
|
| 838 |
+
"conf_decoder.blocks.0.attn.proj.weight",
|
| 839 |
+
"conf_decoder.blocks.0.mlp.fc1.weight",
|
| 840 |
+
"conf_decoder.blocks.0.mlp.fc2.weight",
|
| 841 |
+
"conf_decoder.blocks.1.attn.qkv.weight",
|
| 842 |
+
"conf_decoder.blocks.1.attn.proj.weight",
|
| 843 |
+
"conf_decoder.blocks.1.mlp.fc1.weight",
|
| 844 |
+
"conf_decoder.blocks.1.mlp.fc2.weight",
|
| 845 |
+
"conf_decoder.blocks.2.attn.qkv.weight",
|
| 846 |
+
"conf_decoder.blocks.2.attn.proj.weight",
|
| 847 |
+
"conf_decoder.blocks.2.mlp.fc1.weight",
|
| 848 |
+
"conf_decoder.blocks.2.mlp.fc2.weight",
|
| 849 |
+
"conf_decoder.blocks.3.attn.qkv.weight",
|
| 850 |
+
"conf_decoder.blocks.3.attn.proj.weight",
|
| 851 |
+
"conf_decoder.blocks.3.mlp.fc1.weight",
|
| 852 |
+
"conf_decoder.blocks.3.mlp.fc2.weight",
|
| 853 |
+
"conf_decoder.blocks.4.attn.qkv.weight",
|
| 854 |
+
"conf_decoder.blocks.4.attn.proj.weight",
|
| 855 |
+
"conf_decoder.blocks.4.mlp.fc1.weight",
|
| 856 |
+
"conf_decoder.blocks.4.mlp.fc2.weight",
|
| 857 |
+
"conf_decoder.linear_out.weight",
|
| 858 |
+
"conf_head.proj.weight",
|
| 859 |
+
"camera_decoder.projects.weight",
|
| 860 |
+
"camera_decoder.blocks.0.attn.qkv.weight",
|
| 861 |
+
"camera_decoder.blocks.0.attn.proj.weight",
|
| 862 |
+
"camera_decoder.blocks.0.mlp.fc1.weight",
|
| 863 |
+
"camera_decoder.blocks.0.mlp.fc2.weight",
|
| 864 |
+
"camera_decoder.blocks.1.attn.qkv.weight",
|
| 865 |
+
"camera_decoder.blocks.1.attn.proj.weight",
|
| 866 |
+
"camera_decoder.blocks.1.mlp.fc1.weight",
|
| 867 |
+
"camera_decoder.blocks.1.mlp.fc2.weight",
|
| 868 |
+
"camera_decoder.blocks.2.attn.qkv.weight",
|
| 869 |
+
"camera_decoder.blocks.2.attn.proj.weight",
|
| 870 |
+
"camera_decoder.blocks.2.mlp.fc1.weight",
|
| 871 |
+
"camera_decoder.blocks.2.mlp.fc2.weight",
|
| 872 |
+
"camera_decoder.blocks.3.attn.qkv.weight",
|
| 873 |
+
"camera_decoder.blocks.3.attn.proj.weight",
|
| 874 |
+
"camera_decoder.blocks.3.mlp.fc1.weight",
|
| 875 |
+
"camera_decoder.blocks.3.mlp.fc2.weight",
|
| 876 |
+
"camera_decoder.blocks.4.attn.qkv.weight",
|
| 877 |
+
"camera_decoder.blocks.4.attn.proj.weight",
|
| 878 |
+
"camera_decoder.blocks.4.mlp.fc1.weight",
|
| 879 |
+
"camera_decoder.blocks.4.mlp.fc2.weight",
|
| 880 |
+
"camera_decoder.linear_out.weight",
|
| 881 |
+
"camera_head.res_conv.0.res_conv1.weight",
|
| 882 |
+
"camera_head.res_conv.0.res_conv2.weight",
|
| 883 |
+
"camera_head.res_conv.0.res_conv3.weight",
|
| 884 |
+
"camera_head.res_conv.1.res_conv1.weight",
|
| 885 |
+
"camera_head.res_conv.1.res_conv2.weight",
|
| 886 |
+
"camera_head.res_conv.1.res_conv3.weight",
|
| 887 |
+
"camera_head.more_mlps.0.weight",
|
| 888 |
+
"camera_head.more_mlps.2.weight",
|
| 889 |
+
"camera_head.fc_t.weight",
|
| 890 |
+
"camera_head.fc_rot.weight"
|
| 891 |
+
],
|
| 892 |
+
"lr_scale": 1.0
|
| 893 |
+
}
|
| 894 |
+
}
|
| 895 |
+
[2026-05-02 09:32:22,943][croco.utils.misc][INFO] - [RANK 0] Resume checkpoint /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_8gpu/checkpoint-last.pth
|
| 896 |
+
[2026-05-02 09:32:22,968][croco.utils.misc][INFO] - [RANK 0] Moving optimizer state to device: cuda:0
|
| 897 |
+
[2026-05-02 09:32:22,979][croco.utils.misc][INFO] - [RANK 0] & best_so_far=inf
|
| 898 |
+
[2026-05-02 09:32:22,980][croco.utils.misc][INFO] - [RANK 0] With optim & sched! start_epoch=0
|
| 899 |
+
[2026-05-02 09:32:26,731][__main__][INFO] - [RANK 0] Start training for 10 epochs
|
| 900 |
+
[2026-05-02 09:32:26,735][__main__][INFO] - [RANK 0] log_dir: /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_16gpu/
|
| 901 |
+
[2026-05-02 09:34:11,147][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 0/1087] eta: 1 day, 7:31:30 lr: 0.000000 epoch: 0.0000 (0.0000) step: 0.0000 (0.0000) loss: 5081.2871 (5081.2871) Lcamera_frontend: 4.1017 (4.1017) Ldepth_frontend: 3.8557 (3.8557) Lpmap_frontend: 9.5382 (9.5382) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 4.0905 (4.0905) Ldepth_mix: 3.8498 (3.8498) Lpmap_mix: 9.5206 (9.5206) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 4.1017 (4.1017) Ldepth_backend: 3.8447 (3.8447) Lpmap_backend: 9.5162 (9.5162) Ltrack_backend: 0.0000 (0.0000) total: 5081.2871 (5081.2871) time: 104.4072 data: 28.0688 max mem: 37991
|
| 902 |
+
[2026-05-02 09:43:10,412][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 10/1087] eta: 17:30:11 lr: 0.000000 epoch: 0.0046 (0.0046) step: 5.0000 (5.0000) loss: 5081.2871 (4411.0437) Lcamera_frontend: 4.1017 (3.5236) Ldepth_frontend: 4.5584 (5.0590) Lpmap_frontend: 10.3262 (10.2973) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 4.0905 (3.5128) Ldepth_mix: 4.5539 (5.0601) Lpmap_mix: 10.3126 (10.2883) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 4.1017 (3.5232) Ldepth_backend: 4.5483 (5.0615) Lpmap_backend: 10.3111 (10.2872) Ltrack_backend: 0.0000 (0.0000) total: 5081.2871 (4411.0437) time: 58.5066 data: 2.6019 max mem: 78413
|
| 903 |
+
[2026-05-02 09:52:24,250][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 20/1087] eta: 16:53:59 lr: 0.000000 epoch: 0.0092 (0.0092) step: 10.0000 (10.0000) loss: 3951.7617 (3824.7185) Lcamera_frontend: 3.1362 (3.0310) Ldepth_frontend: 4.9514 (5.2168) Lpmap_frontend: 10.4624 (10.5351) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.1248 (3.0233) Ldepth_mix: 4.9618 (5.2202) Lpmap_mix: 10.4505 (10.5235) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.1384 (3.0304) Ldepth_backend: 4.9725 (5.2236) Lpmap_backend: 10.4471 (10.5182) Ltrack_backend: 0.0000 (0.0000) total: 3951.7617 (3824.7185) time: 54.6495 data: 0.0480 max mem: 78608
|
| 904 |
+
[2026-05-02 10:01:28,558][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 30/1087] eta: 16:29:46 lr: 0.000001 epoch: 0.0184 (0.0138) step: 20.0000 (15.0000) loss: 3951.7617 (3864.5277) Lcamera_frontend: 3.1362 (3.0669) Ldepth_frontend: 4.4016 (5.0846) Lpmap_frontend: 10.4417 (10.3998) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.1248 (3.0592) Ldepth_mix: 4.4010 (5.0863) Lpmap_mix: 10.4259 (10.3879) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.1384 (3.0663) Ldepth_backend: 4.4006 (5.0880) Lpmap_backend: 10.4266 (10.3822) Ltrack_backend: 0.0000 (0.0000) total: 3951.7617 (3864.5277) time: 54.9066 data: 0.0411 max mem: 78608
|
| 905 |
+
[2026-05-02 10:10:45,804][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 40/1087] eta: 16:18:27 lr: 0.000001 epoch: 0.0276 (0.0184) step: 30.0000 (20.0000) loss: 4213.7056 (3929.6543) Lcamera_frontend: 3.3595 (3.1187) Ldepth_frontend: 4.4016 (5.1378) Lpmap_frontend: 10.8743 (10.5532) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.3562 (3.1121) Ldepth_mix: 4.4010 (5.1384) Lpmap_mix: 10.8731 (10.5401) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.3618 (3.1185) Ldepth_backend: 4.4006 (5.1391) Lpmap_backend: 10.8691 (10.5334) Ltrack_backend: 0.0000 (0.0000) total: 4213.7056 (3929.6543) time: 55.0775 data: 0.0382 max mem: 78608
|
| 906 |
+
[2026-05-02 10:19:57,442][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 50/1087] eta: 16:06:01 lr: 0.000001 epoch: 0.0368 (0.0230) step: 40.0000 (25.0000) loss: 3829.2021 (4129.3481) Lcamera_frontend: 3.0496 (3.2863) Ldepth_frontend: 4.7301 (5.0672) Lpmap_frontend: 10.8536 (10.4917) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.0203 (3.2795) Ldepth_mix: 4.7268 (5.0677) Lpmap_mix: 10.8350 (10.4786) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.0508 (3.2863) Ldepth_backend: 4.7242 (5.0684) Lpmap_backend: 10.8184 (10.4724) Ltrack_backend: 0.0000 (0.0000) total: 3829.2021 (4129.3481) time: 55.4440 data: 0.0356 max mem: 78608
|
| 907 |
+
[2026-05-02 10:29:03,164][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 60/1087] eta: 15:53:00 lr: 0.000001 epoch: 0.0460 (0.0276) step: 50.0000 (30.0000) loss: 4683.7441 (4309.0910) Lcamera_frontend: 3.7755 (3.4374) Ldepth_frontend: 4.3455 (4.9989) Lpmap_frontend: 10.2377 (10.4435) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.7603 (3.4294) Ldepth_mix: 4.3478 (4.9995) Lpmap_mix: 10.2274 (10.4308) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.7750 (3.4373) Ldepth_backend: 4.3490 (5.0004) Lpmap_backend: 10.2330 (10.4253) Ltrack_backend: 0.0000 (0.0000) total: 4683.7441 (4309.0910) time: 54.8678 data: 0.0347 max mem: 78608
|
| 908 |
+
[2026-05-02 10:38:19,102][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 70/1087] eta: 15:43:31 lr: 0.000001 epoch: 0.0552 (0.0322) step: 60.0000 (35.0000) loss: 4761.5264 (4381.1845) Lcamera_frontend: 3.7872 (3.4981) Ldepth_frontend: 4.1337 (4.9323) Lpmap_frontend: 10.1581 (10.4514) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.7779 (3.4900) Ldepth_mix: 4.1218 (4.9323) Lpmap_mix: 10.1423 (10.4382) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.7848 (3.4980) Ldepth_backend: 4.1105 (4.9326) Lpmap_backend: 10.1356 (10.4330) Ltrack_backend: 0.0000 (0.0000) total: 4761.5264 (4381.1845) time: 55.0822 data: 0.0375 max mem: 78608
|
| 909 |
+
[2026-05-02 10:47:36,569][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 80/1087] eta: 15:34:24 lr: 0.000001 epoch: 0.0644 (0.0368) step: 70.0000 (40.0000) loss: 4113.8643 (4172.3970) Lcamera_frontend: 3.2871 (3.3233) Ldepth_frontend: 4.7628 (5.0174) Lpmap_frontend: 10.1581 (10.4412) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.2732 (3.3154) Ldepth_mix: 4.7717 (5.0186) Lpmap_mix: 10.1423 (10.4278) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.2865 (3.3232) Ldepth_backend: 4.7719 (5.0199) Lpmap_backend: 10.1356 (10.4220) Ltrack_backend: 0.0000 (0.0000) total: 4113.8643 (4172.3970) time: 55.6685 data: 0.0385 max mem: 78608
|
| 910 |
+
[2026-05-02 10:56:52,385][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 90/1087] eta: 15:24:57 lr: 0.000002 epoch: 0.0736 (0.0414) step: 80.0000 (45.0000) loss: 4400.7505 (4400.3412) Lcamera_frontend: 3.5322 (3.5147) Ldepth_frontend: 4.7628 (4.9277) Lpmap_frontend: 10.2119 (10.3956) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.5118 (3.5057) Ldepth_mix: 4.7717 (4.9287) Lpmap_mix: 10.1763 (10.3818) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.5319 (3.5146) Ldepth_backend: 4.7719 (4.9298) Lpmap_backend: 10.1456 (10.3764) Ltrack_backend: 0.0000 (0.0000) total: 4400.7505 (4400.3412) time: 55.6631 data: 0.0348 max mem: 78608
|
| 911 |
+
[2026-05-02 11:06:10,773][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 100/1087] eta: 15:15:58 lr: 0.000002 epoch: 0.0828 (0.0460) step: 90.0000 (49.9901) loss: 4485.5415 (4313.6692) Lcamera_frontend: 3.5890 (3.4421) Ldepth_frontend: 4.7662 (4.9857) Lpmap_frontend: 10.1037 (10.3825) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.5679 (3.4326) Ldepth_mix: 4.7777 (4.9867) Lpmap_mix: 10.0893 (10.3682) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.5857 (3.4420) Ldepth_backend: 4.7806 (4.9878) Lpmap_backend: 10.0853 (10.3625) Ltrack_backend: 0.0000 (0.0000) total: 4485.5415 (4313.6692) time: 55.7100 data: 0.0358 max mem: 78608
|
| 912 |
+
[2026-05-02 11:15:41,530][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 110/1087] eta: 15:08:43 lr: 0.000002 epoch: 0.0920 (0.0506) step: 100.0000 (54.9910) loss: 3342.0063 (4247.8372) Lcamera_frontend: 2.6406 (3.3874) Ldepth_frontend: 5.0779 (5.0109) Lpmap_frontend: 9.9820 (10.3502) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.6192 (3.3771) Ldepth_mix: 5.0791 (5.0126) Lpmap_mix: 9.9622 (10.3355) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.6308 (3.3872) Ldepth_backend: 5.0791 (5.0143) Lpmap_backend: 9.9510 (10.3298) Ltrack_backend: 0.0000 (0.0000) total: 3342.0063 (4247.8372) time: 56.4571 data: 0.0390 max mem: 78608
|
| 913 |
+
[2026-05-02 11:24:58,761][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 120/1087] eta: 14:59:19 lr: 0.000002 epoch: 0.1012 (0.0552) step: 110.0000 (59.9917) loss: 3342.0063 (4146.7741) Lcamera_frontend: 2.6406 (3.3030) Ldepth_frontend: 5.5876 (5.0812) Lpmap_frontend: 9.6714 (10.3031) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.6192 (3.2928) Ldepth_mix: 5.6178 (5.0840) Lpmap_mix: 9.6466 (10.2883) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.6308 (3.3027) Ldepth_backend: 5.6452 (5.0867) Lpmap_backend: 9.6339 (10.2825) Ltrack_backend: 0.0000 (0.0000) total: 3342.0063 (4146.7741) time: 56.3993 data: 0.0383 max mem: 78608
|
| 914 |
+
[2026-05-02 11:34:10,888][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 130/1087] eta: 14:49:17 lr: 0.000002 epoch: 0.1104 (0.0598) step: 120.0000 (64.9924) loss: 2631.2615 (4164.0187) Lcamera_frontend: 2.0486 (3.3171) Ldepth_frontend: 6.1372 (5.1172) Lpmap_frontend: 9.6384 (10.2905) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.0071 (3.3068) Ldepth_mix: 6.1516 (5.1200) Lpmap_mix: 9.6245 (10.2759) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.0492 (3.3169) Ldepth_backend: 6.1575 (5.1227) Lpmap_backend: 9.6327 (10.2702) Ltrack_backend: 0.0000 (0.0000) total: 2631.2615 (4164.0187) time: 55.4666 data: 0.0347 max mem: 78608
|
| 915 |
+
[2026-05-02 11:43:18,641][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 140/1087] eta: 14:38:54 lr: 0.000003 epoch: 0.1196 (0.0644) step: 130.0000 (69.9929) loss: 4879.4307 (4204.6965) Lcamera_frontend: 3.9324 (3.3515) Ldepth_frontend: 4.2465 (5.0660) Lpmap_frontend: 10.0131 (10.2895) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.9262 (3.3411) Ldepth_mix: 4.2521 (5.0687) Lpmap_mix: 9.9941 (10.2750) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.9341 (3.3513) Ldepth_backend: 4.2526 (5.0711) Lpmap_backend: 9.9882 (10.2696) Ltrack_backend: 0.0000 (0.0000) total: 4879.4307 (4204.6965) time: 54.9915 data: 0.0353 max mem: 78608
|
| 916 |
+
[2026-05-02 11:52:28,470][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 150/1087] eta: 14:28:53 lr: 0.000003 epoch: 0.1288 (0.0690) step: 140.0000 (74.9934) loss: 4827.8979 (4246.8539) Lcamera_frontend: 3.8852 (3.3870) Ldepth_frontend: 4.4470 (5.0662) Lpmap_frontend: 9.9330 (10.2601) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.8699 (3.3755) Ldepth_mix: 4.4516 (5.0692) Lpmap_mix: 9.9155 (10.2457) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.8851 (3.3868) Ldepth_backend: 4.4521 (5.0718) Lpmap_backend: 9.9124 (10.2407) Ltrack_backend: 0.0000 (0.0000) total: 4827.8979 (4246.8539) time: 54.8778 data: 0.0360 max mem: 78608
|
| 917 |
+
[2026-05-02 12:01:38,236][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 160/1087] eta: 14:18:59 lr: 0.000003 epoch: 0.1380 (0.0736) step: 150.0000 (79.9876) loss: 4349.0220 (4246.2913) Lcamera_frontend: 3.4843 (3.3870) Ldepth_frontend: 4.4890 (5.0494) Lpmap_frontend: 9.5575 (10.2359) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.4510 (3.3753) Ldepth_mix: 4.4851 (5.0522) Lpmap_mix: 9.5402 (10.2213) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.4829 (3.3868) Ldepth_backend: 4.4813 (5.0547) Lpmap_backend: 9.5382 (10.2163) Ltrack_backend: 0.0000 (0.0000) total: 4349.0220 (4246.2913) time: 54.9796 data: 0.0352 max mem: 78608
|
| 918 |
+
[2026-05-02 12:10:55,561][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 170/1087] eta: 14:09:50 lr: 0.000003 epoch: 0.1472 (0.0782) step: 160.0000 (84.9883) loss: 3598.7285 (4161.5525) Lcamera_frontend: 2.8777 (3.3157) Ldepth_frontend: 4.9348 (5.0914) Lpmap_frontend: 10.0642 (10.2605) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8557 (3.3044) Ldepth_mix: 4.9476 (5.0938) Lpmap_mix: 10.0399 (10.2456) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.8747 (3.3155) Ldepth_backend: 4.9643 (5.0959) Lpmap_backend: 10.0282 (10.2402) Ltrack_backend: 0.0000 (0.0000) total: 3598.7285 (4161.5525) time: 55.3544 data: 0.0373 max mem: 78608
|
| 919 |
+
[2026-05-02 12:20:10,516][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 180/1087] eta: 14:00:29 lr: 0.000003 epoch: 0.1564 (0.0828) step: 170.0000 (89.9890) loss: 2764.1545 (4105.6668) Lcamera_frontend: 2.1421 (3.2688) Ldepth_frontend: 4.9348 (5.1130) Lpmap_frontend: 10.8191 (10.2671) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.1340 (3.2577) Ldepth_mix: 4.9225 (5.1157) Lpmap_mix: 10.8135 (10.2519) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.1400 (3.2686) Ldepth_backend: 4.9108 (5.1179) Lpmap_backend: 10.8064 (10.2462) Ltrack_backend: 0.0000 (0.0000) total: 2764.1545 (4105.6668) time: 55.6139 data: 0.0548 max mem: 78608
|
| 920 |
+
[2026-05-02 12:29:22,152][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 190/1087] eta: 13:50:52 lr: 0.000003 epoch: 0.1656 (0.0874) step: 180.0000 (94.9895) loss: 3678.9834 (4134.5890) Lcamera_frontend: 2.8929 (3.2924) Ldepth_frontend: 4.6002 (5.1258) Lpmap_frontend: 10.6992 (10.2954) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8951 (3.2812) Ldepth_mix: 4.5934 (5.1285) Lpmap_mix: 10.6930 (10.2804) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.8936 (3.2923) Ldepth_backend: 4.5875 (5.1308) Lpmap_backend: 10.6846 (10.2748) Ltrack_backend: 0.0000 (0.0000) total: 3678.9834 (4134.5890) time: 55.3294 data: 0.0520 max mem: 78608
|
| 921 |
+
[2026-05-02 12:38:26,897][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 200/1087] eta: 13:40:47 lr: 0.000004 epoch: 0.1748 (0.0920) step: 190.0000 (99.9851) loss: 4778.2319 (4191.8865) Lcamera_frontend: 3.8469 (3.3410) Ldepth_frontend: 4.0356 (5.0816) Lpmap_frontend: 10.1077 (10.2607) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.7730 (3.3290) Ldepth_mix: 4.0307 (5.0842) Lpmap_mix: 10.0821 (10.2454) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.8457 (3.3410) Ldepth_backend: 4.0243 (5.0865) Lpmap_backend: 10.0822 (10.2402) Ltrack_backend: 0.0000 (0.0000) total: 4778.2319 (4191.8865) time: 54.8173 data: 0.0351 max mem: 78608
|
| 922 |
+
[2026-05-02 12:47:46,992][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 210/1087] eta: 13:31:52 lr: 0.000004 epoch: 0.1840 (0.0966) step: 200.0000 (104.9858) loss: 4876.9844 (4207.8285) Lcamera_frontend: 3.9352 (3.3538) Ldepth_frontend: 4.4036 (5.0991) Lpmap_frontend: 10.3494 (10.2996) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.9222 (3.3418) Ldepth_mix: 4.3864 (5.1015) Lpmap_mix: 10.3330 (10.2847) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.9361 (3.3537) Ldepth_backend: 4.3715 (5.1037) Lpmap_backend: 10.3395 (10.2797) Ltrack_backend: 0.0000 (0.0000) total: 4876.9844 (4207.8285) time: 55.2393 data: 0.0366 max mem: 78608
|
| 923 |
+
[2026-05-02 12:57:14,917][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 220/1087] eta: 13:23:26 lr: 0.000004 epoch: 0.1932 (0.1012) step: 210.0000 (109.9864) loss: 2947.5679 (4140.6311) Lcamera_frontend: 2.2958 (3.2974) Ldepth_frontend: 5.4840 (5.1449) Lpmap_frontend: 10.8195 (10.2870) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.2882 (3.2854) Ldepth_mix: 5.4841 (5.1478) Lpmap_mix: 10.8116 (10.2721) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.2958 (3.2973) Ldepth_backend: 5.4839 (5.1505) Lpmap_backend: 10.8040 (10.2669) Ltrack_backend: 0.0000 (0.0000) total: 2947.5679 (4140.6311) time: 56.4000 data: 0.0393 max mem: 78608
|
| 924 |
+
[2026-05-02 13:06:25,838][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 230/1087] eta: 13:13:51 lr: 0.000004 epoch: 0.2024 (0.1058) step: 220.0000 (114.9870) loss: 1353.0189 (4095.3858) Lcamera_frontend: 0.9452 (3.2596) Ldepth_frontend: 6.3815 (5.1701) Lpmap_frontend: 9.9192 (10.2703) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 0.9340 (3.2473) Ldepth_mix: 6.4087 (5.1736) Lpmap_mix: 9.8808 (10.2555) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 0.9452 (3.2596) Ldepth_backend: 6.4298 (5.1769) Lpmap_backend: 9.8618 (10.2505) Ltrack_backend: 0.0000 (0.0000) total: 1353.0189 (4095.3858) time: 55.9422 data: 0.0393 max mem: 78608
|
| 925 |
+
[2026-05-02 13:15:42,474][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 240/1087] eta: 13:04:38 lr: 0.000004 epoch: 0.2116 (0.1104) step: 230.0000 (119.9834) loss: 4021.2910 (4077.2525) Lcamera_frontend: 3.2231 (3.2445) Ldepth_frontend: 4.7492 (5.1738) Lpmap_frontend: 9.9525 (10.2648) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.1804 (3.2323) Ldepth_mix: 4.7549 (5.1775) Lpmap_mix: 9.9478 (10.2501) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.2225 (3.2445) Ldepth_backend: 4.7598 (5.1811) Lpmap_backend: 9.9530 (10.2452) Ltrack_backend: 0.0000 (0.0000) total: 4021.2910 (4077.2525) time: 55.3777 data: 0.0362 max mem: 78608
|
| 926 |
+
[2026-05-02 13:24:54,521][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 250/1087] eta: 12:55:10 lr: 0.000005 epoch: 0.2208 (0.1150) step: 240.0000 (124.9841) loss: 4021.2910 (4053.2323) Lcamera_frontend: 3.2231 (3.2247) Ldepth_frontend: 4.7114 (5.1795) Lpmap_frontend: 9.6903 (10.2431) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.1804 (3.2119) Ldepth_mix: 4.7149 (5.1835) Lpmap_mix: 9.6684 (10.2283) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.2225 (3.2246) Ldepth_backend: 4.7115 (5.1873) Lpmap_backend: 9.6601 (10.2235) Ltrack_backend: 0.0000 (0.0000) total: 4021.2910 (4053.2323) time: 55.4340 data: 0.0339 max mem: 78608
|
| 927 |
+
[2026-05-02 13:34:12,265][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 260/1087] eta: 12:46:00 lr: 0.000005 epoch: 0.2300 (0.1196) step: 250.0000 (129.9847) loss: 4096.2056 (4089.7413) Lcamera_frontend: 3.2443 (3.2553) Ldepth_frontend: 4.7496 (5.1702) Lpmap_frontend: 9.2404 (10.2283) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.2141 (3.2423) Ldepth_mix: 4.7389 (5.1742) Lpmap_mix: 9.2168 (10.2135) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.2442 (3.2553) Ldepth_backend: 4.7272 (5.1780) Lpmap_backend: 9.2088 (10.2088) Ltrack_backend: 0.0000 (0.0000) total: 4096.2056 (4089.7413) time: 55.4881 data: 0.0340 max mem: 78608
|
| 928 |
+
[2026-05-02 13:43:32,442][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 270/1087] eta: 12:36:58 lr: 0.000005 epoch: 0.2392 (0.1242) step: 260.0000 (134.9852) loss: 4514.8652 (4075.8639) Lcamera_frontend: 3.6334 (3.2437) Ldepth_frontend: 4.7496 (5.1786) Lpmap_frontend: 9.8342 (10.2301) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.6047 (3.2308) Ldepth_mix: 4.7389 (5.1824) Lpmap_mix: 9.8209 (10.2152) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.6336 (3.2436) Ldepth_backend: 4.7272 (5.1862) Lpmap_backend: 9.8241 (10.2105) Ltrack_backend: 0.0000 (0.0000) total: 4514.8652 (4075.8639) time: 55.8938 data: 0.0351 max mem: 78608
|
| 929 |
+
[2026-05-02 13:52:48,804][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 280/1087] eta: 12:27:43 lr: 0.000005 epoch: 0.2484 (0.1288) step: 270.0000 (139.9822) loss: 4863.7710 (4143.1015) Lcamera_frontend: 3.9223 (3.3000) Ldepth_frontend: 4.6959 (5.1673) Lpmap_frontend: 9.7367 (10.2134) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.9067 (3.2868) Ldepth_mix: 4.7100 (5.1713) Lpmap_mix: 9.7192 (10.1985) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.9224 (3.3000) Ldepth_backend: 4.7193 (5.1750) Lpmap_backend: 9.7193 (10.1939) Ltrack_backend: 0.0000 (0.0000) total: 4863.7710 (4143.1015) time: 55.8260 data: 0.0352 max mem: 78608
|
| 930 |
+
[2026-05-02 14:02:02,794][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 290/1087] eta: 12:18:22 lr: 0.000005 epoch: 0.2576 (0.1334) step: 280.0000 (144.9828) loss: 4426.5898 (4129.0049) Lcamera_frontend: 3.5444 (3.2882) Ldepth_frontend: 4.6959 (5.1711) Lpmap_frontend: 9.6949 (10.2120) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.5284 (3.2750) Ldepth_mix: 4.7100 (5.1751) Lpmap_mix: 9.6806 (10.1968) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.5446 (3.2882) Ldepth_backend: 4.7193 (5.1789) Lpmap_backend: 9.6830 (10.1921) Ltrack_backend: 0.0000 (0.0000) total: 4426.5898 (4129.0049) time: 55.5175 data: 0.0352 max mem: 78608
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/mytrain.py
ADDED
|
@@ -0,0 +1,601 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# --------------------------------------------------------
|
| 2 |
+
# training code for CUT3R
|
| 3 |
+
# --------------------------------------------------------
|
| 4 |
+
# References:
|
| 5 |
+
# DUSt3R: https://github.com/naver/dust3r
|
| 6 |
+
# --------------------------------------------------------
|
| 7 |
+
import argparse
|
| 8 |
+
import datetime
|
| 9 |
+
import json
|
| 10 |
+
import numpy as np
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
import time
|
| 14 |
+
import math
|
| 15 |
+
from collections import defaultdict
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Sized
|
| 18 |
+
from itertools import islice
|
| 19 |
+
|
| 20 |
+
import torch
|
| 21 |
+
import torch.backends.cudnn as cudnn
|
| 22 |
+
import torch.nn.functional as F
|
| 23 |
+
from torch.utils.tensorboard import SummaryWriter
|
| 24 |
+
|
| 25 |
+
torch.backends.cuda.matmul.allow_tf32 = True # for gpu >= Ampere and pytorch >= 1.12
|
| 26 |
+
|
| 27 |
+
from dust3r.model import (
|
| 28 |
+
PreTrainedModel,
|
| 29 |
+
ARCroco3DStereo,
|
| 30 |
+
ARCroco3DStereoConfig,
|
| 31 |
+
inf,
|
| 32 |
+
strip_module,
|
| 33 |
+
) # noqa: F401, needed when loading the model
|
| 34 |
+
from dust3r.datasets import get_data_loader
|
| 35 |
+
from dust3r.losses_noteacher import * # noqa: F401, needed when loading the model
|
| 36 |
+
from dust3r.inference import loss_of_one_batch # noqa
|
| 37 |
+
from dust3r.viz import colorize
|
| 38 |
+
from dust3r.utils.render import get_render_results
|
| 39 |
+
import dust3r.utils.path_to_croco # noqa: F401
|
| 40 |
+
import croco.utils.misc as misc # noqa
|
| 41 |
+
from croco.utils.misc import NativeScalerWithGradNormCount as NativeScaler # noqa
|
| 42 |
+
|
| 43 |
+
import hydra
|
| 44 |
+
from omegaconf import OmegaConf
|
| 45 |
+
import logging
|
| 46 |
+
import pathlib
|
| 47 |
+
from tqdm import tqdm
|
| 48 |
+
import random
|
| 49 |
+
import builtins
|
| 50 |
+
import shutil
|
| 51 |
+
|
| 52 |
+
from accelerate import Accelerator
|
| 53 |
+
from accelerate import DistributedDataParallelKwargs, InitProcessGroupKwargs
|
| 54 |
+
from accelerate.logging import get_logger
|
| 55 |
+
from datetime import timedelta
|
| 56 |
+
import torch.multiprocessing
|
| 57 |
+
|
| 58 |
+
from slamformer.models.slamformer import SLAMFormer # upstream typo: pi3 → slamformer
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Select the "file_system" strategy for sharing tensors between processes.
# NOTE(review): presumably chosen to avoid running out of file descriptors
# with many DataLoader workers - confirm.
torch.multiprocessing.set_sharing_strategy("file_system")
|
| 62 |
+
|
| 63 |
+
# Module-level logger for this script, created via accelerate.logging.get_logger
# with its log level set to DEBUG.
printer = get_logger(__name__, log_level="DEBUG")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def setup_for_distributed(accelerator: "Accelerator"):
    """
    Replace ``builtins.print`` with a rank-aware, timestamped version.

    After this call, ``print`` only emits output when at least one of the
    following holds: the current process is the main process, the caller
    passes ``force=True``, or the job runs with more than 8 processes.
    Every emitted message is prefixed with the current wall-clock time.

    Args:
        accelerator: object exposing ``is_main_process`` and
            ``num_processes`` (an ``accelerate.Accelerator`` in this script).
    """
    # If an earlier call already installed a wrapper, unwrap it first so that
    # repeated setup does not stack several timestamp prefixes.
    builtin_print = getattr(builtins.print, "_undecorated_print", builtins.print)

    def print(*args, **kwargs):
        # ``force`` is an extension keyword; strip it before delegating.
        force = kwargs.pop("force", False)
        force = force or (accelerator.num_processes > 8)
        if accelerator.is_main_process or force:
            now = datetime.datetime.now().time()
            # Write the timestamp to the same stream as the message; without
            # ``file`` it would always go to stdout, splitting prefix and text.
            builtin_print("[{}] ".format(now), end="", file=kwargs.get("file"))
            builtin_print(*args, **kwargs)

    # Remember the undecorated function so a later call can find it again.
    print._undecorated_print = builtin_print
    builtins.print = print
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def save_current_code(outdir, src_dir="."):
    """
    Snapshot the source tree into ``<outdir>/code/<timestamp>``.

    The timestamp uses the ``%m_%d-%H:%M:%S`` format. Entries matching the
    ignore globs below (editor folders, assets, checkpoints, logs, media
    files, caches, ...) are not copied.

    Args:
        outdir: run output directory; the snapshot is placed beneath it.
        src_dir: root of the tree to copy. Defaults to the current working
            directory, which is what the function always used before.

    Returns:
        The path of the directory the code was copied into.
    """
    now = datetime.datetime.now()  # current date and time
    date_time = now.strftime("%m_%d-%H:%M:%S")
    code_root = os.path.join(outdir, "code")
    dst_dir = os.path.join(code_root, date_time)

    by_pattern = shutil.ignore_patterns(
        ".vscode*",
        "assets*",
        "example*",
        "checkpoints*",
        "OLD*",
        "logs*",
        "out*",
        "runs*",
        "*.png",
        "*.mp4",
        "*__pycache__*",
        "*.git*",
        "*.idea*",
        "*.zip",
        "*.jpg",
    )
    # The destination lives under ``outdir``. When ``outdir`` sits inside the
    # source tree and matches none of the globs, copytree would descend into
    # its own destination and also copy earlier run artifacts, so these paths
    # are excluded explicitly.
    protected = {os.path.abspath(outdir), os.path.abspath(code_root)}

    def _skip(directory, names):
        skipped = set(by_pattern(directory, names))
        skipped.update(
            name
            for name in names
            if os.path.abspath(os.path.join(directory, name)) in protected
        )
        return skipped

    shutil.copytree(
        src_dir,
        dst_dir,
        ignore=_skip,
        dirs_exist_ok=True,
    )
    return dst_dir
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def train(args):
|
| 114 |
+
|
| 115 |
+
accelerator = Accelerator(
|
| 116 |
+
gradient_accumulation_steps=args.accum_iter,
|
| 117 |
+
mixed_precision="bf16",
|
| 118 |
+
kwargs_handlers=[
|
| 119 |
+
DistributedDataParallelKwargs(find_unused_parameters=True),
|
| 120 |
+
InitProcessGroupKwargs(timeout=timedelta(seconds=6000)),
|
| 121 |
+
],
|
| 122 |
+
)
|
| 123 |
+
device = accelerator.device
|
| 124 |
+
|
| 125 |
+
setup_for_distributed(accelerator)
|
| 126 |
+
|
| 127 |
+
printer.info("output_dir: " + args.output_dir)
|
| 128 |
+
if args.output_dir:
|
| 129 |
+
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
|
| 130 |
+
|
| 131 |
+
if accelerator.is_main_process:
|
| 132 |
+
dst_dir = save_current_code(outdir=args.output_dir)
|
| 133 |
+
printer.info(f"Saving current code to {dst_dir}")
|
| 134 |
+
|
| 135 |
+
# auto resume
|
| 136 |
+
if not args.resume:
|
| 137 |
+
last_ckpt_fname = os.path.join(args.output_dir, f"checkpoint-last.pth")
|
| 138 |
+
#last_ckpt_fname = os.path.join(args.output_dir, f"checkpoint-7.pth")
|
| 139 |
+
|
| 140 |
+
args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None
|
| 141 |
+
|
| 142 |
+
printer.info("job dir: {}".format(os.path.dirname(os.path.realpath(__file__))))
|
| 143 |
+
|
| 144 |
+
# fix the seed
|
| 145 |
+
seed = args.seed + accelerator.state.process_index
|
| 146 |
+
printer.info(
|
| 147 |
+
f"Setting seed to {seed} for process {accelerator.state.process_index}"
|
| 148 |
+
)
|
| 149 |
+
torch.manual_seed(seed)
|
| 150 |
+
np.random.seed(seed)
|
| 151 |
+
random.seed(seed)
|
| 152 |
+
cudnn.benchmark = args.benchmark
|
| 153 |
+
|
| 154 |
+
# training dataset and loader
|
| 155 |
+
printer.info("Building train dataset %s", args.train_dataset)
|
| 156 |
+
# dataset and loader
|
| 157 |
+
data_loader_train = build_dataset(
|
| 158 |
+
args.train_dataset,
|
| 159 |
+
args.batch_size,
|
| 160 |
+
args.num_workers,
|
| 161 |
+
accelerator=accelerator,
|
| 162 |
+
test=False,
|
| 163 |
+
fixed_length=args.fixed_length
|
| 164 |
+
)
|
| 165 |
+
printer.info("Building test dataset %s", args.test_dataset)
|
| 166 |
+
data_loader_test = {
|
| 167 |
+
dataset.split("(")[0]: build_dataset(
|
| 168 |
+
dataset,
|
| 169 |
+
args.batch_size,
|
| 170 |
+
args.num_workers,
|
| 171 |
+
accelerator=accelerator,
|
| 172 |
+
test=True,
|
| 173 |
+
fixed_length=True
|
| 174 |
+
)
|
| 175 |
+
for dataset in args.test_dataset.split("+")
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
# model
|
| 179 |
+
printer.info("Loading model")
|
| 180 |
+
model = SLAMFormer()
|
| 181 |
+
teacher = None
|
| 182 |
+
|
| 183 |
+
# model: PreTrainedModel = eval(args.model)
|
| 184 |
+
printer.info(f"All model parameters: {sum(p.numel() for p in model.parameters())}")
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
printer.info(f">> Creating train criterion = {args.train_criterion}")
|
| 188 |
+
train_criterion = eval(args.train_criterion).to(device)
|
| 189 |
+
printer.info(
|
| 190 |
+
f">> Creating test criterion = {args.test_criterion or args.train_criterion}"
|
| 191 |
+
)
|
| 192 |
+
test_criterion = eval(args.test_criterion or args.criterion).to(device)
|
| 193 |
+
|
| 194 |
+
model.to(device)
|
| 195 |
+
|
| 196 |
+
if args.gradient_checkpointing:
|
| 197 |
+
model.gradient_checkpointing_enable()
|
| 198 |
+
if args.long_context:
|
| 199 |
+
model.fixed_input_length = False
|
| 200 |
+
|
| 201 |
+
freeze_keys = None
|
| 202 |
+
print('NOTE:', args.pretrained, args.resume)
|
| 203 |
+
if args.pretrained and not args.resume:
|
| 204 |
+
printer.info(f"Loading pretrained: {args.pretrained}")
|
| 205 |
+
ckpt = torch.load(args.pretrained, map_location=device)
|
| 206 |
+
'''
|
| 207 |
+
ckpt_ = dict()
|
| 208 |
+
for key, v in ckpt.items():
|
| 209 |
+
ckpt_[key[7:]] = v
|
| 210 |
+
'''
|
| 211 |
+
'''
|
| 212 |
+
freeze_keys = list(ckpt.keys())
|
| 213 |
+
|
| 214 |
+
ls = dict()
|
| 215 |
+
for key, v in ckpt.items():
|
| 216 |
+
if 'aggregator' in key:
|
| 217 |
+
key_ = key.replace('aggregator', 'backend_transformer')
|
| 218 |
+
ls[key_] = key
|
| 219 |
+
for key_ in ls.keys():
|
| 220 |
+
key = ls[key_]
|
| 221 |
+
ckpt[key_] = ckpt[key]
|
| 222 |
+
'''
|
| 223 |
+
printer.info(
|
| 224 |
+
model.load_state_dict(ckpt, strict=False)
|
| 225 |
+
)
|
| 226 |
+
del ckpt# in case it occupies memory
|
| 227 |
+
'''
|
| 228 |
+
if freeze_keys is None:
|
| 229 |
+
freeze_keys = []
|
| 230 |
+
|
| 231 |
+
for name, param in model.named_parameters():
|
| 232 |
+
if 'backend_transformer' not in name:
|
| 233 |
+
freeze_keys.append(name)
|
| 234 |
+
'''
|
| 235 |
+
'''
|
| 236 |
+
printer.info("Loading teacher model")
|
| 237 |
+
ckpt_teacher = torch.load(args.teacher, map_location=device)
|
| 238 |
+
teacher.load_state_dict(ckpt_teacher, strict=True)
|
| 239 |
+
teacher = teacher.to("cuda")
|
| 240 |
+
for p in teacher.parameters():
|
| 241 |
+
p.requires_grad = False
|
| 242 |
+
teacher.eval()
|
| 243 |
+
del ckpt_teacher
|
| 244 |
+
|
| 245 |
+
'''
|
| 246 |
+
# freeze
|
| 247 |
+
printer.info("Freezing patch embedding and positional encoding parameters...")
|
| 248 |
+
frozen_params = 0
|
| 249 |
+
total_params = 0
|
| 250 |
+
|
| 251 |
+
frozen_param_names = []
|
| 252 |
+
|
| 253 |
+
for name, param in model.named_parameters():
|
| 254 |
+
total_params += param.numel()
|
| 255 |
+
param.requires_grad = True
|
| 256 |
+
|
| 257 |
+
if hasattr(model, 'encoder'):# and hasattr(model.aggregator, 'patch_embed'):
|
| 258 |
+
for param in model.encoder.parameters():#aggregator.patch_embed.parameters():
|
| 259 |
+
if param.requires_grad:
|
| 260 |
+
param.requires_grad = False
|
| 261 |
+
|
| 262 |
+
if hasattr(model, 'register_token'):
|
| 263 |
+
model.register_token.requires_grad = False
|
| 264 |
+
|
| 265 |
+
# YIJUN: Skip the freezekeys
|
| 266 |
+
'''
|
| 267 |
+
for name, param in model.named_parameters():
|
| 268 |
+
if 'camera_decoder' in name or 'camera_head' in name:
|
| 269 |
+
print(name)
|
| 270 |
+
param.requires_grad = False
|
| 271 |
+
'''
|
| 272 |
+
|
| 273 |
+
for name, p in model.named_parameters():
|
| 274 |
+
if not p.requires_grad:
|
| 275 |
+
frozen_params += p.numel()
|
| 276 |
+
frozen_param_names.append(name)
|
| 277 |
+
|
| 278 |
+
printer.info(
|
| 279 |
+
f"Frozen {frozen_params:,} parameters out of {total_params:,} total parameters. ({frozen_params / total_params:.2%})")
|
| 280 |
+
printer.info(
|
| 281 |
+
f"Trainable parameters: {total_params - frozen_params:,} ({(total_params - frozen_params) / total_params:.2%})")
|
| 282 |
+
if frozen_param_names:
|
| 283 |
+
printer.info(
|
| 284 |
+
f"Example frozen parameters: {', '.join(frozen_param_names[:5])}{'...' if len(frozen_param_names) > 5 else ''}")
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
# following timm: set wd as 0 for bias and norm layers
|
| 289 |
+
param_groups = misc.get_parameter_groups(model, args.weight_decay)
|
| 290 |
+
optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95))
|
| 291 |
+
# print(optimizer)
|
| 292 |
+
loss_scaler = NativeScaler(accelerator=accelerator)
|
| 293 |
+
|
| 294 |
+
best_so_far = misc.load_model(
|
| 295 |
+
args=args, model_without_ddp=model, optimizer=optimizer, loss_scaler=loss_scaler
|
| 296 |
+
)
|
| 297 |
+
if best_so_far is None:
|
| 298 |
+
best_so_far = float("inf")
|
| 299 |
+
|
| 300 |
+
accelerator.even_batches = False
|
| 301 |
+
optimizer, model, data_loader_train = accelerator.prepare(
|
| 302 |
+
optimizer, model, data_loader_train
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
def write_log_stats(epoch, train_stats, test_stats):
|
| 306 |
+
if accelerator.is_main_process:
|
| 307 |
+
if log_writer is not None:
|
| 308 |
+
log_writer.flush()
|
| 309 |
+
|
| 310 |
+
log_stats = dict(
|
| 311 |
+
epoch=epoch, **{f"train_{k}": v for k, v in train_stats.items()}
|
| 312 |
+
)
|
| 313 |
+
for test_name in data_loader_test:
|
| 314 |
+
if test_name not in test_stats:
|
| 315 |
+
continue
|
| 316 |
+
log_stats.update(
|
| 317 |
+
{test_name + "_" + k: v for k, v in test_stats[test_name].items()}
|
| 318 |
+
)
|
| 319 |
+
|
| 320 |
+
with open(
|
| 321 |
+
os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8"
|
| 322 |
+
) as f:
|
| 323 |
+
f.write(json.dumps(log_stats) + "\n")
|
| 324 |
+
|
| 325 |
+
def save_model(epoch, fname, best_so_far, data_iter_step):
|
| 326 |
+
misc.save_model(
|
| 327 |
+
accelerator=accelerator,
|
| 328 |
+
args=args,
|
| 329 |
+
model_without_ddp=model,
|
| 330 |
+
optimizer=optimizer,
|
| 331 |
+
loss_scaler=loss_scaler,
|
| 332 |
+
epoch=epoch,
|
| 333 |
+
step=data_iter_step,
|
| 334 |
+
fname=fname,
|
| 335 |
+
best_so_far=best_so_far,
|
| 336 |
+
)
|
| 337 |
+
|
| 338 |
+
log_writer = (
|
| 339 |
+
SummaryWriter(log_dir=args.output_dir) if accelerator.is_main_process else None
|
| 340 |
+
)
|
| 341 |
+
|
| 342 |
+
printer.info(f"Start training for {args.epochs} epochs")
|
| 343 |
+
start_time = time.time()
|
| 344 |
+
train_stats = test_stats = {}
|
| 345 |
+
|
| 346 |
+
for epoch in range(args.start_epoch, args.epochs + 1):
|
| 347 |
+
|
| 348 |
+
# Save immediately the last checkpoint
|
| 349 |
+
if epoch > args.start_epoch:
|
| 350 |
+
if (
|
| 351 |
+
args.save_freq
|
| 352 |
+
and np.allclose(epoch / args.save_freq, int(epoch / args.save_freq))
|
| 353 |
+
or epoch == args.epochs
|
| 354 |
+
):
|
| 355 |
+
save_model(epoch - 1, "last", best_so_far, args.start_step)
|
| 356 |
+
|
| 357 |
+
new_best = False
|
| 358 |
+
|
| 359 |
+
if epoch > args.start_epoch:
|
| 360 |
+
if args.keep_freq and epoch % args.keep_freq == 0:
|
| 361 |
+
save_model(epoch - 1, str(epoch), best_so_far, args.start_step)
|
| 362 |
+
if new_best:
|
| 363 |
+
save_model(epoch - 1, "best", best_so_far, args.start_step)
|
| 364 |
+
if epoch >= args.epochs:
|
| 365 |
+
break # exit after writing last test to disk
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
# Train
|
| 369 |
+
train_stats = train_one_epoch(
|
| 370 |
+
model,
|
| 371 |
+
teacher,
|
| 372 |
+
train_criterion,
|
| 373 |
+
data_loader_train,
|
| 374 |
+
optimizer,
|
| 375 |
+
accelerator,
|
| 376 |
+
epoch,
|
| 377 |
+
loss_scaler,
|
| 378 |
+
log_writer=log_writer,
|
| 379 |
+
args=args
|
| 380 |
+
)
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
total_time = time.time() - start_time
|
| 384 |
+
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
|
| 385 |
+
printer.info("Training time {}".format(total_time_str))
|
| 386 |
+
|
| 387 |
+
save_final_model(accelerator, args, args.epochs, model, best_so_far=best_so_far)
|
| 388 |
+
|
| 389 |
+
|
| 390 |
+
def save_final_model(accelerator, args, epoch, model_without_ddp, best_so_far=None):
    """Write the end-of-training checkpoint ``checkpoint-final.pth``.

    Args:
        accelerator: Handle forwarded to ``misc.save_on_master``.
        args: Run configuration; ``args.output_dir`` is the target directory
            and the whole object is stored under the ``"args"`` key.
        epoch: Epoch number recorded in the checkpoint.
        model_without_ddp: Either a ready-made state dict, or a module whose
            ``state_dict()`` is taken after moving it to CPU.
        best_so_far: Optional best metric value; only stored when not ``None``.
    """
    target = Path(args.output_dir) / "checkpoint-final.pth"

    # NOTE: for a module, ``.cpu()`` moves the parameters off the accelerator
    # in place; this function is meant to be the last step of a run.
    if isinstance(model_without_ddp, dict):
        weights = model_without_ddp
    else:
        weights = model_without_ddp.cpu().state_dict()

    payload = {"args": args, "model": weights, "epoch": epoch}
    if best_so_far is not None:
        payload["best_so_far"] = best_so_far

    printer.info(f">> Saving model to {target} ...")
    misc.save_on_master(accelerator, payload, target)
|
| 406 |
+
|
| 407 |
+
|
| 408 |
+
def build_dataset(dataset, batch_size, num_workers, accelerator, test=False, fixed_length=False):
    """Build a data loader for ``dataset`` via ``get_data_loader``.

    Args:
        dataset: Dataset specification forwarded to ``get_data_loader``.
        batch_size: Batch size of the loader.
        num_workers: Number of loader worker processes.
        accelerator: Handle forwarded to ``get_data_loader``.
        test: When truthy, build an evaluation loader (no shuffling and no
            dropping of the last incomplete batch).
        fixed_length: Forwarded unchanged to ``get_data_loader``.

    Returns:
        The loader returned by ``get_data_loader``.
    """
    # Use truthiness for the split label, exactly like shuffle/drop_last below,
    # instead of indexing a list with the flag (which fails for truthy
    # non-bool values such as 2 or "yes").
    split = "Test" if test else "Train"
    printer.info(f"Building {split} Data loader for dataset: {dataset}")

    is_train = not test
    return get_data_loader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_mem=True,
        shuffle=is_train,
        drop_last=is_train,
        accelerator=accelerator,
        fixed_length=fixed_length,
    )
|
| 422 |
+
|
| 423 |
+
|
| 424 |
+
def train_one_epoch(
    model: torch.nn.Module,
    teacher: torch.nn.Module,
    criterion: torch.nn.Module,
    data_loader: Sized,
    optimizer: torch.optim.Optimizer,
    accelerator: Accelerator,
    epoch: int,
    loss_scaler,
    args,
    log_writer=None,
):
    """Train ``model`` for one epoch and return the averaged meters.

    Args:
        model: Network being optimised; switched to training mode here.
        teacher: Forwarded unchanged to ``loss_of_one_batch`` as ``teacher``.
        criterion: Loss module forwarded to ``loss_of_one_batch``.
        data_loader: Iterable of batches; ``len(data_loader)`` defines the
            number of steps per epoch.
        optimizer: Optimizer whose learning rate is adjusted per iteration.
        accelerator: Accelerate handle used for accumulation and gathering.
        epoch: Index of the current epoch.
        loss_scaler: Called with the loss, the optimizer and the model
            parameters unless the batch result reports ``already_backprop``.
        args: Run configuration; reads ``accum_iter``, ``print_freq``,
            ``print_img_freq``, ``save_freq`` and ``amp``.
        log_writer: Optional scalar writer (``add_scalar`` / ``log_dir``).

    Returns:
        dict: meter name -> ``global_avg`` after cross-process synchronisation.

    Exits the process with status 1 when the loss is not finite.
    """
    assert torch.backends.cuda.matmul.allow_tf32

    model.train(True)
    metric_logger = misc.MetricLogger(delimiter=" ")
    metric_logger.add_meter("lr", misc.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = "Epoch: [{}]".format(epoch)
    accum_iter = args.accum_iter

    def save_model(epoch, fname, best_so_far, data_iter_step):
        # Checkpoints store the unwrapped module, not the prepared wrapper.
        unwrapped_model = accelerator.unwrap_model(model)
        misc.save_model(
            accelerator=accelerator,
            args=args,
            model_without_ddp=unwrapped_model,
            optimizer=optimizer,
            loss_scaler=loss_scaler,
            epoch=epoch,
            step=data_iter_step,
            fname=fname,
            best_so_far=best_so_far,
        )

    if log_writer is not None:
        printer.info("log_dir: {}".format(log_writer.log_dir))

    # Propagate the epoch to whichever component supports it.
    if hasattr(data_loader, "dataset") and hasattr(data_loader.dataset, "set_epoch"):
        data_loader.dataset.set_epoch(epoch)
    if (
        hasattr(data_loader, "batch_sampler")
        and hasattr(data_loader.batch_sampler, "batch_sampler")
        and hasattr(data_loader.batch_sampler.batch_sampler, "set_epoch")
    ):
        data_loader.batch_sampler.batch_sampler.set_epoch(epoch)

    optimizer.zero_grad()

    data_iter = metric_logger.log_every(data_loader, args.print_freq, accelerator, header)

    for data_iter_step, batch in enumerate(data_iter):

        with accelerator.accumulate(model):
            # change the range of the image to [0, 1]
            if isinstance(batch, dict) and "img" in batch:
                batch["img"] = (batch["img"] + 1.0) / 2.0
            elif isinstance(batch, list) and all(isinstance(v, dict) and "img" in v for v in batch):
                for view in batch:
                    view["img"] = (view["img"] + 1.0) / 2.0

            # Fractional epoch and the derived global step (computed once).
            epoch_f = epoch + data_iter_step / len(data_loader)
            step = int(epoch_f * len(data_loader))

            # we use a per iteration (instead of per epoch) lr scheduler
            if data_iter_step % accum_iter == 0:
                misc.adjust_learning_rate(optimizer, epoch_f, args)

            result = loss_of_one_batch(
                batch,
                model,
                criterion,
                accelerator,
                teacher=teacher,
                inference=False,
                symmetrize_batch=False,
                use_amp=bool(args.amp),
            )

            loss, loss_details = result["loss"]  # criterion returns two values

            loss_value = float(loss)

            if not math.isfinite(loss_value):
                print(
                    f"Loss is {loss_value}, stopping training, loss details: {loss_details}"
                )
                sys.exit(1)
            if not result.get("already_backprop", False):
                loss_scaler(
                    loss,
                    optimizer,
                    parameters=model.parameters(),
                    update_grad=True,
                    clip_grad=1.0,
                )
                optimizer.zero_grad()

            del loss

            tb_vis_img = (data_iter_step + 1) % accum_iter == 0 and (
                (step + 1) % (args.print_img_freq)
            ) == 0
            if not tb_vis_img:
                del batch
            else:
                torch.cuda.empty_cache()

            lr = optimizer.param_groups[0]["lr"]
            metric_logger.update(epoch=epoch_f)
            metric_logger.update(lr=lr)
            metric_logger.update(step=step)
            metric_logger.update(loss=loss_value, **loss_details)

            if (data_iter_step + 1) % accum_iter == 0 and (
                (data_iter_step + 1) % (accum_iter * args.print_freq)
            ) == 0:
                loss_value_reduce = accelerator.gather(
                    torch.tensor(loss_value).to(accelerator.device)
                ).mean()  # MUST BE EXECUTED BY ALL NODES

                # NOTE(review): this `continue` also skips the checkpoint
                # branch below on processes without a writer whenever a
                # logging step and a save step coincide -- confirm that
                # misc.save_model does not need every rank to participate.
                if log_writer is None:
                    continue

                # We use epoch_1000x as the x-axis in tensorboard.
                # This calibrates different curves when batch size changes.
                epoch_1000x = int(epoch_f * 1000)
                log_writer.add_scalar("train_loss", loss_value_reduce, step)
                log_writer.add_scalar("train_lr", lr, step)
                log_writer.add_scalar("train_iter", epoch_1000x, step)
                for name, val in loss_details.items():
                    # Only scalar entries can be logged with add_scalar.
                    if isinstance(val, torch.Tensor):
                        if val.ndim > 0:
                            continue
                    if isinstance(val, dict):
                        continue
                    log_writer.add_scalar("train_" + name, val, step)

            # Intra-epoch checkpoint, skipping the first and last iteration.
            if (
                data_iter_step % int(args.save_freq * len(data_loader)) == 0
                and data_iter_step != 0
                and data_iter_step != len(data_loader) - 1
            ):
                print("saving at step", data_iter_step)
                save_model(epoch - 1, "last", float("inf"), data_iter_step)

    # gather the stats from all processes
    metric_logger.synchronize_between_processes(accelerator)
    printer.info("Averaged stats: %s", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
|
| 581 |
+
|
| 582 |
+
def batch_append(original_list, new_list):
    """Append the i-th item of ``new_list`` to the i-th sub-list in place.

    Pairing stops at the shorter of the two inputs. The (mutated)
    ``original_list`` object itself is returned.
    """
    for bucket, item in zip(original_list, new_list):
        bucket.append(item)
    return original_list
|
| 586 |
+
|
| 587 |
+
|
| 588 |
+
@hydra.main(
    version_base=None,
    config_path=f"{os.path.dirname(os.path.abspath(__file__))}/../config",
    config_name="mytrain.yaml",
)
def run(cfg: OmegaConf):
    """Hydra entry point: resolve the config, create ``cfg.logdir``, then train."""
    OmegaConf.resolve(cfg)
    # Make sure the log directory exists before training starts writing to it.
    pathlib.Path(cfg.logdir).mkdir(parents=True, exist_ok=True)
    train(cfg)


if __name__ == "__main__":
    run()
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/camera_head.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
|
| 8 |
+
from streamvggt.layers import Mlp
|
| 9 |
+
from streamvggt.layers.block import Block
|
| 10 |
+
from streamvggt.heads.head_act import activate_pose
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class CameraHead(nn.Module):
    """Predict camera pose encodings by iterative refinement of the camera token.

    The head embeds the previous pose estimate (a learned "empty" pose on the
    first iteration), uses it to modulate the camera tokens (shift / scale /
    gate), runs the result through a trunk of transformer blocks and predicts
    a delta that is added to the running pose encoding.
    """

    def __init__(
        self,
        dim_in: int = 2048,
        trunk_depth: int = 4,
        pose_encoding_type: str = "absT_quaR_FoV",
        num_heads: int = 16,
        mlp_ratio: int = 4,
        init_values: float = 0.01,
        trans_act: str = "linear",
        quat_act: str = "linear",
        fl_act: str = "relu",  # Field of view activations: ensures FOV values are positive.
    ):
        """
        Args:
            dim_in: Channel width of the incoming camera tokens.
            trunk_depth: Number of transformer blocks in the trunk.
            pose_encoding_type: Only ``"absT_quaR_FoV"`` (9 values) is supported.
            num_heads: Attention heads per trunk block.
            mlp_ratio: MLP expansion ratio of each trunk block.
            init_values: Forwarded to each ``Block`` as ``init_values``.
            trans_act: Activation name for the translation part.
            quat_act: Activation name for the quaternion part.
            fl_act: Activation name for the field-of-view part.

        Raises:
            ValueError: If ``pose_encoding_type`` is not supported.
        """
        super().__init__()

        if pose_encoding_type == "absT_quaR_FoV":
            self.target_dim = 9
        else:
            raise ValueError(f"Unsupported camera encoding type: {pose_encoding_type}")

        self.trans_act = trans_act
        self.quat_act = quat_act
        self.fl_act = fl_act
        self.trunk_depth = trunk_depth

        # Build the trunk using a sequence of transformer blocks.
        self.trunk = nn.Sequential(
            *[
                Block(
                    dim=dim_in,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    init_values=init_values,
                )
                for _ in range(trunk_depth)
            ]
        )

        # Normalizations for camera token and trunk output.
        self.token_norm = nn.LayerNorm(dim_in)
        self.trunk_norm = nn.LayerNorm(dim_in)

        # Learnable empty camera pose token.
        self.empty_pose_tokens = nn.Parameter(torch.zeros(1, 1, self.target_dim))
        self.embed_pose = nn.Linear(self.target_dim, dim_in)

        # Module for producing modulation parameters: shift, scale, and a gate.
        self.poseLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(dim_in, 3 * dim_in, bias=True))

        # Adaptive layer normalization without affine parameters.
        self.adaln_norm = nn.LayerNorm(dim_in, elementwise_affine=False, eps=1e-6)
        self.pose_branch = Mlp(
            in_features=dim_in,
            hidden_features=dim_in // 2,
            out_features=self.target_dim,
            drop=0,
        )

    def forward(self, aggregated_tokens_list: list, num_iterations: int = 4, past_key_values_camera = None, use_cache: bool = False) -> list:
        """
        Forward pass to predict camera parameters.

        Args:
            aggregated_tokens_list (list): List of token tensors from the network;
                the last tensor is used for prediction.
            num_iterations (int, optional): Number of iterative refinement steps. Defaults to 4.
            past_key_values_camera (list, optional): Per-trunk-block key/value cache,
                only used when ``use_cache`` is true. ``None`` starts an empty cache.
            use_cache (bool, optional): Run the trunk with a key/value cache instead
                of a causal attention mask. Defaults to False.

        Returns:
            list: Predicted camera encodings (post-activation), one per iteration.
            When ``use_cache`` is true, a tuple ``(list, updated_cache)`` is
            returned instead.
        """
        # Use tokens from the last block for camera prediction.
        tokens = aggregated_tokens_list[-1]

        # Extract and normalize the camera tokens (token index 0 of every frame).
        pose_tokens = self.token_norm(tokens[:, :, 0])

        if use_cache:
            return self.trunk_fn(pose_tokens, num_iterations, past_key_values_camera, use_cache)
        # Without caching any supplied cache is deliberately ignored.
        return self.trunk_fn(pose_tokens, num_iterations, past_key_values_camera=None, use_cache=use_cache)

    def trunk_fn(self, pose_tokens: torch.Tensor, num_iterations: int, past_key_values_camera, use_cache: bool) -> list:
        """
        Iteratively refine camera pose predictions.

        Args:
            pose_tokens (torch.Tensor): Normalized camera tokens with shape [B, S, C].
            num_iterations (int): Number of refinement iterations.
            past_key_values_camera (list or None): One cache entry per trunk block
                (entries may be ``None``); updated in place when ``use_cache`` is true.
            use_cache (bool): Whether the trunk blocks run with a key/value cache.

        Returns:
            list: List of activated camera encodings from each iteration; with
            ``use_cache`` a tuple ``(list, past_key_values_camera)``.
        """
        B, S, C = pose_tokens.shape
        pred_pose_enc = None
        pred_pose_enc_list = []

        # A missing cache means "nothing cached yet": one empty slot per block.
        # Previously this case failed with a TypeError when indexing None.
        if use_cache and past_key_values_camera is None:
            past_key_values_camera = [None] * self.trunk_depth

        for _ in range(num_iterations):
            # Use a learned empty pose for the first iteration.
            if pred_pose_enc is None:
                module_input = self.embed_pose(self.empty_pose_tokens.expand(B, S, -1))
            else:
                # Detach the previous prediction to avoid backprop through time.
                pred_pose_enc = pred_pose_enc.detach()
                module_input = self.embed_pose(pred_pose_enc)

            # Generate modulation parameters and split them into shift, scale, and gate components.
            shift_msa, scale_msa, gate_msa = self.poseLN_modulation(module_input).chunk(3, dim=-1)

            # Adaptive layer normalization and modulation.
            pose_tokens_modulated = gate_msa * modulate(self.adaln_norm(pose_tokens), shift_msa, scale_msa)
            pose_tokens_modulated = pose_tokens_modulated + pose_tokens

            if use_cache:
                attn_mask = None
            else:
                # Additive causal mask over the S positions: entry (i, j) is the
                # most negative finite value when j > i (a later frame), else 0.
                frame_ids = torch.arange(S, device=pose_tokens_modulated.device)
                future_frame = frame_ids.unsqueeze(1) < frame_ids.unsqueeze(0)
                attn_mask = future_frame.to(pose_tokens_modulated.dtype) * torch.finfo(pose_tokens_modulated.dtype).min

            if use_cache:
                for idx in range(self.trunk_depth):
                    pose_tokens_modulated, block_kv = self.trunk[idx](
                        pose_tokens_modulated,
                        attn_mask=attn_mask,
                        past_key_values=past_key_values_camera[idx],
                        use_cache=True
                    )
                    past_key_values_camera[idx] = block_kv
            else:
                for idx in range(self.trunk_depth):
                    pose_tokens_modulated = self.trunk[idx](pose_tokens_modulated, attn_mask=attn_mask)

            # Compute the delta update for the pose encoding.
            pred_pose_enc_delta = self.pose_branch(self.trunk_norm(pose_tokens_modulated))

            if pred_pose_enc is None:
                pred_pose_enc = pred_pose_enc_delta
            else:
                pred_pose_enc = pred_pose_enc + pred_pose_enc_delta

            # Apply final activation functions for translation, quaternion, and field-of-view.
            activated_pose = activate_pose(
                pred_pose_enc,
                trans_act=self.trans_act,
                quat_act=self.quat_act,
                fl_act=self.fl_act,
            )
            pred_pose_enc_list.append(activated_pose)

        if use_cache:
            return pred_pose_enc_list, past_key_values_camera
        return pred_pose_enc_list
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """
    Scale ``x`` by ``1 + scale`` and then add ``shift``.
    """
    scaled = x * (1 + scale)
    return scaled + shift
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/dpt_head.py
ADDED
|
@@ -0,0 +1,471 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import List, Dict, Tuple, Union
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
from .head_act import activate_head
|
| 8 |
+
from .utils import create_uv_grid, position_grid_to_embed
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DPTHead(nn.Module):
|
| 12 |
+
"""
|
| 13 |
+
Args:
|
| 14 |
+
dim_in (int): Input dimension (channels).
|
| 15 |
+
patch_size (int, optional): Patch size. Default is 14.
|
| 16 |
+
output_dim (int, optional): Number of output channels. Default is 4.
|
| 17 |
+
activation (str, optional): Activation type. Default is "inv_log".
|
| 18 |
+
conf_activation (str, optional): Confidence activation type. Default is "expp1".
|
| 19 |
+
features (int, optional): Feature channels for intermediate representations. Default is 256.
|
| 20 |
+
out_channels (List[int], optional): Output channels for each intermediate layer.
|
| 21 |
+
intermediate_layer_idx (List[int], optional): Indices of layers from aggregated tokens used for DPT.
|
| 22 |
+
pos_embed (bool, optional): Whether to use positional embedding. Default is True.
|
| 23 |
+
feature_only (bool, optional): If True, return features only without the last several layers and activation head. Default is False.
|
| 24 |
+
down_ratio (int, optional): Downscaling factor for the output resolution. Default is 1.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
def __init__(
|
| 28 |
+
self,
|
| 29 |
+
dim_in: int,
|
| 30 |
+
patch_size: int = 14,
|
| 31 |
+
output_dim: int = 4,
|
| 32 |
+
activation: str = "inv_log",
|
| 33 |
+
conf_activation: str = "expp1",
|
| 34 |
+
features: int = 256,
|
| 35 |
+
out_channels: List[int] = [256, 512, 1024, 1024],
|
| 36 |
+
intermediate_layer_idx: List[int] = [4, 11, 17, 23],
|
| 37 |
+
pos_embed: bool = True,
|
| 38 |
+
feature_only: bool = False,
|
| 39 |
+
down_ratio: int = 1,
|
| 40 |
+
) -> None:
|
| 41 |
+
super(DPTHead, self).__init__()
|
| 42 |
+
self.patch_size = patch_size
|
| 43 |
+
self.activation = activation
|
| 44 |
+
self.conf_activation = conf_activation
|
| 45 |
+
self.pos_embed = pos_embed
|
| 46 |
+
self.feature_only = feature_only
|
| 47 |
+
self.down_ratio = down_ratio
|
| 48 |
+
self.intermediate_layer_idx = intermediate_layer_idx
|
| 49 |
+
|
| 50 |
+
self.norm = nn.LayerNorm(dim_in)
|
| 51 |
+
|
| 52 |
+
# Projection layers for each output channel from tokens.
|
| 53 |
+
self.projects = nn.ModuleList(
|
| 54 |
+
[
|
| 55 |
+
nn.Conv2d(
|
| 56 |
+
in_channels=dim_in,
|
| 57 |
+
out_channels=oc,
|
| 58 |
+
kernel_size=1,
|
| 59 |
+
stride=1,
|
| 60 |
+
padding=0,
|
| 61 |
+
)
|
| 62 |
+
for oc in out_channels
|
| 63 |
+
]
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
# Resize layers for upsampling feature maps.
|
| 67 |
+
self.resize_layers = nn.ModuleList(
|
| 68 |
+
[
|
| 69 |
+
nn.ConvTranspose2d(
|
| 70 |
+
in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0
|
| 71 |
+
),
|
| 72 |
+
nn.ConvTranspose2d(
|
| 73 |
+
in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0
|
| 74 |
+
),
|
| 75 |
+
nn.Identity(),
|
| 76 |
+
nn.Conv2d(
|
| 77 |
+
in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1
|
| 78 |
+
),
|
| 79 |
+
]
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
self.scratch = _make_scratch(
|
| 83 |
+
out_channels,
|
| 84 |
+
features,
|
| 85 |
+
expand=False,
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
# Attach additional modules to scratch.
|
| 89 |
+
self.scratch.stem_transpose = None
|
| 90 |
+
self.scratch.refinenet1 = _make_fusion_block(features)
|
| 91 |
+
self.scratch.refinenet2 = _make_fusion_block(features)
|
| 92 |
+
self.scratch.refinenet3 = _make_fusion_block(features)
|
| 93 |
+
self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False)
|
| 94 |
+
|
| 95 |
+
head_features_1 = features
|
| 96 |
+
head_features_2 = 32
|
| 97 |
+
|
| 98 |
+
if feature_only:
|
| 99 |
+
self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1, kernel_size=3, stride=1, padding=1)
|
| 100 |
+
else:
|
| 101 |
+
self.scratch.output_conv1 = nn.Conv2d(
|
| 102 |
+
head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1
|
| 103 |
+
)
|
| 104 |
+
conv2_in_channels = head_features_1 // 2
|
| 105 |
+
|
| 106 |
+
self.scratch.output_conv2 = nn.Sequential(
|
| 107 |
+
nn.Conv2d(conv2_in_channels, head_features_2, kernel_size=3, stride=1, padding=1),
|
| 108 |
+
nn.ReLU(inplace=True),
|
| 109 |
+
nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
+
def forward(
|
| 113 |
+
self,
|
| 114 |
+
aggregated_tokens_list: List[torch.Tensor],
|
| 115 |
+
images: torch.Tensor,
|
| 116 |
+
patch_start_idx: int,
|
| 117 |
+
frames_chunk_size: int = 8,
|
| 118 |
+
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
|
| 119 |
+
"""
|
| 120 |
+
Forward pass through the DPT head, supports processing by chunking frames.
|
| 121 |
+
Args:
|
| 122 |
+
aggregated_tokens_list (List[Tensor]): List of token tensors from different transformer layers.
|
| 123 |
+
images (Tensor): Input images with shape [B, S, 3, H, W], in range [0, 1].
|
| 124 |
+
patch_start_idx (int): Starting index for patch tokens in the token sequence.
|
| 125 |
+
Used to separate patch tokens from other tokens (e.g., camera or register tokens).
|
| 126 |
+
frames_chunk_size (int, optional): Number of frames to process in each chunk.
|
| 127 |
+
If None or larger than S, all frames are processed at once. Default: 8.
|
| 128 |
+
|
| 129 |
+
Returns:
|
| 130 |
+
Tensor or Tuple[Tensor, Tensor]:
|
| 131 |
+
- If feature_only=True: Feature maps with shape [B, S, C, H, W]
|
| 132 |
+
- Otherwise: Tuple of (predictions, confidence) both with shape [B, S, 1, H, W]
|
| 133 |
+
"""
|
| 134 |
+
B, S, _, H, W = images.shape
|
| 135 |
+
|
| 136 |
+
# If frames_chunk_size is not specified or greater than S, process all frames at once
|
| 137 |
+
if frames_chunk_size is None or frames_chunk_size >= S:
|
| 138 |
+
return self._forward_impl(aggregated_tokens_list, images, patch_start_idx)
|
| 139 |
+
|
| 140 |
+
# Otherwise, process frames in chunks to manage memory usage
|
| 141 |
+
assert frames_chunk_size > 0
|
| 142 |
+
|
| 143 |
+
# Process frames in batches
|
| 144 |
+
all_preds = []
|
| 145 |
+
all_conf = []
|
| 146 |
+
|
| 147 |
+
for frames_start_idx in range(0, S, frames_chunk_size):
|
| 148 |
+
frames_end_idx = min(frames_start_idx + frames_chunk_size, S)
|
| 149 |
+
|
| 150 |
+
# Process batch of frames
|
| 151 |
+
if self.feature_only:
|
| 152 |
+
chunk_output = self._forward_impl(
|
| 153 |
+
aggregated_tokens_list, images, patch_start_idx, frames_start_idx, frames_end_idx
|
| 154 |
+
)
|
| 155 |
+
all_preds.append(chunk_output)
|
| 156 |
+
else:
|
| 157 |
+
chunk_preds, chunk_conf = self._forward_impl(
|
| 158 |
+
aggregated_tokens_list, images, patch_start_idx, frames_start_idx, frames_end_idx
|
| 159 |
+
)
|
| 160 |
+
all_preds.append(chunk_preds)
|
| 161 |
+
all_conf.append(chunk_conf)
|
| 162 |
+
|
| 163 |
+
# Concatenate results along the sequence dimension
|
| 164 |
+
if self.feature_only:
|
| 165 |
+
return torch.cat(all_preds, dim=1)
|
| 166 |
+
else:
|
| 167 |
+
return torch.cat(all_preds, dim=1), torch.cat(all_conf, dim=1)
|
| 168 |
+
|
| 169 |
+
def _forward_impl(
|
| 170 |
+
self,
|
| 171 |
+
aggregated_tokens_list: List[torch.Tensor],
|
| 172 |
+
images: torch.Tensor,
|
| 173 |
+
patch_start_idx: int,
|
| 174 |
+
frames_start_idx: int = None,
|
| 175 |
+
frames_end_idx: int = None,
|
| 176 |
+
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
|
| 177 |
+
"""
|
| 178 |
+
Args:
|
| 179 |
+
aggregated_tokens_list (List[Tensor]): List of token tensors from different transformer layers.
|
| 180 |
+
images (Tensor): Input images with shape [B, S, 3, H, W].
|
| 181 |
+
patch_start_idx (int): Starting index for patch tokens.
|
| 182 |
+
frames_start_idx (int, optional): Starting index for frames to process.
|
| 183 |
+
frames_end_idx (int, optional): Ending index for frames to process.
|
| 184 |
+
|
| 185 |
+
Returns:
|
| 186 |
+
Tensor or Tuple[Tensor, Tensor]: Feature maps or (predictions, confidence).
|
| 187 |
+
"""
|
| 188 |
+
if frames_start_idx is not None and frames_end_idx is not None:
|
| 189 |
+
images = images[:, frames_start_idx:frames_end_idx].contiguous()
|
| 190 |
+
|
| 191 |
+
B, S, _, H, W = images.shape
|
| 192 |
+
|
| 193 |
+
patch_h, patch_w = H // self.patch_size, W // self.patch_size
|
| 194 |
+
|
| 195 |
+
out = []
|
| 196 |
+
dpt_idx = 0
|
| 197 |
+
|
| 198 |
+
for layer_idx in self.intermediate_layer_idx:
|
| 199 |
+
x = aggregated_tokens_list[layer_idx][:, :, patch_start_idx:]
|
| 200 |
+
|
| 201 |
+
# Select frames if processing a chunk
|
| 202 |
+
if frames_start_idx is not None and frames_end_idx is not None:
|
| 203 |
+
x = x[:, frames_start_idx:frames_end_idx]
|
| 204 |
+
|
| 205 |
+
x = x.reshape(B * S, -1, x.shape[-1])
|
| 206 |
+
|
| 207 |
+
x = self.norm(x)
|
| 208 |
+
x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
|
| 209 |
+
|
| 210 |
+
x = self.projects[dpt_idx](x)
|
| 211 |
+
if self.pos_embed:
|
| 212 |
+
x = self._apply_pos_embed(x, W, H)
|
| 213 |
+
x = self.resize_layers[dpt_idx](x)
|
| 214 |
+
|
| 215 |
+
out.append(x)
|
| 216 |
+
dpt_idx += 1
|
| 217 |
+
|
| 218 |
+
# Fuse features from multiple layers.
|
| 219 |
+
out = self.scratch_forward(out)
|
| 220 |
+
# Interpolate fused output to match target image resolution.
|
| 221 |
+
out = custom_interpolate(
|
| 222 |
+
out,
|
| 223 |
+
(int(patch_h * self.patch_size / self.down_ratio), int(patch_w * self.patch_size / self.down_ratio)),
|
| 224 |
+
mode="bilinear",
|
| 225 |
+
align_corners=True,
|
| 226 |
+
)
|
| 227 |
+
|
| 228 |
+
if self.pos_embed:
|
| 229 |
+
out = self._apply_pos_embed(out, W, H)
|
| 230 |
+
|
| 231 |
+
if self.feature_only:
|
| 232 |
+
return out.reshape(B, S, *out.shape[1:])
|
| 233 |
+
|
| 234 |
+
out = self.scratch.output_conv2(out)
|
| 235 |
+
preds, conf = activate_head(out, activation=self.activation, conf_activation=self.conf_activation)
|
| 236 |
+
|
| 237 |
+
preds = preds.reshape(B, S, *preds.shape[1:])
|
| 238 |
+
conf = conf.reshape(B, S, *conf.shape[1:])
|
| 239 |
+
return preds, conf
|
| 240 |
+
|
| 241 |
+
def _apply_pos_embed(self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
    """
    Add a scaled UV positional embedding to the feature map ``x``.

    Args:
        x (Tensor): Feature map. Dim 0 is used as the batch size, dim 1 as the
            embedding width, and the last two dims as grid height and width.
        W (int): Image width; only used for the aspect ratio ``W / H``.
        H (int): Image height; only used for the aspect ratio ``W / H``.
        ratio (float): Scale applied to the embedding before it is added.

    Returns:
        Tensor: ``x`` with the scaled embedding added.
    """
    grid_h, grid_w = x.shape[-2], x.shape[-1]
    uv_grid = create_uv_grid(grid_w, grid_h, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
    embed = position_grid_to_embed(uv_grid, x.shape[1]) * ratio
    # Move the last axis of the embedding to the front, add a leading batch
    # axis and broadcast it over the batch without copying.
    embed = embed.permute(2, 0, 1).unsqueeze(0).expand(x.shape[0], -1, -1, -1)
    return x + embed
|
| 252 |
+
|
| 253 |
+
def scratch_forward(self, features: List[torch.Tensor]) -> torch.Tensor:
    """
    Fuse four feature maps through the refinement blocks in ``self.scratch``.

    Args:
        features (List[Tensor]): Exactly four feature maps, ordered from the
            first selected layer to the last.

    Returns:
        Tensor: Output of ``self.scratch.output_conv1`` applied to the fused map.
    """
    scratch = self.scratch
    feat_1, feat_2, feat_3, feat_4 = features

    # Project every level before fusing.
    proj_1 = scratch.layer1_rn(feat_1)
    proj_2 = scratch.layer2_rn(feat_2)
    proj_3 = scratch.layer3_rn(feat_3)
    proj_4 = scratch.layer4_rn(feat_4)

    # Fuse from the last level back to the first; each step is resized to the
    # spatial size of the next level. Local references are dropped as soon as a
    # level has been consumed.
    fused = scratch.refinenet4(proj_4, size=proj_3.shape[2:])
    del proj_4, feat_4

    fused = scratch.refinenet3(fused, proj_3, size=proj_2.shape[2:])
    del proj_3, feat_3

    fused = scratch.refinenet2(fused, proj_2, size=proj_1.shape[2:])
    del proj_2, feat_2

    fused = scratch.refinenet1(fused, proj_1)
    del proj_1, feat_1

    return scratch.output_conv1(fused)
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
def _make_fusion_block(features: int, size: int = None, has_residual: bool = True, groups: int = 1) -> nn.Module:
    """
    Build a ``FeatureFusionBlock`` with this head's fixed settings.

    Args:
        features (int): Channel count of the block.
        size (int): Optional fixed output size forwarded to the block.
        has_residual (bool): Whether the block fuses a second (skip) input.
        groups (int): Convolution groups.

    Returns:
        nn.Module: The fusion block, using an in-place ReLU, no deconv, no
        batch norm, no channel expansion and ``align_corners=True``.
    """
    fixed_options = dict(deconv=False, bn=False, expand=False, align_corners=True)
    return FeatureFusionBlock(
        features,
        nn.ReLU(inplace=True),
        size=size,
        has_residual=has_residual,
        groups=groups,
        **fixed_options,
    )
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def _make_scratch(in_shape: List[int], out_shape: int, groups: int = 1, expand: bool = False) -> nn.Module:
|
| 301 |
+
scratch = nn.Module()
|
| 302 |
+
out_shape1 = out_shape
|
| 303 |
+
out_shape2 = out_shape
|
| 304 |
+
out_shape3 = out_shape
|
| 305 |
+
if len(in_shape) >= 4:
|
| 306 |
+
out_shape4 = out_shape
|
| 307 |
+
|
| 308 |
+
if expand:
|
| 309 |
+
out_shape1 = out_shape
|
| 310 |
+
out_shape2 = out_shape * 2
|
| 311 |
+
out_shape3 = out_shape * 4
|
| 312 |
+
if len(in_shape) >= 4:
|
| 313 |
+
out_shape4 = out_shape * 8
|
| 314 |
+
|
| 315 |
+
scratch.layer1_rn = nn.Conv2d(
|
| 316 |
+
in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
| 317 |
+
)
|
| 318 |
+
scratch.layer2_rn = nn.Conv2d(
|
| 319 |
+
in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
| 320 |
+
)
|
| 321 |
+
scratch.layer3_rn = nn.Conv2d(
|
| 322 |
+
in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
| 323 |
+
)
|
| 324 |
+
if len(in_shape) >= 4:
|
| 325 |
+
scratch.layer4_rn = nn.Conv2d(
|
| 326 |
+
in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
|
| 327 |
+
)
|
| 328 |
+
return scratch
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
class ResidualConvUnit(nn.Module):
    """Pre-activation residual unit: two (activation -> 3x3 conv) stages plus a skip add."""

    def __init__(self, features, activation, bn, groups=1):
        """Set up the two convolutions.

        Args:
            features (int): number of input and output channels
            activation (callable): activation applied before each convolution
            bn: stored on the instance; no normalisation layer is created here
            groups (int): convolution groups
        """
        super().__init__()

        self.bn = bn
        self.groups = groups

        conv_options = dict(kernel_size=3, stride=1, padding=1, bias=True, groups=groups)
        self.conv1 = nn.Conv2d(features, features, **conv_options)
        self.conv2 = nn.Conv2d(features, features, **conv_options)

        # Normalisation slots stay empty; forward() skips them while they are None.
        self.norm1 = None
        self.norm2 = None

        self.activation = activation
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        """Run both stages and add the input back.

        Args:
            x (tensor): input

        Returns:
            tensor: output, same shape as the input
        """
        out = x
        for conv, norm in ((self.conv1, self.norm1), (self.conv2, self.norm2)):
            out = conv(self.activation(out))
            if norm is not None:
                out = norm(out)

        # NOTE: with an in-place activation, ``x`` has already been modified by
        # the first activation call when it is added back here.
        return self.skip_add.add(out, x)
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
class FeatureFusionBlock(nn.Module):
    """Fuse a running feature map with an optional skip input, resize, then project with a 1x1 conv."""

    def __init__(
        self,
        features,
        activation,
        deconv=False,
        bn=False,
        expand=False,
        align_corners=True,
        size=None,
        has_residual=True,
        groups=1,
    ):
        """Build the block.

        Args:
            features (int): number of input channels
            activation (callable): activation handed to the residual units
            deconv (bool): stored on the instance only
            bn: forwarded to the residual units
            expand (bool): when ``True`` the output conv halves the channel count
            align_corners (bool): forwarded to the interpolation
            size: default output size used when forward() is given none
            has_residual (bool): whether a second (skip) input is fused in
            groups (int): convolution groups
        """
        super().__init__()

        self.deconv = deconv
        self.align_corners = align_corners
        self.groups = groups
        self.expand = expand
        self.size = size
        self.has_residual = has_residual

        # The explicit comparison with True is kept so that non-bool values of
        # ``expand`` behave exactly as before.
        out_features = features // 2 if self.expand == True else features  # noqa: E712

        self.out_conv = nn.Conv2d(
            features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=groups
        )

        if has_residual:
            self.resConfUnit1 = ResidualConvUnit(features, activation, bn, groups=groups)
        self.resConfUnit2 = ResidualConvUnit(features, activation, bn, groups=groups)

        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs, size=None):
        """Fuse the inputs and resize the result.

        Args:
            *xs: ``xs[0]`` is the running feature map; ``xs[1]`` is the skip
                input and is only read when the block has a residual branch.
            size: output size for this call; falls back to ``self.size`` and
                finally to a 2x upscale.

        Returns:
            tensor: output
        """
        fused = xs[0]

        if self.has_residual:
            fused = self.skip_add.add(fused, self.resConfUnit1(xs[1]))

        fused = self.resConfUnit2(fused)

        # Per-call size wins over the constructor size; with neither, upscale 2x.
        if size is not None:
            resize_args = {"size": size}
        elif self.size is not None:
            resize_args = {"size": self.size}
        else:
            resize_args = {"scale_factor": 2}

        fused = custom_interpolate(fused, **resize_args, mode="bilinear", align_corners=self.align_corners)
        return self.out_conv(fused)
|
| 444 |
+
|
| 445 |
+
|
| 446 |
+
def custom_interpolate(
    x: torch.Tensor,
    size: Tuple[int, int] = None,
    scale_factor: float = None,
    mode: str = "bilinear",
    align_corners: bool = True,
) -> torch.Tensor:
    """
    Custom interpolate to avoid INT_MAX issues in nn.functional.interpolate.

    Args:
        x (Tensor): Input whose last two dims are resized; dims 0 and 1 are
            used as batch and channel counts for the size estimate.
        size (Tuple[int, int]): Target (height, width). Takes precedence over
            ``scale_factor``.
        scale_factor (float): Multiplier applied to the last two dims when
            ``size`` is not given.
        mode (str): Interpolation mode forwarded to ``interpolate``.
        align_corners (bool): Forwarded to ``interpolate``.

    Returns:
        Tensor: The resized tensor.

    Raises:
        ValueError: If neither ``size`` nor ``scale_factor`` is given.
    """
    if size is None:
        if scale_factor is None:
            raise ValueError("custom_interpolate requires either `size` or `scale_factor`")
        size = (int(x.shape[-2] * scale_factor), int(x.shape[-1] * scale_factor))

    # Element budget per interpolate call (1.5 * 2**30).
    INT_MAX = 1610612736

    # Number of elements the *output* would have if produced in one call.
    output_elements = size[0] * size[1] * x.shape[0] * x.shape[1]

    if output_elements <= INT_MAX:
        return nn.functional.interpolate(x, size=size, mode=mode, align_corners=align_corners)

    # Too large for one call: split along the batch dim, resize each piece and
    # stitch the results back together.
    chunks = torch.chunk(x, chunks=(output_elements // INT_MAX) + 1, dim=0)
    interpolated_chunks = [
        nn.functional.interpolate(chunk, size=size, mode=mode, align_corners=align_corners) for chunk in chunks
    ]
    return torch.cat(interpolated_chunks, dim=0).contiguous()
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/head_act.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def activate_pose(pred_pose_enc, trans_act="linear", quat_act="linear", fl_act="linear"):
    """
    Apply a separate activation to each segment of an encoded pose.

    Args:
        pred_pose_enc: Tensor whose last dim is laid out as
            [translation (0:3), quaternion (3:7), focal length or fov (7:)]
        trans_act: Activation type for the translation segment
        quat_act: Activation type for the quaternion segment
        fl_act: Activation type for the focal length (or fov) segment

    Returns:
        Tensor with the three activated segments concatenated on the last dim
    """
    segments = (
        (pred_pose_enc[..., :3], trans_act),
        (pred_pose_enc[..., 3:7], quat_act),
        (pred_pose_enc[..., 7:], fl_act),  # or fov
    )
    return torch.cat([base_pose_act(segment, act) for segment, act in segments], dim=-1)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def base_pose_act(pose_enc, act_type="linear"):
    """
    Apply one named activation to pose parameters.

    Args:
        pose_enc: Tensor containing encoded pose parameters
        act_type: One of "linear" (identity), "inv_log", "exp", "relu"

    Returns:
        Activated pose parameters

    Raises:
        ValueError: If ``act_type`` is not one of the names above
    """
    if act_type == "linear":
        return pose_enc
    if act_type == "inv_log":
        return inverse_log_transform(pose_enc)
    if act_type == "exp":
        return torch.exp(pose_enc)
    if act_type == "relu":
        return F.relu(pose_enc)
    raise ValueError(f"Unknown act_type: {act_type}")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def activate_head(out, activation="norm_exp", conf_activation="expp1"):
    """
    Process network output to extract 3D points and confidence values.

    Args:
        out: Network output tensor (B, C, H, W); the last channel is the raw
            confidence, the first C-1 channels are the point values
        activation: Activation type for 3D points. One of "norm_exp", "norm",
            "exp", "relu", "inv_log", "xy_inv_log" (requires C-1 == 3),
            "sigmoid", "linear"
        conf_activation: Activation type for confidence values. One of
            "expp1", "expp0", "sigmoid"

    Returns:
        Tuple of (points tensor (B, H, W, C-1), confidence tensor (B, H, W))

    Raises:
        ValueError: If either activation name is unknown
    """
    # Move channels from dim 1 to the last dim => (B, H, W, C)
    fmap = out.permute(0, 2, 3, 1)

    # Split into xyz (first C-1 channels) and confidence (last channel)
    xyz = fmap[:, :, :, :-1]
    conf = fmap[:, :, :, -1]

    if activation == "norm_exp":
        d = xyz.norm(dim=-1, keepdim=True).clamp(min=1e-8)
        xyz_normed = xyz / d
        pts3d = xyz_normed * torch.expm1(d)
    elif activation == "norm":
        # Clamp like "norm_exp" does, so an all-zero vector maps to zeros
        # instead of NaN (0 / 0).
        pts3d = xyz / xyz.norm(dim=-1, keepdim=True).clamp(min=1e-8)
    elif activation == "exp":
        pts3d = torch.exp(xyz)
    elif activation == "relu":
        pts3d = F.relu(xyz)
    elif activation == "inv_log":
        pts3d = inverse_log_transform(xyz)
    elif activation == "xy_inv_log":
        # z goes through the inverse log transform and then scales xy.
        xy, z = xyz.split([2, 1], dim=-1)
        z = inverse_log_transform(z)
        pts3d = torch.cat([xy * z, z], dim=-1)
    elif activation == "sigmoid":
        pts3d = torch.sigmoid(xyz)
    elif activation == "linear":
        pts3d = xyz
    else:
        raise ValueError(f"Unknown activation: {activation}")

    if conf_activation == "expp1":
        conf_out = 1 + conf.exp()
    elif conf_activation == "expp0":
        conf_out = conf.exp()
    elif conf_activation == "sigmoid":
        conf_out = torch.sigmoid(conf)
    else:
        raise ValueError(f"Unknown conf_activation: {conf_activation}")

    return pts3d, conf_out
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def inverse_log_transform(y):
    """
    Sign-preserving inverse log transform: sign(y) * (exp(|y|) - 1)

    Args:
        y: Input tensor

    Returns:
        Tensor of the same shape; zero maps to zero
    """
    magnitude = torch.expm1(torch.abs(y))
    return torch.sign(y) * magnitude
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_head.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
from .dpt_head import DPTHead
|
| 3 |
+
from .track_modules.base_track_predictor import BaseTrackerPredictor
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class TrackHead(nn.Module):
    """
    Tracking head: a DPT head turns backbone tokens into per-frame feature maps,
    and a BaseTrackerPredictor iteratively refines point tracks on those maps.
    """

    def __init__(
        self,
        dim_in,
        patch_size=14,
        features=128,
        iters=4,
        predict_conf=True,
        stride=2,
        corr_levels=7,
        corr_radius=4,
        hidden_size=384,
    ):
        """
        Build the feature extractor and the tracker.

        Args:
            dim_in (int): Input dimension of tokens from the backbone.
            patch_size (int): Patch size, forwarded to the DPT head.
            features (int): Channel count of the extracted feature maps; also
                used as the tracker's latent dimension.
            iters (int): Default number of refinement iterations.
            predict_conf (bool): Whether the tracker also predicts confidence.
            stride (int): Stride value for the tracker predictor.
            corr_levels (int): Number of correlation pyramid levels.
            corr_radius (int): Radius for correlation sampling.
            hidden_size (int): Hidden size of the tracker network.
        """
        super().__init__()

        self.patch_size = patch_size

        # DPT head in feature-only mode (no prediction activations), with
        # down_ratio=2 and without positional embedding.
        extractor_config = dict(
            dim_in=dim_in,
            patch_size=patch_size,
            features=features,
            feature_only=True,
            down_ratio=2,
            pos_embed=False,
        )
        self.feature_extractor = DPTHead(**extractor_config)

        # The tracker's latent dimension must match the extractor's channels.
        tracker_config = dict(
            latent_dim=features,
            predict_conf=predict_conf,
            stride=stride,
            corr_levels=corr_levels,
            corr_radius=corr_radius,
            hidden_size=hidden_size,
        )
        self.tracker = BaseTrackerPredictor(**tracker_config)

        self.iters = iters

    def forward(self, aggregated_tokens_list, images, patch_start_idx, query_points=None, iters=None):
        """
        Extract feature maps and track the query points through them.

        Args:
            aggregated_tokens_list (list): List of aggregated tokens from the backbone.
            images (torch.Tensor): Input images of shape (B, S, C, H, W) where
                B = batch size, S = sequence length.
            patch_start_idx (int): Starting index for patch tokens.
            query_points (torch.Tensor, optional): Points to track; passed to
                the tracker unchanged.
                NOTE(review): the tracker shown in this file reads
                ``query_points.shape``, so None appears unsupported - confirm.
            iters (int, optional): Number of refinement iterations. If None, uses self.iters.

        Returns:
            tuple:
                - coord_preds: Predicted coordinates for tracked points.
                - vis_scores: Visibility scores for tracked points.
                - conf_scores: Confidence scores for tracked points (if predict_conf=True).
        """
        # The values are unused, but unpacking rejects inputs that are not 5-D.
        _batch, _frames, _channels, _height, _width = images.shape

        feature_maps = self.feature_extractor(aggregated_tokens_list, images, patch_start_idx)

        num_iters = self.iters if iters is None else iters

        coord_preds, vis_scores, conf_scores = self.tracker(
            query_points=query_points,
            fmaps=feature_maps,
            iters=num_iters,
        )
        return coord_preds, vis_scores, conf_scores
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_modules/__init__.py
ADDED
|
File without changes
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_modules/base_track_predictor.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from einops import rearrange, repeat
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
from .blocks import EfficientUpdateFormer, CorrBlock
|
| 7 |
+
from .utils import sample_features4d, get_2d_embedding, get_2d_sincos_pos_embed
|
| 8 |
+
from .modules import Mlp
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BaseTrackerPredictor(nn.Module):
    """Iterative point tracker operating on per-frame feature maps."""

    def __init__(
        self,
        stride=1,
        corr_levels=5,
        corr_radius=4,
        latent_dim=128,
        hidden_size=384,
        use_spaceatt=True,
        depth=6,
        max_scale=518,
        predict_conf=True,
    ):
        """
        Args:
            stride: factor by which query points are divided on input and
                predicted coordinates multiplied on output
            corr_levels: number of correlation pyramid levels
            corr_radius: correlation sampling radius
            latent_dim: channel count of the feature maps and track features
            hidden_size: hidden width of the MLP and the update transformer
            use_spaceatt: whether the update transformer uses space attention
            depth: number of time blocks (and space blocks when enabled)
            max_scale: divisor used to normalise raw flows
            predict_conf: whether to create and use a confidence predictor
        """
        super().__init__()
        self.stride = stride
        self.latent_dim = latent_dim
        self.corr_levels = corr_levels
        self.corr_radius = corr_radius
        self.hidden_size = hidden_size
        self.max_scale = max_scale
        self.predict_conf = predict_conf

        self.flows_emb_dim = latent_dim // 2

        self.corr_mlp = Mlp(
            in_features=self.corr_levels * (self.corr_radius * 2 + 1) ** 2,
            hidden_features=self.hidden_size,
            out_features=self.latent_dim,
        )

        # Width of the concatenated [flow embedding, correlation, track feature]
        # input built in forward().
        self.transformer_dim = self.latent_dim + self.latent_dim + self.latent_dim + 4

        # Index 0 is added to frame 0, index 1 to every other frame.
        self.query_ref_token = nn.Parameter(torch.randn(1, 2, self.transformer_dim))

        space_depth = depth if use_spaceatt else 0
        time_depth = depth

        self.updateformer = EfficientUpdateFormer(
            space_depth=space_depth,
            time_depth=time_depth,
            input_dim=self.transformer_dim,
            hidden_size=self.hidden_size,
            output_dim=self.latent_dim + 2,
            mlp_ratio=4.0,
            add_space_attn=use_spaceatt,
        )

        self.fmap_norm = nn.LayerNorm(self.latent_dim)
        self.ffeat_norm = nn.GroupNorm(1, self.latent_dim)

        # A linear layer to update track feats at each iteration
        self.ffeat_updater = nn.Sequential(nn.Linear(self.latent_dim, self.latent_dim), nn.GELU())

        self.vis_predictor = nn.Sequential(nn.Linear(self.latent_dim, 1))

        if predict_conf:
            self.conf_predictor = nn.Sequential(nn.Linear(self.latent_dim, 1))

    def forward(self, query_points, fmaps=None, iters=6, return_feat=False, down_ratio=1, apply_sigmoid=True):
        """
        Args:
            query_points: B x N x 2, the number of batches, tracks, and xy.
                Required; its shape is read immediately.
            fmaps: B x S x C x HH x WW, the number of batches, frames, and feature dimension.
                note HH and WW is the size of feature maps instead of original images.
                Required despite the None default.
            iters: number of refinement iterations
            return_feat: also return the track features and query features
            down_ratio: extra downsampling factor between images and fmaps
            apply_sigmoid: apply a sigmoid to visibility / confidence outputs

        Returns:
            (coord_preds, vis_e, conf_e), or
            (coord_preds, vis_e, track_feats, query_track_feat, conf_e) when
            ``return_feat`` is true. ``coord_preds`` is a list with one
            B x S x N x 2 tensor per iteration; ``conf_e`` is None when
            confidence prediction is disabled.
        """
        B, N, D = query_points.shape
        B, S, C, HH, WW = fmaps.shape

        assert D == 2, "Input points must be 2D coordinates"

        # LayerNorm acts on the last dim: move channels last, normalise, move back.
        fmaps = self.fmap_norm(fmaps.permute(0, 1, 3, 4, 2))
        fmaps = fmaps.permute(0, 1, 4, 2, 3)

        # Scale the input query_points because we may downsample the images
        # by down_ratio or self.stride
        # e.g., if a 3x1024x1024 image is processed to a 128x256x256 feature map
        # its query_points should be query_points/4
        if down_ratio > 1:
            query_points = query_points / float(down_ratio)
        query_points = query_points / float(self.stride)

        # Every frame starts its search at the query position.
        coords = query_points.clone().reshape(B, 1, N, 2).repeat(1, S, 1, 1)

        # Features of the query points in the query frame (frame 0) initialise
        # the track features of every frame.
        query_track_feat = sample_features4d(fmaps[:, 0], coords[:, 0])
        track_feats = query_track_feat.unsqueeze(1).repeat(1, S, 1, 1)  # B, S, N, C

        # back up the init coords
        coords_backup = coords.clone()

        fcorr_fn = CorrBlock(fmaps, num_levels=self.corr_levels, radius=self.corr_radius)

        # Loop-invariant terms, built once instead of once per iteration.
        # Frame-0 coordinates are reset to coords_backup after every update and
        # coords are detached at the top of every iteration, so the positional
        # embedding sampled at them never changes.
        query_coords = coords_backup[:, 0].detach()
        pos_embed = get_2d_sincos_pos_embed(self.transformer_dim, grid_size=(HH, WW)).to(query_points.device)
        sampled_pos_emb = sample_features4d(pos_embed.expand(B, -1, -1, -1), query_coords)
        sampled_pos_emb = rearrange(sampled_pos_emb, "b n c -> (b n) c").unsqueeze(1)

        # Token 0 marks the query frame, token 1 every other frame.
        query_ref_token = torch.cat(
            [self.query_ref_token[:, 0:1], self.query_ref_token[:, 1:2].expand(-1, S - 1, -1)], dim=1
        )

        coord_preds = []

        # Iterative Refinement
        for _ in range(iters):
            # Detach the gradients from the last iteration
            coords = coords.detach()

            fcorrs = fcorr_fn.corr_sample(track_feats, coords)

            corr_dim = fcorrs.shape[3]
            fcorrs_ = fcorrs.permute(0, 2, 1, 3).reshape(B * N, S, corr_dim)
            fcorrs_ = self.corr_mlp(fcorrs_)

            # Movement of current coords relative to query points
            flows = (coords - coords[:, 0:1]).permute(0, 2, 1, 3).reshape(B * N, S, 2)

            flows_emb = get_2d_embedding(flows, self.flows_emb_dim, cat_coords=False)

            # The normalised raw flow is appended twice, as in the original layout.
            scaled_flows = flows / self.max_scale
            flows_emb = torch.cat([flows_emb, scaled_flows, scaled_flows], dim=-1)

            track_feats_ = track_feats.permute(0, 2, 1, 3).reshape(B * N, S, self.latent_dim)

            # Concatenate them as the input for the transformers
            transformer_input = torch.cat([flows_emb, fcorrs_, track_feats_], dim=2)

            x = transformer_input + sampled_pos_emb

            # Add the query ref token to the track feats
            x = x + query_ref_token.to(x.device).to(x.dtype)

            # B, N, S, C
            x = rearrange(x, "(b n) s d -> b n s d", b=B)

            # Compute the delta coordinates and delta track features
            delta, _ = self.updateformer(x)

            # BN, S, C
            delta = rearrange(delta, " b n s d -> (b n) s d", b=B)
            delta_coords_ = delta[:, :, :2]
            delta_feats_ = delta[:, :, 2:]

            track_feats_ = track_feats_.reshape(B * N * S, self.latent_dim)
            delta_feats_ = delta_feats_.reshape(B * N * S, self.latent_dim)

            # Update the track features (residual update)
            track_feats_ = self.ffeat_updater(self.ffeat_norm(delta_feats_)) + track_feats_

            track_feats = track_feats_.reshape(B, N, S, self.latent_dim).permute(0, 2, 1, 3)  # BxSxNxC

            # B x S x N x 2
            coords = coords + delta_coords_.reshape(B, N, S, 2).permute(0, 2, 1, 3)

            # Force coord0 as query
            # because we assume the query points should not be changed
            coords[:, 0] = coords_backup[:, 0]

            # The predicted tracks are in the original image scale
            if down_ratio > 1:
                coord_preds.append(coords * self.stride * down_ratio)
            else:
                coord_preds.append(coords * self.stride)

        # B, S, N
        vis_e = self.vis_predictor(track_feats.reshape(B * S * N, self.latent_dim)).reshape(B, S, N)
        if apply_sigmoid:
            vis_e = torch.sigmoid(vis_e)

        if self.predict_conf:
            conf_e = self.conf_predictor(track_feats.reshape(B * S * N, self.latent_dim)).reshape(B, S, N)
            if apply_sigmoid:
                conf_e = torch.sigmoid(conf_e)
        else:
            conf_e = None

        if return_feat:
            return coord_preds, vis_e, track_feats, query_track_feat, conf_e
        else:
            return coord_preds, vis_e, conf_e
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_modules/blocks.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
|
| 6 |
+
from .utils import bilinear_sampler
|
| 7 |
+
from .modules import Mlp, AttnBlock, CrossAttnBlock, ResidualBlock
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class EfficientUpdateFormer(nn.Module):
|
| 11 |
+
"""
|
| 12 |
+
Transformer model that updates track estimates.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
def __init__(
|
| 16 |
+
self,
|
| 17 |
+
space_depth=6,
|
| 18 |
+
time_depth=6,
|
| 19 |
+
input_dim=320,
|
| 20 |
+
hidden_size=384,
|
| 21 |
+
num_heads=8,
|
| 22 |
+
output_dim=130,
|
| 23 |
+
mlp_ratio=4.0,
|
| 24 |
+
add_space_attn=True,
|
| 25 |
+
num_virtual_tracks=64,
|
| 26 |
+
):
|
| 27 |
+
super().__init__()
|
| 28 |
+
|
| 29 |
+
self.out_channels = 2
|
| 30 |
+
self.num_heads = num_heads
|
| 31 |
+
self.hidden_size = hidden_size
|
| 32 |
+
self.add_space_attn = add_space_attn
|
| 33 |
+
|
| 34 |
+
# Add input LayerNorm before linear projection
|
| 35 |
+
self.input_norm = nn.LayerNorm(input_dim)
|
| 36 |
+
self.input_transform = torch.nn.Linear(input_dim, hidden_size, bias=True)
|
| 37 |
+
|
| 38 |
+
# Add output LayerNorm before final projection
|
| 39 |
+
self.output_norm = nn.LayerNorm(hidden_size)
|
| 40 |
+
self.flow_head = torch.nn.Linear(hidden_size, output_dim, bias=True)
|
| 41 |
+
self.num_virtual_tracks = num_virtual_tracks
|
| 42 |
+
|
| 43 |
+
if self.add_space_attn:
|
| 44 |
+
self.virual_tracks = nn.Parameter(torch.randn(1, num_virtual_tracks, 1, hidden_size))
|
| 45 |
+
else:
|
| 46 |
+
self.virual_tracks = None
|
| 47 |
+
|
| 48 |
+
self.time_blocks = nn.ModuleList(
|
| 49 |
+
[
|
| 50 |
+
AttnBlock(
|
| 51 |
+
hidden_size,
|
| 52 |
+
num_heads,
|
| 53 |
+
mlp_ratio=mlp_ratio,
|
| 54 |
+
attn_class=nn.MultiheadAttention,
|
| 55 |
+
)
|
| 56 |
+
for _ in range(time_depth)
|
| 57 |
+
]
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
if add_space_attn:
|
| 61 |
+
self.space_virtual_blocks = nn.ModuleList(
|
| 62 |
+
[
|
| 63 |
+
AttnBlock(
|
| 64 |
+
hidden_size,
|
| 65 |
+
num_heads,
|
| 66 |
+
mlp_ratio=mlp_ratio,
|
| 67 |
+
attn_class=nn.MultiheadAttention,
|
| 68 |
+
)
|
| 69 |
+
for _ in range(space_depth)
|
| 70 |
+
]
|
| 71 |
+
)
|
| 72 |
+
self.space_point2virtual_blocks = nn.ModuleList(
|
| 73 |
+
[CrossAttnBlock(hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(space_depth)]
|
| 74 |
+
)
|
| 75 |
+
self.space_virtual2point_blocks = nn.ModuleList(
|
| 76 |
+
[CrossAttnBlock(hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(space_depth)]
|
| 77 |
+
)
|
| 78 |
+
assert len(self.time_blocks) >= len(self.space_virtual2point_blocks)
|
| 79 |
+
self.initialize_weights()
|
| 80 |
+
|
| 81 |
+
def initialize_weights(self):
    """Initialise the linear layers of this module.

    Every ``nn.Linear`` reached by ``self.apply`` gets Xavier-uniform weights
    and, when it has one, a zero bias.  ``self.flow_head.weight`` is redrawn
    from a truncated normal with ``std=0.001`` at the end of every callback
    invocation, i.e. always after the Xavier step of that same invocation, so
    the flow head finishes with the small initialisation.
    """

    # NOTE(review): indentation is lost in this view; the trunc_normal_ call is
    # kept inside the callback, following the statement order shown -- confirm.
    def _init_module(submodule):
        if isinstance(submodule, nn.Linear):
            torch.nn.init.xavier_uniform_(submodule.weight)
            bias = submodule.bias
            if bias is not None:
                nn.init.constant_(bias, 0)
        # Executed for every visited module, not only for Linear layers.
        torch.nn.init.trunc_normal_(self.flow_head.weight, std=0.001)

    self.apply(_init_module)
|
| 90 |
+
|
| 91 |
+
def forward(self, input_tensor, mask=None):
    """Run alternating temporal / spatial attention and predict the flow.

    Args:
        input_tensor: 4-D tensor; after ``input_transform`` it is unpacked as
            ``(B, N, T, C)`` (the inline comments below call these batch,
            tracks, time and channels).
        mask: forwarded as ``mask=`` to the two cross-attention block lists;
            the temporal blocks and virtual self-attention blocks do not
            receive it.

    Returns:
        ``(flow, None)`` where ``flow`` is ``flow_head`` applied to the
        layer-normalised sum of the transformed tokens and the initial tokens.
    """
    # Apply input LayerNorm, then project to the hidden size.
    tokens = self.input_transform(self.input_norm(input_tensor))
    init_tokens = tokens

    B, _, T, _ = tokens.shape

    if self.add_space_attn:
        # Append the learned virtual tracks along the track dimension.
        virtual_tokens = self.virual_tracks.repeat(B, 1, T, 1)
        tokens = torch.cat([tokens, virtual_tokens], dim=1)
        num_space_blocks = len(self.space_virtual_blocks)
        # A spatial update is interleaved every `space_stride` temporal blocks.
        space_stride = len(self.time_blocks) // num_space_blocks if num_space_blocks else 0
    else:
        num_space_blocks = 0
        space_stride = 0

    _, N, _, _ = tokens.shape

    j = 0
    for i, time_block in enumerate(self.time_blocks):
        time_tokens = tokens.contiguous().view(B * N, T, -1)  # B N T C -> (B N) T C
        time_tokens = time_block(time_tokens)
        tokens = time_tokens.view(B, N, T, -1)  # (B N) T C -> B N T C

        # `j < num_space_blocks` prevents running past the last spatial block
        # when the number of temporal blocks is not a multiple of the number
        # of spatial blocks (e.g. 7 and 3 would otherwise request a 4th block).
        if space_stride and j < num_space_blocks and i % space_stride == 0:
            space_tokens = tokens.permute(0, 2, 1, 3).contiguous().view(B * T, N, -1)  # B N T C -> (B T) N C
            point_tokens = space_tokens[:, : N - self.num_virtual_tracks]
            virtual_tokens = space_tokens[:, N - self.num_virtual_tracks :]

            # virtual <- points, virtual self-attention, points <- virtual
            virtual_tokens = self.space_virtual2point_blocks[j](virtual_tokens, point_tokens, mask=mask)
            virtual_tokens = self.space_virtual_blocks[j](virtual_tokens)
            point_tokens = self.space_point2virtual_blocks[j](point_tokens, virtual_tokens, mask=mask)

            space_tokens = torch.cat([point_tokens, virtual_tokens], dim=1)
            tokens = space_tokens.view(B, T, N, -1).permute(0, 2, 1, 3)  # (B T) N C -> B N T C
            j += 1

    if self.add_space_attn:
        # Drop the virtual tracks again; only the real tracks are returned.
        tokens = tokens[:, : N - self.num_virtual_tracks]

    tokens = tokens + init_tokens

    # Apply output LayerNorm before the final projection.
    tokens = self.output_norm(tokens)
    flow = self.flow_head(tokens)

    return flow, None
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
class CorrBlock:
    """Feature pyramid with on-the-fly local correlation sampling."""

    def __init__(self, fmaps, num_levels=4, radius=4, multiple_track_feats=False, padding_mode="zeros"):
        """
        Build a pyramid of feature maps from the input.

        fmaps: Tensor (B, S, C, H, W)
        num_levels: number of pyramid levels (each downsampled by factor 2)
        radius: search radius for sampling correlation
        multiple_track_feats: if True, split the target features per pyramid level
        padding_mode: passed to grid_sample / bilinear_sampler
        """
        B, S, C, H, W = fmaps.shape
        self.S, self.C, self.H, self.W = S, C, H, W
        self.num_levels = num_levels
        self.radius = radius
        self.padding_mode = padding_mode
        self.multiple_track_feats = multiple_track_feats

        # Build pyramid: level 0 is the input itself, every further level is a
        # 2x2 average pool of the previous one.
        self.fmaps_pyramid = [fmaps]
        current_fmaps = fmaps
        for _ in range(num_levels - 1):
            B, S, C, H, W = current_fmaps.shape
            # Merge batch & sequence dimensions so avg_pool2d sees a 4-D input.
            pooled = F.avg_pool2d(current_fmaps.reshape(B * S, C, H, W), kernel_size=2, stride=2)
            _, _, H_new, W_new = pooled.shape
            current_fmaps = pooled.reshape(B, S, C, H_new, W_new)
            self.fmaps_pyramid.append(current_fmaps)

        # Precompute a delta grid of shape (2r+1, 2r+1, 2) that is added to the
        # (scaled) coordinate centroids when sampling.
        r = self.radius
        dx = torch.linspace(-r, r, 2 * r + 1, device=fmaps.device, dtype=fmaps.dtype)
        dy = torch.linspace(-r, r, 2 * r + 1, device=fmaps.device, dtype=fmaps.dtype)
        # With indexing="ij" the last axis holds (dy[i], dx[j]).
        self.delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), dim=-1)

    def corr_sample(self, targets, coords):
        """
        Instead of storing the entire correlation pyramid, compute each level's
        correlation volume, sample it immediately, then discard it. This saves
        memory.

        Args:
            targets: Tensor (B, S, N, C) - features for the current targets.
            coords: Tensor (B, S, N, 2) - coordinates at full resolution.

        Returns:
            Tensor (B, S, N, L) where L = num_levels * (2*radius+1)**2
            (concatenated sampled correlations).
        """
        B, S, N, C = targets.shape

        # If you have multiple track features, split them per level.
        if self.multiple_track_feats:
            targets_split = torch.split(targets, C // self.num_levels, dim=-1)

        # The displacement window is identical for every level, so move it to
        # the coordinates' device/dtype once instead of once per level.
        window = 2 * self.radius + 1
        delta_lvl = self.delta.to(coords.device).to(coords.dtype).view(1, window, window, 2)

        out_pyramid = []
        for i, fmaps in enumerate(self.fmaps_pyramid):
            # Spatial resolution of this pyramid level.
            B, S, C, H, W = fmaps.shape
            # reshape (not view): the level-0 map is whatever the caller passed
            # in and is not guaranteed to be contiguous.
            fmap2s = fmaps.reshape(B, S, C, H * W)
            fmap1 = targets_split[i] if self.multiple_track_feats else targets  # (B, S, N, C)

            corrs = compute_corr_level(fmap1, fmap2s, C)
            corrs = corrs.view(B, S, N, H, W)

            # Scale the full-resolution coordinates down to this level and add
            # the displacement window (broadcast over the grid).
            centroid_lvl = coords.reshape(B * S * N, 1, 1, 2) / (2**i)
            coords_lvl = centroid_lvl + delta_lvl

            # One single-channel "image" per target so that the sampler acts on
            # each target's correlation map independently.
            corrs_sampled = bilinear_sampler(
                corrs.reshape(B * S * N, 1, H, W), coords_lvl, padding_mode=self.padding_mode
            )
            # (B*S*N, 1, 2r+1, 2r+1) -> (B, S, N, (2r+1)^2)
            corrs_sampled = corrs_sampled.view(B, S, N, -1)
            out_pyramid.append(corrs_sampled)

        # Concatenate all levels along the last dimension.
        return torch.cat(out_pyramid, dim=-1).contiguous()
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def compute_corr_level(fmap1, fmap2s, C):
    """Return the matrix product of ``fmap1`` and ``fmap2s`` scaled by 1/sqrt(C).

    Per the shape notes of the original code, ``fmap1`` is (B, S, N, C) and
    ``fmap2s`` is (B, S, C, H*W); the result is (B, S, N, H*W).
    """
    batch, frames, points = fmap1.shape[:3]
    raw = torch.matmul(fmap1, fmap2s)
    raw = raw.view(batch, frames, points, -1)
    return raw / math.sqrt(C)
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_modules/modules.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
from functools import partial
|
| 5 |
+
from typing import Callable
|
| 6 |
+
import collections
|
| 7 |
+
from torch import Tensor
|
| 8 |
+
from itertools import repeat
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# From PyTorch internals
|
| 12 |
+
def _ntuple(n):
|
| 13 |
+
def parse(x):
|
| 14 |
+
if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
|
| 15 |
+
return tuple(x)
|
| 16 |
+
return tuple(repeat(x, n))
|
| 17 |
+
|
| 18 |
+
return parse
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def exists(val):
    """Return True unless ``val`` is ``None``."""
    return not (val is None)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def default(val, d):
    """Return ``val`` when it is not ``None``, otherwise the fallback ``d``."""
    if exists(val):
        return val
    return d
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
to_2tuple = _ntuple(2)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class ResidualBlock(nn.Module):
    """
    ResidualBlock: two conv + norm + ReLU layers with a residual connection.

    Args:
        in_planes: number of input channels.
        planes: number of output channels.
        norm_fn: one of "group", "batch", "instance" or "none".
        stride: stride of the first convolution; any value other than 1 adds
            a 1x1 strided convolution plus norm on the shortcut path.
        kernel_size: kernel size of both convolutions (odd sizes keep the
            main and shortcut paths the same spatial size).

    Raises:
        NotImplementedError: if ``norm_fn`` is not one of the supported names.
    """

    def __init__(self, in_planes, planes, norm_fn="group", stride=1, kernel_size=3):
        super(ResidualBlock, self).__init__()

        # "Same" padding for odd kernels.  The previous hard-coded padding of 1
        # was only correct for kernel_size=3; other sizes made the residual sum
        # fail with a shape mismatch.
        padding = kernel_size // 2

        self.conv1 = nn.Conv2d(
            in_planes,
            planes,
            kernel_size=kernel_size,
            padding=padding,
            stride=stride,
            padding_mode="zeros",
        )
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=kernel_size,
            padding=padding,
            padding_mode="zeros",
        )
        self.relu = nn.ReLU(inplace=True)

        # At least one group, so that planes < 8 does not request 0 groups.
        num_groups = max(planes // 8, 1)

        if norm_fn == "group":

            def make_norm():
                return nn.GroupNorm(num_groups=num_groups, num_channels=planes)

        elif norm_fn == "batch":

            def make_norm():
                return nn.BatchNorm2d(planes)

        elif norm_fn == "instance":

            def make_norm():
                return nn.InstanceNorm2d(planes)

        elif norm_fn == "none":

            def make_norm():
                return nn.Sequential()

        else:
            raise NotImplementedError

        self.norm1 = make_norm()
        self.norm2 = make_norm()
        if not stride == 1:
            # Only the strided shortcut needs a third normalisation layer.
            self.norm3 = make_norm()

        if stride == 1:
            self.downsample = None
        else:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride),
                self.norm3,
            )

    def forward(self, x):
        """Return ``relu(shortcut(x) + main(x))``."""
        y = self.relu(self.norm1(self.conv1(x)))
        y = self.relu(self.norm2(self.conv2(y)))

        if self.downsample is not None:
            x = self.downsample(x)

        return self.relu(x + y)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class Mlp(nn.Module):
    """Two-layer feed-forward block: fc1 -> activation -> dropout -> fc2 -> dropout.

    ``hidden_features`` and ``out_features`` fall back to ``in_features`` when
    falsy.  ``bias`` and ``drop`` are expanded with ``to_2tuple`` so each layer
    can get its own value.  With ``use_conv=True`` the two layers are 1x1
    ``nn.Conv2d`` instead of ``nn.Linear``.  ``norm_layer`` is accepted but not
    used by this implementation.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=None,
        bias=True,
        drop=0.0,
        use_conv=False,
    ):
        super().__init__()
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        biases = to_2tuple(bias)
        drop_rates = to_2tuple(drop)

        if use_conv:
            make_layer = partial(nn.Conv2d, kernel_size=1)
        else:
            make_layer = nn.Linear

        self.fc1 = make_layer(in_features, hidden_features, bias=biases[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_rates[0])
        self.fc2 = make_layer(hidden_features, out_features, bias=biases[1])
        self.drop2 = nn.Dropout(drop_rates[1])

    def forward(self, x):
        """Apply the five stages in registration order."""
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
class AttnBlock(nn.Module):
    def __init__(
        self,
        hidden_size,
        num_heads,
        attn_class: Callable[..., nn.Module] = nn.MultiheadAttention,
        mlp_ratio=4.0,
        **block_kwargs
    ):
        """
        Self attention block.

        ``attn_class`` is instantiated with ``embed_dim``, ``num_heads``,
        ``batch_first=True`` and any extra ``block_kwargs``; the MLP hidden
        width is ``int(hidden_size * mlp_ratio)``.
        """
        super().__init__()

        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)

        self.attn = attn_class(embed_dim=hidden_size, num_heads=num_heads, batch_first=True, **block_kwargs)

        self.mlp = Mlp(in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), drop=0)

    def forward(self, x, mask=None):
        """Self-attention followed by an MLP, each with a residual connection.

        ``mask`` is accepted for signature compatibility but is not passed to
        the attention layer.  The first residual is taken from the
        *normalised* input, not from the raw ``x``.
        """
        normed = self.norm1(x)

        # The attention layer returns (output, weights); only the output is used.
        attended, _ = self.attn(normed, normed, normed)

        hidden = normed + attended
        return hidden + self.mlp(self.norm2(hidden))
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
class CrossAttnBlock(nn.Module):
    def __init__(self, hidden_size, context_dim, num_heads=1, mlp_ratio=4.0, **block_kwargs):
        """
        Cross attention block.

        ``context_dim`` is accepted but not used here: the context LayerNorm
        and the attention layer are both sized by ``hidden_size``.
        """
        super().__init__()

        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm_context = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)

        self.cross_attn = nn.MultiheadAttention(
            embed_dim=hidden_size, num_heads=num_heads, batch_first=True, **block_kwargs
        )

        self.mlp = Mlp(in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), drop=0)

    def forward(self, x, context, mask=None):
        """Attend from ``x`` (queries) to ``context`` (keys and values).

        ``mask`` is forwarded as ``attn_mask``.  The first residual is taken
        from the normalised ``x``.
        """
        queries = self.norm1(x)
        keys_values = self.norm_context(context)

        # nn.MultiheadAttention returns (output, weights); weights are dropped.
        attended, _ = self.cross_attn(queries, keys_values, keys_values, attn_mask=mask)

        hidden = queries + attended
        return hidden + self.mlp(self.norm2(hidden))
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/track_modules/utils.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
|
| 5 |
+
from typing import Optional, Tuple, Union
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_2d_sincos_pos_embed(embed_dim: int, grid_size: Union[int, Tuple[int, int]], return_grid=False) -> torch.Tensor:
    """
    Build an integer coordinate grid and its 2D sine/cosine positional embedding.
    It is a wrapper of get_2d_sincos_pos_embed_from_grid.

    Args:
    - embed_dim: The embedding dimension.
    - grid_size: Either a single int (square grid) or a ``(height, width)``
      pair; both tuples and lists are accepted.
    - return_grid: When true, the coordinate grid is returned as well.

    Returns:
    - pos_embed: The embedding reshaped to ``(1, H, W, -1)`` and permuted to
      channels-first ``(1, -1, H, W)``.  With ``return_grid=True`` the result
      is the tuple ``(pos_embed, grid)`` where ``grid`` has shape
      ``(2, 1, H, W)``.
    """
    # Lists are accepted as well as tuples so that `[h, w]` behaves like `(h, w)`
    # instead of being handed to torch.arange as a scalar size.
    if isinstance(grid_size, (tuple, list)):
        grid_size_h, grid_size_w = grid_size
    else:
        grid_size_h = grid_size_w = grid_size

    grid_h = torch.arange(grid_size_h, dtype=torch.float)
    grid_w = torch.arange(grid_size_w, dtype=torch.float)
    # indexing="xy" with (grid_w, grid_h) yields two (H, W) arrays; the first
    # one holds the width coordinate, the second the height coordinate.
    grid = torch.stack(torch.meshgrid(grid_w, grid_h, indexing="xy"), dim=0)
    grid = grid.reshape([2, 1, grid_size_h, grid_size_w])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    pos_embed = pos_embed.reshape(1, grid_size_h, grid_size_w, -1).permute(0, 3, 1, 2)

    if return_grid:
        return pos_embed, grid
    return pos_embed
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def get_2d_sincos_pos_embed_from_grid(embed_dim: int, grid: torch.Tensor) -> torch.Tensor:
    """
    Generate a 2D sine/cosine positional embedding from a coordinate grid.

    Args:
    - embed_dim: The embedding dimension (must be even).
    - grid: The grid to generate the embedding from; ``grid[0]`` and
      ``grid[1]`` are embedded independently.

    Returns:
    - emb: The two 1D embeddings concatenated along the last axis, shape
      ``(1, M, embed_dim)`` with ``M`` the number of grid positions.
    """
    assert embed_dim % 2 == 0

    half_dim = embed_dim // 2
    # Each plane of the grid gets half of the channels; each call returns
    # a (1, M, D/2) tensor.
    emb_first = get_1d_sincos_pos_embed_from_grid(half_dim, grid[0])
    emb_second = get_1d_sincos_pos_embed_from_grid(half_dim, grid[1])

    return torch.cat([emb_first, emb_second], dim=2)  # (1, M, D)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def get_1d_sincos_pos_embed_from_grid(embed_dim: int, pos: torch.Tensor) -> torch.Tensor:
    """
    Generate a 1D sine/cosine positional embedding for the given positions.

    Args:
    - embed_dim: The embedding dimension (must be even).
    - pos: The positions to embed; flattened to shape ``(M,)``.

    Returns:
    - emb: float32 tensor of shape ``(1, M, embed_dim)``; the first half of
      the channels holds the sines, the second half the cosines.
    """
    assert embed_dim % 2 == 0
    # Frequencies are built in double precision on the same device as `pos`,
    # so that positions living on an accelerator do not hit a device mismatch.
    omega = torch.arange(embed_dim // 2, dtype=torch.double, device=pos.device)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = torch.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = torch.sin(out)  # (M, D/2)
    emb_cos = torch.cos(out)  # (M, D/2)

    emb = torch.cat([emb_sin, emb_cos], dim=1)  # (M, D)
    return emb[None].float()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def get_2d_embedding(xy: torch.Tensor, C: int, cat_coords: bool = True) -> torch.Tensor:
    """
    Generate a sine/cosine embedding from 2D point coordinates.

    Args:
    - xy: Coordinates of shape ``(B, N, 2)``.
    - C: Number of embedding channels per coordinate.
    - cat_coords: Whether to prepend the original coordinates to the embedding.

    Returns:
    - pe: Tensor of shape ``(B, N, 2*C)``, or ``(B, N, 2*C + 2)`` when
      ``cat_coords`` is true.  For each coordinate the even channels hold the
      sines and the odd channels the cosines.
    """
    B, N, D = xy.shape
    assert D == 2

    div_term = (torch.arange(0, C, 2, device=xy.device, dtype=torch.float32) * (1000.0 / C)).reshape(1, 1, int(C / 2))

    def _interleaved(coord):
        # coord: (B, N, 1) -> (B, N, C) laid out as sin0, cos0, sin1, cos1, ...
        phase = coord * div_term
        pairs = torch.stack([torch.sin(phase), torch.cos(phase)], dim=-1)
        return pairs.flatten(-2).to(torch.float32)

    pe_x = _interleaved(xy[:, :, 0:1])
    pe_y = _interleaved(xy[:, :, 1:2])

    pe = torch.cat([pe_x, pe_y], dim=2)  # (B, N, 2*C)
    if cat_coords:
        pe = torch.cat([xy, pe], dim=2)  # (B, N, 2*C + 2)
    return pe
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def bilinear_sampler(input, coords, align_corners=True, padding_mode="border"):
    r"""Sample a tensor using bilinear interpolation.

    Thin wrapper around `torch.nn.functional.grid_sample()` that takes
    coordinates in pixel units instead of the normalised [-1, 1] range.

    `input` is (B, C, H, W) and `coords` is (B, H_o, W_o, 2) holding points
    (x_i, y_i).  Alternatively `input` may be (B, C, T, H, W), in which case
    the points are triplets (t_i, x_i, y_i); they are reordered internally to
    the (x, y, t) order that `grid_sample()` expects.

    With `align_corners=True` each coordinate is scaled by 2 / (size - 1), so
    x is expected in [0, W-1] (pixel centres).  With `align_corners=False` it
    is scaled by 2 / size, so x is expected in [0, W] (pixel edges).  The same
    holds for y with H and t with T.

    The coordinates are detached and copied before use, so the caller's tensor
    is left untouched and no gradient flows back through `coords`.

    Args:
        input (Tensor): batch of input images.
        coords (Tensor): batch of coordinates.
        align_corners (bool, optional): Coordinate convention. Defaults to `True`.
        padding_mode (str, optional): Padding mode. Defaults to `"border"`.

    Returns:
        Tensor: sampled points.
    """
    # Work on a private copy that matches the input's device and dtype.
    grid = coords.detach().clone().to(input.device).to(input.dtype)

    spatial = input.shape[2:]
    assert len(spatial) in [2, 3]

    if len(spatial) == 3:
        # t x y -> x y t to match dimensions T H W in grid_sample
        grid = grid[..., [1, 2, 0]]

    # One factor per coordinate component, in (x, y[, t]) order, hence the
    # reversed iteration over the (..., H, W) sizes.
    if align_corners:
        factors = [2 / max(size - 1, 1) for size in reversed(spatial)]
    else:
        factors = [2 / size for size in reversed(spatial)]
    scale = torch.tensor(factors, device=grid.device, dtype=grid.dtype)

    # Map pixel units to the [-1, 1] range used by grid_sample.
    grid.mul_(scale).sub_(1)

    return F.grid_sample(input, grid, align_corners=align_corners, padding_mode=padding_mode)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def sample_features4d(input, coords):
    r"""Sample spatial features at a list of points.

    `input` is a 4D feature tensor (B, C, H, W) and `coords` is (B, R, 2)
    with one (x_i, y_i) point per row.  Sampling goes through
    :func:`bilinear_sampler` with its default arguments.

    Args:
        input (Tensor): spatial features.
        coords (Tensor): points.

    Returns:
        Tensor: sampled features of shape (B, R, C), one vector per point.
    """
    batch = input.shape[0]
    if input.dim() != 4:
        # Mirrors the original 4-way unpacking of input.shape.
        raise ValueError("expected a 4D input tensor")

    # B R 2 -> B R 1 2, i.e. an R x 1 sampling grid.
    grid = coords.unsqueeze(2)

    # B C R 1
    sampled = bilinear_sampler(input, grid)

    channels, trailing = sampled.shape[1], sampled.shape[3]
    # B C R 1 -> B R C
    return sampled.permute(0, 2, 1, 3).view(batch, -1, channels * trailing)
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/heads/utils.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def position_grid_to_embed(pos_grid: torch.Tensor, embed_dim: int, omega_0: float = 100) -> torch.Tensor:
    """
    Convert 2D position grid (HxWx2) to sinusoidal embeddings (HxWxC)

    Args:
        pos_grid: Tensor of shape (H, W, 2) containing 2D coordinates
        embed_dim: Output channel dimension for embeddings
        omega_0: Base of the frequency schedule, passed to make_sincos_pos_embed

    Returns:
        Tensor of shape (H, W, embed_dim) with positional embeddings
    """
    H, W, grid_dim = pos_grid.shape
    assert grid_dim == 2
    flat = pos_grid.reshape(-1, grid_dim)  # (H*W, 2)

    half_dim = embed_dim // 2
    # Embed each coordinate column on its own; each result is (H*W, D/2).
    per_axis = [make_sincos_pos_embed(half_dim, flat[:, axis], omega_0=omega_0) for axis in (0, 1)]

    # First-column channels first, then second-column channels.
    combined = torch.cat(per_axis, dim=-1)  # (H*W, D)

    return combined.view(H, W, embed_dim)  # (H, W, D)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def make_sincos_pos_embed(embed_dim: int, pos: torch.Tensor, omega_0: float = 100) -> torch.Tensor:
    """
    Generate a 1D sine/cosine positional embedding for the given positions.

    Args:
    - embed_dim: The embedding dimension (must be even).
    - pos: The positions to embed; flattened to shape (M,).
    - omega_0: Base of the geometric frequency schedule.

    Returns:
    - emb: float32 tensor of shape (M, embed_dim); sines in the first half of
      the channels, cosines in the second half.
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2

    # Exponents 0, 1/half_dim, ..., built in double precision on pos's device.
    exponent = torch.arange(half_dim, dtype=torch.double, device=pos.device)
    exponent /= embed_dim / 2.0
    freqs = 1.0 / omega_0**exponent  # (D/2,)

    flat_pos = pos.reshape(-1)  # (M,)
    angles = torch.einsum("m,d->md", flat_pos, freqs)  # (M, D/2), outer product

    emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)  # (M, D)
    return emb.float()
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def create_uv_grid(
    width: int, height: int, aspect_ratio: float = None, dtype: torch.dtype = None, device: torch.device = None
) -> torch.Tensor:
    """
    Create a normalized UV grid of shape (height, width, 2).

    The plane is scaled so that its half-extents are
    ``aspect_ratio / sqrt(aspect_ratio**2 + 1)`` horizontally and
    ``1 / sqrt(aspect_ratio**2 + 1)`` vertically (i.e. normalized by the
    diagonal).  The sample points are shrunk by ``(n - 1) / n`` along each
    axis, so the first and last samples sit symmetrically inside that span.

    Args:
        width (int): Number of points horizontally.
        height (int): Number of points vertically.
        aspect_ratio (float, optional): Width-to-height ratio. Defaults to width/height.
        dtype (torch.dtype, optional): Data type of the resulting tensor.
        device (torch.device, optional): Device on which the tensor is created.

    Returns:
        torch.Tensor: A (height, width, 2) tensor; the last axis holds
        (u, v) = (horizontal, vertical) coordinates.
    """
    # Derive aspect ratio if not explicitly provided
    if aspect_ratio is None:
        aspect_ratio = float(width) / float(height)

    # Normalized half-extents of the plane along X and Y
    diag_factor = (aspect_ratio**2 + 1.0) ** 0.5
    span_x = aspect_ratio / diag_factor
    span_y = 1.0 / diag_factor

    # Symmetric linspace limits for the sample positions
    limit_x = span_x * (width - 1) / width
    limit_y = span_y * (height - 1) / height

    # 1D coordinates along each axis
    x_coords = torch.linspace(-limit_x, limit_x, steps=width, dtype=dtype, device=device)
    y_coords = torch.linspace(-limit_y, limit_y, steps=height, dtype=dtype, device=device)

    # indexing="xy" produces (height, width) arrays; stack them into UV
    uu, vv = torch.meshgrid(x_coords, y_coords, indexing="xy")
    return torch.stack((uu, vv), dim=-1)
|
outdoor_v48_16gpu_v2/code/05_02-22:24:00/streamvggt/utils/geometry.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import torch
|
| 9 |
+
import numpy as np
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def unproject_depth_map_to_point_map(
    depth_map: np.ndarray, extrinsics_cam: np.ndarray, intrinsics_cam: np.ndarray
) -> np.ndarray:
    """
    Unproject a batch of depth maps to 3D world coordinates.

    Torch tensors are accepted for any argument and are converted to NumPy
    arrays (detached and moved to CPU) before processing.

    Args:
        depth_map (np.ndarray): Batch of depth maps of shape (S, H, W, 1) or (S, H, W)
        extrinsics_cam (np.ndarray): Batch of camera extrinsic matrices of shape (S, 3, 4)
        intrinsics_cam (np.ndarray): Batch of camera intrinsic matrices of shape (S, 3, 3)

    Returns:
        np.ndarray: Batch of 3D world coordinates of shape (S, H, W, 3)
    """
    # detach() lets tensors that still carry autograd history be converted.
    if isinstance(depth_map, torch.Tensor):
        depth_map = depth_map.detach().cpu().numpy()
    if isinstance(extrinsics_cam, torch.Tensor):
        extrinsics_cam = extrinsics_cam.detach().cpu().numpy()
    if isinstance(intrinsics_cam, torch.Tensor):
        intrinsics_cam = intrinsics_cam.detach().cpu().numpy()

    # Only drop the trailing channel axis when it exists. Squeezing a
    # (S, H, W) batch on its last axis would raise unless W == 1, which
    # contradicts the documented (S, H, W) input shape.
    if depth_map.ndim == 4:
        depth_map = depth_map.squeeze(-1)

    # Unproject frame by frame; only the world-space points are kept.
    world_points_list = [
        depth_to_world_coords_points(
            depth_map[frame_idx], extrinsics_cam[frame_idx], intrinsics_cam[frame_idx]
        )[0]
        for frame_idx in range(depth_map.shape[0])
    ]

    return np.stack(world_points_list, axis=0)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def depth_to_world_coords_points(
    depth_map: np.ndarray,
    extrinsic: np.ndarray,
    intrinsic: np.ndarray,
    eps=1e-8,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Lift a single depth map into world-space 3D points.

    Args:
        depth_map (np.ndarray): Depth map of shape (H, W).
        extrinsic (np.ndarray): Camera extrinsic matrix of shape (3, 4). OpenCV camera coordinate convention, cam from world.
        intrinsic (np.ndarray): Camera intrinsic matrix of shape (3, 3).
        eps (float): Depths must be strictly greater than this to count as valid.

    Returns:
        tuple[np.ndarray, np.ndarray, np.ndarray]: World coordinates (H, W, 3),
        camera coordinates (H, W, 3) and valid depth mask (H, W).
        All three entries are None when `depth_map` is None.
    """
    if depth_map is None:
        return None, None, None

    # Pixels whose depth exceeds eps are flagged as valid.
    valid_mask = depth_map > eps

    # Back-project every pixel into the camera frame.
    points_cam = depth_to_cam_coords_points(depth_map, intrinsic)

    # closed_form_inverse_se3 works on batches, so add a leading axis and
    # take the single resulting 4x4 cam-to-world matrix back out.
    cam_to_world = closed_form_inverse_se3(extrinsic[None])[0]
    rotation, translation = cam_to_world[:3, :3], cam_to_world[:3, 3]

    # (H, W, 3) . (3, 3) -> (H, W, 3), then shift by the camera position.
    points_world = np.dot(points_cam, rotation.T) + translation

    return points_world, points_cam, valid_mask
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def depth_to_cam_coords_points(depth_map: np.ndarray, intrinsic: np.ndarray) -> np.ndarray:
    """
    Convert a depth map to camera coordinates.

    Args:
        depth_map (np.ndarray): Depth map of shape (H, W).
        intrinsic (np.ndarray): Camera intrinsic matrix of shape (3, 3).

    Returns:
        np.ndarray: Camera coordinates of shape (H, W, 3), dtype float32.

    Raises:
        AssertionError: If `intrinsic` is not 3x3 or has non-zero skew terms.
    """
    H, W = depth_map.shape
    assert intrinsic.shape == (3, 3), "Intrinsic matrix must be 3x3"
    assert intrinsic[0, 1] == 0 and intrinsic[1, 0] == 0, "Intrinsic matrix must have zero skew"

    # Intrinsic parameters
    fu, fv = intrinsic[0, 0], intrinsic[1, 1]
    cu, cv = intrinsic[0, 2], intrinsic[1, 2]

    # Pixel coordinates as a row vector (columns, u) and a column vector
    # (rows, v); broadcasting against the (H, W) depth map yields the same
    # values as a dense meshgrid without materializing two (H, W) grids.
    u = np.arange(W)[None, :]
    v = np.arange(H)[:, None]

    # Unproject to camera coordinates
    x_cam = (u - cu) * depth_map / fu
    y_cam = (v - cv) * depth_map / fv
    z_cam = depth_map

    # Stack to form camera coordinates
    cam_coords = np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32)

    return cam_coords
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def closed_form_inverse_se3(se3, R=None, T=None):
    """
    Compute the inverse of each 4x4 (or 3x4) SE3 matrix in a batch.

    The inverse is formed in closed form as [[R^T, -R^T t], [0, 1]] rather
    than through a general matrix inversion.

    If `R` and `T` are provided, they must correspond to the rotation and translation
    components of `se3`. Otherwise, they will be extracted from `se3`.

    Args:
        se3: Nx4x4 or Nx3x4 array or tensor of SE3 matrices.
        R (optional): Nx3x3 array or tensor of rotation matrices.
        T (optional): Nx3x1 array or tensor of translation vectors.

    Returns:
        Nx4x4 inverted SE3 matrices. A NumPy array is returned for NumPy input
        (built from `np.eye(4)`, hence float64); otherwise a torch tensor with
        the dtype and device of `R`.

    Raises:
        ValueError: If the trailing two dimensions of `se3` are neither (4, 4) nor (3, 4).

    Shapes:
        se3: (N, 4, 4) or (N, 3, 4)
        R: (N, 3, 3)
        T: (N, 3, 1)
    """
    # Check if se3 is a numpy array or a torch tensor
    is_numpy = isinstance(se3, np.ndarray)

    # Validate shapes
    if se3.shape[-2:] != (4, 4) and se3.shape[-2:] != (3, 4):
        raise ValueError(f"se3 must be of shape (N,4,4) or (N,3,4), got {se3.shape}.")

    # Extract R and T if not provided
    if R is None:
        R = se3[:, :3, :3]  # (N,3,3)
    if T is None:
        T = se3[:, :3, 3:]  # (N,3,1)

    if is_numpy:
        # Transpose of the rotation, per batch element
        R_transposed = np.transpose(R, (0, 2, 1))
        # -R^T t
        top_right = -np.matmul(R_transposed, T)
        inverted_matrix = np.tile(np.eye(4), (len(R), 1, 1))
    else:
        R_transposed = R.transpose(1, 2)  # (N,3,3)
        top_right = -torch.bmm(R_transposed, T)  # (N,3,1)
        # Allocate the identity directly with the target dtype/device instead
        # of creating it on the CPU in float32 and converting afterwards.
        inverted_matrix = torch.eye(4, dtype=R.dtype, device=R.device).repeat(len(R), 1, 1)

    # Fill the rotation and translation parts; the bottom row stays [0, 0, 0, 1].
    inverted_matrix[:, :3, :3] = R_transposed
    inverted_matrix[:, :3, 3:] = top_right

    return inverted_matrix
|
outdoor_v48_16gpu_v2/mytrain.log
ADDED
|
@@ -0,0 +1,985 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[2026-05-02 22:24:00,638][__main__][INFO] - [RANK 0] output_dir: /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_16gpu_v2/
|
| 2 |
+
[2026-05-02 22:24:01,213][__main__][INFO] - [RANK 0] Saving current code to /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_16gpu_v2/code/05_02-22:24:00
|
| 3 |
+
[2026-05-02 22:24:01,213][__main__][INFO] - [RANK 0] job dir: /gpfs/work2/0/prjs0824/qi_proj/slamformer_upstream/src
|
| 4 |
+
[2026-05-02 22:24:01,213][__main__][INFO] - [RANK 0] Setting seed to 0 for process 0
|
| 5 |
+
[2026-05-02 22:24:01,215][__main__][INFO] - [RANK 0] Building train dataset 6000 @ VirtualKITTI2_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0) + 6000 @ KITTI360_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/downloads/kitti360/KITTI-360", velodyne_root="/scratch-shared/wwei2/downloads/kitti360/KITTI-360", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0) + 5400 @ Waymo_v2_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/waymo_v2", lidar_root="/scratch-shared/wwei2/waymo_v2", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0)
|
| 6 |
+
[2026-05-02 22:24:01,215][__main__][INFO] - [RANK 0] Building Train Data loader for dataset: 6000 @ VirtualKITTI2_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0) + 6000 @ KITTI360_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/downloads/kitti360/KITTI-360", velodyne_root="/scratch-shared/wwei2/downloads/kitti360/KITTI-360", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0) + 5400 @ Waymo_v2_Multi(allow_repeat=False, split='train', ROOT="/scratch-shared/wwei2/waymo_v2", lidar_root="/scratch-shared/wwei2/waymo_v2", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=64, n_corres=0)
|
| 7 |
+
[2026-05-02 22:27:56,308][__main__][INFO] - [RANK 0] Building test dataset 200 @ VirtualKITTI2_Multi(split='train', ROOT="/scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti", resolution=(518, 154), num_views=4, seed=42, n_corres=0)
|
| 8 |
+
[2026-05-02 22:27:56,308][__main__][INFO] - [RANK 0] Building Test Data loader for dataset: 200 @ VirtualKITTI2_Multi(split='train', ROOT="/scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti", resolution=(518, 154), num_views=4, seed=42, n_corres=0)
|
| 9 |
+
[2026-05-02 22:27:56,361][__main__][INFO] - [RANK 0] Loading model
|
| 10 |
+
[2026-05-02 22:28:02,069][__main__][INFO] - [RANK 0] All model parameters: 958696732
|
| 11 |
+
[2026-05-02 22:28:02,069][__main__][INFO] - [RANK 0] >> Creating train criterion = DistillLoss()
|
| 12 |
+
[2026-05-02 22:28:02,070][__main__][INFO] - [RANK 0] >> Creating test criterion = DistillLoss()
|
| 13 |
+
[2026-05-02 22:28:02,336][__main__][INFO] - [RANK 0] Freezing patch embedding and positional encoding parameters...
|
| 14 |
+
[2026-05-02 22:28:02,341][__main__][INFO] - [RANK 0] Frozen 304,376,832 parameters out of 958,696,732 total parameters. (31.75%)
|
| 15 |
+
[2026-05-02 22:28:02,341][__main__][INFO] - [RANK 0] Trainable parameters: 654,319,900 (68.25%)
|
| 16 |
+
[2026-05-02 22:28:02,341][__main__][INFO] - [RANK 0] Example frozen parameters: register_token, encoder.cls_token, encoder.pos_embed, encoder.register_tokens, encoder.patch_embed.proj.weight...
|
| 17 |
+
[2026-05-02 22:28:02,345][croco.utils.misc][INFO] - [RANK 0] Param groups = {
|
| 18 |
+
"no_decay": {
|
| 19 |
+
"weight_decay": 0.0,
|
| 20 |
+
"params": [
|
| 21 |
+
"decoder.0.norm1.weight",
|
| 22 |
+
"decoder.0.norm1.bias",
|
| 23 |
+
"decoder.0.attn.qkv.bias",
|
| 24 |
+
"decoder.0.attn.proj.bias",
|
| 25 |
+
"decoder.0.attn.q_norm.weight",
|
| 26 |
+
"decoder.0.attn.q_norm.bias",
|
| 27 |
+
"decoder.0.attn.k_norm.weight",
|
| 28 |
+
"decoder.0.attn.k_norm.bias",
|
| 29 |
+
"decoder.0.ls1.gamma",
|
| 30 |
+
"decoder.0.norm2.weight",
|
| 31 |
+
"decoder.0.norm2.bias",
|
| 32 |
+
"decoder.0.mlp.fc1.bias",
|
| 33 |
+
"decoder.0.mlp.fc2.bias",
|
| 34 |
+
"decoder.0.ls2.gamma",
|
| 35 |
+
"decoder.1.norm1.weight",
|
| 36 |
+
"decoder.1.norm1.bias",
|
| 37 |
+
"decoder.1.attn.qkv.bias",
|
| 38 |
+
"decoder.1.attn.proj.bias",
|
| 39 |
+
"decoder.1.attn.q_norm.weight",
|
| 40 |
+
"decoder.1.attn.q_norm.bias",
|
| 41 |
+
"decoder.1.attn.k_norm.weight",
|
| 42 |
+
"decoder.1.attn.k_norm.bias",
|
| 43 |
+
"decoder.1.ls1.gamma",
|
| 44 |
+
"decoder.1.norm2.weight",
|
| 45 |
+
"decoder.1.norm2.bias",
|
| 46 |
+
"decoder.1.mlp.fc1.bias",
|
| 47 |
+
"decoder.1.mlp.fc2.bias",
|
| 48 |
+
"decoder.1.ls2.gamma",
|
| 49 |
+
"decoder.2.norm1.weight",
|
| 50 |
+
"decoder.2.norm1.bias",
|
| 51 |
+
"decoder.2.attn.qkv.bias",
|
| 52 |
+
"decoder.2.attn.proj.bias",
|
| 53 |
+
"decoder.2.attn.q_norm.weight",
|
| 54 |
+
"decoder.2.attn.q_norm.bias",
|
| 55 |
+
"decoder.2.attn.k_norm.weight",
|
| 56 |
+
"decoder.2.attn.k_norm.bias",
|
| 57 |
+
"decoder.2.ls1.gamma",
|
| 58 |
+
"decoder.2.norm2.weight",
|
| 59 |
+
"decoder.2.norm2.bias",
|
| 60 |
+
"decoder.2.mlp.fc1.bias",
|
| 61 |
+
"decoder.2.mlp.fc2.bias",
|
| 62 |
+
"decoder.2.ls2.gamma",
|
| 63 |
+
"decoder.3.norm1.weight",
|
| 64 |
+
"decoder.3.norm1.bias",
|
| 65 |
+
"decoder.3.attn.qkv.bias",
|
| 66 |
+
"decoder.3.attn.proj.bias",
|
| 67 |
+
"decoder.3.attn.q_norm.weight",
|
| 68 |
+
"decoder.3.attn.q_norm.bias",
|
| 69 |
+
"decoder.3.attn.k_norm.weight",
|
| 70 |
+
"decoder.3.attn.k_norm.bias",
|
| 71 |
+
"decoder.3.ls1.gamma",
|
| 72 |
+
"decoder.3.norm2.weight",
|
| 73 |
+
"decoder.3.norm2.bias",
|
| 74 |
+
"decoder.3.mlp.fc1.bias",
|
| 75 |
+
"decoder.3.mlp.fc2.bias",
|
| 76 |
+
"decoder.3.ls2.gamma",
|
| 77 |
+
"decoder.4.norm1.weight",
|
| 78 |
+
"decoder.4.norm1.bias",
|
| 79 |
+
"decoder.4.attn.qkv.bias",
|
| 80 |
+
"decoder.4.attn.proj.bias",
|
| 81 |
+
"decoder.4.attn.q_norm.weight",
|
| 82 |
+
"decoder.4.attn.q_norm.bias",
|
| 83 |
+
"decoder.4.attn.k_norm.weight",
|
| 84 |
+
"decoder.4.attn.k_norm.bias",
|
| 85 |
+
"decoder.4.ls1.gamma",
|
| 86 |
+
"decoder.4.norm2.weight",
|
| 87 |
+
"decoder.4.norm2.bias",
|
| 88 |
+
"decoder.4.mlp.fc1.bias",
|
| 89 |
+
"decoder.4.mlp.fc2.bias",
|
| 90 |
+
"decoder.4.ls2.gamma",
|
| 91 |
+
"decoder.5.norm1.weight",
|
| 92 |
+
"decoder.5.norm1.bias",
|
| 93 |
+
"decoder.5.attn.qkv.bias",
|
| 94 |
+
"decoder.5.attn.proj.bias",
|
| 95 |
+
"decoder.5.attn.q_norm.weight",
|
| 96 |
+
"decoder.5.attn.q_norm.bias",
|
| 97 |
+
"decoder.5.attn.k_norm.weight",
|
| 98 |
+
"decoder.5.attn.k_norm.bias",
|
| 99 |
+
"decoder.5.ls1.gamma",
|
| 100 |
+
"decoder.5.norm2.weight",
|
| 101 |
+
"decoder.5.norm2.bias",
|
| 102 |
+
"decoder.5.mlp.fc1.bias",
|
| 103 |
+
"decoder.5.mlp.fc2.bias",
|
| 104 |
+
"decoder.5.ls2.gamma",
|
| 105 |
+
"decoder.6.norm1.weight",
|
| 106 |
+
"decoder.6.norm1.bias",
|
| 107 |
+
"decoder.6.attn.qkv.bias",
|
| 108 |
+
"decoder.6.attn.proj.bias",
|
| 109 |
+
"decoder.6.attn.q_norm.weight",
|
| 110 |
+
"decoder.6.attn.q_norm.bias",
|
| 111 |
+
"decoder.6.attn.k_norm.weight",
|
| 112 |
+
"decoder.6.attn.k_norm.bias",
|
| 113 |
+
"decoder.6.ls1.gamma",
|
| 114 |
+
"decoder.6.norm2.weight",
|
| 115 |
+
"decoder.6.norm2.bias",
|
| 116 |
+
"decoder.6.mlp.fc1.bias",
|
| 117 |
+
"decoder.6.mlp.fc2.bias",
|
| 118 |
+
"decoder.6.ls2.gamma",
|
| 119 |
+
"decoder.7.norm1.weight",
|
| 120 |
+
"decoder.7.norm1.bias",
|
| 121 |
+
"decoder.7.attn.qkv.bias",
|
| 122 |
+
"decoder.7.attn.proj.bias",
|
| 123 |
+
"decoder.7.attn.q_norm.weight",
|
| 124 |
+
"decoder.7.attn.q_norm.bias",
|
| 125 |
+
"decoder.7.attn.k_norm.weight",
|
| 126 |
+
"decoder.7.attn.k_norm.bias",
|
| 127 |
+
"decoder.7.ls1.gamma",
|
| 128 |
+
"decoder.7.norm2.weight",
|
| 129 |
+
"decoder.7.norm2.bias",
|
| 130 |
+
"decoder.7.mlp.fc1.bias",
|
| 131 |
+
"decoder.7.mlp.fc2.bias",
|
| 132 |
+
"decoder.7.ls2.gamma",
|
| 133 |
+
"decoder.8.norm1.weight",
|
| 134 |
+
"decoder.8.norm1.bias",
|
| 135 |
+
"decoder.8.attn.qkv.bias",
|
| 136 |
+
"decoder.8.attn.proj.bias",
|
| 137 |
+
"decoder.8.attn.q_norm.weight",
|
| 138 |
+
"decoder.8.attn.q_norm.bias",
|
| 139 |
+
"decoder.8.attn.k_norm.weight",
|
| 140 |
+
"decoder.8.attn.k_norm.bias",
|
| 141 |
+
"decoder.8.ls1.gamma",
|
| 142 |
+
"decoder.8.norm2.weight",
|
| 143 |
+
"decoder.8.norm2.bias",
|
| 144 |
+
"decoder.8.mlp.fc1.bias",
|
| 145 |
+
"decoder.8.mlp.fc2.bias",
|
| 146 |
+
"decoder.8.ls2.gamma",
|
| 147 |
+
"decoder.9.norm1.weight",
|
| 148 |
+
"decoder.9.norm1.bias",
|
| 149 |
+
"decoder.9.attn.qkv.bias",
|
| 150 |
+
"decoder.9.attn.proj.bias",
|
| 151 |
+
"decoder.9.attn.q_norm.weight",
|
| 152 |
+
"decoder.9.attn.q_norm.bias",
|
| 153 |
+
"decoder.9.attn.k_norm.weight",
|
| 154 |
+
"decoder.9.attn.k_norm.bias",
|
| 155 |
+
"decoder.9.ls1.gamma",
|
| 156 |
+
"decoder.9.norm2.weight",
|
| 157 |
+
"decoder.9.norm2.bias",
|
| 158 |
+
"decoder.9.mlp.fc1.bias",
|
| 159 |
+
"decoder.9.mlp.fc2.bias",
|
| 160 |
+
"decoder.9.ls2.gamma",
|
| 161 |
+
"decoder.10.norm1.weight",
|
| 162 |
+
"decoder.10.norm1.bias",
|
| 163 |
+
"decoder.10.attn.qkv.bias",
|
| 164 |
+
"decoder.10.attn.proj.bias",
|
| 165 |
+
"decoder.10.attn.q_norm.weight",
|
| 166 |
+
"decoder.10.attn.q_norm.bias",
|
| 167 |
+
"decoder.10.attn.k_norm.weight",
|
| 168 |
+
"decoder.10.attn.k_norm.bias",
|
| 169 |
+
"decoder.10.ls1.gamma",
|
| 170 |
+
"decoder.10.norm2.weight",
|
| 171 |
+
"decoder.10.norm2.bias",
|
| 172 |
+
"decoder.10.mlp.fc1.bias",
|
| 173 |
+
"decoder.10.mlp.fc2.bias",
|
| 174 |
+
"decoder.10.ls2.gamma",
|
| 175 |
+
"decoder.11.norm1.weight",
|
| 176 |
+
"decoder.11.norm1.bias",
|
| 177 |
+
"decoder.11.attn.qkv.bias",
|
| 178 |
+
"decoder.11.attn.proj.bias",
|
| 179 |
+
"decoder.11.attn.q_norm.weight",
|
| 180 |
+
"decoder.11.attn.q_norm.bias",
|
| 181 |
+
"decoder.11.attn.k_norm.weight",
|
| 182 |
+
"decoder.11.attn.k_norm.bias",
|
| 183 |
+
"decoder.11.ls1.gamma",
|
| 184 |
+
"decoder.11.norm2.weight",
|
| 185 |
+
"decoder.11.norm2.bias",
|
| 186 |
+
"decoder.11.mlp.fc1.bias",
|
| 187 |
+
"decoder.11.mlp.fc2.bias",
|
| 188 |
+
"decoder.11.ls2.gamma",
|
| 189 |
+
"decoder.12.norm1.weight",
|
| 190 |
+
"decoder.12.norm1.bias",
|
| 191 |
+
"decoder.12.attn.qkv.bias",
|
| 192 |
+
"decoder.12.attn.proj.bias",
|
| 193 |
+
"decoder.12.attn.q_norm.weight",
|
| 194 |
+
"decoder.12.attn.q_norm.bias",
|
| 195 |
+
"decoder.12.attn.k_norm.weight",
|
| 196 |
+
"decoder.12.attn.k_norm.bias",
|
| 197 |
+
"decoder.12.ls1.gamma",
|
| 198 |
+
"decoder.12.norm2.weight",
|
| 199 |
+
"decoder.12.norm2.bias",
|
| 200 |
+
"decoder.12.mlp.fc1.bias",
|
| 201 |
+
"decoder.12.mlp.fc2.bias",
|
| 202 |
+
"decoder.12.ls2.gamma",
|
| 203 |
+
"decoder.13.norm1.weight",
|
| 204 |
+
"decoder.13.norm1.bias",
|
| 205 |
+
"decoder.13.attn.qkv.bias",
|
| 206 |
+
"decoder.13.attn.proj.bias",
|
| 207 |
+
"decoder.13.attn.q_norm.weight",
|
| 208 |
+
"decoder.13.attn.q_norm.bias",
|
| 209 |
+
"decoder.13.attn.k_norm.weight",
|
| 210 |
+
"decoder.13.attn.k_norm.bias",
|
| 211 |
+
"decoder.13.ls1.gamma",
|
| 212 |
+
"decoder.13.norm2.weight",
|
| 213 |
+
"decoder.13.norm2.bias",
|
| 214 |
+
"decoder.13.mlp.fc1.bias",
|
| 215 |
+
"decoder.13.mlp.fc2.bias",
|
| 216 |
+
"decoder.13.ls2.gamma",
|
| 217 |
+
"decoder.14.norm1.weight",
|
| 218 |
+
"decoder.14.norm1.bias",
|
| 219 |
+
"decoder.14.attn.qkv.bias",
|
| 220 |
+
"decoder.14.attn.proj.bias",
|
| 221 |
+
"decoder.14.attn.q_norm.weight",
|
| 222 |
+
"decoder.14.attn.q_norm.bias",
|
| 223 |
+
"decoder.14.attn.k_norm.weight",
|
| 224 |
+
"decoder.14.attn.k_norm.bias",
|
| 225 |
+
"decoder.14.ls1.gamma",
|
| 226 |
+
"decoder.14.norm2.weight",
|
| 227 |
+
"decoder.14.norm2.bias",
|
| 228 |
+
"decoder.14.mlp.fc1.bias",
|
| 229 |
+
"decoder.14.mlp.fc2.bias",
|
| 230 |
+
"decoder.14.ls2.gamma",
|
| 231 |
+
"decoder.15.norm1.weight",
|
| 232 |
+
"decoder.15.norm1.bias",
|
| 233 |
+
"decoder.15.attn.qkv.bias",
|
| 234 |
+
"decoder.15.attn.proj.bias",
|
| 235 |
+
"decoder.15.attn.q_norm.weight",
|
| 236 |
+
"decoder.15.attn.q_norm.bias",
|
| 237 |
+
"decoder.15.attn.k_norm.weight",
|
| 238 |
+
"decoder.15.attn.k_norm.bias",
|
| 239 |
+
"decoder.15.ls1.gamma",
|
| 240 |
+
"decoder.15.norm2.weight",
|
| 241 |
+
"decoder.15.norm2.bias",
|
| 242 |
+
"decoder.15.mlp.fc1.bias",
|
| 243 |
+
"decoder.15.mlp.fc2.bias",
|
| 244 |
+
"decoder.15.ls2.gamma",
|
| 245 |
+
"decoder.16.norm1.weight",
|
| 246 |
+
"decoder.16.norm1.bias",
|
| 247 |
+
"decoder.16.attn.qkv.bias",
|
| 248 |
+
"decoder.16.attn.proj.bias",
|
| 249 |
+
"decoder.16.attn.q_norm.weight",
|
| 250 |
+
"decoder.16.attn.q_norm.bias",
|
| 251 |
+
"decoder.16.attn.k_norm.weight",
|
| 252 |
+
"decoder.16.attn.k_norm.bias",
|
| 253 |
+
"decoder.16.ls1.gamma",
|
| 254 |
+
"decoder.16.norm2.weight",
|
| 255 |
+
"decoder.16.norm2.bias",
|
| 256 |
+
"decoder.16.mlp.fc1.bias",
|
| 257 |
+
"decoder.16.mlp.fc2.bias",
|
| 258 |
+
"decoder.16.ls2.gamma",
|
| 259 |
+
"decoder.17.norm1.weight",
|
| 260 |
+
"decoder.17.norm1.bias",
|
| 261 |
+
"decoder.17.attn.qkv.bias",
|
| 262 |
+
"decoder.17.attn.proj.bias",
|
| 263 |
+
"decoder.17.attn.q_norm.weight",
|
| 264 |
+
"decoder.17.attn.q_norm.bias",
|
| 265 |
+
"decoder.17.attn.k_norm.weight",
|
| 266 |
+
"decoder.17.attn.k_norm.bias",
|
| 267 |
+
"decoder.17.ls1.gamma",
|
| 268 |
+
"decoder.17.norm2.weight",
|
| 269 |
+
"decoder.17.norm2.bias",
|
| 270 |
+
"decoder.17.mlp.fc1.bias",
|
| 271 |
+
"decoder.17.mlp.fc2.bias",
|
| 272 |
+
"decoder.17.ls2.gamma",
|
| 273 |
+
"decoder.18.norm1.weight",
|
| 274 |
+
"decoder.18.norm1.bias",
|
| 275 |
+
"decoder.18.attn.qkv.bias",
|
| 276 |
+
"decoder.18.attn.proj.bias",
|
| 277 |
+
"decoder.18.attn.q_norm.weight",
|
| 278 |
+
"decoder.18.attn.q_norm.bias",
|
| 279 |
+
"decoder.18.attn.k_norm.weight",
|
| 280 |
+
"decoder.18.attn.k_norm.bias",
|
| 281 |
+
"decoder.18.ls1.gamma",
|
| 282 |
+
"decoder.18.norm2.weight",
|
| 283 |
+
"decoder.18.norm2.bias",
|
| 284 |
+
"decoder.18.mlp.fc1.bias",
|
| 285 |
+
"decoder.18.mlp.fc2.bias",
|
| 286 |
+
"decoder.18.ls2.gamma",
|
| 287 |
+
"decoder.19.norm1.weight",
|
| 288 |
+
"decoder.19.norm1.bias",
|
| 289 |
+
"decoder.19.attn.qkv.bias",
|
| 290 |
+
"decoder.19.attn.proj.bias",
|
| 291 |
+
"decoder.19.attn.q_norm.weight",
|
| 292 |
+
"decoder.19.attn.q_norm.bias",
|
| 293 |
+
"decoder.19.attn.k_norm.weight",
|
| 294 |
+
"decoder.19.attn.k_norm.bias",
|
| 295 |
+
"decoder.19.ls1.gamma",
|
| 296 |
+
"decoder.19.norm2.weight",
|
| 297 |
+
"decoder.19.norm2.bias",
|
| 298 |
+
"decoder.19.mlp.fc1.bias",
|
| 299 |
+
"decoder.19.mlp.fc2.bias",
|
| 300 |
+
"decoder.19.ls2.gamma",
|
| 301 |
+
"decoder.20.norm1.weight",
|
| 302 |
+
"decoder.20.norm1.bias",
|
| 303 |
+
"decoder.20.attn.qkv.bias",
|
| 304 |
+
"decoder.20.attn.proj.bias",
|
| 305 |
+
"decoder.20.attn.q_norm.weight",
|
| 306 |
+
"decoder.20.attn.q_norm.bias",
|
| 307 |
+
"decoder.20.attn.k_norm.weight",
|
| 308 |
+
"decoder.20.attn.k_norm.bias",
|
| 309 |
+
"decoder.20.ls1.gamma",
|
| 310 |
+
"decoder.20.norm2.weight",
|
| 311 |
+
"decoder.20.norm2.bias",
|
| 312 |
+
"decoder.20.mlp.fc1.bias",
|
| 313 |
+
"decoder.20.mlp.fc2.bias",
|
| 314 |
+
"decoder.20.ls2.gamma",
|
| 315 |
+
"decoder.21.norm1.weight",
|
| 316 |
+
"decoder.21.norm1.bias",
|
| 317 |
+
"decoder.21.attn.qkv.bias",
|
| 318 |
+
"decoder.21.attn.proj.bias",
|
| 319 |
+
"decoder.21.attn.q_norm.weight",
|
| 320 |
+
"decoder.21.attn.q_norm.bias",
|
| 321 |
+
"decoder.21.attn.k_norm.weight",
|
| 322 |
+
"decoder.21.attn.k_norm.bias",
|
| 323 |
+
"decoder.21.ls1.gamma",
|
| 324 |
+
"decoder.21.norm2.weight",
|
| 325 |
+
"decoder.21.norm2.bias",
|
| 326 |
+
"decoder.21.mlp.fc1.bias",
|
| 327 |
+
"decoder.21.mlp.fc2.bias",
|
| 328 |
+
"decoder.21.ls2.gamma",
|
| 329 |
+
"decoder.22.norm1.weight",
|
| 330 |
+
"decoder.22.norm1.bias",
|
| 331 |
+
"decoder.22.attn.qkv.bias",
|
| 332 |
+
"decoder.22.attn.proj.bias",
|
| 333 |
+
"decoder.22.attn.q_norm.weight",
|
| 334 |
+
"decoder.22.attn.q_norm.bias",
|
| 335 |
+
"decoder.22.attn.k_norm.weight",
|
| 336 |
+
"decoder.22.attn.k_norm.bias",
|
| 337 |
+
"decoder.22.ls1.gamma",
|
| 338 |
+
"decoder.22.norm2.weight",
|
| 339 |
+
"decoder.22.norm2.bias",
|
| 340 |
+
"decoder.22.mlp.fc1.bias",
|
| 341 |
+
"decoder.22.mlp.fc2.bias",
|
| 342 |
+
"decoder.22.ls2.gamma",
|
| 343 |
+
"decoder.23.norm1.weight",
|
| 344 |
+
"decoder.23.norm1.bias",
|
| 345 |
+
"decoder.23.attn.qkv.bias",
|
| 346 |
+
"decoder.23.attn.proj.bias",
|
| 347 |
+
"decoder.23.attn.q_norm.weight",
|
| 348 |
+
"decoder.23.attn.q_norm.bias",
|
| 349 |
+
"decoder.23.attn.k_norm.weight",
|
| 350 |
+
"decoder.23.attn.k_norm.bias",
|
| 351 |
+
"decoder.23.ls1.gamma",
|
| 352 |
+
"decoder.23.norm2.weight",
|
| 353 |
+
"decoder.23.norm2.bias",
|
| 354 |
+
"decoder.23.mlp.fc1.bias",
|
| 355 |
+
"decoder.23.mlp.fc2.bias",
|
| 356 |
+
"decoder.23.ls2.gamma",
|
| 357 |
+
"decoder.24.norm1.weight",
|
| 358 |
+
"decoder.24.norm1.bias",
|
| 359 |
+
"decoder.24.attn.qkv.bias",
|
| 360 |
+
"decoder.24.attn.proj.bias",
|
| 361 |
+
"decoder.24.attn.q_norm.weight",
|
| 362 |
+
"decoder.24.attn.q_norm.bias",
|
| 363 |
+
"decoder.24.attn.k_norm.weight",
|
| 364 |
+
"decoder.24.attn.k_norm.bias",
|
| 365 |
+
"decoder.24.ls1.gamma",
|
| 366 |
+
"decoder.24.norm2.weight",
|
| 367 |
+
"decoder.24.norm2.bias",
|
| 368 |
+
"decoder.24.mlp.fc1.bias",
|
| 369 |
+
"decoder.24.mlp.fc2.bias",
|
| 370 |
+
"decoder.24.ls2.gamma",
|
| 371 |
+
"decoder.25.norm1.weight",
|
| 372 |
+
"decoder.25.norm1.bias",
|
| 373 |
+
"decoder.25.attn.qkv.bias",
|
| 374 |
+
"decoder.25.attn.proj.bias",
|
| 375 |
+
"decoder.25.attn.q_norm.weight",
|
| 376 |
+
"decoder.25.attn.q_norm.bias",
|
| 377 |
+
"decoder.25.attn.k_norm.weight",
|
| 378 |
+
"decoder.25.attn.k_norm.bias",
|
| 379 |
+
"decoder.25.ls1.gamma",
|
| 380 |
+
"decoder.25.norm2.weight",
|
| 381 |
+
"decoder.25.norm2.bias",
|
| 382 |
+
"decoder.25.mlp.fc1.bias",
|
| 383 |
+
"decoder.25.mlp.fc2.bias",
|
| 384 |
+
"decoder.25.ls2.gamma",
|
| 385 |
+
"decoder.26.norm1.weight",
|
| 386 |
+
"decoder.26.norm1.bias",
|
| 387 |
+
"decoder.26.attn.qkv.bias",
|
| 388 |
+
"decoder.26.attn.proj.bias",
|
| 389 |
+
"decoder.26.attn.q_norm.weight",
|
| 390 |
+
"decoder.26.attn.q_norm.bias",
|
| 391 |
+
"decoder.26.attn.k_norm.weight",
|
| 392 |
+
"decoder.26.attn.k_norm.bias",
|
| 393 |
+
"decoder.26.ls1.gamma",
|
| 394 |
+
"decoder.26.norm2.weight",
|
| 395 |
+
"decoder.26.norm2.bias",
|
| 396 |
+
"decoder.26.mlp.fc1.bias",
|
| 397 |
+
"decoder.26.mlp.fc2.bias",
|
| 398 |
+
"decoder.26.ls2.gamma",
|
| 399 |
+
"decoder.27.norm1.weight",
|
| 400 |
+
"decoder.27.norm1.bias",
|
| 401 |
+
"decoder.27.attn.qkv.bias",
|
| 402 |
+
"decoder.27.attn.proj.bias",
|
| 403 |
+
"decoder.27.attn.q_norm.weight",
|
| 404 |
+
"decoder.27.attn.q_norm.bias",
|
| 405 |
+
"decoder.27.attn.k_norm.weight",
|
| 406 |
+
"decoder.27.attn.k_norm.bias",
|
| 407 |
+
"decoder.27.ls1.gamma",
|
| 408 |
+
"decoder.27.norm2.weight",
|
| 409 |
+
"decoder.27.norm2.bias",
|
| 410 |
+
"decoder.27.mlp.fc1.bias",
|
| 411 |
+
"decoder.27.mlp.fc2.bias",
|
| 412 |
+
"decoder.27.ls2.gamma",
|
| 413 |
+
"decoder.28.norm1.weight",
|
| 414 |
+
"decoder.28.norm1.bias",
|
| 415 |
+
"decoder.28.attn.qkv.bias",
|
| 416 |
+
"decoder.28.attn.proj.bias",
|
| 417 |
+
"decoder.28.attn.q_norm.weight",
|
| 418 |
+
"decoder.28.attn.q_norm.bias",
|
| 419 |
+
"decoder.28.attn.k_norm.weight",
|
| 420 |
+
"decoder.28.attn.k_norm.bias",
|
| 421 |
+
"decoder.28.ls1.gamma",
|
| 422 |
+
"decoder.28.norm2.weight",
|
| 423 |
+
"decoder.28.norm2.bias",
|
| 424 |
+
"decoder.28.mlp.fc1.bias",
|
| 425 |
+
"decoder.28.mlp.fc2.bias",
|
| 426 |
+
"decoder.28.ls2.gamma",
|
| 427 |
+
"decoder.29.norm1.weight",
|
| 428 |
+
"decoder.29.norm1.bias",
|
| 429 |
+
"decoder.29.attn.qkv.bias",
|
| 430 |
+
"decoder.29.attn.proj.bias",
|
| 431 |
+
"decoder.29.attn.q_norm.weight",
|
| 432 |
+
"decoder.29.attn.q_norm.bias",
|
| 433 |
+
"decoder.29.attn.k_norm.weight",
|
| 434 |
+
"decoder.29.attn.k_norm.bias",
|
| 435 |
+
"decoder.29.ls1.gamma",
|
| 436 |
+
"decoder.29.norm2.weight",
|
| 437 |
+
"decoder.29.norm2.bias",
|
| 438 |
+
"decoder.29.mlp.fc1.bias",
|
| 439 |
+
"decoder.29.mlp.fc2.bias",
|
| 440 |
+
"decoder.29.ls2.gamma",
|
| 441 |
+
"decoder.30.norm1.weight",
|
| 442 |
+
"decoder.30.norm1.bias",
|
| 443 |
+
"decoder.30.attn.qkv.bias",
|
| 444 |
+
"decoder.30.attn.proj.bias",
|
| 445 |
+
"decoder.30.attn.q_norm.weight",
|
| 446 |
+
"decoder.30.attn.q_norm.bias",
|
| 447 |
+
"decoder.30.attn.k_norm.weight",
|
| 448 |
+
"decoder.30.attn.k_norm.bias",
|
| 449 |
+
"decoder.30.ls1.gamma",
|
| 450 |
+
"decoder.30.norm2.weight",
|
| 451 |
+
"decoder.30.norm2.bias",
|
| 452 |
+
"decoder.30.mlp.fc1.bias",
|
| 453 |
+
"decoder.30.mlp.fc2.bias",
|
| 454 |
+
"decoder.30.ls2.gamma",
|
| 455 |
+
"decoder.31.norm1.weight",
|
| 456 |
+
"decoder.31.norm1.bias",
|
| 457 |
+
"decoder.31.attn.qkv.bias",
|
| 458 |
+
"decoder.31.attn.proj.bias",
|
| 459 |
+
"decoder.31.attn.q_norm.weight",
|
| 460 |
+
"decoder.31.attn.q_norm.bias",
|
| 461 |
+
"decoder.31.attn.k_norm.weight",
|
| 462 |
+
"decoder.31.attn.k_norm.bias",
|
| 463 |
+
"decoder.31.ls1.gamma",
|
| 464 |
+
"decoder.31.norm2.weight",
|
| 465 |
+
"decoder.31.norm2.bias",
|
| 466 |
+
"decoder.31.mlp.fc1.bias",
|
| 467 |
+
"decoder.31.mlp.fc2.bias",
|
| 468 |
+
"decoder.31.ls2.gamma",
|
| 469 |
+
"decoder.32.norm1.weight",
|
| 470 |
+
"decoder.32.norm1.bias",
|
| 471 |
+
"decoder.32.attn.qkv.bias",
|
| 472 |
+
"decoder.32.attn.proj.bias",
|
| 473 |
+
"decoder.32.attn.q_norm.weight",
|
| 474 |
+
"decoder.32.attn.q_norm.bias",
|
| 475 |
+
"decoder.32.attn.k_norm.weight",
|
| 476 |
+
"decoder.32.attn.k_norm.bias",
|
| 477 |
+
"decoder.32.ls1.gamma",
|
| 478 |
+
"decoder.32.norm2.weight",
|
| 479 |
+
"decoder.32.norm2.bias",
|
| 480 |
+
"decoder.32.mlp.fc1.bias",
|
| 481 |
+
"decoder.32.mlp.fc2.bias",
|
| 482 |
+
"decoder.32.ls2.gamma",
|
| 483 |
+
"decoder.33.norm1.weight",
|
| 484 |
+
"decoder.33.norm1.bias",
|
| 485 |
+
"decoder.33.attn.qkv.bias",
|
| 486 |
+
"decoder.33.attn.proj.bias",
|
| 487 |
+
"decoder.33.attn.q_norm.weight",
|
| 488 |
+
"decoder.33.attn.q_norm.bias",
|
| 489 |
+
"decoder.33.attn.k_norm.weight",
|
| 490 |
+
"decoder.33.attn.k_norm.bias",
|
| 491 |
+
"decoder.33.ls1.gamma",
|
| 492 |
+
"decoder.33.norm2.weight",
|
| 493 |
+
"decoder.33.norm2.bias",
|
| 494 |
+
"decoder.33.mlp.fc1.bias",
|
| 495 |
+
"decoder.33.mlp.fc2.bias",
|
| 496 |
+
"decoder.33.ls2.gamma",
|
| 497 |
+
"decoder.34.norm1.weight",
|
| 498 |
+
"decoder.34.norm1.bias",
|
| 499 |
+
"decoder.34.attn.qkv.bias",
|
| 500 |
+
"decoder.34.attn.proj.bias",
|
| 501 |
+
"decoder.34.attn.q_norm.weight",
|
| 502 |
+
"decoder.34.attn.q_norm.bias",
|
| 503 |
+
"decoder.34.attn.k_norm.weight",
|
| 504 |
+
"decoder.34.attn.k_norm.bias",
|
| 505 |
+
"decoder.34.ls1.gamma",
|
| 506 |
+
"decoder.34.norm2.weight",
|
| 507 |
+
"decoder.34.norm2.bias",
|
| 508 |
+
"decoder.34.mlp.fc1.bias",
|
| 509 |
+
"decoder.34.mlp.fc2.bias",
|
| 510 |
+
"decoder.34.ls2.gamma",
|
| 511 |
+
"decoder.35.norm1.weight",
|
| 512 |
+
"decoder.35.norm1.bias",
|
| 513 |
+
"decoder.35.attn.qkv.bias",
|
| 514 |
+
"decoder.35.attn.proj.bias",
|
| 515 |
+
"decoder.35.attn.q_norm.weight",
|
| 516 |
+
"decoder.35.attn.q_norm.bias",
|
| 517 |
+
"decoder.35.attn.k_norm.weight",
|
| 518 |
+
"decoder.35.attn.k_norm.bias",
|
| 519 |
+
"decoder.35.ls1.gamma",
|
| 520 |
+
"decoder.35.norm2.weight",
|
| 521 |
+
"decoder.35.norm2.bias",
|
| 522 |
+
"decoder.35.mlp.fc1.bias",
|
| 523 |
+
"decoder.35.mlp.fc2.bias",
|
| 524 |
+
"decoder.35.ls2.gamma",
|
| 525 |
+
"point_decoder.projects.bias",
|
| 526 |
+
"point_decoder.blocks.0.norm1.weight",
|
| 527 |
+
"point_decoder.blocks.0.norm1.bias",
|
| 528 |
+
"point_decoder.blocks.0.attn.qkv.bias",
|
| 529 |
+
"point_decoder.blocks.0.attn.proj.bias",
|
| 530 |
+
"point_decoder.blocks.0.norm2.weight",
|
| 531 |
+
"point_decoder.blocks.0.norm2.bias",
|
| 532 |
+
"point_decoder.blocks.0.mlp.fc1.bias",
|
| 533 |
+
"point_decoder.blocks.0.mlp.fc2.bias",
|
| 534 |
+
"point_decoder.blocks.1.norm1.weight",
|
| 535 |
+
"point_decoder.blocks.1.norm1.bias",
|
| 536 |
+
"point_decoder.blocks.1.attn.qkv.bias",
|
| 537 |
+
"point_decoder.blocks.1.attn.proj.bias",
|
| 538 |
+
"point_decoder.blocks.1.norm2.weight",
|
| 539 |
+
"point_decoder.blocks.1.norm2.bias",
|
| 540 |
+
"point_decoder.blocks.1.mlp.fc1.bias",
|
| 541 |
+
"point_decoder.blocks.1.mlp.fc2.bias",
|
| 542 |
+
"point_decoder.blocks.2.norm1.weight",
|
| 543 |
+
"point_decoder.blocks.2.norm1.bias",
|
| 544 |
+
"point_decoder.blocks.2.attn.qkv.bias",
|
| 545 |
+
"point_decoder.blocks.2.attn.proj.bias",
|
| 546 |
+
"point_decoder.blocks.2.norm2.weight",
|
| 547 |
+
"point_decoder.blocks.2.norm2.bias",
|
| 548 |
+
"point_decoder.blocks.2.mlp.fc1.bias",
|
| 549 |
+
"point_decoder.blocks.2.mlp.fc2.bias",
|
| 550 |
+
"point_decoder.blocks.3.norm1.weight",
|
| 551 |
+
"point_decoder.blocks.3.norm1.bias",
|
| 552 |
+
"point_decoder.blocks.3.attn.qkv.bias",
|
| 553 |
+
"point_decoder.blocks.3.attn.proj.bias",
|
| 554 |
+
"point_decoder.blocks.3.norm2.weight",
|
| 555 |
+
"point_decoder.blocks.3.norm2.bias",
|
| 556 |
+
"point_decoder.blocks.3.mlp.fc1.bias",
|
| 557 |
+
"point_decoder.blocks.3.mlp.fc2.bias",
|
| 558 |
+
"point_decoder.blocks.4.norm1.weight",
|
| 559 |
+
"point_decoder.blocks.4.norm1.bias",
|
| 560 |
+
"point_decoder.blocks.4.attn.qkv.bias",
|
| 561 |
+
"point_decoder.blocks.4.attn.proj.bias",
|
| 562 |
+
"point_decoder.blocks.4.norm2.weight",
|
| 563 |
+
"point_decoder.blocks.4.norm2.bias",
|
| 564 |
+
"point_decoder.blocks.4.mlp.fc1.bias",
|
| 565 |
+
"point_decoder.blocks.4.mlp.fc2.bias",
|
| 566 |
+
"point_decoder.linear_out.bias",
|
| 567 |
+
"point_head.proj.bias",
|
| 568 |
+
"conf_decoder.projects.bias",
|
| 569 |
+
"conf_decoder.blocks.0.norm1.weight",
|
| 570 |
+
"conf_decoder.blocks.0.norm1.bias",
|
| 571 |
+
"conf_decoder.blocks.0.attn.qkv.bias",
|
| 572 |
+
"conf_decoder.blocks.0.attn.proj.bias",
|
| 573 |
+
"conf_decoder.blocks.0.norm2.weight",
|
| 574 |
+
"conf_decoder.blocks.0.norm2.bias",
|
| 575 |
+
"conf_decoder.blocks.0.mlp.fc1.bias",
|
| 576 |
+
"conf_decoder.blocks.0.mlp.fc2.bias",
|
| 577 |
+
"conf_decoder.blocks.1.norm1.weight",
|
| 578 |
+
"conf_decoder.blocks.1.norm1.bias",
|
| 579 |
+
"conf_decoder.blocks.1.attn.qkv.bias",
|
| 580 |
+
"conf_decoder.blocks.1.attn.proj.bias",
|
| 581 |
+
"conf_decoder.blocks.1.norm2.weight",
|
| 582 |
+
"conf_decoder.blocks.1.norm2.bias",
|
| 583 |
+
"conf_decoder.blocks.1.mlp.fc1.bias",
|
| 584 |
+
"conf_decoder.blocks.1.mlp.fc2.bias",
|
| 585 |
+
"conf_decoder.blocks.2.norm1.weight",
|
| 586 |
+
"conf_decoder.blocks.2.norm1.bias",
|
| 587 |
+
"conf_decoder.blocks.2.attn.qkv.bias",
|
| 588 |
+
"conf_decoder.blocks.2.attn.proj.bias",
|
| 589 |
+
"conf_decoder.blocks.2.norm2.weight",
|
| 590 |
+
"conf_decoder.blocks.2.norm2.bias",
|
| 591 |
+
"conf_decoder.blocks.2.mlp.fc1.bias",
|
| 592 |
+
"conf_decoder.blocks.2.mlp.fc2.bias",
|
| 593 |
+
"conf_decoder.blocks.3.norm1.weight",
|
| 594 |
+
"conf_decoder.blocks.3.norm1.bias",
|
| 595 |
+
"conf_decoder.blocks.3.attn.qkv.bias",
|
| 596 |
+
"conf_decoder.blocks.3.attn.proj.bias",
|
| 597 |
+
"conf_decoder.blocks.3.norm2.weight",
|
| 598 |
+
"conf_decoder.blocks.3.norm2.bias",
|
| 599 |
+
"conf_decoder.blocks.3.mlp.fc1.bias",
|
| 600 |
+
"conf_decoder.blocks.3.mlp.fc2.bias",
|
| 601 |
+
"conf_decoder.blocks.4.norm1.weight",
|
| 602 |
+
"conf_decoder.blocks.4.norm1.bias",
|
| 603 |
+
"conf_decoder.blocks.4.attn.qkv.bias",
|
| 604 |
+
"conf_decoder.blocks.4.attn.proj.bias",
|
| 605 |
+
"conf_decoder.blocks.4.norm2.weight",
|
| 606 |
+
"conf_decoder.blocks.4.norm2.bias",
|
| 607 |
+
"conf_decoder.blocks.4.mlp.fc1.bias",
|
| 608 |
+
"conf_decoder.blocks.4.mlp.fc2.bias",
|
| 609 |
+
"conf_decoder.linear_out.bias",
|
| 610 |
+
"conf_head.proj.bias",
|
| 611 |
+
"camera_decoder.projects.bias",
|
| 612 |
+
"camera_decoder.blocks.0.norm1.weight",
|
| 613 |
+
"camera_decoder.blocks.0.norm1.bias",
|
| 614 |
+
"camera_decoder.blocks.0.attn.qkv.bias",
|
| 615 |
+
"camera_decoder.blocks.0.attn.proj.bias",
|
| 616 |
+
"camera_decoder.blocks.0.norm2.weight",
|
| 617 |
+
"camera_decoder.blocks.0.norm2.bias",
|
| 618 |
+
"camera_decoder.blocks.0.mlp.fc1.bias",
|
| 619 |
+
"camera_decoder.blocks.0.mlp.fc2.bias",
|
| 620 |
+
"camera_decoder.blocks.1.norm1.weight",
|
| 621 |
+
"camera_decoder.blocks.1.norm1.bias",
|
| 622 |
+
"camera_decoder.blocks.1.attn.qkv.bias",
|
| 623 |
+
"camera_decoder.blocks.1.attn.proj.bias",
|
| 624 |
+
"camera_decoder.blocks.1.norm2.weight",
|
| 625 |
+
"camera_decoder.blocks.1.norm2.bias",
|
| 626 |
+
"camera_decoder.blocks.1.mlp.fc1.bias",
|
| 627 |
+
"camera_decoder.blocks.1.mlp.fc2.bias",
|
| 628 |
+
"camera_decoder.blocks.2.norm1.weight",
|
| 629 |
+
"camera_decoder.blocks.2.norm1.bias",
|
| 630 |
+
"camera_decoder.blocks.2.attn.qkv.bias",
|
| 631 |
+
"camera_decoder.blocks.2.attn.proj.bias",
|
| 632 |
+
"camera_decoder.blocks.2.norm2.weight",
|
| 633 |
+
"camera_decoder.blocks.2.norm2.bias",
|
| 634 |
+
"camera_decoder.blocks.2.mlp.fc1.bias",
|
| 635 |
+
"camera_decoder.blocks.2.mlp.fc2.bias",
|
| 636 |
+
"camera_decoder.blocks.3.norm1.weight",
|
| 637 |
+
"camera_decoder.blocks.3.norm1.bias",
|
| 638 |
+
"camera_decoder.blocks.3.attn.qkv.bias",
|
| 639 |
+
"camera_decoder.blocks.3.attn.proj.bias",
|
| 640 |
+
"camera_decoder.blocks.3.norm2.weight",
|
| 641 |
+
"camera_decoder.blocks.3.norm2.bias",
|
| 642 |
+
"camera_decoder.blocks.3.mlp.fc1.bias",
|
| 643 |
+
"camera_decoder.blocks.3.mlp.fc2.bias",
|
| 644 |
+
"camera_decoder.blocks.4.norm1.weight",
|
| 645 |
+
"camera_decoder.blocks.4.norm1.bias",
|
| 646 |
+
"camera_decoder.blocks.4.attn.qkv.bias",
|
| 647 |
+
"camera_decoder.blocks.4.attn.proj.bias",
|
| 648 |
+
"camera_decoder.blocks.4.norm2.weight",
|
| 649 |
+
"camera_decoder.blocks.4.norm2.bias",
|
| 650 |
+
"camera_decoder.blocks.4.mlp.fc1.bias",
|
| 651 |
+
"camera_decoder.blocks.4.mlp.fc2.bias",
|
| 652 |
+
"camera_decoder.linear_out.bias",
|
| 653 |
+
"camera_head.res_conv.0.res_conv1.bias",
|
| 654 |
+
"camera_head.res_conv.0.res_conv2.bias",
|
| 655 |
+
"camera_head.res_conv.0.res_conv3.bias",
|
| 656 |
+
"camera_head.res_conv.1.res_conv1.bias",
|
| 657 |
+
"camera_head.res_conv.1.res_conv2.bias",
|
| 658 |
+
"camera_head.res_conv.1.res_conv3.bias",
|
| 659 |
+
"camera_head.more_mlps.0.bias",
|
| 660 |
+
"camera_head.more_mlps.2.bias",
|
| 661 |
+
"camera_head.fc_t.bias",
|
| 662 |
+
"camera_head.fc_rot.bias"
|
| 663 |
+
],
|
| 664 |
+
"lr_scale": 1.0
|
| 665 |
+
},
|
| 666 |
+
"decay": {
|
| 667 |
+
"weight_decay": 0.05,
|
| 668 |
+
"params": [
|
| 669 |
+
"decoder.0.attn.qkv.weight",
|
| 670 |
+
"decoder.0.attn.proj.weight",
|
| 671 |
+
"decoder.0.mlp.fc1.weight",
|
| 672 |
+
"decoder.0.mlp.fc2.weight",
|
| 673 |
+
"decoder.1.attn.qkv.weight",
|
| 674 |
+
"decoder.1.attn.proj.weight",
|
| 675 |
+
"decoder.1.mlp.fc1.weight",
|
| 676 |
+
"decoder.1.mlp.fc2.weight",
|
| 677 |
+
"decoder.2.attn.qkv.weight",
|
| 678 |
+
"decoder.2.attn.proj.weight",
|
| 679 |
+
"decoder.2.mlp.fc1.weight",
|
| 680 |
+
"decoder.2.mlp.fc2.weight",
|
| 681 |
+
"decoder.3.attn.qkv.weight",
|
| 682 |
+
"decoder.3.attn.proj.weight",
|
| 683 |
+
"decoder.3.mlp.fc1.weight",
|
| 684 |
+
"decoder.3.mlp.fc2.weight",
|
| 685 |
+
"decoder.4.attn.qkv.weight",
|
| 686 |
+
"decoder.4.attn.proj.weight",
|
| 687 |
+
"decoder.4.mlp.fc1.weight",
|
| 688 |
+
"decoder.4.mlp.fc2.weight",
|
| 689 |
+
"decoder.5.attn.qkv.weight",
|
| 690 |
+
"decoder.5.attn.proj.weight",
|
| 691 |
+
"decoder.5.mlp.fc1.weight",
|
| 692 |
+
"decoder.5.mlp.fc2.weight",
|
| 693 |
+
"decoder.6.attn.qkv.weight",
|
| 694 |
+
"decoder.6.attn.proj.weight",
|
| 695 |
+
"decoder.6.mlp.fc1.weight",
|
| 696 |
+
"decoder.6.mlp.fc2.weight",
|
| 697 |
+
"decoder.7.attn.qkv.weight",
|
| 698 |
+
"decoder.7.attn.proj.weight",
|
| 699 |
+
"decoder.7.mlp.fc1.weight",
|
| 700 |
+
"decoder.7.mlp.fc2.weight",
|
| 701 |
+
"decoder.8.attn.qkv.weight",
|
| 702 |
+
"decoder.8.attn.proj.weight",
|
| 703 |
+
"decoder.8.mlp.fc1.weight",
|
| 704 |
+
"decoder.8.mlp.fc2.weight",
|
| 705 |
+
"decoder.9.attn.qkv.weight",
|
| 706 |
+
"decoder.9.attn.proj.weight",
|
| 707 |
+
"decoder.9.mlp.fc1.weight",
|
| 708 |
+
"decoder.9.mlp.fc2.weight",
|
| 709 |
+
"decoder.10.attn.qkv.weight",
|
| 710 |
+
"decoder.10.attn.proj.weight",
|
| 711 |
+
"decoder.10.mlp.fc1.weight",
|
| 712 |
+
"decoder.10.mlp.fc2.weight",
|
| 713 |
+
"decoder.11.attn.qkv.weight",
|
| 714 |
+
"decoder.11.attn.proj.weight",
|
| 715 |
+
"decoder.11.mlp.fc1.weight",
|
| 716 |
+
"decoder.11.mlp.fc2.weight",
|
| 717 |
+
"decoder.12.attn.qkv.weight",
|
| 718 |
+
"decoder.12.attn.proj.weight",
|
| 719 |
+
"decoder.12.mlp.fc1.weight",
|
| 720 |
+
"decoder.12.mlp.fc2.weight",
|
| 721 |
+
"decoder.13.attn.qkv.weight",
|
| 722 |
+
"decoder.13.attn.proj.weight",
|
| 723 |
+
"decoder.13.mlp.fc1.weight",
|
| 724 |
+
"decoder.13.mlp.fc2.weight",
|
| 725 |
+
"decoder.14.attn.qkv.weight",
|
| 726 |
+
"decoder.14.attn.proj.weight",
|
| 727 |
+
"decoder.14.mlp.fc1.weight",
|
| 728 |
+
"decoder.14.mlp.fc2.weight",
|
| 729 |
+
"decoder.15.attn.qkv.weight",
|
| 730 |
+
"decoder.15.attn.proj.weight",
|
| 731 |
+
"decoder.15.mlp.fc1.weight",
|
| 732 |
+
"decoder.15.mlp.fc2.weight",
|
| 733 |
+
"decoder.16.attn.qkv.weight",
|
| 734 |
+
"decoder.16.attn.proj.weight",
|
| 735 |
+
"decoder.16.mlp.fc1.weight",
|
| 736 |
+
"decoder.16.mlp.fc2.weight",
|
| 737 |
+
"decoder.17.attn.qkv.weight",
|
| 738 |
+
"decoder.17.attn.proj.weight",
|
| 739 |
+
"decoder.17.mlp.fc1.weight",
|
| 740 |
+
"decoder.17.mlp.fc2.weight",
|
| 741 |
+
"decoder.18.attn.qkv.weight",
|
| 742 |
+
"decoder.18.attn.proj.weight",
|
| 743 |
+
"decoder.18.mlp.fc1.weight",
|
| 744 |
+
"decoder.18.mlp.fc2.weight",
|
| 745 |
+
"decoder.19.attn.qkv.weight",
|
| 746 |
+
"decoder.19.attn.proj.weight",
|
| 747 |
+
"decoder.19.mlp.fc1.weight",
|
| 748 |
+
"decoder.19.mlp.fc2.weight",
|
| 749 |
+
"decoder.20.attn.qkv.weight",
|
| 750 |
+
"decoder.20.attn.proj.weight",
|
| 751 |
+
"decoder.20.mlp.fc1.weight",
|
| 752 |
+
"decoder.20.mlp.fc2.weight",
|
| 753 |
+
"decoder.21.attn.qkv.weight",
|
| 754 |
+
"decoder.21.attn.proj.weight",
|
| 755 |
+
"decoder.21.mlp.fc1.weight",
|
| 756 |
+
"decoder.21.mlp.fc2.weight",
|
| 757 |
+
"decoder.22.attn.qkv.weight",
|
| 758 |
+
"decoder.22.attn.proj.weight",
|
| 759 |
+
"decoder.22.mlp.fc1.weight",
|
| 760 |
+
"decoder.22.mlp.fc2.weight",
|
| 761 |
+
"decoder.23.attn.qkv.weight",
|
| 762 |
+
"decoder.23.attn.proj.weight",
|
| 763 |
+
"decoder.23.mlp.fc1.weight",
|
| 764 |
+
"decoder.23.mlp.fc2.weight",
|
| 765 |
+
"decoder.24.attn.qkv.weight",
|
| 766 |
+
"decoder.24.attn.proj.weight",
|
| 767 |
+
"decoder.24.mlp.fc1.weight",
|
| 768 |
+
"decoder.24.mlp.fc2.weight",
|
| 769 |
+
"decoder.25.attn.qkv.weight",
|
| 770 |
+
"decoder.25.attn.proj.weight",
|
| 771 |
+
"decoder.25.mlp.fc1.weight",
|
| 772 |
+
"decoder.25.mlp.fc2.weight",
|
| 773 |
+
"decoder.26.attn.qkv.weight",
|
| 774 |
+
"decoder.26.attn.proj.weight",
|
| 775 |
+
"decoder.26.mlp.fc1.weight",
|
| 776 |
+
"decoder.26.mlp.fc2.weight",
|
| 777 |
+
"decoder.27.attn.qkv.weight",
|
| 778 |
+
"decoder.27.attn.proj.weight",
|
| 779 |
+
"decoder.27.mlp.fc1.weight",
|
| 780 |
+
"decoder.27.mlp.fc2.weight",
|
| 781 |
+
"decoder.28.attn.qkv.weight",
|
| 782 |
+
"decoder.28.attn.proj.weight",
|
| 783 |
+
"decoder.28.mlp.fc1.weight",
|
| 784 |
+
"decoder.28.mlp.fc2.weight",
|
| 785 |
+
"decoder.29.attn.qkv.weight",
|
| 786 |
+
"decoder.29.attn.proj.weight",
|
| 787 |
+
"decoder.29.mlp.fc1.weight",
|
| 788 |
+
"decoder.29.mlp.fc2.weight",
|
| 789 |
+
"decoder.30.attn.qkv.weight",
|
| 790 |
+
"decoder.30.attn.proj.weight",
|
| 791 |
+
"decoder.30.mlp.fc1.weight",
|
| 792 |
+
"decoder.30.mlp.fc2.weight",
|
| 793 |
+
"decoder.31.attn.qkv.weight",
|
| 794 |
+
"decoder.31.attn.proj.weight",
|
| 795 |
+
"decoder.31.mlp.fc1.weight",
|
| 796 |
+
"decoder.31.mlp.fc2.weight",
|
| 797 |
+
"decoder.32.attn.qkv.weight",
|
| 798 |
+
"decoder.32.attn.proj.weight",
|
| 799 |
+
"decoder.32.mlp.fc1.weight",
|
| 800 |
+
"decoder.32.mlp.fc2.weight",
|
| 801 |
+
"decoder.33.attn.qkv.weight",
|
| 802 |
+
"decoder.33.attn.proj.weight",
|
| 803 |
+
"decoder.33.mlp.fc1.weight",
|
| 804 |
+
"decoder.33.mlp.fc2.weight",
|
| 805 |
+
"decoder.34.attn.qkv.weight",
|
| 806 |
+
"decoder.34.attn.proj.weight",
|
| 807 |
+
"decoder.34.mlp.fc1.weight",
|
| 808 |
+
"decoder.34.mlp.fc2.weight",
|
| 809 |
+
"decoder.35.attn.qkv.weight",
|
| 810 |
+
"decoder.35.attn.proj.weight",
|
| 811 |
+
"decoder.35.mlp.fc1.weight",
|
| 812 |
+
"decoder.35.mlp.fc2.weight",
|
| 813 |
+
"point_decoder.projects.weight",
|
| 814 |
+
"point_decoder.blocks.0.attn.qkv.weight",
|
| 815 |
+
"point_decoder.blocks.0.attn.proj.weight",
|
| 816 |
+
"point_decoder.blocks.0.mlp.fc1.weight",
|
| 817 |
+
"point_decoder.blocks.0.mlp.fc2.weight",
|
| 818 |
+
"point_decoder.blocks.1.attn.qkv.weight",
|
| 819 |
+
"point_decoder.blocks.1.attn.proj.weight",
|
| 820 |
+
"point_decoder.blocks.1.mlp.fc1.weight",
|
| 821 |
+
"point_decoder.blocks.1.mlp.fc2.weight",
|
| 822 |
+
"point_decoder.blocks.2.attn.qkv.weight",
|
| 823 |
+
"point_decoder.blocks.2.attn.proj.weight",
|
| 824 |
+
"point_decoder.blocks.2.mlp.fc1.weight",
|
| 825 |
+
"point_decoder.blocks.2.mlp.fc2.weight",
|
| 826 |
+
"point_decoder.blocks.3.attn.qkv.weight",
|
| 827 |
+
"point_decoder.blocks.3.attn.proj.weight",
|
| 828 |
+
"point_decoder.blocks.3.mlp.fc1.weight",
|
| 829 |
+
"point_decoder.blocks.3.mlp.fc2.weight",
|
| 830 |
+
"point_decoder.blocks.4.attn.qkv.weight",
|
| 831 |
+
"point_decoder.blocks.4.attn.proj.weight",
|
| 832 |
+
"point_decoder.blocks.4.mlp.fc1.weight",
|
| 833 |
+
"point_decoder.blocks.4.mlp.fc2.weight",
|
| 834 |
+
"point_decoder.linear_out.weight",
|
| 835 |
+
"point_head.proj.weight",
|
| 836 |
+
"conf_decoder.projects.weight",
|
| 837 |
+
"conf_decoder.blocks.0.attn.qkv.weight",
|
| 838 |
+
"conf_decoder.blocks.0.attn.proj.weight",
|
| 839 |
+
"conf_decoder.blocks.0.mlp.fc1.weight",
|
| 840 |
+
"conf_decoder.blocks.0.mlp.fc2.weight",
|
| 841 |
+
"conf_decoder.blocks.1.attn.qkv.weight",
|
| 842 |
+
"conf_decoder.blocks.1.attn.proj.weight",
|
| 843 |
+
"conf_decoder.blocks.1.mlp.fc1.weight",
|
| 844 |
+
"conf_decoder.blocks.1.mlp.fc2.weight",
|
| 845 |
+
"conf_decoder.blocks.2.attn.qkv.weight",
|
| 846 |
+
"conf_decoder.blocks.2.attn.proj.weight",
|
| 847 |
+
"conf_decoder.blocks.2.mlp.fc1.weight",
|
| 848 |
+
"conf_decoder.blocks.2.mlp.fc2.weight",
|
| 849 |
+
"conf_decoder.blocks.3.attn.qkv.weight",
|
| 850 |
+
"conf_decoder.blocks.3.attn.proj.weight",
|
| 851 |
+
"conf_decoder.blocks.3.mlp.fc1.weight",
|
| 852 |
+
"conf_decoder.blocks.3.mlp.fc2.weight",
|
| 853 |
+
"conf_decoder.blocks.4.attn.qkv.weight",
|
| 854 |
+
"conf_decoder.blocks.4.attn.proj.weight",
|
| 855 |
+
"conf_decoder.blocks.4.mlp.fc1.weight",
|
| 856 |
+
"conf_decoder.blocks.4.mlp.fc2.weight",
|
| 857 |
+
"conf_decoder.linear_out.weight",
|
| 858 |
+
"conf_head.proj.weight",
|
| 859 |
+
"camera_decoder.projects.weight",
|
| 860 |
+
"camera_decoder.blocks.0.attn.qkv.weight",
|
| 861 |
+
"camera_decoder.blocks.0.attn.proj.weight",
|
| 862 |
+
"camera_decoder.blocks.0.mlp.fc1.weight",
|
| 863 |
+
"camera_decoder.blocks.0.mlp.fc2.weight",
|
| 864 |
+
"camera_decoder.blocks.1.attn.qkv.weight",
|
| 865 |
+
"camera_decoder.blocks.1.attn.proj.weight",
|
| 866 |
+
"camera_decoder.blocks.1.mlp.fc1.weight",
|
| 867 |
+
"camera_decoder.blocks.1.mlp.fc2.weight",
|
| 868 |
+
"camera_decoder.blocks.2.attn.qkv.weight",
|
| 869 |
+
"camera_decoder.blocks.2.attn.proj.weight",
|
| 870 |
+
"camera_decoder.blocks.2.mlp.fc1.weight",
|
| 871 |
+
"camera_decoder.blocks.2.mlp.fc2.weight",
|
| 872 |
+
"camera_decoder.blocks.3.attn.qkv.weight",
|
| 873 |
+
"camera_decoder.blocks.3.attn.proj.weight",
|
| 874 |
+
"camera_decoder.blocks.3.mlp.fc1.weight",
|
| 875 |
+
"camera_decoder.blocks.3.mlp.fc2.weight",
|
| 876 |
+
"camera_decoder.blocks.4.attn.qkv.weight",
|
| 877 |
+
"camera_decoder.blocks.4.attn.proj.weight",
|
| 878 |
+
"camera_decoder.blocks.4.mlp.fc1.weight",
|
| 879 |
+
"camera_decoder.blocks.4.mlp.fc2.weight",
|
| 880 |
+
"camera_decoder.linear_out.weight",
|
| 881 |
+
"camera_head.res_conv.0.res_conv1.weight",
|
| 882 |
+
"camera_head.res_conv.0.res_conv2.weight",
|
| 883 |
+
"camera_head.res_conv.0.res_conv3.weight",
|
| 884 |
+
"camera_head.res_conv.1.res_conv1.weight",
|
| 885 |
+
"camera_head.res_conv.1.res_conv2.weight",
|
| 886 |
+
"camera_head.res_conv.1.res_conv3.weight",
|
| 887 |
+
"camera_head.more_mlps.0.weight",
|
| 888 |
+
"camera_head.more_mlps.2.weight",
|
| 889 |
+
"camera_head.fc_t.weight",
|
| 890 |
+
"camera_head.fc_rot.weight"
|
| 891 |
+
],
|
| 892 |
+
"lr_scale": 1.0
|
| 893 |
+
}
|
| 894 |
+
}
|
| 895 |
+
[2026-05-02 22:28:05,615][croco.utils.misc][INFO] - [RANK 0] Resume checkpoint /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_4gpu_v2/checkpoint-last.pth
|
| 896 |
+
[2026-05-02 22:28:05,639][croco.utils.misc][INFO] - [RANK 0] Moving optimizer state to device: cuda:0
|
| 897 |
+
[2026-05-02 22:28:05,650][croco.utils.misc][INFO] - [RANK 0] & best_so_far=inf
|
| 898 |
+
[2026-05-02 22:28:05,650][croco.utils.misc][INFO] - [RANK 0] With optim & sched! start_epoch=0
|
| 899 |
+
[2026-05-02 22:28:09,695][__main__][INFO] - [RANK 0] Start training for 10 epochs
|
| 900 |
+
[2026-05-02 22:28:09,699][__main__][INFO] - [RANK 0] log_dir: /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_16gpu_v2/
|
| 901 |
+
[2026-05-02 22:29:54,327][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 0/1087] eta: 1 day, 7:35:24 lr: 0.000000 epoch: 0.0000 (0.0000) step: 0.0000 (0.0000) loss: 4202.3013 (4202.3013) Lcamera_frontend: 3.3617 (3.3617) Ldepth_frontend: 3.0077 (3.0077) Lpmap_frontend: 11.0651 (11.0651) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.3558 (3.3558) Ldepth_mix: 3.0005 (3.0005) Lpmap_mix: 11.0574 (11.0574) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.3619 (3.3619) Ldepth_backend: 2.9936 (2.9936) Lpmap_backend: 11.0585 (11.0585) Ltrack_backend: 0.0000 (0.0000) total: 4202.3013 (4202.3013) time: 104.6228 data: 26.5225 max mem: 37991
|
| 902 |
+
[2026-05-02 22:38:52,688][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 10/1087] eta: 17:29:11 lr: 0.000000 epoch: 0.0046 (0.0046) step: 5.0000 (5.0000) loss: 4242.6206 (3971.1784) Lcamera_frontend: 3.3956 (3.1482) Ldepth_frontend: 3.8659 (4.7507) Lpmap_frontend: 11.4682 (11.4307) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.3859 (3.1407) Ldepth_mix: 3.8547 (4.7488) Lpmap_mix: 11.4647 (11.4261) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.3970 (3.1482) Ldepth_backend: 3.8452 (4.7473) Lpmap_backend: 11.4647 (11.4284) Ltrack_backend: 0.0000 (0.0000) total: 4242.6206 (3971.1784) time: 58.4508 data: 2.4434 max mem: 78413
|
| 903 |
+
[2026-05-02 22:48:07,197][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 20/1087] eta: 16:54:02 lr: 0.000000 epoch: 0.0092 (0.0092) step: 10.0000 (10.0000) loss: 3202.9844 (3467.7612) Lcamera_frontend: 2.5076 (2.7240) Ldepth_frontend: 4.4754 (5.0151) Lpmap_frontend: 11.6369 (11.6323) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.4981 (2.7169) Ldepth_mix: 4.4711 (5.0133) Lpmap_mix: 11.6346 (11.6265) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.5080 (2.7240) Ldepth_backend: 4.4647 (5.0113) Lpmap_backend: 11.6393 (11.6276) Ltrack_backend: 0.0000 (0.0000) total: 3202.9844 (3467.7612) time: 54.6422 data: 0.0388 max mem: 78608
|
| 904 |
+
[2026-05-02 22:57:15,910][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 30/1087] eta: 16:32:19 lr: 0.000001 epoch: 0.0184 (0.0138) step: 20.0000 (15.0000) loss: 3202.9844 (3465.4637) Lcamera_frontend: 2.5076 (2.7249) Ldepth_frontend: 4.0582 (4.8808) Lpmap_frontend: 11.6826 (11.4852) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.4981 (2.7183) Ldepth_mix: 4.0387 (4.8782) Lpmap_mix: 11.6702 (11.4789) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.5080 (2.7249) Ldepth_backend: 4.0108 (4.8754) Lpmap_backend: 11.6630 (11.4792) Ltrack_backend: 0.0000 (0.0000) total: 3202.9844 (3465.4637) time: 55.1610 data: 0.0417 max mem: 78608
|
| 905 |
+
[2026-05-02 23:06:38,980][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 40/1087] eta: 16:22:50 lr: 0.000001 epoch: 0.0276 (0.0184) step: 30.0000 (20.0000) loss: 3448.9260 (3553.3549) Lcamera_frontend: 2.7252 (2.7964) Ldepth_frontend: 4.0274 (4.9665) Lpmap_frontend: 11.8865 (11.5849) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.7172 (2.7901) Ldepth_mix: 4.0149 (4.9630) Lpmap_mix: 11.8827 (11.5776) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.7249 (2.7962) Ldepth_backend: 4.0034 (4.9592) Lpmap_backend: 11.8838 (11.5771) Ltrack_backend: 0.0000 (0.0000) total: 3448.9260 (3553.3549) time: 55.5890 data: 0.0377 max mem: 78608
|
| 906 |
+
[2026-05-02 23:15:51,240][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 50/1087] eta: 16:09:43 lr: 0.000001 epoch: 0.0368 (0.0230) step: 40.0000 (25.0000) loss: 3448.9260 (3730.1944) Lcamera_frontend: 2.7171 (2.9446) Ldepth_frontend: 4.3692 (4.8668) Lpmap_frontend: 11.9106 (11.5888) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.7163 (2.9386) Ldepth_mix: 4.3693 (4.8634) Lpmap_mix: 11.9057 (11.5817) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.7197 (2.9446) Ldepth_backend: 4.3642 (4.8598) Lpmap_backend: 11.9027 (11.5820) Ltrack_backend: 0.0000 (0.0000) total: 3448.9260 (3730.1944) time: 55.7664 data: 0.0367 max mem: 78608
|
| 907 |
+
[2026-05-02 23:24:59,529][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 60/1087] eta: 15:56:47 lr: 0.000001 epoch: 0.0460 (0.0276) step: 50.0000 (30.0000) loss: 3853.3931 (3919.1408) Lcamera_frontend: 3.0787 (3.1032) Ldepth_frontend: 3.7769 (4.7695) Lpmap_frontend: 11.8250 (11.5821) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.0701 (3.0965) Ldepth_mix: 3.7747 (4.7661) Lpmap_mix: 11.8170 (11.5752) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.0787 (3.1031) Ldepth_backend: 3.7724 (4.7627) Lpmap_backend: 11.8129 (11.5762) Ltrack_backend: 0.0000 (0.0000) total: 3853.3931 (3919.1408) time: 55.0273 data: 0.0364 max mem: 78608
|
| 908 |
+
[2026-05-02 23:34:17,190][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 70/1087] eta: 15:47:09 lr: 0.000001 epoch: 0.0552 (0.0322) step: 60.0000 (35.0000) loss: 4532.8169 (3989.7981) Lcamera_frontend: 3.6279 (3.1625) Ldepth_frontend: 3.4384 (4.6902) Lpmap_frontend: 11.7723 (11.6082) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.6180 (3.1558) Ldepth_mix: 3.4318 (4.6861) Lpmap_mix: 11.7678 (11.6008) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.6298 (3.1625) Ldepth_backend: 3.4263 (4.6823) Lpmap_backend: 11.7694 (11.6018) Ltrack_backend: 0.0000 (0.0000) total: 4532.8169 (3989.7981) time: 55.2964 data: 0.0360 max mem: 78608
|
| 909 |
+
[2026-05-02 23:43:31,213][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 80/1087] eta: 15:36:51 lr: 0.000001 epoch: 0.0644 (0.0368) step: 70.0000 (40.0000) loss: 3389.3914 (3806.4857) Lcamera_frontend: 2.6853 (3.0091) Ldepth_frontend: 3.9082 (4.7790) Lpmap_frontend: 11.4536 (11.5821) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.6737 (3.0025) Ldepth_mix: 3.9067 (4.7758) Lpmap_mix: 11.4464 (11.5746) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.6854 (3.0091) Ldepth_backend: 3.9059 (4.7728) Lpmap_backend: 11.4431 (11.5753) Ltrack_backend: 0.0000 (0.0000) total: 3389.3914 (3806.4857) time: 55.5826 data: 0.0368 max mem: 78608
|
| 910 |
+
[2026-05-02 23:52:50,237][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 90/1087] eta: 15:27:41 lr: 0.000002 epoch: 0.0736 (0.0414) step: 80.0000 (45.0000) loss: 3389.3914 (3999.8356) Lcamera_frontend: 2.6853 (3.1719) Ldepth_frontend: 3.8950 (4.6448) Lpmap_frontend: 11.4536 (11.5627) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.6737 (3.1649) Ldepth_mix: 3.8903 (4.6412) Lpmap_mix: 11.4464 (11.5551) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.6854 (3.1718) Ldepth_backend: 3.8884 (4.6377) Lpmap_backend: 11.4431 (11.5559) Ltrack_backend: 0.0000 (0.0000) total: 3389.3914 (3999.8356) time: 55.6517 data: 0.0346 max mem: 78608
|
| 911 |
+
[2026-05-03 00:02:07,619][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 100/1087] eta: 15:18:14 lr: 0.000002 epoch: 0.0828 (0.0460) step: 90.0000 (49.9901) loss: 3579.2693 (3912.0880) Lcamera_frontend: 2.8308 (3.0985) Ldepth_frontend: 3.8950 (4.6963) Lpmap_frontend: 11.3769 (11.5519) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8226 (3.0910) Ldepth_mix: 3.8903 (4.6922) Lpmap_mix: 11.3684 (11.5436) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.8261 (3.0983) Ldepth_backend: 3.8884 (4.6884) Lpmap_backend: 11.3697 (11.5439) Ltrack_backend: 0.0000 (0.0000) total: 3579.2693 (3912.0880) time: 55.8202 data: 0.0385 max mem: 78608
|
| 912 |
+
[2026-05-03 00:11:35,712][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 110/1087] eta: 15:10:23 lr: 0.000002 epoch: 0.0920 (0.0506) step: 100.0000 (54.9910) loss: 2613.1426 (3837.1782) Lcamera_frontend: 2.0297 (3.0364) Ldepth_frontend: 4.1661 (4.7066) Lpmap_frontend: 11.3587 (11.5084) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.0106 (3.0286) Ldepth_mix: 4.1581 (4.7030) Lpmap_mix: 11.3454 (11.5001) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.0296 (3.0363) Ldepth_backend: 4.1517 (4.6994) Lpmap_backend: 11.3423 (11.5001) Ltrack_backend: 0.0000 (0.0000) total: 2613.1426 (3837.1782) time: 56.2736 data: 0.0437 max mem: 78608
|
| 913 |
+
[2026-05-03 00:20:58,624][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 120/1087] eta: 15:01:34 lr: 0.000002 epoch: 0.1012 (0.0552) step: 110.0000 (59.9917) loss: 2613.1426 (3752.6934) Lcamera_frontend: 2.0297 (2.9658) Ldepth_frontend: 4.3324 (4.7646) Lpmap_frontend: 11.3587 (11.4705) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.0106 (2.9580) Ldepth_mix: 4.3270 (4.7614) Lpmap_mix: 11.3454 (11.4620) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.0296 (2.9657) Ldepth_backend: 4.3216 (4.7582) Lpmap_backend: 11.3423 (11.4619) Ltrack_backend: 0.0000 (0.0000) total: 2613.1426 (3752.6934) time: 56.5501 data: 0.0401 max mem: 78608
|
| 914 |
+
[2026-05-03 00:30:12,133][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 130/1087] eta: 14:51:32 lr: 0.000002 epoch: 0.1104 (0.0598) step: 120.0000 (64.9924) loss: 2144.1643 (3774.6720) Lcamera_frontend: 1.6416 (2.9835) Ldepth_frontend: 5.1765 (4.8166) Lpmap_frontend: 11.8014 (11.4739) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.6108 (2.9756) Ldepth_mix: 5.1847 (4.8138) Lpmap_mix: 11.7932 (11.4655) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.6411 (2.9834) Ldepth_backend: 5.1862 (4.8108) Lpmap_backend: 11.7894 (11.4657) Ltrack_backend: 0.0000 (0.0000) total: 2144.1643 (3774.6720) time: 55.8203 data: 0.0341 max mem: 78608
|
| 915 |
+
[2026-05-03 00:39:25,201][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 140/1087] eta: 14:41:33 lr: 0.000003 epoch: 0.1196 (0.0644) step: 130.0000 (69.9929) loss: 4428.3130 (3811.5716) Lcamera_frontend: 3.5491 (3.0150) Ldepth_frontend: 3.5344 (4.7627) Lpmap_frontend: 11.5238 (11.4595) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.5423 (3.0068) Ldepth_mix: 3.5241 (4.7599) Lpmap_mix: 11.5170 (11.4512) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.5484 (3.0149) Ldepth_backend: 3.5188 (4.7569) Lpmap_backend: 11.5258 (11.4517) Ltrack_backend: 0.0000 (0.0000) total: 4428.3130 (3811.5716) time: 55.3273 data: 0.0338 max mem: 78608
|
| 916 |
+
[2026-05-03 00:48:29,854][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 150/1087] eta: 14:30:48 lr: 0.000003 epoch: 0.1288 (0.0690) step: 140.0000 (74.9934) loss: 4079.4248 (3840.8996) Lcamera_frontend: 3.2705 (3.0399) Ldepth_frontend: 3.7790 (4.7508) Lpmap_frontend: 11.3197 (11.4240) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.2565 (3.0313) Ldepth_mix: 3.7716 (4.7482) Lpmap_mix: 11.3102 (11.4155) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.2702 (3.0398) Ldepth_backend: 3.7647 (4.7454) Lpmap_backend: 11.3096 (11.4160) Ltrack_backend: 0.0000 (0.0000) total: 4079.4248 (3840.8996) time: 54.8851 data: 0.0448 max mem: 78608
|
| 917 |
+
[2026-05-03 00:57:45,911][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 160/1087] eta: 14:21:22 lr: 0.000003 epoch: 0.1380 (0.0736) step: 150.0000 (79.9876) loss: 3706.1357 (3838.3399) Lcamera_frontend: 2.9012 (3.0382) Ldepth_frontend: 3.9068 (4.7365) Lpmap_frontend: 11.0957 (11.3986) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8827 (3.0295) Ldepth_mix: 3.9063 (4.7341) Lpmap_mix: 11.0950 (11.3902) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.9018 (3.0381) Ldepth_backend: 3.9018 (4.7313) Lpmap_backend: 11.0915 (11.3909) Ltrack_backend: 0.0000 (0.0000) total: 3706.1357 (3838.3399) time: 55.0354 data: 0.0449 max mem: 78608
|
| 918 |
+
[2026-05-03 01:07:11,802][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 170/1087] eta: 14:12:49 lr: 0.000003 epoch: 0.1472 (0.0782) step: 160.0000 (84.9883) loss: 2894.3047 (3769.6565) Lcamera_frontend: 2.2870 (2.9803) Ldepth_frontend: 4.2580 (4.7964) Lpmap_frontend: 11.3292 (11.4032) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.2753 (2.9717) Ldepth_mix: 4.2547 (4.7936) Lpmap_mix: 11.3332 (11.3945) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.2859 (2.9802) Ldepth_backend: 4.2554 (4.7906) Lpmap_backend: 11.3373 (11.3947) Ltrack_backend: 0.0000 (0.0000) total: 2894.3047 (3769.6565) time: 56.0973 data: 0.0360 max mem: 78608
|
| 919 |
+
[2026-05-03 01:16:32,573][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 180/1087] eta: 14:03:45 lr: 0.000003 epoch: 0.1564 (0.0828) step: 170.0000 (89.9890) loss: 2615.0669 (3713.9764) Lcamera_frontend: 2.0345 (2.9336) Ldepth_frontend: 4.8844 (4.8284) Lpmap_frontend: 11.5031 (11.4055) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.0182 (2.9249) Ldepth_mix: 4.8753 (4.8257) Lpmap_mix: 11.4892 (11.3968) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.0357 (2.9335) Ldepth_backend: 4.8663 (4.8227) Lpmap_backend: 11.4933 (11.3970) Ltrack_backend: 0.0000 (0.0000) total: 2615.0669 (3713.9764) time: 56.3330 data: 0.0386 max mem: 78608
|
| 920 |
+
[2026-05-03 01:25:41,765][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 190/1087] eta: 13:53:45 lr: 0.000003 epoch: 0.1656 (0.0874) step: 180.0000 (94.9895) loss: 3651.0952 (3756.0374) Lcamera_frontend: 2.8870 (2.9682) Ldepth_frontend: 4.3808 (4.8599) Lpmap_frontend: 11.4447 (11.4136) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8607 (2.9592) Ldepth_mix: 4.3745 (4.8576) Lpmap_mix: 11.4449 (11.4050) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.8858 (2.9682) Ldepth_backend: 4.3690 (4.8550) Lpmap_backend: 11.4457 (11.4054) Ltrack_backend: 0.0000 (0.0000) total: 3651.0952 (3756.0374) time: 55.4980 data: 0.0370 max mem: 78608
|
| 921 |
+
[2026-05-03 01:34:53,678][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 200/1087] eta: 13:44:01 lr: 0.000004 epoch: 0.1748 (0.0920) step: 190.0000 (99.9851) loss: 4310.5767 (3799.8757) Lcamera_frontend: 3.4209 (3.0058) Ldepth_frontend: 3.5405 (4.7962) Lpmap_frontend: 11.0155 (11.3732) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.4082 (2.9966) Ldepth_mix: 3.5322 (4.7936) Lpmap_mix: 11.0036 (11.3645) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.4207 (3.0057) Ldepth_backend: 3.5222 (4.7908) Lpmap_backend: 11.0105 (11.3651) Ltrack_backend: 0.0000 (0.0000) total: 4310.5767 (3799.8757) time: 55.0541 data: 0.0353 max mem: 78608
|
| 922 |
+
[2026-05-03 01:44:10,757][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 210/1087] eta: 13:34:43 lr: 0.000004 epoch: 0.1840 (0.0966) step: 200.0000 (104.9858) loss: 4329.0322 (3826.5264) Lcamera_frontend: 3.4749 (3.0278) Ldepth_frontend: 4.2057 (4.8206) Lpmap_frontend: 11.3225 (11.3813) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.4592 (3.0184) Ldepth_mix: 4.1959 (4.8180) Lpmap_mix: 11.3116 (11.3726) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.4761 (3.0276) Ldepth_backend: 4.1878 (4.8154) Lpmap_backend: 11.3086 (11.3733) Ltrack_backend: 0.0000 (0.0000) total: 4329.0322 (3826.5264) time: 55.4476 data: 0.0379 max mem: 78608
|
| 923 |
+
[2026-05-03 01:53:38,674][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 220/1087] eta: 13:26:06 lr: 0.000004 epoch: 0.1932 (0.1012) step: 210.0000 (109.9864) loss: 2949.4153 (3767.0713) Lcamera_frontend: 2.2959 (2.9779) Ldepth_frontend: 5.0282 (4.8638) Lpmap_frontend: 11.3225 (11.3650) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.2833 (2.9686) Ldepth_mix: 5.0255 (4.8616) Lpmap_mix: 11.3116 (11.3561) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.2960 (2.9778) Ldepth_backend: 5.0224 (4.8594) Lpmap_backend: 11.3086 (11.3567) Ltrack_backend: 0.0000 (0.0000) total: 2949.4153 (3767.0713) time: 56.2488 data: 0.0409 max mem: 78608
|
| 924 |
+
[2026-05-03 02:02:50,649][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 230/1087] eta: 13:16:27 lr: 0.000004 epoch: 0.2024 (0.1058) step: 220.0000 (114.9870) loss: 1227.3260 (3724.5507) Lcamera_frontend: 0.8765 (2.9426) Ldepth_frontend: 5.5676 (4.8908) Lpmap_frontend: 11.0429 (11.3324) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 0.8432 (2.9331) Ldepth_mix: 5.5706 (4.8890) Lpmap_mix: 11.0351 (11.3234) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 0.8766 (2.9424) Ldepth_backend: 5.5732 (4.8873) Lpmap_backend: 11.0334 (11.3240) Ltrack_backend: 0.0000 (0.0000) total: 1227.3260 (3724.5507) time: 55.9945 data: 0.0396 max mem: 78608
|
| 925 |
+
[2026-05-03 02:12:12,330][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 240/1087] eta: 13:07:23 lr: 0.000004 epoch: 0.2116 (0.1104) step: 230.0000 (119.9834) loss: 3145.9524 (3707.0761) Lcamera_frontend: 2.4927 (2.9282) Ldepth_frontend: 4.0071 (4.8953) Lpmap_frontend: 10.8839 (11.3124) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.4697 (2.9186) Ldepth_mix: 3.9991 (4.8937) Lpmap_mix: 10.8632 (11.3032) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.4925 (2.9280) Ldepth_backend: 3.9916 (4.8922) Lpmap_backend: 10.8596 (11.3039) Ltrack_backend: 0.0000 (0.0000) total: 3145.9524 (3707.0761) time: 55.6827 data: 0.0365 max mem: 78608
|
| 926 |
+
[2026-05-03 02:21:22,385][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 250/1087] eta: 12:57:40 lr: 0.000005 epoch: 0.2208 (0.1150) step: 240.0000 (124.9841) loss: 3360.2373 (3679.5271) Lcamera_frontend: 2.6443 (2.9057) Ldepth_frontend: 4.0071 (4.8888) Lpmap_frontend: 10.6091 (11.2775) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.6377 (2.8956) Ldepth_mix: 3.9991 (4.8873) Lpmap_mix: 10.5963 (11.2681) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.6462 (2.9055) Ldepth_backend: 3.9916 (4.8858) Lpmap_backend: 10.5966 (11.2687) Ltrack_backend: 0.0000 (0.0000) total: 3360.2373 (3679.5271) time: 55.5867 data: 0.0344 max mem: 78608
|
| 927 |
+
[2026-05-03 02:30:44,932][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 260/1087] eta: 12:48:38 lr: 0.000005 epoch: 0.2300 (0.1196) step: 250.0000 (129.9847) loss: 3391.5752 (3716.7938) Lcamera_frontend: 2.6879 (2.9371) Ldepth_frontend: 4.1649 (4.8764) Lpmap_frontend: 10.5907 (11.2571) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.6734 (2.9268) Ldepth_mix: 4.1562 (4.8750) Lpmap_mix: 10.5793 (11.2476) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.6880 (2.9369) Ldepth_backend: 4.1492 (4.8736) Lpmap_backend: 10.5772 (11.2484) Ltrack_backend: 0.0000 (0.0000) total: 3391.5752 (3716.7938) time: 55.6291 data: 0.0358 max mem: 78608
|
| 928 |
+
[2026-05-03 02:40:01,223][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 270/1087] eta: 12:39:17 lr: 0.000005 epoch: 0.2392 (0.1242) step: 260.0000 (134.9852) loss: 3615.5190 (3707.4362) Lcamera_frontend: 2.8830 (2.9292) Ldepth_frontend: 4.0986 (4.8942) Lpmap_frontend: 11.0972 (11.2496) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8654 (2.9189) Ldepth_mix: 4.0911 (4.8929) Lpmap_mix: 11.0977 (11.2401) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.8833 (2.9290) Ldepth_backend: 4.0850 (4.8917) Lpmap_backend: 11.0993 (11.2409) Ltrack_backend: 0.0000 (0.0000) total: 3615.5190 (3707.4362) time: 55.9403 data: 0.0371 max mem: 78608
|
| 929 |
+
[2026-05-03 02:49:14,507][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 280/1087] eta: 12:29:46 lr: 0.000005 epoch: 0.2484 (0.1288) step: 270.0000 (139.9822) loss: 4492.9077 (3771.1266) Lcamera_frontend: 3.6113 (2.9824) Ldepth_frontend: 4.0747 (4.8840) Lpmap_frontend: 11.4186 (11.2470) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.6015 (2.9717) Ldepth_mix: 4.0737 (4.8827) Lpmap_mix: 11.4035 (11.2374) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.6116 (2.9823) Ldepth_backend: 4.0687 (4.8815) Lpmap_backend: 11.4096 (11.2384) Ltrack_backend: 0.0000 (0.0000) total: 4492.9077 (3771.1266) time: 55.4780 data: 0.0360 max mem: 78608
|
| 930 |
+
[2026-05-03 02:58:21,937][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 290/1087] eta: 12:20:02 lr: 0.000005 epoch: 0.2576 (0.1334) step: 280.0000 (144.9828) loss: 4368.5010 (3761.0733) Lcamera_frontend: 3.4531 (2.9740) Ldepth_frontend: 4.2899 (4.9012) Lpmap_frontend: 11.3882 (11.2331) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.4088 (2.9629) Ldepth_mix: 4.2923 (4.9000) Lpmap_mix: 11.3673 (11.2232) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.4621 (2.9739) Ldepth_backend: 4.2874 (4.8989) Lpmap_backend: 11.3639 (11.2242) Ltrack_backend: 0.0000 (0.0000) total: 4368.5010 (3761.0733) time: 55.0356 data: 0.0364 max mem: 78608
|
| 931 |
+
[2026-05-03 03:07:37,235][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 300/1087] eta: 12:10:40 lr: 0.000006 epoch: 0.2668 (0.1380) step: 290.0000 (149.9834) loss: 3446.0640 (3727.2401) Lcamera_frontend: 2.7435 (2.9460) Ldepth_frontend: 4.3725 (4.9154) Lpmap_frontend: 10.6490 (11.2016) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.7125 (2.9346) Ldepth_mix: 4.3465 (4.9148) Lpmap_mix: 10.6345 (11.1915) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.7433 (2.9459) Ldepth_backend: 4.3297 (4.9141) Lpmap_backend: 10.6376 (11.1924) Ltrack_backend: 0.0000 (0.0000) total: 3446.0640 (3727.2401) time: 55.1362 data: 0.0364 max mem: 78608
|
| 932 |
+
[2026-05-03 03:16:36,477][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 310/1087] eta: 12:00:38 lr: 0.000006 epoch: 0.2760 (0.1426) step: 300.0000 (154.9839) loss: 1748.7461 (3688.8335) Lcamera_frontend: 1.2842 (2.9140) Ldepth_frontend: 5.3224 (4.9373) Lpmap_frontend: 10.1626 (11.1831) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.2635 (2.9026) Ldepth_mix: 5.3156 (4.9370) Lpmap_mix: 10.1451 (11.1727) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.2813 (2.9138) Ldepth_backend: 5.3142 (4.9367) Lpmap_backend: 10.1451 (11.1734) Ltrack_backend: 0.0000 (0.0000) total: 1748.7461 (3688.8335) time: 54.7269 data: 0.0373 max mem: 78608
|
| 933 |
+
[2026-05-03 03:25:45,852][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 320/1087] eta: 11:51:05 lr: 0.000006 epoch: 0.2852 (0.1472) step: 310.0000 (159.9813) loss: 1933.6400 (3667.4677) Lcamera_frontend: 1.4398 (2.8960) Ldepth_frontend: 5.7850 (4.9677) Lpmap_frontend: 11.0741 (11.1793) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.4133 (2.8841) Ldepth_mix: 5.7826 (4.9677) Lpmap_mix: 11.0659 (11.1688) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.4415 (2.8958) Ldepth_backend: 5.7786 (4.9679) Lpmap_backend: 11.0489 (11.1693) Ltrack_backend: 0.0000 (0.0000) total: 1933.6400 (3667.4677) time: 54.4297 data: 0.0381 max mem: 78608
|
| 934 |
+
[2026-05-03 03:35:00,256][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 330/1087] eta: 11:41:44 lr: 0.000006 epoch: 0.2944 (0.1518) step: 320.0000 (164.9819) loss: 1478.3679 (3603.7438) Lcamera_frontend: 1.0652 (2.8427) Ldepth_frontend: 6.5051 (5.0058) Lpmap_frontend: 11.0741 (11.1597) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.0470 (2.8310) Ldepth_mix: 6.5272 (5.0065) Lpmap_mix: 11.0659 (11.1489) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.0661 (2.8425) Ldepth_backend: 6.5682 (5.0076) Lpmap_backend: 11.0489 (11.1493) Ltrack_backend: 0.0000 (0.0000) total: 1478.3679 (3603.7438) time: 55.1857 data: 0.0413 max mem: 78608
|
| 935 |
+
[2026-05-03 03:44:16,370][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 340/1087] eta: 11:32:28 lr: 0.000006 epoch: 0.3036 (0.1564) step: 330.0000 (169.9824) loss: 660.3922 (3575.3925) Lcamera_frontend: 0.3779 (2.8193) Ldepth_frontend: 5.5826 (5.0136) Lpmap_frontend: 10.2357 (11.1348) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 0.3272 (2.8070) Ldepth_mix: 5.6053 (5.0146) Lpmap_mix: 10.2233 (11.1237) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 0.3828 (2.8191) Ldepth_backend: 5.6269 (5.0157) Lpmap_backend: 10.2307 (11.1241) Ltrack_backend: 0.0000 (0.0000) total: 660.3922 (3575.3925) time: 55.5237 data: 0.0430 max mem: 78608
|
| 936 |
+
[2026-05-03 03:53:29,748][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 350/1087] eta: 11:23:05 lr: 0.000006 epoch: 0.3128 (0.1610) step: 340.0000 (174.9829) loss: 2127.1729 (3558.2403) Lcamera_frontend: 1.6259 (2.8049) Ldepth_frontend: 5.5155 (5.0409) Lpmap_frontend: 10.2984 (11.1208) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.5844 (2.7919) Ldepth_mix: 5.5179 (5.0421) Lpmap_mix: 10.2822 (11.1095) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.6279 (2.8047) Ldepth_backend: 5.5202 (5.0435) Lpmap_backend: 10.2864 (11.1101) Ltrack_backend: 0.0000 (0.0000) total: 2127.1729 (3558.2403) time: 55.4745 data: 0.0385 max mem: 78608
|
| 937 |
+
[2026-05-03 04:02:45,484][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 360/1087] eta: 11:13:49 lr: 0.000007 epoch: 0.3220 (0.1656) step: 350.0000 (179.9806) loss: 3043.0459 (3554.0205) Lcamera_frontend: 2.3864 (2.8014) Ldepth_frontend: 5.2218 (5.0499) Lpmap_frontend: 10.9820 (11.1108) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.2985 (2.7875) Ldepth_mix: 5.2499 (5.0512) Lpmap_mix: 10.9726 (11.0990) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.3887 (2.8013) Ldepth_backend: 5.2633 (5.0527) Lpmap_backend: 10.9631 (11.0997) Ltrack_backend: 0.0000 (0.0000) total: 3043.0459 (3554.0205) time: 55.4556 data: 0.0354 max mem: 78608
|
| 938 |
+
[2026-05-03 04:11:56,777][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 370/1087] eta: 11:04:23 lr: 0.000007 epoch: 0.3312 (0.1702) step: 360.0000 (184.9811) loss: 3459.4954 (3548.7099) Lcamera_frontend: 2.7502 (2.7974) Ldepth_frontend: 4.5297 (5.0422) Lpmap_frontend: 10.3133 (11.0893) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.6680 (2.7823) Ldepth_mix: 4.5317 (5.0437) Lpmap_mix: 10.2952 (11.0773) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.7496 (2.7973) Ldepth_backend: 4.5328 (5.0452) Lpmap_backend: 10.3024 (11.0782) Ltrack_backend: 0.0000 (0.0000) total: 3459.4954 (3548.7099) time: 55.3513 data: 0.0384 max mem: 78608
|
| 939 |
+
[2026-05-03 04:21:04,067][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 380/1087] eta: 10:54:51 lr: 0.000007 epoch: 0.3404 (0.1748) step: 370.0000 (189.9816) loss: 3702.8796 (3581.1575) Lcamera_frontend: 2.9399 (2.8248) Ldepth_frontend: 4.1629 (5.0211) Lpmap_frontend: 10.4718 (11.0842) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8749 (2.8083) Ldepth_mix: 4.1504 (5.0225) Lpmap_mix: 10.4454 (11.0719) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.9407 (2.8247) Ldepth_backend: 4.1369 (5.0237) Lpmap_backend: 10.4560 (11.0730) Ltrack_backend: 0.0000 (0.0000) total: 3702.8796 (3581.1575) time: 54.9282 data: 0.0367 max mem: 78608
|
| 940 |
+
[2026-05-03 04:30:12,784][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 390/1087] eta: 10:45:23 lr: 0.000007 epoch: 0.3496 (0.1794) step: 380.0000 (194.9821) loss: 3815.2349 (3621.6174) Lcamera_frontend: 3.0557 (2.8583) Ldepth_frontend: 4.1017 (5.0338) Lpmap_frontend: 11.1255 (11.0971) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.9788 (2.8408) Ldepth_mix: 4.0889 (5.0351) Lpmap_mix: 11.1028 (11.0845) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.0442 (2.8582) Ldepth_backend: 4.0802 (5.0362) Lpmap_backend: 11.1291 (11.0859) Ltrack_backend: 0.0000 (0.0000) total: 3815.2349 (3621.6174) time: 54.7987 data: 0.0372 max mem: 78608
|
| 941 |
+
[2026-05-03 04:39:29,897][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 400/1087] eta: 10:36:10 lr: 0.000007 epoch: 0.3588 (0.1840) step: 390.0000 (199.9800) loss: 3792.3845 (3632.4914) Lcamera_frontend: 2.9839 (2.8680) Ldepth_frontend: 4.1017 (5.0110) Lpmap_frontend: 10.8711 (11.0800) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.7402 (2.8488) Ldepth_mix: 4.0889 (5.0122) Lpmap_mix: 10.8333 (11.0670) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.9997 (2.8678) Ldepth_backend: 4.0802 (5.0131) Lpmap_backend: 10.8577 (11.0691) Ltrack_backend: 0.0000 (0.0000) total: 3792.3845 (3632.4914) time: 55.2907 data: 0.0400 max mem: 78608
|
| 942 |
+
[2026-05-03 04:48:42,673][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 410/1087] eta: 10:26:50 lr: 0.000008 epoch: 0.3680 (0.1886) step: 400.0000 (204.9805) loss: 3652.0527 (3621.1971) Lcamera_frontend: 2.9191 (2.8586) Ldepth_frontend: 4.2692 (5.0128) Lpmap_frontend: 10.6573 (11.0750) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8159 (2.8384) Ldepth_mix: 4.2618 (5.0139) Lpmap_mix: 10.5971 (11.0612) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.9032 (2.8586) Ldepth_backend: 4.2422 (5.0146) Lpmap_backend: 10.6582 (11.0634) Ltrack_backend: 0.0000 (0.0000) total: 3652.0527 (3621.1971) time: 55.4943 data: 0.0404 max mem: 78608
|
| 943 |
+
[2026-05-03 04:57:58,689][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 420/1087] eta: 10:17:35 lr: 0.000008 epoch: 0.3772 (0.1932) step: 410.0000 (209.9810) loss: 3599.6470 (3658.2743) Lcamera_frontend: 2.8657 (2.8906) Ldepth_frontend: 4.1086 (4.9857) Lpmap_frontend: 11.1647 (11.0617) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8159 (2.8664) Ldepth_mix: 4.1172 (4.9868) Lpmap_mix: 11.0853 (11.0471) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.8670 (2.8901) Ldepth_backend: 4.1287 (4.9875) Lpmap_backend: 11.1357 (11.0503) Ltrack_backend: 0.0000 (0.0000) total: 3599.6470 (3658.2743) time: 55.4395 data: 0.0508 max mem: 78608
|
| 944 |
+
[2026-05-03 05:07:12,491][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 430/1087] eta: 10:08:17 lr: 0.000008 epoch: 0.3864 (0.1978) step: 420.0000 (214.9814) loss: 4169.3975 (3674.5172) Lcamera_frontend: 3.3739 (2.9049) Ldepth_frontend: 4.1068 (4.9837) Lpmap_frontend: 10.4016 (11.0568) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.2131 (2.8778) Ldepth_mix: 4.0997 (4.9850) Lpmap_mix: 10.2944 (11.0410) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.3480 (2.9039) Ldepth_backend: 4.0674 (4.9860) Lpmap_backend: 10.3913 (11.0453) Ltrack_backend: 0.0000 (0.0000) total: 4169.3975 (3674.5172) time: 55.4907 data: 0.0462 max mem: 78608
|
| 945 |
+
[2026-05-03 05:16:38,261][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 440/1087] eta: 9:59:16 lr: 0.000008 epoch: 0.3956 (0.2024) step: 430.0000 (219.9796) loss: 3696.4048 (3699.8234) Lcamera_frontend: 2.9970 (2.9271) Ldepth_frontend: 4.1289 (4.9705) Lpmap_frontend: 11.1855 (11.0608) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.7336 (2.8954) Ldepth_mix: 4.1181 (4.9718) Lpmap_mix: 11.1081 (11.0446) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.9742 (2.9253) Ldepth_backend: 4.1189 (4.9727) Lpmap_backend: 11.1666 (11.0495) Ltrack_backend: 0.0000 (0.0000) total: 3696.4048 (3699.8234) time: 55.9778 data: 0.0363 max mem: 78608
|
| 946 |
+
[2026-05-03 05:25:56,064][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 450/1087] eta: 9:50:03 lr: 0.000008 epoch: 0.4048 (0.2070) step: 440.0000 (224.9800) loss: 3557.5820 (3685.2850) Lcamera_frontend: 2.9234 (2.9174) Ldepth_frontend: 4.1712 (4.9652) Lpmap_frontend: 10.5566 (11.0486) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.5110 (2.8790) Ldepth_mix: 4.1717 (4.9663) Lpmap_mix: 10.5271 (11.0313) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.8505 (2.9136) Ldepth_backend: 4.1736 (4.9670) Lpmap_backend: 10.5695 (11.0372) Ltrack_backend: 0.0000 (0.0000) total: 3557.5820 (3685.2850) time: 56.1767 data: 0.0383 max mem: 78608
|
| 947 |
+
[2026-05-03 05:35:13,979][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 460/1087] eta: 9:40:50 lr: 0.000008 epoch: 0.4140 (0.2116) step: 450.0000 (229.9805) loss: 3480.1802 (3715.1574) Lcamera_frontend: 2.9304 (2.9444) Ldepth_frontend: 4.2331 (4.9430) Lpmap_frontend: 10.6253 (11.0481) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.5110 (2.8997) Ldepth_mix: 4.2412 (4.9438) Lpmap_mix: 10.5307 (11.0303) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.7599 (2.9390) Ldepth_backend: 4.2408 (4.9443) Lpmap_backend: 10.6318 (11.0370) Ltrack_backend: 0.0000 (0.0000) total: 3480.1802 (3715.1574) time: 55.7847 data: 0.0405 max mem: 78608
|
| 948 |
+
[2026-05-03 05:44:35,874][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 470/1087] eta: 9:31:42 lr: 0.000009 epoch: 0.4232 (0.2162) step: 460.0000 (234.9809) loss: 3422.6602 (3696.1356) Lcamera_frontend: 2.8272 (2.9314) Ldepth_frontend: 4.5441 (4.9593) Lpmap_frontend: 10.7407 (11.0491) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.4745 (2.8820) Ldepth_mix: 4.5294 (4.9603) Lpmap_mix: 10.7328 (11.0303) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.6872 (2.9228) Ldepth_backend: 4.4900 (4.9611) Lpmap_backend: 10.7370 (11.0375) Ltrack_backend: 0.0000 (0.0000) total: 3422.6602 (3696.1356) time: 55.9904 data: 0.0372 max mem: 78608
|
| 949 |
+
[2026-05-03 05:53:57,443][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 480/1087] eta: 9:22:33 lr: 0.000009 epoch: 0.4324 (0.2208) step: 470.0000 (239.9792) loss: 1953.1704 (3685.9747) Lcamera_frontend: 1.6280 (2.9252) Ldepth_frontend: 4.7953 (4.9523) Lpmap_frontend: 11.3316 (11.0568) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.2892 (2.8721) Ldepth_mix: 4.8133 (4.9534) Lpmap_mix: 11.2683 (11.0365) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.4712 (2.9143) Ldepth_backend: 4.8394 (4.9541) Lpmap_backend: 11.3175 (11.0445) Ltrack_backend: 0.0000 (0.0000) total: 1953.1704 (3685.9747) time: 56.1731 data: 0.0351 max mem: 78608
|
| 950 |
+
[2026-05-03 06:03:23,669][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 490/1087] eta: 9:13:30 lr: 0.000009 epoch: 0.4416 (0.2254) step: 480.0000 (244.9796) loss: 3448.4893 (3715.0480) Lcamera_frontend: 2.8162 (2.9481) Ldepth_frontend: 4.0748 (4.9510) Lpmap_frontend: 11.5225 (11.0709) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.4138 (2.8971) Ldepth_mix: 4.0684 (4.9520) Lpmap_mix: 11.3160 (11.0501) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.7307 (2.9384) Ldepth_backend: 4.0582 (4.9524) Lpmap_backend: 11.3917 (11.0582) Ltrack_backend: 0.0000 (0.0000) total: 3448.4893 (3715.0480) time: 56.3896 data: 0.0380 max mem: 78608
|
| 951 |
+
[2026-05-03 06:12:44,873][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 500/1087] eta: 9:04:19 lr: 0.000009 epoch: 0.4508 (0.2300) step: 490.0000 (249.9800) loss: 3181.1260 (3704.2903) Lcamera_frontend: 2.7117 (2.9441) Ldepth_frontend: 4.3194 (4.9549) Lpmap_frontend: 11.1899 (11.0778) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.3901 (2.8867) Ldepth_mix: 4.2974 (4.9559) Lpmap_mix: 11.1532 (11.0555) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.4224 (2.9290) Ldepth_backend: 4.2664 (4.9563) Lpmap_backend: 11.1807 (11.0642) Ltrack_backend: 0.0000 (0.0000) total: 3181.1260 (3704.2903) time: 56.3714 data: 0.0392 max mem: 78608
|
| 952 |
+
[2026-05-03 06:22:10,560][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 510/1087] eta: 8:55:13 lr: 0.000009 epoch: 0.4600 (0.2346) step: 500.0000 (254.9804) loss: 2312.1357 (3664.4238) Lcamera_frontend: 1.6983 (2.9067) Ldepth_frontend: 5.5187 (4.9843) Lpmap_frontend: 11.2489 (11.0845) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.6342 (2.8571) Ldepth_mix: 5.5271 (4.9860) Lpmap_mix: 11.1145 (11.0610) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.8280 (2.8953) Ldepth_backend: 5.5356 (4.9872) Lpmap_backend: 11.2275 (11.0709) Ltrack_backend: 0.0000 (0.0000) total: 2312.1357 (3664.4238) time: 56.3433 data: 0.0392 max mem: 78608
|
| 953 |
+
[2026-05-03 06:31:23,143][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 520/1087] eta: 8:45:52 lr: 0.000010 epoch: 0.4692 (0.2392) step: 510.0000 (259.9808) loss: 2312.1357 (3680.6249) Lcamera_frontend: 1.7494 (2.9227) Ldepth_frontend: 5.5370 (4.9785) Lpmap_frontend: 11.5558 (11.0989) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.6342 (2.8691) Ldepth_mix: 5.5576 (4.9800) Lpmap_mix: 11.4531 (11.0744) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.8280 (2.9087) Ldepth_backend: 5.5728 (4.9808) Lpmap_backend: 11.4899 (11.0847) Ltrack_backend: 0.0000 (0.0000) total: 2312.1357 (3680.6249) time: 55.9113 data: 0.0349 max mem: 78608
|
| 954 |
+
[2026-05-03 06:40:36,931][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 530/1087] eta: 8:36:33 lr: 0.000010 epoch: 0.4784 (0.2438) step: 520.0000 (264.9812) loss: 3759.7468 (3690.1346) Lcamera_frontend: 3.2916 (2.9316) Ldepth_frontend: 3.9535 (4.9874) Lpmap_frontend: 11.9431 (11.1189) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8017 (2.8779) Ldepth_mix: 3.9413 (4.9890) Lpmap_mix: 11.8807 (11.0935) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.9341 (2.9161) Ldepth_backend: 3.9288 (4.9899) Lpmap_backend: 11.8630 (11.1037) Ltrack_backend: 0.0000 (0.0000) total: 3759.7468 (3690.1346) time: 55.3174 data: 0.0345 max mem: 78608
|
| 955 |
+
[2026-05-03 06:49:38,804][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 540/1087] eta: 8:27:02 lr: 0.000010 epoch: 0.4876 (0.2484) step: 530.0000 (269.9815) loss: 3041.6226 (3678.3606) Lcamera_frontend: 2.7298 (2.9239) Ldepth_frontend: 4.7545 (4.9937) Lpmap_frontend: 11.9619 (11.1341) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.3337 (2.8681) Ldepth_mix: 4.7512 (4.9953) Lpmap_mix: 11.8882 (11.1076) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.3449 (2.9058) Ldepth_backend: 4.7452 (4.9960) Lpmap_backend: 11.8855 (11.1177) Ltrack_backend: 0.0000 (0.0000) total: 3041.6226 (3678.3606) time: 54.7829 data: 0.0372 max mem: 78608
|
| 956 |
+
[2026-05-03 06:58:46,010][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 550/1087] eta: 8:17:37 lr: 0.000010 epoch: 0.4968 (0.2530) step: 540.0000 (274.9819) loss: 3157.6709 (3695.2321) Lcamera_frontend: 2.8859 (2.9411) Ldepth_frontend: 4.2761 (4.9836) Lpmap_frontend: 11.9345 (11.1517) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.3596 (2.8817) Ldepth_mix: 4.2809 (4.9853) Lpmap_mix: 11.8205 (11.1240) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.4089 (2.9195) Ldepth_backend: 4.2791 (4.9860) Lpmap_backend: 11.8111 (11.1343) Ltrack_backend: 0.0000 (0.0000) total: 3157.6709 (3695.2321) time: 54.4538 data: 0.0362 max mem: 78608
|
| 957 |
+
[2026-05-03 07:07:59,114][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 560/1087] eta: 8:08:18 lr: 0.000010 epoch: 0.5060 (0.2576) step: 550.0000 (279.9804) loss: 3299.5527 (3685.7435) Lcamera_frontend: 2.8859 (2.9342) Ldepth_frontend: 4.2133 (4.9895) Lpmap_frontend: 12.1331 (11.1706) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.5620 (2.8742) Ldepth_mix: 4.2019 (4.9912) Lpmap_mix: 12.0707 (11.1417) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.5655 (2.9112) Ldepth_backend: 4.2065 (4.9920) Lpmap_backend: 12.0808 (11.1521) Ltrack_backend: 0.0000 (0.0000) total: 3299.5527 (3685.7435) time: 55.0154 data: 0.0348 max mem: 78608
|
| 958 |
+
[2026-05-03 07:17:13,870][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 570/1087] eta: 7:59:01 lr: 0.000010 epoch: 0.5152 (0.2622) step: 560.0000 (284.9807) loss: 2793.5979 (3674.3886) Lcamera_frontend: 2.5606 (2.9278) Ldepth_frontend: 4.4531 (4.9938) Lpmap_frontend: 12.1361 (11.1882) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.9259 (2.8630) Ldepth_mix: 4.4475 (4.9955) Lpmap_mix: 12.0357 (11.1578) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.1464 (2.9013) Ldepth_backend: 4.4535 (4.9964) Lpmap_backend: 12.0743 (11.1686) Ltrack_backend: 0.0000 (0.0000) total: 2793.5979 (3674.3886) time: 55.3924 data: 0.0518 max mem: 78608
|
| 959 |
+
[2026-05-03 07:26:34,637][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 580/1087] eta: 7:49:50 lr: 0.000010 epoch: 0.5244 (0.2668) step: 570.0000 (289.9811) loss: 2793.5979 (3664.5630) Lcamera_frontend: 2.5606 (2.9213) Ldepth_frontend: 4.7775 (5.0027) Lpmap_frontend: 12.1265 (11.2043) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.9259 (2.8544) Ldepth_mix: 4.7610 (5.0045) Lpmap_mix: 12.0290 (11.1729) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.1464 (2.8927) Ldepth_backend: 4.7341 (5.0055) Lpmap_backend: 12.0366 (11.1838) Ltrack_backend: 0.0000 (0.0000) total: 2793.5979 (3664.5630) time: 55.7751 data: 0.0523 max mem: 78608
|
| 960 |
+
[2026-05-03 07:35:47,292][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 590/1087] eta: 7:40:31 lr: 0.000010 epoch: 0.5336 (0.2714) step: 580.0000 (294.9814) loss: 3078.1255 (3672.0609) Lcamera_frontend: 2.6543 (2.9290) Ldepth_frontend: 5.1508 (5.0042) Lpmap_frontend: 12.1613 (11.2219) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.3590 (2.8611) Ldepth_mix: 5.1450 (5.0061) Lpmap_mix: 12.0374 (11.1892) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.3715 (2.8986) Ldepth_backend: 5.1472 (5.0072) Lpmap_backend: 12.0366 (11.2002) Ltrack_backend: 0.0000 (0.0000) total: 3078.1255 (3672.0609) time: 55.6706 data: 0.0362 max mem: 78608
|
| 961 |
+
[2026-05-03 07:45:05,642][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 600/1087] eta: 7:31:17 lr: 0.000010 epoch: 0.5428 (0.2760) step: 590.0000 (299.9800) loss: 2420.8750 (3653.3033) Lcamera_frontend: 2.1832 (2.9175) Ldepth_frontend: 5.1508 (5.0129) Lpmap_frontend: 12.2512 (11.2361) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.6398 (2.8439) Ldepth_mix: 5.1450 (5.0148) Lpmap_mix: 12.0856 (11.2016) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.8001 (2.8824) Ldepth_backend: 5.1472 (5.0159) Lpmap_backend: 12.0868 (11.2128) Ltrack_backend: 0.0000 (0.0000) total: 2420.8750 (3653.3033) time: 55.5500 data: 0.0383 max mem: 78608
|
| 962 |
+
[2026-05-03 07:54:26,088][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 610/1087] eta: 7:22:04 lr: 0.000010 epoch: 0.5520 (0.2806) step: 600.0000 (304.9804) loss: 1847.7075 (3619.7879) Lcamera_frontend: 1.7183 (2.8876) Ldepth_frontend: 6.1198 (5.0419) Lpmap_frontend: 12.1447 (11.2512) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.4303 (2.8188) Ldepth_mix: 6.1404 (5.0440) Lpmap_mix: 12.0819 (11.2154) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.4411 (2.8539) Ldepth_backend: 6.1645 (5.0453) Lpmap_backend: 12.0822 (11.2265) Ltrack_backend: 0.0000 (0.0000) total: 1847.7075 (3619.7879) time: 55.9397 data: 0.0387 max mem: 78608
|
| 963 |
+
[2026-05-03 08:03:46,791][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 620/1087] eta: 7:12:52 lr: 0.000010 epoch: 0.5612 (0.2852) step: 610.0000 (309.9807) loss: 1764.5923 (3613.3664) Lcamera_frontend: 1.7420 (2.8863) Ldepth_frontend: 5.8825 (5.0355) Lpmap_frontend: 12.1447 (11.2645) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.3862 (2.8133) Ldepth_mix: 5.8885 (5.0375) Lpmap_mix: 12.1045 (11.2282) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.3519 (2.8481) Ldepth_backend: 5.8812 (5.0388) Lpmap_backend: 12.0942 (11.2392) Ltrack_backend: 0.0000 (0.0000) total: 1764.5923 (3613.3664) time: 56.0573 data: 0.0348 max mem: 78608
|
| 964 |
+
[2026-05-03 08:13:07,153][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 630/1087] eta: 7:03:38 lr: 0.000010 epoch: 0.5704 (0.2898) step: 620.0000 (314.9810) loss: 3059.9080 (3603.2805) Lcamera_frontend: 2.6628 (2.8808) Ldepth_frontend: 4.4627 (5.0408) Lpmap_frontend: 12.1902 (11.2795) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.3353 (2.8049) Ldepth_mix: 4.4561 (5.0429) Lpmap_mix: 12.1202 (11.2425) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.3512 (2.8391) Ldepth_backend: 4.4338 (5.0442) Lpmap_backend: 12.1125 (11.2534) Ltrack_backend: 0.0000 (0.0000) total: 3059.9080 (3603.2805) time: 56.0523 data: 0.0357 max mem: 78608
|
| 965 |
+
[2026-05-03 08:22:25,019][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 640/1087] eta: 6:54:23 lr: 0.000010 epoch: 0.5796 (0.2944) step: 630.0000 (319.9797) loss: 2445.0342 (3595.1443) Lcamera_frontend: 2.2690 (2.8786) Ldepth_frontend: 5.0677 (5.0415) Lpmap_frontend: 12.1176 (11.2916) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.8117 (2.7976) Ldepth_mix: 5.0777 (5.0436) Lpmap_mix: 12.0499 (11.2539) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.8377 (2.8318) Ldepth_backend: 5.0785 (5.0450) Lpmap_backend: 12.0557 (11.2650) Ltrack_backend: 0.0000 (0.0000) total: 2445.0342 (3595.1443) time: 55.9098 data: 0.0467 max mem: 78608
|
| 966 |
+
[2026-05-03 08:31:48,897][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 650/1087] eta: 6:45:12 lr: 0.000010 epoch: 0.5888 (0.2990) step: 640.0000 (324.9800) loss: 2445.0342 (3586.5967) Lcamera_frontend: 2.3161 (2.8738) Ldepth_frontend: 4.6716 (5.0454) Lpmap_frontend: 12.1176 (11.3048) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.8117 (2.7902) Ldepth_mix: 4.6889 (5.0475) Lpmap_mix: 11.9498 (11.2663) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.8377 (2.8243) Ldepth_backend: 4.7019 (5.0488) Lpmap_backend: 11.9930 (11.2774) Ltrack_backend: 0.0000 (0.0000) total: 2445.0342 (3586.5967) time: 56.0863 data: 0.0486 max mem: 78608
|
| 967 |
+
[2026-05-03 08:41:02,453][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 660/1087] eta: 6:35:54 lr: 0.000010 epoch: 0.5980 (0.3036) step: 650.0000 (329.9803) loss: 2730.5120 (3580.1872) Lcamera_frontend: 2.3161 (2.8633) Ldepth_frontend: 5.5045 (5.0635) Lpmap_frontend: 12.1464 (11.3183) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.9963 (2.7858) Ldepth_mix: 5.5256 (5.0657) Lpmap_mix: 12.0755 (11.2787) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.0671 (2.8190) Ldepth_backend: 5.5443 (5.0670) Lpmap_backend: 12.1021 (11.2898) Ltrack_backend: 0.0000 (0.0000) total: 2730.5120 (3580.1872) time: 55.8715 data: 0.0395 max mem: 78608
|
| 968 |
+
[2026-05-03 08:50:18,799][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 670/1087] eta: 6:26:38 lr: 0.000010 epoch: 0.6072 (0.3082) step: 660.0000 (334.9806) loss: 1671.2928 (3558.1377) Lcamera_frontend: 1.3674 (2.8426) Ldepth_frontend: 6.3148 (5.0889) Lpmap_frontend: 12.0345 (11.3293) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.1993 (2.7676) Ldepth_mix: 6.3372 (5.0913) Lpmap_mix: 11.9413 (11.2881) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.2104 (2.8004) Ldepth_backend: 6.3731 (5.0929) Lpmap_backend: 11.9567 (11.2992) Ltrack_backend: 0.0000 (0.0000) total: 1671.2928 (3558.1377) time: 55.4949 data: 0.0354 max mem: 78608
|
| 969 |
+
[2026-05-03 08:59:14,886][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 680/1087] eta: 6:17:09 lr: 0.000010 epoch: 0.6164 (0.3128) step: 670.0000 (339.9794) loss: 1392.4603 (3546.1834) Lcamera_frontend: 1.2751 (2.8348) Ldepth_frontend: 6.0868 (5.0949) Lpmap_frontend: 11.9328 (11.3400) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 0.8983 (2.7576) Ldepth_mix: 6.0872 (5.0973) Lpmap_mix: 11.8686 (11.2986) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 0.9128 (2.7900) Ldepth_backend: 6.0722 (5.0989) Lpmap_backend: 11.8731 (11.3095) Ltrack_backend: 0.0000 (0.0000) total: 1392.4603 (3546.1834) time: 54.6215 data: 0.0341 max mem: 78608
|
| 970 |
+
[2026-05-03 09:08:26,305][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 690/1087] eta: 6:07:51 lr: 0.000010 epoch: 0.6256 (0.3174) step: 680.0000 (344.9797) loss: 3024.0039 (3550.7322) Lcamera_frontend: 2.7665 (2.8400) Ldepth_frontend: 4.4880 (5.0947) Lpmap_frontend: 12.1197 (11.3505) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.2599 (2.7622) Ldepth_mix: 4.4941 (5.0972) Lpmap_mix: 12.0705 (11.3088) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.2941 (2.7935) Ldepth_backend: 4.4907 (5.0987) Lpmap_backend: 12.0592 (11.3199) Ltrack_backend: 0.0000 (0.0000) total: 3024.0039 (3550.7322) time: 54.3744 data: 0.0348 max mem: 78608
|
| 971 |
+
[2026-05-03 09:17:43,346][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 700/1087] eta: 5:58:35 lr: 0.000010 epoch: 0.6348 (0.3220) step: 690.0000 (349.9800) loss: 2653.3569 (3546.6571) Lcamera_frontend: 2.6315 (2.8399) Ldepth_frontend: 4.4880 (5.0993) Lpmap_frontend: 12.1347 (11.3618) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.9773 (2.7598) Ldepth_mix: 4.4941 (5.1018) Lpmap_mix: 12.0656 (11.3196) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.9968 (2.7895) Ldepth_backend: 4.4907 (5.1032) Lpmap_backend: 12.0694 (11.3308) Ltrack_backend: 0.0000 (0.0000) total: 2653.3569 (3546.6571) time: 55.4217 data: 0.0344 max mem: 78608
|
| 972 |
+
[2026-05-03 09:26:59,488][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 710/1087] eta: 5:49:19 lr: 0.000010 epoch: 0.6440 (0.3266) step: 700.0000 (354.9803) loss: 2276.0320 (3538.5582) Lcamera_frontend: 2.2519 (2.8377) Ldepth_frontend: 4.5876 (5.0951) Lpmap_frontend: 11.9809 (11.3689) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.6760 (2.7523) Ldepth_mix: 4.5948 (5.0976) Lpmap_mix: 11.8900 (11.3253) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.7012 (2.7823) Ldepth_backend: 4.5958 (5.0990) Lpmap_backend: 11.8968 (11.3366) Ltrack_backend: 0.0000 (0.0000) total: 2276.0320 (3538.5582) time: 55.6586 data: 0.0364 max mem: 78608
|
| 973 |
+
[2026-05-03 09:36:09,910][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 720/1087] eta: 5:40:01 lr: 0.000010 epoch: 0.6532 (0.3312) step: 710.0000 (359.9792) loss: 2183.4126 (3522.9060) Lcamera_frontend: 2.1352 (2.8250) Ldepth_frontend: 5.1236 (5.1032) Lpmap_frontend: 11.8750 (11.3756) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.6790 (2.7410) Ldepth_mix: 5.1267 (5.1058) Lpmap_mix: 11.6824 (11.3311) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.5669 (2.7689) Ldepth_backend: 5.1289 (5.1074) Lpmap_backend: 11.7142 (11.3424) Ltrack_backend: 0.0000 (0.0000) total: 2183.4126 (3522.9060) time: 55.3280 data: 0.0390 max mem: 78608
|
| 974 |
+
[2026-05-03 09:45:24,115][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 730/1087] eta: 5:30:44 lr: 0.000010 epoch: 0.6624 (0.3358) step: 720.0000 (364.9795) loss: 2632.0852 (3538.6334) Lcamera_frontend: 2.3416 (2.8363) Ldepth_frontend: 4.6212 (5.0993) Lpmap_frontend: 11.9530 (11.3871) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.1370 (2.7544) Ldepth_mix: 4.5974 (5.1018) Lpmap_mix: 11.9262 (11.3426) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.9735 (2.7821) Ldepth_backend: 4.6038 (5.1033) Lpmap_backend: 11.8971 (11.3540) Ltrack_backend: 0.0000 (0.0000) total: 2632.0852 (3538.6334) time: 55.2311 data: 0.0394 max mem: 78608
|
| 975 |
+
[2026-05-03 09:54:34,387][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 740/1087] eta: 5:21:25 lr: 0.000010 epoch: 0.6716 (0.3404) step: 730.0000 (369.9798) loss: 3940.0229 (3533.7108) Lcamera_frontend: 3.3570 (2.8329) Ldepth_frontend: 4.6212 (5.1089) Lpmap_frontend: 12.2415 (11.4019) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.8580 (2.7488) Ldepth_mix: 4.5974 (5.1111) Lpmap_mix: 12.1585 (11.3570) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.1289 (2.7778) Ldepth_backend: 4.6038 (5.1119) Lpmap_backend: 12.1698 (11.3684) Ltrack_backend: 0.0000 (0.0000) total: 3940.0229 (3533.7108) time: 55.2237 data: 0.0406 max mem: 78608
|
| 976 |
+
[2026-05-03 10:03:56,791][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 750/1087] eta: 5:12:13 lr: 0.000010 epoch: 0.6808 (0.3450) step: 740.0000 (374.9800) loss: 2456.8555 (3527.7547) Lcamera_frontend: 2.2745 (2.8312) Ldepth_frontend: 5.3840 (5.1194) Lpmap_frontend: 12.2415 (11.4125) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.9090 (2.7457) Ldepth_mix: 5.4059 (5.1217) Lpmap_mix: 11.9992 (11.3643) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.8766 (2.7721) Ldepth_backend: 5.4270 (5.1226) Lpmap_backend: 12.1463 (11.3767) Ltrack_backend: 0.0000 (0.0000) total: 2456.8555 (3527.7547) time: 55.6336 data: 0.0390 max mem: 78608
|
| 977 |
+
[2026-05-03 10:13:16,348][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 760/1087] eta: 5:02:58 lr: 0.000010 epoch: 0.6900 (0.3496) step: 750.0000 (379.9790) loss: 2818.4995 (3523.2228) Lcamera_frontend: 2.4893 (2.8302) Ldepth_frontend: 5.7214 (5.1307) Lpmap_frontend: 12.1750 (11.4230) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.5757 (2.7421) Ldepth_mix: 5.6839 (5.1333) Lpmap_mix: 11.9527 (11.3724) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.0683 (2.7677) Ldepth_backend: 5.6014 (5.1345) Lpmap_backend: 12.0599 (11.3857) Ltrack_backend: 0.0000 (0.0000) total: 2818.4995 (3523.2228) time: 56.0967 data: 0.0372 max mem: 78608
|
| 978 |
+
[2026-05-03 10:22:27,922][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 770/1087] eta: 4:53:41 lr: 0.000010 epoch: 0.6992 (0.3542) step: 760.0000 (384.9792) loss: 3387.1812 (3518.5164) Lcamera_frontend: 2.8698 (2.8273) Ldepth_frontend: 5.2634 (5.1386) Lpmap_frontend: 12.1750 (11.4330) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.7236 (2.7379) Ldepth_mix: 5.2865 (5.1413) Lpmap_mix: 12.0448 (11.3823) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.6191 (2.7635) Ldepth_backend: 5.2979 (5.1425) Lpmap_backend: 12.0879 (11.3958) Ltrack_backend: 0.0000 (0.0000) total: 3387.1812 (3518.5164) time: 55.5538 data: 0.0388 max mem: 78608
|
| 979 |
+
[2026-05-03 10:31:51,892][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 780/1087] eta: 4:44:28 lr: 0.000010 epoch: 0.7084 (0.3588) step: 770.0000 (389.9795) loss: 2987.0747 (3514.9776) Lcamera_frontend: 2.8551 (2.8275) Ldepth_frontend: 4.8573 (5.1382) Lpmap_frontend: 12.1774 (11.4421) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.2459 (2.7352) Ldepth_mix: 4.8739 (5.1410) Lpmap_mix: 11.9692 (11.3895) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.2869 (2.7601) Ldepth_backend: 4.8861 (5.1422) Lpmap_backend: 12.0147 (11.4031) Ltrack_backend: 0.0000 (0.0000) total: 2987.0747 (3514.9776) time: 55.7757 data: 0.0375 max mem: 78608
|
| 980 |
+
[2026-05-03 10:40:55,736][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 790/1087] eta: 4:35:07 lr: 0.000010 epoch: 0.7176 (0.3634) step: 780.0000 (394.9798) loss: 2920.8914 (3533.7044) Lcamera_frontend: 2.8551 (2.8334) Ldepth_frontend: 4.8106 (5.1427) Lpmap_frontend: 12.1774 (11.4514) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.3363 (2.7519) Ldepth_mix: 4.8182 (5.1455) Lpmap_mix: 11.9840 (11.3997) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.2747 (2.7764) Ldepth_backend: 4.8160 (5.1468) Lpmap_backend: 12.0256 (11.4133) Ltrack_backend: 0.0000 (0.0000) total: 2920.8914 (3533.7044) time: 55.3906 data: 0.0353 max mem: 78608
|
| 981 |
+
[2026-05-03 10:50:15,123][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 800/1087] eta: 4:25:53 lr: 0.000010 epoch: 0.7268 (0.3680) step: 790.0000 (399.9788) loss: 4561.9434 (3540.7668) Lcamera_frontend: 3.2215 (2.8399) Ldepth_frontend: 4.5694 (5.1381) Lpmap_frontend: 12.0237 (11.4587) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 3.5356 (2.7579) Ldepth_mix: 4.5534 (5.1410) Lpmap_mix: 11.9905 (11.4068) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 3.6264 (2.7822) Ldepth_backend: 4.5277 (5.1421) Lpmap_backend: 11.9463 (11.4203) Ltrack_backend: 0.0000 (0.0000) total: 4561.9434 (3540.7668) time: 55.1615 data: 0.0344 max mem: 78608
|
| 982 |
+
[2026-05-03 10:59:13,062][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 810/1087] eta: 4:16:31 lr: 0.000010 epoch: 0.7360 (0.3726) step: 800.0000 (404.9790) loss: 3519.1130 (3530.4271) Lcamera_frontend: 2.8982 (2.8309) Ldepth_frontend: 4.3878 (5.1461) Lpmap_frontend: 11.9666 (11.4668) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.7501 (2.7496) Ldepth_mix: 4.3607 (5.1487) Lpmap_mix: 11.9215 (11.4149) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.7690 (2.7734) Ldepth_backend: 4.3180 (5.1496) Lpmap_backend: 11.9312 (11.4281) Ltrack_backend: 0.0000 (0.0000) total: 3519.1130 (3530.4271) time: 54.8662 data: 0.0341 max mem: 78608
|
| 983 |
+
[2026-05-03 11:08:28,691][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 820/1087] eta: 4:07:15 lr: 0.000010 epoch: 0.7452 (0.3772) step: 810.0000 (409.9793) loss: 2573.6677 (3529.3969) Lcamera_frontend: 2.3184 (2.8273) Ldepth_frontend: 6.1467 (5.1690) Lpmap_frontend: 11.9551 (11.4722) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.8582 (2.7502) Ldepth_mix: 6.1585 (5.1718) Lpmap_mix: 11.9255 (11.4196) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.9209 (2.7724) Ldepth_backend: 6.1818 (5.1729) Lpmap_backend: 11.8981 (11.4329) Ltrack_backend: 0.0000 (0.0000) total: 2573.6677 (3529.3969) time: 54.6779 data: 0.0351 max mem: 78608
|
| 984 |
+
[2026-05-03 11:17:46,381][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 830/1087] eta: 3:58:00 lr: 0.000010 epoch: 0.7544 (0.3818) step: 820.0000 (414.9795) loss: 2573.6677 (3534.8521) Lcamera_frontend: 2.3376 (2.8334) Ldepth_frontend: 5.3513 (5.1629) Lpmap_frontend: 11.8924 (11.4769) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 1.8582 (2.7547) Ldepth_mix: 5.3563 (5.1656) Lpmap_mix: 11.7717 (11.4241) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 1.9209 (2.7768) Ldepth_backend: 5.3373 (5.1664) Lpmap_backend: 11.7964 (11.4373) Ltrack_backend: 0.0000 (0.0000) total: 2573.6677 (3534.8521) time: 55.6651 data: 0.0355 max mem: 78608
|
| 985 |
+
[2026-05-03 11:26:59,109][croco.utils.misc][INFO] - [RANK 0] Epoch: [0] [ 840/1087] eta: 3:48:44 lr: 0.000010 epoch: 0.7636 (0.3864) step: 830.0000 (419.9786) loss: 2918.1069 (3529.2461) Lcamera_frontend: 2.9368 (2.8318) Ldepth_frontend: 4.7339 (5.1652) Lpmap_frontend: 11.7897 (11.4817) Ltrack_frontend: 0.0000 (0.0000) Lcamera_mix: 2.1487 (2.7492) Ldepth_mix: 4.7480 (5.1679) Lpmap_mix: 11.7146 (11.4282) Ltrack_mix: 0.0000 (0.0000) Lcamera_backend: 2.2180 (2.7718) Ldepth_backend: 4.7483 (5.1687) Lpmap_backend: 11.7211 (11.4415) Ltrack_backend: 0.0000 (0.0000) total: 2918.1069 (3529.2461) time: 55.5204 data: 0.0365 max mem: 78608
|
outdoor_v48_4gpu/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
teacher: /gpfs/work2/0/prjs0824/qi_proj/ckpt/checkpoint-10.pth.model
|
| 2 |
+
pretrained: /gpfs/work2/0/prjs0824/qi_proj/ckpt/checkpoint-10.pth.model
|
| 3 |
+
load_only_encoder: false
|
| 4 |
+
long_context: false
|
| 5 |
+
fixed_length: true
|
| 6 |
+
resume: null
|
| 7 |
+
benchmark: false
|
| 8 |
+
num_views: 64
|
| 9 |
+
num_test_views: 4
|
| 10 |
+
n_corres_train: 0
|
| 11 |
+
n_corres_test: 0
|
| 12 |
+
train_criterion: DistillLoss()
|
| 13 |
+
test_criterion: DistillLoss()
|
| 14 |
+
allow_repeat: false
|
| 15 |
+
root_vkitti2: /scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti
|
| 16 |
+
root_kitti: /scratch-shared/wwei2/eval/kitti_odometry/dataset
|
| 17 |
+
root_kitti_velo: /gpfs/work2/0/prjs0824/semantickitti/dataset
|
| 18 |
+
root_kitti360: /scratch-shared/wwei2/downloads/kitti360/KITTI-360
|
| 19 |
+
root_kitti360_velo: /scratch-shared/wwei2/downloads/kitti360/KITTI-360
|
| 20 |
+
root_waymo: /scratch-shared/wwei2/waymo_v2
|
| 21 |
+
root_waymo_lidar: /scratch-shared/wwei2/waymo_v2
|
| 22 |
+
dataset_vkitti2: VirtualKITTI2_Multi(allow_repeat=${allow_repeat}, split='train',
|
| 23 |
+
ROOT="${root_vkitti2}", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294),
|
| 24 |
+
(518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=${num_views},
|
| 25 |
+
n_corres=${n_corres_train})
|
| 26 |
+
dataset_kitti360: KITTI360_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_kitti360}",
|
| 27 |
+
velodyne_root="${root_kitti360_velo}", aug_crop=16, resolution=[(518, 392), (518,
|
| 28 |
+
336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter,
|
| 29 |
+
num_views=${num_views}, n_corres=${n_corres_train})
|
| 30 |
+
dataset_waymo: Waymo_v2_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_waymo}",
|
| 31 |
+
lidar_root="${root_waymo_lidar}", aug_crop=16, resolution=[(518, 392), (518, 336),
|
| 32 |
+
(518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=${num_views},
|
| 33 |
+
n_corres=${n_corres_train})
|
| 34 |
+
train_dataset: 6000 @ ${dataset_vkitti2} + 6000 @ ${dataset_kitti360} + 5400 @ ${dataset_waymo}
|
| 35 |
+
test_dataset: 200 @ VirtualKITTI2_Multi(split='train', ROOT="${root_vkitti2}", resolution=(518,
|
| 36 |
+
154), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
|
| 37 |
+
seed: 0
|
| 38 |
+
batch_size: 1
|
| 39 |
+
accum_iter: 1
|
| 40 |
+
gradient_checkpointing: false
|
| 41 |
+
epochs: 10
|
| 42 |
+
start_epoch: 0
|
| 43 |
+
start_step: 0
|
| 44 |
+
weight_decay: 0.05
|
| 45 |
+
lr: 1.0e-05
|
| 46 |
+
min_lr: 1.0e-08
|
| 47 |
+
warmup_epochs: 0.5
|
| 48 |
+
amp: 1
|
| 49 |
+
num_workers: 4
|
| 50 |
+
world_size: 1
|
| 51 |
+
local-rank: -1
|
| 52 |
+
dist_url: env://
|
| 53 |
+
rank: 0
|
| 54 |
+
gpu: 0
|
| 55 |
+
distributed: false
|
| 56 |
+
dist_backend: nccl
|
| 57 |
+
eval_freq: 1
|
| 58 |
+
save_freq: 0.1
|
| 59 |
+
max_checkpoints: 10
|
| 60 |
+
keep_freq: 1
|
| 61 |
+
print_freq: 10
|
| 62 |
+
print_img_freq: 50000000
|
| 63 |
+
num_imgs_vis: 4
|
| 64 |
+
save_dir: /scratch-shared/wwei2/training_upstream/checkpoints
|
| 65 |
+
exp_name: outdoor_v48_4gpu
|
| 66 |
+
task: StreamVGGT
|
| 67 |
+
logdir: ${save_dir}/${exp_name}/logs
|
| 68 |
+
output_dir: ${save_dir}/${exp_name}/
|
outdoor_v48_4gpu/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ${save_dir}/${exp_name}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task:
|
| 115 |
+
- exp_name=outdoor_v48_4gpu
|
| 116 |
+
job:
|
| 117 |
+
name: mytrain
|
| 118 |
+
chdir: null
|
| 119 |
+
override_dirname: exp_name=outdoor_v48_4gpu
|
| 120 |
+
id: ???
|
| 121 |
+
num: ???
|
| 122 |
+
config_name: outdoor_v48
|
| 123 |
+
env_set: {}
|
| 124 |
+
env_copy: []
|
| 125 |
+
config:
|
| 126 |
+
override_dirname:
|
| 127 |
+
kv_sep: '='
|
| 128 |
+
item_sep: ','
|
| 129 |
+
exclude_keys: []
|
| 130 |
+
runtime:
|
| 131 |
+
version: 1.3.2
|
| 132 |
+
version_base: '1.3'
|
| 133 |
+
cwd: /gpfs/work2/0/prjs0824/qi_proj/slamformer_upstream/src
|
| 134 |
+
config_sources:
|
| 135 |
+
- path: hydra.conf
|
| 136 |
+
schema: pkg
|
| 137 |
+
provider: hydra
|
| 138 |
+
- path: /gpfs/work2/0/prjs0824/qi_proj/slamformer_upstream/config
|
| 139 |
+
schema: file
|
| 140 |
+
provider: main
|
| 141 |
+
- path: ''
|
| 142 |
+
schema: structured
|
| 143 |
+
provider: schema
|
| 144 |
+
output_dir: /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_4gpu
|
| 145 |
+
choices:
|
| 146 |
+
hydra/env: default
|
| 147 |
+
hydra/callbacks: null
|
| 148 |
+
hydra/job_logging: default
|
| 149 |
+
hydra/hydra_logging: default
|
| 150 |
+
hydra/hydra_help: default
|
| 151 |
+
hydra/help: default
|
| 152 |
+
hydra/sweeper: basic
|
| 153 |
+
hydra/launcher: basic
|
| 154 |
+
hydra/output: default
|
| 155 |
+
verbose: true
|
outdoor_v48_4gpu/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
- exp_name=outdoor_v48_4gpu
|
outdoor_v48_4gpu/mytrain.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outdoor_v48_4gpu_v2/.hydra/config.yaml
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
teacher: /gpfs/work2/0/prjs0824/qi_proj/ckpt/checkpoint-10.pth.model
|
| 2 |
+
pretrained: /gpfs/work2/0/prjs0824/qi_proj/ckpt/checkpoint-10.pth.model
|
| 3 |
+
load_only_encoder: false
|
| 4 |
+
long_context: false
|
| 5 |
+
fixed_length: true
|
| 6 |
+
resume: null
|
| 7 |
+
benchmark: false
|
| 8 |
+
num_views: 64
|
| 9 |
+
num_test_views: 4
|
| 10 |
+
n_corres_train: 0
|
| 11 |
+
n_corres_test: 0
|
| 12 |
+
train_criterion: DistillLoss()
|
| 13 |
+
test_criterion: DistillLoss()
|
| 14 |
+
allow_repeat: false
|
| 15 |
+
root_vkitti2: /scratch-shared/wwei2/training/preprocessed_vkitti/mast3r_data/processed_vkitti
|
| 16 |
+
root_kitti: /scratch-shared/wwei2/eval/kitti_odometry/dataset
|
| 17 |
+
root_kitti_velo: /gpfs/work2/0/prjs0824/semantickitti/dataset
|
| 18 |
+
root_kitti360: /scratch-shared/wwei2/downloads/kitti360/KITTI-360
|
| 19 |
+
root_kitti360_velo: /scratch-shared/wwei2/downloads/kitti360/KITTI-360
|
| 20 |
+
root_waymo: /scratch-shared/wwei2/waymo_v2
|
| 21 |
+
root_waymo_lidar: /scratch-shared/wwei2/waymo_v2
|
| 22 |
+
dataset_vkitti2: VirtualKITTI2_Multi(allow_repeat=${allow_repeat}, split='train',
|
| 23 |
+
ROOT="${root_vkitti2}", aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294),
|
| 24 |
+
(518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=${num_views},
|
| 25 |
+
n_corres=${n_corres_train})
|
| 26 |
+
dataset_kitti360: KITTI360_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_kitti360}",
|
| 27 |
+
velodyne_root="${root_kitti360_velo}", aug_crop=16, resolution=[(518, 392), (518,
|
| 28 |
+
336), (518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter,
|
| 29 |
+
num_views=${num_views}, n_corres=${n_corres_train})
|
| 30 |
+
dataset_waymo: Waymo_v2_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_waymo}",
|
| 31 |
+
lidar_root="${root_waymo_lidar}", aug_crop=16, resolution=[(518, 392), (518, 336),
|
| 32 |
+
(518, 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=${num_views},
|
| 33 |
+
n_corres=${n_corres_train})
|
| 34 |
+
train_dataset: 6000 @ ${dataset_vkitti2} + 6000 @ ${dataset_kitti360} + 5400 @ ${dataset_waymo}
|
| 35 |
+
test_dataset: 200 @ VirtualKITTI2_Multi(split='train', ROOT="${root_vkitti2}", resolution=(518,
|
| 36 |
+
154), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
|
| 37 |
+
seed: 0
|
| 38 |
+
batch_size: 1
|
| 39 |
+
accum_iter: 1
|
| 40 |
+
gradient_checkpointing: false
|
| 41 |
+
epochs: 10
|
| 42 |
+
start_epoch: 0
|
| 43 |
+
start_step: 0
|
| 44 |
+
weight_decay: 0.05
|
| 45 |
+
lr: 1.0e-05
|
| 46 |
+
min_lr: 1.0e-08
|
| 47 |
+
warmup_epochs: 0.5
|
| 48 |
+
amp: 1
|
| 49 |
+
num_workers: 4
|
| 50 |
+
world_size: 1
|
| 51 |
+
local-rank: -1
|
| 52 |
+
dist_url: env://
|
| 53 |
+
rank: 0
|
| 54 |
+
gpu: 0
|
| 55 |
+
distributed: false
|
| 56 |
+
dist_backend: nccl
|
| 57 |
+
eval_freq: 1
|
| 58 |
+
save_freq: 0.1
|
| 59 |
+
max_checkpoints: 10
|
| 60 |
+
keep_freq: 1
|
| 61 |
+
print_freq: 10
|
| 62 |
+
print_img_freq: 50000000
|
| 63 |
+
num_imgs_vis: 4
|
| 64 |
+
save_dir: /scratch-shared/wwei2/training_upstream/checkpoints
|
| 65 |
+
exp_name: outdoor_v48_4gpu_v2
|
| 66 |
+
task: StreamVGGT
|
| 67 |
+
logdir: ${save_dir}/${exp_name}/logs
|
| 68 |
+
output_dir: ${save_dir}/${exp_name}/
|
outdoor_v48_4gpu_v2/.hydra/hydra.yaml
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
hydra:
|
| 2 |
+
run:
|
| 3 |
+
dir: ${save_dir}/${exp_name}
|
| 4 |
+
sweep:
|
| 5 |
+
dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
| 6 |
+
subdir: ${hydra.job.num}
|
| 7 |
+
launcher:
|
| 8 |
+
_target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
|
| 9 |
+
sweeper:
|
| 10 |
+
_target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
|
| 11 |
+
max_batch_size: null
|
| 12 |
+
params: null
|
| 13 |
+
help:
|
| 14 |
+
app_name: ${hydra.job.name}
|
| 15 |
+
header: '${hydra.help.app_name} is powered by Hydra.
|
| 16 |
+
|
| 17 |
+
'
|
| 18 |
+
footer: 'Powered by Hydra (https://hydra.cc)
|
| 19 |
+
|
| 20 |
+
Use --hydra-help to view Hydra specific help
|
| 21 |
+
|
| 22 |
+
'
|
| 23 |
+
template: '${hydra.help.header}
|
| 24 |
+
|
| 25 |
+
== Configuration groups ==
|
| 26 |
+
|
| 27 |
+
Compose your configuration from those groups (group=option)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
$APP_CONFIG_GROUPS
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
== Config ==
|
| 34 |
+
|
| 35 |
+
Override anything in the config (foo.bar=value)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
$CONFIG
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
${hydra.help.footer}
|
| 42 |
+
|
| 43 |
+
'
|
| 44 |
+
hydra_help:
|
| 45 |
+
template: 'Hydra (${hydra.runtime.version})
|
| 46 |
+
|
| 47 |
+
See https://hydra.cc for more info.
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
== Flags ==
|
| 51 |
+
|
| 52 |
+
$FLAGS_HELP
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
== Configuration groups ==
|
| 56 |
+
|
| 57 |
+
Compose your configuration from those groups (For example, append hydra/job_logging=disabled
|
| 58 |
+
to command line)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
$HYDRA_CONFIG_GROUPS
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Use ''--cfg hydra'' to Show the Hydra config.
|
| 65 |
+
|
| 66 |
+
'
|
| 67 |
+
hydra_help: ???
|
| 68 |
+
hydra_logging:
|
| 69 |
+
version: 1
|
| 70 |
+
formatters:
|
| 71 |
+
simple:
|
| 72 |
+
format: '[%(asctime)s][HYDRA] %(message)s'
|
| 73 |
+
handlers:
|
| 74 |
+
console:
|
| 75 |
+
class: logging.StreamHandler
|
| 76 |
+
formatter: simple
|
| 77 |
+
stream: ext://sys.stdout
|
| 78 |
+
root:
|
| 79 |
+
level: INFO
|
| 80 |
+
handlers:
|
| 81 |
+
- console
|
| 82 |
+
loggers:
|
| 83 |
+
logging_example:
|
| 84 |
+
level: DEBUG
|
| 85 |
+
disable_existing_loggers: false
|
| 86 |
+
job_logging:
|
| 87 |
+
version: 1
|
| 88 |
+
formatters:
|
| 89 |
+
simple:
|
| 90 |
+
format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
|
| 91 |
+
handlers:
|
| 92 |
+
console:
|
| 93 |
+
class: logging.StreamHandler
|
| 94 |
+
formatter: simple
|
| 95 |
+
stream: ext://sys.stdout
|
| 96 |
+
file:
|
| 97 |
+
class: logging.FileHandler
|
| 98 |
+
formatter: simple
|
| 99 |
+
filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
|
| 100 |
+
root:
|
| 101 |
+
level: INFO
|
| 102 |
+
handlers:
|
| 103 |
+
- console
|
| 104 |
+
- file
|
| 105 |
+
disable_existing_loggers: false
|
| 106 |
+
env: {}
|
| 107 |
+
mode: RUN
|
| 108 |
+
searchpath: []
|
| 109 |
+
callbacks: {}
|
| 110 |
+
output_subdir: .hydra
|
| 111 |
+
overrides:
|
| 112 |
+
hydra:
|
| 113 |
+
- hydra.mode=RUN
|
| 114 |
+
task:
|
| 115 |
+
- exp_name=outdoor_v48_4gpu_v2
|
| 116 |
+
job:
|
| 117 |
+
name: mytrain
|
| 118 |
+
chdir: null
|
| 119 |
+
override_dirname: exp_name=outdoor_v48_4gpu_v2
|
| 120 |
+
id: ???
|
| 121 |
+
num: ???
|
| 122 |
+
config_name: outdoor_v48
|
| 123 |
+
env_set: {}
|
| 124 |
+
env_copy: []
|
| 125 |
+
config:
|
| 126 |
+
override_dirname:
|
| 127 |
+
kv_sep: '='
|
| 128 |
+
item_sep: ','
|
| 129 |
+
exclude_keys: []
|
| 130 |
+
runtime:
|
| 131 |
+
version: 1.3.2
|
| 132 |
+
version_base: '1.3'
|
| 133 |
+
cwd: /gpfs/work2/0/prjs0824/qi_proj/slamformer_upstream/src
|
| 134 |
+
config_sources:
|
| 135 |
+
- path: hydra.conf
|
| 136 |
+
schema: pkg
|
| 137 |
+
provider: hydra
|
| 138 |
+
- path: /gpfs/work2/0/prjs0824/qi_proj/slamformer_upstream/config
|
| 139 |
+
schema: file
|
| 140 |
+
provider: main
|
| 141 |
+
- path: ''
|
| 142 |
+
schema: structured
|
| 143 |
+
provider: schema
|
| 144 |
+
output_dir: /scratch-shared/wwei2/training_upstream/checkpoints/outdoor_v48_4gpu_v2
|
| 145 |
+
choices:
|
| 146 |
+
hydra/env: default
|
| 147 |
+
hydra/callbacks: null
|
| 148 |
+
hydra/job_logging: default
|
| 149 |
+
hydra/hydra_logging: default
|
| 150 |
+
hydra/hydra_help: default
|
| 151 |
+
hydra/help: default
|
| 152 |
+
hydra/sweeper: basic
|
| 153 |
+
hydra/launcher: basic
|
| 154 |
+
hydra/output: default
|
| 155 |
+
verbose: true
|
outdoor_v48_4gpu_v2/.hydra/overrides.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
- exp_name=outdoor_v48_4gpu_v2
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/__init__.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .utils.transforms import *
|
| 2 |
+
from .base.batched_sampler import BatchedRandomSampler # noqa
|
| 3 |
+
from .arkitscenes import ARKitScenes_Multi # noqa
|
| 4 |
+
from .arkitscenes_highres import ARKitScenesHighRes_Multi
|
| 5 |
+
from .bedlam import BEDLAM_Multi
|
| 6 |
+
from .blendedmvs import BlendedMVS_Multi # noqa
|
| 7 |
+
from .co3d import Co3d_Multi # noqa
|
| 8 |
+
from .cop3d import Cop3D_Multi
|
| 9 |
+
from .dl3dv import DL3DV_Multi
|
| 10 |
+
from .dynamic_replica import DynamicReplica
|
| 11 |
+
from .eden import EDEN_Multi
|
| 12 |
+
from .hypersim import HyperSim_Multi
|
| 13 |
+
from .hoi4d import HOI4D_Multi
|
| 14 |
+
from .irs import IRS
|
| 15 |
+
from .mapfree import MapFree_Multi
|
| 16 |
+
from .megadepth import MegaDepth_Multi # noqa
|
| 17 |
+
from .mp3d import MP3D_Multi
|
| 18 |
+
from .mvimgnet import MVImgNet_Multi
|
| 19 |
+
from .mvs_synth import MVS_Synth_Multi
|
| 20 |
+
from .omniobject3d import OmniObject3D_Multi
|
| 21 |
+
from .pointodyssey import PointOdyssey_Multi
|
| 22 |
+
from .realestate10k import RE10K_Multi
|
| 23 |
+
from .scannet import ScanNet_Multi
|
| 24 |
+
from .scannetpp import ScanNetpp_Multi # noqa
|
| 25 |
+
from .smartportraits import SmartPortraits_Multi
|
| 26 |
+
from .spring import Spring
|
| 27 |
+
from .synscapes import SynScapes
|
| 28 |
+
from .tartanair import TartanAir_Multi
|
| 29 |
+
from .threedkb import ThreeDKenBurns
|
| 30 |
+
from .uasol import UASOL_Multi
|
| 31 |
+
from .urbansyn import UrbanSyn
|
| 32 |
+
from .unreal4k import UnReal4K_Multi
|
| 33 |
+
from .vkitti2 import VirtualKITTI2_Multi # noqa
|
| 34 |
+
from .waymo import Waymo_Multi # noqa (legacy h5 format)
|
| 35 |
+
from .waymo_v2 import Waymo_v2_Multi # noqa (parquet v2.0.1, with TOP-lidar)
|
| 36 |
+
from .kitti import KITTI_Multi # noqa (KITTI odometry + Velodyne)
|
| 37 |
+
from .kitti360 import KITTI360_Multi # noqa (KITTI-360 + Velodyne)
|
| 38 |
+
from .wildrgbd import WildRGBD_Multi # noqa
|
| 39 |
+
|
| 40 |
+
from .habitat_hm3d import HabitatHM3D_Multi
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
from accelerate import Accelerator
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def get_data_loader(
    dataset,
    batch_size,
    num_workers=8,
    shuffle=True,
    drop_last=True,
    pin_mem=True,
    accelerator: "Accelerator" = None,
    fixed_length=False,
):
    """Build a ``torch.utils.data.DataLoader`` for ``dataset``.

    If the dataset provides ``make_sampler`` the returned batch sampler drives
    the loader; otherwise (the attribute is missing, or the call raises
    ``AttributeError`` / ``NotImplementedError``) a plain loader using
    ``batch_size`` / ``shuffle`` / ``drop_last`` is returned.

    Args:
        dataset: a dataset object, or a string that is evaluated in this
            module's namespace to build one.
        batch_size: batch size handed to the sampler or to the plain loader.
        num_workers: number of DataLoader worker processes.
        shuffle: forwarded to ``make_sampler`` or to the plain loader.
        drop_last: forwarded to ``make_sampler`` or to the plain loader.
        pin_mem: value for the DataLoader ``pin_memory`` option.
        accelerator: object whose ``num_processes`` is used as the sampler's
            ``world_size``. When ``None`` a world size of 1 is used.
        fixed_length: forwarded to ``make_sampler``.

    Returns:
        The constructed ``torch.utils.data.DataLoader``.
    """
    import torch

    if isinstance(dataset, str):
        # NOTE(review): eval() runs arbitrary code; only feed it trusted
        # configuration strings.
        dataset = eval(dataset)

    # Resolve the world size outside the try block: previously a missing
    # accelerator raised AttributeError inside it and silently disabled the
    # dataset's own sampler.
    world_size = accelerator.num_processes if accelerator is not None else 1

    try:
        sampler = dataset.make_sampler(
            batch_size,
            shuffle=shuffle,
            drop_last=drop_last,
            world_size=world_size,
            fixed_length=fixed_length,
        )
    except (AttributeError, NotImplementedError):
        # Sampler not available for this dataset: use the default batching.
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            pin_memory=pin_mem,
            drop_last=drop_last,
        )

    # Built outside the try block so that errors raised while constructing
    # the loader are reported instead of triggering the fallback.
    return torch.utils.data.DataLoader(
        dataset,
        batch_sampler=sampler,
        num_workers=num_workers,
        pin_memory=pin_mem,
    )
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/arkitscenes.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import itertools
|
| 5 |
+
|
| 6 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 7 |
+
import cv2
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_cv2, imread_pil
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def stratified_sampling(indices, num_samples, rng=None):
    """Pick ``num_samples`` entries of ``indices``, one per equal-width stratum.

    The sorted indices are cut into ``num_samples`` consecutive strata of
    (fractional) width ``len(indices) / num_samples``; one entry is drawn
    uniformly from each stratum and the picks are returned in random order.

    Args:
        indices: sized collection of sortable values.
        num_samples: number of entries to pick.
        rng: ``numpy.random.Generator``; a fresh default generator is created
            when omitted.

    Returns:
        ``indices`` itself (unchanged, not permuted) when ``num_samples``
        equals its length; otherwise the array produced by
        ``rng.permutation`` over the picks (empty when ``num_samples <= 0``).

    Raises:
        ValueError: if ``num_samples`` exceeds ``len(indices)``.
    """
    total = len(indices)
    if num_samples > total:
        raise ValueError("num_samples cannot exceed the number of available indices.")
    if num_samples == total:
        return indices

    if rng is None:
        rng = np.random.default_rng()

    # Nothing to draw; also avoids dividing by zero when computing the stride.
    if num_samples <= 0:
        return rng.permutation([])

    ordered = sorted(indices)
    stride = total / num_samples
    picks = []
    for k in range(num_samples):
        lo = int(k * stride)
        # Clamp so the stratum never runs past the end of the list.
        hi = min(int((k + 1) * stride), total)
        if lo < hi:
            picks.append(ordered[rng.integers(lo, hi)])
        else:
            # Rounding fallback: take the last index.
            picks.append(ordered[-1])

    return rng.permutation(picks)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class ARKitScenes_Multi(BaseMultiViewDataset):
    """Multi-view dataset reading ARKitScenes data from ``ROOT/<split>/<scene>``.

    Each dataset item corresponds to one image group listed in a scene's
    ``new_scene_metadata.npz`` (``image_collection``); ``_get_views`` returns
    either a sequence sampled from a scene's frame range or views drawn from
    that group.
    """

    def __init__(self, *args, split, ROOT, **kwargs):
        """Store dataset options and index every scene of the chosen split.

        Args:
            split: ``"train"`` (maps to the ``Training`` folder) or ``"test"``
                (maps to the ``Test`` folder).
            ROOT: root directory containing the split folders.
            *args, **kwargs: forwarded to ``BaseMultiViewDataset``.

        Raises:
            ValueError: if ``split`` is neither ``"train"`` nor ``"test"``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 8
        super().__init__(*args, **kwargs)
        if split == "train":
            self.split = "Training"
        elif split == "test":
            self.split = "Test"
        else:
            raise ValueError(
                f"Unknown split {split!r}; expected 'train' or 'test'."
            )

        self.loaded_data = self._load_data(self.split)
        print('DATA: arkit', len(self))

    def _load_data(self, split):
        """Read per-scene metadata and build the flat per-image lookup lists.

        Populates ``self.scenes``, ``self.sceneids``, ``self.id_ranges``,
        ``self.images``, ``self.intrinsics``, ``self.trajectories`` (all but
        ``scenes`` hold one entry per image, indexed by a global image id)
        and ``self.groups`` (one sorted list of global image ids per group).
        Scenes with too few images, or with no sufficiently large group, are
        skipped. Returns ``None``.
        """
        with np.load(osp.join(self.ROOT, split, "all_metadata.npz")) as data:
            self.scenes: np.ndarray = data["scenes"]

        # Minimum number of images a scene / a group must provide.
        min_len = (
            self.num_views
            if not self.allow_repeat
            else max(self.num_views // 3, 3)
        )

        offset = 0  # global id of the first image of the current scene
        scenes = []
        sceneids = []
        images = []
        intrinsics = []
        trajectories = []
        groups = []
        id_ranges = []
        j = 0  # index of the current scene among the kept scenes
        for scene in self.scenes:
            scene_dir = osp.join(self.ROOT, self.split, scene)
            with np.load(
                osp.join(scene_dir, "new_scene_metadata.npz"), allow_pickle=True
            ) as data:
                imgs = data["images"]
                intrins = data["intrinsics"]
                traj = data["trajectories"]
                if len(imgs) < min_len:
                    print(f"Skipping {scene}")
                    continue

                assert "image_collection" in data, "Image collection not found"
                image_collection = data["image_collection"].item()

                num_imgs = imgs.shape[0]
                img_groups = []
                for ref_id, group in image_collection.items():
                    # +1 accounts for the reference image itself.
                    if len(group) + 1 < min_len:
                        continue

                    # groups are (idx, score)s; build the member list without
                    # mutating the loaded metadata.
                    members = [ref_id] + [x[0] for x in group]
                    img_groups.append(sorted(int(m + offset) for m in members))

                if len(img_groups) == 0:
                    print(f"Skipping {scene}")
                    continue

                scenes.append(scene)
                sceneids.extend([j] * num_imgs)
                id_ranges.extend(
                    [(offset, offset + num_imgs) for _ in range(num_imgs)]
                )
                images.extend(imgs)

                # Each intrinsics row unpacks into six values; positions 2..5
                # are used as fx, fy, cx, cy.
                K = np.expand_dims(np.eye(3), 0).repeat(num_imgs, 0)
                K[:, 0, 0] = [fx for _, _, fx, _, _, _ in intrins]
                K[:, 1, 1] = [fy for _, _, _, fy, _, _ in intrins]
                K[:, 0, 2] = [cx for _, _, _, _, cx, _ in intrins]
                K[:, 1, 2] = [cy for _, _, _, _, _, cy in intrins]
                intrinsics.extend(list(K))
                trajectories.extend(list(traj))

                groups.extend(img_groups)
                offset += num_imgs
                j += 1

        self.scenes = scenes
        self.sceneids = sceneids
        self.id_ranges = id_ranges
        self.images = images
        self.intrinsics = intrinsics
        self.trajectories = trajectories
        self.groups = groups

    def __len__(self):
        """Return the number of image groups."""
        return len(self.groups)

    def get_image_num(self):
        """Return the total number of indexed images."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views for item ``idx``.

        With probability 0.5 the views come from ``get_seq_from_start_id``
        over a scene's image id range; otherwise they are drawn from group
        ``idx`` (with repetition when the group is not larger than
        ``num_views``).

        Returns:
            list of ``num_views`` dicts (image, depth map, pose, intrinsics,
            masks and bookkeeping flags).
        """
        if rng.choice([True, False]):
            # NOTE(review): idx ranges over self.groups (see __len__) while
            # id_ranges holds one entry per image, so the range picked here
            # may belong to a different scene than group idx -- confirm this
            # is intended.
            image_idxs = np.arange(self.id_ranges[idx][0], self.id_ranges[idx][1])
            cut_off = num_views if not self.allow_repeat else max(num_views // 3, 3)
            start_image_idxs = image_idxs[: len(image_idxs) - cut_off + 1]
            start_id = rng.choice(start_image_idxs)
            pos, ordered_video = self.get_seq_from_start_id(
                num_views,
                start_id,
                image_idxs.tolist(),
                rng,
                max_interval=self.max_interval,
                video_prob=0.8,
                fix_interval_prob=0.5,
                block_shuffle=16,
            )
            image_idxs = np.array(image_idxs)[pos]
        else:
            ordered_video = False
            image_idxs = self.groups[idx]
            image_idxs = rng.permutation(image_idxs)
            if len(image_idxs) > num_views:
                image_idxs = image_idxs[:num_views]
            else:
                if rng.random() < 0.8:
                    image_idxs = rng.choice(image_idxs, size=num_views, replace=True)
                else:
                    repeat_num = num_views // len(image_idxs) + 1
                    image_idxs = np.tile(image_idxs, repeat_num)[:num_views]

        views = []
        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id])

            intrinsics = self.intrinsics[view_idx]
            camera_pose = self.trajectories[view_idx]
            basename = self.images[view_idx]
            # File names are expected to start with the 8-character scene name.
            assert (
                basename[:8] == self.scenes[scene_id]
            ), f"{basename}, {self.scenes[scene_id]}"
            # Load RGB image
            rgb_image = imread_pil(
                osp.join(scene_dir, "vga_wide", basename.replace(".png", ".jpg"))
            )
            # Load depthmap
            depthmap = imread_cv2(
                osp.join(scene_dir, "lowres_depth", basename), cv2.IMREAD_UNCHANGED
            )
            # presumably millimetres -> metres; TODO confirm the stored unit
            depthmap = depthmap.astype(np.float32) / 1000.0
            depthmap[~np.isfinite(depthmap)] = 0  # invalid

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
            )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap.astype(np.float32),
                    camera_pose=camera_pose.astype(np.float32),
                    camera_intrinsics=intrinsics.astype(np.float32),
                    dataset="arkitscenes",
                    label=self.scenes[scene_id] + "_" + basename,
                    instance=f"{str(idx)}_{str(view_idx)}",
                    is_metric=self.is_metric,
                    is_video=ordered_video,
                    quantile=np.array(0.98, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/arkitscenes_highres.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import itertools
|
| 5 |
+
|
| 6 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 7 |
+
import cv2
|
| 8 |
+
import numpy as np
|
| 9 |
+
import h5py
|
| 10 |
+
import math
|
| 11 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 12 |
+
from dust3r.utils.image import imread_cv2
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ARKitScenesHighRes_Multi(BaseMultiViewDataset):
    """Multi-view dataset reading high-resolution ARKitScenes data.

    Scenes are the sub-directories of ``ROOT/<split>``; each dataset item is
    a possible start image of a sequence inside one scene.
    """

    def __init__(self, *args, split, ROOT, **kwargs):
        """Store dataset options and index every scene of the chosen split.

        Args:
            split: ``"train"`` (maps to the ``Training`` folder) or ``"test"``
                (maps to the ``Validation`` folder).
            ROOT: root directory containing the split folders.
            *args, **kwargs: forwarded to ``BaseMultiViewDataset``.

        Raises:
            ValueError: if ``split`` is neither ``"train"`` nor ``"test"``.
        """
        self.ROOT = ROOT
        self.video = True
        self.max_interval = 8
        self.is_metric = True
        super().__init__(*args, **kwargs)
        if split == "train":
            self.split = "Training"
        elif split == "test":
            self.split = "Validation"
        else:
            raise ValueError(
                f"Unknown split {split!r}; expected 'train' or 'test'."
            )

        self.loaded_data = self._load_data(self.split)

    def _load_data(self, split):
        """Read per-scene metadata and build the flat per-image lookup lists.

        Populates ``self.scenes``, ``self.sceneids``, ``self.images``,
        ``self.scene_img_list``, ``self.intrinsics``, ``self.trajectories``
        and ``self.start_img_ids``. Images of a scene are ordered by file
        name; scenes with too few images are skipped. Returns ``None``.
        """
        split_dir = osp.join(self.ROOT, split)
        all_scenes = sorted(
            d for d in os.listdir(split_dir) if osp.isdir(osp.join(split_dir, d))
        )

        # Minimum number of images a scene must provide.
        cut_off = (
            self.num_views
            if not self.allow_repeat
            else max(self.num_views // 3, 3)
        )

        offset = 0  # global id of the first image of the current scene
        scenes = []
        sceneids = []
        images = []
        start_img_ids = []
        scene_img_list = []
        intrinsics = []
        trajectories = []
        scene_id = 0
        for scene in all_scenes:
            scene_dir = osp.join(self.ROOT, self.split, scene)
            with np.load(osp.join(scene_dir, "scene_metadata.npz")) as data:
                names = data["images"]
                # Sort by file name, remembering each image's original row so
                # intrinsics and trajectories can be reordered consistently.
                imgs_with_indices = sorted(enumerate(names), key=lambda x: x[1])
                imgs = [x[1] for x in imgs_with_indices]
                if len(imgs) < cut_off:
                    print(f"Skipping {scene}")
                    continue
                indices = [x[0] for x in imgs_with_indices]
                # all(...) is required here: asserting the bare list was
                # always true for a non-empty scene.
                assert all(img[:8] == scene for img in imgs), f"{scene}, {imgs}"
                num_imgs = names.shape[0]
                img_ids = list(np.arange(num_imgs) + offset)
                # Only ids leaving at least cut_off images after them may
                # start a sequence.
                start_img_ids_ = img_ids[: num_imgs - cut_off + 1]

                scenes.append(scene)
                scene_img_list.append(img_ids)
                sceneids.extend([scene_id] * num_imgs)
                images.extend(imgs)
                start_img_ids.extend(start_img_ids_)

                # Each intrinsics row unpacks into six values; positions 2..5
                # are used as fx, fy, cx, cy.
                K = np.expand_dims(np.eye(3), 0).repeat(num_imgs, 0)
                intrins = data["intrinsics"][indices]
                K[:, 0, 0] = [fx for _, _, fx, _, _, _ in intrins]
                K[:, 1, 1] = [fy for _, _, _, fy, _, _ in intrins]
                K[:, 0, 2] = [cx for _, _, _, _, cx, _ in intrins]
                K[:, 1, 2] = [cy for _, _, _, _, _, cy in intrins]
                intrinsics.extend(list(K))
                trajectories.extend(list(data["trajectories"][indices]))

                offset += num_imgs
                scene_id += 1

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.scene_img_list = scene_img_list
        self.intrinsics = intrinsics
        self.trajectories = trajectories
        self.start_img_ids = start_img_ids
        assert len(self.images) == len(self.intrinsics) == len(self.trajectories)

    def __len__(self):
        """Return the number of possible sequence start images."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed images."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views of the sequence starting at item ``idx``.

        Returns:
            list of ``num_views`` dicts (image, depth map, pose, intrinsics,
            masks and bookkeeping flags).
        """
        start_id = self.start_img_ids[idx]
        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            start_id,
            all_image_ids,
            rng,
            max_interval=self.max_interval,
            block_shuffle=16,
        )
        image_idxs = np.array(all_image_ids)[pos]

        views = []

        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id])

            intrinsics = self.intrinsics[view_idx]
            camera_pose = self.trajectories[view_idx]
            basename = self.images[view_idx]
            # File names are expected to start with the 8-character scene name.
            assert (
                basename[:8] == self.scenes[scene_id]
            ), f"{basename}, {self.scenes[scene_id]}"
            # Load RGB image
            rgb_image = imread_cv2(
                osp.join(scene_dir, "vga_wide", basename.replace(".png", ".jpg"))
            )
            # Load depthmap
            depthmap = imread_cv2(
                osp.join(scene_dir, "highres_depth", basename), cv2.IMREAD_UNCHANGED
            )
            # presumably millimetres -> metres; TODO confirm the stored unit
            depthmap = depthmap.astype(np.float32) / 1000.0
            depthmap[~np.isfinite(depthmap)] = 0  # invalid

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.7, 0.25, 0.05]
            )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap.astype(np.float32),
                    camera_pose=camera_pose.astype(np.float32),
                    camera_intrinsics=intrinsics.astype(np.float32),
                    dataset="arkitscenes_highres",
                    label=self.scenes[scene_id] + "_" + basename,
                    instance=f"{str(idx)}_{str(view_idx)}",
                    is_metric=self.is_metric,
                    is_video=ordered_video,
                    quantile=np.array(0.99, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/base/__init__.py
ADDED
|
File without changes
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/base/base_multiview_dataset.py
ADDED
|
@@ -0,0 +1,576 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import PIL
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch
|
| 4 |
+
import random
|
| 5 |
+
import itertools
|
| 6 |
+
from dust3r.datasets.base.easy_dataset import EasyDataset
|
| 7 |
+
from dust3r.datasets.utils.transforms import ImgNorm, SeqColorJitter
|
| 8 |
+
from dust3r.utils.geometry import depthmap_to_absolute_camera_coordinates
|
| 9 |
+
import dust3r.datasets.utils.cropping as cropping
|
| 10 |
+
from dust3r.datasets.utils.corr import extract_correspondences_from_pts3d
|
| 11 |
+
|
| 12 |
+
from vggt.train_utils.augmentation import get_image_augmentation
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_ray_map(c2w1, c2w2, intrinsics, h, w):
    """Build an ``(h, w, 6)`` ray map for view 2 expressed in view 1's frame.

    The relative pose ``inv(c2w1) @ c2w2`` is applied to the homogeneous
    points obtained by back-projecting every pixel ``(x, y, 1)`` through
    ``inv(intrinsics)``; the transformed points are scaled to unit length.

    Args:
        c2w1: 4x4 matrix of the reference view.
        c2w2: 4x4 matrix of the view the map is built for.
        intrinsics: 3x3 camera matrix.
        h: number of rows of the map.
        w: number of columns of the map.

    Returns:
        Array of shape ``(h, w, 6)``: channels 0-2 hold the translation of
        the relative pose (identical for every pixel), channels 3-5 hold the
        unit-length transformed points.
    """
    rel_pose = np.linalg.inv(c2w1) @ c2w2

    # Homogeneous pixel coordinates (x, y, 1), one row per pixel.
    cols, rows = np.meshgrid(np.arange(w), np.arange(h), indexing="xy")
    pixels_h = np.stack([cols, rows, np.ones_like(cols)], axis=-1).reshape(-1, 3)

    # Back-project through the inverse intrinsics, then append a ones row so
    # the 4x4 relative pose can be applied.
    cam_pts = np.linalg.inv(intrinsics) @ pixels_h.T
    cam_pts_h = np.vstack([cam_pts, np.ones_like(cam_pts[0])])
    moved = (rel_pose @ cam_pts_h).T[:, :3].reshape(h, w, 3)
    directions = moved / np.linalg.norm(moved, axis=-1, keepdims=True)

    origins = np.broadcast_to(rel_pose[:3, 3], (h, w, 3))
    return np.concatenate([origins, directions], axis=-1)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class BaseMultiViewDataset(EasyDataset):
|
| 30 |
+
"""Define all basic options.
|
| 31 |
+
|
| 32 |
+
Usage:
|
| 33 |
+
class MyDataset (BaseMultiViewDataset):
|
| 34 |
+
def _get_views(self, idx, rng):
|
| 35 |
+
# overload here
|
| 36 |
+
views = []
|
| 37 |
+
views.append(dict(img=, ...))
|
| 38 |
+
return views
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
def __init__(
    self,
    *,  # only keyword arguments
    num_views=None,
    split=None,
    resolution=None,  # square_size or (width, height) or list of [(width,height), ...]
    transform=ImgNorm,
    aug_crop=False,
    n_corres=0,
    nneg=0,
    seed=None,
    allow_repeat=False,
    seq_aug_crop=False,
):
    """Store the options shared by all multi-view datasets.

    Args:
        num_views: number of views per sample (required).
        split: split identifier, stored as ``self.split``.
        resolution: passed to ``self._set_resolutions``.
        transform: image transform, or a string evaluated to obtain one.
            Passing the ``SeqColorJitter`` class itself instantiates it and
            sets ``self.is_seq_color_jitter``.
        aug_crop: stored as ``self.aug_crop``.
        n_corres: ``"all"``, an int, or a list with ``num_views`` entries.
        nneg: must be 0 when ``n_corres`` is ``"all"``.
        seed: stored as ``self.seed``.
        allow_repeat: stored as ``self.allow_repeat``.
        seq_aug_crop: stored as ``self.seq_aug_crop``.
    """
    assert num_views is not None, "undefined num_views"
    self.num_views = num_views
    self.split = split
    self._set_resolutions(resolution)

    self.n_corres = n_corres
    self.nneg = nneg
    n_corres_is_valid = (
        self.n_corres == "all"
        or isinstance(self.n_corres, int)
        or (isinstance(self.n_corres, list) and len(self.n_corres) == self.num_views)
    )
    assert (
        n_corres_is_valid
    ), f"Error, n_corres should either be 'all', a single integer or a list of length {self.num_views}"
    assert (
        self.nneg == 0 or self.n_corres != "all"
    ), "nneg should be 0 if n_corres is all"

    self.is_seq_color_jitter = False
    if isinstance(transform, str):
        # NOTE(review): eval() runs arbitrary code; only trusted config
        # strings should reach this point.
        transform = eval(transform)
    if transform == SeqColorJitter:
        # The class itself was passed: instantiate it and remember that the
        # jitter is sequence-level.
        transform = SeqColorJitter()
        self.is_seq_color_jitter = True
    self.transform = transform

    jitter_cfg = {
        'brightness': 0.5,
        'contrast': 0.5,
        'saturation': 0.5,
        'hue': 0.1,
        'p': 0.9,
    }
    self.image_aug = get_image_augmentation(
        color_jitter=jitter_cfg,
        gray_scale=True,
        gau_blur=False,
    )

    self.aug_crop = aug_crop
    self.seed = seed
    self.allow_repeat = allow_repeat
    self.seq_aug_crop = seq_aug_crop
|
| 97 |
+
|
| 98 |
+
def __len__(self):
    """Return the number of entries in ``self.scenes``."""
    scene_list = self.scenes
    return len(scene_list)
|
| 100 |
+
|
| 101 |
+
@staticmethod
def efficient_random_intervals(
    start,
    num_elements,
    interval_range,
    fixed_interval_prob=0.8,
    weights=None,
    seed=42,
):
    """Return ``num_elements`` increasing positions beginning at ``start``.

    With probability ``fixed_interval_prob`` one step is drawn from
    ``interval_range`` and reused between all consecutive positions;
    otherwise every step is drawn independently. Draws use the global
    ``random`` module state.

    Args:
        start: first position of the result.
        num_elements: number of positions to produce.
        interval_range: population of candidate steps.
        fixed_interval_prob: probability of using one shared step.
        weights: optional weights passed to ``random.choices``.
        seed: accepted but not used by this function.

    Returns:
        list of cumulative positions, ``start`` first.
    """
    step_count = num_elements - 1

    def draw_step():
        return random.choices(interval_range, weights=weights)[0]

    use_fixed_step = random.random() < fixed_interval_prob
    if use_fixed_step:
        steps = [draw_step()] * step_count
    else:
        steps = [draw_step() for _ in range(step_count)]

    # Running sum of the steps, seeded with the start position.
    positions = [start]
    for step in steps:
        positions.append(positions[-1] + step)
    return positions
|
| 120 |
+
|
| 121 |
+
def sample_based_on_timestamps(self, i, timestamps, num_views, interval=1):
    """Enumerate view-index selections taken from frames close in time to ``i``.

    Candidates are the indices whose timestamp differs from
    ``timestamps[i]`` by less than ``interval``. The result combines:
    random disjoint selections of ``num_views`` candidates, one selection
    completing the leftover candidates, one selection drawn with replacement
    (only when ``self.allow_repeat``), and every fixed-stride selection
    grown alternately to the right and left of ``i``.
    Random draws use the global ``np.random`` state.

    Args:
        i: index of the reference frame in ``timestamps``.
        timestamps: numpy array of timestamps.
        num_views: number of indices per selection.
        interval: exclusive bound on the timestamp difference.

    Returns:
        list of sorted index lists, each of length ``num_views``; empty when
        there are too few candidates (fewer than ``num_views``, or fewer
        than ``num_views // 3`` when ``self.allow_repeat`` is set).
    """
    if num_views <= 0:
        # Nothing to select; also prevents the selection loop below from
        # spinning forever on empty draws.
        return []

    time_diffs = np.abs(timestamps - timestamps[i])
    ids_candidate = np.sort(np.where(time_diffs < interval)[0])

    # With repetition allowed a third of the views is enough. The previous
    # condition also required num_views candidates in that case, which made
    # the allow_repeat relaxation unreachable.
    min_candidates = num_views // 3 if self.allow_repeat else num_views
    if len(ids_candidate) < max(min_candidates, 1):
        return []

    ids_sel_list = []
    ids_candidate_left = ids_candidate.copy()
    # Disjoint random selections while enough unused candidates remain.
    while len(ids_candidate_left) >= num_views:
        ids_sel = np.random.choice(ids_candidate_left, num_views, replace=False)
        ids_sel_list.append(sorted(ids_sel))
        ids_candidate_left = np.setdiff1d(ids_candidate_left, ids_sel)

    # Complete the leftover candidates with already used ones.
    if len(ids_candidate_left) > 0 and len(ids_candidate) >= num_views:
        ids_sel = np.concatenate(
            [
                ids_candidate_left,
                np.random.choice(
                    np.setdiff1d(ids_candidate, ids_candidate_left),
                    num_views - len(ids_candidate_left),
                    replace=False,
                ),
            ]
        )
        ids_sel_list.append(sorted(ids_sel))

    if self.allow_repeat:
        ids_sel_list.append(
            sorted(np.random.choice(ids_candidate, num_views, replace=True))
        )

    # add sequences with fixed intervals (all possible intervals)
    pos_i = np.where(ids_candidate == i)[0][0]
    curr_interval = 1
    stop = len(ids_candidate) < num_views
    while not stop:
        pos_sel = [pos_i]
        count = 0
        while len(pos_sel) < num_views:
            if count % 2 == 0:
                # Even step: extend to the right of the current selection.
                curr_pos_i = pos_sel[-1] + curr_interval
                if curr_pos_i >= len(ids_candidate):
                    stop = True
                    break
                pos_sel.append(curr_pos_i)
            else:
                # Odd step: extend to the left of the current selection.
                curr_pos_i = pos_sel[0] - curr_interval
                if curr_pos_i < 0:
                    stop = True
                    break
                pos_sel.insert(0, curr_pos_i)
            count += 1
        if not stop and len(pos_sel) == num_views:
            ids_sel = sorted([ids_candidate[pos] for pos in pos_sel])
            if ids_sel not in ids_sel_list:
                ids_sel_list.append(ids_sel)
        curr_interval += 1
    return ids_sel_list
|
| 181 |
+
|
| 182 |
+
@staticmethod
|
| 183 |
+
def blockwise_shuffle(x, rng, block_shuffle):
|
| 184 |
+
if block_shuffle is None:
|
| 185 |
+
return rng.permutation(x).tolist()
|
| 186 |
+
else:
|
| 187 |
+
assert block_shuffle > 0
|
| 188 |
+
blocks = [x[i : i + block_shuffle] for i in range(0, len(x), block_shuffle)]
|
| 189 |
+
shuffled_blocks = [rng.permutation(block).tolist() for block in blocks]
|
| 190 |
+
shuffled_list = [item for block in shuffled_blocks for item in block]
|
| 191 |
+
return shuffled_list
|
| 192 |
+
|
| 193 |
+
def get_seq_from_start_id(
    self,
    num_views,
    id_ref,
    ids_all,
    rng,
    min_interval=1,
    max_interval=25,
    video_prob=0.5,
    fix_interval_prob=0.5,
    block_shuffle=None,
):
    """
    Sample ``num_views`` positions in ``ids_all`` starting from ``id_ref``.

    args:
        num_views: number of views to return
        id_ref: the reference id (first id); must be contained in ids_all
        ids_all: all the ids (``.index`` is called on it, so a list)
        rng: random number generator (``choice`` and ``random`` are used)
        min_interval: minimum interval between two views (> 0)
        max_interval: maximum interval between two views
        video_prob: probability of sampling an ordered, video-style sequence
        fix_interval_prob: probability of using one constant interval
        block_shuffle: forwarded to ``blockwise_shuffle`` for non-video samples
    returns:
        pos: positions of the views in ids_all, i.e., index for ids_all
        is_video: True if the views were sampled as an ordered sequence
    """
    assert min_interval > 0, f"min_interval should be > 0, got {min_interval}"
    assert (
        min_interval <= max_interval
    ), f"min_interval should be <= max_interval, got {min_interval} and {max_interval}"
    assert id_ref in ids_all
    pos_ref = ids_all.index(id_ref)
    all_possible_pos = np.arange(pos_ref, len(ids_all))

    # Number of ids that come after the reference id.
    remaining_sum = len(ids_all) - 1 - pos_ref

    if remaining_sum >= num_views - 1:
        # Enough ids after the reference to pick num_views distinct positions.
        if remaining_sum == num_views - 1:
            # Exactly enough: the only option is the consecutive tail.
            assert ids_all[-num_views] == id_ref
            return [pos_ref + i for i in range(num_views)], True
        # NOTE(review): with num_views == 1 and remaining_sum > 0 the divisions
        # by (num_views - 1) below raise ZeroDivisionError - confirm callers
        # never request a single view here.
        max_interval = min(max_interval, 2 * remaining_sum // (num_views - 1))
        intervals = [
            rng.choice(range(min_interval, max_interval + 1))
            for _ in range(num_views - 1)
        ]

        # if video or collection
        if rng.random() < video_prob:
            # if fixed interval or random
            if rng.random() < fix_interval_prob:
                # regular interval
                fixed_interval = rng.choice(
                    range(
                        1,
                        min(remaining_sum // (num_views - 1) + 1, max_interval + 1),
                    )
                )
                intervals = [fixed_interval for _ in range(num_views - 1)]
            is_video = True
        else:
            is_video = False

        # Turn intervals into absolute positions, drop those past the end and
        # refill the missing ones from unused positions at/after pos_ref.
        pos = list(itertools.accumulate([pos_ref] + intervals))
        pos = [p for p in pos if p < len(ids_all)]
        pos_candidates = [p for p in all_possible_pos if p not in pos]
        pos = (
            pos
            + rng.choice(
                pos_candidates, num_views - len(pos), replace=False
            ).tolist()
        )

        pos = (
            sorted(pos)
            if is_video
            else self.blockwise_shuffle(pos, rng, block_shuffle)
        )
    #elif remaining_sum>1:
    else:
        # Fewer than num_views - 1 ids follow the reference: positions repeat.
        # NOTE(review): remaining_sum == 1 makes (uniq_num - 1) zero, giving
        # ZeroDivisionError below; remaining_sum == 0 gives an empty choice
        # range or `num_views // 0`. The commented-out `elif` above suggests
        # this was known - confirm how callers avoid these inputs.
        # assert self.allow_repeat
        uniq_num = remaining_sum
        new_pos_ref = rng.choice(np.arange(pos_ref + 1))
        new_remaining_sum = len(ids_all) - 1 - new_pos_ref
        new_max_interval = min(max_interval, new_remaining_sum // (uniq_num - 1))
        new_intervals = [
            rng.choice(range(1, new_max_interval + 1)) for _ in range(uniq_num - 1)
        ]

        # Both values are drawn before the branches so the rng stream is
        # consumed identically whichever branch is taken.
        revisit_random = rng.random()
        video_random = rng.random()

        if rng.random() < fix_interval_prob and video_random < video_prob:
            # regular interval
            fixed_interval = rng.choice(range(1, new_max_interval + 1))
            new_intervals = [fixed_interval for _ in range(uniq_num - 1)]
        pos = list(itertools.accumulate([new_pos_ref] + new_intervals))

        is_video = False
        if revisit_random < 0.5 or video_prob == 1.0:  # revisit, video / collection
            is_video = video_random < video_prob
            pos = (
                self.blockwise_shuffle(pos, rng, block_shuffle)
                if not is_video
                else pos
            )
            # Tile the unique positions until num_views entries are reached.
            num_full_repeat = num_views // uniq_num
            pos = (
                pos * num_full_repeat
                + pos[: num_views - len(pos) * num_full_repeat]
            )
        elif revisit_random < 0.9:  # random
            # NOTE(review): here pos is whatever rng.choice returns (presumably
            # an ndarray for numpy generators), not a list - confirm callers cope.
            pos = rng.choice(pos, num_views, replace=True)
        else:  # ordered
            pos = sorted(rng.choice(pos, num_views, replace=True))
    assert len(pos) == num_views
    return pos, is_video
|
| 306 |
+
|
| 307 |
+
def get_img_and_ray_masks(self, is_metric, v, rng, p=(0.8, 0.15, 0.05)):
    """Generate the image mask and ray-map mask for one view.

    args:
        is_metric: when falsy, the view always uses the image only
        v: index of the view in its sequence; view 0 always uses the image only
        rng: generator exposing ``random()``; consulted only when ``v != 0``
            and ``is_metric`` is truthy
        p: indexable of probabilities; ``p[0]`` is the image-only share and
            ``p[1]`` the ray-map-only share. ``p[2]`` is not read: the
            both-inputs case takes whatever probability mass remains.
    returns:
        (img_mask, raymap_mask) as booleans
    """
    # The default for `p` is a tuple so that it cannot be mutated across calls.
    if v == 0 or (not is_metric):
        return True, False

    rand_val = rng.random()
    if rand_val < p[0]:
        return True, False  # image only
    if rand_val < p[0] + p[1]:
        return False, True  # ray map only
    return True, True  # both inputs
|
| 324 |
+
|
| 325 |
+
def get_stats(self):
    """Return a short human-readable size summary based on ``len(self)``."""
    n_groups = len(self)
    return f"{n_groups} groups of views"
|
| 327 |
+
|
| 328 |
+
def __repr__(self):
    """Return a compact one-line description of the dataset configuration."""
    # Resolutions are rendered as "[WxH;WxH;...]".
    resolutions_str = "[" + ";".join(f"{w}x{h}" for w, h in self._resolutions) + "]"
    # The `{self.attr=}` f-string form emits "self.attr=<repr>", so the
    # "self." prefix is stripped afterwards, followed by newlines and the
    # whitespace pattern.
    # NOTE(review): as visible here the last pattern is a single space, which
    # would also delete the spaces CatDataset.__repr__ matches on in its
    # transform string - confirm the literal's exact width in the real file.
    return (
        f"""{type(self).__name__}({self.get_stats()},
{self.num_views=},
{self.split=},
{self.seed=},
resolutions={resolutions_str},
{self.transform=})""".replace(
            "self.", ""
        )
        .replace("\n", "")
        .replace(" ", "")
    )
|
| 342 |
+
|
| 343 |
+
def _get_views(self, idx, resolution, rng, num_views):
    """Hook that concrete datasets must override; this version always raises.

    ``__getitem__`` calls it as ``_get_views(idx, resolution, self._rng, nview)``
    and expects a sequence of ``nview`` view dicts, each providing at least
    "img", "depthmap" and "camera_intrinsics".

    raises:
        NotImplementedError: always
    """
    raise NotImplementedError()
|
| 345 |
+
|
| 346 |
+
def __getitem__(self, idx):
    """Load one group of views and attach the derived per-view fields.

    args:
        idx: either a plain sample index (only valid with a single
            resolution) or a triple ``(sample_idx, ar_idx, nview)`` giving the
            sample, the index into ``self._resolutions`` and the number of views
    returns:
        the views from ``_get_views``, each extended with "idx", "true_shape",
        "sky_mask", "ray_map", "pts3d", "pts3d_local", "valid_mask", "rng",
        a transformed "img", and "corres"/"valid_corres" when
        ``self.n_corres > 0``
    """
    # print("Receiving:" , idx)
    if isinstance(idx, (tuple, list, np.ndarray)):
        # the idx is specifying the aspect-ratio
        idx, ar_idx, nview = idx
    else:
        assert len(self._resolutions) == 1
        ar_idx = 0
        nview = self.num_views

    assert nview >= 1 and nview <= self.num_views
    # set-up the rng
    # NOTE(review): the truthiness test means seed == 0 takes the unseeded
    # path below - confirm that is intended.
    if self.seed:  # reseed for each __getitem__
        self._rng = np.random.default_rng(seed=self.seed + idx)
    elif not hasattr(self, "_rng"):
        seed = torch.randint(0, 2**32, (1,)).item()
        self._rng = np.random.default_rng(seed=seed)

    # One crop offset shared by the whole sequence; read back in
    # _crop_resize_if_necessary when seq_aug_crop is set.
    if self.aug_crop > 1 and self.seq_aug_crop:
        self.delta_target_resolution = self._rng.integers(0, self.aug_crop)

    # over-loaded code
    resolution = self._resolutions[
        ar_idx
    ]  # DO NOT CHANGE THIS (compatible with BatchedRandomSampler)
    views = self._get_views(idx, resolution, self._rng, nview)
    assert len(views) == nview

    # NOTE(review): a missing first-view pose is filled with an all-ones 4x4
    # matrix (not an identity) - confirm this placeholder is intended.
    if "camera_pose" not in views[0]:
        views[0]["camera_pose"] = np.ones((4, 4), dtype=np.float32)
    first_view_camera_pose = views[0]["camera_pose"]
    transform = SeqColorJitter() if self.is_seq_color_jitter else self.transform

    for v, view in enumerate(views):
        assert (
            "pts3d" not in view
        ), f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}"
        view["idx"] = (idx, ar_idx, v)

        # encode the image
        width, height = view["img"].size

        view["true_shape"] = np.int32((height, width))
        view["img"] = transform(view["img"])
        # Negative depth values are flagged as sky.
        view["sky_mask"] = view["depthmap"] < 0

        assert "camera_intrinsics" in view
        if "camera_pose" not in view:
            view["camera_pose"] = np.full((4, 4), np.nan, dtype=np.float32)
        else:
            assert np.isfinite(
                view["camera_pose"]
            ).all(), f"NaN in camera pose for view {view_name(view)}"

        ray_map = get_ray_map(
            first_view_camera_pose,
            view["camera_pose"],
            view["camera_intrinsics"],
            height,
            width,
        )
        view["ray_map"] = ray_map.astype(np.float32)

        assert "pts3d" not in view
        assert "valid_mask" not in view
        assert np.isfinite(
            view["depthmap"]
        ).all(), f"NaN in depthmap for view {view_name(view)}"
        pts3d, pts3d_local, valid_mask = depthmap_to_absolute_camera_coordinates(**view)

        view["pts3d"] = pts3d
        view["pts3d_local"] = pts3d_local
        # A pixel is valid only if all three of its coordinates are finite.
        view["valid_mask"] = valid_mask & np.isfinite(pts3d).all(axis=-1)

        # check all datatypes
        for key, val in view.items():
            res, err_msg = is_good_type(key, val)
            assert res, f"{err_msg} with {key}={val} for view {view_name(view)}"
        # NOTE(review): K is assigned but not read again in this function.
        K = view["camera_intrinsics"]
    # Disabled augmentation: `if False` means nothing below it executes.
    # NOTE(review): indentation was reconstructed; the `else` is read as
    # belonging to the inner random test (joint vs per-view augmentation) -
    # confirm against the original file.
    if False:
        if random.random() > 0.3:#self.cojitter_ratio:
            images = torch.stack([view['img'] for view in views],axis=0)
            images = self.image_aug(images)
            for v, view in enumerate(views):
                view['img'] = images[v]

        else:
            for view in views:
                view['img'] = self.image_aug(view['img'][None])[0]

    if self.n_corres > 0:
        # Correspondences are extracted between the first view and every view
        # (including itself) and stored as a tuple.
        ref_view = views[0]
        for view in views:
            corres1, corres2, valid = extract_correspondences_from_pts3d(
                ref_view, view, self.n_corres, self._rng, nneg=self.nneg
            )
            view["corres"] = (corres1, corres2)
            view["valid_corres"] = valid

    # last thing done!
    for view in views:
        # 32-bit integer drawn from the sample's generator, one per view.
        view["rng"] = int.from_bytes(self._rng.bytes(4), "big")
    return views
|
| 452 |
+
|
| 453 |
+
def _set_resolutions(self, resolutions):
|
| 454 |
+
assert resolutions is not None, "undefined resolution"
|
| 455 |
+
|
| 456 |
+
if not isinstance(resolutions, list):
|
| 457 |
+
resolutions = [resolutions]
|
| 458 |
+
|
| 459 |
+
self._resolutions = []
|
| 460 |
+
for resolution in resolutions:
|
| 461 |
+
if isinstance(resolution, int):
|
| 462 |
+
width = height = resolution
|
| 463 |
+
else:
|
| 464 |
+
width, height = resolution
|
| 465 |
+
assert isinstance(
|
| 466 |
+
width, int
|
| 467 |
+
), f"Bad type for {width=} {type(width)=}, should be int"
|
| 468 |
+
assert isinstance(
|
| 469 |
+
height, int
|
| 470 |
+
), f"Bad type for {height=} {type(height)=}, should be int"
|
| 471 |
+
self._resolutions.append((width, height))
|
| 472 |
+
|
| 473 |
+
def _crop_resize_if_necessary(
    self, image, depthmap, intrinsics, resolution, rng=None, info=None
):
    """Crop around the principal point, rescale, then crop to ``resolution``.

    All geometry is delegated to the ``cropping`` helpers:
      1. crop the largest window centred on the principal point,
      2. rescale towards the target resolution (per the original comments:
         high-quality Lanczos down-scaling), optionally enlarged by a
         crop-augmentation offset when ``self.aug_crop > 1``,
      3. crop the rescaled data with a box derived for ``resolution``.

    args:
        image: PIL image, or an array converted with ``PIL.Image.fromarray``
        depthmap: depth map passed through the ``cropping`` helpers
        intrinsics: camera matrix; ``intrinsics[:2, 2]`` is read as (cx, cy)
        resolution: target (width, height)
        rng: generator used for the per-view crop offset when
            ``self.seq_aug_crop`` is falsy
        info: only used in the assertion messages
    returns:
        (image, depthmap, intrinsics) as returned by the final crop
    """
    if not isinstance(image, PIL.Image.Image):
        image = PIL.Image.fromarray(image)

    # Largest window centred on the principal point that fits in the image.
    full_w, full_h = image.size
    cx, cy = intrinsics[:2, 2].round().astype(int)
    half_w = min(cx, full_w - cx)
    half_h = min(cy, full_h - cy)
    assert half_w > full_w / 5, f"Bad principal point in view={info}"
    assert half_h > full_h / 5, f"Bad principal point in view={info}"
    centred_bbox = (cx - half_w, cy - half_h, cx + half_w, cy + half_h)
    image, depthmap, intrinsics = cropping.crop_image_depthmap(
        image, depthmap, intrinsics, centred_bbox
    )

    # Rescale; with crop augmentation the target is enlarged by an offset that
    # is either drawn per view or shared by the whole sequence.
    target_resolution = np.array(resolution)
    if self.aug_crop > 1:
        if self.seq_aug_crop:
            offset = self.delta_target_resolution
        else:
            offset = rng.integers(0, self.aug_crop)
        target_resolution += offset
    image, depthmap, intrinsics = cropping.rescale_image_depthmap(
        image, depthmap, intrinsics, target_resolution
    )

    # Final crop: derive the box from the intrinsics before/after cropping.
    cropped_intrinsics = cropping.camera_matrix_of_crop(
        intrinsics, image.size, resolution, offset_factor=0.5
    )
    final_bbox = cropping.bbox_from_intrinsics_in_out(
        intrinsics, cropped_intrinsics, resolution
    )
    image, depthmap, cropped_intrinsics = cropping.crop_image_depthmap(
        image, depthmap, intrinsics, final_bbox
    )

    return image, depthmap, cropped_intrinsics
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
def is_good_type(key, v):
    """returns (is_good, err_msg)

    A value is accepted when it is a ``str``, ``int`` or ``tuple``, or when it
    exposes a ``dtype`` that is one of float32 (numpy or torch), bool, int32,
    int64 or uint8. ``key`` is accepted for the caller's convenience and is
    not inspected.

    Values without a ``dtype`` attribute (e.g. ``float``, ``list``, ``None``)
    are reported as bad instead of raising ``AttributeError``, so the caller's
    assertion message can name the offending key.
    """
    if isinstance(v, (str, int, tuple)):
        return True, None
    dtype = getattr(v, "dtype", None)
    if dtype is None:
        return False, f"bad {type(v)=}"
    if dtype not in (np.float32, torch.float32, bool, np.int32, np.int64, np.uint8):
        return False, f"bad {v.dtype=}"
    return True, None
|
| 535 |
+
|
| 536 |
+
|
| 537 |
+
def view_name(view, batch_index=None):
    """Return "<dataset>/<label>/<instance>" for a view.

    When ``batch_index`` is neither ``None`` nor ``slice(None)``, each of the
    three fields is indexed with it first (batched views).
    """
    take_whole = batch_index in (None, slice(None))
    parts = []
    for field in ("dataset", "label", "instance"):
        value = view[field]
        parts.append(value if take_whole else value[batch_index])
    return "/".join(f"{part}" for part in parts)
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
def transpose_to_landscape(view):
    """Transpose a portrait view to landscape, in place.

    ``view["true_shape"]`` is read as (height, width). When width < height,
    the image, valid mask, depth map, 3-D points, ray map and sky mask have
    their two spatial axes swapped, the first two rows of the intrinsics are
    exchanged, and the x/y columns of the correspondences (if present) are
    swapped. Landscape and square views are left untouched.

    ``view["true_shape"]`` itself is not modified. Returns None.

    raises:
        AssertionError: if a field does not have the shape implied by
            ``true_shape``
    """
    height, width = view["true_shape"]

    if width < height:
        # rectify portrait to landscape
        assert view["img"].shape == (3, height, width)
        view["img"] = view["img"].swapaxes(1, 2)

        assert view["valid_mask"].shape == (height, width)
        view["valid_mask"] = view["valid_mask"].swapaxes(0, 1)

        assert view["depthmap"].shape == (height, width)
        view["depthmap"] = view["depthmap"].swapaxes(0, 1)

        assert view["pts3d"].shape == (height, width, 3)
        view["pts3d"] = view["pts3d"].swapaxes(0, 1)

        # transpose x and y pixels
        view["camera_intrinsics"] = view["camera_intrinsics"][[1, 0, 2]]

        assert view["ray_map"].shape == (height, width, 6)
        view["ray_map"] = view["ray_map"].swapaxes(0, 1)

        assert view["sky_mask"].shape == (height, width)
        view["sky_mask"] = view["sky_mask"].swapaxes(0, 1)

        if "corres" in view:
            # transpose correspondences x and y
            corres = view["corres"]
            swapped = (corres[0][:, [1, 0]], corres[1][:, [1, 0]])
            if isinstance(corres, list):
                # Lists keep the in-place update.
                corres[0], corres[1] = swapped
            else:
                # The dataset stores a tuple, which cannot be assigned into.
                view["corres"] = swapped
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/base/batched_sampler.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
from accelerate import Accelerator
|
| 4 |
+
import torch.utils
|
| 5 |
+
from torch.utils.data import BatchSampler, Sampler
|
| 6 |
+
import torch.utils.data
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class CustomRandomSampler(Sampler):
    """Random sampling under a constraint: every run of `batch_size` consecutive
    indices shares the same 'feature' and the same number of views, both
    chosen randomly per batch.

    For instance, the 'feature' could be the image aspect-ratio.

    The index yielded is a tuple (sample_idx, feat_idx, view_count).
    The sequence only depends on the epoch given to `set_epoch`.
    `world_size` and `warmup` are accepted but not used here, and `drop_last`
    is only stored.
    """

    def __init__(
        self,
        dataset,
        batch_size,
        pool_size,
        min_view_size,
        max_view_size,
        world_size,
        warmup=1,
        drop_last=True,
    ):
        self.len_dataset = self.total_size = len(dataset)
        self.batch_size = batch_size
        self.pool_size = pool_size
        self.min_view_size = min_view_size
        self.max_view_size = max_view_size
        self.drop_last = drop_last
        self.epoch = None
        self.epochf = 0.0

    def __len__(self):
        return self.total_size

    def set_epoch(self, epoch):
        self.epoch = epoch

    def __iter__(self):
        if self.epoch is None:
            raise ValueError(
                "Epoch number not set. Please call 'set_epoch(epoch)' before iterating."
            )

        rng = np.random.default_rng(seed=self.epoch + 788)

        # 1st draw: a random ordering of all sample indices.
        order = np.arange(self.total_size)
        rng.shuffle(order)

        # Number of batches, rounding up so that a partial batch gets values too.
        n_batches = (self.total_size + self.batch_size - 1) // self.batch_size

        # 2nd draw: one feature index per batch; the first half of the pool is
        # twice as likely as the second half.
        if self.pool_size > 1:
            weights = np.ones(self.pool_size)
            weights[: self.pool_size // 2] *= 2
            weights /= weights.sum()
            batch_feats = rng.choice(self.pool_size, size=n_batches, p=weights)
        else:
            batch_feats = rng.integers(self.pool_size, size=n_batches)

        # 3rd draw: one view count per batch, in [min_view_size, max_view_size].
        batch_views = rng.integers(
            self.min_view_size, self.max_view_size + 1, size=n_batches
        )

        # Repeat each per-batch value for every slot of its batch.
        feat_idxs = np.repeat(batch_feats, self.batch_size)[: self.total_size]
        view_idxs = np.repeat(batch_views, self.batch_size)[: self.total_size]

        for row in np.column_stack((order, feat_idxs, view_idxs)):
            yield tuple(row)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
class BatchedRandomSampler(BatchSampler):
    """Batch sampler that groups indices from a CustomRandomSampler into batches.

    No ``__iter__``/``__len__`` is defined here, so batching itself comes from
    ``BatchSampler``; this class only stores the three attributes and forwards
    the epoch to the wrapped sampler.
    """

    def __init__(self, sampler: CustomRandomSampler, batch_size, drop_last=True):
        # BatchSampler.__init__ is not called; the attributes are set directly.
        self.sampler = sampler  # the wrapped index sampler
        self.batch_size = batch_size
        self.drop_last = drop_last

    def set_epoch(self, epoch):
        # The wrapped sampler refuses to iterate until its epoch is set.
        self.sampler.set_epoch(epoch)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def round_by(total, multiple, up=False):
    """Round ``total`` to a multiple of ``multiple``: down by default, up if ``up``."""
    padded = total + multiple - 1 if up else total
    return (padded // multiple) * multiple
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/base/easy_dataset.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
|
| 2 |
+
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
|
| 3 |
+
#
|
| 4 |
+
# --------------------------------------------------------
|
| 5 |
+
# modified from DUSt3R
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
from dust3r.datasets.base.batched_sampler import (
|
| 9 |
+
BatchedRandomSampler,
|
| 10 |
+
CustomRandomSampler,
|
| 11 |
+
)
|
| 12 |
+
import torch
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class EasyDataset:
    """a dataset that you can easily resize and combine.
    Examples:
    ---------
        2 * dataset ==> duplicate each element 2x

        10 @ dataset ==> set the size to 10 (random sampling, duplicates if necessary)

        dataset1 + dataset2 ==> concatenate datasets
    """

    def __add__(self, other):
        return CatDataset([self, other])

    def __rmul__(self, factor):
        return MulDataset(factor, self)

    def __rmatmul__(self, factor):
        return ResizedDataset(factor, self)

    def set_epoch(self, epoch):
        pass  # nothing to do by default

    def make_sampler(
        self, batch_size, shuffle=True, drop_last=True, world_size=1, rank=0, fixed_length=False
    ):
        """Build the batch sampler for this dataset.

        args:
            batch_size: number of consecutive indices sharing one resolution
                and one view count
            shuffle: must be truthy; unshuffled sampling is not implemented
            drop_last: forwarded to both samplers
            world_size: forwarded to CustomRandomSampler
            rank: accepted but not used here
            fixed_length: when true every batch uses ``self.num_views`` views,
                otherwise the view count is drawn between 4 (or ``num_views``
                if that is smaller) and ``num_views``
        returns:
            a BatchedRandomSampler wrapping a CustomRandomSampler
        raises:
            NotImplementedError: if ``shuffle`` is falsy
        """
        if not (shuffle):
            raise NotImplementedError()  # cannot deal yet
        num_of_aspect_ratios = len(self._resolutions)
        num_of_views = self.num_views
        # Clamp the lower bound: with fewer than 4 views the sampler would
        # otherwise draw from an empty range (low > high) and fail.
        min_views = num_of_views if fixed_length else min(4, num_of_views)
        sampler = CustomRandomSampler(
            self,
            batch_size,
            num_of_aspect_ratios,
            min_views,
            num_of_views,
            world_size,
            warmup=1,
            drop_last=drop_last,
        )
        return BatchedRandomSampler(sampler, batch_size, drop_last)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class MulDataset(EasyDataset):
    """Artifically augmenting the size of a dataset.

    Element ``i`` maps to element ``i // multiplicator`` of the wrapped
    dataset, so every source element appears ``multiplicator`` times in a row.
    """

    multiplicator: int

    def __init__(self, multiplicator, dataset):
        assert isinstance(multiplicator, int) and multiplicator > 0
        self.dataset = dataset
        self.multiplicator = multiplicator

    def __len__(self):
        return self.multiplicator * len(self.dataset)

    def __repr__(self):
        return f"{self.multiplicator}*{repr(self.dataset)}"

    def __getitem__(self, idx):
        # Plain index: just map it back onto the wrapped dataset.
        if not isinstance(idx, tuple):
            return self.dataset[idx // self.multiplicator]
        # Sampler triple: only the sample index is remapped.
        sample_idx, other, another = idx
        return self.dataset[sample_idx // self.multiplicator, other, another]

    @property
    def _resolutions(self):
        return self.dataset._resolutions

    @property
    def num_views(self):
        return self.dataset.num_views
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class ResizedDataset(EasyDataset):
    """Artifically changing the size of a dataset.

    ``set_epoch`` must be called before indexing: it builds the mapping from
    the ``new_size`` virtual indices onto the wrapped dataset.
    """

    new_size: int

    def __init__(self, new_size, dataset):
        assert isinstance(new_size, int) and new_size > 0
        self.new_size = new_size
        self.dataset = dataset

    def __len__(self):
        return self.new_size

    def __repr__(self):
        # "_" as the grouping option yields e.g. 1_000_000.
        return f"{self.new_size:_} @ {repr(self.dataset)}"

    def set_epoch(self, epoch):
        # this random shuffle only depends on the epoch
        rng = np.random.default_rng(seed=epoch + 777)

        # shuffle all indices
        perm = rng.permutation(len(self.dataset))

        # rotary extension until target size is met: repeat the same
        # permutation as many times as needed, then truncate
        n_repeats = 1 + (len(self) - 1) // len(self.dataset)
        shuffled_idxs = np.concatenate([perm] * n_repeats)
        self._idxs_mapping = shuffled_idxs[: self.new_size]

        assert len(self._idxs_mapping) == self.new_size

    def __getitem__(self, idx):
        assert hasattr(
            self, "_idxs_mapping"
        ), "You need to call dataset.set_epoch() to use ResizedDataset.__getitem__()"
        if isinstance(idx, tuple):
            # Sampler triple: only the sample index is remapped.
            idx, other, another = idx
            return self.dataset[self._idxs_mapping[idx], other, another]
        else:
            return self.dataset[self._idxs_mapping[idx]]

    @property
    def _resolutions(self):
        return self.dataset._resolutions

    @property
    def num_views(self):
        return self.dataset.num_views
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
class CatDataset(EasyDataset):
    """Concatenation of several datasets"""

    def __init__(self, datasets):
        for dataset in datasets:
            assert isinstance(dataset, EasyDataset)
        self.datasets = datasets
        # _cum_sizes[k] is the total length of datasets[0..k].
        self._cum_sizes = np.cumsum([len(dataset) for dataset in datasets])

    def __len__(self):
        return self._cum_sizes[-1]

    def __repr__(self):
        # remove uselessly long transform
        return " + ".join(
            repr(dataset).replace(
                ",transform=Compose( ToTensor() Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))",
                "",
            )
            for dataset in self.datasets
        )

    def set_epoch(self, epoch):
        for dataset in self.datasets:
            dataset.set_epoch(epoch)

    def __getitem__(self, idx):
        """Fetch a sample, skipping forward over samples that fail to load.

        args:
            idx: a global index, or a triple (global_idx, other, another)
                whose last two entries are passed through unchanged
        raises:
            IndexError: if the global index is out of range
            RuntimeError: if every sample of the concatenation failed to
                load, chained to the last underlying error
        """
        other = another = None
        if isinstance(idx, tuple):
            idx, other, another = idx

        if not (0 <= idx < len(self)):
            raise IndexError()

        last_error = None
        # Best-effort loading: a failing sample is reported and the next one
        # is tried (wrapping around). One full pass is the limit, so a
        # systematic failure surfaces instead of looping forever.
        for _ in range(len(self)):
            db_idx = np.searchsorted(self._cum_sizes, idx, "right")
            dataset = self.datasets[db_idx]
            new_idx = idx - (self._cum_sizes[db_idx - 1] if db_idx > 0 else 0)

            if other is not None and another is not None:
                new_idx = (new_idx, other, another)

            try:
                return dataset[new_idx]
            except Exception as e:
                print(e)
                print("DATA ERROR", new_idx)
                last_error = e
                idx = (idx + 1) % len(self)

        raise RuntimeError(
            "CatDataset: every sample failed to load"
        ) from last_error

    @property
    def _resolutions(self):
        resolutions = self.datasets[0]._resolutions
        for dataset in self.datasets[1:]:
            assert tuple(dataset._resolutions) == tuple(resolutions)
        return resolutions

    @property
    def num_views(self):
        num_views = self.datasets[0].num_views
        for dataset in self.datasets[1:]:
            assert dataset.num_views == num_views
        return num_views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/blendedmvs.py
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import numpy as np
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 7 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 8 |
+
from dust3r.utils.image import imread_cv2, imread_pil
|
| 9 |
+
import h5py
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class BlendedMVS_Multi(BaseMultiViewDataset):
|
| 14 |
+
"""Dataset of outdoor street scenes, 5 images each time"""
|
| 15 |
+
|
| 16 |
+
def __init__(self, *args, ROOT, split=None, **kwargs):
    """Store the dataset root, initialise the base class and load the overlap data.

    args:
        ROOT: directory that contains "new_overlap.h5" (read by _load_data)
        split: accepted but neither stored nor forwarded to the base class here
        *args, **kwargs: forwarded unchanged to the base class constructor
    """
    self.ROOT = ROOT
    # These flags are set before the base constructor runs.
    self.video = False
    self.is_metric = False
    super().__init__(*args, **kwargs)
    # assert split is None
    self._load_data()
|
| 23 |
+
|
| 24 |
+
def _load_data(self):
|
| 25 |
+
self.data_dict = self.read_h5_file(os.path.join(self.ROOT, "new_overlap.h5"))
|
| 26 |
+
self.num_imgs = sum(
|
| 27 |
+
[len(self.data_dict[s]["basenames"]) for s in self.data_dict.keys()]
|
| 28 |
+
)
|
| 29 |
+
self.num_scenes = len(self.data_dict.keys())
|
| 30 |
+
self.invalid_scenes = []
|
| 31 |
+
self.is_reachable_cache = {scene: {} for scene in self.data_dict.keys()}
|
| 32 |
+
|
| 33 |
+
def read_h5_file(self, h5_file_path):
|
| 34 |
+
data_dict = {}
|
| 35 |
+
self.all_ref_imgs = []
|
| 36 |
+
with h5py.File(h5_file_path, "r") as f:
|
| 37 |
+
for scene_dir in tqdm(f.keys()):
|
| 38 |
+
group = f[scene_dir]
|
| 39 |
+
basenames = group["basenames"][:]
|
| 40 |
+
indices = group["indices"][:]
|
| 41 |
+
values = group["values"][:]
|
| 42 |
+
shape = group.attrs["shape"]
|
| 43 |
+
# Reconstruct the sparse matrix
|
| 44 |
+
score_matrix = np.zeros(shape, dtype=np.float32)
|
| 45 |
+
score_matrix[indices[0], indices[1]] = values
|
| 46 |
+
data_dict[scene_dir] = {
|
| 47 |
+
"basenames": basenames,
|
| 48 |
+
"score_matrix": self.build_adjacency_list(score_matrix),
|
| 49 |
+
}
|
| 50 |
+
self.all_ref_imgs.extend(
|
| 51 |
+
[(scene_dir, b) for b in range(len(basenames))]
|
| 52 |
+
)
|
| 53 |
+
return data_dict
|
| 54 |
+
|
| 55 |
+
@staticmethod
|
| 56 |
+
def build_adjacency_list(S, thresh=0.2):
|
| 57 |
+
adjacency_list = [[] for _ in range(len(S))]
|
| 58 |
+
S = S - thresh
|
| 59 |
+
S[S < 0] = 0
|
| 60 |
+
rows, cols = np.nonzero(S)
|
| 61 |
+
for i, j in zip(rows, cols):
|
| 62 |
+
adjacency_list[i].append((j, S[i][j]))
|
| 63 |
+
return adjacency_list
|
| 64 |
+
|
| 65 |
+
@staticmethod
|
| 66 |
+
def is_reachable(adjacency_list, start_index, k):
|
| 67 |
+
visited = set()
|
| 68 |
+
stack = [start_index]
|
| 69 |
+
while stack and len(visited) < k:
|
| 70 |
+
node = stack.pop()
|
| 71 |
+
if node not in visited:
|
| 72 |
+
visited.add(node)
|
| 73 |
+
for neighbor in adjacency_list[node]:
|
| 74 |
+
if neighbor[0] not in visited:
|
| 75 |
+
stack.append(neighbor[0])
|
| 76 |
+
return len(visited) >= k
|
| 77 |
+
|
| 78 |
+
@staticmethod
|
| 79 |
+
def random_sequence_no_revisit_with_backtracking(
|
| 80 |
+
adjacency_list, k, start_index, rng: np.random.Generator
|
| 81 |
+
):
|
| 82 |
+
path = [start_index]
|
| 83 |
+
visited = set([start_index])
|
| 84 |
+
|
| 85 |
+
neighbor_iterators = []
|
| 86 |
+
# Initialize the iterator for the start index
|
| 87 |
+
neighbors = adjacency_list[start_index]
|
| 88 |
+
neighbor_idxs = [n[0] for n in neighbors]
|
| 89 |
+
neighbor_weights = [n[1] for n in neighbors]
|
| 90 |
+
neighbor_idxs = rng.choice(
|
| 91 |
+
neighbor_idxs,
|
| 92 |
+
size=len(neighbor_idxs),
|
| 93 |
+
replace=False,
|
| 94 |
+
p=np.array(neighbor_weights) / np.sum(neighbor_weights),
|
| 95 |
+
).tolist()
|
| 96 |
+
neighbor_iterators.append(iter(neighbor_idxs))
|
| 97 |
+
|
| 98 |
+
while len(path) < k:
|
| 99 |
+
if not neighbor_iterators:
|
| 100 |
+
# No possible sequence
|
| 101 |
+
return None
|
| 102 |
+
current_iterator = neighbor_iterators[-1]
|
| 103 |
+
try:
|
| 104 |
+
next_index = next(current_iterator)
|
| 105 |
+
if next_index not in visited:
|
| 106 |
+
path.append(next_index)
|
| 107 |
+
visited.add(next_index)
|
| 108 |
+
|
| 109 |
+
# Prepare iterator for the next node
|
| 110 |
+
neighbors = adjacency_list[next_index]
|
| 111 |
+
neighbor_idxs = [n[0] for n in neighbors]
|
| 112 |
+
neighbor_weights = [n[1] for n in neighbors]
|
| 113 |
+
neighbor_idxs = rng.choice(
|
| 114 |
+
neighbor_idxs,
|
| 115 |
+
size=len(neighbor_idxs),
|
| 116 |
+
replace=False,
|
| 117 |
+
p=np.array(neighbor_weights) / np.sum(neighbor_weights),
|
| 118 |
+
).tolist()
|
| 119 |
+
neighbor_iterators.append(iter(neighbor_idxs))
|
| 120 |
+
except StopIteration:
|
| 121 |
+
# No more neighbors to try at this node, backtrack
|
| 122 |
+
neighbor_iterators.pop()
|
| 123 |
+
visited.remove(path.pop())
|
| 124 |
+
return path
|
| 125 |
+
|
| 126 |
+
@staticmethod
|
| 127 |
+
def random_sequence_with_optional_repeats(
|
| 128 |
+
adjacency_list,
|
| 129 |
+
k,
|
| 130 |
+
start_index,
|
| 131 |
+
rng: np.random.Generator,
|
| 132 |
+
max_k=None,
|
| 133 |
+
max_attempts=100,
|
| 134 |
+
):
|
| 135 |
+
if max_k is None:
|
| 136 |
+
max_k = k
|
| 137 |
+
path = [start_index]
|
| 138 |
+
visited = set([start_index])
|
| 139 |
+
current_index = start_index
|
| 140 |
+
attempts = 0
|
| 141 |
+
|
| 142 |
+
while len(path) < max_k and attempts < max_attempts:
|
| 143 |
+
attempts += 1
|
| 144 |
+
neighbors = adjacency_list[current_index]
|
| 145 |
+
neighbor_idxs = [n[0] for n in neighbors]
|
| 146 |
+
neighbor_weights = [n[1] for n in neighbors]
|
| 147 |
+
|
| 148 |
+
if not neighbor_idxs:
|
| 149 |
+
# No neighbors, cannot proceed further
|
| 150 |
+
break
|
| 151 |
+
|
| 152 |
+
# Try to find unvisited neighbors
|
| 153 |
+
unvisited_neighbors = [
|
| 154 |
+
(idx, wgt)
|
| 155 |
+
for idx, wgt in zip(neighbor_idxs, neighbor_weights)
|
| 156 |
+
if idx not in visited
|
| 157 |
+
]
|
| 158 |
+
if unvisited_neighbors:
|
| 159 |
+
# Select among unvisited neighbors
|
| 160 |
+
unvisited_idxs = [idx for idx, _ in unvisited_neighbors]
|
| 161 |
+
unvisited_weights = [wgt for _, wgt in unvisited_neighbors]
|
| 162 |
+
probabilities = np.array(unvisited_weights) / np.sum(unvisited_weights)
|
| 163 |
+
next_index = rng.choice(unvisited_idxs, p=probabilities)
|
| 164 |
+
visited.add(next_index)
|
| 165 |
+
else:
|
| 166 |
+
# All neighbors visited, but we need to reach length max_k
|
| 167 |
+
# So we can revisit nodes
|
| 168 |
+
probabilities = np.array(neighbor_weights) / np.sum(neighbor_weights)
|
| 169 |
+
next_index = rng.choice(neighbor_idxs, p=probabilities)
|
| 170 |
+
|
| 171 |
+
path.append(next_index)
|
| 172 |
+
current_index = next_index
|
| 173 |
+
|
| 174 |
+
if len(set(path)) >= k:
|
| 175 |
+
# If path is shorter than max_k, extend it by repeating existing elements
|
| 176 |
+
while len(path) < max_k:
|
| 177 |
+
# Randomly select nodes from the existing path to repeat
|
| 178 |
+
next_index = rng.choice(path)
|
| 179 |
+
path.append(next_index)
|
| 180 |
+
return path
|
| 181 |
+
else:
|
| 182 |
+
# Could not reach k unique nodes
|
| 183 |
+
return None
|
| 184 |
+
|
| 185 |
+
def __len__(self):
|
| 186 |
+
return len(self.all_ref_imgs)
|
| 187 |
+
|
| 188 |
+
def get_image_num(self):
|
| 189 |
+
return self.num_imgs
|
| 190 |
+
|
| 191 |
+
def get_stats(self):
|
| 192 |
+
return f"{len(self)} imgs from {self.num_scenes} scenes"
|
| 193 |
+
|
| 194 |
+
def generate_sequence(
|
| 195 |
+
self, scene, adj_list, num_views, start_index, rng, allow_repeat=False
|
| 196 |
+
):
|
| 197 |
+
cutoff = num_views if not allow_repeat else max(num_views // 5, 3)
|
| 198 |
+
if start_index in self.is_reachable_cache[scene]:
|
| 199 |
+
if not self.is_reachable_cache[scene][start_index]:
|
| 200 |
+
print(
|
| 201 |
+
f"Cannot reach {num_views} unique elements from index {start_index}."
|
| 202 |
+
)
|
| 203 |
+
return None
|
| 204 |
+
else:
|
| 205 |
+
self.is_reachable_cache[scene][start_index] = self.is_reachable(
|
| 206 |
+
adj_list, start_index, cutoff
|
| 207 |
+
)
|
| 208 |
+
if not self.is_reachable_cache[scene][start_index]:
|
| 209 |
+
print(
|
| 210 |
+
f"Cannot reach {num_views} unique elements from index {start_index}."
|
| 211 |
+
)
|
| 212 |
+
return None
|
| 213 |
+
if not allow_repeat:
|
| 214 |
+
sequence = self.random_sequence_no_revisit_with_backtracking(
|
| 215 |
+
adj_list, cutoff, start_index, rng
|
| 216 |
+
)
|
| 217 |
+
else:
|
| 218 |
+
sequence = self.random_sequence_with_optional_repeats(
|
| 219 |
+
adj_list, cutoff, start_index, rng, max_k=num_views
|
| 220 |
+
)
|
| 221 |
+
if not sequence:
|
| 222 |
+
self.is_reachable_cache[scene][start_index] = False
|
| 223 |
+
print("Failed to generate a sequence without revisiting.")
|
| 224 |
+
return sequence
|
| 225 |
+
|
| 226 |
+
def _get_views(self, idx, resolution, rng: np.random.Generator, num_views):
|
| 227 |
+
MAX_RETRIES = 100 # Maximum attempts to find a valid sequence
|
| 228 |
+
MAX_SCENE_RETRIES = 50 # Maximum attempts to find a valid scene
|
| 229 |
+
|
| 230 |
+
scene_info, ref_img_idx = self.all_ref_imgs[idx]
|
| 231 |
+
invalid_seq = True
|
| 232 |
+
ordered_video = False
|
| 233 |
+
|
| 234 |
+
outer_retry_count = 0
|
| 235 |
+
|
| 236 |
+
while invalid_seq and outer_retry_count < MAX_RETRIES:
|
| 237 |
+
outer_retry_count += 1
|
| 238 |
+
|
| 239 |
+
basenames = self.data_dict[scene_info]["basenames"]
|
| 240 |
+
if (
|
| 241 |
+
sum(
|
| 242 |
+
[
|
| 243 |
+
(1 - int(x))
|
| 244 |
+
for x in list(self.is_reachable_cache[scene_info].values())
|
| 245 |
+
]
|
| 246 |
+
)
|
| 247 |
+
> len(basenames) - self.num_views
|
| 248 |
+
):
|
| 249 |
+
self.invalid_scenes.append(scene_info)
|
| 250 |
+
|
| 251 |
+
inner_retry_count = 0
|
| 252 |
+
while scene_info in self.invalid_scenes and inner_retry_count < MAX_SCENE_RETRIES:
|
| 253 |
+
inner_retry_count += 1
|
| 254 |
+
idx = rng.integers(low=0, high=len(self.all_ref_imgs))
|
| 255 |
+
scene_info, ref_img_idx = self.all_ref_imgs[idx]
|
| 256 |
+
basenames = self.data_dict[scene_info]["basenames"]
|
| 257 |
+
|
| 258 |
+
# If we exhausted inner retries, skip to next sample
|
| 259 |
+
if inner_retry_count >= MAX_SCENE_RETRIES:
|
| 260 |
+
import warnings
|
| 261 |
+
warnings.warn(
|
| 262 |
+
f"BlendedMVS: Could not find valid scene after {MAX_SCENE_RETRIES} attempts. "
|
| 263 |
+
f"Skipping sample idx={idx}. This might indicate data quality issues."
|
| 264 |
+
)
|
| 265 |
+
# Try with a completely random sample
|
| 266 |
+
idx = rng.integers(low=0, high=len(self.all_ref_imgs))
|
| 267 |
+
scene_info, ref_img_idx = self.all_ref_imgs[idx]
|
| 268 |
+
basenames = self.data_dict[scene_info]["basenames"]
|
| 269 |
+
|
| 270 |
+
score_matrix = self.data_dict[scene_info]["score_matrix"]
|
| 271 |
+
imgs_idxs = self.generate_sequence(
|
| 272 |
+
scene_info, score_matrix, num_views, ref_img_idx, rng, self.allow_repeat
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
if imgs_idxs is None:
|
| 276 |
+
random_direction = 2 * rng.choice(2) - 1
|
| 277 |
+
for offset in range(1, len(basenames)):
|
| 278 |
+
tentative_im_idx = (
|
| 279 |
+
ref_img_idx + (random_direction * offset)
|
| 280 |
+
) % len(basenames)
|
| 281 |
+
if (
|
| 282 |
+
tentative_im_idx not in self.is_reachable_cache[scene_info]
|
| 283 |
+
or self.is_reachable_cache[scene_info][tentative_im_idx]
|
| 284 |
+
):
|
| 285 |
+
ref_img_idx = tentative_im_idx
|
| 286 |
+
break
|
| 287 |
+
else:
|
| 288 |
+
invalid_seq = False
|
| 289 |
+
|
| 290 |
+
# If we exhausted all retries, raise an error instead of hanging
|
| 291 |
+
if outer_retry_count >= MAX_RETRIES:
|
| 292 |
+
import warnings
|
| 293 |
+
warnings.warn(
|
| 294 |
+
f"BlendedMVS: Failed to generate valid sequence after {MAX_RETRIES} attempts. "
|
| 295 |
+
f"Skipping sample idx={idx}. This might indicate severe data quality issues."
|
| 296 |
+
)
|
| 297 |
+
# As a last resort, try one more time with a completely random sample
|
| 298 |
+
idx = rng.integers(low=0, high=len(self.all_ref_imgs))
|
| 299 |
+
scene_info, ref_img_idx = self.all_ref_imgs[idx]
|
| 300 |
+
basenames = self.data_dict[scene_info]["basenames"]
|
| 301 |
+
score_matrix = self.data_dict[scene_info]["score_matrix"]
|
| 302 |
+
imgs_idxs = self.generate_sequence(
|
| 303 |
+
scene_info, score_matrix, num_views, ref_img_idx, rng, self.allow_repeat
|
| 304 |
+
)
|
| 305 |
+
# If still None, use sequential indices as fallback
|
| 306 |
+
if imgs_idxs is None:
|
| 307 |
+
imgs_idxs = list(range(min(num_views, len(basenames))))
|
| 308 |
+
|
| 309 |
+
views = []
|
| 310 |
+
for view_idx in imgs_idxs:
|
| 311 |
+
scene_dir = osp.join(self.ROOT, scene_info)
|
| 312 |
+
impath = basenames[view_idx].decode("utf-8")
|
| 313 |
+
image = imread_pil(osp.join(scene_dir, impath + ".jpg"))
|
| 314 |
+
depthmap = imread_cv2(osp.join(scene_dir, impath + ".exr"))
|
| 315 |
+
camera_params = np.load(osp.join(scene_dir, impath + ".npz"))
|
| 316 |
+
|
| 317 |
+
intrinsics = np.float32(camera_params["intrinsics"])
|
| 318 |
+
camera_pose = np.eye(4, dtype=np.float32)
|
| 319 |
+
camera_pose[:3, :3] = camera_params["R_cam2world"]
|
| 320 |
+
camera_pose[:3, 3] = camera_params["t_cam2world"]
|
| 321 |
+
|
| 322 |
+
image, depthmap, intrinsics = self._crop_resize_if_necessary(
|
| 323 |
+
image, depthmap, intrinsics, resolution, rng, info=(scene_dir, impath)
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
views.append(
|
| 327 |
+
dict(
|
| 328 |
+
img=image,
|
| 329 |
+
depthmap=depthmap,
|
| 330 |
+
camera_pose=camera_pose, # cam2world
|
| 331 |
+
camera_intrinsics=intrinsics,
|
| 332 |
+
dataset="BlendedMVS",
|
| 333 |
+
label=osp.relpath(scene_dir, self.ROOT),
|
| 334 |
+
is_metric=self.is_metric,
|
| 335 |
+
is_video=ordered_video,
|
| 336 |
+
instance=osp.join(scene_dir, impath + ".jpg"),
|
| 337 |
+
quantile=np.array(0.97, dtype=np.float32),
|
| 338 |
+
img_mask=True,
|
| 339 |
+
ray_mask=False,
|
| 340 |
+
camera_only=False,
|
| 341 |
+
depth_only=False,
|
| 342 |
+
single_view=False,
|
| 343 |
+
reset=False,
|
| 344 |
+
)
|
| 345 |
+
)
|
| 346 |
+
assert len(views) == num_views
|
| 347 |
+
return views
|
| 348 |
+
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/co3d.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import json
|
| 3 |
+
import itertools
|
| 4 |
+
from collections import deque
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 8 |
+
import cv2
|
| 9 |
+
import numpy as np
|
| 10 |
+
import time
|
| 11 |
+
|
| 12 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 13 |
+
from dust3r.utils.image import imread_cv2
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Co3d_Multi(BaseMultiViewDataset):
    """Multi-view loader over the sequences listed in ``selected_seqs_<split>.json``.

    Scenes are keyed by ``(object, instance)``; images whose depth map has no
    positive pixel at a given resolution are remembered and replaced by a
    neighbouring frame on later draws.
    """

    def __init__(self, mask_bg="rand", *args, ROOT, **kwargs):
        """Index all scenes and the reference frames a sample may start from.

        Args:
            mask_bg: ``True``, ``False`` or ``"rand"``; with ``"rand"`` the
                background is masked for a sample with probability 0.1.
            ROOT: Dataset root directory.
        """
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        assert mask_bg in (True, False, "rand")
        self.mask_bg = mask_bg
        self.is_metric = False
        self.dataset_label = "Co3d_v2"

        # load all scenes: {object: {instance: [frame ids]}} -> {(object, instance): [...]}
        index_path = osp.join(self.ROOT, f"selected_seqs_{self.split}.json")
        with open(index_path, "r") as f:
            per_object = json.load(f)
        flattened = {}
        for obj_name, instances in per_object.items():
            if len(instances) > 0:
                for instance_name, frames in instances.items():
                    flattened[(obj_name, instance_name)] = frames
        self.scenes = flattened
        self.scene_list = list(self.scenes.keys())

        # Minimum number of distinct frames a scene must provide.
        if not self.allow_repeat:
            self.cut_off = self.num_views
        else:
            self.cut_off = max(self.num_views // 3, 3)

        self.all_ref_imgs = []
        for scene_key, frames in self.scenes.items():
            usable = frames[: len(frames) - self.cut_off + 1]
            self.all_ref_imgs.extend((scene_key, frame) for frame in usable)

        # scene -> {resolution -> per-image "known bad" flags}
        self.invalidate = {scene: {} for scene in self.scene_list}
        self.invalid_scenes = {scene: False for scene in self.scene_list}

    def __len__(self):
        """Number of (scene, reference frame) samples."""
        return len(self.all_ref_imgs)

    def _get_metadatapath(self, obj, instance, view_idx):
        """Path of the ``.npz`` camera metadata of one frame."""
        filename = f"frame{view_idx:06n}.npz"
        return osp.join(self.ROOT, obj, instance, "images", filename)

    def _get_impath(self, obj, instance, view_idx):
        """Path of the RGB image of one frame."""
        filename = f"frame{view_idx:06n}.jpg"
        return osp.join(self.ROOT, obj, instance, "images", filename)

    def _get_depthpath(self, obj, instance, view_idx):
        """Path of the depth image of one frame."""
        filename = f"frame{view_idx:06n}.jpg.geometric.png"
        return osp.join(self.ROOT, obj, instance, "depths", filename)

    def _get_maskpath(self, obj, instance, view_idx):
        """Path of the object mask of one frame."""
        filename = f"frame{view_idx:06n}.png"
        return osp.join(self.ROOT, obj, instance, "masks", filename)

    def _read_depthmap(self, depthpath, input_metadata):
        """Read a depth image and rescale it by the frame's ``maximum_depth``.

        The stored values are divided by 65535 before scaling.
        """
        raw = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
        max_depth = np.nan_to_num(input_metadata["maximum_depth"])
        return (raw.astype(np.float32) / 65535) * max_depth

    def _get_views(self, idx, resolution, rng, num_views):
        """Assemble ``num_views`` view dicts for sample ``idx``.

        Loops until a full sequence is produced whose views do not all share
        the same image file; scenes that cannot provide enough usable frames
        are flagged and a random replacement sample is drawn.
        """
        scene_info, ref_img_idx = self.all_ref_imgs[idx]
        invalid_seq = True

        while invalid_seq:
            # Skip scenes already known to be unusable.
            while self.invalid_scenes[scene_info]:
                idx = rng.integers(low=0, high=len(self.all_ref_imgs))
                scene_info, ref_img_idx = self.all_ref_imgs[idx]

            obj, instance = scene_info
            image_pool = self.scenes[obj, instance]
            if len(image_pool) < self.cut_off:
                print("Invalid scene!")
                self.invalid_scenes[scene_info] = True
                continue

            imgs_idxs, ordered_video = self.get_seq_from_start_id(
                num_views, ref_img_idx, image_pool, rng
            )

            # Lazily create the per-resolution list that flags invalid images.
            scene_flags = self.invalidate[obj, instance]
            if resolution not in scene_flags:
                scene_flags[resolution] = [False] * len(image_pool)
            invalid_flags = scene_flags[resolution]

            # decide now if we mask the bg
            mask_bg = (self.mask_bg == True) or (
                self.mask_bg == "rand" and rng.choice(2, p=[0.9, 0.1])
            )

            views = []
            pending = deque(imgs_idxs)  # consumed from the right-hand end
            while pending:  # some images (few) have zero depth
                if len(image_pool) - sum(invalid_flags) < self.cut_off:
                    print("Invalid scene!")
                    invalid_seq = True
                    self.invalid_scenes[scene_info] = True
                    break

                im_idx = pending.pop()
                if invalid_flags[im_idx]:
                    # search for a valid image, walking in one random direction
                    ordered_video = False
                    random_direction = 2 * rng.choice(2) - 1
                    for offset in range(1, len(image_pool)):
                        tentative_im_idx = (
                            im_idx + (random_direction * offset)
                        ) % len(image_pool)
                        if not invalid_flags[tentative_im_idx]:
                            im_idx = tentative_im_idx
                            break

                view_idx = image_pool[im_idx]
                impath = self._get_impath(obj, instance, view_idx)
                depthpath = self._get_depthpath(obj, instance, view_idx)

                # load camera params
                input_metadata = np.load(
                    self._get_metadatapath(obj, instance, view_idx)
                )
                camera_pose = input_metadata["camera_pose"].astype(np.float32)
                intrinsics = input_metadata["camera_intrinsics"].astype(np.float32)

                # load image and depth
                rgb_image = imread_cv2(impath)
                depthmap = self._read_depthmap(depthpath, input_metadata)

                if mask_bg:
                    # load object mask and zero the depth outside of it
                    maskpath = self._get_maskpath(obj, instance, view_idx)
                    maskmap = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED).astype(
                        np.float32
                    )
                    maskmap = (maskmap / 255.0) > 0.1
                    depthmap *= maskmap

                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath
                )

                if (depthmap > 0.0).sum() == 0:
                    # problem, invalidate image and retry with a neighbour
                    invalid_flags[im_idx] = True
                    pending.append(im_idx)
                    continue

                # generate img mask and raymap mask
                img_mask, ray_mask = self.get_img_and_ray_masks(
                    self.is_metric, len(views), rng
                )

                views.append(
                    {
                        "img": rgb_image,
                        "depthmap": depthmap,
                        "camera_pose": camera_pose,
                        "camera_intrinsics": intrinsics,
                        "dataset": self.dataset_label,
                        "label": osp.join(obj, instance),
                        "instance": osp.split(impath)[1],
                        "is_metric": self.is_metric,
                        "is_video": ordered_video,
                        "quantile": np.array(0.9, dtype=np.float32),
                        "img_mask": img_mask,
                        "ray_mask": ray_mask,
                        "camera_only": False,
                        "depth_only": False,
                        "single_view": False,
                        "reset": False,
                    }
                )

            # Accept only complete sequences that contain more than one image.
            if len(views) == num_views and any(
                view["instance"] != views[0]["instance"] for view in views
            ):
                invalid_seq = False
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/cop3d.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import sys
|
| 3 |
+
|
| 4 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 5 |
+
import cv2
|
| 6 |
+
import numpy as np
|
| 7 |
+
|
| 8 |
+
from dust3r.datasets.co3d import Co3d_Multi
|
| 9 |
+
from dust3r.utils.image import imread_cv2
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class Cop3D_Multi(Co3d_Multi):
    """Variant of ``Co3d_Multi`` for data without depth maps.

    Depth is replaced by an all-ones map of the image size and every view is
    flagged ``camera_only=True``.
    """

    def __init__(self, mask_bg="rand", *args, ROOT, **kwargs):
        """Initialise the parent loader, then override label and metric flag."""
        super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs)
        self.dataset_label = "Cop3D"
        self.is_metric = False

    def _get_metadatapath(self, obj, instance, view_idx):
        """Path of the ``.npz`` camera metadata of one frame."""
        filename = f"frame{view_idx:06n}.npz"
        return osp.join(self.ROOT, obj, instance, "images", filename)

    def _get_impath(self, obj, instance, view_idx):
        """Path of the RGB image of one frame."""
        filename = f"frame{view_idx:06n}.jpg"
        return osp.join(self.ROOT, obj, instance, "images", filename)

    def _get_depthpath(self, obj, instance, view_idx):
        """Return the RGB image path in place of a depth path.

        There is no depth; the pseudo path is only used to obtain the right
        resolution for the placeholder depth map.
        """
        filename = f"frame{view_idx:06n}.jpg"
        return osp.join(self.ROOT, obj, instance, "images", filename)

    def _get_maskpath(self, obj, instance, view_idx):
        """Path of the object mask of one frame."""
        filename = f"frame{view_idx:06n}.png"
        return osp.join(self.ROOT, obj, instance, "masks", filename)

    def _read_depthmap(self, impath, input_metadata):
        """Return an all-ones float32 map shaped like one channel of the image."""
        img = imread_cv2(impath, cv2.IMREAD_UNCHANGED)
        # no depth, set to all ones
        return np.ones_like(img[..., 0], dtype=np.float32)

    def _get_views(self, idx, resolution, rng, num_views):
        """Assemble ``num_views`` view dicts for sample ``idx``.

        Loops until a full sequence is produced whose views do not all share
        the same image file; scenes with fewer than ``self.num_views`` images
        are flagged invalid and a random replacement sample is drawn.
        """
        scene_info, ref_img_idx = self.all_ref_imgs[idx]
        invalid_seq = True

        while invalid_seq:
            # Skip scenes already known to be unusable.
            while self.invalid_scenes[scene_info]:
                idx = rng.integers(low=0, high=len(self.all_ref_imgs))
                scene_info, ref_img_idx = self.all_ref_imgs[idx]

            obj, instance = scene_info
            image_pool = self.scenes[obj, instance]
            if len(image_pool) < self.num_views:
                print("Invalid scene!")
                self.invalid_scenes[scene_info] = True
                continue

            imgs_idxs, ordered_video = self.get_seq_from_start_id(
                num_views,
                ref_img_idx,
                image_pool,
                rng,
                max_interval=5,
                video_prob=1.0,
                fix_interval_prob=0.9,
            )

            views = []
            for im_idx in imgs_idxs:
                view_idx = image_pool[im_idx]
                impath = self._get_impath(obj, instance, view_idx)
                depthpath = self._get_depthpath(obj, instance, view_idx)

                # load camera params
                input_metadata = np.load(
                    self._get_metadatapath(obj, instance, view_idx)
                )
                camera_pose = input_metadata["camera_pose"].astype(np.float32)
                intrinsics = input_metadata["camera_intrinsics"].astype(np.float32)

                # load image and (placeholder) depth
                rgb_image = imread_cv2(impath)
                depthmap = self._read_depthmap(depthpath, input_metadata)

                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath
                )

                views.append(
                    {
                        "img": rgb_image,
                        "depthmap": depthmap,
                        "camera_pose": camera_pose,
                        "camera_intrinsics": intrinsics,
                        "dataset": self.dataset_label,
                        "label": osp.join(obj, instance),
                        "instance": osp.split(impath)[1],
                        "is_metric": self.is_metric,
                        "is_video": ordered_video,
                        "quantile": np.array(0.96, dtype=np.float32),
                        "img_mask": True,
                        "ray_mask": False,
                        "camera_only": True,
                        "depth_only": False,
                        "single_view": False,
                        "reset": False,
                    }
                )

            # Accept only complete sequences that contain more than one image.
            if len(views) == num_views and any(
                view["instance"] != views[0]["instance"] for view in views
            ):
                invalid_seq = False
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/dl3dv.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import itertools
|
| 5 |
+
|
| 6 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 7 |
+
import cv2
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_cv2
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class DL3DV_Multi(BaseMultiViewDataset):
|
| 15 |
+
def __init__(self, *args, split, ROOT, **kwargs):
    """Set the dataset flags, initialise the base class and index the scenes.

    ``split`` is required by the signature but is neither used nor forwarded
    to the base class here.
    """
    self.is_metric = False
    self.max_interval = 20
    self.video = True
    self.ROOT = ROOT
    super().__init__(*args, **kwargs)

    # _load_data() fills the index attributes in place and has no return
    # statement, so loaded_data ends up as None.
    self.loaded_data = self._load_data()
|
| 23 |
+
|
| 24 |
+
def _load_data(self):
    """Scan ``ROOT`` and build the flat image / scene / start-frame indices.

    Sets ``all_scenes``, ``scenes``, ``sceneids``, ``images``,
    ``start_img_ids`` and ``scene_img_list`` on the instance. Sub-scenes with
    fewer images than the required cut-off are skipped.
    """
    root = self.ROOT
    self.all_scenes = sorted(
        name for name in os.listdir(root) if os.path.isdir(osp.join(root, name))
    )

    # Keep only sub-directories that are not empty.
    subscenes = []
    for scene in self.all_scenes:
        for sub in os.listdir(osp.join(root, scene)):
            sub_dir = osp.join(root, scene, sub)
            if os.path.isdir(sub_dir) and len(os.listdir(sub_dir)) > 0:
                subscenes.append(osp.join(scene, sub))

    scenes = []
    sceneids = []
    images = []
    scene_img_list = []
    start_img_ids = []

    for scene in subscenes:
        scene_dir = osp.join(root, scene, "dense")
        rgb_paths = sorted(
            name
            for name in os.listdir(os.path.join(scene_dir, "rgb"))
            if name.endswith(".png")
        )
        assert len(rgb_paths) > 0, f"{scene_dir} is empty."
        num_imgs = len(rgb_paths)
        if not self.allow_repeat:
            cut_off = self.num_views
        else:
            cut_off = max(self.num_views // 3, 3)

        if num_imgs < cut_off:
            print(f"Skipping {scene}")
            continue

        # Global image ids continue where the previously kept scene ended.
        img_ids = list(np.arange(num_imgs) + len(images))
        # Index of this scene among the kept scenes.
        sceneids.extend([len(scenes)] * num_imgs)
        scenes.append(scene)
        scene_img_list.append(img_ids)
        images.extend(rgb_paths)
        # Only frames with at least cut_off frames from there to the end may start a sequence.
        start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])

    self.scenes = scenes
    self.sceneids = sceneids
    self.images = images
    self.start_img_ids = start_img_ids
    self.scene_img_list = scene_img_list
|
| 83 |
+
|
| 84 |
+
def __len__(self):
    """Number of samples, i.e. of frames a sequence may start from."""
    start_ids = self.start_img_ids
    return len(start_ids)
|
| 86 |
+
|
| 87 |
+
def get_image_num(self):
    """Total number of indexed images over all kept scenes."""
    all_images = self.images
    return len(all_images)
|
| 89 |
+
|
| 90 |
+
def _get_views(self, idx, resolution, rng, num_views):
    """Load the views of one training sample.

    Args:
        idx: Index into ``self.start_img_ids`` selecting the start frame.
        resolution: Target resolution, forwarded to
            ``self._crop_resize_if_necessary``.
        rng: Random generator forwarded to the sequence sampler and to the
            crop/resize helper.
        num_views: Number of views requested from the sequence sampler.

    Returns:
        A list of per-view dicts (image, depth map, pose, intrinsics and
        bookkeeping flags).
    """
    start_id = self.start_img_ids[idx]
    scene_id = self.sceneids[start_id]
    all_image_ids = self.scene_img_list[scene_id]
    # Sampling logic lives in the base class (not visible here); it returns
    # positions into ``all_image_ids`` plus an "ordered video" flag.
    pos, ordered_video = self.get_seq_from_start_id(
        num_views,
        start_id,
        all_image_ids,
        rng,
        max_interval=self.max_interval,
        block_shuffle=25,
    )
    image_idxs = np.array(all_image_ids)[pos]

    views = []
    for view_idx in image_idxs:
        scene_id = self.sceneids[view_idx]
        scene_dir = osp.join(self.ROOT, self.scenes[scene_id], "dense")

        rgb_path = self.images[view_idx]
        basename = rgb_path[:-4]  # strip the ".png" extension

        rgb_image = imread_cv2(
            osp.join(scene_dir, "rgb", rgb_path), cv2.IMREAD_COLOR
        )
        depthmap = np.load(osp.join(scene_dir, "depth", basename + ".npy")).astype(
            np.float32
        )
        depthmap[~np.isfinite(depthmap)] = 0  # invalid

        # Read the camera arrays eagerly and close the archive right away;
        # an unclosed NpzFile keeps a file descriptor open per loaded view.
        with np.load(osp.join(scene_dir, "cam", basename + ".npz")) as cam_file:
            intrinsics = cam_file["intrinsic"].astype(np.float32)
            camera_pose = cam_file["pose"].astype(np.float32)

        sky_mask = (
            cv2.imread(
                osp.join(scene_dir, "sky_mask", rgb_path), cv2.IMREAD_UNCHANGED
            )
            >= 127
        )
        outlier_mask = cv2.imread(
            osp.join(scene_dir, "outlier_mask", rgb_path), cv2.IMREAD_UNCHANGED
        )
        depthmap[sky_mask] = -1.0  # sky pixels get the sentinel depth -1
        depthmap[outlier_mask >= 127] = 0.0  # outliers become invalid (0)

        # Drop the farthest 2% of the remaining positive depths.
        positive_depths = depthmap[depthmap > 0]
        threshold = (
            np.percentile(positive_depths, 98) if positive_depths.size > 0 else 0
        )
        depthmap[depthmap > threshold] = 0.0

        rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
            rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
        )

        views.append(
            dict(
                img=rgb_image,
                depthmap=depthmap.astype(np.float32),
                camera_pose=camera_pose.astype(np.float32),
                camera_intrinsics=intrinsics.astype(np.float32),
                dataset="dl3dv",
                label=self.scenes[scene_id] + "_" + rgb_path,
                instance=osp.join(scene_dir, "rgb", rgb_path),
                is_metric=self.is_metric,
                is_video=ordered_video,
                quantile=np.array(0.9, dtype=np.float32),
                img_mask=True,
                ray_mask=False,
                camera_only=False,
                depth_only=False,
                single_view=False,
                reset=False,
            )
        )
    return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/dynamic_replica.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import itertools
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_cv2
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class DynamicReplica(BaseMultiViewDataset):
    """Multi-view loader reading ``ROOT/<split>/<scene>/left/{rgb,depth,cam}``.

    Each scene contributes a run of consecutive global image ids; a sample is
    identified by a start image id from which a sequence of views is drawn.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Store dataset settings, initialise the base class, then index the data.

        Args:
            ROOT: Dataset root directory containing one folder per split.
            *args, **kwargs: Forwarded unchanged to ``BaseMultiViewDataset``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 16
        super().__init__(*args, **kwargs)

        # NOTE(review): ``self.split`` is presumably set by the base class
        # constructor - confirm. ``_load_data`` returns None, so
        # ``loaded_data`` is None.
        self.loaded_data = self._load_data(self.split)

    def _load_data(self, split):
        """Index every scene of ``split`` and build the flat lookup tables.

        Populates ``self.scenes``, ``self.sceneids``, ``self.images``,
        ``self.start_img_ids`` and ``self.scene_img_list``. Scenes with fewer
        frames than the cut-off are skipped.
        """
        split_dir = osp.join(self.ROOT, split)
        # os.listdir returns entries in arbitrary order; sort so the
        # index -> sample mapping is the same on every machine and process.
        self.scenes = sorted(os.listdir(split_dir))

        # Minimum number of frames a scene must provide (loop-invariant).
        cut_off = (
            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
        )

        offset = 0
        scenes = []
        sceneids = []
        scene_img_list = []
        images = []
        start_img_ids = []

        j = 0  # index of the scene among the *kept* scenes
        for scene in tqdm(self.scenes):
            # Use the same split directory that was listed above.
            rgb_dir = osp.join(split_dir, scene, "left", "rgb")
            basenames = sorted(
                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")],
                key=float,  # frame names are numeric; sort numerically
            )
            num_imgs = len(basenames)
            if num_imgs < cut_off:
                print(f"Skipping {scene}")
                continue

            img_ids = list(np.arange(num_imgs) + offset)
            # Only frames with at least ``cut_off`` frames from themselves to
            # the end of the scene may start a sequence.
            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
            sceneids.extend([j] * num_imgs)
            images.extend(basenames)
            scenes.append(scene)
            scene_img_list.append(img_ids)

            # offset groups
            offset += num_imgs
            j += 1

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.start_img_ids = start_img_ids
        self.scene_img_list = scene_img_list

    def __len__(self):
        """Return the number of valid start frames (one sample each)."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed images."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views of the scene that owns start frame ``idx``.

        Returns:
            A list of ``num_views`` per-view dicts.
        """
        start_id = self.start_img_ids[idx]
        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            start_id,
            all_image_ids,
            rng,
            max_interval=self.max_interval,
            video_prob=1.0,
            fix_interval_prob=1.0,
        )
        image_idxs = np.array(all_image_ids)[pos]

        views = []
        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id], "left")
            rgb_dir = osp.join(scene_dir, "rgb")
            depth_dir = osp.join(scene_dir, "depth")
            cam_dir = osp.join(scene_dir, "cam")

            basename = self.images[view_idx]

            # Load RGB image
            rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png"))
            # Load depthmap
            depthmap = np.load(osp.join(depth_dir, basename + ".npy"))
            depthmap[~np.isfinite(depthmap)] = 0  # invalid

            # Close the archive once both arrays are read to avoid leaking a
            # file descriptor per view.
            with np.load(osp.join(cam_dir, basename + ".npz")) as cam:
                camera_pose = cam["pose"]
                intrinsics = cam["intrinsics"]

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.85, 0.10, 0.05]
            )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap.astype(np.float32),
                    camera_pose=camera_pose.astype(np.float32),
                    camera_intrinsics=intrinsics.astype(np.float32),
                    dataset="dynamic_replica",
                    label=self.scenes[scene_id] + "_" + basename,
                    instance=f"{str(idx)}_{str(view_idx)}",
                    is_metric=self.is_metric,
                    is_video=ordered_video,
                    quantile=np.array(1.0, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/habitat_hm3d.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import itertools
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_cv2
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class HabitatHM3D_Multi(BaseMultiViewDataset):
    """Multi-view loader for scene folders laid out as ``ROOT/<scene>/``.

    Each frame ``<n>`` of a scene is stored as ``image_<n>.png``,
    ``depth_<n>.png`` and ``<n>.npz`` (camera parameters). Scenes that fail
    to load are remembered in ``self.invalid_scenes`` and avoided afterwards.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Store dataset settings, initialise the base class, then index the data.

        Args:
            ROOT: Directory containing one folder per scene.
            *args, **kwargs: Forwarded unchanged to ``BaseMultiViewDataset``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = False
        self.max_interval = 8
        super().__init__(*args, **kwargs)
        # ``_load_data`` returns None, so ``loaded_data`` is None.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Index all scenes and build the flat lookup tables.

        Populates ``self.scenes``, ``self.sceneids``, ``self.images``,
        ``self.start_img_ids`` (as ``(scene, image_id)`` pairs),
        ``self.scene_img_list`` and ``self.invalid_scenes``.
        """
        # os.listdir returns entries in arbitrary order; sort so the
        # index -> sample mapping is the same on every machine and process.
        self.scenes = sorted(os.listdir(self.ROOT))

        # Minimum number of frames a scene must provide (loop-invariant).
        cut_off = (
            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
        )

        offset = 0
        scenes = []
        sceneids = []
        scene_img_list = []
        images = []
        start_img_ids = []

        j = 0  # index of the scene among the *kept* scenes
        for scene in tqdm(self.scenes):
            scene_dir = osp.join(self.ROOT, scene)
            basenames = sorted(
                [f[:-4] for f in os.listdir(scene_dir) if f.endswith(".npz")],
                key=int,  # frame names are integers; sort numerically
            )

            num_imgs = len(basenames)
            # TODO (from the original author): "because current minghui's
            # training data is backward moving, now use seq from -1 to 0".
            # NOTE(review): the ids below are still built in ascending order.
            img_ids = list(np.arange(num_imgs) + offset)
            if num_imgs < cut_off:
                print(f"Skipping {scene}")
                continue
            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]

            start_img_ids.extend([(scene, id_) for id_ in start_img_ids_])
            sceneids.extend([j] * num_imgs)
            images.extend(basenames)
            scenes.append(scene)
            scene_img_list.append(img_ids)

            # offset groups
            offset += num_imgs
            j += 1

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.start_img_ids = start_img_ids
        self.scene_img_list = scene_img_list

        # Scenes are flagged True here the first time one of their frames
        # fails to load (see ``_get_views``).
        self.invalid_scenes = {scene: False for scene in self.scenes}

    def __len__(self):
        """Return the number of valid start frames (one sample each)."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed images."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views, falling back to other scenes on load errors.

        Args:
            idx: Index into ``self.start_img_ids``; replaced by a random index
                whenever the current scene is flagged invalid.
            resolution: Forwarded to ``self._crop_resize_if_necessary``.
            rng: Random generator used for re-sampling and by the helpers.
            num_views: Number of views to return.

        Returns:
            A list of ``num_views`` per-view dicts.

        Raises:
            RuntimeError: If no valid sequence is found within the retry
                budget, or if no valid scene can be drawn.
        """
        # Scene name and start image id of the requested sample.
        scene, start_id = self.start_img_ids[idx]

        # Bound the number of attempts so a broken dataset cannot stall
        # distributed training forever.
        max_retries = 100

        for retry_count in range(1, max_retries + 1):
            # Emit a single warning once the 50th attempt is reached.
            if retry_count == 50:
                print(f"[HabitatHM3D WARNING] Already retried {retry_count} times for idx={idx}, scene={scene}")

            # If the current scene is flagged invalid, draw random samples
            # until one from a still-valid scene comes up.
            scene_retry = 0
            while self.invalid_scenes[scene]:
                scene_retry += 1
                if scene_retry > len(self.start_img_ids):
                    raise RuntimeError(
                        f"[HabitatHM3D] All scenes are invalid! Cannot find valid scene after {scene_retry} attempts."
                    )
                idx = rng.integers(low=0, high=len(self.start_img_ids))
                scene, start_id = self.start_img_ids[idx]

            # All global image ids belonging to the current scene.
            all_image_ids = self.scene_img_list[self.sceneids[start_id]]
            # Sample positions inside the scene and the "ordered video" flag.
            pos, ordered_video = self.get_seq_from_start_id(
                num_views, start_id, all_image_ids, rng, max_interval=self.max_interval
            )
            image_idxs = np.array(all_image_ids)[pos]

            views = []
            for view_idx in image_idxs:
                scene_id = self.sceneids[view_idx]
                scene_dir = osp.join(self.ROOT, self.scenes[scene_id])

                basename = self.images[view_idx]

                try:
                    # Load RGB image
                    rgb_image = imread_cv2(osp.join(scene_dir, "image_" + basename + ".png"))
                    # Load depthmap
                    depthmap = imread_cv2(
                        osp.join(scene_dir, "depth_" + basename + ".png"), cv2.IMREAD_UNCHANGED
                    )
                    depthmap = depthmap.astype(np.float32) / 1000
                    depthmap[~np.isfinite(depthmap)] = 0  # invalid

                    # Close the archive once read to avoid leaking a file
                    # descriptor per view.
                    with np.load(osp.join(scene_dir, basename + ".npz")) as camera_params:
                        intrinsics = np.float32(camera_params["intrinsics"])
                        camera_pose = np.eye(4, dtype=np.float32)
                        camera_pose[:3, :3] = camera_params["R_cam2world"]
                        camera_pose[:3, 3] = camera_params["t_cam2world"]
                except Exception as e:
                    # Deliberate best-effort: flag the scene and retry with
                    # another one instead of crashing the worker.
                    print(f"[HabitatHM3D] Error loading {scene} {basename}: {e}, skipping scene")
                    self.invalid_scenes[scene] = True
                    break

                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
                )

                views.append(
                    dict(
                        img=rgb_image,
                        depthmap=depthmap.astype(np.float32),
                        camera_pose=camera_pose.astype(np.float32),
                        camera_intrinsics=intrinsics.astype(np.float32),
                        dataset="habitatHM3D",
                        label=self.scenes[scene_id] + "_" + basename,
                        instance=f"{str(idx)}_{str(view_idx)}",
                        is_metric=self.is_metric,
                        is_video=ordered_video,
                        quantile=np.array(0.98, dtype=np.float32),
                        img_mask=True,
                        ray_mask=False,
                        camera_only=True,
                        depth_only=False,
                        single_view=False,
                        reset=False,
                    )
                )
            else:
                # Reached only when no frame failed to load; succeed only if
                # every requested view is present.
                if len(views) == num_views:
                    return views

        raise RuntimeError(
            f"[HabitatHM3D] Failed to get valid views after {max_retries} retries. "
            f"idx={idx}, scene={scene}, num_views={num_views}. "
            f"This may indicate insufficient valid frames in the dataset."
        )
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/hoi4d.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import itertools
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
sys.path.append(osp.join(osp.dirname(__file__), '..','..'))
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 10 |
+
from dust3r.utils.image import imread_cv2
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class HOI4D_Multi(BaseMultiViewDataset):
    """Loader that draws independent frames from ``ROOT/<scene>/{rgb,depth,cam}``.

    Views of a sample are drawn at random from the whole dataset, so they are
    emitted with ``single_view=True`` and an identity placeholder pose.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Store dataset settings, initialise the base class, then index the data.

        Args:
            ROOT: Directory containing one folder per scene.
            *args, **kwargs: Forwarded unchanged to ``BaseMultiViewDataset``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        super().__init__(*args, **kwargs)
        # ``_load_data`` returns None, so ``loaded_data`` is None.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Collect ``(scene, basename)`` for every PNG under each ``rgb`` folder."""
        img_names = []
        # os.listdir returns entries in arbitrary order; sort so the frame
        # list is the same on every machine and process.
        for scene in sorted(os.listdir(self.ROOT)):
            rgb_dir = osp.join(self.ROOT, scene, 'rgb')
            basenames = sorted(f[:-4] for f in os.listdir(rgb_dir) if f.endswith('.png'))
            img_names.extend((scene, basename) for basename in basenames)

        self.img_names = img_names

    def __len__(self):
        """Return the number of indexed frames."""
        return len(self.img_names)

    def get_image_num(self):
        """Return the number of indexed frames."""
        return len(self.img_names)

    def _get_views(self, idx, resolution, rng, num_views, max_retries=100):
        """Draw ``num_views`` distinct random frames and load them.

        Args:
            idx: Sample index; only used to derive the private random seed.
            resolution: Forwarded to ``self._crop_resize_if_necessary``.
            rng: Random generator; seeds the frame selection and is passed to
                the crop/resize helper.
            num_views: Number of frames to draw (without replacement).
            max_retries: Maximum number of draws before giving up.

        Returns:
            A list of ``num_views`` per-view dicts.

        Raises:
            RuntimeError: If no complete set of views could be loaded within
                ``max_retries`` draws.
        """
        new_seed = rng.integers(0, 2**32) + idx
        new_rng = np.random.default_rng(new_seed)

        # Bounded retry: an unbounded loop would stall the worker forever if
        # the data on disk is broken.
        for _ in range(max_retries):
            # Draw row indices instead of rows so the full (scene, name) list
            # is not converted to a NumPy array on every call.
            picks = new_rng.choice(len(self.img_names), num_views, replace=False)

            views = []
            for pick in picks:
                scene, img_name = self.img_names[pick]
                try:
                    # Load RGB image
                    rgb_image = imread_cv2(osp.join(self.ROOT, scene, "rgb", f"{img_name}.png"))
                    depthmap = np.load(osp.join(self.ROOT, scene, "depth", f"{img_name}.npy"))
                    depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)

                    # Close the archive once read to avoid leaking a file
                    # descriptor per view.
                    with np.load(osp.join(self.ROOT, scene, "cam", f"{img_name}.npz")) as cam:
                        intrinsics = cam["intrinsics"]
                except Exception as e:
                    # ``Exception`` (not a bare except) so Ctrl-C and
                    # SystemExit still propagate; redraw the whole set.
                    print(f"Error loading {scene} {img_name}: {e}, skipping")
                    break
                # camera pose is not provided, placeholder
                camera_pose = np.eye(4)

                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name)

                views.append(dict(
                    img=rgb_image,
                    depthmap=depthmap.astype(np.float32),
                    camera_pose=camera_pose.astype(np.float32),
                    camera_intrinsics=intrinsics.astype(np.float32),
                    dataset='HOI4D',
                    label=img_name,
                    instance=osp.join(self.ROOT, scene, "rgb", f"{img_name}.png"),
                    is_metric=self.is_metric,
                    is_video=False,
                    quantile=np.array(0.99, dtype=np.float32),
                    img_mask=True,
                    ray_mask=False,
                    camera_only=False,
                    depth_only=False,
                    single_view=True,
                    reset=True,
                ))
            if len(views) == num_views:
                return views

        raise RuntimeError(
            f"[HOI4D] Failed to load {num_views} views after {max_retries} attempts (idx={idx})."
        )
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/hypersim.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
+
import itertools
|
| 5 |
+
|
| 6 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 7 |
+
import cv2
|
| 8 |
+
import numpy as np
|
| 9 |
+
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_pil
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class HyperSim_Multi(BaseMultiViewDataset):
    """Multi-view loader for folders laid out as ``ROOT/<scene>/<subscene>/``.

    Each sub-scene folder holds ``*rgb.png`` images with matching
    ``*depth.npy`` and ``*cam.npz`` files. Every start frame is exposed ten
    times through ``__len__``.
    """

    def __init__(self, *args, split, ROOT, **kwargs):
        """Store dataset settings, initialise the base class, then index the data.

        Args:
            split: Accepted as a keyword but not forwarded to the base class.
            ROOT: Directory containing one folder per scene.
            *args, **kwargs: Forwarded unchanged to ``BaseMultiViewDataset``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 4
        super().__init__(*args, **kwargs)

        # ``_load_data`` returns None, so ``loaded_data`` is None.
        self.loaded_data = self._load_data()
        print('DATA: hypersim', len(self))

    def _load_data(self):
        """Index all non-empty sub-scenes and build the flat lookup tables.

        Populates ``self.all_scenes``, ``self.scenes``, ``self.sceneids``,
        ``self.images``, ``self.scene_img_list`` and ``self.start_img_ids``.
        """
        self.all_scenes = sorted(
            [f for f in os.listdir(self.ROOT) if os.path.isdir(osp.join(self.ROOT, f))]
        )
        subscenes = []
        for scene in self.all_scenes:
            # keep only sub-folders that are not empty
            subscenes.extend(
                [
                    osp.join(scene, f)
                    for f in os.listdir(osp.join(self.ROOT, scene))
                    if os.path.isdir(osp.join(self.ROOT, scene, f))
                    and len(os.listdir(osp.join(self.ROOT, scene, f))) > 0
                ]
            )

        # Minimum number of frames a sub-scene must provide (loop-invariant).
        cut_off = (
            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
        )

        offset = 0
        scenes = []
        sceneids = []
        images = []
        start_img_ids = []
        scene_img_list = []
        j = 0  # index of the sub-scene among the *kept* sub-scenes
        for scene in subscenes:
            scene_dir = osp.join(self.ROOT, scene)
            rgb_paths = sorted([f for f in os.listdir(scene_dir) if f.endswith(".png")])
            assert len(rgb_paths) > 0, f"{scene_dir} is empty."
            num_imgs = len(rgb_paths)
            if num_imgs < cut_off:
                print(f"Skipping {scene}")
                continue
            img_ids = list(np.arange(num_imgs) + offset)
            # Only frames with at least ``cut_off`` frames from themselves to
            # the end of the sub-scene may start a sequence.
            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]

            scenes.append(scene)
            scene_img_list.append(img_ids)
            sceneids.extend([j] * num_imgs)
            images.extend(rgb_paths)
            start_img_ids.extend(start_img_ids_)
            offset += num_imgs
            j += 1

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.scene_img_list = scene_img_list
        self.start_img_ids = start_img_ids

    def __len__(self):
        """Return ten times the number of valid start frames."""
        return len(self.start_img_ids) * 10

    def get_image_num(self):
        """Return the total number of indexed images."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views for sample ``idx``.

        ``idx`` ranges over ``len(self)``; it is integer-divided by 10 to
        recover the underlying start frame.

        Returns:
            A list of ``num_views`` per-view dicts.
        """
        idx = idx // 10
        start_id = self.start_img_ids[idx]
        scene_id = self.sceneids[start_id]
        all_image_ids = self.scene_img_list[scene_id]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            start_id,
            all_image_ids,
            rng,
            max_interval=self.max_interval,
            block_shuffle=16,
        )
        image_idxs = np.array(all_image_ids)[pos]
        views = []
        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])

            rgb_path = self.images[view_idx]
            # NOTE(review): assumes every indexed PNG is named "*rgb.png";
            # otherwise these paths equal ``rgb_path`` - confirm.
            depth_path = rgb_path.replace("rgb.png", "depth.npy")
            cam_path = rgb_path.replace("rgb.png", "cam.npz")

            rgb_image = imread_pil(osp.join(scene_dir, rgb_path))
            depthmap = np.load(osp.join(scene_dir, depth_path)).astype(np.float32)
            depthmap[~np.isfinite(depthmap)] = 0  # invalid

            # Close the archive once read to avoid leaking a file descriptor
            # per view.
            with np.load(osp.join(scene_dir, cam_path)) as cam_file:
                intrinsics = cam_file["intrinsics"].astype(np.float32)
                camera_pose = cam_file["pose"].astype(np.float32)

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
            )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap.astype(np.float32),
                    camera_pose=camera_pose.astype(np.float32),
                    camera_intrinsics=intrinsics.astype(np.float32),
                    dataset="hypersim",
                    label=self.scenes[scene_id] + "_" + rgb_path,
                    instance=f"{str(idx)}_{str(view_idx)}",
                    is_metric=self.is_metric,
                    is_video=ordered_video,
                    quantile=np.array(1.0, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/kitti360.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""KITTI-360 training dataset loader (real outdoor).
|
| 2 |
+
|
| 3 |
+
cam_00 perspective images (rectified) + poses.txt + perspective.txt P_rect_00
|
| 4 |
+
intrinsics. Optional Velodyne HDL64 sparse depth supervision when data_3d_raw
|
| 5 |
+
is on disk (loaded from `velodyne_root`, defaults to ROOT).
|
| 6 |
+
|
| 7 |
+
Layout expected:
|
| 8 |
+
ROOT/data_2d_raw/<sequence>/image_00/data_rect/{NNNNNNNNNN}.png
|
| 9 |
+
ROOT/data_poses/<sequence>/poses.txt # frame_idx + 12 floats c2w
|
| 10 |
+
ROOT/calibration/perspective.txt # P_rect_00 + R_rect_00
|
| 11 |
+
ROOT/calibration/calib_cam_to_velo.txt # cam0→velo (3×4)
|
| 12 |
+
velodyne_root/data_3d_raw/<sequence>/velodyne_points/data/{NNNNNNNNNN}.bin # optional
|
| 13 |
+
|
| 14 |
+
Train/test split (cvlibs convention):
|
| 15 |
+
train: 0000, 0002, 0003, 0004, 0005, 0006, 0009
|
| 16 |
+
test: 0007, 0010
|
| 17 |
+
"""
|
| 18 |
+
import os
|
| 19 |
+
import os.path as osp
|
| 20 |
+
import sys
|
| 21 |
+
|
| 22 |
+
import cv2
|
| 23 |
+
import numpy as np
|
| 24 |
+
|
| 25 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 26 |
+
|
| 27 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 28 |
+
from dust3r.utils.image import imread_cv2
|
| 29 |
+
|
| 30 |
+
# Sequence folder names of the train/test split, built from the drive numbers.
TRAIN_SEQS = [
    f"2013_05_28_drive_{drive:04d}_sync" for drive in (0, 2, 3, 4, 5, 6, 9)
]
TEST_SEQS = [f"2013_05_28_drive_{drive:04d}_sync" for drive in (7, 10)]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _parse_perspective_intrinsics(path: str):
    """Read the ``*_rect_00`` entries of a ``perspective.txt`` calibration file.

    Returns:
        ``(P_rect_00, R_rect_00, S_rect_00)``: a (3, 4) float64 array, a
        (3, 3) float64 array (identity when the entry is absent) and a
        ``(W, H)`` int tuple (``None`` when the entry is absent).

    Raises:
        RuntimeError: If the file has no ``P_rect_00:`` line.
    """
    matrix_shapes = {"P_rect_00:": (3, 4), "R_rect_00:": (3, 3)}
    size_tag = "S_rect_00:"
    found = {}

    with open(path) as fh:
        for raw in fh:
            entry = raw.strip()
            tag = next(
                (t for t in (*matrix_shapes, size_tag) if entry.startswith(t)),
                None,
            )
            if tag is None:
                continue  # not one of the entries we care about
            numbers = [float(tok) for tok in entry.split()[1:]]
            if tag == size_tag:
                found[tag] = (int(numbers[0]), int(numbers[1]))  # (W, H)
            else:
                found[tag] = np.array(numbers, dtype=np.float64).reshape(
                    matrix_shapes[tag]
                )

    if "P_rect_00:" not in found:
        raise RuntimeError(f"P_rect_00 missing in {path}")
    rectification = found.get("R_rect_00:")
    if rectification is None:
        rectification = np.eye(3, dtype=np.float64)
    return found["P_rect_00:"], rectification, found.get(size_tag)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _parse_cam_to_velo(path: str):
    """Read ``calibration/calib_cam_to_velo.txt`` as a 4x4 homogeneous matrix.

    Only the first line of the file is read; it must hold exactly 12
    whitespace-separated floats, interpreted as a row-major 3x4 rigid
    transform. A ``[0, 0, 0, 1]`` row is appended to make it 4x4.

    Returns:
        (4, 4) float64 array ``T_cam0_to_velo``.

    Raises:
        RuntimeError: if the first line does not contain 12 floats.
    """
    with open(path) as fh:
        first_line = fh.readline()
    numbers = [float(tok) for tok in first_line.strip().split()]
    if len(numbers) != 12:
        raise RuntimeError(f"Expected 12 floats in {path}, got {len(numbers)}")
    top_rows = np.asarray(numbers, dtype=np.float64).reshape(3, 4)
    bottom_row = np.array([[0.0, 0.0, 0.0, 1.0]], dtype=np.float64)
    return np.vstack([top_rows, bottom_row])
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _load_velodyne_bin(bin_path: str) -> np.ndarray:
    """Read a KITTI-360 velodyne ``.bin`` file as an (N, 4) float32 array.

    The file is read as a flat float32 buffer and viewed as rows of four
    values, [x, y, z, reflectance].
    """
    flat = np.fromfile(bin_path, dtype=np.float32)
    return flat.reshape((-1, 4))
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _project_velo_to_depth_kitti360(velo_pts, P_rect_00, T_velo_to_cam_rect, H, W,
                                    min_depth=0.5, max_depth=80.0):
    """Project a KITTI-360 velodyne scan onto image_00 (rectified) → sparse depthmap.

    pixel_h = P_rect_00 @ T_velo_to_cam_rect @ velo_h
    where T_velo_to_cam_rect = R_rect_00 @ inv(T_cam0_to_velo) (4×4 incorporating
    rectification).

    Args:
        velo_pts: (N, >=3) array; only the first three columns (x, y, z) are used.
        P_rect_00: (3, 4) projection matrix.
        T_velo_to_cam_rect: (4, 4) velodyne → rectified cam0 transform.
        H, W: output depthmap height and width in pixels.
        min_depth: points whose camera-frame z (and projected z) is not strictly
            greater than this are discarded.
        max_depth: points whose projected z is not strictly smaller than this
            are discarded.

    Returns:
        (H, W) float32 depthmap; pixels without a lidar return hold -1.0.
        When several points land on the same pixel the smallest z is kept.
    """
    depthmap = np.full((H, W), -1.0, dtype=np.float32)

    pts_h = np.concatenate(
        [velo_pts[:, :3].astype(np.float64), np.ones((velo_pts.shape[0], 1))],
        axis=1,
    )
    cam = pts_h @ T_velo_to_cam_rect.T  # (N,4) in rectified cam0 frame
    cam = cam[cam[:, 2] > min_depth]
    if cam.shape[0] == 0:
        return depthmap

    uv_h = cam @ P_rect_00.T  # (M,3)
    z = uv_h[:, 2]
    valid = z > min_depth
    z = z[valid]
    u = uv_h[valid, 0] / z
    v = uv_h[valid, 1] / z
    in_img = (u >= 0) & (u < W) & (v >= 0) & (v < H) & (z < max_depth)
    u = u[in_img].astype(np.int64)  # u, v >= 0 here, so truncation == floor
    v = v[in_img].astype(np.int64)
    z = z[in_img]
    if z.size == 0:
        return depthmap

    # Resolve pixel collisions explicitly instead of relying on the write order
    # of a fancy-index assignment with repeated indices (NumPy leaves that
    # order unspecified): sort nearest-first, then keep the first occurrence
    # of every pixel so each pixel is written exactly once.
    nearest_first = np.argsort(z, kind="stable")
    flat_pixel = (v * W + u)[nearest_first]
    _, first_hit = np.unique(flat_pixel, return_index=True)
    keep = nearest_first[first_hit]
    depthmap[v[keep], u[keep]] = z[keep].astype(np.float32)
    return depthmap
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _load_kitti360_poses(path: str):
    """Read cam0_to_world.txt → dict[frame_idx] = (4,4) c2w matrix.

    KITTI-360 ships TWO pose files per sequence:
      - poses.txt          : IMU/system pose (NOT camera pose)
      - cam0_to_world.txt  : actual camera-to-world for cam_00
    The cam0 file has full 4x4 rows (16 floats); poses.txt is 3x4 (12 floats).
    Using poses.txt makes pmap loss inconsistent with depth (~1m offset).
    Note: not every frame has a pose (gaps where SLAM failed); skip missing.

    Each row is ``frame_idx`` followed by the matrix values. Rows with at least
    17 columns are read as a full 4x4; shorter rows are read as a 3x4 with a
    ``[0, 0, 0, 1]`` bottom row added.

    Returns:
        dict mapping int frame index to a (4, 4) float32 array. Empty when the
        file contains no data.
    """
    # ndmin=2 keeps a one-line file as a single (1, C) row; without it loadtxt
    # returns a 1-D array and the loop below would iterate over scalars.
    raw = np.loadtxt(path, ndmin=2)
    out = {}
    if raw.size == 0:
        return out
    for row in raw:
        fid = int(row[0])
        if row.shape[0] >= 17:  # cam0_to_world.txt: 1 + 16
            T = row[1:17].reshape(4, 4).astype(np.float32)
        else:  # poses.txt fallback: 1 + 12
            T = np.eye(4, dtype=np.float32)
            T[:3, :] = row[1:13].reshape(3, 4).astype(np.float32)
        out[fid] = T
    return out
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class KITTI360_Multi(BaseMultiViewDataset):
    """KITTI-360 perspective cam_00.

    Camera-only by default; depth supervision activates per-frame when a Velodyne
    .bin scan is present at velodyne_root/data_3d_raw/<seq>/velodyne_points/data/<fid>.bin.
    """

    def __init__(self, ROOT, *args, velodyne_root=None, **kwargs):
        """Index the KITTI-360 sequences of ``self.split`` found under ``ROOT``.

        Args:
            ROOT: dataset root containing ``calibration/``, ``data_2d_raw/``
                and ``data_poses/``.
            velodyne_root: root containing ``data_3d_raw/``. If falsy, ``ROOT``
                is used (in-place download).
            *args, **kwargs: forwarded to ``BaseMultiViewDataset``.
        """
        self.ROOT = ROOT
        # Velodyne root for data_3d_raw/. If None, look under ROOT (in-place download).
        self.velodyne_root = velodyne_root if velodyne_root else ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 4
        super().__init__(*args, **kwargs)
        self._load_data(self.split)

    @staticmethod
    def _kitti360_list_frames(img_dir, poses_dict):
        """Return the sorted frame ids in ``img_dir`` usable for sampling.

        A frame is usable when its file is named ``<int>.png``, the id has an
        entry in ``poses_dict`` and the file is not empty. Zero-byte
        placeholders left over from partial / aborted downloads are skipped
        (they would crash imread_cv2 at sample time).
        """
        usable = []
        for fname in os.listdir(img_dir):
            if not fname.endswith(".png"):
                continue
            try:
                fid = int(osp.splitext(fname)[0])
            except ValueError:
                continue
            if fid not in poses_dict:
                continue
            try:
                if osp.getsize(osp.join(img_dir, fname)) <= 0:
                    continue
            except OSError:
                continue
            usable.append(fid)
        usable.sort()
        return usable

    def _kitti360_sparse_depth(self, velo_dir, fid, H, W):
        """Return ``(depthmap, frame_has_lidar)`` for frame ``fid``.

        When lidar calibration is loaded, ``velo_dir`` is set and the frame's
        ``.bin`` scan exists, the scan is projected onto image_00 and
        ``frame_has_lidar`` tells whether any pixel received a positive depth.
        Otherwise an all ``-1.0`` (H, W) float32 map and ``False`` are returned.
        """
        if velo_dir is not None and self.T_velo_to_cam_rect is not None:
            bin_path = osp.join(velo_dir, f"{fid:010d}.bin")
            if osp.isfile(bin_path):
                depthmap = _project_velo_to_depth_kitti360(
                    _load_velodyne_bin(bin_path),
                    self.P_rect_00,
                    self.T_velo_to_cam_rect,
                    H,
                    W,
                )
                return depthmap, bool((depthmap > 0).any())
        return np.full((H, W), -1.0, dtype=np.float32), False

    def _load_data(self, split=None):
        """Load calibration and build the per-sequence frame / pose index.

        ``split == "train"`` selects ``TRAIN_SEQS``; any other value selects
        ``TEST_SEQS``. Sequences with a missing image directory or pose file,
        no usable frame, or fewer usable frames than the cut-off are skipped.
        """
        # Intrinsics + rectification (shared across all KITTI-360 sequences)
        calib_dir = osp.join(self.ROOT, "calibration")
        P_rect, R_rect, _ = _parse_perspective_intrinsics(
            osp.join(calib_dir, "perspective.txt")
        )
        self.P_rect_00 = P_rect.copy()
        self.K = P_rect[:, :3].copy().astype(np.float32)

        # T_cam0→velo from calib_cam_to_velo.txt; lidar projection needs the inverse,
        # composed with R_rect_00 to land in rectified cam0 frame.
        cam_to_velo_path = osp.join(calib_dir, "calib_cam_to_velo.txt")
        if osp.isfile(cam_to_velo_path):
            T_cam_to_velo = _parse_cam_to_velo(cam_to_velo_path)  # (4,4)
            T_velo_to_cam = np.linalg.inv(T_cam_to_velo)
            R_rect_h = np.eye(4, dtype=np.float64)
            R_rect_h[:3, :3] = R_rect
            self.T_velo_to_cam_rect = R_rect_h @ T_velo_to_cam  # (4,4)
        else:
            self.T_velo_to_cam_rect = None  # lidar disabled

        seq_ids = TRAIN_SEQS if split == "train" else TEST_SEQS
        # Minimum number of usable frames a sequence needs; it does not depend
        # on the sequence, so compute it once.
        cut_off = (
            self.num_views
            if not self.allow_repeat
            else max(self.num_views // 3, 3)
        )

        scenes = []
        seq_poses = []  # list of (M_i, 4, 4) per scene
        seq_frame_ids = []  # list of [frame_idx, ...] (only those with poses + image)
        seq_velo_dir = []  # absolute velodyne dir per seq, or None
        scene_img_list = []
        sceneids = []
        start_img_ids = []
        offset = 0
        j = 0

        for sid in seq_ids:
            img_dir = osp.join(self.ROOT, "data_2d_raw", sid, "image_00", "data_rect")
            pose_path = osp.join(self.ROOT, "data_poses", sid, "cam0_to_world.txt")
            if not osp.isdir(img_dir) or not osp.isfile(pose_path):
                continue

            poses_dict = _load_kitti360_poses(pose_path)
            avail = self._kitti360_list_frames(img_dir, poses_dict)
            if not avail:
                continue

            poses = np.stack([poses_dict[f] for f in avail], axis=0)
            n_imgs = len(avail)
            if n_imgs < cut_off:
                continue

            # Velodyne dir (per-seq); set to None if absent → frame falls back to camera-only.
            velo_dir = osp.join(
                self.velodyne_root, "data_3d_raw", sid, "velodyne_points", "data"
            )
            velo_dir = velo_dir if osp.isdir(velo_dir) else None

            # Global image ids of this sequence; only the first
            # n_imgs - cut_off + 1 of them are registered as start frames.
            img_ids = list(np.arange(n_imgs) + offset)
            start_img_ids_ = img_ids[: n_imgs - cut_off + 1]

            scenes.append(sid)
            seq_poses.append(poses)
            seq_frame_ids.append(np.asarray(avail, dtype=np.int64))
            seq_velo_dir.append(velo_dir)
            scene_img_list.append(img_ids)
            sceneids.extend([j] * n_imgs)
            start_img_ids.extend(start_img_ids_)
            offset += n_imgs
            j += 1

        self.scenes = scenes
        self.seq_poses = seq_poses
        self.seq_frame_ids = seq_frame_ids
        self.seq_velo_dir = seq_velo_dir
        self.scene_img_list = scene_img_list
        self.sceneids = sceneids
        self.start_img_ids = start_img_ids

    def __len__(self):
        """Number of registered start frames (one sample per start frame)."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Total number of indexed frames across all kept sequences."""
        return sum(len(p) for p in self.seq_poses)

    def get_stats(self):
        """One-line summary of the dataset size."""
        return f"{len(self)} groups across {len(self.scenes)} KITTI-360 sequences"

    def _get_views(self, idx, resolution, rng, num_views):
        """Build the list of ``num_views`` view dicts for sample ``idx``.

        Frame selection is delegated to ``get_seq_from_start_id``; the
        positions it returns are used directly as indices into this
        sequence's frame-id and pose arrays (presumably positions within
        ``all_image_ids`` — confirm against the base class).
        """
        start_id = self.start_img_ids[idx]
        scene_id = self.sceneids[start_id]
        all_image_ids = self.scene_img_list[scene_id]
        sid = self.scenes[scene_id]
        img_dir = osp.join(self.ROOT, "data_2d_raw", sid, "image_00", "data_rect")
        frame_ids = self.seq_frame_ids[scene_id]
        poses = self.seq_poses[scene_id]
        velo_dir = self.seq_velo_dir[scene_id]

        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            start_id,
            all_image_ids,
            rng,
            max_interval=self.max_interval,
            video_prob=1.0,
            fix_interval_prob=0.9,
        )
        local_idxs = np.asarray(pos, dtype=int)

        views = []
        for v, lid in enumerate(local_idxs):
            lid = int(lid)
            fid = int(frame_ids[lid])
            img_path = osp.join(img_dir, f"{fid:010d}.png")
            image = imread_cv2(img_path)
            H, W = image.shape[:2]

            # If velodyne available, project lidar to image_00 → sparse depthmap.
            depthmap, frame_has_lidar = self._kitti360_sparse_depth(
                velo_dir, fid, H, W
            )

            intrinsics = self.K.copy()
            camera_pose = poses[lid].astype(np.float32)

            image, depthmap, intrinsics = self._crop_resize_if_necessary(
                image, depthmap, intrinsics, resolution, rng, info=(img_dir, img_path)
            )

            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.85, 0.1, 0.05]
            )

            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,
                    camera_intrinsics=intrinsics,
                    dataset="KITTI360",
                    label=img_dir,
                    is_metric=self.is_metric,
                    instance=f"{sid}/image_00/{fid:010d}.png",
                    is_video=ordered_video,
                    quantile=np.array(1.0, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=not frame_has_lidar,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                    scene_tag=f"kitti360/{sid}",
                )
            )
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/mapfree.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import numpy as np
|
| 3 |
+
import cv2
|
| 4 |
+
import numpy as np
|
| 5 |
+
import itertools
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import pickle
|
| 9 |
+
import h5py
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
|
| 12 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 13 |
+
|
| 14 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 15 |
+
from dust3r.utils.image import imread_cv2
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class MapFree_Multi(BaseMultiViewDataset):
    """Multi-view dataset over MapFree scenes stored under ``ROOT``.

    Each scene directory holds ``dense<k>`` sub-sequences (with ``rgb``,
    ``depth``, ``cam`` and ``sky_mask`` folders) plus a ``metadata.pkl`` that
    maps a reference image to other images with an overlap score. A sample is
    either a run of frames from one sub-sequence or a score-weighted draw from
    one image group.
    """

    def __init__(self, ROOT, *args, **kwargs):
        """Store the dataset root, set video/metric flags and build the index."""
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 30
        super().__init__(*args, **kwargs)

        self._load_data()

    def imgid2path(self, img_id, scene):
        """Map an ``(seq_id, frame_id)`` pair to its RGB image path in ``scene``."""
        first_seq_id, first_frame_id = img_id
        return os.path.join(
            self.ROOT,
            scene,
            f"dense{first_seq_id}",
            "rgb",
            f"frame_{first_frame_id:05d}.jpg",
        )

    def path2imgid(self, subscene, filename):
        """Inverse of ``imgid2path``: parse ``dense<k>`` and ``frame_<n>.jpg``."""
        first_seq_id = int(subscene[5:])  # strip "dense"
        first_frame_id = int(filename[6:-4])  # strip "frame_" and ".jpg"
        return [first_seq_id, first_frame_id]

    def _load_data(self):
        """Load the index from the HDF5 cache, or scan ``ROOT`` and write the cache."""
        cache_file = f"{self.ROOT}/cached_metadata_50_col_only.h5"
        if os.path.exists(cache_file):
            print(f"Loading cached metadata from {cache_file}")
            with h5py.File(cache_file, "r") as hf:
                self.scenes = list(map(lambda x: x.decode("utf-8"), hf["scenes"][:]))
                self.sceneids = hf["sceneids"][:]
                self.scope = hf["scope"][:]
                self.video_flags = hf["video_flags"][:]
                self.groups = hf["groups"][:]
                self.id_ranges = hf["id_ranges"][:]
                self.images = hf["images"][:]
            return

        scene_dirs = sorted(
            d
            for d in os.listdir(self.ROOT)
            if os.path.isdir(os.path.join(self.ROOT, d))
        )
        scenes = []
        sceneids = []
        groups = []
        scope = []
        images = []
        id_ranges = []
        is_video = []
        start = 0
        j = 0
        offset = 0

        for scene in tqdm(scene_dirs):
            scenes.append(scene)
            # video sequences
            subscenes = sorted(
                d
                for d in os.listdir(os.path.join(self.ROOT, scene))
                if d.startswith("dense")
            )
            id_range_subscenes = []
            for subscene in subscenes:
                rgb_paths = sorted(
                    d
                    for d in os.listdir(
                        os.path.join(self.ROOT, scene, subscene, "rgb")
                    )
                    if d.endswith(".jpg")
                )
                assert (
                    len(rgb_paths) > 0
                ), f"{os.path.join(self.ROOT, scene, subscene)} is empty."
                num_imgs = len(rgb_paths)
                images.extend(
                    [self.path2imgid(subscene, rgb_path) for rgb_path in rgb_paths]
                )
                id_range_subscenes.append((offset, offset + num_imgs))
                offset += num_imgs

            # image collections
            # NOTE: pickle can execute arbitrary code on load; ROOT must only
            # point at trusted dataset files.
            # The with-block closes the handle; the previous bare open() left
            # one file descriptor open per scene.
            with open(os.path.join(self.ROOT, scene, "metadata.pkl"), "rb") as fh:
                metadata = pickle.load(fh)
            img_groups = []
            for ref_img, other_imgs in metadata.items():
                if len(other_imgs) + 1 < self.num_views:
                    continue
                # Rows are (seq_id, frame_id, overlap_score); the reference
                # image goes first with score 1.
                group = [(*other_img[0], other_img[1]) for other_img in other_imgs]
                group.insert(0, (*ref_img, 1))
                img_groups.append(np.array(group))
                id_ranges.append(id_range_subscenes[ref_img[0]])
                scope.append(start)
                start = start + len(group)

            num_groups = len(img_groups)
            sceneids.extend([j] * num_groups)
            groups.extend(img_groups)
            is_video.extend([False] * num_groups)
            j += 1

        self.scenes = np.array(scenes)
        self.sceneids = np.array(sceneids)
        self.scope = np.array(scope)
        self.video_flags = np.array(is_video)
        self.groups = np.concatenate(groups, 0)
        self.id_ranges = np.array(id_ranges)
        self.images = np.array(images)

        data = dict(
            scenes=self.scenes,
            sceneids=self.sceneids,
            scope=self.scope,
            video_flags=self.video_flags,
            groups=self.groups,
            id_ranges=self.id_ranges,
            images=self.images,
        )

        with h5py.File(cache_file, "w") as h5f:
            h5f.create_dataset(
                "scenes",
                data=data["scenes"].astype(object),
                dtype=h5py.string_dtype(encoding="utf-8"),
                compression="lzf",
                chunks=True,
            )
            for key in (
                "sceneids",
                "scope",
                "video_flags",
                "groups",
                "id_ranges",
                "images",
            ):
                h5f.create_dataset(
                    key, data=data[key], compression="lzf", chunks=True
                )

    def __len__(self):
        """Number of image groups (one sample per group)."""
        return len(self.scope)

    def get_image_num(self):
        """Total number of indexed images."""
        return len(self.images)

    def get_stats(self):
        """One-line summary of the dataset size."""
        return f"{len(self)} groups of views"

    def _get_views(self, idx, resolution, rng, num_views):
        """Build the list of ``num_views`` view dicts for sample ``idx``.

        With probability 0.6 the views are frames of the sub-sequence attached
        to this group, selected by ``get_seq_from_start_id``; otherwise they
        are drawn from the group itself, weighted by overlap score.
        """
        scene = self.scenes[self.sceneids[idx]]
        if rng.random() < 0.6:
            ids = np.arange(self.id_ranges[idx][0], self.id_ranges[idx][1])
            cut_off = num_views if not self.allow_repeat else max(num_views // 3, 3)
            start_ids = ids[: len(ids) - cut_off + 1]
            start_id = rng.choice(start_ids)
            pos, ordered_video = self.get_seq_from_start_id(
                num_views,
                start_id,
                ids.tolist(),
                rng,
                max_interval=self.max_interval,
                video_prob=0.8,
                fix_interval_prob=0.5,
                block_shuffle=16,
            )
            ids = np.array(ids)[pos]
            image_idxs = self.images[ids]
        else:
            ordered_video = False
            seq_start_index = self.scope[idx]
            # A None end index slices to the end, which covers the last group.
            seq_end_index = self.scope[idx + 1] if idx < len(self.scope) - 1 else None
            image_idxs = self.groups[seq_start_index:seq_end_index]
            image_idxs, overlap_scores = image_idxs[:, :2], image_idxs[:, 2]
            # Sample with replacement when repeats are allowed or when too few
            # entries have a positive score to fill num_views.
            replace = bool(
                self.allow_repeat
                or len(overlap_scores[overlap_scores > 0]) < num_views
            )
            image_idxs = rng.choice(
                image_idxs,
                num_views,
                replace=replace,
                p=overlap_scores / np.sum(overlap_scores),
            )
            image_idxs = image_idxs.astype(np.int64)

        views = []
        for v, view_idx in enumerate(image_idxs):
            img_path = self.imgid2path(view_idx, scene)
            depth_path = img_path.replace("rgb", "depth").replace(".jpg", ".npy")
            cam_path = img_path.replace("rgb", "cam").replace(".jpg", ".npz")
            sky_mask_path = img_path.replace("rgb", "sky_mask")
            image = imread_cv2(img_path)
            depthmap = np.load(depth_path)
            camera_params = np.load(cam_path)
            sky_mask = cv2.imread(sky_mask_path, cv2.IMREAD_UNCHANGED) >= 127

            intrinsics = camera_params["intrinsic"].astype(np.float32)
            camera_pose = camera_params["pose"].astype(np.float32)

            # Sky pixels are marked -1; far (> 400), non-finite and top-2%
            # depths are zeroed.
            depthmap[sky_mask] = -1.0
            depthmap[depthmap > 400.0] = 0.0
            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
            positive = depthmap[depthmap > 0]
            threshold = np.percentile(positive, 98) if positive.size > 0 else 0
            depthmap[depthmap > threshold] = 0.0

            image, depthmap, intrinsics = self._crop_resize_if_necessary(
                image, depthmap, intrinsics, resolution, rng, info=img_path
            )
            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
            )

            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,  # cam2world
                    camera_intrinsics=intrinsics,
                    dataset="MapFree",
                    label=img_path,
                    is_metric=self.is_metric,
                    instance=img_path,
                    is_video=ordered_video,
                    quantile=np.array(0.96, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/megadepth.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import numpy as np
|
| 3 |
+
import itertools
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
|
| 7 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 8 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 9 |
+
from dust3r.utils.image import imread_cv2, imread_pil
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class MegaDepth_Multi(BaseMultiViewDataset):
    """Multi-view dataset over precomputed MegaDepth image sets.

    The index comes from ``ROOT/megadepth_sets_64.npz``, which provides
    ``scenes``, ``images`` and ``sets``. Column 0 of a set row is a scene
    index and columns 1..64 are image indices.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Load the set index and restrict it to the requested split.

        ``split == "train"`` drops scenes starting with "0015" / "0022",
        ``split == "val"`` keeps only those, ``split is None`` keeps all.

        Raises:
            ValueError: for any other split value.
        """
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        self._load_data(self.split)
        self.is_metric = False
        if self.split is None:
            pass
        elif self.split == "train":
            self.select_scene(("0015", "0022"), opposite=True)
        elif self.split == "val":
            self.select_scene(("0015", "0022"))
        else:
            raise ValueError(f"bad {self.split=}")

        print('DATA: megadepth', len(self))

    def _load_data(self, split):
        """Read scenes, images and sets from the npz index (``split`` is unused)."""
        with np.load(
            osp.join(self.ROOT, "megadepth_sets_64.npz"), allow_pickle=True
        ) as data:
            self.all_scenes = data["scenes"]
            self.all_images = data["images"]
            self.sets = data["sets"]

    def __len__(self):
        """Number of image sets currently selected."""
        return len(self.sets)

    def get_image_num(self):
        """Total number of images listed in the index."""
        return len(self.all_images)

    def get_stats(self):
        """One-line summary of the dataset size."""
        return f"{len(self)} groups from {len(self.all_scenes)} scenes"

    def select_scene(self, scene, *instances, opposite=False):
        """Keep only the sets whose scene name starts with ``scene``.

        Args:
            scene: a prefix string or an iterable of prefix strings.
            *instances: not supported; passing any raises NotImplementedError.
            opposite: if True, keep the sets that do NOT match instead.
        """
        scenes = (scene,) if isinstance(scene, str) else tuple(scene)
        scene_id = [s.startswith(scenes) for s in self.all_scenes]
        assert any(scene_id), "no scene found"
        # np.isin replaces np.in1d, which is deprecated in NumPy 2.x.
        valid = np.isin(self.sets[:, 0], np.nonzero(scene_id)[0])
        if instances:
            raise NotImplementedError("selecting instances not implemented")
        if opposite:
            valid = ~valid
        assert valid.any()
        self.sets = self.sets[valid]

    def _get_views(self, idx, resolution, rng, num_views):
        """Build ``num_views`` view dicts drawn at random from set ``idx``.

        Raises:
            OSError: if an image, depth map or camera file cannot be loaded;
                the underlying exception is attached as the cause.
        """
        scene_id = self.sets[idx][0]
        image_idxs = self.sets[idx][1:65]
        replace = bool(self.allow_repeat)
        image_idxs = rng.choice(image_idxs, num_views, replace=replace)
        scene, subscene = self.all_scenes[scene_id].split()
        seq_path = osp.join(self.ROOT, scene, subscene)
        views = []
        for im_id in image_idxs:
            img = self.all_images[im_id]
            try:
                image = imread_pil(osp.join(seq_path, img + ".jpg"))
                depthmap = imread_cv2(osp.join(seq_path, img + ".exr"))
                camera_params = np.load(osp.join(seq_path, img + ".npz"))
            except Exception as e:
                # Chain the cause so the original traceback is not lost.
                raise OSError(f"cannot load {img}, got exception {e}") from e
            intrinsics = np.float32(camera_params["intrinsics"])
            camera_pose = np.float32(camera_params["cam2world"])
            image, depthmap, intrinsics = self._crop_resize_if_necessary(
                image, depthmap, intrinsics, resolution, rng, info=(seq_path, img)
            )
            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,  # cam2world
                    camera_intrinsics=intrinsics,
                    dataset="MegaDepth",
                    label=osp.relpath(seq_path, self.ROOT),
                    is_metric=self.is_metric,
                    instance=img,
                    is_video=False,
                    quantile=np.array(0.96, dtype=np.float32),
                    img_mask=True,
                    ray_mask=False,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/mvs_synth.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import itertools
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_pil
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class MVS_Synth_Multi(BaseMultiViewDataset):
    """Multi-view dataset over MVS-Synth scenes stored under ``ROOT``.

    Files read by this class, per scene directory ``ROOT/<scene>``:
    ``rgb/<name>.jpg``, ``depth/<name>.npy`` and ``cam/<name>.npz`` (the
    latter providing the ``pose`` and ``intrinsics`` arrays).
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Record the dataset settings, run the base initialiser, index ROOT.

        Args:
            *args: forwarded unchanged to ``BaseMultiViewDataset.__init__``.
            ROOT: directory that contains one sub-directory per scene.
            **kwargs: forwarded unchanged to ``BaseMultiViewDataset.__init__``.
        """
        # These attributes are assigned before the base initialiser runs.
        self.ROOT = ROOT
        self.video = True
        self.is_metric = False
        self.max_interval = 4
        super().__init__(*args, **kwargs)
        # _load_data() fills the index attributes in place and returns None.
        self.loaded_data = self._load_data()
        print('DATA: mvs_synth', len(self))

    def _load_data(self):
        """Scan ``ROOT`` and build the flat image index used for sampling.

        Sets the following attributes:
            scenes: names of the scenes that were kept.
            sceneids: for each global image id, the index of its scene.
            images: for each global image id, the file basename.
            scene_img_list: for each kept scene, its list of global image ids.
            start_img_ids: global image ids that may start a sequence.
        """
        self.scenes = os.listdir(self.ROOT)

        kept_scenes = []
        scene_of_image = []
        ids_per_scene = []
        all_basenames = []
        sequence_starts = []

        for scene in tqdm(self.scenes):
            rgb_dir = osp.join(self.ROOT, scene, "rgb")
            frame_names = [
                name[:-4] for name in os.listdir(rgb_dir) if name.endswith(".jpg")
            ]
            frame_names.sort()
            n_frames = len(frame_names)

            # Minimum number of frames a scene needs in order to be kept.
            if self.allow_repeat:
                min_frames = max(self.num_views // 3, 3)
            else:
                min_frames = self.num_views

            if n_frames < min_frames:
                print(f"Skipping {scene}")
                continue

            # Global ids continue where the previously kept scene stopped.
            first_id = len(all_basenames)
            global_ids = list(np.arange(n_frames) + first_id)

            sequence_starts += global_ids[: n_frames - min_frames + 1]
            scene_of_image += [len(kept_scenes)] * n_frames
            all_basenames += frame_names
            kept_scenes.append(scene)
            ids_per_scene.append(global_ids)

        self.scenes = kept_scenes
        self.sceneids = scene_of_image
        self.images = all_basenames
        self.start_img_ids = sequence_starts
        self.scene_img_list = ids_per_scene

    def __len__(self):
        """Return the number of available sequence start positions."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed images across kept scenes."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views from the scene that sample ``idx`` starts in.

        Args:
            idx: index into ``self.start_img_ids``.
            resolution: forwarded to ``_crop_resize_if_necessary``.
            rng: random generator forwarded to the base-class helpers.
            num_views: number of views that must be returned.

        Returns:
            A list of ``num_views`` dicts, one per view.
        """
        first_id = self.start_img_ids[idx]
        scene_image_ids = self.scene_img_list[self.sceneids[first_id]]
        # The sampling policy lives in the base class; only the arguments
        # passed here are visible in this file.
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            first_id,
            scene_image_ids,
            rng,
            max_interval=self.max_interval,
            video_prob=1.0,
            fix_interval_prob=1.0,
        )
        image_idxs = np.array(scene_image_ids)[pos]

        views = []
        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_name = self.scenes[scene_id]
            scene_dir = osp.join(self.ROOT, scene_name)
            basename = self.images[view_idx]
            rgb_path = osp.join(scene_dir, "rgb", basename + ".jpg")

            # RGB image
            rgb_image = imread_pil(rgb_path)

            # Depth map: zero marks an invalid pixel.
            depthmap = np.load(osp.join(scene_dir, "depth", basename + ".npy"))
            depthmap[~np.isfinite(depthmap)] = 0
            positive = depthmap[depthmap > 0]
            if positive.size > 0:
                threshold = np.percentile(positive, 98)
            else:
                threshold = 0
            # Drop everything above the 98th percentile of the valid depths,
            # then anything still above the hard limit of 1000.
            depthmap[depthmap > threshold] = 0.0
            depthmap[depthmap > 1000] = 0.0

            # Camera parameters
            cam = np.load(osp.join(scene_dir, "cam", basename + ".npz"))
            camera_pose = cam["pose"]
            intrinsics = cam["intrinsics"]
            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # Image mask and ray-map mask for this view.
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.8, 0.15, 0.05]
            )

            view = {
                "img": rgb_image,
                "depthmap": depthmap.astype(np.float32),
                "camera_pose": camera_pose.astype(np.float32),
                "camera_intrinsics": intrinsics.astype(np.float32),
                "dataset": "MVS_Synth",
                "label": scene_name + "_" + basename,
                "instance": rgb_path,
                "is_metric": self.is_metric,
                "is_video": ordered_video,
                "quantile": np.array(1.0, dtype=np.float32),
                "img_mask": img_mask,
                "ray_mask": ray_mask,
                "camera_only": False,
                "depth_only": False,
                "single_view": False,
                "reset": False,
            }
            views.append(view)
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/omniobject3d.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import itertools
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
import json
|
| 8 |
+
|
| 9 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 12 |
+
from dust3r.utils.image import imread_cv2
|
| 13 |
+
import re
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def extract_number(filename):
    """Return the first run of decimal digits in ``filename`` as an int.

    Returns 0 when ``filename`` contains no digit.
    """
    digits = re.search(r"\d+", filename)
    return 0 if digits is None else int(digits.group())
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class OmniObject3D_Multi(BaseMultiViewDataset):
    """Multi-view dataset over OmniObject3D scenes stored under ``ROOT``.

    Files read by this class: ``ROOT/scale.json`` (a mapping from scene name
    to a scale value) and, per scene directory ``ROOT/<scene>``,
    ``rgb/<name>.png``, ``depth/<name>.npy`` and ``cam/<name>.npz`` (the
    latter providing the ``pose`` and ``intrinsics`` arrays).
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Record the dataset settings, run the base initialiser, index ROOT.

        Args:
            *args: forwarded unchanged to ``BaseMultiViewDataset.__init__``.
            ROOT: directory that contains ``scale.json`` and the scene folders.
            **kwargs: forwarded unchanged to ``BaseMultiViewDataset.__init__``.
        """
        self.ROOT = ROOT
        self.video = False
        self.is_metric = False
        super().__init__(*args, **kwargs)

        # _load_data() fills the index attributes in place and returns None.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Scan ``ROOT`` and build the flat image index used for sampling.

        Sets the following attributes:
            scales: content of ``ROOT/scale.json``.
            scenes: names of the scenes that were kept.
            sceneids: for each global image id, the index of its scene.
            images: for each global image id, the file basename.
            scene_img_list: for each kept scene, its list of global image ids.
            start_img_ids: ``(scene, global image id)`` pairs that may start
                a sequence.
        """
        # Only non-hidden sub-directories of ROOT count as scenes.
        candidates = []
        for entry in os.listdir(self.ROOT):
            if os.path.isdir(os.path.join(self.ROOT, entry)) and not entry.startswith('.'):
                candidates.append(entry)
        self.scenes = candidates

        with open(os.path.join(self.ROOT, "scale.json"), "r") as f:
            self.scales = json.load(f)

        kept_scenes = []
        scene_of_image = []
        ids_per_scene = []
        all_basenames = []
        sequence_starts = []

        for scene in tqdm(self.scenes):
            rgb_dir = osp.join(self.ROOT, scene, "rgb")
            # Frames are ordered by the first number found in their name.
            frame_names = [
                name[:-4] for name in os.listdir(rgb_dir) if name.endswith(".png")
            ]
            frame_names = sorted(frame_names, key=extract_number)
            n_frames = len(frame_names)

            # Minimum number of frames a scene needs in order to be kept.
            if self.allow_repeat:
                min_frames = max(self.num_views // 3, 3)
            else:
                min_frames = self.num_views

            if n_frames < min_frames:
                print(f"Skipping {scene}")
                continue

            # Global ids continue where the previously kept scene stopped.
            first_id = len(all_basenames)
            global_ids = list(np.arange(n_frames) + first_id)

            for start in global_ids[: n_frames - min_frames + 1]:
                sequence_starts.append((scene, start))
            scene_of_image += [len(kept_scenes)] * n_frames
            all_basenames += frame_names
            kept_scenes.append(scene)
            ids_per_scene.append(global_ids)

        self.scenes = kept_scenes
        self.sceneids = scene_of_image
        self.images = all_basenames
        self.start_img_ids = sequence_starts
        self.scene_img_list = ids_per_scene

    def __len__(self):
        """Return the number of available sequence start positions."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed images across kept scenes."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views from the scene that sample ``idx`` starts in.

        Args:
            idx: index into ``self.start_img_ids``.
            resolution: forwarded to ``_crop_resize_if_necessary``.
            rng: random generator forwarded to the base-class helpers.
            num_views: number of views that must be returned.

        Returns:
            A list of ``num_views`` dicts, one per view.
        """
        # The scene name stored next to the start id is not needed here; the
        # scene is recovered per view through ``self.sceneids``.
        _, first_id = self.start_img_ids[idx]
        scene_image_ids = self.scene_img_list[self.sceneids[first_id]]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views, first_id, scene_image_ids, rng, max_interval=100, video_prob=0.0
        )
        image_idxs = np.array(scene_image_ids)[pos]

        views = []
        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_name = self.scenes[scene_id]
            scene_dir = osp.join(self.ROOT, scene_name)
            basename = self.images[view_idx]

            # RGB image, depth map and camera parameters
            rgb_image = imread_cv2(osp.join(scene_dir, "rgb", basename + ".png"))
            depthmap = np.load(osp.join(scene_dir, "depth", basename + ".npy"))
            cam = np.load(osp.join(scene_dir, "cam", basename + ".npz"))
            camera_pose = cam["pose"]
            intrinsics = cam["intrinsics"]

            # Depth and camera translation are both divided by the per-scene
            # scale and then by 1000 (units are not stated in this file).
            scale = self.scales[scene_name]
            depthmap = depthmap / scale / 1000.0
            camera_pose[:3, 3] = camera_pose[:3, 3] / scale / 1000.0

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # Image mask and ray-map mask for this view.
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.8, 0.15, 0.05]
            )

            view = {
                "img": rgb_image,
                "depthmap": depthmap.astype(np.float32),
                "camera_pose": camera_pose.astype(np.float32),
                "camera_intrinsics": intrinsics.astype(np.float32),
                "dataset": "OmniObject3D",
                "label": scene_name + "_" + basename,
                "instance": f"{str(idx)}_{str(view_idx)}",
                "is_metric": self.is_metric,
                "is_video": ordered_video,
                "quantile": np.array(1.0, dtype=np.float32),
                "img_mask": img_mask,
                "ray_mask": ray_mask,
                "camera_only": False,
                "depth_only": False,
                "single_view": False,
                "reset": False,
            }
            views.append(view)
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/pointodyssey.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import itertools
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_cv2
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class PointOdyssey_Multi(BaseMultiViewDataset):
    """Multi-view dataset over PointOdyssey scenes stored under ``ROOT/<split>``.

    Only scenes named in ``self.scenes_to_use`` are indexed. Files read per
    scene directory: ``rgb/<name>.jpg``, ``depth/<name>.npy`` and
    ``cam/<name>.npz`` (the latter providing ``pose`` and ``intrinsics``).
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Record the dataset settings, run the base initialiser, index ROOT.

        Args:
            *args: forwarded unchanged to ``BaseMultiViewDataset.__init__``.
            ROOT: directory that contains one sub-directory per split.
            **kwargs: forwarded unchanged to ``BaseMultiViewDataset.__init__``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 4
        super().__init__(*args, **kwargs)
        assert self.split in ["train", "test", "val"]
        # Allow-list of scene folder names. The three 'cab_h_bench_*' scenes
        # ('cab_h_bench_3rd', 'cab_h_bench_ego1', 'cab_h_bench_ego2') are
        # deliberately left out of it.
        self.scenes_to_use = [
            "cnb_dlab_0215_3rd",
            "cnb_dlab_0215_ego1",
            "cnb_dlab_0225_3rd",
            "cnb_dlab_0225_ego1",
            "dancing",
            "dancingroom0_3rd",
            "footlab_3rd",
            "footlab_ego1",
            "footlab_ego2",
            "girl",
            "girl_egocentric",
            "human_egocentric",
            "human_in_scene",
            "human_in_scene1",
            "kg",
            "kg_ego1",
            "kg_ego2",
            "kitchen_gfloor",
            "kitchen_gfloor_ego1",
            "kitchen_gfloor_ego2",
            "scene_carb_h_tables",
            "scene_carb_h_tables_ego1",
            "scene_carb_h_tables_ego2",
            "scene_j716_3rd",
            "scene_j716_ego1",
            "scene_j716_ego2",
            "scene_recording_20210910_S05_S06_0_3rd",
            "scene_recording_20210910_S05_S06_0_ego2",
            "scene1_0129",
            "scene1_0129_ego",
            "seminar_h52_3rd",
            "seminar_h52_ego1",
            "seminar_h52_ego2",
        ]
        # _load_data() fills the index attributes in place and returns None.
        self.loaded_data = self._load_data(self.split)

    def _load_data(self, split):
        """Scan ``ROOT/<split>`` and build the flat image index for sampling.

        Args:
            split: name of the split sub-directory to scan.

        Sets the following attributes:
            scenes: names of the scenes that were kept.
            sceneids: for each global image id, the index of its scene.
            images: for each global image id, the file basename.
            scene_img_list: for each kept scene, its list of global image ids.
            start_img_ids: global image ids that may start a sequence.
        """
        root = os.path.join(self.ROOT, split)
        self.scenes = []

        kept_scenes = []
        scene_of_image = []
        ids_per_scene = []
        all_basenames = []
        sequence_starts = []

        for scene in tqdm(os.listdir(root)):
            if scene not in self.scenes_to_use:
                continue

            rgb_dir = osp.join(root, scene, "rgb")
            frame_names = [
                name[:-4] for name in os.listdir(rgb_dir) if name.endswith(".jpg")
            ]
            frame_names.sort()
            n_frames = len(frame_names)

            # Minimum number of frames a scene needs in order to be kept.
            if self.allow_repeat:
                min_frames = max(self.num_views // 3, 3)
            else:
                min_frames = self.num_views

            if n_frames < min_frames:
                print(f"Skipping {scene}")
                continue

            # Global ids continue where the previously kept scene stopped.
            first_id = len(all_basenames)
            global_ids = list(np.arange(n_frames) + first_id)

            sequence_starts += global_ids[: n_frames - min_frames + 1]
            scene_of_image += [len(kept_scenes)] * n_frames
            all_basenames += frame_names
            kept_scenes.append(scene)
            ids_per_scene.append(global_ids)

        self.scenes = kept_scenes
        self.sceneids = scene_of_image
        self.images = all_basenames
        self.start_img_ids = sequence_starts
        self.scene_img_list = ids_per_scene

    def __len__(self):
        """Return the number of available sequence start positions."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed images across kept scenes."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views from the scene that sample ``idx`` starts in.

        Args:
            idx: index into ``self.start_img_ids``.
            resolution: forwarded to ``_crop_resize_if_necessary``.
            rng: random generator forwarded to the base-class helpers.
            num_views: number of views that must be returned.

        Returns:
            A list of ``num_views`` dicts, one per view.
        """
        first_id = self.start_img_ids[idx]
        scene_image_ids = self.scene_img_list[self.sceneids[first_id]]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            first_id,
            scene_image_ids,
            rng,
            max_interval=self.max_interval,
            video_prob=1.0,
            fix_interval_prob=1.0,
        )
        image_idxs = np.array(scene_image_ids)[pos]

        views = []
        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_name = self.scenes[scene_id]
            scene_dir = osp.join(self.ROOT, self.split, scene_name)
            basename = self.images[view_idx]
            rgb_path = osp.join(scene_dir, "rgb", basename + ".jpg")

            # RGB image
            rgb_image = imread_cv2(rgb_path)

            # Depth map: non-finite values and values above 1000 are zeroed,
            # zero being the marker for an invalid pixel.
            depthmap = np.load(osp.join(scene_dir, "depth", basename + ".npy"))
            depthmap[~np.isfinite(depthmap)] = 0
            depthmap[depthmap > 1000] = 0.0

            # Camera parameters
            cam = np.load(osp.join(scene_dir, "cam", basename + ".npz"))
            camera_pose = cam["pose"]
            intrinsics = cam["intrinsics"]
            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # Image mask and ray-map mask for this view.
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.9, 0.05, 0.05]
            )

            view = {
                "img": rgb_image,
                "depthmap": depthmap.astype(np.float32),
                "camera_pose": camera_pose.astype(np.float32),
                "camera_intrinsics": intrinsics.astype(np.float32),
                "dataset": "PointOdyssey",
                "label": scene_name + "_" + basename,
                "instance": rgb_path,
                "is_metric": self.is_metric,
                "is_video": ordered_video,
                "quantile": np.array(1.0, dtype=np.float32),
                "img_mask": img_mask,
                "ray_mask": ray_mask,
                "camera_only": False,
                "depth_only": False,
                "single_view": False,
                "reset": False,
            }
            views.append(view)
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/realestate10k.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import itertools
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_cv2
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class RE10K_Multi(BaseMultiViewDataset):
    """Multi-view dataset over RealEstate10K scenes stored under ``ROOT``.

    Files read per scene directory ``ROOT/<scene>``: ``rgb/<name>.png`` and
    ``cam/<name>.npz`` (providing ``pose`` and ``intrinsics``). No depth is
    loaded; every view carries an all-ones depth map and is flagged
    ``camera_only=True``.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Record the dataset settings, run the base initialiser, index ROOT.

        Args:
            *args: forwarded unchanged to ``BaseMultiViewDataset.__init__``.
            ROOT: directory that contains one sub-directory per scene.
            **kwargs: forwarded unchanged to ``BaseMultiViewDataset.__init__``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = False
        self.max_interval = 128
        super().__init__(*args, **kwargs)
        # _load_data() fills the index attributes in place and returns None.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Scan ``ROOT`` and build the flat image index used for sampling.

        Sets the following attributes:
            scenes: names of the scenes that were kept.
            sceneids: for each global image id, the index of its scene.
            images: for each global image id, the file basename.
            scene_img_list: for each kept scene, its list of global image ids.
            start_img_ids: ``(scene, global image id)`` pairs that may start
                a sequence.
            invalid_scenes: scene name -> bool, initially all ``False``; set
                to ``True`` by ``_get_views`` when loading a frame fails.
        """
        self.scenes = os.listdir(self.ROOT)

        offset = 0
        scenes = []
        sceneids = []
        scene_img_list = []
        images = []
        start_img_ids = []

        j = 0
        for scene in tqdm(self.scenes):
            scene_dir = osp.join(self.ROOT, scene)
            rgb_dir = osp.join(scene_dir, "rgb")
            # Basenames are integers; sort numerically rather than lexically.
            basenames = sorted(
                [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")],
                key=int,
            )

            num_imgs = len(basenames)
            img_ids = list(np.arange(num_imgs) + offset)
            # Minimum number of frames a scene needs in order to be kept.
            cut_off = (
                self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
            )
            if num_imgs < cut_off:
                print(f"Skipping {scene}")
                continue
            start_img_ids_ = img_ids[: num_imgs - cut_off + 1]

            start_img_ids.extend((scene, start) for start in start_img_ids_)
            sceneids.extend([j] * num_imgs)
            images.extend(basenames)
            scenes.append(scene)
            scene_img_list.append(img_ids)

            # Global ids of the next kept scene continue after this one.
            offset += num_imgs
            j += 1

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.start_img_ids = start_img_ids
        self.scene_img_list = scene_img_list

        self.invalid_scenes = {scene: False for scene in self.scenes}

    def __len__(self):
        """Return the number of available sequence start positions."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed images across kept scenes."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views, resampling when a scene fails to load.

        If any frame of the selected scene cannot be read, the scene is marked
        in ``self.invalid_scenes`` and a new start position is drawn with
        ``rng`` until a usable scene is found.

        Args:
            idx: index into ``self.start_img_ids``.
            resolution: forwarded to ``_crop_resize_if_necessary``.
            rng: random generator; ``rng.integers`` is used for resampling and
                it is also forwarded to the base-class helpers.
            num_views: number of views that must be returned.

        Returns:
            A list of ``num_views`` dicts, one per view.

        Raises:
            RuntimeError: if every indexed scene has been marked invalid, so
                no replacement scene can be drawn.
        """
        invalid_seq = True
        scene, start_id = self.start_img_ids[idx]

        # NOTE(review): a sequence whose length differs from num_views without
        # any load error is retried on the same scene; confirm the sampler
        # cannot return such a sequence deterministically.
        while invalid_seq:
            if self.invalid_scenes[scene]:
                # Without this guard the resampling loop below would spin
                # forever once no valid scene is left.
                if all(self.invalid_scenes.values()):
                    raise RuntimeError(
                        "RE10K_Multi: every scene has been marked invalid; "
                        "cannot sample a valid sequence"
                    )
                while self.invalid_scenes[scene]:
                    idx = rng.integers(low=0, high=len(self.start_img_ids))
                    scene, start_id = self.start_img_ids[idx]

            all_image_ids = self.scene_img_list[self.sceneids[start_id]]
            pos, ordered_video = self.get_seq_from_start_id(
                num_views, start_id, all_image_ids, rng, max_interval=self.max_interval
            )
            image_idxs = np.array(all_image_ids)[pos]

            views = []
            for view_idx in image_idxs:
                scene_id = self.sceneids[view_idx]
                scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
                rgb_dir = osp.join(scene_dir, "rgb")
                cam_dir = osp.join(scene_dir, "cam")

                basename = self.images[view_idx]

                try:
                    # Load RGB image
                    rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png"))
                    # No depth is available: use an all-ones placeholder with
                    # the image's height and width.
                    depthmap = np.ones_like(rgb_image[..., 0], dtype=np.float32)
                    cam = np.load(osp.join(cam_dir, basename + ".npz"))
                    intrinsics = cam["intrinsics"]
                    camera_pose = cam["pose"]
                except Exception:
                    # Exception (not a bare except) so that KeyboardInterrupt
                    # and SystemExit still propagate instead of silently
                    # blacklisting the scene.
                    print(f"Error loading {scene} {basename}, skipping")
                    self.invalid_scenes[scene] = True
                    break

                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
                )

                views.append(
                    dict(
                        img=rgb_image,
                        depthmap=depthmap.astype(np.float32),
                        camera_pose=camera_pose.astype(np.float32),
                        camera_intrinsics=intrinsics.astype(np.float32),
                        dataset="realestate10k",
                        label=self.scenes[scene_id] + "_" + basename,
                        instance=f"{str(idx)}_{str(view_idx)}",
                        is_metric=self.is_metric,
                        is_video=ordered_video,
                        quantile=np.array(0.98, dtype=np.float32),
                        img_mask=True,
                        ray_mask=False,
                        camera_only=True,
                        depth_only=False,
                        single_view=False,
                        reset=False,
                    )
                )
            if len(views) == num_views:
                invalid_seq = False
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/scannet.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import itertools
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_cv2, imread_pil
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ScanNet_Multi(BaseMultiViewDataset):
    """Multi-view dataset over ScanNet scenes stored under ``ROOT``.

    Scenes are read from ``ROOT/scans_train`` when the split is ``"train"``
    and from ``ROOT/scans_test`` otherwise. Files read per scene directory:
    ``new_scene_metadata.npz`` (its ``images`` entry lists the basenames),
    ``color/<name>.jpg``, ``depth/<name>.png`` and ``cam/<name>.npz`` (the
    latter providing ``pose`` and ``intrinsics``).
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Record the dataset settings, run the base initialiser, index ROOT.

        Args:
            *args: forwarded unchanged to ``BaseMultiViewDataset.__init__``.
            ROOT: directory that contains ``scans_train`` / ``scans_test``.
            **kwargs: forwarded unchanged to ``BaseMultiViewDataset.__init__``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 30
        super().__init__(*args, **kwargs)

        # _load_data() fills the index attributes in place and returns None.
        self.loaded_data = self._load_data(self.split)
        print('DATA: scannet', len(self))

    def _load_data(self, split):
        """Scan the split directory and build the flat image index.

        Args:
            split: ``"train"`` selects ``scans_train``; any other value
                selects ``scans_test``.

        Sets the following attributes:
            scene_root: directory the scenes were read from.
            scenes: names of the scenes that were kept.
            sceneids: for each global image id, the index of its scene.
            images: for each global image id, the file basename.
            scene_img_list: for each kept scene, its list of global image ids.
            start_img_ids: global image ids that may start a sequence.
        """
        subdir = "scans_train" if split == "train" else "scans_test"
        self.scene_root = osp.join(self.ROOT, subdir)
        self.scenes = [
            name for name in os.listdir(self.scene_root) if name.startswith("scene")
        ]

        kept_scenes = []
        scene_of_image = []
        ids_per_scene = []
        all_basenames = []
        sequence_starts = []

        for scene in tqdm(self.scenes):
            meta_path = osp.join(self.scene_root, scene, "new_scene_metadata.npz")
            # The basenames array is materialised while the archive is open.
            with np.load(meta_path, allow_pickle=True) as meta:
                frame_names = meta["images"]
            n_frames = len(frame_names)

            # Minimum number of frames a scene needs in order to be kept.
            if self.allow_repeat:
                min_frames = max(self.num_views // 3, 3)
            else:
                min_frames = self.num_views

            if n_frames < min_frames:
                print(f"Skipping {scene}")
                continue

            # Global ids continue where the previously kept scene stopped.
            first_id = len(all_basenames)
            global_ids = list(np.arange(n_frames) + first_id)

            sequence_starts += global_ids[: n_frames - min_frames + 1]
            scene_of_image += [len(kept_scenes)] * n_frames
            all_basenames.extend(frame_names)
            kept_scenes.append(scene)
            ids_per_scene.append(global_ids)

        self.scenes = kept_scenes
        self.sceneids = scene_of_image
        self.images = all_basenames
        self.start_img_ids = sequence_starts
        self.scene_img_list = ids_per_scene

    def __len__(self):
        """Return the number of available sequence start positions."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed images across kept scenes."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views from the scene that sample ``idx`` starts in.

        Args:
            idx: index into ``self.start_img_ids``.
            resolution: forwarded to ``_crop_resize_if_necessary``.
            rng: random generator forwarded to the base-class helpers.
            num_views: number of views that must be returned.

        Returns:
            A list of ``num_views`` dicts, one per view.
        """
        first_id = self.start_img_ids[idx]
        scene_image_ids = self.scene_img_list[self.sceneids[first_id]]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            first_id,
            scene_image_ids,
            rng,
            max_interval=self.max_interval,
            video_prob=0.6,
            fix_interval_prob=0.6,
            block_shuffle=16,
        )
        image_idxs = np.array(scene_image_ids)[pos]

        views = []
        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_name = self.scenes[scene_id]
            scene_dir = osp.join(self.scene_root, scene_name)
            basename = self.images[view_idx]

            # RGB image
            rgb_image = imread_pil(osp.join(scene_dir, "color", basename + ".jpg"))

            # Depth map: the raw PNG values are divided by 1000 (presumably
            # millimetres to metres -- confirm); non-finite values become 0.
            depth_path = osp.join(scene_dir, "depth", basename + ".png")
            depthmap = imread_cv2(depth_path, cv2.IMREAD_UNCHANGED)
            depthmap = depthmap.astype(np.float32) / 1000
            depthmap[~np.isfinite(depthmap)] = 0

            # Camera parameters
            cam = np.load(osp.join(scene_dir, "cam", basename + ".npz"))
            camera_pose = cam["pose"]
            intrinsics = cam["intrinsics"]
            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # Image mask and ray-map mask for this view.
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
            )

            view = {
                "img": rgb_image,
                "depthmap": depthmap.astype(np.float32),
                "camera_pose": camera_pose.astype(np.float32),
                "camera_intrinsics": intrinsics.astype(np.float32),
                "dataset": "ScanNet",
                "label": scene_name + "_" + basename,
                "instance": f"{str(idx)}_{str(view_idx)}",
                "is_metric": self.is_metric,
                "is_video": ordered_video,
                "quantile": np.array(0.98, dtype=np.float32),
                "img_mask": img_mask,
                "ray_mask": ray_mask,
                "camera_only": False,
                "depth_only": False,
                "single_view": False,
                "reset": False,
            }
            views.append(view)
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/scannetpp.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import itertools
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 9 |
+
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_cv2, imread_pil
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ScanNetpp_Multi(BaseMultiViewDataset):
    """ScanNet++ multi-view dataset.

    A sample is built either from a run of frames drawn from one capture
    device's frame list (through ``get_seq_from_start_id``) or from one of
    the ``image_collection`` groups stored in the per-scene metadata.
    Only the ``"train"`` split is accepted.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Record the dataset settings, initialise the base class and index ROOT.

        Args:
            *args: Forwarded to ``BaseMultiViewDataset``.
            ROOT: Directory holding ``all_metadata.npz`` and one
                sub-directory per scene.
            **kwargs: Forwarded to ``BaseMultiViewDataset``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 3
        super().__init__(*args, **kwargs)
        assert self.split == "train"
        # _load_data() fills attributes on self and returns None; the
        # assignment is kept so the attribute keeps existing.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Index every scene listed in ``all_metadata.npz``.

        Populates ``scenes``, ``sceneids``, ``images``, ``intrinsics``,
        ``trajectories``, ``groups``, ``id_ranges`` and ``image_num``.
        Image ids are global: the local index inside a scene plus the
        number of images of all previously kept scenes.
        """
        with np.load(osp.join(self.ROOT, "all_metadata.npz")) as data:
            self.scenes = data["scenes"]

        # Same cut-off rule as the other datasets; it does not depend on
        # the scene, so compute it once.
        min_group_len = (
            self.num_views
            if not self.allow_repeat
            else max(self.num_views // 3, 3)
        )

        offset = 0  # global id of the first image of the current scene
        scene_index = 0  # index of the current scene among the kept ones
        scenes = []
        sceneids = []
        images = []
        intrinsics = []
        trajectories = []
        groups = []
        id_ranges = []
        self.image_num = 0

        for scene in self.scenes:
            scene_dir = osp.join(self.ROOT, scene)
            with np.load(
                osp.join(scene_dir, "new_scene_metadata.npz"), allow_pickle=True
            ) as data:
                imgs = data["images"]
                intrins = data["intrinsics"]
                traj = data["trajectories"]
                assert "image_collection" in data
                image_collection = data["image_collection"].item()

            num_imgs = len(imgs)
            # Counted even when the scene ends up being skipped below.
            self.image_num += num_imgs

            # Set of basenames (4-character extension stripped) so that
            # each membership test is O(1) instead of a list scan.
            on_disk = {
                fname[:-4] for fname in os.listdir(osp.join(scene_dir, "images"))
            }

            dslr_ids = []
            iphone_ids = []
            for local_id, name in enumerate(imgs):
                if name not in on_disk:
                    continue
                if name.startswith("DSC"):
                    dslr_ids.append(local_id + offset)
                elif name.startswith("frame"):
                    iphone_ids.append(local_id + offset)

            # DSLR images are expected to precede iPhone frames.  The check
            # only makes sense when both lists are non-empty; max()/min()
            # of an empty list would raise ValueError.
            if dslr_ids and iphone_ids:
                assert max(dslr_ids) < min(iphone_ids)

            img_groups = []
            img_id_ranges = []
            for ref_id, group in image_collection.items():
                # +1 accounts for the reference image itself.
                if len(group) + 1 < min_group_len:
                    continue

                # Pick the frame list paired with this group: groups whose
                # reference is a "frame*" image get the DSLR list, all
                # others get the iPhone list.
                if imgs[ref_id].startswith("frame"):
                    video_ids = dslr_ids
                else:
                    video_ids = iphone_ids

                # Only keep the group when its frame list is long enough.
                if len(video_ids) < min_group_len:
                    continue

                # Each entry of ``group`` is indexed as (image id, score);
                # the stored group is ordered by global id, so the scores
                # do not influence the result.
                member_ids = [ref_id, *(member[0] for member in group)]
                img_groups.append(sorted(int(m + offset) for m in member_ids))
                img_id_ranges.append(video_ids)

            if len(img_groups) == 0:
                print(f"Skipping {scene}")
                continue

            scenes.append(scene)
            sceneids.extend([scene_index] * num_imgs)
            images.extend(imgs)
            intrinsics.append(intrins)
            trajectories.append(traj)
            groups.extend(img_groups)
            id_ranges.extend(img_id_ranges)

            offset += num_imgs
            scene_index += 1

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.intrinsics = np.concatenate(intrinsics, axis=0)
        self.trajectories = np.concatenate(trajectories, axis=0)
        self.id_ranges = id_ranges
        self.groups = groups

    def __len__(self):
        """Return ten indices per stored group (see ``idx // 10`` in ``_get_views``)."""
        return len(self.groups) * 10

    def get_image_num(self):
        """Return the number of images listed in the metadata of all scenes."""
        return self.image_num

    def _get_views(self, idx, resolution, rng, num_views):
        """Build ``num_views`` view dictionaries for sample ``idx``.

        Args:
            idx: Sample index in ``range(len(self))``.
            resolution: Target resolution forwarded to
                ``_crop_resize_if_necessary``.
            rng: NumPy random generator driving all sampling decisions.
            num_views: Number of views to return.

        Returns:
            A list of ``num_views`` dictionaries, one per view.
        """
        idx = idx // 10
        image_idxs = self.groups[idx]
        rand_val = rng.random()

        image_idxs_video = self.id_ranges[idx]
        cut_off = num_views if not self.allow_repeat else max(num_views // 3, 3)
        # Clamp at zero: a negative bound would slice from the end of the
        # list and yield start frames without enough successors.
        num_starts = max(len(image_idxs_video) - cut_off + 1, 0)
        start_image_idxs = image_idxs_video[:num_starts]

        if rand_val < 0.7 and len(start_image_idxs) > 0:
            # Sequence drawn from the device frame list.
            start_id = rng.choice(start_image_idxs)
            pos, ordered_video = self.get_seq_from_start_id(
                num_views,
                start_id,
                image_idxs_video,
                rng,
                max_interval=self.max_interval,
                video_prob=0.8,
                fix_interval_prob=0.5,
                block_shuffle=16,
            )
            image_idxs = np.array(image_idxs_video)[pos]
        else:
            # Views drawn from the stored group, kept in ascending id order.
            ordered_video = True
            num_candidates = len(image_idxs)
            max_id = min(num_candidates, int(num_views * (2 + 2 * rng.random())))

            if num_candidates < num_views:
                # Not enough candidates: sample with repetition.
                image_idxs = sorted(
                    rng.choice(image_idxs, size=num_views, replace=True)
                )
            else:
                image_idxs = sorted(
                    rng.permutation(image_idxs[:max_id])[:num_views]
                )

            if rand_val > 0.75:
                # Shuffle the selected views and flag them as unordered.
                ordered_video = False
                image_idxs = rng.permutation(image_idxs)

        views = []
        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])

            intrinsics = self.intrinsics[view_idx]
            camera_pose = self.trajectories[view_idx]
            basename = self.images[view_idx]

            # Load RGB image
            rgb_image = imread_pil(osp.join(scene_dir, "images", basename + ".jpg"))
            # Load depthmap
            depthmap = imread_cv2(
                osp.join(scene_dir, "depth", basename + ".png"), cv2.IMREAD_UNCHANGED
            )
            # presumably millimetres -> metres; confirm against the
            # preprocessing script
            depthmap = depthmap.astype(np.float32) / 1000
            depthmap[~np.isfinite(depthmap)] = 0  # invalid

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
            )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap.astype(np.float32),
                    camera_pose=camera_pose.astype(np.float32),
                    camera_intrinsics=intrinsics.astype(np.float32),
                    dataset="ScanNet++",
                    label=self.scenes[scene_id] + "_" + basename,
                    instance=f"{str(idx)}_{str(view_idx)}",
                    is_metric=self.is_metric,
                    is_video=ordered_video,
                    quantile=np.array(0.99, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/smartportraits.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import itertools
|
| 5 |
+
import os
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 11 |
+
from dust3r.utils.image import imread_cv2
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SmartPortraits_Multi(BaseMultiViewDataset):
    """SmartPortraits dataset served as independent single-view samples.

    Every returned view is an image drawn at random from the whole dataset,
    with an identity placeholder as camera pose.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Record the dataset settings, initialise the base class and index ROOT.

        Args:
            *args: Forwarded to ``BaseMultiViewDataset``.
            ROOT: Directory with one sub-directory per scene, each holding
                ``rgb``, ``depth`` and ``cam`` folders.
            **kwargs: Forwarded to ``BaseMultiViewDataset``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        super().__init__(*args, **kwargs)
        # _load_data() fills self.img_names and returns None; the
        # assignment is kept so the attribute keeps existing.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Collect ``(scene, basename)`` pairs for every ``rgb/*.png`` image.

        Scenes are visited in sorted order because ``os.listdir`` gives no
        ordering guarantee; without it the sample index would depend on the
        filesystem.  Entries of ROOT without an ``rgb`` directory (stray
        files, for instance) are ignored.
        """
        img_names = []
        for scene in sorted(os.listdir(self.ROOT)):
            rgb_dir = osp.join(self.ROOT, scene, "rgb")
            if not osp.isdir(rgb_dir):
                continue
            basenames = sorted(
                fname[:-4] for fname in os.listdir(rgb_dir) if fname.endswith(".png")
            )
            img_names.extend((scene, basename) for basename in basenames)

        self.img_names = img_names

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.img_names)

    def get_image_num(self):
        """Return the number of indexed images."""
        return len(self.img_names)

    def _get_views(self, idx, resolution, rng, num_views):
        """Return ``num_views`` distinct, randomly chosen single-view samples.

        Args:
            idx: Sample index; only used to derive the seed of the
                per-sample generator.
            resolution: Target resolution forwarded to
                ``_crop_resize_if_necessary``.
            rng: NumPy random generator.
            num_views: Number of views to return; must not exceed the
                number of indexed images because sampling is without
                replacement.

        Returns:
            A list of ``num_views`` dictionaries, one per view.
        """
        new_seed = rng.integers(0, 2**32) + idx
        new_rng = np.random.default_rng(new_seed)
        # Sample positions rather than rows: passing the list of pairs to
        # choice() would convert the whole index to an array on every call.
        picked = new_rng.choice(len(self.img_names), num_views, replace=False)

        views = []
        for position in picked:
            scene, img_name = self.img_names[position]
            scene_dir = osp.join(self.ROOT, scene)
            rgb_path = osp.join(scene_dir, "rgb", f"{img_name}.png")

            # Load RGB image
            rgb_image = imread_cv2(rgb_path)
            depthmap = np.load(osp.join(scene_dir, "depth", f"{img_name}.npy"))
            # Non-finite depth values are zeroed.
            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)

            intrinsics = np.load(osp.join(scene_dir, "cam", f"{img_name}.npz"))[
                "intrinsics"
            ]
            # camera pose is not provided, placeholder
            camera_pose = np.eye(4)

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name
            )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap.astype(np.float32),
                    camera_pose=camera_pose.astype(np.float32),
                    camera_intrinsics=intrinsics.astype(np.float32),
                    dataset="SmartPortraits",
                    label=img_name,
                    instance=rgb_path,
                    is_metric=self.is_metric,
                    is_video=False,
                    quantile=np.array(0.98, dtype=np.float32),
                    img_mask=True,
                    ray_mask=False,
                    camera_only=False,
                    depth_only=False,
                    single_view=True,
                    reset=True,
                )
            )
        assert len(views) == num_views
        return views
|
outdoor_v48_4gpu_v2/code/05_02-14:21:58/dust3r/datasets/tartanair.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os.path as osp
|
| 2 |
+
import numpy as np
|
| 3 |
+
import cv2
|
| 4 |
+
import numpy as np
|
| 5 |
+
import itertools
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
sys.path.append(osp.join(osp.dirname(__file__), "..", ".."))
|
| 10 |
+
|
| 11 |
+
from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset
|
| 12 |
+
from dust3r.utils.image import imread_cv2
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class TartanAir_Multi(BaseMultiViewDataset):
    """TartanAir multi-view dataset built from per-sequence frame lists.

    Every ``<scene>/<Easy|Hard>/<sequence>`` directory under ROOT is one
    sequence; a sample is a set of frames chosen from a single sequence by
    ``get_seq_from_start_id``.
    """

    def __init__(self, ROOT, *args, **kwargs):
        """Record the dataset settings, initialise the base class and index ROOT.

        Args:
            ROOT: Root directory of the preprocessed dataset.
            *args: Forwarded to ``BaseMultiViewDataset``.
            **kwargs: Forwarded to ``BaseMultiViewDataset``.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 20
        super().__init__(*args, **kwargs)
        # loading all
        assert self.split is None
        self._load_data()

    def _load_data(self):
        """Index all sequences and the frames that may start a sample.

        Populates ``scenes`` (sequence directories), ``sceneids`` (sequence
        index per image), ``images`` (frame basenames), ``scene_img_list``
        (global image ids per sequence) and ``start_img_ids``.
        """
        rgb_suffix = "_rgb.png"
        scene_dirs = sorted(
            d for d in os.listdir(self.ROOT) if os.path.isdir(os.path.join(self.ROOT, d))
        )

        # Minimum sequence length; independent of the sequence, so it is
        # computed once.
        cut_off = (
            self.num_views
            if not self.allow_repeat
            else max(self.num_views // 3, 3)
        )

        offset = 0  # global id of the first image of the current sequence
        seq_index = 0  # index of the current sequence among the kept ones
        scenes = []
        sceneids = []
        images = []
        scene_img_list = []
        start_img_ids = []

        for scene in scene_dirs:
            for mode in ["Easy", "Hard"]:
                mode_dir = os.path.join(self.ROOT, scene, mode)
                if not os.path.isdir(mode_dir):
                    # A scene may lack one of the two difficulty folders.
                    continue
                seq_dirs = sorted(
                    os.path.join(mode_dir, d)
                    for d in os.listdir(mode_dir)
                    if os.path.isdir(os.path.join(mode_dir, d))
                )
                for seq_dir in seq_dirs:
                    # Only "*_rgb.png" files define frames: _get_views
                    # rebuilds the file names from the basename plus the
                    # "_rgb.png" / "_depth.npy" / "_cam.npz" suffixes.
                    basenames = sorted(
                        fname[: -len(rgb_suffix)]
                        for fname in os.listdir(seq_dir)
                        if fname.endswith(rgb_suffix)
                    )
                    num_imgs = len(basenames)

                    if num_imgs < cut_off:
                        print(f"Skipping {scene}")
                        continue

                    img_ids = list(np.arange(num_imgs) + offset)
                    # Frames that still have at least cut_off - 1 successors.
                    start_img_ids_ = img_ids[: num_imgs - cut_off + 1]

                    scenes.append(seq_dir)
                    scene_img_list.append(img_ids)
                    sceneids.extend([seq_index] * num_imgs)
                    images.extend(basenames)
                    start_img_ids.extend(start_img_ids_)
                    offset += num_imgs
                    seq_index += 1

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.start_img_ids = start_img_ids
        self.scene_img_list = scene_img_list

    def __len__(self):
        """Return the number of admissible start frames."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the number of indexed images."""
        return len(self.images)

    def get_stats(self):
        """Return a short human-readable size summary."""
        return f"{len(self)} groups of views"

    def _get_views(self, idx, resolution, rng, num_views):
        """Build ``num_views`` view dictionaries starting from start frame ``idx``.

        Args:
            idx: Index into ``self.start_img_ids``.
            resolution: Target resolution forwarded to
                ``_crop_resize_if_necessary``.
            rng: NumPy random generator driving all sampling decisions.
            num_views: Number of views to return.

        Returns:
            A list of ``num_views`` dictionaries, one per view.
        """
        start_id = self.start_img_ids[idx]
        scene_id = self.sceneids[start_id]
        all_image_ids = self.scene_img_list[scene_id]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            start_id,
            all_image_ids,
            rng,
            max_interval=self.max_interval,
            video_prob=0.8,
            fix_interval_prob=0.8,
            block_shuffle=16,
        )
        image_idxs = np.array(all_image_ids)[pos]

        views = []

        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_dir = self.scenes[scene_id]
            basename = self.images[view_idx]

            img = basename + "_rgb.png"
            image = imread_cv2(osp.join(scene_dir, img))
            depthmap = np.load(osp.join(scene_dir, basename + "_depth.npy"))
            camera_params = np.load(osp.join(scene_dir, basename + "_cam.npz"))

            intrinsics = camera_params["camera_intrinsics"]
            camera_pose = camera_params["camera_pose"]

            sky_mask = depthmap >= 1000
            depthmap[sky_mask] = -1.0  # sky
            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
            # Drop the farthest 2% of the valid (positive) depths.
            positive_depths = depthmap[depthmap > 0]
            threshold = (
                np.percentile(positive_depths, 98) if positive_depths.size > 0 else 0
            )
            depthmap[depthmap > threshold] = 0.0

            image, depthmap, intrinsics = self._crop_resize_if_necessary(
                image, depthmap, intrinsics, resolution, rng, info=(scene_dir, img)
            )

            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
            )

            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,  # cam2world
                    camera_intrinsics=intrinsics,
                    dataset="TartanAir",
                    label=scene_dir,
                    is_metric=self.is_metric,
                    instance=scene_dir + "_" + img,
                    is_video=ordered_video,
                    quantile=np.array(1.0, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
|