larryshaw0079 commited on
Commit
873de4e
·
verified ·
1 Parent(s): 4543b45

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50) hide show
  1. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/.hydra/config.yaml +125 -0
  2. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/.hydra/hydra.yaml +186 -0
  3. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/.hydra/overrides.yaml +31 -0
  4. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/checkpoint-1.pth +3 -0
  5. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/checkpoint-2.pth +3 -0
  6. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/checkpoint-final.pth +3 -0
  7. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/checkpoint-last.pth +3 -0
  8. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/CHANGELOG.md +19 -0
  9. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/README.md +373 -0
  10. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/README_submap.md +225 -0
  11. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/base_opt.py +301 -0
  12. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/commons.py +102 -0
  13. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/dust3r_opt/__init__.py +31 -0
  14. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/dust3r_opt/base_opt.py +620 -0
  15. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/dust3r_opt/commons.py +102 -0
  16. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/dust3r_opt/init_im_poses.py +378 -0
  17. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/dust3r_opt/optimizer.py +301 -0
  18. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/init_all.py +222 -0
  19. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/utils.py +443 -0
  20. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/deepspeed_zero3_bf16.json +19 -0
  21. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/finetune.yaml +102 -0
  22. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/finetune_paper_h20.yaml +129 -0
  23. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/finetune_pseudo_gt_high_recall.yaml +129 -0
  24. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/finetune_sub_only.yaml +129 -0
  25. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/mytrain.yaml +92 -0
  26. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/environment.yml +245 -0
  27. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/eval_ate_scaled.py +54 -0
  28. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/get_ate.py +74 -0
  29. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/publish_submap.sh +138 -0
  30. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/requirements.txt +30 -0
  31. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/run_tum.sh +77 -0
  32. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/run_tum_top5.sh +66 -0
  33. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/setup.py +8 -0
  34. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/setup_env.sh +18 -0
  35. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/__init__.py +0 -0
  36. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/audit_dataset_num_views.py +412 -0
  37. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/batched_dynamic_router.py +243 -0
  38. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/demo.py +540 -0
  39. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/demo_infinite.py +493 -0
  40. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/demo_submap.py +927 -0
  41. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/download_data.sh +354 -0
  42. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/exp_joint_freeze_frontend_fsdp_8gpu.sh +438 -0
  43. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/graph_gated_memory.py +850 -0
  44. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/mine_pseudo_gt.py +588 -0
  45. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/pseudo_gt.py +348 -0
  46. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/rerun_helper/__init__.py +197 -0
  47. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/rerun_helper/generic_utils.py +274 -0
  48. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/rerun_helper/geometry_utils.py +232 -0
  49. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/rerun_helper/tmp.py +39 -0
  50. checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/rerun_helper/visualization_utils.py +167 -0
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/.hydra/config.yaml ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_iter: 1
2
+ allow_repeat: false
3
+ amp: 1
4
+ batch_size: 1
5
+ benchmark: false
6
+ custom_lr_scale: 1.0
7
+ data_root: /home/23068142r/work_dir/data
8
+ root_arkit: /home/23068142r/work_dir/data/processed_arkitscenes
9
+ root_scannetpp: /home/23068142r/work_dir/data/preprocessed_scannetpp
10
+ root_scannet: /home/23068142r/work_dir/data/processed_scannet
11
+ root_hypersim: /home/23068142r/work_dir/data/preprocessed_Hypersim
12
+ root_blendedmvs: /home/23068142r/work_dir/data/processed_blendedmvs
13
+ root_megadepth: /home/23068142r/work_dir/data/processed_megadepth
14
+ root_mvs_synth: /home/23068142r/work_dir/data/processed_mvs_synth
15
+ dataset_arkit: ARKitScenes_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_arkit}",
16
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
17
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_arkit}, n_corres=${n_corres_train})
18
+ dataset_scannetpp: ScanNetpp_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_scannetpp}",
19
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
20
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_scannetpp}, n_corres=${n_corres_train})
21
+ dataset_scannet: ScanNet_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_scannet}",
22
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
23
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_scannet}, n_corres=${n_corres_train})
24
+ dataset_hypersim: HyperSim_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_hypersim}",
25
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
26
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_hypersim}, n_corres=${n_corres_train})
27
+ dataset_blendedmvs: BlendedMVS_Multi(allow_repeat=${allow_repeat}, split='train',
28
+ ROOT="${root_blendedmvs}", aug_crop=16, resolution=[(518, 392), (518, 336), (518,
29
+ 294), (518, 266), (518, 210), (518, 154)], transform=SeqColorJitter, num_views=${num_views_blendedmvs},
30
+ n_corres=${n_corres_train})
31
+ dataset_megadepth: MegaDepth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_megadepth}",
32
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
33
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_megadepth}, n_corres=${n_corres_train})
34
+ dataset_mvs_synth: MVS_Synth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_mvs_synth}",
35
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
36
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_mvs_synth}, n_corres=${n_corres_train})
37
+ desc_dim: 128
38
+ detach_frontend_tokens: true
39
+ dist_backend: nccl
40
+ dist_url: env://
41
+ distributed: false
42
+ enable_dynamic_boundary: false
43
+ enable_loop: true
44
+ enable_submap: true
45
+ enable_temporal: false
46
+ epochs: 2
47
+ eval_freq: 1
48
+ exp_name: paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12
49
+ fixed_length: true
50
+ freeze_encoder: true
51
+ gpu: 0
52
+ gradient_checkpointing: true
53
+ gumbel_tau: 5.0
54
+ loop_mask_mode: soft_all
55
+ retain_history_grad: true
56
+ submap_train_mode: full_token
57
+ submap_retrieval_topk: 0
58
+ submap_fetch_source: frontend
59
+ submap_descriptor_source: frontend
60
+ train_submap_modules_only: false
61
+ gumbel_tau_end: 0.1
62
+ gumbel_tau_start: 5.0
63
+ keep_freq: 1
64
+ load_only_encoder: false
65
+ local-rank: -1
66
+ logdir: ${save_dir}/${exp_name}/logs
67
+ long_context: false
68
+ lr: 1.0e-05
69
+ max_checkpoints: 10
70
+ max_recursive_submaps: 5
71
+ min_lr: 1.0e-08
72
+ n_corres_test: 0
73
+ n_corres_train: 0
74
+ num_imgs_vis: 4
75
+ num_test_views: 4
76
+ num_views: 24
77
+ num_views_arkit: 64
78
+ num_views_scannetpp: 24
79
+ num_views_scannet: 64
80
+ num_views_hypersim: 24
81
+ num_views_blendedmvs: 64
82
+ num_views_megadepth: 64
83
+ num_views_mvs_synth: 24
84
+ num_workers: 4
85
+ output_dir: ${save_dir}/${exp_name}/
86
+ pretrained: /home/23068142r/work_dir/projects/e2e-semantic-SLAM-submap/ckpt/checkpoint-10.pth.model
87
+ print_freq: 10
88
+ print_img_freq: 50000000
89
+ rank: 0
90
+ resume: null
91
+ retention_ratio: 0.5
92
+ pseudo_gt:
93
+ enable: false
94
+ cache_path: null
95
+ use_soft_targets: true
96
+ min_confidence: 0.65
97
+ min_support_pairs: 1
98
+ topk_pairs: 4
99
+ loss_type: hybrid
100
+ loss_weight_gate: 0.1
101
+ loss_weight_desc: 0.1
102
+ geometric_support_scale: 0.25
103
+ ranking_margin: 0.1
104
+ use_l2m: false
105
+ l2m_min_certainty: 0.0
106
+ l2m_min_inlier_ratio: 0.0
107
+ save_dir: /home/23068142r/work_dir/projects/e2e-semantic-SLAM/checkpoints/paper_smoke_local_8gpu/joint_freeze_frontend_fsdp_sub12
108
+ save_freq: 0.1
109
+ seed: 42
110
+ soft_mask_bias: 0.2
111
+ soft_mask_temperature: 0.25
112
+ start_epoch: 0
113
+ start_step: 0
114
+ submap_size: 6
115
+ task: SLAMFormer_Submap_Finetune
116
+ tbptt_window: 0
117
+ teacher: null
118
+ temporal_embed_mode: learned
119
+ test_criterion: DistillLoss()
120
+ test_dataset: ''
121
+ train_criterion: DistillLoss()
122
+ train_dataset: 16 @ ${dataset_scannetpp} + 16 @ ${dataset_hypersim} + 16 @ ${dataset_mvs_synth}
123
+ warmup_epochs: 0.5
124
+ weight_decay: 0.05
125
+ world_size: 1
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/.hydra/hydra.yaml ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${save_dir}/${exp_name}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - exp_name=paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12
116
+ - save_dir=/home/23068142r/work_dir/projects/e2e-semantic-SLAM/checkpoints/paper_smoke_local_8gpu/joint_freeze_frontend_fsdp_sub12
117
+ - pretrained=/home/23068142r/work_dir/projects/e2e-semantic-SLAM-submap/ckpt/checkpoint-10.pth.model
118
+ - resume=null
119
+ - data_root=/home/23068142r/work_dir/data
120
+ - root_arkit=/home/23068142r/work_dir/data/processed_arkitscenes
121
+ - root_scannetpp=/home/23068142r/work_dir/data/preprocessed_scannetpp
122
+ - root_scannet=/home/23068142r/work_dir/data/processed_scannet
123
+ - root_hypersim=/home/23068142r/work_dir/data/preprocessed_Hypersim
124
+ - root_blendedmvs=/home/23068142r/work_dir/data/processed_blendedmvs
125
+ - root_megadepth=/home/23068142r/work_dir/data/processed_megadepth
126
+ - root_mvs_synth=/home/23068142r/work_dir/data/processed_mvs_synth
127
+ - num_views=24
128
+ - num_views_arkit=64
129
+ - num_views_scannetpp=24
130
+ - num_views_scannet=64
131
+ - num_views_hypersim=24
132
+ - num_views_blendedmvs=64
133
+ - num_views_megadepth=64
134
+ - num_views_mvs_synth=24
135
+ - train_submap_modules_only=false
136
+ - detach_frontend_tokens=true
137
+ - submap_train_mode=full_token
138
+ - submap_retrieval_topk=0
139
+ - submap_fetch_source=frontend
140
+ - submap_descriptor_source=frontend
141
+ - pseudo_gt.enable=false
142
+ - pseudo_gt.cache_path=null
143
+ - train_dataset=16 @ ${dataset_scannetpp} + 16 @ ${dataset_hypersim} + 16 @ ${dataset_mvs_synth}
144
+ - epochs=2
145
+ - test_dataset=
146
+ job:
147
+ name: finetune
148
+ chdir: null
149
+ override_dirname: data_root=/home/23068142r/work_dir/data,detach_frontend_tokens=true,epochs=2,exp_name=paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12,num_views=24,num_views_arkit=64,num_views_blendedmvs=64,num_views_hypersim=24,num_views_megadepth=64,num_views_mvs_synth=24,num_views_scannet=64,num_views_scannetpp=24,pretrained=/home/23068142r/work_dir/projects/e2e-semantic-SLAM-submap/ckpt/checkpoint-10.pth.model,pseudo_gt.cache_path=null,pseudo_gt.enable=false,resume=null,root_arkit=/home/23068142r/work_dir/data/processed_arkitscenes,root_blendedmvs=/home/23068142r/work_dir/data/processed_blendedmvs,root_hypersim=/home/23068142r/work_dir/data/preprocessed_Hypersim,root_megadepth=/home/23068142r/work_dir/data/processed_megadepth,root_mvs_synth=/home/23068142r/work_dir/data/processed_mvs_synth,root_scannet=/home/23068142r/work_dir/data/processed_scannet,root_scannetpp=/home/23068142r/work_dir/data/preprocessed_scannetpp,save_dir=/home/23068142r/work_dir/projects/e2e-semantic-SLAM/checkpoints/paper_smoke_local_8gpu/joint_freeze_frontend_fsdp_sub12,submap_descriptor_source=frontend,submap_fetch_source=frontend,submap_retrieval_topk=0,submap_train_mode=full_token,test_dataset=,train_dataset=16
150
+ @ ${dataset_scannetpp} + 16 @ ${dataset_hypersim} + 16 @ ${dataset_mvs_synth},train_submap_modules_only=false
151
+ id: ???
152
+ num: ???
153
+ config_name: finetune_paper_h20.yaml
154
+ env_set: {}
155
+ env_copy: []
156
+ config:
157
+ override_dirname:
158
+ kv_sep: '='
159
+ item_sep: ','
160
+ exclude_keys: []
161
+ runtime:
162
+ version: 1.3.2
163
+ version_base: '1.3'
164
+ cwd: /home/23068142r/work_dir/projects/e2e-semantic-SLAM
165
+ config_sources:
166
+ - path: hydra.conf
167
+ schema: pkg
168
+ provider: hydra
169
+ - path: /home/23068142r/work_dir/projects/e2e-semantic-SLAM/src/../config
170
+ schema: file
171
+ provider: main
172
+ - path: ''
173
+ schema: structured
174
+ provider: schema
175
+ output_dir: /home/23068142r/work_dir/projects/e2e-semantic-SLAM/checkpoints/paper_smoke_local_8gpu/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12
176
+ choices:
177
+ hydra/env: default
178
+ hydra/callbacks: null
179
+ hydra/job_logging: default
180
+ hydra/hydra_logging: default
181
+ hydra/hydra_help: default
182
+ hydra/help: default
183
+ hydra/sweeper: basic
184
+ hydra/launcher: basic
185
+ hydra/output: default
186
+ verbose: true
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/.hydra/overrides.yaml ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ - exp_name=paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12
2
+ - save_dir=/home/23068142r/work_dir/projects/e2e-semantic-SLAM/checkpoints/paper_smoke_local_8gpu/joint_freeze_frontend_fsdp_sub12
3
+ - pretrained=/home/23068142r/work_dir/projects/e2e-semantic-SLAM-submap/ckpt/checkpoint-10.pth.model
4
+ - resume=null
5
+ - data_root=/home/23068142r/work_dir/data
6
+ - root_arkit=/home/23068142r/work_dir/data/processed_arkitscenes
7
+ - root_scannetpp=/home/23068142r/work_dir/data/preprocessed_scannetpp
8
+ - root_scannet=/home/23068142r/work_dir/data/processed_scannet
9
+ - root_hypersim=/home/23068142r/work_dir/data/preprocessed_Hypersim
10
+ - root_blendedmvs=/home/23068142r/work_dir/data/processed_blendedmvs
11
+ - root_megadepth=/home/23068142r/work_dir/data/processed_megadepth
12
+ - root_mvs_synth=/home/23068142r/work_dir/data/processed_mvs_synth
13
+ - num_views=24
14
+ - num_views_arkit=64
15
+ - num_views_scannetpp=24
16
+ - num_views_scannet=64
17
+ - num_views_hypersim=24
18
+ - num_views_blendedmvs=64
19
+ - num_views_megadepth=64
20
+ - num_views_mvs_synth=24
21
+ - train_submap_modules_only=false
22
+ - detach_frontend_tokens=true
23
+ - submap_train_mode=full_token
24
+ - submap_retrieval_topk=0
25
+ - submap_fetch_source=frontend
26
+ - submap_descriptor_source=frontend
27
+ - pseudo_gt.enable=false
28
+ - pseudo_gt.cache_path=null
29
+ - train_dataset=16 @ ${dataset_scannetpp} + 16 @ ${dataset_hypersim} + 16 @ ${dataset_mvs_synth}
30
+ - epochs=2
31
+ - test_dataset=
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/checkpoint-1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c96e1ed2d9231223d02406ef66794e5f9f39bae990354bd5bc4101ab2396afb6
3
+ size 4516140233
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/checkpoint-2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:465a4b460ad29b4d539348adebfb85a01845b0f905cad11206d260cf1b95f76f
3
+ size 4516140233
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/checkpoint-final.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a618c80dcef89b1a41580d858134bac0d79b39130586e8d036a26ac9582c9f2
3
+ size 3873507717
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/checkpoint-last.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb7f68ba708455c27ac4b78128cf8a34f0f9511caf47074e3eea5431d5dfdfdb
3
+ size 4516145306
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/CHANGELOG.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Changelog
2
+
3
+ ## Hardware
4
+
5
+ | Component | Specification |
6
+ |-----------|---------------|
7
+ | GPU | 8 x NVIDIA L40S |
8
+ | CPU | AMD EPYC 7763 64-Core Processor (112 vCPUs) |
9
+ | Memory | 755 GiB |
10
+
11
+ ## 2026-04-03
12
+
13
+ ### Added
14
+
15
+ - new script `slam/exp_joint_freeze_frontend_fsdp_8gpu.sh`
16
+ - `SKIP_TEST` flag in `slam/exp_joint_freeze_frontend_fsdp_8gpu.sh` (default `0`). When set to `1`, the test dataset Hydra override is cleared and no test data loaders are built.
17
+ - Guard in `src/finetune.py` to skip test dataset construction when `test_dataset` is empty, setting `data_loader_test = {}`.
18
+ - Guard in `src/finetune.py` (`train_one_epoch`) against `ZeroDivisionError` when `int(save_freq * len(data_loader))` truncates to 0 (e.g. small dataset on many GPUs). Intra-epoch checkpoint saving is now skipped gracefully in this case.
19
+ - Adjusted dataset paths to match the local machine's data layout
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/README.md ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <h1>SLAM-Former Submap Release Guide</h1>
3
+ <h3>GitHub handoff, FSDP training, and full-sequence TUM runbook</h3>
4
+ </div>
5
+
6
+ ## What this upload is for
7
+
8
+ This branch is the submap-oriented training and inference release prepared for GitHub handoff.
9
+
10
+ The release is built around the H20 launchers and the current submap memory path, with the following fixed requirements:
11
+
12
+ - **Distributed strategy**: `FSDP`
13
+ - **Submap length**: `submap_size=12`
14
+ - **Descriptor source**: `frontend`
15
+ - **Historical token fetch source**: `frontend`
16
+ - **External epoch control**: set `EPOCHS` outside the script
17
+ - **Comparison modes**:
18
+ - `submap-only`
19
+ - `backend + submap joint training with detached frontend tokens`
20
+
21
+ If you want the original upstream project README, see `README_ori.md`.
22
+ If you want more implementation detail on the submap system, see `README_submap.md` and `submap_handoff.md`.
23
+
24
+ ## Clone the published branch
25
+
26
+ ```bash
27
+ git clone -b submap https://github.com/SlamMate/e2e-semantic-SLAM.git
28
+ cd e2e-semantic-SLAM
29
+ ```
30
+
31
+ ## Environment
32
+
33
+ ```bash
34
+ conda env create -f environment.yml
35
+ conda activate SLAM-Former
36
+ ```
37
+
38
+ The exported `environment.yml` is a full snapshot of the current Conda environment, including CUDA-related Python packages. If you want to move it between machines, you can keep the file as-is or delete the final `prefix:` line for a cleaner portable export.
39
+
40
+ If you prefer the original manual install flow:
41
+
42
+ ```bash
43
+ conda create -n SLAM-Former python=3.11
44
+ conda activate SLAM-Former
45
+
46
+ pip install -r requirements.txt
47
+ pip install -e .
48
+ ```
49
+
50
+ ## Paths and launch parameters to set on a new cluster
51
+
52
+ | Variable | Meaning | Typical example |
53
+ | --- | --- | --- |
54
+ | `PROJECT_DIR` | repository root | `/path/to/e2e-semantic-SLAM` |
55
+ | `DATA_ROOT` | training data root | `/path/to/data/train` |
56
+ | `PRETRAINED` | pretrained SLAM-Former checkpoint | `/path/to/ckpt/checkpoint-10.pth.model` |
57
+ | `SAVE_DIR` | checkpoint output root | `/path/to/checkpoints` |
58
+ | `CONDA_SH` | conda init script | `/path/to/miniconda3/etc/profile.d/conda.sh` |
59
+ | `CONDA_ENV_NAME` | conda env name | `SLAM-Former` |
60
+ | `MASTER_ADDR` | rank-0 node hostname or IP for multi-node bash launch | `node0` |
61
+ | `MACHINE_RANK` | machine rank for multi-node bash launch | `0..7` |
62
+ | `NUM_MACHINES` | machine count for remote long run | `8` |
63
+ | `GPUS_PER_NODE` | visible GPUs per node | `8` |
64
+ | `MASTER_PORT` | communication port | `29661` / `29671` etc. |
65
+
66
+ For migration to a new machine, the entries above are the first ones to review. If the data layout is unchanged and each dataset still lives under `DATA_ROOT/<dataset>`, you can usually keep the `ROOT_*` defaults.
67
+
68
+ The default data layout expected by the launchers is:
69
+
70
+ - `processed_arkitscenes`
71
+ - `processed_scannetpp`
72
+ - `processed_scannet` or `processed_scannetv2`
73
+ - `hypersim`
74
+ - `processed_blendedmvs`
75
+ - `processed_megadepth`
76
+ - `processed_mvs_synth`
77
+
78
+ ## Fixed training invariants in this release
79
+
80
+ - `CONFIG_NAME=finetune_paper_h20.yaml`
81
+ - `DIST_STRATEGY=fsdp`
82
+ - `SUBMAP_SIZE=12`
83
+ - `SUBMAP_TRAIN_MODE=full_token`
84
+ - `SUBMAP_RETRIEVAL_TOPK=0`
85
+ - `SUBMAP_FETCH_SOURCE=frontend`
86
+ - `SUBMAP_DESCRIPTOR_SOURCE=frontend`
87
+ - `freeze_encoder=true` stays inherited from `config/finetune_paper_h20.yaml`
88
+ - `DETACH_FRONTEND_TOKENS=1` in both comparison modes
89
+ - `TRAIN_SUBMAP_MODULES_ONLY=1` for strict submap-only training
90
+ - `TRAIN_SUBMAP_MODULES_ONLY=0` for backend + submap joint training while frontend tokens stay detached
91
+
92
+ ## `num_views` reference table
93
+
94
+ The remote 8-node scripts use aggressive long-sequence defaults.
95
+ These are **reference upper bounds**, not guarantees that every scene can be used without skipping.
96
+ For ARKitScenes, the local audit shows a very small strict no-skip cap because of a few short scenes, but much longer clips are still possible if short scenes are skipped.
97
+
98
+ | Dataset | Remote 8x8 default | StrictCapNoSkip | MedianCap | MaxCap | Notes |
99
+ | --- | ---: | ---: | ---: | ---: | --- |
100
+ | ARKitScenes | 478 | 2 | 92 | 478 | strict cap is dominated by short scenes; long clips work with scene skipping |
101
+ | ScanNet++ | 150 | 45 | 143 | 150 | local processed data already supports long clips |
102
+ | ScanNet | 64 | N/A | N/A | N/A | local audit unavailable; rerun audit on target cluster if needed |
103
+ | HyperSim | 64 | N/A | N/A | N/A | local audit unavailable |
104
+ | BlendedMVS | 64 | N/A | N/A | N/A | local audit unavailable |
105
+ | MegaDepth | 64 | N/A | N/A | 64 | loader-side hard cap is 64 |
106
+ | MVS-Synth | 100 | 69 | 100 | 100 | long clips are supported on all local scenes |
107
+
108
+ If your target cluster has different processed data, rerun the audit and override `NUM_VIEWS_*` as needed.
109
+
110
+ ## Scripts provided in this release
111
+
112
+ | Script | Launch mode | Purpose |
113
+ | --- | --- | --- |
114
+ | `slam/sbatch_smoke_submap_only_fsdp_2gpu.sh` | local `sbatch` | 1-node 2-GPU smoke validation for strict submap-only mode |
115
+ | `slam/sbatch_smoke_joint_freeze_frontend_fsdp_2gpu.sh` | local `sbatch` | 1-node 2-GPU smoke validation for backend + submap with detached frontend |
116
+ | `slam/train_remote_submap_only_fsdp_8node8gpu.sh` | remote `bash` | 8-node 8-GPU-per-node long-sequence submap-only training |
117
+ | `slam/train_remote_joint_freeze_frontend_fsdp_8node8gpu.sh` | remote `bash` | 8-node 8-GPU-per-node long-sequence backend + submap training with detached frontend |
118
+
119
+ ## Migration guide: run the smoke scripts on another machine
120
+
121
+ These two smoke launchers are self-contained. They no longer rely on the old wrapper scripts. Each launcher:
122
+
123
+ - resolves `PROJECT_DIR` from `PROJECT_DIR`, `SLURM_SUBMIT_DIR`, or the script location;
124
+ - requires `CONDA_SH` to point to a valid `conda.sh` file and activates `CONDA_ENV_NAME` from that shell;
125
+ - loads `cuda12.1/toolkit` when the `module` command is available;
126
+ - sets `PYTHONPATH` to include `src/`;
127
+ - launches `src/finetune.py` through `accelerate launch`.
128
+
129
+ ### 1. Minimum checklist before the first run
130
+
131
+ 1. Clone or copy the repository onto the new machine.
132
+ 2. Create the environment with `conda env create -f environment.yml`, and make sure `CONDA_SH` points at the correct `conda.sh` on the new machine.
133
+ 3. Make sure the pretrained checkpoint exists at `PRETRAINED` or override `PRETRAINED=...`.
134
+ 4. Point `DATA_ROOT` to the processed training data on the new machine.
135
+ 5. Verify the dataset roots under `DATA_ROOT`, or override the individual `ROOT_*` variables.
136
+ 6. Check that the new machine has a compatible CUDA setup. If the cluster uses a different module name, edit the `module load cuda12.1/toolkit` line.
137
+ 7. If you submit with Slurm, keep the `#SBATCH` resource requests consistent with the machine’s GPU, CPU, and memory limits.
138
+
139
+ ### 2. Variables you usually need to change
140
+
141
+ | Variable | Meaning | Typical reason to change |
142
+ | --- | --- | --- |
143
+ | `PROJECT_DIR` | repository root used for `PYTHONPATH`, output paths, and the default checkpoint path | the repo lives in a different directory |
144
+ | `CONDA_SH` | path to `conda.sh` used to initialize Conda | the machine uses a different Miniconda install, or the default path does not exist |
145
+ | `CONDA_ENV_NAME` | environment name to activate | you created the env under a different name |
146
+ | `DATA_ROOT` | top-level directory that contains the processed datasets | the training data is mounted elsewhere |
147
+ | `ROOT_ARKIT`, `ROOT_SCANNETPP`, `ROOT_SCANNET`, `ROOT_SCANNET_FALLBACK`, `ROOT_HYPERSIM`, `ROOT_BLENDEDMVS`, `ROOT_MEGADEPTH`, `ROOT_MVS_SYNTH` | per-dataset roots used by the launchers | the dataset folders do not live directly under `DATA_ROOT` or ScanNet needs a fallback root |
148
+ | `PRETRAINED` | checkpoint loaded before training starts | the pretrained model is stored somewhere else |
149
+ | `SAVE_DIR` | root directory for checkpoints and logs | you want outputs on a different disk |
150
+ | `MASTER_PORT` | port used by `accelerate` to rendezvous the worker processes | another job is already using the default port |
151
+ | `NUM_GPUS` | number of processes / GPUs launched | the target node exposes a different GPU count |
152
+ | `AUTO_DISABLE_MISSING` | auto-disable a dataset whose root is missing or incomplete | set to `0` if you want the job to fail fast instead of silently skipping data |
153
+ | `EXPERIMENT_ROOT`, `VARIANT_NAME`, `EXP_NAME` | folder and experiment naming used for outputs | you want a different output namespace or to avoid collisions with old runs |
154
+ | `RESUME` | checkpoint path to resume from | you are continuing an interrupted run |
155
+
156
+ The following Slurm header lines also need cluster-specific tuning:
157
+
158
+ - `#SBATCH --gres=gpu:2` requests two GPUs.
159
+ - `#SBATCH --cpus-per-task` controls CPU cores.
160
+ - `#SBATCH --mem` controls RAM.
161
+ - `#SBATCH --time` is the wall-time limit.
162
+
163
+ ### 3. Variables that control the training recipe
164
+
165
+ | Variable | Meaning | Notes |
166
+ | --- | --- | --- |
167
+ | `CONFIG_NAME` | Hydra config file passed to `src/finetune.py` | both smoke scripts use `finetune_paper_h20.yaml` |
168
+ | `DIST_STRATEGY` | distributed backend (`fsdp` or `ddp`) | both smoke scripts use `fsdp` |
169
+ | `TRAIN_SUBMAP_MODULES_ONLY` | `1` = strict submap-only training; `0` = joint backend+submap training | `1` for `slam/sbatch_smoke_submap_only_fsdp_2gpu.sh`, `0` for `slam/sbatch_smoke_joint_freeze_frontend_fsdp_2gpu.sh` |
170
+ | `DETACH_FRONTEND_TOKENS` | detach frontend tokens from gradients | `1` in both scripts |
171
+ | `SUBMAP_SIZE` | number of frames / tokens per submap | default `12` in both scripts |
172
+ | `SUBMAP_TRAIN_MODE` | submap training mode | default `full_token` |
173
+ | `SUBMAP_RETRIEVAL_TOPK` | retrieval top-k setting | default `0` disables retrieval |
174
+ | `SUBMAP_FETCH_SOURCE` | source used to fetch submap features | default `frontend` |
175
+ | `SUBMAP_DESCRIPTOR_SOURCE` | source used to build submap descriptors | default `frontend` |
176
+ | `ENABLE_PSEUDO_GT` | enable pseudo-GT cache usage | keep `0` unless you have a valid cache |
177
+ | `PSEUDO_GT_CACHE_PATH` | path to the pseudo-GT cache | required when `ENABLE_PSEUDO_GT=1` |
178
+ | `EPOCHS` | number of epochs passed to `src/finetune.py` | smoke scripts default to `2` |
179
+
180
+ ### 4. Dataset-mixture knobs
181
+
182
+ | Variable | Meaning | Notes |
183
+ | --- | --- | --- |
184
+ | `SAMPLES_ARKIT`, `SAMPLES_SCANNETPP`, `SAMPLES_SCANNET`, `SAMPLES_HYPERSIM`, `SAMPLES_BLENDEDMVS`, `SAMPLES_MEGADEPTH`, `SAMPLES_MVS_SYNTH` | per-dataset sampling weights in the training mixture | increase or decrease them to rebalance the dataset mix |
185
+ | `NUM_VIEWS_ARKIT`, `NUM_VIEWS_SCANNETPP`, `NUM_VIEWS_SCANNET`, `NUM_VIEWS_HYPERSIM`, `NUM_VIEWS_BLENDEDMVS`, `NUM_VIEWS_MEGADEPTH`, `NUM_VIEWS_MVS_SYNTH` | per-dataset view caps | change these if the processed data on the new machine has different sequence lengths or hard caps |
186
+ | `GLOBAL_NUM_VIEWS` | optional global cap; if unset, the scripts derive it from the active datasets’ `NUM_VIEWS_*` values | set it when you want a single global value for all datasets |
187
+ | `NUM_VIEWS_ALL` | compatibility placeholder kept by the scripts | usually leave it at the default; the launcher mainly uses the per-dataset values above |
188
+
189
+ ### 5. The two scripts differ only in these defaults
190
+
191
+ | Item | Submap-only script | Joint + frozen frontend script | Meaning |
192
+ | --- | --- | --- | --- |
193
+ | `TRAIN_SUBMAP_MODULES_ONLY` | `1` | `0` | whether to train only the submap modules or the joint backend + submap stack |
194
+ | `MASTER_PORT` | `29661` | `29662` | keep the two smoke jobs from colliding on the same node |
195
+ | `VARIANT_NAME` | `submap_only_fsdp_sub12` | `joint_freeze_frontend_fsdp_sub12` | output subdirectory name under `SAVE_DIR` |
196
+ | `EXP_NAME` | `paper_smoke_submap_only_fsdp_2gpu_sub12` | `paper_smoke_joint_freeze_frontend_fsdp_2gpu_sub12` | experiment name written into logs and Hydra config |
197
+ | `#SBATCH --cpus-per-task` | `24` | `12` | CPU reservation for the smoke job |
198
+ | `#SBATCH --mem` | `120G` | `24G` | memory reservation for the smoke job |
199
+
200
+ ### 6. Direct launch examples on a new machine
201
+
202
+ To run on another Slurm machine, set the machine-specific variables inline and submit the launcher from the repo root.
203
+
204
+ Submap-only smoke:
205
+
206
+ ```bash
207
+ PROJECT_DIR=/path/to/e2e-semantic-SLAM \
208
+ CONDA_SH=/path/to/miniconda3/etc/profile.d/conda.sh \
209
+ CONDA_ENV_NAME=SLAM-Former \
210
+ DATA_ROOT=/path/to/data/train \
211
+ PRETRAINED=/path/to/ckpt/checkpoint-10.pth.model \
212
+ SAVE_DIR=/path/to/checkpoints \
213
+ MASTER_PORT=29661 \
214
+ sbatch slam/sbatch_smoke_submap_only_fsdp_2gpu.sh
215
+ ```
216
+
217
+ Joint + frozen frontend smoke:
218
+
219
+ ```bash
220
+ PROJECT_DIR=/path/to/e2e-semantic-SLAM \
221
+ CONDA_SH=/path/to/miniconda3/etc/profile.d/conda.sh \
222
+ CONDA_ENV_NAME=SLAM-Former \
223
+ DATA_ROOT=/path/to/data/train \
224
+ PRETRAINED=/path/to/ckpt/checkpoint-10.pth.model \
225
+ SAVE_DIR=/path/to/checkpoints \
226
+ MASTER_PORT=29662 \
227
+ sbatch slam/sbatch_smoke_joint_freeze_frontend_fsdp_2gpu.sh
228
+ ```
229
+
230
+ If the new machine does not have Slurm, you can still run the scripts with `bash ...` as long as the same two GPUs are visible to the shell and `accelerate` / CUDA are available; the `#SBATCH` lines are then ignored by Bash.
231
+
232
+ ## Local smoke validation on the current cluster
233
+
234
+ The two local `sbatch` scripts are intended to validate:
235
+
236
+ - the launcher path
237
+ - the FSDP wiring
238
+ - the README instructions
239
+ - the two comparison modes
240
+
241
+ They intentionally use a small sample budget and `64` views so they can be checked quickly on 1 node and 2 GPUs.
242
+
243
+ ### 1. Submap-only smoke
244
+
245
+ ```bash
246
+ EPOCHS=1 \
247
+ bash slam/sbatch_smoke_submap_only_fsdp_2gpu.sh
248
+ ```
249
+
250
+ ### 2. Backend + submap smoke with detached frontend
251
+
252
+ ```bash
253
+ EPOCHS=1 \
254
+ bash slam/sbatch_smoke_joint_freeze_frontend_fsdp_2gpu.sh
255
+ ```
256
+
257
+ Local smoke defaults:
258
+
259
+ - `SAMPLES_ARKIT=32`
260
+ - `SAMPLES_SCANNETPP=16`
261
+ - all other local smoke sample weights default to `0`
262
+ - `NUM_VIEWS_* = 64`
263
+ - `SUBMAP_SIZE = 12`
264
+
265
+ This keeps the smoke validation focused on code path correctness rather than final throughput.
266
+
267
+ ## Remote long-sequence training on 8 nodes x 8 GPUs
268
+
269
+ The remote scripts are the actual release launchers for long-sequence comparison.
270
+ Run the **same script on every node**, and only change `MACHINE_RANK`.
271
+
272
+ Remote defaults:
273
+
274
+ - `AUTO_DISABLE_MISSING=0`
275
+ - full paper dataset sample mix from `finetune_paper_h20.yaml`
276
+ - aggressive per-dataset `NUM_VIEWS_*` values from the table above
277
+ - `SUBMAP_SIZE=12`
278
+ - `SUBMAP_FETCH_SOURCE=frontend`
279
+ - `SUBMAP_DESCRIPTOR_SOURCE=frontend`
280
+
281
+ ### 1. Remote 8x8 submap-only run
282
+
283
+ On rank 0:
284
+
285
+ ```bash
286
+ MASTER_ADDR=node0 \
287
+ MACHINE_RANK=0 \
288
+ NUM_MACHINES=8 \
289
+ GPUS_PER_NODE=8 \
290
+ DATA_ROOT=/path/to/data/train \
291
+ SAVE_DIR=/path/to/checkpoints \
292
+ PRETRAINED=/path/to/ckpt/checkpoint-10.pth.model \
293
+ CONDA_SH=/path/to/miniconda3/etc/profile.d/conda.sh \
294
+ CONDA_ENV_NAME=SLAM-Former \
295
+ EPOCHS=10 \
296
+ bash slam/train_remote_submap_only_fsdp_8node8gpu.sh
297
+ ```
298
+
299
+ On the remaining nodes, rerun the same command with `MACHINE_RANK=1` through `7`.
300
+
301
+ ### 2. Remote 8x8 backend + submap run with detached frontend
302
+
303
+ On rank 0:
304
+
305
+ ```bash
306
+ MASTER_ADDR=node0 \
307
+ MACHINE_RANK=0 \
308
+ NUM_MACHINES=8 \
309
+ GPUS_PER_NODE=8 \
310
+ DATA_ROOT=/path/to/data/train \
311
+ SAVE_DIR=/path/to/checkpoints \
312
+ PRETRAINED=/path/to/ckpt/checkpoint-10.pth.model \
313
+ CONDA_SH=/path/to/miniconda3/etc/profile.d/conda.sh \
314
+ CONDA_ENV_NAME=SLAM-Former \
315
+ EPOCHS=10 \
316
+ bash slam/train_remote_joint_freeze_frontend_fsdp_8node8gpu.sh
317
+ ```
318
+
319
+ On the remaining nodes, rerun the same command with `MACHINE_RANK=1` through `7`.
320
+
321
+ ## Whole-sequence TUM inference after training
322
+
323
+ After training, use the saved checkpoint to run the full Freiburg1 TUM sequences.
324
+
325
+ ```bash
326
+ CKPT_PATH=/path/to/checkpoint-last.pth \
327
+ RUN_TAG=release_eval \
328
+ SUBMAP_INFERENCE_MODE=full \
329
+ SUBMAP_TRAIN_MODE=full_token \
330
+ SUBMAP_FETCH_SOURCE=frontend \
331
+ SUBMAP_DESCRIPTOR_SOURCE=frontend \
332
+ sbatch run_tum.sh
333
+ ```
334
+
335
+ This writes outputs to:
336
+
337
+ ```bash
338
+ tum_results_aligned/<RUN_TAG>/rgbd_dataset_freiburg1_*/
339
+ ```
340
+
341
+ Each sequence directory contains at least:
342
+
343
+ - `final_traj.txt`
344
+ - `final.ply`
345
+ - `final_pc/`
346
+
347
+ To evaluate ATE after the sequence finishes:
348
+
349
+ ```bash
350
+ evo_ape tum <ground_truth.txt> <final_traj.txt> -a --t_max_diff 0.02
351
+ ```
352
+
353
+ ## Output layout
354
+
355
+ Training outputs are written under:
356
+
357
+ ```bash
358
+ $SAVE_DIR/$EXP_NAME/
359
+ ```
360
+
361
+ Typical files include:
362
+
363
+ - `checkpoint-last.pth`
364
+ - `model.pth`
365
+ - `logs/`
366
+ - launcher stdout / stderr logs
367
+
368
+ ## Practical notes
369
+
370
+ - The local `sbatch` scripts are smoke validators.
371
+ - The remote `bash` scripts are the actual long-sequence release launchers.
372
+ - If the target cluster has a different sequence-cap profile, rerun `slam/audit_dataset_num_views.py` there and override `NUM_VIEWS_*`.
373
+ - If a remote run fails on missing dataset roots, keep `AUTO_DISABLE_MISSING=0` and fix the paths instead of silently training on a partial dataset mix.
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/README_submap.md ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <h1>SLAM-Former Submap Companion / 子图系统说明</h1>
3
+ </div>
4
+
5
+ ### Updates
6
+ * [Mar 30, 2026] Added a submap-native training and inference path with switchable `full_token` / `top5_dual_queue` modes.
7
+ * [Mar 30, 2026] Added dedicated TUM top5 inference support and a standalone `run_tum_top5.sh` launcher.
8
+ * [Mar 26, 2026] Submap-only pseudo-GT training was exercised with the high-recall config and FSDP launchers.
9
+
10
+ ### Getting Started
11
+
12
+ The environment setup is the same as the original `README.md`:
13
+
14
+ ```bash
15
+ git clone https://github.com/Tsinghua-MARS-Lab/SLAM-Former.git
16
+ cd SLAM-Former
17
+
18
+ conda create -n SLAM-Former python=3.11
19
+ conda activate SLAM-Former
20
+
21
+ pip install -r requirements.txt
22
+ pip install -e .
23
+ ```
24
+
25
+ ### Submap System Overview
26
+
27
+ This companion branch keeps the native SLAM-Former pipeline as close as possible to the original implementation, while adding a submap-oriented memory backend and switchable queue semantics.
28
+
29
+ Core ideas:
30
+
31
+ * **`GraphGatedMemoryManager`** stores historical submaps on CPU, keeps descriptors, and performs loop retrieval.
32
+ * **`slam/demo_submap.py`** is the submap-aware inference entrypoint.
33
+ * **`src/forward_pass.py`** and **`src/forward_pass_submap.py`** slice the backend output so supervision is applied to the current submap while still letting the backend see `prev + curr + retrieved` context.
34
+ * **`src/finetune.py`** and the H20 launchers expose the submap configuration through script variables.
35
+ * **`run_tum.sh`** and **`run_tum_top5.sh`** drive TUM inference with either the full-token path or the top5 path.
36
+
37
+ Important submap switches:
38
+
39
+ * `TRAIN_SUBMAP_MODULES_ONLY`
40
+ * `1` freezes the main SLAMFormer parameters and trains the submap-side modules.
41
+ * `0` keeps the joint training path available.
42
+ * `SUBMAP_TRAIN_MODE`
43
+ * `full_token`: keep the submap path close to the native full-token behavior.
44
+ * `top5_dual_queue`: use two queues, where the frontend queue is read-only and the backend queue receives write-back for retrieved historical submaps.
45
+ * `SUBMAP_RETRIEVAL_TOPK`
46
+ * number of historical submaps fetched in the soft retrieval mode.
47
+ * `SUBMAP_FETCH_SOURCE` / `SUBMAP_DESCRIPTOR_SOURCE`
48
+ * choose whether retrieval and descriptor storage read from `frontend` or `backend` banks.
49
+ * `SUBMAP_INFERENCE_MODE`
50
+ * `full` or `top5` for the TUM launch scripts.
51
+
52
+ ### Training Modes
53
+
54
+ There are three common training setups in this branch:
55
+
56
+ #### 1. Joint baseline
57
+
58
+ Use the original joint configuration when you want the closest comparison to the official training branch:
59
+
60
+ * `config/finetune.yaml`
61
+ * `TRAIN_SUBMAP_MODULES_ONLY=0`
62
+ * `SUBMAP_TRAIN_MODE=full_token`
63
+ * `SUBMAP_RETRIEVAL_TOPK=0`
64
+
65
+ #### 2. Submap-only full-token training
66
+
67
+ This is the first submap stage: descriptors and historical submaps are trained with full-token submaps so historical submaps can still receive gradients.
68
+
69
+ Recommended config:
70
+
71
+ * `config/finetune_sub_only.yaml`
72
+ * or `config/finetune_pseudo_gt_high_recall.yaml`
73
+
74
+ Typical launch knobs:
75
+
76
+ * `TRAIN_SUBMAP_MODULES_ONLY=1`
77
+ * `SUBMAP_TRAIN_MODE=full_token`
78
+ * `SUBMAP_RETRIEVAL_TOPK=0`
79
+
80
+ Example:
81
+
82
+ ```bash
83
+ ENABLE_PSEUDO_GT=1 \
84
+ PSEUDO_GT_CACHE_PATH=/var/scratch/qzhang2/SLAM-Former/data/train/pseudo_gt/arkitscenes_smoke_test.json \
85
+ CONFIG_NAME=finetune_pseudo_gt_high_recall.yaml \
86
+ TRAIN_SUBMAP_MODULES_ONLY=1 \
87
+ SUBMAP_TRAIN_MODE=full_token \
88
+ SUBMAP_RETRIEVAL_TOPK=0 \
89
+ sbatch slam/sbatch_finetune.sh
90
+ ```
91
+
92
+ #### 3. Submap top5 fine-tuning
93
+
94
+ This is the second stage: the backend sees the top5 historical submaps, and the system uses the dual-queue semantics.
95
+
96
+ Typical launch knobs:
97
+
98
+ * `TRAIN_SUBMAP_MODULES_ONLY=1`
99
+ * `SUBMAP_TRAIN_MODE=top5_dual_queue`
100
+ * `SUBMAP_RETRIEVAL_TOPK=5`
101
+ * `SUBMAP_FETCH_SOURCE=frontend`
102
+ * `SUBMAP_DESCRIPTOR_SOURCE=frontend`
103
+
104
+ Example:
105
+
106
+ ```bash
107
+ ENABLE_PSEUDO_GT=1 \
108
+ PSEUDO_GT_CACHE_PATH=/var/scratch/qzhang2/SLAM-Former/data/train/pseudo_gt/arkitscenes_smoke_test.json \
109
+ CONFIG_NAME=finetune_pseudo_gt_high_recall.yaml \
110
+ TRAIN_SUBMAP_MODULES_ONLY=1 \
111
+ SUBMAP_TRAIN_MODE=top5_dual_queue \
112
+ SUBMAP_RETRIEVAL_TOPK=5 \
113
+ sbatch slam/sbatch_finetune.sh
114
+ ```
115
+
116
+ ### Inference Modes
117
+
118
+ #### Official baseline comparison
119
+
120
+ For the native baseline, keep using the original demo path from `README.md`:
121
+
122
+ ```bash
123
+ python slam/demo.py \
124
+ --ckpt_path ckpt/checkpoint.pth.model \
125
+ --image_folder /path/to/your/images/ \
126
+ --output_dir /output/result \
127
+ --target_size 518 \
128
+ --retention_ratio 0.5
129
+ ```
130
+
131
+ #### Submap TUM inference: full mode
132
+
133
+ This path uses the submap-aware demo but keeps the inference behavior in `full` mode.
134
+
135
+ ```bash
136
+ SUBMAP_INFERENCE_MODE=full \
137
+ CKPT_PATH=/var/scratch/qzhang2/SLAM-Former/ckpt/checkpoint-10.pth.model \
138
+ sbatch run_tum.sh
139
+ ```
140
+
141
+ #### Submap TUM inference: top5 mode
142
+
143
+ This is the dedicated top5 launcher for comparing against the full mode.
144
+
145
+ ```bash
146
+ sbatch run_tum_top5.sh
147
+ ```
148
+
149
+ You can also override the checkpoint and output root explicitly:
150
+
151
+ ```bash
152
+ CKPT_PATH=/var/scratch/qzhang2/SLAM-Former/checkpoints/local_cluster_nv24_sub6/submap_only_pseudo_gt_high_recall_smoke/paper_local_submap_only_pseudo_gt_high_recall_smoke_nv24_sub6/checkpoint-last.pth \
153
+ SUBMAP_INFERENCE_MODE=top5 \
154
+ RUN_TAG=my_top5_compare \
155
+ OUT_DIR=/var/scratch/qzhang2/SLAM-Former/tum_results_aligned_top5/my_top5_compare \
156
+ sbatch run_tum_top5.sh
157
+ ```
158
+
159
+ ### Launch Scripts
160
+
161
+ | Script | Purpose | Notes |
162
+ | --- | --- | --- |
163
+ | `slam/sbatch_finetune.sh` | Local 3-GPU FSDP finetune launcher | Uses environment variables to select config, submap mode, and pseudo-GT cache. |
164
+ | `slam/run_train_h20_single.sh` | Single-node H20-style training launcher | Good when you want to run the training job directly without the wrapper. |
165
+ | `slam/run_train_h20_multi.sh` | Multi-node H20-style training launcher | Keeps the same submap knobs, but launches across multiple machines. |
166
+ | `run_tum.sh` | TUM inference launcher | Supports `SUBMAP_INFERENCE_MODE=full\|top5`. |
167
+ | `run_tum_top5.sh` | Dedicated TUM top5 launcher | Defaults to the latest submap checkpoint and top5 mode. |
168
+ | `slam/demo_submap.py` | Manual inference entrypoint | Accepts `--submap_train_mode`, `--submap_retrieval_topk`, `--loop_mask_mode`, `--submap_fetch_source`, `--submap_descriptor_source`, and `--max_recursive_submaps`. |
169
+
170
+ ### Configuration Files
171
+
172
+ | Config | Role |
173
+ | --- | --- |
174
+ | `config/finetune.yaml` | Joint training baseline. |
175
+ | `config/finetune_sub_only.yaml` | Submap-only training with full-token semantics. |
176
+ | `config/finetune_pseudo_gt_high_recall.yaml` | Submap-only training with higher-recall pseudo-GT settings. |
177
+
178
+ ### Data and Checkpoints
179
+
180
+ The data layout is the same as the original project. The TUM root used by the launch scripts is:
181
+
182
+ ```bash
183
+ /var/scratch/qzhang2/Feature-SLAM/datasets/tum
184
+ ```
185
+
186
+ Typical checkpoint locations are under:
187
+
188
+ ```bash
189
+ /var/scratch/qzhang2/SLAM-Former/checkpoints/
190
+ ```
191
+
192
+ For inference, the scripts usually read `checkpoint-last.pth` from the latest experiment folder.
193
+
194
+ ### Output Layout
195
+
196
+ Submap TUM inference writes one folder per sequence, for example:
197
+
198
+ ```bash
199
+ tum_results_aligned_top5/<run_tag>/rgbd_dataset_freiburg1_360/
200
+ ```
201
+
202
+ Inside each sequence directory you should expect:
203
+
204
+ * `final_traj.txt`
205
+ * `final.ply`
206
+ * `final_pc/`
207
+
208
+ ### Visualization
209
+
210
+ Static visualization is the same as the original branch:
211
+
212
+ ```bash
213
+ python slam/visualize_results.py --result_dir /path/to/output_dir
214
+ ```
215
+
216
+ For TUM results generated by the submap scripts, point `--result_dir` to the corresponding output folder.
217
+
218
+ ### Notes
219
+
220
+ * Keep `README.md` as the official baseline reference.
221
+ * Use this file when you want to describe or run the submap branch.
222
+ * The main comparison axes are:
223
+ * official baseline vs submap branch
224
+ * full-token submap mode vs top5 dual-queue mode
225
+ * joint training vs submap-only training
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/base_opt.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+ import cv2
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import roma
7
+ from copy import deepcopy
8
+ import tqdm
9
+ import os
10
+ import matplotlib.pyplot as plt
11
+
12
+ from cloud_opt.utils import *
13
+ from cloud_opt.utils import _check_edges, _compute_img_conf
14
+ import cloud_opt.init_all as init_fun
15
+
16
+
17
class BaseOptimizer(nn.Module):
    """Optimize a global scene, given a graph-organized observations.

    Graph node: images
    Graph edges: observations = (pred1, pred2), pred2 is in pred1's coordinate

    The class stores per-edge 3D point predictions and confidences, plus
    learnable pairwise poses/adaptors, and exposes a forward() that scores
    how well the global point cloud agrees with every pairwise prediction.
    """

    def __init__(self, *args, **kwargs):
        # Intentionally empty: subclasses are expected to invoke
        # _init_from_views(), which itself calls super().__init__() and
        # performs the real nn.Module initialization.
        pass

    def _init_from_views(
        self,
        view1s,
        view2s,
        pred1s,
        pred2s,  # whatever predictions, they should be organized into pairwise for graph optimization
        dist="l1",
        conf="log",
        min_conf_thr=3,
        thr_for_init_conf=False,
        base_scale=0.5,
        allow_pw_adaptors=False,
        pw_break=20,
        rand_pose=torch.randn,
        empty_cache=False,
        verbose=True,
    ):
        """Populate optimizer state from pairwise view predictions.

        view1s/view2s: per-pair view dicts; each must carry an integer "idx".
        pred1s: per-pair dicts with "pts3d_is_self_view" and "conf_self".
        pred2s: per-pair dicts with "pts3d_in_other_view" and "conf".
        dist: key into ALL_DISTS selecting the pointwise distance.
        conf: confidence-transform mode passed to get_conf_trf().
        rand_pose: factory used to initialize the pairwise pose parameters.

        NOTE(review): `empty_cache` and `verbose` are accepted but never
        stored here, yet global_alignment_loop() reads net.empty_cache and
        net.verbose — presumably a subclass assigns them; confirm.
        """
        super().__init__()
        # One (i, j) tuple per observation pair, read from each view's "idx".
        self.edges = [
            (int(view1["idx"]), int(view2["idx"]))
            for view1, view2 in zip(view1s, view2s)
        ]
        self.dist = ALL_DISTS[dist]
        self.n_imgs = _check_edges(self.edges)

        # Per-edge points/confidences stored as frozen (no-grad) parameter
        # dicts, keyed by the "i_j" edge-name string.
        self.edge2pts_i = NoGradParamDict(
            {ij: pred1s[n]["pts3d_is_self_view"] for n, ij in enumerate(self.str_edges)}
        )  # ij: the name of the edge
        self.edge2pts_j = NoGradParamDict(
            {
                ij: pred2s[n]["pts3d_in_other_view"]
                for n, ij in enumerate(self.str_edges)
            }
        )
        self.edge2conf_i = NoGradParamDict(
            {ij: pred1s[n]["conf_self"] for n, ij in enumerate(self.str_edges)}
        )
        self.edge2conf_j = NoGradParamDict(
            {ij: pred2s[n]["conf"] for n, ij in enumerate(self.str_edges)}
        )

        self.imshapes = get_imshapes(self.edges, pred1s, pred2s)
        self.min_conf_thr = min_conf_thr
        self.thr_for_init_conf = thr_for_init_conf
        self.conf_trf = get_conf_trf(conf)

        # Per-image aggregated confidence maps, explicitly frozen.
        # NOTE(review): `self.device` is not defined in this class — it is
        # presumably a property provided by a subclass; confirm.
        self.im_conf = _compute_img_conf(
            self.imshapes, self.device, self.edges, self.edge2conf_i, self.edge2conf_j
        )
        for i in range(len(self.im_conf)):
            self.im_conf[i].requires_grad = False

        # Snapshot of the initial confidence maps (before any thresholding).
        self.init_conf_maps = [c.clone() for c in self.im_conf]

        self.base_scale = base_scale
        self.norm_pw_scale = True
        self.pw_break = pw_break
        self.POSE_DIM = 7  # unit quaternion (4) + translation (3)
        self.pw_poses = nn.Parameter(
            rand_pose((self.n_edges, 1 + self.POSE_DIM))
        )  # pairwise poses
        self.pw_adaptors = nn.Parameter(
            torch.zeros((self.n_edges, 2))
        )  # slight xy/z adaptation
        self.pw_adaptors.requires_grad_(allow_pw_adaptors)
        self.has_im_poses = False
        self.rand_pose = rand_pose

    def get_known_poses(self):
        """Return (count, mask, poses) of poses frozen by the caller.

        A pose counts as "known" when its parameter does not require grad.
        Returns (0, None, None) when per-image poses are not enabled.
        """
        if self.has_im_poses:
            known_poses_msk = torch.tensor(
                [not (p.requires_grad) for p in self.im_poses]
            )
            known_poses = self.get_im_poses()
            return known_poses_msk.sum(), known_poses_msk, known_poses
        else:
            return 0, None, None

    def get_pw_norm_scale_factor(self):
        """Global multiplier applied to pairwise scales.

        Chosen so the mean pairwise log-scale maps to self.base_scale.
        """
        if self.norm_pw_scale:
            # normalize scales so that things cannot go south
            # we want that exp(scale) ~= self.base_scale
            return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp()
        else:
            return 1  # don't norm scale for known poses

    def _set_pose(self, poses, idx, R, T=None, scale=None, force=False):
        """Write a cam-to-world pose into poses[idx] in-place.

        R: 3x3 rotation, or a 4x4 matrix carrying both R and T (then T must
           be None). T: translation. scale: optional log-scale slot.
        A frozen (requires_grad=False) pose is left untouched unless force.
        """
        # all poses == cam-to-world
        pose = poses[idx]
        if not (pose.requires_grad or force):
            return pose

        if R.shape == (4, 4):
            assert T is None
            T = R[:3, 3]
            R = R[:3, :3]

        if R is not None:
            pose.data[0:4] = roma.rotmat_to_unitquat(R)
        if T is not None:
            pose.data[4:7] = signed_log1p(
                T / (scale or 1)
            )  # translation is function of scale

        if scale is not None:
            assert poses.shape[-1] in (8, 13)
            pose.data[-1] = np.log(float(scale))
        return pose

    def forward(self, ret_details=False):
        """Compute the alignment loss over all edges.

        For each edge, the pairwise predictions are mapped into world space
        and compared (confidence-weighted) to the global point cloud.
        Returns the mean per-edge loss, plus an (n_imgs, n_imgs) matrix of
        per-edge losses when ret_details is True.

        NOTE(review): self.pred_i / self.pred_j / self.conf_i / self.conf_j
        and get_pw_poses/get_adaptors/get_pts3d are not defined in this
        class — presumably aliases/methods supplied by a subclass; confirm.
        """
        pw_poses = self.get_pw_poses()  # cam-to-world
        pw_adapt = self.get_adaptors()
        proj_pts3d = self.get_pts3d()
        # pre-compute pixel weights
        weight_i = {i_j: self.conf_trf(c) for i_j, c in self.conf_i.items()}
        weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()}

        loss = 0
        if ret_details:
            # -1 marks edges that were never scored.
            details = -torch.ones((self.n_imgs, self.n_imgs))

        for e, (i, j) in enumerate(self.edges):
            i_j = edge_str(i, j)
            # distance in image i and j
            aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j])
            aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j])
            li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean()
            lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean()
            loss = loss + li + lj

            if ret_details:
                details[i, j] = li + lj
        loss /= self.n_edges  # average over all pairs

        if ret_details:
            return loss, details
        return loss

    # Run global alignment in full precision regardless of any outer
    # autocast context.
    # NOTE(review): torch.cuda.amp.autocast is deprecated in recent torch
    # releases in favor of torch.amp.autocast("cuda", ...); confirm target
    # torch version before changing.
    @torch.cuda.amp.autocast(enabled=False)
    def compute_global_alignment(self, init=None, niter_PnP=10, **kw):
        """Optionally initialize the scene, then run the optimization loop.

        init: None (no init), "msp"/"mst" (minimum-spanning-tree init), or
        "known_poses" (currently not implemented).
        """
        if init is None:
            pass
        elif init == "msp" or init == "mst":
            init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP)
        elif init == "known_poses":
            raise NotImplementedError
            # NOTE(review): the two statements below are unreachable dead
            # code (they follow an unconditional raise); kept as-is.
            self.preset_pose(known_poses=self.camera_poses, requires_grad=True)
            init_fun.init_from_known_poses(
                self, min_conf_thr=self.min_conf_thr, niter_PnP=niter_PnP
            )
        else:
            raise ValueError(f"bad value for {init=}")

        return global_alignment_loop(self, **kw)

    @property
    def str_edges(self):
        # Edge names in the same order as self.edges.
        return [edge_str(i, j) for i, j in self.edges]

    @property
    def n_edges(self):
        return len(self.edges)
188
+
189
+
190
def global_alignment_loop(
    net,
    lr=0.01,
    niter=300,
    schedule="cosine",
    lr_min=1e-3,
    temporal_smoothing_weight=0,
    depth_map_save_dir=None,
):
    """Run the iterative global-alignment optimization on ``net``.

    net: an optimizer module; must expose .parameters(), .verbose,
        .empty_cache, optionally .get_depthmaps(), and be callable as
        net(epoch=...) returning a scalar loss.
    lr / lr_min: initial and final learning rate for the schedule.
    niter: number of optimization iterations.
    schedule: "cosine", "linear", or "cycleN" (see global_alignment_iter).
    temporal_smoothing_weight: forwarded to every iteration.
    depth_map_save_dir: when set (verbose path only), depth maps are dumped
        as PNGs every 500 iterations for debugging.

    Returns the final loss value — except when nothing requires gradients,
    in which case ``net`` itself is returned (see NOTE below).
    """
    params = [p for p in net.parameters() if p.requires_grad]
    if not params:
        # NOTE(review): inconsistent return type — this early exit returns
        # the module, while the normal path returns a float loss; callers
        # should not rely on the return type here.
        return net

    verbose = net.verbose
    if verbose:
        print("Global alignement - optimizing for:")
        print([name for name, value in net.named_parameters() if value.requires_grad])

    lr_base = lr
    # betas=(0.9, 0.9): lower second-moment decay than Adam's default 0.999.
    optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.9))

    loss = float("inf")
    if verbose:
        with tqdm.tqdm(total=niter) as bar:
            while bar.n < bar.total:
                # Periodically dump depth maps for visual debugging
                # (includes iteration 0).
                if bar.n % 500 == 0 and depth_map_save_dir is not None:
                    if not os.path.exists(depth_map_save_dir):
                        os.makedirs(depth_map_save_dir)
                    # visualize the depthmaps
                    depth_maps = net.get_depthmaps()
                    for i, depth_map in enumerate(depth_maps):
                        depth_map_save_path = os.path.join(
                            depth_map_save_dir, f"depthmaps_{i}_iter_{bar.n}.png"
                        )
                        plt.imsave(
                            depth_map_save_path,
                            depth_map.detach().cpu().numpy(),
                            cmap="jet",
                        )
                        print(
                            f"Saved depthmaps at iteration {bar.n} to {depth_map_save_dir}"
                        )
                loss, lr = global_alignment_iter(
                    net,
                    bar.n,
                    niter,
                    lr_base,
                    lr_min,
                    optimizer,
                    schedule,
                    temporal_smoothing_weight=temporal_smoothing_weight,
                )
                bar.set_postfix_str(f"{lr=:g} loss={loss:g}")
                bar.update()
    else:
        # Silent path: same iterations, no progress bar and no depth dumps.
        for n in range(niter):
            loss, _ = global_alignment_iter(
                net,
                n,
                niter,
                lr_base,
                lr_min,
                optimizer,
                schedule,
                temporal_smoothing_weight=temporal_smoothing_weight,
            )
    return loss
257
+
258
+
259
def global_alignment_iter(
    net,
    cur_iter,
    niter,
    lr_base,
    lr_min,
    optimizer,
    schedule,
    temporal_smoothing_weight=0,
):
    """Run one optimization step: set lr, forward, backward, step.

    schedule: "cosine", "linear", or "cycleN" where N is the number of
    linear cycles (defaults to 2 when the suffix is not an integer,
    e.g. plain "cycle").

    Returns (loss as float, learning rate used this step).

    NOTE(review): temporal_smoothing_weight is accepted but unused in this
    function body — presumably consumed elsewhere or vestigial; confirm.
    """
    # Normalized progress in [0, 1) used by all schedules.
    t = cur_iter / niter
    if schedule == "cosine":
        lr = cosine_schedule(t, lr_base, lr_min)
    elif schedule == "linear":
        lr = linear_schedule(t, lr_base, lr_min)
    elif schedule.startswith("cycle"):
        try:
            num_cycles = int(schedule[5:])
        except ValueError:
            num_cycles = 2
        lr = cycled_linear_schedule(t, lr_base, lr_min, num_cycles=num_cycles)
    else:
        raise ValueError(f"bad lr {schedule=}")

    adjust_learning_rate_by_lr(optimizer, lr)
    optimizer.zero_grad()

    # Optional aggressive cache clearing around forward/backward/step,
    # for memory-constrained runs (trades speed for peak-memory headroom).
    if net.empty_cache:
        torch.cuda.empty_cache()

    loss = net(epoch=cur_iter)

    if net.empty_cache:
        torch.cuda.empty_cache()

    loss.backward()

    if net.empty_cache:
        torch.cuda.empty_cache()

    optimizer.step()

    return float(loss), lr
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/commons.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utility functions for global alignment
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import torch.nn as nn
9
+ import numpy as np
10
+
11
+
12
def edge_str(i, j):
    """Canonical string key for the directed edge (i, j), e.g. "3_7"."""
    return "_".join((str(i), str(j)))
14
+
15
+
16
def i_j_ij(ij):
    """Pair an (i, j) tuple with its string key: ((i, j)) -> ("i_j", (i, j))."""
    key = edge_str(ij[0], ij[1])
    return key, ij
18
+
19
+
20
def edge_conf(conf_i, conf_j, edge):
    """Scalar confidence of one edge: product of the two mean confidence maps."""
    mean_i = conf_i[edge].mean()
    mean_j = conf_j[edge].mean()
    return float(mean_i * mean_j)
22
+
23
+
24
def compute_edge_scores(edges, conf_i, conf_j):
    """Score every edge; *edges* yields (edge_key, (i, j)) items (see i_j_ij)."""
    scores = {}
    for key, (i, j) in edges:
        scores[(i, j)] = edge_conf(conf_i, conf_j, key)
    return scores
26
+
27
+
28
def NoGradParamDict(x):
    """Wrap a plain dict of tensors into an nn.ParameterDict with grads disabled."""
    assert isinstance(x, dict)
    params = nn.ParameterDict(x)
    return params.requires_grad_(False)
31
+
32
+
33
def get_imshapes(edges, pred_i, pred_j):
    """Recover each image's (H, W) shape from per-edge predictions.

    Asserts that every edge touching the same image agrees on its shape.
    """
    n_imgs = 1 + max(max(edge) for edge in edges)
    imshapes = [None] * n_imgs
    for e, (i, j) in enumerate(edges):
        shape_i = tuple(pred_i[e].shape[:2])
        shape_j = tuple(pred_j[e].shape[:2])
        # a previously recorded shape must match what this edge reports
        if imshapes[i]:
            assert imshapes[i] == shape_i, f"incorrect shape for image {i}"
        if imshapes[j]:
            assert imshapes[j] == shape_j, f"incorrect shape for image {j}"
        imshapes[i] = shape_i
        imshapes[j] = shape_j
    return imshapes
46
+
47
+
48
def get_conf_trf(mode):
    """Return the confidence-transform callable for *mode*.

    Supported modes: "log", "sqrt", "m1" (subtract 1), "id"/"none" (identity).
    Raises ValueError for anything else.
    """
    transforms = {
        "log": lambda x: x.log(),
        "sqrt": lambda x: x.sqrt(),
        "m1": lambda x: x - 1,
        "id": lambda x: x,
        "none": lambda x: x,
    }
    try:
        return transforms[mode]
    except KeyError:
        raise ValueError(f"bad mode for {mode=}") from None
72
+
73
+
74
def l2_dist(a, b, weight):
    """Weighted squared-L2 distance along the last axis."""
    diff = a - b
    return diff.square().sum(dim=-1) * weight
76
+
77
+
78
def l1_dist(a, b, weight):
    """Weighted vector-norm distance along the last axis.

    NOTE: despite the name, .norm() computes the (unsquared) L2 norm here.
    """
    diff = a - b
    return diff.norm(dim=-1) * weight
80
+
81
+
82
+ ALL_DISTS = dict(l1=l1_dist, l2=l2_dist)
83
+
84
+
85
def signed_log1p(x):
    """Odd-symmetric log1p: sign(x) * log(1 + |x|)."""
    return torch.sign(x) * torch.log1p(x.abs())
88
+
89
+
90
def signed_expm1(x):
    """Inverse of signed_log1p: sign(x) * (exp(|x|) - 1)."""
    return torch.sign(x) * torch.expm1(x.abs())
93
+
94
+
95
def cosine_schedule(t, lr_start, lr_end):
    """Cosine interpolation: lr_start at t=0 decaying smoothly to lr_end at t=1."""
    assert 0 <= t <= 1
    cos_weight = (1 + np.cos(np.pi * t)) / 2
    return lr_end + (lr_start - lr_end) * cos_weight
98
+
99
+
100
def linear_schedule(t, lr_start, lr_end):
    """Linear interpolation: lr_start at t=0 to lr_end at t=1."""
    assert 0 <= t <= 1
    return lr_start + (lr_end - lr_start) * t
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/dust3r_opt/__init__.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # global alignment optimization wrapper function
6
+ # --------------------------------------------------------
7
+ from enum import Enum
8
+
9
+ from .optimizer import PointCloudOptimizer
10
+
11
+
12
class GlobalAlignerMode(Enum):
    """Global-aligner flavours; only PointCloudOptimizer is implemented in this module."""

    PointCloudOptimizer = "PointCloudOptimizer"
    ModularPointCloudOptimizer = "ModularPointCloudOptimizer"
    PairViewer = "PairViewer"
16
+
17
+
18
def global_aligner(
    dust3r_output, device, mode=GlobalAlignerMode.PointCloudOptimizer, **optim_kw
):
    """Build a global-alignment optimizer from raw pairwise DUSt3R output.

    Raises NotImplementedError for any mode other than PointCloudOptimizer.
    """
    # unpack the four prediction dicts from the inference output
    view1 = dust3r_output["view1"]
    view2 = dust3r_output["view2"]
    pred1 = dust3r_output["pred1"]
    pred2 = dust3r_output["pred2"]

    if mode != GlobalAlignerMode.PointCloudOptimizer:
        raise NotImplementedError(f"Unknown mode {mode}")

    return PointCloudOptimizer(view1, view2, pred1, pred2, **optim_kw).to(device)
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/dust3r_opt/base_opt.py ADDED
@@ -0,0 +1,620 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Base class for the global alignement procedure
6
+ # --------------------------------------------------------
7
+ from copy import deepcopy
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import roma
13
+ from copy import deepcopy
14
+ import tqdm
15
+ import cv2
16
+ from PIL import Image
17
+ from dust3r.utils.geometry import inv, geotrf
18
+ from dust3r.utils.device import to_numpy
19
+ from dust3r.utils.image import rgb
20
+ from dust3r.viz import SceneViz, segment_sky, auto_cam_size
21
+
22
+ from cloud_opt.dust3r_opt.commons import (
23
+ edge_str,
24
+ ALL_DISTS,
25
+ NoGradParamDict,
26
+ get_imshapes,
27
+ signed_expm1,
28
+ signed_log1p,
29
+ cosine_schedule,
30
+ linear_schedule,
31
+ get_conf_trf,
32
+ )
33
+ import cloud_opt.dust3r_opt.init_im_poses as init_fun
34
+ from pathlib import Path
35
+ from scipy.spatial.transform import Rotation
36
+ from evo.core.trajectory import PosePath3D, PoseTrajectory3D
37
+
38
+
39
def adjust_learning_rate_by_lr(optimizer, lr):
    """Set every param group's learning rate, honouring a per-group 'lr_scale'."""
    for group in optimizer.param_groups:
        group["lr"] = lr * group["lr_scale"] if "lr_scale" in group else lr
45
+
46
+
47
def make_traj(args) -> PoseTrajectory3D:
    """Coerce a (poses, timestamps) pair into a PoseTrajectory3D; deep-copy an
    existing trajectory.

    *poses* rows are laid out as [x y z | qw qx qy qz].
    """
    if isinstance(args, (tuple, list)):
        poses, stamps = args
        return PoseTrajectory3D(
            positions_xyz=poses[:, :3],
            orientations_quat_wxyz=poses[:, 3:],
            timestamps=stamps,
        )
    assert isinstance(args, PoseTrajectory3D), type(args)
    return deepcopy(args)
57
+
58
+
59
def save_trajectory_tum_format(traj, filename):
    """Write *traj* to *filename* in TUM format.

    Each line is: "timestamp x y z qw qx qy qz".

    Fix: the completion message was the broken literal
    f"Saved trajectory to (unknown)" (an f-string with no placeholder); it now
    reports the actual *filename*. The no-op quaternion index [[0,1,2,3]] was
    also dropped.
    """
    traj = make_traj(traj)
    tostr = lambda a: " ".join(map(str, a))
    with Path(filename).open("w") as f:
        for i in range(traj.num_poses):
            f.write(
                f"{traj.timestamps[i]} {tostr(traj.positions_xyz[i])} {tostr(traj.orientations_quat_wxyz[i])}\n"
            )
    print(f"Saved trajectory to {filename}")
68
+
69
+
70
def c2w_to_tumpose(c2w):
    """
    Convert a camera-to-world matrix to a tuple of translation and rotation

    input: c2w: 4x4 matrix
    output: tuple of translation and rotation (x y z qw qx qy qz)
    """
    # work on a numpy copy of the pose
    c2w = to_numpy(c2w)
    translation = c2w[:3, -1]
    # scipy returns quaternions as (qx, qy, qz, qw); reorder to w-first
    qx, qy, qz, qw = Rotation.from_matrix(c2w[:3, :3]).as_quat()
    return np.concatenate([translation, [qw, qx, qy, qz]])
84
+
85
+
86
+ class BasePCOptimizer(nn.Module):
87
+ """Optimize a global scene, given a list of pairwise observations.
88
+ Graph node: images
89
+ Graph edges: observations = (pred1, pred2)
90
+ """
91
+
92
+ def __init__(self, *args, **kwargs):
93
+ if len(args) == 1 and len(kwargs) == 0:
94
+ other = deepcopy(args[0])
95
+ attrs = """edges is_symmetrized dist n_imgs pred_i pred_j imshapes
96
+ min_conf_thr conf_thr conf_i conf_j im_conf
97
+ base_scale norm_pw_scale POSE_DIM pw_poses
98
+ pw_adaptors pw_adaptors has_im_poses rand_pose imgs verbose""".split()
99
+ self.__dict__.update({k: other[k] for k in attrs})
100
+ else:
101
+ self._init_from_views(*args, **kwargs)
102
+
103
+ def _init_from_views(
104
+ self,
105
+ view1,
106
+ view2,
107
+ pred1,
108
+ pred2,
109
+ dist="l1",
110
+ conf="log",
111
+ min_conf_thr=3,
112
+ base_scale=0.5,
113
+ allow_pw_adaptors=False,
114
+ pw_break=20,
115
+ rand_pose=torch.randn,
116
+ iterationsCount=None,
117
+ verbose=True,
118
+ ):
119
+ super().__init__()
120
+ if not isinstance(view1["idx"], list):
121
+ view1["idx"] = view1["idx"].tolist()
122
+ if not isinstance(view2["idx"], list):
123
+ view2["idx"] = view2["idx"].tolist()
124
+ self.edges = [(int(i), int(j)) for i, j in zip(view1["idx"], view2["idx"])]
125
+ self.is_symmetrized = set(self.edges) == {(j, i) for i, j in self.edges}
126
+ self.dist = ALL_DISTS[dist]
127
+ self.verbose = verbose
128
+
129
+ self.n_imgs = self._check_edges()
130
+
131
+ # input data
132
+ pred1_pts = pred1["pts3d_in_self_view"]
133
+ pred2_pts = pred2["pts3d_in_other_view"]
134
+ self.pred_i = NoGradParamDict(
135
+ {ij: pred1_pts[n] for n, ij in enumerate(self.str_edges)}
136
+ )
137
+ self.pred_j = NoGradParamDict(
138
+ {ij: pred2_pts[n] for n, ij in enumerate(self.str_edges)}
139
+ )
140
+ self.imshapes = get_imshapes(self.edges, pred1_pts, pred2_pts)
141
+
142
+ # work in log-scale with conf
143
+ pred1_conf = pred1["conf_self"]
144
+ pred2_conf = pred2["conf"]
145
+ self.min_conf_thr = min_conf_thr
146
+ self.conf_trf = get_conf_trf(conf)
147
+
148
+ self.conf_i = NoGradParamDict(
149
+ {ij: pred1_conf[n] for n, ij in enumerate(self.str_edges)}
150
+ )
151
+ self.conf_j = NoGradParamDict(
152
+ {ij: pred2_conf[n] for n, ij in enumerate(self.str_edges)}
153
+ )
154
+ self.im_conf = self._compute_img_conf(pred1_conf, pred2_conf)
155
+ for i in range(len(self.im_conf)):
156
+ self.im_conf[i].requires_grad = False
157
+
158
+ # pairwise pose parameters
159
+ self.base_scale = base_scale
160
+ self.norm_pw_scale = True
161
+ self.pw_break = pw_break
162
+ self.POSE_DIM = 7
163
+ self.pw_poses = nn.Parameter(
164
+ rand_pose((self.n_edges, 1 + self.POSE_DIM))
165
+ ) # pairwise poses
166
+ self.pw_adaptors = nn.Parameter(
167
+ torch.zeros((self.n_edges, 2))
168
+ ) # slight xy/z adaptation
169
+ self.pw_adaptors.requires_grad_(allow_pw_adaptors)
170
+ self.has_im_poses = False
171
+ self.rand_pose = rand_pose
172
+
173
+ # possibly store images for show_pointcloud
174
+ self.imgs = None
175
+ if "img" in view1 and "img" in view2:
176
+ imgs = [torch.zeros((3,) + hw) for hw in self.imshapes]
177
+ for v in range(len(self.edges)):
178
+ idx = view1["idx"][v]
179
+ imgs[idx] = view1["img"][v]
180
+ idx = view2["idx"][v]
181
+ imgs[idx] = view2["img"][v]
182
+ self.imgs = rgb(imgs)
183
+
184
+ @property
185
+ def n_edges(self):
186
+ return len(self.edges)
187
+
188
+ @property
189
+ def str_edges(self):
190
+ return [edge_str(i, j) for i, j in self.edges]
191
+
192
+ @property
193
+ def imsizes(self):
194
+ return [(w, h) for h, w in self.imshapes]
195
+
196
+ @property
197
+ def device(self):
198
+ return next(iter(self.parameters())).device
199
+
200
+ def state_dict(self, trainable=True):
201
+ all_params = super().state_dict()
202
+ return {
203
+ k: v
204
+ for k, v in all_params.items()
205
+ if k.startswith(("_", "pred_i.", "pred_j.", "conf_i.", "conf_j."))
206
+ != trainable
207
+ }
208
+
209
+ def load_state_dict(self, data):
210
+ return super().load_state_dict(self.state_dict(trainable=False) | data)
211
+
212
+ def _check_edges(self):
213
+ indices = sorted({i for edge in self.edges for i in edge})
214
+ assert indices == list(range(len(indices))), "bad pair indices: missing values "
215
+ return len(indices)
216
+
217
+ @torch.no_grad()
218
+ def _compute_img_conf(self, pred1_conf, pred2_conf):
219
+ im_conf = nn.ParameterList(
220
+ [torch.zeros(hw, device=self.device) for hw in self.imshapes]
221
+ )
222
+ for e, (i, j) in enumerate(self.edges):
223
+ im_conf[i] = torch.maximum(im_conf[i], pred1_conf[e])
224
+ im_conf[j] = torch.maximum(im_conf[j], pred2_conf[e])
225
+ return im_conf
226
+
227
+ def get_adaptors(self):
228
+ adapt = self.pw_adaptors
229
+ adapt = torch.cat(
230
+ (adapt[:, 0:1], adapt), dim=-1
231
+ ) # (scale_xy, scale_xy, scale_z)
232
+ if self.norm_pw_scale: # normalize so that the product == 1
233
+ adapt = adapt - adapt.mean(dim=1, keepdim=True)
234
+ return (adapt / self.pw_break).exp()
235
+
236
+ def _get_poses(self, poses):
237
+ # normalize rotation
238
+ Q = poses[:, :4]
239
+ T = signed_expm1(poses[:, 4:7])
240
+ RT = roma.RigidUnitQuat(Q, T).normalize().to_homogeneous()
241
+ return RT
242
+
243
+ def _set_pose(self, poses, idx, R, T=None, scale=None, force=False):
244
+ # all poses == cam-to-world
245
+ pose = poses[idx]
246
+ if not (pose.requires_grad or force):
247
+ return pose
248
+
249
+ if R.shape == (4, 4):
250
+ assert T is None
251
+ T = R[:3, 3]
252
+ R = R[:3, :3]
253
+
254
+ if R is not None:
255
+ pose.data[0:4] = roma.rotmat_to_unitquat(R)
256
+ if T is not None:
257
+ pose.data[4:7] = signed_log1p(
258
+ T / (scale or 1)
259
+ ) # translation is function of scale
260
+
261
+ if scale is not None:
262
+ assert poses.shape[-1] in (8, 13)
263
+ pose.data[-1] = np.log(float(scale))
264
+ return pose
265
+
266
+ def get_pw_norm_scale_factor(self):
267
+ if self.norm_pw_scale:
268
+ # normalize scales so that things cannot go south
269
+ # we want that exp(scale) ~= self.base_scale
270
+ return (np.log(self.base_scale) - self.pw_poses[:, -1].mean()).exp()
271
+ else:
272
+ return 1 # don't norm scale for known poses
273
+
274
+ def get_pw_scale(self):
275
+ scale = self.pw_poses[:, -1].exp() # (n_edges,)
276
+ scale = scale * self.get_pw_norm_scale_factor()
277
+ return scale
278
+
279
+ def get_pw_poses(self): # cam to world
280
+ RT = self._get_poses(self.pw_poses)
281
+ scaled_RT = RT.clone()
282
+ scaled_RT[:, :3] *= self.get_pw_scale().view(
283
+ -1, 1, 1
284
+ ) # scale the rotation AND translation
285
+ return scaled_RT
286
+
287
+ def get_masks(self):
288
+ return [(conf > self.min_conf_thr) for conf in self.im_conf]
289
+
290
+ def depth_to_pts3d(self):
291
+ raise NotImplementedError()
292
+
293
+ def get_pts3d(self, raw=False):
294
+ res = self.depth_to_pts3d()
295
+ if not raw:
296
+ res = [dm[: h * w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
297
+ return res
298
+
299
+ def _set_focal(self, idx, focal, force=False):
300
+ raise NotImplementedError()
301
+
302
+ def get_focals(self):
303
+ raise NotImplementedError()
304
+
305
+ def get_known_focal_mask(self):
306
+ raise NotImplementedError()
307
+
308
+ def get_principal_points(self):
309
+ raise NotImplementedError()
310
+
311
+ def get_conf(self, mode=None):
312
+ trf = self.conf_trf if mode is None else get_conf_trf(mode)
313
+ return [trf(c) for c in self.im_conf]
314
+
315
+ def get_im_poses(self):
316
+ raise NotImplementedError()
317
+
318
+ def _set_depthmap(self, idx, depth, force=False):
319
+ raise NotImplementedError()
320
+
321
+ def get_depthmaps(self, raw=False):
322
+ raise NotImplementedError()
323
+
324
    def save_depth_maps(self, path):
        """Colorize each depth map, save a PNG and raw .npy per frame, then an
        animated GIF of all frames; returns the depth maps.

        NOTE(review): this method is defined twice in this class with an
        identical body; the later definition silently shadows this one -- one
        copy should be removed.
        """
        depth_maps = self.get_depthmaps()
        images = []

        for i, depth_map in enumerate(depth_maps):
            # Apply color map to depth map
            # (assumes depth values are roughly in [0, 1] before *255 -- TODO confirm)
            depth_map_colored = cv2.applyColorMap(
                (depth_map * 255).detach().cpu().numpy().astype(np.uint8),
                cv2.COLORMAP_JET,
            )
            img_path = f"{path}/frame_{(i):04d}.png"
            cv2.imwrite(img_path, depth_map_colored)
            images.append(Image.open(img_path))
            # also dump the raw (un-colorized) depth values for later reuse
            np.save(f"{path}/frame_{(i):04d}.npy", depth_map.detach().cpu().numpy())

        # assemble all colorized frames into a looping GIF (100 ms per frame)
        images[0].save(
            f"{path}/_depth_maps.gif",
            save_all=True,
            append_images=images[1:],
            duration=100,
            loop=0,
        )

        return depth_maps
348
+
349
+ def clean_pointcloud(self, **kw):
350
+ cams = inv(self.get_im_poses())
351
+ K = self.get_intrinsics()
352
+ depthmaps = self.get_depthmaps()
353
+ all_pts3d = self.get_pts3d()
354
+
355
+ new_im_confs = clean_pointcloud(
356
+ self.im_conf, K, cams, depthmaps, all_pts3d, **kw
357
+ )
358
+ for i, new_conf in enumerate(new_im_confs):
359
+ self.im_conf[i].data[:] = new_conf
360
+ return self
361
+
362
+ def get_tum_poses(self):
363
+ poses = self.get_im_poses()
364
+ tt = np.arange(len(poses)).astype(float)
365
+ tum_poses = [c2w_to_tumpose(p) for p in poses]
366
+ tum_poses = np.stack(tum_poses, 0)
367
+ return [tum_poses, tt]
368
+
369
+ def save_tum_poses(self, path):
370
+ traj = self.get_tum_poses()
371
+ save_trajectory_tum_format(traj, path)
372
+ return traj[0] # return the poses
373
+
374
+ def save_focals(self, path):
375
+ # convert focal to txt
376
+ focals = self.get_focals()
377
+ np.savetxt(path, focals.detach().cpu().numpy(), fmt="%.6f")
378
+ return focals
379
+
380
+ def save_intrinsics(self, path):
381
+ K_raw = self.get_intrinsics()
382
+ K = K_raw.reshape(-1, 9)
383
+ np.savetxt(path, K.detach().cpu().numpy(), fmt="%.6f")
384
+ return K_raw
385
+
386
+ def save_conf_maps(self, path):
387
+ conf = self.get_conf()
388
+ for i, c in enumerate(conf):
389
+ np.save(f"{path}/conf_{i}.npy", c.detach().cpu().numpy())
390
+ return conf
391
+
392
+ def save_init_conf_maps(self, path):
393
+ conf = self.get_init_conf()
394
+ for i, c in enumerate(conf):
395
+ np.save(f"{path}/init_conf_{i}.npy", c.detach().cpu().numpy())
396
+ return conf
397
+
398
+ def save_rgb_imgs(self, path):
399
+ imgs = self.imgs
400
+ for i, img in enumerate(imgs):
401
+ # convert from rgb to bgr
402
+ img = img[..., ::-1]
403
+ cv2.imwrite(f"{path}/frame_{i:04d}.png", img * 255)
404
+ return imgs
405
+
406
+ def save_dynamic_masks(self, path):
407
+ dynamic_masks = (
408
+ self.dynamic_masks
409
+ if getattr(self, "sam2_dynamic_masks", None) is None
410
+ else self.sam2_dynamic_masks
411
+ )
412
+ for i, dynamic_mask in enumerate(dynamic_masks):
413
+ cv2.imwrite(
414
+ f"{path}/dynamic_mask_{i}.png",
415
+ (dynamic_mask * 255).detach().cpu().numpy().astype(np.uint8),
416
+ )
417
+ return dynamic_masks
418
+
419
+ def save_depth_maps(self, path):
420
+ depth_maps = self.get_depthmaps()
421
+ images = []
422
+
423
+ for i, depth_map in enumerate(depth_maps):
424
+ # Apply color map to depth map
425
+ depth_map_colored = cv2.applyColorMap(
426
+ (depth_map * 255).detach().cpu().numpy().astype(np.uint8),
427
+ cv2.COLORMAP_JET,
428
+ )
429
+ img_path = f"{path}/frame_{(i):04d}.png"
430
+ cv2.imwrite(img_path, depth_map_colored)
431
+ images.append(Image.open(img_path))
432
+ np.save(f"{path}/frame_{(i):04d}.npy", depth_map.detach().cpu().numpy())
433
+
434
+ images[0].save(
435
+ f"{path}/_depth_maps.gif",
436
+ save_all=True,
437
+ append_images=images[1:],
438
+ duration=100,
439
+ loop=0,
440
+ )
441
+
442
+ return depth_maps
443
+
444
    def forward(self, ret_details=False):
        """Total alignment loss: confidence-weighted distance between each
        image's global point cloud and the per-edge predictions transformed
        into world frame, averaged over all edges.

        Args:
            ret_details: if True, also return an (n_imgs, n_imgs) matrix whose
                [i, j] entry holds that edge's loss (-1 where no edge exists).
        """
        pw_poses = self.get_pw_poses()  # cam-to-world
        pw_adapt = self.get_adaptors()
        proj_pts3d = self.get_pts3d()
        # pre-compute pixel weights
        weight_i = {i_j: self.conf_trf(c) for i_j, c in self.conf_i.items()}
        weight_j = {i_j: self.conf_trf(c) for i_j, c in self.conf_j.items()}

        loss = 0
        if ret_details:
            details = -torch.ones((self.n_imgs, self.n_imgs))

        for e, (i, j) in enumerate(self.edges):
            i_j = edge_str(i, j)
            # distance in image i and j
            aligned_pred_i = geotrf(pw_poses[e], pw_adapt[e] * self.pred_i[i_j])
            aligned_pred_j = geotrf(pw_poses[e], pw_adapt[e] * self.pred_j[i_j])
            li = self.dist(proj_pts3d[i], aligned_pred_i, weight=weight_i[i_j]).mean()
            lj = self.dist(proj_pts3d[j], aligned_pred_j, weight=weight_j[i_j]).mean()
            loss = loss + li + lj

            if ret_details:
                details[i, j] = li + lj
        loss /= self.n_edges  # average over all pairs

        if ret_details:
            return loss, details
        return loss
472
+
473
+ @torch.cuda.amp.autocast(enabled=False)
474
+ def compute_global_alignment(self, init=None, niter_PnP=10, **kw):
475
+ if init is None:
476
+ pass
477
+ elif init == "msp" or init == "mst":
478
+ init_fun.init_minimum_spanning_tree(self, niter_PnP=niter_PnP)
479
+ elif init == "known_poses":
480
+ init_fun.init_from_known_poses(
481
+ self, min_conf_thr=self.min_conf_thr, niter_PnP=niter_PnP
482
+ )
483
+ else:
484
+ raise ValueError(f"bad value for {init=}")
485
+ return global_alignment_loop(self, **kw)
486
+
487
+ @torch.no_grad()
488
+ def mask_sky(self):
489
+ res = deepcopy(self)
490
+ for i in range(self.n_imgs):
491
+ sky = segment_sky(self.imgs[i])
492
+ res.im_conf[i][sky] = 0
493
+ return res
494
+
495
+ def show(self, show_pw_cams=False, show_pw_pts3d=False, cam_size=None, **kw):
496
+ viz = SceneViz()
497
+ if self.imgs is None:
498
+ colors = np.random.randint(0, 256, size=(self.n_imgs, 3))
499
+ colors = list(map(tuple, colors.tolist()))
500
+ for n in range(self.n_imgs):
501
+ viz.add_pointcloud(self.get_pts3d()[n], colors[n], self.get_masks()[n])
502
+ else:
503
+ viz.add_pointcloud(self.get_pts3d(), self.imgs, self.get_masks())
504
+ colors = np.random.randint(256, size=(self.n_imgs, 3))
505
+
506
+ # camera poses
507
+ im_poses = to_numpy(self.get_im_poses())
508
+ if cam_size is None:
509
+ cam_size = auto_cam_size(im_poses)
510
+ viz.add_cameras(
511
+ im_poses,
512
+ self.get_focals(),
513
+ colors=colors,
514
+ images=self.imgs,
515
+ imsizes=self.imsizes,
516
+ cam_size=cam_size,
517
+ )
518
+ if show_pw_cams:
519
+ pw_poses = self.get_pw_poses()
520
+ viz.add_cameras(pw_poses, color=(192, 0, 192), cam_size=cam_size)
521
+
522
+ if show_pw_pts3d:
523
+ pts = [
524
+ geotrf(pw_poses[e], self.pred_i[edge_str(i, j)])
525
+ for e, (i, j) in enumerate(self.edges)
526
+ ]
527
+ viz.add_pointcloud(pts, (128, 0, 128))
528
+
529
+ viz.show(**kw)
530
+ return viz
531
+
532
+
533
def global_alignment_loop(net, lr=0.01, niter=300, schedule="cosine", lr_min=1e-6):
    """Optimize *net*'s trainable parameters for *niter* Adam steps.

    Returns the final loss as a float. NOTE(review): when *net* has no
    trainable parameters it returns *net* itself instead of a loss -- callers
    must cope with both return types.
    """
    params = [p for p in net.parameters() if p.requires_grad]
    if not params:
        return net

    verbose = net.verbose
    if verbose:
        print("Global alignement - optimizing for:")
        print([name for name, value in net.named_parameters() if value.requires_grad])

    lr_base = lr
    # non-default betas (0.9, 0.9)
    optimizer = torch.optim.Adam(params, lr=lr, betas=(0.9, 0.9))

    loss = float("inf")
    if verbose:
        # progress-bar variant: same loop, but reports lr and loss per step
        with tqdm.tqdm(total=niter) as bar:
            while bar.n < bar.total:
                loss, lr = global_alignment_iter(
                    net, bar.n, niter, lr_base, lr_min, optimizer, schedule
                )
                bar.set_postfix_str(f"{lr=:g} loss={loss:g}")
                bar.update()
    else:
        for n in range(niter):
            loss, _ = global_alignment_iter(
                net, n, niter, lr_base, lr_min, optimizer, schedule
            )
    return loss
561
+
562
+
563
def global_alignment_iter(net, cur_iter, niter, lr_base, lr_min, optimizer, schedule):
    """One optimization step at the scheduled learning rate; returns (loss, lr)."""
    progress = cur_iter / niter
    if schedule == "cosine":
        lr = cosine_schedule(progress, lr_base, lr_min)
    elif schedule == "linear":
        lr = linear_schedule(progress, lr_base, lr_min)
    else:
        raise ValueError(f"bad lr {schedule=}")

    adjust_learning_rate_by_lr(optimizer, lr)
    optimizer.zero_grad()
    loss = net()
    loss.backward()
    optimizer.step()
    return float(loss), lr
578
+
579
+
580
@torch.no_grad()
def clean_pointcloud(
    im_confs, K, cams, depthmaps, all_pts3d, tol=0.001, bad_conf=0, dbg=()
):
    """Method:
    1) express all 3d points in each camera coordinate frame
    2) if they're in front of a depthmap --> then lower their confidence

    Args:
        im_confs: per-image confidence maps (list of HxW tensors); not mutated.
        K: per-camera intrinsics.
        cams: world-to-camera transforms.
        depthmaps: per-image depth maps.
        all_pts3d: per-image 3d points in world frame.
        tol: relative depth margin before a point counts as occluding.
        bad_conf: confidence value clipped onto occluding-but-less-confident points.
        dbg: unused debug hook.

    Returns:
        list of cleaned (cloned) confidence maps.
    """
    assert len(im_confs) == len(cams) == len(K) == len(depthmaps) == len(all_pts3d)
    assert 0 <= tol < 1
    res = [c.clone() for c in im_confs]

    # reshape appropriately
    all_pts3d = [p.view(*c.shape, 3) for p, c in zip(all_pts3d, im_confs)]
    depthmaps = [d.view(*c.shape) for d, c in zip(depthmaps, im_confs)]

    for i, pts3d in enumerate(all_pts3d):
        for j in range(len(all_pts3d)):
            if i == j:
                continue

            # project 3dpts in other view
            proj = geotrf(cams[j], pts3d)
            proj_depth = proj[:, :, 2]
            u, v = geotrf(K[j], proj, norm=1, ncol=2).round().long().unbind(-1)

            # check which points are actually in the visible cone
            H, W = im_confs[j].shape
            msk_i = (proj_depth > 0) & (0 <= u) & (u < W) & (0 <= v) & (v < H)
            msk_j = v[msk_i], u[msk_i]

            # find bad points = those in front but less confident
            bad_points = (proj_depth[msk_i] < (1 - tol) * depthmaps[j][msk_j]) & (
                res[i][msk_i] < res[j][msk_j]
            )

            # scatter the per-point verdict back into the full-image mask
            bad_msk_i = msk_i.clone()
            bad_msk_i[msk_i] = bad_points
            res[i][bad_msk_i] = res[i][bad_msk_i].clip_(max=bad_conf)

    return res
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/dust3r_opt/commons.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # utility functions for global alignment
6
+ # --------------------------------------------------------
7
+ import torch
8
+ import torch.nn as nn
9
+ import numpy as np
10
+
11
+
12
def edge_str(i, j):
    """Canonical string key for the directed edge (i, j), e.g. "3_7"."""
    return "_".join((str(i), str(j)))
14
+
15
+
16
def i_j_ij(ij):
    """Pair an (i, j) tuple with its string key: ((i, j)) -> ("i_j", (i, j))."""
    key = edge_str(ij[0], ij[1])
    return key, ij
18
+
19
+
20
def edge_conf(conf_i, conf_j, edge):
    """Scalar confidence of one edge: product of the two mean confidence maps."""
    mean_i = conf_i[edge].mean()
    mean_j = conf_j[edge].mean()
    return float(mean_i * mean_j)
22
+
23
+
24
def compute_edge_scores(edges, conf_i, conf_j):
    """Score every edge; *edges* yields (edge_key, (i, j)) items (see i_j_ij)."""
    scores = {}
    for key, (i, j) in edges:
        scores[(i, j)] = edge_conf(conf_i, conf_j, key)
    return scores
26
+
27
+
28
def NoGradParamDict(x):
    """Wrap a plain dict of tensors into an nn.ParameterDict with grads disabled."""
    assert isinstance(x, dict)
    params = nn.ParameterDict(x)
    return params.requires_grad_(False)
31
+
32
+
33
def get_imshapes(edges, pred_i, pred_j):
    """Recover each image's (H, W) shape from per-edge predictions.

    Asserts that every edge touching the same image agrees on its shape.
    """
    n_imgs = 1 + max(max(edge) for edge in edges)
    imshapes = [None] * n_imgs
    for e, (i, j) in enumerate(edges):
        shape_i = tuple(pred_i[e].shape[:2])
        shape_j = tuple(pred_j[e].shape[:2])
        # a previously recorded shape must match what this edge reports
        if imshapes[i]:
            assert imshapes[i] == shape_i, f"incorrect shape for image {i}"
        if imshapes[j]:
            assert imshapes[j] == shape_j, f"incorrect shape for image {j}"
        imshapes[i] = shape_i
        imshapes[j] = shape_j
    return imshapes
46
+
47
+
48
def get_conf_trf(mode):
    """Return the confidence-transform callable for *mode*.

    Supported modes: "log", "sqrt", "m1" (subtract 1), "id"/"none" (identity).
    Raises ValueError for anything else.
    """
    transforms = {
        "log": lambda x: x.log(),
        "sqrt": lambda x: x.sqrt(),
        "m1": lambda x: x - 1,
        "id": lambda x: x,
        "none": lambda x: x,
    }
    try:
        return transforms[mode]
    except KeyError:
        raise ValueError(f"bad mode for {mode=}") from None
72
+
73
+
74
def l2_dist(a, b, weight):
    """Weighted squared-L2 distance along the last axis."""
    diff = a - b
    return diff.square().sum(dim=-1) * weight
76
+
77
+
78
def l1_dist(a, b, weight):
    """Weighted vector-norm distance along the last axis.

    NOTE: despite the name, .norm() computes the (unsquared) L2 norm here.
    """
    diff = a - b
    return diff.norm(dim=-1) * weight
80
+
81
+
82
+ ALL_DISTS = dict(l1=l1_dist, l2=l2_dist)
83
+
84
+
85
def signed_log1p(x):
    """Odd-symmetric log1p: sign(x) * log(1 + |x|)."""
    return torch.sign(x) * torch.log1p(x.abs())
88
+
89
+
90
def signed_expm1(x):
    """Inverse of signed_log1p: sign(x) * (exp(|x|) - 1)."""
    return torch.sign(x) * torch.expm1(x.abs())
93
+
94
+
95
def cosine_schedule(t, lr_start, lr_end):
    """Cosine interpolation: lr_start at t=0 decaying smoothly to lr_end at t=1."""
    assert 0 <= t <= 1
    cos_weight = (1 + np.cos(np.pi * t)) / 2
    return lr_end + (lr_start - lr_end) * cos_weight
98
+
99
+
100
def linear_schedule(t, lr_start, lr_end):
    """Linear interpolation: lr_start at t=0 to lr_end at t=1."""
    assert 0 <= t <= 1
    return lr_start + (lr_end - lr_start) * t
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/dust3r_opt/init_im_poses.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Initialization functions for global alignment
6
+ # --------------------------------------------------------
7
+ from functools import cache
8
+
9
+ import numpy as np
10
+ import scipy.sparse as sp
11
+ import torch
12
+ import cv2
13
+ import roma
14
+ from tqdm import tqdm
15
+
16
+ from dust3r.utils.geometry import geotrf, inv, get_med_dist_between_poses
17
+ from dust3r.post_process import estimate_focal_knowing_depth
18
+ from dust3r.viz import to_numpy
19
+
20
+ from cloud_opt.commons import edge_str, i_j_ij, compute_edge_scores
21
+
22
+
23
@torch.no_grad()
def init_from_known_poses(self, niter_PnP=10, min_conf_thr=3):
    """Initialize pairwise poses and depth maps when ALL image poses are known.

    For every edge, a relative pose is recovered via PnP and rigidly aligned to
    the two ground-truth cameras; each image's depth is then taken from its
    highest-confidence edge, rescaled by that alignment's scale.

    Args:
        niter_PnP: iteration count forwarded to fast_pnp.
        min_conf_thr: confidence threshold for the PnP inlier mask.
    """
    device = self.device

    # indices of known poses
    nkp, known_poses_msk, known_poses = get_known_poses(self)
    assert nkp == self.n_imgs, "not all poses are known"

    # get all focals
    nkf, _, im_focals = get_known_focals(self)
    assert nkf == self.n_imgs
    im_pp = self.get_principal_points()

    best_depthmaps = {}
    # init all pairwise poses
    for e, (i, j) in enumerate(tqdm(self.edges, disable=not self.verbose)):
        i_j = edge_str(i, j)

        # find relative pose for this pair
        P1 = torch.eye(4, device=device)
        # mask keeps at least something even when all confidences are below the threshold
        msk = self.conf_i[i_j] > min(min_conf_thr, self.conf_i[i_j].min() - 0.1)
        _, P2 = fast_pnp(
            self.pred_j[i_j],
            float(im_focals[i].mean()),
            pp=im_pp[i],
            msk=msk,
            device=device,
            niter_PnP=niter_PnP,
        )

        # align the two predicted camera with the two gt cameras
        s, R, T = align_multiple_poses(torch.stack((P1, P2)), known_poses[[i, j]])
        # normally we have known_poses[i] ~= sRT_to_4x4(s,R,T,device) @ P1
        # and geotrf(sRT_to_4x4(1,R,T,device), s*P2[:3,3])
        self._set_pose(self.pw_poses, e, R, T, scale=s)

        # remember if this is a good depthmap
        score = float(self.conf_i[i_j].mean())
        if score > best_depthmaps.get(i, (0,))[0]:
            best_depthmaps[i] = score, i_j, s

    # init all image poses
    for n in range(self.n_imgs):
        assert known_poses_msk[n]
        _, i_j, scale = best_depthmaps[n]
        depth = self.pred_i[i_j][:, :, 2]
        self._set_depthmap(n, depth * scale)
71
+
72
@torch.no_grad()
def init_minimum_spanning_tree(self, **kw):
    """Init all camera poses (image-wise and pairwise poses) given
    an initial set of pairwise estimations.

    Builds a confidence-weighted spanning tree over the view graph, recovers
    per-image points/focals/poses from it, then delegates to init_from_pts3d.
    """
    device = self.device
    pts3d, _, im_focals, im_poses = minimum_spanning_tree(
        self.imshapes,
        self.edges,
        self.pred_i,
        self.pred_j,
        self.conf_i,
        self.conf_j,
        self.im_conf,
        self.min_conf_thr,
        device,
        has_im_poses=self.has_im_poses,
        verbose=self.verbose,
        **kw,
    )

    return init_from_pts3d(self, pts3d, im_focals, im_poses)
94
+
95
+
96
def init_from_pts3d(self, pts3d, im_focals, im_poses):
    """Write globally-aligned pointmaps, poses and focals into the optimizer.

    Args:
        pts3d: list of per-image (H, W, 3) pointmaps in a common world frame
            (modified in place below).
        im_focals: per-image focal estimates (entries may be None).
        im_poses: (n_imgs, 4, 4) cam-to-world matrices.
    """
    # init poses: if some poses are frozen, rigidly align everything onto them
    nkp, known_poses_msk, known_poses = get_known_poses(self)
    if nkp == 1:
        raise NotImplementedError(
            "Would be simpler to just align everything afterwards on the single known pose"
        )
    elif nkp > 1:
        # global rigid SE3 alignment
        s, R, T = align_multiple_poses(
            im_poses[known_poses_msk], known_poses[known_poses_msk]
        )
        trf = sRT_to_4x4(s, R, T, device=known_poses.device)

        # rotate everything
        im_poses = trf @ im_poses
        im_poses[:, :3, :3] /= s  # undo scaling on the rotation part
        for img_pts3d in pts3d:
            img_pts3d[:] = geotrf(trf, img_pts3d)

    # set all pairwise poses
    for e, (i, j) in enumerate(self.edges):
        i_j = edge_str(i, j)
        # compute transform that goes from cam to world
        s, R, T = rigid_points_registration(
            self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j]
        )
        self._set_pose(self.pw_poses, e, R, T, scale=s)

    # take into account the scale normalization
    s_factor = self.get_pw_norm_scale_factor()
    im_poses[:, :3, 3] *= s_factor  # apply downscaling factor
    for img_pts3d in pts3d:
        img_pts3d *= s_factor

    # init all image poses
    if self.has_im_poses:
        for i in range(self.n_imgs):
            cam2world = im_poses[i]
            # per-image depth = z of the world points re-expressed in camera frame
            depth = geotrf(inv(cam2world), pts3d[i])[..., 2]
            self._set_depthmap(i, depth)
            self._set_pose(self.im_poses, i, cam2world)
            if im_focals[i] is not None:
                self._set_focal(i, im_focals[i])

    if self.verbose:
        pass  # NOTE: init-loss printout intentionally disabled
        # print(' init loss =', float(self()))
144
+
145
+
146
def minimum_spanning_tree(
    imshapes,
    edges,
    pred_i,
    pred_j,
    conf_i,
    conf_j,
    im_conf,
    min_conf_thr,
    device,
    has_im_poses=True,
    niter_PnP=10,
    verbose=True,
):
    """Chain pairwise predictions into one globally-aligned pointcloud.

    Builds a maximum-confidence spanning tree over the pairwise graph (edge
    score = product of mean confidences), seeds the world frame with the
    strongest edge, then attaches the remaining edges one at a time by rigid
    registration. Missing focals/poses are completed with Weiszfeld focal
    estimation and RANSAC-PnP.

    Returns:
        (pts3d, msp_edges, im_focals, im_poses) where im_focals/im_poses are
        None when has_im_poses is False.
    """
    n_imgs = len(imshapes)
    # negate the confidence scores so the *minimum* spanning tree keeps the
    # *most* confident edges
    sparse_graph = -dict_to_sparse_graph(
        compute_edge_scores(map(i_j_ij, edges), conf_i, conf_j)
    )
    # FIX: removed leftover debug `print(sparse_graph)`
    msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo()

    # temp variable to store 3d points
    pts3d = [None] * len(imshapes)

    todo = sorted(zip(-msp.data, msp.row, msp.col))  # sorted edges, weakest first
    im_poses = [None] * n_imgs
    im_focals = [None] * n_imgs

    # init with strongest edge: it defines the world frame
    score, i, j = todo.pop()
    if verbose:
        print(f" init edge ({i}*,{j}*) {score=}")
    i_j = edge_str(i, j)
    pts3d[i] = pred_i[i_j].clone()
    pts3d[j] = pred_j[i_j].clone()
    done = {i, j}
    if has_im_poses:
        im_poses[i] = torch.eye(4, device=device)
        im_focals[i] = estimate_focal(pred_i[i_j])

    # set initial pointcloud based on pairwise graph
    msp_edges = [(i, j)]
    while todo:
        # each time, predict the next one
        score, i, j = todo.pop()
        # FIX: compute the edge key for THIS edge before using it; previously
        # the focal below was estimated with the stale i_j of the previous
        # iteration (possibly an edge not even containing image i)
        i_j = edge_str(i, j)

        if im_focals[i] is None:
            im_focals[i] = estimate_focal(pred_i[i_j])

        if i in done:
            if verbose:
                print(f" init edge ({i},{j}*) {score=}")
            assert j not in done
            # align pred[i] with pts3d[i], and then set j accordingly
            s, R, T = rigid_points_registration(pred_i[i_j], pts3d[i], conf=conf_i[i_j])
            trf = sRT_to_4x4(s, R, T, device)
            pts3d[j] = geotrf(trf, pred_j[i_j])
            done.add(j)
            msp_edges.append((i, j))

            if has_im_poses and im_poses[i] is None:
                im_poses[i] = sRT_to_4x4(1, R, T, device)

        elif j in done:
            if verbose:
                print(f" init edge ({i}*,{j}) {score=}")
            assert i not in done
            s, R, T = rigid_points_registration(pred_j[i_j], pts3d[j], conf=conf_j[i_j])
            trf = sRT_to_4x4(s, R, T, device)
            pts3d[i] = geotrf(trf, pred_i[i_j])
            done.add(i)
            msp_edges.append((i, j))

            if has_im_poses and im_poses[i] is None:
                im_poses[i] = sRT_to_4x4(1, R, T, device)
        else:
            # neither endpoint anchored yet: let's try again later
            todo.insert(0, (score, i, j))

    if has_im_poses:
        # complete all missing informations
        pair_scores = list(
            sparse_graph.values()
        )  # already negative scores: less is best
        edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[
            np.argsort(pair_scores)
        ]
        for i, j in edges_from_best_to_worse.tolist():
            if im_focals[i] is None:
                im_focals[i] = estimate_focal(pred_i[edge_str(i, j)])

        for i in range(n_imgs):
            if im_poses[i] is None:
                # recover the pose from the global pointmap with RANSAC-PnP
                msk = im_conf[i] > min_conf_thr
                res = fast_pnp(
                    pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP
                )
                if res:
                    im_focals[i], im_poses[i] = res
            if im_poses[i] is None:
                im_poses[i] = torch.eye(4, device=device)
        im_poses = torch.stack(im_poses)
    else:
        im_poses = im_focals = None

    return pts3d, msp_edges, im_focals, im_poses
254
+
255
+
256
def dict_to_sparse_graph(dic):
    """Convert a {(i, j): score} mapping into a scipy DOK adjacency matrix."""
    n_nodes = 1 + max(max(edge) for edge in dic)
    graph = sp.dok_array((n_nodes, n_nodes))
    for (row, col), score in dic.items():
        graph[row, col] = score
    return graph
262
+
263
+
264
def rigid_points_registration(pts1, pts2, conf):
    """Confidence-weighted similarity registration of pts1 onto pts2.

    Returns (s, R, T) with the rotation/translation left un-scaled.
    """
    flat1 = pts1.reshape(-1, 3)
    flat2 = pts2.reshape(-1, 3)
    R, T, s = roma.rigid_points_registration(
        flat1, flat2, weights=conf.ravel(), compute_scaling=True
    )
    return s, R, T  # note: (R, T) are un-scaled
272
+
273
+
274
def sRT_to_4x4(scale, R, T, device):
    """Pack a scaled rotation and a translation into a homogeneous 4x4 matrix."""
    mat = torch.eye(4, device=device)
    mat[:3, :3] = scale * R
    mat[:3, 3] = T.ravel()  # the translation is not scaled
    return mat
279
+
280
+
281
def estimate_focal(pts3d_i, pp=None):
    """Robustly estimate a scalar focal length from one (H, W, 3) pointmap.

    If no principal point is given, the image center is used.
    """
    if pp is None:
        H, W, THREE = pts3d_i.shape
        assert THREE == 3
        pp = torch.tensor((W / 2, H / 2), device=pts3d_i.device)
    batched_pts = pts3d_i.unsqueeze(0)
    batched_pp = pp.unsqueeze(0)
    focal = estimate_focal_knowing_depth(
        batched_pts, batched_pp, focal_mode="weiszfeld"
    )
    return float(focal.ravel())
290
+
291
+
292
@cache
def pixel_grid(H, W):
    """(H, W, 2) float32 grid of (x, y) pixel coordinates, cached per size."""
    grid_x, grid_y = np.meshgrid(np.arange(W), np.arange(H))  # 'xy' indexing
    return np.stack((grid_x, grid_y), axis=-1).astype(np.float32)
295
+
296
+
297
def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10):
    """Recover a cam-to-world pose (and optionally the focal) with RANSAC-PnP.

    Args:
        pts3d: (H, W, 3) pointmap.
        focal: known focal, or None to sweep a log-spaced range of candidates.
        msk: boolean (H, W) mask of points to use.
        device: torch device for the returned pose.
        pp: principal point; defaults to the image center.
        niter_PnP: RANSAC iteration count.

    Returns:
        (focal, 4x4 cam-to-world pose) or None on failure.
    """
    # extract camera poses and focals with RANSAC-PnP
    if msk.sum() < 4:
        return None  # we need at least 4 points for PnP
    pts3d, msk = map(to_numpy, (pts3d, msk))

    H, W, THREE = pts3d.shape
    assert THREE == 3
    pixels = pixel_grid(H, W)

    if focal is None:
        # unknown focal: try a log-spaced range and keep the best inlier count
        S = max(W, H)
        tentative_focals = np.geomspace(S / 2, S * 3, 21)
    else:
        tentative_focals = [focal]

    if pp is None:
        pp = (W / 2, H / 2)
    else:
        pp = to_numpy(pp)

    best = (0,)
    for focal in tentative_focals:
        K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)])
        try:
            success, R, T, inliers = cv2.solvePnPRansac(
                pts3d[msk],
                pixels[msk],
                K,
                None,
                iterationsCount=niter_PnP,
                reprojectionError=5,
                flags=cv2.SOLVEPNP_SQPNP,
            )
        except cv2.error:
            # FIX: was a bare `except:` that also swallowed KeyboardInterrupt
            # and genuine programming errors; only OpenCV failures are expected
            continue
        if not success:
            continue

        score = len(inliers)
        if score > best[0]:  # FIX: dropped redundant `success and`
            best = score, R, T, focal

    if not best[0]:
        return None

    _, R, T, best_focal = best
    R = cv2.Rodrigues(R)[0]  # world to cam
    R, T = map(torch.from_numpy, (R, T))
    return best_focal, inv(sRT_to_4x4(1, R, T, device))  # cam to world
347
+
348
+
349
def get_known_poses(self):
    """Return (num_known, frozen-mask, cam-to-world poses) for the images.

    A pose counts as "known" when its parameter is frozen (requires_grad off).
    Returns (0, None, None) when the optimizer has no image poses at all.
    """
    if not self.has_im_poses:
        return 0, None, None
    frozen = torch.tensor([not p.requires_grad for p in self.im_poses])
    return frozen.sum(), frozen, self.get_im_poses()
356
+
357
+
358
def get_known_focals(self):
    """Return (num_known, frozen-mask, focals) for the image intrinsics.

    Returns (0, None, None) when the optimizer has no image poses at all.
    """
    if not self.has_im_poses:
        return 0, None, None
    frozen = self.get_known_focal_mask()
    return frozen.sum(), frozen, self.get_focals()
365
+
366
+
367
def align_multiple_poses(src_poses, target_poses):
    """Estimate the similarity (s, R, T) mapping src_poses onto target_poses.

    Both inputs are (N, 4, 4) cam-to-world matrices. Each pose is converted to
    two 3D points (camera center plus a point slightly along its optical axis)
    so that the point registration constrains orientation as well as position.
    """
    N = len(src_poses)
    assert src_poses.shape == target_poses.shape == (N, 4, 4)

    def as_point_cloud(poses):
        # step size chosen relative to the typical inter-camera distance
        step = get_med_dist_between_poses(poses) / 100
        centers = poses[:, :3, 3]
        z_axes = poses[:, :3, 2]
        return torch.cat((centers, centers + step * z_axes))

    R, T, s = roma.rigid_points_registration(
        as_point_cloud(src_poses), as_point_cloud(target_poses), compute_scaling=True
    )
    return s, R, T
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/dust3r_opt/optimizer.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (C) 2024-present Naver Corporation. All rights reserved.
2
+ # Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
3
+ #
4
+ # --------------------------------------------------------
5
+ # Main class for the implementation of the global alignment
6
+ # --------------------------------------------------------
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+
11
+ from cloud_opt.dust3r_opt.base_opt import BasePCOptimizer
12
+ from dust3r.utils.geometry import xy_grid, geotrf
13
+ from dust3r.utils.device import to_cpu, to_numpy
14
+
15
+
16
class PointCloudOptimizer(BasePCOptimizer):
    """Optimize a global scene, given a list of pairwise observations.
    Graph node: images
    Graph edges: observations = (pred1, pred2)

    In addition to the pairwise poses of BasePCOptimizer, this class
    optimizes per-image parameters: log-depthmaps, camera poses, focals and
    (optionally) principal points. All per-image parameters are re-packed
    into stacked, zero-padded tensors so that forward() is fully vectorized.
    """

    def __init__(self, *args, optimize_pp=False, focal_break=20, **kwargs):
        """optimize_pp: also optimize the principal points when True.
        focal_break: scale applied to log-focals so their gradient magnitude
        is commensurate with the other parameters."""
        super().__init__(*args, **kwargs)

        self.has_im_poses = True  # by definition of this class
        self.focal_break = focal_break

        # adding thing to optimize: one parameter group per image
        self.im_depthmaps = nn.ParameterList(
            torch.randn(H, W) / 10 - 3 for H, W in self.imshapes
        )  # log(depth)
        self.im_poses = nn.ParameterList(
            self.rand_pose(self.POSE_DIM) for _ in range(self.n_imgs)
        )  # camera poses
        self.im_focals = nn.ParameterList(
            torch.FloatTensor([self.focal_break * np.log(max(H, W))])
            for H, W in self.imshapes
        )  # camera intrinsics: focal stored as focal_break * log(focal)
        self.im_pp = nn.ParameterList(
            torch.zeros((2,)) for _ in range(self.n_imgs)
        )  # camera intrinsics: principal-point offset, stored in units of 10 px
        self.im_pp.requires_grad_(optimize_pp)

        self.imshape = self.imshapes[0]
        im_areas = [h * w for h, w in self.imshapes]
        self.max_area = max(im_areas)

        # re-pack the per-image parameters as stacked tensors padded to
        # max_area, so the whole scene can be processed in one batch
        self.im_depthmaps = ParameterStack(
            self.im_depthmaps, is_param=True, fill=self.max_area
        )
        self.im_poses = ParameterStack(self.im_poses, is_param=True)
        self.im_focals = ParameterStack(self.im_focals, is_param=True)
        self.im_pp = ParameterStack(self.im_pp, is_param=True)
        self.register_buffer(
            "_pp", torch.tensor([(w / 2, h / 2) for h, w in self.imshapes])
        )  # default principal points (image centers)
        self.register_buffer(
            "_grid",
            ParameterStack(
                [xy_grid(W, H, device=self.device) for H, W in self.imshapes],
                fill=self.max_area,
            ),
        )  # flattened per-image pixel grids, used for back-projection

        # pre-compute pixel weights (confidence-derived loss weights)
        self.register_buffer(
            "_weight_i",
            ParameterStack(
                [self.conf_trf(self.conf_i[i_j]) for i_j in self.str_edges],
                fill=self.max_area,
            ),
        )
        self.register_buffer(
            "_weight_j",
            ParameterStack(
                [self.conf_trf(self.conf_j[i_j]) for i_j in self.str_edges],
                fill=self.max_area,
            ),
        )

        # pre-compute the stacked pairwise predictions and per-edge indices
        self.register_buffer(
            "_stacked_pred_i",
            ParameterStack(self.pred_i, self.str_edges, fill=self.max_area),
        )
        self.register_buffer(
            "_stacked_pred_j",
            ParameterStack(self.pred_j, self.str_edges, fill=self.max_area),
        )
        self.register_buffer("_ei", torch.tensor([i for i, j in self.edges]))
        self.register_buffer("_ej", torch.tensor([j for i, j in self.edges]))
        self.total_area_i = sum([im_areas[i] for i, j in self.edges])
        self.total_area_j = sum([im_areas[j] for i, j in self.edges])

    def _check_all_imgs_are_selected(self, msk):
        # the preset_* methods below only support setting ALL images at once
        assert np.all(
            self._get_msk_indices(msk) == np.arange(self.n_imgs)
        ), "incomplete mask!"

    def preset_pose(self, known_poses, pose_msk=None):  # cam-to-world
        """Freeze the image poses to the given cam-to-world matrices."""
        self._check_all_imgs_are_selected(pose_msk)

        if isinstance(known_poses, torch.Tensor) and known_poses.ndim == 2:
            known_poses = [known_poses]
        for idx, pose in zip(self._get_msk_indices(pose_msk), known_poses):
            if self.verbose:
                print(f" (setting pose #{idx} = {pose[:3,3]})")
            self._no_grad(self._set_pose(self.im_poses, idx, torch.tensor(pose)))

        # normalize scale if there's less than 1 known pose
        n_known_poses = sum((p.requires_grad is False) for p in self.im_poses)
        self.norm_pw_scale = n_known_poses <= 1

        # NOTE(review): the two lines below unconditionally freeze all poses
        # and disable pairwise-scale normalization, overriding the logic just
        # above — confirm this is intentional.
        self.im_poses.requires_grad_(False)
        self.norm_pw_scale = False

    def preset_focal(self, known_focals, msk=None):
        """Freeze the focal lengths to the given values."""
        self._check_all_imgs_are_selected(msk)

        for idx, focal in zip(self._get_msk_indices(msk), known_focals):
            if self.verbose:
                print(f" (setting focal #{idx} = {focal})")
            self._no_grad(self._set_focal(idx, focal))

        self.im_focals.requires_grad_(False)

    def preset_principal_point(self, known_pp, msk=None):
        """Freeze the principal points to the given values."""
        self._check_all_imgs_are_selected(msk)

        for idx, pp in zip(self._get_msk_indices(msk), known_pp):
            if self.verbose:
                print(f" (setting principal point #{idx} = {pp})")
            self._no_grad(self._set_principal_point(idx, pp))

        self.im_pp.requires_grad_(False)

    def _get_msk_indices(self, msk):
        # normalize a mask (None / int / sequence / bool array / int array)
        # into an iterable of image indices
        if msk is None:
            return range(self.n_imgs)
        elif isinstance(msk, int):
            return [msk]
        elif isinstance(msk, (tuple, list)):
            return self._get_msk_indices(np.array(msk))
        elif msk.dtype in (bool, torch.bool, np.bool_):
            assert len(msk) == self.n_imgs
            return np.where(msk)[0]
        elif np.issubdtype(msk.dtype, np.integer):
            return msk
        else:
            raise ValueError(f"bad {msk=}")

    def _no_grad(self, tensor):
        # sanity check used by the preset_* methods: a parameter can only be
        # preset while it is still trainable (otherwise _set_* is a no-op)
        assert (
            tensor.requires_grad
        ), "it must be True at this point, otherwise no modification occurs"

    def _set_focal(self, idx, focal, force=False):
        param = self.im_focals[idx]
        if (
            param.requires_grad or force
        ):  # can only init a parameter not already initialized
            param.data[:] = self.focal_break * np.log(focal)
        return param

    def get_focals(self):
        # invert the focal_break * log(focal) parameterization
        log_focals = torch.stack(list(self.im_focals), dim=0)
        return (log_focals / self.focal_break).exp()

    def get_known_focal_mask(self):
        # True where the focal is frozen (preset)
        return torch.tensor([not (p.requires_grad) for p in self.im_focals])

    def _set_principal_point(self, idx, pp, force=False):
        param = self.im_pp[idx]
        H, W = self.imshapes[idx]
        if (
            param.requires_grad or force
        ):  # can only init a parameter not already initialized
            param.data[:] = to_cpu(to_numpy(pp) - (W / 2, H / 2)) / 10
        return param

    def get_principal_points(self):
        # image center plus the (10x-scaled) learned offset
        return self._pp + 10 * self.im_pp

    def get_intrinsics(self):
        """Return the (n_imgs, 3, 3) pinhole intrinsic matrices."""
        K = torch.zeros((self.n_imgs, 3, 3), device=self.device)
        focals = self.get_focals().flatten()
        K[:, 0, 0] = K[:, 1, 1] = focals
        K[:, :2, 2] = self.get_principal_points()
        K[:, 2, 2] = 1
        return K

    def get_im_poses(self):  # cam to world
        cam2world = self._get_poses(self.im_poses)
        return cam2world

    def _set_depthmap(self, idx, depth, force=False):
        # depthmaps are stored flattened and zero-padded to max_area
        depth = _ravel_hw(depth, self.max_area)

        param = self.im_depthmaps[idx]
        if (
            param.requires_grad or force
        ):  # can only init a parameter not already initialized
            # parameters store log(depth); zero-depth maps to -inf, clamp to 0
            param.data[:] = depth.log().nan_to_num(neginf=0)
        return param

    def get_depthmaps(self, raw=False):
        res = self.im_depthmaps.exp()
        if not raw:
            # un-pad and reshape each depthmap back to (H, W)
            res = [dm[: h * w].view(h, w) for dm, (h, w) in zip(res, self.imshapes)]
        return res

    def depth_to_pts3d(self):
        """Back-project the current depthmaps to world-frame 3D points."""
        # Get depths and projection params if not provided
        focals = self.get_focals()
        pp = self.get_principal_points()
        im_poses = self.get_im_poses()
        depth = self.get_depthmaps(raw=True)

        # get pointmaps in camera frame
        rel_ptmaps = _fast_depthmap_to_pts3d(depth, self._grid, focals, pp=pp)
        # project to world frame
        return geotrf(im_poses, rel_ptmaps)

    def get_pts3d(self, raw=False):
        res = self.depth_to_pts3d()
        if not raw:
            # un-pad and reshape each pointmap back to (H, W, 3)
            res = [dm[: h * w].view(h, w, 3) for dm, (h, w) in zip(res, self.imshapes)]
        return res

    def forward(self):
        """Global alignment loss: confidence-weighted 3D distance between the
        current scene points and the (pairwise-posed) network predictions."""
        pw_poses = self.get_pw_poses()  # cam-to-world
        pw_adapt = self.get_adaptors().unsqueeze(1)
        proj_pts3d = self.get_pts3d(raw=True)

        # rotate pairwise prediction according to pw_poses
        aligned_pred_i = geotrf(pw_poses, pw_adapt * self._stacked_pred_i)
        aligned_pred_j = geotrf(pw_poses, pw_adapt * self._stacked_pred_j)

        # compute the loss, normalized by the total image area on each side
        li = (
            self.dist(proj_pts3d[self._ei], aligned_pred_i, weight=self._weight_i).sum()
            / self.total_area_i
        )
        lj = (
            self.dist(proj_pts3d[self._ej], aligned_pred_j, weight=self._weight_j).sum()
            / self.total_area_j
        )

        return li + lj
251
+
252
+
253
+ def _fast_depthmap_to_pts3d(depth, pixel_grid, focal, pp):
254
+ pp = pp.unsqueeze(1)
255
+ focal = focal.unsqueeze(1)
256
+ assert focal.shape == (len(depth), 1, 1)
257
+ assert pp.shape == (len(depth), 1, 2)
258
+ assert pixel_grid.shape == depth.shape + (2,)
259
+ depth = depth.unsqueeze(-1)
260
+ return torch.cat((depth * (pixel_grid - pp) / focal, depth), dim=-1)
261
+
262
+
263
def ParameterStack(params, keys=None, is_param=None, fill=0):
    """Stack a collection of tensors into one detached float tensor.

    If `keys` is given, `params` is treated as a mapping and the listed keys
    are selected in order. With `fill` > 0, each tensor is flattened over its
    leading (H, W) dims and zero-padded to `fill` rows. The result is wrapped
    in an nn.Parameter when requested (or when the inputs require grad),
    preserving the inputs' requires_grad flag.
    """
    if keys is not None:
        params = [params[k] for k in keys]
    if fill > 0:
        params = [_ravel_hw(p, fill) for p in params]

    requires_grad = params[0].requires_grad
    assert all(p.requires_grad == requires_grad for p in params)

    stacked = torch.stack(list(params)).float().detach()
    if is_param or requires_grad:
        stacked = nn.Parameter(stacked)
        stacked.requires_grad_(requires_grad)
    return stacked
278
+
279
+
280
+ def _ravel_hw(tensor, fill=0):
281
+ # ravel H,W
282
+ tensor = tensor.view((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:])
283
+
284
+ if len(tensor) < fill:
285
+ tensor = torch.cat(
286
+ (tensor, tensor.new_zeros((fill - len(tensor),) + tensor.shape[1:]))
287
+ )
288
+ return tensor
289
+
290
+
291
+ def acceptable_focal_range(H, W, minf=0.5, maxf=3.5):
292
+ focal_base = max(H, W) / (
293
+ 2 * np.tan(np.deg2rad(60) / 2)
294
+ ) # size / 1.1547005383792515
295
+ return minf * focal_base, maxf * focal_base
296
+
297
+
298
def apply_mask(img, msk):
    """Return a copy of `img` with the masked pixels zeroed (input untouched)."""
    out = img.copy()
    out[msk] = 0
    return out
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/init_all.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import cache
2
+ import numpy as np
3
+ import scipy.sparse as sp
4
+ import torch
5
+ import cv2
6
+ import roma
7
+ from tqdm import tqdm
8
+
9
+ from cloud_opt.utils import *
10
+
11
+
12
def compute_edge_scores(edges, edge2conf_i, edge2conf_j):
    """Score every edge by the product of its two mean confidences.

    edges yields ('i_j', (i, j)) pairs; returns {(i, j): score}.
    """
    scores = {}
    for key, (i, j) in edges:
        scores[(i, j)] = edge_conf(edge2conf_i[key], edge2conf_j[key])
    return scores
+
21
+
22
def dict_to_sparse_graph(dic):
    """Build a scipy DOK adjacency matrix from a {(i, j): score} mapping."""
    size = 1 + max(max(pair) for pair in dic)
    adjacency = sp.dok_array((size, size))
    for pair, score in dic.items():
        adjacency[pair] = score
    return adjacency
+
29
+
30
@torch.no_grad()
def init_minimum_spanning_tree(self, **kw):
    """Init all camera poses (image-wise and pairwise poses) given
    an initial set of pairwise estimations.

    Builds a maximum-confidence spanning tree over the pairwise graph,
    chains per-edge registrations along it into a global frame, then writes
    the result into the optimizer state via init_from_pts3d.
    """
    device = self.device
    pts3d, _, im_focals, im_poses = minimum_spanning_tree(
        self.imshapes,
        self.edges,
        self.edge2pts_i,
        self.edge2pts_j,
        self.edge2conf_i,
        self.edge2conf_j,
        self.im_conf,
        self.min_conf_thr,
        device,
        has_im_poses=self.has_im_poses,
        verbose=self.verbose,
        **kw,
    )

    return init_from_pts3d(self, pts3d, im_focals, im_poses)
+ return init_from_pts3d(self, pts3d, im_focals, im_poses)
52
+
53
+
54
def minimum_spanning_tree(
    imshapes,
    edges,
    edge2pred_i,
    edge2pred_j,
    edge2conf_i,
    edge2conf_j,
    im_conf,
    min_conf_thr,
    device,
    has_im_poses=True,
    niter_PnP=10,
    verbose=True,
    save_score_path=None,
):
    """Chain pairwise predictions into one globally-aligned pointcloud.

    Builds a maximum-confidence spanning tree over the pairwise graph, seeds
    the world frame with the strongest edge, then attaches the remaining
    edges one at a time by rigid registration. Missing focals/poses are
    completed by focal estimation and RANSAC-PnP.

    Args:
        save_score_path: accepted for API compatibility; currently unused.

    Returns:
        (pts3d, msp_edges, im_focals, im_poses); the last two are None when
        has_im_poses is False.
    """
    n_imgs = len(imshapes)
    edge_and_scores = compute_edge_scores(map(i_j_ij, edges), edge2conf_i, edge2conf_j)
    # negate scores so the *minimum* spanning tree keeps the *most* confident edges
    sparse_graph = -dict_to_sparse_graph(edge_and_scores)
    msp = sp.csgraph.minimum_spanning_tree(sparse_graph).tocoo()

    # temp variable to store 3d points
    pts3d = [None] * len(imshapes)

    todo = sorted(zip(-msp.data, msp.row, msp.col))  # sorted edges, weakest first
    im_poses = [None] * n_imgs
    im_focals = [None] * n_imgs

    # init with strongest edge: it defines the world frame
    score, i, j = todo.pop()
    if verbose:
        print(f" init edge ({i}*,{j}*) {score=}")
    i_j = edge_str(i, j)

    pts3d[i] = edge2pred_i[i_j].clone()
    pts3d[j] = edge2pred_j[i_j].clone()
    done = {i, j}
    if has_im_poses:
        im_poses[i] = torch.eye(4, device=device)
        im_focals[i] = estimate_focal(edge2pred_i[i_j])

    # set initial pointcloud based on pairwise graph
    msp_edges = [(i, j)]
    while todo:
        # each time, predict the next one
        score, i, j = todo.pop()
        # FIX: compute the key for THIS edge before using it; previously the
        # focal below was estimated with the stale i_j of the previous
        # iteration (possibly an edge not even containing image i)
        i_j = edge_str(i, j)

        if im_focals[i] is None:
            im_focals[i] = estimate_focal(edge2pred_i[i_j])

        if i in done:
            if verbose:
                print(f" init edge ({i},{j}*) {score=}")
            assert j not in done
            # align pred[i] with pts3d[i], and then set j accordingly
            s, R, T = rigid_points_registration(
                edge2pred_i[i_j], pts3d[i], conf=edge2conf_i[i_j]
            )
            trf = sRT_to_4x4(s, R, T, device)
            pts3d[j] = geotrf(trf, edge2pred_j[i_j])
            done.add(j)
            msp_edges.append((i, j))

            if has_im_poses and im_poses[i] is None:
                im_poses[i] = sRT_to_4x4(1, R, T, device)

        elif j in done:
            if verbose:
                print(f" init edge ({i}*,{j}) {score=}")
            assert i not in done
            s, R, T = rigid_points_registration(
                edge2pred_j[i_j], pts3d[j], conf=edge2conf_j[i_j]
            )
            trf = sRT_to_4x4(s, R, T, device)
            pts3d[i] = geotrf(trf, edge2pred_i[i_j])
            done.add(i)
            msp_edges.append((i, j))

            if has_im_poses and im_poses[i] is None:
                im_poses[i] = sRT_to_4x4(1, R, T, device)
        else:
            # neither endpoint anchored yet: let's try again later
            todo.insert(0, (score, i, j))

    if has_im_poses:
        # complete all missing informations
        pair_scores = list(
            sparse_graph.values()
        )  # already negative scores: less is best
        edges_from_best_to_worse = np.array(list(sparse_graph.keys()))[
            np.argsort(pair_scores)
        ]
        for i, j in edges_from_best_to_worse.tolist():
            if im_focals[i] is None:
                im_focals[i] = estimate_focal(edge2pred_i[edge_str(i, j)])

        for i in range(n_imgs):
            if im_poses[i] is None:
                # recover the pose from the global pointmap with RANSAC-PnP
                msk = im_conf[i] > min_conf_thr
                res = fast_pnp(
                    pts3d[i], im_focals[i], msk=msk, device=device, niter_PnP=niter_PnP
                )
                if res:
                    im_focals[i], im_poses[i] = res
            if im_poses[i] is None:
                im_poses[i] = torch.eye(4, device=device)
        im_poses = torch.stack(im_poses)
    else:
        im_poses = im_focals = None

    return pts3d, msp_edges, im_focals, im_poses
+
167
+
168
def init_from_pts3d(self, pts3d, im_focals, im_poses):
    """Write globally-aligned pointmaps, poses and focals into the optimizer.

    Args:
        pts3d: list of per-image (H, W, 3) pointmaps in a common world frame
            (modified in place below).
        im_focals: per-image focal estimates (entries may be None).
        im_poses: (n_imgs, 4, 4) cam-to-world matrices.
    """
    # init poses: if some poses are frozen, rigidly align everything onto them
    nkp, known_poses_msk, known_poses = self.get_known_poses()
    if nkp == 1:
        raise NotImplementedError(
            "Would be simpler to just align everything afterwards on the single known pose"
        )
    elif nkp > 1:
        # global rigid SE3 alignment
        s, R, T = align_multiple_poses(
            im_poses[known_poses_msk], known_poses[known_poses_msk]
        )
        trf = sRT_to_4x4(s, R, T, device=known_poses.device)

        # rotate everything
        im_poses = trf @ im_poses
        im_poses[:, :3, :3] /= s  # undo scaling on the rotation part
        for img_pts3d in pts3d:
            img_pts3d[:] = geotrf(trf, img_pts3d)
    else:
        pass  # no known poses

    # set all pairwise poses
    # NOTE(review): reads self.pred_i / self.conf_i here, while the MST init
    # in this module reads self.edge2pts_i / self.edge2conf_i — confirm both
    # attribute families exist on the optimizer.
    for e, (i, j) in enumerate(self.edges):
        i_j = edge_str(i, j)
        # compute transform that goes from cam to world
        s, R, T = rigid_points_registration(
            self.pred_i[i_j], pts3d[i], conf=self.conf_i[i_j]
        )
        self._set_pose(self.pw_poses, e, R, T, scale=s)

    # take into account the scale normalization
    s_factor = self.get_pw_norm_scale_factor()
    im_poses[:, :3, 3] *= s_factor  # apply downscaling factor
    for img_pts3d in pts3d:
        img_pts3d *= s_factor

    # init all image poses
    if self.has_im_poses:
        for i in range(self.n_imgs):
            cam2world = im_poses[i]
            # per-image depth = z of the world points re-expressed in camera frame
            depth = geotrf(inv(cam2world), pts3d[i])[..., 2]
            self._set_depthmap(i, depth)
            self._set_pose(self.im_poses, i, cam2world)
            if im_focals[i] is not None:
                if not self.shared_focal:
                    self._set_focal(i, im_focals[i])
        if self.shared_focal:
            # single shared focal: use the average of all per-image estimates
            self._set_focal(0, sum(im_focals) / self.n_imgs)
        if self.n_imgs > 2:
            self._set_init_depthmap()

    if self.verbose:
        with torch.no_grad():
            print(" init loss =", float(self()))
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/cloud_opt/utils.py ADDED
@@ -0,0 +1,443 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ import roma
4
+ import numpy as np
5
+ import cv2
6
+ from functools import cache
7
+
8
+
9
def todevice(batch, device, callback=None, non_blocking=False):
    """Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy).

    batch: list, tuple, dict of tensors or other things
    device: pytorch device or 'numpy'
    callback: function that would be called on every sub-elements.
    non_blocking: forwarded to Tensor.to for asynchronous copies.
    """
    if callback:
        batch = callback(batch)

    if isinstance(batch, dict):
        # FIX: propagate callback and non_blocking into the recursion — they
        # were silently dropped before, contradicting the documented contract
        return {
            k: todevice(v, device, callback=callback, non_blocking=non_blocking)
            for k, v in batch.items()
        }

    if isinstance(batch, (tuple, list)):
        return type(batch)(
            todevice(x, device, callback=callback, non_blocking=non_blocking)
            for x in batch
        )

    x = batch
    if device == "numpy":
        if isinstance(x, torch.Tensor):
            x = x.detach().cpu().numpy()
    elif x is not None:
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x)
        if torch.is_tensor(x):
            x = x.to(device, non_blocking=non_blocking)
    return x
35
+
36
+
37
# Convenience wrappers around todevice() for the common targets.
to_device = todevice  # alias


def to_numpy(x):
    """Recursively convert tensors to numpy arrays (other values pass through)."""
    return todevice(x, "numpy")


def to_cpu(x):
    """Recursively move all tensors to the CPU."""
    return todevice(x, "cpu")


def to_cuda(x):
    """Recursively move all tensors to the default CUDA device."""
    return todevice(x, "cuda")
50
+
51
+
52
+ def signed_log1p(x):
53
+ sign = torch.sign(x)
54
+ return sign * torch.log1p(torch.abs(x))
55
+
56
+
57
def l2_dist(a, b, weight):
    """Weighted squared-L2 distance along the last axis."""
    diff = a - b
    return weight * (diff * diff).sum(dim=-1)


def l1_dist(a, b, weight):
    """Weighted Euclidean norm of the residual along the last axis.

    Note: despite the name, this is an L2 norm of (a - b), not an L1 distance.
    """
    return weight * torch.linalg.vector_norm(a - b, dim=-1)


ALL_DISTS = {"l1": l1_dist, "l2": l2_dist}
66
+
67
+
68
+ def _check_edges(edges):
69
+ indices = sorted({i for edge in edges for i in edge})
70
+ assert indices == list(range(len(indices))), "bad pair indices: missing values "
71
+ return len(indices)
72
+
73
+
74
def NoGradParamDict(x):
    """Wrap a dict of tensors into an nn.ParameterDict with gradients disabled."""
    assert isinstance(x, dict)
    param_dict = nn.ParameterDict(x)
    return param_dict.requires_grad_(False)
77
+
78
+
79
def edge_str(i, j):
    """Canonical dictionary key for the directed edge i -> j."""
    return "{}_{}".format(i, j)
81
+
82
+
83
def i_j_ij(ij):
    """Map an (i, j) pair to (its string key, the pair itself)."""
    return f"{ij[0]}_{ij[1]}", ij
86
+
87
+
88
def edge_conf(conf_i, conf_j):
    """Scalar confidence of an edge: product of the two mean confidences."""
    return (conf_i.mean() * conf_j.mean()).item()
91
+
92
+
93
def get_imshapes(edges, pred_i, pred_j):
    """Recover each image's (H, W) from the per-edge predictions.

    Asserts that every edge agrees on the shape of the images it touches.
    """
    n_imgs = 1 + max(max(e) for e in edges)
    imshapes = [None] * n_imgs
    for e, (i, j) in enumerate(edges):
        shape_i = tuple(pred_i[e]["pts3d_is_self_view"].shape[:2])
        shape_j = tuple(pred_j[e]["pts3d_in_other_view"].shape[:2])
        if imshapes[i]:
            assert imshapes[i] == shape_i, f"incorrect shape for image {i}"
        if imshapes[j]:
            assert imshapes[j] == shape_j, f"incorrect shape for image {j}"
        imshapes[i], imshapes[j] = shape_i, shape_j
    return imshapes
106
+
107
+
108
def get_conf_trf(mode):
    """Return the confidence-weighting transform for the given mode.

    Supported modes: 'log', 'sqrt', 'm1' (minus one), 'id'/'none' (identity).
    Raises ValueError for any other mode.
    """
    transforms = {
        "log": lambda x: x.log(),
        "sqrt": lambda x: x.sqrt(),
        "m1": lambda x: x - 1,
        "id": lambda x: x,
        "none": lambda x: x,
    }
    try:
        return transforms[mode]
    except KeyError:
        raise ValueError(f"bad mode for {mode=}") from None
132
+
133
+
134
+ @torch.no_grad()
135
+ def _compute_img_conf(imshapes, device, edges, edge2conf_i, edge2conf_j):
136
+ im_conf = nn.ParameterList([torch.zeros(hw, device=device) for hw in imshapes])
137
+ for e, (i, j) in enumerate(edges):
138
+ im_conf[i] = torch.maximum(im_conf[i], edge2conf_i[edge_str(i, j)])
139
+ im_conf[j] = torch.maximum(im_conf[j], edge2conf_j[edge_str(i, j)])
140
+ return im_conf
141
+
142
+
143
def xy_grid(
    W,
    H,
    device=None,
    origin=(0, 0),
    unsqueeze=None,
    cat_dim=-1,
    homogeneous=False,
    **arange_kw,
):
    """Output a (H,W,2) array of int32
    with output[j,i,0] = i + origin[0]
         output[j,i,1] = j + origin[1]

    device: None selects the numpy backend, otherwise torch on that device.
    origin: (x0, y0) offset added to the pixel coordinates.
    unsqueeze: if given, unsqueeze each coordinate plane at this dim.
    cat_dim: dim along which the x/y planes are stacked (None keeps a tuple).
    homogeneous: append a plane of ones -> homogeneous (x, y, 1) coords.
    arange_kw: forwarded to arange (e.g. dtype).
    """
    # Pick the backend functions once, so the logic below is backend-agnostic.
    if device is None:
        # numpy
        arange, meshgrid, stack, ones = np.arange, np.meshgrid, np.stack, np.ones
    else:
        # torch
        arange = lambda *a, **kw: torch.arange(*a, device=device, **kw)
        meshgrid, stack = torch.meshgrid, torch.stack
        ones = lambda *a: torch.ones(*a, device=device)

    # One coordinate axis per image dimension, shifted by the origin.
    tw, th = [arange(o, o + s, **arange_kw) for s, o in zip((W, H), origin)]
    grid = meshgrid(tw, th, indexing="xy")
    if homogeneous:
        grid = grid + (ones((H, W)),)
    if unsqueeze is not None:
        grid = (grid[0].unsqueeze(unsqueeze), grid[1].unsqueeze(unsqueeze))
    if cat_dim is not None:
        grid = stack(grid, cat_dim)
    return grid
175
+
176
+
177
def estimate_focal_knowing_depth(
    pts3d, pp, focal_mode="median", min_focal=0.0, max_focal=np.inf
):
    """Reprojection method, for when the absolute depth is known:
    1) estimate the camera focal using a robust estimator
    2) reproject points onto true rays, minimizing a certain error

    pts3d: (B, H, W, 3) per-pixel 3-D points in the camera frame.
    pp: (B, 2) principal point in pixels.
    focal_mode: "median" (per-pixel focal votes) or "weiszfeld" (IRLS).
    min_focal / max_focal: clip bounds, expressed relative to the focal of
        a 60-degree FoV camera of the same image size.
    Returns a (B,) tensor of focal lengths in pixels.
    """
    B, H, W, THREE = pts3d.shape
    assert THREE == 3

    # centered pixel grid
    pixels = xy_grid(W, H, device=pts3d.device).view(1, -1, 2) - pp.view(
        -1, 1, 2
    )  # B,HW,2
    pts3d = pts3d.flatten(1, 2)  # (B, HW, 3)

    if focal_mode == "median":
        with torch.no_grad():
            # direct estimation of focal: each pixel votes f = u*z/x (and v*z/y)
            u, v = pixels.unbind(dim=-1)
            x, y, z = pts3d.unbind(dim=-1)
            fx_votes = (u * z) / x
            fy_votes = (v * z) / y

            # assume square pixels, hence same focal for X and Y
            f_votes = torch.cat((fx_votes.view(B, -1), fy_votes.view(B, -1)), dim=-1)
            # nanmedian is robust to the NaN/inf votes from x==0 or y==0 pixels
            focal = torch.nanmedian(f_votes, dim=-1).values

    elif focal_mode == "weiszfeld":
        # init focal with l2 closed form
        # we try to find focal = argmin Sum | pixel - focal * (x,y)/z|
        xy_over_z = (pts3d[..., :2] / pts3d[..., 2:3]).nan_to_num(
            posinf=0, neginf=0
        )  # homogeneous (x,y,1)

        dot_xy_px = (xy_over_z * pixels).sum(dim=-1)
        dot_xy_xy = xy_over_z.square().sum(dim=-1)

        focal = dot_xy_px.mean(dim=1) / dot_xy_xy.mean(dim=1)

        # iterative re-weighted least-squares (Weiszfeld's algorithm)
        for iter in range(10):
            # re-weighting by inverse of distance
            dis = (pixels - focal.view(-1, 1, 1) * xy_over_z).norm(dim=-1)
            # print(dis.nanmean(-1))
            w = dis.clip(min=1e-8).reciprocal()
            # update the scaling with the new weights
            focal = (w * dot_xy_px).mean(dim=1) / (w * dot_xy_xy).mean(dim=1)
    else:
        raise ValueError(f"bad {focal_mode=}")

    # Reference focal of a 60-degree FoV camera; min/max bounds are relative to it.
    focal_base = max(H, W) / (
        2 * np.tan(np.deg2rad(60) / 2)
    )  # size / 1.1547005383792515
    focal = focal.clip(min=min_focal * focal_base, max=max_focal * focal_base)
    # print(focal)
    return focal
234
+
235
+
236
def estimate_focal(pts3d_i, pp=None):
    """Estimate a single focal length (in pixels) from one (H, W, 3) point map.

    Defaults the principal point to the image center when ``pp`` is None.
    """
    if pp is None:
        H, W, THREE = pts3d_i.shape
        assert THREE == 3
        pp = torch.tensor((W / 2, H / 2), device=pts3d_i.device)
    focal = estimate_focal_knowing_depth(
        pts3d_i[None], pp[None], focal_mode="weiszfeld"
    ).ravel()
    return float(focal)
245
+
246
+
247
def rigid_points_registration(pts1, pts2, conf):
    """Weighted similarity registration of ``pts1`` onto ``pts2`` via roma.

    pts1, pts2: point sets, reshaped to (-1, 3) before registration.
    conf: per-point weights (flattened).
    Returns (s, R, T): scale, rotation, translation.
    """
    R, T, s = roma.rigid_points_registration(
        pts1.reshape(-1, 3),
        pts2.reshape(-1, 3),
        weights=conf.ravel(),
        compute_scaling=True,
    )
    return s, R, T  # return un-scaled (R, T)
255
+
256
+
257
def sRT_to_4x4(scale, R, T, device):
    """Assemble a 4x4 similarity transform from scale, rotation R and translation T."""
    trf = torch.eye(4, device=device)
    trf[:3, :3] = scale * R  # fold the scale into the linear part
    trf[:3, 3] = T.ravel()  # translation is not scaled
    return trf
262
+
263
+
264
def geotrf(Trf, pts, ncol=None, norm=False):
    """Apply a geometric transformation to a list of 3-D points.

    H: 3x3 or 4x4 projection matrix (typically a Homography)
    p: numpy/torch/tuple of coordinates. Shape must be (...,2) or (...,3)

    ncol: int. number of columns of the result (2 or 3)
    norm: float. if != 0, the resut is projected on the z=norm plane.

    Returns an array of projected 2d points.
    """
    assert Trf.ndim >= 2
    # Coerce pts to the same framework (and dtype) as Trf.
    if isinstance(Trf, np.ndarray):
        pts = np.asarray(pts)
    elif isinstance(Trf, torch.Tensor):
        pts = torch.as_tensor(pts, dtype=Trf.dtype)

    # adapt shape if necessary
    output_reshape = pts.shape[:-1]
    ncol = ncol or pts.shape[-1]

    # optimized code: batched torch case (B,4,4) applied to (B,H,W,3) maps
    if (
        isinstance(Trf, torch.Tensor)
        and isinstance(pts, torch.Tensor)
        and Trf.ndim == 3
        and pts.ndim == 4
    ):
        d = pts.shape[3]
        if Trf.shape[-1] == d:
            # pure linear map, no translation
            pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts)
        elif Trf.shape[-1] == d + 1:
            # affine map: linear part + broadcast translation column
            pts = (
                torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts)
                + Trf[:, None, None, :d, d]
            )
        else:
            raise ValueError(f"bad shape, not ending with 3 or 4, for {pts.shape=}")
    else:
        # generic fallback path (numpy or non-batched torch)
        if Trf.ndim >= 3:
            n = Trf.ndim - 2
            assert Trf.shape[:n] == pts.shape[:n], "batch size does not match"
            Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])

            if pts.ndim > Trf.ndim:
                # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
                pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
            elif pts.ndim == 2:
                # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
                pts = pts[:, None, :]

        if pts.shape[-1] + 1 == Trf.shape[-1]:
            # homogeneous transform of non-homogeneous points
            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
            pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
        elif pts.shape[-1] == Trf.shape[-1]:
            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
            pts = pts @ Trf
        else:
            pts = Trf @ pts.T
            if pts.ndim >= 2:
                pts = pts.swapaxes(-1, -2)

    if norm:
        # perspective division onto the z=norm plane
        pts = pts / pts[..., -1:]  # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
        if norm != 1:
            pts *= norm

    res = pts[..., :ncol].reshape(*output_reshape, ncol)
    return res
333
+
334
+
335
def inv(mat):
    """Invert a torch or numpy matrix; raises ValueError for other types."""
    if isinstance(mat, np.ndarray):
        return np.linalg.inv(mat)
    if isinstance(mat, torch.Tensor):
        return torch.linalg.inv(mat)
    raise ValueError(f"bad matrix type = {type(mat)}")
342
+
343
+
344
@cache
def pixel_grid(H, W):
    """(H, W, 2) float32 grid with grid[y, x] == (x, y); memoised per size."""
    xs, ys = np.meshgrid(np.arange(W), np.arange(H))  # xs[y, x] = x, ys[y, x] = y
    return np.stack((xs, ys), axis=-1).astype(np.float32)
347
+
348
+
349
def fast_pnp(pts3d, focal, msk, device, pp=None, niter_PnP=10):
    """Recover a camera-to-world pose (and focal, if unknown) with RANSAC-PnP.

    pts3d: (H, W, 3) per-pixel 3-D points in world coordinates.
    focal: known focal in pixels, or None to search a geometric range.
    msk: (H, W) boolean mask of valid pixels.
    pp: principal point; defaults to the image center.
    Returns (focal, 4x4 cam-to-world pose) or None on failure.
    """
    # extract camera poses and focals with RANSAC-PnP
    if msk.sum() < 4:
        return None  # we need at least 4 points for PnP
    pts3d, msk = map(to_numpy, (pts3d, msk))

    H, W, THREE = pts3d.shape
    assert THREE == 3
    pixels = pixel_grid(H, W)

    if focal is None:
        # unknown focal: try 21 candidates spanning [S/2, 3S] geometrically
        S = max(W, H)
        tentative_focals = np.geomspace(S / 2, S * 3, 21)
    else:
        tentative_focals = [focal]

    if pp is None:
        pp = (W / 2, H / 2)
    else:
        pp = to_numpy(pp)

    # best = (inlier_count, R, T, focal); starts with 0 inliers
    best = (0,)
    for focal in tentative_focals:
        K = np.float32([(focal, 0, pp[0]), (0, focal, pp[1]), (0, 0, 1)])

        success, R, T, inliers = cv2.solvePnPRansac(
            pts3d[msk],
            pixels[msk],
            K,
            None,
            iterationsCount=niter_PnP,
            reprojectionError=5,
            flags=cv2.SOLVEPNP_SQPNP,
        )
        if not success:
            continue

        # score each focal candidate by its RANSAC inlier count
        score = len(inliers)
        if success and score > best[0]:
            best = score, R, T, focal

    if not best[0]:
        return None

    _, R, T, best_focal = best
    R = cv2.Rodrigues(R)[0]  # world to cam
    R, T = map(torch.from_numpy, (R, T))
    return best_focal, inv(sRT_to_4x4(1, R, T, device))  # cam to world
397
+
398
+
399
def get_med_dist_between_poses(poses):
    """Median pairwise distance between the camera centers of ``poses``."""
    from scipy.spatial.distance import pdist

    centers = [to_numpy(p[:3, 3]) for p in poses]
    return np.median(pdist(centers))
403
+
404
+
405
def align_multiple_poses(src_poses, target_poses):
    """Find the similarity (s, R, T) aligning src poses onto target poses.

    src_poses, target_poses: (N, 4, 4) camera-to-world matrices.
    Returns (s, R, T) from roma's scaled rigid registration.
    """
    N = len(src_poses)
    assert src_poses.shape == target_poses.shape == (N, 4, 4)

    def center_and_z(poses):
        # Represent each pose by its center plus a point slightly offset along
        # its z-axis, so the registration also constrains orientation.
        eps = get_med_dist_between_poses(poses) / 100
        return torch.cat((poses[:, :3, 3], poses[:, :3, 3] + eps * poses[:, :3, 2]))

    R, T, s = roma.rigid_points_registration(
        center_and_z(src_poses), center_and_z(target_poses), compute_scaling=True
    )
    return s, R, T
417
+
418
+
419
def cosine_schedule(t, lr_start, lr_end):
    """Cosine-annealed interpolation from lr_start (t=0) to lr_end (t=1)."""
    assert 0 <= t <= 1
    weight = 0.5 * (1 + np.cos(np.pi * t))  # 1 at t=0, 0 at t=1
    return lr_end + (lr_start - lr_end) * weight
422
+
423
+
424
def linear_schedule(t, lr_start, lr_end):
    """Linear interpolation from lr_start (t=0) to lr_end (t=1)."""
    assert 0 <= t <= 1
    return lr_start + t * (lr_end - lr_start)
427
+
428
+
429
def cycled_linear_schedule(t, lr_start, lr_end, num_cycles=2):
    """Linear schedule repeated ``num_cycles`` times over t in [0, 1]."""
    assert 0 <= t <= 1
    phase = t * num_cycles
    phase -= int(phase)  # fractional position inside the current cycle
    if t == 1:
        phase = 1  # pin the very end of training to lr_end
    return lr_start + (lr_end - lr_start) * phase
436
+
437
+
438
def adjust_learning_rate_by_lr(optimizer, lr):
    """Set every param group's lr, honouring an optional per-group "lr_scale"."""
    for group in optimizer.param_groups:
        group["lr"] = lr * group["lr_scale"] if "lr_scale" in group else lr
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/deepspeed_zero3_bf16.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bf16": {
3
+ "enabled": true
4
+ },
5
+ "gradient_accumulation_steps": 1,
6
+ "gradient_clipping": 1.0,
7
+ "train_micro_batch_size_per_gpu": 1,
8
+ "steps_per_print": 2000,
9
+ "wall_clock_breakdown": false,
10
+ "zero_optimization": {
11
+ "stage": 3,
12
+ "overlap_comm": true,
13
+ "contiguous_gradients": true,
14
+ "reduce_bucket_size": 50000000,
15
+ "stage3_prefetch_bucket_size": 5000000,
16
+ "stage3_param_persistence_threshold": 100000,
17
+ "gather_16bit_weights_on_model_save": true
18
+ }
19
+ }
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/finetune.yaml ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_iter: 1
2
+ allow_repeat: false
3
+ amp: 1
4
+ batch_size: 1
5
+ benchmark: false
6
+ custom_lr_scale: 1.0
7
+ dataset_arkit: ARKitScenes_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="/var/scratch/qzhang2/SLAM-Former/data/train/processed_arkitscenes/",
8
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
9
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
10
+ dataset_mvs_synth: MVS_Synth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="/var/scratch/qzhang2/SLAM-Former/data/train/processed_mvs_synth",
11
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
12
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
13
+ dataset_scannetpp: ScanNetpp_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="/var/scratch/qzhang2/SLAM-Former/data/train/processed_scannetpp/",
14
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
15
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
16
+ desc_dim: 128
17
+ detach_frontend_tokens: false
18
+ dist_backend: nccl
19
+ dist_url: env://
20
+ distributed: false
21
+ enable_dynamic_boundary: false
22
+ enable_loop: true
23
+ enable_submap: true
24
+ enable_temporal: false
25
+ epochs: 10
26
+ eval_freq: 1
27
+ exp_name: submap_joint_softall_64_24_v1
28
+ fixed_length: true
29
+ freeze_encoder: true
30
+ gpu: 0
31
+ gradient_checkpointing: true
32
+ gumbel_tau: 5.0
33
+ loop_mask_mode: soft_all
34
+ retain_history_grad: true
35
+ submap_train_mode: full_token
36
+ submap_retrieval_topk: 0
37
+ submap_fetch_source: frontend
38
+ submap_descriptor_source: frontend
39
+ train_submap_modules_only: false
40
+ gumbel_tau_end: 0.1
41
+ gumbel_tau_start: 5.0
42
+ hydra:
43
+ run:
44
+ dir: ${save_dir}/${exp_name}
45
+ verbose: true
46
+ keep_freq: 1
47
+ load_only_encoder: false
48
+ local-rank: -1
49
+ logdir: ${save_dir}/${exp_name}/logs
50
+ long_context: false
51
+ lr: 1e-5
52
+ max_checkpoints: 10
53
+ max_recursive_submaps: 5
54
+ min_lr: 1e-8
55
+ n_corres_test: 0
56
+ n_corres_train: 0
57
+ num_imgs_vis: 4
58
+ num_test_views: 4
59
+ num_views: 24
60
+ num_workers: 4
61
+ output_dir: ${save_dir}/${exp_name}/
62
+ pretrained: /var/scratch/qzhang2/SLAM-Former/ckpt/checkpoint-10.pth.model
63
+ print_freq: 10
64
+ print_img_freq: 50000000
65
+ rank: 0
66
+ resume: null
67
+ retention_ratio: 0.5
68
+ pseudo_gt:
69
+ enable: false
70
+ cache_path: null
71
+ use_soft_targets: true
72
+ min_confidence: 0.65
73
+ min_support_pairs: 1
74
+ topk_pairs: 4
75
+ loss_type: hybrid
76
+ loss_weight_gate: 0.1
77
+ loss_weight_desc: 0.1
78
+ geometric_support_scale: 0.25
79
+ ranking_margin: 0.1
80
+ use_l2m: false
81
+ l2m_min_certainty: 0.0
82
+ l2m_min_inlier_ratio: 0.0
83
+ save_dir: /var/scratch/qzhang2/SLAM-Former/checkpoints
84
+ save_freq: 0.1
85
+ seed: 42
86
+ soft_mask_bias: 0.2
87
+ soft_mask_temperature: 0.25
88
+ start_epoch: 0
89
+ start_step: 0
90
+ submap_size: 6
91
+ task: SLAMFormer_Submap_Finetune
92
+ tbptt_window: 0
93
+ teacher: null
94
+ temporal_embed_mode: learned
95
+ test_criterion: DistillLoss()
96
+ test_dataset: 500 @ ARKitScenes_Multi(split='test', ROOT="/var/scratch/qzhang2/SLAM-Former/data/train/processed_arkitscenes/",
97
+ resolution=(518, 392), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
98
+ train_criterion: DistillLoss()
99
+ train_dataset: 2250 @ ${dataset_scannetpp} + 450 @ ${dataset_mvs_synth} + 4500 @ ${dataset_arkit}
100
+ warmup_epochs: 0.5
101
+ weight_decay: 0.05
102
+ world_size: 1
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/finetune_paper_h20.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_iter: 1
2
+ allow_repeat: false
3
+ amp: 1
4
+ batch_size: 1
5
+ benchmark: false
6
+ custom_lr_scale: 1.0
7
+ data_root: /var/scratch/qzhang2/SLAM-Former/data/train
8
+ root_arkit: ${data_root}/processed_arkitscenes
9
+ root_scannetpp: ${data_root}/processed_scannetpp
10
+ root_scannet: ${data_root}/processed_scannet
11
+ root_hypersim: ${data_root}/hypersim
12
+ root_blendedmvs: ${data_root}/processed_blendedmvs
13
+ root_megadepth: ${data_root}/processed_megadepth
14
+ root_mvs_synth: ${data_root}/processed_mvs_synth
15
+ dataset_arkit: ARKitScenes_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_arkit}",
16
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
17
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_arkit}, n_corres=${n_corres_train})
18
+ dataset_scannetpp: ScanNetpp_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_scannetpp}",
19
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
20
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_scannetpp}, n_corres=${n_corres_train})
21
+ dataset_scannet: ScanNet_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_scannet}",
22
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
23
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_scannet}, n_corres=${n_corres_train})
24
+ dataset_hypersim: HyperSim_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_hypersim}",
25
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
26
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_hypersim}, n_corres=${n_corres_train})
27
+ dataset_blendedmvs: BlendedMVS_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_blendedmvs}",
28
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
29
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_blendedmvs}, n_corres=${n_corres_train})
30
+ dataset_megadepth: MegaDepth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_megadepth}",
31
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
32
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_megadepth}, n_corres=${n_corres_train})
33
+ dataset_mvs_synth: MVS_Synth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_mvs_synth}",
34
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
35
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_mvs_synth}, n_corres=${n_corres_train})
36
+ desc_dim: 128
37
+ detach_frontend_tokens: false
38
+ dist_backend: nccl
39
+ dist_url: env://
40
+ distributed: false
41
+ enable_dynamic_boundary: false
42
+ enable_loop: true
43
+ enable_submap: true
44
+ enable_temporal: false
45
+ epochs: 10
46
+ eval_freq: 1
47
+ exp_name: paper_all_h20_joint
48
+ fixed_length: true
49
+ freeze_encoder: true
50
+ gpu: 0
51
+ gradient_checkpointing: true
52
+ gumbel_tau: 5.0
53
+ loop_mask_mode: soft_all
54
+ retain_history_grad: true
55
+ submap_train_mode: full_token
56
+ submap_retrieval_topk: 0
57
+ submap_fetch_source: frontend
58
+ submap_descriptor_source: frontend
59
+ train_submap_modules_only: false
60
+ gumbel_tau_end: 0.1
61
+ gumbel_tau_start: 5.0
62
+ hydra:
63
+ run:
64
+ dir: ${save_dir}/${exp_name}
65
+ verbose: true
66
+ keep_freq: 1
67
+ load_only_encoder: false
68
+ local-rank: -1
69
+ logdir: ${save_dir}/${exp_name}/logs
70
+ long_context: false
71
+ lr: 1e-5
72
+ max_checkpoints: 10
73
+ max_recursive_submaps: 5
74
+ min_lr: 1e-8
75
+ n_corres_test: 0
76
+ n_corres_train: 0
77
+ num_imgs_vis: 4
78
+ num_test_views: 4
79
+ num_views: 24
80
+ num_views_arkit: 24
81
+ num_views_scannetpp: 24
82
+ num_views_scannet: 24
83
+ num_views_hypersim: 24
84
+ num_views_blendedmvs: 24
85
+ num_views_megadepth: 24
86
+ num_views_mvs_synth: 24
87
+ num_workers: 4
88
+ output_dir: ${save_dir}/${exp_name}/
89
+ pretrained: /var/scratch/qzhang2/SLAM-Former/ckpt/checkpoint-10.pth.model
90
+ print_freq: 10
91
+ print_img_freq: 50000000
92
+ rank: 0
93
+ resume: null
94
+ retention_ratio: 0.5
95
+ pseudo_gt:
96
+ enable: false
97
+ cache_path: null
98
+ use_soft_targets: true
99
+ min_confidence: 0.65
100
+ min_support_pairs: 1
101
+ topk_pairs: 4
102
+ loss_type: hybrid
103
+ loss_weight_gate: 0.1
104
+ loss_weight_desc: 0.1
105
+ geometric_support_scale: 0.25
106
+ ranking_margin: 0.1
107
+ use_l2m: false
108
+ l2m_min_certainty: 0.0
109
+ l2m_min_inlier_ratio: 0.0
110
+ save_dir: /var/scratch/qzhang2/SLAM-Former/checkpoints
111
+ save_freq: 0.1
112
+ seed: 42
113
+ soft_mask_bias: 0.2
114
+ soft_mask_temperature: 0.25
115
+ start_epoch: 0
116
+ start_step: 0
117
+ submap_size: 6
118
+ task: SLAMFormer_Submap_Finetune
119
+ tbptt_window: 0
120
+ teacher: null
121
+ temporal_embed_mode: learned
122
+ test_criterion: DistillLoss()
123
+ test_dataset: 500 @ ARKitScenes_Multi(split='test', ROOT="${root_arkit}",
124
+ resolution=(518, 392), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
125
+ train_criterion: DistillLoss()
126
+ train_dataset: 4500 @ ${dataset_arkit} + 2250 @ ${dataset_scannetpp} + 4500 @ ${dataset_scannet} + 1200 @ ${dataset_hypersim} + 2250 @ ${dataset_blendedmvs} + 2250 @ ${dataset_megadepth} + 450 @ ${dataset_mvs_synth}
127
+ warmup_epochs: 0.5
128
+ weight_decay: 0.05
129
+ world_size: 1
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/finetune_pseudo_gt_high_recall.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_iter: 1
2
+ allow_repeat: false
3
+ amp: 1
4
+ batch_size: 1
5
+ benchmark: false
6
+ custom_lr_scale: 1.0
7
+ data_root: /var/scratch/qzhang2/SLAM-Former/data/train
8
+ root_arkit: ${data_root}/processed_arkitscenes
9
+ root_scannetpp: ${data_root}/processed_scannetpp
10
+ root_scannet: ${data_root}/processed_scannetv2
11
+ root_hypersim: ${data_root}/hypersim
12
+ root_blendedmvs: ${data_root}/processed_blendedmvs
13
+ root_megadepth: ${data_root}/processed_megadepth
14
+ root_mvs_synth: ${data_root}/processed_mvs_synth
15
+ dataset_arkit: ARKitScenes_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_arkit}",
16
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
17
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_arkit}, n_corres=${n_corres_train})
18
+ dataset_scannetpp: ScanNetpp_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_scannetpp}",
19
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
20
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_scannetpp}, n_corres=${n_corres_train})
21
+ dataset_scannet: ScanNet_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_scannet}",
22
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
23
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_scannet}, n_corres=${n_corres_train})
24
+ dataset_hypersim: HyperSim_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_hypersim}",
25
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
26
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_hypersim}, n_corres=${n_corres_train})
27
+ dataset_blendedmvs: BlendedMVS_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_blendedmvs}",
28
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
29
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_blendedmvs}, n_corres=${n_corres_train})
30
+ dataset_megadepth: MegaDepth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_megadepth}",
31
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
32
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_megadepth}, n_corres=${n_corres_train})
33
+ dataset_mvs_synth: MVS_Synth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_mvs_synth}",
34
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
35
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views_mvs_synth}, n_corres=${n_corres_train})
36
+ desc_dim: 128
37
+ detach_frontend_tokens: true
38
+ dist_backend: nccl
39
+ dist_url: env://
40
+ distributed: false
41
+ enable_dynamic_boundary: false
42
+ enable_loop: true
43
+ enable_submap: true
44
+ enable_temporal: false
45
+ epochs: 10
46
+ eval_freq: 1
47
+ exp_name: paper_all_h20_pseudo_gt_high_recall
48
+ fixed_length: true
49
+ freeze_encoder: true
50
+ gpu: 0
51
+ gradient_checkpointing: true
52
+ gumbel_tau: 5.0
53
+ loop_mask_mode: soft_all
54
+ retain_history_grad: true
55
+ submap_train_mode: full_token
56
+ submap_retrieval_topk: 0
57
+ submap_fetch_source: frontend
58
+ submap_descriptor_source: frontend
59
+ train_submap_modules_only: true
60
+ gumbel_tau_end: 0.1
61
+ gumbel_tau_start: 5.0
62
+ hydra:
63
+ run:
64
+ dir: ${save_dir}/${exp_name}
65
+ verbose: true
66
+ keep_freq: 1
67
+ load_only_encoder: false
68
+ local-rank: -1
69
+ logdir: ${save_dir}/${exp_name}/logs
70
+ long_context: false
71
+ lr: 1e-5
72
+ max_checkpoints: 10
73
+ max_recursive_submaps: 5
74
+ min_lr: 1e-8
75
+ n_corres_test: 0
76
+ n_corres_train: 0
77
+ num_imgs_vis: 4
78
+ num_test_views: 4
79
+ num_views: 24
80
+ num_views_arkit: 24
81
+ num_views_scannetpp: 24
82
+ num_views_scannet: 24
83
+ num_views_hypersim: 24
84
+ num_views_blendedmvs: 24
85
+ num_views_megadepth: 24
86
+ num_views_mvs_synth: 24
87
+ num_workers: 4
88
+ output_dir: ${save_dir}/${exp_name}/
89
+ pretrained: /var/scratch/qzhang2/SLAM-Former/ckpt/checkpoint-10.pth.model
90
+ print_freq: 10
91
+ print_img_freq: 50000000
92
+ rank: 0
93
+ resume: null
94
+ retention_ratio: 0.5
95
+ pseudo_gt:
96
+ enable: false
97
+ cache_path: null
98
+ use_soft_targets: true
99
+ min_confidence: 0.5
100
+ min_support_pairs: 1
101
+ topk_pairs: 8
102
+ loss_type: hybrid
103
+ loss_weight_gate: 0.05
104
+ loss_weight_desc: 0.15
105
+ geometric_support_scale: 0.5
106
+ ranking_margin: 0.05
107
+ use_l2m: true
108
+ l2m_min_certainty: 0.35
109
+ l2m_min_inlier_ratio: 0.2
110
+ save_dir: /var/scratch/qzhang2/SLAM-Former/checkpoints
111
+ save_freq: 0.1
112
+ seed: 42
113
+ soft_mask_bias: 0.2
114
+ soft_mask_temperature: 0.25
115
+ start_epoch: 0
116
+ start_step: 0
117
+ submap_size: 6
118
+ task: SLAMFormer_Submap_Finetune
119
+ tbptt_window: 0
120
+ teacher: null
121
+ temporal_embed_mode: learned
122
+ test_criterion: DistillLoss()
123
+ test_dataset: 500 @ ARKitScenes_Multi(split='test', ROOT="${root_arkit}",
124
+ resolution=(518, 392), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
125
+ train_criterion: DistillLoss()
126
+ train_dataset: 4500 @ ${dataset_arkit} + 2250 @ ${dataset_scannetpp} + 4500 @ ${dataset_scannet} + 1200 @ ${dataset_hypersim} + 2250 @ ${dataset_blendedmvs} + 2250 @ ${dataset_megadepth} + 450 @ ${dataset_mvs_synth}
127
+ warmup_epochs: 0.5
128
+ weight_decay: 0.05
129
+ world_size: 1
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/finetune_sub_only.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accum_iter: 1
2
+ allow_repeat: false
3
+ amp: 1
4
+ batch_size: 1
5
+ benchmark: false
6
+ custom_lr_scale: 1.0
7
+ data_root: /var/scratch/qzhang2/SLAM-Former/data/train
8
+ root_arkit: ${data_root}/processed_arkitscenes
9
+ root_scannetpp: ${data_root}/processed_scannetpp
10
+ root_scannet: ${data_root}/processed_scannet
11
+ root_hypersim: ${data_root}/hypersim
12
+ root_blendedmvs: ${data_root}/processed_blendedmvs
13
+ root_megadepth: ${data_root}/processed_megadepth
14
+ root_mvs_synth: ${data_root}/processed_mvs_synth
15
+ dataset_arkit: ARKitScenes_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_arkit}",
16
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
17
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
18
+ dataset_scannet: ScanNet_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_scannet}",
19
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
20
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
21
+ dataset_hypersim: HyperSim_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_hypersim}",
22
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
23
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
24
+ dataset_blendedmvs: BlendedMVS_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_blendedmvs}",
25
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
26
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
27
+ dataset_megadepth: MegaDepth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_megadepth}",
28
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
29
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
30
+ dataset_mvs_synth: MVS_Synth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_mvs_synth}",
31
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
32
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
33
+ dataset_scannetpp: ScanNetpp_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="${root_scannetpp}",
34
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210),
35
+ (518, 154)], transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
36
+ desc_dim: 128
37
+ detach_frontend_tokens: true
38
+ dist_backend: nccl
39
+ dist_url: env://
40
+ distributed: false
41
+ enable_dynamic_boundary: false
42
+ enable_loop: true
43
+ enable_submap: true
44
+ enable_temporal: false
45
+ epochs: 10
46
+ eval_freq: 1
47
+ exp_name: submap_sub_only_softall_64_24_v1
48
+ fixed_length: true
49
+ freeze_encoder: true
50
+ gpu: 0
51
+ gradient_checkpointing: true
52
+ gumbel_tau: 5.0
53
+ loop_mask_mode: soft_all
54
+ retain_history_grad: true
55
+ submap_train_mode: full_token
56
+ submap_retrieval_topk: 0
57
+ submap_fetch_source: frontend
58
+ submap_descriptor_source: frontend
59
+ train_submap_modules_only: true
60
+ gumbel_tau_end: 0.1
61
+ gumbel_tau_start: 5.0
62
+ hydra:
63
+ run:
64
+ dir: ${save_dir}/${exp_name}
65
+ verbose: true
66
+ keep_freq: 1
67
+ load_only_encoder: false
68
+ local-rank: -1
69
+ logdir: ${save_dir}/${exp_name}/logs
70
+ long_context: false
71
+ lr: 1e-5
72
+ max_checkpoints: 10
73
+ max_recursive_submaps: 5
74
+ min_lr: 1e-8
75
+ n_corres_test: 0
76
+ n_corres_train: 0
77
+ num_imgs_vis: 4
78
+ num_test_views: 4
79
+ num_views: 24
80
+ num_views_arkit: 24
81
+ num_views_scannetpp: 24
82
+ num_views_scannet: 24
83
+ num_views_hypersim: 24
84
+ num_views_blendedmvs: 24
85
+ num_views_megadepth: 24
86
+ num_views_mvs_synth: 24
87
+ num_workers: 4
88
+ output_dir: ${save_dir}/${exp_name}/
89
+ pretrained: /var/scratch/qzhang2/SLAM-Former/ckpt/checkpoint-10.pth.model
90
+ print_freq: 10
91
+ print_img_freq: 50000000
92
+ rank: 0
93
+ resume: null
94
+ retention_ratio: 0.5
95
+ pseudo_gt:
96
+ enable: false
97
+ cache_path: null
98
+ use_soft_targets: true
99
+ min_confidence: 0.65
100
+ min_support_pairs: 1
101
+ topk_pairs: 4
102
+ loss_type: hybrid
103
+ loss_weight_gate: 0.1
104
+ loss_weight_desc: 0.1
105
+ geometric_support_scale: 0.25
106
+ ranking_margin: 0.1
107
+ use_l2m: false
108
+ l2m_min_certainty: 0.0
109
+ l2m_min_inlier_ratio: 0.0
110
+ save_dir: /var/scratch/qzhang2/SLAM-Former/checkpoints
111
+ save_freq: 0.1
112
+ seed: 42
113
+ soft_mask_bias: 0.2
114
+ soft_mask_temperature: 0.25
115
+ start_epoch: 0
116
+ start_step: 0
117
+ submap_size: 6
118
+ task: SLAMFormer_Submap_Finetune
119
+ tbptt_window: 0
120
+ teacher: null
121
+ temporal_embed_mode: learned
122
+ test_criterion: DistillLoss()
123
+ test_dataset: 500 @ ARKitScenes_Multi(split='test', ROOT="${root_arkit}",
124
+ resolution=(518, 392), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
125
+ train_criterion: DistillLoss()
126
+ train_dataset: 2250 @ ${dataset_scannetpp} + 450 @ ${dataset_mvs_synth} + 4500 @ ${dataset_arkit}
127
+ warmup_epochs: 0.5
128
+ weight_decay: 0.05
129
+ world_size: 1
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/config/mytrain.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ teacher: ../ckpt/model.pt # unused placeholder; kept for config compatibility
2
+ pretrained: ../ckpt/pi3.pth
3
+
4
+ load_only_encoder: False
5
+ long_context: False
6
+ fixed_length: True
7
+ resume: Null
8
+ benchmark: False
9
+ num_views : 12
10
+ num_test_views : 4
11
+ n_corres_train: 0
12
+ n_corres_test: 0
13
+
14
+ train_criterion: DistillLoss()
15
+ test_criterion: DistillLoss()
16
+ allow_repeat: False
17
+
18
+ dataset3: ARKitScenes_Multi(allow_repeat=${allow_repeat}, split='train', ROOT='../data/train/processed_arkitscenes/',
19
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)],
20
+ transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
21
+ dataset5: ScanNetpp_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../data/train/processed_scannetpp/",
22
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)],
23
+ transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
24
+ dataset6: ScanNet_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../data/train/processed_scannet/",
25
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)],
26
+ transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
27
+ dataset7: HyperSim_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../data/train/hypersim",
28
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)],
29
+ transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
30
+ dataset8: BlendedMVS_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../data/train/processed_blendedmvs/",
31
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)],
32
+ transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
33
+ dataset9: MegaDepth_Multi(allow_repeat=${allow_repeat}, split="train", ROOT="../data/train/processed_megadepth",
34
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)],
35
+ transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
36
+
37
+ dataset14: MVS_Synth_Multi(allow_repeat=${allow_repeat}, split='train', ROOT="../data/train/processed_mvs_synth",
38
+ aug_crop=16, resolution=[(518, 392), (518, 336), (518, 294), (518, 266), (518, 210), (518, 154)],
39
+ transform=SeqColorJitter, num_views=${num_views}, n_corres=${n_corres_train})
40
+
41
+
42
+ train_dataset: 4500 @ ${dataset3} + 2250 @ ${dataset5} + 4500 @ ${dataset6} + 1200 @ ${dataset7} + 2250 @ ${dataset8} + 2250 @ ${dataset9} + 450 @ ${dataset14}
43
+
44
+ test_dataset: 1000 @ ARKitScenes_Multi(split='test', ROOT='../data/train/processed_arkitscenes/', resolution=(518, 392), num_views=${num_test_views}, seed=42, n_corres=${n_corres_test})
45
+
46
+
47
+ seed: 0
48
+ batch_size: 1
49
+ accum_iter: 1
50
+ gradient_checkpointing: False
51
+ epochs: 10
52
+ start_epoch: 0
53
+ start_step: 0
54
+ weight_decay: 0.05
55
+
56
+ lr: 1e-5
57
+ min_lr: 1e-8
58
+
59
+
60
+ warmup_epochs: 0.5
61
+ amp: 1
62
+
63
+ num_workers: 4 # 12
64
+ world_size: 1
65
+ local-rank: -1
66
+ dist_url: 'env://'
67
+ rank: 0
68
+ gpu: 0
69
+ distributed: False
70
+ dist_backend: 'nccl'
71
+
72
+ eval_freq: 1
73
+ save_freq: 0.1
74
+ max_checkpoints: 10
75
+ keep_freq: 1
76
+ print_freq: 10
77
+ print_img_freq: 50000000
78
+ num_imgs_vis: 4
79
+ save_dir: '../checkpoints'
80
+
81
+ exp_name: 'SLAMFormer_v1'
82
+
83
+
84
+
85
+
86
+ task: 'StreamVGGT'
87
+ logdir: ${save_dir}/${exp_name}/logs
88
+ output_dir: ${save_dir}/${exp_name}/
89
+ hydra:
90
+ verbose: True
91
+ run:
92
+ dir: ${save_dir}/${exp_name}
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/environment.yml ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: SLAM-Former
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - bzip2=1.0.8=h5eee18b_6
8
+ - ca-certificates=2025.12.2=h06a4308_0
9
+ - expat=2.7.4=h7354ed3_0
10
+ - ld_impl_linux-64=2.44=h153f514_2
11
+ - libexpat=2.7.4=h7354ed3_0
12
+ - libffi=3.4.4=h6a678d5_1
13
+ - libgcc=15.2.0=h69a1729_7
14
+ - libgcc-ng=15.2.0=h166f726_7
15
+ - libgomp=15.2.0=h4751f2c_7
16
+ - libnsl=2.0.0=h5eee18b_0
17
+ - libstdcxx=15.2.0=h39759b7_7
18
+ - libstdcxx-ng=15.2.0=hc03a8fd_7
19
+ - libuuid=1.41.5=h5eee18b_0
20
+ - libxcb=1.17.0=h9b100fa_0
21
+ - libzlib=1.3.1=hb25bd0a_0
22
+ - ncurses=6.5=h7934f7d_0
23
+ - openssl=3.5.5=h1b28b03_0
24
+ - packaging=25.0=py311h06a4308_1
25
+ - pip=26.0.1=pyhc872135_0
26
+ - pthread-stubs=0.3=h0ce48e5_1
27
+ - python=3.11.14=h6fa692b_0
28
+ - readline=8.3=hc2a1206_0
29
+ - setuptools=80.10.2=py311h06a4308_0
30
+ - sqlite=3.51.1=h3e8d24a_1
31
+ - tk=8.6.15=h54e0aa7_0
32
+ - tzdata=2026a=he532380_0
33
+ - wheel=0.46.3=py311h06a4308_0
34
+ - xorg-libx11=1.8.12=h9b100fa_1
35
+ - xorg-libxau=1.0.12=h9b100fa_0
36
+ - xorg-libxdmcp=1.1.5=h9b100fa_0
37
+ - xorg-xorgproto=2024.1=h5eee18b_1
38
+ - xz=5.8.2=h448239c_0
39
+ - zlib=1.3.1=hb25bd0a_0
40
+ - pip:
41
+ - absl-py==2.4.0
42
+ - accelerate==1.13.0
43
+ - addict==2.4.0
44
+ - aiofiles==24.1.0
45
+ - annotated-doc==0.0.4
46
+ - annotated-types==0.7.0
47
+ - antlr4-python3-runtime==4.9.3
48
+ - anyio==4.12.1
49
+ - argcomplete==3.6.3
50
+ - asttokens==3.0.1
51
+ - attrs==25.4.0
52
+ - beartype==0.22.9
53
+ - blinker==1.9.0
54
+ - brotli==1.2.0
55
+ - ccimport==0.4.4
56
+ - certifi==2026.2.25
57
+ - charset-normalizer==3.4.5
58
+ - click==8.3.1
59
+ - colorama==0.4.6
60
+ - colorlog==6.10.1
61
+ - comm==0.2.3
62
+ - configargparse==1.7.3
63
+ - contourpy==1.3.3
64
+ - cumm-cu124==0.7.11
65
+ - cycler==0.12.1
66
+ - dacite==1.9.2
67
+ - dash==4.0.0
68
+ - decorator==4.4.2
69
+ - einops==0.8.2
70
+ - embreex==2.17.7.post7
71
+ - evo==1.34.3
72
+ - executing==2.2.1
73
+ - fast-pytorch-kmeans==0.2.2
74
+ - fastapi==0.135.1
75
+ - fastjsonschema==2.21.2
76
+ - ffmpy==1.0.0
77
+ - filelock==3.25.0
78
+ - fire==0.7.1
79
+ - flask==3.1.3
80
+ - fonttools==4.61.1
81
+ - fsspec==2026.2.0
82
+ - gradio==6.9.0
83
+ - gradio-client==2.3.0
84
+ - groovy==0.1.2
85
+ - grpcio==1.78.0
86
+ - gsplat==1.5.3
87
+ - h11==0.16.0
88
+ - h5py==3.16.0
89
+ - hf-xet==1.3.2
90
+ - httpcore==1.0.9
91
+ - httpx==0.28.1
92
+ - huggingface-hub==1.6.0
93
+ - hydra-core==1.3.2
94
+ - idna==3.11
95
+ - imageio==2.37.2
96
+ - imageio-ffmpeg==0.6.0
97
+ - importlib-metadata==8.7.1
98
+ - ipython==9.10.0
99
+ - ipython-pygments-lexers==1.1.1
100
+ - ipywidgets==8.1.8
101
+ - itsdangerous==2.2.0
102
+ - jaxtyping==0.3.9
103
+ - jedi==0.19.2
104
+ - jinja2==3.1.6
105
+ - joblib==1.5.3
106
+ - jsonschema==4.26.0
107
+ - jsonschema-specifications==2025.9.1
108
+ - jupyter-core==5.9.1
109
+ - jupyterlab-widgets==3.0.16
110
+ - kiwisolver==1.4.9
111
+ - kornia==0.8.2
112
+ - kornia-rs==0.1.10
113
+ - lark==1.3.1
114
+ - lpips==0.1.4
115
+ - lxml==6.0.2
116
+ - lz4==4.4.5
117
+ - manifold3d==3.4.0
118
+ - mapbox-earcut==2.0.0
119
+ - markdown==3.10.2
120
+ - markdown-it-py==4.0.0
121
+ - markupsafe==3.0.3
122
+ - matplotlib==3.10.8
123
+ - matplotlib-inline==0.2.1
124
+ - mdurl==0.1.2
125
+ - moviepy==1.0.3
126
+ - mpmath==1.3.0
127
+ - msgspec==0.20.0
128
+ - narwhals==2.17.0
129
+ - natsort==8.4.0
130
+ - nbformat==5.10.4
131
+ - nest-asyncio==1.6.0
132
+ - networkx==3.6.1
133
+ - ninja==1.13.0
134
+ - numexpr==2.14.1
135
+ - numpy==2.2.6
136
+ - nvidia-cublas-cu12==12.8.4.1
137
+ - nvidia-cuda-cupti-cu12==12.8.90
138
+ - nvidia-cuda-nvrtc-cu12==12.8.93
139
+ - nvidia-cuda-runtime-cu12==12.8.90
140
+ - nvidia-cudnn-cu12==9.10.2.21
141
+ - nvidia-cufft-cu12==11.3.3.83
142
+ - nvidia-cufile-cu12==1.13.1.3
143
+ - nvidia-curand-cu12==10.3.9.90
144
+ - nvidia-cusolver-cu12==11.7.3.90
145
+ - nvidia-cusparse-cu12==12.5.8.93
146
+ - nvidia-cusparselt-cu12==0.7.1
147
+ - nvidia-nccl-cu12==2.27.5
148
+ - nvidia-nvjitlink-cu12==12.8.93
149
+ - nvidia-nvshmem-cu12==3.3.20
150
+ - nvidia-nvtx-cu12==12.8.90
151
+ - omegaconf==2.3.0
152
+ - open3d==0.18.0
153
+ - opencv-python==4.13.0.92
154
+ - orjson==3.11.7
155
+ - pandas==3.0.1
156
+ - parso==0.8.6
157
+ - pccm==0.4.16
158
+ - pexpect==4.9.0
159
+ - pillow==12.1.1
160
+ - platformdirs==4.9.4
161
+ - plotly==6.6.0
162
+ - plyfile==1.1.3
163
+ - portalocker==3.2.0
164
+ - proglog==0.1.12
165
+ - prompt-toolkit==3.0.52
166
+ - protobuf==7.34.0
167
+ - psutil==7.2.2
168
+ - ptyprocess==0.7.0
169
+ - pure-eval==0.2.3
170
+ - pyarrow==23.0.1
171
+ - pybind11==3.0.3
172
+ - pycollada==0.9.3
173
+ - pydantic==2.12.5
174
+ - pydantic-core==2.41.5
175
+ - pydub==0.25.1
176
+ - pyglet==1.5.31
177
+ - pygments==2.19.2
178
+ - pyparsing==3.3.2
179
+ - pyquaternion==0.9.9
180
+ - python-dateutil==2.9.0.post0
181
+ - python-multipart==0.0.22
182
+ - pytz==2026.1.post1
183
+ - pyyaml==6.0.3
184
+ - referencing==0.37.0
185
+ - regex==2026.2.28
186
+ - requests==2.32.5
187
+ - rerun-sdk==0.30.1
188
+ - retrying==1.4.2
189
+ - rich==14.3.3
190
+ - roma==1.5.6
191
+ - rosbags==0.11.0
192
+ - rpds-py==0.30.0
193
+ - rtree==1.4.1
194
+ - ruamel-yaml==0.19.1
195
+ - safehttpx==0.1.7
196
+ - safetensors==0.7.0
197
+ - scikit-learn==1.8.0
198
+ - scipy==1.17.1
199
+ - seaborn==0.13.2
200
+ - semantic-version==2.10.0
201
+ - shapely==2.1.2
202
+ - shellingham==1.5.4
203
+ - six==1.17.0
204
+ - slamformer==0.1.0
205
+ - spconv-cu124==2.3.8
206
+ - stack-data==0.6.3
207
+ - starlette==0.52.1
208
+ - svg-path==7.0
209
+ - sympy==1.14.0
210
+ - tensorboard==2.20.0
211
+ - tensorboard-data-server==0.7.2
212
+ - termcolor==3.3.0
213
+ - threadpoolctl==3.6.0
214
+ - timm==1.0.26
215
+ - tokenizers==0.22.2
216
+ - tomlkit==0.13.3
217
+ - torch==2.9.1
218
+ - torch-cluster==1.6.3+pt25cu124
219
+ - torch-scatter==2.1.2+pt25cu124
220
+ - torch-sparse==0.6.18+pt25cu124
221
+ - torch-spline-conv==1.2.2+pt25cu124
222
+ - torchvision==0.24.1
223
+ - tqdm==4.67.3
224
+ - traitlets==5.14.3
225
+ - transformers==5.3.0
226
+ - trimesh==4.11.3
227
+ - triton==3.5.1
228
+ - typer==0.24.1
229
+ - typing-extensions==4.15.0
230
+ - typing-inspection==0.4.2
231
+ - urllib3==2.6.3
232
+ - uvicorn==0.41.0
233
+ - vhacdx==0.0.10
234
+ - viser==1.0.24
235
+ - wadler-lindig==0.1.7
236
+ - wcwidth==0.6.0
237
+ - websockets==15.0.1
238
+ - werkzeug==3.1.6
239
+ - widgetsnbextension==4.0.15
240
+ - xxhash==3.6.0
241
+ - yapf==0.43.0
242
+ - yourdfpy==0.0.60
243
+ - zipp==3.23.0
244
+ - zstandard==0.25.0
245
+ prefix: /var/scratch/qzhang2/miniconda3/envs/SLAM-Former
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/eval_ate_scaled.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import numpy as np
4
+
5
+ try:
6
+ import evo
7
+ from evo.core import metrics
8
+ import evo.main_ape as main_ape
9
+ from evo.core.metrics import PoseRelation
10
+ from evo.core.trajectory import PosePath3D
11
+ from evo.tools import file_interface
12
+ import evo.core.sync as sync
13
+ HAS_EVO = True
14
+ except ImportError:
15
+ HAS_EVO = False
16
+ print("EVO not found. Please install evo: pip install evo")
17
+ exit(1)
18
+
19
# Root of the TUM RGB-D sequences and the directory holding per-sequence results.
TUM_DIR = "/var/scratch/qzhang2/Feature-SLAM/datasets/tum"
RESULTS_DIR = os.environ.get("RESULTS_DIR", "./tum_results")

# Every sub-directory of RESULTS_DIR is treated as one evaluated sequence.
sequences = [d for d in os.listdir(RESULTS_DIR) if os.path.isdir(os.path.join(RESULTS_DIR, d))]

# Table header: unscaled (SE3-aligned) vs scale-aligned (Sim3) ATE columns.
print(f"{'Sequence':<40} | {'ATE (m) [Unscaled]':<20} | {'ATE (m) [Scale Aligned]':<20}")
print("-" * 85)
26
+
27
def get_ate(est_file, gt_file, align_scale=False):
    """Compute the translation ATE RMSE (meters) between two TUM-format trajectories.

    Parameters
    ----------
    est_file : str
        Path to the estimated trajectory (TUM format: timestamp tx ty tz qx qy qz qw).
    gt_file : str
        Path to the ground-truth trajectory in the same format.
    align_scale : bool
        If True, align with Sim(3) (rotation, translation, and scale);
        otherwise SE(3) only.

    Returns
    -------
    float | str
        The ATE RMSE on success, or an ``"Error: ..."`` string on failure —
        the caller formats either case into the results table.
    """
    try:
        traj_ref = file_interface.read_tum_trajectory_file(gt_file)
        traj_est = file_interface.read_tum_trajectory_file(est_file)
        # Match poses by timestamp before computing any metric.
        traj_ref, traj_est = sync.associate_trajectories(traj_ref, traj_est)

        # FIX: main_ape.ape(align=True) already performs the Umeyama (Sim3/SE3)
        # alignment internally, so the previous explicit traj_est.align(...)
        # call was redundant work and has been removed; the reported RMSE is
        # unchanged because the optimal alignment does not depend on the
        # trajectory's starting pose.
        result = main_ape.ape(
            traj_ref,
            traj_est,
            pose_relation=PoseRelation.translation_part,
            align=True,
            correct_scale=align_scale,
        )
        return result.stats["rmse"]
    except Exception as e:
        # Best-effort: surface the failure as a printable string.
        return f"Error: {e}"
40
+
41
# Evaluate every sequence twice: without and with scale alignment.
for seq in sorted(sequences):
    est_file = os.path.join(RESULTS_DIR, seq, f"final_traj.txt")
    gt_file = os.path.join(TUM_DIR, seq, "groundtruth.txt")

    if os.path.exists(est_file) and os.path.exists(gt_file):
        ate_unscaled = get_ate(est_file, gt_file, align_scale=False)
        ate_scaled = get_ate(est_file, gt_file, align_scale=True)

        # get_ate returns a float on success or an error string on failure.
        unscaled_str = f"{ate_unscaled:.4f}" if isinstance(ate_unscaled, float) else str(ate_unscaled)
        scaled_str = f"{ate_scaled:.4f}" if isinstance(ate_scaled, float) else str(ate_scaled)

        print(f"{seq:<40} | {unscaled_str:<20} | {scaled_str:<20}")
    else:
        print(f"{seq:<40} | Missing files or still running")
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/get_ate.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import numpy as np
4
+
5
+ try:
6
+ import evo
7
+ from evo.core import metrics
8
+ import evo.main_ape as main_ape
9
+ from evo.core.metrics import PoseRelation
10
+ from evo.core.trajectory import PosePath3D
11
+ from evo.tools import file_interface
12
+ import evo.core.sync as sync
13
+ HAS_EVO = True
14
+ except ImportError:
15
+ HAS_EVO = False
16
+ print("EVO not found, using simple ATE calculation")
17
+
18
# Root of the TUM RGB-D sequences and the directory holding per-sequence results.
TUM_DIR = "/var/scratch/qzhang2/Feature-SLAM/datasets/tum"
RESULTS_DIR = os.environ.get("RESULTS_DIR", "./tum_results")

# Every sub-directory of RESULTS_DIR is treated as one evaluated sequence.
sequences = [d for d in os.listdir(RESULTS_DIR) if os.path.isdir(os.path.join(RESULTS_DIR, d))]

print(f"{'Sequence':<40} | {'ATE (m)':<15}")
print("-" * 58)
25
+
26
def get_ate(est_file, gt_file):
    """Compute the translation ATE RMSE (meters) between TUM-format trajectories.

    Uses evo (SE(3) alignment, no scale correction) when available; otherwise
    falls back to a simple nearest-timestamp RMSE with no spatial alignment.

    Returns a float on success, or a human-readable string on failure /
    when no timestamp matches are found — the caller formats either case.
    """
    if HAS_EVO:
        try:
            traj_ref = file_interface.read_tum_trajectory_file(gt_file)
            traj_est = file_interface.read_tum_trajectory_file(est_file)
            # Match poses by timestamp, then SE(3)-align the estimate.
            traj_ref, traj_est = sync.associate_trajectories(traj_ref, traj_est)
            traj_est.align(traj_ref, correct_scale=False)

            result = main_ape.ape(traj_ref, traj_est, pose_relation=PoseRelation.translation_part, align=True, correct_scale=False)
            return result.stats["rmse"]
        except Exception as e:
            return f"Error: {e}"
    else:
        # Fallback to simple ATE if evo is not available
        try:
            # ndmin=2 keeps the arrays 2-D even when a file holds a single pose,
            # so the column indexing below does not break.
            est_data = np.loadtxt(est_file, ndmin=2)
            gt_data = np.loadtxt(gt_file, ndmin=2)

            # Simple timestamp matching (no spatial alignment).
            ate_sum = 0.0
            count = 0

            for est in est_data:
                ts = est[0]
                # Find closest gt timestamp
                idx = np.argmin(np.abs(gt_data[:, 0] - ts))
                if np.abs(gt_data[idx, 0] - ts) < 0.1:  # 100ms threshold
                    diff = est[1:4] - gt_data[idx, 1:4]
                    ate_sum += np.sum(diff**2)
                    count += 1

            if count > 0:
                return np.sqrt(ate_sum / count)
            return "No matches"
        except Exception as e:
            # BUG FIX: previously returned the bare string f"Error" (placeholder-less
            # f-string, exception discarded); now report the detail like the evo branch.
            return f"Error: {e}"
62
+
63
# Print one ATE row per sequence; get_ate returns a float or an error string.
for seq in sorted(sequences):
    est_file = os.path.join(RESULTS_DIR, seq, f"final_traj.txt")
    gt_file = os.path.join(TUM_DIR, seq, "groundtruth.txt")

    if os.path.exists(est_file) and os.path.exists(gt_file):
        ate = get_ate(est_file, gt_file)
        if isinstance(ate, float):
            print(f"{seq:<40} | {ate:.4f}")
        else:
            print(f"{seq:<40} | {ate}")
    else:
        print(f"{seq:<40} | Missing files or running")
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/publish_submap.sh ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Export a curated subset of this repository into PUBLISH_DIR and force-push it
# to TARGET_REPO:BRANCH on GitHub, authenticating via a token-based askpass helper.
set -euo pipefail

usage() {
    cat <<'EOF'
Usage:
  GITHUB_TOKEN=... ./publish_submap.sh [commit message]

Optional env vars:
  SOURCE_ROOT     Source repository root (default: current git top-level)
  PUBLISH_DIR     Export directory (default: /var/scratch/qzhang2/e2e-semantic-SLAM-publish)
  TARGET_REPO     GitHub repo in owner/name form (default: SlamMate/e2e-semantic-SLAM)
  BRANCH          Git branch to push (default: submap)
  GH_TOKEN        Alternative to GITHUB_TOKEN
  GIT_USER_NAME   Commit author name override
  GIT_USER_EMAIL  Commit author email override
EOF
}

if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
    usage
    exit 0
fi

# Commit message precedence: CLI args, then $COMMIT_MSG, then timestamped default.
if [[ $# -gt 0 ]]; then
    COMMIT_MSG="$*"
else
    COMMIT_MSG="${COMMIT_MSG:-Auto export: $(date +%F_%H%M%S)}"
fi

SOURCE_ROOT="${SOURCE_ROOT:-$(git rev-parse --show-toplevel 2>/dev/null || pwd)}"
PUBLISH_DIR="${PUBLISH_DIR:-/var/scratch/qzhang2/e2e-semantic-SLAM-publish}"
TARGET_REPO="${TARGET_REPO:-SlamMate/e2e-semantic-SLAM}"
BRANCH="${BRANCH:-submap}"
TOKEN="${GITHUB_TOKEN:-${GH_TOKEN:-}}"

if [[ -z "${TOKEN}" ]]; then
    echo "Missing GITHUB_TOKEN (or GH_TOKEN)." >&2
    exit 1
fi

# PUBLISH_DIR is rm -rf'ed below, so only allow disposable tmp/scratch locations.
case "${PUBLISH_DIR}" in
    /tmp/*|/var/tmp/*|/var/scratch/*) ;;
    *)
        echo "Refusing to use unsafe PUBLISH_DIR: ${PUBLISH_DIR}" >&2
        exit 1
        ;;
esac

if [[ ! -d "${SOURCE_ROOT}" ]]; then
    echo "SOURCE_ROOT does not exist: ${SOURCE_ROOT}" >&2
    exit 1
fi

rm -rf "${PUBLISH_DIR}"
mkdir -p "${PUBLISH_DIR}"

# Copy a single file into the export root; silently skipped when missing.
copy_file() {
    local rel="$1"
    if [[ -f "${SOURCE_ROOT}/${rel}" ]]; then
        rsync -a "${SOURCE_ROOT}/${rel}" "${PUBLISH_DIR}/"
    fi
}

# Copy a directory tree, excluding Python bytecode caches.
copy_dir() {
    local rel="$1"
    if [[ -d "${SOURCE_ROOT}/${rel}" ]]; then
        mkdir -p "${PUBLISH_DIR}/${rel}"
        rsync -a \
            --exclude '__pycache__/' \
            --exclude '*.pyc' \
            --exclude '*.pyo' \
            "${SOURCE_ROOT}/${rel}/" "${PUBLISH_DIR}/${rel}/"
    fi
}

# Allow-list of files and directories that make up the public export.
for file in \
    .gitignore \
    README.md \
    README_submap.md \
    README_cluster_migration.md \
    requirements.txt \
    setup.py \
    setup_env.sh \
    run_tum.sh \
    run_tum_top5.sh \
    eval_ate_scaled.py \
    get_ate.py \
    submap_handoff.md \
    publish_submap.sh
do
    copy_file "${file}"
done

for dir in cloud_opt config slam src; do
    copy_dir "${dir}"
done

# Sanity check: a README.md must have been exported before anything is pushed.
if [[ ! -f "${PUBLISH_DIR}/README.md" ]]; then
    echo "Export did not produce README.md; aborting." >&2
    exit 1
fi

git -C "${PUBLISH_DIR}" init >/dev/null
git -C "${PUBLISH_DIR}" checkout -b "${BRANCH}" >/dev/null

GIT_USER_NAME="${GIT_USER_NAME:-$(git config --global user.name 2>/dev/null || echo Cascade)}"
GIT_USER_EMAIL="${GIT_USER_EMAIL:-$(git config --global user.email 2>/dev/null || echo cascade@example.com)}"
git -C "${PUBLISH_DIR}" config user.name "${GIT_USER_NAME}"
git -C "${PUBLISH_DIR}" config user.email "${GIT_USER_EMAIL}"

git -C "${PUBLISH_DIR}" add .
if git -C "${PUBLISH_DIR}" diff --cached --quiet; then
    echo "No changes to commit in ${PUBLISH_DIR}."
else
    git -C "${PUBLISH_DIR}" commit -m "${COMMIT_MSG}"
fi

git -C "${PUBLISH_DIR}" remote remove origin >/dev/null 2>&1 || true
git -C "${PUBLISH_DIR}" remote add origin "https://github.com/${TARGET_REPO}.git"

# Temporary GIT_ASKPASS helper supplies the token so it never appears in the
# remote URL or on a process command line.
ASKPASS_HELPER="$(mktemp)"
cat >"${ASKPASS_HELPER}" <<'EOF'
#!/usr/bin/env bash
case "$1" in
    *Username*) printf '%s\n' 'x-access-token' ;;
    *Password*) printf '%s\n' "${GIT_TOKEN}" ;;
    *) printf '%s\n' '' ;;
esac
EOF
chmod 700 "${ASKPASS_HELPER}"
trap 'rm -f "${ASKPASS_HELPER}"' EXIT

GIT_TOKEN="${TOKEN}" GIT_ASKPASS="${ASKPASS_HELPER}" GIT_TERMINAL_PROMPT=0 \
    git -C "${PUBLISH_DIR}" push --force-with-lease -u origin "${BRANCH}"

echo "Published ${SOURCE_ROOT} -> ${TARGET_REPO} [${BRANCH}]"
echo "Export dir: ${PUBLISH_DIR}"
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.9.1
2
+ torchvision==0.24.1
3
+ numpy==2.2.6
4
+ Pillow
5
+ huggingface_hub
6
+ safetensors
7
+ roma
8
+ gradio
9
+ matplotlib
10
+ tqdm
11
+ opencv-python
12
+ scipy
13
+ einops
14
+ trimesh
15
+ tensorboard
16
+ pyglet<2
17
+ viser
18
+ # gradio  (duplicate entry commented out; gradio is already listed above)
19
+ lpips
20
+ hydra-core
21
+ h5py
22
+ accelerate
23
+ transformers
24
+ scikit-learn
25
+ gsplat
26
+ evo
27
+ open3d
28
+ rerun-sdk
29
+ kornia
30
+ moviepy==1.0.3
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/run_tum.sh ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=slam_tum
#SBATCH --output=slam_tum_%j.out
#SBATCH --partition=defq
#SBATCH --nodes=1
#SBATCH --gpus=1
#SBATCH --time=12:00:00

# Run slam/demo_submap.py over every TUM freiburg1 sequence with a given
# checkpoint; per-sequence results land under RESULT_ROOT/<run tag>/<sequence>.
source /var/scratch/qzhang2/miniconda3/etc/profile.d/conda.sh
conda activate SLAM-Former

module load cuda12.1/toolkit/12.1
# Try with a different cudnn or skip it if it's causing issues. Let's not load cudnn module as it might conflict with pytorch's built-in cudnn
# module load cuDNN/cuda12.1/9.1.0.70
export CC=/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc
export CXX=/opt/ohpc/pub/compiler/gcc/9.4.0/bin/g++

# All knobs are env-overridable; the defaults reproduce the "full" inference mode.
TUM_DIR="/var/scratch/qzhang2/Feature-SLAM/datasets/tum"
CKPT_PATH="${CKPT_PATH:-/var/scratch/qzhang2/SLAM-Former/ckpt/checkpoint-10.pth.model}"
RESULT_ROOT="${RESULT_ROOT:-./tum_results_aligned}"
SUBMAP_INFERENCE_MODE="${SUBMAP_INFERENCE_MODE:-full}"
LOOP_MASK_MODE="${LOOP_MASK_MODE:-soft_all}"
SUBMAP_TRAIN_MODE="${SUBMAP_TRAIN_MODE:-top5_dual_queue}"
SUBMAP_RETRIEVAL_TOPK="${SUBMAP_RETRIEVAL_TOPK:-5}"
SUBMAP_FETCH_SOURCE="${SUBMAP_FETCH_SOURCE:-frontend}"
SUBMAP_DESCRIPTOR_SOURCE="${SUBMAP_DESCRIPTOR_SOURCE:-frontend}"
MAX_RECURSIVE_SUBMAPS="${MAX_RECURSIVE_SUBMAPS:-5}"
# Stable run tag: checkpoint parent dir + basename (extensions stripped) +
# a short sha1 of the full checkpoint path to disambiguate same-named files.
CKPT_NAME="$(basename "$CKPT_PATH")"
CKPT_NAME="${CKPT_NAME%.pth.model}"
CKPT_NAME="${CKPT_NAME%.pth}"
CKPT_PARENT="$(basename "$(dirname "$CKPT_PATH")")"
CKPT_HASH="$(printf '%s' "$CKPT_PATH" | sha1sum | cut -c1-8)"
RUN_TAG="${RUN_TAG:-${CKPT_PARENT}__${CKPT_NAME}__${CKPT_HASH}}"
OUT_DIR="${OUT_DIR:-${RESULT_ROOT}/${RUN_TAG}}"
mkdir -p "$OUT_DIR"

# "full" passes no extra flags; "top5" forwards the submap-retrieval options.
case "$SUBMAP_INFERENCE_MODE" in
    full)
        DEMO_ARGS=()
        ;;
    top5)
        DEMO_ARGS=(
            --loop_mask_mode "$LOOP_MASK_MODE"
            --submap_train_mode "$SUBMAP_TRAIN_MODE"
            --submap_retrieval_topk "$SUBMAP_RETRIEVAL_TOPK"
            --submap_fetch_source "$SUBMAP_FETCH_SOURCE"
            --submap_descriptor_source "$SUBMAP_DESCRIPTOR_SOURCE"
            --max_recursive_submaps "$MAX_RECURSIVE_SUBMAPS"
        )
        ;;
    *)
        echo "Unknown SUBMAP_INFERENCE_MODE=$SUBMAP_INFERENCE_MODE (expected full or top5)" >&2
        exit 1
        ;;
esac

echo "Checkpoint: $CKPT_PATH"
echo "Run tag: $RUN_TAG"
echo "Output root: $OUT_DIR"
echo "Submap inference mode: $SUBMAP_INFERENCE_MODE"

for seq in "$TUM_DIR"/rgbd_dataset_freiburg1_*; do
    if [ -d "$seq/rgb" ]; then
        seq_name=$(basename "$seq")
        echo "======================================"
        echo "Running on $seq_name..."
        echo "======================================"

        # The demo expects the image folder which contains images. For TUM it is the 'rgb' folder.
        python slam/demo_submap.py \
            --ckpt_path "$CKPT_PATH" \
            --image_folder "$seq/rgb" \
            --output_dir "$OUT_DIR/$seq_name" \
            --target_size 518 \
            "${DEMO_ARGS[@]}"
    fi
done
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/run_tum_top5.sh ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=slam_tum_top5
#SBATCH --output=slam_tum_top5_%j.out
#SBATCH --partition=defq
#SBATCH --nodes=1
#SBATCH --gpus=1
#SBATCH --time=12:00:00

# Top-5 submap-retrieval variant of run_tum.sh: runs slam/demo_submap.py over
# every TUM freiburg1 sequence with the retrieval flags always enabled.
source /var/scratch/qzhang2/miniconda3/etc/profile.d/conda.sh
conda activate SLAM-Former

module load cuda12.1/toolkit/12.1
# Try with a different cudnn or skip it if it's causing issues. Let's not load cudnn module as it might conflict with pytorch's built-in cudnn
# module load cuDNN/cuda12.1/9.1.0.70
export CC=/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc
export CXX=/opt/ohpc/pub/compiler/gcc/9.4.0/bin/g++

# All knobs are env-overridable.
TUM_DIR="/var/scratch/qzhang2/Feature-SLAM/datasets/tum"
CKPT_PATH="${CKPT_PATH:-/var/scratch/qzhang2/SLAM-Former/checkpoints/local_cluster_nv24_sub6/submap_only_pseudo_gt_high_recall_smoke/paper_local_submap_only_pseudo_gt_high_recall_smoke_nv24_sub6/checkpoint-last.pth}"
RESULT_ROOT="${RESULT_ROOT:-./tum_results_aligned_top5}"
SUBMAP_INFERENCE_MODE="${SUBMAP_INFERENCE_MODE:-top5}"
LOOP_MASK_MODE="${LOOP_MASK_MODE:-soft_all}"
SUBMAP_TRAIN_MODE="${SUBMAP_TRAIN_MODE:-top5_dual_queue}"
SUBMAP_RETRIEVAL_TOPK="${SUBMAP_RETRIEVAL_TOPK:-5}"
SUBMAP_FETCH_SOURCE="${SUBMAP_FETCH_SOURCE:-frontend}"
SUBMAP_DESCRIPTOR_SOURCE="${SUBMAP_DESCRIPTOR_SOURCE:-frontend}"
MAX_RECURSIVE_SUBMAPS="${MAX_RECURSIVE_SUBMAPS:-5}"
# Stable run tag: checkpoint parent dir + basename (extensions stripped) +
# inference mode + a short sha1 of the full checkpoint path.
CKPT_NAME="$(basename "$CKPT_PATH")"
CKPT_NAME="${CKPT_NAME%.pth.model}"
CKPT_NAME="${CKPT_NAME%.pth}"
CKPT_PARENT="$(basename "$(dirname "$CKPT_PATH")")"
CKPT_HASH="$(printf '%s' "$CKPT_PATH" | sha1sum | cut -c1-8)"
RUN_TAG="${RUN_TAG:-${CKPT_PARENT}__${CKPT_NAME}__${SUBMAP_INFERENCE_MODE}__${CKPT_HASH}}"
OUT_DIR="${OUT_DIR:-${RESULT_ROOT}/${RUN_TAG}}"
mkdir -p "$OUT_DIR"

# Retrieval flags forwarded to slam/demo_submap.py on every sequence.
DEMO_ARGS=(
    --loop_mask_mode "$LOOP_MASK_MODE"
    --submap_train_mode "$SUBMAP_TRAIN_MODE"
    --submap_retrieval_topk "$SUBMAP_RETRIEVAL_TOPK"
    --submap_fetch_source "$SUBMAP_FETCH_SOURCE"
    --submap_descriptor_source "$SUBMAP_DESCRIPTOR_SOURCE"
    --max_recursive_submaps "$MAX_RECURSIVE_SUBMAPS"
)

echo "Checkpoint: $CKPT_PATH"
echo "Run tag: $RUN_TAG"
echo "Output root: $OUT_DIR"
echo "Submap inference mode: $SUBMAP_INFERENCE_MODE"
echo "Comparative baseline: run_tum.sh with SUBMAP_INFERENCE_MODE=full and the same CKPT_PATH"

for seq in "$TUM_DIR"/rgbd_dataset_freiburg1_*; do
    if [ -d "$seq/rgb" ]; then
        seq_name=$(basename "$seq")
        echo "======================================"
        echo "Running on $seq_name..."
        echo "======================================"

        # demo_submap.py takes the sequence's 'rgb' image folder directly.
        python slam/demo_submap.py \
            --ckpt_path "$CKPT_PATH" \
            --image_folder "$seq/rgb" \
            --output_dir "$OUT_DIR/$seq_name" \
            --target_size 518 \
            "${DEMO_ARGS[@]}"
    fi
done
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/setup.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
from setuptools import setup, find_packages

# Minimal packaging shim so `pip install -e .` exposes the repo as 'slamformer'.
setup(
    name='slamformer',
    version='0.1.0',
    description='SLAM-Former: Putting SLAM into One Transformer.',
    # NOTE(review): find_packages() include patterns match dotted package
    # names; 'src/slamformer' contains a path separator and likely never
    # matches — confirm whether package_dir={'': 'src'} plus 'slamformer*'
    # was intended.
    packages=find_packages(include=['evals', 'evals.*', 'src/slamformer', 'src/slamformer.*']),
)
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/setup_env.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=setup_slam
#SBATCH --output=setup_slam.out
#SBATCH --partition=defq
#SBATCH --nodes=1
#SBATCH --gpus=1
#SBATCH --time=02:00:00

# One-off environment bootstrap job: activate the SLAM-Former conda env,
# load the cluster CUDA/cuDNN toolchain, then install this repo editable.
source /var/scratch/qzhang2/miniconda3/etc/profile.d/conda.sh
conda activate SLAM-Former

module load cuda12.1/toolkit/12.1
module load cuDNN/cuda12.1/9.1.0.70
# Build any native extensions with the cluster's GCC 9.4 toolchain.
export CC=/opt/ohpc/pub/compiler/gcc/9.4.0/bin/gcc
export CXX=/opt/ohpc/pub/compiler/gcc/9.4.0/bin/g++

pip install -r requirements.txt
pip install -e .
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/__init__.py ADDED
File without changes
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/audit_dataset_num_views.py ADDED
@@ -0,0 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import csv
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Callable
8
+
9
+ import numpy as np
10
+
11
+ try:
12
+ import h5py
13
+ except Exception:
14
+ h5py = None
15
+
16
+
17
+ README_STATUS = {
18
+ "ARKitScenes": "released",
19
+ "ScanNet": "released",
20
+ "ScanNet++": "coming_soon_in_readme",
21
+ "HyperSim": "released_separate_hf",
22
+ "BlendedMVS": "coming_soon_in_readme",
23
+ "MegaDepth": "coming_soon_in_readme",
24
+ "MVS-Synth": "released",
25
+ }
26
+
27
+ THRESHOLDS = [24, 32, 48, 64]
28
+
29
+
30
def retention_str(count: int | None, total: int | None) -> str:
    """Render ``count/total`` with a one-decimal percentage, or "N/A" when undefined."""
    undefined = count is None or total is None or total == 0
    if undefined:
        return "N/A"
    pct = 100.0 * count / total
    return f"{count}/{total} ({pct:.1f}%)"
34
+
35
+
36
def summarize_caps(caps: list[int], thresholds: list[int]) -> dict:
    """Summarize per-unit view-count caps.

    Returns a dict with the number of units, the min ("strict cap" — the cap
    usable without skipping any unit), median and max caps, and per-threshold
    counts of units whose cap meets or exceeds that threshold. Empty input
    yields zero units and ``None`` placeholders throughout.
    """
    if not caps:
        empty_counts = {t: None for t in thresholds}
        return {
            "unit_count": 0,
            "strict_cap_no_skip": None,
            "median_cap": None,
            "max_cap": None,
            "threshold_counts": empty_counts,
        }
    values = np.asarray(caps, dtype=np.int64)
    meets = {t: int(np.count_nonzero(values >= t)) for t in thresholds}
    return {
        "unit_count": int(values.size),
        "strict_cap_no_skip": int(values.min()),
        "median_cap": int(np.median(values)),
        "max_cap": int(values.max()),
        "threshold_counts": meets,
    }
53
+
54
+
55
def audit_arkit(root: Path, thresholds: list[int]) -> dict:
    """Audit ARKitScenes: per-scene num_views caps from local metadata.

    A scene's cap is ``min(scene_len, largest image-collection group + 1)``,
    i.e. the biggest multi-view tuple the scene can serve without repeats.

    Args:
        root: processed ARKitScenes root (expects Training/all_metadata.npz).
        thresholds: num_views thresholds to count retention for.

    Returns:
        summarize_caps() stats plus availability/description fields, or an
        unavailable marker when the index file is missing.
    """
    meta_root = root / "Training"
    all_meta = meta_root / "all_metadata.npz"
    if not all_meta.is_file():
        return {"available": False, "notes": "missing Training/all_metadata.npz"}
    scene_caps = []
    with np.load(all_meta) as index:
        scenes = index["scenes"]
    for scene in scenes:
        scene_dir = meta_root / str(scene)
        meta_path = scene_dir / "new_scene_metadata.npz"
        if not scene_dir.is_dir() or not meta_path.is_file():
            continue
        # Distinct name: the original shadowed the outer npz handle with a
        # second `data`, which is confusing and error-prone.
        with np.load(meta_path, allow_pickle=True) as meta:
            scene_len = len(meta["images"])
            # +1 accounts for the reference image on top of its group.
            best_group_len = max(
                (len(group) + 1 for group in meta["image_collection"].item().values()),
                default=0,
            )
        scene_caps.append(min(scene_len, best_group_len))
    stats = summarize_caps(scene_caps, thresholds)
    stats.update(
        {
            "available": True,
            "unit_name": "scene",
            "constraint": "min(scene_len, best_image_collection_len_per_scene)",
            "notes": "strict cap is dominated by a few very short scenes; higher num_views still work with scene skipping",
        }
    )
    return stats
85
+
86
+
87
def audit_scannetpp(root: Path, thresholds: list[int]) -> dict:
    """Audit ScanNet++: per-scene caps limited by pairing DSLR/iPhone videos.

    For each reference image, the usable group size is capped by the frame
    count of the paired capture (a "frame*" reference is paired against the
    DSLR list and any other against the iPhone list); the scene cap is the
    best such value across references.

    Args:
        root: processed ScanNet++ root (expects all_metadata.npz).
        thresholds: num_views thresholds to count retention for.

    Returns:
        summarize_caps() stats plus availability/description fields.
    """
    all_meta = root / "all_metadata.npz"
    if not all_meta.is_file():
        return {"available": False, "notes": "missing all_metadata.npz"}
    with np.load(all_meta) as index:
        scenes = index["scenes"]
    scene_caps = []
    for scene in scenes:
        scene_dir = root / str(scene)
        meta_path = scene_dir / "new_scene_metadata.npz"
        images_dir = scene_dir / "images"
        if not scene_dir.is_dir() or not meta_path.is_file() or not images_dir.is_dir():
            continue
        # Distinct name: the original shadowed the outer npz handle with a
        # second `data`.
        with np.load(meta_path, allow_pickle=True) as meta:
            images = meta["images"]
            # Metadata stores extension-less names; strip the 4-char suffix.
            imgs_on_disk = {name[:-4] for name in os.listdir(images_dir)}
            dslr_ids = [
                i
                for i in range(len(images))
                if images[i].startswith("DSC") and images[i] in imgs_on_disk
            ]
            iphone_ids = [
                i
                for i in range(len(images))
                if images[i].startswith("frame") and images[i] in imgs_on_disk
            ]
            best_cap = 0
            for ref_id, group in meta["image_collection"].item().items():
                group_len = len(group) + 1
                # NOTE(review): an iPhone reference ("frame*") is paired
                # against the DSLR video and vice versa — cross-pairing kept
                # exactly as written originally; confirm it is intentional.
                video_len = len(dslr_ids) if images[ref_id].startswith("frame") else len(iphone_ids)
                best_cap = max(best_cap, min(group_len, video_len))
        if best_cap > 0:
            scene_caps.append(best_cap)
    stats = summarize_caps(scene_caps, thresholds)
    stats.update(
        {
            "available": True,
            "unit_name": "scene",
            "constraint": "best min(group_len, paired_video_len) per scene",
            "notes": "README still marks this split as coming soon, but local processed data is already present",
        }
    )
    return stats
130
+
131
+
132
def audit_scannet(root: Path, thresholds: list[int]) -> dict:
    """Audit ScanNet: each training scene is capped only by its image count."""
    scans_train = root / "scans_train"
    if not scans_train.is_dir():
        return {"available": False, "notes": "missing scans_train"}
    scene_caps = []
    for scene_name in sorted(os.listdir(scans_train)):
        if not scene_name.startswith("scene"):
            continue
        meta_path = scans_train / scene_name / "new_scene_metadata.npz"
        if not meta_path.is_file():
            continue
        with np.load(meta_path, allow_pickle=True) as meta:
            scene_caps.append(len(meta["images"]))
    stats = summarize_caps(scene_caps, thresholds)
    stats.update(
        {
            "available": True,
            "unit_name": "scene",
            "constraint": "scene_len",
            "notes": "",
        }
    )
    return stats
155
+
156
+
157
def audit_hypersim(root: Path, thresholds: list[int]) -> dict:
    """Audit HyperSim: each camera subscene is capped by its PNG frame count."""
    if not root.is_dir():
        return {"available": False, "notes": "missing hypersim root"}
    scene_caps = []
    for scene_name in sorted(os.listdir(root)):
        scene_dir = root / scene_name
        if not scene_dir.is_dir():
            continue
        for subscene_name in sorted(os.listdir(scene_dir)):
            subscene_dir = scene_dir / subscene_name
            if not subscene_dir.is_dir():
                continue
            frame_count = sum(1 for name in os.listdir(subscene_dir) if name.endswith(".png"))
            if frame_count:
                scene_caps.append(frame_count)
    stats = summarize_caps(scene_caps, thresholds)
    stats.update(
        {
            "available": True,
            "unit_name": "subscene",
            "constraint": "scene_len",
            "notes": "",
        }
    )
    return stats
182
+
183
+
184
def audit_mvs_synth(root: Path, thresholds: list[int]) -> dict:
    """Audit MVS-Synth: each scene is capped by its JPEG frame count."""
    if not root.is_dir():
        return {"available": False, "notes": "missing processed_mvs_synth root"}
    scene_caps = []
    for scene_name in sorted(os.listdir(root)):
        rgb_dir = root / scene_name / "rgb"
        if not rgb_dir.is_dir():
            continue
        frame_count = sum(1 for name in os.listdir(rgb_dir) if name.endswith(".jpg"))
        if frame_count > 0:
            scene_caps.append(frame_count)
    stats = summarize_caps(scene_caps, thresholds)
    stats.update(
        {
            "available": True,
            "unit_name": "scene",
            "constraint": "scene_len",
            "notes": "",
        }
    )
    return stats
205
+
206
+
207
def audit_megadepth(root: Path, thresholds: list[int]) -> dict:
    """Audit MegaDepth: the loader serves fixed 64-image sets.

    Every training set contributes a constant cap of 64 because the loader
    slices exactly 64 image ids per set. Sets whose scene name starts with
    "0015" or "0022" are excluded from the training count.

    Args:
        root: processed MegaDepth root (expects megadepth_sets_64.npz).
        thresholds: num_views thresholds to count retention for.

    Returns:
        summarize_caps() stats plus availability/description fields.
    """
    sets_path = root / "megadepth_sets_64.npz"
    if not sets_path.is_file():
        return {"available": False, "notes": "missing megadepth_sets_64.npz; code hard cap is 64"}
    with np.load(sets_path, allow_pickle=True) as data:
        scenes = data["scenes"]
        sets = data["sets"]
    valid_scene = np.array([not str(scene).startswith(("0015", "0022")) for scene in scenes])
    valid_scene_ids = np.nonzero(valid_scene)[0]
    # np.isin replaces np.in1d, which is deprecated and removed in NumPy 2.0.
    train_mask = np.isin(sets[:, 0], valid_scene_ids)
    caps = [64] * int(train_mask.sum())
    stats = summarize_caps(caps, thresholds)
    stats.update(
        {
            "available": True,
            "unit_name": "set",
            "constraint": "fixed_64_image_set",
            "notes": "hard code cap is 64 because the loader slices image_idxs = sets[idx][1:65]",
        }
    )
    return stats
228
+
229
+
230
def build_adjacency_list(score_matrix: np.ndarray, thresh: float = 0.2) -> list[list[int]]:
    """Build a directed adjacency list from a pairwise score matrix.

    An edge row -> col exists wherever ``score_matrix[row, col] > thresh``
    (strictly greater). The input matrix is not modified.
    """
    edges = np.asarray(score_matrix) > thresh
    return [[int(col) for col in np.nonzero(row)[0]] for row in edges]
238
+
239
+
240
def reachable_count(adjacency: list[list[int]], start_index: int) -> int:
    """Count nodes reachable from ``start_index`` (inclusive) via iterative DFS."""
    seen = {start_index}
    frontier = [start_index]
    while frontier:
        current = frontier.pop()
        for neighbor in adjacency[current]:
            if neighbor not in seen:
                seen.add(neighbor)
                frontier.append(neighbor)
    return len(seen)
250
+
251
+
252
def audit_blendedmvs(root: Path, thresholds: list[int]) -> dict:
    """Audit BlendedMVS: per-reference caps from the sparse overlap graph.

    Each reference image's cap is the number of unique images reachable in
    the thresholded overlap graph (what allow_repeat=false can sample).
    """
    overlap_path = root / "new_overlap.h5"
    if not overlap_path.is_file():
        return {"available": False, "notes": "missing new_overlap.h5"}
    if h5py is None:
        return {"available": False, "notes": "h5py is unavailable"}
    ref_caps = []
    with h5py.File(overlap_path, "r") as handle:
        for scene_key in handle.keys():
            group = handle[scene_key]
            indices = group["indices"][:]
            values = group["values"][:]
            # Densify the sparse (row, col, value) overlap scores.
            dense = np.zeros(group.attrs["shape"], dtype=np.float32)
            dense[indices[0], indices[1]] = values
            adjacency = build_adjacency_list(dense)
            ref_caps.extend(reachable_count(adjacency, node) for node in range(len(adjacency)))
    stats = summarize_caps(ref_caps, thresholds)
    stats.update(
        {
            "available": True,
            "unit_name": "reference",
            "constraint": "reachable_unique_images_per_reference",
            "notes": "for allow_repeat=false this is the reachable unique-image cap from the overlap graph",
        }
    )
    return stats
279
+
280
+
281
def build_dataset_specs(data_root: Path) -> list[tuple[str, Path, Callable[[Path, list[int]], dict]]]:
    """Map each paper training dataset to its local path and audit function."""
    specs = [
        ("ARKitScenes", "processed_arkitscenes", audit_arkit),
        ("ScanNet", "processed_scannet", audit_scannet),
        ("ScanNet++", "processed_scannetpp", audit_scannetpp),
        ("HyperSim", "hypersim", audit_hypersim),
        ("BlendedMVS", "processed_blendedmvs", audit_blendedmvs),
        ("MegaDepth", "processed_megadepth", audit_megadepth),
        ("MVS-Synth", "processed_mvs_synth", audit_mvs_synth),
    ]
    return [(name, data_root / subdir, audit_fn) for name, subdir, audit_fn in specs]
291
+
292
+
293
def render_markdown(rows: list[dict], output_md: Path, thresholds: list[int]) -> None:
    """Write the audit rows as a Markdown table to ``output_md``.

    Parent directories are created as needed. Exactly the first four
    thresholds are rendered as ">=N" retention columns.
    """
    output_md.parent.mkdir(parents=True, exist_ok=True)
    preamble = [
        "# Paper Training Dataset num_views Audit",
        "",
        "This table is aligned with the paper's training dataset list in `refer/arXiv-2509.16909v1/sec/4_exp.tex` and the release status in `README.md`.",
        "",
        "`StrictCapNoSkip` means the largest `num_views` that keeps every current local scene/reference usable under `allow_repeat=false`.",
        "Using a larger `num_views` can still work, but shorter scenes will be skipped by the dataset loader.",
        "",
        "| Dataset | READMEStatus | LocalAvailable | LocalPath | Unit | UnitCount | StrictCapNoSkip | MedianCap | MaxCap | >=24 | >=32 | >=48 | >=64 | Constraint | Notes |",
        "|---|---|---|---|---:|---:|---:|---:|---:|---|---|---|---|---|---|",
    ]
    body = []
    for row in rows:
        cells = [
            row["dataset"],
            row["readme_status"],
            row["local_available"],
            f"`{row['local_path']}`",
            row["unit_name"],
            row["unit_count"],
            row["strict_cap_no_skip"],
            row["median_cap"],
            row["max_cap"],
            row[f"ge_{thresholds[0]}"],
            row[f"ge_{thresholds[1]}"],
            row[f"ge_{thresholds[2]}"],
            row[f"ge_{thresholds[3]}"],
            row["constraint"],
            row["notes"],
        ]
        body.append("| " + " | ".join(str(cell) for cell in cells) + " |")
    output_md.write_text("\n".join(preamble + body) + "\n")
328
+
329
+
330
def render_csv(rows: list[dict], output_csv: Path, thresholds: list[int]) -> None:
    """Write the audit rows as CSV to ``output_csv``.

    Parent directories are created as needed. Exactly the first four
    thresholds contribute ``ge_{t}_count`` columns, mirroring the Markdown
    table layout.
    """
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    fieldnames = [
        "dataset",
        "readme_status",
        "local_available",
        "local_path",
        "unit_name",
        "unit_count",
        "strict_cap_no_skip",
        "median_cap",
        "max_cap",
        f"ge_{thresholds[0]}_count",
        f"ge_{thresholds[1]}_count",
        f"ge_{thresholds[2]}_count",
        f"ge_{thresholds[3]}_count",
        "constraint",
        "notes",
    ]
    with output_csv.open("w", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow({field: row[field] for field in fieldnames})
372
+
373
+
374
def main() -> None:
    """CLI entry point: audit every paper training dataset and write reports."""
    repo_root = Path(__file__).resolve().parents[1]
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-root", type=Path, default=repo_root / "data" / "train")
    parser.add_argument("--output-md", type=Path, default=repo_root / "reports" / "paper_dataset_num_views.md")
    parser.add_argument("--output-csv", type=Path, default=repo_root / "reports" / "paper_dataset_num_views.csv")
    args = parser.parse_args()

    rows = []
    for dataset_name, local_path, audit_fn in build_dataset_specs(args.data_root):
        stats = audit_fn(local_path, THRESHOLDS)
        row = {
            "dataset": dataset_name,
            "readme_status": README_STATUS[dataset_name],
            "local_available": "yes" if stats.get("available", False) else "no",
            "local_path": str(local_path),
        }
        # Stat fields fall back to N/A when the dataset is unavailable locally.
        for field in ("unit_name", "unit_count", "strict_cap_no_skip", "median_cap", "max_cap", "constraint"):
            row[field] = stats.get(field, "N/A")
        row["notes"] = stats.get("notes", "")
        threshold_counts = stats.get("threshold_counts", {})
        for threshold in THRESHOLDS:
            count = threshold_counts.get(threshold)
            row[f"ge_{threshold}"] = retention_str(count, stats.get("unit_count"))
            row[f"ge_{threshold}_count"] = "N/A" if count is None else count
        rows.append(row)

    render_markdown(rows, args.output_md, THRESHOLDS)
    render_csv(rows, args.output_csv, THRESHOLDS)
    print(args.output_md)
    print(args.output_csv)


if __name__ == "__main__":
    main()
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/batched_dynamic_router.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BatchedDynamicSubmapRouter: Differentiable dynamic submap boundary predictor.
3
+
4
+ Key design decisions:
5
+ - Fully vectorized (Fix #4): NO Python-level conditionals on boundary_flag.
6
+ Every GPU always executes the full compute graph (descriptor + retrieval +
7
+ backendT). The boundary decision is applied as a differentiable gating
8
+ multiplier so DDP AllReduce never deadlocks.
9
+ - Masked pooling (Fix #3): valid_mask [B, max_K] prevents zero-padding from
10
+ injecting dead gradients / NaN into descriptors.
11
+ - Gumbel-Softmax temperature annealing (Fix #5): τ decays from tau_start
12
+ to tau_end over training via cosine schedule.
13
+
14
+ No original source files are modified.
15
+ """
16
+
17
+ import math
18
+ import torch
19
+ import torch.nn as nn
20
+ import torch.nn.functional as F
21
+ from typing import Optional
22
+
23
+
24
class BatchedDynamicSubmapRouter(nn.Module):
    """Differentiable boundary predictor with vectorized soft gating.

    At every frame step the router:
      1. Appends the new token to a padded accumulation buffer.
      2. Predicts a boundary probability via an MLP + Gumbel-Softmax STE.
      3. **Always** computes descriptors and runs the backend for the full
         batch (no Python branching on the flag), so DDP AllReduce never
         deadlocks.
      4. Soft-gates the result: ``final = flag * backend + (1-flag) * frontend``.
      5. Soft-resets the accumulation buffer using the gate.

    Args:
        token_dim: 2C — full token feature dimension (default 2048).
        boundary_hidden_dim: MLP hidden size (default 512).
        tau_start: initial Gumbel-Softmax temperature.
        tau_end: final Gumbel-Softmax temperature.
        max_K: maximum frames per submap (buffer size).
    """

    def __init__(
        self,
        token_dim: int = 2048,
        boundary_hidden_dim: int = 512,
        tau_start: float = 5.0,
        tau_end: float = 0.1,
        max_K: int = 20,
    ):
        super().__init__()
        self.token_dim = token_dim
        self.tau_start = tau_start
        self.tau_end = tau_end
        self.max_K = max_K

        # Boundary predictor: takes [prev_token || curr_token] -> logit.
        self.boundary_predictor = nn.Sequential(
            nn.Linear(2 * token_dim, boundary_hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(boundary_hidden_dim, 1),
        )

    # ── temperature annealing ─────────────────────────────
    def get_tau(self, progress: float) -> float:
        """Cosine-annealed Gumbel-Softmax temperature.

        Args:
            progress: training progress in [0, 1] (clamped to that range).

        Returns:
            Current τ: tau_start at progress=0, tau_end at progress=1.
        """
        progress = min(max(progress, 0.0), 1.0)
        return self.tau_end + 0.5 * (self.tau_start - self.tau_end) * (
            1.0 + math.cos(math.pi * progress)
        )

    # ── boundary prediction (vectorized) ─────────────────
    def predict_boundary(
        self,
        prev_token: torch.Tensor,
        curr_token: torch.Tensor,
        tau: float = 1.0,
    ) -> torch.Tensor:
        """Predict a hard boundary flag for each element in the batch.

        Uses the Straight-Through Estimator via ``F.gumbel_softmax``:
        forward pass yields hard 0/1, backward pass gets soft gradients.

        Args:
            prev_token: [B, D] — pooled previous-frame token.
            curr_token: [B, D] — pooled current-frame token.
            tau: Gumbel-Softmax temperature.

        Returns:
            boundary_flag: [B, 1] — 0.0 or 1.0 (differentiable via STE).
        """
        combined = torch.cat([prev_token, curr_token], dim=-1)  # [B, 2D]
        logit = self.boundary_predictor(combined)               # [B, 1]

        # Stack [logit, -logit] -> [B, 2] (cut / no-cut classes).
        logits_2 = torch.cat([logit, -logit], dim=-1)           # [B, 2]
        one_hot = F.gumbel_softmax(logits_2, tau=tau, hard=True, dim=-1)
        # one_hot[:, 0] == 1 means "cut"; one_hot[:, 1] == 1 means "no cut".
        boundary_flag = one_hot[:, :1]                          # [B, 1]
        return boundary_flag

    # ── masked pooling ───────────────────────────────────
    @staticmethod
    def masked_pool(
        accum_tokens: torch.Tensor,
        valid_mask: torch.Tensor,
    ) -> torch.Tensor:
        """Safe masked average pooling over the accumulation buffer.

        Zero-padded (invalid) frames are excluded so they cannot inject
        dead gradients or NaNs into the descriptor.

        Args:
            accum_tokens: [B, max_K, P, C] — padded token buffer.
            valid_mask: [B, max_K] — True where frames are valid.

        Returns:
            pooled: [B, C] — mean over valid frames of the per-frame patch
            mean; all-invalid rows pool to zeros (denominator clamped to 1).
        """
        # Pool patches within each frame first: [B, max_K, C].
        frame_pooled = accum_tokens.mean(dim=2)
        mask_f_1d = valid_mask.float().unsqueeze(-1)            # [B, max_K, 1]
        numerator = (frame_pooled * mask_f_1d).sum(dim=1)       # [B, C]
        # clamp avoids 0/0 for sequences with no valid frame yet.
        denominator = mask_f_1d.sum(dim=1).clamp(min=1.0)       # [B, 1]
        return numerator / denominator                          # [B, C]

    # ── vectorized soft reset ────────────────────────────
    @staticmethod
    def soft_reset(
        accum_tokens: torch.Tensor,
        valid_mask: torch.Tensor,
        accum_len: torch.Tensor,
        boundary_flag: torch.Tensor,
    ):
        """Vectorized buffer reset driven by the boundary gate.

        Elements where ``boundary_flag == 1`` are zeroed out; elements where
        it is 0 are kept unchanged. No Python-level branching.

        Args:
            accum_tokens: [B, max_K, P, C].
            valid_mask: [B, max_K] bool.
            accum_len: [B] long — current length per sequence.
            boundary_flag: [B, 1] — 0.0 or 1.0.

        Returns:
            (accum_tokens, valid_mask, accum_len) — updated tensors.
        """
        keep = 1.0 - boundary_flag                               # [B, 1]

        # Tokens: multiply by keep (broadcasts over max_K, P, C).
        accum_tokens = accum_tokens * keep.unsqueeze(-1).unsqueeze(-1)

        # BUGFIX: keep is [B, 1], which broadcasts correctly against
        # valid_mask [B, max_K]. The previous keep.squeeze(-1) produced [B],
        # which broadcasts along the *last* axis and raises a shape error
        # (or silently misaligns) whenever B != max_K.
        valid_mask = valid_mask & keep.bool()

        # Length: reset to 0 where the boundary triggered.
        accum_len = accum_len * keep.squeeze(-1).long()

        return accum_tokens, valid_mask, accum_len

    # ── full vectorized step ─────────────────────────────
    def step(
        self,
        new_token: torch.Tensor,
        prev_token: torch.Tensor,
        accum_tokens: torch.Tensor,
        valid_mask: torch.Tensor,
        accum_len: torch.Tensor,
        tau: float = 1.0,
    ):
        """Execute one frame step for the full batch (vectorized).

        Args:
            new_token: [B, P, C] — current frame token.
            prev_token: [B, D] — pooled previous-frame representation.
            accum_tokens: [B, max_K, P, C] — accumulation buffer.
            valid_mask: [B, max_K] — bool mask.
            accum_len: [B] long — frames accumulated so far.
            tau: current Gumbel temperature.

        Returns:
            boundary_flag: [B, 1] — hard gate.
            curr_desc: [B, C] — pooled descriptor (for loop retrieval).
            accum_tokens / valid_mask / accum_len: updated buffers.
        """
        B = new_token.shape[0]

        # 1. Append the token to the buffer.
        # Clamp the write index to stay in-bounds when the buffer is full.
        write_idx = accum_len.clamp(max=self.max_K - 1)          # [B]
        for b in range(B):
            # NOTE: this loop runs over a *fixed* batch size (identical on
            # all GPUs), so it does NOT cause DDP divergence.
            idx = write_idx[b].item()
            accum_tokens[b, idx] = new_token[b]
            valid_mask[b, idx] = True
        accum_len = (accum_len + 1).clamp(max=self.max_K)

        # 2. Boundary prediction (always for ALL B).
        curr_pooled = new_token.mean(dim=1)                      # [B, C]
        boundary_flag = self.predict_boundary(prev_token, curr_pooled, tau=tau)

        # 3. Descriptor (always for ALL B, masked pooling).
        curr_desc = self.masked_pool(accum_tokens, valid_mask)   # [B, C]

        # 4. Soft reset (vectorized).
        accum_tokens, valid_mask, accum_len = self.soft_reset(
            accum_tokens, valid_mask, accum_len, boundary_flag
        )

        return boundary_flag, curr_desc, accum_tokens, valid_mask, accum_len

    # ── buffer initialization helper ─────────────────────
    def init_buffers(
        self,
        batch_size: int,
        P: int,
        C: int,
        device: torch.device,
    ):
        """Create fresh accumulation buffers for a batch.

        Returns:
            accum_tokens: [B, max_K, P, C] zeros.
            valid_mask: [B, max_K] False.
            accum_len: [B] zeros (long).
        """
        accum_tokens = torch.zeros(batch_size, self.max_K, P, C, device=device)
        valid_mask = torch.zeros(batch_size, self.max_K, dtype=torch.bool, device=device)
        accum_len = torch.zeros(batch_size, dtype=torch.long, device=device)
        return accum_tokens, valid_mask, accum_len
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/demo.py ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os,sys
2
+ from collections import OrderedDict
3
+ import torch
4
+ import numpy as np
5
+ import re
6
+ import cv2
7
+ import glob
8
+ import argparse
9
+
10
+ import time
11
+ import open3d as o3d
12
+ from rich import print
13
+ import matplotlib.pyplot as plt
14
+ from scipy.spatial.transform import Rotation as R
15
+
16
+ import rerun as rr
17
+ import rerun.blueprint as rrb
18
+
19
+ sys.path.append('src')
20
+ from slamformer.models.slamformer import SLAMFormer
21
+
22
+ current_directory = os.path.dirname(os.path.abspath(__file__))
23
+
24
+ sys.path.append(current_directory+'/../')
25
+
26
+ import slam.utils as utils
27
+ from slam.rerun_helper import log_camera, log_window
28
+
29
def strip_module(state_dict):
    """Remove a leading 'module.' prefix (typically added by DDP) from keys.

    Args:
        state_dict (dict): Checkpoint state_dict whose keys may carry the
            'module.' prefix from a DistributedDataParallel-wrapped model.

    Returns:
        OrderedDict: Same values, with any leading 'module.' removed.
    """
    prefix = "module."
    stripped = OrderedDict()
    for key, value in state_dict.items():
        if key.startswith(prefix):
            key = key[len(prefix):]
        stripped[key] = value
    return stripped
42
+
43
+
44
+ class SLAM:
45
    def __init__(
        self,
        outdir='output/tmp',
        kf_th=0.1,
        bn_every=10,
        vis=False,
        save_gmem=True,
        ckpt_path='path/to/ckpt.pth',
        target_size=518,
        retention_ratio=0.5
    ):
        """Build the SLAM system: load the SLAMFormer model and reset state.

        Args:
            outdir: output directory for results.
            kf_th: keyframe threshold on inter-frame camera translation
                (see kf_detect).
            bn_every: run the backend once every `bn_every` keyframes.
            vis: enable live rerun.io visualization.
            save_gmem: flag stored here; consumed elsewhere — TODO confirm.
            ckpt_path: checkpoint file loaded into the model.
            target_size: resize target passed to utils.load_image.
            retention_ratio: token retention ratio passed to SLAMFormer.

        NOTE(review): indentation in this method was reconstructed from a
        diff rendering; grouping of the trailing vis attributes is
        best-effort.
        """
        self.outdir = outdir
        self.kf_th=kf_th
        self.save_gmem = save_gmem
        self.bn_every=bn_every
        self.vis = vis
        self.ckpt_path = ckpt_path
        self.target_size = target_size

        # Timing accumulators (seconds): per-keyframe frontend, keyframe
        # detection, and backend optimization.
        self.times = []
        self.kf_time = []
        self.backend_time = []

        # model params
        self.model = SLAMFormer(retention_ratio=retention_ratio, bn_every=bn_every)
        self.model = self.model.eval()
        self.load_model()
        self.model.eval()
        self.model.to('cuda')

        # SLAM params: frame id / keyframe id counters start at -1 so the
        # first processed frame becomes frame 0 / keyframe 0.
        self.fid = -1
        self.kid = -1
        self.kfids = []
        self.last_kfid = 0
        self.kf_timestamps = []
        # frontend
        self.frontend_times = 0
        # Token map: raw tokens and their backend-optimized counterpart.
        self.map = None
        self.map_opt = None

        self.signal_backend = False
        self.backend_every = self.bn_every #10
        # Per-keyframe camera parameters and cached frames.
        self.extrins = []
        self.intrins = []
        self.frames = []
        self.kf_frames = []

        # Camera intrinsics (set properly later; eye(3) placeholder in vis).
        self.K = None
        self.update_K = False

        # vis
        if self.vis:
            self.entity="world"
            rr.init("SLAM", spawn=True)
            rr.log(self.entity, rr.ViewCoordinates.RIGHT_HAND_Z_UP)
            # World-from-last-keyframe pose and placeholder intrinsics used
            # by the live visualization. NOTE(review): placement inside this
            # `if self.vis` block is assumed (indentation lost in the diff);
            # both attributes appear to be read only on vis code paths —
            # confirm.
            self.Twk = np.eye(4)
            self.K = np.eye(3)
108
+
109
+ def load_model(self):
110
+ ckpt_raw = torch.load(self.ckpt_path, map_location='cuda', weights_only=False)
111
+
112
+ if isinstance(ckpt_raw, dict):
113
+ if "model" in ckpt_raw:
114
+ ckpt = ckpt_raw["model"]
115
+ print("Loaded state_dict from 'model' key in checkpoint.")
116
+ else:
117
+ ckpt = ckpt_raw
118
+ else:
119
+ ckpt = ckpt_raw
120
+
121
+ ckpt = utils.strip_module(ckpt)
122
+ self.model.load_state_dict(ckpt, strict=False)
123
+ del ckpt, ckpt_raw
124
+
125
    @property
    def time(self):
        """Wall-clock time (seconds) after draining pending CUDA work.

        Synchronizing first makes the frontend/backend timing measurements
        reflect actual GPU execution rather than just kernel launch.
        """
        torch.cuda.synchronize()
        return time.perf_counter()
129
+
130
    def kf_detect(self, image):
        """Decide whether `image` should become a new keyframe.

        The first frame is always a keyframe. Otherwise the model estimates
        the relative pose between the last keyframe and this frame, and the
        frame is promoted when the camera-center translation exceeds
        self.kf_th.

        NOTE(review): indentation in this method was reconstructed from a
        diff rendering; branch boundaries marked below are best-effort.

        Args:
            image: raw input frame as consumed by utils.load_image —
                presumably HxWxC numpy; confirm against the caller.

        Returns:
            True (bool) for the bootstrap frame, otherwise a 0-dim torch
            bool tensor. As a side effect the keyframe's extrinsic is
            appended to self.extrins.
        """
        # Bootstrap: the very first frame becomes keyframe 0 at the origin.
        if self.kid == -1:
            self.extrins.append(torch.eye(4))
            return True

        frame = utils.load_image(image, self.target_size)
        _,H,W = frame.shape

        # NOTE(review): `st` is captured but never used below in this method.
        st = self.time #time.perf_counter()
        # Two-view pass over [last keyframe, candidate frame].
        token = self.model.KFT(torch.stack([self.kf_frames[-1],frame.cuda()]))
        if self.vis:
            # scale the pose to global
            res = self.model.extract(token, cam_only=True)
            #z = res['local_points'][0,0,:,:,-1].cpu().numpy()
            # NOTE(review): 'depth_lask_kf' looks like a typo of
            # 'depth_last_kf'; both branches currently assign scale=1, so
            # the typo has no effect while the median-scale line stays
            # commented out. `scale` itself is unused below.
            if not hasattr(self,'depth_lask_kf'):
                scale=1
            else:
                scale=1 #np.median(self.depth_last_kf/(z+1e-6))
            camera_pose = res['camera_poses']

            extrinsic = torch.inverse(camera_pose)
            if extrinsic.shape[1] > 1:
                extrinsic_ref=extrinsic.cpu()[0,-2]
                extrinsic = extrinsic.cpu()[0,-1]
            # Relative motion last-keyframe -> current, lifted into the
            # world frame via the cached world-from-keyframe pose Twk.
            Tki = torch.inverse(camera_pose[0,0])@camera_pose[0,1]
            Tki = Tki.cpu().numpy()
            self.Twi = self.Twk@Tki
            K44 = np.eye(4)
            K44[:3,:3] = self.K
            log_camera("camera",self.Twi, K44, kfd=True)
            # make the window follow camera
            log_window(f"{self.entity}",np.linalg.inv(self.Twi), K44)

        else:
            res = self.model.extract(token, cam_only=True)
            camera_pose = res['camera_poses']
            extrinsic = torch.inverse(camera_pose)
            if extrinsic.shape[1] > 1:
                extrinsic_ref=extrinsic.cpu()[0,-2]
                extrinsic = extrinsic.cpu()[0,-1]
        self.kft_extrinsic_ref = torch.eye(4)#extrinsic_ref

        # Keyframe test: Euclidean distance between the two camera centers.
        dist = torch.sqrt(torch.sum((extrinsic[:3,3] - extrinsic_ref[:3,3])**2))
        isKF = dist > self.kf_th

        print(dist)

        if isKF:
            self.extrins.append(extrinsic)
        return isKF
182
+
183
    def frontend(self, image):
        """Ingest one frame: keyframe gating, tokenization, map growth.

        Pipeline: (1) keyframe test via kf_detect — non-keyframes return
        False immediately; (2) the keyframe is resized, cached, and encoded
        by the transformer frontend (two-view bootstrap for the second
        keyframe, single-view afterwards); (3) tokens are appended to the
        map, and every `backend_every` keyframes self.signal_backend is
        raised so backend() runs a global optimization. When vis is on, the
        current reconstruction is streamed to rerun.

        NOTE(review): indentation in this method was reconstructed from a
        diff rendering; branch boundaries are best-effort.

        Args:
            image: raw HxWxC frame (indexed [:,:,::-1] for display, i.e.
                presumably BGR in, RGB out — confirm).

        Returns:
            False when the frame is rejected as a keyframe; otherwise None.
        """
        if self.vis:
            rr.log("image", rr.Image(image[:,:,::-1]))#,static=True)

        self.fid += 1
        print('Frame', self.fid)
        # run kf detector
        st = self.time
        enough_disparity = self.kf_detect(image)
        self.kf_time.append(self.time-st)
        if not enough_disparity:
            return False

        torch.cuda.empty_cache()
        # run T-frontend
        H_,W_,_ = image.shape
        frame = utils.load_image(image, self.target_size)
        self.H,self.W,_ = frame.shape
        st = self.time
        self.last_kf = frame.cuda()
        self.kf_frames.append(self.last_kf)
        self.last_kfid = self.fid
        self.frames.append(self.last_kf.clone())
        self.kid += 1
        print("[italic purple] # KEYFRAME", self.kid)
        # self.cur_timestamp is set outside this method (not visible here).
        self.kf_timestamps.append(self.cur_timestamp)
        frame = frame.cuda()
        st = self.time

        # self.nkf (keyframe count) and self.map_add are defined elsewhere.
        if self.nkf == 1:
            # First keyframe: nothing to pair with yet; tokens are produced
            # once the second keyframe arrives.
            pass
        elif self.nkf == 2:
            # Bootstrap: encode keyframes 0 and 1 jointly.
            token = self.model.frontendT(torch.stack([self.kf_frames[0],frame]))
            self.map_add(token)
        else:
            # Steady state: encode only the new keyframe.
            token = self.model.frontendT(frame)
            print(self.time-st)

            self.map_add(token)

        self.kfids.append(self.fid)
        self.times.append(self.time-st)
        torch.cuda.empty_cache()

        # send signal to backend
        self.frontend_times += 1
        if self.frontend_times % self.backend_every == 0:
            self.signal_backend = True

        if self.vis and self.map is not None:
            st = time.time()
            # Merge already-optimized tokens with the yet-unoptimized tail.
            # NOTE(review): map_before_bn is computed but not used below.
            map_before_bn = None
            if self.map_opt is None:
                map_before_bn = self.map
            else:
                S = self.map.shape[0]
                S_oldopt = self.map_opt.shape[0]

                map_before_bn = torch.cat([self.map_opt, self.map[S_oldopt:]],axis=0)
            # Extract points/colors/confidences/poses: both keyframes on the
            # bootstrap step, only the newest token afterwards.
            if self.nkf == 2:
                ps,cs,confs,poses = self.extract(self.map)
            else:
                ps,cs,confs,poses = self.extract(self.map[-1:])

            self.vis_mem = [ps,cs,confs,poses]

            # Drop the least confident 15% of points for display.
            conf_threshold = np.percentile(confs, 15)
            msk = confs>=conf_threshold

            ps = ps[msk]
            cs = cs[msk]
            K44 = np.eye(4)
            K44[:3,:3] = self.K

            if self.nkf == 2:
                log_camera(f"{self.entity}/camera_kf/0",poses[0], K44)
                log_camera(f"{self.entity}/camera_kf/1",poses[1], K44)

                rr.log(f"{self.entity}/lines/0to1", rr.LineStrips3D([poses[:,:3,3].tolist()],colors=[0,0,255],radii=[0.005]))

                self.last_kf_pose = poses[1]
            else:
                log_camera(f"{self.entity}/camera_kf/{self.nkf-1}",poses.reshape(4,4), K44)
                rr.log(f"{self.entity}/lines/{self.nkf-2}to{self.nkf-1}", rr.LineStrips3D([np.stack([self.last_kf_pose[:3,3],poses[0,:3,3]]).tolist()],colors=[0,0,255],radii=[0.005]))

                self.last_kf_pose = poses[0]

            rr.log(
                f"{self.entity}/pointclouds/{self.nkf}",
                rr.Points3D(ps, colors=cs, radii=0.01),
            )

            print('log', time.time()-st)

            # Cache world-from-latest-keyframe pose for kf_detect's vis path.
            self.Twk = poses[-1].reshape(4,4)
283
+
284
def backend(self, final=False):
    """Run a global backend optimization pass over all keyframe tokens.

    Only runs when ``self.signal_backend`` has been raised (every
    ``backend_every`` keyframes, or forced once at termination). Stores the
    optimized token map in ``self.map_opt`` and, when visualization is on,
    re-logs the full reconstruction (cameras, trajectory lines, point
    cloud) to Rerun.

    Args:
        final: accepted for interface parity with ``terminate``; not used
            inside this method.
    """
    if not self.signal_backend:
        return

    torch.cuda.empty_cache()

    # Free the frontend KV cache before the heavy backend pass.
    # Fix: guard with hasattr — `fkv` does not exist until the frontend has
    # cached it (e.g. a run with a single keyframe), and a second backend
    # pass with no intervening frontend would hit an already-deleted attr.
    if hasattr(self.model, 'fkv'):
        del self.model.fkv
    torch.cuda.empty_cache()
    print('Backending...', self.nkf, 'KFs')
    st = time.perf_counter()
    map_optimed = self.model.backendT(self.map.cuda())
    self.backend_time.append(time.perf_counter()-st)
    print('backend_take', time.perf_counter()-st)
    torch.cuda.empty_cache()

    # Replace the previous optimized map; keep it on CPU to bound VRAM.
    if self.map_opt is not None:
        del self.map_opt
        torch.cuda.empty_cache()
    self.map_opt = map_optimed.cpu()

    self.signal_backend = False
    torch.cuda.empty_cache()

    if self.vis:
        ps,cs,confs,poses = self.extract(self.map_opt)
        self.vis_mem = [ps,cs,confs,poses]
        # Keep only the top-85% most confident points for display.
        conf_threshold = np.percentile(confs, 15)
        msk = confs>=conf_threshold

        ps = ps[msk]
        cs = cs[msk]

        # Clear the incremental per-keyframe clouds logged by the frontend.
        for s in range(self.nkf+1):
            rr.log(f"{self.entity}/pointclouds/{s}", rr.Points3D(np.array([])))

        # Redraw every keyframe camera with its optimized pose.
        for s in range(self.nkf):
            K44 = np.eye(4)
            K44[:3,:3] = self.K
            log_camera(f"{self.entity}/camera_kf/{s}",poses[s].reshape(4,4), K44, update=True)

        # Redraw the trajectory as consecutive-keyframe segments.
        for s in range(1, self.nkf):
            rr.log(f"{self.entity}/lines/{s-1}to{s}", rr.LineStrips3D([poses[s-1:s+1,:3,3].tolist()],colors=[0,0,255],radii=[0.005]))

        rr.log(
            f"{self.entity}/pointclouds/{self.nkf}",
            rr.Points3D(ps, colors=cs, radii=0.01),
        )
        self.last_kf_pose = poses[-1]
334
def step(self, timestamp, image):
    """Process one incoming frame: run the frontend, then maybe the backend.

    A missing timestamp falls back to the 1-based frame index.
    """
    self.cur_timestamp = self.fid + 1 if timestamp is None else timestamp
    self.frontend(image)
    self.backend()
344
def map_add(self, token_kf):
    """Append keyframe token(s) to the global token map along dim 0 (S,P,C).

    Tokens are moved to CPU first when ``self.save_gmem`` is set, bounding
    GPU memory growth as the map accumulates.
    """
    stored = token_kf.cpu() if self.save_gmem else token_kf
    if self.map is None:
        self.map = stored
    else:
        self.map = torch.cat([self.map, stored], axis=0)
353
@property
def nkf(self):
    """Number of keyframes accepted so far (``kid`` is a 0-based index)."""
    return 1 + self.kid
357
@property
def nf(self):
    """Number of frames processed so far (``fid`` is a 0-based index)."""
    return 1 + self.fid
361
def terminate(self):
    """Finish the run: flush a final backend pass, print timing stats, save.

    Prints raw timing lists plus per-stage totals and FPS for keyframe
    detection (KFT), frontend (FT), backend (BT), and the combined summary,
    then writes the final reconstruction under ``{outdir}/final``.
    """
    # Force one last backend pass if the keyframe count did not land on a
    # multiple of the backend interval (otherwise the tail keyframes were
    # never optimized).
    if self.nkf % self.backend_every != 0:
        self.signal_backend = True
    self.backend(final=True)

    print(self.kf_time)
    print(self.times)
    print(self.backend_time)
    print('frontend take', np.mean(self.times))
    print('KFT')
    print('total', np.sum(self.kf_time), 'FPS', float(len(self.kf_time))/np.sum(self.kf_time))
    print('FT')
    print('total', np.sum(self.times), 'FPS', float(len(self.times))/np.sum(self.times))
    print('BT')
    print('total', np.sum(self.backend_time), 'FPS', float(len(self.backend_time))/np.sum(self.backend_time))
    print('Summary')
    print('total', np.sum(self.kf_time)+np.sum(self.times)+np.sum(self.backend_time), 'FPS', float(len(self.kf_time))/(np.sum(self.kf_time)+np.sum(self.times)+np.sum(self.backend_time)))
    self.save_result(f'{self.outdir}/final', self.map_opt)
380
def extract(self, map_all=None):
    """Decode a keyframe token map into geometry for visualization.

    Args:
        map_all: token map to decode. NOTE(review): despite the default,
            ``None`` is never handled inside this method — callers always
            pass a map; confirm before relying on the default.

    Returns:
        pts: flattened (S*H*W, 3) world-space points.
        colors: flattened (S*H*W, 3) per-point colors taken from the last
            S stored frames, channel order reversed by ``[:, ::-1]``
            (presumably BGR->RGB, since frames come from cv2 — confirm).
        confs: flattened per-point confidence values.
        camera_pose: (S,4,4) camera poses from the model head.

    Side effect: caches the last keyframe's depth channel in
    ``self.depth_last_kf``.
    """
    result = self.model.extract(map_all.cuda())

    pts = result['points'].cpu().numpy() # 1,S,H,W,3
    local_pts = result['local_points'].cpu().numpy() # 1,S,H,W,3
    _,S,H,W,_ = pts.shape
    conf = result['conf'].cpu().numpy()
    point_clouds = [pts[0,s] for s in range(S)]
    #conf_threshold = np.percentile(conf, 15)
    #confs = [conf[0,s]>=conf_threshold for s in range(S)]
    # Colors come from the most recent S frames, matching the S decoded maps.
    colors = torch.stack(self.frames[-S:]).permute(0,2,3,1).reshape(-1,3).cpu().numpy()[:,::-1] # S,H,W,C
    confs = conf.reshape(-1)


    camera_pose = result['camera_poses'].cpu().numpy()[0] # S,4,4
    pts = pts.reshape(-1,3)
    colors = colors.reshape(-1,3)

    # set depth for the last kf
    self.depth_last_kf = local_pts[0,-1,:,:,-1]

    return pts, colors, confs, camera_pose
403
def save_result(self, output_path = 'output/tmp', map_all=None, traj=True):
    """Export the final reconstruction to disk.

    Writes:
      - ``{output_path}.ply``: confidence-filtered colored point cloud.
      - ``{output_path}_traj.txt``: keyframe trajectory, one pose per line.
      - ``{output_path}_pc/``: per-keyframe point clouds + masks (.npz).

    Args:
        output_path: output file prefix (directories must already exist).
        map_all: token map to decode; falls back to the last optimized map.
        traj: NOTE(review): unused in this method — confirm whether the
            trajectory export was meant to be conditional on it.

    Returns:
        The raw extraction result dict from the model.
    """
    print(self.kfids)

    if map_all is None:
        map_all = self.map_opt

    result = self.model.extract(map_all.cuda())
    pts = result['points'].cpu().numpy() # 1,S,H,W,3
    _,S,H,W,_ = pts.shape
    conf = result['conf'].cpu().numpy()
    point_clouds = [pts[0,s] for s in range(S)]
    # Keep only the top-85% most confident points.
    conf_threshold = np.percentile(conf, 15)
    confs = [conf[0,s]>=conf_threshold for s in range(S)]

    colors = torch.stack(self.frames).permute(0,2,3,1).reshape(-1,3).cpu().numpy()[:,::-1] # S,H,W,C
    msk = np.stack(confs).reshape(-1)
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(pts.reshape(-1,3).astype(np.float64)[msk])
    pcd.colors = o3d.utility.Vector3dVector(colors.reshape(-1,3).astype(np.float64)[msk])
    #downpcd = pcd.voxel_down_sample(voxel_size=0.005)
    o3d.io.write_point_cloud(f"{output_path}.ply", pcd)
    camera_pose = result['camera_poses'].cpu() #torch.Size([1, 14, 4,4])
    poses = camera_pose[0].numpy()

    self.write_poses_to_file(f"{output_path}_traj.txt", poses, self.kf_timestamps)
    self.save_framewise_pointclouds(f"{output_path}_pc", point_clouds, self.kf_timestamps, confs)

    return result
436
def write_poses_to_file(self, filename, poses, frame_ids):
    """Write a trajectory file: ``t x y z qx qy qz qw`` per keyframe line.

    Each 4x4 pose matrix is split into a translation and a quaternion
    (scalar-last, as produced by scipy) and written with 8 decimals.
    """
    with open(filename, "w") as f:
        assert len(poses) == len(frame_ids), "Number of provided poses and number of frame ids do not match"
        for frame_id, pose in zip(frame_ids, poses):
            translation = pose[0:3, 3]
            quaternion = R.from_matrix(pose[0:3, 0:3]).as_quat()  # x, y, z, w
            row = np.array([float(frame_id), *translation, *quaternion])
            f.write(" ".join(f"{v:.8f}" for v in row) + "\n")
447
def save_framewise_pointclouds(self, filename, pointclouds, frame_ids, conf_masks):
    """Save one ``.npz`` (points + confidence mask) per keyframe.

    Args:
        filename: output directory; created if missing.
        pointclouds: per-keyframe point arrays.
        frame_ids: timestamps/ids used as file names.
        conf_masks: per-keyframe boolean confidence masks.
    """
    os.makedirs(filename, exist_ok=True)
    # Fix: the save path previously used a dead literal instead of the
    # directory created above; also the loop variable shadowed `conf_masks`.
    for frame_id, pointcloud, conf_mask in zip(frame_ids, pointclouds, conf_masks):
        # save pcd as numpy array
        np.savez(f"{filename}/{frame_id}.npz", pointcloud=pointcloud, mask=conf_mask)
453
+
454
def get_parser():
    """Parse CLI arguments for the SLAM-Former demo.

    Note: despite the name, this returns the *parsed* namespace (it calls
    ``parse_args`` on ``sys.argv``), not the parser object.
    """
    parser = argparse.ArgumentParser(description="SLAM-Former demo")
    add = parser.add_argument
    add("--ckpt_path", type=str, default="path/to/checkpoint.pth.model", help="Path to the checkpoint")
    add("--image_folder", type=str, default="path/to/image/folder", help="Path to folder containing images")
    add("--target_size", type=int, default=518, help="the target size of image(longer side)")
    add("--output_dir", type=str, default="outputs/tmp", help="Path to save the output")
    add("--stride", type=int, default=1, help="Frame stride for subsampling the input sequence")
    add("--kf_th", type=float, default=0.1, help="Keyframe selection threshold (minimum translation distance)")
    add("--retention_ratio", type=float, default=0.5, help="KV Pruning retention ratio")
    add("--bn_every", type=int, default=10, help="Run backend optimization every N keyframes")
    add("--vis", action="store_true", help="Enable real-time visualization with Rerun")
    add("--resize_rate", type=float, default=1, help="Resize rate for input images before processing")
    return parser.parse_args()
469
+
470
+
471
if __name__ == '__main__':
    args = get_parser()
    image_folder = args.image_folder
    outdir = args.output_dir
    os.makedirs(outdir, exist_ok=True)

    # Hard-coded pinhole intrinsics for the two benchmark datasets;
    # unknown datasets run with K=None (no camera frustum visualization).
    if 'tum' in args.image_folder:
        fx = 525.0  # focal length x
        fy = 525.0  # focal length y
        cx = 319.5  # optical center x
        cy = 239.5  # optical center y
        K = np.eye(3)
        K[0,0] = fx
        K[1,1] = fy
        K[0,2] = cx
        K[1,2] = cy
    elif 'Replica' in args.image_folder:
        fx = 600.  # focal length x
        fy = 600.0  # focal length y
        cx = 599.5  # optical center x
        cy = 339.5  # optical center y
        K = np.eye(3)
        K[0,0] = fx
        K[1,1] = fy
        K[0,2] = cx
        K[1,2] = cy
    else:
        K = None


    # Use the provided image folder path, skipping depth maps and metadata.
    print(f"Loading images from {image_folder}...")
    image_names = [f for f in glob.glob(os.path.join(image_folder, "*"))
                   if "depth" not in os.path.basename(f).lower() and "txt" not in os.path.basename(f).lower()
                   and "db" not in os.path.basename(f).lower()]
    image_names = utils.sort_images_by_number(image_names)

    frame_ids = []
    for path in image_names:
        filename = os.path.basename(path)
        match = re.search(r'\d+(?:\.\d+)?', filename)  # matches integers and decimals
        if match:
            frame_ids.append(float(match.group()))
        else:
            # Fix: the message contained a dead placeholder; include the
            # offending path so the failure is actionable.
            raise ValueError(f"No number found in image name: {path}")

    print(f"Found {len(image_names)} images")

    print('resize image', args.resize_rate)

    slam = SLAM(
        outdir=outdir,
        kf_th=args.kf_th,
        bn_every=args.bn_every,
        vis=args.vis,
        ckpt_path=args.ckpt_path,
        target_size=args.target_size,
        retention_ratio=args.retention_ratio
    )

    slam.K = K
    for frame_id, image_name in zip(frame_ids[::args.stride], image_names[::args.stride]):
        img = cv2.imread(image_name)

        if args.resize_rate != 1:
            H,W,_ = img.shape
            # Fix: interpolation must be passed by keyword — the third
            # positional argument of cv2.resize is `dst`, not interpolation.
            img = cv2.resize(img, (int(W*args.resize_rate), int(H*args.resize_rate)), interpolation=cv2.INTER_CUBIC)
        slam.step(frame_id, img)
    result = slam.terminate()
540
+
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/demo_infinite.py ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Infinite SLAM-Former — Inference Demo with all CLI toggles.
4
+
5
+ Wraps the original SLAMFormer + SLAM class with the new modules:
6
+ - GraphGatedMemoryManager (submap backend + loop closure)
7
+ - TemporalEmbedWrapper (dual temporal embedding injection)
8
+ - BatchedDynamicSubmapRouter (learned submap boundaries)
9
+
10
+ When all toggles are off, behaviour is identical to slam/demo.py.
11
+
12
+ Usage:
13
+ # Vanilla (same as demo.py):
14
+ python slam/demo_infinite.py --ckpt_path ckpt/checkpoint.pth.model \
15
+ --image_folder /path/to/images --output_dir outputs/tmp
16
+
17
+ # With submap backend + loop closure + temporal embed:
18
+ python slam/demo_infinite.py --ckpt_path ckpt/checkpoint.pth.model \
19
+ --image_folder /path/to/images --output_dir outputs/tmp \
20
+ --enable_submap_backend --enable_loop_closure --enable_temporal_embed
21
+ """
22
+
23
+ import os
24
+ import sys
25
+ import re
26
+ import glob
27
+ import argparse
28
+ import time
29
+ from collections import OrderedDict
30
+
31
+ import cv2
32
+ import torch
33
+ import numpy as np
34
+ from scipy.spatial.transform import Rotation as R
35
+
36
+ # ─── Path setup ──────────────────────────────────────────
37
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
38
+ PROJECT_DIR = os.path.dirname(SCRIPT_DIR)
39
+ SRC_DIR = os.path.join(PROJECT_DIR, "src")
40
+
41
+ if SRC_DIR not in sys.path:
42
+ sys.path.insert(0, SRC_DIR)
43
+ if PROJECT_DIR not in sys.path:
44
+ sys.path.insert(0, PROJECT_DIR)
45
+
46
+ from slamformer.models.slamformer import SLAMFormer
47
+ import slam.utils as utils
48
+ from slam.graph_gated_memory import (
49
+ GraphGatedMemoryManager,
50
+ TemporalEmbedWrapper,
51
+ )
52
+
53
+
54
def strip_module(state_dict):
    """Strip the ``module.`` (DataParallel) prefix from state-dict keys.

    Non-dict inputs are returned unchanged.
    """
    if not isinstance(state_dict, dict):
        return state_dict
    return OrderedDict(
        (key[len("module."):] if key.startswith("module.") else key, value)
        for key, value in state_dict.items()
    )
62
+
63
+
64
def strip_prefix(state_dict, prefix):
    """Remove ``prefix`` from every state-dict key that carries it.

    Non-dict inputs are returned unchanged; unprefixed keys are kept as-is.
    """
    if not isinstance(state_dict, dict):
        return state_dict
    n = len(prefix)
    return OrderedDict(
        (key[n:] if key.startswith(prefix) else key, value)
        for key, value in state_dict.items()
    )
72
+
73
+
74
def get_cfg_value(cfg, key, default=None):
    """Best-effort lookup of ``key`` on a dict-like, indexable, or
    attribute-style config object; ``default`` on any failure.
    """
    if cfg is None:
        return default
    # Prefer a mapping-style .get when available.
    try:
        getter = cfg.get
    except Exception:
        getter = None
    if getter is not None:
        try:
            return getter(key, default)
        except Exception:
            pass
    # Fall back to indexing, then attribute access.
    try:
        return cfg[key]
    except Exception:
        return getattr(cfg, key, default)
86
+
87
+
88
def extract_checkpoint_parts(ckpt_raw):
    """Split a raw checkpoint into (model state, train cfg, memory-manager
    state, temporal-wrapper state).

    Supports both flat checkpoints and ones nesting the auxiliary module
    states under a ``submap_modules`` dict; non-dict inputs are treated as
    the model state itself.
    """
    train_cfg = None
    memory_state = None
    temporal_state = None
    if not isinstance(ckpt_raw, dict):
        return ckpt_raw, train_cfg, memory_state, temporal_state

    train_cfg = ckpt_raw.get("args")
    ckpt = ckpt_raw["model"] if "model" in ckpt_raw else ckpt_raw
    nested = ckpt_raw.get("submap_modules", {})
    if isinstance(nested, dict):
        # Top-level keys win over the nested ones.
        memory_state = ckpt_raw.get("memory_mgr", nested.get("memory_mgr"))
        temporal_state = ckpt_raw.get("temporal_wrapper", nested.get("temporal_wrapper"))
    else:
        memory_state = ckpt_raw.get("memory_mgr")
        temporal_state = ckpt_raw.get("temporal_wrapper")
    return ckpt, train_cfg, memory_state, temporal_state
108
+
109
+
110
class InfiniteSLAM:
    """Extended SLAM pipeline with optional submap backend and loop closure.

    When all enable_* flags are False, this behaves identically to the
    original SLAM class in slam/demo.py.
    """

    def __init__(
        self,
        outdir: str = "output/tmp",
        kf_th: float = 0.1,
        bn_every: int = 10,
        ckpt_path: str = "",
        target_size: int = 518,
        retention_ratio: float = 0.5,
        # ── new toggles ──
        enable_submap_backend: bool = False,
        submap_size: int = 10,
        max_recursive_submaps: int = 3,
        enable_loop_closure: bool = False,
        desc_dim: int = 128,
        enable_temporal_embed: bool = False,
        temporal_embed_mode: str = "learned",
    ):
        """Build the model, memory manager and temporal wrapper, then load
        the checkpoint. Config values recorded in the checkpoint's ``args``
        override the constructor arguments.
        """
        self.outdir = outdir
        self.kf_th = kf_th
        self.bn_every = bn_every
        self.target_size = target_size
        ckpt_raw = None
        self.train_cfg = None
        if ckpt_path and os.path.exists(ckpt_path):
            ckpt_raw = torch.load(ckpt_path, map_location="cuda", weights_only=False)
            _, self.train_cfg, _, _ = extract_checkpoint_parts(ckpt_raw)

        # Checkpoint-recorded training config takes precedence over CLI args.
        retention_ratio = float(get_cfg_value(self.train_cfg, "retention_ratio", retention_ratio))
        self.enable_submap = bool(get_cfg_value(self.train_cfg, "enable_submap", enable_submap_backend))
        self.enable_loop = bool(get_cfg_value(self.train_cfg, "enable_loop", enable_loop_closure))
        self.enable_temporal = bool(get_cfg_value(self.train_cfg, "enable_temporal", enable_temporal_embed))
        submap_size = int(get_cfg_value(self.train_cfg, "submap_size", submap_size))
        max_recursive_submaps = int(get_cfg_value(self.train_cfg, "max_recursive_submaps", max_recursive_submaps))
        desc_dim = int(get_cfg_value(self.train_cfg, "desc_dim", desc_dim))
        loop_mask_mode = get_cfg_value(self.train_cfg, "loop_mask_mode", "hard_top1")
        soft_mask_temperature = float(get_cfg_value(self.train_cfg, "soft_mask_temperature", 0.25))
        soft_mask_bias = float(get_cfg_value(self.train_cfg, "soft_mask_bias", 0.2))
        submap_train_mode = get_cfg_value(self.train_cfg, "submap_train_mode", "full_token")
        submap_retrieval_topk = int(get_cfg_value(self.train_cfg, "submap_retrieval_topk", 0))
        submap_fetch_source = get_cfg_value(self.train_cfg, "submap_fetch_source", "frontend")
        submap_descriptor_source = get_cfg_value(self.train_cfg, "submap_descriptor_source", "frontend")
        temporal_embed_mode = get_cfg_value(self.train_cfg, "temporal_embed_mode", temporal_embed_mode)

        # ── Model (frozen) ───────────────────────────────
        self.model = SLAMFormer(
            retention_ratio=retention_ratio, bn_every=bn_every
        )
        self.model.eval()

        # ── Memory manager ───────────────────────────────
        embed_dim = self.model.dec_embed_dim
        self.memory_mgr = GraphGatedMemoryManager(
            submap_size=submap_size,
            max_recursive_submaps=max_recursive_submaps,
            desc_dim=desc_dim,
            embed_dim=embed_dim,
            loop_mask_mode=loop_mask_mode,
            soft_mask_temperature=soft_mask_temperature,
            soft_mask_bias=soft_mask_bias,
            retain_history_grad=False,
            submap_train_mode=submap_train_mode,
            submap_retrieval_topk=submap_retrieval_topk,
            submap_fetch_source=submap_fetch_source,
            submap_descriptor_source=submap_descriptor_source,
        )

        # ── Temporal wrapper ─────────────────────────────
        self.temporal_wrapper = TemporalEmbedWrapper(
            embed_dim=embed_dim,
            max_frames=5000,
            mode=temporal_embed_mode,
        )

        self._load_model(ckpt_raw)
        self.model.to("cuda")
        self.memory_mgr.to("cuda").eval()
        self.temporal_wrapper.to("cuda").eval()
        self.memory_mgr.reset()

        # ── SLAM state ───────────────────────────────────
        self.fid = -1                 # 0-based index of last processed frame
        self.kid = -1                 # 0-based index of last accepted keyframe
        self.kfids = []               # frame ids of accepted keyframes
        self.kf_timestamps = []
        self.frames = []              # stored keyframe images (for colors)
        self.kf_frames = []
        self.map = None               # raw (unoptimized) keyframe token map
        self.map_opt = None           # last backend-optimized token map
        self.signal_backend = False   # raised every bn_every keyframes
        self.frontend_times = 0
        self.times = []               # per-keyframe frontend timings
        self.kf_time = []             # per-frame keyframe-detection timings
        self.backend_time = []        # per-pass backend timings
        self.K = None                 # camera intrinsics (set by caller)
        self.cur_timestamp = 0

    def _load_model(self, ckpt_raw):
        """Load model / memory-manager / temporal-wrapper weights from a raw
        checkpoint dict; silently skips parts that are absent.
        """
        if ckpt_raw is None:
            return
        ckpt, _, memory_state, temporal_state = extract_checkpoint_parts(ckpt_raw)
        ckpt = strip_module(ckpt)
        self.model.load_state_dict(ckpt, strict=False)
        if memory_state is not None:
            memory_state = strip_prefix(strip_module(memory_state), "memory_mgr.")
            self.memory_mgr.load_state_dict(memory_state, strict=False)
        if temporal_state is not None:
            temporal_state = strip_prefix(strip_module(temporal_state), "temporal_wrapper.")
            self.temporal_wrapper.load_state_dict(temporal_state, strict=False)
        del ckpt

    @property
    def time(self):
        # CUDA-synchronized wall clock so timings include pending GPU work.
        torch.cuda.synchronize()
        return time.perf_counter()

    @property
    def nkf(self):
        # Number of keyframes accepted so far (kid is 0-based).
        return self.kid + 1

    def kf_detect(self, image):
        """Return True when `image` moved far enough from the last keyframe
        (translation distance > kf_th) to become a new keyframe.
        """
        if self.kid == -1:
            return True
        frame = utils.load_image(image, self.target_size)
        token = self.model.KFT(torch.stack([self.kf_frames[-1], frame.cuda()]))
        res = self.model.extract(token, cam_only=True)
        camera_pose = res["camera_poses"]
        extrinsic = torch.inverse(camera_pose)
        if extrinsic.shape[1] > 1:
            extrinsic_ref = extrinsic.cpu()[0, -2]
            extrinsic = extrinsic.cpu()[0, -1]
        else:
            # Degenerate single-pose output: accept the frame as a keyframe.
            return True
        dist = torch.sqrt(torch.sum((extrinsic[:3, 3] - extrinsic_ref[:3, 3]) ** 2))
        return dist > self.kf_th

    def frontend(self, image):
        """Track one frame: detect keyframe, tokenize it, grow the token map
        and (optionally) feed the submap accumulator.

        Returns False for rejected (non-keyframe) frames; None otherwise.
        """
        self.fid += 1
        print("Frame", self.fid)

        st = self.time
        enough_disparity = self.kf_detect(image)
        self.kf_time.append(self.time - st)
        if not enough_disparity:
            return False

        torch.cuda.empty_cache()
        frame = utils.load_image(image, self.target_size)
        st = self.time
        self.last_kf = frame.cuda()
        self.kf_frames.append(self.last_kf)
        self.frames.append(self.last_kf.clone())
        self.kid += 1
        print(f" # KEYFRAME {self.kid}")
        self.kf_timestamps.append(self.cur_timestamp)

        st = self.time
        # The very first keyframe cannot be tokenized alone; the second is
        # processed as a pair with the first, later ones individually.
        if self.nkf == 1:
            pass
        elif self.nkf == 2:
            token = self.model.frontendT(torch.stack([self.kf_frames[0], frame.cuda()]))
            self._map_add(token)
        else:
            token = self.model.frontendT(frame.cuda())
            self._map_add(token)

        self.kfids.append(self.fid)
        self.times.append(self.time - st)
        torch.cuda.empty_cache()

        # ── Submap accumulation ──────────────────────────
        if self.enable_submap and self.map is not None:
            last_token = self.map[-1:] if self.map.dim() == 3 else self.map
            self.memory_mgr.accumulate(last_token, frame_id=self.fid)

        self.frontend_times += 1
        if self.frontend_times % self.bn_every == 0:
            self.signal_backend = True

    def backend(self, final=False):
        """Run one backend optimization pass (submap-based when enabled and a
        submap is complete; vanilla full-map otherwise). No-op unless
        ``signal_backend`` is set.
        """
        if not self.signal_backend:
            return

        torch.cuda.empty_cache()
        # Drop the frontend KV cache before the memory-heavy backend pass.
        if hasattr(self.model, "fkv"):
            del self.model.fkv
        torch.cuda.empty_cache()

        print("Backending...", self.nkf, "KFs")
        st = time.perf_counter()

        if self.enable_submap and self.memory_mgr.submap_complete:
            # ── Submap-based backend ─────────────────────
            hidden_B, loop_gate, meta = self.memory_mgr.finalize_submap(
                model=self.model,
                device=torch.device("cuda"),
                temporal_wrapper=self.temporal_wrapper if self.enable_temporal else None,
                enable_temporal_embed=self.enable_temporal,
                enable_loop_closure=self.enable_loop,
            )
            # Use the active submap portion as map_opt
            n_prev = meta['n_prev']
            n_curr = meta['n_curr']
            self.map_opt = hidden_B[n_prev:n_prev + n_curr].cpu()
        else:
            # ── Vanilla backend ──────────────────────────
            map_optimed = self.model.backendT(self.map.cuda())
            self.map_opt = map_optimed.cpu()

        self.backend_time.append(time.perf_counter() - st)
        print("backend_take", time.perf_counter() - st)
        self.signal_backend = False
        torch.cuda.empty_cache()

    def step(self, timestamp, image):
        """Process one incoming frame: frontend then (maybe) backend. A
        missing timestamp falls back to the 1-based frame index.
        """
        self.cur_timestamp = timestamp if timestamp is not None else self.fid + 1
        self.frontend(image)
        self.backend()

    def _map_add(self, token_kf):
        # Append tokens to the global map along dim 0, kept on CPU.
        if self.map is None:
            self.map = token_kf.cpu()
        else:
            self.map = torch.cat([self.map, token_kf.cpu()], axis=0)

    def terminate(self):
        """Flush a final backend pass, print timing stats, and save results."""
        # Force a last pass if the KF count is not aligned with bn_every.
        if self.nkf % self.bn_every != 0:
            self.signal_backend = True
        self.backend(final=True)

        print("Frontend times:", self.times)
        print("Backend times:", self.backend_time)
        if self.times:
            print("Frontend avg:", np.mean(self.times))
        if self.backend_time:
            print("Backend avg:", np.mean(self.backend_time))
        print("Summary FPS:", float(len(self.kf_time)) / (
            np.sum(self.kf_time) + np.sum(self.times) + np.sum(self.backend_time) + 1e-9
        ))

        self._save_result(f"{self.outdir}/final", self.map_opt)

    def _save_result(self, output_path, map_all=None):
        """Export a confidence-filtered colored point cloud (.ply) and the
        keyframe trajectory (.txt) under the `output_path` prefix.
        """
        import open3d as o3d

        print(self.kfids)
        if map_all is None:
            map_all = self.map_opt

        map_gpu = map_all.cuda()
        # Ensure shape_ is set correctly for extract
        BN = map_gpu.shape[0]
        # Use shape_ from the last frontendT call for H, W
        _, _, H, W, ph, pw = self.model.shape_
        self.model.shape_ = (1, BN, H, W, ph, pw)
        result = self.model.extract(map_gpu)
        pts = result["points"].cpu().numpy()
        _, S, H, W, _ = pts.shape
        conf = result["conf"].cpu().numpy()
        # Keep only the top-85% most confident points.
        conf_threshold = np.percentile(conf, 15)
        confs = [conf[0, s] >= conf_threshold for s in range(S)]

        # Channel order reversed — presumably BGR->RGB from cv2; confirm.
        colors = torch.stack(self.frames).permute(0, 2, 3, 1).reshape(-1, 3).cpu().numpy()[:, ::-1]
        msk = np.stack(confs).reshape(-1)
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(pts.reshape(-1, 3).astype(np.float64)[msk])
        pcd.colors = o3d.utility.Vector3dVector(colors.reshape(-1, 3).astype(np.float64)[msk])
        o3d.io.write_point_cloud(f"{output_path}.ply", pcd)

        camera_pose = result["camera_poses"].cpu()
        poses = camera_pose[0].numpy()
        self._write_poses(f"{output_path}_traj.txt", poses, self.kf_timestamps)

    def _write_poses(self, filename, poses, frame_ids):
        # Trajectory file format: `t x y z qx qy qz qw` per line (scalar-last
        # quaternion, as produced by scipy).
        with open(filename, "w") as f:
            for frame_id, pose in zip(frame_ids, poses):
                x, y, z = pose[0:3, 3]
                quat = R.from_matrix(pose[0:3, 0:3]).as_quat()
                output = np.array([float(frame_id), x, y, z, *quat])
                f.write(" ".join(f"{v:.8f}" for v in output) + "\n")
396
+
397
+
398
def get_parser():
    """Build (but do not run) the CLI parser: original demo.py args plus
    the submap-backend and temporal-embedding toggles.
    """
    parser = argparse.ArgumentParser(description="Infinite SLAM-Former Demo")
    add = parser.add_argument

    # ── Original demo.py args ────────────────────────────
    add("--ckpt_path", type=str, default="")
    add("--image_folder", type=str, default="")
    add("--target_size", type=int, default=518)
    add("--output_dir", type=str, default="outputs/tmp")
    add("--stride", type=int, default=1)
    add("--kf_th", type=float, default=0.1)
    add("--retention_ratio", type=float, default=0.5)
    add("--bn_every", type=int, default=10)
    add("--resize_rate", type=float, default=1.0)

    # ── Task 1: submap backend ───────────────────────────
    add("--enable_submap_backend", action="store_true")
    add("--submap_size", type=int, default=10)
    add("--max_recursive_submaps", type=int, default=3)
    add("--enable_loop_closure", action="store_true")
    add("--desc_dim", type=int, default=128)

    # ── Fix #7: temporal embedding ───────────────────────
    add("--enable_temporal_embed", action="store_true")
    add("--temporal_embed_mode", type=str, default="learned",
        choices=["learned", "sinusoidal"])

    return parser
424
+
425
+
426
if __name__ == "__main__":
    args = get_parser().parse_args()
    os.makedirs(args.output_dir, exist_ok=True)

    # ── Camera intrinsics (same logic as original demo.py) ──
    if "tum" in args.image_folder:
        K = np.eye(3)
        K[0, 0], K[1, 1] = 525.0, 525.0
        K[0, 2], K[1, 2] = 319.5, 239.5
    elif "Replica" in args.image_folder:
        K = np.eye(3)
        K[0, 0], K[1, 1] = 600.0, 600.0
        K[0, 2], K[1, 2] = 599.5, 339.5
    else:
        K = None

    # ── Load images (skip depth maps and metadata files) ──
    print(f"Loading images from {args.image_folder}...")
    image_names = [
        f for f in glob.glob(os.path.join(args.image_folder, "*"))
        if "depth" not in os.path.basename(f).lower()
        and "txt" not in os.path.basename(f).lower()
        and "db" not in os.path.basename(f).lower()
    ]
    image_names = utils.sort_images_by_number(image_names)

    frame_ids = []
    for path in image_names:
        match = re.search(r"\d+(?:\.\d+)?", os.path.basename(path))
        if match:
            frame_ids.append(float(match.group()))
        else:
            raise ValueError(f"No number found in image name: {path}")

    print(f"Found {len(image_names)} images")

    # ── Create SLAM instance ─────────────────────────────
    slam = InfiniteSLAM(
        outdir=args.output_dir,
        kf_th=args.kf_th,
        bn_every=args.bn_every,
        ckpt_path=args.ckpt_path,
        target_size=args.target_size,
        retention_ratio=args.retention_ratio,
        enable_submap_backend=args.enable_submap_backend,
        submap_size=args.submap_size,
        max_recursive_submaps=args.max_recursive_submaps,
        enable_loop_closure=args.enable_loop_closure,
        desc_dim=args.desc_dim,
        enable_temporal_embed=args.enable_temporal_embed,
        temporal_embed_mode=args.temporal_embed_mode,
    )
    slam.K = K

    # ── Run ──────────────────────────────────────────────
    for frame_id, image_name in zip(
        frame_ids[:: args.stride], image_names[:: args.stride]
    ):
        img = cv2.imread(image_name)
        if args.resize_rate != 1:
            H, W, _ = img.shape
            # Fix: interpolation must be passed by keyword — the third
            # positional argument of cv2.resize is `dst`, not interpolation.
            img = cv2.resize(
                img, (int(W * args.resize_rate), int(H * args.resize_rate)),
                interpolation=cv2.INTER_CUBIC,
            )
        slam.step(frame_id, img)

    slam.terminate()
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/demo_submap.py ADDED
@@ -0,0 +1,927 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os,sys
2
+ from collections import OrderedDict
3
+ import torch
4
+ import numpy as np
5
+ import re
6
+ import cv2
7
+ import glob
8
+ import argparse
9
+
10
+ import time
11
+ import open3d as o3d
12
+ from rich import print
13
+ import matplotlib.pyplot as plt
14
+ from scipy.spatial.transform import Rotation as R
15
+
16
+ import rerun as rr
17
+ import rerun.blueprint as rrb
18
+
19
+ current_directory = os.path.dirname(os.path.abspath(__file__))
20
+
21
+ sys.path.append(current_directory+'/../')
22
+ sys.path.append('src')
23
+ from slamformer.models.slamformer import SLAMFormer
24
+ from slam.graph_gated_memory import GraphGatedMemoryManager, TemporalEmbedWrapper
25
+
26
+ import slam.utils as utils
27
+ from slam.rerun_helper import log_camera, log_window
28
+
29
def strip_module(state_dict):
    """Drop a leading ``module.`` (DataParallel wrapper) prefix from every key.

    Args:
        state_dict (dict): Mapping whose keys may carry a ``module.`` prefix.

    Returns:
        OrderedDict: Same values, keyed without the ``module.`` prefix.
    """
    return OrderedDict(
        (key[len("module."):] if key.startswith("module.") else key, value)
        for key, value in state_dict.items()
    )
42
+
43
+
44
def strip_prefix(state_dict, prefix):
    """Remove *prefix* from the front of every key in *state_dict*.

    Non-dict inputs (e.g. ``None`` for a missing checkpoint section) are
    returned unchanged so callers need not pre-check.
    """
    if not isinstance(state_dict, dict):
        return state_dict
    cut = len(prefix)
    renamed = OrderedDict()
    for key, value in state_dict.items():
        renamed[key[cut:] if key.startswith(prefix) else key] = value
    return renamed
52
+
53
+
54
def get_cfg_value(cfg, key, default=None):
    """Best-effort lookup of *key* on a heterogeneous config object.

    Tries, in order: a mapping-style ``get``, item access (``cfg[key]``),
    then attribute access, returning *default* when *cfg* is ``None`` or
    every route fails. Works for dicts, OmegaConf nodes, and namespaces.
    """
    if cfg is None:
        return default
    getter = getattr(cfg, "get", None)
    if getter is not None:
        try:
            return getter(key, default)
        except Exception:
            pass
    try:
        return cfg[key]
    except Exception:
        return getattr(cfg, key, default)
66
+
67
+
68
def extract_checkpoint_parts(ckpt_raw):
    """Split a loaded checkpoint into its usable pieces.

    Returns:
        tuple: ``(state_dict, train_cfg, memory_state, temporal_state)``.
        A non-dict input is treated as a bare state_dict. For dict inputs,
        the model weights may live under ``'model'``, the training config
        under ``'args'``, and the submap module states either at top level
        or nested under ``'submap_modules'`` (top level wins).
    """
    if not isinstance(ckpt_raw, dict):
        return ckpt_raw, None, None, None

    train_cfg = ckpt_raw.get("args")
    if "model" in ckpt_raw:
        ckpt = ckpt_raw["model"]
        print("Loaded state_dict from 'model' key in checkpoint.")
    else:
        ckpt = ckpt_raw

    nested = ckpt_raw.get("submap_modules", {})
    if isinstance(nested, dict):
        memory_state = ckpt_raw.get("memory_mgr", nested.get("memory_mgr"))
        temporal_state = ckpt_raw.get("temporal_wrapper", nested.get("temporal_wrapper"))
    else:
        memory_state = ckpt_raw.get("memory_mgr")
        temporal_state = ckpt_raw.get("temporal_wrapper")
    return ckpt, train_cfg, memory_state, temporal_state
89
+
90
+
91
class KVSubmapManager:
    """CPU-resident store of per-submap tokens with loop-closure retrieval.

    Frames accumulate into a "current" submap. Each finalized submap keeps
    its refined tokens on CPU plus a normalized descriptor (linear projection
    of mean-pooled tokens) used for cosine-similarity retrieval of loop
    candidates. An adjacency graph links submaps retrieved together so that
    neighbours of a loop hit can be fetched as extra context.

    NOTE(review): this class is not referenced by the SLAM class below, which
    uses GraphGatedMemoryManager instead — presumably an older/alternative
    implementation; confirm before relying on it.
    """

    def __init__(
        self,
        submap_size=10,
        max_loop_submaps=5,
        loop_similarity_threshold=0.75,
        desc_dim=128,
        embed_dim=1024,
    ):
        # Max frames per submap.
        self.K = submap_size
        # Max number of historical submaps fetched as loop context.
        self.max_loop_submaps = max_loop_submaps
        # Cosine-similarity gate for accepting a loop candidate.
        self.loop_similarity_threshold = loop_similarity_threshold
        self.desc_dim = desc_dim
        self.embed_dim = embed_dim

        # Projects mean-pooled submap tokens to a compact descriptor.
        # NOTE(review): input width is 2*embed_dim — assumes incoming frame
        # tokens carry concatenated features of that width; confirm against
        # the model's token layout.
        self.desc_proj = torch.nn.Linear(2 * embed_dim, desc_dim)
        self.submap_tokens_cpu = {}      # sid -> token tensor (CPU)
        self.submap_descriptors = {}     # sid -> normalized descriptor (CPU)
        self.submap_frame_ids = {}       # sid -> list of global frame ids
        self.adjacency = {}              # sid -> set of covisible submap ids

        # Accumulator for the submap currently being built.
        self._curr_tokens = []
        self._curr_frame_ids = []
        self._current_submap_id = 0

    @property
    def current_submap_id(self):
        """Id that will be assigned to the submap currently accumulating."""
        return self._current_submap_id

    @property
    def current_frame_ids(self):
        """Copy of the frame ids accumulated into the current submap."""
        return list(self._curr_frame_ids)

    def to(self, device):
        """Move the descriptor head to *device*; returns self for chaining."""
        self.desc_proj = self.desc_proj.to(device)
        return self

    def load_memory_state_dict(self, state_dict):
        """Load only the ``desc_proj`` weights from a (possibly prefixed) state dict."""
        desc_state = OrderedDict()
        for k, v in state_dict.items():
            key = k
            if key.startswith("memory_mgr."):
                key = key[len("memory_mgr."):]
            if key.startswith("desc_proj."):
                desc_state[key[len("desc_proj."):]] = v
        if desc_state:
            # strict=False: tolerate partial/extra keys from older checkpoints.
            msg = self.desc_proj.load_state_dict(desc_state, strict=False)
            print(f"Loaded submap descriptor head: {msg}")

    def accumulate(self, frame_token, frame_id):
        """Append one frame's tokens (stored on CPU) to the current submap."""
        if frame_token.dim() == 2:
            # Normalize to a leading frame axis: (P, C) -> (1, P, C).
            frame_token = frame_token.unsqueeze(0)
        self._curr_tokens.append(frame_token.detach().cpu())
        self._curr_frame_ids.append(int(frame_id))

    def has_current_tokens(self):
        """True when the in-progress submap has at least one frame."""
        return len(self._curr_tokens) > 0

    def current_tokens(self, device):
        """Concatenate the in-progress submap's tokens onto *device*, or None."""
        if not self._curr_tokens:
            return None
        return torch.cat(self._curr_tokens, dim=0).to(device, non_blocking=True)

    def compute_descriptor(self, tokens):
        """L2-normalized descriptor of a submap: mean-pool then project."""
        if self.desc_proj.weight.device != tokens.device:
            # Lazily follow the tokens' device.
            self.desc_proj = self.desc_proj.to(tokens.device)
        # Pool over frame and token axes, keep a batch dim for the Linear.
        pooled = tokens.mean(dim=(0, 1), keepdim=False).unsqueeze(0).float()
        desc = self.desc_proj(pooled).squeeze(0)
        return torch.nn.functional.normalize(desc, dim=0)

    def select_context_submaps(self, curr_desc):
        """Pick history submaps to attend to for the current submap.

        Returns:
            tuple: ``(prev_sid, loop_ids, primary_sid, primary_sim)`` —
            the immediately preceding submap (or None), the deduped loop
            candidate ids, the best-matching loop submap (or None) and its
            cosine similarity (or None).
        """
        prev_sid = self._current_submap_id - 1
        if prev_sid not in self.submap_tokens_cpu:
            prev_sid = None

        hist_ids = sorted([sid for sid in self.submap_tokens_cpu.keys() if sid < self._current_submap_id])
        loop_ids = []

        primary_sid = None
        primary_sim = None
        if self.max_loop_submaps > 0:
            # The previous submap is always context, so exclude it here.
            candidates = [sid for sid in hist_ids if sid != prev_sid]
            if candidates:
                curr_desc_cpu = curr_desc.detach().cpu().float()
                scored = []
                for sid in candidates:
                    hist_desc = self.submap_descriptors[sid].float()
                    sim = torch.nn.functional.cosine_similarity(
                        curr_desc_cpu.unsqueeze(0),
                        hist_desc.unsqueeze(0),
                        dim=-1,
                    ).item()
                    scored.append((sim, sid))
                scored.sort(reverse=True)
                if scored:
                    primary_sim, primary_sid = scored[0]
                    if primary_sim >= self.loop_similarity_threshold:
                        loop_ids.append(primary_sid)
                        # Expand to covisible neighbours of the primary hit,
                        # up to the loop budget.
                        for nid in sorted(self.adjacency.get(primary_sid, set())):
                            if len(loop_ids) >= self.max_loop_submaps:
                                break
                            if nid != self._current_submap_id and nid in self.submap_tokens_cpu and nid not in loop_ids:
                                loop_ids.append(nid)
                    else:
                        # Below threshold: no accepted loop closure.
                        primary_sid = None

        # Preserve order while removing duplicates.
        deduped = []
        seen = set()
        for sid in loop_ids:
            if sid not in seen:
                deduped.append(sid)
                seen.add(sid)
        return prev_sid, deduped, primary_sid, primary_sim

    def build_backend_tokens(self, device, prev_sid=None, loop_ids=None):
        """Assemble [prev | current | loops] tokens on *device* for the backend.

        Returns:
            tuple: ``(combined_tokens, meta)`` where *meta* records the token
            counts needed by :meth:`finalize_current_submap` to split the
            refined output back into its source submaps.
        """
        loop_ids = [] if loop_ids is None else list(loop_ids)
        parts = []
        n_prev = 0
        loop_token_counts = []

        if prev_sid is not None and prev_sid in self.submap_tokens_cpu:
            prev_tokens = self.submap_tokens_cpu[prev_sid].to(device, non_blocking=True)
            parts.append(prev_tokens)
            n_prev = prev_tokens.shape[0]

        curr_tokens = self.current_tokens(device)
        n_curr = curr_tokens.shape[0]
        parts.append(curr_tokens)

        for sid in loop_ids:
            loop_tokens = self.submap_tokens_cpu[sid].to(device, non_blocking=True)
            parts.append(loop_tokens)
            loop_token_counts.append(loop_tokens.shape[0])

        combined = torch.cat(parts, dim=0)
        meta = {
            "n_prev": n_prev,
            "n_curr": n_curr,
            "curr_frame_ids": list(self._curr_frame_ids),
            "loop_token_counts": loop_token_counts,
        }
        return combined, meta

    def _store_refined_submap(self, sid, refined_tokens):
        """Overwrite a stored submap's tokens and descriptor (CPU copies)."""
        self.submap_tokens_cpu[sid] = refined_tokens.detach().cpu()
        self.submap_descriptors[sid] = self.compute_descriptor(refined_tokens).detach().cpu()

    def finalize_current_submap(
        self,
        hidden_B,
        n_prev,
        n_curr,
        prev_sid=None,
        loop_ids=None,
        loop_token_counts=None,
        primary_sid=None,
    ):
        """Split refined backend output back into submaps and commit them.

        *hidden_B* is expected in the same [prev | current | loops] order
        produced by :meth:`build_backend_tokens`. Also records adjacency for
        an accepted loop (*primary_sid*) and resets the accumulator, bumping
        the current submap id.
        """
        loop_ids = [] if loop_ids is None else list(loop_ids)
        loop_token_counts = [] if loop_token_counts is None else list(loop_token_counts)

        offset = 0
        if prev_sid is not None and n_prev > 0:
            refined_prev = hidden_B[:n_prev]
            self._store_refined_submap(prev_sid, refined_prev)
            offset = n_prev

        refined_curr = hidden_B[offset:offset + n_curr]
        offset += n_curr

        for sid, count in zip(loop_ids, loop_token_counts):
            refined_loop = hidden_B[offset:offset + count]
            self._store_refined_submap(sid, refined_loop)
            offset += count

        sid = self._current_submap_id
        self.submap_tokens_cpu[sid] = refined_curr.detach().cpu()
        self.submap_descriptors[sid] = self.compute_descriptor(refined_curr).detach().cpu()
        self.submap_frame_ids[sid] = list(self._curr_frame_ids)

        if primary_sid is not None and primary_sid in self.submap_tokens_cpu:
            # Undirected covisibility edge between current and loop submap.
            self.adjacency.setdefault(sid, set()).add(primary_sid)
            self.adjacency.setdefault(primary_sid, set()).add(sid)

        # Start the next submap.
        self._curr_tokens = []
        self._curr_frame_ids = []
        self._current_submap_id += 1
277
+
278
+
279
class SLAM:
    """Online SLAM driver around a SLAMFormer model.

    Per input frame (see :meth:`step`):
      1. :meth:`kf_detect` — pairwise pose against the last keyframe; the
         frame is kept only if the translation exceeds ``kf_th``.
      2. :meth:`frontend` — encode the keyframe into tokens and append them
         to the token map and the memory manager.
      3. :meth:`backend` — once the memory manager reports a complete
         submap, jointly refine its tokens (loop closure / temporal embed).

    Inference hyper-parameters come from the training config stored in the
    checkpoint, with explicit constructor arguments taking precedence.
    Requires CUDA: model and managers are moved to ``'cuda'``.
    """

    def __init__(
        self,
        outdir='output/tmp',
        kf_th=0.1,
        bn_every=None,
        vis=False,
        save_gmem=True,
        ckpt_path='path/to/ckpt.pth',
        target_size=518,
        retention_ratio=None,
        loop_mask_mode=None,
        submap_train_mode=None,
        submap_retrieval_topk=None,
        submap_fetch_source=None,
        submap_descriptor_source=None,
        max_recursive_submaps=None,
    ):
        """Load the checkpoint, build model + memory manager, move to GPU.

        Args:
            outdir: Directory for the final point cloud / trajectory export.
            kf_th: Minimum translation distance to accept a keyframe.
            bn_every: Submap size / backend period; None -> checkpoint config.
            vis: Enable live Rerun visualization.
            save_gmem: Keep the token map on CPU (True) or GPU (False).
            ckpt_path: Checkpoint path (loaded with weights_only=False).
            target_size: Longer-side image size fed to the model.
            retention_ratio..max_recursive_submaps: override the matching
                training-config entries when not None.
        """

        self.outdir = outdir
        self.kf_th=kf_th
        self.save_gmem = save_gmem
        self.bn_every=bn_every
        self.vis = vis
        self.ckpt_path = ckpt_path
        self.target_size = target_size

        # Per-stage timing logs (seconds); summarized in terminate().
        self.times = []
        self.kf_time = []
        self.backend_time = []

        # NOTE: weights_only=False executes arbitrary pickled code — only
        # load trusted checkpoints.
        ckpt_raw = torch.load(self.ckpt_path, map_location='cpu', weights_only=False)
        _, self.train_cfg, _, _ = extract_checkpoint_parts(ckpt_raw)

        # Explicit arguments win over the checkpoint's training config.
        self.bn_every = int(bn_every if bn_every is not None else get_cfg_value(self.train_cfg, 'submap_size', 10))
        self.retention_ratio = float(
            retention_ratio if retention_ratio is not None else get_cfg_value(self.train_cfg, 'retention_ratio', 0.5)
        )
        self.enable_loop = bool(get_cfg_value(self.train_cfg, 'enable_loop', False))
        self.enable_temporal = bool(get_cfg_value(self.train_cfg, 'enable_temporal', False))
        self.tbptt_window = int(get_cfg_value(self.train_cfg, 'tbptt_window', 0))
        self.max_recursive_submaps = int(
            max_recursive_submaps if max_recursive_submaps is not None else get_cfg_value(self.train_cfg, 'max_recursive_submaps', 5)
        )
        self.desc_dim = int(get_cfg_value(self.train_cfg, 'desc_dim', 128))
        self.loop_mask_mode = loop_mask_mode if loop_mask_mode is not None else get_cfg_value(self.train_cfg, 'loop_mask_mode', 'hard_top1')
        self.soft_mask_temperature = float(get_cfg_value(self.train_cfg, 'soft_mask_temperature', 0.25))
        self.soft_mask_bias = float(get_cfg_value(self.train_cfg, 'soft_mask_bias', 0.2))
        self.submap_train_mode = (
            submap_train_mode if submap_train_mode is not None else get_cfg_value(self.train_cfg, 'submap_train_mode', 'full_token')
        )
        self.submap_retrieval_topk = int(
            submap_retrieval_topk if submap_retrieval_topk is not None else get_cfg_value(self.train_cfg, 'submap_retrieval_topk', 0)
        )
        self.submap_fetch_source = (
            submap_fetch_source if submap_fetch_source is not None else get_cfg_value(self.train_cfg, 'submap_fetch_source', 'frontend')
        )
        self.submap_descriptor_source = (
            submap_descriptor_source
            if submap_descriptor_source is not None
            else get_cfg_value(self.train_cfg, 'submap_descriptor_source', 'frontend')
        )
        self.temporal_embed_mode = get_cfg_value(self.train_cfg, 'temporal_embed_mode', 'learned')

        # model params
        self.model = SLAMFormer(retention_ratio=self.retention_ratio, bn_every=self.bn_every)
        self.model = self.model.eval()
        self.memory_mgr = GraphGatedMemoryManager(
            submap_size=self.bn_every,
            max_recursive_submaps=self.max_recursive_submaps,
            desc_dim=self.desc_dim,
            embed_dim=self.model.dec_embed_dim,
            loop_mask_mode=self.loop_mask_mode,
            soft_mask_temperature=self.soft_mask_temperature,
            soft_mask_bias=self.soft_mask_bias,
            retain_history_grad=False,
            submap_train_mode=self.submap_train_mode,
            submap_retrieval_topk=self.submap_retrieval_topk,
            submap_fetch_source=self.submap_fetch_source,
            submap_descriptor_source=self.submap_descriptor_source,
        )
        self.temporal_wrapper = (
            TemporalEmbedWrapper(
                embed_dim=self.model.dec_embed_dim,
                max_frames=5000,
                mode=self.temporal_embed_mode,
            )
            if self.enable_temporal else None
        )
        self.load_model(ckpt_raw)
        del ckpt_raw
        self.model.eval()
        self.model.to('cuda')
        self.memory_mgr.to('cuda')
        self.memory_mgr.eval()
        if self.temporal_wrapper is not None:
            self.temporal_wrapper.to('cuda')
            self.temporal_wrapper.eval()
        self.memory_mgr.reset()
        print(
            f"Resolved inference config: submap_size={self.bn_every}, retention_ratio={self.retention_ratio}, "
            f"enable_loop={self.enable_loop}, enable_temporal={self.enable_temporal}, loop_mask_mode={self.loop_mask_mode}, "
            f"submap_train_mode={self.submap_train_mode}, submap_retrieval_topk={self.submap_retrieval_topk}, "
            f"submap_fetch_source={self.submap_fetch_source}, submap_descriptor_source={self.submap_descriptor_source}, "
            f"max_recursive_submaps={self.max_recursive_submaps}"
        )

        # SLAM params
        self.fid = -1      # last processed frame index (all frames)
        self.kid = -1      # last accepted keyframe index
        self.kfids = []    # frame ids of accepted keyframes
        self.last_kfid = 0
        self.kf_timestamps = []
        # frontend
        self.frontend_times = 0
        # Token map: per-keyframe tokens, optionally backend-refined copy.
        self.map = None
        self.map_opt = None

        self.signal_backend = False
        self.backend_every = self.bn_every #10
        #
        self.extrins = []
        self.intrins = []
        self.frames = []
        self.kf_frames = []

        #
        self.K = None
        self.update_K = False

        # vis
        if self.vis:
            self.entity="world"
            rr.init("SLAM", spawn=True)
            rr.log(self.entity, rr.ViewCoordinates.RIGHT_HAND_Z_UP)
            self.Twk = np.eye(4)
            self.K = np.eye(3)

    def load_model(self, ckpt_raw):
        """Load model / memory-manager / temporal-wrapper weights from a raw checkpoint."""
        ckpt, _, memory_state, temporal_state = extract_checkpoint_parts(ckpt_raw)
        ckpt = utils.strip_module(ckpt)
        # strict=False tolerates missing/extra keys across training variants.
        self.model.load_state_dict(ckpt, strict=False)
        if memory_state is not None:
            memory_state = strip_prefix(strip_module(memory_state), 'memory_mgr.')
            msg = self.memory_mgr.load_state_dict(memory_state, strict=False)
            print(f"Loaded memory manager: {msg}")
        if self.temporal_wrapper is not None and temporal_state is not None:
            temporal_state = strip_prefix(strip_module(temporal_state), 'temporal_wrapper.')
            msg = self.temporal_wrapper.load_state_dict(temporal_state, strict=False)
            print(f"Loaded temporal wrapper: {msg}")
        del ckpt

    @property
    def time(self):
        """Wall-clock time after a CUDA sync, so GPU work is fully counted."""
        torch.cuda.synchronize()
        return time.perf_counter()

    def _estimate_kf_pose(self, frame_pair, use_amp=True):
        """Run the keyframe tracker on a (last_kf, frame) pair.

        Returns the raw tokens, the predicted camera poses, and their
        inverses (world-to-camera extrinsics).
        """
        token = self.model.KFT(frame_pair, use_amp=use_amp)
        res = self.model.extract(token, cam_only=True, use_amp=use_amp)
        camera_pose = res['camera_poses']
        extrinsic = torch.linalg.inv(camera_pose)
        return token, camera_pose, extrinsic

    def kf_detect(self, image):
        """Decide whether *image* should become a new keyframe.

        The very first frame is always a keyframe. Otherwise the relative
        translation to the last keyframe must exceed ``kf_th``. Non-finite
        poses under AMP trigger one float32 retry; persistent non-finite
        results skip the frame (returns False).
        """
        if self.kid == -1:
            self.extrins.append(torch.eye(4))
            return True

        frame = utils.load_image(image, self.target_size)
        _,H,W = frame.shape

        st = self.time #time.perf_counter()
        frame_pair = torch.stack([self.kf_frames[-1], frame.cuda()])
        token, camera_pose, extrinsic = self._estimate_kf_pose(frame_pair, use_amp=True)
        if (not torch.isfinite(camera_pose).all()) or (not torch.isfinite(extrinsic).all()):
            print("[warning] Non-finite keyframe pose under AMP; retrying keyframe detection in float32.")
            token, camera_pose, extrinsic = self._estimate_kf_pose(frame_pair, use_amp=False)
            if (not torch.isfinite(camera_pose).all()) or (not torch.isfinite(extrinsic).all()):
                print("[warning] Non-finite keyframe pose after float32 retry; skipping this frame for keyframe detection.")
                return False
        if self.vis:
            # scale the pose to global
            #z = res['local_points'][0,0,:,:,-1].cpu().numpy()
            # NOTE(review): 'depth_lask_kf' looks like a typo for
            # 'depth_last_kf' (set in extract()); scale is hard-coded to 1
            # either way, so the branch is currently a no-op.
            if not hasattr(self,'depth_lask_kf'):
                scale=1
            else:
                scale=1 #np.median(self.depth_last_kf/(z+1e-6))
            if extrinsic.shape[1] > 1:
                extrinsic_ref=extrinsic.cpu()[0,-2]
            extrinsic = extrinsic.cpu()[0,-1]
            # Chain the pairwise relative motion onto the last keyframe pose.
            Tki = torch.linalg.inv(camera_pose[0,0])@camera_pose[0,1]
            Tki = Tki.cpu().numpy()
            self.Twi = self.Twk@Tki
            K44 = np.eye(4)
            K44[:3,:3] = self.K
            log_camera("camera",self.Twi, K44, kfd=True)
            # make the window follow camera
            log_window(f"{self.entity}",np.linalg.inv(self.Twi), K44)

        else:
            if extrinsic.shape[1] > 1:
                extrinsic_ref=extrinsic.cpu()[0,-2]
            extrinsic = extrinsic.cpu()[0,-1]
            self.kft_extrinsic_ref = torch.eye(4)#extrinsic_ref

        # Translation between the candidate frame and the reference frame.
        dist = torch.sqrt(torch.sum((extrinsic[:3,3] - extrinsic_ref[:3,3])**2))
        if not torch.isfinite(dist):
            print("[warning] Non-finite keyframe distance after pose recovery; skipping this frame.")
            return False
        isKF = dist > self.kf_th

        print(dist)

        if isKF:
            self.extrins.append(extrinsic)
        return isKF

    def frontend(self, image):
        """Encode *image* as a keyframe (if it passes kf_detect) into the token map."""

        if self.vis:
            # BGR -> RGB for display.
            rr.log("image", rr.Image(image[:,:,::-1]))#,static=True)

        self.fid += 1
        print('Frame', self.fid)
        # run kf detector
        st = self.time
        enough_disparity = self.kf_detect(image)
        self.kf_time.append(self.time-st)
        if not enough_disparity:
            return False

        torch.cuda.empty_cache()
        # run T-frontend
        H_,W_,_ = image.shape
        frame = utils.load_image(image, self.target_size)
        # NOTE(review): kf_detect unpacks frame.shape as (_, H, W) but here
        # it is unpacked as (H, W, _) — one of the two is wrong; confirm the
        # layout returned by utils.load_image.
        self.H,self.W,_ = frame.shape
        st = self.time
        self.last_kf = frame.cuda()
        self.kf_frames.append(self.last_kf)
        self.last_kfid = self.fid
        self.frames.append(self.last_kf.clone())
        self.kid += 1
        print("[italic purple] # KEYFRAME", self.kid)
        self.kf_timestamps.append(self.cur_timestamp)
        frame = frame.cuda()
        st = self.time

        if self.nkf == 1:
            # First keyframe: tokens are produced together with the second.
            pass
        elif self.nkf == 2:
            token = self.model.frontendT(torch.stack([self.kf_frames[0],frame]))
            self.map_add(token)
        else:
            token = self.model.frontendT(frame)
            print(self.time-st)

            self.map_add(token)

        self.kfids.append(self.fid)
        self.times.append(self.time-st)
        torch.cuda.empty_cache()

        # send signal to backend
        self.frontend_times += 1
        if self.memory_mgr.submap_complete:
            self.signal_backend = True

        if self.vis and self.map is not None:
            st = time.time()
            # Merge refined (map_opt) and fresh tokens for display.
            map_before_bn = None
            if self.map_opt is None:
                map_before_bn = self.map
            else:
                S = self.map.shape[0]
                S_oldopt = self.map_opt.shape[0]

                map_before_bn = torch.cat([self.map_opt, self.map[S_oldopt:]],axis=0)
            if self.nkf == 2:
                ps,cs,confs,poses = self.extract(self.map)

            else:
                ps,cs,confs,poses = self.extract(self.map[-1:])

            self.vis_mem = [ps,cs,confs,poses]

            # Drop the least-confident 15% of points for display.
            conf_threshold = np.percentile(confs, 15)
            msk = confs>=conf_threshold

            ps = ps[msk]
            cs = cs[msk]
            K44 = np.eye(4)
            K44[:3,:3] = self.K

            if self.nkf == 2:
                log_camera(f"{self.entity}/camera_kf/0",poses[0], K44)
                log_camera(f"{self.entity}/camera_kf/1",poses[1], K44)

                rr.log(f"{self.entity}/lines/0to1", rr.LineStrips3D([poses[:,:3,3].tolist()],colors=[0,0,255],radii=[0.005]))

                self.last_kf_pose = poses[1]
            else:
                log_camera(f"{self.entity}/camera_kf/{self.nkf-1}",poses.reshape(4,4), K44)
                rr.log(f"{self.entity}/lines/{self.nkf-2}to{self.nkf-1}", rr.LineStrips3D([np.stack([self.last_kf_pose[:3,3],poses[0,:3,3]]).tolist()],colors=[0,0,255],radii=[0.005]))

                self.last_kf_pose = poses[0]

            rr.log(
                f"{self.entity}/pointclouds/{self.nkf}",
                rr.Points3D(ps, colors=cs, radii=0.01),
            )

            print('log', time.time()-st)

            self.Twk = poses[-1].reshape(4,4)

    def backend(self, final=False):
        """Refine the just-completed submap's tokens and merge into map_opt.

        No-ops unless frontend() raised ``signal_backend``. *final* is
        accepted for the terminate() flush but not otherwise used here.
        """
        if not self.signal_backend:
            return

        torch.cuda.empty_cache()
        if not self.memory_mgr._curr_tokens:
            self.signal_backend = False
            return

        # Drop stale backend caches before re-running joint refinement.
        if hasattr(self.model, 'fkv'):
            del self.model.fkv
        self.model.reset_backend_cache = True
        self.model._prune_idx_cache = None
        self.model._prune_idx_cache_N = 0
        self.model._prune_idx_cache_hw = None

        print('Backending...', self.nkf, 'KFs')
        st = time.perf_counter()
        map_optimed, loop_gate, backend_meta = self.memory_mgr.finalize_submap(
            model=self.model,
            device=torch.device('cuda'),
            temporal_wrapper=self.temporal_wrapper,
            enable_temporal_embed=self.enable_temporal,
            enable_loop_closure=self.enable_loop,
            tbptt_window=self.tbptt_window,
        )
        backend_take = time.perf_counter()-st
        self.backend_time.append(backend_take)
        print(
            f'Submap backend: sid={self.memory_mgr.current_submap_id - 1}, '
            f'prev_tokens={backend_meta["n_prev"]}, retrieved_tokens={backend_meta["n_retrieved"]}, '
            f'loop_gate={float(loop_gate.squeeze().detach().cpu())}'
        )
        print('backend_take', backend_take)
        torch.cuda.empty_cache()

        # Grow map_opt to cover all frontend tokens, then overwrite the
        # refined frames in place.
        map_cpu = self.map.detach().cpu() if self.map.is_cuda else self.map
        if self.map_opt is None:
            self.map_opt = map_cpu.clone()
        elif self.map_opt.shape[0] < map_cpu.shape[0]:
            self.map_opt = torch.cat([self.map_opt, map_cpu[self.map_opt.shape[0]:]], dim=0)

        for local_idx, frame_id in enumerate(backend_meta["frame_ids"]):
            self.map_opt[int(frame_id)] = map_optimed[local_idx].detach().cpu()

        self.signal_backend = False
        torch.cuda.empty_cache()

        if self.vis:
            ps,cs,confs,poses = self.extract(self.map_opt)
            self.vis_mem = [ps,cs,confs,poses]
            conf_threshold = np.percentile(confs, 15)
            msk = confs>=conf_threshold

            ps = ps[msk]
            cs = cs[msk]

            # Clear per-keyframe incremental clouds, then re-log everything
            # from the refined map.
            for s in range(self.nkf+1):
                rr.log(f"{self.entity}/pointclouds/{s}", rr.Points3D(np.array([])))

            for s in range(self.nkf):
                K44 = np.eye(4)
                K44[:3,:3] = self.K
                log_camera(f"{self.entity}/camera_kf/{s}",poses[s].reshape(4,4), K44, update=True)

            for s in range(1, self.nkf):
                rr.log(f"{self.entity}/lines/{s-1}to{s}", rr.LineStrips3D([poses[s-1:s+1,:3,3].tolist()],colors=[0,0,255],radii=[0.005]))

            rr.log(
                f"{self.entity}/pointclouds/{self.nkf}",
                rr.Points3D(ps, colors=cs, radii=0.01),
            )
            self.last_kf_pose = poses[-1]

    def step(self, timestamp, image):
        """Process one input frame: frontend then (possibly) backend."""
        if timestamp is None:
            # Fall back to a running frame counter as the timestamp.
            self.cur_timestamp = self.fid+1
        else:
            self.cur_timestamp = timestamp

        self.frontend(image)

        self.backend()

    def map_add(self, token_kf):
        """Append keyframe tokens to the token map and the memory manager."""
        token_kf = token_kf.detach()
        start_idx = 0 if self.map is None else int(self.map.shape[0])
        for i, tok in enumerate(token_kf):
            self.memory_mgr.accumulate(tok.unsqueeze(0), start_idx + i)
        if self.map is None:
            self.map = token_kf.cpu() if self.save_gmem else token_kf #[tok.cpu() for tok in token_kf]
        else:
            if self.save_gmem:
                self.map = torch.cat([self.map, token_kf.cpu()],axis=0) # S,P,C
            else:
                self.map = torch.cat([self.map, token_kf],axis=0) # S,P,C

    @property
    def nkf(self):
        """Number of accepted keyframes so far."""
        return self.kid+1

    @property
    def nf(self):
        """Number of processed input frames so far."""
        return self.fid+1

    def terminate(self):
        """Flush any pending submap, print timing stats, export results."""
        if self.memory_mgr._curr_tokens:
            self.signal_backend = True
            self.backend(final=True)

        print(self.kf_time)
        print(self.times)
        print(self.backend_time)
        print('frontend take', np.mean(self.times))
        print('KFT')
        print('total', np.sum(self.kf_time), 'FPS', float(len(self.kf_time))/np.sum(self.kf_time))
        print('FT')
        print('total', np.sum(self.times), 'FPS', float(len(self.times))/np.sum(self.times))
        print('BT')
        print('total', np.sum(self.backend_time), 'FPS', float(len(self.backend_time))/np.sum(self.backend_time))
        print('Summary')
        print('total', np.sum(self.kf_time)+np.sum(self.times)+np.sum(self.backend_time), 'FPS', float(len(self.kf_time))/(np.sum(self.kf_time)+np.sum(self.times)+np.sum(self.backend_time)))
        # Prefer the backend-refined map when it exists.
        map_to_save = self.map_opt if self.map_opt is not None else self.map
        if map_to_save is None:
            print(
                f"[warning] No map was built for this sequence (nkf={self.nkf}). "
                "Skipping result export to avoid crashing on a single-keyframe run."
            )
            return None
        self.save_result(f'{self.outdir}/final', map_to_save)

    def extract(self, map_all=None):
        """Decode tokens into (points, colors, confidences, camera poses).

        Used for visualization; colors come from the last S cached frames
        (BGR -> RGB flip). Also caches the last keyframe's depth map.
        """
        result = self.model.extract(map_all.cuda())

        pts = result['points'].cpu().numpy() # 1,S,H,W,3
        local_pts = result['local_points'].cpu().numpy() # 1,S,H,W,3
        _,S,H,W,_ = pts.shape
        conf = result['conf'].cpu().numpy()
        point_clouds = [pts[0,s] for s in range(S)]
        #conf_threshold = np.percentile(conf, 15)
        #confs = [conf[0,s]>=conf_threshold for s in range(S)]
        colors = torch.stack(self.frames[-S:]).permute(0,2,3,1).reshape(-1,3).cpu().numpy()[:,::-1] # S,H,W,C
        confs = conf.reshape(-1)

        camera_pose = result['camera_poses'].cpu().numpy()[0] # S,4,4
        pts = pts.reshape(-1,3)
        colors = colors.reshape(-1,3)

        # set depth for the last kf
        self.depth_last_kf = local_pts[0,-1,:,:,-1]

        return pts, colors, confs, camera_pose

    def save_result(self, output_path = 'output/tmp', map_all=None, traj=True):
        """Decode the token map in chunks and export cloud + trajectory.

        Writes ``<output_path>.ply`` (confidence-filtered cloud),
        ``<output_path>_traj.txt`` (TUM-style poses) and per-frame clouds
        under ``<output_path>_pc``. Falls back to the best available map
        when *map_all* is None; returns the decoded tensors or None.
        """
        print(self.kfids)

        if map_all is None:
            map_all = self.map_opt if self.map_opt is not None else self.map
        if map_all is None:
            print(f"[warning] save_result() called with no map data; skipping export for {output_path}.")
            return None

        # Chunk-process to avoid OOM on long sequences
        # (our finetuning removed torch.no_grad() from model internals)
        S_total = map_all.shape[0]
        chunk_size = 50 # process 50 frames at a time
        all_pts, all_conf, all_poses = [], [], []

        for start in range(0, S_total, chunk_size):
            end = min(start + chunk_size, S_total)
            chunk = map_all[start:end].cuda()
            torch.cuda.empty_cache()
            result_chunk = self.model.extract(chunk)
            all_pts.append(result_chunk['points'].cpu())
            all_conf.append(result_chunk['conf'].cpu())
            all_poses.append(result_chunk['camera_poses'].cpu())
            del result_chunk, chunk
            torch.cuda.empty_cache()

        pts = torch.cat(all_pts, dim=1).numpy() # 1,S,H,W,3
        conf = torch.cat(all_conf, dim=1).numpy()
        camera_pose = torch.cat(all_poses, dim=1) # 1,S,4,4

        _,S,H,W,_ = pts.shape
        point_clouds = [pts[0,s] for s in range(S)]
        # Keep points above the global 15th confidence percentile.
        conf_threshold = np.percentile(conf, 15)
        confs = [conf[0,s]>=conf_threshold for s in range(S)]

        colors = torch.stack(self.frames).permute(0,2,3,1).reshape(-1,3).cpu().numpy()[:,::-1] # S,H,W,C
        msk = np.stack(confs).reshape(-1)
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(pts.reshape(-1,3).astype(np.float64)[msk])
        pcd.colors = o3d.utility.Vector3dVector(colors.reshape(-1,3).astype(np.float64)[msk])
        #downpcd = pcd.voxel_down_sample(voxel_size=0.005)
        o3d.io.write_point_cloud(f"{output_path}.ply", pcd)
        poses = camera_pose[0].numpy()

        self.write_poses_to_file(f"{output_path}_traj.txt", poses, self.kf_timestamps)
        self.save_framewise_pointclouds(f"{output_path}_pc", point_clouds, self.kf_timestamps, confs)

        return {'points': torch.from_numpy(pts), 'conf': torch.from_numpy(conf), 'camera_poses': camera_pose}

    def write_poses_to_file(self, filename, poses, frame_ids):
        """Write poses as TUM lines: ``timestamp tx ty tz qx qy qz qw``."""

        with open(filename, "w") as f:
            # NOTE(review): assert is stripped under `python -O`; an explicit
            # raise would be more robust for this validation.
            assert len(poses) == len(frame_ids), "Number of provided poses and number of frame ids do not match"
            for frame_id, pose in zip(frame_ids, poses):
                x, y, z = pose[0:3, 3]
                rotation_matrix = pose[0:3, 0:3]
                quaternion = R.from_matrix(rotation_matrix).as_quat() # x, y, z, w
                output = np.array([float(frame_id), x, y, z, *quaternion])
                f.write(" ".join(f"{v:.8f}" for v in output) + "\n")

    def save_framewise_pointclouds(self, filename, pointclouds, frame_ids, conf_masks):
        """Save one ``.npz`` (pointcloud + confidence mask) per frame under *filename*/."""
        os.makedirs(filename, exist_ok=True)
        # NOTE(review): the loop variable shadows the `conf_masks` parameter
        # — harmless here but worth renaming.
        for frame_id, pointcloud, conf_masks in zip(frame_ids, pointclouds, conf_masks):
            # save pcd as numpy array
            # NOTE(review): "(unknown)" below looks like a redacted
            # placeholder — the original almost certainly interpolated
            # {filename} (the directory created above); confirm and restore.
            np.savez(f"(unknown)/{frame_id}.npz", pointcloud=pointcloud, mask=conf_masks)
826
+
827
+
828
def get_parser():
    """Build the CLI for the SLAM-Former demo and parse ``sys.argv``.

    Returns:
        argparse.Namespace: Parsed command-line arguments. Options that
        default to None fall back to the values stored in the checkpoint's
        training config inside SLAM.__init__.
    """
    p = argparse.ArgumentParser(description="SLAM-Former demo")
    p.add_argument("--ckpt_path", type=str, default="path/to/checkpoint.pth.model", help="Path to the checkpoint")
    p.add_argument("--image_folder", type=str, default="path/to/image/folder", help="Path to folder containing images")
    p.add_argument("--target_size", type=int, default=518, help="the target size of image(longer side)")
    p.add_argument("--output_dir", type=str, default="outputs/tmp", help="Path to save the output")
    p.add_argument("--stride", type=int, default=1, help="Frame stride for subsampling the input sequence")
    p.add_argument("--kf_th", type=float, default=0.1, help="Keyframe selection threshold (minimum translation distance)")
    p.add_argument("--retention_ratio", type=float, default=None, help="KV Pruning retention ratio")
    p.add_argument("--bn_every", type=int, default=None, help="Run backend optimization every N keyframes")
    p.add_argument("--loop_mask_mode", type=str, default=None, choices=["hard_top1", "soft_all"], help="Override loop retrieval masking mode")
    p.add_argument("--submap_train_mode", type=str, default=None, choices=["full_token", "top5_dual_queue"], help="Override submap queue mode")
    p.add_argument("--submap_retrieval_topk", type=int, default=None, help="Override number of historical submaps fetched in soft_all mode")
    p.add_argument("--submap_fetch_source", type=str, default=None, choices=["frontend", "backend"], help="Override token source used for retrieval")
    p.add_argument("--submap_descriptor_source", type=str, default=None, choices=["frontend", "backend"], help="Override descriptor source used for retrieval")
    p.add_argument("--max_recursive_submaps", type=int, default=None, help="Override recursive covisibility fetch limit")
    p.add_argument("--vis", action="store_true", help="Enable real-time visualization with Rerun")
    p.add_argument("--resize_rate", type=float, default=1, help="Resize rate for input images before processing")
    return p.parse_args()
849
+
850
+
851
if __name__ == '__main__':
    args = get_parser()
    image_folder = args.image_folder
    outdir = args.output_dir
    os.makedirs(outdir, exist_ok=True)

    # Hard-coded pinhole intrinsics for the two known dataset families;
    # other datasets run without a calibrated K (vis overlays only).
    if 'tum' in args.image_folder:
        fx = 525.0  # focal length x
        fy = 525.0  # focal length y
        cx = 319.5  # optical center x
        cy = 239.5  # optical center y
        K = np.eye(3)
        K[0, 0] = fx
        K[1, 1] = fy
        K[0, 2] = cx
        K[1, 2] = cy
    elif 'Replica' in args.image_folder:
        fx = 600.  # focal length x
        fy = 600.0  # focal length y
        cx = 599.5  # optical center x
        cy = 339.5  # optical center y
        K = np.eye(3)
        K[0, 0] = fx
        K[1, 1] = fy
        K[0, 2] = cx
        K[1, 2] = cy
    else:
        K = None

    # Collect image files, skipping depth maps, text files and database files.
    print(f"Loading images from {image_folder}...")
    image_names = [f for f in glob.glob(os.path.join(image_folder, "*"))
                   if "depth" not in os.path.basename(f).lower() and "txt" not in os.path.basename(f).lower()
                   and "db" not in os.path.basename(f).lower()]
    image_names = utils.sort_images_by_number(image_names)

    # Derive a numeric frame id / timestamp from each file name.
    frame_ids = []
    for path in image_names:
        filename = os.path.basename(path)
        match = re.search(r'\d+(?:\.\d+)?', filename)  # matches integers and decimals
        if match:
            frame_ids.append(float(match.group()))
        else:
            # Fix: include the offending file name in the error message
            # (the placeholder had been lost from the format string).
            raise ValueError(f"No number found in image name: {filename}")

    print(f"Found {len(image_names)} images")

    print('resize image', args.resize_rate)

    slam = SLAM(
        outdir=outdir,
        kf_th=args.kf_th,
        bn_every=args.bn_every,
        vis=args.vis,
        ckpt_path=args.ckpt_path,
        target_size=args.target_size,
        retention_ratio=args.retention_ratio,
        loop_mask_mode=args.loop_mask_mode,
        submap_train_mode=args.submap_train_mode,
        submap_retrieval_topk=args.submap_retrieval_topk,
        submap_fetch_source=args.submap_fetch_source,
        submap_descriptor_source=args.submap_descriptor_source,
        max_recursive_submaps=args.max_recursive_submaps,
    )

    slam.K = K

    # Feed every stride-th frame through the pipeline, then flush/export.
    for frame_id, image_name in zip(frame_ids[::args.stride], image_names[::args.stride]):
        img = cv2.imread(image_name)

        if args.resize_rate != 1:
            H, W, _ = img.shape
            # Fix: the flag must be passed as the keyword `interpolation` —
            # as a third positional argument it lands in cv2.resize's `dst`
            # parameter and the interpolation mode is silently ignored.
            img = cv2.resize(img, (int(W*args.resize_rate), int(H*args.resize_rate)), interpolation=cv2.INTER_CUBIC)
        slam.step(frame_id, img)
    result = slam.terminate()
927
+
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/download_data.sh ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# ──────────────────────────────────────────────────────────
# Local download checklist template for SLAM-Former fine-tuning
#
# README-backed links:
#   - ARKitScenes / MVS-Synth / ScanNet  -> Hugging Face SLF dataset tree
#     (ScanNet ships as a split archive: processed_scannetv2.zip.part.aa/.ab/.ac/.ad)
#   - HyperSim                           -> Hugging Face preprocessed_Hypersim tree
#   - ScanNet++ / BlendedMVS / MegaDepth -> README says "coming soon" or lists no
#     direct archive command here
#
# The script is split into per-dataset toggles so missing datasets can be
# filled in one at a time without overwriting already-extracted folders, e.g.:
#   DOWNLOAD_ARKITSCENES=1 DOWNLOAD_SCANNET=1 bash slam/download_data.sh
# ──────────────────────────────────────────────────────────
set -euo pipefail

# Fixed local layout.
PROJECT_DIR="/var/scratch/qzhang2/SLAM-Former"
DATA_DIR="$PROJECT_DIR/data/train"
CKPT_DIR="$PROJECT_DIR/ckpt"
LOG_FILE="$PROJECT_DIR/download.log"

mkdir -p "$DATA_DIR" "$CKPT_DIR"

# Timestamped logger; every message is also appended to $LOG_FILE.
log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" | tee -a "$LOG_FILE"; }

# Remote source locations.
HF_BASE="https://huggingface.co/datasets/KevinConnorLee/SLF/resolve/main"
HF_HYPERSIM_BASE="https://huggingface.co/datasets/KevinConnorLee/preprocessed_Hypersim/resolve/main"
HF_CKPT="https://huggingface.co/Jarrome/SLAM-Former/resolve/main/518/checkpoint-10.pth.model"
HF_ARKITSCENES_REPO="https://huggingface.co/datasets/Pointcept/arkitscenes-compressed"
HF_SCANNETPP_REPO="https://huggingface.co/datasets/Pointcept/scannetpp-compressed"
HF_HYPERSIM_REPO="https://huggingface.co/datasets/geyongtao/hypersim"
BLENDEDMVS_LOWRES_URL="https://1drv.ms/u/s!Ag8Dbz2Aqc81gVDu7FHfbPZwqhIy?e=BHY07t"
BLENDEDMVS_HIGHRES_URL="https://1drv.ms/u/s!Ag8Dbz2Aqc81ezb9OciQ4zKwJ_w?e=afFOTi"
MEGADEPTH_V1_URL="https://www.cs.cornell.edu/projects/megadepth/dataset/Megadepth_v1/MegaDepth_v1.tar.gz"

# Per-dataset toggles; only the checkpoint is fetched by default.
DOWNLOAD_CHECKPOINT="${DOWNLOAD_CHECKPOINT:-1}"
KEEP_ARCHIVES="${KEEP_ARCHIVES:-0}"
DOWNLOAD_ARKITSCENES="${DOWNLOAD_ARKITSCENES:-0}"
DOWNLOAD_SCANNETPP="${DOWNLOAD_SCANNETPP:-0}"
DOWNLOAD_MVS_SYNTH="${DOWNLOAD_MVS_SYNTH:-0}"
DOWNLOAD_SCANNET="${DOWNLOAD_SCANNET:-0}"
DOWNLOAD_HYPERSIM="${DOWNLOAD_HYPERSIM:-0}"
DOWNLOAD_BLENDEDMVS="${DOWNLOAD_BLENDEDMVS:-0}"
DOWNLOAD_MEGADEPTH="${DOWNLOAD_MEGADEPTH:-0}"
HF_BLENDEDMVS_BASE="${HF_BLENDEDMVS_BASE:-}"
HF_MEGADEPTH_BASE="${HF_MEGADEPTH_BASE:-}"

# Completion markers polled by outside tooling.
RELEASED_COMPLETION_MARKER="$PROJECT_DIR/.download_complete_released_paper_datasets"
FULL_COMPLETION_MARKER="$PROJECT_DIR/.download_complete_all_requested_paper_datasets"
IN_PROGRESS_MARKER="$PROJECT_DIR/.download_in_progress"

# Reset marker state for this run.
rm -f "$PROJECT_DIR/.download_complete" "$RELEASED_COMPLETION_MARKER" "$FULL_COMPLETION_MARKER" "$IN_PROGRESS_MARKER"
touch "$IN_PROGRESS_MARKER"
56
+
57
+ download_file() {
58
+ local url="$1"
59
+ local output="$2"
60
+ if [ -f "$output" ]; then
61
+ log "Found existing file $(basename "$output"), attempting resume if incomplete."
62
+ fi
63
+ wget -c --progress=bar:force -O "$output" "$url" 2>&1 | tee -a "$LOG_FILE"
64
+ }
65
+
66
+ download_hf_repo_dataset() {
67
+ local label="$1"
68
+ local expected_dir="$2"
69
+ local repo_url="$3"
70
+ if [ -d "$expected_dir" ] && [ -n "$(find "$expected_dir" -mindepth 1 -maxdepth 1 2>/dev/null | head -n 1)" ]; then
71
+ log "$label already exists, skipping."
72
+ return
73
+ fi
74
+ rm -rf "$expected_dir"
75
+ mkdir -p "$expected_dir"
76
+ log "=== Cloning $label from Hugging Face ==="
77
+ git clone --depth 1 "$repo_url" "$expected_dir" 2>&1 | tee -a "$LOG_FILE"
78
+ if command -v git-lfs >/dev/null 2>&1; then
79
+ log "Fetching LFS files for $label..."
80
+ (cd "$expected_dir" && git lfs pull) 2>&1 | tee -a "$LOG_FILE"
81
+ else
82
+ log "WARNING: git-lfs not found; $label may contain LFS pointer files only."
83
+ fi
84
+ }
85
+
86
+ extract_archive_auto() {
87
+ local archive="$1"
88
+ local target_dir="$2"
89
+ mkdir -p "$target_dir"
90
+ if unzip -t "$archive" >/dev/null 2>&1; then
91
+ unzip -o "$archive" -d "$target_dir" 2>&1 | tee -a "$LOG_FILE"
92
+ return 0
93
+ fi
94
+ if tar -tf "$archive" >/dev/null 2>&1; then
95
+ tar -xf "$archive" -C "$target_dir" 2>&1 | tee -a "$LOG_FILE"
96
+ return 0
97
+ fi
98
+ log "ERROR: Unsupported archive format for $archive"
99
+ return 1
100
+ }
101
+
102
+ download_url_dataset() {
103
+ local label="$1"
104
+ local expected_dir="$2"
105
+ local url="$3"
106
+ local archive_name="$4"
107
+ if [ -d "$expected_dir" ] && [ -n "$(find "$expected_dir" -mindepth 1 -maxdepth 1 2>/dev/null | head -n 1)" ]; then
108
+ log "$label already exists, skipping."
109
+ return
110
+ fi
111
+ log "=== Downloading $label ==="
112
+ download_file "$url" "$DATA_DIR/$archive_name"
113
+ extract_archive_auto "$DATA_DIR/$archive_name" "$expected_dir"
114
+ if [ "$KEEP_ARCHIVES" != "1" ]; then
115
+ rm -f "$DATA_DIR/$archive_name"
116
+ fi
117
+ if [ -d "$expected_dir" ]; then
118
+ log "$label done."
119
+ else
120
+ log "WARNING: $label archive extracted, but expected path is still missing: $expected_dir"
121
+ fi
122
+ }
123
+
124
+ assemble_parts() {
125
+ local output="$1"
126
+ shift
127
+ if [ -f "$output" ]; then
128
+ log "Found existing assembled archive $(basename "$output"), skipping assembly."
129
+ return
130
+ fi
131
+ cat "$@" > "${output}.tmp"
132
+ mv "${output}.tmp" "$output"
133
+ }
134
+
135
+ rename_if_needed() {
136
+ local expected="$1"
137
+ shift
138
+ if [ -e "$expected" ]; then
139
+ return
140
+ fi
141
+ local candidate
142
+ for candidate in "$@"; do
143
+ if [ -e "$candidate" ]; then
144
+ mv "$candidate" "$expected"
145
+ return
146
+ fi
147
+ done
148
+ }
149
+
150
+ extract_zip() {
151
+ local archive="$1"
152
+ unzip -o "$archive" -d "$DATA_DIR/" 2>&1 | tee -a "$LOG_FILE"
153
+ if [ "$KEEP_ARCHIVES" != "1" ]; then
154
+ rm -f "$archive"
155
+ fi
156
+ }
157
+
158
+ cleanup_parts() {
159
+ if [ "$KEEP_ARCHIVES" = "1" ]; then
160
+ return
161
+ fi
162
+ rm -f "$@"
163
+ }
164
+
165
+ download_single_archive_dataset() {
166
+ local label="$1"
167
+ local expected_dir="$2"
168
+ local archive_name="$3"
169
+ local base_url="$4"
170
+ shift 4
171
+ if [ -d "$expected_dir" ]; then
172
+ log "$label already exists, skipping."
173
+ return
174
+ fi
175
+ cd "$DATA_DIR"
176
+ log "=== Downloading $label ==="
177
+ download_file "$base_url/$archive_name" "$DATA_DIR/$archive_name"
178
+ log "Extracting $label..."
179
+ extract_zip "$DATA_DIR/$archive_name"
180
+ rename_if_needed "$expected_dir" "$@"
181
+ if [ -d "$expected_dir" ]; then
182
+ log "$label done."
183
+ else
184
+ log "WARNING: $label archive extracted, but expected path is still missing: $expected_dir"
185
+ fi
186
+ }
187
+
188
+ download_split_archive_dataset() {
189
+ local label="$1"
190
+ local expected_dir="$2"
191
+ local assembled_archive="$3"
192
+ local base_url="$4"
193
+ local aliases_string="$5"
194
+ shift 5
195
+ local parts=("$@")
196
+ if [ -d "$expected_dir" ] && [ -n "$(find "$expected_dir" -mindepth 1 -maxdepth 1 2>/dev/null | head -n 1)" ]; then
197
+ log "$label already exists, skipping."
198
+ return
199
+ fi
200
+ cd "$DATA_DIR"
201
+ log "=== Downloading $label ==="
202
+ local downloaded_parts=()
203
+ local part
204
+ for part in "${parts[@]}"; do
205
+ download_file "$base_url/$part" "$DATA_DIR/$part"
206
+ downloaded_parts+=("$DATA_DIR/$part")
207
+ done
208
+ log "Assembling $label archive..."
209
+ assemble_parts "$DATA_DIR/$assembled_archive" "${downloaded_parts[@]}"
210
+ log "Extracting $label..."
211
+ extract_zip "$DATA_DIR/$assembled_archive"
212
+ IFS='|' read -r -a alias_candidates <<< "$aliases_string"
213
+ local alias_paths=()
214
+ local alias
215
+ for alias in "${alias_candidates[@]}"; do
216
+ alias_paths+=("$DATA_DIR/$alias")
217
+ done
218
+ rename_if_needed "$expected_dir" "${alias_paths[@]}"
219
+ cleanup_parts "${downloaded_parts[@]}"
220
+ if [ -d "$expected_dir" ]; then
221
+ log "$label done."
222
+ else
223
+ log "WARNING: $label archive extracted, but expected path is still missing: $expected_dir"
224
+ fi
225
+ }
226
+
227
+ if [ "$DOWNLOAD_CHECKPOINT" = "1" ]; then
228
+ log "=== Downloading pretrained checkpoint (3.84 GB) ==="
229
+ if [ ! -f "$CKPT_DIR/checkpoint-10.pth.model" ]; then
230
+ download_file "$HF_CKPT" "$CKPT_DIR/checkpoint-10.pth.model"
231
+ log "Checkpoint downloaded."
232
+ else
233
+ log "Checkpoint already exists, skipping."
234
+ fi
235
+ fi
236
+
237
+ if [ "$DOWNLOAD_ARKITSCENES" = "1" ]; then
238
+ download_hf_repo_dataset \
239
+ "ARKitScenes (HF repo: Pointcept/arkitscenes-compressed)" \
240
+ "$DATA_DIR/processed_arkitscenes" \
241
+ "$HF_ARKITSCENES_REPO"
242
+ fi
243
+
244
+ if [ "$DOWNLOAD_SCANNETPP" = "1" ]; then
245
+ download_hf_repo_dataset \
246
+ "ScanNet++ (HF repo: Pointcept/scannetpp-compressed)" \
247
+ "$DATA_DIR/processed_scannetpp" \
248
+ "$HF_SCANNETPP_REPO"
249
+ fi
250
+
251
+ if [ "$DOWNLOAD_MVS_SYNTH" = "1" ]; then
252
+ download_single_archive_dataset \
253
+ "MVS-Synth (README-backed HF tree)" \
254
+ "$DATA_DIR/processed_mvs_synth" \
255
+ "processed_mvs_synth.zip" \
256
+ "$HF_BASE" \
257
+ "processed_mvs_synth" \
258
+ "$DATA_DIR/processed_mvs_synth"
259
+ fi
260
+
261
+ if [ "$DOWNLOAD_SCANNET" = "1" ]; then
262
+ download_split_archive_dataset \
263
+ "ScanNet (HF split archive: KevinConnorLee/SLF)" \
264
+ "$DATA_DIR/processed_scannet" \
265
+ "processed_scannetv2.zip" \
266
+ "$HF_BASE" \
267
+ "processed_scannetv2|processed_scannet" \
268
+ "processed_scannetv2.zip.part.aa" \
269
+ "processed_scannetv2.zip.part.ab" \
270
+ "processed_scannetv2.zip.part.ac" \
271
+ "processed_scannetv2.zip.part.ad"
272
+ fi
273
+
274
+ if [ "$DOWNLOAD_HYPERSIM" = "1" ]; then
275
+ download_hf_repo_dataset \
276
+ "HyperSim (HF repo: geyongtao/hypersim)" \
277
+ "$DATA_DIR/hypersim" \
278
+ "$HF_HYPERSIM_REPO"
279
+ fi
280
+
281
+ if [ "$DOWNLOAD_BLENDEDMVS" = "1" ]; then
282
+ BLENDEDMVS_URL="${BLENDEDMVS_URL:-$BLENDEDMVS_LOWRES_URL}"
283
+ BLENDEDMVS_ARCHIVE_NAME="${BLENDEDMVS_ARCHIVE_NAME:-blendedmvs_lowres.download}"
284
+ if [ "${BLENDEDMVS_VARIANT:-lowres}" = "highres" ]; then
285
+ BLENDEDMVS_URL="$BLENDEDMVS_HIGHRES_URL"
286
+ BLENDEDMVS_ARCHIVE_NAME="${BLENDEDMVS_ARCHIVE_NAME:-blendedmvs_highres.download}"
287
+ fi
288
+ download_url_dataset \
289
+ "BlendedMVS (${BLENDEDMVS_VARIANT:-lowres})" \
290
+ "$DATA_DIR/processed_blendedmvs" \
291
+ "$BLENDEDMVS_URL" \
292
+ "$BLENDEDMVS_ARCHIVE_NAME"
293
+ fi
294
+
295
+ if [ "$DOWNLOAD_MEGADEPTH" = "1" ] && [ ! -d "$DATA_DIR/processed_megadepth" ]; then
296
+ download_url_dataset \
297
+ "MegaDepth v1 (Cornell official archive)" \
298
+ "$DATA_DIR/processed_megadepth" \
299
+ "$MEGADEPTH_V1_URL" \
300
+ "MegaDepth_v1.tar.gz"
301
+ fi
302
+
303
+ log "=== Download complete ==="
304
+ log "Disk usage:"
305
+ du -sh "$DATA_DIR"/* "$CKPT_DIR"/* 2>/dev/null | tee -a "$LOG_FILE"
306
+ log "Total:"
307
+ du -sh "$DATA_DIR" "$CKPT_DIR" | tee -a "$LOG_FILE"
308
+
309
+ missing_released=()
310
+ missing_all_requested=()
311
+
312
+ if [ "$DOWNLOAD_CHECKPOINT" = "1" ] && [ ! -f "$CKPT_DIR/checkpoint-10.pth.model" ]; then
313
+ missing_released+=("ckpt/checkpoint-10.pth.model")
314
+ fi
315
+ if [ ! -d "$DATA_DIR/processed_scannetpp" ]; then
316
+ missing_released+=("data/train/processed_scannetpp")
317
+ fi
318
+ if [ ! -d "$DATA_DIR/processed_mvs_synth" ]; then
319
+ missing_released+=("data/train/processed_mvs_synth")
320
+ fi
321
+ if [ ! -d "$DATA_DIR/processed_arkitscenes" ]; then
322
+ missing_released+=("data/train/processed_arkitscenes")
323
+ fi
324
+ if [ "$DOWNLOAD_SCANNET" = "1" ] && [ ! -d "$DATA_DIR/processed_scannet" ]; then
325
+ missing_released+=("data/train/processed_scannet")
326
+ fi
327
+ if [ "$DOWNLOAD_HYPERSIM" = "1" ] && [ ! -d "$DATA_DIR/hypersim" ]; then
328
+ missing_released+=("data/train/hypersim")
329
+ fi
330
+ if [ "$DOWNLOAD_BLENDEDMVS" = "1" ] && [ ! -d "$DATA_DIR/processed_blendedmvs" ]; then
331
+ missing_released+=("data/train/processed_blendedmvs")
332
+ fi
333
+
334
+ missing_all_requested=("${missing_released[@]}")
335
+ if [ "$DOWNLOAD_MEGADEPTH" = "1" ] && [ ! -d "$DATA_DIR/processed_megadepth" ]; then
336
+ missing_all_requested+=("data/train/processed_megadepth")
337
+ fi
338
+
339
+ rm -f "$IN_PROGRESS_MARKER"
340
+
341
+ if [ "${#missing_released[@]}" -eq 0 ]; then
342
+ touch "$RELEASED_COMPLETION_MARKER"
343
+ log "Released paper datasets are complete. Marker created at $RELEASED_COMPLETION_MARKER"
344
+ else
345
+ log "WARNING: Missing released download targets: ${missing_released[*]}"
346
+ fi
347
+
348
+ if [ "${#missing_all_requested[@]}" -eq 0 ]; then
349
+ touch "$FULL_COMPLETION_MARKER"
350
+ touch "$PROJECT_DIR/.download_complete"
351
+ log "All requested datasets are complete. Markers created at $FULL_COMPLETION_MARKER and $PROJECT_DIR/.download_complete"
352
+ else
353
+ log "WARNING: Missing requested targets: ${missing_all_requested[*]}"
354
+ fi
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/exp_joint_freeze_frontend_fsdp_8gpu.sh ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=sf_smoke_joint_freeze_frontend_fsdp_2gpu
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=12
#SBATCH --gres=gpu:2
#SBATCH --mem=24G
#SBATCH --time=24:00:00
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err

set -euo pipefail

# Resolve the project layout relative to this script (or the sbatch submit dir).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="${PROJECT_DIR:-${SLURM_SUBMIT_DIR:-$(dirname "$SCRIPT_DIR")}}"
SRC_DIR="$PROJECT_DIR/src"

# When migrating to another cluster, change these first: GPUs/port, conda init
# script, data directories, pretrained weights, and the save directory.
# NOTE(review): the SBATCH header requests gpu:2 while NUM_GPUS defaults to 8;
# confirm the intended GPU count before submitting via sbatch.
MASTER_PORT="${MASTER_PORT:-29662}"
NUM_GPUS="${NUM_GPUS:-8}"
CONDA_SH="${CONDA_SH:-/home/23068142r/miniconda3/etc/profile.d/conda.sh}"
CONDA_ENV_NAME="SLAM-Former"

# Dataset roots (overridable per cluster).
DATA_ROOT="${DATA_ROOT:-/home/23068142r/work_dir/data}"
ROOT_ARKIT="${ROOT_ARKIT:-$DATA_ROOT/processed_arkitscenes}"
ROOT_SCANNETPP="${ROOT_SCANNETPP:-$DATA_ROOT/preprocessed_scannetpp}"
ROOT_SCANNET="${ROOT_SCANNET:-$DATA_ROOT/processed_scannet}"
ROOT_SCANNET_FALLBACK="${ROOT_SCANNET_FALLBACK:-$DATA_ROOT/processed_scannetv2}"
ROOT_HYPERSIM="${ROOT_HYPERSIM:-$DATA_ROOT/preprocessed_Hypersim}"
ROOT_BLENDEDMVS="${ROOT_BLENDEDMVS:-$DATA_ROOT/processed_blendedmvs}"
ROOT_MEGADEPTH="${ROOT_MEGADEPTH:-$DATA_ROOT/processed_megadepth}"
ROOT_MVS_SYNTH="${ROOT_MVS_SYNTH:-$DATA_ROOT/processed_mvs_synth}"

# Experiment identity and checkpoints.
EXPERIMENT_ROOT="${EXPERIMENT_ROOT:-paper_smoke_local_8gpu}"
VARIANT_NAME="${VARIANT_NAME:-joint_freeze_frontend_fsdp_sub12}"
EXP_NAME="${EXP_NAME:-paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12}"
SAVE_DIR="${SAVE_DIR:-$PROJECT_DIR/checkpoints/$EXPERIMENT_ROOT/$VARIANT_NAME}"
PRETRAINED="${PRETRAINED:-/home/23068142r/work_dir/projects/e2e-semantic-SLAM-submap/ckpt/checkpoint-10.pth.model}"
RESUME="${RESUME:-null}"

# Training configuration knobs.
CONFIG_NAME="${CONFIG_NAME:-finetune_paper_h20.yaml}"
DIST_STRATEGY="${DIST_STRATEGY:-fsdp}"
AUTO_DISABLE_MISSING="${AUTO_DISABLE_MISSING:-1}"
TRAIN_SUBMAP_MODULES_ONLY="${TRAIN_SUBMAP_MODULES_ONLY:-0}"
DETACH_FRONTEND_TOKENS="${DETACH_FRONTEND_TOKENS:-1}"
NUM_VIEWS_ALL="${NUM_VIEWS_ALL:-64}"
NUM_VIEWS_ARKIT="${NUM_VIEWS_ARKIT:-64}"
NUM_VIEWS_SCANNETPP="${NUM_VIEWS_SCANNETPP:-24}"
NUM_VIEWS_SCANNET="${NUM_VIEWS_SCANNET:-64}"
NUM_VIEWS_HYPERSIM="${NUM_VIEWS_HYPERSIM:-24}"
NUM_VIEWS_BLENDEDMVS="${NUM_VIEWS_BLENDEDMVS:-64}"
NUM_VIEWS_MEGADEPTH="${NUM_VIEWS_MEGADEPTH:-64}"
NUM_VIEWS_MVS_SYNTH="${NUM_VIEWS_MVS_SYNTH:-24}"
SUBMAP_SIZE="${SUBMAP_SIZE:-12}"
SUBMAP_TRAIN_MODE="${SUBMAP_TRAIN_MODE:-full_token}"
SUBMAP_RETRIEVAL_TOPK="${SUBMAP_RETRIEVAL_TOPK:-0}"
SUBMAP_FETCH_SOURCE="${SUBMAP_FETCH_SOURCE:-frontend}"
SUBMAP_DESCRIPTOR_SOURCE="${SUBMAP_DESCRIPTOR_SOURCE:-frontend}"
ENABLE_PSEUDO_GT="${ENABLE_PSEUDO_GT:-0}"
PSEUDO_GT_CACHE_PATH="${PSEUDO_GT_CACHE_PATH:-}"
SKIP_TEST="${SKIP_TEST:-1}"
EPOCHS="${EPOCHS:-2}"

# Per-dataset sample weights (0 disables a dataset).
SAMPLES_ARKIT="${SAMPLES_ARKIT:-0}"
SAMPLES_SCANNETPP="${SAMPLES_SCANNETPP:-16}"
SAMPLES_SCANNET="${SAMPLES_SCANNET:-0}"
SAMPLES_HYPERSIM="${SAMPLES_HYPERSIM:-16}"
SAMPLES_BLENDEDMVS="${SAMPLES_BLENDEDMVS:-0}"
SAMPLES_MEGADEPTH="${SAMPLES_MEGADEPTH:-0}"
SAMPLES_MVS_SYNTH="${SAMPLES_MVS_SYNTH:-16}"
# Empty means "derive from the active datasets" later in the script.
GLOBAL_NUM_VIEWS="${GLOBAL_NUM_VIEWS:-}"
69
+
70
# Fail fast on inconsistent pseudo-GT configuration.
if [[ "$ENABLE_PSEUDO_GT" == "1" ]]; then
    if [[ -z "$PSEUDO_GT_CACHE_PATH" || "$PSEUDO_GT_CACHE_PATH" == "null" ]]; then
        echo "ERROR: ENABLE_PSEUDO_GT=1 requires PSEUDO_GT_CACHE_PATH to be set."
        exit 1
    fi
fi

if [[ ! -f "$PRETRAINED" ]]; then
    echo "ERROR: Missing pretrained checkpoint: $PRETRAINED"
    exit 1
fi

# Activate the conda environment this job runs under.
if [[ ! -f "$CONDA_SH" ]]; then
    echo "ERROR: Missing conda init script: $CONDA_SH"
    exit 1
fi
source "$CONDA_SH"
conda activate "$CONDA_ENV_NAME"
export PATH="$CONDA_PREFIX/bin:$PATH"
# Best-effort CUDA toolkit module load on clusters that provide `module`.
if command -v module >/dev/null 2>&1; then
    module load cuda12.1/toolkit || true
fi

# Runtime environment for the training processes.
export PYTHONPATH="$PROJECT_DIR/src:$PROJECT_DIR:${PYTHONPATH:-}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-4}"
export HYDRA_FULL_ERROR="${HYDRA_FULL_ERROR:-1}"
export PYTORCH_ALLOC_CONF="${PYTORCH_ALLOC_CONF:-expandable_segments:True}"
export PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}"

# Re-export every knob so child processes (accelerate workers) inherit them.
export CONDA_SH CONFIG_NAME EXP_NAME MASTER_PORT DIST_STRATEGY AUTO_DISABLE_MISSING
export SAVE_DIR PRETRAINED RESUME NUM_GPUS
export TRAIN_SUBMAP_MODULES_ONLY DETACH_FRONTEND_TOKENS
export NUM_VIEWS_ALL NUM_VIEWS_ARKIT NUM_VIEWS_SCANNETPP NUM_VIEWS_SCANNET
export NUM_VIEWS_HYPERSIM NUM_VIEWS_BLENDEDMVS NUM_VIEWS_MEGADEPTH NUM_VIEWS_MVS_SYNTH
export SUBMAP_TRAIN_MODE SUBMAP_RETRIEVAL_TOPK SUBMAP_FETCH_SOURCE SUBMAP_DESCRIPTOR_SOURCE
export ENABLE_PSEUDO_GT PSEUDO_GT_CACHE_PATH
export SAMPLES_ARKIT SAMPLES_SCANNETPP SAMPLES_SCANNET SAMPLES_HYPERSIM
export SAMPLES_BLENDEDMVS SAMPLES_MEGADEPTH SAMPLES_MVS_SYNTH
export DATA_ROOT ROOT_ARKIT ROOT_SCANNETPP ROOT_SCANNET ROOT_SCANNET_FALLBACK
export ROOT_HYPERSIM ROOT_BLENDEDMVS ROOT_MEGADEPTH ROOT_MVS_SYNTH
export SKIP_TEST GLOBAL_NUM_VIEWS
142
+
143
# Cheap readiness probe: does this dataset root contain the file layout the
# loaders expect? Probe expressions are mirrored by dataset_probe_hint below.
dataset_is_ready() {
    local ds="$1"
    local root="$2"
    case "$ds" in
        "ARKitScenes")
            [ -f "$root/Training/all_metadata.npz" ]
            ;;
        "ScanNet++")
            [ -f "$root/all_metadata.npz" ]
            ;;
        "ScanNet")
            [ -d "$root/scans_train" ] && [ -n "$(find "$root/scans_train" -mindepth 2 -maxdepth 2 -type f -name 'new_scene_metadata.npz' -print -quit 2>/dev/null)" ]
            ;;
        "HyperSim")
            [ -d "$root" ] && [ -n "$(find "$root" -mindepth 3 -maxdepth 3 -type f -name '*rgb.png' -print -quit 2>/dev/null)" ]
            ;;
        "BlendedMVS")
            [ -f "$root/new_overlap.h5" ]
            ;;
        "MegaDepth")
            [ -f "$root/megadepth_sets_64.npz" ]
            ;;
        "MVS-Synth")
            [ -d "$root" ] && [ -n "$(find "$root" -mindepth 2 -maxdepth 2 -type d -name 'cam' -print -quit 2>/dev/null)" ]
            ;;
        *)
            # Unknown datasets: fall back to a bare existence check.
            [ -e "$root" ]
            ;;
    esac
}
173
+
174
# Human-readable description of the probe used by dataset_is_ready; shown in
# the warning/error messages when a dataset root fails validation.
dataset_probe_hint() {
    local ds="$1"
    local hint="required dataset files"
    case "$ds" in
        "ARKitScenes") hint="Training/all_metadata.npz" ;;
        "ScanNet++")   hint="all_metadata.npz" ;;
        "ScanNet")     hint="scans_train/*/new_scene_metadata.npz" ;;
        "HyperSim")    hint="scene/subscene/*rgb.png" ;;
        "BlendedMVS")  hint="new_overlap.h5" ;;
        "MegaDepth")   hint="megadepth_sets_64.npz" ;;
        "MVS-Synth")   hint="*/cam" ;;
    esac
    echo "$hint"
}
203
+
204
# Echo the first ScanNet root that passes the readiness probe. Prefers $1,
# falls back to $2 (logging the switch to stderr so the stdout capture stays
# clean), and echoes $1 when neither is ready so the later missing-dataset
# handling reports the preferred path.
resolve_scannet_root() {
    local primary="$1"
    local backup="$2"
    if dataset_is_ready "ScanNet" "$primary"; then
        echo "$primary"
        return
    fi
    if [[ "$backup" != "$primary" ]] && dataset_is_ready "ScanNet" "$backup"; then
        echo "INFO: ScanNet root $primary is incomplete; falling back to $backup" >&2
        echo "$backup"
        return
    fi
    echo "$primary"
}
218
+
219
# Validate one dataset root. When the probe fails, either zero out the
# dataset's sample weight (AUTO_DISABLE_MISSING=1) or abort the job.
#   $1 label, $2 dataset root, $3 NAME of the weight variable (indirected).
handle_missing_dataset() {
    local ds="$1"
    local root="$2"
    local var_name="$3"
    local current="${!var_name}"
    # Datasets already weighted out need no probing.
    if [[ "$current" -le 0 ]]; then
        return
    fi
    if dataset_is_ready "$ds" "$root"; then
        return
    fi
    local hint
    hint="$(dataset_probe_hint "$ds")"
    if [[ "$AUTO_DISABLE_MISSING" == "1" ]]; then
        echo "WARNING: Missing or incomplete ${ds} dataset root: ${root}"
        echo "WARNING: Expected ${hint} under ${root}"
        echo "WARNING: Disabling ${ds} by setting ${var_name}=0"
        # Write through the indirection so the caller's SAMPLES_* goes to 0.
        printf -v "$var_name" '0'
    else
        echo "ERROR: Missing or incomplete ${ds} dataset root: ${root}"
        echo "ERROR: Expected ${hint} under ${root}"
        exit 1
    fi
}
242
+
243
# Append "<weight> @ ${<token>}" to DATASET_PARTS when the weight is positive.
# The ${...} is emitted literally so Hydra resolves the dataset token later.
append_dataset() {
    local w="$1"
    local tok="$2"
    if [[ "$w" -gt 0 ]]; then
        DATASET_PARTS+=("${w} @ \${${tok}}")
    fi
}
251
+
252
mkdir -p "$SAVE_DIR/$EXP_NAME"

# Prefer the configured ScanNet root but accept the fallback layout.
ROOT_SCANNET="$(resolve_scannet_root "$ROOT_SCANNET" "$ROOT_SCANNET_FALLBACK")"

# Probe every dataset root; incomplete ones are disabled (or fatal — see
# AUTO_DISABLE_MISSING) by zeroing the corresponding SAMPLES_* weight.
handle_missing_dataset "ARKitScenes" "$ROOT_ARKIT" SAMPLES_ARKIT
handle_missing_dataset "ScanNet++" "$ROOT_SCANNETPP" SAMPLES_SCANNETPP
handle_missing_dataset "ScanNet" "$ROOT_SCANNET" SAMPLES_SCANNET
handle_missing_dataset "HyperSim" "$ROOT_HYPERSIM" SAMPLES_HYPERSIM
handle_missing_dataset "BlendedMVS" "$ROOT_BLENDEDMVS" SAMPLES_BLENDEDMVS
handle_missing_dataset "MegaDepth" "$ROOT_MEGADEPTH" SAMPLES_MEGADEPTH
handle_missing_dataset "MVS-Synth" "$ROOT_MVS_SYNTH" SAMPLES_MVS_SYNTH

# NOTE(review): PRETRAINED was already validated near the top of the script;
# the re-check is kept as cheap insurance.
if [[ ! -f "$PRETRAINED" ]]; then
    echo "ERROR: Missing pretrained checkpoint: $PRETRAINED"
    exit 1
fi

# Hydra overrides controlling pseudo ground-truth supervision.
if [[ "$ENABLE_PSEUDO_GT" == "1" ]]; then
    if [[ -z "$PSEUDO_GT_CACHE_PATH" || "$PSEUDO_GT_CACHE_PATH" == "null" ]]; then
        echo "ERROR: ENABLE_PSEUDO_GT=1 requires PSEUDO_GT_CACHE_PATH to be set."
        exit 1
    fi
    PSEUDO_GT_OVERRIDES=(
        "pseudo_gt.enable=true"
        "pseudo_gt.cache_path=${PSEUDO_GT_CACHE_PATH}"
    )
else
    PSEUDO_GT_OVERRIDES=(
        "pseudo_gt.enable=false"
        "pseudo_gt.cache_path=null"
    )
fi
285
+
286
# Convert shell-style 0/1 flags into Hydra booleans.
if [ "$TRAIN_SUBMAP_MODULES_ONLY" = "1" ]; then
    TRAIN_SUBMAP_MODULES_ONLY_HYDRA="true"
else
    TRAIN_SUBMAP_MODULES_ONLY_HYDRA="false"
fi

if [ "$DETACH_FRONTEND_TOKENS" = "1" ]; then
    DETACH_FRONTEND_TOKENS_HYDRA="true"
else
    DETACH_FRONTEND_TOKENS_HYDRA="false"
fi

# Build the weighted dataset mix; zero-weight datasets are dropped.
DATASET_PARTS=()
append_dataset "$SAMPLES_ARKIT" dataset_arkit
append_dataset "$SAMPLES_SCANNETPP" dataset_scannetpp
append_dataset "$SAMPLES_SCANNET" dataset_scannet
append_dataset "$SAMPLES_HYPERSIM" dataset_hypersim
append_dataset "$SAMPLES_BLENDEDMVS" dataset_blendedmvs
append_dataset "$SAMPLES_MEGADEPTH" dataset_megadepth
append_dataset "$SAMPLES_MVS_SYNTH" dataset_mvs_synth

if [ "${#DATASET_PARTS[@]}" -eq 0 ]; then
    echo "ERROR: No training dataset remains after weight filtering."
    exit 1
fi

# GLOBAL_NUM_VIEWS defaults to the maximum view count over active datasets.
# REFACTOR: the original repeated the same if-block seven times, once per
# dataset; a single loop over "samples:views" pairs computes the identical
# maximum and keeps new datasets a one-line change.
if [ -z "$GLOBAL_NUM_VIEWS" ]; then
    GLOBAL_NUM_VIEWS=0
    for views_spec in \
        "$SAMPLES_ARKIT:$NUM_VIEWS_ARKIT" \
        "$SAMPLES_SCANNETPP:$NUM_VIEWS_SCANNETPP" \
        "$SAMPLES_SCANNET:$NUM_VIEWS_SCANNET" \
        "$SAMPLES_HYPERSIM:$NUM_VIEWS_HYPERSIM" \
        "$SAMPLES_BLENDEDMVS:$NUM_VIEWS_BLENDEDMVS" \
        "$SAMPLES_MEGADEPTH:$NUM_VIEWS_MEGADEPTH" \
        "$SAMPLES_MVS_SYNTH:$NUM_VIEWS_MVS_SYNTH"; do
        spec_samples="${views_spec%%:*}"
        spec_views="${views_spec##*:}"
        if [ "$spec_samples" -gt 0 ] && [ "$spec_views" -gt "$GLOBAL_NUM_VIEWS" ]; then
            GLOBAL_NUM_VIEWS="$spec_views"
        fi
    done
fi
336
+
337
# Join weighted dataset specs with " + " into the Hydra train_dataset string.
TRAIN_DATASET="${DATASET_PARTS[0]}"
for part in "${DATASET_PARTS[@]:1}"; do
    TRAIN_DATASET+=" + ${part}"
done

# Assemble Hydra overrides; extra CLI arguments are appended last so they win.
HYDRA_ARGS=(
    "--config-name" "$CONFIG_NAME"
    "exp_name=$EXP_NAME"
    "save_dir=$SAVE_DIR"
    "pretrained=$PRETRAINED"
    "resume=$RESUME"
    "data_root=$DATA_ROOT"
    "root_arkit=$ROOT_ARKIT"
    "root_scannetpp=$ROOT_SCANNETPP"
    "root_scannet=$ROOT_SCANNET"
    "root_hypersim=$ROOT_HYPERSIM"
    "root_blendedmvs=$ROOT_BLENDEDMVS"
    "root_megadepth=$ROOT_MEGADEPTH"
    "root_mvs_synth=$ROOT_MVS_SYNTH"
    "num_views=$GLOBAL_NUM_VIEWS"
    "num_views_arkit=$NUM_VIEWS_ARKIT"
    "num_views_scannetpp=$NUM_VIEWS_SCANNETPP"
    "num_views_scannet=$NUM_VIEWS_SCANNET"
    "num_views_hypersim=$NUM_VIEWS_HYPERSIM"
    "num_views_blendedmvs=$NUM_VIEWS_BLENDEDMVS"
    "num_views_megadepth=$NUM_VIEWS_MEGADEPTH"
    "num_views_mvs_synth=$NUM_VIEWS_MVS_SYNTH"
    "train_submap_modules_only=$TRAIN_SUBMAP_MODULES_ONLY_HYDRA"
    "detach_frontend_tokens=$DETACH_FRONTEND_TOKENS_HYDRA"
    "submap_train_mode=$SUBMAP_TRAIN_MODE"
    "submap_retrieval_topk=$SUBMAP_RETRIEVAL_TOPK"
    "submap_fetch_source=$SUBMAP_FETCH_SOURCE"
    "submap_descriptor_source=$SUBMAP_DESCRIPTOR_SOURCE"
    "${PSEUDO_GT_OVERRIDES[@]}"
    "train_dataset=$TRAIN_DATASET"
    "epochs=$EPOCHS"
)
# An empty test_dataset override disables evaluation entirely.
if [[ "$SKIP_TEST" == "1" ]]; then
    HYDRA_ARGS+=("test_dataset=")
fi
HYDRA_ARGS+=("$@")
378
+
379
# Echo the effective configuration for the job log.
echo "=== Starting 2GPU local smoke: joint backend+submap with frozen frontend tokens ==="
echo " Project dir : $PROJECT_DIR"
echo " Launch entry : inline"
echo " Config : $CONFIG_NAME"
echo " Experiment : $EXP_NAME"
echo " Save dir : $SAVE_DIR"
echo " Pretrained : $PRETRAINED"
echo " Resume : $RESUME"
echo " Num GPUs : $NUM_GPUS"
echo " Distributed strategy : $DIST_STRATEGY"
echo " Train submap only : $TRAIN_SUBMAP_MODULES_ONLY"
echo " Detach frontend tokens: $DETACH_FRONTEND_TOKENS"
echo " Num views : arkit=$NUM_VIEWS_ARKIT scannetpp=$NUM_VIEWS_SCANNETPP scannet=$NUM_VIEWS_SCANNET hypersim=$NUM_VIEWS_HYPERSIM blendedmvs=$NUM_VIEWS_BLENDEDMVS megadepth=$NUM_VIEWS_MEGADEPTH mvs_synth=$NUM_VIEWS_MVS_SYNTH"
echo " Submap size : $SUBMAP_SIZE"
echo " Samples : arkit=$SAMPLES_ARKIT scannetpp=$SAMPLES_SCANNETPP scannet=$SAMPLES_SCANNET hypersim=$SAMPLES_HYPERSIM blendedmvs=$SAMPLES_BLENDEDMVS megadepth=$SAMPLES_MEGADEPTH mvs_synth=$SAMPLES_MVS_SYNTH"
echo " Epochs override : ${EPOCHS:-<config default>}"
echo " Skip test : $SKIP_TEST"
echo " Train dataset : $TRAIN_DATASET"
echo " Global num views : $GLOBAL_NUM_VIEWS"
echo

# accelerate flags shared by every launch mode.
COMMON_ARGS=(
    --num_machines 1
    --num_processes "$NUM_GPUS"
    --main_process_port "$MASTER_PORT"
    --dynamo_backend no
    --mixed_precision bf16
)

case "$DIST_STRATEGY" in
    fsdp)
        accelerate launch \
            "${COMMON_ARGS[@]}" \
            --use_fsdp \
            --fsdp_sharding_strategy FULL_SHARD \
            --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP \
            --fsdp_transformer_layer_cls_to_wrap BlockRope \
            --fsdp_state_dict_type FULL_STATE_DICT \
            --fsdp_backward_prefetch BACKWARD_PRE \
            --fsdp_use_orig_params true \
            --fsdp_sync_module_states true \
            --fsdp_activation_checkpointing true \
            "$SRC_DIR/finetune.py" \
            "${HYDRA_ARGS[@]}"
        ;;
    ddp)
        # --multi_gpu is only valid with more than one process.
        if [ "$NUM_GPUS" -gt 1 ]; then
            accelerate launch \
                --multi_gpu \
                "${COMMON_ARGS[@]}" \
                "$SRC_DIR/finetune.py" \
                "${HYDRA_ARGS[@]}"
        else
            accelerate launch \
                "${COMMON_ARGS[@]}" \
                "$SRC_DIR/finetune.py" \
                "${HYDRA_ARGS[@]}"
        fi
        ;;
    *)
        echo "ERROR: Unsupported DIST_STRATEGY=$DIST_STRATEGY"
        exit 1
        ;;
esac
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/graph_gated_memory.py ADDED
@@ -0,0 +1,850 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GraphGatedMemoryManager: Sparse Windowed & Recursively Retrieved Submap Backend.
3
+
4
+ Key components:
5
+ - SubMapBuffer: CPU-side storage for historical submap tokens + descriptors
6
+ - GraphGatedMemoryManager: differentiable loop closure with [NO_LOOP] gating,
7
+ recursive covisibility fetching, GPU active workspace management
8
+ - TemporalEmbedWrapper: dual-injection temporal embedding (Fix #7)
9
+ - _safe_oom_retry / _build_temporal_mask: helpers
10
+
11
+ All features are toggled via CLI arguments; when disabled, behaviour is
12
+ identical to the original SLAM-Former pipeline. No original source files
13
+ are modified.
14
+ """
15
+
16
+ import math
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+ import numpy as np
21
+ from dataclasses import dataclass, field
22
+ from typing import Dict, List, Optional, Set, Tuple
23
+
24
+
25
+ # ═══════════════════════════════════════════════════════════
26
+ # Helpers
27
+ # ═══════════════════════════════════════════════════════════
28
+
29
+ def _safe_oom_retry(fn, *args, **kwargs):
30
+ """Run *fn*; on CUDA OOM, free cache and retry once."""
31
+ try:
32
+ return fn(*args, **kwargs)
33
+ except RuntimeError as e:
34
+ if "out of memory" in str(e):
35
+ torch.cuda.empty_cache()
36
+ return fn(*args, **kwargs)
37
+ raise
38
+
39
+
40
+ def _build_temporal_mask(frame_id_map: torch.Tensor, P: int) -> torch.Tensor:
41
+ """Build a causal attention mask from a non-contiguous frame-id tensor.
42
+
43
+ Args:
44
+ frame_id_map: [L] int tensor — true temporal frame index for every token.
45
+ P: tokens per frame (patch_h*patch_w + register_tokens).
46
+
47
+ Returns:
48
+ attn_mask: [L, L] float tensor where future-frame positions are -inf.
49
+ """
50
+ L = frame_id_map.shape[0]
51
+ fids = frame_id_map.unsqueeze(1) # [L, 1]
52
+ fids_t = frame_id_map.unsqueeze(0) # [1, L]
53
+ # A token at position i may NOT attend to a token at position j
54
+ # if j belongs to a strictly later frame than i (causal).
55
+ future = fids < fids_t # [L, L] bool
56
+ mask = torch.zeros(L, L, device=frame_id_map.device, dtype=torch.float32)
57
+ mask.masked_fill_(future, float("-inf"))
58
+ return mask
59
+
60
+
61
+ def _build_sinusoidal_pe(max_len: int, dim: int) -> torch.Tensor:
62
+ """Fixed sinusoidal position encoding [max_len, dim]."""
63
+ pe = torch.zeros(max_len, dim)
64
+ position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
65
+ div_term = torch.exp(
66
+ torch.arange(0, dim, 2, dtype=torch.float) * (-math.log(10000.0) / dim)
67
+ )
68
+ pe[:, 0::2] = torch.sin(position * div_term)
69
+ pe[:, 1::2] = torch.cos(position * div_term)
70
+ return pe
71
+
72
+
73
+ # ═══════════════════════════════════════════════════════════
74
+ # SubMapBuffer — CPU-side historical storage
75
+ # ═══════════════════════════════════════════════════════════
76
+
77
+ @dataclass
78
+ class SubMapBuffer:
79
+ """Lightweight CPU buffer for completed submaps.
80
+
81
+ Each entry is keyed by ``submap_id`` (int, monotonically increasing).
82
+
83
+ Attributes:
84
+ cpu_frontend_token_buffer: submap_id → [K, P, 2C] frontend token tensor.
85
+ cpu_backend_token_buffer: submap_id → [K, P, 2C] backend-refined token tensor.
86
+ cpu_frontend_descriptor_buffer: submap_id → [desc_dim] frontend descriptor.
87
+ cpu_backend_descriptor_buffer: submap_id → [desc_dim] backend descriptor.
88
+ cpu_frame_ids: submap_id → list of original frame indices.
89
+ """
90
+ cpu_frontend_token_buffer: Dict[int, torch.Tensor] = field(default_factory=dict)
91
+ cpu_backend_token_buffer: Dict[int, torch.Tensor] = field(default_factory=dict)
92
+ cpu_frontend_descriptor_buffer: Dict[int, torch.Tensor] = field(default_factory=dict)
93
+ cpu_backend_descriptor_buffer: Dict[int, torch.Tensor] = field(default_factory=dict)
94
+ cpu_frame_ids: Dict[int, List[int]] = field(default_factory=dict)
95
+ store_on_cpu: bool = True
96
+ detach_stored: bool = True
97
+ default_token_source: str = "frontend"
98
+ default_descriptor_source: str = "frontend"
99
+ default_writeback_token_source: str = "frontend"
100
+ default_writeback_descriptor_source: str = "frontend"
101
+
102
+ # ── convenience ──────────────────────────────────────
103
+ @property
104
+ def cpu_token_buffer(self) -> Dict[int, torch.Tensor]:
105
+ return self._get_token_buffer(self.default_token_source)
106
+
107
+ @property
108
+ def cpu_descriptor_buffer(self) -> Dict[int, torch.Tensor]:
109
+ return self._get_descriptor_buffer(self.default_descriptor_source)
110
+
111
+ @property
112
+ def num_submaps(self) -> int:
113
+ return len(self.cpu_descriptor_buffer)
114
+
115
+ def _get_token_buffer(self, source: str) -> Dict[int, torch.Tensor]:
116
+ if source == "frontend":
117
+ return self.cpu_frontend_token_buffer
118
+ if source == "backend":
119
+ return self.cpu_backend_token_buffer
120
+ raise ValueError(f"Unsupported token source: {source}")
121
+
122
+ def _get_descriptor_buffer(self, source: str) -> Dict[int, torch.Tensor]:
123
+ if source == "frontend":
124
+ return self.cpu_frontend_descriptor_buffer
125
+ if source == "backend":
126
+ return self.cpu_backend_descriptor_buffer
127
+ raise ValueError(f"Unsupported descriptor source: {source}")
128
+
129
+ def _resolve_token_source(self, source: Optional[str], for_writeback: bool = False) -> str:
130
+ if source is not None:
131
+ return source
132
+ return self.default_writeback_token_source if for_writeback else self.default_token_source
133
+
134
+ def _resolve_descriptor_source(self, source: Optional[str], for_writeback: bool = False) -> str:
135
+ if source is not None:
136
+ return source
137
+ return self.default_writeback_descriptor_source if for_writeback else self.default_descriptor_source
138
+
139
+ def _prepare_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
140
+ if self.detach_stored:
141
+ tensor = tensor.detach()
142
+ if self.store_on_cpu:
143
+ tensor = tensor.cpu()
144
+ return tensor
145
+
146
+ def _move_to_device(self, tensor: torch.Tensor, device: torch.device) -> torch.Tensor:
147
+ if tensor.device == device:
148
+ return tensor
149
+ return tensor.to(device, non_blocking=self.store_on_cpu)
150
+
151
+ def store(
152
+ self,
153
+ submap_id: int,
154
+ frame_ids: List[int],
155
+ frontend_tokens: Optional[torch.Tensor] = None,
156
+ frontend_descriptor: Optional[torch.Tensor] = None,
157
+ backend_tokens: Optional[torch.Tensor] = None,
158
+ backend_descriptor: Optional[torch.Tensor] = None,
159
+ ):
160
+ """Store a completed submap in the configured history banks."""
161
+ if frontend_tokens is not None:
162
+ self.cpu_frontend_token_buffer[submap_id] = self._prepare_tensor(frontend_tokens)
163
+ if frontend_descriptor is not None:
164
+ self.cpu_frontend_descriptor_buffer[submap_id] = self._prepare_tensor(frontend_descriptor)
165
+ if backend_tokens is not None:
166
+ self.cpu_backend_token_buffer[submap_id] = self._prepare_tensor(backend_tokens)
167
+ if backend_descriptor is not None:
168
+ self.cpu_backend_descriptor_buffer[submap_id] = self._prepare_tensor(backend_descriptor)
169
+ self.cpu_frame_ids[submap_id] = list(frame_ids)
170
+
171
+ def fetch_tokens(
172
+ self,
173
+ submap_id: int,
174
+ device: torch.device,
175
+ source: Optional[str] = None,
176
+ ) -> torch.Tensor:
177
+ """Move a submap's tokens to *device*."""
178
+ source = self._resolve_token_source(source)
179
+ return self._move_to_device(self._get_token_buffer(source)[submap_id], device)
180
+
181
+ def fetch_frame_ids(self, submap_id: int) -> List[int]:
182
+ return self.cpu_frame_ids[submap_id]
183
+
184
+ def get_all_descriptors(
185
+ self,
186
+ device: torch.device,
187
+ source: Optional[str] = None,
188
+ ) -> torch.Tensor:
189
+ """Return [num_submaps, desc_dim] on *device*, ordered by submap_id."""
190
+ source = self._resolve_descriptor_source(source)
191
+ descriptor_buffer = self._get_descriptor_buffer(source)
192
+ if not descriptor_buffer:
193
+ return torch.empty(0, device=device)
194
+ ids = sorted(descriptor_buffer.keys())
195
+ descs = torch.stack([descriptor_buffer[i] for i in ids])
196
+ return self._move_to_device(descs, device)
197
+
198
+ def id_at_index(self, index: int, source: Optional[str] = None) -> int:
199
+ """Map a 0-based index (into the descriptor matrix) back to submap_id."""
200
+ source = self._resolve_descriptor_source(source)
201
+ return sorted(self._get_descriptor_buffer(source).keys())[index]
202
+
203
+ def update_descriptor(
204
+ self,
205
+ submap_id: int,
206
+ descriptor: torch.Tensor,
207
+ source: Optional[str] = None,
208
+ ):
209
+ source = self._resolve_descriptor_source(source, for_writeback=True)
210
+ self._get_descriptor_buffer(source)[submap_id] = self._prepare_tensor(descriptor)
211
+
212
+ def update_tokens(
213
+ self,
214
+ submap_id: int,
215
+ tokens: torch.Tensor,
216
+ source: Optional[str] = None,
217
+ ):
218
+ """Write-back refined tokens (after backend) to CPU buffer."""
219
+ source = self._resolve_token_source(source, for_writeback=True)
220
+ self._get_token_buffer(source)[submap_id] = self._prepare_tensor(tokens)
221
+
222
+ def detach_all(self):
223
+ for buffer_dict in (
224
+ self.cpu_frontend_token_buffer,
225
+ self.cpu_backend_token_buffer,
226
+ self.cpu_frontend_descriptor_buffer,
227
+ self.cpu_backend_descriptor_buffer,
228
+ ):
229
+ for sid in list(buffer_dict.keys()):
230
+ buffer_dict[sid] = buffer_dict[sid].detach()
231
+
232
+
233
+ # ═══════════════════════════════════════════════════════════
234
+ # TemporalEmbedWrapper — Dual injection (Fix #7)
235
+ # ═══════════════════════════════════════════════════════════
236
+
237
+ class TemporalEmbedWrapper(nn.Module):
238
+ """Dual temporal embedding: input injection + output injection.
239
+
240
+ Phase 1 (input): Add t_emb to hidden_F[:,:,:C] BEFORE backendT.
241
+ → Temporal info participates in all 36 layers of attention.
242
+ Phase 2 (output): Add projected t_emb to BOTH halves of hidden_B AFTER
243
+ backendT.
244
+ → Guarantees Layer 35 (geometry) and Layer 36 (semantics) both
245
+ carry temporal information for downstream heads.
246
+
247
+ Args:
248
+ embed_dim: feature dimension C (default 1024).
249
+ max_frames: maximum temporal index supported.
250
+ mode: 'learned' | 'sinusoidal'.
251
+ """
252
+
253
+ def __init__(self, embed_dim: int = 1024, max_frames: int = 2000,
254
+ mode: str = "learned"):
255
+ super().__init__()
256
+ self.embed_dim = embed_dim
257
+ self.max_frames = max_frames
258
+ self.mode = mode
259
+
260
+ if mode == "learned":
261
+ self.temporal_embed = nn.Embedding(max_frames, embed_dim)
262
+ elif mode == "sinusoidal":
263
+ pe = _build_sinusoidal_pe(max_frames, embed_dim)
264
+ self.register_buffer("temporal_embed_fixed", pe)
265
+ else:
266
+ raise ValueError(f"Unknown temporal embed mode: {mode}")
267
+
268
+ # Separate projections for the two output halves
269
+ self.output_proj_layer35 = nn.Linear(embed_dim, embed_dim, bias=False)
270
+ self.output_proj_layer36 = nn.Linear(embed_dim, embed_dim, bias=False)
271
+
272
+ # Learnable gates — initialised at 0 (sigmoid → 0.5 at start)
273
+ self.gate_layer35 = nn.Parameter(torch.zeros(1))
274
+ self.gate_layer36 = nn.Parameter(torch.zeros(1))
275
+
276
+ # ── core ─────────────────────────────────────────────
277
+ def get_temporal_embed(self, frame_ids: torch.Tensor) -> torch.Tensor:
278
+ """Return [N, C] temporal embedding for given frame indices."""
279
+ frame_ids = frame_ids.clamp(max=self.max_frames - 1)
280
+ if self.mode == "learned":
281
+ return self.temporal_embed(frame_ids)
282
+ else:
283
+ return self.temporal_embed_fixed[frame_ids]
284
+
285
+ def inject_input(self, hidden_F: torch.Tensor,
286
+ frame_ids: torch.Tensor) -> torch.Tensor:
287
+ """Phase 1: add temporal embedding to first C dims of hidden_F.
288
+
289
+ Args:
290
+ hidden_F: [N, P, 2C] frontend token map.
291
+ frame_ids: [N] long tensor of temporal frame indices.
292
+ Returns:
293
+ hidden_F with temporal embedding added to [:, :, :C].
294
+ """
295
+ N, P, C2 = hidden_F.shape
296
+ C = C2 // 2
297
+ t_emb = self.get_temporal_embed(frame_ids) # [N, C]
298
+ hidden_F = hidden_F.clone()
299
+ hidden_F[:, :, :C] = hidden_F[:, :, :C] + t_emb.unsqueeze(1)
300
+ return hidden_F
301
+
302
+ def inject_output(self, hidden_B: torch.Tensor,
303
+ frame_ids: torch.Tensor) -> torch.Tensor:
304
+ """Phase 2: add projected temporal embedding to BOTH halves of hidden_B.
305
+
306
+ Args:
307
+ hidden_B: [N, P, 2C] backend output (Layer35 || Layer36).
308
+ frame_ids: [N] long tensor.
309
+ Returns:
310
+ hidden_B with gated temporal embedding added to both halves.
311
+ """
312
+ N, P, C2 = hidden_B.shape
313
+ C = C2 // 2
314
+ t_emb = self.get_temporal_embed(frame_ids) # [N, C]
315
+
316
+ t_emb_35 = self.output_proj_layer35(t_emb) # [N, C]
317
+ t_emb_36 = self.output_proj_layer36(t_emb) # [N, C]
318
+
319
+ g35 = torch.sigmoid(self.gate_layer35)
320
+ g36 = torch.sigmoid(self.gate_layer36)
321
+
322
+ hidden_B = hidden_B.clone()
323
+ hidden_B[:, :, :C] = hidden_B[:, :, :C] + g35 * t_emb_35.unsqueeze(1)
324
+ hidden_B[:, :, C:] = hidden_B[:, :, C:] + g36 * t_emb_36.unsqueeze(1)
325
+ return hidden_B
326
+
327
+
328
+ # ═══════════════════════════════════════════════════════════
329
+ # GraphGatedMemoryManager
330
+ # ═══════════════════════════════════════════════════════════
331
+
332
+ class GraphGatedMemoryManager(nn.Module):
333
+ """Sparse Windowed & Recursively Retrieved Submap Backend.
334
+
335
+ Manages:
336
+ * GPU active workspace (S_prev + S_curr, up to 2K frames).
337
+ * Differentiable loop closure with a [NO_LOOP] dummy descriptor.
338
+ * Recursive covisibility fetching via an adjacency graph.
339
+ * CPU ↔ GPU memory offloading.
340
+
341
+ All operations are designed to be DDP-safe (Fix #4): the full
342
+ descriptor / retrieval / backendT path is always executed for every
343
+ batch element, gated by a differentiable multiplier.
344
+
345
+ Args:
346
+ submap_size: K — number of frames per submap.
347
+ max_recursive_submaps: cap on historical submaps fetched at once.
348
+ desc_dim: global descriptor dimension (default 128).
349
+ embed_dim: token feature dimension C (default 1024).
350
+ gumbel_tau: initial Gumbel-Softmax temperature.
351
+ loop_mask_mode: "hard_top1" or "soft_all".
352
+ soft_mask_temperature: temperature for soft_all mode.
353
+ soft_mask_bias: bias for soft_all mode.
354
+ """
355
+
356
+ def __init__(
357
+ self,
358
+ submap_size: int = 10,
359
+ max_recursive_submaps: int = 5,
360
+ desc_dim: int = 128,
361
+ embed_dim: int = 1024,
362
+ gumbel_tau: float = 1.0,
363
+ loop_mask_mode: str = "hard_top1",
364
+ soft_mask_temperature: float = 0.25,
365
+ soft_mask_bias: float = 0.2,
366
+ retain_history_grad: bool = False,
367
+ submap_train_mode: str = "full_token",
368
+ submap_retrieval_topk: int = 5,
369
+ submap_fetch_source: str = "frontend",
370
+ submap_descriptor_source: str = "frontend",
371
+ ):
372
+ super().__init__()
373
+ self.K = submap_size
374
+ self.max_recursive = max_recursive_submaps
375
+ self.desc_dim = desc_dim
376
+ self.embed_dim = embed_dim
377
+ self.gumbel_tau = gumbel_tau
378
+ self.loop_mask_mode = loop_mask_mode
379
+ self.soft_mask_temperature = soft_mask_temperature
380
+ self.soft_mask_bias = soft_mask_bias
381
+ self.retain_history_grad = retain_history_grad
382
+ self.submap_train_mode = submap_train_mode
383
+ self.submap_retrieval_topk = int(submap_retrieval_topk)
384
+ self.submap_fetch_source = submap_fetch_source
385
+ self.submap_descriptor_source = submap_descriptor_source
386
+
387
+ valid_modes = {"full_token", "top5_dual_queue"}
388
+ if self.submap_train_mode not in valid_modes:
389
+ raise ValueError(
390
+ f"Unsupported submap_train_mode: {self.submap_train_mode}. "
391
+ f"Expected one of {sorted(valid_modes)}."
392
+ )
393
+ valid_sources = {"frontend", "backend"}
394
+ if self.submap_fetch_source not in valid_sources:
395
+ raise ValueError(f"Unsupported submap_fetch_source: {self.submap_fetch_source}")
396
+ if self.submap_descriptor_source not in valid_sources:
397
+ raise ValueError(f"Unsupported submap_descriptor_source: {self.submap_descriptor_source}")
398
+
399
+ self.use_dual_queue = self.submap_train_mode == "top5_dual_queue"
400
+ if self.submap_retrieval_topk <= 0:
401
+ self.submap_retrieval_topk = 5 if self.use_dual_queue else 0
402
+
403
+ # ── learnable parameters ─────────────────────────
404
+ # [NO_LOOP] dummy descriptor at index 0
405
+ self.no_loop_descriptor = nn.Parameter(
406
+ torch.randn(1, desc_dim) * 0.02
407
+ )
408
+ # Project pooled tokens → global descriptor
409
+ self.desc_proj = nn.Linear(2 * embed_dim, desc_dim)
410
+
411
+ # ── non-parameter state ──────────────────────────
412
+ self.buffer = self._build_buffer()
413
+ self.adjacency: Dict[int, Set[int]] = {}
414
+
415
+ # Frame-level accumulation for the *current* submap
416
+ self._curr_tokens: List[torch.Tensor] = [] # list of [1, P, 2C]
417
+ self._curr_frame_ids: List[int] = []
418
+ # Tokens for the *previous* submap (kept on GPU for sliding window)
419
+ self._prev_tokens: Optional[torch.Tensor] = None # [K, P, 2C]
420
+ self._prev_frame_ids: List[int] = []
421
+
422
+ self._current_submap_id: int = 0
423
+ self._global_frame_counter: int = 0
424
+
425
+ def _build_buffer(self) -> SubMapBuffer:
426
+ return SubMapBuffer(
427
+ store_on_cpu=not self.retain_history_grad,
428
+ detach_stored=not self.retain_history_grad,
429
+ default_token_source=self.submap_fetch_source,
430
+ default_descriptor_source=self.submap_descriptor_source,
431
+ default_writeback_token_source=(
432
+ "backend" if self.use_dual_queue else self.submap_fetch_source
433
+ ),
434
+ default_writeback_descriptor_source=(
435
+ "backend"
436
+ if (self.use_dual_queue or self.submap_descriptor_source == "backend")
437
+ else self.submap_descriptor_source
438
+ ),
439
+ )
440
+
441
+ # ── properties ───────────────────────────────────────
442
+ @property
443
+ def current_submap_id(self) -> int:
444
+ return self._current_submap_id
445
+
446
+ @property
447
+ def submap_complete(self) -> bool:
448
+ return len(self._curr_tokens) >= self.K
449
+
450
+ # ── accumulate ───────────────────────────────────────
451
+ def accumulate(self, frame_token: torch.Tensor, frame_id: Optional[int] = None):
452
+ """Append a single frame's token to the current submap.
453
+
454
+ Args:
455
+ frame_token: [1, P, 2C] or [P, 2C] — output of frontendT.
456
+ frame_id: original temporal frame index (auto-incremented if None).
457
+ """
458
+ if frame_token.dim() == 2:
459
+ frame_token = frame_token.unsqueeze(0)
460
+ self._curr_tokens.append(frame_token)
461
+ fid = frame_id if frame_id is not None else self._global_frame_counter
462
+ self._curr_frame_ids.append(fid)
463
+ self._global_frame_counter += 1
464
+
465
+ # ── descriptor computation ───────────────────────────
466
+ def compute_descriptor(self, tokens: torch.Tensor) -> torch.Tensor:
467
+ """Pool submap tokens → global descriptor.
468
+
469
+ Args:
470
+ tokens: [K, P, 2C] on GPU.
471
+ Returns:
472
+ descriptor: [1, desc_dim].
473
+ """
474
+ # Mean-pool over frames and patches → [1, 2C]
475
+ device_type = tokens.device.type if tokens.is_cuda else "cpu"
476
+ with torch.amp.autocast(device_type=device_type, enabled=False):
477
+ pooled = tokens.float().mean(dim=(0, 1), keepdim=False).unsqueeze(0)
478
+ pooled = torch.nan_to_num(pooled, nan=0.0, posinf=0.0, neginf=0.0)
479
+ desc = self.desc_proj(pooled.to(dtype=self.desc_proj.weight.dtype)).float()
480
+ return torch.nan_to_num(desc, nan=0.0, posinf=0.0, neginf=0.0)
481
+
482
+ # ── loop retrieval (differentiable, DDP-safe) ────────
483
+ def retrieve(
484
+ self,
485
+ curr_desc: torch.Tensor,
486
+ device: torch.device,
487
+ ) -> Tuple[torch.Tensor, torch.Tensor, List[int], int, int, List[int], List[int], Optional[torch.Tensor]]:
488
+ """Differentiable loop closure retrieval (Fix B2: no Python branching).
489
+
490
+ ALL code paths always fetch tokens so every GPU executes the same
491
+ compute graph. The gate multiplier controls whether retrieved
492
+ tokens contribute to the output.
493
+
494
+ Returns:
495
+ gate: [1, 1] — differentiable: 0 = no loop, 1 = loop.
496
+ retrieved_tokens: [R, P, 2C] on GPU (R may be 0 if no history).
497
+ retrieved_fids: list[int] matching dim-0 of retrieved_tokens.
498
+ n_valid_retrieved: int — how many tokens are real.
499
+ primary_sid: submap id of the primary retrieved submap (-1 if none).
500
+ fetch_ids: list[int] — submap IDs that were fetched (for write-back).
501
+ fetch_token_counts: list[int] — number of tokens per fetched submap.
502
+ retrieval_weights: [R] weights for soft_all mode.
503
+ """
504
+ num_hist = self.buffer.num_submaps
505
+
506
+ if num_hist == 0:
507
+ # No history at all — return empty (no padding waste: A5 fix)
508
+ P = self._curr_tokens[0].shape[1] if self._curr_tokens else 1
509
+ C2 = self._curr_tokens[0].shape[2] if self._curr_tokens else 2 * self.embed_dim
510
+ gate = torch.zeros(1, 1, device=device)
511
+ return gate, torch.empty(0, P, C2, device=device), [], 0, -1, [], [], None
512
+
513
+ if self.loop_mask_mode == "soft_all":
514
+ hist_ids = sorted(self.buffer.cpu_descriptor_buffer.keys())
515
+ prev_sid = self._current_submap_id - 1 if self._current_submap_id > 0 else None
516
+ fetch_ids = [sid for sid in hist_ids if sid != prev_sid]
517
+ P = self._curr_tokens[0].shape[1]
518
+ C2 = self._curr_tokens[0].shape[2]
519
+ if not fetch_ids:
520
+ gate = torch.zeros(1, 1, device=device)
521
+ return gate, torch.empty(0, P, C2, device=device), [], 0, -1, [], [], None
522
+
523
+ hist_descs = torch.stack(
524
+ [self.buffer.cpu_descriptor_buffer[sid] for sid in fetch_ids]
525
+ ).to(device, non_blocking=True).float()
526
+ curr_desc_safe = torch.nan_to_num(curr_desc.float(), nan=0.0, posinf=0.0, neginf=0.0)
527
+ hist_descs = torch.nan_to_num(hist_descs, nan=0.0, posinf=0.0, neginf=0.0)
528
+ sim = F.cosine_similarity(
529
+ curr_desc_safe.unsqueeze(1),
530
+ hist_descs.unsqueeze(0),
531
+ dim=-1,
532
+ )
533
+ sim = torch.nan_to_num(sim, nan=-1.0, posinf=1.0, neginf=-1.0).clamp(min=-1.0, max=1.0)
534
+
535
+ selected_fetch_ids = list(fetch_ids)
536
+ selected_sim = sim
537
+ if self.submap_retrieval_topk > 0 and len(fetch_ids) > self.submap_retrieval_topk:
538
+ topk = min(self.submap_retrieval_topk, len(fetch_ids))
539
+ top_scores, top_indices = torch.topk(sim.squeeze(0), k=topk, dim=-1)
540
+ if topk == 1:
541
+ selected_index_list = [int(top_indices.item())]
542
+ else:
543
+ selected_index_list = [int(idx) for idx in top_indices.tolist()]
544
+ selected_fetch_ids = [fetch_ids[idx] for idx in selected_index_list]
545
+ selected_sim = top_scores.unsqueeze(0)
546
+
547
+ tau = max(float(self.soft_mask_temperature), 1e-6)
548
+ weights = torch.sigmoid((selected_sim - self.soft_mask_bias) / tau)
549
+ weights = torch.nan_to_num(weights, nan=0.0, posinf=1.0, neginf=0.0).clamp(min=0.0, max=1.0)
550
+ gate = weights.max(dim=-1, keepdim=True).values
551
+ selected_idx = int(selected_sim.argmax(dim=-1).item())
552
+ primary_submap_id = selected_fetch_ids[selected_idx]
553
+
554
+ retrieved_list: List[torch.Tensor] = []
555
+ retrieved_fids: List[int] = []
556
+ fetch_token_counts: List[int] = []
557
+ for sid in selected_fetch_ids:
558
+ t = self.buffer.fetch_tokens(sid, device)
559
+ retrieved_list.append(t)
560
+ fetch_token_counts.append(t.shape[0])
561
+ retrieved_fids.extend(self.buffer.fetch_frame_ids(sid))
562
+
563
+ retrieved = torch.cat(retrieved_list, dim=0) if retrieved_list else torch.empty(0, P, C2, device=device)
564
+ n_valid = retrieved.shape[0]
565
+ return gate, retrieved, retrieved_fids, n_valid, primary_submap_id, selected_fetch_ids, fetch_token_counts, weights.squeeze(0)
566
+
567
+ # Build descriptor bank: [NO_LOOP] + all historical
568
+ hist_descs = torch.nan_to_num(
569
+ self.buffer.get_all_descriptors(device).float(),
570
+ nan=0.0,
571
+ posinf=0.0,
572
+ neginf=0.0,
573
+ ) # [H, D]
574
+ bank = torch.cat([self.no_loop_descriptor.float(), hist_descs], dim=0) # [H+1, D]
575
+ curr_desc_safe = torch.nan_to_num(curr_desc.float(), nan=0.0, posinf=0.0, neginf=0.0)
576
+
577
+ # Cosine similarity → Gumbel-Softmax
578
+ sim = F.cosine_similarity(
579
+ curr_desc_safe.unsqueeze(1), # [1, 1, D]
580
+ bank.unsqueeze(0), # [1, H+1, D]
581
+ dim=-1,
582
+ ) # [1, H+1]
583
+ sim = torch.nan_to_num(sim, nan=-1.0, posinf=1.0, neginf=-1.0).clamp(min=-1.0, max=1.0)
584
+ selection = F.gumbel_softmax(sim, tau=max(float(self.gumbel_tau), 1e-6), hard=True, dim=-1)
585
+ selection = torch.nan_to_num(selection, nan=0.0, posinf=1.0, neginf=0.0)
586
+
587
+ selected_idx = selection.argmax(dim=-1).item() # int
588
+
589
+ # Gate: differentiable sum of non-NO_LOOP probabilities
590
+ gate = selection[:, 1:].sum(dim=-1, keepdim=True).clamp(min=0.0, max=1.0) # [1, 1]
591
+
592
+ # ── ALWAYS fetch primary + recursive neighbours (Fix B2) ──
593
+ # When NO_LOOP is selected (idx 0), we still fetch the *best*
594
+ # historical submap's tokens so the compute graph is identical
595
+ # across GPUs; the gate multiplier zeros them out.
596
+ P = self._curr_tokens[0].shape[1]
597
+ C2 = self._curr_tokens[0].shape[2]
598
+ retrieved_list: List[torch.Tensor] = []
599
+ retrieved_fids: List[int] = []
600
+ fetch_token_counts: List[int] = []
601
+
602
+ # Determine which submap to fetch (always pick one)
603
+ if selected_idx > 0:
604
+ primary_submap_id = self.buffer.id_at_index(selected_idx - 1)
605
+ else:
606
+ # NO_LOOP selected — still fetch the highest-similarity submap
607
+ # so the compute graph is the same across GPUs
608
+ non_noloop_sim = sim[0, 1:] # [H]
609
+ fallback_idx = non_noloop_sim.argmax().item()
610
+ primary_submap_id = self.buffer.id_at_index(fallback_idx)
611
+
612
+ fetch_ids = [primary_submap_id]
613
+ neighbours = self.adjacency.get(primary_submap_id, set())
614
+ for nid in sorted(neighbours):
615
+ if len(fetch_ids) >= self.max_recursive:
616
+ break
617
+ if nid != self._current_submap_id and nid in self.buffer.cpu_token_buffer:
618
+ fetch_ids.append(nid)
619
+
620
+ for sid in fetch_ids:
621
+ t = self.buffer.fetch_tokens(sid, device)
622
+ retrieved_list.append(t)
623
+ fetch_token_counts.append(t.shape[0])
624
+ retrieved_fids.extend(self.buffer.fetch_frame_ids(sid))
625
+
626
+ # No fixed-size padding (Fix A5): return actual tokens only
627
+ if retrieved_list:
628
+ retrieved = torch.cat(retrieved_list, dim=0) # [R, P, C2]
629
+ else:
630
+ retrieved = torch.empty(0, P, C2, device=device)
631
+
632
+ n_valid = retrieved.shape[0]
633
+ return gate, retrieved, retrieved_fids, n_valid, primary_submap_id, fetch_ids, fetch_token_counts, None
634
+
635
+ # ── finalize submap ──────────────────────────────────
636
+ def finalize_submap(
637
+ self,
638
+ model,
639
+ device: torch.device,
640
+ temporal_wrapper: Optional[TemporalEmbedWrapper] = None,
641
+ enable_temporal_embed: bool = False,
642
+ enable_loop_closure: bool = False,
643
+ tbptt_window: int = 10,
644
+ ) -> Tuple[torch.Tensor, torch.Tensor, dict]:
645
+ """Finalize the current submap: pool, retrieve, run backend, slide window.
646
+
647
+ DDP-safe: always executes the full compute graph (Fix #4).
648
+ A4 fix: tokens within ``tbptt_window`` recent submaps keep gradients;
649
+ older ones are detached to cap memory.
650
+ A5 fix: retrieved tokens are NOT padded to a fixed size; only real
651
+ tokens enter backendT. The gate multiplier zeros out retrieved
652
+ contributions when NO_LOOP is selected.
653
+
654
+ Args:
655
+ model: SLAMFormer instance.
656
+ device: GPU device.
657
+ temporal_wrapper: TemporalEmbedWrapper (or None).
658
+ enable_temporal_embed: whether to inject temporal embeddings.
659
+ enable_loop_closure: whether to attempt differentiable loop retrieval.
660
+ tbptt_window: number of recent submaps whose stored tokens
661
+ keep gradients (A4 fix). Older submaps are
662
+ detached.
663
+
664
+ Returns:
665
+ backend_out: [N_total, P, 2C] — refined tokens from backendT.
666
+ loop_gate: [1, 1] — differentiable gate (0 = no loop, 1 = loop).
667
+ meta: dict with keys 'n_prev', 'n_curr', 'n_retrieved',
668
+ 'frame_ids' (full list), 'curr_frame_ids'.
669
+ """
670
+ # ── 1. Stack current submap tokens ───────────────
671
+ curr_tokens = torch.cat(self._curr_tokens, dim=0).to(device) # [K, P, 2C]
672
+ curr_desc = self.compute_descriptor(curr_tokens) # [1, D]
673
+
674
+ # ── 2. Loop retrieval (always executed, DDP-safe: Fix B2) ─
675
+ if enable_loop_closure:
676
+ loop_gate, retrieved_tokens, retrieved_fids, n_valid_ret, primary_sid, \
677
+ fetch_ids, fetch_token_counts, retrieval_weights = self.retrieve(curr_desc, device)
678
+ else:
679
+ P, C2 = curr_tokens.shape[1], curr_tokens.shape[2]
680
+ loop_gate = torch.zeros(1, 1, device=device)
681
+ retrieved_tokens = torch.empty(0, P, C2, device=device)
682
+ retrieved_fids = []
683
+ n_valid_ret = 0
684
+ primary_sid = -1
685
+ fetch_ids = []
686
+ fetch_token_counts = []
687
+ retrieval_weights = None
688
+
689
+ # ── 3. Build combined token tensor ───────────────
690
+ parts = []
691
+ fid_parts: List[int] = []
692
+
693
+ if self._prev_tokens is not None:
694
+ parts.append(self._prev_tokens.to(device))
695
+ fid_parts.extend(self._prev_frame_ids)
696
+
697
+ parts.append(curr_tokens)
698
+ fid_parts.extend(self._curr_frame_ids)
699
+
700
+ # A5 fix: only append retrieved tokens if there are any (no zero-padding)
701
+ n_retrieved = 0
702
+ if retrieved_tokens.shape[0] > 0:
703
+ if retrieval_weights is not None and len(fetch_token_counts) == len(retrieval_weights):
704
+ gated_chunks = []
705
+ offset = 0
706
+ for weight, count in zip(retrieval_weights, fetch_token_counts):
707
+ gated_chunks.append(
708
+ retrieved_tokens[offset: offset + count] * weight.reshape(1, 1, 1)
709
+ )
710
+ offset += count
711
+ gated_retrieved = torch.cat(gated_chunks, dim=0) if gated_chunks else retrieved_tokens
712
+ else:
713
+ gated_retrieved = retrieved_tokens * loop_gate.unsqueeze(-1)
714
+ parts.append(gated_retrieved)
715
+ fid_parts.extend(retrieved_fids)
716
+ n_retrieved = gated_retrieved.shape[0]
717
+
718
+ combined = torch.cat(parts, dim=0) if parts else curr_tokens # [N_total, P, 2C]
719
+ frame_ids_tensor = torch.tensor(fid_parts, dtype=torch.long, device=device)
720
+
721
+ # ── 4. Temporal embedding — Phase 1 (input) ──────
722
+ if enable_temporal_embed and temporal_wrapper is not None:
723
+ combined = temporal_wrapper.inject_input(combined, frame_ids_tensor)
724
+
725
+ # ── 5. Run backendT (always, DDP-safe) ───────────
726
+ hidden_B = _safe_oom_retry(model.backendT, combined)
727
+
728
+ # ── 6. Temporal embedding — Phase 2 (output) ─────
729
+ if enable_temporal_embed and temporal_wrapper is not None:
730
+ hidden_B = temporal_wrapper.inject_output(hidden_B, frame_ids_tensor)
731
+
732
+ # ── 7. Update adjacency graph (always, no Python branching) ──
733
+ if enable_loop_closure and primary_sid >= 0:
734
+ cid = self._current_submap_id
735
+ self.adjacency.setdefault(cid, set()).add(primary_sid)
736
+ self.adjacency.setdefault(primary_sid, set()).add(cid)
737
+
738
+ # ── 8. Slice refined tokens for each part ────────
739
+ n_prev = self._prev_tokens.shape[0] if self._prev_tokens is not None else 0
740
+ n_curr = curr_tokens.shape[0]
741
+ completed_submap_id = self._current_submap_id
742
+ prev_sid = completed_submap_id - 1 if n_prev > 0 else None
743
+
744
+ curr_backend_tokens = hidden_B[n_prev:n_prev + n_curr]
745
+ should_store_backend_tokens = self.use_dual_queue or self.submap_fetch_source == "backend"
746
+ should_store_backend_desc = self.use_dual_queue or self.submap_descriptor_source == "backend"
747
+ curr_backend_desc = (
748
+ self.compute_descriptor(curr_backend_tokens) if should_store_backend_desc else None
749
+ )
750
+
751
+ if self.use_dual_queue or should_store_backend_tokens or should_store_backend_desc:
752
+ if prev_sid is not None:
753
+ refined_prev = hidden_B[:n_prev]
754
+ if refined_prev.shape[0] > 0:
755
+ if should_store_backend_tokens:
756
+ self.buffer.update_tokens(prev_sid, refined_prev, source="backend")
757
+ if should_store_backend_desc:
758
+ self.buffer.update_descriptor(
759
+ prev_sid,
760
+ self.compute_descriptor(refined_prev).squeeze(0),
761
+ source="backend",
762
+ )
763
+
764
+ offset = n_prev + n_curr
765
+ for sid, count in zip(fetch_ids, fetch_token_counts):
766
+ refined_ret = hidden_B[offset: offset + count]
767
+ if refined_ret.shape[0] > 0:
768
+ if should_store_backend_tokens:
769
+ self.buffer.update_tokens(sid, refined_ret, source="backend")
770
+ if should_store_backend_desc:
771
+ self.buffer.update_descriptor(
772
+ sid,
773
+ self.compute_descriptor(refined_ret).squeeze(0),
774
+ source="backend",
775
+ )
776
+ offset += count
777
+
778
+ # ── 9. Store current submap
779
+ self.buffer.store(
780
+ submap_id=completed_submap_id,
781
+ frame_ids=self._curr_frame_ids,
782
+ frontend_tokens=curr_tokens,
783
+ frontend_descriptor=curr_desc.squeeze(0),
784
+ backend_tokens=curr_backend_tokens if should_store_backend_tokens else None,
785
+ backend_descriptor=(
786
+ curr_backend_desc.squeeze(0) if curr_backend_desc is not None else None
787
+ ),
788
+ )
789
+
790
+ # ── 10. Release retrieved tensors ─────────────────
791
+ del retrieved_tokens
792
+
793
+ # ── 11. TBPTT: detach memory states periodically ──
794
+ # When submap_id crosses a tbptt_window boundary, detach ALL
795
+ # stored tokens in the buffer and _prev_tokens. This cuts the
796
+ # backward graph at memory-state level (not loss level), so
797
+ # gradients flow within the window but not across.
798
+ next_id = self._current_submap_id + 1
799
+ should_detach_history = (
800
+ tbptt_window is not None and tbptt_window > 0 and next_id % tbptt_window == 0 and next_id > 0
801
+ )
802
+ if should_detach_history:
803
+ self.buffer.detach_all()
804
+
805
+ # ── 12. Slide window ─────────────────────────────
806
+ next_prev_tokens = curr_tokens
807
+ if self.submap_fetch_source == "backend" and should_store_backend_tokens:
808
+ next_prev_tokens = curr_backend_tokens
809
+
810
+ if self.retain_history_grad and not should_detach_history:
811
+ self._prev_tokens = next_prev_tokens
812
+ elif self.retain_history_grad:
813
+ self._prev_tokens = next_prev_tokens.detach()
814
+ else:
815
+ self._prev_tokens = next_prev_tokens.detach().cpu()
816
+ self._prev_frame_ids = list(self._curr_frame_ids)
817
+ self._curr_tokens = []
818
+ self._curr_frame_ids = []
819
+ self._current_submap_id += 1
820
+
821
+ active_curr_desc = curr_desc
822
+ if self.submap_descriptor_source == "backend" and curr_backend_desc is not None:
823
+ active_curr_desc = curr_backend_desc
824
+
825
+ meta = {
826
+ 'submap_id': completed_submap_id,
827
+ 'n_prev': n_prev,
828
+ 'n_curr': n_curr,
829
+ 'n_retrieved': n_retrieved,
830
+ 'frame_ids': fid_parts,
831
+ 'curr_frame_ids': list(self._prev_frame_ids), # after slide, prev = old curr
832
+ 'curr_descriptor': active_curr_desc,
833
+ 'curr_frontend_descriptor': curr_desc,
834
+ 'curr_backend_descriptor': curr_backend_desc,
835
+ 'submap_train_mode': self.submap_train_mode,
836
+ 'submap_descriptor_source': self.submap_descriptor_source,
837
+ }
838
+ return hidden_B, loop_gate, meta
839
+
840
+ # ── reset ────────────────────────────────────────────
841
    def reset(self):
        """Clear all state (e.g. between sequences).

        Rebuilds the submap buffer and clears adjacency/token/frame-id
        bookkeeping so processing restarts at submap 0, frame 0.
        """
        self.buffer = self._build_buffer()  # fresh, empty submap store
        self.adjacency.clear()              # loop-closure graph edges
        self._curr_tokens.clear()           # tokens accumulated for the in-progress submap
        self._curr_frame_ids.clear()
        self._prev_tokens = None            # no previous-submap sliding window yet
        self._prev_frame_ids.clear()
        self._current_submap_id = 0
        self._global_frame_counter = 0
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/mine_pseudo_gt.py ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List, Optional, Tuple
6
+
7
+ import cv2
8
+ import numpy as np
9
+ import torch
10
+
11
def canonical_view_key_from_values(dataset: Optional[str], label: Optional[str]) -> Optional[str]:
    """Join *dataset* and *label* into the canonical ``dataset::label`` key.

    Returns ``None`` when either component is missing.
    """
    if dataset is not None and label is not None:
        return f"{dataset}::{label}"
    return None
15
+
16
def load_payload(path: Optional[str]):
    """Load a cache payload from *path* (``.json``, ``.jsonl``, or a torch pickle).

    Returns ``None`` for an empty/missing path.  ``.jsonl`` files are wrapped
    as ``{"records": [...]}`` so every format exposes a dict-like shape.
    """
    if not path:
        return None
    path = os.path.expanduser(path)
    if path.endswith(".json"):
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    if path.endswith(".jsonl"):
        records = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:  # skip blank lines
                    records.append(json.loads(line))
        return {"records": records}
    # NOTE(review): weights_only=False unpickles arbitrary objects — only load
    # trusted cache files through this path.
    return torch.load(path, map_location="cpu", weights_only=False)
32
+
33
+
34
+ def _scalar(value, default=None):
35
+ if isinstance(value, (list, tuple)):
36
+ if not value:
37
+ return default
38
+ return _scalar(value[0], default)
39
+ if torch.is_tensor(value):
40
+ if value.numel() == 0:
41
+ return default
42
+ return value.detach().reshape(-1)[0].cpu().item()
43
+ return value if value is not None else default
44
+
45
+
46
def _float(value, default=0.0):
    """Coerce *value* (possibly nested list/tuple or tensor) to ``float``.

    Falls back to *default* when the scalarised value cannot be converted.
    """
    scalar = _scalar(value, default)
    try:
        return float(scalar)
    except (TypeError, ValueError):
        return float(default)
52
+
53
+
54
+ def _bool(record: Optional[Dict[str, Any]], score_threshold: float = 0.0) -> Optional[bool]:
55
+ if record is None:
56
+ return None
57
+ for key in ("is_positive", "accepted", "match", "loop"):
58
+ if key in record:
59
+ return bool(record[key])
60
+ tag = str(record.get("tag", "")).strip().lower()
61
+ if tag in {"positive", "pos", "match", "loop", "true", "1"}:
62
+ return True
63
+ if tag in {"negative", "neg", "false", "0"}:
64
+ return False
65
+ score = _float(record.get("score"), _float(record.get("confidence"), 0.0))
66
+ if score_threshold > 0:
67
+ return score >= score_threshold
68
+ return None
69
+
70
+
71
def load_pair_cache(path: Optional[str]) -> Dict[Tuple[str, str], Dict[str, Any]]:
    """Load verification records and index them by an order-independent pair key.

    Accepts several payload shapes: a list of records, ``{"records": [...]}``,
    ``{"pairs": [...]}``, or a ``pair_key -> record`` mapping.  Each record is
    keyed by the lexicographically sorted ``(key_a, key_b)`` tuple.
    """
    payload = load_payload(path)
    if payload is None:
        return {}
    raw_records = payload
    if isinstance(payload, dict):
        # Prefer an explicit record list; otherwise treat the dict itself as the map.
        raw_records = payload.get("records", payload.get("pairs", payload))
    if isinstance(raw_records, dict):
        # pair_key -> record mapping: fold the mapping key into each record.
        iterator = [
            ({**value, "pair_key": key} if isinstance(value, dict) else value)
            for key, value in raw_records.items()
        ]
    else:
        iterator = raw_records
    cache = {}
    for record in iterator:
        if not isinstance(record, dict):
            continue  # tolerate malformed entries
        key_a = record.get("key_a") or record.get("frame_key_a")
        key_b = record.get("key_b") or record.get("frame_key_b")
        # Fall back to reconstructing keys from dataset/label fields.
        if key_a is None:
            key_a = canonical_view_key_from_values(record.get("dataset_a"), record.get("label_a"))
        if key_b is None:
            key_b = canonical_view_key_from_values(record.get("dataset_b"), record.get("label_b"))
        # NOTE(review): this local shadows the module-level pair_key() helper.
        pair_key = record.get("pair_key")
        # Last resort: split an "a||b" composite key.
        if (key_a is None or key_b is None) and isinstance(pair_key, str) and "||" in pair_key:
            key_a, key_b = pair_key.split("||", 1)
        if key_a is None or key_b is None or key_a == key_b:
            continue
        pair = (key_a, key_b) if key_a <= key_b else (key_b, key_a)  # canonical order
        cache[pair] = record
    return cache
103
+
104
+
105
def inverse_pose(pose: np.ndarray) -> np.ndarray:
    """Return the inverse of a 4x4 rigid transform (rotation + translation).

    Uses the closed form ``[R^T, -R^T t]`` instead of a general matrix inverse.
    """
    pose = np.asarray(pose, dtype=np.float32)
    rotation_t = pose[:3, :3].T
    translation = pose[:3, 3]
    result = np.eye(4, dtype=np.float32)
    result[:3, :3] = rotation_t
    result[:3, 3] = -(rotation_t @ translation)
    return result
113
+
114
+
115
def pose_distance(pose_a: np.ndarray, pose_b: np.ndarray) -> float:
    """Euclidean distance between the translation components of two 4x4 poses."""
    delta = np.asarray(pose_a[:3, 3]) - np.asarray(pose_b[:3, 3])
    return float(np.linalg.norm(delta))
117
+
118
+
119
def heading_angle_deg(pose_a: np.ndarray, pose_b: np.ndarray) -> float:
    """Angle in degrees between the forward (+z column) axes of two poses.

    Degenerate (near-zero) forward vectors yield the maximal 180 degrees.
    """
    forward_a = np.asarray(pose_a[:3, 2], dtype=np.float32)
    forward_b = np.asarray(pose_b[:3, 2], dtype=np.float32)
    norm_a = np.linalg.norm(forward_a)
    norm_b = np.linalg.norm(forward_b)
    if min(norm_a, norm_b) <= 1e-8:
        return 180.0
    cosine = float(np.clip(np.dot(forward_a / norm_a, forward_b / norm_b), -1.0, 1.0))
    return float(np.degrees(np.arccos(cosine)))
128
+
129
+
130
def sample_pixels(depth: np.ndarray, sample_points: int, rng: np.random.Generator) -> Optional[np.ndarray]:
    """Sample up to *sample_points* pixel centres ``(u, v)`` with positive depth.

    Returns ``None`` when the depth map has no valid pixels.  Coordinates are
    offset by 0.5 so they refer to pixel centres.
    """
    valid_rc = np.argwhere(depth > 0)
    if valid_rc.size == 0:
        return None
    if len(valid_rc) > sample_points:
        chosen = rng.choice(len(valid_rc), size=sample_points, replace=False)
        valid_rc = valid_rc[chosen]
    # argwhere yields (row, col); swap to image order (u, v) = (col, row).
    return valid_rc[:, [1, 0]].astype(np.float32) + 0.5
139
+
140
+
141
def project_points(uv: np.ndarray, depth: np.ndarray, intrinsics: np.ndarray, pose_src: np.ndarray, pose_dst: np.ndarray):
    """Lift pixel-centre samples *uv* from the source view into 3-D and express
    them in the destination camera frame.

    Returns an ``[N, 3]`` array of points in *pose_dst*'s camera coordinates.
    """
    fx = float(intrinsics[0, 0])
    fy = float(intrinsics[1, 1])
    cx = float(intrinsics[0, 2])
    cy = float(intrinsics[1, 2])
    u = uv[:, 0]
    v = uv[:, 1]
    # uv holds pixel centres (+0.5 offset from sample_pixels); subtract 0.5 to
    # recover integer indices, clipped to the depth-map bounds.
    z = depth[np.clip((v - 0.5).astype(np.int64), 0, depth.shape[0] - 1), np.clip((u - 0.5).astype(np.int64), 0, depth.shape[1] - 1)]
    # Back-project with the pinhole model (guard against zero focal lengths).
    x = (u - cx) * z / max(fx, 1e-6)
    y = (v - cy) * z / max(fy, 1e-6)
    points_cam = np.stack([x, y, z], axis=-1)
    # Camera -> world via pose_src, then world -> destination camera.
    world = points_cam @ pose_src[:3, :3].T + pose_src[:3, 3]
    world_to_dst = inverse_pose(pose_dst)
    points_dst = world @ world_to_dst[:3, :3].T + world_to_dst[:3, 3]
    return points_dst
156
+
157
+
158
def directed_overlap(
    src_depth: np.ndarray,
    dst_depth: np.ndarray,
    intrinsics_src: np.ndarray,
    intrinsics_dst: np.ndarray,
    pose_src: np.ndarray,
    pose_dst: np.ndarray,
    sample_points: int,
    depth_tolerance_ratio: float,
    rng: np.random.Generator,
) -> Dict[str, float]:
    """One-directional view overlap: project sampled src pixels into dst.

    Returns a dict with:
      * ``frustum``    — fraction of samples landing inside dst's image bounds,
      * ``depth``      — fraction whose projected depth agrees with dst's depth
                         map within *depth_tolerance_ratio* relative error,
      * ``consistent`` — absolute count of depth-consistent samples,
      * ``count``      — number of samples drawn from the source view.
    """
    uv = sample_pixels(src_depth, sample_points, rng)
    if uv is None:
        # No valid depth in the source view: zero overlap by definition.
        return {"frustum": 0.0, "depth": 0.0, "consistent": 0.0, "count": 0.0}
    points_dst = project_points(uv, src_depth, intrinsics_src, pose_src, pose_dst)
    z_dst = points_dst[:, 2]
    fx = float(intrinsics_dst[0, 0])
    fy = float(intrinsics_dst[1, 1])
    cx = float(intrinsics_dst[0, 2])
    cy = float(intrinsics_dst[1, 2])
    # Pinhole projection into the destination image plane (z clamped positive).
    u_dst = fx * points_dst[:, 0] / np.clip(z_dst, 1e-6, None) + cx
    v_dst = fy * points_dst[:, 1] / np.clip(z_dst, 1e-6, None) + cy
    inside = (
        (z_dst > 1e-6)
        & (u_dst >= 0.0)
        & (u_dst < dst_depth.shape[1])
        & (v_dst >= 0.0)
        & (v_dst < dst_depth.shape[0])
    )
    frustum = float(inside.mean())
    if not inside.any():
        return {"frustum": frustum, "depth": 0.0, "consistent": 0.0, "count": float(len(uv))}
    # Sample the destination depth map at the rounded projected pixels.
    u_idx = np.clip(np.round(u_dst[inside]).astype(np.int64), 0, dst_depth.shape[1] - 1)
    v_idx = np.clip(np.round(v_dst[inside]).astype(np.int64), 0, dst_depth.shape[0] - 1)
    sampled_dst = dst_depth[v_idx, u_idx]
    valid_dst = sampled_dst > 0
    # Depth-consistent: dst has valid depth AND relative disagreement is small.
    consistent = valid_dst & (np.abs(sampled_dst - z_dst[inside]) / np.clip(sampled_dst, 1e-6, None) <= depth_tolerance_ratio)
    # Ratios are over ALL drawn samples, not just those inside the frustum.
    depth_overlap = float(consistent.sum() / max(1, len(uv)))
    return {
        "frustum": frustum,
        "depth": depth_overlap,
        "consistent": float(consistent.sum()),
        "count": float(len(uv)),
    }
202
+
203
+
204
def symmetric_overlap(frame_a, frame_b, depth_a, depth_b, args, rng: np.random.Generator) -> Dict[str, float]:
    """Bidirectional overlap between two frames.

    Averages both projection directions so the measure is symmetric in (a, b);
    raw sample/consistency counts are summed rather than averaged.
    """
    a_to_b = directed_overlap(
        depth_a,
        depth_b,
        frame_a["intrinsics"],
        frame_b["intrinsics"],
        frame_a["pose"],
        frame_b["pose"],
        args.sample_points,
        args.depth_tolerance_ratio,
        rng,
    )
    b_to_a = directed_overlap(
        depth_b,
        depth_a,
        frame_b["intrinsics"],
        frame_a["intrinsics"],
        frame_b["pose"],
        frame_a["pose"],
        args.sample_points,
        args.depth_tolerance_ratio,
        rng,
    )
    return {
        "frustum_overlap": 0.5 * (a_to_b["frustum"] + b_to_a["frustum"]),
        "depth_overlap": 0.5 * (a_to_b["depth"] + b_to_a["depth"]),
        "geometric_support_count": int(round(a_to_b["consistent"] + b_to_a["consistent"])),
        "sample_count": int(round(a_to_b["count"] + b_to_a["count"])),
    }
233
+
234
+
235
def load_arkitscenes(root: str, split: str) -> List[Dict[str, Any]]:
    """Enumerate ARKitScenes frames (pose, intrinsics, depth/image paths).

    Scenes are listed in ``all_metadata.npz``; per-scene frame data comes from
    ``new_scene_metadata.npz``.  Frames without an on-disk depth file are
    skipped; scenes with no usable frames are dropped.
    """
    split_dir = "Training" if split.lower() == "train" else "Test"
    meta_root = Path(root) / split_dir
    all_metadata = np.load(meta_root / "all_metadata.npz")
    scenes = []
    for scene_name in all_metadata["scenes"]:
        scene_name = str(scene_name)
        scene_dir = meta_root / scene_name
        meta_path = scene_dir / "new_scene_metadata.npz"
        if not meta_path.is_file():
            continue
        with np.load(meta_path, allow_pickle=True) as meta:
            images = meta["images"]
            intrinsics = meta["intrinsics"]
            trajectories = meta["trajectories"]
        frames = []
        for basename, intri, pose in zip(images, intrinsics, trajectories):
            basename = str(basename)
            # Build a 3x3 K matrix; intri indices 2-5 are presumably
            # (fx, fy, cx, cy) — TODO confirm against the dataset exporter.
            K = np.eye(3, dtype=np.float32)
            K[0, 0] = intri[2]
            K[1, 1] = intri[3]
            K[0, 2] = intri[4]
            K[1, 2] = intri[5]
            depth_path = scene_dir / "lowres_depth" / basename
            image_path = scene_dir / "vga_wide" / basename.replace(".png", ".jpg")
            if not depth_path.is_file():
                continue  # depth is required downstream; skip frames lacking it
            frames.append({
                "dataset": "arkitscenes",
                "scene": scene_name,
                "label": f"{scene_name}_{basename}",
                "pose": np.asarray(pose, dtype=np.float32),
                "intrinsics": K,
                "depth_path": str(depth_path),
                "image_path": str(image_path),
            })
        if frames:
            scenes.append({"scene": scene_name, "frames": frames})
    return scenes
274
+
275
+
276
def load_scannetpp(root: str) -> List[Dict[str, Any]]:
    """Enumerate ScanNet++ frames (pose, intrinsics, depth/image paths).

    Mirrors ``load_arkitscenes`` but intrinsics are stored as a ready-made
    matrix and depth/image live under ``depth/`` and ``images/``.
    """
    scenes = []
    all_metadata = np.load(Path(root) / "all_metadata.npz")
    for scene_name in all_metadata["scenes"]:
        scene_name = str(scene_name)
        scene_dir = Path(root) / scene_name
        meta_path = scene_dir / "new_scene_metadata.npz"
        if not meta_path.is_file():
            continue
        with np.load(meta_path, allow_pickle=True) as meta:
            images = meta["images"]
            intrinsics = meta["intrinsics"]
            trajectories = meta["trajectories"]
        frames = []
        for basename, intri, pose in zip(images, intrinsics, trajectories):
            basename = str(basename)
            depth_path = scene_dir / "depth" / f"{basename}.png"
            image_path = scene_dir / "images" / f"{basename}.jpg"
            if not depth_path.is_file():
                continue  # depth is required downstream
            frames.append({
                "dataset": "ScanNet++",
                "scene": scene_name,
                "label": f"{scene_name}_{basename}",
                "pose": np.asarray(pose, dtype=np.float32),
                # intri is used as-is; presumably already a 3x3 K — TODO confirm
                "intrinsics": np.asarray(intri, dtype=np.float32),
                "depth_path": str(depth_path),
                "image_path": str(image_path),
            })
        if frames:
            scenes.append({"scene": scene_name, "frames": frames})
    return scenes
308
+
309
+
310
def load_mvs_synth(root: str) -> List[Dict[str, Any]]:
    """Enumerate MVS-Synth frames from per-scene ``rgb``/``depth``/``cam`` dirs.

    Each frame needs a matching ``cam/<name>.npz`` (pose + intrinsics) and a
    ``depth/<name>.npy``; scenes missing any of the three folders are skipped.
    """
    scenes = []
    for scene_name in sorted(os.listdir(root)):
        scene_dir = Path(root) / scene_name
        rgb_dir = scene_dir / "rgb"
        depth_dir = scene_dir / "depth"
        cam_dir = scene_dir / "cam"
        if not rgb_dir.is_dir() or not depth_dir.is_dir() or not cam_dir.is_dir():
            continue
        basenames = sorted([path.stem for path in rgb_dir.glob("*.jpg")])
        frames = []
        for basename in basenames:
            cam_path = cam_dir / f"{basename}.npz"
            depth_path = depth_dir / f"{basename}.npy"
            if not cam_path.is_file() or not depth_path.is_file():
                continue  # require both calibration and depth
            cam = np.load(cam_path)
            frames.append({
                "dataset": "MVS_Synth",
                "scene": scene_name,
                "label": f"{scene_name}_{basename}",
                "pose": np.asarray(cam["pose"], dtype=np.float32),
                "intrinsics": np.asarray(cam["intrinsics"], dtype=np.float32),
                "depth_path": str(depth_path),
                "image_path": str(rgb_dir / f"{basename}.jpg"),
            })
        if frames:
            scenes.append({"scene": scene_name, "frames": frames})
    return scenes
339
+
340
+
341
def load_depth(frame: Dict[str, Any]) -> np.ndarray:
    """Load the depth map referenced by *frame* as a float32 array.

    ``.npy`` depth (MVS-Synth) is filtered: values beyond the 98th valid
    percentile or above 1000 are zeroed (treated as invalid).  Any other
    extension is read via OpenCV as a 16-bit millimetre image and scaled by
    1/1000.  Non-finite values are zeroed in all cases.

    Raises:
        FileNotFoundError: if an image-format depth file cannot be read
            (``cv2.imread`` returns ``None`` on failure instead of raising).
    """
    depth_path = frame["depth_path"]
    if depth_path.endswith(".npy"):
        depth = np.load(depth_path).astype(np.float32)
        valid = depth > 0
        if valid.any():
            # Clip far outliers: synthetic renders can contain huge depths.
            threshold = np.percentile(depth[valid], 98)
            depth[depth > threshold] = 0.0
        depth[depth > 1000.0] = 0.0
    else:
        raw = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
        if raw is None:
            # Fail loudly instead of crashing later with an opaque
            # AttributeError on None.astype(...).
            raise FileNotFoundError(f"Could not read depth image: {depth_path}")
        depth = raw.astype(np.float32) / 1000.0
    depth[~np.isfinite(depth)] = 0.0
    return depth
354
+
355
+
356
def load_scenes(dataset: str, root: str, split: str) -> List[Dict[str, Any]]:
    """Dispatch to the per-dataset scene loader (case-insensitive name).

    Raises:
        ValueError: for an unrecognised *dataset* name.
    """
    loaders = {
        "arkitscenes": lambda: load_arkitscenes(root, split),
        "scannetpp": lambda: load_scannetpp(root),
        "mvs_synth": lambda: load_mvs_synth(root),
    }
    normalised = dataset.lower()
    if normalised not in loaders:
        raise ValueError(f"Unsupported dataset: {dataset}")
    return loaders[normalised]()
365
+
366
+
367
def limit_frames(frames: List[Dict[str, Any]], args) -> List[Dict[str, Any]]:
    """Subsample *frames* by ``args.frame_stride`` (floored at 1) and cap the
    result at ``args.max_frames_per_scene`` (0 means no cap)."""
    stride = max(1, args.frame_stride)
    subsampled = frames[::stride]
    cap = args.max_frames_per_scene
    return subsampled[:cap] if cap > 0 else subsampled
372
+
373
+
374
def pair_key(frame_a: Dict[str, Any], frame_b: Dict[str, Any]) -> Tuple[str, str]:
    """Order-independent cache key for a frame pair (lexicographically sorted)."""
    key_a = canonical_view_key_from_values(frame_a["dataset"], frame_a["label"])
    key_b = canonical_view_key_from_values(frame_b["dataset"], frame_b["label"])
    low, high = sorted((key_a, key_b))
    return (low, high)
378
+
379
+
380
def lookup_cache(cache: Dict[Tuple[str, str], Dict[str, Any]], frame_a: Dict[str, Any], frame_b: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Fetch the cached verification record for a frame pair, or ``None``."""
    return cache.get(pair_key(frame_a, frame_b))
382
+
383
+
384
def l2m_positive(record: Optional[Dict[str, Any]], args) -> bool:
    """Return ``True`` when an L2M matcher *record* passes every acceptance
    threshold configured on *args* (match count, certainty, inlier ratio)."""
    if record is None:
        return False
    match_count = int(round(_float(record.get("l2m_match_count"), _float(record.get("match_count"), 0.0))))
    certainty = _float(record.get("l2m_mean_certainty"), _float(record.get("mean_certainty"), 0.0))
    inlier_ratio = _float(record.get("l2m_inlier_ratio"), _float(record.get("inlier_ratio"), 0.0))
    if match_count < args.l2m_min_match_count:
        return False
    if certainty < args.l2m_min_certainty:
        return False
    return inlier_ratio >= args.l2m_min_inlier_ratio
395
+
396
+
397
def mine_scene(scene: Dict[str, Any], args, sage_cache, l2m_cache, rng: np.random.Generator):
    """Mine positive / hard-negative frame pairs for a single *scene*.

    Pipeline per candidate pair (frame gap >= ``args.min_frame_gap``):
      1. Coarse pose gating on translation distance and heading angle.
      2. Geometric verification via symmetric depth-reprojection overlap.
      3. Optional SAGE-cache veto/confirmation and L2M-cache rescue.

    Returns a list of record dicts (positives tagged ``"positive"``, hard
    negatives tagged ``"hard_negative"``).
    """
    frames = limit_frames(scene["frames"], args)
    if len(frames) <= args.min_frame_gap:
        return []  # too few frames to form any valid pair
    depth_cache: Dict[str, np.ndarray] = {}  # path -> loaded depth (per scene)
    records = []
    num_pairs = 0
    for i in range(len(frames)):
        for j in range(i + args.min_frame_gap, len(frames), max(1, args.pair_step)):
            # Budget only counts pairs that reach the (expensive) overlap check.
            if args.max_pairs_per_scene > 0 and num_pairs >= args.max_pairs_per_scene:
                return records
            frame_a = frames[i]
            frame_b = frames[j]
            dist = pose_distance(frame_a["pose"], frame_b["pose"])
            heading = heading_angle_deg(frame_a["pose"], frame_b["pose"])
            # Coarse gates: positive pairs must be close in pose; hard
            # negatives may be somewhat farther / more rotated.
            positive_coarse = dist <= args.max_translation and heading <= args.max_heading_deg
            negative_coarse = dist <= args.hard_negative_max_translation and heading <= args.hard_negative_max_heading_deg
            if not positive_coarse and not negative_coarse:
                continue
            # Lazy-load depth maps once per frame for the whole scene.
            if frame_a["depth_path"] not in depth_cache:
                depth_cache[frame_a["depth_path"]] = load_depth(frame_a)
            if frame_b["depth_path"] not in depth_cache:
                depth_cache[frame_b["depth_path"]] = load_depth(frame_b)
            overlap = symmetric_overlap(frame_a, frame_b, depth_cache[frame_a["depth_path"]], depth_cache[frame_b["depth_path"]], args, rng)
            frustum_overlap = overlap["frustum_overlap"]
            depth_overlap = overlap["depth_overlap"]
            geometric_support_count = overlap["geometric_support_count"]
            num_pairs += 1
            sage_record = lookup_cache(sage_cache, frame_a, frame_b)
            l2m_record = lookup_cache(l2m_cache, frame_a, frame_b)
            sage_pass = _bool(sage_record, args.sage_min_score)  # True/False/None
            l2m_pass = l2m_positive(l2m_record, args)
            if positive_coarse and frustum_overlap >= args.min_frustum_overlap and depth_overlap >= args.min_depth_overlap:
                # Geometry accepts; external caches can veto or strengthen.
                accepted = True
                verification_path = "geometry_only"
                if sage_cache:
                    if sage_pass is False:
                        accepted = False  # explicit SAGE rejection vetoes
                    elif sage_pass is True:
                        verification_path = "geometry+sage"
                if l2m_cache:
                    if l2m_pass and verification_path == "geometry+sage":
                        verification_path = "geometry+sage+l2m"
                    elif l2m_pass and verification_path != "geometry+sage":
                        # L2M evidence can rescue a SAGE-vetoed pair.
                        verification_path = "geometry+l2m_rescue"
                        accepted = True
                    elif sage_cache and sage_pass is False:
                        accepted = False
                if accepted:
                    l2m_match_count = int(round(_float((l2m_record or {}).get("l2m_match_count"), _float((l2m_record or {}).get("match_count"), 0.0))))
                    l2m_mean_certainty = _float((l2m_record or {}).get("l2m_mean_certainty"), _float((l2m_record or {}).get("mean_certainty"), 0.0))
                    l2m_inlier_ratio = _float((l2m_record or {}).get("l2m_inlier_ratio"), _float((l2m_record or {}).get("inlier_ratio"), 0.0))
                    # Confidence score: best of geometric overlap and any
                    # passing external-verifier scores.
                    score = max(depth_overlap, frustum_overlap)
                    if sage_pass is True:
                        score = max(score, _float((sage_record or {}).get("score"), _float((sage_record or {}).get("confidence"), score)))
                    if l2m_pass:
                        score = max(score, l2m_mean_certainty)
                    record = {
                        "scene": scene["scene"],
                        "dataset_a": frame_a["dataset"],
                        "dataset_b": frame_b["dataset"],
                        "label_a": frame_a["label"],
                        "label_b": frame_b["label"],
                        "key_a": canonical_view_key_from_values(frame_a["dataset"], frame_a["label"]),
                        "key_b": canonical_view_key_from_values(frame_b["dataset"], frame_b["label"]),
                        "tag": "positive",
                        "is_positive": True,
                        "score": float(score),
                        "soft_overlap_target": float(depth_overlap),
                        "overlap": float(depth_overlap),
                        # Weight clamped to [0.05, 1.0] so no pair is zeroed out.
                        "pair_confidence_weight": float(max(0.05, min(1.0, score))),
                        "weight": float(max(0.05, min(1.0, score))),
                        "pose_distance": float(dist),
                        "heading_deg": float(heading),
                        "frustum_overlap": float(frustum_overlap),
                        "depth_overlap": float(depth_overlap),
                        "geometric_support_count": int(geometric_support_count),
                        "verification_path": verification_path,
                        "l2m_match_count": l2m_match_count,
                        "l2m_mean_certainty": float(l2m_mean_certainty),
                        "l2m_inlier_ratio": float(l2m_inlier_ratio),
                        "image_path_a": frame_a["image_path"],
                        "image_path_b": frame_b["image_path"],
                    }
                    records.append(record)
            elif negative_coarse and frustum_overlap <= args.negative_max_frustum_overlap and depth_overlap <= args.negative_max_depth_overlap:
                # Hard negative: pose-plausible pair with (near-)zero overlap.
                score = max(1.0 - max(frustum_overlap, depth_overlap), 0.0)
                records.append({
                    "scene": scene["scene"],
                    "dataset_a": frame_a["dataset"],
                    "dataset_b": frame_b["dataset"],
                    "label_a": frame_a["label"],
                    "label_b": frame_b["label"],
                    "key_a": canonical_view_key_from_values(frame_a["dataset"], frame_a["label"]),
                    "key_b": canonical_view_key_from_values(frame_b["dataset"], frame_b["label"]),
                    "tag": "hard_negative",
                    "is_positive": False,
                    "score": float(score),
                    "soft_overlap_target": 0.0,
                    "overlap": 0.0,
                    "pair_confidence_weight": float(max(0.05, min(1.0, score))),
                    "weight": float(max(0.05, min(1.0, score))),
                    "pose_distance": float(dist),
                    "heading_deg": float(heading),
                    "frustum_overlap": float(frustum_overlap),
                    "depth_overlap": float(depth_overlap),
                    "geometric_support_count": 0,
                    "verification_path": "geometry_negative",
                    "l2m_match_count": 0,
                    "l2m_mean_certainty": 0.0,
                    "l2m_inlier_ratio": 0.0,
                    "image_path_a": frame_a["image_path"],
                    "image_path_b": frame_b["image_path"],
                })
    return records
512
+
513
+
514
def build_argparser():
    """Build the CLI parser for the pseudo-ground-truth mining script."""
    parser = argparse.ArgumentParser()
    # Dataset / IO.
    parser.add_argument("--dataset", required=True, choices=["arkitscenes", "scannetpp", "mvs_synth"])
    parser.add_argument("--root", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--split", default="train")
    parser.add_argument("--seed", type=int, default=42)
    # Scene / frame / pair subsampling (0 = unlimited for the caps).
    for flag, default in (
        ("--max-scenes", 0),
        ("--max-frames-per-scene", 0),
        ("--frame-stride", 1),
        ("--pair-step", 1),
        ("--min-frame-gap", 12),
        ("--max-pairs-per-scene", 0),
        ("--sample-points", 2048),
    ):
        parser.add_argument(flag, type=int, default=default)
    # Geometric acceptance thresholds.
    for flag, default in (
        ("--depth-tolerance-ratio", 0.05),
        ("--max-translation", 2.5),
        ("--max-heading-deg", 45.0),
        ("--hard-negative-max-translation", 3.5),
        ("--hard-negative-max-heading-deg", 75.0),
        ("--min-frustum-overlap", 0.2),
        ("--min-depth-overlap", 0.1),
        ("--negative-max-frustum-overlap", 0.05),
        ("--negative-max-depth-overlap", 0.02),
    ):
        parser.add_argument(flag, type=float, default=default)
    # External verification caches (SAGE / L2M).
    parser.add_argument("--sage-cache", default=None)
    parser.add_argument("--sage-min-score", type=float, default=0.5)
    parser.add_argument("--l2m-cache", default=None)
    parser.add_argument("--l2m-min-match-count", type=int, default=64)
    parser.add_argument("--l2m-min-certainty", type=float, default=0.5)
    parser.add_argument("--l2m-min-inlier-ratio", type=float, default=0.3)
    return parser
544
+
545
+
546
def main():
    """Mine pseudo-GT pairs for one dataset and write them to a JSON file.

    Loads scenes, runs ``mine_scene`` per scene, prints per-scene progress,
    and writes ``{"metadata": ..., "records": [...]}`` to ``--output``.
    """
    args = build_argparser().parse_args()
    rng = np.random.default_rng(args.seed)  # single RNG shared across scenes
    scenes = load_scenes(args.dataset, args.root, args.split)
    if args.max_scenes > 0:
        scenes = scenes[: args.max_scenes]
    sage_cache = load_pair_cache(args.sage_cache)
    l2m_cache = load_pair_cache(args.l2m_cache)
    all_records = []
    scene_stats = []
    for index, scene in enumerate(scenes):
        records = mine_scene(scene, args, sage_cache, l2m_cache, rng)
        all_records.extend(records)
        positives = sum(1 for record in records if record.get("is_positive", False))
        negatives = sum(1 for record in records if not record.get("is_positive", False))
        scene_stats.append({
            "scene": scene["scene"],
            "num_frames": len(scene["frames"]),
            "num_records": len(records),
            "num_positives": positives,
            "num_negatives": negatives,
        })
        print(f"[{index + 1}/{len(scenes)}] {scene['scene']}: {len(records)} records ({positives} pos / {negatives} neg)")
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Embed the full CLI config so results are reproducible from the output alone.
    metadata = {
        "dataset": args.dataset,
        "root": args.root,
        "split": args.split,
        "num_scenes": len(scenes),
        "num_records": len(all_records),
        "num_positive_records": sum(1 for record in all_records if record.get("is_positive", False)),
        "num_negative_records": sum(1 for record in all_records if not record.get("is_positive", False)),
        "args": vars(args),
        "scene_stats": scene_stats,
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump({"metadata": metadata, "records": all_records}, f, indent=2)
    print(f"Wrote {len(all_records)} records to {output_path}")
585
+
586
+
587
+ if __name__ == "__main__":
588
+ main()
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/pseudo_gt.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import Any, Dict, List, Optional, Sequence, Tuple
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+
9
# Lowercased tag strings accepted as positive/negative labels when
# normalizing raw cache records (see FramePairPseudoGTDatabase._normalize_record).
_POSITIVE_TAGS = {"positive", "pos", "loop", "match", "1", "true"}
_NEGATIVE_TAGS = {"negative", "neg", "hard_negative", "hard-neg", "0", "false"}
11
+
12
+
13
+ def _cfg_get(cfg, key, default=None):
14
+ if cfg is None:
15
+ return default
16
+ if isinstance(cfg, dict):
17
+ return cfg.get(key, default)
18
+ if hasattr(cfg, "get"):
19
+ return cfg.get(key, default)
20
+ return getattr(cfg, key, default)
21
+
22
+
23
+ def _first_scalar(value):
24
+ if isinstance(value, (list, tuple)):
25
+ if len(value) == 0:
26
+ return None
27
+ return _first_scalar(value[0])
28
+ if torch.is_tensor(value):
29
+ if value.numel() == 0:
30
+ return None
31
+ return value.detach().reshape(-1)[0].cpu().item()
32
+ return value
33
+
34
+
35
def _to_str(value) -> Optional[str]:
    """Collapse `value` to its first scalar and render it as a string (None stays None)."""
    scalar = _first_scalar(value)
    return None if scalar is None else str(scalar)
40
+
41
+
42
def _to_float(value, default: float = 0.0) -> float:
    """Best-effort float conversion of the first scalar in `value`, else `default`."""
    scalar = _first_scalar(value)
    if scalar is None:
        return float(default)
    try:
        return float(scalar)
    except (TypeError, ValueError):
        # Non-numeric scalar (e.g. an arbitrary string): fall back.
        return float(default)
50
+
51
+
52
def canonical_view_key_from_values(dataset: Optional[str], label: Optional[str]) -> Optional[str]:
    """Build the `dataset::label` frame key; None when either part is missing."""
    if dataset is None or label is None:
        return None
    return f"{dataset}::{label}"
56
+
57
+
58
def canonical_view_key(view: Dict[str, Any]) -> Optional[str]:
    """Canonical frame key for a dataloader view dict, or None if dataset/label are absent."""
    if not isinstance(view, dict):
        return None
    dataset = _to_str(view.get("dataset"))
    label = _to_str(view.get("label"))
    return canonical_view_key_from_values(dataset, label)
62
+
63
+
64
class FramePairPseudoGTDatabase:
    """Lookup table of pseudo-ground-truth loop-closure labels for frame pairs.

    Records are keyed by an order-independent pair of canonical frame keys
    (see `canonical_view_key`). When the same pair appears more than once in
    the cache, the record with the highest score wins.
    """

    def __init__(self, records: Dict[Tuple[str, str], Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None):
        # records: sorted (key_a, key_b) tuple -> normalized record dict.
        self.records = records
        self.metadata = metadata or {}

    def __len__(self) -> int:
        return len(self.records)

    @classmethod
    def from_file(cls, path: str) -> "FramePairPseudoGTDatabase":
        """Load a cache from a .json, .jsonl, or torch-serialized file.

        Raises:
            FileNotFoundError: if `path` does not exist.
        """
        path = os.path.expanduser(path)
        if not os.path.isfile(path):
            raise FileNotFoundError(f"Pseudo-GT cache not found: {path}")
        if path.endswith(".json"):
            with open(path, "r", encoding="utf-8") as f:
                payload = json.load(f)
        elif path.endswith(".jsonl"):
            # One JSON record per non-empty line.
            raw_records = []
            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        raw_records.append(json.loads(line))
            payload = {"records": raw_records}
        else:
            # Fallback: a torch-serialized payload. NOTE(review):
            # weights_only=False unpickles arbitrary objects -- only load
            # trusted cache files.
            payload = torch.load(path, map_location="cpu", weights_only=False)

        metadata = payload.get("metadata", {}) if isinstance(payload, dict) else {}
        raw_records = payload
        if isinstance(payload, dict):
            # Accept {"records": ...}, {"pairs": ...}, or a bare mapping of records.
            raw_records = payload.get("records", payload.get("pairs", payload))

        records: Dict[Tuple[str, str], Dict[str, Any]] = {}
        if isinstance(raw_records, dict):
            # Mapping form: fold the mapping key into each record as "pair_key".
            iterator = [
                ({**value, "pair_key": key} if isinstance(value, dict) else value)
                for key, value in raw_records.items()
            ]
        else:
            iterator = raw_records
        for raw_record in iterator:
            record = cls._normalize_record(raw_record)
            if record is None:
                continue
            pair = record["pair"]
            previous = records.get(pair)
            # Keep only the highest-scoring record per pair.
            if previous is None or record["score"] > previous["score"]:
                records[pair] = record
        return cls(records, metadata=metadata)

    @classmethod
    def _normalize_record(cls, raw_record: Any) -> Optional[Dict[str, Any]]:
        """Coerce one raw cache entry into the canonical record schema.

        Tries several alternative field names for each value, and returns
        None for malformed entries (non-dicts, missing keys, self-pairs).
        """
        if not isinstance(raw_record, dict):
            return None

        # Frame keys: prefer explicit keys, then dataset/label pairs,
        # then a "keyA||keyB" composite pair_key.
        key_a = _to_str(raw_record.get("key_a") or raw_record.get("frame_key_a"))
        key_b = _to_str(raw_record.get("key_b") or raw_record.get("frame_key_b"))
        if key_a is None:
            key_a = canonical_view_key_from_values(
                _to_str(raw_record.get("dataset_a")),
                _to_str(raw_record.get("label_a")),
            )
        if key_b is None:
            key_b = canonical_view_key_from_values(
                _to_str(raw_record.get("dataset_b")),
                _to_str(raw_record.get("label_b")),
            )
        pair_key = _to_str(raw_record.get("pair_key"))
        if (key_a is None or key_b is None) and pair_key and "||" in pair_key:
            key_a, key_b = pair_key.split("||", 1)
        if key_a is None or key_b is None or key_a == key_b:
            return None
        # Canonical ordering makes the lookup order-independent.
        pair = (key_a, key_b) if key_a <= key_b else (key_b, key_a)

        # An explicit tag overrides the is_positive flag.
        tag = (_to_str(raw_record.get("tag")) or "").strip().lower()
        is_positive = bool(raw_record.get("is_positive", False))
        if tag in _POSITIVE_TAGS:
            is_positive = True
        elif tag in _NEGATIVE_TAGS:
            is_positive = False

        # Scalar fields, each with a chain of fallback names.
        score = _to_float(
            raw_record.get("score"),
            _to_float(raw_record.get("confidence"), _to_float(raw_record.get("pair_confidence_weight"), 0.0)),
        )
        overlap = _to_float(
            raw_record.get("soft_overlap_target"),
            _to_float(raw_record.get("overlap"), score if is_positive else 0.0),
        )
        weight = _to_float(raw_record.get("weight"), _to_float(raw_record.get("pair_confidence_weight"), max(score, overlap)))
        geometric_support_count = int(round(_to_float(raw_record.get("geometric_support_count"), 0.0)))
        l2m_match_count = int(round(_to_float(raw_record.get("l2m_match_count"), 0.0)))
        l2m_mean_certainty = _to_float(raw_record.get("l2m_mean_certainty"), _to_float(raw_record.get("l2m_certainty"), 0.0))
        l2m_inlier_ratio = _to_float(raw_record.get("l2m_inlier_ratio"), 0.0)

        return {
            "pair": pair,
            "is_positive": is_positive,
            "score": float(score),
            "overlap": float(overlap),
            "weight": float(weight),
            "geometric_support_count": geometric_support_count,
            "l2m_match_count": l2m_match_count,
            "l2m_mean_certainty": float(l2m_mean_certainty),
            "l2m_inlier_ratio": float(l2m_inlier_ratio),
        }

    def lookup(self, key_a: Optional[str], key_b: Optional[str]) -> Optional[Dict[str, Any]]:
        """Return the record for an (unordered) frame-key pair, or None."""
        if key_a is None or key_b is None or key_a == key_b:
            return None
        pair = (key_a, key_b) if key_a <= key_b else (key_b, key_a)
        return self.records.get(pair)
176
+
177
+
178
class PseudoGTLoopSupervisor:
    """Turns cached frame-pair pseudo-GT labels into submap-level loop losses.

    Aggregates frame-pair records (from `FramePairPseudoGTDatabase`) into
    per-submap positive/negative targets, then supervises the loop gate
    (binary cross-entropy) and the submap descriptor cosine similarity
    (smooth-L1 regression plus an optional ranking margin).
    """

    def __init__(self, frame_db: FramePairPseudoGTDatabase, pseudo_gt_cfg):
        self.frame_db = frame_db
        # Soft overlap targets vs. hard 0/1 targets for the descriptor loss.
        self.use_soft_targets = bool(_cfg_get(pseudo_gt_cfg, "use_soft_targets", True))
        # Pair records below this score are ignored entirely.
        self.min_confidence = float(_cfg_get(pseudo_gt_cfg, "min_confidence", 0.65))
        # Minimum count of confident positive pairs for a positive submap target.
        self.min_support_pairs = max(1, int(_cfg_get(pseudo_gt_cfg, "min_support_pairs", 1)))
        # Number of best-ranked pairs averaged into the soft target/confidence.
        self.topk_pairs = max(1, int(_cfg_get(pseudo_gt_cfg, "topk_pairs", 4)))
        self.loss_weight_gate = float(_cfg_get(pseudo_gt_cfg, "loss_weight_gate", 0.1))
        self.loss_weight_desc = float(_cfg_get(pseudo_gt_cfg, "loss_weight_desc", 0.1))
        # "hybrid"/"ranking" enable the ranking-margin term on top of regression.
        self.loss_type = str(_cfg_get(pseudo_gt_cfg, "loss_type", "hybrid"))
        # Extra weight multiplier for targets with geometric verification.
        self.geometric_support_scale = float(_cfg_get(pseudo_gt_cfg, "geometric_support_scale", 0.25))
        self.ranking_margin = float(_cfg_get(pseudo_gt_cfg, "ranking_margin", 0.1))
        # Optional gating of geometric support on L2M match quality.
        self.use_l2m = bool(_cfg_get(pseudo_gt_cfg, "use_l2m", False))
        self.l2m_min_certainty = float(_cfg_get(pseudo_gt_cfg, "l2m_min_certainty", 0.0))
        self.l2m_min_inlier_ratio = float(_cfg_get(pseudo_gt_cfg, "l2m_min_inlier_ratio", 0.0))

    @classmethod
    def from_config(cls, pseudo_gt_cfg) -> Optional["PseudoGTLoopSupervisor"]:
        """Build a supervisor from config, or return None when disabled.

        Raises:
            ValueError: if enabled but no cache path is configured.
        """
        if pseudo_gt_cfg is None or not bool(_cfg_get(pseudo_gt_cfg, "enable", False)):
            return None
        cache_path = _cfg_get(pseudo_gt_cfg, "cache_path", None)
        if cache_path in (None, "", "null"):
            raise ValueError("`pseudo_gt.enable=true` requires `pseudo_gt.cache_path`.")
        return cls(FramePairPseudoGTDatabase.from_file(cache_path), pseudo_gt_cfg)

    def _frame_keys(self, batch: Sequence[Dict[str, Any]], frame_ids: Sequence[int]) -> List[str]:
        """Map frame indices to canonical view keys; out-of-range ids are clamped."""
        keys: List[str] = []
        if batch is None:
            return keys
        num_views = len(batch)
        for frame_id in frame_ids:
            if num_views <= 0:
                continue
            # Clamp into [0, num_views - 1] instead of dropping bad ids.
            index = min(max(int(frame_id), 0), num_views - 1)
            key = canonical_view_key(batch[index])
            if key is not None:
                keys.append(key)
        return keys

    def _has_geometry(self, record: Dict[str, Any]) -> bool:
        """True when the record has geometric support passing the L2M quality gates."""
        support = int(record.get("geometric_support_count", 0)) > 0 or int(record.get("l2m_match_count", 0)) > 0
        if not support:
            return False
        # When L2M gating is on, an L2M-supported record must also clear the
        # certainty/inlier-ratio thresholds.
        if self.use_l2m and record.get("l2m_match_count", 0) > 0:
            if float(record.get("l2m_mean_certainty", 0.0)) < self.l2m_min_certainty:
                return False
            if float(record.get("l2m_inlier_ratio", 0.0)) < self.l2m_min_inlier_ratio:
                return False
        return True

    def build_submap_targets(self, batch, current_frame_ids, history_frame_ids_by_submap):
        """Aggregate frame-pair records into one target dict per history submap.

        A submap becomes a positive target when it has at least
        `min_support_pairs` confident positive pairs; it becomes a negative
        target only when it has confident negatives and no positives at all.
        Each target carries: submap_id, binary label, soft label, weight,
        and a geometric-support count.
        """
        current_keys = self._frame_keys(batch, current_frame_ids)
        if not current_keys:
            return []
        targets = []
        for submap_id, history_frame_ids in sorted(history_frame_ids_by_submap.items()):
            history_keys = self._frame_keys(batch, history_frame_ids)
            if not history_keys:
                continue
            # Collect every cached record between current and history frames.
            records = []
            for current_key in current_keys:
                for history_key in history_keys:
                    record = self.frame_db.lookup(current_key, history_key)
                    if record is not None:
                        records.append(record)
            if not records:
                continue
            positives = [record for record in records if record.get("is_positive", False) and record.get("score", 0.0) >= self.min_confidence]
            negatives = [record for record in records if (not record.get("is_positive", False)) and record.get("score", 0.0) >= self.min_confidence]
            if len(positives) >= self.min_support_pairs:
                # Average the top-k strongest positive pairs into the targets.
                ranked = sorted(positives, key=lambda record: max(record.get("weight", 0.0), record.get("overlap", 0.0), record.get("score", 0.0)), reverse=True)[: self.topk_pairs]
                soft_target = sum(record.get("overlap", record.get("score", 0.0)) for record in ranked) / max(1, len(ranked))
                confidence = sum(max(record.get("weight", 0.0), record.get("score", 0.0), record.get("overlap", 0.0)) for record in ranked) / max(1, len(ranked))
                geometry = sum(1 for record in positives if self._has_geometry(record))
                targets.append({
                    "submap_id": int(submap_id),
                    "binary": 1.0,
                    "soft": float(soft_target if self.use_soft_targets else 1.0),
                    "weight": float(max(0.05, min(1.0, confidence))),
                    "geometry": int(geometry),
                })
            elif negatives and not positives:
                ranked = sorted(negatives, key=lambda record: max(record.get("weight", 0.0), record.get("score", 0.0)), reverse=True)[: self.topk_pairs]
                confidence = sum(max(record.get("weight", 0.0), record.get("score", 0.0)) for record in ranked) / max(1, len(ranked))
                targets.append({
                    "submap_id": int(submap_id),
                    "binary": 0.0,
                    "soft": 0.0,
                    "weight": float(max(0.05, min(1.0, confidence))),
                    "geometry": 0,
                })
        return targets

    def compute_loss(self, memory_mgr, batch, hidden_B, meta, loop_gate):
        """Compute the pseudo-GT loop losses for the current submap.

        Returns:
            (total_loss, details) where total_loss is a scalar tensor, or
            (None, {}) when there is nothing to supervise (first submap, no
            history, no cached targets, or no stored history descriptors).
        """
        current_submap_id = int(meta.get("submap_id", -1))
        if current_submap_id <= 0:
            return None, {}
        # Only strictly earlier submaps count as loop-closure history.
        history_frame_ids_by_submap = {
            int(submap_id): list(frame_ids)
            for submap_id, frame_ids in memory_mgr.buffer.cpu_frame_ids.items()
            if int(submap_id) < current_submap_id
        }
        if not history_frame_ids_by_submap:
            return None, {}
        targets = self.build_submap_targets(batch, meta.get("curr_frame_ids", []), history_frame_ids_by_submap)
        if not targets:
            return None, {}

        # Descriptor of the current submap: reuse the precomputed one if
        # available, otherwise rebuild it from the current hidden tokens.
        current_desc = meta.get("curr_descriptor")
        if current_desc is None:
            n_prev = int(meta.get("n_prev", 0))
            n_curr = int(meta.get("n_curr", 0))
            current_tokens = hidden_B[n_prev:n_prev + n_curr]
            if current_tokens.numel() == 0:
                return None, {}
            current_desc = memory_mgr.compute_descriptor(current_tokens)
        current_desc = current_desc.float()

        # Pair each target with its stored history descriptor; drop targets
        # whose descriptor is missing from the CPU buffer.
        valid_targets = []
        history_descs = []
        for target in targets:
            history_desc = memory_mgr.buffer.cpu_descriptor_buffer.get(target["submap_id"])
            if history_desc is None:
                continue
            history_desc = history_desc.reshape(-1).to(current_desc.device, non_blocking=True).float()
            history_descs.append(history_desc)
            valid_targets.append(target)
        if not history_descs:
            return None, {}

        history_descs = torch.stack(history_descs, dim=0)
        # Cosine similarity mapped from [-1, 1] to [0, 1].
        predicted_cosine = F.cosine_similarity(current_desc.expand(history_descs.shape[0], -1), history_descs, dim=-1).clamp(min=-1.0, max=1.0)
        predicted_similarity = 0.5 * (predicted_cosine + 1.0)
        target_binary = torch.tensor([target["binary"] for target in valid_targets], device=predicted_similarity.device, dtype=predicted_similarity.dtype)
        target_soft = torch.tensor([target["soft"] for target in valid_targets], device=predicted_similarity.device, dtype=predicted_similarity.dtype)
        weights = torch.tensor([target["weight"] for target in valid_targets], device=predicted_similarity.device, dtype=predicted_similarity.dtype)
        geometry = torch.tensor([target["geometry"] for target in valid_targets], device=predicted_similarity.device, dtype=predicted_similarity.dtype)
        if self.geometric_support_scale > 0:
            # Boost (capped at 1.5x) targets backed by geometric verification.
            weights = (weights * (1.0 + self.geometric_support_scale * geometry.clamp(max=1.0))).clamp(max=1.5)

        # Gate loss: BCE on the scalar loop gate against "any positive submap".
        gate_loss = predicted_similarity.new_zeros(())
        if self.loss_weight_gate > 0 and torch.is_tensor(loop_gate) and loop_gate.requires_grad:
            gate_target = target_binary.max()
            gate_weight = weights[target_binary.argmax()] if target_binary.numel() > 0 else weights.new_ones(())
            gate_pred = loop_gate.reshape(-1)[0].clamp(min=1e-6, max=1.0 - 1e-6)
            gate_loss = F.binary_cross_entropy(gate_pred, gate_target.clamp(min=0.0, max=1.0), reduction="none") * gate_weight
            gate_loss = gate_loss.mean() * self.loss_weight_gate

        # Descriptor loss: weighted smooth-L1 regression to the soft targets,
        # plus a hinge ranking margin between the best positive and negative.
        descriptor_loss = predicted_similarity.new_zeros(())
        if self.loss_weight_desc > 0:
            regression = F.smooth_l1_loss(predicted_similarity, target_soft.clamp(min=0.0, max=1.0), reduction="none")
            descriptor_loss = (regression * weights).sum() / weights.sum().clamp(min=1e-6)
            if self.loss_type in {"hybrid", "ranking"}:
                pos_mask = target_binary > 0.5
                neg_mask = target_binary < 0.5
                if pos_mask.any() and neg_mask.any():
                    pos_score = predicted_similarity[pos_mask].max()
                    neg_score = predicted_similarity[neg_mask].max()
                    descriptor_loss = descriptor_loss + F.relu(self.ranking_margin - pos_score + neg_score)
            descriptor_loss = descriptor_loss * self.loss_weight_desc

        total_loss = gate_loss + descriptor_loss
        details = {
            "pseudo_gt_gate_loss": float(gate_loss.detach()),
            "pseudo_gt_desc_loss": float(descriptor_loss.detach()),
            "pseudo_gt_total": float(total_loss.detach()),
            "pseudo_gt_pairs": float(len(valid_targets)),
            "pseudo_gt_positive_pairs": float((target_binary > 0.5).sum().detach()),
            "pseudo_gt_negative_pairs": float((target_binary < 0.5).sum().detach()),
        }
        return total_loss, details
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/rerun_helper/__init__.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import trimesh
9
+ from PIL import Image
10
+ from tqdm import tqdm
11
+
12
+ from .geometry_utils import NormalGenerator
13
+
14
+
15
+ import rerun as rr
16
+ from .visualization_utils import reverse_imagenet_normalize, colormap_image
17
+
18
+
19
+ from typing import Dict, Any
20
+
21
# depth prediction normals computer
#PRED_FORMAT_SIZE = [480,640]#[192, 256]
# Target (height, width) every logged image/depth/normal is resized to.
PRED_FORMAT_SIZE = [680,1200]#[192, 256]
# Logging happens on GPU when available; the normal estimator lives there too.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Shared module-level normal estimator, built once for PRED_FORMAT_SIZE.
compute_normals = NormalGenerator(PRED_FORMAT_SIZE[0], PRED_FORMAT_SIZE[1]).to(device)
26
+
27
+
28
def to_device(input_dict, key_ignores=(), device="cuda"):
    """Move tensors in `input_dict` to `device` as floats, in place.

    Args:
        input_dict: dict whose values are tensors (entries listed in
            `key_ignores` are left untouched).
        key_ignores: collection of keys to skip. The default is an immutable
            tuple -- the previous mutable `[]` default is a shared-object
            pitfall (one list object reused across all calls).
        device: target device passed to `.to()`.

    Returns:
        The same dict, mutated in place.
    """
    for key, value in input_dict.items():
        if key not in key_ignores:
            input_dict[key] = value.to(device).float()
    return input_dict
36
+
37
+
38
def log_source_data(src_entity_path: str, src_data: Dict[str, Any]) -> None:
    """Log every source-view camera (pose, intrinsics, RGB) under `src_entity_path`.

    Indexes all tensors with [0] before iterating over the view dimension,
    so this assumes batch size 1.
    """
    # NOTE(review): torch.tensor(...) on an existing tensor copies it and
    # emits a UserWarning; the inner `.to(device)` alone looks sufficient --
    # confirm before changing.
    src_images_k3hw = reverse_imagenet_normalize(
        torch.tensor(src_data["image_b3hw"][0].to(device))
    )
    num_src_cameras = src_data["world_T_cam_b44"][0].shape[0]
    for src_idx in range(num_src_cameras):
        src_cam_path = f"{src_entity_path}/{src_idx}"
        # 4x4 world-from-camera pose and padded intrinsics for this view.
        world_T_cam_44 = src_data["world_T_cam_b44"][0][src_idx].squeeze().cpu().numpy()
        K_44 = src_data["K_s0_b44"][0][src_idx].squeeze().cpu().numpy()
        log_camera(src_cam_path, world_T_cam_44, K_44)
        # Images were already denormalized above, so skip denormalization here.
        log_image(src_cam_path, src_images_k3hw[src_idx], denormalize=False)
49
+
50
+
51
+
52
def log_camera(
    entity_path: str, world_T_cam_44: torch.Tensor, K_44: torch.Tensor, kfd=False, update=False,
) -> None:
    """Log a camera pose (and, for frontend calls, a frustum) to rerun.

    Args:
        entity_path: rerun entity path for this camera.
        world_T_cam_44: 4x4 world-from-camera transform.
        K_44: 4x4 padded intrinsics matrix (not modified by this call).
        kfd: when True, log a pinhole frustum built from the quarter-scale
            intrinsics; otherwise log a fixed-FOV green frustum.
        update: True for backend pose updates -- only the transform is
            logged and any existing frustum is left untouched.
    """
    assert world_T_cam_44.shape == (4, 4)
    assert K_44.shape == (4, 4)
    # Convert and log camera parameters
    Rot, trans = world_T_cam_44[:3, :3], world_T_cam_44[:3, 3]
    # BUGFIX: the original `K_33 = K_44[:3, :3]; K_33[:2] /= 4` divided a
    # slice VIEW in place, silently mutating the caller's K_44 on every call
    # (repeated calls kept shrinking the intrinsics). Multiplying by 1.0
    # materializes a copy first, for both numpy arrays and torch tensors.
    K_33 = K_44[:3, :3] * 1.0
    K_33[:2] /= 4  # intrinsics at quarter resolution to match the frustum size

    rr.log(entity_path, rr.Transform3D(translation=trans, mat3x3=Rot))
    if update:
        # Backend pose refinement: transform only, keep the existing frustum.
        return
    if kfd:
        rr.log(
            entity_path + '/frustum',
            rr.Pinhole(
                image_from_camera=K_33,
                width=PRED_FORMAT_SIZE[1] / 4,
                height=PRED_FORMAT_SIZE[0] / 4,
            ),
        )
    else:
        rr.log(
            entity_path + '/frustum',
            rr.Pinhole(
                fov_y=0.7853982,  # 45 degrees
                aspect_ratio=1.7777778,  # ~16:9
                camera_xyz=None,
                image_plane_distance=0.1,
                color=[0, 255, 0],
                line_width=0.003,
            ),
        )
92
+
93
+
94
+
95
def log_window(
    entity_path: str, world_T_cam_44: torch.Tensor, K_44: torch.Tensor
) -> None:
    """Log only the pose transform for a windowed camera entity (no frustum)."""
    assert world_T_cam_44.shape == (4, 4)
    assert K_44.shape == (4, 4)
    rotation = world_T_cam_44[:3, :3]
    translation = world_T_cam_44[:3, 3]
    rr.log(entity_path, rr.Transform3D(translation=translation, mat3x3=rotation))
103
+
104
+
105
+
106
def log_image(
    entity_path: str, color_frame_b3hw: torch.Tensor, denormalize=True
) -> None:
    """Log one RGB frame, optionally undoing ImageNet normalization first."""
    color_frame_3hw = color_frame_b3hw.squeeze(0)
    if denormalize:
        main_color_3hw = reverse_imagenet_normalize(color_frame_3hw)
    else:
        main_color_3hw = color_frame_3hw
    # CHW float in [0, 1] -> HWC uint8 for PIL.
    rgb_hw3 = main_color_3hw.permute(1, 2, 0).cpu().detach().numpy()
    pil_image = Image.fromarray(np.uint8(rgb_hw3 * 255))
    pil_image = pil_image.resize((PRED_FORMAT_SIZE[1], PRED_FORMAT_SIZE[0]))
    rr.log(f"{entity_path}/image/rgb", rr.Image(pil_image))
120
+
121
+
122
def log_rerun(
    entity_path: str,
    cur_data: Dict[str, Any],
    src_data: Dict[str, Any],
    outputs: Dict[str, Any],
    scene_trimesh_mesh: trimesh.Trimesh,
    should_log_source_cams: bool = True,
) -> None:
    """
    Logs camera intri/extri, depth, rgb, and mesh to rerun.

    Args:
        entity_path: root rerun entity path for this frame.
        cur_data: current-frame tensors (pose, intrinsics, color; batch size 1).
        src_data: source-view tensors, consumed by log_source_data.
        outputs: model outputs; reads "depth_pred_s0_b1hw" and "lowest_cost_bhw".
        scene_trimesh_mesh: fused scene mesh to (re-)log.
        should_log_source_cams: skip the source cameras when False.
    """
    curr_entity_path = f"{entity_path}/current_cam"
    src_entity_path = f"{entity_path}/source_cam"
    if should_log_source_cams:
        log_source_data(src_entity_path, src_data)

    # Current camera pose + intrinsics.
    world_T_cam_44 = cur_data["world_T_cam_b44"].squeeze().cpu().numpy()
    K_44 = cur_data["K_s0_b44"].squeeze().cpu().numpy()
    log_camera(curr_entity_path, world_T_cam_44, K_44)

    # Depth logging
    depth_pred = outputs["depth_pred_s0_b1hw"]
    our_depth_3hw = depth_pred.squeeze(0)
    our_depth_hw3 = our_depth_3hw.permute(1, 2, 0)
    rr.log(
        f"{curr_entity_path}/image/depth",
        rr.DepthImage(our_depth_hw3.numpy(force=True)),
    )

    # Normal logging
    invK_s0_b44 = cur_data["invK_s0_b44"].to(device)
    normals_b3hw = compute_normals(depth_pred, invK_s0_b44)
    # Map normals from [-1, 1] to [0, 1] for visualization.
    our_normals_3hw = 0.5 * (1 + normals_b3hw).squeeze(0)
    pil_normal = Image.fromarray(
        np.uint8(our_normals_3hw.permute(1, 2, 0).cpu().detach().numpy() * 255)
    )
    rr.log(f"{curr_entity_path}/image/normal", rr.Image(pil_normal))

    # Image logging -- prefer the high-resolution color frame when present.
    color_frame_b3hw = (
        cur_data["high_res_color_b3hw"]
        if "high_res_color_b3hw" in cur_data
        else cur_data["image_b3hw"]
    )
    color_frame_3hw = color_frame_b3hw.squeeze(0)
    main_color_3hw = reverse_imagenet_normalize(color_frame_3hw)
    pil_image = Image.fromarray(
        np.uint8(main_color_3hw.permute(1, 2, 0).cpu().detach().numpy() * 255)
    )
    pil_image = pil_image.resize((PRED_FORMAT_SIZE[1], PRED_FORMAT_SIZE[0]))
    rr.log(f"{curr_entity_path}/image/rgb", rr.Image(pil_image))

    # lowest cost guess from the cost volume
    lowest_cost_bhw = outputs["lowest_cost_bhw"]
    lowest_cost_3hw = colormap_image(
        lowest_cost_bhw,
        vmin=0,
        vmax=5,
    )
    pil_cost = Image.fromarray(
        np.uint8(lowest_cost_3hw.permute(1, 2, 0).cpu().detach().numpy() * 255)
    )
    pil_cost = pil_cost.resize((PRED_FORMAT_SIZE[1], PRED_FORMAT_SIZE[0]))
    rr.log("lowest_cost_volume", rr.Image(pil_cost))

    # Fused mesh logging
    rr.log(
        f"{entity_path}/mesh",
        rr.Mesh3D(
            vertex_positions=scene_trimesh_mesh.vertices,
            triangle_indices=scene_trimesh_mesh.faces,
            vertex_colors=scene_trimesh_mesh.visual.vertex_colors,
        ),
    )
196
+
197
+
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/rerun_helper/generic_utils.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import pickle
4
+ from pathlib import Path
5
+
6
+ import kornia
7
+ import torch
8
+ import torchvision.transforms.functional as TF
9
+ from PIL import Image
10
+ from torch import nn
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def copy_code_state(path):
    """Copies the code directory into the path specified using rsync. It will
    use a .gitignore file to exclude files in rsync. We preserve modification
    times in rsync.

    Args:
        path: destination directory (created if missing).
    """

    # create dir
    Path(os.path.join(path)).mkdir(parents=True, exist_ok=True)

    if os.path.exists("./.gitignore"):
        # use .gitignore to remove junk
        rsync_command = (
            f"rsync -art --exclude-from='./.gitignore' --exclude '.git' . {path}"
        )
    else:
        # Fixed typo in the user-facing warning ("exlcude" -> "exclude").
        print("WARNING: no .gitignore found so can't use that to exclude large "
              "files when making a back up of files in copy_code_state.")
        rsync_command = (
            f"rsync -art --exclude '.git' . {path}"
        )
    # NOTE(review): os.system runs through the shell, so `path` must be
    # trusted (no quoting is applied here).
    os.system(rsync_command)
35
+
36
def readlines(filepath):
    """Read a text file and return its lines as a list (trailing newlines stripped)."""
    with open(filepath, 'r') as handle:
        return handle.read().splitlines()
41
+
42
def normalize_depth_single(depth_11hw, mask_11hw, robust=False):
    """Shift/scale-normalize one depth map using its (optionally masked) values.

    Statistics are taken over the central 80% of sorted valid depths (falling
    back to all valid depths when that trim is empty). `robust` switches from
    mean/std to median/mean-absolute-deviation. Maps with no valid pixels are
    returned unchanged.
    """
    if mask_11hw is None:
        valid_depth_vals_N = torch.flatten(depth_11hw)
    else:
        valid_depth_vals_N = depth_11hw.masked_select(mask_11hw)

    num_valid_pix = valid_depth_vals_N.nelement()
    num_percentile_pix = num_valid_pix // 10

    if num_valid_pix == 0:
        return depth_11hw

    # Trim the lowest/highest 10% of valid depths before computing stats.
    sorted_vals = torch.sort(valid_depth_vals_N)[0]
    depth_flat_N = sorted_vals[num_percentile_pix:-num_percentile_pix]
    if depth_flat_N.nelement() == 0:
        depth_flat_N = valid_depth_vals_N

    if robust:
        depth_shift = depth_flat_N.median()
        depth_scale = torch.mean(torch.abs(depth_flat_N - depth_shift))
    else:
        depth_shift = depth_flat_N.mean()
        depth_scale = depth_flat_N.std()

    return (depth_11hw - depth_shift) / depth_scale
71
+
72
+
73
def normalize_depth(depth_b1hw: torch.Tensor,
                    mask_b1hw: torch.Tensor = None,
                    robust: bool = False):
    """Normalize every depth map in the batch independently (see normalize_depth_single)."""
    depths_11hw = torch.split(depth_b1hw, 1, 0)
    if mask_b1hw is None:
        masks_11hw = [None] * len(depths_11hw)
    else:
        masks_11hw = torch.split(mask_b1hw, 1, 0)

    normalized = [
        normalize_depth_single(depth, mask, robust)
        for depth, mask in zip(depths_11hw, masks_11hw)
    ]
    return torch.cat(normalized, dim=0)
85
+
86
+
87
+
88
def upsample(x):
    """Bilinearly upsample the input tensor by a factor of 2 (corners not aligned)."""
    return nn.functional.interpolate(
        x, scale_factor=2, mode="bilinear", align_corners=False,
    )
98
+
99
def batched_trace(mat_bNN):
    """Trace of each matrix in a batch: sum of the main diagonal over the last two dims."""
    diagonals = mat_bNN.diagonal(offset=0, dim1=-1, dim2=-2)
    return diagonals.sum(-1)
101
+
102
def tensor_B_to_bM(tensor_BS, batch_size, num_views):
    """Unpacks a flattened tensor of tupled elements (BS) into bMS. Tuple size
    is M; S is a wildcard for any trailing dims."""
    inflated_shape = [batch_size, num_views] + list(tensor_BS.shape[1:])
    return tensor_BS.view(inflated_shape)
111
+
112
+
113
def tensor_bM_to_B(tensor_bMS):
    """Packs an inflated tensor of tupled elements (bMS) into BS. Tuple size
    is M; S is a wildcard for any trailing dims."""
    batches, views = tensor_bMS.shape[0], tensor_bMS.shape[1]
    flat_shape = [batches * views] + list(tensor_bMS.shape[2:])
    return tensor_bMS.view(flat_shape)
123
+
124
def combine_dims(x, dim_begin, dim_end):
    """Views x with the dimensions from dim_begin to dim_end folded into one."""
    folded_shape = list(x.shape[:dim_begin]) + [-1] + list(x.shape[dim_end:])
    return x.view(folded_shape)
128
+
129
+
130
def to_gpu(input_dict, key_ignores=()):
    """Move tensors in `input_dict` to the GPU as floats, in place.

    Entries whose key appears in `key_ignores` are skipped. The default is an
    immutable tuple -- the previous mutable `[]` default is the classic
    shared-mutable-default pitfall (one list object reused across calls).

    Returns:
        The same dict, mutated in place.
    """
    for key, value in input_dict.items():
        if key not in key_ignores:
            input_dict[key] = value.cuda().float()
    return input_dict
138
+
139
def imagenet_normalize(image):
    """Normalize an image tensor with ImageNet statistics.

    Computes per-channel (x - mean) / std, matching
    torchvision.transforms.functional.normalize, but in pure torch so it
    broadcasts over any leading batch dims of a (..., 3, H, W) tensor.
    """
    mean = torch.tensor((0.485, 0.456, 0.406),
                        dtype=image.dtype, device=image.device).view(-1, 1, 1)
    std = torch.tensor((0.229, 0.224, 0.225),
                       dtype=image.dtype, device=image.device).view(-1, 1, 1)
    return (image - mean) / std
144
+
145
def reverse_imagenet_normalize(image):
    """Reverse ImageNet normalization in an input image.

    Applies per-channel (x - mean) / std with the inverse-normalization
    constants (i.e. x * imagenet_std + imagenet_mean), matching
    torchvision.transforms.functional.normalize but in pure torch so it
    broadcasts over any leading batch dims of a (..., 3, H, W) tensor.
    """
    mean = torch.tensor((-2.11790393, -2.03571429, -1.80444444),
                        dtype=image.dtype, device=image.device).view(-1, 1, 1)
    std = torch.tensor((4.36681223, 4.46428571, 4.44444444),
                       dtype=image.dtype, device=image.device).view(-1, 1, 1)
    return (image - mean) / std
152
+
153
+
154
def read_image_file(filepath,
                    height=None,
                    width=None,
                    value_scale_factor=1.0,
                    resampling_mode=Image.BILINEAR,
                    disable_warning=False,
                    target_aspect_ratio=None):
    """" Reads an image file using PIL, then optionally resizes the image,
    with selective resampling, scales values, and returns the image as a
    tensor

    Args:
        filepath: path to the image.
        height, width: resolution to resize the image to. Both must not be
            None for scaling to take place.
        value_scale_factor: value to scale image values with, default is 1.0
        resampling_mode: resampling method when resizing using PIL. Default
            is PIL.Image.BILINEAR
        disable_warning: suppress the upscaling warning when True.
        target_aspect_ratio: if not None, will crop the image to match this
            aspect ratio. Default is None

    Returns:
        img: tensor with (optionally) scaled and resized image data.

    """
    img = Image.open(filepath)

    if target_aspect_ratio:
        # BUGFIX: PIL's crop returns a NEW image; the original call discarded
        # the return value, so the aspect-ratio crop never took effect.
        img = crop_image_to_target_ratio(img, target_aspect_ratio)

    # resize if both width and height are not none.
    if height is not None and width is not None:
        img_width, img_height = img.size
        # do we really need to resize? If not, skip.
        if (img_width, img_height) != (width, height):
            # warn if it doesn't make sense (upscaling beyond the source).
            if ((width > img_width or height > img_height) and
                    not disable_warning):
                logger.warning(
                    f"WARNING: target size ({width}, {height}) has a "
                    f"dimension larger than input size ({img_width}, "
                    f"{img_height}).")
            img = img.resize((width, height), resample=resampling_mode)

    img = TF.to_tensor(img).float() * value_scale_factor

    return img
201
+
202
def crop_image_to_target_ratio(image, target_aspect_ratio=4.0/3.0):
    """Center-crop a PIL image so its width/height ratio matches the target."""
    current_ratio = image.width / image.height

    if current_ratio > target_aspect_ratio:
        # Too wide: shrink the width, keep the full height.
        cropped_width = image.height * target_aspect_ratio
        left = (image.width - cropped_width) / 2
        right = (image.width + cropped_width) / 2
        # Crop the center of the image
        image = image.crop((left, 0, right, image.height))
    elif current_ratio < target_aspect_ratio:
        # Too tall: shrink the height, keep the full width.
        cropped_height = image.width / target_aspect_ratio
        top = (image.height - cropped_height) / 2
        bottom = (image.height + cropped_height) / 2
        # Crop the center of the image
        image = image.crop((0, top, image.width, bottom))

    return image
232
+
233
def cache_model_outputs(
    output_path,
    outputs,
    cur_data,
    src_data,
    batch_ind,
    batch_size,
):
    """Pickle per-frame model outputs (plus intrinsics and frame ids) to disk.

    Writes one `<frame_id>.pickle` file per batch element containing that
    element's slice of every entry in `outputs`, its intrinsics, its frame
    id, and the ids of its source frames.
    """

    for elem_ind in range(outputs["depth_pred_s0_b1hw"].shape[0]):
        if "frame_id_string" in cur_data:
            frame_id = cur_data["frame_id_string"][elem_ind]
        else:
            frame_id = (batch_ind * batch_size) + elem_ind
        # BUGFIX: the original `f"{str(frame_id):6d}"` always raised
        # ValueError (format code 'd' is invalid for a str); right-align to
        # a minimum width of 6 instead.
        frame_id = f"{str(frame_id):>6}"

        elem_filepath = os.path.join(output_path, f"{frame_id}.pickle")

        elem_output_dict = {}

        # Per-element slice of every model output (None entries pass through).
        for key in outputs:
            if outputs[key] is not None:
                elem_output_dict[key] = outputs[key][elem_ind].unsqueeze(0)
            else:
                elem_output_dict[key] = None

        # include some auxiliary information
        elem_output_dict["K_full_depth_b44"] = cur_data[
            "K_full_depth_b44"
        ][elem_ind].unsqueeze(0)
        elem_output_dict["K_s0_b44"] = cur_data[
            "K_s0_b44"
        ][elem_ind].unsqueeze(0)

        # NOTE(review): this indexes "frame_id_string" unconditionally even
        # though the branch above treats it as optional -- confirm callers
        # always provide it.
        elem_output_dict["frame_id"] = cur_data["frame_id_string"][elem_ind]
        elem_output_dict["src_ids"] = []
        for src_id_list in src_data["frame_id_string"]:
            elem_output_dict["src_ids"].append(src_id_list[elem_ind])

        with open(elem_filepath, 'wb') as handle:
            pickle.dump(elem_output_dict, handle)
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/rerun_helper/geometry_utils.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import kornia
2
+ import numpy as np
3
+ import torch
4
+ import torch.jit as jit
5
+ import torch.nn.functional as F
6
+ from torch import Tensor
7
+
8
+
9
+
10
@torch.jit.script
def to_homogeneous(input_tensor: Tensor, dim: int = 0) -> Tensor:
    """
    Appends a row/plane of ones along `dim`, turning the coordinates in
    `input_tensor` into homogeneous coordinates.
    """
    # A single slice along `dim` gives the right shape for the ones pad.
    pad_slice = input_tensor.select(dim, 0).unsqueeze(dim)
    return torch.cat([input_tensor, torch.ones_like(pad_slice)], dim=dim)
19
+
20
+
21
class BackprojectDepth(jit.ScriptModule):
    """
    Lifts per-pixel depths into homogeneous 3D camera-space points.

    A grid of pixel-centre coordinates (x + 0.5, y + 0.5, 1) is precomputed
    once for a fixed image size and stored as a buffer so it follows the
    module across devices.
    """

    def __init__(self, height: int, width: int):
        super().__init__()

        self.height = height
        self.width = width

        # Pixel-centre grid: xs[v, u] == u and ys[v, u] == v (shape (H, W)).
        xs, ys = torch.meshgrid(
            torch.arange(self.width),
            torch.arange(self.height),
            indexing='xy',
        )
        grid_2hw = torch.stack((xs, ys), dim=0) + 0.5

        # Flatten to (1, 3, H*W) homogeneous pixel coordinates.
        grid_2N = grid_2hw.flatten(1)
        ones_1N = torch.ones_like(grid_2N[:1])
        pix_coords_13N = torch.cat([grid_2N, ones_1N], dim=0).unsqueeze(0)

        # Registered as a buffer so the grid is put on the correct GPU
        # automatically with the module.
        self.register_buffer("pix_coords_13N", pix_coords_13N)

    @jit.script_method
    def forward(self, depth_b1hw: Tensor, invK_b44: Tensor) -> Tensor:
        """
        Backprojects 2D pixels to homogeneous 3D camera points at the depths
        in depth_b1hw, using the inverse intrinsics invK_b44.
        """
        rays_b3N = torch.matmul(invK_b44[:, :3, :3], self.pix_coords_13N)
        points_b3N = depth_b1hw.flatten(start_dim=2) * rays_b3N
        ones_b1N = torch.ones_like(points_b3N[:, :1])
        return torch.cat([points_b3N, ones_b1N], dim=1)
59
+
60
+
61
class Project3D(jit.ScriptModule):
    """
    Projects homogeneous 3D world points into the 2D camera image.
    """
    def __init__(self, eps: float = 1e-8):
        super().__init__()

        # Small constant guarding the divide-by-depth; stored as a buffer so
        # it travels with the module across devices.
        self.register_buffer("eps", torch.tensor(eps).view(1, 1, 1))

    @jit.script_method
    def forward(self, points_b4N: Tensor,
                K_b44: Tensor, cam_T_world_b44: Tensor) -> Tensor:
        """
        Maps world-space homogeneous points through the extrinsics
        cam_T_world_b44 and intrinsics K_b44, returning pixel coordinates
        stacked with depth as a (b, 3, N) tensor.
        """
        proj_b44 = torch.matmul(K_b44, cam_T_world_b44)

        cam_b3N = torch.matmul(proj_b44[:, :3], points_b4N)

        # Guarded homogeneous divide, following Kornia and OpenCV:
        # https://kornia.readthedocs.io/en/latest/_modules/kornia/geometry/conversions.html#convert_points_from_homogeneous
        z_b1N = cam_b3N[:, 2:]
        valid_mask = torch.abs(z_b1N) > self.eps
        depth_b1N = z_b1N + self.eps
        inv_depth = torch.where(
            valid_mask,
            1.0 / depth_b1N,
            torch.tensor(1.0, device=depth_b1N.device),
        )

        uv_b2N = cam_b3N[:, :2] * inv_depth

        return torch.cat([uv_b2N, depth_b1N], dim=1)
89
+
90
+
91
class NormalGenerator(jit.ScriptModule):
    def __init__(self, height: int, width: int,
                 smoothing_kernel_size: int = 5,
                 smoothing_kernel_std: float = 2.0):
        """
        Estimates per-pixel surface normals from depth maps.
        """
        super().__init__()
        self.height = height
        self.width = width

        # Reused to lift smoothed depths into camera-space points.
        self.backproject = BackprojectDepth(self.height, self.width)

        self.kernel_size = smoothing_kernel_size
        self.std = smoothing_kernel_std

    def forward(self, depth_b1hw: Tensor, invK_b44: Tensor) -> Tensor:
        """
        Gaussian-smooths the incoming depth map, backprojects it into camera
        space (see BackprojectDepth), estimates the spatial gradient of those
        points, and takes the normalized cross product of the two gradient
        directions to obtain a unit normal at each location.
        """
        smoothed_b1hw = kornia.filters.gaussian_blur2d(
            depth_b1hw,
            (self.kernel_size, self.kernel_size),
            (self.std, self.std),
        )
        points_b4N = self.backproject(smoothed_b1hw, invK_b44)
        points_b3hw = points_b4N[:, :3].view(-1, 3, self.height, self.width)

        grads_b32hw = kornia.filters.spatial_gradient(points_b3hw)

        normals_b3hw = torch.cross(
            grads_b32hw[:, :, 0],
            grads_b32hw[:, :, 1],
            dim=1,
        )
        return F.normalize(normals_b3hw, dim=1)
132
+
133
+
134
def get_camera_rays(
    world_T_cam_b44,
    world_points_b3N,
    in_camera_frame,
    cam_T_world_b44=None,
    eps=1e-4,
):
    """Compute unit-norm viewing rays from a camera towards 3D points.

    Args:
        world_T_cam_b44: (b, 4, 4) camera-to-world transforms.
        world_points_b3N: (b, 3, N) world-space points.
        in_camera_frame: when True, rays are expressed in the camera frame;
            otherwise they are world-frame directions from the camera center.
        cam_T_world_b44: optional (b, 4, 4) world-to-camera transforms, used
            only when in_camera_frame is True. ROBUSTNESS FIX: if omitted it
            is now derived by inverting world_T_cam_b44 instead of crashing
            on the None default.
        eps: unused; kept for backward compatibility with existing callers.

    Returns:
        rays_b3N: (b, 3, N) unit-length ray directions.
    """

    if in_camera_frame:
        if cam_T_world_b44 is None:
            # Derive the world-to-camera transform rather than failing with a
            # TypeError on the None default.
            cam_T_world_b44 = torch.inverse(world_T_cam_b44)

        batch_size, _, num_points = world_points_b3N.shape
        # Homogenize the points before applying the 3x4 transform.
        ones_b1N = torch.ones(
            batch_size, 1, num_points, device=world_points_b3N.device
        )
        world_points_b4N = torch.cat([world_points_b3N, ones_b1N], dim=1)
        rays_b3N = torch.matmul(cam_T_world_b44[:, :3, :4], world_points_b4N)
    else:
        # World-frame directions: point minus camera center.
        cam_center_b31 = world_T_cam_b44[:, 0:3, 3][:, :, None]
        rays_b3N = world_points_b3N - cam_center_b31.expand(
            world_points_b3N.shape
        )

    rays_b3N = torch.nn.functional.normalize(rays_b3N, dim=1)

    return rays_b3N
167
+
168
+
169
def pose_distance(pose_b44):
    """
    DVMVS-style pose-change magnitude for a batch of 4x4 relative poses.

    Returns a tuple (combined, rotation, translation) of per-element
    measures, each of shape (b,).
    """

    rot_b33 = pose_b44[:, :3, :3]
    trans_b3 = pose_b44[:, :3, 3]

    # Rotation magnitude from the trace; the trace is capped at 3 so the
    # sqrt argument stays non-negative under numerical noise.
    trace_b = rot_b33.diagonal(offset=0, dim1=-1, dim2=-2).sum(-1)
    capped_trace_b = torch.minimum(torch.ones_like(trace_b) * 3.0, trace_b)
    R_measure = torch.sqrt(2 * (1 - capped_trace_b / 3))

    t_measure = torch.norm(trans_b3, dim=1)

    combined_measure = torch.sqrt(t_measure ** 2 + R_measure ** 2)

    return combined_measure, R_measure, t_measure
183
+
184
def qvec2rotmat(qvec):
    """
    Convert a quaternion (w, x, y, z) to a 3x3 rotation matrix.
    """
    w, x, y, z = qvec[0], qvec[1], qvec[2], qvec[3]
    return np.array([
        [1 - 2 * (y * y + z * z), 2 * (x * y - w * z), 2 * (z * x + w * y)],
        [2 * (x * y + w * z), 1 - 2 * (x * x + z * z), 2 * (y * z - w * x)],
        [2 * (z * x - w * y), 2 * (y * z + w * x), 1 - 2 * (x * x + y * y)],
    ])
203
+
204
def rotx(t):
    """
    Rotation matrix for angle t (radians) about the x-axis.
    """
    cos_t, sin_t = np.cos(t), np.sin(t)
    return np.array([
        [1, 0, 0],
        [0, cos_t, -sin_t],
        [0, sin_t, cos_t],
    ])
213
+
214
def roty(t):
    """
    Rotation matrix for angle t (radians) about the y-axis.
    """
    cos_t, sin_t = np.cos(t), np.sin(t)
    return np.array([
        [cos_t, 0, sin_t],
        [0, 1, 0],
        [-sin_t, 0, cos_t],
    ])
223
+
224
def rotz(t):
    """
    Rotation matrix for angle t (radians) about the z-axis.
    """
    cos_t, sin_t = np.cos(t), np.sin(t)
    return np.array([
        [cos_t, -sin_t, 0],
        [sin_t, cos_t, 0],
        [0, 0, 1],
    ])
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/rerun_helper/tmp.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import numpy as np
import rerun as rr

# Initialize Rerun (spawns the viewer).
rr.init("Multi-Camera Pose Example", spawn=True)

# Example data for two cameras: 4x4 pose matrices plus a shared 3x3 pinhole
# intrinsic matrix (fx = fy = 500, principal point at (320, 240)).
_shared_intrinsic = np.array([
    [500, 0, 320],
    [0, 500, 240],
    [0, 0, 1],
])

_camera_poses = {
    "camera1": np.array([
        [0.99, -0.10, 0.10, 1.0],
        [0.10, 0.99, -0.10, 2.0],
        [-0.10, 0.10, 0.99, 3.0],
        [0.0, 0.0, 0.0, 1.0],
    ]),
    "camera2": np.array([
        [0.99, 0.10, -0.10, -1.0],
        [-0.10, 0.99, 0.10, -2.0],
        [0.10, -0.10, 0.99, -3.0],
        [0.0, 0.0, 0.0, 1.0],
    ]),
}

# Log each camera's pose and intrinsics to the viewer.
# NOTE(review): rr.log_camera does not exist in current rerun SDK releases —
# presumably this scratch script targets an old/forked API; verify before use.
for _name, _pose in _camera_poses.items():
    rr.log_camera(_name, pose=_pose, intrinsic=_shared_intrinsic.copy())
39
+
checkpoints/paper_smoke_local_8gpu_submap12/joint_freeze_frontend_fsdp_sub12/paper_smoke_joint_freeze_frontend_fsdp_8gpu_sub12/code/04_04-00:52:12/slam/rerun_helper/visualization_utils.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import matplotlib.pyplot as plt
4
+ import moviepy.editor as mpy
5
+ import numpy as np
6
+ import torch
7
+ from PIL import Image
8
+
9
+ from .generic_utils import reverse_imagenet_normalize
10
+
11
+
12
def colormap_image(
        image_1hw,
        mask_1hw=None,
        invalid_color=(0.0, 0, 0.0),
        flip=True,
        vmin=None,
        vmax=None,
        return_vminvmax=False,
        colormap="turbo",
):
    """
    Colormaps a one channel tensor using a matplotlib colormap.

    Args:
        image_1hw: the tensor to colormap.
        mask_1hw: an optional float mask where 1.0 denotes valid pixels.
        colormap: the colormap to use. Default is turbo.
        invalid_color: the color to use for invalid pixels.
        flip: should we flip the colormap? True by default.
        vmin: if provided uses this as the minimum when normalizing the tensor.
        vmax: if provided uses this as the maximum when normalizing the tensor.
            When either of vmin or vmax are None, they are computed from the
            tensor.
        return_vminvmax: when true, returns vmin and vmax.

    Returns:
        image_cm_3hw: image of the colormapped tensor.
        vmin, vmax: returned when return_vminvmax is true.
    """
    # Derive the normalization range from the (masked) values unless given.
    valid_vals = image_1hw if mask_1hw is None else image_1hw[mask_1hw.bool()]
    if vmin is None:
        vmin = valid_vals.min()
    if vmax is None:
        vmax = valid_vals.max()

    # COMPAT FIX: plt.cm.get_cmap was deprecated in matplotlib 3.7 and
    # removed in 3.9; fall back to the colormap registry on newer versions.
    try:
        cmap_obj = plt.cm.get_cmap(colormap)
    except AttributeError:
        import matplotlib
        cmap_obj = matplotlib.colormaps[colormap]

    # Build a 256-entry RGB lookup table on the image's device.
    cmap = torch.Tensor(
        cmap_obj(torch.linspace(0, 1, 256))[:, :3]
    ).to(image_1hw.device)
    if flip:
        cmap = torch.flip(cmap, (0,))

    h, w = image_1hw.shape[1:]

    # Normalize to [0, 1], then quantize to LUT indices in [0, 255].
    image_norm_1hw = (image_1hw - vmin) / (vmax - vmin)
    image_int_1hw = (torch.clamp(image_norm_1hw * 255, 0, 255)).byte().long()

    image_cm_3hw = cmap[image_int_1hw.flatten(start_dim=1)
                        ].permute([0, 2, 1]).view([-1, h, w])

    # Paint invalid pixels with the fallback color.
    if mask_1hw is not None:
        invalid_color = torch.Tensor(invalid_color).view(3, 1, 1).to(image_1hw.device)
        image_cm_3hw = image_cm_3hw * mask_1hw + invalid_color * (1 - mask_1hw)

    if return_vminvmax:
        return image_cm_3hw, vmin, vmax
    else:
        return image_cm_3hw
73
+
74
def save_viz_video_frames(frame_list, path, fps=30):
    """
    Writes the numpy RGB frames in frame_list to a video file at path.
    """
    mpy.ImageSequenceClip(frame_list, fps=fps).write_videofile(
        path, verbose=False, logger=None,
    )

    return
82
+
83
+
84
def quick_viz_export(
        output_path,
        outputs,
        cur_data,
        batch_ind,
        valid_mask_b,
        batch_size):
    """ Helper function for quickly exporting depth maps during inference.

    For every element in the batch, writes colormapped PNGs for the ground
    truth depth (when it is meaningful), the lowest-cost prediction, the
    predicted depth, and the input color image into output_path.

    Args:
        output_path: directory the PNGs are written to.
        outputs: model outputs; must contain "depth_pred_s0_b1hw" and
            "lowest_cost_bhw".
        cur_data: batch dict with "full_res_depth_b1hw",
            "high_res_color_b3hw" and optionally "frame_id_string".
        batch_ind: index of this batch; used to synthesize frame ids when
            "frame_id_string" is absent.
        valid_mask_b: boolean mask of valid gt depth pixels, matching
            "full_res_depth_b1hw".
        batch_size: nominal batch size, combined with batch_ind for ids.
    """

    # Shared color range across the batch; fall back to [0, 5] when there is
    # no valid gt depth or it is constant.
    if valid_mask_b.sum() == 0:
        batch_vmin = 0.0
        batch_vmax = 5.0
    else:
        batch_vmin = cur_data["full_res_depth_b1hw"][valid_mask_b].min()
        batch_vmax = cur_data["full_res_depth_b1hw"][valid_mask_b].max()

    if batch_vmax == batch_vmin:
        batch_vmin = 0.0
        batch_vmax = 5.0

    for elem_ind in range(outputs["depth_pred_s0_b1hw"].shape[0]):
        if "frame_id_string" in cur_data:
            frame_id = cur_data["frame_id_string"][elem_ind]
        else:
            # Synthesize a zero-padded global frame index. BUGFIX: the
            # previous code formatted a *string* with ":6d", which raises
            # "ValueError: Unknown format code 'd' for object of type 'str'".
            frame_id = (batch_ind * batch_size) + elem_ind
            frame_id = f"{frame_id:06d}"

        # check for valid depths from dataloader
        if valid_mask_b[elem_ind].sum() == 0:
            sample_vmin = 0.0
            sample_vmax = 0.0
        else:
            # these will be the same when the depth map is all ones.
            sample_vmin = cur_data["full_res_depth_b1hw"][elem_ind][valid_mask_b[elem_ind]].min()
            sample_vmax = cur_data["full_res_depth_b1hw"][elem_ind][valid_mask_b[elem_ind]].max()

        # if no meaningful gt depth in dataloader, don't viz gt and
        # set vmin/max to default
        if sample_vmax != sample_vmin:
            full_res_depth_1hw = cur_data["full_res_depth_b1hw"][elem_ind]

            full_res_depth_3hw = colormap_image(
                full_res_depth_1hw,
                vmin=batch_vmin, vmax=batch_vmax,
            )

            full_res_depth_hw3 = np.uint8(
                full_res_depth_3hw.permute(1, 2, 0).cpu().detach().numpy() * 255
            )
            Image.fromarray(full_res_depth_hw3).save(
                os.path.join(output_path, f"{frame_id}_gt_depth.png")
            )

        lowest_cost_3hw = colormap_image(
            outputs["lowest_cost_bhw"][elem_ind].unsqueeze(0),
            vmin=batch_vmin, vmax=batch_vmax,
        )
        pil_image = Image.fromarray(
            np.uint8(lowest_cost_3hw.permute(1, 2, 0).cpu().detach().numpy() * 255)
        )
        pil_image.save(os.path.join(output_path,
                                    f"{frame_id}_lowest_cost_pred.png"))

        depth_3hw = colormap_image(
            outputs["depth_pred_s0_b1hw"][elem_ind],
            vmin=batch_vmin, vmax=batch_vmax)
        pil_image = Image.fromarray(
            np.uint8(depth_3hw.permute(1, 2, 0).cpu().detach().numpy() * 255)
        )
        pil_image.save(os.path.join(output_path, f"{frame_id}_pred_depth.png"))

        # Undo ImageNet normalization before saving the color image.
        main_color_3hw = cur_data["high_res_color_b3hw"][elem_ind]
        main_color_3hw = reverse_imagenet_normalize(main_color_3hw)
        pil_image = Image.fromarray(
            np.uint8(main_color_3hw.permute(1, 2, 0).cpu().detach().numpy() * 255)
        )
        pil_image.save(os.path.join(output_path, f"{frame_id}_color.png"))