FrickYinn commited on
Commit
e170a8e
·
verified ·
1 Parent(s): ee4e79a

Upload 53 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +35 -0
  2. LICENSE +201 -0
  3. README.md +183 -3
  4. assets/figures/obj_vis_gt.png +0 -0
  5. assets/figures/obj_vis_query.png +0 -0
  6. assets/figures/obj_vis_reference_labeled.png +0 -0
  7. assets/figures/scene5_vis_0.png +0 -0
  8. assets/figures/scene5_vis_1.png +0 -0
  9. assets/figures/scene5_vis_gt.png +0 -0
  10. assets/ho3d_test_3000/ho3d_test.json +0 -0
  11. assets/linemod_test_1500/linemod_test.json +0 -0
  12. assets/mapfree_submission.zip +3 -0
  13. assets/megadepth_test_1500_scene_info/0015_0.1_0.3.npz +3 -0
  14. assets/megadepth_test_1500_scene_info/0015_0.3_0.5.npz +3 -0
  15. assets/megadepth_test_1500_scene_info/0022_0.1_0.3.npz +3 -0
  16. assets/megadepth_test_1500_scene_info/0022_0.3_0.5.npz +3 -0
  17. assets/megadepth_test_1500_scene_info/0022_0.5_0.7.npz +3 -0
  18. assets/megadepth_test_1500_scene_info/megadepth_test_1500.txt +5 -0
  19. assets/scannet_test_1500/intrinsics.npz +3 -0
  20. assets/scannet_test_1500/scannet_test.txt +1 -0
  21. assets/scannet_test_1500/statistics.json +102 -0
  22. assets/scannet_test_1500/test.npz +3 -0
  23. baselines/matchers.py +72 -0
  24. baselines/pose.py +92 -0
  25. baselines/pose_solver.py +320 -0
  26. configs/default.py +85 -0
  27. configs/ho3d.yaml +19 -0
  28. configs/linemod.yaml +20 -0
  29. configs/mapfree.yaml +14 -0
  30. configs/matterport.yaml +11 -0
  31. configs/megadepth.yaml +29 -0
  32. configs/scannet.yaml +33 -0
  33. datasets/__init__.py +20 -0
  34. datasets/ho3d.py +331 -0
  35. datasets/linemod.py +441 -0
  36. datasets/mapfree.py +178 -0
  37. datasets/matterport.py +86 -0
  38. datasets/megadepth.py +125 -0
  39. datasets/sampler.py +77 -0
  40. datasets/scannet.py +154 -0
  41. eval.py +48 -0
  42. eval_add_reproj.py +138 -0
  43. eval_baselines.py +189 -0
  44. model/__init__.py +4 -0
  45. model/pl_trainer.py +201 -0
  46. model/relpose.py +465 -0
  47. requirements.txt +11 -0
  48. train.py +104 -0
  49. utils/__init__.py +19 -0
  50. utils/augment.py +15 -0
.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **/__pycache__
2
+ /checkpoints
3
+ /log
4
+ /lightning_logs
5
+ /pyramid
6
+ *.ipynb
7
+ /data
8
+ preprocess_megadepth.py
9
+ *.ckpt
10
+ test.py
11
+ /RelPoseRepo
12
+ eval_regressor.py
13
+ /assets/megadepth_test_new
14
+ /configs/megadepth_new.yaml
15
+ /results
16
+ assets/new_submission.zip
17
+
18
+ __eval_baselines.py
19
+ __pose_tracking.py
20
+ __track.py
21
+ /__pose_tracking
22
+
23
+ /baselines/configs
24
+ /baselines/repo
25
+ /baselines/weights
26
+ /baselines/__models.py
27
+ baselines/demo.html
28
+
29
+ utils/__reprojection.py
30
+ utils/__pose_solver.py
31
+ utils/__generate_epipolar_imgs.py
32
+ utils/__visualize.py
33
+
34
+ submission.py
35
+ /qualitative
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,3 +1,183 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SRPose: Two-view Relative Pose Estimation with Sparse Keypoints
2
+
3
+ **SRPose**: A **S**parse keypoint-based framework for **R**elative **Pose** estimation between two views in both camera-to-world and object-to-camera scenarios.
4
+
5
+ | Reference | Query | Ground Truth |
6
+ |:--------:|:---------:|:--------:|
7
+ | ![](assets/figures/scene5_vis_0.png) | ![](assets/figures/scene5_vis_1.png) | ![](assets/figures/scene5_vis_gt.png) |
8
+ | ![](assets/figures/obj_vis_reference_labeled.png) | ![](assets/figures/obj_vis_query.png) |![](assets/figures/obj_vis_gt.png)|
9
+
10
+ ## [Project page](https://frickyinn.github.io/srpose/) | [arXiv](https://arxiv.org/abs/2407.08199)
11
+
12
+ ## Setup
13
+ Please first install PyTorch according to [here](https://pytorch.org/get-started/locally/), then install other dependencies using pip:
14
+ ```
15
+ cd SRPose
16
+ pip install -r requirements.txt
17
+ ```
18
+
19
+ ## Evaluation
20
+ 1. Download pretrained models [here](https://drive.google.com/drive/folders/1bBlds3UX7-XDCevbIl4bnnywvWzzP5nN) for evaluation.
21
+ 2. Create new folders:
22
+ ```
23
+ mkdir checkpoints && mkdir data
24
+ ```
25
+ 3. Organize the downloaded checkpoints like this:
26
+ ```
27
+ SRPose
28
+ |-- checkpoints
29
+ |-- ho3d.ckpt
30
+ |-- linemod.ckpt
31
+ |-- mapfree.ckpt
32
+ |-- matterport.ckpt
33
+ |-- megadepth.ckpt
34
+ `-- scannet.ckpt
35
+ ...
36
+ ```
37
+
38
+ ### Matterport
39
+ 1. Download Matterport dataset [here](https://github.com/jinlinyi/SparsePlanes/blob/main/docs/data.md), only `mp3d_planercnn_json.zip` and `rgb.zip` are required.
40
+ 2. Unzip and organize the downloaded files:
41
+ ```
42
+ mkdir data/mp3d
43
+ mkdir data/mp3d/mp3d_planercnn_json && mkdir data/mp3d/rgb
44
+ unzip <pathto>/mp3d_planercnn_json.zip -d data/mp3d/mp3d_planercnn_json
45
+ unzip <pathto>/rgb.zip -d data/mp3d/rgb
46
+ ```
47
+ 3. The resulting directory tree should be like this:
48
+ ```
49
+ SRPose
50
+ |-- data
51
+ |-- mp3d
52
+ |-- mp3d_planercnn_json
53
+ | |-- cached_set_test.json
54
+ | |-- cached_set_train.json
55
+ | `-- cached_set_val.json
56
+ `-- rgb
57
+ |-- 17DRP5sb8fy
58
+ ...
59
+ ...
60
+ ...
61
+ ```
62
+ 4. Evaluate with the following command:
63
+ ```
64
+ python eval.py configs/matterport.yaml checkpoints/matterport.ckpt
65
+ ```
66
+
67
+ ### ScanNet & MegaDepth
68
+ 1. Download and organize the ScanNet-1500 and MegaDepth-1500 test sets according to the [LoFTR Training Script](https://github.com/zju3dv/LoFTR/blob/master/docs/TRAINING.md). Note that only the test sets and the dataset indices are required.
69
+ 2. The resulting directory tree should be:
70
+ ```
71
+ SRPose
72
+ |-- data
73
+ |-- scannet
74
+ | |-- index
75
+ | |-- test
76
+ | `-- train (optional)
77
+ |-- megadepth
78
+ |-- index
79
+ |-- test
80
+ `-- train (optional)
81
+ ...
82
+ ...
83
+ ```
84
+ 3. Evaluate with the following commands:
85
+ ```
86
+ python eval.py configs/scannet.yaml checkpoints/scannet.ckpt
87
+ python eval.py configs/megadepth.yaml checkpoints/megadepth.ckpt
88
+ ```
89
+
90
+ ### HO3D
91
+ 1. Download HO3D (version 3) dataset [here](https://www.tugraz.at/institute/icg/research/team-lepetit/research-projects/hand-object-3d-pose-annotation/), `HO3D_v3.zip` and `HO3D_v3_segmentations_rendered.zip` are required.
92
+ 2. Unzip and organize the downloaded files:
93
+ ```
94
+ mkdir data/ho3d
95
+ unzip <pathto>/HO3D_v3.zip -d data/ho3d
96
+ unzip <pathto>/HO3D_v3_segmentations_rendered.zip -d data/ho3d
97
+ ```
98
+ 3. Evaluate with the following commands:
99
+ ```
100
+ python eval.py configs/ho3d.yaml checkpoints/ho3d.ckpt
101
+ python eval_add_reproj.py configs/ho3d.yaml checkpoints/ho3d.ckpt
102
+ ```
103
+
104
+ ### Linemod
105
+ 1. Download Linemod dataset [here](https://bop.felk.cvut.cz/datasets/) or run the following commands:
106
+ ```
107
+ cd data
108
+
109
+ export SRC=https://bop.felk.cvut.cz/media/data/bop_datasets
110
+ wget $SRC/lm_base.zip # Base archive with dataset info, camera parameters, etc.
111
+ wget $SRC/lm_models.zip # 3D object models.
112
+ wget $SRC/lm_test_all.zip # All test images ("_bop19" for a subset used in the BOP Challenge 2019/2020).
113
+ wget $SRC/lm_train_pbr.zip # PBR training images (rendered with BlenderProc4BOP).
114
+
115
+ unzip lm_base.zip # Contains folder "lm".
116
+ unzip lm_models.zip -d lm # Unpacks to "lm".
117
+ unzip lm_test_all.zip -d lm # Unpacks to "lm".
118
+ unzip lm_train_pbr.zip -d lm # Unpacks to "lm".
119
+ ```
120
+
121
+ 2. Evaluate with the following commands:
122
+ ```
123
+ python eval.py configs/linemod.yaml checkpoints/linemod.ckpt
124
+ python eval_add_reproj.py configs/linemod.yaml checkpoints/linemod.ckpt
125
+ ```
126
+
127
+ ### Niantic
128
+ 1. Download Niantic dataset [here](https://research.nianticlabs.com/mapfree-reloc-benchmark/dataset).
129
+ 2. Unzip and organize the downloaded files:
130
+ ```
131
+ mkdir data/mapfree
132
+ unzip <pathto>/train.zip -d data/mapfree
133
+ unzip <pathto>/val.zip -d data/mapfree
134
+ unzip <pathto>/test.zip -d data/mapfree
135
+ ```
136
+ 3. The ground truth of the test set is not publicly available, but you can run the following command to produce a new submission file and submit it on the [project page](https://research.nianticlabs.com/mapfree-reloc-benchmark/submit) for evaluation:
137
+ ```
138
+ python eval_add_reproj.py configs/mapfree.yaml checkpoints/mapfree.ckpt
139
+ ```
140
+ You should be able to find a `new_submission.zip` in `SRPose/assets/` afterwards, or you can submit the already produced file `SRPose/assets/mapfree_submission.zip` instead.
141
+
142
+
143
+ ## Training
144
+ Download and organize the datasets following [Evaluation](#evaluation), then run the following command for training:
145
+ ```
146
+ python train.py configs/<dataset>.yaml
147
+ ```
148
+ Please refer to the `.yaml` files in `SRPose/configs/` for detailed configurations.
149
+
150
+
151
+ ## Baselines
152
+ We also offer two publicly available matcher-based baselines, [LightGlue](https://github.com/cvg/LightGlue) and [LoFTR](https://github.com/zju3dv/LoFTR), for evaluation and comparison.
153
+ Just run the following commands:
154
+ ```
155
+ # For Matterport, ScanNet and MegaDepth
156
+ python eval_baselines.py configs/<dataset>.yaml lightglue
157
+ python eval_baselines.py configs/<dataset>.yaml loftr
158
+
159
+ # For HO3D and Linemod
160
+ python eval_baselines.py configs/<dataset>.yaml lightglue --resize 640 --depth
161
+ python eval_baselines.py configs/<dataset>.yaml loftr --resize 640 --depth
162
+ ```
163
+
164
+ The `--resize xx` option controls the larger dimension of cropped target object images that will be resized to.
165
+ The `--depth` option controls whether the depth maps will be used to obtain scaled pose estimation.
166
+
167
+ ## Acknowledgements
168
+ In this repository, we have used codes from the following repositories. We thank all the authors for sharing great codes.
169
+ - [LightGlue](https://github.com/cvg/LightGlue)
170
+ - [LoFTR](https://github.com/zju3dv/LoFTR)
171
+ - [8point](https://github.com/crockwell/rel_pose)
172
+ - [SparsePlanes](https://github.com/jinlinyi/SparsePlanes/tree/main)
173
+ - [Map-free](https://github.com/nianticlabs/map-free-reloc/tree/main)
174
+
175
+ ## Citation
176
+ ```
177
+ @inproceedings{yin2024srpose,
178
+ title={SRPose: Two-view Relative Pose Estimation with Sparse Keypoints},
179
+ author={Yin, Rui and Zhang, Yulun and Pan, Zherong and Zhu, Jianjun and Wang, Cheng and Jia, Biao},
180
+ booktitle={ECCV},
181
+ year={2024}
182
+ }
183
+ ```
assets/figures/obj_vis_gt.png ADDED
assets/figures/obj_vis_query.png ADDED
assets/figures/obj_vis_reference_labeled.png ADDED
assets/figures/scene5_vis_0.png ADDED
assets/figures/scene5_vis_1.png ADDED
assets/figures/scene5_vis_gt.png ADDED
assets/ho3d_test_3000/ho3d_test.json ADDED
The diff for this file is too large to render. See raw diff
 
assets/linemod_test_1500/linemod_test.json ADDED
The diff for this file is too large to render. See raw diff
 
assets/mapfree_submission.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:026e799de5bf9eed2a1f627e64d6981f983cb729337036390481c402c47fbc5c
3
+ size 6569663
assets/megadepth_test_1500_scene_info/0015_0.1_0.3.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d441df1d380b2ed34449b944d9f13127e695542fa275098d38a6298835672f22
3
+ size 231253
assets/megadepth_test_1500_scene_info/0015_0.3_0.5.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f34b5231d04a84d84378c671dd26854869663b5eafeae2ebaf624a279325139
3
+ size 231253
assets/megadepth_test_1500_scene_info/0022_0.1_0.3.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba46e6b9ec291fc7271eb9741d5c75ca04b83d3d7281e049815de9cb9024f4d9
3
+ size 272610
assets/megadepth_test_1500_scene_info/0022_0.3_0.5.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f4465da174b96deba61e5328886e4f2e687d34b890efca69e0c838736f8ae12
3
+ size 272610
assets/megadepth_test_1500_scene_info/0022_0.5_0.7.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684ae10f03001917c3ca0d12d441f372ce3c7e6637bd1277a3cda60df4207fe9
3
+ size 272610
assets/megadepth_test_1500_scene_info/megadepth_test_1500.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 0022_0.1_0.3
2
+ 0015_0.1_0.3
3
+ 0015_0.3_0.5
4
+ 0022_0.3_0.5
5
+ 0022_0.5_0.7
assets/scannet_test_1500/intrinsics.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25ac102c69e2e4e2f0ab9c0d64f4da2b815e0901630768bdfde30080ced3605c
3
+ size 23922
assets/scannet_test_1500/scannet_test.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ test.npz
assets/scannet_test_1500/statistics.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "scene0707_00": 15,
3
+ "scene0708_00": 15,
4
+ "scene0709_00": 15,
5
+ "scene0710_00": 15,
6
+ "scene0711_00": 15,
7
+ "scene0712_00": 15,
8
+ "scene0713_00": 15,
9
+ "scene0714_00": 15,
10
+ "scene0715_00": 15,
11
+ "scene0716_00": 15,
12
+ "scene0717_00": 15,
13
+ "scene0718_00": 15,
14
+ "scene0719_00": 15,
15
+ "scene0720_00": 15,
16
+ "scene0721_00": 15,
17
+ "scene0722_00": 15,
18
+ "scene0723_00": 15,
19
+ "scene0724_00": 15,
20
+ "scene0725_00": 15,
21
+ "scene0726_00": 15,
22
+ "scene0727_00": 15,
23
+ "scene0728_00": 15,
24
+ "scene0729_00": 15,
25
+ "scene0730_00": 15,
26
+ "scene0731_00": 15,
27
+ "scene0732_00": 15,
28
+ "scene0733_00": 15,
29
+ "scene0734_00": 15,
30
+ "scene0735_00": 15,
31
+ "scene0736_00": 15,
32
+ "scene0737_00": 15,
33
+ "scene0738_00": 15,
34
+ "scene0739_00": 15,
35
+ "scene0740_00": 15,
36
+ "scene0741_00": 15,
37
+ "scene0742_00": 15,
38
+ "scene0743_00": 15,
39
+ "scene0744_00": 15,
40
+ "scene0745_00": 15,
41
+ "scene0746_00": 15,
42
+ "scene0747_00": 15,
43
+ "scene0748_00": 15,
44
+ "scene0749_00": 15,
45
+ "scene0750_00": 15,
46
+ "scene0751_00": 15,
47
+ "scene0752_00": 15,
48
+ "scene0753_00": 15,
49
+ "scene0754_00": 15,
50
+ "scene0755_00": 15,
51
+ "scene0756_00": 15,
52
+ "scene0757_00": 15,
53
+ "scene0758_00": 15,
54
+ "scene0759_00": 15,
55
+ "scene0760_00": 15,
56
+ "scene0761_00": 15,
57
+ "scene0762_00": 15,
58
+ "scene0763_00": 15,
59
+ "scene0764_00": 15,
60
+ "scene0765_00": 15,
61
+ "scene0766_00": 15,
62
+ "scene0767_00": 15,
63
+ "scene0768_00": 15,
64
+ "scene0769_00": 15,
65
+ "scene0770_00": 15,
66
+ "scene0771_00": 15,
67
+ "scene0772_00": 15,
68
+ "scene0773_00": 15,
69
+ "scene0774_00": 15,
70
+ "scene0775_00": 15,
71
+ "scene0776_00": 15,
72
+ "scene0777_00": 15,
73
+ "scene0778_00": 15,
74
+ "scene0779_00": 15,
75
+ "scene0780_00": 15,
76
+ "scene0781_00": 15,
77
+ "scene0782_00": 15,
78
+ "scene0783_00": 15,
79
+ "scene0784_00": 15,
80
+ "scene0785_00": 15,
81
+ "scene0786_00": 15,
82
+ "scene0787_00": 15,
83
+ "scene0788_00": 15,
84
+ "scene0789_00": 15,
85
+ "scene0790_00": 15,
86
+ "scene0791_00": 15,
87
+ "scene0792_00": 15,
88
+ "scene0793_00": 15,
89
+ "scene0794_00": 15,
90
+ "scene0795_00": 15,
91
+ "scene0796_00": 15,
92
+ "scene0797_00": 15,
93
+ "scene0798_00": 15,
94
+ "scene0799_00": 15,
95
+ "scene0800_00": 15,
96
+ "scene0801_00": 15,
97
+ "scene0802_00": 15,
98
+ "scene0803_00": 15,
99
+ "scene0804_00": 15,
100
+ "scene0805_00": 15,
101
+ "scene0806_00": 15
102
+ }
assets/scannet_test_1500/test.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b982b9c1f762e7d31af552ecc1ccf1a6add013197f74ec69c84a6deaa6f580ad
3
+ size 71687
baselines/matchers.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import time
3
+
4
+ from lightglue import LightGlue as LightGlue_
5
+ from lightglue import SuperPoint
6
+ from lightglue.utils import rbd
7
+ from kornia.feature import LoFTR as LoFTR_
8
+
9
+
10
def image_rgb2gray(image):
    """Collapse an RGB image tensor to single-channel grayscale.

    in:  torch.tensor - (3, H, W)
    out: torch.tensor - (1, H, W), luma-weighted as 0.3*R + 0.59*G + 0.11*B
    """
    r, g, b = image[0], image[1], image[2]
    gray = 0.3 * r + 0.59 * g + 0.11 * b
    return gray.unsqueeze(0)
15
+
16
+
17
class LightGlue():
    """SuperPoint + LightGlue sparse matcher wrapper with per-stage timing."""

    def __init__(self, num_keypoints=2048, device='cuda'):
        # Keypoint extractor and matcher, both frozen in eval mode.
        self.extractor = SuperPoint(max_num_keypoints=num_keypoints).eval().to(device)  # load the extractor
        self.matcher = LightGlue_(features='superpoint').eval().to(device)  # load the matcher
        self.device = device

    @torch.no_grad()
    def match(self, image0, image1):
        """Match two RGB tensors (3, H, W).

        Returns matched keypoint coordinates for each image, each of shape
        (K, 2), plus preprocess / extract / match wall-clock durations.
        """
        t0 = time.time()

        image0 = image0.to(self.device)
        image1 = image1.to(self.device)
        t1 = time.time()

        # Extract local features (SuperPoint auto-resizes the image;
        # disable with resize=None).
        feats0 = self.extractor.extract(image0)
        feats1 = self.extractor.extract(image1)
        t2 = time.time()

        # Run the matcher, then drop the batch dimension from all outputs.
        matches01 = self.matcher({'image0': feats0, 'image1': feats1})
        feats0, feats1, matches01 = (rbd(x) for x in (feats0, feats1, matches01))
        pair_idx = matches01['matches']                    # index pairs, shape (K, 2)
        points0 = feats0['keypoints'][pair_idx[..., 0]]    # coordinates in image #0
        points1 = feats1['keypoints'][pair_idx[..., 1]]    # coordinates in image #1
        t3 = time.time()

        return points0, points1, t1 - t0, t2 - t1, t3 - t2
49
+
50
+
51
class LoFTR():
    """Kornia LoFTR dense matcher wrapper with per-stage timing."""

    def __init__(self, pretrained='indoor', device='cuda'):
        self.loftr = LoFTR_(pretrained=pretrained).eval().to(device)
        self.device = device

    @torch.no_grad()
    def match(self, image0, image1):
        """Match two RGB tensors (3, H, W).

        Returns matched keypoints for each image plus preprocess / extract /
        match durations. LoFTR is detector-free, so the extract stage is
        effectively zero.
        """
        t0 = time.time()

        # LoFTR expects batched single-channel input: (1, 1, H, W).
        gray0 = image_rgb2gray(image0)[None].to(self.device)
        gray1 = image_rgb2gray(image1)[None].to(self.device)
        t1 = time.time()

        t2 = time.time()  # no separate feature-extraction stage

        result = self.loftr({'image0': gray0, 'image1': gray1})
        points0, points1 = result['keypoints0'], result['keypoints1']
        t3 = time.time()

        return points0, points1, t1 - t0, t2 - t1, t3 - t2
baselines/pose.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torchvision.transforms import Resize
3
+
4
+ from .matchers import LightGlue, LoFTR
5
+ # from .__models import SuperGlue, SGMNet, ASpanFormer, DKM
6
+ from .pose_solver import EssentialMatrixSolver, EssentialMatrixMetricSolver, PnPSolver, ProcrustesSolver
7
+
8
+ import time
9
+
10
+
11
class PoseRecover():
    """Two-view relative pose recovery: match keypoints with a feature
    matcher, then solve for (R, t) with a pose solver.

    Parameters
    ----------
    matcher : 'lightglue' | 'loftr'
    solver : 'essential' | 'pnp' | 'procrustes'
        Metric (scaled) solver, used only when depth maps are supplied;
        without depth an up-to-scale essential-matrix solver is used.
    img_resize : int | None
        If set, the larger image dimension is resized to this value before
        matching; matched points are scaled back to original coordinates.
    device : str
        Torch device for the matcher.
    """

    def __init__(self, matcher='lightglue', solver='procrustes', img_resize=None, device='cuda'):
        self.device = device

        if matcher == 'lightglue':
            self.matcher = LightGlue(device=device)
        elif matcher == 'loftr':
            self.matcher = LoFTR(device=device)
        # elif matcher == 'superglue':
        #     self.matcher = SuperGlue(device=device)
        # elif matcher == 'aspanformer':
        #     self.matcher = ASpanFormer(device=device)
        # elif matcher == 'sgmnet':
        #     self.matcher = SGMNet(device=device)
        # elif matcher == 'dkm':
        #     self.matcher = DKM(device=device)
        else:
            raise NotImplementedError

        self.img_resize = img_resize

        # Up-to-scale solver, always available for the no-depth path.
        self.basic_solver = EssentialMatrixSolver()

        if solver == 'essential':
            self.scaled_solver = EssentialMatrixMetricSolver()
        elif solver == 'pnp':
            self.scaled_solver = PnPSolver()
        elif solver == 'procrustes':
            self.scaled_solver = ProcrustesSolver()
        else:
            # Fail fast: previously an unknown solver name was silently
            # accepted and only surfaced as an AttributeError the first
            # time depth maps were supplied to recover().
            raise NotImplementedError

    def recover(self, image0, image1, K0, K1, bbox0=None, bbox1=None, mask0=None, mask1=None, depth0=None, depth1=None):
        """Estimate relative pose between image0 and image1.

        bbox0/bbox1 (x1, y1, x2, y2): if the images are crops, matched
        points are shifted back into full-image coordinates.
        mask0/mask1: boolean maps indexed in full-image coordinates; a match
        is kept only if both endpoints lie on their mask.
        depth0/depth1: if both given, the metric solver is used.

        Returns (R_est, t_est, points0, points1, preprocess_time,
        extract_time, match_time, recover_time).
        """
        if self.img_resize is not None:
            # Target size derived from image0's aspect ratio; both images
            # are resized to it, and each point set is mapped back with its
            # own image's original dimensions.
            h, w = image0.shape[-2:]
            if h > w:
                h_new = self.img_resize
                w_new = int(w * h_new / h)
            else:
                w_new = self.img_resize
                h_new = int(h * w_new / w)

            resize = Resize((h_new, w_new), antialias=True)
            scale0 = torch.tensor([image0.shape[-1]/w_new, image0.shape[-2]/h_new], dtype=torch.float)
            scale1 = torch.tensor([image1.shape[-1]/w_new, image1.shape[-2]/h_new], dtype=torch.float)
            image0 = resize(image0)
            image1 = resize(image1)

        points0, points1, preprocess_time, extract_time, match_time = self.matcher.match(image0, image1)

        if self.img_resize is not None:
            # Map matched points back to original image coordinates.
            points0 *= scale0.unsqueeze(0).to(points0.device)
            points1 *= scale1.unsqueeze(0).to(points1.device)

        if bbox0 is not None and bbox1 is not None:
            # Shift crop-local coordinates into full-image coordinates.
            x1, y1, x2, y2 = bbox0
            u1, v1, u2, v2 = bbox1

            points0[:, 0] += x1
            points0[:, 1] += y1

            points1[:, 0] += u1
            points1[:, 1] += v1

        if mask0 is not None and mask1 is not None:
            # Keep matches whose endpoints both fall on the object masks.
            filtered_ind0 = mask0[(points0[:, 1]).int(), (points0[:, 0]).int()]
            filtered_ind1 = mask1[(points1[:, 1]).int(), (points1[:, 0]).int()]
            filtered_inds = filtered_ind0 * filtered_ind1
            points0 = points0[filtered_inds]
            points1 = points1[filtered_inds]

        points0, points1 = points0.cpu().numpy(), points1.cpu().numpy()

        start_time = time.time()

        # Without depth, only an up-to-scale pose can be recovered.
        if depth0 is None or depth1 is None:
            R_est, t_est, _ = self.basic_solver.estimate_pose(points0, points1, {'K_color0': K0, 'K_color1': K1})
        else:
            R_est, t_est, _ = self.scaled_solver.estimate_pose(points0, points1, {'K_color0': K0, 'K_color1': K1, 'depth0': depth0, 'depth1': depth1})

        recover_time = time.time()

        return R_est, t_est, points0, points1, preprocess_time, extract_time, match_time, recover_time-start_time
baselines/pose_solver.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import cv2 as cv
3
+ import open3d as o3d
4
+
5
+
6
def backproject_3d(uv, depth, K):
    """Lift 2D pixel coordinates into 3D camera-frame points.

    Each pixel (u, v) is mapped to the homogeneous ray K^-1 [u, v, 1]^T and
    scaled by its depth value.

    :param uv: array [N, 2] of pixel coordinates
    :param depth: array [N] of per-pixel depth values
    :param K: array [3, 3] camera intrinsic matrix
    :return: xyz: array [N, 3] of back-projected 3D points
    """
    ones = np.ones((uv.shape[0], 1))
    uv_homog = np.concatenate([uv, ones], axis=1)
    # (K^-1 @ p)^T for every row p, written as a right-multiplication.
    rays = uv_homog @ np.linalg.inv(K).T
    return depth.reshape(-1, 1) * rays
18
+
19
+
20
class EssentialMatrixSolver:
    '''Obtain relative pose (up to scale) given a set of 2D-2D correspondences'''

    def __init__(self, ransac_pix_threshold=0.5, ransac_confidence=0.99999):
        # EMat RANSAC parameters
        self.ransac_pix_threshold = ransac_pix_threshold  # inlier threshold, in pixels
        self.ransac_confidence = ransac_confidence

    def estimate_pose(self, kpts0, kpts1, data):
        """Estimate relative pose from 2D-2D correspondences via the essential matrix.

        :param kpts0: array [N, 2] keypoints in image 0 (pixels)
        :param kpts1: array [N, 2] keypoints in image 1 (pixels)
        :param data: dict with 'K_color0'/'K_color1' intrinsics (tensors with .numpy())
        :return: (R [3,3], t [3] unit-norm, num_inliers); NaNs and 0 on failure
        """
        R = np.full((3, 3), np.nan)
        t = np.full((3), np.nan)
        if len(kpts0) < 5:  # five-point algorithm needs at least 5 matches
            return R, t, 0

        K0 = data['K_color0'].numpy()
        K1 = data['K_color1'].numpy()

        # normalize keypoints: subtract principal point, divide by focal length
        kpts0 = (kpts0 - K0[[0, 1], [2, 2]][None]) / K0[[0, 1], [0, 1]][None]
        kpts1 = (kpts1 - K1[[0, 1], [2, 2]][None]) / K1[[0, 1], [0, 1]][None]

        # express the pixel RANSAC threshold in normalized image coordinates
        ransac_thr = self.ransac_pix_threshold / np.mean([K0[0, 0], K1[1, 1], K0[1, 1], K1[0, 0]])

        # compute pose with OpenCV
        E, mask = cv.findEssentialMat(
            kpts0, kpts1, np.eye(3),
            threshold=ransac_thr, prob=self.ransac_confidence, method=cv.RANSAC)
        # kept on the instance so metric subclasses can reuse the inlier set
        self.mask = mask
        if E is None:
            return R, t, 0

        # recover pose from E; OpenCV may return several 3x3 candidates
        # stacked vertically, so split into individual matrices.
        # FIX: use integer floor division for the section count (was `/`,
        # which passes a float to np.split).
        best_num_inliers = 0
        ret = R, t, 0
        for _E in np.split(E, len(E) // 3):
            n, R, t, _ = cv.recoverPose(_E, kpts0, kpts1, np.eye(3), 1e9, mask=mask)
            if n > best_num_inliers:
                best_num_inliers = n
                ret = (R, t[:, 0], n)  # keep the candidate with most cheirality inliers
        return ret
62
+
63
+
64
class EssentialMatrixMetricSolverMEAN(EssentialMatrixSolver):
    '''Obtains relative pose with scale using E-Mat decomposition and depth values at inlier correspondences'''

    def __init__(self, ransac_pix_threshold=0.5, ransac_confidence=0.99999):
        # FIX: the parent __init__ takes explicit RANSAC parameters, not a cfg
        # object; the previous `__init__(self, cfg)` passed cfg through as the
        # pixel threshold. Mirror the parent/sibling-class signature instead.
        super().__init__(ransac_pix_threshold, ransac_confidence)

    def estimate_pose(self, kpts0, kpts1, data):
        '''Estimates metric translation vector using by back-projecting E-mat inliers to 3D using depthmaps.
        The metric translation vector can be obtained by looking at the residual vector (projected to the translation vector direction).
        In this version, each 3D-3D correspondence gives an optimal scale for the translation vector.
        We simply aggregate them by averaging them.

        :param kpts0: array [N, 2] keypoints in image 0 (pixels)
        :param kpts1: array [N, 2] keypoints in image 1 (pixels)
        :param data: dict with 'K_color0'/'K_color1' intrinsics and
            'depth0'/'depth1' depth-map tensors indexed as [v, u]
        :return: (R [3,3], metric t [3], num_inliers)
        '''

        # get pose up to scale
        R, t, inliers = super().estimate_pose(kpts0, kpts1, data)
        if inliers == 0:
            return R, t, inliers

        # backproject E-mat inliers at each camera
        K0 = data['K_color0']
        K1 = data['K_color1']
        mask = self.mask.ravel() == 1  # get E-mat inlier mask from super class
        inliers_kpts0 = np.int32(kpts0[mask])
        inliers_kpts1 = np.int32(kpts1[mask])
        depth_inliers_0 = data['depth0'][inliers_kpts0[:, 1], inliers_kpts0[:, 0]].numpy()
        depth_inliers_1 = data['depth1'][inliers_kpts1[:, 1], inliers_kpts1[:, 0]].numpy()
        # check for valid depth
        valid = (depth_inliers_0 > 0) * (depth_inliers_1 > 0)
        if valid.sum() < 1:
            R = np.full((3, 3), np.nan)
            # FIX: return t with shape (3,) like the success path and the
            # sibling EssentialMatrixMetricSolver (was (3, 1)).
            t = np.full((3,), np.nan)
            inliers = 0
            return R, t, inliers
        xyz0 = backproject_3d(inliers_kpts0[valid], depth_inliers_0[valid], K0)
        xyz1 = backproject_3d(inliers_kpts1[valid], depth_inliers_1[valid], K1)

        # rotate xyz0 to xyz1 CS (so that axes are parallel)
        xyz0 = (R @ xyz0.T).T

        # get average point for each camera
        pmean0 = np.mean(xyz0, axis=0)
        pmean1 = np.mean(xyz1, axis=0)

        # find scale as the 'length' of the translation vector that minimises
        # the 3D distance between projected points from 0 and the
        # corresponding points in 1
        scale = np.dot(pmean1 - pmean0, t)
        t_metric = scale * t
        t_metric = t_metric.reshape(3, 1)

        return R, t_metric[:, 0], inliers
113
+
114
+
115
class EssentialMatrixMetricSolver(EssentialMatrixSolver):
    '''
    Obtains relative pose with scale using E-Mat decomposition and RANSAC for scale based on depth values at inlier correspondences.
    The scale of the translation vector is obtained using RANSAC over the possible scales recovered from 3D-3D correspondences.
    '''

    def __init__(self, ransac_pix_threshold=0.5, ransac_confidence=0.99999, ransac_scale_threshold=0.1):
        super().__init__(ransac_pix_threshold, ransac_confidence)
        # inlier threshold (in metres) for the 1D scale-RANSAC below
        self.ransac_scale_threshold = ransac_scale_threshold

    def estimate_pose(self, kpts0, kpts1, data):
        '''Estimates metric translation vector using by back-projecting E-mat inliers to 3D using depthmaps.

        :param kpts0: array [N, 2] keypoints in image 0 (pixels)
        :param kpts1: array [N, 2] keypoints in image 1 (pixels)
        :param data: dict with 'K_color0'/'K_color1' intrinsics and
            'depth0'/'depth1' depth-map tensors indexed as [v, u]
        :return: (R [3,3], metric t [3], best_inliers from the scale RANSAC)
        '''

        # get pose up to scale
        R, t, inliers = super().estimate_pose(kpts0, kpts1, data)
        if inliers == 0:
            return R, t, inliers

        # backproject E-mat inliers at each camera
        K0 = data['K_color0']
        K1 = data['K_color1']
        mask = self.mask.ravel() == 1  # get E-mat inlier mask from super class
        inliers_kpts0 = np.int32(kpts0[mask])
        inliers_kpts1 = np.int32(kpts1[mask])
        depth_inliers_0 = data['depth0'][inliers_kpts0[:, 1], inliers_kpts0[:, 0]].numpy()
        depth_inliers_1 = data['depth1'][inliers_kpts1[:, 1], inliers_kpts1[:, 0]].numpy()

        # check for valid depth (zero depth marks missing measurements)
        valid = (depth_inliers_0 > 0) * (depth_inliers_1 > 0)
        if valid.sum() < 1:
            R = np.full((3, 3), np.nan)
            t = np.full((3, ), np.nan)
            inliers = 0
            return R, t, inliers
        xyz0 = backproject_3d(inliers_kpts0[valid], depth_inliers_0[valid], K0)
        xyz1 = backproject_3d(inliers_kpts1[valid], depth_inliers_1[valid], K1)

        # rotate xyz0 to xyz1 CS (so that axes are parallel)
        xyz0 = (R @ xyz0.T).T

        # get individual scales (for each 3D-3D correspondence): each residual
        # vector projected onto the (unit) translation direction
        scale = np.dot(xyz1 - xyz0, t.reshape(3, 1))  # [N, 1]

        # RANSAC loop: every per-correspondence scale is a hypothesis; keep
        # the one that agrees with the most other correspondences
        best_inliers = 0
        best_scale = None
        for scale_hyp in scale:
            inliers_hyp = (np.abs(scale - scale_hyp) < self.ransac_scale_threshold).sum().item()
            if inliers_hyp > best_inliers:
                best_scale = scale_hyp
                best_inliers = inliers_hyp

        # Output results (scale is guaranteed non-empty here, so best_scale is set)
        t_metric = best_scale * t
        t_metric = t_metric.reshape(3, 1)

        return R, t_metric[:, 0], best_inliers
173
+
174
+
175
class PnPSolver:
    '''Estimate relative pose (metric) using Perspective-n-Point algorithm (2D-3D) correspondences'''

    def __init__(self, ransac_iterations=1000, reprojection_inlier_threshold=3, confidence=0.99999):
        # PnP RANSAC parameters
        self.ransac_iterations = ransac_iterations
        self.reprojection_inlier_threshold = reprojection_inlier_threshold  # pixels
        self.confidence = confidence

    def estimate_pose(self, pts0, pts1, data):
        """Back-project pts0 with depth0, then solve PnP against pts1.

        :param pts0: array [N, 2] keypoints in image 0 (pixels; cast to int to index depth)
        :param pts1: array [N, 2] keypoints in image 1 (pixels)
        :param data: dict with 'K_color0'/'K_color1' intrinsics (tensors) and
            'depth0' depth map indexed as [v, u]
        :return: (R [3,3], t [3], inliers). NOTE(review): on the early `len < 4`
            failure t is returned with shape (3, 1), on the other paths (3,) —
            inconsistent; callers should not rely on the failure shape.
        """
        # uses nearest neighbour
        pts0 = np.int32(pts0)

        if len(pts0) < 4:  # solvePnPRansac with P3P needs at least 4 points
            return np.full((3, 3), np.nan), np.full((3, 1), np.nan), 0

        # get depth at correspondence points
        depth_0 = data['depth0']
        depth_pts0 = depth_0[pts0[:, 1], pts0[:, 0]]

        # remove invalid pts (depth == 0)
        # NOTE(review): uses depth_0.min() as the invalid sentinel rather than
        # a literal 0 — presumably the map's minimum marks missing depth; confirm.
        valid = depth_pts0 > depth_0.min()
        if valid.sum() < 4:
            return np.full((3, 3), np.nan), np.full((3, 1), np.nan), 0
        pts0 = pts0[valid]
        pts1 = pts1[valid]
        depth_pts0 = depth_pts0[valid]

        # backproject points to 3D in each sensors' local coordinates
        K0 = data['K_color0']
        K1 = data['K_color1']
        xyz_0 = backproject_3d(pts0, depth_pts0, K0).numpy()

        # get relative pose using PnP + RANSAC
        succ, rvec, tvec, inliers = cv.solvePnPRansac(
            xyz_0, pts1, K1.numpy(),
            None, iterationsCount=self.ransac_iterations,
            reprojectionError=self.reprojection_inlier_threshold, confidence=self.confidence,
            flags=cv.SOLVEPNP_P3P)

        # refine with iterative PnP using inliers only
        if succ and len(inliers) >= 6:
            succ, rvec, tvec, _ = cv.solvePnPGeneric(xyz_0[inliers], pts1[inliers], K1.numpy(
            ), None, useExtrinsicGuess=True, rvec=rvec, tvec=tvec, flags=cv.SOLVEPNP_ITERATIVE)
            # solvePnPGeneric returns lists of solutions; take the first
            rvec = rvec[0]
            tvec = tvec[0]

        # avoid degenerate solutions: reject absurdly large translations
        if succ:
            if np.linalg.norm(tvec) > 1000:
                succ = False

        if succ:
            R, _ = cv.Rodrigues(rvec)  # axis-angle -> rotation matrix
            t = tvec.reshape(3, 1)
        else:
            R = np.full((3, 3), np.nan)
            t = np.full((3, 1), np.nan)
            inliers = []

        return R, t[:, 0], inliers
236
+
237
+
238
class ProcrustesSolver:
    '''Estimate relative pose (metric) using 3D-3D correspondences'''

    def __init__(self, ransac_max_corr_distance=0.5, refine=False):

        # Procrustes RANSAC parameters
        self.ransac_max_corr_distance = ransac_max_corr_distance  # metres
        self.refine = refine  # optionally polish the RANSAC result with ICP

    def estimate_pose(self, pts0, pts1, data):
        """Back-project both keypoint sets with their depth maps, then register
        the two point clouds with correspondence-based RANSAC (+ optional ICP).

        :param pts0: array [N, 2] keypoints in image 0 (pixels)
        :param pts1: array [N, 2] keypoints in image 1 (pixels)
        :param data: dict with 'K_color0'/'K_color1' intrinsics and
            'depth0'/'depth1' depth maps indexed as [v, u]
        :return: (R [3,3], t [3], inliers) mapping camera-0 points to camera 1
        """
        # uses nearest neighbour
        pts0 = np.int32(pts0)
        pts1 = np.int32(pts1)

        if len(pts0) < 3:  # rigid 3D-3D registration needs >= 3 correspondences
            return np.full((3, 3), np.nan), np.full((3), np.nan), 0

        # get depth at correspondence points
        depth_0, depth_1 = data['depth0'], data['depth1']
        depth_pts0 = depth_0[pts0[:, 1], pts0[:, 0]]
        depth_pts1 = depth_1[pts1[:, 1], pts1[:, 0]]

        # remove invalid pts (depth == 0)
        # NOTE(review): invalid depth is detected as <= map minimum — presumably
        # the minimum marks missing measurements; confirm against the data loader.
        valid = (depth_pts0 > depth_0.min()) * (depth_pts1 > depth_1.min())
        if valid.sum() < 3:
            return np.full((3, 3), np.nan), np.full((3), np.nan), 0
        pts0 = pts0[valid]
        pts1 = pts1[valid]
        depth_pts0 = depth_pts0[valid]
        depth_pts1 = depth_pts1[valid]

        # backproject points to 3D in each sensors' local coordinates
        K0 = data['K_color0']
        K1 = data['K_color1']
        xyz_0 = backproject_3d(pts0, depth_pts0, K0)
        xyz_1 = backproject_3d(pts1, depth_pts1, K1)

        # create open3d point cloud objects and correspondences idxs
        pcl_0 = o3d.geometry.PointCloud()
        pcl_0.points = o3d.utility.Vector3dVector(xyz_0)
        pcl_1 = o3d.geometry.PointCloud()
        pcl_1.points = o3d.utility.Vector3dVector(xyz_1)
        # correspondences are index-aligned: point i in pcl_0 <-> point i in pcl_1
        corr_idx = np.arange(pts0.shape[0])
        corr_idx = np.tile(corr_idx.reshape(-1, 1), (1, 2))
        corr_idx = o3d.utility.Vector2iVector(corr_idx)

        # obtain relative pose using procrustes
        ransac_criteria = o3d.pipelines.registration.RANSACConvergenceCriteria()
        res = o3d.pipelines.registration.registration_ransac_based_on_correspondence(
            pcl_0, pcl_1, corr_idx, self.ransac_max_corr_distance, criteria=ransac_criteria)
        # fitness is the inlier fraction; convert back to a count
        inliers = int(res.fitness * np.asarray(pcl_1.points).shape[0])

        # refine with ICP
        if self.refine:
            # first, backproject both (whole) point clouds
            # NOTE(review): mgrid mixes depth_0.shape[0] with depth_1.shape[1] —
            # only correct when both depth maps share a resolution; confirm.
            vv, uu = np.mgrid[0:depth_0.shape[0], 0:depth_1.shape[1]]
            uv_coords = np.concatenate([uu.reshape(-1, 1), vv.reshape(-1, 1)], axis=1)

            valid = depth_0.reshape(-1) > 0
            xyz_0 = backproject_3d(uv_coords[valid], depth_0.reshape(-1)[valid], K0)

            valid = depth_1.reshape(-1) > 0
            xyz_1 = backproject_3d(uv_coords[valid], depth_1.reshape(-1)[valid], K1)

            pcl_0 = o3d.geometry.PointCloud()
            pcl_0.points = o3d.utility.Vector3dVector(xyz_0)
            pcl_1 = o3d.geometry.PointCloud()
            pcl_1.points = o3d.utility.Vector3dVector(xyz_1)

            icp_criteria = o3d.pipelines.registration.ICPConvergenceCriteria(relative_fitness=1e-4,
                                                                             relative_rmse=1e-4,
                                                                             max_iteration=30)

            # seed ICP with the RANSAC transform
            res = o3d.pipelines.registration.registration_icp(pcl_0,
                                                              pcl_1,
                                                              self.ransac_max_corr_distance,
                                                              init=res.transformation,
                                                              criteria=icp_criteria)

        R = res.transformation[:3, :3]
        t = res.transformation[:3, -1]
        inliers = int(res.fitness * np.asarray(pcl_1.points).shape[0])
        return R, t, inliers
configs/default.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from yacs.config import CfgNode as CN


# Default configuration tree; per-experiment YAML files (configs/*.yaml)
# override these values via yacs merging.
_CN = CN()

# Model
_CN.MODEL = CN()
_CN.MODEL.NUM_KEYPOINTS = 1024        # keypoints per image at train time
_CN.MODEL.TEST_NUM_KEYPOINTS = 2048   # keypoints per image at eval time
_CN.MODEL.N_LAYERS = 6
_CN.MODEL.NUM_HEADS = 4
_CN.MODEL.FEATURES = 'superpoint'     # local feature extractor backbone

# Dataset
_CN.DATASET = CN()
_CN.DATASET.TASK = None          # 'scene' or 'object' (see datasets/__init__.py)
_CN.DATASET.DATA_SOURCE = None   # dataset name, e.g. 'scannet', 'linemod'
_CN.DATASET.DATA_ROOT = None
_CN.DATASET.MIN_OVERLAP_SCORE = None

## For MapFree
_CN.DATASET.ESTIMATED_DEPTH = None

## For Linemod(BOP)
_CN.DATASET.OBJECT_ID = None
_CN.DATASET.MIN_VISIBLE_FRACT = None
_CN.DATASET.MAX_ANGLE_ERROR = None
_CN.DATASET.JSON_PATH = None

## For MegaDepth/ScanNet
_CN.DATASET.TRAIN = CN()
_CN.DATASET.TRAIN.DATA_ROOT = None
_CN.DATASET.TRAIN.NPZ_ROOT = None
_CN.DATASET.TRAIN.LIST_PATH = None
_CN.DATASET.TRAIN.INTRINSIC_PATH = None
_CN.DATASET.TRAIN.MIN_OVERLAP_SCORE = None

_CN.DATASET.VAL = CN()
_CN.DATASET.VAL.DATA_ROOT = None
_CN.DATASET.VAL.NPZ_ROOT = None
_CN.DATASET.VAL.LIST_PATH = None
_CN.DATASET.VAL.INTRINSIC_PATH = None
_CN.DATASET.VAL.MIN_OVERLAP_SCORE = None

_CN.DATASET.TEST = CN()
_CN.DATASET.TEST.DATA_ROOT = None
_CN.DATASET.TEST.NPZ_ROOT = None
_CN.DATASET.TEST.LIST_PATH = None
_CN.DATASET.TEST.INTRINSIC_PATH = None
_CN.DATASET.TEST.MIN_OVERLAP_SCORE = None

# Train
_CN.TRAINER = CN()

_CN.TRAINER.EPOCHS = None
_CN.TRAINER.LEARNING_RATE = None
_CN.TRAINER.PCT_START = None            # OneCycle-style warmup fraction
_CN.TRAINER.BATCH_SIZE = None
_CN.TRAINER.NUM_WORKERS = None
_CN.TRAINER.PIN_MEMORY = True
_CN.TRAINER.N_SAMPLES_PER_SUBSET = None  # used by RandomConcatSampler

_CN.RANDOM_SEED = 0


# _CN.EMAT_RANSAC = CN()
# _CN.EMAT_RANSAC.PIX_THRESHOLD = 0.5
# _CN.EMAT_RANSAC.SCALE_THRESHOLD = 0.1
# _CN.EMAT_RANSAC.CONFIDENCE = 0.99999

# _CN.PNP = CN()
# _CN.PNP.RANSAC_ITER = 1000
# _CN.PNP.REPROJECTION_INLIER_THRESHOLD = 3
# _CN.PNP.CONFIDENCE = 0.99999

# _CN.PROCRUSTES = CN()
# _CN.PROCRUSTES.MAX_CORR_DIST = 0.05  # meters
# _CN.PROCRUSTES.REFINE = False


def get_cfg_defaults():
    """Get a yacs CfgNode object with default values for my_project."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    return _CN.clone()
configs/ho3d.yaml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ NUM_KEYPOINTS: 1024
3
+
4
+ DATASET:
5
+ TASK: 'object'
6
+ DATA_SOURCE: 'ho3d'
7
+ DATA_ROOT: 'data/ho3d'
8
+ JSON_PATH: 'assets/ho3d_test_3000/ho3d_test.json'
9
+
10
+ MAX_ANGLE_ERROR: 45
11
+
12
+ TRAINER:
13
+ EPOCHS: 200
14
+ LEARNING_RATE: 0.00002
15
+ BATCH_SIZE: 32
16
+ NUM_WORKERS: 8
17
+ PCT_START: 0.3
18
+ N_SAMPLES_PER_SUBSET: 4000
19
+
configs/linemod.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MODEL:
2
+ NUM_KEYPOINTS: 1200
3
+
4
+ DATASET:
5
+ TASK: 'object'
6
+ DATA_SOURCE: 'linemod'
7
+ DATA_ROOT: 'data/lm'
8
+ JSON_PATH: 'assets/linemod_test_1500/linemod_test.json'
9
+
10
+ MIN_VISIBLE_FRACT: 0.75
11
+ MAX_ANGLE_ERROR: 45
12
+
13
+ TRAINER:
14
+ EPOCHS: 200
15
+ LEARNING_RATE: 0.00002
16
+ BATCH_SIZE: 32
17
+ NUM_WORKERS: 8
18
+ PCT_START: 0.3
19
+ N_SAMPLES_PER_SUBSET: 200
20
+
configs/mapfree.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DATASET:
2
+ TASK: 'scene'
3
+ DATA_SOURCE: 'mapfree'
4
+ DATA_ROOT: 'data/mapfree/'
5
+ # ESTIMATED_DEPTH: None # To load estimated depth map, provide the suffix to the depth files, e.g. 'dptnyu', 'dptkiti'
6
+
7
+ TRAINER:
8
+ EPOCHS: 200
9
+ LEARNING_RATE: 0.00002
10
+ BATCH_SIZE: 32
11
+ NUM_WORKERS: 6
12
+ PCT_START: 0.3
13
+ N_SAMPLES_PER_SUBSET: 200
14
+
configs/matterport.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DATASET:
2
+ TASK: 'scene'
3
+ DATA_SOURCE: 'matterport'
4
+ DATA_ROOT: 'data/mp3d'
5
+
6
+ TRAINER:
7
+ EPOCHS: 200
8
+ LEARNING_RATE: 0.00005
9
+ BATCH_SIZE: 32
10
+ NUM_WORKERS: 8
11
+ PCT_START: 0.3
configs/megadepth.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DATASET:
2
+ TASK: "scene"
3
+ DATA_SOURCE: "megadepth"
4
+
5
+ TRAIN:
6
+ DATA_ROOT: "data/megadepth/train"
7
+ NPZ_ROOT: "data/megadepth/index/scene_info_0.1_0.7"
8
+ LIST_PATH: "data/megadepth/index/trainvaltest_list/train_list.txt"
9
+ MIN_OVERLAP_SCORE: 0.0
10
+
11
+ VAL:
12
+ DATA_ROOT: "data/megadepth/test"
13
+ NPZ_ROOT: "data/megadepth/index/scene_info_val_1500"
14
+ LIST_PATH: "data/megadepth/index/trainvaltest_list/val_list.txt"
15
+ MIN_OVERLAP_SCORE: 0.0
16
+
17
+ TEST:
18
+ DATA_ROOT: "data/megadepth/test"
19
+ NPZ_ROOT: "assets/megadepth_test_1500_scene_info"
20
+ LIST_PATH: "assets/megadepth_test_1500_scene_info/megadepth_test_1500.txt"
21
+ MIN_OVERLAP_SCORE: 0.0
22
+
23
+ TRAINER:
24
+ EPOCHS: 500
25
+ LEARNING_RATE: 0.00002
26
+ BATCH_SIZE: 32
27
+ NUM_WORKERS: 8
28
+ PCT_START: 0.3
29
+ N_SAMPLES_PER_SUBSET: 200
configs/scannet.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DATASET:
2
+ TASK: "scene"
3
+ DATA_SOURCE: "scannet"
4
+
5
+ TRAIN:
6
+ DATA_ROOT: "data/scannet/train"
7
+ NPZ_ROOT: "data/scannet/index/scene_data/train"
8
+ LIST_PATH: "data/scannet/index/scene_data/train_list/scannet_all.txt"
9
+ INTRINSIC_PATH: "data/scannet/index/intrinsics.npz"
10
+ MIN_OVERLAP_SCORE: 0.4
11
+
12
+ VAL:
13
+ DATA_ROOT: "data/scannet/test"
14
+ NPZ_ROOT: "assets/scannet_test_1500"
15
+ LIST_PATH: "assets/scannet_test_1500/scannet_test.txt"
16
+ INTRINSIC_PATH: "assets/scannet_test_1500/intrinsics.npz"
17
+ MIN_OVERLAP_SCORE: 0.0
18
+
19
+ TEST:
20
+ DATA_ROOT: "data/scannet/test"
21
+ NPZ_ROOT: "assets/scannet_test_1500"
22
+ LIST_PATH: "assets/scannet_test_1500/scannet_test.txt"
23
+ INTRINSIC_PATH: "assets/scannet_test_1500/intrinsics.npz"
24
+ MIN_OVERLAP_SCORE: 0.0
25
+
26
+ TRAINER:
27
+ EPOCHS: 500
28
+ LEARNING_RATE: 0.0001
29
+ BATCH_SIZE: 32
30
+ NUM_WORKERS: 8
31
+ PCT_START: 0.3
32
+ N_SAMPLES_PER_SUBSET: 200
33
+
datasets/__init__.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from .matterport import build_matterport
from .linemod import build_linemod
from .megadepth import build_concat_megadepth
from .scannet import build_concat_scannet
from .ho3d import build_ho3d
from .mapfree import build_concat_mapfree
from .sampler import RandomConcatSampler

# Registry mapping DATASET.TASK -> DATASET.DATA_SOURCE -> builder function.
# Each builder has the signature build_*(mode, config) and returns a
# torch.utils.data.Dataset.
dataset_dict = {
    'scene': {
        'matterport': build_matterport,
        'megadepth': build_concat_megadepth,
        'scannet': build_concat_scannet,
        'mapfree': build_concat_mapfree,
    },
    'object': {
        'linemod': build_linemod,
        'ho3d': build_ho3d,
    }
}
datasets/ho3d.py ADDED
@@ -0,0 +1,331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import numpy as np
3
+ from PIL import Image
4
+ import cv2
5
+ import pickle
6
+ import json
7
+ from tqdm import tqdm
8
+ import torch
9
+ from torch.utils.data import Dataset, ConcatDataset
10
+
11
+ from utils.augment import Augmentor
12
+
13
+
14
class HO3D(Dataset):
    """Single HO3D sequence: per-frame RGB, segmentation, depth and object pose.

    Frames without camera metadata or with a (nearly) empty object mask are
    filtered out at construction time.
    """

    def __init__(self, data_root, sequence_path, mode):
        self.data_root = Path(data_root)
        # HO3D ships 'train' and 'evaluation' splits; anything non-train maps
        # to 'evaluation'.
        mode = 'evaluation' if mode != 'train' else 'train'
        self.sequence_dir = self.data_root / mode / sequence_path

        self.color_dir = self.sequence_dir / 'rgb'
        self.mask_dir = self.sequence_dir / 'seg'
        self.depth_dir = self.sequence_dir / 'depth'
        self.meta_dir = self.sequence_dir / 'meta'

        self.color_paths = list(self.color_dir.iterdir())
        self.color_paths = sorted(self.color_paths)

        # mask/depth/meta files share the RGB frame's stem
        self.mask_paths = [self.mask_dir / f'{x.stem}.png' for x in self.color_paths]
        self.depth_paths = [self.depth_dir / f'{x.stem}.png' for x in self.color_paths]
        self.meta_paths = [self.meta_dir / f'{x.stem}.pkl' for x in self.color_paths]

        # First filter: drop frames whose meta file has no camera matrix.
        self.intrinsics, self.extrinsics, self.objCorners, self.objNames, valid = self._load_meta(self.meta_paths)

        self.color_paths = np.array(self.color_paths)[valid.numpy()]
        self.mask_paths = np.array(self.mask_paths)[valid.numpy()]
        self.depth_paths = np.array(self.depth_paths)[valid.numpy()]
        self.meta_paths = np.array(self.meta_paths)[valid.numpy()]

        # Second filter: drop frames whose object mask is (almost) empty.
        self.bboxes, valid = self._load_bboxes(self.mask_paths)
        self.intrinsics = self.intrinsics[valid]
        self.extrinsics = self.extrinsics[valid]
        self.objCorners = self.objCorners[valid]
        self.objNames = self.objNames[valid.numpy()]
        self.color_paths = self.color_paths[valid.numpy()]
        self.mask_paths = self.mask_paths[valid.numpy()]
        self.depth_paths = self.depth_paths[valid.numpy()]
        self.meta_paths = self.meta_paths[valid.numpy()]

        assert len(self.color_paths) == self.intrinsics.shape[0]
        assert len(self.objNames) == self.extrinsics.shape[0]

        self.augment = Augmentor(mode=='train')

    def __len__(self):
        return len(self.color_paths)

    def _load_bboxes(self, mask_paths):
        """Compute an object bbox per mask; masks with < 100 object pixels are invalid.

        Returns (bboxes [M, 4] int tensor, valid [N] bool tensor).
        """
        bboxes = []
        valid = []
        for mask_path in mask_paths:
            mask = cv2.imread(str(mask_path))
            # masks are read at native resolution; scales map coords to 640x480
            w_scale, h_scale = 640 / mask.shape[1], 480 / mask.shape[0]
            # object pixels are marked in the green channel
            obj_mask = torch.from_numpy(mask[..., 1] == 255)

            if obj_mask.float().sum() < 100:
                valid.append(False)
                continue
            valid.append(True)

            mask_inds = torch.where(obj_mask)
            # mask_inds[0] are row (y) indices, mask_inds[1] column (x) indices
            x1, x2 = mask_inds[0].aminmax()
            y1, y2 = mask_inds[1].aminmax()
            # NOTE(review): bbox stored as [y1*h_scale, x1*w_scale, ...] pairs a
            # column coordinate with h_scale and a row coordinate with w_scale —
            # looks swapped unless masks are square; confirm against consumers.
            bbox = torch.tensor([y1*h_scale, x1*w_scale, y2*h_scale, x2*w_scale]).int()
            bboxes.append(bbox)

        bboxes = torch.stack(bboxes)
        valid = torch.tensor(valid)

        return bboxes, valid

    def _load_meta(self, meta_paths):
        """Load per-frame camera intrinsics, object pose and posed 3D corners.

        Frames whose annotation lacks 'camMat' are marked invalid.
        Returns (intrinsics [M,3,3], extrinsics [M,4,4], objCorners [M,8,3],
        objNames [M], valid [N] bool tensor).
        """
        intrinsics = []
        extrinsics = []
        objCorners = []
        objNames = []
        valid = []
        for meta_path in meta_paths:
            with open(meta_path, 'rb') as f:
                anno = pickle.load(f, encoding='latin1')

            if anno['camMat'] is None:
                valid.append(False)
                continue
            valid.append(True)

            camMat = torch.from_numpy(anno['camMat'])
            # object pose: axis-angle rotation + translation -> 4x4 matrix
            ex = torch.eye(4)
            ex[:3, :3] = torch.from_numpy(cv2.Rodrigues(anno['objRot'])[0])
            ex[:3, 3] = torch.from_numpy(anno['objTrans'])
            objCorners3DRest = torch.from_numpy(anno['objCorners3DRest']).float()
            # transform rest-pose corners into the camera frame
            objCorners3DRest = objCorners3DRest @ ex[:3, :3].T + ex[:3, 3]

            intrinsics.append(camMat)
            extrinsics.append(ex)
            objCorners.append(objCorners3DRest)
            objNames.append(anno['objName'])

        intrinsics = torch.stack(intrinsics).float()
        extrinsics = torch.stack(extrinsics).float()
        objCorners = torch.stack(objCorners)
        objNames = np.array(objNames)
        valid = torch.tensor(valid)

        return intrinsics, extrinsics, objCorners, objNames, valid

    def _load_mask(self, mask_path):
        # boolean object mask, resized to 640x480 (green channel == 255)
        mask = cv2.imread(str(mask_path))
        mask = cv2.resize(mask, (640, 480))
        mask = mask[..., 1] == 255
        return mask

    def _load_depth(self, depth_path):
        # HO3D encodes depth across two PNG channels; this constant converts
        # the combined 16-bit value to metres.
        depth_scale = 0.00012498664727900177
        depth_img = cv2.imread(str(depth_path))

        dpt = depth_img[:, :, 2] + depth_img[:, :, 1] * 256
        dpt = dpt * depth_scale

        return dpt

    def __getitem__(self, idx):
        """Return one frame as a dict of tensors (color, mask, depth, pose, ...)."""
        color = cv2.imread(str(self.color_paths[idx]))
        color = cv2.cvtColor(color, cv2.COLOR_BGR2RGB)
        # color = self.augment(color)  # augmentation currently disabled
        color = (torch.tensor(color).float() / 255.0).permute(2, 0, 1)  # CHW in [0,1]

        mask = self._load_mask(self.mask_paths[idx])
        mask = torch.from_numpy(mask)
        depth = self._load_depth(self.depth_paths[idx])
        depth = torch.from_numpy(depth)

        bbox = self.bboxes[idx]

        intrinsic = self.intrinsics[idx]
        extrinsic = self.extrinsics[idx]
        objCorners = self.objCorners[idx]
        objName = self.objNames[idx]

        return {
            'color': color,
            'mask': mask,
            'depth': depth,
            'extrinsic': extrinsic,
            'intrinsic': intrinsic,
            'objCorners': objCorners,
            'bbox': bbox,
            # path relative to the dataset root (strip the first two components)
            'color_path': str(self.color_paths[idx]).split('/', 2)[-1],
            'objName': objName,
        }
169
+
170
+
171
class HO3DPair(Dataset):
    """Frame pairs from one HO3D sequence whose relative rotation is below a
    maximum angular error; yields image pairs with their relative pose."""

    def __init__(self, data_root, mode, sequence_id, max_angle_error):
        self.ho3d_dataset = HO3D(data_root, sequence_id, mode)

        # All pairwise rotation errors; keep pairs under the angle budget,
        # and only the upper triangle (i < j) to avoid duplicates.
        angle_err = self.get_angle_error(self.ho3d_dataset.extrinsics[:, :3, :3])
        index0, index1 = torch.where(angle_err < max_angle_error)
        # NOTE: `filter` shadows the builtin; kept as-is here
        filter = torch.where(index0 < index1)
        self.index0, self.index1 = index0[filter], index1[filter]

        self.indices = torch.tensor(list(zip(self.index0, self.index1)))
        if mode == 'val' or mode == 'test':
            # subsample a fixed-size random evaluation set
            self.indices = self.indices[torch.randperm(self.indices.size(0))[:1500]]

    def get_angle_error(self, R):
        """Pairwise geodesic rotation distance in degrees.

        :param R: tensor (B, 3, 3) of rotation matrices
        :return: tensor (B, B) of angular errors
        """
        # R_a^T R_b for all pairs; its trace gives the rotation angle
        residual = torch.einsum('aij,bik->abjk', R, R)
        trace = torch.diagonal(residual, dim1=-2, dim2=-1).sum(-1)
        cosine = (trace - 1) / 2
        cosine = torch.clip(cosine, -1, 1)  # guard acos against numeric drift
        R_err = torch.acos(cosine)
        angle_err = R_err.rad2deg()

        return angle_err

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        """Return a pair dict: stacked images plus relative (R, t) from frame 0 to 1."""
        idx0, idx1 = self.indices[idx]
        data0, data1 = self.ho3d_dataset[idx0], self.ho3d_dataset[idx1]

        images = torch.stack([data0['color'], data1['color']], dim=0)

        # relative pose: cam0-object pose composed with inverse of cam1's
        ex0, ex1 = data0['extrinsic'], data1['extrinsic']
        rel_ex = ex1 @ ex0.inverse()
        rel_R = rel_ex[:3, :3]
        rel_t = rel_ex[:3, 3]

        intrinsics = torch.stack([data0['intrinsic'], data1['intrinsic']], dim=0)
        bboxes = torch.stack([data0['bbox'], data1['bbox']])
        objCorners = torch.stack([data0['objCorners'], data1['objCorners']])

        return {
            'images': images,
            'rotation': rel_R,
            'translation': rel_t,
            'intrinsics': intrinsics,
            'bboxes': bboxes,
            'objCorners': objCorners,
            'pair_names': (data0['color_path'], data1['color_path']),
            'objName': data0['objName']
        }
224
+
225
+
226
class HO3DfromJson(Dataset):
    """Fixed HO3D evaluation pairs loaded from a pre-generated JSON manifest.

    The JSON maps string indices to pair records (paths, relative pose,
    intrinsics, bboxes, object corners), so evaluation is reproducible.
    """

    def __init__(self, data_root, json_path):
        self.data_root = Path(data_root)
        with open(json_path, 'r') as f:
            self.scene_info = json.load(f)

        # YCB objects present in the evaluation split
        self.obj_names = [
            '003_cracker_box',
            '006_mustard_bottle',
            '011_banana',
            '025_mug',
            '037_scissors'
        ]
        # model point clouds, used downstream for ADD-style metrics
        self.object_points = {obj: np.loadtxt(self.data_root / 'models' / obj / 'points.xyz') for obj in self.obj_names}

    def _load_color(self, path):
        # BGR -> RGB uint8 image
        color = cv2.imread(path)
        color = cv2.cvtColor(color, cv2.COLOR_BGR2RGB)
        return color

    def _load_mask(self, path):
        # mask lives next to the RGB frame under 'seg/' with a .png extension
        mask_path = str(path).replace('rgb', 'seg').replace('.jpg', '.png')
        mask = cv2.imread(str(mask_path))
        mask = cv2.resize(mask, (640, 480))
        mask = mask[..., 1] == 255  # object pixels marked in the green channel
        return mask

    def _load_depth(self, path):
        # two PNG channels encode a 16-bit depth value; constant converts to metres
        depth_scale = 0.00012498664727900177

        depth_path = str(path).replace('rgb', 'depth').replace('.jpg', '.png')
        depth_img = cv2.imread(depth_path)

        dpt = depth_img[:, :, 2] + depth_img[:, :, 1] * 256
        dpt = dpt * depth_scale

        return dpt

    def __len__(self):
        return len(self.scene_info)

    def __getitem__(self, idx):
        """Assemble one evaluation pair from its JSON record."""
        info = self.scene_info[str(idx)]
        pair_names = info['pair_names']

        image0 = self._load_color(str(self.data_root / pair_names[0]))
        image0 = (torch.tensor(image0).float() / 255.0).permute(2, 0, 1)  # CHW in [0,1]
        image1 = self._load_color(str(self.data_root / pair_names[1]))
        image1 = (torch.tensor(image1).float() / 255.0).permute(2, 0, 1)
        images = torch.stack([image0, image1], dim=0)

        mask0 = self._load_mask(str(self.data_root / pair_names[0]))
        mask0 = torch.from_numpy(mask0)
        mask1 = self._load_mask(str(self.data_root / pair_names[1]))
        mask1 = torch.from_numpy(mask1)
        masks = torch.stack([mask0, mask1], dim=0)

        depth0 = self._load_depth(str(self.data_root / pair_names[0]))
        depth0 = torch.from_numpy(depth0)
        depth1 = self._load_depth(str(self.data_root / pair_names[1]))
        depth1 = torch.from_numpy(depth1)
        depths = torch.stack([depth0, depth1], dim=0)

        rotation = torch.tensor(info['rotation']).reshape(3, 3)
        translation = torch.tensor(info['translation'])
        intrinsics = torch.tensor(info['intrinsics']).reshape(2, 3, 3)
        bboxes = torch.tensor(info['bboxes'])
        objCorners = torch.tensor(info['objCorners'])

        return {
            'images': images,
            'masks': masks,
            'depths': depths,
            'rotation': rotation,
            'translation': translation,
            'intrinsics': intrinsics,
            'bboxes': bboxes,
            'objCorners': objCorners,
            'objName': info['objName'][0],
            'point_cloud': self.object_points[info['objName'][0]]
        }
307
+
308
+
309
def build_ho3d(mode, config):
    """Build the HO3D dataset for the requested split.

    Train: concatenation of per-sequence HO3DPair datasets over every sequence
    not held out for validation. Val/test: the fixed JSON-defined pair set.

    :param mode: 'train', 'val' or 'test'
    :param config: full config node; only config.DATASET is used
    :return: a torch Dataset
    """
    config = config.DATASET

    data_root = config.DATA_ROOT
    seq_id_list = [x.stem for x in (Path(data_root) / 'train').iterdir()]
    # sequences held out from training (used for val/test pair generation)
    val_id_list = ['BB14', 'SMu1', 'MC1', 'GSF14', 'SM2', 'SM3', 'SM4', 'SM5', 'MC2', 'MC4', 'MC5', 'MC6']
    for held_out_id in val_id_list:
        seq_id_list.remove(held_out_id)

    if mode == 'train':
        per_sequence = [
            HO3DPair(data_root, mode, sequence_id, config.MAX_ANGLE_ERROR)
            for sequence_id in tqdm(seq_id_list, desc=f'Loading HO3D {mode} dataset')
        ]
        return ConcatDataset(per_sequence)

    elif mode == 'test' or mode == 'val':
        return HO3DfromJson(config.DATA_ROOT, config.JSON_PATH)
331
+
datasets/linemod.py ADDED
@@ -0,0 +1,441 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ import math
4
+ import random
5
+ from tqdm import tqdm, trange
6
+ import plyfile
7
+
8
+ import numpy as np
9
+ import torch
10
+ from PIL import Image
11
+ from torch.utils.data import Dataset, ConcatDataset
12
+ from torch.nn import functional as F
13
+
14
+ from utils import Augmentor
15
+
16
+
17
# Mapping from the zero-padded BOP object-id string (used as directory /
# file-name component, e.g. 'obj_000001.ply') to the LINEMOD object name.
LINEMOD_ID_TO_NAME = {
    '000001': 'ape',
    '000002': 'benchvise',
    '000003': 'bowl',
    '000004': 'camera',
    '000005': 'can',
    '000006': 'cat',
    '000007': 'mug',
    '000008': 'driller',
    '000009': 'duck',
    '000010': 'eggbox',
    '000011': 'glue',
    '000012': 'holepuncher',
    '000013': 'iron',
    '000014': 'lamp',
    '000015': 'phone',
}
34
+
35
+
36
def inverse_transform(trans):
    """Invert a rigid-body transform.

    Args:
        trans: array whose top-left 3x3 block is a rotation matrix and whose
            first three rows of the last column hold the translation.

    Returns:
        A new (4, 4) float32 array: the inverse transform
        [R^T, -R^T t; 0 0 0 1].
    """
    rot_inv = trans[:3, :3].T
    t_inv = -rot_inv @ trans[:3, 3]
    inverse = np.zeros((4, 4), dtype=np.float32)
    inverse[3, 3] = 1
    inverse[:3, :3] = rot_inv
    inverse[:3, 3] = t_inv
    return inverse
46
+
47
+
48
class BOPDataset(Dataset):
    """Per-frame dataset over a single BOP-format scene for one object.

    Loads RGB, visible-object mask, depth, intrinsics and object-to-camera
    extrinsics for every frame in which ``object_id`` appears, dropping
    frames whose visible fraction is below ``min_visible_fract``.
    """

    def __init__(self,
                 dataset_path,
                 scene_path,
                 object_id,
                 min_visible_fract,
                 mode,
                 rgb_postfix='.png',
                 object_scale=None
                 ):
        """
        Args:
            dataset_path (Path): dataset root; its directory name
                ('lm'/'lmo'/'tless') selects the model directory and base
                object scale.
            scene_path (Path): scene directory with rgb/, depth/, mask_visib/
                and the scene_*.json metadata files.
            object_id (int): BOP object id to extract from this scene.
            min_visible_fract (float): visibility threshold for keeping frames.
            mode (str): 'train' enables photometric augmentation.
            rgb_postfix (str): file extension of the RGB frames.
            object_scale (float, optional): depth scale factor; if None it is
                derived from the object diameter in models_info.json.
        """
        super().__init__()
        self.dataset_path = dataset_path
        self.scene_path = scene_path
        self.object_id = object_id

        if dataset_path.name == 'lm' or dataset_path.name == 'lmo':
            base_obj_scale = 1.0
            self.models_path = self.dataset_path / 'models'
        elif dataset_path.name == 'tless':
            base_obj_scale = 0.60
            self.models_path = self.dataset_path / 'models_reconst'
        else:
            raise ValueError(f'Unknown dataset type {dataset_path.name}')

        self.model_path = self.models_path / f'obj_{self.object_id:06d}.ply'
        self.pointcloud_path = self.dataset_path / 'models_eval' / f'obj_{self.object_id:06d}.ply'

        models_info_path = self.dataset_path / 'models_eval' / 'models_info.json'
        with open(models_info_path, 'r') as f:
            self.model_info = json.load(f)[str(object_id)]

        # self.center_object = center_object
        # Default scale normalises depth by the object diameter unless the
        # caller pins an explicit scale.
        if object_scale is None:
            self.object_scale = base_obj_scale / self.model_info['diameter']
        else:
            self.object_scale = object_scale

        # self.image_scale = 1.0
        # Axis-aligned model bounds, one (min, max) row per axis, plus midpoint.
        self.bounds = torch.tensor([
            (self.model_info['min_x'], self.model_info['min_x'] + self.model_info['size_x']),
            (self.model_info['min_y'], self.model_info['min_y'] + self.model_info['size_y']),
            (self.model_info['min_z'], self.model_info['min_z'] + self.model_info['size_z']),
        ])
        self.centroid = self.bounds.mean(dim=1)

        self.depth_dir = self.scene_path / 'depth'
        self.mask_dir = self.scene_path / 'mask_visib'
        self.color_dir = self.scene_path / 'rgb'
        self.intrinsics_path = self.scene_path / 'scene_camera.json'
        self.extrinsics_path = self.scene_path / 'scene_gt.json'
        self.gt_info_path = self.scene_path / 'scene_gt_info.json'

        self.intrinsics, self.depth_scales = self.load_intrinsics(self.intrinsics_path)
        self.extrinsics, self.scene_object_inds = self.load_extrinsics(self.extrinsics_path)
        self.extrinsics = torch.stack(self.extrinsics, dim=0)
        self.gt_info = self.load_gt_info(self.gt_info_path)

        # # Compute quaternions for sampling.
        # rotation, translation = three.decompose(self.extrinsics)
        # self.quaternions = three.quaternion.mat_to_quat(rotation[:, :3, :3])

        self.depth_paths = sorted([self.depth_dir / f'{frame_ind:06d}.png'
                                   for frame_ind in self.scene_object_inds.keys()])
        # Mask files are named <frame>_<per-frame annotation index>.png; the
        # dict preserves the sorted frame order produced by load_extrinsics.
        self.mask_paths = [
            self.mask_dir / f'{frame_ind:06d}_{obj_ind:06d}.png'
            for frame_ind, obj_ind in self.scene_object_inds.items()
        ]
        self.color_paths = sorted([self.color_dir / f'{frame_ind:06d}{rgb_postfix}'
                                   for frame_ind in self.scene_object_inds.keys()])

        # Drop frames where the object is barely visible.
        visib_filter = np.array(self.gt_info['visib_fract']) >= min_visible_fract
        self.color_paths = np.array(self.color_paths)[visib_filter]
        self.mask_paths = np.array(self.mask_paths)[visib_filter]
        self.depth_paths = np.array(self.depth_paths)[visib_filter]
        self.depth_scales = np.array(self.depth_scales)[visib_filter]
        for k in self.gt_info:
            self.gt_info[k] = np.array(self.gt_info[k])[visib_filter]

        self.extrinsics = np.array(self.extrinsics)[visib_filter]
        self.intrinsics = np.array(self.intrinsics)[visib_filter]

        self.augment = Augmentor(mode=='train')

        assert len(self.depth_paths) == len(self.mask_paths)
        assert len(self.depth_paths) == len(self.color_paths)

    # def load_pointcloud(self):
    #     obj = meshutils.Object3D(self.pointcloud_path)
    #     points = torch.tensor(obj.vertices, dtype=torch.float32)
    #     points = points * self.object_scale
    #     return points

    def load_gt_info(self, path):
        """Collect the per-frame GT annotations (bboxes, visibility counts)
        of the tracked object, keyed by annotation field name.

        NOTE(review): field names are taken from frame '0', annotation 0 —
        assumes frame 0 exists and all frames share the same fields.
        """
        with open(path, 'r') as f:
            gt_info_json = json.load(f)
        keys = sorted([int(k) for k in gt_info_json.keys()])
        gt_info = {k: [] for k in gt_info_json['0'][0]}
        for key in keys:
            value = gt_info_json[str(key)]
            obj_info = value[self.scene_object_inds[key]]
            for info_k in obj_info:
                gt_info[info_k].append(obj_info[info_k])

        return gt_info

    def load_intrinsics(self, path):
        """Parse scene_camera.json into per-frame 3x3 intrinsics and the
        per-frame BOP depth scale, in sorted frame order."""
        intrinsics = []
        depth_scales = []
        with open(path, 'r') as f:
            intrinsics_json = json.load(f)
        keys = sorted([int(k) for k in intrinsics_json.keys()])
        for key in keys:
            value = intrinsics_json[str(key)]
            intrinsic_3x3 = value['cam_K']
            intrinsics.append(torch.tensor(intrinsic_3x3).reshape(3, 3))
            depth_scales.append(value['depth_scale'])

        return intrinsics, depth_scales

    def load_extrinsics(self, path):
        """Parse scene_gt.json for the tracked object.

        Returns:
            extrinsics: list of 4x4 object-to-camera transforms
                (translation converted from millimetres to metres).
            scene_object_inds: {frame_ind: per-frame annotation index},
                inserted in sorted frame order.
        """
        extrinsics = []
        scene_object_inds = {}
        with open(path, 'r') as f:
            extrinsics_json = json.load(f)
        frame_inds = sorted([int(k) for k in extrinsics_json.keys()])
        for frame_ind in frame_inds:
            for obj_ind, cam_d in enumerate(extrinsics_json[str(frame_ind)]):
                if cam_d['obj_id'] == self.object_id:
                    rotation = torch.tensor(
                        cam_d['cam_R_m2c'], dtype=torch.float32).reshape(3, 3)
                    translation = torch.tensor(cam_d['cam_t_m2c'], dtype=torch.float32) / 1000.
                    # quaternion = three.quaternion.mat_to_quat(rotation)
                    # extrinsics.append(three.to_extrinsic_matrix(translation, quaternion))
                    extrinsic = torch.eye(4)
                    extrinsic[:3, :3] = rotation
                    extrinsic[:3, 3] = translation
                    extrinsics.append(extrinsic)
                    scene_object_inds[frame_ind] = obj_ind

        return extrinsics, scene_object_inds

    def __len__(self):
        return len(self.color_paths)

    def get_ids(self):
        """Frame ids (file stems) of the kept color frames."""
        return [p.stem for p in self.color_paths]

    def _load_color(self, path):
        """Read an RGB image as a numpy array."""
        image = Image.open(path)
        image = np.array(image)
        return image

    def _load_mask(self, path):
        """Read a mask as a boolean array; collapse multi-channel masks to
        their first channel."""
        image = Image.open(path)
        image = np.array(image, dtype=bool)
        if len(image.shape) > 2:
            image = image[:, :, 0]
        return image

    def _load_depth(self, path):
        """Read a depth image as float32 (raw stored units)."""
        image = Image.open(path)
        image = np.array(image, dtype=np.float32)
        return image

    def __getitem__(self, idx):
        """Return one frame as a dict of image, mask, depth, pose,
        intrinsics and GT visibility info."""
        color = self._load_color(self.color_paths[idx])
        # color = self.augment(color)
        color = (torch.tensor(color).float() / 255.0).permute(2, 0, 1)
        mask = self._load_mask(self.mask_paths[idx])
        mask = torch.tensor(mask).bool()
        depth = self._load_depth(self.depth_paths[idx])
        # Raw depth -> object-normalised units via the per-frame depth scale.
        depth = torch.tensor(depth) * self.object_scale * self.depth_scales[idx]

        # intrinsic = self.normalize_intrinsic(self.intrinsics[idx])
        # extrinsic = self.normalize_extrinsic(self.extrinsics[idx])

        intrinsic = torch.from_numpy(self.intrinsics[idx])
        extrinsic = torch.from_numpy(self.extrinsics[idx])

        # Convert (x, y, w, h) boxes to (x0, y0, x1, y1).
        bbox_obj = self.gt_info['bbox_obj'][idx]
        bbox_visib = self.gt_info['bbox_visib'][idx]

        bbox_obj = torch.tensor([bbox_obj[0], bbox_obj[1], bbox_obj[0]+bbox_obj[2], bbox_obj[1]+bbox_obj[3]])
        bbox_visib = torch.tensor([bbox_visib[0], bbox_visib[1], bbox_visib[0]+bbox_visib[2], bbox_visib[1]+bbox_visib[3]])

        visib_fract = self.gt_info['visib_fract'][idx]
        px_count_visib = self.gt_info['px_count_visib'][idx]

        return {
            'color': color,
            'mask': mask,
            'depth': depth,
            'extrinsic': extrinsic,
            'intrinsic': intrinsic,
            'bbox_obj': bbox_obj,
            'bbox_visib': bbox_visib,
            'visib_fract': visib_fract,
            'px_count_visib': px_count_visib,
            # Path with the first two components dropped (relative to dataset root).
            'color_path': str(self.color_paths[idx]).split('/', 2)[-1],
            'object_scale': self.object_scale,
            'depth_scale': self.depth_scales[idx]
        }
251
+
252
+
253
class Linemod(Dataset):
    """Image-pair dataset over one LINEMOD (BOP) scene for relative pose.

    Builds all ordered frame pairs (i < j) whose ground-truth relative
    rotation is below ``max_angle_error`` degrees and serves them as
    two-view samples.
    """

    def __init__(self, data_root, mode, object_id, scene_id, min_visible_fract, max_angle_error):
        """
        Args:
            data_root (str): BOP dataset root.
            mode (str): 'train' uses the PBR renders; 'val'/'test' use the
                real test scenes, where the scene id equals the object id.
            object_id (int): BOP object id.
            scene_id (int): scene index (only meaningful for 'train').
            min_visible_fract (float): per-frame visibility filter.
            max_angle_error (float): maximum relative rotation (degrees)
                allowed between the two frames of a pair.
        """
        if mode == 'train':
            type_path = 'train_pbr'
            rgb_postfix = '.jpg'
        elif mode == 'val' or mode == 'test':
            type_path = 'test'
            rgb_postfix = '.png'
            # In the real test split each object lives in its own scene.
            scene_id = object_id
        else:
            raise NotImplementedError(f'mode {mode}')

        data_root = Path(data_root)
        scene_path = data_root / type_path / f'{scene_id:06d}'
        self.bop_dataset = BOPDataset(data_root, scene_path, object_id=object_id, min_visible_fract=min_visible_fract, mode=mode, rgb_postfix=rgb_postfix)

        # Keep only ordered pairs (i < j) with a small enough relative rotation.
        angle_err = self.get_angle_error(torch.from_numpy(self.bop_dataset.extrinsics[:, :3, :3]))
        index0, index1 = torch.where(angle_err < max_angle_error)
        keep = torch.where(index0 < index1)  # renamed from `filter` (shadowed the builtin)
        self.index0, self.index1 = index0[keep], index1[keep]

        # (N, 2) pair-index table; torch.stack avoids the slow
        # torch.tensor(list(zip(...))) round-trip of the original.
        self.indices = torch.stack([self.index0, self.index1], dim=1)
        if mode == 'val':
            # Subsample validation pairs to a fixed budget.
            self.indices = self.indices[torch.randperm(self.indices.size(0))[:1500]]

    def get_angle_error(self, R):
        """Pairwise geodesic rotation distance in degrees.

        Args:
            R: (B, 3, 3) batch of rotation matrices.

        Returns:
            (B, B) tensor where entry (a, b) is angle(R_a^T R_b) in degrees.
        """
        # einsum('aij,bik->abjk') computes R_a^T R_b for every pair (a, b).
        residual = torch.einsum('aij,bik->abjk', R, R)
        trace = torch.diagonal(residual, dim1=-2, dim2=-1).sum(-1)
        cosine = (trace - 1) / 2
        cosine = torch.clip(cosine, -1, 1)  # guard acos against fp drift
        return torch.acos(cosine).rad2deg()

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        """Return one two-view sample with relative pose cam0 -> cam1."""
        idx0, idx1 = self.indices[idx]
        data0, data1 = self.bop_dataset[idx0], self.bop_dataset[idx1]

        images = torch.stack([data0['color'], data1['color']], dim=0)

        # Relative pose mapping camera-0 coordinates to camera-1.
        ex0, ex1 = data0['extrinsic'], data1['extrinsic']
        rel_ex = ex1 @ ex0.inverse()
        rel_R = rel_ex[:3, :3]
        rel_t = rel_ex[:3, 3]

        intrinsics = torch.stack([data0['intrinsic'], data1['intrinsic']], dim=0)
        bboxes = torch.stack([data0['bbox_visib'], data1['bbox_visib']])

        return {
            'images': images,
            'rotation': rel_R,
            'translation': rel_t,
            'intrinsics': intrinsics,
            'bboxes': bboxes,
            'pair_names': (data0['color_path'], data1['color_path']),
            'object_scale': data0['object_scale'],
            'depth_scale': (data0['depth_scale'], data1['depth_scale']),
        }
318
+
319
+
320
class LinemodfromJson(Dataset):
    """Pre-sampled LINEMOD test pairs loaded from a JSON index.

    Each JSON entry holds a pair of image paths, the relative pose,
    per-view intrinsics, visible-object bounding boxes and depth scales.
    Evaluation point clouds and diameters come from models_eval/.
    """

    def __init__(self, data_root, json_path):
        """
        Args:
            data_root (str|Path): BOP dataset root.
            json_path (str|Path): index of the precomputed test pairs.
        """
        self.data_root = Path(data_root)
        with open(json_path, 'r') as f:
            self.scene_info = json.load(f)

        # self.image_scale = 1.0

        models_info_path = self.data_root / 'models_eval' / 'models_info.json'
        with open(models_info_path, 'r') as f:
            model_info = json.load(f)

        # Object diameter (millimetres) and evaluation point cloud per object id.
        self.object_diameters = {obj: model_info[obj]['diameter'] for obj in model_info}
        self.object_points = {obj: self._load_point_cloud(obj) for obj in self.object_diameters}

    def _load_point_cloud(self, obj_id):
        """Load the (N, 3) vertex array of the evaluation model (millimetres)."""
        with open(self.data_root / 'models_eval' / f'obj_{int(obj_id):06d}.ply', "rb") as f:
            plydata = plyfile.PlyData.read(f)
        xyz = np.stack([np.array(plydata["vertex"][c]).astype(float) for c in ("x", "y", "z")], axis=1)
        return xyz

    def _load_color(self, path):
        """Read an RGB image as a numpy array."""
        image = Image.open(path)
        # new_shape = (int(image.width * self.image_scale), int(image.height * self.image_scale))
        # image = image.resize(new_shape)
        image = np.array(image)
        return image

    def _load_mask(self, path):
        """Read the visible-object mask matching an RGB path.

        NOTE(review): rewrites 'rgb' -> 'mask_visib' in the path and assumes
        the target object is annotation index 0 (suffix '_000000') — confirm
        for scenes with multiple annotated objects.
        """
        path = path.replace('rgb', 'mask_visib').replace('.png', '_000000.png')
        image = Image.open(path)
        # new_shape = (int(image.width * self.image_scale), int(image.height * self.image_scale))
        # image = image.resize(new_shape)
        image = np.array(image, dtype=bool)
        if len(image.shape) > 2:
            image = image[:, :, 0]
        return image

    def _load_depth(self, path):
        """Read the depth image matching an RGB path ('rgb' -> 'depth')."""
        path = path.replace('rgb', 'depth')
        image = Image.open(path)
        # new_shape = (int(image.width * self.image_scale), int(image.height * self.image_scale))
        # image = image.resize(new_shape)
        image = np.array(image, dtype=np.float32)
        return image

    def __len__(self):
        return len(self.scene_info)

    def __getitem__(self, idx):
        """Return one test pair; depths and point cloud converted to metres."""
        info = self.scene_info[str(idx)]
        pair_names = info['pair_names']

        image0 = self._load_color(str(self.data_root / pair_names[0]))
        image0 = (torch.tensor(image0).float() / 255.0).permute(2, 0, 1)
        image1 = self._load_color(str(self.data_root / pair_names[1]))
        image1 = (torch.tensor(image1).float() / 255.0).permute(2, 0, 1)
        images = torch.stack([image0, image1], dim=0)

        mask0 = self._load_mask(str(self.data_root / pair_names[0]))
        mask0 = torch.tensor(mask0).bool()
        mask1 = self._load_mask(str(self.data_root / pair_names[1]))
        mask1 = torch.tensor(mask1).bool()
        masks = torch.stack([mask0, mask1], dim=0)

        # Apply per-view depth scales, then millimetres -> metres.
        depth0 = self._load_depth(str(self.data_root / pair_names[0]))
        depth0 = torch.tensor(depth0) * info['depth_scale'][0]
        depth1 = self._load_depth(str(self.data_root / pair_names[1]))
        depth1 = torch.tensor(depth1) * info['depth_scale'][1]
        depths = torch.stack([depth0, depth1], dim=0) / 1000.

        rotation = torch.tensor(info['rotation']).reshape(3, 3)
        translation = torch.tensor(info['translation'])
        intrinsics = torch.tensor(info['intrinsics']).reshape(2, 3, 3)
        bboxes = torch.tensor(info['bboxes'])

        # Object id is the scene directory name, e.g. 'test/000001/rgb/...'.
        obj_id = str(int(pair_names[0].split('/')[1]))
        diameter = self.object_diameters[obj_id]
        point_cloud = torch.from_numpy(self.object_points[obj_id]) / 1000.

        return {
            'images': images,
            'masks': masks,
            'depths': depths,
            'rotation': rotation,
            'translation': translation,
            'intrinsics': intrinsics,
            'bboxes': bboxes,
            'diameter': diameter,
            'point_cloud': point_cloud,
        }
411
+
412
+
413
def build_linemod(mode, config):
    """Build the LINEMOD dataset for the given split.

    Args:
        mode: 'train' (PBR renders), or 'val'/'test' (pre-sampled JSON pairs).
        config: experiment config; only the ``DATASET`` sub-config is used.

    Returns:
        A ``ConcatDataset`` of per-(object, scene) ``Linemod`` datasets for
        training, or a ``LinemodfromJson`` dataset for val/test.

    Raises:
        ValueError: if ``mode`` is not a supported split (the original
            silently returned ``None``).
    """
    config = config.DATASET

    if mode == 'train':
        datasets = []
        # 15 objects x 50 PBR scenes; a scene that does not contain the
        # object raises KeyError during loading and is simply skipped.
        with tqdm(total=len(LINEMOD_ID_TO_NAME) * 50) as t:
            t.set_description(f'Loading Linemod {mode} datasets')
            for i, _ in enumerate(LINEMOD_ID_TO_NAME):
                for j in range(50):
                    t.update(1)
                    try:
                        datasets.append(Linemod(config.DATA_ROOT, mode, i+1, j, config.MIN_VISIBLE_FRACT, config.MAX_ANGLE_ERROR))
                    except KeyError:
                        continue
        return ConcatDataset(datasets)

    if mode in ('test', 'val'):
        return LinemodfromJson(config.DATA_ROOT, config.JSON_PATH)

    raise ValueError(f'Unknown dataset mode {mode!r}')
datasets/mapfree.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import torch
4
+ import torch.utils.data as data
5
+ import cv2
6
+ import numpy as np
7
+ from transforms3d.quaternions import qinverse, qmult, rotate_vector, quat2mat
8
+
9
+ from utils.transform import correct_intrinsic_scale
10
+ from utils import Augmentor
11
+
12
+
13
class MapFreeScene(data.Dataset):
    """Two-view dataset over a single Map-free Relocalisation scene.

    Serves (reference, query) image pairs with precomputed depth maps,
    per-image intrinsics and the ground-truth relative pose.
    """

    def __init__(self, scene_root, resize, sample_factor=1, overlap_limits=None, estimated_depth=None, mode='train'):
        """
        Args:
            scene_root (str|Path): scene directory (seq*/ frames, poses.txt,
                intrinsics.txt and optionally overlaps.npz).
            resize ((int, int)): (width, height) the intrinsics are rescaled to.
            sample_factor (int): keep every Nth pair (test/val only).
            overlap_limits ((float, float)): (min, max) co-visibility bounds
                used to filter training pairs.
            estimated_depth: NOTE(review): stored but never read here; depth
                is always loaded from the '.da.npy' files in __getitem__.
            mode (str): 'train' enables photometric augmentation.
        """
        super().__init__()

        self.scene_root = Path(scene_root)
        self.resize = resize
        self.sample_factor = sample_factor
        self.estimated_depth = estimated_depth

        # load absolute poses
        self.poses = self.read_poses(self.scene_root)

        # read intrinsics
        self.K = self.read_intrinsics(self.scene_root, resize)

        # load pairs
        self.pairs = self.load_pairs(self.scene_root, overlap_limits, self.sample_factor)

        self.augment = Augmentor(mode=='train')

    @staticmethod
    def read_intrinsics(scene_root: Path, resize=None):
        """Parse intrinsics.txt into {img_name: 3x3 K}, rescaled to `resize`."""
        Ks = {}
        with (scene_root / 'intrinsics.txt').open('r') as f:
            for line in f.readlines():
                if '#' in line:
                    continue

                line = line.strip().split(' ')
                img_name = line[0]
                fx, fy, cx, cy, W, H = map(float, line[1:])

                K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)
                if resize is not None:
                    K = correct_intrinsic_scale(K, resize[0] / W, resize[1] / H)
                Ks[img_name] = K
        return Ks

    @staticmethod
    def read_poses(scene_root: Path):
        """
        Returns a dictionary that maps: img_path -> (q, t) where
        np.array q = (qw, qx qy qz) quaternion encoding rotation matrix;
        np.array t = (tx ty tz) translation vector;
        (q, t) encodes absolute pose (world-to-camera), i.e. X_c = R(q) X_W + t
        """
        poses = {}
        with (scene_root / 'poses.txt').open('r') as f:
            for line in f.readlines():
                if '#' in line:
                    continue

                line = line.strip().split(' ')
                img_name = line[0]
                qt = np.array(list(map(float, line[1:])))
                poses[img_name] = (qt[:4], qt[4:])
        return poses

    def load_pairs(self, scene_root: Path, overlap_limits: tuple = None, sample_factor: int = 1):
        """
        For training scenes, filter pairs of frames based on overlap (pre-computed in overlaps.npz)
        For test/val scenes, pairs are formed between keyframe and every other sample_factor query frames.
        If sample_factor == 1, all query frames are used. Note: sample_factor applicable only to test/val
        Returns:
            pairs: nd.array [Npairs, 4], where each column represents seqA, imA, seqB, imB, respectively
        """
        overlaps_path = scene_root / 'overlaps.npz'

        if overlaps_path.exists():
            f = np.load(overlaps_path, allow_pickle=True)
            idxs, overlaps = f['idxs'], f['overlaps']
            if overlap_limits is not None:
                min_overlap, max_overlap = overlap_limits
                mask = (overlaps > min_overlap) * (overlaps < max_overlap)
                idxs = idxs[mask]
            return idxs.copy()
        else:
            # No overlap file: pair the seq0 keyframe (seq 0, frame 0) with
            # every seq1 frame, sub-sampled by sample_factor.
            idxs = np.zeros((len(self.poses) - 1, 4), dtype=np.uint16)
            idxs[:, 2] = 1
            idxs[:, 3] = np.array([int(fn[-9:-4])
                                   for fn in self.poses.keys() if 'seq0' not in fn], dtype=np.uint16)
            return idxs[::sample_factor]

    def get_pair_path(self, pair):
        """Map a (seqA, imgA, seqB, imgB) row to the two relative frame paths."""
        seqA, imgA, seqB, imgB = pair
        return (f'seq{seqA}/frame_{imgA:05}.jpg', f'seq{seqB}/frame_{imgB:05}.jpg')

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, index):
        """Return one (reference, query) pair with depths, pose and intrinsics."""
        # image paths (relative to scene_root)
        img_name0, img_name1 = self.get_pair_path(self.pairs[index])
        # NOTE(review): w_new/h_new are unused because the cv2.resize calls
        # below are commented out, while K was rescaled to `resize` at load
        # time — confirm the images on disk already match that resolution.
        w_new, h_new = self.resize

        image0 = cv2.imread(str(self.scene_root / img_name0))
        # image0 = cv2.resize(image0, (w_new, h_new))
        image0 = cv2.cvtColor(image0, cv2.COLOR_BGR2RGB)
        image0 = self.augment(image0)
        image0 = torch.from_numpy(image0).permute(2, 0, 1).float() / 255.

        image1 = cv2.imread(str(self.scene_root / img_name1))
        # image1 = cv2.resize(image1, (w_new, h_new))
        image1 = cv2.cvtColor(image1, cv2.COLOR_BGR2RGB)
        image1 = self.augment(image1)
        image1 = torch.from_numpy(image1).permute(2, 0, 1).float() / 255.
        images = torch.stack([image0, image1], dim=0)


        # Precomputed depth maps stored next to the frames as '.da.npy'.
        depth0 = np.load(str(self.scene_root / img_name0).replace('.jpg', f'.da.npy'))
        depth0 = torch.from_numpy(depth0).float()

        depth1 = np.load(str(self.scene_root / img_name1).replace('.jpg', f'.da.npy'))
        depth1 = torch.from_numpy(depth1).float()

        depths = torch.stack([depth0, depth1], dim=0)

        # get absolute pose of im0 and im1
        # quaternion and translation vector that transforms World-to-Cam
        q1, t1 = self.poses[img_name0]
        # quaternion and translation vector that transforms World-to-Cam
        q2, t2 = self.poses[img_name1]

        # get 4 x 4 relative pose transformation matrix (from im1 to im2)
        # for test/val set, q1,t1 is the identity pose, so the relative pose matches the absolute pose
        q12 = qmult(q2, qinverse(q1))
        t12 = t2 - rotate_vector(t1, q12)
        T = np.eye(4, dtype=np.float32)
        T[:3, :3] = quat2mat(q12)
        T[:3, -1] = t12
        T = torch.from_numpy(T)

        K_0 = torch.from_numpy(self.K[img_name0].copy()).reshape(3, 3)
        K_1 = torch.from_numpy(self.K[img_name1].copy()).reshape(3, 3)
        intrinsics = torch.stack([K_0, K_1], dim=0).float()

        data = {
            'images': images,
            'depths': depths,
            'rotation': T[:3, :3],
            'translation': T[:3, 3],
            'intrinsics': intrinsics,
            'scene_id': self.scene_root.stem,
            'scene_root': str(self.scene_root),
            'pair_id': index*self.sample_factor,
            'pair_names': (img_name0, img_name1),
        }

        return data
162
+
163
+
164
def build_concat_mapfree(mode, config):
    """Concatenate every MapFreeScene of the requested split.

    Args:
        mode: 'train', 'val' or 'test'.
        config: experiment config providing ``DATASET.DATA_ROOT`` and
            ``DATASET.ESTIMATED_DEPTH``.

    Returns:
        ``torch.utils.data.ConcatDataset`` over every scene directory
        found under <DATA_ROOT>/<mode>.
    """
    assert mode in ['train', 'val', 'test'], 'Invalid dataset mode'

    data_root = Path(config.DATASET.DATA_ROOT) / mode
    # Fixed duplicated `scenes = scenes = ...` assignment; sort for a
    # deterministic dataset order (iterdir order is filesystem-dependent).
    scenes = sorted(s.name for s in data_root.iterdir() if s.is_dir())
    sample_factor = {'train': 1, 'val': 5, 'test': 1}[mode]
    estimated_depth = config.DATASET.ESTIMATED_DEPTH

    resize = (540, 720)
    overlap_limits = (0.2, 0.7)

    # Init dataset objects for each scene
    datasets = [MapFreeScene(data_root / scene, resize, sample_factor, overlap_limits, estimated_depth, mode) for scene in scenes]

    return data.ConcatDataset(datasets)
datasets/matterport.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import cv2
3
+ from pathlib import Path
4
+ import json
5
+ import torch
6
+ from torch.utils.data import Dataset
7
+
8
+ from utils import rotation_matrix_from_quaternion, Augmentor
9
+
10
+
11
class Matterport3D(Dataset):
    """Matterport3D relative-pose pair dataset (mp3d_planercnn split JSONs).

    Each item is a pair of RGB images plus the inverse relative camera pose
    (rotation matrix, translation) and shared pinhole intrinsics.
    """

    def __init__(self, data_root, mode='train'):
        """
        Args:
            data_root (str|Path): dataset root containing the images and the
                mp3d_planercnn_json/cached_set_<mode>.json split file.
            mode (str): 'train' enables photometric augmentation.
        """
        data_root = Path(data_root)
        json_path = data_root / 'mp3d_planercnn_json' / f'cached_set_{mode}.json'

        scene_info = {'images': [], 'rotation': [], 'translation': [], 'intrinsics': []}

        with open(json_path) as f:
            split = json.load(f)

        for _, data in enumerate(split['data']):
            images = []
            for imgnum in ['0', '1']:
                # File names in the JSON are absolute; keep only the tail
                # after the sixth component and re-root it at data_root.
                img_name = data_root / '/'.join(data[imgnum]['file_name'].split('/')[6:])
                images.append(img_name)

            rel_rotation = data['rel_pose']['rotation']
            rel_translation = data['rel_pose']['position']
            # Fixed pinhole intrinsics shared by both views (the principal
            # point suggests 640x480 frames).
            intrinsic = [
                [517.97, 0, 320],
                [0, 517.97, 240],
                [0, 0, 1]
            ]
            intrinsics = [intrinsic, intrinsic]

            scene_info['images'].append(images)
            scene_info['rotation'].append(rel_rotation)
            scene_info['translation'].append(rel_translation)
            scene_info['intrinsics'].append(intrinsics)

        scene_info['rotation'] = torch.tensor(scene_info['rotation'])
        scene_info['translation'] = torch.tensor(scene_info['translation'])
        scene_info['intrinsics'] = torch.tensor(scene_info['intrinsics'])

        self.scene_info = scene_info
        self.augment = Augmentor(mode=='train')

        self.is_training = mode == 'train'

    def __len__(self):
        return len(self.scene_info['images'])

    def __getitem__(self, idx):
        """Return one image pair with the inverted relative pose."""
        img_names = self.scene_info['images'][idx]
        rotation = self.scene_info['rotation'][idx]
        translation = self.scene_info['translation'][idx]
        intrinsics = self.scene_info['intrinsics'][idx]

        images = []
        for i in range(2):
            image = cv2.imread(str(img_names[i]))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = self.augment(image)
            image = torch.from_numpy(image).permute(2, 0, 1)
            images.append(image)
        images = torch.stack(images)
        images = images.float() / 255.

        # Canonicalise the quaternion sign, then normalise OUT-OF-PLACE:
        # `rotation` is a view into scene_info['rotation'], and the original
        # in-place `/=` wrote the normalisation back into the cached tensor.
        rotation = -rotation if rotation[0] < 0 else rotation
        rotation = rotation / rotation.norm(2)
        rotation = rotation_matrix_from_quaternion(rotation[None,])[0]

        # Invert the pose: R^T, -R^T t.
        rotation = rotation.mT
        translation = -rotation @ translation.unsqueeze(-1)
        translation = translation[:, 0]

        return {
            'images': images,
            'rotation': rotation,
            'translation': translation,
            'intrinsics': intrinsics,
        }
83
+
84
+
85
def build_matterport(mode, config):
    """Instantiate the Matterport3D dataset for the requested split."""
    data_root = config.DATASET.DATA_ROOT
    return Matterport3D(data_root, mode)
datasets/megadepth.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path as osp
2
+ import numpy as np
3
+ import cv2
4
+ from tqdm import tqdm
5
+ import torch
6
+ from torch.utils.data import Dataset, ConcatDataset
7
+
8
+ from utils import Augmentor
9
+
10
+
11
class MegaDepthDataset(Dataset):
    """Two-view dataset over one MegaDepth scene described by an .npz index."""

    def __init__(self,
                 root_dir,
                 npz_path,
                 mode='train',
                 min_overlap_score=0.4,
                 ):
        """
        Manage one scene (npz_path) of the MegaDepth dataset.

        Args:
            root_dir (str): megadepth root directory that has `phoenix`.
            npz_path (str): {scene_id}.npz path with image pair information.
            mode (str): options are ['train', 'val', 'test'].
            min_overlap_score (float): minimum co-visibility of a kept pair,
                in [0, 1]. Forced to 0 when testing.
        """
        super().__init__()
        self.root_dir = root_dir
        self.mode = mode
        # NOTE(review): split('.') also truncates at dots inside the scene
        # name (e.g. '0015_0.1_0.3.npz'); kept since scene_id is unused here.
        self.scene_id = npz_path.split('.')[0]

        # prepare scene_info and pair_info
        if mode == 'test':
            min_overlap_score = 0
        # Materialise into a plain dict: np.load on an .npz returns an
        # NpzFile, which does not support item deletion, so the `del`
        # below would raise TypeError on the raw NpzFile.
        self.scene_info = dict(np.load(npz_path, allow_pickle=True))
        self.pair_infos = self.scene_info['pair_infos'].copy()
        del self.scene_info['pair_infos']
        # pair_info := ((idx0, idx1), overlap_score, central_matches)
        self.pair_infos = [pair_info for pair_info in self.pair_infos if pair_info[1] > min_overlap_score]

        self.augment = Augmentor(mode=='train')

    def __len__(self):
        return len(self.pair_infos)

    def __getitem__(self, idx):
        """Return one RGB pair resized to 640x480 with pose and intrinsics."""
        (idx0, idx1), overlap_score, central_matches = self.pair_infos[idx]

        img_name0 = osp.join(self.root_dir, self.scene_info['image_paths'][idx0])
        img_name1 = osp.join(self.root_dir, self.scene_info['image_paths'][idx1])

        w_new, h_new = 640, 480

        image0 = cv2.imread(img_name0)
        # Keep the original->resized scale so intrinsics can be corrected downstream.
        scale0 = torch.tensor([image0.shape[1]/w_new, image0.shape[0]/h_new], dtype=torch.float)
        image0 = cv2.resize(image0, (w_new, h_new))
        image0 = cv2.cvtColor(image0, cv2.COLOR_BGR2RGB)
        # image0 = self.augment(image0)
        image0 = torch.from_numpy(image0).permute(2, 0, 1).float() / 255.

        image1 = cv2.imread(img_name1)
        scale1 = torch.tensor([image1.shape[1]/w_new, image1.shape[0]/h_new], dtype=torch.float)
        image1 = cv2.resize(image1, (w_new, h_new))
        image1 = cv2.cvtColor(image1, cv2.COLOR_BGR2RGB)
        # image1 = self.augment(image1)
        image1 = torch.from_numpy(image1).permute(2, 0, 1).float() / 255.

        scales = torch.stack([scale0, scale1], dim=0)
        images = torch.stack([image0, image1], dim=0)

        # read intrinsics of original size
        K_0 = torch.tensor(self.scene_info['intrinsics'][idx0].copy(), dtype=torch.float).reshape(3, 3)
        K_1 = torch.tensor(self.scene_info['intrinsics'][idx1].copy(), dtype=torch.float).reshape(3, 3)
        intrinsics = torch.stack([K_0, K_1], dim=0)

        # read and compute relative poses (cam0 -> cam1)
        T0 = self.scene_info['poses'][idx0]
        T1 = self.scene_info['poses'][idx1]
        T_0to1 = torch.tensor(np.matmul(T1, np.linalg.inv(T0)), dtype=torch.float)[:4, :4]  # (4, 4)

        data = {
            'images': images,
            'scales': scales,  # (2, 2): [scale_w, scale_h]
            'rotation': T_0to1[:3, :3],
            'translation': T_0to1[:3, 3],
            'intrinsics': intrinsics,
            'pair_names': (self.scene_info['image_paths'][idx0], self.scene_info['image_paths'][idx1]),
            'depth_pair_names': (self.scene_info['depth_paths'][idx0], self.scene_info['depth_paths'][idx1]),
        }

        return data
92
+
93
+
94
def build_concat_megadepth(mode, config):
    """Build a ConcatDataset over all MegaDepth scenes listed for `mode`."""
    if mode not in ('train', 'val', 'test'):
        raise NotImplementedError(f'mode {mode}')
    # Select DATASET.TRAIN / DATASET.VAL / DATASET.TEST by name.
    split_cfg = getattr(config.DATASET, mode.upper())

    with open(split_cfg.LIST_PATH, 'r') as f:
        scene_names = [line.split()[0] for line in f.readlines()]

    scene_files = [f'{name}.npz' for name in scene_names]
    datasets = []
    for scene_file in tqdm(scene_files, desc=f'Loading MegaDepth {mode} datasets',):
        scene_path = osp.join(split_cfg.NPZ_ROOT, scene_file)
        datasets.append(MegaDepthDataset(
            split_cfg.DATA_ROOT,
            scene_path,
            mode=mode,
            min_overlap_score=split_cfg.MIN_OVERLAP_SCORE,
        ))

    return ConcatDataset(datasets)
datasets/sampler.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import Sampler, ConcatDataset
3
+
4
+
5
class RandomConcatSampler(Sampler):
    """Random sampler for a ConcatDataset.

    At each epoch, `n_samples_per_subset` indices are drawn from every subset of
    the ConcatDataset. If `subset_replacement` is True, sampling within each
    subset is done with replacement; otherwise a random permutation of the
    subset is used (padded with replacement when the subset is smaller than
    `n_samples_per_subset`).

    Sampling without replacement *between* epochs is not supported unless a
    stateful sampler lives along the entire training phase.

    Args:
        data_source: the ConcatDataset to sample from.
        n_samples_per_subset: number of indices drawn from each subset per epoch.
        subset_replacement: sample with replacement inside each subset.
        shuffle (bool): shuffle the sampled indices across all sub-datasets.
        repeat (int): repeatedly use the sampled indices multiple times for training.
            [arXiv:1902.05509, arXiv:1901.09335]
        seed: optional seed for this sampler's private RNG.

    NOTE: Don't re-initialize the sampler between epochs (will lead to repeated samples).
    NOTE: This sampler behaves differently from DistributedSampler:
        it assumes the dataset is split across ranks instead of replicated.
    TODO: Add a `set_epoch()` method to fulfil sampling without replacement across epochs.
    ref: https://github.com/PyTorchLightning/pytorch-lightning/blob/e9846dd758cfb1500eb9dba2d86f6912eb487587/pytorch_lightning/trainer/training_loop.py#L373
    """
    def __init__(self,
                 data_source: ConcatDataset,
                 n_samples_per_subset: int,
                 subset_replacement: bool = True,
                 shuffle: bool = True,
                 repeat: int = 1,
                 seed: int = None):
        if not isinstance(data_source, ConcatDataset):
            raise TypeError("data_source should be torch.utils.data.ConcatDataset")

        self.data_source = data_source
        self.n_subset = len(self.data_source.datasets)
        self.n_samples_per_subset = n_samples_per_subset
        self.n_samples = self.n_subset * self.n_samples_per_subset * repeat
        self.subset_replacement = subset_replacement
        self.repeat = repeat
        self.shuffle = shuffle
        # BUGFIX: the original `torch.manual_seed(seed)` re-seeded the *global*
        # RNG as a side effect and raised TypeError when `seed` was None (the
        # declared default). Use a private generator instead.
        self.generator = torch.Generator()
        if seed is not None:
            self.generator.manual_seed(seed)
        assert self.repeat >= 1

    def __len__(self):
        return self.n_samples

    def __iter__(self):
        indices = []
        # draw `n_samples_per_subset` indices from each sub-dataset
        for d_idx in range(self.n_subset):
            low = 0 if d_idx == 0 else self.data_source.cumulative_sizes[d_idx - 1]
            high = self.data_source.cumulative_sizes[d_idx]
            if self.subset_replacement:
                rand_tensor = torch.randint(low, high, (self.n_samples_per_subset, ),
                                            generator=self.generator, dtype=torch.int64)
            else:  # sample without replacement
                len_subset = len(self.data_source.datasets[d_idx])
                rand_tensor = torch.randperm(len_subset, generator=self.generator) + low
                if len_subset >= self.n_samples_per_subset:
                    rand_tensor = rand_tensor[:self.n_samples_per_subset]
                else:  # padding with replacement
                    rand_tensor_replacement = torch.randint(low, high, (self.n_samples_per_subset - len_subset, ),
                                                            generator=self.generator, dtype=torch.int64)
                    rand_tensor = torch.cat([rand_tensor, rand_tensor_replacement])
            indices.append(rand_tensor)
        indices = torch.cat(indices)
        if self.shuffle:  # shuffle the sampled dataset (from multiple subsets)
            rand_tensor = torch.randperm(len(indices), generator=self.generator)
            indices = indices[rand_tensor]

        # repeat the sampled indices (can be used for RepeatAugmentation or pure RepeatSampling)
        if self.repeat > 1:
            repeat_indices = [indices.clone() for _ in range(self.repeat - 1)]
            if self.shuffle:
                _choice = lambda x: x[torch.randperm(len(x), generator=self.generator)]
                repeat_indices = map(_choice, repeat_indices)
            indices = torch.cat([indices, *repeat_indices], 0)

        assert indices.shape[0] == self.n_samples
        return iter(indices.tolist())
datasets/scannet.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os import path as osp
2
+ import numpy as np
3
+ from numpy.linalg import inv
4
+ import cv2
5
+ from tqdm import tqdm
6
+
7
+ import torch
8
+ from torch.utils.data import Dataset, ConcatDataset
9
+
10
+ from utils import Augmentor
11
+
12
+
13
def read_scannet_pose(path):
    """Load a ScanNet camera-to-world pose file and return its world-to-camera inverse.

    Args:
        path (str): text file holding a space-delimited 4x4 camera-to-world matrix.

    Returns:
        pose_w2c (np.ndarray): (4, 4) world-to-camera transform.
    """
    pose_c2w = np.loadtxt(path, delimiter=' ')
    return inv(pose_c2w)
22
+
23
+
24
class ScanNetDataset(Dataset):
    """One ScanNet scene: yields RGB image pairs with intrinsics and relative pose."""

    def __init__(self,
                 root_dir,
                 npz_path,
                 intrinsic_path,
                 mode='train',
                 min_overlap_score=0.4,
                 pose_dir=None,
                 ):
        """Manage one scene of ScanNet Dataset.
        Args:
            root_dir (str): ScanNet root directory that contains scene folders.
            npz_path (str): {scene_id}.npz path. This contains image pair information of a scene.
            intrinsic_path (str): path to depth-camera intrinsic file.
            mode (str): options are ['train', 'val', 'test'].
            min_overlap_score (float): training pairs whose covisibility score is
                not above this threshold are discarded.
            pose_dir (str): ScanNet root directory that contains all poses.
                (we use a separate (optional) pose_dir since we store images and poses separately.)
        """
        super().__init__()
        self.root_dir = root_dir
        self.pose_dir = pose_dir if pose_dir is not None else root_dir
        self.mode = mode

        # prepare data_names, intrinsics and extrinsics(T)
        with np.load(npz_path) as data:
            self.data_names = data['name']
            # BUGFIX: the original condition was `mode not in ['val' or 'test']`,
            # which evaluates to `['val']` and therefore wrongly applied the
            # overlap filter to the test split as well.
            if 'score' in data.keys() and mode not in ('val', 'test'):
                kept_mask = data['score'] > min_overlap_score
                self.data_names = self.data_names[kept_mask]
        self.intrinsics = dict(np.load(intrinsic_path))
        self.augment = Augmentor(mode=='train')

    def __len__(self):
        return len(self.data_names)

    def _read_abs_pose(self, scene_name, name):
        # poses live under <pose_dir>/<scene>/pose/<frame>.txt
        pth = osp.join(self.pose_dir,
                       scene_name,
                       'pose', f'{name}.txt')
        return read_scannet_pose(pth)

    def _compute_rel_pose(self, scene_name, name0, name1):
        """Relative world2cam transform taking frame `name0` into frame `name1`."""
        pose0 = self._read_abs_pose(scene_name, name0)
        pose1 = self._read_abs_pose(scene_name, name1)

        return np.matmul(pose1, inv(pose0))  # (4, 4)

    def __getitem__(self, idx):
        """Return a dict with stacked RGB images, intrinsics, relative pose and pair names."""
        data_name = self.data_names[idx]
        scene_name, scene_sub_name, stem_name_0, stem_name_1 = data_name
        scene_name = f'scene{scene_name:04d}_{scene_sub_name:02d}'

        img_name0 = osp.join(self.root_dir, scene_name, 'color', f'{stem_name_0}.jpg')
        img_name1 = osp.join(self.root_dir, scene_name, 'color', f'{stem_name_1}.jpg')

        # ScanNet color frames are resized to the depth-camera resolution
        w_new, h_new = 640, 480

        image0 = cv2.imread(img_name0)
        image0 = cv2.resize(image0, (w_new, h_new))
        image0 = cv2.cvtColor(image0, cv2.COLOR_BGR2RGB)
        # image0 = self.augment(image0)
        image0 = torch.from_numpy(image0).permute(2, 0, 1).float() / 255.

        image1 = cv2.imread(img_name1)
        image1 = cv2.resize(image1, (w_new, h_new))
        image1 = cv2.cvtColor(image1, cv2.COLOR_BGR2RGB)
        # image1 = self.augment(image1)
        image1 = torch.from_numpy(image1).permute(2, 0, 1).float() / 255.
        images = torch.stack([image0, image1], dim=0)

        # one shared intrinsic matrix per scene; duplicated for both views
        K_0 = K_1 = torch.tensor(self.intrinsics[scene_name].copy(), dtype=torch.float).reshape(3, 3)
        intrinsics = torch.stack([K_0, K_1], dim=0)

        # read and compute relative poses
        T_0to1 = torch.tensor(self._compute_rel_pose(scene_name, stem_name_0, stem_name_1),
                              dtype=torch.float32)

        data = {
            'images': images,
            'rotation': T_0to1[:3, :3],
            'translation': T_0to1[:3, 3],
            'intrinsics': intrinsics,
            'pair_names': (osp.join(scene_name, 'color', f'{stem_name_0}.jpg'),
                           osp.join(scene_name, 'color', f'{stem_name_1}.jpg'))
        }

        return data
121
+
122
+
123
def build_concat_scannet(mode, config):
    """Concatenate per-scene ScanNet datasets listed for the given split.

    Args:
        mode (str): 'train', 'val' or 'test'; selects the split sub-config.
        config: yacs-style config exposing DATASET.{TRAIN,VAL,TEST} with
            DATA_ROOT / NPZ_ROOT / LIST_PATH / INTRINSIC_PATH / MIN_OVERLAP_SCORE.

    Returns:
        torch.utils.data.ConcatDataset over one ScanNetDataset per scene npz.
    """
    if mode == 'train':
        split_cfg = config.DATASET.TRAIN
    elif mode == 'val':
        split_cfg = config.DATASET.VAL
    elif mode == 'test':
        split_cfg = config.DATASET.TEST
    else:
        raise NotImplementedError(f'mode {mode}')

    data_root = split_cfg.DATA_ROOT
    npz_root = split_cfg.NPZ_ROOT
    intrinsic_path = split_cfg.INTRINSIC_PATH
    min_overlap_score = split_cfg.MIN_OVERLAP_SCORE

    # one npz file name per line; only the first whitespace-separated token matters
    with open(split_cfg.LIST_PATH, 'r') as f:
        npz_names = [line.split()[0] for line in f.readlines()]

    scene_datasets = [
        ScanNetDataset(
            data_root,
            osp.join(npz_root, npz_name),
            intrinsic_path,
            mode=mode,
            min_overlap_score=min_overlap_score,
        )
        for npz_name in tqdm(npz_names, desc=f'Loading ScanNet {mode} datasets',)
    ]

    return ConcatDataset(scene_datasets)
154
+
eval.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from torch.utils.data import DataLoader
3
+ import lightning as L
4
+
5
+ from datasets import dataset_dict
6
+ from model import PL_RelPose, keypoint_dict
7
+ from configs.default import get_cfg_defaults
8
+
9
+
10
def main(args):
    """Evaluate a trained PL_RelPose checkpoint on the configured test split.

    Loads the yaml config, builds the test DataLoader, restores the checkpoint
    with a freshly configured keypoint extractor, and runs Lightning's test loop.
    """
    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.config)

    build_fn = dataset_dict[cfg.DATASET.TASK][cfg.DATASET.DATA_SOURCE]
    test_dataset = build_fn('test', cfg)
    test_loader = DataLoader(
        test_dataset,
        batch_size=cfg.TRAINER.BATCH_SIZE,
        num_workers=cfg.TRAINER.NUM_WORKERS,
        pin_memory=cfg.TRAINER.PIN_MEMORY,
    )

    model = PL_RelPose.load_from_checkpoint(args.ckpt_path)
    # rebuild the extractor so the test-time keypoint budget takes effect
    model.extractor = keypoint_dict[model.hparams['features']](
        max_num_keypoints=cfg.MODEL.TEST_NUM_KEYPOINTS,
        detection_threshold=0.0,
    ).eval()

    trainer = L.Trainer(devices=[0])
    trainer.test(model, dataloaders=test_loader)
35
+
36
+
37
def get_parser():
    """Build the CLI parser: positional config path and checkpoint path."""
    parser = argparse.ArgumentParser()
    for name, kwargs in (
        ('config', dict(type=str, help='.yaml configure file path')),
        ('ckpt_path', dict(type=str)),
    ):
        parser.add_argument(name, **kwargs)
    return parser
43
+
44
+
45
if __name__ == "__main__":
    # CLI entry point: parse arguments and launch evaluation.
    main(get_parser().parse_args())
eval_add_reproj.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import numpy as np
3
+ import torch
4
+ from collections import defaultdict
5
+ from tqdm import tqdm
6
+ from transforms3d.quaternions import mat2quat
7
+ import pandas as pd
8
+
9
+ from model import PL_RelPose, keypoint_dict
10
+ from utils.reproject import reprojection_error, Pose, save_submission
11
+ from utils.metrics import reproj, add, adi, compute_continuous_auc, relative_pose_error, rotation_angular_error
12
+ from datasets import dataset_dict
13
+ from configs.default import get_cfg_defaults
14
+
15
+
16
@torch.no_grad()
def main(args):
    """Evaluate a PL_RelPose checkpoint and report timing plus pose metrics.

    Depending on the configured dataset/task it additionally reports object
    ADD / ADD-S AUC and Map-free VCRE metrics; for 'mapfree' a submission zip
    is written to assets/new_submission.zip.
    """
    config = get_cfg_defaults()
    config.merge_from_file(args.config)

    task = config.DATASET.TASK
    dataset = config.DATASET.DATA_SOURCE
    device = args.device

    # BUGFIX: was a duplicated `test_num_keypoints = test_num_keypoints = ...`
    test_num_keypoints = config.MODEL.TEST_NUM_KEYPOINTS

    build_fn = dataset_dict[task][dataset]
    testset = build_fn('test', config)
    testloader = torch.utils.data.DataLoader(testset, batch_size=1)

    pl_relpose = PL_RelPose.load_from_checkpoint(args.ckpt_path)
    # rebuild the extractor so the test-time keypoint budget takes effect
    pl_relpose.extractor = keypoint_dict[pl_relpose.hparams['features']](max_num_keypoints=test_num_keypoints, detection_threshold=0.0).eval().to(device)
    pl_relpose.module = pl_relpose.module.eval().to(device)

    preprocess_times, extract_times, regress_times = [], [], []
    adds, adis = [], []
    repr_errs = []
    R_errs, t_errs = [], []
    ts_errs = []
    results_dict = defaultdict(list)
    for i, data in enumerate(tqdm(testloader)):
        # optionally restrict HO3D evaluation to a single object
        if dataset == 'ho3d' and args.obj_name is not None and data['objName'][0] != args.obj_name:
            continue
        image0, image1 = data['images'][0]
        K0, K1 = data['intrinsics'][0]
        # ground-truth relative pose as a 4x4 matrix
        T = torch.eye(4)
        T[:3, :3] = data['rotation'][0]
        T[:3, 3] = data['translation'][0]
        T = T.numpy()

        R_est, t_est, preprocess_time, extract_time, regress_time = pl_relpose.predict_one_data(data)
        preprocess_times.append(preprocess_time)
        extract_times.append(extract_time)
        regress_times.append(regress_time)

        t_err, R_err = relative_pose_error(T, R_est.cpu().numpy(), t_est.cpu().numpy(), ignore_gt_t_thr=0.0)

        R_errs.append(R_err)
        t_errs.append(t_err)

        # metric (Euclidean) translation error in meters
        ts_errs.append(torch.tensor(T[:3, 3] - t_est.cpu().numpy()).norm(2))

        if dataset == 'mapfree':
            repr_err = reprojection_error(R_est.cpu().numpy(), t_est.cpu().numpy(), T[:3, :3], T[:3, 3], K=K1, W=image1.shape[-1], H=image1.shape[-2])
            repr_errs.append(repr_err)
            R = R_est.detach().cpu().numpy()
            t = t_est.reshape(-1).detach().cpu().numpy()
            scene = data['scene_id'][0]
            estimated_pose = Pose(
                image_name=data['pair_names'][1][0],
                q=mat2quat(R).reshape(-1),
                t=t.reshape(-1),
                inliers=0
            )
            results_dict[scene].append(estimated_pose)

        if 'point_cloud' in data:
            adds.append(add(R_est.cpu().numpy(), t_est.cpu().numpy(), T[:3, :3], T[:3, 3], data['point_cloud'][0].numpy()))
            adis.append(adi(R_est.cpu().numpy(), t_est.cpu().numpy(), T[:3, :3], T[:3, 3], data['point_cloud'][0].numpy()))

    metrics = []
    values = []

    # convert seconds -> milliseconds
    preprocess_times = np.array(preprocess_times) * 1000
    extract_times = np.array(extract_times) * 1000
    regress_times = np.array(regress_times) * 1000

    metrics.append('Extracting Time (ms)')
    values.append(f'{np.mean(extract_times):.1f}')

    metrics.append('Recovering Time (ms)')
    values.append(f'{np.mean(regress_times):.1f}')

    metrics.append('Total Time (ms)')
    values.append(f'{np.mean(extract_times) + np.mean(regress_times):.1f}')

    if task == 'object':
        metrics.append('Object ADD')
        values.append(f'{compute_continuous_auc(adds, np.linspace(0.0, 0.1, 1000)) * 100:.1f}')

        metrics.append('Object ADD-S')
        values.append(f'{compute_continuous_auc(adis, np.linspace(0.0, 0.1, 1000)) * 100:.1f}')

    if dataset == 'mapfree':
        re = np.array(repr_errs)

        metrics.append('VCRE @90px Prec.')
        values.append(f'{(re < 90).mean() * 100:.2f}')

        metrics.append('VCRE Med.')
        values.append(f'{np.median(re):.2f}')

        save_submission(results_dict, 'assets/new_submission.zip')

    res = pd.DataFrame({'Metrics': metrics, 'Values': values})
    print(res)
122
+
123
+
124
def get_parser():
    """CLI parser: config + checkpoint positionals, plus device and HO3D object filter."""
    parser = argparse.ArgumentParser()
    parser.add_argument('config', type=str, help='.yaml configure file path')
    parser.add_argument('ckpt_path', type=str)
    for flag, kwargs in (
        ('--device', dict(type=str, default='cuda:0')),
        ('--obj_name', dict(type=str, default=None)),
    ):
        parser.add_argument(flag, **kwargs)
    return parser
133
+
134
+
135
if __name__ == "__main__":
    # CLI entry point: parse arguments and launch evaluation.
    main(get_parser().parse_args())
eval_baselines.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import argparse
3
+ from tqdm import tqdm
4
+ import torch
5
+ import pandas as pd
6
+
7
+ # from lightglue.utils import load_image
8
+ from configs.default import get_cfg_defaults
9
+ from datasets import dataset_dict
10
+ from baselines.pose import PoseRecover
11
+ from utils.metrics import relative_pose_error, rotation_angular_error, error_auc, add, adi, compute_continuous_auc
12
+
13
+
14
def main(args):
    """Evaluate a matcher + pose-solver baseline over the configured test split.

    Reports per-stage timing, pose AUC, rotation statistics and — when depth
    or object ground truth is available — metric translation accuracy and
    object ADD / ADD-S AUC.
    """
    config = get_cfg_defaults()
    config.merge_from_file(args.config)

    task = config.DATASET.TASK
    dataset = config.DATASET.DATA_SOURCE

    build_fn = dataset_dict[task][dataset]
    testset = build_fn('test', config)
    testloader = torch.utils.data.DataLoader(testset, batch_size=1)

    device = args.device
    img_resize = args.resize
    poseRec = PoseRecover(matcher=args.matcher, solver=args.solver, img_resize=img_resize, device=device)

    preprocess_times, extract_times, match_times, recover_times = [], [], [], []
    R_errs, t_errs = [], []
    ts_errs = []
    adds, adis = [], []
    for i, data in enumerate(tqdm(testloader)):
        # optionally restrict HO3D evaluation to a single object
        if dataset == 'ho3d' and args.obj_name is not None and data['objName'][0] != args.obj_name:
            continue

        image0, image1 = data['images'][0].to(device)

        # object task: crop both views to their ground-truth bounding boxes
        bbox0, bbox1 = None, None
        if task == 'object':
            bbox0, bbox1 = data['bboxes'][0]
            x1, y1, x2, y2 = bbox0
            u1, v1, u2, v2 = bbox1
            image0 = image0[:, y1:y2, x1:x2]
            image1 = image1[:, v1:v2, u1:u2]

        mask0, mask1 = None, None
        if args.mask:
            mask0, mask1 = data['masks'][0].to(device)

        depth0, depth1 = None, None
        if args.depth:
            depth0, depth1 = data['depths'][0]

        K0, K1 = data['intrinsics'][0]
        # ground-truth relative pose as a 4x4 matrix
        T = torch.eye(4)
        T[:3, :3] = data['rotation'][0]
        T[:3, 3] = data['translation'][0]
        T = T.numpy()

        R, t, points0, points1, preprocess_time, extract_time, match_time, recover_time = poseRec.recover(image0, image1, K0, K1, bbox0, bbox1, mask0, mask1, depth0, depth1)
        preprocess_times.append(preprocess_time)
        extract_times.append(extract_time)
        match_times.append(match_time)
        recover_times.append(recover_time)

        # a failed recovery (NaNs) counts as the worst-case error
        if np.isnan(R).any():
            R_err = 180
            R = np.identity(3)
            t_err = 180
            t = np.array([0., 0., 0.])
        else:
            t_err, R_err = relative_pose_error(T, R, t, ignore_gt_t_thr=0.0)

        R_errs.append(R_err)
        t_errs.append(t_err)

        if args.depth:
            t = np.nan_to_num(t)
            ts_errs.append(torch.tensor(T[:3, 3] - t).norm(2))

        if task == 'object':
            if np.isnan(R).any():
                adds.append(1.)
                adis.append(1.)
            else:
                adds.append(add(R, t, T[:3, :3], T[:3, 3], data['point_cloud'][0].numpy()))
                adis.append(adi(R, t, T[:3, :3], T[:3, 3], data['point_cloud'][0].numpy()))

    metrics = []
    values = []

    # convert seconds -> milliseconds
    # BUGFIX: the original converted the last-iteration scalars
    # (`preprocess_time`, `extract_time`, `recover_time`) instead of the
    # accumulated lists, so the reported means covered a single sample.
    preprocess_times = np.array(preprocess_times) * 1000
    extract_times = np.array(extract_times) * 1000
    match_times = np.array(match_times) * 1000
    recover_times = np.array(recover_times) * 1000

    metrics.append('Extracting Time (ms)')
    values.append(f'{np.mean(extract_times):.1f}')

    metrics.append('Matching Time (ms)')
    values.append(f'{np.mean(match_times):.1f}')

    metrics.append('Recovering Time (ms)')
    values.append(f'{np.mean(recover_times):.1f}')

    metrics.append('Total Time (ms)')
    values.append(f'{np.mean(extract_times) + np.mean(match_times) + np.mean(recover_times):.1f}')

    # pose AUC over the max of rotation and translation-direction error
    angular_thresholds = [5, 10, 20]
    pose_errors = np.max(np.stack([R_errs, t_errs]), axis=0)
    aucs = error_auc(pose_errors, angular_thresholds, mode='Pose estimation')  # (auc@5, auc@10, auc@20)
    for k in aucs:
        metrics.append(k)
        values.append(f'{aucs[k] * 100:.2f}')

    R_errs = torch.tensor(R_errs)
    t_errs = torch.tensor(t_errs)

    metrics.append('Rotation Avg. Error (°)')
    values.append(f'{R_errs.mean():.2f}')

    metrics.append('Rotation Med. Error (°)')
    values.append(f'{R_errs.median():.2f}')

    metrics.append('Rotation @30° ACC')
    values.append(f'{(R_errs < 30).float().mean() * 100:.1f}')

    metrics.append('Rotation @15° ACC')
    values.append(f'{(R_errs < 15).float().mean() * 100:.1f}')

    if args.depth:
        ts_errs = torch.tensor(ts_errs)

        metrics.append('Translation Avg. Error (m)')
        values.append(f'{ts_errs.mean():.4f}')

        metrics.append('Translation Med. Error (m)')
        values.append(f'{ts_errs.median():.4f}')

        metrics.append('Translation @1m ACC')
        values.append(f'{(ts_errs < 1.0).float().mean() * 100:.1f}')

        metrics.append('Translation @10cm ACC')
        values.append(f'{(ts_errs < 0.1).float().mean() * 100:.1f}')

    if task == 'object':
        metrics.append('Object ADD')
        values.append(f'{compute_continuous_auc(adds, np.linspace(0.0, 0.1, 1000)) * 100:.1f}')

        metrics.append('Object ADD-S')
        values.append(f'{compute_continuous_auc(adis, np.linspace(0.0, 0.1, 1000)) * 100:.1f}')

    res = pd.DataFrame({'Metrics': metrics, 'Values': values})
    print(res)
167
+
168
+
169
def get_parser():
    """Build the CLI parser for baseline evaluation.

    Positionals: config (.yaml path) and matcher name. Options select the
    pose solver, optional resize / depth / mask inputs, an HO3D object
    filter, and the compute device.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('config', type=str, help='.yaml configure file path')
    parser.add_argument('matcher', type=str)
    for flag, kwargs in (
        ('--solver', dict(type=str, default='procrustes')),
        ('--resize', dict(type=int, default=None)),
        ('--depth', dict(action='store_true')),
        ('--mask', dict(action='store_true')),
        ('--obj_name', dict(type=str, default=None)),
        ('--device', dict(type=str, default='cuda:0')),
    ):
        parser.add_argument(flag, **kwargs)
    return parser
184
+
185
+
186
if __name__ == "__main__":
    # CLI entry point: parse arguments and launch evaluation.
    main(get_parser().parse_args())
model/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import torch
2
+
3
+ from .relpose import RelPose
4
+ from .pl_trainer import PL_RelPose, keypoint_dict
model/pl_trainer.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+ import numpy as np
3
+ import torch
4
+ import lightning as L
5
+ from lightglue import SuperPoint, DISK, SIFT, ALIKED
6
+ import time
7
+
8
+ from utils import rotation_angular_error, translation_angular_error, error_auc
9
+ from .relpose import RelPose
10
+
11
+
12
# Maps the `features` hyper-parameter string to the corresponding LightGlue
# keypoint-extractor class used to build the (frozen) feature extractor.
keypoint_dict = {
    'superpoint': SuperPoint,
    'disk': DISK,
    'sift': SIFT,
    'aliked': ALIKED,
}
18
+
19
+
20
class PL_RelPose(L.LightningModule):
    """Lightning wrapper around RelPose: a frozen keypoint extractor feeding a
    trainable relative-pose regressor, trained with a combined rotation /
    translation loss and reporting pose-error metrics at each epoch end."""

    def __init__(
        self,
        task,
        lr,
        epochs,
        pct_start,
        num_keypoints,
        n_layers,
        num_heads,
        features='superpoint',
    ):
        """Build extractor + RelPose regressor and the per-split error buffers.

        Args:
            task: 'scene' or 'object' — selects which extra inputs (bboxes)
                are forwarded to the regressor.
            lr: peak learning rate for AdamW / OneCycleLR.
            epochs: total training epochs (drives the OneCycle schedule).
            pct_start: OneCycleLR warm-up fraction.
            num_keypoints: max keypoints extracted per image during training.
            n_layers, num_heads: RelPose transformer depth / attention heads.
            features: key into `keypoint_dict` selecting the extractor type.
        """
        super().__init__()

        # Extractor is kept in eval mode and only ever run under no_grad (frozen).
        self.extractor = keypoint_dict[features](max_num_keypoints=num_keypoints, detection_threshold=0.0).eval()
        self.module = RelPose(features=features, task=task, n_layers=n_layers, num_heads=num_heads)
        self.criterion = torch.nn.HuberLoss()

        # Learnable log-variance weights; the weighted loss is currently
        # commented out in _shared_forward_step, so these are unused there.
        self.s_r = torch.nn.Parameter(torch.zeros(1))
        # self.s_ta = torch.nn.Parameter(torch. zeros(1))
        self.s_t = torch.nn.Parameter(torch.zeros(1))

        # Per-split error accumulators, flushed in _shared_on_epoch_end.
        self.r_errors = {k:[] for k in ['train', 'valid', 'test']}
        self.ta_errors = {k:[] for k in ['train', 'valid', 'test']}
        self.t_errors = {k:[] for k in ['train', 'valid', 'test']}

        self.save_hyperparameters()

    def _shared_log(self, mode, loss, loss_r, loss_t, loss_ta, loss_tn):
        """Log the total loss and its components for the given split."""
        self.log_dict({
            f'{mode}_loss/sum': loss,
            f'{mode}_loss/r': loss_r,
            f'{mode}_loss/t': loss_t,
            f'{mode}_loss/ta': loss_ta,
            f'{mode}_loss/tn': loss_tn,
        }, on_epoch=True, sync_dist=True)

    def training_step(self, batch, batch_idx):
        loss, loss_r, loss_ta, loss_t, loss_tn, r_err, ta_err, t_err = self._shared_forward_step(batch, batch_idx)

        self.r_errors['train'].append(r_err)
        self.ta_errors['train'].append(ta_err)
        self.t_errors['train'].append(t_err)

        self._shared_log('train', loss, loss_r, loss_t, loss_ta, loss_tn)

        return loss

    def validation_step(self, batch, batch_idx):
        loss, loss_r, loss_ta, loss_t, loss_tn, r_err, ta_err, t_err = self._shared_forward_step(batch, batch_idx)

        self.r_errors['valid'].append(r_err)
        self.ta_errors['valid'].append(ta_err)
        self.t_errors['valid'].append(t_err)

        self._shared_log('valid', loss, loss_r, loss_t, loss_ta, loss_tn)

    def test_step(self, batch, batch_idx):
        loss, loss_r, loss_ta, loss_t, loss_tn, r_err, ta_err, t_err = self._shared_forward_step(batch, batch_idx)

        self.r_errors['test'].append(r_err)
        self.ta_errors['test'].append(ta_err)
        self.t_errors['test'].append(t_err)

        self._shared_log('test', loss, loss_r, loss_t, loss_ta, loss_tn)

    def _shared_forward_step(self, batch, batch_idx):
        """Extract keypoints, regress the relative pose, and compute losses.

        Returns:
            (loss, loss_r, loss_ta, loss_t, loss_tn, r_err, ta_err, t_err)
            where r_err/ta_err are angular errors (radians) and t_err is the
            per-sample Euclidean translation error.
        """
        images = batch['images']
        rotation = batch['rotation']
        translation = batch['translation']
        intrinsics = batch['intrinsics']

        image0 = images[:, 0, ...]
        image1 = images[:, 1, ...]

        # extractor is frozen: no gradients through keypoint detection
        with torch.no_grad():
            feats0 = self.extractor({'image': image0})
            feats1 = self.extractor({'image': image1})

        # map keypoints from resized-image coordinates back to original pixels
        if 'scales' in batch:
            scales = batch['scales']
            feats0['keypoints'] *= scales[:, 0].unsqueeze(1)
            feats1['keypoints'] *= scales[:, 1].unsqueeze(1)

        if self.hparams.task == 'scene':
            pred_r, pred_t = self.module({'image0': {**feats0, 'intrinsics': intrinsics[:, 0]}, 'image1': {**feats1, 'intrinsics': intrinsics[:, 1]}})
        elif self.hparams.task == 'object':
            # object task additionally conditions on the reference-view bbox
            bboxes = batch['bboxes']
            pred_r, pred_t = self.module({'image0': {**feats0, 'intrinsics': intrinsics[:, 0], 'bbox': bboxes[:, 0]}, 'image1': {**feats1, 'intrinsics': intrinsics[:, 1]}})

        r_err = rotation_angular_error(pred_r, rotation)
        ta_err = translation_angular_error(pred_t, translation)

        # Huber losses: angular errors are regressed toward zero; translation is
        # supervised both as a unit direction (tn) and in metric scale (t).
        loss_r = self.criterion(r_err, torch.zeros_like(r_err))
        loss_ta = self.criterion(ta_err, torch.zeros_like(ta_err))
        loss_tn = self.criterion(pred_t / pred_t.norm(2, dim=-1, keepdim=True), translation / translation.norm(2, dim=-1, keepdim=True))
        loss_t = self.criterion(pred_t, translation)

        # loss = loss_r * torch.exp(-self.s_r) + loss_t * torch.exp(-self.s_t) + loss_ta * torch.exp(-self.s_ta) + self.s_r + self.s_t + self.s_ta
        loss = loss_r + loss_ta + loss_t + loss_tn

        r_err = r_err.detach()
        ta_err = ta_err.detach()
        t_err = (pred_t.detach() - translation).norm(2, dim=1)

        return loss, loss_r, loss_ta, loss_t, loss_tn, r_err, ta_err, t_err

    def predict_one_data(self, data, device='cuda'):
        """Run inference on a single (batched) sample.

        Returns:
            (R, t, preprocess_s, extract_s, regress_s): predicted rotation and
            translation for the first batch element plus wall-clock timings in
            seconds for each stage.
        """
        st_time = time.time()
        images = data['images'].to(device)
        intrinsics = data['intrinsics'].to(device)

        image0 = images[:, 0, ...]
        image1 = images[:, 1, ...]

        preprocess = time.time()

        with torch.no_grad():
            feats0 = self.extractor({'image': image0})
            feats1 = self.extractor({'image': image1})

        extract_time = time.time()

        # map keypoints from resized-image coordinates back to original pixels
        if 'scales' in data:
            scales = data['scales'].to(device)
            feats0['keypoints'] *= scales[:, 0].unsqueeze(1)
            feats1['keypoints'] *= scales[:, 1].unsqueeze(1)

        if self.hparams.task == 'scene':
            pred_r, pred_t = self.module({'image0': {**feats0, 'intrinsics': intrinsics[:, 0]}, 'image1': {**feats1, 'intrinsics': intrinsics[:, 1]}})
        elif self.hparams.task == 'object':
            bboxes = data['bboxes'].to(device)
            pred_r, pred_t = self.module({'image0': {**feats0, 'intrinsics': intrinsics[:, 0], 'bbox': bboxes[:, 0]}, 'image1': {**feats1, 'intrinsics': intrinsics[:, 1]}})

        regress_time = time.time()

        return pred_r[0], pred_t[0], preprocess-st_time, extract_time-preprocess, regress_time-extract_time


    def _shared_on_epoch_end(self, mode):
        """Aggregate the split's accumulated errors into epoch metrics and clear them."""
        r_errors = torch.hstack(self.r_errors[mode]).rad2deg()
        ta_errors = torch.hstack(self.ta_errors[mode]).rad2deg()
        # translation direction is sign-ambiguous: fold angles above 90°
        ta_errors = torch.minimum(ta_errors, 180-ta_errors)

        auc = error_auc(torch.maximum(r_errors, ta_errors).cpu(), [5, 10, 20], mode)
        t_errors = torch.hstack(self.t_errors[mode])

        self.log_dict({
            **auc,
            f'{mode}_Rot./Avg. Error': r_errors.mean(),
            f'{mode}_Rot./Med. Error': r_errors.median(),
            f'{mode}_Rot./@30° ACC': (r_errors < 30).float().mean(),
            f'{mode}_Rot./@15° ACC': (r_errors < 15).float().mean(),
            # f'{mode}_ta/avg': ta_errors.mean(),
            # f'{mode}_ta/med': ta_errors.median(),
            f'{mode}_Trans./Avg. Error': t_errors.mean(),
            f'{mode}_Trans./Med. Error': t_errors.median(),
            f'{mode}_Trans./@10cm ACC': (t_errors < 0.1).float().mean(),
            f'{mode}_Trans./@1m ACC': (t_errors < 1.0).float().mean(),
        }, sync_dist=True)

        self.r_errors[mode].clear()
        self.ta_errors[mode].clear()
        self.t_errors[mode].clear()

    def on_train_epoch_end(self):
        self._shared_on_epoch_end('train')

    def on_validation_epoch_end(self):
        self._shared_on_epoch_end('valid')

    def on_test_epoch_end(self):
        self._shared_on_epoch_end('test')

    def configure_optimizers(self):
        """AdamW over the regressor only (extractor stays frozen) with a per-epoch OneCycle schedule."""
        optimizer = torch.optim.AdamW(self.module.parameters(), lr=self.hparams.lr)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=self.hparams.lr, steps_per_epoch=1, epochs=self.hparams.epochs, pct_start=self.hparams.pct_start)

        return {
            'optimizer': optimizer,
            'lr_scheduler': scheduler
        }
model/relpose.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings
2
+ from types import SimpleNamespace
3
+ from typing import Callable, List, Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+
9
+ from utils import rotation_matrix_from_ortho6d
10
+
11
# flash-attn is optional; degrade gracefully when it is not installed.
try:
    from flash_attn.modules.mha import FlashCrossAttention
except ModuleNotFoundError:
    FlashCrossAttention = None

# Flash-style attention is usable if either flash-attn is installed or torch
# provides F.scaled_dot_product_attention (torch >= 2.0).
if FlashCrossAttention or hasattr(F, "scaled_dot_product_attention"):
    FLASH_AVAILABLE = True
else:
    FLASH_AVAILABLE = False

# Module-level side effects on import: deterministic cuDNN kernels and
# reduced-precision float32 matmul ("medium" allows bfloat16 accumulation).
torch.backends.cudnn.deterministic = True
torch.set_float32_matmul_precision('medium')
23
+
24
+
25
def normalize_keypoints(kpts, intrinsics):
    """Map pixel keypoints to normalized camera coordinates via K^-1.

    Args:
        kpts: (B, M, 2) pixel coordinates.
        intrinsics: (B, 3, 3) camera matrices.

    Returns:
        (B, M, 2) normalized coordinates (homogeneous z dropped).
    """
    batch, num, _ = kpts.shape
    ones = torch.ones((batch, num, 1), device=kpts.device)
    homogeneous = torch.cat([kpts, ones], dim=2)
    unprojected = (intrinsics.inverse() @ homogeneous.mT).mT
    return unprojected[..., :2]
35
+
36
+
37
+ # @torch.cuda.amp.custom_fwd(cast_inputs=torch.float32)
38
def cosine_similarity(x, y):
    """Pairwise cosine similarity between rows of x and y, rescaled to [0, 1].

    Args:
        x: (..., M, D), y: (..., N, D).

    Returns:
        (..., M, N) similarities, 1 = identical direction, 0 = opposite.
    """
    x_unit = x / x.norm(2, -1, keepdim=True)
    y_unit = y / y.norm(2, -1, keepdim=True)
    raw = torch.einsum('...id,...jd->...ij', x_unit, y_unit)
    return (raw + 1) / 2
42
+
43
+
44
def pad_to_length(x: torch.Tensor, length: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Pad `x` (..., N, D) along dim -2 up to `length`, padding with ones.

    Fixed: the return annotation previously declared a one-element
    `Tuple[torch.Tensor]` although two tensors are returned.

    Args:
        x: tensor of shape (..., N, D).
        length: target size of dim -2.

    Returns:
        (padded, mask): `padded` has shape (..., max(N, length), D); `mask` is
        boolean of shape (..., max(N, length), 1), True for real entries and
        False for padding. When N >= length, `x` is returned unchanged with an
        all-True mask.
    """
    if length <= x.shape[-2]:
        return x, torch.ones_like(x[..., :1], dtype=torch.bool)
    # Pad with ones (matches the LightGlue convention for padded descriptors).
    pad = torch.ones(
        *x.shape[:-2], length - x.shape[-2], x.shape[-1], device=x.device, dtype=x.dtype
    )
    y = torch.cat([x, pad], dim=-2)
    mask = torch.zeros(*y.shape[:-1], 1, dtype=torch.bool, device=x.device)
    mask[..., : x.shape[-2], :] = True
    return y, mask
54
+
55
+
56
def gather(x: torch.Tensor, indices: torch.tensor):
    """Batched row selection along dim 1: out[b, i, :] = x[b, indices[b, i], :].

    Args:
        x: (B, M, N) source tensor.
        indices: (B, K) integer row indices per batch element.

    Returns:
        (B, K, N) gathered rows.
    """
    batch, _, cols = x.shape
    batch_idx = torch.arange(batch).reshape(batch, 1, 1)
    col_idx = torch.arange(cols)
    row_idx = indices.unsqueeze(-1)
    return x[batch_idx, row_idx, col_idx]
61
+
62
+
63
class Attention(nn.Module):
    """Scaled dot-product attention with optional fast backends.

    Backend priority at call time: flash-attn / torch SDPA on CUDA when
    enabled, then torch SDPA on any device, then a plain einsum fallback.

    Fixed bug: on the einsum fallback path the attention mask was silently
    ignored, because `masked_fill` is out-of-place and its result was
    discarded.
    """

    def __init__(self, allow_flash: bool = True) -> None:
        super().__init__()
        if allow_flash and not FLASH_AVAILABLE:
            warnings.warn(
                "FlashAttention is not available. For optimal speed, "
                "consider installing torch >= 2.0 or flash-attn.",
                stacklevel=2,
            )
        self.enable_flash = allow_flash and FLASH_AVAILABLE
        self.has_sdp = hasattr(F, "scaled_dot_product_attention")
        if allow_flash and FlashCrossAttention:
            self.flash_ = FlashCrossAttention()
        if self.has_sdp:
            torch.backends.cuda.enable_flash_sdp(allow_flash)

    def forward(self, q, k, v, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Attend over (..., N, D) q/k/v; `mask` is a boolean attention mask
        (True = attend). Fully-masked rows are zeroed via nan_to_num."""
        if self.enable_flash and q.device.type == "cuda":
            # use torch 2.0 scaled_dot_product_attention with flash
            if self.has_sdp:
                args = [x.contiguous() for x in [q, k, v]]
                v = F.scaled_dot_product_attention(*args, attn_mask=mask).to(q.dtype)
                return v if mask is None else v.nan_to_num()
            else:
                assert mask is None
                q, k, v = [x.transpose(-2, -3).contiguous() for x in [q, k, v]]
                m = self.flash_(q, torch.stack([k, v], 2))
                return m.transpose(-2, -3).to(q.dtype).clone()
        elif self.has_sdp:
            args = [x.contiguous() for x in [q, k, v]]
            v = F.scaled_dot_product_attention(*args, attn_mask=mask)
            return v if mask is None else v.nan_to_num()
        else:
            s = q.shape[-1] ** -0.5
            sim = torch.einsum("...id,...jd->...ij", q, k) * s
            if mask is not None:
                # FIX: masked_fill returns a new tensor; the original dropped it.
                sim = sim.masked_fill(~mask, -float("inf"))
            attn = F.softmax(sim, -1)
            return torch.einsum("...ij,...jd->...id", attn, v)
102
+
103
+
104
class SelfBlock(nn.Module):
    """Self-attention block with additive positional encoding on q/k and a
    residual feed-forward update conditioned on [input, attention message]."""

    def __init__(
        self, embed_dim: int, num_heads: int, bias: bool = True
    ) -> None:
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        assert self.embed_dim % num_heads == 0
        self.head_dim = self.embed_dim // num_heads
        # Fused projection producing q, k and v in one linear layer.
        self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias)
        self.inner_attn = Attention()
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.ffn = nn.Sequential(
            nn.Linear(2 * embed_dim, 2 * embed_dim),
            nn.LayerNorm(2 * embed_dim, elementwise_affine=True),
            nn.GELU(),
            nn.Linear(2 * embed_dim, embed_dim),
        )

    def forward(
        self,
        x: torch.Tensor,
        encoding: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """x: (B, N, D); encoding: broadcastable to (B, H, N, D/H); returns (B, N, D)."""
        projected = self.Wqkv(x)
        # (B, N, 3D) -> (B, H, N, D/H, 3)
        per_head = projected.unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2)
        # Positional encoding is added to queries and keys only, not values.
        q = per_head[..., 0] + encoding
        k = per_head[..., 1] + encoding
        v = per_head[..., 2]

        context = self.inner_attn(q, k, v, mask=mask)

        message = self.out_proj(context.transpose(1, 2).flatten(start_dim=-2))
        return x + self.ffn(torch.cat([x, message], -1))
139
+
140
+
141
class CrossBlock(nn.Module):
    """Bidirectional cross-attention between two keypoint sets.

    Attention logits are multiplied elementwise by a precomputed matchability
    prior (`match`, values in [0, 1] from `cosine_similarity`) before the
    softmax, biasing attention toward descriptor pairs that already look alike.
    """

    def __init__(
        self, embed_dim: int, num_heads: int, bias: bool = True
    ) -> None:
        super().__init__()
        self.heads = num_heads
        dim_head = embed_dim // num_heads
        # 1/sqrt(d) scaling, split as sqrt(scale) onto each operand in forward.
        self.scale = dim_head**-0.5
        inner_dim = dim_head * num_heads
        # Shared to_qk: the same projection serves as queries for one image and
        # keys for the other (symmetric cross-attention).
        self.to_qk = nn.Linear(embed_dim, inner_dim, bias=bias)
        self.to_v = nn.Linear(embed_dim, inner_dim, bias=bias)
        self.to_out = nn.Linear(inner_dim, embed_dim, bias=bias)
        self.ffn = nn.Sequential(
            nn.Linear(2 * embed_dim, 2 * embed_dim),
            nn.LayerNorm(2 * embed_dim, elementwise_affine=True),
            nn.GELU(),
            nn.Linear(2 * embed_dim, embed_dim),
        )
        # self.reg_attn = nn.Identity()
        # self.reg_sim = nn.Identity()

    def map_(self, func: Callable, x0: torch.Tensor, x1: torch.Tensor):
        # Apply the same function to both images' tensors.
        return func(x0), func(x1)

    def forward(
        self, x0: torch.Tensor, x1: torch.Tensor, match: torch.Tensor, mask: Optional[torch.Tensor] = None
    ) -> List[torch.Tensor]:
        """Cross-attend x0 <-> x1.

        Args:
            x0: (B, M, D) descriptors of image 0.
            x1: (B, N, D) descriptors of image 1.
            match: (B, M, N) matchability prior in [0, 1].
            mask: optional (B, M, N) boolean validity mask (True = valid pair).

        Returns:
            Updated (x0, x1), same shapes as the inputs.
        """
        qk0, qk1 = self.map_(self.to_qk, x0, x1)
        v0, v1 = self.map_(self.to_v, x0, x1)
        # (B, N, H*Dh) -> (B, H, N, Dh)
        qk0, qk1, v0, v1 = map(
            lambda t: t.unflatten(-1, (self.heads, -1)).transpose(1, 2),
            (qk0, qk1, v0, v1),
        )

        # Distribute the 1/sqrt(d) factor evenly over both operands.
        qk0, qk1 = qk0 * self.scale**0.5, qk1 * self.scale**0.5
        sim = torch.einsum("bhid, bhjd -> bhij", qk0, qk1)
        if mask is not None:
            sim = sim.masked_fill(~mask.unsqueeze(1), -float("inf"))

        assert len(match.shape) == 3
        match = match.unsqueeze(1)  # broadcast the prior over heads
        # Multiplicative bias: the matchability prior rescales the logits.
        sim = sim * match
        # sim = self.reg_attn(sim)
        # match = self.reg_sim(match)

        # Softmax over the *other* image's axis for each direction.
        attn01 = F.softmax(sim, dim=-1)
        attn10 = F.softmax(sim.transpose(-2, -1).contiguous(), dim=-1)
        m0 = torch.einsum("bhij, bhjd -> bhid", attn01, v1)
        # attn10 is (B, H, N, M); transposing back makes this equivalent to
        # einsum("bhij, bhjd -> bhid", attn10, v0).
        m1 = torch.einsum("bhji, bhjd -> bhid", attn10.transpose(-2, -1), v0)
        if mask is not None:
            # Rows with no valid partner yield NaN after the -inf softmax; zero them.
            m0, m1 = m0.nan_to_num(), m1.nan_to_num()

        # (B, H, N, Dh) -> (B, N, H*Dh), project back, then residual FFN update.
        m0, m1 = self.map_(lambda t: t.transpose(1, 2).flatten(start_dim=-2), m0, m1)
        m0, m1 = self.map_(self.to_out, m0, m1)
        x0 = x0 + self.ffn(torch.cat([x0, m0], -1))
        x1 = x1 + self.ffn(torch.cat([x1, m1], -1))

        return x0, x1
200
+
201
+
202
class TransformerLayer(nn.Module):
    """One interaction layer: per-image self-attention followed by
    matchability-biased cross-attention between the two images."""

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.self_attn = SelfBlock(*args, **kwargs)
        self.cross_attn = CrossBlock(*args, **kwargs)

    def forward(
        self,
        desc0,
        desc1,
        encoding0,
        encoding1,
        match,
        mask0: Optional[torch.Tensor] = None,
        mask1: Optional[torch.Tensor] = None,
    ):
        # Without both validity masks, run the plain (unpadded) path.
        if mask0 is None or mask1 is None:
            refined0 = self.self_attn(desc0, encoding0)
            refined1 = self.self_attn(desc1, encoding1)
            return self.cross_attn(refined0, refined1, match)
        return self.masked_forward(desc0, desc1, encoding0, encoding1, match, mask0, mask1)

    # This part is compiled and allows padding inputs
    def masked_forward(self, desc0, desc1, encoding0, encoding1, match, mask0, mask1):
        # Build pairwise masks from the (B, N, 1) per-point validity masks.
        cross_mask = mask0 & mask1.transpose(-1, -2)
        self_mask0 = mask0 & mask0.transpose(-1, -2)
        self_mask1 = mask1 & mask1.transpose(-1, -2)
        refined0 = self.self_attn(desc0, encoding0, self_mask0)
        refined1 = self.self_attn(desc1, encoding1, self_mask1)
        return self.cross_attn(refined0, refined1, match, cross_mask)
233
+
234
+
235
class RelPose(nn.Module):
    """Direct relative-pose regression from two images' local features.

    Given LightGlue-style input dicts (keypoints + descriptors per image), the
    network refines the descriptors with stacked self-/cross-attention layers
    biased by a cosine-similarity matchability prior, pools them into a global
    pair feature, and regresses a rotation (6D ortho representation -> 3x3
    matrix via `rotation_matrix_from_ortho6d`) and a translation vector.
    """

    default_conf = {
        "name": "RelPose",  # just for interfacing
        "input_dim": 256,  # input descriptor dimension (autoselected from weights)
        "descriptor_dim": 256,  # transformer working width
        "add_scale_ori": False,  # append keypoint scale/orientation to positions
        "n_layers": 3,  # number of TransformerLayer blocks
        "num_heads": 4,
        "pct_pruning": 0,  # fraction of least-matchable keypoints to drop, in [0, 1)
        "task": "scene",  # "scene" (whole image) or "object" (bbox-prompted)
        "mp": False,  # enable mixed precision
        "weights": None,
    }

    required_data_keys = ["image0", "image1"]

    # Per-extractor overrides applied on top of default_conf in __init__.
    features = {
        "superpoint": {
            "input_dim": 256,
        },
        "disk": {
            "input_dim": 128,
        },
        "aliked": {
            "input_dim": 128,
        },
        "sift": {
            "input_dim": 128,
            # "add_scale_ori": True,
        },
    }

    def __init__(self, features="superpoint", **conf) -> None:
        """Build the model.

        Args:
            features: name of the keypoint extractor whose descriptor size to
                use (key of `self.features`), or None to keep `conf` as given.
            **conf: overrides merged onto `default_conf`.
        """
        super().__init__()
        self.conf = conf = SimpleNamespace(**{**self.default_conf, **conf})
        if features is not None:
            if features not in self.features:
                raise ValueError(
                    f"Unsupported features: {features} not in "
                    f"{{{','.join(self.features)}}}"
                )
            for k, v in self.features[features].items():
                setattr(conf, k, v)

        # Project incoming descriptors to the transformer width if they differ.
        if conf.input_dim != conf.descriptor_dim:
            self.input_proj = nn.Linear(conf.input_dim, conf.descriptor_dim, bias=True)
        else:
            self.input_proj = nn.Identity()

        # Linear positional encoding of normalized 2D coords (+ optional
        # scale/ori) to the per-head dimension, shared across heads.
        head_dim = conf.descriptor_dim // conf.num_heads
        self.posenc = nn.Linear(
            2 + 2 * self.conf.add_scale_ori, head_dim
        )

        h, n, d = conf.num_heads, conf.n_layers, conf.descriptor_dim

        self.transformers = nn.ModuleList(
            [TransformerLayer(d, h) for _ in range(n)]
        )

        # MLP head producing the 6D rotation representation.
        self.rotation_regressor = nn.Sequential(
            nn.Linear(conf.descriptor_dim*2, conf.descriptor_dim),
            nn.ReLU(),
            nn.Linear(conf.descriptor_dim, conf.descriptor_dim//2),
            nn.ReLU(),
            nn.Linear(conf.descriptor_dim//2, 6),
        )

        # MLP head producing the 3D translation.
        self.translation_regressor = nn.Sequential(
            nn.Linear(conf.descriptor_dim*2, conf.descriptor_dim),
            nn.ReLU(),
            nn.Linear(conf.descriptor_dim, conf.descriptor_dim//2),
            nn.ReLU(),
            nn.Linear(conf.descriptor_dim//2, 3),
        )

        # static lengths LightGlue is compiled for (only used with torch.compile)
        self.static_lengths = None

    def compile(
        self, mode="reduce-overhead", static_lengths=[256, 512, 768, 1024, 1280, 1536]
    ):
        """torch.compile the masked per-layer forward for fixed padded lengths.

        After calling this, `_forward` pads inputs up to the nearest static
        length so the compiled graphs can be reused.
        """
        for i in range(self.conf.n_layers):
            self.transformers[i].masked_forward = torch.compile(
                self.transformers[i].masked_forward, mode=mode, fullgraph=True
            )

        self.static_lengths = static_lengths

    def forward(self, data: dict) -> dict:
        """
        Match keypoints and descriptors between two images

        Input (dict):
            image0: dict
                keypoints: [B x M x 2]
                descriptors: [B x M x D]
                intrinsics: [B x 3 x 3]
                bbox: [B x 4] (only read when conf.task == "object")
            image1: dict
                keypoints: [B x N x 2]
                descriptors: [B x N x D]
                intrinsics: [B x 3 x 3]
        Output:
            (R, t): rotation matrices [B x 3 x 3] and translations [B x 3].
        """
        # Optionally run under autocast; no-op when conf.mp is False.
        with torch.autocast(enabled=self.conf.mp, device_type="cuda"):
            return self._forward(data)

    def _forward(self, data: dict) -> dict:
        for key in self.required_data_keys:
            assert key in data, f"Missing key {key} in data"
        data0, data1 = data["image0"], data["image1"]
        kpts0, kpts1 = data0["keypoints"], data1["keypoints"]
        intrinsic0, intrinsic1 = data0["intrinsics"], data1["intrinsics"]
        b, m, _ = kpts0.shape
        b, n, _ = kpts1.shape

        # Optionally append keypoint scale and orientation as extra channels.
        if self.conf.add_scale_ori:
            kpts0 = torch.cat(
                [kpts0] + [data0[k].unsqueeze(-1) for k in ("scales", "oris")], -1
            )
            kpts1 = torch.cat(
                [kpts1] + [data1[k].unsqueeze(-1) for k in ("scales", "oris")], -1
            )
        # Descriptors are treated as fixed inputs: no gradient to the extractor.
        desc0 = data0["descriptors"].detach().contiguous()
        desc1 = data1["descriptors"].detach().contiguous()

        assert desc0.shape[-1] == self.conf.input_dim
        assert desc1.shape[-1] == self.conf.input_dim

        # Pad to a compiled static length when torch.compile is in use.
        mask0, mask1 = None, None
        c = max(m, n)
        do_compile = self.static_lengths and c <= max(self.static_lengths)
        if do_compile:
            kn = min([k for k in self.static_lengths if k >= c])
            desc0, mask0 = pad_to_length(desc0, kn)
            desc1, mask1 = pad_to_length(desc1, kn)
            kpts0, _ = pad_to_length(kpts0, kn)
            kpts1, _ = pad_to_length(kpts1, kn)

        # Matchability prior in [0, 1] from raw descriptor cosine similarity.
        matchability = cosine_similarity(desc0, desc1)

        assert self.conf.pct_pruning >= 0 and self.conf.pct_pruning < 1
        if self.conf.pct_pruning > 0:
            # Drop the least-matchable fraction of keypoints in both images.
            ind0, ind1 = self.get_pruned_indices(matchability, self.conf.pct_pruning)

            matchability = gather(matchability, ind0)
            matchability = gather(matchability.mT, ind1).mT

            desc0 = gather(desc0, ind0)
            desc1 = gather(desc1, ind1)

            kpts0 = gather(kpts0, ind0)
            kpts1 = gather(kpts1, ind1)

        if self.conf.task == "object":
            bbox = data0["bbox"]  # (B, 4)
            ind0, mask0 = self.get_prompted_indices(kpts0, bbox)

            # Slot 0 is zeroed so that invalid (out-of-box) indices, which
            # get_prompted_indices remaps to 0, gather zero entries.
            matchability[:, 0] = torch.zeros_like(matchability[:, 0], device=matchability.device)
            desc0[:, 0] = torch.zeros_like(desc0[:, 0], device=desc0.device)
            kpts0[:, 0] = torch.zeros_like(kpts0[:, 0], device=kpts0.device)

            matchability = gather(matchability, ind0)
            desc0 = gather(desc0, ind0)
            kpts0 = gather(kpts0, ind0)
            # NOTE(review): here mask0 becomes a (B, K) 0/1 long tensor while
            # mask1 stays None (unless padding above ran), so TransformerLayer
            # takes its unmasked path and mask0 is only used for the masked
            # pooling below. If compile-padding and object task are combined,
            # mask0 is overwritten with an incompatible shape — verify.

        desc0 = self.input_proj(desc0)
        desc1 = self.input_proj(desc1)

        # Positional encodings from intrinsics-normalized keypoints; the
        # unsqueeze(-3) adds the head axis for broadcasting over heads.
        kpts0 = normalize_keypoints(kpts0, intrinsic0)
        kpts1 = normalize_keypoints(kpts1, intrinsic1)

        encoding0 = self.posenc(kpts0).unsqueeze(-3)
        encoding1 = self.posenc(kpts1).unsqueeze(-3)

        for i in range(self.conf.n_layers):
            desc0, desc1 = self.transformers[i](
                desc0, desc1, encoding0, encoding1, match=matchability, mask0=mask0, mask1=mask1,
            )

        # Trim compile-padding back to the original keypoint counts (a no-op
        # when pruning/prompting already shrank the tensors below m/n).
        desc0, desc1 = desc0[..., :m, :], desc1[..., :n, :]
        if self.conf.task == 'object':
            # Masked average over in-box keypoints only (clip avoids div-by-0).
            n_kpts0 = mask0.sum(1, keepdim=True)
            n_kpts0 = torch.clip(n_kpts0, min=1)
            desc0 = (desc0 * mask0.unsqueeze(-1)).sum(1) / n_kpts0
            desc1 = desc1.mean(1)
        else:
            desc0, desc1 = desc0.mean(1), desc1.mean(1)

        # Global pair feature -> pose heads.
        feat = torch.cat([desc0, desc1], 1)

        R = self.rotation_regressor(feat)
        R = rotation_matrix_from_ortho6d(R)
        t = self.translation_regressor(feat)

        return R, t

    def get_pruned_indices(self, match, pct_pruning):
        """Return, per image, the indices of keypoints that survive pruning.

        Keypoints are ranked by their mean matchability over the other image;
        the lowest `pct_pruning` fraction is dropped.

        Args:
            match: (B, M, N) matchability matrix.
            pct_pruning: fraction in [0, 1) to remove on each side.

        Returns:
            (indices0, indices1): surviving indices for image 0 and image 1,
            ordered by ascending matchability score.
        """
        matching_scores0 = match.mean(-1)
        matching_scores1 = match.mean(-2)

        num_pruning0 = int(pct_pruning * matching_scores0.size(-1))
        num_pruning1 = int(pct_pruning * matching_scores1.size(-1))

        _, indices0 = matching_scores0.sort()
        _, indices1 = matching_scores1.sort()

        indices0 = indices0[:, num_pruning0:]
        indices1 = indices1[:, num_pruning1:]

        return indices0, indices1

    def get_prompted_indices(self, kpts, bbox):
        """Select keypoints lying inside a per-sample bounding box.

        Args:
            kpts: (B, M, 2) pixel coordinates.
            bbox: (B, 4) boxes as (x_min, y_min, x_max, y_max).

        Returns:
            indices: (B, K) keypoint indices, K = largest in-box count in the
                batch; out-of-box slots are remapped to index 0 (the caller
                zeroes entry 0 beforehand so they gather zeros).
            mask_sorted: (B, K) 0/1 long tensor marking valid slots.
        """
        x, y = kpts[..., 0], kpts[..., 1]
        mask = (x >= bbox[:, 0].unsqueeze(-1)) & (x <= bbox[:, 2].unsqueeze(-1))
        mask &= (y >= bbox[:, 1].unsqueeze(-1)) & (y <= bbox[:, 3].unsqueeze(-1))
        # Sort valid entries to the front, then truncate to the batch maximum.
        mask_sorted, indices = mask.long().sort(descending=True)
        indices *= mask_sorted  # invalid slots -> index 0
        indices = indices[:, :mask_sorted.sum(-1).max()]
        mask_sorted = mask_sorted[:, :mask_sorted.sum(-1).max()]

        return indices, mask_sorted
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ albumentations==1.4.1
2
+ kornia==0.7.1
3
+ open3d==0.18.0
4
+ opencv-python==4.9.0.80
5
+ plyfile==1.0.3
6
+ scikit-learn==1.4.1.post1
7
+ yacs==0.1.8
8
+ lightning==2.2.1
9
+ transforms3d==0.4.1
10
+ pandas==2.1.1
11
+ lightglue @ git+https://github.com/cvg/LightGlue@main
train.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from torch.utils.data import DataLoader
3
+ import lightning as L
4
+ from lightning.pytorch.callbacks import LearningRateMonitor, ModelCheckpoint
5
+
6
+ from datasets import dataset_dict, RandomConcatSampler
7
+ from model import PL_RelPose
8
+ from utils import seed_torch
9
+ from configs.default import get_cfg_defaults
10
+
11
+
12
def main(args):
    """Train PL_RelPose from a YAML config.

    Args:
        args: argparse namespace with `config` (YAML path), optional `resume`
            (trainer checkpoint to continue from) and `weights` (model
            checkpoint to initialize from).
    """
    config = get_cfg_defaults()
    config.merge_from_file(args.config)

    task = config.DATASET.TASK
    dataset = config.DATASET.DATA_SOURCE

    batch_size = config.TRAINER.BATCH_SIZE
    num_workers = config.TRAINER.NUM_WORKERS
    pin_memory = config.TRAINER.PIN_MEMORY
    n_samples_per_subset = config.TRAINER.N_SAMPLES_PER_SUBSET
    lr = config.TRAINER.LEARNING_RATE
    epochs = config.TRAINER.EPOCHS
    pct_start = config.TRAINER.PCT_START

    num_keypoints = config.MODEL.NUM_KEYPOINTS
    n_layers = config.MODEL.N_LAYERS
    num_heads = config.MODEL.NUM_HEADS
    features = config.MODEL.FEATURES

    seed = config.RANDOM_SEED
    seed_torch(seed)

    build_fn = dataset_dict[task][dataset]
    trainset = build_fn('train', config)
    validset = build_fn('val', config)

    # Concat-style datasets are sampled per subset for balanced training.
    if dataset in ('scannet', 'megadepth', 'linemod', 'ho3d', 'mapfree'):
        sampler = RandomConcatSampler(
            trainset,
            n_samples_per_subset=n_samples_per_subset,
            subset_replacement=True,
            shuffle=True,
            seed=seed
        )
        trainloader = DataLoader(trainset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, sampler=sampler)
    else:
        trainloader = DataLoader(trainset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory, shuffle=True)

    validloader = DataLoader(validset, batch_size=batch_size, num_workers=num_workers, pin_memory=pin_memory)

    if args.weights is None:
        pl_relpose = PL_RelPose(
            task=task,
            lr=lr,
            epochs=epochs,
            pct_start=pct_start,
            n_layers=n_layers,
            num_heads=num_heads,
            num_keypoints=num_keypoints,
            features=features,
        )
    else:
        pl_relpose = PL_RelPose.load_from_checkpoint(
            checkpoint_path=args.weights,
            task=task,
            lr=lr,
            epochs=epochs,
            pct_start=pct_start,
            n_layers=n_layers,
            num_heads=num_heads,
            num_keypoints=num_keypoints,
            # FIX: `features` was dropped on this branch, so fine-tuning from
            # weights silently ignored the configured feature type while all
            # other hyperparameters were overridden. Pass it for consistency
            # with the fresh-construction branch above.
            features=features,
        )

    # Checkpointing: keep the latest epoch plus the best valid AUC@20 model.
    lr_monitor = LearningRateMonitor(logging_interval='epoch')
    latest_checkpoint_callback = ModelCheckpoint()
    best_checkpoint_callback = ModelCheckpoint(monitor='valid/auc@20', mode='max')
    trainer = L.Trainer(
        devices=[0],
        max_epochs=epochs,
        callbacks=[lr_monitor, latest_checkpoint_callback, best_checkpoint_callback],
        precision="bf16-mixed",
    )

    trainer.fit(pl_relpose, trainloader, validloader, ckpt_path=args.resume)
92
def get_parser():
    """Build the command-line parser for the training script.

    Positional `config` is the YAML config path; `--resume` continues a
    trainer run and `--weights` initializes from a model checkpoint.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('config', type=str, help='.yaml configure file path')
    for optional in ('--resume', '--weights'):
        parser.add_argument(optional, type=str, default=None)
    return parser
99
+
100
+
101
if __name__ == "__main__":
    # Script entry point: parse CLI args and launch training.
    parser = get_parser()
    args = parser.parse_args()
    main(args)
utils/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import random
4
+ import torch
5
+
6
+ from .metrics import quat_degree_error, rotation_angular_error, translation_angular_error, error_auc
7
+ from .transform import rotation_matrix_from_ortho6d, rotation_matrix_from_quaternion
8
+ from .augment import Augmentor
9
+ # from .visualize import project_3D_points, plot_3D_box
10
+
11
+
12
+ def seed_torch(seed):
13
+ random.seed(seed)
14
+ os.environ['PYTHONHASHSEED'] = str(seed)
15
+ np.random.seed(seed)
16
+ torch.manual_seed(seed)
17
+ torch.cuda.manual_seed(seed)
18
+ torch.backends.cudnn.benchmark = False
19
+ torch.backends.cudnn.deterministic = True
utils/augment.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import albumentations as A
2
+
3
+
4
class Augmentor(object):
    """Photometric augmentation wrapper around an albumentations pipeline.

    The whole pipeline fires with probability 1.0 when `is_training` is True
    and 0.0 otherwise, making the augmentor a no-op at evaluation time.
    """

    def __init__(self, is_training: bool):
        transforms = [
            A.MotionBlur(p=0.25),
            A.ColorJitter(p=0.25),
            A.ImageCompression(p=0.25),
            A.ISONoise(p=0.25),
            A.ToGray(p=0.1),
        ]
        self.augmentor = A.Compose(transforms, p=float(is_training))

    def __call__(self, x):
        # x: HWC uint8 image array; returns the (possibly) augmented image.
        augmented = self.augmentor(image=x)
        return augmented['image']