mystorm committed on Sep 8, 2025

Commit

3bead73

verified ·

1 Parent(s): 8df2bdc

Delete FastVGGT

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

FastVGGT/.gitignore +0 -160
FastVGGT/.vscode/launch.json +0 -85
FastVGGT/README.md +0 -163
FastVGGT/assets/attn_map.png +0 -3
FastVGGT/assets/autolab_logo.png +0 -3
FastVGGT/assets/maclab_logo.png +0 -0
FastVGGT/assets/main.png +0 -3
FastVGGT/assets/vs.png +0 -3
FastVGGT/eval/__pycache__/base.cpython-310.pyc +0 -0
FastVGGT/eval/__pycache__/criterion.cpython-310.pyc +0 -0
FastVGGT/eval/__pycache__/data.cpython-310.pyc +0 -0
FastVGGT/eval/__pycache__/data.cpython-37.pyc +0 -0
FastVGGT/eval/__pycache__/utils.cpython-310.pyc +0 -0
FastVGGT/eval/__pycache__/utils.cpython-37.pyc +0 -0
FastVGGT/eval/base.py +0 -273
FastVGGT/eval/criterion.py +0 -534
FastVGGT/eval/data.py +0 -338
FastVGGT/eval/dataset_utils/__init__.py +0 -1
FastVGGT/eval/dataset_utils/__pycache__/__init__.cpython-310.pyc +0 -0
FastVGGT/eval/dataset_utils/__pycache__/__init__.cpython-37.pyc +0 -0
FastVGGT/eval/dataset_utils/__pycache__/corr.cpython-310.pyc +0 -0
FastVGGT/eval/dataset_utils/__pycache__/cropping.cpython-310.pyc +0 -0
FastVGGT/eval/dataset_utils/__pycache__/cropping.cpython-37.pyc +0 -0
FastVGGT/eval/dataset_utils/__pycache__/transforms.cpython-310.pyc +0 -0
FastVGGT/eval/dataset_utils/corr.py +0 -234
FastVGGT/eval/dataset_utils/cropping.py +0 -140
FastVGGT/eval/dataset_utils/transforms.py +0 -78
FastVGGT/eval/eval_7andN.py +0 -497
FastVGGT/eval/eval_custom.py +0 -467
FastVGGT/eval/eval_scannet.py +0 -208
FastVGGT/eval/utils.py +0 -142
FastVGGT/merging/__init__.py +0 -3
FastVGGT/merging/__pycache__/__init__.cpython-310.pyc +0 -0
FastVGGT/merging/__pycache__/merge.cpython-310.pyc +0 -0
FastVGGT/merging/merge.py +0 -370
FastVGGT/requirements.txt +0 -15
FastVGGT/vggt/__init__.py +0 -5
FastVGGT/vggt/__pycache__/__init__.cpython-310.pyc +0 -0
FastVGGT/vggt/dependency/__init__.py +0 -5
FastVGGT/vggt/dependency/__pycache__/__init__.cpython-310.pyc +0 -0
FastVGGT/vggt/dependency/__pycache__/distortion.cpython-310.pyc +0 -0
FastVGGT/vggt/dependency/distortion.py +0 -54
FastVGGT/vggt/heads/__pycache__/camera_head.cpython-310.pyc +0 -0
FastVGGT/vggt/heads/__pycache__/dpt_head.cpython-310.pyc +0 -0
FastVGGT/vggt/heads/__pycache__/head_act.cpython-310.pyc +0 -0
FastVGGT/vggt/heads/__pycache__/track_head.cpython-310.pyc +0 -0
FastVGGT/vggt/heads/__pycache__/utils.cpython-310.pyc +0 -0
FastVGGT/vggt/heads/camera_head.py +0 -149
FastVGGT/vggt/heads/dpt_head.py +0 -598
FastVGGT/vggt/heads/head_act.py +0 -125

FastVGGT/.gitignore DELETED Viewed

@@ -1,160 +0,0 @@
-.hydra/
-output/
-ckpt/
-.vscode/
-dependency/
-# Byte-compiled / optimized / DLL files
-__pycache__/
-**/__pycache__/
-*.py[cod]
-*$py.class
-test_logs/
-quick_start_logs/
-logs/
-*.pth
-/data/
-*.png
-eval_results/
-.vscode/
-.curosr/
-# C extensions
-*.so
-LightGlue/
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-pip-wheel-metadata/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-# Translations
-*.mo
-*.pot
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-# Flask stuff:
-instance/
-.webassets-cache
-# Scrapy stuff:
-.scrapy
-# Sphinx documentation
-docs/_build/
-# PyBuilder
-target/
-# Jupyter Notebook
-.ipynb_checkpoints
-# IPython
-profile_default/
-ipython_config.py
-# pyenv
-.python-version
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow
-__pypackages__/
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-# SageMath parsed files
-*.sage.py
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-# Spyder project settings
-.spyderproject
-.spyproject
-# Rope project settings
-.ropeproject
-# mkdocs documentation
-/site
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-# Pyre type checker
-.pyre/
-# pytype static type analyzer
-.pytype/
-# Profiling data
-.prof
-# Folder specific to your needs
-**/tmp/
-**/outputs/skyseg.onnx
-skyseg.onnx
-# pixi environments
-.pixi
-*.egg-info

FastVGGT/.vscode/launch.json DELETED Viewed

@@ -1,85 +0,0 @@
-{
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-    "version": "0.2.0",
-    "configurations": [
-        {
-            "name": "launch",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "/home/sy/code/vggt_0625/training/launch.py",
-            "console": "integratedTerminal",
-            "args": "${command:pickArgs}",
-            "env": {
-                "CUDA_VISIBLE_DEVICES": "3",
-            },
-            "cwd": "/home/sy/code/vggt_0625/training",
-            "justMyCode": true,
-            "python": "/home/sy/anaconda3/envs/vggt/bin/python"
-        }
-        ,{
-            "name": "train_scannet",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "/home/sy/code/vggt_0625/training/launch_scannet.py",
-            "console": "integratedTerminal",
-            "args": [
-                // "--config_name", "scannet",
-                // "--exp_name", "scannet_exp001",
-                // "--resume_checkpoint_path", "/home/sy/code/vggt_0625/ckpt/model_tracker_fixed_e20.pt"
-            ],
-            "env": {
-                "CUDA_VISIBLE_DEVICES": "7",
-                "WORLD_SIZE": "1",
-                "RANK": "0",
-                "MASTER_ADDR": "localhost",
-                "MASTER_PORT": "12345"
-            },
-            "cwd": "/home/sy/code/vggt_0625/training",
-            "justMyCode": true,
-            "python": "/home/sy/anaconda3/envs/vggt/bin/python"
-        }
-        ,{
-            "name": "eval_scannet",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "/home/sy/code/FastVGGT/eval/eval_scannet.py",
-            "console": "integratedTerminal",
-            "args": [
-                "--data_dir","/data/sy/scannetv2/process_scannet",
-                "--gt_ply_dir","/data/sy/scannetv2/OpenDataLab___ScanNet_v2/raw/scans",
-                "--output_path", "/home/sy/code/FastVGGT/eval_results",
-                "--merging", "0",
-                "--ckpt_path","/home/sy/code/vggt_0625/ckpt/model_tracker_fixed_e20.pt",
-                "--vis_attn_map"
-            ],
-            "env": {
-                "CUDA_VISIBLE_DEVICES": "2"
-            },
-            "justMyCode": true,
-            "python": "/home/sy/anaconda3/envs/fastvggt/bin/python"
-        },
-        {
-            "name": "eval_cd",
-            "type": "debugpy",
-            "request": "launch",
-            "program": "/home/sy/code/FastVGGT/eval/eval_custom.py",
-            "console": "integratedTerminal",
-            "args": [
-                "--merging", "0",
-                // "--kf","10",
-                // "--output_dir","/home/sy/code/vggt_0625/eval_results_cd",
-                "--data_path","/data/sy/segment-102751/",
-                "--vis_attn_map"
-            ],
-            "env": {
-                "CUDA_VISIBLE_DEVICES": "3"
-            },
-            "justMyCode": true,
-            // "python": "/home/sy/anaconda3/envs/fastvggt/bin/python"
-        }
-    ]
-}

FastVGGT/README.md DELETED Viewed

@@ -1,163 +0,0 @@
-<div align="center">
-<h2>⚡️ FastVGGT: Training-Free Acceleration of Visual Geometry Transformer</h2>
-<p align="center">
-  <a href="https://arxiv.org/abs/2509.02560"><img src="https://img.shields.io/badge/arXiv-FastVGGT-red?logo=arxiv" alt="Paper PDF"></a>
-  <a href="https://mystorm16.github.io/fastvggt/"><img src="https://img.shields.io/badge/Project_Page-FastVGGT-yellow" alt="Project Page"></a>
-</p>
-<img src="assets/maclab_logo.png" alt="Maclab Logo" width="110" style="margin-right: 40px;">
-<img src="assets/autolab_logo.png" alt="Autolab Logo" width="110">
-**[Media Analytics & Computing Laboratory](https://mac.xmu.edu.cn/)**; **[AUTOLAB](https://zhipengzhang.cn/)**
-[You Shen](https://mystorm16.github.io/), [Zhipeng Zhang](https://zhipengzhang.cn/), [Yansong Qu](https://quyans.github.io/), [Liujuan Cao](https://mac.xmu.edu.cn/ljcao/)
-</div>
-## 📰 News
-- [Sep 8, 2025] Added custom dataset evaluation.
-- [Sep 3, 2025] Paper release.
-- [Sep 2, 2025] Code release.
-## 🔭 Overview
-FastVGGT observes **strong similarity** in attention maps and leverages it to design a training-free acceleration method for long-sequence 3D reconstruction, **achieving up to 4× faster inference without sacrificing accuracy.**
-<img src="assets/main.png" alt="FastVGGT overview" width="">
-## ⚙️ Environment Setup
-First, create a virtual environment using Conda, clone this repository to your local machine, and install the required dependencies.
-```bash
-conda create -n fastvggt python=3.10
-conda activate fastvggt
-git clone git@github.com:mystorm16/FastVGGT.git
-cd FastVGGT
-pip install -r requirements.txt
-```
-Next, prepare the ScanNet dataset: http://www.scan-net.org/ScanNet/
-Then, download the VGGT checkpoint (we use the checkpoint link provided in https://github.com/facebookresearch/vggt/tree/evaluation/evaluation):
-```bash
-wget https://huggingface.co/facebook/VGGT_tracker_fixed/resolve/main/model_tracker_fixed_e20.pt
-```
-Finally, configure the dataset path and VGGT checkpoint path. For example:
-```python
-    parser.add_argument(
-        "--data_dir", type=Path, default="/data/scannetv2/process_scannet"
-    )
-    parser.add_argument(
-        "--gt_ply_dir",
-        type=Path,
-        default="/data/scannetv2/OpenDataLab___ScanNet_v2/raw/scans",
-    )
-    parser.add_argument(
-        "--ckpt_path",
-        type=str,
-        default="./ckpt/model_tracker_fixed_e20.pt",
-    )
-```
-## 💎 Observation
-Note: A large `--input_frame` value may significantly slow down saving the visualization results. Please try a smaller number first.
-```bash
-python eval/eval_scannet.py --input_frame 30 --vis_attn_map --merging 0
-```
-We observe that many token-level attention maps are highly similar in each block, motivating our optimization of the Global Attention module.
-<img src="assets/attn_map.png" alt="Attention map visualization" width="">
-## 🏀 Evaluation
-### Custom Dataset
-Please organize the data according to the following directory:
-```
-<data_path>/
-├── images/
-│   ├── 000000.jpg
-│   ├── 000001.jpg
-│   └── ...
-├── pose/                # Optional: Camera poses
-│   ├── 000000.txt
-│   ├── 000001.txt
-│   └── ...
-└── gt_ply/              # Optional: GT point cloud
-    └── scene_xxx.ply
-```
-- Required: `images/`
-- Additionally required when `--enable_evaluation` is enabled: `pose/` and `gt_ply/`
-Inference only:
-```bash
-python eval/eval_custom.py \
-  --data_path /path/to/your_dataset \
-  --output_path ./eval_results_custom \
-  --plot
-```
-Inference + Evaluation (requires `pose/` and `gt_ply/`):
-```bash
-python eval/eval_custom.py \
-  --data_path /path/to/your_dataset \
-  --enable_evaluation \
-  --output_path ./eval_results_custom \
-  --plot
-```
-### ScanNet
-Evaluate FastVGGT on the ScanNet dataset with 1,000 input images. The **--merging** parameter specifies the block index at which the merging strategy is applied:
-```bash
-python eval/eval_scannet.py --input_frame 1000 --merging 0
-```
-Evaluate Baseline VGGT on the ScanNet dataset with 1,000 input images:
-```bash
-python eval/eval_scannet.py --input_frame 1000
-```
-<img src="assets/vs.png" alt="FastVGGT vs. VGGT comparison" width="">
-### 7 Scenes & NRGBD
-Evaluate across two datasets, sampling keyframes every 10 frames:
-```bash
-python eval/eval_7andN.py --kf 10
-```
-## 🍺 Acknowledgements
-- Thanks to these great repositories: [VGGT](https://github.com/facebookresearch/vggt), [DUSt3R](https://github.com/naver/dust3r), [Fast3R](https://github.com/facebookresearch/fast3r), [CUT3R](https://github.com/CUT3R/CUT3R), [MV-DUSt3R+](https://github.com/facebookresearch/mvdust3r), [StreamVGGT](https://github.com/wzzheng/StreamVGGT), [VGGT-Long](https://github.com/DengKaiCQ/VGGT-Long), [ToMeSD](https://github.com/dbolya/tomesd) and many other inspiring works in the community.
-- Special thanks to [Jianyuan Wang](https://jytime.github.io/) for his valuable discussions and suggestions on this work.
-<!-- ## ✍️ Checklist
-- [ ] Release the evaluation code on 7 Scenes / NRGBD -->
-## ⚖️ License
-See the [LICENSE](./LICENSE.txt) file for details about the license under which this code is made available.
-## Citation
-If you find this project helpful, please consider citing the following paper:
-```
-@article{shen2025fastvggt,
-  title={FastVGGT: Training-Free Acceleration of Visual Geometry Transformer},
-  author={Shen, You and Zhang, Zhipeng and Qu, Yansong and Cao, Liujuan},
-  journal={arXiv preprint arXiv:2509.02560},
-  year={2025}
-}
-```

FastVGGT/assets/attn_map.png DELETED Viewed

Git LFS Details

SHA256: 8477957f593c203bcf41df91ac3ed0d22329e22250fdab9f8f8674340964242c
Pointer size: 132 Bytes
Size of remote file: 2.37 MB

FastVGGT/assets/autolab_logo.png DELETED Viewed

Git LFS Details

SHA256: 4fcead3160cbf561c4385cc8b938a17a94652e3d849da6497f053d32d1245596
Pointer size: 132 Bytes
Size of remote file: 5.13 MB

FastVGGT/assets/maclab_logo.png DELETED Viewed

Binary file (4.8 kB)

FastVGGT/assets/main.png DELETED Viewed

Git LFS Details

SHA256: eecacb414647f01dc8a52b4aba5ff2556733f46d1b9129613e3f59aceff69685
Pointer size: 131 Bytes
Size of remote file: 884 kB

FastVGGT/assets/vs.png DELETED Viewed

Git LFS Details

SHA256: dac1ea5397b9f985c6fb86fc5f321313cc5a372d0917b7c1c1c7dd2cea6bec5f
Pointer size: 131 Bytes
Size of remote file: 230 kB

FastVGGT/eval/__pycache__/base.cpython-310.pyc DELETED Viewed

Binary file (6.92 kB)

FastVGGT/eval/__pycache__/criterion.cpython-310.pyc DELETED Viewed

Binary file (13.6 kB)

FastVGGT/eval/__pycache__/data.cpython-310.pyc DELETED Viewed

Binary file (7.78 kB)

FastVGGT/eval/__pycache__/data.cpython-37.pyc DELETED Viewed

Binary file (8.03 kB)

FastVGGT/eval/__pycache__/utils.cpython-310.pyc DELETED Viewed

Binary file (3.99 kB)

FastVGGT/eval/__pycache__/utils.cpython-37.pyc DELETED Viewed

Binary file (4.32 kB)

FastVGGT/eval/base.py DELETED Viewed

@@ -1,273 +0,0 @@
-# Copyright (C) 2024-present Naver Corporation. All rights reserved.
-# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
-#
-# --------------------------------------------------------
-# base class for implementing datasets
-# --------------------------------------------------------
-import PIL
-import numpy as np
-import torch
-from dataset_utils.transforms import ImgNorm
-import dataset_utils.cropping as cropping
-from utils import depthmap_to_absolute_camera_coordinates
-class BaseStereoViewDataset:
-    """Define all basic options.
-    Usage:
-        class MyDataset (BaseStereoViewDataset):
-            def _get_views(self, idx, rng):
-                # overload here
-                views = []
-                views.append(dict(img=, ...))
-                return views
-    """
-    def __init__(
-        self,
-        *,  # only keyword arguments
-        split=None,
-        resolution=None,  # square_size or (width, height) or list of [(width,height), ...]
-        transform=ImgNorm,
-        aug_crop=False,
-        seed=None,
-    ):
-        self.num_views = 2
-        self.split = split
-        self._set_resolutions(resolution)
-        self.transform = transform
-        if isinstance(transform, str):
-            transform = eval(transform)
-        self.aug_crop = aug_crop
-        self.seed = seed
-    def __len__(self):
-        return len(self.scenes)
-    def get_stats(self):
-        return f"{len(self)} pairs"
-    def __repr__(self):
-        resolutions_str = "[" + ";".join(f"{w}x{h}" for w, h in self._resolutions) + "]"
-        return (
-            f"""{type(self).__name__}({self.get_stats()},
-            {self.split=},
-            {self.seed=},
-            resolutions={resolutions_str},
-            {self.transform=})""".replace(
-                "self.", ""
-            )
-            .replace("\n", "")
-            .replace("   ", "")
-        )
-    def _get_views(self, idx, resolution, rng):
-        raise NotImplementedError()
-    def __getitem__(self, idx):
-        if isinstance(idx, tuple):
-            # the idx is specifying the aspect-ratio
-            idx, ar_idx = idx
-        else:
-            assert len(self._resolutions) == 1
-            ar_idx = 0
-        # set-up the rng
-        if self.seed:  # reseed for each __getitem__
-            self._rng = np.random.default_rng(seed=self.seed + idx)
-        elif not hasattr(self, "_rng"):
-            seed = torch.initial_seed()  # this is different for each dataloader process
-            self._rng = np.random.default_rng(seed=seed)
-        # over-loaded code
-        resolution = self._resolutions[
-            ar_idx
-        ]  # DO NOT CHANGE THIS (compatible with BatchedRandomSampler)
-        views = self._get_views(idx, resolution, self._rng)
-        # check data-types
-        for v, view in enumerate(views):
-            assert (
-                "pts3d" not in view
-            ), f"pts3d should not be there, they will be computed afterwards based on intrinsics+depthmap for view {view_name(view)}"
-            view["idx"] = v
-            # encode the image
-            width, height = view["img"].size
-            view["true_shape"] = np.int32((height, width))
-            view["img"] = self.transform(view["img"])
-            assert "camera_intrinsics" in view
-            if "camera_pose" not in view:
-                view["camera_pose"] = np.full((4, 4), np.nan, dtype=np.float32)
-            else:
-                assert np.isfinite(
-                    view["camera_pose"]
-                ).all(), f"NaN in camera pose for view {view_name(view)}"
-            assert "pts3d" not in view
-            assert "valid_mask" not in view
-            assert np.isfinite(
-                view["depthmap"]
-            ).all(), f"NaN in depthmap for view {view_name(view)}"
-            pts3d, valid_mask = depthmap_to_absolute_camera_coordinates(**view)
-            view["pts3d"] = pts3d
-            view["valid_mask"] = valid_mask & np.isfinite(pts3d).all(axis=-1)
-            # check all datatypes
-            for key, val in view.items():
-                res, err_msg = is_good_type(key, val)
-                assert res, f"{err_msg} with {key}={val} for view {view_name(view)}"
-            K = view["camera_intrinsics"]
-            view["img_mask"] = True
-            view["ray_mask"] = False
-            view["ray_map"] = torch.full(
-                (6, view["img"].shape[-2], view["img"].shape[-1]), torch.nan
-            )
-            view["update"] = True
-            view["reset"] = False
-        # last thing done!
-        for view in views:
-            # transpose to make sure all views are the same size
-            transpose_to_landscape(view)
-            # this allows checking whether the RNG is in the same state each time
-            view["rng"] = int.from_bytes(self._rng.bytes(4), "big")
-        return views
-    def _set_resolutions(self, resolutions):
-        """Set the resolution(s) of the dataset.
-        Params:
-            - resolutions: int or tuple or list of tuples
-        """
-        assert resolutions is not None, "undefined resolution"
-        if not isinstance(resolutions, list):
-            resolutions = [resolutions]
-        self._resolutions = []
-        for resolution in resolutions:
-            if isinstance(resolution, int):
-                width = height = resolution
-            else:
-                width, height = resolution
-            assert isinstance(
-                width, int
-            ), f"Bad type for {width=} {type(width)=}, should be int"
-            assert isinstance(
-                height, int
-            ), f"Bad type for {height=} {type(height)=}, should be int"
-            assert width >= height
-            self._resolutions.append((width, height))
-    def _crop_resize_if_necessary(
-        self, image, depthmap, intrinsics, resolution, rng=None, info=None
-    ):
-        """This function:
-        - first downsizes the image with LANCZOS interpolation,
-          which is better than bilinear interpolation
-        """
-        if not isinstance(image, PIL.Image.Image):
-            image = PIL.Image.fromarray(image)
-        # downscale with lanczos interpolation so that image.size == resolution
-        # cropping centered on the principal point
-        W, H = image.size
-        cx, cy = intrinsics[:2, 2].round().astype(int)
-        # calculate min distance to margin
-        min_margin_x = min(cx, W - cx)
-        min_margin_y = min(cy, H - cy)
-        assert min_margin_x > W / 5, f"Bad principal point in view={info}"
-        assert min_margin_y > H / 5, f"Bad principal point in view={info}"
-        ## Center crop
-        # Crop on the principal point, make it always centered
-        # the new window will be a rectangle of size (2*min_margin_x, 2*min_margin_y) centered on (cx,cy)
-        l, t = cx - min_margin_x, cy - min_margin_y
-        r, b = cx + min_margin_x, cy + min_margin_y
-        crop_bbox = (l, t, r, b)
-        image, depthmap, intrinsics = cropping.crop_image_depthmap(
-            image, depthmap, intrinsics, crop_bbox
-        )
-        # # transpose the resolution if necessary
-        W, H = image.size  # new size
-        assert resolution[0] >= resolution[1]
-        if H > 1.1 * W:
-            # image is portrait mode
-            resolution = resolution[::-1]
-        elif 0.9 < H / W < 1.1 and resolution[0] != resolution[1]:
-            # image is square, so we choose (portrait or landscape) randomly
-            if rng.integers(2):
-                resolution = resolution[::-1]
-        # high-quality Lanczos down-scaling
-        target_resolution = np.array(resolution)
-        # # if self.aug_crop > 1:
-        # #     target_resolution += rng.integers(0, self.aug_crop)
-        # if resolution != (224, 224):
-        #     halfw, halfh = ((2*(W//2))//16)*8, ((2*(H//2))//16)*8
-        #     ## Rescale with max factor, so one of width or height might be larger than target_resolution
-        #     image, depthmap, intrinsics = cropping.rescale_image_depthmap(image, depthmap, intrinsics, (2*halfw, 2*halfh))
-        # else:
-        image, depthmap, intrinsics = cropping.rescale_image_depthmap(
-            image, depthmap, intrinsics, target_resolution
-        )
-        # actual cropping (if necessary) with bilinear interpolation
-        # if resolution == (224, 224):
-        intrinsics2 = cropping.camera_matrix_of_crop(
-            intrinsics, image.size, resolution, offset_factor=0.5
-        )
-        crop_bbox = cropping.bbox_from_intrinsics_in_out(
-            intrinsics, intrinsics2, resolution
-        )
-        image, depthmap, intrinsics = cropping.crop_image_depthmap(
-            image, depthmap, intrinsics, crop_bbox
-        )
-        return image, depthmap, intrinsics
-def is_good_type(key, v):
-    """returns (is_good, err_msg)"""
-    if isinstance(v, (str, int, tuple)):
-        return True, None
-    if v.dtype not in (np.float32, torch.float32, bool, np.int32, np.int64, np.uint8):
-        return False, f"bad {v.dtype=}"
-    return True, None
-def view_name(view, batch_index=None):
-    def sel(x):
-        return x[batch_index] if batch_index not in (None, slice(None)) else x
-    db = sel(view["dataset"])
-    label = sel(view["label"])
-    instance = sel(view["instance"])
-    return f"{db}/{label}/{instance}"
-def transpose_to_landscape(view):
-    height, width = view["true_shape"]
-    if width < height:
-        # rectify portrait to landscape
-        assert view["img"].shape == (3, height, width)
-        view["img"] = view["img"].swapaxes(1, 2)
-        assert view["valid_mask"].shape == (height, width)
-        view["valid_mask"] = view["valid_mask"].swapaxes(0, 1)
-        assert view["depthmap"].shape == (height, width)
-        view["depthmap"] = view["depthmap"].swapaxes(0, 1)
-        assert view["pts3d"].shape == (height, width, 3)
-        view["pts3d"] = view["pts3d"].swapaxes(0, 1)
-        # transpose x and y pixels
-        view["camera_intrinsics"] = view["camera_intrinsics"][[1, 0, 2]]

FastVGGT/eval/criterion.py DELETED Viewed

@@ -1,534 +0,0 @@
-import torch
-import torch.nn as nn
-from copy import copy, deepcopy
-from eval.dataset_utils.corr import geotrf, inv
-def invalid_to_nans(arr, valid_mask, ndim=999):
-    if valid_mask is not None:
-        arr = arr.clone()
-        arr[~valid_mask] = float("nan")
-    if arr.ndim > ndim:
-        arr = arr.flatten(-2 - (arr.ndim - ndim), -2)
-    return arr
-def invalid_to_zeros(arr, valid_mask, ndim=999):
-    if valid_mask is not None:
-        arr = arr.clone()
-        arr[~valid_mask] = 0
-        nnz = valid_mask.view(len(valid_mask), -1).sum(1)
-    else:
-        nnz = arr.numel() // len(arr) if len(arr) else 0  # number of points per image
-    if arr.ndim > ndim:
-        arr = arr.flatten(-2 - (arr.ndim - ndim), -2)
-    return arr, nnz
-class BaseCriterion(nn.Module):
-    def __init__(self, reduction="mean"):
-        super().__init__()
-        self.reduction = reduction
-class Criterion(nn.Module):
-    def __init__(self, criterion=None):
-        super().__init__()
-        assert isinstance(
-            criterion, BaseCriterion
-        ), f"{criterion} is not a proper criterion!"
-        self.criterion = copy(criterion)
-    def get_name(self):
-        return f"{type(self).__name__}({self.criterion})"
-    def with_reduction(self, mode="none"):
-        res = loss = deepcopy(self)
-        while loss is not None:
-            assert isinstance(loss, Criterion)
-            loss.criterion.reduction = mode  # make it return the loss for each sample
-            loss = loss._loss2  # we assume loss is a MultiLoss
-        return res
-class MultiLoss(nn.Module):
-    """Easily combinable losses (also keep track of individual loss values):
-        loss = MyLoss1() + 0.1*MyLoss2()
-    Usage:
-        Inherit from this class and override get_name() and compute_loss()
-    """
-    def __init__(self):
-        super().__init__()
-        self._alpha = 1
-        self._loss2 = None
-    def compute_loss(self, *args, **kwargs):
-        raise NotImplementedError()
-    def get_name(self):
-        raise NotImplementedError()
-    def __mul__(self, alpha):
-        assert isinstance(alpha, (int, float))
-        res = copy(self)
-        res._alpha = alpha
-        return res
-    __rmul__ = __mul__  # same
-    def __add__(self, loss2):
-        assert isinstance(loss2, MultiLoss)
-        res = cur = copy(self)
-        while cur._loss2 is not None:
-            cur = cur._loss2
-        cur._loss2 = loss2
-        return res
-    def __repr__(self):
-        name = self.get_name()
-        if self._alpha != 1:
-            name = f"{self._alpha:g}*{name}"
-        if self._loss2:
-            name = f"{name} + {self._loss2}"
-        return name
-    def forward(self, *args, **kwargs):
-        loss = self.compute_loss(*args, **kwargs)
-        if isinstance(loss, tuple):
-            loss, details = loss
-        elif loss.ndim == 0:
-            details = {self.get_name(): float(loss)}
-        else:
-            details = {}
-        loss = loss * self._alpha
-        if self._loss2:
-            loss2, details2 = self._loss2(*args, **kwargs)
-            loss = loss + loss2
-            details |= details2
-        return loss, details
-class LLoss(BaseCriterion):
-    """L-norm loss"""
-    def forward(self, a, b):
-        assert (
-            a.shape == b.shape and a.ndim >= 2 and 1 <= a.shape[-1] <= 3
-        ), f"Bad shape = {a.shape}"
-        dist = self.distance(a, b)
-        if self.reduction == "none":
-            return dist
-        if self.reduction == "sum":
-            return dist.sum()
-        if self.reduction == "mean":
-            return dist.mean() if dist.numel() > 0 else dist.new_zeros(())
-        raise ValueError(f"bad {self.reduction=} mode")
-    def distance(self, a, b):
-        raise NotImplementedError()
-class L21Loss(LLoss):
-    """Euclidean distance between 3d points"""
-    def distance(self, a, b):
-        return torch.norm(a - b, dim=-1)  # Euclidean (L2) norm of the difference over the last axis
-L21 = L21Loss()
-def get_pred_pts3d(gt, pred, use_pose=False):
-    assert use_pose is True
-    return pred["pts3d_in_other_view"]  # return!
-def Sum(losses, masks, conf=None):
-    loss, mask = losses[0], masks[0]
-    if loss.ndim > 0:
-        # we are actually returning the loss for every pixels
-        if conf is not None:
-            return losses, masks, conf
-        return losses, masks
-    else:
-        # we are returning the global loss
-        for loss2 in losses[1:]:
-            loss = loss + loss2
-        return loss
-def get_norm_factor(pts, norm_mode="avg_dis", valids=None, fix_first=True):
-    assert pts[0].ndim >= 3 and pts[0].shape[-1] == 3
-    assert pts[1] is None or (pts[1].ndim >= 3 and pts[1].shape[-1] == 3)
-    norm_mode, dis_mode = norm_mode.split("_")
-    nan_pts = []
-    nnzs = []
-    if norm_mode == "avg":
-        # gather all points together (joint normalization)
-        for i, pt in enumerate(pts):
-            nan_pt, nnz = invalid_to_zeros(pt, valids[i], ndim=3)
-            nan_pts.append(nan_pt)
-            nnzs.append(nnz)
-            if fix_first:
-                break
-        all_pts = torch.cat(nan_pts, dim=1)
-        # compute distance to origin
-        all_dis = all_pts.norm(dim=-1)
-        if dis_mode == "dis":
-            pass  # do nothing
-        elif dis_mode == "log1p":
-            all_dis = torch.log1p(all_dis)
-        else:
-            raise ValueError(f"bad {dis_mode=}")
-        norm_factor = all_dis.sum(dim=1) / (torch.cat(nnzs).sum() + 1e-8)
-    else:
-        raise ValueError(f"Not implemented {norm_mode=}")
-    norm_factor = norm_factor.clip(min=1e-8)
-    while norm_factor.ndim < pts[0].ndim:
-        norm_factor.unsqueeze_(-1)
-    return norm_factor
-def normalize_pointcloud_t(
-    pts, norm_mode="avg_dis", valids=None, fix_first=True, gt=False
-):
-    if gt:
-        norm_factor = get_norm_factor(pts, norm_mode, valids, fix_first)
-        res = []
-        for i, pt in enumerate(pts):
-            res.append(pt / norm_factor)
-    else:
-        # pts_l, pts_r = pts
-        # use pts_l and pts_r[-1] as pts to normalize
-        norm_factor = get_norm_factor(pts, norm_mode, valids, fix_first)
-        res = []
-        for i in range(len(pts)):
-            res.append(pts[i] / norm_factor)
-            # res_r.append(pts_r[i] / norm_factor)
-        # res = [res_l, res_r]
-    return res, norm_factor
-@torch.no_grad()
-def get_joint_pointcloud_depth(zs, valid_masks=None, quantile=0.5):
-    """Return a per-sample depth statistic computed jointly over all views.
-
-    Each depth map is passed through ``invalid_to_nans`` with its mask (or
-    ``None`` when no masks are given), flattened per sample, and concatenated
-    with the other views. The NaN-aware median is returned when ``quantile``
-    is 0.5, otherwise the NaN-aware ``quantile``.
-    """
-    flattened = []
-    for idx, z in enumerate(zs):
-        mask = None if valid_masks is None else valid_masks[idx]
-        flattened.append(invalid_to_nans(z, mask).reshape(len(z), -1))
-    joint = torch.cat(flattened, dim=-1)
-    if quantile != 0.5:
-        return torch.nanquantile(joint, quantile, dim=-1)  # (B,)
-    return torch.nanmedian(joint, dim=-1).values  # (B,)
-@torch.no_grad()
-def get_joint_pointcloud_center_scale(pts, valid_masks=None, z_only=False, center=True):
-    """Return the median center and median radius of all views taken jointly.
-
-    Each point map is passed through ``invalid_to_nans`` with its mask (or
-    ``None``), reshaped to ``(len(pt), -1, 3)`` and concatenated along the
-    point axis. With ``z_only`` the X and Y components of the center are
-    zeroed. The radius is the NaN-aware median norm of the points, taken
-    relative to the center when ``center`` is true and to the origin
-    otherwise.
-
-    Returns:
-        ``(center, scale)`` indexed as ``center[:, None, :, :]`` and
-        ``scale[:, None, None, None]``.
-    """
-    per_view = []
-    for idx, pt in enumerate(pts):
-        mask = None if valid_masks is None else valid_masks[idx]
-        per_view.append(invalid_to_nans(pt, mask).reshape(len(pt), -1, 3))
-    cloud = torch.cat(per_view, dim=1)
-    median_center = torch.nanmedian(cloud, dim=1, keepdim=True).values  # (B,1,3)
-    if z_only:
-        median_center[..., :2] = 0  # leave X and Y uncentered
-    offsets = (cloud - median_center) if center else cloud
-    median_radius = torch.nanmedian(offsets.norm(dim=-1), dim=1).values
-    return median_center[:, None, :, :], median_radius[:, None, None, None]
-class Regr3D_t(Criterion, MultiLoss):
-    """Multi-view 3D point regression loss expressed in the first view's camera frame.
-
-    Attributes set in ``__init__``:
-        norm_mode: mode string forwarded to ``normalize_pointcloud_t``; a
-            falsy value disables normalization.
-        gt_scale: when true, ground-truth points are left unnormalized.
-        fix_first: forwarded to ``normalize_pointcloud_t``.
-    """
-    def __init__(self, criterion, norm_mode="avg_dis", gt_scale=False, fix_first=True):
-        super().__init__(criterion)
-        self.norm_mode = norm_mode
-        self.gt_scale = gt_scale
-        self.fix_first = fix_first
-    def get_all_pts3d_t(self, gts, preds, dist_clip=None):
-        """Collect ground-truth points, predicted points and validity masks per view.
-
-        Returns:
-            ``(gt_pts, pr_pts, gt_factor, pr_factor, valids, monitoring)``.
-            The three lists hold one entry per view; a factor is ``None``
-            when the corresponding normalization is skipped; ``monitoring``
-            is an empty dict here.
-        """
-        # everything is normalized w.r.t. camera of view1
-        in_camera1 = inv(gts[0]["camera_pose"])
-        gt_pts = []
-        valids = []
-        pr_pts = []
-        for i, gt in enumerate(gts):
-            # in_camera1: Bs, 4, 4 gt['pts3d']: Bs, H, W, 3
-            gt_pts.append(geotrf(in_camera1, gt["pts3d"]))
-            valid = gt["valid_mask"].clone()
-            if dist_clip is not None:
-                # points that are too far-away == invalid
-                # NOTE(review): the distance is measured on the untransformed
-                # gt["pts3d"], not on the points moved into camera 1 above --
-                # confirm this is intended.
-                dis = gt["pts3d"].norm(dim=-1)
-                valid = valid & (dis <= dist_clip)
-            valids.append(valid)
-            pr_pts.append(get_pred_pts3d(gt, preds[i], use_pose=True))
-            # if i != len(gts)-1:
-            #     pr_pts_l.append(get_pred_pts3d(gt, preds[i][0], use_pose=(i!=0)))
-            # if i != 0:
-            #     pr_pts_r.append(get_pred_pts3d(gt, preds[i-1][1], use_pose=(i!=0)))
-        # pr_pts = (pr_pts_l, pr_pts_r)
-        # Predictions are normalized whenever a norm mode is set.
-        if self.norm_mode:
-            pr_pts, pr_factor = normalize_pointcloud_t(
-                pr_pts, self.norm_mode, valids, fix_first=self.fix_first, gt=False
-            )
-        else:
-            pr_factor = None
-        # Ground truth is normalized only when its own scale is not kept.
-        if self.norm_mode and not self.gt_scale:
-            gt_pts, gt_factor = normalize_pointcloud_t(
-                gt_pts, self.norm_mode, valids, fix_first=self.fix_first, gt=True
-            )
-        else:
-            gt_factor = None
-        return gt_pts, pr_pts, gt_factor, pr_factor, valids, {}
-    def compute_frame_loss(self, gts, preds, **kw):
-        """Compute per-frame losses plus a penalty on over-estimated scale.
-
-        Returns:
-            ``(Sum(loss_all, mask_all, conf_all), details | monitoring,
-            factor_loss)``.
-        """
-        gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring = (
-            self.get_all_pts3d_t(gts, preds, **kw)
-        )
-        # NOTE(review): get_all_pts3d_t returns one prediction per view in a
-        # flat list, yet this unpacks it as a (left, right) pair and indexes
-        # preds[i][0] / preds[i - 1][1] below. This looks like a leftover of
-        # the commented-out pairwise layout -- confirm before relying on it.
-        pred_pts_l, pred_pts_r = pred_pts
-        loss_all = []
-        mask_all = []
-        conf_all = []
-        # Running sums used only for the monitoring values in `details`.
-        loss_left = 0
-        loss_right = 0
-        pred_conf_l = 0
-        pred_conf_r = 0
-        for i in range(len(gt_pts)):
-            # Left (Reference)
-            if i != len(gt_pts) - 1:
-                frame_loss = self.criterion(
-                    pred_pts_l[i][masks[i]], gt_pts[i][masks[i]]
-                )
-                loss_all.append(frame_loss)
-                mask_all.append(masks[i])
-                conf_all.append(preds[i][0]["conf"])
-                # To compare target/reference loss
-                if i != 0:
-                    loss_left += frame_loss.cpu().detach().numpy().mean()
-                    pred_conf_l += preds[i][0]["conf"].cpu().detach().numpy().mean()
-            # Right (Target)
-            if i != 0:
-                frame_loss = self.criterion(
-                    pred_pts_r[i - 1][masks[i]], gt_pts[i][masks[i]]
-                )
-                loss_all.append(frame_loss)
-                mask_all.append(masks[i])
-                conf_all.append(preds[i - 1][1]["conf"])
-                # To compare target/reference loss
-                if i != len(gt_pts) - 1:
-                    loss_right += frame_loss.cpu().detach().numpy().mean()
-                    pred_conf_r += preds[i - 1][1]["conf"].cpu().detach().numpy().mean()
-        # Only predicted factors larger than the ground-truth factor are kept.
-        if pr_factor is not None and gt_factor is not None:
-            filter_factor = pr_factor[pr_factor > gt_factor]
-        else:
-            filter_factor = []
-        if len(filter_factor) > 0:
-            # NOTE(review): filter_factor is a flattened selection while
-            # gt_factor keeps its original shape, so this subtraction
-            # broadcasts -- confirm the pairing is as intended.
-            factor_loss = (filter_factor - gt_factor).abs().mean()
-        else:
-            factor_loss = 0.0
-        self_name = type(self).__name__
-        # The first two loss terms are reported individually, so at least two
-        # terms must have been collected.
-        details = {
-            self_name + "_pts3d_1": float(loss_all[0].mean()),
-            self_name + "_pts3d_2": float(loss_all[1].mean()),
-            self_name + "loss_left": float(loss_left),
-            self_name + "loss_right": float(loss_right),
-            self_name + "conf_left": float(pred_conf_l),
-            self_name + "conf_right": float(pred_conf_r),
-        }
-        return Sum(loss_all, mask_all, conf_all), (details | monitoring), factor_loss
-class ConfLoss_t(MultiLoss):
-    """Weight a per-pixel regression loss by a learned confidence.
-
-    The wrapped ``pixel_loss`` is switched to ``"none"`` reduction so that it
-    yields per-pixel values. Every loss term ``x`` is turned into
-    ``x * conf - alpha * log(conf)``:
-
-        conf = 0.1 ==> conf_loss = x / 10 + alpha * log(10)
-        conf = 10  ==> conf_loss = x * 10 - alpha * log(10)
-
-    ``alpha`` is a strictly positive hyperparameter.
-    """
-    def __init__(self, pixel_loss, alpha=1):
-        super().__init__()
-        assert alpha > 0
-        self.alpha = alpha
-        self.pixel_loss = pixel_loss.with_reduction("none")
-    def get_name(self):
-        return f"ConfLoss({self.pixel_loss})"
-    def get_conf_log(self, x):
-        """Return the confidence and its natural logarithm."""
-        return x, torch.log(x)
-    def compute_frame_loss(self, gts, preds, **kw):
-        """Return the confidence-weighted loss, monitoring values and factor loss.
-
-        Returns:
-            ``(mean_loss, details, loss_factor)`` where ``details`` contains
-            the first two weighted terms, the mean confidence, and whatever
-            the wrapped loss reported. At least two loss terms are required.
-        """
-        # compute per-pixel loss
-        (losses, masks, confs), details, loss_factor = (
-            self.pixel_loss.compute_frame_loss(gts, preds, **kw)
-        )
-        # weight by confidence
-        conf_losses = []
-        conf_sum = 0
-        for loss, mask, conf_map in zip(losses, masks, confs):
-            conf, log_conf = self.get_conf_log(conf_map[mask])
-            conf_sum += conf.mean()
-            conf_loss = loss * conf - self.alpha * log_conf
-            if conf_loss.numel() > 0:
-                conf_loss = conf_loss.mean()
-            else:
-                # An empty mask contributes zero. It has to be a tensor: a
-                # plain Python 0 makes torch.stack below raise.
-                conf_loss = conf_loss.new_zeros(())
-            conf_losses.append(conf_loss)
-        conf_losses = torch.stack(conf_losses) * 2.0
-        conf_loss_mean = conf_losses.mean()
-        return (
-            conf_loss_mean,
-            dict(
-                conf_loss_1=float(conf_losses[0]),
-                conf_loss2=float(conf_losses[1]),
-                conf_mean=conf_sum / len(losses),
-                **details,
-            ),
-            loss_factor,
-        )
-class Regr3D_t_ShiftInv(Regr3D_t):
-    """Same as Regr3D_t but invariant to a shift along the depth (z) axis."""
-    def get_all_pts3d_t(self, gts, preds, **kw):
-        """Return the parent's points with the joint median depth removed.
-
-        Extra keyword arguments (e.g. ``dist_clip``) are forwarded to the
-        parent implementation; ``compute_frame_loss`` passes its ``**kw``
-        here, so dropping them would raise ``TypeError``.
-        """
-        # points as produced (and possibly normalized) by the parent class
-        gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring = (
-            super().get_all_pts3d_t(gts, preds, **kw)
-        )
-        gt_zs = [gt_pt[..., 2] for gt_pt in gt_pts]
-        pred_zs = [pred_pt[..., 2] for pred_pt in pred_pts]
-        # compute median depth
-        gt_shift_z = get_joint_pointcloud_depth(gt_zs, masks)[:, None, None]
-        pred_shift_z = get_joint_pointcloud_depth(pred_zs, masks)[:, None, None]
-        # subtract the median depth (in place on the z channel)
-        for i in range(len(gt_pts)):
-            gt_pts[i][..., 2] -= gt_shift_z
-        for i in range(len(pred_pts)):
-            pred_pts[i][..., 2] -= pred_shift_z
-        monitoring = dict(
-            monitoring,
-            gt_shift_z=gt_shift_z.mean().detach(),
-            pred_shift_z=pred_shift_z.mean().detach(),
-        )
-        return gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring
-class Regr3D_t_ScaleInv(Regr3D_t):
-    """Same as Regr3D_t but invariant to the overall scene scale.
-
-    if gt_scale == True: enforce the prediction to take the same scale as GT
-    """
-    def get_all_pts3d_t(self, gts, preds, **kw):
-        """Return the parent's points rescaled by the joint median radius.
-
-        Extra keyword arguments (e.g. ``dist_clip``) are forwarded to the
-        parent implementation; ``compute_frame_loss`` passes its ``**kw``
-        here, so dropping them would raise ``TypeError``.
-        """
-        # points as produced by the parent class
-        gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring = (
-            super().get_all_pts3d_t(gts, preds, **kw)
-        )
-        # measure scene scale (on copies of the predictions)
-        pred_pts_all = [x.clone() for x in pred_pts]
-        _, gt_scale = get_joint_pointcloud_center_scale(gt_pts, masks)
-        _, pred_scale = get_joint_pointcloud_center_scale(pred_pts_all, masks)
-        # prevent predictions to be in a ridiculous range
-        pred_scale = pred_scale.clip(min=1e-3, max=1e3)
-        # rescale the point clouds (in place)
-        if self.gt_scale:
-            for i in range(len(pred_pts)):
-                pred_pts[i] *= gt_scale / pred_scale
-        else:
-            # NOTE(review): this multiplies predictions by pred/gt and ground
-            # truth by gt/pred rather than dividing each by its own scale --
-            # kept as is, confirm this is intended.
-            for i in range(len(pred_pts)):
-                pred_pts[i] *= pred_scale / gt_scale
-            for i in range(len(gt_pts)):
-                gt_pts[i] *= gt_scale / pred_scale
-        monitoring = dict(
-            monitoring, gt_scale=gt_scale.mean(), pred_scale=pred_scale.mean().detach()
-        )
-        return gt_pts, pred_pts, gt_factor, pr_factor, masks, monitoring
-class Regr3D_t_ScaleShiftInv(Regr3D_t_ScaleInv, Regr3D_t_ShiftInv):
-    """Combination of the shift-invariant and scale-invariant variants.
-
-    With this base order, ``Regr3D_t_ScaleInv.get_all_pts3d_t`` runs and its
-    ``super()`` call resolves to ``Regr3D_t_ShiftInv``: the depth shift is
-    removed first, then the scale is adjusted.
-    """

FastVGGT/eval/data.py DELETED Viewed

@@ -1,338 +0,0 @@
-import os
-import cv2
-import numpy as np
-import os.path as osp
-from collections import deque
-from base import BaseStereoViewDataset
-import dataset_utils.cropping as cropping
-from vggt.utils.eval_utils import imread_cv2, shuffle_deque
-class SevenScenes(BaseStereoViewDataset):
-    """7-Scenes RGB-D sequences served as lists of per-frame view dicts.
-
-    Sequences are discovered under ``ROOT`` from each scene's split file.
-    When ``tuple_list`` is given, each entry is a space-separated string
-    ``"<scene_id> <frame_idx> <frame_idx> ..."`` that selects the frames of
-    one sample directly.
-    """
-    def __init__(
-        self,
-        num_seq=1,
-        num_frames=5,
-        min_thresh=10,
-        max_thresh=100,
-        test_id=None,
-        full_video=False,
-        tuple_list=None,
-        seq_id=None,
-        rebuttal=False,
-        shuffle_seed=-1,
-        kf_every=1,
-        *args,
-        ROOT,
-        **kwargs,
-    ):
-        """Store the options and index the available sequences.
-
-        Args:
-            num_seq: number of samples exposed per sequence (scales ``len``).
-            test_id: if set, only the scene with this name is indexed.
-            tuple_list: optional explicit list of samples (see class doc).
-            seq_id: if set, only sequences with this ``seq-XX`` id are kept.
-            rebuttal: if true, ``resolution`` is always used as-is.
-            shuffle_seed: frames are shuffled when this is ``>= 0``.
-            kf_every: keep every ``kf_every``-th frame of a sequence.
-            ROOT: dataset root directory (keyword-only, required).
-            num_frames, min_thresh, max_thresh, full_video: stored on the
-                instance; not read by the methods of this class.
-        """
-        self.ROOT = ROOT
-        super().__init__(*args, **kwargs)
-        self.num_seq = num_seq
-        self.num_frames = num_frames
-        self.max_thresh = max_thresh
-        self.min_thresh = min_thresh
-        self.test_id = test_id
-        self.full_video = full_video
-        self.kf_every = kf_every
-        self.seq_id = seq_id
-        self.rebuttal = rebuttal
-        self.shuffle_seed = shuffle_seed
-        # load all scenes
-        self.load_all_tuples(tuple_list)
-        self.load_all_scenes(ROOT)
-    def __len__(self):
-        if self.tuple_list is not None:
-            return len(self.tuple_list)
-        return len(self.scene_list) * self.num_seq
-    def load_all_tuples(self, tuple_list):
-        """Remember the explicit sample list (``None`` when not provided)."""
-        self.tuple_list = tuple_list
-    def load_all_scenes(self, base_dir):
-        """Fill ``self.scene_list`` with ``"<scene>/seq-XX"`` identifiers."""
-        if self.tuple_list is not None:
-            # Use pre-defined simplerecon scene_ids
-            self.scene_list = [
-                "stairs/seq-06",
-                "stairs/seq-02",
-                "pumpkin/seq-06",
-                "chess/seq-01",
-                "heads/seq-02",
-                "fire/seq-02",
-                "office/seq-03",
-                "pumpkin/seq-03",
-                "redkitchen/seq-07",
-                "chess/seq-02",
-                "office/seq-01",
-                "redkitchen/seq-01",
-                "fire/seq-01",
-            ]
-            print(f"Found {len(self.scene_list)} sequences in split {self.split}")
-            return
-        scenes = os.listdir(base_dir)
-        # Raises KeyError for any split other than "train" / "test".
-        file_split = {"train": "TrainSplit.txt", "test": "TestSplit.txt"}[self.split]
-        self.scene_list = []
-        for scene in scenes:
-            if self.test_id is not None and scene != self.test_id:
-                continue
-            # read file split
-            with open(osp.join(base_dir, scene, file_split)) as f:
-                seq_ids = f.read().splitlines()
-            for seq_id in seq_ids:
-                # keep only the digits of the entry and zero-pad them to
-                # two characters: seq-01, seq-02, ...
-                num_part = "".join(filter(str.isdigit, seq_id))
-                seq_id = f"seq-{num_part.zfill(2)}"
-                if self.seq_id is not None and seq_id != self.seq_id:
-                    continue
-                self.scene_list.append(f"{scene}/{seq_id}")
-        print(f"Found {len(self.scene_list)} sequences in split {self.split}")
-    def _get_views(self, idx, resolution, rng):
-        """Load every selected frame of sample ``idx`` as a list of view dicts.
-
-        Each dict holds the image, the depth map in the file's units divided
-        by 1000 (values outside ``[1e-3, 10]`` zeroed), the pose read from the
-        frame's ``.pose.txt`` and the intrinsics returned by the crop/resize
-        step.
-        """
-        if self.tuple_list is not None:
-            line = self.tuple_list[idx].split(" ")
-            scene_id = line[0]
-            img_idxs = line[1:]
-        else:
-            scene_id = self.scene_list[idx // self.num_seq]
-            data_path = osp.join(self.ROOT, scene_id)
-            num_files = len([name for name in os.listdir(data_path) if "color" in name])
-            img_idxs = [f"{i:06d}" for i in range(num_files)]
-            img_idxs = img_idxs[:: self.kf_every]
-        # Intrinsics used in SimpleRecon
-        fx, fy, cx, cy = 525, 525, 320, 240
-        intrinsics_ = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)
-        views = []
-        imgs_idxs = deque(img_idxs)
-        if self.shuffle_seed >= 0:
-            imgs_idxs = shuffle_deque(imgs_idxs)
-        while len(imgs_idxs) > 0:
-            im_idx = imgs_idxs.popleft()
-            impath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.color.png")
-            depthpath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.depth.proj.png")
-            posepath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.pose.txt")
-            rgb_image = imread_cv2(impath)
-            depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
-            rgb_image = cv2.resize(rgb_image, (depthmap.shape[1], depthmap.shape[0]))
-            # 65535 is zeroed before the unit conversion, like the other
-            # out-of-range values below.
-            depthmap[depthmap == 65535] = 0
-            # `nan` must be passed by keyword: the second positional
-            # parameter of np.nan_to_num is `copy`.
-            depthmap = np.nan_to_num(depthmap.astype(np.float32), nan=0.0) / 1000.0
-            depthmap[depthmap > 10] = 0
-            depthmap[depthmap < 1e-3] = 0
-            camera_pose = np.loadtxt(posepath).astype(np.float32)
-            if resolution != (224, 224) or self.rebuttal:
-                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
-                    rgb_image, depthmap, intrinsics_, resolution, rng=rng, info=impath
-                )
-            else:
-                # 224x224 request: resize to 512x384 first, then take the
-                # central 224x224 window.
-                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
-                    rgb_image, depthmap, intrinsics_, (512, 384), rng=rng, info=impath
-                )
-                W, H = rgb_image.size
-                mid_x = W // 2
-                mid_y = H // 2
-                crop_bbox = (mid_x - 112, mid_y - 112, mid_x + 112, mid_y + 112)
-                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
-                    rgb_image, depthmap, intrinsics, crop_bbox
-                )
-            views.append(
-                dict(
-                    img=rgb_image,
-                    depthmap=depthmap,
-                    camera_pose=camera_pose,
-                    camera_intrinsics=intrinsics,
-                    dataset="7scenes",
-                    label=osp.join(scene_id, im_idx),
-                    instance=impath,
-                )
-            )
-        return views
-class NRGBD(BaseStereoViewDataset):
-    """Neural-RGBD scenes served as lists of per-frame view dicts.
-
-    Every sub-directory of ``ROOT`` is one scene containing ``images/``,
-    ``depth/`` and a ``poses.txt`` file. When ``tuple_list`` is given, each
-    entry is a space-separated string ``"<scene_id> <frame_idx> ..."`` that
-    selects the frames of one sample directly.
-    """
-    def __init__(
-        self,
-        num_seq=1,
-        num_frames=5,
-        min_thresh=10,
-        max_thresh=100,
-        test_id=None,
-        full_video=False,
-        tuple_list=None,
-        seq_id=None,
-        rebuttal=False,
-        shuffle_seed=-1,
-        kf_every=1,
-        *args,
-        ROOT,
-        **kwargs,
-    ):
-        """Store the options and index the available scenes.
-
-        Args:
-            num_seq: number of samples exposed per scene (scales ``len``).
-            test_id: if set, this is the only scene used.
-            tuple_list: optional explicit list of samples (see class doc).
-            rebuttal: if true, ``resolution`` is always used as-is.
-            shuffle_seed: frames are shuffled when this is ``>= 0``.
-            kf_every: upper bound on the frame sub-sampling step.
-            ROOT: dataset root directory (keyword-only, required).
-            num_frames, min_thresh, max_thresh, full_video, seq_id: stored
-                on the instance; not read by the methods of this class.
-        """
-        self.ROOT = ROOT
-        super().__init__(*args, **kwargs)
-        self.num_seq = num_seq
-        self.num_frames = num_frames
-        self.max_thresh = max_thresh
-        self.min_thresh = min_thresh
-        self.test_id = test_id
-        self.full_video = full_video
-        self.kf_every = kf_every
-        self.seq_id = seq_id
-        self.rebuttal = rebuttal
-        self.shuffle_seed = shuffle_seed
-        # load all scenes
-        self.load_all_tuples(tuple_list)
-        self.load_all_scenes(ROOT)
-    def __len__(self):
-        if self.tuple_list is not None:
-            return len(self.tuple_list)
-        return len(self.scene_list) * self.num_seq
-    def load_all_tuples(self, tuple_list):
-        """Remember the explicit sample list (``None`` when not provided)."""
-        self.tuple_list = tuple_list
-    def load_all_scenes(self, base_dir):
-        """Fill ``self.scene_list`` with the scene directories to use."""
-        scenes = [
-            d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
-        ]
-        if self.test_id is not None:
-            self.scene_list = [self.test_id]
-        else:
-            self.scene_list = scenes
-        print(f"Found {len(self.scene_list)} sequences in split {self.split}")
-    def load_poses(self, path):
-        """Parse a text file of consecutive 4-line pose matrices.
-
-        A matrix whose first line contains ``"nan"`` is replaced by the
-        identity and flagged invalid.
-
-        Returns:
-            ``(poses, valid)``: a float32 array of the matrices and a list of
-            booleans, one per matrix.
-        """
-        with open(path, "r") as file:
-            lines = file.readlines()
-        poses = []
-        valid = []
-        lines_per_matrix = 4
-        for i in range(0, len(lines), lines_per_matrix):
-            if "nan" in lines[i]:
-                valid.append(False)
-                poses.append(np.eye(4, 4, dtype=np.float32).tolist())
-            else:
-                valid.append(True)
-                pose_floats = [
-                    [float(x) for x in line.split()]
-                    for line in lines[i : i + lines_per_matrix]
-                ]
-                poses.append(pose_floats)
-        return np.array(poses, dtype=np.float32), valid
-    def _get_views(self, idx, resolution, rng):
-        """Load every selected frame of sample ``idx`` as a list of view dicts.
-
-        Each dict holds the image, the depth map in the file's units divided
-        by 1000 (values outside ``[1e-3, 10]`` zeroed), the pose with its
-        second and third columns negated, and the intrinsics returned by the
-        crop/resize step.
-        """
-        if self.tuple_list is not None:
-            line = self.tuple_list[idx].split(" ")
-            scene_id = line[0]
-            img_idxs = line[1:]
-        else:
-            scene_id = self.scene_list[idx // self.num_seq]
-            num_files = len(os.listdir(os.path.join(self.ROOT, scene_id, "images")))
-            img_idxs = [f"{i}" for i in range(num_files)]
-            # The step is capped at half the sequence length; clamp it to 1
-            # so a scene with fewer than two frames does not produce a zero
-            # slice step (ValueError).
-            step = max(1, min(self.kf_every, len(img_idxs) // 2))
-            img_idxs = img_idxs[::step]
-        fx, fy, cx, cy = 554.2562584220408, 554.2562584220408, 320, 240
-        intrinsics_ = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)
-        posepath = osp.join(self.ROOT, scene_id, "poses.txt")
-        camera_poses, _valids = self.load_poses(posepath)
-        imgs_idxs = deque(img_idxs)
-        if self.shuffle_seed >= 0:
-            imgs_idxs = shuffle_deque(imgs_idxs)
-        views = []
-        while len(imgs_idxs) > 0:
-            im_idx = imgs_idxs.popleft()
-            impath = osp.join(self.ROOT, scene_id, "images", f"img{im_idx}.png")
-            depthpath = osp.join(self.ROOT, scene_id, "depth", f"depth{im_idx}.png")
-            rgb_image = imread_cv2(impath)
-            depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
-            # `nan` must be passed by keyword: the second positional
-            # parameter of np.nan_to_num is `copy`.
-            depthmap = np.nan_to_num(depthmap.astype(np.float32), nan=0.0) / 1000.0
-            depthmap[depthmap > 10] = 0
-            depthmap[depthmap < 1e-3] = 0
-            rgb_image = cv2.resize(rgb_image, (depthmap.shape[1], depthmap.shape[0]))
-            # Work on a copy: flipping the row of `camera_poses` in place
-            # would undo itself if the same frame index is listed twice.
-            camera_pose = camera_poses[int(im_idx)].copy()
-            # gl to cv
-            camera_pose[:, 1:3] *= -1.0
-            if resolution != (224, 224) or self.rebuttal:
-                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
-                    rgb_image, depthmap, intrinsics_, resolution, rng=rng, info=impath
-                )
-            else:
-                # 224x224 request: resize to 512x384 first, then take the
-                # central 224x224 window.
-                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
-                    rgb_image, depthmap, intrinsics_, (512, 384), rng=rng, info=impath
-                )
-                W, H = rgb_image.size
-                mid_x = W // 2
-                mid_y = H // 2
-                crop_bbox = (mid_x - 112, mid_y - 112, mid_x + 112, mid_y + 112)
-                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
-                    rgb_image, depthmap, intrinsics, crop_bbox
-                )
-            views.append(
-                dict(
-                    img=rgb_image,
-                    depthmap=depthmap,
-                    camera_pose=camera_pose,
-                    camera_intrinsics=intrinsics,
-                    dataset="nrgbd",
-                    label=osp.join(scene_id, im_idx),
-                    instance=impath,
-                )
-            )
-        return views

FastVGGT/eval/dataset_utils/__init__.py DELETED Viewed

	@@ -1 +0,0 @@
1	-

FastVGGT/eval/dataset_utils/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (146 Bytes)

FastVGGT/eval/dataset_utils/__pycache__/__init__.cpython-37.pyc DELETED Viewed

Binary file (140 Bytes)

FastVGGT/eval/dataset_utils/__pycache__/corr.cpython-310.pyc DELETED Viewed

Binary file (5.85 kB)

FastVGGT/eval/dataset_utils/__pycache__/cropping.cpython-310.pyc DELETED Viewed

Binary file (4.29 kB)

FastVGGT/eval/dataset_utils/__pycache__/cropping.cpython-37.pyc DELETED Viewed

Binary file (4.29 kB)

FastVGGT/eval/dataset_utils/__pycache__/transforms.cpython-310.pyc DELETED Viewed

Binary file (2.18 kB)

FastVGGT/eval/dataset_utils/corr.py DELETED Viewed

@@ -1,234 +0,0 @@
-# Copyright (C) 2024-present Naver Corporation. All rights reserved.
-# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
-#
-# --------------------------------------------------------
-import numpy as np
-import torch
-def todevice(batch, device, callback=None, non_blocking=False):
-    """Transfer some variables to another device (i.e. GPU, CPU:torch, CPU:numpy).
-
-    batch: list, tuple, dict of tensors or other things
-    device: pytorch device or 'numpy'
-    callback: function that is called on every sub-element (containers
-        included) before it is processed.
-    non_blocking: forwarded to ``Tensor.to`` for every tensor that is moved.
-
-    Dicts, tuples and lists are rebuilt with converted contents; any other
-    value that is neither a tensor nor a numpy array is returned unchanged.
-    """
-    if callback:
-        batch = callback(batch)
-    # Recurse with the same options: they used to be dropped here, so nested
-    # values never saw the callback and were always moved in blocking mode.
-    if isinstance(batch, dict):
-        return {
-            k: todevice(v, device, callback=callback, non_blocking=non_blocking)
-            for k, v in batch.items()
-        }
-    if isinstance(batch, (tuple, list)):
-        return type(batch)(
-            todevice(x, device, callback=callback, non_blocking=non_blocking)
-            for x in batch
-        )
-    x = batch
-    if device == "numpy":
-        if isinstance(x, torch.Tensor):
-            x = x.detach().cpu().numpy()
-    elif x is not None:
-        if isinstance(x, np.ndarray):
-            x = torch.from_numpy(x)
-        if torch.is_tensor(x):
-            x = x.to(device, non_blocking=non_blocking)
-    return x
-to_device = todevice  # alias
-def to_numpy(x):
-    """Convert every tensor found in ``x`` to numpy; shorthand for ``todevice(x, "numpy")``."""
-    converted = todevice(x, "numpy")
-    return converted
-def geotrf(Trf, pts, ncol=None, norm=False):
-    """Apply a geometric transformation to a set of points.
-    Trf: transformation matrix or batch of matrices (numpy or torch, ndim >= 2).
-         Its last dimension may equal the point dimension (linear map) or the
-         point dimension + 1 (linear part plus translation).
-    pts: numpy/torch/tuple of coordinates; converted to the array type of Trf.
-    ncol: int. number of columns kept in the result (defaults to pts.shape[-1]).
-    norm: float. if != 0, the result is divided by its last coordinate and,
-          when norm != 1, multiplied by norm.
-    Returns the transformed points with the leading shape of pts and ncol
-    trailing columns.
-    """
-    assert Trf.ndim >= 2
-    # Bring pts to the same array library as Trf.
-    if isinstance(Trf, np.ndarray):
-        pts = np.asarray(pts)
-    elif isinstance(Trf, torch.Tensor):
-        pts = torch.as_tensor(pts, dtype=Trf.dtype)
-    # Leading shape to restore on the way out.
-    output_reshape = pts.shape[:-1]
-    ncol = ncol or pts.shape[-1]
-    # Dedicated torch path: one matrix per batch element, 4-D points.
-    if (
-        isinstance(Trf, torch.Tensor)
-        and isinstance(pts, torch.Tensor)
-        and Trf.ndim == 3
-        and pts.ndim == 4
-    ):
-        d = pts.shape[3]
-        if Trf.shape[-1] == d:
-            pts = torch.einsum("bij, bhwj -> bhwi", Trf, pts)
-        elif Trf.shape[-1] == d + 1:
-            # Linear block applied by einsum, last column added as translation.
-            pts = (
-                torch.einsum("bij, bhwj -> bhwi", Trf[:, :d, :d], pts)
-                + Trf[:, None, None, :d, d]
-            )
-        else:
-            raise ValueError(f"bad shape, not ending with 3 or 4, for {pts.shape=}")
-    else:
-        # Generic path: flatten the batch dimensions of Trf and make pts
-        # (batch, n_points, dim) so that a plain matmul applies.
-        if Trf.ndim >= 3:
-            n = Trf.ndim - 2
-            assert Trf.shape[:n] == pts.shape[:n], "batch size does not match"
-            Trf = Trf.reshape(-1, Trf.shape[-2], Trf.shape[-1])
-            if pts.ndim > Trf.ndim:
-                pts = pts.reshape(Trf.shape[0], -1, pts.shape[-1])
-            elif pts.ndim == 2:
-                pts = pts[:, None, :]
-        if pts.shape[-1] + 1 == Trf.shape[-1]:
-            # Matrix is one column wider than the points: the last row of the
-            # transposed matrix is added as an offset.
-            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
-            pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
-        elif pts.shape[-1] == Trf.shape[-1]:
-            Trf = Trf.swapaxes(-1, -2)  # transpose Trf
-            pts = pts @ Trf
-        else:
-            pts = Trf @ pts.T
-            if pts.ndim >= 2:
-                pts = pts.swapaxes(-1, -2)
-    if norm:
-        pts = pts / pts[..., -1:]  # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
-        if norm != 1:
-            pts *= norm
-    res = pts[..., :ncol].reshape(*output_reshape, ncol)
-    return res
-def inv(mat):
-    """Return the inverse of a torch tensor or numpy array.
-
-    Raises ValueError for any other input type.
-    """
-    backends = (
-        (torch.Tensor, torch.linalg.inv),
-        (np.ndarray, np.linalg.inv),
-    )
-    for array_type, invert in backends:
-        if isinstance(mat, array_type):
-            return invert(mat)
-    raise ValueError(f"bad matrix type = {type(mat)}")
-def reproject_view(pts3d, view2):
-    """Project ``pts3d`` into ``view2`` using its intrinsics and inverted pose.
-
-    The target grid size is taken from the first two dimensions of
-    ``view2["pts3d"]``. Returns whatever ``reproject`` returns.
-    """
-    target_shape = view2["pts3d"].shape[:2]
-    intrinsics = view2["camera_intrinsics"]
-    world2cam = inv(view2["camera_pose"])
-    return reproject(pts3d, intrinsics, world2cam, target_shape)
-def reproject(pts3d, K, world2cam, shape):
-    """Project an ``(H, W, 3)`` point map and return flat pixel indices.
-
-    Returns ``((H, W), indices)`` where ``indices`` are the projected
-    positions raveled by ``ravel_xy`` into a grid of size ``shape``.
-    Division warnings raised during the projection are silenced.
-    """
-    height, width, n_coords = pts3d.shape
-    assert n_coords == 3
-    with np.errstate(divide="ignore", invalid="ignore"):
-        projection = K @ world2cam[:3]
-        pixels = geotrf(projection, pts3d, norm=1, ncol=2)
-    flat_indices = ravel_xy(pixels, shape)
-    return (height, width), flat_indices
-def ravel_xy(pos, shape):
-    """Convert (x, y) positions to flat indices ``x + W * y`` of an ``(H, W)`` grid.
-
-    Positions are rounded, cast to int32 and clamped to the grid before
-    being combined.
-    """
-    height, width = shape
-    with np.errstate(invalid="ignore"):
-        xy = pos.reshape(-1, 2).round().astype(np.int32)
-    cols = xy[:, 0]
-    rows = xy[:, 1]
-    # Clamp in place on the freshly created integer array.
-    np.clip(cols, 0, width - 1, out=cols)
-    np.clip(rows, 0, height - 1, out=rows)
-    return cols + width * rows
-def unravel_xy(pos, shape):
-    """Convert flat indices of an ``(H, W)`` grid back to (x, y) positions.
-
-    Returns an array whose last axis holds ``(x, y)``, i.e. (column, row).
-    """
-    # Build the result from the documented return value of unravel_index
-    # (a tuple of per-dimension index arrays) instead of reaching into the
-    # `.base` buffer of its first element, whose layout is not part of
-    # numpy's API.
-    rows, cols = np.unravel_index(pos, shape)
-    return np.stack((cols, rows), axis=-1)
-def reciprocal_1d(corres_1_to_2, corres_2_to_1, ret_recip=False):
-    """Find indices whose correspondence maps back onto themselves.
-
-    Returns ``(pos1, pos2)``: the reciprocal indices in the first set and
-    their matches in the second. With ``ret_recip`` the boolean reciprocity
-    mask is returned first.
-    """
-    round_trip = corres_2_to_1[corres_1_to_2]
-    mutual = round_trip == np.arange(len(corres_1_to_2))
-    idx1 = mutual.nonzero()[0]
-    idx2 = corres_1_to_2[idx1]
-    if not ret_recip:
-        return idx1, idx2
-    return mutual, idx1, idx2
-def extract_correspondences_from_pts3d(
-    view1, view2, target_n_corres, rng=np.random, ret_xy=True, nneg=0
-):
-    """Sample pixel correspondences between two views from their 3D points.
-
-    Each view's points are reprojected into the other view; pixels whose
-    correspondence maps back onto themselves are positives. Negatives are
-    drawn from the non-reciprocal pixels of each view.
-
-    Args:
-        view1, view2: view dicts; converted to numpy before use.
-        target_n_corres: total number of correspondences to return, or
-            ``None`` to return every reciprocal correspondence.
-        rng: object providing ``permutation`` and ``choice``.
-        ret_xy: return (x, y) positions instead of flat pixel indices.
-        nneg: fraction of ``target_n_corres`` reserved for negatives.
-
-    Returns:
-        ``(pos1, pos2)`` when ``target_n_corres`` is ``None``, otherwise
-        ``(pos1, pos2, valid)`` where ``valid`` is true for positives and
-        false for negatives.
-    """
-    view1, view2 = to_numpy((view1, view2))
-    shape1, corres1_to_2 = reproject_view(view1["pts3d"], view2)
-    shape2, corres2_to_1 = reproject_view(view2["pts3d"], view1)
-    is_reciprocal1, pos1, pos2 = reciprocal_1d(
-        corres1_to_2, corres2_to_1, ret_recip=True
-    )
-    # Same reciprocity test, seen from the second view.
-    is_reciprocal2 = corres1_to_2[corres2_to_1] == np.arange(len(corres2_to_1))
-    if target_n_corres is None:
-        if ret_xy:
-            pos1 = unravel_xy(pos1, shape1)
-            pos2 = unravel_xy(pos2, shape2)
-        return pos1, pos2
-    # Negatives can only come from pixels that are non-reciprocal in both views.
-    available_negatives = min((~is_reciprocal1).sum(), (~is_reciprocal2).sum())
-    target_n_positives = int(target_n_corres * (1 - nneg))
-    n_positives = min(len(pos1), target_n_positives)
-    n_negatives = min(target_n_corres - n_positives, available_negatives)
-    # Not enough negatives: make up the difference with extra positives.
-    if n_negatives + n_positives != target_n_corres:
-        n_positives = target_n_corres - n_negatives
-        assert n_positives <= len(pos1)
-    assert n_positives <= len(pos1)
-    assert n_positives <= len(pos2)
-    assert n_negatives <= (~is_reciprocal1).sum()
-    assert n_negatives <= (~is_reciprocal2).sum()
-    assert n_positives + n_negatives == target_n_corres
-    valid = np.ones(n_positives, dtype=bool)
-    # Keep a random subset of the positives when there are too many.
-    if n_positives < len(pos1):
-        perm = rng.permutation(len(pos1))[:n_positives]
-        pos1 = pos1[perm]
-        pos2 = pos2[perm]
-    if n_negatives > 0:
-        # Turn a boolean mask into sampling probabilities.
-        def norm(p):
-            return p / p.sum()
-        # Negatives are drawn independently in each view, without
-        # replacement, among that view's non-reciprocal pixels.
-        pos1 = np.r_[
-            pos1,
-            rng.choice(
-                shape1[0] * shape1[1],
-                size=n_negatives,
-                replace=False,
-                p=norm(~is_reciprocal1),
-            ),
-        ]
-        pos2 = np.r_[
-            pos2,
-            rng.choice(
-                shape2[0] * shape2[1],
-                size=n_negatives,
-                replace=False,
-                p=norm(~is_reciprocal2),
-            ),
-        ]
-        valid = np.r_[valid, np.zeros(n_negatives, dtype=bool)]
-    if ret_xy:
-        pos1 = unravel_xy(pos1, shape1)
-        pos2 = unravel_xy(pos2, shape2)
-    return pos1, pos2, valid

FastVGGT/eval/dataset_utils/cropping.py DELETED Viewed

@@ -1,140 +0,0 @@
-# Copyright (C) 2024-present Naver Corporation. All rights reserved.
-# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
-#
-# --------------------------------------------------------
-import PIL.Image
-import os
-from utils import colmap_to_opencv_intrinsics, opencv_to_colmap_intrinsics
-os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
-import cv2  # noqa
-import numpy as np  # noqa
-# Resampling filters: use the ``PIL.Image.Resampling`` namespace when this
-# Pillow build provides it, otherwise the constants directly on ``PIL.Image``.
-if hasattr(PIL.Image, "Resampling"):
-    lanczos = PIL.Image.Resampling.LANCZOS
-    bicubic = PIL.Image.Resampling.BICUBIC
-else:
-    lanczos = PIL.Image.LANCZOS
-    bicubic = PIL.Image.BICUBIC
-class ImageList:
-    """Apply the same PIL operation to a whole group of images.
-    A lone image is wrapped into a one-element list, and every element that is
-    not already a ``PIL.Image.Image`` is converted with ``PIL.Image.fromarray``.
-    """
-    def __init__(self, images):
-        if not isinstance(images, (tuple, list, set)):
-            images = [images]
-        self.images = [
-            item if isinstance(item, PIL.Image.Image) else PIL.Image.fromarray(item)
-            for item in images
-        ]
-    def __len__(self):
-        return len(self.images)
-    def to_pil(self):
-        """Return the single held image, or a tuple when several are held."""
-        if len(self.images) > 1:
-            return tuple(self.images)
-        return self.images[0]
-    @property
-    def size(self):
-        """Size shared by all held images (asserts that they all agree)."""
-        sizes = [im.size for im in self.images]
-        reference = sizes[0]
-        assert all(reference == current for current in sizes)
-        return reference
-    def resize(self, *args, **kwargs):
-        """Resize every image with the given arguments; returns a new ImageList."""
-        return ImageList(self._dispatch("resize", *args, **kwargs))
-    def crop(self, *args, **kwargs):
-        """Crop every image with the given arguments; returns a new ImageList."""
-        return ImageList(self._dispatch("crop", *args, **kwargs))
-    def _dispatch(self, func, *args, **kwargs):
-        # Look the method up by name on each image and call it with identical arguments.
-        results = []
-        for im in self.images:
-            bound_method = getattr(im, func)
-            results.append(bound_method(*args, **kwargs))
-        return results
-def rescale_image_depthmap(
-    image, depthmap, camera_intrinsics, output_resolution, force=True
-):
-    """Jointly rescale an image (or group of images), its depthmap and intrinsics.
-    The scale factor is the larger of the width and height ratios between
-    ``output_resolution`` and the input size, so the rescaled (width, height)
-    is at least ``output_resolution`` in both dimensions.
-    Args:
-        image: a single image or a tuple/list of images accepted by ``ImageList``.
-        depthmap: array whose first two dimensions are (H, W), or ``None``.
-        camera_intrinsics: camera matrix forwarded to ``camera_matrix_of_crop``.
-        output_resolution: requested (width, height).
-        force: when False, inputs that would need upscaling are returned unchanged.
-    Returns:
-        Tuple ``(image, depthmap, camera_intrinsics)`` after rescaling.
-    """
-    image = ImageList(image)
-    input_resolution = np.array(image.size)  # (W, H)
-    output_resolution = np.array(output_resolution)
-    if depthmap is not None:
-        assert tuple(depthmap.shape[:2]) == image.size[::-1]
-    assert output_resolution.shape == (2,)
-    # The epsilon keeps floor() below from losing a pixel to rounding error.
-    scale_final = max(output_resolution / image.size) + 1e-8
-    if scale_final >= 1 and not force:  # image is already smaller than what is asked
-        return (image.to_pil(), depthmap, camera_intrinsics)
-    output_resolution = np.floor(input_resolution * scale_final).astype(int)
-    # PIL and OpenCV both take the target size as a plain (width, height) pair.
-    target_size = tuple(int(v) for v in output_resolution)
-    image = image.resize(
-        target_size, resample=lanczos if scale_final < 1 else bicubic
-    )
-    if depthmap is not None:
-        # With an explicit dsize the fx/fy factors are not used, so only dsize is
-        # passed. Nearest-neighbour avoids blending depths across object edges.
-        depthmap = cv2.resize(
-            depthmap,
-            target_size,
-            interpolation=cv2.INTER_NEAREST,
-        )
-    camera_intrinsics = camera_matrix_of_crop(
-        camera_intrinsics, input_resolution, output_resolution, scaling=scale_final
-    )
-    return image.to_pil(), depthmap, camera_intrinsics
-def camera_matrix_of_crop(
-    input_camera_matrix,
-    input_resolution,
-    output_resolution,
-    scaling=1,
-    offset_factor=0.5,
-    offset=None,
-):
-    """Return the camera matrix after scaling the image and cropping it.
-    The image is scaled by ``scaling`` and cropped to ``output_resolution``.
-    When ``offset`` is None the crop origin is ``offset_factor`` times the
-    leftover margin, i.e. a centred crop for the default of 0.5. The scaling
-    and shift are applied between ``opencv_to_colmap_intrinsics`` and
-    ``colmap_to_opencv_intrinsics`` (presumably a pixel-centre convention
-    change; confirm in ``utils``).
-    """
-    scaled_resolution = np.asarray(input_resolution) * scaling
-    margins = scaled_resolution - output_resolution
-    # The crop must fit inside the scaled image.
-    assert np.all(margins >= 0.0)
-    crop_offset = offset_factor * margins if offset is None else offset
-    colmap_matrix = opencv_to_colmap_intrinsics(input_camera_matrix)
-    colmap_matrix[:2, :] *= scaling
-    colmap_matrix[:2, 2] -= crop_offset
-    return colmap_to_opencv_intrinsics(colmap_matrix)
-def crop_image_depthmap(image, depthmap, camera_intrinsics, crop_bbox):
-    """Crop an image (or group of images), its depthmap and intrinsics together.
-    Args:
-        image: a single image or a tuple/list of images accepted by ``ImageList``.
-        depthmap: array indexed as [row, column], or ``None`` when there is no
-            depth (mirrors ``rescale_image_depthmap``, which also accepts None).
-        camera_intrinsics: array whose [0, 2] and [1, 2] entries are shifted by
-            the crop origin; the input array is copied, not modified.
-        crop_bbox: ``(left, top, right, bottom)`` in pixels.
-    Returns:
-        Tuple ``(image, depthmap, camera_intrinsics)`` for the cropped view.
-    """
-    image = ImageList(image)
-    left, top, right, bottom = crop_bbox
-    image = image.crop((left, top, right, bottom))
-    if depthmap is not None:
-        depthmap = depthmap[top:bottom, left:right]
-    camera_intrinsics = camera_intrinsics.copy()
-    camera_intrinsics[0, 2] -= left
-    camera_intrinsics[1, 2] -= top
-    return image.to_pil(), depthmap, camera_intrinsics
-def bbox_from_intrinsics_in_out(
-    input_camera_matrix, output_camera_matrix, output_resolution
-):
-    """Derive the crop box implied by two camera matrices.
-    The crop origin is the rounded difference between the [:2, 2] entries of
-    the input and output matrices; the box then spans ``output_resolution``
-    (width, height) from that origin.
-    Returns:
-        ``(left, top, right, bottom)``.
-    """
-    out_width, out_height = output_resolution
-    shift = input_camera_matrix[:2, 2] - output_camera_matrix[:2, 2]
-    left, top = np.int32(np.round(shift))
-    return (left, top, left + out_width, top + out_height)

FastVGGT/eval/dataset_utils/transforms.py DELETED Viewed

@@ -1,78 +0,0 @@
-# Copyright (C) 2024-present Naver Corporation. All rights reserved.
-# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
-#
-# --------------------------------------------------------
-import torchvision.transforms as tvf
-# Converts an image to a tensor, then applies Normalize with (0.5, 0.5, 0.5)
-# for both of its arguments (one value per channel).
-ImgNorm = tvf.Compose([tvf.ToTensor(), tvf.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
-# Random colour jitter (arguments 0.5, 0.5, 0.5, 0.1) followed by ImgNorm.
-ColorJitter = tvf.Compose([tvf.ColorJitter(0.5, 0.5, 0.5, 0.1), ImgNorm])
-def _check_input(value, center=1, bound=(0, float("inf")), clip_first_on_zero=True):
-    """Turn a jitter strength into a validated ``(low, high)`` sampling range.
-    Args:
-        value: a non-negative number, expanded to ``[center - value, center + value]``,
-            or an explicit two-element list/tuple ``(low, high)``.
-        center: midpoint used when ``value`` is a single number.
-        bound: inclusive ``(min, max)`` limits the range must lie within.
-        clip_first_on_zero: clamp the lower end to 0.0 when ``value`` is a number.
-    Returns:
-        ``(low, high)`` as a tuple of floats, or ``None`` when both ends equal
-        ``center`` (the adjustment would be a no-op).
-    Raises:
-        ValueError: ``value`` is a negative number, or the range is unordered
-            or falls outside ``bound``.
-        TypeError: ``value`` is neither a number nor a two-element list/tuple.
-    """
-    if isinstance(value, (int, float)):
-        if value < 0:
-            raise ValueError("If value is a single number, it must be non negative.")
-        value = [center - float(value), center + float(value)]
-        if clip_first_on_zero:
-            value[0] = max(value[0], 0.0)
-    elif isinstance(value, (tuple, list)) and len(value) == 2:
-        value = [float(value[0]), float(value[1])]
-    else:
-        raise TypeError(
-            "value should be a single number or a list/tuple with length 2."
-        )
-    if not bound[0] <= value[0] <= value[1] <= bound[1]:
-        raise ValueError(f"values should be between {bound}, but got {value}.")
-    if value[0] == value[1] == center:
-        return None
-    return tuple(value)
-import torch
-import torchvision.transforms.functional as F
-def SeqColorJitter():
-    """Build a colour-jitter callable whose random parameters are drawn once.
-    The order of the four adjustments and their factors are sampled when this
-    function is called, so every image passed to the returned callable gets the
-    same jitter. The callable finishes by applying ``ImgNorm``.
-    """
-    ranges = (
-        _check_input(0.5),  # brightness
-        _check_input(0.5),  # contrast
-        _check_input(0.5),  # saturation
-        _check_input(0.1, center=0, bound=(-0.5, 0.5), clip_first_on_zero=False),  # hue
-    )
-    order = torch.randperm(4)
-    def _draw(limits):
-        if limits is None:
-            return None
-        return float(torch.empty(1).uniform_(limits[0], limits[1]))
-    # Drawn in brightness, contrast, saturation, hue order, after the permutation.
-    factors = [_draw(limits) for limits in ranges]
-    adjusters = (
-        F.adjust_brightness,
-        F.adjust_contrast,
-        F.adjust_saturation,
-        F.adjust_hue,
-    )
-    def _color_jitter(img):
-        for op in order:
-            which = int(op)
-            if factors[which] is not None:
-                img = adjusters[which](img, factors[which])
-        return ImgNorm(img)
-    return _color_jitter

FastVGGT/eval/eval_7andN.py DELETED Viewed

@@ -1,497 +0,0 @@
-import os
-import sys
-# Ensure project root is on sys.path for absolute imports like `vggt.*`
-ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
-if ROOT_DIR not in sys.path:
-    sys.path.insert(0, ROOT_DIR)
-import time
-import torch
-import argparse
-import numpy as np
-import open3d as o3d
-import os.path as osp
-from torch.utils.data import DataLoader
-from torch.utils.data._utils.collate import default_collate
-from tqdm import tqdm
-from collections import defaultdict
-import torchvision.transforms as transforms
-def get_args_parser():
-    """Create the command-line parser for the 3D reconstruction evaluation.
-    The parser is built with ``add_help=False``, so it defines no ``-h`` option
-    of its own.
-    """
-    parser = argparse.ArgumentParser("3D Reconstruction evaluation", add_help=False)
-    add = parser.add_argument
-    # Model and runtime
-    add(
-        "--ckpt_path",
-        type=str,
-        default="/home/sy/code/FastVGGT/ckpt/model_tracker_fixed_e20.pt",
-        help="ckpt name",
-    )
-    add("--device", type=str, default="cuda:0", help="device")
-    add("--model_name", type=str, default="VGGT")
-    add("--conf_thresh", type=float, default=0.0, help="confidence threshold")
-    # Output location
-    add(
-        "--output_dir",
-        type=str,
-        default="/home/sy/code/FastVGGT/eval_results",
-        help="value for outdir",
-    )
-    # Evaluation protocol
-    add("--size", type=int, default=518)
-    add("--revisit", type=int, default=1, help="revisit times")
-    add("--freeze", action="store_true")
-    add("--use_proj", action="store_true")
-    add("--merging", type=int, default=0, help="VGGT aggregator merging steps")
-    add("--kf", type=int, default=2, help="key frame")
-    return parser
-def _umeyama_alignment(src: np.ndarray, dst: np.ndarray, with_scale: bool = True):
-    """Least-squares similarity transform mapping ``src`` points onto ``dst``.
-    Both inputs are (N, dim) arrays with matching rows. Returns ``(s, R, t)``
-    such that ``s * R @ src_i + t`` approximates ``dst_i``.
-    """
-    assert src.shape == dst.shape
-    N, dim = src.shape
-    mu_src = src.mean(axis=0)
-    mu_dst = dst.mean(axis=0)
-    src_c = src - mu_src
-    dst_c = dst - mu_dst
-    Sigma = dst_c.T @ src_c / N
-    U, D, Vt = np.linalg.svd(Sigma)
-    S = np.eye(dim)
-    # Flip the last axis when needed so R is a proper rotation (det = +1).
-    if np.linalg.det(U) * np.linalg.det(Vt) < 0:
-        S[-1, -1] = -1
-    R = U @ S @ Vt
-    if with_scale:
-        var_src = (src_c**2).sum() / N
-        s = (D * S.diagonal()).sum() / var_src
-    else:
-        s = 1.0
-    t = mu_dst - s * R @ mu_src
-    return s, R, t
-def main(args):
-    """Evaluate VGGT 3D reconstruction on the 7-Scenes and NRGBD test splits.
-    For every sequence: run the model, collect predicted and ground-truth
-    points from ``criterion.get_all_pts3d_t``, centre-crop each view to
-    224x224, optionally apply a Umeyama alignment (``--use_proj``), refine with
-    ICP, and compute accuracy / completion / normal-consistency. One line per
-    sequence is appended to ``<output_dir>/<kf>/<dataset>/logs.txt``; after each
-    dataset that file is re-parsed and rewritten, together with per-scene mean
-    inference times and the mean of every metric, to ``logs_all.txt``.
-    Args:
-        args: namespace produced by ``get_args_parser()``.
-    Raises:
-        NotImplementedError: ``args.size`` is not 224, 512 or 518.
-        TypeError: a collated sample is not a list of view dicts holding "img".
-        KeyError: the model output has neither "pose_enc" nor "world_points".
-    """
-    from data import SevenScenes, NRGBD
-    from utils import accuracy, completion
-    if args.size == 512:
-        resolution = (512, 384)
-    elif args.size == 224:
-        resolution = 224
-    elif args.size == 518:
-        resolution = (518, 392)
-    else:
-        raise NotImplementedError
-    datasets_all = {
-        "7scenes": SevenScenes(
-            split="test",
-            ROOT="/data/sy/7scenes",
-            resolution=resolution,
-            num_seq=1,
-            full_video=True,
-            kf_every=args.kf,
-        ),
-        "NRGBD": NRGBD(
-            split="test",
-            ROOT="/data/sy/neural_rgbd_data",
-            resolution=resolution,
-            num_seq=1,
-            full_video=True,
-            kf_every=args.kf,
-        ),
-    }
-    device = args.device
-    from vggt.models.vggt import VGGT
-    from criterion import Regr3D_t_ScaleShiftInv, L21
-    model = VGGT(merging=args.merging, enable_point=True)
-    ckpt = torch.load(args.ckpt_path, map_location="cpu")
-    # Use strict=False due to enable_point=True difference
-    model.load_state_dict(ckpt, strict=False)
-    # Put the model on the same device the inputs are moved to below.
-    model = model.to(device).eval()
-    model = model.to(torch.bfloat16)
-    del ckpt
-    run_dir = osp.join(args.output_dir, f"{args.kf}")
-    os.makedirs(run_dir, exist_ok=True)
-    criterion = Regr3D_t_ScaleShiftInv(L21, norm_mode=False, gt_scale=True)
-    # Per-view entries that are left as-is instead of being moved to the device.
-    ignore_keys = {
-        "depthmap",
-        "dataset",
-        "label",
-        "instance",
-        "idx",
-        "true_shape",
-        "rng",
-    }
-    with torch.no_grad():
-        for name_data, dataset in datasets_all.items():
-            save_path = osp.join(run_dir, name_data)
-            os.makedirs(save_path, exist_ok=True)
-            # NOTE: logs.txt is opened in append mode, so lines from earlier
-            # runs into the same directory are included in the summary.
-            log_file = osp.join(save_path, "logs.txt")
-            scene_infer_times = defaultdict(list)
-            for data_idx in tqdm(range(len(dataset))):
-                batch = default_collate([dataset[data_idx]])
-                if not (
-                    isinstance(batch, list)
-                    and all(isinstance(v, dict) and "img" in v for v in batch)
-                ):
-                    raise TypeError(
-                        "expected each sample to collate into a list of view dicts with an 'img' entry"
-                    )
-                for view in batch:
-                    for name in view.keys():
-                        if name in ignore_keys:
-                            continue
-                        if isinstance(view[name], (tuple, list)):
-                            view[name] = [
-                                x.to(device, non_blocking=True) for x in view[name]
-                            ]
-                        else:
-                            view[name] = view[name].to(device, non_blocking=True)
-                dtype = (
-                    torch.bfloat16
-                    if torch.cuda.get_device_capability()[0] >= 8
-                    else torch.float16
-                )
-                with torch.cuda.amp.autocast(dtype=dtype):
-                    # Shift images by +1 and halve (maps [-1, 1] onto [0, 1]).
-                    for view in batch:
-                        view["img"] = (view["img"] + 1.0) / 2.0
-                    # Gather all `img` tensors into a single tensor along dim 0
-                    imgs_tensor = torch.cat([v["img"] for v in batch], dim=0)
-                    # Synchronise around the forward pass so the wall-clock
-                    # time covers the GPU work and not just the kernel launch.
-                    torch.cuda.synchronize()
-                    start = time.time()
-                    predictions = model(imgs_tensor)
-                    torch.cuda.synchronize()
-                    end = time.time()
-                    inference_time_ms = (end - start) * 1000
-                    print(f"Inference time: {inference_time_ms:.2f}ms")
-                    if "pose_enc" in predictions:
-                        _, S = predictions["pose_enc"].shape[:2]
-                    elif "world_points" in predictions:
-                        _, S = predictions["world_points"].shape[:2]
-                    else:
-                        raise KeyError(
-                            "predictions is missing a key to infer sequence length"
-                        )
-                    # Split the model output into one dict per view so it lines up with batch.
-                    preds = []
-                    for s in range(S):
-                        res = {
-                            "pts3d_in_other_view": predictions["world_points"][:, s],
-                            "conf": predictions["world_points_conf"][:, s],
-                            "depth": predictions["depth"][:, s],
-                            "depth_conf": predictions["depth_conf"][:, s],
-                            "camera_pose": predictions["pose_enc"][:, s, :],
-                        }
-                        if s < len(batch) and "valid_mask" in batch[s]:
-                            res["valid_mask"] = batch[s]["valid_mask"]
-                        if "track" in predictions:
-                            res.update(
-                                {
-                                    "track": predictions["track"][:, s],
-                                    "vis": (
-                                        predictions["vis"][:, s]
-                                        if "vis" in predictions
-                                        else None
-                                    ),
-                                    "track_conf": (
-                                        predictions["conf"][:, s]
-                                        if "conf" in predictions
-                                        else None
-                                    ),
-                                }
-                            )
-                        preds.append(res)
-                    # With revisiting, only the last pass over the sequence is evaluated.
-                    valid_length = len(preds) // args.revisit
-                    if args.revisit > 1:
-                        preds = preds[-valid_length:]
-                        batch = batch[-valid_length:]
-                    print(f"Evaluation for {name_data} {data_idx+1}/{len(dataset)}")
-                    gt_pts, pred_pts, _gt_factor, _pr_factor, _masks, _monitoring = (
-                        criterion.get_all_pts3d_t(batch, preds)
-                    )
-                    pts_all = []
-                    pts_gt_all = []
-                    images_all = []
-                    masks_all = []
-                    for j, view in enumerate(batch):
-                        image = view["img"].permute(0, 2, 3, 1).cpu().numpy()[0]
-                        valid = view["valid_mask"].cpu().numpy()[0]
-                        pts = pred_pts[j].cpu().numpy()[0]
-                        pts_gt = gt_pts[j].detach().cpu().numpy()[0]
-                        # Keep only the central 224x224 window of every view.
-                        H, W = image.shape[:2]
-                        cx = W // 2
-                        cy = H // 2
-                        left, top = cx - 112, cy - 112
-                        right, bottom = cx + 112, cy + 112
-                        image = image[top:bottom, left:right]
-                        valid = valid[top:bottom, left:right]
-                        pts = pts[top:bottom, left:right]
-                        pts_gt = pts_gt[top:bottom, left:right]
-                        images_all.append(image[None, ...])
-                        pts_all.append(pts[None, ...])
-                        pts_gt_all.append(pts_gt[None, ...])
-                        masks_all.append(valid[None, ...])
-                images_all = np.concatenate(images_all, axis=0)
-                pts_all = np.concatenate(pts_all, axis=0)
-                pts_gt_all = np.concatenate(pts_gt_all, axis=0)
-                masks_all = np.concatenate(masks_all, axis=0)
-                scene_id = batch[-1]["label"][0].rsplit("/", 1)[0]
-                # Record inference time per scene
-                scene_infer_times[scene_id].append(float(inference_time_ms))
-                keep = masks_all > 0
-                pts_all_masked = pts_all[keep].reshape(-1, 3)
-                pts_gt_all_masked = pts_gt_all[keep].reshape(-1, 3)
-                images_all_masked = images_all[keep].reshape(-1, 3)
-                # Drop whole points (rows) with any non-finite coordinate; an
-                # element-wise mask would misalign x/y/z after flattening.
-                finite_pred = np.isfinite(pts_all_masked).all(axis=-1)
-                finite_gt = np.isfinite(pts_gt_all_masked).all(axis=-1)
-                pts_all_masked = pts_all_masked[finite_pred]
-                images_all_masked = images_all_masked[finite_pred]
-                pts_gt_all_masked = pts_gt_all_masked[finite_gt]
-                # If number of points exceeds threshold, sample by points
-                if pts_all_masked.shape[0] > 999999:
-                    sample_indices = np.random.choice(
-                        pts_all_masked.shape[0], 999999, replace=False
-                    )
-                    pts_all_masked = pts_all_masked[sample_indices]
-                    images_all_masked = images_all_masked[sample_indices]
-                # The GT cloud is subsampled independently of the prediction.
-                if pts_gt_all_masked.shape[0] > 999999:
-                    sample_indices_gt = np.random.choice(
-                        pts_gt_all_masked.shape[0], 999999, replace=False
-                    )
-                    pts_gt_all_masked = pts_gt_all_masked[sample_indices_gt]
-                if args.use_proj:
-                    # NOTE(review): the alignment needs row-wise correspondence;
-                    # it asserts equal shapes, which the independent filtering
-                    # and sampling above do not guarantee.
-                    scale, rot, trans = _umeyama_alignment(
-                        pts_all_masked, pts_gt_all_masked, with_scale=True
-                    )
-                    pts_all_masked = (scale * (rot @ pts_all_masked.T)).T + trans
-                pcd = o3d.geometry.PointCloud()
-                pcd.points = o3d.utility.Vector3dVector(pts_all_masked)
-                pcd.colors = o3d.utility.Vector3dVector(images_all_masked)
-                pcd_gt = o3d.geometry.PointCloud()
-                pcd_gt.points = o3d.utility.Vector3dVector(pts_gt_all_masked)
-                # NOTE(review): these colours follow the predicted cloud's
-                # filtering, so their count may differ from the GT points.
-                pcd_gt.colors = o3d.utility.Vector3dVector(images_all_masked)
-                trans_init = np.eye(4)
-                threshold = 0.1
-                reg_p2p = o3d.pipelines.registration.registration_icp(
-                    pcd,
-                    pcd_gt,
-                    threshold,
-                    trans_init,
-                    o3d.pipelines.registration.TransformationEstimationPointToPoint(),
-                )
-                pcd = pcd.transform(reg_p2p.transformation)
-                pcd.estimate_normals()
-                pcd_gt.estimate_normals()
-                gt_normal = np.asarray(pcd_gt.normals)
-                pred_normal = np.asarray(pcd.normals)
-                acc, acc_med, nc1, nc1_med = accuracy(
-                    pcd_gt.points, pcd.points, gt_normal, pred_normal
-                )
-                comp, comp_med, nc2, nc2_med = completion(
-                    pcd_gt.points, pcd.points, gt_normal, pred_normal
-                )
-                # This exact format is parsed back by the module-level `regex`.
-                summary = f"Idx: {scene_id}, Acc: {acc}, Comp: {comp}, NC1: {nc1}, NC2: {nc2} - Acc_med: {acc_med}, Compc_med: {comp_med}, NC1c_med: {nc1_med}, NC2c_med: {nc2_med}"
-                print(summary)
-                with open(log_file, "a") as f_log:
-                    print(summary, file=f_log)
-                # release cuda memory
-                torch.cuda.empty_cache()
-            # Re-read the per-sequence log and aggregate it.
-            to_write = ""
-            if os.path.exists(log_file):
-                with open(log_file, "r") as f_sub:
-                    to_write += f_sub.read()
-            metrics = defaultdict(list)
-            for line in to_write.strip().split("\n"):
-                match = regex.match(line)
-                if match:
-                    data = match.groupdict()
-                    # Exclude 'scene_id' from metrics as it's an identifier
-                    for key, value in data.items():
-                        if key != "scene_id":
-                            metrics[key].append(float(value))
-                    metrics["nc"].append(
-                        (float(data["nc1"]) + float(data["nc2"])) / 2
-                    )
-                    metrics["nc_med"].append(
-                        (float(data["nc1_med"]) + float(data["nc2_med"])) / 2
-                    )
-            mean_metrics = {
-                metric: sum(values) / len(values)
-                for metric, values in metrics.items()
-            }
-            c_name = "mean"
-            print_str = f"{c_name.ljust(20)}: "
-            for m_name in mean_metrics:
-                print_num = np.mean(mean_metrics[m_name])
-                print_str = print_str + f"{m_name}: {print_num:.3f} | "
-            print_str = print_str + "\n"
-            # Summarize per-scene average inference time
-            time_lines = []
-            for sid, times in scene_infer_times.items():
-                if len(times) > 0:
-                    time_lines.append(
-                        f"Idx: {sid}, Time_avg_ms: {np.mean(times):.2f}"
-                    )
-            time_block = "\n".join(time_lines) + (
-                "\n" if len(time_lines) > 0 else ""
-            )
-            # Everything is computed before the file is opened, so a parsing
-            # error cannot leave behind a truncated logs_all.txt.
-            with open(osp.join(save_path, f"logs_all.txt"), "w") as f:
-                f.write(to_write + time_block + print_str)
-from collections import defaultdict
-import re
-# Verbose regular expression for one per-sequence line of logs.txt as printed
-# by main(). The field labels (including "Compc_med", "NC1c_med", "NC2c_med")
-# must stay in sync with the f-string that writes the line. Each named group
-# captures everything up to the next comma.
-pattern = r"""
-    Idx:\s*(?P<scene_id>[^,]+),\s*
-    Acc:\s*(?P<acc>[^,]+),\s*
-    Comp:\s*(?P<comp>[^,]+),\s*
-    NC1:\s*(?P<nc1>[^,]+),\s*
-    NC2:\s*(?P<nc2>[^,]+)\s*-\s*
-    Acc_med:\s*(?P<acc_med>[^,]+),\s*
-    Compc_med:\s*(?P<comp_med>[^,]+),\s*
-    NC1c_med:\s*(?P<nc1_med>[^,]+),\s*
-    NC2c_med:\s*(?P<nc2_med>[^,]+)
-"""
-# re.VERBOSE makes the layout whitespace inside the pattern insignificant.
-regex = re.compile(pattern, re.VERBOSE)
-# Script entry point: parse the command line and run the evaluation.
-if __name__ == "__main__":
-    parser = get_args_parser()
-    args = parser.parse_args()
-    main(args)

FastVGGT/eval/eval_custom.py DELETED Viewed

@@ -1,467 +0,0 @@
-import argparse
-from pathlib import Path
-import numpy as np
-import torch
-import os
-import sys
-import matplotlib.pyplot as plt
-from scipy.spatial.transform import Rotation
-# Ensure project root is in sys.path for absolute imports like `vggt.*`
-ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
-if ROOT_DIR not in sys.path:
-    sys.path.insert(0, ROOT_DIR)
-from vggt.models.vggt import VGGT
-from vggt.utils.eval_utils import (
-    load_poses,
-    get_vgg_input_imgs,
-    get_sorted_image_paths,
-    build_frame_selection,
-    load_images_rgb,
-    infer_vggt_and_reconstruct,
-    evaluate_scene_and_save,
-)
-# Import pose visualization libraries (optional EVO support)
-try:
-    from evo.core.trajectory import PoseTrajectory3D
-    import evo.tools.plot as plot
-    EVO_AVAILABLE = True
-except ImportError:
-    # EVO is optional; we have a matplotlib-based fallback
-    EVO_AVAILABLE = False
-def visualize_predicted_poses(
-    all_cam_to_world_mat, frame_ids, output_scene_dir, scene_name="custom_dataset"
-):
-    """
-    Plot the predicted camera trajectory and save it as a PNG (no GT required).
-    Only the XZ-plane projection of the camera centres is drawn. The image is
-    written to ``<output_scene_dir>/predicted_trajectory.png``; the directory
-    is created if it does not exist yet. Any failure is reported on stdout
-    (with a traceback) instead of being raised, so visualization can never
-    abort the surrounding evaluation run.
-    Args:
-        all_cam_to_world_mat: Sequence of camera-to-world transforms; the
-            translation is read from ``[:3, 3]`` of each matrix.
-        frame_ids: List of frame IDs (kept for interface compatibility; not
-            used by the plot).
-        output_scene_dir: Output directory (``pathlib.Path`` or string).
-        scene_name: Scene name used in the plot title.
-    Returns:
-        None.
-    """
-    # Provide basic pose visualization even without EVO
-    if not EVO_AVAILABLE:
-        print("⚠️  EVO not installed; using basic matplotlib visualization")
-    fig = None
-    try:
-        # Convert to numpy array
-        poses_est = np.array(all_cam_to_world_mat)
-        if len(poses_est) < 2:
-            print("⚠️  Not enough poses to generate trajectory plot")
-            return
-        print(f"🎨 Generating pose trajectory visualization...")
-        # Extract translation part
-        positions = poses_est[:, :3, 3]  # shape: (N, 3)
-        # Create figure - show XZ-plane projection only
-        fig, ax = plt.subplots(1, 1, figsize=(10, 8))
-        ax.plot(
-            positions[:, 0],
-            positions[:, 2],
-            "b-",
-            linewidth=2,
-            label="Predicted Trajectory",
-        )
-        ax.scatter(
-            positions[0, 0], positions[0, 2], color="green", s=100, label="Start"
-        )
-        ax.scatter(positions[-1, 0], positions[-1, 2], color="red", s=100, label="End")
-        ax.set_xlabel("X (m)")
-        ax.set_ylabel("Z (m)")
-        ax.set_title(f"{scene_name} - XZ-plane projection")
-        ax.legend()
-        ax.grid(True, alpha=0.3)
-        # Accept plain strings as well as Path objects, and make sure the
-        # target directory exists before saving.
-        output_dir = Path(output_scene_dir)
-        output_dir.mkdir(parents=True, exist_ok=True)
-        pose_plot_path = output_dir / "predicted_trajectory.png"
-        # Save through the figure object so an unrelated "current" figure is
-        # never written by mistake.
-        fig.savefig(pose_plot_path, dpi=300, bbox_inches="tight")
-        print(f"📊 Trajectory visualization saved: {pose_plot_path}")
-    except Exception as e:
-        print(f"⚠️  Failed to generate pose visualization: {e}")
-        import traceback
-        traceback.print_exc()
-    finally:
-        # Always release the figure, even when plotting or saving failed.
-        if fig is not None:
-            plt.close(fig)
-def _build_arg_parser():
-    """Create the command-line parser for the custom-dataset evaluation script."""
-    parser = argparse.ArgumentParser(
-        description="Run FastVGGT evaluation on a Custom Dataset"
-    )
-    # Required: dataset path
-    parser.add_argument(
-        "--data_path",
-        type=Path,
-        required=True,
-        help="Dataset path containing an `images` subfolder "
-        "(plus `pose` and `gt_ply` when --enable_evaluation is set)",
-    )
-    # Optional: enable evaluation
-    parser.add_argument(
-        "--enable_evaluation",
-        action="store_true",
-        help="Enable evaluation (requires pose and ply data)",
-    )
-    # Output path
-    parser.add_argument(
-        "--output_path",
-        type=Path,
-        default="./eval_results_custom",
-        help="Output path for evaluation results",
-    )
-    # Model parameters
-    parser.add_argument(
-        "--ckpt_path",
-        type=str,
-        default="/home/sy/code/FastVGGT/ckpt/model_tracker_fixed_e20.pt",
-        help="Model checkpoint file path",
-    )
-    parser.add_argument("--merging", type=int, default=0, help="Merging parameter")
-    # Processing parameters
-    parser.add_argument(
-        "--input_frame",
-        type=int,
-        default=200,
-        help="Maximum number of frames to process per scene",
-    )
-    parser.add_argument(
-        "--depth_conf_thresh",
-        type=float,
-        default=3.0,
-        help="Depth confidence threshold to filter low-confidence depth values",
-    )
-    # Evaluation parameters (only used when evaluation is enabled)
-    parser.add_argument(
-        "--chamfer_max_dist",
-        type=float,
-        default=0.5,
-        help="Maximum distance threshold used in Chamfer Distance computation",
-    )
-    parser.add_argument("--plot", action="store_true", help="Whether to generate plots")
-    parser.add_argument(
-        "--vis_attn_map",
-        action="store_true",
-        help="Visualize attention maps during inference",
-    )
-    return parser
-def _write_point_cloud_ply(
-    points_output_path, all_world_points, all_point_colors, max_points=100000
-):
-    """
-    Merge per-frame point clouds and write them as an ASCII PLY file.
-    Non-finite points (NaN/Inf) are removed *before* the header is written so
-    that the declared ``element vertex`` count always equals the number of
-    vertex lines in the file. If more than ``max_points`` valid points remain,
-    a random subset of that size is kept.
-    Args:
-        points_output_path: Destination ``.ply`` path.
-        all_world_points: Sequence of per-frame point arrays, stacked row-wise.
-        all_point_colors: Matching per-frame colour arrays, or None/empty to
-            write geometry only.
-        max_points: Upper bound on the number of vertices written.
-    """
-    merged_point_cloud = np.vstack(all_world_points)
-    merged_colors = (
-        np.vstack(all_point_colors).astype(np.uint8)
-        if all_point_colors is not None and len(all_point_colors) > 0
-        else None
-    )
-    print(
-        f"📊 Merged point clouds: {len(all_world_points)} frames, total {len(merged_point_cloud)} points"
-    )
-    # np.isfinite rejects both NaN and +/-Inf in one pass.
-    finite_mask = np.isfinite(merged_point_cloud).all(axis=1)
-    merged_point_cloud = merged_point_cloud[finite_mask]
-    if merged_colors is not None:
-        merged_colors = merged_colors[finite_mask]
-    # If too many points, randomly sample max_points of them
-    if len(merged_point_cloud) > max_points:
-        print(f"🔽 Too many points, randomly sampling {max_points} points...")
-        indices = np.random.choice(
-            len(merged_point_cloud), size=max_points, replace=False
-        )
-        merged_point_cloud = merged_point_cloud[indices]
-        if merged_colors is not None:
-            merged_colors = merged_colors[indices]
-        print(f"✅ Sampling done, kept {len(merged_point_cloud)} points")
-    # Save as PLY (with color when available)
-    with open(points_output_path, "w") as f:
-        f.write("ply\n")
-        f.write("format ascii 1.0\n")
-        f.write(f"element vertex {len(merged_point_cloud)}\n")
-        f.write("property float x\n")
-        f.write("property float y\n")
-        f.write("property float z\n")
-        if merged_colors is not None:
-            f.write("property uchar red\n")
-            f.write("property uchar green\n")
-            f.write("property uchar blue\n")
-        f.write("end_header\n")
-        if merged_colors is None:
-            f.writelines(
-                f"{point[0]:.6f} {point[1]:.6f} {point[2]:.6f}\n"
-                for point in merged_point_cloud
-            )
-        else:
-            # Colours are already uint8, i.e. within [0, 255].
-            f.writelines(
-                f"{point[0]:.6f} {point[1]:.6f} {point[2]:.6f} "
-                f"{int(color[0])} {int(color[1])} {int(color[2])}\n"
-                for point, color in zip(merged_point_cloud, merged_colors)
-            )
-    print(f"💾 Point cloud saved to: {points_output_path}")
-def _save_reconstruction(
-    output_scene_dir, all_cam_to_world_mat, frame_ids, all_world_points, all_point_colors
-):
-    """Write estimated poses (text) and the merged point cloud (PLY) to disk."""
-    print(f"💾 Saving reconstruction...")
-    output_scene_dir.mkdir(parents=True, exist_ok=True)
-    # Save camera poses: one "# Frame <id>" header followed by the matrix rows
-    poses_output_path = output_scene_dir / "estimated_poses.txt"
-    with open(poses_output_path, "w") as f:
-        for frame_id, pose in zip(frame_ids, all_cam_to_world_mat):
-            f.write(f"# Frame {frame_id}\n")
-            for row in pose:
-                f.write(" ".join(map(str, row)) + "\n")
-            f.write("\n")
-    # Save point cloud
-    if all_world_points:
-        points_output_path = output_scene_dir / "reconstructed_points.ply"
-        try:
-            _write_point_cloud_ply(
-                points_output_path, all_world_points, all_point_colors
-            )
-        except Exception as e:
-            # Best effort: a failed point-cloud export must not lose the poses
-            # already written, so report and dump per-frame diagnostics.
-            print(f"⚠️  Error saving point cloud: {e}")
-            print(f"🔍 Point cloud debug info:")
-            for i, frame_points in enumerate(all_world_points):
-                print(
-                    f"  Frame {i}: {frame_points.shape if hasattr(frame_points, 'shape') else type(frame_points)}"
-                )
-                if hasattr(frame_points, "shape") and len(frame_points.shape) >= 2:
-                    print(
-                        f"    Shape: {frame_points.shape}, Dtype: {frame_points.dtype}"
-                    )
-                    if frame_points.shape[0] > 0:
-                        print(
-                            f"    Range: x[{np.min(frame_points[:, 0]):.3f}, {np.max(frame_points[:, 0]):.3f}] "
-                            f"y[{np.min(frame_points[:, 1]):.3f}, {np.max(frame_points[:, 1]):.3f}] "
-                            f"z[{np.min(frame_points[:, 2]):.3f}, {np.max(frame_points[:, 2]):.3f}]"
-                        )
-    print(f"📁 Results saved to: {output_scene_dir}")
-def main():
-    """
-    Evaluation script for a Custom Dataset.
-    Runs FastVGGT inference on the images found in ``<data_path>/images`` and
-    either evaluates against ground truth (``--enable_evaluation``; needs
-    ``pose`` and ``gt_ply`` subfolders) or just saves the estimated poses and
-    the reconstructed point cloud. Problems are reported on stdout and the
-    function returns early; it does not raise for missing inputs.
-    """
-    args = _build_arg_parser().parse_args()
-    torch.manual_seed(33)
-    # Check data path exists
-    if not args.data_path.exists():
-        print(f"❌ Error: Data path does not exist: {args.data_path}")
-        return
-    # Check required subdirectories
-    color_dir = args.data_path / "images"
-    pose_dir = args.data_path / "pose"
-    gt_ply_dir = args.data_path / "gt_ply"
-    if not color_dir.exists():
-        print(f"❌ Error: images directory does not exist: {color_dir}")
-        return
-    print(f"📁 Dataset path: {args.data_path}")
-    # If evaluation is enabled, check pose and gt_ply directories
-    if args.enable_evaluation:
-        if not pose_dir.exists():
-            print(f"❌ Error: Evaluation requires pose directory: {pose_dir}")
-            return
-        if not gt_ply_dir.exists():
-            print(f"❌ Error: Evaluation requires gt_ply directory: {gt_ply_dir}")
-            return
-        print(f"📊 Evaluation will use Ground Truth")
-    else:
-        print(f"🏃 Inference only, no evaluation")
-    # Create output directory
-    args.output_path.mkdir(parents=True, exist_ok=True)
-    output_scene_dir = args.output_path / "custom_dataset"
-    # Check if already processed
-    if (output_scene_dir / "metrics.json").exists() and args.enable_evaluation:
-        print(
-            f"⚠️  Results already exist, skipping: {output_scene_dir / 'metrics.json'}"
-        )
-        return
-    # Force use of bf16 dtype
-    dtype = torch.bfloat16
-    # Load VGGT model
-    print(f"🔄 Loading model: {args.ckpt_path}")
-    model = VGGT(merging=args.merging, vis_attn_map=args.vis_attn_map)
-    ckpt = torch.load(args.ckpt_path, map_location="cpu")
-    # strict=False: the checkpoint may not cover every module of this model.
-    model.load_state_dict(ckpt, strict=False)
-    model = model.cuda().eval()
-    model = model.to(dtype)
-    print(f"✅ Model loaded")
-    # Load scene data
-    image_paths = get_sorted_image_paths(color_dir)
-    if len(image_paths) == 0:
-        print(f"❌ Error: No images found in {color_dir}")
-        return
-    print(f"🖼️  Found {len(image_paths)} images")
-    # Process pose data (if evaluation is enabled)
-    poses_gt = None
-    first_gt_pose = None
-    available_pose_frame_ids = None
-    c2ws = None
-    if args.enable_evaluation:
-        poses_gt, first_gt_pose, available_pose_frame_ids = load_poses(pose_dir)
-        if (
-            poses_gt is None
-            or first_gt_pose is None
-            or available_pose_frame_ids is None
-        ):
-            print(f"❌ Error: Failed to load pose data")
-            return
-        print(f"📐 Loaded {len(poses_gt)} poses")
-    # Frame selection
-    if args.enable_evaluation and available_pose_frame_ids is not None:
-        # Use pose data for frame selection
-        selected_frame_ids, selected_image_paths, selected_pose_indices = (
-            build_frame_selection(
-                image_paths, available_pose_frame_ids, args.input_frame
-            )
-        )
-        c2ws = poses_gt[selected_pose_indices]
-        image_paths = selected_image_paths
-    else:
-        # Simply take the first N frames
-        num_frames = min(len(image_paths), args.input_frame)
-        selected_frame_ids = list(range(num_frames))
-        image_paths = image_paths[:num_frames]
-    print(f"📋 Selected {len(image_paths)} frames for processing")
-    try:
-        # Load images
-        print(f"🔄 Loading images...")
-        images = load_images_rgb(image_paths)
-        if not images or len(images) < 3:
-            print(f"❌ Error: Not enough valid images (need at least 3)")
-            return
-        frame_ids = selected_frame_ids
-        images_array = np.stack(images)
-        vgg_input, patch_width, patch_height = get_vgg_input_imgs(images_array)
-        print(f"📐 Image patch dimensions: {patch_width}x{patch_height}")
-        # Update attention layer patch dimensions in the model
-        model.update_patch_dimensions(patch_width, patch_height)
-        # Inference + Reconstruction
-        print(f"🚀 Start inference and reconstruction...")
-        (
-            extrinsic_np,
-            intrinsic_np,
-            all_world_points,
-            all_point_colors,
-            all_cam_to_world_mat,
-            inference_time_ms,
-        ) = infer_vggt_and_reconstruct(
-            model, vgg_input, dtype, args.depth_conf_thresh, image_paths
-        )
-        print(f"⏱️  Inference time: {inference_time_ms:.2f}ms")
-        # Check results
-        if not all_cam_to_world_mat or not all_world_points:
-            print(f"❌ Error: Failed to obtain valid camera poses or point clouds")
-            return
-        # Evaluation and saving
-        if args.enable_evaluation:
-            print(f"📊 Start evaluation...")
-            metrics = evaluate_scene_and_save(
-                "custom_dataset",
-                c2ws,
-                first_gt_pose,
-                frame_ids,
-                all_cam_to_world_mat,
-                all_world_points,
-                output_scene_dir,
-                gt_ply_dir,
-                args.chamfer_max_dist,
-                inference_time_ms,
-                args.plot,
-            )
-            if metrics is not None:
-                print("📈 Evaluation results:")
-                reported_keys = (
-                    "chamfer_distance",
-                    "ate",
-                    "are",
-                    "rpe_rot",
-                    "rpe_trans",
-                    "inference_time_ms",
-                )
-                for key, value in metrics.items():
-                    if key in reported_keys:
-                        print(f"  {key}: {float(value):.4f}")
-        else:
-            # Save reconstruction only, no evaluation
-            _save_reconstruction(
-                output_scene_dir,
-                all_cam_to_world_mat,
-                frame_ids,
-                all_world_points,
-                all_point_colors,
-            )
-        # Visualize predicted pose trajectory (both branches)
-        if args.plot:
-            visualize_predicted_poses(
-                all_cam_to_world_mat, frame_ids, output_scene_dir, "custom_dataset"
-            )
-        print(f"🎉 Done!")
-    except Exception as e:
-        print(f"❌ Error occurred during processing: {e}")
-        import traceback
-        traceback.print_exc()
-if __name__ == "__main__":
-    main()
-if __name__ == "__main__":
-    main()

FastVGGT/eval/eval_scannet.py DELETED Viewed

@@ -1,208 +0,0 @@
-import argparse
-from pathlib import Path
-import numpy as np
-import torch
-import os
-import sys
-# Ensure project root is in sys.path for absolute imports like `vggt.*`
-ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
-if ROOT_DIR not in sys.path:
-    sys.path.insert(0, ROOT_DIR)
-from vggt.models.vggt import VGGT
-from vggt.utils.eval_utils import (
-    load_poses,
-    get_vgg_input_imgs,
-    get_sorted_image_paths,
-    get_all_scenes,
-    build_frame_selection,
-    load_images_rgb,
-    infer_vggt_and_reconstruct,
-    evaluate_scene_and_save,
-    compute_average_metrics_and_save,
-)
-def str2bool(value):
-    """
-    Parse a command-line boolean such as "true"/"false", "1"/"0", "yes"/"no".
-    ``argparse`` with ``type=bool`` treats every non-empty string (including
-    "False") as True; this converter interprets the text instead.
-    Raises:
-        argparse.ArgumentTypeError: if the text is not a recognised boolean.
-    """
-    if isinstance(value, bool):
-        return value
-    text = str(value).strip().lower()
-    if text in ("1", "true", "t", "yes", "y", "on"):
-        return True
-    if text in ("", "0", "false", "f", "no", "n", "off"):
-        return False
-    raise argparse.ArgumentTypeError(f"Expected a boolean value, got {value!r}")
-# Script entry point: evaluate FastVGGT on a set of ScanNet scenes and write
-# per-scene metrics plus their average under --output_path.
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--data_dir", type=Path, default="/data/scannetv2/process_scannet"
-    )
-    parser.add_argument(
-        "--gt_ply_dir",
-        type=Path,
-        default="/data/scannetv2/OpenDataLab___ScanNet_v2/raw/scans",
-    )
-    parser.add_argument("--output_path", type=Path, default="./eval_results")
-    parser.add_argument("--merging", type=int, default=None)
-    # Accepts "--plot", "--plot True" and "--plot False"; the default stays True.
-    parser.add_argument(
-        "--plot",
-        type=str2bool,
-        nargs="?",
-        const=True,
-        default=True,
-        help="Whether to generate plots (true/false)",
-    )
-    parser.add_argument(
-        "--depth_conf_thresh",
-        type=float,
-        default=3.0,
-        help="Depth confidence threshold for filtering low confidence depth values",
-    )
-    parser.add_argument(
-        "--chamfer_max_dist",
-        type=float,
-        default=0.5,
-        help="Maximum distance threshold in Chamfer Distance computation, distances exceeding this value will be clipped",
-    )
-    parser.add_argument(
-        "--input_frame",
-        type=int,
-        default=200,
-        help="Maximum number of frames selected for processing per scene",
-    )
-    parser.add_argument(
-        "--num_scenes",
-        type=int,
-        default=50,
-        help="Maximum number of scenes to evaluate",
-    )
-    parser.add_argument(
-        "--ckpt_path",
-        type=str,
-        default="./ckpt/model_tracker_fixed_e20.pt",
-        help="Path to the model checkpoint file",
-    )
-    parser.add_argument(
-        "--vis_attn_map",
-        action="store_true",
-        help="Whether to visualize attention maps during inference",
-    )
-    args = parser.parse_args()
-    torch.manual_seed(33)
-    # Scene sampling
-    scannet_scenes = get_all_scenes(args.data_dir, args.num_scenes)
-    print(f"Evaluate {len(scannet_scenes)} scenes")
-    all_scenes_metrics = {"scenes": {}, "average": {}}
-    # Force use of bf16 data type
-    dtype = torch.bfloat16
-    # Load VGGT model
-    model = VGGT(merging=args.merging, vis_attn_map=args.vis_attn_map)
-    ckpt = torch.load(args.ckpt_path, map_location="cpu")
-    # strict=False: the checkpoint may not cover every module of this model.
-    model.load_state_dict(ckpt, strict=False)
-    model = model.cuda().eval()
-    model = model.to(dtype)
-    # Metrics copied from each scene's result into the summary
-    summary_keys = (
-        "chamfer_distance",
-        "ate",
-        "are",
-        "rpe_rot",
-        "rpe_trans",
-        "inference_time_ms",
-    )
-    # Process each scene
-    for scene in scannet_scenes:
-        scene_dir = args.data_dir / f"{scene}"
-        output_scene_dir = args.output_path / f"input_frame_{args.input_frame}" / scene
-        # Skip scenes that already have results from a previous run
-        if (output_scene_dir / "metrics.json").exists():
-            continue
-        # Load scene data
-        images_dir = scene_dir / "color"
-        pose_path = scene_dir / "pose"
-        image_paths = get_sorted_image_paths(images_dir)
-        poses_gt, first_gt_pose, available_pose_frame_ids = load_poses(pose_path)
-        if (
-            poses_gt is None
-            or first_gt_pose is None
-            or available_pose_frame_ids is None
-        ):
-            print(f"Skipping scene {scene}: no pose data")
-            continue
-        # Frame filtering
-        selected_frame_ids, selected_image_paths, selected_pose_indices = (
-            build_frame_selection(
-                image_paths, available_pose_frame_ids, args.input_frame
-            )
-        )
-        # Get corresponding poses
-        c2ws = poses_gt[selected_pose_indices]
-        image_paths = selected_image_paths
-        if len(image_paths) == 0:
-            print(f"No images found in {images_dir}")
-            continue
-        print("🚩Processing", scene, f"Found {len(image_paths)} images")
-        try:
-            # Load images
-            images = load_images_rgb(image_paths)
-            if not images or len(images) < 3:
-                print(f"Skipping {scene}: insufficient valid images")
-                continue
-            frame_ids = selected_frame_ids
-            images_array = np.stack(images)
-            vgg_input, patch_width, patch_height = get_vgg_input_imgs(images_array)
-            print(f"Patch dimensions: {patch_width}x{patch_height}")
-            # Update model attention layers with dynamic patch dimensions
-            model.update_patch_dimensions(patch_width, patch_height)
-            # Inference + Reconstruction
-            (
-                extrinsic_np,
-                intrinsic_np,
-                all_world_points,
-                all_point_colors,
-                all_cam_to_world_mat,
-                inference_time_ms,
-            ) = infer_vggt_and_reconstruct(
-                model, vgg_input, dtype, args.depth_conf_thresh, image_paths
-            )
-            print(f"Inference time: {inference_time_ms:.2f}ms")
-            # Process results
-            if not all_cam_to_world_mat or not all_world_points:
-                print(
-                    f"Skipping {scene}: failed to obtain valid camera poses or point clouds"
-                )
-                continue
-            # Evaluate and save
-            metrics = evaluate_scene_and_save(
-                scene,
-                c2ws,
-                first_gt_pose,
-                frame_ids,
-                all_cam_to_world_mat,
-                all_world_points,
-                output_scene_dir,
-                args.gt_ply_dir,
-                args.chamfer_max_dist,
-                inference_time_ms,
-                args.plot,
-            )
-            if metrics is not None:
-                all_scenes_metrics["scenes"][scene] = {
-                    key: float(value)
-                    for key, value in metrics.items()
-                    if key in summary_keys
-                }
-                print("Complete metrics", all_scenes_metrics["scenes"][scene])
-        except Exception as e:
-            # One failing scene must not abort the whole benchmark run.
-            print(f"Error processing scene {scene}: {e}")
-            import traceback
-            traceback.print_exc()
-    # Summarize average metrics and save
-    compute_average_metrics_and_save(
-        all_scenes_metrics,
-        args.output_path,
-        args.input_frame,
-    )

FastVGGT/eval/utils.py DELETED Viewed

@@ -1,142 +0,0 @@
-import numpy as np
-from scipy.spatial import cKDTree as KDTree
-def depthmap_to_camera_coordinates(depthmap, camera_intrinsics, pseudo_focal=None):
-    """
-    Back-project a depth map into 3D points expressed in the camera frame.
-    Args:
-        - depthmap (HxW array): per-pixel depth; values <= 0 are flagged invalid
-        - camera_intrinsics: a 3x3 matrix (must have zero skew terms)
-        - pseudo_focal (HxW array, optional): per-pixel focal length used for
-          both axes instead of the focal lengths in camera_intrinsics
-    Returns:
-        pointmap of camera-frame coordinates (HxWx3 float32 array), and a
-        boolean mask specifying valid pixels (depth > 0).
-    """
-    K = np.float32(camera_intrinsics)
-    height, width = depthmap.shape
-    # Only skew-free pinhole intrinsics are supported.
-    assert K[0, 1] == 0.0
-    assert K[1, 0] == 0.0
-    if pseudo_focal is not None:
-        assert pseudo_focal.shape == (height, width)
-        focal_u = focal_v = pseudo_focal
-    else:
-        focal_u, focal_v = K[0, 0], K[1, 1]
-    center_u, center_v = K[0, 2], K[1, 2]
-    # cols/rows hold the pixel column (u) and row (v) index of every pixel.
-    cols, rows = np.meshgrid(np.arange(width), np.arange(height))
-    points_cam = np.stack(
-        (
-            (cols - center_u) * depthmap / focal_u,
-            (rows - center_v) * depthmap / focal_v,
-            depthmap,
-        ),
-        axis=-1,
-    ).astype(np.float32)
-    return points_cam, depthmap > 0.0
-def depthmap_to_absolute_camera_coordinates(
-    depthmap, camera_intrinsics, camera_pose, **kw
-):
-    """
-    Back-project a depth map and express the points in world coordinates.
-    Args:
-        - depthmap (HxW array):
-        - camera_intrinsics: a 3x3 matrix
-        - camera_pose: a 4x3 or 4x4 cam2world matrix, or None to keep the
-          points in the camera frame
-        - **kw: accepted for call compatibility; not used
-    Returns:
-        pointmap of absolute coordinates (HxWx3 array), and a mask specifying valid pixels.
-    """
-    points_cam, valid_mask = depthmap_to_camera_coordinates(
-        depthmap, camera_intrinsics
-    )
-    # Without a pose there is nothing to transform: return camera-frame points.
-    if camera_pose is None:
-        return points_cam, valid_mask
-    rotation = camera_pose[:3, :3]
-    translation = camera_pose[:3, 3]
-    # Rotate every pixel's point, then shift by the camera position.
-    rotated = np.einsum("ik, vuk -> vui", rotation, points_cam)
-    points_world = rotated + translation[None, None, :]
-    return points_world, valid_mask
-def completion_ratio(gt_points, rec_points, dist_th=0.05):
-    """
-    Fraction of ground-truth points that have a reconstructed point within
-    ``dist_th`` (strictly closer than the threshold).
-    """
-    rec_tree = KDTree(rec_points)
-    nearest_dist = rec_tree.query(gt_points)[0]
-    covered = (nearest_dist < dist_th).astype(np.float32)
-    return np.mean(covered)
-def accuracy(gt_points, rec_points, gt_normals=None, rec_normals=None):
-    """
-    Distance from each reconstructed point to its nearest ground-truth point.
-    Returns:
-        (mean, median) of those distances. When both normal arrays are given,
-        also the mean and median absolute dot product between each
-        reconstructed normal and the normal of its nearest ground-truth point,
-        i.e. a 4-tuple.
-    """
-    gt_tree = KDTree(gt_points)
-    nearest_dist, nearest_idx = gt_tree.query(rec_points, workers=-1)
-    stats = (np.mean(nearest_dist), np.median(nearest_dist))
-    if gt_normals is None or rec_normals is None:
-        return stats
-    # abs(): orientation sign of the normals is ignored.
-    alignment = np.abs(np.sum(gt_normals[nearest_idx] * rec_normals, axis=-1))
-    return stats[0], stats[1], np.mean(alignment), np.median(alignment)
-def completion(gt_points, rec_points, gt_normals=None, rec_normals=None):
-    """
-    Distance from each ground-truth point to its nearest reconstructed point.
-    Returns:
-        (mean, median) of those distances. When both normal arrays are given,
-        also the mean and median absolute dot product between each
-        ground-truth normal and the normal of its nearest reconstructed point,
-        i.e. a 4-tuple.
-    """
-    # The search tree is built over the reconstruction and queried with GT.
-    rec_tree = KDTree(rec_points)
-    nearest_dist, nearest_idx = rec_tree.query(gt_points, workers=-1)
-    stats = (np.mean(nearest_dist), np.median(nearest_dist))
-    if gt_normals is None or rec_normals is None:
-        return stats
-    # abs(): orientation sign of the normals is ignored.
-    alignment = np.abs(np.sum(gt_normals * rec_normals[nearest_idx], axis=-1))
-    return stats[0], stats[1], np.mean(alignment), np.median(alignment)
-def compute_iou(pred_vox, target_vox):
-    """
-    Intersection-over-union of the occupied cells of two voxel grids.
-    Args:
-        pred_vox: object whose ``get_voxels()`` yields voxels exposing a
-            ``grid_index`` attribute.
-        target_vox: same kind of object for the reference grid.
-    Returns:
-        ``|pred ∩ target| / |pred ∪ target|`` as a float. When both grids are
-        empty the union is empty as well and 0.0 is returned instead of
-        raising ZeroDivisionError.
-    """
-    def occupied_cells(vox):
-        # Tuples are hashable, so grid indices can be compared as set members.
-        return {tuple(np.round(voxel.grid_index, 4)) for voxel in vox.get_voxels()}
-    v_pred_filled = occupied_cells(pred_vox)
-    v_target_filled = occupied_cells(target_vox)
-    union = v_pred_filled | v_target_filled
-    if not union:
-        return 0.0
-    intersection = v_pred_filled & v_target_filled
-    return len(intersection) / len(union)
-def colmap_to_opencv_intrinsics(K):
-    """
-    Convert intrinsics from the Colmap to the OpenCV pixel-centre convention.
-    The centre of the top-left pixel is (0.5, 0.5) in Colmap and (0, 0) in
-    OpenCV, so the principal point moves by -0.5 on both axes. The input
-    matrix is left untouched; a modified copy is returned.
-    """
-    converted = K.copy()
-    for axis in (0, 1):
-        converted[axis, 2] -= 0.5
-    return converted
-def opencv_to_colmap_intrinsics(K):
-    """
-    Convert intrinsics from the OpenCV to the Colmap pixel-centre convention.
-    The centre of the top-left pixel is (0, 0) in OpenCV and (0.5, 0.5) in
-    Colmap, so the principal point moves by +0.5 on both axes. The input
-    matrix is left untouched; a modified copy is returned.
-    """
-    converted = K.copy()
-    for axis in (0, 1):
-        converted[axis, 2] += 0.5
-    return converted

FastVGGT/merging/__init__.py DELETED Viewed

@@ -1,3 +0,0 @@
-from . import merge
-__all__ = ["merge"]

FastVGGT/merging/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (187 Bytes)

FastVGGT/merging/__pycache__/merge.cpython-310.pyc DELETED Viewed

Binary file (7.54 kB)

FastVGGT/merging/merge.py DELETED Viewed

@@ -1,370 +0,0 @@
-import torch
-from typing import Tuple, Callable, Optional, Union
-@torch.jit.script
-def fast_similarity_chunks(
-    a: torch.Tensor, b_transposed: torch.Tensor, chunk_size: int
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    B, num_src, C = a.shape
-    original_dtype = a.dtype
-    # Convert to bf16 for computation to improve performance and reduce memory usage
-    a_bf16 = a.to(torch.bfloat16)
-    b_transposed_bf16 = b_transposed.to(torch.bfloat16)
-    node_max = torch.empty(B, num_src, device=a.device, dtype=original_dtype)
-    node_idx = torch.empty(B, num_src, device=a.device, dtype=torch.long)
-    # Process in chunks
-    for i in range(0, num_src, chunk_size):
-        end_i = min(i + chunk_size, num_src)
-        a_chunk = a_bf16[:, i:end_i, :]  # [B, chunk_size, C]
-        scores_chunk = torch.bmm(a_chunk, b_transposed_bf16)
-        chunk_max_bf16, chunk_idx = torch.max(scores_chunk, dim=2)
-        chunk_max = chunk_max_bf16.to(original_dtype)
-        node_max[:, i:end_i] = chunk_max
-        node_idx[:, i:end_i] = chunk_idx
-    return node_max, node_idx
-def do_nothing(
-    x: torch.Tensor,
-    extra_tensors=None,
-    extra_tensors_2=None,
-) -> Union[
-    torch.Tensor,
-    Tuple[torch.Tensor, torch.Tensor],
-    Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
-]:
-    if extra_tensors is not None and extra_tensors_2 is not None:
-        return x, extra_tensors, extra_tensors_2
-    elif extra_tensors is not None:
-        return x, extra_tensors
-    else:
-        return x
-def token_merge_bipartite2d(
-    metric: torch.Tensor,
-    w: int,
-    h: int,
-    sx: int,
-    sy: int,
-    r: int,
-    no_rand: bool = False,
-    generator: Optional[torch.Generator] = None,
-    enable_protection: bool = False,
-) -> Tuple[Callable, Callable]:
-    """
-    Divide tokens into source (src) and destination (dst) groups, and merge r tokens from src to dst.
-    dst tokens are selected by randomly choosing one token from each (sx, sy) region.
-    Optionally protect the top 10% of tokens from merging based on importance scores.
-    Args:
-     - metric [B, N, C]: Tensor for similarity computation, B=batch size, N=token count, C=feature dimension
-     - w: Image width in tokens
-     - h: Image height in tokens
-     - sx: dst stride in x dimension, must divide w evenly
-     - sy: dst stride in y dimension, must divide h evenly
-     - r: Number of tokens to remove through merging
-     - no_rand: If True, disable randomness (use only top-left token)
-     - generator: Random number generator if no_rand is False and not None
-     - enable_protection: If True, enable importance protection feature
-    Returns:
-     - (merge, unmerge): Two functions for merging tokens and restoring pre-merge state
-    """
-    B, N, _ = metric.shape  # Batch size B, total tokens N
-    if r <= 0:
-        return do_nothing, do_nothing
-    gather = torch.gather
-    tokens_per_img = w * h + 5
-    num_imgs = N // tokens_per_img
-    assert tokens_per_img * num_imgs == N, "Token count doesn't match (w*h+5)*num_imgs"
-    with torch.no_grad():
-        # Determine whether to compute importance scores based on enable_protection
-        if enable_protection:
-            num_protected = int(N * 0.1)
-            step = max(1, N // num_protected)
-            protected_indices = torch.arange(0, N, step, device=metric.device)[
-                :num_protected
-            ]
-        else:
-            protected_indices = None
-            num_protected = 0
-        # Global idx_buffer_seq of length N; -1 indicates dst, 0 indicates src (maintain original logic)
-        idx_buffer_seq = torch.zeros(N, device=metric.device, dtype=torch.int64)
-        hsy, wsx = h // sy, w // sx  # Number of blocks within each image
-        # Mark first image entirely as dst
-        if num_imgs > 0:
-            idx_buffer_seq[:tokens_per_img] = -1
-        # Process other images - fully vectorized batch operations
-        if num_imgs > 1:
-            cls_indices = (
-                torch.arange(1, num_imgs, device=metric.device) * tokens_per_img
-            )
-            cls_indices = cls_indices[:, None] + torch.arange(5, device=metric.device)
-            idx_buffer_seq[cls_indices.flatten()] = -1
-            effective_h = min(hsy * sy, h)
-            effective_w = min(wsx * sx, w)
-            effective_grid_size = effective_h * effective_w
-            if no_rand:
-                base_pattern = torch.zeros(
-                    effective_grid_size, device=metric.device, dtype=torch.int64
-                )
-                grid_starts = (
-                    torch.arange(1, num_imgs, device=metric.device) * tokens_per_img + 5
-                )
-                grid_indices = grid_starts[:, None] + torch.arange(
-                    effective_grid_size, device=metric.device
-                )
-                idx_buffer_seq[grid_indices.flatten()] = base_pattern.repeat(
-                    num_imgs - 1
-                )
-            else:
-                total_other_imgs = num_imgs - 1
-                all_rand_idx = torch.randint(
-                    sy * sx,
-                    size=(total_other_imgs, hsy, wsx),
-                    device=metric.device,
-                    generator=generator,
-                )
-                scatter_src = -torch.ones(
-                    total_other_imgs, hsy, wsx, device=metric.device, dtype=torch.int64
-                )
-                idx_buffer_batch = torch.zeros(
-                    total_other_imgs,
-                    hsy,
-                    wsx,
-                    sy * sx,
-                    device=metric.device,
-                    dtype=torch.int64,
-                )
-                idx_buffer_batch.scatter_(
-                    dim=3,
-                    index=all_rand_idx.unsqueeze(-1),
-                    src=scatter_src.unsqueeze(-1),
-                )
-                idx_buffer_batch = (
-                    idx_buffer_batch.view(total_other_imgs, hsy, wsx, sy, sx)
-                    .transpose(2, 3)
-                    .reshape(total_other_imgs, hsy * sy, wsx * sx)
-                )
-                # Batch fill to target positions - still needs a small loop here, but operations are greatly reduced
-                for i in range(total_other_imgs):
-                    img_idx = i + 1
-                    grid_start = img_idx * tokens_per_img + 5
-                    flat_view = idx_buffer_batch[
-                        i, :effective_h, :effective_w
-                    ].flatten()
-                    idx_buffer_seq[grid_start : grid_start + effective_grid_size] = (
-                        flat_view
-                    )
-        rand_idx = idx_buffer_seq.reshape(1, -1, 1).argsort(dim=1)
-        num_dst_orig = int((idx_buffer_seq == -1).sum())
-        # Original src and dst indices
-        a_idx_orig = rand_idx[:, num_dst_orig:, :]
-        b_idx_orig = rand_idx[:, :num_dst_orig, :]
-        a_idx = a_idx_orig
-        b_idx = b_idx_orig
-        if enable_protection:
-            protected_idx = protected_indices.unsqueeze(0).unsqueeze(-1)
-            num_protected_actual = protected_idx.shape[1]
-        else:
-            protected_idx = None
-            num_protected_actual = 0
-        num_src = a_idx.shape[1]
-        num_dst = b_idx.shape[1]
-        # Define an internal function to separate src, dst, and protected tokens
-        def split(x):
-            C = x.shape[-1]
-            if enable_protection:
-                src = gather(x, dim=1, index=a_idx.expand(B, num_src, C))
-                dst = gather(x, dim=1, index=b_idx.expand(B, num_dst, C))
-                protected = gather(
-                    x, dim=1, index=protected_idx.expand(B, num_protected_actual, C)
-                )
-                return src, dst, protected
-            else:
-                src = gather(x, dim=1, index=a_idx.expand(B, num_src, C))
-                dst = gather(x, dim=1, index=b_idx.expand(B, num_dst, C))
-                return src, dst
-        # Compute cosine similarity (normalize first then dot product)
-        metric = metric / metric.norm(dim=-1, keepdim=True)
-        if enable_protection:
-            a, b, protected = split(metric)
-        else:
-            a, b = split(metric)
-        r = min(a.shape[1], r)
-        num_src_actual = a.shape[1]
-        chunk_size = min(5000, num_src_actual)
-        node_max = torch.empty(B, num_src_actual, device=a.device, dtype=a.dtype)
-        node_idx = torch.empty(B, num_src_actual, device=a.device, dtype=torch.long)
-        b_transposed = b.transpose(-1, -2)
-        node_max, node_idx = fast_similarity_chunks(a, b_transposed, chunk_size)
-        edge_idx = node_max.argsort(dim=-1, descending=True)[..., None]
-        # If protection is enabled, filter out protected tokens to ensure they are not merged
-        if enable_protection:
-            src_indices = a_idx[0, :, 0]
-            protected_mask_src = torch.isin(src_indices, protected_indices)
-            edge_flat = edge_idx[0, :, 0]
-            valid_mask = ~protected_mask_src[edge_flat]
-            valid_edges = edge_flat[valid_mask]
-            valid_count = valid_edges.shape[0]
-            r_actual = min(r, valid_count)
-            unm_idx = valid_edges[r_actual:].unsqueeze(0).unsqueeze(-1)
-            src_idx = valid_edges[:r_actual].unsqueeze(0).unsqueeze(-1)
-        else:
-            unm_idx = edge_idx[..., r:, :]
-            src_idx = edge_idx[..., :r, :]
-            r_actual = r
-        # Get dst token indices corresponding to each src token to be merged
-        dst_idx = gather(node_idx[..., None], dim=-2, index=src_idx)
-        r = r_actual
-    # Define merge function to merge selected src tokens to corresponding dst tokens
-    def merge(
-        x: torch.Tensor,
-        mode: str = "mean",
-        extra_tensors=None,
-        extra_tensors_2=None,
-    ) -> Union[
-        torch.Tensor,
-        Tuple[torch.Tensor, torch.Tensor],
-        Tuple[torch.Tensor, torch.Tensor, torch.Tensor],
-    ]:
-        if enable_protection:
-            src, dst, protected = split(x)
-        else:
-            src, dst = split(x)
-        n, t1, c = src.shape
-        # Extract unmerged src tokens - using actual unm_idx size
-        unm_len = unm_idx.shape[1]
-        unm = gather(src, dim=-2, index=unm_idx.expand(n, unm_len, c))
-        src_len = src_idx.shape[1]
-        src = gather(src, dim=-2, index=src_idx.expand(n, src_len, c))
-        dst = dst.scatter_reduce(-2, dst_idx.expand(n, src_len, c), src, reduce=mode)
-        # ---------------- Extra tensor processing ----------------
-        merged_extra_1 = None
-        merged_extra_2 = None
-        if extra_tensors is not None:
-            E_dim = extra_tensors.shape[-1]
-            if enable_protection:
-                src_e, dst_e, protected_e = split(extra_tensors)
-            else:
-                src_e, dst_e = split(extra_tensors)
-            # Consistent with main tensor, only select r src tokens to be merged
-            src_e_r = gather(src_e, dim=-2, index=src_idx.expand(n, src_len, E_dim))
-            unm_e = gather(src_e, dim=-2, index=unm_idx.expand(n, unm_len, E_dim))
-            dst_e = dst_e.scatter_reduce(
-                -2, dst_idx.expand(n, src_len, E_dim), src_e_r, reduce=mode
-            )
-            if enable_protection:
-                merged_extra_1 = torch.cat([unm_e, dst_e, protected_e], dim=1)
-            else:
-                merged_extra_1 = torch.cat([unm_e, dst_e], dim=1)
-        if extra_tensors_2 is not None:
-            E_dim_2 = extra_tensors_2.shape[-1]
-            if enable_protection:
-                src_e2, dst_e2, protected_e2 = split(extra_tensors_2)
-            else:
-                src_e2, dst_e2 = split(extra_tensors_2)
-            src_e2_r = gather(src_e2, dim=-2, index=src_idx.expand(n, src_len, E_dim_2))
-            unm_e2 = gather(src_e2, dim=-2, index=unm_idx.expand(n, unm_len, E_dim_2))
-            dst_e2 = dst_e2.scatter_reduce(
-                -2, dst_idx.expand(n, src_len, E_dim_2), src_e2_r, reduce=mode
-            )
-            if enable_protection:
-                merged_extra_2 = torch.cat([unm_e2, dst_e2, protected_e2], dim=1)
-            else:
-                merged_extra_2 = torch.cat([unm_e2, dst_e2], dim=1)
-        if enable_protection:
-            main_result = torch.cat([unm, dst, protected], dim=1)
-        else:
-            main_result = torch.cat([unm, dst], dim=1)
-        if merged_extra_1 is not None and merged_extra_2 is not None:
-            return main_result, merged_extra_1, merged_extra_2
-        elif merged_extra_1 is not None:
-            return main_result, merged_extra_1
-        else:
-            return main_result
-    # Define unmerge function to restore pre-merge state (for decoder)
-    def unmerge(x: torch.Tensor) -> torch.Tensor:
-        unm_len = unm_idx.shape[1]
-        dst_len = num_dst
-        src_len = src_idx.shape[1]
-        unm = x[..., :unm_len, :]
-        dst = x[..., unm_len : unm_len + dst_len, :]
-        if enable_protection:
-            protected = x[
-                ..., unm_len + dst_len : unm_len + dst_len + num_protected_actual, :
-            ]
-        _, _, c = unm.shape
-        src = gather(dst, dim=-2, index=dst_idx.expand(B, src_len, c))
-        out = torch.zeros(B, N, c, device=x.device, dtype=x.dtype)
-        out.scatter_(dim=-2, index=b_idx.expand(B, num_dst, c), src=dst)
-        out.scatter_(
-            dim=-2,
-            index=gather(
-                a_idx.expand(B, a_idx.shape[1], 1), dim=1, index=unm_idx
-            ).expand(B, unm_len, c),
-            src=unm,
-        )
-        out.scatter_(
-            dim=-2,
-            index=gather(
-                a_idx.expand(B, a_idx.shape[1], 1), dim=1, index=src_idx
-            ).expand(B, src_len, c),
-            src=src,
-        )
-        if enable_protection:
-            out.scatter_(
-                dim=-2,
-                index=protected_idx.expand(B, num_protected_actual, c),
-                src=protected,
-            )
-        return out
-    return merge, unmerge

FastVGGT/requirements.txt DELETED Viewed

@@ -1,15 +0,0 @@
-torch==2.3.1
-torchvision==0.18.1
-numpy==1.26.1
-Pillow
-huggingface_hub
-einops
-safetensors
-evo
-open3d
-matplotlib
-scipy
-opencv-python
-scikit-image
-tqdm

FastVGGT/vggt/__init__.py DELETED Viewed

@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.

FastVGGT/vggt/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (132 Bytes)

FastVGGT/vggt/dependency/__init__.py DELETED Viewed

@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.

FastVGGT/vggt/dependency/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (143 Bytes)

FastVGGT/vggt/dependency/__pycache__/distortion.cpython-310.pyc DELETED Viewed

Binary file (1.39 kB)

FastVGGT/vggt/dependency/distortion.py DELETED Viewed

@@ -1,54 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import numpy as np
-import torch
-def apply_distortion(points, distortion_params):
-    """
-    Apply distortion to normalized camera coordinates.
-    Args:
-        points: Array of normalized camera coordinates
-        distortion_params: Distortion parameters
-    Returns:
-        Distorted coordinates
-    """
-    # Simple passthrough for now - implement actual distortion if needed
-    return points
-def iterative_undistortion(points, distortion_params, max_iter=10):
-    """
-    Remove distortion from normalized camera coordinates using iterative method.
-    Args:
-        points: Array of distorted normalized camera coordinates
-        distortion_params: Distortion parameters
-        max_iter: Maximum number of iterations
-    Returns:
-        Undistorted coordinates
-    """
-    # Simple passthrough for now - implement actual undistortion if needed
-    return points
-def single_undistortion(points, distortion_params):
-    """
-    Remove distortion from normalized camera coordinates using single step.
-    Args:
-        points: Array of distorted normalized camera coordinates
-        distortion_params: Distortion parameters
-    Returns:
-        Undistorted coordinates
-    """
-    # Simple passthrough for now - implement actual undistortion if needed
-    return points

FastVGGT/vggt/heads/__pycache__/camera_head.cpython-310.pyc DELETED Viewed

Binary file (4.24 kB)

FastVGGT/vggt/heads/__pycache__/dpt_head.cpython-310.pyc DELETED Viewed

Binary file (12.8 kB)

FastVGGT/vggt/heads/__pycache__/head_act.cpython-310.pyc DELETED Viewed

Binary file (3.1 kB)

FastVGGT/vggt/heads/__pycache__/track_head.cpython-310.pyc DELETED Viewed

Binary file (3.41 kB)

FastVGGT/vggt/heads/__pycache__/utils.cpython-310.pyc DELETED Viewed

Binary file (3.18 kB)

FastVGGT/vggt/heads/camera_head.py DELETED Viewed

@@ -1,149 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import math
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from vggt.layers import Mlp
-from vggt.layers.block import Block
-from vggt.heads.head_act import activate_pose
-class CameraHead(nn.Module):
-    """
-    CameraHead predicts camera parameters from token representations using iterative refinement.
-    It applies a series of transformer blocks (the "trunk") to dedicated camera tokens.
-    """
-    def __init__(
-        self,
-        dim_in: int = 2048,
-        trunk_depth: int = 4,
-        pose_encoding_type: str = "absT_quaR_FoV",
-        num_heads: int = 16,
-        mlp_ratio: int = 4,
-        init_values: float = 0.01,
-        trans_act: str = "linear",
-        quat_act: str = "linear",
-        fl_act: str = "relu",  # Field of view activations: ensures FOV values are positive.
-    ):
-        super().__init__()
-        if pose_encoding_type == "absT_quaR_FoV":
-            self.target_dim = 9
-        else:
-            raise ValueError(f"Unsupported camera encoding type: {pose_encoding_type}")
-        self.trans_act = trans_act
-        self.quat_act = quat_act
-        self.fl_act = fl_act
-        self.trunk_depth = trunk_depth
-        # Build the trunk using a sequence of transformer blocks.
-        self.trunk = nn.Sequential(
-            *[
-                Block(dim=dim_in, num_heads=num_heads, mlp_ratio=mlp_ratio, init_values=init_values)
-                for _ in range(trunk_depth)
-            ]
-        )
-        # Normalizations for camera token and trunk output.
-        self.token_norm = nn.LayerNorm(dim_in)
-        self.trunk_norm = nn.LayerNorm(dim_in)
-        # Learnable empty camera pose token.
-        self.empty_pose_tokens = nn.Parameter(torch.zeros(1, 1, self.target_dim))
-        self.embed_pose = nn.Linear(self.target_dim, dim_in)
-        # Module for producing modulation parameters: shift, scale, and a gate.
-        self.poseLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(dim_in, 3 * dim_in, bias=True))
-        # Adaptive layer normalization without affine parameters.
-        self.adaln_norm = nn.LayerNorm(dim_in, elementwise_affine=False, eps=1e-6)
-        self.pose_branch = Mlp(in_features=dim_in, hidden_features=dim_in // 2, out_features=self.target_dim, drop=0)
-    def forward(self, aggregated_tokens_list: list, num_iterations: int = 4) -> list:
-        """
-        Forward pass to predict camera parameters.
-        Args:
-            aggregated_tokens_list (list): List of token tensors from the network;
-                the last tensor is used for prediction.
-            num_iterations (int, optional): Number of iterative refinement steps. Defaults to 4.
-        Returns:
-            list: A list of predicted camera encodings (post-activation) from each iteration.
-        """
-        # Use tokens from the last block for camera prediction.
-        tokens = aggregated_tokens_list[-1]
-        # Extract the camera tokens
-        pose_tokens = tokens[:, :, 0]
-        pose_tokens = self.token_norm(pose_tokens)
-        pred_pose_enc_list = self.trunk_fn(pose_tokens, num_iterations)
-        return pred_pose_enc_list
-    def trunk_fn(self, pose_tokens: torch.Tensor, num_iterations: int) -> list:
-        """
-        Iteratively refine camera pose predictions.
-        Args:
-            pose_tokens (torch.Tensor): Normalized camera tokens with shape [B, 1, C].
-            num_iterations (int): Number of refinement iterations.
-        Returns:
-            list: List of activated camera encodings from each iteration.
-        """
-        B, S, C = pose_tokens.shape  # S is expected to be 1.
-        pred_pose_enc = None
-        pred_pose_enc_list = []
-        for _ in range(num_iterations):
-            # Use a learned empty pose for the first iteration.
-            if pred_pose_enc is None:
-                module_input = self.embed_pose(self.empty_pose_tokens.expand(B, S, -1))
-            else:
-                # Detach the previous prediction to avoid backprop through time.
-                pred_pose_enc = pred_pose_enc.detach()
-                module_input = self.embed_pose(pred_pose_enc)
-            # Generate modulation parameters and split them into shift, scale, and gate components.
-            shift_msa, scale_msa, gate_msa = self.poseLN_modulation(module_input).chunk(3, dim=-1)
-            # Adaptive layer normalization and modulation.
-            pose_tokens_modulated = gate_msa * modulate(self.adaln_norm(pose_tokens), shift_msa, scale_msa)
-            pose_tokens_modulated = pose_tokens_modulated + pose_tokens
-            pose_tokens_modulated = self.trunk(pose_tokens_modulated)
-            # Compute the delta update for the pose encoding.
-            pred_pose_enc_delta = self.pose_branch(self.trunk_norm(pose_tokens_modulated))
-            if pred_pose_enc is None:
-                pred_pose_enc = pred_pose_enc_delta
-            else:
-                pred_pose_enc = pred_pose_enc + pred_pose_enc_delta
-            # Apply final activation functions for translation, quaternion, and field-of-view.
-            activated_pose = activate_pose(
-                pred_pose_enc, trans_act=self.trans_act, quat_act=self.quat_act, fl_act=self.fl_act
-            )
-            pred_pose_enc_list.append(activated_pose)
-        return pred_pose_enc_list
-def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
-    """
-    Modulate the input tensor using scaling and shifting parameters.
-    """
-    # modified from https://github.com/facebookresearch/DiT/blob/796c29e532f47bba17c5b9c5eb39b9354b8b7c64/models.py#L19
-    return x * (1 + scale) + shift

FastVGGT/vggt/heads/dpt_head.py DELETED Viewed

@@ -1,598 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-# Inspired by https://github.com/DepthAnything/Depth-Anything-V2
-import os
-from typing import List, Dict, Tuple, Union, Optional
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from .head_act import activate_head
-from .utils import create_uv_grid, position_grid_to_embed
-class DPTHead(nn.Module):
-    """
-    DPT  Head for dense prediction tasks.
-    This implementation follows the architecture described in "Vision Transformers for Dense Prediction"
-    (https://arxiv.org/abs/2103.13413). The DPT head processes features from a vision transformer
-    backbone and produces dense predictions by fusing multi-scale features.
-    Args:
-        dim_in (int): Input dimension (channels).
-        patch_size (int, optional): Patch size. Default is 14.
-        output_dim (int, optional): Number of output channels. Default is 4.
-        activation (str, optional): Activation type. Default is "inv_log".
-        conf_activation (str, optional): Confidence activation type. Default is "expp1".
-        features (int, optional): Feature channels for intermediate representations. Default is 256.
-        out_channels (List[int], optional): Output channels for each intermediate layer.
-        intermediate_layer_idx (List[int], optional): Indices of layers from aggregated tokens used for DPT.
-        pos_embed (bool, optional): Whether to use positional embedding. Default is True.
-        feature_only (bool, optional): If True, return features only without the last several layers and activation head. Default is False.
-        down_ratio (int, optional): Downscaling factor for the output resolution. Default is 1.
-    """
-    def __init__(
-        self,
-        dim_in: int,
-        patch_size: int = 14,
-        output_dim: int = 4,
-        activation: str = "inv_log",
-        conf_activation: str = "expp1",
-        features: int = 256,
-        out_channels: List[int] = [256, 512, 1024, 1024],
-        intermediate_layer_idx: List[int] = [0, 1, 2, 3],
-        pos_embed: bool = True,
-        feature_only: bool = False,
-        down_ratio: int = 1,
-    ) -> None:
-        super(DPTHead, self).__init__()
-        self.patch_size = patch_size
-        self.activation = activation
-        self.conf_activation = conf_activation
-        self.pos_embed = pos_embed
-        self.feature_only = feature_only
-        self.down_ratio = down_ratio
-        self.intermediate_layer_idx = intermediate_layer_idx
-        self.norm = nn.LayerNorm(dim_in)
-        # Projection layers for each output channel from tokens.
-        self.projects = nn.ModuleList(
-            [
-                nn.Conv2d(
-                    in_channels=dim_in,
-                    out_channels=oc,
-                    kernel_size=1,
-                    stride=1,
-                    padding=0,
-                )
-                for oc in out_channels
-            ]
-        )
-        # Resize layers for upsampling feature maps.
-        self.resize_layers = nn.ModuleList(
-            [
-                nn.ConvTranspose2d(
-                    in_channels=out_channels[0],
-                    out_channels=out_channels[0],
-                    kernel_size=4,
-                    stride=4,
-                    padding=0,
-                ),
-                nn.ConvTranspose2d(
-                    in_channels=out_channels[1],
-                    out_channels=out_channels[1],
-                    kernel_size=2,
-                    stride=2,
-                    padding=0,
-                ),
-                nn.Identity(),
-                nn.Conv2d(
-                    in_channels=out_channels[3],
-                    out_channels=out_channels[3],
-                    kernel_size=3,
-                    stride=2,
-                    padding=1,
-                ),
-            ]
-        )
-        self.scratch = _make_scratch(out_channels, features, expand=False)
-        # Attach additional modules to scratch.
-        self.scratch.stem_transpose = nn.Identity()  # Use Identity instead of None
-        self.scratch.refinenet1 = _make_fusion_block(features)
-        self.scratch.refinenet2 = _make_fusion_block(features)
-        self.scratch.refinenet3 = _make_fusion_block(features)
-        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False)
-        head_features_1 = features
-        head_features_2 = 32
-        if feature_only:
-            self.scratch.output_conv1 = nn.Conv2d(
-                head_features_1, head_features_1, kernel_size=3, stride=1, padding=1
-            )
-        else:
-            self.scratch.output_conv1 = nn.Conv2d(
-                head_features_1,
-                head_features_1 // 2,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-            )
-            conv2_in_channels = head_features_1 // 2
-            self.scratch.output_conv2 = nn.Sequential(
-                nn.Conv2d(
-                    conv2_in_channels,
-                    head_features_2,
-                    kernel_size=3,
-                    stride=1,
-                    padding=1,
-                ),
-                nn.ReLU(inplace=True),
-                nn.Conv2d(
-                    head_features_2, output_dim, kernel_size=1, stride=1, padding=0
-                ),
-            )
-    def forward(
-        self,
-        aggregated_tokens_list: List[torch.Tensor],
-        images: torch.Tensor,
-        patch_start_idx: int,
-        frames_chunk_size: int = 8,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Forward pass through the DPT head, supports processing by chunking frames.
-        Args:
-            aggregated_tokens_list (List[Tensor]): List of token tensors from different transformer layers.
-            images (Tensor): Input images with shape [B, S, 3, H, W], in range [0, 1].
-            patch_start_idx (int): Starting index for patch tokens in the token sequence.
-                Used to separate patch tokens from other tokens (e.g., camera or register tokens).
-            frames_chunk_size (int, optional): Number of frames to process in each chunk.
-                If None or larger than S, all frames are processed at once. Default: 8.
-        Returns:
-            Tensor or Tuple[Tensor, Tensor]:
-                - If feature_only=True: Feature maps with shape [B, S, C, H, W]
-                - Otherwise: Tuple of (predictions, confidence) both with shape [B, S, 1, H, W]
-        """
-        B, S, _, H, W = images.shape
-        # If frames_chunk_size is not specified or greater than S, process all frames at once
-        if frames_chunk_size is None or frames_chunk_size >= S:
-            return self._forward_impl(aggregated_tokens_list, images, patch_start_idx)
-        # Otherwise, process frames in chunks to manage memory usage
-        assert frames_chunk_size > 0
-        # Process frames in batches
-        all_preds = []
-        all_conf = []
-        for frames_start_idx in range(0, S, frames_chunk_size):
-            frames_end_idx = min(frames_start_idx + frames_chunk_size, S)
-            # Process batch of frames
-            if self.feature_only:
-                chunk_output = self._forward_impl(
-                    aggregated_tokens_list,
-                    images,
-                    patch_start_idx,
-                    frames_start_idx,
-                    frames_end_idx,
-                )
-                all_preds.append(chunk_output)
-            else:
-                chunk_preds, chunk_conf = self._forward_impl(
-                    aggregated_tokens_list,
-                    images,
-                    patch_start_idx,
-                    frames_start_idx,
-                    frames_end_idx,
-                )
-                all_preds.append(chunk_preds)
-                all_conf.append(chunk_conf)
-        # Concatenate results along the sequence dimension
-        if self.feature_only:
-            return torch.cat(all_preds, dim=1)
-        else:
-            return torch.cat(all_preds, dim=1), torch.cat(all_conf, dim=1)
-    def _forward_impl(
-        self,
-        aggregated_tokens_list: List[torch.Tensor],
-        images: torch.Tensor,
-        patch_start_idx: int,
-        frames_start_idx: Optional[int] = None,
-        frames_end_idx: Optional[int] = None,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        """
-        Run the DPT head on every frame, or on one contiguous slice of frames.
-        The frame slice is applied only when BOTH ``frames_start_idx`` and
-        ``frames_end_idx`` are given; otherwise the whole sequence is processed.
-        Args:
-            aggregated_tokens_list (List[Tensor]): Token tensors from different transformer
-                layers; only the entries named by ``self.intermediate_layer_idx`` are read.
-            images (Tensor): Input images with shape [B, S, 3, H, W].
-            patch_start_idx (int): Index along the token axis where patch tokens begin.
-            frames_start_idx (int, optional): First frame of the slice (inclusive).
-            frames_end_idx (int, optional): End of the slice (exclusive).
-        Returns:
-            Tensor or Tuple[Tensor, Tensor]: The fused feature maps when
-            ``self.feature_only`` is set, otherwise ``(predictions, confidence)``.
-            Every returned tensor is viewed back to lead with ``[B, S]``.
-        """
-        use_chunk = frames_start_idx is not None and frames_end_idx is not None
-        if use_chunk:
-            images = images[:, frames_start_idx:frames_end_idx].contiguous()
-        B, S, _, H, W = images.shape
-        patch_h = H // self.patch_size
-        patch_w = W // self.patch_size
-        pyramid = []
-        for dpt_idx, layer_idx in enumerate(self.intermediate_layer_idx):
-            # Keep only the patch tokens of this layer.
-            feat = aggregated_tokens_list[layer_idx][:, :, patch_start_idx:]
-            if use_chunk:
-                feat = feat[:, frames_start_idx:frames_end_idx]
-            # Fold the frame axis into the batch axis before normalizing.
-            feat = self.norm(feat.reshape(B * S, -1, feat.shape[-1]))
-            # Channels first, then lay the token axis out as a patch_h x patch_w map.
-            feat = feat.permute(0, 2, 1).reshape(
-                (feat.shape[0], feat.shape[-1], patch_h, patch_w)
-            )
-            feat = self.projects[dpt_idx](feat)
-            if self.pos_embed:
-                feat = self._apply_pos_embed(feat, W, H)
-            feat = self.resize_layers[dpt_idx](feat)
-            pyramid.append(feat)
-        # Fuse features from multiple layers.
-        fused = self.scratch_forward(pyramid)
-        # Drop the list so it does not keep the per-layer maps referenced any longer.
-        del pyramid
-        # Interpolate the fused output to the target resolution.
-        target_size = (
-            int(patch_h * self.patch_size / self.down_ratio),
-            int(patch_w * self.patch_size / self.down_ratio),
-        )
-        fused = custom_interpolate(
-            fused, target_size, mode="bilinear", align_corners=True
-        )
-        if self.pos_embed:
-            fused = self._apply_pos_embed(fused, W, H)
-        if self.feature_only:
-            return fused.view(B, S, *fused.shape[1:])
-        fused = self.scratch.output_conv2(fused)
-        preds, conf = activate_head(
-            fused, activation=self.activation, conf_activation=self.conf_activation
-        )
-        # Un-fold the batch axis back into [B, S, ...].
-        return (
-            preds.view(B, S, *preds.shape[1:]),
-            conf.view(B, S, *conf.shape[1:]),
-        )
-    def _apply_pos_embed(
-        self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1
-    ) -> torch.Tensor:
-        """
-        Add a UV-grid positional embedding, scaled by ``ratio``, to ``x``.
-        The grid is built at the spatial size of ``x`` (its last two dims) with
-        aspect ratio ``W / H`` and embedded to ``x.shape[1]`` channels.
-        """
-        grid_h, grid_w = x.shape[-2], x.shape[-1]
-        uv_grid = create_uv_grid(
-            grid_w, grid_h, aspect_ratio=W / H, dtype=x.dtype, device=x.device
-        )
-        embed = position_grid_to_embed(uv_grid, x.shape[1]) * ratio
-        # Move the last axis first, add a leading axis and expand it to x's batch size
-        # (presumably (h, w, c) -> (batch, c, h, w); confirm against position_grid_to_embed).
-        embed = embed.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
-        return x + embed
-    def scratch_forward(self, features: List[torch.Tensor]) -> torch.Tensor:
-        """
-        Fuse four feature maps through the refinement blocks, deepest layer first.
-        Args:
-            features (List[Tensor]): Exactly four feature maps, ordered layer 1 to layer 4.
-        Returns:
-            Tensor: Fused feature map after ``scratch.output_conv1``.
-        """
-        feat_1, feat_2, feat_3, feat_4 = features
-        scratch = self.scratch
-        rn_1 = scratch.layer1_rn(feat_1)
-        rn_2 = scratch.layer2_rn(feat_2)
-        rn_3 = scratch.layer3_rn(feat_3)
-        rn_4 = scratch.layer4_rn(feat_4)
-        # Start from layer 4 and merge in one shallower layer per step, resizing to
-        # the next layer's spatial size; local references are dropped right after use.
-        fused = scratch.refinenet4(rn_4, size=rn_3.shape[2:])
-        del rn_4, feat_4
-        fused = scratch.refinenet3(fused, rn_3, size=rn_2.shape[2:])
-        del rn_3, feat_3
-        fused = scratch.refinenet2(fused, rn_2, size=rn_1.shape[2:])
-        del rn_2, feat_2
-        fused = scratch.refinenet1(fused, rn_1)
-        del rn_1, feat_1
-        return scratch.output_conv1(fused)
-################################################################################
-# Modules
-################################################################################
-def _make_fusion_block(
-    features: int,
-    size: Optional[int] = None,
-    has_residual: bool = True,
-    groups: int = 1,
-) -> nn.Module:
-    """Build a ``FeatureFusionBlock`` with the fixed settings used by this head.
-    Args:
-        features (int): Channel count handed to the block.
-        size (int, optional): Default output size stored on the block.
-        has_residual (bool): Whether the block merges a second input.
-        groups (int): ``groups`` argument for the block's convolutions.
-    Returns:
-        nn.Module: A block using an in-place ReLU, with deconv, batch norm and
-        expand switched off and ``align_corners=True``.
-    """
-    fixed_settings = dict(deconv=False, bn=False, expand=False, align_corners=True)
-    return FeatureFusionBlock(
-        features,
-        nn.ReLU(inplace=True),
-        size=size,
-        has_residual=has_residual,
-        groups=groups,
-        **fixed_settings,
-    )
-def _make_scratch(
-    in_shape: List[int], out_shape: int, groups: int = 1, expand: bool = False
-) -> nn.Module:
-    """Create a bare module holding one bias-free 3x3 conv per input level.
-    Args:
-        in_shape (List[int]): Input channel count per level; three levels are always
-            built, and a fourth one when at least four entries are given.
-        out_shape (int): Base output channel count.
-        groups (int): ``groups`` argument of every convolution.
-        expand (bool): When truthy, level ``i`` (0-based) outputs ``out_shape * 2**i``
-            channels; otherwise every level outputs ``out_shape`` channels.
-    Returns:
-        nn.Module: Module with attributes ``layer1_rn`` .. ``layer3_rn`` (and
-        ``layer4_rn`` when a fourth level exists).
-    """
-    scratch = nn.Module()
-    multipliers = (1, 2, 4, 8) if expand else (1, 1, 1, 1)
-    num_levels = 4 if len(in_shape) >= 4 else 3
-    # Levels are created in ascending order, so registration order is layer1..layer4.
-    for level in range(num_levels):
-        conv = nn.Conv2d(
-            in_shape[level],
-            out_shape * multipliers[level],
-            kernel_size=3,
-            stride=1,
-            padding=1,
-            bias=False,
-            groups=groups,
-        )
-        setattr(scratch, f"layer{level + 1}_rn", conv)
-    return scratch
-class ResidualConvUnit(nn.Module):
-    """Residual unit: (activation -> 3x3 conv) twice, plus a skip connection."""
-    def __init__(self, features, activation, bn, groups=1):
-        """Create the unit.
-        Args:
-            features (int): channel count kept by both convolutions
-            activation (callable): applied before each convolution
-            bn: stored as ``self.bn``; no normalization layer is built from it here
-            groups (int): ``groups`` argument of both convolutions
-        """
-        super().__init__()
-        self.bn = bn
-        self.groups = groups
-        conv_settings = dict(
-            kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
-        )
-        self.conv1 = nn.Conv2d(features, features, **conv_settings)
-        self.conv2 = nn.Conv2d(features, features, **conv_settings)
-        # Normalization slots stay empty; forward() skips them while they are None.
-        self.norm1 = None
-        self.norm2 = None
-        self.activation = activation
-    def forward(self, x):
-        """Apply both conv stages and add the input back.
-        Args:
-            x (tensor): input
-        Returns:
-            tensor: output with the same shape as ``x``
-        """
-        # NOTE(review): if ``activation`` works in place (the fusion-block factory in
-        # this file passes nn.ReLU(inplace=True)), ``x`` has already been modified by
-        # the time it is added back below - confirm this is intended.
-        out = x
-        for conv, norm in ((self.conv1, self.norm1), (self.conv2, self.norm2)):
-            out = conv(self.activation(out))
-            if norm is not None:
-                out = norm(out)
-        return out + x
-class FeatureFusionBlock(nn.Module):
-    """Feature fusion block: optional residual merge, refinement, resize, 1x1 conv."""
-    def __init__(
-        self,
-        features,
-        activation,
-        deconv=False,
-        bn=False,
-        expand=False,
-        align_corners=True,
-        size=None,
-        has_residual=True,
-        groups=1,
-    ):
-        """Init.
-        Args:
-            features (int): number of input features
-            activation: activation handed to the residual conv units
-            deconv (bool): stored as ``self.deconv``; not otherwise used in this class
-            bn (bool): forwarded to the residual conv units
-            expand (bool): when set, the output conv halves the channel count
-            align_corners (bool): forwarded to the interpolation in ``forward``
-            size: default output size used by ``forward`` when no call-time size is given
-            has_residual (bool): whether ``forward`` merges a second input via ``resConfUnit1``
-            groups (int): ``groups`` argument of the output conv and the residual units
-        """
-        super().__init__()
-        self.deconv = deconv
-        self.align_corners = align_corners
-        self.groups = groups
-        self.expand = expand
-        out_features = features // 2 if self.expand else features
-        self.out_conv = nn.Conv2d(
-            features,
-            out_features,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            bias=True,
-            groups=self.groups,
-        )
-        # resConfUnit1 only exists when a second input will be merged in forward().
-        if has_residual:
-            self.resConfUnit1 = ResidualConvUnit(
-                features, activation, bn, groups=self.groups
-            )
-        self.has_residual = has_residual
-        self.resConfUnit2 = ResidualConvUnit(
-            features, activation, bn, groups=self.groups
-        )
-        self.size = size
-    def forward(self, *xs, size=None):
-        """Forward pass.
-        Args:
-            *xs (tensor): ``xs[0]`` is the running feature map; ``xs[1]`` is the map to
-                merge in and is required when the block was built with ``has_residual``
-            size: output spatial size for this call; falls back to ``self.size``
-        Returns:
-            tensor: output
-        """
-        output = xs[0]
-        if self.has_residual:
-            output = output + self.resConfUnit1(xs[1])
-        output = self.resConfUnit2(output)
-        # A call-time size wins over the constructor size; with neither, upsample 2x.
-        target_size = self.size if size is None else size
-        if target_size is None:
-            modifier = {"scale_factor": 2}
-        else:
-            modifier = {"size": target_size}
-        output = custom_interpolate(
-            output, **modifier, mode="bilinear", align_corners=self.align_corners
-        )
-        return self.out_conv(output)
-def custom_interpolate(
-    x: torch.Tensor,
-    size: Optional[Tuple[int, int]] = None,
-    scale_factor: Optional[float] = None,
-    mode: str = "bilinear",
-    align_corners: bool = True,
-) -> torch.Tensor:
-    """
-    Custom interpolate to avoid INT_MAX issues in nn.functional.interpolate.
-    When the output would hold more elements than the internal threshold, ``x`` is
-    split along dim 0, each piece is interpolated separately and the results are
-    concatenated again.
-    Args:
-        x (Tensor): Input whose last two dims are resized.
-        size (Tuple[int, int], optional): Target (height, width); takes precedence
-            over ``scale_factor``.
-        scale_factor (float, optional): Multiplier applied to the last two dims of
-            ``x`` when ``size`` is not given.
-        mode (str): Interpolation mode forwarded to ``nn.functional.interpolate``.
-        align_corners (bool): Forwarded to ``nn.functional.interpolate``.
-    Returns:
-        Tensor: The resized tensor.
-    Raises:
-        ValueError: If neither ``size`` nor ``scale_factor`` is given.
-    """
-    if size is None:
-        if scale_factor is None:
-            raise ValueError("either size or scale_factor must be provided")
-        size = (int(x.shape[-2] * scale_factor), int(x.shape[-1] * scale_factor))
-    # Threshold is 1.5 * 2**30, i.e. below the signed 32-bit limit of 2**31 - 1.
-    INT_MAX = 1610612736
-    # Element count of the interpolated result (dim 0 * dim 1 * target height * width).
-    output_elements = size[0] * size[1] * x.shape[0] * x.shape[1]
-    if output_elements <= INT_MAX:
-        return nn.functional.interpolate(
-            x, size=size, mode=mode, align_corners=align_corners
-        )
-    chunks = torch.chunk(x, chunks=(output_elements // INT_MAX) + 1, dim=0)
-    interpolated_chunks = [
-        nn.functional.interpolate(
-            chunk, size=size, mode=mode, align_corners=align_corners
-        )
-        for chunk in chunks
-    ]
-    return torch.cat(interpolated_chunks, dim=0).contiguous()

FastVGGT/vggt/heads/head_act.py DELETED Viewed

@@ -1,125 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import torch
-import torch.nn.functional as F
-def activate_pose(pred_pose_enc, trans_act="linear", quat_act="linear", fl_act="linear"):
-    """
-    Activate each component of an encoded pose with its own activation.
-    Args:
-        pred_pose_enc: Tensor containing encoded pose parameters [translation, quaternion, focal length]
-        trans_act: Activation type for translation component
-        quat_act: Activation type for quaternion component
-        fl_act: Activation type for focal length component
-    Returns:
-        Activated pose parameters tensor, re-assembled in the same component order
-    """
-    # Layout along the last dim: [0:3] translation, [3:7] quaternion, [7:] focal length (or fov).
-    components = (
-        (pred_pose_enc[..., :3], trans_act),
-        (pred_pose_enc[..., 3:7], quat_act),
-        (pred_pose_enc[..., 7:], fl_act),
-    )
-    activated = [base_pose_act(part, act) for part, act in components]
-    return torch.cat(activated, dim=-1)
-def base_pose_act(pose_enc, act_type="linear"):
-    """
-    Apply one named activation to encoded pose parameters.
-    Args:
-        pose_enc: Tensor containing encoded pose parameters
-        act_type: Activation type ("linear", "inv_log", "exp", "relu")
-    Returns:
-        Activated pose parameters ("linear" returns the input tensor itself)
-    Raises:
-        ValueError: If ``act_type`` is not one of the supported names
-    """
-    if act_type == "linear":
-        return pose_enc
-    if act_type == "inv_log":
-        return inverse_log_transform(pose_enc)
-    if act_type == "exp":
-        return torch.exp(pose_enc)
-    if act_type == "relu":
-        return F.relu(pose_enc)
-    raise ValueError(f"Unknown act_type: {act_type}")
-def activate_head(out, activation="norm_exp", conf_activation="expp1"):
-    """
-    Process network output to extract 3D points and confidence values.
-    Args:
-        out: Network output tensor (B, C, H, W); the last channel is the confidence,
-            the first C-1 channels are the point coordinates
-        activation: Activation type for 3D points ("norm_exp", "norm", "exp", "relu",
-            "inv_log", "xy_inv_log", "sigmoid", "linear"); "xy_inv_log" needs exactly
-            three point channels
-        conf_activation: Activation type for confidence values ("expp1", "expp0", "sigmoid")
-    Returns:
-        Tuple of (3D points tensor (B, H, W, C-1), confidence tensor (B, H, W))
-    Raises:
-        ValueError: If ``activation`` or ``conf_activation`` is not a supported name
-    """
-    # Move channels to the last dimension: (B, C, H, W) => (B, H, W, C)
-    fmap = out.permute(0, 2, 3, 1)
-    # Split into xyz (first C-1 channels) and confidence (last channel)
-    xyz = fmap[:, :, :, :-1]
-    conf = fmap[:, :, :, -1]
-    if activation == "norm_exp":
-        d = xyz.norm(dim=-1, keepdim=True).clamp(min=1e-8)
-        xyz_normed = xyz / d
-        pts3d = xyz_normed * torch.expm1(d)
-    elif activation == "norm":
-        # Clamp like the "norm_exp" branch so zero vectors map to zeros instead of NaN.
-        pts3d = xyz / xyz.norm(dim=-1, keepdim=True).clamp(min=1e-8)
-    elif activation == "exp":
-        pts3d = torch.exp(xyz)
-    elif activation == "relu":
-        pts3d = F.relu(xyz)
-    elif activation == "inv_log":
-        pts3d = inverse_log_transform(xyz)
-    elif activation == "xy_inv_log":
-        # z is activated first, then used to scale the xy channels.
-        xy, z = xyz.split([2, 1], dim=-1)
-        z = inverse_log_transform(z)
-        pts3d = torch.cat([xy * z, z], dim=-1)
-    elif activation == "sigmoid":
-        pts3d = torch.sigmoid(xyz)
-    elif activation == "linear":
-        pts3d = xyz
-    else:
-        raise ValueError(f"Unknown activation: {activation}")
-    if conf_activation == "expp1":
-        conf_out = 1 + conf.exp()
-    elif conf_activation == "expp0":
-        conf_out = conf.exp()
-    elif conf_activation == "sigmoid":
-        conf_out = torch.sigmoid(conf)
-    else:
-        raise ValueError(f"Unknown conf_activation: {conf_activation}")
-    return pts3d, conf_out
-def inverse_log_transform(y):
-    """
-    Sign-preserving inverse log transform: sign(y) * (exp(|y|) - 1)
-    Args:
-        y: Input tensor
-    Returns:
-        Transformed tensor with the same shape as ``y``
-    """
-    magnitude = torch.expm1(torch.abs(y))
-    return torch.sign(y) * magnitude