diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/LICENSE b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d9b84b1a65f9db6d8920a9048d162f52ba3ea56d --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/LICENSE @@ -0,0 +1,52 @@ +CroCo, Copyright (c) 2022-present Naver Corporation, is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 license. + +A summary of the CC BY-NC-SA 4.0 license is located here: + https://creativecommons.org/licenses/by-nc-sa/4.0/ + +The CC BY-NC-SA 4.0 license is located here: + https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode + + +SEE NOTICE BELOW WITH RESPECT TO THE FILE: models/pos_embed.py, models/blocks.py + +*************************** + +NOTICE WITH RESPECT TO THE FILE: models/pos_embed.py + +This software is being redistributed in a modifiled form. The original form is available here: + +https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py + +This software in this file incorporates parts of the following software available here: + +Transformer: https://github.com/tensorflow/models/blob/master/official/legacy/transformer/model_utils.py +available under the following license: https://github.com/tensorflow/models/blob/master/LICENSE + +MoCo v3: https://github.com/facebookresearch/moco-v3 +available under the following license: https://github.com/facebookresearch/moco-v3/blob/main/LICENSE + +DeiT: https://github.com/facebookresearch/deit +available under the following license: https://github.com/facebookresearch/deit/blob/main/LICENSE + + +ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCE BELOW: + +https://github.com/facebookresearch/mae/blob/main/LICENSE + +Attribution-NonCommercial 4.0 International + +*************************** + +NOTICE WITH RESPECT TO THE FILE: models/blocks.py + +This software is being redistributed in a modifiled form. 
The original form is available here: + +https://github.com/rwightman/pytorch-image-models + +ORIGINAL COPYRIGHT NOTICE AND PERMISSION NOTICE AVAILABLE HERE IS REPRODUCE BELOW: + +https://github.com/rwightman/pytorch-image-models/blob/master/LICENSE + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ \ No newline at end of file diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/NOTICE b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/NOTICE new file mode 100644 index 0000000000000000000000000000000000000000..d51bb365036c12d428d6e3a4fd00885756d5261c --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/NOTICE @@ -0,0 +1,21 @@ +CroCo +Copyright 2022-present NAVER Corp. + +This project contains subcomponents with separate copyright notices and license terms. +Your use of the source code for these subcomponents is subject to the terms and conditions of the following licenses. + +==== + +facebookresearch/mae +https://github.com/facebookresearch/mae + +Attribution-NonCommercial 4.0 International + +==== + +rwightman/pytorch-image-models +https://github.com/rwightman/pytorch-image-models + +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ \ No newline at end of file diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/README.MD b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..38e33b001a60bd16749317fb297acd60f28a6f1b --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/README.MD @@ -0,0 +1,124 @@ +# CroCo + CroCo v2 / CroCo-Stereo / CroCo-Flow + +[[`CroCo arXiv`](https://arxiv.org/abs/2210.10716)] [[`CroCo v2 arXiv`](https://arxiv.org/abs/2211.10408)] [[`project page and demo`](https://croco.europe.naverlabs.com/)] + +This repository contains the code for our CroCo model presented in our NeurIPS'22 paper [CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View 
Completion](https://openreview.net/pdf?id=wZEfHUM5ri) and its follow-up extension published at ICCV'23 [Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow](https://openaccess.thecvf.com/content/ICCV2023/html/Weinzaepfel_CroCo_v2_Improved_Cross-view_Completion_Pre-training_for_Stereo_Matching_and_ICCV_2023_paper.html), referred to as CroCo v2: + +![image](assets/arch.jpg) + +```bibtex +@inproceedings{croco, + title={{CroCo: Self-Supervised Pre-training for 3D Vision Tasks by Cross-View Completion}}, + author={Weinzaepfel, Philippe and Leroy, Vincent and Lucas, Thomas and Br\'egier, Romain and Cabon, Yohann and Arora, Vaibhav and Antsfeld, Leonid and Chidlovskii, Boris and Csurka, Gabriela and Revaud, J\'er\^ome}, + booktitle={{NeurIPS}}, + year={2022} +} + +@inproceedings{croco_v2, + title={{CroCo v2: Improved Cross-view Completion Pre-training for Stereo Matching and Optical Flow}}, + author={Weinzaepfel, Philippe and Lucas, Thomas and Leroy, Vincent and Cabon, Yohann and Arora, Vaibhav and Br{\'e}gier, Romain and Csurka, Gabriela and Antsfeld, Leonid and Chidlovskii, Boris and Revaud, J{\'e}r{\^o}me}, + booktitle={ICCV}, + year={2023} +} +``` + +## License + +The code is distributed under the CC BY-NC-SA 4.0 License. See [LICENSE](LICENSE) for more information. +Some components are based on code from [MAE](https://github.com/facebookresearch/mae) released under the CC BY-NC 4.0 License and [timm](https://github.com/rwightman/pytorch-image-models) released under the Apache 2.0 License. +Some components for stereo matching and optical flow are based on code from [unimatch](https://github.com/autonomousvision/unimatch) released under the MIT license. + +## Preparation + +1. Install dependencies on a machine with an NVIDIA GPU using e.g. conda. Note that `habitat-sim` is required only for the interactive demo and the synthetic pre-training data generation.
If you don't plan to use it, you can ignore the line installing it and use a more recent python version. + +```bash +conda create -n croco python=3.7 cmake=3.14.0 +conda activate croco +conda install habitat-sim headless -c conda-forge -c aihabitat +conda install pytorch torchvision -c pytorch +conda install notebook ipykernel matplotlib +conda install ipywidgets widgetsnbextension +conda install scikit-learn tqdm quaternion opencv # only for pretraining / habitat data generation + +``` + +2. Compile cuda kernels for RoPE + +CroCo v2 relies on RoPE positional embeddings for which you need to compile some cuda kernels. +```bash +cd models/curope/ +python setup.py build_ext --inplace +cd ../../ +``` + +This can take a while, as we compile for all cuda architectures; feel free to update line 9 of `models/curope/setup.py` to compile for specific architectures only. +You might also need to set the environment variable `CUDA_HOME` in case you use a custom cuda installation. + +In case you cannot compile the kernels, we also provide a slower pytorch version, which will be loaded automatically. + +3. Download pre-trained model + +We provide several pre-trained models: + +| modelname | pre-training data | pos. embed.
| Encoder | Decoder | +|------------------------------------------------------------------------------------------------------------------------------------|-------------------|-------------|---------|---------| +| [`CroCo.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth) | Habitat | cosine | ViT-B | Small | +| [`CroCo_V2_ViTBase_SmallDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_SmallDecoder.pth) | Habitat + real | RoPE | ViT-B | Small | +| [`CroCo_V2_ViTBase_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTBase_BaseDecoder.pth) | Habitat + real | RoPE | ViT-B | Base | +| [`CroCo_V2_ViTLarge_BaseDecoder.pth`](https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo_V2_ViTLarge_BaseDecoder.pth) | Habitat + real | RoPE | ViT-L | Base | + +To download a specific model, i.e., the first one (`CroCo.pth`) +```bash +mkdir -p pretrained_models/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/CroCo.pth -P pretrained_models/ +``` + +## Reconstruction example + +Simply run after downloading the `CroCo_V2_ViTLarge_BaseDecoder` pretrained model (or update the corresponding line in `demo.py`) +```bash +python demo.py +``` + +## Interactive demonstration of cross-view completion reconstruction on the Habitat simulator + +First download the test scene from Habitat: +```bash +python -m habitat_sim.utils.datasets_download --uids habitat_test_scenes --data-path habitat-sim-data/ +``` + +Then, run the Notebook demo `interactive_demo.ipynb`. + +In this demo, you should be able to sample a random reference viewpoint from an [Habitat](https://github.com/facebookresearch/habitat-sim) test scene. Use the sliders to change viewpoint and select a masked target view to reconstruct using CroCo. 
+![croco_interactive_demo](https://user-images.githubusercontent.com/1822210/200516576-7937bc6a-55f8-49ed-8618-3ddf89433ea4.jpg) + +## Pre-training + +### CroCo + +To pre-train CroCo, please first generate the pre-training data from the Habitat simulator, following the instructions in [datasets/habitat_sim/README.MD](datasets/habitat_sim/README.MD) and then run the following command: +``` +torchrun --nproc_per_node=4 pretrain.py --output_dir ./output/pretraining/ +``` + +Our CroCo pre-training was launched on a single server with 4 GPUs. +It should take around 10 days with A100 or 15 days with V100 to do the 400 pre-training epochs, but decent performance is obtained earlier in training. +Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not verify experimentally whether it is valid in our case. +The first run can take a few minutes to start, as it has to parse all available pre-training pairs. + +### CroCo v2 + +For CroCo v2 pre-training, in addition to the generation of the pre-training data from the Habitat simulator above, please pre-extract the crops from the real datasets following the instructions in [datasets/crops/README.MD](datasets/crops/README.MD). +Then, run the following command for the largest model (ViT-L encoder, Base decoder): +``` +torchrun --nproc_per_node=8 pretrain.py --model "CroCoNet(enc_embed_dim=1024, enc_depth=24, enc_num_heads=16, dec_embed_dim=768, dec_num_heads=12, dec_depth=12, pos_embed='RoPE100')" --dataset "habitat_release+ARKitScenes+MegaDepth+3DStreetView+IndoorVL" --warmup_epochs 12 --max_epoch 125 --epochs 250 --amp 0 --keep_freq 5 --output_dir ./output/pretraining_crocov2/ +``` + +Our CroCo v2 pre-training was launched on a single server with 8 GPUs for the largest model, and on a single server with 4 GPUs for the smaller ones, keeping a batch size of 64 per gpu in all cases. +The largest model should take around 12 days on A100.
+Note that, while the code contains the same scaling rule of the learning rate as MAE when changing the effective batch size, we did not verify experimentally whether it is valid in our case. + +## Stereo matching and Optical flow downstream tasks + +For CroCo-Stereo and CroCo-Flow, please refer to [stereoflow/README.MD](stereoflow/README.MD). diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/croco-stereo-flow-demo.ipynb b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/croco-stereo-flow-demo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2b00a7607ab5f82d1857041969bfec977e56b3e0 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/croco-stereo-flow-demo.ipynb @@ -0,0 +1,191 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9bca0f41", + "metadata": {}, + "source": [ + "# Simple inference example with CroCo-Stereo or CroCo-Flow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80653ef7", + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright (C) 2022-present Naver Corporation. All rights reserved.\n", + "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)."
+ ] + }, + { + "cell_type": "markdown", + "id": "4f033862", + "metadata": {}, + "source": [ + "First download the model(s) of your choice by running\n", + "```\n", + "bash stereoflow/download_model.sh crocostereo.pth\n", + "bash stereoflow/download_model.sh crocoflow.pth\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fb2e392", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n", + "device = torch.device('cuda:0' if use_gpu else 'cpu')\n", + "import matplotlib.pylab as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0e25d77", + "metadata": {}, + "outputs": [], + "source": [ + "from stereoflow.test import _load_model_and_criterion\n", + "from stereoflow.engine import tiled_pred\n", + "from stereoflow.datasets_stereo import img_to_tensor, vis_disparity\n", + "from stereoflow.datasets_flow import flowToColor\n", + "tile_overlap=0.7 # recommended value, higher value can be slightly better but slower" + ] + }, + { + "cell_type": "markdown", + "id": "86a921f5", + "metadata": {}, + "source": [ + "### CroCo-Stereo example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64e483cb", + "metadata": {}, + "outputs": [], + "source": [ + "image1 = np.asarray(Image.open(''))\n", + "image2 = np.asarray(Image.open(''))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0d04303", + "metadata": {}, + "outputs": [], + "source": [ + "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocostereo.pth', None, device)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47dc14b5", + "metadata": {}, + "outputs": [], + "source": [ + "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n", + "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n", + "with torch.inference_mode():\n", + " pred, _, _ = tiled_pred(model, None, 
im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n", + "pred = pred.squeeze(0).squeeze(0).cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "583b9f16", + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(vis_disparity(pred))\n", + "plt.axis('off')" + ] + }, + { + "cell_type": "markdown", + "id": "d2df5d70", + "metadata": {}, + "source": [ + "### CroCo-Flow example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ee257a7", + "metadata": {}, + "outputs": [], + "source": [ + "image1 = np.asarray(Image.open(''))\n", + "image2 = np.asarray(Image.open(''))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5edccf0", + "metadata": {}, + "outputs": [], + "source": [ + "model, _, cropsize, with_conf, task, tile_conf_mode = _load_model_and_criterion('stereoflow_models/crocoflow.pth', None, device)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b19692c3", + "metadata": {}, + "outputs": [], + "source": [ + "im1 = img_to_tensor(image1).to(device).unsqueeze(0)\n", + "im2 = img_to_tensor(image2).to(device).unsqueeze(0)\n", + "with torch.inference_mode():\n", + " pred, _, _ = tiled_pred(model, None, im1, im2, None, conf_mode=tile_conf_mode, overlap=tile_overlap, crop=cropsize, with_conf=with_conf, return_time=False)\n", + "pred = pred.squeeze(0).permute(1,2,0).cpu().numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26f79db3", + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(flowToColor(pred))\n", + "plt.axis('off')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/__init__.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/crops/README.MD b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/crops/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..47ddabebb177644694ee247ae878173a3a16644f --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/crops/README.MD @@ -0,0 +1,104 @@ +## Generation of crops from the real datasets + +The instructions below allow to generate the crops used for pre-training CroCo v2 from the following real-world datasets: ARKitScenes, MegaDepth, 3DStreetView and IndoorVL. + +### Download the metadata of the crops to generate + +First, download the metadata and put them in `./data/`: +``` +mkdir -p data +cd data/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/crop_metadata.zip +unzip crop_metadata.zip +rm crop_metadata.zip +cd .. +``` + +### Prepare the original datasets + +Second, download the original datasets in `./data/original_datasets/`. +``` +mkdir -p data/original_datasets +``` + +##### ARKitScenes + +Download the `raw` dataset from https://github.com/apple/ARKitScenes/blob/main/DATA.md and put it in `./data/original_datasets/ARKitScenes/`. +The resulting file structure should be like: +``` +./data/original_datasets/ARKitScenes/ +└───Training + └───40753679 + │ │ ultrawide + │ │ ... + └───40753686 + │ + ... +``` + +##### MegaDepth + +Download `MegaDepth v1 Dataset` from https://www.cs.cornell.edu/projects/megadepth/ and put it in `./data/original_datasets/MegaDepth/`. 
+The resulting file structure should be like: + +``` +./data/original_datasets/MegaDepth/ +└───0000 +│ └───images +│ │ │ 1000557903_87fa96b8a4_o.jpg +│ │ └ ... +│ └─── ... +└───0001 +│ │ +│ └ ... +└─── ... +``` + +##### 3DStreetView + +Download `3D_Street_View` dataset from https://github.com/amir32002/3D_Street_View and put it in `./data/original_datasets/3DStreetView/`. +The resulting file structure should be like: + +``` +./data/original_datasets/3DStreetView/ +└───dataset_aligned +│ └───0002 +│ │ │ 0000002_0000001_0000002_0000001.jpg +│ │ └ ... +│ └─── ... +└───dataset_unaligned +│ └───0003 +│ │ │ 0000003_0000001_0000002_0000001.jpg +│ │ └ ... +│ └─── ... +``` + +##### IndoorVL + +Download the `IndoorVL` datasets using [Kapture](https://github.com/naver/kapture). + +``` +pip install kapture +mkdir -p ./data/original_datasets/IndoorVL +cd ./data/original_datasets/IndoorVL +kapture_download_dataset.py update +kapture_download_dataset.py install "HyundaiDepartmentStore_*" +kapture_download_dataset.py install "GangnamStation_*" +cd - +``` + +### Extract the crops + +Now, extract the crops for each of the dataset: +``` +for dataset in ARKitScenes MegaDepth 3DStreetView IndoorVL; +do + python3 datasets/crops/extract_crops_from_images.py --crops ./data/crop_metadata/${dataset}/crops_release.txt --root-dir ./data/original_datasets/${dataset}/ --output-dir ./data/${dataset}_crops/ --imsize 256 --nthread 8 --max-subdir-levels 5 --ideal-number-pairs-in-dir 500; +done +``` + +##### Note for IndoorVL + +Due to some legal issues, we can only release 144,228 pairs out of the 1,593,689 pairs used in the paper. +To account for it in terms of number of pre-training iterations, the pre-training command in this repository uses 125 training epochs including 12 warm-up epochs and learning rate cosine schedule of 250, instead of 100, 10 and 200 respectively. +The impact on the performance is negligible. 
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# Extracting crops for pre-training
# --------------------------------------------------------

import os
import argparse
from tqdm import tqdm
from PIL import Image
import functools
from multiprocessing import Pool
import math


def arg_parser():
    """Build the command-line parser of the crop extraction script."""
    parser = argparse.ArgumentParser(
        "Generate cropped image pairs from image crop list"
    )

    parser.add_argument("--crops", type=str, required=True, help="crop file")
    parser.add_argument("--root-dir", type=str, required=True, help="root directory")
    parser.add_argument(
        "--output-dir", type=str, required=True, help="output directory"
    )
    parser.add_argument("--imsize", type=int, default=256, help="size of the crops")
    parser.add_argument(
        "--nthread", type=int, required=True, help="number of simultaneous threads"
    )
    parser.add_argument(
        "--max-subdir-levels",
        type=int,
        default=5,
        help="maximum number of subdirectories",
    )
    parser.add_argument(
        "--ideal-number-pairs-in-dir",
        type=int,
        default=500,
        help="number of pairs stored in a dir",
    )
    return parser


def _directory_layout(num_crops, ideal_number_pairs_in_dir, max_subdir_levels):
    """Return ``(num_levels, num_pairs_in_dir)`` describing the output tree.

    ``num_levels`` is always at least 1. Degenerate inputs (0 or 1 crop, or a
    base of 1 or less for the logarithm) fall back to a single flat level
    instead of failing with a math domain error or a division by zero.
    """
    if num_crops <= 1 or ideal_number_pairs_in_dir <= 1:
        return 1, max(num_crops, 1)

    num_levels = min(
        math.ceil(math.log(num_crops, ideal_number_pairs_in_dir)),
        max_subdir_levels,
    )
    # --max-subdir-levels may be 0 (or negative): keep at least one level so
    # that the root below is well defined.
    num_levels = max(num_levels, 1)
    num_pairs_in_dir = math.ceil(num_crops ** (1 / num_levels))
    return num_levels, num_pairs_in_dir


def main(args):
    """Extract every crop listed in ``args.crops`` and write ``listing.txt``.

    The listing is written to ``args.output_dir`` and contains one line per
    saved crop pair (the path stem returned by ``save_image_crops``).
    """
    listing_path = os.path.join(args.output_dir, "listing.txt")

    print(f"Loading list of crops ... ({args.nthread} threads)")
    crops, num_crops_to_generate = load_crop_file(args.crops)

    print(f"Preparing jobs ({len(crops)} candidate image pairs)...")
    num_levels, num_pairs_in_dir = _directory_layout(
        num_crops_to_generate,
        args.ideal_number_pairs_in_dir,
        args.max_subdir_levels,
    )

    jobs = prepare_jobs(crops, num_levels, num_pairs_in_dir)
    del crops

    os.makedirs(args.output_dir, exist_ok=True)
    call = functools.partial(save_image_crops, args)
    # Only spawn worker processes when parallelism was actually requested.
    pool = Pool(args.nthread) if args.nthread > 1 else None
    mmap = pool.imap_unordered if pool is not None else map

    print(f"Generating cropped images to {args.output_dir} ...")
    try:
        with open(listing_path, "w") as listing:
            listing.write("# pair_path\n")
            for results in tqdm(mmap(call, jobs), total=len(jobs)):
                for path in results:
                    listing.write(f"{path}\n")
    finally:
        # Release the workers whether or not the extraction succeeded.
        if pool is not None:
            pool.terminate()
            pool.join()
    print("Finished writing listing to", listing_path)


def load_crop_file(path):
    """Parse a crop listing file.

    Lines starting with ``#`` and blank lines are ignored. A line with fewer
    than 8 ``", "``-separated fields declares an image pair
    (``img1, img2, rotation``); a line with 8 integer fields
    (``l1, r1, t1, b1, l2, r2, t2, b2``) adds a pair of crop rectangles to the
    most recently declared image pair.

    Returns:
        ``(pairs, num_crops_to_generate)`` where each entry of ``pairs`` is
        ``(img1, img2, rotation, [(rect1, rect2), ...])`` and every rectangle
        is stored as ``(left, top, right, bottom)``.

    Raises:
        ValueError: if a rectangle line appears before any image pair line, or
            if a line does not have the expected number of fields.
    """
    with open(path) as f:
        data = f.read().splitlines()

    pairs = []
    num_crops_to_generate = 0
    for line in tqdm(data):
        if not line.strip() or line.startswith("#"):
            continue
        fields = line.split(", ")
        if len(fields) < 8:
            img1, img2, rotation = fields
            pairs.append((img1, img2, int(rotation), []))
        else:
            if not pairs:
                raise ValueError(
                    f"crop rectangle listed before any image pair in {path}: {line!r}"
                )
            l1, r1, t1, b1, l2, r2, t2, b2 = map(int, fields)
            # Reorder to the (left, top, right, bottom) box used for cropping.
            rect1, rect2 = (l1, t1, r1, b1), (l2, t2, r2, b2)
            pairs[-1][-1].append((rect1, rect2))
            num_crops_to_generate += 1
    return pairs, num_crops_to_generate


def prepare_jobs(pairs, num_levels, num_pairs_in_dir):
    """Turn parsed image pairs into one job per image pair.

    Each job is ``((img1, img2), rotation, crops, paths)`` where ``paths``
    holds one relative output path stem per crop of that image pair.
    """
    jobs = []
    powers = [num_pairs_in_dir**level for level in reversed(range(num_levels))]

    def get_path(idx):
        # One hexadecimal component per directory level, followed by the
        # global crop index itself, which serves as the file stem.
        components = []
        remainder = idx
        for level in range(num_levels - 1):
            quotient, remainder = divmod(remainder, powers[level])
            components.append(quotient)
        components.append(idx)
        return "/".join(f"{component:x}" for component in components)

    idx = 0
    for img1, img2, rotation, crops in tqdm(pairs):
        if -60 <= rotation <= 60:
            rotation = 0  # most likely not a true rotation
        paths = [get_path(idx + k) for k in range(len(crops))]
        idx += len(crops)
        jobs.append(((img1, img2), rotation, crops, paths))
    return jobs


def load_image(path):
    """Load ``path`` as an RGB image.

    Raises:
        OSError: if the image cannot be opened or converted, whatever the
            underlying error was (it is kept as the exception cause).
    """
    try:
        # The context manager closes the underlying file once the pixel data
        # has been converted into a new image.
        with Image.open(path) as img:
            return img.convert("RGB")
    except Exception as e:
        print("skipping", path, e)
        raise OSError(f"cannot load image {path}") from e


def save_image_crops(args, data):
    """Crop, resize, rotate and save all crops of one image pair.

    ``data`` is one job produced by ``prepare_jobs``. Returns the list of path
    stems that were written, or an empty list when either source image cannot
    be loaded.

    Raises:
        FileExistsError: if an output file already exists; existing crops are
            never overwritten.
    """
    # load images
    img_pair, rot, crops, paths = data
    try:
        img1, img2 = [
            load_image(os.path.join(args.root_dir, impath)) for impath in img_pair
        ]
    except OSError:
        # load_image already reported the problem; skip this image pair.
        return []

    def area(sz):
        return sz[0] * sz[1]

    tgt_size = (args.imsize, args.imsize)
    transposes = {
        90: Image.Transpose.ROTATE_90,
        180: Image.Transpose.ROTATE_180,
        270: Image.Transpose.ROTATE_270,
    }

    def prepare_crop(img, rect, rot=0):
        # actual crop
        img = img.crop(rect)

        # resize to desired size; LANCZOS is used when the crop covers more
        # than 4 times the target area, BICUBIC otherwise
        interp = (
            Image.Resampling.LANCZOS
            if area(img.size) > 4 * area(tgt_size)
            else Image.Resampling.BICUBIC
        )
        img = img.resize(tgt_size, resample=interp)

        # rotate the image by the closest multiple of 90 degrees
        rot90 = (round(rot / 90) % 4) * 90
        if rot90 in transposes:
            img = img.transpose(transposes[rot90])
        return img

    results = []
    for (rect1, rect2), path in zip(crops, paths):
        # Only the second image of the pair is rotated.
        crop1 = prepare_crop(img1, rect1)
        crop2 = prepare_crop(img2, rect2, rot)

        fullpath1 = os.path.join(args.output_dir, path + "_1.jpg")
        fullpath2 = os.path.join(args.output_dir, path + "_2.jpg")
        os.makedirs(os.path.dirname(fullpath1), exist_ok=True)

        # Explicit check rather than assert, so it still runs under ``-O``.
        for fullpath in (fullpath1, fullpath2):
            if os.path.isfile(fullpath):
                raise FileExistsError(fullpath)
        crop1.save(fullpath1)
        crop2.save(fullpath2)
        results.append(path)

    return results


if __name__ == "__main__":
    args = arg_parser().parse_args()
    main(args)
0000000000000000000000000000000000000000..a505781ff9eb91bce7f1d189e848f8ba1c560940 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/habitat_sim/README.MD @@ -0,0 +1,76 @@ +## Generation of synthetic image pairs using Habitat-Sim + +These instructions allow to generate pre-training pairs from the Habitat simulator. +As we did not save metadata of the pairs used in the original paper, they are not strictly the same, but these data use the same setting and are equivalent. + +### Download Habitat-Sim scenes +Download Habitat-Sim scenes: +- Download links can be found here: https://github.com/facebookresearch/habitat-sim/blob/main/DATASETS.md +- We used scenes from the HM3D, habitat-test-scenes, Replica, ReplicaCad and ScanNet datasets. +- Please put the scenes under `./data/habitat-sim-data/scene_datasets/` following the structure below, or update manually paths in `paths.py`. +``` +./data/ +└──habitat-sim-data/ + └──scene_datasets/ + ├──hm3d/ + ├──gibson/ + ├──habitat-test-scenes/ + ├──replica_cad_baked_lighting/ + ├──replica_cad/ + ├──ReplicaDataset/ + └──scannet/ +``` + +### Image pairs generation +We provide metadata to generate reproducible images pairs for pretraining and validation. +Experiments described in the paper used similar data, but whose generation was not reproducible at the time. + +Specifications: +- 256x256 resolution images, with 60 degrees field of view . +- Up to 1000 image pairs per scene. +- Number of scenes considered/number of images pairs per dataset: + - Scannet: 1097 scenes / 985 209 pairs + - HM3D: + - hm3d/train: 800 / 800k pairs + - hm3d/val: 100 scenes / 100k pairs + - hm3d/minival: 10 scenes / 10k pairs + - habitat-test-scenes: 3 scenes / 3k pairs + - replica_cad_baked_lighting: 13 scenes / 13k pairs + +- Scenes from hm3d/val and hm3d/minival pairs were not used for the pre-training but kept for validation purposes. 
+ +Download metadata and extract it: +```bash +mkdir -p data/habitat_release_metadata/ +cd data/habitat_release_metadata/ +wget https://download.europe.naverlabs.com/ComputerVision/CroCo/data/habitat_release_metadata/multiview_habitat_metadata.tar.gz +tar -xvf multiview_habitat_metadata.tar.gz +cd ../.. +# Location of the metadata +METADATA_DIR="./data/habitat_release_metadata/multiview_habitat_metadata" +``` + +Generate image pairs from metadata: +- The following command will print a list of commandlines to generate image pairs for each scene: +```bash +# Target output directory +PAIRS_DATASET_DIR="./data/habitat_release/" +python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR +``` +- One can launch multiple of such commands in parallel e.g. using GNU Parallel: +```bash +python datasets/habitat_sim/generate_from_metadata_files.py --input_dir=$METADATA_DIR --output_dir=$PAIRS_DATASET_DIR | parallel -j 16 +``` + +## Metadata generation + +Image pairs were randomly sampled using the following commands, whose outputs contain randomness and are thus not exactly reproducible: +```bash +# Print commandlines to generate image pairs from the different scenes available. +PAIRS_DATASET_DIR=MY_CUSTOM_PATH +python datasets/habitat_sim/generate_multiview_images.py --list_commands --output_dir=$PAIRS_DATASET_DIR + +# Once a dataset is generated, pack metadata files for reproducibility. 
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

"""
Script to generate image pairs for a given scene reproducing poses provided in a metadata file.
"""
import os
from datasets.habitat_sim.multiview_habitat_sim_generator import (
    MultiviewHabitatSimGenerator,
)
from datasets.habitat_sim.paths import SCENES_DATASET
import argparse
import quaternion
import PIL.Image
import cv2
import json
from tqdm import tqdm


def generate_multiview_images_from_metadata(
    metadata_filename,
    output_dir,
    overload_params=None,
    scene_datasets_paths=None,
    exist_ok=False,
):
    """
    Generate images from a metadata file for reproducibility purposes.

    metadata_filename: JSON file holding the generator settings and, under
        "multiviews", the positions and orientations of the views to render.
    output_dir: directory receiving the images, the camera parameters and a
        copy of the (possibly rewritten) metadata.
    overload_params: optional mapping of metadata entries to override.
    scene_datasets_paths: optional mapping from a dataset label (path prefix)
        to its local path, used to relocate the scene files.
    exist_ok: forwarded to os.makedirs for output_dir.
    """
    if overload_params is None:
        overload_params = {}

    # Reorder paths by decreasing label length, so that the longest matching
    # label wins when testing whether a path starts with such a label.
    if scene_datasets_paths is not None:
        scene_datasets_paths = dict(
            sorted(scene_datasets_paths.items(), key=lambda x: len(x[0]), reverse=True)
        )

    with open(metadata_filename, "r") as f:
        input_metadata = json.load(f)
    metadata = dict()
    for key, value in input_metadata.items():
        # Optionally replace some paths
        if key in ("scene_dataset_config_file", "scene", "navmesh") and value != "":
            if scene_datasets_paths is not None:
                for dataset_label, dataset_path in scene_datasets_paths.items():
                    if value.startswith(dataset_label):
                        value = os.path.normpath(
                            os.path.join(
                                dataset_path, os.path.relpath(value, dataset_label)
                            )
                        )
                        break
        metadata[key] = value

    # Overload some parameters
    metadata.update(overload_params)

    # Every remaining entry is forwarded to the generator constructor.
    generation_entries = {
        key: value
        for key, value in metadata.items()
        if key not in ("multiviews", "output_dir", "generate_depth")
    }
    generate_depth = metadata["generate_depth"]

    os.makedirs(output_dir, exist_ok=exist_ok)

    generator = MultiviewHabitatSimGenerator(**generation_entries)
    try:
        # Generate views
        for idx_label, data in tqdm(metadata["multiviews"].items()):
            positions = data["positions"]
            orientations = data["orientations"]
            for oidx, (position, orientation) in enumerate(
                zip(positions, orientations)
            ):
                observation = generator.render_viewpoint(
                    position, quaternion.from_float_array(orientation)
                )
                observation_label = f"{oidx + 1}"  # view labels are 1-based
                # Color image saved using PIL (first three channels only)
                img = PIL.Image.fromarray(observation["color"][:, :, :3])
                filename = os.path.join(
                    output_dir, f"{idx_label}_{observation_label}.jpeg"
                )
                img.save(filename)
                if generate_depth:
                    # Depth image as EXR file
                    filename = os.path.join(
                        output_dir, f"{idx_label}_{observation_label}_depth.exr"
                    )
                    cv2.imwrite(
                        filename,
                        observation["depth"],
                        [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF],
                    )
                    # Camera parameters
                    camera_params = {
                        key: observation[key].tolist()
                        for key in (
                            "camera_intrinsics",
                            "R_cam2world",
                            "t_cam2world",
                        )
                    }
                    filename = os.path.join(
                        output_dir,
                        f"{idx_label}_{observation_label}_camera_params.json",
                    )
                    with open(filename, "w") as f:
                        json.dump(camera_params, f)
        # Save metadata last, once every view has been written.
        with open(os.path.join(output_dir, "metadata.json"), "w") as f:
            json.dump(metadata, f)
    finally:
        # Always release the simulator, even when rendering or saving fails.
        generator.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--metadata_filename", required=True)
    parser.add_argument("--output_dir", required=True)
    args = parser.parse_args()

    generate_multiview_images_from_metadata(
        metadata_filename=args.metadata_filename,
        output_dir=args.output_dir,
        scene_datasets_paths=SCENES_DATASET,
        overload_params=dict(),
        exist_ok=True,
    )
to setup environment.", + ) + args = parser.parse_args() + + input_metadata_filenames = glob.iglob( + f"{args.input_dir}/**/metadata.json", recursive=True + ) + + for metadata_filename in tqdm(input_metadata_filenames): + output_dir = os.path.join( + args.output_dir, + os.path.relpath(os.path.dirname(metadata_filename), args.input_dir), + ) + # Do not process the scene if the metadata file already exists + if os.path.exists(os.path.join(output_dir, "metadata.json")): + continue + commandline = f"{args.prefix}python datasets/habitat_sim/generate_from_metadata.py --metadata_filename={metadata_filename} --output_dir={output_dir}" + print(commandline) diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/habitat_sim/generate_multiview_images.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/habitat_sim/generate_multiview_images.py new file mode 100644 index 0000000000000000000000000000000000000000..cf16062135dfbaeb38ff2ad91c33bcab50cb98aa --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/habitat_sim/generate_multiview_images.py @@ -0,0 +1,231 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +import os +from tqdm import tqdm +import argparse +import PIL.Image +import numpy as np +import json +from datasets.habitat_sim.multiview_habitat_sim_generator import ( + MultiviewHabitatSimGenerator, + NoNaviguableSpaceError, +) +from datasets.habitat_sim.paths import list_scenes_available +import cv2 +import quaternion +import shutil + + +def generate_multiview_images_for_scene( + scene_dataset_config_file, + scene, + navmesh, + output_dir, + views_count, + size, + exist_ok=False, + generate_depth=False, + **kwargs, +): + """ + Generate tuples of overlapping views for a given scene. + generate_depth: generate depth images and camera parameters. 
+ """ + if os.path.exists(output_dir) and not exist_ok: + print(f"Scene {scene}: data already generated. Ignoring generation.") + return + try: + print(f"Scene {scene}: {size} multiview acquisitions to generate...") + os.makedirs(output_dir, exist_ok=exist_ok) + + metadata_filename = os.path.join(output_dir, "metadata.json") + + metadata_template = dict( + scene_dataset_config_file=scene_dataset_config_file, + scene=scene, + navmesh=navmesh, + views_count=views_count, + size=size, + generate_depth=generate_depth, + **kwargs, + ) + metadata_template["multiviews"] = dict() + + if os.path.exists(metadata_filename): + print("Metadata file already exists:", metadata_filename) + print("Loading already generated metadata file...") + with open(metadata_filename, "r") as f: + metadata = json.load(f) + + for key in metadata_template.keys(): + if key != "multiviews": + assert ( + metadata_template[key] == metadata[key] + ), f"existing file is inconsistent with the input parameters:\nKey: {key}\nmetadata: {metadata[key]}\ntemplate: {metadata_template[key]}." + else: + print("No temporary file found. 
Starting generation from scratch...") + metadata = metadata_template + + starting_id = len(metadata["multiviews"]) + print(f"Starting generation from index {starting_id}/{size}...") + if starting_id >= size: + print("Generation already done.") + return + + generator = MultiviewHabitatSimGenerator( + scene_dataset_config_file=scene_dataset_config_file, + scene=scene, + navmesh=navmesh, + views_count=views_count, + size=size, + **kwargs, + ) + + for idx in tqdm(range(starting_id, size)): + # Generate / re-generate the observations + try: + data = generator[idx] + observations = data["observations"] + positions = data["positions"] + orientations = data["orientations"] + + idx_label = f"{idx:08}" + for oidx, observation in enumerate(observations): + observation_label = ( + f"{oidx + 1}" # Leonid is indexing starting from 1 + ) + # Color image saved using PIL + img = PIL.Image.fromarray(observation["color"][:, :, :3]) + filename = os.path.join( + output_dir, f"{idx_label}_{observation_label}.jpeg" + ) + img.save(filename) + if generate_depth: + # Depth image as EXR file + filename = os.path.join( + output_dir, f"{idx_label}_{observation_label}_depth.exr" + ) + cv2.imwrite( + filename, + observation["depth"], + [cv2.IMWRITE_EXR_TYPE, cv2.IMWRITE_EXR_TYPE_HALF], + ) + # Camera parameters + camera_params = dict( + [ + (key, observation[key].tolist()) + for key in ( + "camera_intrinsics", + "R_cam2world", + "t_cam2world", + ) + ] + ) + filename = os.path.join( + output_dir, + f"{idx_label}_{observation_label}_camera_params.json", + ) + with open(filename, "w") as f: + json.dump(camera_params, f) + metadata["multiviews"][idx_label] = { + "positions": positions.tolist(), + "orientations": orientations.tolist(), + "covisibility_ratios": data["covisibility_ratios"].tolist(), + "valid_fractions": data["valid_fractions"].tolist(), + "pairwise_visibility_ratios": data[ + "pairwise_visibility_ratios" + ].tolist(), + } + except RecursionError: + print( + "Recursion error: unable to 
sample observations for this scene. We will stop there." + ) + break + + # Regularly save a temporary metadata file, in case we need to restart the generation + if idx % 10 == 0: + with open(metadata_filename, "w") as f: + json.dump(metadata, f) + + # Save metadata + with open(metadata_filename, "w") as f: + json.dump(metadata, f) + + generator.close() + except NoNaviguableSpaceError: + pass + + +def create_commandline(scene_data, generate_depth, exist_ok=False): + """ + Create a commandline string to generate a scene. + """ + + def my_formatting(val): + if val is None or val == "": + return '""' + else: + return val + + commandline = f"""python {__file__} --scene {my_formatting(scene_data.scene)} + --scene_dataset_config_file {my_formatting(scene_data.scene_dataset_config_file)} + --navmesh {my_formatting(scene_data.navmesh)} + --output_dir {my_formatting(scene_data.output_dir)} + --generate_depth {int(generate_depth)} + --exist_ok {int(exist_ok)} + """ + commandline = " ".join(commandline.split()) + return commandline + + +if __name__ == "__main__": + os.umask(2) + + parser = argparse.ArgumentParser( + description="""Example of use -- listing commands to generate data for scenes available: + > python datasets/habitat_sim/generate_multiview_habitat_images.py --list_commands + """ + ) + + parser.add_argument("--output_dir", type=str, required=True) + parser.add_argument( + "--list_commands", action="store_true", help="list commandlines to run if true" + ) + parser.add_argument("--scene", type=str, default="") + parser.add_argument("--scene_dataset_config_file", type=str, default="") + parser.add_argument("--navmesh", type=str, default="") + + parser.add_argument("--generate_depth", type=int, default=1) + parser.add_argument("--exist_ok", type=int, default=0) + + kwargs = dict(resolution=(256, 256), hfov=60, views_count=2, size=1000) + + args = parser.parse_args() + generate_depth = bool(args.generate_depth) + exist_ok = bool(args.exist_ok) + + if 
args.list_commands: + # Listing scenes available... + scenes_data = list_scenes_available(base_output_dir=args.output_dir) + + for scene_data in scenes_data: + print( + create_commandline( + scene_data, generate_depth=generate_depth, exist_ok=exist_ok + ) + ) + else: + if args.scene == "" or args.output_dir == "": + print("Missing scene or output dir argument!") + print(parser.format_help()) + else: + generate_multiview_images_for_scene( + scene=args.scene, + scene_dataset_config_file=args.scene_dataset_config_file, + navmesh=args.navmesh, + output_dir=args.output_dir, + exist_ok=exist_ok, + generate_depth=generate_depth, + **kwargs, + ) diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..b073407ec169be0674cbd33a1197731ec0dd3be3 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/datasets/habitat_sim/multiview_habitat_sim_generator.py @@ -0,0 +1,501 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
import os
import numpy as np
import quaternion
import habitat_sim
import json
from sklearn.neighbors import NearestNeighbors
import cv2

# OpenCV to habitat camera convention transformation
R_OPENCV2HABITAT = np.stack(
    (habitat_sim.geo.RIGHT, -habitat_sim.geo.UP, habitat_sim.geo.FRONT), axis=0
)
R_HABITAT2OPENCV = R_OPENCV2HABITAT.T
DEG2RAD = np.pi / 180


def compute_camera_intrinsics(height, width, hfov):
    """Return ``(f, cu, cv)``: focal length and principal point in pixels, ``hfov`` in degrees."""
    f = width / 2 / np.tan(hfov / 2 * np.pi / 180)
    cu, cv = width / 2, height / 2
    return f, cu, cv


def compute_camera_pose_opencv_convention(camera_position, camera_orientation):
    """Return ``(R_cam2world, t_cam2world)`` in OpenCV camera convention for a Habitat pose."""
    R_cam2world = quaternion.as_rotation_matrix(camera_orientation) @ R_OPENCV2HABITAT
    t_cam2world = np.asarray(camera_position)
    return R_cam2world, t_cam2world


def compute_pointmap(depthmap, hfov):
    """Compute a HxWx3 pointmap in camera frame from a HxW depth map."""
    height, width = depthmap.shape
    f, cu, cv = compute_camera_intrinsics(height, width, hfov)
    # Cast depth map to point
    z_cam = depthmap
    u, v = np.meshgrid(range(width), range(height))
    x_cam = (u - cu) / f * z_cam
    y_cam = (v - cv) / f * z_cam
    X_cam = np.stack((x_cam, y_cam, z_cam), axis=-1)
    return X_cam


def compute_pointcloud(depthmap, hfov, camera_position, camera_rotation):
    """Return a 3D point cloud corresponding to valid pixels of the depth map"""
    R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(
        camera_position, camera_rotation
    )

    X_cam = compute_pointmap(depthmap=depthmap, hfov=hfov)
    # A zero depth marks an invalid pixel.
    valid_mask = X_cam[:, :, 2] != 0.0

    X_cam = X_cam.reshape(-1, 3)[valid_mask.flatten()]
    X_world = X_cam @ R_cam2world.T + t_cam2world.reshape(1, 3)
    return X_world


def compute_pointcloud_overlaps_scikit(
    pointcloud1, pointcloud2, distance_threshold, compute_symmetric=False
):
    """
    Compute 'overlapping' metrics based on a distance threshold between two point clouds.

    Returns a dict with ``intersection1`` (number of points of ``pointcloud1`` having a
    neighbor of ``pointcloud2`` closer than ``distance_threshold``) and ``size1``; with
    ``compute_symmetric`` it also holds ``intersection2`` and ``size2``.
    Both point clouds must be non-empty.
    """
    nbrs = NearestNeighbors(n_neighbors=1, algorithm="kd_tree").fit(pointcloud2)
    distances, indices = nbrs.kneighbors(pointcloud1)
    intersection1 = np.count_nonzero(distances.flatten() < distance_threshold)

    data = {"intersection1": intersection1, "size1": len(pointcloud1)}
    if compute_symmetric:
        nbrs = NearestNeighbors(n_neighbors=1, algorithm="kd_tree").fit(pointcloud1)
        distances, indices = nbrs.kneighbors(pointcloud2)
        intersection2 = np.count_nonzero(distances.flatten() < distance_threshold)
        data["intersection2"] = intersection2
        data["size2"] = len(pointcloud2)

    return data


def _append_camera_parameters(observation, hfov, camera_location, camera_rotation):
    """
    Add camera parameters to the observation dictionary produced by Habitat-Sim
    In-place modifications.
    """
    R_cam2world, t_cam2world = compute_camera_pose_opencv_convention(
        camera_location, camera_rotation
    )
    height, width = observation["depth"].shape
    f, cu, cv = compute_camera_intrinsics(height, width, hfov)
    K = np.asarray([[f, 0, cu], [0, f, cv], [0, 0, 1.0]])
    observation["camera_intrinsics"] = K
    observation["t_cam2world"] = t_cam2world
    observation["R_cam2world"] = R_cam2world


def look_at(eye, center, up, return_cam2world=True):
    """
    Return camera pose looking at a given center point.
    Analogous of gluLookAt function, using OpenCV camera convention.

    Returns ``(R, t)``: the camera-to-world pose when ``return_cam2world`` is true,
    the world-to-camera transformation otherwise.
    """
    z = center - eye
    z /= np.linalg.norm(z, axis=-1, keepdims=True)
    y = -up
    # Remove the component of y along the viewing direction, then normalize.
    y = y - np.sum(y * z, axis=-1, keepdims=True) * z
    y /= np.linalg.norm(y, axis=-1, keepdims=True)
    x = np.cross(y, z, axis=-1)

    if return_cam2world:
        R = np.stack((x, y, z), axis=-1)
        t = eye
    else:
        # World to camera transformation
        # Transposed matrix
        R = np.stack((x, y, z), axis=-2)
        t = -np.einsum("...ij, ...j", R, eye)
    return R, t


def look_at_for_habitat(eye, center, up, return_cam2world=True):
    """
    Return ``(orientation, position)`` of a Habitat camera at ``eye`` looking at ``center``.

    NOTE: ``return_cam2world`` is accepted for signature compatibility but ignored;
    the camera-to-world pose is always used to build the orientation quaternion.
    """
    R, t = look_at(eye, center, up)
    orientation = quaternion.from_rotation_matrix(R @ R_OPENCV2HABITAT.T)
    return orientation, t


def generate_orientation_noise(pan_range, tilt_range, roll_range):
    """Return a random rotation composed of pan, tilt and roll angles drawn uniformly (degrees)."""
    return (
        quaternion.from_rotation_vector(
            np.random.uniform(*pan_range) * DEG2RAD * habitat_sim.geo.UP
        )
        * quaternion.from_rotation_vector(
            np.random.uniform(*tilt_range) * DEG2RAD * habitat_sim.geo.RIGHT
        )
        * quaternion.from_rotation_vector(
            np.random.uniform(*roll_range) * DEG2RAD * habitat_sim.geo.FRONT
        )
    )


class NoNaviguableSpaceError(RuntimeError):
    """Raised when no navmesh could be loaded or computed for a scene."""

    def __init__(self, *args):
        super().__init__(*args)


class MultiviewHabitatSimGenerator:
    """Random generator of tuples of overlapping views rendered with Habitat-Sim."""

    def __init__(
        self,
        scene,
        navmesh,
        scene_dataset_config_file,
        resolution=(240, 320),
        views_count=2,
        hfov=60,
        gpu_id=0,
        size=10000,
        minimum_covisibility=0.5,
        transform=None,
    ):
        self.scene = scene
        self.navmesh = navmesh
        self.scene_dataset_config_file = scene_dataset_config_file
        self.resolution = resolution
        self.views_count = views_count
        assert self.views_count >= 1
        self.hfov = hfov
        self.gpu_id = gpu_id
        self.size = size
        self.transform = transform

        # Noise added to camera orientation
        self.pan_range = (-3, 3)
        self.tilt_range = (-10, 10)
        self.roll_range = (-5, 5)

        # Height range to sample cameras
        self.height_range = (1.2, 1.8)

        # Random steps between the camera views
        self.random_steps_count = 5
        self.random_step_variance = 2.0

        # Minimum fraction of the scene which should be valid (well defined depth)
        self.minimum_valid_fraction = 0.7

        # Distance threshold to see to select pairs
        self.distance_threshold = 0.05
        # Minimum IoU of a view point cloud with respect to the reference view to be kept.
        self.minimum_covisibility = minimum_covisibility

        # Maximum number of retries.
        self.max_attempts_count = 100

        self.seed = None
        self._lazy_initialization()

    def _lazy_initialization(self):
        """Draw a random seed and instantiate the simulator; no-op once a seed is set."""
        # Lazy random seeding and instantiation of the simulator to deal with multiprocessing properly
        if self.seed is not None:
            return
        # Re-seed numpy generator
        np.random.seed()
        self.seed = np.random.randint(2**32 - 1)
        sim_cfg = habitat_sim.SimulatorConfiguration()
        sim_cfg.scene_id = self.scene
        if (
            self.scene_dataset_config_file is not None
            and self.scene_dataset_config_file != ""
        ):
            sim_cfg.scene_dataset_config_file = self.scene_dataset_config_file
        sim_cfg.random_seed = self.seed
        sim_cfg.load_semantic_mesh = False
        sim_cfg.gpu_device_id = self.gpu_id

        depth_sensor_spec = habitat_sim.CameraSensorSpec()
        depth_sensor_spec.uuid = "depth"
        depth_sensor_spec.sensor_type = habitat_sim.SensorType.DEPTH
        depth_sensor_spec.resolution = self.resolution
        depth_sensor_spec.hfov = self.hfov
        depth_sensor_spec.position = [0.0, 0.0, 0]

        rgb_sensor_spec = habitat_sim.CameraSensorSpec()
        rgb_sensor_spec.uuid = "color"
        rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR
        rgb_sensor_spec.resolution = self.resolution
        rgb_sensor_spec.hfov = self.hfov
        rgb_sensor_spec.position = [0.0, 0.0, 0]
        agent_cfg = habitat_sim.agent.AgentConfiguration(
            sensor_specifications=[rgb_sensor_spec, depth_sensor_spec]
        )

        cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])
        self.sim = habitat_sim.Simulator(cfg)
        if self.navmesh is not None and self.navmesh != "":
            # Use pre-computed navmesh when available (usually better than those generated automatically)
            self.sim.pathfinder.load_nav_mesh(self.navmesh)

        if not self.sim.pathfinder.is_loaded:
            # Try to compute a navmesh
            navmesh_settings = habitat_sim.NavMeshSettings()
            navmesh_settings.set_defaults()
            self.sim.recompute_navmesh(self.sim.pathfinder, navmesh_settings, True)

        # Ensure that the navmesh is not empty
        if not self.sim.pathfinder.is_loaded:
            raise NoNaviguableSpaceError(
                f"No naviguable location (scene: {self.scene} -- navmesh: {self.navmesh})"
            )

        self.agent = self.sim.initialize_agent(agent_id=0)

    def close(self):
        """Close the underlying simulator."""
        self.sim.close()

    def __del__(self):
        # The simulator attribute is missing when its construction failed.
        sim = getattr(self, "sim", None)
        if sim is not None:
            sim.close()

    def __len__(self):
        return self.size

    def sample_random_viewpoint(self):
        """Sample a random viewpoint using the navmesh"""
        nav_point = self.sim.pathfinder.get_random_navigable_point()

        # Sample a random viewpoint height
        viewpoint_height = np.random.uniform(*self.height_range)
        viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP
        viewpoint_orientation = quaternion.from_rotation_vector(
            np.random.uniform(0, 2 * np.pi) * habitat_sim.geo.UP
        ) * generate_orientation_noise(self.pan_range, self.tilt_range, self.roll_range)
        return viewpoint_position, viewpoint_orientation, nav_point

    def sample_other_random_viewpoint(self, observed_point, nav_point):
        """Sample a random viewpoint close to an existing one, using the navmesh and a reference observed point."""
        other_nav_point = nav_point

        # Random walk on the navmesh; the middle (zero) component is never perturbed.
        walk_directions = self.random_step_variance * np.asarray([1, 0, 1])
        for i in range(self.random_steps_count):
            temp = self.sim.pathfinder.snap_point(
                other_nav_point + walk_directions * np.random.normal(size=3)
            )
            # Snapping may return nan when it fails
            if not np.isnan(temp[0]):
                other_nav_point = temp

        other_viewpoint_height = np.random.uniform(*self.height_range)
        other_viewpoint_position = (
            other_nav_point + other_viewpoint_height * habitat_sim.geo.UP
        )

        # Set viewing direction towards the central point
        rotation, position = look_at_for_habitat(
            eye=other_viewpoint_position,
            center=observed_point,
            up=habitat_sim.geo.UP,
            return_cam2world=True,
        )
        rotation = rotation * generate_orientation_noise(
            self.pan_range, self.tilt_range, self.roll_range
        )
        return position, rotation, other_nav_point

    def is_other_pointcloud_overlapping(self, ref_pointcloud, other_pointcloud):
        """
        Check if a viewpoint is valid and overlaps significantly with a reference one.

        Returns ``(is_valid, valid_fraction, covisibility)``.
        """
        # Observation
        pixels_count = self.resolution[0] * self.resolution[1]
        valid_fraction = len(other_pointcloud) / pixels_count
        assert valid_fraction <= 1.0 and valid_fraction >= 0.0
        if len(other_pointcloud) == 0:
            # A view without any valid depth cannot overlap anything, and the
            # nearest-neighbor search below rejects empty point sets.
            return False, valid_fraction, 0.0
        overlap = compute_pointcloud_overlaps_scikit(
            ref_pointcloud,
            other_pointcloud,
            self.distance_threshold,
            compute_symmetric=True,
        )
        covisibility = min(
            overlap["intersection1"] / pixels_count,
            overlap["intersection2"] / pixels_count,
        )
        is_valid = (valid_fraction >= self.minimum_valid_fraction) and (
            covisibility >= self.minimum_covisibility
        )
        return is_valid, valid_fraction, covisibility

    def is_other_viewpoint_overlapping(
        self, ref_pointcloud, observation, position, rotation
    ):
        """Check if a viewpoint is valid and overlaps significantly with a reference one."""
        # Observation
        other_pointcloud = compute_pointcloud(
            observation["depth"], self.hfov, position, rotation
        )
        return self.is_other_pointcloud_overlapping(ref_pointcloud, other_pointcloud)

    def render_viewpoint(self, viewpoint_position, viewpoint_orientation):
        """Render the sensors from the given pose and attach camera parameters to the result."""
        agent_state = habitat_sim.AgentState()
        agent_state.position = viewpoint_position
        agent_state.rotation = viewpoint_orientation
        self.agent.set_state(agent_state)
        viewpoint_observations = self.sim.get_sensor_observations(agent_ids=0)
        _append_camera_parameters(
            viewpoint_observations, self.hfov, viewpoint_position, viewpoint_orientation
        )
        return viewpoint_observations

    def __getitem__(self, useless_idx):
        """
        Sample a new tuple of ``views_count`` overlapping views; the index is ignored.

        Sampling restarts recursively from a new reference viewpoint when it fails,
        so a scene where it cannot succeed ends in a ``RecursionError``.
        """
        ref_position, ref_orientation, nav_point = self.sample_random_viewpoint()
        ref_observations = self.render_viewpoint(ref_position, ref_orientation)
        # Extract point cloud
        ref_pointcloud = compute_pointcloud(
            depthmap=ref_observations["depth"],
            hfov=self.hfov,
            camera_position=ref_position,
            camera_rotation=ref_orientation,
        )

        pixels_count = self.resolution[0] * self.resolution[1]
        ref_valid_fraction = len(ref_pointcloud) / pixels_count
        assert ref_valid_fraction <= 1.0 and ref_valid_fraction >= 0.0
        if ref_valid_fraction < self.minimum_valid_fraction:
            # This should produce a recursion error at some point when something is very wrong.
            return self[0]
        # Pick an reference observed point in the point cloud
        observed_point = np.mean(ref_pointcloud, axis=0)

        # Add the first image as reference
        viewpoints_observations = [ref_observations]
        viewpoints_covisibility = [ref_valid_fraction]
        viewpoints_positions = [ref_position]
        viewpoints_orientations = [quaternion.as_float_array(ref_orientation)]
        viewpoints_clouds = [ref_pointcloud]
        viewpoints_valid_fractions = [ref_valid_fraction]

        for _ in range(self.views_count - 1):
            # Generate an other viewpoint using some dummy random walk
            successful_sampling = False
            for sampling_attempt in range(self.max_attempts_count):
                position, rotation, _ = self.sample_other_random_viewpoint(
                    observed_point, nav_point
                )
                # Observation
                other_viewpoint_observations = self.render_viewpoint(position, rotation)
                other_pointcloud = compute_pointcloud(
                    other_viewpoint_observations["depth"], self.hfov, position, rotation
                )

                is_valid, valid_fraction, covisibility = (
                    self.is_other_pointcloud_overlapping(
                        ref_pointcloud, other_pointcloud
                    )
                )
                if is_valid:
                    successful_sampling = True
                    break
            if not successful_sampling:
                print("WARNING: Maximum number of attempts reached.")
                # Dirty hack, try using a novel original viewpoint
                return self[0]
            viewpoints_observations.append(other_viewpoint_observations)
            viewpoints_covisibility.append(covisibility)
            viewpoints_positions.append(position)
            viewpoints_orientations.append(
                quaternion.as_float_array(rotation)
            )  # WXYZ convention for the quaternion encoding.
            viewpoints_clouds.append(other_pointcloud)
            viewpoints_valid_fractions.append(valid_fraction)

        # Estimate relations between all pairs of images
        pairwise_visibility_ratios = np.ones(
            (len(viewpoints_observations), len(viewpoints_observations))
        )
        for i in range(len(viewpoints_observations)):
            pairwise_visibility_ratios[i, i] = viewpoints_valid_fractions[i]
            for j in range(i + 1, len(viewpoints_observations)):
                overlap = compute_pointcloud_overlaps_scikit(
                    viewpoints_clouds[i],
                    viewpoints_clouds[j],
                    self.distance_threshold,
                    compute_symmetric=True,
                )
                pairwise_visibility_ratios[i, j] = (
                    overlap["intersection1"] / pixels_count
                )
                pairwise_visibility_ratios[j, i] = (
                    overlap["intersection2"] / pixels_count
                )

        # IoU is relative to the image 0
        data = {
            "observations": viewpoints_observations,
            "positions": np.asarray(viewpoints_positions),
            "orientations": np.asarray(viewpoints_orientations),
            "covisibility_ratios": np.asarray(viewpoints_covisibility),
            "valid_fractions": np.asarray(viewpoints_valid_fractions, dtype=float),
            "pairwise_visibility_ratios": np.asarray(
                pairwise_visibility_ratios, dtype=float
            ),
        }

        if self.transform is not None:
            data = self.transform(data)
        return data

    def generate_random_spiral_trajectory(
        self,
        images_count=100,
        max_radius=0.5,
        half_turns=5,
        use_constant_orientation=False,
    ):
        """
        Return a list of images corresponding to a spiral trajectory from a random starting point.
        Useful to generate nice visualisations.
        Use an even number of half turns to get a nice "C1-continuous" loop effect

        Returns ``(images, all_valid)`` where ``all_valid`` tells whether every frame
        overlaps sufficiently with the reference view.
        """
        ref_position, ref_orientation, navpoint = self.sample_random_viewpoint()
        ref_observations = self.render_viewpoint(ref_position, ref_orientation)
        ref_pointcloud = compute_pointcloud(
            depthmap=ref_observations["depth"],
            hfov=self.hfov,
            camera_position=ref_position,
            camera_rotation=ref_orientation,
        )
        pixels_count = self.resolution[0] * self.resolution[1]
        if len(ref_pointcloud) / pixels_count < self.minimum_valid_fraction:
            # Dirty hack: ensure that the valid part of the image is significant
            return self.generate_random_spiral_trajectory(
                images_count, max_radius, half_turns, use_constant_orientation
            )

        # Pick an observed point in the point cloud
        observed_point = np.mean(ref_pointcloud, axis=0)
        ref_R, ref_t = compute_camera_pose_opencv_convention(
            ref_position, ref_orientation
        )

        images = []
        is_valid = []
        # Spiral trajectory, use_constant orientation
        for i, alpha in enumerate(np.linspace(0, 1, images_count)):
            r = max_radius * np.abs(
                np.sin(alpha * np.pi)
            )  # Increase then decrease the radius
            theta = alpha * half_turns * np.pi
            x = r * np.cos(theta)
            y = r * np.sin(theta)
            z = 0.0
            position = (
                ref_position + (ref_R @ np.asarray([x, y, z]).reshape(3, 1)).flatten()
            )
            if use_constant_orientation:
                orientation = ref_orientation
            else:
                # trajectory looking at a mean point in front of the ref observation
                orientation, position = look_at_for_habitat(
                    eye=position, center=observed_point, up=habitat_sim.geo.UP
                )
            observations = self.render_viewpoint(position, orientation)
            images.append(observations["color"][..., :3])
            _is_valid, valid_fraction, iou = self.is_other_viewpoint_overlapping(
                ref_pointcloud, observations, position, orientation
            )
            is_valid.append(_is_valid)
        return images, np.all(is_valid)
# ---- datasets/habitat_sim/pack_metadata_files.py ----
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
"""
Utility script to pack metadata files of the dataset in order to be able to re-generate it elsewhere.
"""
import os
import glob
from tqdm import tqdm
import shutil
import json
from datasets.habitat_sim.paths import *
import argparse
import collections

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input_dir")
    parser.add_argument("output_dir")
    args = parser.parse_args()

    input_dirname = args.input_dir
    output_dirname = args.output_dir

    # Scene paths are replaced by generic keys depending on the dataset, for portability.
    # Data paths are sorted by decreasing length to avoid potential bugs due to paths
    # starting by the same string pattern.
    ordered_dataset_paths = sorted(
        SCENES_DATASET.items(), key=lambda item: len(item[1]), reverse=True
    )
    path_keys = ("scene_dataset_config_file", "scene", "navmesh")

    # Number of packed acquisitions per dataset (per split for HM3D).
    images_count = collections.defaultdict(int)

    os.makedirs(output_dirname)
    pattern = f"{input_dirname}/**/metadata.json"
    for input_filename in tqdm(glob.iglob(pattern, recursive=True)):
        with open(input_filename, "r") as f:
            original_metadata = json.load(f)

        # Ignore empty files
        if not original_metadata.get("multiviews"):
            print("No views in", input_filename)
            continue

        relpath = os.path.relpath(input_filename, input_dirname)
        print(relpath)

        # Copy metadata while making the scene paths portable.
        metadata = dict()
        for key, value in original_metadata.items():
            if key in path_keys and value != "":
                for dataset, dataset_path in ordered_dataset_paths:
                    if value.startswith(dataset_path):
                        value = os.path.join(
                            dataset, os.path.relpath(value, dataset_path)
                        )
                        break
                else:
                    # No known dataset root matches this path.
                    raise KeyError("Unknown path:" + value)
            metadata[key] = value

        # Compile some general statistics while packing data
        scene_split = metadata["scene"].split("/")
        if scene_split[0] == "hm3d":
            upper_level = "/".join(scene_split[:2])
        else:
            upper_level = scene_split[0]
        images_count[upper_level] += len(metadata["multiviews"])

        output_filename = os.path.join(output_dirname, relpath)
        os.makedirs(os.path.dirname(output_filename), exist_ok=True)
        with open(output_filename, "w") as f:
            json.dump(metadata, f)

    # Print statistics
    print("Images count:")
    for upper_level, count in images_count.items():
        print(f"- {upper_level}: {count}")


# ---- datasets/habitat_sim/paths.py (header; body follows this span) ----
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
+ +""" +Paths to Habitat-Sim scenes +""" + +import os +import json +import collections +from tqdm import tqdm + + +# Hardcoded path to the different scene datasets +SCENES_DATASET = { + "hm3d": "./data/habitat-sim-data/scene_datasets/hm3d/", + "gibson": "./data/habitat-sim-data/scene_datasets/gibson/", + "habitat-test-scenes": "./data/habitat-sim/scene_datasets/habitat-test-scenes/", + "replica_cad_baked_lighting": "./data/habitat-sim/scene_datasets/replica_cad_baked_lighting/", + "replica_cad": "./data/habitat-sim/scene_datasets/replica_cad/", + "replica": "./data/habitat-sim/scene_datasets/ReplicaDataset/", + "scannet": "./data/habitat-sim/scene_datasets/scannet/", +} + +SceneData = collections.namedtuple( + "SceneData", ["scene_dataset_config_file", "scene", "navmesh", "output_dir"] +) + + +def list_replicacad_scenes(base_output_dir, base_path=SCENES_DATASET["replica_cad"]): + scene_dataset_config_file = os.path.join( + base_path, "replicaCAD.scene_dataset_config.json" + ) + scenes = [f"apt_{i}" for i in range(6)] + ["empty_stage"] + navmeshes = [f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + [ + "empty_stage.navmesh" + ] + scenes_data = [] + for idx in range(len(scenes)): + output_dir = os.path.join(base_output_dir, "ReplicaCAD", scenes[idx]) + # Add scene + data = SceneData( + scene_dataset_config_file=scene_dataset_config_file, + scene=scenes[idx] + ".scene_instance.json", + navmesh=os.path.join(base_path, navmeshes[idx]), + output_dir=output_dir, + ) + scenes_data.append(data) + return scenes_data + + +def list_replica_cad_baked_lighting_scenes( + base_output_dir, base_path=SCENES_DATASET["replica_cad_baked_lighting"] +): + scene_dataset_config_file = os.path.join( + base_path, "replicaCAD_baked.scene_dataset_config.json" + ) + scenes = sum( + [[f"Baked_sc{i}_staging_{j:02}" for i in range(5)] for j in range(21)], [] + ) + navmeshes = "" # [f"navmeshes/apt_{i}_static_furniture.navmesh" for i in range(6)] + ["empty_stage.navmesh"] + 
def list_replica_scenes(base_output_dir, base_path):
    """
    List one SceneData record per entry found directly inside ``base_path``.

    Every directory entry is treated as a scene id; the mesh and navmesh
    paths are assembled from it without checking that they exist.
    No dataset config file is used for these scenes (empty string).
    """
    return [
        SceneData(
            scene_dataset_config_file="",
            scene=os.path.join(base_path, scene_id, "mesh.ply"),
            # The original author was unsure whether this navmesh should be used.
            navmesh=os.path.join(
                base_path, scene_id, "habitat/mesh_preseg_semantic.navmesh"
            ),
            output_dir=os.path.join(base_output_dir, scene_id),
        )
        for scene_id in os.listdir(base_path)
    ]
def list_scenes_available(base_output_dir, scenes_dataset_paths=SCENES_DATASET):
    """
    Gather the scenes of all supported datasets into a single list.

    base_output_dir: root folder under which each scene gets its output folder.
    scenes_dataset_paths: mapping from dataset name to its root folder; the keys
        "hm3d", "gibson", "habitat-test-scenes", "scannet" and "replica" are read.

    Returns a list of SceneData records, in the order HM3D, Gibson,
    Habitat test scenes, ReplicaCAD (baked lighting), ScanNet, Replica.
    """
    scenes_data = []

    # HM3D
    for split in ("minival", "train", "val", "examples"):
        scenes_data += list_scenes(
            base_output_dir=os.path.join(base_output_dir, f"hm3d/{split}/"),
            base_path=f"{scenes_dataset_paths['hm3d']}/{split}",
        )

    # Gibson
    scenes_data += list_scenes(
        base_output_dir=os.path.join(base_output_dir, "gibson"),
        base_path=scenes_dataset_paths["gibson"],
    )

    # Habitat test scenes (just a few)
    scenes_data += list_scenes(
        base_output_dir=os.path.join(base_output_dir, "habitat-test-scenes"),
        base_path=scenes_dataset_paths["habitat-test-scenes"],
    )

    # ReplicaCAD (baked lighting)
    # NOTE(review): this helper is called without base_path, so it uses its own
    # default rather than scenes_dataset_paths -- confirm this is intended.
    scenes_data += list_replica_cad_baked_lighting_scenes(
        base_output_dir=base_output_dir
    )

    # ScanNet
    scenes_data += list_scenes(
        base_output_dir=os.path.join(base_output_dir, "scannet"),
        base_path=scenes_dataset_paths["scannet"],
    )

    # Replica: the listing used to be computed and then dropped; keep it.
    scenes_data += list_replica_scenes(
        base_output_dir=os.path.join(base_output_dir, "replica"),
        base_path=scenes_dataset_paths["replica"],
    )
    return scenes_data
def write_cache_file(fname, pairs, root=""):
    """
    Write image pairs to ``fname``, one "path1 path2" entry per line.

    When ``root`` is non-empty it must be an existing directory and a prefix
    of every path; that prefix (with a trailing "/") is stripped before
    writing. The file has no trailing newline.
    """
    strip_root = len(root) > 0
    if strip_root:
        if not root.endswith("/"):
            root = root + "/"
        assert os.path.isdir(root)

    prefix_len = len(root)
    entries = []
    for first, second in pairs:
        if strip_root:
            assert first.startswith(root), first
            assert second.startswith(root), second
        entries.append("{:s} {:s}".format(first[prefix_len:], second[prefix_len:]))

    with open(fname, "w") as fid:
        fid.write("\n".join(entries))
def dnames_to_image_pairs(dnames, data_dir="./data/"):
    """
    Load the image pairs of one or several datasets.

    dnames: dataset names separated by "+", e.g. "habitat_release+MegaDepth".
    data_dir: folder that contains one sub-folder per dataset.

    Returns the concatenated list of (path1, path2) tuples.
    Raises NotImplementedError for a dataset name that is not supported, and
    AssertionError when a dataset folder or its listing/cache file is missing.
    """
    all_pairs = []
    for dname in dnames.split("+"):
        if dname == "habitat_release":
            dirname = os.path.join(data_dir, "habitat_release")
            assert os.path.isdir(dirname), (
                "cannot find folder for habitat_release pairs: " + dirname
            )
            cache_file = os.path.join(dirname, "pairs.txt")
            assert os.path.isfile(cache_file), (
                "cannot find cache file for habitat_release pairs, please first create the cache file, see instructions. "
                + cache_file
            )
            pairs = load_pairs_from_cache_file(cache_file, root=dirname)
        elif dname in ["ARKitScenes", "MegaDepth", "3DStreetView", "IndoorVL"]:
            dirname = os.path.join(data_dir, dname + "_crops")
            assert os.path.isdir(
                dirname
            ), "cannot find folder for {:s} pairs: {:s}".format(dname, dirname)
            list_file = os.path.join(dirname, "listing.txt")
            assert os.path.isfile(
                list_file
            ), "cannot find list file for {:s} pairs, see instructions. {:s}".format(
                dname, list_file
            )
            pairs = load_pairs_from_list_file(list_file, root=dirname)
        else:
            # Without this branch an unknown name either crashed on an unbound
            # `pairs` or silently re-added the previous dataset's pairs.
            raise NotImplementedError("Unknown dataset: " + dname)
        print("  {:s}: {:,} pairs".format(dname, len(pairs)))
        all_pairs += pairs
    if "+" in dnames:
        print(" Total: {:,} pairs".format(len(all_pairs)))
    return all_pairs
class ComposePair(torchvision.transforms.Compose):
    """Chain pair transforms: each one takes two images and returns two images."""

    def __call__(self, img1, img2):
        # Feed the outputs of each transform into the next one, in order.
        for pair_transform in self.transforms:
            img1, img2 = pair_transform(img1, img2)
        return img1, img2
def get_pair_transforms(transform_str, totensor=True, normalize=True):
    """
    Build the transform applied to an image pair.

    transform_str: "+"-separated augmentation names, e.g. "crop224+acolor".
        "crop<N>" adds an independent random crop of size N for each image,
        "acolor" adds a color jitter with different parameters per image,
        and an empty string adds nothing.
    totensor: append the conversion to tensors.
    normalize: append the normalization with the mean/std given below.

    Returns None when no transform is requested, otherwise a callable taking
    (img1, img2) and returning the transformed pair.
    Raises NotImplementedError for an unknown augmentation name.
    """
    trfs = []
    for s in transform_str.split("+"):
        if s == "":  # transform_str was "" (or had an empty segment)
            continue
        if s.startswith("crop"):
            size = int(s[len("crop") :])
            trfs.append(RandomCropPair(size))
        elif s == "acolor":
            trfs.append(
                ColorJitterPair(
                    assymetric_prob=1.0,
                    brightness=(0.6, 1.4),
                    contrast=(0.6, 1.4),
                    saturation=(0.6, 1.4),
                    hue=0.0,
                )
            )
        else:
            raise NotImplementedError("Unknown augmentation: " + s)

    if totensor:
        trfs.append(ToTensorBoth())
    if normalize:
        trfs.append(
            NormalizeBoth(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        )

    if not trfs:
        return None
    if len(trfs) == 1:
        # Return the transform itself: a one-element list is not callable.
        return trfs[0]
    return ComposePair(trfs)
All rights reserved.\n", + "# Licensed under CC BY-NC-SA 4.0 (non-commercial use only)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import numpy as np\n", + "from models.croco import CroCoNet\n", + "from ipywidgets import interact, interactive, fixed, interact_manual\n", + "import ipywidgets as widgets\n", + "import matplotlib.pyplot as plt\n", + "import quaternion\n", + "import models.masking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load CroCo model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ckpt = torch.load('pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth', 'cpu')\n", + "model = CroCoNet( **ckpt.get('croco_kwargs',{}))\n", + "msg = model.load_state_dict(ckpt['model'], strict=True)\n", + "use_gpu = torch.cuda.is_available() and torch.cuda.device_count()>0\n", + "device = torch.device('cuda:0' if use_gpu else 'cpu')\n", + "model = model.eval()\n", + "model = model.to(device=device)\n", + "print(msg)\n", + "\n", + "def process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches=False):\n", + " \"\"\"\n", + " Perform Cross-View completion using two input images, specified using Numpy arrays.\n", + " \"\"\"\n", + " # Replace the mask generator\n", + " model.mask_generator = models.masking.RandomMask(model.patch_embed.num_patches, masking_ratio)\n", + "\n", + " # ImageNet-1k color normalization\n", + " imagenet_mean = torch.as_tensor([0.485, 0.456, 0.406]).reshape(1,3,1,1).to(device)\n", + " imagenet_std = torch.as_tensor([0.229, 0.224, 0.225]).reshape(1,3,1,1).to(device)\n", + "\n", + " normalize_input_colors = True\n", + " is_output_normalized = True\n", + " with torch.no_grad():\n", + " # Cast data to torch\n", + " target_image = (torch.as_tensor(target_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n", + " 
ref_image = (torch.as_tensor(ref_image, dtype=torch.float, device=device).permute(2,0,1) / 255)[None]\n", + "\n", + " if normalize_input_colors:\n", + " ref_image = (ref_image - imagenet_mean) / imagenet_std\n", + " target_image = (target_image - imagenet_mean) / imagenet_std\n", + "\n", + " out, mask, _ = model(target_image, ref_image)\n", + " # # get target\n", + " if not is_output_normalized:\n", + " predicted_image = model.unpatchify(out)\n", + " else:\n", + " # The output only contains higher order information,\n", + " # we retrieve mean and standard deviation from the actual target image\n", + " patchified = model.patchify(target_image)\n", + " mean = patchified.mean(dim=-1, keepdim=True)\n", + " var = patchified.var(dim=-1, keepdim=True)\n", + " pred_renorm = out * (var + 1.e-6)**.5 + mean\n", + " predicted_image = model.unpatchify(pred_renorm)\n", + "\n", + " image_masks = model.unpatchify(model.patchify(torch.ones_like(ref_image)) * mask[:,:,None])\n", + " masked_target_image = (1 - image_masks) * target_image\n", + " \n", + " if not reconstruct_unmasked_patches:\n", + " # Replace unmasked patches by their actual values\n", + " predicted_image = predicted_image * image_masks + masked_target_image\n", + "\n", + " # Unapply color normalization\n", + " if normalize_input_colors:\n", + " predicted_image = predicted_image * imagenet_std + imagenet_mean\n", + " masked_target_image = masked_target_image * imagenet_std + imagenet_mean\n", + " \n", + " # Cast to Numpy\n", + " masked_target_image = np.asarray(torch.clamp(masked_target_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n", + " predicted_image = np.asarray(torch.clamp(predicted_image.squeeze(0).permute(1,2,0) * 255, 0, 255).cpu().numpy(), dtype=np.uint8)\n", + " return masked_target_image, predicted_image" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use the Habitat simulator to render images from arbitrary viewpoints (requires habitat_sim 
to be installed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"MAGNUM_LOG\"]=\"quiet\"\n", + "os.environ[\"HABITAT_SIM_LOG\"]=\"quiet\"\n", + "import habitat_sim\n", + "\n", + "scene = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.glb\"\n", + "navmesh = \"habitat-sim-data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh\"\n", + "\n", + "sim_cfg = habitat_sim.SimulatorConfiguration()\n", + "if use_gpu: sim_cfg.gpu_device_id = 0\n", + "sim_cfg.scene_id = scene\n", + "sim_cfg.load_semantic_mesh = False\n", + "rgb_sensor_spec = habitat_sim.CameraSensorSpec()\n", + "rgb_sensor_spec.uuid = \"color\"\n", + "rgb_sensor_spec.sensor_type = habitat_sim.SensorType.COLOR\n", + "rgb_sensor_spec.resolution = (224,224)\n", + "rgb_sensor_spec.hfov = 56.56\n", + "rgb_sensor_spec.position = [0.0, 0.0, 0.0]\n", + "rgb_sensor_spec.orientation = [0, 0, 0]\n", + "agent_cfg = habitat_sim.agent.AgentConfiguration(sensor_specifications=[rgb_sensor_spec])\n", + "\n", + "\n", + "cfg = habitat_sim.Configuration(sim_cfg, [agent_cfg])\n", + "sim = habitat_sim.Simulator(cfg)\n", + "if navmesh is not None:\n", + " sim.pathfinder.load_nav_mesh(navmesh)\n", + "agent = sim.initialize_agent(agent_id=0)\n", + "\n", + "def sample_random_viewpoint():\n", + " \"\"\" Sample a random viewpoint using the navmesh \"\"\"\n", + " nav_point = sim.pathfinder.get_random_navigable_point()\n", + " # Sample a random viewpoint height\n", + " viewpoint_height = np.random.uniform(1.0, 1.6)\n", + " viewpoint_position = nav_point + viewpoint_height * habitat_sim.geo.UP\n", + " viewpoint_orientation = quaternion.from_rotation_vector(np.random.uniform(-np.pi, np.pi) * habitat_sim.geo.UP)\n", + " return viewpoint_position, viewpoint_orientation\n", + "\n", + "def render_viewpoint(position, orientation):\n", + " agent_state = habitat_sim.AgentState()\n", + " agent_state.position = 
position\n", + " agent_state.rotation = orientation\n", + " agent.set_state(agent_state)\n", + " viewpoint_observations = sim.get_sensor_observations(agent_ids=0)\n", + " image = viewpoint_observations['color'][:,:,:3]\n", + " image = np.asarray(np.clip(1.5 * np.asarray(image, dtype=float), 0, 255), dtype=np.uint8)\n", + " return image" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sample a random reference view" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ref_position, ref_orientation = sample_random_viewpoint()\n", + "ref_image = render_viewpoint(ref_position, ref_orientation)\n", + "plt.clf()\n", + "fig, axes = plt.subplots(1,1, squeeze=False, num=1)\n", + "axes[0,0].imshow(ref_image)\n", + "for ax in axes.flatten():\n", + " ax.set_xticks([])\n", + " ax.set_yticks([])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interactive cross-view completion using CroCo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reconstruct_unmasked_patches = False\n", + "\n", + "def show_demo(masking_ratio, x, y, z, panorama, elevation):\n", + " R = quaternion.as_rotation_matrix(ref_orientation)\n", + " target_position = ref_position + x * R[:,0] + y * R[:,1] + z * R[:,2]\n", + " target_orientation = (ref_orientation\n", + " * quaternion.from_rotation_vector(-elevation * np.pi/180 * habitat_sim.geo.LEFT) \n", + " * quaternion.from_rotation_vector(-panorama * np.pi/180 * habitat_sim.geo.UP))\n", + " \n", + " ref_image = render_viewpoint(ref_position, ref_orientation)\n", + " target_image = render_viewpoint(target_position, target_orientation)\n", + "\n", + " masked_target_image, predicted_image = process_images(ref_image, target_image, masking_ratio, reconstruct_unmasked_patches)\n", + "\n", + " fig, axes = plt.subplots(1,4, squeeze=True, dpi=300)\n", + " 
axes[0].imshow(ref_image)\n", + " axes[0].set_xlabel(\"Reference\")\n", + " axes[1].imshow(masked_target_image)\n", + " axes[1].set_xlabel(\"Masked target\")\n", + " axes[2].imshow(predicted_image)\n", + " axes[2].set_xlabel(\"Reconstruction\") \n", + " axes[3].imshow(target_image)\n", + " axes[3].set_xlabel(\"Target\")\n", + " for ax in axes.flatten():\n", + " ax.set_xticks([])\n", + " ax.set_yticks([])\n", + "\n", + "interact(show_demo,\n", + " masking_ratio=widgets.FloatSlider(description='masking', value=0.9, min=0.0, max=1.0),\n", + " x=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", + " y=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", + " z=widgets.FloatSlider(value=0.0, min=-0.5, max=0.5, step=0.05),\n", + " panorama=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5),\n", + " elevation=widgets.FloatSlider(value=0.0, min=-20, max=20, step=0.5));" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.13" + }, + "vscode": { + "interpreter": { + "hash": "f9237820cd248d7e07cb4fb9f0e4508a85d642f19d831560c0a4b61f3e907e67" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/blocks.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..aa85a431b44d276e3bba9a33fdfd7097f02bc330 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/blocks.py @@ -0,0 +1,385 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
def drop_path(
    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
):
    """
    Stochastic depth: randomly zero whole samples of a batch.

    Returns ``x`` untouched when not training or when ``drop_prob`` is 0.
    Otherwise each sample (first dimension) is kept with probability
    ``1 - drop_prob``; kept samples are divided by that probability when
    ``scale_by_keep`` is set.
    """
    if not training or drop_prob == 0.0:
        return x

    keep_prob = 1 - drop_prob
    # One draw per sample, broadcast over all remaining dimensions.
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    keep_mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if scale_by_keep and keep_prob > 0.0:
        keep_mask.div_(keep_prob)
    return x * keep_mask
class Mlp(nn.Module):
    """Two-layer perceptron: linear, activation, dropout, linear, dropout."""

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        bias=True,
        drop=0.0,
    ):
        super().__init__()
        # Both widths fall back to the input width when not given.
        hidden_width = hidden_features or in_features
        out_width = out_features or in_features
        # A single value is duplicated so that each layer gets its own setting.
        layer_bias = to_2tuple(bias)
        layer_drop = to_2tuple(drop)

        self.fc1 = nn.Linear(in_features, hidden_width, bias=layer_bias[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(layer_drop[0])
        self.fc2 = nn.Linear(hidden_width, out_width, bias=layer_bias[1])
        self.drop2 = nn.Dropout(layer_drop[1])

    def forward(self, x):
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))
memory_efficient_attention(query=q.permute(0, 2, 1, 3), key=k.permute(0, 2, 1, 3), value=v.permute(0, 2, 1, 3), p=self.attn_drop.p, scale=self.scale).reshape(B, N, C) + x = ( + scaled_dot_product_attention( + query=q, key=k, value=v, dropout_p=self.attn_drop.p, scale=self.scale + ) + .transpose(1, 2) + .reshape(B, N, C) + ) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + rope=None, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + rope=rope, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + def forward(self, x, xpos): + x = x + self.drop_path(self.attn(self.norm1(x), xpos)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class CrossAttention(nn.Module): + + def __init__( + self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0 + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.projq = nn.Linear(dim, dim, bias=qkv_bias) + self.projk = nn.Linear(dim, dim, bias=qkv_bias) + self.projv = nn.Linear(dim, dim, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.rope = rope.float() if rope is not None else None + + def forward(self, query, key, value, qpos, kpos): + B, Nq, C = query.shape + Nk = key.shape[1] + Nv = value.shape[1] + + q = ( 
+ self.projq(query) + .reshape(B, Nq, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + k = ( + self.projk(key) + .reshape(B, Nk, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + v = ( + self.projv(value) + .reshape(B, Nv, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + + q_type = q.dtype + k_type = k.dtype + if self.rope is not None: + if qpos is not None: + q = q.to(torch.float16) + with torch.autocast(device_type="cuda", enabled=False): + q = self.rope(q, qpos) + q = q.to(q_type) + + if kpos is not None: + k = k.to(torch.float16) + with torch.autocast(device_type="cuda", enabled=False): + k = self.rope(k, kpos) + k = k.to(k_type) + + # attn = (q @ k.transpose(-2, -1)) * self.scale + # attn = attn.softmax(dim=-1) + # attn = self.attn_drop(attn) + + # x = (attn @ v).transpose(1, 2).reshape(B, Nq, C) + + # x = memory_efficient_attention(query=q.permute(0, 2, 1, 3), key=k.permute(0, 2, 1, 3), value=v.permute(0, 2, 1, 3), p=self.attn_drop.p, scale=self.scale).reshape(B, Nq, C) + x = ( + scaled_dot_product_attention( + query=q, key=k, value=v, dropout_p=self.attn_drop.p, scale=self.scale + ) + .transpose(1, 2) + .reshape(B, Nq, C) + ) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class DecoderBlock(nn.Module): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + norm_mem=True, + rope=None, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + rope=rope, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.cross_attn = CrossAttention( + dim, + rope=rope, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + self.norm3 = norm_layer(dim) + mlp_hidden_dim = int(dim 
class PositionGetter(object):
    """Return the (row, column) grid coordinates of the patches of an image."""

    def __init__(self):
        # (h, w) -> tensor of shape (h * w, 2) holding every (y, x) pair.
        self.cache_positions = {}

    def __call__(self, b, h, w, device):
        """
        Return a fresh (b, h * w, 2) tensor of (y, x) positions on ``device``.

        Positions are enumerated row by row. The grid is computed once per
        (h, w) and cached; every call returns its own copy.
        """
        if (h, w) not in self.cache_positions:
            x = torch.arange(w, device=device)
            y = torch.arange(h, device=device)
            self.cache_positions[h, w] = torch.cartesian_prod(y, x)  # (h * w, 2)
        # The cache is keyed on the grid size only: move the grid when it was
        # first built for another device (no copy when it already matches).
        grid = self.cache_positions[h, w].to(device)
        pos = grid.view(1, h * w, 2).expand(b, -1, 2).clone()
        return pos
class MaskedMSE(torch.nn.Module):
    """Per-patch mean-squared error, optionally restricted to masked patches."""

    def __init__(self, norm_pix_loss=False, masked=True):
        """
        norm_pix_loss: standardise every target patch (last dim) by its own
                       mean and variance before comparing
        masked: average the loss over patches selected by ``mask`` only
        """
        super().__init__()
        self.masked = masked
        self.norm_pix_loss = norm_pix_loss

    def forward(self, pred, mask, target):
        """Return the scalar loss between ``pred`` and ``target``.

        ``mask`` weights the per-patch losses when ``self.masked`` is set and
        is ignored otherwise.
        """
        if self.norm_pix_loss:
            patch_mean = target.mean(dim=-1, keepdim=True)
            patch_var = target.var(dim=-1, keepdim=True)
            target = (target - patch_mean) / (patch_var + 1.0e-6) ** 0.5

        # squared error averaged over the last dimension -> one value per patch
        per_patch = ((pred - target) ** 2).mean(dim=-1)

        if not self.masked:
            return per_patch.mean()
        # weighted mean over the patches selected by the mask
        return (per_patch * mask).sum() / mask.sum()
class CrocoConfig(PretrainedConfig):
    """Hyper-parameters of the CroCo encoder/decoder.

    Extra keyword arguments are forwarded to ``PretrainedConfig`` so that the
    generic configuration machinery (which re-instantiates the class with
    additional keys) keeps working.
    """

    model_type = "croco"

    def __init__(
        self,
        img_size=224,  # input image size
        patch_size=16,  # patch_size
        mask_ratio=0.9,  # ratios of masked tokens
        enc_embed_dim=768,  # encoder feature dimension
        enc_depth=12,  # encoder depth
        enc_num_heads=12,  # encoder number of heads in the transformer block
        dec_embed_dim=512,  # decoder feature dimension
        dec_depth=8,  # decoder depth
        dec_num_heads=16,  # decoder number of heads in the transformer block
        mlp_ratio=4,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        norm_im2_in_dec=True,  # whether to apply normalization of the 'memory' = (second image) in the decoder
        pos_embed="cosine",  # positional embedding (either cosine or RoPE100)
        **kwargs,
    ):
        # Previously any extra key raised TypeError here; pass them on to the
        # base class instead.
        super().__init__(**kwargs)
        self.img_size = img_size
        self.patch_size = patch_size
        self.mask_ratio = mask_ratio
        self.enc_embed_dim = enc_embed_dim
        self.enc_depth = enc_depth
        self.enc_num_heads = enc_num_heads
        self.dec_embed_dim = dec_embed_dim
        self.dec_depth = dec_depth
        self.dec_num_heads = dec_num_heads
        self.mlp_ratio = mlp_ratio
        # NOTE(review): norm_layer is a callable (functools.partial), which is
        # presumably not JSON-serialisable by the base class - confirm before
        # relying on saving this config to disk.
        self.norm_layer = norm_layer
        self.norm_im2_in_dec = norm_im2_in_dec
        self.pos_embed = pos_embed
def _set_decoder(
    self,
    enc_embed_dim,
    dec_embed_dim,
    dec_num_heads,
    dec_depth,
    mlp_ratio,
    norm_layer,
    norm_im2_in_dec,
):
    """Create the decoder: input projection, transformer blocks, final norm.

    Reads ``self.rope`` and hands it to every decoder block.
    """
    self.dec_depth = dec_depth
    self.dec_embed_dim = dec_embed_dim

    # projection from encoder width to decoder width
    self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True)

    # stack of identical decoder blocks
    blocks = []
    for _ in range(dec_depth):
        blocks.append(
            DecoderBlock(
                dec_embed_dim,
                dec_num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=True,
                norm_layer=norm_layer,
                norm_mem=norm_im2_in_dec,
                rope=self.rope,
            )
        )
    self.dec_blocks = nn.ModuleList(blocks)

    # normalisation applied after the last block
    self.dec_norm = norm_layer(dec_embed_dim)
following official JAX ViT: + torch.nn.init.xavier_uniform_(m.weight) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def _encode_image(self, image, do_mask=False, return_all_blocks=False): + """ + image has B x 3 x img_size x img_size + do_mask: whether to perform masking or not + return_all_blocks: if True, return the features at the end of every block + instead of just the features from the last block (eg for some prediction heads) + """ + # embed the image into patches (x has size B x Npatches x C) + # and get position if each return patch (pos has size B x Npatches x 2) + x, pos = self.patch_embed(image) + # add positional embedding without cls token + if self.enc_pos_embed is not None: + x = x + self.enc_pos_embed[None, ...] + # apply masking + B, N, C = x.size() + if do_mask: + masks = self.mask_generator(x) + x = x[~masks].view(B, -1, C) + posvis = pos[~masks].view(B, -1, 2) + else: + B, N, C = x.size() + masks = torch.zeros((B, N), dtype=bool) + posvis = pos + # now apply the transformer encoder and normalization + if return_all_blocks: + out = [] + for blk in self.enc_blocks: + x = blk(x, posvis) + out.append(x) + out[-1] = self.enc_norm(out[-1]) + return out, pos, masks + else: + for blk in self.enc_blocks: + x = blk(x, posvis) + x = self.enc_norm(x) + return x, pos, masks + + def _decoder(self, feat1, pos1, masks1, feat2, pos2, return_all_blocks=False): + """ + return_all_blocks: if True, return the features at the end of every block + instead of just the features from the last block (eg for some prediction heads) + + masks1 can be None => assume image1 fully visible + """ + # encoder to decoder layer + visf1 = self.decoder_embed(feat1) + f2 = self.decoder_embed(feat2) + # append masked tokens to the sequence + B, Nenc, C = visf1.size() + if masks1 is None: # downstreams + f1_ = visf1 + else: # pretraining + 
def patchify(self, imgs):
    """Cut square images into flattened, non-overlapping patches.

    imgs: (B, C, H, W) with H == W and H divisible by the patch size
    x: (B, L, patch_size**2 * C) with L = (H / patch_size) ** 2

    Inside a patch, values are ordered row, then column, then channel, which
    is the layout ``unpatchify`` expects. The channel count is taken from the
    input instead of being fixed to 3.
    """
    p = self.patch_embed.patch_size[0]
    assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0

    n, c = imgs.shape[0], imgs.shape[1]
    h = w = imgs.shape[2] // p
    x = imgs.reshape(shape=(n, c, h, p, w, p))
    x = torch.einsum("nchpwq->nhwpqc", x)
    x = x.reshape(shape=(n, h * w, p**2 * c))

    return x
def croco_args_from_ckpt(ckpt):
    """Recover the CroCoNet constructor keyword arguments from a checkpoint.

    Three layouts are recognised:
      * ``ckpt["croco_kwargs"]`` (CroCo v2 released models): returned as is;
      * ``ckpt["args"].model`` (official training code): a string such as
        ``"CroCoNet(enc_embed_dim=1024, enc_num_heads=16, enc_depth=24)"``
        that is turned into a dict;
      * anything else (CroCo v1 released models): an empty dict.

    Raises:
        AssertionError: if ``ckpt["args"].model`` does not start with
            ``"CroCoNet("``.
    """
    if "croco_kwargs" in ckpt:  # CroCo v2 released models
        return ckpt["croco_kwargs"]

    if "args" in ckpt and hasattr(ckpt["args"], "model"):
        # pretrained using the official code release
        s = ckpt["args"].model
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        if not s.startswith("CroCoNet("):
            raise AssertionError(
                "unexpected model description in checkpoint: %r" % (s,)
            )
        # SECURITY: ``eval`` executes whatever expressions the checkpoint
        # string contains as call arguments. Only load checkpoints from
        # trusted sources.
        return eval("dict" + s[len("CroCoNet") :])

    return dict()  # CroCo v1 released models
def forward(self, img):
    """Encode ``img`` (batch_size x 3 x h x w) and apply the head.

    The head receives the encoder output (every block's output when the head
    has a truthy ``return_all_blocks`` attribute) together with a dict holding
    the input ``height`` and ``width``.
    """
    _, _, height, width = img.size()
    # heads without the attribute get the last block only
    want_all = getattr(self.head, "return_all_blocks", False)
    features, _, _ = self._encode_image(
        img, do_mask=False, return_all_blocks=want_all
    )
    return self.head(features, {"height": height, "width": width})
def forward(self, img1, img2):
    """Encode both images, decode the first against the second, apply the head.

    When the head has a truthy ``return_all_blocks`` attribute it receives
    the list of encoder block outputs of ``img1`` followed by the list of
    decoder block outputs; otherwise it receives the final decoder output.
    """
    _, _, height, width = img1.size()
    all_blocks = getattr(self.head, "return_all_blocks", False)

    enc1, enc2, pos1, pos2 = self.encode_image_pairs(
        img1, img2, return_all_blocks=all_blocks
    )
    # the decoder always starts from the last encoder output of image 1
    dec_input = enc1[-1] if all_blocks else enc1
    decoded = self._decoder(
        dec_input, pos1, None, enc2, pos2, return_all_blocks=all_blocks
    )
    if all_blocks:
        decoded = enc1 + decoded  # encoder outputs first, then decoder outputs
    return self.head(decoded, {"height": height, "width": width})
+ +from .curope2d import cuRoPE2D diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/curope/curope.cpp b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/curope/curope.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8fe9058e05aa1bf3f37b0d970edc7312bc68455b --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/curope/curope.cpp @@ -0,0 +1,69 @@ +/* + Copyright (C) 2022-present Naver Corporation. All rights reserved. + Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +*/ + +#include + +// forward declaration +void rope_2d_cuda( torch::Tensor tokens, const torch::Tensor pos, const float base, const float fwd ); + +void rope_2d_cpu( torch::Tensor tokens, const torch::Tensor positions, const float base, const float fwd ) +{ + const int B = tokens.size(0); + const int N = tokens.size(1); + const int H = tokens.size(2); + const int D = tokens.size(3) / 4; + + auto tok = tokens.accessor(); + auto pos = positions.accessor(); + + for (int b = 0; b < B; b++) { + for (int x = 0; x < 2; x++) { // y and then x (2d) + for (int n = 0; n < N; n++) { + + // grab the token position + const int p = pos[b][n][x]; + + for (int h = 0; h < H; h++) { + for (int d = 0; d < D; d++) { + // grab the two values + float u = tok[b][n][h][d+0+x*2*D]; + float v = tok[b][n][h][d+D+x*2*D]; + + // grab the cos,sin + const float inv_freq = fwd * p / powf(base, d/float(D)); + float c = cosf(inv_freq); + float s = sinf(inv_freq); + + // write the result + tok[b][n][h][d+0+x*2*D] = u*c - v*s; + tok[b][n][h][d+D+x*2*D] = v*c + u*s; + } + } + } + } + } +} + +void rope_2d( torch::Tensor tokens, // B,N,H,D + const torch::Tensor positions, // B,N,2 + const float base, + const float fwd ) +{ + TORCH_CHECK(tokens.dim() == 4, "tokens must have 4 dimensions"); + TORCH_CHECK(positions.dim() == 3, "positions must have 3 dimensions"); + TORCH_CHECK(tokens.size(0) == positions.size(0), "batch size differs between tokens & positions"); + 
TORCH_CHECK(tokens.size(1) == positions.size(1), "seq_length differs between tokens & positions"); + TORCH_CHECK(positions.size(2) == 2, "positions.shape[2] must be equal to 2"); + TORCH_CHECK(tokens.is_cuda() == positions.is_cuda(), "tokens and positions are not on the same device" ); + + if (tokens.is_cuda()) + rope_2d_cuda( tokens, positions, base, fwd ); + else + rope_2d_cpu( tokens, positions, base, fwd ); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("rope_2d", &rope_2d, "RoPE 2d forward/backward"); +} diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/curope/curope2d.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/curope/curope2d.py new file mode 100644 index 0000000000000000000000000000000000000000..7e0345c31bd3925be91dde5b9cfc64432f7bf516 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/curope/curope2d.py @@ -0,0 +1,40 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +import torch + +try: + import curope as _kernels # run `python setup.py install` +except ModuleNotFoundError: + from . 
import curope as _kernels # run `python setup.py build_ext --inplace` + + +class cuRoPE2D_func(torch.autograd.Function): + + @staticmethod + def forward(ctx, tokens, positions, base, F0=1): + ctx.save_for_backward(positions) + ctx.saved_base = base + ctx.saved_F0 = F0 + # tokens = tokens.clone() # uncomment this if inplace doesn't work + _kernels.rope_2d(tokens, positions, base, F0) + ctx.mark_dirty(tokens) + return tokens + + @staticmethod + def backward(ctx, grad_res): + positions, base, F0 = ctx.saved_tensors[0], ctx.saved_base, ctx.saved_F0 + _kernels.rope_2d(grad_res, positions, base, -F0) + ctx.mark_dirty(grad_res) + return grad_res, None, None, None + + +class cuRoPE2D(torch.nn.Module): + def __init__(self, freq=100.0, F0=1.0): + super().__init__() + self.base = freq + self.F0 = F0 + + def forward(self, tokens, positions): + cuRoPE2D_func.apply(tokens.transpose(1, 2), positions, self.base, self.F0) + return tokens diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/curope/kernels.cu b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/curope/kernels.cu new file mode 100644 index 0000000000000000000000000000000000000000..7156cd1bb935cb1f0be45e58add53f9c21505c20 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/curope/kernels.cu @@ -0,0 +1,108 @@ +/* + Copyright (C) 2022-present Naver Corporation. All rights reserved. + Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
// In-place 2D rotary embedding.
//
// Launch layout (see the indexing below): one block per (batch, token) pair,
// i.e. blockIdx.x = b * N + n, and one thread per channel of a head
// (blockDim.x == D). Dynamic shared memory must hold D + D/4 floats:
// D values of the head being processed, followed by D/4 inverse frequencies.
//
// tokens : (B, N, H, D), updated in place
// pos    : flat int64 buffer, read at [blockIdx.x * 2 + {0, 1}]
// base   : base of the geometric frequency progression
// fwd    : multiplier applied to the angle
template < typename scalar_t >
__global__ void rope_2d_cuda_kernel(
        torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> tokens,
        const int64_t* __restrict__ pos,
        const float base,
        const float fwd )
{
    // tokens shape = (B, N, H, D)
    const int N = tokens.size(1);
    const int H = tokens.size(2);
    const int D = tokens.size(3);

    // each block updates a single token, for all heads
    // each thread takes care of a single output
    extern __shared__ float shared[];
    float* shared_inv_freq = shared + D;

    const int b = blockIdx.x / N;
    const int n = blockIdx.x % N;

    const int Q = D / 4;
    // one token = [0..Q : Q..2Q : 2Q..3Q : 3Q..D]
    //              u_Y     v_Y     u_X      v_X

    // shared memory: first, compute inv_freq
    if (threadIdx.x < Q)
        shared_inv_freq[threadIdx.x] = fwd / powf(base, threadIdx.x/float(Q));
    __syncthreads();

    // start of X or Y part
    const int X = threadIdx.x < D/2 ? 0 : 1;
    const int m = (X*D/2) + (threadIdx.x % Q); // index of u_Y or u_X

    // grab the cos,sin appropriate for me
    const float freq = pos[blockIdx.x*2+X] * shared_inv_freq[threadIdx.x % Q];
    const float cos = cosf(freq);
    const float sin = sinf(freq);

    for (int h = 0; h < H; h++)
    {
        // then, load all the token for this head in shared memory
        shared[threadIdx.x] = tokens[b][n][h][threadIdx.x];
        __syncthreads();

        const float u = shared[m];
        const float v = shared[m+Q];

        // write output
        if ((threadIdx.x % (D/2)) < Q)
            tokens[b][n][h][threadIdx.x] = u*cos - v*sin;
        else
            tokens[b][n][h][threadIdx.x] = v*cos + u*sin;

        // Every thread must have read u and v before any thread overwrites
        // `shared` with the next head. Without this barrier a thread that
        // runs ahead can clobber a slot another thread has not read yet.
        __syncthreads();
    }
}
# Copyright (C) 2022-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).

# Build script for the `curope` extension (sources: curope.cpp + kernels.cu).

from setuptools import setup
from torch import cuda
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

# compile for all possible CUDA architectures
# NOTE(review): per the PyTorch docs, get_gencode_flags() reports the gencode
# flags the installed torch library was compiled with - confirm that this is
# the intended set. Every "compute=" substring is rewritten to "arch=" and the
# string is split on whitespace into separate nvcc arguments.
all_cuda_archs = cuda.get_gencode_flags().replace("compute=", "arch=").split()
# alternatively, you can list cuda archs that you want, eg:
# all_cuda_archs = [
#     '-gencode', 'arch=compute_70,code=sm_70',
#     '-gencode', 'arch=compute_75,code=sm_75',
#     '-gencode', 'arch=compute_80,code=sm_80',
#     '-gencode', 'arch=compute_86,code=sm_86'
# ]

setup(
    name="curope",
    ext_modules=[
        CUDAExtension(
            name="curope",
            sources=[
                "curope.cpp",
                "kernels.cu",
            ],
            # nvcc receives the optimisation flags plus the architecture list
            # built above; the host C++ compiler only receives -O3
            extra_compile_args=dict(
                nvcc=["-O3", "--ptxas-options=-v", "--use_fast_math"] + all_cuda_archs,
                cxx=["-O3"],
            ),
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)
def make_scratch(in_shape, out_shape, groups=1, expand=False):
    """Build the four 3x3 bias-free convolutions of the DPT "scratch" module.

    ``in_shape[i]`` is the input width of stage ``i``. Every stage outputs
    ``out_shape`` channels, or ``out_shape * (1, 2, 4, 8)`` when ``expand`` is
    True. The convolutions are exposed both as ``layer{1..4}_rn`` attributes
    and, in the same order, through the ``layer_rn`` ModuleList.
    """
    # equality test kept on purpose: only values equal to True enable expansion
    if expand == True:
        widths = [out_shape, out_shape * 2, out_shape * 4, out_shape * 8]
    else:
        widths = [out_shape] * 4

    scratch = nn.Module()
    stages = []
    for idx in range(4):
        conv = nn.Conv2d(
            in_shape[idx],
            widths[idx],
            kernel_size=3,
            stride=1,
            padding=1,
            bias=False,
            groups=groups,
        )
        setattr(scratch, "layer%d_rn" % (idx + 1), conv)
        stages.append(conv)

    scratch.layer_rn = nn.ModuleList(stages)
    return scratch
def forward(self, x):
    """Apply the two activation -> conv (-> batch norm) stages and add ``x``.

    Args:
        x (tensor): input
    Returns:
        tensor: ``x`` plus the output of the second stage
    """
    use_bn = self.bn == True

    # first stage
    out = self.conv1(self.activation(x))
    if use_bn:
        out = self.bn1(out)

    # second stage
    out = self.conv2(self.activation(out))
    if use_bn:
        out = self.bn2(out)

    if self.groups > 1:
        out = self.conv_merge(out)

    # residual connection
    return self.skip_add.add(out, x)
class Interpolate(nn.Module):
    """Module wrapper around ``nn.functional.interpolate`` with fixed settings."""

    def __init__(self, scale_factor, mode, align_corners=False):
        """Store the resampling settings.

        Args:
            scale_factor (float): scaling
            mode (str): interpolation mode
            align_corners (bool): forwarded unchanged to the interpolation call
        """
        super().__init__()
        self.align_corners = align_corners
        self.mode = mode
        self.scale_factor = scale_factor
        self.interp = nn.functional.interpolate

    def forward(self, x):
        """Resample ``x`` with the stored settings.

        Args:
            x (tensor): input
        Returns:
            tensor: interpolated data
        """
        settings = dict(
            scale_factor=self.scale_factor,
            mode=self.mode,
            align_corners=self.align_corners,
        )
        return self.interp(x, **settings)
+ :param patch_size_full: Int or tuple of the patch size over the full image size. + Patch size for smaller inputs will be computed accordingly. + :param hooks: Index of intermediate layers + :param layer_dims: Dimension of intermediate layers + :param feature_dim: Feature dimension + :param last_dim: out_channels/in_channels for the last two Conv2d when head_type == regression + :param use_bn: If set to True, activates batch norm + :param dim_tokens_enc: Dimension of tokens coming from encoder + """ + + def __init__( + self, + num_channels: int = 1, + stride_level: int = 1, + patch_size: Union[int, Tuple[int, int]] = 16, + main_tasks: Iterable[str] = ("rgb",), + hooks: List[int] = [2, 5, 8, 11], + layer_dims: List[int] = [96, 192, 384, 768], + feature_dim: int = 256, + last_dim: int = 32, + use_bn: bool = False, + dim_tokens_enc: Optional[int] = None, + head_type: str = "regression", + output_width_ratio=1, + **kwargs + ): + super().__init__() + self.num_channels = num_channels + self.stride_level = stride_level + self.patch_size = pair(patch_size) + self.main_tasks = main_tasks + self.hooks = hooks + self.layer_dims = layer_dims + self.feature_dim = feature_dim + self.dim_tokens_enc = ( + dim_tokens_enc * len(self.main_tasks) + if dim_tokens_enc is not None + else None + ) + self.head_type = head_type + + # Actual patch height and width, taking into account stride of input + self.P_H = max(1, self.patch_size[0] // stride_level) + self.P_W = max(1, self.patch_size[1] // stride_level) + + self.scratch = make_scratch(layer_dims, feature_dim, groups=1, expand=False) + + self.scratch.refinenet1 = make_fusion_block( + feature_dim, use_bn, output_width_ratio + ) + self.scratch.refinenet2 = make_fusion_block( + feature_dim, use_bn, output_width_ratio + ) + self.scratch.refinenet3 = make_fusion_block( + feature_dim, use_bn, output_width_ratio + ) + self.scratch.refinenet4 = make_fusion_block( + feature_dim, use_bn, output_width_ratio + ) + + if self.head_type == 
"regression": + # The "DPTDepthModel" head + self.head = nn.Sequential( + nn.Conv2d( + feature_dim, feature_dim // 2, kernel_size=3, stride=1, padding=1 + ), + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2d( + feature_dim // 2, last_dim, kernel_size=3, stride=1, padding=1 + ), + nn.ReLU(True), + nn.Conv2d( + last_dim, self.num_channels, kernel_size=1, stride=1, padding=0 + ), + ) + elif self.head_type == "semseg": + # The "DPTSegmentationModel" head + self.head = nn.Sequential( + nn.Conv2d( + feature_dim, feature_dim, kernel_size=3, padding=1, bias=False + ), + nn.BatchNorm2d(feature_dim) if use_bn else nn.Identity(), + nn.ReLU(True), + nn.Dropout(0.1, False), + nn.Conv2d(feature_dim, self.num_channels, kernel_size=1), + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + ) + else: + raise ValueError('DPT head_type must be "regression" or "semseg".') + + if self.dim_tokens_enc is not None: + self.init(dim_tokens_enc=dim_tokens_enc) + + def init(self, dim_tokens_enc=768): + """ + Initialize parts of decoder that are dependent on dimension of encoder tokens. + Should be called when setting up MultiMAE. 
+ + :param dim_tokens_enc: Dimension of tokens coming from encoder + """ + # print(dim_tokens_enc) + + # Set up activation postprocessing layers + if isinstance(dim_tokens_enc, int): + dim_tokens_enc = 4 * [dim_tokens_enc] + + self.dim_tokens_enc = [dt * len(self.main_tasks) for dt in dim_tokens_enc] + + self.act_1_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[0], + out_channels=self.layer_dims[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=self.layer_dims[0], + out_channels=self.layer_dims[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + self.act_2_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[1], + out_channels=self.layer_dims[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=self.layer_dims[1], + out_channels=self.layer_dims[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + self.act_3_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[2], + out_channels=self.layer_dims[2], + kernel_size=1, + stride=1, + padding=0, + ) + ) + + self.act_4_postprocess = nn.Sequential( + nn.Conv2d( + in_channels=self.dim_tokens_enc[3], + out_channels=self.layer_dims[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=self.layer_dims[3], + out_channels=self.layer_dims[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + self.act_postprocess = nn.ModuleList( + [ + self.act_1_postprocess, + self.act_2_postprocess, + self.act_3_postprocess, + self.act_4_postprocess, + ] + ) + + def adapt_tokens(self, encoder_tokens): + # Adapt tokens + x = [] + x.append(encoder_tokens[:, :]) + x = torch.cat(x, dim=-1) + return x + + def forward(self, encoder_tokens: List[torch.Tensor], image_size): + # input_info: Dict): + assert ( + self.dim_tokens_enc is not None + ), "Need to call init(dim_tokens_enc) function 
first" + H, W = image_size + + # Number of patches in height and width + N_H = H // (self.stride_level * self.P_H) + N_W = W // (self.stride_level * self.P_W) + + # Hook decoder onto 4 layers from specified ViT layers + layers = [encoder_tokens[hook] for hook in self.hooks] + + # Extract only task-relevant tokens and ignore global tokens. + layers = [self.adapt_tokens(l) for l in layers] + + # Reshape tokens to spatial representation + layers = [ + rearrange(l, "b (nh nw) c -> b c nh nw", nh=N_H, nw=N_W) for l in layers + ] + + layers = [self.act_postprocess[idx](l) for idx, l in enumerate(layers)] + # Project layers to chosen feature dim + layers = [self.scratch.layer_rn[idx](l) for idx, l in enumerate(layers)] + + # Fuse layers using refinement stages + path_4 = self.scratch.refinenet4(layers[3]) + path_3 = self.scratch.refinenet3(path_4, layers[2]) + path_2 = self.scratch.refinenet2(path_3, layers[1]) + path_1 = self.scratch.refinenet1(path_2, layers[0]) + + # Output head + out = self.head(path_1) + + return out diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/head_downstream.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/head_downstream.py new file mode 100644 index 0000000000000000000000000000000000000000..384afcbd6ac9d4b5729c0219dd8534b5123d2b17 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/head_downstream.py @@ -0,0 +1,83 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# Heads for downstream tasks +# -------------------------------------------------------- + +""" +A head is a module where the __init__ defines only the head hyperparameters. +A method setup(croconet) takes a CroCoNet and set all layers according to the head and croconet attributes. 
+The forward takes the features as well as a dictionary img_info containing the keys 'width' and 'height' +""" + +import torch +import torch.nn as nn +from .dpt_block import DPTOutputAdapter + + +class PixelwiseTaskWithDPT(nn.Module): + """DPT module for CroCo. + by default, hooks_idx will be equal to: + * for encoder-only: 4 equally spread layers + * for encoder+decoder: last encoder + 3 equally spread layers of the decoder + """ + + def __init__( + self, + *, + hooks_idx=None, + layer_dims=[96, 192, 384, 768], + output_width_ratio=1, + num_channels=1, + postprocess=None, + **kwargs, + ): + super(PixelwiseTaskWithDPT, self).__init__() + self.return_all_blocks = True # backbone needs to return all layers + self.postprocess = postprocess + self.output_width_ratio = output_width_ratio + self.num_channels = num_channels + self.hooks_idx = hooks_idx + self.layer_dims = layer_dims + + def setup(self, croconet): + dpt_args = { + "output_width_ratio": self.output_width_ratio, + "num_channels": self.num_channels, + } + if self.hooks_idx is None: + if hasattr(croconet, "dec_blocks"): # encoder + decoder + step = {8: 3, 12: 4, 24: 8}[croconet.dec_depth] + hooks_idx = [ + croconet.dec_depth + croconet.enc_depth - 1 - i * step + for i in range(3, -1, -1) + ] + else: # encoder only + step = croconet.enc_depth // 4 + hooks_idx = [ + croconet.enc_depth - 1 - i * step for i in range(3, -1, -1) + ] + self.hooks_idx = hooks_idx + print( + f" PixelwiseTaskWithDPT: automatically setting hook_idxs={self.hooks_idx}" + ) + dpt_args["hooks"] = self.hooks_idx + dpt_args["layer_dims"] = self.layer_dims + self.dpt = DPTOutputAdapter(**dpt_args) + dim_tokens = [ + ( + croconet.enc_embed_dim + if hook < croconet.enc_depth + else croconet.dec_embed_dim + ) + for hook in self.hooks_idx + ] + dpt_init_args = {"dim_tokens_enc": dim_tokens} + self.dpt.init(**dpt_init_args) + + def forward(self, x, img_info): + out = self.dpt(x, image_size=(img_info["height"], img_info["width"])) + if 
self.postprocess: + out = self.postprocess(out) + return out diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/masking.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/masking.py new file mode 100644 index 0000000000000000000000000000000000000000..ae18f927ae82e4075c2246ce722007c69a4da344 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/masking.py @@ -0,0 +1,26 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + + +# -------------------------------------------------------- +# Masking utils +# -------------------------------------------------------- + +import torch +import torch.nn as nn + + +class RandomMask(nn.Module): + """ + random masking + """ + + def __init__(self, num_patches, mask_ratio): + super().__init__() + self.num_patches = num_patches + self.num_mask = int(mask_ratio * self.num_patches) + + def __call__(self, x): + noise = torch.rand(x.size(0), self.num_patches, device=x.device) + argsort = torch.argsort(noise, dim=1) + return argsort < self.num_mask diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/pos_embed.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/pos_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..0f76e4d5be2222d446f14d7fb24a047b686cb328 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/models/pos_embed.py @@ -0,0 +1,179 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
# --------------------------------------------------------
# Position embedding utils
# --------------------------------------------------------


import numpy as np

import torch


# --------------------------------------------------------
# 2D sine-cosine position embedding
# References:
# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
# MoCo v3: https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------
def get_2d_sincos_pos_embed(embed_dim, grid_size, n_cls_token=0):
    """Build a fixed 2D sine-cosine position embedding table.

    Args:
        embed_dim: width of each embedding row. Must be divisible by 4: it is
            halved per axis here and halved again into sin/cos parts by
            ``get_1d_sincos_pos_embed_from_grid``.
        grid_size: either a single int (square ``grid_size x grid_size`` grid)
            or a ``(height, width)`` pair for a rectangular grid.
        n_cls_token: number of all-zero rows prepended to the table.

    Returns:
        ``np.ndarray`` of shape ``[n_cls_token + H*W, embed_dim]``; grid rows
        are laid out row-major (index ``i * W + j`` for row ``i``, column ``j``).
    """
    # Accept a scalar (square grid, the historical behaviour) or an (H, W) pair.
    if np.ndim(grid_size) == 0:
        grid_height = grid_width = int(grid_size)
    else:
        grid_height, grid_width = (int(side) for side in grid_size)

    grid_h = np.arange(grid_height, dtype=np.float32)
    grid_w = np.arange(grid_width, dtype=np.float32)
    # meshgrid uses 'xy' indexing: element 0 holds the w (column) coordinate,
    # element 1 holds the h (row) coordinate; both have shape (H, W).
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_height, grid_width])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if n_cls_token > 0:
        pos_embed = np.concatenate(
            [np.zeros([n_cls_token, embed_dim]), pos_embed], axis=0
        )
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Encode a 2-channel coordinate grid into sine-cosine embeddings.

    Args:
        embed_dim: output width; must be even (each channel gets half of it).
        grid: array indexable as ``grid[0]`` / ``grid[1]``, one coordinate
            channel each. When produced by ``get_2d_sincos_pos_embed``,
            ``grid[0]`` is the column (w) coordinate and ``grid[1]`` the row
            (h) coordinate.

    Returns:
        ``np.ndarray`` of shape ``(H*W, embed_dim)``: the first half of each
        row encodes ``grid[0]``, the second half encodes ``grid[1]``.
    """
    assert embed_dim % 2 == 0

    # First half of the channels encodes grid[0], second half encodes grid[1].
    # The order is kept exactly as in the reference implementation so that
    # tables stay compatible with existing checkpoints.
    emb_first = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_second = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_first, emb_second], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """Encode scalar positions with sines and cosines at geometric frequencies.

    Args:
        embed_dim: output dimension for each position; must be even.
        pos: array of positions to encode; flattened to size ``(M,)``.

    Returns:
        ``np.ndarray`` of shape ``(M, embed_dim)``: ``embed_dim / 2`` sine
        columns followed by ``embed_dim / 2`` cosine columns.
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=float)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


# --------------------------------------------------------
# Interpolate position embeddings for high-resolution
# References:
# MAE: https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
def interpolate_pos_embed(model, checkpoint_model):
    """Resize a checkpoint's ``pos_embed`` in place to match ``model``.

    Does nothing when ``checkpoint_model`` has no ``"pos_embed"`` entry or when
    the checkpoint grid already has the model's side length. Otherwise the
    positional part is bicubically resampled and written back to
    ``checkpoint_model["pos_embed"]``; leading extra tokens are kept unchanged.

    Args:
        model: object exposing ``patch_embed.num_patches`` and ``pos_embed``.
        checkpoint_model: mutable mapping (state dict) that may hold
            ``"pos_embed"``.

    Note:
        Both grids are treated as square: side lengths are obtained with
        ``int(n ** 0.5)``.
    """
    if "pos_embed" not in checkpoint_model:
        return

    pos_embed_checkpoint = checkpoint_model["pos_embed"]
    embedding_size = pos_embed_checkpoint.shape[-1]
    num_patches = model.patch_embed.num_patches
    num_extra_tokens = model.pos_embed.shape[-2] - num_patches
    # height (== width) for the checkpoint position embedding
    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
    # height (== width) for the new position embedding
    new_size = int(num_patches**0.5)
    if orig_size == new_size:
        return

    print(
        "Position interpolate from %dx%d to %dx%d"
        % (orig_size, orig_size, new_size, new_size)
    )
    # class_token and dist_token are kept unchanged
    extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
    # only the position tokens are interpolated
    pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
    # (1, N, C) -> (1, C, H, W), the layout F.interpolate expects
    pos_tokens = pos_tokens.reshape(
        -1, orig_size, orig_size, embedding_size
    ).permute(0, 3, 1, 2)
    pos_tokens = torch.nn.functional.interpolate(
        pos_tokens,
        size=(new_size, new_size),
        mode="bicubic",
        align_corners=False,
    )
    # back to (1, H*W, C)
    pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
    new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
    checkpoint_model["pos_embed"] = new_pos_embed


# ----------------------------------------------------------
# RoPE2D: RoPE implementation in 2D
# 
---------------------------------------------------------- + +try: + from models.curope import cuRoPE2D + + RoPE2D = cuRoPE2D +except ImportError: + print( + "Warning, cannot find cuda-compiled version of RoPE2D, using a slow pytorch version instead" + ) + + class RoPE2D(torch.nn.Module): + + def __init__(self, freq=100.0, F0=1.0): + super().__init__() + self.base = freq + self.F0 = F0 + self.cache = {} + + def get_cos_sin(self, D, seq_len, device, dtype): + if (D, seq_len, device, dtype) not in self.cache: + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, D, 2).float().to(device) / D) + ) + t = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, inv_freq).to(dtype) + freqs = torch.cat((freqs, freqs), dim=-1) + cos = freqs.cos() # (Seq, Dim) + sin = freqs.sin() + self.cache[D, seq_len, device, dtype] = (cos, sin) + return self.cache[D, seq_len, device, dtype] + + @staticmethod + def rotate_half(x): + x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def apply_rope1d(self, tokens, pos1d, cos, sin): + assert pos1d.ndim == 2 + cos = torch.nn.functional.embedding(pos1d, cos)[:, None, :, :] + sin = torch.nn.functional.embedding(pos1d, sin)[:, None, :, :] + return (tokens * cos) + (self.rotate_half(tokens) * sin) + + def forward(self, tokens, positions): + """ + input: + * tokens: batch_size x nheads x ntokens x dim + * positions: batch_size x ntokens x 2 (y and x position of each token) + output: + * tokens after appplying RoPE2D (batch_size x nheads x ntokens x dim) + """ + assert ( + tokens.size(3) % 2 == 0 + ), "number of dimensions should be a multiple of two" + D = tokens.size(3) // 2 + assert positions.ndim == 3 and positions.shape[-1] == 2 # Batch, Seq, 2 + cos, sin = self.get_cos_sin( + D, int(positions.max()) + 1, tokens.device, tokens.dtype + ) + # split features into two along the feature dimension, and apply rope1d on each half + y, x = tokens.chunk(2, 
dim=-1) + y = self.apply_rope1d(y, positions[:, :, 0], cos, sin) + x = self.apply_rope1d(x, positions[:, :, 1], cos, sin) + tokens = torch.cat((y, x), dim=-1) + return tokens diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/pretrain.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/pretrain.py new file mode 100644 index 0000000000000000000000000000000000000000..fef4ff2a0b7cb865a68741ac0e76d43d50ee4659 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/pretrain.py @@ -0,0 +1,391 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# Pre-training CroCo +# -------------------------------------------------------- +# References: +# MAE: https://github.com/facebookresearch/mae +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- +import argparse +import datetime +import json +import numpy as np +import os +import sys +import time +import math +from pathlib import Path +from typing import Iterable + +import torch +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets + +import utils.misc as misc +from utils.misc import NativeScalerWithGradNormCount as NativeScaler +from models.croco import CroCoNet +from models.criterion import MaskedMSE +from datasets.pairs_dataset import PairsDataset + + +def get_args_parser(): + parser = argparse.ArgumentParser("CroCo pre-training", add_help=False) + # model and criterion + parser.add_argument( + "--model", + default="CroCoNet()", + type=str, + help="string containing the model to build", + ) + parser.add_argument( + "--norm_pix_loss", + default=1, + choices=[0, 1], + help="apply per-patch mean/std 
normalization before applying the loss", + ) + # dataset + parser.add_argument( + "--dataset", default="habitat_release", type=str, help="training set" + ) + parser.add_argument( + "--transforms", default="crop224+acolor", type=str, help="transforms to apply" + ) # in the paper, we also use some homography and rotation, but find later that they were not useful or even harmful + # training + parser.add_argument("--seed", default=0, type=int, help="Random seed") + parser.add_argument( + "--batch_size", + default=64, + type=int, + help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus", + ) + parser.add_argument( + "--epochs", + default=800, + type=int, + help="Maximum number of epochs for the scheduler", + ) + parser.add_argument( + "--max_epoch", default=400, type=int, help="Stop training at this epoch" + ) + parser.add_argument( + "--accum_iter", + default=1, + type=int, + help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)", + ) + parser.add_argument( + "--weight_decay", type=float, default=0.05, help="weight decay (default: 0.05)" + ) + parser.add_argument( + "--lr", + type=float, + default=None, + metavar="LR", + help="learning rate (absolute lr)", + ) + parser.add_argument( + "--blr", + type=float, + default=1.5e-4, + metavar="LR", + help="base learning rate: absolute_lr = base_lr * total_batch_size / 256", + ) + parser.add_argument( + "--min_lr", + type=float, + default=0.0, + metavar="LR", + help="lower lr bound for cyclic schedulers that hit 0", + ) + parser.add_argument( + "--warmup_epochs", type=int, default=40, metavar="N", help="epochs to warmup LR" + ) + parser.add_argument( + "--amp", + type=int, + default=1, + choices=[0, 1], + help="Use Automatic Mixed Precision for pretraining", + ) + # others + parser.add_argument("--num_workers", default=8, type=int) + parser.add_argument( + "--world_size", default=1, type=int, help="number of distributed processes" + ) + 
parser.add_argument("--local_rank", default=-1, type=int) + parser.add_argument( + "--dist_url", default="env://", help="url used to set up distributed training" + ) + parser.add_argument( + "--save_freq", + default=1, + type=int, + help="frequence (number of epochs) to save checkpoint in checkpoint-last.pth", + ) + parser.add_argument( + "--keep_freq", + default=20, + type=int, + help="frequence (number of epochs) to save checkpoint in checkpoint-%d.pth", + ) + parser.add_argument( + "--print_freq", + default=20, + type=int, + help="frequence (number of iterations) to print infos while training", + ) + # paths + parser.add_argument( + "--output_dir", + default="./output/", + type=str, + help="path where to save the output", + ) + parser.add_argument( + "--data_dir", default="./data/", type=str, help="path where data are stored" + ) + return parser + + +def main(args): + misc.init_distributed_mode(args) + global_rank = misc.get_rank() + world_size = misc.get_world_size() + + print("output_dir: " + args.output_dir) + if args.output_dir: + Path(args.output_dir).mkdir(parents=True, exist_ok=True) + + # auto resume + last_ckpt_fname = os.path.join(args.output_dir, f"checkpoint-last.pth") + args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None + + print("job dir: {}".format(os.path.dirname(os.path.realpath(__file__)))) + print("{}".format(args).replace(", ", ",\n")) + + device = "cuda" if torch.cuda.is_available() else "cpu" + device = torch.device(device) + + # fix the seed + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + + cudnn.benchmark = True + + ## training dataset and loader + print( + "Building dataset for {:s} with transforms {:s}".format( + args.dataset, args.transforms + ) + ) + dataset = PairsDataset(args.dataset, trfs=args.transforms, data_dir=args.data_dir) + if world_size > 1: + sampler_train = torch.utils.data.DistributedSampler( + dataset, num_replicas=world_size, rank=global_rank, shuffle=True 
+ ) + print("Sampler_train = %s" % str(sampler_train)) + else: + sampler_train = torch.utils.data.RandomSampler(dataset) + data_loader_train = torch.utils.data.DataLoader( + dataset, + sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=True, + drop_last=True, + ) + + ## model + print("Loading model: {:s}".format(args.model)) + model = eval(args.model) + print( + "Loading criterion: MaskedMSE(norm_pix_loss={:s})".format( + str(bool(args.norm_pix_loss)) + ) + ) + criterion = MaskedMSE(norm_pix_loss=bool(args.norm_pix_loss)) + + model.to(device) + model_without_ddp = model + print("Model = %s" % str(model_without_ddp)) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + if args.lr is None: # only base_lr is specified + args.lr = args.blr * eff_batch_size / 256 + print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) + print("actual lr: %.2e" % args.lr) + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.gpu], find_unused_parameters=True, static_graph=True + ) + model_without_ddp = model.module + + param_groups = misc.get_parameter_groups( + model_without_ddp, args.weight_decay + ) # following timm: set wd as 0 for bias and norm layers + optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) + print(optimizer) + loss_scaler = NativeScaler() + + misc.load_model( + args=args, + model_without_ddp=model_without_ddp, + optimizer=optimizer, + loss_scaler=loss_scaler, + ) + + if global_rank == 0 and args.output_dir is not None: + log_writer = SummaryWriter(log_dir=args.output_dir) + else: + log_writer = None + + print(f"Start training until {args.max_epoch} epochs") + start_time = time.time() + for epoch in range(args.start_epoch, args.max_epoch): + if world_size > 1: + data_loader_train.sampler.set_epoch(epoch) + + 
train_stats = train_one_epoch( + model, + criterion, + data_loader_train, + optimizer, + device, + epoch, + loss_scaler, + log_writer=log_writer, + args=args, + ) + + if args.output_dir and epoch % args.save_freq == 0: + misc.save_model( + args=args, + model_without_ddp=model_without_ddp, + optimizer=optimizer, + loss_scaler=loss_scaler, + epoch=epoch, + fname="last", + ) + + if ( + args.output_dir + and (epoch % args.keep_freq == 0 or epoch + 1 == args.max_epoch) + and (epoch > 0 or args.max_epoch == 1) + ): + misc.save_model( + args=args, + model_without_ddp=model_without_ddp, + optimizer=optimizer, + loss_scaler=loss_scaler, + epoch=epoch, + ) + + log_stats = { + **{f"train_{k}": v for k, v in train_stats.items()}, + "epoch": epoch, + } + + if args.output_dir and misc.is_main_process(): + if log_writer is not None: + log_writer.flush() + with open( + os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8" + ) as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print("Training time {}".format(total_time_str)) + + +def train_one_epoch( + model: torch.nn.Module, + criterion: torch.nn.Module, + data_loader: Iterable, + optimizer: torch.optim.Optimizer, + device: torch.device, + epoch: int, + loss_scaler, + log_writer=None, + args=None, +): + model.train(True) + metric_logger = misc.MetricLogger(delimiter=" ") + metric_logger.add_meter("lr", misc.SmoothedValue(window_size=1, fmt="{value:.6f}")) + header = "Epoch: [{}]".format(epoch) + accum_iter = args.accum_iter + + optimizer.zero_grad() + + if log_writer is not None: + print("log_dir: {}".format(log_writer.log_dir)) + + for data_iter_step, (image1, image2) in enumerate( + metric_logger.log_every(data_loader, args.print_freq, header) + ): + + # we use a per iteration lr scheduler + if data_iter_step % accum_iter == 0: + misc.adjust_learning_rate( + optimizer, data_iter_step / len(data_loader) + 
epoch, args + ) + + image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + with torch.cuda.amp.autocast(enabled=bool(args.amp)): + out, mask, target = model(image1, image2) + loss = criterion(out, mask, target) + + loss_value = loss.item() + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + loss /= accum_iter + loss_scaler( + loss, + optimizer, + parameters=model.parameters(), + update_grad=(data_iter_step + 1) % accum_iter == 0, + ) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(lr=lr) + + loss_value_reduce = misc.all_reduce_mean(loss_value) + if ( + log_writer is not None + and ((data_iter_step + 1) % (accum_iter * args.print_freq)) == 0 + ): + # x-axis is based on epoch_1000x in the tensorboard, calibrating differences curves when batch size changes + epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) + log_writer.add_scalar("train_loss", loss_value_reduce, epoch_1000x) + log_writer.add_scalar("lr", lr, epoch_1000x) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +if __name__ == "__main__": + args = get_args_parser() + args = args.parse_args() + main(args) diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/README.MD b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..81595380fadd274b523e0cf77921b1b65cbedb34 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/README.MD @@ -0,0 +1,318 @@ +## CroCo-Stereo and CroCo-Flow + +This README explains how to use CroCo-Stereo and CroCo-Flow as well as how 
they were trained. +All commands should be launched from the root directory. + +### Simple inference example + +We provide a simple inference example for CroCo-Stereo and CroCo-Flow in the notebook `croco-stereo-flow-demo.ipynb`. +Before running it, please download the trained models with: +``` +bash stereoflow/download_model.sh crocostereo.pth +bash stereoflow/download_model.sh crocoflow.pth +``` + +### Prepare data for training or evaluation + +Put the datasets used for training/evaluation in `./data/stereoflow` (or update the paths at the top of `stereoflow/datasets_stereo.py` and `stereoflow/datasets_flow.py`). +Please find below how the file structure should look for each dataset: +
+FlyingChairs + +``` +./data/stereoflow/FlyingChairs/ +└───chairs_split.txt +└───data/ + └─── ... +``` +
+ +
+MPI-Sintel + +``` +./data/stereoflow/MPI-Sintel/ +└───training/ +│ └───clean/ +│ └───final/ +│ └───flow/ +└───test/ + └───clean/ + └───final/ +``` +
+ +
+SceneFlow (including FlyingThings) + +``` +./data/stereoflow/SceneFlow/ +└───Driving/ +│ └───disparity/ +│ └───frames_cleanpass/ +│ └───frames_finalpass/ +└───FlyingThings/ +│ └───disparity/ +│ └───frames_cleanpass/ +│ └───frames_finalpass/ +│ └───optical_flow/ +└───Monkaa/ + └───disparity/ + └───frames_cleanpass/ + └───frames_finalpass/ +``` +
+ +
+TartanAir + +``` +./data/stereoflow/TartanAir/ +└───abandonedfactory/ +│ └───.../ +└───abandonedfactory_night/ +│ └───.../ +└───.../ +``` +
+ +
+Booster + +``` +./data/stereoflow/booster_gt/ +└───train/ + └───balanced/ + └───Bathroom/ + └───Bedroom/ + └───... +``` +
+ +
+CREStereo + +``` +./data/stereoflow/crenet_stereo_trainset/ +└───stereo_trainset/ + └───crestereo/ + └───hole/ + └───reflective/ + └───shapenet/ + └───tree/ +``` +
+ +
+ETH3D Two-view Low-res + +``` +./data/stereoflow/eth3d_lowres/ +└───test/ +│ └───lakeside_1l/ +│ └───... +└───train/ +│ └───delivery_area_1l/ +│ └───... +└───train_gt/ + └───delivery_area_1l/ + └───... +``` +
+ +
+KITTI 2012 + +``` +./data/stereoflow/kitti-stereo-2012/ +└───testing/ +│ └───colored_0/ +│ └───colored_1/ +└───training/ + └───colored_0/ + └───colored_1/ + └───disp_occ/ + └───flow_occ/ +``` +
+ +
+KITTI 2015 + +``` +./data/stereoflow/kitti-stereo-2015/ +└───testing/ +│ └───image_2/ +│ └───image_3/ +└───training/ + └───image_2/ + └───image_3/ + └───disp_occ_0/ + └───flow_occ/ +``` +
+ +
+Middlebury + +``` +./data/stereoflow/middlebury +└───2005/ +│ └───train/ +│ └───Art/ +│ └───... +└───2006/ +│ └───Aloe/ +│ └───Baby1/ +│ └───... +└───2014/ +│ └───Adirondack-imperfect/ +│ └───Adirondack-perfect/ +│ └───... +└───2021/ +│ └───data/ +│ └───artroom1/ +│ └───artroom2/ +│ └───... +└───MiddEval3_F/ + └───test/ + │ └───Australia/ + │ └───... + └───train/ + └───Adirondack/ + └───... +``` +
+ +
+Spring + +``` +./data/stereoflow/spring/ +└───test/ +│ └───0003/ +│ └───... +└───train/ + └───0001/ + └───... +``` +
+ + +### CroCo-Stereo + +##### Main model + +The main training of CroCo-Stereo was performed on a series of datasets, and the resulting model was used as is for the Middlebury v3 benchmark. + +``` +# Download the model +bash stereoflow/download_model.sh crocostereo.pth +# Middlebury v3 submission +python stereoflow/test.py --model stereoflow_models/crocostereo.pth --dataset "MdEval3('all_full')" --save submission --tile_overlap 0.9 +# Training command that was used, using checkpoint-last.pth +python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/ +# or it can be launched on multiple gpus (while maintaining the effective batch size), e.g. on 3 gpus: +torchrun --nproc_per_node 3 stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 2 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main/ +``` + +For evaluation on the validation sets, we also provide the model trained on the `subtrain` subset of the training sets. 
+ +``` +# Download the model +bash stereoflow/download_model.sh crocostereo_subtrain.pth +# Evaluation on validation sets +python stereoflow/test.py --model stereoflow_models/crocostereo_subtrain.pth --dataset "MdEval3('subval_full')+ETH3DLowRes('subval')+SceneFlow('test_finalpass')+SceneFlow('test_cleanpass')" --save metrics --tile_overlap 0.9 +# Training command that was used (same as above but on subtrain, using checkpoint-best.pth), can also be launched on multiple gpus +python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('subtrain')+50*Md05('subtrain')+50*Md06('subtrain')+50*Md14('subtrain')+50*Md21('subtrain')+50*MdEval3('subtrain_full')+Booster('subtrain_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_subtrain/ +``` + +##### Other models + +
+ Model for ETH3D + The model used for the submission on ETH3D is trained with the same command but using an unbounded Laplacian loss. + + # Download the model + bash stereoflow/download_model.sh crocostereo_eth3d.pth + # ETH3D submission + python stereoflow/test.py --model stereoflow_models/crocostereo_eth3d.pth --dataset "ETH3DLowRes('all')" --save submission --tile_overlap 0.9 + # Training command that was used + python -u stereoflow/train.py stereo --criterion "LaplacianLoss()" --tile_conf_mode conf_expbeta3 --dataset "CREStereo('train')+SceneFlow('train_allpass')+30*ETH3DLowRes('train')+50*Md05('train')+50*Md06('train')+50*Md14('train')+50*Md21('train')+50*MdEval3('train_full')+Booster('train_balanced')" --val_dataset "SceneFlow('test1of100_finalpass')+SceneFlow('test1of100_cleanpass')+ETH3DLowRes('subval')+Md05('subval')+Md06('subval')+Md14('subval')+Md21('subval')+MdEval3('subval_full')+Booster('subval_balanced')" --lr 3e-5 --batch_size 6 --epochs 32 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocostereo/main_eth3d/ + +
+ +
+ Main model finetuned on Kitti + + # Download the model + bash stereoflow/download_model.sh crocostereo_finetune_kitti.pth + # Kitti submission + python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.9 + # Training that was used + python -u stereoflow/train.py stereo --crop 352 1216 --criterion "LaplacianLossBounded2()" --dataset "Kitti12('train')+Kitti15('train')" --lr 3e-5 --batch_size 1 --accum_iter 6 --epochs 20 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_kitti/ --save_every 5 +
+ +
+ Main model finetuned on Spring + + # Download the model + bash stereoflow/download_model.sh crocostereo_finetune_spring.pth + # Spring submission + python stereoflow/test.py --model stereoflow_models/crocostereo_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9 + # Training command that was used + python -u stereoflow/train.py stereo --criterion "LaplacianLossBounded2()" --dataset "Spring('train')" --lr 3e-5 --batch_size 6 --epochs 8 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocostereo.pth --output_dir xps/crocostereo/finetune_spring/ +
+ +
+ Smaller models + To train CroCo-Stereo with smaller CroCo pretrained models, simply replace the --pretrained argument. To download the smaller CroCo-Stereo model based on CroCo v2 pretraining with a ViT-Base encoder and a Small decoder, use bash stereoflow/download_model.sh crocostereo_subtrain_vitb_smalldecoder.pth, and for the model with a ViT-Base encoder and a Base decoder, use bash stereoflow/download_model.sh crocostereo_subtrain_vitb_basedecoder.pth. +
+ + +### CroCo-Flow + +##### Main model + +The main training of CroCo-Flow was performed on the FlyingThings, FlyingChairs, MPI-Sintel and TartanAir datasets. +It was used for our submission to the MPI-Sintel benchmark. + +``` +# Download the model +bash stereoflow/download_model.sh crocoflow.pth +# Evaluation +python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --save metrics --tile_overlap 0.9 +# Sintel submission +python stereoflow/test.py --model stereoflow_models/crocoflow.pth --dataset "MPISintel('test_allpass')" --save submission --tile_overlap 0.9 +# Training command that was used, with checkpoint-best.pth +python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "40*MPISintel('subtrain_cleanpass')+40*MPISintel('subtrain_finalpass')+4*FlyingThings('train_allpass')+4*FlyingChairs('train')+TartanAir('train')" --val_dataset "MPISintel('subval_cleanpass')+MPISintel('subval_finalpass')" --lr 2e-5 --batch_size 8 --epochs 240 --img_per_epoch 30000 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --output_dir xps/crocoflow/main/ +``` + +##### Other models + +
+ Main model finetuned on Kitti + + # Download the model + bash stereoflow/download_model.sh crocoflow_finetune_kitti.pth + # Kitti submission + python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_kitti.pth --dataset "Kitti15('test')" --save submission --tile_overlap 0.99 + # Training that was used, with checkpoint-last.pth + python -u stereoflow/train.py flow --crop 352 1216 --criterion "LaplacianLossBounded()" --dataset "Kitti15('train')+Kitti12('train')" --lr 2e-5 --batch_size 1 --accum_iter 8 --epochs 150 --save_every 5 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_kitti/ +
+ +
+ Main model finetuned on Spring + + # Download the model + bash stereoflow/download_model.sh crocoflow_finetune_spring.pth + # Spring submission + python stereoflow/test.py --model stereoflow_models/crocoflow_finetune_spring.pth --dataset "Spring('test')" --save submission --tile_overlap 0.9 + # Training command that was used, with checkpoint-last.pth + python -u stereoflow/train.py flow --criterion "LaplacianLossBounded()" --dataset "Spring('train')" --lr 2e-5 --batch_size 8 --epochs 12 --pretrained pretrained_models/CroCo_V2_ViTLarge_BaseDecoder.pth --start_from stereoflow_models/crocoflow.pth --output_dir xps/crocoflow/finetune_spring/ +
+ +
+ Smaller models + To train CroCo-Flow with smaller CroCo pretrained models, simply replace the --pretrained argument. To download the smaller CroCo-Flow model based on CroCo v2 pretraining with a ViT-Base encoder and a Small decoder, use bash stereoflow/download_model.sh crocoflow_vitb_smalldecoder.pth, and for the model with a ViT-Base encoder and a Base decoder, use bash stereoflow/download_model.sh crocoflow_vitb_basedecoder.pth. +
class StereoAugmentor(object):
    """Random spatial and photometric augmentation of a stereo training sample.

    A sample is ``(img1, img2, disp)``.  The pipeline applied by ``__call__``
    is: random rescale -> random crop -> random vertical flip -> small
    rotation/shift of ``img2`` only -> colour jitter.

    Args:
        crop_size: ``(crop_height, crop_width)`` of the output.
        scale_prob: probability of applying a random rescale.
        scale_xonly: if True only the x axis is rescaled (``scale_y == 1``).
        lhth: threshold on ``min(h, w)``; images below it use the
            ``lminscale``/``lmaxscale`` exponent range, others the
            ``hminscale``/``hmaxscale`` range.
        lminscale, lmaxscale, hminscale, hmaxscale: bounds of the base-2
            exponent from which the scale factor is sampled.
        scale_interp_nearest: resize the disparity with nearest-neighbour
            interpolation instead of bilinear.
        rightjitterprob: probability of perturbing ``img2`` geometrically.
        v_flip_prob: probability of a vertical flip.
        color_aug_asym: allow ``img2`` to receive a colour factor sampled
            independently of the one applied to ``img1``.
        color_choice_prob: probability of applying a single colour transform
            instead of all of them in random order.
    """

    def __init__(
        self,
        crop_size,
        scale_prob=0.5,
        scale_xonly=True,
        lhth=800.0,
        lminscale=0.0,
        lmaxscale=1.0,
        hminscale=-0.2,
        hmaxscale=0.4,
        scale_interp_nearest=True,
        rightjitterprob=0.5,
        v_flip_prob=0.5,
        color_aug_asym=True,
        color_choice_prob=0.5,
    ):
        self.crop_size = crop_size
        self.scale_prob = scale_prob
        self.scale_xonly = scale_xonly
        self.lhth = lhth
        self.lminscale = lminscale
        self.lmaxscale = lmaxscale
        self.hminscale = hminscale
        self.hmaxscale = hmaxscale
        self.scale_interp_nearest = scale_interp_nearest
        self.rightjitterprob = rightjitterprob
        self.v_flip_prob = v_flip_prob
        self.color_aug_asym = color_aug_asym
        self.color_choice_prob = color_choice_prob

    def _rescale(self, img1, img2, disp, scale_x, scale_y):
        """Resize both images and the disparity by ``(scale_x, scale_y)``."""
        disp_interp = (
            cv2.INTER_NEAREST if self.scale_interp_nearest else cv2.INTER_LINEAR
        )
        img1 = cv2.resize(
            img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR
        )
        img2 = cv2.resize(
            img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR
        )
        # disparities are horizontal offsets, so their values scale with x
        disp = (
            cv2.resize(disp, None, fx=scale_x, fy=scale_y, interpolation=disp_interp)
            * scale_x
        )
        return img1, img2, disp

    def _random_scale(self, img1, img2, disp):
        """Randomly rescale the sample, keeping it at least crop-sized + 8 px."""
        ch, cw = self.crop_size
        h, w = img1.shape[:2]
        if self.scale_prob > 0.0 and np.random.rand() < self.scale_prob:
            if min(h, w) < self.lhth:
                min_scale, max_scale = self.lminscale, self.lmaxscale
            else:
                min_scale, max_scale = self.hminscale, self.hmaxscale
            scale_x = 2.0 ** np.random.uniform(min_scale, max_scale)
            scale_x = np.clip(scale_x, (cw + 8) / float(w), None)
            scale_y = 1.0
            # NOTE(review): nesting of the y-clip under this `if` was
            # reconstructed from flattened source - confirm against upstream.
            if not self.scale_xonly:
                scale_y = scale_x
                scale_y = np.clip(scale_y, (ch + 8) / float(h), None)
            return self._rescale(img1, img2, disp, scale_x, scale_y)

        # no random scaling: only upscale when the image is too narrow to crop
        clip_scale = (cw + 8) / float(w)
        if clip_scale > 1.0:
            scale_x = clip_scale
            scale_y = scale_x if not self.scale_xonly else 1.0
            return self._rescale(img1, img2, disp, scale_x, scale_y)
        return img1, img2, disp

    def _random_crop(self, img1, img2, disp):
        """Cut the same random ``crop_size`` window out of all three arrays."""
        h, w = img1.shape[:2]
        ch, cw = self.crop_size
        assert ch <= h and cw <= w, (img1.shape, h, w, ch, cw)
        offset_x = np.random.randint(w - cw + 1)
        offset_y = np.random.randint(h - ch + 1)
        rows = slice(offset_y, offset_y + ch)
        cols = slice(offset_x, offset_x + cw)
        return img1[rows, cols], img2[rows, cols], disp[rows, cols]

    def _random_vflip(self, img1, img2, disp):
        """Flip the whole sample upside down with probability ``v_flip_prob``."""
        if self.v_flip_prob > 0 and np.random.rand() < self.v_flip_prob:
            img1 = np.copy(np.flipud(img1))
            img2 = np.copy(np.flipud(img2))
            disp = np.copy(np.flipud(disp))
        return img1, img2, disp

    def _random_rotate_shift_right(self, img2):
        """Apply a tiny random rotation and vertical shift to ``img2`` only."""
        if self.rightjitterprob > 0.0 and np.random.rand() < self.rightjitterprob:
            angle, pixel = 0.1, 2
            px = np.random.uniform(-pixel, pixel)
            ag = np.random.uniform(-angle, angle)
            # NOTE(review): sampled as (rows-range, cols-range) and passed as
            # the rotation centre unchanged - kept exactly as in the original.
            image_center = (
                np.random.uniform(0, img2.shape[0]),
                np.random.uniform(0, img2.shape[1]),
            )
            rot_mat = cv2.getRotationMatrix2D(image_center, ag, 1.0)
            img2 = cv2.warpAffine(
                img2, rot_mat, img2.shape[1::-1], flags=cv2.INTER_LINEAR
            )
            trans_mat = np.float32([[1, 0, 0], [0, 1, px]])
            img2 = cv2.warpAffine(
                img2, trans_mat, img2.shape[1::-1], flags=cv2.INTER_LINEAR
            )
        return img2

    def _jitter_pair(self, img1, img2, adjust, low, high):
        """Apply ``adjust(img, factor)`` to both views with probability 0.5.

        ``factor`` is drawn from ``uniform(low, high)``.  When
        ``color_aug_asym`` is set, ``img2`` gets an independently re-drawn
        factor half of the time - always from the *same* range as ``img1``.
        """
        if np.random.random() < 0.5:
            factor = np.random.uniform(low, high)
            img1 = adjust(img1, factor)
            if self.color_aug_asym and np.random.random() < 0.5:
                factor = np.random.uniform(low, high)
            img2 = adjust(img2, factor)
        return img1, img2

    def _random_color_contrast(self, img1, img2):
        return self._jitter_pair(img1, img2, FF.adjust_contrast, 0.8, 1.2)

    def _random_color_gamma(self, img1, img2):
        return self._jitter_pair(img1, img2, FF.adjust_gamma, 0.7, 1.5)

    def _random_color_brightness(self, img1, img2):
        return self._jitter_pair(img1, img2, FF.adjust_brightness, 0.5, 2.0)

    def _random_color_hue(self, img1, img2):
        return self._jitter_pair(img1, img2, FF.adjust_hue, -0.1, 0.1)

    def _random_color_saturation(self, img1, img2):
        # Fixed: the asymmetric branch used a negative lower bound for img2,
        # unlike img1 and unlike every other colour transform in this class.
        return self._jitter_pair(img1, img2, FF.adjust_saturation, 0.8, 1.2)

    def _random_color(self, img1, img2):
        """Apply one colour transform, or all of them in random order."""
        trfs = [
            self._random_color_contrast,
            self._random_color_gamma,
            self._random_color_brightness,
            self._random_color_hue,
            self._random_color_saturation,
        ]
        # the colour ops are applied to PIL images
        img1 = Image.fromarray(img1.astype("uint8"))
        img2 = Image.fromarray(img2.astype("uint8"))
        if np.random.random() < self.color_choice_prob:
            # a single transform
            t = random.choice(trfs)
            img1, img2 = t(img1, img2)
        else:
            # every transform, in random order
            random.shuffle(trfs)
            for t in trfs:
                img1, img2 = t(img1, img2)
        img1 = np.array(img1).astype(np.float32)
        img2 = np.array(img2).astype(np.float32)
        return img1, img2

    def __call__(self, img1, img2, disp, dataset_name):
        """Augment one sample; ``dataset_name`` is accepted but not used."""
        img1, img2, disp = self._random_scale(img1, img2, disp)
        img1, img2, disp = self._random_crop(img1, img2, disp)
        img1, img2, disp = self._random_vflip(img1, img2, disp)
        img2 = self._random_rotate_shift_right(img2)
        img1, img2 = self._random_color(img1, img2)
        return img1, img2, disp
+ self.photo_aug = ColorJitter( + brightness=0.4, contrast=0.4, saturation=0.4, hue=0.5 / 3.14 + ) + + self.asymmetric_color_aug_prob = asymmetric_color_aug_prob + + def color_transform(self, img1, img2): + """Photometric augmentation""" + + # asymmetric + if np.random.rand() < self.asymmetric_color_aug_prob: + img1 = np.array(self.photo_aug(Image.fromarray(img1)), dtype=np.uint8) + img2 = np.array(self.photo_aug(Image.fromarray(img2)), dtype=np.uint8) + + # symmetric + else: + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = np.array( + self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8 + ) + img1, img2 = np.split(image_stack, 2, axis=0) + + return img1, img2 + + def _resize_flow(self, flow, scale_x, scale_y, factor=1.0): + if np.all(np.isfinite(flow)): + flow = cv2.resize( + flow, + None, + fx=scale_x / factor, + fy=scale_y / factor, + interpolation=cv2.INTER_LINEAR, + ) + flow = flow * [scale_x, scale_y] + else: # sparse version + fx, fy = scale_x, scale_y + ht, wd = flow.shape[:2] + coords = np.meshgrid(np.arange(wd), np.arange(ht)) + coords = np.stack(coords, axis=-1) + + coords = coords.reshape(-1, 2).astype(np.float32) + flow = flow.reshape(-1, 2).astype(np.float32) + valid = np.isfinite(flow[:, 0]) + + coords0 = coords[valid] + flow0 = flow[valid] + + ht1 = int(round(ht * fy / factor)) + wd1 = int(round(wd * fx / factor)) + + rescale = np.expand_dims(np.array([fx, fy]), axis=0) + coords1 = coords0 * rescale / factor + flow1 = flow0 * rescale + + xx = np.round(coords1[:, 0]).astype(np.int32) + yy = np.round(coords1[:, 1]).astype(np.int32) + + v = (xx > 0) & (xx < wd1) & (yy > 0) & (yy < ht1) + xx = xx[v] + yy = yy[v] + flow1 = flow1[v] + + flow = np.inf * np.ones( + [ht1, wd1, 2], dtype=np.float32 + ) # invalid value every where, before we fill it with the correct ones + flow[yy, xx] = flow1 + return flow + + def spatial_transform(self, img1, img2, flow, dname): + + if np.random.rand() < self.spatial_aug_prob: + # randomly sample 
scale + ht, wd = img1.shape[:2] + clip_min_scale = np.maximum( + (self.crop_size[0] + 8) / float(ht), (self.crop_size[1] + 8) / float(wd) + ) + min_scale, max_scale = self.min_scale, self.max_scale + scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) + scale_x = scale + scale_y = scale + if np.random.rand() < self.stretch_prob: + scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + scale_x = np.clip(scale_x, clip_min_scale, None) + scale_y = np.clip(scale_y, clip_min_scale, None) + # rescale the images + img1 = cv2.resize( + img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR + ) + img2 = cv2.resize( + img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR + ) + flow = self._resize_flow( + flow, scale_x, scale_y, factor=2.0 if dname == "Spring" else 1.0 + ) + elif dname == "Spring": + flow = self._resize_flow(flow, 1.0, 1.0, factor=2.0) + + if self.h_flip_prob > 0.0 and np.random.rand() < self.h_flip_prob: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + + if self.v_flip_prob > 0.0 and np.random.rand() < self.v_flip_prob: # v-flip + img1 = img1[::-1, :] + img2 = img2[::-1, :] + flow = flow[::-1, :] * [1.0, -1.0] + + # In case no cropping + if img1.shape[0] - self.crop_size[0] > 0: + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) + else: + y0 = 0 + if img1.shape[1] - self.crop_size[1] > 0: + x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) + else: + x0 = 0 + + img1 = img1[y0 : y0 + self.crop_size[0], x0 : x0 + self.crop_size[1]] + img2 = img2[y0 : y0 + self.crop_size[0], x0 : x0 + self.crop_size[1]] + flow = flow[y0 : y0 + self.crop_size[0], x0 : x0 + self.crop_size[1]] + + return img1, img2, flow + + def __call__(self, img1, img2, flow, dname): + img1, img2, flow = self.spatial_transform(img1, img2, flow, dname) + img1, img2 = self.color_transform(img1, img2) + img1 
= np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + return img1, img2, flow diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/criterion.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..f041240edb549e32f2eaa1123b07871deb322fd5 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/criterion.py @@ -0,0 +1,351 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# Losses, metrics per batch, metrics per dataset +# -------------------------------------------------------- + +import torch +from torch import nn +import torch.nn.functional as F + + +def _get_gtnorm(gt): + if gt.size(1) == 1: # stereo + return gt + # flow + return torch.sqrt(torch.sum(gt**2, dim=1, keepdims=True)) # Bx1xHxW + + +############ losses without confidence + + +class L1Loss(nn.Module): + + def __init__(self, max_gtnorm=None): + super().__init__() + self.max_gtnorm = max_gtnorm + self.with_conf = False + + def _error(self, gt, predictions): + return torch.abs(gt - predictions) + + def forward(self, predictions, gt, inspect=False): + mask = torch.isfinite(gt) + if self.max_gtnorm is not None: + mask *= _get_gtnorm(gt).expand(-1, gt.size(1), -1, -1) < self.max_gtnorm + if inspect: + return self._error(gt, predictions) + return self._error(gt[mask], predictions[mask]).mean() + + +############## losses with confience +## there are several parametrizations + + +class LaplacianLoss(nn.Module): # used for CroCo-Stereo on ETH3D, d'=exp(d) + + def __init__(self, max_gtnorm=None): + super().__init__() + self.max_gtnorm = max_gtnorm + self.with_conf = True + + def forward(self, predictions, gt, conf): + mask = torch.isfinite(gt) + mask = mask[:, 0, :, :] + if self.max_gtnorm is 
class LaplacianLossBounded(nn.Module):
    """Laplacian loss whose scale is squashed into ``[a, b]`` by a sigmoid.

    Original note: used for CroCo-Flow; in the equation of the paper, a=1/b.
    """

    def __init__(self, max_gtnorm=10000.0, a=0.25, b=4.0):
        super().__init__()
        self.max_gtnorm = max_gtnorm
        self.with_conf = True
        self.a, self.b = a, b

    def forward(self, predictions, gt, conf):
        """Return the mean of ``|gt - pred|_1 / scale + log(scale)`` over valid pixels."""
        # a pixel is valid when the first ground-truth channel is finite
        valid = torch.isfinite(gt)[:, 0, :, :]
        if self.max_gtnorm is not None:
            valid = valid & (_get_gtnorm(gt)[:, 0, :, :] < self.max_gtnorm)
        scale = (self.b - self.a) * torch.sigmoid(conf.squeeze(1)) + self.a
        l1 = torch.abs(gt - predictions).sum(dim=1)
        per_pixel = l1[valid] / scale[valid] + torch.log(scale)[valid]
        return per_pixel.mean()  # + torch.log(2) => which is a constant


class LaplacianLossBounded2(nn.Module):
    """Laplacian loss whose log-scale is squashed into ``(-a, a)``.

    Original note: used for CroCo-Stereo (except for ETH3D); in the equation
    of the paper, a=b.
    """

    def __init__(self, max_gtnorm=None, a=3.0, b=3.0):
        super().__init__()
        self.max_gtnorm = max_gtnorm
        self.with_conf = True
        self.a, self.b = a, b

    def forward(self, predictions, gt, conf):
        """Return the mean of ``|gt - pred|_1 / exp(c) + c`` over valid pixels."""
        valid = torch.isfinite(gt)[:, 0, :, :]
        if self.max_gtnorm is not None:
            valid = valid & (_get_gtnorm(gt)[:, 0, :, :] < self.max_gtnorm)
        log_scale = 2 * self.a * (torch.sigmoid(conf.squeeze(1) / self.b) - 0.5)
        l1 = torch.abs(gt - predictions).sum(dim=1)
        per_pixel = l1[valid] / torch.exp(log_scale[valid]) + log_scale[valid]
        return per_pixel.mean()  # + torch.log(2) => which is a constant
class FlowMetrics(nn.Module):
    """Per-batch optical-flow metrics: L1 error, EPE and bad-pixel rates."""

    def __init__(self):
        super().__init__()
        self.bad_ths = [1, 3, 5]

    def forward(self, predictions, gt):
        """Return a dict of scalar tensors averaged over the batch.

        Pixels whose first ground-truth channel is not finite are ignored.
        """
        B = predictions.size(0)
        mask = torch.isfinite(gt[:, 0, :, :])  # both x and y would be infinite
        flat_mask = mask.view(B, -1)
        Npx = flat_mask.sum(dim=1)

        # Put a finite placeholder at invalid locations so that multiplying
        # by the mask yields 0 there instead of nan.
        gtcopy = gt.clone()
        for channel in (0, 1):
            gtcopy[:, channel, :, :][~mask] = 999999.0

        diff = gtcopy - predictions
        L1error = (diff.abs().sum(dim=1) * mask).view(B, -1)
        L2error = (diff.square().sum(dim=1).sqrt() * mask).view(B, -1)

        metrics = {
            "L1err": torch.mean(L1error.sum(dim=1) / Npx),
            "EPE": torch.mean(L2error.sum(dim=1) / Npx),
        }
        for ths in self.bad_ths:
            n_bad = ((L2error > ths) * flat_mask).sum(dim=1)
            metrics["bad@{:.1f}".format(ths)] = (n_bad / Npx).mean(dim=0) * 100
        return metrics
class StereoDatasetMetrics(nn.Module):
    """Running stereo metrics accumulated batch by batch over a dataset.

    Usage: call ``reset()``, then ``add_batch(...)`` for each batch, then
    ``get_results()``.  Batches without any valid pixel are ignored.
    """

    def __init__(self):
        super().__init__()
        self.bad_ths = [0.5, 1, 2, 3]
        # start from a clean state so get_results() works before any reset()
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.agg_N = 0  # number of pixels so far
        self.agg_L1err = torch.tensor(0.0)  # L1 error so far
        self.agg_Nbad = [0 for _ in self.bad_ths]  # counter of bad pixels
        self._metrics = None

    def add_batch(self, predictions, gt):
        """Fold one batch of ``B x 1 x H x W`` predictions into the running stats.

        If ``gt`` is exactly twice the resolution of ``predictions`` (Spring),
        each prediction is compared with the four ground-truth pixels it
        covers and the smallest error is kept.
        """
        assert predictions.size(1) == 1, predictions.size()
        assert gt.size(1) == 1, gt.size()
        if (
            gt.size(2) == predictions.size(2) * 2
            and gt.size(3) == predictions.size(3) * 2
        ):  # special case for Spring ...
            candidates = [
                torch.sum(torch.abs(gt[:, :, dy::2, dx::2] - predictions), dim=1)
                for dy, dx in ((0, 0), (1, 0), (0, 1), (1, 1))
            ]
            L1err = candidates[0]
            for cand in candidates[1:]:
                L1err = torch.minimum(L1err, cand)
            valid = torch.isfinite(L1err)
        else:
            valid = torch.isfinite(gt[:, 0, :, :])  # invalid disparities are non-finite
            L1err = torch.sum(torch.abs(gt - predictions), dim=1)
        N = valid.sum()
        if int(N) == 0:
            # Nothing to aggregate: the mean over an empty selection is nan
            # and would poison the running average for all later batches.
            return
        self._metrics = None  # cached results are stale once new data arrives
        Nnew = self.agg_N + N
        self.agg_L1err = (
            float(self.agg_N) / Nnew * self.agg_L1err
            + L1err[valid].mean().cpu() * float(N) / Nnew
        )
        self.agg_N = Nnew
        for i, th in enumerate(self.bad_ths):
            self.agg_Nbad[i] += (L1err[valid] > th).sum().cpu()

    def _compute_metrics(self):
        """Build the result dict once; values are nan if no pixel was seen."""
        if self._metrics is not None:
            return
        out = {}
        has_data = float(self.agg_N) > 0
        out["L1err"] = self.agg_L1err.item() if has_data else float("nan")
        for i, th in enumerate(self.bad_ths):
            key = "bad@{:.1f}".format(th)
            if has_data:
                out[key] = (float(self.agg_Nbad[i]) / self.agg_N).item() * 100.0
            else:
                out[key] = float("nan")
        self._metrics = out

    def get_results(self):
        """Return the aggregated metrics as a dict of Python floats."""
        self._compute_metrics()  # to avoid recompute them multiple times
        return self._metrics
+ self.agg_N = 0 # number of pixels so far + self.agg_L1err = torch.tensor(0.0) # L1 error so far + self.agg_L2err = torch.tensor(0.0) # L2 (=EPE) error so far + self.agg_Nbad = [0 for _ in self.bad_ths] # counter of bad pixels + self.agg_EPEspeed = [ + torch.tensor(0.0) for _ in self.speed_ths + ] # EPE per speed bin so far + self.agg_Nspeed = [0 for _ in self.speed_ths] # N pixels per speed bin so far + self._metrics = None + self.pairname_results = {} + + def add_batch(self, predictions, gt): + assert predictions.size(1) == 2, predictions.size() + assert gt.size(1) == 2, gt.size() + if ( + gt.size(2) == predictions.size(2) * 2 + and gt.size(3) == predictions.size(3) * 2 + ): # special case for Spring ... + L1err = torch.minimum( + torch.minimum( + torch.minimum( + torch.sum(torch.abs(gt[:, :, 0::2, 0::2] - predictions), dim=1), + torch.sum(torch.abs(gt[:, :, 1::2, 0::2] - predictions), dim=1), + ), + torch.sum(torch.abs(gt[:, :, 0::2, 1::2] - predictions), dim=1), + ), + torch.sum(torch.abs(gt[:, :, 1::2, 1::2] - predictions), dim=1), + ) + L2err = torch.minimum( + torch.minimum( + torch.minimum( + torch.sqrt( + torch.sum( + torch.square(gt[:, :, 0::2, 0::2] - predictions), dim=1 + ) + ), + torch.sqrt( + torch.sum( + torch.square(gt[:, :, 1::2, 0::2] - predictions), dim=1 + ) + ), + ), + torch.sqrt( + torch.sum( + torch.square(gt[:, :, 0::2, 1::2] - predictions), dim=1 + ) + ), + ), + torch.sqrt( + torch.sum(torch.square(gt[:, :, 1::2, 1::2] - predictions), dim=1) + ), + ) + valid = torch.isfinite(L1err) + gtspeed = ( + torch.sqrt(torch.sum(torch.square(gt[:, :, 0::2, 0::2]), dim=1)) + + torch.sqrt(torch.sum(torch.square(gt[:, :, 0::2, 1::2]), dim=1)) + + torch.sqrt(torch.sum(torch.square(gt[:, :, 1::2, 0::2]), dim=1)) + + torch.sqrt(torch.sum(torch.square(gt[:, :, 1::2, 1::2]), dim=1)) + ) / 4.0 # let's just average them + else: + valid = torch.isfinite(gt[:, 0, :, :]) # both x and y would be infinite + L1err = torch.sum(torch.abs(gt - predictions), dim=1) + 
L2err = torch.sqrt(torch.sum(torch.square(gt - predictions), dim=1)) + gtspeed = torch.sqrt(torch.sum(torch.square(gt), dim=1)) + N = valid.sum() + Nnew = self.agg_N + N + self.agg_L1err = ( + float(self.agg_N) / Nnew * self.agg_L1err + + L1err[valid].mean().cpu() * float(N) / Nnew + ) + self.agg_L2err = ( + float(self.agg_N) / Nnew * self.agg_L2err + + L2err[valid].mean().cpu() * float(N) / Nnew + ) + self.agg_N = Nnew + for i, th in enumerate(self.bad_ths): + self.agg_Nbad[i] += (L2err[valid] > th).sum().cpu() + for i, (th1, th2) in enumerate(self.speed_ths): + vv = (gtspeed[valid] >= th1) * (gtspeed[valid] < th2) + iNspeed = vv.sum() + if iNspeed == 0: + continue + iNnew = self.agg_Nspeed[i] + iNspeed + self.agg_EPEspeed[i] = ( + float(self.agg_Nspeed[i]) / iNnew * self.agg_EPEspeed[i] + + float(iNspeed) / iNnew * L2err[valid][vv].mean().cpu() + ) + self.agg_Nspeed[i] = iNnew + + def _compute_metrics(self): + if self._metrics is not None: + return + out = {} + out["L1err"] = self.agg_L1err.item() + out["EPE"] = self.agg_L2err.item() + for i, th in enumerate(self.bad_ths): + out["bad@{:.1f}".format(th)] = ( + float(self.agg_Nbad[i]) / self.agg_N + ).item() * 100.0 + for i, (th1, th2) in enumerate(self.speed_ths): + out["s{:d}{:s}".format(th1, "-" + str(th2) if th2 < torch.inf else "+")] = ( + self.agg_EPEspeed[i].item() + ) + self._metrics = out + + def get_results(self): + self._compute_metrics() # to avoid recompute them multiple times + return self._metrics diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/datasets_flow.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/datasets_flow.py new file mode 100644 index 0000000000000000000000000000000000000000..d5b1bc603b97a18e1245ec1756b74a9424d53ead --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/datasets_flow.py @@ -0,0 +1,936 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. 
class FlowDataset(data.Dataset):
    """
    Base class for optical-flow datasets made of (img1, img2, flow) pairs.

    Subclasses must implement:
      - ``_prepare_data()``: set ``self.name``, ``self.root`` (via ``_set_root``),
        the ``pairname_to_img1name`` / ``pairname_to_img2name`` /
        ``pairname_to_flowname`` callables and ``self.load_flow``;
      - ``_build_cache()``: return a dict mapping each split name to its list of pairnames.
    """

    def __init__(self, split, augmentor=False, crop_size=None, totensor=True):
        self.split = split
        # a crop size only makes sense together with an augmentor
        if not augmentor:
            assert crop_size is None
        if crop_size is not None:
            assert augmentor
        self.crop_size = crop_size
        self.augmentor_str = augmentor
        self.augmentor = FlowAugmentor(crop_size) if augmentor else None
        self.totensor = totensor
        self.rmul = 1  # keep track of rmul
        self.has_constant_resolution = True  # whether the dataset has constant resolution or not (=> don't use batch_size>1 at test time)
        self._prepare_data()
        self._load_or_build_cache()

    def _prepare_data(self):
        """
        Hook called by ``__init__``; to be defined for each dataset.
        """
        # Defined on the base class so that a subclass forgetting to override it
        # fails with NotImplementedError rather than an AttributeError.
        raise NotImplementedError

    def prepare_data(self):
        """
        to be defined for each dataset
        (kept for backward compatibility; ``__init__`` calls ``_prepare_data``)
        """
        raise NotImplementedError

    def __len__(self):
        return len(
            self.pairnames
        )  # each pairname is typically of the form (str, int1, int2)

    def __getitem__(self, index):
        """Return ``(img1, img2, flow, pairname)`` for the pair at ``index``."""
        pairname = self.pairnames[index]

        # get filenames
        img1name = self.pairname_to_img1name(pairname)
        img2name = self.pairname_to_img2name(pairname)
        flowname = (
            self.pairname_to_flowname(pairname)
            if self.pairname_to_flowname is not None
            else None
        )

        # load images and flow (flow is None when no ground truth is available)
        img1 = _read_img(img1name)
        img2 = _read_img(img2name)
        flow = self.load_flow(flowname) if flowname is not None else None

        # apply augmentations
        if self.augmentor is not None:
            img1, img2, flow = self.augmentor(img1, img2, flow, self.name)

        if self.totensor:
            img1 = img_to_tensor(img1)
            img2 = img_to_tensor(img2)
            if flow is not None:
                flow = flow_to_tensor(flow)
            else:
                flow = torch.tensor(
                    []
                )  # to allow dataloader batching with default collate_fn
            pairname = str(
                pairname
            )  # transform potential tuple to str to be able to batch it

        return img1, img2, flow, pairname

    def __rmul__(self, v):
        """Repeat the dataset ``v`` times in place (``v * dataset``) and return it."""
        self.rmul *= v
        self.pairnames = v * self.pairnames
        return self

    def __str__(self):
        return f"{self.__class__.__name__}_{self.split}"

    def __repr__(self):
        s = f"{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})"
        if self.rmul == 1:
            s += f"\n\tnum pairs: {len(self.pairnames)}"
        else:
            s += f"\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})"
        return s

    def _set_root(self):
        """Resolve ``self.root`` from ``dataset_to_root`` and check that it exists."""
        self.root = dataset_to_root[self.name]
        assert os.path.isdir(
            self.root
        ), f"could not find root directory for dataset {self.name}: {self.root}"

    def _load_or_build_cache(self):
        """Load the pairnames of ``self.split`` from the pickle cache, building it if absent."""
        cache_file = osp.join(cache_dir, self.name + ".pkl")
        if osp.isfile(cache_file):
            with open(cache_file, "rb") as fid:
                self.pairnames = pickle.load(fid)[self.split]
        else:
            tosave = self._build_cache()
            os.makedirs(cache_dir, exist_ok=True)
            with open(cache_file, "wb") as fid:
                pickle.dump(tosave, fid)
            self.pairnames = tosave[self.split]
class FlyingChairsDataset(FlowDataset):
    """FlyingChairs optical-flow dataset; supports the "train" and "val" splits."""

    def _prepare_data(self):
        self.name = "FlyingChairs"
        self._set_root()
        assert self.split in ["train", "val"]
        # a pairname is the zero-padded 5-digit sample id; all files live under <root>/data
        self.pairname_to_img1name = lambda pairname: osp.join(
            self.root, "data", pairname + "_img1.ppm"
        )
        self.pairname_to_img2name = lambda pairname: osp.join(
            self.root, "data", pairname + "_img2.ppm"
        )
        self.pairname_to_flowname = lambda pairname: osp.join(
            self.root, "data", pairname + "_flow.flo"
        )
        self.pairname_to_str = lambda pairname: pairname
        self.load_flow = _read_flo_file

    def _build_cache(self):
        """Split sample ids into train/val according to ``chairs_split.txt`` (1 = train, 2 = val)."""
        split_file = osp.join(self.root, "chairs_split.txt")
        split_list = np.loadtxt(split_file, dtype=np.int32)
        # sample ids are 1-based, hence the +1 on the 0-based row indices
        trainpairs = ["{:05d}".format(i) for i in np.where(split_list == 1)[0] + 1]
        valpairs = ["{:05d}".format(i) for i in np.where(split_list == 2)[0] + 1]
        assert (
            len(trainpairs) == 22232 and len(valpairs) == 640
        ), "incorrect parsing of pairs in FlyingChairs"
        tosave = {"train": trainpairs, "val": valpairs}
        return tosave
1) for a, i in pp] + pairs += [(a.replace("into_future", "into_past"), i + 1, i) for a, i in pp] + assert ( + len(pairs) == {"train": 40302, "test": 7866}[set_] + ), "incorrect parsing of pairs Flying Things" + for cam in ["left", "right"]: + camstr = "" if cam == "left" else f"_{cam}cam" + for pass_ in ["final", "clean"]: + tosave[f"{set_}_{pass_}pass{camstr}"] = [ + (a.replace("left", cam), i, j, pass_) for a, i, j in pairs + ] + tosave[f"{set_}_allpass{camstr}"] = ( + tosave[f"{set_}_cleanpass{camstr}"] + + tosave[f"{set_}_finalpass{camstr}"] + ) + # test1024: this is the same split as unimatch 'validation' split + # see https://github.com/autonomousvision/unimatch/blob/master/dataloader/flow/datasets.py#L229 + test1024_nsamples = 1024 + alltest_nsamples = len(tosave["test_cleanpass"]) # 7866 + stride = alltest_nsamples // test1024_nsamples + remove = alltest_nsamples % test1024_nsamples + for cam in ["left", "right"]: + camstr = "" if cam == "left" else f"_{cam}cam" + for pass_ in ["final", "clean"]: + tosave[f"test1024_{pass_}pass{camstr}"] = sorted( + tosave[f"test_{pass_}pass{camstr}"] + )[:-remove][ + ::stride + ] # warning, it was not sorted before + assert ( + len(tosave["test1024_cleanpass"]) == 1024 + ), "incorrect parsing of pairs in Flying Things" + tosave[f"test1024_allpass{camstr}"] = ( + tosave[f"test1024_cleanpass{camstr}"] + + tosave[f"test1024_finalpass{camstr}"] + ) + return tosave + + +class MPISintelDataset(FlowDataset): + + def _prepare_data(self): + self.name = "MPISintel" + self._set_root() + assert self.split in [ + s + "_" + p + for s in ["train", "test", "subval", "subtrain"] + for p in ["cleanpass", "finalpass", "allpass"] + ] + self.pairname_to_img1name = lambda pairname: osp.join( + self.root, pairname[0], "frame_{:04d}.png".format(pairname[1]) + ) + self.pairname_to_img2name = lambda pairname: osp.join( + self.root, pairname[0], "frame_{:04d}.png".format(pairname[1] + 1) + ) + self.pairname_to_flowname = lambda pairname: ( + None + 
if pairname[0].startswith("test/") + else osp.join( + self.root, + pairname[0].replace("/clean/", "/flow/").replace("/final/", "/flow/"), + "frame_{:04d}.flo".format(pairname[1]), + ) + ) + self.pairname_to_str = lambda pairname: osp.join( + pairname[0], "frame_{:04d}".format(pairname[1]) + ) + self.load_flow = _read_flo_file + + def _build_cache(self): + trainseqs = sorted(os.listdir(self.root + "training/clean")) + trainpairs = [ + (osp.join("training/clean", s), i) + for s in trainseqs + for i in range(1, len(os.listdir(self.root + "training/clean/" + s))) + ] + subvalseqs = ["temple_2", "temple_3"] + subtrainseqs = [s for s in trainseqs if s not in subvalseqs] + subvalpairs = [(p, i) for p, i in trainpairs if any(s in p for s in subvalseqs)] + subtrainpairs = [ + (p, i) for p, i in trainpairs if any(s in p for s in subtrainseqs) + ] + testseqs = sorted(os.listdir(self.root + "test/clean")) + testpairs = [ + (osp.join("test/clean", s), i) + for s in testseqs + for i in range(1, len(os.listdir(self.root + "test/clean/" + s))) + ] + assert ( + len(trainpairs) == 1041 + and len(testpairs) == 552 + and len(subvalpairs) == 98 + and len(subtrainpairs) == 943 + ), "incorrect parsing of pairs in MPI-Sintel" + tosave = {} + tosave["train_cleanpass"] = trainpairs + tosave["test_cleanpass"] = testpairs + tosave["subval_cleanpass"] = subvalpairs + tosave["subtrain_cleanpass"] = subtrainpairs + for t in ["train", "test", "subval", "subtrain"]: + tosave[t + "_finalpass"] = [ + (p.replace("/clean/", "/final/"), i) + for p, i in tosave[t + "_cleanpass"] + ] + tosave[t + "_allpass"] = tosave[t + "_cleanpass"] + tosave[t + "_finalpass"] + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, _time): + assert prediction.shape[2] == 2 + outfile = os.path.join( + outdir, "submission", self.pairname_to_str(pairname) + ".flo" + ) + os.makedirs(os.path.dirname(outfile), exist_ok=True) + writeFlowFile(prediction, outfile) + + def finalize_submission(self, 
class SpringDataset(FlowDataset):
    """
    Spring optical-flow dataset ("train", "test", "subtrain" and "subval" splits).

    A pairname is a tuple ``(split dir, sequence, "FW"|"BW", "left"|"right", frame index)``.
    """

    def _prepare_data(self):
        self.name = "Spring"
        self._set_root()
        assert self.split in ["train", "test", "subtrain", "subval"]
        self.pairname_to_img1name = lambda pairname: osp.join(
            self.root,
            pairname[0],
            pairname[1],
            "frame_" + pairname[3],
            "frame_{:s}_{:04d}.png".format(pairname[3], pairname[4]),
        )
        # the second image is the next frame for forward flow, the previous one for backward flow
        self.pairname_to_img2name = lambda pairname: osp.join(
            self.root,
            pairname[0],
            pairname[1],
            "frame_" + pairname[3],
            "frame_{:s}_{:04d}.png".format(
                pairname[3], pairname[4] + (1 if pairname[2] == "FW" else -1)
            ),
        )
        # no flow file is looked up for pairs of the test directory
        self.pairname_to_flowname = lambda pairname: (
            None
            if pairname[0] == "test"
            else osp.join(
                self.root,
                pairname[0],
                pairname[1],
                f"flow_{pairname[2]}_{pairname[3]}",
                f"flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}.flo5",
            )
        )
        self.pairname_to_str = lambda pairname: osp.join(
            pairname[0],
            pairname[1],
            f"flow_{pairname[2]}_{pairname[3]}",
            f"flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}",
        )
        self.load_flow = _read_hdf5_flow

    def _build_cache(self):
        """List all pairs on disk; sequence "0041" of the train set is held out as "subval"."""
        # train: one pair per existing flow file
        trainseqs = sorted(os.listdir(osp.join(self.root, "train")))
        trainpairs = []
        for leftright in ["left", "right"]:
            for fwbw in ["FW", "BW"]:
                trainpairs += [
                    (
                        "train",
                        s,
                        fwbw,
                        leftright,
                        int(f[len(f"flow_{fwbw}_{leftright}_") : -len(".flo5")]),
                    )
                    for s in trainseqs
                    for f in sorted(
                        os.listdir(
                            osp.join(self.root, "train", s, f"flow_{fwbw}_{leftright}")
                        )
                    )
                ]
        # test: pairs are derived from the frame files
        testseqs = sorted(os.listdir(osp.join(self.root, "test")))
        testpairs = []
        for leftright in ["left", "right"]:
            # forward: every frame but the last one
            testpairs += [
                (
                    "test",
                    s,
                    "FW",
                    leftright,
                    int(f[len(f"frame_{leftright}_") : -len(".png")]),
                )
                for s in testseqs
                for f in sorted(
                    os.listdir(osp.join(self.root, "test", s, f"frame_{leftright}"))
                )[:-1]
            ]
            # backward: every frame but the first one (hence the +1)
            testpairs += [
                (
                    "test",
                    s,
                    "BW",
                    leftright,
                    int(f[len(f"frame_{leftright}_") : -len(".png")]) + 1,
                )
                for s in testseqs
                for f in sorted(
                    os.listdir(osp.join(self.root, "test", s, f"frame_{leftright}"))
                )[:-1]
            ]
        # subtrain / subval
        subtrainpairs = [p for p in trainpairs if p[1] != "0041"]
        subvalpairs = [p for p in trainpairs if p[1] == "0041"]
        assert (
            len(trainpairs) == 19852
            and len(testpairs) == 3960
            and len(subtrainpairs) == 19472
            and len(subvalpairs) == 380
        ), "incorrect parsing of pairs in Spring"
        tosave = {
            "train": trainpairs,
            "test": testpairs,
            "subtrain": subtrainpairs,
            "subval": subvalpairs,
        }
        return tosave

    def submission_save_pairname(self, pairname, prediction, outdir, time):
        """Write one (H, W, 2) float32 prediction as a ``.flo5`` file under ``outdir`` (``time`` is unused)."""
        assert prediction.ndim == 3
        assert prediction.shape[2] == 2
        assert prediction.dtype == np.float32
        outfile = osp.join(
            outdir,
            pairname[0],
            pairname[1],
            f"flow_{pairname[2]}_{pairname[3]}",
            f"flow_{pairname[2]}_{pairname[3]}_{pairname[4]:04d}.flo5",
        )
        os.makedirs(os.path.dirname(outfile), exist_ok=True)
        writeFlo5File(prediction, outfile)

    def finalize_submission(self, outdir):
        """Run the ``flow_subsampling`` executable on ``<outdir>/test``, or print how to do it manually."""
        assert self.split == "test"
        # The path was previously a plain string (missing f-prefix), so the
        # executable was never found. It is made absolute because the command
        # below changes directory before invoking it.
        exe = osp.abspath(osp.join(self.root, "flow_subsampling"))
        if os.path.isfile(exe):
            cmd = f'cd "{outdir}/test"; "{exe}" .'
            print(cmd)
            os.system(cmd)
            print(f"Done. Submission file at {outdir}/test/flow_submission.hdf5")
        else:
            print("Could not find flow_subsampling executable for submission.")
            print("Please download it and run:")
            print(f'cd "{outdir}/test"; "{exe}" .')
Submission file at {outdir}/kitti12_flow_results.zip") + + +class Kitti15Dataset(FlowDataset): + + def _prepare_data(self): + self.name = "Kitti15" + self._set_root() + assert self.split in ["train", "subtrain", "subval", "test"] + self.pairname_to_img1name = lambda pairname: osp.join( + self.root, pairname + "_10.png" + ) + self.pairname_to_img2name = lambda pairname: osp.join( + self.root, pairname + "_11.png" + ) + self.pairname_to_flowname = ( + None + if self.split == "test" + else lambda pairname: osp.join( + self.root, pairname.replace("/image_2/", "/flow_occ/") + "_10.png" + ) + ) + self.pairname_to_str = lambda pairname: pairname.replace("/image_2/", "/") + self.load_flow = _read_kitti_flow + + def _build_cache(self): + trainseqs = ["training/image_2/%06d" % (i) for i in range(200)] + subtrainseqs = trainseqs[:-10] + subvalseqs = trainseqs[-10:] + testseqs = ["testing/image_2/%06d" % (i) for i in range(200)] + assert ( + len(trainseqs) == 200 + and len(subtrainseqs) == 190 + and len(subvalseqs) == 10 + and len(testseqs) == 200 + ), "incorrect parsing of pairs in Kitti15" + tosave = { + "train": trainseqs, + "subtrain": subtrainseqs, + "subval": subvalseqs, + "test": testseqs, + } + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim == 3 + assert prediction.shape[2] == 2 + outfile = os.path.join(outdir, "flow", pairname.split("/")[-1] + "_10.png") + os.makedirs(os.path.dirname(outfile), exist_ok=True) + writeFlowKitti(outfile, prediction) + + def finalize_submission(self, outdir): + assert self.split == "test" + cmd = f'cd {outdir}/; zip -r "kitti15_flow_results.zip" flow' + print(cmd) + os.system(cmd) + print(f"Done. 
def readFlowFile(filename):
    """
    readFlowFile(filename) reads a '.flo' flow file into a 2-band np.array.
    if filename does not exist, an IOError is raised.
    if filename does not finish by '.flo' or the tag, the width, the height or the file's size is illegal, an Exception is raised.
    ---- PARAMETERS ----
        filename: string containing the name of the file to read a flow from
    ---- OUTPUTS ----
        a np.array of dimension (height x width x 2) containing the flow of type 'float32'
    """

    # check filename
    if not filename.endswith(".flo"):
        raise Exception(
            "readFlowFile({:s}): filename must finish with '.flo'".format(filename)
        )

    # open the file and read it
    with open(filename, "rb") as f:
        # The header is 12 bytes: float tag, int width, int height. It is parsed
        # as little-endian explicitly so the result does not depend on the host
        # byte order.
        header = f.read(12)
        # check tag
        if len(header) >= 4:
            tag = struct.unpack("<f", header[:4])[0]
            if tag != TAG_FLOAT:
                raise Exception(
                    "flow_utils.readFlowFile({:s}): wrong tag".format(filename)
                )
        # a truncated header is reported like any other size problem
        # (instead of leaking a struct.error)
        if len(header) != 12:
            raise Exception(
                "flow_utils.readFlowFile({:s}): illegal size of the file".format(
                    filename
                )
            )
        # read dimension
        w, h = struct.unpack("<ii", header[4:])
        if w < MIN_WIDTH or w > MAX_WIDTH:
            raise Exception(
                "flow_utils.readFlowFile({:s}): illegal width {:d}".format(filename, w)
            )
        if h < MIN_HEIGHT or h > MAX_HEIGHT:
            raise Exception(
                "flow_utils.readFlowFile({:s}): illegal height {:d}".format(filename, h)
            )
        # the remainder of the file is the interleaved (u, v) values, row by row
        flow = np.fromfile(f, "<f4").astype(np.float32, copy=False)
        if not flow.shape == (h * w * 2,):
            raise Exception(
                "flow_utils.readFlowFile({:s}): illegal size of the file".format(
                    filename
                )
            )
        flow.shape = (h, w, 2)
        return flow
def _read_hdf5_flow(filename):
    """
    Read the "flow" dataset of an HDF5 file and return it as a float32 np.array.

    NaN entries (invalid values) are replaced by +inf.
    """
    # open read-only in a context manager so that the HDF5 handle is always
    # released; the data is copied into memory before the file is closed
    with h5py.File(filename, "r") as f:
        flow = np.asarray(f["flow"])
    flow[np.isnan(flow)] = np.inf  # make invalid values as +inf
    return flow.astype(np.float32)
def flowToColor(flow, maxflow=None, maxmaxflow=None, saturate=False):
    """
    flow_utils.flowToColor(flow): return a color code flow field, normalized based on the maximum l2-norm of the flow
    flow_utils.flowToColor(flow,maxflow): return a color code flow field, normalized by maxflow
    The input array is left untouched.
    ---- PARAMETERS ----
        flow: flow to display of shape (height x width x 2)
        maxflow (default:None): if given, normalize the flow by its value, otherwise by the maximum flow norm
        maxmaxflow (default:None): if given, upper bound on the normalization value, i.e. min(maxmaxflow, maxflow) is used
        saturate (default:False): forwarded to _computeColor
    ---- OUTPUT ----
        an np.array of shape (height x width x 3) of type uint8 containing a color code of the flow
    """
    _, _, n = flow.shape
    # check size of flow
    assert n == 2, "flow_utils.flowToColor(flow): flow must have 2 bands"
    # fix unknown flow (e.g. +inf markers) on a copy, so that the caller's
    # array keeps its invalid-pixel markers
    unknown_idx = np.max(np.abs(flow), 2) > UNKNOWN_THRESH
    flow = flow.copy()
    flow[unknown_idx] = 0.0
    # compute max flow if needed
    if maxflow is None:
        maxflow = flowMaxNorm(flow)
    if maxmaxflow is not None:
        maxflow = min(maxmaxflow, maxflow)
    # normalize flow
    eps = np.spacing(1)  # minimum positive float value to avoid division by 0
    # compute the flow
    img = _computeColor(flow / (maxflow + eps), saturate=saturate)
    # put black pixels in unknown location
    img[unknown_idx] = 0
    return img
def _computeColor(flow, saturate=True):
    """
    flow_utils._computeColor(flow): compute color codes for the flow field flow

    The hue encodes the flow direction (interpolated on a color wheel) and the
    color gets further from white as the magnitude grows up to 1; pixels whose
    magnitude exceeds 1 are darkened. NaN pixels are rendered black.
    Note that NaN entries of the given array are set to 0 in place.

    ---- PARAMETERS ----
        flow: np.array of dimension (height x width x 2) containing the flow to display
        saturate: when False, magnitudes are clipped to 1 before coloring
    ---- OUTPUTS ----
        an np.array of dimension (height x width x 3) containing the color conversion of the flow
    """
    # set nan to 0 (in place); the mask is reused at the end to blacken those pixels
    nanidx = np.isnan(flow[:, :, 0])
    flow[nanidx] = 0.0

    # colorwheel: one row per hue, built segment by segment; the segment
    # lengths are the module-level constants RY, YG, GC, CB, BM, MR
    ncols = RY + YG + GC + CB + BM + MR
    nchans = 3
    colorwheel = np.zeros((ncols, nchans), "uint8")
    col = 0  # running row offset of the current segment
    # RY
    colorwheel[:RY, 0] = 255
    colorwheel[:RY, 1] = [(255 * i) // RY for i in range(RY)]
    col += RY
    # YG
    colorwheel[col : col + YG, 0] = [255 - (255 * i) // YG for i in range(YG)]
    colorwheel[col : col + YG, 1] = 255
    col += YG
    # GC
    colorwheel[col : col + GC, 1] = 255
    colorwheel[col : col + GC, 2] = [(255 * i) // GC for i in range(GC)]
    col += GC
    # CB
    colorwheel[col : col + CB, 1] = [255 - (255 * i) // CB for i in range(CB)]
    colorwheel[col : col + CB, 2] = 255
    col += CB
    # BM
    colorwheel[col : col + BM, 0] = [(255 * i) // BM for i in range(BM)]
    colorwheel[col : col + BM, 2] = 255
    col += BM
    # MR
    colorwheel[col : col + MR, 0] = 255
    colorwheel[col : col + MR, 2] = [255 - (255 * i) // MR for i in range(MR)]

    # compute utility variables
    rad = np.sqrt(np.sum(np.square(flow), 2))  # magnitude
    a = np.arctan2(-flow[:, :, 1], -flow[:, :, 0]) / np.pi  # angle
    fk = (a + 1) / 2 * (ncols - 1)  # map [-1,1] to [0,ncols-1]
    k0 = np.floor(fk).astype("int")  # lower neighbouring row of the wheel
    k1 = k0 + 1  # upper neighbouring row, wrapping around to row 0
    k1[k1 == ncols] = 0
    f = fk - k0  # interpolation weight between the two rows

    if not saturate:
        rad = np.minimum(rad, 1)

    # compute the image, one channel at a time
    # (from here on `col` is reused to hold the interpolated channel values)
    img = np.zeros((flow.shape[0], flow.shape[1], nchans), "uint8")
    for i in range(nchans):
        tmp = colorwheel[:, i].astype("float")
        col0 = tmp[k0] / 255
        col1 = tmp[k1] / 255
        col = (1 - f) * col0 + f * col1
        idx = rad <= 1
        col[idx] = 1 - rad[idx] * (1 - col[idx])  # increase saturation with radius
        col[~idx] *= 0.75  # out of range
        # the (1 - nanidx) factor zeroes the pixels that were NaN in the input
        img[:, :, i] = (255 * col * (1 - nanidx.astype("float"))).astype("uint8")

    return img
class StereoDataset(data.Dataset):
    """
    Base class for stereo datasets made of (left image, right image, left disparity) pairs.

    Subclasses must implement:
      - ``_prepare_data()``: set ``self.name``, ``self.root`` (via ``_set_root``),
        the ``pairname_to_Limgname`` / ``pairname_to_Rimgname`` /
        ``pairname_to_Ldispname`` callables and ``self.load_disparity``;
      - ``_build_cache()``: return a dict mapping each split name to its list of pairnames.
    """

    def __init__(self, split, augmentor=False, crop_size=None, totensor=True):
        self.split = split
        # a crop size only makes sense together with an augmentor
        if not augmentor:
            assert crop_size is None
        if crop_size:
            assert augmentor
        self.crop_size = crop_size
        self.augmentor_str = augmentor
        self.augmentor = StereoAugmentor(crop_size) if augmentor else None
        self.totensor = totensor
        self.rmul = 1  # keep track of rmul
        self.has_constant_resolution = True  # whether the dataset has constant resolution or not (=> don't use batch_size>1 at test time)
        self._prepare_data()
        self._load_or_build_cache()

    def _prepare_data(self):
        """
        Hook called by ``__init__``; to be defined for each dataset.
        """
        # Defined on the base class so that a subclass forgetting to override it
        # fails with NotImplementedError rather than an AttributeError.
        raise NotImplementedError

    def prepare_data(self):
        """
        to be defined for each dataset
        (kept for backward compatibility; ``__init__`` calls ``_prepare_data``)
        """
        raise NotImplementedError

    def __len__(self):
        return len(self.pairnames)

    def __getitem__(self, index):
        """Return ``(Limg, Rimg, disp, str(pairname))`` for the pair at ``index``."""
        pairname = self.pairnames[index]

        # get filenames
        Limgname = self.pairname_to_Limgname(pairname)
        Rimgname = self.pairname_to_Rimgname(pairname)
        Ldispname = (
            self.pairname_to_Ldispname(pairname)
            if self.pairname_to_Ldispname is not None
            else None
        )

        # load images and disparities (disp is None when no ground truth is available)
        Limg = _read_img(Limgname)
        Rimg = _read_img(Rimgname)
        disp = self.load_disparity(Ldispname) if Ldispname is not None else None

        # sanity check: disparities must be strictly positive (not enforced for Spring)
        if disp is not None:
            assert np.all(disp > 0) or self.name == "Spring", (
                self.name,
                pairname,
                Ldispname,
            )

        # apply augmentations
        if self.augmentor is not None:
            Limg, Rimg, disp = self.augmentor(Limg, Rimg, disp, self.name)

        if self.totensor:
            Limg = img_to_tensor(Limg)
            Rimg = img_to_tensor(Rimg)
            if disp is None:
                disp = torch.tensor(
                    []
                )  # to allow dataloader batching with default collate_fn
            else:
                disp = disp_to_tensor(disp)

        return Limg, Rimg, disp, str(pairname)

    def __rmul__(self, v):
        """Repeat the dataset ``v`` times in place (``v * dataset``) and return it."""
        self.rmul *= v
        self.pairnames = v * self.pairnames
        return self

    def __str__(self):
        return f"{self.__class__.__name__}_{self.split}"

    def __repr__(self):
        s = f"{self.__class__.__name__}(split={self.split}, augmentor={self.augmentor_str}, crop_size={str(self.crop_size)}, totensor={self.totensor})"
        if self.rmul == 1:
            s += f"\n\tnum pairs: {len(self.pairnames)}"
        else:
            s += f"\n\tnum pairs: {len(self.pairnames)} ({len(self.pairnames)//self.rmul}x{self.rmul})"
        return s

    def _set_root(self):
        """Resolve ``self.root`` from ``dataset_to_root`` and check that it exists."""
        self.root = dataset_to_root[self.name]
        assert os.path.isdir(
            self.root
        ), f"could not find root directory for dataset {self.name}: {self.root}"

    def _load_or_build_cache(self):
        """Load the pairnames of ``self.split`` from the pickle cache, building it if absent."""
        cache_file = osp.join(cache_dir, self.name + ".pkl")
        if osp.isfile(cache_file):
            with open(cache_file, "rb") as fid:
                self.pairnames = pickle.load(fid)[self.split]
        else:
            tosave = self._build_cache()
            os.makedirs(cache_dir, exist_ok=True)
            with open(cache_file, "wb") as fid:
                pickle.dump(tosave, fid)
            self.pairnames = tosave[self.split]
pairname[:-4] + self.load_disparity = _read_sceneflow_disp + + def _build_cache(self): + trainpairs = [] + # driving + pairs = sorted(glob(self.root + "Driving/frames_finalpass/*/*/*/left/*.png")) + pairs = list(map(lambda x: x[len(self.root) :], pairs)) + assert len(pairs) == 4400, "incorrect parsing of pairs in SceneFlow" + trainpairs += pairs + # monkaa + pairs = sorted(glob(self.root + "Monkaa/frames_finalpass/*/left/*.png")) + pairs = list(map(lambda x: x[len(self.root) :], pairs)) + assert len(pairs) == 8664, "incorrect parsing of pairs in SceneFlow" + trainpairs += pairs + # flyingthings + pairs = sorted( + glob(self.root + "FlyingThings/frames_finalpass/TRAIN/*/*/left/*.png") + ) + pairs = list(map(lambda x: x[len(self.root) :], pairs)) + assert len(pairs) == 22390, "incorrect parsing of pairs in SceneFlow" + trainpairs += pairs + assert len(trainpairs) == 35454, "incorrect parsing of pairs in SceneFlow" + testpairs = sorted( + glob(self.root + "FlyingThings/frames_finalpass/TEST/*/*/left/*.png") + ) + testpairs = list(map(lambda x: x[len(self.root) :], testpairs)) + assert len(testpairs) == 4370, "incorrect parsing of pairs in SceneFlow" + test1of100pairs = testpairs[::100] + assert len(test1of100pairs) == 44, "incorrect parsing of pairs in SceneFlow" + # all + tosave = { + "train_finalpass": trainpairs, + "train_cleanpass": list( + map( + lambda x: x.replace("frames_finalpass", "frames_cleanpass"), + trainpairs, + ) + ), + "test_finalpass": testpairs, + "test_cleanpass": list( + map( + lambda x: x.replace("frames_finalpass", "frames_cleanpass"), + testpairs, + ) + ), + "test1of100_finalpass": test1of100pairs, + "test1of100_cleanpass": list( + map( + lambda x: x.replace("frames_finalpass", "frames_cleanpass"), + test1of100pairs, + ) + ), + } + tosave["train_allpass"] = tosave["train_finalpass"] + tosave["train_cleanpass"] + tosave["test_allpass"] = tosave["test_finalpass"] + tosave["test_cleanpass"] + return tosave + + +class Md21Dataset(StereoDataset): + 
+ def _prepare_data(self): + self.name = "Middlebury2021" + self._set_root() + assert self.split in ["train", "subtrain", "subval"] + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join( + self.root, pairname.replace("/im0", "/im1") + ) + self.pairname_to_Ldispname = lambda pairname: osp.join( + self.root, pairname.split("/")[0], "disp0.pfm" + ) + self.pairname_to_str = lambda pairname: pairname[:-4] + self.load_disparity = _read_middlebury_disp + + def _build_cache(self): + seqs = sorted(os.listdir(self.root)) + trainpairs = [] + for s in seqs: + # trainpairs += [s+'/im0.png'] # we should remove it, it is included as such in other lightings + trainpairs += [ + s + "/ambient/" + b + "/" + a + for b in sorted(os.listdir(osp.join(self.root, s, "ambient"))) + for a in sorted(os.listdir(osp.join(self.root, s, "ambient", b))) + if a.startswith("im0") + ] + assert len(trainpairs) == 355 + subtrainpairs = [ + p for p in trainpairs if any(p.startswith(s + "/") for s in seqs[:-2]) + ] + subvalpairs = [ + p for p in trainpairs if any(p.startswith(s + "/") for s in seqs[-2:]) + ] + assert ( + len(subtrainpairs) == 335 and len(subvalpairs) == 20 + ), "incorrect parsing of pairs in Middlebury 2021" + tosave = {"train": trainpairs, "subtrain": subtrainpairs, "subval": subvalpairs} + return tosave + + +class Md14Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Middlebury2014" + self._set_root() + assert self.split in ["train", "subtrain", "subval"] + self.pairname_to_Limgname = lambda pairname: osp.join( + self.root, osp.dirname(pairname), "im0.png" + ) + self.pairname_to_Rimgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Ldispname = lambda pairname: osp.join( + self.root, osp.dirname(pairname), "disp0.pfm" + ) + self.pairname_to_str = lambda pairname: pairname[:-4] + self.load_disparity = _read_middlebury_disp + self.has_constant_resolution = False + + def 
class Md06Dataset(StereoDataset):
    """Middlebury 2006 stereo pairs (``view1.png`` / ``view5.png``)."""

    def _prepare_data(self):
        """Set the dataset name/root and the pairname -> file path helpers."""
        self.name = "Middlebury2006"
        self._set_root()
        assert self.split in ["train", "subtrain", "subval"]
        self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname)
        self.pairname_to_Rimgname = lambda pairname: osp.join(
            self.root, osp.dirname(pairname), "view5.png"
        )
        # the disparity file sits directly under the sequence directory
        # (first path component) and is shared by all of its variants
        self.pairname_to_Ldispname = lambda pairname: osp.join(
            self.root, pairname.split("/")[0], "disp1.png"
        )
        # Every other dataset in this file defines pairname_to_str; this one
        # did not.  Use the same rule as Md05Dataset, whose pair names have
        # the same layout: drop the 4-character ".png" extension.
        self.pairname_to_str = lambda pairname: pairname[:-4]
        self.load_disparity = _read_middlebury20052006_disp
        self.has_constant_resolution = False

    def _build_cache(self):
        """Enumerate pair names and split them into train/subtrain/subval.

        Returns:
            Dict with keys ``"train"``, ``"subtrain"`` and ``"subval"``; the
            sequences ``Rocks1`` and ``Wood2`` form ``subval``.
        """
        seqs = sorted(os.listdir(self.root))
        trainpairs = [
            osp.join(s, i, e, "view1.png")
            for s in seqs
            for i in ["Illum1", "Illum2", "Illum3"]
            for e in ["Exp0", "Exp1", "Exp2"]
        ]
        assert len(trainpairs) == 189, "incorrect parsing of pairs in Middlebury 2006"
        valseqs = ["Rocks1", "Wood2"]
        assert all(s in seqs for s in valseqs)
        subtrainpairs = [
            p for p in trainpairs if not any(p.startswith(s + "/") for s in valseqs)
        ]
        subvalpairs = [
            p for p in trainpairs if any(p.startswith(s + "/") for s in valseqs)
        ]
        assert (
            len(subtrainpairs) == 171 and len(subvalpairs) == 18
        ), "incorrect parsing of pairs in Middlebury 2006"
        tosave = {"train": trainpairs, "subtrain": subtrainpairs, "subval": subvalpairs}
        return tosave
self.pairname_to_Rimgname = lambda pairname: osp.join( + self.root, pairname, "im1.png" + ) + self.pairname_to_Ldispname = lambda pairname: ( + None + if pairname.startswith("test") + else osp.join(self.root, pairname, "disp0GT.pfm") + ) + self.pairname_to_str = lambda pairname: pairname + self.load_disparity = _read_middlebury_disp + # for submission only + self.submission_methodname = "CroCo-Stereo" + self.submission_sresolution = ( + "F" + if self.split.endswith("_full") + else ("H" if self.split.endswith("_half") else "Q") + ) + + def _build_cache(self): + trainpairs = ["train/" + s for s in sorted(os.listdir(self.root + "train/"))] + testpairs = ["test/" + s for s in sorted(os.listdir(self.root + "test/"))] + subvalpairs = trainpairs[-1:] + subtrainpairs = trainpairs[:-1] + allpairs = trainpairs + testpairs + assert ( + len(trainpairs) == 15 + and len(testpairs) == 15 + and len(subvalpairs) == 1 + and len(subtrainpairs) == 14 + and len(allpairs) == 30 + ), "incorrect parsing of pairs in Middlebury Eval v3" + tosave = {} + for r in ["full", "half", "quarter"]: + tosave.update( + **{ + "train_" + r: trainpairs, + "subtrain_" + r: subtrainpairs, + "subval_" + r: subvalpairs, + "test_" + r: testpairs, + "all_" + r: allpairs, + } + ) + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim == 2 + assert prediction.dtype == np.float32 + outfile = os.path.join( + outdir, + pairname.split("/")[0].replace("train", "training") + + self.submission_sresolution, + pairname.split("/")[1], + "disp0" + self.submission_methodname + ".pfm", + ) + os.makedirs(os.path.dirname(outfile), exist_ok=True) + writePFM(outfile, prediction) + timefile = os.path.join( + os.path.dirname(outfile), "time" + self.submission_methodname + ".txt" + ) + with open(timefile, "w") as fid: + fid.write(str(time)) + + def finalize_submission(self, outdir): + cmd = f'cd {outdir}/; zip -r "{self.submission_methodname}.zip" .' 
+ print(cmd) + os.system(cmd) + print(f"Done. Submission file at {outdir}/{self.submission_methodname}.zip") + + +class ETH3DLowResDataset(StereoDataset): + + def _prepare_data(self): + self.name = "ETH3DLowRes" + self._set_root() + assert self.split in ["train", "test", "subtrain", "subval", "all"] + self.pairname_to_Limgname = lambda pairname: osp.join( + self.root, pairname, "im0.png" + ) + self.pairname_to_Rimgname = lambda pairname: osp.join( + self.root, pairname, "im1.png" + ) + self.pairname_to_Ldispname = ( + None + if self.split == "test" + else lambda pairname: ( + None + if pairname.startswith("test/") + else osp.join( + self.root, pairname.replace("train/", "train_gt/"), "disp0GT.pfm" + ) + ) + ) + self.pairname_to_str = lambda pairname: pairname + self.load_disparity = _read_eth3d_disp + self.has_constant_resolution = False + + def _build_cache(self): + trainpairs = ["train/" + s for s in sorted(os.listdir(self.root + "train/"))] + testpairs = ["test/" + s for s in sorted(os.listdir(self.root + "test/"))] + assert ( + len(trainpairs) == 27 and len(testpairs) == 20 + ), "incorrect parsing of pairs in ETH3D Low Res" + subvalpairs = [ + "train/delivery_area_3s", + "train/electro_3l", + "train/playground_3l", + ] + assert all(p in trainpairs for p in subvalpairs) + subtrainpairs = [p for p in trainpairs if not p in subvalpairs] + assert ( + len(subvalpairs) == 3 and len(subtrainpairs) == 24 + ), "incorrect parsing of pairs in ETH3D Low Res" + tosave = { + "train": trainpairs, + "test": testpairs, + "subtrain": subtrainpairs, + "subval": subvalpairs, + "all": trainpairs + testpairs, + } + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim == 2 + assert prediction.dtype == np.float32 + outfile = os.path.join( + outdir, "low_res_two_view", pairname.split("/")[1] + ".pfm" + ) + os.makedirs(os.path.dirname(outfile), exist_ok=True) + writePFM(outfile, prediction) + timefile = outfile[:-4] + ".txt" 
+ with open(timefile, "w") as fid: + fid.write("runtime " + str(time)) + + def finalize_submission(self, outdir): + cmd = f'cd {outdir}/; zip -r "eth3d_low_res_two_view_results.zip" low_res_two_view' + print(cmd) + os.system(cmd) + print(f"Done. Submission file at {outdir}/eth3d_low_res_two_view_results.zip") + + +class BoosterDataset(StereoDataset): + + def _prepare_data(self): + self.name = "Booster" + self._set_root() + assert self.split in [ + "train_balanced", + "test_balanced", + "subtrain_balanced", + "subval_balanced", + ] # we use only the balanced version + self.pairname_to_Limgname = lambda pairname: osp.join(self.root, pairname) + self.pairname_to_Rimgname = lambda pairname: osp.join( + self.root, pairname + ).replace("/camera_00/", "/camera_02/") + self.pairname_to_Ldispname = lambda pairname: osp.join( + self.root, osp.dirname(pairname), "../disp_00.npy" + ) # same images with different colors, same gt per sequence + self.pairname_to_str = lambda pairname: pairname[:-4].replace( + "/camera_00/", "/" + ) + self.load_disparity = _read_booster_disp + + def _build_cache(self): + trainseqs = sorted(os.listdir(self.root + "train/balanced")) + trainpairs = [ + "train/balanced/" + s + "/camera_00/" + imname + for s in trainseqs + for imname in sorted( + os.listdir(self.root + "train/balanced/" + s + "/camera_00/") + ) + ] + testpairs = [ + "test/balanced/" + s + "/camera_00/" + imname + for s in sorted(os.listdir(self.root + "test/balanced")) + for imname in sorted( + os.listdir(self.root + "test/balanced/" + s + "/camera_00/") + ) + ] + assert len(trainpairs) == 228 and len(testpairs) == 191 + subtrainpairs = [p for p in trainpairs if any(s in p for s in trainseqs[:-2])] + subvalpairs = [p for p in trainpairs if any(s in p for s in trainseqs[-2:])] + # warning: if we do validation split, we should split scenes!!! 
class SpringDataset(StereoDataset):
    """Spring stereo dataset with ``.dsp5`` disparity files."""

    @staticmethod
    def _swap_frame_side(path):
        """Return ``path`` with ``frame_left`` and ``frame_right`` exchanged.

        A placeholder is needed so the first substitution is not undone by
        the second one.  The previous code used an empty placeholder, and
        ``str.replace("", x)`` inserts ``x`` between every character, which
        corrupted every right-image path.
        """
        placeholder = "\0frame_side\0"  # NUL cannot occur in a file path
        return (
            path.replace("frame_right", placeholder)
            .replace("frame_left", "frame_right")
            .replace(placeholder, "frame_left")
        )

    def _prepare_data(self):
        """Set the dataset name/root and the pairname -> file path helpers."""
        self.name = "Spring"
        self._set_root()
        assert self.split in ["train", "test", "subtrain", "subval"]
        self.pairname_to_Limgname = lambda pairname: osp.join(
            self.root, pairname + ".png"
        )
        # the "other" view of a pair: left <-> right (test pairs exist for
        # both sides, see _build_cache)
        self.pairname_to_Rimgname = lambda pairname: self._swap_frame_side(
            osp.join(self.root, pairname + ".png")
        )
        # no ground truth is returned for pair names starting with "test"
        self.pairname_to_Ldispname = lambda pairname: (
            None
            if pairname.startswith("test")
            else osp.join(self.root, pairname + ".dsp5")
            .replace("frame_left", "disp1_left")
            .replace("frame_right", "disp1_right")
        )
        self.pairname_to_str = lambda pairname: pairname
        self.load_disparity = _read_hdf5_disp

    def _build_cache(self):
        """Enumerate pair names and split them into train/test/subtrain/subval.

        Returns:
            Dict with keys ``"train"``, ``"test"``, ``"subtrain"`` and
            ``"subval"``.  Test pairs are listed for the left frames and then
            again for the right frames.
        """
        trainseqs = sorted(os.listdir(osp.join(self.root, "train")))
        trainpairs = [
            osp.join("train", s, "frame_left", f[:-4])
            for s in trainseqs
            for f in sorted(os.listdir(osp.join(self.root, "train", s, "frame_left")))
        ]
        testseqs = sorted(os.listdir(osp.join(self.root, "test")))
        testpairs = [
            osp.join("test", s, "frame_left", f[:-4])
            for s in testseqs
            for f in sorted(os.listdir(osp.join(self.root, "test", s, "frame_left")))
        ]
        testpairs += [p.replace("frame_left", "frame_right") for p in testpairs]
        # Sequence 0041 is held out as the validation split; the original
        # authors' per-sequence "maxnorm" notes list 453.5 for it.
        subtrainpairs = [p for p in trainpairs if p.split("/")[1] != "0041"]
        subvalpairs = [p for p in trainpairs if p.split("/")[1] == "0041"]
        assert (
            len(trainpairs) == 5000
            and len(testpairs) == 2000
            and len(subtrainpairs) == 4904
            and len(subvalpairs) == 96
        ), "incorrect parsing of pairs in Spring"
        tosave = {
            "train": trainpairs,
            "test": testpairs,
            "subtrain": subtrainpairs,
            "subval": subvalpairs,
        }
        return tosave

    def submission_save_pairname(self, pairname, prediction, outdir, time):
        """Write one float32 H x W prediction as a ``.dsp5`` file under ``outdir``.

        ``time`` is accepted for signature parity with the other datasets and
        is not used here.
        """
        assert prediction.ndim == 2
        assert prediction.dtype == np.float32
        outfile = (
            os.path.join(outdir, pairname + ".dsp5")
            .replace("frame_left", "disp1_left")
            .replace("frame_right", "disp1_right")
        )
        os.makedirs(os.path.dirname(outfile), exist_ok=True)
        writeDsp5File(prediction, outfile)

    def finalize_submission(self, outdir):
        """Run the ``disp1_subsampling`` tool on ``<outdir>/test`` if available.

        The executable is looked up in the dataset root; when it is missing,
        the command to run by hand is printed instead.
        """
        assert self.split == "test"
        # was a plain string with a literal "{self.root}" (missing f-prefix),
        # so the executable was never found
        exe = osp.join(self.root, "disp1_subsampling")
        if os.path.isfile(exe):
            cmd = f'cd "{outdir}/test"; "{exe}" .'
            print(cmd)
            os.system(cmd)
        else:
            print("Could not find disp1_subsampling executable for submission.")
            print("Please download it and run:")
            print(f'cd "{outdir}/test"; <path/to/disp1_subsampling> .')
Submission file at {outdir}/kitti12_results.zip") + + +class Kitti15Dataset(StereoDataset): + + def _prepare_data(self): + self.name = "Kitti15" + self._set_root() + assert self.split in ["train", "subtrain", "subval", "test"] + self.pairname_to_Limgname = lambda pairname: osp.join( + self.root, pairname + "_10.png" + ) + self.pairname_to_Rimgname = lambda pairname: osp.join( + self.root, pairname.replace("/image_2/", "/image_3/") + "_10.png" + ) + self.pairname_to_Ldispname = ( + None + if self.split == "test" + else lambda pairname: osp.join( + self.root, pairname.replace("/image_2/", "/disp_occ_0/") + "_10.png" + ) + ) + self.pairname_to_str = lambda pairname: pairname.replace("/image_2/", "/") + self.load_disparity = _read_kitti_disp + + def _build_cache(self): + trainseqs = ["training/image_2/%06d" % (i) for i in range(200)] + subtrainseqs = trainseqs[:-5] + subvalseqs = trainseqs[-5:] + testseqs = ["testing/image_2/%06d" % (i) for i in range(200)] + assert ( + len(trainseqs) == 200 + and len(subtrainseqs) == 195 + and len(subvalseqs) == 5 + and len(testseqs) == 200 + ), "incorrect parsing of pairs in Kitti15" + tosave = { + "train": trainseqs, + "subtrain": subtrainseqs, + "subval": subvalseqs, + "test": testseqs, + } + return tosave + + def submission_save_pairname(self, pairname, prediction, outdir, time): + assert prediction.ndim == 2 + assert prediction.dtype == np.float32 + outfile = os.path.join(outdir, "disp_0", pairname.split("/")[-1] + "_10.png") + os.makedirs(os.path.dirname(outfile), exist_ok=True) + img = (prediction * 256).astype("uint16") + Image.fromarray(img).save(outfile) + + def finalize_submission(self, outdir): + assert self.split == "test" + cmd = f'cd {outdir}/; zip -r "kitti15_results.zip" disp_0' + print(cmd) + os.system(cmd) + print(f"Done. 
Submission file at {outdir}/kitti15_results.zip") + + +### auxiliary functions + + +def _read_img(filename): + # convert to RGB for scene flow finalpass data + img = np.asarray(Image.open(filename).convert("RGB")) + return img + + +def _read_booster_disp(filename): + disp = np.load(filename) + disp[disp == 0.0] = np.inf + return disp + + +def _read_png_disp(filename, coef=1.0): + disp = np.asarray(Image.open(filename)) + disp = disp.astype(np.float32) / coef + disp[disp == 0.0] = np.inf + return disp + + +def _read_pfm_disp(filename): + disp = np.ascontiguousarray(_read_pfm(filename)[0]) + disp[disp <= 0] = ( + np.inf + ) # eg /nfs/data/ffs-3d/datasets/middlebury/2014/Shopvac-imperfect/disp0.pfm + return disp + + +def _read_npy_disp(filename): + return np.load(filename) + + +def _read_crestereo_disp(filename): + return _read_png_disp(filename, coef=32.0) + + +def _read_middlebury20052006_disp(filename): + return _read_png_disp(filename, coef=1.0) + + +def _read_kitti_disp(filename): + return _read_png_disp(filename, coef=256.0) + + +_read_sceneflow_disp = _read_pfm_disp +_read_eth3d_disp = _read_pfm_disp +_read_middlebury_disp = _read_pfm_disp +_read_carla_disp = _read_pfm_disp +_read_tartanair_disp = _read_npy_disp + + +def _read_hdf5_disp(filename): + disp = np.asarray(h5py.File(filename)["disparity"]) + disp[np.isnan(disp)] = np.inf # make invalid values as +inf + # disp[disp==0.0] = np.inf # make invalid values as +inf + return disp.astype(np.float32) + + +import re + + +def _read_pfm(file): + file = open(file, "rb") + + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().rstrip() + if header.decode("ascii") == "PF": + color = True + elif header.decode("ascii") == "Pf": + color = False + else: + raise Exception("Not a PFM file.") + + dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii")) + if dim_match: + width, height = list(map(int, dim_match.groups())) + else: + raise 
Exception("Malformed PFM header.") + + scale = float(file.readline().decode("ascii").rstrip()) + if scale < 0: # little-endian + endian = "<" + scale = -scale + else: + endian = ">" # big-endian + + data = np.fromfile(file, endian + "f") + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + return data, scale + + +def writePFM(file, image, scale=1): + file = open(file, "wb") + + color = None + + if image.dtype.name != "float32": + raise Exception("Image dtype must be float32.") + + image = np.flipud(image) + + if len(image.shape) == 3 and image.shape[2] == 3: # color image + color = True + elif ( + len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1 + ): # greyscale + color = False + else: + raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.") + + file.write("PF\n" if color else "Pf\n".encode()) + file.write("%d %d\n".encode() % (image.shape[1], image.shape[0])) + + endian = image.dtype.byteorder + + if endian == "<" or endian == "=" and sys.byteorder == "little": + scale = -scale + + file.write("%f\n".encode() % scale) + + image.tofile(file) + + +def writeDsp5File(disp, filename): + with h5py.File(filename, "w") as f: + f.create_dataset("disparity", data=disp, compression="gzip", compression_opts=5) + + +# disp visualization + + +def vis_disparity(disp, m=None, M=None): + if m is None: + m = disp.min() + if M is None: + M = disp.max() + disp_vis = (disp - m) / (M - m) * 255.0 + disp_vis = disp_vis.astype("uint8") + disp_vis = cv2.applyColorMap(disp_vis, cv2.COLORMAP_INFERNO) + return disp_vis + + +# dataset getter + + +def get_train_dataset_stereo(dataset_str, augmentor=True, crop_size=None): + dataset_str = dataset_str.replace("(", "Dataset(") + if augmentor: + dataset_str = dataset_str.replace(")", ", augmentor=True)") + if crop_size is not None: + dataset_str = dataset_str.replace( + ")", ", crop_size={:s})".format(str(crop_size)) + ) + return 
def get_test_datasets_stereo(dataset_str):
    """Build the list of test datasets described by ``dataset_str``.

    Every ``"("`` in the string is rewritten to ``"Dataset("``, the result is
    split on ``"+"`` and each piece is evaluated as a Python expression, e.g.
    ``"Kitti15('test')+Spring('subval')"`` evaluates
    ``Kitti15Dataset('test')`` and ``SpringDataset('subval')``.

    Returns:
        List with one evaluated object per ``+``-separated piece.
    """
    # NOTE(review): eval() runs arbitrary code -- dataset_str must only come
    # from a trusted source (e.g. the operator's own command line).
    expanded = "Dataset(".join(dataset_str.split("("))
    datasets = []
    for expression in expanded.split("+"):
        datasets.append(eval(expression))
    return datasets
def split_prediction_conf(predictions, with_conf=False):
    """Separate the confidence channel from a network output.

    Args:
        predictions: 4-D tensor indexed as ``[:, channel, :, :]``.
        with_conf: When true, the last channel is treated as the confidence.

    Returns:
        ``(predictions, conf)``.  Without ``with_conf`` the input is returned
        untouched and ``conf`` is ``None``; otherwise ``predictions`` holds
        all channels but the last and ``conf`` keeps the last channel as a
        size-1 channel dimension.
    """
    if with_conf:
        values = predictions[:, :-1, :, :]
        confidence = predictions[:, -1:, :, :]
        return values, confidence
    return predictions, None
+ with torch.cuda.amp.autocast(enabled=bool(args.amp)): + prediction = model(image1, image2) + prediction, conf = split_prediction_conf(prediction, criterion.with_conf) + batch_metrics = metrics(prediction.detach(), gt) + loss = ( + criterion(prediction, gt) + if conf is None + else criterion(prediction, gt, conf) + ) + + loss_value = loss.item() + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + sys.exit(1) + + loss /= accum_iter + loss_scaler( + loss, + optimizer, + parameters=model.parameters(), + update_grad=(data_iter_step + 1) % accum_iter == 0, + ) + if (data_iter_step + 1) % accum_iter == 0: + optimizer.zero_grad() + + torch.cuda.synchronize() + + metric_logger.update(loss=loss_value) + for k, v in batch_metrics.items(): + metric_logger.update(**{k: v.item()}) + lr = optimizer.param_groups[0]["lr"] + metric_logger.update(lr=lr) + + # if args.dsitributed: loss_value_reduce = misc.all_reduce_mean(loss_value) + time_to_log = (data_iter_step + 1) % ( + args.tboard_log_step * accum_iter + ) == 0 or data_iter_step == len_data_loader - 1 + loss_value_reduce = misc.all_reduce_mean(loss_value) + if log_writer is not None and time_to_log: + epoch_1000x = int((data_iter_step / len_data_loader + epoch) * 1000) + # We use epoch_1000x as the x-axis in tensorboard. This calibrates different curves when batch size changes. 
+ log_writer.add_scalar("train/loss", loss_value_reduce, epoch_1000x) + log_writer.add_scalar("lr", lr, epoch_1000x) + for k, v in batch_metrics.items(): + log_writer.add_scalar("train/" + k, v.item(), epoch_1000x) + + # gather the stats from all processes + # if args.distributed: metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + return {k: meter.global_avg for k, meter in metric_logger.meters.items()} + + +@torch.no_grad() +def validate_one_epoch( + model: torch.nn.Module, + criterion: torch.nn.Module, + metrics: torch.nn.Module, + data_loaders: list[Iterable], + device: torch.device, + epoch: int, + log_writer=None, + args=None, +): + + model.eval() + metric_loggers = [] + header = "Epoch: [{}]".format(epoch) + print_freq = 20 + + conf_mode = args.tile_conf_mode + crop = args.crop + + if log_writer is not None: + print("log_dir: {}".format(log_writer.log_dir)) + + results = {} + dnames = [] + image1, image2, gt, prediction = None, None, None, None + for didx, data_loader in enumerate(data_loaders): + dname = str(data_loader.dataset) + dnames.append(dname) + metric_loggers.append(misc.MetricLogger(delimiter=" ")) + for data_iter_step, (image1, image2, gt, pairname) in enumerate( + metric_loggers[didx].log_every(data_loader, print_freq, header) + ): + image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + gt = gt.to(device, non_blocking=True) + if dname.startswith("Spring"): + assert ( + gt.size(2) == image1.size(2) * 2 + and gt.size(3) == image1.size(3) * 2 + ) + gt = ( + gt[:, :, 0::2, 0::2] + + gt[:, :, 0::2, 1::2] + + gt[:, :, 1::2, 0::2] + + gt[:, :, 1::2, 1::2] + ) / 4.0 # we approximate the gt based on the 2x upsampled ones + + with torch.inference_mode(): + prediction, tiled_loss, c = tiled_pred( + model, + criterion, + image1, + image2, + gt, + conf_mode=conf_mode, + overlap=args.val_overlap, + crop=crop, + with_conf=criterion.with_conf, + ) + batch_metrics = 
metrics(prediction.detach(), gt) + loss = ( + criterion(prediction.detach(), gt) + if not criterion.with_conf + else criterion(prediction.detach(), gt, c) + ) + loss_value = loss.item() + metric_loggers[didx].update(loss_tiled=tiled_loss.item()) + metric_loggers[didx].update(**{f"loss": loss_value}) + for k, v in batch_metrics.items(): + metric_loggers[didx].update(**{dname + "_" + k: v.item()}) + + results = { + k: meter.global_avg for ml in metric_loggers for k, meter in ml.meters.items() + } + if len(dnames) > 1: + for k in batch_metrics.keys(): + results["AVG_" + k] = sum( + results[dname + "_" + k] for dname in dnames + ) / len(dnames) + + if log_writer is not None: + epoch_1000x = int((1 + epoch) * 1000) + for k, v in results.items(): + log_writer.add_scalar("val/" + k, v, epoch_1000x) + + print("Averaged stats:", results) + return results + + +import torch.nn.functional as F + + +def _resize_img(img, new_size): + return F.interpolate(img, size=new_size, mode="bicubic", align_corners=False) + + +def _resize_stereo_or_flow(data, new_size): + assert data.ndim == 4 + assert data.size(1) in [1, 2] + scale_x = new_size[1] / float(data.size(3)) + out = F.interpolate(data, size=new_size, mode="bicubic", align_corners=False) + out[:, 0, :, :] *= scale_x + if out.size(1) == 2: + scale_y = new_size[0] / float(data.size(2)) + out[:, 1, :, :] *= scale_y + print(scale_x, new_size, data.shape) + return out + + +@torch.no_grad() +def tiled_pred( + model, + criterion, + img1, + img2, + gt, + overlap=0.5, + bad_crop_thr=0.05, + downscale=False, + crop=512, + ret="loss", + conf_mode="conf_expsigmoid_10_5", + with_conf=False, + return_time=False, +): + + # for each image, we are going to run inference on many overlapping patches + # then, all predictions will be weighted-averaged + if gt is not None: + B, C, H, W = gt.shape + else: + B, _, H, W = img1.shape + C = model.head.num_channels - int(with_conf) + win_height, win_width = crop[0], crop[1] + + # upscale to be larger than 
the crop + do_change_scale = H < win_height or W < win_width + if do_change_scale: + upscale_factor = max(win_width / W, win_height / W) + original_size = (H, W) + new_size = (round(H * upscale_factor), round(W * upscale_factor)) + img1 = _resize_img(img1, new_size) + img2 = _resize_img(img2, new_size) + # resize gt just for the computation of tiled losses + if gt is not None: + gt = _resize_stereo_or_flow(gt, new_size) + H, W = img1.shape[2:4] + + if conf_mode.startswith("conf_expsigmoid_"): # conf_expsigmoid_30_10 + beta, betasigmoid = map(float, conf_mode[len("conf_expsigmoid_") :].split("_")) + elif conf_mode.startswith("conf_expbeta"): # conf_expbeta3 + beta = float(conf_mode[len("conf_expbeta") :]) + else: + raise NotImplementedError(f"conf_mode {conf_mode} is not implemented") + + def crop_generator(): + for sy in _overlapping(H, win_height, overlap): + for sx in _overlapping(W, win_width, overlap): + yield sy, sx, sy, sx, True + + # keep track of weighted sum of prediction*weights and weights + accu_pred = img1.new_zeros( + (B, C, H, W) + ) # accumulate the weighted sum of predictions + accu_conf = img1.new_zeros((B, H, W)) + 1e-16 # accumulate the weights + accu_c = img1.new_zeros( + (B, H, W) + ) # accumulate the weighted sum of confidences ; not so useful except for computing some losses + + tiled_losses = [] + + if return_time: + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + + for sy1, sx1, sy2, sx2, aligned in crop_generator(): + # compute optical flow there + pred = model(_crop(img1, sy1, sx1), _crop(img2, sy2, sx2)) + pred, predconf = split_prediction_conf(pred, with_conf=with_conf) + + if gt is not None: + gtcrop = _crop(gt, sy1, sx1) + if criterion is not None and gt is not None: + tiled_losses.append( + criterion(pred, gtcrop).item() + if predconf is None + else criterion(pred, gtcrop, predconf).item() + ) + + if conf_mode.startswith("conf_expsigmoid_"): + conf = torch.exp( + -beta * 2 
* (torch.sigmoid(predconf / betasigmoid) - 0.5) + ).view(B, win_height, win_width) + elif conf_mode.startswith("conf_expbeta"): + conf = torch.exp(-beta * predconf).view(B, win_height, win_width) + else: + raise NotImplementedError + + accu_pred[..., sy1, sx1] += pred * conf[:, None, :, :] + accu_conf[..., sy1, sx1] += conf + accu_c[..., sy1, sx1] += predconf.view(B, win_height, win_width) * conf + + pred = accu_pred / accu_conf[:, None, :, :] + c = accu_c / accu_conf + assert not torch.any(torch.isnan(pred)) + + if return_time: + end.record() + torch.cuda.synchronize() + time = start.elapsed_time(end) / 1000.0 # this was in milliseconds + + if do_change_scale: + pred = _resize_stereo_or_flow(pred, original_size) + + if return_time: + return pred, torch.mean(torch.tensor(tiled_losses)), c, time + return pred, torch.mean(torch.tensor(tiled_losses)), c + + +def _overlapping(total, window, overlap=0.5): + assert total >= window and 0 <= overlap < 1, (total, window, overlap) + num_windows = 1 + int(np.ceil((total - window) / ((1 - overlap) * window))) + offsets = np.linspace(0, total - window, num_windows).round().astype(int) + yield from (slice(x, x + window) for x in offsets) + + +def _crop(img, sy, sx): + B, THREE, H, W = img.shape + if 0 <= sy.start and sy.stop <= H and 0 <= sx.start and sx.stop <= W: + return img[:, :, sy, sx] + l, r = max(0, -sx.start), max(0, sx.stop - W) + t, b = max(0, -sy.start), max(0, sy.stop - H) + img = torch.nn.functional.pad(img, (l, r, t, b), mode="constant") + return img[:, :, slice(sy.start + t, sy.stop + t), slice(sx.start + l, sx.stop + l)] diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/test.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/test.py new file mode 100644 index 0000000000000000000000000000000000000000..15dcf769169d460b716b05acb290340b6a197a6d --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/test.py @@ -0,0 +1,303 @@ +# Copyright (C) 2022-present Naver 
Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). + +# -------------------------------------------------------- +# Main test function +# -------------------------------------------------------- + +import os +import argparse +import pickle +from PIL import Image +import numpy as np +from tqdm import tqdm + +import torch +from torch.utils.data import DataLoader + +import utils.misc as misc +from models.croco_downstream import CroCoDownstreamBinocular +from models.head_downstream import PixelwiseTaskWithDPT + +from stereoflow.criterion import * +from stereoflow.datasets_stereo import get_test_datasets_stereo +from stereoflow.datasets_flow import get_test_datasets_flow +from stereoflow.engine import tiled_pred + +from stereoflow.datasets_stereo import vis_disparity +from stereoflow.datasets_flow import flowToColor + + +def get_args_parser(): + parser = argparse.ArgumentParser("Test CroCo models on stereo/flow", add_help=False) + # important argument + parser.add_argument( + "--model", required=True, type=str, help="Path to the model to evaluate" + ) + parser.add_argument( + "--dataset", + required=True, + type=str, + help="test dataset (there can be multiple dataset separated by a +)", + ) + # tiling + parser.add_argument( + "--tile_conf_mode", + type=str, + default="", + help="Weights for the tiling aggregation based on confidence (empty means use the formula from the loaded checkpoint", + ) + parser.add_argument( + "--tile_overlap", type=float, default=0.7, help="overlap between tiles" + ) + # save (it will automatically go to _/_) + parser.add_argument( + "--save", + type=str, + nargs="+", + default=[], + help="what to save: \ + metrics (pickle file), \ + pred (raw prediction save as torch tensor), \ + visu (visualization in png of each prediction), \ + err10 (visualization in png of the error clamp at 10 for each prediction), \ + submission (submission file)", + ) + # other (no impact) + 
parser.add_argument("--num_workers", default=4, type=int) + return parser + + +def _load_model_and_criterion(model_path, do_load_metrics, device): + print("loading model from", model_path) + assert os.path.isfile(model_path) + ckpt = torch.load(model_path, "cpu") + + ckpt_args = ckpt["args"] + task = ckpt_args.task + tile_conf_mode = ckpt_args.tile_conf_mode + num_channels = {"stereo": 1, "flow": 2}[task] + with_conf = eval(ckpt_args.criterion).with_conf + if with_conf: + num_channels += 1 + print("head: PixelwiseTaskWithDPT()") + head = PixelwiseTaskWithDPT() + head.num_channels = num_channels + print("croco_args:", ckpt_args.croco_args) + model = CroCoDownstreamBinocular(head, **ckpt_args.croco_args) + msg = model.load_state_dict(ckpt["model"], strict=True) + model.eval() + model = model.to(device) + + if do_load_metrics: + if task == "stereo": + metrics = StereoDatasetMetrics().to(device) + else: + metrics = FlowDatasetMetrics().to(device) + else: + metrics = None + + return model, metrics, ckpt_args.crop, with_conf, task, tile_conf_mode + + +def _save_batch( + pred, gt, pairnames, dataset, task, save, outdir, time, submission_dir=None +): + + for i in range(len(pairnames)): + + pairname = ( + eval(pairnames[i]) if pairnames[i].startswith("(") else pairnames[i] + ) # unbatch pairname + fname = os.path.join(outdir, dataset.pairname_to_str(pairname)) + os.makedirs(os.path.dirname(fname), exist_ok=True) + + predi = pred[i, ...] + if gt is not None: + gti = gt[i, ...] 
+ + if "pred" in save: + torch.save(predi.squeeze(0).cpu(), fname + "_pred.pth") + + if "visu" in save: + if task == "stereo": + disparity = predi.permute((1, 2, 0)).squeeze(2).cpu().numpy() + m, M = None + if gt is not None: + mask = torch.isfinite(gti) + m = gt[mask].min() + M = gt[mask].max() + img_disparity = vis_disparity(disparity, m=m, M=M) + Image.fromarray(img_disparity).save(fname + "_pred.png") + else: + # normalize flowToColor according to the maxnorm of gt (or prediction if not available) + flowNorm = ( + torch.sqrt( + torch.sum((gti if gt is not None else predi) ** 2, dim=0) + ) + .max() + .item() + ) + imgflow = flowToColor( + predi.permute((1, 2, 0)).cpu().numpy(), maxflow=flowNorm + ) + Image.fromarray(imgflow).save(fname + "_pred.png") + + if "err10" in save: + assert gt is not None + L2err = torch.sqrt(torch.sum((gti - predi) ** 2, dim=0)) + valid = torch.isfinite(gti[0, :, :]) + L2err[~valid] = 0.0 + L2err = torch.clamp(L2err, max=10.0) + red = (L2err * 255.0 / 10.0).to(dtype=torch.uint8)[:, :, None] + zer = torch.zeros_like(red) + imgerr = torch.cat((red, zer, zer), dim=2).cpu().numpy() + Image.fromarray(imgerr).save(fname + "_err10.png") + + if "submission" in save: + assert submission_dir is not None + predi_np = ( + predi.permute(1, 2, 0).squeeze(2).cpu().numpy() + ) # transform into HxWx2 for flow or HxW for stereo + dataset.submission_save_pairname(pairname, predi_np, submission_dir, time) + + +def main(args): + + # load the pretrained model and metrics + device = ( + torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") + ) + model, metrics, cropsize, with_conf, task, tile_conf_mode = ( + _load_model_and_criterion(args.model, "metrics" in args.save, device) + ) + if args.tile_conf_mode == "": + args.tile_conf_mode = tile_conf_mode + + # load the datasets + datasets = ( + get_test_datasets_stereo if task == "stereo" else get_test_datasets_flow + )(args.dataset) + dataloaders = [ + DataLoader( + dataset, + 
batch_size=1, + shuffle=False, + num_workers=args.num_workers, + pin_memory=True, + drop_last=False, + ) + for dataset in datasets + ] + + # run + for i, dataloader in enumerate(dataloaders): + dataset = datasets[i] + dstr = args.dataset.split("+")[i] + + outdir = args.model + "_" + misc.filename(dstr) + if "metrics" in args.save and len(args.save) == 1: + fname = os.path.join( + outdir, f"conf_{args.tile_conf_mode}_overlap_{args.tile_overlap}.pkl" + ) + if os.path.isfile(fname) and len(args.save) == 1: + print(" metrics already compute in " + fname) + with open(fname, "rb") as fid: + results = pickle.load(fid) + for k, v in results.items(): + print("{:s}: {:.3f}".format(k, v)) + continue + + if "submission" in args.save: + dirname = ( + f"submission_conf_{args.tile_conf_mode}_overlap_{args.tile_overlap}" + ) + submission_dir = os.path.join(outdir, dirname) + else: + submission_dir = None + + print("") + print("saving {:s} in {:s}".format("+".join(args.save), outdir)) + print(repr(dataset)) + + if metrics is not None: + metrics.reset() + + for data_iter_step, (image1, image2, gt, pairnames) in enumerate( + tqdm(dataloader) + ): + + do_flip = ( + task == "stereo" + and dstr.startswith("Spring") + and any("right" in p for p in pairnames) + ) # we flip the images and will flip the prediction after as we assume img1 is on the left + + image1 = image1.to(device, non_blocking=True) + image2 = image2.to(device, non_blocking=True) + gt = ( + gt.to(device, non_blocking=True) if gt.numel() > 0 else None + ) # special case for test time + if do_flip: + assert all("right" in p for p in pairnames) + image1 = image1.flip( + dims=[3] + ) # this is already the right frame, let's flip it + image2 = image2.flip(dims=[3]) + gt = gt # that is ok + + with torch.inference_mode(): + pred, _, _, time = tiled_pred( + model, + None, + image1, + image2, + None if dataset.name == "Spring" else gt, + conf_mode=args.tile_conf_mode, + overlap=args.tile_overlap, + crop=cropsize, + 
with_conf=with_conf, + return_time=True, + ) + + if do_flip: + pred = pred.flip(dims=[3]) + + if metrics is not None: + metrics.add_batch(pred, gt) + + if any(k in args.save for k in ["pred", "visu", "err10", "submission"]): + _save_batch( + pred, + gt, + pairnames, + dataset, + task, + args.save, + outdir, + time, + submission_dir=submission_dir, + ) + + # print + if metrics is not None: + results = metrics.get_results() + for k, v in results.items(): + print("{:s}: {:.3f}".format(k, v)) + + # save if needed + if "metrics" in args.save: + os.makedirs(os.path.dirname(fname), exist_ok=True) + with open(fname, "wb") as fid: + pickle.dump(results, fid) + print("metrics saved in", fname) + + # finalize submission if needed + if "submission" in args.save: + dataset.finalize_submission(submission_dir) + + +if __name__ == "__main__": + args = get_args_parser() + args = args.parse_args() + main(args) diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/train.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/train.py new file mode 100644 index 0000000000000000000000000000000000000000..c349cb479267648cad4d8b4c282dafd7a8896076 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/stereoflow/train.py @@ -0,0 +1,455 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
+ +# -------------------------------------------------------- +# Main training function +# -------------------------------------------------------- + +import argparse +import datetime +import json +import numpy as np +import os +import sys +import time + +import torch +import torch.distributed as dist +import torch.backends.cudnn as cudnn +from torch.utils.tensorboard import SummaryWriter +import torchvision.transforms as transforms +import torchvision.datasets as datasets +from torch.utils.data import DataLoader + +import utils +import utils.misc as misc +from utils.misc import NativeScalerWithGradNormCount as NativeScaler +from models.croco_downstream import CroCoDownstreamBinocular, croco_args_from_ckpt +from models.pos_embed import interpolate_pos_embed +from models.head_downstream import PixelwiseTaskWithDPT + +from stereoflow.datasets_stereo import ( + get_train_dataset_stereo, + get_test_datasets_stereo, +) +from stereoflow.datasets_flow import get_train_dataset_flow, get_test_datasets_flow +from stereoflow.engine import train_one_epoch, validate_one_epoch +from stereoflow.criterion import * + + +def get_args_parser(): + # prepare subparsers + parser = argparse.ArgumentParser( + "Finetuning CroCo models on stereo or flow", add_help=False + ) + subparsers = parser.add_subparsers( + title="Task (stereo or flow)", dest="task", required=True + ) + parser_stereo = subparsers.add_parser("stereo", help="Training stereo model") + parser_flow = subparsers.add_parser("flow", help="Training flow model") + + def add_arg( + name_or_flags, default=None, default_stereo=None, default_flow=None, **kwargs + ): + if default is not None: + assert ( + default_stereo is None and default_flow is None + ), "setting default makes default_stereo and default_flow disabled" + parser_stereo.add_argument( + name_or_flags, + default=default if default is not None else default_stereo, + **kwargs, + ) + parser_flow.add_argument( + name_or_flags, + default=default if default is not None else 
default_flow, + **kwargs, + ) + + # output dir + add_arg( + "--output_dir", + required=True, + type=str, + help="path where to save, if empty, automatically created", + ) + # model + add_arg( + "--crop", + type=int, + nargs="+", + default_stereo=[352, 704], + default_flow=[320, 384], + help="size of the random image crops used during training.", + ) + add_arg( + "--pretrained", + required=True, + type=str, + help="Load pretrained model (required as croco arguments come from there)", + ) + # criterion + add_arg( + "--criterion", + default_stereo="LaplacianLossBounded2()", + default_flow="LaplacianLossBounded()", + type=str, + help="string to evaluate to get criterion", + ) + add_arg("--bestmetric", default_stereo="avgerr", default_flow="EPE", type=str) + # dataset + add_arg("--dataset", type=str, required=True, help="training set") + # training + add_arg("--seed", default=0, type=int, help="seed") + add_arg( + "--batch_size", + default_stereo=6, + default_flow=8, + type=int, + help="Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus", + ) + add_arg("--epochs", default=32, type=int, help="number of training epochs") + add_arg( + "--img_per_epoch", + type=int, + default=None, + help="Fix the number of images seen in an epoch (None means use all training pairs)", + ) + add_arg( + "--accum_iter", + default=1, + type=int, + help="Accumulate gradient iterations (for increasing the effective batch size under memory constraints)", + ) + add_arg( + "--weight_decay", type=float, default=0.05, help="weight decay (default: 0.05)" + ) + add_arg( + "--lr", + type=float, + default_stereo=3e-5, + default_flow=2e-5, + metavar="LR", + help="learning rate (absolute lr)", + ) + add_arg( + "--min_lr", + type=float, + default=0.0, + metavar="LR", + help="lower lr bound for cyclic schedulers that hit 0", + ) + add_arg( + "--warmup_epochs", type=int, default=1, metavar="N", help="epochs to warmup LR" + ) + add_arg( + "--optimizer", + default="AdamW(param_groups, 
lr=args.lr, betas=(0.9, 0.95))", + type=str, + help="Optimizer from torch.optim [ default: AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) ]", + ) + add_arg( + "--amp", + default=0, + type=int, + choices=[0, 1], + help="enable automatic mixed precision training", + ) + # validation + add_arg( + "--val_dataset", + type=str, + default="", + help="Validation sets, multiple separated by + (empty string means that no validation is performed)", + ) + add_arg( + "--tile_conf_mode", + type=str, + default_stereo="conf_expsigmoid_15_3", + default_flow="conf_expsigmoid_10_5", + help="Weights for tile aggregation", + ) + add_arg( + "--val_overlap", default=0.7, type=float, help="Overlap value for the tiling" + ) + # others + add_arg("--num_workers", default=8, type=int) + add_arg("--eval_every", type=int, default=1, help="Val loss evaluation frequency") + add_arg("--save_every", type=int, default=1, help="Save checkpoint frequency") + add_arg( + "--start_from", + type=str, + default=None, + help="Start training using weights from an other model (eg for finetuning)", + ) + add_arg( + "--tboard_log_step", + type=int, + default=100, + help="Log to tboard every so many steps", + ) + add_arg( + "--dist_url", default="env://", help="url used to set up distributed training" + ) + + return parser + + +def main(args): + misc.init_distributed_mode(args) + global_rank = misc.get_rank() + num_tasks = misc.get_world_size() + + assert os.path.isfile(args.pretrained) + print("output_dir: " + args.output_dir) + os.makedirs(args.output_dir, exist_ok=True) + + # fix the seed for reproducibility + seed = args.seed + misc.get_rank() + torch.manual_seed(seed) + np.random.seed(seed) + cudnn.benchmark = True + + # Metrics / criterion + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + metrics = (StereoMetrics if args.task == "stereo" else FlowMetrics)().to(device) + criterion = eval(args.criterion).to(device) + print("Criterion: ", args.criterion) + + # Prepare 
model + assert os.path.isfile(args.pretrained) + ckpt = torch.load(args.pretrained, "cpu") + croco_args = croco_args_from_ckpt(ckpt) + croco_args["img_size"] = (args.crop[0], args.crop[1]) + print("Croco args: " + str(croco_args)) + args.croco_args = croco_args # saved for test time + # prepare head + num_channels = {"stereo": 1, "flow": 2}[args.task] + if criterion.with_conf: + num_channels += 1 + print(f"Building head PixelwiseTaskWithDPT() with {num_channels} channel(s)") + head = PixelwiseTaskWithDPT() + head.num_channels = num_channels + # build model and load pretrained weights + model = CroCoDownstreamBinocular(head, **croco_args) + interpolate_pos_embed(model, ckpt["model"]) + msg = model.load_state_dict(ckpt["model"], strict=False) + print(msg) + + total_params = sum(p.numel() for p in model.parameters()) + total_params_trainable = sum( + p.numel() for p in model.parameters() if p.requires_grad + ) + print(f"Total params: {total_params}") + print(f"Total params trainable: {total_params_trainable}") + model_without_ddp = model.to(device) + + eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() + print("lr: %.2e" % args.lr) + print("accumulate grad iterations: %d" % args.accum_iter) + print("effective batch size: %d" % eff_batch_size) + + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[args.gpu], static_graph=True + ) + model_without_ddp = model.module + + # following timm: set wd as 0 for bias and norm layers + param_groups = misc.get_parameter_groups(model_without_ddp, args.weight_decay) + optimizer = eval(f"torch.optim.{args.optimizer}") + print(optimizer) + loss_scaler = NativeScaler() + + # automatic restart + last_ckpt_fname = os.path.join(args.output_dir, f"checkpoint-last.pth") + args.resume = last_ckpt_fname if os.path.isfile(last_ckpt_fname) else None + + if not args.resume and args.start_from: + print(f"Starting from an other model's weights: {args.start_from}") + best_so_far = 
None + args.start_epoch = 0 + ckpt = torch.load(args.start_from, "cpu") + msg = model_without_ddp.load_state_dict(ckpt["model"], strict=False) + print(msg) + else: + best_so_far = misc.load_model( + args=args, + model_without_ddp=model_without_ddp, + optimizer=optimizer, + loss_scaler=loss_scaler, + ) + + if best_so_far is None: + best_so_far = np.inf + + # tensorboard + log_writer = None + if global_rank == 0 and args.output_dir is not None: + log_writer = SummaryWriter( + log_dir=args.output_dir, purge_step=args.start_epoch * 1000 + ) + + # dataset and loader + print("Building Train Data loader for dataset: ", args.dataset) + train_dataset = ( + get_train_dataset_stereo if args.task == "stereo" else get_train_dataset_flow + )(args.dataset, crop_size=args.crop) + + def _print_repr_dataset(d): + if isinstance(d, torch.utils.data.dataset.ConcatDataset): + for dd in d.datasets: + _print_repr_dataset(dd) + else: + print(repr(d)) + + _print_repr_dataset(train_dataset) + print(" total length:", len(train_dataset)) + if args.distributed: + sampler_train = torch.utils.data.DistributedSampler( + train_dataset, num_replicas=num_tasks, rank=global_rank, shuffle=True + ) + else: + sampler_train = torch.utils.data.RandomSampler(train_dataset) + data_loader_train = torch.utils.data.DataLoader( + train_dataset, + sampler=sampler_train, + batch_size=args.batch_size, + num_workers=args.num_workers, + pin_memory=True, + drop_last=True, + ) + if args.val_dataset == "": + data_loaders_val = None + else: + print("Building Val Data loader for datasets: ", args.val_dataset) + val_datasets = ( + get_test_datasets_stereo + if args.task == "stereo" + else get_test_datasets_flow + )(args.val_dataset) + for val_dataset in val_datasets: + print(repr(val_dataset)) + data_loaders_val = [ + DataLoader( + val_dataset, + batch_size=1, + shuffle=False, + num_workers=args.num_workers, + pin_memory=True, + drop_last=False, + ) + for val_dataset in val_datasets + ] + bestmetric = ( + "AVG_" + if 
len(data_loaders_val) > 1 + else str(data_loaders_val[0].dataset) + "_" + ) + args.bestmetric + + print(f"Start training for {args.epochs} epochs") + start_time = time.time() + # Training Loop + for epoch in range(args.start_epoch, args.epochs): + + if args.distributed: + data_loader_train.sampler.set_epoch(epoch) + + # Train + epoch_start = time.time() + train_stats = train_one_epoch( + model, + criterion, + metrics, + data_loader_train, + optimizer, + device, + epoch, + loss_scaler, + log_writer=log_writer, + args=args, + ) + epoch_time = time.time() - epoch_start + + if args.distributed: + dist.barrier() + + # Validation (current naive implementation runs the validation on every gpu ... not smart ...) + if ( + data_loaders_val is not None + and args.eval_every > 0 + and (epoch + 1) % args.eval_every == 0 + ): + val_epoch_start = time.time() + val_stats = validate_one_epoch( + model, + criterion, + metrics, + data_loaders_val, + device, + epoch, + log_writer=log_writer, + args=args, + ) + val_epoch_time = time.time() - val_epoch_start + + val_best = val_stats[bestmetric] + + # Save best of all + if val_best <= best_so_far: + best_so_far = val_best + misc.save_model( + args=args, + model_without_ddp=model_without_ddp, + optimizer=optimizer, + loss_scaler=loss_scaler, + epoch=epoch, + best_so_far=best_so_far, + fname="best", + ) + + log_stats = { + **{f"train_{k}": v for k, v in train_stats.items()}, + "epoch": epoch, + **{f"val_{k}": v for k, v in val_stats.items()}, + } + else: + log_stats = { + **{f"train_{k}": v for k, v in train_stats.items()}, + "epoch": epoch, + } + + if args.distributed: + dist.barrier() + + # Save stuff + if args.output_dir and ( + (epoch + 1) % args.save_every == 0 or epoch + 1 == args.epochs + ): + misc.save_model( + args=args, + model_without_ddp=model_without_ddp, + optimizer=optimizer, + loss_scaler=loss_scaler, + epoch=epoch, + best_so_far=best_so_far, + fname="last", + ) + + if args.output_dir: + if log_writer is not None: + 
log_writer.flush() + with open( + os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8" + ) as f: + f.write(json.dumps(log_stats) + "\n") + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print("Training time {}".format(total_time_str)) + + +if __name__ == "__main__": + args = get_args_parser() + args = args.parse_args() + main(args) diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/utils/misc.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..f085dc963e4d9050b859db19bdb8bc440549f0dd --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/croco/utils/misc.py @@ -0,0 +1,635 @@ +# Copyright (C) 2022-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). +# +# -------------------------------------------------------- +# utilitary functions for CroCo +# -------------------------------------------------------- +# References: +# MAE: https://github.com/facebookresearch/mae +# DeiT: https://github.com/facebookresearch/deit +# BEiT: https://github.com/microsoft/unilm/tree/master/beit +# -------------------------------------------------------- + +import builtins +import datetime +import os +import time +import math +import json +from collections import defaultdict, deque +from pathlib import Path +import numpy as np +from itertools import islice + +import torch +import torch.distributed as dist +from torch import inf +from accelerate import Accelerator +from accelerate.logging import get_logger + +printer = get_logger(__name__, log_level="DEBUG") + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values.""" + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = 
fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self, accelerator: Accelerator): + """Synchronize the count and total across all processes.""" + if accelerator.num_processes == 1: + return + t = torch.tensor( + [self.count, self.total], dtype=torch.float64, device=accelerator.device + ) + accelerator.wait_for_everyone() + accelerator.reduce(t, reduction="sum") + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + return torch.tensor(list(self.deque)).median().item() + + @property + def avg(self): + return torch.tensor(list(self.deque), dtype=torch.float32).mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value, + ) + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if v is None: + continue + if isinstance(v, torch.Tensor): + if v.ndim > 0: + continue + v = v.item() + if isinstance(v, list): + continue + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError( + "'{}' object has no attribute '{}'".format(type(self).__name__, attr) + ) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append("{}: {}".format(name, str(meter))) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self, accelerator): + for meter in self.meters.values(): + 
meter.synchronize_between_processes(accelerator) + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every( + self, iterable, print_freq, accelerator: Accelerator, header=None, max_iter=None, start_step=0, + ): + # `start_step` is used for resume: skip the first `start_step` iterations + # while keeping correct logging indices/ETA based on the original iterable length. + start_step = int(start_step or 0) + i = start_step + if not header: + header = "" + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt="{avg:.4f}") + data_time = SmoothedValue(fmt="{avg:.4f}") + # Compute the total length BEFORE slicing; `islice` itself has no __len__. + len_iterable = min(len(iterable), max_iter) if max_iter else len(iterable) + if start_step > 0: + iterable = islice(iterable, start_step, None) + space_fmt = ":" + str(len(str(len_iterable))) + "d" + log_msg = [ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ] + if torch.cuda.is_available(): + log_msg.append("max mem: {memory:.0f}") + log_msg = self.delimiter.join(log_msg) + MB = 1024.0 * 1024.0 + for it, obj in enumerate(iterable): + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len_iterable - 1: + eta_seconds = iter_time.global_avg * (len_iterable - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + if accelerator.is_main_process: + printer.info( + log_msg.format( + i, + len_iterable, + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB, + ) + ) + else: + if accelerator.is_main_process: + printer.info( + log_msg.format( + i, + len_iterable, + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), + ) + ) + i += 1 + end = time.time() + if max_iter and it >= max_iter: + break + # if i + 
start_step >= len_iterable: + # break + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + if accelerator.is_main_process: + printer.info( + "{} Total time: {} ({:.4f} s / it)".format( + header, total_time_str, total_time / len_iterable + ) + ) + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + builtin_print = builtins.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + force = force or (get_world_size() > 8) + if is_master or force: + now = datetime.datetime.now().time() + builtin_print("[{}] ".format(now), end="") # print with time stamp + builtin_print(*args, **kwargs) + + builtins.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(accelerator: Accelerator): + return accelerator.is_main_process + + +def save_on_master(accelerator: Accelerator, *args, **kwargs): + if is_main_process(accelerator): + # torch.save(*args, **kwargs) + accelerator.save(*args, **kwargs) + # unwrapped_model = accelerator.unwrap_model(model) + # accelerator.save(unwrapped_model.state_dict(), checkpoint_path) + + +def init_distributed_mode(args): + nodist = args.nodist if hasattr(args, "nodist") else False + if "RANK" in os.environ and "WORLD_SIZE" in os.environ and not nodist: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ["WORLD_SIZE"]) + args.gpu = int(os.environ["LOCAL_RANK"]) + else: + print("Not using distributed mode") + setup_for_distributed(is_master=True) # hack + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + 
class NativeScalerWithGradNormCount:
    """Backward / clip / step helper that delegates to an `accelerator` object.

    The instance is called once per optimisation step. It uses the
    accelerator's `backward`, `clip_grad_norm_`, `unscale_gradients` and
    `scaler` members and returns the gradient norm when one was computed.

    A commented-out earlier variant that drove `torch.cuda.amp.GradScaler`
    directly used to follow this class; it was dead code and has been removed
    (recover it from version control if needed).
    """

    state_dict_key = "amp_scaler"

    def __init__(self, enabled=True, accelerator: Accelerator = None):
        # `enabled` is accepted for signature compatibility only; it is not used.
        self.accelerator = accelerator

    def __call__(
        self,
        loss,
        optimizer,
        clip_grad=None,
        parameters=None,
        create_graph=False,
        update_grad=True,
    ):
        """Back-propagate `loss` and, if `update_grad`, step `optimizer`.

        Args:
            loss: value passed to `accelerator.backward`.
            optimizer: object whose `step()` is called when `update_grad` is true.
            clip_grad: max norm for clipping; requires `parameters`.
            parameters: parameters to clip / measure. May be None when
                `clip_grad` is None, in which case no norm is computed.
            create_graph: forwarded to `accelerator.backward`.
            update_grad: when false only the backward pass runs (gradient
                accumulation) and None is returned.

        Returns:
            The gradient norm, or None if it was not computed.
        """
        self.accelerator.backward(loss, create_graph=create_graph)
        if not update_grad:
            return None

        if clip_grad is not None:
            assert parameters is not None
            norm = self.accelerator.clip_grad_norm_(parameters, clip_grad)
        else:
            if self.accelerator.scaler is not None:
                self.accelerator.unscale_gradients()
            # `parameters` defaults to None; previously that default crashed
            # inside get_grad_norm_. Skip the measurement instead.
            norm = get_grad_norm_(parameters) if parameters is not None else None
        optimizer.step()
        return norm

    def state_dict(self):
        """State of `accelerator.scaler`, or an empty dict when there is none."""
        scaler = self.accelerator.scaler
        return scaler.state_dict() if scaler is not None else {}

    def load_state_dict(self, state_dict):
        """Restore `accelerator.scaler` from `state_dict` (no-op without a scaler)."""
        if self.accelerator.scaler is not None:
            self.accelerator.scaler.load_state_dict(state_dict)
def get_grad_norm_(parameters, norm_type: float = 2.0) -> torch.Tensor:
    """Return the total `norm_type`-norm of the gradients of `parameters`.

    `parameters` may be a single tensor or an iterable; entries whose `.grad`
    is None are ignored. With nothing left, a zero scalar tensor is returned.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    grads = [p.grad.detach() for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    if not grads:
        return torch.tensor(0.0)
    # Everything is gathered on the device of the first gradient.
    device = grads[0].device
    if norm_type == inf:
        return max(g.abs().max().to(device) for g in grads)
    per_tensor = torch.stack([torch.norm(g, norm_type).to(device) for g in grads])
    return torch.norm(per_tensor, norm_type)


def save_model(
    accelerator,
    args,
    epoch,
    model_without_ddp,
    optimizer,
    loss_scaler,
    step,
    fname=None,
    best_so_far=None,
):
    """Write a training checkpoint plus a weights-only `model.pth` (main process only).

    The full checkpoint goes to `<args.output_dir>/checkpoint-<fname>.pth`
    (`fname` defaults to the epoch number) and holds model, optimizer and
    scaler state, `args`, `epoch`, `step`, and `best_so_far` when given.
    """
    if not accelerator.is_main_process:
        return

    out_dir = Path(args.output_dir)
    tag = str(epoch) if fname is None else fname
    full_path = out_dir / ("checkpoint-%s.pth" % tag)
    payload = {
        "model": model_without_ddp.state_dict(),
        "optimizer": optimizer.state_dict(),
        "scaler": loss_scaler.state_dict(),
        "args": args,
        "epoch": epoch,
        "step": step,
    }
    if best_so_far is not None:
        payload["best_so_far"] = best_so_far
    print(f">> Saving model to {full_path} ...")
    save_on_master(accelerator, payload, full_path)

    # Second, lightweight file with the weights only.
    weights_only = {
        "model": model_without_ddp.state_dict(),
    }
    save_on_master(accelerator, weights_only, out_dir / ("model.pth"))
def load_model(args, model_without_ddp, optimizer, loss_scaler):
    """Resume training state from `args.resume`, if set.

    Always resets `args.start_epoch` and `args.start_step` to 0 first. When
    `args.resume` is not None the checkpoint is loaded (from a URL if it
    starts with "https", else from disk), the model weights are restored
    strictly, `args.start_epoch` becomes `checkpoint["epoch"] + 1`,
    `args.start_step` is taken from `checkpoint["step"]` when present, and
    optimizer / scaler state are restored when present.

    Changes versus the previous implementation:
      * only a *leading* "module." wrapper prefix is removed from parameter
        names; `str.replace` also rewrote keys such as "sub_module.weight";
      * local checkpoints are mapped to CUDA only when CUDA is available, so
        resuming works on CPU-only hosts.

    Returns:
        `checkpoint["best_so_far"]` if stored, otherwise None.
    """
    args.start_epoch = 0
    args.start_step = 0
    best_so_far = None
    if args.resume is None:
        return best_so_far

    if args.resume.startswith("https"):
        checkpoint = torch.hub.load_state_dict_from_url(
            args.resume, map_location="cpu", check_hash=True
        )
    else:
        map_location = "cuda" if torch.cuda.is_available() else "cpu"
        # NOTE(review): weights_only=False unpickles arbitrary objects; only
        # resume from checkpoints you trust.
        checkpoint = torch.load(
            args.resume, map_location=map_location, weights_only=False
        )
    printer.info("Resume checkpoint %s" % args.resume)

    prefix = "module."
    new_state_dict = {}
    for key, value in checkpoint["model"].items():
        # Strip (possibly repeated) wrapper prefixes, but never touch the
        # middle of a name.
        while key.startswith(prefix):
            key = key[len(prefix):]
        new_state_dict[key] = value
    model_without_ddp.load_state_dict(new_state_dict, strict=True)

    args.start_epoch = checkpoint["epoch"] + 1
    if "step" in checkpoint:
        args.start_step = checkpoint["step"]
    device = next(model_without_ddp.parameters()).device
    printer.info(f"Moving optimizer state to device: {device}")

    # NOTE(review): the nesting of the statements below was reconstructed from
    # a flattened source; each piece of state is restored only if present.
    if "optimizer" in checkpoint:
        for state in checkpoint["optimizer"]["state"].values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)
        optimizer.load_state_dict(checkpoint["optimizer"])

    if "scaler" in checkpoint:
        loss_scaler.load_state_dict(checkpoint["scaler"])
    if "best_so_far" in checkpoint:
        best_so_far = checkpoint["best_so_far"]
        printer.info(" & best_so_far={:g}".format(best_so_far))
    else:
        printer.info("")
    printer.info("With optim & sched! start_epoch={:d}".format(args.start_epoch))
    return best_so_far
def all_reduce_mean(x, accelerator):
    """Return the mean of the scalar `x` over all processes.

    With a single process `x` is returned unchanged. Otherwise the value is
    summed across processes with `accelerator.reduce` and divided by
    `accelerator.state.num_processes`.

    Fixes: `Accelerator.reduce` takes `reduction=`, not `reduce_op=`, and
    returns the reduced tensor instead of updating its argument in place, so
    the result must be captured. The tensor is created on
    `accelerator.device` rather than via a hard-coded `.cuda()`.
    """
    num_processes = accelerator.state.num_processes
    if num_processes <= 1:
        return x
    local = torch.tensor(x, device=accelerator.device)
    total = accelerator.reduce(local, reduction="sum")
    return (total / num_processes).item()


def _replace(text, src, tgt, rm=""):
    """Advanced string replacement.

    Given a text:
     - replace all elements in src by the corresponding element in tgt
       (a single-element `tgt` is reused for every element of `src`)
     - remove all elements in rm
    Replacements are applied one after another, in order.
    """
    if len(tgt) == 1:
        tgt = tgt * len(src)
    assert len(src) == len(tgt), f"'{src}' and '{tgt}' should have the same len"
    for s, t in zip(src, tgt):
        text = text.replace(s, t)
    for c in rm:
        text = text.replace(c, "")
    return text


def filename(obj):
    """Transform a python obj or cmd into a proper filename.

    Non-string objects go through `repr`. Then "()" is dropped, the
    characters `_ , ( * /` become `- _ _ x %`, spaces, `)` and quotes are
    removed, and the control characters chr(1) / chr(2) are turned into
    slash '/' and comma ','. Every path component must stay below 256
    characters.
    """
    if not isinstance(obj, str):
        obj = repr(obj)
    obj = str(obj).replace("()", "")
    obj = _replace(obj, "_,(*/\1\2", "-__x%/,", rm=" )'\"")
    assert all(len(s) < 256 for s in obj.split(os.sep)), (
        "filename too long (>256 characters):\n" + obj
    )
    return obj


def _get_num_layer_for_vit(var_name, enc_depth, dec_depth):
    """Map a parameter name to its layer index for layer-wise LR decay.

    Embeddings and tokens are layer 0, encoder block k is k + 1, the encoder
    norm and decoder embedding share `enc_depth`, decoder block k is
    `enc_depth + k + 1`, the decoder norm is `enc_depth + dec_depth` and the
    heads come last. Unknown names raise NotImplementedError.
    """
    if var_name in ("cls_token", "mask_token", "pos_embed", "global_tokens"):
        return 0
    elif var_name.startswith("patch_embed"):
        return 0
    elif var_name.startswith("enc_blocks"):
        layer_id = int(var_name.split(".")[1])
        return layer_id + 1
    elif var_name.startswith("decoder_embed") or var_name.startswith(
        "enc_norm"
    ):  # part of the last block
        return enc_depth
    elif var_name.startswith("dec_blocks"):
        layer_id = int(var_name.split(".")[1])
        return enc_depth + layer_id + 1
    elif var_name.startswith("dec_norm"):  # part of the last block
        return enc_depth + dec_depth
    elif any(var_name.startswith(k) for k in ["head", "prediction_head"]):
        return enc_depth + dec_depth + 1
    else:
        raise NotImplementedError(var_name)
def get_parameter_groups(
    model, weight_decay, layer_decay=1.0, skip_list=(), no_lr_scale_list=()
):
    """Build optimizer parameter groups with weight decay and layer-wise LR scale.

    Trainable parameters are split into "decay" / "no_decay" groups (1-D
    tensors, names ending in ".bias" and names in `skip_list` get no weight
    decay), with an "_enc_blocks" suffix for encoder-block parameters. When
    `layer_decay < 1`, groups are further split per layer and receive
    `lr_scale = layer_decay ** (num_layers + 1 - layer_id)`; names listed in
    `no_lr_scale_list` keep `lr_scale = 1`.

    Args:
        model: module providing `named_parameters()`; with layer decay also
            `enc_depth` and, if it has `dec_blocks`, `dec_depth`.
        weight_decay: decay applied to the "decay" groups.
        layer_decay: 1.0 (disabled) or a factor strictly between 0 and 1.
        skip_list: parameter names excluded from weight decay.
        no_lr_scale_list: parameter names excluded from LR scaling. The
            default is now an immutable tuple instead of a shared list.

    Returns:
        A list of dicts with keys "weight_decay", "params" and "lr_scale".
    """
    parameter_group_names = {}
    parameter_group_vars = {}
    enc_depth, dec_depth = None, None
    # prepare layer decay values
    assert layer_decay == 1.0 or 0.0 < layer_decay < 1.0
    use_layer_decay = layer_decay < 1.0
    if use_layer_decay:
        enc_depth = model.enc_depth
        dec_depth = model.dec_depth if hasattr(model, "dec_blocks") else 0
        num_layers = enc_depth + dec_depth
        layer_decay_values = [
            layer_decay ** (num_layers + 1 - i) for i in range(num_layers + 2)
        ]

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights

        # Assign weight decay values
        no_decay = len(param.shape) == 1 or name.endswith(".bias") or name in skip_list
        group_name = "no_decay" if no_decay else "decay"
        if "enc_blocks" in name:
            group_name += "_enc_blocks"
        this_weight_decay = 0.0 if no_decay else weight_decay

        # Assign layer ID for LR scaling
        if use_layer_decay:
            skip_scale = False
            layer_id = _get_num_layer_for_vit(name, enc_depth, dec_depth)
            group_name = "layer_%d_%s" % (layer_id, group_name)
            if name in no_lr_scale_list:
                skip_scale = True
                group_name = f"{group_name}_no_lr_scale"
        else:
            layer_id = 0
            skip_scale = True

        if group_name not in parameter_group_names:
            scale = 1.0 if skip_scale else layer_decay_values[layer_id]
            # Two parallel records: one with names (for logging), one with tensors.
            for registry in (parameter_group_names, parameter_group_vars):
                registry[group_name] = {
                    "weight_decay": this_weight_decay,
                    "params": [],
                    "lr_scale": scale,
                }

        parameter_group_vars[group_name]["params"].append(param)
        parameter_group_names[group_name]["params"].append(name)
    printer.info("Param groups = %s" % json.dumps(parameter_group_names, indent=2))
    return list(parameter_group_vars.values())
def adjust_learning_rate(optimizer, epoch, args):
    """Set the learning rate: linear warmup, then half-cycle cosine decay.

    Below `args.warmup_epochs` the rate grows linearly towards `args.lr`;
    afterwards it follows a cosine from `args.lr` down to `args.min_lr` at
    `args.epochs`. Every param group gets `lr`, multiplied by its own
    "lr_scale" entry when it has one. Returns the unscaled rate.
    """
    warmup = args.warmup_epochs
    if epoch < warmup:
        lr = args.lr * epoch / warmup
    else:
        phase = math.pi * (epoch - warmup) / (args.epochs - warmup)
        lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * (1.0 + math.cos(phase))

    for group in optimizer.param_groups:
        group["lr"] = lr * group["lr_scale"] if "lr_scale" in group else lr

    return lr


# ---- diff file boundaries contained in this span (kept as comments) ----
# diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/__init__.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/__init__.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
# diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/alignment.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/alignment.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..63a92f05ff09a1101de177e8eca16d70046777d0
# --- /dev/null
# +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/alignment.py
# @@ -0,0 +1,416 @@
from typing import *
import math
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.types
# import utils3d


def scatter_min(size: int, dim: int, index: torch.LongTensor, src: torch.Tensor) -> torch.return_types.min:
    """Per-bucket minimum of `src` along `dim`, together with its position.

    `index` assigns every element of `src` to one of `size` buckets along
    `dim`. Returns a `(values, indices)` pair whose `dim` has length `size`:
    `values` is the minimum of each bucket (inf for empty buckets) and
    `indices` is a position along `dim` in `src` attaining it (-1 for empty
    buckets).
    """
    out_shape = src.shape[:dim] + (size,) + src.shape[dim + 1:]
    start = torch.full(out_shape, float('inf'), dtype=src.dtype, device=src.device)
    minimum = start.scatter_reduce(dim=dim, index=index, src=src, reduce='amin', include_self=False)
    # Coordinates of the src entries equal to the minimum of their own bucket.
    hits = torch.where(src == torch.gather(minimum, dim=dim, index=index))
    indices = torch.full(out_shape, -1, dtype=torch.long, device=src.device)
    target = (*hits[:dim], index[hits], *hits[dim + 1:])
    indices[target] = hits[dim]
    return torch.return_types.min((minimum, indices))
def split_batch_fwd(fn: Callable, chunk_size: int, *args, **kwargs):
    """Apply `fn` to chunks of the batch dimension and concatenate the results.

    Tensor arguments (positional or keyword) are split along dim 0 into
    pieces of at most `chunk_size`; every other argument is passed unchanged
    to each call. The batch size is read from the first tensor argument.
    `fn` may return a tensor or a tuple of tensors.

    Fix: keyword arguments were wrapped in an extra list, so the first chunk
    received the whole tuple of pieces and later chunks raised IndexError.
    """
    batch_size = next(x for x in (*args, *kwargs.values()) if isinstance(x, torch.Tensor)).shape[0]
    n_chunks = batch_size // chunk_size + (batch_size % chunk_size > 0)

    def _pieces(value):
        # One entry per chunk: a slice for tensors, the value itself otherwise.
        if isinstance(value, torch.Tensor):
            return value.split(chunk_size, dim=0)
        return [value] * n_chunks

    splited_args = tuple(_pieces(arg) for arg in args)
    splited_kwargs = {k: _pieces(v) for k, v in kwargs.items()}
    results = []
    for i in range(n_chunks):
        chunk_args = tuple(arg[i] for arg in splited_args)
        chunk_kwargs = {k: v[i] for k, v in splited_kwargs.items()}
        results.append(fn(*chunk_args, **chunk_kwargs))

    if isinstance(results[0], tuple):
        return tuple(torch.cat(r, dim=0) for r in zip(*results))
    else:
        return torch.cat(results, dim=0)


def _pad_inf(x_: torch.Tensor):
    "Pad the last dimension with -inf in front and +inf at the end."
    return torch.cat([torch.full_like(x_[..., :1], -torch.inf), x_, torch.full_like(x_[..., :1], torch.inf)], dim=-1)


def _pad_cumsum(cumsum: torch.Tensor):
    "Pad a cumulative sum with a leading 0 and a repeated final value."
    return torch.cat([torch.zeros_like(cumsum[..., :1]), cumsum, cumsum[..., -1:]], dim=-1)


def _compute_residual(a: torch.Tensor, xyw: torch.Tensor, trunc: float):
    "Sum over the last axis of min(trunc, w * |a * x - y|), with (x, y, w) stacked in `xyw[..., 0:3]`."
    return a.mul(xyw[..., 0]).sub_(xyw[..., 1]).abs_().mul_(xyw[..., 2]).clamp_max_(trunc).sum(dim=-1)


def align(x: torch.Tensor, y: torch.Tensor, w: torch.Tensor, trunc: Optional[Union[float, torch.Tensor]] = None, eps: float = 1e-7) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]:
    """
    If trunc is None, solve `min sum_i w_i * |a * x_i - y_i|`, otherwise solve `min sum_i min(trunc, w_i * |a * x_i - y_i|)`.

    w_i must be >= 0.

    The optimum is always one of the ratios `y_i / x_i`. Without truncation it is
    found as a weighted median; with truncation every ratio whose one-sided
    derivatives change sign is evaluated and the best one is kept.

    ### Parameters:
    - `x`: tensor of shape (..., n)
    - `y`: tensor of shape (..., n)
    - `w`: tensor of shape (..., n)
    - `trunc`: optional, float or tensor of shape (..., n) or None

    ### Returns:
    - `a`: tensor of shape (...), differentiable
    - `loss`: tensor of shape (...), value of loss function at `a`, detached
    - `index`: tensor of shape (...), where a = y[idx] / x[idx]
    """
    if trunc is None:
        x, y, w = torch.broadcast_tensors(x, y, w)
        # Flip signs so that x >= 0; the ratio y / x is unchanged.
        sign = torch.sign(x)
        x, y = x * sign, y * sign
        y_div_x = y / x.clamp_min(eps)
        y_div_x, argsort = y_div_x.sort(dim=-1)

        wx = torch.gather(x * w, dim=-1, index=argsort)
        # Derivative of the objective just right of each sorted ratio.
        derivatives = 2 * wx.cumsum(dim=-1) - wx.sum(dim=-1, keepdim=True)
        # First ratio at which the derivative becomes non-negative.
        search = torch.searchsorted(derivatives, torch.zeros_like(derivatives[..., :1]), side='left').clamp_max(derivatives.shape[-1] - 1)

        a = y_div_x.gather(dim=-1, index=search).squeeze(-1)
        index = argsort.gather(dim=-1, index=search).squeeze(-1)
        loss = (w * (a[..., None] * x - y).abs()).sum(dim=-1)

    else:
        # Reshape to (batch_size, n) for simplicity
        x, y, w = torch.broadcast_tensors(x, y, w)
        batch_shape = x.shape[:-1]
        batch_size = math.prod(batch_shape)
        x, y, w = x.reshape(-1, x.shape[-1]), y.reshape(-1, y.shape[-1]), w.reshape(-1, w.shape[-1])

        sign = torch.sign(x)
        x, y = x * sign, y * sign
        wx, wy = w * x, w * y
        xyw = torch.stack([x, y, w], dim=-1)  # Stacked for convenient gathering

        # A: kink of each term; B / C: where each term enters / leaves its untruncated range.
        y_div_x = A = y / x.clamp_min(eps)
        B = (wy - trunc) / wx.clamp_min(eps)
        C = (wy + trunc) / wx.clamp_min(eps)
        # NOTE(review): the extent of this no_grad block was reconstructed from
        # a flattened source; it is assumed to end after `index = ...` below.
        with torch.no_grad():
            # Calculate prefix sum by orders of A, B, C
            A, A_argsort = A.sort(dim=-1)
            Q_A = torch.cumsum(torch.gather(wx, dim=-1, index=A_argsort), dim=-1)
            A, Q_A = _pad_inf(A), _pad_cumsum(Q_A)  # Pad [-inf, A1, ..., An, inf] and [0, Q1, ..., Qn, Qn] to handle edge cases.

            B, B_argsort = B.sort(dim=-1)
            Q_B = torch.cumsum(torch.gather(wx, dim=-1, index=B_argsort), dim=-1)
            B, Q_B = _pad_inf(B), _pad_cumsum(Q_B)

            C, C_argsort = C.sort(dim=-1)
            Q_C = torch.cumsum(torch.gather(wx, dim=-1, index=C_argsort), dim=-1)
            C, Q_C = _pad_inf(C), _pad_cumsum(Q_C)

            # Calculate left and right derivative of A
            j_A = torch.searchsorted(A, y_div_x, side='left').sub_(1)
            j_B = torch.searchsorted(B, y_div_x, side='left').sub_(1)
            j_C = torch.searchsorted(C, y_div_x, side='left').sub_(1)
            left_derivative = 2 * torch.gather(Q_A, dim=-1, index=j_A) - torch.gather(Q_B, dim=-1, index=j_B) - torch.gather(Q_C, dim=-1, index=j_C)
            j_A = torch.searchsorted(A, y_div_x, side='right').sub_(1)
            j_B = torch.searchsorted(B, y_div_x, side='right').sub_(1)
            j_C = torch.searchsorted(C, y_div_x, side='right').sub_(1)
            right_derivative = 2 * torch.gather(Q_A, dim=-1, index=j_A) - torch.gather(Q_B, dim=-1, index=j_B) - torch.gather(Q_C, dim=-1, index=j_C)

            # Find extrema
            is_extrema = (left_derivative < 0) & (right_derivative >= 0)
            is_extrema[..., 0] |= ~is_extrema.any(dim=-1)  # In case all derivatives are zero, take the first one as extrema.
            where_extrema_batch, where_extrema_index = torch.where(is_extrema)

            # Calculate objective value at extrema
            extrema_a = y_div_x[where_extrema_batch, where_extrema_index]  # (num_extrema,)
            MAX_ELEMENTS = 4096 ** 2  # Split into small batches to avoid OOM in case there are too many extrema.(~1G)
            SPLIT_SIZE = MAX_ELEMENTS // x.shape[-1]
            extrema_value = torch.cat([
                _compute_residual(extrema_a_split[:, None], xyw[extrema_i_split, :, :], trunc)
                for extrema_a_split, extrema_i_split in zip(extrema_a.split(SPLIT_SIZE), where_extrema_batch.split(SPLIT_SIZE))
            ])  # (num_extrema,)

            # Find minima among corresponding extrema
            minima, indices = scatter_min(size=batch_size, dim=0, index=where_extrema_batch, src=extrema_value)  # (batch_size,)
            index = where_extrema_index[indices]

        # Recompute `a` from the selected element so that gradients can flow.
        a = torch.gather(y, dim=-1, index=index[..., None]) / torch.gather(x, dim=-1, index=index[..., None]).clamp_min(eps)
        a = a.reshape(batch_shape)
        loss = minima.reshape(batch_shape)
        index = index.reshape(batch_shape)

    return a, loss, index
def align_depth_scale(depth_src: torch.Tensor, depth_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Align `depth_src` to `depth_tgt` with given constant weights.

    Thin wrapper around `align`: only the scale is returned, the loss and the
    index are discarded.

    ### Parameters:
    - `depth_src: torch.Tensor` of shape (..., N)
    - `depth_tgt: torch.Tensor` of shape (..., N)
    - `weight`: forwarded to `align` as `w`
    - `trunc`: forwarded to `align`

    ### Returns:
    - `scale`: first element of the tuple returned by `align`
    """
    scale, _, _ = align(depth_src, depth_tgt, weight, trunc)

    return scale


def align_depth_affine(depth_src: torch.Tensor, depth_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Align `depth_src` to `depth_tgt` with given constant weights.

    Every positively weighted sample is tried as an anchor: both depth vectors are
    shifted so that the anchor sits at the origin, the remaining scale is solved with
    `align`, and the anchor with the smallest loss is kept for each batch element.

    NOTE(review): `weight` is annotated Optional but `.reshape` is called on it
    unconditionally, so passing None fails.

    ### Parameters:
    - `depth_src: torch.Tensor` of shape (..., N)
    - `depth_tgt: torch.Tensor` of shape (..., N)
    - `weight: torch.Tensor` of shape (..., N)
    - `trunc: float` or tensor of shape (..., N) or None

    ### Returns:
    - `scale: torch.Tensor` of shape (...).
    - `shift: torch.Tensor` of shape (...).
    """
    dtype, device = depth_src.dtype, depth_src.device

    # Flatten batch dimensions for simplicity
    batch_shape, n = depth_src.shape[:-1], depth_src.shape[-1]
    batch_size = math.prod(batch_shape)
    depth_src, depth_tgt, weight = depth_src.reshape(batch_size, n), depth_tgt.reshape(batch_size, n), weight.reshape(batch_size, n)

    # Here, we take anchors only for non-zero weights.
    # Although the results will be still correct even anchor points have zero weight,
    # it is wasting computation and may cause instability in some cases, e.g. too many extrema.
    anchors_where_batch, anchors_where_n = torch.where(weight > 0)

    # Stop gradient when solving optimal anchors
    # NOTE(review): the extent of this no_grad block was reconstructed from a
    # flattened source; it is assumed to end after the `scatter_min` call.
    with torch.no_grad():
        depth_src_anchor = depth_src[anchors_where_batch, anchors_where_n]  # (anchors)
        depth_tgt_anchor = depth_tgt[anchors_where_batch, anchors_where_n]  # (anchors)

        depth_src_anchored = depth_src[anchors_where_batch, :] - depth_src_anchor[..., None]  # (anchors, n)
        depth_tgt_anchored = depth_tgt[anchors_where_batch, :] - depth_tgt_anchor[..., None]  # (anchors, n)
        weight_anchored = weight[anchors_where_batch, :]  # (anchors, n)

        scale, loss, index = align(depth_src_anchored, depth_tgt_anchored, weight_anchored, trunc)  # (anchors)

        # Best anchor per batch element.
        loss, index_anchor = scatter_min(size=batch_size, dim=0, index=anchors_where_batch, src=loss)  # (batch_size,)

    # Reproduce by indexing for shorter compute graph
    index_1 = anchors_where_n[index_anchor]  # (batch_size,)  position of the winning anchor
    index_2 = index[index_anchor]  # (batch_size,)  position selected by `align` for that anchor

    tgt_1, src_1 = torch.gather(depth_tgt, dim=1, index=index_1[..., None]).squeeze(-1), torch.gather(depth_src, dim=1, index=index_1[..., None]).squeeze(-1)
    tgt_2, src_2 = torch.gather(depth_tgt, dim=1, index=index_2[..., None]).squeeze(-1), torch.gather(depth_src, dim=1, index=index_2[..., None]).squeeze(-1)

    # Line through the two selected samples; the denominator falls back to 1e-7
    # when both source depths coincide.
    scale = (tgt_2 - tgt_1) / torch.where(src_2 != src_1, src_2 - src_1, 1e-7)
    shift = tgt_1 - scale * src_1

    scale, shift = scale.reshape(batch_shape), shift.reshape(batch_shape)

    return scale, shift
def align_depth_affine_irls(depth_src: torch.Tensor, depth_tgt: torch.Tensor, weight: Optional[torch.Tensor], max_iter: int = 100, eps: float = 1e-12):
    """
    Align `depth_src` to `depth_tgt` with given constant weights using IRLS.

    Iteratively re-weighted least squares for `min sum_i w_i * |a * x_i + b - y_i|`:
    each iteration solves a weighted least-squares problem and then divides the
    constant weights by the current absolute residuals (clamped to `eps`).

    Fixes: the caller's weights are no longer dropped after the first iteration, and
    leading batch dimensions are supported (the previous matmul only worked for 1-D
    inputs). `weight=None` means uniform weights.

    ### Parameters:
    - `depth_src: torch.Tensor` of shape (..., N)
    - `depth_tgt: torch.Tensor` of shape (..., N)
    - `weight: torch.Tensor` of shape (..., N) or None
    - `max_iter`: number of IRLS iterations, must be >= 1
    - `eps`: lower bound on the absolute residual used for re-weighting

    ### Returns:
    - `scale: torch.Tensor` of shape (...).
    - `shift: torch.Tensor` of shape (...).
    """
    if max_iter < 1:
        raise ValueError("max_iter must be >= 1")

    x = torch.stack([depth_src, torch.ones_like(depth_src)], dim=-1)  # (..., N, 2)
    y = depth_tgt
    base_w = torch.ones_like(depth_tgt) if weight is None else weight
    w = base_w

    for _ in range(max_iter):
        xtw = x.transpose(-1, -2) * w[..., None, :]  # (..., 2, N) = X^T W
        # Normal equations (X^T W X) beta = X^T W y.
        beta = torch.linalg.solve(xtw @ x, xtw @ y[..., None])[..., 0]  # (..., 2)
        residual = y - (x @ beta[..., None])[..., 0]
        w = base_w / residual.abs().clamp_min(eps)

    return beta[..., 0], beta[..., 1]


def align_points_scale(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Align `points_src` to `points_tgt` with a single scale shared by all coordinates.

    The point dimension and the coordinate dimension are flattened together and the
    per-point weight is repeated for each coordinate before calling `align`.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weight: torch.Tensor` of shape (..., N)

    ### Returns:
    - `scale: torch.Tensor` of shape (...). Only positive solutions are guaranteed. You should filter out negative scales before using it.
    """
    dtype, device = points_src.dtype, points_src.device

    scale, _, _ = align(points_src.flatten(-2), points_tgt.flatten(-2), weight[..., None].expand_as(points_src).flatten(-2), trunc)

    return scale


def align_points_scale_z_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None):
    """
    Align `points_src` to `points_tgt` with respect to a shared xyz scale and z shift.
    It is similar to `align_affine` but scale and shift are applied to different dimensions.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weights: torch.Tensor` of shape (..., N)

    ### Returns:
    - `scale: torch.Tensor` of shape (...).
    - `shift: torch.Tensor` of shape (..., 3). x and y shifts are zeros.
    """
    dtype, device = points_src.dtype, points_src.device

    # Flatten batch dimensions for simplicity
    batch_shape, n = points_src.shape[:-2], points_src.shape[-2]
    batch_size = math.prod(batch_shape)
    points_src, points_tgt, weight = points_src.reshape(batch_size, n, 3), points_tgt.reshape(batch_size, n, 3), weight.reshape(batch_size, n)

    # Take anchors
    anchor_where_batch, anchor_where_n = torch.where(weight > 0)
    # NOTE(review): the extent of this no_grad block was reconstructed from a
    # flattened source; it is assumed to end after the `scatter_min` call.
    with torch.no_grad():
        zeros = torch.zeros(anchor_where_batch.shape[0], device=device, dtype=dtype)
        # Anchors only carry a z component, so x and y are never shifted.
        points_src_anchor = torch.stack([zeros, zeros, points_src[anchor_where_batch, anchor_where_n, 2]], dim=-1)  # (anchors, 3)
        points_tgt_anchor = torch.stack([zeros, zeros, points_tgt[anchor_where_batch, anchor_where_n, 2]], dim=-1)  # (anchors, 3)

        points_src_anchored = points_src[anchor_where_batch, :, :] - points_src_anchor[..., None, :]  # (anchors, n, 3)
        points_tgt_anchored = points_tgt[anchor_where_batch, :, :] - points_tgt_anchor[..., None, :]  # (anchors, n, 3)
        weight_anchored = weight[anchor_where_batch, :, None].expand(-1, -1, 3)  # (anchors, n, 3)

        # Solve optimal scale and shift for each anchor
        MAX_ELEMENTS = 2 ** 20
        scale, loss, index = split_batch_fwd(align, MAX_ELEMENTS // n, points_src_anchored.flatten(-2), points_tgt_anchored.flatten(-2), weight_anchored.flatten(-2), trunc)  # (anchors,)

        loss, index_anchor = scatter_min(size=batch_size, dim=0, index=anchor_where_batch, src=loss)  # (batch_size,)

    # Reproduce by indexing for shorter compute graph
    index_2 = index[index_anchor]  # (batch_size,) [0, 3n)
    index_1 = anchor_where_n[index_anchor] * 3 + index_2 % 3  # (batch_size,) [0, 3n)

    zeros = torch.zeros((batch_size, n), device=device, dtype=dtype)
    points_tgt_00z, points_src_00z = torch.stack([zeros, zeros, points_tgt[..., 2]], dim=-1), torch.stack([zeros, zeros, points_src[..., 2]], dim=-1)
    tgt_1, src_1 = torch.gather(points_tgt_00z.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1), torch.gather(points_src_00z.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1)
    tgt_2, src_2 = torch.gather(points_tgt.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1), torch.gather(points_src.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1)

    scale = (tgt_2 - tgt_1) / torch.where(src_2 != src_1, src_2 - src_1, 1.0)
    shift = torch.gather(points_tgt_00z, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2) - scale[..., None] * torch.gather(points_src_00z, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2)
    scale, shift = scale.reshape(batch_shape), shift.reshape(*batch_shape, 3)

    return scale, shift
def align_points_scale_xyz_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None, max_iters: int = 30, eps: float = 1e-6):
    """
    Align `points_src` to `points_tgt` with respect to a shared xyz scale and an xyz shift.
    It is similar to `align_affine` but the shift is a full 3D vector.

    Every positively weighted point is tried as an anchor: both clouds are shifted so
    that the anchor sits at the origin, the remaining scale is solved with `align`, and
    the anchor with the smallest loss is kept for each batch element.

    `max_iters` and `eps` are accepted but not used in the body.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weights: torch.Tensor` of shape (..., N)

    ### Returns:
    - `scale: torch.Tensor` of shape (...).
    - `shift: torch.Tensor` of shape (..., 3)
    """
    dtype, device = points_src.dtype, points_src.device

    # Flatten batch dimensions for simplicity
    batch_shape, n = points_src.shape[:-2], points_src.shape[-2]
    batch_size = math.prod(batch_shape)
    points_src, points_tgt, weight = points_src.reshape(batch_size, n, 3), points_tgt.reshape(batch_size, n, 3), weight.reshape(batch_size, n)

    # Take anchors
    anchor_where_batch, anchor_where_n = torch.where(weight > 0)

    # NOTE(review): the extent of this no_grad block was reconstructed from a
    # flattened source; it is assumed to end after the `scatter_min` call.
    with torch.no_grad():
        points_src_anchor = points_src[anchor_where_batch, anchor_where_n]  # (anchors, 3)
        points_tgt_anchor = points_tgt[anchor_where_batch, anchor_where_n]  # (anchors, 3)

        points_src_anchored = points_src[anchor_where_batch, :, :] - points_src_anchor[..., None, :]  # (anchors, n, 3)
        points_tgt_anchored = points_tgt[anchor_where_batch, :, :] - points_tgt_anchor[..., None, :]  # (anchors, n, 3)
        weight_anchored = weight[anchor_where_batch, :, None].expand(-1, -1, 3)  # (anchors, n, 3)

        # Solve optimal scale and shift for each anchor
        MAX_ELEMENTS = 2 ** 20
        # NOTE(review): the sibling `align_points_scale_z_shift` chunks by
        # `MAX_ELEMENTS // n`; `// 2` here makes the chunk size independent of
        # n. It only affects memory use, not the result. Confirm it is intended.
        scale, loss, index = split_batch_fwd(align, MAX_ELEMENTS // 2, points_src_anchored.flatten(-2), points_tgt_anchored.flatten(-2), weight_anchored.flatten(-2), trunc)  # (anchors,)

        # Get optimal scale and shift for each batch element
        loss, index_anchor = scatter_min(size=batch_size, dim=0, index=anchor_where_batch, src=loss)  # (batch_size,)

    index_2 = index[index_anchor]  # (batch_size,) [0, 3n)
    # Same coordinate axis as index_2, but taken at the winning anchor point.
    index_1 = anchor_where_n[index_anchor] * 3 + index_2 % 3  # (batch_size,) [0, 3n)

    src_1, tgt_1 = torch.gather(points_src.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1), torch.gather(points_tgt.flatten(-2), dim=1, index=index_1[..., None]).squeeze(-1)
    src_2, tgt_2 = torch.gather(points_src.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1), torch.gather(points_tgt.flatten(-2), dim=1, index=index_2[..., None]).squeeze(-1)

    # Denominator falls back to 1.0 when both source coordinates coincide.
    scale = (tgt_2 - tgt_1) / torch.where(src_2 != src_1, src_2 - src_1, 1.0)
    # shift = anchor_tgt - scale * anchor_src, using the full 3D anchor point.
    shift = torch.gather(points_tgt, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2) - scale[..., None] * torch.gather(points_src, dim=1, index=(index_1 // 3)[..., None, None].expand(-1, -1, 3)).squeeze(-2)

    scale, shift = scale.reshape(batch_shape), shift.reshape(*batch_shape, 3)

    return scale, shift
def align_points_z_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None, max_iters: int = 30, eps: float = 1e-6):
    """
    Align `points_src` to `points_tgt` with respect to a Z-axis shift.

    The shift is obtained from `align` with `x = 1` and `y = z_tgt - z_src`.
    `max_iters` and `eps` are accepted but not used in the body.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weights: torch.Tensor` of shape (..., N)

    ### Returns:
    - `shift: torch.Tensor` of shape (..., 3). x and y shifts are zeros.
    """
    dtype, device = points_src.dtype, points_src.device

    shift, _, _ = align(torch.ones_like(points_src[..., 2]), points_tgt[..., 2] - points_src[..., 2], weight, trunc)
    shift = torch.stack([torch.zeros_like(shift), torch.zeros_like(shift), shift], dim=-1)

    return shift


def align_points_xyz_shift(points_src: torch.Tensor, points_tgt: torch.Tensor, weight: Optional[torch.Tensor], trunc: Optional[Union[float, torch.Tensor]] = None, max_iters: int = 30, eps: float = 1e-6):
    """
    Align `points_src` to `points_tgt` with respect to an independent shift per axis.

    The last two axes are swapped so that each coordinate axis becomes its own
    `align` problem over the N points, with `x = 1` and `y = tgt - src`.
    `max_iters` and `eps` are accepted but not used in the body.

    ### Parameters:
    - `points_src: torch.Tensor` of shape (..., N, 3)
    - `points_tgt: torch.Tensor` of shape (..., N, 3)
    - `weights: torch.Tensor` of shape (..., N)

    ### Returns:
    - `shift: torch.Tensor` of shape (..., 3)
    """
    dtype, device = points_src.dtype, points_src.device

    shift, _, _ = align(torch.ones_like(points_src).swapaxes(-2, -1), (points_tgt - points_src).swapaxes(-2, -1), weight[..., None, :], trunc)

    return shift
def align_affine_lstsq(x: torch.Tensor, y: torch.Tensor, w: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Solve `min sum_i w_i * (a * x_i + b - y_i ) ^ 2`, where `a` and `b` are scalars, with respect to `a` and `b` using least squares.

    Each residual is multiplied by `sqrt(w_i)`, which scales both columns of the design
    matrix and the right-hand side. Fix: the intercept column used to be left unweighted,
    so the result was wrong for non-uniform weights.

    ### Parameters:
    - `x: torch.Tensor` of shape (..., N)
    - `y: torch.Tensor` of shape (..., N)
    - `w: torch.Tensor` of shape (..., N), or None for uniform weights

    ### Returns:
    - `a: torch.Tensor` of shape (...,)
    - `b: torch.Tensor` of shape (...,)
    """
    w_sqrt = torch.ones_like(x) if w is None else w.sqrt()
    A = torch.stack([w_sqrt * x, w_sqrt], dim=-1)  # (..., N, 2)
    B = (w_sqrt * y)[..., None]  # (..., N, 1)
    a, b = torch.linalg.lstsq(A, B)[0].squeeze(-1).unbind(-1)
    return a, b


# ---- diff file boundary contained in this span (kept as comments) ----
# diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/blocks.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/blocks.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..5ee03a37aa5173a23969a1096208c8442a66a043
# --- /dev/null
# +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/blocks.py
# @@ -0,0 +1,531 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
#
# --------------------------------------------------------
# modified from DUSt3R

import torch
import torch.nn as nn

from itertools import repeat
import collections.abc
from torch.nn.functional import scaled_dot_product_attention
from functools import partial


def _ntuple(n):
    """Return a converter turning a scalar into an n-tuple.

    Iterables other than strings are passed through unchanged.
    """

    def parse(x):
        is_sequence = isinstance(x, collections.abc.Iterable) and not isinstance(x, str)
        return x if is_sequence else tuple(repeat(x, n))

    return parse


to_2tuple = _ntuple(2)


def drop_path(
    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
):
    """Stochastic depth: zero whole samples of `x` with probability `drop_prob`.

    Only active when `training` is true and `drop_prob` is non-zero. One
    Bernoulli draw is made per entry of the first dimension; surviving
    samples are divided by the keep probability when `scale_by_keep` is set.
    """
    if not training or drop_prob == 0.0:
        return x
    keep_prob = 1 - drop_prob
    # One mask value per sample, broadcast over all remaining dimensions.
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if scale_by_keep and keep_prob > 0.0:
        mask.div_(keep_prob)
    return x * mask


class DropPath(nn.Module):
    """Module wrapper around `drop_path`, driven by the module's training flag."""

    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob
        self.scale_by_keep = scale_by_keep

    def forward(self, x):
        return drop_path(
            x,
            drop_prob=self.drop_prob,
            training=self.training,
            scale_by_keep=self.scale_by_keep,
        )

    def extra_repr(self):
        return f"drop_prob={round(self.drop_prob,3):0.3f}"


class Mlp(nn.Module):
    """Two-layer perceptron: Linear -> activation -> Dropout -> Linear -> Dropout.

    `bias` and `drop` may each be a single value or a pair (first layer,
    second layer). Hidden and output widths default to `in_features`.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        bias=True,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        bias_1, bias_2 = to_2tuple(bias)
        drop_1, drop_2 = to_2tuple(drop)

        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias_1)
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_1)
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias_2)
        self.drop2 = nn.Dropout(drop_2)

    def forward(self, x):
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))
+ self.drop1 = nn.Dropout(drop_probs[0]) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + return self.drop2(self.fc2(self.drop1(self.act(self.fc1(x))))) + + +class Attention(nn.Module): + + def __init__( + self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0 + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.rope = rope.float() if rope is not None else None + + def forward(self, x, xpos): + B, N, C = x.shape + + qkv = ( + self.qkv(x) + .reshape(B, N, 3, self.num_heads, C // self.num_heads) + .transpose(1, 3) + ) + q, k, v = [qkv[:, :, i] for i in range(3)] + + q_type = q.dtype + k_type = k.dtype + if self.rope is not None: + q = q.float() + k = k.float() + with torch.autocast(device_type="cuda", enabled=False): + q = self.rope(q, xpos) + k = self.rope(k, xpos) + q = q.to(q_type) + k = k.to(k_type) + + x = ( + scaled_dot_product_attention( + query=q, key=k, value=v, dropout_p=self.attn_drop.p, scale=self.scale + ) + .transpose(1, 2) + .reshape(B, N, C) + ) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + rope=None, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + rope=rope, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + 
hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + def forward(self, x, xpos): + x = x + self.drop_path(self.attn(self.norm1(x), xpos)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class CrossAttention(nn.Module): + + def __init__( + self, dim, rope=None, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0 + ): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.projq = nn.Linear(dim, dim, bias=qkv_bias) + self.projk = nn.Linear(dim, dim, bias=qkv_bias) + self.projv = nn.Linear(dim, dim, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.rope = rope.float() if rope is not None else None + + def forward(self, query, key, value, qpos, kpos): + B, Nq, C = query.shape + Nk = key.shape[1] + Nv = value.shape[1] + + q = ( + self.projq(query) + .reshape(B, Nq, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + k = ( + self.projk(key) + .reshape(B, Nk, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + v = ( + self.projv(value) + .reshape(B, Nv, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + + q_type = q.dtype + k_type = k.dtype + if self.rope is not None: + if qpos is not None: + q = q.float() + with torch.autocast(device_type="cuda", enabled=False): + q = self.rope(q, qpos) + q = q.to(q_type) + + if kpos is not None: + k = k.float() + with torch.autocast(device_type="cuda", enabled=False): + k = self.rope(k, kpos) + k = k.to(k_type) + + x = ( + scaled_dot_product_attention( + query=q, key=k, value=v, dropout_p=self.attn_drop.p, scale=self.scale + ) + .transpose(1, 2) + .reshape(B, Nq, C) + ) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class DecoderBlock(nn.Module): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + 
act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + norm_mem=True, + rope=None, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + rope=rope, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.cross_attn = CrossAttention( + dim, + rope=rope, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + self.norm3 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + self.norm_y = norm_layer(dim) if norm_mem else nn.Identity() + + def forward(self, x, y, xpos, ypos): + x = x + self.drop_path(self.attn(self.norm1(x), xpos)) + y_ = self.norm_y(y) + x = x + self.drop_path(self.cross_attn(self.norm2(x), y_, y_, xpos, ypos)) + x = x + self.drop_path(self.mlp(self.norm3(x))) + return x, y + + +class CustomDecoderBlock(nn.Module): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + norm_mem=True, + rope=None, + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + rope=rope, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.cross_attn = CrossAttention( + dim, + rope=rope, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + self.norm3 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + self.norm_y = norm_layer(dim) if norm_mem else nn.Identity() + self.norm_z = norm_layer(dim) if 
class PositionGetter(object):
    """Return the integer (row, column) coordinates of every patch in a grid.

    Coordinate tables are cached per ``(h, w, device)`` so repeated calls with
    the same grid on the same device reuse one tensor.
    """

    def __init__(self):
        # (h, w, str(device)) -> tensor of shape (h * w, 2)
        self.cache_positions = {}

    def __call__(self, b, h, w, device):
        """Return a fresh ``(b, h * w, 2)`` tensor of (row, column) indices.

        Args:
            b: batch size to expand the table to.
            h: number of grid rows.
            w: number of grid columns.
            device: device on which the positions are created.
        """
        # The device is part of the key: keying on (h, w) alone would hand a
        # tensor living on the first-seen device to callers on another one.
        key = (h, w, str(device))
        if key not in self.cache_positions:
            x = torch.arange(w, device=device)
            y = torch.arange(h, device=device)
            self.cache_positions[key] = torch.cartesian_prod(y, x)  # (h * w, 2)
        # clone() so callers can modify the result without touching the cache.
        pos = self.cache_positions[key].view(1, h * w, 2).expand(b, -1, 2).clone()
        return pos
torch.manual_seed(0) + + enc_blocks_ray_map = ( + nn.ModuleList( + [ + Block( + 768, + 16, + 4, + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + rope=RoPE2D(100), + ) + for _ in range(2) + ] + ) + .cuda() + .train() + ) + + x = torch.randn(2, 196, 768, requires_grad=True).cuda() + xpos = torch.arange(0, 196).unsqueeze(0).unsqueeze(-1).repeat(2, 1, 2).cuda().long() + enc_blocks_ray_map.zero_grad() + for blk in enc_blocks_ray_map: + + x = checkpoint(blk, x, xpos) + enc_blocks_ray_map.zero_grad() + x.sum().backward() + + grad_not_checkpointed = {} + for name, param in enc_blocks_ray_map.named_parameters(): + grad_not_checkpointed[name] = param.grad.data.clone() + print(name, grad_not_checkpointed[name]) + break diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/__init__.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7efad5b728988b807497c6a70c1b5de61904cc7d --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/__init__.py @@ -0,0 +1,91 @@ +from .utils.transforms import * +from .base.batched_sampler import BatchedRandomSampler # noqa +from .arkitscenes import ARKitScenes_Multi # noqa +from .arkitscenes_highres import ARKitScenesHighRes_Multi +from .bedlam import BEDLAM_Multi +from .blendedmvs import BlendedMVS_Multi # noqa +from .co3d import Co3d_Multi # noqa +from .cop3d import Cop3D_Multi +from .dl3dv import DL3DV_Multi +from .dynamic_replica import DynamicReplica +from .eden import EDEN_Multi +from .hypersim import HyperSim_Multi +from .hoi4d import HOI4D_Multi +from .irs import IRS +from .mapfree import MapFree_Multi +from .megadepth import MegaDepth_Multi # noqa +from .mp3d import MP3D_Multi +from .mvimgnet import MVImgNet_Multi +from .mvs_synth import MVS_Synth_Multi +from .omniobject3d import OmniObject3D_Multi +from .pointodyssey import PointOdyssey_Multi +from .realestate10k import RE10K_Multi +from .scannet 
def get_data_loader(
    dataset,
    batch_size,
    num_workers=8,
    shuffle=True,
    drop_last=True,
    pin_mem=True,
    accelerator: "Accelerator" = None,
    fixed_length=False,
):
    """Build a ``torch.utils.data.DataLoader`` for ``dataset``.

    If the dataset offers ``make_sampler``, the batch sampler it returns drives
    batching (``batch_size``, ``shuffle`` and ``drop_last`` are forwarded to
    ``make_sampler`` rather than to the loader). Otherwise a plain loader with
    ``batch_size`` / ``shuffle`` / ``drop_last`` is returned.

    Args:
        dataset: a dataset object, or a string that is evaluated to build one.
        batch_size: number of samples per batch.
        num_workers: number of loader worker processes.
        shuffle: whether to shuffle.
        drop_last: whether to drop an incomplete final batch.
        pin_mem: forwarded to the loader as ``pin_memory``.
        accelerator: object exposing ``num_processes``; when ``None`` a world
            size of 1 is passed to ``make_sampler``.
        fixed_length: forwarded to ``make_sampler``.

    Returns:
        The constructed ``DataLoader``.
    """
    import torch

    # pytorch dataset
    if isinstance(dataset, str):
        # NOTE(review): eval() executes arbitrary code. Only pass dataset
        # strings that come from trusted configuration.
        dataset = eval(dataset)

    # Without this guard a missing accelerator raised AttributeError below and
    # silently discarded the dataset's own batch sampler.
    world_size = accelerator.num_processes if accelerator is not None else 1

    try:
        sampler = dataset.make_sampler(
            batch_size,
            shuffle=shuffle,
            drop_last=drop_last,
            world_size=world_size,
            fixed_length=fixed_length,
        )
    except (AttributeError, NotImplementedError):
        # Dataset has no (usable) custom sampler: fall back to default batching.
        return torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
            pin_memory=pin_mem,
            drop_last=drop_last,
        )

    # Built outside the try block so loader-construction errors are not
    # mistaken for "no sampler available".
    return torch.utils.data.DataLoader(
        dataset,
        batch_sampler=sampler,
        num_workers=num_workers,
        pin_memory=pin_mem,
    )
def stratified_sampling(indices, num_samples, rng=None):
    """Draw ``num_samples`` indices, one from each of equally sized strata.

    The indices are sorted and cut into ``num_samples`` consecutive strata; one
    element is drawn at random from each stratum and the picks are returned in
    random order.

    Args:
        indices: sized collection of sortable indices.
        num_samples: number of indices to draw.
        rng: ``numpy.random.Generator``; a fresh default one is used if ``None``.

    Returns:
        ``indices`` itself (unchanged) when ``num_samples == len(indices)``,
        an empty array when ``num_samples <= 0``, otherwise a shuffled array of
        the sampled indices.

    Raises:
        ValueError: if ``num_samples`` exceeds ``len(indices)``.
    """
    if num_samples > len(indices):
        raise ValueError("num_samples cannot exceed the number of available indices.")
    elif num_samples == len(indices):
        return indices
    if num_samples <= 0:
        # Nothing to draw; also avoids dividing by zero for the stride below.
        return np.array([], dtype=np.int64)

    sorted_indices = sorted(indices)
    stride = len(sorted_indices) / num_samples
    sampled_indices = []
    if rng is None:
        rng = np.random.default_rng()

    for i in range(num_samples):
        start = int(i * stride)
        # Ensure end does not exceed the list
        end = min(int((i + 1) * stride), len(sorted_indices))
        if start < end:
            # Randomly select within the current stratum
            rand_idx = rng.integers(start, end)
            sampled_indices.append(sorted_indices[rand_idx])
        else:
            # In case of any rounding issues, select the last index
            sampled_indices.append(sorted_indices[-1])

    return rng.permutation(sampled_indices)
"Test",#"Validation", + ) + ) + if os.path.join(self.ROOT, split, d) + #if os.path.join(self.ROOT + "_highres", split, d) + ] + ) + self.scenes = np.setdiff1d(self.scenes, high_res_list) + ''' + offset = 0 + counts = [] + scenes = [] + sceneids = [] + images = [] + intrinsics = [] + trajectories = [] + groups = [] + id_ranges = [] + j = 0 + for scene_idx, scene in enumerate(self.scenes): + scene_dir = osp.join(self.ROOT, self.split, scene) + with np.load( + osp.join(scene_dir, "new_scene_metadata.npz"), allow_pickle=True + ) as data: + imgs = data["images"] + intrins = data["intrinsics"] + traj = data["trajectories"] + min_seq_len = ( + self.num_views + if not self.allow_repeat + else max(self.num_views // 3, 3) + ) + if len(imgs) < min_seq_len: + print(f"Skipping {scene}") + continue + + collections = {} + assert "image_collection" in data, "Image collection not found" + collections["image"] = data["image_collection"] + + num_imgs = imgs.shape[0] + img_groups = [] + min_group_len = ( + self.num_views + if not self.allow_repeat + else max(self.num_views // 3, 3) + ) + for ref_id, group in collections["image"].item().items(): + if len(group) + 1 < min_group_len: + continue + + # groups are (idx, score)s + group.insert(0, (ref_id, 1.0)) + group = [int(x[0] + offset) for x in group] + img_groups.append(sorted(group)) + + if len(img_groups) == 0: + print(f"Skipping {scene}") + continue + + scenes.append(scene) + sceneids.extend([j] * num_imgs) + id_ranges.extend([(offset, offset + num_imgs) for _ in range(num_imgs)]) + images.extend(imgs) + K = np.expand_dims(np.eye(3), 0).repeat(num_imgs, 0) + + K[:, 0, 0] = [fx for _, _, fx, _, _, _ in intrins] + K[:, 1, 1] = [fy for _, _, _, fy, _, _ in intrins] + K[:, 0, 2] = [cx for _, _, _, _, cx, _ in intrins] + K[:, 1, 2] = [cy for _, _, _, _, _, cy in intrins] + intrinsics.extend(list(K)) + trajectories.extend(list(traj)) + + # offset groups + groups.extend(img_groups) + counts.append(offset) + offset += num_imgs + j += 1 + + 
self.scenes = scenes + self.sceneids = sceneids + self.id_ranges = id_ranges + self.images = images + self.intrinsics = intrinsics + self.trajectories = trajectories + self.groups = groups + + def __len__(self): + return len(self.groups) + + def get_image_num(self): + return len(self.images) + + def _get_views(self, idx, resolution, rng, num_views): + + if rng.choice([True, False]): + image_idxs = np.arange(self.id_ranges[idx][0], self.id_ranges[idx][1]) + cut_off = num_views if not self.allow_repeat else max(num_views // 3, 3) + start_image_idxs = image_idxs[: len(image_idxs) - cut_off + 1] + start_id = rng.choice(start_image_idxs) + pos, ordered_video = self.get_seq_from_start_id( + num_views, + start_id, + image_idxs.tolist(), + rng, + max_interval=self.max_interval, + video_prob=0.8, + fix_interval_prob=0.5, + block_shuffle=16, + ) + image_idxs = np.array(image_idxs)[pos] + else: + ordered_video = False + image_idxs = self.groups[idx] + image_idxs = rng.permutation(image_idxs) + if len(image_idxs) > num_views: + image_idxs = image_idxs[:num_views] + else: + if rng.random() < 0.8: + image_idxs = rng.choice(image_idxs, size=num_views, replace=True) + else: + repeat_num = num_views // len(image_idxs) + 1 + image_idxs = np.tile(image_idxs, repeat_num)[:num_views] + + views = [] + for v, view_idx in enumerate(image_idxs): + scene_id = self.sceneids[view_idx] + scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id]) + + intrinsics = self.intrinsics[view_idx] + camera_pose = self.trajectories[view_idx] + basename = self.images[view_idx] + assert ( + basename[:8] == self.scenes[scene_id] + ), f"{basename}, {self.scenes[scene_id]}" + # print(scene_dir, basename) + # Load RGB image + rgb_image = imread_pil( + osp.join(scene_dir, "vga_wide", basename.replace(".png", ".jpg")) + ) + # Load depthmap + depthmap = imread_cv2( + osp.join(scene_dir, "lowres_depth", basename), cv2.IMREAD_UNCHANGED + ) + depthmap = depthmap.astype(np.float32) / 1000.0 + 
class ARKitScenesHighRes_Multi(BaseMultiViewDataset):
    """Multi-view dataset reading the ``highres_depth`` ARKitScenes layout.

    Every scene directory under ``ROOT/<split>`` must hold a
    ``scene_metadata.npz`` (keys ``images``, ``intrinsics``, ``trajectories``)
    plus ``vga_wide`` and ``highres_depth`` image folders. Each sample is a
    sequence of views that starts at one image of one scene.
    """

    def __init__(self, *args, split, ROOT, **kwargs):
        """Index the dataset.

        Args:
            split: ``"train"`` (folder ``Training``) or ``"test"`` (folder
                ``Validation``).
            ROOT: dataset root directory.
            *args, **kwargs: forwarded to ``BaseMultiViewDataset``.

        Raises:
            ValueError: if ``split`` is neither ``"train"`` nor ``"test"``.
        """
        self.ROOT = ROOT
        self.video = True
        self.max_interval = 8
        self.is_metric = True
        super().__init__(*args, **kwargs)
        if split == "train":
            self.split = "Training"
        elif split == "test":
            self.split = "Validation"
        else:
            raise ValueError(f"Unknown split {split!r}; expected 'train' or 'test'.")

        # _load_data fills attributes on self and returns None.
        self.loaded_data = self._load_data(self.split)

    def _load_data(self, split):
        """Scan ``ROOT/split`` and build the flat per-image index tables.

        Populates ``scenes``, ``sceneids``, ``images``, ``scene_img_list``,
        ``intrinsics``, ``trajectories`` and ``start_img_ids`` on ``self``.
        Scenes with fewer images than the required sequence length are skipped.
        """
        split_dir = osp.join(self.ROOT, split)
        all_scenes = sorted(
            d for d in os.listdir(split_dir) if osp.isdir(osp.join(split_dir, d))
        )
        offset = 0
        scenes = []
        sceneids = []
        images = []
        start_img_ids = []
        scene_img_list = []
        timestamps = []
        intrinsics = []
        trajectories = []
        scene_id = 0
        for scene in all_scenes:
            # Use the `split` argument here too, so listing and loading can
            # never point at different folders.
            scene_dir = osp.join(split_dir, scene)
            with np.load(osp.join(scene_dir, "scene_metadata.npz")) as data:
                # Sort frames by file name but remember their original row so
                # intrinsics / trajectories can be reordered identically.
                imgs_with_indices = sorted(
                    enumerate(data["images"]), key=lambda x: x[1]
                )
                imgs = [x[1] for x in imgs_with_indices]
                cut_off = (
                    self.num_views
                    if not self.allow_repeat
                    else max(self.num_views // 3, 3)
                )
                if len(imgs) < cut_off:
                    print(f"Skipping {scene}")
                    continue
                indices = [x[0] for x in imgs_with_indices]
                # Parsed from "<scene>_<timestamp>.<ext>"-style names.
                # NOTE(review): timestamps are collected but never stored on
                # self — confirm whether they are still needed.
                tsps = np.array(
                    [float(img_name.split("_")[1][:-4]) for img_name in imgs]
                )
                # all(...) is required: asserting on the bare list was always
                # true for any non-empty scene, so nothing was ever checked.
                assert all(img[:8] == scene for img in imgs), f"{scene}, {imgs}"
                num_imgs = data["images"].shape[0]
                img_ids = list(np.arange(num_imgs) + offset)
                # Only starts that leave at least `cut_off` frames are valid.
                start_img_ids_ = img_ids[: num_imgs - cut_off + 1]

                scenes.append(scene)
                scene_img_list.append(img_ids)
                sceneids.extend([scene_id] * num_imgs)
                images.extend(imgs)
                start_img_ids.extend(start_img_ids_)
                timestamps.extend(tsps)

                # Each intrinsics row unpacks to 6 values; the last four are
                # used as fx, fy, cx, cy of a 3x3 camera matrix.
                K = np.expand_dims(np.eye(3), 0).repeat(num_imgs, 0)
                intrins = data["intrinsics"][indices]
                K[:, 0, 0] = [fx for _, _, fx, _, _, _ in intrins]
                K[:, 1, 1] = [fy for _, _, _, fy, _, _ in intrins]
                K[:, 0, 2] = [cx for _, _, _, _, cx, _ in intrins]
                K[:, 1, 2] = [cy for _, _, _, _, _, cy in intrins]
                intrinsics.extend(list(K))
                trajectories.extend(list(data["trajectories"][indices]))

                # offset groups
                offset += num_imgs
                scene_id += 1

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.scene_img_list = scene_img_list
        self.intrinsics = intrinsics
        self.trajectories = trajectories
        self.start_img_ids = start_img_ids
        assert len(self.images) == len(self.intrinsics) == len(self.trajectories)

    def __len__(self):
        """Number of samples, i.e. of valid sequence start images."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Total number of indexed images across all kept scenes."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load the ``num_views`` views of sample ``idx``.

        Args:
            idx: sample index into ``start_img_ids``.
            resolution: target resolution passed to ``_crop_resize_if_necessary``.
            rng: random generator passed to the base-class helpers.
            num_views: number of views to return.

        Returns:
            List of ``num_views`` dicts, one per view.
        """
        start_id = self.start_img_ids[idx]
        all_image_ids = self.scene_img_list[self.sceneids[start_id]]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            start_id,
            all_image_ids,
            rng,
            max_interval=self.max_interval,
            block_shuffle=16,
        )
        image_idxs = np.array(all_image_ids)[pos]

        views = []

        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_dir = osp.join(self.ROOT, self.split, self.scenes[scene_id])

            intrinsics = self.intrinsics[view_idx]
            camera_pose = self.trajectories[view_idx]
            basename = self.images[view_idx]
            assert (
                basename[:8] == self.scenes[scene_id]
            ), f"{basename}, {self.scenes[scene_id]}"
            # Load RGB image
            rgb_image = imread_cv2(
                osp.join(scene_dir, "vga_wide", basename.replace(".png", ".jpg"))
            )
            # Load depthmap
            depthmap = imread_cv2(
                osp.join(scene_dir, "highres_depth", basename), cv2.IMREAD_UNCHANGED
            )
            # presumably stored in millimetres, converted to metres — TODO confirm
            depthmap = depthmap.astype(np.float32) / 1000.0
            depthmap[~np.isfinite(depthmap)] = 0  # invalid

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.7, 0.25, 0.05]
            )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap.astype(np.float32),
                    camera_pose=camera_pose.astype(np.float32),
                    camera_intrinsics=intrinsics.astype(np.float32),
                    dataset="arkitscenes_highres",
                    label=self.scenes[scene_id] + "_" + basename,
                    instance=f"{str(idx)}_{str(view_idx)}",
                    is_metric=self.is_metric,
                    is_video=ordered_video,
                    quantile=np.array(0.99, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
"20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000082", + "20221013_3_250_batch01hand_orbit_bigOffice_seq_000120", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000324", + "20221013_3_250_batch01hand_static_bigOffice_seq_000038", + "20221012_3-10_500_batch01hand_zoom_highSchoolGym_seq_000486", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000421", + "20221013_3_250_batch01hand_orbit_bigOffice_seq_000226", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000012", + "20221013_3_250_batch01hand_orbit_bigOffice_seq_000149", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000311", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000080", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000122", + "20221012_3-10_500_batch01hand_zoom_highSchoolGym_seq_000079", + "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000077", + "20221014_3_250_batch01hand_orbit_archVizUI3_time15_seq_000095", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000062", + "20221013_3_250_batch01hand_static_bigOffice_seq_000015", + "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000095", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000119", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000297", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000011", + "20221013_3_250_batch01hand_orbit_bigOffice_seq_000196", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000316", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000283", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000085", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000287", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000163", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000804", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000842", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000027", + "20221013_3_250_batch01hand_orbit_bigOffice_seq_000182", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000982", + 
"20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000029", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000031", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000025", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000250", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000785", + "20221024_10_100_batch01handhair_zoom_suburb_d_seq_000069", + "20221013_3_250_batch01hand_orbit_bigOffice_seq_000122", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000246", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000352", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000425", + "20221013_3_250_batch01hand_orbit_bigOffice_seq_000192", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000900", + "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000043", + "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000063", + "20221014_3_250_batch01hand_orbit_archVizUI3_time15_seq_000096", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000091", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000013", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000309", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000114", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000969", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000361", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000267", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000083", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000383", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000890", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000003", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000045", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000317", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000076", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000082", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000907", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000279", + 
"20221019_3-8_250_highbmihand_orbit_stadium_seq_000076", + "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000004", + "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000061", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000811", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000800", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000841", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000794", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000308", + "20221024_10_100_batch01handhair_zoom_suburb_d_seq_000064", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000284", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000752", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000269", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000036", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000419", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000290", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000322", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000818", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000327", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000326", + "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000002", + "20221024_10_100_batch01handhair_zoom_suburb_d_seq_000060", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000348", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000059", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000016", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000817", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000332", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000094", + "20221013_3_250_batch01hand_orbit_bigOffice_seq_000193", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000779", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000177", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000368", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000023", + 
"20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000024", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000310", + "20221014_3_250_batch01hand_orbit_archVizUI3_time15_seq_000086", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000038", + "20221024_10_100_batch01handhair_zoom_suburb_d_seq_000071", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000768", + "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000017", + "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000053", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000097", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000856", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000827", + "20221013_3_250_batch01hand_orbit_bigOffice_seq_000161", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000084", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000106", + "20221013_3_250_batch01hand_orbit_bigOffice_seq_000207", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000007", + "20221024_3-10_100_batch01handhair_static_highSchoolGym_seq_000013", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000251", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000796", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000105", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000251", + "20221019_3-8_250_highbmihand_orbit_stadium_seq_000046", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000334", + "20221019_3-8_1000_highbmihand_static_suburb_d_seq_000453", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000373", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000283", + "20221010_3-10_500_batch01hand_zoom_suburb_d_seq_000249", +] +hdri_scenes = [ + "20221010_3_1000_batch01hand", + "20221017_3_1000_batch01hand", + "20221018_3-8_250_batch01hand", + "20221019_3_250_highbmihand", +] + + +class BEDLAM_Multi(BaseMultiViewDataset): + def __init__(self, *args, ROOT, **kwargs): + self.ROOT = ROOT + self.pose_root = os.path.join( + 
os.path.dirname(ROOT), f"{os.path.basename(ROOT)}_pose" + ) + assert os.path.exists(self.pose_root) + self.video = True + self.is_metric = True + self.max_interval = 4 + super().__init__(*args, **kwargs) + self.loaded_data = self._load_data() + + def _load_data(self): + self.scenes = os.listdir(self.ROOT) + + offset = 0 + scenes = [] + sceneids = [] + scene_img_list = [] + images = [] + start_img_ids = [] + + j = 0 + for scene in tqdm(self.scenes): + if scene in invalid_seqs: + continue + if any([scene.startswith(x) for x in hdri_scenes]): + continue + if "closeup" in scene: + continue + scene_dir = osp.join(self.ROOT, scene) + rgb_dir = osp.join(scene_dir, "rgb") + basenames = sorted( + [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")] + ) + num_imgs = len(basenames) + img_ids = list(np.arange(num_imgs) + offset) + cut_off = ( + self.num_views if not self.allow_repeat else max(self.num_views // 3, 3) + ) + if num_imgs < cut_off: + print(f"Skipping {scene}") + continue + start_img_ids_ = img_ids[: num_imgs - cut_off + 1] + + start_img_ids.extend(start_img_ids_) + sceneids.extend([j] * num_imgs) + images.extend(basenames) + scenes.append(scene) + scene_img_list.append(img_ids) + + # offset groups + offset += num_imgs + j += 1 + + self.scenes = scenes + assert len(set(self.scenes) - set(os.listdir(self.pose_root))) == 0 + self.sceneids = sceneids + self.images = images + self.start_img_ids = start_img_ids + self.scene_img_list = scene_img_list + + def __len__(self): + return len(self.start_img_ids) + + def get_image_num(self): + return len(self.images) + + def _get_views(self, idx, resolution, rng, num_views): + start_id = self.start_img_ids[idx] + all_image_ids = self.scene_img_list[self.sceneids[start_id]] + pos, ordered_video = self.get_seq_from_start_id( + num_views, + start_id, + all_image_ids, + rng, + max_interval=self.max_interval, + video_prob=1.0, + fix_interval_prob=1.0, + ) + image_idxs = np.array(all_image_ids)[pos] + + views = [] + for v, 
view_idx in enumerate(image_idxs): + scene_id = self.sceneids[view_idx] + scene_dir = osp.join(self.ROOT, self.scenes[scene_id]) + rgb_dir = osp.join(scene_dir, "rgb") + depth_dir = osp.join(scene_dir, "depth") + cam_dir = osp.join(osp.join(self.pose_root, self.scenes[scene_id]), "cam") + + basename = self.images[view_idx] + + # Load RGB image + rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png")) + # Load depthmap + depthmap = np.load(osp.join(depth_dir, basename + ".npy")) + depthmap[~np.isfinite(depthmap)] = 0 # invalid + depthmap[depthmap > 200.0] = 0.0 + + cam = np.load(osp.join(cam_dir, basename + ".npz")) + camera_pose = cam["pose"] + intrinsics = cam["intrinsics"] + rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary( + rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx + ) + + # generate img mask and raymap mask + img_mask, ray_mask = self.get_img_and_ray_masks( + self.is_metric, v, rng, p=[0.85, 0.10, 0.05] + ) + + views.append( + dict( + img=rgb_image, + depthmap=depthmap.astype(np.float32), + camera_pose=camera_pose.astype(np.float32), + camera_intrinsics=intrinsics.astype(np.float32), + dataset="BEDLAM", + label=self.scenes[scene_id] + "_" + basename, + instance=osp.join(rgb_dir, basename + ".png"), + is_metric=self.is_metric, + is_video=ordered_video, + quantile=np.array(1, dtype=np.float32), + img_mask=img_mask, + ray_mask=ray_mask, + camera_only=False, + depth_only=False, + single_view=False, + reset=False, + ) + ) + assert len(views) == num_views + return views diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/blendedmvs.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/blendedmvs.py new file mode 100644 index 0000000000000000000000000000000000000000..f43874276e5f52ff3745cca1c452361770ff57c2 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/blendedmvs.py @@ -0,0 +1,348 @@ +import os.path as osp +import numpy as np +import os +import sys + 
class BlendedMVS_Multi(BaseMultiViewDataset):
    """Multi-view dataset over BlendedMVS scenes.

    Pairwise scores between the images of each scene are read from
    ``<ROOT>/new_overlap.h5`` and turned into a weighted adjacency list.
    View sequences are sampled as random walks on that graph, starting from a
    reference image.
    """

    def __init__(self, *args, ROOT, split=None, **kwargs):
        """Store the root directory and load the score file.

        ``split`` is accepted for signature compatibility; it is not
        forwarded to the base class.
        """
        self.ROOT = ROOT
        self.video = False
        self.is_metric = False
        super().__init__(*args, **kwargs)
        self._load_data()

    def _load_data(self):
        """Read the score file and initialise the per-scene bookkeeping."""
        self.data_dict = self.read_h5_file(os.path.join(self.ROOT, "new_overlap.h5"))
        self.num_imgs = sum(len(entry["basenames"]) for entry in self.data_dict.values())
        self.num_scenes = len(self.data_dict)
        # Scenes in which too few start images can reach enough neighbours.
        self.invalid_scenes = []
        # scene -> {start index -> whether enough distinct images are reachable}
        self.is_reachable_cache = {scene: {} for scene in self.data_dict}

    def read_h5_file(self, h5_file_path):
        """Load per-scene basenames and adjacency lists from an HDF5 file.

        Also fills ``self.all_ref_imgs`` with one ``(scene, image index)``
        entry per image.

        Returns:
            dict mapping scene name to ``{"basenames", "score_matrix"}``,
            where ``"score_matrix"`` holds the adjacency list built by
            ``build_adjacency_list``.
        """
        data_dict = {}
        self.all_ref_imgs = []
        with h5py.File(h5_file_path, "r") as f:
            for scene_dir in tqdm(f.keys()):
                group = f[scene_dir]
                basenames = group["basenames"][:]
                indices = group["indices"][:]
                values = group["values"][:]
                shape = group.attrs["shape"]
                # Rebuild the dense matrix from its sparse (indices, values) form.
                score_matrix = np.zeros(shape, dtype=np.float32)
                score_matrix[indices[0], indices[1]] = values
                data_dict[scene_dir] = {
                    "basenames": basenames,
                    "score_matrix": self.build_adjacency_list(score_matrix),
                }
                self.all_ref_imgs.extend(
                    (scene_dir, b) for b in range(len(basenames))
                )
        return data_dict

    @staticmethod
    def build_adjacency_list(S, thresh=0.2):
        """Convert score matrix ``S`` into a weighted adjacency list.

        Entry ``(i, j)`` becomes an edge ``i -> j`` with weight
        ``S[i, j] - thresh`` whenever that weight is strictly positive.
        """
        adjacency_list = [[] for _ in range(len(S))]
        S = S - thresh
        S[S < 0] = 0
        rows, cols = np.nonzero(S)
        for i, j in zip(rows, cols):
            adjacency_list[i].append((j, S[i][j]))
        return adjacency_list

    @staticmethod
    def is_reachable(adjacency_list, start_index, k):
        """Return True if at least ``k`` distinct nodes are reachable from ``start_index``.

        Depth-first traversal that stops as soon as ``k`` nodes were visited.
        """
        visited = set()
        stack = [start_index]
        while stack and len(visited) < k:
            node = stack.pop()
            if node not in visited:
                visited.add(node)
                for neighbor in adjacency_list[node]:
                    if neighbor[0] not in visited:
                        stack.append(neighbor[0])
        return len(visited) >= k

    @staticmethod
    def _shuffled_neighbors(adjacency_list, node, rng):
        """Return the neighbours of ``node`` in weighted random order.

        A node without outgoing edges yields an empty list instead of calling
        ``rng.choice`` with an empty probability vector, which raises
        ``ValueError``.
        """
        neighbors = adjacency_list[node]
        if not neighbors:
            return []
        neighbor_idxs = [n[0] for n in neighbors]
        neighbor_weights = [n[1] for n in neighbors]
        return rng.choice(
            neighbor_idxs,
            size=len(neighbor_idxs),
            replace=False,
            p=np.array(neighbor_weights) / np.sum(neighbor_weights),
        ).tolist()

    @staticmethod
    def random_sequence_no_revisit_with_backtracking(
        adjacency_list, k, start_index, rng: np.random.Generator
    ):
        """Sample a path of ``k`` distinct nodes starting at ``start_index``.

        Neighbours are tried in weighted random order; when a node has no
        untried neighbour left the search backtracks.

        Returns:
            The path as a list of node indices, or ``None`` if no such path
            exists.
        """
        path = [start_index]
        visited = {start_index}
        # One iterator of untried neighbours per node on the current path.
        neighbor_iterators = [
            iter(
                BlendedMVS_Multi._shuffled_neighbors(adjacency_list, start_index, rng)
            )
        ]

        while len(path) < k:
            if not neighbor_iterators:
                # Even the start node ran out of options: no possible sequence.
                return None
            try:
                next_index = next(neighbor_iterators[-1])
            except StopIteration:
                # No more neighbours to try at this node: backtrack.
                neighbor_iterators.pop()
                visited.remove(path.pop())
                continue
            if next_index in visited:
                continue
            path.append(next_index)
            visited.add(next_index)
            neighbor_iterators.append(
                iter(
                    BlendedMVS_Multi._shuffled_neighbors(
                        adjacency_list, next_index, rng
                    )
                )
            )
        return path

    @staticmethod
    def random_sequence_with_optional_repeats(
        adjacency_list,
        k,
        start_index,
        rng: np.random.Generator,
        max_k=None,
        max_attempts=100,
    ):
        """Sample a walk of length ``max_k`` containing at least ``k`` distinct nodes.

        Unvisited neighbours are preferred; when all neighbours were already
        visited, a visited one is chosen instead.  At most ``max_attempts``
        steps are taken.  If the walk has enough distinct nodes but is still
        shorter than ``max_k``, it is padded with random repeats of its own
        nodes.

        Returns:
            The walk as a list, or ``None`` if fewer than ``k`` distinct
            nodes were reached.
        """
        if max_k is None:
            max_k = k
        path = [start_index]
        visited = {start_index}
        current_index = start_index
        attempts = 0

        while len(path) < max_k and attempts < max_attempts:
            attempts += 1
            neighbors = adjacency_list[current_index]
            neighbor_idxs = [n[0] for n in neighbors]
            neighbor_weights = [n[1] for n in neighbors]

            if not neighbor_idxs:
                # Dead end: cannot proceed further.
                break

            unvisited_neighbors = [
                (idx, wgt)
                for idx, wgt in zip(neighbor_idxs, neighbor_weights)
                if idx not in visited
            ]
            if unvisited_neighbors:
                unvisited_idxs = [idx for idx, _ in unvisited_neighbors]
                unvisited_weights = [wgt for _, wgt in unvisited_neighbors]
                probabilities = np.array(unvisited_weights) / np.sum(unvisited_weights)
                next_index = rng.choice(unvisited_idxs, p=probabilities)
                visited.add(next_index)
            else:
                # All neighbours visited: revisit one to keep the walk going.
                probabilities = np.array(neighbor_weights) / np.sum(neighbor_weights)
                next_index = rng.choice(neighbor_idxs, p=probabilities)

            path.append(next_index)
            current_index = next_index

        if len(set(path)) < k:
            # Could not reach k unique nodes.
            return None
        # Pad with random repeats of nodes already on the path.
        while len(path) < max_k:
            path.append(rng.choice(path))
        return path

    def __len__(self):
        """Return the number of reference images."""
        return len(self.all_ref_imgs)

    def get_image_num(self):
        """Return the total number of images over all scenes."""
        return self.num_imgs

    def get_stats(self):
        """Return a short human-readable summary of the dataset size."""
        return f"{len(self)} imgs from {self.num_scenes} scenes"

    def generate_sequence(
        self, scene, adj_list, num_views, start_index, rng, allow_repeat=False
    ):
        """Sample a view sequence in ``scene`` starting at ``start_index``.

        The required number of distinct views is ``num_views``, or
        ``max(num_views // 5, 3)`` when ``allow_repeat`` is set.  The
        reachability result per start index is cached in
        ``self.is_reachable_cache``.

        Returns:
            A list of image indices, or ``None`` on failure.
        """
        cutoff = num_views if not allow_repeat else max(num_views // 5, 3)
        cache = self.is_reachable_cache[scene]
        if start_index not in cache:
            cache[start_index] = self.is_reachable(adj_list, start_index, cutoff)
        if not cache[start_index]:
            # Report the number actually tested (cutoff), not num_views.
            print(f"Cannot reach {cutoff} unique elements from index {start_index}.")
            return None

        if not allow_repeat:
            sequence = self.random_sequence_no_revisit_with_backtracking(
                adj_list, cutoff, start_index, rng
            )
        else:
            sequence = self.random_sequence_with_optional_repeats(
                adj_list, cutoff, start_index, rng, max_k=num_views
            )
        if not sequence:
            cache[start_index] = False
            print("Failed to generate a sequence without revisiting.")
        return sequence

    def _get_views(self, idx, resolution, rng: np.random.Generator, num_views):
        """Load ``num_views`` views around reference image ``idx``.

        If no sequence can be generated from the reference image, nearby
        reference images and then random samples are tried, with bounded
        retries.  As a last resort, sequential image indices of the scene are
        used (cycled if the scene has fewer than ``num_views`` images).

        Returns:
            A list of ``num_views`` view dictionaries.
        """
        import warnings

        MAX_RETRIES = 100  # Maximum attempts to find a valid sequence
        MAX_SCENE_RETRIES = 50  # Maximum attempts to find a valid scene

        scene_info, ref_img_idx = self.all_ref_imgs[idx]
        ordered_video = False
        imgs_idxs = None

        for _ in range(MAX_RETRIES):
            basenames = self.data_dict[scene_info]["basenames"]
            num_unreachable = sum(
                1 - int(x) for x in self.is_reachable_cache[scene_info].values()
            )
            if (
                num_unreachable > len(basenames) - self.num_views
                and scene_info not in self.invalid_scenes
            ):
                # Record each invalid scene once so the list cannot grow
                # without bound over the course of training.
                self.invalid_scenes.append(scene_info)

            scene_retries = 0
            while (
                scene_info in self.invalid_scenes
                and scene_retries < MAX_SCENE_RETRIES
            ):
                scene_retries += 1
                idx = rng.integers(low=0, high=len(self.all_ref_imgs))
                scene_info, ref_img_idx = self.all_ref_imgs[idx]

            # Test the outcome, not the counter: a valid scene found on the
            # last allowed attempt must be kept.
            if scene_info in self.invalid_scenes:
                warnings.warn(
                    f"BlendedMVS: Could not find valid scene after {MAX_SCENE_RETRIES} attempts. "
                    f"Skipping sample idx={idx}. This might indicate data quality issues."
                )
                # Try with a completely random sample
                idx = rng.integers(low=0, high=len(self.all_ref_imgs))
                scene_info, ref_img_idx = self.all_ref_imgs[idx]
            basenames = self.data_dict[scene_info]["basenames"]

            score_matrix = self.data_dict[scene_info]["score_matrix"]
            imgs_idxs = self.generate_sequence(
                scene_info, score_matrix, num_views, ref_img_idx, rng, self.allow_repeat
            )
            if imgs_idxs is not None:
                break

            # Move to the nearest reference image (in a random direction)
            # that is not already known to be unreachable.
            random_direction = 2 * rng.choice(2) - 1
            for offset in range(1, len(basenames)):
                tentative_im_idx = (
                    ref_img_idx + (random_direction * offset)
                ) % len(basenames)
                if (
                    tentative_im_idx not in self.is_reachable_cache[scene_info]
                    or self.is_reachable_cache[scene_info][tentative_im_idx]
                ):
                    ref_img_idx = tentative_im_idx
                    break

        # Fall back only when no sequence was found; a sequence obtained on
        # the final retry is valid and must not be discarded.
        if imgs_idxs is None:
            warnings.warn(
                f"BlendedMVS: Failed to generate valid sequence after {MAX_RETRIES} attempts. "
                f"Skipping sample idx={idx}. This might indicate severe data quality issues."
            )
            # As a last resort, try one more time with a completely random sample
            idx = rng.integers(low=0, high=len(self.all_ref_imgs))
            scene_info, ref_img_idx = self.all_ref_imgs[idx]
            basenames = self.data_dict[scene_info]["basenames"]
            score_matrix = self.data_dict[scene_info]["score_matrix"]
            imgs_idxs = self.generate_sequence(
                scene_info, score_matrix, num_views, ref_img_idx, rng, self.allow_repeat
            )
            if imgs_idxs is None:
                # Sequential indices, cycled so that exactly num_views are
                # returned even for scenes with fewer images.
                imgs_idxs = [i % len(basenames) for i in range(num_views)]

        views = []
        for view_idx in imgs_idxs:
            scene_dir = osp.join(self.ROOT, scene_info)
            impath = basenames[view_idx].decode("utf-8")
            image = imread_pil(osp.join(scene_dir, impath + ".jpg"))
            depthmap = imread_cv2(osp.join(scene_dir, impath + ".exr"))
            camera_params = np.load(osp.join(scene_dir, impath + ".npz"))

            intrinsics = np.float32(camera_params["intrinsics"])
            camera_pose = np.eye(4, dtype=np.float32)
            camera_pose[:3, :3] = camera_params["R_cam2world"]
            camera_pose[:3, 3] = camera_params["t_cam2world"]

            image, depthmap, intrinsics = self._crop_resize_if_necessary(
                image, depthmap, intrinsics, resolution, rng, info=(scene_dir, impath)
            )

            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,  # cam2world
                    camera_intrinsics=intrinsics,
                    dataset="BlendedMVS",
                    label=osp.relpath(scene_dir, self.ROOT),
                    is_metric=self.is_metric,
                    is_video=ordered_video,
                    instance=osp.join(scene_dir, impath + ".jpg"),
                    quantile=np.array(0.97, dtype=np.float32),
                    img_mask=True,
                    ray_mask=False,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
class Co3d_Multi(BaseMultiViewDataset):
    """Multi-view dataset over CO3D sequences listed in ``selected_seqs_<split>.json``.

    Scenes are keyed by ``(object, instance)``.  Images whose depth map is
    empty after cropping are remembered per resolution and replaced by a
    neighbouring image on later draws.
    """

    def __init__(self, mask_bg="rand", *args, ROOT, **kwargs):
        """Load the sequence list and build the list of reference images.

        Args:
            mask_bg: ``True`` to always mask the depth with the object mask,
                ``False`` to never do so, ``"rand"`` to decide per sample.
            ROOT: dataset root directory.
            *args, **kwargs: forwarded unchanged to ``BaseMultiViewDataset``.
        """
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        assert mask_bg in (True, False, "rand")
        self.mask_bg = mask_bg
        self.is_metric = False
        self.dataset_label = "Co3d_v2"

        # load all scenes
        with open(osp.join(self.ROOT, f"selected_seqs_{self.split}.json"), "r") as f:
            self.scenes = json.load(f)
        self.scenes = {k: v for k, v in self.scenes.items() if len(v) > 0}
        # Flatten {object: {instance: frames}} into {(object, instance): frames}.
        self.scenes = {
            (k, k2): v2 for k, v in self.scenes.items() for k2, v2 in v.items()
        }
        self.scene_list = list(self.scenes.keys())
        cut_off = (
            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
        )
        self.cut_off = cut_off
        # Only frames that leave at least `cut_off` frames after them
        # (inclusive) may serve as a reference image.
        self.all_ref_imgs = [
            (key, value)
            for key, values in self.scenes.items()
            for value in values[: len(values) - cut_off + 1]
        ]
        # scene -> {resolution -> per-image "invalid" flags}
        self.invalidate = {scene: {} for scene in self.scene_list}
        self.invalid_scenes = {scene: False for scene in self.scene_list}

    def __len__(self):
        """Return the number of reference images."""
        return len(self.all_ref_imgs)

    # The frame number is formatted with "d" rather than the locale-aware
    # "n": under a locale with digit grouping, "n" inserts separators and
    # produces file names that do not exist.

    def _get_metadatapath(self, obj, instance, view_idx):
        """Return the path of the camera metadata file of a frame."""
        return osp.join(self.ROOT, obj, instance, "images", f"frame{view_idx:06d}.npz")

    def _get_impath(self, obj, instance, view_idx):
        """Return the path of the RGB image of a frame."""
        return osp.join(self.ROOT, obj, instance, "images", f"frame{view_idx:06d}.jpg")

    def _get_depthpath(self, obj, instance, view_idx):
        """Return the path of the depth image of a frame."""
        return osp.join(
            self.ROOT, obj, instance, "depths", f"frame{view_idx:06d}.jpg.geometric.png"
        )

    def _get_maskpath(self, obj, instance, view_idx):
        """Return the path of the object mask of a frame."""
        return osp.join(self.ROOT, obj, instance, "masks", f"frame{view_idx:06d}.png")

    def _read_depthmap(self, depthpath, input_metadata):
        """Read a 16-bit depth image and rescale it by ``maximum_depth``."""
        depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
        depthmap = (depthmap.astype(np.float32) / 65535) * np.nan_to_num(
            input_metadata["maximum_depth"]
        )
        return depthmap

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views of one sequence around reference ``idx``.

        Scenes found to have too few usable images are flagged in
        ``self.invalid_scenes`` and a random other reference is drawn
        instead.  The outer loop repeats until ``num_views`` views were
        collected that do not all share the same image file name.

        NOTE(review): neither loop has a retry bound; if every scene becomes
        invalid this does not terminate.
        """
        invalid_seq = True
        scene_info, ref_img_idx = self.all_ref_imgs[idx]

        while invalid_seq:
            while self.invalid_scenes[scene_info]:
                idx = rng.integers(low=0, high=len(self.all_ref_imgs))
                scene_info, ref_img_idx = self.all_ref_imgs[idx]

            obj, instance = scene_info

            image_pool = self.scenes[obj, instance]
            if len(image_pool) < self.cut_off:
                print("Invalid scene!")
                self.invalid_scenes[scene_info] = True
                continue

            imgs_idxs, ordered_video = self.get_seq_from_start_id(
                num_views, ref_img_idx, image_pool, rng
            )

            if resolution not in self.invalidate[obj, instance]:  # flag invalid images
                self.invalidate[obj, instance][resolution] = [False] * len(image_pool)
            invalid_flags = self.invalidate[obj, instance][resolution]

            # decide now if we mask the bg
            mask_bg = (self.mask_bg == True) or (  # noqa: E712 (1 is accepted too)
                self.mask_bg == "rand" and rng.choice(2, p=[0.9, 0.1])
            )
            views = []

            imgs_idxs = deque(imgs_idxs)

            while len(imgs_idxs) > 0:  # some images (few) have zero depth
                if len(image_pool) - sum(invalid_flags) < self.cut_off:
                    print("Invalid scene!")
                    invalid_seq = True
                    self.invalid_scenes[scene_info] = True
                    break

                # Indices are consumed from the right end of the deque.
                im_idx = imgs_idxs.pop()
                if invalid_flags[im_idx]:
                    # search for a valid image
                    ordered_video = False
                    random_direction = 2 * rng.choice(2) - 1
                    for offset in range(1, len(image_pool)):
                        tentative_im_idx = (im_idx + (random_direction * offset)) % len(
                            image_pool
                        )
                        if not invalid_flags[tentative_im_idx]:
                            im_idx = tentative_im_idx
                            break
                view_idx = image_pool[im_idx]
                impath = self._get_impath(obj, instance, view_idx)
                depthpath = self._get_depthpath(obj, instance, view_idx)

                # load camera params
                metadata_path = self._get_metadatapath(obj, instance, view_idx)
                input_metadata = np.load(metadata_path)
                camera_pose = input_metadata["camera_pose"].astype(np.float32)
                intrinsics = input_metadata["camera_intrinsics"].astype(np.float32)

                # load image and depth
                rgb_image = imread_cv2(impath)
                depthmap = self._read_depthmap(depthpath, input_metadata)

                if mask_bg:
                    # load object mask
                    maskpath = self._get_maskpath(obj, instance, view_idx)
                    maskmap = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED).astype(
                        np.float32
                    )
                    maskmap = (maskmap / 255.0) > 0.1

                    # update the depthmap with mask
                    depthmap *= maskmap
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath
                )
                num_valid = (depthmap > 0.0).sum()
                if num_valid == 0:
                    # problem, invalidate image and retry
                    invalid_flags[im_idx] = True
                    imgs_idxs.append(im_idx)
                    continue

                # generate img mask and raymap mask
                img_mask, ray_mask = self.get_img_and_ray_masks(
                    self.is_metric, len(views), rng
                )

                views.append(
                    dict(
                        img=rgb_image,
                        depthmap=depthmap,
                        camera_pose=camera_pose,
                        camera_intrinsics=intrinsics,
                        dataset=self.dataset_label,
                        label=osp.join(obj, instance),
                        instance=osp.split(impath)[1],
                        is_metric=self.is_metric,
                        is_video=ordered_video,
                        quantile=np.array(0.9, dtype=np.float32),
                        img_mask=img_mask,
                        ray_mask=ray_mask,
                        camera_only=False,
                        depth_only=False,
                        single_view=False,
                        reset=False,
                    )
                )

            if len(views) == num_views and not all(
                view["instance"] == views[0]["instance"] for view in views
            ):
                invalid_seq = False
        return views
class Cop3D_Multi(Co3d_Multi):
    """Variant of ``Co3d_Multi`` for data without depth maps.

    The depth map of every view is filled with ones and each view is marked
    ``camera_only=True``.
    """

    def __init__(self, mask_bg="rand", *args, ROOT, **kwargs):
        """Initialise like ``Co3d_Multi`` and override the dataset label."""
        super().__init__(mask_bg, *args, ROOT=ROOT, **kwargs)
        self.dataset_label = "Cop3D"
        self.is_metric = False

    # The frame number is formatted with "d" rather than the locale-aware
    # "n": under a locale with digit grouping, "n" inserts separators and
    # produces file names that do not exist.

    def _get_metadatapath(self, obj, instance, view_idx):
        """Return the path of the camera metadata file of a frame."""
        return osp.join(self.ROOT, obj, instance, "images", f"frame{view_idx:06d}.npz")

    def _get_impath(self, obj, instance, view_idx):
        """Return the path of the RGB image of a frame."""
        return osp.join(self.ROOT, obj, instance, "images", f"frame{view_idx:06d}.jpg")

    def _get_depthpath(self, obj, instance, view_idx):
        """Return the RGB image path as a stand-in for the missing depth file.

        There is no depth; the image is only read to obtain the resolution.
        """
        return osp.join(self.ROOT, obj, instance, "images", f"frame{view_idx:06d}.jpg")

    def _get_maskpath(self, obj, instance, view_idx):
        """Return the path of the object mask of a frame."""
        return osp.join(self.ROOT, obj, instance, "masks", f"frame{view_idx:06d}.png")

    def _read_depthmap(self, impath, input_metadata):
        """Return an all-ones float32 depth map with the image's height and width."""
        img = imread_cv2(impath, cv2.IMREAD_UNCHANGED)
        depthmap = np.ones_like(img[..., 0], dtype=np.float32)
        return depthmap

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` views of one sequence around reference ``idx``.

        The outer loop repeats until ``num_views`` views were collected that
        do not all share the same image file name.
        """
        invalid_seq = True
        scene_info, ref_img_idx = self.all_ref_imgs[idx]

        while invalid_seq:
            while self.invalid_scenes[scene_info]:
                idx = rng.integers(low=0, high=len(self.all_ref_imgs))
                scene_info, ref_img_idx = self.all_ref_imgs[idx]

            obj, instance = scene_info

            image_pool = self.scenes[obj, instance]
            # NOTE(review): the parent class builds its reference list from
            # self.cut_off, but this check uses self.num_views -- confirm
            # whether self.cut_off was intended here.
            if len(image_pool) < self.num_views:
                print("Invalid scene!")
                self.invalid_scenes[scene_info] = True
                continue

            imgs_idxs, ordered_video = self.get_seq_from_start_id(
                num_views,
                ref_img_idx,
                image_pool,
                rng,
                max_interval=5,
                video_prob=1.0,
                fix_interval_prob=0.9,
            )

            views = []

            for im_idx in imgs_idxs:
                view_idx = image_pool[im_idx]
                impath = self._get_impath(obj, instance, view_idx)
                depthpath = self._get_depthpath(obj, instance, view_idx)

                # load camera params
                metadata_path = self._get_metadatapath(obj, instance, view_idx)
                input_metadata = np.load(metadata_path)
                camera_pose = input_metadata["camera_pose"].astype(np.float32)
                intrinsics = input_metadata["camera_intrinsics"].astype(np.float32)

                # load image and (placeholder) depth
                rgb_image = imread_cv2(impath)
                depthmap = self._read_depthmap(depthpath, input_metadata)

                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath
                )

                views.append(
                    dict(
                        img=rgb_image,
                        depthmap=depthmap,
                        camera_pose=camera_pose,
                        camera_intrinsics=intrinsics,
                        dataset=self.dataset_label,
                        label=osp.join(obj, instance),
                        instance=osp.split(impath)[1],
                        is_metric=self.is_metric,
                        is_video=ordered_video,
                        quantile=np.array(0.96, dtype=np.float32),
                        img_mask=True,
                        ray_mask=False,
                        camera_only=True,
                        depth_only=False,
                        single_view=False,
                        reset=False,
                    )
                )

            if len(views) == num_views and not all(
                view["instance"] == views[0]["instance"] for view in views
            ):
                invalid_seq = False
        return views
class DL3DV_Multi(BaseMultiViewDataset):
    """Multi-view video dataset over DL3DV scenes.

    Data is read from ``<ROOT>/<scene>/<subscene>/dense/`` with the
    sub-directories ``rgb`` (``.png``), ``depth`` (``.npy``), ``cam``
    (``.npz`` with keys ``intrinsic`` and ``pose``), ``sky_mask`` and
    ``outlier_mask``.
    """

    def __init__(self, *args, split, ROOT, **kwargs):
        """Store the root directory and index all scenes.

        ``split`` is required by the signature but is not forwarded to the
        base class.
        """
        self.ROOT = ROOT
        self.video = True
        self.max_interval = 20
        self.is_metric = False
        super().__init__(*args, **kwargs)

        # _load_data() populates attributes and returns None; the assignment
        # is kept so that the `loaded_data` attribute continues to exist.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Build the flat image index over all non-empty sub-scenes.

        Populates ``scenes``, ``sceneids`` (scene index per image),
        ``images`` (file name per image), ``scene_img_list`` (global image
        ids per scene) and ``start_img_ids`` (global ids usable as a sequence
        start).
        """
        self.all_scenes = sorted(
            f for f in os.listdir(self.ROOT) if os.path.isdir(osp.join(self.ROOT, f))
        )
        subscenes = []
        for scene in self.all_scenes:
            # keep only sub-directories that are not empty
            subscenes.extend(
                osp.join(scene, f)
                for f in os.listdir(osp.join(self.ROOT, scene))
                if os.path.isdir(osp.join(self.ROOT, scene, f))
                and len(os.listdir(osp.join(self.ROOT, scene, f))) > 0
            )

        # Loop-invariant: minimum number of images a scene must provide.
        cut_off = (
            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
        )

        offset = 0
        scenes = []
        sceneids = []
        images = []
        scene_img_list = []
        start_img_ids = []
        j = 0

        for scene in subscenes:
            scene_dir = osp.join(self.ROOT, scene, "dense")
            rgb_paths = sorted(
                f for f in os.listdir(os.path.join(scene_dir, "rgb")) if f.endswith(".png")
            )
            assert len(rgb_paths) > 0, f"{scene_dir} is empty."
            num_imgs = len(rgb_paths)

            if num_imgs < cut_off:
                print(f"Skipping {scene}")
                continue

            # Global ids of this scene's images in the flat index.
            img_ids = list(np.arange(num_imgs) + offset)

            scenes.append(scene)
            scene_img_list.append(img_ids)
            sceneids.extend([j] * num_imgs)
            images.extend(rgb_paths)
            start_img_ids.extend(img_ids[: num_imgs - cut_off + 1])
            offset += num_imgs
            j += 1

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.start_img_ids = start_img_ids
        self.scene_img_list = scene_img_list

    def __len__(self):
        """Return the number of valid sequence start positions."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed images."""
        return len(self.images)

    @staticmethod
    def _read_mask(path):
        """Read a mask image and return the boolean array ``pixels >= 127``.

        Raises:
            FileNotFoundError: if the image cannot be read.  ``cv2.imread``
                returns ``None`` in that case, which would otherwise fail
                later with an unrelated ``TypeError``.
        """
        mask = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        if mask is None:
            raise FileNotFoundError(f"Could not read mask image: {path}")
        return mask >= 127

    def _get_views(self, idx, resolution, rng, num_views):
        """Load the views of one sequence, starting at start position ``idx``.

        Depth is set to ``-1`` on sky pixels and to ``0`` on outlier pixels,
        non-finite values, and values above the 98th percentile of the
        positive depths.

        Returns:
            A list of view dictionaries.
        """
        start_id = self.start_img_ids[idx]
        scene_id = self.sceneids[start_id]
        all_image_ids = self.scene_img_list[scene_id]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            start_id,
            all_image_ids,
            rng,
            max_interval=self.max_interval,
            block_shuffle=25,
        )
        image_idxs = np.array(all_image_ids)[pos]

        views = []
        for view_idx in image_idxs:
            scene_id = self.sceneids[view_idx]
            scene_dir = osp.join(self.ROOT, self.scenes[scene_id], "dense")

            rgb_path = self.images[view_idx]
            basename = rgb_path[:-4]

            rgb_image = imread_cv2(
                osp.join(scene_dir, "rgb", rgb_path), cv2.IMREAD_COLOR
            )
            depthmap = np.load(osp.join(scene_dir, "depth", basename + ".npy")).astype(
                np.float32
            )
            depthmap[~np.isfinite(depthmap)] = 0  # invalid
            cam_file = np.load(osp.join(scene_dir, "cam", basename + ".npz"))
            sky_mask = self._read_mask(osp.join(scene_dir, "sky_mask", rgb_path))
            outlier_mask = self._read_mask(
                osp.join(scene_dir, "outlier_mask", rgb_path)
            )
            depthmap[sky_mask] = -1.0
            depthmap[outlier_mask] = 0.0
            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)

            # Clip far outliers: everything above the 98th percentile of the
            # positive depths is zeroed (threshold 0 if none are positive).
            positive_depths = depthmap[depthmap > 0]
            threshold = (
                np.percentile(positive_depths, 98) if positive_depths.size > 0 else 0
            )
            depthmap[depthmap > threshold] = 0.0

            intrinsics = cam_file["intrinsic"].astype(np.float32)
            camera_pose = cam_file["pose"].astype(np.float32)

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap.astype(np.float32),
                    camera_pose=camera_pose.astype(np.float32),
                    camera_intrinsics=intrinsics.astype(np.float32),
                    dataset="dl3dv",
                    label=self.scenes[scene_id] + "_" + rgb_path,
                    instance=osp.join(scene_dir, "rgb", rgb_path),
                    is_metric=self.is_metric,
                    is_video=ordered_video,
                    quantile=np.array(0.9, dtype=np.float32),
                    img_mask=True,
                    ray_mask=False,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        return views
def _get_views(self, idx, resolution, rng, num_views):
    """Return ``num_views`` independently drawn single-image views.

    The image list is permuted with a generator seeded from ``rng`` and
    ``idx``; images are then taken in that order, skipping any whose files
    fail to load.  Each view is flagged ``single_view=True`` and
    ``reset=True`` and carries an identity camera pose.

    Raises:
        IndexError: if the image list is exhausted before ``num_views``
            images could be loaded.
    """
    new_seed = rng.integers(0, 2**32) + idx
    new_rng = np.random.default_rng(new_seed)
    img_names = new_rng.permutation(self.img_names)

    views = []
    i = 0
    while len(views) < num_views:
        if i >= len(img_names):
            # Same exception type as running off the end of the list, but
            # with a message that explains what happened.
            raise IndexError(
                f"EDEN: only {len(views)} of {num_views} requested views could "
                f"be loaded from {len(img_names)} candidate images"
            )
        scene, img_name = img_names[i]
        i += 1
        try:
            rgb_image = imread_cv2(
                osp.join(self.ROOT, scene, "rgb", f"{img_name}.png")
            )
            depthmap = np.load(
                osp.join(self.ROOT, scene, "depth", f"{img_name}.npy")
            )
            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)

            intrinsics = np.load(
                osp.join(self.ROOT, scene, "cam", f"{img_name}.npz")
            )["intrinsics"]
            # camera pose is not provided, placeholder
            camera_pose = np.eye(4)
        except Exception:
            # Best effort: skip images that cannot be loaded.  Unlike a bare
            # `except:`, this lets KeyboardInterrupt and SystemExit through.
            continue

        rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
            rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name
        )

        views.append(
            dict(
                img=rgb_image,
                depthmap=depthmap.astype(np.float32),
                camera_pose=camera_pose.astype(np.float32),
                camera_intrinsics=intrinsics.astype(np.float32),
                dataset="EDEN",
                label=img_name,
                instance=osp.join(self.ROOT, scene, "rgb", f"{img_name}.png"),
                is_metric=self.is_metric,
                is_video=False,
                quantile=np.array(1.0, dtype=np.float32),
                img_mask=True,
                ray_mask=False,
                camera_only=False,
                depth_only=False,
                single_view=True,
                reset=True,
            )
        )
    return views
class HyperSim_Multi(BaseMultiViewDataset):
    """Multi-view sampler over Hypersim sub-scenes stored under ``ROOT``.

    Every ``.png`` inside ``ROOT/<scene>/<subscene>/`` is treated as an RGB
    frame; the depth and camera files of a frame are found by substituting
    ``rgb.png`` in its file name with ``depth.npy`` / ``cam.npz``.
    """

    def __init__(self, *args, split, ROOT, **kwargs):
        """Record the sampling settings, initialise the base class and index.

        ``split`` is accepted as a keyword but is not used by this class.
        """
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 4
        super().__init__(*args, **kwargs)

        # _load_data() works through side effects and returns None.
        self.loaded_data = self._load_data()
        print('DATA: hypersim', len(self))

    def _load_data(self):
        """Scan ``ROOT`` and build the flat frame index used for sampling.

        Sets ``all_scenes``, ``scenes``, ``sceneids``, ``images``,
        ``scene_img_list`` and ``start_img_ids`` on the instance.
        """
        root = self.ROOT
        self.all_scenes = sorted(
            name for name in os.listdir(root) if os.path.isdir(osp.join(root, name))
        )

        # Keep only sub-scene directories that contain at least one entry.
        subscenes = []
        for scene in self.all_scenes:
            for sub in os.listdir(osp.join(root, scene)):
                sub_dir = osp.join(root, scene, sub)
                if os.path.isdir(sub_dir) and len(os.listdir(sub_dir)) > 0:
                    subscenes.append(osp.join(scene, sub))

        kept_scenes = []
        frame_to_scene = []
        frame_names = []
        window_starts = []
        ids_per_scene = []
        next_global_id = 0
        for scene in subscenes:
            scene_dir = osp.join(root, scene)
            rgb_paths = sorted(
                name for name in os.listdir(scene_dir) if name.endswith(".png")
            )
            assert len(rgb_paths) > 0, f"{scene_dir} is empty."

            num_imgs = len(rgb_paths)
            if self.allow_repeat:
                cut_off = max(self.num_views // 3, 3)
            else:
                cut_off = self.num_views
            if num_imgs < cut_off:
                print(f"Skipping {scene}")
                continue

            img_ids = list(np.arange(num_imgs) + next_global_id)
            scene_index = len(kept_scenes)  # position of this scene among kept ones

            kept_scenes.append(scene)
            ids_per_scene.append(img_ids)
            frame_to_scene.extend([scene_index] * num_imgs)
            frame_names.extend(rgb_paths)
            # Only frames that leave room for ``cut_off`` frames may start a clip.
            window_starts.extend(img_ids[: num_imgs - cut_off + 1])
            next_global_id += num_imgs

        self.scenes = kept_scenes
        self.sceneids = frame_to_scene
        self.images = frame_names
        self.scene_img_list = ids_per_scene
        self.start_img_ids = window_starts

    def __len__(self):
        """Ten dataset entries are exposed per valid start frame."""
        return len(self.start_img_ids) * 10

    def get_image_num(self):
        """Total number of indexed frames."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` frames of one sub-scene as view dictionaries.

        ``idx`` is divided by 10 to undo the oversampling done by ``__len__``;
        the frame positions come from ``get_seq_from_start_id``.
        """
        idx = idx // 10
        start_id = self.start_img_ids[idx]
        scene_frames = self.scene_img_list[self.sceneids[start_id]]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            start_id,
            scene_frames,
            rng,
            max_interval=self.max_interval,
            block_shuffle=16,
        )

        views = []
        for v, view_idx in enumerate(np.array(scene_frames)[pos]):
            scene_id = self.sceneids[view_idx]
            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
            rgb_path = self.images[view_idx]

            rgb_image = imread_pil(osp.join(scene_dir, rgb_path))

            depth_file = osp.join(scene_dir, rgb_path.replace("rgb.png", "depth.npy"))
            depthmap = np.load(depth_file).astype(np.float32)
            depthmap[np.logical_not(np.isfinite(depthmap))] = 0  # invalid

            cam_file = np.load(
                osp.join(scene_dir, rgb_path.replace("rgb.png", "cam.npz"))
            )
            intrinsics = cam_file["intrinsics"].astype(np.float32)
            camera_pose = cam_file["pose"].astype(np.float32)

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.75, 0.2, 0.05]
            )

            view = dict(
                img=rgb_image,
                depthmap=depthmap.astype(np.float32),
                camera_pose=camera_pose.astype(np.float32),
                camera_intrinsics=intrinsics.astype(np.float32),
                dataset="hypersim",
                label=self.scenes[scene_id] + "_" + rgb_path,
                instance=f"{str(idx)}_{str(view_idx)}",
                is_metric=self.is_metric,
                is_video=ordered_video,
                quantile=np.array(1.0, dtype=np.float32),
                img_mask=img_mask,
                ray_mask=ray_mask,
                camera_only=False,
                depth_only=False,
                single_view=False,
                reset=False,
            )
            views.append(view)
        assert len(views) == num_views
        return views
+ def _get_views(self, idx, resolution, rng, num_views): + new_seed = rng.integers(0, 2**32) + idx + new_rng = np.random.default_rng(new_seed) + img_names = new_rng.choice(self.img_names, num_views, replace=False) + + views = [] + for v, img_name in enumerate(img_names): + # Load RGB image + scene, img_name = img_name + rgb_image = imread_cv2(osp.join(self.ROOT, scene, "rgb", f"{img_name}.png")) + depthmap = np.load(osp.join(self.ROOT, scene, "depth", f"{img_name}.npy")) + depthmap[depthmap > 200] = 0.0 + depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0) + + intrinsics = np.load(osp.join(self.ROOT, scene, "cam", f"{img_name}.npz"))[ + "intrinsics" + ] + # camera pose is not provided, placeholder + camera_pose = np.eye(4) + + rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary( + rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name + ) + + views.append( + dict( + img=rgb_image, + depthmap=depthmap.astype(np.float32), + camera_pose=camera_pose.astype(np.float32), + camera_intrinsics=intrinsics.astype(np.float32), + dataset="irs", + label=img_name, + instance=f"{str(idx)}_{img_name}", + is_metric=self.is_metric, + is_video=False, + quantile=np.array(1.0, dtype=np.float32), + img_mask=True, + ray_mask=False, + camera_only=False, + depth_only=False, + single_view=True, + reset=True, + ) + ) + assert len(views) == num_views + return views diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/kitti.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/kitti.py new file mode 100644 index 0000000000000000000000000000000000000000..b42f9c01ec697cc9e2cb5fccfeadeea1123ec515 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/kitti.py @@ -0,0 +1,323 @@ +"""KITTI Odometry training dataset loader. + +Mirrors VirtualKITTI2_Multi but for the real KITTI odometry benchmark +(11 sequences 00-10 with ground-truth poses). 
+ +Optional sparse LiDAR depth supervision: when `velodyne_root` is provided, +loads the 64-beam Velodyne scan per frame, applies the `Tr` velo→cam0 +rigid transform from calib.txt, then projects via P2 onto image_2 to +build a sparse depthmap. Pixels without a lidar return are marked -1. + +Layout expected: + ROOT/sequences/<00..10>/image_2/{NNNNNN}.png + ROOT/sequences/<00..10>/calib.txt # P0 P1 P2 P3 + Tr (3x4 each) + ROOT/poses/<00..10>.txt # one 12-float c2w 3x4 row per frame + velodyne_root/sequences/<00..10>/velodyne/{NNNNNN}.bin # (N,4) float32 [x y z r] + +Train/eval split (matching `setup_lingbot_env.sh` and SLAM-eval convention): + train: 00, 01, 02, 03, 04, 05, 06, 07, 08 + test: 09, 10 +""" +import os +import os.path as osp +import sys + +import cv2 +import numpy as np + +sys.path.append(osp.join(osp.dirname(__file__), "..", "..")) + +from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset +from dust3r.utils.image import imread_cv2 + +TRAIN_SEQS = ["00", "01", "02", "03", "04", "05", "06", "07", "08"] +TEST_SEQS = ["09", "10"] + + +def _parse_calib(calib_path: str): + """Read calib.txt; return (P2 (3,4) left-color cam projection, + Tr (3,4) velodyne→cam0 rigid transform, or None if absent).""" + P2 = None + Tr = None + with open(calib_path) as fh: + for line in fh: + line = line.strip() + if line.startswith("P2:"): + vals = list(map(float, line[3:].split())) + P2 = np.array(vals).reshape(3, 4) + elif line.startswith("Tr:") or line.startswith("Tr_velo_to_cam:"): + pref = "Tr_velo_to_cam:" if line.startswith("Tr_velo_to_cam:") else "Tr:" + vals = list(map(float, line[len(pref):].split())) + Tr = np.array(vals).reshape(3, 4) + if P2 is None: + raise RuntimeError(f"P2 not found in {calib_path}") + return P2, Tr + + +def _parse_calib_p2(calib_path: str) -> np.ndarray: + """Backward-compat shim returning only P2.""" + P2, _ = _parse_calib(calib_path) + return P2 + + +def _project_velo_to_depth(velo_pts, P2, Tr, H, W, + min_depth=0.5, 
max_depth=80.0): + """Project a velodyne scan onto image_2 to build a sparse depthmap. + + Args: + velo_pts: (N,4) float32 [x,y,z,reflectance] in velodyne frame. + P2: (3,4) left-color cam projection (cam0 → image_2). + Tr: (3,4) velodyne → cam0 rigid transform. + H, W: target depthmap shape. + Returns: + (H,W) float32 depthmap, -1 where no lidar return. + """ + Tr_h = np.eye(4, dtype=np.float64) + Tr_h[:3, :] = Tr + pts_h = np.concatenate( + [velo_pts[:, :3].astype(np.float64), np.ones((velo_pts.shape[0], 1))], + axis=1, + ) + cam0 = pts_h @ Tr_h.T # (N,4) in cam0 frame + in_front = cam0[:, 2] > min_depth + cam0 = cam0[in_front] + if cam0.shape[0] == 0: + return np.full((H, W), -1.0, dtype=np.float32) + uv_h = cam0 @ P2.T # (M,3) + z = uv_h[:, 2] + valid = z > min_depth + z = z[valid] + u = uv_h[valid, 0] / z + v = uv_h[valid, 1] / z + in_img = (u >= 0) & (u < W) & (v >= 0) & (v < H) & (z < max_depth) + u = u[in_img].astype(np.int32) + v = v[in_img].astype(np.int32) + z = z[in_img] + depthmap = np.full((H, W), -1.0, dtype=np.float32) + if z.size == 0: + return depthmap + # Multiple lidar points hitting one pixel: keep the closest (smallest z). 
+ order = np.argsort(-z) # descending; smallest z written last → wins + depthmap[v[order], u[order]] = z[order].astype(np.float32) + return depthmap + + +def _load_velodyne_bin(bin_path: str) -> np.ndarray: + """Load (N,4) float32 [x,y,z,reflectance] from KITTI .bin file.""" + return np.fromfile(bin_path, dtype=np.float32).reshape(-1, 4) + + +def _load_kitti_poses(poses_path: str) -> np.ndarray: + """Load KITTI poses.txt into (N,4,4) c2w matrices (homogeneous).""" + raw = np.loadtxt(poses_path) # (N, 12) row-major 3x4 + N = raw.shape[0] + out = np.zeros((N, 4, 4), dtype=np.float32) + out[:, :3, :] = raw.reshape(N, 3, 4) + out[:, 3, 3] = 1.0 + return out + + +class KITTI_Multi(BaseMultiViewDataset): + """Real KITTI odometry, image_2 (left color), camera-only supervision.""" + + def __init__(self, ROOT, *args, velodyne_root=None, **kwargs): + self.ROOT = ROOT + # Optional separate root for Velodyne .bin scans (e.g. semantickitti); + # if None, expects them at ROOT/sequences//velodyne/. + self.velodyne_root = velodyne_root + self.video = True + self.is_metric = True # poses are metric + self.max_interval = 4 + super().__init__(*args, **kwargs) + self._load_data(self.split) + + def _load_data(self, split=None): + seq_ids = TRAIN_SEQS if split == "train" else TEST_SEQS + seq_dirs = [] + for sid in seq_ids: + seq_path = osp.join(self.ROOT, "sequences", sid) + if not osp.isdir(seq_path): + continue + img_dir = osp.join(seq_path, "image_2") + poses_path = osp.join(self.ROOT, "poses", f"{sid}.txt") + if not osp.isdir(img_dir) or not osp.isfile(poses_path): + continue + seq_dirs.append(sid) + + if not seq_dirs: + raise RuntimeError(f"No KITTI sequences found at {self.ROOT}") + + offset = 0 + scenes = [] # list of seq ids + seq_intrinsics = [] # (3,3) per seq + seq_p2 = [] # (3,4) P2 per seq (needed for lidar projection) + seq_tr = [] # (3,4) Tr velo→cam0 per seq, or None + seq_velo_dir = [] # absolute velodyne dir per seq, or None + seq_poses = [] # (N_i,4,4) per seq + 
sceneids = [] # per-frame seq idx (global) + scene_img_list = [] # per-seq global frame idx list + start_img_ids = [] + j = 0 + + for sid in seq_dirs: + seq_path = osp.join(self.ROOT, "sequences", sid) + img_dir = osp.join(seq_path, "image_2") + calib_path = osp.join(seq_path, "calib.txt") + poses_path = osp.join(self.ROOT, "poses", f"{sid}.txt") + + P2, Tr = _parse_calib(calib_path) + K = P2[:, :3] # left-color intrinsics + poses_c2w = _load_kitti_poses(poses_path) # (N,4,4) + n_imgs = poses_c2w.shape[0] + + # Resolve velodyne dir (per-sequence). Try ROOT first; fall back to + # velodyne_root if provided. None = no lidar supervision for this seq. + velo_dir = None + velo_calib_path = None + for cand_velo, cand_calib in ( + (osp.join(seq_path, "velodyne"), calib_path), + ( + osp.join(self.velodyne_root, "sequences", sid, "velodyne") if self.velodyne_root else None, + osp.join(self.velodyne_root, "sequences", sid, "calib.txt") if self.velodyne_root else None, + ), + ): + if cand_velo and osp.isdir(cand_velo): + velo_dir = cand_velo + velo_calib_path = cand_calib + break + + # KITTI odometry eval calib.txt may ship without Tr (velo→cam0). + # If so, parse Tr from the velodyne_root's calib.txt (semantickitti + # ships full P0..P3 + Tr per sequence). 
+ if Tr is None and velo_calib_path and osp.isfile(velo_calib_path) and velo_calib_path != calib_path: + _, Tr_velo = _parse_calib(velo_calib_path) + if Tr_velo is not None: + Tr = Tr_velo + + cut_off = ( + self.num_views + if not self.allow_repeat + else max(self.num_views // 3, 3) + ) + if n_imgs < cut_off: + continue + + img_ids = list(np.arange(n_imgs) + offset) + start_img_ids_ = img_ids[: n_imgs - cut_off + 1] + + scenes.append(sid) + seq_intrinsics.append(K.astype(np.float32)) + seq_p2.append(P2.astype(np.float64)) + seq_tr.append(Tr.astype(np.float64) if Tr is not None else None) + seq_velo_dir.append(velo_dir) + seq_poses.append(poses_c2w) + scene_img_list.append(img_ids) + sceneids.extend([j] * n_imgs) + start_img_ids.extend(start_img_ids_) + offset += n_imgs + j += 1 + + self.scenes = scenes + self.seq_intrinsics = seq_intrinsics + self.seq_p2 = seq_p2 + self.seq_tr = seq_tr + self.seq_velo_dir = seq_velo_dir + self.seq_poses = seq_poses + self.sceneids = sceneids + self.start_img_ids = start_img_ids + self.scene_img_list = scene_img_list + + def __len__(self): + return len(self.start_img_ids) + + def get_image_num(self): + return sum(len(p) for p in self.seq_poses) + + def get_stats(self): + return f"{len(self)} groups of views across {len(self.scenes)} KITTI sequences" + + def _get_views(self, idx, resolution, rng, num_views): + start_id = self.start_img_ids[idx] + scene_id = self.sceneids[start_id] + all_image_ids = self.scene_img_list[scene_id] + n_frames = len(all_image_ids) + seq_id_str = self.scenes[scene_id] + seq_path = osp.join(self.ROOT, "sequences", seq_id_str) + img_dir = osp.join(seq_path, "image_2") + K = self.seq_intrinsics[scene_id] + P2 = self.seq_p2[scene_id] + Tr = self.seq_tr[scene_id] + velo_dir = self.seq_velo_dir[scene_id] + has_lidar = velo_dir is not None and Tr is not None + poses = self.seq_poses[scene_id] + + pos, ordered_video = self.get_seq_from_start_id( + num_views, + start_id, + all_image_ids, + rng, + 
max_interval=self.max_interval, + video_prob=1.0, + fix_interval_prob=0.9, + ) + # `pos` are 0-based local positions within all_image_ids + # (matching upstream SLAMFormer get_seq_from_start_id semantics). + local_idxs = np.asarray(pos, dtype=int) + + views = [] + for v, lid in enumerate(local_idxs): + lid = int(lid) + img_path = osp.join(img_dir, f"{lid:06d}.png") + image = imread_cv2(img_path) + H, W = image.shape[:2] + + # If velodyne available, project LiDAR scan to image_2 → sparse depthmap. + # Otherwise emit invalid depthmap and fall back to camera-only. + if has_lidar: + bin_path = osp.join(velo_dir, f"{lid:06d}.bin") + if osp.isfile(bin_path): + velo_pts = _load_velodyne_bin(bin_path) + depthmap = _project_velo_to_depth(velo_pts, P2, Tr, H, W) + frame_has_lidar = (depthmap > 0).any() + else: + depthmap = np.full((H, W), -1.0, dtype=np.float32) + frame_has_lidar = False + else: + depthmap = np.full((H, W), -1.0, dtype=np.float32) + frame_has_lidar = False + + intrinsics = K.copy() + camera_pose = poses[lid].astype(np.float32) + + image, depthmap, intrinsics = self._crop_resize_if_necessary( + image, depthmap, intrinsics, resolution, rng, info=(seq_path, img_path) + ) + + img_mask, ray_mask = self.get_img_and_ray_masks( + self.is_metric, v, rng, p=[0.85, 0.1, 0.05] + ) + + views.append( + dict( + img=image, + depthmap=depthmap, + camera_pose=camera_pose, + camera_intrinsics=intrinsics, + dataset="KITTI", + label=seq_path, + is_metric=self.is_metric, + instance=f"{seq_id_str}/image_2/{lid:06d}.png", + is_video=ordered_video, + quantile=np.array(1.0, dtype=np.float32), + img_mask=img_mask, + ray_mask=ray_mask, + # If frame has any valid lidar return, supervise depth too. 
+ camera_only=not frame_has_lidar, + depth_only=False, + single_view=False, + reset=False, + scene_tag=f"kitti/{seq_id_str}", + ) + ) + assert len(views) == num_views + return views diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/kitti360.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/kitti360.py new file mode 100644 index 0000000000000000000000000000000000000000..3113f845ef0be1ff1b1085b60dc92a21bffc79e5 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/kitti360.py @@ -0,0 +1,354 @@ +"""KITTI-360 training dataset loader (real outdoor). + +cam_00 perspective images (rectified) + poses.txt + perspective.txt P_rect_00 +intrinsics. Optional Velodyne HDL64 sparse depth supervision when data_3d_raw +is on disk (loaded from `velodyne_root`, defaults to ROOT). + +Layout expected: + ROOT/data_2d_raw//image_00/data_rect/{NNNNNNNNNN}.png + ROOT/data_poses//poses.txt # frame_idx + 12 floats c2w + ROOT/calibration/perspective.txt # P_rect_00 + R_rect_00 + ROOT/calibration/calib_cam_to_velo.txt # cam0→velo (3×4) + velodyne_root/data_3d_raw//velodyne_points/data/{NNNNNNNNNN}.bin # optional + +Train/test split (cvlibs convention): + train: 0000, 0002, 0003, 0004, 0005, 0006, 0009 + test: 0007, 0010 +""" +import os +import os.path as osp +import sys + +import cv2 +import numpy as np + +sys.path.append(osp.join(osp.dirname(__file__), "..", "..")) + +from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset +from dust3r.utils.image import imread_cv2 + +TRAIN_SEQS = [ + "2013_05_28_drive_0000_sync", + "2013_05_28_drive_0002_sync", + "2013_05_28_drive_0003_sync", + "2013_05_28_drive_0004_sync", + "2013_05_28_drive_0005_sync", + "2013_05_28_drive_0006_sync", + "2013_05_28_drive_0009_sync", +] +TEST_SEQS = [ + "2013_05_28_drive_0007_sync", + "2013_05_28_drive_0010_sync", +] + + +def _parse_perspective_intrinsics(path: str): + """Parse calibration/perspective.txt → (P_rect_00 (3,4), R_rect_00 (3,3), 
S_rect_00 (W, H)).""" + P_rect = None + R_rect = None + S_rect = None + with open(path) as fh: + for line in fh: + line = line.strip() + if line.startswith("P_rect_00:"): + vals = list(map(float, line.split()[1:])) + P_rect = np.array(vals, dtype=np.float64).reshape(3, 4) + elif line.startswith("R_rect_00:"): + vals = list(map(float, line.split()[1:])) + R_rect = np.array(vals, dtype=np.float64).reshape(3, 3) + elif line.startswith("S_rect_00:"): + vals = list(map(float, line.split()[1:])) + S_rect = (int(vals[0]), int(vals[1])) # (W, H) + if P_rect is None: + raise RuntimeError(f"P_rect_00 missing in {path}") + if R_rect is None: + R_rect = np.eye(3, dtype=np.float64) + return P_rect, R_rect, S_rect + + +def _parse_cam_to_velo(path: str): + """Parse calibration/calib_cam_to_velo.txt → T_cam0_to_velo (4×4 homogeneous). + + File contains a single 3×4 row-major rigid transform (cam0 origin in velo frame). + """ + with open(path) as fh: + line = fh.readline().strip() + vals = list(map(float, line.split())) + if len(vals) != 12: + raise RuntimeError(f"Expected 12 floats in {path}, got {len(vals)}") + T = np.eye(4, dtype=np.float64) + T[:3, :] = np.array(vals, dtype=np.float64).reshape(3, 4) + return T + + +def _load_velodyne_bin(bin_path: str) -> np.ndarray: + """Load (N,4) float32 [x,y,z,reflectance] from KITTI-360 .bin file.""" + return np.fromfile(bin_path, dtype=np.float32).reshape(-1, 4) + + +def _project_velo_to_depth_kitti360(velo_pts, P_rect_00, T_velo_to_cam_rect, H, W, + min_depth=0.5, max_depth=80.0): + """Project KITTI-360 velodyne scan onto image_00 (rectified) → sparse depthmap. + + pixel_h = P_rect_00 @ T_velo_to_cam_rect @ velo_h + where T_velo_to_cam_rect = R_rect_00 @ inv(T_cam0_to_velo) (4×4 incorporating rectification). + Closest-z wins on duplicate pixels. 
+ """ + pts_h = np.concatenate( + [velo_pts[:, :3].astype(np.float64), np.ones((velo_pts.shape[0], 1))], + axis=1, + ) + cam = pts_h @ T_velo_to_cam_rect.T # (N,4) in rectified cam0 frame + in_front = cam[:, 2] > min_depth + cam = cam[in_front] + if cam.shape[0] == 0: + return np.full((H, W), -1.0, dtype=np.float32) + uv_h = cam @ P_rect_00.T # (M,3) + z = uv_h[:, 2] + valid = z > min_depth + z = z[valid] + u = uv_h[valid, 0] / z + v = uv_h[valid, 1] / z + in_img = (u >= 0) & (u < W) & (v >= 0) & (v < H) & (z < max_depth) + u = u[in_img].astype(np.int32) + v = v[in_img].astype(np.int32) + z = z[in_img] + depthmap = np.full((H, W), -1.0, dtype=np.float32) + if z.size == 0: + return depthmap + order = np.argsort(-z) # closest-z written last → wins + depthmap[v[order], u[order]] = z[order].astype(np.float32) + return depthmap + + +def _load_kitti360_poses(path: str): + """Read cam0_to_world.txt → dict[frame_idx] = (4,4) c2w matrix. + + KITTI-360 ships TWO pose files per sequence: + - poses.txt : IMU/system pose (NOT camera pose) + - cam0_to_world.txt : actual camera-to-world for cam_00 + The cam0 file has full 4x4 rows (16 floats); poses.txt is 3x4 (12 floats). + Using poses.txt makes pmap loss inconsistent with depth (~1m offset). + Note: not every frame has a pose (gaps where SLAM failed); skip missing. + """ + raw = np.loadtxt(path) + out = {} + for row in raw: + fid = int(row[0]) + if row.shape[0] >= 17: # cam0_to_world.txt: 1 + 16 + T = row[1:17].reshape(4, 4).astype(np.float32) + else: # poses.txt fallback: 1 + 12 + T = np.eye(4, dtype=np.float32) + T[:3, :] = row[1:13].reshape(3, 4).astype(np.float32) + out[fid] = T + return out + + +class KITTI360_Multi(BaseMultiViewDataset): + """KITTI-360 perspective cam_00. + + Camera-only by default; depth supervision activates per-frame when a Velodyne + .bin scan is present at velodyne_root/data_3d_raw//velodyne_points/data/.bin. 
+ """ + + def __init__(self, ROOT, *args, velodyne_root=None, **kwargs): + self.ROOT = ROOT + # Velodyne root for data_3d_raw/. If None, look under ROOT (in-place download). + self.velodyne_root = velodyne_root if velodyne_root else ROOT + self.video = True + self.is_metric = True + self.max_interval = 4 + super().__init__(*args, **kwargs) + self._load_data(self.split) + + def _load_data(self, split=None): + # Intrinsics + rectification (shared across all KITTI-360 sequences) + calib_dir = osp.join(self.ROOT, "calibration") + P_rect, R_rect, _ = _parse_perspective_intrinsics( + osp.join(calib_dir, "perspective.txt") + ) + self.P_rect_00 = P_rect.copy() + self.K = P_rect[:, :3].copy().astype(np.float32) + + # T_cam0→velo from calib_cam_to_velo.txt; lidar projection needs the inverse, + # composed with R_rect_00 to land in rectified cam0 frame. + cam_to_velo_path = osp.join(calib_dir, "calib_cam_to_velo.txt") + if osp.isfile(cam_to_velo_path): + T_cam_to_velo = _parse_cam_to_velo(cam_to_velo_path) # (4,4) + T_velo_to_cam = np.linalg.inv(T_cam_to_velo) + R_rect_h = np.eye(4, dtype=np.float64) + R_rect_h[:3, :3] = R_rect + self.T_velo_to_cam_rect = R_rect_h @ T_velo_to_cam # (4,4) + else: + self.T_velo_to_cam_rect = None # lidar disabled + + seq_ids = TRAIN_SEQS if split == "train" else TEST_SEQS + scenes = [] + seq_poses = [] # list of (M_i, 4, 4) per scene + seq_frame_ids = [] # list of [frame_idx, ...] (only those with poses + image) + seq_velo_dir = [] # absolute velodyne dir per seq, or None + scene_img_list = [] + sceneids = [] + start_img_ids = [] + offset = 0 + j = 0 + + for sid in seq_ids: + img_dir = osp.join(self.ROOT, "data_2d_raw", sid, "image_00", "data_rect") + pose_path = osp.join(self.ROOT, "data_poses", sid, "cam0_to_world.txt") + if not osp.isdir(img_dir) or not osp.isfile(pose_path): + continue + + poses_dict = _load_kitti360_poses(pose_path) + # Walk image_00/data_rect for available frame_idx files. 
Skip zero-byte + # placeholders left over from partial / aborted downloads (would crash + # imread_cv2 at sample time). + avail = [] + for fname in os.listdir(img_dir): + if not fname.endswith(".png"): + continue + try: + fid = int(osp.splitext(fname)[0]) + except ValueError: + continue + if fid not in poses_dict: + continue + fpath = osp.join(img_dir, fname) + try: + if osp.getsize(fpath) <= 0: + continue + except OSError: + continue + avail.append(fid) + avail.sort() + if not avail: + continue + + poses = np.stack([poses_dict[f] for f in avail], axis=0) + n_imgs = len(avail) + cut_off = ( + self.num_views + if not self.allow_repeat + else max(self.num_views // 3, 3) + ) + if n_imgs < cut_off: + continue + + # Velodyne dir (per-seq); set to None if absent → frame falls back to camera-only. + velo_dir = osp.join( + self.velodyne_root, "data_3d_raw", sid, "velodyne_points", "data" + ) + velo_dir = velo_dir if osp.isdir(velo_dir) else None + + img_ids = list(np.arange(n_imgs) + offset) + start_img_ids_ = img_ids[: n_imgs - cut_off + 1] + + scenes.append(sid) + seq_poses.append(poses) + seq_frame_ids.append(np.asarray(avail, dtype=np.int64)) + seq_velo_dir.append(velo_dir) + scene_img_list.append(img_ids) + sceneids.extend([j] * n_imgs) + start_img_ids.extend(start_img_ids_) + offset += n_imgs + j += 1 + + self.scenes = scenes + self.seq_poses = seq_poses + self.seq_frame_ids = seq_frame_ids + self.seq_velo_dir = seq_velo_dir + self.scene_img_list = scene_img_list + self.sceneids = sceneids + self.start_img_ids = start_img_ids + + def __len__(self): + return len(self.start_img_ids) + + def get_image_num(self): + return sum(len(p) for p in self.seq_poses) + + def get_stats(self): + return f"{len(self)} groups across {len(self.scenes)} KITTI-360 sequences" + + def _get_views(self, idx, resolution, rng, num_views): + start_id = self.start_img_ids[idx] + scene_id = self.sceneids[start_id] + all_image_ids = self.scene_img_list[scene_id] + n_frames = len(all_image_ids) + 
sid = self.scenes[scene_id] + img_dir = osp.join(self.ROOT, "data_2d_raw", sid, "image_00", "data_rect") + frame_ids = self.seq_frame_ids[scene_id] + poses = self.seq_poses[scene_id] + velo_dir = self.seq_velo_dir[scene_id] + K = self.K + has_lidar = velo_dir is not None and self.T_velo_to_cam_rect is not None + + pos, ordered_video = self.get_seq_from_start_id( + num_views, + start_id, + all_image_ids, + rng, + max_interval=self.max_interval, + video_prob=1.0, + fix_interval_prob=0.9, + ) + local_idxs = np.asarray(pos, dtype=int) + + views = [] + for v, lid in enumerate(local_idxs): + lid = int(lid) + fid = int(frame_ids[lid]) + img_path = osp.join(img_dir, f"{fid:010d}.png") + image = imread_cv2(img_path) + H, W = image.shape[:2] + + # If velodyne available, project lidar to image_00 → sparse depthmap. + if has_lidar: + bin_path = osp.join(velo_dir, f"{fid:010d}.bin") + if osp.isfile(bin_path): + velo_pts = _load_velodyne_bin(bin_path) + depthmap = _project_velo_to_depth_kitti360( + velo_pts, self.P_rect_00, self.T_velo_to_cam_rect, H, W + ) + frame_has_lidar = bool((depthmap > 0).any()) + else: + depthmap = np.full((H, W), -1.0, dtype=np.float32) + frame_has_lidar = False + else: + depthmap = np.full((H, W), -1.0, dtype=np.float32) + frame_has_lidar = False + + intrinsics = K.copy() + camera_pose = poses[lid].astype(np.float32) + + image, depthmap, intrinsics = self._crop_resize_if_necessary( + image, depthmap, intrinsics, resolution, rng, info=(img_dir, img_path) + ) + + img_mask, ray_mask = self.get_img_and_ray_masks( + self.is_metric, v, rng, p=[0.85, 0.1, 0.05] + ) + + views.append( + dict( + img=image, + depthmap=depthmap, + camera_pose=camera_pose, + camera_intrinsics=intrinsics, + dataset="KITTI360", + label=img_dir, + is_metric=self.is_metric, + instance=f"{sid}/image_00/{fid:010d}.png", + is_video=ordered_video, + quantile=np.array(1.0, dtype=np.float32), + img_mask=img_mask, + ray_mask=ray_mask, + camera_only=not frame_has_lidar, + depth_only=False, + 
class MegaDepth_Multi(BaseMultiViewDataset):
    """MegaDepth view groups read from ``ROOT/megadepth_sets_64.npz``.

    Each row of ``sets`` holds a scene index in column 0 followed by image
    indices; ``_get_views`` samples ``num_views`` of the indices in columns
    1..64. Scenes whose name starts with ``0015`` or ``0022`` form the
    ``"val"`` split and are excluded from ``"train"``.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Load the set index and restrict it to the requested split.

        Raises:
            ValueError: if ``self.split`` is not ``None``, ``"train"`` or
                ``"val"``.
        """
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        self._load_data(self.split)
        self.is_metric = False
        if self.split is None:
            pass
        elif self.split == "train":
            self.select_scene(("0015", "0022"), opposite=True)
        elif self.split == "val":
            self.select_scene(("0015", "0022"))
        else:
            raise ValueError(f"bad {self.split=}")

        print('DATA: megadepth', len(self))

    def _load_data(self, split):
        """Read ``scenes``, ``images`` and ``sets`` from the index archive.

        ``split`` is accepted for signature compatibility and not used here.
        """
        # NOTE(security): allow_pickle=True executes pickled objects stored in
        # the archive; only load index files from a trusted source.
        with np.load(
            osp.join(self.ROOT, "megadepth_sets_64.npz"), allow_pickle=True
        ) as data:
            self.all_scenes = data["scenes"]
            self.all_images = data["images"]
            self.sets = data["sets"]

    def __len__(self):
        """Number of view groups after split selection."""
        return len(self.sets)

    def get_image_num(self):
        """Number of images listed in the index archive."""
        return len(self.all_images)

    def get_stats(self):
        """Short human-readable summary of the dataset size."""
        return f"{len(self)} groups from {len(self.all_scenes)} scenes"

    def select_scene(self, scene, *instances, opposite=False):
        """Keep only the sets whose scene name starts with ``scene``.

        Args:
            scene: a prefix string or an iterable of prefix strings.
            *instances: not supported; passing any raises.
            opposite: if True keep the complement instead.

        Raises:
            NotImplementedError: if ``instances`` are given.
        """
        scenes = (scene,) if isinstance(scene, str) else tuple(scene)
        scene_id = [s.startswith(scenes) for s in self.all_scenes]
        assert any(scene_id), "no scene found"
        # np.isin replaces np.in1d, which is deprecated in NumPy 2.x.
        valid = np.isin(self.sets[:, 0], np.nonzero(scene_id)[0])
        if instances:
            raise NotImplementedError("selecting instances not implemented")
        if opposite:
            valid = ~valid
        assert valid.any()
        self.sets = self.sets[valid]

    def _get_views(self, idx, resolution, rng, num_views):
        """Sample ``num_views`` images of set ``idx`` and load them.

        Sampling is with replacement only when ``self.allow_repeat`` is true.

        Raises:
            OSError: if an image, depth map or camera file cannot be loaded;
                the original exception is attached as ``__cause__``.
        """
        scene_id = self.sets[idx][0]
        image_idxs = self.sets[idx][1:65]
        replace = bool(self.allow_repeat)
        image_idxs = rng.choice(image_idxs, num_views, replace=replace)
        scene, subscene = self.all_scenes[scene_id].split()
        seq_path = osp.join(self.ROOT, scene, subscene)
        views = []
        for im_id in image_idxs:
            img = self.all_images[im_id]
            try:
                image = imread_pil(osp.join(seq_path, img + ".jpg"))
                depthmap = imread_cv2(osp.join(seq_path, img + ".exr"))
                camera_params = np.load(osp.join(seq_path, img + ".npz"))
            except Exception as e:
                # Chain the cause so the real failure stays in the traceback.
                raise OSError(f"cannot load {img}, got exception {e}") from e
            intrinsics = np.float32(camera_params["intrinsics"])
            camera_pose = np.float32(camera_params["cam2world"])
            image, depthmap, intrinsics = self._crop_resize_if_necessary(
                image, depthmap, intrinsics, resolution, rng, info=(seq_path, img)
            )
            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,  # cam2world
                    camera_intrinsics=intrinsics,
                    dataset="MegaDepth",
                    label=osp.relpath(seq_path, self.ROOT),
                    is_metric=self.is_metric,
                    instance=img,
                    is_video=False,
                    quantile=np.array(0.96, dtype=np.float32),
                    img_mask=True,
                    ray_mask=False,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
class MP3D_Multi(BaseMultiViewDataset):
    """Multi-view dataset that samples co-visible images within a scene.

    Every scene directory under ``ROOT`` provides ``rgb/*.png``,
    ``depth/*.npy``, ``cam/*.npz`` and an ``overlap.npy`` table. The table is
    read as rows of ``(reference image index, other image index, overlap
    score)``.
    """

    def __init__(self, *args, split, ROOT, **kwargs):
        """Index every scene found under ``ROOT``.

        NOTE(review): ``split`` is required but neither stored nor forwarded
        to the base class — confirm that this is intended.
        """
        self.ROOT = ROOT
        self.video = False
        self.is_metric = True
        super().__init__(*args, **kwargs)

        # _load_data() fills attributes and returns None.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Build the global image list and the per-scene lookup tables."""
        scenes = os.listdir(self.ROOT)
        offset = 0
        overlaps = {}
        scene_img_list = {}
        images = []

        for scene in scenes:
            scene_dir = osp.join(self.ROOT, scene)
            rgb_dir = osp.join(scene_dir, "rgb")
            basenames = sorted(
                f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")
            )
            overlaps[scene] = np.load(osp.join(scene_dir, "overlap.npy"))
            num_imgs = len(basenames)

            images.extend(
                (scene, i, basename) for i, basename in enumerate(basenames)
            )
            # Maps a scene-local image index to its index in ``images``.
            scene_img_list[scene] = np.arange(num_imgs) + offset
            offset += num_imgs

        self.scenes = scenes
        self.scene_img_list = scene_img_list
        self.images = images
        self.overlaps = overlaps

    def __len__(self):
        """Return the total number of indexed images."""
        return len(self.images)

    def get_image_num(self):
        """Return the total number of indexed images."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Return a reference view plus ``num_views - 1`` overlapping views.

        The reference image is ``self.images[idx]``. If it has too few
        neighbours with overlap in ``(0.01, 1)``, a new reference is drawn at
        random until one qualifies. Neighbours are sampled with probability
        proportional to their overlap score.

        Args:
            idx: index into ``self.images`` of the initial reference image.
            resolution: forwarded to ``_crop_resize_if_necessary``.
            rng: NumPy random generator.
            num_views: number of views to return (reference included).

        Returns:
            A list of ``num_views`` view dicts, reference view first.
        """
        num_unique = num_views if not self.allow_repeat else max(num_views // 3, 3)
        # Always evaluate the reference at least once. The previous
        # ``while count < num_unique - 1`` form skipped the body entirely when
        # ``num_unique <= 1``, leaving scene/img_idx/overlap_sel unbound.
        while True:
            scene, img_idx, _ = self.images[idx]
            overlap = self.overlaps[scene]
            overlap_sel = overlap[overlap[:, 0] == img_idx]
            overlap_sel = overlap_sel[
                (overlap_sel[:, 2] > 0.01) & (overlap_sel[:, 2] < 1)
            ]
            if len(overlap_sel) >= num_unique - 1:
                break
            idx = rng.choice(len(self.images))

        ref_id = self.scene_img_list[scene][img_idx]
        ids = self.scene_img_list[scene][overlap_sel[:, 1].astype(np.int64)]
        replace = bool(self.allow_repeat)
        if len(overlap_sel) > 0:
            image_idxs = rng.choice(
                ids,
                num_views - 1,
                replace=replace,
                p=overlap_sel[:, 2] / np.sum(overlap_sel[:, 2]),
            )
        else:
            # Only reachable when no extra view is needed (num_views <= 1);
            # an empty probability vector would make rng.choice raise.
            image_idxs = ids[:0]
        image_idxs = np.concatenate([[ref_id], image_idxs])

        ordered_video = False
        views = []
        for v, view_idx in enumerate(image_idxs):
            scene, _, basename = self.images[view_idx]
            scene_dir = osp.join(self.ROOT, scene)
            rgb_path = osp.join(scene_dir, "rgb", basename + ".png")
            depth_path = osp.join(scene_dir, "depth", basename + ".npy")
            cam_path = osp.join(scene_dir, "cam", basename + ".npz")

            rgb_image = imread_cv2(rgb_path, cv2.IMREAD_COLOR)
            depthmap = np.load(depth_path).astype(np.float32)
            depthmap[~np.isfinite(depthmap)] = 0  # invalid
            cam_file = np.load(cam_path)
            intrinsics = cam_file["intrinsics"]
            camera_pose = cam_file["pose"]

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.85, 0.1, 0.05]
            )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap.astype(np.float32),
                    camera_pose=camera_pose.astype(np.float32),
                    camera_intrinsics=intrinsics.astype(np.float32),
                    dataset="mp3d",
                    label=scene + "_" + rgb_path,
                    instance=f"{str(idx)}_{str(view_idx)}",
                    is_metric=self.is_metric,
                    is_video=ordered_video,
                    quantile=np.array(0.99, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
class MVImgNet_Multi(BaseMultiViewDataset):
    """Video-style multi-view dataset over MVImgNet scenes (camera-only).

    Each scene directory under ``ROOT`` provides ``rgb/*.jpg`` and
    ``cam/*.npz``. No depth is available, so the depth map is filled with
    ones and every view is flagged ``camera_only``.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Set dataset flags, run the base initialiser and index the scenes."""
        self.ROOT = ROOT
        self.video = True
        self.is_metric = False
        self.max_interval = 32
        super().__init__(*args, **kwargs)

        # _load_data() fills attributes and returns None.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Index every scene that has enough frames to start a sequence."""
        # Minimum number of frames a scene needs to be usable.
        cut_off = (
            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
        )

        offset = 0
        scenes = []
        sceneids = []
        scene_img_list = []
        images = []
        start_img_ids = []

        for scene in tqdm(os.listdir(self.ROOT)):
            rgb_dir = osp.join(self.ROOT, scene, "rgb")
            basenames = sorted(
                f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".jpg")
            )

            num_imgs = len(basenames)
            if num_imgs < cut_off:
                print(f"Skipping {scene}")
                continue

            img_ids = list(np.arange(num_imgs) + offset)
            # Only frames with at least cut_off - 1 successors may start a sequence.
            start_img_ids.extend(
                (scene, img_id) for img_id in img_ids[: num_imgs - cut_off + 1]
            )
            sceneids.extend([len(scenes)] * num_imgs)
            images.extend(basenames)
            scenes.append(scene)
            scene_img_list.append(img_ids)

            # offset groups
            offset += num_imgs

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.start_img_ids = start_img_ids
        self.scene_img_list = scene_img_list

        # Scenes are flagged here the first time one of their files fails to load.
        self.invalid_scenes = {scene: False for scene in self.scenes}

    def __len__(self):
        """Return the number of valid sequence start positions."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed frames."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load a sequence of ``num_views`` frames starting near ``idx``.

        If any frame fails to load, its scene is marked invalid and a new
        start position is drawn at random until a full sequence is loaded.

        Args:
            idx: index into ``self.start_img_ids``.
            resolution: forwarded to ``_crop_resize_if_necessary``.
            rng: NumPy random generator.
            num_views: number of views to return.

        Returns:
            A list of ``num_views`` view dicts.

        Raises:
            RuntimeError: if every scene has been marked invalid.
        """
        scene, start_id = self.start_img_ids[idx]

        while True:
            if self.invalid_scenes[scene]:
                # Without this guard the resample loop below never terminates
                # once every scene has been flagged.
                if all(self.invalid_scenes.values()):
                    raise RuntimeError("MVImgNet: every scene failed to load")
                while self.invalid_scenes[scene]:
                    idx = rng.integers(low=0, high=len(self.start_img_ids))
                    scene, start_id = self.start_img_ids[idx]

            all_image_ids = self.scene_img_list[self.sceneids[start_id]]
            pos, ordered_video = self.get_seq_from_start_id(
                num_views, start_id, all_image_ids, rng, max_interval=self.max_interval
            )
            image_idxs = np.array(all_image_ids)[pos]

            views = []
            for view_idx in image_idxs:
                scene_id = self.sceneids[view_idx]
                scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
                rgb_dir = osp.join(scene_dir, "rgb")
                cam_dir = osp.join(scene_dir, "cam")

                basename = self.images[view_idx]

                try:
                    # Load RGB image
                    rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".jpg"))
                    # Load depthmap, no depth, set to all ones
                    depthmap = np.ones_like(rgb_image[..., 0], dtype=np.float32)
                    cam = np.load(osp.join(cam_dir, basename + ".npz"))
                    camera_pose = cam["pose"]
                    # NOTE(review): the stored "intrinsics" entries are remapped
                    # as-is from the original code; this looks like a
                    # non-standard storage layout — confirm against the
                    # preprocessing script.
                    intrinsics = np.eye(3)
                    intrinsics[0, 0] = cam["intrinsics"][0, 0]
                    intrinsics[1, 1] = cam["intrinsics"][0, 0]
                    intrinsics[0, 2] = cam["intrinsics"][1, 1]
                    intrinsics[1, 2] = cam["intrinsics"][0, 2]
                except Exception:
                    # Narrowed from a bare ``except`` so Ctrl-C / SystemExit
                    # still propagate.
                    print(f"Error loading {scene} {basename}, skipping")
                    self.invalid_scenes[scene] = True
                    break

                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
                )

                views.append(
                    dict(
                        img=rgb_image,
                        depthmap=depthmap.astype(np.float32),
                        camera_pose=camera_pose.astype(np.float32),
                        camera_intrinsics=intrinsics.astype(np.float32),
                        dataset="MVImgnet",
                        label=self.scenes[scene_id] + "_" + basename,
                        instance=f"{str(idx)}_{str(view_idx)}",
                        is_metric=self.is_metric,
                        is_video=ordered_video,
                        quantile=np.array(0.98, dtype=np.float32),
                        img_mask=True,
                        ray_mask=False,
                        camera_only=True,
                        depth_only=False,
                        single_view=False,
                        reset=False,
                    )
                )
            if len(views) == num_views:
                return views
class Spring(BaseMultiViewDataset):
    """Video-style multi-view dataset over Spring scenes.

    Each scene directory under ``ROOT`` provides ``rgb/*.png``,
    ``depth/*.npy`` and ``cam/*.npz`` files sharing one basename per frame.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Set dataset flags, run the base initialiser and index the scenes."""
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 16
        super().__init__(*args, **kwargs)

        # _load_data() fills attributes and returns None.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Index every scene that has enough frames to start a sequence."""
        self.scenes = os.listdir(self.ROOT)

        kept_scenes = []
        frame_scene_ids = []
        per_scene_ids = []
        frame_names = []
        starts = []
        next_id = 0

        for scene in tqdm(self.scenes):
            rgb_dir = osp.join(osp.join(self.ROOT, scene), "rgb")
            names = [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")]
            names.sort()
            count = len(names)

            # Minimum number of frames a scene needs to be usable.
            if self.allow_repeat:
                cut_off = max(self.num_views // 3, 3)
            else:
                cut_off = self.num_views

            if count < cut_off:
                print(f"Skipping {scene}")
                continue

            ids = list(np.arange(count) + next_id)
            # Only frames with at least cut_off - 1 successors may start a sequence.
            starts.extend(ids[: count - cut_off + 1])
            frame_scene_ids.extend([len(kept_scenes)] * count)
            frame_names.extend(names)
            kept_scenes.append(scene)
            per_scene_ids.append(ids)
            next_id += count

        self.scenes = kept_scenes
        self.sceneids = frame_scene_ids
        self.images = frame_names
        self.start_img_ids = starts
        self.scene_img_list = per_scene_ids

    def __len__(self):
        """Return the number of valid sequence start positions."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed frames."""
        return len(self.images)

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` frames of one scene starting at start slot ``idx``.

        Returns:
            A list of ``num_views`` view dicts.
        """
        first_id = self.start_img_ids[idx]
        frame_ids = self.scene_img_list[self.sceneids[first_id]]
        positions, ordered_video = self.get_seq_from_start_id(
            num_views,
            first_id,
            frame_ids,
            rng,
            max_interval=self.max_interval,
            video_prob=1.0,
            fix_interval_prob=1.0,
        )
        selected = np.array(frame_ids)[positions]

        views = []
        for v, view_idx in enumerate(selected):
            scene_id = self.sceneids[view_idx]
            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
            basename = self.images[view_idx]

            rgb_image = imread_cv2(osp.join(scene_dir, "rgb", basename + ".png"))

            depthmap = np.load(osp.join(scene_dir, "depth", basename + ".npy"))
            # Non-finite depth is marked invalid with 0.
            depthmap[~np.isfinite(depthmap)] = 0

            cam = np.load(osp.join(scene_dir, "cam", basename + ".npz"))
            camera_pose = cam["pose"]
            intrinsics = cam["intrinsics"]

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx
            )

            # Image mask and raymap mask for this view position.
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.85, 0.10, 0.05]
            )

            view = dict(
                img=rgb_image,
                depthmap=depthmap.astype(np.float32),
                camera_pose=camera_pose.astype(np.float32),
                camera_intrinsics=intrinsics.astype(np.float32),
                dataset="spring",
                label=self.scenes[scene_id] + "_" + basename,
                instance=f"{str(idx)}_{str(view_idx)}",
                is_metric=self.is_metric,
                is_video=ordered_video,
                quantile=np.array(1.0, dtype=np.float32),
                img_mask=img_mask,
                ray_mask=ray_mask,
                camera_only=False,
                depth_only=False,
                single_view=False,
                reset=False,
            )
            views.append(view)

        assert len(views) == num_views
        return views
class SynScapes(BaseMultiViewDataset):
    """Single-image dataset over SynScapes; each view is an unrelated image.

    ``ROOT`` holds ``rgb/<n>.png``, ``depth/<n>.npy``, ``sky_mask/<n>.png``
    and ``cam/<n>.npz``, where ``<n>`` is an integer image name.
    """

    def __init__(self, *args, ROOT, **kwargs):
        """Set dataset flags, run the base initialiser and list the images."""
        self.ROOT = ROOT
        self.video = False
        self.is_metric = True
        super().__init__(*args, **kwargs)
        # _load_data() fills attributes and returns None.
        self.loaded_data = self._load_data()

    def _load_data(self):
        """Collect image names from ``rgb/``, sorted by their integer value."""
        names = [
            f[:-4] for f in os.listdir(osp.join(self.ROOT, "rgb")) if f.endswith(".png")
        ]
        names.sort(key=int)
        self.img_names = names

    def __len__(self):
        """Return the number of images."""
        return len(self.img_names)

    def get_image_num(self):
        """Return the number of images."""
        return len(self.img_names)

    def _get_views(self, idx, resolution, rng, num_views):
        """Draw ``num_views`` distinct images and load each as a single view.

        Images are picked by a fresh generator seeded from ``rng`` and
        ``idx``; no camera pose is available, so an identity pose is used.

        Returns:
            A list of ``num_views`` view dicts.
        """
        seed = rng.integers(0, 2**32) + idx
        picker = np.random.default_rng(seed)
        chosen = picker.choice(self.img_names, num_views, replace=False)

        root = self.ROOT
        views = []
        for img_name in chosen:
            rgb_image = imread_cv2(osp.join(root, "rgb", f"{img_name}.png"))
            depthmap = np.load(osp.join(root, "depth", f"{img_name}.npy"))

            sky_png = imread_cv2(osp.join(root, "sky_mask", f"{img_name}.png"))
            sky_mask = sky_png[..., 0] >= 127
            # Sky pixels become -1; non-finite and >200 depths become 0.
            depthmap[sky_mask] = -1.0
            depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0)
            depthmap[depthmap > 200] = 0.0

            cam_file = np.load(osp.join(root, "cam", f"{img_name}.npz"))
            intrinsics = cam_file["intrinsics"]
            # Camera pose is not provided; identity placeholder.
            camera_pose = np.eye(4)

            rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name
            )

            view = dict(
                img=rgb_image,
                depthmap=depthmap.astype(np.float32),
                camera_pose=camera_pose.astype(np.float32),
                camera_intrinsics=intrinsics.astype(np.float32),
                dataset="synscapes",
                label=img_name,
                instance=f"{str(idx)}_{img_name}",
                is_metric=self.is_metric,
                is_video=False,
                quantile=np.array(1.0, dtype=np.float32),
                img_mask=True,
                ray_mask=False,
                camera_only=False,
                depth_only=False,
                single_view=True,
                reset=True,
            )
            views.append(view)

        assert len(views) == num_views
        return views
os.path.join(self.ROOT, scene, mode, d) + for d in os.listdir(os.path.join(self.ROOT, scene, mode)) + if os.path.isdir(os.path.join(self.ROOT, scene, mode, d)) + ] + ) + for seq_dir in seq_dirs: + basenames = sorted( + [f[:-8] for f in os.listdir(seq_dir) if f.endswith(".png")] + ) + num_imgs = len(basenames) + cut_off = ( + self.num_views + if not self.allow_repeat + else max(self.num_views // 3, 3) + ) + + if num_imgs < cut_off: + print(f"Skipping {scene}") + continue + img_ids = list(np.arange(num_imgs) + offset) + start_img_ids_ = img_ids[: num_imgs - cut_off + 1] + + scenes.append(seq_dir) + scene_img_list.append(img_ids) + sceneids.extend([j] * num_imgs) + images.extend(basenames) + start_img_ids.extend(start_img_ids_) + offset += num_imgs + j += 1 + + self.scenes = scenes + self.sceneids = sceneids + self.images = images + self.start_img_ids = start_img_ids + self.scene_img_list = scene_img_list + + def __len__(self): + return len(self.start_img_ids) + + def get_image_num(self): + return len(self.images) + + def get_stats(self): + return f"{len(self)} groups of views" + + def _get_views(self, idx, resolution, rng, num_views): + start_id = self.start_img_ids[idx] + scene_id = self.sceneids[start_id] + all_image_ids = self.scene_img_list[scene_id] + pos, ordered_video = self.get_seq_from_start_id( + num_views, + start_id, + all_image_ids, + rng, + max_interval=self.max_interval, + video_prob=0.8, + fix_interval_prob=0.8, + block_shuffle=16, + ) + image_idxs = np.array(all_image_ids)[pos] + + views = [] + + for v, view_idx in enumerate(image_idxs): + scene_id = self.sceneids[view_idx] + scene_dir = self.scenes[scene_id] + basename = self.images[view_idx] + + img = basename + "_rgb.png" + image = imread_cv2(osp.join(scene_dir, img)) + depthmap = np.load(osp.join(scene_dir, basename + "_depth.npy")) + camera_params = np.load(osp.join(scene_dir, basename + "_cam.npz")) + + intrinsics = camera_params["camera_intrinsics"] + camera_pose = camera_params["camera_pose"] 
+ + sky_mask = depthmap >= 1000 + depthmap[sky_mask] = -1.0 # sky + depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0) + threshold = ( + np.percentile(depthmap[depthmap > 0], 98) + if depthmap[depthmap > 0].size > 0 + else 0 + ) + depthmap[depthmap > threshold] = 0.0 + + image, depthmap, intrinsics = self._crop_resize_if_necessary( + image, depthmap, intrinsics, resolution, rng, info=(scene_dir, img) + ) + + # generate img mask and raymap mask + img_mask, ray_mask = self.get_img_and_ray_masks( + self.is_metric, v, rng, p=[0.75, 0.2, 0.05] + ) + + views.append( + dict( + img=image, + depthmap=depthmap, + camera_pose=camera_pose, # cam2world + camera_intrinsics=intrinsics, + dataset="TartanAir", + label=scene_dir, + is_metric=self.is_metric, + instance=scene_dir + "_" + img, + is_video=ordered_video, + quantile=np.array(1.0, dtype=np.float32), + img_mask=img_mask, + ray_mask=ray_mask, + camera_only=False, + depth_only=False, + single_view=False, + reset=False, + ) + ) + assert len(views) == num_views + return views diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/uasol.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/uasol.py new file mode 100644 index 0000000000000000000000000000000000000000..b91b43bdd6a27691ac5016b22c183ac300d219a9 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/uasol.py @@ -0,0 +1,148 @@ +import os.path as osp +import cv2 +import numpy as np +import itertools +import os +import sys + +sys.path.append(osp.join(osp.dirname(__file__), "..", "..")) +from tqdm import tqdm +from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset +from dust3r.utils.image import imread_cv2 + +import re + + +def extract_number(filename): + match = re.search(r"\d+", filename) + if match: + return int(match.group()) + return 0 + + +class UASOL_Multi(BaseMultiViewDataset): + def __init__(self, *args, ROOT, **kwargs): + self.ROOT = ROOT + self.video = True + self.is_metric = True + 
self.max_interval = 40 + super().__init__(*args, **kwargs) + self.loaded_data = self._load_data() + + def _load_data(self): + self.scenes = os.listdir(self.ROOT) + + offset = 0 + scenes = [] + sceneids = [] + scene_img_list = [] + images = [] + start_img_ids = [] + + j = 0 + for scene in tqdm(self.scenes): + scene_dir = osp.join(self.ROOT, scene) + rgb_dir = osp.join(scene_dir, "rgb") + basenames = sorted( + [f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")], + key=extract_number, + ) + num_imgs = len(basenames) + img_ids = list(np.arange(num_imgs) + offset) + # start_img_ids_ = img_ids[:-self.num_views+1] + cut_off = ( + self.num_views if not self.allow_repeat else max(self.num_views // 3, 3) + ) + start_img_ids_ = img_ids[: num_imgs - cut_off + 1] + + if num_imgs < cut_off: + print(f"Skipping {scene}") + continue + + start_img_ids.extend(start_img_ids_) + sceneids.extend([j] * num_imgs) + images.extend(basenames) + scenes.append(scene) + scene_img_list.append(img_ids) + + # offset groups + offset += num_imgs + j += 1 + + self.scenes = scenes + self.sceneids = sceneids + self.images = images + self.start_img_ids = start_img_ids + self.scene_img_list = scene_img_list + + def __len__(self): + return len(self.start_img_ids) + + def get_image_num(self): + return len(self.images) + + def _get_views(self, idx, resolution, rng, num_views): + start_id = self.start_img_ids[idx] + all_image_ids = self.scene_img_list[self.sceneids[start_id]] + pos, ordered_video = self.get_seq_from_start_id( + num_views, + start_id, + all_image_ids, + rng, + max_interval=self.max_interval, + video_prob=0.75, + fix_interval_prob=0.75, + ) + image_idxs = np.array(all_image_ids)[pos] + + views = [] + for v, view_idx in enumerate(image_idxs): + scene_id = self.sceneids[view_idx] + scene_dir = osp.join(self.ROOT, self.scenes[scene_id]) + rgb_dir = osp.join(scene_dir, "rgb") + depth_dir = osp.join(scene_dir, "depth") + cam_dir = osp.join(scene_dir, "cam") + + basename = 
self.images[view_idx] + + # Load RGB image + rgb_image = imread_cv2(osp.join(rgb_dir, basename + ".png")) + # Load depthmap + depthmap = np.load(osp.join(depth_dir, basename + ".npy")) + depthmap[~np.isfinite(depthmap)] = 0 # invalid + depthmap[depthmap >= 20] = 0 # invalid + + cam = np.load(osp.join(cam_dir, basename + ".npz")) + camera_pose = cam["pose"] + intrinsics = cam["intrinsics"] + rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary( + rgb_image, depthmap, intrinsics, resolution, rng=rng, info=view_idx + ) + + # generate img mask and raymap mask + img_mask, ray_mask = self.get_img_and_ray_masks( + self.is_metric, v, rng, p=[0.75, 0.2, 0.05] + ) + + views.append( + dict( + img=rgb_image, + depthmap=depthmap.astype(np.float32), + camera_pose=camera_pose.astype(np.float32), + camera_intrinsics=intrinsics.astype(np.float32), + dataset="UASOL", + label=self.scenes[scene_id] + "_" + basename, + instance=osp.join(rgb_dir, basename + ".png"), + is_metric=self.is_metric, + is_video=ordered_video, + quantile=np.array(0.9, dtype=np.float32), + img_mask=img_mask, + ray_mask=ray_mask, + camera_only=False, + depth_only=False, + single_view=False, + reset=False, + ) + ) + assert len(views) == num_views + return views diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/urbansyn.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/urbansyn.py new file mode 100644 index 0000000000000000000000000000000000000000..f3654a1200fffc1ae1c23483c752e06452f91310 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/urbansyn.py @@ -0,0 +1,82 @@ +import os.path as osp +import cv2 +import numpy as np +import itertools +import os +import sys + +sys.path.append(osp.join(osp.dirname(__file__), "..", "..")) +from tqdm import tqdm +from dust3r.datasets.base.base_multiview_dataset import BaseMultiViewDataset +from dust3r.utils.image import imread_cv2 + + +class UrbanSyn(BaseMultiViewDataset): + def __init__(self, *args, ROOT, 
**kwargs): + self.ROOT = ROOT + self.video = False + self.is_metric = True + super().__init__(*args, **kwargs) + self.loaded_data = self._load_data() + + def _load_data(self): + rgb_dir = osp.join(self.ROOT, "rgb") + basenames = sorted([f[:-4] for f in os.listdir(rgb_dir) if f.endswith(".png")]) + self.img_names = basenames + + def __len__(self): + return len(self.img_names) + + def get_image_num(self): + return len(self.img_names) + + def _get_views(self, idx, resolution, rng, num_views): + new_seed = rng.integers(0, 2**32) + idx + new_rng = np.random.default_rng(new_seed) + img_names = new_rng.choice(self.img_names, num_views, replace=False) + + views = [] + for img_name in img_names: + # Load RGB image + rgb_image = imread_cv2(osp.join(self.ROOT, "rgb", f"{img_name}.png")) + depthmap = np.load(osp.join(self.ROOT, "depth", f"{img_name}.npy")) + sky_mask = ( + imread_cv2(osp.join(self.ROOT, "sky_mask", f"{img_name}.png"))[..., 0] + >= 127 + ) + depthmap[sky_mask] = -1.0 + depthmap = np.nan_to_num(depthmap, nan=0, posinf=0, neginf=0) + depthmap[depthmap > 200] = 0.0 + + intrinsics = np.load(osp.join(self.ROOT, "cam", f"{img_name}.npz"))[ + "intrinsics" + ] + # camera pose is not provided, placeholder + camera_pose = np.eye(4) + + rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary( + rgb_image, depthmap, intrinsics, resolution, rng=rng, info=img_name + ) + + views.append( + dict( + img=rgb_image, + depthmap=depthmap.astype(np.float32), + camera_pose=camera_pose.astype(np.float32), + camera_intrinsics=intrinsics.astype(np.float32), + dataset="urbansyn", + label=img_name, + instance=f"{str(idx)}_{img_name}", + is_metric=self.is_metric, + is_video=False, + quantile=np.array(1.0, dtype=np.float32), + img_mask=True, + ray_mask=False, + camera_only=False, + depth_only=False, + single_view=True, + reset=True, + ) + ) + assert len(views) == num_views + return views diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/datasets/vkitti2.py 
class VirtualKITTI2_Multi(BaseMultiViewDataset):
    """Video-style multi-view dataset over Virtual KITTI 2 sequences.

    Layout read by this class: ``ROOT/<scene>/<variation>/Camera_{0,1}/``
    containing ``<NNNNN>_rgb.jpg``, ``<NNNNN>_depth.png`` and
    ``<NNNNN>_cam.npz`` per frame.
    """

    def __init__(self, ROOT, *args, **kwargs):
        """Set dataset flags, run the base initialiser and index the sequences."""
        self.ROOT = ROOT
        self.video = True
        self.is_metric = True
        self.max_interval = 5
        super().__init__(*args, **kwargs)
        # loading all
        # NOTE(review): self.split is presumably set by the base __init__ — confirm.
        self._load_data(self.split)

    def _load_data(self, split=None):
        """Index every camera sequence with enough frames.

        Args:
            split: ``"train"`` drops the last scene (sorted order), ``"test"``
                keeps only the last scene, anything else keeps all scenes.
        """
        scene_dirs = sorted(
            d for d in os.listdir(self.ROOT) if os.path.isdir(os.path.join(self.ROOT, d))
        )
        if split == "train":
            scene_dirs = scene_dirs[:-1]
        elif split == "test":
            scene_dirs = scene_dirs[-1:]

        seq_dirs = []
        for scene in scene_dirs:
            seq_dirs += sorted(
                os.path.join(scene, d)
                for d in os.listdir(os.path.join(self.ROOT, scene))
                if os.path.isdir(os.path.join(self.ROOT, scene, d))
            )

        # Minimum number of frames a sequence needs to be usable.
        cut_off = (
            self.num_views if not self.allow_repeat else max(self.num_views // 3, 3)
        )

        offset = 0
        scenes = []
        sceneids = []
        images = []
        scene_img_list = []
        start_img_ids = []

        for seq in seq_dirs:
            seq_path = osp.join(self.ROOT, seq)
            for cam in ["Camera_0", "Camera_1"]:
                cam_seq = seq + "/" + cam
                # The first 5 characters of each file name are the frame id.
                basenames = sorted(
                    f[:5] for f in os.listdir(seq_path + "/" + cam) if f.endswith(".jpg")
                )
                num_imgs = len(basenames)
                if num_imgs < cut_off:
                    # Report the sequence actually skipped; the old message
                    # printed a stale ``scene`` from the loop above.
                    print(f"Skipping {cam_seq}")
                    continue
                img_ids = list(np.arange(num_imgs) + offset)
                # Only frames with at least cut_off - 1 successors may start a sequence.
                start_img_ids_ = img_ids[: num_imgs - cut_off + 1]

                sceneids.extend([len(scenes)] * num_imgs)
                scenes.append(cam_seq)
                scene_img_list.append(img_ids)
                images.extend(basenames)
                start_img_ids.extend(start_img_ids_)
                offset += num_imgs

        self.scenes = scenes
        self.sceneids = sceneids
        self.images = images
        self.start_img_ids = start_img_ids
        self.scene_img_list = scene_img_list

    def __len__(self):
        """Return the number of valid sequence start positions."""
        return len(self.start_img_ids)

    def get_image_num(self):
        """Return the total number of indexed frames."""
        return len(self.images)

    def get_stats(self):
        """Return a short human-readable summary of the dataset size."""
        return f"{len(self)} groups of views"

    def _get_views(self, idx, resolution, rng, num_views):
        """Load ``num_views`` frames of one camera sequence from start slot ``idx``.

        Depth PNG values are divided by 100; resulting depths >= 655 are
        marked as sky with -1.

        Returns:
            A list of ``num_views`` view dicts.

        Raises:
            FileNotFoundError: if a depth image cannot be read.
        """
        start_id = self.start_img_ids[idx]
        scene_id = self.sceneids[start_id]
        all_image_ids = self.scene_img_list[scene_id]
        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            start_id,
            all_image_ids,
            rng,
            max_interval=self.max_interval,
            video_prob=1.0,
            fix_interval_prob=0.9,
        )
        image_idxs = np.array(all_image_ids)[pos]

        views = []

        for v, view_idx in enumerate(image_idxs):
            scene_id = self.sceneids[view_idx]
            scene_dir = osp.join(self.ROOT, self.scenes[scene_id])
            basename = self.images[view_idx]

            img = basename + "_rgb.jpg"
            image = imread_cv2(osp.join(scene_dir, img))

            depth_path = osp.join(scene_dir, basename + "_depth.png")
            raw_depth = cv2.imread(
                depth_path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH
            )
            if raw_depth is None:
                # cv2.imread signals failure by returning None; fail with the
                # path instead of an AttributeError on ``None.astype``.
                raise FileNotFoundError(f"cannot read depth image {depth_path}")
            depthmap = raw_depth.astype(np.float32) / 100.0
            camera_params = np.load(osp.join(scene_dir, basename + "_cam.npz"))

            intrinsics = camera_params["camera_intrinsics"]
            camera_pose = camera_params["camera_pose"]

            sky_mask = depthmap >= 655
            depthmap[sky_mask] = -1.0  # sky

            image, depthmap, intrinsics = self._crop_resize_if_necessary(
                image, depthmap, intrinsics, resolution, rng, info=(scene_dir, img)
            )

            # generate img mask and raymap mask
            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.85, 0.1, 0.05]
            )

            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,  # cam2world
                    camera_intrinsics=intrinsics,
                    dataset="VirtualKITTI2",
                    label=scene_dir,
                    is_metric=self.is_metric,
                    instance=scene_dir + "_" + img,
                    is_video=ordered_video,
                    quantile=np.array(1.0, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    camera_only=False,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                )
            )
        assert len(views) == num_views
        return views
def _decode_jpeg(image_bytes: bytes) -> np.ndarray:
    """Decode encoded image bytes into a 3-channel OpenCV (BGR) array.

    The bytes are viewed as a uint8 buffer and handed to ``cv2.imdecode``
    with ``cv2.IMREAD_COLOR``; its result is returned unchanged.
    """
    return cv2.imdecode(np.frombuffer(image_bytes, dtype=np.uint8), cv2.IMREAD_COLOR)
(None for missing column.) + bi_raw = row.get("[LiDARCalibrationComponent].beam_inclination.values") + bi_vals = ( + np.asarray(bi_raw, dtype=np.float64) if bi_raw is not None else np.empty(0) + ) + if bi_vals.size > 0: + beam_incl = bi_vals + else: + bi_min = float(row["[LiDARCalibrationComponent].beam_inclination.min"]) + bi_max = float(row["[LiDARCalibrationComponent].beam_inclination.max"]) + # Will be resampled to range-image height at projection time. + beam_incl = np.array([bi_min, bi_max], dtype=np.float64) + out[laser] = (ext, beam_incl) + return out + + +def _project_waymo_lidar_to_front( + range_img, # (H_li, W_li, C) where channel 0 = range + lidar_extrinsic, # (4,4) lidar → vehicle + beam_incl, # (H_li,) beam inclinations OR (2,) [min, max] + cam_extrinsic, # (4,4) camera → vehicle + K, # (3,3) intrinsic + img_H, img_W, + min_depth=0.5, max_depth=80.0, +): + """Project Waymo TOP lidar range image to FRONT camera → sparse depthmap. + + Ignores rolling-shutter / per-pixel pose (sub-pixel error for sparse supervision). + Waymo vehicle frame: x=fwd, y=left, z=up. CV camera frame: x=right, y=down, z=fwd. + Camera extrinsic in Waymo is `cam→veh` with cam axes following vehicle convention, + so the cv re-axis happens after lidar→cam. + """ + H_li, W_li = range_img.shape[:2] + ranges = range_img[..., 0].astype(np.float64) # (H_li, W_li) + valid_pix = ranges > 0 + if not valid_pix.any(): + return np.full((img_H, img_W), -1.0, dtype=np.float32) + + # Beam inclination per row. Range-image rows go top-of-cylinder → bottom, + # so inclination decreases (Waymo: highest angle at row 0). + if beam_incl.size == H_li: + incl = beam_incl[::-1] # parquet stores ascending → reverse to descending top→bot + else: + # uniform [min, max] → linear spacing per row (top=max, bottom=min). + bi_min, bi_max = float(beam_incl[0]), float(beam_incl[-1]) + incl = np.linspace(bi_max, bi_min, H_li) + + # Azimuth per column. 
Waymo TOP lidar columns sweep clockwise viewed from above; + # column W/2 is along vehicle +x (forward). Standard Waymo formula: + # az(c) = π − (c + 0.5) * 2π / W + cols = np.arange(W_li, dtype=np.float64) + az = np.pi - (cols + 0.5) * 2 * np.pi / W_li # (W_li,) + + cos_incl = np.cos(incl)[:, None] # (H_li, 1) + sin_incl = np.sin(incl)[:, None] # (H_li, 1) + cos_az = np.cos(az)[None, :] # (1, W_li) + sin_az = np.sin(az)[None, :] # (1, W_li) + + # In lidar frame (Waymo: x=fwd, y=left, z=up before applying extrinsic). + x = ranges * cos_incl * cos_az + y = ranges * cos_incl * sin_az + z = ranges * sin_incl + pts_lidar = np.stack([x, y, z], axis=-1)[valid_pix] # (M, 3) + if pts_lidar.shape[0] == 0: + return np.full((img_H, img_W), -1.0, dtype=np.float32) + + pts_h = np.concatenate( + [pts_lidar, np.ones((pts_lidar.shape[0], 1), dtype=np.float64)], axis=1 + ) + pts_veh = pts_h @ lidar_extrinsic.T # (M, 4) in vehicle frame + T_veh_to_cam = np.linalg.inv(cam_extrinsic) + pts_cam = pts_veh @ T_veh_to_cam.T # (M, 4) in Waymo cam frame (x=fwd,y=left,z=up) + # Re-axis to CV: (x_cv, y_cv, z_cv) = (-y_waymo, -z_waymo, x_waymo) + cv = np.stack([-pts_cam[:, 1], -pts_cam[:, 2], pts_cam[:, 0]], axis=-1) + in_front = cv[:, 2] > min_depth + cv = cv[in_front] + if cv.shape[0] == 0: + return np.full((img_H, img_W), -1.0, dtype=np.float32) + + uv_h = cv @ K.T + z_proj = uv_h[:, 2] + u = uv_h[:, 0] / z_proj + v = uv_h[:, 1] / z_proj + in_img = (u >= 0) & (u < img_W) & (v >= 0) & (v < img_H) & (z_proj < max_depth) + u = u[in_img].astype(np.int32) + v = v[in_img].astype(np.int32) + z_proj = z_proj[in_img] + depthmap = np.full((img_H, img_W), -1.0, dtype=np.float32) + if z_proj.size == 0: + return depthmap + order = np.argsort(-z_proj) # closest-z written last → wins + depthmap[v[order], u[order]] = z_proj[order].astype(np.float32) + return depthmap + + +def _load_lidar_range_images(path: str): + """Read lidar parquet → dict[(laser_name, frame_timestamp_micros)] → (H,W,C) range image. 
+ + We only retain TOP_LIDAR. Returns None if file missing/unreadable. + """ + import pyarrow.parquet as pq + if not osp.isfile(path) or osp.getsize(path) < 1000: + return None + try: + tbl = pq.read_table( + path, + columns=[ + "key.laser_name", + "key.frame_timestamp_micros", + "[LiDARComponent].range_image_return1.values", + "[LiDARComponent].range_image_return1.shape", + ], + ) + except Exception: + return None + df = tbl.to_pandas() + df = df[df["key.laser_name"] == TOP_LIDAR] + out = {} + for _, row in df.iterrows(): + ts = int(row["key.frame_timestamp_micros"]) + shp = np.asarray(row["[LiDARComponent].range_image_return1.shape"], dtype=np.int64) + if shp.size != 3: + continue + H, W, C = int(shp[0]), int(shp[1]), int(shp[2]) + vals = np.asarray( + row["[LiDARComponent].range_image_return1.values"], dtype=np.float32 + ) + if vals.size != H * W * C: + continue + out[ts] = vals.reshape(H, W, C) + return out + + +def _load_camera_calibration(path: str): + """Read camera_calibration parquet → dict[camera_name] = (K (3x3), extrinsic (4x4 cam2vehicle)).""" + import pyarrow.parquet as pq + if not osp.isfile(path) or osp.getsize(path) == 0: + return None + df = pq.read_table(path).to_pandas() + out = {} + for _, row in df.iterrows(): + cam = int(row["key.camera_name"]) + # Intrinsic: 9 floats (fx, fy, cx, cy, k1, k2, p1, p2, k3) in v2 format + intrin = np.asarray(row["[CameraCalibrationComponent].intrinsic.f_u":"[CameraCalibrationComponent].intrinsic.k3"] + if False else row.get("[CameraCalibrationComponent].intrinsic.f_u")) + # Fall back: pyarrow gives separate columns + fx = row["[CameraCalibrationComponent].intrinsic.f_u"] + fy = row["[CameraCalibrationComponent].intrinsic.f_v"] + cx = row["[CameraCalibrationComponent].intrinsic.c_u"] + cy = row["[CameraCalibrationComponent].intrinsic.c_v"] + K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) + # Extrinsic: vehicle→camera as 16-double, but stored as 4x4 row-major + ext = 
np.asarray(row["[CameraCalibrationComponent].extrinsic.transform"]).reshape(4, 4) + out[cam] = (K, ext.astype(np.float32)) + return out + + +# Default FRONT camera intrinsics (used as fallback if calibration parquet missing). +# Approximate values; will be overridden once calibration parquets are downloaded. +_DEFAULT_FRONT_K = np.array( + [[2059.86, 0.0, 938.5], + [0.0, 2056.97, 540.5], + [0.0, 0.0, 1.0]], + dtype=np.float32, +) + + +class Waymo_v2_Multi(BaseMultiViewDataset): + """Waymo Open Dataset v2.0.1, FRONT camera + optional TOP-lidar sparse depth. + + Lidar supervision activates per-segment when matching parquets exist in + {ROOT}/validation/{lidar,lidar_calibration}/. Per-frame, projection only runs + when the range image for that timestamp is present and produces ≥1 valid pixel. + """ + + def __init__(self, ROOT, *args, lidar_root=None, **kwargs): + self.ROOT = ROOT + # Optional alternate root for lidar/lidar_calibration parquets (in case the + # camera and lidar live in separate directory trees). Defaults to ROOT. + self.lidar_root = lidar_root if lidar_root else ROOT + self.video = True + self.is_metric = True + self.max_interval = 4 + super().__init__(*args, **kwargs) + self._load_data(self.split) + + def _load_data(self, split=None): + # Walk training/ + validation/ camera_image parquets → segments. + # All splits use identical schemas; training/ has ~798 segs, validation/ ~202. + # Held-out test = every 4th validation/ segment (~50 segs). + # Train = training/ (all) + remaining 75% of validation/. 
+ train_segments = [] + test_segments = [] + for sub in ("training", "validation"): + ci_dir = osp.join(self.ROOT, sub, "camera_image") + if not osp.isdir(ci_dir): + continue + sub_parquets = sorted(glob(osp.join(ci_dir, "*.parquet"))) + sub_parquets = [p for p in sub_parquets if osp.getsize(p) > 1000] + if sub == "training": + # All training/ segments → train + train_segments.extend(sub_parquets) + else: # validation + # Every 4th → test, rest → train + for i, p in enumerate(sub_parquets): + (test_segments if i % 4 == 0 else train_segments).append(p) + + if split == "train": + parquets = train_segments + else: + parquets = test_segments + if not parquets: + raise RuntimeError( + f"No Waymo segments found for split={split} under {self.ROOT}/{{training,validation}}/camera_image/" + ) + + # Per-segment companion-parquet directories. Camera calibration + lidar + # parquets are looked up by basename within the same split subdirectory. + # Pre-compute the {parquet_path → split} so we can resolve cc/li/lc paths. + def _split_of(p): + # parquet path is .../{ROOT}/{split}/camera_image/.parquet + return osp.basename(osp.dirname(osp.dirname(p))) + + # Lazy: index per-segment metadata only (don't preload images). 
+ segments = [] + seq_poses = [] # list of (M_i, 4, 4) FRONT poses per segment + seq_timestamps = [] # list of [t_us] + seq_calib_K = [] # list of (3,3) intrinsic per segment + seq_cam_extrinsic = [] # (4,4) FRONT camera→vehicle, or None + seq_lidar_path = [] # absolute path to lidar parquet, or None + seq_lidar_extrinsic = [] # (4,4) TOP lidar→vehicle, or None + seq_lidar_beam_incl = [] # (H_li,) or (2,), or None + scene_img_list = [] + sceneids = [] + start_img_ids = [] + offset = 0 + j = 0 + + import pyarrow.parquet as pq + for parquet_path in parquets: + seg_name = osp.splitext(osp.basename(parquet_path))[0] + try: + tbl = pq.read_table( + parquet_path, + columns=[ + "key.camera_name", + "key.frame_timestamp_micros", + "[CameraImageComponent].pose.transform", + ], + ) + except Exception: + continue + df = tbl.to_pandas() + front = df[df["key.camera_name"] == FRONT_CAMERA] + if len(front) < self.num_views: + continue + front = front.sort_values("key.frame_timestamp_micros").reset_index(drop=True) + poses = np.stack( + [np.asarray(p, dtype=np.float64).reshape(4, 4) for p in front["[CameraImageComponent].pose.transform"]], + axis=0, + ).astype(np.float32) + timestamps = front["key.frame_timestamp_micros"].astype(np.int64).values + + # Intrinsics + cam extrinsic (calibration parquet may not exist yet → fallback) + sub = _split_of(parquet_path) + cc_dir = osp.join(self.ROOT, sub, "camera_calibration") + li_dir = osp.join(self.lidar_root, sub, "lidar") + lc_dir = osp.join(self.lidar_root, sub, "lidar_calibration") + + calib_path = osp.join(cc_dir, osp.basename(parquet_path)) + calib = _load_camera_calibration(calib_path) + if calib is not None and FRONT_CAMERA in calib: + K, cam_ext = calib[FRONT_CAMERA] + cam_ext = cam_ext.astype(np.float64) if cam_ext is not None else None + else: + K = _DEFAULT_FRONT_K.copy() + cam_ext = None + + # Lidar calibration + range-image parquet path (per-segment). 
+ lidar_path = osp.join(li_dir, osp.basename(parquet_path)) if osp.isdir(li_dir) else "" + lidar_calib_path = osp.join(lc_dir, osp.basename(parquet_path)) if osp.isdir(lc_dir) else "" + lidar_ext = None + beam_incl = None + if osp.isfile(lidar_calib_path): + lc = _load_lidar_calibration(lidar_calib_path) + if lc is not None and TOP_LIDAR in lc: + lidar_ext, beam_incl = lc[TOP_LIDAR] + lidar_path = lidar_path if osp.isfile(lidar_path) else None + + n_imgs = len(front) + cut_off = ( + self.num_views + if not self.allow_repeat + else max(self.num_views // 3, 3) + ) + if n_imgs < cut_off: + continue + + img_ids = list(np.arange(n_imgs) + offset) + start_img_ids_ = img_ids[: n_imgs - cut_off + 1] + + segments.append((seg_name, parquet_path)) + seq_poses.append(poses) + seq_timestamps.append(timestamps) + seq_calib_K.append(K) + seq_cam_extrinsic.append(cam_ext) + seq_lidar_path.append(lidar_path) + seq_lidar_extrinsic.append(lidar_ext) + seq_lidar_beam_incl.append(beam_incl) + scene_img_list.append(img_ids) + sceneids.extend([j] * n_imgs) + start_img_ids.extend(start_img_ids_) + offset += n_imgs + j += 1 + + if not segments: + raise RuntimeError(f"No usable Waymo segments after filtering (split={split})") + + self.segments = segments + self.seq_poses = seq_poses + self.seq_timestamps = seq_timestamps + self.seq_calib_K = seq_calib_K + self.seq_cam_extrinsic = seq_cam_extrinsic + self.seq_lidar_path = seq_lidar_path + self.seq_lidar_extrinsic = seq_lidar_extrinsic + self.seq_lidar_beam_incl = seq_lidar_beam_incl + self.scene_img_list = scene_img_list + self.sceneids = sceneids + self.start_img_ids = start_img_ids + self.scenes = [s[0] for s in segments] # for compatibility + # Cache: segment_idx → (df with images loaded). LRU at __getitem__ time. + self._image_cache = {} + # Cache: segment_idx → dict[ts] = range image (top-laser only). 
    def _get_views(self, idx, resolution, rng, num_views):
        """Build the list of ``num_views`` view dicts for sample ``idx``.

        Picks a frame sequence inside one segment, decodes the FRONT-camera
        JPEGs, optionally projects the TOP-lidar range image of each frame to
        a sparse depthmap, and packs everything into per-view dicts.

        Args:
            idx: Index into ``self.start_img_ids`` (one entry per valid
                sequence start).
            resolution: Target resolution, forwarded to
                ``_crop_resize_if_necessary``.
            rng: Random generator; consumed, in this order, by
                ``get_seq_from_start_id`` and then per view by
                ``_crop_resize_if_necessary`` and ``get_img_and_ray_masks``.
            num_views: Number of views to return.

        Returns:
            List of ``num_views`` dicts (image, depthmap, pose, intrinsics,
            masks and bookkeeping flags).

        Raises:
            RuntimeError: If a frame's JPEG cannot be decoded.
        """
        start_id = self.start_img_ids[idx]
        scene_id = self.sceneids[start_id]
        all_image_ids = self.scene_img_list[scene_id]
        n_frames = len(all_image_ids)  # NOTE(review): unused below
        seg_name = self.scenes[scene_id]
        poses = self.seq_poses[scene_id]
        timestamps = self.seq_timestamps[scene_id]
        K = self.seq_calib_K[scene_id]
        cam_ext = self.seq_cam_extrinsic[scene_id]
        lidar_ext = self.seq_lidar_extrinsic[scene_id]
        beam_incl = self.seq_lidar_beam_incl[scene_id]
        # Lidar projection needs all three calibrations; any missing one
        # disables depth for the whole segment.
        has_lidar_calib = (
            cam_ext is not None and lidar_ext is not None and beam_incl is not None
        )

        pos, ordered_video = self.get_seq_from_start_id(
            num_views,
            start_id,
            all_image_ids,
            rng,
            max_interval=self.max_interval,
            video_prob=1.0,
            fix_interval_prob=0.9,
        )
        # `pos` is used directly as a per-segment frame index below —
        # presumably positions into all_image_ids; verify against
        # get_seq_from_start_id.
        local_idxs = np.asarray(pos, dtype=int)

        # Lazy-load images + range images for this segment
        df = self._get_segment_images(scene_id)
        lidar_imgs = self._get_segment_lidar(scene_id) if has_lidar_calib else None
        views = []
        for v, lid in enumerate(local_idxs):
            lid = int(lid)
            img_bytes = df.iloc[lid]["[CameraImageComponent].image"]
            image = _decode_jpeg(bytes(img_bytes))
            if image is None:
                raise RuntimeError(f"Failed to decode JPEG for {seg_name} frame {lid}")
            H, W = image.shape[:2]

            # Project TOP-lidar range image → sparse depthmap when available.
            # Range images are looked up by the camera frame's timestamp; a
            # frame with no match gets an all -1 ("no depth") map.
            frame_has_lidar = False
            if has_lidar_calib and lidar_imgs is not None:
                ts = int(timestamps[lid])
                rimg = lidar_imgs.get(ts)
                if rimg is not None:
                    depthmap = _project_waymo_lidar_to_front(
                        rimg, lidar_ext, beam_incl, cam_ext, K, H, W
                    )
                    frame_has_lidar = bool((depthmap > 0).any())
                else:
                    depthmap = np.full((H, W), -1.0, dtype=np.float32)
            else:
                depthmap = np.full((H, W), -1.0, dtype=np.float32)

            intrinsics = K.copy()
            # Waymo `pose.transform` is c2w with cam axes (x=fwd, y=left, z=up).
            # Re-axis to OpenCV cam frame (x=right, y=down, z=fwd) so the pose
            # composes correctly with depth-back-projected points (which use K
            # in CV convention, matching the re-axis step inside
            # `_project_waymo_lidar_to_front`).
            camera_pose = (poses[lid] @ T_AXIS_CV_TO_WAYMO).astype(np.float32)

            image, depthmap, intrinsics = self._crop_resize_if_necessary(
                image, depthmap, intrinsics, resolution, rng, info=(seg_name, f"{lid}.jpg")
            )

            img_mask, ray_mask = self.get_img_and_ray_masks(
                self.is_metric, v, rng, p=[0.85, 0.1, 0.05]
            )

            views.append(
                dict(
                    img=image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,
                    camera_intrinsics=intrinsics,
                    dataset="WaymoV2",
                    label=seg_name,
                    is_metric=self.is_metric,
                    instance=f"{seg_name}/FRONT/{lid}.jpg",
                    is_video=ordered_video,
                    quantile=np.array(1.0, dtype=np.float32),
                    img_mask=img_mask,
                    ray_mask=ray_mask,
                    # True when this frame produced no positive lidar depth.
                    camera_only=not frame_has_lidar,
                    depth_only=False,
                    single_view=False,
                    reset=False,
                    scene_tag=f"waymo/{seg_name}",
                )
            )
        assert len(views) == num_views
        return views
def head_factory(
    head_type,
    output_mode,
    net,
    has_conf=False,
    has_depth=False,
    has_rgb=False,
    has_pose_conf=False,
    has_pose=False,
):
    """Build a prediction head for the decoder.

    Supported ``(head_type, output_mode)`` combinations:

    * ``("linear", "pts3d")``         → ``LinearPts3d``
    * ``("linear", "pts3d+pose")``    → ``LinearPts3dPose``
    * ``("linear", "pts3d+desc<N>")`` → ``LinearPts3d_Desc`` with ``N``
      local-feature channels (``N`` is parsed from the mode string)
    * ``("dpt", "pts3d+pose")``       → ``DPTPts3dPose``

    Args:
        head_type: ``"linear"`` or ``"dpt"``.
        output_mode: Output specification string, see above.
        net: The network the head is attached to; passed to the head class.
        has_conf, has_depth, has_rgb, has_pose_conf, has_pose: Optional
            output switches, forwarded to the heads that accept them.

    Returns:
        The constructed head module.

    Raises:
        NotImplementedError: For any other combination, including
            ``("dpt", "pts3d")``, which this module does not provide.
        ValueError: If the suffix of a ``"pts3d+desc<N>"`` mode is not an int.
    """
    desc_prefix = "pts3d+desc"
    if head_type == "linear":
        if output_mode == "pts3d":
            return LinearPts3d(net, has_conf, has_depth, has_rgb, has_pose_conf)
        if output_mode == "pts3d+pose":
            return LinearPts3dPose(net, has_conf, has_rgb, has_pose)
        if output_mode.startswith(desc_prefix):
            local_feat_dim = int(output_mode[len(desc_prefix):])
            return LinearPts3d_Desc(net, has_conf, has_depth, local_feat_dim)
    elif head_type == "dpt" and output_mode == "pts3d+pose":
        return DPTPts3dPose(net, has_conf, has_rgb, has_pose)
    raise NotImplementedError(f"unexpected {head_type=} and {output_mode=}")
class DPTOutputAdapter_fix(DPTOutputAdapter):
    """
    croco's DPTOutputAdapter, adjusted for dust3r: the per-stage
    ``act_N_postprocess`` attributes are dropped after init, and forward
    takes the token list plus an optional image size.
    """

    def init(self, dim_tokens_enc=768):
        super().init(dim_tokens_enc)

        # Remove the four individually-named post-processing attributes;
        # forward() only goes through self.act_postprocess.
        for stage in (1, 2, 3, 4):
            delattr(self, f"act_{stage}_postprocess")

    def forward(self, encoder_tokens: List[torch.Tensor], image_size=None):
        assert (
            self.dim_tokens_enc is not None
        ), "Need to call init(dim_tokens_enc) function first"

        if image_size is None:
            image_size = self.image_size
        height, width = image_size

        # Token-grid size for this image.
        grid_h = height // (self.stride_level * self.P_H)
        grid_w = width // (self.stride_level * self.P_W)

        # Select the hooked layers, then run each stage over all of them.
        feats = [encoder_tokens[hook] for hook in self.hooks]
        feats = [self.adapt_tokens(tok) for tok in feats]
        feats = [
            rearrange(tok, "b (nh nw) c -> b c nh nw", nh=grid_h, nw=grid_w)
            for tok in feats
        ]
        feats = [self.act_postprocess[i](fmap) for i, fmap in enumerate(feats)]
        feats = [self.scratch.layer_rn[i](fmap) for i, fmap in enumerate(feats)]

        # Coarse-to-fine fusion; the deepest path is cropped to the spatial
        # size of the next level before being merged.
        target_h, target_w = feats[2].shape[2], feats[2].shape[3]
        fused = self.scratch.refinenet4(feats[3])[:, :, :target_h, :target_w]
        fused = self.scratch.refinenet3(fused, feats[2])
        fused = self.scratch.refinenet2(fused, feats[1])
        fused = self.scratch.refinenet1(fused, feats[0])

        return self.head(fused)
net.pose_mode + + self.has_conf = has_conf + self.has_rgb = has_rgb + self.has_pose = has_pose + + pts_channels = 3 + has_conf + rgb_channels = has_rgb * 3 + feature_dim = 256 + last_dim = feature_dim // 2 + ed = net.enc_embed_dim + dd = net.dec_embed_dim + hooks_idx = [0, 1, 2, 3] + dim_tokens = [ed, dd, dd, dd] + head_type = "regression" + output_width_ratio = 1 + + pts_dpt_args = dict( + output_width_ratio=output_width_ratio, + num_channels=pts_channels, + feature_dim=feature_dim, + last_dim=last_dim, + dim_tokens=dim_tokens, + hooks_idx=hooks_idx, + head_type=head_type, + ) + rgb_dpt_args = dict( + output_width_ratio=output_width_ratio, + num_channels=rgb_channels, + feature_dim=feature_dim, + last_dim=last_dim, + dim_tokens=dim_tokens, + hooks_idx=hooks_idx, + head_type=head_type, + ) + if hooks_idx is not None: + pts_dpt_args.update(hooks=hooks_idx) + rgb_dpt_args.update(hooks=hooks_idx) + + self.dpt_self = DPTOutputAdapter_fix(**pts_dpt_args) + dpt_init_args = {} if dim_tokens is None else {"dim_tokens_enc": dim_tokens} + self.dpt_self.init(**dpt_init_args) + + self.final_transform = nn.ModuleList( + [ + ConditionModulationBlock( + net.dec_embed_dim, + net.dec_num_heads, + mlp_ratio=4.0, + qkv_bias=True, + rope=net.rope, + ) + for _ in range(2) + ] + ) + + self.dpt_cross = DPTOutputAdapter_fix(**pts_dpt_args) + dpt_init_args = {} if dim_tokens is None else {"dim_tokens_enc": dim_tokens} + self.dpt_cross.init(**dpt_init_args) + + if has_rgb: + self.dpt_rgb = DPTOutputAdapter_fix(**rgb_dpt_args) + dpt_init_args = {} if dim_tokens is None else {"dim_tokens_enc": dim_tokens} + self.dpt_rgb.init(**dpt_init_args) + + if has_pose: + in_dim = net.dec_embed_dim + self.pose_head = PoseDecoder(hidden_size=in_dim) + + def forward(self, x, img_info, **kwargs): + if self.has_pose: + pose_token = x[-1][:, 0].clone() + token = x[-1][:, 1:] + with torch.cuda.amp.autocast(enabled=False): + pose = self.pose_head(pose_token) + + token_cross = token.clone() + for blk in 
self.final_transform: + token_cross = blk(token_cross, pose_token, kwargs.get("pos")) + x = x[:-1] + [token] + x_cross = x[:-1] + [token_cross] + + with torch.cuda.amp.autocast(enabled=False): + self_out = checkpoint( + self.dpt_self, + x, + image_size=(img_info[0], img_info[1]), + use_reentrant=False, + ) + + final_output = postprocess(self_out, self.depth_mode, self.conf_mode) + final_output["pts3d_in_self_view"] = final_output.pop("pts3d") + final_output["conf_self"] = final_output.pop("conf") + + if self.has_rgb: + rgb_out = checkpoint( + self.dpt_rgb, + x, + image_size=(img_info[0], img_info[1]), + use_reentrant=False, + ) + rgb_output = postprocess_rgb(rgb_out) + final_output.update(rgb_output) + + if self.has_pose: + pose = postprocess_pose(pose, self.pose_mode) + final_output["camera_pose"] = pose # B,7 + cross_out = checkpoint( + self.dpt_cross, + x_cross, + image_size=(img_info[0], img_info[1]), + use_reentrant=False, + ) + tmp = postprocess(cross_out, self.depth_mode, self.conf_mode) + final_output["pts3d_in_other_view"] = tmp.pop("pts3d") + final_output["conf"] = tmp.pop("conf") + return final_output diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/heads/linear_head.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/heads/linear_head.py new file mode 100644 index 0000000000000000000000000000000000000000..081cf21de8252c9ed51882cedf2ecae0c8364985 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/heads/linear_head.py @@ -0,0 +1,346 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
class LinearPts3d(nn.Module):
    """
    Linear head for dust3r.

    Each decoder token is projected to a ``patch_size x patch_size`` patch of
    per-pixel outputs: 3D points (+ confidence), and optionally self-view
    points, RGB and a pose-confidence map.
    """

    def __init__(
        self, net, has_conf=False, has_depth=False, has_rgb=False, has_pose_conf=False
    ):
        super().__init__()
        self.patch_size = net.patch_embed.patch_size[0]
        self.depth_mode = net.depth_mode
        self.conf_mode = net.conf_mode
        self.has_conf = has_conf
        self.has_rgb = has_rgb
        self.has_pose_conf = has_pose_conf
        self.has_depth = has_depth
        self.proj = Mlp(
            net.dec_embed_dim, out_features=(3 + has_conf) * self.patch_size**2
        )
        if has_depth:
            self.self_proj = Mlp(
                net.dec_embed_dim, out_features=(3 + has_conf) * self.patch_size**2
            )
        if has_rgb:
            self.rgb_proj = Mlp(net.dec_embed_dim, out_features=3 * self.patch_size**2)
        if has_pose_conf:
            # forward() reads self.pose_conf_proj whenever has_pose_conf is
            # set, so it must exist; without it that branch raised
            # AttributeError.
            # NOTE(review): one output channel per pixel is assumed here —
            # confirm against what postprocess_pose_conf expects.
            self.pose_conf_proj = Mlp(
                net.dec_embed_dim, out_features=self.patch_size**2
            )

    def setup(self, croconet):
        pass

    def _unpatchify(self, feat, B, H, W):
        """Reshape per-token features (B, S, C*p*p) into a map (B, C, H, W)."""
        feat = feat.transpose(-1, -2).view(
            B, -1, H // self.patch_size, W // self.patch_size
        )
        return F.pixel_shuffle(feat, self.patch_size)

    def forward(self, decout, img_shape):
        """Run the head on the last decoder output.

        Args:
            decout: Sequence of decoder outputs; only ``decout[-1]`` (B, S, D)
                is used.
            img_shape: ``(H, W)`` of the image the tokens came from.

        Returns:
            Dict with ``pts3d_in_other_view`` plus whatever ``postprocess``
            adds, and — depending on the flags — ``pts3d_in_self_view`` /
            ``conf_self``, the RGB outputs and the pose-confidence outputs.
        """
        H, W = img_shape
        tokens = decout[-1]
        B, S, D = tokens.shape

        feat = self._unpatchify(self.proj(tokens), B, H, W)  # B,3(+1),H,W
        final_output = postprocess(feat, self.depth_mode, self.conf_mode)
        final_output["pts3d_in_other_view"] = final_output.pop("pts3d")

        if self.has_depth:
            self_feat = self._unpatchify(self.self_proj(tokens), B, H, W)
            self_3d_output = postprocess(self_feat, self.depth_mode, self.conf_mode)
            self_3d_output["pts3d_in_self_view"] = self_3d_output.pop("pts3d")
            self_3d_output["conf_self"] = self_3d_output.pop("conf")
            final_output.update(self_3d_output)

        if self.has_rgb:
            rgb_feat = self._unpatchify(self.rgb_proj(tokens), B, H, W)  # B,3,H,W
            final_output.update(postprocess_rgb(rgb_feat))

        if self.has_pose_conf:
            pose_conf = self._unpatchify(self.pose_conf_proj(tokens), B, H, W)
            final_output.update(postprocess_pose_conf(pose_conf))

        return final_output
class LinearPts3dPoseDirect(nn.Module):
    """
    Linear head that predicts self-view 3D points per token and, when a pose
    token is present, obtains the other-view points by applying the predicted
    camera pose to the self-view points.
    """

    def __init__(self, net, has_conf=False, has_rgb=False, has_pose=False):
        super().__init__()
        patch = net.patch_embed.patch_size[0]
        embed_dim = net.dec_embed_dim

        self.patch_size = patch
        self.depth_mode = net.depth_mode
        self.conf_mode = net.conf_mode
        self.pose_mode = net.pose_mode
        self.has_conf = has_conf
        self.has_rgb = has_rgb
        self.has_pose = has_pose

        self.proj = Mlp(embed_dim, out_features=(3 + has_conf) * patch**2)
        if has_rgb:
            self.rgb_proj = Mlp(embed_dim, out_features=3 * patch**2)
        if has_pose:
            self.pose_head = PoseDecoder(hidden_size=embed_dim)
        if has_conf:
            self.cross_conf_proj = Mlp(embed_dim, out_features=patch**2)

    def setup(self, croconet):
        pass

    def forward(self, decout, img_shape):
        H, W = img_shape
        tokens = decout[-1]
        if self.has_pose:
            # The first token carries the pose; the rest are patch tokens.
            pose_token, tokens = tokens[:, 0], tokens[:, 1:]
        B, S, D = tokens.shape
        grid_h, grid_w = H // self.patch_size, W // self.patch_size

        def to_pixels(projection):
            # (B, S, C*p*p) -> (B, C, H, W)
            per_token = projection(tokens)
            grid = per_token.transpose(-1, -2).view(B, -1, grid_h, grid_w)
            return F.pixel_shuffle(grid, self.patch_size)

        final_output = postprocess(
            to_pixels(self.proj), self.depth_mode, self.conf_mode
        )
        final_output["pts3d_in_self_view"] = final_output.pop("pts3d")
        final_output["conf_self"] = final_output.pop("conf")

        if self.has_rgb:
            final_output.update(postprocess_rgb(to_pixels(self.rgb_proj)))

        if self.has_pose:
            raw_pose = self.pose_head(pose_token)
            final_output["camera_pose"] = postprocess_pose(raw_pose, self.pose_mode)  # B,7
            cam = pose_encoding_to_camera(final_output["camera_pose"])
            final_output["pts3d_in_other_view"] = geotrf(
                cam, final_output["pts3d_in_self_view"]
            )

        if self.has_conf:
            cross_conf = to_pixels(self.cross_conf_proj)[:, 0]
            final_output["conf"] = reg_dense_conf(cross_conf, mode=self.conf_mode)
        return final_output
nn.ModuleList( + [ + ConditionModulationBlock( + net.dec_embed_dim, + net.dec_num_heads, + mlp_ratio=4.0, + qkv_bias=True, + rope=net.rope, + ) + for _ in range(2) + ] + ) + self.cross_proj = Mlp( + net.dec_embed_dim, + hidden_features=int(mlp_ratio * net.dec_embed_dim), + out_features=(3 + has_conf) * self.patch_size**2, + ) + + def setup(self, croconet): + pass + + def forward(self, decout, img_shape, **kwargs): + H, W = img_shape + tokens = decout[-1] + if self.has_pose: + pose_token = tokens[:, 0] + tokens = tokens[:, 1:] + with torch.cuda.amp.autocast(enabled=False): + pose = self.pose_head(pose_token) + cross_tokens = tokens + for blk in self.final_transform: + cross_tokens = blk(cross_tokens, pose_token, kwargs.get("pos")) + + with torch.cuda.amp.autocast(enabled=False): + B, S, D = tokens.shape + + feat = self.proj(tokens) # B,S,D + feat = feat.transpose(-1, -2).view( + B, -1, H // self.patch_size, W // self.patch_size + ) + feat = F.pixel_shuffle(feat, self.patch_size) # B,3,H,W + final_output = postprocess( + feat, self.depth_mode, self.conf_mode, pos_z=True + ) + final_output["pts3d_in_self_view"] = final_output.pop("pts3d") + final_output["conf_self"] = final_output.pop("conf") + + if self.has_rgb: + rgb_feat = self.rgb_proj(tokens) + rgb_feat = rgb_feat.transpose(-1, -2).view( + B, -1, H // self.patch_size, W // self.patch_size + ) + rgb_feat = F.pixel_shuffle(rgb_feat, self.patch_size) # B,3,H,W + rgb_output = postprocess_rgb(rgb_feat) + final_output.update(rgb_output) + + if self.has_pose: + pose = postprocess_pose(pose, self.pose_mode) + final_output["camera_pose"] = pose # B,7 + + cross_feat = self.cross_proj(cross_tokens) # B,S,D + cross_feat = cross_feat.transpose(-1, -2).view( + B, -1, H // self.patch_size, W // self.patch_size + ) + cross_feat = F.pixel_shuffle(cross_feat, self.patch_size) # B,3,H,W + tmp = postprocess(cross_feat, self.depth_mode, self.conf_mode) + final_output["pts3d_in_other_view"] = tmp.pop("pts3d") + final_output["conf"] = 
def postprocess_rgb(out, eps=1e-6):
    """
    Turn raw RGB head output into colour values strictly inside (-1, 1).

    The channel axis is moved last, a sigmoid squashed into
    ``[eps, 1 - eps]`` is applied, and the result is rescaled to be
    centred on zero.
    """
    channels_last = out.permute(0, 2, 3, 1)  # B,H,W,3
    squashed = torch.sigmoid(channels_last) * (1 - 2 * eps) + eps
    return {"rgb": (squashed - 0.5) * 2}
def reg_dense_depth(xyz, mode, pos_z=False):
    """
    Extract 3D points from prediction head output.

    Args:
        xyz: tensor whose last axis holds the raw (x, y, z) prediction.
        mode: ``(name, vmin, vmax)`` triple; ``name`` is one of ``"linear"``,
            ``"square"`` or ``"exp"``. Only unbounded ranges
            (``vmin=-inf``, ``vmax=+inf``) are accepted.
        pos_z: when true (non-linear modes only), each point is multiplied
            by the sign of its z component so the returned z is non-negative.

    Returns:
        Tensor with the same shape as ``xyz``. The input is never modified.

    Raises:
        AssertionError: if the range in ``mode`` is bounded.
        ValueError: if the mode name is unknown.
    """
    mode, vmin, vmax = mode

    no_bounds = (vmin == -float("inf")) and (vmax == float("inf"))
    assert no_bounds

    if mode == "linear":
        if no_bounds:
            return xyz  # [-inf, +inf]
        return xyz.clip(min=vmin, max=vmax)

    if pos_z:
        # Out-of-place on purpose: `xyz` is usually a view of the head output,
        # and an in-place multiply would flip the caller's tensor as well (and
        # raises outright on a leaf tensor that requires grad).
        # NOTE: torch.sign(0) == 0, so a point with z exactly 0 maps to the origin.
        xyz = xyz * torch.sign(xyz[..., -1:])

    # Split into direction and magnitude, then re-scale the magnitude.
    d = xyz.norm(dim=-1, keepdim=True)
    xyz = xyz / d.clip(min=1e-8)

    if mode == "square":
        return xyz * d.square()

    if mode == "exp":
        return xyz * torch.expm1(d)

    raise ValueError(f"bad {mode=}")
def custom_sort_key(key):
    """
    Build a sort key that orders ``"name/<int>"`` strings by name, then number.

    Keys without a ``/`` sort by the whole string with ``-1`` as the number.
    For keys with several ``/``, only the first and last components are used.
    """
    parts = key.split("/")
    if len(parts) <= 1:
        return (key, -1)
    return (parts[0], int(parts[-1]))
re.search(r"(\d+)$", key) + if match: + + num_part = int(match.group()) + add_number + + new_key = re.sub(r"(\d+)$", str(num_part), key, 1) + new_dict[new_key] = value + else: + new_dict[key] = value + new_dict = old_dict | new_dict + return {k: new_dict[k] for k in sorted(new_dict.keys(), key=custom_sort_key)} + + +def _interleave_imgs(img1, img2): + res = {} + for key, value1 in img1.items(): + value2 = img2[key] + if isinstance(value1, torch.Tensor): + value = torch.stack((value1, value2), dim=1).flatten(0, 1) + else: + value = [x for pair in zip(value1, value2) for x in pair] + res[key] = value + return res + + +def make_batch_symmetric(batch): + view1, view2 = batch + view1, view2 = (_interleave_imgs(view1, view2), _interleave_imgs(view2, view1)) + return view1, view2 + + +def loss_of_one_batch( + batch, + model, + criterion, + accelerator: Accelerator, + teacher=None, + symmetrize_batch=False, + use_amp=False, + ret=None, + img_mask=None, + inference=False, +): + if len(batch) > 2: + assert ( + symmetrize_batch is False + ), "cannot symmetrize batch with more than 2 views" + if symmetrize_batch: + batch = make_batch_symmetric(batch) + if "valid_mask" in batch[0]: + query_pts = sample_query_points(batch[0]['valid_mask'], M=64).to(device=batch[0]["img"].device) + else: + query_pts = None + dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 + with torch.cuda.amp.autocast(dtype=dtype): + if False:#inference: + with torch.no_grad(): + output = model.inference(batch, query_pts) + preds, batch = output.ress, output.views + result = dict(views=batch, pred=preds) + return result[ret] if ret else result + else: + output_frontend, output_mix, output_backend = model(batch, query_pts) + preds_f, batch_f = output_frontend.ress, output_frontend.views + preds_m, batch_m = output_mix.ress, output_mix.views + preds_b, batch_b = output_backend.ress, output_backend.views + + + + if teacher is not None: + with torch.no_grad(): + knowledge = 
teacher.inference(batch_f, query_pts) + gts, batch = knowledge.ress, knowledge.views + + with torch.cuda.amp.autocast(enabled=False): + loss_f = criterion(gts, preds_f) if criterion is not None else None + loss_m = criterion(gts, preds_m) if criterion is not None else None + loss_b = criterion(gts, preds_b) if criterion is not None else None + total = (loss_f[0]+loss_m[0]*10+loss_b[0]) + details = dict() + details['Lcamera_frontend'] = loss_f[1]['Lcamera'] + details['Ldepth_frontend'] = loss_f[1]['Ldepth'] + details['Lpmap_frontend'] = loss_f[1]['Lpmap'] + details['Ltrack_frontend'] = loss_f[1]['Ltrack'] + + details['Lcamera_mix'] = loss_m[1]['Lcamera'] + details['Ldepth_mix'] = loss_m[1]['Ldepth'] + details['Lpmap_mix'] = loss_m[1]['Lpmap'] + details['Ltrack_mix'] = loss_m[1]['Ltrack'] + + details['Lcamera_backend'] = loss_b[1]['Lcamera'] + details['Ldepth_backend'] = loss_b[1]['Ldepth'] + details['Lpmap_backend'] = loss_b[1]['Lpmap'] + details['Ltrack_backend'] = loss_b[1]['Ltrack'] + + details['total'] = total + loss = [total, details] + + else: + with torch.cuda.amp.autocast(enabled=False): + loss_f = criterion(batch_f, preds_f) if criterion is not None else None + loss_m = criterion(batch_m, preds_m) if criterion is not None else None + loss_b = criterion(batch_b, preds_b) if criterion is not None else None + + total = (loss_f[0]+loss_m[0]+loss_b[0]*10) + + details = dict() + details['Lcamera_frontend'] = loss_f[1]['Lcamera'] + details['Ldepth_frontend'] = loss_f[1]['Ldepth'] + details['Lpmap_frontend'] = loss_f[1]['Lpmap'] + details['Ltrack_frontend'] = loss_f[1]['Ltrack'] + + details['Lcamera_mix'] = loss_m[1]['Lcamera'] + details['Ldepth_mix'] = loss_m[1]['Ldepth'] + details['Lpmap_mix'] = loss_m[1]['Lpmap'] + details['Ltrack_mix'] = loss_m[1]['Ltrack'] + + details['Lcamera_backend'] = loss_b[1]['Lcamera'] + details['Ldepth_backend'] = loss_b[1]['Ldepth'] + details['Lpmap_backend'] = loss_b[1]['Lpmap'] + details['Ltrack_backend'] = loss_b[1]['Ltrack'] + + 
def get_pred_pts3d(gt, pred, use_pose=False, inplace=False):
    """
    Return the predicted 3D points stored in ``pred``.

    The prediction dict is inspected in this order:

    1. ``depth`` + ``pseudo_focal``: points are rebuilt with
       ``depthmap_to_pts3d``, using the principal point read from
       ``gt["camera_intrinsics"]`` when that key exists.
    2. ``pts3d``: used as is.
    3. ``pts3d_in_other_view``: returned directly (cloned unless ``inplace``);
       ``use_pose`` must be ``True`` in that case.

    For cases 1 and 2, ``use_pose=True`` additionally applies
    ``pred["camera_pose"]`` to the points through ``geotrf``.

    Raises:
        ValueError: if ``pred`` contains none of the recognised keys
            (previously this surfaced as an ``UnboundLocalError``).
        AssertionError: if ``use_pose`` is required but not set, or if
            ``camera_pose`` is missing while ``use_pose`` is true.
    """
    if "depth" in pred and "pseudo_focal" in pred:
        try:
            pp = gt["camera_intrinsics"][..., :2, 2]
        except KeyError:
            pp = None
        pts3d = depthmap_to_pts3d(**pred, pp=pp)

    elif "pts3d" in pred:
        pts3d = pred["pts3d"]

    elif "pts3d_in_other_view" in pred:
        assert use_pose is True
        cross_pts = pred["pts3d_in_other_view"]
        return cross_pts if inplace else cross_pts.clone()

    else:
        # Fail loudly instead of hitting an unbound local further down.
        raise ValueError(
            f"cannot extract 3D points from prediction with keys {sorted(pred)}"
        )

    if use_pose:
        camera_pose = pred.get("camera_pose")
        assert camera_pose is not None
        pts3d = geotrf(camera_pose, pts3d)

    return pts3d
def Sum(*losses_and_masks):
    """
    Combine ``(loss, mask)`` pairs.

    If the first loss is not a scalar (per-pixel losses), the pairs are
    returned untouched as a tuple. Otherwise the scalar losses of all pairs
    are added together and the masks are ignored.
    """
    first_loss, _ = losses_and_masks[0]
    if first_loss.ndim > 0:
        # we are actually returning the loss for every pixel
        return losses_and_masks

    # we are returning the global loss
    total = first_loss
    for extra_loss, _ in losses_and_masks[1:]:
        total = total + extra_loss
    return total
class MultiLoss(nn.Module):
    """Easily combinable losses (also keep track of individual loss values):
        loss = MyLoss1() + 0.1*MyLoss2()
    Usage:
        Inherit from this class and override get_name() and compute_loss()

    Internally a sum of losses is a singly linked chain: ``_alpha`` is the
    weight of this term and ``_loss2`` points to the next term (or ``None``).
    """

    def __init__(self):
        super().__init__()
        self._alpha = 1
        self._loss2 = None

    def compute_loss(self, *args, **kwargs):
        """Return either a loss tensor or a ``(loss, details_dict)`` tuple."""
        raise NotImplementedError()

    def get_name(self):
        """Return the display name used in ``repr`` and in the details dict."""
        raise NotImplementedError()

    def _clone_node(self):
        """Shallow-copy this term so that it no longer shares its child registry."""
        dup = copy(self)
        # copy() of an nn.Module shares the `_modules` dict with the source, so
        # re-linking `_loss2` on the copy would silently re-link the original.
        dup._modules = dup._modules.copy()
        return dup

    def __mul__(self, alpha):
        """Return a copy of this term weighted by the scalar ``alpha``."""
        assert isinstance(alpha, (int, float))
        res = copy(self)
        res._alpha = alpha
        return res

    __rmul__ = __mul__  # same

    def __add__(self, loss2):
        """Return a new chain ``self + loss2`` without modifying ``self``."""
        assert isinstance(loss2, MultiLoss)
        res = cur = self._clone_node()
        # Walk to the end of the chain, copying every node on the way so that
        # the terms of `self` are never mutated (previously `a + c` appended
        # `c` to the tail shared with `a` itself).
        while cur._loss2 is not None:
            nxt = cur._loss2._clone_node()
            cur._loss2 = nxt
            cur = nxt
        cur._loss2 = loss2
        return res

    def __repr__(self):
        name = self.get_name()
        if self._alpha != 1:
            name = f"{self._alpha:g}*{name}"
        if self._loss2:
            name = f"{name} + {self._loss2}"
        return name

    def forward(self, *args, **kwargs):
        """
        Evaluate this term and every following term of the chain.

        Returns:
            ``(loss, details)`` where ``loss`` is the weighted sum of all terms
            and ``details`` merges the per-term detail dictionaries.
        """
        loss = self.compute_loss(*args, **kwargs)
        if isinstance(loss, tuple):
            loss, details = loss
        elif loss.ndim == 0:
            details = {self.get_name(): float(loss)}
        else:
            details = {}
        loss = loss * self._alpha

        if self._loss2:
            loss2, details2 = self._loss2(*args, **kwargs)
            loss = loss + loss2
            details |= details2

        return loss, details
class RGBLoss(Criterion, MultiLoss):
    """Photometric loss between predicted ``rgb`` maps and the input images."""

    def __init__(self, criterion):
        super().__init__(criterion)
        self.ssim = SSIM()

    def img_loss(self, a, b):
        """Apply the wrapped criterion to one (prediction, target) pair."""
        return self.criterion(a, b)

    def compute_loss(self, gts, preds, **kw):
        """
        Average the per-view image loss.

        Args:
            gts: per-view dicts; ``gt["img"]`` is permuted from channel-first
                to channel-last before comparison.
            preds: per-view dicts providing ``pred["rgb"]``.

        Returns:
            ``(loss, details)``; ``details`` holds one float per view plus the
            predicted RGB tensors themselves.
        """
        targets = [gt["img"].permute(0, 2, 3, 1) for gt in gts]
        predictions = [pred["rgb"] for pred in preds]
        per_view = [
            self.img_loss(predicted, target)
            for predicted, target in zip(predictions, targets)
        ]

        tag = type(self).__name__
        details = {}
        for rank, view_loss in enumerate(per_view, start=1):
            details[tag + f"_rgb/{rank}"] = float(view_loss)
            details[f"pred_rgb_{rank}"] = predictions[rank - 1]

        return sum(per_view) / len(per_view), details
class ScaleInvLoss(BaseCriterion):
    """scale invariant loss"""

    def __init__(self, reduction="none"):
        super().__init__(reduction)

    def forward(self, pred, gt, mask):
        """
        Compute the masked scale-invariant distance and reduce it.

        ``pred`` and ``gt`` must be 4-dimensional tensors of identical shape.
        The reduction is ``"none"``, ``"sum"`` or ``"mean"`` (an empty result
        reduces to a zero scalar in ``"mean"`` mode).
        """
        assert pred.shape == gt.shape and pred.ndim == 4, f"Bad shape = {pred.shape}"
        dist = self.distance(pred, gt, mask)
        if self.reduction == "none":
            return dist
        if self.reduction == "sum":
            return dist.sum()
        if self.reduction == "mean":
            if dist.numel() > 0:
                return dist.mean()
            return dist.new_zeros(())
        raise ValueError(f"bad {self.reduction=} mode")

    def distance(self, pred, gt, mask):
        """
        Per-pixel Euclidean distance after normalising each sample by the
        mean norm of its masked points; only masked entries are returned.
        """
        # Number of valid pixels per sample (clamped to avoid dividing by zero).
        valid_count = mask.sum(dim=(1, 2)).clamp(min=1e-6)

        def mean_masked_norm(points):
            return (torch.norm(points, dim=-1) * mask).sum(dim=(1, 2)) / valid_count

        pred_scale = mean_masked_norm(pred).view(-1, 1, 1, 1).clamp(min=1e-6)
        gt_scale = mean_masked_norm(gt).view(-1, 1, 1, 1).clamp(min=1e-6)
        return torch.norm(pred / pred_scale - gt / gt_scale, dim=-1)[mask]
+ + P1 = RT1 @ D1 + P2 = RT2 @ D2 + loss1 = (I @ pred_D1) - (RT1^-1 @ RT1 @ D1) + loss2 = (RT21 @ pred_D2) - (RT1^-1 @ P2) + = (RT21 @ pred_D2) - (RT1^-1 @ RT2 @ D2) + """ + + def __init__( + self, + criterion, + norm_mode="?avg_dis", + gt_scale=False, + sky_loss_value=2, + max_metric_scale=False, + ): + super().__init__(criterion) + if norm_mode.startswith("?"): + # do no norm pts from metric scale datasets + self.norm_all = False + self.norm_mode = norm_mode[1:] + else: + self.norm_all = True + self.norm_mode = norm_mode + self.gt_scale = gt_scale + + self.sky_loss_value = sky_loss_value + self.max_metric_scale = max_metric_scale + + def get_norm_factor_point_cloud( + self, pts_cross, valids, conf_cross, norm_self_only=False + ): + pts = [x for x in pts_cross] + valids = [x for x in valids] + confs = [x for x in conf_cross] + norm_factor = normalize_pointcloud_group( + pts, self.norm_mode, valids, confs, ret_factor_only=True + ) + return norm_factor + + def get_norm_factor_poses(self, gt_trans, pr_trans, not_metric_mask): + + if self.norm_mode and not self.gt_scale: + gt_trans = [x[:, None, None, :].clone() for x in gt_trans] + valids = [torch.ones_like(x[..., 0], dtype=torch.bool) for x in gt_trans] + norm_factor_gt = ( + normalize_pointcloud_group( + gt_trans, + self.norm_mode, + valids, + ret_factor_only=True, + ) + .squeeze(-1) + .squeeze(-1) + ) + else: + norm_factor_gt = torch.ones( + len(gt_trans), dtype=gt_trans[0].dtype, device=gt_trans[0].device + ) + + norm_factor_pr = norm_factor_gt.clone() + if self.norm_mode and not_metric_mask.sum() > 0 and not self.gt_scale: + pr_trans_not_metric = [ + x[not_metric_mask][:, None, None, :].clone() for x in pr_trans + ] + valids = [ + torch.ones_like(x[..., 0], dtype=torch.bool) + for x in pr_trans_not_metric + ] + norm_factor_pr_not_metric = ( + normalize_pointcloud_group( + pr_trans_not_metric, + self.norm_mode, + valids, + ret_factor_only=True, + ) + .squeeze(-1) + .squeeze(-1) + ) + 
norm_factor_pr[not_metric_mask] = norm_factor_pr_not_metric + return norm_factor_gt, norm_factor_pr + + def get_all_pts3d( + self, + gts, + preds, + dist_clip=None, + norm_self_only=False, + norm_pose_separately=False, + eps=1e-3, + camera1=None, + ): + # everything is normalized w.r.t. camera of view1 + in_camera1 = inv(gts[0]["camera_pose"]) if camera1 is None else inv(camera1) + gt_pts_cross = [geotrf(in_camera1, gt["pts3d"]) for gt in gts] + valids = [gt["valid_mask"].clone() for gt in gts] + camera_only = gts[0]["camera_only"] + + if dist_clip is not None: + # points that are too far-away == invalid + dis = [gt_pt.norm(dim=-1) for gt_pt in gt_pts_cross] + valids = [valid & (dis <= dist_clip) for valid, dis in zip(valids, dis)] + + pr_pts_cross = [pred["pts3d_in_other_view"] for pred in preds] + conf_cross = [torch.log(pred["conf"]).detach().clip(eps) for pred in preds] + + # valids = torch.stack(valids, dim=0) # S B H W + # valids = valids.permute(1, 0, 2, 3) # B S H W + # valids_masks = preprocess_mask(valids, mode="pad") # (B, S, H, W) + # + # valids = torch.unbind(valids_masks, dim=1) # [S] (B, H, W) + + if not self.norm_all: + if self.max_metric_scale: + B = valids[0].shape[0] + dist = [ + torch.where(valid, torch.linalg.norm(gt_pt_cross, dim=-1), 0).view( + B, -1 + ) + for valid, gt_pt_cross in zip(valids, gt_pts_cross) + ] + for d in dist: + gts[0]["is_metric"] = gts[0]["is_metric_scale"] & ( + d.max(dim=-1).values < self.max_metric_scale + ) + not_metric_mask = ~gts[0]["is_metric"] + else: + not_metric_mask = torch.ones_like(gts[0]["is_metric"]) + + # normalize 3d points + # compute the scale using only the self view point maps + if self.norm_mode and not self.gt_scale: + norm_factor_gt = self.get_norm_factor_point_cloud( + gt_pts_cross, + valids, + conf_cross, + norm_self_only=norm_self_only, + ) + else: + norm_factor_gt = torch.ones_like( + preds[0]["pts3d_in_other_view"][:, :1, :1, :1] + ) + + norm_factor_pr = norm_factor_gt.clone() + if 
self.norm_mode and not_metric_mask.sum() > 0 and not self.gt_scale: + norm_factor_pr_not_metric = self.get_norm_factor_point_cloud( + [pr_pt_cross[not_metric_mask] for pr_pt_cross in pr_pts_cross], + [valid[not_metric_mask] for valid in valids], + [conf[not_metric_mask] for conf in conf_cross], + norm_self_only=norm_self_only, + ) + norm_factor_pr[not_metric_mask] = norm_factor_pr_not_metric + + norm_factor_gt = norm_factor_gt.clip(eps) + norm_factor_pr = norm_factor_pr.clip(eps) + + gt_pts_cross = [pts / norm_factor_gt for pts in gt_pts_cross] + pr_pts_cross = [pts / norm_factor_pr for pts in pr_pts_cross] + + # [(Bx3, BX4), (BX3, BX4), ...], 3 for translation, 4 for quaternion + gt_poses = [ + camera_to_pose_encoding(in_camera1 @ gt["camera_pose"]).clone() + for gt in gts + ] + pr_poses = [pred["camera_pose"].clone() for pred in preds] + pose_norm_factor_gt = norm_factor_gt.clone().squeeze(2, 3) + pose_norm_factor_pr = norm_factor_pr.clone().squeeze(2, 3) + + if norm_pose_separately: + gt_trans = [gt[:, :3] for gt in gt_poses] + pr_trans = [pr[:, :3] for pr in pr_poses] + pose_norm_factor_gt, pose_norm_factor_pr = self.get_norm_factor_poses( + gt_trans, pr_trans, not_metric_mask + ) + elif any(camera_only): + gt_trans = [gt[:, :3] for gt in gt_poses] + pr_trans = [pr[:, :3] for pr in pr_poses] + pose_only_norm_factor_gt, pose_only_norm_factor_pr = ( + self.get_norm_factor_poses(gt_trans, pr_trans, not_metric_mask) + ) + pose_norm_factor_gt = torch.where( + camera_only[:, None], pose_only_norm_factor_gt, pose_norm_factor_gt + ) + pose_norm_factor_pr = torch.where( + camera_only[:, None], pose_only_norm_factor_pr, pose_norm_factor_pr + ) + + gt_poses = [ + (gt[:, :3] / pose_norm_factor_gt.clip(eps), gt[:, 3:]) for gt in gt_poses + ] + pr_poses = [ + (pr[:, :3] / pose_norm_factor_pr.clip(eps), pr[:, 3:]) for pr in pr_poses + ] + pose_masks = (pose_norm_factor_gt.squeeze(-1) > eps) & ( + pose_norm_factor_pr.squeeze(-1) > eps + ) + + + skys = [gt["sky_mask"] & ~valid 
    def get_all_pts3d_with_scale_loss(
        self,
        gts,
        preds,
        dist_clip=None,
        norm_self_only=False,
        norm_pose_separately=False,
        eps=1e-3,
    ):
        """Collect normalized GT/predicted point maps and poses, plus a scale loss.

        Ground-truth points are expressed both in each view's own camera frame
        ("self") and in the frame of the first view ("cross").  Points and pose
        translations are divided by normalization factors obtained from
        ``self.get_norm_factor_point_cloud`` / ``self.get_norm_factor_poses``.

        Args:
            gts: list of per-view ground-truth dicts.  Keys read here:
                ``camera_pose``, ``pts3d``, ``valid_mask``, ``sky_mask``, and on
                ``gts[0]`` also ``camera_only``, ``is_metric`` and (when
                ``self.max_metric_scale`` is truthy) ``is_metric_scale``.
            preds: list of per-view prediction dicts.  Keys read here:
                ``pts3d_in_self_view``, ``pts3d_in_other_view``, ``conf_self``,
                ``conf``, ``camera_pose``.
            dist_clip: if not None, GT points farther than this from the first
                camera are marked invalid.
            norm_self_only: forwarded to ``get_norm_factor_point_cloud``.
            norm_pose_separately: if True, pose translations get their own
                normalization factors from ``get_norm_factor_poses``.
            eps: lower clip used for the log-confidences and the norm factors.

        Returns:
            A 10-tuple ``(gt_pts_self, gt_pts_cross, pr_pts_self, pr_pts_cross,
            gt_poses, pr_poses, valids, skys, pose_masks, monitoring)`` where
            ``gt_poses``/``pr_poses`` are lists of ``(translation, quaternion)``
            pairs and ``monitoring`` is ``{"scale_loss": ...}``.

        Raises:
            NotImplementedError: if ``self.norm_mode`` is falsy.

        Side effects:
            May overwrite ``gts[0]["is_metric"]`` in place.
        """
        # everything is normalized w.r.t. camera of view1
        in_camera1 = inv(gts[0]["camera_pose"])
        gt_pts_self = [geotrf(inv(gt["camera_pose"]), gt["pts3d"]) for gt in gts]
        gt_pts_cross = [geotrf(in_camera1, gt["pts3d"]) for gt in gts]
        valids = [gt["valid_mask"].clone() for gt in gts]
        camera_only = gts[0]["camera_only"]

        if dist_clip is not None:
            # points that are too far-away == invalid
            dis = [gt_pt.norm(dim=-1) for gt_pt in gt_pts_cross]
            valids = [valid & (dis <= dist_clip) for valid, dis in zip(valids, dis)]

        pr_pts_self = [pred["pts3d_in_self_view"] for pred in preds]
        pr_pts_cross = [pred["pts3d_in_other_view"] for pred in preds]
        # Detached log-confidences, clipped from below at eps.
        conf_self = [torch.log(pred["conf_self"]).detach().clip(eps) for pred in preds]
        conf_cross = [torch.log(pred["conf"]).detach().clip(eps) for pred in preds]

        if not self.norm_all:
            if self.max_metric_scale:
                B = valids[0].shape[0]
                # Per-view distances of valid GT points (invalid ones set to 0), flattened per sample.
                dist = [
                    torch.where(valid, torch.linalg.norm(gt_pt_cross, dim=-1), 0).view(
                        B, -1
                    )
                    for valid, gt_pt_cross in zip(valids, gt_pts_cross)
                ]
                # NOTE(review): every iteration restarts from "is_metric_scale", so only
                # the last view's max distance decides the flag -- confirm that this is
                # intended rather than an AND over all views.
                for d in dist:
                    gts[0]["is_metric"] = gts[0]["is_metric_scale"] & (
                        d.max(dim=-1).values < self.max_metric_scale
                    )
            not_metric_mask = ~gts[0]["is_metric"]
        else:
            not_metric_mask = torch.ones_like(gts[0]["is_metric"])

        # normalize 3d points
        # compute the scale using only the self view point maps
        if self.norm_mode and not self.gt_scale:
            norm_factor_gt = self.get_norm_factor_point_cloud(
                gt_pts_self[:1],
                gt_pts_cross[:1],
                valids[:1],
                conf_self[:1],
                conf_cross[:1],
                norm_self_only=norm_self_only,
            )
        else:
            # No GT normalization: use a factor of ones shaped like one point per sample.
            norm_factor_gt = torch.ones_like(
                preds[0]["pts3d_in_other_view"][:, :1, :1, :1]
            )

        if self.norm_mode:
            norm_factor_pr = self.get_norm_factor_point_cloud(
                pr_pts_self[:1],
                pr_pts_cross[:1],
                valids[:1],
                conf_self[:1],
                conf_cross[:1],
                norm_self_only=norm_self_only,
            )
        else:
            raise NotImplementedError
        # only add loss to metric scale norm factor
        if (~not_metric_mask).sum() > 0:
            pts_scale_loss = torch.abs(
                norm_factor_pr[~not_metric_mask] - norm_factor_gt[~not_metric_mask]
            ).mean()
        else:
            pts_scale_loss = 0.0

        # The scale loss above uses the raw factors; clip only before dividing.
        norm_factor_gt = norm_factor_gt.clip(eps)
        norm_factor_pr = norm_factor_pr.clip(eps)

        gt_pts_self = [pts / norm_factor_gt for pts in gt_pts_self]
        gt_pts_cross = [pts / norm_factor_gt for pts in gt_pts_cross]
        pr_pts_self = [pts / norm_factor_pr for pts in pr_pts_self]
        pr_pts_cross = [pts / norm_factor_pr for pts in pr_pts_cross]

        # [(Bx3, BX4), (BX3, BX4), ...], 3 for translation, 4 for quaternion
        gt_poses = [
            camera_to_pose_encoding(in_camera1 @ gt["camera_pose"]).clone()
            for gt in gts
        ]
        pr_poses = [pred["camera_pose"].clone() for pred in preds]
        # By default the pose translations share the point-cloud norm factors.
        pose_norm_factor_gt = norm_factor_gt.clone().squeeze(2, 3)
        pose_norm_factor_pr = norm_factor_pr.clone().squeeze(2, 3)

        if norm_pose_separately:
            gt_trans = [gt[:, :3] for gt in gt_poses][:1]
            pr_trans = [pr[:, :3] for pr in pr_poses][:1]
            pose_norm_factor_gt, pose_norm_factor_pr = self.get_norm_factor_poses(
                gt_trans, pr_trans, torch.ones_like(not_metric_mask)
            )
        elif any(camera_only):
            # Camera-only samples get pose-derived factors; the others keep the
            # point-cloud factors.
            gt_trans = [gt[:, :3] for gt in gt_poses][:1]
            pr_trans = [pr[:, :3] for pr in pr_poses][:1]
            pose_only_norm_factor_gt, pose_only_norm_factor_pr = (
                self.get_norm_factor_poses(
                    gt_trans, pr_trans, torch.ones_like(not_metric_mask)
                )
            )
            pose_norm_factor_gt = torch.where(
                camera_only[:, None], pose_only_norm_factor_gt, pose_norm_factor_gt
            )
            pose_norm_factor_pr = torch.where(
                camera_only[:, None], pose_only_norm_factor_pr, pose_norm_factor_pr
            )
        # only add loss to metric scale norm factor
        if (~not_metric_mask).sum() > 0:
            pose_scale_loss = torch.abs(
                pose_norm_factor_pr[~not_metric_mask]
                - pose_norm_factor_gt[~not_metric_mask]
            ).mean()
        else:
            pose_scale_loss = 0.0
        # Normalize translations only; the rotation part (columns 3:) is passed through.
        gt_poses = [
            (gt[:, :3] / pose_norm_factor_gt.clip(eps), gt[:, 3:]) for gt in gt_poses
        ]
        pr_poses = [
            (pr[:, :3] / pose_norm_factor_pr.clip(eps), pr[:, 3:]) for pr in pr_poses
        ]

        # A pose is usable only where both norm factors exceed eps.
        # NOTE(review): the argument-less squeeze() also drops the batch dimension
        # when it has size 1 -- confirm downstream consumers accept a 0-d mask.
        pose_masks = (pose_norm_factor_gt.squeeze() > eps) & (
            pose_norm_factor_pr.squeeze() > eps
        )

        if any(camera_only):
            # this is equal to a loss for camera intrinsics
            # For camera-only samples, divide by the last coordinate (clipped at 1e-6)
            # and clip the result to [-2, 2].
            gt_pts_self = [
                torch.where(
                    camera_only[:, None, None, None],
                    (gt / gt[..., -1:].clip(1e-6)).clip(-2, 2),
                    gt,
                )
                for gt in gt_pts_self
            ]
            pr_pts_self = [
                torch.where(
                    camera_only[:, None, None, None],
                    (pr / pr[..., -1:].clip(1e-6)).clip(-2, 2),
                    pr,
                )
                for pr in pr_pts_self
            ]
        # # do not add cross view loss when there is only camera supervision

        # Sky pixels: flagged as sky and not already valid.
        skys = [gt["sky_mask"] & ~valid for gt, valid in zip(gts, valids)]
        return (
            gt_pts_self,
            gt_pts_cross,
            pr_pts_self,
            pr_pts_cross,
            gt_poses,
            pr_poses,
            valids,
            skys,
            pose_masks,
            {"scale_loss": pose_scale_loss + pts_scale_loss},
        )
pr_quats[:, None, :, :].repeat(1, pr_quats.shape[1], 1, 1)[ + masks + ] + + gt_rel_trans, gt_rel_quats = relative_pose_absT_quatR( + gt_trans_matrix1, gt_quats_matrix1, gt_trans_matrix2, gt_quats_matrix2 + ) + pr_rel_trans, pr_rel_quats = relative_pose_absT_quatR( + pr_trans_matrix1, pr_quats_matrix1, pr_trans_matrix2, pr_quats_matrix2 + ) + rel_trans_err = torch.norm(gt_rel_trans - pr_rel_trans, dim=-1) + rel_quats_err = torch.norm(gt_rel_quats - pr_rel_quats, dim=-1) + return rel_trans_err.mean() + rel_quats_err.mean() + + def compute_pose_loss(self, gt_poses, pred_poses, masks=None): + """ + gt_pose: list of (Bx3, Bx4) + pred_pose: list of (Bx3, Bx4) + masks: None, or B + """ + gt_trans = torch.stack([gt[0] for gt in gt_poses], dim=1) # BxNx3 + gt_quats = torch.stack([gt[1] for gt in gt_poses], dim=1) # BXNX3 + pred_trans = torch.stack([pr[0] for pr in pred_poses], dim=1) # BxNx4 + pred_quats = torch.stack([pr[1] for pr in pred_poses], dim=1) # BxNx4 + if masks == None: + pose_loss = ( + torch.norm(pred_trans - gt_trans, dim=-1).mean() + + torch.norm(pred_quats - gt_quats, dim=-1).mean() + ) + else: + if not any(masks): + return torch.tensor(0.0) + pose_loss = ( + torch.norm(pred_trans - gt_trans, dim=-1)[masks].mean() + + torch.norm(pred_quats - gt_quats, dim=-1)[masks].mean() + ) + + return pose_loss + + def compute_loss(self, gts, preds, **kw): + ( + gt_pts_cross, + pred_pts_cross, + gt_poses, + pr_poses, + masks, + skys, + pose_masks, + monitoring, + ) = self.get_all_pts3d(gts, preds, **kw) + + if self.sky_loss_value > 0: + assert ( + self.criterion.reduction == "none" + ), "sky_loss_value should be 0 if no conf loss" + masks = [mask | sky for mask, sky in zip(masks, skys)] + + + # if self.sky_loss_value > 0: + # assert ( + # self.criterion.reduction == "none" + # ), "sky_loss_value should be 0 if no conf loss" + # for i, l in enumerate(ls_self): + # ls_self[i] = torch.where(skys[i][masks[i]], self.sky_loss_value, l) + + self_name = type(self).__name__ + + 
class Regr3DPoseBatchList(Regr3DPose):
    """Ensure that all 3D points are correct.
    Asymmetric loss: view1 is supposed to be the anchor.

    P1 = RT1 @ D1
    P2 = RT2 @ D2
    loss1 = (I @ pred_D1) - (RT1^-1 @ RT1 @ D1)
    loss2 = (RT21 @ pred_D2) - (RT1^-1 @ P2)
          = (RT21 @ pred_D2) - (RT1^-1 @ RT2 @ D2)

    Unlike the parent class, the point loss is evaluated sample by sample so
    that a different criterion can be chosen per sample (depth-only,
    non-metric single-view, or the regular criterion).
    """

    def __init__(
        self,
        criterion,
        norm_mode="?avg_dis",
        gt_scale=False,
        sky_loss_value=2,
        max_metric_scale=False,
    ):
        super().__init__(
            criterion, norm_mode, gt_scale, sky_loss_value, max_metric_scale
        )
        self.depth_only_criterion = DepthScaleShiftInvLoss()
        self.single_view_criterion = ScaleInvLoss()

    def reorg(self, ls_b, masks_b):
        """Regroup per-sample loss vectors into per-view loss vectors.

        Args:
            ls_b: one loss tensor per sample; its leading dimension is split
                according to the number of True entries of each view's mask.
            masks_b: one ``[num_views, h, w]`` mask per sample.

        Returns:
            A list with one concatenated loss tensor per view.
        """
        num_views = len(masks_b[0])
        per_view = [[] for _ in range(num_views)]
        for sample_losses, sample_masks in zip(ls_b, masks_b):
            counts = sample_masks.sum(dim=(1, 2)).tolist()
            for view_idx, chunk in enumerate(torch.split(sample_losses, counts)):
                per_view[view_idx].append(chunk)
        return [torch.cat(chunks) for chunks in per_view]

    def compute_loss(self, gts, preds, **kw):
        """Compute per-view point losses (with masks) and logging details.

        Returns:
            ``(Sum(...), details | monitoring)`` where ``Sum`` receives one
            ``(loss, mask)`` pair per view, restricted to samples that are not
            ``camera_only``, and ``details`` carries ``pose_loss``, ``img_ids``
            and detached tensors for logging.

        Raises:
            NotImplementedError: for "Quantile" criteria.
        """
        (
            gt_pts_cross,
            pred_pts_cross,
            gt_poses,
            pr_poses,
            masks,
            skys,
            pose_masks,
            monitoring,
        ) = self.get_all_pts3d(gts, preds, **kw)

        if self.sky_loss_value > 0:
            assert (
                self.criterion.reduction == "none"
            ), "sky_loss_value should be 0 if no conf loss"
            # Sky pixels take part in the loss; their value is overwritten below.
            masks = [mask | sky for mask, sky in zip(masks, skys)]

        camera_only = gts[0]["camera_only"]
        has_points = ~camera_only
        depth_only = gts[0]["depth_only"][has_points]
        single_view = gts[0]["single_view"][has_points]
        is_metric = gts[0]["is_metric"][has_points]

        if "Quantile" in self.criterion.__class__.__name__:
            raise NotImplementedError

        self_name = type(self).__name__

        # list [(B, h, w, 3)] x num_views -> list [num_views, h, w, 3] x B',
        # where B' counts the samples that are not camera-only.
        gt_pts_cross_b = torch.unbind(
            torch.stack(gt_pts_cross, dim=1)[has_points], dim=0
        )
        pred_pts_cross_b = torch.unbind(
            torch.stack(pred_pts_cross, dim=1)[has_points], dim=0
        )
        masks_cross_b = torch.unbind(torch.stack(masks, dim=1)[has_points], dim=0)

        ls_cross_b = []
        for i, (pred_b, gt_b, mask_b) in enumerate(
            zip(pred_pts_cross_b, gt_pts_cross_b, masks_cross_b)
        ):
            if depth_only[i]:
                # Only the last (depth) channel is supervised.
                loss_b = self.depth_only_criterion(
                    pred_b[..., -1], gt_b[..., -1], mask_b
                )
            elif single_view[i] and not is_metric[i]:
                loss_b = self.single_view_criterion(pred_b, gt_b, mask_b)
            else:
                loss_b = self.criterion(pred_b[mask_b], gt_b[mask_b])
            ls_cross_b.append(loss_b)
        ls_cross = self.reorg(ls_cross_b, masks_cross_b)

        # Needed unconditionally: they are returned below.  Previously they were
        # defined only inside the sky branch, which raised NameError whenever
        # sky_loss_value was 0.
        masks_cross = [mask[has_points] for mask in masks]
        skys_cross = [sky[has_points] for sky in skys]

        if self.sky_loss_value > 0:
            assert (
                self.criterion.reduction == "none"
            ), "sky_loss_value should be 0 if no conf loss"
            for i, l in enumerate(ls_cross):
                ls_cross[i] = torch.where(
                    skys_cross[i][masks_cross[i]], self.sky_loss_value, l
                )

        details = {}
        for i in range(len(ls_cross)):
            details[f"gt_img{i + 1}"] = gts[i]["img"].permute(0, 2, 3, 1).detach()
            details[f"valid_mask_{i + 1}"] = masks[i].detach()

            if "img_mask" in gts[i] and "ray_mask" in gts[i]:
                details[f"img_mask_{i + 1}"] = gts[i]["img_mask"].detach()
                details[f"ray_mask_{i + 1}"] = gts[i]["ray_mask"].detach()

            if "desc" in preds[i]:
                details[f"desc_{i + 1}"] = preds[i]["desc"].detach()

            details[self_name + f"_pts3d/{i+1}"] = float(
                ls_cross[i].mean() if ls_cross[i].numel() > 0 else 0
            )
            details[f"conf_{i+1}"] = preds[i]["conf"].detach()

        details["img_ids"] = np.arange(len(ls_cross)).tolist()

        # The original indexed gts with the loop variable left over from the loop
        # above, i.e. the last view; that is made explicit here and guarded
        # against a missing key.
        # NOTE(review): confirm that gating poses on the *last* view's img_mask is
        # what is intended.
        last_view = len(ls_cross) - 1
        if last_view >= 0 and "img_mask" in gts[last_view]:
            pose_masks = pose_masks * gts[last_view]["img_mask"]
        details["pose_loss"] = self.compute_pose_loss(gt_poses, pr_poses, pose_masks)

        return Sum(*list(zip(ls_cross, masks_cross))), (details | monitoring)
class Regr3DPose_ScaleInv(Regr3DPose):
    """Variant of Regr3DPose whose cross-view point maps are rescaled by a scene scale.

    With ``gt_scale`` set, predictions are brought to the ground-truth scale;
    otherwise ground truth and predictions are each divided by their own scale.
    """

    def get_all_pts3d(self, gts, preds):
        """Return the parent's outputs with scale-adjusted cross-view point maps."""
        gt_pts, pr_pts, *passthrough = super().get_all_pts3d(gts, preds)
        # passthrough = [gt_poses, pr_poses, masks, skys, pose_masks, monitoring]
        valid_masks = passthrough[2]

        # Scene scale of GT and prediction, both measured over the same masks.
        _, scale_gt = get_group_pointcloud_center_scale(gt_pts, valid_masks)
        _, scale_pr = get_group_pointcloud_center_scale(pr_pts, valid_masks)

        # Keep the predicted scale inside a sane range.
        scale_pr = scale_pr.clip(min=1e-3, max=1e3)

        if not self.gt_scale:
            gt_pts = [pts / scale_gt for pts in gt_pts]
            pr_pts = [pts / scale_pr for pts in pr_pts]
        else:
            pr_pts = [pts * scale_gt / scale_pr for pts in pr_pts]

        return (gt_pts, pr_pts, *passthrough)
def normalize_pointcloud(pts3d, valid_mask, eps=1e-3):
    """Divide each point map by the mean norm of its valid points.

    Args:
        pts3d: ``(B, H, W, 3)`` points.
        valid_mask: ``(B, H, W)`` validity mask.
        eps: added to the valid count and used as the lower clamp of the scale.

    Returns:
        ``(normalized_pts3d, avg_scale)`` with ``avg_scale`` of shape ``(B,)``,
        clamped to ``[eps, 1e3]``.
    """
    masked_norms = pts3d.norm(dim=-1) * valid_mask
    norm_total = masked_norms.sum(dim=[1, 2])
    num_valid = valid_mask.sum(dim=[1, 2])

    avg_scale = norm_total / (num_valid + eps)
    avg_scale = avg_scale.clamp(min=eps, max=1e3)

    return pts3d / avg_scale.view(-1, 1, 1, 1), avg_scale
class HuberLoss(nn.Module):
    """Huber-style loss: quadratic (scaled by 1/delta) up to ``delta``, linear beyond.

    ``reduction`` is ``"mean"`` or ``"sum"``; any other value returns the
    element-wise loss.
    """

    def __init__(self, delta=1e-1, reduction="mean"):
        super().__init__()
        self.delta = delta
        self.reduction = reduction

    def forward(self, pred, target):
        residual = pred - target
        magnitude = residual.abs()

        quadratic = 0.5 * residual.pow(2) / self.delta
        linear = magnitude - 0.5 * self.delta
        per_element = torch.where(magnitude <= self.delta, quadratic, linear)

        if self.reduction == "mean":
            return per_element.mean()
        elif self.reduction == "sum":
            return per_element.sum()
        else:
            # 'none' (or anything else): no reduction
            return per_element
point_map_to_normal(pred, mask) + gt_norm, _ = point_map_to_normal(gt, mask) + cos_sim = F.cosine_similarity(pred_norm, gt_norm, dim=-1) + return 1 - cos_sim.mean() + + def image_gradient_loss(self, pred, gt, mask): + assert pred.dim() == 4 and pred.shape[-1] == 1 + assert gt.shape == pred.shape + + B, H, W, _ = pred.shape + device = pred.device + + dx_pred = pred[:, :, 1:] - pred[:, :, :-1] # [B,H,W-1,1] + dx_gt = gt[:, :, 1:] - gt[:, :, :-1] + dx_mask = mask[:, :, 1:] & mask[:, :, :-1] # [B,H,W-1] + + dy_pred = pred[:, 1:, :] - pred[:, :-1, :] # [B,H-1,W,1] + dy_gt = gt[:, 1:, :] - gt[:, :-1, :] + dy_mask = mask[:, 1:, :] & mask[:, :-1, :] # [B,H-1,W] + + min_h = min(dy_pred.shape[1], dx_pred.shape[1]) + min_w = min(dx_pred.shape[2], dy_pred.shape[2]) + + dx_pred = dx_pred[:, :min_h, :min_w, :] # [B,H-1,W-1,1] + dx_gt = dx_gt[:, :min_h, :min_w, :] + dx_mask = dx_mask[:, :min_h, :min_w] # [B,H-1,W-1] + + dy_pred = dy_pred[:, :min_h, :min_w, :] # [B,H-1,W-1,1] + dy_gt = dy_gt[:, :min_h, :min_w, :] + dy_mask = dy_mask[:, :min_h, :min_w] # [B,H-1,W-1] + + loss_dx = F.l1_loss(dx_pred * dx_mask.unsqueeze(-1), + dx_gt * dx_mask.unsqueeze(-1)) + loss_dy = F.l1_loss(dy_pred * dy_mask.unsqueeze(-1), + dy_gt * dy_mask.unsqueeze(-1)) + + return (loss_dx + loss_dy) / 2 + + def forward(self, pred, gt, sigma_p, sigma_g, valid_mask): + if self.training: + pred_normalized = pred #, _ = normalize_pointcloud(pred, valid_mask) + gt_normalized = gt #, _ = normalize_pointcloud(gt, valid_mask) + else: + pred_normalized, gt_normalized = pred, gt + #scale, shift = closed_form_scale_and_shift( + # pred_normalized, gt_normalized + #) + scale = 1 + shift = 0 + pred_aligned = pred_normalized * scale + shift + sigma_p = sigma_p.clamp(min=1e-6) + if sigma_g is not None: + sigma_g = sigma_g.clamp(min=1e-6) + #sigma = 0.5 * (sigma_p + sigma_g) + sigma = sigma_p + diff = (pred_aligned - gt_normalized).abs() + + C = diff.shape[-1] + + main_loss = (sigma[..., None].expand(-1, -1, -1, C) * 
class DistillLoss(MultiLoss):
    """Combined camera / depth / point-map / track loss.

    ``compute_loss`` expects ``gts`` as a list of per-view dicts and ``preds_``
    as a sequence whose first element is a dict of predictions stacked over
    views (keys read: ``local_points``, ``camera_poses``, ``conf``).
    """

    def __init__(self, lambda_track=0.05):
        super().__init__()
        self.cam_loss = CameraLoss(delta=0.1, weights=(1.0, 1.0, 0.5))
        # alpha was 0.01 initially, now 0.1
        self.depth_loss = DepthOrPmapLoss(alpha=0.1)
        self.pmap_loss = DepthOrPmapLoss(alpha=0.1)
        self.track_loss = TrackLoss()
        self.lambda_track = lambda_track

    def get_name(self):
        return "DistillLoss"

    @staticmethod
    def _transform_points(transforms, local_points, B, S, h, w):
        """Apply one 4x4 transform per view to per-view local point maps."""
        flat = local_points.view(B, S, -1, 3).permute(0, 1, 3, 2)  # B,S,3,N
        moved = transforms[:, :, :3, :3] @ flat + transforms[:, :, :3, 3:]
        return moved.permute(0, 1, 3, 2).view(B, S, h, w, 3)

    @staticmethod
    def _mask_or_all(valid_mask):
        """Return ``valid_mask``, or an all-ones mask when nothing is valid."""
        if valid_mask.any():
            return valid_mask
        return torch.ones_like(valid_mask)

    def compute_loss(self, gts, preds_, track_queries=None, track_preds=None):
        """Compute the weighted total loss and a dict of logged terms.

        Args:
            gts: list of per-view ground-truth dicts.  Keys read:
                ``camera_pose``, ``camera_intrinsics``, ``pts3d``, ``depthmap``,
                ``pts3d_local``, ``valid_mask`` and optionally ``track``,
                ``vis``, ``track_conf``.
            preds_: sequence whose first element is the stacked prediction dict.
            track_queries, track_preds: unused; kept for interface
                compatibility.

        Returns:
            ``(total, details)``.  ``details`` holds the weighted terms as
            floats, so ``Lcamera + Ldepth + Lpmap + Ltrack`` equals ``total``.

        Side effects:
            Writes normalized ``depth``, ``pts3d_in_other_view`` and
            ``pts3d_local`` back into every ``gts[s]``.
        """
        preds = preds_[0]

        # ---------- ground truth: stack over views and normalize ----------
        cam_gt_extrin = torch.inverse(
            torch.stack([g['camera_pose'] for g in gts], dim=1)
        )  # B,S,4,4
        cam_gt_intrin = torch.stack([g['camera_intrinsics'] for g in gts], dim=1)
        cam_gt_pts3d_in_other_view = torch.stack([g['pts3d'] for g in gts], dim=1)  # B,S,H,W,3
        cam_gt_depth = torch.stack([g['depthmap'] for g in gts], dim=1)
        cam_gt_pts3d_local = torch.stack([g['pts3d_local'] for g in gts], dim=1)  # B,S,H,W,3
        point_masks = torch.stack([g['valid_mask'] for g in gts], dim=1)

        B, S, h, w, _ = cam_gt_pts3d_in_other_view.shape

        (
            cam_gt_extrin_34,
            cam_gt_pts3d_local,
            cam_gt_pts3d_in_other_view,
            cam_gt_depth,
        ) = normalize_camera_extrinsics_and_points_batch(
            cam_gt_extrin[:, :, :3],
            cam_points=cam_gt_pts3d_local,
            world_points=cam_gt_pts3d_in_other_view,
            depths=cam_gt_depth,
            point_masks=point_masks,
        )
        cam_gt_extrin[:, :, :3] = cam_gt_extrin_34

        for s in range(cam_gt_depth.shape[1]):
            gts[s]['depth'] = cam_gt_depth[:, s]
            gts[s]['pts3d_in_other_view'] = cam_gt_pts3d_in_other_view[:, s]
            gts[s]['pts3d_local'] = cam_gt_pts3d_local[:, s]

        # ---------- scale aligning predicted local points to GT ----------
        pred_flat = preds['local_points'].reshape(-1, 3)
        gt_flat = cam_gt_pts3d_local.reshape(-1, 3)
        weight_p = point_masks.reshape(-1)
        scale = align_points_scale(pred_flat, gt_flat, weight_p)

        # ---------- Lcamera: all S*S relative poses ----------
        cam_gt_extrin_relative = cam_gt_extrin[:, :, None] @ torch.inverse(
            cam_gt_extrin[:, None]
        )  # B,S,S,4,4
        cam_gt_intrin = cam_gt_intrin[:, None].repeat(1, S, 1, 1, 1)
        cam_gt_ = extri_intri_to_pose_encoding(
            cam_gt_extrin_relative.view(B, S * S, 4, 4),
            cam_gt_intrin.view(B, S * S, 3, 3),
            (h, w),
            "absT_quaR_FoV",
        )

        cam_pred_extrin = torch.inverse(preds['camera_poses'])
        cam_pred_extrin_relative = cam_pred_extrin[:, :, None] @ torch.inverse(
            cam_pred_extrin[:, None]
        )  # B,S,S,4,4
        # GT intrinsics are used for the prediction encoding as well.
        cam_pr_ = extri_intri_to_pose_encoding(
            cam_pred_extrin_relative.view(B, S * S, 4, 4),
            cam_gt_intrin.view(B, S * S, 3, 3),
            (h, w),
            "absT_quaR_FoV",
        )
        cam_pr_[:, :, :3] *= scale

        Lcamera = self.cam_loss(cam_pr_, cam_gt_)

        # ---------- Ldepth ----------
        depth_terms = []
        for i, g in enumerate(gts):
            if 'depth' in g:
                sigma_p = preds['conf'][:, i].squeeze(-1)
                valid_mask = self._mask_or_all(g['valid_mask'])
                pred_depth = preds['local_points'][:, i, :, :, 2:]
                depth_terms.append(
                    self.depth_loss(
                        pred_depth * scale,
                        g['depth'].view(pred_depth.shape),
                        sigma_p,
                        None,
                        valid_mask,
                    )
                )
        Ldepth = (
            torch.stack(depth_terms).mean()
            if depth_terms
            else torch.zeros_like(Lcamera)
        )

        # ---------- Lpmap: point maps in the frame of view `world_frame` ----------
        world_frame = 0
        Twc_pred = cam_pred_extrin_relative[:, world_frame]  # B,S,4,4
        Twc_gt = cam_gt_extrin_relative[:, world_frame]

        pred_p = (
            self._transform_points(Twc_pred, preds['local_points'], B, S, h, w)
            * scale
        )
        gt_p = self._transform_points(Twc_gt, cam_gt_pts3d_local, B, S, h, w)

        if 'conf' in preds:
            pmap_terms = []
            for i, g in enumerate(gts):
                sigma_p = preds['conf'][:, i].squeeze(-1)
                valid_mask = self._mask_or_all(g['valid_mask'])
                pmap_terms.append(
                    self.pmap_loss(pred_p[:, i], gt_p[:, i], sigma_p, None, valid_mask)
                )
            Lpmap = torch.stack(pmap_terms).mean()
        else:
            Lpmap = torch.zeros_like(Lcamera)

        # ---------- Ltrack ----------
        # The original tested `'track' in preds[0]` and iterated `for p in preds`
        # after `preds` had been rebound to a dict, which raises KeyError as soon
        # as the ground truth carries tracks.  The per-view sequence is `preds_`.
        # NOTE(review): assumes track outputs live in the per-view entries of
        # `preds_` -- confirm against the model's output format.
        if ('track' in gts[0]) and ('track' in preds_[0]):
            y_gt = torch.stack([g['track'] for g in gts], dim=1)
            vis_gt = torch.stack([g['vis'] for g in gts], dim=1)

            y_pr = torch.stack([p['track'] for p in preds_], dim=1)
            vis_pr = torch.stack([p['vis'] for p in preds_], dim=1)

            w_p = torch.stack([p['track_conf'] for p in preds_], dim=1)
            w_g = torch.stack([g['track_conf'] for g in gts], dim=1)

            Ltrack = self.track_loss(y_pr, y_gt, vis_pr, vis_gt, w_p, w_g)
        else:
            Ltrack = torch.zeros_like(Lcamera)

        # ---------- weighted total ----------
        camera_term = Lcamera * 20 * 100
        depth_term = Ldepth * 20
        pmap_term = Lpmap * 10
        track_term = self.lambda_track * 10 * Ltrack
        total = camera_term + depth_term + pmap_term + track_term

        # Log exactly the terms that enter `total` (the camera term used to be
        # logged with weight 20 while the total used 20*100).
        details = {
            'Lcamera': float(camera_term),
            'Ldepth': float(depth_term),
            'Lpmap': float(pmap_term),
            'Ltrack': float(track_term),
            'total': float(total),
        }

        return total, details
def strip_module(state_dict):
    """
    Return a copy of ``state_dict`` whose keys have a leading 'module.' removed.

    Only one leading 'module.' is stripped per key; keys without the prefix are
    kept unchanged.
    Args:
        state_dict (dict): The original state_dict with possible 'module.' prefixes.
    Returns:
        OrderedDict: A new state_dict with 'module.' prefixes removed.
    """
    prefix = "module."
    return OrderedDict(
        (key[len(prefix):] if key.startswith(prefix) else key, value)
        for key, value in state_dict.items()
    )
loading model from", model_path) + ckpt = torch.load(model_path, map_location="cpu") + args = ckpt["args"].model.replace( + "ManyAR_PatchEmbed", "PatchEmbedDust3R" + ) # ManyAR only for aspect ratio not consistent + if "landscape_only" not in args: + args = args[:-2] + ", landscape_only=False))" + else: + args = args.replace(" ", "").replace( + "landscape_only=True", "landscape_only=False" + ) + assert "landscape_only=False" in args + if verbose: + print(f"instantiating : {args}") + net = eval(args) + s = net.load_state_dict(ckpt["model"], strict=False) + if verbose: + print(s) + return net.to(device) + + +class ARCroco3DStereoConfig(PretrainedConfig): + model_type = "arcroco_3d_stereo" + + def __init__( + self, + output_mode="pts3d", + head_type="linear", # or dpt + depth_mode=("exp", -float("inf"), float("inf")), + conf_mode=("exp", 1, float("inf")), + pose_mode=("exp", -float("inf"), float("inf")), + freeze="none", + landscape_only=True, + patch_embed_cls="PatchEmbedDust3R", + ray_enc_depth=2, + state_size=324, + local_mem_size=256, + state_pe="2d", + state_dec_num_heads=16, + depth_head=False, + rgb_head=False, + pose_conf_head=False, + pose_head=False, + **croco_kwargs, + ): + super().__init__() + self.output_mode = output_mode + self.head_type = head_type + self.depth_mode = depth_mode + self.conf_mode = conf_mode + self.pose_mode = pose_mode + self.freeze = freeze + self.landscape_only = landscape_only + self.patch_embed_cls = patch_embed_cls + self.ray_enc_depth = ray_enc_depth + self.state_size = state_size + self.state_pe = state_pe + self.state_dec_num_heads = state_dec_num_heads + self.local_mem_size = local_mem_size + self.depth_head = depth_head + self.rgb_head = rgb_head + self.pose_conf_head = pose_conf_head + self.pose_head = pose_head + self.croco_kwargs = croco_kwargs + + +class LocalMemory(nn.Module): + def __init__( + self, + size, + k_dim, + v_dim, + num_heads, + depth=2, + mlp_ratio=4.0, + qkv_bias=False, + drop=0.0, + attn_drop=0.0, + 
drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + norm_mem=True, + rope=None, + ) -> None: + super().__init__() + self.v_dim = v_dim + self.proj_q = nn.Linear(k_dim, v_dim) + self.masked_token = nn.Parameter( + torch.randn(1, 1, v_dim) * 0.2, requires_grad=True + ) + self.mem = nn.Parameter( + torch.randn(1, size, 2 * v_dim) * 0.2, requires_grad=True + ) + self.write_blocks = nn.ModuleList( + [ + DecoderBlock( + 2 * v_dim, + num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + attn_drop=attn_drop, + drop=drop, + drop_path=drop_path, + act_layer=act_layer, + norm_mem=norm_mem, + rope=rope, + ) + for _ in range(depth) + ] + ) + self.read_blocks = nn.ModuleList( + [ + DecoderBlock( + 2 * v_dim, + num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + attn_drop=attn_drop, + drop=drop, + drop_path=drop_path, + act_layer=act_layer, + norm_mem=norm_mem, + rope=rope, + ) + for _ in range(depth) + ] + ) + + def update_mem(self, mem, feat_k, feat_v): + """ + mem_k: [B, size, C] + mem_v: [B, size, C] + feat_k: [B, 1, C] + feat_v: [B, 1, C] + """ + feat_k = self.proj_q(feat_k) # [B, 1, C] + feat = torch.cat([feat_k, feat_v], dim=-1) + for blk in self.write_blocks: + mem, _ = blk(mem, feat, None, None) + return mem + + def inquire(self, query, mem): + x = self.proj_q(query) # [B, 1, C] + x = torch.cat([x, self.masked_token.expand(x.shape[0], -1, -1)], dim=-1) + for blk in self.read_blocks: + x, _ = blk(x, mem, None, None) + return x[..., -self.v_dim :] + + +class ARCroco3DStereo(CroCoNet): + config_class = ARCroco3DStereoConfig + base_model_prefix = "arcroco3dstereo" + supports_gradient_checkpointing = True + + def __init__(self, config: ARCroco3DStereoConfig): + self.gradient_checkpointing = False + self.fixed_input_length = True + config.croco_kwargs = fill_default_args( + config.croco_kwargs, CrocoConfig.__init__ + ) + self.config = config + self.patch_embed_cls = config.patch_embed_cls + self.croco_args 
= config.croco_kwargs + croco_cfg = CrocoConfig(**self.croco_args) + super().__init__(croco_cfg) + self.enc_blocks_ray_map = nn.ModuleList( + [ + Block( + self.enc_embed_dim, + 16, + 4, + qkv_bias=True, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + rope=self.rope, + ) + for _ in range(config.ray_enc_depth) + ] + ) + self.enc_norm_ray_map = nn.LayerNorm(self.enc_embed_dim, eps=1e-6) + self.dec_num_heads = self.croco_args["dec_num_heads"] + self.pose_head_flag = config.pose_head + if self.pose_head_flag: + self.pose_token = nn.Parameter( + torch.randn(1, 1, self.dec_embed_dim) * 0.02, requires_grad=True + ) + self.pose_retriever = LocalMemory( + size=config.local_mem_size, + k_dim=self.enc_embed_dim, + v_dim=self.dec_embed_dim, + num_heads=self.dec_num_heads, + mlp_ratio=4, + qkv_bias=True, + attn_drop=0.0, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + rope=None, + ) + self.register_tokens = nn.Embedding(config.state_size, self.enc_embed_dim) + self.state_size = config.state_size + self.state_pe = config.state_pe + self.masked_img_token = nn.Parameter( + torch.randn(1, self.enc_embed_dim) * 0.02, requires_grad=True + ) + self.masked_ray_map_token = nn.Parameter( + torch.randn(1, self.enc_embed_dim) * 0.02, requires_grad=True + ) + self._set_state_decoder( + self.enc_embed_dim, + self.dec_embed_dim, + config.state_dec_num_heads, + self.dec_depth, + self.croco_args.get("mlp_ratio", None), + self.croco_args.get("norm_layer", None), + self.croco_args.get("norm_im2_in_dec", None), + ) + self.set_downstream_head( + config.output_mode, + config.head_type, + config.landscape_only, + config.depth_mode, + config.conf_mode, + config.pose_mode, + config.depth_head, + config.rgb_head, + config.pose_conf_head, + config.pose_head, + **self.croco_args, + ) + self.set_freeze(config.freeze) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kw): + if os.path.isfile(pretrained_model_name_or_path): + return load_model(pretrained_model_name_or_path, 
device="cpu") + else: + try: + model = super(ARCroco3DStereo, cls).from_pretrained( + pretrained_model_name_or_path, **kw + ) + except TypeError as e: + raise Exception( + f"tried to load {pretrained_model_name_or_path} from huggingface, but failed" + ) + return model + + def _set_patch_embed(self, img_size=224, patch_size=16, enc_embed_dim=768): + self.patch_embed = get_patch_embed( + self.patch_embed_cls, img_size, patch_size, enc_embed_dim, in_chans=3 + ) + self.patch_embed_ray_map = get_patch_embed( + self.patch_embed_cls, img_size, patch_size, enc_embed_dim, in_chans=6 + ) + + def _set_decoder( + self, + enc_embed_dim, + dec_embed_dim, + dec_num_heads, + dec_depth, + mlp_ratio, + norm_layer, + norm_im2_in_dec, + ): + self.dec_depth = dec_depth + self.dec_embed_dim = dec_embed_dim + self.decoder_embed = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True) + self.dec_blocks = nn.ModuleList( + [ + DecoderBlock( + dec_embed_dim, + dec_num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=True, + norm_layer=norm_layer, + norm_mem=norm_im2_in_dec, + rope=self.rope, + ) + for i in range(dec_depth) + ] + ) + self.dec_norm = norm_layer(dec_embed_dim) + + def _set_state_decoder( + self, + enc_embed_dim, + dec_embed_dim, + dec_num_heads, + dec_depth, + mlp_ratio, + norm_layer, + norm_im2_in_dec, + ): + self.dec_depth_state = dec_depth + self.dec_embed_dim_state = dec_embed_dim + self.decoder_embed_state = nn.Linear(enc_embed_dim, dec_embed_dim, bias=True) + self.dec_blocks_state = nn.ModuleList( + [ + DecoderBlock( + dec_embed_dim, + dec_num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=True, + norm_layer=norm_layer, + norm_mem=norm_im2_in_dec, + rope=self.rope, + ) + for i in range(dec_depth) + ] + ) + self.dec_norm_state = norm_layer(dec_embed_dim) + + def load_state_dict(self, ckpt, **kw): + if all(k.startswith("module") for k in ckpt): + ckpt = strip_module(ckpt) + new_ckpt = dict(ckpt) + if not any(k.startswith("dec_blocks_state") for k in ckpt): + for key, value in ckpt.items(): + 
if key.startswith("dec_blocks"): + new_ckpt[key.replace("dec_blocks", "dec_blocks_state")] = value + try: + return super().load_state_dict(new_ckpt, **kw) + except: + try: + new_new_ckpt = { + k: v + for k, v in new_ckpt.items() + if not k.startswith("dec_blocks") + and not k.startswith("dec_norm") + and not k.startswith("decoder_embed") + } + return super().load_state_dict(new_new_ckpt, **kw) + except: + new_new_ckpt = {} + for key in new_ckpt: + if key in self.state_dict(): + if new_ckpt[key].size() == self.state_dict()[key].size(): + new_new_ckpt[key] = new_ckpt[key] + else: + printer.info( + f"Skipping '{key}': size mismatch (ckpt: {new_ckpt[key].size()}, model: {self.state_dict()[key].size()})" + ) + else: + printer.info(f"Skipping '{key}': not found in model") + return super().load_state_dict(new_new_ckpt, **kw) + + def set_freeze(self, freeze): # this is for use by downstream models + self.freeze = freeze + to_be_frozen = { + "none": [], + "mask": [self.mask_token] if hasattr(self, "mask_token") else [], + "encoder": [ + self.patch_embed, + self.patch_embed_ray_map, + self.masked_img_token, + self.masked_ray_map_token, + self.enc_blocks, + self.enc_blocks_ray_map, + self.enc_norm, + self.enc_norm_ray_map, + ], + "encoder_and_head": [ + self.patch_embed, + self.patch_embed_ray_map, + self.masked_img_token, + self.masked_ray_map_token, + self.enc_blocks, + self.enc_blocks_ray_map, + self.enc_norm, + self.enc_norm_ray_map, + self.downstream_head, + ], + "encoder_and_decoder": [ + self.patch_embed, + self.patch_embed_ray_map, + self.masked_img_token, + self.masked_ray_map_token, + self.enc_blocks, + self.enc_blocks_ray_map, + self.enc_norm, + self.enc_norm_ray_map, + self.dec_blocks, + self.dec_blocks_state, + self.pose_retriever, + self.pose_token, + self.register_tokens, + self.decoder_embed_state, + self.decoder_embed, + self.dec_norm, + self.dec_norm_state, + ], + "decoder": [ + self.dec_blocks, + self.dec_blocks_state, + self.pose_retriever, + 
self.pose_token, + ], + } + freeze_all_params(to_be_frozen[freeze]) + + def _set_prediction_head(self, *args, **kwargs): + """No prediction head""" + return + + def set_downstream_head( + self, + output_mode, + head_type, + landscape_only, + depth_mode, + conf_mode, + pose_mode, + depth_head, + rgb_head, + pose_conf_head, + pose_head, + patch_size, + img_size, + **kw, + ): + assert ( + img_size[0] % patch_size == 0 and img_size[1] % patch_size == 0 + ), f"{img_size=} must be multiple of {patch_size=}" + self.output_mode = output_mode + self.head_type = head_type + self.depth_mode = depth_mode + self.conf_mode = conf_mode + self.pose_mode = pose_mode + self.downstream_head = head_factory( + head_type, + output_mode, + self, + has_conf=bool(conf_mode), + has_depth=bool(depth_head), + has_rgb=bool(rgb_head), + has_pose_conf=bool(pose_conf_head), + has_pose=bool(pose_head), + ) + self.head = transpose_to_landscape( + self.downstream_head, activate=landscape_only + ) + + def _encode_image(self, image, true_shape): + x, pos = self.patch_embed(image, true_shape=true_shape) + assert self.enc_pos_embed is None + for blk in self.enc_blocks: + if self.gradient_checkpointing and self.training: + x = checkpoint(blk, x, pos, use_reentrant=False) + else: + x = blk(x, pos) + x = self.enc_norm(x) + return [x], pos, None + + def _encode_ray_map(self, ray_map, true_shape): + x, pos = self.patch_embed_ray_map(ray_map, true_shape=true_shape) + assert self.enc_pos_embed is None + for blk in self.enc_blocks_ray_map: + if self.gradient_checkpointing and self.training: + x = checkpoint(blk, x, pos, use_reentrant=False) + else: + x = blk(x, pos) + x = self.enc_norm_ray_map(x) + return [x], pos, None + + def _encode_state(self, image_tokens, image_pos): + batch_size = image_tokens.shape[0] + state_feat = self.register_tokens( + torch.arange(self.state_size, device=image_pos.device) + ) + if self.state_pe == "1d": + state_pos = ( + torch.tensor( + [[i, i] for i in range(self.state_size)], + 
dtype=image_pos.dtype, + device=image_pos.device, + )[None] + .expand(batch_size, -1, -1) + .contiguous() + ) # .long() + elif self.state_pe == "2d": + width = int(self.state_size**0.5) + width = width + 1 if width % 2 == 1 else width + state_pos = ( + torch.tensor( + [[i // width, i % width] for i in range(self.state_size)], + dtype=image_pos.dtype, + device=image_pos.device, + )[None] + .expand(batch_size, -1, -1) + .contiguous() + ) + elif self.state_pe == "none": + state_pos = None + state_feat = state_feat[None].expand(batch_size, -1, -1) + return state_feat, state_pos, None + + def _encode_views(self, views, img_mask=None, ray_mask=None): + device = views[0]["img"].device + batch_size = views[0]["img"].shape[0] + given = True + if img_mask is None and ray_mask is None: + given = False + if not given: + img_mask = torch.stack( + [view["img_mask"] for view in views], dim=0 + ) # Shape: (num_views, batch_size) + ray_mask = torch.stack( + [view["ray_mask"] for view in views], dim=0 + ) # Shape: (num_views, batch_size) + imgs = torch.stack( + [view["img"] for view in views], dim=0 + ) # Shape: (num_views, batch_size, C, H, W) + ray_maps = torch.stack( + [view["ray_map"] for view in views], dim=0 + ) # Shape: (num_views, batch_size, H, W, C) + shapes = [] + for view in views: + if "true_shape" in view: + shapes.append(view["true_shape"]) + else: + shape = torch.tensor(view["img"].shape[-2:], device=device) + shapes.append(shape.unsqueeze(0).repeat(batch_size, 1)) + shapes = torch.stack(shapes, dim=0).to( + imgs.device + ) # Shape: (num_views, batch_size, 2) + imgs = imgs.view( + -1, *imgs.shape[2:] + ) # Shape: (num_views * batch_size, C, H, W) + ray_maps = ray_maps.view( + -1, *ray_maps.shape[2:] + ) # Shape: (num_views * batch_size, H, W, C) + shapes = shapes.view(-1, 2) # Shape: (num_views * batch_size, 2) + img_masks_flat = img_mask.view(-1) # Shape: (num_views * batch_size) + ray_masks_flat = ray_mask.view(-1) + selected_imgs = imgs[img_masks_flat] + 
selected_shapes = shapes[img_masks_flat] + if selected_imgs.size(0) > 0: + img_out, img_pos, _ = self._encode_image(selected_imgs, selected_shapes) + else: + raise NotImplementedError + full_out = [ + torch.zeros( + len(views) * batch_size, *img_out[0].shape[1:], device=img_out[0].device + ) + for _ in range(len(img_out)) + ] + full_pos = torch.zeros( + len(views) * batch_size, + *img_pos.shape[1:], + device=img_pos.device, + dtype=img_pos.dtype, + ) + for i in range(len(img_out)): + full_out[i][img_masks_flat] += img_out[i] + full_out[i][~img_masks_flat] += self.masked_img_token + full_pos[img_masks_flat] += img_pos + ray_maps = ray_maps.permute(0, 3, 1, 2) # Change shape to (N, C, H, W) + selected_ray_maps = ray_maps[ray_masks_flat] + selected_shapes_ray = shapes[ray_masks_flat] + if selected_ray_maps.size(0) > 0: + ray_out, ray_pos, _ = self._encode_ray_map( + selected_ray_maps, selected_shapes_ray + ) + assert len(ray_out) == len(full_out), f"{len(ray_out)}, {len(full_out)}" + for i in range(len(ray_out)): + full_out[i][ray_masks_flat] += ray_out[i] + full_out[i][~ray_masks_flat] += self.masked_ray_map_token + full_pos[ray_masks_flat] += ( + ray_pos * (~img_masks_flat[ray_masks_flat][:, None, None]).long() + ) + else: + raymaps = torch.zeros( + 1, 6, imgs[0].shape[-2], imgs[0].shape[-1], device=img_out[0].device + ) + ray_mask_flat = torch.zeros_like(img_masks_flat) + ray_mask_flat[:1] = True + ray_out, ray_pos, _ = self._encode_ray_map(raymaps, shapes[ray_mask_flat]) + for i in range(len(ray_out)): + full_out[i][ray_mask_flat] += ray_out[i] * 0.0 + full_out[i][~ray_mask_flat] += self.masked_ray_map_token * 0.0 + return ( + shapes.chunk(len(views), dim=0), + [out.chunk(len(views), dim=0) for out in full_out], + full_pos.chunk(len(views), dim=0), + ) + + def _decoder(self, f_state, pos_state, f_img, pos_img, f_pose, pos_pose): + final_output = [(f_state, f_img)] # before projection + assert f_state.shape[-1] == self.dec_embed_dim + f_img = 
self.decoder_embed(f_img) + if self.pose_head_flag: + assert f_pose is not None and pos_pose is not None + f_img = torch.cat([f_pose, f_img], dim=1) + pos_img = torch.cat([pos_pose, pos_img], dim=1) + final_output.append((f_state, f_img)) + for blk_state, blk_img in zip(self.dec_blocks_state, self.dec_blocks): + if ( + self.gradient_checkpointing + and self.training + and torch.is_grad_enabled() + ): + f_state, _ = checkpoint( + blk_state, + *final_output[-1][::+1], + pos_state, + pos_img, + use_reentrant=not self.fixed_input_length, + ) + f_img, _ = checkpoint( + blk_img, + *final_output[-1][::-1], + pos_img, + pos_state, + use_reentrant=not self.fixed_input_length, + ) + else: + f_state, _ = blk_state(*final_output[-1][::+1], pos_state, pos_img) + f_img, _ = blk_img(*final_output[-1][::-1], pos_img, pos_state) + final_output.append((f_state, f_img)) + del final_output[1] # duplicate with final_output[0] + final_output[-1] = ( + self.dec_norm_state(final_output[-1][0]), + self.dec_norm(final_output[-1][1]), + ) + return zip(*final_output) + + def _downstream_head(self, decout, img_shape, **kwargs): + B, S, D = decout[-1].shape + head = getattr(self, f"head") + return head(decout, img_shape, **kwargs) + + def _init_state(self, image_tokens, image_pos): + """ + Current Version: input the first frame img feature and pose to initialize the state feature and pose + """ + state_feat, state_pos, _ = self._encode_state(image_tokens, image_pos) + state_feat = self.decoder_embed_state(state_feat) + return state_feat, state_pos + + def _recurrent_rollout( + self, + state_feat, + state_pos, + current_feat, + current_pos, + pose_feat, + pose_pos, + init_state_feat, + img_mask=None, + reset_mask=None, + update=None, + ): + new_state_feat, dec = self._decoder( + state_feat, state_pos, current_feat, current_pos, pose_feat, pose_pos + ) + new_state_feat = new_state_feat[-1] + return new_state_feat, dec + + def _get_img_level_feat(self, feat): + return torch.mean(feat, dim=1, 
keepdim=True) + + def _forward_encoder(self, views): + shape, feat_ls, pos = self._encode_views(views) + feat = feat_ls[-1] + state_feat, state_pos = self._init_state(feat[0], pos[0]) + mem = self.pose_retriever.mem.expand(feat[0].shape[0], -1, -1) + init_state_feat = state_feat.clone() + init_mem = mem.clone() + return (feat, pos, shape), ( + init_state_feat, + init_mem, + state_feat, + state_pos, + mem, + ) + + def _forward_decoder_step( + self, + views, + i, + feat_i, + pos_i, + shape_i, + init_state_feat, + init_mem, + state_feat, + state_pos, + mem, + ): + if self.pose_head_flag: + global_img_feat_i = self._get_img_level_feat(feat_i) + if i == 0: + pose_feat_i = self.pose_token.expand(feat_i.shape[0], -1, -1) + else: + pose_feat_i = self.pose_retriever.inquire(global_img_feat_i, mem) + pose_pos_i = -torch.ones( + feat_i.shape[0], 1, 2, device=feat_i.device, dtype=pos_i.dtype + ) + else: + pose_feat_i = None + pose_pos_i = None + new_state_feat, dec = self._recurrent_rollout( + state_feat, + state_pos, + feat_i, + pos_i, + pose_feat_i, + pose_pos_i, + init_state_feat, + img_mask=views[i]["img_mask"], + reset_mask=views[i]["reset"], + update=views[i].get("update", None), + ) + out_pose_feat_i = dec[-1][:, 0:1] + new_mem = self.pose_retriever.update_mem( + mem, global_img_feat_i, out_pose_feat_i + ) + head_input = [ + dec[0].float(), + dec[self.dec_depth * 2 // 4][:, 1:].float(), + dec[self.dec_depth * 3 // 4][:, 1:].float(), + dec[self.dec_depth].float(), + ] + res = self._downstream_head(head_input, shape_i, pos=pos_i) + img_mask = views[i]["img_mask"] + update = views[i].get("update", None) + if update is not None: + update_mask = img_mask & update # if don't update, then whatever img_mask + else: + update_mask = img_mask + update_mask = update_mask[:, None, None].float() + state_feat = new_state_feat * update_mask + state_feat * ( + 1 - update_mask + ) # update global state + mem = new_mem * update_mask + mem * (1 - update_mask) # then update local state + 
reset_mask = views[i]["reset"] + if reset_mask is not None: + reset_mask = reset_mask[:, None, None].float() + state_feat = init_state_feat * reset_mask + state_feat * (1 - reset_mask) + mem = init_mem * reset_mask + mem * (1 - reset_mask) + return res, (state_feat, mem) + + def _forward_impl(self, views, ret_state=False): + shape, feat_ls, pos = self._encode_views(views) + feat = feat_ls[-1] + state_feat, state_pos = self._init_state(feat[0], pos[0]) + mem = self.pose_retriever.mem.expand(feat[0].shape[0], -1, -1) + init_state_feat = state_feat.clone() + init_mem = mem.clone() + all_state_args = [(state_feat, state_pos, init_state_feat, mem, init_mem)] + ress = [] + for i in range(len(views)): + feat_i = feat[i] + pos_i = pos[i] + if self.pose_head_flag: + global_img_feat_i = self._get_img_level_feat(feat_i) + if i == 0: + pose_feat_i = self.pose_token.expand(feat_i.shape[0], -1, -1) + else: + pose_feat_i = self.pose_retriever.inquire(global_img_feat_i, mem) + pose_pos_i = -torch.ones( + feat_i.shape[0], 1, 2, device=feat_i.device, dtype=pos_i.dtype + ) + else: + pose_feat_i = None + pose_pos_i = None + new_state_feat, dec = self._recurrent_rollout( + state_feat, + state_pos, + feat_i, + pos_i, + pose_feat_i, + pose_pos_i, + init_state_feat, + img_mask=views[i]["img_mask"], + reset_mask=views[i]["reset"], + update=views[i].get("update", None), + ) + out_pose_feat_i = dec[-1][:, 0:1] + new_mem = self.pose_retriever.update_mem( + mem, global_img_feat_i, out_pose_feat_i + ) + assert len(dec) == self.dec_depth + 1 + head_input = [ + dec[0].float(), + dec[self.dec_depth * 2 // 4][:, 1:].float(), + dec[self.dec_depth * 3 // 4][:, 1:].float(), + dec[self.dec_depth].float(), + ] + res = self._downstream_head(head_input, shape[i], pos=pos_i) + ress.append(res) + img_mask = views[i]["img_mask"] + update = views[i].get("update", None) + if update is not None: + update_mask = ( + img_mask & update + ) # if don't update, then whatever img_mask + else: + update_mask = img_mask 
+ update_mask = update_mask[:, None, None].float() + state_feat = new_state_feat * update_mask + state_feat * ( + 1 - update_mask + ) # update global state + mem = new_mem * update_mask + mem * ( + 1 - update_mask + ) # then update local state + reset_mask = views[i]["reset"] + if reset_mask is not None: + reset_mask = reset_mask[:, None, None].float() + state_feat = init_state_feat * reset_mask + state_feat * ( + 1 - reset_mask + ) + mem = init_mem * reset_mask + mem * (1 - reset_mask) + all_state_args.append( + (state_feat, state_pos, init_state_feat, mem, init_mem) + ) + if ret_state: + return ress, views, all_state_args + return ress, views + + def forward(self, views, ret_state=False): + if ret_state: + ress, views, state_args = self._forward_impl(views, ret_state=ret_state) + return ARCroco3DStereoOutput(ress=ress, views=views), state_args + else: + ress, views = self._forward_impl(views, ret_state=ret_state) + return ARCroco3DStereoOutput(ress=ress, views=views) + + def inference_step( + self, view, state_feat, state_pos, init_state_feat, mem, init_mem + ): + batch_size = view["img"].shape[0] + raymaps = [] + shapes = [] + for j in range(batch_size): + assert view["ray_mask"][j] + raymap = view["ray_map"][[j]].permute(0, 3, 1, 2) + raymaps.append(raymap) + shapes.append( + view.get( + "true_shape", + torch.tensor(view["ray_map"].shape[-2:])[None].repeat( + view["ray_map"].shape[0], 1 + ), + )[[j]] + ) + + raymaps = torch.cat(raymaps, dim=0) + shape = torch.cat(shapes, dim=0).to(raymaps.device) + feat_ls, pos, _ = self._encode_ray_map(raymaps, shapes) + + feat_i = feat_ls[-1] + pos_i = pos + if self.pose_head_flag: + global_img_feat_i = self._get_img_level_feat(feat_i) + pose_feat_i = self.pose_retriever.inquire(global_img_feat_i, mem) + pose_pos_i = -torch.ones( + feat_i.shape[0], 1, 2, device=feat_i.device, dtype=pos_i.dtype + ) + else: + pose_feat_i = None + pose_pos_i = None + new_state_feat, dec = self._recurrent_rollout( + state_feat, + state_pos, + 
feat_i, + pos_i, + pose_feat_i, + pose_pos_i, + init_state_feat, + img_mask=view["img_mask"], + reset_mask=view["reset"], + update=view.get("update", None), + ) + + out_pose_feat_i = dec[-1][:, 0:1] + new_mem = self.pose_retriever.update_mem( + mem, global_img_feat_i, out_pose_feat_i + ) + assert len(dec) == self.dec_depth + 1 + head_input = [ + dec[0].float(), + dec[self.dec_depth * 2 // 4][:, 1:].float(), + dec[self.dec_depth * 3 // 4][:, 1:].float(), + dec[self.dec_depth].float(), + ] + res = self._downstream_head(head_input, shape, pos=pos_i) + return res, view + + def forward_recurrent(self, views, device, ret_state=False): + ress = [] + all_state_args = [] + for i, view in enumerate(views): + device = view["img"].device + batch_size = view["img"].shape[0] + img_mask = view["img_mask"].reshape( + -1, batch_size + ) # Shape: (1, batch_size) + ray_mask = view["ray_mask"].reshape( + -1, batch_size + ) # Shape: (1, batch_size) + imgs = view["img"].unsqueeze(0) # Shape: (1, batch_size, C, H, W) + ray_maps = view["ray_map"].unsqueeze( + 0 + ) # Shape: (num_views, batch_size, H, W, C) + shapes = ( + view["true_shape"].unsqueeze(0) + if "true_shape" in view + else torch.tensor(view["img"].shape[-2:], device=device) + .unsqueeze(0) + .repeat(batch_size, 1) + .unsqueeze(0) + ) # Shape: (num_views, batch_size, 2) + imgs = imgs.view( + -1, *imgs.shape[2:] + ) # Shape: (num_views * batch_size, C, H, W) + ray_maps = ray_maps.view( + -1, *ray_maps.shape[2:] + ) # Shape: (num_views * batch_size, H, W, C) + shapes = shapes.view(-1, 2).to( + imgs.device + ) # Shape: (num_views * batch_size, 2) + img_masks_flat = img_mask.view(-1) # Shape: (num_views * batch_size) + ray_masks_flat = ray_mask.view(-1) + selected_imgs = imgs[img_masks_flat] + selected_shapes = shapes[img_masks_flat] + if selected_imgs.size(0) > 0: + img_out, img_pos, _ = self._encode_image(selected_imgs, selected_shapes) + else: + img_out, img_pos = None, None + ray_maps = ray_maps.permute(0, 3, 1, 2) # Change 
shape to (N, C, H, W) + selected_ray_maps = ray_maps[ray_masks_flat] + selected_shapes_ray = shapes[ray_masks_flat] + if selected_ray_maps.size(0) > 0: + ray_out, ray_pos, _ = self._encode_ray_map( + selected_ray_maps, selected_shapes_ray + ) + else: + ray_out, ray_pos = None, None + + shape = shapes + if img_out is not None and ray_out is None: + feat_i = img_out[-1] + pos_i = img_pos + elif img_out is None and ray_out is not None: + feat_i = ray_out[-1] + pos_i = ray_pos + elif img_out is not None and ray_out is not None: + feat_i = img_out[-1] + ray_out[-1] + pos_i = img_pos + else: + raise NotImplementedError + + if i == 0: + state_feat, state_pos = self._init_state(feat_i, pos_i) + mem = self.pose_retriever.mem.expand(feat_i.shape[0], -1, -1) + init_state_feat = state_feat.clone() + init_mem = mem.clone() + all_state_args.append( + (state_feat, state_pos, init_state_feat, mem, init_mem) + ) + + if self.pose_head_flag: + global_img_feat_i = self._get_img_level_feat(feat_i) + if i == 0: + pose_feat_i = self.pose_token.expand(feat_i.shape[0], -1, -1) + else: + pose_feat_i = self.pose_retriever.inquire(global_img_feat_i, mem) + pose_pos_i = -torch.ones( + feat_i.shape[0], 1, 2, device=feat_i.device, dtype=pos_i.dtype + ) + else: + pose_feat_i = None + pose_pos_i = None + new_state_feat, dec = self._recurrent_rollout( + state_feat, + state_pos, + feat_i, + pos_i, + pose_feat_i, + pose_pos_i, + init_state_feat, + img_mask=view["img_mask"], + reset_mask=view["reset"], + update=view.get("update", None), + ) + out_pose_feat_i = dec[-1][:, 0:1] + new_mem = self.pose_retriever.update_mem( + mem, global_img_feat_i, out_pose_feat_i + ) + assert len(dec) == self.dec_depth + 1 + head_input = [ + dec[0].float(), + dec[self.dec_depth * 2 // 4][:, 1:].float(), + dec[self.dec_depth * 3 // 4][:, 1:].float(), + dec[self.dec_depth].float(), + ] + res = self._downstream_head(head_input, shape, pos=pos_i) + ress.append(res) + img_mask = view["img_mask"] + update = view.get("update", 
None) + if update is not None: + update_mask = ( + img_mask & update + ) # if don't update, then whatever img_mask + else: + update_mask = img_mask + update_mask = update_mask[:, None, None].float() + state_feat = new_state_feat * update_mask + state_feat * ( + 1 - update_mask + ) # update global state + mem = new_mem * update_mask + mem * ( + 1 - update_mask + ) # then update local state + reset_mask = view["reset"] + if reset_mask is not None: + reset_mask = reset_mask[:, None, None].float() + state_feat = init_state_feat * reset_mask + state_feat * ( + 1 - reset_mask + ) + mem = init_mem * reset_mask + mem * (1 - reset_mask) + all_state_args.append( + (state_feat, state_pos, init_state_feat, mem, init_mem) + ) + if ret_state: + return ress, views, all_state_args + return ress, views + + +if __name__ == "__main__": + print(ARCroco3DStereo.mro()) + cfg = ARCroco3DStereoConfig( + state_size=256, + pos_embed="RoPE100", + rgb_head=True, + pose_head=True, + img_size=(224, 224), + head_type="linear", + output_mode="pts3d+pose", + depth_mode=("exp", -inf, inf), + conf_mode=("exp", 1, inf), + pose_mode=("exp", -inf, inf), + enc_embed_dim=1024, + enc_depth=24, + enc_num_heads=16, + dec_embed_dim=768, + dec_depth=12, + dec_num_heads=12, + ) + ARCroco3DStereo(cfg) diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/patch_embed.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..6cc177f0b05940b5e9ee01b9053fbf24be6d1905 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/patch_embed.py @@ -0,0 +1,93 @@ +# Copyright (C) 2024-present Naver Corporation. All rights reserved. +# Licensed under CC BY-NC-SA 4.0 (non-commercial use only). 
#
# --------------------------------------------------------
# modified from DUSt3R

import torch
import dust3r.utils.path_to_croco  # noqa: F401
from models.blocks import PatchEmbed  # noqa


def get_patch_embed(patch_embed_cls, img_size, patch_size, enc_embed_dim, in_chans=3):
    """Instantiate the patch-embedding class named by ``patch_embed_cls``.

    Args:
        patch_embed_cls: "PatchEmbedDust3R" or "ManyAR_PatchEmbed".
        img_size, patch_size, enc_embed_dim, in_chans: forwarded to the class
            constructor as ``(img_size, patch_size, in_chans, enc_embed_dim)``.

    Returns:
        The constructed patch-embedding module.
    """
    assert patch_embed_cls in ["PatchEmbedDust3R", "ManyAR_PatchEmbed"]
    # Explicit lookup instead of eval(): with asserts stripped (python -O) an
    # unknown name now fails with KeyError rather than being evaluated.
    registry = {
        "PatchEmbedDust3R": PatchEmbedDust3R,
        "ManyAR_PatchEmbed": ManyAR_PatchEmbed,
    }
    patch_embed = registry[patch_embed_cls](
        img_size, patch_size, in_chans, enc_embed_dim
    )
    return patch_embed


class PatchEmbedDust3R(PatchEmbed):
    """Patch embedding that also returns token positions."""

    def forward(self, x, **kw):
        """Embed ``x`` of shape (B, C, H, W); extra keyword arguments are
        ignored. Returns ``(tokens, pos)``."""
        B, C, H, W = x.shape
        assert (
            H % self.patch_size[0] == 0
        ), f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})."
        assert (
            W % self.patch_size[1] == 0
        ), f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})."
        x = self.proj(x)
        pos = self.position_getter(B, x.size(2), x.size(3), x.device)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x, pos


class ManyAR_PatchEmbed(PatchEmbed):
    """Handle images with non-square aspect ratio.
    All images in the same batch have the same aspect ratio.
    true_shape = [(height, width) ...] indicates the actual shape of each image.
    """

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        norm_layer=None,
        flatten=True,
    ):
        self.embed_dim = embed_dim
        super().__init__(img_size, patch_size, in_chans, embed_dim, norm_layer, flatten)

    def forward(self, img, true_shape):
        """Embed ``img`` of shape (B, C, H, W) given ``true_shape`` of shape
        (B, 2). Returns ``(tokens, pos)`` with ``H * W`` tokens per image,
        where H and W are counted in patches."""
        B, C, H, W = img.shape

        assert (
            H % self.patch_size[0] == 0
        ), f"Input image height ({H}) is not a multiple of patch size ({self.patch_size[0]})."
        assert (
            W % self.patch_size[1] == 0
        ), f"Input image width ({W}) is not a multiple of patch size ({self.patch_size[1]})."
        assert true_shape.shape == (
            B,
            2,
        ), f"true_shape has the wrong shape={true_shape.shape}"

        # Size expressed in tokens. Height pairs with patch_size[0] and width
        # with patch_size[1], matching the asserts above (the indices were
        # swapped before, which only mattered for non-square patches).
        H //= self.patch_size[0]
        W //= self.patch_size[1]
        n_tokens = H * W

        height, width = true_shape.T

        # Every image is treated as landscape, so the portrait branches below
        # select no rows.
        is_landscape = torch.ones_like(width, dtype=torch.bool)
        is_portrait = ~is_landscape

        x = img.new_zeros((B, n_tokens, self.embed_dim))
        pos = img.new_zeros((B, n_tokens, 2), dtype=torch.int64)

        x[is_landscape] = (
            self.proj(img[is_landscape]).permute(0, 2, 3, 1).flatten(1, 2).float()
        )
        x[is_portrait] = (
            self.proj(img[is_portrait].swapaxes(-1, -2))
            .permute(0, 2, 3, 1)
            .flatten(1, 2)
            .float()
        )

        pos[is_landscape] = self.position_getter(1, H, W, pos.device)
        pos[is_portrait] = self.position_getter(1, W, H, pos.device)

        x = self.norm(x)
        return x, pos

# diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/post_process.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/post_process.py
# new file mode 100644
# index 0000000000000000000000000000000000000000..04a6597b33f2074f32b05477437dde2b940b3532
# --- /dev/null
# +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/dust3r/post_process.py
# @@ -0,0 +1,64 @@
# Copyright (C) 2024-present Naver Corporation. All rights reserved.
# Licensed under CC BY-NC-SA 4.0 (non-commercial use only).
def estimate_focal_knowing_depth(
    pts3d, pp, focal_mode="median", min_focal=0.0, max_focal=np.inf
):
    """Estimate one focal length per batch element from a 3D point map.

    Reprojection method, for when the absolute depth is known: the focal is
    the value that best maps ``(x, y) / z`` onto the pixel offsets from the
    principal point.

    Args:
        pts3d: tensor unpacked as (B, H, W, 3).
        pp: principal point(s), viewed as (-1, 1, 2) and subtracted from the
            pixel grid.
        focal_mode: "median" (median of the per-pixel votes ``u*z/x`` and
            ``v*z/y``, NaNs ignored) or "weiszfeld" (closed-form L2 start
            followed by 10 re-weighted least-squares updates).
        min_focal, max_focal: clipping bounds, expressed as multiples of the
            focal that corresponds to a 60 degree field of view over
            ``max(H, W)``.

    Returns:
        The clipped focal tensor (one value per batch element).

    Raises:
        ValueError: if ``focal_mode`` is not one of the two supported modes.
    """
    B, H, W, THREE = pts3d.shape
    assert THREE == 3

    # pixel coordinates relative to the principal point, B,HW,2
    grid = xy_grid(W, H, device=pts3d.device).view(1, -1, 2)
    pixels = grid - pp.view(-1, 1, 2)
    pts3d = pts3d.flatten(1, 2)  # (B, HW, 3)

    if focal_mode == "median":
        with torch.no_grad():
            u, v = pixels.unbind(dim=-1)
            x, y, z = pts3d.unbind(dim=-1)
            votes_x = ((u * z) / x).view(B, -1)
            votes_y = ((v * z) / y).view(B, -1)
            all_votes = torch.cat((votes_x, votes_y), dim=-1)
            focal = torch.nanmedian(all_votes, dim=-1).values

    elif focal_mode == "weiszfeld":
        # homogeneous (x,y,1); infinities from z == 0 are zeroed
        ratios = (pts3d[..., :2] / pts3d[..., 2:3]).nan_to_num(posinf=0, neginf=0)

        numer = (ratios * pixels).sum(dim=-1)
        denom = ratios.square().sum(dim=-1)

        # closed-form least-squares initialisation
        focal = numer.mean(dim=1) / denom.mean(dim=1)

        for _ in range(10):
            # weight each pixel by the inverse of its current residual
            residual = (pixels - focal.view(-1, 1, 1) * ratios).norm(dim=-1)
            weights = residual.clip(min=1e-8).reciprocal()
            focal = (weights * numer).mean(dim=1) / (weights * denom).mean(dim=1)

    else:
        raise ValueError(f"bad {focal_mode=}")

    # size / 1.1547005383792515
    focal_base = max(H, W) / (2 * np.tan(np.deg2rad(60) / 2))
    return focal.clip(min=min_focal * focal_base, max=max_focal * focal_base)
def float2uint8(x):
    """Scale a float array by 255 and cast it to uint8."""
    return (255.0 * x).astype(np.uint8)


def uint82float(img):
    """Return a contiguous copy of ``img`` divided by 255."""
    return np.ascontiguousarray(img) / 255.0


def cat_3d(vecs):
    """Concatenate one array/tensor, or a sequence of them, into an (N, 3) array."""
    if isinstance(vecs, (np.ndarray, torch.Tensor)):
        vecs = [vecs]
    return np.concatenate([p.reshape(-1, 3) for p in to_numpy(vecs)])


def show_raw_pointcloud(pts3d, colors, point_size=2):
    """Open a trimesh viewer showing ``pts3d`` colored by ``colors``."""
    scene = trimesh.Scene()

    pct = trimesh.PointCloud(cat_3d(pts3d), colors=cat_3d(colors))
    scene.add_geometry(pct)

    scene.show(line_settings={"point_size": point_size})


def pts3d_to_trimesh(img, pts3d, valid=None):
    """Triangulate a pixel-aligned point map into mesh arrays.

    Every 2x2 pixel neighbourhood yields two triangles, each emitted twice with
    opposite winding so that the surface is visible from both sides.

    Args:
        img: (H, W, 3) array of colors.
        pts3d: (H, W, 3) array of 3D points, same shape as ``img``.
        valid: optional (H, W) boolean array; faces touching an invalid pixel
            are dropped.

    Returns:
        dict with ``vertices`` (H*W, 3), ``faces`` (F, 3) and ``face_colors`` (F, 3).
    """
    H, W, THREE = img.shape
    assert THREE == 3
    assert img.shape == pts3d.shape

    vertices = pts3d.reshape(-1, 3)

    # make squares: each pixel == 2 triangles
    idx = np.arange(len(vertices)).reshape(H, W)
    idx1 = idx[:-1, :-1].ravel()  # top-left corner
    idx2 = idx[:-1, +1:].ravel()  # top-right corner
    idx3 = idx[+1:, :-1].ravel()  # bottom-left corner
    idx4 = idx[+1:, +1:].ravel()  # bottom-right corner
    faces = np.concatenate(
        (
            np.c_[idx1, idx2, idx3],
            # same triangle, but backward (cheap solution to cancel face culling)
            np.c_[idx3, idx2, idx1],
            np.c_[idx2, idx3, idx4],
            # same triangle, but backward (cheap solution to cancel face culling)
            np.c_[idx4, idx3, idx2],
        ),
        axis=0,
    )

    # one color per face, in the same order as the faces above
    top_left = img[:-1, :-1].reshape(-1, 3)
    bottom_right = img[+1:, +1:].reshape(-1, 3)
    face_colors = np.concatenate(
        (top_left, top_left, bottom_right, bottom_right), axis=0
    )

    # remove faces that use at least one invalid vertex
    if valid is not None:
        assert valid.shape == (H, W)
        valid_faces = valid.ravel()[faces].all(axis=-1)
        faces = faces[valid_faces]
        face_colors = face_colors[valid_faces]

    assert len(faces) == len(face_colors)
    return dict(vertices=vertices, face_colors=face_colors, faces=faces)


def cat_meshes(meshes):
    """Merge several mesh dicts (as produced by ``pts3d_to_trimesh``) into one.

    Face indices of each mesh are shifted by the number of vertices that
    precede it. The input meshes are left untouched: the shift is applied to
    fresh arrays, so calling this twice on the same meshes gives the same
    result (the previous in-place ``+=`` corrupted the callers' face arrays).

    Args:
        meshes: non-empty sequence of dicts with ``vertices``, ``faces`` and
            ``face_colors`` entries.

    Returns:
        dict with the concatenated ``vertices``, ``face_colors`` and ``faces``.
    """
    vertices = [m["vertices"] for m in meshes]
    colors = [m["face_colors"] for m in meshes]

    # offsets[i] = number of vertices contributed by meshes 0..i-1
    offsets = np.cumsum([0] + [len(v) for v in vertices])
    faces = [m["faces"] + offset for m, offset in zip(meshes, offsets)]

    return dict(
        vertices=np.concatenate(vertices),
        face_colors=np.concatenate(colors),
        faces=np.concatenate(faces),
    )
def auto_cam_size(im_poses):
    """Return 10% of the median distance between the given poses."""
    return 0.1 * get_med_dist_between_poses(im_poses)


class SceneViz:
    """Small fluent wrapper around a ``trimesh.Scene`` for point clouds and cameras.

    All ``add_*`` methods return ``self`` so that calls can be chained.
    """

    def __init__(self):
        self.scene = trimesh.Scene()

    def add_pointcloud(self, pts3d, color=(0, 0, 0), mask=None, denoise=False):
        """Add one or several point clouds to the scene.

        Args:
            pts3d: a single array/tensor, or a list of them.
            color: either a 3-tuple applied to every point, or per-point colors
                (single array, or list matching ``pts3d``).
            mask: optional selection applied to both points and per-point colors.
            denoise: if True, drop the 1% of points farthest from the median point.

        Returns:
            self
        """
        pts3d = to_numpy(pts3d)
        mask = to_numpy(mask)
        if not isinstance(pts3d, list):
            pts3d = [pts3d.reshape(-1, 3)]
            if mask is not None:
                mask = [mask.ravel()]
        if not isinstance(color, (tuple, list)):
            color = [color.reshape(-1, 3)]
        if mask is None:
            mask = [slice(None)] * len(pts3d)

        pts = np.concatenate([p[m] for p, m in zip(pts3d, mask)])
        pct = trimesh.PointCloud(pts)

        if isinstance(color, (list, np.ndarray, torch.Tensor)):
            # per-point colors
            color = to_numpy(color)
            col = np.concatenate([p[m] for p, m in zip(color, mask)])
            # (the message used to call an undefined ``bb()``, which turned a
            # failed check into a NameError)
            assert (
                col.shape == pts.shape
            ), f"colors shape {col.shape} does not match points shape {pts.shape}"
            pct.visual.vertex_colors = uint8(col.reshape(-1, 3))
        else:
            # single color for the whole cloud
            assert len(color) == 3
            pct.visual.vertex_colors = np.broadcast_to(uint8(color), pts.shape)

        if denoise:
            # remove the points that lie farthest from the median point
            centroid = np.median(pct.vertices, axis=0)
            dist_to_centroid = np.linalg.norm(pct.vertices - centroid, axis=-1)
            dist_thr = np.quantile(dist_to_centroid, 0.99)
            valid = dist_to_centroid < dist_thr

            # the keyword is ``colors``; the former ``color=`` was not the
            # keyword used elsewhere in this file to set point colors
            pct = trimesh.PointCloud(
                pct.vertices[valid], colors=pct.visual.vertex_colors[valid]
            )

        self.scene.add_geometry(pct)
        return self

    # NOTE: an earlier ``add_rgbd`` definition in this class was removed. It was
    # shadowed by this one (so never callable) and relied on an undefined
    # ``depthmap_to_pts3d`` helper.
    def add_rgbd(
        self, image, depth, intrinsics=None, cam2world=None, zfar=np.inf, mask=None
    ):
        """Back-project a depth map and add it as a colored point cloud.

        Args:
            image: (H, W, 3) colors.
            depth: depth map; pixels with ``depth >= zfar`` are discarded.
            intrinsics: 3x3 matrix; when None, a focal of ``max(H, W)`` and a
                centered principal point are made up.
            cam2world: pose forwarded to
                ``depthmap_to_absolute_camera_coordinates``.
            zfar: far clipping distance.
            mask: optional extra validity mask, AND-ed with the computed one.

        Returns:
            self
        """
        # make up some intrinsics
        if intrinsics is None:
            H, W, THREE = image.shape
            focal = max(H, W)
            intrinsics = np.float32([[focal, 0, W / 2], [0, focal, H / 2], [0, 0, 1]])

        pts3d, mask2 = depthmap_to_absolute_camera_coordinates(
            depth, intrinsics, cam2world
        )
        mask2 &= depth < zfar

        if mask is not None:
            mask2 &= mask

        return self.add_pointcloud(pts3d, image, mask=mask2)

    def add_camera(
        self,
        pose_c2w,
        focal=None,
        color=(0, 0, 0),
        image=None,
        imsize=None,
        cam_size=0.03,
    ):
        """Add one camera frustum to the scene.

        ``focal`` may also be a 3x3 intrinsics matrix, in which case the focal
        is the geometric mean of its two diagonal entries and, if ``imsize`` is
        not given, the image size is taken as twice the principal point.

        Returns:
            self
        """
        pose_c2w, focal, color, image = to_numpy((pose_c2w, focal, color, image))
        image = img_to_arr(image)
        if isinstance(focal, np.ndarray) and focal.shape == (3, 3):
            intrinsics = focal
            focal = (intrinsics[0, 0] * intrinsics[1, 1]) ** 0.5
            if imsize is None:
                imsize = (2 * intrinsics[0, 2], 2 * intrinsics[1, 2])

        add_scene_cam(
            self.scene,
            pose_c2w,
            color,
            image,
            focal,
            imsize=imsize,
            screen_width=cam_size,
            marker=None,
        )
        return self

    def add_cameras(
        self, poses, focals=None, images=None, imsizes=None, colors=None, **kw
    ):
        """Add one camera per pose; the optional per-camera lists may be None.

        Returns:
            self
        """

        def get(arr, idx):
            return None if arr is None else arr[idx]

        for i, pose_c2w in enumerate(poses):
            self.add_camera(
                pose_c2w,
                get(focals, i),
                image=get(images, i),
                color=get(colors, i),
                imsize=get(imsizes, i),
                **kw,
            )
        return self

    def show(self, point_size=2):
        """Open the trimesh viewer on the scene."""
        self.scene.show(line_settings={"point_size": point_size})
def add_scene_cam(
    scene,
    pose_c2w,
    edge_color,
    image=None,
    focal=None,
    imsize=None,
    screen_width=0.03,
    marker=None,
):
    """Add a camera frustum (and optionally its image) to a trimesh scene.

    Args:
        scene: scene receiving the geometry (``scene.add_geometry`` is called).
        pose_c2w: 4x4 camera-to-world matrix.
        edge_color: RGB color written into the frustum face colors.
        image: optional (H, W, 3) image textured on the frustum base; non-uint8
            images are multiplied by 255 and cast.
        focal: optional focal (scalar or array, whose first element is used).
        imsize: optional (W, H), used when no image is given.
        screen_width: size of the drawn frustum.
        marker: if "o", also add a small sphere at the camera center.
    """
    if image is not None:
        image = np.asarray(image)
        H, W, THREE = image.shape
        assert THREE == 3
        if image.dtype != np.uint8:
            image = np.uint8(255 * image)
    elif imsize is not None:
        W, H = imsize
    elif focal is not None:
        H = W = focal / 1.1
    else:
        H = W = 1

    if isinstance(focal, np.ndarray):
        # ravel() first so that 0-d arrays (e.g. a converted scalar tensor)
        # work too; plain ``focal[0]`` raises IndexError on those
        focal = focal.ravel()[0]
    if not focal:
        focal = min(H, W) * 1.1  # default value

    # create fake camera
    height = max(screen_width / 10, focal * screen_width / H)
    width = screen_width * 0.5**0.5
    rot45 = np.eye(4)
    rot45[:3, :3] = Rotation.from_euler("z", np.deg2rad(45)).as_matrix()
    rot45[2, 3] = -height  # set the tip of the cone = optical center
    aspect_ratio = np.eye(4)
    aspect_ratio[0, 0] = W / H
    transform = pose_c2w @ OPENGL @ aspect_ratio @ rot45
    cam = trimesh.creation.cone(width, height, sections=4)  # , transform=transform)

    # this is the image, drawn on the base of the cone (both windings)
    if image is not None:
        vertices = geotrf(transform, cam.vertices[[4, 5, 1, 3]])
        faces = np.array([[0, 1, 2], [0, 2, 3], [2, 1, 0], [3, 2, 0]])
        img = trimesh.Trimesh(vertices=vertices, faces=faces)
        uv_coords = np.float32([[0, 0], [1, 0], [1, 1], [0, 1]])
        img.visual = trimesh.visual.TextureVisuals(
            uv_coords, image=PIL.Image.fromarray(image)
        )
        scene.add_geometry(img)

    # this is the camera mesh: the cone vertices, a 95% scaled copy and a copy
    # rotated by 2 degrees, joined by thin triangles that act as visible edges
    rot2 = np.eye(4)
    rot2[:3, :3] = Rotation.from_euler("z", np.deg2rad(2)).as_matrix()
    vertices = np.r_[cam.vertices, 0.95 * cam.vertices, geotrf(rot2, cam.vertices)]
    vertices = geotrf(transform, vertices)
    n_cam_vertices = len(cam.vertices)
    faces = []
    for face in cam.faces:
        if 0 in face:
            continue
        a, b, c = face
        a2, b2, c2 = face + n_cam_vertices
        a3, b3, c3 = face + 2 * n_cam_vertices

        # add 3 pseudo-edges, once per vertex copy
        faces.append((a, b, b2))
        faces.append((a, a2, c))
        faces.append((c2, b, c))

        faces.append((a, b, b3))
        faces.append((a, a3, c))
        faces.append((c3, b, c))

    # no culling: add every face again with the opposite winding
    faces += [(c, b, a) for a, b, c in faces]

    cam = trimesh.Trimesh(vertices=vertices, faces=faces)
    cam.visual.face_colors[:, :3] = edge_color
    scene.add_geometry(cam)

    if marker == "o":
        marker = trimesh.creation.icosphere(3, radius=screen_width / 4)
        marker.vertices += pose_c2w[:3, 3]
        marker.visual.face_colors[:, :3] = edge_color
        scene.add_geometry(marker)


def cat(a, b):
    """Stack two arrays as rows of 3 values each."""
    return np.concatenate((a.reshape(-1, 3), b.reshape(-1, 3)))


# Flips the Y and Z axes; applied between the camera pose and the frustum
# geometry in add_scene_cam.
OPENGL = np.array([[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]])


# Default palette for camera frustums.
CAM_COLORS = [
    (255, 0, 0),
    (0, 0, 255),
    (0, 255, 0),
    (255, 0, 255),
    (255, 204, 0),
    (0, 204, 204),
    (128, 255, 255),
    (255, 128, 255),
    (255, 255, 128),
    (0, 0, 0),
    (128, 128, 128),
]


def uint8(colors):
    """Convert colors to uint8; floating-point input is first multiplied by 255.

    The input is never modified (the previous in-place ``*=`` rescaled the
    caller's float array as a side effect).

    Raises:
        AssertionError: if the resulting values fall outside [0, 256).
    """
    if not isinstance(colors, np.ndarray):
        colors = np.array(colors)
    if np.issubdtype(colors.dtype, np.floating):
        colors = colors * 255  # out-of-place on purpose
    assert 0 <= colors.min() and colors.max() < 256
    return np.uint8(colors)


def segment_sky(image):
    """Heuristic sky mask from HSV thresholds.

    Pixels inside a fixed HSV range, or with low saturation and high value,
    are candidates; the candidates are cleaned with a 5x5 binary opening and
    only the connected components larger than half of the largest one are kept.

    Args:
        image: (H, W, 3) image; floating-point input is clipped to [0, 1] and
            converted to uint8.

    Returns:
        Boolean torch tensor of shape (H, W).
    """
    import cv2
    from scipy import ndimage

    image = to_numpy(image)
    if np.issubdtype(image.dtype, np.floating):
        image = np.uint8(255 * image.clip(min=0, max=1))
    # NOTE(review): the conversion assumes BGR channel order - confirm against callers
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    lower_blue = np.array([0, 0, 100])
    upper_blue = np.array([30, 255, 255])
    mask = cv2.inRange(hsv, lower_blue, upper_blue).view(bool)

    # add luminous gray
    mask |= (hsv[:, :, 1] < 10) & (hsv[:, :, 2] > 150)
    mask |= (hsv[:, :, 1] < 30) & (hsv[:, :, 2] > 180)
    mask |= (hsv[:, :, 1] < 50) & (hsv[:, :, 2] > 220)

    # morphological opening removes small isolated candidates
    kernel = np.ones((5, 5), np.uint8)
    mask2 = ndimage.binary_opening(mask, structure=kernel)

    # keep the components whose area exceeds half of the largest one
    # (label 0 is the background, hence the ``1 +`` offset)
    _, labels, stats, _ = cv2.connectedComponentsWithStats(
        mask2.view(np.uint8), connectivity=8
    )
    cc_sizes = stats[1:, cv2.CC_STAT_AREA]
    if len(cc_sizes):
        selection = 1 + np.flatnonzero(cc_sizes > cc_sizes.max() / 2)
    else:
        selection = []
    # np.isin keeps the input shape and replaces the deprecated np.in1d
    mask3 = np.isin(labels, selection)

    return torch.from_numpy(mask3)
def colorize_np(
    x,
    cmap_name="jet",
    mask=None,
    range=None,
    append_cbar=False,
    cbar_in_image=False,
    cbar_precision=2,
):
    """
    turn a grayscale image into a color image
    :param x: input grayscale, [H, W] (never modified)
    :param cmap_name: the colorization method
    :param mask: the mask image, [H, W]; pixels outside the mask are drawn white
    :param range: the range for scaling, automatic if None, [min, max]
    :param append_cbar: if append the color bar
    :param cbar_in_image: put the color bar inside the image to keep the output image the same size as the input image
    :param cbar_precision: number of decimals of the color bar tick labels
    :return: colorized image, [H, W, 3] (wider when the color bar is appended outside the image)
    """
    if range is not None:
        vmin, vmax = range
    elif mask is not None:
        masked = x[mask]
        # smallest non-zero value inside the mask
        vmin = np.min(masked[np.nonzero(masked)])
        vmax = np.max(masked)

        # work on a copy: the caller's array must not be overwritten
        x = x.copy()
        x[np.logical_not(mask)] = vmin

    else:
        vmin, vmax = np.percentile(x, (1, 100))
        vmax += 1e-6

    x = np.clip(x, vmin, vmax)
    x = (x - vmin) / (vmax - vmin)

    # plt.get_cmap works on both old and new Matplotlib; cm.get_cmap was
    # removed from recent releases
    cmap = plt.get_cmap(cmap_name)
    x_new = cmap(x)[:, :, :3]

    if mask is not None:
        mask = np.float32(mask[:, :, np.newaxis])
        x_new = x_new * mask + np.ones_like(x_new) * (1.0 - mask)

    if not append_cbar:
        return x_new

    # the color bar is only rendered when it is actually requested
    cbar = get_vertical_colorbar(
        h=x.shape[0],
        vmin=vmin,
        vmax=vmax,
        cmap_name=cmap_name,
        cbar_precision=cbar_precision,
    )
    if cbar_in_image:
        x_new[:, -cbar.shape[1] :, :] = cbar
    else:
        x_new = np.concatenate((x_new, np.zeros_like(x_new[:, :5, :]), cbar), axis=1)
    return x_new


def colorize(
    x, cmap_name="jet", mask=None, range=None, append_cbar=False, cbar_in_image=False
):
    """
    turn a grayscale image into a color image
    :param x: torch.Tensor, grayscale image, [H, W] or [B, H, W]
    :param mask: torch.Tensor or None, mask image, [H, W] or [B, H, W] or None;
        values above 0.99 count as valid and each mask is eroded by a 3x3 kernel
    :return: torch.Tensor on the device of ``x``, [B, H, W, 3] with a leading
        dimension of size 1 squeezed away
    """
    device = x.device
    x = x.cpu().numpy()
    if mask is not None:
        mask = mask.cpu().numpy() > 0.99
    kernel = np.ones((3, 3), np.uint8)

    if x.ndim == 2:
        x = x[None]
        if mask is not None:
            mask = mask[None]

    # Pair every image with its own mask. The loop used to erode the whole
    # batch of masks at every iteration and to hand the batched mask to the
    # per-image colorizer.
    masks = [None] * len(x) if mask is None else mask

    out = []
    for x_, mask_ in zip(x, masks):
        if mask_ is not None:
            mask_ = cv2.erode(mask_.astype(np.uint8), kernel, iterations=1).astype(
                bool
            )

        colored = colorize_np(x_, cmap_name, mask_, range, append_cbar, cbar_in_image)
        out.append(torch.from_numpy(colored).to(device).float())
    return torch.stack(out).squeeze(0)


def draw_correspondences(
    imgs1, imgs2, coords1, coords2, interval=10, color_by=0, radius=2
):
    """
    draw correspondences between two images
    :param imgs1: tensor [B, H, W, 3]
    :param imgs2: tensor [B, H, W, 3]
    :param coords1: per-image coordinate tensors; they are subsampled along
        their first two axes and then flattened to [N, 2]
    :param coords2: same layout as coords1
    :param interval: int the interval between two points
    :param color_by: specify the color based on image 1 or image 2, 0 or 1
    :param radius: radius of the drawn points
    :return: numpy array with the two images of each pair stacked vertically
    """
    out = []
    for img1, img2, c1, c2 in zip(imgs1, imgs2, coords1, coords2):
        img1 = img1.detach().cpu().numpy()
        img2 = img2.detach().cpu().numpy()
        coord1 = c1.detach().cpu().numpy()[::interval, ::interval].reshape(-1, 2)
        coord2 = c2.detach().cpu().numpy()[::interval, ::interval].reshape(-1, 2)
        out.append(
            drawMatches(
                img1,
                img2,
                coord1,
                coord2,
                radius=radius,
                color_by=color_by,
                row_cat=True,
            )
        )
    return np.stack(out)
def drawMatches(img1, img2, kp1, kp2, radius=2, mask=None, color_by=0, row_cat=False):
    """Draw matching keypoints on two images and concatenate the result.

    Each match gets a color that depends on the angle of its keypoint around
    the median keypoint of the reference image, so corresponding points share
    a color in both images. Points outside their image are skipped.

    Args:
        img1, img2: float images, converted with ``float2uint8``.
        kp1, kp2: (N, 2) arrays of (x, y) keypoints, matched row by row.
        radius: radius of the drawn discs.
        mask: optional (N,) array; matches are drawn in decreasing mask order
            and truthy entries are drawn as crosses in the second image.
        color_by: 0 to derive colors from ``kp1``, 1 to derive them from ``kp2``.
        row_cat: stack the images vertically instead of side by side.

    Returns:
        uint8 image containing both annotated images.

    Raises:
        ValueError: if ``color_by`` is neither 0 nor 1.
    """
    if color_by not in (0, 1):
        # previously surfaced as an unbound-variable error at the first keypoint
        raise ValueError(f"color_by must be 0 or 1, got {color_by!r}")

    h1, w1 = img1.shape[:2]
    h2, w2 = img2.shape[:2]

    img1 = np.ascontiguousarray(float2uint8(img1))
    img2 = np.ascontiguousarray(float2uint8(img2))

    center1 = np.median(kp1, axis=0)
    center2 = np.median(kp2, axis=0)

    # 128 hues, channel order reversed because OpenCV draws in BGR
    n_colors = 128
    palette = [
        tuple(
            (255 * np.array(plt.cm.hsv(i / float(n_colors)))[:3][::-1])
            .astype(np.int32)
            .tolist()
        )
        for i in range(n_colors)
    ]

    if mask is not None:
        ind = np.argsort(mask)[::-1]
        kp1 = kp1[ind]
        kp2 = kp2[ind]
        mask = mask[ind]

    def inside(pt, width, height):
        return 0 <= pt[0] <= width - 1 and 0 <= pt[1] <= height - 1

    for i, (pt1, pt2) in enumerate(zip(kp1, kp2)):
        if color_by == 0:
            coord_angle = np.arctan2(pt1[1] - center1[1], pt1[0] - center1[0])
        else:
            coord_angle = np.arctan2(pt2[1] - center2[1], pt2[0] - center2[0])

        color = palette[int(np.int32(64 * coord_angle / np.pi) % n_colors)]

        if inside(pt1, w1, h1):
            img1 = cv2.circle(
                img1, (int(pt1[0]), int(pt1[1])), radius, color, -1, cv2.LINE_AA
            )

        if inside(pt2, w2, h2):
            if mask is not None and mask[i]:
                img2 = cv2.drawMarker(
                    img2,
                    (int(pt2[0]), int(pt2[1])),
                    color,
                    markerType=cv2.MARKER_CROSS,
                    markerSize=int(5 * radius),
                    # keep the stroke at least 1 px wide for small radii
                    thickness=max(1, int(radius / 2)),
                    line_type=cv2.LINE_AA,
                )
            else:
                img2 = cv2.circle(
                    img2, (int(pt2[0]), int(pt2[1])), radius, color, -1, cv2.LINE_AA
                )

    return np.concatenate([img1, img2], axis=0 if row_cat else 1)
import os
import time

import torch


def rotation_matrix_to_quaternion(R):
    """
    Convert a rotation matrix to a unit quaternion in (w, x, y, z) order.

    The branch is chosen from the trace and the largest diagonal entry so that
    the divisor never vanishes; the plain ``0.5 * sqrt(1 + trace)`` formula
    divides by zero for 180 degree rotations. The result always has ``w >= 0``.

    :param R: [3, 3]
    :return: [4]
    """
    R = np.asarray(R, dtype=np.float64)
    tr = np.trace(R)
    q = np.zeros(4)
    if tr > 0:
        s = 2.0 * np.sqrt(1.0 + tr)  # s == 4 * w
        q[0] = 0.25 * s
        q[1] = (R[2, 1] - R[1, 2]) / s
        q[2] = (R[0, 2] - R[2, 0]) / s
        q[3] = (R[1, 0] - R[0, 1]) / s
    elif R[0, 0] > R[1, 1] and R[0, 0] > R[2, 2]:
        s = 2.0 * np.sqrt(1.0 + R[0, 0] - R[1, 1] - R[2, 2])  # s == 4 * x
        q[0] = (R[2, 1] - R[1, 2]) / s
        q[1] = 0.25 * s
        q[2] = (R[0, 1] + R[1, 0]) / s
        q[3] = (R[0, 2] + R[2, 0]) / s
    elif R[1, 1] > R[2, 2]:
        s = 2.0 * np.sqrt(1.0 + R[1, 1] - R[0, 0] - R[2, 2])  # s == 4 * y
        q[0] = (R[0, 2] - R[2, 0]) / s
        q[1] = (R[0, 1] + R[1, 0]) / s
        q[2] = 0.25 * s
        q[3] = (R[1, 2] + R[2, 1]) / s
    else:
        s = 2.0 * np.sqrt(1.0 + R[2, 2] - R[0, 0] - R[1, 1])  # s == 4 * z
        q[0] = (R[1, 0] - R[0, 1]) / s
        q[1] = (R[0, 2] + R[2, 0]) / s
        q[2] = (R[1, 2] + R[2, 1]) / s
        q[3] = 0.25 * s
    # q and -q encode the same rotation; keep the w >= 0 representative
    if q[0] < 0:
        q = -q
    return q


class PointCloudViewer:
    """Viser-based viewer that steps through point clouds saved in a directory.

    Every file in ``pc_dir`` is loaded with ``np.load``; the integer after the
    last underscore of the file name (before the first dot) is its step.
    """

    def __init__(self, pc_dir, device="cpu"):
        # imported lazily so that this module can be imported without viser
        import viser

        self.server = viser.ViserServer()
        self.server.set_up_direction("-y")
        self.device = device
        self.tt = lambda x: torch.from_numpy(x).float().to(device)
        self.pc_dir = pc_dir
        self.pcs, self.all_steps = self.read_data()
        self.num_frames = len(self.all_steps)

        self.fix_camera = False
        self.camera_scale = self.server.add_gui_slider(
            "camera_scale",
            min=0.01,
            max=1.0,
            step=0.01,
            initial_value=0.1,
        )

        self.camera_handles = []

    @staticmethod
    def _step_of(file_name):
        """Return the integer step encoded in a file name such as ``pc_12.npz``."""
        return int(file_name.split(".")[0].split("_")[-1])

    def read_data(self):
        """Load every file of ``pc_dir``.

        Returns:
            (pcs, steps): ``pcs`` maps step -> {"pc": loaded data}; ``steps`` is
            the list of steps in increasing order.
        """
        pc_list = sorted(os.listdir(self.pc_dir), key=self._step_of)
        pcs = {}
        step_list = []
        for pc_name in pc_list:
            pc = np.load(os.path.join(self.pc_dir, pc_name))
            step = self._step_of(pc_name)
            pcs[step] = {"pc": pc}
            step_list.append(step)
        return pcs, step_list

    def parse_pc_data(self, pc, batch_idx=-1):
        """Extract flattened points and colors of one batch element.

        Reads ``pts3d_{k}`` / ``colors_{k}`` for k = 1 .. len(pc) // 2 and
        returns them as ``pred_pts_{k}`` / ``color_{k}`` arrays of shape (N, 3).
        """
        idx = batch_idx
        ret_dict = {}
        for i in range(len(pc.keys()) // 2):
            pred_pts = pc[f"pts3d_{i+1}"][idx].reshape(-1, 3)  # [N, 3]
            color = pc[f"colors_{i+1}"][idx].reshape(-1, 3)  # [N, 3]
            ret_dict.update({f"pred_pts_{i+1}": pred_pts, f"color_{i+1}": color})
        return ret_dict

    def add_pc(self, step):
        """Add the point clouds of ``step`` under ``/frames/{step}``.

        Raises:
            NotImplementedError: after the point clouds were added, whenever
                ``self.fix_camera`` is False (its value after construction).
        """
        pc = self.pcs[step]["pc"]
        pc_dict = self.parse_pc_data(pc)

        for i in range(len(pc_dict.keys()) // 2):
            self.server.add_point_cloud(
                name=f"/frames/{step}/pred_pts_{i+1}_{step}",
                points=pc_dict[f"pred_pts_{i+1}"],
                colors=pc_dict[f"color_{i+1}"],
                point_size=0.002,
            )

        if not self.fix_camera:
            # NOTE(review): the code that used to follow this raise could never
            # run successfully: it called an undefined
            # ``find_rigid_alignment_batched`` and read keys ("pred_pts1_2",
            # "color1_1", ...) that parse_pc_data never produces. It was
            # removed; per-frame camera frustums remain unimplemented.
            raise NotImplementedError

    def animate(self):
        """Build the playback GUI, load every frame and run the playback loop.

        This method does not return: it ends in an infinite loop that advances
        the time step while "Playing" is checked.
        """
        with self.server.add_gui_folder("Playback"):
            gui_timestep = self.server.add_gui_slider(
                "Train Step",
                min=0,
                max=self.num_frames - 1,
                step=1,
                initial_value=0,
                disabled=True,
            )
            gui_next_frame = self.server.add_gui_button("Next Step", disabled=True)
            gui_prev_frame = self.server.add_gui_button("Prev Step", disabled=True)
            gui_playing = self.server.add_gui_checkbox("Playing", False)
            gui_framerate = self.server.add_gui_slider(
                "FPS", min=1, max=60, step=0.1, initial_value=1
            )
            gui_framerate_options = self.server.add_gui_button_group(
                "FPS options", ("10", "20", "30", "60")
            )

        @gui_next_frame.on_click
        def _(_) -> None:
            gui_timestep.value = (gui_timestep.value + 1) % self.num_frames

        @gui_prev_frame.on_click
        def _(_) -> None:
            gui_timestep.value = (gui_timestep.value - 1) % self.num_frames

        @gui_playing.on_update
        def _(_) -> None:
            # manual stepping is disabled while playing
            gui_timestep.disabled = gui_playing.value
            gui_next_frame.disabled = gui_playing.value
            gui_prev_frame.disabled = gui_playing.value

        @gui_framerate_options.on_click
        def _(_) -> None:
            gui_framerate.value = int(gui_framerate_options.value)

        prev_timestep = gui_timestep.value

        @gui_timestep.on_update
        def _(_) -> None:
            # show the newly selected frame, hide the previous one
            nonlocal prev_timestep
            current_timestep = gui_timestep.value
            with self.server.atomic():
                frame_nodes[current_timestep].visible = True
                frame_nodes[prev_timestep].visible = False
            prev_timestep = current_timestep
            self.server.flush()  # Optional!

        self.server.add_frame(
            "/frames",
            show_axes=False,
        )
        frame_nodes = []
        for i in range(self.num_frames):
            step = self.all_steps[i]
            frame_nodes.append(
                self.server.add_frame(
                    f"/frames/{step}",
                    show_axes=False,
                )
            )
            self.add_pc(step)

        # only the currently selected frame starts visible
        for i, frame_node in enumerate(frame_nodes):
            frame_node.visible = i == gui_timestep.value

        prev_timestep = gui_timestep.value
        while True:
            if gui_playing.value:
                gui_timestep.value = (gui_timestep.value + 1) % self.num_frames
            for handle in self.camera_handles:
                handle.scale = self.camera_scale.value
            time.sleep(1.0 / gui_framerate.value)

    def run(self):
        """Start the viewer (blocks forever)."""
        self.animate()
        # not reached while animate() keeps looping; kept as a safety net
        while True:
            time.sleep(10.0)


def colorize_feature_map(x):
    """
    Args:
        x: torch.Tensor, [B, H, W, D]
    Returns:
        torch.Tensor, [B, H, W, 3] (a leading dimension of size 1 is squeezed)
    """
    device = x.device
    x = x.cpu().numpy()

    out = [torch.from_numpy(colorize_feature_map_np(x_)).to(device) for x_ in x]
    return torch.stack(out).squeeze(0)


def colorize_feature_map_np(x):
    """
    Project the features onto their first 3 principal components and rescale
    the result to [0, 1] with a single global min/max.

    Args:
        x: np.ndarray, [H, W, D]
    Returns:
        np.ndarray, [H, W, 3]
    """
    # imported lazily so that this module can be imported without scikit-learn
    from sklearn.decomposition import PCA

    pca = PCA(n_components=3)
    pca_features = pca.fit_transform(x.reshape(-1, x.shape[-1]))

    low = pca_features.min()
    span = pca_features.max() - low
    if span == 0:
        # constant projection: avoid 0/0 and return all zeros
        span = 1.0
    pca_features = (pca_features - low) / span
    return pca_features.reshape(x.shape[0], x.shape[1], 3)
# xFormers is never imported by this module, so the memory-efficient path of
# MemEffAttention is disabled.
XFORMERS_AVAILABLE = False


class Attention(nn.Module):
    """Multi-head self-attention with optional rotary embedding and KV cache."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = nn.LayerNorm,
        qk_norm: bool = False,
        fused_attn: bool = True,  # use F.scaled_dot_product_attention or not
        rope=None,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5
        self.fused_attn = fused_attn

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)
        self.rope = rope

    def forward(
        self,
        x: torch.Tensor,
        pos=None,
        attn_mask=None,
        past_key_values=None,
        use_cache=False,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Tuple]]:
        """Apply self-attention to ``x`` of shape (B, N, C).

        Args:
            x: input tokens, (B, N, C).
            pos: optional positions handed to ``self.rope`` for the queries
                (and, repeated once per cached chunk, for the keys).
            attn_mask: optional mask over (queries, keys). A boolean mask marks
                the keys that may be attended to; any other mask is added to
                the attention logits. Both attention paths follow this rule.
            past_key_values: optional ``(k, v)`` pair returned by a previous
                call with ``use_cache=True``; each is
                (B, heads, chunks, N, head_dim).
            use_cache: if True, keys/values are appended to the cache along the
                chunk dimension and the updated cache is returned as well.

        Returns:
            The output tokens (B, N, C), or ``(tokens, (k, v))`` when
            ``use_cache`` is True.
        """
        B, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B, N, 3, self.num_heads, self.head_dim)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = qkv.unbind(0)  # each (B, heads, N, head_dim)

        pos_k = pos
        if use_cache:
            # the cache keeps one chunk per call: (B, heads, chunks, N, head_dim)
            k = k.unsqueeze(2)
            v = v.unsqueeze(2)
            if past_key_values is not None:
                past_k, past_v = past_key_values
                k = torch.cat([past_k, k], dim=2)
                v = torch.cat([past_v, v], dim=2)

            new_kv = (k, v)
            # flatten the chunks into one key/value sequence of length chunks * N
            a, b, c, d, e = k.shape
            k = k.reshape(a, b, c * d, e)
            v = v.reshape(a, b, c * d, e)
            if pos_k is not None:
                pos_k = pos_k.repeat(1, c, 1)

        q, k = self.q_norm(q), self.k_norm(k)

        if self.rope is not None:
            q = self.rope(q, pos)
            k = self.rope(k, pos_k)

        if self.fused_attn:
            x = F.scaled_dot_product_attention(
                q,
                k,
                v,
                attn_mask=attn_mask,
                dropout_p=self.attn_drop.p if self.training else 0.0,
            )

        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)

            # Mask
            if attn_mask is not None:
                # with a KV cache there are more keys than queries, so the mask
                # is (N, n_keys) rather than (N, N)
                n_keys = k.shape[-2]
                assert attn_mask.shape[-2:] == (
                    N,
                    n_keys,
                ), f"Expected mask shape [..., {N}, {n_keys}], got {attn_mask.shape}"
                if attn_mask.dtype == torch.bool:
                    # same meaning as in the fused path: False = masked out
                    attn = attn.masked_fill(~attn_mask, float("-inf"))
                else:
                    attn = attn + attn_mask

            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        if use_cache:
            return x, new_kv
        return x


class MemEffAttention(Attention):
    """Attention variant meant to use xFormers' memory-efficient kernel."""

    def forward(self, x: Tensor, attn_bias=None, pos=None) -> Tensor:
        """Apply attention to ``x`` of shape (B, N, C).

        Without xFormers (always the case while ``XFORMERS_AVAILABLE`` is
        False) this falls back to ``Attention.forward(x)`` and rejects any
        ``attn_bias``.

        Raises:
            AssertionError: if ``pos`` is given, or if ``attn_bias`` is given
                while xFormers is unavailable.
        """
        assert pos is None
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            return super().forward(x)

        # NOTE(review): ``unbind`` and ``memory_efficient_attention`` are not
        # defined or imported in this module; this path only works if
        # they are provided together with XFORMERS_AVAILABLE = True.
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)

        q, k, v = unbind(qkv, 2)

        x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        x = x.reshape([B, N, C])

        x = self.proj(x)
        x = self.proj_drop(x)

        return x
XFORMERS_AVAILABLE = False


class Block(nn.Module):
    """Pre-norm transformer block: attention and FFN, each in a residual branch.

    Each branch is ``x + drop_path(layer_scale(f(norm(x))))``. LayerScale is
    only instantiated when ``init_values`` is truthy, DropPath only when
    ``drop_path > 0``.

    Args:
        dim: Token channel width.
        num_heads: Number of attention heads.
        mlp_ratio: Hidden width of the FFN as a multiple of ``dim``.
        qkv_bias, proj_bias, ffn_bias: Bias switches for the linear layers.
        drop: Dropout used for the attention projection and inside the FFN.
        attn_drop: Dropout on the attention weights.
        init_values: LayerScale initial value; falsy disables LayerScale.
        drop_path: Stochastic-depth rate for both residual branches.
        act_layer, norm_layer, attn_class, ffn_layer: Layer factories.
        qk_norm, fused_attn, rope: Forwarded to ``attn_class``.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        qk_norm: bool = False,
        fused_attn: bool = True,  # use F.scaled_dot_product_attention or not
        rope=None,
    ) -> None:
        super().__init__()

        self.norm1 = norm_layer(dim)

        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            qk_norm=qk_norm,
            fused_attn=fused_attn,
            rope=rope,
        )

        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(
        self, x: Tensor, pos=None, attn_mask=None, past_key_values=None, use_cache=False
    ) -> Union[Tensor, Tuple[Tensor, Tuple[Tensor, Tensor]]]:
        """Apply the block.

        Args:
            x: Input tokens.
            pos: Optional positions forwarded to the attention layer.
            attn_mask: Optional attention mask (not used on the cached path).
            past_key_values: Optional KV cache forwarded to the attention layer.
            use_cache: Return ``(x, new_kv)`` instead of ``x``; stochastic depth
                is not applied on this path.

        Returns:
            The updated tokens, plus the new KV cache when ``use_cache`` is set.
        """

        def attn_residual_func(
            x: Tensor, pos=None, attn_mask=None, past_key_values=None, use_cache=False
        ) -> Union[Tensor, Tuple[Tensor, Tuple[Tensor, Tensor]]]:
            if use_cache:
                output, new_kv = self.attn(
                    self.norm1(x), pos=pos, past_key_values=past_key_values, use_cache=True
                )
                return self.ls1(output), new_kv
            # Only pass `attn_mask` when one is given, so attention classes
            # whose forward() lacks that parameter keep working.
            if attn_mask is not None:
                return self.ls1(self.attn(self.norm1(x), pos=pos, attn_mask=attn_mask))
            return self.ls1(self.attn(self.norm1(x), pos=pos))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if use_cache:
            attn_output, new_kv = attn_residual_func(
                x, pos=pos, past_key_values=past_key_values, use_cache=True
            )
            x = x + attn_output
            x = x + ffn_residual_func(x)
            return x, new_kv

        if self.training and self.sample_drop_ratio > 0.1 and attn_mask is None:
            # the overhead is compensated only for a drop path rate larger than 0.1
            # The batch-subset helper has no way to carry an attention mask, so
            # masked inputs use the per-sample DropPath branch below instead of
            # having their mask silently ignored.
            x = drop_add_residual_stochastic_depth(
                x,
                pos=pos,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x, pos=pos, attn_mask=attn_mask))
            x = x + self.drop_path2(ffn_residual_func(x))
        else:
            x = x + attn_residual_func(x, pos=pos, attn_mask=attn_mask)
            x = x + ffn_residual_func(x)
        return x
def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
    pos=None,
) -> Tensor:
    """Add a residual computed on a random subset of the batch.

    A random subset of ``max(int(b * (1 - sample_drop_ratio)), 1)`` samples is
    run through ``residual_func``; the result is scaled by ``b / subset_size``
    and added back at the selected batch indices.

    Args:
        x: Batched input; the first dimension is the batch.
        residual_func: Called as ``residual_func(x_subset)`` or, when ``pos`` is
            given, ``residual_func(x_subset, pos=pos_subset)``.
        sample_drop_ratio: Fraction of samples to skip.
        pos: Optional per-sample tensor indexed with the same subset.

    Returns:
        A tensor shaped like ``x``.
    """
    # 1) pick the subset and its compensation factor (shared with the list variant)
    brange, residual_scale_factor = get_branges_scales(x, sample_drop_ratio=sample_drop_ratio)
    x_subset = x[brange]

    # 2) apply residual_func to get the residual
    if pos is not None:
        # if necessary, apply rope to the subset
        residual = residual_func(x_subset, pos=pos[brange])
    else:
        residual = residual_func(x_subset)

    # 3) add the residual
    return add_residual(x, brange, residual, residual_scale_factor).view_as(x)


def get_branges_scales(x, sample_drop_ratio=0.0):
    """Draw the kept batch indices and the matching residual scale.

    Returns:
        ``(brange, scale)``: a random index tensor on ``x.device`` of length
        ``max(int(b * (1 - sample_drop_ratio)), 1)`` and ``b / len(brange)``.
    """
    b = x.shape[0]
    sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
    residual_scale_factor = b / sample_subset_size
    return brange, residual_scale_factor


def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    """Add ``residual * residual_scale_factor`` into ``x`` at batch rows ``brange``.

    Without ``scaling_vector`` the tensors are flattened from dim 1 and the
    flattened sum is returned. With it, the work is delegated to
    ``scaled_index_add``.
    """
    if scaling_vector is None:
        x_flat = x.flatten(1)
        residual = residual.flatten(1)
        x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
    else:
        # NOTE(review): `scaled_index_add` is not defined or imported in the
        # visible code — presumably an xFormers op; confirm before using
        # this branch.
        x_plus_residual = scaled_index_add(
            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
        )
    return x_plus_residual


# Attention biases keyed by the tuple of (batch_size, seq_len) pairs they were built for.
attn_bias_cache: Dict[Tuple, Any] = {}


def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache
    """
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache:
        # One sequence length entry per sample, in list order.
        seqlens = [x.shape[1] for b, x in zip(batch_sizes, x_list) for _ in range(b)]
        # NOTE(review): `fmha` and `index_select_cat` (below) are not defined or
        # imported in the visible code — presumably from xFormers; confirm.
        attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors
def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> List[Tensor]:
    """List variant of batch-subset stochastic depth.

    A random subset is drawn per tensor, the subsets are concatenated and run
    through ``residual_func(x_cat, attn_bias=attn_bias)`` in one call, and the
    split residuals are added back to their source tensors.

    Returns:
        One output per input tensor, each shaped like its input.
    """
    # 1) generate random set of indices for dropping samples in the batch
    branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [s[0] for s in branges_scales]
    residual_scale_factors = [s[1] for s in branges_scales]

    # 2) get attention bias and index+concat the tensors
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # 3) apply residual_func to get residual, and split the result
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    return [
        add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x)
        for x, brange, residual, residual_scale_factor in zip(
            x_list, branges, residual_list, residual_scale_factors
        )
    ]


class NestedTensorBlock(Block):
    """Block that also accepts a list of tensors processed as one nested batch."""

    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        # Imported here because this module only imports `Attention` at the top.
        from .attention import MemEffAttention

        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            # LayerScale is applied through `scaling_vector` on this path, so
            # the residual functions above skip ls1/ls2.
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        """Dispatch on input type: a tensor uses ``Block.forward``, a list uses
        ``forward_nested`` (which requires ``XFORMERS_AVAILABLE``)."""
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            if not XFORMERS_AVAILABLE:
                raise AssertionError("xFormers is required for using nested tensors")
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    """Zero whole samples of ``x`` with probability ``drop_prob``.

    One Bernoulli draw is made per entry of the first dimension and broadcast
    over the rest; kept samples are divided by ``1 - drop_prob``. The input is
    returned unchanged when not training or when ``drop_prob`` is ``None``
    or ``0``.
    """
    # `None` is DropPath's default and means "no dropping"; without this guard
    # `1 - None` below raises TypeError in training mode.
    if drop_prob is None or drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        random_tensor.div_(keep_prob)
    output = x * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)
class LayerScale(nn.Module):
    """Learnable per-channel scale applied to the last dimension.

    Args:
        dim: Number of channels (length of ``gamma``).
        init_values: Initial value of every entry of ``gamma``.
        inplace: Scale the input in place instead of allocating a new tensor.
    """

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        unit = torch.ones(dim)
        self.gamma = nn.Parameter(init_values * unit)

    def forward(self, x: Tensor) -> Tensor:
        if self.inplace:
            return x.mul_(self.gamma)
        return x * self.gamma


class Mlp(nn.Module):
    """Two-layer feed-forward network: ``fc1 -> act -> drop -> fc2 -> drop``.

    Args:
        in_features: Input width.
        hidden_features: Hidden width; falls back to ``in_features`` when falsy.
        out_features: Output width; falls back to ``in_features`` when falsy.
        act_layer: Factory for the activation module.
        drop: Dropout probability, shared by both dropout applications.
        bias: Whether both linear layers have a bias.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        width_out = out_features if out_features else in_features
        width_hidden = hidden_features if hidden_features else in_features
        self.fc1 = nn.Linear(in_features, width_hidden, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(width_hidden, width_out, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
def make_2tuple(x):
    """Return ``x`` as a 2-tuple: a pair passes through, an int is duplicated."""
    if isinstance(x, tuple):
        assert len(x) == 2
        return x

    assert isinstance(x, int)
    return (x, x)


class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
        flatten_embedding: Return (B,N,D) when true, (B,H',W',D) otherwise.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        image_HW = make_2tuple(img_size)
        patch_HW = make_2tuple(patch_size)
        patch_grid_size = (
            image_HW[0] // patch_HW[0],
            image_HW[1] // patch_HW[1],
        )

        self.img_size = image_HW
        self.patch_size = patch_HW
        self.patches_resolution = patch_grid_size
        self.num_patches = patch_grid_size[0] * patch_grid_size[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.flatten_embedding = flatten_embedding

        # A conv with kernel == stride == patch size embeds each patch independently.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        """Embed an image batch; H and W must be multiples of the patch size."""
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # B C H W
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B HW C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H W C
        return x

    def flops(self) -> float:
        """Estimate multiply-accumulates for one image of the configured size."""
        Ho, Wo = self.patches_resolution
        flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        # `self.norm` is nn.Identity (never None) when no norm layer was given,
        # so only count the norm when a real one is present.
        if not isinstance(self.norm, nn.Identity):
            flops += Ho * Wo * self.embed_dim
        return flops
class PositionGetter:
    """Generates and caches 2D spatial positions for patches in a grid.

    Attributes:
        position_cache: Dictionary storing precomputed position tensors keyed
            by ``(height, width)``.
    """

    def __init__(self):
        """Initializes the position generator with an empty cache."""
        self.position_cache: Dict[Tuple[int, int], torch.Tensor] = {}

    def __call__(self, batch_size: int, height: int, width: int, device: torch.device) -> torch.Tensor:
        """Generates spatial positions for a batch of patches.

        Args:
            batch_size: Number of samples in the batch.
            height: Height of the grid in patches.
            width: Width of the grid in patches.
            device: Target device for the position tensor.

        Returns:
            Tensor of shape (batch_size, height*width, 2) containing y,x coordinates
            for each position in the grid, repeated for each batch item.
        """
        if (height, width) not in self.position_cache:
            y_coords = torch.arange(height, device=device)
            x_coords = torch.arange(width, device=device)
            positions = torch.cartesian_prod(y_coords, x_coords)
            self.position_cache[height, width] = positions

        # The cache is keyed by grid size only, so the entry may live on the
        # device of an earlier call; `.to` is a no-op when it already matches.
        cached_positions = self.position_cache[height, width].to(device)
        return cached_positions.view(1, height * width, 2).expand(batch_size, -1, -1).clone()


class RotaryPositionEmbedding2D(nn.Module):
    """2D Rotary Position Embedding implementation.

    The feature dimension is split in two halves; the first half is rotated
    according to the y coordinate and the second according to the x coordinate.

    Args:
        frequency: Base frequency for the position embeddings. Default: 100.0
        scaling_factor: Stored on the module; not used by the computations here.

    Attributes:
        base_frequency: Base frequency for computing position embeddings.
        scaling_factor: Value passed at construction.
        frequency_cache: Cache of (cos, sin) tables keyed by
            (dim, seq_len, device, dtype).
    """

    def __init__(self, frequency: float = 100.0, scaling_factor: float = 1.0):
        """Initializes the 2D RoPE module."""
        super().__init__()
        self.base_frequency = frequency
        self.scaling_factor = scaling_factor
        self.frequency_cache: Dict[Tuple, Tuple[torch.Tensor, torch.Tensor]] = {}

    def _compute_frequency_components(
        self, dim: int, seq_len: int, device: torch.device, dtype: torch.dtype
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Computes (and caches) the cosine/sine tables for rotary embeddings.

        Args:
            dim: Feature dimension (must be even).
            seq_len: Number of positions to tabulate.
            device: Target device for computations.
            dtype: Data type for the computed tensors.

        Returns:
            Tuple of (cosine, sine) tensors, each of shape (seq_len, dim).
        """
        cache_key = (dim, seq_len, device, dtype)
        if cache_key not in self.frequency_cache:
            # Compute frequency bands
            exponents = torch.arange(0, dim, 2, device=device).float() / dim
            inv_freq = 1.0 / (self.base_frequency**exponents)

            # Generate position-dependent frequencies
            positions = torch.arange(seq_len, device=device, dtype=inv_freq.dtype)
            angles = torch.einsum("i,j->ij", positions, inv_freq)

            # Compute and cache frequency components
            angles = angles.to(dtype)
            # Duplicate so the table lines up with the (first half, second half)
            # layout used by `_rotate_features`.
            angles = torch.cat((angles, angles), dim=-1)
            cos_components = angles.cos().to(dtype)
            sin_components = angles.sin().to(dtype)
            self.frequency_cache[cache_key] = (cos_components, sin_components)

        return self.frequency_cache[cache_key]

    @staticmethod
    def _rotate_features(x: torch.Tensor) -> torch.Tensor:
        """Maps the halves ``(x1, x2)`` of the last dimension to ``(-x2, x1)``.

        Args:
            x: Input tensor to rotate.

        Returns:
            Rotated feature tensor.
        """
        feature_dim = x.shape[-1]
        x1, x2 = x[..., : feature_dim // 2], x[..., feature_dim // 2 :]
        return torch.cat((-x2, x1), dim=-1)

    def _apply_1d_rope(
        self, tokens: torch.Tensor, positions: torch.Tensor, cos_comp: torch.Tensor, sin_comp: torch.Tensor
    ) -> torch.Tensor:
        """Applies 1D rotary position embeddings along one dimension.

        Args:
            tokens: Input token features.
            positions: Position indices.
            cos_comp: Cosine components for rotation.
            sin_comp: Sine components for rotation.

        Returns:
            Tokens with applied rotary position embeddings.
        """
        # Look up each position's row and add a broadcast axis for the heads.
        cos = F.embedding(positions, cos_comp)[:, None, :, :]
        sin = F.embedding(positions, sin_comp)[:, None, :, :]

        # Apply rotation
        return (tokens * cos) + (self._rotate_features(tokens) * sin)

    def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor:
        """Applies 2D rotary position embeddings to input tokens.

        Args:
            tokens: Input tensor of shape (batch_size, n_heads, n_tokens, dim).
                The feature dimension (dim) must be divisible by 4.
            positions: Position tensor of shape (batch_size, n_tokens, 2) containing
                the y and x coordinates for each token.

        Returns:
            Tensor of same shape as input with applied 2D rotary position embeddings.

        Raises:
            AssertionError: If input dimensions are invalid or positions are malformed.
        """
        # Validate inputs. Each spatial half is itself split in two by
        # `_rotate_features`, so the full feature dimension must be a multiple of 4.
        assert tokens.size(-1) % 4 == 0, "Feature dimension must be divisible by 4"
        assert positions.ndim == 3 and positions.shape[-1] == 2, "Positions must have shape (batch_size, n_tokens, 2)"

        # Compute feature dimension for each spatial direction
        feature_dim = tokens.size(-1) // 2

        # Get frequency components
        max_position = int(positions.max()) + 1
        cos_comp, sin_comp = self._compute_frequency_components(feature_dim, max_position, tokens.device, tokens.dtype)

        # Split features for vertical and horizontal processing
        vertical_features, horizontal_features = tokens.chunk(2, dim=-1)

        # Apply RoPE separately for each dimension
        vertical_features = self._apply_1d_rope(vertical_features, positions[..., 0], cos_comp, sin_comp)
        horizontal_features = self._apply_1d_rope(horizontal_features, positions[..., 1], cos_comp, sin_comp)

        # Combine processed features
        return torch.cat((vertical_features, horizontal_features), dim=-1)
class SwiGLUFFN(nn.Module):
    """SwiGLU feed-forward layer: ``w3(silu(a) * b)`` with ``a, b = split(w12(x))``.

    ``act_layer`` and ``drop`` are accepted for signature compatibility with
    other FFN layers and are not used.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        width_out = out_features if out_features else in_features
        width_hidden = hidden_features if hidden_features else in_features
        # A single projection produces both the gate and the value halves.
        self.w12 = nn.Linear(in_features, 2 * width_hidden, bias=bias)
        self.w3 = nn.Linear(width_hidden, width_out, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        gate, value = self.w12(x).chunk(2, dim=-1)
        return self.w3(F.silu(gate) * value)


# xFormers is opted out of by setting the XFORMERS_DISABLED environment variable.
XFORMERS_ENABLED = "XFORMERS_DISABLED" not in os.environ
# The xFormers SwiGLU import is disabled here, so the pure-PyTorch layer is always used.
SwiGLU = SwiGLUFFN
XFORMERS_AVAILABLE = False


class SwiGLUFFNFused(SwiGLU):
    """SwiGLU layer whose hidden width is 2/3 of the request, rounded up to a multiple of 8."""

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        width_out = out_features if out_features else in_features
        requested = hidden_features if hidden_features else in_features
        two_thirds = int(requested * 2 / 3)
        rounded = (two_thirds + 7) // 8 * 8
        super().__init__(
            in_features=in_features,
            hidden_features=rounded,
            out_features=width_out,
            bias=bias,
        )
+import logging +from typing import Sequence, Tuple, Union, Callable + +import torch +import torch.nn as nn +from torch.utils.checkpoint import checkpoint +from torch.nn.init import trunc_normal_ +from . import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block + +logger = logging.getLogger("dinov2") + + +def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = ".".join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer="mlp", + block_chunks=1, + num_register_tokens=0, + interpolate_antialias=False, + interpolate_offset=0.1, + qk_norm=False, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + 
drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + num_register_tokens: (int) number of extra cls tokens (so-called "registers") + interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings + interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + # tricky but makes it work + self.use_checkpoint = False + # + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + self.num_register_tokens = num_register_tokens + self.interpolate_antialias = interpolate_antialias + self.interpolate_offset = interpolate_offset + + self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + assert num_register_tokens >= 0 + self.register_tokens = ( + nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None + ) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + + if ffn_layer == "mlp": + logger.info("using MLP layer as FFN") + ffn_layer = Mlp + elif ffn_layer == "swiglufused" or ffn_layer == 
"swiglu": + logger.info("using SwiGLU layer as FFN") + ffn_layer = SwiGLUFFNFused + elif ffn_layer == "identity": + logger.info("using Identity layer as FFN") + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + qk_norm=qk_norm, + ) + for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize]) + self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + if self.register_tokens is not None: + nn.init.normal_(self.register_tokens, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + M = int(math.sqrt(N)) # Recover the number of patches in each dimension + assert N == M * M + kwargs = {} + if self.interpolate_offset: + # Historical kludge: add a small number 
to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8 + # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors + sx = float(w0 + self.interpolate_offset) / M + sy = float(h0 + self.interpolate_offset) / M + kwargs["scale_factor"] = (sx, sy) + else: + # Simply specify an output size instead of a scale factor + kwargs["size"] = (w0, h0) + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2), + mode="bicubic", + antialias=self.interpolate_antialias, + **kwargs, + ) + assert (w0, h0) == patch_pos_embed.shape[-2:] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x) + + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + if self.register_tokens is not None: + x = torch.cat( + ( + x[:, :1], + self.register_tokens.expand(x.shape[0], -1, -1), + x[:, 1:], + ), + dim=1, + ) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)] + + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint(blk, x, use_reentrant=self.use_reentrant) + else: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append( + { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + ) + return output + + def 
forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint(blk, x, use_reentrant=self.use_reentrant) + else: + x = blk(x) + + x_norm = self.norm(x) + return { + "x_norm_clstoken": x_norm[:, 0], + "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1], + "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :], + "x_prenorm": x, + "masks": masks, + } + + def _get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found" + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. 
def get_intermediate_layers(
    self,
    x: torch.Tensor,
    n: Union[int, Sequence] = 1,  # Layers or n last layers to take
    reshape: bool = False,
    return_class_token: bool = False,
    norm=True,
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
    """Return patch features (and optionally class tokens) of selected blocks.

    Args:
        x: 4-D image batch.
        n: int -> last ``n`` blocks; sequence -> explicit block indices.
        reshape: if True, fold the patch tokens back into a
            (batch, channels, x.shape[2] // patch_size, x.shape[3] // patch_size) map.
        return_class_token: if True, each entry is a ``(patches, cls)`` pair.
        norm: if True, apply ``self.norm`` to every collected output first.

    Returns:
        Tuple with one entry per selected block.
    """
    if self.chunked_blocks:
        collect = self._get_intermediate_layers_chunked
    else:
        collect = self._get_intermediate_layers_not_chunked
    feats = collect(x, n)

    if norm:
        feats = [self.norm(feat) for feat in feats]

    # Token layout is [cls, registers..., patches...].
    first_patch = 1 + self.num_register_tokens
    class_tokens = [feat[:, 0] for feat in feats]
    patch_tokens = [feat[:, first_patch:] for feat in feats]

    if reshape:
        batch, _, w, h = x.shape
        grid_w = w // self.patch_size
        grid_h = h // self.patch_size
        patch_tokens = [
            tok.reshape(batch, grid_w, grid_h, -1).permute(0, 3, 1, 2).contiguous()
            for tok in patch_tokens
        ]

    if return_class_token:
        return tuple(zip(patch_tokens, class_tokens))
    return tuple(patch_tokens)
def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
    """Build the base-sized ``DinoVisionTransformer`` (embed_dim 768, depth 12, 12 heads).

    Blocks use ``MemEffAttention`` as their attention class. Extra keyword
    arguments are forwarded unchanged to the ``DinoVisionTransformer`` constructor.
    """
    attention_block = partial(Block, attn_class=MemEffAttention)
    return DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        block_fn=attention_block,
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
class Aggregator(nn.Module):
    """
    The Aggregator applies alternating-attention over input frames,
    as described in VGGT: Visual Geometry Grounded Transformer.

    Args:
        img_size (int): Image size in pixels.
        patch_size (int): Size of each patch for PatchEmbed.
        embed_dim (int): Dimension of the token embeddings.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
        num_register_tokens (int): Number of register tokens.
        block_fn (nn.Module): The block type used for attention (Block by default).
        qkv_bias (bool): Whether to include bias in QKV projections.
        proj_bias (bool): Whether to include bias in the output projection.
        ffn_bias (bool): Whether to include bias in MLP layers.
        patch_embed (str): Type of patch embed. e.g., "conv" or "dinov2_vitl14_reg".
        aa_order (list[str] | None): The order of alternating attention.
            ``None`` (default) means ["frame", "global"].
        aa_block_size (int): How many blocks to group under each attention type before switching.
            If not necessary, set to 1.
        qk_norm (bool): Whether to apply QK normalization.
        rope_freq (int): Base frequency for rotary embedding. Any value <= 0 disables it.
        init_values (float): Init scale for layer scale.

    Raises:
        ValueError: If ``depth`` is not divisible by ``aa_block_size``.
    """

    def __init__(
        self,
        img_size=518,
        patch_size=14,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4.0,
        num_register_tokens=4,
        block_fn=Block,
        qkv_bias=True,
        proj_bias=True,
        ffn_bias=True,
        patch_embed="dinov2_vitl14_reg",
        aa_order=None,
        aa_block_size=1,
        qk_norm=True,
        rope_freq=100,
        init_values=0.01,
    ):
        super().__init__()

        # Fail fast, before any (potentially large) sub-module is allocated.
        if depth % aa_block_size != 0:
            raise ValueError(f"depth ({depth}) must be divisible by aa_block_size ({aa_block_size})")

        self.__build_patch_embed__(patch_embed, img_size, patch_size, num_register_tokens, embed_dim=embed_dim)

        # Initialize rotary position embedding if frequency > 0
        self.rope = RotaryPositionEmbedding2D(frequency=rope_freq) if rope_freq > 0 else None
        self.position_getter = PositionGetter() if self.rope is not None else None

        def build_blocks():
            # Frame and global stacks share the same configuration.
            return nn.ModuleList(
                [
                    block_fn(
                        dim=embed_dim,
                        num_heads=num_heads,
                        mlp_ratio=mlp_ratio,
                        qkv_bias=qkv_bias,
                        proj_bias=proj_bias,
                        ffn_bias=ffn_bias,
                        init_values=init_values,
                        qk_norm=qk_norm,
                        rope=self.rope,
                    )
                    for _ in range(depth)
                ]
            )

        # Creation order (frame first, then global) is kept so that random
        # initialisation consumes the RNG in the same order as before.
        self.frame_blocks = build_blocks()
        self.global_blocks = build_blocks()

        self.depth = depth
        # A fresh list per instance: the default is never shared between instances.
        self.aa_order = ["frame", "global"] if aa_order is None else list(aa_order)
        self.patch_size = patch_size
        self.aa_block_size = aa_block_size
        self.aa_block_num = self.depth // self.aa_block_size

        # Note: We have two camera tokens, one for the first frame and one for the rest
        # The same applies for register tokens
        self.camera_token = nn.Parameter(torch.randn(1, 2, 1, embed_dim))
        self.register_token = nn.Parameter(torch.randn(1, 2, num_register_tokens, embed_dim))

        # The patch tokens start after the camera and register tokens
        self.patch_start_idx = 1 + num_register_tokens

        # Initialize parameters with small values
        nn.init.normal_(self.camera_token, std=1e-6)
        nn.init.normal_(self.register_token, std=1e-6)

        # Register normalization constants as non-persistent buffers, shaped to
        # broadcast against [B, S, 3, H, W] images.
        for name, value in (
            ("_resnet_mean", _RESNET_MEAN),
            ("_resnet_std", _RESNET_STD),
        ):
            self.register_buffer(
                name,
                torch.tensor(value, dtype=torch.float32).reshape(1, 1, 3, 1, 1),
                persistent=False,
            )

    def __build_patch_embed__(
        self,
        patch_embed,
        img_size,
        patch_size,
        num_register_tokens,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        block_chunks=0,
        init_values=1.0,
        embed_dim=1024,
    ):
        """
        Build the patch embed layer. If 'conv', we use a
        simple PatchEmbed conv layer. Otherwise, we use a vision transformer.

        Raises:
            KeyError: If ``patch_embed`` is neither a "conv" variant nor one of
                the known DINOv2 model names.
        """
        if "conv" in patch_embed:
            self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=3, embed_dim=embed_dim)
        else:
            vit_models = {
                "dinov2_vitl14_reg": vit_large,
                "dinov2_vitb14_reg": vit_base,
                "dinov2_vits14_reg": vit_small,
                "dinov2_vitg2_reg": vit_giant2,
            }

            self.patch_embed = vit_models[patch_embed](
                img_size=img_size,
                patch_size=patch_size,
                num_register_tokens=num_register_tokens,
                interpolate_antialias=interpolate_antialias,
                interpolate_offset=interpolate_offset,
                block_chunks=block_chunks,
                init_values=init_values,
            )

            # Disable gradient updates for mask token
            if hasattr(self.patch_embed, "mask_token"):
                self.patch_embed.mask_token.requires_grad_(False)

    def forward(
        self,
        images: torch.Tensor,
        past_key_values=None,
        use_cache=False,
        past_frame_idx=0,
    ) -> Union[Tuple[List[torch.Tensor], int], Tuple[List[torch.Tensor], int, list]]:
        """
        Args:
            images (torch.Tensor): Input images with shape [B, S, 3, H, W], in range [0, 1].
                B: batch size, S: sequence length, 3: RGB channels, H: height, W: width
            past_key_values (list | None): Per-global-block KV cache, updated in place
                when ``use_cache`` is True. ``None`` starts a fresh cache of
                ``self.depth`` empty slots.
            use_cache (bool): Enable streaming mode with the KV cache. The special
                tokens are then taken for the newest frame only, so one frame per
                call is expected (a warning is logged when S > 1).
            past_frame_idx (int): Forwarded to the global-attention helper.

        Returns:
            (list[torch.Tensor], int) or (list[torch.Tensor], int, list):
                The list of outputs from the attention blocks (frame and global
                features concatenated on the last dim), the patch_start_idx
                indicating where patch tokens begin and, only when ``use_cache``
                is True, the updated ``past_key_values``.

        Raises:
            ValueError: If the images do not have 3 channels or ``aa_order``
                contains an unknown attention type.
        """
        B, S, C_in, H, W = images.shape

        if use_cache and past_key_values is None:
            # Previously this crashed on ``None[0]``; start with an empty cache instead.
            past_key_values = [None] * self.depth

        if use_cache and past_key_values[0] is not None:
            # Number of frames seen so far (read from the cached keys) plus the current one.
            _, _, S_true, _, _ = past_key_values[0][0].shape
            S_true += 1
        else:
            S_true = S

        if use_cache and S > 1:
            logger.warning("Use KV cache expects S=1, got S=%d", S)

        if C_in != 3:
            raise ValueError(f"Expected 3 input channels, got {C_in}")

        # Normalize images and reshape for patch embed
        images = (images - self._resnet_mean.to(images.device)) / self._resnet_std.to(images.device)

        # Reshape to [B*S, C, H, W] for patch embedding
        images = images.reshape(B * S, C_in, H, W)
        patch_tokens = self.patch_embed(images)

        if isinstance(patch_tokens, dict):
            patch_tokens = patch_tokens["x_norm_patchtokens"]

        if use_cache:
            # Build the special tokens for the whole sequence seen so far and
            # keep only the entry that belongs to the newest frame.
            camera_token = slice_expand_and_flatten(self.camera_token, B, S_true)[-1:, :, :]
            register_token = slice_expand_and_flatten(self.register_token, B, S_true)[-1:, :, :]
        else:
            camera_token = slice_expand_and_flatten(self.camera_token, B, S)
            register_token = slice_expand_and_flatten(self.register_token, B, S)

        # Concatenate special tokens with patch tokens
        tokens = torch.cat([camera_token, register_token, patch_tokens], dim=1)

        pos = None
        if self.rope is not None:
            pos = self.position_getter(B * S, H // self.patch_size, W // self.patch_size, device=images.device)

            if self.patch_start_idx > 0:
                # do not use position embedding for special tokens (camera and register tokens)
                # so set pos to 0 for the special tokens
                pos = pos + 1
                pos_special = torch.zeros(
                    B * S, self.patch_start_idx, 2, device=images.device, dtype=pos.dtype
                )
                pos = torch.cat([pos_special, pos], dim=1)

        # P now counts the special tokens as well
        _, P, C = tokens.shape

        frame_idx = 0
        global_idx = 0
        output_list = []

        for _ in range(self.aa_block_num):
            for attn_type in self.aa_order:
                if attn_type == "frame":
                    tokens, frame_idx, frame_intermediates = self._process_frame_attention(
                        tokens, B, S, P, C, frame_idx, pos=pos
                    )
                elif attn_type == "global":
                    if use_cache:
                        tokens, global_idx, global_intermediates, new_kv = self._process_global_attention(
                            tokens,
                            B,
                            S,
                            P,
                            C,
                            global_idx,
                            pos=pos,
                            past_key_values_block=past_key_values[global_idx],
                            use_cache=True,
                            past_frame_idx=past_frame_idx,
                        )
                        # global_idx has already been advanced by the helper.
                        past_key_values[global_idx - 1] = new_kv
                    else:
                        tokens, global_idx, global_intermediates = self._process_global_attention(
                            tokens, B, S, P, C, global_idx, pos=pos
                        )
                else:
                    raise ValueError(f"Unknown attention type: {attn_type}")

            # concat frame and global intermediates, [B x S x P x 2C]
            for frame_out, global_out in zip(frame_intermediates, global_intermediates):
                output_list.append(torch.cat([frame_out, global_out], dim=-1))

        if use_cache:
            return output_list, self.patch_start_idx, past_key_values
        return output_list, self.patch_start_idx

    def _process_frame_attention(self, tokens, B, S, P, C, frame_idx, pos=None):
        """
        Process frame attention blocks. We keep tokens in shape (B*S, P, C).

        Returns:
            (tokens, next frame_idx, list of per-block outputs reshaped to [B, S, P, C]).
        """
        # If needed, reshape tokens or positions:
        if tokens.shape != (B * S, P, C):
            tokens = tokens.reshape(B, S, P, C).reshape(B * S, P, C)

        if pos is not None and pos.shape != (B * S, P, 2):
            pos = pos.reshape(B, S, P, 2).reshape(B * S, P, 2)

        intermediates = []

        # by default, self.aa_block_size=1, which processes one block at a time
        for _ in range(self.aa_block_size):
            tokens = self.frame_blocks[frame_idx](tokens, pos=pos)
            frame_idx += 1
            intermediates.append(tokens.reshape(B, S, P, C))

        return tokens, frame_idx, intermediates

    def _process_global_attention(
        self,
        tokens,
        B,
        S,
        P,
        C,
        global_idx,
        pos=None,
        past_key_values_block=None,
        use_cache=False,
        past_frame_idx=0,
    ) -> Union[Tuple[torch.Tensor, int, List[torch.Tensor]], Tuple[torch.Tensor, int, List[torch.Tensor], Any]]:
        """
        Process global attention blocks. We keep tokens in shape (B, S*P, C).

        Without cache, a frame-causal additive mask is applied: a token may
        attend to tokens of its own frame and of earlier frames only. With
        cache, no mask is used and the block receives ``past_key_values_block``.

        Returns:
            (tokens, next global_idx, per-block outputs reshaped to [B, S, P, C])
            plus, when ``use_cache`` is True, the KV entry returned by the last
            block processed in this call.
        """
        if tokens.shape != (B, S * P, C):
            tokens = tokens.reshape(B, S, P, C).reshape(B, S * P, C)

        if pos is not None and pos.shape != (B, S * P, 2):
            pos = pos.reshape(B, S, P, 2).reshape(B, S * P, 2)

        # The mask depends only on S, P and the token dtype/device, so build it
        # once instead of once per block.
        attn_mask = None
        if not use_cache:
            frame_ids = torch.arange(S * P, device=tokens.device) // P  # [0,0,...,1,1,...,S-1]
            future_frame = frame_ids.unsqueeze(1) < frame_ids.unsqueeze(0)
            attn_mask = future_frame.to(tokens.dtype) * torch.finfo(tokens.dtype).min

        intermediates = []
        block_kv = None

        for _ in range(self.aa_block_size):
            if use_cache:
                tokens, block_kv = self.global_blocks[global_idx](
                    tokens,
                    pos=pos,
                    attn_mask=attn_mask,
                    past_key_values=past_key_values_block,
                    use_cache=True,
                )
            else:
                tokens = self.global_blocks[global_idx](tokens, pos=pos, attn_mask=attn_mask)
            global_idx += 1
            intermediates.append(tokens.reshape(B, S, P, C))

        if use_cache:
            return tokens, global_idx, intermediates, block_kv
        return tokens, global_idx, intermediates
def slice_expand_and_flatten(token_tensor, B, S):
    """
    Lay out special tokens of shape (1, 2, X, C) for B sequences of S frames.

    Index 0 along dim 1 is the token set used for the first frame of every
    sequence; index 1 is the token set shared by the remaining S-1 frames.
    Both are broadcast over the batch, concatenated per sequence into
    (B, S, X, C) and finally flattened.

    Returns:
        torch.Tensor: Processed tokens with shape (B*S, X, C)
    """
    trailing = token_tensor.shape[2:]

    # First-frame tokens => (B, 1, X, C)
    first_frame = token_tensor[:, :1].expand(B, 1, *trailing)
    # Tokens for every later frame => (B, S-1, X, C)
    later_frames = token_tensor[:, 1:].expand(B, S - 1, *trailing)

    per_sequence = torch.cat((first_frame, later_frames), dim=1)
    return per_sequence.reshape(B * S, *trailing)
class StreamVGGT(nn.Module, PyTorchModelHubMixin):
    """Aggregator backbone plus camera, depth, point and track heads.

    Three entry points are provided:

    * ``forward``   - process a whole list of views in one pass.
    * ``inference`` - process frames one at a time with a KV cache.
    * ``frontendT`` / ``extract`` - streaming front-end that returns the
      aggregated tokens of one frame, and a separate step that runs the
      prediction heads on previously collected tokens.
    """

    def __init__(self, img_size=518, patch_size=14, embed_dim=1024):
        super().__init__()

        self.aggregator = Aggregator(img_size=img_size, patch_size=patch_size, embed_dim=embed_dim)
        # The aggregator concatenates frame and global features, hence 2 * embed_dim.
        self.camera_head = CameraHead(dim_in=2 * embed_dim)
        self.point_head = DPTHead(dim_in=2 * embed_dim, output_dim=4, activation="inv_log", conf_activation="expp1")
        self.depth_head = DPTHead(dim_in=2 * embed_dim, output_dim=2, activation="exp", conf_activation="expp1")
        self.track_head = TrackHead(dim_in=2 * embed_dim, patch_size=patch_size)

    def forward(
        self,
        views,
        query_points: torch.Tensor = None,
        history_info: Optional[dict] = None,
        past_key_values=None,
        use_cache=False,
        past_frame_idx=0,
    ):
        """Run the aggregator and all heads on a full sequence of views.

        Args:
            views: list of S dicts, each with an ``"img"`` tensor and optionally
                a ``"valid_mask"`` entry. Images are stacked along a new frame
                axis; 3-D images (no batch dimension) get a batch axis of 1.
            query_points: optional points for the track head; a 2-D tensor gets
                a leading batch axis.
            history_info, past_key_values, use_cache, past_frame_idx: accepted
                for interface compatibility; not used by this method.

        Returns:
            StreamVGGTOutput with ``ress`` (one result dict per view) and ``views``.
        """
        images = torch.stack([view["img"] for view in views], dim=0)  # S B C H W (or S C H W)

        # If without batch dimension, add it. This must happen before the
        # 5-D permute below, otherwise un-batched input can never get here.
        if images.dim() == 4:
            images = images.unsqueeze(1)
        images = images.permute(1, 0, 2, 3, 4)  # B S C H W

        if query_points is not None and query_points.dim() == 2:
            query_points = query_points.unsqueeze(0)

        aggregated_tokens_list, patch_start_idx = self.aggregator(images)
        predictions = {}

        # Heads run with autocast disabled.
        with torch.cuda.amp.autocast(enabled=False):
            if self.camera_head is not None:
                pose_enc_list = self.camera_head(aggregated_tokens_list)
                predictions["pose_enc"] = pose_enc_list[-1]  # pose encoding of the last iteration

            if self.depth_head is not None:
                depth, depth_conf = self.depth_head(
                    aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx
                )
                predictions["depth"] = depth
                predictions["depth_conf"] = depth_conf

            if self.point_head is not None:
                pts3d, pts3d_conf = self.point_head(
                    aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx
                )
                predictions["world_points"] = pts3d
                predictions["world_points_conf"] = pts3d_conf

            if self.track_head is not None and query_points is not None:
                track_list, vis, conf = self.track_head(
                    aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx, query_points=query_points
                )
                predictions["track"] = track_list[-1]  # track of the last iteration
                predictions["vis"] = vis
                predictions["conf"] = conf
            predictions["images"] = images

            S = images.shape[1]
            ress = []
            for s in range(S):
                res = {
                    'pts3d_in_other_view': predictions['world_points'][:, s],  # [B, H, W, 3]
                    'conf': predictions['world_points_conf'][:, s],  # [B, H, W]

                    'depth': predictions['depth'][:, s],  # [B, H, W, 1]
                    'depth_conf': predictions['depth_conf'][:, s],  # [B, H, W]
                    'camera_pose': predictions['pose_enc'][:, s, :],  # [B, 9]

                    **({'valid_mask': views[s]["valid_mask"]}
                       if 'valid_mask' in views[s] else {}),  # [B, H, W]

                    **({'track': predictions['track'][:, s],  # [B, N, 2]
                        'vis': predictions['vis'][:, s],  # [B, N]
                        'track_conf': predictions['conf'][:, s]}
                       if 'track' in predictions else {})
                }
                ress.append(res)
            return StreamVGGTOutput(ress=ress, views=views)  # [S] [B, C, H, W]

    def frontendT(self, frame):
        """Feed one frame through the aggregator in streaming (KV-cache) mode.

        The cache, the frame counter and the image size are stored on the
        module the first time this is called and updated on later calls.

        Args:
            frame: a single image tensor; two leading axes are added so it is
                seen as a batch of one sequence of one frame.
                # assumes frame is (C, H, W) - TODO confirm against callers

        Returns:
            List of detached aggregated token tensors for this frame.
        """
        images = frame[None, None]  # 1,1,C,H,W
        B, _, C, H, W = images.shape
        with torch.no_grad():
            if not hasattr(self, "frontend_past_key_values"):
                self.frontend_images_size = (B, C, H, W)
                self.frontend_past_key_values = [None] * self.aggregator.depth
                self.frontend_kid = 0
            else:
                self.frontend_kid += 1

            aggregated_tokens, _, self.frontend_past_key_values = self.aggregator(
                images,
                past_key_values=self.frontend_past_key_values,
                use_cache=True,
                past_frame_idx=self.frontend_kid,
            )
            aggregated_tokens = [t_.detach() for t_ in aggregated_tokens]

        return aggregated_tokens

    def extract(self, map_tokens, query_points=None):
        """Run the camera, depth and point heads on previously collected tokens.

        ``frontendT`` must have been called at least once, because the image
        size it recorded is used to build the placeholder image tensor the
        dense heads receive.

        Args:
            map_tokens: list of aggregated token tensors; dim 1 of the first
                entry is taken as the number of frames S.
            query_points: accepted for interface compatibility; no tracking
                result is part of the returned dict.

        Returns:
            dict with 'pts3d_in_other_view', 'conf', 'depth', 'depth_conf' and
            'camera_pose'. An entry is None when the matching head is None.
        """
        B, C, H, W = self.frontend_images_size
        S = map_tokens[0].shape[1]
        # Placeholder images live on the same device as the tokens instead of
        # a hard-coded 'cuda', so CPU / other-device runs work too.
        images = torch.zeros((B, S, C, H, W), device=map_tokens[0].device)

        aggregated_tokens = map_tokens
        patch_start_idx = self.aggregator.patch_start_idx

        camera_pose = depth = depth_conf = pts3d = pts3d_conf = None

        with torch.no_grad():
            with torch.cuda.amp.autocast(enabled=False):
                if self.camera_head is not None:
                    camera_pose = self.camera_head(aggregated_tokens)[-1]  # 1,S,9

                if self.depth_head is not None:
                    # depth: 1,S,H,W,1   depth_conf: 1,S,H,W
                    depth, depth_conf = self.depth_head(
                        aggregated_tokens, images=images, patch_start_idx=patch_start_idx
                    )

                if self.point_head is not None:
                    # pts3d: 1,S,H,W,3   pts3d_conf: 1,S,H,W
                    pts3d, pts3d_conf = self.point_head(
                        aggregated_tokens, images=images, patch_start_idx=patch_start_idx
                    )

        return {
            'pts3d_in_other_view': pts3d,
            'conf': pts3d_conf,
            'depth': depth,
            'depth_conf': depth_conf,
            'camera_pose': camera_pose,
        }

    def inference(self, frames, query_points: torch.Tensor = None, past_key_values=None):
        """Process frames one by one, carrying KV caches between frames.

        Args:
            frames: iterable of dicts with an ``"img"`` tensor (a leading axis
                is added before it is passed to the aggregator) and optionally
                a ``"valid_mask"`` entry.
            query_points: optional points for the track head. After each frame
                they are replaced by that frame's predicted track.
            past_key_values: ignored; a fresh cache is always created.

        Returns:
            StreamVGGTOutput with one result dict per frame and the frames
            themselves as ``views``.
        """
        past_key_values = [None] * self.aggregator.depth
        past_key_values_camera = [None] * self.camera_head.trunk_depth

        all_ress = []
        processed_frames = []

        for i, frame in enumerate(frames):
            images = frame["img"].unsqueeze(0)
            aggregator_output = self.aggregator(
                images,
                past_key_values=past_key_values,
                use_cache=True,
                past_frame_idx=i,
            )

            if isinstance(aggregator_output, tuple) and len(aggregator_output) == 3:
                aggregated_tokens, patch_start_idx, past_key_values = aggregator_output
            else:
                aggregated_tokens, patch_start_idx = aggregator_output

            camera_pose = depth = depth_conf = pts3d = pts3d_conf = None
            # Filled only when the track head actually ran, so a missing track
            # head can no longer cause a NameError below.
            track_result = {}

            with torch.cuda.amp.autocast(enabled=False):
                if self.camera_head is not None:
                    pose_enc, past_key_values_camera = self.camera_head(
                        aggregated_tokens, past_key_values_camera=past_key_values_camera, use_cache=True
                    )
                    camera_pose = pose_enc[-1][:, 0, :]

                if self.depth_head is not None:
                    depth, depth_conf = self.depth_head(
                        aggregated_tokens, images=images, patch_start_idx=patch_start_idx
                    )
                    depth = depth[:, 0]
                    depth_conf = depth_conf[:, 0]

                if self.point_head is not None:
                    pts3d, pts3d_conf = self.point_head(
                        aggregated_tokens, images=images, patch_start_idx=patch_start_idx
                    )
                    pts3d = pts3d[:, 0]
                    pts3d_conf = pts3d_conf[:, 0]

                if self.track_head is not None and query_points is not None:
                    track_list, vis, conf = self.track_head(
                        aggregated_tokens, images=images, patch_start_idx=patch_start_idx, query_points=query_points
                    )
                    track = track_list[-1][:, 0]
                    # The predicted track becomes the query for the next frame.
                    query_points = track
                    track_result = {
                        'track': track,
                        'vis': vis[:, 0],
                        'track_conf': conf[:, 0],
                    }

            all_ress.append({
                'pts3d_in_other_view': pts3d,
                'conf': pts3d_conf,
                'depth': depth,
                'depth_conf': depth_conf,
                'camera_pose': camera_pose,
                **({'valid_mask': frame["valid_mask"]}
                   if 'valid_mask' in frame else {}),
                **track_result,
            })
            processed_frames.append(frame)

        return StreamVGGTOutput(ress=all_ress, views=processed_frames)
def load_and_preprocess_images(image_path_list, mode="crop", *, target_size=518, patch_size=14):
    """
    A quick start function to load and preprocess images for model input.
    This assumes the images should have the same shape for easier batching, but our model can also work well with different shapes.

    Args:
        image_path_list (list): List of paths to image files
        mode (str, optional): Preprocessing mode, either "crop" or "pad".
            - "crop" (default): Sets width to ``target_size`` and center crops height if needed.
            - "pad": Preserves all pixels by making the largest dimension ``target_size``
              and padding the smaller dimension to reach a square shape.
        target_size (int, keyword-only): Target side length in pixels (default 518).
            Should be a multiple of ``patch_size``.
        patch_size (int, keyword-only): Resized sides are rounded to a multiple of
            this value (default 14), and never below one patch.

    Returns:
        torch.Tensor: Batched tensor of preprocessed images with shape (N, 3, H, W)

    Raises:
        ValueError: If the input list is empty or if mode is invalid

    Notes:
        - Images with different dimensions will be padded with white (value=1.0)
        - A warning is printed when images have different shapes
        - Transparent pixels of RGBA images are blended onto a white background
        - Every image file is closed as soon as it has been decoded and resized
    """
    # Check for empty list
    if len(image_path_list) == 0:
        raise ValueError("At least 1 image is required")

    # Validate mode
    if mode not in ["crop", "pad"]:
        raise ValueError("Mode must be either 'crop' or 'pad'")

    def snap_to_patch(length):
        # Nearest multiple of patch_size, but at least one patch so that an
        # extreme aspect ratio can never produce a zero-sized side.
        return max(patch_size, round(length / patch_size) * patch_size)

    def pad_center(img, out_h, out_w):
        # Center the (C, H, W) tensor on a white canvas of size out_h x out_w.
        h_padding = out_h - img.shape[1]
        w_padding = out_w - img.shape[2]
        if h_padding <= 0 and w_padding <= 0:
            return img
        pad_top = h_padding // 2
        pad_bottom = h_padding - pad_top
        pad_left = w_padding // 2
        pad_right = w_padding - pad_left
        return torch.nn.functional.pad(
            img, (pad_left, pad_right, pad_top, pad_bottom), mode="constant", value=1.0
        )

    images = []
    shapes = set()
    to_tensor = TF.ToTensor()

    # First process all images and collect their shapes
    for image_path in image_path_list:
        # The context manager closes the underlying file; every PIL operation
        # below returns a new in-memory image, so nothing refers to the file
        # once the block is left.
        with Image.open(image_path) as raw:
            if raw.mode == "RGBA":
                # Alpha composite onto a white background
                background = Image.new("RGBA", raw.size, (255, 255, 255, 255))
                rgb = Image.alpha_composite(background, raw).convert("RGB")
            else:
                rgb = raw.convert("RGB")

            width, height = rgb.size

            if mode == "pad" and height > width:
                # Portrait image in pad mode: the height is the largest dimension
                new_height = target_size
                new_width = snap_to_patch(width * (new_height / height))
            else:
                # crop mode, or landscape/square image in pad mode: fix the width
                new_width = target_size
                new_height = snap_to_patch(height * (new_width / width))

            # Resize with new dimensions (width, height)
            resized = rgb.resize((new_width, new_height), Image.Resampling.BICUBIC)

        img = to_tensor(resized)  # Convert to tensor (0, 1)

        # Center crop height if it's larger than target_size (only in crop mode)
        if mode == "crop" and new_height > target_size:
            start_y = (new_height - target_size) // 2
            img = img[:, start_y: start_y + target_size, :]

        # For pad mode, pad to make a square of target_size x target_size
        if mode == "pad":
            img = pad_center(img, target_size, target_size)

        shapes.add((img.shape[1], img.shape[2]))
        images.append(img)

    # Check if we have different shapes
    # In theory our model can also work well with different shapes
    if len(shapes) > 1:
        print(f"Warning: Found images with different shapes: {shapes}")
        # Pad every image to the maximum dimensions
        max_height = max(shape[0] for shape in shapes)
        max_width = max(shape[1] for shape in shapes)
        images = [pad_center(img, max_height, max_width) for img in images]

    # torch.stack always adds the batch dimension, so a single image already
    # comes out as (1, C, H, W).
    return torch.stack(images)
def pose_encoding_to_extri_intri(
    pose_encoding,
    image_size_hw=None,  # e.g., (256, 512)
    pose_encoding_type="absT_quaR_FoV",
    build_intrinsics=True,
):
    """Convert a pose encoding back to camera extrinsics and intrinsics.

    This function performs the inverse operation of extri_intri_to_pose_encoding,
    reconstructing the full camera parameters from the compact encoding.

    Args:
        pose_encoding (torch.Tensor): Encoded camera pose parameters with shape BxSx9,
            where B is batch size and S is sequence length (any number of leading
            dimensions is accepted).
            For "absT_quaR_FoV" type, the 9 dimensions are:
            - [:3] = absolute translation vector T (3D)
            - [3:7] = rotation as quaternion quat (4D)
            - [7:] = field of view (2D), height first, then width
        image_size_hw (tuple): Tuple of (height, width) of the image in pixels.
            Required when ``build_intrinsics`` is True. For example: (256, 512).
        pose_encoding_type (str): Type of pose encoding used. Currently only
            supports "absT_quaR_FoV" (absolute translation, quaternion rotation, field of view).
        build_intrinsics (bool): Whether to reconstruct the intrinsics matrix.
            If False, only extrinsics are returned and intrinsics will be None.

    Returns:
        tuple: (extrinsics, intrinsics)
            - extrinsics (torch.Tensor): Camera extrinsic parameters with shape BxSx3x4,
              in the format [R|t] where R is a 3x3 rotation matrix and t is
              a 3x1 translation vector.
            - intrinsics (torch.Tensor or None): Camera intrinsic parameters with shape BxSx3x3,
              or None if build_intrinsics is False. Defined in pixels, with format:
                [[fx, 0, cx],
                 [0, fy, cy],
                 [0,  0,  1]]
              where fx, fy are focal lengths and (cx, cy) is the principal point,
              assumed to be at the center of the image (W/2, H/2).

    Raises:
        NotImplementedError: If ``pose_encoding_type`` is not supported.
        ValueError: If ``build_intrinsics`` is True but ``image_size_hw`` is None.
    """
    if pose_encoding_type != "absT_quaR_FoV":
        raise NotImplementedError(f"Unsupported pose_encoding_type: {pose_encoding_type!r}")

    # Report the missing argument explicitly instead of failing later with an
    # opaque "cannot unpack NoneType" error.
    if build_intrinsics and image_size_hw is None:
        raise ValueError("image_size_hw is required when build_intrinsics=True")

    T = pose_encoding[..., :3]
    quat = pose_encoding[..., 3:7]
    fov_h = pose_encoding[..., 7]
    fov_w = pose_encoding[..., 8]

    R = quat_to_mat(quat)
    extrinsics = torch.cat([R, T[..., None]], dim=-1)

    intrinsics = None
    if build_intrinsics:
        H, W = image_size_hw
        # Focal length from field of view: f = (size / 2) / tan(fov / 2)
        fy = (H / 2.0) / torch.tan(fov_h / 2.0)
        fx = (W / 2.0) / torch.tan(fov_w / 2.0)
        # Use every leading dimension (not just the first two) so the result
        # matches the "..." indexing used for the other quantities.
        intrinsics = torch.zeros(pose_encoding.shape[:-1] + (3, 3), device=pose_encoding.device)
        intrinsics[..., 0, 0] = fx
        intrinsics[..., 1, 1] = fy
        intrinsics[..., 0, 2] = W / 2
        intrinsics[..., 1, 2] = H / 2
        intrinsics[..., 2, 2] = 1.0  # Set the homogeneous coordinate to 1

    return extrinsics, intrinsics
def quat_to_mat(quaternions: torch.Tensor) -> torch.Tensor:
    """
    Convert scalar-last (XYZW / ijkr) quaternions to rotation matrices.

    Args:
        quaternions: tensor of shape (..., 4) with the real part last.

    Returns:
        Rotation matrices as tensor of shape (..., 3, 3).
    """
    qi, qj, qk, qr = torch.unbind(quaternions, -1)
    # Dividing by the squared norm makes the result valid for non-unit input.
    two_s = 2.0 / (quaternions * quaternions).sum(-1)

    entries = (
        1 - two_s * (qj * qj + qk * qk),
        two_s * (qi * qj - qk * qr),
        two_s * (qi * qk + qj * qr),
        two_s * (qi * qj + qk * qr),
        1 - two_s * (qi * qi + qk * qk),
        two_s * (qj * qk - qi * qr),
        two_s * (qi * qk - qj * qr),
        two_s * (qj * qk + qi * qr),
        1 - two_s * (qi * qi + qj * qj),
    )
    flat = torch.stack(entries, -1)
    return flat.reshape(quaternions.shape[:-1] + (3, 3))


def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
    """
    Convert rotation matrices to scalar-last (XYZW / ijkr) quaternions.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).

    Returns:
        Quaternions with real part last, as tensor of shape (..., 4),
        standardized so that the real part is non-negative.

    Raises:
        ValueError: If the last two dimensions of ``matrix`` are not 3x3.
    """
    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")

    batch_dim = matrix.shape[:-2]
    flat = matrix.reshape(batch_dim + (9,))
    m00, m01, m02, m10, m11, m12, m20, m21, m22 = torch.unbind(flat, dim=-1)

    # Magnitudes of 2*r, 2*i, 2*j, 2*k derived from the diagonal.
    diag_terms = [
        1.0 + m00 + m11 + m22,
        1.0 + m00 - m11 - m22,
        1.0 - m00 + m11 - m22,
        1.0 - m00 - m11 + m22,
    ]
    q_abs = _sqrt_positive_part(torch.stack(diag_terms, dim=-1))

    # Row n holds the target quaternion (in r, i, j, k order) scaled by its
    # n-th component.
    row_r = torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1)
    row_i = torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1)
    row_j = torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1)
    row_k = torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1)
    quat_by_rijk = torch.stack([row_r, row_i, row_j, row_k], dim=-2)

    # The 0.1 floor only guards the division; rows with a small q_abs are
    # never the ones selected below.
    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    denominators = 2.0 * q_abs[..., None].max(flr)
    quat_candidates = quat_by_rijk / denominators

    # All candidates agree up to sign in exact arithmetic; keep the one with
    # the largest denominator (best conditioned).
    best = F.one_hot(q_abs.argmax(dim=-1), num_classes=4) > 0.5
    picked = quat_candidates[best, :].reshape(batch_dim + (4,))

    # Reorder from rijk to ijkr (scalar-last).
    picked = picked[..., [1, 2, 3, 0]]

    return standardize_quaternion(picked)


def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
    """
    Returns torch.sqrt(torch.max(0, x))
    but with a zero subgradient where x is 0.
    """
    positive_mask = x > 0
    if not torch.is_grad_enabled():
        return torch.where(positive_mask, torch.sqrt(x), torch.zeros_like(x))
    # With autograd on, only the strictly positive entries pass through sqrt.
    result = torch.zeros_like(x)
    result[positive_mask] = torch.sqrt(x[positive_mask])
    return result


def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
    """
    Flip the sign of quaternions whose real (last) component is negative.

    Args:
        quaternions: Quaternions with real part last,
            as tensor of shape (..., 4).

    Returns:
        Standardized quaternions as tensor of shape (..., 4).
    """
    real_is_negative = quaternions[..., 3:4] < 0
    return torch.where(real_is_negative, -quaternions, quaternions)
def _resolve_colormap(cmap_name):
    """Return the Matplotlib colormap for ``cmap_name`` across Matplotlib versions."""
    import matplotlib

    registry = getattr(matplotlib, "colormaps", None)
    if registry is None or not isinstance(cmap_name, str):
        # Matplotlib without the colormap registry, or a non-string argument:
        # defer to the legacy lookup used by the original code.
        import matplotlib.cm

        return matplotlib.cm.get_cmap(cmap_name)
    try:
        return registry[cmap_name]
    except KeyError as err:
        # The legacy get_cmap raised ValueError for unknown names; keep that.
        raise ValueError(f"Unknown colormap name: {cmap_name!r}") from err


def color_from_xy(x, y, W, H, cmap_name="hsv"):
    """
    Map (x, y) -> color in (R, G, B).
    1) Normalize x,y to [0,1] using the image width W and height H.
    2) Combine them into a single scalar c = (x_norm + y_norm) / 2.
    3) Use matplotlib's colormap to convert c -> (R,G,B).

    Returns:
        Tuple (r, g, b) with the values produced by the colormap, RGB order.
    """
    x_norm = x / max(W - 1, 1)
    y_norm = y / max(H - 1, 1)
    c = (x_norm + y_norm) / 2.0

    # matplotlib.cm.get_cmap is deprecated in recent Matplotlib; the helper
    # uses the matplotlib.colormaps registry when it exists.
    cmap = _resolve_colormap(cmap_name)
    rgba = cmap(c)  # (r, g, b, a)
    return (rgba[0], rgba[1], rgba[2])


def get_track_colors_by_position(tracks_b, vis_mask_b=None, image_width=None, image_height=None, cmap_name="hsv"):
    """
    Compute one RGB color per track from its (x, y) position in the first
    frame where the track is visible.

    Args:
        tracks_b: Tensor of shape (S, N, 2). (x,y) for each track in each frame.
        vis_mask_b: (S, N) boolean mask; if None, assume all are visible.
        image_width, image_height: used for normalizing (x, y).
        cmap_name: for matplotlib (e.g., 'hsv', 'rainbow', 'jet').

    Returns:
        track_colors: np.ndarray of shape (N, 3), dtype uint8, each row is
        (R,G,B). Tracks that are never visible stay black.
    """
    S, N, _ = tracks_b.shape
    track_colors = np.zeros((N, 3), dtype=np.uint8)

    if vis_mask_b is None:
        vis_mask_b = torch.ones(S, N, dtype=torch.bool, device=tracks_b.device)

    for i in range(N):
        visible_frames = torch.where(vis_mask_b[:, i])[0]
        if len(visible_frames) == 0:
            continue  # never visible: row is already (0, 0, 0)

        first_s = int(visible_frames[0].item())
        x, y = tracks_b[first_s, i].tolist()

        r, g, b = color_from_xy(x, y, W=image_width, H=image_height, cmap_name=cmap_name)
        track_colors[i] = (int(r * 255), int(g * 255), int(b * 255))

    return track_colors


def visualize_tracks_on_images(
    images,
    tracks,
    track_vis_mask=None,
    out_dir="track_visuals_concat_by_xy",
    image_format="CHW",  # "CHW" or "HWC"
    normalize_mode="[0,1]",
    cmap_name="hsv",  # e.g. "hsv", "rainbow", "jet"
    frames_per_row=4,  # New parameter for grid layout
    save_grid=True,  # Flag to control whether to save the grid image
):
    """
    Draw tracks on every frame, save each frame as a PNG, and optionally save
    all frames tiled into one grid image.

    Each track's color is determined by its (x,y) position in the first
    visible frame.

    Args:
        images: torch.Tensor (S, 3, H, W) if CHW or (S, H, W, 3) if HWC.
            A leading batch dimension of size 1 is squeezed when ``tracks`` is 4-D.
        tracks: torch.Tensor (S, N, 2), last dim = (x, y).
        track_vis_mask: torch.Tensor (S, N) or None.
        out_dir: folder to save visualizations (created if missing).
        image_format: "CHW" or "HWC".
        normalize_mode: "[0,1]", "[-1,1]", or None for direct raw -> 0..255
        cmap_name: a matplotlib colormap name for color_from_xy.
        frames_per_row: number of frames to display in each row of the grid.
        save_grid: whether to save all frames in one grid image.

    Returns:
        None (saves ``frame_XXXX.png`` and, optionally, ``tracks_grid.png`` in out_dir).

    Raises:
        ValueError: If ``save_grid`` is True and ``frames_per_row`` is not positive.
    """
    if save_grid and frames_per_row < 1:
        raise ValueError(f"frames_per_row must be a positive integer, got {frames_per_row}")

    if len(tracks.shape) == 4:
        tracks = tracks.squeeze(0)
        images = images.squeeze(0)
        if track_vis_mask is not None:
            track_vis_mask = track_vis_mask.squeeze(0)

    import matplotlib

    matplotlib.use("Agg")  # for non-interactive (optional)

    os.makedirs(out_dir, exist_ok=True)

    S = images.shape[0]
    _, N, _ = tracks.shape  # (S, N, 2)

    # Work on CPU copies so the caller's tensors are never touched.
    images = images.cpu().clone()
    tracks = tracks.cpu().clone()
    if track_vis_mask is not None:
        track_vis_mask = track_vis_mask.cpu().clone()

    if image_format == "CHW":
        H, W = images.shape[2], images.shape[3]
    else:
        H, W = images.shape[1], images.shape[2]

    # One color per track, fixed across frames.
    track_colors_rgb = get_track_colors_by_position(
        tracks,
        vis_mask_b=track_vis_mask,
        image_width=W,
        image_height=H,
        cmap_name=cmap_name,
    )

    frame_images = []  # drawn frames in RGB order, used for the grid

    for s in range(S):
        img = images[s]
        if image_format == "CHW":
            img = img.permute(1, 2, 0)  # -> (H, W, 3)

        img = img.numpy().astype(np.float32)

        if normalize_mode == "[0,1]":
            img = np.clip(img, 0, 1) * 255.0
        elif normalize_mode == "[-1,1]":
            img = (img + 1.0) * 0.5 * 255.0
            img = np.clip(img, 0, 255.0)
        # else: values are used as-is

        img = img.astype(np.uint8)

        # OpenCV draws and writes in BGR order.
        img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        if track_vis_mask is not None:
            valid_indices = torch.where(track_vis_mask[s])[0].tolist()
        else:
            valid_indices = range(N)

        cur_tracks_np = tracks[s].numpy()  # (N, 2)
        for i in valid_indices:
            x, y = cur_tracks_np[i]
            if not (np.isfinite(x) and np.isfinite(y)):
                # NaN/inf coordinates cannot be rasterized; skip the point
                # instead of crashing in int(round(...)).
                continue
            pt = (int(round(x)), int(round(y)))

            R, G, B = track_colors_rgb[i]
            cv2.circle(img_bgr, pt, radius=3, color=(int(B), int(G), int(R)), thickness=-1)

        frame_path = os.path.join(out_dir, f"frame_{s:04d}.png")
        cv2.imwrite(frame_path, img_bgr)

        # Keep an RGB copy for the grid.
        frame_images.append(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))

    if save_grid and S > 0:
        num_rows = (S + frames_per_row - 1) // frames_per_row  # ceiling division

        rows = []
        for row in range(num_rows):
            start_idx = row * frames_per_row
            end_idx = min(start_idx + frames_per_row, S)

            row_img = np.concatenate(frame_images[start_idx:end_idx], axis=1)

            # Pad a short final row with black so all rows share one width.
            missing = frames_per_row - (end_idx - start_idx)
            if missing > 0:
                padding = np.zeros((H, missing * W, 3), dtype=np.uint8)
                row_img = np.concatenate([row_img, padding], axis=1)

            rows.append(row_img)

        grid_img = np.concatenate(rows, axis=0)

        out_path = os.path.join(out_dir, "tracks_grid.png")
        cv2.imwrite(out_path, cv2.cvtColor(grid_img, cv2.COLOR_RGB2BGR))
        print(f"[INFO] Saved color-by-XY track visualization grid -> {out_path}")

    print(f"[INFO] Saved {S} individual frames to {out_dir}/frame_*.png")
class CameraHead(nn.Module):
    """
    Predicts camera pose encodings from camera tokens by iterative refinement.

    A stack of transformer blocks (the "trunk") is applied to the camera
    tokens, which are modulated by an embedding of the previous prediction.
    """

    def __init__(
        self,
        dim_in: int = 2048,
        trunk_depth: int = 4,
        pose_encoding_type: str = "absT_quaR_FoV",
        num_heads: int = 16,
        mlp_ratio: int = 4,
        init_values: float = 0.01,
        trans_act: str = "linear",
        quat_act: str = "linear",
        fl_act: str = "relu",  # Field of view activations: ensures FOV values are positive.
    ):
        super().__init__()

        if pose_encoding_type != "absT_quaR_FoV":
            raise ValueError(f"Unsupported camera encoding type: {pose_encoding_type}")
        self.target_dim = 9

        self.trans_act = trans_act
        self.quat_act = quat_act
        self.fl_act = fl_act
        self.trunk_depth = trunk_depth

        # Trunk: `trunk_depth` identical transformer blocks applied in sequence.
        trunk_blocks = [
            Block(
                dim=dim_in,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                init_values=init_values,
            )
            for _ in range(trunk_depth)
        ]
        self.trunk = nn.Sequential(*trunk_blocks)

        # LayerNorms applied to the incoming camera tokens and to the trunk output.
        self.token_norm = nn.LayerNorm(dim_in)
        self.trunk_norm = nn.LayerNorm(dim_in)

        # Learnable placeholder pose used as conditioning in the first iteration.
        self.empty_pose_tokens = nn.Parameter(torch.zeros(1, 1, self.target_dim))
        self.embed_pose = nn.Linear(self.target_dim, dim_in)

        # Produces shift, scale and gate (3 * dim_in values) from the pose embedding.
        self.poseLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(dim_in, 3 * dim_in, bias=True))

        # LayerNorm without learnable affine parameters; modulation supplies them.
        self.adaln_norm = nn.LayerNorm(dim_in, elementwise_affine=False, eps=1e-6)
        self.pose_branch = Mlp(
            in_features=dim_in,
            hidden_features=dim_in // 2,
            out_features=self.target_dim,
            drop=0,
        )

    def forward(self, aggregated_tokens_list: list, num_iterations: int = 4) -> list:
        """
        Predict camera encodings from the last entry of ``aggregated_tokens_list``.

        Args:
            aggregated_tokens_list (list): List of token tensors from the network;
                only the last tensor is used.
            num_iterations (int, optional): Number of refinement steps. Defaults to 4.

        Returns:
            list: One activated camera encoding per refinement iteration.
        """
        last_tokens = aggregated_tokens_list[-1]
        # Index 0 along the third axis is taken as the camera token.
        camera_tokens = self.token_norm(last_tokens[:, :, 0])
        return self.trunk_fn(camera_tokens, num_iterations)

    def trunk_fn(self, pose_tokens: torch.Tensor, num_iterations: int) -> list:
        """
        Iteratively refine camera pose predictions.

        Args:
            pose_tokens (torch.Tensor): Normalized camera tokens, 3-D with shape [B, S, C].
            num_iterations (int): Number of refinement iterations.

        Returns:
            list: List of activated camera encodings from each iteration.
        """
        B, S, C = pose_tokens.shape
        running_enc = None
        outputs = []

        for _ in range(num_iterations):
            if running_enc is None:
                # First pass: condition on the learned empty pose.
                conditioning = self.embed_pose(self.empty_pose_tokens.expand(B, S, -1))
            else:
                # Detach so gradients do not flow back through earlier iterations.
                running_enc = running_enc.detach()
                conditioning = self.embed_pose(running_enc)

            shift_msa, scale_msa, gate_msa = self.poseLN_modulation(conditioning).chunk(3, dim=-1)

            # Gated adaptive-LayerNorm modulation with a residual connection.
            hidden = gate_msa * modulate(self.adaln_norm(pose_tokens), shift_msa, scale_msa)
            hidden = hidden + pose_tokens

            hidden = self.trunk(hidden)
            delta = self.pose_branch(self.trunk_norm(hidden))

            # The first iteration sets the encoding; later ones add a delta.
            running_enc = delta if running_enc is None else running_enc + delta

            outputs.append(
                activate_pose(
                    running_enc,
                    trans_act=self.trans_act,
                    quat_act=self.quat_act,
                    fl_act=self.fl_act,
                )
            )

        return outputs


def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    """
    Return ``x * (1 + scale) + shift``.
    """
    # modified from https://github.com/facebookresearch/DiT/blob/796c29e532f47bba17c5b9c5eb39b9354b8b7c64/models.py#L19
    scaled = x * (1 + scale)
    return scaled + shift
class DPTHead(nn.Module):
    """
    DPT Head for dense prediction tasks.

    Follows the architecture of "Vision Transformers for Dense Prediction"
    (https://arxiv.org/abs/2103.13413): token features taken from several
    transformer layers are projected, resized to different scales and fused.

    Args:
        dim_in (int): Input dimension (channels).
        patch_size (int, optional): Patch size. Default is 14.
        output_dim (int, optional): Number of output channels. Default is 4.
        activation (str, optional): Activation type. Default is "inv_log".
        conf_activation (str, optional): Confidence activation type. Default is "expp1".
        features (int, optional): Feature channels for intermediate representations. Default is 256.
        out_channels (List[int], optional): Output channels for each intermediate layer.
        intermediate_layer_idx (List[int], optional): Indices of layers from aggregated tokens used for DPT.
        pos_embed (bool, optional): Whether to use positional embedding. Default is True.
        feature_only (bool, optional): If True, return features only without the last several layers and activation head. Default is False.
        down_ratio (int, optional): Downscaling factor for the output resolution. Default is 1.
    """

    def __init__(
        self,
        dim_in: int,
        patch_size: int = 14,
        output_dim: int = 4,
        activation: str = "inv_log",
        conf_activation: str = "expp1",
        features: int = 256,
        out_channels: List[int] = [256, 512, 1024, 1024],
        intermediate_layer_idx: List[int] = [4, 11, 17, 23],
        pos_embed: bool = True,
        feature_only: bool = False,
        down_ratio: int = 1,
    ) -> None:
        super().__init__()
        self.patch_size = patch_size
        self.activation = activation
        self.conf_activation = conf_activation
        self.pos_embed = pos_embed
        self.feature_only = feature_only
        self.down_ratio = down_ratio
        self.intermediate_layer_idx = intermediate_layer_idx

        self.norm = nn.LayerNorm(dim_in)

        # One 1x1 projection per tapped transformer layer.
        self.projects = nn.ModuleList(
            [
                nn.Conv2d(in_channels=dim_in, out_channels=width, kernel_size=1, stride=1, padding=0)
                for width in out_channels
            ]
        )

        # Per-level resampling: x4 up, x2 up, identity, stride-2 down.
        upsample_x4 = nn.ConvTranspose2d(
            in_channels=out_channels[0], out_channels=out_channels[0], kernel_size=4, stride=4, padding=0
        )
        upsample_x2 = nn.ConvTranspose2d(
            in_channels=out_channels[1], out_channels=out_channels[1], kernel_size=2, stride=2, padding=0
        )
        keep = nn.Identity()
        downsample_x2 = nn.Conv2d(
            in_channels=out_channels[3], out_channels=out_channels[3], kernel_size=3, stride=2, padding=1
        )
        self.resize_layers = nn.ModuleList([upsample_x4, upsample_x2, keep, downsample_x2])

        self.scratch = _make_scratch(out_channels, features, expand=False)

        # Fusion blocks and output convolutions live on the scratch container.
        self.scratch.stem_transpose = None
        self.scratch.refinenet1 = _make_fusion_block(features)
        self.scratch.refinenet2 = _make_fusion_block(features)
        self.scratch.refinenet3 = _make_fusion_block(features)
        self.scratch.refinenet4 = _make_fusion_block(features, has_residual=False)

        head_features_1 = features
        head_features_2 = 32

        if feature_only:
            self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1, kernel_size=3, stride=1, padding=1)
        else:
            half_features = head_features_1 // 2
            self.scratch.output_conv1 = nn.Conv2d(head_features_1, half_features, kernel_size=3, stride=1, padding=1)
            self.scratch.output_conv2 = nn.Sequential(
                nn.Conv2d(half_features, head_features_2, kernel_size=3, stride=1, padding=1),
                nn.ReLU(inplace=True),
                nn.Conv2d(head_features_2, output_dim, kernel_size=1, stride=1, padding=0),
            )

    def forward(
        self,
        aggregated_tokens_list: List[torch.Tensor],
        images: torch.Tensor,
        patch_start_idx: int,
        frames_chunk_size: int = 8,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Run the DPT head, optionally processing the frame axis in chunks.

        Args:
            aggregated_tokens_list (List[Tensor]): List of token tensors from different transformer layers.
            images (Tensor): Input images with shape [B, S, 3, H, W].
            patch_start_idx (int): Index of the first patch token in the token sequence;
                tokens before it are dropped.
            frames_chunk_size (int, optional): Number of frames to process in each chunk.
                If None or not smaller than S, all frames are processed at once. Default: 8.

        Returns:
            Tensor or Tuple[Tensor, Tensor]:
                - If feature_only=True: feature maps with a leading [B, S] shape.
                - Otherwise: tuple of (predictions, confidence) as returned by
                  ``activate_head``, each with a leading [B, S] shape.
        """
        B, S, _, H, W = images.shape

        if frames_chunk_size is None or frames_chunk_size >= S:
            return self._forward_impl(aggregated_tokens_list, images, patch_start_idx)

        assert frames_chunk_size > 0

        # One _forward_impl call per window of frames, to bound memory use.
        chunk_outputs = [
            self._forward_impl(
                aggregated_tokens_list,
                images,
                patch_start_idx,
                start,
                min(start + frames_chunk_size, S),
            )
            for start in range(0, S, frames_chunk_size)
        ]

        # Stitch the chunks back together along the frame axis.
        if self.feature_only:
            return torch.cat(chunk_outputs, dim=1)
        all_preds = [preds for preds, _ in chunk_outputs]
        all_conf = [conf for _, conf in chunk_outputs]
        return torch.cat(all_preds, dim=1), torch.cat(all_conf, dim=1)

    def _forward_impl(
        self,
        aggregated_tokens_list: List[torch.Tensor],
        images: torch.Tensor,
        patch_start_idx: int,
        frames_start_idx: int = None,
        frames_end_idx: int = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """
        Process all frames, or the slice ``[frames_start_idx, frames_end_idx)``.

        Args:
            aggregated_tokens_list (List[Tensor]): List of token tensors from different transformer layers.
            images (Tensor): Input images with shape [B, S, 3, H, W].
            patch_start_idx (int): Starting index for patch tokens.
            frames_start_idx (int, optional): Starting index for frames to process.
            frames_end_idx (int, optional): Ending index for frames to process.

        Returns:
            Tensor or Tuple[Tensor, Tensor]: Feature maps or (predictions, confidence).
        """
        use_slice = frames_start_idx is not None and frames_end_idx is not None
        if use_slice:
            images = images[:, frames_start_idx:frames_end_idx].contiguous()

        B, S, _, H, W = images.shape
        patch_h = H // self.patch_size
        patch_w = W // self.patch_size

        pyramid = []
        for level, layer_idx in enumerate(self.intermediate_layer_idx):
            tokens = aggregated_tokens_list[layer_idx][:, :, patch_start_idx:]
            if use_slice:
                tokens = tokens[:, frames_start_idx:frames_end_idx]

            # Merge batch and frame axes, normalize, then lay tokens out as a 2D map.
            tokens = tokens.reshape(B * S, -1, tokens.shape[-1])
            tokens = self.norm(tokens)
            num_maps, channels = tokens.shape[0], tokens.shape[-1]
            fmap = tokens.permute(0, 2, 1).reshape((num_maps, channels, patch_h, patch_w))

            fmap = self.projects[level](fmap)
            if self.pos_embed:
                fmap = self._apply_pos_embed(fmap, W, H)
            fmap = self.resize_layers[level](fmap)

            pyramid.append(fmap)

        fused = self.scratch_forward(pyramid)

        # Resize the fused map to the (optionally down-scaled) image resolution.
        target_hw = (
            int(patch_h * self.patch_size / self.down_ratio),
            int(patch_w * self.patch_size / self.down_ratio),
        )
        fused = custom_interpolate(fused, target_hw, mode="bilinear", align_corners=True)

        if self.pos_embed:
            fused = self._apply_pos_embed(fused, W, H)

        if self.feature_only:
            return fused.reshape(B, S, *fused.shape[1:])

        fused = self.scratch.output_conv2(fused)
        preds, conf = activate_head(fused, activation=self.activation, conf_activation=self.conf_activation)

        # Restore the separate batch and frame axes.
        preds = preds.reshape(B, S, *preds.shape[1:])
        conf = conf.reshape(B, S, *conf.shape[1:])
        return preds, conf

    def _apply_pos_embed(self, x: torch.Tensor, W: int, H: int, ratio: float = 0.1) -> torch.Tensor:
        """
        Add a UV-grid positional embedding, scaled by ``ratio``, to ``x``.
        """
        grid_h, grid_w = x.shape[-2], x.shape[-1]
        uv_grid = create_uv_grid(grid_w, grid_h, aspect_ratio=W / H, dtype=x.dtype, device=x.device)
        embedding = position_grid_to_embed(uv_grid, x.shape[1]) * ratio
        # Move channels first and share the embedding across the batch axis.
        embedding = embedding.permute(2, 0, 1)[None].expand(x.shape[0], -1, -1, -1)
        return x + embedding

    def scratch_forward(self, features: List[torch.Tensor]) -> torch.Tensor:
        """
        Fuse the four feature maps from coarsest to finest.

        Args:
            features (List[Tensor]): List of feature maps from different layers.

        Returns:
            Tensor: Fused feature map.
        """
        layer_1, layer_2, layer_3, layer_4 = features

        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        # Each refinement step resizes to the next finer level; local
        # references are dropped as soon as they are no longer needed.
        fused = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
        del layer_4_rn, layer_4

        fused = self.scratch.refinenet3(fused, layer_3_rn, size=layer_2_rn.shape[2:])
        del layer_3_rn, layer_3

        fused = self.scratch.refinenet2(fused, layer_2_rn, size=layer_1_rn.shape[2:])
        del layer_2_rn, layer_2

        fused = self.scratch.refinenet1(fused, layer_1_rn)
        del layer_1_rn, layer_1

        return self.scratch.output_conv1(fused)


################################################################################
# Modules
################################################################################


def _make_fusion_block(features: int, size: int = None, has_residual: bool = True, groups: int = 1) -> nn.Module:
    """Build a FeatureFusionBlock with in-place ReLU and fixed deconv/bn/expand/align_corners settings."""
    fixed_options = dict(deconv=False, bn=False, expand=False, align_corners=True)
    return FeatureFusionBlock(
        features,
        nn.ReLU(inplace=True),
        size=size,
        has_residual=has_residual,
        groups=groups,
        **fixed_options,
    )
>= 4: + out_shape4 = out_shape + + if expand: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + if len(in_shape) >= 4: + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + if len(in_shape) >= 4: + scratch.layer4_rn = nn.Conv2d( + in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + return scratch + + +class ResidualConvUnit(nn.Module): + """Residual convolution module.""" + + def __init__(self, features, activation, bn, groups=1): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + self.groups = groups + self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) + self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups) + + self.norm1 = None + self.norm2 = None + + self.activation = activation + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.norm1 is not None: + out = self.norm1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.norm2 is not None: + out = self.norm2(out) + + return self.skip_add.add(out, x) + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block.""" + + def __init__( + self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True, + size=None, + has_residual=True, + groups=1, + ): + """Init. 
def custom_interpolate(
    x: torch.Tensor,
    size: Tuple[int, int] = None,
    scale_factor: float = None,
    mode: str = "bilinear",
    align_corners: bool = True,
) -> torch.Tensor:
    """
    Resize ``x`` with ``nn.functional.interpolate``, splitting very large jobs along dim 0.

    Args:
        x: Input tensor; dims 0 and 1 are multiplied into the element count and
            the last two dims are treated as height and width.
        size: Target (height, width). When ``None`` it is derived from
            ``scale_factor`` and the last two dims of ``x``.
        scale_factor: Multiplier used only when ``size`` is ``None``.
        mode: Interpolation mode forwarded to ``interpolate``.
        align_corners: Forwarded to ``interpolate``.

    Returns:
        The resized tensor. On the chunked path the result is concatenated
        along dim 0 and made contiguous.
    """
    if size is None:
        in_h, in_w = x.shape[-2], x.shape[-1]
        size = (int(in_h * scale_factor), int(in_w * scale_factor))

    INT_MAX = 1610612736

    def _resize(t: torch.Tensor) -> torch.Tensor:
        return nn.functional.interpolate(t, size=size, mode=mode, align_corners=align_corners)

    # Element count of the *output*: target H * target W * dim0 * dim1.
    total_elements = size[0] * size[1] * x.shape[0] * x.shape[1]

    if total_elements <= INT_MAX:
        return _resize(x)

    # Too large for a single call: resize slices of dim 0 separately and stitch them back.
    num_chunks = (total_elements // INT_MAX) + 1
    pieces = [_resize(piece) for piece in torch.chunk(x, chunks=num_chunks, dim=0)]
    return torch.cat(pieces, dim=0).contiguous()
def base_pose_act(pose_enc, act_type="linear"):
    """
    Apply a basic activation function to pose parameters.

    Args:
        pose_enc: Tensor containing encoded pose parameters
        act_type: Activation type ("linear", "inv_log", "exp", "relu")

    Returns:
        Activated pose parameters ("linear" returns the input tensor itself)

    Raises:
        ValueError: If ``act_type`` is not one of the supported names.
    """
    if act_type == "linear":
        return pose_enc
    if act_type == "inv_log":
        return inverse_log_transform(pose_enc)
    if act_type == "exp":
        return torch.exp(pose_enc)
    if act_type == "relu":
        return F.relu(pose_enc)
    raise ValueError(f"Unknown act_type: {act_type}")


def activate_head(out, activation="norm_exp", conf_activation="expp1"):
    """
    Process network output to extract 3D points and confidence values.

    Args:
        out: Network output tensor (B, C, H, W); the last channel is the raw
            confidence and the first C-1 channels are the point coordinates.
        activation: Activation type for 3D points ("norm_exp", "norm", "exp",
            "relu", "inv_log", "xy_inv_log", "sigmoid", "linear").
            "xy_inv_log" splits the coordinates as [2, 1], so it needs C-1 == 3.
        conf_activation: Activation type for confidence values
            ("expp1", "expp0", "sigmoid").

    Returns:
        Tuple of (3D points tensor (B, H, W, C-1), confidence tensor (B, H, W))

    Raises:
        ValueError: If either activation name is not recognised.
    """
    # Move channels to the last dimension: (B, C, H, W) -> (B, H, W, C)
    fmap = out.permute(0, 2, 3, 1)

    # Split into xyz (first C-1 channels) and confidence (last channel)
    xyz = fmap[:, :, :, :-1]
    conf = fmap[:, :, :, -1]

    if activation == "norm_exp":
        d = xyz.norm(dim=-1, keepdim=True).clamp(min=1e-8)
        xyz_normed = xyz / d
        pts3d = xyz_normed * torch.expm1(d)
    elif activation == "norm":
        # Clamp like the "norm_exp" branch so an all-zero vector maps to zero
        # instead of producing NaN from 0 / 0.
        pts3d = xyz / xyz.norm(dim=-1, keepdim=True).clamp(min=1e-8)
    elif activation == "exp":
        pts3d = torch.exp(xyz)
    elif activation == "relu":
        pts3d = F.relu(xyz)
    elif activation == "inv_log":
        pts3d = inverse_log_transform(xyz)
    elif activation == "xy_inv_log":
        xy, z = xyz.split([2, 1], dim=-1)
        z = inverse_log_transform(z)
        pts3d = torch.cat([xy * z, z], dim=-1)
    elif activation == "sigmoid":
        pts3d = torch.sigmoid(xyz)
    elif activation == "linear":
        pts3d = xyz
    else:
        raise ValueError(f"Unknown activation: {activation}")

    if conf_activation == "expp1":
        conf_out = 1 + conf.exp()
    elif conf_activation == "expp0":
        conf_out = conf.exp()
    elif conf_activation == "sigmoid":
        conf_out = torch.sigmoid(conf)
    else:
        raise ValueError(f"Unknown conf_activation: {conf_activation}")

    return pts3d, conf_out


def inverse_log_transform(y):
    """
    Apply inverse log transform: sign(y) * (exp(|y|) - 1)

    Args:
        y: Input tensor

    Returns:
        Transformed tensor
    """
    return torch.sign(y) * (torch.expm1(torch.abs(y)))
class TrackHead(nn.Module):
    """
    Track head that uses DPT head to process tokens and BaseTrackerPredictor for tracking.
    The tracking is performed iteratively, refining predictions over multiple iterations.
    """

    def __init__(
        self,
        dim_in,
        patch_size=14,
        features=128,
        iters=4,
        predict_conf=True,
        stride=2,
        corr_levels=7,
        corr_radius=4,
        hidden_size=384,
    ):
        """
        Initialize the TrackHead module.

        Args:
            dim_in (int): Input dimension of tokens from the backbone.
            patch_size (int): Size of image patches used in the vision transformer.
            features (int): Number of feature channels in the feature extractor output.
            iters (int): Default number of refinement iterations for tracking predictions.
            predict_conf (bool): Whether to predict confidence scores for tracked points.
            stride (int): Stride value for the tracker predictor.
            corr_levels (int): Number of correlation pyramid levels.
            corr_radius (int): Radius for correlation computation, controlling the search area.
            hidden_size (int): Size of hidden layers in the tracker network.
        """
        super().__init__()

        self.patch_size = patch_size

        # Feature extractor based on DPT architecture
        # Processes tokens into feature maps for tracking
        self.feature_extractor = DPTHead(
            dim_in=dim_in,
            patch_size=patch_size,
            features=features,
            feature_only=True,  # Only output features, no activation
            down_ratio=2,  # Reduces spatial dimensions by factor of 2
            pos_embed=False,
        )

        # Tracker module that predicts point trajectories
        # Takes feature maps and predicts coordinates and visibility
        self.tracker = BaseTrackerPredictor(
            latent_dim=features,  # Match the output_dim of feature extractor
            predict_conf=predict_conf,
            stride=stride,
            corr_levels=corr_levels,
            corr_radius=corr_radius,
            hidden_size=hidden_size,
        )

        self.iters = iters

    def forward(self, aggregated_tokens_list, images, patch_start_idx, query_points=None, iters=None):
        """
        Forward pass of the TrackHead.

        Args:
            aggregated_tokens_list (list): List of aggregated tokens from the backbone.
            images (torch.Tensor): Input images, forwarded unchanged to the feature
                extractor (documented upstream as (B, S, C, H, W) with
                B = batch size, S = sequence length).
            patch_start_idx (int): Starting index for patch tokens.
            query_points (torch.Tensor): Query points to track. Required: the
                tracker reads ``query_points.shape`` and has no fallback
                initialisation, so ``None`` is rejected up front.
            iters (int, optional): Number of refinement iterations. If None, uses self.iters.

        Returns:
            tuple:
                - coord_preds: Predicted coordinates for tracked points, as returned by the tracker.
                - vis_scores: Visibility scores for tracked points.
                - conf_scores: Confidence scores for tracked points, as returned by the tracker.

        Raises:
            ValueError: If ``query_points`` is None.
        """
        # Fail before the (expensive) feature extraction rather than inside the tracker.
        if query_points is None:
            raise ValueError("TrackHead.forward requires `query_points`; got None.")

        # Extract features from tokens
        # NOTE(review): expected to be (B, S, C, H//2, W//2) given down_ratio=2 -- confirm against DPTHead.
        feature_maps = self.feature_extractor(aggregated_tokens_list, images, patch_start_idx)

        # Use default iterations if not specified
        if iters is None:
            iters = self.iters

        # Perform tracking using the extracted features
        coord_preds, vis_scores, conf_scores = self.tracker(
            query_points=query_points,
            fmaps=feature_maps,
            iters=iters,
        )

        return coord_preds, vis_scores, conf_scores
class BaseTrackerPredictor(nn.Module):
    """
    The base template to create a track predictor.

    Modified from https://github.com/facebookresearch/co-tracker/
    and https://github.com/facebookresearch/vggsfm
    """

    def __init__(
        self,
        stride=1,
        corr_levels=5,
        corr_radius=4,
        latent_dim=128,
        hidden_size=384,
        use_spaceatt=True,
        depth=6,
        max_scale=518,
        predict_conf=True,
    ):
        """
        Args:
            stride: Factor by which query points are divided on input and
                predicted coordinates are multiplied on output.
            corr_levels: Number of correlation pyramid levels.
            corr_radius: Sampling radius of the correlation window.
            latent_dim: Channel count of the feature maps / track features.
            hidden_size: Hidden width of the correlation MLP and the update transformer.
            use_spaceatt: Whether the update transformer also uses space attention.
            depth: Number of time blocks (and space blocks when ``use_spaceatt``).
            max_scale: Divisor applied to the raw flows appended to the flow embedding.
            predict_conf: Whether to create and use a confidence predictor.
        """
        super(BaseTrackerPredictor, self).__init__()

        self.stride = stride
        self.latent_dim = latent_dim
        self.corr_levels = corr_levels
        self.corr_radius = corr_radius
        self.hidden_size = hidden_size
        self.max_scale = max_scale
        self.predict_conf = predict_conf

        self.flows_emb_dim = latent_dim // 2

        self.corr_mlp = Mlp(
            in_features=self.corr_levels * (self.corr_radius * 2 + 1) ** 2,
            hidden_features=self.hidden_size,
            out_features=self.latent_dim,
        )

        # flow embedding (+4 raw flow values) | correlation features | track features
        self.transformer_dim = self.latent_dim + self.latent_dim + self.latent_dim + 4

        self.query_ref_token = nn.Parameter(torch.randn(1, 2, self.transformer_dim))

        space_depth = depth if use_spaceatt else 0
        time_depth = depth

        self.updateformer = EfficientUpdateFormer(
            space_depth=space_depth,
            time_depth=time_depth,
            input_dim=self.transformer_dim,
            hidden_size=self.hidden_size,
            output_dim=self.latent_dim + 2,
            mlp_ratio=4.0,
            add_space_attn=use_spaceatt,
        )

        self.fmap_norm = nn.LayerNorm(self.latent_dim)
        self.ffeat_norm = nn.GroupNorm(1, self.latent_dim)

        # A linear layer to update track feats at each iteration
        self.ffeat_updater = nn.Sequential(nn.Linear(self.latent_dim, self.latent_dim), nn.GELU())

        self.vis_predictor = nn.Sequential(nn.Linear(self.latent_dim, 1))

        if predict_conf:
            self.conf_predictor = nn.Sequential(nn.Linear(self.latent_dim, 1))

    def forward(self, query_points, fmaps=None, iters=6, return_feat=False, down_ratio=1, apply_sigmoid=True):
        """
        Iteratively refine track coordinates for the given query points.

        Args:
            query_points: B x N x 2, the number of batches, tracks, and xy.
            fmaps: B x S x C x HH x WW, the number of batches, frames, and feature dimension.
                Note HH and WW are the size of the feature maps, not of the original images.
            iters: Number of refinement iterations.
            return_feat: If True, also return the track features and the query features.
            down_ratio: Extra factor by which query points are divided on input
                (and predictions multiplied on output) when greater than 1.
            apply_sigmoid: Whether to pass visibility / confidence logits through a sigmoid.

        Returns:
            ``(coord_preds, vis_e, conf_e)`` or, when ``return_feat`` is True,
            ``(coord_preds, vis_e, track_feats, query_track_feat, conf_e)``.
            ``coord_preds`` is a list with one B x S x N x 2 tensor per iteration,
            rescaled by ``stride`` (and ``down_ratio``); ``vis_e`` is B x S x N;
            ``conf_e`` is B x S x N, or None when ``predict_conf`` is False.
        """
        B, N, D = query_points.shape
        B, S, C, HH, WW = fmaps.shape

        assert D == 2, "Input points must be 2D coordinates"

        # apply a layernorm to fmaps here
        fmaps = self.fmap_norm(fmaps.permute(0, 1, 3, 4, 2))
        fmaps = fmaps.permute(0, 1, 4, 2, 3)

        # Scale the input query_points because we may downsample the images
        # by down_ratio or self.stride
        # e.g., if a 3x1024x1024 image is processed to a 128x256x256 feature map
        # its query_points should be query_points/4
        if down_ratio > 1:
            query_points = query_points / float(down_ratio)

        query_points = query_points / float(self.stride)

        # Init with coords as the query points
        # It means the search will start from the position of query points at the reference frames
        coords = query_points.clone().reshape(B, 1, N, 2).repeat(1, S, 1, 1)

        # Sample/extract the features of the query points in the query frame
        query_track_feat = sample_features4d(fmaps[:, 0], coords[:, 0])

        # init track feats by query feats
        track_feats = query_track_feat.unsqueeze(1).repeat(1, S, 1, 1)  # B, S, N, C
        # back up the init coords
        coords_backup = coords.clone()

        fcorr_fn = CorrBlock(fmaps, num_levels=self.corr_levels, radius=self.corr_radius)

        # 2D positional embedding sampled at the query locations. Frame 0 of `coords`
        # is reset to the (detached) query points after every update, so this value is
        # the same in every iteration and is computed once instead of per iteration.
        pos_embed = get_2d_sincos_pos_embed(self.transformer_dim, grid_size=(HH, WW)).to(query_points.device)
        sampled_pos_emb = sample_features4d(pos_embed.expand(B, -1, -1, -1), coords_backup[:, 0].detach())
        sampled_pos_emb = rearrange(sampled_pos_emb, "b n c -> (b n) c").unsqueeze(1)

        # Token layout over frames: one "query" token for frame 0, the "reference"
        # token repeated for the remaining S - 1 frames. Also loop-invariant.
        query_ref_token = torch.cat(
            [self.query_ref_token[:, 0:1], self.query_ref_token[:, 1:2].expand(-1, S - 1, -1)], dim=1
        )

        coord_preds = []

        # Iterative Refinement
        for _ in range(iters):
            # Detach the gradients from the last iteration
            # (in my experience, not very important for performance)
            coords = coords.detach()

            fcorrs = fcorr_fn.corr_sample(track_feats, coords)

            corr_dim = fcorrs.shape[3]
            fcorrs_ = fcorrs.permute(0, 2, 1, 3).reshape(B * N, S, corr_dim)
            fcorrs_ = self.corr_mlp(fcorrs_)

            # Movement of current coords relative to query points
            flows = (coords - coords[:, 0:1]).permute(0, 2, 1, 3).reshape(B * N, S, 2)

            flows_emb = get_2d_embedding(flows, self.flows_emb_dim, cat_coords=False)

            # (In my trials, it is also okay to just add the flows_emb instead of concat)
            flows_emb = torch.cat([flows_emb, flows / self.max_scale, flows / self.max_scale], dim=-1)

            track_feats_ = track_feats.permute(0, 2, 1, 3).reshape(B * N, S, self.latent_dim)

            # Concatenate them as the input for the transformers
            transformer_input = torch.cat([flows_emb, fcorrs_, track_feats_], dim=2)

            x = transformer_input + sampled_pos_emb

            # Add the query ref token to the track feats
            x = x + query_ref_token.to(x.device).to(x.dtype)

            # B, N, S, C
            x = rearrange(x, "(b n) s d -> b n s d", b=B)

            # Compute the delta coordinates and delta track features
            delta, _ = self.updateformer(x)

            # BN, S, C
            delta = rearrange(delta, " b n s d -> (b n) s d", b=B)
            delta_coords_ = delta[:, :, :2]
            delta_feats_ = delta[:, :, 2:]

            track_feats_ = track_feats_.reshape(B * N * S, self.latent_dim)
            delta_feats_ = delta_feats_.reshape(B * N * S, self.latent_dim)

            # Update the track features
            track_feats_ = self.ffeat_updater(self.ffeat_norm(delta_feats_)) + track_feats_

            track_feats = track_feats_.reshape(B, N, S, self.latent_dim).permute(0, 2, 1, 3)  # BxSxNxC

            # B x S x N x 2
            coords = coords + delta_coords_.reshape(B, N, S, 2).permute(0, 2, 1, 3)

            # Force coord0 as query
            # because we assume the query points should not be changed
            coords[:, 0] = coords_backup[:, 0]

            # The predicted tracks are in the original image scale
            if down_ratio > 1:
                coord_preds.append(coords * self.stride * down_ratio)
            else:
                coord_preds.append(coords * self.stride)

        # B, S, N
        vis_e = self.vis_predictor(track_feats.reshape(B * S * N, self.latent_dim)).reshape(B, S, N)
        if apply_sigmoid:
            vis_e = torch.sigmoid(vis_e)

        if self.predict_conf:
            conf_e = self.conf_predictor(track_feats.reshape(B * S * N, self.latent_dim)).reshape(B, S, N)
            if apply_sigmoid:
                conf_e = torch.sigmoid(conf_e)
        else:
            conf_e = None

        if return_feat:
            return coord_preds, vis_e, track_feats, query_track_feat, conf_e
        else:
            return coord_preds, vis_e, conf_e
+ """ + + def __init__( + self, + space_depth=6, + time_depth=6, + input_dim=320, + hidden_size=384, + num_heads=8, + output_dim=130, + mlp_ratio=4.0, + add_space_attn=True, + num_virtual_tracks=64, + ): + super().__init__() + + self.out_channels = 2 + self.num_heads = num_heads + self.hidden_size = hidden_size + self.add_space_attn = add_space_attn + + # Add input LayerNorm before linear projection + self.input_norm = nn.LayerNorm(input_dim) + self.input_transform = torch.nn.Linear(input_dim, hidden_size, bias=True) + + # Add output LayerNorm before final projection + self.output_norm = nn.LayerNorm(hidden_size) + self.flow_head = torch.nn.Linear(hidden_size, output_dim, bias=True) + self.num_virtual_tracks = num_virtual_tracks + + if self.add_space_attn: + self.virual_tracks = nn.Parameter(torch.randn(1, num_virtual_tracks, 1, hidden_size)) + else: + self.virual_tracks = None + + self.time_blocks = nn.ModuleList( + [ + AttnBlock( + hidden_size, + num_heads, + mlp_ratio=mlp_ratio, + attn_class=nn.MultiheadAttention, + ) + for _ in range(time_depth) + ] + ) + + if add_space_attn: + self.space_virtual_blocks = nn.ModuleList( + [ + AttnBlock( + hidden_size, + num_heads, + mlp_ratio=mlp_ratio, + attn_class=nn.MultiheadAttention, + ) + for _ in range(space_depth) + ] + ) + self.space_point2virtual_blocks = nn.ModuleList( + [CrossAttnBlock(hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(space_depth)] + ) + self.space_virtual2point_blocks = nn.ModuleList( + [CrossAttnBlock(hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(space_depth)] + ) + assert len(self.time_blocks) >= len(self.space_virtual2point_blocks) + self.initialize_weights() + + def initialize_weights(self): + def _basic_init(module): + if isinstance(module, nn.Linear): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.constant_(module.bias, 0) + torch.nn.init.trunc_normal_(self.flow_head.weight, std=0.001) + + 
class CorrBlock:
    """Feature pyramid plus on-the-fly correlation sampling around track coordinates."""

    def __init__(self, fmaps, num_levels=4, radius=4, multiple_track_feats=False, padding_mode="zeros"):
        """
        Build a pyramid of feature maps from the input.

        fmaps: Tensor (B, S, C, H, W)
        num_levels: number of pyramid levels (each downsampled by factor 2)
        radius: search radius for sampling correlation
        multiple_track_feats: if True, split the target features per pyramid level
        padding_mode: passed to bilinear_sampler
        """
        _, S, C, H, W = fmaps.shape
        self.S, self.C, self.H, self.W = S, C, H, W
        self.num_levels = num_levels
        self.radius = radius
        self.padding_mode = padding_mode
        self.multiple_track_feats = multiple_track_feats

        # Level 0 is the input itself; each further level is a 2x2 average pool of the previous one.
        pyramid = [fmaps]
        level = fmaps
        for _ in range(num_levels - 1):
            b, s, c, h, w = level.shape
            # avg_pool2d expects 4D input, so batch and sequence are merged temporarily.
            pooled = F.avg_pool2d(level.reshape(b * s, c, h, w), kernel_size=2, stride=2)
            level = pooled.reshape(b, s, c, pooled.shape[-2], pooled.shape[-1])
            pyramid.append(level)
        self.fmaps_pyramid = pyramid

        # Offset window of shape (2r+1, 2r+1, 2): entry [i, j] holds (offsets[i], offsets[j]).
        r = self.radius
        offsets = torch.linspace(-r, r, 2 * r + 1, device=fmaps.device, dtype=fmaps.dtype)
        self.delta = torch.stack(torch.meshgrid(offsets, offsets, indexing="ij"), dim=-1)

    def corr_sample(self, targets, coords):
        """
        Compute each level's correlation volume, sample it around ``coords`` and discard it,
        instead of storing the whole correlation pyramid.

        Args:
            targets: Tensor (B, S, N, C) -- features for the current targets.
            coords: Tensor (B, S, N, 2) -- coordinates at full resolution.

        Returns:
            Tensor (B, S, N, L) where L = num_levels * (2*radius+1)**2 (concatenated sampled correlations)
        """
        B, S, N, target_dim = targets.shape
        window = 2 * self.radius + 1

        # With multiple track features, every pyramid level gets its own slice of the channels.
        if self.multiple_track_feats:
            targets_split = torch.split(targets, target_dim // self.num_levels, dim=-1)

        # The offset window only needs to be moved to the coords' device/dtype once.
        delta_window = self.delta.to(coords.device).to(coords.dtype).view(1, window, window, 2)

        sampled_levels = []
        for lvl, fmaps in enumerate(self.fmaps_pyramid):
            B, S, C, H, W = fmaps.shape
            flat_fmaps = fmaps.view(B, S, C, H * W)
            level_targets = targets_split[lvl] if self.multiple_track_feats else targets

            # Correlation of every target with every location: (B, S, N, H, W).
            corrs = compute_corr_level(level_targets, flat_fmaps, C).view(B, S, N, H, W)

            # Coordinates shrink by 2 per level; the window is added around each centre.
            centres = coords.reshape(B * S * N, 1, 1, 2) / (2**lvl)
            sample_grid = centres + delta_window

            # One single-channel "image" per target so the sampler handles each one independently.
            sampled = bilinear_sampler(
                corrs.reshape(B * S * N, 1, H, W), sample_grid, padding_mode=self.padding_mode
            )
            sampled_levels.append(sampled.view(B, S, N, -1))  # (B, S, N, (2r+1)^2)

        return torch.cat(sampled_levels, dim=-1).contiguous()


def compute_corr_level(fmap1, fmap2s, C):
    """Scaled dot-product correlation: (B, S, N, C) @ (B, S, C, H*W) / sqrt(C) -> (B, S, N, H*W)."""
    B, S, N = fmap1.shape[0], fmap1.shape[1], fmap1.shape[2]
    scores = torch.matmul(fmap1, fmap2s).view(B, S, N, -1)
    return scores / math.sqrt(C)
class Mlp(nn.Module):
    """Two-layer MLP (fc1 -> activation -> dropout -> fc2 -> dropout), ViT / MLP-Mixer style."""

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        norm_layer=None,
        bias=True,
        drop=0.0,
        use_conv=False,
    ):
        """
        Args:
            in_features: Input width.
            hidden_features: Hidden width; falls back to ``in_features`` when falsy.
            out_features: Output width; falls back to ``in_features`` when falsy.
            act_layer: Zero-argument factory for the activation module.
            norm_layer: Accepted but not used by this implementation.
            bias: One value for both layers, or a pair (fc1, fc2).
            drop: Dropout probability, one value for both or a pair (after act, after fc2).
            use_conv: Use 1x1 ``nn.Conv2d`` layers instead of ``nn.Linear``.
        """
        super().__init__()
        hidden_width = hidden_features or in_features
        out_width = out_features or in_features
        bias_pair = to_2tuple(bias)
        drop_pair = to_2tuple(drop)

        if use_conv:
            make_layer = partial(nn.Conv2d, kernel_size=1)
        else:
            make_layer = nn.Linear

        self.fc1 = make_layer(in_features, hidden_width, bias=bias_pair[0])
        self.act = act_layer()
        self.drop1 = nn.Dropout(drop_pair[0])
        self.fc2 = make_layer(hidden_width, out_width, bias=bias_pair[1])
        self.drop2 = nn.Dropout(drop_pair[1])

    def forward(self, x):
        """Apply fc1, activation, dropout, fc2, dropout in sequence."""
        hidden = self.drop1(self.act(self.fc1(x)))
        return self.drop2(self.fc2(hidden))


class AttnBlock(nn.Module):
    """Self-attention block followed by an MLP, each with a residual connection."""

    def __init__(
        self,
        hidden_size,
        num_heads,
        attn_class: Callable[..., nn.Module] = nn.MultiheadAttention,
        mlp_ratio=4.0,
        **block_kwargs
    ):
        """
        Args:
            hidden_size: Token width (embedding dim of the attention layer).
            num_heads: Number of attention heads.
            attn_class: Factory for the attention module; it is called with
                ``embed_dim``, ``num_heads``, ``batch_first=True`` and ``block_kwargs``.
            mlp_ratio: MLP hidden width as a multiple of ``hidden_size``.
            **block_kwargs: Extra keyword arguments forwarded to ``attn_class``.
        """
        super().__init__()

        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)

        self.attn = attn_class(embed_dim=hidden_size, num_heads=num_heads, batch_first=True, **block_kwargs)

        self.mlp = Mlp(in_features=hidden_size, hidden_features=int(hidden_size * mlp_ratio), drop=0)

    def forward(self, x, mask=None):
        """
        Args:
            x: Tokens, batch-first.
            mask: Accepted for interface compatibility; it is not passed to the attention call.

        Returns:
            Tokens of the same shape as ``x``.
        """
        # The residual branch starts from the *normalised* input, not the raw one.
        normed = self.norm1(x)

        # The attention module returns (output, weights); only the output is used.
        attended, _ = self.attn(normed, normed, normed)

        hidden = normed + attended
        return hidden + self.mlp(self.norm2(hidden))
def get_2d_sincos_pos_embed(embed_dim: int, grid_size: Union[int, Tuple[int, int]], return_grid=False) -> torch.Tensor:
    """
    Build a coordinate grid and turn it into a 2D sine/cosine positional embedding.
    This is a thin wrapper around get_2d_sincos_pos_embed_from_grid.
    Args:
    - embed_dim: The embedding dimension.
    - grid_size: Either a single int (square grid) or a (height, width) tuple.
    - return_grid: When True, also return the coordinate grid that was embedded.
    Returns:
    - pos_embed: The embedding, reshaped to (1, H, W, -1) and permuted to channels-first.
      When return_grid is True a tuple (pos_embed, grid) is returned instead,
      with grid of shape (2, 1, H, W).
    """
    height, width = grid_size if isinstance(grid_size, tuple) else (grid_size, grid_size)

    xs = torch.arange(width, dtype=torch.float)
    ys = torch.arange(height, dtype=torch.float)

    # "xy" indexing yields two (height, width) maps: x coordinates first, then y.
    coord_maps = torch.meshgrid(xs, ys, indexing="xy")
    grid = torch.stack(coord_maps, dim=0).reshape([2, 1, height, width])

    flat_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    pos_embed = flat_embed.reshape(1, height, width, -1).permute(0, 3, 1, 2)

    return (pos_embed, grid) if return_grid else pos_embed
def get_2d_embedding(xy: torch.Tensor, C: int, cat_coords: bool = True) -> torch.Tensor:
    """
    This function generates a 2D positional embedding from given coordinates using sine and cosine functions.

    Args:
    - xy: Coordinates of shape (B, N, 2); channel 0 is x, channel 1 is y.
    - C: The size of the embedding per coordinate axis (must be even).
    - cat_coords: A flag to indicate whether to concatenate the original coordinates to the embedding.

    Returns:
    - pe: The generated 2D positional embedding, of shape (B, N, 2*C), or
      (B, N, 2*C + 2) when cat_coords is True (raw coordinates come first).
    """
    B, N, D = xy.shape
    assert D == 2
    # An odd C cannot be split into sine/cosine pairs; fail early with a clear
    # message instead of an opaque reshape error further down.
    assert C % 2 == 0, f"embedding size C must be even, got {C}"

    x = xy[:, :, 0:1]
    y = xy[:, :, 1:2]
    div_term = (torch.arange(0, C, 2, device=xy.device, dtype=torch.float32) * (1000.0 / C)).reshape(1, 1, C // 2)

    # Compute each angle tensor once and reuse it for both sin and cos.
    angles_x = x * div_term  # (B, N, C/2)
    angles_y = y * div_term  # (B, N, C/2)

    pe_x = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)
    pe_y = torch.zeros(B, N, C, device=xy.device, dtype=torch.float32)

    # Even channels hold sines, odd channels hold cosines.
    pe_x[:, :, 0::2] = torch.sin(angles_x)
    pe_x[:, :, 1::2] = torch.cos(angles_x)

    pe_y[:, :, 0::2] = torch.sin(angles_y)
    pe_y[:, :, 1::2] = torch.cos(angles_y)

    pe = torch.cat([pe_x, pe_y], dim=2)  # (B, N, 2*C)
    if cat_coords:
        pe = torch.cat([xy, pe], dim=2)  # (B, N, 2*C + 2)
    return pe
+ + The input tensor is assumed to be of shape :math:`(B, C, H, W)`, where + :math:`B` is the batch size, :math:`C` is the number of channels, + :math:`H` is the height of the image, and :math:`W` is the width of the + image. The tensor :attr:`coords` of shape :math:`(B, H_o, W_o, 2)` is + interpreted as an array of 2D point coordinates :math:`(x_i,y_i)`. + + Alternatively, the input tensor can be of size :math:`(B, C, T, H, W)`, + in which case sample points are triplets :math:`(t_i,x_i,y_i)`. Note + that in this case the order of the components is slightly different + from `grid_sample()`, which would expect :math:`(x_i,y_i,t_i)`. + + If `align_corners` is `True`, the coordinate :math:`x` is assumed to be + in the range :math:`[0,W-1]`, with 0 corresponding to the center of the + left-most image pixel :math:`W-1` to the center of the right-most + pixel. + + If `align_corners` is `False`, the coordinate :math:`x` is assumed to + be in the range :math:`[0,W]`, with 0 corresponding to the left edge of + the left-most pixel :math:`W` to the right edge of the right-most + pixel. + + Similar conventions apply to the :math:`y` for the range + :math:`[0,H-1]` and :math:`[0,H]` and to :math:`t` for the range + :math:`[0,T-1]` and :math:`[0,T]`. + + Args: + input (Tensor): batch of input images. + coords (Tensor): batch of coordinates. + align_corners (bool, optional): Coordinate convention. Defaults to `True`. + padding_mode (str, optional): Padding mode. Defaults to `"border"`. + + Returns: + Tensor: sampled points. 
+ """ + coords = coords.detach().clone() + ############################################################ + # IMPORTANT: + coords = coords.to(input.device).to(input.dtype) + ############################################################ + + sizes = input.shape[2:] + + assert len(sizes) in [2, 3] + + if len(sizes) == 3: + # t x y -> x y t to match dimensions T H W in grid_sample + coords = coords[..., [1, 2, 0]] + + if align_corners: + scale = torch.tensor( + [2 / max(size - 1, 1) for size in reversed(sizes)], device=coords.device, dtype=coords.dtype + ) + else: + scale = torch.tensor([2 / size for size in reversed(sizes)], device=coords.device, dtype=coords.dtype) + + coords.mul_(scale) # coords = coords * scale + coords.sub_(1) # coords = coords - 1 + + return F.grid_sample(input, coords, align_corners=align_corners, padding_mode=padding_mode) + + +def sample_features4d(input, coords): + r"""Sample spatial features + + `sample_features4d(input, coords)` samples the spatial features + :attr:`input` represented by a 4D tensor :math:`(B, C, H, W)`. + + The field is sampled at coordinates :attr:`coords` using bilinear + interpolation. :attr:`coords` is assumed to be of shape :math:`(B, R, + 2)`, where each sample has the format :math:`(x_i, y_i)`. This uses the + same convention as :func:`bilinear_sampler` with `align_corners=True`. + + The output tensor has one feature per point, and has shape :math:`(B, + R, C)`. + + Args: + input (Tensor): spatial features. + coords (Tensor): points. + + Returns: + Tensor: sampled features. 
def position_grid_to_embed(pos_grid: torch.Tensor, embed_dim: int, omega_0: float = 100) -> torch.Tensor:
    """
    Convert 2D position grid (HxWx2) to sinusoidal embeddings (HxWxC)

    Args:
        pos_grid: Tensor of shape (H, W, 2) containing 2D coordinates
        embed_dim: Output channel dimension for embeddings
        omega_0: Frequency base forwarded to make_sincos_pos_embed

    Returns:
        Tensor of shape (H, W, embed_dim) with positional embeddings
    """
    height, width, num_coords = pos_grid.shape
    assert num_coords == 2

    coords = pos_grid.reshape(-1, num_coords)  # (H*W, 2)
    half_dim = embed_dim // 2

    # Embed each coordinate channel on its own: channel 0 first, then channel 1.
    per_axis = [
        make_sincos_pos_embed(half_dim, coords[:, axis], omega_0=omega_0)  # (H*W, D/2)
        for axis in range(num_coords)
    ]

    # Concatenate along channels and restore the spatial layout.
    return torch.cat(per_axis, dim=-1).view(height, width, embed_dim)  # (H, W, D)
def create_uv_grid(
    width: int, height: int, aspect_ratio: float = None, dtype: torch.dtype = None, device: torch.device = None
) -> torch.Tensor:
    """
    Create a normalized UV grid of shape (height, width, 2).

    The grid spans horizontally and vertically according to an aspect ratio.
    The spans are normalized by the diagonal of the plane, and coordinates are
    sampled at cell centres, so the outermost samples sit at
    +/- span * (n - 1) / n rather than at +/- span.

    Args:
        width (int): Number of points horizontally.
        height (int): Number of points vertically.
        aspect_ratio (float, optional): Width-to-height ratio. Defaults to width/height.
        dtype (torch.dtype, optional): Data type of the resulting tensor.
        device (torch.device, optional): Device on which the tensor is created.

    Returns:
        torch.Tensor: A (height, width, 2) tensor of UV coordinates. The last
        axis holds (u, v): u varies along the width axis (dim 1) and v along
        the height axis (dim 0).
    """
    # Derive aspect ratio if not explicitly provided
    if aspect_ratio is None:
        aspect_ratio = float(width) / float(height)

    # Compute normalized spans for X and Y
    diag_factor = (aspect_ratio**2 + 1.0) ** 0.5
    span_x = aspect_ratio / diag_factor
    span_y = 1.0 / diag_factor

    # The linspace boundaries are symmetric around zero.
    half_x = span_x * (width - 1) / width
    half_y = span_y * (height - 1) / height

    # Generate 1D coordinates
    x_coords = torch.linspace(-half_x, half_x, steps=width, dtype=dtype, device=device)
    y_coords = torch.linspace(-half_y, half_y, steps=height, dtype=dtype, device=device)

    # With indexing="xy", both outputs have shape (height, width).
    uu, vv = torch.meshgrid(x_coords, y_coords, indexing="xy")
    uv_grid = torch.stack((uu, vv), dim=-1)

    return uv_grid
class Attention(nn.Module):
    """Multi-head self-attention with optional per-head q/k normalisation and rotary embedding.

    Args:
        dim: Token feature dimension; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        qkv_bias: Whether the fused q/k/v projection has a bias.
        proj_bias: Whether the output projection has a bias.
        attn_drop: Dropout probability applied to the attention weights.
        proj_drop: Dropout probability applied after the output projection.
        norm_layer: Factory for the q/k normalisation (called with ``head_dim``).
        qk_norm: Normalise q and k per head when True; otherwise identity.
        fused_attn: Use ``F.scaled_dot_product_attention`` when True, else the
            explicit softmax formulation.
        rope: Optional callable applied as ``rope(q, pos)`` / ``rope(k, pos)``.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
        norm_layer: nn.Module = nn.LayerNorm,
        qk_norm: bool = False,
        fused_attn: bool = True,  # use F.scaled_dot_product_attention or not
        rope=None,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, "dim should be divisible by num_heads"

        head_dim = dim // num_heads
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.scale = head_dim**-0.5
        self.fused_attn = fused_attn

        # Sub-modules are registered in the original order on purpose.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        if qk_norm:
            self.q_norm = norm_layer(head_dim)
            self.k_norm = norm_layer(head_dim)
        else:
            self.q_norm = nn.Identity()
            self.k_norm = nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)
        self.rope = rope

    def forward(
        self,
        x: torch.Tensor,
        pos=None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Tuple]]:
        """Attend over the tokens of ``x`` (shape ``(B, N, C)``) and return a ``(B, N, C)`` tensor.

        ``pos`` is only used when a ``rope`` callable was supplied.
        """
        batch, tokens, channels = x.shape

        # (B, N, 3*C) -> (3, B, heads, N, head_dim)
        packed = self.qkv(x).reshape(batch, tokens, 3, self.num_heads, self.head_dim)
        q, k, v = packed.permute(2, 0, 3, 1, 4).unbind(0)

        q = self.q_norm(q)
        k = self.k_norm(k)

        if self.rope is not None:
            q = self.rope(q, pos)
            k = self.rope(k, pos)

        if not self.fused_attn:
            # Explicit scaled dot-product attention.
            scores = (q * self.scale) @ k.transpose(-2, -1)
            weights = self.attn_drop(scores.softmax(dim=-1))
            out = weights @ v
        else:
            # Dropout is only active in training mode.
            out = F.scaled_dot_product_attention(
                q,
                k,
                v,
                dropout_p=self.attn_drop.p if self.training else 0.0,
            )

        # (B, heads, N, head_dim) -> (B, N, C)
        out = out.transpose(1, 2).reshape(batch, tokens, channels)
        return self.proj_drop(self.proj(out))
class Block(nn.Module):
    """Pre-norm transformer block with an attention branch and a feed-forward branch.

    Each branch is ``norm -> layer -> optional LayerScale`` and is added back
    to its input as a residual. During training the residuals can be
    stochastically dropped, controlled by ``drop_path``.

    Args:
        dim: Token feature dimension.
        num_heads: Number of attention heads.
        mlp_ratio: Hidden width of the feed-forward layer as a multiple of ``dim``.
        qkv_bias: Bias flag for the attention q/k/v projection.
        proj_bias: Bias flag for the attention output projection.
        ffn_bias: Bias flag for the feed-forward layer.
        drop: Dropout probability for the attention projection and the feed-forward layer.
        attn_drop: Dropout probability for the attention weights.
        init_values: LayerScale initial value; a falsy value disables LayerScale.
        drop_path: Stochastic-depth rate.
        act_layer: Activation factory for the feed-forward layer.
        norm_layer: Normalisation factory (called with ``dim``).
        attn_class: Attention layer factory.
        ffn_layer: Feed-forward layer factory.
        qk_norm: Forwarded to ``attn_class``.
        fused_attn: Forwarded to ``attn_class``.
        rope: Forwarded to ``attn_class``.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
        qk_norm: bool = False,
        fused_attn: bool = True,  # use F.scaled_dot_product_attention or not
        rope=None,
    ) -> None:
        super().__init__()

        self.norm1 = norm_layer(dim)

        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
            qk_norm=qk_norm,
            fused_attn=fused_attn,
            rope=rope,
        )

        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor, pos=None) -> Union[Tensor, Tuple[Tensor, Dict]]:
        """Apply the attention and feed-forward residual branches to ``x``.

        Args:
            x: Token tensor.
            pos: Optional positions forwarded to the attention layer.

        Returns:
            The transformed tensor.
        """

        def attn_residual_func(x: Tensor, pos=None) -> Union[Tensor, Tuple[Tensor, Dict]]:
            return self.ls1(self.attn(self.norm1(x), pos=pos))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                pos=pos,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x, pos=pos))
            # Each branch uses its own DropPath module; the feed-forward branch
            # previously reused drop_path1, leaving drop_path2 unused.
            x = x + self.drop_path2(ffn_residual_func(x))
        else:
            x = x + attn_residual_func(x, pos=pos)
            x = x + ffn_residual_func(x)
        return x
def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    """Add a scaled residual, computed on a subset of the batch, back into ``x``.

    Args:
        x: Full-batch tensor.
        brange: Index tensor selecting the rows of ``x`` that ``residual`` belongs to.
        residual: Residual for the selected rows; it is cast to ``x.dtype``.
        residual_scale_factor: Multiplier applied to the residual (``alpha``).
        scaling_vector: Optional extra scaling; when given, the work is
            delegated to ``scaled_index_add``.

    Returns:
        ``x`` plus the scaled residual. Without ``scaling_vector`` the result
        is flattened from dim 1 onwards; ``x`` itself is not modified.
    """
    if scaling_vector is not None:
        # NOTE(review): scaled_index_add is not imported in the visible part of
        # this module - confirm where it is expected to come from.
        return scaled_index_add(
            x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
        )

    flat_residual = residual.flatten(1).to(dtype=x.dtype)
    return torch.index_add(x.flatten(1), 0, brange, flat_residual, alpha=residual_scale_factor)
class NestedTensorBlock(Block):
    """Block that can also process a list of tensors concatenated into one sequence."""

    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        # The module-level imports only bring in `Attention`, so import the
        # memory-efficient variant locally to keep this check from raising NameError.
        from .attention import MemEffAttention

        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:

            # LayerScale is not applied inside these residual functions; its
            # gamma is passed as `scaling_vector` instead.
            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                # Check ls2 (not ls1) before reading ls2.gamma.
                scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
            )
            return x_list
        else:

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        """Dispatch on input type: a Tensor uses ``Block.forward``, a list uses ``forward_nested``.

        Raises:
            AssertionError: If a list is given while ``XFORMERS_AVAILABLE`` is
                False, or if the input is neither a Tensor nor a list.
        """
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            if not XFORMERS_AVAILABLE:
                raise AssertionError("xFormers is required for using nested tensors")
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
class LayerScale(nn.Module):
    """Multiply the input by a learnable per-channel vector ``gamma``.

    Args:
        dim: Length of ``gamma``.
        init_values: Initial value(s) of ``gamma`` (multiplied into a ones vector).
        inplace: When True the input tensor is scaled in place and returned.
    """

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        initial_gamma = init_values * torch.ones(dim)
        self.gamma = nn.Parameter(initial_gamma)

    def forward(self, x: Tensor) -> Tensor:
        """Return ``x`` scaled by ``gamma``."""
        if self.inplace:
            # mul_ mutates and returns the very same tensor object
            return x.mul_(self.gamma)
        return x * self.gamma
class Mlp(nn.Module):
    """Two-layer perceptron: linear -> activation -> dropout -> linear -> dropout.

    Args:
        in_features: Input feature size.
        hidden_features: Hidden layer size; falls back to ``in_features`` when falsy.
        out_features: Output feature size; falls back to ``in_features`` when falsy.
        act_layer: Activation factory, called with no arguments.
        drop: Dropout probability; one dropout module is shared by both stages.
        bias: Whether both linear layers have a bias.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        # Registration order (fc1, act, fc2, drop) is kept as in the original.
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        """Run the perceptron on ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
def make_2tuple(x):
    """Return ``x`` as a pair.

    A tuple is returned unchanged (it must have exactly two elements); an int
    ``n`` becomes ``(n, n)``. Anything else fails an assertion.
    """
    if not isinstance(x, tuple):
        assert isinstance(x, int)
        return x, x

    assert len(x) == 2
    return x
class PositionGetter:
    """Generates and caches 2D spatial positions for patches in a grid.

    This class efficiently manages the generation of spatial coordinates for patches
    in a 2D grid, caching results to avoid redundant computations.

    Attributes:
        position_cache: Dictionary storing precomputed position tensors for different
            grid dimensions, keyed by (height, width). An entry lives on the device
            it was first requested for.
    """

    def __init__(self):
        """Initializes the position generator with an empty cache."""
        self.position_cache: Dict[Tuple[int, int], torch.Tensor] = {}

    def __call__(self, batch_size: int, height: int, width: int, device: torch.device) -> torch.Tensor:
        """Generates spatial positions for a batch of patches.

        Args:
            batch_size: Number of samples in the batch.
            height: Height of the grid in patches.
            width: Width of the grid in patches.
            device: Target device for the position tensor.

        Returns:
            Tensor of shape (batch_size, height*width, 2) containing y,x coordinates
            for each position in the grid, repeated for each batch item. The tensor
            is a fresh copy on ``device``; modifying it does not affect the cache.
        """
        if (height, width) not in self.position_cache:
            y_coords = torch.arange(height, device=device)
            x_coords = torch.arange(width, device=device)
            positions = torch.cartesian_prod(y_coords, x_coords)
            self.position_cache[height, width] = positions

        # The cache key does not include the device, so a cached entry may live
        # elsewhere (e.g. after the model was moved). `.to` is a no-op when the
        # tensor is already on `device`, so the common path stays copy-free.
        cached_positions = self.position_cache[height, width].to(device)
        return cached_positions.view(1, height * width, 2).expand(batch_size, -1, -1).clone()
+ """ + + def __init__(self, frequency: float = 100.0, scaling_factor: float = 1.0): + """Initializes the 2D RoPE module.""" + super().__init__() + self.base_frequency = frequency + self.scaling_factor = scaling_factor + self.frequency_cache: Dict[Tuple, Tuple[torch.Tensor, torch.Tensor]] = {} + + def _compute_frequency_components( + self, dim: int, seq_len: int, device: torch.device, dtype: torch.dtype + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Computes frequency components for rotary embeddings. + + Args: + dim: Feature dimension (must be even). + seq_len: Maximum sequence length. + device: Target device for computations. + dtype: Data type for the computed tensors. + + Returns: + Tuple of (cosine, sine) tensors for frequency components. + """ + cache_key = (dim, seq_len, device, dtype) + if cache_key not in self.frequency_cache: + # Compute frequency bands + exponents = torch.arange(0, dim, 2, device=device).float() / dim + inv_freq = 1.0 / (self.base_frequency**exponents) + + # Generate position-dependent frequencies + positions = torch.arange(seq_len, device=device, dtype=inv_freq.dtype) + angles = torch.einsum("i,j->ij", positions, inv_freq) + + # Compute and cache frequency components + angles = angles.to(dtype) + angles = torch.cat((angles, angles), dim=-1) + cos_components = angles.cos().to(dtype) + sin_components = angles.sin().to(dtype) + self.frequency_cache[cache_key] = (cos_components, sin_components) + + return self.frequency_cache[cache_key] + + @staticmethod + def _rotate_features(x: torch.Tensor) -> torch.Tensor: + """Performs feature rotation by splitting and recombining feature dimensions. + + Args: + x: Input tensor to rotate. + + Returns: + Rotated feature tensor. 
+ """ + feature_dim = x.shape[-1] + x1, x2 = x[..., : feature_dim // 2], x[..., feature_dim // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def _apply_1d_rope( + self, tokens: torch.Tensor, positions: torch.Tensor, cos_comp: torch.Tensor, sin_comp: torch.Tensor + ) -> torch.Tensor: + """Applies 1D rotary position embeddings along one dimension. + + Args: + tokens: Input token features. + positions: Position indices. + cos_comp: Cosine components for rotation. + sin_comp: Sine components for rotation. + + Returns: + Tokens with applied rotary position embeddings. + """ + # Embed positions with frequency components + cos = F.embedding(positions, cos_comp)[:, None, :, :] + sin = F.embedding(positions, sin_comp)[:, None, :, :] + + # Apply rotation + return (tokens * cos) + (self._rotate_features(tokens) * sin) + + def forward(self, tokens: torch.Tensor, positions: torch.Tensor) -> torch.Tensor: + """Applies 2D rotary position embeddings to input tokens. + + Args: + tokens: Input tensor of shape (batch_size, n_heads, n_tokens, dim). + The feature dimension (dim) must be divisible by 4. + positions: Position tensor of shape (batch_size, n_tokens, 2) containing + the y and x coordinates for each token. + + Returns: + Tensor of same shape as input with applied 2D rotary position embeddings. + + Raises: + AssertionError: If input dimensions are invalid or positions are malformed. 
+ """ + # Validate inputs + assert tokens.size(-1) % 2 == 0, "Feature dimension must be even" + assert positions.ndim == 3 and positions.shape[-1] == 2, "Positions must have shape (batch_size, n_tokens, 2)" + + # Compute feature dimension for each spatial direction + feature_dim = tokens.size(-1) // 2 + + # Get frequency components + max_position = int(positions.max()) + 1 + cos_comp, sin_comp = self._compute_frequency_components(feature_dim, max_position, tokens.device, tokens.dtype) + + # Split features for vertical and horizontal processing + vertical_features, horizontal_features = tokens.chunk(2, dim=-1) + + # Apply RoPE separately for each dimension + vertical_features = self._apply_1d_rope(vertical_features, positions[..., 0], cos_comp, sin_comp) + horizontal_features = self._apply_1d_rope(horizontal_features, positions[..., 1], cos_comp, sin_comp) + + # Combine processed features + return torch.cat((vertical_features, horizontal_features), dim=-1) diff --git a/outdoor_v48_16gpu_v2/code/05_02-22:24:00/vggt/layers/swiglu_ffn.py b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/vggt/layers/swiglu_ffn.py new file mode 100644 index 0000000000000000000000000000000000000000..54fe8e90b7bedf6fbdbf09c6215844e3cc63f857 --- /dev/null +++ b/outdoor_v48_16gpu_v2/code/05_02-22:24:00/vggt/layers/swiglu_ffn.py @@ -0,0 +1,72 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the Apache License, Version 2.0 +# found in the LICENSE file in the root directory of this source tree. 
class SwiGLUFFN(nn.Module):
    """Feed-forward layer with a SwiGLU gate, written in plain PyTorch.

    A single linear layer (``w12``) produces the gate and the value branch at
    once; the SiLU-activated gate multiplies the value and ``w3`` projects the
    product to the output width. ``act_layer`` and ``drop`` are accepted for
    signature compatibility and are not used.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        # Missing (or zero) widths fall back to the input width.
        hidden_width = hidden_features or in_features
        output_width = out_features or in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_width, bias=bias)
        self.w3 = nn.Linear(hidden_width, output_width, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        gate, value = self.w12(x).chunk(2, dim=-1)
        return self.w3(F.silu(gate) * value)


# The flag is still derived from the environment, but the xFormers fast path is
# switched off in this copy of the file: the pure PyTorch layer is always used.
XFORMERS_ENABLED = "XFORMERS_DISABLED" not in os.environ
SwiGLU = SwiGLUFFN
XFORMERS_AVAILABLE = False


class SwiGLUFFNFused(SwiGLU):
    """SwiGLU feed-forward layer whose hidden width is shrunk and aligned.

    The requested hidden width is scaled by 2/3 and rounded up to the next
    multiple of 8 before the parent layer is built. ``act_layer`` and ``drop``
    are accepted for signature compatibility and are not used.
    """

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        requested_hidden = hidden_features or in_features
        # Two thirds of the requested width, rounded up to a multiple of 8.
        aligned_hidden = (int(requested_hidden * 2 / 3) + 7) // 8 * 8
        super().__init__(
            in_features=in_features,
            hidden_features=aligned_hidden,
            out_features=out_features or in_features,
            bias=bias,
        )
logger = logging.getLogger("dinov2")


def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
    """Recursively call ``fn(module=..., name=...)`` on the descendants of ``module``.

    Args:
        fn: Callback invoked with the keyword arguments ``module`` and ``name``.
        module: Root of the module tree to walk.
        name: Dotted prefix used to build the qualified name of every child.
        depth_first: If True a module is visited after its children, otherwise before.
        include_root: Whether ``fn`` is also applied to ``module`` itself. The
            recursive calls always pass True, so every descendant is visited.

    Returns:
        The ``module`` that was passed in.
    """
    if not depth_first and include_root:
        fn(module=module, name=name)
    for child_name, child_module in module.named_children():
        child_name = ".".join((name, child_name)) if name else child_name
        named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
    if depth_first and include_root:
        fn(module=module, name=name)
    return module


class BlockChunk(nn.ModuleList):
    """A list of blocks that is executed sequentially when called."""

    def forward(self, x):
        for b in self:
            x = b(x)
        return x


class DinoVisionTransformer(nn.Module):
    """Vision transformer backbone producing class, register and patch tokens."""

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        ffn_bias=True,
        proj_bias=True,
        drop_path_rate=0.0,
        drop_path_uniform=False,
        init_values=None,  # for layerscale: None or 0 => no layerscale
        embed_layer=PatchEmbed,
        act_layer=nn.GELU,
        block_fn=Block,
        ffn_layer="mlp",
        block_chunks=1,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.1,
        qk_norm=False,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
            interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
            qk_norm (bool): forwarded unchanged to every block as ``qk_norm``

        Raises:
            NotImplementedError: if ``ffn_layer`` is not one of the supported names.
        """
        super().__init__()
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        # Activation checkpointing is off by default; it can be switched on by setting
        # `use_checkpoint = True` on the instance after construction.
        self.use_checkpoint = False
        # Read by the `checkpoint(...)` calls in forward_features*. It was previously
        # never assigned, so enabling `use_checkpoint` raised AttributeError. The
        # non-reentrant variant is the one recommended by PyTorch.
        self.use_reentrant = False

        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 1
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.num_register_tokens = num_register_tokens
        self.interpolate_antialias = interpolate_antialias
        self.interpolate_offset = interpolate_offset

        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        assert num_register_tokens >= 0
        self.register_tokens = (
            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
        )

        if drop_path_uniform is True:
            dpr = [drop_path_rate] * depth
        else:
            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        if ffn_layer == "mlp":
            logger.info("using MLP layer as FFN")
            ffn_layer = Mlp
        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
            logger.info("using SwiGLU layer as FFN")
            ffn_layer = SwiGLUFFNFused
        elif ffn_layer == "identity":
            logger.info("using Identity layer as FFN")

            def f(*args, **kwargs):
                return nn.Identity()

            ffn_layer = f
        else:
            raise NotImplementedError

        blocks_list = [
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                ffn_layer=ffn_layer,
                init_values=init_values,
                qk_norm=qk_norm,
            )
            for i in range(depth)
        ]
        if block_chunks > 0:
            self.chunked_blocks = True
            chunked_blocks = []
            chunksize = depth // block_chunks
            for i in range(0, depth, chunksize):
                # this is to keep the block index consistent if we chunk the block list
                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
        else:
            self.chunked_blocks = False
            self.blocks = nn.ModuleList(blocks_list)

        self.norm = norm_layer(embed_dim)
        self.head = nn.Identity()

        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))

        self.init_weights()

    def init_weights(self):
        """Initialise positional/class/register tokens and all ``nn.Linear`` layers."""
        trunc_normal_(self.pos_embed, std=0.02)
        nn.init.normal_(self.cls_token, std=1e-6)
        if self.register_tokens is not None:
            nn.init.normal_(self.register_tokens, std=1e-6)
        named_apply(init_weights_vit_timm, self)

    def interpolate_pos_encoding(self, x, w, h):
        """Return the positional embedding resized to the patch grid of a ``w`` x ``h`` input.

        Args:
            x: Token tensor (class token + patch tokens); only its shape and dtype are used.
            w: Size of the third dimension of the input image tensor.
            h: Size of the fourth dimension of the input image tensor.

        Returns:
            Positional embedding of shape ``(1, 1 + n_patches, dim)`` in the dtype of ``x``;
            the stored parameter itself when no resizing is needed.
        """
        previous_dtype = x.dtype
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        if npatch == N and w == h:
            return self.pos_embed
        pos_embed = self.pos_embed.float()
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]
        dim = x.shape[-1]
        w0 = w // self.patch_size
        h0 = h // self.patch_size
        M = int(math.sqrt(N))  # Recover the number of patches in each dimension
        assert N == M * M
        kwargs = {}
        if self.interpolate_offset:
            # Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
            # Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
            sx = float(w0 + self.interpolate_offset) / M
            sy = float(h0 + self.interpolate_offset) / M
            kwargs["scale_factor"] = (sx, sy)
        else:
            # Simply specify an output size instead of a scale factor
            kwargs["size"] = (w0, h0)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
            mode="bicubic",
            antialias=self.interpolate_antialias,
            **kwargs,
        )
        assert (w0, h0) == patch_pos_embed.shape[-2:]
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)

    def prepare_tokens_with_masks(self, x, masks=None):
        """Embed an image batch into the token sequence fed to the blocks.

        Patches are embedded, masked positions (where ``masks`` is True) are replaced
        by the mask token, the class token is prepended, positional embeddings are
        added and, if configured, register tokens are inserted right after the
        class token.
        """
        B, nc, w, h = x.shape
        x = self.patch_embed(x)
        if masks is not None:
            x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)

        x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
        x = x + self.interpolate_pos_encoding(x, w, h)

        if self.register_tokens is not None:
            x = torch.cat(
                (
                    x[:, :1],
                    self.register_tokens.expand(x.shape[0], -1, -1),
                    x[:, 1:],
                ),
                dim=1,
            )

        return x

    def forward_features_list(self, x_list, masks_list):
        """List variant of :meth:`forward_features`; returns one output dict per input."""
        x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]

        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint(blk, x, use_reentrant=self.use_reentrant)
            else:
                x = blk(x)

        all_x = x
        output = []
        for x, masks in zip(all_x, masks_list):
            x_norm = self.norm(x)
            output.append(
                {
                    "x_norm_clstoken": x_norm[:, 0],
                    "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
                    "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
                    "x_prenorm": x,
                    "masks": masks,
                }
            )
        return output

    def forward_features(self, x, masks=None):
        """Run the backbone and return normalised tokens split by role.

        Returns:
            Dict with the keys ``x_norm_clstoken``, ``x_norm_regtokens``,
            ``x_norm_patchtokens``, ``x_prenorm`` and ``masks`` (a list of such
            dicts when ``x`` is a list).
        """
        if isinstance(x, list):
            return self.forward_features_list(x, masks)

        x = self.prepare_tokens_with_masks(x, masks)

        for blk in self.blocks:
            if self.use_checkpoint:
                x = checkpoint(blk, x, use_reentrant=self.use_reentrant)
            else:
                x = blk(x)

        x_norm = self.norm(x)
        return {
            "x_norm_clstoken": x_norm[:, 0],
            "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
            "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
            "x_prenorm": x,
            "masks": masks,
        }

    def _get_intermediate_layers_not_chunked(self, x, n=1):
        """Collect the outputs of selected blocks when ``self.blocks`` is a flat list."""
        x = self.prepare_tokens_with_masks(x)
        # If n is an int, take the n last blocks. If it's a list, take them
        output, total_block_len = [], len(self.blocks)
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for i, blk in enumerate(self.blocks):
            x = blk(x)
            if i in blocks_to_take:
                output.append(x)
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def _get_intermediate_layers_chunked(self, x, n=1):
        """Collect the outputs of selected blocks when ``self.blocks`` holds chunks."""
        x = self.prepare_tokens_with_masks(x)
        # The last chunk is padded with identities up to the full depth, so its
        # length equals the total number of blocks.
        output, i, total_block_len = [], 0, len(self.blocks[-1])
        # If n is an int, take the n last blocks. If it's a list, take them
        blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
        for block_chunk in self.blocks:
            for blk in block_chunk[i:]:  # Passing the nn.Identity()
                x = blk(x)
                if i in blocks_to_take:
                    output.append(x)
                i += 1
        assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
        return output

    def get_intermediate_layers(
        self,
        x: torch.Tensor,
        n: Union[int, Sequence] = 1,  # Layers or n last layers to take
        reshape: bool = False,
        return_class_token: bool = False,
        norm=True,
    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
        """Return the patch tokens of selected blocks.

        Args:
            x: Input image batch.
            n: Number of last blocks to take, or an explicit collection of block indices.
            reshape: If True, reshape every output into a ``(B, C, rows, cols)`` feature map.
            return_class_token: If True, pair every output with its class token.
            norm: If True, apply the final norm layer to every selected output.

        Returns:
            Tuple with one entry per selected block: the patch tokens, or a
            ``(patch_tokens, class_token)`` pair when ``return_class_token`` is set.
        """
        if self.chunked_blocks:
            outputs = self._get_intermediate_layers_chunked(x, n)
        else:
            outputs = self._get_intermediate_layers_not_chunked(x, n)
        if norm:
            outputs = [self.norm(out) for out in outputs]
        class_tokens = [out[:, 0] for out in outputs]
        outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
        if reshape:
            B, _, w, h = x.shape
            outputs = [
                out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
                for out in outputs
            ]
        if return_class_token:
            return tuple(zip(outputs, class_tokens))
        return tuple(outputs)

    def forward(self, *args, is_training=True, **kwargs):
        """Return the feature dict when ``is_training`` is True, else the head output of the class token."""
        ret = self.forward_features(*args, **kwargs)
        if is_training:
            return ret
        else:
            return self.head(ret["x_norm_clstoken"])


def init_weights_vit_timm(module: nn.Module, name: str = ""):
    """ViT weight initialization, original timm impl (for reproducibility)"""
    if isinstance(module, nn.Linear):
        trunc_normal_(module.weight, std=0.02)
        if module.bias is not None:
            nn.init.zeros_(module.bias)


def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
    """Build a DinoVisionTransformer with embed_dim 384, depth 12 and 6 heads."""
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
    """Build a DinoVisionTransformer with embed_dim 768, depth 12 and 12 heads."""
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
    """Build a DinoVisionTransformer with embed_dim 1024, depth 24 and 16 heads."""
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model


def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
    """
    Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
    """
    model = DinoVisionTransformer(
        patch_size=patch_size,
        embed_dim=1536,
        depth=40,
        num_heads=24,
        mlp_ratio=4,
        block_fn=partial(Block, attn_class=MemEffAttention),
        num_register_tokens=num_register_tokens,
        **kwargs,
    )
    return model
logger = logging.getLogger(__name__)

_RESNET_MEAN = [0.485, 0.456, 0.406]
_RESNET_STD = [0.229, 0.224, 0.225]


class Aggregator(nn.Module):
    """
    The Aggregator applies alternating-attention over input frames,
    as described in VGGT: Visual Geometry Grounded Transformer.


    Args:
        img_size (int): Image size in pixels.
        patch_size (int): Size of each patch for PatchEmbed.
        embed_dim (int): Dimension of the token embeddings.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
        num_register_tokens (int): Number of register tokens.
        block_fn (nn.Module): The block type used for attention (Block by default).
        qkv_bias (bool): Whether to include bias in QKV projections.
        proj_bias (bool): Whether to include bias in the output projection.
        ffn_bias (bool): Whether to include bias in MLP layers.
        patch_embed (str): Type of patch embed. e.g., "conv" or "dinov2_vitl14_reg".
        aa_order (list[str]): The order of alternating attention. Defaults to
            ["frame", "global"] when omitted.
        aa_block_size (int): How many blocks to group under each attention type before switching. If not necessary, set to 1.
        qk_norm (bool): Whether to apply QK normalization.
        rope_freq (int): Base frequency for rotary embedding. -1 to disable.
        init_values (float): Init scale for layer scale.

    Raises:
        ValueError: If ``depth`` is not divisible by ``aa_block_size``.
    """

    def __init__(
        self,
        img_size=518,
        patch_size=14,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4.0,
        num_register_tokens=4,
        block_fn=Block,
        qkv_bias=True,
        proj_bias=True,
        ffn_bias=True,
        patch_embed="dinov2_vitl14_reg",
        aa_order=None,
        aa_block_size=1,
        qk_norm=True,
        rope_freq=100,
        init_values=0.01,
    ):
        super().__init__()

        # A list default would be one object shared by every instance; build a
        # fresh list per instance instead (and copy a caller-supplied one).
        aa_order = ["frame", "global"] if aa_order is None else list(aa_order)

        self.__build_patch_embed__(patch_embed, img_size, patch_size, num_register_tokens, embed_dim=embed_dim)

        # Initialize rotary position embedding if frequency > 0
        self.rope = RotaryPositionEmbedding2D(frequency=rope_freq) if rope_freq > 0 else None
        self.position_getter = PositionGetter() if self.rope is not None else None

        self.frame_blocks = nn.ModuleList(
            [
                block_fn(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    proj_bias=proj_bias,
                    ffn_bias=ffn_bias,
                    init_values=init_values,
                    qk_norm=qk_norm,
                    rope=self.rope,
                )
                for _ in range(depth)
            ]
        )

        self.global_blocks = nn.ModuleList(
            [
                block_fn(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    proj_bias=proj_bias,
                    ffn_bias=ffn_bias,
                    init_values=init_values,
                    qk_norm=qk_norm,
                    rope=self.rope,
                )
                for _ in range(depth)
            ]
        )

        self.depth = depth
        self.aa_order = aa_order
        self.patch_size = patch_size
        self.aa_block_size = aa_block_size

        # Validate that depth is divisible by aa_block_size
        if self.depth % self.aa_block_size != 0:
            raise ValueError(f"depth ({depth}) must be divisible by aa_block_size ({aa_block_size})")

        self.aa_block_num = self.depth // self.aa_block_size

        # Note: We have two camera tokens, one for the first frame and one for the rest
        # The same applies for register tokens
        self.camera_token = nn.Parameter(torch.randn(1, 2, 1, embed_dim))
        self.register_token = nn.Parameter(torch.randn(1, 2, num_register_tokens, embed_dim))

        # The patch tokens start after the camera and register tokens
        self.patch_start_idx = 1 + num_register_tokens

        # Initialize parameters with small values
        nn.init.normal_(self.camera_token, std=1e-6)
        nn.init.normal_(self.register_token, std=1e-6)

        # Register normalization constants as buffers
        for name, value in (
            ("_resnet_mean", _RESNET_MEAN),
            ("_resnet_std", _RESNET_STD),
        ):
            self.register_buffer(
                name,
                torch.FloatTensor(value).reshape(1, 1, 3, 1, 1),
                persistent=False,
            )

    def __build_patch_embed__(
        self,
        patch_embed,
        img_size,
        patch_size,
        num_register_tokens,
        interpolate_antialias=True,
        interpolate_offset=0.0,
        block_chunks=0,
        init_values=1.0,
        embed_dim=1024,
    ):
        """
        Build the patch embed layer. If 'conv', we use a
        simple PatchEmbed conv layer. Otherwise, we use a vision transformer.

        Note that ``embed_dim`` is only used by the conv variant; the vision
        transformer factories are called without it.

        Raises:
            KeyError: If ``patch_embed`` is neither a conv variant nor a known ViT name.
        """

        if "conv" in patch_embed:
            self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=3, embed_dim=embed_dim)
        else:
            vit_models = {
                "dinov2_vitl14_reg": vit_large,
                "dinov2_vitb14_reg": vit_base,
                "dinov2_vits14_reg": vit_small,
                "dinov2_vitg2_reg": vit_giant2,
            }

            self.patch_embed = vit_models[patch_embed](
                img_size=img_size,
                patch_size=patch_size,
                num_register_tokens=num_register_tokens,
                interpolate_antialias=interpolate_antialias,
                interpolate_offset=interpolate_offset,
                block_chunks=block_chunks,
                init_values=init_values,
            )

            # Disable gradient updates for mask token
            if hasattr(self.patch_embed, "mask_token"):
                self.patch_embed.mask_token.requires_grad_(False)

    def forward(
        self,
        images: torch.Tensor,
    ) -> Union[Tuple[List[torch.Tensor], int], Tuple[List[torch.Tensor], int, Dict]]:
        """
        Args:
            images (torch.Tensor): Input images with shape [B, S, 3, H, W], in range [0, 1].
                B: batch size, S: sequence length, 3: RGB channels, H: height, W: width

        Returns:
            (list[torch.Tensor], int):
                The list of outputs from the attention blocks (frame and global
                features concatenated on the last dimension, one entry per block),
                and the patch_start_idx indicating where patch tokens begin.

        Raises:
            ValueError: If ``aa_order`` contains an unknown attention type.
        """
        B, S, C_in, H, W = images.shape
        # Normalize images and reshape for patch embed
        images = (images - self._resnet_mean.to(images.device)) / self._resnet_std.to(images.device)

        # Reshape to [B*S, C, H, W] for patch embedding
        images = images.reshape(B * S, C_in, H, W)
        patch_tokens = self.patch_embed(images)

        # A vision-transformer patch embed returns a dict of token groups
        if isinstance(patch_tokens, dict):
            patch_tokens = patch_tokens["x_norm_patchtokens"]

        camera_token = slice_expand_and_flatten(self.camera_token, B, S)
        register_token = slice_expand_and_flatten(self.register_token, B, S)

        # Concatenate special tokens with patch tokens
        tokens = torch.cat([camera_token, register_token, patch_tokens], dim=1)

        pos = None
        if self.rope is not None:
            pos = self.position_getter(B * S, H // self.patch_size, W // self.patch_size, device=images.device)

            # Only touch `pos` when RoPE is enabled; with RoPE disabled it stays None
            # (shifting None used to raise a TypeError here).
            if self.patch_start_idx > 0:
                # do not use position embedding for special tokens (camera and register tokens)
                # so set pos to 0 for the special tokens and shift the patch positions by one
                pos = pos + 1
                pos_special = torch.zeros(
                    B * S, self.patch_start_idx, 2, device=images.device, dtype=pos.dtype
                )
                pos = torch.cat([pos_special, pos], dim=1)

        # P now counts the special tokens as well
        _, P, C = tokens.shape

        frame_idx = 0
        global_idx = 0
        output_list = []

        for _ in range(self.aa_block_num):
            for attn_type in self.aa_order:
                if attn_type == "frame":
                    tokens, frame_idx, frame_intermediates = self._process_frame_attention(
                        tokens, B, S, P, C, frame_idx, pos=pos
                    )
                elif attn_type == "global":
                    tokens, global_idx, global_intermediates = self._process_global_attention(
                        tokens, B, S, P, C, global_idx, pos=pos
                    )
                else:
                    raise ValueError(f"Unknown attention type: {attn_type}")

            # The concatenation needs both lists, i.e. aa_order must contain
            # "frame" as well as "global".
            for frame_inter, global_inter in zip(frame_intermediates, global_intermediates):
                # concat frame and global intermediates, [B x S x P x 2C]
                output_list.append(torch.cat([frame_inter, global_inter], dim=-1))

        return output_list, self.patch_start_idx

    def _process_frame_attention(self, tokens, B, S, P, C, frame_idx, pos=None):
        """
        Process frame attention blocks. We keep tokens in shape (B*S, P, C).

        Returns:
            The updated tokens, the index of the next frame block to run, and the
            per-block outputs reshaped to (B, S, P, C).
        """
        # If needed, reshape tokens or positions:
        if tokens.shape != (B * S, P, C):
            tokens = tokens.reshape(B, S, P, C).reshape(B * S, P, C)

        if pos is not None and pos.shape != (B * S, P, 2):
            pos = pos.reshape(B, S, P, 2).reshape(B * S, P, 2)

        intermediates = []

        # by default, self.aa_block_size=1, which processes one block at a time
        for _ in range(self.aa_block_size):
            tokens = self.frame_blocks[frame_idx](tokens, pos=pos)
            frame_idx += 1
            intermediates.append(tokens.reshape(B, S, P, C))

        return tokens, frame_idx, intermediates

    def _process_global_attention(self, tokens, B, S, P, C, global_idx, pos=None) -> Union[Tuple[torch.Tensor, int, List[torch.Tensor]], Tuple[torch.Tensor, int, List[torch.Tensor], List]]:
        """
        Process global attention blocks. We keep tokens in shape (B, S*P, C).

        Returns:
            The updated tokens, the index of the next global block to run, and the
            per-block outputs reshaped to (B, S, P, C).
        """

        if tokens.shape != (B, S * P, C):
            tokens = tokens.reshape(B, S, P, C).reshape(B, S * P, C)

        if pos is not None and pos.shape != (B, S * P, 2):
            pos = pos.reshape(B, S, P, 2).reshape(B, S * P, 2)

        intermediates = []

        for _ in range(self.aa_block_size):
            tokens = self.global_blocks[global_idx](tokens, pos=pos)

            global_idx += 1
            intermediates.append(tokens.reshape(B, S, P, C))

        return tokens, global_idx, intermediates


def slice_expand_and_flatten(token_tensor, B, S):
    """
    Processes specialized tokens with shape (1, 2, X, C) for multi-frame processing:
    1) Uses the first position (index=0) for the first frame only
    2) Uses the second position (index=1) for all remaining frames (S-1 frames)
    3) Expands both to match batch size B
    4) Concatenates to form (B, S, X, C) where each sequence has 1 first-position token
       followed by (S-1) second-position tokens
    5) Flattens to (B*S, X, C) for processing

    Returns:
        torch.Tensor: Processed tokens with shape (B*S, X, C)
    """

    # Slice out the "query" tokens => shape (1, 1, ...)
    query = token_tensor[:, 0:1, ...].expand(B, 1, *token_tensor.shape[2:])
    # Slice out the "other" tokens => shape (1, S-1, ...)
    others = token_tensor[:, 1:, ...].expand(B, S - 1, *token_tensor.shape[2:])
    # Concatenate => shape (B, S, ...)
    combined = torch.cat([query, others], dim=1)

    # Finally flatten => shape (B*S, ...)
    combined = combined.reshape(B * S, *combined.shape[2:])
    return combined
@dataclass
class VGGTOutput(ModelOutput):
    """Container returned by ``VGGT.forward`` and ``VGGT.inference``."""

    # One dict of per-frame predictions for each of the S input views.
    ress: Optional[List[dict]] = None
    # The ``views`` argument, handed back unchanged.
    views: Optional[torch.Tensor] = None


class VGGT(nn.Module, PyTorchModelHubMixin):
    """Aggregator backbone followed by camera, point, depth and track heads."""

    def __init__(self, img_size=518, patch_size=14, embed_dim=1024):
        super().__init__()

        self.aggregator = Aggregator(img_size=img_size, patch_size=patch_size, embed_dim=embed_dim)
        self.camera_head = CameraHead(dim_in=2 * embed_dim)
        self.point_head = DPTHead(dim_in=2 * embed_dim, output_dim=4, activation="inv_log", conf_activation="expp1")
        self.depth_head = DPTHead(dim_in=2 * embed_dim, output_dim=2, activation="exp", conf_activation="expp1")
        self.track_head = TrackHead(dim_in=2 * embed_dim, patch_size=patch_size)

    def forward(
        self,
        views,
        query_points: torch.Tensor = None,
    ):
        """
        Run the model and return per-frame predictions.

        Identical to :meth:`inference` except that ``camera_pose`` keeps only
        the first 7 entries of the pose encoding.

        Args:
            views: Sequence of S dicts. ``view["img"]`` is an image tensor of
                shape [B, 3, H, W] (or [3, H, W] without a batch dimension);
                an optional ``view["valid_mask"]`` is copied into the output.
            query_points (torch.Tensor, optional): Query points for tracking,
                shape [N, 2] or [B, N, 2]. Default: None

        Returns:
            VGGTOutput: ``ress`` is a list of S dicts with the keys
            ``pts3d_in_other_view`` [B, H, W, 3], ``conf`` [B, H, W],
            ``depth`` [B, H, W, 1], ``depth_conf`` [B, H, W],
            ``camera_pose`` [B, 7], plus ``valid_mask`` when the view has one
            and ``track`` [B, N, 2], ``vis`` [B, N], ``track_conf`` [B, N]
            when ``query_points`` is given. ``views`` is the input, unchanged.
        """
        return self._predict(views, query_points, pose_dims=7)

    def inference(
        self,
        views,
        query_points: torch.Tensor = None,
    ):
        """
        Run the model and return per-frame predictions with the full pose encoding.

        Takes the same arguments and returns the same structure as
        :meth:`forward`; the only difference is that ``camera_pose`` is the
        whole pose encoding ([B, 9] according to the original comments).
        """
        return self._predict(views, query_points, pose_dims=None)

    def _predict(self, views, query_points, pose_dims):
        """Shared body of forward/inference; ``pose_dims`` truncates camera_pose (None keeps all)."""
        images = torch.stack([view["img"] for view in views], dim=0)  # S B C H W, or S C H W

        # If without batch dimension, add it. This must happen before the
        # 5-index permute, which would otherwise fail on a 4-D tensor.
        if images.dim() == 4:
            images = images.unsqueeze(1)
        images = images.permute(1, 0, 2, 3, 4)  # B S C H W

        if query_points is not None and query_points.dim() == 2:
            query_points = query_points.unsqueeze(0)

        aggregated_tokens_list, patch_start_idx = self.aggregator(images)
        predictions = {}

        # The heads run with autocast disabled.
        # NOTE(review): SOURCE's indentation is collapsed; the track head is assumed
        # to sit inside this region together with the other heads - confirm.
        with torch.cuda.amp.autocast(enabled=False):
            if self.camera_head is not None:
                pose_enc_list = self.camera_head(aggregated_tokens_list)
                predictions["pose_enc"] = pose_enc_list[-1]  # pose encoding of the last iteration

            if self.depth_head is not None:
                depth, depth_conf = self.depth_head(
                    aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx
                )
                predictions["depth"] = depth
                predictions["depth_conf"] = depth_conf

            if self.point_head is not None:
                pts3d, pts3d_conf = self.point_head(
                    aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx
                )
                predictions["world_points"] = pts3d
                predictions["world_points_conf"] = pts3d_conf

            if self.track_head is not None and query_points is not None:
                track_list, vis, conf = self.track_head(
                    aggregated_tokens_list, images=images, patch_start_idx=patch_start_idx,
                    query_points=query_points
                )
                predictions["track"] = track_list[-1]  # track of the last iteration
                predictions["vis"] = vis
                predictions["conf"] = conf

        ress = []
        for s in range(images.shape[1]):
            camera_pose = predictions['pose_enc'][:, s]
            if pose_dims is not None:
                camera_pose = camera_pose[:, :pose_dims]

            res = {
                'pts3d_in_other_view': predictions['world_points'][:, s],  # [B, H, W, 3]
                'conf': predictions['world_points_conf'][:, s],  # [B, H, W]

                'depth': predictions['depth'][:, s],  # [B, H, W, 1]
                'depth_conf': predictions['depth_conf'][:, s],  # [B, H, W]

                'camera_pose': camera_pose,  # [B, pose_dims] or full encoding
            }
            if 'valid_mask' in views[s]:
                res['valid_mask'] = views[s]["valid_mask"]  # [B, H, W]
            if 'track' in predictions:
                res['track'] = predictions['track'][:, s]  # [B, N, 2]
                res['vis'] = predictions['vis'][:, s]  # [B, N]
                res['track_conf'] = predictions['conf'][:, s]
            ress.append(res)
        return VGGTOutput(ress=ress, views=views)  # [S] [B, C, H, W]
def get_image_augmentation(
    color_jitter: Optional[Dict[str, float]] = None,
    gray_scale: bool = True,
    gau_blur: bool = False
) -> Optional[transforms.Compose]:
    """Build the image augmentation pipeline.

    Args:
        color_jitter: Overrides for the color jitter settings. Missing keys
            fall back to brightness=0.5, contrast=0.5, saturation=0.5,
            hue=0.1 and p=0.9 (probability of applying the jitter).
            None means "use all defaults".
        gray_scale: Append a random grayscale transform (p=0.05).
        gau_blur: Append a randomly applied gaussian blur (p=0.05).

    Returns:
        A Compose of the selected transforms, or None if the list is empty.
    """
    jitter_cfg = {
        "brightness": 0.5,
        "contrast": 0.5,
        "saturation": 0.5,
        "hue": 0.1,
        "p": 0.9
    }
    # Caller-supplied values win over the defaults.
    if color_jitter is not None:
        jitter_cfg.update(color_jitter)

    jitter = transforms.ColorJitter(
        brightness=jitter_cfg["brightness"],
        contrast=jitter_cfg["contrast"],
        saturation=jitter_cfg["saturation"],
        hue=jitter_cfg["hue"],
    )
    pipeline = [transforms.RandomApply([jitter], p=jitter_cfg["p"])]

    if gray_scale:
        pipeline.append(transforms.RandomGrayscale(p=0.05))

    if gau_blur:
        blur = transforms.GaussianBlur(5, sigma=(0.1, 1.0))
        pipeline.append(transforms.RandomApply([blur], p=0.05))

    if not pipeline:
        return None
    return transforms.Compose(pipeline)
def check_and_fix_inf_nan(input_tensor, loss_name="default", hard_max=100):
    """
    Zero out inf/nan entries of a tensor and optionally clamp its values.

    Args:
        input_tensor (torch.Tensor): Tensor to sanitize; None is passed through.
        loss_name (str): Name used in the warning message.
        hard_max (float, optional): Values are clamped to [-hard_max, hard_max].
            None disables clamping. Defaults to 100.

    Returns:
        The sanitized tensor, or None if the input was None.
    """
    if input_tensor is None:
        return input_tensor

    bad_entries = torch.isnan(input_tensor) | torch.isinf(input_tensor)
    if bad_entries.any():
        logging.warning(f"Tensor {loss_name} contains inf or nan values. Replacing with zeros.")
        input_tensor = torch.where(bad_entries, torch.zeros_like(input_tensor), input_tensor)

    if hard_max is None:
        return input_tensor
    return torch.clamp(input_tensor, min=-hard_max, max=hard_max)


def get_resume_checkpoint(checkpoint_save_dir):
    """Return the path of ``checkpoint.pt`` inside the directory, or None if it is missing."""
    if g_pathmgr.isdir(checkpoint_save_dir):
        candidate = os.path.join(checkpoint_save_dir, "checkpoint.pt")
        if g_pathmgr.isfile(candidate):
            return candidate
    return None


class DurationMeter:
    """Holds a duration in seconds and prints it as days/hours/minutes."""

    def __init__(self, name, device, fmt=":f"):
        self.name = name
        self.device = device
        self.fmt = fmt
        self.val = 0

    def reset(self):
        self.val = 0

    def update(self, val):
        self.val = val

    def add(self, val):
        self.val += val

    def __str__(self):
        return f"{self.name}: {human_readable_time(self.val)}"


def human_readable_time(time_seconds):
    """Format a number of seconds as 'DDd HHh MMm'; leftover seconds are dropped."""
    whole_minutes = int(time_seconds) // 60
    whole_hours, minutes = divmod(whole_minutes, 60)
    days, hours = divmod(whole_hours, 24)
    return f"{days:02}d {hours:02}h {minutes:02}m"


class ProgressMeter:
    """Logs one progress line built from a batch counter and several meters."""

    def __init__(self, num_batches, meters, real_meters, prefix=""):
        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
        self.meters = meters
        self.real_meters = real_meters
        self.prefix = prefix

    def display(self, batch):
        """Log the '[batch/total]' header, every meter and every real_meter value."""
        parts = [self.prefix + self.batch_fmtstr.format(batch)]
        parts.extend(str(meter) for meter in self.meters)
        for name, meter in self.real_meters.items():
            # each real meter yields several named values via compute()
            parts.append(
                " | ".join(
                    f"{os.path.join(name, subname)}: {val:.4f}"
                    for subname, val in meter.compute().items()
                )
            )
        logging.info(" | ".join(parts))

    def _get_batch_fmtstr(self, num_batches):
        # pad the running batch index to the width of the total count
        width = len(str(num_batches // 1))
        slot = "{:" + str(width) + "d}"
        return "[" + slot + "/" + slot.format(num_batches) + "]"
to(self, device: torch.device, *args: Any, **kwargs: Any): + """Copy data to the specified device""" + ... + + +def _is_named_tuple(x) -> bool: + return isinstance(x, tuple) and hasattr(x, "_asdict") and hasattr(x, "_fields") + + +def copy_data_to_device(data, device: torch.device, *args: Any, **kwargs: Any): + """Function that recursively copies data to a torch.device. + + Args: + data: The data to copy to device + device: The device to which the data should be copied + args: positional arguments that will be passed to the `to` call + kwargs: keyword arguments that will be passed to the `to` call + + Returns: + The data on the correct device + """ + + if _is_named_tuple(data): + return type(data)( + **copy_data_to_device(data._asdict(), device, *args, **kwargs) + ) + elif isinstance(data, (list, tuple)): + return type(data)(copy_data_to_device(e, device, *args, **kwargs) for e in data) + elif isinstance(data, defaultdict): + return type(data)( + data.default_factory, + { + k: copy_data_to_device(v, device, *args, **kwargs) + for k, v in data.items() + }, + ) + elif isinstance(data, Mapping) and not is_dataclass(data): # handing FrameData-like things + return type(data)( + { + k: copy_data_to_device(v, device, *args, **kwargs) + for k, v in data.items() + } + ) + elif is_dataclass(data) and not isinstance(data, type): + new_data_class = type(data)( + **{ + field.name: copy_data_to_device( + getattr(data, field.name), device, *args, **kwargs + ) + for field in fields(data) + if field.init + } + ) + for field in fields(data): + if not field.init: + setattr( + new_data_class, + field.name, + copy_data_to_device( + getattr(data, field.name), device, *args, **kwargs + ), + ) + return new_data_class + elif isinstance(data, _CopyableData): + return data.to(device, *args, **kwargs) + return data + + + +def safe_makedirs(path: str): + if not path: + logging.warning("safe_makedirs called with an empty path. 
def set_seeds(seed_value, max_epochs, dist_rank):
    """
    Seed python's random, numpy and torch (plus CUDA when available).

    The seed actually used is ``(seed_value + dist_rank) * max_epochs`` and is
    logged, so every rank gets its own reproducible seed.
    """
    rank_seed = (seed_value + dist_rank) * max_epochs
    logging.info(f"GPU SEED: {rank_seed}")

    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(rank_seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(rank_seed)
    torch.cuda.manual_seed_all(rank_seed)  # for multi-GPU


def log_env_variables():
    """Log every environment variable as KEY=VALUE, sorted by key."""
    dump = "".join(f"{key}={os.environ[key]}\n" for key in sorted(os.environ.keys()))
    logging.info("Logging ENV_VARIABLES")
    logging.info(dump)


def is_dist_avail_and_initialized():
    """Return True only if torch.distributed is both available and initialized."""
    return bool(dist.is_available() and dist.is_initialized())
# Unit suffixes for thousands, millions, billions, trillions.
# The separator is U+202F (narrow no-break space), written as an escape so it
# cannot be confused with a regular space.
_UNITS = ('', '\u202fK', '\u202fM', '\u202fB', '\u202fT')


def pretty_int(n: int) -> str:
    """Abbreviate a non-negative integer (0 -> '0', 12_345 -> '12.3 K').

    Values below 1000 are printed in full. Larger values are scaled to the
    biggest fitting unit (capped at trillions) and shown with at most one
    decimal. If rounding pushes the scaled value to 1000 (e.g. 999_999),
    the next unit is used, so the result is '1 M' rather than '1000 K'.
    """
    assert n >= 0, 'pretty_int() expects a non-negative int'
    if n < 1_000:
        return f'{n:,}'
    top = len(_UNITS) - 1
    exp = min(int(math.log10(n) // 3), top)   # group of 3 digits, capped at trillions
    text = f'{n / 10 ** (3 * exp):.1f}'
    if text == '1000.0' and exp < top:
        # rounding overflowed the current unit: move up one unit
        exp += 1
        text = f'{n / 10 ** (3 * exp):.1f}'
    return text.rstrip('0').rstrip('.') + _UNITS[exp]


def model_summary(model: torch.nn.Module,
                  *,
                  log_file = None,
                  prefix: str = '') -> None:
    """
    Print / save a compact parameter summary.

    Args
    ----
    model    : The PyTorch nn.Module to inspect.
    log_file : Optional path - if given, `str(model)` is written there and
               per-parameter lists are written next to it as `trainable.txt`
               and `frozen.txt`.
    prefix   : Optional string printed at the beginning of every printed line
               (handy when several models share the same stdout).
    """
    if get_rank():          # only rank-0 prints
        return

    # --- counts -------------------------------------------------------------
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total     = sum(p.numel() for p in model.parameters())
    frozen    = total - trainable

    print(prefix + '='*60)
    print(prefix + f'Model type : {model.__class__.__name__}')
    print(prefix + f'Total      : {pretty_int(total)} parameters')
    print(prefix + f'  trainable: {pretty_int(trainable)}')
    print(prefix + f'  frozen   : {pretty_int(frozen)}')
    print(prefix + '='*60)

    # --- optional file dump -------------------------------------------------
    if log_file is None:
        return

    log_file = Path(log_file)
    log_file.write_text(str(model))         # full architecture

    # Build the name -> parameter map once; it is shared by both dumps.
    named = dict(model.named_parameters())

    def _dump(names: Iterable[str], fname: str):
        """Write a formatted per-parameter list to *log_file.with_name(fname)*."""
        with open(log_file.with_name(fname), 'w') as f:
            for n in names:
                p = named[n]
                shape = str(tuple(p.shape))
                f.write(f'{n:<60s} {shape:<20} {p.numel()}\n')

    _dump([n for n, p in named.items() if p.requires_grad], 'trainable.txt')
    _dump([n for n, p in named.items() if not p.requires_grad], 'frozen.txt')


def get_rank():
    """Return the torch.distributed rank, or 0 when distributed is not initialized."""
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()
def check_valid_tensor(input_tensor: Optional[torch.Tensor], name: str = "tensor") -> None:
    """
    Log a warning if a tensor contains NaN or Inf values.

    Args:
        input_tensor: The tensor to check; None is ignored
        name: Name of the tensor for logging purposes
    """
    if input_tensor is None:
        return
    if torch.isnan(input_tensor).any() or torch.isinf(input_tensor).any():
        logging.warning(f"NaN or Inf found in tensor: {name}")


def normalize_camera_extrinsics_and_points_batch(
    extrinsics: torch.Tensor,
    cam_points: Optional[torch.Tensor] = None,
    world_points: Optional[torch.Tensor] = None,
    depths: Optional[torch.Tensor] = None,
    scale_by_points: bool = True,
    point_masks: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
    """
    Normalize camera extrinsics and corresponding 3D points.

    The coordinate system is re-expressed relative to the first camera and,
    when ``scale_by_points`` is set, the scene is divided by the average
    distance of the valid world points.

    Args:
        extrinsics: Camera extrinsic matrices of shape (B, S, 3, 4)
        cam_points: 3D points in camera coordinates of shape (B, S, H, W, 3)
        world_points: 3D points in world coordinates of shape (B, S, H, W, 3)
        depths: Depth maps of shape (B, S, H, W)
        scale_by_points: Whether to normalize the scale based on point distances
        point_masks: Boolean masks for valid points of shape (B, S, H, W)

    Returns:
        Tuple containing:
            - Normalized camera extrinsics of shape (B, S, 3, 4)
            - Normalized camera points (None if cam_points is None)
            - Normalized world points (None if world_points is None)
            - Normalized depths (None if depths is None)
        With ``scale_by_points=False`` cam_points and depths are returned
        unchanged.

    Raises:
        ValueError: if ``scale_by_points`` is True but ``world_points`` or
            ``point_masks`` is missing (the scale cannot be computed).
    """
    # Validate inputs
    check_valid_tensor(extrinsics, "extrinsics")
    check_valid_tensor(cam_points, "cam_points")
    check_valid_tensor(world_points, "world_points")
    check_valid_tensor(depths, "depths")

    B, S, _, _ = extrinsics.shape
    device = extrinsics.device

    # Convert extrinsics to homogeneous form: (B, S, 4, 4)
    extrinsics_homog = torch.cat(
        [
            extrinsics,
            torch.zeros((B, S, 1, 4), device=device),
        ],
        dim=-2,
    )
    extrinsics_homog[:, :, -1, -1] = 1.0

    # Inverse of the first camera's extrinsic matrix,
    # which can be also viewed as the cam_to_world extrinsic matrix
    first_cam_extrinsic_inv = closed_form_inverse_se3(extrinsics_homog[:, 0])
    new_extrinsics = torch.matmul(extrinsics_homog, first_cam_extrinsic_inv.unsqueeze(1))  # (B, S, 4, 4)

    if world_points is not None:
        # The world points are moved into the first camera's coordinate system,
        # so the first camera's cam_from_world extrinsic is applied directly
        # instead of its inverse.
        R = extrinsics[:, 0, :3, :3]
        t = extrinsics[:, 0, :3, 3]
        new_world_points = (world_points @ R.transpose(-1, -2).unsqueeze(1).unsqueeze(2)) + t.unsqueeze(1).unsqueeze(2).unsqueeze(3)
    else:
        new_world_points = None

    if not scale_by_points:
        return new_extrinsics[:, :, :3], cam_points, new_world_points, depths

    if new_world_points is None or point_masks is None:
        raise ValueError("scale_by_points=True requires both world_points and point_masks")

    # Average distance of the valid points, per batch element.
    point_dist = new_world_points.norm(dim=-1)
    dist_sum = (point_dist * point_masks).sum(dim=[1, 2, 3])
    valid_count = point_masks.sum(dim=[1, 2, 3])
    avg_scale = (dist_sum / (valid_count + 1e-3)).clamp(min=1e-6, max=1e6)

    new_world_points = new_world_points / avg_scale.view(-1, 1, 1, 1, 1)
    new_extrinsics[:, :, :3, 3] = new_extrinsics[:, :, :3, 3] / avg_scale.view(-1, 1, 1)
    # depths and cam_points are optional: scale them only when present
    new_depths = None if depths is None else depths / avg_scale.view(-1, 1, 1, 1)
    new_cam_points = None if cam_points is None else cam_points / avg_scale.view(-1, 1, 1, 1, 1)

    new_extrinsics = new_extrinsics[:, :, :3]  # 4x4 -> 3x4
    new_extrinsics = check_and_fix_inf_nan(new_extrinsics, "new_extrinsics", hard_max=None)
    if new_cam_points is not None:
        new_cam_points = check_and_fix_inf_nan(new_cam_points, "new_cam_points", hard_max=None)
    new_world_points = check_and_fix_inf_nan(new_world_points, "new_world_points", hard_max=None)
    new_depths = check_and_fix_inf_nan(new_depths, "new_depths", hard_max=None)

    return new_extrinsics, new_cam_points, new_world_points, new_depths
def check_valid_tensor(input_tensor: Optional[torch.Tensor], name: str = "tensor") -> None:
    """
    Log a warning if a tensor contains NaN or Inf values.

    Args:
        input_tensor: The tensor to check; None is ignored
        name: Name of the tensor for logging purposes
    """
    if input_tensor is None:
        return
    if torch.isnan(input_tensor).any() or torch.isinf(input_tensor).any():
        logging.warning(f"NaN or Inf found in tensor: {name}")


def normalize_camera_extrinsics_and_points_batch(
    extrinsics: torch.Tensor,
    cam_points: Optional[torch.Tensor] = None,
    world_points: Optional[torch.Tensor] = None,
    depths: Optional[torch.Tensor] = None,
    scale_by_points: bool = True,
    point_masks: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
    """
    Normalize camera extrinsics and corresponding 3D points.

    The coordinate system is re-expressed relative to the first camera and,
    when ``scale_by_points`` is set, the scene is divided by the average
    distance of the valid world points.

    Args:
        extrinsics: Camera extrinsic matrices of shape (B, S, 3, 4)
        cam_points: 3D points in camera coordinates of shape (B, S, H, W, 3)
        world_points: 3D points in world coordinates of shape (B, S, H, W, 3)
        depths: Depth maps of shape (B, S, H, W)
        scale_by_points: Whether to normalize the scale based on point distances
        point_masks: Boolean masks for valid points of shape (B, S, H, W)

    Returns:
        Tuple containing:
            - Normalized camera extrinsics of shape (B, S, 3, 4)
            - Normalized camera points (None if cam_points is None)
            - Normalized world points (None if world_points is None)
            - Normalized depths (None if depths is None)
        With ``scale_by_points=False`` cam_points and depths are returned
        unchanged.

    Raises:
        ValueError: if ``scale_by_points`` is True but ``world_points`` or
            ``point_masks`` is missing (the scale cannot be computed).
    """
    # Validate inputs
    check_valid_tensor(extrinsics, "extrinsics")
    check_valid_tensor(cam_points, "cam_points")
    check_valid_tensor(world_points, "world_points")
    check_valid_tensor(depths, "depths")

    B, S, _, _ = extrinsics.shape
    device = extrinsics.device

    # Convert extrinsics to homogeneous form: (B, S, 4, 4)
    extrinsics_homog = torch.cat(
        [
            extrinsics,
            torch.zeros((B, S, 1, 4), device=device),
        ],
        dim=-2,
    )
    extrinsics_homog[:, :, -1, -1] = 1.0

    # Inverse of the first camera's extrinsic matrix,
    # which can be also viewed as the cam_to_world extrinsic matrix
    first_cam_extrinsic_inv = closed_form_inverse_se3(extrinsics_homog[:, 0])
    new_extrinsics = torch.matmul(extrinsics_homog, first_cam_extrinsic_inv.unsqueeze(1))  # (B, S, 4, 4)

    if world_points is not None:
        # The world points are moved into the first camera's coordinate system,
        # so the first camera's cam_from_world extrinsic is applied directly
        # instead of its inverse.
        R = extrinsics[:, 0, :3, :3]
        t = extrinsics[:, 0, :3, 3]
        new_world_points = (world_points @ R.transpose(-1, -2).unsqueeze(1).unsqueeze(2)) + t.unsqueeze(1).unsqueeze(2).unsqueeze(3)
    else:
        new_world_points = None

    if not scale_by_points:
        return new_extrinsics[:, :, :3], cam_points, new_world_points, depths

    if new_world_points is None or point_masks is None:
        raise ValueError("scale_by_points=True requires both world_points and point_masks")

    # Average distance of the valid points, per batch element.
    point_dist = new_world_points.norm(dim=-1)
    dist_sum = (point_dist * point_masks).sum(dim=[1, 2, 3])
    valid_count = point_masks.sum(dim=[1, 2, 3])
    avg_scale = (dist_sum / (valid_count + 1e-3)).clamp(min=1e-6, max=1e6)

    new_world_points = new_world_points / avg_scale.view(-1, 1, 1, 1, 1)
    new_extrinsics[:, :, :3, 3] = new_extrinsics[:, :, :3, 3] / avg_scale.view(-1, 1, 1)
    # depths and cam_points are optional: scale them only when present
    new_depths = None if depths is None else depths / avg_scale.view(-1, 1, 1, 1)
    new_cam_points = None if cam_points is None else cam_points / avg_scale.view(-1, 1, 1, 1, 1)

    new_extrinsics = new_extrinsics[:, :, :3]  # 4x4 -> 3x4
    new_extrinsics = check_and_fix_inf_nan(new_extrinsics, "new_extrinsics", hard_max=None)
    if new_cam_points is not None:
        new_cam_points = check_and_fix_inf_nan(new_cam_points, "new_cam_points", hard_max=None)
    new_world_points = check_and_fix_inf_nan(new_world_points, "new_world_points", hard_max=None)
    new_depths = check_and_fix_inf_nan(new_depths, "new_depths", hard_max=None)

    return new_extrinsics, new_cam_points, new_world_points, new_depths
def unproject_depth_map_to_point_map(
    depth_map: np.ndarray, extrinsics_cam: np.ndarray, intrinsics_cam: np.ndarray
) -> np.ndarray:
    """
    Unproject a batch of depth maps to 3D world coordinates.

    Args:
        depth_map (np.ndarray): Batch of depth maps of shape (S, H, W, 1) or (S, H, W).
        extrinsics_cam (np.ndarray): Batch of camera extrinsic matrices of shape (S, 3, 4).
        intrinsics_cam (np.ndarray): Batch of camera intrinsic matrices of shape (S, 3, 3).

        Any of the three may also be a torch.Tensor; it is detached, moved to
        the CPU and converted to NumPy first.

    Returns:
        np.ndarray: Batch of 3D world coordinates of shape (S, H, W, 3).

    Raises:
        ValueError: If `depth_map` is neither (S, H, W) nor (S, H, W, 1).
    """
    # detach() lets tensors that require grad be converted as well.
    if isinstance(depth_map, torch.Tensor):
        depth_map = depth_map.detach().cpu().numpy()
    if isinstance(extrinsics_cam, torch.Tensor):
        extrinsics_cam = extrinsics_cam.detach().cpu().numpy()
    if isinstance(intrinsics_cam, torch.Tensor):
        intrinsics_cam = intrinsics_cam.detach().cpu().numpy()

    # Drop the optional trailing channel axis explicitly.  A blind
    # `.squeeze(-1)` on an (H, W) frame raises whenever W != 1, which made the
    # documented (S, H, W) input unusable.
    if depth_map.ndim == 4 and depth_map.shape[-1] == 1:
        depth_map = depth_map[..., 0]
    if depth_map.ndim != 3:
        raise ValueError(
            f"depth_map must be of shape (S, H, W) or (S, H, W, 1), got {depth_map.shape}."
        )

    world_points_list = [
        depth_to_world_coords_points(
            depth_map[frame_idx], extrinsics_cam[frame_idx], intrinsics_cam[frame_idx]
        )[0]
        for frame_idx in range(depth_map.shape[0])
    ]
    return np.stack(world_points_list, axis=0)


def depth_to_world_coords_points(
    depth_map: np.ndarray,
    extrinsic: np.ndarray,
    intrinsic: np.ndarray,
    eps=1e-8,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Convert a depth map to world coordinates.

    Args:
        depth_map (np.ndarray): Depth map of shape (H, W).
        extrinsic (np.ndarray): Camera extrinsic matrix of shape (3, 4).
            OpenCV camera coordinate convention, cam from world.
        intrinsic (np.ndarray): Camera intrinsic matrix of shape (3, 3).
        eps (float): Depths must be strictly greater than this to be marked valid.

    Returns:
        tuple[np.ndarray, np.ndarray, np.ndarray]:
            - world coordinates of shape (H, W, 3),
            - camera coordinates of shape (H, W, 3),
            - boolean valid-depth mask of shape (H, W).
        All three are None when `depth_map` is None.
    """
    if depth_map is None:
        return None, None, None

    # Valid depth mask
    point_mask = depth_map > eps

    # Convert depth map to camera coordinates
    cam_coords_points = depth_to_cam_coords_points(depth_map, intrinsic)

    # Invert the cam-from-world extrinsic to get cam-to-world.  The inverse
    # helper is batched, hence the temporary leading axis.
    cam_to_world_extrinsic = closed_form_inverse_se3(extrinsic[None])[0]

    R_cam_to_world = cam_to_world_extrinsic[:3, :3]
    t_cam_to_world = cam_to_world_extrinsic[:3, 3]

    # p_world = R p_cam + t, written for row vectors: (H, W, 3) @ (3, 3) -> (H, W, 3)
    world_coords_points = np.dot(cam_coords_points, R_cam_to_world.T) + t_cam_to_world

    return world_coords_points, cam_coords_points, point_mask


def depth_to_cam_coords_points(depth_map: np.ndarray, intrinsic: np.ndarray) -> np.ndarray:
    """
    Convert a depth map to camera coordinates.

    Args:
        depth_map (np.ndarray): Depth map of shape (H, W).
        intrinsic (np.ndarray): Camera intrinsic matrix of shape (3, 3) with zero skew.

    Returns:
        np.ndarray: Camera coordinates of shape (H, W, 3), dtype float32.
    """
    H, W = depth_map.shape
    assert intrinsic.shape == (3, 3), "Intrinsic matrix must be 3x3"
    assert intrinsic[0, 1] == 0 and intrinsic[1, 0] == 0, "Intrinsic matrix must have zero skew"

    # Intrinsic parameters
    fu, fv = intrinsic[0, 0], intrinsic[1, 1]
    cu, cv = intrinsic[0, 2], intrinsic[1, 2]

    # Pixel grid: u indexes columns (x), v indexes rows (y); both are (H, W).
    u, v = np.meshgrid(np.arange(W), np.arange(H))

    # Pinhole unprojection
    x_cam = (u - cu) * depth_map / fu
    y_cam = (v - cv) * depth_map / fv
    z_cam = depth_map

    return np.stack((x_cam, y_cam, z_cam), axis=-1).astype(np.float32)


def closed_form_inverse_se3(se3, R=None, T=None):
    """
    Compute the inverse of each 4x4 (or 3x4) SE3 matrix in a batch.

    For [R | t] the inverse is [R^T | -R^T t], so no general matrix inversion
    is needed.

    If `R` and `T` are provided, they must correspond to the rotation and
    translation components of `se3`. Otherwise, they are extracted from `se3`.

    Args:
        se3: (N, 4, 4) or (N, 3, 4) array or tensor of SE3 matrices.
        R (optional): (N, 3, 3) array or tensor of rotation matrices.
        T (optional): (N, 3, 1) array or tensor of translation vectors.

    Returns:
        (N, 4, 4) inverted SE3 matrices. A NumPy input yields a float64 NumPy
        array; a torch input yields a tensor with the dtype and device of `R`.

    Raises:
        ValueError: If `se3` is not of shape (N, 4, 4) or (N, 3, 4).
    """
    is_numpy = isinstance(se3, np.ndarray)

    # The slicing below assumes exactly one batch axis, so reject other ranks
    # up front instead of failing later with an unrelated indexing error.
    if se3.ndim != 3 or tuple(se3.shape[-2:]) not in ((4, 4), (3, 4)):
        raise ValueError(f"se3 must be of shape (N,4,4) or (N,3,4), got {se3.shape}.")

    # Extract R and T if not provided
    if R is None:
        R = se3[:, :3, :3]  # (N,3,3)
    if T is None:
        T = se3[:, :3, 3:]  # (N,3,1)

    if is_numpy:
        R_transposed = np.transpose(R, (0, 2, 1))
        top_right = -np.matmul(R_transposed, T)  # -R^T t
        inverted_matrix = np.tile(np.eye(4), (len(R), 1, 1))
    else:
        R_transposed = R.transpose(1, 2)  # (N,3,3)
        top_right = -torch.bmm(R_transposed, T)  # (N,3,1)
        # Build the identity directly with the target dtype/device.
        inverted_matrix = torch.eye(4, dtype=R.dtype, device=R.device)[None].repeat(len(R), 1, 1)

    inverted_matrix[:, :3, :3] = R_transposed
    inverted_matrix[:, :3, 3:] = top_right

    return inverted_matrix
+ + Args: + image_path_list (list): List of paths to image files + mode (str, optional): Preprocessing mode, either "crop" or "pad". + - "crop" (default): Sets width to 518px and center crops height if needed. + - "pad": Preserves all pixels by making the largest dimension 518px + and padding the smaller dimension to reach a square shape. + + Returns: + torch.Tensor: Batched tensor of preprocessed images with shape (N, 3, H, W) + + Raises: + ValueError: If the input list is empty or if mode is invalid + + Notes: + - Images with different dimensions will be padded with white (value=1.0) + - A warning is printed when images have different shapes + - When mode="crop": The function ensures width=518px while maintaining aspect ratio + and height is center-cropped if larger than 518px + - When mode="pad": The function ensures the largest dimension is 518px while maintaining aspect ratio + and the smaller dimension is padded to reach a square shape (518x518) + - Dimensions are adjusted to be divisible by 14 for compatibility with model requirements + """ + # Check for empty list + if len(image_path_list) == 0: + raise ValueError("At least 1 image is required") + + # Validate mode + if mode not in ["crop", "pad"]: + raise ValueError("Mode must be either 'crop' or 'pad'") + + images = [] + shapes = set() + to_tensor = TF.ToTensor() + target_size = 224 + + # First process all images and collect their shapes + for image_path in image_path_list: + + # Open image + img = Image.open(image_path) + + # If there's an alpha channel, blend onto white background: + if img.mode == "RGBA": + # Create white background + background = Image.new("RGBA", img.size, (255, 255, 255, 255)) + # Alpha composite onto the white background + img = Image.alpha_composite(background, img) + + # Now convert to "RGB" (this step assigns white for transparent areas) + img = img.convert("RGB") + + width, height = img.size + + if mode == "pad": + # Make the largest dimension 518px while maintaining aspect 
ratio + if width >= height: + new_width = target_size + new_height = round(height * (new_width / width) / 14) * 14 # Make divisible by 14 + else: + new_height = target_size + new_width = round(width * (new_height / height) / 14) * 14 # Make divisible by 14 + else: # mode == "crop" + # Original behavior: set width to 518px + new_width = target_size + # Calcu late height maintaining aspect ratio, divisible by 14 + # new_height = round(height * (new_width / width) / 14) * 14 + new_height = target_size + + # Resize with new dimensions (width, height) + img = img.resize((new_width, new_height), Image.Resampling.BICUBIC) + img = to_tensor(img) # Convert to tensor (0, 1) + + # Center crop height if it's larger than 518 (only in crop mode) + if mode == "crop" and new_height > target_size: + start_y = (new_height - target_size) // 2 + img = img[:, start_y: start_y + target_size, :] + + # For pad mode, pad to make a square of target_size x target_size + if mode == "pad": + h_padding = target_size - img.shape[1] + w_padding = target_size - img.shape[2] + + if h_padding > 0 or w_padding > 0: + pad_top = h_padding // 2 + pad_bottom = h_padding - pad_top + pad_left = w_padding // 2 + pad_right = w_padding - pad_left + + # Pad with white (value=1.0) + img = torch.nn.functional.pad( + img, (pad_left, pad_right, pad_top, pad_bottom), mode="constant", value=1.0 + ) + + shapes.add((img.shape[1], img.shape[2])) + images.append(img) + + # Check if we have different shapes + # In theory our model can also work well with different shapes + if len(shapes) > 1: + print(f"Warning: Found images with different shapes: {shapes}") + # Find maximum dimensions + max_height = max(shape[0] for shape in shapes) + max_width = max(shape[1] for shape in shapes) + + # Pad images if necessary + padded_images = [] + for img in images: + h_padding = max_height - img.shape[1] + w_padding = max_width - img.shape[2] + + if h_padding > 0 or w_padding > 0: + pad_top = h_padding // 2 + pad_bottom = h_padding - 
def extri_intri_to_pose_encoding(
    extrinsics,
    intrinsics,
    image_size_hw=None,  # e.g., (256, 512)
    pose_encoding_type="absT_quaR_FoV",
):
    """Convert camera extrinsics and intrinsics to a compact pose encoding.

    Args:
        extrinsics (torch.Tensor): Camera extrinsics of shape (..., 3, 4),
            typically BxSx3x4, in the format [R|t] (camera from world,
            OpenCV convention).
        intrinsics (torch.Tensor): Camera intrinsics of shape (..., 3, 3),
            defined in pixels:
                [[fx, 0, cx],
                 [0, fy, cy],
                 [0,  0,  1]]
        image_size_hw (tuple): (height, width) of the image in pixels.
            Required: it is used to compute the field of view.
        pose_encoding_type (str): Only "absT_quaR_FoV" is supported
            (absolute translation, quaternion rotation, field of view).

    Returns:
        torch.Tensor: float32 pose encoding of shape (..., 9):
            - [:3]  = absolute translation vector T
            - [3:7] = rotation as quaternion (as returned by `mat_to_quat`)
            - [7:]  = field of view (fov_h, fov_w)

    Raises:
        NotImplementedError: For any other `pose_encoding_type`.
        ValueError: If `image_size_hw` is None.
    """
    if pose_encoding_type != "absT_quaR_FoV":
        raise NotImplementedError(f"Unsupported pose_encoding_type: {pose_encoding_type!r}")
    if image_size_hw is None:
        # Fail with a message that names the missing argument rather than an
        # opaque "cannot unpack NoneType" error.
        raise ValueError("image_size_hw=(H, W) is required to compute the field of view")

    # Ellipsis indexing keeps this valid for any number of leading batch axes.
    R = extrinsics[..., :3, :3]
    T = extrinsics[..., :3, 3]

    quat = mat_to_quat(R)
    # Note the order of h and w here
    H, W = image_size_hw
    fov_h = 2 * torch.atan((H / 2) / intrinsics[..., 1, 1])
    fov_w = 2 * torch.atan((W / 2) / intrinsics[..., 0, 0])
    pose_encoding = torch.cat([T, quat, fov_h[..., None], fov_w[..., None]], dim=-1).float()

    return pose_encoding


def pose_encoding_to_extri_intri(
    pose_encoding,
    image_size_hw=None,  # e.g., (256, 512)
    pose_encoding_type="absT_quaR_FoV",
    build_intrinsics=True,
):
    """Convert a pose encoding back to camera extrinsics and intrinsics.

    Inverse of `extri_intri_to_pose_encoding`.

    Args:
        pose_encoding (torch.Tensor): Pose encoding of shape (..., 9),
            typically BxSx9:
            - [:3]  = absolute translation vector T
            - [3:7] = rotation as quaternion
            - [7:]  = field of view (fov_h, fov_w)
        image_size_hw (tuple): (height, width) of the image in pixels.
            Required when `build_intrinsics` is True.
        pose_encoding_type (str): Only "absT_quaR_FoV" is supported.
        build_intrinsics (bool): Whether to reconstruct the intrinsics matrix.

    Returns:
        tuple: (extrinsics, intrinsics)
            - extrinsics (torch.Tensor): shape (..., 3, 4), format [R|t].
            - intrinsics (torch.Tensor or None): shape (..., 3, 3) with the
              principal point at the image centre (W/2, H/2), or None if
              `build_intrinsics` is False.

    Raises:
        NotImplementedError: For any other `pose_encoding_type`.
        ValueError: If `build_intrinsics` is True and `image_size_hw` is None.
    """
    if pose_encoding_type != "absT_quaR_FoV":
        raise NotImplementedError(f"Unsupported pose_encoding_type: {pose_encoding_type!r}")

    intrinsics = None

    T = pose_encoding[..., :3]
    quat = pose_encoding[..., 3:7]
    fov_h = pose_encoding[..., 7]
    fov_w = pose_encoding[..., 8]

    R = quat_to_mat(quat)
    extrinsics = torch.cat([R, T[..., None]], dim=-1)

    if build_intrinsics:
        if image_size_hw is None:
            raise ValueError("image_size_hw=(H, W) is required when build_intrinsics=True")
        H, W = image_size_hw
        fy = (H / 2.0) / torch.tan(fov_h / 2.0)
        fx = (W / 2.0) / torch.tan(fov_w / 2.0)
        # Use every leading axis (shape[:-1]) rather than assuming exactly
        # (B, S), so the intrinsics always line up with fx / fy.
        intrinsics = torch.zeros(pose_encoding.shape[:-1] + (3, 3), device=pose_encoding.device)
        intrinsics[..., 0, 0] = fx
        intrinsics[..., 1, 1] = fy
        intrinsics[..., 0, 2] = W / 2
        intrinsics[..., 1, 2] = H / 2
        intrinsics[..., 2, 2] = 1.0  # Set the homogeneous coordinate to 1

    return extrinsics, intrinsics
def mat_to_quat(matrix: torch.Tensor) -> torch.Tensor:
    """
    Turn rotation matrices into scalar-last (XYZW / ijkr) quaternions.

    Args:
        matrix: Rotation matrices as tensor of shape (..., 3, 3).

    Returns:
        Quaternions of shape (..., 4), real part last, standardized so the
        real part is non-negative.

    Raises:
        ValueError: If the last two dimensions are not 3x3.
    """
    if not (matrix.size(-1) == 3 and matrix.size(-2) == 3):
        raise ValueError(f"Invalid rotation matrix shape {matrix.shape}.")

    batch_dim = matrix.shape[:-2]
    m00, m01, m02 = matrix[..., 0, 0], matrix[..., 0, 1], matrix[..., 0, 2]
    m10, m11, m12 = matrix[..., 1, 0], matrix[..., 1, 1], matrix[..., 1, 2]
    m20, m21, m22 = matrix[..., 2, 0], matrix[..., 2, 1], matrix[..., 2, 2]

    # Twice the magnitude of each of r, i, j, k (clamped at zero before sqrt).
    traces = torch.stack(
        [
            1.0 + m00 + m11 + m22,
            1.0 + m00 - m11 - m22,
            1.0 - m00 + m11 - m22,
            1.0 - m00 - m11 + m22,
        ],
        dim=-1,
    )
    q_abs = _sqrt_positive_part(traces)

    # Four candidate quaternions (in r, i, j, k order), each scaled by one of
    # r, i, j, k respectively.
    scaled_by_r = torch.stack([q_abs[..., 0] ** 2, m21 - m12, m02 - m20, m10 - m01], dim=-1)
    scaled_by_i = torch.stack([m21 - m12, q_abs[..., 1] ** 2, m10 + m01, m02 + m20], dim=-1)
    scaled_by_j = torch.stack([m02 - m20, m10 + m01, q_abs[..., 2] ** 2, m12 + m21], dim=-1)
    scaled_by_k = torch.stack([m10 - m01, m20 + m02, m21 + m12, q_abs[..., 3] ** 2], dim=-1)
    quat_by_rijk = torch.stack([scaled_by_r, scaled_by_i, scaled_by_j, scaled_by_k], dim=-2)

    # Floor the divisor at 0.1; the exact level is irrelevant because a
    # candidate with a small q_abs is never the one selected below.
    flr = torch.tensor(0.1).to(dtype=q_abs.dtype, device=q_abs.device)
    quat_candidates = quat_by_rijk / (2.0 * torch.maximum(q_abs[..., None], flr))

    # All candidates agree up to sign in exact arithmetic; take the
    # best-conditioned one, i.e. the one with the largest denominator.
    best = q_abs.argmax(dim=-1)
    pick = best[..., None, None].expand(batch_dim + (1, 4))
    rijk = torch.gather(quat_candidates, -2, pick).squeeze(-2)

    # rijk -> ijkr (move the real part to the end)
    ijkr = torch.cat([rijk[..., 1:], rijk[..., :1]], dim=-1)

    return standardize_quaternion(ijkr)


def _sqrt_positive_part(x: torch.Tensor) -> torch.Tensor:
    """
    Compute sqrt(max(0, x)) elementwise, with a zero subgradient where x is 0.
    """
    is_positive = x > 0
    if not torch.is_grad_enabled():
        return torch.where(is_positive, torch.sqrt(x), torch.zeros_like(x))
    # With autograd on, only the positive entries pass through sqrt, so the
    # remaining entries receive no gradient.
    result = torch.zeros_like(x)
    result[is_positive] = torch.sqrt(x[is_positive])
    return result


def standardize_quaternion(quaternions: torch.Tensor) -> torch.Tensor:
    """
    Flip the sign of quaternions whose real (last) component is negative.

    Args:
        quaternions: Quaternions with real part last, shape (..., 4).

    Returns:
        Quaternions of shape (..., 4) with a non-negative real part.
    """
    real_is_negative = quaternions[..., 3:4] < 0
    return torch.where(real_is_negative, -quaternions, quaternions)
def color_from_xy(x, y, W, H, cmap_name="hsv"):
    """
    Map (x, y) -> color in (R, G, B).
      1) Normalize x, y by (W - 1) and (H - 1).
      2) Combine them into a single scalar c = (x_norm + y_norm) / 2.
      3) Use a matplotlib colormap to convert c -> (R, G, B).

    Args:
        x, y: Pixel coordinates.
        W, H: Image width and height used for normalization.
        cmap_name: Name of a registered matplotlib colormap.

    Returns:
        Tuple (r, g, b) taken from the colormap's RGBA output, RGB order.

    Raises:
        ValueError: If `cmap_name` is not a registered colormap.
    """
    import matplotlib

    x_norm = x / max(W - 1, 1)
    y_norm = y / max(H - 1, 1)
    # Simple combination:
    c = (x_norm + y_norm) / 2.0

    # `matplotlib.cm.get_cmap` was removed in Matplotlib 3.9; the
    # `matplotlib.colormaps` registry is its replacement.  Fall back to the
    # old function on releases that predate the registry.
    registry = getattr(matplotlib, "colormaps", None)
    if registry is not None:
        try:
            cmap = registry[cmap_name]
        except KeyError:
            # Keep the exception type get_cmap used for unknown names.
            raise ValueError(f"Unknown colormap name: {cmap_name!r}") from None
    else:
        import matplotlib.cm

        cmap = matplotlib.cm.get_cmap(cmap_name)

    # cmap(c) -> (r, g, b, a)
    rgba = cmap(c)
    return (rgba[0], rgba[1], rgba[2])


def get_track_colors_by_position(tracks_b, vis_mask_b=None, image_width=None, image_height=None, cmap_name="hsv"):
    """
    Given all tracks in one sample, compute an (N, 3) array of RGB colors.
    Each track's color is determined by its (x, y) position in the first
    frame where it is visible.

    Args:
        tracks_b: Tensor of shape (S, N, 2). (x, y) for each track in each frame.
        vis_mask_b: (S, N) boolean mask; if None, all tracks are treated as visible.
        image_width, image_height: used for normalizing (x, y).
        cmap_name: matplotlib colormap name (e.g., 'hsv', 'rainbow', 'jet').

    Returns:
        track_colors: np.ndarray of shape (N, 3), dtype uint8, rows are (R, G, B).
        Tracks that are never visible stay black.
    """
    S, N, _ = tracks_b.shape
    track_colors = np.zeros((N, 3), dtype=np.uint8)

    if vis_mask_b is None:
        # treat all as visible
        vis_mask_b = torch.ones(S, N, dtype=torch.bool, device=tracks_b.device)

    for i in range(N):
        # Find first visible frame for track i
        visible_frames = torch.where(vis_mask_b[:, i])[0]
        if len(visible_frames) == 0:
            # never visible: keep the zero-initialized (black) entry
            continue

        first_s = int(visible_frames[0].item())
        x, y = tracks_b[first_s, i].tolist()

        r, g, b = color_from_xy(x, y, W=image_width, H=image_height, cmap_name=cmap_name)
        # scale to [0,255]
        track_colors[i] = (int(r * 255), int(g * 255), int(b * 255))

    return track_colors


def visualize_tracks_on_images(
    images,
    tracks,
    track_vis_mask=None,
    out_dir="track_visuals_concat_by_xy",
    image_format="CHW",  # "CHW" or "HWC"
    normalize_mode="[0,1]",
    cmap_name="hsv",  # e.g. "hsv", "rainbow", "jet"
    frames_per_row=4,  # New parameter for grid layout
    save_grid=True,  # Flag to control whether to save the grid image
):
    """
    Draw tracks on every frame, save each frame as a PNG and optionally save
    all frames tiled in a grid.

    Each track's color is determined by its (x, y) position in the first
    visible frame (or frame 0 if always visible).

    Args:
        images: torch.Tensor (S, 3, H, W) if CHW or (S, H, W, 3) if HWC.
        tracks: torch.Tensor (S, N, 2), last dim = (x, y). A 4-D `tracks`
            has its leading axis squeezed, together with `images` and
            `track_vis_mask`.
        track_vis_mask: torch.Tensor (S, N) or None.
        out_dir: folder to save visualizations (created if missing).
        image_format: "CHW" or "HWC".
        normalize_mode: "[0,1]", "[-1,1]", or None for direct raw -> 0..255
        cmap_name: a matplotlib colormap name for color_from_xy.
        frames_per_row: number of frames to display in each row of the grid.
        save_grid: whether to save all frames in one grid image.

    Returns:
        None (saves images in out_dir).
    """

    if len(tracks.shape) == 4:
        tracks = tracks.squeeze(0)
        images = images.squeeze(0)
        if track_vis_mask is not None:
            track_vis_mask = track_vis_mask.squeeze(0)

    import matplotlib

    matplotlib.use("Agg")  # for non-interactive (optional)

    os.makedirs(out_dir, exist_ok=True)

    S = images.shape[0]
    _, N, _ = tracks.shape  # (S, N, 2)

    # Move to CPU
    images = images.cpu().clone()
    tracks = tracks.cpu().clone()
    if track_vis_mask is not None:
        track_vis_mask = track_vis_mask.cpu().clone()

    # Infer H, W from images shape
    if image_format == "CHW":
        H, W = images.shape[2], images.shape[3]
    else:
        H, W = images.shape[1], images.shape[2]

    # Pre-compute the color for each track based on first visible position
    track_colors_rgb = get_track_colors_by_position(
        tracks,  # shape (S, N, 2)
        vis_mask_b=track_vis_mask,
        image_width=W,
        image_height=H,
        cmap_name=cmap_name,
    )

    # Drawn frames, kept in RGB for the grid
    frame_images = []

    for s in range(S):
        img = images[s]

        # Convert to (H, W, 3)
        if image_format == "CHW":
            img = img.permute(1, 2, 0)

        img = img.numpy().astype(np.float32)

        # Scale to [0,255] if needed
        if normalize_mode == "[0,1]":
            img = np.clip(img, 0, 1) * 255.0
        elif normalize_mode == "[-1,1]":
            img = (img + 1.0) * 0.5 * 255.0
            img = np.clip(img, 0, 255.0)
        # else no normalization

        img = img.astype(np.uint8)

        # OpenCV draws and writes in BGR.
        img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

        # Indices of the tracks to draw in this frame, as plain ints
        if track_vis_mask is not None:
            valid_indices = torch.where(track_vis_mask[s])[0].tolist()
        else:
            valid_indices = range(N)

        cur_tracks_np = tracks[s].numpy()  # shape (N, 2)
        for i in valid_indices:
            x, y = cur_tracks_np[i]
            # A NaN/inf coordinate cannot be rounded to a pixel; skip that
            # point instead of aborting the whole visualization.
            if not (np.isfinite(x) and np.isfinite(y)):
                continue
            pt = (int(round(x)), int(round(y)))

            # track_colors_rgb[i] is (R,G,B). For OpenCV circle, we need BGR
            R, G, B = track_colors_rgb[i]
            color_bgr = (int(B), int(G), int(R))
            cv2.circle(img_bgr, pt, radius=3, color=color_bgr, thickness=-1)

        # imwrite expects BGR, so write the drawn image directly.
        frame_path = os.path.join(out_dir, f"frame_{s:04d}.png")
        cv2.imwrite(frame_path, img_bgr)

        frame_images.append(cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB))

    # Only build the grid when requested and when there is a frame to tile.
    if save_grid and frame_images:
        num_rows = (S + frames_per_row - 1) // frames_per_row  # Ceiling division

        grid_rows = []
        for row in range(num_rows):
            start_idx = row * frames_per_row
            end_idx = min(start_idx + frames_per_row, S)

            # Concatenate this row horizontally
            row_img = np.concatenate(frame_images[start_idx:end_idx], axis=1)

            # If this row has fewer than frames_per_row images, pad with black
            if end_idx - start_idx < frames_per_row:
                padding_width = (frames_per_row - (end_idx - start_idx)) * W
                padding = np.zeros((H, padding_width, 3), dtype=np.uint8)
                row_img = np.concatenate([row_img, padding], axis=1)

            grid_rows.append(row_img)

        # One vertical concatenation instead of re-copying a growing grid.
        grid_img = np.concatenate(grid_rows, axis=0)

        out_path = os.path.join(out_dir, "tracks_grid.png")
        # Convert back to BGR for OpenCV imwrite
        grid_img_bgr = cv2.cvtColor(grid_img, cv2.COLOR_RGB2BGR)
        cv2.imwrite(out_path, grid_img_bgr)
        print(f"[INFO] Saved color-by-XY track visualization grid -> {out_path}")

    print(f"[INFO] Saved {S} individual frames to {out_dir}/frame_*.png")