| |
| |
| |
| |
| |
|
|
| import random |
| import numpy as np |
| import glob |
| import os |
| import copy |
| import torch |
| import torch.nn.functional as F |
|
|
| |
| torch.backends.cudnn.enabled = True |
| torch.backends.cudnn.benchmark = True |
| torch.backends.cudnn.deterministic = False |
|
|
| import argparse |
| from pathlib import Path |
| import trimesh |
| import pycolmap |
|
|
|
|
| from vggt.models.vggt import VGGT |
| from vggt.utils.load_fn import load_and_preprocess_images_square |
| from vggt.utils.pose_enc import pose_encoding_to_extri_intri |
| from vggt.utils.geometry import unproject_depth_map_to_point_map |
| from vggt.utils.helper import create_pixel_coordinate_grid, randomly_limit_trues |
| from vggt.dependency.track_predict import predict_tracks |
| from vggt.dependency.np_to_pycolmap import batch_np_matrix_to_pycolmap, batch_np_matrix_to_pycolmap_wo_track |
|
|
|
|
| |
| |
| |
| |
| |
|
|
|
|
| def parse_args(): |
| parser = argparse.ArgumentParser(description="VGGT Demo") |
| parser.add_argument("--scene_dir", type=str, required=True, help="Directory containing the scene images") |
| parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility") |
| parser.add_argument("--use_ba", action="store_true", default=False, help="Use BA for reconstruction") |
| |
| parser.add_argument( |
| "--max_reproj_error", type=float, default=8.0, help="Maximum reprojection error for reconstruction" |
| ) |
| parser.add_argument("--shared_camera", action="store_true", default=False, help="Use shared camera for all images") |
| parser.add_argument("--camera_type", type=str, default="SIMPLE_PINHOLE", help="Camera type for reconstruction") |
| parser.add_argument("--vis_thresh", type=float, default=0.2, help="Visibility threshold for tracks") |
| parser.add_argument("--query_frame_num", type=int, default=5, help="Number of frames to query") |
| parser.add_argument("--max_query_pts", type=int, default=2048, help="Maximum number of query points") |
| parser.add_argument( |
| "--fine_tracking", action="store_true", default=True, help="Use fine tracking (slower but more accurate)" |
| ) |
| parser.add_argument( |
| "--conf_thres_value", type=float, default=5.0, help="Confidence threshold value for depth filtering (wo BA)" |
| ) |
| return parser.parse_args() |
|
|
|
|
| def run_VGGT(model, images, dtype, resolution=518): |
| |
|
|
| assert len(images.shape) == 4 |
| assert images.shape[1] == 3 |
|
|
| |
| images = F.interpolate(images, size=(resolution, resolution), mode="bilinear", align_corners=False) |
|
|
| with torch.no_grad(): |
| with torch.cuda.amp.autocast(dtype=dtype): |
| images = images[None] |
| aggregated_tokens_list, ps_idx = model.aggregator(images) |
|
|
| |
| pose_enc = model.camera_head(aggregated_tokens_list)[-1] |
| |
| extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:]) |
| |
| depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx) |
|
|
| extrinsic = extrinsic.squeeze(0).cpu().numpy() |
| intrinsic = intrinsic.squeeze(0).cpu().numpy() |
| depth_map = depth_map.squeeze(0).cpu().numpy() |
| depth_conf = depth_conf.squeeze(0).cpu().numpy() |
| return extrinsic, intrinsic, depth_map, depth_conf |
|
|
|
|
| def demo_fn(args): |
| |
| print("Arguments:", vars(args)) |
|
|
| |
| np.random.seed(args.seed) |
| torch.manual_seed(args.seed) |
| random.seed(args.seed) |
| if torch.cuda.is_available(): |
| torch.cuda.manual_seed(args.seed) |
| torch.cuda.manual_seed_all(args.seed) |
| print(f"Setting seed as: {args.seed}") |
|
|
| |
| dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16 |
| device = "cuda" if torch.cuda.is_available() else "cpu" |
| print(f"Using device: {device}") |
| print(f"Using dtype: {dtype}") |
|
|
| |
| model = VGGT() |
| _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt" |
| model.load_state_dict(torch.hub.load_state_dict_from_url(_URL)) |
| model.eval() |
| model = model.to(device) |
| print(f"Model loaded") |
|
|
| |
| image_dir = os.path.join(args.scene_dir, "images") |
| image_path_list = glob.glob(os.path.join(image_dir, "*")) |
| if len(image_path_list) == 0: |
| raise ValueError(f"No images found in {image_dir}") |
| base_image_path_list = [os.path.basename(path) for path in image_path_list] |
|
|
| |
| |
| vggt_fixed_resolution = 518 |
| img_load_resolution = 1024 |
|
|
| images, original_coords = load_and_preprocess_images_square(image_path_list, img_load_resolution) |
| images = images.to(device) |
| original_coords = original_coords.to(device) |
| print(f"Loaded {len(images)} images from {image_dir}") |
|
|
| |
| |
| extrinsic, intrinsic, depth_map, depth_conf = run_VGGT(model, images, dtype, vggt_fixed_resolution) |
| points_3d = unproject_depth_map_to_point_map(depth_map, extrinsic, intrinsic) |
|
|
| if args.use_ba: |
| image_size = np.array(images.shape[-2:]) |
| scale = img_load_resolution / vggt_fixed_resolution |
| shared_camera = args.shared_camera |
|
|
| with torch.cuda.amp.autocast(dtype=dtype): |
| |
| |
| |
| |
|
|
| |
| |
| pred_tracks, pred_vis_scores, pred_confs, points_3d, points_rgb = predict_tracks( |
| images, |
| conf=depth_conf, |
| points_3d=points_3d, |
| masks=None, |
| max_query_pts=args.max_query_pts, |
| query_frame_num=args.query_frame_num, |
| keypoint_extractor="aliked+sp", |
| fine_tracking=args.fine_tracking, |
| ) |
|
|
| torch.cuda.empty_cache() |
|
|
| |
| intrinsic[:, :2, :] *= scale |
| track_mask = pred_vis_scores > args.vis_thresh |
|
|
| |
| reconstruction, valid_track_mask = batch_np_matrix_to_pycolmap( |
| points_3d, |
| extrinsic, |
| intrinsic, |
| pred_tracks, |
| image_size, |
| masks=track_mask, |
| max_reproj_error=args.max_reproj_error, |
| shared_camera=shared_camera, |
| camera_type=args.camera_type, |
| points_rgb=points_rgb, |
| ) |
|
|
| if reconstruction is None: |
| raise ValueError("No reconstruction can be built with BA") |
|
|
| |
| ba_options = pycolmap.BundleAdjustmentOptions() |
| pycolmap.bundle_adjustment(reconstruction, ba_options) |
|
|
| reconstruction_resolution = img_load_resolution |
| else: |
| conf_thres_value = args.conf_thres_value |
| max_points_for_colmap = 100000 |
| shared_camera = False |
| camera_type = "PINHOLE" |
|
|
| image_size = np.array([vggt_fixed_resolution, vggt_fixed_resolution]) |
| num_frames, height, width, _ = points_3d.shape |
|
|
| points_rgb = F.interpolate( |
| images, size=(vggt_fixed_resolution, vggt_fixed_resolution), mode="bilinear", align_corners=False |
| ) |
| points_rgb = (points_rgb.cpu().numpy() * 255).astype(np.uint8) |
| points_rgb = points_rgb.transpose(0, 2, 3, 1) |
|
|
| |
| points_xyf = create_pixel_coordinate_grid(num_frames, height, width) |
|
|
| conf_mask = depth_conf >= conf_thres_value |
| |
| conf_mask = randomly_limit_trues(conf_mask, max_points_for_colmap) |
|
|
| points_3d = points_3d[conf_mask] |
| points_xyf = points_xyf[conf_mask] |
| points_rgb = points_rgb[conf_mask] |
|
|
| print("Converting to COLMAP format") |
| reconstruction = batch_np_matrix_to_pycolmap_wo_track( |
| points_3d, |
| points_xyf, |
| points_rgb, |
| extrinsic, |
| intrinsic, |
| image_size, |
| shared_camera=shared_camera, |
| camera_type=camera_type, |
| ) |
|
|
| reconstruction_resolution = vggt_fixed_resolution |
|
|
| reconstruction = rename_colmap_recons_and_rescale_camera( |
| reconstruction, |
| base_image_path_list, |
| original_coords.cpu().numpy(), |
| img_size=reconstruction_resolution, |
| shift_point2d_to_original_res=True, |
| shared_camera=shared_camera, |
| ) |
|
|
| print(f"Saving reconstruction to {args.scene_dir}/sparse") |
| sparse_reconstruction_dir = os.path.join(args.scene_dir, "sparse") |
| os.makedirs(sparse_reconstruction_dir, exist_ok=True) |
| reconstruction.write(sparse_reconstruction_dir) |
|
|
| |
| trimesh.PointCloud(points_3d, colors=points_rgb).export(os.path.join(args.scene_dir, "sparse/points.ply")) |
|
|
| return True |
|
|
|
|
| def rename_colmap_recons_and_rescale_camera( |
| reconstruction, image_paths, original_coords, img_size, shift_point2d_to_original_res=False, shared_camera=False |
| ): |
| rescale_camera = True |
|
|
| for pyimageid in reconstruction.images: |
| |
| |
| pyimage = reconstruction.images[pyimageid] |
| pycamera = reconstruction.cameras[pyimage.camera_id] |
| pyimage.name = image_paths[pyimageid - 1] |
|
|
| if rescale_camera: |
| |
| pred_params = copy.deepcopy(pycamera.params) |
|
|
| real_image_size = original_coords[pyimageid - 1, -2:] |
| resize_ratio = max(real_image_size) / img_size |
| pred_params = pred_params * resize_ratio |
| real_pp = real_image_size / 2 |
| pred_params[-2:] = real_pp |
|
|
| pycamera.params = pred_params |
| pycamera.width = real_image_size[0] |
| pycamera.height = real_image_size[1] |
|
|
| if shift_point2d_to_original_res: |
| |
| top_left = original_coords[pyimageid - 1, :2] |
|
|
| for point2D in pyimage.points2D: |
| point2D.xy = (point2D.xy - top_left) * resize_ratio |
|
|
| if shared_camera: |
| |
| |
| rescale_camera = False |
|
|
| return reconstruction |
|
|
|
|
| if __name__ == "__main__": |
| args = parse_args() |
| with torch.no_grad(): |
| demo_fn(args) |
|
|
|
|
| |
|
|
| """ |
| VGGT Runner Script |
| ================= |
| |
| A script to run the VGGT model for 3D reconstruction from image sequences. |
| |
| Directory Structure |
| ------------------ |
| Input: |
| input_folder/ |
| βββ images/ # Source images for reconstruction |
| |
| Output: |
| output_folder/ |
| βββ images/ |
| βββ sparse/ # Reconstruction results |
| β βββ cameras.bin # Camera parameters (COLMAP format) |
| β βββ images.bin # Pose for each image (COLMAP format) |
| β βββ points3D.bin # 3D points (COLMAP format) |
| β βββ points.ply # Point cloud visualization file |
| βββ visuals/ # Visualization outputs TODO |
| |
| Key Features |
| ----------- |
| β’ Dual-mode Support: Run reconstructions using either VGGT or VGGT+BA |
| β’ Resolution Preservation: Maintains original image resolution in camera parameters and tracks |
| β’ COLMAP Compatibility: Exports results in standard COLMAP sparse reconstruction format |
| """ |
|
|