|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import random |
|
|
import numpy as np |
|
|
import glob |
|
|
import os |
|
|
import copy |
|
|
import torch |
|
|
import torch.nn.functional as F |
|
|
|
|
|
|
|
|
# cuDNN configuration: trade exact run-to-run reproducibility for speed.
torch.backends.cudnn.enabled = True
# Let cuDNN benchmark candidate kernels and pick the fastest for the observed
# input shapes (helps here because VGGT runs at a fixed 518x518 resolution).
torch.backends.cudnn.benchmark = True
# Allow non-deterministic kernels; results may differ slightly between runs.
torch.backends.cudnn.deterministic = False
|
|
|
|
|
import argparse |
|
|
from pathlib import Path |
|
|
import trimesh |
|
|
import pycolmap |
|
|
|
|
|
|
|
|
from vggt.models.vggt import VGGT |
|
|
from vggt.utils.load_fn import load_and_preprocess_images_square |
|
|
from vggt.utils.pose_enc import pose_encoding_to_extri_intri |
|
|
from vggt.utils.geometry import unproject_depth_map_to_point_map |
|
|
from vggt.utils.helper import create_pixel_coordinate_grid, randomly_limit_trues |
|
|
from vggt.dependency.track_predict import predict_tracks |
|
|
from vggt.dependency.np_to_pycolmap import batch_np_matrix_to_pycolmap, batch_np_matrix_to_pycolmap_wo_track |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_args():
    """Parse command-line options for the VGGT demo.

    Returns:
        argparse.Namespace with the scene path, seed, and reconstruction
        settings (BA on/off, track/confidence thresholds, camera options).
    """
    parser = argparse.ArgumentParser(description="VGGT Demo")
    parser.add_argument("--scene_dir", type=str, required=True, help="Directory containing the scene images")
    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducibility")
    parser.add_argument("--use_ba", action="store_true", default=False, help="Use BA for reconstruction")
    parser.add_argument(
        "--max_reproj_error", type=float, default=8.0, help="Maximum reprojection error for reconstruction"
    )
    parser.add_argument("--shared_camera", action="store_true", default=False, help="Use shared camera for all images")
    parser.add_argument("--camera_type", type=str, default="SIMPLE_PINHOLE", help="Camera type for reconstruction")
    parser.add_argument("--vis_thresh", type=float, default=0.2, help="Visibility threshold for tracks")
    parser.add_argument("--query_frame_num", type=int, default=8, help="Number of frames to query")
    parser.add_argument("--max_query_pts", type=int, default=4096, help="Maximum number of query points")
    # BUG FIX: previously this used action="store_true" with default=True, which
    # made the flag impossible to turn off from the command line.
    # BooleanOptionalAction keeps "--fine_tracking" working unchanged and adds
    # "--no-fine_tracking" to disable it.
    parser.add_argument(
        "--fine_tracking",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Use fine tracking (slower but more accurate)",
    )
    parser.add_argument(
        "--conf_thres_value", type=float, default=5.0, help="Confidence threshold value for depth filtering (wo BA)"
    )
    return parser.parse_args()
|
|
|
|
|
|
|
|
def run_VGGT(model, images, dtype, resolution=518):
    """Forward a batch of frames through VGGT and return camera + depth outputs.

    Args:
        model: VGGT model exposing `aggregator`, `camera_head`, `depth_head`.
        images: tensor of shape (S, 3, H, W) — a sequence of RGB frames.
        dtype: autocast dtype (e.g. torch.bfloat16 or torch.float16).
        resolution: square side length the frames are resized to before the
            forward pass (VGGT's fixed input resolution).

    Returns:
        Tuple of numpy arrays: (extrinsic, intrinsic, depth_map, depth_conf),
        each with the leading batch dimension removed.
    """
    # Sanity-check the expected (S, 3, H, W) layout.
    assert len(images.shape) == 4
    assert images.shape[1] == 3

    # Resize every frame to the model's fixed square input resolution.
    images = F.interpolate(images, size=(resolution, resolution), mode="bilinear", align_corners=False)

    with torch.no_grad(), torch.cuda.amp.autocast(dtype=dtype):
        images = images[None]  # prepend a batch dimension: (1, S, 3, R, R)
        aggregated_tokens_list, ps_idx = model.aggregator(images)

        # Last element of the camera head output is the final pose encoding.
        pose_enc = model.camera_head(aggregated_tokens_list)[-1]
        extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, images.shape[-2:])

        depth_map, depth_conf = model.depth_head(aggregated_tokens_list, images, ps_idx)

    # Strip the batch dimension and move every output to host numpy arrays.
    extrinsic, intrinsic, depth_map, depth_conf = (
        t.squeeze(0).cpu().numpy() for t in (extrinsic, intrinsic, depth_map, depth_conf)
    )
    return extrinsic, intrinsic, depth_map, depth_conf
|
|
|
|
|
|
|
|
def demo_fn(args):
    """Run VGGT on a scene directory and export a COLMAP-format reconstruction.

    Expects `args.scene_dir/images/` to contain the input frames. Writes
    `args.scene_dir/sparse/` (cameras / images / points3D in COLMAP format plus
    a `points.ply` point cloud for visualization).

    Args:
        args: argparse.Namespace produced by `parse_args`.

    Returns:
        True on success.

    Raises:
        ValueError: if no images are found, or BA produces no reconstruction.
    """
    print("Arguments:", vars(args))

    # Seed every RNG we depend on so runs are reproducible.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
    print(f"Setting seed as: {args.seed}")

    # BUG FIX: query the device capability only when CUDA actually exists.
    # Previously torch.cuda.get_device_capability() ran unconditionally and
    # crashed on CPU-only machines, defeating the "cpu" fallback below.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cuda":
        # bfloat16 requires compute capability >= 8 (Ampere+); otherwise float16.
        dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
    else:
        dtype = torch.float32
    print(f"Using device: {device}")
    print(f"Using dtype: {dtype}")

    # Load the pretrained VGGT-1B weights from the HuggingFace hub.
    model = VGGT()
    _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
    model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
    model.eval()
    model = model.to(device)
    print("Model loaded")

    # Collect the input frames. Sort the glob result: its order is
    # filesystem-dependent, and an unstable frame order would make runs
    # non-reproducible despite the seeding above.
    image_dir = os.path.join(args.scene_dir, "images")
    image_path_list = sorted(glob.glob(os.path.join(image_dir, "*")))
    if len(image_path_list) == 0:
        raise ValueError(f"No images found in {image_dir}")
    base_image_path_list = [os.path.basename(path) for path in image_path_list]

    # VGGT runs at a fixed 518x518; frames are loaded at 1024 so that track
    # prediction (BA mode) can operate at the higher resolution.
    vggt_fixed_resolution = 518
    img_load_resolution = 1024

    images, original_coords = load_and_preprocess_images_square(image_path_list, img_load_resolution)
    images = images.to(device)
    original_coords = original_coords.to(device)
    print(f"Loaded {len(images)} images from {image_dir}")

    # Predict cameras and per-pixel depth, then lift depth maps to 3D points.
    extrinsic, intrinsic, depth_map, depth_conf = run_VGGT(model, images, dtype, vggt_fixed_resolution)
    points_3d = unproject_depth_map_to_point_map(depth_map, extrinsic, intrinsic)

    if args.use_ba:
        # --- VGGT + bundle adjustment over predicted feature tracks ---
        image_size = np.array(images.shape[-2:])
        scale = img_load_resolution / vggt_fixed_resolution
        shared_camera = args.shared_camera

        with torch.cuda.amp.autocast(dtype=dtype):
            # Predict 2D tracks across frames (keypoints from ALIKED+SuperPoint).
            pred_tracks, pred_vis_scores, pred_confs, points_3d, points_rgb = predict_tracks(
                images,
                conf=depth_conf,
                points_3d=points_3d,
                masks=None,
                max_query_pts=args.max_query_pts,
                query_frame_num=args.query_frame_num,
                keypoint_extractor="aliked+sp",
                fine_tracking=args.fine_tracking,
            )

        torch.cuda.empty_cache()

        # Rescale intrinsics from VGGT's 518 resolution up to the track resolution.
        intrinsic[:, :2, :] *= scale
        track_mask = pred_vis_scores > args.vis_thresh

        reconstruction, valid_track_mask = batch_np_matrix_to_pycolmap(
            points_3d,
            extrinsic,
            intrinsic,
            pred_tracks,
            image_size,
            masks=track_mask,
            max_reproj_error=args.max_reproj_error,
            shared_camera=shared_camera,
            camera_type=args.camera_type,
            points_rgb=points_rgb,
        )

        if reconstruction is None:
            raise ValueError("No reconstruction can be built with BA")

        # Refine cameras and points with COLMAP's bundle adjustment.
        ba_options = pycolmap.BundleAdjustmentOptions()
        pycolmap.bundle_adjustment(reconstruction, ba_options)

        reconstruction_resolution = img_load_resolution
    else:
        # --- Feed-forward export: filter the dense depth point map by
        # confidence and write a bounded number of points directly ---
        conf_thres_value = args.conf_thres_value
        max_points_for_colmap = 100000  # randomly subsample down to this cap
        shared_camera = False  # not supported in the feed-forward path
        camera_type = "PINHOLE"  # only PINHOLE is supported in this path

        image_size = np.array([vggt_fixed_resolution, vggt_fixed_resolution])
        num_frames, height, width, _ = points_3d.shape

        # Per-point RGB sampled at VGGT resolution, as (S, H, W, 3) uint8.
        points_rgb = F.interpolate(
            images, size=(vggt_fixed_resolution, vggt_fixed_resolution), mode="bilinear", align_corners=False
        )
        points_rgb = (points_rgb.cpu().numpy() * 255).astype(np.uint8)
        points_rgb = points_rgb.transpose(0, 2, 3, 1)

        # (S, H, W, 3) grid of (x, y, frame-index) per depth pixel.
        points_xyf = create_pixel_coordinate_grid(num_frames, height, width)

        conf_mask = depth_conf >= conf_thres_value
        # Keep the exported point count bounded so the COLMAP model stays small.
        conf_mask = randomly_limit_trues(conf_mask, max_points_for_colmap)

        points_3d = points_3d[conf_mask]
        points_xyf = points_xyf[conf_mask]
        points_rgb = points_rgb[conf_mask]

        print("Converting to COLMAP format")
        reconstruction = batch_np_matrix_to_pycolmap_wo_track(
            points_3d,
            points_xyf,
            points_rgb,
            extrinsic,
            intrinsic,
            image_size,
            shared_camera=shared_camera,
            camera_type=camera_type,
        )

        reconstruction_resolution = vggt_fixed_resolution

    # Restore original image names and rescale cameras/tracks back to the
    # original image resolution before writing out.
    reconstruction = rename_colmap_recons_and_rescale_camera(
        reconstruction,
        base_image_path_list,
        original_coords.cpu().numpy(),
        img_size=reconstruction_resolution,
        shift_point2d_to_original_res=True,
        shared_camera=shared_camera,
    )

    print(f"Saving reconstruction to {args.scene_dir}/sparse")
    sparse_reconstruction_dir = os.path.join(args.scene_dir, "sparse")
    os.makedirs(sparse_reconstruction_dir, exist_ok=True)
    reconstruction.write(sparse_reconstruction_dir)

    # Also export a .ply point cloud for quick visual inspection.
    trimesh.PointCloud(points_3d, colors=points_rgb).export(os.path.join(args.scene_dir, "sparse/points.ply"))

    return True
|
|
|
|
|
|
|
|
def rename_colmap_recons_and_rescale_camera(
    reconstruction, image_paths, original_coords, img_size, shift_point2d_to_original_res=False, shared_camera=False
):
    """Rename images in a pycolmap reconstruction and rescale cameras/points
    from the processing resolution back to the original image resolution.

    Args:
        reconstruction: pycolmap.Reconstruction to modify in place.
        image_paths: list of original image basenames; indexed with
            `pyimageid - 1` (assumes 1-based, contiguous image ids — matches
            how the reconstruction was built upstream; TODO confirm for other
            inputs).
        original_coords: array whose row `pyimageid - 1` holds per-image crop
            info; the last two entries are the original (width, height) and the
            first two the top-left crop offset — presumably produced by
            `load_and_preprocess_images_square`; verify against that helper.
        img_size: square resolution the reconstruction was built at.
        shift_point2d_to_original_res: if True, also map 2D observations back
            to original-image pixel coordinates.
        shared_camera: if True, only the first image's camera is rescaled
            (all images share it).

    Returns:
        The same (mutated) reconstruction object.
    """
    rescale_camera = True

    for pyimageid in reconstruction.images:
        # Reshaped the padded&resized image to the original size
        pyimage = reconstruction.images[pyimageid]
        pycamera = reconstruction.cameras[pyimage.camera_id]
        pyimage.name = image_paths[pyimageid - 1]

        if rescale_camera:
            # Rescale the camera parameters
            pred_params = copy.deepcopy(pycamera.params)

            real_image_size = original_coords[pyimageid - 1, -2:]
            # Scale factor from the square processing resolution to the
            # original image's longer side.
            resize_ratio = max(real_image_size) / img_size
            pred_params = pred_params * resize_ratio
            # Reset the principal point to the original image center
            # (principal point is assumed to occupy the last two params).
            real_pp = real_image_size / 2
            pred_params[-2:] = real_pp

            pycamera.params = pred_params
            pycamera.width = real_image_size[0]
            pycamera.height = real_image_size[1]

        if shift_point2d_to_original_res:
            # Undo the square crop/pad: shift by the top-left offset, then
            # scale up. NOTE(review): when shared_camera=True, resize_ratio
            # from the first image is reused for later images — safe only if
            # all images share the same original size; confirm upstream.
            top_left = original_coords[pyimageid - 1, :2]

            for point2D in pyimage.points2D:
                point2D.xy = (point2D.xy - top_left) * resize_ratio

        if shared_camera:
            # If shared_camera, all images share the same camera,
            # no need to rescale any more
            rescale_camera = False

    return reconstruction
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Entry point: parse CLI options, then run the demo with autograd disabled
    # (pure inference — no gradients are ever needed).
    cli_args = parse_args()
    with torch.no_grad():
        demo_fn(cli_args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
VGGT Runner Script |
|
|
==================
|
|
|
|
|
A script to run the VGGT model for 3D reconstruction from image sequences. |
|
|
|
|
|
Directory Structure |
|
|
------------------ |
|
|
Input: |
|
|
input_folder/ |
|
|
βββ images/ # Source images for reconstruction |
|
|
|
|
|
Output: |
|
|
output_folder/ |
|
|
βββ images/ |
|
|
βββ sparse/ # Reconstruction results |
|
|
β βββ cameras.bin # Camera parameters (COLMAP format) |
|
|
β βββ images.bin # Pose for each image (COLMAP format) |
|
|
β βββ points3D.bin # 3D points (COLMAP format) |
|
|
β βββ points.ply # Point cloud visualization file |
|
|
βββ visuals/ # Visualization outputs TODO |
|
|
|
|
|
Key Features |
|
|
------------
|
|
β’ Dual-mode Support: Run reconstructions using either VGGT or VGGT+BA |
|
|
β’ Resolution Preservation: Maintains original image resolution in camera parameters and tracks |
|
|
β’ COLMAP Compatibility: Exports results in standard COLMAP sparse reconstruction format |
|
|
""" |
|
|
|