Spaces:

theairlabcmu
/

Any4D

Running on Zero

Any4D / mapanything /utils /moge_inference.py

Jay Karhade

Initial Space release

0343ccd 4 months ago

3.79 kB

	"""
	Utility functions for loading the MoGe model and running inference with it.
	"""

	import argparse
	import os
	import warnings
	from pathlib import Path

	warnings.filterwarnings("ignore", category=FutureWarning) # Suppress XFormers warnings

	import numpy as np
	import rerun as rr
	import torch
	import torchvision
	import torchvision.transforms as tvf
	from PIL import Image

	from mapanything.utils.viz import log_data_to_rerun, script_add_rerun_args


	def load_moge_model(
	    model_code_path: str = "MoGe",
	    ckpt_path: str = "Ruicheng/moge-vitl",
	    device="cuda",
	) -> torch.nn.Module:
	    """
	    Load the MoGe (ViT-L) model from huggingface hub (or load from local).

	    Args:
	        model_code_path: Directory that contains the `moge` package. It is added
	            to `sys.path` so that `moge.model.v1` can be imported.
	        ckpt_path: Checkpoint identifier forwarded to `MoGeModel.from_pretrained`
	            (a Hugging Face repo id by default; presumably a local checkpoint
	            path is accepted as well - confirm against the MoGe version in use).
	        device: Device the loaded model is moved to.

	    Returns:
	        The MoGe model, moved to `device` and switched to eval mode.

	    Raises:
	        FileNotFoundError: If `model_code_path` does not exist.
	    """
	    import sys

	    if not Path(model_code_path).exists():
	        raise FileNotFoundError(f"MoGe code not found at {model_code_path}")

	    # Add the MoGe code to the system path. Guarded so that calling this
	    # function repeatedly does not keep appending duplicate entries.
	    code_dir = str(model_code_path)
	    if code_dir not in sys.path:
	        sys.path.append(code_dir)

	    # Init the MoGe model (imported here because the package is only
	    # resolvable once its directory is on sys.path)
	    from moge.model.v1 import MoGeModel

	    return MoGeModel.from_pretrained(ckpt_path).to(device).eval()


	@torch.no_grad()
	def run_moge_inference(model: torch.nn.Module, image: torch.Tensor, device="cuda") -> dict:
	    """
	    Run MoGe inference on a batch of images or single image.

	    Gradient tracking is disabled for the whole call. Only `image` is moved to
	    `device`; the model is used as given, so it should already live on the
	    same device.

	    Output is a dictionary with the following keys:
	    - points: (B, H, W, 3) # scale-invariant point map in OpenCV camera coordinate system (x right, y down, z forward)
	    - depth: (B, H, W) # scale-invariant depth map
	    - mask: (B, H, W) # a binary mask for valid pixels
	    - intrinsics: (B, 3, 3) # normalized camera intrinsics

	    Args:
	        model: MoGe model (must expose an `infer` method)
	        image: (B, 3, H, W) or (3, H, W) # RGB image in range [0, 1]
	        device: Device the image is moved to before inference

	    Returns:
	        Whatever `model.infer(image)` returns (the dictionary described above).
	    """
	    image = image.to(device)
	    return model.infer(image)


	def main():
	    """
	    Run MoGe on a single image and optionally visualize the prediction in Rerun.

	    Command line arguments:
	        -ip / --image_path: Path to the input image.
	        --viz: Log the image, depth, mask, intrinsics and 3D points to Rerun.
	        Plus the Rerun options registered by `script_add_rerun_args`.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "-ip",
	        "--image_path",
	        default="/ocean/projects/cis220039p/mdt2/jkarhade/Any4D/benchmarking/monst3r/demo_data/lady-running/00000.jpg",
	        type=str,
	    )
	    parser.add_argument("--viz", action="store_true")
	    script_add_rerun_args(parser)  # Options: --headless, --connect, --serve, --addr, --save, --stdout
	    args = parser.parse_args()

	    # Fail early with a clear message (before any viewer or model is set up);
	    # the default path only exists on the original author's machine.
	    if not Path(args.image_path).is_file():
	        parser.error(f"Image not found: {args.image_path}")

	    # Setup Rerun if needed
	    if args.viz:
	        rr.script_setup(args, "MoGe_Pred_Viz")
	        rr.set_time_seconds("stable_time", 0)
	        rr.log("moge", rr.ViewCoordinates.RDF, static=True)

	    # Load the input data. Force 3-channel RGB so grayscale / RGBA / palette
	    # images match the model input and the per-point color lookup below.
	    with Image.open(args.image_path) as pil_img:
	        img = np.array(pil_img.convert("RGB"))  # (H, W, 3)
	    input_img = tvf.ToTensor()(img).unsqueeze(0)  # (B, 3, H, W), values in [0, 1]

	    # Fall back to CPU when CUDA is not available instead of crashing
	    device = "cuda" if torch.cuda.is_available() else "cpu"

	    # Load the model
	    model = load_moge_model(device=device)

	    # Run the model inference
	    output = run_moge_inference(model, input_img, device=device)

	    # Get the different outputs
	    pts3d = output["points"].cpu().squeeze(0).numpy()  # (H, W, 3)
	    depth = output["depth"].cpu().squeeze(0).numpy()  # (H, W)
	    mask = output["mask"].cpu().squeeze(0).numpy().astype(bool)  # (H, W)
	    intrinsics = output["intrinsics"].cpu().squeeze(0).numpy()  # (3, 3), normalized

	    # Convert the normalized intrinsics to pixel units:
	    # the x row scales with image width, the y row with image height
	    height, width = depth.shape
	    intrinsics[0, :] = intrinsics[0, :] * width
	    intrinsics[1, :] = intrinsics[1, :] * height

	    # Log prediction to Rerun
	    if args.viz:
	        base_name = "moge"
	        log_data_to_rerun(
	            image=img,
	            depthmap=depth,
	            pose=np.eye(4),
	            intrinsics=intrinsics,
	            base_name=base_name,
	            mask=mask.astype(np.float32),
	        )
	        # Log the predicted 3D points (valid pixels only, colored by the input image)
	        filtered_pts = pts3d[mask]
	        filtered_pts_col = img[mask]
	        pts_name = f"{base_name}/points"
	        rr.log(
	            pts_name,
	            rr.Points3D(
	                positions=filtered_pts.reshape(-1, 3),
	                colors=filtered_pts_col.reshape(-1, 3),
	            ),
	        )


	if __name__ == "__main__":
	    main()