Spaces:
Running on Zero
Running on Zero
| """ | |
| Utility functions for running inference with MoGe. | |
| """ | |
| import argparse | |
| import os | |
| import warnings | |
| from pathlib import Path | |
| warnings.filterwarnings("ignore", category=FutureWarning) # Suppress XFormers warnings | |
| import numpy as np | |
| import rerun as rr | |
| import torch | |
| import torchvision | |
| import torchvision.transforms as tvf | |
| from PIL import Image | |
| from mapanything.utils.viz import log_data_to_rerun, script_add_rerun_args | |
def load_moge_model(
    model_code_path: str = "MoGe",
    ckpt_path: str = "Ruicheng/moge-vitl",
    device="cuda",
):
    """
    Load the MoGe (ViT-L) model from huggingface hub (or load from local).

    Args:
        model_code_path: Path to the MoGe source code. It is added to
            ``sys.path`` so that the ``moge`` package becomes importable.
        ckpt_path: Value forwarded to ``MoGeModel.from_pretrained``
            (a hub model id by default; presumably a local checkpoint path
            also works - confirm against the MoGe documentation).
        device: Device the loaded model is moved to.

    Returns:
        The MoGe model, moved to ``device`` and switched to eval mode.

    Raises:
        FileNotFoundError: If ``model_code_path`` does not exist.
    """
    import sys

    if not Path(model_code_path).exists():
        raise FileNotFoundError(f"MoGe code not found at {model_code_path}")

    # Add the MoGe code to the system path, but only once: repeated calls
    # would otherwise keep growing sys.path with duplicate entries.
    code_dir = str(model_code_path)
    if code_dir not in sys.path:
        sys.path.append(code_dir)

    # Imported lazily because the package is only resolvable after the
    # sys.path update above.
    from moge.model.v1 import MoGeModel

    # Init the MoGe model
    model = MoGeModel.from_pretrained(ckpt_path).to(device).eval()
    return model
def run_moge_inference(model: torch.nn.Module, image: torch.Tensor, device="cuda"):
    """
    Run MoGe inference on a batch of images or single image.

    Output is a dictionary with the following keys:
    - points: (B, H, W, 3) # scale-invariant point map in OpenCV camera coordinate system (x right, y down, z forward)
    - depth: (B, H, W) # scale-invariant depth map
    - mask: (B, H, W) # a binary mask for valid pixels
    - intrinsics: (B, 3, 3) # normalized camera intrinsics

    Args:
        model: MoGe model
        image: (B, 3, H, W) or (3, H, W) # RGB image in range [0, 1]
        device: Device the image is moved to before inference. The model is
            not moved here, so it is expected to already live on this device.

    Returns:
        Whatever ``model.infer`` returns for the given image (the dictionary
        described above).
    """
    # Only the input is transferred; the model's device is left untouched.
    image = image.to(device)
    return model.infer(image)
if __name__ == "__main__":
    # Demo entry point: run MoGe on a single image and optionally visualize
    # the prediction in Rerun.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-ip",
        "--image_path",
        default='/ocean/projects/cis220039p/mdt2/jkarhade/Any4D/benchmarking/monst3r/demo_data/lady-running/00000.jpg',
        type=str,
    )
    parser.add_argument("--viz", action="store_true")
    script_add_rerun_args(parser)  # Options: --headless, --connect, --serve, --addr, --save, --stdout
    args = parser.parse_args()

    # Setup Rerun if needed
    if args.viz:
        rr.script_setup(args, "MoGe_Pred_Viz")
        rr.set_time_seconds("stable_time", 0)
        # RDF matches the OpenCV convention (x right, y down, z forward)
        # documented for the predicted point map.
        rr.log("moge", rr.ViewCoordinates.RDF, static=True)

    # Load the input data. The image is forced to 3-channel RGB so that
    # grayscale / RGBA / palette files still yield an (H, W, 3) array, and the
    # file handle is released as soon as the pixels are copied out.
    with Image.open(args.image_path) as pil_img:
        img = np.array(pil_img.convert("RGB"))  # (H, W, 3)
    transform = tvf.Compose([tvf.ToTensor()])
    input_img = transform(img).unsqueeze(0)  # (B, 3, H, W)

    # Load the model
    model = load_moge_model()

    # Run the model inference
    output = run_moge_inference(model, input_img)

    # Get the different outputs (batch dimension of size 1 is dropped)
    pts3d = output["points"].cpu().squeeze(0).numpy()  # (H, W, 3)
    depth = output["depth"].cpu().squeeze(0).numpy()  # (H, W)
    mask = output["mask"].cpu().squeeze(0).numpy()  # (H, W)
    intrinsics = output["intrinsics"].cpu().squeeze(0).numpy()  # (3, 3), normalized

    # Convert the normalized intrinsics to pixel units: the first row is
    # scaled by the image width, the second row by the image height.
    intrinsics[0, :] = intrinsics[0, :] * depth.shape[1]
    intrinsics[1, :] = intrinsics[1, :] * depth.shape[0]

    # Log prediction to Rerun
    if args.viz:
        base_name = "moge"
        log_data_to_rerun(
            image=img,
            depthmap=depth,
            pose=np.eye(4),
            intrinsics=intrinsics,
            base_name=base_name,
            mask=np.float32(mask),
        )

        # Log the predicted 3D points. Boolean indexing needs a bool array;
        # the cast is a no-op if the mask is already boolean.
        valid = mask.astype(bool)
        filtered_pts = pts3d[valid]
        filtered_pts_col = img[valid]
        pts_name = f"{base_name}/points"
        rr.log(
            pts_name,
            rr.Points3D(
                positions=filtered_pts.reshape(-1, 3),
                colors=filtered_pts_col.reshape(-1, 3),
            ),
        )