Yujia-Zhang0913 committed on
Commit
d45351d
·
1 Parent(s): 9e30547

space gpu func

Browse files
Files changed (1) hide show
  1. app.py +110 -87
app.py CHANGED
@@ -5,6 +5,7 @@ import shutil
5
  from datetime import datetime
6
  import glob
7
  import gc
 
8
  import gradio as gr
9
  import numpy as np
10
  import open3d as o3d
@@ -30,59 +31,59 @@ from vggt.utils.load_fn import load_and_preprocess_images
30
  from vggt.utils.pose_enc import pose_encoding_to_extri_intri
31
  from vggt.utils.geometry import unproject_depth_map_to_point_map
32
 
33
- device = "cuda" if torch.cuda.is_available() else "cpu"
34
-
35
- def run_model(target_dir, model) -> dict:
36
  """
37
- Run the VGGT model on images in the 'target_dir/images' folder and return predictions.
38
  """
39
- print(f"Processing images from {target_dir}")
40
-
41
- # if not torch.cuda.is_available():
42
- # raise ValueError("CUDA is not available. Check your environment.")
43
-
44
- # Move model to device
45
- model = model.to(device)
46
  model.eval()
47
 
48
- # Load and preprocess images
49
- image_names = glob.glob(os.path.join(target_dir, "images", "*"))
50
- image_names = sorted(image_names)
51
- print(f"Found {len(image_names)} images")
52
- if len(image_names) == 0:
53
- raise ValueError("No images found. Check your upload.")
54
-
55
- images = load_and_preprocess_images(image_names).to(device)
56
- print(f"Preprocessed images shape: {images.shape}")
57
-
58
- # Run inference
59
  print("Running inference...")
60
  with torch.no_grad():
61
  if device == "cuda":
62
  with torch.cuda.amp.autocast(dtype=torch.bfloat16):
63
- predictions = model(images)
64
  else:
65
- predictions = model(images)
66
 
67
- # Convert pose encoding to extrinsic and intrinsic matrices
68
  print("Converting pose encoding to extrinsic and intrinsic matrices...")
69
- extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], images.shape[-2:])
70
  predictions["extrinsic"] = extrinsic
71
  predictions["intrinsic"] = intrinsic
72
 
73
- # Convert tensors to numpy
74
  for key in predictions.keys():
75
  if isinstance(predictions[key], torch.Tensor):
76
- predictions[key] = predictions[key].cpu().numpy().squeeze(0) # remove batch dimension
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- # Generate world points from depth map
79
  print("Computing world points from depth map...")
80
  depth_map = predictions["depth"] # (S, H, W, 1)
81
  world_points = unproject_depth_map_to_point_map(depth_map, predictions["extrinsic"], predictions["intrinsic"])
82
  predictions["world_points_from_depth"] = world_points
83
 
84
- # Clean up
85
- torch.cuda.empty_cache()
86
  return predictions
87
 
88
  def handle_uploads(input_file,input_video,conf_thres,frame_slider,prediction_mode,if_TSDF):
@@ -92,7 +93,6 @@ def handle_uploads(input_file,input_video,conf_thres,frame_slider,prediction_mod
92
  """
93
  start_time = time.time()
94
  gc.collect()
95
- torch.cuda.empty_cache()
96
 
97
  # Create a unique folder name
98
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
@@ -203,7 +203,6 @@ def parse_frames(
203
 
204
  start_time = time.time()
205
  gc.collect()
206
- torch.cuda.empty_cache()
207
 
208
  # Prepare frame_filter dropdown
209
  target_dir_images = os.path.join(target_dir, "images")
@@ -213,8 +212,7 @@ def parse_frames(
213
  frame_filter_choices = ["All"] + all_files
214
 
215
  print("Running run_model...")
216
- with torch.no_grad():
217
- predictions = run_model(target_dir, VGGT_model)
218
 
219
  # Save predictions
220
  prediction_save_path = os.path.join(target_dir, "predictions.npz")
@@ -363,7 +361,6 @@ def parse_frames(
363
  # Cleanup
364
  del predictions
365
  gc.collect()
366
- torch.cuda.empty_cache()
367
  end_time = time.time()
368
  print(f"Total time: {end_time - start_time:.2f} seconds")
369
  return original_points, original_colors, original_normals
@@ -570,29 +567,31 @@ def get_pca_color(feat, start = 0, brightness=1.25, center=True):
570
  color = color.clamp(0.0, 1.0)
571
  return color
572
 
573
- def Concerto_process(target_dir, original_points, original_colors, original_normals, slider_value, bright_value, model_type):
574
- gc.collect()
575
- torch.cuda.empty_cache()
576
- target_dir_pcds = os.path.join(target_dir, "pcds")
 
 
 
577
 
578
- point = {"coord": original_points, "color": original_colors, "normal":original_normals}
579
- original_coord = point["coord"].copy()
580
- original_color = point["color"].copy()
581
- point = transform(point)
 
 
 
 
 
582
 
583
  with torch.inference_mode():
584
- for key in point.keys():
585
- if isinstance(point[key], torch.Tensor) and device=="cuda":
586
- point[key] = point[key].cuda(non_blocking=True)
587
- # model forward:
588
  concerto_start_time = time.time()
589
- if model_type =="Concerto":
590
- point = concerto_model(point)
591
- elif model_type == "Sonata":
592
- point = sonata_model(point)
593
  concerto_end_time = time.time()
 
594
  # upcast point feature
595
- # Point is a structure contains all the information during forward
596
  for _ in range(2):
597
  assert "pooling_parent" in point.keys()
598
  assert "pooling_inverse" in point.keys()
@@ -607,27 +606,38 @@ def Concerto_process(target_dir, original_points, original_colors, original_norm
607
  parent.feat = point.feat[inverse]
608
  point = parent
609
 
610
- # here point is down-sampled by GridSampling in default transform pipeline
611
- # feature of point cloud in original scale can be acquired by:
612
- _ = point.feat[point.inverse]
613
-
614
- # PCA
615
- point_feat = point.feat.cpu().detach().numpy()
616
- np.save(os.path.join(target_dir_pcds,"feat.npy"),point_feat)
617
  pca_start_time = time.time()
618
- pca_color = get_pca_color(point.feat,start = slider_value, brightness=bright_value, center=True)
619
  pca_end_time = time.time()
620
 
621
- # inverse back to original scale before grid sampling
622
- # point.inverse is acquired from the GirdSampling transform
 
 
623
  point_inverse = point.inverse.cpu().detach().numpy()
624
- np.save(os.path.join(target_dir_pcds,"inverse.npy"),point_inverse)
625
- original_pca_color = pca_color[point.inverse]
626
- points = original_coord
627
- colors = original_pca_color.cpu().detach().numpy()
628
 
629
- end_time = time.time()
630
- return points, colors, concerto_end_time - concerto_start_time, pca_end_time - pca_start_time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
 
632
  def gradio_demo(target_dir,pca_slider,bright_slider, model_type, if_color=True, if_normal=True):
633
  target_dir_pcds = os.path.join(target_dir, "pcds")
@@ -651,21 +661,35 @@ def gradio_demo(target_dir,pca_slider,bright_slider, model_type, if_color=True,
651
 
652
  return processed_temp, f"Feature visualization process finished with {concerto_time:.3f} seconds using Concerto inference and {pca_time:.3f} seconds using PCA. Updating visualization."
653
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
  def concerto_slider_update(target_dir,pca_slider,bright_slider,is_example,log_output):
655
  if is_example == "True":
656
  return None, log_output
657
  else:
658
  target_dir_pcds = os.path.join(target_dir, "pcds")
659
  if os.path.isfile(os.path.join(target_dir_pcds,"feat.npy")):
 
660
  feat = np.load(os.path.join(target_dir_pcds,"feat.npy"))
661
  inverse = np.load(os.path.join(target_dir_pcds,"inverse.npy"))
662
- feat = torch.tensor(feat, device = device)
663
- inverse = torch.tensor(inverse, device = device)
664
- pca_start_time = time.time()
665
- pca_colors = get_pca_color(feat,start = pca_slider, brightness=bright_slider, center=True)
666
- processed_colors = pca_colors[inverse].cpu().detach().numpy()
667
- pca_end_time = time.time()
668
- pca_time = pca_end_time - pca_start_time
669
  processed_points = np.load(os.path.join(target_dir_pcds,"points.npy"))
670
  processed_normals = np.load(os.path.join(target_dir_pcds,"normals.npy"))
671
  processed_temp = (os.path.join(target_dir_pcds,"processed.glb"))
@@ -673,36 +697,35 @@ def concerto_slider_update(target_dir,pca_slider,bright_slider,is_example,log_ou
673
  feat_data = trimesh.PointCloud(vertices=processed_points, colors=processed_colors, vertex_normals=processed_normals)
674
  feat_3d.add_geometry(feat_data)
675
  feat_3d.export(processed_temp)
676
- log_output = f"Feature visualization process finished with{pca_time:.3f} seconds using PCA. Updating visualization."
677
  else:
678
  processed_temp = None
679
  log_output = "No representations saved, please click PCA generate first."
680
- # processed_temp, log_output = gradio_demo(target_dir,pca_slider,bright_slider)
681
  return processed_temp, log_output
682
 
683
  # set random seed
684
  # (random seed affect pca color, yet change random seed need manual adjustment kmeans)
685
  # (the pca prevent in paper is with another version of cuda and pytorch environment)
686
  concerto.utils.set_seed(53124)
687
- # Load model
688
- if device == 'cuda' and flash_attn is not None:
689
- print("Loading model with Flash Attention on GPU.")
690
- concerto_model = concerto.load("concerto_large", repo_id="Pointcept/Concerto").to(device)
691
- sonata_model = concerto.model.load("sonata", repo_id="facebook/sonata").to(device)
692
  else:
693
- print("Loading model on CPU or without Flash Attention.")
694
  custom_config = dict(
695
  # enc_patch_size=[1024 for _ in range(5)], # reduce patch size if necessary
696
  enable_flash=False,
697
  )
698
  concerto_model = concerto.load(
699
  "concerto_large", repo_id="Pointcept/Concerto", custom_config=custom_config
700
- ).to(device)
701
- sonata_model = concerto.load("sonata", repo_id="facebook/sonata", custom_config=custom_config).to(device)
702
 
703
  transform = concerto.transform.default()
704
 
705
- VGGT_model = VGGT().to(device)
706
  _URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
707
  VGGT_model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
708
  # VGGT_model.load_state_dict(torch.load("vggt/ckpt/model.pt",weights_only=True))
 
5
  from datetime import datetime
6
  import glob
7
  import gc
8
+ import spaces
9
  import gradio as gr
10
  import numpy as np
11
  import open3d as o3d
 
31
  from vggt.utils.pose_enc import pose_encoding_to_extri_intri
32
  from vggt.utils.geometry import unproject_depth_map_to_point_map
33
 
34
@spaces.GPU
def _gpu_run_vggt_inference(images_tensor):
    """
    GPU-only stage: run VGGT model inference on preprocessed images.

    Decorated with @spaces.GPU so Hugging Face Spaces allocates a GPU only
    for the duration of this call; all file I/O stays in run_model().

    Args:
        images_tensor: preprocessed image batch from load_and_preprocess_images.

    Returns:
        dict of numpy arrays with the batch dimension squeezed out, including
        "pose_enc", "depth", "extrinsic" and "intrinsic".
    """
    # No `global` statement needed: VGGT_model is only read, never rebound.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    images_tensor = images_tensor.to(device)
    model = VGGT_model.to(device)
    model.eval()

    print("Running inference...")
    with torch.no_grad():
        if device == "cuda":
            # torch.autocast replaces the deprecated torch.cuda.amp.autocast;
            # bfloat16 keeps memory low on the transient Spaces GPU.
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                predictions = model(images_tensor)
        else:
            predictions = model(images_tensor)

    print("Converting pose encoding to extrinsic and intrinsic matrices...")
    extrinsic, intrinsic = pose_encoding_to_extri_intri(predictions["pose_enc"], images_tensor.shape[-2:])
    predictions["extrinsic"] = extrinsic
    predictions["intrinsic"] = intrinsic

    # Move every tensor to CPU numpy and drop the batch dimension so the
    # results can safely leave the GPU worker.
    for key, value in predictions.items():
        if isinstance(value, torch.Tensor):
            predictions[key] = value.cpu().numpy().squeeze(0)

    # Only meaningful when CUDA was actually used.
    if device == "cuda":
        torch.cuda.empty_cache()
    return predictions
64
+
65
def run_model(target_dir) -> dict:
    """
    CPU-GPU hybrid: handle CPU-side file I/O and delegate inference to the GPU.

    Globs every file under ``target_dir/images``, preprocesses them, hands the
    batch to _gpu_run_vggt_inference, then unprojects the predicted depth maps
    into world-space points on the CPU.

    Raises:
        ValueError: if ``target_dir/images`` contains no files.
    """
    print(f"Processing images from {target_dir}")

    image_names = sorted(glob.glob(os.path.join(target_dir, "images", "*")))
    print(f"Found {len(image_names)} images")
    if not image_names:
        raise ValueError("No images found. Check your upload.")

    images = load_and_preprocess_images(image_names)
    print(f"Preprocessed images shape: {images.shape}")

    predictions = _gpu_run_vggt_inference(images)

    print("Computing world points from depth map...")
    depth_map = predictions["depth"]  # (S, H, W, 1)
    world_points = unproject_depth_map_to_point_map(
        depth_map, predictions["extrinsic"], predictions["intrinsic"]
    )
    predictions["world_points_from_depth"] = world_points

    return predictions
88
 
89
  def handle_uploads(input_file,input_video,conf_thres,frame_slider,prediction_mode,if_TSDF):
 
93
  """
94
  start_time = time.time()
95
  gc.collect()
 
96
 
97
  # Create a unique folder name
98
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
 
203
 
204
  start_time = time.time()
205
  gc.collect()
 
206
 
207
  # Prepare frame_filter dropdown
208
  target_dir_images = os.path.join(target_dir, "images")
 
212
  frame_filter_choices = ["All"] + all_files
213
 
214
  print("Running run_model...")
215
+ predictions = run_model(target_dir)
 
216
 
217
  # Save predictions
218
  prediction_save_path = os.path.join(target_dir, "predictions.npz")
 
361
  # Cleanup
362
  del predictions
363
  gc.collect()
 
364
  end_time = time.time()
365
  print(f"Total time: {end_time - start_time:.2f} seconds")
366
  return original_points, original_colors, original_normals
 
567
  color = color.clamp(0.0, 1.0)
568
  return color
569
 
570
+ @spaces.GPU
571
+ def _gpu_concerto_forward_pca(point, model_type, pca_slider, bright_slider):
572
+ """
573
+ GPU-only function: Run Concerto/Sonata model forward pass and PCA.
574
+ """
575
+ global concerto_model, sonata_model
576
+ device = "cuda" if torch.cuda.is_available() else "cpu"
577
 
578
+ for key in point.keys():
579
+ if isinstance(point[key], torch.Tensor):
580
+ point[key] = point[key].to(device, non_blocking=True)
581
+
582
+ if model_type == "Concerto":
583
+ model = concerto_model.to(device)
584
+ elif model_type == "Sonata":
585
+ model = sonata_model.to(device)
586
+ model.eval()
587
 
588
  with torch.inference_mode():
 
 
 
 
589
  concerto_start_time = time.time()
590
+ with torch.inference_mode(False):
591
+ point = model(point)
 
 
592
  concerto_end_time = time.time()
593
+
594
  # upcast point feature
 
595
  for _ in range(2):
596
  assert "pooling_parent" in point.keys()
597
  assert "pooling_inverse" in point.keys()
 
606
  parent.feat = point.feat[inverse]
607
  point = parent
608
 
 
 
 
 
 
 
 
609
  pca_start_time = time.time()
610
+ pca_color = get_pca_color(point.feat, start=pca_slider, brightness=bright_slider, center=True)
611
  pca_end_time = time.time()
612
 
613
+ original_pca_color = pca_color[point.inverse]
614
+
615
+ processed_colors = original_pca_color.cpu().detach().numpy()
616
+ point_feat = point.feat.cpu().detach().numpy()
617
  point_inverse = point.inverse.cpu().detach().numpy()
618
+ concerto_time = concerto_end_time - concerto_start_time
619
+ pca_time = pca_end_time - pca_start_time
 
 
620
 
621
+ torch.cuda.empty_cache()
622
+ return processed_colors, point_feat, point_inverse, concerto_time, pca_time
623
+
624
def Concerto_process(target_dir, original_points, original_colors, original_normals, slider_value, bright_value, model_type):
    """
    Build the transform-pipeline input, run the GPU forward + PCA stage, and
    cache the resulting features so slider moves can recolor without a new
    model forward pass.

    Returns:
        (points, colors, model_forward_seconds, pca_seconds)
    """
    pcds_dir = os.path.join(target_dir, "pcds")

    raw_point = {"coord": original_points, "color": original_colors, "normal": original_normals}
    coords_backup = raw_point["coord"].copy()
    sampled_point = transform(raw_point)

    # GPU: model forward pass + PCA coloring
    colors, feat_np, inverse_np, forward_secs, pca_secs = _gpu_concerto_forward_pca(
        sampled_point, model_type, slider_value, bright_value
    )

    # CPU: persist features so concerto_slider_update can skip the forward pass
    np.save(os.path.join(pcds_dir, "feat.npy"), feat_np)
    np.save(os.path.join(pcds_dir, "inverse.npy"), inverse_np)

    return coords_backup, colors, forward_secs, pca_secs
641
 
642
  def gradio_demo(target_dir,pca_slider,bright_slider, model_type, if_color=True, if_normal=True):
643
  target_dir_pcds = os.path.join(target_dir, "pcds")
 
661
 
662
  return processed_temp, f"Feature visualization process finished with {concerto_time:.3f} seconds using Concerto inference and {pca_time:.3f} seconds using PCA. Updating visualization."
663
 
664
@spaces.GPU
def _gpu_pca_slider_compute(feat_array, inverse_array, pca_slider, bright_slider):
    """
    GPU-only stage: recompute PCA colors from cached features when the
    PCA/brightness sliders move, without re-running the model forward pass.

    Returns:
        (colors, elapsed_seconds) where colors is a numpy array mapped back
        to the original (pre-grid-sampling) point order via the inverse index.
    """
    target = "cuda" if torch.cuda.is_available() else "cpu"
    feats = torch.tensor(feat_array, device=target)
    inverse_idx = torch.tensor(inverse_array, device=target)

    t0 = time.time()
    sampled_colors = get_pca_color(feats, start=pca_slider, brightness=bright_slider, center=True)
    full_colors = sampled_colors[inverse_idx].cpu().detach().numpy()
    elapsed = time.time() - t0
    return full_colors, elapsed
678
+
679
  def concerto_slider_update(target_dir,pca_slider,bright_slider,is_example,log_output):
680
  if is_example == "True":
681
  return None, log_output
682
  else:
683
  target_dir_pcds = os.path.join(target_dir, "pcds")
684
  if os.path.isfile(os.path.join(target_dir_pcds,"feat.npy")):
685
+ # CPU: Load data from disk
686
  feat = np.load(os.path.join(target_dir_pcds,"feat.npy"))
687
  inverse = np.load(os.path.join(target_dir_pcds,"inverse.npy"))
688
+
689
+ # GPU: Compute PCA colors
690
+ processed_colors, pca_time = _gpu_pca_slider_compute(feat, inverse, pca_slider, bright_slider)
691
+
692
+ # CPU: Build mesh
 
 
693
  processed_points = np.load(os.path.join(target_dir_pcds,"points.npy"))
694
  processed_normals = np.load(os.path.join(target_dir_pcds,"normals.npy"))
695
  processed_temp = (os.path.join(target_dir_pcds,"processed.glb"))
 
697
  feat_data = trimesh.PointCloud(vertices=processed_points, colors=processed_colors, vertex_normals=processed_normals)
698
  feat_3d.add_geometry(feat_data)
699
  feat_3d.export(processed_temp)
700
+ log_output = f"Feature visualization process finished with {pca_time:.3f} seconds using PCA. Updating visualization."
701
  else:
702
  processed_temp = None
703
  log_output = "No representations saved, please click PCA generate first."
 
704
  return processed_temp, log_output
705
 
706
# Set random seed.
# (The random seed affects PCA colors; changing it needs a manual kmeans re-adjustment.)
# (The PCA shown in the paper used a different CUDA/PyTorch environment.)
concerto.utils.set_seed(53124)
# Load models onto CPU; they are moved to the GPU on demand inside the
# @spaces.GPU-decorated functions.
if flash_attn is not None:
    print("Loading model with Flash Attention.")
    concerto_model = concerto.load("concerto_large", repo_id="Pointcept/Concerto")
    # NOTE(review): this branch loads sonata via concerto.model.load while the
    # fallback branch uses concerto.load — confirm both entry points are equivalent.
    sonata_model = concerto.model.load("sonata", repo_id="facebook/sonata")
else:
    print("Loading model without Flash Attention.")
    custom_config = dict(
        # enc_patch_size=[1024 for _ in range(5)], # reduce patch size if necessary
        enable_flash=False,
    )
    concerto_model = concerto.load(
        "concerto_large", repo_id="Pointcept/Concerto", custom_config=custom_config
    )
    sonata_model = concerto.load("sonata", repo_id="facebook/sonata", custom_config=custom_config)

# Default preprocessing pipeline; it grid-samples the cloud, and its `inverse`
# indices are later used to map features back to the original point order.
transform = concerto.transform.default()

# VGGT is instantiated on CPU here; _gpu_run_vggt_inference moves it to GPU.
VGGT_model = VGGT()
_URL = "https://huggingface.co/facebook/VGGT-1B/resolve/main/model.pt"
VGGT_model.load_state_dict(torch.hub.load_state_dict_from_url(_URL))
# VGGT_model.load_state_dict(torch.load("vggt/ckpt/model.pt",weights_only=True))