batch model inference

app.py CHANGED
@@ -43,24 +43,15 @@ def resize_image(image_buffer, max_size=256):
 
 @spaces.GPU(duration=20)
 def predict_depth(input_images):
-    # Preprocess the image
     results = [depth_pro.load_rgb(image) for image in input_images]
-
-    # assume load_rgb returns a tuple of (image, f_px)
-    # stack the images and f_px values into tensors
-    images, f_px = zip(*results)
-    images = torch.stack(images)
-    f_px = torch.tensor(f_px)
-
-    images = transform(images)
-
+    images = torch.stack([transform(result[0]) for result in results])
     images = images.to(device)
-    f_px = f_px.to(device)
 
     # Run inference
-
-
-
+    with torch.no_grad():
+        prediction = model.infer(images)
+        depth = prediction["depth"]  # Depth in [m]
+        focallength_px = prediction["focallength_px"]  # Focal length in pixels
 
     # Convert depth to numpy array if it's a torch tensor
     if isinstance(depth, torch.Tensor):
@@ -68,9 +59,9 @@ def predict_depth(input_images):
 
     # Convert focal length to a float if it's a torch tensor
     if isinstance(focallength_px, torch.Tensor):
-        focallength_px = focallength_px.item()
+        focallength_px = [focal_length.item() for focal_length in focallength_px]
 
-    # Ensure depth is a 2D array
+    # Ensure depth is a BxHxW tensor
     if depth.ndim != 2:
         depth = depth.squeeze()
 
@@ -114,7 +105,13 @@ def run_rerun(path_to_video):
 
     # limit the number of frames to 10 seconds of video
    max_frames = min(10 * fps_video, num_frames)
-
+
+    torch.cuda.empty_cache()
+    free_vram, _ = torch.cuda.mem_get_info(device)
+    free_vram = free_vram / 1024 / 1024 / 1024
+
+    # batch size is determined by the amount of free vram
+    batch_size = int(min(free_vram // 4, max_frames))
 
     # go through all the frames in the video, using the batch size
     for i in range(0, int(max_frames), batch_size):
@@ -122,6 +119,7 @@ def run_rerun(path_to_video):
             raise gr.Error("Reached the maximum number of frames to process")
 
         frames = []
+        frame_indices = list(range(i, min(i + batch_size, int(max_frames))))
         for _ in range(batch_size):
             ret, frame = cap.read()
             if not ret:
@@ -135,11 +133,14 @@ def run_rerun(path_to_video):
 
         depths, focal_lengths = predict_depth(temp_files)
 
-        for depth, focal_length in zip(depths, focal_lengths):
+        for depth, focal_length, frame_idx in zip(
+            depths, focal_lengths, frame_indices
+        ):
             # find x and y scale factors, which can be applied to image
             x_scale = depth.shape[1] / frames[0].shape[1]
             y_scale = depth.shape[0] / frames[0].shape[0]
 
+            rr.set_time_nanos("video_time", frame_timestamps_ns[frame_idx])
             rr.log(
                 "world/camera/depth",
                 rr.DepthImage(depth, meter=1),
@@ -149,7 +150,7 @@ def run_rerun(path_to_video):
                 "world/camera/frame",
                 rr.VideoFrameReference(
                     timestamp=rr.components.VideoTimestamp(
-                        nanoseconds=frame_timestamps_ns[
+                        nanoseconds=frame_timestamps_ns[frame_idx]
                     ),
                     video_reference="world/video",
                 ),
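The hunks above use model, transform, and device without showing where they come from; they are defined earlier in app.py. For orientation only, a minimal setup in the style of the Depth Pro README could look like the following (the names and arguments here are assumptions, not part of this change):

import torch
import depth_pro

# Pick the GPU when one is available; the Space runs predict_depth under @spaces.GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the Depth Pro network and its matching preprocessing transform once at startup.
model, transform = depth_pro.create_model_and_transforms()
model = model.to(device)
model.eval()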
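After this change predict_depth takes a list of image paths, stacks the transformed images into one batch, and runs a single model.infer call, so focallength_px becomes a list with one float per image. The function's return statement sits outside the hunks shown, so the sketch below only illustrates the assumed contract that run_rerun relies on (depths iterable over per-frame depth maps, focal_lengths a list of floats); the frame paths are placeholders:

# Hypothetical call site for the batched helper defined in app.py.
temp_files = ["frame_000.png", "frame_001.png", "frame_002.png"]

depths, focal_lengths = predict_depth(temp_files)

for path, depth, f_px in zip(temp_files, depths, focal_lengths):
    # Each depth map is HxW (metres), each focal length a plain Python float.
    print(f"{path}: depth {depth.shape}, focal length {f_px:.1f} px")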
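The new batch-size logic reads free VRAM with torch.cuda.mem_get_info (which returns free and total bytes), converts it to GiB, and budgets roughly 4 GiB per frame, capped at the number of frames to process. Note that free_vram // 4 evaluates to 0 when less than 4 GiB is free, which would make the range step invalid; the standalone sketch below restates the same heuristic with a floor of one frame and a CPU fallback, both added here as assumptions:

import torch

def pick_batch_size(device: torch.device, max_frames: int, gib_per_frame: float = 4.0) -> int:
    # Estimate how many frames fit in free VRAM, assuming ~4 GiB per frame as in app.py.
    if device.type != "cuda":
        return 1  # no VRAM to query on CPU (assumed fallback; the Space always runs on GPU)
    torch.cuda.empty_cache()
    free_bytes, _total_bytes = torch.cuda.mem_get_info(device)
    free_gib = free_bytes / 1024 / 1024 / 1024
    # At least one frame per batch, at most the whole clip.
    return int(max(1, min(free_gib // gib_per_frame, max_frames)))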
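Because inference now happens in batches, frame_indices is what maps each predicted depth map back to its source frame so the Rerun stream stays in sync with the video: rr.set_time_nanos pins every entity logged afterwards to that frame's timestamp, and the VideoFrameReference points at the same instant of the video asset logged at world/video. A condensed sketch of the per-batch logging, with the entity paths and timeline name taken from the diff and the rest assumed:

import rerun as rr

def log_batch(depths, focal_lengths, frame_indices, frame_timestamps_ns):
    # Log one batch of predictions on the same timeline as the source video.
    for depth, focal_length, frame_idx in zip(depths, focal_lengths, frame_indices):
        # focal_length is consumed elsewhere in app.py; it is unused in this sketch.
        # Everything logged below is stamped with this frame's presentation time.
        rr.set_time_nanos("video_time", frame_timestamps_ns[frame_idx])
        rr.log("world/camera/depth", rr.DepthImage(depth, meter=1))
        rr.log(
            "world/camera/frame",
            rr.VideoFrameReference(
                timestamp=rr.components.VideoTimestamp(
                    nanoseconds=frame_timestamps_ns[frame_idx]
                ),
                video_reference="world/video",
            ),
        )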