Spaces:

oxkitsune
/

rerun-ml-depth-pro

Running

App Files Community

oxkitsune committed on Oct 18, 2024

Commit

4be800e

1 Parent(s): 0dd7180

batch model inference

Browse files

Files changed (1) hide show

app.py +65 -52

app.py CHANGED Viewed

@@ -42,16 +42,23 @@ def resize_image(image_buffer, max_size=256):
 @spaces.GPU(duration=20)
-def predict_depth(input_image):
     # Preprocess the image
-    result = depth_pro.load_rgb(input_image)
-    image = result[0]
-    f_px = result[-1]  # Assuming f_px is the last item in the returned tuple
-    image = transform(image)
-    image = image.to(device)
     # Run inference
-    prediction = model.infer(image, f_px=f_px)
     depth = prediction["depth"]  # Depth in [m]
     focallength_px = prediction["focallength_px"]  # Focal length in pixels
@@ -107,62 +114,68 @@ def run_rerun(path_to_video):
     # limit the number of frames to 10 seconds of video
     max_frames = min(10 * fps_video, num_frames)
-    for i in range(len(frame_timestamps_ns)):
         if i >= max_frames:
             raise gr.Error("Reached the maximum number of frames to process")
-        ret, frame = cap.read()
-        if not ret:
-            break
-        temp_file = None
         try:
-            # Resize the image to make the inference faster
-            temp_file = resize_image(frame, max_size=256)
-            depth, focal_length = predict_depth(temp_file)
-            # find x and y scale factors, which can be applied to image
-            x_scale = depth.shape[1] / frame.shape[1]
-            y_scale = depth.shape[0] / frame.shape[0]
-            rr.set_time_nanos("video_time", frame_timestamps_ns[i])
-            rr.log(
-                "world/camera/depth",
-                rr.DepthImage(depth, meter=1),
-            )
-            rr.log(
-                "world/camera/frame",
-                rr.VideoFrameReference(
-                    timestamp=rr.components.VideoTimestamp(
-                        nanoseconds=frame_timestamps_ns[i]
                     ),
-                    video_reference="world/video",
-                ),
-                rr.Transform3D(scale=(x_scale, y_scale, 1)),
-            )
-            rr.log(
-                "world/camera",
-                rr.Pinhole(
-                    focal_length=focal_length,
-                    width=depth.shape[1],
-                    height=depth.shape[0],
-                    principal_point=(depth.shape[1] / 2, depth.shape[0] / 2),
-                    camera_xyz=rr.ViewCoordinates.FLU,
-                    image_plane_distance=depth.max(),
-                ),
-            )
-            yield stream.read()
         except Exception as e:
             raise gr.Error(f"An error has occurred: {e}")
         finally:
-            # Clean up the temporary file
-            if temp_file and os.path.exists(temp_file):
-                os.remove(temp_file)
     yield stream.read()

 @spaces.GPU(duration=20)
+def predict_depth(input_images):
     # Preprocess the image
+    results = [depth_pro.load_rgb(image) for image in input_images]
+    # assume load_rgb returns a tuple of (image, f_px)
+    # stack the images and f_px values into tensors
+    images, f_px = zip(*results)
+    images = torch.stack(images)
+    f_px = torch.tensor(f_px)
+    images = transform(images)
+    images = images.to(device)
+    f_px = f_px.to(device)
     # Run inference
+    prediction = model.infer(images, f_px=f_px)
     depth = prediction["depth"]  # Depth in [m]
     focallength_px = prediction["focallength_px"]  # Focal length in pixels
     # limit the number of frames to 10 seconds of video
     max_frames = min(10 * fps_video, num_frames)
+    batch_size = min(16, max_frames)
+    # go through all the frames in the video, using the batch size
+    for i in range(0, int(max_frames), batch_size):
         if i >= max_frames:
             raise gr.Error("Reached the maximum number of frames to process")
+        frames = []
+        for _ in range(batch_size):
+            ret, frame = cap.read()
+            if not ret:
+                break
+            frames.append(frame)
+        temp_files = []
         try:
+            # Resize the images to make the inference faster
+            temp_files = [resize_image(frame, max_size=256) for frame in frames]
+            depths, focal_lengths = predict_depth(temp_files)
+            for depth, focal_length in zip(depths, focal_lengths):
+                # find x and y scale factors, which can be applied to image
+                x_scale = depth.shape[1] / frames[0].shape[1]
+                y_scale = depth.shape[0] / frames[0].shape[0]
+                rr.log(
+                    "world/camera/depth",
+                    rr.DepthImage(depth, meter=1),
+                )
+                rr.log(
+                    "world/camera/frame",
+                    rr.VideoFrameReference(
+                        timestamp=rr.components.VideoTimestamp(
+                            nanoseconds=frame_timestamps_ns[i]
+                        ),
+                        video_reference="world/video",
                     ),
+                    rr.Transform3D(scale=(x_scale, y_scale, 1)),
+                )
+                rr.log(
+                    "world/camera",
+                    rr.Pinhole(
+                        focal_length=focal_length,
+                        width=depth.shape[1],
+                        height=depth.shape[0],
+                        principal_point=(depth.shape[1] / 2, depth.shape[0] / 2),
+                        camera_xyz=rr.ViewCoordinates.FLU,
+                        image_plane_distance=depth.max(),
+                    ),
+                )
+                yield stream.read()
         except Exception as e:
             raise gr.Error(f"An error has occurred: {e}")
         finally:
+            # Clean up the temporary files
+            for temp_file in temp_files:
+                if temp_file and os.path.exists(temp_file):
+                    os.remove(temp_file)
     yield stream.read()