Spaces:

niye4
/

depthmap

Build error

App Files Community

niye4 committed on Dec 3, 2025

Commit

9712ea3

verified ·

1 Parent(s): e6f85b6

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -53

app.py CHANGED Viewed

@@ -13,12 +13,13 @@ from depth_anything_v2.dpt import DepthAnythingV2
 # Configuration
 # -------------------
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
-CHECKPOINT = "checkpoints/depth_anything_v2_vitb.pth"  # expect this to exist
 WORKDIR = "workspace"
 FRAMES_DIR = os.path.join(WORKDIR, "frames")
 OUT_FRAMES_DIR = os.path.join(WORKDIR, "depth_frames")
 RAW_FRAMES_DIR = os.path.join(WORKDIR, "raw16")
 OUTPUT_DIR = "output"
 os.makedirs(FRAMES_DIR, exist_ok=True)
 os.makedirs(OUT_FRAMES_DIR, exist_ok=True)
 os.makedirs(RAW_FRAMES_DIR, exist_ok=True)
@@ -27,22 +28,27 @@ os.makedirs(OUTPUT_DIR, exist_ok=True)
 # -------------------
 # Load model (vitb)
 # -------------------
-model = DepthAnythingV2(encoder='vitb', features=128, out_channels=[96,192,384,768])
 state_dict = torch.load(CHECKPOINT, map_location="cpu")
 model.load_state_dict(state_dict)
 model = model.to(DEVICE).eval()
 def predict_depth(frame_rgb):
-    """Return depth map as float32 numpy array (same semantics as original app.py)."""
     return model.infer_image(frame_rgb).astype(np.float32)
 def depth_to_gray8(depth):
-    """Normalize depth to 0-255 uint8 for preview (same formula as original app.py)."""
     dmin, dmax = float(depth.min()), float(depth.max())
     if dmax - dmin < 1e-8:
-        norm = np.zeros_like(depth, dtype=np.uint8)
-    else:
-        norm = ((depth - dmin) / (dmax - dmin) * 255.0).astype(np.uint8)
     return norm
 def clear_workspace():
@@ -52,104 +58,93 @@ def clear_workspace():
     os.makedirs(RAW_FRAMES_DIR, exist_ok=True)
 # -------------------
-# Main pipeline
 # -------------------
 def process_video(video_file):
-    """
-    1) ffmpeg extract frames -> workspace/frames/frame_000001.png ...
-    2) for each frame:
-         - run model.infer_image on RGB frame
-         - save raw 16-bit PNG to workspace/raw16/frame_XXXXXX.png
-         - save normalized 8-bit PNG to workspace/depth_frames/frame_XXXXXX.png
-    3) ffmpeg merge workspace/depth_frames/frame_%06d.png -> output MP4 (same fps)
-    Returns: list of preview PIL images (sampled) and output video path
-    """
     clear_workspace()
-    # copy input to workspace (avoids /tmp path issues)
     in_path = os.path.join(WORKDIR, "input.mp4")
     shutil.copy(video_file.name, in_path)
-    # read fps and ensure video can be opened
     cap = cv2.VideoCapture(in_path)
     if not cap.isOpened():
         raise RuntimeError("Cannot open uploaded video.")
     fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
     cap.release()
-    # 1) extract frames with ffmpeg as PNG (lossless)
-    extract_cmd = [
         "ffmpeg", "-y",
         "-i", in_path,
         os.path.join(FRAMES_DIR, "frame_%06d.png")
-    ]
-    subprocess.run(extract_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
     frame_files = sorted(os.listdir(FRAMES_DIR))
     if len(frame_files) == 0:
-        raise RuntimeError("No frames extracted from video (ffmpeg step failed).")
     preview_images = []
     total = len(frame_files)
-    sample_step = max(1, total // 20)  # max ~20 preview images
-    # 2) run per-frame inference and save raw16 + preview8
     for i, fname in enumerate(frame_files):
         fp = os.path.join(FRAMES_DIR, fname)
-        # read frame (BGR) -> convert to RGB for model
         bgr = cv2.imread(fp, cv2.IMREAD_COLOR)
-        if bgr is None:
-            raise RuntimeError(f"Failed to read extracted frame {fp}")
         rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
-        # predict
-        depth = predict_depth(rgb)  # float32 numpy
-        # save raw 16-bit (like app.py original: uint16 PNG)
         raw16 = depth.astype(np.uint16)
-        raw_out_path = os.path.join(RAW_FRAMES_DIR, fname)
-        Image.fromarray(raw16).save(raw_out_path)  # PIL will write 16-bit PNG
-        # normalized 8-bit preview (exact same normalization as original app)
         gray8 = depth_to_gray8(depth)
-        preview_out_path = os.path.join(OUT_FRAMES_DIR, fname)
-        Image.fromarray(gray8).save(preview_out_path)
-        # sample for gallery preview
         if i % sample_step == 0:
             preview_images.append(Image.fromarray(gray8))
-    # 3) merge preview frames into MP4 using ffmpeg (use libx264 for compatibility)
-    out_video = os.path.join(OUTPUT_DIR, os.path.basename(video_file.name).replace(".mp4","_depth.mp4"))
-    merge_cmd = [
         "ffmpeg", "-y",
         "-framerate", str(fps),
         "-i", os.path.join(OUT_FRAMES_DIR, "frame_%06d.png"),
         "-c:v", "libx264",
         "-pix_fmt", "yuv420p",
         out_video
-    ]
-    subprocess.run(merge_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
-    # return gallery (list of PIL images) and output path
     return preview_images, out_video
 # -------------------
-# Gradio UI
 # -------------------
 with gr.Blocks() as demo:
-    gr.Markdown("# Depth Anything V2 — Video (framewise, app.py-style, high-quality)")
     gr.Markdown(
-        "This pipeline extracts frames with ffmpeg, runs the original DepthAnythingV2 per-frame inference, "
-        "saves raw 16-bit PNGs (in workspace/raw16) and normalized 8-bit PNG previews (in workspace/depth_frames), "
-        "then merges previews into an MP4 at the original FPS. This reproduces the app.py image quality for video."
     )
-    video_in = gr.File(label="Upload MP4", file_types=[".mp4"])
-    gallery = gr.Gallery(label="Preview frames (sampled)").style(grid=5)
-    out_video = gr.Video(label="Depthmap Video (downloadable)")
-    btn = gr.Button("Render (framewise, high-quality)")
     btn.click(process_video, inputs=[video_in], outputs=[gallery, out_video])
 if __name__ == "__main__":

 # Configuration
 # -------------------
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+CHECKPOINT = "checkpoints/depth_anything_v2_vitb.pth"  # vitb only
 WORKDIR = "workspace"
 FRAMES_DIR = os.path.join(WORKDIR, "frames")
 OUT_FRAMES_DIR = os.path.join(WORKDIR, "depth_frames")
 RAW_FRAMES_DIR = os.path.join(WORKDIR, "raw16")
 OUTPUT_DIR = "output"
 os.makedirs(FRAMES_DIR, exist_ok=True)
 os.makedirs(OUT_FRAMES_DIR, exist_ok=True)
 os.makedirs(RAW_FRAMES_DIR, exist_ok=True)
 # -------------------
 # Load model (vitb)
 # -------------------
+model = DepthAnythingV2(
+    encoder='vitb',
+    features=128,
+    out_channels=[96, 192, 384, 768]
+)
 state_dict = torch.load(CHECKPOINT, map_location="cpu")
 model.load_state_dict(state_dict)
 model = model.to(DEVICE).eval()
+# -------------------
+# Depth functions
+# -------------------
 def predict_depth(frame_rgb):
+    """Return depth map float32 like original image app.py."""
     return model.infer_image(frame_rgb).astype(np.float32)
 def depth_to_gray8(depth):
     dmin, dmax = float(depth.min()), float(depth.max())
     if dmax - dmin < 1e-8:
+        return np.zeros_like(depth, dtype=np.uint8)
+    norm = ((depth - dmin) / (dmax - dmin) * 255.0).astype(np.uint8)
     return norm
 def clear_workspace():
     os.makedirs(RAW_FRAMES_DIR, exist_ok=True)
 # -------------------
+# Main Processing
 # -------------------
 def process_video(video_file):
+    """Extract → Infer each frame → Save → Merge → Return MP4 + preview frames."""
     clear_workspace()
+    # Copy video to workspace
     in_path = os.path.join(WORKDIR, "input.mp4")
     shutil.copy(video_file.name, in_path)
+    # Read FPS
     cap = cv2.VideoCapture(in_path)
     if not cap.isOpened():
         raise RuntimeError("Cannot open uploaded video.")
     fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
     cap.release()
+    # Extract PNG frames (lossless)
+    subprocess.run([
         "ffmpeg", "-y",
         "-i", in_path,
         os.path.join(FRAMES_DIR, "frame_%06d.png")
+    ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
     frame_files = sorted(os.listdir(FRAMES_DIR))
     if len(frame_files) == 0:
+        raise RuntimeError("No frames extracted.")
     preview_images = []
     total = len(frame_files)
+    sample_step = max(1, total // 20)
+    # Process frames
     for i, fname in enumerate(frame_files):
         fp = os.path.join(FRAMES_DIR, fname)
         bgr = cv2.imread(fp, cv2.IMREAD_COLOR)
         rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
+        depth = predict_depth(rgb)
+        # Save raw 16-bit PNG
         raw16 = depth.astype(np.uint16)
+        Image.fromarray(raw16).save(os.path.join(RAW_FRAMES_DIR, fname))
+        # Save normalized grayscale preview
         gray8 = depth_to_gray8(depth)
+        Image.fromarray(gray8).save(os.path.join(OUT_FRAMES_DIR, fname))
         if i % sample_step == 0:
             preview_images.append(Image.fromarray(gray8))
+    # Merge video using ffmpeg
+    out_video = os.path.join(
+        OUTPUT_DIR,
+        os.path.basename(video_file.name).replace(".mp4", "_depth.mp4")
+    )
+    subprocess.run([
         "ffmpeg", "-y",
         "-framerate", str(fps),
         "-i", os.path.join(OUT_FRAMES_DIR, "frame_%06d.png"),
         "-c:v", "libx264",
         "-pix_fmt", "yuv420p",
         out_video
+    ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
     return preview_images, out_video
 # -------------------
+# UI
 # -------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# Depth Anything V2 – High-Quality Video Depth (Frame-wise)")
     gr.Markdown(
+        "This reproduces the **exact image quality** of the official Depth Anything V2 app.py, "
+        "but applied **frame-by-frame** to video using ffmpeg for perfect sharpness."
     )
+    video_in = gr.File(label="Upload a video (mp4)", file_types=[".mp4"])
+    gallery = gr.Gallery(
+        label="Preview Depth Frames",
+        columns=5,
+        height="auto"
+    )
+    out_video = gr.Video(label="Depthmap Video Output")
+    btn = gr.Button("Render High-Quality Depth Video")
     btn.click(process_video, inputs=[video_in], outputs=[gallery, out_video])
 if __name__ == "__main__":