verymehari commited on
Commit
f8cbdb8
·
verified ·
1 Parent(s): b249e40

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.md +7 -30
  2. app.py +89 -220
  3. requirements.txt +7 -6
README.md CHANGED
@@ -1,39 +1,16 @@
1
  ---
2
- title: DepthShift · Depth Generator
3
- emoji: 📐
4
  colorFrom: gray
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: "4.0.0"
8
  app_file: app.py
9
  pinned: false
10
- license: apache-2.0
11
  ---
12
 
13
- # DepthShift · Depth Map Generator
14
 
15
- Part of the [Spatial Index](https://spatial-index.vercel.app) pipeline — **scene_03 · DEPTHSHIFT**.
16
 
17
- Upload any MP4 video and get back a grayscale depth map video powered by **Depth Anything V2**, ready to drop straight into [DepthShift](https://project-depthshift.vercel.app) for 3D particle field visualization in WebXR.
18
-
19
- ## How it works
20
-
21
- 1. Upload your video
22
- 2. Adjust FPS and frame count
23
- 3. Hit **GENERATE DEPTH**
24
- 4. Download the depth video or PNG
25
- 5. Open DepthShift and upload both files → instant 3D point cloud
26
-
27
- ## Stack
28
-
29
- - **Depth Anything V2 Small** — fast, accurate monocular depth estimation
30
- - **Gradio + ZeroGPU** — free GPU inference on HuggingFace
31
- - **OpenCV** — frame extraction and video encoding
32
-
33
- ## Output formats
34
-
35
- | File | Use |
36
- |------|-----|
37
- | `depth.mp4` | Grayscale depth video → feed into DepthShift |
38
- | `sbs.mp4` | Side-by-side preview (original + Inferno colormap) |
39
- | `depth_frame0.png` | First frame depth PNG → for static DepthShift tests |
 
1
  ---
2
+ title: DepthShift Depth Generator
3
+ emoji: 📐
4
  colorFrom: gray
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 3.50.2
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
+ # DepthShift Depth Map Generator
13
 
14
+ Upload any MP4 and get a grayscale depth map video back, ready to load into [Spatial Index / DepthShift](https://spatial-index.vercel.app).
15
 
16
+ Powered by [Depth Anything V2 Small](https://huggingface.co/depth-anything/Depth-Anything-V2-Small-hf). Runs on CPU — keep frames ≤ 30 for reasonable speed.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -2,276 +2,145 @@ import gradio as gr
2
  import torch
3
  import numpy as np
4
  import cv2
5
- import tempfile
6
- import os
7
  from PIL import Image
8
  from transformers import AutoImageProcessor, AutoModelForDepthEstimation
 
 
9
 
10
- # ── Model Small + half-res inference = tolerable on CPU ────────────────────
11
- MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"
12
- print(f"Loading {MODEL_ID} on CPU…")
13
- processor = AutoImageProcessor.from_pretrained(MODEL_ID)
14
- model = AutoModelForDepthEstimation.from_pretrained(MODEL_ID)
15
  model.eval()
16
- print("Model ready.")
17
 
18
- INFER_SIZE = 256 # shrink frames before inference — big speed win on CPU
19
 
20
-
21
- # ── Inference ─────────────────────────────────────────────────────────────────
22
- def depth_frame(frame_bgr: np.ndarray, orig_w: int, orig_h: int) -> np.ndarray:
23
- """Run depth on a downscaled frame, upsample result back to original size."""
24
- small = cv2.resize(frame_bgr, (INFER_SIZE, INFER_SIZE))
25
- image = Image.fromarray(cv2.cvtColor(small, cv2.COLOR_BGR2RGB))
26
- inputs = processor(images=image, return_tensors="pt")
27
  with torch.no_grad():
28
  outputs = model(**inputs)
29
- post = processor.post_process_depth_estimation(
30
- outputs, target_sizes=[(INFER_SIZE, INFER_SIZE)]
31
- )
32
- depth = post[0]["predicted_depth"].numpy()
33
- depth_norm = (depth - depth.min()) / (depth.max() - depth.min() + 1e-8)
34
- depth_small = (depth_norm * 255).astype(np.uint8)
35
- # Upsample back to original resolution
36
- return cv2.resize(depth_small, (orig_w, orig_h), interpolation=cv2.INTER_LINEAR)
37
 
38
 
39
- def process_video(video_path: str, fps_out: int, max_frames: int, progress=gr.Progress()):
40
  if video_path is None:
41
- raise gr.Error("Please upload a video first.")
42
 
43
  cap = cv2.VideoCapture(video_path)
44
- total_src = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 
 
 
 
45
  w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
46
  h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
47
 
48
- if total_src == 0 or w == 0 or h == 0:
49
- raise gr.Error("Could not read video. Please upload a valid MP4.")
50
-
51
- step = max(1, total_src // max_frames)
52
- frame_indices = list(range(0, total_src, step))[:max_frames]
53
 
 
54
  tmp_dir = tempfile.mkdtemp()
55
  depth_path = os.path.join(tmp_dir, "depth.mp4")
56
- sbs_path = os.path.join(tmp_dir, "sbs.mp4")
57
- png_path = os.path.join(tmp_dir, "depth_frame0.png")
58
 
59
  fourcc = cv2.VideoWriter_fourcc(*"mp4v")
60
  depth_writer = cv2.VideoWriter(depth_path, fourcc, fps_out, (w, h), isColor=False)
61
- sbs_writer = cv2.VideoWriter(sbs_path, fourcc, fps_out, (w * 2, h))
62
 
63
- first_saved = False
64
- n = len(frame_indices)
65
 
66
  for i, idx in enumerate(frame_indices):
67
- progress(i / n, desc=f"Frame {i+1}/{n} — this takes ~1–2s per frame on CPU")
68
  cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
69
- ret, frame = cap.read()
70
  if not ret:
71
  continue
72
 
73
- depth_gray = depth_frame(frame, w, h)
74
- depth_writer.write(depth_gray)
75
- depth_color = cv2.applyColorMap(depth_gray, cv2.COLORMAP_INFERNO)
76
- sbs_writer.write(np.hstack([frame, depth_color]))
77
-
78
- if not first_saved:
79
- cv2.imwrite(png_path, depth_gray)
80
- first_saved = True
81
-
82
- cap.release()
83
- depth_writer.release()
84
- sbs_writer.release()
85
-
86
- return depth_path, sbs_path, png_path
87
 
 
 
88
 
89
- # ── UI ────────────────────────────────────────────────────────────────────────
90
- CSS = """
91
- @import url('https://fonts.googleapis.com/css2?family=Space+Mono:ital,wght@0,400;0,700;1,400&family=Syne:wght@400;700;800&display=swap');
92
 
93
- :root {
94
- --bg: #080808;
95
- --surface: #0f0f0f;
96
- --border: #1e1e1e;
97
- --accent: #c8ff00;
98
- --dim: #444;
99
- --text: #e8e8e8;
100
- --mono: 'Space Mono', monospace;
101
- --sans: 'Syne', sans-serif;
102
- }
103
 
104
- body, .gradio-container {
105
- background: var(--bg) !important;
106
- color: var(--text) !important;
107
- font-family: var(--mono) !important;
108
- }
109
-
110
- #header {
111
- border-bottom: 1px solid var(--border);
112
- padding: 2rem 0 1.5rem;
113
- margin-bottom: 2rem;
114
- }
115
- #header h1 {
116
- font-family: var(--sans) !important;
117
- font-weight: 800 !important;
118
- font-size: 2rem !important;
119
- letter-spacing: -0.02em;
120
- color: var(--text) !important;
121
- margin: 0 0 0.25rem !important;
122
- }
123
- #header p {
124
- font-size: 0.72rem;
125
- color: var(--dim);
126
- letter-spacing: 0.12em;
127
- text-transform: uppercase;
128
- margin: 0;
129
- }
130
- #live-dot {
131
- display: inline-block;
132
- width: 6px; height: 6px;
133
- background: var(--accent);
134
- border-radius: 50%;
135
- margin-right: 6px;
136
- animation: pulse 2s ease-in-out infinite;
137
- }
138
- @keyframes pulse {
139
- 0%, 100% { opacity: 1; }
140
- 50% { opacity: 0.3; }
141
- }
142
-
143
- .panel {
144
- background: var(--surface) !important;
145
- border: 1px solid var(--border) !important;
146
- border-radius: 2px !important;
147
- padding: 1.25rem !important;
148
- }
149
-
150
- label span, .label-wrap span {
151
- font-family: var(--mono) !important;
152
- font-size: 0.68rem !important;
153
- letter-spacing: 0.1em !important;
154
- text-transform: uppercase !important;
155
- color: var(--dim) !important;
156
- }
157
 
158
- input[type=range] { accent-color: var(--accent); }
 
 
 
 
159
 
160
- #run-btn {
161
- background: var(--accent) !important;
162
- color: #000 !important;
163
- font-family: var(--mono) !important;
164
- font-weight: 700 !important;
165
- font-size: 0.75rem !important;
166
- letter-spacing: 0.15em !important;
167
- text-transform: uppercase !important;
168
- border: none !important;
169
- border-radius: 1px !important;
170
- padding: 0.85rem 2rem !important;
171
- cursor: pointer;
172
- transition: opacity 0.15s;
173
- width: 100%;
174
- }
175
- #run-btn:hover { opacity: 0.85; }
176
 
177
- .timing-note {
178
- margin-top: 1.5rem;
179
- padding: 1rem;
180
- border: 1px solid #1e1e1e;
181
- font-size: 0.65rem;
182
- line-height: 2;
183
- color: #444;
184
- letter-spacing: 0.05em;
185
- }
186
 
187
- .next-steps {
188
- margin-top: 1rem;
189
- padding: 0.75rem 1rem;
190
- background: #0a0a0a;
191
- border: 1px solid #1a1a1a;
192
- font-size: 0.65rem;
193
- color: #555;
194
- letter-spacing: 0.05em;
195
- line-height: 2.2;
196
- }
197
 
198
- #footer-note {
199
- font-size: 0.65rem;
200
- color: var(--dim);
201
- letter-spacing: 0.08em;
202
- text-transform: uppercase;
203
- border-top: 1px solid var(--border);
204
- padding-top: 1.5rem;
205
- margin-top: 2rem;
206
- }
207
- #footer-note a { color: var(--accent); text-decoration: none; }
208
  """
209
 
210
- with gr.Blocks(css=CSS, title="DepthShift · Depth Generator") as demo:
211
-
212
  gr.HTML("""
213
- <div id="header">
214
- <h1><span id="live-dot"></span>DepthShift</h1>
215
- <p>scene_03 · Depth Map Generator · Depth Anything V2 · CPU · Free</p>
216
- </div>
 
 
217
  """)
218
 
219
  with gr.Row():
220
- with gr.Column(scale=1, elem_classes="panel"):
221
- gr.HTML('<div style="padding:0 0 0.5rem; font-size:0.68rem; letter-spacing:0.1em; text-transform:uppercase; color:#444;">01 · Upload</div>')
222
- video_in = gr.Video(label="Source Video (MP4)", sources=["upload"])
223
-
224
- gr.HTML('<div style="padding:1rem 0 0.5rem; font-size:0.68rem; letter-spacing:0.1em; text-transform:uppercase; color:#444;">02 · Parameters</div>')
225
- fps_slider = gr.Slider(minimum=6, maximum=24, value=8, step=1, label="Output FPS")
226
- frames_slider = gr.Slider(minimum=10, maximum=60, value=30, step=5,
227
- label="Max Frames (keep low on CPU)")
228
-
229
- gr.HTML('<div style="padding:0.75rem 0"></div>')
230
- run_btn = gr.Button("GENERATE DEPTH →", elem_id="run-btn")
231
-
232
- gr.HTML("""
233
- <div class="timing-note">
234
- MODEL · Depth Anything V2 Small<br>
235
- BACKEND · CPU · <b style="color:#c8ff00">Free forever</b><br>
236
- SPEED · ~1–2s per frame<br>
237
- 30 frames ≈ 1–2 min total<br>
238
- TIP · Keep max frames ≤ 30 for fast results
239
- </div>
240
- """)
241
-
242
- with gr.Column(scale=2, elem_classes="panel"):
243
- gr.HTML('<div style="padding:0 0 0.5rem; font-size:0.68rem; letter-spacing:0.1em; text-transform:uppercase; color:#444;">03 · Outputs</div>')
244
-
245
- with gr.Tabs():
246
- with gr.Tab("Side-by-Side Preview"):
247
- sbs_out = gr.Video(label="Original · Depth (Inferno)", autoplay=True)
248
- with gr.Tab("Depth Map (Grayscale)"):
249
- depth_out = gr.Video(label="Depth Video — feed into DepthShift", autoplay=True)
250
- with gr.Tab("Frame PNG"):
251
- png_out = gr.Image(label="First Frame Depth PNG")
252
-
253
- gr.HTML("""
254
- <div class="next-steps">
255
- ① Download the <b style="color:#c8ff00">Depth Video</b> or <b style="color:#c8ff00">Frame PNG</b><br>
256
- ② Open <a href="https://project-depthshift.vercel.app" target="_blank"
257
- style="color:#c8ff00">project-depthshift.vercel.app</a><br>
258
- ③ Upload source video + depth map → 3D particle field in WebXR
259
- </div>
260
- """)
261
 
262
  gr.HTML("""
263
- <div id="footer-note">
264
- Built for <a href="https://spatial-index.vercel.app">Spatial Index</a> ·
265
- Model: <a href="https://huggingface.co/depth-anything/Depth-Anything-V2-Small-hf">Depth Anything V2 Small</a> ·
266
- Apache-2.0 · $0.00
267
- </div>
 
 
268
  """)
269
 
270
  run_btn.click(
271
  fn=process_video,
272
  inputs=[video_in, fps_slider, frames_slider],
273
- outputs=[depth_out, sbs_out, png_out],
274
  )
275
 
276
- if __name__ == "__main__":
277
- demo.launch()
 
2
  import torch
3
  import numpy as np
4
  import cv2
 
 
5
  from PIL import Image
6
  from transformers import AutoImageProcessor, AutoModelForDepthEstimation
7
+ import tempfile
8
+ import os
9
 
10
# ── Model setup ──────────────────────────────────────────────────────────────
# Loaded once at import time; inference runs on CPU (no .to("cuda") anywhere).
_MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"
print("Loading Depth Anything V2 Small...")
processor = AutoImageProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForDepthEstimation.from_pretrained(_MODEL_ID)
model.eval()
print("Model loaded.")
16
 
 
17
 
18
def estimate_depth(frame_rgb: np.ndarray) -> np.ndarray:
    """Run depth estimation on a single RGB frame, return normalised uint8 depth.

    The frame is downscaled to 256 px wide before inference (big speed win on
    CPU), the model's predicted depth is min-max normalised to 0–255, and the
    result is upsampled back to the source resolution so it matches the
    dimensions the video writers were opened with.

    Args:
        frame_rgb: HxWx3 uint8 RGB frame.

    Returns:
        HxW uint8 depth map. (Presumably brighter = closer per Depth Anything's
        convention — TODO confirm against the model card.)
    """
    h, w = frame_rgb.shape[:2]
    # Downscale for speed on CPU. Clamp the height to >= 1: for an extremely
    # wide frame int(256 * h / w) truncates to 0 and cv2.resize would raise.
    small_h = max(1, int(256 * h / w))
    small = cv2.resize(frame_rgb, (256, small_h))
    pil_img = Image.fromarray(small)
    inputs = processor(images=pil_img, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    depth = outputs.predicted_depth.squeeze().numpy()
    # Min-max normalise to the full uint8 range (a constant map becomes all 0).
    depth_norm = cv2.normalize(depth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    # Upsample back to the original resolution.
    depth_full = cv2.resize(depth_norm, (w, h), interpolation=cv2.INTER_LINEAR)
    return depth_full
 
 
 
 
31
 
32
 
33
def process_video(video_path, fps_out, max_frames, progress=gr.Progress()):
    """Generate a grayscale depth video plus a side-by-side preview from an MP4.

    Args:
        video_path: path to the uploaded video, or None when nothing uploaded.
        fps_out: frame rate of the generated output videos.
        max_frames: upper bound on the number of frames processed.
        progress: Gradio progress callback (injected by the UI).

    Returns:
        (depth_video_path, preview_video_path, first_frame_png_path), or
        (None, None, None) when no video was supplied.

    Raises:
        gr.Error: if the file cannot be opened or decoded.
    """
    if video_path is None:
        return None, None, None

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise gr.Error("Could not open video file.")

    src_fps = cap.get(cv2.CAP_PROP_FPS) or 24
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Guard against undecodable files: a 0x0 frame size or empty stream would
    # make VideoWriter silently emit broken output instead of failing here.
    if total_frames == 0 or w == 0 or h == 0:
        cap.release()
        raise gr.Error("Could not read video. Please upload a valid MP4.")

    # Sample every N source frames so the output approximates fps_out.
    step = max(1, int(src_fps / fps_out))
    frame_indices = list(range(0, min(total_frames, max_frames * step), step))[:max_frames]

    # Output paths
    tmp_dir = tempfile.mkdtemp()
    depth_path = os.path.join(tmp_dir, "depth.mp4")
    preview_path = os.path.join(tmp_dir, "preview.mp4")

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    depth_writer = cv2.VideoWriter(depth_path, fourcc, fps_out, (w, h), isColor=False)
    preview_writer = cv2.VideoWriter(preview_path, fourcc, fps_out, (w * 2, h))

    first_depth_frame = None
    try:
        for i, idx in enumerate(frame_indices):
            progress(i / len(frame_indices), desc=f"Processing frame {i+1}/{len(frame_indices)}")
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame_bgr = cap.read()
            if not ret:
                continue

            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            depth = estimate_depth(frame_rgb)

            # Keep the first depth frame for the downloadable PNG.
            if first_depth_frame is None:
                first_depth_frame = depth

            depth_writer.write(depth)

            # Side-by-side preview: original | depth colourised
            depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
            depth_color_rgb = cv2.cvtColor(depth_color, cv2.COLOR_BGR2RGB)
            side = np.concatenate([frame_rgb, depth_color_rgb], axis=1)
            preview_writer.write(cv2.cvtColor(side, cv2.COLOR_RGB2BGR))
    finally:
        # Always release handles — even if inference raises mid-loop — so the
        # partially written files are flushed and the capture is closed.
        cap.release()
        depth_writer.release()
        preview_writer.release()

    first_frame_png = None
    if first_depth_frame is not None:
        png_path = os.path.join(tmp_dir, "first_frame.png")
        Image.fromarray(first_depth_frame).save(png_path)
        first_frame_png = png_path

    return depth_path, preview_path, first_frame_png
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
 
 
 
 
 
 
 
 
 
94
 
95
# ── UI ──────────────────────────────────────────────────────────────────────

css = """
body { background: #0a0a0a; color: #e0e0e0; font-family: 'Space Mono', monospace; }
.gradio-container { max-width: 900px; margin: 0 auto; }
h1 { color: #c8ff00; letter-spacing: 0.08em; font-size: 1.6rem; }
.label { color: #888; font-size: 0.75rem; letter-spacing: 0.1em; text-transform: uppercase; }
button.primary { background: #c8ff00 !important; color: #0a0a0a !important; font-weight: 700; border-radius: 2px !important; }
button.primary:hover { background: #b0e000 !important; }
.footer { color: #444; font-size: 0.7rem; text-align: center; margin-top: 2rem; }
"""

with gr.Blocks(css=css, title="DepthShift Depth Map Generator") as demo:
    # Header: load the mono font and show a one-line pitch.
    gr.HTML("""
    <link href="https://fonts.googleapis.com/css2?family=Space+Mono:wght@400;700&display=swap" rel="stylesheet">
    <h1> DEPTHSHIFT / DEPTH GENERATOR</h1>
    <p style="color:#888; font-size:0.85rem; margin-top:-0.5rem;">
    Upload an MP4 → get a grayscale depth map video ready for
    <a href="https://spatial-index.vercel.app" target="_blank" style="color:#c8ff00;">Spatial Index</a>
    </p>
    """)

    with gr.Row():
        # Left column: input video + parameters + trigger.
        with gr.Column():
            video_in = gr.Video(label="Input Video (MP4)", interactive=True)
            with gr.Row():
                fps_slider = gr.Slider(6, 24, value=12, step=1, label="Output FPS (keep low for CPU speed)")
                frames_slider = gr.Slider(10, 60, value=30, step=5, label="Max Frames")
            run_btn = gr.Button("Generate Depth Map", variant="primary")

        # Right column: the three outputs returned by process_video.
        with gr.Column():
            depth_out = gr.Video(label="Depth Map (grayscale) — use this in DepthShift")
            preview_out = gr.Video(label="Preview (original | depth side-by-side)")
            frame_out = gr.Image(label="First Frame Depth PNG")

    gr.HTML("""
    <div class="footer">
    <b style="color:#c8ff00">HOW TO USE</b><br>
    1. Upload your MP4 &nbsp;→&nbsp;
    2. Download the depth map video &nbsp;→&nbsp;
    3. Load both into <a href="https://spatial-index.vercel.app" style="color:#c8ff00">Spatial Index / DepthShift</a>
    <br><br>Processing runs on CPU — keep Max Frames ≤ 30 for reasonable wait times (~1–2 min).
    </div>
    """)

    run_btn.click(
        fn=process_video,
        inputs=[video_in, fps_slider, frames_slider],
        outputs=[depth_out, preview_out, frame_out],
    )

# Launch only when executed directly (HF Spaces runs app.py as __main__), so
# importing this module no longer starts the server as a side effect.
if __name__ == "__main__":
    demo.launch()
 
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
- gradio>=4.0.0
2
- torch>=2.0.0
3
- transformers>=4.38.0
4
- opencv-python-headless>=4.8.0
5
- Pillow>=10.0.0
6
- numpy>=1.24.0
 
 
1
+ gradio==3.50.2
2
+ torch==2.1.0
3
+ torchvision==0.16.0
4
+ transformers==4.40.0
5
+ opencv-python-headless==4.9.0.80
6
+ numpy==1.26.4
7
+ Pillow==10.3.0