# Source: Hugging Face Space upload by verymehari ("Upload 3 files", commit 66bffc2).
# Compatibility shim for Python 3.13: the stdlib "audioop" module was removed,
# but pydub (pulled in by gradio) still tries to import it. Register empty
# stub modules under both names it looks for so the import succeeds.
import sys
import types

for _stub_name in ("audioop", "pyaudioop"):
    if _stub_name not in sys.modules:
        sys.modules[_stub_name] = types.ModuleType(_stub_name)
import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation
import tempfile
import os
# Load the depth-estimation model once at import time; inference runs on CPU.
print("Loading Depth Anything V2 Small...")
_MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"
processor = AutoImageProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForDepthEstimation.from_pretrained(_MODEL_ID)
model.eval()  # disable dropout/batch-norm training behavior for inference
print("Model loaded.")
def estimate_depth(frame_rgb: np.ndarray) -> np.ndarray:
    """Estimate a per-pixel depth map for a single RGB frame.

    The frame is downscaled to width 256 (aspect ratio preserved) before
    inference to keep CPU latency low; the model's predicted depth is then
    min-max normalized to uint8 and upsampled back to the input resolution.

    Args:
        frame_rgb: H x W x 3 uint8 RGB image.

    Returns:
        H x W uint8 depth map. Values are normalized per frame, so they are
        relative within a frame, not metric depth.
    """
    h, w = frame_rgb.shape[:2]
    # Clamp the resize target to at least 1 pixel: for extremely wide frames
    # int(256 * h / w) truncates to 0, and cv2.resize raises on a zero dim.
    small = cv2.resize(frame_rgb, (256, max(1, int(256 * h / w))))
    pil_img = Image.fromarray(small)
    inputs = processor(images=pil_img, return_tensors="pt")
    with torch.no_grad():  # inference only — no autograd bookkeeping
        outputs = model(**inputs)
    depth = outputs.predicted_depth.squeeze().numpy()
    # Per-frame min-max normalization into the displayable 0-255 range.
    depth_norm = cv2.normalize(depth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    depth_full = cv2.resize(depth_norm, (w, h), interpolation=cv2.INTER_LINEAR)
    return depth_full
def process_video(video_path, fps_out, max_frames, progress=gr.Progress()):
    """Convert an uploaded video into a grayscale depth-map video.

    Frames are subsampled to approximate ``fps_out``, run through
    ``estimate_depth``, and written to two temp videos: a grayscale depth
    map and a side-by-side (original | colorized depth) preview.

    Args:
        video_path: Path to the uploaded video file, or None.
        fps_out: Desired output frame rate.
        max_frames: Upper bound on the number of frames processed.
        progress: Gradio progress tracker (the gr.Progress() default is
            gradio's documented injection idiom, not a mutable-default bug).

    Returns:
        Tuple of (depth_video_path, preview_video_path, first_frame_png_path),
        or (None, None, None) when no video was supplied.

    Raises:
        gr.Error: If the video cannot be opened or contains no frames.
    """
    if video_path is None:
        return None, None, None
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise gr.Error("Could not open video file.")
    src_fps = cap.get(cv2.CAP_PROP_FPS) or 24  # some containers report 0 FPS
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Sample every `step`-th source frame to approximate the requested FPS.
    step = max(1, int(src_fps / fps_out))
    frame_indices = list(range(0, min(total_frames, max_frames * step), step))[:max_frames]
    if not frame_indices:
        cap.release()
        raise gr.Error("Video contains no readable frames.")
    tmp_dir = tempfile.mkdtemp()
    depth_path = os.path.join(tmp_dir, "depth.mp4")
    preview_path = os.path.join(tmp_dir, "preview.mp4")
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    depth_writer = cv2.VideoWriter(depth_path, fourcc, fps_out, (w, h), isColor=False)
    preview_writer = cv2.VideoWriter(preview_path, fourcc, fps_out, (w * 2, h))
    first_depth_frame = None
    try:
        for i, idx in enumerate(frame_indices):
            progress(i / len(frame_indices), desc=f"Processing frame {i+1}/{len(frame_indices)}")
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)  # random-access seek
            ret, frame_bgr = cap.read()
            if not ret:
                continue  # skip unreadable frames rather than abort the run
            frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
            depth = estimate_depth(frame_rgb)
            if first_depth_frame is None:
                first_depth_frame = depth
            depth_writer.write(depth)
            # Build the side-by-side preview: original | colorized depth.
            depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
            depth_color_rgb = cv2.cvtColor(depth_color, cv2.COLOR_BGR2RGB)
            side = np.concatenate([frame_rgb, depth_color_rgb], axis=1)
            preview_writer.write(cv2.cvtColor(side, cv2.COLOR_RGB2BGR))
    finally:
        # Release capture/writers even if inference fails mid-loop so the
        # mp4 containers are finalized and OS handles are not leaked.
        cap.release()
        depth_writer.release()
        preview_writer.release()
    first_frame_png = None
    if first_depth_frame is not None:
        png_path = os.path.join(tmp_dir, "first_frame.png")
        Image.fromarray(first_depth_frame).save(png_path)
        first_frame_png = png_path
    return depth_path, preview_path, first_frame_png
# Dark terminal-style theme for the app, injected via gr.Blocks(css=...).
css = """
body { background: #0a0a0a; color: #e0e0e0; font-family: 'Space Mono', monospace; }
.gradio-container { max-width: 900px; margin: 0 auto; }
h1 { color: #c8ff00; letter-spacing: 0.08em; font-size: 1.6rem; }
button.primary { background: #c8ff00 !important; color: #0a0a0a !important; font-weight: 700; border-radius: 2px !important; }
button.primary:hover { background: #b0e000 !important; }
.footer { color: #444; font-size: 0.7rem; text-align: center; margin-top: 2rem; }
"""
# UI definition: two-column layout — input video + settings on the left,
# the three generated outputs on the right. Wiring is declarative; the only
# event handler is the Generate button invoking process_video.
with gr.Blocks(css=css, title="DepthShift — Depth Map Generator") as demo:
    # Header: custom font load plus app title/description.
    gr.HTML("""
    <link href="https://fonts.googleapis.com/css2?family=Space+Mono:wght@400;700&display=swap" rel="stylesheet">
    <h1>⬛ DEPTHSHIFT / DEPTH GENERATOR</h1>
    <p style="color:#888; font-size:0.85rem; margin-top:-0.5rem;">
    Upload an MP4 → get a grayscale depth map video ready for
    <a href="https://spatial-index.vercel.app" target="_blank" style="color:#c8ff00;">Spatial Index</a>
    </p>
    """)
    with gr.Row():
        with gr.Column():
            # Inputs: video upload plus sampling controls that map onto the
            # fps_out / max_frames parameters of process_video.
            video_in = gr.Video(label="Input Video (MP4)", interactive=True)
            with gr.Row():
                fps_slider = gr.Slider(6, 24, value=12, step=1, label="Output FPS")
                frames_slider = gr.Slider(10, 60, value=30, step=5, label="Max Frames")
            run_btn = gr.Button("Generate Depth Map", variant="primary")
        with gr.Column():
            # Outputs: one component per element of process_video's return tuple.
            depth_out = gr.Video(label="Depth Map (grayscale) — use this in DepthShift")
            preview_out = gr.Video(label="Preview (original | depth side-by-side)")
            frame_out = gr.Image(label="First Frame Depth PNG")
    # Static usage instructions shown below the main layout.
    gr.HTML("""
    <div class="footer">
    <b style="color:#c8ff00">HOW TO USE</b><br>
    1. Upload your MP4 &nbsp;→&nbsp;
    2. Download the depth map video &nbsp;→&nbsp;
    3. Load both into <a href="https://spatial-index.vercel.app" style="color:#c8ff00">Spatial Index / DepthShift</a>
    <br><br>Processing runs on CPU — keep Max Frames ≤ 30 for reasonable wait times (~1–2 min).
    </div>
    """)
    # Single event binding: button click runs the full pipeline.
    run_btn.click(
        fn=process_video,
        inputs=[video_in, fps_slider, frames_slider],
        outputs=[depth_out, preview_out, frame_out],
    )
demo.launch()