# Patch for Python 3.13: audioop was removed from the stdlib (PEP 594), but
# pydub (pulled in by gradio) still tries to import it at startup. Registering
# empty placeholder modules lets that import succeed; no audioop function is
# actually called by this app.
import sys
import types

if "audioop" not in sys.modules:
    sys.modules["audioop"] = types.ModuleType("audioop")
if "pyaudioop" not in sys.modules:
    sys.modules["pyaudioop"] = types.ModuleType("pyaudioop")

import os
import tempfile

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"

# Load the model once at startup; inference runs on CPU (no .to(device) call).
print("Loading Depth Anything V2 Small...")
processor = AutoImageProcessor.from_pretrained(MODEL_ID)
model = AutoModelForDepthEstimation.from_pretrained(MODEL_ID)
model.eval()
print("Model loaded.")


def estimate_depth(frame_rgb: np.ndarray) -> np.ndarray:
    """Estimate a depth map for a single RGB frame.

    Args:
        frame_rgb: HxWx3 uint8 RGB image.

    Returns:
        HxW uint8 depth map, min-max normalized to 0-255, resized back to
        the input resolution.
    """
    h, w = frame_rgb.shape[:2]
    # Downscale to width 256 before inference to keep CPU latency low.
    # max(1, ...) guards against a zero-pixel height for extremely wide frames,
    # which would make cv2.resize raise.
    small = cv2.resize(frame_rgb, (256, max(1, int(256 * h / w))))
    inputs = processor(images=Image.fromarray(small), return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    depth = outputs.predicted_depth.squeeze().numpy()
    depth_norm = cv2.normalize(depth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    # Upsample back to the original frame size.
    return cv2.resize(depth_norm, (w, h), interpolation=cv2.INTER_LINEAR)


def process_video(video_path, fps_out, max_frames, progress=gr.Progress()):
    """Convert a video into a grayscale depth-map video plus preview assets.

    Args:
        video_path: Path to the input video file, or None.
        fps_out: Desired output frame rate.
        max_frames: Upper bound on the number of frames to process.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Tuple (depth_video_path, side_by_side_preview_path, first_frame_png_path),
        or (None, None, None) when no video was supplied.

    Raises:
        gr.Error: If the video cannot be opened or contains no readable frames.
    """
    if video_path is None:
        return None, None, None

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise gr.Error("Could not open video file.")

    try:
        src_fps = cap.get(cv2.CAP_PROP_FPS) or 24  # some containers report 0 fps
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Sample every `step`-th source frame to approximate fps_out.
        step = max(1, int(src_fps / fps_out))
        frame_indices = list(range(0, min(total_frames, max_frames * step), step))[:max_frames]
        if not frame_indices:
            # Previously an empty list caused ZeroDivisionError in the
            # progress callback below; fail with a clear message instead.
            raise gr.Error("Video contains no readable frames.")

        tmp_dir = tempfile.mkdtemp()
        depth_path = os.path.join(tmp_dir, "depth.mp4")
        preview_path = os.path.join(tmp_dir, "preview.mp4")
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        # Write the depth video as 3-channel grayscale: isColor=False combined
        # with the mp4v codec silently produces empty files on several
        # platforms/OpenCV builds, so we expand to BGR before writing.
        depth_writer = cv2.VideoWriter(depth_path, fourcc, fps_out, (w, h))
        preview_writer = cv2.VideoWriter(preview_path, fourcc, fps_out, (w * 2, h))

        first_depth_frame = None
        try:
            total = len(frame_indices)
            for i, idx in enumerate(frame_indices):
                progress(i / total, desc=f"Processing frame {i+1}/{total}")
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame_bgr = cap.read()
                if not ret:
                    continue  # skip unreadable frames rather than aborting
                frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
                depth = estimate_depth(frame_rgb)
                if first_depth_frame is None:
                    first_depth_frame = depth
                depth_writer.write(cv2.cvtColor(depth, cv2.COLOR_GRAY2BGR))
                # Side-by-side preview: original frame | colorized depth.
                depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
                depth_color_rgb = cv2.cvtColor(depth_color, cv2.COLOR_BGR2RGB)
                side = np.concatenate([frame_rgb, depth_color_rgb], axis=1)
                preview_writer.write(cv2.cvtColor(side, cv2.COLOR_RGB2BGR))
        finally:
            # Release writers even if inference fails, so partial files are
            # finalized and file handles are not leaked.
            depth_writer.release()
            preview_writer.release()
    finally:
        cap.release()

    first_frame_png = None
    if first_depth_frame is not None:
        png_path = os.path.join(tmp_dir, "first_frame.png")
        Image.fromarray(first_depth_frame).save(png_path)
        first_frame_png = png_path

    return depth_path, preview_path, first_frame_png


css = """
body { background: #0a0a0a; color: #e0e0e0; font-family: 'Space Mono', monospace; }
.gradio-container { max-width: 900px; margin: 0 auto; }
h1 { color: #c8ff00; letter-spacing: 0.08em; font-size: 1.6rem; }
button.primary { background: #c8ff00 !important; color: #0a0a0a !important; font-weight: 700; border-radius: 2px !important; }
button.primary:hover { background: #b0e000 !important; }
.footer { color: #444; font-size: 0.7rem; text-align: center; margin-top: 2rem; }
"""

with gr.Blocks(css=css, title="DepthShift — Depth Map Generator") as demo:
    gr.HTML("""
Upload an MP4 → get a grayscale depth map video ready for Spatial Index
""")
    with gr.Row():
        with gr.Column():
            video_in = gr.Video(label="Input Video (MP4)", interactive=True)
            with gr.Row():
                fps_slider = gr.Slider(6, 24, value=12, step=1, label="Output FPS")
                frames_slider = gr.Slider(10, 60, value=30, step=5, label="Max Frames")
            run_btn = gr.Button("Generate Depth Map", variant="primary")
        with gr.Column():
            depth_out = gr.Video(label="Depth Map (grayscale) — use this in DepthShift")
            preview_out = gr.Video(label="Preview (original | depth side-by-side)")
            frame_out = gr.Image(label="First Frame Depth PNG")

    gr.HTML(""" """)

    run_btn.click(
        fn=process_video,
        inputs=[video_in, fps_slider, frames_slider],
        outputs=[depth_out, preview_out, frame_out],
    )

# Guard the launch so the module can be imported (e.g. by tests or a WSGI
# wrapper) without starting a server; running as a script behaves as before.
if __name__ == "__main__":
    demo.launch()