# Patch for Python 3.13: audioop was removed from the stdlib (PEP 594), but
# pydub (pulled in by gradio) still tries to import it at startup. Registering
# empty placeholder modules lets that import succeed; no audioop function is
# actually called by this app.
import sys
import types

if "audioop" not in sys.modules:
    sys.modules["audioop"] = types.ModuleType("audioop")
if "pyaudioop" not in sys.modules:
    sys.modules["pyaudioop"] = types.ModuleType("pyaudioop")

import os
import tempfile

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"

# Load the model once at startup; inference runs on CPU (no .to(device) call).
print("Loading Depth Anything V2 Small...")
processor = AutoImageProcessor.from_pretrained(MODEL_ID)
model = AutoModelForDepthEstimation.from_pretrained(MODEL_ID)
model.eval()
print("Model loaded.")


def estimate_depth(frame_rgb: np.ndarray) -> np.ndarray:
    """Estimate a depth map for a single RGB frame.

    Args:
        frame_rgb: HxWx3 uint8 RGB image.

    Returns:
        HxW uint8 depth map, min-max normalized to 0-255, resized back to
        the input resolution.
    """
    h, w = frame_rgb.shape[:2]
    # Downscale to width 256 before inference to keep CPU latency low.
    # max(1, ...) guards against a zero-pixel height for extremely wide frames,
    # which would make cv2.resize raise.
    small = cv2.resize(frame_rgb, (256, max(1, int(256 * h / w))))
    inputs = processor(images=Image.fromarray(small), return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    depth = outputs.predicted_depth.squeeze().numpy()
    depth_norm = cv2.normalize(depth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    # Upsample back to the original frame size.
    return cv2.resize(depth_norm, (w, h), interpolation=cv2.INTER_LINEAR)


def process_video(video_path, fps_out, max_frames, progress=gr.Progress()):
    """Convert a video into a grayscale depth-map video plus preview assets.

    Args:
        video_path: Path to the input video file, or None.
        fps_out: Desired output frame rate.
        max_frames: Upper bound on the number of frames to process.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Tuple (depth_video_path, side_by_side_preview_path, first_frame_png_path),
        or (None, None, None) when no video was supplied.

    Raises:
        gr.Error: If the video cannot be opened or contains no readable frames.
    """
    if video_path is None:
        return None, None, None

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise gr.Error("Could not open video file.")

    try:
        src_fps = cap.get(cv2.CAP_PROP_FPS) or 24  # some containers report 0 fps
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Sample every `step`-th source frame to approximate fps_out.
        step = max(1, int(src_fps / fps_out))
        frame_indices = list(range(0, min(total_frames, max_frames * step), step))[:max_frames]
        if not frame_indices:
            # Previously an empty list caused ZeroDivisionError in the
            # progress callback below; fail with a clear message instead.
            raise gr.Error("Video contains no readable frames.")

        tmp_dir = tempfile.mkdtemp()
        depth_path = os.path.join(tmp_dir, "depth.mp4")
        preview_path = os.path.join(tmp_dir, "preview.mp4")
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        # Write the depth video as 3-channel grayscale: isColor=False combined
        # with the mp4v codec silently produces empty files on several
        # platforms/OpenCV builds, so we expand to BGR before writing.
        depth_writer = cv2.VideoWriter(depth_path, fourcc, fps_out, (w, h))
        preview_writer = cv2.VideoWriter(preview_path, fourcc, fps_out, (w * 2, h))

        first_depth_frame = None
        try:
            total = len(frame_indices)
            for i, idx in enumerate(frame_indices):
                progress(i / total, desc=f"Processing frame {i+1}/{total}")
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame_bgr = cap.read()
                if not ret:
                    continue  # skip unreadable frames rather than aborting
                frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
                depth = estimate_depth(frame_rgb)
                if first_depth_frame is None:
                    first_depth_frame = depth
                depth_writer.write(cv2.cvtColor(depth, cv2.COLOR_GRAY2BGR))
                # Side-by-side preview: original frame | colorized depth.
                depth_color = cv2.applyColorMap(depth, cv2.COLORMAP_INFERNO)
                depth_color_rgb = cv2.cvtColor(depth_color, cv2.COLOR_BGR2RGB)
                side = np.concatenate([frame_rgb, depth_color_rgb], axis=1)
                preview_writer.write(cv2.cvtColor(side, cv2.COLOR_RGB2BGR))
        finally:
            # Release writers even if inference fails, so partial files are
            # finalized and file handles are not leaked.
            depth_writer.release()
            preview_writer.release()
    finally:
        cap.release()

    first_frame_png = None
    if first_depth_frame is not None:
        png_path = os.path.join(tmp_dir, "first_frame.png")
        Image.fromarray(first_depth_frame).save(png_path)
        first_frame_png = png_path

    return depth_path, preview_path, first_frame_png


css = """
body { background: #0a0a0a; color: #e0e0e0; font-family: 'Space Mono', monospace; }
.gradio-container { max-width: 900px; margin: 0 auto; }
h1 { color: #c8ff00; letter-spacing: 0.08em; font-size: 1.6rem; }
button.primary { background: #c8ff00 !important; color: #0a0a0a !important; font-weight: 700; border-radius: 2px !important; }
button.primary:hover { background: #b0e000 !important; }
.footer { color: #444; font-size: 0.7rem; text-align: center; margin-top: 2rem; }
"""

with gr.Blocks(css=css, title="DepthShift — Depth Map Generator") as demo:
    gr.HTML("""
Upload an MP4 → get a grayscale depth map video ready for Spatial Index
""")
    with gr.Row():
        with gr.Column():
            video_in = gr.Video(label="Input Video (MP4)", interactive=True)
            with gr.Row():
                fps_slider = gr.Slider(6, 24, value=12, step=1, label="Output FPS")
                frames_slider = gr.Slider(10, 60, value=30, step=5, label="Max Frames")
            run_btn = gr.Button("Generate Depth Map", variant="primary")
        with gr.Column():
            depth_out = gr.Video(label="Depth Map (grayscale) — use this in DepthShift")
            preview_out = gr.Video(label="Preview (original | depth side-by-side)")
            frame_out = gr.Image(label="First Frame Depth PNG")

    gr.HTML(""" """)

    run_btn.click(
        fn=process_video,
        inputs=[video_in, fps_slider, frames_slider],
        outputs=[depth_out, preview_out, frame_out],
    )

# Guard the launch so the module can be imported (e.g. by tests or a WSGI
# wrapper) without starting a server; running as a script behaves as before.
if __name__ == "__main__":
    demo.launch()