Spaces:

Krokodilpirat
/

Video-Depth-Anything_RGBD_Zero

Running on Zero

App Files Files Community

Krokodilpirat committed on Jul 12, 2025

Commit

c245745

verified ·

1 Parent(s): c3bd575

Update app.py

Browse files

Files changed (1) hide show

app.py +280 -10

app.py CHANGED Viewed

@@ -346,6 +346,231 @@ def cleanup_temp_files(*file_paths):
             except Exception as e:
                 print(f"DEBUG: Cleanup failed for {file_path}: {e}")
 # --- Load depth model ---
 print("Loading Video Depth Anything model...")
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -667,16 +892,15 @@ def on_generate_audio(upload_video, video_url, audio_prompt, negative_prompt, au
         error_msg = f"❌ Audio generation error: {str(e)}"
         print(error_msg)
         return None, error_msg
-def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
-    """Process video to generate depth maps and RGBD output"""
     try:
         max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
         # Determine input source
         input_path = upload_video or video_url
         if not input_path:
-            return None, None, "Error: No video source provided"
         # Fix filename at generation time
         base_name = filename.strip().replace(" ", "_")[:30] if filename.strip() else "output"
@@ -696,7 +920,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
         print("Reading video frames...")
         frames, target_fps = read_video_frames(input_path, max_len, target_fps, max_res)
         if len(frames) == 0:
-            return None, None, "Error: No frames could be extracted from video"
         # Generate depth maps
         print("Generating depth maps...")
@@ -707,6 +931,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
         save_video(depths, vis_video_path, fps=fps, is_depths=True)
         rgbd_path = None
         if stitch:
             print("Creating RGBD stitched video...")
@@ -743,6 +968,24 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
                 depth_resized = cv2.resize(depth_vis, (rgb.shape[1], rgb.shape[0]))
                 stitched = cv2.hconcat([rgb, depth_resized])
                 stitched_frames.append(stitched)
             # Save stitched video
             save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
@@ -765,6 +1008,14 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
             rgbd_path = rgbd_video_path
         # Clean up memory
         gc.collect()
         if torch.cuda.is_available():
@@ -773,14 +1024,16 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
         success_msg = f"✅ Videos saved as '{base_name}_vis.mp4'"
         if stitch and rgbd_path:
             success_msg += f" and '{base_name}_RGBD.mp4'"
         print(f"DEBUG: Processing completed - Vis: '{vis_video_path}', RGBD: '{rgbd_path}'")
-        return vis_video_path, rgbd_path, success_msg
     except Exception as e:
         error_msg = f"Processing failed: {str(e)}"
         print(error_msg)
-        return None, None, error_msg
 # --- UI event handlers ---
 def on_video_upload_change(video_file, use_blip):
@@ -1022,7 +1275,7 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
                 show_label=True
             )
-    # Input controls row
     with gr.Row():
         video_url = gr.Textbox(
             label="Video URL (YouTube, TikTok, Instagram, Civitai, MidJourney, etc.)",
@@ -1040,6 +1293,21 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
             placeholder="Enter filename or let BLIP generate it",
             scale=3
         )
     # MMaudio Integration
     with gr.Accordion("🎵 Audio Generation (MMaudio)", open=False):
@@ -1203,11 +1471,11 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
     run_btn.click(
         fn=infer_video_depth_from_source,
         inputs=[
-            upload_video, video_url, filename, use_blip,
             max_len, target_fps, max_res, stitch,
             grayscale, convert_from_color, blur
         ],
-        outputs=[depth_out, rgbd_out, status_display]
     )
     gr.Markdown("""
@@ -1215,6 +1483,8 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
     - **Upload formats**: MP4, AVI, MOV, etc.
     - **BLIP naming**: Automatically generates descriptive filenames
     - **RGBD output**: Side-by-side comparison of original and depth
     - **Audio Generation**: Use MMaudio Large V2 (44kHz) for high-quality audio synthesis
     - **Audio Prompts**: Be descriptive (e.g. "gentle ocean waves with seagulls", "epic orchestral music")
     - **Iterative Audio**: Generate multiple times with different prompts to perfect the audio

             except Exception as e:
                 print(f"DEBUG: Cleanup failed for {file_path}: {e}")
def create_overlay_thumbnail(rgb_frame, depth_frame):
    """Build a thumbnail: the RGB frame with a depth miniature in its bottom-right corner.

    The RGB frame is scaled down so that its longer side is at most 1024 px
    (smaller frames are copied unchanged).  The depth frame is resized to 30%
    of the thumbnail width, using the thumbnail's aspect ratio, and blended
    flush into the bottom-right corner through a mask whose top-left corner
    is rounded.

    Args:
        rgb_frame: Original RGB frame as an array; only ``shape[:2]`` is read
            here before it is resized/copied.
        depth_frame: Depth frame as an array; callers are expected to pass a
            frame already matched to the RGB frame (see the call site).

    Returns:
        A new array holding the thumbnail; ``rgb_frame`` is not modified.
    """
    print(f"DEBUG: Creating overlay thumbnail - RGB: {rgb_frame.shape}, Depth: {depth_frame.shape}")

    # 1. Scale the RGB frame down to thumbnail size (never upscale).
    target_size = 1024
    h, w = rgb_frame.shape[:2]
    longest_side = max(h, w)
    if longest_side > target_size:
        scale = target_size / longest_side
        # Clamp to 1 px so extreme aspect ratios cannot request a 0-sized resize.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        rgb_thumb = cv2.resize(rgb_frame, (new_w, new_h))
    else:
        rgb_thumb = rgb_frame.copy()
    print(f"DEBUG: RGB thumbnail size: {rgb_thumb.shape}")

    # 2. Depth miniature: 30% of the thumbnail width, same aspect ratio.
    #    Clamped to at least 1 px; int() truncation would otherwise produce a
    #    0-sized target for very small frames and make cv2.resize fail.
    thumb_h, thumb_w = rgb_thumb.shape[:2]
    depth_mini_w = max(1, int(thumb_w * 0.30))
    depth_mini_h = max(1, int(depth_mini_w * (thumb_h / thumb_w)))
    depth_mini = cv2.resize(depth_frame, (depth_mini_w, depth_mini_h))
    print(f"DEBUG: Depth miniature size: {depth_mini.shape} (30% of RGB width)")

    # 3. Bottom-right placement, flush with the edges.  The miniature is never
    #    larger than the thumbnail, so both offsets are >= 0 and no further
    #    clamping or re-resizing is needed.
    x_start = thumb_w - depth_mini_w
    y_start = thumb_h - depth_mini_h
    actual_w, actual_h = depth_mini_w, depth_mini_h

    # 4. Mask with a rounded top-left corner, then 5. blend in place.
    #    rgb_thumb is already a fresh array (resize output or copy), so it can
    #    serve directly as the result buffer.
    result = rgb_thumb
    mask = create_rounded_corner_mask(actual_w, actual_h)
    apply_rounded_overlay(result, depth_mini, x_start, y_start, mask)

    print(f"DEBUG: Overlay thumbnail completed: {result.shape}")
    print(f"DEBUG: Depth overlay at position ({x_start}, {y_start}) with size {actual_w}x{actual_h}")
    return result
def create_rounded_corner_mask(width, height):
    """Create an anti-aliased alpha mask whose top-left corner is rounded.

    The mask is 1.0 everywhere except outside a quarter circle in the top-left
    corner, where it falls off linearly to 0.0 over one pixel of distance
    (1.0 = overlay visible, 0.0 = overlay transparent).

    Args:
        width: Mask width in pixels.
        height: Mask height in pixels.

    Returns:
        ``np.float32`` array of shape ``(height, width)`` with values in [0, 1].
    """
    # Radius is 40% of the smaller dimension, but at least 5 px ...
    radius = max(int(min(width, height) * 0.40), 5)
    # ... and never larger than the mask itself; otherwise the 5 px minimum
    # would index past the edges of masks smaller than 5 px.
    radius = min(radius, width, height)

    mask = np.ones((height, width), dtype=np.float32)

    if radius > 0:
        # Offsets of each corner pixel from the circle centre (radius, radius).
        offsets = np.arange(radius) - radius
        dist = np.sqrt(offsets[:, np.newaxis] ** 2 + offsets[np.newaxis, :] ** 2)
        # Inside the circle the expression is >= 1 and clips to 1.0; outside
        # it drops linearly and clips at 0.0, giving a one-pixel soft edge.
        mask[:radius, :radius] = np.clip(1.0 - (dist - radius), 0.0, 1.0)

    print(f"DEBUG: Created rounded mask with radius {radius}px for {width}x{height} overlay")
    return mask
def apply_rounded_overlay(result, depth_mini, x_start, y_start, mask):
    """Alpha-blend ``depth_mini`` into ``result`` in place, weighted by ``mask``.

    For every pixel of the target region the first three channels become
    ``result * (1 - mask) + depth_mini * mask``, truncated to ``uint8``.

    Args:
        result: Image array that is modified in place; must have a channel
            axis with at least three channels.
        depth_mini: Overlay image.  May be 2-D (single channel, applied to all
            three colour channels) or 3-D (its first three channels are used).
        x_start: Column of the overlay's left edge inside ``result``.
        y_start: Row of the overlay's top edge inside ``result``.
        mask: 2-D array of per-pixel weights with the overlay's height/width.

    Returns:
        None; ``result`` is updated in place.
    """
    actual_h, actual_w = depth_mini.shape[:2]

    # Basic slicing yields a view, so writing to `region` updates `result`.
    region = result[y_start:y_start + actual_h, x_start:x_start + actual_w, :3]

    overlay = depth_mini.astype(np.float32)
    if overlay.ndim == 2:
        # Grayscale overlay: add a channel axis so it broadcasts over RGB.
        overlay = overlay[:, :, np.newaxis]
    else:
        overlay = overlay[:, :, :3]

    # One broadcast blend over all colour channels.
    weight = np.asarray(mask)[:, :, np.newaxis]
    blended = region.astype(np.float32) * (1 - weight) + overlay * weight
    region[...] = blended.astype(np.uint8)

    print(f"DEBUG: Applied anti-aliased rounded overlay at ({x_start}, {y_start})")
def add_depth_logo_to_overlay(thumbnail, overlay_x, overlay_y, overlay_w, overlay_h):
    """Draw a small circular 'D' badge in the bottom-right of the depth overlay.

    A filled black circle with a white "D" is painted onto a copy of
    ``thumbnail`` and blended back at 80% opacity.  Any failure is printed and
    the unmodified ``thumbnail`` is returned instead.

    Args:
        thumbnail: Image array to decorate (not modified).
        overlay_x: Left edge of the depth overlay inside ``thumbnail``.
        overlay_y: Top edge of the depth overlay inside ``thumbnail``.
        overlay_w: Width of the depth overlay.
        overlay_h: Height of the depth overlay.

    Returns:
        The blended image, or ``thumbnail`` itself if drawing failed.
    """
    try:
        margin = 5
        # Badge diameter: 15% of the overlay width, never below 20 px.
        logo_size = max(20, int(overlay_w * 0.15))

        # Anchor at the overlay's bottom-right, then clamp inside the overlay.
        right_limit = overlay_x + overlay_w - logo_size
        bottom_limit = overlay_y + overlay_h - margin
        x_pos = max(overlay_x + margin, min(right_limit - margin, right_limit))
        y_pos = max(overlay_y + logo_size, min(bottom_limit, bottom_limit))

        # The circle sits above/right of the anchor (y_pos is its bottom edge).
        circle_radius = logo_size // 2
        center_x = x_pos + circle_radius
        center_y = y_pos - circle_radius

        # Text metrics, needed to centre the glyph inside the circle.
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = max(1.0, logo_size / 20)
        font_thickness = max(2, int(logo_size / 10))
        text_size, _baseline = cv2.getTextSize("D", font, font_scale, font_thickness)
        text_w, text_h = text_size

        # Paint on a copy so the badge can be blended semi-transparently.
        painted = thumbnail.copy()
        cv2.circle(painted, (center_x, center_y), circle_radius, (0, 0, 0), -1, cv2.LINE_AA)
        text_origin = (center_x - text_w // 2, center_y + text_h // 2)
        cv2.putText(painted, "D", text_origin,
                    font, font_scale, (255, 255, 255), font_thickness, cv2.LINE_AA)

        alpha = 0.8
        blended = cv2.addWeighted(thumbnail, 1 - alpha, painted, alpha, 0)
        print(f"DEBUG: Added small 'D' logo to overlay at ({center_x}, {center_y}), size: {logo_size}px")
        return blended
    except Exception as e:
        print(f"DEBUG: Overlay logo addition failed: {e}")
        return thumbnail
def embed_thumbnail_in_video(video_path, thumbnail_array, base_name):
    """Embed ``thumbnail_array`` as JPEG cover art in the video at ``video_path``.

    The thumbnail is written to a temporary JPEG, ffmpeg muxes it into a copy
    of the video as an ``attached_pic`` stream (existing streams are stream-
    copied, not re-encoded), and on success the copy replaces the original
    file.  Temporary files are removed on every exit path.

    Args:
        video_path: Path of the video file to update in place.
        thumbnail_array: Image array; a 3-channel array is treated as RGB and
            converted to BGR before ``cv2.imwrite``, anything else is written
            as is.
        base_name: Name fragment used for the temporary JPEG file name.

    Returns:
        True if ffmpeg succeeded and the video was replaced, False otherwise.
        Errors are printed, never raised.
    """
    # NOTE(review): the temp JPEG lives in the current working directory and is
    # keyed only by base_name; concurrent runs with the same name would collide.
    temp_thumb_path = f"temp_{base_name}_thumb.jpg"
    temp_output = None
    try:
        # The thumbnail is produced in RGB order, cv2.imwrite expects BGR.
        if len(thumbnail_array.shape) == 3 and thumbnail_array.shape[2] == 3:
            thumbnail_bgr = cv2.cvtColor(thumbnail_array, cv2.COLOR_RGB2BGR)
        else:
            thumbnail_bgr = thumbnail_array

        # Explicit high-quality JPEG (chosen for iOS compatibility).
        success = cv2.imwrite(temp_thumb_path, thumbnail_bgr, [
            cv2.IMWRITE_JPEG_QUALITY, 90,
            cv2.IMWRITE_JPEG_OPTIMIZE, 1
        ])
        if not success:
            raise RuntimeError("Failed to save thumbnail as JPEG")
        if not os.path.exists(temp_thumb_path):
            raise RuntimeError("Thumbnail JPEG file not created")
        print(f"DEBUG: Saved thumbnail as JPEG: {temp_thumb_path}")

        # Derive the temp output from the real extension.  A plain
        # str.replace('.mp4', ...) returns the input path unchanged for names
        # without a lowercase '.mp4' (ffmpeg would then overwrite its own
        # input and cleanup would delete the video), and it also rewrites
        # '.mp4' occurring in directory names.
        stem, ext = os.path.splitext(video_path)
        temp_output = f"{stem}_with_thumb{ext or '.mp4'}"

        cmd = [
            "ffmpeg", "-y",
            "-i", video_path,           # original video
            "-i", temp_thumb_path,      # JPEG thumbnail image
            "-map", "0",                # all streams from the video
            "-map", "1",                # thumbnail stream
            "-c", "copy",               # copy video/audio (no re-encoding)
            "-c:v:1", "mjpeg",          # thumbnail explicitly as MJPEG/JPEG
            "-disposition:v:1", "attached_pic",  # mark as cover art
            "-metadata:s:v:1", "title=Cover",
            "-metadata:s:v:1", "comment=JPEG Video Thumbnail",
            temp_output
        ]
        print(f"DEBUG: Embedding JPEG thumbnail in video: {video_path}")
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            print(f"❌ FFmpeg failed: {result.stderr}")
            return False

        # Swap the muxed copy in for the original.
        os.replace(temp_output, video_path)
        print(f"✅ JPEG thumbnail successfully embedded in {video_path}")
        return True
    except Exception as e:
        print(f"❌ Thumbnail embedding failed: {e}")
        return False
    finally:
        # Best-effort cleanup on every path (including a missing ffmpeg
        # binary); a cleanup failure must not change the reported result.
        for leftover in (temp_thumb_path, temp_output):
            if leftover and os.path.exists(leftover):
                try:
                    os.remove(leftover)
                except OSError as cleanup_error:
                    print(f"DEBUG: Cleanup failed for {leftover}: {cleanup_error}")
 # --- Load depth model ---
 print("Loading Video Depth Anything model...")
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
         error_msg = f"❌ Audio generation error: {str(e)}"
         print(error_msg)
         return None, error_msg
+def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, create_thumbnail, *args):
+    """Process video to generate depth maps and RGBD output - NO FALLBACK THUMBNAIL"""
     try:
         max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
         # Determine input source
         input_path = upload_video or video_url
         if not input_path:
+            return None, None, "Error: No video source provided", None
         # Fix filename at generation time
         base_name = filename.strip().replace(" ", "_")[:30] if filename.strip() else "output"
         print("Reading video frames...")
         frames, target_fps = read_video_frames(input_path, max_len, target_fps, max_res)
         if len(frames) == 0:
+            return None, None, "Error: No frames could be extracted from video", None
         # Generate depth maps
         print("Generating depth maps...")
         save_video(depths, vis_video_path, fps=fps, is_depths=True)
         rgbd_path = None
+        thumbnail = None
         if stitch:
             print("Creating RGBD stitched video...")
                 depth_resized = cv2.resize(depth_vis, (rgb.shape[1], rgb.shape[0]))
                 stitched = cv2.hconcat([rgb, depth_resized])
                 stitched_frames.append(stitched)
+                # 🎯 CREATE THUMBNAIL from first perfectly matched RGB+Depth pair (but don't embed yet)
+                if i == 0 and create_thumbnail:
+                    print("Creating thumbnail from first perfectly matched RGB+Depth pair...")
+                    try:
+                        print(f"DEBUG: Using RGB: {rgb.shape}, Depth: {depth_resized.shape}")
+                        print(f"DEBUG: Depth range: {depth_resized.min()} - {depth_resized.max()}")
+                        # Erstelle Thumbnail mit den bereits perfekt passenden Frames
+                        thumbnail = create_overlay_thumbnail(rgb, depth_resized)
+                        print("✅ Thumbnail created from first RGBD pair (not embedded yet)")
+                    except Exception as e:
+                        print(f"❌ Thumbnail creation failed: {e}")
+                        import traceback
+                        traceback.print_exc()
+                        thumbnail = None
             # Save stitched video
             save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
             rgbd_path = rgbd_video_path
+            # 🎯 FINAL FIX: Embed thumbnail ONLY in RGBD video AFTER all processing
+            if create_thumbnail and thumbnail is not None:
+                print("Embedding thumbnail in RGBD video only (after all processing)...")
+                embed_thumbnail_in_video(rgbd_video_path, thumbnail, base_name)
+                print("✅ Thumbnail embedded in RGBD video only")
+            elif create_thumbnail:
+                print("❌ No thumbnail to embed")
         # Clean up memory
         gc.collect()
         if torch.cuda.is_available():
         success_msg = f"✅ Videos saved as '{base_name}_vis.mp4'"
         if stitch and rgbd_path:
             success_msg += f" and '{base_name}_RGBD.mp4'"
+            if create_thumbnail and thumbnail is not None:
+                success_msg += " with embedded thumbnail"
         print(f"DEBUG: Processing completed - Vis: '{vis_video_path}', RGBD: '{rgbd_path}'")
+        return vis_video_path, rgbd_path, success_msg, thumbnail
     except Exception as e:
         error_msg = f"Processing failed: {str(e)}"
         print(error_msg)
+        return None, None, error_msg, None
 # --- UI event handlers ---
 def on_video_upload_change(video_file, use_blip):
                 show_label=True
             )
+    # Single row with all input controls and thumbnail preview
     with gr.Row():
         video_url = gr.Textbox(
             label="Video URL (YouTube, TikTok, Instagram, Civitai, MidJourney, etc.)",
             placeholder="Enter filename or let BLIP generate it",
             scale=3
         )
+        create_thumbnail = gr.Checkbox(
+            label="Embed Video Thumbnail",
+            value=True,
+            scale=1,
+            info="Generate and embed thumbnail in MP4"
+        )
+        thumbnail_preview = gr.Image(
+            label="Thumbnail Preview",
+            height=140,
+            width=180,
+            interactive=False,
+            show_label=True,
+            scale=1,
+            format="jpeg"  # Erzwinge JPEG für Downloads
+        )
     # MMaudio Integration
     with gr.Accordion("🎵 Audio Generation (MMaudio)", open=False):
     run_btn.click(
         fn=infer_video_depth_from_source,
         inputs=[
+            upload_video, video_url, filename, use_blip, create_thumbnail,
             max_len, target_fps, max_res, stitch,
             grayscale, convert_from_color, blur
         ],
+        outputs=[depth_out, rgbd_out, status_display, thumbnail_preview]
     )
     gr.Markdown("""
     - **Upload formats**: MP4, AVI, MOV, etc.
     - **BLIP naming**: Automatically generates descriptive filenames
     - **RGBD output**: Side-by-side comparison of original and depth
+    - **Thumbnail Preview**: Shows final RGB→Depth gradient after processing
+    - **Embedded Thumbnails**: Videos will show previews in Windows Explorer
     - **Audio Generation**: Use MMaudio Large V2 (44kHz) for high-quality audio synthesis
     - **Audio Prompts**: Be descriptive (e.g. "gentle ocean waves with seagulls", "epic orchestral music")
     - **Iterative Audio**: Generate multiple times with different prompts to perfect the audio