Update app.py
app.py CHANGED
@@ -45,8 +45,8 @@ print("Loading BLIP model...")
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")

-def get_middle_frame_for_blip(video_path, target_size=480):
-    """Efficient: loads only the middle frame for BLIP (not all frames!)"""
+def get_first_frame_for_blip(video_path, target_size=480):
+    """Efficient: loads only the first frame for BLIP (not all frames!)"""
     try:
         cap = cv2.VideoCapture(video_path)
@@ -56,23 +56,22 @@ def get_middle_frame_for_blip(video_path, target_size=480):
             cap.release()
             return None

-        # Get the frame count
+        # Get the frame count for debug info
         frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         if frame_count <= 0:
             print(f"DEBUG: Invalid frame count: {frame_count}")
             cap.release()
             return None

-        middle_idx = frame_count // 2
-        print(f"DEBUG: Video has {frame_count} frames, jumping to frame {middle_idx}")
+        print(f"DEBUG: Video has {frame_count} frames, reading first frame (index 0)")

-        # Jump to the middle frame
-        cap.set(cv2.CAP_PROP_POS_FRAMES, middle_idx)
+        # Read the first frame directly (position 0)
+        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
         ret, frame = cap.read()
         cap.release()

         if not ret or frame is None:
-            print("DEBUG: Could not read middle frame")
+            print("DEBUG: Could not read first frame")
             return None

         # Downscale only this single frame
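
Note on this hunk: a freshly opened cv2.VideoCapture is already positioned at frame 0, so the explicit cap.set(cv2.CAP_PROP_POS_FRAMES, 0) is redundant (harmless, but it can be dropped). A minimal sketch of the same read, not part of this commit:

    import cv2

    def read_first_frame(video_path):
        """Return the first frame of a video, or None if it cannot be read."""
        cap = cv2.VideoCapture(video_path)
        try:
            ret, frame = cap.read()  # a fresh capture starts at frame 0
            return frame if ret else None
        finally:
            cap.release()  # always free the decoder handle
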
@@ -90,7 +89,7 @@ def get_middle_frame_for_blip(video_path, target_size=480):
         return frame_rgb

     except Exception as e:
-        print(f"DEBUG: get_middle_frame_for_blip error: {e}")
+        print(f"DEBUG: get_first_frame_for_blip error: {e}")
         return None

 def generate_blip_name(frame: np.ndarray) -> str:
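
The body of generate_blip_name is outside this diff. For context, the standard BLIP captioning pattern with the processor and model loaded above looks roughly like this (a sketch of the usual transformers API, not this app's exact code):

    from PIL import Image

    def caption_frame(frame):
        """Generate a short caption for an RGB frame using the BLIP model above."""
        inputs = blip_processor(Image.fromarray(frame), return_tensors="pt")
        out = blip_model.generate(**inputs, max_new_tokens=20)
        return blip_processor.decode(out[0], skip_special_tokens=True)
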
@@ -205,9 +204,8 @@ def create_gradient_thumbnail(rgb_frame, depth_frame, longest_side=1024, add_logo=True):
         if gradient_end < height:
             result[gradient_end:] = depth_3ch[gradient_end:]

-        # 5. Add the Looking Glass logo (optional)
-        if add_logo:
-            result = add_looking_glass_logo(result)
+        # 5. Always add the "D" logo
+        result = add_depth_logo(result)

         print(f"DEBUG: Real depth gradient thumbnail created successfully: {result.shape}")
         return result
@@ -217,51 +215,64 @@ def create_gradient_thumbnail(rgb_frame, depth_frame, longest_side=1024, add_logo=True):
         # Fallback: return original RGB frame
         return rgb_frame

-def add_looking_glass_logo(thumbnail, position="bottom-right"):
-    """Adds the Looking Glass logo to the thumbnail"""
+def add_depth_logo(thumbnail, position="bottom-right"):
+    """Adds a simple 'D' logo to the thumbnail (for depth)"""
     try:
         thumb_h, thumb_w = thumbnail.shape[:2]

+        # Logo size based on the thumbnail size
+        logo_size = max(40, int(thumb_w * 0.08))  # 8% of the width, at least 40px

         # Determine the position
+        margin = 15
         if position == "bottom-right":
+            x_pos = thumb_w - logo_size - margin
+            y_pos = thumb_h - margin
         elif position == "bottom-left":
+            x_pos = margin
+            y_pos = thumb_h - margin
         else:
+            x_pos = thumb_w - logo_size - margin
+            y_pos = thumb_h - margin

         # Make sure the logo stays inside the image
+        x_pos = max(margin, min(x_pos, thumb_w - logo_size))
+        y_pos = max(logo_size, min(y_pos, thumb_h - margin))

+        # Font parameters
+        font = cv2.FONT_HERSHEY_SIMPLEX
+        font_scale = max(1.0, logo_size / 60)  # Scales with the logo size
+        font_thickness = max(2, int(logo_size / 25))
+
+        # Measure the text size for centering
+        (text_w, text_h), baseline = cv2.getTextSize("D", font, font_scale, font_thickness)
+
+        # Semi-transparent circle as background
         overlay = thumbnail.copy()
+        circle_radius = logo_size // 2
+        circle_center = (x_pos + circle_radius, y_pos - circle_radius)

+        # Dark circle with a white border
+        cv2.circle(overlay, circle_center, circle_radius, (0, 0, 0), -1)  # Black background
+        cv2.circle(overlay, circle_center, circle_radius, (255, 255, 255), 2)  # White border
+
+        # "D" text centered in the circle
+        text_x = circle_center[0] - text_w // 2
+        text_y = circle_center[1] + text_h // 2
+
+        cv2.putText(overlay, "D",
+                    (text_x, text_y),
+                    font, font_scale, (255, 255, 255), font_thickness, cv2.LINE_AA)

-        # Alpha blending
+        # Alpha blending for a semi-transparent effect
+        alpha = 0.8
         result = cv2.addWeighted(thumbnail, 1-alpha, overlay, alpha, 0)

+        print(f"DEBUG: Added 'D' logo at {position}, size: {logo_size}px")
         return result

     except Exception as e:
+        print(f"DEBUG: 'D' logo addition failed: {e}")
         return thumbnail

 def embed_thumbnail_in_video(video_path, thumbnail_array, base_name):
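
One observation on add_depth_logo: cv2.addWeighted blends the entire frame even though only the badge circle differs between thumbnail and overlay. Restricting the blend to the badge's bounding box gives the same visual result for less work; a sketch under that assumption (blend_roi is hypothetical, not in the commit):

    import cv2

    def blend_roi(base, overlay, center, radius, alpha=0.8):
        """Alpha-blend only the square region around the badge circle."""
        cx, cy = center
        x0, y0 = max(0, cx - radius), max(0, cy - radius)
        x1 = min(base.shape[1], cx + radius)  # shape[1] is the image width
        y1 = min(base.shape[0], cy + radius)  # shape[0] is the image height
        result = base.copy()
        result[y0:y1, x0:x1] = cv2.addWeighted(
            base[y0:y1, x0:x1], 1 - alpha, overlay[y0:y1, x0:x1], alpha, 0)
        return result
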
@@ -496,7 +507,7 @@ blip_generated_name = ""
 original_filename = ""

 # --- Main inference function ---
-def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
+def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, create_thumbnail, *args):
     """Process video to generate depth maps and RGBD output"""
     try:
         max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
@@ -530,6 +541,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
         # Generate depth maps
         print("Generating depth maps...")
         depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=518, device=DEVICE)
+        print("✅ Depth maps generated successfully")

         # Save depth visualization with final name
         save_video(depths, vis_video_path, fps=fps, is_depths=True)
@@ -573,6 +585,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):

         # Save stitched video with final name
         save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
+        print("✅ RGBD video created successfully")

         # Add audio from original video if possible
         try:
@@ -590,32 +603,36 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
             print(f"Audio processing failed: {e}")
             rgbd_path = rgbd_video_path

+        # 🎯 Thumbnail generation (if enabled)
+        thumbnail = None
+        if create_thumbnail:
+            print("Creating thumbnail from completed depth data...")
+            try:
+                # First frame for the thumbnail (consistent with BLIP)
+                rgb_frame = get_first_frame_for_blip(input_path, target_size=1024)
+                # First depth frame (corresponding to the RGB frame)
+                first_depth_frame = depths[0]  # First depth map from the AI model!
+
+                # Create the gradient thumbnail from real depth (with "D" logo)
+                thumbnail = create_gradient_thumbnail(
+                    rgb_frame,
+                    first_depth_frame,
+                    longest_side=1024,
+                    add_logo=True  # Always add the logo
+                )
+
+                # Embed the thumbnail in both videos
+                embed_thumbnail_in_video(vis_video_path, thumbnail, base_name)
+                if rgbd_path:
+                    embed_thumbnail_in_video(rgbd_path, thumbnail, base_name)
+
+                print("✅ Thumbnail created and embedded successfully")
+
+            except Exception as e:
+                print(f"❌ Thumbnail creation failed: {e}")
+                thumbnail = None
+        else:
+            print("📷 Thumbnail generation skipped (disabled)")

         # Clean up memory
         gc.collect()
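
get_first_frame_for_blip returns None on any read error, and create_gradient_thumbnail receives that value unchecked here; the surrounding try/except will catch the failure, but an explicit guard fails faster and logs a clearer message. A sketch using this commit's names (a hypothetical refinement, not the author's code):

    # Guard before building the thumbnail (names as used in this commit)
    rgb_frame = get_first_frame_for_blip(input_path, target_size=1024)
    if rgb_frame is None or len(depths) == 0:
        print("DEBUG: Skipping thumbnail - no RGB frame or depth data")
    else:
        thumbnail = create_gradient_thumbnail(
            rgb_frame, depths[0], longest_side=1024, add_logo=True)
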
@@ -623,7 +640,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
             torch.cuda.empty_cache()

         success_msg = f"✅ Videos saved as '{base_name}_vis.mp4' and '{base_name}_RGBD.mp4'"
-        if thumbnail is not None:
+        if create_thumbnail and thumbnail is not None:
             success_msg += " with embedded thumbnails"

         print(f"DEBUG: Processing completed - Vis: '{vis_video_path}', RGBD: '{rgbd_path}'")
@@ -691,7 +708,7 @@ def on_video_upload_change(video_file, use_blip):
     blip_generated_name = ""
     if use_blip:
         print("DEBUG: Starting optimized BLIP processing...")
-        frame = get_middle_frame_for_blip(video_file, target_size=480)
+        frame = get_first_frame_for_blip(video_file, target_size=480)
         blip_generated_name = generate_blip_name(frame)
         print(f"DEBUG: BLIP name generated: '{blip_generated_name}'")
@@ -832,7 +849,7 @@ def on_video_url_change(url, use_blip):
     if use_blip and video_path:
         try:
             print("DEBUG: Starting optimized BLIP processing for URL video...")
-            frame = get_middle_frame_for_blip(video_path, target_size=480)
+            frame = get_first_frame_for_blip(video_path, target_size=480)
             blip_generated_name = generate_blip_name(frame)
             print(f"DEBUG: {source.title()} BLIP name generated: '{blip_generated_name}'")
         except Exception as e:
@@ -864,12 +881,12 @@ def on_blip_toggle(use_blip):
     # If toggling BLIP on and we don't have a BLIP name yet, generate it
     if use_blip and not blip_generated_name:
         if current_video_file:
-            frame = get_middle_frame_for_blip(current_video_file, target_size=480)
+            frame = get_first_frame_for_blip(current_video_file, target_size=480)
             blip_generated_name = generate_blip_name(frame)
             print(f"DEBUG: Generated new BLIP name from file: '{blip_generated_name}'")
         elif current_video_url:
             # For URL videos, we might need to re-read frames
-            frame = get_middle_frame_for_blip(current_video_url, target_size=480)
+            frame = get_first_frame_for_blip(current_video_url, target_size=480)
             blip_generated_name = generate_blip_name(frame)
             print(f"DEBUG: Generated new BLIP name from URL: '{blip_generated_name}'")
@@ -957,10 +974,10 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
                 interactive=False,
                 show_label=True
             )
+            create_thumbnail = gr.Checkbox(
+                label="🖼️ Create Embedded Thumbnail",
+                value=True,
+                info="Generate and embed thumbnail in MP4"
             )

             # Event handlers for input changes - SIMPLIFIED (no thumbnail previews during input)
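
The new checkbox only takes effect because it is also threaded into the run_btn.click inputs in the next hunk; Gradio passes component values positionally, so create_thumbnail must sit exactly where the function signature expects it (fifth, before *args). A minimal standalone sketch of the pattern (hypothetical handler, not this app):

    import gradio as gr

    def process(video, make_thumbnail):
        """Toy handler: report whether the thumbnail option was checked."""
        return f"video={video!r}, thumbnail={'on' if make_thumbnail else 'off'}"

    with gr.Blocks() as demo:
        upload_video = gr.Video()
        create_thumbnail = gr.Checkbox(label="🖼️ Create Embedded Thumbnail", value=True)
        status = gr.Textbox()
        gr.Button("Run").click(fn=process,
                               inputs=[upload_video, create_thumbnail],
                               outputs=status)
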
@@ -1043,7 +1060,7 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
     run_btn.click(
         fn=infer_video_depth_from_source,
         inputs=[
-            upload_video, video_url, filename, use_blip,
+            upload_video, video_url, filename, use_blip, create_thumbnail,
             max_len, target_fps, max_res, stitch,
             grayscale, convert_from_color, blur
         ],