Krokodilpirat committed on
Commit
d07eed9
·
verified ·
1 Parent(s): be10cd4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -75
app.py CHANGED
@@ -45,8 +45,8 @@ print("Loading BLIP model...")
45
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
46
  blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
47
 
48
- def get_middle_frame_for_blip(video_path, target_size=480):
49
- """Effizient: Lädt nur das mittlere Frame für BLIP (nicht alle Frames!)"""
50
  try:
51
  cap = cv2.VideoCapture(video_path)
52
 
@@ -56,23 +56,22 @@ def get_middle_frame_for_blip(video_path, target_size=480):
56
  cap.release()
57
  return None
58
 
59
- # Hole Frame-Count und springe zum mittleren Frame
60
  frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
61
  if frame_count <= 0:
62
  print(f"DEBUG: Invalid frame count: {frame_count}")
63
  cap.release()
64
  return None
65
 
66
- middle_idx = frame_count // 2
67
- print(f"DEBUG: Video has {frame_count} frames, jumping to frame {middle_idx}")
68
 
69
- # Springe direkt zum Ziel-Frame (keine Iteration!)
70
- cap.set(cv2.CAP_PROP_POS_FRAMES, middle_idx)
71
  ret, frame = cap.read()
72
  cap.release()
73
 
74
  if not ret or frame is None:
75
- print("DEBUG: Could not read middle frame")
76
  return None
77
 
78
  # Verkleinere nur dieses eine Frame
@@ -90,7 +89,7 @@ def get_middle_frame_for_blip(video_path, target_size=480):
90
  return frame_rgb
91
 
92
  except Exception as e:
93
- print(f"DEBUG: get_middle_frame_for_blip error: {e}")
94
  return None
95
 
96
  def generate_blip_name(frame: np.ndarray) -> str:
@@ -205,9 +204,8 @@ def create_gradient_thumbnail(rgb_frame, depth_frame, longest_side=1024, add_log
205
  if gradient_end < height:
206
  result[gradient_end:] = depth_3ch[gradient_end:]
207
 
208
- # 5. Optional: Looking Glass Logo hinzufügen
209
- if add_logo:
210
- result = add_looking_glass_logo(result)
211
 
212
  print(f"DEBUG: Real depth gradient thumbnail created successfully: {result.shape}")
213
  return result
@@ -217,51 +215,64 @@ def create_gradient_thumbnail(rgb_frame, depth_frame, longest_side=1024, add_log
217
  # Fallback: return original RGB frame
218
  return rgb_frame
219
 
220
- def add_looking_glass_logo(thumbnail, position="bottom-right"):
221
- """Fügt Looking Glass Logo zum Thumbnail hinzu (Mock implementation)"""
222
  try:
223
  thumb_h, thumb_w = thumbnail.shape[:2]
224
 
225
- # Mock logo: einfacher weißer Kreis mit "LG" Text
226
- logo_size = max(30, int(thumb_w * 0.08)) # 8% der Breite
227
 
228
  # Position bestimmen
229
- margin = 10
230
  if position == "bottom-right":
231
- y_start = thumb_h - logo_size - margin
232
- x_start = thumb_w - logo_size - margin
233
  elif position == "bottom-left":
234
- y_start = thumb_h - logo_size - margin
235
- x_start = margin
236
  else:
237
- y_start = thumb_h - logo_size - margin
238
- x_start = thumb_w - logo_size - margin
239
 
240
  # Sicherstellen dass Logo im Bild ist
241
- y_start = max(0, min(y_start, thumb_h - logo_size))
242
- x_start = max(0, min(x_start, thumb_w - logo_size))
243
 
244
- # Semi-transparenten Kreis zeichnen
 
 
 
 
 
 
 
 
245
  overlay = thumbnail.copy()
246
- cv2.circle(overlay, (x_start + logo_size//2, y_start + logo_size//2),
247
- logo_size//2, (255, 255, 255), -1)
248
 
249
- # Text hinzufügen
250
- font = cv2.FONT_HERSHEY_SIMPLEX
251
- font_scale = max(0.3, logo_size / 100)
252
- cv2.putText(overlay, "LG",
253
- (x_start + logo_size//4, y_start + logo_size//2 + 5),
254
- font, font_scale, (0, 0, 0), 1, cv2.LINE_AA)
 
 
 
 
 
255
 
256
- # Alpha blending
257
- alpha = 0.7
258
  result = cv2.addWeighted(thumbnail, 1-alpha, overlay, alpha, 0)
259
 
260
- print(f"DEBUG: Looking Glass logo added at {position}")
261
  return result
262
 
263
  except Exception as e:
264
- print(f"DEBUG: Logo addition failed: {e}")
265
  return thumbnail
266
 
267
  def embed_thumbnail_in_video(video_path, thumbnail_array, base_name):
@@ -496,7 +507,7 @@ blip_generated_name = ""
496
  original_filename = ""
497
 
498
  # --- Main inference function ---
499
- def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, add_logo, *args):
500
  """Process video to generate depth maps and RGBD output"""
501
  try:
502
  max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
@@ -530,6 +541,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, a
530
  # Generate depth maps
531
  print("Generating depth maps...")
532
  depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=518, device=DEVICE)
 
533
 
534
  # Save depth visualization with final name
535
  save_video(depths, vis_video_path, fps=fps, is_depths=True)
@@ -573,6 +585,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, a
573
 
574
  # Save stitched video with final name
575
  save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
 
576
 
577
  # Add audio from original video if possible
578
  try:
@@ -590,32 +603,36 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, a
590
  print(f"Audio processing failed: {e}")
591
  rgbd_path = rgbd_video_path
592
 
593
- # 🎯 NEU: Thumbnail aus echter Depth-Map generieren
594
- print("Creating thumbnail from real depth data...")
595
- try:
596
- # Mittleres Frame für Thumbnail
597
- middle_idx = len(depths) // 2
598
- rgb_frame = get_middle_frame_for_blip(input_path, target_size=1024)
599
- real_depth_frame = depths[middle_idx] # Echte Depth vom AI-Model!
600
-
601
- # Gradient-Thumbnail mit echter Depth erstellen
602
- thumbnail = create_gradient_thumbnail(
603
- rgb_frame,
604
- real_depth_frame,
605
- longest_side=1024,
606
- add_logo=add_logo
607
- )
608
-
609
- # Thumbnail in beide Videos einbetten
610
- embed_thumbnail_in_video(vis_video_path, thumbnail, base_name)
611
- if rgbd_path:
612
- embed_thumbnail_in_video(rgbd_path, thumbnail, base_name)
613
-
614
- print("✅ Thumbnail created and embedded successfully")
615
-
616
- except Exception as e:
617
- print(f"❌ Thumbnail creation failed: {e}")
618
- thumbnail = None
 
 
 
 
619
 
620
  # Clean up memory
621
  gc.collect()
@@ -623,7 +640,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, a
623
  torch.cuda.empty_cache()
624
 
625
  success_msg = f"✅ Videos saved as '{base_name}_vis.mp4' and '{base_name}_RGBD.mp4'"
626
- if thumbnail is not None:
627
  success_msg += " with embedded thumbnails"
628
 
629
  print(f"DEBUG: Processing completed - Vis: '{vis_video_path}', RGBD: '{rgbd_path}'")
@@ -691,7 +708,7 @@ def on_video_upload_change(video_file, use_blip):
691
  blip_generated_name = ""
692
  if use_blip:
693
  print("DEBUG: Starting optimized BLIP processing...")
694
- frame = get_middle_frame_for_blip(video_file, target_size=480)
695
  blip_generated_name = generate_blip_name(frame)
696
  print(f"DEBUG: BLIP name generated: '{blip_generated_name}'")
697
 
@@ -832,7 +849,7 @@ def on_video_url_change(url, use_blip):
832
  if use_blip and video_path:
833
  try:
834
  print("DEBUG: Starting optimized BLIP processing for URL video...")
835
- frame = get_middle_frame_for_blip(video_path, target_size=480)
836
  blip_generated_name = generate_blip_name(frame)
837
  print(f"DEBUG: {source.title()} BLIP name generated: '{blip_generated_name}'")
838
  except Exception as e:
@@ -864,12 +881,12 @@ def on_blip_toggle(use_blip):
864
  # If toggling BLIP on and we don't have a BLIP name yet, generate it
865
  if use_blip and not blip_generated_name:
866
  if current_video_file:
867
- frame = get_middle_frame_for_blip(current_video_file, target_size=480)
868
  blip_generated_name = generate_blip_name(frame)
869
  print(f"DEBUG: Generated new BLIP name from file: '{blip_generated_name}'")
870
  elif current_video_url:
871
  # For URL videos, we might need to re-read frames
872
- frame = get_middle_frame_for_blip(current_video_url, target_size=480)
873
  blip_generated_name = generate_blip_name(frame)
874
  print(f"DEBUG: Generated new BLIP name from URL: '{blip_generated_name}'")
875
 
@@ -957,10 +974,10 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
957
  interactive=False,
958
  show_label=True
959
  )
960
- add_looking_glass_logo = gr.Checkbox(
961
- label="🔍 Add Looking Glass Logo",
962
- value=False,
963
- info="Add LG logo to thumbnail"
964
  )
965
 
966
  # Event handlers for input changes - SIMPLIFIED (no thumbnail previews during input)
@@ -1043,7 +1060,7 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
1043
  run_btn.click(
1044
  fn=infer_video_depth_from_source,
1045
  inputs=[
1046
- upload_video, video_url, filename, use_blip, add_looking_glass_logo,
1047
  max_len, target_fps, max_res, stitch,
1048
  grayscale, convert_from_color, blur
1049
  ],
 
45
  blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
46
  blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
47
 
48
+ def get_first_frame_for_blip(video_path, target_size=480):
49
+ """Effizient: Lädt nur das erste Frame für BLIP (nicht alle Frames!)"""
50
  try:
51
  cap = cv2.VideoCapture(video_path)
52
 
 
56
  cap.release()
57
  return None
58
 
59
+ # Hole Frame-Count für Debug-Info
60
  frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
61
  if frame_count <= 0:
62
  print(f"DEBUG: Invalid frame count: {frame_count}")
63
  cap.release()
64
  return None
65
 
66
+ print(f"DEBUG: Video has {frame_count} frames, reading first frame (index 0)")
 
67
 
68
+ # Lese direkt das erste Frame (Position 0)
69
+ cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
70
  ret, frame = cap.read()
71
  cap.release()
72
 
73
  if not ret or frame is None:
74
+ print("DEBUG: Could not read first frame")
75
  return None
76
 
77
  # Verkleinere nur dieses eine Frame
 
89
  return frame_rgb
90
 
91
  except Exception as e:
92
+ print(f"DEBUG: get_first_frame_for_blip error: {e}")
93
  return None
94
 
95
  def generate_blip_name(frame: np.ndarray) -> str:
 
204
  if gradient_end < height:
205
  result[gradient_end:] = depth_3ch[gradient_end:]
206
 
207
+ # 5. Immer "D" Logo hinzufügen
208
+ result = add_depth_logo(result)
 
209
 
210
  print(f"DEBUG: Real depth gradient thumbnail created successfully: {result.shape}")
211
  return result
 
215
  # Fallback: return original RGB frame
216
  return rgb_frame
217
 
218
+ def add_depth_logo(thumbnail, position="bottom-right"):
219
+ """Fügt einfaches 'D' Logo zum Thumbnail hinzu (für Depth)"""
220
  try:
221
  thumb_h, thumb_w = thumbnail.shape[:2]
222
 
223
+ # Logo-Größe basierend auf Thumbnail-Größe
224
+ logo_size = max(40, int(thumb_w * 0.08)) # 8% der Breite, mindestens 40px
225
 
226
  # Position bestimmen
227
+ margin = 15
228
  if position == "bottom-right":
229
+ x_pos = thumb_w - logo_size - margin
230
+ y_pos = thumb_h - margin
231
  elif position == "bottom-left":
232
+ x_pos = margin
233
+ y_pos = thumb_h - margin
234
  else:
235
+ x_pos = thumb_w - logo_size - margin
236
+ y_pos = thumb_h - margin
237
 
238
  # Sicherstellen dass Logo im Bild ist
239
+ x_pos = max(margin, min(x_pos, thumb_w - logo_size))
240
+ y_pos = max(logo_size, min(y_pos, thumb_h - margin))
241
 
242
+ # Font-Parameter
243
+ font = cv2.FONT_HERSHEY_SIMPLEX
244
+ font_scale = max(1.0, logo_size / 60) # Skaliert mit Logo-Größe
245
+ font_thickness = max(2, int(logo_size / 25))
246
+
247
+ # Text-Größe messen für Zentrierung
248
+ (text_w, text_h), baseline = cv2.getTextSize("D", font, font_scale, font_thickness)
249
+
250
+ # Semi-transparenten Kreis als Hintergrund
251
  overlay = thumbnail.copy()
252
+ circle_radius = logo_size // 2
253
+ circle_center = (x_pos + circle_radius, y_pos - circle_radius)
254
 
255
+ # Dunkler Kreis mit weißem Rand
256
+ cv2.circle(overlay, circle_center, circle_radius, (0, 0, 0), -1) # Schwarzer Hintergrund
257
+ cv2.circle(overlay, circle_center, circle_radius, (255, 255, 255), 2) # Weißer Rand
258
+
259
+ # "D" Text zentriert im Kreis
260
+ text_x = circle_center[0] - text_w // 2
261
+ text_y = circle_center[1] + text_h // 2
262
+
263
+ cv2.putText(overlay, "D",
264
+ (text_x, text_y),
265
+ font, font_scale, (255, 255, 255), font_thickness, cv2.LINE_AA)
266
 
267
+ # Alpha blending für semi-transparenten Effekt
268
+ alpha = 0.8
269
  result = cv2.addWeighted(thumbnail, 1-alpha, overlay, alpha, 0)
270
 
271
+ print(f"DEBUG: Added 'D' logo at {position}, size: {logo_size}px")
272
  return result
273
 
274
  except Exception as e:
275
+ print(f"DEBUG: 'D' logo addition failed: {e}")
276
  return thumbnail
277
 
278
  def embed_thumbnail_in_video(video_path, thumbnail_array, base_name):
 
507
  original_filename = ""
508
 
509
  # --- Main inference function ---
510
+ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, create_thumbnail, *args):
511
  """Process video to generate depth maps and RGBD output"""
512
  try:
513
  max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
 
541
  # Generate depth maps
542
  print("Generating depth maps...")
543
  depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=518, device=DEVICE)
544
+ print("✅ Depth maps generated successfully")
545
 
546
  # Save depth visualization with final name
547
  save_video(depths, vis_video_path, fps=fps, is_depths=True)
 
585
 
586
  # Save stitched video with final name
587
  save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
588
+ print("✅ RGBD video created successfully")
589
 
590
  # Add audio from original video if possible
591
  try:
 
603
  print(f"Audio processing failed: {e}")
604
  rgbd_path = rgbd_video_path
605
 
606
+ # 🎯 Thumbnail-Generierung (wenn aktiviert)
607
+ thumbnail = None
608
+ if create_thumbnail:
609
+ print("Creating thumbnail from completed depth data...")
610
+ try:
611
+ # Erstes Frame für Thumbnail (konsistent mit BLIP)
612
+ rgb_frame = get_first_frame_for_blip(input_path, target_size=1024)
613
+ # Erstes Depth-Frame (korrespondierend zum RGB-Frame)
614
+ first_depth_frame = depths[0] # Erstes Depth vom AI-Model!
615
+
616
+ # Gradient-Thumbnail mit echter Depth erstellen (mit "D" Logo)
617
+ thumbnail = create_gradient_thumbnail(
618
+ rgb_frame,
619
+ first_depth_frame,
620
+ longest_side=1024,
621
+ add_logo=True # Immer Logo hinzufügen
622
+ )
623
+
624
+ # Thumbnail in beide Videos einbetten
625
+ embed_thumbnail_in_video(vis_video_path, thumbnail, base_name)
626
+ if rgbd_path:
627
+ embed_thumbnail_in_video(rgbd_path, thumbnail, base_name)
628
+
629
+ print("✅ Thumbnail created and embedded successfully")
630
+
631
+ except Exception as e:
632
+ print(f"❌ Thumbnail creation failed: {e}")
633
+ thumbnail = None
634
+ else:
635
+ print("📷 Thumbnail generation skipped (disabled)")
636
 
637
  # Clean up memory
638
  gc.collect()
 
640
  torch.cuda.empty_cache()
641
 
642
  success_msg = f"✅ Videos saved as '{base_name}_vis.mp4' and '{base_name}_RGBD.mp4'"
643
+ if create_thumbnail and thumbnail is not None:
644
  success_msg += " with embedded thumbnails"
645
 
646
  print(f"DEBUG: Processing completed - Vis: '{vis_video_path}', RGBD: '{rgbd_path}'")
 
708
  blip_generated_name = ""
709
  if use_blip:
710
  print("DEBUG: Starting optimized BLIP processing...")
711
+ frame = get_first_frame_for_blip(video_file, target_size=480)
712
  blip_generated_name = generate_blip_name(frame)
713
  print(f"DEBUG: BLIP name generated: '{blip_generated_name}'")
714
 
 
849
  if use_blip and video_path:
850
  try:
851
  print("DEBUG: Starting optimized BLIP processing for URL video...")
852
+ frame = get_first_frame_for_blip(video_path, target_size=480)
853
  blip_generated_name = generate_blip_name(frame)
854
  print(f"DEBUG: {source.title()} BLIP name generated: '{blip_generated_name}'")
855
  except Exception as e:
 
881
  # If toggling BLIP on and we don't have a BLIP name yet, generate it
882
  if use_blip and not blip_generated_name:
883
  if current_video_file:
884
+ frame = get_first_frame_for_blip(current_video_file, target_size=480)
885
  blip_generated_name = generate_blip_name(frame)
886
  print(f"DEBUG: Generated new BLIP name from file: '{blip_generated_name}'")
887
  elif current_video_url:
888
  # For URL videos, we might need to re-read frames
889
+ frame = get_first_frame_for_blip(current_video_url, target_size=480)
890
  blip_generated_name = generate_blip_name(frame)
891
  print(f"DEBUG: Generated new BLIP name from URL: '{blip_generated_name}'")
892
 
 
974
  interactive=False,
975
  show_label=True
976
  )
977
+ create_thumbnail = gr.Checkbox(
978
+ label="🖼️ Create Embedded Thumbnail",
979
+ value=True,
980
+ info="Generate and embed thumbnail in MP4"
981
  )
982
 
983
  # Event handlers for input changes - SIMPLIFIED (no thumbnail previews during input)
 
1060
  run_btn.click(
1061
  fn=infer_video_depth_from_source,
1062
  inputs=[
1063
+ upload_video, video_url, filename, use_blip, create_thumbnail,
1064
  max_len, target_fps, max_res, stitch,
1065
  grayscale, convert_from_color, blur
1066
  ],