Update app.py
app.py CHANGED
@@ -45,8 +45,8 @@ print("Loading BLIP model...")
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")

-def get_middle_frame_for_blip(video_path, target_size=480):
-    """Efficient: loads only the middle frame for BLIP (not all frames!)"""
+def get_first_frame_for_blip(video_path, target_size=480):
+    """Efficient: loads only the first frame for BLIP (not all frames!)"""
     try:
         cap = cv2.VideoCapture(video_path)
@@ -56,23 +56,22 @@ def get_middle_frame_for_blip(video_path, target_size=480):
             cap.release()
             return None

-        # Get the frame count
+        # Get the frame count for debug info
         frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
         if frame_count <= 0:
             print(f"DEBUG: Invalid frame count: {frame_count}")
             cap.release()
             return None

-        middle_idx = frame_count // 2
-        print(f"DEBUG: Video has {frame_count} frames, jumping to frame {middle_idx}")
+        print(f"DEBUG: Video has {frame_count} frames, reading first frame (index 0)")

-        # Jump to the middle frame
-        cap.set(cv2.CAP_PROP_POS_FRAMES, middle_idx)
+        # Read the first frame directly (position 0)
+        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
         ret, frame = cap.read()
         cap.release()

         if not ret or frame is None:
-            print("DEBUG: Could not read middle frame")
+            print("DEBUG: Could not read first frame")
             return None

         # Downscale only this single frame
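
Note on this hunk: a freshly opened cv2.VideoCapture is already positioned at frame 0, so the explicit cap.set(cv2.CAP_PROP_POS_FRAMES, 0) is redundant (harmless, but it can be dropped). A minimal sketch of the same read, not part of this commit:

    import cv2

    def read_first_frame(video_path):
        """Return the first frame of a video, or None if it cannot be read."""
        cap = cv2.VideoCapture(video_path)
        try:
            ret, frame = cap.read()  # a fresh capture starts at frame 0
            return frame if ret else None
        finally:
            cap.release()  # always free the decoder handle
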
@@ -90,7 +89,7 @@ def get_middle_frame_for_blip(video_path, target_size=480):
         return frame_rgb

     except Exception as e:
-        print(f"DEBUG: get_middle_frame_for_blip error: {e}")
+        print(f"DEBUG: get_first_frame_for_blip error: {e}")
         return None

 def generate_blip_name(frame: np.ndarray) -> str:
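
The body of generate_blip_name is outside this diff. For context, the standard BLIP captioning pattern with the processor and model loaded above looks roughly like this (a sketch of the usual transformers API, not this app's exact code):

    from PIL import Image

    def caption_frame(frame):
        """Generate a short caption for an RGB frame using the BLIP model above."""
        inputs = blip_processor(Image.fromarray(frame), return_tensors="pt")
        out = blip_model.generate(**inputs, max_new_tokens=20)
        return blip_processor.decode(out[0], skip_special_tokens=True)
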
@@ -205,9 +204,8 @@ def create_gradient_thumbnail(rgb_frame, depth_frame, longest_side=1024, add_logo=True):
         if gradient_end < height:
             result[gradient_end:] = depth_3ch[gradient_end:]

-        # 5. Add the Looking Glass logo (optional)
-        if add_logo:
-            result = add_looking_glass_logo(result)
+        # 5. Always add the "D" logo
+        result = add_depth_logo(result)

         print(f"DEBUG: Real depth gradient thumbnail created successfully: {result.shape}")
         return result
@@ -217,51 +215,64 @@ def create_gradient_thumbnail(rgb_frame, depth_frame, longest_side=1024, add_logo=True):
         # Fallback: return original RGB frame
         return rgb_frame

-def add_looking_glass_logo(thumbnail, position="bottom-right"):
-    """Adds the Looking Glass logo to the thumbnail"""
+def add_depth_logo(thumbnail, position="bottom-right"):
+    """Adds a simple 'D' logo to the thumbnail (for depth)"""
     try:
         thumb_h, thumb_w = thumbnail.shape[:2]

+        # Logo size based on the thumbnail size
+        logo_size = max(40, int(thumb_w * 0.08))  # 8% of the width, at least 40px

         # Determine the position
+        margin = 15
         if position == "bottom-right":
+            x_pos = thumb_w - logo_size - margin
+            y_pos = thumb_h - margin
         elif position == "bottom-left":
+            x_pos = margin
+            y_pos = thumb_h - margin
         else:
+            x_pos = thumb_w - logo_size - margin
+            y_pos = thumb_h - margin

         # Make sure the logo stays inside the image
+        x_pos = max(margin, min(x_pos, thumb_w - logo_size))
+        y_pos = max(logo_size, min(y_pos, thumb_h - margin))

+        # Font parameters
+        font = cv2.FONT_HERSHEY_SIMPLEX
+        font_scale = max(1.0, logo_size / 60)  # Scales with the logo size
+        font_thickness = max(2, int(logo_size / 25))
+
+        # Measure the text size for centering
+        (text_w, text_h), baseline = cv2.getTextSize("D", font, font_scale, font_thickness)
+
+        # Semi-transparent circle as background
         overlay = thumbnail.copy()
+        circle_radius = logo_size // 2
+        circle_center = (x_pos + circle_radius, y_pos - circle_radius)

+        # Dark circle with a white border
+        cv2.circle(overlay, circle_center, circle_radius, (0, 0, 0), -1)  # Black background
+        cv2.circle(overlay, circle_center, circle_radius, (255, 255, 255), 2)  # White border
+
+        # "D" text centered in the circle
+        text_x = circle_center[0] - text_w // 2
+        text_y = circle_center[1] + text_h // 2
+
+        cv2.putText(overlay, "D",
+                    (text_x, text_y),
+                    font, font_scale, (255, 255, 255), font_thickness, cv2.LINE_AA)

-        # Alpha blending
+        # Alpha blending for a semi-transparent effect
+        alpha = 0.8
         result = cv2.addWeighted(thumbnail, 1-alpha, overlay, alpha, 0)

+        print(f"DEBUG: Added 'D' logo at {position}, size: {logo_size}px")
         return result

     except Exception as e:
+        print(f"DEBUG: 'D' logo addition failed: {e}")
         return thumbnail

 def embed_thumbnail_in_video(video_path, thumbnail_array, base_name):
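
One observation on add_depth_logo: cv2.addWeighted blends the entire frame even though only the badge circle differs between thumbnail and overlay. Restricting the blend to the badge's bounding box gives the same visual result for less work; a sketch under that assumption (blend_roi is hypothetical, not in the commit):

    import cv2

    def blend_roi(base, overlay, center, radius, alpha=0.8):
        """Alpha-blend only the square region around the badge circle."""
        cx, cy = center
        x0, y0 = max(0, cx - radius), max(0, cy - radius)
        x1 = min(base.shape[1], cx + radius)  # shape[1] is the image width
        y1 = min(base.shape[0], cy + radius)  # shape[0] is the image height
        result = base.copy()
        result[y0:y1, x0:x1] = cv2.addWeighted(
            base[y0:y1, x0:x1], 1 - alpha, overlay[y0:y1, x0:x1], alpha, 0)
        return result
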
@@ -496,7 +507,7 @@ blip_generated_name = ""
 original_filename = ""

 # --- Main inference function ---
-def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
+def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, create_thumbnail, *args):
     """Process video to generate depth maps and RGBD output"""
     try:
         max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
@@ -530,6 +541,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
         # Generate depth maps
         print("Generating depth maps...")
         depths, fps = video_depth_anything.infer_video_depth(frames, target_fps, input_size=518, device=DEVICE)
+        print("✅ Depth maps generated successfully")

         # Save depth visualization with final name
         save_video(depths, vis_video_path, fps=fps, is_depths=True)
@@ -573,6 +585,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):

         # Save stitched video with final name
         save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
+        print("✅ RGBD video created successfully")

         # Add audio from original video if possible
         try:
@@ -590,32 +603,36 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
             print(f"Audio processing failed: {e}")
             rgbd_path = rgbd_video_path

+        # 🎯 Thumbnail generation (if enabled)
+        thumbnail = None
+        if create_thumbnail:
+            print("Creating thumbnail from completed depth data...")
+            try:
+                # First frame for the thumbnail (consistent with BLIP)
+                rgb_frame = get_first_frame_for_blip(input_path, target_size=1024)
+                # First depth frame (corresponding to the RGB frame)
+                first_depth_frame = depths[0]  # First depth map from the AI model!
+
+                # Create the gradient thumbnail from real depth (with "D" logo)
+                thumbnail = create_gradient_thumbnail(
+                    rgb_frame,
+                    first_depth_frame,
+                    longest_side=1024,
+                    add_logo=True  # Always add the logo
+                )
+
+                # Embed the thumbnail in both videos
+                embed_thumbnail_in_video(vis_video_path, thumbnail, base_name)
+                if rgbd_path:
+                    embed_thumbnail_in_video(rgbd_path, thumbnail, base_name)
+
+                print("✅ Thumbnail created and embedded successfully")
+
+            except Exception as e:
+                print(f"❌ Thumbnail creation failed: {e}")
+                thumbnail = None
+        else:
+            print("📷 Thumbnail generation skipped (disabled)")

         # Clean up memory
         gc.collect()
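
get_first_frame_for_blip returns None on any read error, and create_gradient_thumbnail receives that value unchecked here; the surrounding try/except will catch the failure, but an explicit guard fails faster and logs a clearer message. A sketch using this commit's names (a hypothetical refinement, not the author's code):

    # Guard before building the thumbnail (names as used in this commit)
    rgb_frame = get_first_frame_for_blip(input_path, target_size=1024)
    if rgb_frame is None or len(depths) == 0:
        print("DEBUG: Skipping thumbnail - no RGB frame or depth data")
    else:
        thumbnail = create_gradient_thumbnail(
            rgb_frame, depths[0], longest_side=1024, add_logo=True)
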
@@ -623,7 +640,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *args):
             torch.cuda.empty_cache()

         success_msg = f"✅ Videos saved as '{base_name}_vis.mp4' and '{base_name}_RGBD.mp4'"
-        if thumbnail is not None:
+        if create_thumbnail and thumbnail is not None:
             success_msg += " with embedded thumbnails"

         print(f"DEBUG: Processing completed - Vis: '{vis_video_path}', RGBD: '{rgbd_path}'")
@@ -691,7 +708,7 @@ def on_video_upload_change(video_file, use_blip):
     blip_generated_name = ""
     if use_blip:
         print("DEBUG: Starting optimized BLIP processing...")
-        frame = get_middle_frame_for_blip(video_file, target_size=480)
+        frame = get_first_frame_for_blip(video_file, target_size=480)
         blip_generated_name = generate_blip_name(frame)
         print(f"DEBUG: BLIP name generated: '{blip_generated_name}'")
@@ -832,7 +849,7 @@ def on_video_url_change(url, use_blip):
     if use_blip and video_path:
         try:
             print("DEBUG: Starting optimized BLIP processing for URL video...")
-            frame = get_middle_frame_for_blip(video_path, target_size=480)
+            frame = get_first_frame_for_blip(video_path, target_size=480)
             blip_generated_name = generate_blip_name(frame)
             print(f"DEBUG: {source.title()} BLIP name generated: '{blip_generated_name}'")
         except Exception as e:
@@ -864,12 +881,12 @@ def on_blip_toggle(use_blip):
     # If toggling BLIP on and we don't have a BLIP name yet, generate it
     if use_blip and not blip_generated_name:
         if current_video_file:
-            frame = get_middle_frame_for_blip(current_video_file, target_size=480)
+            frame = get_first_frame_for_blip(current_video_file, target_size=480)
             blip_generated_name = generate_blip_name(frame)
             print(f"DEBUG: Generated new BLIP name from file: '{blip_generated_name}'")
         elif current_video_url:
             # For URL videos, we might need to re-read frames
-            frame = get_middle_frame_for_blip(current_video_url, target_size=480)
+            frame = get_first_frame_for_blip(current_video_url, target_size=480)
             blip_generated_name = generate_blip_name(frame)
             print(f"DEBUG: Generated new BLIP name from URL: '{blip_generated_name}'")
@@ -957,10 +974,10 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
                 interactive=False,
                 show_label=True
             )
+            create_thumbnail = gr.Checkbox(
+                label="🖼️ Create Embedded Thumbnail",
+                value=True,
+                info="Generate and embed thumbnail in MP4"
             )

             # Event handlers for input changes - SIMPLIFIED (no thumbnail previews during input)
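
The new checkbox only takes effect because it is also threaded into the run_btn.click inputs in the next hunk; Gradio passes component values positionally, so create_thumbnail must sit exactly where the function signature expects it (fifth, before *args). A minimal standalone sketch of the pattern (hypothetical handler, not this app):

    import gradio as gr

    def process(video, make_thumbnail):
        """Toy handler: report whether the thumbnail option was checked."""
        return f"video={video!r}, thumbnail={'on' if make_thumbnail else 'off'}"

    with gr.Blocks() as demo:
        upload_video = gr.Video()
        create_thumbnail = gr.Checkbox(label="🖼️ Create Embedded Thumbnail", value=True)
        status = gr.Textbox()
        gr.Button("Run").click(fn=process,
                               inputs=[upload_video, create_thumbnail],
                               outputs=status)
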
@@ -1043,7 +1060,7 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
     run_btn.click(
         fn=infer_video_depth_from_source,
         inputs=[
-            upload_video, video_url, filename, use_blip,
+            upload_video, video_url, filename, use_blip, create_thumbnail,
             max_len, target_fps, max_res, stitch,
             grayscale, convert_from_color, blur
         ],