Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -346,6 +346,231 @@ def cleanup_temp_files(*file_paths):
|
|
| 346 |
except Exception as e:
|
| 347 |
print(f"DEBUG: Cleanup failed for {file_path}: {e}")
|
| 348 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
# --- Load depth model ---
|
| 350 |
print("Loading Video Depth Anything model...")
|
| 351 |
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
|
@@ -667,16 +892,15 @@ def on_generate_audio(upload_video, video_url, audio_prompt, negative_prompt, au
|
|
| 667 |
error_msg = f"❌ Audio generation error: {str(e)}"
|
| 668 |
print(error_msg)
|
| 669 |
return None, error_msg
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
"""Process video to generate depth maps and RGBD output"""
|
| 673 |
try:
|
| 674 |
max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
|
| 675 |
|
| 676 |
# Determine input source
|
| 677 |
input_path = upload_video or video_url
|
| 678 |
if not input_path:
|
| 679 |
-
return None, None, "Error: No video source provided"
|
| 680 |
|
| 681 |
# Fix filename at generation time
|
| 682 |
base_name = filename.strip().replace(" ", "_")[:30] if filename.strip() else "output"
|
|
@@ -696,7 +920,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
|
|
| 696 |
print("Reading video frames...")
|
| 697 |
frames, target_fps = read_video_frames(input_path, max_len, target_fps, max_res)
|
| 698 |
if len(frames) == 0:
|
| 699 |
-
return None, None, "Error: No frames could be extracted from video"
|
| 700 |
|
| 701 |
# Generate depth maps
|
| 702 |
print("Generating depth maps...")
|
|
@@ -707,6 +931,7 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
|
|
| 707 |
save_video(depths, vis_video_path, fps=fps, is_depths=True)
|
| 708 |
|
| 709 |
rgbd_path = None
|
|
|
|
| 710 |
|
| 711 |
if stitch:
|
| 712 |
print("Creating RGBD stitched video...")
|
|
@@ -743,6 +968,24 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
|
|
| 743 |
depth_resized = cv2.resize(depth_vis, (rgb.shape[1], rgb.shape[0]))
|
| 744 |
stitched = cv2.hconcat([rgb, depth_resized])
|
| 745 |
stitched_frames.append(stitched)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 746 |
|
| 747 |
# Save stitched video
|
| 748 |
save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
|
|
@@ -765,6 +1008,14 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
|
|
| 765 |
|
| 766 |
rgbd_path = rgbd_video_path
|
| 767 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 768 |
# Clean up memory
|
| 769 |
gc.collect()
|
| 770 |
if torch.cuda.is_available():
|
|
@@ -773,14 +1024,16 @@ def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, *
|
|
| 773 |
success_msg = f"✅ Videos saved as '{base_name}_vis.mp4'"
|
| 774 |
if stitch and rgbd_path:
|
| 775 |
success_msg += f" and '{base_name}_RGBD.mp4'"
|
|
|
|
|
|
|
| 776 |
|
| 777 |
print(f"DEBUG: Processing completed - Vis: '{vis_video_path}', RGBD: '{rgbd_path}'")
|
| 778 |
-
return vis_video_path, rgbd_path, success_msg
|
| 779 |
|
| 780 |
except Exception as e:
|
| 781 |
error_msg = f"Processing failed: {str(e)}"
|
| 782 |
print(error_msg)
|
| 783 |
-
return None, None, error_msg
|
| 784 |
|
| 785 |
# --- UI event handlers ---
|
| 786 |
def on_video_upload_change(video_file, use_blip):
|
|
@@ -1022,7 +1275,7 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
|
|
| 1022 |
show_label=True
|
| 1023 |
)
|
| 1024 |
|
| 1025 |
-
#
|
| 1026 |
with gr.Row():
|
| 1027 |
video_url = gr.Textbox(
|
| 1028 |
label="Video URL (YouTube, TikTok, Instagram, Civitai, MidJourney, etc.)",
|
|
@@ -1040,6 +1293,21 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
|
|
| 1040 |
placeholder="Enter filename or let BLIP generate it",
|
| 1041 |
scale=3
|
| 1042 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1043 |
|
| 1044 |
# MMaudio Integration
|
| 1045 |
with gr.Accordion("🎵 Audio Generation (MMaudio)", open=False):
|
|
@@ -1203,11 +1471,11 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
|
|
| 1203 |
run_btn.click(
|
| 1204 |
fn=infer_video_depth_from_source,
|
| 1205 |
inputs=[
|
| 1206 |
-
upload_video, video_url, filename, use_blip,
|
| 1207 |
max_len, target_fps, max_res, stitch,
|
| 1208 |
grayscale, convert_from_color, blur
|
| 1209 |
],
|
| 1210 |
-
outputs=[depth_out, rgbd_out, status_display]
|
| 1211 |
)
|
| 1212 |
|
| 1213 |
gr.Markdown("""
|
|
@@ -1215,6 +1483,8 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
|
|
| 1215 |
- **Upload formats**: MP4, AVI, MOV, etc.
|
| 1216 |
- **BLIP naming**: Automatically generates descriptive filenames
|
| 1217 |
- **RGBD output**: Side-by-side comparison of original and depth
|
|
|
|
|
|
|
| 1218 |
- **Audio Generation**: Use MMaudio Large V2 (44kHz) for high-quality audio synthesis
|
| 1219 |
- **Audio Prompts**: Be descriptive (e.g. "gentle ocean waves with seagulls", "epic orchestral music")
|
| 1220 |
- **Iterative Audio**: Generate multiple times with different prompts to perfect the audio
|
|
|
|
| 346 |
except Exception as e:
|
| 347 |
print(f"DEBUG: Cleanup failed for {file_path}: {e}")
|
| 348 |
|
| 349 |
+
def create_overlay_thumbnail(rgb_frame, depth_frame):
    """Build a thumbnail of the full RGB frame with a depth inset bottom-right.

    The RGB frame is downscaled so that its longer side is at most 1024 px
    (frames that are already small enough are copied unchanged). The depth
    frame is resized to 30% of the thumbnail width, keeping the thumbnail's
    aspect ratio, and blended flush into the bottom-right corner through a
    mask with a rounded top-left corner.

    Args:
        rgb_frame: Colour frame; its first two axes are read as
            (height, width).
        depth_frame: Depth visualisation frame. It is resized here, so its
            resolution does not have to match ``rgb_frame``.

    Returns:
        A new array containing the thumbnail. Neither input is modified.
    """
    print(f"DEBUG: Creating overlay thumbnail - RGB: {rgb_frame.shape}, Depth: {depth_frame.shape}")

    # 1. Scale the RGB frame down to thumbnail size (never upscale).
    target_size = 1024
    h, w = rgb_frame.shape[:2]
    longest_side = max(h, w)

    if longest_side > target_size:
        scale = target_size / longest_side
        # int() truncates; clamp to 1 px so an extreme aspect ratio cannot
        # hand cv2.resize a zero dimension.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        rgb_thumb = cv2.resize(rgb_frame, (new_w, new_h))
    else:
        rgb_thumb = rgb_frame.copy()

    print(f"DEBUG: RGB thumbnail size: {rgb_thumb.shape}")

    # 2. Depth miniature: 30% of the thumbnail width, same aspect ratio.
    thumb_h, thumb_w = rgb_thumb.shape[:2]
    depth_mini_w = max(1, int(thumb_w * 0.30))
    depth_mini_h = max(1, int(depth_mini_w * (thumb_h / thumb_w)))
    depth_mini = cv2.resize(depth_frame, (depth_mini_w, depth_mini_h))

    print(f"DEBUG: Depth miniature size: {depth_mini.shape} (30% of RGB width)")

    # 3. Place the miniature flush with the bottom-right corner. The inset is
    #    never larger than the thumbnail, so the offsets cannot go negative
    #    and no further clamping or resizing is needed.
    result = rgb_thumb.copy()
    x_start = thumb_w - depth_mini_w
    y_start = thumb_h - depth_mini_h

    # 4. Mask with a rounded top-left corner for the inset.
    mask = create_rounded_corner_mask(depth_mini_w, depth_mini_h)

    # 5. Blend the miniature onto the thumbnail copy (modified in place).
    apply_rounded_overlay(result, depth_mini, x_start, y_start, mask)

    print(f"DEBUG: Overlay thumbnail completed: {result.shape}")
    print(f"DEBUG: Depth overlay at position ({x_start}, {y_start}) with size {depth_mini_w}x{depth_mini_h}")
    return result
|
| 414 |
+
|
| 415 |
+
def create_rounded_corner_mask(width, height):
    """Create an anti-aliased alpha mask with a rounded top-left corner.

    Args:
        width: Mask width in pixels.
        height: Mask height in pixels.

    Returns:
        A float32 array of shape ``(height, width)``. Values are 1.0
        (overlay fully visible) everywhere except outside the quarter circle
        in the top-left corner, where they fall off linearly to 0.0 over a
        one-pixel band.
    """
    # Corner radius: 40% of the smaller dimension, but at least 5 px ...
    radius = max(int(min(width, height) * 0.40), 5)
    # ... and never larger than the mask itself, otherwise the corner patch
    # would index past the array for overlays smaller than 5 px.
    radius = min(radius, width, height)

    # 1.0 = overlay visible, 0.0 = background shows through.
    mask = np.ones((height, width), dtype=np.float32)

    if radius > 0:
        # Offset of each corner pixel from the circle centre (radius, radius).
        offsets = np.arange(radius) - radius
        dist = np.sqrt(offsets[:, None] ** 2 + offsets[None, :] ** 2)
        # Inside the circle the expression is >= 1 and clips to 1.0; outside
        # it drops linearly to 0.0 within one pixel (anti-aliasing).
        mask[:radius, :radius] = np.clip(1 - (dist - radius), 0.0, 1.0)

    print(f"DEBUG: Created rounded mask with radius {radius}px for {width}x{height} overlay")
    return mask
|
| 437 |
+
|
| 438 |
+
def apply_rounded_overlay(result, depth_mini, x_start, y_start, mask):
    """Alpha-blend the depth miniature onto ``result`` in place.

    Each of the first three channels of the target region is replaced by
    ``background * (1 - mask) + depth * mask``.

    Args:
        result: Target image with a channel axis; modified in place.
        depth_mini: Miniature to paste. Either ``(h, w, channels)`` or a
            single-channel ``(h, w)`` array, which is applied to every
            colour channel.
        x_start: Left column of the target region in ``result``.
        y_start: Top row of the target region in ``result``.
        mask: ``(h, w)`` blend weights; 1 shows the miniature, 0 keeps the
            background.

    Returns:
        None. The blended pixels are written into ``result``.
    """
    actual_h, actual_w = depth_mini.shape[:2]

    # View (not a copy) of the colour channels being overwritten.
    region = result[y_start:y_start + actual_h, x_start:x_start + actual_w, :3]

    overlay = depth_mini.astype(np.float32)
    if overlay.ndim == 2:
        # Grayscale miniature: add a channel axis so it broadcasts to RGB.
        overlay = overlay[:, :, np.newaxis]
    else:
        overlay = overlay[:, :, :3]

    # Broadcast the 2-D mask across the channel axis.
    alpha = mask[:, :, np.newaxis]
    blended = region.astype(np.float32) * (1 - alpha) + overlay * alpha
    region[...] = blended.astype(np.uint8)

    print(f"DEBUG: Applied anti-aliased rounded overlay at ({x_start}, {y_start})")
|
| 453 |
+
|
| 454 |
+
def add_depth_logo_to_overlay(thumbnail, overlay_x, overlay_y, overlay_w, overlay_h):
    """Draw a small white 'D' on a black disc inside the depth overlay area.

    The badge is drawn on a copy of ``thumbnail`` and blended back at 80%
    opacity. On any error the failure is printed and the unmodified
    ``thumbnail`` is returned instead.

    Args:
        thumbnail: Image to decorate; it is not modified.
        overlay_x: Left edge of the depth overlay inside ``thumbnail``.
        overlay_y: Top edge of the depth overlay inside ``thumbnail``.
        overlay_w: Width of the depth overlay.
        overlay_h: Height of the depth overlay.

    Returns:
        The blended image, or ``thumbnail`` itself if drawing failed.
    """
    try:
        # Badge diameter: 15% of the overlay width, at least 20 px.
        logo_size = max(20, int(overlay_w * 0.15))
        margin = 5

        # Start from the overlay's bottom-right corner, then clamp so the
        # badge stays within the overlay.
        right_limit = overlay_x + overlay_w - logo_size
        bottom_limit = overlay_y + overlay_h - margin
        x_pos = max(overlay_x + margin, min(right_limit - margin, right_limit))
        y_pos = max(overlay_y + logo_size, min(bottom_limit, bottom_limit))

        # Font settings scale with the badge size.
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = max(1.0, logo_size / 20)
        font_thickness = max(2, int(logo_size / 10))

        # Text extent is needed to centre the letter on the disc.
        text_size, _baseline = cv2.getTextSize("D", font, font_scale, font_thickness)
        text_w, text_h = text_size

        circle_radius = logo_size // 2
        circle_center = (x_pos + circle_radius, y_pos - circle_radius)

        # Draw on a copy so the badge can be blended semi-transparently.
        canvas = thumbnail.copy()
        cv2.circle(canvas, circle_center, circle_radius, (0, 0, 0), -1, cv2.LINE_AA)

        text_origin = (
            circle_center[0] - text_w // 2,
            circle_center[1] + text_h // 2,
        )
        cv2.putText(canvas, "D", text_origin,
                    font, font_scale, (255, 255, 255), font_thickness, cv2.LINE_AA)

        # 80% badge, 20% original pixels.
        alpha = 0.8
        result = cv2.addWeighted(thumbnail, 1 - alpha, canvas, alpha, 0)

        print(f"DEBUG: Added small 'D' logo to overlay at ({circle_center[0]}, {circle_center[1]}), size: {logo_size}px")
        return result

    except Exception as e:
        print(f"DEBUG: Overlay logo addition failed: {e}")
        return thumbnail
|
| 505 |
+
|
| 506 |
+
def embed_thumbnail_in_video(video_path, thumbnail_array, base_name):
    """Embed a thumbnail as JPEG cover art in a video file via ffmpeg.

    The thumbnail is written to a temporary JPEG in the current working
    directory, muxed into a temporary copy of the video as an
    ``attached_pic`` stream (existing streams are stream-copied), and the
    temporary copy then replaces ``video_path``. Temporary files are removed
    whether or not embedding succeeds.

    Args:
        video_path: Path of the video to modify in place.
        thumbnail_array: Image array. A 3-channel array is converted from
            RGB to BGR before being written with ``cv2.imwrite``.
        base_name: Name fragment used for the temporary JPEG file.

    Returns:
        True if ffmpeg succeeded and the video was replaced, False on any
        failure (the failure is printed, never raised).
    """
    temp_thumb_path = None
    temp_output = None
    try:
        # cv2.imwrite expects BGR, so a 3-channel array is converted from RGB.
        if len(thumbnail_array.shape) == 3 and thumbnail_array.shape[2] == 3:
            thumbnail_bgr = cv2.cvtColor(thumbnail_array, cv2.COLOR_RGB2BGR)
        else:
            thumbnail_bgr = thumbnail_array

        # Explicit JPEG (the original note says this is for iOS compatibility).
        temp_thumb_path = f"temp_{base_name}_thumb.jpg"
        success = cv2.imwrite(temp_thumb_path, thumbnail_bgr, [
            cv2.IMWRITE_JPEG_QUALITY, 90,
            cv2.IMWRITE_JPEG_OPTIMIZE, 1,
        ])

        if not success:
            raise RuntimeError("Failed to save thumbnail as JPEG")

        if not os.path.exists(temp_thumb_path):
            raise RuntimeError("Thumbnail JPEG file not created")

        print(f"DEBUG: Saved thumbnail as JPEG: {temp_thumb_path}")

        # Derive the temporary output from the real extension. A plain
        # str.replace('.mp4', ...) returns the input path itself when the
        # name has no '.mp4', and the cleanup below would then delete the
        # original video.
        stem, ext = os.path.splitext(video_path)
        temp_output = f"{stem}_with_thumb{ext or '.mp4'}"

        cmd = [
            "ffmpeg", "-y",
            "-i", video_path,                    # original video
            "-i", temp_thumb_path,               # JPEG thumbnail image
            "-map", "0",                         # all streams of the video
            "-map", "1",                         # thumbnail stream
            "-c", "copy",                        # no re-encoding of video/audio
            "-c:v:1", "mjpeg",                   # thumbnail explicitly as MJPEG
            "-disposition:v:1", "attached_pic",  # mark as cover art
            "-metadata:s:v:1", "title=Cover",
            "-metadata:s:v:1", "comment=JPEG Video Thumbnail",
            temp_output,
        ]

        print(f"DEBUG: Embedding JPEG thumbnail in video: {video_path}")
        completed = subprocess.run(cmd, capture_output=True, text=True)

        if completed.returncode != 0:
            print(f"❌ FFmpeg failed: {completed.stderr}")
            return False

        # Swap the thumbnail-bearing copy in for the original.
        os.replace(temp_output, video_path)
        print(f"✅ JPEG thumbnail successfully embedded in {video_path}")
        return True

    except Exception as e:
        print(f"❌ Thumbnail embedding failed: {e}")
        return False

    finally:
        # Best-effort cleanup on every exit path, including exceptions.
        for leftover in (temp_thumb_path, temp_output):
            try:
                if leftover and os.path.exists(leftover):
                    os.remove(leftover)
            except OSError as cleanup_error:
                print(f"DEBUG: Cleanup failed for {leftover}: {cleanup_error}")
|
| 573 |
+
|
| 574 |
# --- Load depth model ---
|
| 575 |
print("Loading Video Depth Anything model...")
|
| 576 |
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
|
|
|
| 892 |
error_msg = f"❌ Audio generation error: {str(e)}"
|
| 893 |
print(error_msg)
|
| 894 |
return None, error_msg
|
| 895 |
+
def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, create_thumbnail, *args):
|
| 896 |
+
"""Process video to generate depth maps and RGBD output - NO FALLBACK THUMBNAIL"""
|
|
|
|
| 897 |
try:
|
| 898 |
max_len, target_fps, max_res, stitch, grayscale, convert_from_color, blur = args
|
| 899 |
|
| 900 |
# Determine input source
|
| 901 |
input_path = upload_video or video_url
|
| 902 |
if not input_path:
|
| 903 |
+
return None, None, "Error: No video source provided", None
|
| 904 |
|
| 905 |
# Fix filename at generation time
|
| 906 |
base_name = filename.strip().replace(" ", "_")[:30] if filename.strip() else "output"
|
|
|
|
| 920 |
print("Reading video frames...")
|
| 921 |
frames, target_fps = read_video_frames(input_path, max_len, target_fps, max_res)
|
| 922 |
if len(frames) == 0:
|
| 923 |
+
return None, None, "Error: No frames could be extracted from video", None
|
| 924 |
|
| 925 |
# Generate depth maps
|
| 926 |
print("Generating depth maps...")
|
|
|
|
| 931 |
save_video(depths, vis_video_path, fps=fps, is_depths=True)
|
| 932 |
|
| 933 |
rgbd_path = None
|
| 934 |
+
thumbnail = None
|
| 935 |
|
| 936 |
if stitch:
|
| 937 |
print("Creating RGBD stitched video...")
|
|
|
|
| 968 |
depth_resized = cv2.resize(depth_vis, (rgb.shape[1], rgb.shape[0]))
|
| 969 |
stitched = cv2.hconcat([rgb, depth_resized])
|
| 970 |
stitched_frames.append(stitched)
|
| 971 |
+
|
| 972 |
+
# 🎯 CREATE THUMBNAIL from first perfectly matched RGB+Depth pair (but don't embed yet)
|
| 973 |
+
if i == 0 and create_thumbnail:
|
| 974 |
+
print("Creating thumbnail from first perfectly matched RGB+Depth pair...")
|
| 975 |
+
try:
|
| 976 |
+
print(f"DEBUG: Using RGB: {rgb.shape}, Depth: {depth_resized.shape}")
|
| 977 |
+
print(f"DEBUG: Depth range: {depth_resized.min()} - {depth_resized.max()}")
|
| 978 |
+
|
| 979 |
+
# Erstelle Thumbnail mit den bereits perfekt passenden Frames
|
| 980 |
+
thumbnail = create_overlay_thumbnail(rgb, depth_resized)
|
| 981 |
+
|
| 982 |
+
print("✅ Thumbnail created from first RGBD pair (not embedded yet)")
|
| 983 |
+
|
| 984 |
+
except Exception as e:
|
| 985 |
+
print(f"❌ Thumbnail creation failed: {e}")
|
| 986 |
+
import traceback
|
| 987 |
+
traceback.print_exc()
|
| 988 |
+
thumbnail = None
|
| 989 |
|
| 990 |
# Save stitched video
|
| 991 |
save_video(np.array(stitched_frames), rgbd_video_path, fps=fps)
|
|
|
|
| 1008 |
|
| 1009 |
rgbd_path = rgbd_video_path
|
| 1010 |
|
| 1011 |
+
# 🎯 FINAL FIX: Embed thumbnail ONLY in RGBD video AFTER all processing
|
| 1012 |
+
if create_thumbnail and thumbnail is not None:
|
| 1013 |
+
print("Embedding thumbnail in RGBD video only (after all processing)...")
|
| 1014 |
+
embed_thumbnail_in_video(rgbd_video_path, thumbnail, base_name)
|
| 1015 |
+
print("✅ Thumbnail embedded in RGBD video only")
|
| 1016 |
+
elif create_thumbnail:
|
| 1017 |
+
print("❌ No thumbnail to embed")
|
| 1018 |
+
|
| 1019 |
# Clean up memory
|
| 1020 |
gc.collect()
|
| 1021 |
if torch.cuda.is_available():
|
|
|
|
| 1024 |
success_msg = f"✅ Videos saved as '{base_name}_vis.mp4'"
|
| 1025 |
if stitch and rgbd_path:
|
| 1026 |
success_msg += f" and '{base_name}_RGBD.mp4'"
|
| 1027 |
+
if create_thumbnail and thumbnail is not None:
|
| 1028 |
+
success_msg += " with embedded thumbnail"
|
| 1029 |
|
| 1030 |
print(f"DEBUG: Processing completed - Vis: '{vis_video_path}', RGBD: '{rgbd_path}'")
|
| 1031 |
+
return vis_video_path, rgbd_path, success_msg, thumbnail
|
| 1032 |
|
| 1033 |
except Exception as e:
|
| 1034 |
error_msg = f"Processing failed: {str(e)}"
|
| 1035 |
print(error_msg)
|
| 1036 |
+
return None, None, error_msg, None
|
| 1037 |
|
| 1038 |
# --- UI event handlers ---
|
| 1039 |
def on_video_upload_change(video_file, use_blip):
|
|
|
|
| 1275 |
show_label=True
|
| 1276 |
)
|
| 1277 |
|
| 1278 |
+
# Single row with all input controls and thumbnail preview
|
| 1279 |
with gr.Row():
|
| 1280 |
video_url = gr.Textbox(
|
| 1281 |
label="Video URL (YouTube, TikTok, Instagram, Civitai, MidJourney, etc.)",
|
|
|
|
| 1293 |
placeholder="Enter filename or let BLIP generate it",
|
| 1294 |
scale=3
|
| 1295 |
)
|
| 1296 |
+
create_thumbnail = gr.Checkbox(
|
| 1297 |
+
label="Embed Video Thumbnail",
|
| 1298 |
+
value=True,
|
| 1299 |
+
scale=1,
|
| 1300 |
+
info="Generate and embed thumbnail in MP4"
|
| 1301 |
+
)
|
| 1302 |
+
thumbnail_preview = gr.Image(
|
| 1303 |
+
label="Thumbnail Preview",
|
| 1304 |
+
height=140,
|
| 1305 |
+
width=180,
|
| 1306 |
+
interactive=False,
|
| 1307 |
+
show_label=True,
|
| 1308 |
+
scale=1,
|
| 1309 |
+
format="jpeg" # Erzwinge JPEG für Downloads
|
| 1310 |
+
)
|
| 1311 |
|
| 1312 |
# MMaudio Integration
|
| 1313 |
with gr.Accordion("🎵 Audio Generation (MMaudio)", open=False):
|
|
|
|
| 1471 |
run_btn.click(
|
| 1472 |
fn=infer_video_depth_from_source,
|
| 1473 |
inputs=[
|
| 1474 |
+
upload_video, video_url, filename, use_blip, create_thumbnail,
|
| 1475 |
max_len, target_fps, max_res, stitch,
|
| 1476 |
grayscale, convert_from_color, blur
|
| 1477 |
],
|
| 1478 |
+
outputs=[depth_out, rgbd_out, status_display, thumbnail_preview]
|
| 1479 |
)
|
| 1480 |
|
| 1481 |
gr.Markdown("""
|
|
|
|
| 1483 |
- **Upload formats**: MP4, AVI, MOV, etc.
|
| 1484 |
- **BLIP naming**: Automatically generates descriptive filenames
|
| 1485 |
- **RGBD output**: Side-by-side comparison of original and depth
|
| 1486 |
+
- **Thumbnail Preview**: Shows final RGB→Depth gradient after processing
|
| 1487 |
+
- **Embedded Thumbnails**: Videos will show previews in Windows Explorer
|
| 1488 |
- **Audio Generation**: Use MMaudio Large V2 (44kHz) for high-quality audio synthesis
|
| 1489 |
- **Audio Prompts**: Be descriptive (e.g. "gentle ocean waves with seagulls", "epic orchestral music")
|
| 1490 |
- **Iterative Audio**: Generate multiple times with different prompts to perfect the audio
|