Update app.py
app.py CHANGED
@@ -13,8 +13,6 @@ from video_depth_anything.video_depth import VideoDepthAnything
 from utils.dc_utils import read_video_frames, save_video
 from transformers import BlipProcessor, BlipForConditionalGeneration
 from PIL import Image
-import tempfile
-import shutil
 
 # --- Environment setup ---
 os.environ["HF_HOME"] = "/tmp/huggingface"
@@ -47,55 +45,6 @@ print("Loading BLIP model...")
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
 
-# --- Load MMaudio model ---
-print("Loading MMaudio model...")
-mmaudio_model = None
-
-try:
-    # Check if mmaudio folder exists (local installation)
-    if os.path.exists('./mmaudio'):
-        print("✅ Found local mmaudio folder")
-
-        # List contents to debug structure
-        import os
-        mmaudio_contents = os.listdir('./mmaudio')
-        print(f"DEBUG: mmaudio folder contents: {mmaudio_contents}")
-
-        # Add mmaudio to Python path
-        if './mmaudio' not in sys.path:
-            sys.path.insert(0, './mmaudio')
-            print("✅ Added mmaudio to Python path")
-
-        # Try different import patterns
-        try:
-            from eval_utils import generate, load_mmaudio_model
-            print("✅ MMaudio eval_utils imported successfully")
-
-            # Load model
-            device = 'cuda' if torch.cuda.is_available() else 'cpu'
-            model_name = "large_44k_v2"
-
-            mmaudio_model = load_mmaudio_model(
-                model_name=model_name,
-                device=device
-            )
-
-            print(f"✅ MMaudio {model_name} loaded on {device}")
-
-        except Exception as load_error:
-            print(f"❌ MMaudio model loading failed: {load_error}")
-            import traceback
-            traceback.print_exc()
-            mmaudio_model = None
-
-    else:
-        print("⚠️ mmaudio folder not found")
-        mmaudio_model = None
-
-except Exception as e:
-    print(f"❌ MMaudio setup failed: {e}")
-    mmaudio_model = None
-
 def get_first_frame_for_blip(video_path, target_size=480):
     """Efficient: loads only the first frame for BLIP (not all frames!)"""
     try:
@@ -176,175 +125,7 @@ def generate_blip_name(frame: np.ndarray) -> str:
         print(f"BLIP error: {e}")
         return "video"
 
-# ---
-
-def get_video_duration(video_path):
-    """Get video duration in seconds using OpenCV"""
-    try:
-        cap = cv2.VideoCapture(video_path)
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
-        cap.release()
-
-        if fps > 0:
-            duration = frame_count / fps
-            return duration
-        else:
-            return 8.0 # Default duration
-    except Exception as e:
-        print(f"Duration detection failed: {e}")
-        return 8.0
-
-def generate_audio_for_video(video_path, audio_prompt, negative_prompt="", audio_seed=-1, num_inference_steps=25, guidance_scale=4.5):
-    """Generate audio for video using MMaudio (official HF Space style)"""
-    try:
-        if mmaudio_model is None:
-            raise RuntimeError("MMaudio model not loaded")
-
-        # Import generation utilities (like official space)
-        from mmaudio.eval_utils import generate
-
-        # Get video duration (MMaudio works best with 8s, but can handle longer)
-        duration = get_video_duration(video_path)
-        print(f"DEBUG: Generating audio for {duration}s duration (matching input video)")
-
-        # Prepare inputs like official HF Space
-        text_input = audio_prompt.strip() if audio_prompt.strip() else None
-        negative_text = negative_prompt.strip() if negative_prompt.strip() else None
-
-        # Handle seed (like official space)
-        if audio_seed == -1:
-            import random
-            audio_seed = random.randint(0, 2**32 - 1)
-
-        # Set seeds for reproducibility
-        torch.manual_seed(audio_seed)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed(audio_seed)
-
-        print(f"DEBUG: MMaudio generating with:")
-        print(f" Prompt: '{text_input}'")
-        print(f" Negative: '{negative_text}'")
-        print(f" Seed: {audio_seed}")
-        print(f" Steps: {num_inference_steps}")
-        print(f" Guidance: {guidance_scale}")
-        print(f" Duration: {duration}s")
-
-        # Generate audio using official MMaudio generate function
-        with torch.no_grad():
-            # This follows the official HF Space pattern
-            result = generate(
-                model=mmaudio_model,
-                video_path=video_path,
-                text=text_input,
-                negative_text=negative_text,
-                duration=duration,
-                guidance_scale=guidance_scale,
-                num_inference_steps=num_inference_steps,
-                seed=audio_seed
-            )
-
-        # Save generated audio to temporary file
-        temp_audio_path = tempfile.mktemp(suffix=".wav")
-
-        # Extract audio from result (format depends on MMaudio output)
-        if isinstance(result, dict) and 'audio' in result:
-            audio_data = result['audio']
-        else:
-            audio_data = result
-
-        # Convert audio to numpy and save
-        if isinstance(audio_data, torch.Tensor):
-            audio_np = audio_data.cpu().numpy()
-            # Normalize audio
-            if audio_np.max() > 0:
-                audio_np = audio_np / np.max(np.abs(audio_np))
-
-            import scipy.io.wavfile
-            sample_rate = 44100 # 44.1kHz
-
-            # Handle different audio shapes
-            if len(audio_np.shape) == 2:
-                # Stereo - take first channel or mix down
-                if audio_np.shape[0] == 2: # (2, samples)
-                    audio_np = audio_np[0] # Take first channel
-                elif audio_np.shape[1] == 2: # (samples, 2)
-                    audio_np = np.mean(audio_np, axis=1) # Mix to mono
-
-            # Ensure audio is 1D
-            audio_np = audio_np.flatten()
-
-            # Convert to int16 for WAV
-            audio_int16 = (audio_np * 32767).astype(np.int16)
-            scipy.io.wavfile.write(temp_audio_path, sample_rate, audio_int16)
-
-        print(f"DEBUG: Audio generated and saved to: {temp_audio_path}")
-        return temp_audio_path
-
-    except Exception as e:
-        print(f"❌ Audio generation failed: {e}")
-        import traceback
-        traceback.print_exc()
-        return None
-
-def mix_audio_with_video(video_path, audio_path, volume=0.5, replace_audio=False):
-    """Mix generated audio with video using FFmpeg"""
-    try:
-        # Create output path
-        base_name = os.path.splitext(os.path.basename(video_path))[0]
-        output_path = f"{base_name}_with_audio.mp4"
-
-        if replace_audio:
-            # Replace original audio completely
-            cmd = [
-                "ffmpeg", "-y",
-                "-i", video_path,
-                "-i", audio_path,
-                "-c:v", "copy",
-                "-c:a", "aac",
-                "-map", "0:v:0",
-                "-map", "1:a:0",
-                "-shortest",
-                output_path
-            ]
-        else:
-            # Mix with original audio
-            cmd = [
-                "ffmpeg", "-y",
-                "-i", video_path,
-                "-i", audio_path,
-                "-filter_complex", f"[0:a][1:a]amix=inputs=2:duration=shortest:weights=1 {volume}[a]",
-                "-map", "0:v:0",
-                "-map", "[a]",
-                "-c:v", "copy",
-                "-c:a", "aac",
-                "-shortest",
-                output_path
-            ]
-
-        print(f"DEBUG: Mixing audio with video: {' '.join(cmd)}")
-        result = subprocess.run(cmd, capture_output=True, text=True)
-
-        if result.returncode == 0:
-            print(f"✅ Audio mixed successfully: {output_path}")
-            return output_path
-        else:
-            print(f"❌ FFmpeg mixing failed: {result.stderr}")
-            return None
-
-    except Exception as e:
-        print(f"❌ Audio mixing failed: {e}")
-        return None
-
-def cleanup_temp_files(*file_paths):
-    """Clean up temporary files"""
-    for file_path in file_paths:
-        if file_path and os.path.exists(file_path):
-            try:
-                os.remove(file_path)
-                print(f"DEBUG: Cleaned up: {file_path}")
-            except Exception as e:
-                print(f"DEBUG: Cleanup failed for {file_path}: {e}")
+# --- 🎨 Thumbnail Generation Functions ---
 
 def create_overlay_thumbnail(rgb_frame, depth_frame):
     """
@@ -819,79 +600,10 @@ def download_generic_video(url):
 # --- Global variables for toggling ---
 current_video_file = None
 current_video_url = None
-current_original_video = None # Store original video before audio processing
 blip_generated_name = ""
 original_filename = ""
 
-# ---
-def on_generate_audio(upload_video, video_url, audio_prompt, negative_prompt, audio_volume, replace_audio, audio_seed, num_inference_steps, guidance_scale):
-    """Handle audio generation for input video with full parameter control"""
-    global current_video_file, current_video_url, current_original_video
-
-    try:
-        # Determine input video
-        input_video = upload_video or video_url
-        if not input_video:
-            return None, "❌ No video provided for audio generation"
-
-        if not audio_prompt.strip():
-            return None, "❌ Please provide an audio prompt"
-
-        if mmaudio_model is None:
-            return None, "❌ MMaudio model not available"
-
-        # Store original video if not already stored
-        if current_original_video is None:
-            current_original_video = input_video
-
-        # Use original video for audio generation (not previous audio version)
-        print(f"DEBUG: Generating audio for: {current_original_video}")
-
-        # Generate audio with all parameters
-        audio_path = generate_audio_for_video(
-            current_original_video,
-            audio_prompt,
-            negative_prompt,
-            audio_seed,
-            num_inference_steps,
-            guidance_scale
-        )
-        if not audio_path:
-            return None, "❌ Audio generation failed"
-
-        # Mix audio with video
-        video_with_audio = mix_audio_with_video(current_original_video, audio_path, audio_volume, replace_audio)
-        if not video_with_audio:
-            cleanup_temp_files(audio_path)
-            return None, "❌ Audio mixing failed"
-
-        # Update current video to the new version with audio
-        if upload_video:
-            current_video_file = video_with_audio
-        else:
-            current_video_url = video_with_audio
-
-        # Cleanup temporary audio file
-        cleanup_temp_files(audio_path)
-
-        # Build success message with parameters
-        success_msg = f"✅ Audio generated successfully!"
-        if replace_audio:
-            success_msg += " (Original audio replaced)"
-        else:
-            success_msg += f" (Mixed at {audio_volume*100:.0f}% volume)"
-
-        success_msg += f"<br>🎛️ Steps: {num_inference_steps}, Guidance: {guidance_scale}"
-        if audio_seed != -1:
-            success_msg += f", Seed: {audio_seed}"
-
-        print(f"DEBUG: Audio generation completed: {video_with_audio}")
-        return video_with_audio, success_msg
-
-    except Exception as e:
-        error_msg = f"❌ Audio generation error: {str(e)}"
-        print(error_msg)
-        return None, error_msg
+# --- MAIN INFERENCE FUNCTION - NO FALLBACK THUMBNAIL ---
 def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, create_thumbnail, *args):
     """Process video to generate depth maps and RGBD output - NO FALLBACK THUMBNAIL"""
     try:
@@ -1305,85 +1017,9 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
             width=180,
             interactive=False,
             show_label=True,
-            scale=1
-            format="jpeg" # Force JPEG for downloads
+            scale=1
         )
 
-    # MMaudio Integration
-    with gr.Accordion("🎵 Audio Generation (MMaudio)", open=False):
-        with gr.Row():
-            enable_mmaudio = gr.Checkbox(
-                label="Generate Audio",
-                value=False,
-                info="Generate audio track from video content using MMaudio Large V2",
-                scale=1
-            )
-            audio_prompt = gr.Textbox(
-                label="Audio Prompt",
-                placeholder="Describe the desired audio (e.g. 'ocean waves', 'forest sounds', 'city traffic', 'epic cinematic music')",
-                scale=4,
-                lines=2
-            )
-
-        with gr.Row():
-            negative_prompt = gr.Textbox(
-                label="Negative Prompt",
-                placeholder="What to avoid in audio (e.g. 'music', 'voices', 'loud noises')",
-                scale=3,
-                lines=1
-            )
-            audio_volume = gr.Slider(
-                label="Audio Volume",
-                minimum=0.0,
-                maximum=1.0,
-                value=0.5,
-                step=0.1,
-                info="Mix volume with original audio",
-                scale=2
-            )
-            replace_audio = gr.Checkbox(
-                label="Replace Original Audio",
-                value=False,
-                info="Replace instead of mixing",
-                scale=1
-            )
-
-        with gr.Row():
-            audio_seed = gr.Number(
-                label="Seed (-1: random)",
-                value=-1,
-                precision=0,
-                info="Seed for reproducible audio generation",
-                scale=1
-            )
-            num_inference_steps = gr.Slider(
-                label="Num Steps",
-                minimum=10,
-                maximum=50,
-                value=25,
-                step=1,
-                info="More steps = better quality, slower generation",
-                scale=2
-            )
-            guidance_scale = gr.Slider(
-                label="Guidance Strength",
-                minimum=1.0,
-                maximum=10.0,
-                value=4.5,
-                step=0.5,
-                info="How closely to follow the prompt",
-                scale=2
-            )
-            generate_audio_btn = gr.Button(
-                "🎵 Generate Audio",
-                variant="secondary",
-                size="sm",
-                scale=1
-            )
-
-        audio_duration_info = gr.HTML("ℹ️ Audio duration will automatically match input video length")
-        audio_status = gr.HTML("")
-
     # Event handlers for input changes
     video_url.change(
         fn=on_video_url_change,
@@ -1406,13 +1042,6 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
         outputs=[filename, status_display]
     )
 
-    # Audio generation event
-    generate_audio_btn.click(
-        fn=on_generate_audio,
-        inputs=[upload_video, video_url, audio_prompt, negative_prompt, audio_volume, replace_audio, audio_seed, num_inference_steps, guidance_scale],
-        outputs=[upload_video, audio_status]
-    )
-
     with gr.Accordion("⚙️ Advanced Settings", open=False):
         with gr.Row():
             max_len = gr.Slider(
@@ -1485,9 +1114,6 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
     - **RGBD output**: Side-by-side comparison of original and depth
     - **Thumbnail Preview**: Shows final RGB→Depth gradient after processing
    - **Embedded Thumbnails**: Videos will show previews in Windows Explorer
-    - **Audio Generation**: Use MMaudio Large V2 (44kHz) for high-quality audio synthesis
-    - **Audio Prompts**: Be descriptive (e.g. "gentle ocean waves with seagulls", "epic orchestral music")
-    - **Iterative Audio**: Generate multiple times with different prompts to perfect the audio
     - **Processing time**: Depends on video length and resolution
     - **Filename**: Set your preferred name before clicking Generate!
     """)