Krokodilpirat committed (verified)
Commit d6483ee · 1 Parent(s): c245745

Update app.py

Files changed (1): app.py (+3 −377)
app.py CHANGED
@@ -13,8 +13,6 @@ from video_depth_anything.video_depth import VideoDepthAnything
 from utils.dc_utils import read_video_frames, save_video
 from transformers import BlipProcessor, BlipForConditionalGeneration
 from PIL import Image
-import tempfile
-import shutil
 
 # --- Environment setup ---
 os.environ["HF_HOME"] = "/tmp/huggingface"
@@ -47,55 +45,6 @@ print("Loading BLIP model...")
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
 
-# --- Load MMaudio model ---
-print("Loading MMaudio model...")
-mmaudio_model = None
-
-try:
-    # Check if mmaudio folder exists (local installation)
-    if os.path.exists('./mmaudio'):
-        print("✅ Found local mmaudio folder")
-
-        # List contents to debug structure
-        import os
-        mmaudio_contents = os.listdir('./mmaudio')
-        print(f"DEBUG: mmaudio folder contents: {mmaudio_contents}")
-
-        # Add mmaudio to Python path
-        if './mmaudio' not in sys.path:
-            sys.path.insert(0, './mmaudio')
-            print("✅ Added mmaudio to Python path")
-
-        # Try different import patterns
-        try:
-            from eval_utils import generate, load_mmaudio_model
-            print("✅ MMaudio eval_utils imported successfully")
-
-            # Load model
-            device = 'cuda' if torch.cuda.is_available() else 'cpu'
-            model_name = "large_44k_v2"
-
-            mmaudio_model = load_mmaudio_model(
-                model_name=model_name,
-                device=device
-            )
-
-            print(f"✅ MMaudio {model_name} loaded on {device}")
-
-        except Exception as load_error:
-            print(f"❌ MMaudio model loading failed: {load_error}")
-            import traceback
-            traceback.print_exc()
-            mmaudio_model = None
-
-    else:
-        print("⚠️ mmaudio folder not found")
-        mmaudio_model = None
-
-except Exception as e:
-    print(f"❌ MMaudio setup failed: {e}")
-    mmaudio_model = None
-
 def get_first_frame_for_blip(video_path, target_size=480):
     """Efficient: loads only the first frame for BLIP (not all frames!)"""
     try:
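Note on the loader deleted above: it re-imported `os` inside the `try` block and wrapped everything in two blanket `except Exception` handlers. For comparison only, a minimal sketch of the same local-checkout probe (hypothetical, not part of this commit):

import importlib.util
import os
import sys

# Hedged sketch: same intent as the removed block - prefer a local
# ./mmaudio checkout - without the mid-function "import os" or the
# nested blanket exception handlers. Names mirror the deleted code.
if os.path.isdir("./mmaudio") and "./mmaudio" not in sys.path:
    sys.path.insert(0, "./mmaudio")

mmaudio_model = None
if importlib.util.find_spec("mmaudio") is None:
    print("⚠️ mmaudio not importable; audio features stay disabled")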
@@ -176,175 +125,7 @@ def generate_blip_name(frame: np.ndarray) -> str:
         print(f"BLIP error: {e}")
         return "video"
 
-# --- 🎵 MMaudio Functions ---
-
-def get_video_duration(video_path):
-    """Get video duration in seconds using OpenCV"""
-    try:
-        cap = cv2.VideoCapture(video_path)
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
-        cap.release()
-
-        if fps > 0:
-            duration = frame_count / fps
-            return duration
-        else:
-            return 8.0  # Default duration
-    except Exception as e:
-        print(f"Duration detection failed: {e}")
-        return 8.0
-
-def generate_audio_for_video(video_path, audio_prompt, negative_prompt="", audio_seed=-1, num_inference_steps=25, guidance_scale=4.5):
-    """Generate audio for video using MMaudio (official HF Space style)"""
-    try:
-        if mmaudio_model is None:
-            raise RuntimeError("MMaudio model not loaded")
-
-        # Import generation utilities (like official space)
-        from mmaudio.eval_utils import generate
-
-        # Get video duration (MMaudio works best with 8s, but can handle longer)
-        duration = get_video_duration(video_path)
-        print(f"DEBUG: Generating audio for {duration}s duration (matching input video)")
-
-        # Prepare inputs like official HF Space
-        text_input = audio_prompt.strip() if audio_prompt.strip() else None
-        negative_text = negative_prompt.strip() if negative_prompt.strip() else None
-
-        # Handle seed (like official space)
-        if audio_seed == -1:
-            import random
-            audio_seed = random.randint(0, 2**32 - 1)
-
-        # Set seeds for reproducibility
-        torch.manual_seed(audio_seed)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed(audio_seed)
-
-        print(f"DEBUG: MMaudio generating with:")
-        print(f" Prompt: '{text_input}'")
-        print(f" Negative: '{negative_text}'")
-        print(f" Seed: {audio_seed}")
-        print(f" Steps: {num_inference_steps}")
-        print(f" Guidance: {guidance_scale}")
-        print(f" Duration: {duration}s")
-
-        # Generate audio using official MMaudio generate function
-        with torch.no_grad():
-            # This follows the official HF Space pattern
-            result = generate(
-                model=mmaudio_model,
-                video_path=video_path,
-                text=text_input,
-                negative_text=negative_text,
-                duration=duration,
-                guidance_scale=guidance_scale,
-                num_inference_steps=num_inference_steps,
-                seed=audio_seed
-            )
-
-        # Save generated audio to temporary file
-        temp_audio_path = tempfile.mktemp(suffix=".wav")
-
-        # Extract audio from result (format depends on MMaudio output)
-        if isinstance(result, dict) and 'audio' in result:
-            audio_data = result['audio']
-        else:
-            audio_data = result
-
-        # Convert audio to numpy and save
-        if isinstance(audio_data, torch.Tensor):
-            audio_np = audio_data.cpu().numpy()
-            # Normalize audio
-            if audio_np.max() > 0:
-                audio_np = audio_np / np.max(np.abs(audio_np))
-
-        import scipy.io.wavfile
-        sample_rate = 44100  # 44.1kHz
-
-        # Handle different audio shapes
-        if len(audio_np.shape) == 2:
-            # Stereo - take first channel or mix down
-            if audio_np.shape[0] == 2:  # (2, samples)
-                audio_np = audio_np[0]  # Take first channel
-            elif audio_np.shape[1] == 2:  # (samples, 2)
-                audio_np = np.mean(audio_np, axis=1)  # Mix to mono
-
-        # Ensure audio is 1D
-        audio_np = audio_np.flatten()
-
-        # Convert to int16 for WAV
-        audio_int16 = (audio_np * 32767).astype(np.int16)
-        scipy.io.wavfile.write(temp_audio_path, sample_rate, audio_int16)
-
-        print(f"DEBUG: Audio generated and saved to: {temp_audio_path}")
-        return temp_audio_path
-
-    except Exception as e:
-        print(f"❌ Audio generation failed: {e}")
-        import traceback
-        traceback.print_exc()
-        return None
-
-def mix_audio_with_video(video_path, audio_path, volume=0.5, replace_audio=False):
-    """Mix generated audio with video using FFmpeg"""
-    try:
-        # Create output path
-        base_name = os.path.splitext(os.path.basename(video_path))[0]
-        output_path = f"{base_name}_with_audio.mp4"
-
-        if replace_audio:
-            # Replace original audio completely
-            cmd = [
-                "ffmpeg", "-y",
-                "-i", video_path,
-                "-i", audio_path,
-                "-c:v", "copy",
-                "-c:a", "aac",
-                "-map", "0:v:0",
-                "-map", "1:a:0",
-                "-shortest",
-                output_path
-            ]
-        else:
-            # Mix with original audio
-            cmd = [
-                "ffmpeg", "-y",
-                "-i", video_path,
-                "-i", audio_path,
-                "-filter_complex", f"[0:a][1:a]amix=inputs=2:duration=shortest:weights=1 {volume}[a]",
-                "-map", "0:v:0",
-                "-map", "[a]",
-                "-c:v", "copy",
-                "-c:a", "aac",
-                "-shortest",
-                output_path
-            ]
-
-        print(f"DEBUG: Mixing audio with video: {' '.join(cmd)}")
-        result = subprocess.run(cmd, capture_output=True, text=True)
-
-        if result.returncode == 0:
-            print(f"✅ Audio mixed successfully: {output_path}")
-            return output_path
-        else:
-            print(f"❌ FFmpeg mixing failed: {result.stderr}")
-            return None
-
-    except Exception as e:
-        print(f"❌ Audio mixing failed: {e}")
-        return None
-
-def cleanup_temp_files(*file_paths):
-    """Clean up temporary files"""
-    for file_path in file_paths:
-        if file_path and os.path.exists(file_path):
-            try:
-                os.remove(file_path)
-                print(f"DEBUG: Cleaned up: {file_path}")
-            except Exception as e:
-                print(f"DEBUG: Cleanup failed for {file_path}: {e}")
+# --- 🎨 Thumbnail Generation Functions ---
 
 def create_overlay_thumbnail(rgb_frame, depth_frame):
     """
@@ -819,79 +600,10 @@ def download_generic_video(url):
 # --- Global variables for toggling ---
 current_video_file = None
 current_video_url = None
-current_original_video = None  # Store original video before audio processing
 blip_generated_name = ""
 original_filename = ""
 
-# --- Audio Generation Handler ---
-def on_generate_audio(upload_video, video_url, audio_prompt, negative_prompt, audio_volume, replace_audio, audio_seed, num_inference_steps, guidance_scale):
-    """Handle audio generation for input video with full parameter control"""
-    global current_video_file, current_video_url, current_original_video
-
-    try:
-        # Determine input video
-        input_video = upload_video or video_url
-        if not input_video:
-            return None, "❌ No video provided for audio generation"
-
-        if not audio_prompt.strip():
-            return None, "❌ Please provide an audio prompt"
-
-        if mmaudio_model is None:
-            return None, "❌ MMaudio model not available"
-
-        # Store original video if not already stored
-        if current_original_video is None:
-            current_original_video = input_video
-
-        # Use original video for audio generation (not previous audio version)
-        print(f"DEBUG: Generating audio for: {current_original_video}")
-
-        # Generate audio with all parameters
-        audio_path = generate_audio_for_video(
-            current_original_video,
-            audio_prompt,
-            negative_prompt,
-            audio_seed,
-            num_inference_steps,
-            guidance_scale
-        )
-        if not audio_path:
-            return None, "❌ Audio generation failed"
-
-        # Mix audio with video
-        video_with_audio = mix_audio_with_video(current_original_video, audio_path, audio_volume, replace_audio)
-        if not video_with_audio:
-            cleanup_temp_files(audio_path)
-            return None, "❌ Audio mixing failed"
-
-        # Update current video to the new version with audio
-        if upload_video:
-            current_video_file = video_with_audio
-        else:
-            current_video_url = video_with_audio
-
-        # Cleanup temporary audio file
-        cleanup_temp_files(audio_path)
-
-        # Build success message with parameters
-        success_msg = f"✅ Audio generated successfully!"
-        if replace_audio:
-            success_msg += " (Original audio replaced)"
-        else:
-            success_msg += f" (Mixed at {audio_volume*100:.0f}% volume)"
-
-        success_msg += f"<br>🎛️ Steps: {num_inference_steps}, Guidance: {guidance_scale}"
-        if audio_seed != -1:
-            success_msg += f", Seed: {audio_seed}"
-
-        print(f"DEBUG: Audio generation completed: {video_with_audio}")
-        return video_with_audio, success_msg
-
-    except Exception as e:
-        error_msg = f"❌ Audio generation error: {str(e)}"
-        print(error_msg)
-        return None, error_msg
+# --- MAIN INFERENCE FUNCTION - NO FALLBACK THUMBNAIL ---
 def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, create_thumbnail, *args):
     """Process video to generate depth maps and RGBD output - NO FALLBACK THUMBNAIL"""
    try:
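The handler removed above passed `audio_volume` into `mix_audio_with_video`, whose amix filter string (`weights=1 {volume}`) embeds an unquoted space that ffmpeg's filtergraph parser can misread. A sketch of an equivalent mix that sidesteps the quoting issue by scaling the generated track with a `volume` filter before mixing (file names are hypothetical):

import subprocess

volume = 0.5  # gain for the generated track; the removed slider's default
cmd = [
    "ffmpeg", "-y",
    "-i", "input.mp4",       # hypothetical paths, for illustration only
    "-i", "generated.wav",
    # Scale input 1, then mix; avoids a space-containing amix weights value.
    "-filter_complex",
    f"[1:a]volume={volume}[g];[0:a][g]amix=inputs=2:duration=shortest[a]",
    "-map", "0:v:0", "-map", "[a]",
    "-c:v", "copy", "-c:a", "aac",
    "output.mp4",
]
subprocess.run(cmd, check=True)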
@@ -1305,85 +1017,9 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
                 width=180,
                 interactive=False,
                 show_label=True,
-                scale=1,
-                format="jpeg"  # Force JPEG for downloads
+                scale=1
             )
 
-            # MMaudio Integration
-            with gr.Accordion("🎵 Audio Generation (MMaudio)", open=False):
-                with gr.Row():
-                    enable_mmaudio = gr.Checkbox(
-                        label="Generate Audio",
-                        value=False,
-                        info="Generate audio track from video content using MMaudio Large V2",
-                        scale=1
-                    )
-                    audio_prompt = gr.Textbox(
-                        label="Audio Prompt",
-                        placeholder="Describe the desired audio (e.g. 'ocean waves', 'forest sounds', 'city traffic', 'epic cinematic music')",
-                        scale=4,
-                        lines=2
-                    )
-
-                with gr.Row():
-                    negative_prompt = gr.Textbox(
-                        label="Negative Prompt",
-                        placeholder="What to avoid in audio (e.g. 'music', 'voices', 'loud noises')",
-                        scale=3,
-                        lines=1
-                    )
-                    audio_volume = gr.Slider(
-                        label="Audio Volume",
-                        minimum=0.0,
-                        maximum=1.0,
-                        value=0.5,
-                        step=0.1,
-                        info="Mix volume with original audio",
-                        scale=2
-                    )
-                    replace_audio = gr.Checkbox(
-                        label="Replace Original Audio",
-                        value=False,
-                        info="Replace instead of mixing",
-                        scale=1
-                    )
-
-                with gr.Row():
-                    audio_seed = gr.Number(
-                        label="Seed (-1: random)",
-                        value=-1,
-                        precision=0,
-                        info="Seed for reproducible audio generation",
-                        scale=1
-                    )
-                    num_inference_steps = gr.Slider(
-                        label="Num Steps",
-                        minimum=10,
-                        maximum=50,
-                        value=25,
-                        step=1,
-                        info="More steps = better quality, slower generation",
-                        scale=2
-                    )
-                    guidance_scale = gr.Slider(
-                        label="Guidance Strength",
-                        minimum=1.0,
-                        maximum=10.0,
-                        value=4.5,
-                        step=0.5,
-                        info="How closely to follow the prompt",
-                        scale=2
-                    )
-                    generate_audio_btn = gr.Button(
-                        "🎵 Generate Audio",
-                        variant="secondary",
-                        size="sm",
-                        scale=1
-                    )
-
-                audio_duration_info = gr.HTML("ℹ️ Audio duration will automatically match input video length")
-                audio_status = gr.HTML("")
-
     # Event handlers for input changes
     video_url.change(
         fn=on_video_url_change,
@@ -1406,13 +1042,6 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
         outputs=[filename, status_display]
     )
 
-    # Audio generation event
-    generate_audio_btn.click(
-        fn=on_generate_audio,
-        inputs=[upload_video, video_url, audio_prompt, negative_prompt, audio_volume, replace_audio, audio_seed, num_inference_steps, guidance_scale],
-        outputs=[upload_video, audio_status]
-    )
-
     with gr.Accordion("⚙️ Advanced Settings", open=False):
         with gr.Row():
             max_len = gr.Slider(
@@ -1485,9 +1114,6 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
     - **RGBD output**: Side-by-side comparison of original and depth
     - **Thumbnail Preview**: Shows final RGB→Depth gradient after processing
    - **Embedded Thumbnails**: Videos will show previews in Windows Explorer
-    - **Audio Generation**: Use MMaudio Large V2 (44kHz) for high-quality audio synthesis
-    - **Audio Prompts**: Be descriptive (e.g. "gentle ocean waves with seagulls", "epic orchestral music")
-    - **Iterative Audio**: Generate multiple times with different prompts to perfect the audio
     - **Processing time**: Depends on video length and resolution
     - **Filename**: Set your preferred name before clicking Generate!
     """)
 