prismaudio-project committed on
Commit
537397a
·
1 Parent(s): bdc92e0
Files changed (1) hide show
  1. app.py +27 -38
app.py CHANGED
@@ -358,22 +358,15 @@ def generate_audio_core(video_file, caption):
358
  DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
359
  _MODELS["feature_extractor"].to(DEVICE)
360
  _MODELS["diffusion"].to(DEVICE)
361
- start_time =time.time()
362
-
363
- """
364
- Gradio generator function (yields status + result progressively).
365
 
366
- Yields:
367
- (status_str, combined_video_path_or_None)
368
- """
369
- # ---- Basic validation ----
370
  if video_file is None:
371
  return "❌ Please upload a video file first.", None
372
 
373
  if not caption or caption.strip() == "":
374
  caption=""
375
 
376
-
377
  caption = caption.strip()
378
  logs = []
379
 
@@ -382,11 +375,11 @@ def generate_audio_core(video_file, caption):
382
  logs.append(msg)
383
  return "\n".join(logs)
384
 
385
- # ---- Working directory (auto-cleaned on exit) ----
386
  work_dir = tempfile.mkdtemp(prefix="PrismAudio_")
387
 
388
  try:
389
  # ---- Step 1: Convert / copy to mp4 ----
 
390
  status = log_step("📹 Step 1: Preparing video...")
391
 
392
  src_ext = os.path.splitext(video_file)[1].lower()
@@ -399,50 +392,46 @@ def generate_audio_core(video_file, caption):
399
  return log_step(f"❌ Video conversion failed:\n{err}"), None
400
  else:
401
  shutil.copy(video_file, mp4_path)
402
- log_step(" Video ready.")
403
 
404
  # ---- Step 2: Validate duration ----
 
405
  status = log_step("📹 Step 2: Checking video duration...")
406
 
407
-
408
  duration = get_video_duration(mp4_path)
409
- log_step(f" Duration: {duration:.2f}s")
410
 
411
  # ---- Step 3: Extract video frames ----
412
- status = log_step("🎞️ Step 3: Extracting video frames (clip & sync)...")
 
413
 
414
  clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
415
- log_step(f" clip_chunk : {tuple(clip_chunk.shape)}")
416
- log_step(f" sync_chunk : {tuple(sync_chunk.shape)}")
417
 
418
  # ---- Step 4: Extract model features ----
419
- status = log_step("🧠 Step 4: Extracting text / video / sync features...")
420
- #yield status, None
421
 
422
  info = extract_features(clip_chunk, sync_chunk, caption)
423
- log_step(f" text_features : {tuple(info['text_features'].shape)}")
424
- log_step(f" global_video_features : {tuple(info['global_video_features'].shape)}")
425
- log_step(f" video_features : {tuple(info['video_features'].shape)}")
426
- log_step(f" global_text_features : {tuple(info['global_text_features'].shape)}")
427
- log_step(f" sync_features : {tuple(info['sync_features'].shape)}")
428
 
429
  # ---- Step 5: Build inference batch ----
430
- #status = log_step("📦 Step 5: Building inference batch...")
431
- #yield status, None
432
 
433
  audio_latent, meta = build_meta(info, duration, caption)
434
- log_step(f" audio_latent : {tuple(audio_latent.shape)}")
435
 
436
  # ---- Step 6: Diffusion sampling ----
 
437
  status = log_step("🎵 Step 6: Running diffusion sampling...")
438
- #yield status, None
439
 
440
  generated_audio = run_diffusion(audio_latent, meta, duration)
441
- log_step(f" Generated audio shape : {tuple(generated_audio.shape)}")
442
 
443
  # ---- Step 7: Save generated audio (temp) ----
 
444
  status = log_step("💾 Step 7: Saving generated audio...")
445
- #yield status, None
446
 
447
  audio_path = os.path.join(work_dir, "generated_audio.wav")
448
  torchaudio.save(
@@ -450,38 +439,37 @@ def generate_audio_core(video_file, caption):
450
  generated_audio[0], # (1, T)
451
  SAMPLE_RATE,
452
  )
453
- log_step(f" Audio saved: {audio_path}")
454
 
455
  # ---- Step 8: Mux audio into original video ----
 
456
  status = log_step("🎬 Step 8: Merging audio into video...")
457
- #yield status, None
458
 
459
  combined_path = os.path.join(work_dir, "output_with_audio.mp4")
460
  ok, err = combine_audio_video(mp4_path, audio_path, combined_path)
461
  if not ok:
462
  return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
463
 
 
464
 
465
- log_step("✅ Done! Audio and video merged successfully.")
 
 
466
  return "\n".join(logs), combined_path
467
 
468
  except Exception as e:
469
  log_step(f"❌ Unexpected error: {str(e)}")
470
  log.exception(e)
471
  return "\n".join(logs), None
472
-
473
- end_time =time.time()
474
- print("cost: ",end_time-start_time)
475
 
476
- # Note: work_dir is NOT deleted here so Gradio can serve the output file.
477
- # Gradio manages its own GRADIO_TEMP_DIR cleanup on restart.
478
 
479
  def generate_audio(video_file, caption):
480
- # 先yield状态
481
  yield "⏳ Waiting for GPU...", None
482
  result_logs, result_video = generate_audio_core(video_file, caption)
483
  yield result_logs, result_video
484
 
 
 
485
  # ==================== Gradio UI ====================
486
 
487
  def build_ui() -> gr.Blocks:
@@ -556,6 +544,7 @@ def build_ui() -> gr.Blocks:
556
  <Spatial> Natural sound distribution across the stereo field, suggesting birds are around the listener. Food interaction sounds can be localized.
557
  """],
558
  ["demos/Rail transport_3_479.mp4", "Generate ambient countryside sounds with a gentle breeze rustling the leaves of a large tree. From the right, introduce a faint rumble of wheels on a track and a steam engine chugging. Allow the sounds to grow louder and pan from right to left as the steam train travels across the landscape. Include the powerful chugging and clattering of carriages in the soundscape, then gradually recede the sounds to the left. Ensure no additional background noise or music is present."],
 
559
  ["demos/Cat_2_438.mp4", "A cat perched in a tree, letting out loud and sweet meows."],
560
  ],
561
  inputs=[video_input, caption_input],
 
358
  DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
359
  _MODELS["feature_extractor"].to(DEVICE)
360
  _MODELS["diffusion"].to(DEVICE)
361
+
362
+ total_start_time = time.time()
 
 
363
 
 
 
 
 
364
  if video_file is None:
365
  return "❌ Please upload a video file first.", None
366
 
367
  if not caption or caption.strip() == "":
368
  caption=""
369
 
 
370
  caption = caption.strip()
371
  logs = []
372
 
 
375
  logs.append(msg)
376
  return "\n".join(logs)
377
 
 
378
  work_dir = tempfile.mkdtemp(prefix="PrismAudio_")
379
 
380
  try:
381
  # ---- Step 1: Convert / copy to mp4 ----
382
+ step_start = time.time()
383
  status = log_step("📹 Step 1: Preparing video...")
384
 
385
  src_ext = os.path.splitext(video_file)[1].lower()
 
392
  return log_step(f"❌ Video conversion failed:\n{err}"), None
393
  else:
394
  shutil.copy(video_file, mp4_path)
395
+ log_step(f" Video ready. ⏱️ Step 1 cost: {time.time() - step_start:.2f}s")
396
 
397
  # ---- Step 2: Validate duration ----
398
+ step_start = time.time()
399
  status = log_step("📹 Step 2: Checking video duration...")
400
 
 
401
  duration = get_video_duration(mp4_path)
402
+ log_step(f" Duration: {duration:.2f}s ⏱️ Step 2 cost: {time.time() - step_start:.2f}s")
403
 
404
  # ---- Step 3: Extract video frames ----
405
+ step_start = time.time()
406
+ status = log_step("🎞️ Step 3: Extracting video frames...")
407
 
408
  clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
409
+ log_step(f" Frames extracted. ⏱️ Step 3 cost: {time.time() - step_start:.2f}s")
 
410
 
411
  # ---- Step 4: Extract model features ----
412
+ step_start = time.time()
413
+ status = log_step("🧠 Step 4: Extracting text / video features...")
414
 
415
  info = extract_features(clip_chunk, sync_chunk, caption)
416
+ log_step(f" Features extracted. ⏱️ Step 4 cost: {time.time() - step_start:.2f}s")
 
 
 
 
417
 
418
  # ---- Step 5: Build inference batch ----
419
+ step_start = time.time()
420
+ status = log_step("📦 Step 5: Building inference batch...")
421
 
422
  audio_latent, meta = build_meta(info, duration, caption)
423
+ log_step(f" audio_latent : {tuple(audio_latent.shape)} ⏱️ Step 5 cost: {time.time() - step_start:.2f}s")
424
 
425
  # ---- Step 6: Diffusion sampling ----
426
+ step_start = time.time()
427
  status = log_step("🎵 Step 6: Running diffusion sampling...")
 
428
 
429
  generated_audio = run_diffusion(audio_latent, meta, duration)
430
+ log_step(f" Diffusion sampling done. ⏱️ Step 6 cost: {time.time() - step_start:.2f}s")
431
 
432
  # ---- Step 7: Save generated audio (temp) ----
433
+ step_start = time.time()
434
  status = log_step("💾 Step 7: Saving generated audio...")
 
435
 
436
  audio_path = os.path.join(work_dir, "generated_audio.wav")
437
  torchaudio.save(
 
439
  generated_audio[0], # (1, T)
440
  SAMPLE_RATE,
441
  )
442
+ log_step(f" Audio saved: {audio_path} ⏱️ Step 7 cost: {time.time() - step_start:.2f}s")
443
 
444
  # ---- Step 8: Mux audio into original video ----
445
+ step_start = time.time()
446
  status = log_step("🎬 Step 8: Merging audio into video...")
 
447
 
448
  combined_path = os.path.join(work_dir, "output_with_audio.mp4")
449
  ok, err = combine_audio_video(mp4_path, audio_path, combined_path)
450
  if not ok:
451
  return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
452
 
453
+ log_step(f" Audio and video merged. ⏱️ Step 8 cost: {time.time() - step_start:.2f}s")
454
 
455
+ total_cost = time.time() - total_start_time
456
+ log_step(f"✅ Done! Audio and video merged successfully. ⏱️ Total cost: {total_cost:.2f}s")
457
+
458
  return "\n".join(logs), combined_path
459
 
460
  except Exception as e:
461
  log_step(f"❌ Unexpected error: {str(e)}")
462
  log.exception(e)
463
  return "\n".join(logs), None
 
 
 
464
 
 
 
465
 
466
  def generate_audio(video_file, caption):
 
467
  yield "⏳ Waiting for GPU...", None
468
  result_logs, result_video = generate_audio_core(video_file, caption)
469
  yield result_logs, result_video
470
 
471
+
472
+
473
  # ==================== Gradio UI ====================
474
 
475
  def build_ui() -> gr.Blocks:
 
544
  <Spatial> Natural sound distribution across the stereo field, suggesting birds are around the listener. Food interaction sounds can be localized.
545
  """],
546
  ["demos/Rail transport_3_479.mp4", "Generate ambient countryside sounds with a gentle breeze rustling the leaves of a large tree. From the right, introduce a faint rumble of wheels on a track and a steam engine chugging. Allow the sounds to grow louder and pan from right to left as the steam train travels across the landscape. Include the powerful chugging and clattering of carriages in the soundscape, then gradually recede the sounds to the left. Ensure no additional background noise or music is present."],
547
+ ["demos/3ClbaJYWVO4_000030.mp4", "Produce delicate and melodious guitar strumming that gracefully flows and dances with the musical rhythm."],
548
  ["demos/Cat_2_438.mp4", "A cat perched in a tree, letting out loud and sweet meows."],
549
  ],
550
  inputs=[video_input, caption_input],