prismaudio-project committed on
Commit
c367d8b
·
1 Parent(s): 6cba118
Files changed (1) hide show
  1. app.py +10 -23
app.py CHANGED
@@ -352,8 +352,7 @@ def run_diffusion(audio_latent: torch.Tensor, meta: dict, duration: float) -> to
352
 
353
  @spaces.GPU
354
  def generate_audio_core(video_file, caption):
355
- if _MODELS["diffusion"] is None:
356
- load_all_models()
357
 
358
  start_time =time.time()
359
 
@@ -383,8 +382,10 @@ def generate_audio_core(video_file, caption):
383
  work_dir = tempfile.mkdtemp(dir=os.environ["GRADIO_TEMP_DIR"], prefix="PrismAudio_")
384
 
385
  try:
 
 
386
  # ---- Step 1: Convert / copy to mp4 ----
387
- #status = log_step("πŸ“Ή Step 1: Preparing video...")
388
 
389
  src_ext = os.path.splitext(video_file)[1].lower()
390
  mp4_path = os.path.join(work_dir, "input.mp4")
@@ -399,21 +400,21 @@ def generate_audio_core(video_file, caption):
399
  log_step(" Video ready.")
400
 
401
  # ---- Step 2: Validate duration ----
402
- #status = log_step("πŸ“Ή Step 2: Checking video duration...")
403
 
404
 
405
  duration = get_video_duration(mp4_path)
406
  log_step(f" Duration: {duration:.2f}s")
407
 
408
  # ---- Step 3: Extract video frames ----
409
- #status = log_step("🎞️ Step 3: Extracting video frames (clip & sync)...")
410
 
411
  clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
412
  log_step(f" clip_chunk : {tuple(clip_chunk.shape)}")
413
  log_step(f" sync_chunk : {tuple(sync_chunk.shape)}")
414
 
415
  # ---- Step 4: Extract model features ----
416
- #status = log_step("🧠 Step 4: Extracting text / video / sync features...")
417
  #yield status, None
418
 
419
  info = extract_features(clip_chunk, sync_chunk, caption)
@@ -431,14 +432,14 @@ def generate_audio_core(video_file, caption):
431
  log_step(f" audio_latent : {tuple(audio_latent.shape)}")
432
 
433
  # ---- Step 6: Diffusion sampling ----
434
- #status = log_step("🎡 Step 6: Running diffusion sampling...")
435
  #yield status, None
436
 
437
  generated_audio = run_diffusion(audio_latent, meta, duration)
438
  log_step(f" Generated audio shape : {tuple(generated_audio.shape)}")
439
 
440
  # ---- Step 7: Save generated audio (temp) ----
441
- #status = log_step("πŸ’Ύ Step 7: Saving generated audio...")
442
  #yield status, None
443
 
444
  audio_path = os.path.join(work_dir, "generated_audio.wav")
@@ -450,7 +451,7 @@ def generate_audio_core(video_file, caption):
450
  log_step(f" Audio saved: {audio_path}")
451
 
452
  # ---- Step 8: Mux audio into original video ----
453
- #status = log_step("🎬 Step 8: Merging audio into video...")
454
  #yield status, None
455
 
456
  combined_path = os.path.join(work_dir, "output_with_audio.mp4")
@@ -561,20 +562,6 @@ def build_ui() -> gr.Blocks:
561
  2. Enter a text prompt describing the desired audio content.
562
  3. Click **🚀 Generate Audio** and watch the log on the right for progress.
563
  4. The output video (original visuals + generated audio) appears below when done.
564
-
565
- **Notes**
566
- - All models are pre-loaded at startup — no warm-up delay on the first request.
567
- - Everything stays in memory; only the final wav and merged mp4 are written to disk.
568
- - A CUDA GPU is strongly recommended; CPU inference will be very slow.
569
- - Queue depth is limited to 3 concurrent requests to avoid OOM.
570
-
571
- **Current model paths**
572
- ```
573
- MODEL_CONFIG_PATH = {MODEL_CONFIG_PATH}
574
- CKPT_PATH = {CKPT_PATH}
575
- VAE_CKPT_PATH = {VAE_CKPT_PATH}
576
- SYNCHFORMER_CKPT_PATH = {SYNCHFORMER_CKPT_PATH}
577
- ```
578
  """)
579
 
580
  # ======================================================
 
352
 
353
  @spaces.GPU
354
  def generate_audio_core(video_file, caption):
355
+
 
356
 
357
  start_time =time.time()
358
 
 
382
  work_dir = tempfile.mkdtemp(dir=os.environ["GRADIO_TEMP_DIR"], prefix="PrismAudio_")
383
 
384
  try:
385
+ if _MODELS["diffusion"] is None:
386
+ load_all_models()
387
  # ---- Step 1: Convert / copy to mp4 ----
388
+ status = log_step("📹 Step 1: Preparing video...")
389
 
390
  src_ext = os.path.splitext(video_file)[1].lower()
391
  mp4_path = os.path.join(work_dir, "input.mp4")
 
400
  log_step(" Video ready.")
401
 
402
  # ---- Step 2: Validate duration ----
403
+ status = log_step("📹 Step 2: Checking video duration...")
404
 
405
 
406
  duration = get_video_duration(mp4_path)
407
  log_step(f" Duration: {duration:.2f}s")
408
 
409
  # ---- Step 3: Extract video frames ----
410
+ status = log_step("🎞️ Step 3: Extracting video frames (clip & sync)...")
411
 
412
  clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
413
  log_step(f" clip_chunk : {tuple(clip_chunk.shape)}")
414
  log_step(f" sync_chunk : {tuple(sync_chunk.shape)}")
415
 
416
  # ---- Step 4: Extract model features ----
417
+ status = log_step("🧠 Step 4: Extracting text / video / sync features...")
418
  #yield status, None
419
 
420
  info = extract_features(clip_chunk, sync_chunk, caption)
 
432
  log_step(f" audio_latent : {tuple(audio_latent.shape)}")
433
 
434
  # ---- Step 6: Diffusion sampling ----
435
+ status = log_step("🎵 Step 6: Running diffusion sampling...")
436
  #yield status, None
437
 
438
  generated_audio = run_diffusion(audio_latent, meta, duration)
439
  log_step(f" Generated audio shape : {tuple(generated_audio.shape)}")
440
 
441
  # ---- Step 7: Save generated audio (temp) ----
442
+ status = log_step("💾 Step 7: Saving generated audio...")
443
  #yield status, None
444
 
445
  audio_path = os.path.join(work_dir, "generated_audio.wav")
 
451
  log_step(f" Audio saved: {audio_path}")
452
 
453
  # ---- Step 8: Mux audio into original video ----
454
+ status = log_step("🎬 Step 8: Merging audio into video...")
455
  #yield status, None
456
 
457
  combined_path = os.path.join(work_dir, "output_with_audio.mp4")
 
562
  2. Enter a text prompt describing the desired audio content.
563
  3. Click **🚀 Generate Audio** and watch the log on the right for progress.
564
  4. The output video (original visuals + generated audio) appears below when done.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
565
  """)
566
 
567
  # ======================================================