prismaudio-project committed on
Commit
10a671c
·
1 Parent(s): 537397a
Files changed (1) hide show
  1. app.py +7 -7
app.py CHANGED
@@ -392,21 +392,21 @@ def generate_audio_core(video_file, caption):
392
  return log_step(f"❌ Video conversion failed:\n{err}"), None
393
  else:
394
  shutil.copy(video_file, mp4_path)
395
- log_step(f" Video ready. ⏱️ Step 1 cost: {time.time() - step_start:.2f}s")
396
 
397
  # ---- Step 2: Validate duration ----
398
  step_start = time.time()
399
  status = log_step("📹 Step 2: Checking video duration...")
400
 
401
  duration = get_video_duration(mp4_path)
402
- log_step(f" Duration: {duration:.2f}s ⏱️ Step 2 cost: {time.time() - step_start:.2f}s")
403
 
404
  # ---- Step 3: Extract video frames ----
405
  step_start = time.time()
406
  status = log_step("🎞️ Step 3: Extracting video frames...")
407
 
408
  clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
409
- log_step(f" Frames extracted. ⏱️ Step 3 cost: {time.time() - step_start:.2f}s")
410
 
411
  # ---- Step 4: Extract model features ----
412
  step_start = time.time()
@@ -420,14 +420,14 @@ def generate_audio_core(video_file, caption):
420
  status = log_step("📦 Step 5: Building inference batch...")
421
 
422
  audio_latent, meta = build_meta(info, duration, caption)
423
- log_step(f" audio_latent : {tuple(audio_latent.shape)} ⏱️ Step 5 cost: {time.time() - step_start:.2f}s")
424
 
425
  # ---- Step 6: Diffusion sampling ----
426
  step_start = time.time()
427
  status = log_step("🎵 Step 6: Running diffusion sampling...")
428
 
429
  generated_audio = run_diffusion(audio_latent, meta, duration)
430
- log_step(f" Diffusion sampling done. ⏱️ Step 6 cost: {time.time() - step_start:.2f}s")
431
 
432
  # ---- Step 7: Save generated audio (temp) ----
433
  step_start = time.time()
@@ -439,7 +439,7 @@ def generate_audio_core(video_file, caption):
439
  generated_audio[0], # (1, T)
440
  SAMPLE_RATE,
441
  )
442
- log_step(f" Audio saved: {audio_path} ⏱️ Step 7 cost: {time.time() - step_start:.2f}s")
443
 
444
  # ---- Step 8: Mux audio into original video ----
445
  step_start = time.time()
@@ -450,7 +450,7 @@ def generate_audio_core(video_file, caption):
450
  if not ok:
451
  return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
452
 
453
- log_step(f" Audio and video merged. ⏱️ Step 8 cost: {time.time() - step_start:.2f}s")
454
 
455
  total_cost = time.time() - total_start_time
456
  log_step(f"✅ Done! Audio and video merged successfully. ⏱️ Total cost: {total_cost:.2f}s")
 
392
  return log_step(f"❌ Video conversion failed:\n{err}"), None
393
  else:
394
  shutil.copy(video_file, mp4_path)
395
+ log_step(f"⏱️ Step 1 cost: {time.time() - step_start:.2f}s")
396
 
397
  # ---- Step 2: Validate duration ----
398
  step_start = time.time()
399
  status = log_step("📹 Step 2: Checking video duration...")
400
 
401
  duration = get_video_duration(mp4_path)
402
+ log_step(f"⏱️ Step 2 cost: {time.time() - step_start:.2f}s")
403
 
404
  # ---- Step 3: Extract video frames ----
405
  step_start = time.time()
406
  status = log_step("🎞️ Step 3: Extracting video frames...")
407
 
408
  clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
409
+ log_step(f"⏱️ Step 3 cost: {time.time() - step_start:.2f}s")
410
 
411
  # ---- Step 4: Extract model features ----
412
  step_start = time.time()
 
420
  status = log_step("📦 Step 5: Building inference batch...")
421
 
422
  audio_latent, meta = build_meta(info, duration, caption)
423
+ log_step(f"⏱️ Step 5 cost: {time.time() - step_start:.2f}s")
424
 
425
  # ---- Step 6: Diffusion sampling ----
426
  step_start = time.time()
427
  status = log_step("🎵 Step 6: Running diffusion sampling...")
428
 
429
  generated_audio = run_diffusion(audio_latent, meta, duration)
430
+ log_step(f"⏱️ Step 6 cost: {time.time() - step_start:.2f}s")
431
 
432
  # ---- Step 7: Save generated audio (temp) ----
433
  step_start = time.time()
 
439
  generated_audio[0], # (1, T)
440
  SAMPLE_RATE,
441
  )
442
+ log_step(f"⏱️ Step 7 cost: {time.time() - step_start:.2f}s")
443
 
444
  # ---- Step 8: Mux audio into original video ----
445
  step_start = time.time()
 
450
  if not ok:
451
  return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
452
 
453
+ log_step(f"⏱️ Step 8 cost: {time.time() - step_start:.2f}s")
454
 
455
  total_cost = time.time() - total_start_time
456
  log_step(f"✅ Done! Audio and video merged successfully. ⏱️ Total cost: {total_cost:.2f}s")