Spaces:
Running on Zero
Running on Zero
prismaudio-project committed on
Commit ·
10a671c
1
Parent(s): 537397a
fix
Browse files
app.py
CHANGED
|
@@ -392,21 +392,21 @@ def generate_audio_core(video_file, caption):
|
|
| 392 |
return log_step(f"❌ Video conversion failed:\n{err}"), None
|
| 393 |
else:
|
| 394 |
shutil.copy(video_file, mp4_path)
|
| 395 |
-
log_step(f"
|
| 396 |
|
| 397 |
# ---- Step 2: Validate duration ----
|
| 398 |
step_start = time.time()
|
| 399 |
status = log_step("📹 Step 2: Checking video duration...")
|
| 400 |
|
| 401 |
duration = get_video_duration(mp4_path)
|
| 402 |
-
log_step(f"
|
| 403 |
|
| 404 |
# ---- Step 3: Extract video frames ----
|
| 405 |
step_start = time.time()
|
| 406 |
status = log_step("🎞️ Step 3: Extracting video frames...")
|
| 407 |
|
| 408 |
clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
|
| 409 |
-
log_step(f"
|
| 410 |
|
| 411 |
# ---- Step 4: Extract model features ----
|
| 412 |
step_start = time.time()
|
|
@@ -420,14 +420,14 @@ def generate_audio_core(video_file, caption):
|
|
| 420 |
status = log_step("📦 Step 5: Building inference batch...")
|
| 421 |
|
| 422 |
audio_latent, meta = build_meta(info, duration, caption)
|
| 423 |
-
log_step(f"
|
| 424 |
|
| 425 |
# ---- Step 6: Diffusion sampling ----
|
| 426 |
step_start = time.time()
|
| 427 |
status = log_step("🎵 Step 6: Running diffusion sampling...")
|
| 428 |
|
| 429 |
generated_audio = run_diffusion(audio_latent, meta, duration)
|
| 430 |
-
log_step(f"
|
| 431 |
|
| 432 |
# ---- Step 7: Save generated audio (temp) ----
|
| 433 |
step_start = time.time()
|
|
@@ -439,7 +439,7 @@ def generate_audio_core(video_file, caption):
|
|
| 439 |
generated_audio[0], # (1, T)
|
| 440 |
SAMPLE_RATE,
|
| 441 |
)
|
| 442 |
-
log_step(f"
|
| 443 |
|
| 444 |
# ---- Step 8: Mux audio into original video ----
|
| 445 |
step_start = time.time()
|
|
@@ -450,7 +450,7 @@ def generate_audio_core(video_file, caption):
|
|
| 450 |
if not ok:
|
| 451 |
return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
|
| 452 |
|
| 453 |
-
log_step(f"
|
| 454 |
|
| 455 |
total_cost = time.time() - total_start_time
|
| 456 |
log_step(f"✅ Done! Audio and video merged successfully. ⏱️ Total cost: {total_cost:.2f}s")
|
|
|
|
| 392 |
return log_step(f"❌ Video conversion failed:\n{err}"), None
|
| 393 |
else:
|
| 394 |
shutil.copy(video_file, mp4_path)
|
| 395 |
+
log_step(f"⏱️ Step 1 cost: {time.time() - step_start:.2f}s")
|
| 396 |
|
| 397 |
# ---- Step 2: Validate duration ----
|
| 398 |
step_start = time.time()
|
| 399 |
status = log_step("📹 Step 2: Checking video duration...")
|
| 400 |
|
| 401 |
duration = get_video_duration(mp4_path)
|
| 402 |
+
log_step(f"⏱️ Step 2 cost: {time.time() - step_start:.2f}s")
|
| 403 |
|
| 404 |
# ---- Step 3: Extract video frames ----
|
| 405 |
step_start = time.time()
|
| 406 |
status = log_step("🎞️ Step 3: Extracting video frames...")
|
| 407 |
|
| 408 |
clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
|
| 409 |
+
log_step(f"⏱️ Step 3 cost: {time.time() - step_start:.2f}s")
|
| 410 |
|
| 411 |
# ---- Step 4: Extract model features ----
|
| 412 |
step_start = time.time()
|
|
|
|
| 420 |
status = log_step("📦 Step 5: Building inference batch...")
|
| 421 |
|
| 422 |
audio_latent, meta = build_meta(info, duration, caption)
|
| 423 |
+
log_step(f"⏱️ Step 5 cost: {time.time() - step_start:.2f}s")
|
| 424 |
|
| 425 |
# ---- Step 6: Diffusion sampling ----
|
| 426 |
step_start = time.time()
|
| 427 |
status = log_step("🎵 Step 6: Running diffusion sampling...")
|
| 428 |
|
| 429 |
generated_audio = run_diffusion(audio_latent, meta, duration)
|
| 430 |
+
log_step(f"⏱️ Step 6 cost: {time.time() - step_start:.2f}s")
|
| 431 |
|
| 432 |
# ---- Step 7: Save generated audio (temp) ----
|
| 433 |
step_start = time.time()
|
|
|
|
| 439 |
generated_audio[0], # (1, T)
|
| 440 |
SAMPLE_RATE,
|
| 441 |
)
|
| 442 |
+
log_step(f"⏱️ Step 7 cost: {time.time() - step_start:.2f}s")
|
| 443 |
|
| 444 |
# ---- Step 8: Mux audio into original video ----
|
| 445 |
step_start = time.time()
|
|
|
|
| 450 |
if not ok:
|
| 451 |
return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
|
| 452 |
|
| 453 |
+
log_step(f"⏱️ Step 8 cost: {time.time() - step_start:.2f}s")
|
| 454 |
|
| 455 |
total_cost = time.time() - total_start_time
|
| 456 |
log_step(f"✅ Done! Audio and video merged successfully. ⏱️ Total cost: {total_cost:.2f}s")
|