Spaces:
Running on Zero
Running on Zero
prismaudio-project committed on
Commit ·
537397a
1
Parent(s): bdc92e0
fix
Browse files
app.py
CHANGED
|
@@ -358,22 +358,15 @@ def generate_audio_core(video_file, caption):
|
|
| 358 |
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 359 |
_MODELS["feature_extractor"].to(DEVICE)
|
| 360 |
_MODELS["diffusion"].to(DEVICE)
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
"""
|
| 364 |
-
Gradio generator function (yields status + result progressively).
|
| 365 |
|
| 366 |
-
Yields:
|
| 367 |
-
(status_str, combined_video_path_or_None)
|
| 368 |
-
"""
|
| 369 |
-
# ---- Basic validation ----
|
| 370 |
if video_file is None:
|
| 371 |
return "❌ Please upload a video file first.", None
|
| 372 |
|
| 373 |
if not caption or caption.strip() == "":
|
| 374 |
caption=""
|
| 375 |
|
| 376 |
-
|
| 377 |
caption = caption.strip()
|
| 378 |
logs = []
|
| 379 |
|
|
@@ -382,11 +375,11 @@ def generate_audio_core(video_file, caption):
|
|
| 382 |
logs.append(msg)
|
| 383 |
return "\n".join(logs)
|
| 384 |
|
| 385 |
-
# ---- Working directory (auto-cleaned on exit) ----
|
| 386 |
work_dir = tempfile.mkdtemp(prefix="PrismAudio_")
|
| 387 |
|
| 388 |
try:
|
| 389 |
# ---- Step 1: Convert / copy to mp4 ----
|
|
|
|
| 390 |
status = log_step("📹 Step 1: Preparing video...")
|
| 391 |
|
| 392 |
src_ext = os.path.splitext(video_file)[1].lower()
|
|
@@ -399,50 +392,46 @@ def generate_audio_core(video_file, caption):
|
|
| 399 |
return log_step(f"❌ Video conversion failed:\n{err}"), None
|
| 400 |
else:
|
| 401 |
shutil.copy(video_file, mp4_path)
|
| 402 |
-
log_step(" Video ready.")
|
| 403 |
|
| 404 |
# ---- Step 2: Validate duration ----
|
|
|
|
| 405 |
status = log_step("📹 Step 2: Checking video duration...")
|
| 406 |
|
| 407 |
-
|
| 408 |
duration = get_video_duration(mp4_path)
|
| 409 |
-
log_step(f" Duration: {duration:.2f}s")
|
| 410 |
|
| 411 |
# ---- Step 3: Extract video frames ----
|
| 412 |
-
|
|
|
|
| 413 |
|
| 414 |
clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
|
| 415 |
-
log_step(f"
|
| 416 |
-
log_step(f" sync_chunk : {tuple(sync_chunk.shape)}")
|
| 417 |
|
| 418 |
# ---- Step 4: Extract model features ----
|
| 419 |
-
|
| 420 |
-
|
| 421 |
|
| 422 |
info = extract_features(clip_chunk, sync_chunk, caption)
|
| 423 |
-
log_step(f"
|
| 424 |
-
log_step(f" global_video_features : {tuple(info['global_video_features'].shape)}")
|
| 425 |
-
log_step(f" video_features : {tuple(info['video_features'].shape)}")
|
| 426 |
-
log_step(f" global_text_features : {tuple(info['global_text_features'].shape)}")
|
| 427 |
-
log_step(f" sync_features : {tuple(info['sync_features'].shape)}")
|
| 428 |
|
| 429 |
# ---- Step 5: Build inference batch ----
|
| 430 |
-
|
| 431 |
-
|
| 432 |
|
| 433 |
audio_latent, meta = build_meta(info, duration, caption)
|
| 434 |
-
log_step(f" audio_latent : {tuple(audio_latent.shape)}")
|
| 435 |
|
| 436 |
# ---- Step 6: Diffusion sampling ----
|
|
|
|
| 437 |
status = log_step("🎵 Step 6: Running diffusion sampling...")
|
| 438 |
-
#yield status, None
|
| 439 |
|
| 440 |
generated_audio = run_diffusion(audio_latent, meta, duration)
|
| 441 |
-
log_step(f"
|
| 442 |
|
| 443 |
# ---- Step 7: Save generated audio (temp) ----
|
|
|
|
| 444 |
status = log_step("💾 Step 7: Saving generated audio...")
|
| 445 |
-
#yield status, None
|
| 446 |
|
| 447 |
audio_path = os.path.join(work_dir, "generated_audio.wav")
|
| 448 |
torchaudio.save(
|
|
@@ -450,38 +439,37 @@ def generate_audio_core(video_file, caption):
|
|
| 450 |
generated_audio[0], # (1, T)
|
| 451 |
SAMPLE_RATE,
|
| 452 |
)
|
| 453 |
-
log_step(f" Audio saved: {audio_path}")
|
| 454 |
|
| 455 |
# ---- Step 8: Mux audio into original video ----
|
|
|
|
| 456 |
status = log_step("🎬 Step 8: Merging audio into video...")
|
| 457 |
-
#yield status, None
|
| 458 |
|
| 459 |
combined_path = os.path.join(work_dir, "output_with_audio.mp4")
|
| 460 |
ok, err = combine_audio_video(mp4_path, audio_path, combined_path)
|
| 461 |
if not ok:
|
| 462 |
return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
|
| 463 |
|
|
|
|
| 464 |
|
| 465 |
-
|
|
|
|
|
|
|
| 466 |
return "\n".join(logs), combined_path
|
| 467 |
|
| 468 |
except Exception as e:
|
| 469 |
log_step(f"❌ Unexpected error: {str(e)}")
|
| 470 |
log.exception(e)
|
| 471 |
return "\n".join(logs), None
|
| 472 |
-
|
| 473 |
-
end_time =time.time()
|
| 474 |
-
print("cost: ",end_time-start_time)
|
| 475 |
|
| 476 |
-
# Note: work_dir is NOT deleted here so Gradio can serve the output file.
|
| 477 |
-
# Gradio manages its own GRADIO_TEMP_DIR cleanup on restart.
|
| 478 |
|
| 479 |
def generate_audio(video_file, caption):
|
| 480 |
-
# 先yield状态
|
| 481 |
yield "⏳ Waiting for GPU...", None
|
| 482 |
result_logs, result_video = generate_audio_core(video_file, caption)
|
| 483 |
yield result_logs, result_video
|
| 484 |
|
|
|
|
|
|
|
| 485 |
# ==================== Gradio UI ====================
|
| 486 |
|
| 487 |
def build_ui() -> gr.Blocks:
|
|
@@ -556,6 +544,7 @@ def build_ui() -> gr.Blocks:
|
|
| 556 |
<Spatial> Natural sound distribution across the stereo field, suggesting birds are around the listener. Food interaction sounds can be localized.
|
| 557 |
"""],
|
| 558 |
["demos/Rail transport_3_479.mp4", "Generate ambient countryside sounds with a gentle breeze rustling the leaves of a large tree. From the right, introduce a faint rumble of wheels on a track and a steam engine chugging. Allow the sounds to grow louder and pan from right to left as the steam train travels across the landscape. Include the powerful chugging and clattering of carriages in the soundscape, then gradually recede the sounds to the left. Ensure no additional background noise or music is present."],
|
|
|
|
| 559 |
["demos/Cat_2_438.mp4", "A cat perched in a tree, letting out loud and sweet meows."],
|
| 560 |
],
|
| 561 |
inputs=[video_input, caption_input],
|
|
|
|
| 358 |
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
|
| 359 |
_MODELS["feature_extractor"].to(DEVICE)
|
| 360 |
_MODELS["diffusion"].to(DEVICE)
|
| 361 |
+
|
| 362 |
+
total_start_time = time.time()
|
|
|
|
|
|
|
| 363 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
if video_file is None:
|
| 365 |
return "❌ Please upload a video file first.", None
|
| 366 |
|
| 367 |
if not caption or caption.strip() == "":
|
| 368 |
caption=""
|
| 369 |
|
|
|
|
| 370 |
caption = caption.strip()
|
| 371 |
logs = []
|
| 372 |
|
|
|
|
| 375 |
logs.append(msg)
|
| 376 |
return "\n".join(logs)
|
| 377 |
|
|
|
|
| 378 |
work_dir = tempfile.mkdtemp(prefix="PrismAudio_")
|
| 379 |
|
| 380 |
try:
|
| 381 |
# ---- Step 1: Convert / copy to mp4 ----
|
| 382 |
+
step_start = time.time()
|
| 383 |
status = log_step("📹 Step 1: Preparing video...")
|
| 384 |
|
| 385 |
src_ext = os.path.splitext(video_file)[1].lower()
|
|
|
|
| 392 |
return log_step(f"❌ Video conversion failed:\n{err}"), None
|
| 393 |
else:
|
| 394 |
shutil.copy(video_file, mp4_path)
|
| 395 |
+
log_step(f" Video ready. ⏱️ Step 1 cost: {time.time() - step_start:.2f}s")
|
| 396 |
|
| 397 |
# ---- Step 2: Validate duration ----
|
| 398 |
+
step_start = time.time()
|
| 399 |
status = log_step("📹 Step 2: Checking video duration...")
|
| 400 |
|
|
|
|
| 401 |
duration = get_video_duration(mp4_path)
|
| 402 |
+
log_step(f" Duration: {duration:.2f}s ⏱️ Step 2 cost: {time.time() - step_start:.2f}s")
|
| 403 |
|
| 404 |
# ---- Step 3: Extract video frames ----
|
| 405 |
+
step_start = time.time()
|
| 406 |
+
status = log_step("🎞️ Step 3: Extracting video frames...")
|
| 407 |
|
| 408 |
clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
|
| 409 |
+
log_step(f" Frames extracted. ⏱️ Step 3 cost: {time.time() - step_start:.2f}s")
|
|
|
|
| 410 |
|
| 411 |
# ---- Step 4: Extract model features ----
|
| 412 |
+
step_start = time.time()
|
| 413 |
+
status = log_step("🧠 Step 4: Extracting text / video features...")
|
| 414 |
|
| 415 |
info = extract_features(clip_chunk, sync_chunk, caption)
|
| 416 |
+
log_step(f" Features extracted. ⏱️ Step 4 cost: {time.time() - step_start:.2f}s")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
# ---- Step 5: Build inference batch ----
|
| 419 |
+
step_start = time.time()
|
| 420 |
+
status = log_step("📦 Step 5: Building inference batch...")
|
| 421 |
|
| 422 |
audio_latent, meta = build_meta(info, duration, caption)
|
| 423 |
+
log_step(f" audio_latent : {tuple(audio_latent.shape)} ⏱️ Step 5 cost: {time.time() - step_start:.2f}s")
|
| 424 |
|
| 425 |
# ---- Step 6: Diffusion sampling ----
|
| 426 |
+
step_start = time.time()
|
| 427 |
status = log_step("🎵 Step 6: Running diffusion sampling...")
|
|
|
|
| 428 |
|
| 429 |
generated_audio = run_diffusion(audio_latent, meta, duration)
|
| 430 |
+
log_step(f" Diffusion sampling done. ⏱️ Step 6 cost: {time.time() - step_start:.2f}s")
|
| 431 |
|
| 432 |
# ---- Step 7: Save generated audio (temp) ----
|
| 433 |
+
step_start = time.time()
|
| 434 |
status = log_step("💾 Step 7: Saving generated audio...")
|
|
|
|
| 435 |
|
| 436 |
audio_path = os.path.join(work_dir, "generated_audio.wav")
|
| 437 |
torchaudio.save(
|
|
|
|
| 439 |
generated_audio[0], # (1, T)
|
| 440 |
SAMPLE_RATE,
|
| 441 |
)
|
| 442 |
+
log_step(f" Audio saved: {audio_path} ⏱️ Step 7 cost: {time.time() - step_start:.2f}s")
|
| 443 |
|
| 444 |
# ---- Step 8: Mux audio into original video ----
|
| 445 |
+
step_start = time.time()
|
| 446 |
status = log_step("🎬 Step 8: Merging audio into video...")
|
|
|
|
| 447 |
|
| 448 |
combined_path = os.path.join(work_dir, "output_with_audio.mp4")
|
| 449 |
ok, err = combine_audio_video(mp4_path, audio_path, combined_path)
|
| 450 |
if not ok:
|
| 451 |
return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
|
| 452 |
|
| 453 |
+
log_step(f" Audio and video merged. ⏱️ Step 8 cost: {time.time() - step_start:.2f}s")
|
| 454 |
|
| 455 |
+
total_cost = time.time() - total_start_time
|
| 456 |
+
log_step(f"✅ Done! Audio and video merged successfully. ⏱️ Total cost: {total_cost:.2f}s")
|
| 457 |
+
|
| 458 |
return "\n".join(logs), combined_path
|
| 459 |
|
| 460 |
except Exception as e:
|
| 461 |
log_step(f"❌ Unexpected error: {str(e)}")
|
| 462 |
log.exception(e)
|
| 463 |
return "\n".join(logs), None
|
|
|
|
|
|
|
|
|
|
| 464 |
|
|
|
|
|
|
|
| 465 |
|
| 466 |
def generate_audio(video_file, caption):
|
|
|
|
| 467 |
yield "⏳ Waiting for GPU...", None
|
| 468 |
result_logs, result_video = generate_audio_core(video_file, caption)
|
| 469 |
yield result_logs, result_video
|
| 470 |
|
| 471 |
+
|
| 472 |
+
|
| 473 |
# ==================== Gradio UI ====================
|
| 474 |
|
| 475 |
def build_ui() -> gr.Blocks:
|
|
|
|
| 544 |
<Spatial> Natural sound distribution across the stereo field, suggesting birds are around the listener. Food interaction sounds can be localized.
|
| 545 |
"""],
|
| 546 |
["demos/Rail transport_3_479.mp4", "Generate ambient countryside sounds with a gentle breeze rustling the leaves of a large tree. From the right, introduce a faint rumble of wheels on a track and a steam engine chugging. Allow the sounds to grow louder and pan from right to left as the steam train travels across the landscape. Include the powerful chugging and clattering of carriages in the soundscape, then gradually recede the sounds to the left. Ensure no additional background noise or music is present."],
|
| 547 |
+
["demos/3ClbaJYWVO4_000030.mp4", "Produce delicate and melodious guitar strumming that gracefully flows and dances with the musical rhythm."],
|
| 548 |
["demos/Cat_2_438.mp4", "A cat perched in a tree, letting out loud and sweet meows."],
|
| 549 |
],
|
| 550 |
inputs=[video_input, caption_input],
|