Spaces:
Running on Zero
Running on Zero
prismaudio-project committed on
Commit ·
c367d8b
1
Parent(s): 6cba118
fix
Browse files
app.py
CHANGED
|
@@ -352,8 +352,7 @@ def run_diffusion(audio_latent: torch.Tensor, meta: dict, duration: float) -> to
|
|
| 352 |
|
| 353 |
@spaces.GPU
|
| 354 |
def generate_audio_core(video_file, caption):
|
| 355 |
-
|
| 356 |
-
load_all_models()
|
| 357 |
|
| 358 |
start_time =time.time()
|
| 359 |
|
|
@@ -383,8 +382,10 @@ def generate_audio_core(video_file, caption):
|
|
| 383 |
work_dir = tempfile.mkdtemp(dir=os.environ["GRADIO_TEMP_DIR"], prefix="PrismAudio_")
|
| 384 |
|
| 385 |
try:
|
|
|
|
|
|
|
| 386 |
# ---- Step 1: Convert / copy to mp4 ----
|
| 387 |
-
|
| 388 |
|
| 389 |
src_ext = os.path.splitext(video_file)[1].lower()
|
| 390 |
mp4_path = os.path.join(work_dir, "input.mp4")
|
|
@@ -399,21 +400,21 @@ def generate_audio_core(video_file, caption):
|
|
| 399 |
log_step(" Video ready.")
|
| 400 |
|
| 401 |
# ---- Step 2: Validate duration ----
|
| 402 |
-
|
| 403 |
|
| 404 |
|
| 405 |
duration = get_video_duration(mp4_path)
|
| 406 |
log_step(f" Duration: {duration:.2f}s")
|
| 407 |
|
| 408 |
# ---- Step 3: Extract video frames ----
|
| 409 |
-
|
| 410 |
|
| 411 |
clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
|
| 412 |
log_step(f" clip_chunk : {tuple(clip_chunk.shape)}")
|
| 413 |
log_step(f" sync_chunk : {tuple(sync_chunk.shape)}")
|
| 414 |
|
| 415 |
# ---- Step 4: Extract model features ----
|
| 416 |
-
|
| 417 |
#yield status, None
|
| 418 |
|
| 419 |
info = extract_features(clip_chunk, sync_chunk, caption)
|
|
@@ -431,14 +432,14 @@ def generate_audio_core(video_file, caption):
|
|
| 431 |
log_step(f" audio_latent : {tuple(audio_latent.shape)}")
|
| 432 |
|
| 433 |
# ---- Step 6: Diffusion sampling ----
|
| 434 |
-
|
| 435 |
#yield status, None
|
| 436 |
|
| 437 |
generated_audio = run_diffusion(audio_latent, meta, duration)
|
| 438 |
log_step(f" Generated audio shape : {tuple(generated_audio.shape)}")
|
| 439 |
|
| 440 |
# ---- Step 7: Save generated audio (temp) ----
|
| 441 |
-
|
| 442 |
#yield status, None
|
| 443 |
|
| 444 |
audio_path = os.path.join(work_dir, "generated_audio.wav")
|
|
@@ -450,7 +451,7 @@ def generate_audio_core(video_file, caption):
|
|
| 450 |
log_step(f" Audio saved: {audio_path}")
|
| 451 |
|
| 452 |
# ---- Step 8: Mux audio into original video ----
|
| 453 |
-
|
| 454 |
#yield status, None
|
| 455 |
|
| 456 |
combined_path = os.path.join(work_dir, "output_with_audio.mp4")
|
|
@@ -561,20 +562,6 @@ def build_ui() -> gr.Blocks:
|
|
| 561 |
2. Enter a text prompt describing the desired audio content.
|
| 562 |
3. Click **π Generate Audio** and watch the log on the right for progress.
|
| 563 |
4. The output video (original visuals + generated audio) appears below when done.
|
| 564 |
-
|
| 565 |
-
**Notes**
|
| 566 |
-
- All models are pre-loaded at startup — no warm-up delay on the first request.
|
| 567 |
-
- Everything stays in memory; only the final wav and merged mp4 are written to disk.
|
| 568 |
-
- A CUDA GPU is strongly recommended; CPU inference will be very slow.
|
| 569 |
-
- Queue depth is limited to 3 concurrent requests to avoid OOM.
|
| 570 |
-
|
| 571 |
-
**Current model paths**
|
| 572 |
-
```
|
| 573 |
-
MODEL_CONFIG_PATH = {MODEL_CONFIG_PATH}
|
| 574 |
-
CKPT_PATH = {CKPT_PATH}
|
| 575 |
-
VAE_CKPT_PATH = {VAE_CKPT_PATH}
|
| 576 |
-
SYNCHFORMER_CKPT_PATH = {SYNCHFORMER_CKPT_PATH}
|
| 577 |
-
```
|
| 578 |
""")
|
| 579 |
|
| 580 |
# ======================================================
|
|
|
|
| 352 |
|
| 353 |
@spaces.GPU
|
| 354 |
def generate_audio_core(video_file, caption):
|
| 355 |
+
|
|
|
|
| 356 |
|
| 357 |
start_time =time.time()
|
| 358 |
|
|
|
|
| 382 |
work_dir = tempfile.mkdtemp(dir=os.environ["GRADIO_TEMP_DIR"], prefix="PrismAudio_")
|
| 383 |
|
| 384 |
try:
|
| 385 |
+
if _MODELS["diffusion"] is None:
|
| 386 |
+
load_all_models()
|
| 387 |
# ---- Step 1: Convert / copy to mp4 ----
|
| 388 |
+
status = log_step("πΉ Step 1: Preparing video...")
|
| 389 |
|
| 390 |
src_ext = os.path.splitext(video_file)[1].lower()
|
| 391 |
mp4_path = os.path.join(work_dir, "input.mp4")
|
|
|
|
| 400 |
log_step(" Video ready.")
|
| 401 |
|
| 402 |
# ---- Step 2: Validate duration ----
|
| 403 |
+
status = log_step("πΉ Step 2: Checking video duration...")
|
| 404 |
|
| 405 |
|
| 406 |
duration = get_video_duration(mp4_path)
|
| 407 |
log_step(f" Duration: {duration:.2f}s")
|
| 408 |
|
| 409 |
# ---- Step 3: Extract video frames ----
|
| 410 |
+
status = log_step("ποΈ Step 3: Extracting video frames (clip & sync)...")
|
| 411 |
|
| 412 |
clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
|
| 413 |
log_step(f" clip_chunk : {tuple(clip_chunk.shape)}")
|
| 414 |
log_step(f" sync_chunk : {tuple(sync_chunk.shape)}")
|
| 415 |
|
| 416 |
# ---- Step 4: Extract model features ----
|
| 417 |
+
status = log_step("π§ Step 4: Extracting text / video / sync features...")
|
| 418 |
#yield status, None
|
| 419 |
|
| 420 |
info = extract_features(clip_chunk, sync_chunk, caption)
|
|
|
|
| 432 |
log_step(f" audio_latent : {tuple(audio_latent.shape)}")
|
| 433 |
|
| 434 |
# ---- Step 6: Diffusion sampling ----
|
| 435 |
+
status = log_step("π΅ Step 6: Running diffusion sampling...")
|
| 436 |
#yield status, None
|
| 437 |
|
| 438 |
generated_audio = run_diffusion(audio_latent, meta, duration)
|
| 439 |
log_step(f" Generated audio shape : {tuple(generated_audio.shape)}")
|
| 440 |
|
| 441 |
# ---- Step 7: Save generated audio (temp) ----
|
| 442 |
+
status = log_step("πΎ Step 7: Saving generated audio...")
|
| 443 |
#yield status, None
|
| 444 |
|
| 445 |
audio_path = os.path.join(work_dir, "generated_audio.wav")
|
|
|
|
| 451 |
log_step(f" Audio saved: {audio_path}")
|
| 452 |
|
| 453 |
# ---- Step 8: Mux audio into original video ----
|
| 454 |
+
status = log_step("π¬ Step 8: Merging audio into video...")
|
| 455 |
#yield status, None
|
| 456 |
|
| 457 |
combined_path = os.path.join(work_dir, "output_with_audio.mp4")
|
|
|
|
| 562 |
2. Enter a text prompt describing the desired audio content.
|
| 563 |
3. Click **π Generate Audio** and watch the log on the right for progress.
|
| 564 |
4. The output video (original visuals + generated audio) appears below when done.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
""")
|
| 566 |
|
| 567 |
# ======================================================
|