Spaces:
Running on Zero
Running on Zero
Commit ·
36c1b45
1
Parent(s): dd33d76
Update UI descriptions: scene-type guidance instead of duration
Browse files
app.py
CHANGED
|
@@ -573,11 +573,11 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
|
|
| 573 |
gr.Markdown(
|
| 574 |
"# Video-to-Audio Generation\n"
|
| 575 |
"Choose a model and upload a video to generate synchronized audio.\n\n"
|
| 576 |
-
"| Model |
|
| 577 |
-
"|-------|----------
|
| 578 |
-
"| **TARO** |
|
| 579 |
-
"| **MMAudio** |
|
| 580 |
-
"| **HunyuanFoley** |
|
| 581 |
)
|
| 582 |
|
| 583 |
with gr.Tabs():
|
|
@@ -587,9 +587,10 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
|
|
| 587 |
# ---------------------------------------------------------- #
|
| 588 |
with gr.Tab("TARO"):
|
| 589 |
gr.Markdown(
|
| 590 |
-
"**TARO** — Video-conditioned diffusion (ICCV 2025). No text prompt needed
|
| 591 |
-
"
|
| 592 |
-
"
|
|
|
|
| 593 |
)
|
| 594 |
with gr.Row():
|
| 595 |
with gr.Column():
|
|
@@ -646,9 +647,10 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
|
|
| 646 |
with gr.Tab("MMAudio"):
|
| 647 |
gr.Markdown(
|
| 648 |
"**MMAudio** — Multimodal flow-matching (CVPR 2025). "
|
| 649 |
-
"
|
| 650 |
-
"
|
| 651 |
-
"
|
|
|
|
| 652 |
)
|
| 653 |
with gr.Row():
|
| 654 |
with gr.Column():
|
|
@@ -698,9 +700,12 @@ with gr.Blocks(title="Video-to-Audio Generation") as demo:
|
|
| 698 |
# ---------------------------------------------------------- #
|
| 699 |
with gr.Tab("HunyuanFoley"):
|
| 700 |
gr.Markdown(
|
| 701 |
-
"**HunyuanVideo-Foley** (Tencent Hunyuan). "
|
| 702 |
-
"
|
| 703 |
-
"
|
|
|
|
|
|
|
|
|
|
| 704 |
)
|
| 705 |
with gr.Row():
|
| 706 |
with gr.Column():
|
|
|
|
| 573 |
gr.Markdown(
|
| 574 |
"# Video-to-Audio Generation\n"
|
| 575 |
"Choose a model and upload a video to generate synchronized audio.\n\n"
|
| 576 |
+
"| Model | Best for | Avoid for |\n"
|
| 577 |
+
"|-------|----------|-----------|\n"
|
| 578 |
+
"| **TARO** | Natural, physics-driven impacts — footsteps, collisions, water, wind, crackling fire. Excels when the sound is tightly coupled to visible motion without needing a text description. | Dialogue, music, or complex layered soundscapes where semantic context matters. |\n"
|
| 579 |
+
"| **MMAudio** | Mixed scenes where you want both visual grounding *and* semantic control via a text prompt — e.g. a busy street scene where you want to emphasize the rain rather than the traffic. Great for ambient textures and nuanced sound design. | Pure impact/foley shots where TARO's motion-coupling would be sharper, or cinematic music beds. |\n"
|
| 580 |
+
"| **HunyuanFoley** | Cinematic foley requiring high fidelity and explicit creative direction — dramatic SFX, layered environmental design, or any scene where you have a clear written description of the desired sound palette. | Quick one-shot clips where you don't want to write a prompt, or raw impact sounds where timing precision matters more than richness. |"
|
| 581 |
)
|
| 582 |
|
| 583 |
with gr.Tabs():
|
|
|
|
| 587 |
# ---------------------------------------------------------- #
|
| 588 |
with gr.Tab("TARO"):
|
| 589 |
gr.Markdown(
|
| 590 |
+
"**TARO** — Video-conditioned diffusion (ICCV 2025). No text prompt needed — "
|
| 591 |
+
"sound is derived entirely from visual motion. "
|
| 592 |
+
"Best for scenes with clear physics-driven events: footsteps, impacts, splashing water, "
|
| 593 |
+
"crackling fire, rustling leaves, machinery. The model learns timing directly from the video."
|
| 594 |
)
|
| 595 |
with gr.Row():
|
| 596 |
with gr.Column():
|
|
|
|
| 647 |
with gr.Tab("MMAudio"):
|
| 648 |
gr.Markdown(
|
| 649 |
"**MMAudio** — Multimodal flow-matching (CVPR 2025). "
|
| 650 |
+
"Combines visual grounding with optional text guidance, making it the most flexible choice. "
|
| 651 |
+
"Best for mixed or ambiguous scenes — busy environments, nature montages, abstract visuals — "
|
| 652 |
+
"where a short prompt lets you steer which element of the scene to emphasise "
|
| 653 |
+
"(e.g. *'heavy rain'* over a street scene to suppress traffic noise)."
|
| 654 |
)
|
| 655 |
with gr.Row():
|
| 656 |
with gr.Column():
|
|
|
|
| 700 |
# ---------------------------------------------------------- #
|
| 701 |
with gr.Tab("HunyuanFoley"):
|
| 702 |
gr.Markdown(
|
| 703 |
+
"**HunyuanVideo-Foley** (Tencent Hunyuan, 2025). "
|
| 704 |
+
"Highest-fidelity model for cinematic and creative foley. "
|
| 705 |
+
"Best for scenes that call for rich, layered sound design — dramatic SFX, "
|
| 706 |
+
"complex environments (crowd + rain + distant thunder), or any clip where you have "
|
| 707 |
+
"a clear creative vision you can describe in a prompt. "
|
| 708 |
+
"Requires a text prompt."
|
| 709 |
)
|
| 710 |
with gr.Row():
|
| 711 |
with gr.Column():
|