Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -140,7 +140,7 @@ css = """
|
|
| 140 |
iface = gr.Blocks(css=css)
|
| 141 |
|
| 142 |
with iface:
|
| 143 |
-
|
| 144 |
"""
|
| 145 |
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
|
| 146 |
<div
|
|
@@ -149,14 +149,89 @@ with iface:
|
|
| 149 |
"
|
| 150 |
>
|
| 151 |
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
|
| 152 |
-
AudioLDM: Text-to-Audio Generation
|
| 153 |
</h1>
|
| 154 |
-
</div> <p style="margin-bottom: 10px; font-size: 94%">
|
| 155 |
-
<a href="https://arxiv.org/abs/2301.12503">[Paper]</a> <a href="https://audioldm.github.io/">[Project
|
| 156 |
-
page]</a> <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm">[🧨
|
| 157 |
-
Diffusers]</a>
|
| 158 |
-
</p>
|
| 159 |
-
</div>
|
| 160 |
"""
|
| 161 |
)
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
iface = gr.Blocks(css=css)
|
| 141 |
|
| 142 |
with iface:
|
| 143 |
+
gr.HTML(
|
| 144 |
"""
|
| 145 |
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
|
| 146 |
<div
|
|
|
|
| 149 |
"
|
| 150 |
>
|
| 151 |
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
|
| 152 |
+
AudioLDM: Text-to-Audio Generation Diffusion Models
|
| 153 |
</h1>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
"""
|
| 155 |
)
|
| 156 |
+
with gr.Group():
|
| 157 |
+
with gr.Box():
|
| 158 |
+
textbox = gr.Textbox(
|
| 159 |
+
value="A hammer is hitting a wooden surface",
|
| 160 |
+
max_lines=1,
|
| 161 |
+
label="Input text",
|
| 162 |
+
info="Your text is important for the audio quality. Please ensure it is descriptive by using more adjectives.",
|
| 163 |
+
elem_id="prompt-in",
|
| 164 |
+
)
|
| 165 |
+
negative_textbox = gr.Textbox(
|
| 166 |
+
value="low quality, average quality",
|
| 167 |
+
max_lines=1,
|
| 168 |
+
label="Negative prompt",
|
| 169 |
+
info="Enter a negative prompt not to guide the audio generation. Selecting appropriate negative prompts can improve the audio quality significantly.",
|
| 170 |
+
elem_id="prompt-in",
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
with gr.Accordion("Click to modify detailed configurations", open=False):
|
| 174 |
+
seed = gr.Number(
|
| 175 |
+
value=45,
|
| 176 |
+
label="Seed",
|
| 177 |
+
info="Change this value (any integer number) will lead to a different generation result.",
|
| 178 |
+
)
|
| 179 |
+
duration = gr.Slider(2.5, 10, value=5, step=2.5, label="Duration (seconds)")
|
| 180 |
+
guidance_scale = gr.Slider(
|
| 181 |
+
0,
|
| 182 |
+
5,
|
| 183 |
+
value=3.5,
|
| 184 |
+
step=0.5,
|
| 185 |
+
label="Guidance scale",
|
| 186 |
+
info="Large => better quality and relevancy to text; Small => better diversity",
|
| 187 |
+
)
|
| 188 |
+
n_candidates = gr.Slider(
|
| 189 |
+
1,
|
| 190 |
+
3,
|
| 191 |
+
value=3,
|
| 192 |
+
step=1,
|
| 193 |
+
label="Number waveforms to generate",
|
| 194 |
+
info="Automatic quality control. This number control the number of candidates (e.g., generate three audios and choose the best to show you). A Larger value usually lead to better quality with heavier computation",
|
| 195 |
+
)
|
| 196 |
+
|
| 197 |
+
outputs = gr.Video(label="Output", elem_id="output-video")
|
| 198 |
+
btn = gr.Button("Submit").style(full_width=True)
|
| 199 |
+
|
| 200 |
+
with gr.Group(elem_id="share-btn-container", visible=False):
|
| 201 |
+
community_icon = gr.HTML(community_icon_html)
|
| 202 |
+
loading_icon = gr.HTML(loading_icon_html)
|
| 203 |
+
share_button = gr.Button("Share to community", elem_id="share-btn")
|
| 204 |
+
|
| 205 |
+
btn.click(
|
| 206 |
+
text2audio,
|
| 207 |
+
inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
|
| 208 |
+
outputs=[outputs],
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
share_button.click(None, [], [], _js=share_js)
|
| 212 |
+
gr.HTML(
|
| 213 |
+
gr.Examples(
|
| 214 |
+
[
|
| 215 |
+
["A hammer is hitting a wooden surface", "low quality, average quality", 5, 2.5, 45, 3],
|
| 216 |
+
["Peaceful and calming ambient music with singing bowl and other instruments.", "low quality, average quality", 5, 2.5, 45, 3],
|
| 217 |
+
["A man is speaking in a small room.", "low quality, average quality", 5, 2.5, 45, 3],
|
| 218 |
+
["A female is speaking followed by footstep sound", "low quality, average quality", 5, 2.5, 45, 3],
|
| 219 |
+
["Wooden table tapping sound followed by water pouring sound.", "low quality, average quality", 5, 2.5, 45, 3],
|
| 220 |
+
],
|
| 221 |
+
fn=text2audio,
|
| 222 |
+
inputs=[textbox, negative_textbox, duration, guidance_scale, seed, n_candidates],
|
| 223 |
+
outputs=[outputs],
|
| 224 |
+
cache_examples=True,
|
| 225 |
+
)
|
| 226 |
+
gr.HTML(
|
| 227 |
+
"""
|
| 228 |
+
<div class="acknowledgements"> <p>Essential Tricks for Enhancing the Quality of Your Generated
|
| 229 |
+
Audio</p> <p>1. Try to use more adjectives to describe your sound. For example: "A man is speaking
|
| 230 |
+
clearly and slowly in a large room" is better than "A man is speaking". This can make sure AudioLDM
|
| 231 |
+
understands what you want.</p> <p>2. Try to use different random seeds, which can affect the generation
|
| 232 |
+
quality significantly sometimes.</p> <p>3. It's better to use general terms like 'man' or 'woman'
|
| 233 |
+
instead of specific names for individuals or abstract objects that humans may not be familiar with,
|
| 234 |
+
such as 'mummy'.</p> <p>4. Using a negative prompt to not guide the diffusion process can improve the
|
| 235 |
+
audio quality significantly. Try using negative prompts like 'low quality'.</p> </div>
|
| 236 |
+
"""
|
| 237 |
+
)
|