Spaces:

Adam-512
/

audio-ldm2

Paused

App Files Community

Adam-512 committed about 1 month ago

Commit

d95575c

1 Parent(s): c5b66c8

init

Browse files

Files changed (3) hide show

app.py +126 -0
packages.txt +1 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,126 @@

+# app.py
+import gradio as gr
+import torch
+from diffusers import AudioLDM2Pipeline
+import scipy.io.wavfile as wavfile
+import tempfile
+import os
+# ==================== 模型加载（只加载一次）====================
+print("Loading AudioLDM2-large model... (this may take 1-2 minutes on first cold start)")
+repo_id = "cvssp/audioldm2-large"
+# 使用 cache_dir 指向 /src，避免 HF Space 只读根目录问题
+pipe = AudioLDM2Pipeline.from_pretrained(
+    repo_id,
+    torch_dtype=torch.float16,
+    variant="fp16",
+    cache_dir="/src/.cache"   # Space 可写目录
+)
+# 强制使用 GPU（Space 默认有 GPU）
+pipe = pipe.to("cuda")
+pipe.enable_attention_slicing()        # 显存优化
+pipe.enable_vae_slicing()
+print("Model loaded successfully on GPU!")
+# ==================== 生成函数 ====================
+def text_to_audio(
+    prompt: str,
+    negative_prompt: str = "",
+    duration: float = 5.0,
+    guidance_scale: float = 3.5,
+    num_inference_steps: int = 200,
+    num_waveforms: int = 1,
+    seed: int = -1,
+):
+    generator = None
+    if seed != -1:
+        generator = torch.Generator("cuda").manual_seed(seed)
+    with torch.autocast("cuda"):
+        audios = pipe(
+            prompt,
+            negative_prompt=negative_prompt or None,
+            num_inference_steps=num_inference_steps,
+            audio_length_in_s=duration,
+            num_waveforms_per_prompt=num_waveforms,
+            guidance_scale=guidance_scale,
+            generator=generator,
+        ).audios  # shape: [num_waveforms, samples]
+    # 取质量最好的第一个
+    audio_np = audios[0]
+    # 保存到临时文件
+    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    wavfile.write(tmp_file.name, rate=16000, data=audio_np)
+    return tmp_file.name
+# ==================== Gradio 界面 ====================
+css = """
+.gradio-container {max-width: 900px !important; margin: auto !important;}
+footer {display: none !important;}
+"""
+with gr.Blocks(css=css, title="AudioLDM2-Large Text-to-Audio") as demo:
+    gr.Markdown("""
+    # AudioLDM2-Large
+    最强开源文本生成音频模型（支持音效、音乐、环境声、语音等）
+    """)
+    with gr.Row():
+        with gr.Column(scale=2):
+            prompt = gr.Textbox(
+                label="描述你想要的音频（越详细越好）",
+                placeholder="例如：A dog barking angrily on a busy city street with car horns",
+                lines=3
+            )
+            negative = gr.Textbox(
+                label="负面提示（可选）",
+                placeholder="low quality, noise, distortion, echo",
+                lines=1
+            )
+            with gr.Row():
+                duration = gr.Slider(2.0, 10.0, value=5.0, step=0.5, label="时长（秒）")
+                steps = gr.Slider(50, 200, value=200, step=25, label="采样步数（越高越精细但越慢）")
+            with gr.Row():
+                guidance = gr.Slider(1.0, 10.0, value=3.5, step=0.5, label="引导尺度（Guidance Scale）")
+                num = gr.Slider(1, 4, value=1, step=1, label="生成数量（同时生成多个候选）")
+            seed = gr.Number(value=-1, label="随机种子（相同种子+相同提示 = 可复现，填 -1 随机）")
+            btn = gr.Button("Generate Audio 🎵", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            output_audio = gr.Audio(label="生成的音频", type="filepath", interactive=False)
+    btn.click(
+        fn=text_to_audio,
+        inputs=[prompt, negative, duration, guidance, steps, num, seed],
+        outputs=output_audio,
+        show_progress=True
+    )
+    gr.Examples(
+        examples=[
+            ["A beautiful piano melody with soft strings in the background", "", 8.0],
+            ["Thunderstorm with heavy rain and strong wind blowing through trees", "", 7.0],
+            ["A cat meowing and then purring while being petted", "", 5.0],
+            ["80s synthwave music with retro drums and electric guitar solo", "", 10.0],
+            ["Fire crackling in a cozy fireplace on a winter night", "", 6.0],
+        ],
+        inputs=[prompt, negative, duration],
+        label="点击示例一键生成"
+    )
+    gr.Markdown("""
+    ### Tips
+    - 生成一次大约需要 20～60 秒（取决于步数和时长）
+    - 推荐 200 步 + Guidance 3.5～4.5 获得最佳质量
+    - Space 使用 A10G GPU，冷启动后速度会稍慢，之后会很快
+    """)
+if __name__ == "__main__":
+    demo.queue(max_size=20).launch()

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio>=4.0
+torch>=2.1
+diffusers>=0.27.0
+transformers>=4.38
+accelerate
+scipy
+safetensors