Adam-512 committed on
Commit
d95575c
·
1 Parent(s): c5b66c8
Files changed (3) hide show
  1. app.py +126 -0
  2. packages.txt +1 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ import torch
4
+ from diffusers import AudioLDM2Pipeline
5
+ import scipy.io.wavfile as wavfile
6
+ import tempfile
7
+ import os
8
+
9
+ # ==================== 模型加载(只加载一次)====================
10
print("Loading AudioLDM2-large model... (this may take 1-2 minutes on first cold start)")
repo_id = "cvssp/audioldm2-large"

# Load the half-precision ("fp16") weights. The cache is redirected to
# /src/.cache, which the original author describes as a writable directory
# on the Space (the root filesystem being read-only).
# The pipeline is moved straight to the GPU; there is no CPU fallback.
pipe = AudioLDM2Pipeline.from_pretrained(
    repo_id,
    cache_dir="/src/.cache",
    variant="fp16",
    torch_dtype=torch.float16,
).to("cuda")

# GPU-memory optimisations.
pipe.enable_vae_slicing()
pipe.enable_attention_slicing()
print("Model loaded successfully on GPU!")
26
+
27
+ # ==================== 生成函数 ====================
28
def text_to_audio(
    prompt: str,
    negative_prompt: str = "",
    duration: float = 5.0,
    guidance_scale: float = 3.5,
    num_inference_steps: int = 200,
    num_waveforms: int = 1,
    seed: int = -1,
) -> str:
    """Generate audio for a text prompt and return the path of a WAV file.

    Args:
        prompt: Text description of the desired audio.
        negative_prompt: Text describing what to avoid; an empty string is
            passed to the pipeline as ``None``.
        duration: Requested audio length in seconds.
        guidance_scale: Guidance scale forwarded to the pipeline.
        num_inference_steps: Number of denoising steps.
        num_waveforms: Number of candidate waveforms generated per prompt;
            only the first one is returned.
        seed: Seed for reproducible output. ``-1`` (or ``None``) lets the
            pipeline pick its own randomness.

    Returns:
        Path of a temporary ``.wav`` file (16 kHz) holding the first
        generated waveform. The file is not deleted by this function.
    """
    # UI widgets may deliver numbers as floats (and a cleared Number field as
    # None). torch.Generator.manual_seed only accepts ints, so normalise here.
    generator = None
    if seed is not None and int(seed) != -1:
        generator = torch.Generator("cuda").manual_seed(int(seed))

    with torch.autocast("cuda"):
        audios = pipe(
            prompt,
            negative_prompt=negative_prompt or None,
            num_inference_steps=int(num_inference_steps),
            audio_length_in_s=duration,
            num_waveforms_per_prompt=int(num_waveforms),
            guidance_scale=guidance_scale,
            generator=generator,
        ).audios  # shape: [num_waveforms, samples]

    # Keep the first candidate, which the original author treats as the best.
    # NOTE(review): diffusers appears to rank candidates against the prompt
    # only when `librosa` is installed -- confirm it is in requirements.txt.
    audio_np = audios[0]

    # Reserve a unique path and close the handle before writing, so no file
    # descriptor is leaked per request; delete=False keeps the file on disk
    # for the caller to serve.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        wav_path = tmp_file.name
    wavfile.write(wav_path, rate=16000, data=audio_np)
    return wav_path
59
+
60
+
61
+ # ==================== Gradio 界面 ====================
62
# Page-level CSS: cap the container width, centre it, and hide the footer.
css = """
.gradio-container {max-width: 900px !important; margin: auto !important;}
footer {display: none !important;}
"""

# NOTE(review): the nesting below was reconstructed from a diff whose
# indentation had been stripped -- confirm component placement (in particular
# whether `seed` and `btn` sit inside or below the second slider row).
with gr.Blocks(css=css, title="AudioLDM2-Large Text-to-Audio") as demo:
    gr.Markdown("""
    # AudioLDM2-Large
    最强开源文本生成音频模型(支持音效、音乐、环境声、语音等)
    """)

    with gr.Row():
        # Left column: prompt inputs and generation controls.
        with gr.Column(scale=2):
            prompt = gr.Textbox(
                label="描述你想要的音频(越详细越好)",
                placeholder="例如:A dog barking angrily on a busy city street with car horns",
                lines=3
            )
            negative = gr.Textbox(
                label="负面提示(可选)",
                placeholder="low quality, noise, distortion, echo",
                lines=1
            )

            with gr.Row():
                duration = gr.Slider(2.0, 10.0, value=5.0, step=0.5, label="时长(秒)")
                steps = gr.Slider(50, 200, value=200, step=25, label="采样步数(越高越精细但越慢)")
            with gr.Row():
                guidance = gr.Slider(1.0, 10.0, value=3.5, step=0.5, label="引导尺度(Guidance Scale)")
                num = gr.Slider(1, 4, value=1, step=1, label="生成数量(同时生成多个候选)")
            # precision=0 makes the component hand the callback an int;
            # without it the seed arrives as a float, which
            # torch.Generator.manual_seed rejects.
            seed = gr.Number(
                value=-1,
                precision=0,
                label="随机种子(相同种子+相同提示 = 可复现,填 -1 随机)",
            )

            btn = gr.Button("Generate Audio 🎵", variant="primary", size="lg")

        # Right column: playback of the generated file.
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="生成的音频", type="filepath", interactive=False)

    # Input order must match the positional parameters of text_to_audio.
    btn.click(
        fn=text_to_audio,
        inputs=[prompt, negative, duration, guidance, steps, num, seed],
        outputs=output_audio,
        show_progress=True
    )

    # Each example supplies prompt, negative prompt and duration only.
    gr.Examples(
        examples=[
            ["A beautiful piano melody with soft strings in the background", "", 8.0],
            ["Thunderstorm with heavy rain and strong wind blowing through trees", "", 7.0],
            ["A cat meowing and then purring while being petted", "", 5.0],
            ["80s synthwave music with retro drums and electric guitar solo", "", 10.0],
            ["Fire crackling in a cozy fireplace on a winter night", "", 6.0],
        ],
        inputs=[prompt, negative, duration],
        label="点击示例一键生成"
    )

    gr.Markdown("""
    ### Tips
    - 生成一次大约需要 20~60 秒(取决于步数和时长)
    - 推荐 200 步 + Guidance 3.5~4.5 获得最佳质量
    - Space 使用 A10G GPU,冷启动后速度会稍慢,之后会很快
    """)

if __name__ == "__main__":
    # Queue requests (at most 20 waiting) before starting the server.
    demo.queue(max_size=20).launch()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio>=4.0
2
+ torch>=2.1
3
+ diffusers>=0.27.0
4
+ transformers>=4.38
5
+ accelerate
6
+ scipy
7
+ safetensors