haoyue518 committed on
Commit
bba9bc2
·
verified ·
1 Parent(s): 6c23f71

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +317 -0
  2. packages.txt +1 -0
  3. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, tempfile, subprocess
2
+ import gradio as gr
3
+ import numpy as np
4
+ import soundfile as sf
5
+ import librosa
6
+ import torch
7
+
8
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
_CUDA_AVAILABLE = torch.cuda.is_available()
DEVICE = "cuda" if _CUDA_AVAILABLE else "cpu"

# Sample rate in Hz used as the default target when loading audio.
SAMPLE_RATE = 44100
11
+
12
def load_audio_any_format(file_path, target_sr=SAMPLE_RATE):
    """Load an audio file with librosa, resampled to ``target_sr``.

    Channels are kept (``mono=False``); a one-dimensional result is reshaped
    to a single row so callers always receive a 2-D array.

    NOTE(review): the original note claims video files are supported as well;
    that depends on librosa's decoding backend and cannot be confirmed here.

    Args:
        file_path: Path of the file to decode.
        target_sr: Sample rate in Hz to resample to.

    Returns:
        ``(audio, sr)`` -- the 2-D sample array and the rate reported by
        librosa.

    Raises:
        ValueError: If loading fails for any reason.
    """
    try:
        samples, rate = librosa.load(file_path, sr=target_sr, mono=False)
        # A mono file comes back 1-D; promote it to one row.
        channels = samples if samples.ndim != 1 else samples.reshape(1, -1)
        return channels, rate
    except Exception as e:
        raise ValueError(f"音频加载失败: {e!s}")
21
+
22
def save_audio(path, audio, sr):
    """Write ``audio`` to ``path`` with soundfile using 16-bit PCM samples.

    Args:
        path: Destination file path.
        audio: 1-D signal, or a 2-D array that is transposed before writing.
        sr: Sample rate in Hz.
    """
    # A 1-D signal is first turned into a single row; either way the array is
    # transposed before being handed to soundfile.
    frames = audio.reshape(1, -1).T if audio.ndim == 1 else audio.T
    sf.write(path, frames, sr, subtype="PCM_16")
27
+
28
def run_demucs_separation(audio_path, output_dir):
    """Split a track into vocals and accompaniment with the Demucs CLI.

    Runs ``demucs.separate`` in a subprocess using the ``htdemucs`` model in
    two-stem (vocals / no_vocals) mode and returns the two WAV files it wrote.

    Args:
        audio_path: Path of the audio file to separate.
        output_dir: Directory under which Demucs writes its results.

    Returns:
        ``(vocals_path, instrumental_path)``.

    Raises:
        RuntimeError: If the Demucs process fails or an expected output file
            is missing.
    """
    # Local import: only needed to locate the running interpreter.
    import sys

    model_name = "htdemucs"
    try:
        cmd = [
            # Use the interpreter running this app so the subprocess sees the
            # same installed packages; a bare "python" on PATH may be another
            # environment (or absent).
            sys.executable, "-m", "demucs.separate",
            "--two-stems=vocals",
            "-n", model_name,
            "-o", output_dir,
            audio_path,
        ]
        subprocess.run(cmd, check=True, capture_output=True, text=True)

        # Outputs are expected under <output_dir>/<model>/<input stem>/.
        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        stem_dir = os.path.join(output_dir, model_name, base_name)
        vocals_path = os.path.join(stem_dir, "vocals.wav")
        instrumental_path = os.path.join(stem_dir, "no_vocals.wav")

        # Both stems are consumed by the caller, so verify both exist.
        for expected in (vocals_path, instrumental_path):
            if not os.path.exists(expected):
                raise FileNotFoundError("Demucs 分离失败,找不到输出文件")

        return vocals_path, instrumental_path

    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Demucs 执行失败: {e.stderr}") from e
    except Exception as e:
        raise RuntimeError(f"Demucs 分离失败: {str(e)}") from e
58
+
59
def detect_singing_segments(vocals_audio, sr, confidence_threshold=0.5):
    """Label which samples of a vocal track look like singing.

    The signal is analysed at 16 kHz with librosa's pYIN pitch tracker.
    Consecutive frames whose voicing probability exceeds
    ``confidence_threshold`` and that have a pitch estimate form a run. A run
    lasting at least 0.3 s whose pitch standard deviation lies strictly
    between 20 Hz and 200 Hz is labelled as singing. Frame labels are then
    expanded to one value per input sample and cleaned with a 100 ms majority
    filter.

    Args:
        vocals_audio: Vocal signal; assumed to be a 1-D (mono) array, which is
            what the visible caller passes.
        sr: Sample rate of ``vocals_audio`` in Hz.
        confidence_threshold: Minimum voicing probability for a frame to count
            as voiced.

    Returns:
        A float32 array with ``len(vocals_audio)`` entries, 1.0 for singing
        and 0.0 for speech. If detection fails for any reason the error is
        printed and an all-zero mask (everything treated as speech) is
        returned.
    """
    n_samples = len(vocals_audio)
    try:
        if n_samples == 0:
            return np.zeros(0, dtype=np.float32)

        sr_work = 16000
        hop_length = 512

        # Pitch tracking runs at 16 kHz.
        if sr != sr_work:
            vocals_16k = librosa.resample(vocals_audio, orig_sr=sr, target_sr=sr_work)
        else:
            vocals_16k = vocals_audio

        f0, _voiced_flag, voiced_probs = librosa.pyin(
            vocals_16k,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=sr_work,
            frame_length=2048,
            hop_length=hop_length,
        )

        min_run_frames = int(0.3 * sr_work / hop_length)  # at least 0.3 s
        frame_labels = _label_singing_frames(
            f0, voiced_probs, confidence_threshold, min_run_frames
        )

        # Map every input sample straight to its analysis frame. This keeps
        # the mask strictly binary and avoids FFT-resampling a step function,
        # which rings around every transition.
        sample_positions = np.arange(n_samples, dtype=np.int64)
        frame_index = ((sample_positions * sr_work) // (sr * hop_length)).astype(np.intp)
        covered = frame_index < len(frame_labels)
        singing_mask = np.zeros(n_samples, dtype=np.float32)
        singing_mask[covered] = frame_labels[frame_index[covered]]

        # 100 ms smoothing window at the input sample rate.
        return _majority_smooth(singing_mask, int(0.1 * sr))

    except Exception as e:
        print(f"唱歌检测失败: {str(e)}")
        # On failure treat everything as speech.
        return np.zeros(n_samples, dtype=np.float32)


def _label_singing_frames(f0, voiced_probs, confidence_threshold, min_run_frames):
    """Return a per-frame 0/1 array marking voiced runs that qualify as singing."""
    voiced = (np.asarray(voiced_probs) > confidence_threshold) & ~np.isnan(f0)
    labels = np.zeros(len(f0), dtype=np.float32)

    # Pad with zeros so every run has a rising and a falling edge; the edges
    # then alternate start, stop, start, stop, ...
    padded = np.concatenate(([0], voiced.astype(np.int8), [0]))
    edges = np.flatnonzero(np.diff(padded))
    for start, stop in zip(edges[::2], edges[1::2]):
        if stop - start < min_run_frames:
            continue
        # Heuristic kept from the original: pitch spread between 20 and 200 Hz.
        if 20 < np.std(f0[start:stop]) < 200:
            labels[start:stop] = 1.0
    return labels


def _majority_smooth(mask, width):
    """Apply a centred moving average of ``width`` samples, thresholded at 0.5.

    Uses a cumulative sum, so the cost is linear in ``len(mask)`` regardless
    of ``width``, and the output always has the same length as ``mask``.
    """
    if width <= 1:
        return mask
    # Window alignment matches np.convolve(..., mode='same').
    ahead = (width - 1) // 2
    behind = width - 1 - ahead
    padded = np.concatenate((np.zeros(behind), mask, np.zeros(ahead)))
    csum = np.concatenate(([0.0], np.cumsum(padded, dtype=np.float64)))
    window_sum = csum[width:] - csum[:-width]
    return (window_sum / width > 0.5).astype(np.float32)
142
+
143
def process_audio_full(audio_file, singing_sensitivity, enable_singing_detection):
    """Run the full separation pipeline, streaming progress as it goes.

    Steps: load the upload and re-save it as WAV, separate vocals from
    accompaniment with Demucs, optionally detect sung passages in the vocals,
    then write three files -- A (spoken vocals), B (accompaniment plus sung
    vocals) and C (accompaniment only).

    Args:
        audio_file: Path of the uploaded file, or ``None`` if nothing was
            uploaded.
        singing_sensitivity: Passed to ``detect_singing_segments`` as its
            ``confidence_threshold``.
        enable_singing_detection: When false, all vocals are treated as
            speech.

    Yields:
        ``(path_a, path_b, path_c, status_text)`` tuples. The three paths are
        ``None`` on progress updates and on failure; only the final successful
        update carries file paths.
    """
    if audio_file is None:
        # This function is a generator: a plain ``return <value>`` would end
        # it without emitting anything, so the message has to be yielded.
        yield None, None, None, "❌ 请先上传音频文件"
        return

    status_messages = []

    def _progress(message):
        """Record ``message`` and return the whole status log as text."""
        status_messages.append(message)
        return "\n".join(status_messages)

    try:
        # NOTE(review): the output files live in this temporary directory,
        # which is removed once the generator finishes. This presumably relies
        # on the UI framework copying the files when they are yielded --
        # confirm.
        with tempfile.TemporaryDirectory() as tmpdir:
            # 1. Load the upload and normalise it to a WAV file.
            yield None, None, None, _progress("📂 正在加载音频...")

            audio, sr = load_audio_any_format(audio_file, SAMPLE_RATE)
            temp_wav = os.path.join(tmpdir, "input.wav")
            save_audio(temp_wav, audio, sr)

            # 2. Vocals / accompaniment separation.
            yield None, None, None, _progress("🎵 使用 AI 模型分离人声和伴奏(这可能需要几分钟)...")

            vocals_path, instrumental_path = run_demucs_separation(temp_wav, tmpdir)
            vocals, _ = librosa.load(vocals_path, sr=sr, mono=True)
            instrumental, _ = librosa.load(instrumental_path, sr=sr, mono=True)

            # 3. Singing detection (optional).
            if enable_singing_detection:
                yield None, None, None, _progress("🎤 正在检测唱歌片段...")
                singing_mask = detect_singing_segments(
                    vocals, sr,
                    confidence_threshold=singing_sensitivity,
                )
            else:
                singing_mask = np.zeros(len(vocals), dtype=np.float32)

            # 4. Split the vocals into spoken and sung parts.
            yield None, None, None, _progress("✂️ 正在分离对白和背景音乐...")

            dialog_vocals = vocals * (1 - singing_mask)
            singing_vocals = vocals * singing_mask

            # 5. Build the three outputs.
            # B mixes the sung vocals back into the accompaniment, scaled
            # towards the accompaniment's RMS level (gain limited to 0.1-2.0)
            # and hard-clipped to [-1, 1].
            singing_rms = np.sqrt(np.mean(singing_vocals**2) + 1e-8)
            inst_rms = np.sqrt(np.mean(instrumental**2) + 1e-8)
            if singing_rms > 1e-6:
                singing_gain = np.clip(inst_rms / singing_rms, 0.1, 2.0)
            else:
                singing_gain = 1.0

            output_a = dialog_vocals
            output_b = np.clip(instrumental + singing_vocals * singing_gain, -1.0, 1.0)
            output_c = instrumental

            path_a = os.path.join(tmpdir, "A_dialog.wav")
            path_b = os.path.join(tmpdir, "B_bgm_with_singing.wav")
            path_c = os.path.join(tmpdir, "C_instrumental.wav")

            save_audio(path_a, output_a, sr)
            save_audio(path_b, output_b, sr)
            save_audio(path_c, output_c, sr)

            # Summary statistics for the status box.
            total_duration = len(vocals) / sr
            singing_duration = np.sum(singing_mask) / sr
            dialog_duration = total_duration - singing_duration

            status_messages.extend([
                "✅ 分离完成!",
                f" 总时长: {total_duration:.1f}秒",
                f" 对白时长: {dialog_duration:.1f}秒",
                f" 唱歌时长: {singing_duration:.1f}秒",
                f" 设备: {DEVICE.upper()}",
            ])

            yield path_a, path_b, path_c, "\n".join(status_messages)

    except Exception as e:
        # Top-level boundary: report the failure and the steps already done.
        error_msg = f"❌ 处理失败: {str(e)}\n\n已完成步骤:\n" + "\n".join(status_messages)
        yield None, None, None, error_msg
244
+
245
# Build the Gradio interface.
with gr.Blocks(theme=gr.themes.Soft(), title="AI音频分离工具") as demo:
    # Banner text describing the compute device in use.
    device_note = '✅ (GPU加速)' if DEVICE == 'cuda' else '⚠️ (CPU模式,速度较慢)'
    gr.Markdown(f"""
    # 🎵 AI 音频分离工具 - 完整版

    **当前运行设备**: {DEVICE.upper()} {device_note}

    ## 功能说明
    - **A - 前景对白**: 纯说话、旁白、Rap、口号、喊叫
    - **B - 背景音乐**: 伴奏 + 唱歌(主唱/和声/合唱)
    - **C - 纯伴奏**: 去除所有人声的纯音乐

    💡 **核心技术**: 使用 Demucs AI 模型 + 音高连续性检测
    """)

    with gr.Row():
        # Left column: upload, advanced settings and the start button.
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="📁 上传音频或视频文件",
                type="filepath",
            )

            with gr.Accordion("⚙️ 高级设置", open=False):
                enable_detection = gr.Checkbox(
                    label="启用唱歌检测(关闭则所有人声归入对白)",
                    value=True,
                )
                sensitivity = gr.Slider(
                    minimum=0.3,
                    maximum=0.8,
                    value=0.5,
                    step=0.05,
                    label="唱歌检测灵敏度(越高越严格)",
                )
                gr.Markdown("**提示**: 如果唱段漏检,降低灵敏度;如果说话误判为唱歌,提高灵敏度")

            process_btn = gr.Button("🚀 开始分离", variant="primary", size="lg")

        # Right column: streamed status log.
        with gr.Column(scale=1):
            status_box = gr.Textbox(
                label="📊 处理状态",
                lines=10,
                max_lines=15,
            )

    gr.Markdown("---")
    gr.Markdown("## 📥 分离结果")

    with gr.Row():
        output_a = gr.Audio(label="🎤 A - 前景对白(说话/Rap/口号)", type="filepath")
        output_b = gr.Audio(label="🎵 B - 背景音乐(含唱段)", type="filepath")
        output_c = gr.Audio(label="🎹 C - 纯伴奏", type="filepath")

    # Wire the button to the pipeline; input order matches the signature of
    # process_audio_full and output order matches the tuples it yields.
    pipeline_inputs = [audio_input, sensitivity, enable_detection]
    pipeline_outputs = [output_a, output_b, output_c, status_box]
    process_btn.click(
        fn=process_audio_full,
        inputs=pipeline_inputs,
        outputs=pipeline_outputs,
    )

    gr.Markdown("""
    ---
    ## 📌 使用提示

    1. **支持格式**: MP3, WAV, M4A, MP4, MOV 等
    2. **处理时间**: GPU模式下约为音频时长的30%-100%,CPU模式会更慢
    3. **最佳效果**: 建议音频质量较高,背景噪音少
    4. **限制**: 单次建议不超过 10 分钟音频

    ⚠️ **注意**:
    - 第一次运行会自动下载 Demucs 模型(约500MB)
    - 如果使用 CPU,5分钟音频可能需要10-20分钟处理
    - 如遇内存不足,请上传较短的音频片段
    """)

if __name__ == "__main__":
    # Listen on all interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio==4.44.0
2
+ demucs==4.0.1
3
+ torch>=2.1.0
4
+ torchaudio>=2.1.0
5
+ librosa>=0.10.1
6
+ soundfile>=0.12.1
7
+ numpy>=1.23.0
8
+ scipy>=1.10.0