haoyue518 committed on
Commit
afaa18a
·
verified ·
1 Parent(s): da239ff

Upload 5 files

Browse files
Files changed (3) hide show
  1. README.md +3 -27
  2. app.py +157 -617
  3. requirements.txt +2 -2
README.md CHANGED
@@ -4,35 +4,11 @@ emoji: 🎵
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
- sdk_version: 4.44.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- # 🎵 AI Audio Separator
13
 
14
- AI-powered tool to separate dialog and background music from audio/video files.
15
-
16
- ## Features
17
-
18
- - 🎤 Pure dialog track (narration, speech, conversation)
19
- - 🎵 Background music + vocals (singing, rap, harmony)
20
- - 🎹 Pure instrumental (no vocals)
21
-
22
- ## Technology
23
-
24
- - **Demucs 4.0**: Vocal/instrumental separation (95%+ accuracy)
25
- - **Silero VAD**: Speech detection neural network (85-90% accuracy)
26
- - **Local model**: No network download required
27
-
28
- ## Supported Formats
29
-
30
- - Audio: MP3, WAV, M4A, FLAC, OGG, AAC
31
- - Video: MP4, MOV, AVI, MKV, FLV, WMV
32
-
33
- ## Usage
34
-
35
- 1. Upload audio or video file
36
- 2. Choose detection mode (strict/balanced)
37
- 3. Click "Start AI Separation"
38
- 4. Download 3 separated tracks
 
4
  colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 3.50.2
8
  app_file: app.py
9
  pinned: false
10
  ---
11
 
12
+ # 🎵 AI 音频分离工具 (稳定版)
13
 
14
+ 已加载本地 Silero VAD 模型,提供高精度人声/伴奏分离。
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,695 +1,235 @@
1
- import os, tempfile, subprocess
 
 
2
  import gradio as gr
3
  import numpy as np
4
  import soundfile as sf
5
  import librosa
6
  import torch
7
 
8
- # 检查 GPU
9
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
10
  SAMPLE_RATE = 44100
11
 
12
- # 全局变量
13
  SILERO_MODEL = None
14
  SILERO_AVAILABLE = False
15
 
16
  def load_silero_from_local():
17
  """从本地文件加载 Silero VAD 模型"""
18
  global SILERO_MODEL, SILERO_AVAILABLE
19
-
20
  try:
21
- # 尝试从不同位置加载
22
- model_paths = [
23
- "silero_vad.jit", # 根目录
24
- "models/silero_vad.jit", # models 文件夹
25
- "./silero_vad.jit",
26
- "/home/user/app/silero_vad.jit", # HF Spaces 默认路径
27
- ]
28
-
29
- model_path = None
30
- for path in model_paths:
31
- if os.path.exists(path):
32
- model_path = path
33
- break
34
 
35
- if model_path is None:
36
- print("⚠️ 未找到本地 Silero VAD 模型文件")
37
- print(" 请确保 silero_vad.jit 已上传到 Space 根目录")
38
- print(f" 当前工作目录: {os.getcwd()}")
39
- print(f" 目录内容: {os.listdir('.')}")
40
  SILERO_AVAILABLE = False
41
  return False
42
 
43
  print(f"📥 正在从本地加载 Silero VAD: {model_path}")
44
-
45
- # 加载模型
46
  SILERO_MODEL = torch.jit.load(model_path, map_location=DEVICE)
47
  SILERO_MODEL.eval()
48
-
49
- print("✅ Silero VAD 模型加载成功(从本地文件)")
50
  SILERO_AVAILABLE = True
51
  return True
52
-
53
  except Exception as e:
54
- print(f"❌ Silero VAD 加载失败: {str(e)}")
55
- import traceback
56
- traceback.print_exc()
57
  SILERO_AVAILABLE = False
58
  return False
59
 
60
-
61
  def extract_audio_from_video(video_path, output_path):
62
- """从视频中提取音频"""
63
  try:
64
- cmd = [
65
- 'ffmpeg', '-i', video_path,
66
- '-vn',
67
- '-acodec', 'pcm_s16le',
68
- '-ar', str(SAMPLE_RATE),
69
- '-ac', '2',
70
- '-y',
71
  output_path
72
- ]
73
- result = subprocess.run(cmd, capture_output=True, text=True)
74
- if result.returncode != 0:
75
- raise RuntimeError(f"FFmpeg 提取失败: {result.stderr}")
76
- return output_path
77
- except Exception as e:
78
- raise RuntimeError(f"音频提取失败: {str(e)}")
79
-
80
-
81
- def load_audio_any_format(file_path, target_sr=SAMPLE_RATE):
82
- """加载任意格式音频"""
83
- try:
84
- video_extensions = ['.mp4', '.mov', '.avi', '.mkv', '.flv', '.wmv', '.m4v']
85
- file_ext = os.path.splitext(file_path)[1].lower()
86
-
87
- if file_ext in video_extensions:
88
- with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
89
- temp_audio_path = tmp.name
90
- extract_audio_from_video(file_path, temp_audio_path)
91
- audio, sr = librosa.load(temp_audio_path, sr=target_sr, mono=False)
92
- os.unlink(temp_audio_path)
93
- else:
94
- audio, sr = librosa.load(file_path, sr=target_sr, mono=False)
95
-
96
- if audio.ndim == 1:
97
- audio = audio.reshape(1, -1)
98
- return audio, sr
99
- except Exception as e:
100
- raise ValueError(f"音频加载失败: {str(e)}")
101
-
102
-
103
- def save_audio(path, audio, sr):
104
- """保存音频"""
105
- try:
106
- if audio.ndim == 1:
107
- audio = audio.reshape(1, -1)
108
- audio = np.clip(audio, -1.0, 1.0)
109
- sf.write(path, audio.T, sr, subtype="PCM_16")
110
- except Exception as e:
111
- raise RuntimeError(f"音频保存失败: {str(e)}")
112
-
113
-
114
- def run_demucs_separation(audio_path, output_dir):
115
- """使用 Demucs 进行人声/伴奏分离"""
116
- try:
117
- cmd = [
118
- "python", "-m", "demucs.separate",
119
- "--two-stems=vocals",
120
- "-n", "htdemucs",
121
- "--mp3",
122
- "--mp3-bitrate=320",
123
- "-o", output_dir,
124
- audio_path
125
- ]
126
-
127
- result = subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=600)
128
-
129
- base_name = os.path.splitext(os.path.basename(audio_path))[0]
130
- stem_dir = os.path.join(output_dir, "htdemucs", base_name)
131
-
132
- vocals_path = os.path.join(stem_dir, "vocals.mp3")
133
- instrumental_path = os.path.join(stem_dir, "no_vocals.mp3")
134
-
135
- if not os.path.exists(vocals_path):
136
- raise FileNotFoundError(f"Demucs 输出文件不存在: {vocals_path}")
137
-
138
- return vocals_path, instrumental_path
139
-
140
- except subprocess.TimeoutExpired:
141
- raise RuntimeError("处理超时(超过10分钟),请上传较短的音频")
142
  except subprocess.CalledProcessError as e:
143
- raise RuntimeError(f"Demucs 执行失败: {e.stderr}")
144
- except Exception as e:
145
- raise RuntimeError(f"Demucs 分离失败: {str(e)}")
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  def detect_speech_with_silero(vocals_audio, sr):
149
- """使用 Silero VAD 深度学习模型检测说话"""
150
- try:
151
- global SILERO_MODEL
152
- if SILERO_MODEL is None:
153
- raise RuntimeError("Silero 模型未加载")
154
-
155
- # 重采样到 16kHz
156
- if sr != 16000:
157
- vocals_16k = librosa.resample(vocals_audio, orig_sr=sr, target_sr=16000)
158
- sr_work = 16000
159
- else:
160
- vocals_16k = vocals_audio
161
- sr_work = 16000
162
-
163
- # 转换为 torch tensor
164
- audio_tensor = torch.from_numpy(vocals_16k).float().to(DEVICE)
165
-
166
- # 使用 Silero VAD 检测
167
- window_size_samples = 512
168
- speech_probs = []
169
-
170
- with torch.no_grad():
171
- for i in range(0, len(audio_tensor), window_size_samples):
172
- chunk = audio_tensor[i:i+window_size_samples]
173
- if len(chunk) < window_size_samples:
174
- chunk = torch.nn.functional.pad(chunk, (0, window_size_samples - len(chunk)))
175
-
176
- speech_prob = SILERO_MODEL(chunk.unsqueeze(0), sr_work).item()
177
- speech_probs.append(speech_prob)
178
-
179
- # 创建掩码
180
- speech_mask = np.repeat(speech_probs, window_size_samples)[:len(vocals_16k)]
181
- speech_mask = (speech_mask > 0.5).astype(np.float32)
182
-
183
- # 调整回原始采样率
184
- if sr != sr_work:
185
- from scipy.interpolate import interp1d
186
- old_indices = np.linspace(0, 1, len(speech_mask))
187
- new_indices = np.linspace(0, 1, len(vocals_audio))
188
- interpolator = interp1d(old_indices, speech_mask, kind='linear', fill_value='extrapolate')
189
- speech_mask = interpolator(new_indices)
190
-
191
- # 确保长度匹配
192
- if len(speech_mask) != len(vocals_audio):
193
- if len(speech_mask) < len(vocals_audio):
194
- speech_mask = np.pad(speech_mask, (0, len(vocals_audio) - len(speech_mask)))
195
- else:
196
- speech_mask = speech_mask[:len(vocals_audio)]
197
-
198
- speech_mask = (speech_mask > 0.5).astype(np.float32)
199
-
200
- return speech_mask
201
 
202
- except Exception as e:
203
- print(f"Silero VAD 检测失败: {str(e)}")
204
- return None
205
-
206
-
207
- def detect_speech_fallback(vocals_audio, sr):
208
- """传统算法备用方案"""
209
- try:
210
- hop_length = 512
211
- frame_length = 2048
212
-
213
- # 能量
214
- rms = librosa.feature.rms(y=vocals_audio, frame_length=frame_length, hop_length=hop_length)[0]
215
-
216
- # 零交叉率
217
- zcr = librosa.feature.zero_crossing_rate(vocals_audio, frame_length=frame_length, hop_length=hop_length)[0]
218
-
219
- # 频谱质心
220
- spectral_centroids = librosa.feature.spectral_centroid(y=vocals_audio, sr=sr, hop_length=hop_length)[0]
221
-
222
- # 音高检测
223
- try:
224
- f0, voiced_flag, voiced_probs = librosa.pyin(
225
- vocals_audio,
226
- fmin=librosa.note_to_hz('C2'),
227
- fmax=librosa.note_to_hz('C7'),
228
- sr=sr,
229
- frame_length=frame_length,
230
- hop_length=hop_length
231
- )
232
- f0 = np.nan_to_num(f0, nan=0.0)
233
- except:
234
- f0 = np.zeros(len(rms))
235
-
236
- # 归一化
237
- min_len = min(len(rms), len(zcr), len(spectral_centroids), len(f0))
238
- rms = rms[:min_len]
239
- zcr = zcr[:min_len]
240
- spectral_centroids = spectral_centroids[:min_len]
241
- f0 = f0[:min_len]
242
 
243
- # 说话特征得分
244
- zcr_score = np.clip((zcr - 0.05) / 0.15, 0, 1)
245
-
246
- rms_norm = rms / (np.max(rms) + 1e-8)
247
- energy_variation = np.abs(np.gradient(rms_norm))
248
- energy_score = np.clip(energy_variation * 10, 0, 1)
249
-
250
- centroid_variation = np.abs(np.gradient(spectral_centroids))
251
- centroid_score = np.clip(centroid_variation / (np.mean(centroid_variation) + 1e-8), 0, 1)
252
-
253
- pitch_continuity = np.zeros_like(f0)
254
- for i in range(1, len(f0)):
255
- if f0[i] > 0 and f0[i-1] > 0:
256
- pitch_diff = abs(f0[i] - f0[i-1])
257
- if pitch_diff > 50:
258
- pitch_continuity[i] = 1
259
-
260
- # 综合得分
261
- speaking_score = (
262
- 0.30 * zcr_score +
263
- 0.25 * energy_score +
264
- 0.25 * centroid_score +
265
- 0.20 * pitch_continuity
266
- )
267
-
268
- speaking_mask = (speaking_score > 0.6).astype(np.float32)
269
-
270
- # 后处理
271
- min_duration = int(0.2 * sr / hop_length)
272
- i = 0
273
- while i < len(speaking_mask):
274
- if speaking_mask[i] == 1:
275
- j = i
276
- while j < len(speaking_mask) and speaking_mask[j] == 1:
277
- j += 1
278
- if j - i < min_duration:
279
- speaking_mask[i:j] = 0
280
- i = j
281
- else:
282
- i += 1
283
-
284
- # 转换为样本级
285
- speaking_mask_samples = np.repeat(speaking_mask, hop_length)
286
-
287
- if len(speaking_mask_samples) < len(vocals_audio):
288
- speaking_mask_samples = np.pad(speaking_mask_samples, (0, len(vocals_audio) - len(speaking_mask_samples)))
289
- else:
290
- speaking_mask_samples = speaking_mask_samples[:len(vocals_audio)]
291
-
292
- # 平滑
293
- smooth_window = int(0.03 * sr)
294
- if smooth_window > 1:
295
- speaking_mask_samples = np.convolve(
296
- speaking_mask_samples,
297
- np.ones(smooth_window) / smooth_window,
298
- mode='same'
299
- )
300
- speaking_mask_samples = (speaking_mask_samples > 0.5).astype(np.float32)
301
-
302
- return speaking_mask_samples
303
 
304
- except Exception as e:
305
- print(f"传统算法检测失败: {str(e)}")
306
- # 返回全1(假设全是说话)
307
- return np.ones(len(vocals_audio), dtype=np.float32)
308
-
309
-
310
- def detect_singing_hybrid(vocals_audio, sr, mode='strict'):
311
- """混合检测策略:优先使用 Silero VAD,失败则降级"""
312
- try:
313
- global SILERO_AVAILABLE
314
-
315
- if SILERO_AVAILABLE:
316
- print("✅ 使用 Silero VAD 深度学习模型检测")
317
- speech_mask = detect_speech_with_silero(vocals_audio, sr)
318
 
319
- if speech_mask is not None:
320
- if mode == 'strict':
321
- from scipy.ndimage import binary_erosion
322
- kernel_size = int(0.05 * sr)
323
- if kernel_size > 1:
324
- speech_mask = binary_erosion(speech_mask, structure=np.ones(kernel_size)).astype(np.float32)
325
-
326
- singing_mask = 1 - speech_mask
327
- return singing_mask, "Silero VAD"
328
-
329
- # Silero 不可用,使用传统算法
330
- print("⚠️ 使用传统多特征算法")
331
- speech_mask = detect_speech_fallback(vocals_audio, sr)
332
- singing_mask = 1 - speech_mask
333
- return singing_mask, "传统算法"
334
 
335
- except Exception as e:
336
- print(f"检测失败: {str(e)}")
337
- speech_mask = detect_speech_fallback(vocals_audio, sr)
338
- singing_mask = 1 - speech_mask
339
- return singing_mask, "传统算法"
340
-
 
341
 
342
- def process_audio_full(audio_file, detection_mode, enable_detection):
343
- """完整的音频分离流程"""
344
- if audio_file is None:
345
- return None, None, None, "❌ 请先上传音频或视频文件"
346
 
347
- status_messages = []
 
348
 
349
  try:
 
350
  with tempfile.TemporaryDirectory() as tmpdir:
351
- # 1. 加载音频
352
- status_messages.append("📂 正在加载文件...")
353
- yield None, None, None, "\n".join(status_messages)
354
-
355
- input_path = audio_file
356
-
357
- file_ext = os.path.splitext(input_path)[1].lower()
358
- if file_ext in ['.mp4', '.mov', '.avi', '.mkv', '.flv', '.wmv', '.m4v']:
359
- status_messages.append(f"🎬 检测到视频文件 ({file_ext}),正在提取音频...")
360
- yield None, None, None, "\n".join(status_messages)
361
-
362
- audio, sr = load_audio_any_format(input_path, SAMPLE_RATE)
363
-
364
  temp_wav = os.path.join(tmpdir, "input.wav")
365
- save_audio(temp_wav, audio, sr)
366
 
367
- # 2. Demucs 分离
368
- status_messages.append("━━━━━━━━━━━━━━━━━━━━")
369
- status_messages.append("🎵 使用 Demucs AI 模型分离人声和伴奏...")
370
- status_messages.append(" (首次运行会下载模型,约500MB)")
371
- yield None, None, None, "\n".join(status_messages)
 
 
 
 
 
372
 
373
- vocals_path, instrumental_path = run_demucs_separation(temp_wav, tmpdir)
 
 
374
 
375
- vocals, _ = librosa.load(vocals_path, sr=sr, mono=True)
376
- instrumental, _ = librosa.load(instrumental_path, sr=sr, mono=True)
377
 
378
- status_messages.append(" ✅ Demucs 分离完成")
379
- status_messages.append("━━━━━━━━━━━━━━━━━━━━")
 
380
 
381
- # 3. 说话检测
382
- algorithm_used = "无"
 
383
 
384
  if enable_detection:
385
- status_messages.append("")
386
- status_messages.append("🔧 正在初始化 AI 检测器...")
387
-
388
- global SILERO_AVAILABLE
389
  if SILERO_AVAILABLE:
390
- status_messages.append(" Silero VAD 已加载(从本地文件)")
391
- else:
392
- status_messages.append(" ⚠️ Silero VAD 不可用,将使用传统算法")
393
-
394
- yield None, None, None, "\n".join(status_messages)
395
-
396
- status_messages.append("🎤 正在分析音频特征...")
397
- yield None, None, None, "\n".join(status_messages)
398
-
399
- # singing_mask: 1=唱歌, 0=说话
400
- singing_mask, algorithm_used = detect_singing_hybrid(vocals, sr, mode=detection_mode)
401
-
402
- status_messages.append("━━━━━━━━━━━━━━━━━━━━")
403
-
404
- # 醒目标注使用的算法
405
- if algorithm_used == "Silero VAD":
406
- status_messages.append("✅✅✅ 检测器状态: Silero VAD 深度学习")
407
- status_messages.append(" 📈 预期准确率: 85-90%")
408
- status_messages.append(" 🎯 算法类型: 神经网络")
409
- status_messages.append(" 📦 模型来源: 本地文件")
410
  else:
411
- status_messages.append("⚠️⚠️⚠️ 检测器状态: 传统多特征算法")
412
- status_messages.append(" 📉 预期准确率: 75-80%")
413
- status_messages.append(" 🎯 算法类型: 信号处理")
414
-
415
- status_messages.append("━━━━━━━━━━━━━━━━━━━━")
416
- status_messages.append(" ✅ 检测完成")
417
- else:
418
- status_messages.append("⚠️ 已关闭智能检测,所有人声归入对白")
419
- singing_mask = np.zeros(len(vocals), dtype=np.float32)
420
- algorithm_used = "关闭检测"
421
-
422
- # 4. 分离对白和唱歌
423
- status_messages.append("")
424
- status_messages.append("✂️ 正在分离对白和背景音乐...")
425
- yield None, None, None, "\n".join(status_messages)
426
-
427
- dialog_mask = 1 - singing_mask
428
-
429
- dialog_vocals = vocals * dialog_mask
430
- singing_vocals = vocals * singing_mask
431
-
432
- # 5. 生成最终输出
433
- output_a = dialog_vocals
434
-
435
- # 智能混音
436
- singing_rms = np.sqrt(np.mean(singing_vocals**2) + 1e-8)
437
- inst_rms = np.sqrt(np.mean(instrumental**2) + 1e-8)
438
 
439
- if singing_rms > 1e-6:
440
- singing_gain = inst_rms / singing_rms * 0.8
441
- singing_gain = np.clip(singing_gain, 0.1, 1.5)
442
- else:
443
- singing_gain = 1.0
444
-
445
- output_b = np.clip(instrumental + singing_vocals * singing_gain, -1.0, 1.0)
446
- output_c = instrumental
447
 
448
- # 保存文件
449
- status_messages.append("💾 正在保存输出文件...")
450
- yield None, None, None, "\n".join(status_messages)
451
 
452
- if algorithm_used == "Silero VAD":
453
- algo_tag = "SileroVAD"
454
- elif algorithm_used == "传统算法":
455
- algo_tag = "Traditional"
456
- else:
457
- algo_tag = "NoDetect"
458
 
459
- path_a = os.path.join(tmpdir, f"A_dialog_{algo_tag}.wav")
460
- path_b = os.path.join(tmpdir, f"B_bgm_with_singing_{algo_tag}.wav")
461
- path_c = os.path.join(tmpdir, f"C_instrumental_{algo_tag}.wav")
 
462
 
463
- save_audio(path_a, output_a, sr)
464
- save_audio(path_b, output_b, sr)
465
- save_audio(path_c, output_c, sr)
466
 
467
- # 统计信息
468
- total_duration = len(vocals) / sr
469
- dialog_duration = np.sum(dialog_mask) / sr
470
- singing_duration = total_duration - dialog_duration
471
 
472
- status_messages.append("")
473
- status_messages.append("━━━━━━━━━━━━━━━━━━━━")
474
- status_messages.append("✅✅✅ 分离完成!")
475
- status_messages.append("━━━━━━━━━━━━━━━━━━━━")
476
- status_messages.append("")
477
- status_messages.append("📊 统计信息:")
478
- status_messages.append(f" 总时长: {total_duration:.1f} 秒")
479
- status_messages.append(f" 对白时长: {dialog_duration:.1f} 秒 ({dialog_duration/total_duration*100:.1f}%)")
480
- status_messages.append(f" 音乐人声时长: {singing_duration:.1f} 秒 ({singing_duration/total_duration*100:.1f}%)")
481
- status_messages.append(f" 运行设备: {DEVICE.upper()}")
482
- status_messages.append("")
483
-
484
- # 醒目标注使用的算法
485
- if algorithm_used == "Silero VAD":
486
- status_messages.append("🎯 本次使用的检测算法:")
487
- status_messages.append(" ✅✅✅ Silero VAD 深度学习模型")
488
- status_messages.append(" 📈 准确率: 约 85-90%")
489
- status_messages.append(" 🧠 技术: 神经网络(10000+ 小时训练)")
490
- status_messages.append(" 📦 模型来源: 本地文件(无需下载)")
491
- elif algorithm_used == "传统算法":
492
- status_messages.append("🎯 本次使用的检测算法:")
493
- status_messages.append(" ⚠️⚠️⚠️ 传统多特征算法")
494
- status_messages.append(" 📉 准确率: 约 75-80%")
495
- status_messages.append(" 🔧 技术: 能量+零交叉率+频谱+音高")
496
- else:
497
- status_messages.append("🎯 本次使用的检测算法:")
498
- status_messages.append(" ⚪ 未启用检测(所有人声归入对白)")
499
 
500
- status_messages.append("")
501
- status_messages.append("━━━━━━━━━━━━━━━━━━━━")
502
- status_messages.append(f"💾 输出文件已标注算法: {algo_tag}")
503
- status_messages.append("━━━━━━━━━━━━━━━━━━━━")
504
-
505
- yield (
506
- path_a,
507
- path_b,
508
- path_c,
509
- "\n".join(status_messages)
510
- )
511
-
512
  except Exception as e:
513
  import traceback
514
- error_detail = traceback.format_exc()
515
- error_msg = f"❌ 处理失败:\n{str(e)}\n\n已完成步骤:\n" + "\n".join(status_messages)
516
- error_msg += f"\n\n详细错误:\n{error_detail}"
517
- yield None, None, None, error_msg
518
-
519
-
520
- # ===== 启动时加载 Silero VAD =====
521
- print("=" * 60)
522
- print("🚀 正在初始化 AI 音频分离工具...")
523
- print("=" * 60)
524
-
525
- # 尝试加载 Silero VAD
526
- silero_loaded = load_silero_from_local()
527
-
528
- if silero_loaded:
529
- print("✅ Silero VAD 已就绪(高准确率模式 85-90%)")
530
- else:
531
- print("⚠️ Silero VAD 不可用,将使用传统算法(准确率 75-80%)")
532
-
533
- print("=" * 60)
534
-
535
-
536
- # 创建 Gradio 界面
537
- with gr.Blocks(theme=gr.themes.Soft(), title="AI音频分离工具") as demo:
538
- gr.Markdown(f"""
539
- # 🎵 AI 音频分离工具 - Silero VAD 本地版
540
-
541
- **当前运行设备**: {DEVICE.upper()} {'✅ GPU加速' if DEVICE == 'cuda' else '⚠️ CPU模式'}
542
-
543
- **Silero VAD 状态**: {'✅ 已加载(本地文件,准确率 85-90%)' if SILERO_AVAILABLE else '⚠️ 未加载(使用传统算法,准确率 75-80%)'}
544
-
545
- ---
546
-
547
- ## 功能说明
548
-
549
- 本工具将音频/视频分离为 3 个独立轨道:
550
-
551
- - **🎤 A - 纯对白**:旁白、解说、对话(说话的部分)
552
- - **🎵 B - 背景音乐+人声**:伴奏 + 唱歌 + Rap + 和声
553
- - **🎹 C - 纯伴奏**:去除所有人声的纯音乐
554
-
555
- ---
556
-
557
- ## 💡 核心技术
558
-
559
- 1. **Demucs 4.0** 深度学习模型
560
- - 人声/伴奏分离(准确率 > 95%)
561
- - Meta AI 开发
562
-
563
- 2. **Silero VAD** 神经网络(如已加载)
564
- - 说话检测(准确率 85-90%)
565
- - 10000+ 小时训练数据
566
- - **从本地加载,无需网络下载**
567
-
568
- 3. **传统多特征算法**(备用)
569
- - 能量、零交叉率、频谱、音高融合
570
- - 准确率 75-80%
571
-
572
- ---
573
-
574
- ## 📋 使用场景
575
-
576
- ✅ **适合的场景**:
577
- - 短视频二次创作(提取对白/BGM)
578
- - 播客音频编辑
579
- - 教学视频字幕制作
580
- - 音乐制作(提取伴奏)
581
-
582
- ⚠️ **有挑战的场景**:
583
- - 说唱风格的旁白
584
- - 快速说话 + 强背景音乐
585
- - 唱歌式说话
586
- """)
587
 
588
  with gr.Row():
589
  with gr.Column(scale=1):
590
- audio_input = gr.File(
591
- label="📁 上传音频或视频文件",
592
- file_types=["audio", "video"],
593
- type="filepath"
594
- )
595
 
596
- gr.Markdown("""
597
- **支持格式**:
598
- - 🎵 音频: MP3, WAV, M4A, FLAC, OGG, AAC
599
- - 🎬 视频: MP4, MOV, AVI, MKV, FLV, WMV
600
- """)
601
 
602
- with gr.Accordion("⚙️ 高级设置", open=True):
603
- enable_detection = gr.Checkbox(
604
- value=True,
605
- label="🎯 启用智能说话检测(推荐开启)"
606
- )
607
- detection_mode = gr.Radio(
608
- choices=[
609
- ("严格模式 - 只保留明确的说话/旁白", "strict"),
610
- ("平衡模式 - 包含部分 Rap/快语", "balanced")
611
- ],
612
- value="strict",
613
- label="检测模式"
614
- )
615
- gr.Markdown("""
616
- **模式说明**:
617
- - **严格模式**(推荐):只有清晰的说话才归入对白,唱歌/Rap 归入 BGM
618
- - **平衡模式**:包含部分 Rap 风格的说话,边界更宽松
619
-
620
- **效果不满意?**
621
- - 说话被误判为唱歌 → 试试"平衡模式"
622
- - 唱歌被误判为说话 → 保持"严格模式"
623
- """)
624
 
625
- process_btn = gr.Button("🚀 开始 AI 智能分离", variant="primary", size="lg")
626
 
627
  with gr.Column(scale=1):
628
- status_box = gr.Textbox(
629
- label="📊 处理状态(实时显示)",
630
- lines=25,
631
- max_lines=30,
632
- show_label=True
633
- )
634
-
635
- gr.Markdown("---")
636
- gr.Markdown("## 📥 分离结果(点击播放预览,右键下载)")
637
-
638
- with gr.Row():
639
- output_a = gr.Audio(label="🎤 A - 纯对白(旁白/解说/对话)", type="filepath")
640
- output_b = gr.Audio(label="🎵 B - 背景音乐+人声(含唱歌/Rap)", type="filepath")
641
- output_c = gr.Audio(label="🎹 C - 纯伴奏(无人声)", type="filepath")
642
-
643
- process_btn.click(
644
  fn=process_audio_full,
645
- inputs=[audio_input, detection_mode, enable_detection],
646
- outputs=[output_a, output_b, output_c, status_box]
647
  )
648
-
649
- gr.Markdown(f"""
650
- ---
651
-
652
- ## 📌 技术说明
653
-
654
- ### 🎯 当前配置
655
-
656
- | 项目 | 状态 |
657
- |------|------|
658
- | **运行设备** | {DEVICE.upper()} {'(GPU 加速)' if DEVICE == 'cuda' else '(CPU 模式)'} |
659
- | **Silero VAD** | {'✅ 已加载(本地,准确率 85-90%)' if SILERO_AVAILABLE else '❌ 未加载(使用传统算法,准确率 75-80%)'} |
660
- | **Demucs 模型** | htdemucs(人声/伴奏分离) |
661
- | **输出格式** | WAV(无损,44.1kHz)|
662
-
663
- ### 💡 使用建议
664
-
665
- 1. **首次使用**:会下载 Demucs 模型(约 500MB),需 3-5 分钟
666
- 2. **处理时间**:1 分钟音频约需 10-30 秒(取决于设备)
667
- 3. **最佳效果**:上传清晰音质的文件
668
- 4. **文件大小**:建议单个文件 < 50MB,时长 < 5 分钟
669
-
670
- ### 🔧 如果 Silero VAD 未加载
671
-
672
- 说明 `silero_vad.jit` 文件未正确上传,请检查:
673
-
674
- 1. 文件是否在 Space 根目录
675
- 2. 文件名是否为 `silero_vad.jit`(全小写)
676
- 3. 文件大小约 1.4MB
677
-
678
- 即使没有 Silero VAD,传统算法也能提供 75-80% 的准确率。
679
-
680
- ---
681
-
682
- ## 📊 算法对比
683
-
684
- | 检测算法 | 准确率 | 速度 | 依赖 |
685
- |---------|-------|------|------|
686
- | **Silero VAD** | **85-90%** | 快 | 本地模型文件 |
687
- | **传统算法** | **75-80%** | 很快 | 无 |
688
-
689
- ---
690
-
691
- **提示**: 处理完成后,文件名会标注使用的算法(SileroVAD 或 Traditional)
692
- """)
693
 
694
  if __name__ == "__main__":
695
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
1
+ import os
2
+ import tempfile
3
+ import subprocess
4
  import gradio as gr
5
  import numpy as np
6
  import soundfile as sf
7
  import librosa
8
  import torch
9
 
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Sample rate (Hz) used when loading, converting and writing audio.
SAMPLE_RATE = 44100

# Module-level holder for the Silero VAD model; None until a load succeeds.
SILERO_MODEL = None
# True only after the Silero VAD model has been loaded successfully.
SILERO_AVAILABLE = False
17
 
def load_silero_from_local():
    """Load the Silero VAD TorchScript model from a local file.

    Checks a short list of relative paths for ``silero_vad.jit``, loads the
    first one that exists onto ``DEVICE`` and puts it in eval mode. The
    result is published through the module globals ``SILERO_MODEL`` and
    ``SILERO_AVAILABLE``.

    Returns:
        bool: True if the model was loaded; False if no model file was found
        or loading failed. Failures are printed, never raised.
    """
    global SILERO_MODEL, SILERO_AVAILABLE

    # Candidate locations, checked in order (relative to the working directory).
    candidates = ("silero_vad.jit", "models/silero_vad.jit", "./silero_vad.jit")
    model_path = next((p for p in candidates if os.path.exists(p)), None)

    if not model_path:
        print("⚠️ 未找到本地 Silero VAD 模型文件,将使用传统算法")
        SILERO_MODEL = None
        SILERO_AVAILABLE = False
        return False

    try:
        print(f"📥 正在从本地加载 Silero VAD: {model_path}")
        # Keep the model in a local until it is fully initialised, so a
        # failure in eval() cannot leave a half-set-up model in the global.
        model = torch.jit.load(model_path, map_location=DEVICE)
        model.eval()
    except Exception as e:
        # Best-effort: the app keeps running without VAD.
        print(f"❌ Silero VAD 加载失败: {e}")
        SILERO_MODEL = None
        SILERO_AVAILABLE = False
        return False

    SILERO_MODEL = model
    SILERO_AVAILABLE = True
    return True
40
 
 
def extract_audio_from_video(video_path, output_path):
    """Extract the audio track of a video into a WAV file with ffmpeg.

    The audio is written as 16-bit PCM, stereo, at ``SAMPLE_RATE`` Hz; an
    existing file at ``output_path`` is overwritten.

    Args:
        video_path: Path of the input video file.
        output_path: Path of the WAV file to create.

    Returns:
        ``output_path``, for convenience.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status. The original
            ``CalledProcessError`` is attached as the cause.
    """
    cmd = [
        "ffmpeg", "-i", video_path,
        "-vn",                    # drop the video stream
        "-acodec", "pcm_s16le",
        "-ar", str(SAMPLE_RATE),
        "-ac", "2",
        "-y",                     # overwrite the output file
        output_path,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"FFmpeg 错误: {e}")
        # str(e) only carries the command and exit status; the actual reason
        # is in the captured stderr, so print that as well.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        if stderr_text:
            print(stderr_text)
        raise RuntimeError("无法从视频提取音频,请检查文件格式") from e
    return output_path
 
56
 
def run_demucs_separation(audio_path, output_dir):
    """Split an audio file into vocals and accompaniment with Demucs.

    Runs ``demucs.separate`` as a subprocess with the ``htdemucs`` model in
    two-stem (vocals / no_vocals) mode, writing 320 kbps MP3 files under
    ``<output_dir>/htdemucs/<input base name>/``.

    Args:
        audio_path: Path of the audio file to separate.
        output_dir: Directory that receives the Demucs output tree.

    Returns:
        tuple[str, str]: Paths of ``vocals.mp3`` and ``no_vocals.mp3``.

    Raises:
        subprocess.CalledProcessError: If Demucs exits with a non-zero status.
        FileNotFoundError: If Demucs finished but an expected stem is missing.
    """
    import sys

    cmd = [
        # Use the interpreter running this app so the subprocess sees the
        # same installed packages; a bare "python" may resolve elsewhere.
        sys.executable or "python", "-m", "demucs.separate",
        "--two-stems=vocals",     # only vocals vs. everything else
        "-n", "htdemucs",
        "--mp3", "--mp3-bitrate=320",
        "-o", output_dir,
        audio_path,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # The exception text omits stderr; print it so the failure is diagnosable.
        if e.stderr:
            print(e.stderr.strip())
        raise

    # Demucs names the output folder after the input file's base name.
    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    stem_dir = os.path.join(output_dir, "htdemucs", base_name)
    vocals_path = os.path.join(stem_dir, "vocals.mp3")
    no_vocals_path = os.path.join(stem_dir, "no_vocals.mp3")

    for stem_path in (vocals_path, no_vocals_path):
        if not os.path.exists(stem_path):
            raise FileNotFoundError(f"Demucs output not found: {stem_path}")

    return vocals_path, no_vocals_path
74
 
def detect_speech_with_silero(vocals_audio, sr):
    """Mark which samples of a vocal track Silero VAD classifies as speech.

    The audio is resampled to 16 kHz, fed to the model in 512-sample
    windows, and each window's speech probability is thresholded at 0.5.
    The per-window decisions are then mapped back onto the original samples.

    Args:
        vocals_audio: Audio samples at rate ``sr``. Expected to be a 1-D
            (mono) float array, as the caller in this file loads it with
            ``mono=True``.
        sr: Sample rate of ``vocals_audio`` in Hz.

    Returns:
        A float32 array with one value per input sample (1.0 = speech,
        0.0 = not speech), or None if the Silero model is not loaded.
    """
    # Compare against None explicitly: truthiness of a module object is not
    # a reliable "is it loaded" test.
    if SILERO_MODEL is None:
        return None

    n_samples = len(vocals_audio)
    if n_samples == 0:
        return np.zeros(0, dtype=np.float32)

    vad_sr = 16000   # the VAD model is run at 16 kHz
    window = 512     # samples per inference window at 16 kHz

    if sr != vad_sr:
        vocals_16k = librosa.resample(vocals_audio, orig_sr=sr, target_sr=vad_sr)
    else:
        vocals_16k = vocals_audio

    audio_tensor = torch.from_numpy(vocals_16k).float().to(DEVICE)

    # Silero VAD keeps recurrent state between calls; clear it so a previous
    # file does not influence this one. Guarded because not every exported
    # model necessarily exposes reset_states().
    reset_states = getattr(SILERO_MODEL, "reset_states", None)
    if callable(reset_states):
        reset_states()

    frame_probs = []
    with torch.no_grad():
        for start in range(0, len(audio_tensor), window):
            chunk = audio_tensor[start:start + window]
            if len(chunk) < window:
                # Zero-pad the final partial window to the expected size.
                chunk = torch.nn.functional.pad(chunk, (0, window - len(chunk)))
            frame_probs.append(SILERO_MODEL(chunk.unsqueeze(0), vad_sr).item())

    if not frame_probs:
        # Resampling produced no samples (extremely short input).
        return np.zeros(n_samples, dtype=np.float32)

    frame_is_speech = np.asarray(frame_probs) > 0.5

    # Map every original-rate sample to the 16 kHz window covering the same
    # instant: 16 kHz index = i * 16000 / sr, window index = that // 512.
    frame_idx = (np.arange(n_samples, dtype=np.int64) * vad_sr) // (int(sr) * window)
    np.minimum(frame_idx, len(frame_is_speech) - 1, out=frame_idx)

    return frame_is_speech[frame_idx].astype(np.float32)
111
 
def process_audio_full(input_file, mode_selection, enable_detection):
    """Gradio generator callback: split an upload into three audio tracks.

    Pipeline: convert the upload to WAV (via ffmpeg for video files), run
    Demucs to get vocals and accompaniment, optionally use Silero VAD to
    decide which vocal samples are spoken dialogue, then write:

      * Track A - dialogue: vocals * mask
      * Track B - background: accompaniment + vocals * (1 - mask)
      * Track C - accompaniment only

    Args:
        input_file: The uploaded file. Either an object with a ``.name``
            path attribute or a plain path string; None if nothing was
            uploaded.
        mode_selection: Value of the sensitivity radio button.
            NOTE(review): accepted for interface compatibility but not used
            by the processing below - confirm whether that is intended.
        enable_detection: Whether to run the Silero VAD dialogue detection.

    Yields:
        Tuples ``(path_a, path_b, path_c, log_text)``. The three paths are
        None on every progress update and on failure; only the final
        successful update carries the output file paths.
    """
    if input_file is None:
        # This function is a generator, so a `return <value>` would never be
        # delivered to the UI; the message has to be yielded.
        yield None, None, None, "❌ 请先上传文件!"
        return

    logs = ["🚀 开始处理任务..."]
    yield None, None, None, "\n".join(logs)

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Accept both a file wrapper exposing .name and a plain path.
            input_path = getattr(input_file, "name", input_file)
            temp_wav = os.path.join(tmpdir, "input.wav")

            # 1. Pre-processing: extract audio from video, or convert audio to WAV.
            video_exts = ('.mp4', '.mov', '.avi', '.mkv', '.flv', '.wmv')
            if input_path.lower().endswith(video_exts):
                logs.append("🎬 检测到视频文件,正在提取音频...")
                yield None, None, None, "\n".join(logs)
                extract_audio_from_video(input_path, temp_wav)
            else:
                logs.append("🎵 加载音频文件...")
                audio, sr = librosa.load(input_path, sr=SAMPLE_RATE, mono=False)
                if audio.ndim == 1:
                    audio = audio.reshape(1, -1)
                sf.write(temp_wav, audio.T, sr, subtype="PCM_16")

            # 2. Demucs vocal / accompaniment separation.
            logs.append("🤖 正在运行 Demucs 分离人声与伴奏 (可能需要几分钟)...")
            yield None, None, None, "\n".join(logs)

            vocals_path, inst_path = run_demucs_separation(temp_wav, tmpdir)

            vocals, sr = librosa.load(vocals_path, sr=SAMPLE_RATE, mono=True)
            instrumental, _ = librosa.load(inst_path, sr=SAMPLE_RATE, mono=True)

            # The two stems are decoded from separate files; trim to a common
            # length so the element-wise mixing below cannot fail on a mismatch.
            common_len = min(len(vocals), len(instrumental))
            vocals = vocals[:common_len]
            instrumental = instrumental[:common_len]

            # 3. Dialogue detection. Default mask keeps all vocals as dialogue.
            mask = np.ones_like(vocals)
            detection_info = "未启用检测"

            if enable_detection:
                if SILERO_AVAILABLE:
                    logs.append("🧠 正在使用本地 Silero VAD 模型识别纯对白...")
                    yield None, None, None, "\n".join(logs)

                    vad_mask = detect_speech_with_silero(vocals, sr)
                    if vad_mask is not None:
                        mask = vad_mask
                        detection_info = "Silero VAD (本地模型)"
                else:
                    logs.append("⚠️ 本地 VAD 模型未加载,跳过智能检测")

            # 4. Build the three tracks. Non-dialogue vocals (e.g. singing)
            #    are added back into the background track.
            singing_mask = 1 - mask

            track_dialogue = vocals * mask
            # The sum of two full-scale signals can leave [-1, 1]; clip it
            # before it is written out as PCM.
            track_bgm_plus = np.clip(instrumental + (vocals * singing_mask), -1.0, 1.0)
            track_instrumental = instrumental

            # 5. Export.
            path_a = os.path.join(tmpdir, "Track_A_Dialogue.wav")
            path_b = os.path.join(tmpdir, "Track_B_Background.wav")
            path_c = os.path.join(tmpdir, "Track_C_Instrumental.wav")

            sf.write(path_a, track_dialogue, sr)
            sf.write(path_b, track_bgm_plus, sr)
            sf.write(path_c, track_instrumental, sr)

            logs.append(f"✅ 处理完成!\n检测模式: {detection_info}")
            logs.append("📂 可以在下方下载三个分离轨道")

            # NOTE(review): these paths live inside tmpdir, which is deleted
            # once the generator resumes after this yield. This presumably
            # relies on Gradio copying the files while it handles the yielded
            # value - confirm for the Gradio version in use.
            yield path_a, path_b, path_c, "\n".join(logs)

    except Exception as e:
        import traceback
        traceback.print_exc()
        logs.append(f"❌ 发生严重错误: {str(e)}")
        yield None, None, None, "\n".join(logs)
194
+
# --- Load the VAD model once at startup ---
print("⏳ 正在初始化系统...")
load_silero_from_local()

# --- Gradio UI (written against the Gradio 3.x API) ---
# NOTE(review): the nesting below is reconstructed from statement order;
# confirm the layout against the deployed app.
with gr.Blocks(title="AI 音频分离专家", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎵 AI 音频分离专家 (修复版)
        **功能**:上传视频或音频,自动分离出 **纯对白**、**背景声(含唱歌)** 和 **纯伴奏**。
        """
    )

    with gr.Row():
        # Left column: upload, options, run button and log.
        with gr.Column(scale=1):
            input_file = gr.File(label="📁 上传文件 (支持 MP4/MP3/WAV 等)", file_types=["audio", "video"])

            with gr.Group():
                chk_detect = gr.Checkbox(label="启用智能对白检测 (Silero VAD)", value=True, interactive=True)
                radio_mode = gr.Radio(["标准模式", "严格模式"], label="检测灵敏度", value="标准模式")

            btn_run = gr.Button("🚀 开始分离处理", variant="primary", size="lg")

            status_log = gr.Textbox(label="运行日志", placeholder="等待任务开始...", lines=8, max_lines=12)

        # Right column: the three separated tracks.
        with gr.Column(scale=1):
            gr.Markdown("### 🎧 分离结果下载")
            out_a = gr.Audio(label="🎤 A轨: 纯对白 (旁白/对话)", type="filepath")
            out_b = gr.Audio(label="🎼 B轨: 背景 (BGM + 唱歌/Rap)", type="filepath")
            # NOTE(review): "无任何通过" in this label looks like a typo
            # (possibly meant "无任何人声", i.e. "no vocals") - confirm.
            out_c = gr.Audio(label="🎹 C轨: 纯伴奏 (无任何通过)", type="filepath")

    # Wire the button to the processing generator; outputs map one-to-one
    # onto the 4-tuples it yields.
    btn_run.click(
        fn=process_audio_full,
        inputs=[input_file, radio_mode, chk_detect],
        outputs=[out_a, out_b, out_c, status_log]
    )

if __name__ == "__main__":
    # Enable the request queue (needed for generator callbacks to stream
    # progress). max_size caps how many requests may wait in the queue; it
    # does not set how many run concurrently.
    demo.queue(max_size=10).launch(server_name="0.0.0.0", server_port=7860, show_error=True)
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
- gradio==4.19.0
2
  torch==2.0.1
3
  torchaudio==2.0.2
4
- demucs
5
  librosa==0.10.1
6
  soundfile==0.12.1
7
  numpy==1.24.3
 
1
+ gradio==3.50.2
2
  torch==2.0.1
3
  torchaudio==2.0.2
4
+ demucs==4.0.1
5
  librosa==0.10.1
6
  soundfile==0.12.1
7
  numpy==1.24.3