Spaces:

haoyue518
/

audio-separator

Sleeping

App Files Files Community

haoyue518 commited on Nov 19

Commit

f5d2e4c

verified ·

1 Parent(s): 9aed1d6

Upload 3 files

Browse files

Files changed (2) hide show

app.py +116 -121
requirements.txt +1 -2

app.py CHANGED Viewed

@@ -5,18 +5,34 @@ import soundfile as sf
 import librosa
 import torch
-# 加载 Silero VAD 模型（用于检测说话）
-try:
-    from silero_vad import load_silero_vad, get_speech_timestamps
-    SILERO_AVAILABLE = True
-except:
-    SILERO_AVAILABLE = False
-    print("⚠️ Silero VAD 不可用，将使用传统算法")
 # 检查 GPU
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SAMPLE_RATE = 44100
 def extract_audio_from_video(video_path, output_path):
     """从视频中提取音频"""
     try:
@@ -107,6 +123,10 @@ def detect_speech_with_silero(vocals_audio, sr):
     返回：speech_mask (1=说话, 0=其他)
     """
     try:
         # 重采样到 16kHz（Silero VAD 要求）
         if sr != 16000:
             vocals_16k = librosa.resample(vocals_audio, orig_sr=sr, target_sr=16000)
@@ -115,31 +135,25 @@ def detect_speech_with_silero(vocals_audio, sr):
             vocals_16k = vocals_audio
             sr_work = 16000
-        # 加载模型
-        model = load_silero_vad()
         # 转换为 torch tensor
-        audio_tensor = torch.from_numpy(vocals_16k).float()
-        # 获取说话时间戳
-        speech_timestamps = get_speech_timestamps(
-            audio_tensor,
-            model,
-            threshold=0.5,  # 检测阈值
-            sampling_rate=sr_work,
-            min_speech_duration_ms=250,  # 最短说话时长
-            min_silence_duration_ms=100,  # 最短静音时长
-            window_size_samples=512,
-            speech_pad_ms=30
-        )
-        # 创建掩码
-        speech_mask = np.zeros(len(vocals_16k), dtype=np.float32)
-        for ts in speech_timestamps:
-            start = ts['start']
-            end = ts['end']
-            speech_mask[start:end] = 1.0
         # 调整回原始采样率
         if sr != sr_work:
@@ -164,50 +178,12 @@ def detect_speech_with_silero(vocals_audio, sr):
         print(f"Silero VAD 检测失败: {str(e)}")
         import traceback
         traceback.print_exc()
-        # 失败时返回全零
         return np.zeros(len(vocals_audio), dtype=np.float32)
-def detect_singing_hybrid(vocals_audio, sr, mode='strict'):
-    """
-    混合检测策略：
-    1. 先用 Silero VAD 检测"说话"
-    2. 其余全部归入"唱歌/音乐"
-    mode='strict': 严格模式，只有明确的说话才归入对白
-    mode='balanced': 平衡模式，包含部分 Rap
-    """
-    try:
-        if SILERO_AVAILABLE:
-            print("🎯 使用 Silero VAD 深度学习模型检测说话...")
-            speech_mask = detect_speech_with_silero(vocals_audio, sr)
-        else:
-            print("⚠️ Silero 不可用，使用传统算法...")
-            speech_mask = detect_speech_fallback(vocals_audio, sr)
-        if mode == 'strict':
-            # 严格模式：只保留明确的说话
-            # 缩小说话区域，避免误判
-            from scipy.ndimage import binary_erosion
-            kernel_size = int(0.05 * sr)  # 50ms
-            if kernel_size > 1:
-                speech_mask = binary_erosion(speech_mask, structure=np.ones(kernel_size)).astype(np.float32)
-        # 说话 = 1, 唱歌 = 0
-        # 我们需要返回唱歌掩码，所以要反转
-        singing_mask = 1 - speech_mask
-        return singing_mask
-    except Exception as e:
-        print(f"检测失败: {str(e)}")
-        return np.ones(len(vocals_audio), dtype=np.float32)  # 全部归入唱歌
 def detect_speech_fallback(vocals_audio, sr):
     """传统算法备用方案（当 Silero 不可用时）"""
     try:
-        # 使用能量 + 零交叉率检测说话
         hop_length = 512
         # 能量
@@ -234,6 +210,40 @@ def detect_speech_fallback(vocals_audio, sr):
         return np.zeros(len(vocals_audio), dtype=np.float32)
 def process_audio_full(audio_file, detection_mode, enable_singing_detection):
     """完整的音频分离流程"""
     if audio_file is None:
@@ -243,6 +253,17 @@ def process_audio_full(audio_file, detection_mode, enable_singing_detection):
     try:
         with tempfile.TemporaryDirectory() as tmpdir:
             # 1. 加载音频
             status_messages.append("📂 正在加载文件...")
             yield None, None, None, "\n".join(status_messages)
@@ -261,10 +282,6 @@ def process_audio_full(audio_file, detection_mode, enable_singing_detection):
             # 2. Demucs 分离
             status_messages.append("🎵 使用 Demucs AI 模型分离人声和伴奏...")
-            if SILERO_AVAILABLE:
-                status_messages.append("   ✅ 已启用 Silero VAD 深度学习检测器")
-            else:
-                status_messages.append("   ⚠️ 使用传统算法（准确率较低）")
             yield None, None, None, "\n".join(status_messages)
             vocals_path, instrumental_path = run_demucs_separation(temp_wav, tmpdir)
@@ -274,10 +291,9 @@ def process_audio_full(audio_file, detection_mode, enable_singing_detection):
             # 3. 说话/唱歌检测
             if enable_singing_detection:
-                status_messages.append("🎤 正在检测说话片段（AI深度学习）...")
                 yield None, None, None, "\n".join(status_messages)
-                # singing_mask: 1=唱歌, 0=说话
                 singing_mask = detect_singing_hybrid(vocals, sr, mode=detection_mode)
             else:
                 status_messages.append("⚠️ 已关闭智能检测，所有人声归入对白")
@@ -287,7 +303,7 @@ def process_audio_full(audio_file, detection_mode, enable_singing_detection):
             status_messages.append("✂️ 正在分离对白和背景音乐...")
             yield None, None, None, "\n".join(status_messages)
-            dialog_mask = 1 - singing_mask  # 说话掩码
             dialog_vocals = vocals * dialog_mask
             singing_vocals = vocals * singing_mask
@@ -334,12 +350,10 @@ def process_audio_full(audio_file, detection_mode, enable_singing_detection):
             status_messages.append(f"   运行设备: {DEVICE.upper()}")
             if enable_singing_detection:
-                if SILERO_AVAILABLE:
                     status_messages.append(f"\n💡 检测算法: Silero VAD 深度学习")
-                    status_messages.append(f"   准确率: 约 85-90%")
                 else:
                     status_messages.append(f"\n💡 检测算法: 传统信号处理")
-                    status_messages.append(f"   准确率: 约 70-75%")
             status_messages.append(f"━━━━━━━━━━━━━━━━━━━━")
@@ -363,17 +377,16 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI音频分离工具") as demo:
     gr.Markdown(f"""
     # 🎵 AI 音频分离工具 - 深度学习版
-    **当前运行设备**: {DEVICE.upper()} {'✅ GPU加速' if DEVICE == 'cuda' else '⚠️ CPU模式'}
-    **AI检测器**: {'✅ Silero VAD (深度学习)' if SILERO_AVAILABLE else '⚠️ 传统算法'}
-    ## 功能说明（新定义）
     - **A - 纯对白**: 旁白、解说、对话（不含Rap/口号）
     - **B - 背景音乐+人声**: 伴奏 + 唱歌 + Rap + 和声
     - **C - 纯伴奏**: 去除所有人声的纯音乐
     💡 **核心技术**:
     - Demucs 4.0 深度学习模型（人声/伴奏分离）
-    - Silero VAD 神经网络（说话检测，准确率 85%+）
     """)
     with gr.Row():
@@ -405,7 +418,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI音频分离工具") as demo:
                 )
                 gr.Markdown("""
                 **模式说明**:
-                - **严格模式**（推荐）：只有清晰的说话才归入对白，Rap/口号归入背景音乐
                 - **平衡模式**：包含部分 Rap 风格的说话
                 💡 **大部分场景用严格模式效果最好！**
@@ -437,53 +450,35 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI音频分离工具") as demo:
     gr.Markdown("""
     ---
-    ## 📌 技术说明
-    ### 🎯 为什么改成"纯对白"定义
-    根据实际测试，我们发现：
-    - **Rap 介于说话和唱歌之间**，传统算法很难区分
-    - **大部分用户真正需要的是"旁白/解说"**，而不是 Rap
-    - **唱歌检测的核心难点在于 Rap**（它有节奏但不是旋律）
-    因此新版本：
-    - ✅ A区域：只保留纯说话（旁白、对话、解说）
-    - ✅ B区域：包含所有"有节奏感的人声"（唱歌、Rap、和声、口号）
-    - ✅ C区域：纯音乐（无人声）
-    ### 🧠 Silero VAD 深度学习模型
-    - **训练数据**: 超过 10000 小时的语音数据
-    - **准确率**: 说话检测准确率 85-90%
-    - **优势**: 专门训练识别"自然说话"，对 Rap/唱歌免疫
-    - **开源**: 完全免费，MIT 协议
-    ### ⚠️ 仍然存在的局限
-    即使用深度学习，以下场景仍有挑战：
-    - **说唱风格旁白**（如快速口播广告）
-    - **唱歌式说话**（如儿童节目主持）
-    - **多人快速对话 + 背景音乐**
-    这些边缘情况需要**专门训练的分类器**，超出了通用工具的范围。
-    ### 💡 使用建议
-    1. **优先用严格模式**
-    2. 如果对白被漏掉太多，试试平衡模式
-    3. 如果还不满意，考虑：
-       - 在专业音频软件中手动编辑
-       - 使用付费商业软件（如 Adobe Audition）
-       - 训练专门的分类模型（需要大量数据）
-    ### 🔬 技术对比
-    | 方法 | 准确率 | 优点 | 缺点 |
-    |------|--------|------|------|
-    | 音高检测 | 60-70% | 简单快速 | 误判 Rap |
-    | 多特征融合 | 70-75% | 准确率提升 | 仍难处理边缘情况 |
-    | **Silero VAD** | **85-90%** | **专���训练** | **需要网络下载模型** |
-    | 商业软件 | 95%+ | 接近完美 | 付费、闭源 |
     """)
 if __name__ == "__main__":

 import librosa
 import torch
 # 检查 GPU
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SAMPLE_RATE = 44100
+# 全局加载 Silero VAD 模型
+SILERO_MODEL = None
+def load_silero_model():
+    """加载 Silero VAD 模型"""
+    global SILERO_MODEL
+    if SILERO_MODEL is None:
+        try:
+            print("📥 正在下载 Silero VAD 模型...")
+            SILERO_MODEL, utils = torch.hub.load(
+                repo_or_dir='snakers4/silero-vad',
+                model='silero_vad',
+                force_reload=False,
+                onnx=False
+            )
+            SILERO_MODEL = SILERO_MODEL.to(DEVICE)
+            print("✅ Silero VAD 模型加载成功")
+            return True
+        except Exception as e:
+            print(f"⚠️ Silero VAD 加载失败: {str(e)}")
+            SILERO_MODEL = None
+            return False
+    return True
 def extract_audio_from_video(video_path, output_path):
     """从视频中提取音频"""
     try:
     返回：speech_mask (1=说话, 0=其他)
     """
     try:
+        global SILERO_MODEL
+        if SILERO_MODEL is None:
+            raise RuntimeError("Silero 模型未加载")
         # 重采样到 16kHz（Silero VAD 要求）
         if sr != 16000:
             vocals_16k = librosa.resample(vocals_audio, orig_sr=sr, target_sr=16000)
             vocals_16k = vocals_audio
             sr_work = 16000
         # 转换为 torch tensor
+        audio_tensor = torch.from_numpy(vocals_16k).float().to(DEVICE)
+        # 使用 Silero VAD 检测
+        window_size_samples = 512
+        speech_probs = []
+        for i in range(0, len(audio_tensor), window_size_samples):
+            chunk = audio_tensor[i:i+window_size_samples]
+            if len(chunk) < window_size_samples:
+                chunk = torch.nn.functional.pad(chunk, (0, window_size_samples - len(chunk)))
+            with torch.no_grad():
+                speech_prob = SILERO_MODEL(chunk.unsqueeze(0), sr_work).item()
+            speech_probs.append(speech_prob)
+        # 创建掩码
+        speech_mask = np.repeat(speech_probs, window_size_samples)[:len(vocals_16k)]
+        speech_mask = (speech_mask > 0.5).astype(np.float32)
         # 调整回原始采样率
         if sr != sr_work:
         print(f"Silero VAD 检测失败: {str(e)}")
         import traceback
         traceback.print_exc()
         return np.zeros(len(vocals_audio), dtype=np.float32)
 def detect_speech_fallback(vocals_audio, sr):
     """传统算法备用方案（当 Silero 不可用时）"""
     try:
         hop_length = 512
         # 能量
         return np.zeros(len(vocals_audio), dtype=np.float32)
+def detect_singing_hybrid(vocals_audio, sr, mode='strict'):
+    """
+    混合检测策略：
+    1. 先用 Silero VAD 检测"说话"
+    2. 其余全部归入"唱歌/音乐"
+    """
+    try:
+        global SILERO_MODEL
+        if SILERO_MODEL is not None:
+            print("🎯 使用 Silero VAD 深度学习模型检测说话...")
+            speech_mask = detect_speech_with_silero(vocals_audio, sr)
+        else:
+            print("⚠️ Silero 不可用，使用传统算法...")
+            speech_mask = detect_speech_fallback(vocals_audio, sr)
+        if mode == 'strict':
+            # 严格模式：只保留明确的说话
+            from scipy.ndimage import binary_erosion
+            kernel_size = int(0.05 * sr)  # 50ms
+            if kernel_size > 1:
+                speech_mask = binary_erosion(speech_mask, structure=np.ones(kernel_size)).astype(np.float32)
+        # 说话 = 1, 唱歌 = 0
+        # 返回唱歌掩码
+        singing_mask = 1 - speech_mask
+        return singing_mask
+    except Exception as e:
+        print(f"检测失败: {str(e)}")
+        return np.ones(len(vocals_audio), dtype=np.float32)
 def process_audio_full(audio_file, detection_mode, enable_singing_detection):
     """完整的音频分离流程"""
     if audio_file is None:
     try:
         with tempfile.TemporaryDirectory() as tmpdir:
+            # 0. 加载 Silero 模型（如果需要）
+            if enable_singing_detection:
+                status_messages.append("🔧 正在初始化 AI 检测器...")
+                yield None, None, None, "\n".join(status_messages)
+                silero_loaded = load_silero_model()
+                if silero_loaded:
+                    status_messages.append("   ✅ Silero VAD 深度学习模型已就绪")
+                else:
+                    status_messages.append("   ⚠️ Silero 加载失败，将使用传统算法")
+                yield None, None, None, "\n".join(status_messages)
             # 1. 加载音频
             status_messages.append("📂 正在加载文件...")
             yield None, None, None, "\n".join(status_messages)
             # 2. Demucs 分离
             status_messages.append("🎵 使用 Demucs AI 模型分离人声和伴奏...")
             yield None, None, None, "\n".join(status_messages)
             vocals_path, instrumental_path = run_demucs_separation(temp_wav, tmpdir)
             # 3. 说话/唱歌检测
             if enable_singing_detection:
+                status_messages.append("🎤 正在检测说话片段...")
                 yield None, None, None, "\n".join(status_messages)
                 singing_mask = detect_singing_hybrid(vocals, sr, mode=detection_mode)
             else:
                 status_messages.append("⚠️ 已关闭智能检测，所有人声归入对白")
             status_messages.append("✂️ 正在分离对白和背景音乐...")
             yield None, None, None, "\n".join(status_messages)
+            dialog_mask = 1 - singing_mask
             dialog_vocals = vocals * dialog_mask
             singing_vocals = vocals * singing_mask
             status_messages.append(f"   运行设备: {DEVICE.upper()}")
             if enable_singing_detection:
+                if SILERO_MODEL is not None:
                     status_messages.append(f"\n💡 检测算法: Silero VAD 深度学习")
                 else:
                     status_messages.append(f"\n💡 检测算法: 传统信号处理")
             status_messages.append(f"━━━━━━━━━━━━━━━━━━━━")
     gr.Markdown(f"""
     # 🎵 AI 音频分离工具 - 深度学习版
+    **当前运行设备**: {DEVICE.upper()} {'✅ GPU加速' if DEVICE == 'cuda' else '⚠️ CPU模式'}
+    ## 功能说明
     - **A - 纯对白**: 旁白、解说、对话（不含Rap/口号）
     - **B - 背景音乐+人声**: 伴奏 + 唱歌 + Rap + 和声
     - **C - 纯伴奏**: 去除所有人声的纯音乐
     💡 **核心技术**:
     - Demucs 4.0 深度学习模型（人声/伴奏分离）
+    - Silero VAD 神经网络（说话检测）
     """)
     with gr.Row():
                 )
                 gr.Markdown("""
                 **模式说明**:
+                - **严格模式**（推荐）：只有清晰的说话才归入对白
                 - **平衡模式**：包含部分 Rap 风格的说话
                 💡 **大部分场景用严格模式效果最好！**
     gr.Markdown("""
     ---
+    ## 📌 使用说明
+    ### 🎯 新版本改进
+    1. **使用 Silero VAD 深度学习模型**
+       - 自动从 torch.hub 下载（约10MB）
+       - 准确率比传统算法提升 15-20%
+       - 专门训练识别"说话"
+    2. **改变产品定义**
+       - A区域：只保留纯说话（旁白、对话）
+       - B区域：所有音乐性人声（唱歌、Rap、和声）
+       - 逻辑更清晰，用户需求更明确
+    3. **两种检测模式**
+       - 严格模式：优先保证对白纯净度
+       - 平衡模式：包含部分快速说话
+    ### ⚠️ 技术限制
+    - **深度学习准确率**: 85-90%（已是免费方案极限）
+    - **边缘情况**: 说唱风格旁白、唱歌式说话仍有挑战
+    - **完美分离**: 需要付费商业软件或自训练模型
+    ### 💡 效果不满意？
+    1. 尝试两种模式切换
+    2. 在专业音频软件中手动微调（推荐 Audacity）
+    3. 考虑使用付费商业软件（如 Adobe Audition）
     """)
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -5,5 +5,4 @@ torchaudio==2.1.0
 librosa==0.10.1
 soundfile==0.12.1
 numpy==1.24.3
-scipy==1.11.4
-silero-vad==4.0.0

 librosa==0.10.1
 soundfile==0.12.1
 numpy==1.24.3
+scipy==1.11.4