Spaces:

haoyue518
/

audio-separator

Runtime error

App Files Files Community

haoyue518 committed on Nov 19, 2025

Commit

bba9bc2

verified ·

1 Parent(s): 6c23f71

Upload 3 files

Browse files

Files changed (3) hide show

app.py +317 -0
packages.txt +1 -0
requirements.txt +8 -0

app.py ADDED Viewed

	@@ -0,0 +1,317 @@

+import os, tempfile, subprocess
+import gradio as gr
+import numpy as np
+import soundfile as sf
+import librosa
+import torch
# Working sample rate in Hz; used as the default target rate when loading audio.
SAMPLE_RATE = 44100
# Pick the torch device once at import time: CUDA if a GPU is available, else CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
def load_audio_any_format(file_path, target_sr=SAMPLE_RATE):
    """Load an audio file with librosa and return it as a 2-D array.

    NOTE(review): the original docstring also claims video files are
    supported; that presumably depends on the decoding backend librosa
    falls back to -- confirm before relying on it.

    Args:
        file_path: Path handed to ``librosa.load``.
        target_sr: Sample rate requested from librosa (defaults to
            ``SAMPLE_RATE``).

    Returns:
        ``(samples, rate)``. ``samples`` keeps all channels (``mono=False``);
        a 1-D result is reshaped to a single row so the array is always 2-D.

    Raises:
        ValueError: If loading or reshaping fails for any reason; the message
            embeds the text of the underlying error.
    """
    try:
        samples, rate = librosa.load(file_path, sr=target_sr, mono=False)
        # A mono file comes back 1-D; promote it to one row.
        channel_first = samples.reshape(1, -1) if samples.ndim == 1 else samples
    except Exception as e:
        raise ValueError(f"音频加载失败: {str(e)}")
    else:
        return channel_first, rate
def save_audio(path, audio, sr):
    """Write ``audio`` to ``path`` as 16-bit PCM using soundfile.

    Args:
        path: Destination file path.
        audio: Sample array. A 1-D array is treated as a single row; the
            array is transposed before writing (so rows are presumably
            channels -- confirm against callers).
        sr: Sample rate passed through to ``sf.write``.
    """
    channel_first = audio if audio.ndim != 1 else audio.reshape(1, -1)
    # Transpose before writing, exactly as the caller-facing contract expects.
    frames = channel_first.T
    sf.write(path, frames, sr, subtype="PCM_16")
def run_demucs_separation(audio_path, output_dir):
    """Split ``audio_path`` into vocals and accompaniment with the Demucs CLI.

    Runs ``demucs.separate`` (model ``htdemucs``, ``--two-stems=vocals``) in a
    subprocess and then looks for the two WAV files under
    ``<output_dir>/htdemucs/<input basename without extension>/``.

    Args:
        audio_path: Path of the audio file to separate.
        output_dir: Directory passed to Demucs as its output root (``-o``).

    Returns:
        ``(vocals_path, instrumental_path)`` -- paths of ``vocals.wav`` and
        ``no_vocals.wav``.

    Raises:
        RuntimeError: If the subprocess cannot be started, exits with a
            non-zero status (message carries its stderr), or either expected
            output file is missing afterwards.
    """
    import sys  # local import: only needed to locate the running interpreter

    model_name = "htdemucs"
    # Launch Demucs with the interpreter that is running this app instead of
    # whatever "python" happens to resolve to on PATH, so the module is
    # imported from the same environment. Fall back to "python" only if the
    # interpreter path is unknown.
    cmd = [
        sys.executable or "python", "-m", "demucs.separate",
        "--two-stems=vocals",
        "-n", model_name,
        "-o", output_dir,
        audio_path,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)

        base_name = os.path.splitext(os.path.basename(audio_path))[0]
        stem_dir = os.path.join(output_dir, model_name, base_name)
        vocals_path = os.path.join(stem_dir, "vocals.wav")
        instrumental_path = os.path.join(stem_dir, "no_vocals.wav")
        # Both stems are returned, so both must exist -- not just the vocals.
        if not (os.path.exists(vocals_path) and os.path.exists(instrumental_path)):
            raise FileNotFoundError("Demucs 分离失败，找不到输出文件")
        return vocals_path, instrumental_path
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Demucs 执行失败: {e.stderr}") from e
    except Exception as e:
        raise RuntimeError(f"Demucs 分离失败: {str(e)}") from e
def detect_singing_segments(vocals_audio, sr, confidence_threshold=0.5):
    """Estimate which samples of a vocal track are sung rather than spoken.

    The track is analysed at 16 kHz with librosa's pYIN pitch tracker. A run
    of consecutive voiced frames counts as singing when it lasts at least
    0.3 s and the standard deviation of its pitch lies strictly between 20 and
    200 Hz. The per-frame decision is expanded to one value per input sample,
    smoothed with a 100 ms moving average and thresholded again at 0.5.

    Args:
        vocals_audio: Vocal samples; the result has ``len(vocals_audio)``
            entries (the visible caller passes a mono 1-D array).
        sr: Sample rate of ``vocals_audio`` in Hz.
        confidence_threshold: A frame is voiced when its pYIN voiced
            probability exceeds this value and its pitch is not NaN.

    Returns:
        ``np.float32`` array of ``len(vocals_audio)`` values, 1.0 for singing
        and 0.0 for speech. If anything fails, the error is printed and an
        all-zero mask (everything treated as speech) is returned instead.
    """
    work_sr = 16000      # analysis rate used for pitch tracking
    hop_length = 512     # pYIN hop, in samples at work_sr
    try:
        n_samples = len(vocals_audio)
        if sr != work_sr:
            vocals_16k = librosa.resample(vocals_audio, orig_sr=sr, target_sr=work_sr)
        else:
            vocals_16k = vocals_audio

        f0, _, voiced_probs = librosa.pyin(
            vocals_16k,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=work_sr,
            frame_length=2048,
            hop_length=hop_length,
        )

        min_duration_frames = int(0.3 * work_sr / hop_length)  # at least 0.3 s
        singing_frames = _label_singing_frames(
            f0, voiced_probs, confidence_threshold, min_duration_frames
        )
        # Expand frames straight to the caller's sample rate. The mask is a
        # 0/1 step function, so an FFT resampler would only add ringing.
        singing_mask = _frames_to_sample_mask(
            singing_frames, n_samples, sr, work_sr, hop_length
        )
        return _smooth_binary_mask(singing_mask, int(0.1 * sr))  # 100 ms window
    except Exception as e:
        print(f"唱歌检测失败: {str(e)}")
        # Best-effort fallback: treat the whole track as speech.
        return np.zeros(len(vocals_audio), dtype=np.float32)


def _label_singing_frames(f0, voiced_probs, confidence_threshold, min_frames):
    """Flag (1.0) every frame inside a voiced run that is long and pitch-varied enough."""
    f0 = np.asarray(f0, dtype=np.float64)
    voiced = (np.asarray(voiced_probs) > confidence_threshold) & ~np.isnan(f0)
    flags = np.zeros(len(f0), dtype=np.float32)
    # Pad with False so every run has both a rising and a falling edge.
    padded = np.concatenate(([False], voiced, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    for start, stop in zip(edges[::2], edges[1::2]):
        if stop - start < min_frames:
            continue
        pitch_std = np.std(f0[start:stop])
        # Pitch spread of singing is taken to lie between 20 and 200 Hz.
        if 20 < pitch_std < 200:
            flags[start:stop] = 1.0
    return flags


def _frames_to_sample_mask(frame_flags, n_samples, sr, work_sr, hop_length):
    """Expand per-frame flags (hop_length samples at work_sr each) to n_samples values at sr."""
    n_frames = len(frame_flags)
    # Frame k covers work-rate samples [k*hop, (k+1)*hop); the first sample at
    # rate sr that falls inside it is ceil(k * hop * sr / work_sr).
    frame_starts = np.ceil(
        np.arange(n_frames + 1, dtype=np.float64) * (hop_length * sr / work_sr)
    ).astype(np.int64)
    frame_starts = np.minimum(frame_starts, n_samples)
    mask = np.repeat(np.asarray(frame_flags, dtype=np.float32), np.diff(frame_starts))
    if len(mask) < n_samples:
        # Samples past the last analysed frame count as speech.
        mask = np.pad(mask, (0, n_samples - len(mask)))
    return mask


def _smooth_binary_mask(mask, window_size):
    """Centered moving average of ``mask`` (zero-padded edges), re-thresholded at 0.5."""
    if window_size > 1:
        # Cumulative-sum box filter: linear time and always len(mask) outputs,
        # using the same window alignment as np.convolve(..., mode='same').
        padded = np.pad(
            np.asarray(mask, dtype=np.float64),
            (window_size // 2, (window_size - 1) // 2),
        )
        csum = np.concatenate(([0.0], np.cumsum(padded)))
        mask = (csum[window_size:] - csum[:-window_size]) / window_size
    return (np.asarray(mask) > 0.5).astype(np.float32)
def process_audio_full(audio_file, singing_sensitivity, enable_singing_detection):
    """Run the full separation pipeline as a generator of UI updates.

    Steps: load the input, write it as WAV, split vocals from accompaniment
    with Demucs, optionally detect sung passages in the vocals, then write
    three files: A (spoken vocals), B (accompaniment plus sung vocals) and
    C (accompaniment only).

    Args:
        audio_file: Path of the uploaded file, or ``None`` if nothing was
            uploaded.
        singing_sensitivity: Passed to ``detect_singing_segments`` as its
            ``confidence_threshold``.
        enable_singing_detection: When false, no singing detection is run and
            all vocals go to output A.

    Yields:
        ``(path_a, path_b, path_c, status_text)``. The three paths are
        ``None`` on every progress and error update and are only set on the
        final, successful update.
    """
    if audio_file is None:
        # This function is a generator, so `return <value>` would end up in
        # StopIteration.value and never be displayed; yield the message.
        yield None, None, None, "❌ 请先上传音频文件"
        return
    status_messages = []
    try:
        # NOTE(review): the output paths live inside this temporary directory,
        # which is removed once the generator finishes; this presumably relies
        # on the UI copying the files when the final tuple is yielded -- confirm.
        with tempfile.TemporaryDirectory() as tmpdir:
            # 1. Load the input.
            status_messages.append("📂 正在加载音频...")
            yield None, None, None, "\n".join(status_messages)
            input_path = audio_file
            audio, sr = load_audio_any_format(input_path, SAMPLE_RATE)
            # Store it as a plain WAV file for Demucs.
            temp_wav = os.path.join(tmpdir, "input.wav")
            save_audio(temp_wav, audio, sr)
            # 2. Demucs separation.
            status_messages.append("🎵 使用 AI 模型分离人声和伴奏（这可能需要几分钟）...")
            yield None, None, None, "\n".join(status_messages)
            vocals_path, instrumental_path = run_demucs_separation(temp_wav, tmpdir)
            # Read both stems back as mono.
            vocals, _ = librosa.load(vocals_path, sr=sr, mono=True)
            instrumental, _ = librosa.load(instrumental_path, sr=sr, mono=True)
            # 3. Singing detection.
            if enable_singing_detection:
                status_messages.append("🎤 正在检测唱歌片段...")
                yield None, None, None, "\n".join(status_messages)
                singing_mask = detect_singing_segments(
                    vocals, sr,
                    confidence_threshold=singing_sensitivity
                )
            else:
                singing_mask = np.zeros(len(vocals), dtype=np.float32)
            # 4. Split the vocals into speech and singing.
            status_messages.append("✂️ 正在分离对白和背景音乐...")
            yield None, None, None, "\n".join(status_messages)
            dialog_mask = 1 - singing_mask
            dialog_vocals = vocals * dialog_mask
            singing_vocals = vocals * singing_mask
            # 5. Build the outputs.
            # A: foreground dialogue (speech only).
            output_a = dialog_vocals
            # B: background music (accompaniment + sung passages).
            # Match the singing loudness to the accompaniment (gain limited
            # to 0.1-2.0) and clip the sum to [-1, 1].
            singing_rms = np.sqrt(np.mean(singing_vocals**2) + 1e-8)
            inst_rms = np.sqrt(np.mean(instrumental**2) + 1e-8)
            if singing_rms > 1e-6:
                singing_gain = inst_rms / singing_rms
                singing_gain = np.clip(singing_gain, 0.1, 2.0)
            else:
                singing_gain = 1.0
            output_b = np.clip(instrumental + singing_vocals * singing_gain, -1.0, 1.0)
            # C: accompaniment only.
            output_c = instrumental
            # Write the three result files.
            path_a = os.path.join(tmpdir, "A_dialog.wav")
            path_b = os.path.join(tmpdir, "B_bgm_with_singing.wav")
            path_c = os.path.join(tmpdir, "C_instrumental.wav")
            save_audio(path_a, output_a, sr)
            save_audio(path_b, output_b, sr)
            save_audio(path_c, output_c, sr)
            # Summary statistics for the status box.
            total_duration = len(vocals) / sr
            singing_duration = np.sum(singing_mask) / sr
            dialog_duration = total_duration - singing_duration
            status_messages.append("✅ 分离完成！")
            status_messages.append(f"   总时长: {total_duration:.1f}秒")
            status_messages.append(f"   对白时长: {dialog_duration:.1f}秒")
            status_messages.append(f"   唱歌时长: {singing_duration:.1f}秒")
            status_messages.append(f"   设备: {DEVICE.upper()}")
            yield (
                path_a,
                path_b,
                path_c,
                "\n".join(status_messages)
            )
    except Exception as e:
        # Top-level boundary for the UI: report the failure together with the
        # steps that had already been announced.
        error_msg = f"❌ 处理失败: {str(e)}\n\n已完成步骤:\n" + "\n".join(status_messages)
        yield None, None, None, error_msg
# Build the Gradio interface.
with gr.Blocks(title="AI音频分离工具", theme=gr.themes.Soft()) as demo:
    # Header: title, active device and a description of the three outputs.
    gr.Markdown(
        value=f"""
    # 🎵 AI 音频分离工具 - 完整版
    **当前运行设备**: {DEVICE.upper()} {'✅ (GPU加速)' if DEVICE == 'cuda' else '⚠️ (CPU模式，速度较慢)'}
    ## 功能说明
    - **A - 前景对白**: 纯说话、旁白、Rap、口号、喊叫
    - **B - 背景音乐**: 伴奏 + 唱歌（主唱/和声/合唱）
    - **C - 纯伴奏**: 去除所有人声的纯音乐
    💡 **核心技术**: 使用 Demucs AI 模型 + 音高连续性检测
    """
    )

    with gr.Row():
        # Left column: upload, advanced settings and the start button.
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="📁 上传音频或视频文件",
                type="filepath",
            )
            with gr.Accordion("⚙️ 高级设置", open=False):
                enable_detection = gr.Checkbox(
                    label="启用唱歌检测（关闭则所有人声归入对白）",
                    value=True,
                )
                sensitivity = gr.Slider(
                    minimum=0.3,
                    maximum=0.8,
                    value=0.5,
                    step=0.05,
                    label="唱歌检测灵敏度（越高越严格）",
                )
                gr.Markdown(value="**提示**: 如果唱段漏检，降低灵敏度；如果说话误判为唱歌，提高灵敏度")
            process_btn = gr.Button(value="🚀 开始分离", variant="primary", size="lg")
        # Right column: running status log.
        with gr.Column(scale=1):
            status_box = gr.Textbox(
                label="📊 处理状态",
                lines=10,
                max_lines=15,
            )

    gr.Markdown(value="---")
    gr.Markdown(value="## 📥 分离结果")

    # One audio player per result file.
    with gr.Row():
        output_a = gr.Audio(type="filepath", label="🎤 A - 前景对白（说话/Rap/口号）")
        output_b = gr.Audio(type="filepath", label="🎵 B - 背景音乐（含唱段）")
        output_c = gr.Audio(type="filepath", label="🎹 C - 纯伴奏")

    # The handler's four yielded values map onto these four outputs in order.
    process_btn.click(
        fn=process_audio_full,
        inputs=[audio_input, sensitivity, enable_detection],
        outputs=[output_a, output_b, output_c, status_box],
    )

    # Footer: usage notes and limits.
    gr.Markdown(
        value="""
    ---
    ## 📌 使用提示
    1. **支持格式**: MP3, WAV, M4A, MP4, MOV 等
    2. **处理时间**: GPU模式下约为音频时长的30%-100%，CPU模式会更慢
    3. **最佳效果**: 建议音频质量较高，背景噪音少
    4. **限制**: 单次建议不超过 10 分钟音频
    ⚠️ **注意**:
    - 第一次运行会自动下载 Demucs 模型（约500MB）
    - 如果使用 CPU，5分钟音频可能需要10-20分钟处理
    - 如遇内存不足，请上传较短的音频片段
    """
    )

# Start the web server only when run as a script: all interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0")

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio==4.44.0
+demucs==4.0.1
+torch>=2.1.0
+torchaudio>=2.1.0
+librosa>=0.10.1
+soundfile>=0.12.1
+numpy>=1.23.0
+scipy>=1.10.0