Spaces:

haoyue518
/

audio-separator

Runtime error

App Files Community

haoyue518 committed on Nov 19, 2025

Commit

afaa18a

verified ·

1 Parent(s): da239ff

Upload 5 files

Browse files

Files changed (3) hide show

README.md +3 -27
app.py +157 -617
requirements.txt +2 -2

README.md CHANGED Viewed

@@ -4,35 +4,11 @@ emoji: 🎵
 colorFrom: blue
 colorTo: purple
 sdk: gradio
-sdk_version: 4.44.0
 app_file: app.py
 pinned: false
 ---
-# 🎵 AI Audio Separator
-AI-powered tool to separate dialog and background music from audio/video files.
-## Features
-- 🎤 Pure dialog track (narration, speech, conversation)
-- 🎵 Background music + vocals (singing, rap, harmony)
-- 🎹 Pure instrumental (no vocals)
-## Technology
-- **Demucs 4.0**: Vocal/instrumental separation (95%+ accuracy)
-- **Silero VAD**: Speech detection neural network (85-90% accuracy)
-- **Local model**: No network download required
-## Supported Formats
-- Audio: MP3, WAV, M4A, FLAC, OGG, AAC
-- Video: MP4, MOV, AVI, MKV, FLV, WMV
-## Usage
-1. Upload audio or video file
-2. Choose detection mode (strict/balanced)
-3. Click "Start AI Separation"
-4. Download 3 separated tracks

 colorFrom: blue
 colorTo: purple
 sdk: gradio
+sdk_version: 3.50.2
 app_file: app.py
 pinned: false
 ---
+# 🎵 AI 音频分离工具 (稳定版)
+已加载本地 Silero VAD 模型，提供高精度人声/伴奏分离。

app.py CHANGED Viewed

@@ -1,695 +1,235 @@
-import os, tempfile, subprocess
 import gradio as gr
 import numpy as np
 import soundfile as sf
 import librosa
 import torch
-# 检查 GPU
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SAMPLE_RATE = 44100
-# 全局变量
 SILERO_MODEL = None
 SILERO_AVAILABLE = False
 def load_silero_from_local():
     """从本地文件加载 Silero VAD 模型"""
     global SILERO_MODEL, SILERO_AVAILABLE
     try:
-        # 尝试从不同位置加载
-        model_paths = [
-            "silero_vad.jit",           # 根目录
-            "models/silero_vad.jit",    # models 文件夹
-            "./silero_vad.jit",
-            "/home/user/app/silero_vad.jit",  # HF Spaces 默认路径
-        ]
-        model_path = None
-        for path in model_paths:
-            if os.path.exists(path):
-                model_path = path
-                break
-        if model_path is None:
-            print("⚠️ 未找到本地 Silero VAD 模型文件")
-            print("   请确保 silero_vad.jit 已上传到 Space 根目录")
-            print(f"   当前工作目录: {os.getcwd()}")
-            print(f"   目录内容: {os.listdir('.')}")
             SILERO_AVAILABLE = False
             return False
         print(f"📥 正在从本地加载 Silero VAD: {model_path}")
-        # 加载模型
         SILERO_MODEL = torch.jit.load(model_path, map_location=DEVICE)
         SILERO_MODEL.eval()
-        print("✅ Silero VAD 模型加载成功（从本地文件）")
         SILERO_AVAILABLE = True
         return True
     except Exception as e:
-        print(f"❌ Silero VAD 加载失败: {str(e)}")
-        import traceback
-        traceback.print_exc()
         SILERO_AVAILABLE = False
         return False
 def extract_audio_from_video(video_path, output_path):
-    """从视频中提取音频"""
     try:
-        cmd = [
-            'ffmpeg', '-i', video_path,
-            '-vn',
-            '-acodec', 'pcm_s16le',
-            '-ar', str(SAMPLE_RATE),
-            '-ac', '2',
-            '-y',
             output_path
-        ]
-        result = subprocess.run(cmd, capture_output=True, text=True)
-        if result.returncode != 0:
-            raise RuntimeError(f"FFmpeg 提取失败: {result.stderr}")
-        return output_path
-    except Exception as e:
-        raise RuntimeError(f"音频提取失败: {str(e)}")
-def load_audio_any_format(file_path, target_sr=SAMPLE_RATE):
-    """加载任意格式音频"""
-    try:
-        video_extensions = ['.mp4', '.mov', '.avi', '.mkv', '.flv', '.wmv', '.m4v']
-        file_ext = os.path.splitext(file_path)[1].lower()
-        if file_ext in video_extensions:
-            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
-                temp_audio_path = tmp.name
-            extract_audio_from_video(file_path, temp_audio_path)
-            audio, sr = librosa.load(temp_audio_path, sr=target_sr, mono=False)
-            os.unlink(temp_audio_path)
-        else:
-            audio, sr = librosa.load(file_path, sr=target_sr, mono=False)
-        if audio.ndim == 1:
-            audio = audio.reshape(1, -1)
-        return audio, sr
-    except Exception as e:
-        raise ValueError(f"音频加载失败: {str(e)}")
-def save_audio(path, audio, sr):
-    """保存音频"""
-    try:
-        if audio.ndim == 1:
-            audio = audio.reshape(1, -1)
-        audio = np.clip(audio, -1.0, 1.0)
-        sf.write(path, audio.T, sr, subtype="PCM_16")
-    except Exception as e:
-        raise RuntimeError(f"音频保存失败: {str(e)}")
-def run_demucs_separation(audio_path, output_dir):
-    """使用 Demucs 进行人声/伴奏分离"""
-    try:
-        cmd = [
-            "python", "-m", "demucs.separate",
-            "--two-stems=vocals",
-            "-n", "htdemucs",
-            "--mp3",
-            "--mp3-bitrate=320",
-            "-o", output_dir,
-            audio_path
-        ]
-        result = subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=600)
-        base_name = os.path.splitext(os.path.basename(audio_path))[0]
-        stem_dir = os.path.join(output_dir, "htdemucs", base_name)
-        vocals_path = os.path.join(stem_dir, "vocals.mp3")
-        instrumental_path = os.path.join(stem_dir, "no_vocals.mp3")
-        if not os.path.exists(vocals_path):
-            raise FileNotFoundError(f"Demucs 输出文件不存在: {vocals_path}")
-        return vocals_path, instrumental_path
-    except subprocess.TimeoutExpired:
-        raise RuntimeError("处理超时（超过10分钟），请上传较短的音频")
     except subprocess.CalledProcessError as e:
-        raise RuntimeError(f"Demucs 执行失败: {e.stderr}")
-    except Exception as e:
-        raise RuntimeError(f"Demucs 分离失败: {str(e)}")
 def detect_speech_with_silero(vocals_audio, sr):
-    """使用 Silero VAD 深度学习模型检测说话"""
-    try:
-        global SILERO_MODEL
-        if SILERO_MODEL is None:
-            raise RuntimeError("Silero 模型未加载")
-        # 重采样到 16kHz
-        if sr != 16000:
-            vocals_16k = librosa.resample(vocals_audio, orig_sr=sr, target_sr=16000)
-            sr_work = 16000
-        else:
-            vocals_16k = vocals_audio
-            sr_work = 16000
-        # 转换为 torch tensor
-        audio_tensor = torch.from_numpy(vocals_16k).float().to(DEVICE)
-        # 使用 Silero VAD 检测
-        window_size_samples = 512
-        speech_probs = []
-        with torch.no_grad():
-            for i in range(0, len(audio_tensor), window_size_samples):
-                chunk = audio_tensor[i:i+window_size_samples]
-                if len(chunk) < window_size_samples:
-                    chunk = torch.nn.functional.pad(chunk, (0, window_size_samples - len(chunk)))
-                speech_prob = SILERO_MODEL(chunk.unsqueeze(0), sr_work).item()
-                speech_probs.append(speech_prob)
-        # 创建掩码
-        speech_mask = np.repeat(speech_probs, window_size_samples)[:len(vocals_16k)]
-        speech_mask = (speech_mask > 0.5).astype(np.float32)
-        # 调整回原始采样率
-        if sr != sr_work:
-            from scipy.interpolate import interp1d
-            old_indices = np.linspace(0, 1, len(speech_mask))
-            new_indices = np.linspace(0, 1, len(vocals_audio))
-            interpolator = interp1d(old_indices, speech_mask, kind='linear', fill_value='extrapolate')
-            speech_mask = interpolator(new_indices)
-        # 确保长度匹配
-        if len(speech_mask) != len(vocals_audio):
-            if len(speech_mask) < len(vocals_audio):
-                speech_mask = np.pad(speech_mask, (0, len(vocals_audio) - len(speech_mask)))
-            else:
-                speech_mask = speech_mask[:len(vocals_audio)]
-        speech_mask = (speech_mask > 0.5).astype(np.float32)
-        return speech_mask
-    except Exception as e:
-        print(f"Silero VAD 检测失败: {str(e)}")
-        return None
-def detect_speech_fallback(vocals_audio, sr):
-    """传统算法备用方案"""
-    try:
-        hop_length = 512
-        frame_length = 2048
-        # 能量
-        rms = librosa.feature.rms(y=vocals_audio, frame_length=frame_length, hop_length=hop_length)[0]
-        # 零交叉率
-        zcr = librosa.feature.zero_crossing_rate(vocals_audio, frame_length=frame_length, hop_length=hop_length)[0]
-        # 频谱质心
-        spectral_centroids = librosa.feature.spectral_centroid(y=vocals_audio, sr=sr, hop_length=hop_length)[0]
-        # 音高检测
-        try:
-            f0, voiced_flag, voiced_probs = librosa.pyin(
-                vocals_audio,
-                fmin=librosa.note_to_hz('C2'),
-                fmax=librosa.note_to_hz('C7'),
-                sr=sr,
-                frame_length=frame_length,
-                hop_length=hop_length
-            )
-            f0 = np.nan_to_num(f0, nan=0.0)
-        except:
-            f0 = np.zeros(len(rms))
-        # 归一化
-        min_len = min(len(rms), len(zcr), len(spectral_centroids), len(f0))
-        rms = rms[:min_len]
-        zcr = zcr[:min_len]
-        spectral_centroids = spectral_centroids[:min_len]
-        f0 = f0[:min_len]
-        # 说话特征得分
-        zcr_score = np.clip((zcr - 0.05) / 0.15, 0, 1)
-        rms_norm = rms / (np.max(rms) + 1e-8)
-        energy_variation = np.abs(np.gradient(rms_norm))
-        energy_score = np.clip(energy_variation * 10, 0, 1)
-        centroid_variation = np.abs(np.gradient(spectral_centroids))
-        centroid_score = np.clip(centroid_variation / (np.mean(centroid_variation) + 1e-8), 0, 1)
-        pitch_continuity = np.zeros_like(f0)
-        for i in range(1, len(f0)):
-            if f0[i] > 0 and f0[i-1] > 0:
-                pitch_diff = abs(f0[i] - f0[i-1])
-                if pitch_diff > 50:
-                    pitch_continuity[i] = 1
-        # 综合得分
-        speaking_score = (
-            0.30 * zcr_score +
-            0.25 * energy_score +
-            0.25 * centroid_score +
-            0.20 * pitch_continuity
-        )
-        speaking_mask = (speaking_score > 0.6).astype(np.float32)
-        # 后处理
-        min_duration = int(0.2 * sr / hop_length)
-        i = 0
-        while i < len(speaking_mask):
-            if speaking_mask[i] == 1:
-                j = i
-                while j < len(speaking_mask) and speaking_mask[j] == 1:
-                    j += 1
-                if j - i < min_duration:
-                    speaking_mask[i:j] = 0
-                i = j
-            else:
-                i += 1
-        # 转换为样本级
-        speaking_mask_samples = np.repeat(speaking_mask, hop_length)
-        if len(speaking_mask_samples) < len(vocals_audio):
-            speaking_mask_samples = np.pad(speaking_mask_samples, (0, len(vocals_audio) - len(speaking_mask_samples)))
-        else:
-            speaking_mask_samples = speaking_mask_samples[:len(vocals_audio)]
-        # 平滑
-        smooth_window = int(0.03 * sr)
-        if smooth_window > 1:
-            speaking_mask_samples = np.convolve(
-                speaking_mask_samples,
-                np.ones(smooth_window) / smooth_window,
-                mode='same'
-            )
-        speaking_mask_samples = (speaking_mask_samples > 0.5).astype(np.float32)
-        return speaking_mask_samples
-    except Exception as e:
-        print(f"传统算法检测失败: {str(e)}")
-        # 返回全1（假设全是说话）
-        return np.ones(len(vocals_audio), dtype=np.float32)
-def detect_singing_hybrid(vocals_audio, sr, mode='strict'):
-    """混合检测策略：优先使用 Silero VAD，失败则降级"""
-    try:
-        global SILERO_AVAILABLE
-        if SILERO_AVAILABLE:
-            print("✅ 使用 Silero VAD 深度学习模型检测")
-            speech_mask = detect_speech_with_silero(vocals_audio, sr)
-            if speech_mask is not None:
-                if mode == 'strict':
-                    from scipy.ndimage import binary_erosion
-                    kernel_size = int(0.05 * sr)
-                    if kernel_size > 1:
-                        speech_mask = binary_erosion(speech_mask, structure=np.ones(kernel_size)).astype(np.float32)
-                singing_mask = 1 - speech_mask
-                return singing_mask, "Silero VAD"
-        # Silero 不可用，使用传统算法
-        print("⚠️ 使用传统多特征算法")
-        speech_mask = detect_speech_fallback(vocals_audio, sr)
-        singing_mask = 1 - speech_mask
-        return singing_mask, "传统算法"
-    except Exception as e:
-        print(f"检测失败: {str(e)}")
-        speech_mask = detect_speech_fallback(vocals_audio, sr)
-        singing_mask = 1 - speech_mask
-        return singing_mask, "传统算法"
-def process_audio_full(audio_file, detection_mode, enable_detection):
-    """完整的音频分离流程"""
-    if audio_file is None:
-        return None, None, None, "❌ 请先上传音频或视频文件"
-    status_messages = []
     try:
         with tempfile.TemporaryDirectory() as tmpdir:
-            # 1. 加载音频
-            status_messages.append("📂 正在加载文件...")
-            yield None, None, None, "\n".join(status_messages)
-            input_path = audio_file
-            file_ext = os.path.splitext(input_path)[1].lower()
-            if file_ext in ['.mp4', '.mov', '.avi', '.mkv', '.flv', '.wmv', '.m4v']:
-                status_messages.append(f"🎬 检测到视频文件 ({file_ext})，正在提取音频...")
-                yield None, None, None, "\n".join(status_messages)
-            audio, sr = load_audio_any_format(input_path, SAMPLE_RATE)
             temp_wav = os.path.join(tmpdir, "input.wav")
-            save_audio(temp_wav, audio, sr)
-            # 2. Demucs 分离
-            status_messages.append("━━━━━━━━━━━━━━━━━━━━")
-            status_messages.append("🎵 使用 Demucs AI 模型分离人声和伴奏...")
-            status_messages.append("   （首次运行会下载模型，约500MB）")
-            yield None, None, None, "\n".join(status_messages)
-            vocals_path, instrumental_path = run_demucs_separation(temp_wav, tmpdir)
-            vocals, _ = librosa.load(vocals_path, sr=sr, mono=True)
-            instrumental, _ = librosa.load(instrumental_path, sr=sr, mono=True)
-            status_messages.append("   ✅ Demucs 分离完成")
-            status_messages.append("━━━━━━━━━━━━━━━━━━━━")
-            # 3. 说话检测
-            algorithm_used = "无"
             if enable_detection:
-                status_messages.append("")
-                status_messages.append("🔧 正在初始化 AI 检测器...")
-                global SILERO_AVAILABLE
                 if SILERO_AVAILABLE:
-                    status_messages.append("   ✅ Silero VAD 已加载（从本地文件）")
-                else:
-                    status_messages.append("   ⚠️ Silero VAD 不可用，将使用传统算法")
-                yield None, None, None, "\n".join(status_messages)
-                status_messages.append("🎤 正在分析音频特征...")
-                yield None, None, None, "\n".join(status_messages)
-                # singing_mask: 1=唱歌, 0=说话
-                singing_mask, algorithm_used = detect_singing_hybrid(vocals, sr, mode=detection_mode)
-                status_messages.append("━━━━━━━━━━━━━━━━━━━━")
-                # 醒目标注使用的算法
-                if algorithm_used == "Silero VAD":
-                    status_messages.append("✅✅✅ 检测器状态: Silero VAD 深度学习")
-                    status_messages.append("   📈 预期准确率: 85-90%")
-                    status_messages.append("   🎯 算法类型: 神经网络")
-                    status_messages.append("   📦 模型来源: 本地文件")
                 else:
-                    status_messages.append("⚠️⚠️⚠️ 检测器状态: 传统多特征算法")
-                    status_messages.append("   📉 预期准确率: 75-80%")
-                    status_messages.append("   🎯 算法类型: 信号处理")
-                status_messages.append("━━━━━━━━━━━━━━━━━━━━")
-                status_messages.append("   ✅ 检测完成")
-            else:
-                status_messages.append("⚠️ 已关闭智能检测，所有人声归入对白")
-                singing_mask = np.zeros(len(vocals), dtype=np.float32)
-                algorithm_used = "关闭检测"
-            # 4. 分离对白和唱歌
-            status_messages.append("")
-            status_messages.append("✂️ 正在分离对白和背景音乐...")
-            yield None, None, None, "\n".join(status_messages)
-            dialog_mask = 1 - singing_mask
-            dialog_vocals = vocals * dialog_mask
-            singing_vocals = vocals * singing_mask
-            # 5. 生成最终输出
-            output_a = dialog_vocals
-            # 智能混音
-            singing_rms = np.sqrt(np.mean(singing_vocals**2) + 1e-8)
-            inst_rms = np.sqrt(np.mean(instrumental**2) + 1e-8)
-            if singing_rms > 1e-6:
-                singing_gain = inst_rms / singing_rms * 0.8
-                singing_gain = np.clip(singing_gain, 0.1, 1.5)
-            else:
-                singing_gain = 1.0
-            output_b = np.clip(instrumental + singing_vocals * singing_gain, -1.0, 1.0)
-            output_c = instrumental
-            # 保存文件
-            status_messages.append("💾 正在保存输出文件...")
-            yield None, None, None, "\n".join(status_messages)
-            if algorithm_used == "Silero VAD":
-                algo_tag = "SileroVAD"
-            elif algorithm_used == "传统算法":
-                algo_tag = "Traditional"
-            else:
-                algo_tag = "NoDetect"
-            path_a = os.path.join(tmpdir, f"A_dialog_{algo_tag}.wav")
-            path_b = os.path.join(tmpdir, f"B_bgm_with_singing_{algo_tag}.wav")
-            path_c = os.path.join(tmpdir, f"C_instrumental_{algo_tag}.wav")
-            save_audio(path_a, output_a, sr)
-            save_audio(path_b, output_b, sr)
-            save_audio(path_c, output_c, sr)
-            # 统计信息
-            total_duration = len(vocals) / sr
-            dialog_duration = np.sum(dialog_mask) / sr
-            singing_duration = total_duration - dialog_duration
-            status_messages.append("")
-            status_messages.append("━━━━━━━━━━━━━━━━━━━━")
-            status_messages.append("✅✅✅ 分离完成！")
-            status_messages.append("━━━━━━━━━━━━━━━━━━━━")
-            status_messages.append("")
-            status_messages.append("📊 统计信息:")
-            status_messages.append(f"   总时长: {total_duration:.1f} 秒")
-            status_messages.append(f"   对白时长: {dialog_duration:.1f} 秒 ({dialog_duration/total_duration*100:.1f}%)")
-            status_messages.append(f"   音乐人声时长: {singing_duration:.1f} 秒 ({singing_duration/total_duration*100:.1f}%)")
-            status_messages.append(f"   运行设备: {DEVICE.upper()}")
-            status_messages.append("")
-            # 醒目标注使用的算法
-            if algorithm_used == "Silero VAD":
-                status_messages.append("🎯 本次使用的检测算法:")
-                status_messages.append("   ✅✅✅ Silero VAD 深度学习模型")
-                status_messages.append("   📈 准确率: 约 85-90%")
-                status_messages.append("   🧠 技术: 神经网络（10000+ 小时训练）")
-                status_messages.append("   📦 模型来源: 本地文件（无需下载）")
-            elif algorithm_used == "传统算法":
-                status_messages.append("🎯 本次使用的检测算法:")
-                status_messages.append("   ⚠️⚠️⚠️ 传统多特征算法")
-                status_messages.append("   📉 准确率: 约 75-80%")
-                status_messages.append("   🔧 技术: 能量+零交叉率+频谱+音高")
-            else:
-                status_messages.append("🎯 本次使用的检测算法:")
-                status_messages.append("   ⚪ 未启用检测（所有人声归入对白）")
-            status_messages.append("")
-            status_messages.append("━━━━━━━━━━━━━━━━━━━━")
-            status_messages.append(f"💾 输出文件已标注算法: {algo_tag}")
-            status_messages.append("━━━━━━━━━━━━━━━━━━━━")
-            yield (
-                path_a,
-                path_b,
-                path_c,
-                "\n".join(status_messages)
-            )
     except Exception as e:
         import traceback
-        error_detail = traceback.format_exc()
-        error_msg = f"❌ 处理失败:\n{str(e)}\n\n已完成步骤:\n" + "\n".join(status_messages)
-        error_msg += f"\n\n详细错误:\n{error_detail}"
-        yield None, None, None, error_msg
-# ===== 启动时加载 Silero VAD =====
-print("=" * 60)
-print("🚀 正在初始化 AI 音频分离工具...")
-print("=" * 60)
-# 尝试加载 Silero VAD
-silero_loaded = load_silero_from_local()
-if silero_loaded:
-    print("✅ Silero VAD 已就绪（高准确率模式 85-90%）")
-else:
-    print("⚠️ Silero VAD 不可用，将使用传统算法（准确率 75-80%）")
-print("=" * 60)
-# 创建 Gradio 界面
-with gr.Blocks(theme=gr.themes.Soft(), title="AI音频分离工具") as demo:
-    gr.Markdown(f"""
-    # 🎵 AI 音频分离工具 - Silero VAD 本地版
-    **当前运行设备**: {DEVICE.upper()} {'✅ GPU加速' if DEVICE == 'cuda' else '⚠️ CPU模式'}
-    **Silero VAD 状态**: {'✅ 已加载（本地文件，准确率 85-90%）' if SILERO_AVAILABLE else '⚠️ 未加载（使用传统算法，准确率 75-80%）'}
-    ---
-    ## 功能说明
-    本工具将音频/视频分离为 3 个独立轨道：
-    - **🎤 A - 纯对白**：旁白、解说、对话（说话的部分）
-    - **🎵 B - 背景音乐+人声**：伴奏 + 唱歌 + Rap + 和声
-    - **🎹 C - 纯伴奏**：去除所有人声的纯音乐
-    ---
-    ## 💡 核心技术
-    1. **Demucs 4.0** 深度学习模型
-       - 人声/伴奏分离（准确率 > 95%）
-       - Meta AI 开发
-    2. **Silero VAD** 神经网络（如已加载）
-       - 说话检测（准确率 85-90%）
-       - 10000+ 小时训练数据
-       - **从本地加载，无需网络下载**
-    3. **传统多特征算法**（备用）
-       - 能量、零交叉率、频谱、音高融合
-       - 准确率 75-80%
-    ---
-    ## 📋 使用场景
-    ✅ **适合的场景**：
-    - 短视频二次创作（提取对白/BGM）
-    - 播客音频编辑
-    - 教学视频字幕制作
-    - 音乐制作（提取伴奏）
-    ⚠️ **有挑战的场景**：
-    - 说唱风格的旁白
-    - 快速说话 + 强背景音乐
-    - 唱歌式说话
-    """)
     with gr.Row():
         with gr.Column(scale=1):
-            audio_input = gr.File(
-                label="📁 上传音频或视频文件",
-                file_types=["audio", "video"],
-                type="filepath"
-            )
-            gr.Markdown("""
-            **支持格式**:
-            - 🎵 音频: MP3, WAV, M4A, FLAC, OGG, AAC
-            - 🎬 视频: MP4, MOV, AVI, MKV, FLV, WMV
-            """)
-            with gr.Accordion("⚙️ 高级设置", open=True):
-                enable_detection = gr.Checkbox(
-                    value=True,
-                    label="🎯 启用智能说话检测（推荐开启）"
-                )
-                detection_mode = gr.Radio(
-                    choices=[
-                        ("严格模式 - 只保留明确的说话/旁白", "strict"),
-                        ("平衡模式 - 包含部分 Rap/快语", "balanced")
-                    ],
-                    value="strict",
-                    label="检测模式"
-                )
-                gr.Markdown("""
-                **模式说明**:
-                - **严格模式**（推荐）：只有清晰的说话才归入对白，唱歌/Rap 归入 BGM
-                - **平衡模式**：包含部分 Rap 风格的说话，边界更宽松
-                **效果不满意？**
-                - 说话被误判为唱歌 → 试试"平衡模式"
-                - 唱歌被误判为说话 → 保持"严格模式"
-                """)
-            process_btn = gr.Button("🚀 开始 AI 智能分离", variant="primary", size="lg")
         with gr.Column(scale=1):
-            status_box = gr.Textbox(
-                label="📊 处理状态（实时显示）",
-                lines=25,
-                max_lines=30,
-                show_label=True
-            )
-    gr.Markdown("---")
-    gr.Markdown("## 📥 分离结果（点击播放预览，右键下载）")
-    with gr.Row():
-        output_a = gr.Audio(label="🎤 A - 纯对白（旁白/解说/对话）", type="filepath")
-        output_b = gr.Audio(label="🎵 B - 背景音乐+人声（含唱歌/Rap）", type="filepath")
-        output_c = gr.Audio(label="🎹 C - 纯伴奏（无人声）", type="filepath")
-    process_btn.click(
         fn=process_audio_full,
-        inputs=[audio_input, detection_mode, enable_detection],
-        outputs=[output_a, output_b, output_c, status_box]
     )
-    gr.Markdown(f"""
-    ---
-    ## 📌 技术说明
-    ### 🎯 当前配置
-    | 项目 | 状态 |
-    |------|------|
-    | **运行设备** | {DEVICE.upper()} {'（GPU 加速）' if DEVICE == 'cuda' else '（CPU 模式）'} |
-    | **Silero VAD** | {'✅ 已加载（本地，准确率 85-90%）' if SILERO_AVAILABLE else '❌ 未加载（使用传统算法，准确率 75-80%）'} |
-    | **Demucs 模型** | htdemucs（人声/伴奏分离） |
-    | **输出格式** | WAV（无损，44.1kHz）|
-    ### 💡 使用建议
-    1. **首次使用**：会下载 Demucs 模型（约 500MB），需 3-5 分钟
-    2. **处理时间**：1 分钟音频约需 10-30 秒（取决于设备）
-    3. **最佳效果**：上传清晰音质的文件
-    4. **文件大小**：建议单个文件 < 50MB，时长 < 5 分钟
-    ### 🔧 如果 Silero VAD 未加载
-    说明 `silero_vad.jit` 文件未正确上传，请检查：
-    1. 文件是否在 Space 根目录
-    2. 文件名是否为 `silero_vad.jit`（全小写）
-    3. 文件大小约 1.4MB
-    即使没有 Silero VAD，传统算法也能提供 75-80% 的准确率。
-    ---
-    ## 📊 算法对比
-    | 检测算法 | 准确率 | 速度 | 依赖 |
-    |---------|-------|------|------|
-    | **Silero VAD** | **85-90%** | 快 | 本地模型文件 |
-    | **传统算法** | **75-80%** | 很快 | 无 |
-    ---
-    **提示**: 处理完成后，文件名会标注使用的算法（SileroVAD 或 Traditional）
-    """)
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)

+import os
+import tempfile
+import subprocess
 import gradio as gr
 import numpy as np
 import soundfile as sf
 import librosa
 import torch
+# 检查 GPU 是否可用
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SAMPLE_RATE = 44100
+# 全局变量：存储模型
 SILERO_MODEL = None
 SILERO_AVAILABLE = False
 def load_silero_from_local():
     """从本地文件加载 Silero VAD 模型"""
     global SILERO_MODEL, SILERO_AVAILABLE
     try:
+        # 尝试多个可能的路径
+        model_paths = ["silero_vad.jit", "models/silero_vad.jit", "./silero_vad.jit"]
+        model_path = next((p for p in model_paths if os.path.exists(p)), None)
+        if not model_path:
+            print("⚠️ 未找到本地 Silero VAD 模型文件，将使用传统算法")
             SILERO_AVAILABLE = False
             return False
         print(f"📥 正在从本地加载 Silero VAD: {model_path}")
         SILERO_MODEL = torch.jit.load(model_path, map_location=DEVICE)
         SILERO_MODEL.eval()
         SILERO_AVAILABLE = True
         return True
     except Exception as e:
+        print(f"❌ Silero VAD 加载失败: {e}")
         SILERO_AVAILABLE = False
         return False
 def extract_audio_from_video(video_path, output_path):
+    """使用 ffmpeg 从视频提取音频"""
     try:
+        subprocess.run([
+            'ffmpeg', '-i', video_path,
+            '-vn', # 禁用视频
+            '-acodec', 'pcm_s16le',
+            '-ar', str(SAMPLE_RATE),
+            '-ac', '2',
+            '-y', # 覆盖输出
             output_path
+        ], check=True, capture_output=True)
     except subprocess.CalledProcessError as e:
+        print(f"FFmpeg 错误: {e}")
+        raise Exception("无法从视频提取音频，请检查文件格式")
+def run_demucs_separation(audio_path, output_dir):
+    """运行 Demucs 进行人声/伴奏分离"""
+    cmd = [
+        "python", "-m", "demucs.separate",
+        "--two-stems=vocals", # 只需要分离人声和伴奏
+        "-n", "htdemucs", # 使用最新的模型
+        "--mp3", "--mp3-bitrate=320",
+        "-o", output_dir,
+        audio_path
+    ]
+    subprocess.run(cmd, check=True, capture_output=True, text=True)
+    # 构建输出路径
+    base_name = os.path.splitext(os.path.basename(audio_path))[0]
+    stem_dir = os.path.join(output_dir, "htdemucs", base_name)
+    return os.path.join(stem_dir, "vocals.mp3"), os.path.join(stem_dir, "no_vocals.mp3")
 def detect_speech_with_silero(vocals_audio, sr):
+    """使用 Silero VAD 检测纯语音（去除唱歌/Rap）"""
+    if not SILERO_MODEL: return None
+    # VAD 需要 16k 采样率
+    if sr != 16000:
+        vocals_16k = librosa.resample(vocals_audio, orig_sr=sr, target_sr=16000)
+    else:
+        vocals_16k = vocals_audio
+    audio_tensor = torch.from_numpy(vocals_16k).float().to(DEVICE)
+    speech_probs = []
+    window_size_samples = 512
+    with torch.no_grad():
+        for i in range(0, len(audio_tensor), window_size_samples):
+            chunk = audio_tensor[i:i+window_size_samples]
+            if len(chunk) < window_size_samples:
+                chunk = torch.nn.functional.pad(chunk, (0, window_size_samples - len(chunk)))
+            # 模型推理
+            out = SILERO_MODEL(chunk.unsqueeze(0), 16000)
+            speech_probs.append(out.item())
+    # 将概率扩展回原始长度
+    speech_mask = np.repeat(speech_probs, window_size_samples)[:len(vocals_16k)]
+    speech_mask = (speech_mask > 0.5).astype(np.float32) # 阈值 0.5
+    # 如果重采样过，需要插值回原始长度
+    if sr != 16000:
+        from scipy.interpolate import interp1d
+        f = interp1d(np.linspace(0, 1, len(speech_mask)), speech_mask, kind='nearest', fill_value="extrapolate")
+        speech_mask = f(np.linspace(0, 1, len(vocals_audio)))
+    return (speech_mask > 0.5).astype(np.float32)
+def process_audio_full(input_file, mode_selection, enable_detection):
+    """主处理流程"""
+    if input_file is None:
+        return None, None, None, "❌ 请先上传文件！"
+    logs = ["🚀 开始处理任务..."]
+    yield None, None, None, "\n".join(logs)
     try:
+        # 创建临时目录处理文件
         with tempfile.TemporaryDirectory() as tmpdir:
+            input_path = input_file.name
             temp_wav = os.path.join(tmpdir, "input.wav")
+            # 1. 预处理：如果是视频，提取音频；如果是音频，转为 WAV
+            if input_path.lower().endswith(('.mp4', '.mov', '.avi', '.mkv', '.flv', '.wmv')):
+                logs.append("🎬 检测到视频文件，正在提取音频...")
+                yield None, None, None, "\n".join(logs)
+                extract_audio_from_video(input_path, temp_wav)
+            else:
+                logs.append("🎵 加载音频文件...")
+                audio, sr = librosa.load(input_path, sr=SAMPLE_RATE, mono=False)
+                if audio.ndim == 1: audio = audio.reshape(1, -1)
+                sf.write(temp_wav, audio.T, sr, subtype="PCM_16")
+            # 2. Demucs 人声分离
+            logs.append("🤖 正在运行 Demucs 分离人声与伴奏 (可能需要几分钟)...")
+            yield None, None, None, "\n".join(logs)
+            vocals_path, inst_path = run_demucs_separation(temp_wav, tmpdir)
+            # 读取分离后的轨道
+            vocals, sr = librosa.load(vocals_path, sr=SAMPLE_RATE, mono=True)
+            instrumental, _ = librosa.load(inst_path, sr=SAMPLE_RATE, mono=True)
+            # 3. Silero VAD 智能检测
+            mask = np.ones_like(vocals) # 默认全部保留
+            detection_info = "未启用检测"
             if enable_detection:
                 if SILERO_AVAILABLE:
+                    logs.append("🧠 正在使用本地 Silero VAD 模型识别纯对白...")
+                    yield None, None, None, "\n".join(logs)
+                    vad_mask = detect_speech_with_silero(vocals, sr)
+                    if vad_mask is not None:
+                        mask = vad_mask
+                        detection_info = "Silero VAD (本地模型)"
                 else:
+                    logs.append("⚠️ 本地 VAD 模型未加载，跳过智能检测")
+            # 4. 混合轨道生成
+            # 逻辑：
+            # A轨 (纯对白) = 人声 * mask
+            # B轨 (背景) = 纯伴奏 + (人声 * (1-mask))  <-- 把不是对白的人声（如唱歌）加回背景
+            # C轨 (纯伴奏) = 纯伴奏
+            singing_mask = 1 - mask
+            track_dialogue = vocals * mask
+            track_bgm_plus = instrumental + (vocals * singing_mask)
+            track_instrumental = instrumental
+            # 5. 导出文件
+            path_a = os.path.join(tmpdir, "Track_A_Dialogue.wav")
+            path_b = os.path.join(tmpdir, "Track_B_Background.wav")
+            path_c = os.path.join(tmpdir, "Track_C_Instrumental.wav")
+            sf.write(path_a, track_dialogue, sr)
+            sf.write(path_b, track_bgm_plus, sr)
+            sf.write(path_c, track_instrumental, sr)
+            logs.append(f"✅ 处理完成！\n检测模式: {detection_info}")
+            logs.append("📂 可以在下方下载三个分离轨道")
+            yield path_a, path_b, path_c, "\n".join(logs)
     except Exception as e:
         import traceback
+        traceback.print_exc()
+        logs.append(f"❌ 发生严重错误: {str(e)}")
+        yield None, None, None, "\n".join(logs)
+# --- 启动时加载模型 ---
+print("⏳ 正在初始化系统...")
+load_silero_from_local()
+# --- Gradio 界面构建 (兼容 3.x) ---
+with gr.Blocks(title="AI 音频分离专家", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🎵 AI 音频分离专家 (修复版)
+        **功能**：上传视频或音频，自动分离出 **纯对白**、**背景声(含唱歌)** 和 **纯伴奏**。
+        """
+    )
     with gr.Row():
         with gr.Column(scale=1):
+            input_file = gr.File(label="📁 上传文件 (支持 MP4/MP3/WAV 等)", file_types=["audio", "video"])
+            with gr.Group():
+                chk_detect = gr.Checkbox(label="启用智能对白检测 (Silero VAD)", value=True, interactive=True)
+                radio_mode = gr.Radio(["标准模式", "严格模式"], label="检测灵敏度", value="标准模式")
+            btn_run = gr.Button("🚀 开始分离处理", variant="primary", size="lg")
+            status_log = gr.Textbox(label="运行日志", placeholder="等待任务开始...", lines=8, max_lines=12)
         with gr.Column(scale=1):
+            gr.Markdown("### 🎧 分离结果下载")
+            out_a = gr.Audio(label="🎤 A轨: 纯对白 (旁白/对话)", type="filepath")
+            out_b = gr.Audio(label="🎼 B轨: 背景 (BGM + 唱歌/Rap)", type="filepath")
+            out_c = gr.Audio(label="🎹 C轨: 纯伴奏 (无任何通过)", type="filepath")
+    # 绑定事件
+    btn_run.click(
         fn=process_audio_full,
+        inputs=[input_file, radio_mode, chk_detect],
+        outputs=[out_a, out_b, out_c, status_log]
     )
 if __name__ == "__main__":
+    # 允许队列，设置最大并发
+    demo.queue(max_size=10).launch(server_name="0.0.0.0", server_port=7860, show_error=True)

requirements.txt CHANGED Viewed

@@ -1,7 +1,7 @@
-gradio==4.19.0
 torch==2.0.1
 torchaudio==2.0.2
-demucs
 librosa==0.10.1
 soundfile==0.12.1
 numpy==1.24.3

+gradio==3.50.2
 torch==2.0.1
 torchaudio==2.0.2
+demucs==4.0.1
 librosa==0.10.1
 soundfile==0.12.1
 numpy==1.24.3