Karl Yang commited on
Commit
364c8ad
·
1 Parent(s): e5e481c
Files changed (7) hide show
  1. .DS_Store +0 -0
  2. app.py +20 -30
  3. models/app.py +0 -158
  4. models/requirements.txt +0 -12
  5. models/rvc_infer.py +0 -140
  6. requirements.txt +1 -4
  7. rvc_infer.py +89 -573
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -17,7 +17,7 @@ from rvc_infer import rvc_convert
17
  PROJECT_ROOT = Path(__file__).parent
18
 
19
  SONGS_CONFIG = [
20
- {"year": 2017, "file": "outputs/爱的故事上集-孙耀威_cloned.wav",
21
  "original": "songs/爱的故事上集-孙耀威.mp3",
22
  "message": "星的光点点洒于午夜,我们的故事,从这一年开始书写 💕"},
23
  {"year": 2018, "file": "outputs/周杰伦 - 告白气球_cloned.wav",
@@ -55,16 +55,16 @@ def get_audio_path(song, version="cloned"):
55
  def convert_voice(audio_file, progress=gr.Progress()):
56
  if audio_file is None:
57
  return None, "❌ 请上传一个音频文件"
58
-
59
  progress(0.05, desc="🎵 开始处理...")
60
-
61
  with tempfile.TemporaryDirectory() as tmpdir:
62
  tmpdir = Path(tmpdir)
63
  input_path = Path(audio_file)
64
-
65
  progress(0.1, desc="步骤1: 读谱 - 分离人声和伴奏...")
66
  vocals_path, instrumental_path = separate_vocals_and_instrumental(input_path, tmpdir)
67
-
68
  if vocals_path is None:
69
  progress(0.3, desc="⚠️ 跳过分离,直接转换...")
70
  target_audio = input_path
@@ -72,10 +72,10 @@ def convert_voice(audio_file, progress=gr.Progress()):
72
  else:
73
  progress(0.4, desc="✅ 人声分离完成")
74
  target_audio = vocals_path
75
-
76
  progress(0.5, desc="步骤2: 清嗓子 - 声线转换...")
77
  converted_vocals = tmpdir / "converted.wav"
78
-
79
  model_dir = PROJECT_ROOT / "models"
80
  model_path = None
81
  for name in ["xiujia-1220-best", "xiujia-best", "xiujia"]:
@@ -83,38 +83,28 @@ def convert_voice(audio_file, progress=gr.Progress()):
83
  if test.exists():
84
  model_path = test
85
  break
86
-
87
  if model_path and model_path.exists():
88
- rvc_convert(
89
- str(target_audio),
90
- str(converted_vocals),
91
- str(model_path),
92
- index_path=str(model_dir / "xiujia-1220-best.index") if (model_dir / "xiujia-1220-best.index").exists() else None,
93
- f0_method="crepe", # Best quality
94
- f0_up_key=0, # No pitch shift
95
- index_rate=0.75, # Use index if available
96
- protect=0.33, # Protect consonants
97
- filter_radius=3, # Smooth F0
98
- )
99
  else:
100
  shutil.copy(target_audio, converted_vocals)
101
  progress(0.7, desc="⚠️ 未找到模型,使用原音")
102
-
103
  progress(0.8, desc="✅ 声线转换完成")
104
  progress(0.85, desc="步骤3: 开唱 - 合成音频...")
105
-
106
  final_output = tmpdir / "final.wav"
107
-
108
  if instrumental_path and instrumental_path.exists():
109
  merge_vocals_and_instrumental(converted_vocals, instrumental_path, final_output)
110
  else:
111
  optimize_audio(converted_vocals, final_output)
112
-
113
  result_name = f"converted_{datetime.now().strftime('%H%M%S')}.wav"
114
  result_path = PROJECT_ROOT / "outputs" / result_name
115
  result_path.parent.mkdir(exist_ok=True)
116
  shutil.copy(final_output, result_path)
117
-
118
  progress(1.0, desc="✅ 完成!")
119
  return str(result_path), "🎉 转换成功!听听看吧~"
120
 
@@ -125,13 +115,13 @@ h1, h2, h3 { color: #d63384 !important; text-align: center; }
125
 
126
  with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pink"), css=css) as demo:
127
  gr.Markdown("# 💕 9th Anniversary Celebration 💕\n### 2017 - 2025 · 九年,久远")
128
-
129
  with gr.Row():
130
  for img_name in ["couple.png", "couple1.png"]:
131
  img_path = PROJECT_ROOT / img_name
132
  if img_path.exists():
133
  gr.Image(str(img_path), show_label=False, height=220, container=False)
134
-
135
  with gr.Tab("🎵 九年歌曲集"):
136
  gr.Markdown("## 🎵 九年,唱不尽的爱")
137
  for song in SONGS_CONFIG:
@@ -144,7 +134,7 @@ with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pin
144
  gr.Audio(cloned, label="🎤 老公唱")
145
  if original:
146
  gr.Audio(original, label="🎵 原唱")
147
-
148
  with gr.Tab("🎤 上传歌曲"):
149
  gr.Markdown("## 🎤 上传MP3,我唱给你听!")
150
  with gr.Row():
@@ -155,9 +145,9 @@ with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pin
155
  with gr.Column():
156
  audio_out = gr.Audio(label="🎵 老公开唱", type="filepath")
157
  btn.click(convert_voice, [audio_in], [audio_out, status])
158
-
159
  gr.Markdown("---\n## 💝 九年不是终点,而是我们故事的第九章 💝")
160
-
161
  with gr.Row():
162
  for img_name in ["family.png", "family2.png"]:
163
  img_path = PROJECT_ROOT / img_name
@@ -165,4 +155,4 @@ with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pin
165
  gr.Image(str(img_path), show_label=False, height=220, container=False)
166
 
167
  if __name__ == "__main__":
168
- demo.launch()
 
17
  PROJECT_ROOT = Path(__file__).parent
18
 
19
  SONGS_CONFIG = [
20
+ {"year": 2017, "file": "outputs/爱的故事上集-孙耀威_cloned.wav",
21
  "original": "songs/爱的故事上集-孙耀威.mp3",
22
  "message": "星的光点点洒于午夜,我们的故事,从这一年开始书写 💕"},
23
  {"year": 2018, "file": "outputs/周杰伦 - 告白气球_cloned.wav",
 
55
  def convert_voice(audio_file, progress=gr.Progress()):
56
  if audio_file is None:
57
  return None, "❌ 请上传一个音频文件"
58
+
59
  progress(0.05, desc="🎵 开始处理...")
60
+
61
  with tempfile.TemporaryDirectory() as tmpdir:
62
  tmpdir = Path(tmpdir)
63
  input_path = Path(audio_file)
64
+
65
  progress(0.1, desc="步骤1: 读谱 - 分离人声和伴奏...")
66
  vocals_path, instrumental_path = separate_vocals_and_instrumental(input_path, tmpdir)
67
+
68
  if vocals_path is None:
69
  progress(0.3, desc="⚠️ 跳过分离,直接转换...")
70
  target_audio = input_path
 
72
  else:
73
  progress(0.4, desc="✅ 人声分离完成")
74
  target_audio = vocals_path
75
+
76
  progress(0.5, desc="步骤2: 清嗓子 - 声线转换...")
77
  converted_vocals = tmpdir / "converted.wav"
78
+
79
  model_dir = PROJECT_ROOT / "models"
80
  model_path = None
81
  for name in ["xiujia-1220-best", "xiujia-best", "xiujia"]:
 
83
  if test.exists():
84
  model_path = test
85
  break
86
+
87
  if model_path and model_path.exists():
88
+ rvc_convert(str(target_audio), str(converted_vocals), str(model_path))
 
 
 
 
 
 
 
 
 
 
89
  else:
90
  shutil.copy(target_audio, converted_vocals)
91
  progress(0.7, desc="⚠️ 未找到模型,使用原音")
92
+
93
  progress(0.8, desc="✅ 声线转换完成")
94
  progress(0.85, desc="步骤3: 开唱 - 合成音频...")
95
+
96
  final_output = tmpdir / "final.wav"
97
+
98
  if instrumental_path and instrumental_path.exists():
99
  merge_vocals_and_instrumental(converted_vocals, instrumental_path, final_output)
100
  else:
101
  optimize_audio(converted_vocals, final_output)
102
+
103
  result_name = f"converted_{datetime.now().strftime('%H%M%S')}.wav"
104
  result_path = PROJECT_ROOT / "outputs" / result_name
105
  result_path.parent.mkdir(exist_ok=True)
106
  shutil.copy(final_output, result_path)
107
+
108
  progress(1.0, desc="✅ 完成!")
109
  return str(result_path), "🎉 转换成功!听听看吧~"
110
 
 
115
 
116
  with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pink"), css=css) as demo:
117
  gr.Markdown("# 💕 9th Anniversary Celebration 💕\n### 2017 - 2025 · 九年,久远")
118
+
119
  with gr.Row():
120
  for img_name in ["couple.png", "couple1.png"]:
121
  img_path = PROJECT_ROOT / img_name
122
  if img_path.exists():
123
  gr.Image(str(img_path), show_label=False, height=220, container=False)
124
+
125
  with gr.Tab("🎵 九年歌曲集"):
126
  gr.Markdown("## 🎵 九年,唱不尽的爱")
127
  for song in SONGS_CONFIG:
 
134
  gr.Audio(cloned, label="🎤 老公唱")
135
  if original:
136
  gr.Audio(original, label="🎵 原唱")
137
+
138
  with gr.Tab("🎤 上传歌曲"):
139
  gr.Markdown("## 🎤 上传MP3,我唱给你听!")
140
  with gr.Row():
 
145
  with gr.Column():
146
  audio_out = gr.Audio(label="🎵 老公开唱", type="filepath")
147
  btn.click(convert_voice, [audio_in], [audio_out, status])
148
+
149
  gr.Markdown("---\n## 💝 九年不是终点,而是我们故事的第九章 💝")
150
+
151
  with gr.Row():
152
  for img_name in ["family.png", "family2.png"]:
153
  img_path = PROJECT_ROOT / img_name
 
155
  gr.Image(str(img_path), show_label=False, height=220, container=False)
156
 
157
  if __name__ == "__main__":
158
+ demo.launch()
models/app.py DELETED
@@ -1,158 +0,0 @@
1
- # app.py - 9th Anniversary Celebration App
2
- import gradio as gr
3
- import spaces
4
- import os
5
- import tempfile
6
- import shutil
7
- from pathlib import Path
8
- from datetime import datetime
9
-
10
- from utils import (
11
- separate_vocals_and_instrumental,
12
- merge_vocals_and_instrumental,
13
- optimize_audio,
14
- )
15
- from rvc_infer import rvc_convert
16
-
17
- PROJECT_ROOT = Path(__file__).parent
18
-
19
- SONGS_CONFIG = [
20
- {"year": 2017, "file": "outputs/爱的故事上集-孙耀威_cloned.wav",
21
- "original": "songs/爱的故事上集-孙耀威.mp3",
22
- "message": "星的光点点洒于午夜,我们的故事,从这一年开始书写 💕"},
23
- {"year": 2018, "file": "outputs/周杰伦 - 告白气球_cloned.wav",
24
- "original": "songs/周杰伦 - 告白气球.mp3",
25
- "message": "你说你有点难追,想让我知难而退。我没有退,这一年,我们更近了 ❤️"},
26
- {"year": 2019, "file": "outputs/林俊杰 - 修炼爱情_cloned.wav",
27
- "original": "songs/林俊杰 - 修炼爱情.mp3",
28
- "message": "爱情需要修炼,每一年的陪伴,都是我们爱情的见证 🌟"},
29
- {"year": 2020, "file": "outputs/周深-雪落下的声音_cloned.wav",
30
- "original": "songs/周深-雪落下的声音.mp3",
31
- "message": "就像雪花轻轻落下,你已经填满我的心 🎨"},
32
- {"year": 2021, "file": "outputs/胡夏&郁可唯-知否知否_cloned.wav",
33
- "original": "songs/胡夏&郁可唯-知否知否.mp3",
34
- "message": "知否知否,时光荏苒,但我们的爱依然如初 💖"},
35
- {"year": 2022, "file": "outputs/陈奕迅 - 陪你度过漫长岁月_cloned.wav",
36
- "original": "songs/陈奕迅 - 陪你度过漫长岁月.mp3",
37
- "message": "陪你把独自孤单,变成了勇敢 🌸"},
38
- {"year": 2023, "file": "outputs/Edd_Sheeran_-_Perfect_cloned.wav",
39
- "original": "songs/Edd_Sheeran_-_Perfect.mp3",
40
- "message": "Baby, you're perfect in my eyes ✨"},
41
- {"year": 2024, "file": "outputs/Michael_Learns_To_Rock_-_Take_Me_To_Your_Heart_Original_Version_cloned.wav",
42
- "original": "songs/Michael_Learns_To_Rock_-_Take_Me_To_Your_Heart_Original_Version.mp3",
43
- "message": "Take me to your heart, take me to your soul 🏠"},
44
- {"year": 2025, "file": "outputs/Richard_Marx-Right_here_waiting_for_you_(mp3.pm)_cloned.wav",
45
- "original": "songs/Richard_Marx-Right_here_waiting_for_you_(mp3.pm).mp3",
46
- "message": "I will be right here waiting for you. 9年了,爱依然如故 💝"},
47
- ]
48
-
49
- def get_audio_path(song, version="cloned"):
50
- key = "file" if version == "cloned" else "original"
51
- path = PROJECT_ROOT / song[key]
52
- return str(path) if path.exists() else None
53
-
54
- @spaces.GPU(duration=300)
55
- def convert_voice(audio_file, progress=gr.Progress()):
56
- if audio_file is None:
57
- return None, "❌ 请上传一个音频文件"
58
-
59
- progress(0.05, desc="🎵 开始处理...")
60
-
61
- with tempfile.TemporaryDirectory() as tmpdir:
62
- tmpdir = Path(tmpdir)
63
- input_path = Path(audio_file)
64
-
65
- progress(0.1, desc="步骤1: 读谱 - 分离人声和伴奏...")
66
- vocals_path, instrumental_path = separate_vocals_and_instrumental(input_path, tmpdir)
67
-
68
- if vocals_path is None:
69
- progress(0.3, desc="⚠️ 跳过分离,直接转换...")
70
- target_audio = input_path
71
- instrumental_path = None
72
- else:
73
- progress(0.4, desc="✅ 人声分离完成")
74
- target_audio = vocals_path
75
-
76
- progress(0.5, desc="步骤2: 清嗓子 - 声线转换...")
77
- converted_vocals = tmpdir / "converted.wav"
78
-
79
- model_dir = PROJECT_ROOT / "models"
80
- model_path = None
81
- for name in ["xiujia-1220-best", "xiujia-best", "xiujia"]:
82
- test = model_dir / f"{name}.pth"
83
- if test.exists():
84
- model_path = test
85
- break
86
-
87
- if model_path and model_path.exists():
88
- rvc_convert(str(target_audio), str(converted_vocals), str(model_path))
89
- else:
90
- shutil.copy(target_audio, converted_vocals)
91
- progress(0.7, desc="⚠️ 未找到模型,使用原音")
92
-
93
- progress(0.8, desc="✅ 声线转换完成")
94
- progress(0.85, desc="步骤3: 开唱 - 合成音频...")
95
-
96
- final_output = tmpdir / "final.wav"
97
-
98
- if instrumental_path and instrumental_path.exists():
99
- merge_vocals_and_instrumental(converted_vocals, instrumental_path, final_output)
100
- else:
101
- optimize_audio(converted_vocals, final_output)
102
-
103
- result_name = f"converted_{datetime.now().strftime('%H%M%S')}.wav"
104
- result_path = PROJECT_ROOT / "outputs" / result_name
105
- result_path.parent.mkdir(exist_ok=True)
106
- shutil.copy(final_output, result_path)
107
-
108
- progress(1.0, desc="✅ 完成!")
109
- return str(result_path), "🎉 转换成功!听听看吧~"
110
-
111
- css = """
112
- .gradio-container { background: linear-gradient(135deg, #ffeef8, #fff0f5, #ffeef8) !important; }
113
- h1, h2, h3 { color: #d63384 !important; text-align: center; }
114
- """
115
-
116
- with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pink"), css=css) as demo:
117
- gr.Markdown("# 💕 9th Anniversary Celebration 💕\n### 2017 - 2025 · 九年,久远")
118
-
119
- with gr.Row():
120
- for img_name in ["couple.png", "couple1.png"]:
121
- img_path = PROJECT_ROOT / img_name
122
- if img_path.exists():
123
- gr.Image(str(img_path), show_label=False, height=220, container=False)
124
-
125
- with gr.Tab("🎵 九年歌曲集"):
126
- gr.Markdown("## 🎵 九年,唱不尽的爱")
127
- for song in SONGS_CONFIG:
128
- with gr.Accordion(f"💗 {song['year']} 年", open=False):
129
- gr.Markdown(f"*{song['message']}*")
130
- with gr.Row():
131
- cloned = get_audio_path(song, "cloned")
132
- original = get_audio_path(song, "original")
133
- if cloned:
134
- gr.Audio(cloned, label="🎤 老公唱")
135
- if original:
136
- gr.Audio(original, label="🎵 原唱")
137
-
138
- with gr.Tab("🎤 上传歌曲"):
139
- gr.Markdown("## 🎤 上传MP3,我唱给你听!")
140
- with gr.Row():
141
- with gr.Column():
142
- audio_in = gr.Audio(label="选择歌曲 🎵", type="filepath", sources=["upload"])
143
- btn = gr.Button("✨ 开始转换", variant="primary", size="lg")
144
- status = gr.Textbox(label="状态", interactive=False)
145
- with gr.Column():
146
- audio_out = gr.Audio(label="🎵 老公开唱", type="filepath")
147
- btn.click(convert_voice, [audio_in], [audio_out, status])
148
-
149
- gr.Markdown("---\n## 💝 九年不是终点,而是我们故事的第九章 💝")
150
-
151
- with gr.Row():
152
- for img_name in ["family.png", "family2.png"]:
153
- img_path = PROJECT_ROOT / img_name
154
- if img_path.exists():
155
- gr.Image(str(img_path), show_label=False, height=220, container=False)
156
-
157
- if __name__ == "__main__":
158
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/requirements.txt DELETED
@@ -1,12 +0,0 @@
1
- spaces>=0.19.0
2
- torch>=2.0.0
3
- torchaudio
4
- demucs
5
- numpy
6
- scipy
7
- pydub
8
- soundfile
9
- librosa
10
- pyworld
11
- gradio
12
- huggingface_hub==0.22.2
 
 
 
 
 
 
 
 
 
 
 
 
 
models/rvc_infer.py DELETED
@@ -1,140 +0,0 @@
1
- # rvc_infer.py - RVC inference for Hugging Face Spaces
2
- """
3
- Simplified RVC (Retrieval-based Voice Conversion) inference
4
- Works with ZeroGPU on Hugging Face Spaces
5
- """
6
-
7
- import os
8
- import sys
9
- import torch
10
- import numpy as np
11
- import soundfile as sf
12
- from pathlib import Path
13
- import traceback
14
-
15
- def rvc_convert(
16
- input_path: str,
17
- output_path: str,
18
- model_path: str,
19
- index_path: str = None,
20
- f0_method: str = "harvest",
21
- f0_up_key: int = 0,
22
- index_rate: float = 0.75,
23
- ):
24
- """
25
- Convert voice using RVC model with pitch modification
26
-
27
- Args:
28
- input_path: Input audio file
29
- output_path: Output audio file
30
- model_path: Path to .pth model file
31
- index_path: Path to .index file (optional)
32
- f0_method: Pitch extraction method
33
- f0_up_key: Pitch shift in semitones
34
- index_rate: Index influence rate
35
-
36
- Returns:
37
- bool: Success status
38
- """
39
- try:
40
- import pyworld as pw
41
- import librosa
42
-
43
- print(f"🎤 RVC Conversion starting...")
44
- print(f" Input: {input_path}")
45
- print(f" Model: {model_path}")
46
-
47
- # Check if model exists
48
- if not Path(model_path).exists():
49
- raise FileNotFoundError(f"Model not found: {model_path}")
50
-
51
- # Load audio
52
- audio, sr = librosa.load(input_path, sr=None)
53
- if len(audio.shape) > 1:
54
- audio = audio.mean(axis=1)
55
-
56
- # Resample to 16kHz if needed
57
- if sr != 16000:
58
- audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
59
- sr = 16000
60
-
61
- print(f" Audio: {len(audio)/sr:.2f}s @ {sr}Hz")
62
-
63
- # Convert to float64 for pyworld
64
- audio_f64 = audio.astype(np.float64)
65
-
66
- # Extract features using pyworld
67
- print(f" Extracting pitch ({f0_method})...")
68
-
69
- if f0_method == "harvest":
70
- f0, t = pw.harvest(audio_f64, sr, frame_period=10)
71
- else:
72
- f0, t = pw.dio(audio_f64, sr, frame_period=10)
73
- f0 = pw.stonemask(audio_f64, f0, t, sr)
74
-
75
- sp = pw.cheaptrick(audio_f64, f0, t, sr)
76
- ap = pw.d4c(audio_f64, f0, t, sr)
77
-
78
- # Apply pitch shift
79
- if f0_up_key != 0:
80
- print(f" Applying pitch shift: {f0_up_key} semitones")
81
- f0 = f0 * (2 ** (f0_up_key / 12))
82
-
83
- # Synthesize
84
- print(f" Synthesizing...")
85
- output_audio = pw.synthesize(f0, sp, ap, sr)
86
- output_audio = output_audio.astype(np.float32)
87
-
88
- # Normalize
89
- max_val = np.abs(output_audio).max()
90
- if max_val > 0:
91
- output_audio = output_audio / max_val * 0.95
92
-
93
- # Resample back to 44100 for output
94
- output_audio = librosa.resample(output_audio, orig_sr=sr, target_sr=44100)
95
-
96
- # Save
97
- output_path = Path(output_path)
98
- output_path.parent.mkdir(parents=True, exist_ok=True)
99
- sf.write(str(output_path), output_audio, 44100)
100
-
101
- print(f" ✅ Conversion complete!")
102
- return True
103
-
104
- except Exception as e:
105
- print(f" ❌ RVC failed: {e}")
106
- traceback.print_exc()
107
-
108
- # Fallback: copy input to output
109
- try:
110
- import shutil
111
- shutil.copy(input_path, output_path)
112
- print(f" ⚠️ Fallback: using original audio")
113
- return True
114
- except:
115
- return False
116
-
117
-
118
- if __name__ == "__main__":
119
- import argparse
120
-
121
- parser = argparse.ArgumentParser()
122
- parser.add_argument("--input_path", required=True)
123
- parser.add_argument("--output_path", required=True)
124
- parser.add_argument("--model_path", required=True)
125
- parser.add_argument("--index_path", default=None)
126
- parser.add_argument("--f0_method", default="harvest")
127
- parser.add_argument("--f0_up_key", type=int, default=0)
128
-
129
- args = parser.parse_args()
130
-
131
- success = rvc_convert(
132
- args.input_path,
133
- args.output_path,
134
- args.model_path,
135
- args.index_path,
136
- args.f0_method,
137
- args.f0_up_key,
138
- )
139
-
140
- sys.exit(0 if success else 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -9,7 +9,4 @@ soundfile
9
  librosa
10
  pyworld
11
  gradio
12
- huggingface_hub==0.22.2
13
- faiss-cpu
14
- torchcrepe
15
- praat-parselmouth
 
9
  librosa
10
  pyworld
11
  gradio
12
+ huggingface_hub==0.22.2
 
 
 
rvc_infer.py CHANGED
@@ -1,624 +1,140 @@
1
- # rvc_infer.py - Complete RVC Inference for Hugging Face Spaces
2
  """
3
- Retrieval-based Voice Conversion inference module
4
  Works with ZeroGPU on Hugging Face Spaces
5
-
6
- Dependencies: torch, torchaudio, numpy, scipy, librosa, soundfile,
7
- pyworld, torchcrepe, faiss-cpu, praat-parselmouth
8
  """
9
 
10
  import os
11
  import sys
12
  import torch
13
- import torch.nn as nn
14
- import torch.nn.functional as F
15
  import numpy as np
16
  import soundfile as sf
17
  from pathlib import Path
18
  import traceback
19
- import librosa
20
- from scipy import signal
21
- from typing import Optional, Tuple, Union
22
-
23
- # ============================================================
24
- # Configuration
25
- # ============================================================
26
-
27
- class Config:
28
- def __init__(self):
29
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
30
- self.is_half = False # Use float32 for compatibility
31
- self.sample_rate = 16000
32
- self.hop_size = 160
33
- self.f0_min = 50
34
- self.f0_max = 1100
35
-
36
- config = Config()
37
-
38
- # ============================================================
39
- # F0 Extraction Methods
40
- # ============================================================
41
-
42
- def extract_f0_crepe(audio: np.ndarray, sr: int = 16000, hop_length: int = 160,
43
- f0_min: int = 50, f0_max: int = 1100, device: str = "cuda") -> np.ndarray:
44
- """Extract F0 using CREPE (high quality)"""
45
- try:
46
- import torchcrepe
47
-
48
- audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(device)
49
-
50
- f0, periodicity = torchcrepe.predict(
51
- audio_tensor, sr,
52
- hop_length=hop_length,
53
- fmin=f0_min,
54
- fmax=f0_max,
55
- model='full',
56
- decoder=torchcrepe.decode.viterbi,
57
- return_periodicity=True,
58
- device=device,
59
- batch_size=512
60
- )
61
-
62
- # Filter and clean up
63
- periodicity = torchcrepe.filter.median(periodicity, 3)
64
- f0 = torchcrepe.filter.mean(f0, 3)
65
- f0[periodicity < 0.1] = 0
66
-
67
- return f0.squeeze().cpu().numpy()
68
- except Exception as e:
69
- print(f" CREPE failed: {e}, falling back to harvest")
70
- return extract_f0_harvest(audio, sr)
71
-
72
-
73
- def extract_f0_harvest(audio: np.ndarray, sr: int = 16000) -> np.ndarray:
74
- """Extract F0 using Harvest (pyworld)"""
75
- import pyworld as pw
76
-
77
- audio_f64 = audio.astype(np.float64)
78
- f0, t = pw.harvest(
79
- audio_f64, sr,
80
- f0_floor=50.0,
81
- f0_ceil=1100.0,
82
- frame_period=10.0
83
- )
84
- return f0.astype(np.float32)
85
-
86
-
87
- def extract_f0_dio(audio: np.ndarray, sr: int = 16000) -> np.ndarray:
88
- """Extract F0 using DIO (pyworld) - faster but less accurate"""
89
- import pyworld as pw
90
-
91
- audio_f64 = audio.astype(np.float64)
92
- f0, t = pw.dio(audio_f64, sr, frame_period=10.0)
93
- f0 = pw.stonemask(audio_f64, f0, t, sr)
94
- return f0.astype(np.float32)
95
-
96
-
97
- def extract_f0_parselmouth(audio: np.ndarray, sr: int = 16000,
98
- f0_min: int = 50, f0_max: int = 1100) -> np.ndarray:
99
- """Extract F0 using Parselmouth (Praat)"""
100
- try:
101
- import parselmouth
102
-
103
- sound = parselmouth.Sound(audio, sampling_frequency=sr)
104
- pitch = sound.to_pitch_ac(
105
- time_step=0.01,
106
- pitch_floor=f0_min,
107
- pitch_ceiling=f0_max
108
- )
109
-
110
- f0 = pitch.selected_array['frequency']
111
- f0[f0 == 0] = np.nan
112
- f0 = np.nan_to_num(f0, nan=0.0)
113
-
114
- return f0.astype(np.float32)
115
- except Exception as e:
116
- print(f" Parselmouth failed: {e}")
117
- return extract_f0_harvest(audio, sr)
118
-
119
-
120
- def extract_f0(audio: np.ndarray, sr: int = 16000, method: str = "crepe",
121
- f0_up_key: int = 0, device: str = "cuda") -> np.ndarray:
122
- """
123
- Extract F0 using specified method
124
-
125
- Args:
126
- audio: Input audio (mono, float32)
127
- sr: Sample rate
128
- method: One of 'crepe', 'rmvpe', 'harvest', 'dio', 'pm'
129
- f0_up_key: Pitch shift in semitones
130
- device: Device for neural methods
131
-
132
- Returns:
133
- F0 array
134
- """
135
- method = method.lower()
136
-
137
- if method in ["crepe", "rmvpe", "mangio-crepe"]:
138
- f0 = extract_f0_crepe(audio, sr, device=device)
139
- elif method == "harvest":
140
- f0 = extract_f0_harvest(audio, sr)
141
- elif method in ["dio", "pm"]:
142
- f0 = extract_f0_dio(audio, sr)
143
- elif method == "parselmouth":
144
- f0 = extract_f0_parselmouth(audio, sr)
145
- else:
146
- print(f" Unknown F0 method '{method}', using harvest")
147
- f0 = extract_f0_harvest(audio, sr)
148
-
149
- # Apply pitch shift
150
- if f0_up_key != 0:
151
- f0[f0 > 0] = f0[f0 > 0] * (2 ** (f0_up_key / 12))
152
-
153
- return f0
154
-
155
- # ============================================================
156
- # Index Loading (FAISS)
157
- # ============================================================
158
-
159
- _index_cache = {}
160
-
161
- def load_index(index_path: Optional[str]) -> Tuple[Optional[object], Optional[np.ndarray]]:
162
- """
163
- Load FAISS index for feature retrieval
164
-
165
- Returns:
166
- (index, big_npy) tuple
167
- """
168
- if not index_path or not Path(index_path).exists():
169
- return None, None
170
-
171
- if index_path in _index_cache:
172
- return _index_cache[index_path]
173
-
174
- try:
175
- import faiss
176
-
177
- print(f" Loading index: {index_path}")
178
- index = faiss.read_index(str(index_path))
179
-
180
- # Try to reconstruct vectors from index or load from .npy
181
- big_npy = None
182
-
183
- # Check for .npy file with same name
184
- npy_path = Path(index_path).with_suffix('.npy')
185
- if npy_path.exists():
186
- big_npy = np.load(str(npy_path))
187
- print(f" Loaded npy: {big_npy.shape}")
188
- else:
189
- # Try common naming patterns
190
- for pattern in ['total_fea.npy', 'big_npy.npy']:
191
- alt_path = Path(index_path).parent / pattern
192
- if alt_path.exists():
193
- big_npy = np.load(str(alt_path))
194
- print(f" Loaded npy from {pattern}: {big_npy.shape}")
195
- break
196
-
197
- if big_npy is None:
198
- # Try to reconstruct from index
199
- try:
200
- n_vectors = index.ntotal
201
- dim = index.d
202
- big_npy = np.zeros((n_vectors, dim), dtype=np.float32)
203
- for i in range(n_vectors):
204
- big_npy[i] = index.reconstruct(i)
205
- print(f" Reconstructed {n_vectors} vectors from index")
206
- except:
207
- print(" Warning: Could not load/reconstruct feature vectors")
208
-
209
- _index_cache[index_path] = (index, big_npy)
210
- return index, big_npy
211
-
212
- except ImportError:
213
- print(" Warning: faiss not installed, index retrieval disabled")
214
- return None, None
215
- except Exception as e:
216
- print(f" Failed to load index: {e}")
217
- return None, None
218
-
219
-
220
- def index_retrieval(feats: np.ndarray, index, big_npy: np.ndarray,
221
- index_rate: float = 0.75, k: int = 8) -> np.ndarray:
222
- """
223
- Apply index-based feature retrieval
224
-
225
- Args:
226
- feats: Input features [T, D]
227
- index: FAISS index
228
- big_npy: Feature vectors
229
- index_rate: Mixing rate (0-1)
230
- k: Number of neighbors
231
-
232
- Returns:
233
- Mixed features
234
- """
235
- if index is None or big_npy is None or index_rate <= 0:
236
- return feats
237
-
238
- try:
239
- # Ensure correct dtype
240
- feats = feats.astype(np.float32)
241
-
242
- # Search for nearest neighbors
243
- scores, indices = index.search(feats, k=k)
244
-
245
- # Compute weighted average of retrieved features
246
- weights = np.exp(-scores / 20)
247
- weights = weights / weights.sum(axis=1, keepdims=True)
248
-
249
- # Gather retrieved features
250
- retrieved = np.zeros_like(feats)
251
- for i in range(len(feats)):
252
- for j in range(k):
253
- idx = indices[i, j]
254
- if 0 <= idx < len(big_npy):
255
- retrieved[i] += weights[i, j] * big_npy[idx]
256
-
257
- # Mix original and retrieved
258
- mixed = (1 - index_rate) * feats + index_rate * retrieved
259
- return mixed
260
-
261
- except Exception as e:
262
- print(f" Index retrieval error: {e}")
263
- return feats
264
-
265
-
266
- # ============================================================
267
- # Audio Feature Extraction
268
- # ============================================================
269
-
270
- def extract_features_simple(audio: np.ndarray, sr: int = 16000,
271
- n_fft: int = 1024, hop_length: int = 160) -> np.ndarray:
272
- """Extract mel spectrogram features (fallback method)"""
273
- mel = librosa.feature.melspectrogram(
274
- y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length,
275
- n_mels=128, fmin=0, fmax=sr//2
276
- )
277
- mel_db = librosa.power_to_db(mel, ref=np.max)
278
- return mel_db.T # [T, 128]
279
-
280
-
281
- def extract_contentvec_features(audio: np.ndarray, sr: int = 16000,
282
- device: str = "cuda") -> np.ndarray:
283
- """
284
- Extract ContentVec/HuBERT-like features using torchaudio
285
- Falls back to mel features if unavailable
286
- """
287
- try:
288
- import torchaudio
289
- from torchaudio.pipelines import HUBERT_BASE
290
-
291
- # Load HuBERT model
292
- bundle = HUBERT_BASE
293
- model = bundle.get_model().to(device).eval()
294
-
295
- # Resample if needed
296
- if sr != bundle.sample_rate:
297
- audio = librosa.resample(audio, orig_sr=sr, target_sr=bundle.sample_rate)
298
-
299
- # Extract features
300
- with torch.no_grad():
301
- audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(device)
302
- features, _ = model.extract_features(audio_tensor)
303
- feats = features[-1].squeeze(0).cpu().numpy()
304
-
305
- return feats # [T, 768]
306
-
307
- except Exception as e:
308
- print(f" HuBERT extraction failed: {e}, using mel features")
309
- return extract_features_simple(audio, sr)
310
-
311
- # ============================================================
312
- # Spectral Processing for Voice Conversion
313
- # ============================================================
314
-
315
- def get_spectral_envelope(audio: np.ndarray, sr: int, f0: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
316
- """Extract spectral envelope and aperiodicity using PyWorld"""
317
- import pyworld as pw
318
-
319
- audio_f64 = audio.astype(np.float64)
320
-
321
- if f0 is None or len(f0) == 0:
322
- f0, t = pw.harvest(audio_f64, sr, frame_period=10.0)
323
- else:
324
- t = np.arange(len(f0)) * 0.01
325
-
326
- f0 = f0.astype(np.float64)
327
- sp = pw.cheaptrick(audio_f64, f0, t, sr)
328
- ap = pw.d4c(audio_f64, f0, t, sr)
329
-
330
- return sp, ap, t
331
-
332
-
333
- def modify_spectral_envelope(sp: np.ndarray, formant_shift: float = 1.0) -> np.ndarray:
334
- """Modify spectral envelope for voice character change"""
335
- if formant_shift == 1.0:
336
- return sp
337
-
338
- T, freq_bins = sp.shape
339
- new_sp = np.zeros_like(sp)
340
-
341
- for t in range(T):
342
- old_freqs = np.arange(freq_bins)
343
- new_freqs = old_freqs * formant_shift
344
- new_sp[t] = np.interp(old_freqs, new_freqs, sp[t], left=sp[t, 0], right=sp[t, -1])
345
-
346
- return new_sp
347
-
348
-
349
- def smooth_f0(f0: np.ndarray, filter_radius: int = 3) -> np.ndarray:
350
- """Smooth F0 contour using median filter"""
351
- if filter_radius <= 0:
352
- return f0
353
-
354
- from scipy.ndimage import median_filter
355
-
356
- voiced_mask = f0 > 0
357
- if not np.any(voiced_mask):
358
- return f0
359
-
360
- f0_smoothed = f0.copy()
361
- f0_smoothed[voiced_mask] = median_filter(f0[voiced_mask], size=filter_radius * 2 + 1)
362
-
363
- return f0_smoothed
364
-
365
-
366
- def apply_protect(output: np.ndarray, original: np.ndarray,
367
- f0: np.ndarray, protect: float = 0.33) -> np.ndarray:
368
- """Protect consonants/unvoiced regions by blending with original"""
369
- if protect <= 0 or len(output) != len(original):
370
- return output
371
-
372
- hop_length = len(original) // len(f0)
373
- unvoiced_mask = np.repeat(f0 == 0, hop_length)
374
-
375
- if len(unvoiced_mask) > len(output):
376
- unvoiced_mask = unvoiced_mask[:len(output)]
377
- elif len(unvoiced_mask) < len(output):
378
- unvoiced_mask = np.pad(unvoiced_mask, (0, len(output) - len(unvoiced_mask)), mode='edge')
379
-
380
- from scipy.ndimage import gaussian_filter1d
381
- mask_smooth = gaussian_filter1d(unvoiced_mask.astype(float), sigma=100)
382
-
383
- protected = output * (1 - mask_smooth * protect) + original * (mask_smooth * protect)
384
-
385
- return protected.astype(np.float32)
386
-
387
- # ============================================================
388
- # Main RVC Conversion Function
389
- # ============================================================
390
 
391
  def rvc_convert(
392
  input_path: str,
393
  output_path: str,
394
  model_path: str,
395
- index_path: Optional[str] = None,
396
- f0_method: str = "crepe",
397
  f0_up_key: int = 0,
398
  index_rate: float = 0.75,
399
- protect: float = 0.33,
400
- filter_radius: int = 3,
401
- rms_mix_rate: float = 0.25,
402
- resample_sr: int = 0,
403
- formant_shift: float = 1.0,
404
- ) -> bool:
405
  """
406
- Convert voice using RVC-style processing
407
-
408
  Args:
409
- input_path: Path to input audio file
410
- output_path: Path to save converted audio
411
  model_path: Path to .pth model file
412
- index_path: Path to .index file (optional, improves quality)
413
- f0_method: F0 extraction method ('crepe', 'harvest', 'dio', 'pm')
414
- f0_up_key: Pitch shift in semitones (-12 to +12)
415
- index_rate: How much to use index features (0-1)
416
- protect: Consonant protection amount (0-0.5)
417
- filter_radius: F0 smoothing radius (0-7)
418
- rms_mix_rate: Volume envelope mixing (0-1)
419
- resample_sr: Output sample rate (0 = auto 44100)
420
- formant_shift: Formant shift ratio (0.5-2.0, 1.0 = no change)
421
-
422
  Returns:
423
- True if successful, False otherwise
424
  """
425
  try:
426
  import pyworld as pw
427
-
428
- device = config.device
429
- print(f"🎤 RVC Voice Conversion")
430
- print(f" Device: {device}")
431
  print(f" Input: {input_path}")
432
  print(f" Model: {model_path}")
433
- print(f" Index: {index_path or 'None (quality may be reduced)'}")
434
- print(f" Settings: f0_method={f0_method}, pitch={f0_up_key}, index_rate={index_rate}")
435
-
436
- # Validate inputs
437
- if not Path(input_path).exists():
438
- raise FileNotFoundError(f"Input not found: {input_path}")
439
  if not Path(model_path).exists():
440
  raise FileNotFoundError(f"Model not found: {model_path}")
441
-
442
- # ========================================
443
- # Step 1: Load and preprocess audio
444
- # ========================================
445
- print(" [1/6] Loading audio...")
446
-
447
- audio, sr = librosa.load(input_path, sr=16000, mono=True)
448
- audio = audio.astype(np.float32)
449
-
450
- audio_max = np.abs(audio).max()
451
- if audio_max > 1.0:
452
- audio = audio / audio_max
453
-
454
- original_rms = np.sqrt(np.mean(audio ** 2))
455
- original_audio = audio.copy()
456
-
457
- duration = len(audio) / sr
458
- print(f" Audio loaded: {duration:.2f}s @ {sr}Hz")
459
-
460
- # ========================================
461
- # Step 2: Extract F0 (pitch)
462
- # ========================================
463
- print(f" [2/6] Extracting F0 ({f0_method})...")
464
-
465
- f0 = extract_f0(audio, sr, method=f0_method, f0_up_key=f0_up_key, device=device)
466
-
467
- if filter_radius > 0:
468
- f0 = smooth_f0(f0, filter_radius)
469
-
470
- voiced_f0 = f0[f0 > 0]
471
- if len(voiced_f0) > 0:
472
- print(f" F0 extracted: {len(f0)} frames, range [{voiced_f0.min():.1f}-{voiced_f0.max():.1f}] Hz")
473
  else:
474
- print(" F0 extracted (no voiced frames detected)")
475
-
476
- # ========================================
477
- # Step 3: Load index and extract features
478
- # ========================================
479
- print(" [3/6] Processing features...")
480
-
481
- index, big_npy = load_index(index_path)
482
-
483
- if index is not None and big_npy is not None:
484
- feats = extract_contentvec_features(audio, sr, device)
485
-
486
- if feats.shape[1] != big_npy.shape[1]:
487
- print(f" Feature dim mismatch: {feats.shape[1]} vs {big_npy.shape[1]}, skipping index")
488
- else:
489
- feats = index_retrieval(feats, index, big_npy, index_rate)
490
- print(f" Index applied: {feats.shape}")
491
-
492
- # ========================================
493
- # Step 4: Extract and modify spectral envelope
494
- # ========================================
495
- print(" [4/6] Processing spectral envelope...")
496
-
497
- sp, ap, t = get_spectral_envelope(audio, sr, f0)
498
-
499
- if formant_shift != 1.0:
500
- sp = modify_spectral_envelope(sp, formant_shift)
501
- print(f" Formant shift applied: {formant_shift}")
502
-
503
- # ========================================
504
- # Step 5: Synthesize with PyWorld
505
- # ========================================
506
- print(" [5/6] Synthesizing audio...")
507
-
508
- if len(f0) != len(sp):
509
- f0 = np.interp(
510
- np.linspace(0, 1, len(sp)),
511
- np.linspace(0, 1, len(f0)),
512
- f0
513
- )
514
-
515
- f0_synth = f0.astype(np.float64)
516
- output_audio = pw.synthesize(f0_synth, sp, ap, sr)
517
  output_audio = output_audio.astype(np.float32)
518
-
519
- # ========================================
520
- # Step 6: Post-processing
521
- # ========================================
522
- print(" [6/6] Post-processing...")
523
-
524
- if protect > 0:
525
- if len(original_audio) != len(output_audio):
526
- original_resampled = librosa.resample(
527
- original_audio,
528
- orig_sr=sr,
529
- target_sr=int(sr * len(output_audio) / len(original_audio))
530
- )
531
- if len(original_resampled) > len(output_audio):
532
- original_resampled = original_resampled[:len(output_audio)]
533
- elif len(original_resampled) < len(output_audio):
534
- original_resampled = np.pad(original_resampled, (0, len(output_audio) - len(original_resampled)))
535
- else:
536
- original_resampled = original_audio
537
-
538
- output_audio = apply_protect(output_audio, original_resampled, f0, protect)
539
-
540
- if rms_mix_rate > 0:
541
- output_rms = np.sqrt(np.mean(output_audio ** 2))
542
- if output_rms > 0:
543
- target_rms = (1 - rms_mix_rate) * output_rms + rms_mix_rate * original_rms
544
- output_audio = output_audio * (target_rms / output_rms)
545
-
546
  max_val = np.abs(output_audio).max()
547
- if max_val > 0.99:
548
  output_audio = output_audio / max_val * 0.95
549
-
550
- output_sr = resample_sr if resample_sr > 0 else 44100
551
- if output_sr != sr:
552
- output_audio = librosa.resample(output_audio, orig_sr=sr, target_sr=output_sr)
553
-
554
- # ========================================
555
- # Save output
556
- # ========================================
557
  output_path = Path(output_path)
558
  output_path.parent.mkdir(parents=True, exist_ok=True)
559
- sf.write(str(output_path), output_audio, output_sr)
560
-
561
- output_duration = len(output_audio) / output_sr
562
  print(f" ✅ Conversion complete!")
563
- print(f" Output: {output_path} ({output_duration:.2f}s @ {output_sr}Hz)")
564
-
565
  return True
566
-
567
  except Exception as e:
568
- print(f" ❌ Conversion failed: {e}")
569
  traceback.print_exc()
570
-
 
571
  try:
572
  import shutil
573
  shutil.copy(input_path, output_path)
574
- print(f" ⚠️ Fallback: copied original audio to output")
575
  return True
576
- except Exception as e2:
577
- print(f" ❌ Fallback also failed: {e2}")
578
  return False
579
 
580
- # ============================================================
581
- # Command Line Interface
582
- # ============================================================
583
 
584
  if __name__ == "__main__":
585
  import argparse
586
-
587
- parser = argparse.ArgumentParser(description="RVC Voice Conversion")
588
- parser.add_argument("--input_path", "-i", required=True, help="Input audio file")
589
- parser.add_argument("--output_path", "-o", required=True, help="Output audio file")
590
- parser.add_argument("--model_path", "-m", required=True, help="Path to .pth model")
591
- parser.add_argument("--index_path", "-x", default=None, help="Path to .index file")
592
- parser.add_argument("--f0_method", "-f", default="crepe",
593
- choices=["crepe", "rmvpe", "harvest", "dio", "pm"],
594
- help="F0 extraction method")
595
- parser.add_argument("--f0_up_key", "-k", type=int, default=0,
596
- help="Pitch shift in semitones")
597
- parser.add_argument("--index_rate", "-r", type=float, default=0.75,
598
- help="Index feature rate (0-1)")
599
- parser.add_argument("--protect", "-p", type=float, default=0.33,
600
- help="Consonant protection (0-0.5)")
601
- parser.add_argument("--filter_radius", type=int, default=3,
602
- help="F0 filter radius (0-7)")
603
- parser.add_argument("--rms_mix_rate", type=float, default=0.25,
604
- help="Volume envelope mix (0-1)")
605
- parser.add_argument("--resample_sr", type=int, default=0,
606
- help="Output sample rate (0=auto)")
607
-
608
  args = parser.parse_args()
609
-
610
  success = rvc_convert(
611
- input_path=args.input_path,
612
- output_path=args.output_path,
613
- model_path=args.model_path,
614
- index_path=args.index_path,
615
- f0_method=args.f0_method,
616
- f0_up_key=args.f0_up_key,
617
- index_rate=args.index_rate,
618
- protect=args.protect,
619
- filter_radius=args.filter_radius,
620
- rms_mix_rate=args.rms_mix_rate,
621
- resample_sr=args.resample_sr,
622
  )
623
-
624
- sys.exit(0 if success else 1)
 
1
+ # rvc_infer.py - RVC inference for Hugging Face Spaces
2
  """
3
+ Simplified RVC (Retrieval-based Voice Conversion) inference
4
  Works with ZeroGPU on Hugging Face Spaces
 
 
 
5
  """
6
 
7
  import os
8
  import sys
9
  import torch
 
 
10
  import numpy as np
11
  import soundfile as sf
12
  from pathlib import Path
13
  import traceback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
def rvc_convert(
    input_path: str,
    output_path: str,
    model_path: str,
    index_path: str = None,
    f0_method: str = "harvest",
    f0_up_key: int = 0,
    index_rate: float = 0.75,
) -> bool:
    """
    Convert voice with pitch modification via PyWorld analysis/resynthesis.

    NOTE(review): despite the name, the .pth model is only checked for
    existence — its weights are never loaded or applied. The audio is
    analysed and resynthesised with PyWorld (optionally pitch-shifted),
    so the output keeps the source timbre. Confirm whether full RVC
    inference is intended here.

    Args:
        input_path: Input audio file.
        output_path: Output audio file (written as 44.1 kHz WAV).
        model_path: Path to .pth model file (existence check only; see NOTE).
        index_path: Path to .index file (accepted but currently unused).
        f0_method: Pitch extraction method; "harvest" uses pw.harvest,
            anything else falls back to pw.dio refined by pw.stonemask.
        f0_up_key: Pitch shift in semitones.
        index_rate: Index influence rate (accepted but currently unused).

    Returns:
        bool: True on success, or when the fallback copy of the original
        audio succeeds; False only if even the fallback copy fails.
    """
    try:
        import pyworld as pw
        import librosa

        print(f"🎤 RVC Conversion starting...")
        print(f"   Input: {input_path}")
        print(f"   Model: {model_path}")

        # Check if model exists
        if not Path(model_path).exists():
            raise FileNotFoundError(f"Model not found: {model_path}")

        # Load audio; librosa returns mono float32 by default, so the
        # branch below only triggers if a caller disables mono mixing.
        audio, sr = librosa.load(input_path, sr=None)
        if len(audio.shape) > 1:
            # librosa stacks channels first: shape is (channels, samples).
            # Down-mix along axis 0 — axis=1 would average the samples of
            # each channel and return a length-`channels` array.
            audio = audio.mean(axis=0)

        # Resample to 16kHz if needed
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
            sr = 16000

        print(f"   Audio: {len(audio)/sr:.2f}s @ {sr}Hz")

        # PyWorld requires float64 input
        audio_f64 = audio.astype(np.float64)

        # Extract features using pyworld
        print(f"   Extracting pitch ({f0_method})...")

        if f0_method == "harvest":
            f0, t = pw.harvest(audio_f64, sr, frame_period=10)
        else:
            f0, t = pw.dio(audio_f64, sr, frame_period=10)
            f0 = pw.stonemask(audio_f64, f0, t, sr)  # refine coarse DIO F0

        sp = pw.cheaptrick(audio_f64, f0, t, sr)  # spectral envelope
        ap = pw.d4c(audio_f64, f0, t, sr)         # aperiodicity

        # Apply pitch shift: multiplying F0 by 2^(k/12) shifts k semitones
        if f0_up_key != 0:
            print(f"   Applying pitch shift: {f0_up_key} semitones")
            f0 = f0 * (2 ** (f0_up_key / 12))

        # Synthesize
        print(f"   Synthesizing...")
        output_audio = pw.synthesize(f0, sp, ap, sr)
        output_audio = output_audio.astype(np.float32)

        # Peak-normalize with a little headroom (0.95 of full scale)
        max_val = np.abs(output_audio).max()
        if max_val > 0:
            output_audio = output_audio / max_val * 0.95

        # Resample back to 44100 for output
        output_audio = librosa.resample(output_audio, orig_sr=sr, target_sr=44100)

        # Save
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        sf.write(str(output_path), output_audio, 44100)

        print(f"   ✅ Conversion complete!")
        return True

    except Exception as e:
        print(f"   ❌ RVC failed: {e}")
        traceback.print_exc()

        # Fallback: copy input to output so the pipeline keeps going
        try:
            import shutil
            shutil.copy(input_path, output_path)
            print(f"   ⚠️ Fallback: using original audio")
            return True
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate out of the fallback path.
            return False
116
 
 
 
 
117
 
118
if __name__ == "__main__":
    import argparse

    # CLI wrapper around rvc_convert; exits 0 on success, 1 on failure.
    parser = argparse.ArgumentParser(description="RVC voice conversion (PyWorld-based)")
    parser.add_argument("--input_path", required=True, help="Input audio file")
    parser.add_argument("--output_path", required=True, help="Output audio file")
    parser.add_argument("--model_path", required=True, help="Path to .pth model file")
    parser.add_argument("--index_path", default=None, help="Path to .index file (optional)")
    parser.add_argument("--f0_method", default="harvest", help="Pitch extraction method")
    parser.add_argument("--f0_up_key", type=int, default=0, help="Pitch shift in semitones")
    # rvc_convert accepts index_rate but the old CLI never exposed it;
    # default matches the function's own default, so behavior is unchanged.
    parser.add_argument("--index_rate", type=float, default=0.75,
                        help="Index influence rate (0-1)")

    args = parser.parse_args()

    # Keyword arguments guard against positional drift if the signature
    # of rvc_convert ever changes.
    success = rvc_convert(
        input_path=args.input_path,
        output_path=args.output_path,
        model_path=args.model_path,
        index_path=args.index_path,
        f0_method=args.f0_method,
        f0_up_key=args.f0_up_key,
        index_rate=args.index_rate,
    )

    sys.exit(0 if success else 1)