Karl Yang commited on
Commit
364c8ad
·
1 Parent(s): e5e481c
Files changed (7) hide show
  1. .DS_Store +0 -0
  2. app.py +20 -30
  3. models/app.py +0 -158
  4. models/requirements.txt +0 -12
  5. models/rvc_infer.py +0 -140
  6. requirements.txt +1 -4
  7. rvc_infer.py +89 -573
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
app.py CHANGED
@@ -17,7 +17,7 @@ from rvc_infer import rvc_convert
17
  PROJECT_ROOT = Path(__file__).parent
18
 
19
  SONGS_CONFIG = [
20
- {"year": 2017, "file": "outputs/爱的故事上集-孙耀威_cloned.wav",
21
  "original": "songs/爱的故事上集-孙耀威.mp3",
22
  "message": "星的光点点洒于午夜,我们的故事,从这一年开始书写 💕"},
23
  {"year": 2018, "file": "outputs/周杰伦 - 告白气球_cloned.wav",
@@ -55,16 +55,16 @@ def get_audio_path(song, version="cloned"):
55
  def convert_voice(audio_file, progress=gr.Progress()):
56
  if audio_file is None:
57
  return None, "❌ 请上传一个音频文件"
58
-
59
  progress(0.05, desc="🎵 开始处理...")
60
-
61
  with tempfile.TemporaryDirectory() as tmpdir:
62
  tmpdir = Path(tmpdir)
63
  input_path = Path(audio_file)
64
-
65
  progress(0.1, desc="步骤1: 读谱 - 分离人声和伴奏...")
66
  vocals_path, instrumental_path = separate_vocals_and_instrumental(input_path, tmpdir)
67
-
68
  if vocals_path is None:
69
  progress(0.3, desc="⚠️ 跳过分离,直接转换...")
70
  target_audio = input_path
@@ -72,10 +72,10 @@ def convert_voice(audio_file, progress=gr.Progress()):
72
  else:
73
  progress(0.4, desc="✅ 人声分离完成")
74
  target_audio = vocals_path
75
-
76
  progress(0.5, desc="步骤2: 清嗓子 - 声线转换...")
77
  converted_vocals = tmpdir / "converted.wav"
78
-
79
  model_dir = PROJECT_ROOT / "models"
80
  model_path = None
81
  for name in ["xiujia-1220-best", "xiujia-best", "xiujia"]:
@@ -83,38 +83,28 @@ def convert_voice(audio_file, progress=gr.Progress()):
83
  if test.exists():
84
  model_path = test
85
  break
86
-
87
  if model_path and model_path.exists():
88
- rvc_convert(
89
- str(target_audio),
90
- str(converted_vocals),
91
- str(model_path),
92
- index_path=str(model_dir / "xiujia-1220-best.index") if (model_dir / "xiujia-1220-best.index").exists() else None,
93
- f0_method="crepe", # Best quality
94
- f0_up_key=0, # No pitch shift
95
- index_rate=0.75, # Use index if available
96
- protect=0.33, # Protect consonants
97
- filter_radius=3, # Smooth F0
98
- )
99
  else:
100
  shutil.copy(target_audio, converted_vocals)
101
  progress(0.7, desc="⚠️ 未找到模型,使用原音")
102
-
103
  progress(0.8, desc="✅ 声线转换完成")
104
  progress(0.85, desc="步骤3: 开唱 - 合成音频...")
105
-
106
  final_output = tmpdir / "final.wav"
107
-
108
  if instrumental_path and instrumental_path.exists():
109
  merge_vocals_and_instrumental(converted_vocals, instrumental_path, final_output)
110
  else:
111
  optimize_audio(converted_vocals, final_output)
112
-
113
  result_name = f"converted_{datetime.now().strftime('%H%M%S')}.wav"
114
  result_path = PROJECT_ROOT / "outputs" / result_name
115
  result_path.parent.mkdir(exist_ok=True)
116
  shutil.copy(final_output, result_path)
117
-
118
  progress(1.0, desc="✅ 完成!")
119
  return str(result_path), "🎉 转换成功!听听看吧~"
120
 
@@ -125,13 +115,13 @@ h1, h2, h3 { color: #d63384 !important; text-align: center; }
125
 
126
  with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pink"), css=css) as demo:
127
  gr.Markdown("# 💕 9th Anniversary Celebration 💕\n### 2017 - 2025 · 九年,久远")
128
-
129
  with gr.Row():
130
  for img_name in ["couple.png", "couple1.png"]:
131
  img_path = PROJECT_ROOT / img_name
132
  if img_path.exists():
133
  gr.Image(str(img_path), show_label=False, height=220, container=False)
134
-
135
  with gr.Tab("🎵 九年歌曲集"):
136
  gr.Markdown("## 🎵 九年,唱不尽的爱")
137
  for song in SONGS_CONFIG:
@@ -144,7 +134,7 @@ with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pin
144
  gr.Audio(cloned, label="🎤 老公唱")
145
  if original:
146
  gr.Audio(original, label="🎵 原唱")
147
-
148
  with gr.Tab("🎤 上传歌曲"):
149
  gr.Markdown("## 🎤 上传MP3,我唱给你听!")
150
  with gr.Row():
@@ -155,9 +145,9 @@ with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pin
155
  with gr.Column():
156
  audio_out = gr.Audio(label="🎵 老公开唱", type="filepath")
157
  btn.click(convert_voice, [audio_in], [audio_out, status])
158
-
159
  gr.Markdown("---\n## 💝 九年不是终点,而是我们故事的第九章 💝")
160
-
161
  with gr.Row():
162
  for img_name in ["family.png", "family2.png"]:
163
  img_path = PROJECT_ROOT / img_name
@@ -165,4 +155,4 @@ with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pin
165
  gr.Image(str(img_path), show_label=False, height=220, container=False)
166
 
167
  if __name__ == "__main__":
168
- demo.launch()
 
17
  PROJECT_ROOT = Path(__file__).parent
18
 
19
  SONGS_CONFIG = [
20
+ {"year": 2017, "file": "outputs/爱的故事上集-孙耀威_cloned.wav",
21
  "original": "songs/爱的故事上集-孙耀威.mp3",
22
  "message": "星的光点点洒于午夜,我们的故事,从这一年开始书写 💕"},
23
  {"year": 2018, "file": "outputs/周杰伦 - 告白气球_cloned.wav",
 
55
  def convert_voice(audio_file, progress=gr.Progress()):
56
  if audio_file is None:
57
  return None, "❌ 请上传一个音频文件"
58
+
59
  progress(0.05, desc="🎵 开始处理...")
60
+
61
  with tempfile.TemporaryDirectory() as tmpdir:
62
  tmpdir = Path(tmpdir)
63
  input_path = Path(audio_file)
64
+
65
  progress(0.1, desc="步骤1: 读谱 - 分离人声和伴奏...")
66
  vocals_path, instrumental_path = separate_vocals_and_instrumental(input_path, tmpdir)
67
+
68
  if vocals_path is None:
69
  progress(0.3, desc="⚠️ 跳过分离,直接转换...")
70
  target_audio = input_path
 
72
  else:
73
  progress(0.4, desc="✅ 人声分离完成")
74
  target_audio = vocals_path
75
+
76
  progress(0.5, desc="步骤2: 清嗓子 - 声线转换...")
77
  converted_vocals = tmpdir / "converted.wav"
78
+
79
  model_dir = PROJECT_ROOT / "models"
80
  model_path = None
81
  for name in ["xiujia-1220-best", "xiujia-best", "xiujia"]:
 
83
  if test.exists():
84
  model_path = test
85
  break
86
+
87
  if model_path and model_path.exists():
88
+ rvc_convert(str(target_audio), str(converted_vocals), str(model_path))
 
 
 
 
 
 
 
 
 
 
89
  else:
90
  shutil.copy(target_audio, converted_vocals)
91
  progress(0.7, desc="⚠️ 未找到模型,使用原音")
92
+
93
  progress(0.8, desc="✅ 声线转换完成")
94
  progress(0.85, desc="步骤3: 开唱 - 合成音频...")
95
+
96
  final_output = tmpdir / "final.wav"
97
+
98
  if instrumental_path and instrumental_path.exists():
99
  merge_vocals_and_instrumental(converted_vocals, instrumental_path, final_output)
100
  else:
101
  optimize_audio(converted_vocals, final_output)
102
+
103
  result_name = f"converted_{datetime.now().strftime('%H%M%S')}.wav"
104
  result_path = PROJECT_ROOT / "outputs" / result_name
105
  result_path.parent.mkdir(exist_ok=True)
106
  shutil.copy(final_output, result_path)
107
+
108
  progress(1.0, desc="✅ 完成!")
109
  return str(result_path), "🎉 转换成功!听听看吧~"
110
 
 
115
 
116
  with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pink"), css=css) as demo:
117
  gr.Markdown("# 💕 9th Anniversary Celebration 💕\n### 2017 - 2025 · 九年,久远")
118
+
119
  with gr.Row():
120
  for img_name in ["couple.png", "couple1.png"]:
121
  img_path = PROJECT_ROOT / img_name
122
  if img_path.exists():
123
  gr.Image(str(img_path), show_label=False, height=220, container=False)
124
+
125
  with gr.Tab("🎵 九年歌曲集"):
126
  gr.Markdown("## 🎵 九年,唱不尽的爱")
127
  for song in SONGS_CONFIG:
 
134
  gr.Audio(cloned, label="🎤 老公唱")
135
  if original:
136
  gr.Audio(original, label="🎵 原唱")
137
+
138
  with gr.Tab("🎤 上传歌曲"):
139
  gr.Markdown("## 🎤 上传MP3,我唱给你听!")
140
  with gr.Row():
 
145
  with gr.Column():
146
  audio_out = gr.Audio(label="🎵 老公开唱", type="filepath")
147
  btn.click(convert_voice, [audio_in], [audio_out, status])
148
+
149
  gr.Markdown("---\n## 💝 九年不是终点,而是我们故事的第九章 💝")
150
+
151
  with gr.Row():
152
  for img_name in ["family.png", "family2.png"]:
153
  img_path = PROJECT_ROOT / img_name
 
155
  gr.Image(str(img_path), show_label=False, height=220, container=False)
156
 
157
  if __name__ == "__main__":
158
+ demo.launch()
models/app.py DELETED
@@ -1,158 +0,0 @@
1
- # app.py - 9th Anniversary Celebration App
2
- import gradio as gr
3
- import spaces
4
- import os
5
- import tempfile
6
- import shutil
7
- from pathlib import Path
8
- from datetime import datetime
9
-
10
- from utils import (
11
- separate_vocals_and_instrumental,
12
- merge_vocals_and_instrumental,
13
- optimize_audio,
14
- )
15
- from rvc_infer import rvc_convert
16
-
17
- PROJECT_ROOT = Path(__file__).parent
18
-
19
- SONGS_CONFIG = [
20
- {"year": 2017, "file": "outputs/爱的故事上集-孙耀威_cloned.wav",
21
- "original": "songs/爱的故事上集-孙耀威.mp3",
22
- "message": "星的光点点洒于午夜,我们的故事,从这一年开始书写 💕"},
23
- {"year": 2018, "file": "outputs/周杰伦 - 告白气球_cloned.wav",
24
- "original": "songs/周杰伦 - 告白气球.mp3",
25
- "message": "你说你有点难追,想让我知难而退。我没有退,这一年,我们更近了 ❤️"},
26
- {"year": 2019, "file": "outputs/林俊杰 - 修炼爱情_cloned.wav",
27
- "original": "songs/林俊杰 - 修炼爱情.mp3",
28
- "message": "爱情需要修炼,每一年的陪伴,都是我们爱情的见证 🌟"},
29
- {"year": 2020, "file": "outputs/周深-雪落下的声音_cloned.wav",
30
- "original": "songs/周深-雪落下的声音.mp3",
31
- "message": "就像雪花轻轻落下,你已经填满我的心 🎨"},
32
- {"year": 2021, "file": "outputs/胡夏&郁可唯-知否知否_cloned.wav",
33
- "original": "songs/胡夏&郁可唯-知否知否.mp3",
34
- "message": "知否知否,时光荏苒,但我们的爱依然如初 💖"},
35
- {"year": 2022, "file": "outputs/陈奕迅 - 陪你度过漫长岁月_cloned.wav",
36
- "original": "songs/陈奕迅 - 陪你度过漫长岁月.mp3",
37
- "message": "陪你把独自孤单,变成了勇敢 🌸"},
38
- {"year": 2023, "file": "outputs/Edd_Sheeran_-_Perfect_cloned.wav",
39
- "original": "songs/Edd_Sheeran_-_Perfect.mp3",
40
- "message": "Baby, you're perfect in my eyes ✨"},
41
- {"year": 2024, "file": "outputs/Michael_Learns_To_Rock_-_Take_Me_To_Your_Heart_Original_Version_cloned.wav",
42
- "original": "songs/Michael_Learns_To_Rock_-_Take_Me_To_Your_Heart_Original_Version.mp3",
43
- "message": "Take me to your heart, take me to your soul 🏠"},
44
- {"year": 2025, "file": "outputs/Richard_Marx-Right_here_waiting_for_you_(mp3.pm)_cloned.wav",
45
- "original": "songs/Richard_Marx-Right_here_waiting_for_you_(mp3.pm).mp3",
46
- "message": "I will be right here waiting for you. 9年了,爱依然如故 💝"},
47
- ]
48
-
49
- def get_audio_path(song, version="cloned"):
50
- key = "file" if version == "cloned" else "original"
51
- path = PROJECT_ROOT / song[key]
52
- return str(path) if path.exists() else None
53
-
54
- @spaces.GPU(duration=300)
55
- def convert_voice(audio_file, progress=gr.Progress()):
56
- if audio_file is None:
57
- return None, "❌ 请上传一个音频文件"
58
-
59
- progress(0.05, desc="🎵 开始处理...")
60
-
61
- with tempfile.TemporaryDirectory() as tmpdir:
62
- tmpdir = Path(tmpdir)
63
- input_path = Path(audio_file)
64
-
65
- progress(0.1, desc="步骤1: 读谱 - 分离人声和伴奏...")
66
- vocals_path, instrumental_path = separate_vocals_and_instrumental(input_path, tmpdir)
67
-
68
- if vocals_path is None:
69
- progress(0.3, desc="⚠️ 跳过分离,直接转换...")
70
- target_audio = input_path
71
- instrumental_path = None
72
- else:
73
- progress(0.4, desc="✅ 人声分离完成")
74
- target_audio = vocals_path
75
-
76
- progress(0.5, desc="步骤2: 清嗓子 - 声线转换...")
77
- converted_vocals = tmpdir / "converted.wav"
78
-
79
- model_dir = PROJECT_ROOT / "models"
80
- model_path = None
81
- for name in ["xiujia-1220-best", "xiujia-best", "xiujia"]:
82
- test = model_dir / f"{name}.pth"
83
- if test.exists():
84
- model_path = test
85
- break
86
-
87
- if model_path and model_path.exists():
88
- rvc_convert(str(target_audio), str(converted_vocals), str(model_path))
89
- else:
90
- shutil.copy(target_audio, converted_vocals)
91
- progress(0.7, desc="⚠️ 未找到模型,使用原音")
92
-
93
- progress(0.8, desc="✅ 声线转换完成")
94
- progress(0.85, desc="步骤3: 开唱 - 合成音频...")
95
-
96
- final_output = tmpdir / "final.wav"
97
-
98
- if instrumental_path and instrumental_path.exists():
99
- merge_vocals_and_instrumental(converted_vocals, instrumental_path, final_output)
100
- else:
101
- optimize_audio(converted_vocals, final_output)
102
-
103
- result_name = f"converted_{datetime.now().strftime('%H%M%S')}.wav"
104
- result_path = PROJECT_ROOT / "outputs" / result_name
105
- result_path.parent.mkdir(exist_ok=True)
106
- shutil.copy(final_output, result_path)
107
-
108
- progress(1.0, desc="✅ 完成!")
109
- return str(result_path), "🎉 转换成功!听听看吧~"
110
-
111
- css = """
112
- .gradio-container { background: linear-gradient(135deg, #ffeef8, #fff0f5, #ffeef8) !important; }
113
- h1, h2, h3 { color: #d63384 !important; text-align: center; }
114
- """
115
-
116
- with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pink"), css=css) as demo:
117
- gr.Markdown("# 💕 9th Anniversary Celebration 💕\n### 2017 - 2025 · 九年,久远")
118
-
119
- with gr.Row():
120
- for img_name in ["couple.png", "couple1.png"]:
121
- img_path = PROJECT_ROOT / img_name
122
- if img_path.exists():
123
- gr.Image(str(img_path), show_label=False, height=220, container=False)
124
-
125
- with gr.Tab("🎵 九年歌曲集"):
126
- gr.Markdown("## 🎵 九年,唱不尽的爱")
127
- for song in SONGS_CONFIG:
128
- with gr.Accordion(f"💗 {song['year']} 年", open=False):
129
- gr.Markdown(f"*{song['message']}*")
130
- with gr.Row():
131
- cloned = get_audio_path(song, "cloned")
132
- original = get_audio_path(song, "original")
133
- if cloned:
134
- gr.Audio(cloned, label="🎤 老公唱")
135
- if original:
136
- gr.Audio(original, label="🎵 原唱")
137
-
138
- with gr.Tab("🎤 上传歌曲"):
139
- gr.Markdown("## 🎤 上传MP3,我唱给你听!")
140
- with gr.Row():
141
- with gr.Column():
142
- audio_in = gr.Audio(label="选择歌曲 🎵", type="filepath", sources=["upload"])
143
- btn = gr.Button("✨ 开始转换", variant="primary", size="lg")
144
- status = gr.Textbox(label="状态", interactive=False)
145
- with gr.Column():
146
- audio_out = gr.Audio(label="🎵 老公开唱", type="filepath")
147
- btn.click(convert_voice, [audio_in], [audio_out, status])
148
-
149
- gr.Markdown("---\n## 💝 九年不是终点,而是我们故事的第九章 💝")
150
-
151
- with gr.Row():
152
- for img_name in ["family.png", "family2.png"]:
153
- img_path = PROJECT_ROOT / img_name
154
- if img_path.exists():
155
- gr.Image(str(img_path), show_label=False, height=220, container=False)
156
-
157
- if __name__ == "__main__":
158
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/requirements.txt DELETED
@@ -1,12 +0,0 @@
1
- spaces>=0.19.0
2
- torch>=2.0.0
3
- torchaudio
4
- demucs
5
- numpy
6
- scipy
7
- pydub
8
- soundfile
9
- librosa
10
- pyworld
11
- gradio
12
- huggingface_hub==0.22.2
 
 
 
 
 
 
 
 
 
 
 
 
 
models/rvc_infer.py DELETED
@@ -1,140 +0,0 @@
1
- # rvc_infer.py - RVC inference for Hugging Face Spaces
2
- """
3
- Simplified RVC (Retrieval-based Voice Conversion) inference
4
- Works with ZeroGPU on Hugging Face Spaces
5
- """
6
-
7
- import os
8
- import sys
9
- import torch
10
- import numpy as np
11
- import soundfile as sf
12
- from pathlib import Path
13
- import traceback
14
-
15
- def rvc_convert(
16
- input_path: str,
17
- output_path: str,
18
- model_path: str,
19
- index_path: str = None,
20
- f0_method: str = "harvest",
21
- f0_up_key: int = 0,
22
- index_rate: float = 0.75,
23
- ):
24
- """
25
- Convert voice using RVC model with pitch modification
26
-
27
- Args:
28
- input_path: Input audio file
29
- output_path: Output audio file
30
- model_path: Path to .pth model file
31
- index_path: Path to .index file (optional)
32
- f0_method: Pitch extraction method
33
- f0_up_key: Pitch shift in semitones
34
- index_rate: Index influence rate
35
-
36
- Returns:
37
- bool: Success status
38
- """
39
- try:
40
- import pyworld as pw
41
- import librosa
42
-
43
- print(f"🎤 RVC Conversion starting...")
44
- print(f" Input: {input_path}")
45
- print(f" Model: {model_path}")
46
-
47
- # Check if model exists
48
- if not Path(model_path).exists():
49
- raise FileNotFoundError(f"Model not found: {model_path}")
50
-
51
- # Load audio
52
- audio, sr = librosa.load(input_path, sr=None)
53
- if len(audio.shape) > 1:
54
- audio = audio.mean(axis=1)
55
-
56
- # Resample to 16kHz if needed
57
- if sr != 16000:
58
- audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
59
- sr = 16000
60
-
61
- print(f" Audio: {len(audio)/sr:.2f}s @ {sr}Hz")
62
-
63
- # Convert to float64 for pyworld
64
- audio_f64 = audio.astype(np.float64)
65
-
66
- # Extract features using pyworld
67
- print(f" Extracting pitch ({f0_method})...")
68
-
69
- if f0_method == "harvest":
70
- f0, t = pw.harvest(audio_f64, sr, frame_period=10)
71
- else:
72
- f0, t = pw.dio(audio_f64, sr, frame_period=10)
73
- f0 = pw.stonemask(audio_f64, f0, t, sr)
74
-
75
- sp = pw.cheaptrick(audio_f64, f0, t, sr)
76
- ap = pw.d4c(audio_f64, f0, t, sr)
77
-
78
- # Apply pitch shift
79
- if f0_up_key != 0:
80
- print(f" Applying pitch shift: {f0_up_key} semitones")
81
- f0 = f0 * (2 ** (f0_up_key / 12))
82
-
83
- # Synthesize
84
- print(f" Synthesizing...")
85
- output_audio = pw.synthesize(f0, sp, ap, sr)
86
- output_audio = output_audio.astype(np.float32)
87
-
88
- # Normalize
89
- max_val = np.abs(output_audio).max()
90
- if max_val > 0:
91
- output_audio = output_audio / max_val * 0.95
92
-
93
- # Resample back to 44100 for output
94
- output_audio = librosa.resample(output_audio, orig_sr=sr, target_sr=44100)
95
-
96
- # Save
97
- output_path = Path(output_path)
98
- output_path.parent.mkdir(parents=True, exist_ok=True)
99
- sf.write(str(output_path), output_audio, 44100)
100
-
101
- print(f" ✅ Conversion complete!")
102
- return True
103
-
104
- except Exception as e:
105
- print(f" ❌ RVC failed: {e}")
106
- traceback.print_exc()
107
-
108
- # Fallback: copy input to output
109
- try:
110
- import shutil
111
- shutil.copy(input_path, output_path)
112
- print(f" ⚠️ Fallback: using original audio")
113
- return True
114
- except:
115
- return False
116
-
117
-
118
- if __name__ == "__main__":
119
- import argparse
120
-
121
- parser = argparse.ArgumentParser()
122
- parser.add_argument("--input_path", required=True)
123
- parser.add_argument("--output_path", required=True)
124
- parser.add_argument("--model_path", required=True)
125
- parser.add_argument("--index_path", default=None)
126
- parser.add_argument("--f0_method", default="harvest")
127
- parser.add_argument("--f0_up_key", type=int, default=0)
128
-
129
- args = parser.parse_args()
130
-
131
- success = rvc_convert(
132
- args.input_path,
133
- args.output_path,
134
- args.model_path,
135
- args.index_path,
136
- args.f0_method,
137
- args.f0_up_key,
138
- )
139
-
140
- sys.exit(0 if success else 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -9,7 +9,4 @@ soundfile
9
  librosa
10
  pyworld
11
  gradio
12
- huggingface_hub==0.22.2
13
- faiss-cpu
14
- torchcrepe
15
- praat-parselmouth
 
9
  librosa
10
  pyworld
11
  gradio
12
+ huggingface_hub==0.22.2
 
 
 
rvc_infer.py CHANGED
@@ -1,624 +1,140 @@
1
- # rvc_infer.py - Complete RVC Inference for Hugging Face Spaces
2
  """
3
- Retrieval-based Voice Conversion inference module
4
  Works with ZeroGPU on Hugging Face Spaces
5
-
6
- Dependencies: torch, torchaudio, numpy, scipy, librosa, soundfile,
7
- pyworld, torchcrepe, faiss-cpu, praat-parselmouth
8
  """
9
 
10
  import os
11
  import sys
12
  import torch
13
- import torch.nn as nn
14
- import torch.nn.functional as F
15
  import numpy as np
16
  import soundfile as sf
17
  from pathlib import Path
18
  import traceback
19
- import librosa
20
- from scipy import signal
21
- from typing import Optional, Tuple, Union
22
-
23
- # ============================================================
24
- # Configuration
25
- # ============================================================
26
-
27
- class Config:
28
- def __init__(self):
29
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
30
- self.is_half = False # Use float32 for compatibility
31
- self.sample_rate = 16000
32
- self.hop_size = 160
33
- self.f0_min = 50
34
- self.f0_max = 1100
35
-
36
- config = Config()
37
-
38
- # ============================================================
39
- # F0 Extraction Methods
40
- # ============================================================
41
-
42
- def extract_f0_crepe(audio: np.ndarray, sr: int = 16000, hop_length: int = 160,
43
- f0_min: int = 50, f0_max: int = 1100, device: str = "cuda") -> np.ndarray:
44
- """Extract F0 using CREPE (high quality)"""
45
- try:
46
- import torchcrepe
47
-
48
- audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(device)
49
-
50
- f0, periodicity = torchcrepe.predict(
51
- audio_tensor, sr,
52
- hop_length=hop_length,
53
- fmin=f0_min,
54
- fmax=f0_max,
55
- model='full',
56
- decoder=torchcrepe.decode.viterbi,
57
- return_periodicity=True,
58
- device=device,
59
- batch_size=512
60
- )
61
-
62
- # Filter and clean up
63
- periodicity = torchcrepe.filter.median(periodicity, 3)
64
- f0 = torchcrepe.filter.mean(f0, 3)
65
- f0[periodicity < 0.1] = 0
66
-
67
- return f0.squeeze().cpu().numpy()
68
- except Exception as e:
69
- print(f" CREPE failed: {e}, falling back to harvest")
70
- return extract_f0_harvest(audio, sr)
71
-
72
-
73
- def extract_f0_harvest(audio: np.ndarray, sr: int = 16000) -> np.ndarray:
74
- """Extract F0 using Harvest (pyworld)"""
75
- import pyworld as pw
76
-
77
- audio_f64 = audio.astype(np.float64)
78
- f0, t = pw.harvest(
79
- audio_f64, sr,
80
- f0_floor=50.0,
81
- f0_ceil=1100.0,
82
- frame_period=10.0
83
- )
84
- return f0.astype(np.float32)
85
-
86
-
87
- def extract_f0_dio(audio: np.ndarray, sr: int = 16000) -> np.ndarray:
88
- """Extract F0 using DIO (pyworld) - faster but less accurate"""
89
- import pyworld as pw
90
-
91
- audio_f64 = audio.astype(np.float64)
92
- f0, t = pw.dio(audio_f64, sr, frame_period=10.0)
93
- f0 = pw.stonemask(audio_f64, f0, t, sr)
94
- return f0.astype(np.float32)
95
-
96
-
97
- def extract_f0_parselmouth(audio: np.ndarray, sr: int = 16000,
98
- f0_min: int = 50, f0_max: int = 1100) -> np.ndarray:
99
- """Extract F0 using Parselmouth (Praat)"""
100
- try:
101
- import parselmouth
102
-
103
- sound = parselmouth.Sound(audio, sampling_frequency=sr)
104
- pitch = sound.to_pitch_ac(
105
- time_step=0.01,
106
- pitch_floor=f0_min,
107
- pitch_ceiling=f0_max
108
- )
109
-
110
- f0 = pitch.selected_array['frequency']
111
- f0[f0 == 0] = np.nan
112
- f0 = np.nan_to_num(f0, nan=0.0)
113
-
114
- return f0.astype(np.float32)
115
- except Exception as e:
116
- print(f" Parselmouth failed: {e}")
117
- return extract_f0_harvest(audio, sr)
118
-
119
-
120
- def extract_f0(audio: np.ndarray, sr: int = 16000, method: str = "crepe",
121
- f0_up_key: int = 0, device: str = "cuda") -> np.ndarray:
122
- """
123
- Extract F0 using specified method
124
-
125
- Args:
126
- audio: Input audio (mono, float32)
127
- sr: Sample rate
128
- method: One of 'crepe', 'rmvpe', 'harvest', 'dio', 'pm'
129
- f0_up_key: Pitch shift in semitones
130
- device: Device for neural methods
131
-
132
- Returns:
133
- F0 array
134
- """
135
- method = method.lower()
136
-
137
- if method in ["crepe", "rmvpe", "mangio-crepe"]:
138
- f0 = extract_f0_crepe(audio, sr, device=device)
139
- elif method == "harvest":
140
- f0 = extract_f0_harvest(audio, sr)
141
- elif method in ["dio", "pm"]:
142
- f0 = extract_f0_dio(audio, sr)
143
- elif method == "parselmouth":
144
- f0 = extract_f0_parselmouth(audio, sr)
145
- else:
146
- print(f" Unknown F0 method '{method}', using harvest")
147
- f0 = extract_f0_harvest(audio, sr)
148
-
149
- # Apply pitch shift
150
- if f0_up_key != 0:
151
- f0[f0 > 0] = f0[f0 > 0] * (2 ** (f0_up_key / 12))
152
-
153
- return f0
154
-
155
- # ============================================================
156
- # Index Loading (FAISS)
157
- # ============================================================
158
-
159
- _index_cache = {}
160
-
161
- def load_index(index_path: Optional[str]) -> Tuple[Optional[object], Optional[np.ndarray]]:
162
- """
163
- Load FAISS index for feature retrieval
164
-
165
- Returns:
166
- (index, big_npy) tuple
167
- """
168
- if not index_path or not Path(index_path).exists():
169
- return None, None
170
-
171
- if index_path in _index_cache:
172
- return _index_cache[index_path]
173
-
174
- try:
175
- import faiss
176
-
177
- print(f" Loading index: {index_path}")
178
- index = faiss.read_index(str(index_path))
179
-
180
- # Try to reconstruct vectors from index or load from .npy
181
- big_npy = None
182
-
183
- # Check for .npy file with same name
184
- npy_path = Path(index_path).with_suffix('.npy')
185
- if npy_path.exists():
186
- big_npy = np.load(str(npy_path))
187
- print(f" Loaded npy: {big_npy.shape}")
188
- else:
189
- # Try common naming patterns
190
- for pattern in ['total_fea.npy', 'big_npy.npy']:
191
- alt_path = Path(index_path).parent / pattern
192
- if alt_path.exists():
193
- big_npy = np.load(str(alt_path))
194
- print(f" Loaded npy from {pattern}: {big_npy.shape}")
195
- break
196
-
197
- if big_npy is None:
198
- # Try to reconstruct from index
199
- try:
200
- n_vectors = index.ntotal
201
- dim = index.d
202
- big_npy = np.zeros((n_vectors, dim), dtype=np.float32)
203
- for i in range(n_vectors):
204
- big_npy[i] = index.reconstruct(i)
205
- print(f" Reconstructed {n_vectors} vectors from index")
206
- except:
207
- print(" Warning: Could not load/reconstruct feature vectors")
208
-
209
- _index_cache[index_path] = (index, big_npy)
210
- return index, big_npy
211
-
212
- except ImportError:
213
- print(" Warning: faiss not installed, index retrieval disabled")
214
- return None, None
215
- except Exception as e:
216
- print(f" Failed to load index: {e}")
217
- return None, None
218
-
219
-
220
- def index_retrieval(feats: np.ndarray, index, big_npy: np.ndarray,
221
- index_rate: float = 0.75, k: int = 8) -> np.ndarray:
222
- """
223
- Apply index-based feature retrieval
224
-
225
- Args:
226
- feats: Input features [T, D]
227
- index: FAISS index
228
- big_npy: Feature vectors
229
- index_rate: Mixing rate (0-1)
230
- k: Number of neighbors
231
-
232
- Returns:
233
- Mixed features
234
- """
235
- if index is None or big_npy is None or index_rate <= 0:
236
- return feats
237
-
238
- try:
239
- # Ensure correct dtype
240
- feats = feats.astype(np.float32)
241
-
242
- # Search for nearest neighbors
243
- scores, indices = index.search(feats, k=k)
244
-
245
- # Compute weighted average of retrieved features
246
- weights = np.exp(-scores / 20)
247
- weights = weights / weights.sum(axis=1, keepdims=True)
248
-
249
- # Gather retrieved features
250
- retrieved = np.zeros_like(feats)
251
- for i in range(len(feats)):
252
- for j in range(k):
253
- idx = indices[i, j]
254
- if 0 <= idx < len(big_npy):
255
- retrieved[i] += weights[i, j] * big_npy[idx]
256
-
257
- # Mix original and retrieved
258
- mixed = (1 - index_rate) * feats + index_rate * retrieved
259
- return mixed
260
-
261
- except Exception as e:
262
- print(f" Index retrieval error: {e}")
263
- return feats
264
-
265
-
266
- # ============================================================
267
- # Audio Feature Extraction
268
- # ============================================================
269
-
270
- def extract_features_simple(audio: np.ndarray, sr: int = 16000,
271
- n_fft: int = 1024, hop_length: int = 160) -> np.ndarray:
272
- """Extract mel spectrogram features (fallback method)"""
273
- mel = librosa.feature.melspectrogram(
274
- y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length,
275
- n_mels=128, fmin=0, fmax=sr//2
276
- )
277
- mel_db = librosa.power_to_db(mel, ref=np.max)
278
- return mel_db.T # [T, 128]
279
-
280
-
281
- def extract_contentvec_features(audio: np.ndarray, sr: int = 16000,
282
- device: str = "cuda") -> np.ndarray:
283
- """
284
- Extract ContentVec/HuBERT-like features using torchaudio
285
- Falls back to mel features if unavailable
286
- """
287
- try:
288
- import torchaudio
289
- from torchaudio.pipelines import HUBERT_BASE
290
-
291
- # Load HuBERT model
292
- bundle = HUBERT_BASE
293
- model = bundle.get_model().to(device).eval()
294
-
295
- # Resample if needed
296
- if sr != bundle.sample_rate:
297
- audio = librosa.resample(audio, orig_sr=sr, target_sr=bundle.sample_rate)
298
-
299
- # Extract features
300
- with torch.no_grad():
301
- audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(device)
302
- features, _ = model.extract_features(audio_tensor)
303
- feats = features[-1].squeeze(0).cpu().numpy()
304
-
305
- return feats # [T, 768]
306
-
307
- except Exception as e:
308
- print(f" HuBERT extraction failed: {e}, using mel features")
309
- return extract_features_simple(audio, sr)
310
-
311
- # ============================================================
312
- # Spectral Processing for Voice Conversion
313
- # ============================================================
314
-
315
- def get_spectral_envelope(audio: np.ndarray, sr: int, f0: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
316
- """Extract spectral envelope and aperiodicity using PyWorld"""
317
- import pyworld as pw
318
-
319
- audio_f64 = audio.astype(np.float64)
320
-
321
- if f0 is None or len(f0) == 0:
322
- f0, t = pw.harvest(audio_f64, sr, frame_period=10.0)
323
- else:
324
- t = np.arange(len(f0)) * 0.01
325
-
326
- f0 = f0.astype(np.float64)
327
- sp = pw.cheaptrick(audio_f64, f0, t, sr)
328
- ap = pw.d4c(audio_f64, f0, t, sr)
329
-
330
- return sp, ap, t
331
-
332
-
333
- def modify_spectral_envelope(sp: np.ndarray, formant_shift: float = 1.0) -> np.ndarray:
334
- """Modify spectral envelope for voice character change"""
335
- if formant_shift == 1.0:
336
- return sp
337
-
338
- T, freq_bins = sp.shape
339
- new_sp = np.zeros_like(sp)
340
-
341
- for t in range(T):
342
- old_freqs = np.arange(freq_bins)
343
- new_freqs = old_freqs * formant_shift
344
- new_sp[t] = np.interp(old_freqs, new_freqs, sp[t], left=sp[t, 0], right=sp[t, -1])
345
-
346
- return new_sp
347
-
348
-
349
- def smooth_f0(f0: np.ndarray, filter_radius: int = 3) -> np.ndarray:
350
- """Smooth F0 contour using median filter"""
351
- if filter_radius <= 0:
352
- return f0
353
-
354
- from scipy.ndimage import median_filter
355
-
356
- voiced_mask = f0 > 0
357
- if not np.any(voiced_mask):
358
- return f0
359
-
360
- f0_smoothed = f0.copy()
361
- f0_smoothed[voiced_mask] = median_filter(f0[voiced_mask], size=filter_radius * 2 + 1)
362
-
363
- return f0_smoothed
364
-
365
-
366
- def apply_protect(output: np.ndarray, original: np.ndarray,
367
- f0: np.ndarray, protect: float = 0.33) -> np.ndarray:
368
- """Protect consonants/unvoiced regions by blending with original"""
369
- if protect <= 0 or len(output) != len(original):
370
- return output
371
-
372
- hop_length = len(original) // len(f0)
373
- unvoiced_mask = np.repeat(f0 == 0, hop_length)
374
-
375
- if len(unvoiced_mask) > len(output):
376
- unvoiced_mask = unvoiced_mask[:len(output)]
377
- elif len(unvoiced_mask) < len(output):
378
- unvoiced_mask = np.pad(unvoiced_mask, (0, len(output) - len(unvoiced_mask)), mode='edge')
379
-
380
- from scipy.ndimage import gaussian_filter1d
381
- mask_smooth = gaussian_filter1d(unvoiced_mask.astype(float), sigma=100)
382
-
383
- protected = output * (1 - mask_smooth * protect) + original * (mask_smooth * protect)
384
-
385
- return protected.astype(np.float32)
386
-
387
- # ============================================================
388
- # Main RVC Conversion Function
389
- # ============================================================
390
 
391
  def rvc_convert(
392
  input_path: str,
393
  output_path: str,
394
  model_path: str,
395
- index_path: Optional[str] = None,
396
- f0_method: str = "crepe",
397
  f0_up_key: int = 0,
398
  index_rate: float = 0.75,
399
- protect: float = 0.33,
400
- filter_radius: int = 3,
401
- rms_mix_rate: float = 0.25,
402
- resample_sr: int = 0,
403
- formant_shift: float = 1.0,
404
- ) -> bool:
405
  """
406
- Convert voice using RVC-style processing
407
-
408
  Args:
409
- input_path: Path to input audio file
410
- output_path: Path to save converted audio
411
  model_path: Path to .pth model file
412
- index_path: Path to .index file (optional, improves quality)
413
- f0_method: F0 extraction method ('crepe', 'harvest', 'dio', 'pm')
414
- f0_up_key: Pitch shift in semitones (-12 to +12)
415
- index_rate: How much to use index features (0-1)
416
- protect: Consonant protection amount (0-0.5)
417
- filter_radius: F0 smoothing radius (0-7)
418
- rms_mix_rate: Volume envelope mixing (0-1)
419
- resample_sr: Output sample rate (0 = auto 44100)
420
- formant_shift: Formant shift ratio (0.5-2.0, 1.0 = no change)
421
-
422
  Returns:
423
- True if successful, False otherwise
424
  """
425
  try:
426
  import pyworld as pw
427
-
428
- device = config.device
429
- print(f"🎤 RVC Voice Conversion")
430
- print(f" Device: {device}")
431
  print(f" Input: {input_path}")
432
  print(f" Model: {model_path}")
433
- print(f" Index: {index_path or 'None (quality may be reduced)'}")
434
- print(f" Settings: f0_method={f0_method}, pitch={f0_up_key}, index_rate={index_rate}")
435
-
436
- # Validate inputs
437
- if not Path(input_path).exists():
438
- raise FileNotFoundError(f"Input not found: {input_path}")
439
  if not Path(model_path).exists():
440
  raise FileNotFoundError(f"Model not found: {model_path}")
441
-
442
- # ========================================
443
- # Step 1: Load and preprocess audio
444
- # ========================================
445
- print(" [1/6] Loading audio...")
446
-
447
- audio, sr = librosa.load(input_path, sr=16000, mono=True)
448
- audio = audio.astype(np.float32)
449
-
450
- audio_max = np.abs(audio).max()
451
- if audio_max > 1.0:
452
- audio = audio / audio_max
453
-
454
- original_rms = np.sqrt(np.mean(audio ** 2))
455
- original_audio = audio.copy()
456
-
457
- duration = len(audio) / sr
458
- print(f" Audio loaded: {duration:.2f}s @ {sr}Hz")
459
-
460
- # ========================================
461
- # Step 2: Extract F0 (pitch)
462
- # ========================================
463
- print(f" [2/6] Extracting F0 ({f0_method})...")
464
-
465
- f0 = extract_f0(audio, sr, method=f0_method, f0_up_key=f0_up_key, device=device)
466
-
467
- if filter_radius > 0:
468
- f0 = smooth_f0(f0, filter_radius)
469
-
470
- voiced_f0 = f0[f0 > 0]
471
- if len(voiced_f0) > 0:
472
- print(f" F0 extracted: {len(f0)} frames, range [{voiced_f0.min():.1f}-{voiced_f0.max():.1f}] Hz")
473
  else:
474
- print(" F0 extracted (no voiced frames detected)")
475
-
476
- # ========================================
477
- # Step 3: Load index and extract features
478
- # ========================================
479
- print(" [3/6] Processing features...")
480
-
481
- index, big_npy = load_index(index_path)
482
-
483
- if index is not None and big_npy is not None:
484
- feats = extract_contentvec_features(audio, sr, device)
485
-
486
- if feats.shape[1] != big_npy.shape[1]:
487
- print(f" Feature dim mismatch: {feats.shape[1]} vs {big_npy.shape[1]}, skipping index")
488
- else:
489
- feats = index_retrieval(feats, index, big_npy, index_rate)
490
- print(f" Index applied: {feats.shape}")
491
-
492
- # ========================================
493
- # Step 4: Extract and modify spectral envelope
494
- # ========================================
495
- print(" [4/6] Processing spectral envelope...")
496
-
497
- sp, ap, t = get_spectral_envelope(audio, sr, f0)
498
-
499
- if formant_shift != 1.0:
500
- sp = modify_spectral_envelope(sp, formant_shift)
501
- print(f" Formant shift applied: {formant_shift}")
502
-
503
- # ========================================
504
- # Step 5: Synthesize with PyWorld
505
- # ========================================
506
- print(" [5/6] Synthesizing audio...")
507
-
508
- if len(f0) != len(sp):
509
- f0 = np.interp(
510
- np.linspace(0, 1, len(sp)),
511
- np.linspace(0, 1, len(f0)),
512
- f0
513
- )
514
-
515
- f0_synth = f0.astype(np.float64)
516
- output_audio = pw.synthesize(f0_synth, sp, ap, sr)
517
  output_audio = output_audio.astype(np.float32)
518
-
519
- # ========================================
520
- # Step 6: Post-processing
521
- # ========================================
522
- print(" [6/6] Post-processing...")
523
-
524
- if protect > 0:
525
- if len(original_audio) != len(output_audio):
526
- original_resampled = librosa.resample(
527
- original_audio,
528
- orig_sr=sr,
529
- target_sr=int(sr * len(output_audio) / len(original_audio))
530
- )
531
- if len(original_resampled) > len(output_audio):
532
- original_resampled = original_resampled[:len(output_audio)]
533
- elif len(original_resampled) < len(output_audio):
534
- original_resampled = np.pad(original_resampled, (0, len(output_audio) - len(original_resampled)))
535
- else:
536
- original_resampled = original_audio
537
-
538
- output_audio = apply_protect(output_audio, original_resampled, f0, protect)
539
-
540
- if rms_mix_rate > 0:
541
- output_rms = np.sqrt(np.mean(output_audio ** 2))
542
- if output_rms > 0:
543
- target_rms = (1 - rms_mix_rate) * output_rms + rms_mix_rate * original_rms
544
- output_audio = output_audio * (target_rms / output_rms)
545
-
546
  max_val = np.abs(output_audio).max()
547
- if max_val > 0.99:
548
  output_audio = output_audio / max_val * 0.95
549
-
550
- output_sr = resample_sr if resample_sr > 0 else 44100
551
- if output_sr != sr:
552
- output_audio = librosa.resample(output_audio, orig_sr=sr, target_sr=output_sr)
553
-
554
- # ========================================
555
- # Save output
556
- # ========================================
557
  output_path = Path(output_path)
558
  output_path.parent.mkdir(parents=True, exist_ok=True)
559
- sf.write(str(output_path), output_audio, output_sr)
560
-
561
- output_duration = len(output_audio) / output_sr
562
  print(f" ✅ Conversion complete!")
563
- print(f" Output: {output_path} ({output_duration:.2f}s @ {output_sr}Hz)")
564
-
565
  return True
566
-
567
  except Exception as e:
568
- print(f" ❌ Conversion failed: {e}")
569
  traceback.print_exc()
570
-
 
571
  try:
572
  import shutil
573
  shutil.copy(input_path, output_path)
574
- print(f" ⚠️ Fallback: copied original audio to output")
575
  return True
576
- except Exception as e2:
577
- print(f" ❌ Fallback also failed: {e2}")
578
  return False
579
 
580
- # ============================================================
581
- # Command Line Interface
582
- # ============================================================
583
 
584
  if __name__ == "__main__":
585
  import argparse
586
-
587
- parser = argparse.ArgumentParser(description="RVC Voice Conversion")
588
- parser.add_argument("--input_path", "-i", required=True, help="Input audio file")
589
- parser.add_argument("--output_path", "-o", required=True, help="Output audio file")
590
- parser.add_argument("--model_path", "-m", required=True, help="Path to .pth model")
591
- parser.add_argument("--index_path", "-x", default=None, help="Path to .index file")
592
- parser.add_argument("--f0_method", "-f", default="crepe",
593
- choices=["crepe", "rmvpe", "harvest", "dio", "pm"],
594
- help="F0 extraction method")
595
- parser.add_argument("--f0_up_key", "-k", type=int, default=0,
596
- help="Pitch shift in semitones")
597
- parser.add_argument("--index_rate", "-r", type=float, default=0.75,
598
- help="Index feature rate (0-1)")
599
- parser.add_argument("--protect", "-p", type=float, default=0.33,
600
- help="Consonant protection (0-0.5)")
601
- parser.add_argument("--filter_radius", type=int, default=3,
602
- help="F0 filter radius (0-7)")
603
- parser.add_argument("--rms_mix_rate", type=float, default=0.25,
604
- help="Volume envelope mix (0-1)")
605
- parser.add_argument("--resample_sr", type=int, default=0,
606
- help="Output sample rate (0=auto)")
607
-
608
  args = parser.parse_args()
609
-
610
  success = rvc_convert(
611
- input_path=args.input_path,
612
- output_path=args.output_path,
613
- model_path=args.model_path,
614
- index_path=args.index_path,
615
- f0_method=args.f0_method,
616
- f0_up_key=args.f0_up_key,
617
- index_rate=args.index_rate,
618
- protect=args.protect,
619
- filter_radius=args.filter_radius,
620
- rms_mix_rate=args.rms_mix_rate,
621
- resample_sr=args.resample_sr,
622
  )
623
-
624
- sys.exit(0 if success else 1)
 
1
+ # rvc_infer.py - RVC inference for Hugging Face Spaces
2
  """
3
+ Simplified RVC (Retrieval-based Voice Conversion) inference
4
  Works with ZeroGPU on Hugging Face Spaces
 
 
 
5
  """
6
 
7
  import os
8
  import sys
9
  import torch
 
 
10
  import numpy as np
11
  import soundfile as sf
12
  from pathlib import Path
13
  import traceback
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
def rvc_convert(
    input_path: str,
    output_path: str,
    model_path: str,
    index_path: str = None,
    f0_method: str = "harvest",
    f0_up_key: int = 0,
    index_rate: float = 0.75,
) -> bool:
    """
    Convert voice with pitch modification via PyWorld analysis/resynthesis.

    NOTE(review): despite the name, the .pth model is only checked for
    existence — its weights are never loaded or applied. The audio is
    analysed and resynthesised with PyWorld (optionally pitch-shifted),
    so the output keeps the source timbre. Confirm whether full RVC
    inference is intended here.

    Args:
        input_path: Input audio file.
        output_path: Output audio file (written as 44.1 kHz WAV).
        model_path: Path to .pth model file (existence check only; see NOTE).
        index_path: Path to .index file (accepted but currently unused).
        f0_method: Pitch extraction method; "harvest" uses pw.harvest,
            anything else falls back to pw.dio refined by pw.stonemask.
        f0_up_key: Pitch shift in semitones.
        index_rate: Index influence rate (accepted but currently unused).

    Returns:
        bool: True on success, or when the fallback copy of the original
        audio succeeds; False only if even the fallback copy fails.
    """
    try:
        import pyworld as pw
        import librosa

        print(f"🎤 RVC Conversion starting...")
        print(f"   Input: {input_path}")
        print(f"   Model: {model_path}")

        # Check if model exists
        if not Path(model_path).exists():
            raise FileNotFoundError(f"Model not found: {model_path}")

        # Load audio; librosa returns mono float32 by default, so the
        # branch below only triggers if a caller disables mono mixing.
        audio, sr = librosa.load(input_path, sr=None)
        if len(audio.shape) > 1:
            # librosa stacks channels first: shape is (channels, samples).
            # Down-mix along axis 0 — axis=1 would average the samples of
            # each channel and return a length-`channels` array.
            audio = audio.mean(axis=0)

        # Resample to 16kHz if needed
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
            sr = 16000

        print(f"   Audio: {len(audio)/sr:.2f}s @ {sr}Hz")

        # PyWorld requires float64 input
        audio_f64 = audio.astype(np.float64)

        # Extract features using pyworld
        print(f"   Extracting pitch ({f0_method})...")

        if f0_method == "harvest":
            f0, t = pw.harvest(audio_f64, sr, frame_period=10)
        else:
            f0, t = pw.dio(audio_f64, sr, frame_period=10)
            f0 = pw.stonemask(audio_f64, f0, t, sr)  # refine coarse DIO F0

        sp = pw.cheaptrick(audio_f64, f0, t, sr)  # spectral envelope
        ap = pw.d4c(audio_f64, f0, t, sr)         # aperiodicity

        # Apply pitch shift: multiplying F0 by 2^(k/12) shifts k semitones
        if f0_up_key != 0:
            print(f"   Applying pitch shift: {f0_up_key} semitones")
            f0 = f0 * (2 ** (f0_up_key / 12))

        # Synthesize
        print(f"   Synthesizing...")
        output_audio = pw.synthesize(f0, sp, ap, sr)
        output_audio = output_audio.astype(np.float32)

        # Peak-normalize with a little headroom (0.95 of full scale)
        max_val = np.abs(output_audio).max()
        if max_val > 0:
            output_audio = output_audio / max_val * 0.95

        # Resample back to 44100 for output
        output_audio = librosa.resample(output_audio, orig_sr=sr, target_sr=44100)

        # Save
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        sf.write(str(output_path), output_audio, 44100)

        print(f"   ✅ Conversion complete!")
        return True

    except Exception as e:
        print(f"   ❌ RVC failed: {e}")
        traceback.print_exc()

        # Fallback: copy input to output so the pipeline keeps going
        try:
            import shutil
            shutil.copy(input_path, output_path)
            print(f"   ⚠️ Fallback: using original audio")
            return True
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate out of the fallback path.
            return False
116
 
 
 
 
117
 
118
if __name__ == "__main__":
    import argparse

    # CLI wrapper around rvc_convert; exits 0 on success, 1 on failure.
    parser = argparse.ArgumentParser(description="RVC voice conversion (PyWorld-based)")
    parser.add_argument("--input_path", required=True, help="Input audio file")
    parser.add_argument("--output_path", required=True, help="Output audio file")
    parser.add_argument("--model_path", required=True, help="Path to .pth model file")
    parser.add_argument("--index_path", default=None, help="Path to .index file (optional)")
    parser.add_argument("--f0_method", default="harvest", help="Pitch extraction method")
    parser.add_argument("--f0_up_key", type=int, default=0, help="Pitch shift in semitones")
    # rvc_convert accepts index_rate but the old CLI never exposed it;
    # default matches the function's own default, so behavior is unchanged.
    parser.add_argument("--index_rate", type=float, default=0.75,
                        help="Index influence rate (0-1)")

    args = parser.parse_args()

    # Keyword arguments guard against positional drift if the signature
    # of rvc_convert ever changes.
    success = rvc_convert(
        input_path=args.input_path,
        output_path=args.output_path,
        model_path=args.model_path,
        index_path=args.index_path,
        f0_method=args.f0_method,
        f0_up_key=args.f0_up_key,
        index_rate=args.index_rate,
    )

    sys.exit(0 if success else 1)