Upload 3 files
Browse files- models/app.py +158 -0
- models/requirements.txt +12 -0
- models/rvc_infer.py +140 -0
models/app.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py - 9th Anniversary Celebration App
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import spaces
|
| 4 |
+
import os
|
| 5 |
+
import tempfile
|
| 6 |
+
import shutil
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
from utils import (
|
| 11 |
+
separate_vocals_and_instrumental,
|
| 12 |
+
merge_vocals_and_instrumental,
|
| 13 |
+
optimize_audio,
|
| 14 |
+
)
|
| 15 |
+
from rvc_infer import rvc_convert
|
| 16 |
+
|
| 17 |
+
# Directory that contains this file; asset paths below are joined onto it.
PROJECT_ROOT = Path(__file__).parent

# One (year, file stem, dedication message) tuple per anniversary track.
# The stem is shared by the cloned rendition and the original recording.
_SONG_ENTRIES = (
    (2017, "爱的故事上集-孙耀威",
     "星的光点点洒于午夜,我们的故事,从这一年开始书写 💕"),
    (2018, "周杰伦 - 告白气球",
     "你说你有点难追,想让我知难而退。我没有退,这一年,我们更近了 ❤️"),
    (2019, "林俊杰 - 修炼爱情",
     "爱情需要修炼,每一年的陪伴,都是我们爱情的见证 🌟"),
    (2020, "周深-雪落下的声音",
     "就像雪花轻轻落下,你已经填满我的心 🎨"),
    (2021, "胡夏&郁可唯-知否知否",
     "知否知否,时光荏苒,但我们的爱依然如初 💖"),
    (2022, "陈奕迅 - 陪你度过漫长岁月",
     "陪你把独自孤单,变成了勇敢 🌸"),
    (2023, "Edd_Sheeran_-_Perfect",
     "Baby, you're perfect in my eyes ✨"),
    (2024, "Michael_Learns_To_Rock_-_Take_Me_To_Your_Heart_Original_Version",
     "Take me to your heart, take me to your soul 🏠"),
    (2025, "Richard_Marx-Right_here_waiting_for_you_(mp3.pm)",
     "I will be right here waiting for you. 9年了,爱依然如故 💝"),
)

# Expanded per-year records consumed by the song gallery:
#   "file"     -> cloned rendition under outputs/
#   "original" -> source recording under songs/
SONGS_CONFIG = [
    {
        "year": year,
        "file": f"outputs/{stem}_cloned.wav",
        "original": f"songs/{stem}.mp3",
        "message": message,
    }
    for year, stem, message in _SONG_ENTRIES
]
|
| 48 |
+
|
| 49 |
+
def get_audio_path(song, version="cloned"):
    """Return the absolute-or-relative path string of a song's audio, or None.

    Args:
        song: One record from SONGS_CONFIG (needs the "file" and "original" keys).
        version: "cloned" selects the "file" entry; any other value selects
            the "original" entry.

    Returns:
        The path under PROJECT_ROOT as a string when that file exists on
        disk, otherwise None.
    """
    if version == "cloned":
        relative = song["file"]
    else:
        relative = song["original"]

    candidate = PROJECT_ROOT / relative
    if not candidate.exists():
        return None
    return str(candidate)
|
| 53 |
+
|
| 54 |
+
@spaces.GPU(duration=300)
def convert_voice(audio_file, progress=gr.Progress()):
    """Run the upload pipeline: separate, convert, remix, and save the result.

    Steps, all performed inside a temporary working directory:
      1. ``separate_vocals_and_instrumental`` splits the upload; when it
         yields no vocals path the whole upload is converted instead.
      2. The first existing ``.pth`` file among the known model names under
         ``PROJECT_ROOT / "models"`` is handed to ``rvc_convert``; with no
         model present the audio is passed through unchanged.
      3. The converted vocals are merged with the instrumental when one is
         available, otherwise passed through ``optimize_audio``.
      4. The final file is copied to ``PROJECT_ROOT / "outputs"`` so that it
         outlives the temporary directory.

    Args:
        audio_file: Path of the uploaded audio file, or None when nothing
            was uploaded.
        progress: Gradio progress reporter.

    Returns:
        tuple: ``(result_path, status_message)``; ``result_path`` is None
        when there is no upload or the conversion step reports failure.
    """
    if audio_file is None:
        return None, "❌ 请上传一个音频文件"

    progress(0.05, desc="🎵 开始处理...")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        input_path = Path(audio_file)

        progress(0.1, desc="步骤1: 读谱 - 分离人声和伴奏...")
        vocals_path, instrumental_path = separate_vocals_and_instrumental(input_path, tmpdir)

        if vocals_path is None:
            progress(0.3, desc="⚠️ 跳过分离,直接转换...")
            target_audio = input_path
            instrumental_path = None
        else:
            progress(0.4, desc="✅ 人声分离完成")
            target_audio = vocals_path

        progress(0.5, desc="步骤2: 清嗓子 - 声线转换...")
        converted_vocals = tmpdir / "converted.wav"

        # Pick the first model file that exists, in order of preference.
        model_dir = PROJECT_ROOT / "models"
        candidates = (
            model_dir / f"{name}.pth"
            for name in ["xiujia-1220-best", "xiujia-best", "xiujia"]
        )
        model_path = next((path for path in candidates if path.exists()), None)

        if model_path is not None:
            converted_ok = rvc_convert(
                str(target_audio), str(converted_vocals), str(model_path)
            )
            # rvc_convert returns False only when it could not write any
            # output file; stop here instead of failing later on a missing
            # converted.wav.
            if not converted_ok:
                return None, "❌ 声线转换失败,请稍后重试"
        else:
            shutil.copy(target_audio, converted_vocals)
            progress(0.7, desc="⚠️ 未找到模型,使用原音")

        progress(0.8, desc="✅ 声线转换完成")
        progress(0.85, desc="步骤3: 开唱 - 合成音频...")

        final_output = tmpdir / "final.wav"

        # Path() makes the existence check work whether the separation
        # helper returned a str or a Path.
        if instrumental_path and Path(instrumental_path).exists():
            merge_vocals_and_instrumental(converted_vocals, instrumental_path, final_output)
        else:
            optimize_audio(converted_vocals, final_output)

        # Date plus microseconds keeps names unique across days and across
        # requests that finish within the same second.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        result_path = PROJECT_ROOT / "outputs" / f"converted_{stamp}.wav"
        result_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(final_output, result_path)

        progress(1.0, desc="✅ 完成!")
        return str(result_path), "🎉 转换成功!听听看吧~"
|
| 110 |
+
|
| 111 |
+
# Page styling: pink gradient background and centered pink headings.
css = """
.gradio-container { background: linear-gradient(135deg, #ffeef8, #fff0f5, #ffeef8) !important; }
h1, h2, h3 { color: #d63384 !important; text-align: center; }
"""


def _image_row(file_names):
    """Render one row showing every named image that exists under PROJECT_ROOT."""
    with gr.Row():
        for file_name in file_names:
            picture = PROJECT_ROOT / file_name
            if picture.exists():
                gr.Image(str(picture), show_label=False, height=220, container=False)


with gr.Blocks(title="💕 9周年纪念", theme=gr.themes.Soft(primary_hue="pink"), css=css) as demo:
    gr.Markdown("# 💕 9th Anniversary Celebration 💕\n### 2017 - 2025 · 九年,久远")

    # Header photos (skipped silently when the files are absent).
    _image_row(["couple.png", "couple1.png"])

    # Gallery tab: one collapsed accordion per year with whichever of the
    # cloned / original recordings exist on disk.
    with gr.Tab("🎵 九年歌曲集"):
        gr.Markdown("## 🎵 九年,唱不尽的爱")
        for entry in SONGS_CONFIG:
            with gr.Accordion(f"💗 {entry['year']} 年", open=False):
                gr.Markdown(f"*{entry['message']}*")
                with gr.Row():
                    cloned_audio = get_audio_path(entry, "cloned")
                    original_audio = get_audio_path(entry, "original")
                    if cloned_audio:
                        gr.Audio(cloned_audio, label="🎤 老公唱")
                    if original_audio:
                        gr.Audio(original_audio, label="🎵 原唱")

    # Upload tab: input and controls on the left, converted result on the right.
    with gr.Tab("🎤 上传歌曲"):
        gr.Markdown("## 🎤 上传MP3,我唱给你听!")
        with gr.Row():
            with gr.Column():
                audio_in = gr.Audio(label="选择歌曲 🎵", type="filepath", sources=["upload"])
                btn = gr.Button("✨ 开始转换", variant="primary", size="lg")
                status = gr.Textbox(label="状态", interactive=False)
            with gr.Column():
                audio_out = gr.Audio(label="🎵 老公开唱", type="filepath")
        btn.click(convert_voice, [audio_in], [audio_out, status])

    gr.Markdown("---\n## 💝 九年不是终点,而是我们故事的第九章 💝")

    # Footer photos.
    _image_row(["family.png", "family2.png"])

if __name__ == "__main__":
    demo.launch()
|
models/requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spaces>=0.19.0
|
| 2 |
+
torch>=2.0.0
|
| 3 |
+
torchaudio
|
| 4 |
+
demucs
|
| 5 |
+
numpy
|
| 6 |
+
scipy
|
| 7 |
+
pydub
|
| 8 |
+
soundfile
|
| 9 |
+
librosa
|
| 10 |
+
pyworld
|
| 11 |
+
gradio
|
| 12 |
+
huggingface_hub==0.22.2
|
models/rvc_infer.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# rvc_infer.py - RVC inference for Hugging Face Spaces
|
| 2 |
+
"""
|
| 3 |
+
Simplified RVC (Retrieval-based Voice Conversion) inference
|
| 4 |
+
Works with ZeroGPU on Hugging Face Spaces
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import torch
|
| 10 |
+
import numpy as np
|
| 11 |
+
import soundfile as sf
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import traceback
|
| 14 |
+
|
| 15 |
+
def rvc_convert(
    input_path: str,
    output_path: str,
    model_path: str,
    index_path: str = None,
    f0_method: str = "harvest",
    f0_up_key: int = 0,
    index_rate: float = 0.75,
):
    """
    Re-synthesize a recording through the WORLD vocoder with optional pitch shift.

    NOTE(review): despite its name, this function never loads ``model_path``;
    the file is only checked for existence. ``index_path`` and ``index_rate``
    are accepted but not used. The audio is analysed with pyworld (f0,
    spectral envelope, aperiodicity), the f0 contour is optionally
    transposed, and the signal is synthesized again from those features.

    Args:
        input_path: Input audio file
        output_path: Output audio file (written at 44100 Hz)
        model_path: Path to .pth model file (must exist; contents unused)
        index_path: Path to .index file (optional, unused)
        f0_method: Pitch extraction method; "harvest" uses pyworld harvest,
            any other value uses dio followed by stonemask refinement
        f0_up_key: Pitch shift in semitones
        index_rate: Index influence rate (unused)

    Returns:
        bool: True when an output file was written, either the processed
        audio or, after any failure, a plain copy of the input. False when
        even that fallback copy failed.
    """
    # Analysis and synthesis must share one frame period. pw.synthesize
    # defaults to 5 ms, so omitting it after a 10 ms analysis would render
    # the audio at roughly half its duration.
    frame_period_ms = 10.0

    try:
        import pyworld as pw
        import librosa

        print("🎤 RVC Conversion starting...")
        print(f" Input: {input_path}")
        print(f" Model: {model_path}")

        # Check if model exists
        if not Path(model_path).exists():
            raise FileNotFoundError(f"Model not found: {model_path}")

        # Load audio; mono=True (librosa's default) already yields a 1-D
        # signal, so no manual channel averaging is needed.
        audio, sr = librosa.load(input_path, sr=None, mono=True)

        # Resample to 16kHz if needed
        if sr != 16000:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
            sr = 16000

        print(f" Audio: {len(audio)/sr:.2f}s @ {sr}Hz")

        # Convert to float64 for pyworld
        audio_f64 = audio.astype(np.float64)

        # Extract features using pyworld
        print(f" Extracting pitch ({f0_method})...")

        if f0_method == "harvest":
            f0, t = pw.harvest(audio_f64, sr, frame_period=frame_period_ms)
        else:
            f0, t = pw.dio(audio_f64, sr, frame_period=frame_period_ms)
            f0 = pw.stonemask(audio_f64, f0, t, sr)

        sp = pw.cheaptrick(audio_f64, f0, t, sr)
        ap = pw.d4c(audio_f64, f0, t, sr)

        # Apply pitch shift (unvoiced frames have f0 == 0 and stay 0)
        if f0_up_key != 0:
            print(f" Applying pitch shift: {f0_up_key} semitones")
            f0 = f0 * (2 ** (f0_up_key / 12))

        # Synthesize with the same frame period used for analysis
        print(" Synthesizing...")
        output_audio = pw.synthesize(f0, sp, ap, sr, frame_period=frame_period_ms)
        output_audio = output_audio.astype(np.float32)

        # Normalize peak to 0.95 (skipped for an all-zero signal)
        max_val = np.abs(output_audio).max()
        if max_val > 0:
            output_audio = output_audio / max_val * 0.95

        # Resample back to 44100 for output
        output_audio = librosa.resample(output_audio, orig_sr=sr, target_sr=44100)

        # Save
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        sf.write(str(output_path), output_audio, 44100)

        print(" ✅ Conversion complete!")
        return True

    except Exception as e:
        print(f" ❌ RVC failed: {e}")
        traceback.print_exc()

        # Fallback: copy input to output
        try:
            import shutil
            shutil.copy(input_path, output_path)
            print(" ⚠️ Fallback: using original audio")
            return True
        except Exception as copy_error:
            # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            print(f" ❌ Fallback copy failed: {copy_error}")
            return False
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# Command-line entry point: parse the conversion options, run one
# conversion, and exit with 0 on success or 1 on failure.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser()
    for required_flag in ("--input_path", "--output_path", "--model_path"):
        cli.add_argument(required_flag, required=True)
    cli.add_argument("--index_path", default=None)
    cli.add_argument("--f0_method", default="harvest")
    cli.add_argument("--f0_up_key", type=int, default=0)
    options = cli.parse_args()

    succeeded = rvc_convert(
        input_path=options.input_path,
        output_path=options.output_path,
        model_path=options.model_path,
        index_path=options.index_path,
        f0_method=options.f0_method,
        f0_up_key=options.f0_up_key,
    )

    if succeeded:
        sys.exit(0)
    sys.exit(1)
|