# RVC-CH / app.py
# Author: LosCaquitos — "Update app.py" (commit 5d082c2, verified)
"""
RVC Voice Conversion – HuggingFace Space
Simple, fast, GPU/CPU auto-detected.
"""
from __future__ import annotations
import os
import subprocess
import tempfile
import shutil
from pathlib import Path
import gradio as gr
import numpy as np
from lib.config import (
BUILTIN_MODELS,
CSS,
DEVICE_LABEL,
MAX_INPUT_DURATION,
logger,
)
from lib.jobs import (
get_jobs_table,
get_queue_info,
poll_job,
submit_job,
)
from lib.models import list_models, startup_downloads
from lib.ui import refresh_models, toggle_autotune, upload_model
# ── Startup ───────────────────────────────────────────────────────────────────
startup_status = ""
default_model = ""
try:
    # Pre-fetch the built-in models; returns the name of the default one.
    default_model = startup_downloads()
    startup_status = f"βœ… Ready  Β·  {DEVICE_LABEL}"
except Exception as e:
    # Keep the app usable even when some assets failed to download.
    startup_status = f"⚠️ Some assets unavailable: {e}  Β·  {DEVICE_LABEL}"
    logger.warning("Startup download issue: %s", e)

initial_models = list_models()
# Prefer the model chosen at startup; otherwise fall back to the first
# available model, or None when nothing is installed yet.
if default_model in initial_models:
    initial_value = default_model
else:
    initial_value = initial_models[0] if initial_models else None
# ── FunΓ§Γ£o para processar vΓ­deo ───────────────────────────────────────────────
def process_video(video_file, model, pitch, f0_method, index_rate, protect, vol_env,
                  clean, clean_strength, split, autotune, autotune_strength,
                  filter_radius, output_format, reverb, reverb_room, reverb_damp, reverb_wet):
    """Process a video: extract its audio, run RVC voice conversion, remix,
    and bundle 5 audio files into a ZIP.

    Args:
        video_file: Path to the uploaded video, or None.
        model, pitch, f0_method, ...: RVC parameters forwarded to submit_job.

    Returns:
        tuple: (path to the output ZIP or None, status message).
    """
    if video_file is None:
        return None, "No video file provided"

    import soundfile as sf  # local import: optional heavy dependency
    import zipfile

    output_dir = Path(tempfile.mkdtemp())

    # Extract the audio track from the video with ffmpeg.
    audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
    cmd = ["ffmpeg", "-i", video_file, "-q:a", "0", "-map", "a", "-y", audio_path]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Fix: surface ffmpeg's stderr as a status instead of crashing the UI callback.
        detail = e.stderr.decode(errors="replace")[-300:] if e.stderr else str(e)
        return None, f"Audio extraction failed: {detail}"

    try:
        # entrada.wav — raw extracted audio.
        entrada_wav = output_dir / "entrada.wav"
        shutil.copy2(audio_path, entrada_wav)
        # Simplified "stem separation": acapella = full audio, instrumental = silence.
        entrada_acapella = output_dir / "entrada_acapella.wav"
        entrada_instrumental = output_dir / "entrada_instrumental.wav"
        shutil.copy2(audio_path, entrada_acapella)
        data, sr = sf.read(audio_path)
        sf.write(entrada_instrumental, np.zeros_like(data), sr)
    finally:
        # Fix: the delete=False temp file was previously never removed.
        os.unlink(audio_path)

    # Convert the acapella with RVC via the job queue.
    status, converted = submit_job(
        None, str(entrada_acapella), model, pitch, f0_method,
        index_rate, protect, vol_env, clean, clean_strength,
        split, autotune, autotune_strength, filter_radius,
        output_format, reverb, reverb_room, reverb_damp, reverb_wet
    )
    if not converted:
        return None, f"Conversion failed: {status}"

    saida_acapella = output_dir / "saida_acapella.wav"
    shutil.copy2(converted, saida_acapella)

    # Mix the converted vocals with the (silent) instrumental track.
    rvc_audio, rvc_sr = sf.read(saida_acapella)
    inst_audio, inst_sr = sf.read(entrada_instrumental)
    if rvc_sr != inst_sr:
        from scipy import signal
        inst_audio = signal.resample(inst_audio, int(len(inst_audio) * rvc_sr / inst_sr))
    min_len = min(len(rvc_audio), len(inst_audio))
    mixed = rvc_audio[:min_len] + inst_audio[:min_len]
    # Peak-limit to 0.95 to avoid clipping. Fix: np.max handles multichannel
    # (2-D) audio, where the builtin max() would raise ValueError when
    # comparing whole rows; the peak is also computed only once now.
    peak = np.max(np.abs(mixed)) if mixed.size else 0.0
    if peak > 0.95:
        mixed = mixed * (0.95 / peak)
    saida_wav = output_dir / "saida.wav"
    sf.write(saida_wav, mixed, rvc_sr)

    # Package all 5 files in a single downloadable ZIP.
    zip_path = output_dir / "outputs.zip"
    with zipfile.ZipFile(zip_path, 'w') as z:
        z.write(saida_acapella, "saida_acapella.wav")
        z.write(saida_wav, "saida.wav")
        z.write(entrada_acapella, "entrada_acapella.wav")
        z.write(entrada_wav, "entrada.wav")
        z.write(entrada_instrumental, "entrada_instrumental.wav")
    return str(zip_path), "βœ… Conversion complete! ZIP with 5 files ready."
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Five tabs: Convert (audio), Video Convert, Models (upload/list), Jobs, Help.
# NOTE(review): indentation reconstructed from a whitespace-stripped paste —
# the component nesting below is the most plausible layout; confirm rendering.
with gr.Blocks(title="RVC Voice Conversion", delete_cache=(3600, 3600)) as demo:
    # Header banner plus the startup status computed at import time.
    gr.HTML(f"""
<div id="header">
<h1>πŸŽ™οΈ RVC Voice Conversion</h1>
<p>Retrieval-Based Voice Conversion Β· record or upload Β· custom models Β· GPU/CPU auto</p>
</div>
<p id="status">{startup_status}</p>
""")
    with gr.Tabs():
        # ── TAB 1: Convert ────────────────────────────────────────────────────
        with gr.Tab("🎤 Convert"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### πŸ”Š Input Audio")
                    # Two input sources; both yield a filepath for submit_job.
                    with gr.Tabs():
                        with gr.Tab("πŸŽ™οΈ Microphone"):
                            inp_mic = gr.Audio(
                                sources=["microphone"],
                                type="filepath",
                                label="Record",
                            )
                        with gr.Tab("πŸ“ Upload File"):
                            inp_file = gr.Audio(
                                sources=["upload"],
                                type="filepath",
                                label="Upload audio (wav / mp3 / flac / ogg …)",
                            )
                    gr.Markdown("### πŸ€– Model")
                    model_dd = gr.Dropdown(
                        choices=initial_models,
                        value=initial_value,
                        label="Active Voice Model",
                        interactive=True,
                    )
                    gr.Markdown("### 🎚️ Basic Settings")
                    pitch_sl = gr.Slider(
                        minimum=-24, maximum=24, value=0, step=1,
                        label="Pitch Shift (semitones)",
                        info="0 = unchanged Β· positive = higher Β· negative = lower",
                    )
                    f0_radio = gr.Radio(
                        choices=["rmvpe", "fcpe", "crepe", "crepe-tiny"],
                        value="rmvpe",
                        label="Pitch Extraction Method",
                        info="rmvpe = fastest & accurate Β· crepe = highest quality (slower)",
                    )
                with gr.Column(scale=1):
                    gr.Markdown("### βš™οΈ Advanced Settings")
                    with gr.Accordion("Expand advanced options", open=False):
                        index_rate_sl = gr.Slider(
                            0.0, 1.0, value=0.75, step=0.05,
                            label="Index Rate",
                            info="How strongly the FAISS index influences timbre (0 = off)",
                        )
                        protect_sl = gr.Slider(
                            0.0, 0.5, value=0.5, step=0.01,
                            label="Protect Consonants",
                            info="Protects unvoiced consonants β€” 0.5 = max protection",
                        )
                        filter_radius_sl = gr.Slider(
                            0, 7, value=3, step=1,
                            label="Respiration Filter Radius",
                            info="Median filter on pitch β€” higher = smoother, reduces breath noise",
                        )
                        vol_env_sl = gr.Slider(
                            0.0, 1.0, value=0.25, step=0.05,
                            label="Volume Envelope Mix",
                            info="0.25 = natural blend Β· 1 = preserve input loudness Β· 0 = model output",
                        )
                        with gr.Row():
                            clean_cb = gr.Checkbox(value=False, label="Noise Reduction")
                            clean_sl = gr.Slider(
                                0.0, 1.0, value=0.5, step=0.05,
                                label="Reduction Strength",
                            )
                        with gr.Row():
                            split_cb = gr.Checkbox(value=False, label="Split Long Audio")
                            autotune_cb = gr.Checkbox(value=False, label="Autotune")
                        # Hidden until the Autotune checkbox is ticked.
                        autotune_sl = gr.Slider(
                            0.0, 1.0, value=1.0, step=0.05,
                            label="Autotune Strength",
                            visible=False,
                        )
                        autotune_cb.change(
                            fn=toggle_autotune,
                            inputs=autotune_cb,
                            outputs=autotune_sl,
                        )
                        gr.Markdown("**πŸŽ›οΈ Reverb**")
                        reverb_cb = gr.Checkbox(value=False, label="Enable Reverb")
                        # Reverb sliders grouped so they can be shown/hidden together.
                        with gr.Group(visible=False) as reverb_group:
                            reverb_room_sl = gr.Slider(
                                0.0, 1.0, value=0.15, step=0.05,
                                label="Room Size",
                                info="Larger = bigger sounding space",
                            )
                            reverb_damp_sl = gr.Slider(
                                0.0, 1.0, value=0.7, step=0.05,
                                label="Damping",
                                info="Higher = more absorption, less echo tail",
                            )
                            reverb_wet_sl = gr.Slider(
                                0.0, 1.0, value=0.15, step=0.05,
                                label="Wet Level",
                                info="How much reverb is mixed in (0.15 = subtle)",
                            )
                        reverb_cb.change(
                            fn=lambda v: gr.update(visible=v),
                            inputs=reverb_cb,
                            outputs=reverb_group,
                        )
                    fmt_radio = gr.Radio(
                        choices=["WAV", "MP3", "FLAC", "OPUS"],
                        value="WAV",
                        label="Output Format",
                        info="OPUS = small file (~64 kbps, Telegram/Discord quality)",
                    )
                    convert_btn = gr.Button(
                        "πŸš€ Convert Voice",
                        variant="primary",
                    )
                    gr.Markdown("### 🎧 Output")
                    out_status = gr.Markdown(value="")
                    out_audio = gr.Audio(label="Result (if still on page)", type="filepath", interactive=False)
                    # Jobs survive page reloads; users can poll by Job ID.
                    gr.Markdown("#### πŸ” Check Job Status")
                    with gr.Row():
                        job_id_box = gr.Textbox(
                            label="Job ID",
                            placeholder="e.g. a3f2b1c9",
                            scale=3,
                        )
                        poll_btn = gr.Button("πŸ”„ Check", scale=1)
                    poll_status = gr.Markdown(value="")
                    poll_audio = gr.Audio(label="Result", type="filepath", interactive=False)
        # ── TAB 2: Video Convert ────────────────────────────────────────────────
        with gr.Tab("🎬 Video Convert"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### πŸŽ₯ Input Video")
                    video_file = gr.Video(label="Upload MP4 Video", sources=["upload"], format="mp4")
                    gr.Markdown("### πŸ€– Model")
                    video_model = gr.Dropdown(
                        choices=initial_models,
                        value=initial_value,
                        label="Active Voice Model",
                        interactive=True,
                    )
                    gr.Markdown("### 🎚️ Basic Settings")
                    video_pitch = gr.Slider(-24, 24, value=0, step=1, label="Pitch Shift")
                    video_f0 = gr.Radio(["rmvpe", "fcpe", "crepe", "crepe-tiny"], value="rmvpe", label="Pitch Method")
                with gr.Column(scale=1):
                    gr.Markdown("### βš™οΈ Advanced Settings")
                    with gr.Accordion("Expand", open=False):
                        video_index = gr.Slider(0.0, 1.0, value=0.75, step=0.05, label="Index Rate")
                        video_protect = gr.Slider(0.0, 0.5, value=0.5, step=0.01, label="Protect Consonants")
                        video_filter = gr.Slider(0, 7, value=3, step=1, label="Filter Radius")
                        video_vol = gr.Slider(0.0, 1.0, value=0.25, step=0.05, label="Volume Envelope")
                        video_clean = gr.Checkbox(value=False, label="Noise Reduction")
                        video_clean_strength = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Clean Strength")
                        video_split = gr.Checkbox(value=False, label="Split Long Audio")
                        video_autotune = gr.Checkbox(value=False, label="Autotune")
                        video_autotune_strength = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Autotune Strength")
                        gr.Markdown("**πŸŽ›οΈ Reverb**")
                        video_reverb = gr.Checkbox(value=False, label="Enable Reverb")
                        video_reverb_room = gr.Slider(0.0, 1.0, value=0.15, step=0.05, label="Room Size")
                        video_reverb_damp = gr.Slider(0.0, 1.0, value=0.7, step=0.05, label="Damping")
                        video_reverb_wet = gr.Slider(0.0, 1.0, value=0.15, step=0.05, label="Wet Level")
                    video_format = gr.Radio(["WAV", "MP3", "FLAC", "OPUS"], value="WAV", label="Output Format")
                    video_convert_btn = gr.Button("🎬 Convert Video", variant="primary")
                    gr.Markdown("### πŸ“¦ Output")
                    video_status = gr.Markdown(value="")
                    video_output = gr.File(label="Download ZIP (5 audio files)", type="filepath")
        # ── TAB 3: Models ─────────────────────────────────────────────────────
        with gr.Tab("πŸ“¦ Models"):
            gr.Markdown("""
### Upload a Custom RVC Model
Provide a **`.zip`** containing:
- **`model.pth`** β€” weights (required)
- **`model.index`** β€” FAISS index (optional, improves voice matching)
**Built-in models** (pre-downloaded on startup):
Vestia Zeta v1 Β· Vestia Zeta v2 Β· Ayunda Risu Β· Gawr Gura
""")
            with gr.Row():
                with gr.Column(scale=1):
                    up_zip = gr.File(label="Model ZIP", file_types=[".zip"])
                    up_name = gr.Textbox(
                        label="Model Name",
                        placeholder="Leave blank to use zip filename",
                    )
                    up_btn = gr.Button("πŸ“€ Load Model", variant="primary")
                    up_status = gr.Textbox(label="Status", interactive=False, lines=2)
                with gr.Column(scale=1):
                    gr.Markdown("### Loaded Models")
                    models_table = gr.Dataframe(
                        col_count=(1, "fixed"),
                        value=[[m] for m in initial_models],
                        interactive=False,
                        label="",
                    )
                    refresh_btn = gr.Button("πŸ”„ Refresh")
            # Uploading also refreshes the model dropdown and the table.
            up_btn.click(
                fn=upload_model,
                inputs=[up_zip, up_name],
                outputs=[up_status, model_dd, models_table],
            )
            refresh_btn.click(
                fn=refresh_models,
                outputs=[models_table, model_dd],
            )
        # ── TAB 4: Jobs ───────────────────────────────────────────────────────
        with gr.Tab("πŸ“‹ Jobs"):
            gr.Markdown("All submitted jobs, newest first. Click **Refresh** to update.")
            # Callables + every=10 make these auto-refresh every 10 seconds.
            queue_status = gr.Markdown(value=get_queue_info, every=10)
            jobs_table = gr.Dataframe(
                headers=["Job ID", "Model", "Status", "Time", "Download"],
                col_count=(5, "fixed"),
                value=get_jobs_table,
                interactive=False,
                wrap=True,
                datatype=["str", "str", "str", "str", "markdown"],
                every=10,
            )
            refresh_jobs_btn = gr.Button("πŸ”„ Refresh")

            def _refresh_jobs():
                """Manual refresh: return updated queue summary and jobs table."""
                return get_queue_info(), get_jobs_table()

            refresh_jobs_btn.click(fn=_refresh_jobs, outputs=[queue_status, jobs_table])
        # ── TAB 5: Help ───────────────────────────────────────────────────────
        with gr.Tab("ℹ️ Help"):
            gr.Markdown(f"""
## How it works
RVC (Retrieval-Based Voice Conversion) transforms a voice recording to sound
like a target speaker using only that speaker's model file.
---
## Quick Guide
1. Open the **Convert** tab
2. **Record** via microphone or **upload** an audio file (wav, mp3, flac, ogg …)
3. Choose a **model** from the dropdown β€” 4 models are pre-loaded on startup
4. Set **Pitch Shift** if needed (e.g. male β†’ female: try +12 semitones)
5. Click **πŸš€ Convert Voice** and wait for the result
---
## Built-in Models
| Model | Description |
|---|---|
| **Vestia Zeta v1** | Hololive ID VTuber, v1 model |
| **Vestia Zeta v2** | Hololive ID VTuber, v2 model (recommended) |
| **Ayunda Risu** | Hololive ID VTuber |
| **Gawr Gura** | Hololive EN VTuber |
---
## Pitch Extraction Methods
| Method | Speed | Quality | Best for |
|---|---|---|---|
| **rmvpe** | ⚑⚑⚑ | β˜…β˜…β˜…β˜… | General use (default) |
| **fcpe** | ⚑⚑ | β˜…β˜…β˜…β˜… | Singing |
| **crepe** | ⚑ | β˜…β˜…β˜…β˜…β˜… | Highest quality, slow |
| **crepe-tiny** | ⚑⚑ | β˜…β˜…β˜… | Low resource |
---
## Advanced Settings
| Setting | Description |
|---|---|
| **Index Rate** | Influence of FAISS index on output timbre (0.75 recommended) |
| **Protect Consonants** | Prevents artefacts on consonants (0.5 = max) |
| **Respiration Filter Radius** | Smooths pitch curve β€” higher reduces breath noise (0–7, default 3) |
| **Volume Envelope Mix** | 0.25 = natural blend Β· 1 = preserve input loudness |
| **Noise Reduction** | Removes background noise before conversion |
| **Split Long Audio** | Chunks audio for recordings > 60 s |
| **Autotune** | Snaps pitch to nearest musical note |
---
## Output Formats
| Format | Size | Quality |
|---|---|---|
| **WAV** | Large | Lossless |
| **FLAC** | Medium | Lossless compressed |
| **MP3** | Small | Lossy |
| **OPUS** | Tiny (~64 kbps) | Telegram/Discord quality |
---
**Device:** `{DEVICE_LABEL}`
**Max input duration:** {MAX_INPUT_DURATION // 60} minutes
---
## Credits
Engine: [Ultimate RVC](https://github.com/JackismyShephard/ultimate-rvc)
""")

    # Wire convert button after all tabs
    def _submit_and_extract_id(*args):
        """Submit a job, then pull the 8-hex-char job id out of the status text
        so the Job ID box is pre-filled for later polling."""
        import re
        status, audio = submit_job(*args)
        match = re.search(r"[a-f0-9]{8}", status or "")
        job_id = match.group(0) if match else ""
        return status, audio, job_id, get_queue_info(), get_jobs_table()

    convert_btn.click(
        fn=_submit_and_extract_id,
        inputs=[
            inp_mic, inp_file, model_dd,
            pitch_sl, f0_radio,
            index_rate_sl, protect_sl, vol_env_sl,
            clean_cb, clean_sl,
            split_cb, autotune_cb, autotune_sl,
            filter_radius_sl,
            fmt_radio,
            reverb_cb, reverb_room_sl, reverb_damp_sl, reverb_wet_sl,
        ],
        outputs=[out_status, out_audio, job_id_box, queue_status, jobs_table],
    )

    def _poll_and_refresh(job_id):
        """Poll a job by id and refresh the queue summary + jobs table."""
        status, file = poll_job(job_id)
        return status, file, get_queue_info(), get_jobs_table()

    poll_btn.click(
        fn=_poll_and_refresh,
        inputs=[job_id_box],
        outputs=[poll_status, poll_audio, queue_status, jobs_table],
    )
    # Video convert
    video_convert_btn.click(
        fn=process_video,
        inputs=[
            video_file, video_model, video_pitch, video_f0,
            video_index, video_protect, video_vol,
            video_clean, video_clean_strength,
            video_split, video_autotune, video_autotune_strength,
            video_filter, video_format,
            video_reverb, video_reverb_room, video_reverb_damp, video_reverb_wet
        ],
        outputs=[video_output, video_status]
    )
# ── Launch ────────────────────────────────────────────────────────────────────
# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Allow up to 5 concurrent jobs in the Gradio queue.
    demo.queue(default_concurrency_limit=5)
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (required on Spaces)
        server_port=int(os.getenv("PORT", 7860)),  # Spaces injects PORT
        max_threads=10,
        ssr_mode=False,
        # NOTE(review): css is conventionally a gr.Blocks(...) constructor
        # argument, not a launch() one — confirm against the pinned gradio version.
        css=CSS,
    )