# RVC-CH / app.py — HuggingFace Space entry point
"""
RVC Voice Conversion – HuggingFace Space
Simple, fast, GPU/CPU auto-detected.
Now with video upload and 5-output generation (acapella, instrumental, RVC on original, RVC on acapella).
Original job queue and all tabs fully preserved.
"""
from __future__ import annotations
import os
import re
import gradio as gr
from lib.config import (
BUILTIN_MODELS,
CSS,
DEVICE_LABEL,
MAX_INPUT_DURATION,
logger,
)
from lib.jobs import (
get_jobs_table,
get_queue_info,
poll_job,
submit_job,
submit_full_pipeline,
)
from lib.models import list_models, startup_downloads
from lib.ui import (
refresh_models,
toggle_autotune,
upload_model,
create_video_section,
create_five_outputs,
)
# ── Startup ───────────────────────────────────────────────────────────────────
# Pre-download built-in assets. Failures are non-fatal: the UI still comes up,
# with a warning shown in the status banner instead of the ready message.
startup_status = ""
default_model = ""
try:
    default_model = startup_downloads()
    startup_status = f"✅ Ready  ·  {DEVICE_LABEL}"
except Exception as exc:
    startup_status = f"⚠️ Some assets unavailable: {exc}  ·  {DEVICE_LABEL}"
    logger.warning("Startup download issue: %s", exc)

initial_models = list_models()
# Prefer the freshly downloaded default model; otherwise fall back to the first
# installed model, or None when no models are available at all.
if default_model in initial_models:
    initial_value = default_model
elif initial_models:
    initial_value = initial_models[0]
else:
    initial_value = None
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# NOTE: custom CSS must be passed to the gr.Blocks constructor — Blocks.launch()
# does not accept a `css` keyword, so passing it there never applied the styles.
with gr.Blocks(
    title="RVC Voice Conversion - Full Suite",
    delete_cache=(3600, 3600),
    css=CSS,
) as demo:
    gr.HTML(f"""
    <div id="header">
        <h1>🎙️ RVC Voice Conversion - Full Suite</h1>
        <p>Conversão de voz com suporte a vídeos, extração de acapella/instrumental e 5 saídas!</p>
    </div>
    <p id="status">{startup_status}</p>
    """)
    with gr.Tabs():
        # ── TAB 1: Convert ────────────────────────────────────────────────────
        with gr.Tab("🎤 Convert"):
            gr.Markdown("## Opção 1: Conversão RVC clássica (um arquivo de saída)")
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 🔊 Input Audio")
                    with gr.Tabs():
                        with gr.Tab("🎙️ Microphone"):
                            inp_mic = gr.Audio(
                                sources=["microphone"],
                                type="filepath",
                                label="Record",
                            )
                        with gr.Tab("📁 Upload File"):
                            inp_file = gr.Audio(
                                sources=["upload"],
                                type="filepath",
                                label="Upload audio (wav / mp3 / flac / ogg …)",
                            )
                    gr.Markdown("### 🤖 Model")
                    model_dd = gr.Dropdown(
                        choices=initial_models,
                        value=initial_value,
                        label="Active Voice Model",
                        interactive=True,
                    )
                    gr.Markdown("### 🎚️ Basic Settings")
                    pitch_sl = gr.Slider(
                        minimum=-24, maximum=24, value=0, step=1,
                        label="Pitch Shift (semitones)",
                        info="0 = unchanged · positive = higher · negative = lower",
                    )
                    f0_radio = gr.Radio(
                        choices=["rmvpe", "fcpe", "crepe", "crepe-tiny"],
                        value="rmvpe",
                        label="Pitch Extraction Method",
                        info="rmvpe = fastest & accurate · crepe = highest quality (slower)",
                    )
                with gr.Column(scale=1):
                    gr.Markdown("### ⚙️ Advanced Settings")
                    with gr.Accordion("Expand advanced options", open=False):
                        index_rate_sl = gr.Slider(
                            0.0, 1.0, value=0.75, step=0.05,
                            label="Index Rate",
                            info="How strongly the FAISS index influences timbre (0 = off)",
                        )
                        protect_sl = gr.Slider(
                            0.0, 0.5, value=0.5, step=0.01,
                            label="Protect Consonants",
                            info="Protects unvoiced consonants — 0.5 = max protection",
                        )
                        filter_radius_sl = gr.Slider(
                            0, 7, value=3, step=1,
                            label="Respiration Filter Radius",
                            info="Median filter on pitch — higher = smoother, reduces breath noise",
                        )
                        vol_env_sl = gr.Slider(
                            0.0, 1.0, value=0.25, step=0.05,
                            label="Volume Envelope Mix",
                            info="0.25 = natural blend · 1 = preserve input loudness · 0 = model output",
                        )
                        with gr.Row():
                            clean_cb = gr.Checkbox(value=False, label="Noise Reduction")
                            clean_sl = gr.Slider(
                                0.0, 1.0, value=0.5, step=0.05,
                                label="Reduction Strength",
                            )
                        with gr.Row():
                            split_cb = gr.Checkbox(value=False, label="Split Long Audio")
                            autotune_cb = gr.Checkbox(value=False, label="Autotune")
                        autotune_sl = gr.Slider(
                            0.0, 1.0, value=1.0, step=0.05,
                            label="Autotune Strength",
                            visible=False,
                        )
                        # Reveal the strength slider only while autotune is enabled.
                        autotune_cb.change(
                            fn=toggle_autotune,
                            inputs=autotune_cb,
                            outputs=autotune_sl,
                        )
                        gr.Markdown("**🎛️ Reverb**")
                        reverb_cb = gr.Checkbox(value=False, label="Enable Reverb")
                        with gr.Group(visible=False) as reverb_group:
                            reverb_room_sl = gr.Slider(
                                0.0, 1.0, value=0.15, step=0.05,
                                label="Room Size",
                                info="Larger = bigger sounding space",
                            )
                            reverb_damp_sl = gr.Slider(
                                0.0, 1.0, value=0.7, step=0.05,
                                label="Damping",
                                info="Higher = more absorption, less echo tail",
                            )
                            reverb_wet_sl = gr.Slider(
                                0.0, 1.0, value=0.15, step=0.05,
                                label="Wet Level",
                                info="How much reverb is mixed in (0.15 = subtle)",
                            )
                        # Show/hide the reverb settings group with the checkbox.
                        reverb_cb.change(
                            fn=lambda v: gr.update(visible=v),
                            inputs=reverb_cb,
                            outputs=reverb_group,
                        )
                    fmt_radio = gr.Radio(
                        choices=["WAV", "MP3", "FLAC", "OPUS"],
                        value="WAV",
                        label="Output Format",
                        info="WAV = lossless, large file; MP3/OPUS = smaller",
                    )
            with gr.Row():
                classic_btn = gr.Button("🚀 Convert Voice (Classic)", variant="primary")
            classic_status = gr.Markdown(value="")
            classic_audio = gr.Audio(label="Converted Audio", type="filepath", interactive=False)
            gr.Markdown("---")
            gr.Markdown("## Opção 2: Pipeline completo – 5 saídas (vídeo + separação de faixas + RVC)")
            with gr.Row():
                with gr.Column(scale=1):
                    video_input = create_video_section()
                    gr.Markdown("*(Ou use áudio/microfone acima – o pipeline aceita qualquer fonte)*")
                with gr.Column(scale=1):
                    full_btn = gr.Button("🚀 Full Pipeline (5 Outputs)", variant="secondary")
                    full_status = gr.Markdown(value="")
            # The five pipeline output components: extracted vocals, original
            # audio, instrumental, RVC on original, and RVC on vocals only.
            (entrada_acapella, entrada_audio, entrada_instrumental,
             saida_audio, saida_acapella) = create_five_outputs()
            # Legacy job-status lookup section (classic mode jobs).
            gr.Markdown("---")
            gr.Markdown("### 🔍 Verificar status de um job (clássico)")
            with gr.Row():
                job_id_box = gr.Textbox(
                    label="Job ID",
                    placeholder="e.g. a3f2b1c9",
                    scale=3,
                )
                poll_btn = gr.Button("🔄 Check", scale=1)
            poll_status = gr.Markdown(value="")
            poll_audio = gr.Audio(label="Result", type="filepath", interactive=False)
        # ── TAB 2: Models (original) ──────────────────────────────────────────
        with gr.Tab("📦 Models"):
            gr.Markdown("""
            ### Upload a Custom RVC Model
            Provide a **`.zip`** containing:
            - **`model.pth`** — weights (required)
            - **`model.index`** — FAISS index (optional, improves voice matching)
            **Built-in models** (pre-downloaded on startup):
            Vestia Zeta v1 · Vestia Zeta v2 · Ayunda Risu · Gawr Gura
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    up_zip = gr.File(label="Model ZIP", file_types=[".zip"])
                    up_name = gr.Textbox(
                        label="Model Name",
                        placeholder="Leave blank to use zip filename",
                    )
                    up_btn = gr.Button("📤 Load Model", variant="primary")
                    up_status = gr.Textbox(label="Status", interactive=False, lines=2)
                with gr.Column(scale=1):
                    gr.Markdown("### Loaded Models")
                    models_table = gr.Dataframe(
                        col_count=(1, "fixed"),
                        value=[[m] for m in initial_models],
                        interactive=False,
                        label="",
                    )
                    refresh_btn = gr.Button("🔄 Refresh")
            # Uploading a model also refreshes the dropdown and the table.
            up_btn.click(
                fn=upload_model,
                inputs=[up_zip, up_name],
                outputs=[up_status, model_dd, models_table],
            )
            refresh_btn.click(
                fn=refresh_models,
                outputs=[models_table, model_dd],
            )
        # ── TAB 3: Jobs (original, untouched) ─────────────────────────────────
        with gr.Tab("📋 Jobs"):
            gr.Markdown("All submitted jobs, newest first. Click **Refresh** to update.")
            # Callable values + every=10 make these widgets self-refresh.
            queue_status = gr.Markdown(value=get_queue_info, every=10)
            jobs_table = gr.Dataframe(
                headers=["Job ID", "Model", "Status", "Time", "Download"],
                col_count=(5, "fixed"),
                value=get_jobs_table,
                interactive=False,
                wrap=True,
                datatype=["str", "str", "str", "str", "markdown"],
                every=10,
            )
            refresh_jobs_btn = gr.Button("🔄 Refresh")

            def _refresh_jobs():
                """Return a fresh queue summary and jobs table for manual refresh."""
                return get_queue_info(), get_jobs_table()

            refresh_jobs_btn.click(fn=_refresh_jobs, outputs=[queue_status, jobs_table])
        # ── TAB 4: Help (updated) ─────────────────────────────────────────────
        with gr.Tab("ℹ️ Help"):
            gr.Markdown(f"""
            ## How it works
            RVC (Retrieval-Based Voice Conversion) transforms a voice recording to sound
            like a target speaker using only that speaker's model file.
            ---
            ## Two conversion modes
            ### 1. Classic Mode (single output)
            - Upload an audio file or record your voice
            - Choose a model and settings
            - Click **Convert Voice (Classic)**
            - Result is added to the job queue – you can monitor it in the **Jobs** tab
            ### 2. Full Pipeline Mode (5 outputs)
            - Works with **video (MP4)** or any audio source
            - Automatically extracts **acapella** (vocals) and **instrumental** using Demucs
            - Runs RVC conversion on **both** the original audio and the acapella
            - Returns 5 files:
            1. `entrada_acapella.mp3` – extracted vocals
            2. `entrada.mp3` – original audio
            3. `entrada_instrumental.mp3` – background music
            4. `saida.mp3` – RVC applied to original audio
            5. `saida_acapella.mp3` – RVC applied only to vocals
            - All 5 files appear directly on the interface – no need to poll jobs
            ---
            **Device:** `{DEVICE_LABEL}`
            **Max input duration:** {MAX_INPUT_DURATION // 60} minutes
            ---
            ## Credits
            Engine: [Ultimate RVC](https://github.com/JackismyShephard/ultimate-rvc)
            Separation: [Demucs](https://github.com/facebookresearch/demucs)
            """)
    # ── Event handlers ────────────────────────────────────────────────────────

    def _submit_classic(*args):
        """Submit a classic conversion job and refresh the queue widgets.

        The 8-hex-character job id is scraped from the status message so the
        legacy "check a job" textbox is pre-filled for the user.
        """
        status, audio = submit_job(*args)
        match = re.search(r"[a-f0-9]{8}", status or "")
        job_id = match.group(0) if match else ""
        return status, audio, job_id, get_queue_info(), get_jobs_table()

    classic_btn.click(
        fn=_submit_classic,
        inputs=[
            inp_mic, inp_file, model_dd,
            pitch_sl, f0_radio,
            index_rate_sl, protect_sl, vol_env_sl,
            clean_cb, clean_sl,
            split_cb, autotune_cb, autotune_sl,
            filter_radius_sl,
            fmt_radio,
            reverb_cb, reverb_room_sl, reverb_damp_sl, reverb_wet_sl,
        ],
        outputs=[classic_status, classic_audio, job_id_box, queue_status, jobs_table],
    )
    full_btn.click(
        fn=submit_full_pipeline,
        inputs=[
            video_input, inp_mic, inp_file, model_dd,
            pitch_sl, f0_radio,
            index_rate_sl, protect_sl, vol_env_sl,
            clean_cb, clean_sl,
            split_cb, autotune_cb, autotune_sl,
            filter_radius_sl,
            fmt_radio,
            reverb_cb, reverb_room_sl, reverb_damp_sl, reverb_wet_sl,
        ],
        outputs=[
            full_status,
            entrada_acapella, entrada_audio, entrada_instrumental,
            saida_audio, saida_acapella,
        ],
    )

    def _poll_and_refresh(job_id):
        """Poll one job by id, then refresh the queue summary and jobs table."""
        status, file = poll_job(job_id)
        return status, file, get_queue_info(), get_jobs_table()

    poll_btn.click(
        fn=_poll_and_refresh,
        inputs=[job_id_box],
        outputs=[poll_status, poll_audio, queue_status, jobs_table],
    )
# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Up to 5 jobs processed concurrently from the Gradio queue.
    demo.queue(default_concurrency_limit=5)
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces (required on HF Spaces)
        server_port=int(os.getenv("PORT", "7860")),
        max_threads=10,
        ssr_mode=False,
        # NOTE: `css` removed — Blocks.launch() has no `css` parameter (it
        # raises TypeError on Gradio 4/5); custom CSS belongs to the
        # gr.Blocks(css=...) constructor instead.
    )