""" RVC Voice Conversion – HuggingFace Space Simple, fast, GPU/CPU auto-detected. """ from __future__ import annotations import os import subprocess import tempfile import shutil from pathlib import Path import gradio as gr import numpy as np from lib.config import ( BUILTIN_MODELS, CSS, DEVICE_LABEL, MAX_INPUT_DURATION, logger, ) from lib.jobs import ( get_jobs_table, get_queue_info, poll_job, submit_job, ) from lib.models import list_models, startup_downloads from lib.ui import refresh_models, toggle_autotune, upload_model # ── Startup ─────────────────────────────────────────────────────────────────── startup_status = "" default_model = "" try: default_model = startup_downloads() startup_status = f"✅ Ready · {DEVICE_LABEL}" except Exception as e: startup_status = f"⚠️ Some assets unavailable: {e} · {DEVICE_LABEL}" logger.warning("Startup download issue: %s", e) initial_models = list_models() initial_value = default_model if default_model in initial_models else ( initial_models[0] if initial_models else None ) # ── Função para processar vídeo ─────────────────────────────────────────────── def process_video(video_file, model, pitch, f0_method, index_rate, protect, vol_env, clean, clean_strength, split, autotune, autotune_strength, filter_radius, output_format, reverb, reverb_room, reverb_damp, reverb_wet): """Processa vídeo e retorna ZIP com 5 outputs.""" if video_file is None: return None, "No video file provided" # Criar diretório temporário output_dir = Path(tempfile.mkdtemp()) # Extrair áudio audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name cmd = ["ffmpeg", "-i", video_file, "-q:a", "0", "-map", "a", "-y", audio_path] subprocess.run(cmd, check=True, capture_output=True) # Salvar entrada.wav entrada_wav = output_dir / "entrada.wav" shutil.copy2(audio_path, entrada_wav) # Criar versões (simplificado) entrada_acapella = output_dir / "entrada_acapella.wav" entrada_instrumental = output_dir / "entrada_instrumental.wav" 
shutil.copy2(audio_path, entrada_acapella) import soundfile as sf data, sr = sf.read(audio_path) silent = np.zeros_like(data) sf.write(entrada_instrumental, silent, sr) # Converter com RVC status, converted = submit_job( None, str(entrada_acapella), model, pitch, f0_method, index_rate, protect, vol_env, clean, clean_strength, split, autotune, autotune_strength, filter_radius, output_format, reverb, reverb_room, reverb_damp, reverb_wet ) if not converted: return None, f"Conversion failed: {status}" # Salvar outputs saida_acapella = output_dir / "saida_acapella.wav" shutil.copy2(converted, saida_acapella) # Mixar rvc_audio, rvc_sr = sf.read(saida_acapella) inst_audio, inst_sr = sf.read(entrada_instrumental) if rvc_sr != inst_sr: from scipy import signal inst_audio = signal.resample(inst_audio, int(len(inst_audio) * rvc_sr / inst_sr)) min_len = min(len(rvc_audio), len(inst_audio)) mixed = rvc_audio[:min_len] + inst_audio[:min_len] mixed = mixed * (0.95 / max(np.abs(mixed))) if max(np.abs(mixed)) > 0.95 else mixed saida_wav = output_dir / "saida.wav" sf.write(saida_wav, mixed, rvc_sr) # Criar ZIP zip_path = output_dir / "outputs.zip" import zipfile with zipfile.ZipFile(zip_path, 'w') as z: z.write(saida_acapella, "saida_acapella.wav") z.write(saida_wav, "saida.wav") z.write(entrada_acapella, "entrada_acapella.wav") z.write(entrada_wav, "entrada.wav") z.write(entrada_instrumental, "entrada_instrumental.wav") return str(zip_path), "✅ Conversion complete! ZIP with 5 files ready." 
# ── Full model refresh ────────────────────────────────────────────────────────
def refresh_all_models():
    """Refresh the models table and every model dropdown (Convert + Video)."""
    models = list_models()
    models_table_data = [[m] for m in models]
    first_model = models[0] if models else None
    return (
        models_table_data,
        gr.Dropdown(choices=models, value=first_model),
        gr.Dropdown(choices=models, value=first_model),
    )


# ── Gradio UI ─────────────────────────────────────────────────────────────────
# NOTE: custom CSS must go on the Blocks constructor — Blocks.launch() has no
# `css` parameter, so passing it there raises TypeError.
with gr.Blocks(
    title="RVC Voice Conversion", delete_cache=(3600, 3600), css=CSS
) as demo:
    gr.HTML(f"""
Retrieval-Based Voice Conversion · record or upload · custom models · GPU/CPU auto
{startup_status}
""")
    with gr.Tabs():
        # ── TAB 1: Convert ────────────────────────────────────────────────
        with gr.Tab("🎤 Convert"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 🔊 Input Audio")
                    with gr.Tabs():
                        with gr.Tab("🎙️ Microphone"):
                            inp_mic = gr.Audio(
                                sources=["microphone"],
                                type="filepath",
                                label="Record",
                            )
                        with gr.Tab("📁 Upload File"):
                            inp_file = gr.Audio(
                                sources=["upload"],
                                type="filepath",
                                label="Upload audio (wav / mp3 / flac / ogg …)",
                            )
                    gr.Markdown("### 🤖 Model")
                    model_dd = gr.Dropdown(
                        choices=initial_models,
                        value=initial_value,
                        label="Active Voice Model",
                        interactive=True,
                    )
                    gr.Markdown("### 🎚️ Basic Settings")
                    pitch_sl = gr.Slider(
                        minimum=-24, maximum=24, value=0, step=1,
                        label="Pitch Shift (semitones)",
                        info="0 = unchanged · positive = higher · negative = lower",
                    )
                    f0_radio = gr.Radio(
                        choices=["rmvpe", "fcpe", "crepe", "crepe-tiny"],
                        value="rmvpe",
                        label="Pitch Extraction Method",
                        info="rmvpe = fastest & accurate · crepe = highest quality (slower)",
                    )
                with gr.Column(scale=1):
                    gr.Markdown("### ⚙️ Advanced Settings")
                    with gr.Accordion("Expand advanced options", open=False):
                        index_rate_sl = gr.Slider(
                            0.0, 1.0, value=0.75, step=0.05,
                            label="Index Rate",
                            info="How strongly the FAISS index influences timbre (0 = off)",
                        )
                        protect_sl = gr.Slider(
                            0.0, 0.5, value=0.5, step=0.01,
                            label="Protect Consonants",
                            info="Protects unvoiced consonants — 0.5 = max protection",
                        )
                        filter_radius_sl = gr.Slider(
                            0, 7, value=3, step=1,
                            label="Respiration Filter Radius",
                            info="Median filter on pitch — higher = smoother, reduces breath noise",
                        )
                        vol_env_sl = gr.Slider(
                            0.0, 1.0, value=0.25, step=0.05,
                            label="Volume Envelope Mix",
                            info="0.25 = natural blend · 1 = preserve input loudness · 0 = model output",
                        )
                        with gr.Row():
                            clean_cb = gr.Checkbox(value=False, label="Noise Reduction")
                            clean_sl = gr.Slider(
                                0.0, 1.0, value=0.5, step=0.05,
                                label="Reduction Strength",
                            )
                        with gr.Row():
                            split_cb = gr.Checkbox(value=False, label="Split Long Audio")
                            autotune_cb = gr.Checkbox(value=False, label="Autotune")
                        # Hidden until the Autotune checkbox is ticked.
                        autotune_sl = gr.Slider(
                            0.0, 1.0, value=1.0, step=0.05,
                            label="Autotune Strength",
                            visible=False,
                        )
                        autotune_cb.change(
                            fn=toggle_autotune,
                            inputs=autotune_cb,
                            outputs=autotune_sl,
                        )
                        gr.Markdown("**🎛️ Reverb**")
                        reverb_cb = gr.Checkbox(value=False, label="Enable Reverb")
                        with gr.Group(visible=False) as reverb_group:
                            reverb_room_sl = gr.Slider(
                                0.0, 1.0, value=0.15, step=0.05,
                                label="Room Size",
                                info="Larger = bigger sounding space",
                            )
                            reverb_damp_sl = gr.Slider(
                                0.0, 1.0, value=0.7, step=0.05,
                                label="Damping",
                                info="Higher = more absorption, less echo tail",
                            )
                            reverb_wet_sl = gr.Slider(
                                0.0, 1.0, value=0.15, step=0.05,
                                label="Wet Level",
                                info="How much reverb is mixed in (0.15 = subtle)",
                            )
                        reverb_cb.change(
                            fn=lambda v: gr.update(visible=v),
                            inputs=reverb_cb,
                            outputs=reverb_group,
                        )
                    fmt_radio = gr.Radio(
                        choices=["WAV", "MP3", "FLAC", "OPUS"],
                        value="WAV",
                        label="Output Format",
                        info="OPUS = small file (~64 kbps, Telegram/Discord quality)",
                    )
                    convert_btn = gr.Button(
                        "🚀 Convert Voice",
                        variant="primary",
                    )
                    gr.Markdown("### 🎧 Output")
                    out_status = gr.Markdown(value="")
                    out_audio = gr.Audio(
                        label="Result (if still on page)",
                        type="filepath",
                        interactive=False,
                    )
                    gr.Markdown("#### 🔍 Check Job Status")
                    with gr.Row():
                        job_id_box = gr.Textbox(
                            label="Job ID",
                            placeholder="e.g. a3f2b1c9",
                            scale=3,
                        )
                        poll_btn = gr.Button("🔄 Check", scale=1)
                    poll_status = gr.Markdown(value="")
                    poll_audio = gr.Audio(label="Result", type="filepath", interactive=False)

        # ── TAB 2: Video Convert ──────────────────────────────────────────
        with gr.Tab("🎬 Video Convert"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 🎥 Input Video")
                    video_file = gr.Video(label="Upload MP4 Video", sources=["upload"], format="mp4")
                    gr.Markdown("### 🤖 Model")
                    video_model = gr.Dropdown(
                        choices=initial_models,
                        value=initial_value,
                        label="Active Voice Model",
                        interactive=True,
                    )
                    gr.Markdown("### 🎚️ Basic Settings")
                    video_pitch = gr.Slider(-24, 24, value=0, step=1, label="Pitch Shift")
                    video_f0 = gr.Radio(["rmvpe", "fcpe", "crepe", "crepe-tiny"], value="rmvpe", label="Pitch Method")
                with gr.Column(scale=1):
                    gr.Markdown("### ⚙️ Advanced Settings")
                    with gr.Accordion("Expand", open=False):
                        video_index = gr.Slider(0.0, 1.0, value=0.75, step=0.05, label="Index Rate")
                        video_protect = gr.Slider(0.0, 0.5, value=0.5, step=0.01, label="Protect Consonants")
                        video_filter = gr.Slider(0, 7, value=3, step=1, label="Filter Radius")
                        video_vol = gr.Slider(0.0, 1.0, value=0.25, step=0.05, label="Volume Envelope")
                        video_clean = gr.Checkbox(value=False, label="Noise Reduction")
                        video_clean_strength = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Clean Strength")
                        video_split = gr.Checkbox(value=False, label="Split Long Audio")
                        video_autotune = gr.Checkbox(value=False, label="Autotune")
                        video_autotune_strength = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Autotune Strength")
                        gr.Markdown("**🎛️ Reverb**")
                        video_reverb = gr.Checkbox(value=False, label="Enable Reverb")
                        video_reverb_room = gr.Slider(0.0, 1.0, value=0.15, step=0.05, label="Room Size")
                        video_reverb_damp = gr.Slider(0.0, 1.0, value=0.7, step=0.05, label="Damping")
                        video_reverb_wet = gr.Slider(0.0, 1.0, value=0.15, step=0.05, label="Wet Level")
                    video_format = gr.Radio(["WAV", "MP3", "FLAC", "OPUS"], value="WAV", label="Output Format")
                    video_convert_btn = gr.Button("🎬 Convert Video", variant="primary")
                    gr.Markdown("### 📦 Output")
                    video_status = gr.Markdown(value="")
                    video_output = gr.File(label="Download ZIP (5 audio files)", type="filepath")

        # ── TAB 3: Models ─────────────────────────────────────────────────
        with gr.Tab("📦 Models"):
            gr.Markdown("""
### Upload a Custom RVC Model

Provide a **`.zip`** containing:

- **`model.pth`** — weights (required)
- **`model.index`** — FAISS index (optional, improves voice matching)

**Built-in models** (pre-downloaded on startup):
Vestia Zeta v1 · Vestia Zeta v2 · Ayunda Risu · Gawr Gura
""")
            with gr.Row():
                with gr.Column(scale=1):
                    up_zip = gr.File(label="Model ZIP", file_types=[".zip"])
                    up_name = gr.Textbox(
                        label="Model Name",
                        placeholder="Leave blank to use zip filename",
                    )
                    up_btn = gr.Button("📤 Load Model", variant="primary")
                    up_status = gr.Textbox(label="Status", interactive=False, lines=2)
                with gr.Column(scale=1):
                    gr.Markdown("### Loaded Models")
                    models_table = gr.Dataframe(
                        col_count=(1, "fixed"),
                        value=[[m] for m in initial_models],
                        interactive=False,
                        label="",
                    )
                    refresh_btn = gr.Button("🔄 Refresh")
            up_btn.click(
                fn=upload_model,
                inputs=[up_zip, up_name],
                outputs=[up_status, model_dd, models_table],
            )
            # Refresh updates every model dropdown (including video_model).
            refresh_btn.click(
                fn=refresh_all_models,
                outputs=[models_table, model_dd, video_model],
            )

        # ── TAB 4: Jobs ───────────────────────────────────────────────────
        with gr.Tab("📋 Jobs"):
            gr.Markdown("All submitted jobs, newest first. Click **Refresh** to update.")
            # `every=10` polls the callables every 10 s while the tab is open.
            queue_status = gr.Markdown(value=get_queue_info, every=10)
            jobs_table = gr.Dataframe(
                headers=["Job ID", "Model", "Status", "Time", "Download"],
                col_count=(5, "fixed"),
                value=get_jobs_table,
                interactive=False,
                wrap=True,
                datatype=["str", "str", "str", "str", "markdown"],
                every=10,
            )
            refresh_jobs_btn = gr.Button("🔄 Refresh")

            def _refresh_jobs():
                """Return fresh queue info and jobs-table data."""
                return get_queue_info(), get_jobs_table()

            refresh_jobs_btn.click(fn=_refresh_jobs, outputs=[queue_status, jobs_table])

        # ── TAB 5: Help ───────────────────────────────────────────────────
        with gr.Tab("ℹ️ Help"):
            gr.Markdown(f"""
## How it works

RVC (Retrieval-Based Voice Conversion) transforms a voice recording to sound like a target speaker using only that speaker's model file.

---

## Quick Guide

1. Open the **Convert** tab
2. **Record** via microphone or **upload** an audio file (wav, mp3, flac, ogg …)
3. Choose a **model** from the dropdown — 4 models are pre-loaded on startup
4. Set **Pitch Shift** if needed (e.g. male → female: try +12 semitones)
5. Click **🚀 Convert Voice** and wait for the result

---

## Built-in Models

| Model | Description |
|---|---|
| **Vestia Zeta v1** | Hololive ID VTuber, v1 model |
| **Vestia Zeta v2** | Hololive ID VTuber, v2 model (recommended) |
| **Ayunda Risu** | Hololive ID VTuber |
| **Gawr Gura** | Hololive EN VTuber |

---

## Pitch Extraction Methods

| Method | Speed | Quality | Best for |
|---|---|---|---|
| **rmvpe** | ⚡⚡⚡ | ★★★★ | General use (default) |
| **fcpe** | ⚡⚡ | ★★★★ | Singing |
| **crepe** | ⚡ | ★★★★★ | Highest quality, slow |
| **crepe-tiny** | ⚡⚡ | ★★★ | Low resource |

---

## Advanced Settings

| Setting | Description |
|---|---|
| **Index Rate** | Influence of FAISS index on output timbre (0.75 recommended) |
| **Protect Consonants** | Prevents artefacts on consonants (0.5 = max) |
| **Respiration Filter Radius** | Smooths pitch curve — higher reduces breath noise (0–7, default 3) |
| **Volume Envelope Mix** | 0.25 = natural blend · 1 = preserve input loudness |
| **Noise Reduction** | Removes background noise before conversion |
| **Split Long Audio** | Chunks audio for recordings > 60 s |
| **Autotune** | Snaps pitch to nearest musical note |

---

## Output Formats

| Format | Size | Quality |
|---|---|---|
| **WAV** | Large | Lossless |
| **FLAC** | Medium | Lossless compressed |
| **MP3** | Small | Lossy |
| **OPUS** | Tiny (~64 kbps) | Telegram/Discord quality |

---

**Device:** `{DEVICE_LABEL}`
**Max input duration:** {MAX_INPUT_DURATION // 60} minutes

---

## Credits

Engine: [Ultimate RVC](https://github.com/JackismyShephard/ultimate-rvc)
""")

    # Wire the convert button after all tabs exist: its outputs include
    # components from the Jobs tab (queue_status, jobs_table).
    def _submit_and_extract_id(*args):
        """Submit a job and extract the 8-char hex job id from the status text."""
        import re
        status, audio = submit_job(*args)
        match = re.search(r"[a-f0-9]{8}", status or "")
        job_id = match.group(0) if match else ""
        return status, audio, job_id, get_queue_info(), get_jobs_table()

    convert_btn.click(
        fn=_submit_and_extract_id,
        inputs=[
            inp_mic, inp_file, model_dd, pitch_sl, f0_radio,
            index_rate_sl, protect_sl, vol_env_sl, clean_cb, clean_sl,
            split_cb, autotune_cb, autotune_sl, filter_radius_sl, fmt_radio,
            reverb_cb, reverb_room_sl, reverb_damp_sl, reverb_wet_sl,
        ],
        outputs=[out_status, out_audio, job_id_box, queue_status, jobs_table],
    )

    def _poll_and_refresh(job_id):
        """Poll one job by id and refresh the queue overview."""
        status, file = poll_job(job_id)
        return status, file, get_queue_info(), get_jobs_table()

    poll_btn.click(
        fn=_poll_and_refresh,
        inputs=[job_id_box],
        outputs=[poll_status, poll_audio, queue_status, jobs_table],
    )

    # Video convert: argument order must match the process_video signature.
    video_convert_btn.click(
        fn=process_video,
        inputs=[
            video_file, video_model, video_pitch, video_f0, video_index,
            video_protect, video_vol, video_clean, video_clean_strength,
            video_split, video_autotune, video_autotune_strength,
            video_filter, video_format, video_reverb, video_reverb_room,
            video_reverb_damp, video_reverb_wet,
        ],
        outputs=[video_output, video_status],
    )

# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.queue(default_concurrency_limit=5)
    demo.launch(
        server_name="0.0.0.0",
        # os.getenv default is a string for type consistency with env values.
        server_port=int(os.getenv("PORT", "7860")),
        max_threads=10,
        ssr_mode=False,
    )