Spaces:
Running
Running
| """ | |
| RVC Voice Conversion β HuggingFace Space | |
| Simple, fast, GPU/CPU auto-detected. | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import subprocess | |
| import tempfile | |
| import shutil | |
| from pathlib import Path | |
| import gradio as gr | |
| import numpy as np | |
| from lib.config import ( | |
| BUILTIN_MODELS, | |
| CSS, | |
| DEVICE_LABEL, | |
| MAX_INPUT_DURATION, | |
| logger, | |
| ) | |
| from lib.jobs import ( | |
| get_jobs_table, | |
| get_queue_info, | |
| poll_job, | |
| submit_job, | |
| ) | |
| from lib.models import list_models, startup_downloads | |
| from lib.ui import refresh_models, toggle_autotune, upload_model | |
# ── Startup ───────────────────────────────────────────────────────────────────
# Pre-download built-in assets. The UI still comes up if this fails; the
# header banner then shows a warning instead of the ready message.
default_model = ""
try:
    default_model = startup_downloads()
except Exception as e:
    startup_status = f"β οΈ Some assets unavailable: {e} Β· {DEVICE_LABEL}"
    logger.warning("Startup download issue: %s", e)
else:
    startup_status = f"β Ready Β· {DEVICE_LABEL}"

# Default dropdown entry: the downloaded default if it is known, otherwise the
# first available model, otherwise nothing.
initial_models = list_models()
if default_model in initial_models:
    initial_value = default_model
elif initial_models:
    initial_value = initial_models[0]
else:
    initial_value = None
# ── Video processing ──────────────────────────────────────────────────────────
def process_video(video_file, model, pitch, f0_method, index_rate, protect, vol_env,
                  clean, clean_strength, split, autotune, autotune_strength,
                  filter_radius, output_format, reverb, reverb_room, reverb_damp, reverb_wet):
    """Convert the voice in a video's audio track and bundle results in a ZIP.

    Pipeline: extract the audio with ffmpeg, create placeholder stems (the
    "acapella" is the full mix and the "instrumental" is silence of the same
    shape), run RVC conversion on the acapella via submit_job, mix the
    converted vocals with the instrumental, and zip all five audio files.

    Returns:
        tuple: (path to outputs.zip, success message) on success, or
        (None, error message) on failure.
    """
    if video_file is None:
        return None, "No video file provided"

    import soundfile as sf
    import zipfile

    # Working directory for all intermediate and final files.
    output_dir = Path(tempfile.mkdtemp())

    # Extract the audio track into a throwaway WAV.
    audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
    try:
        cmd = ["ffmpeg", "-i", video_file, "-q:a", "0", "-map", "a", "-y", audio_path]
        try:
            subprocess.run(cmd, check=True, capture_output=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as e:
            # No audio stream, corrupt file, or missing ffmpeg: report it to
            # the user instead of raising a traceback into the UI.
            return None, f"Audio extraction failed: {e}"

        entrada_wav = output_dir / "entrada.wav"
        shutil.copy2(audio_path, entrada_wav)

        # Simplified "separation": acapella = full mix, instrumental = silence.
        entrada_acapella = output_dir / "entrada_acapella.wav"
        entrada_instrumental = output_dir / "entrada_instrumental.wav"
        shutil.copy2(audio_path, entrada_acapella)
        data, sr = sf.read(audio_path)
        sf.write(entrada_instrumental, np.zeros_like(data), sr)
    finally:
        # NamedTemporaryFile(delete=False) is never auto-removed — clean it up
        # on every exit path so repeated conversions do not leak temp files.
        try:
            os.remove(audio_path)
        except OSError:
            pass

    # Convert the acapella stem with RVC.
    status, converted = submit_job(
        None, str(entrada_acapella), model, pitch, f0_method,
        index_rate, protect, vol_env, clean, clean_strength,
        split, autotune, autotune_strength, filter_radius,
        output_format, reverb, reverb_room, reverb_damp, reverb_wet
    )
    if not converted:
        return None, f"Conversion failed: {status}"

    saida_acapella = output_dir / "saida_acapella.wav"
    shutil.copy2(converted, saida_acapella)

    # Mix the converted vocals with the instrumental; resample the
    # instrumental if the sample rates differ.
    rvc_audio, rvc_sr = sf.read(saida_acapella)
    inst_audio, inst_sr = sf.read(entrada_instrumental)
    if rvc_sr != inst_sr:
        from scipy import signal
        inst_audio = signal.resample(inst_audio, int(len(inst_audio) * rvc_sr / inst_sr))
    min_len = min(len(rvc_audio), len(inst_audio))
    mixed = rvc_audio[:min_len] + inst_audio[:min_len]
    # Peak-limit to 0.95. Use np.max/np.abs: the builtin max() raises
    # "truth value is ambiguous" on 2-D (stereo) arrays, and the peak is
    # computed only once instead of twice.
    peak = float(np.max(np.abs(mixed))) if min_len else 0.0
    if peak > 0.95:
        mixed = mixed * (0.95 / peak)
    saida_wav = output_dir / "saida.wav"
    sf.write(saida_wav, mixed, rvc_sr)

    # Bundle all five files for a single download.
    zip_path = output_dir / "outputs.zip"
    with zipfile.ZipFile(zip_path, 'w') as z:
        z.write(saida_acapella, "saida_acapella.wav")
        z.write(saida_wav, "saida.wav")
        z.write(entrada_acapella, "entrada_acapella.wav")
        z.write(entrada_wav, "entrada.wav")
        z.write(entrada_instrumental, "entrada_instrumental.wav")
    return str(zip_path), "β Conversion complete! ZIP with 5 files ready."
# ── Gradio UI ─────────────────────────────────────────────────────────────────
with gr.Blocks(title="RVC Voice Conversion", delete_cache=(3600, 3600)) as demo:
    # Header banner; {startup_status} reflects the asset-download result above.
    gr.HTML(f"""
        <div id="header">
            <h1>ποΈ RVC Voice Conversion</h1>
            <p>Retrieval-Based Voice Conversion Β· record or upload Β· custom models Β· GPU/CPU auto</p>
        </div>
        <p id="status">{startup_status}</p>
    """)
    with gr.Tabs():
        # ── TAB 1: Convert ────────────────────────────────────────────────────
        with gr.Tab("π€ Convert"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### π Input Audio")
                    # Either a microphone recording or an uploaded file feeds
                    # the job; both are passed to submit_job (mic first).
                    with gr.Tabs():
                        with gr.Tab("ποΈ Microphone"):
                            inp_mic = gr.Audio(
                                sources=["microphone"],
                                type="filepath",
                                label="Record",
                            )
                        with gr.Tab("π Upload File"):
                            inp_file = gr.Audio(
                                sources=["upload"],
                                type="filepath",
                                label="Upload audio (wav / mp3 / flac / ogg β¦)",
                            )
                    gr.Markdown("### π€ Model")
                    # Also updated by the Models tab (upload/refresh handlers).
                    model_dd = gr.Dropdown(
                        choices=initial_models,
                        value=initial_value,
                        label="Active Voice Model",
                        interactive=True,
                    )
                    gr.Markdown("### ποΈ Basic Settings")
                    pitch_sl = gr.Slider(
                        minimum=-24, maximum=24, value=0, step=1,
                        label="Pitch Shift (semitones)",
                        info="0 = unchanged Β· positive = higher Β· negative = lower",
                    )
                    f0_radio = gr.Radio(
                        choices=["rmvpe", "fcpe", "crepe", "crepe-tiny"],
                        value="rmvpe",
                        label="Pitch Extraction Method",
                        info="rmvpe = fastest & accurate Β· crepe = highest quality (slower)",
                    )
                with gr.Column(scale=1):
                    gr.Markdown("### βοΈ Advanced Settings")
                    with gr.Accordion("Expand advanced options", open=False):
                        index_rate_sl = gr.Slider(
                            0.0, 1.0, value=0.75, step=0.05,
                            label="Index Rate",
                            info="How strongly the FAISS index influences timbre (0 = off)",
                        )
                        protect_sl = gr.Slider(
                            0.0, 0.5, value=0.5, step=0.01,
                            label="Protect Consonants",
                            info="Protects unvoiced consonants β 0.5 = max protection",
                        )
                        filter_radius_sl = gr.Slider(
                            0, 7, value=3, step=1,
                            label="Respiration Filter Radius",
                            info="Median filter on pitch β higher = smoother, reduces breath noise",
                        )
                        vol_env_sl = gr.Slider(
                            0.0, 1.0, value=0.25, step=0.05,
                            label="Volume Envelope Mix",
                            info="0.25 = natural blend Β· 1 = preserve input loudness Β· 0 = model output",
                        )
                        with gr.Row():
                            clean_cb = gr.Checkbox(value=False, label="Noise Reduction")
                            clean_sl = gr.Slider(
                                0.0, 1.0, value=0.5, step=0.05,
                                label="Reduction Strength",
                            )
                        with gr.Row():
                            split_cb = gr.Checkbox(value=False, label="Split Long Audio")
                            autotune_cb = gr.Checkbox(value=False, label="Autotune")
                        # Strength slider only becomes visible while the
                        # Autotune checkbox is ticked (see toggle_autotune).
                        autotune_sl = gr.Slider(
                            0.0, 1.0, value=1.0, step=0.05,
                            label="Autotune Strength",
                            visible=False,
                        )
                        autotune_cb.change(
                            fn=toggle_autotune,
                            inputs=autotune_cb,
                            outputs=autotune_sl,
                        )
                        gr.Markdown("**ποΈ Reverb**")
                        reverb_cb = gr.Checkbox(value=False, label="Enable Reverb")
                        # Reverb controls are hidden until the checkbox is on.
                        with gr.Group(visible=False) as reverb_group:
                            reverb_room_sl = gr.Slider(
                                0.0, 1.0, value=0.15, step=0.05,
                                label="Room Size",
                                info="Larger = bigger sounding space",
                            )
                            reverb_damp_sl = gr.Slider(
                                0.0, 1.0, value=0.7, step=0.05,
                                label="Damping",
                                info="Higher = more absorption, less echo tail",
                            )
                            reverb_wet_sl = gr.Slider(
                                0.0, 1.0, value=0.15, step=0.05,
                                label="Wet Level",
                                info="How much reverb is mixed in (0.15 = subtle)",
                            )
                        reverb_cb.change(
                            fn=lambda v: gr.update(visible=v),
                            inputs=reverb_cb,
                            outputs=reverb_group,
                        )
                    fmt_radio = gr.Radio(
                        choices=["WAV", "MP3", "FLAC", "OPUS"],
                        value="WAV",
                        label="Output Format",
                        info="OPUS = small file (~64 kbps, Telegram/Discord quality)",
                    )
                    convert_btn = gr.Button(
                        "π Convert Voice",
                        variant="primary",
                    )
                    gr.Markdown("### π§ Output")
                    out_status = gr.Markdown(value="")
                    out_audio = gr.Audio(label="Result (if still on page)", type="filepath", interactive=False)
                    # Manual polling by job id, for jobs submitted earlier
                    # (e.g. after a page reload).
                    gr.Markdown("#### π Check Job Status")
                    with gr.Row():
                        job_id_box = gr.Textbox(
                            label="Job ID",
                            placeholder="e.g. a3f2b1c9",
                            scale=3,
                        )
                        poll_btn = gr.Button("π Check", scale=1)
                    poll_status = gr.Markdown(value="")
                    poll_audio = gr.Audio(label="Result", type="filepath", interactive=False)
        # ── TAB 2: Video Convert ──────────────────────────────────────────────
        with gr.Tab("π¬ Video Convert"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### π₯ Input Video")
                    video_file = gr.Video(label="Upload MP4 Video", sources=["upload"], format="mp4")
                    gr.Markdown("### π€ Model")
                    video_model = gr.Dropdown(
                        choices=initial_models,
                        value=initial_value,
                        label="Active Voice Model",
                        interactive=True,
                    )
                    gr.Markdown("### ποΈ Basic Settings")
                    video_pitch = gr.Slider(-24, 24, value=0, step=1, label="Pitch Shift")
                    video_f0 = gr.Radio(["rmvpe", "fcpe", "crepe", "crepe-tiny"], value="rmvpe", label="Pitch Method")
                with gr.Column(scale=1):
                    # Mirrors the Convert tab's advanced controls; values are
                    # forwarded to process_video in the same order.
                    gr.Markdown("### βοΈ Advanced Settings")
                    with gr.Accordion("Expand", open=False):
                        video_index = gr.Slider(0.0, 1.0, value=0.75, step=0.05, label="Index Rate")
                        video_protect = gr.Slider(0.0, 0.5, value=0.5, step=0.01, label="Protect Consonants")
                        video_filter = gr.Slider(0, 7, value=3, step=1, label="Filter Radius")
                        video_vol = gr.Slider(0.0, 1.0, value=0.25, step=0.05, label="Volume Envelope")
                        video_clean = gr.Checkbox(value=False, label="Noise Reduction")
                        video_clean_strength = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Clean Strength")
                        video_split = gr.Checkbox(value=False, label="Split Long Audio")
                        video_autotune = gr.Checkbox(value=False, label="Autotune")
                        video_autotune_strength = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Autotune Strength")
                        gr.Markdown("**ποΈ Reverb**")
                        video_reverb = gr.Checkbox(value=False, label="Enable Reverb")
                        video_reverb_room = gr.Slider(0.0, 1.0, value=0.15, step=0.05, label="Room Size")
                        video_reverb_damp = gr.Slider(0.0, 1.0, value=0.7, step=0.05, label="Damping")
                        video_reverb_wet = gr.Slider(0.0, 1.0, value=0.15, step=0.05, label="Wet Level")
                    video_format = gr.Radio(["WAV", "MP3", "FLAC", "OPUS"], value="WAV", label="Output Format")
                    video_convert_btn = gr.Button("π¬ Convert Video", variant="primary")
                    gr.Markdown("### π¦ Output")
                    video_status = gr.Markdown(value="")
                    video_output = gr.File(label="Download ZIP (5 audio files)", type="filepath")
        # ── TAB 3: Models ─────────────────────────────────────────────────────
        with gr.Tab("π¦ Models"):
            gr.Markdown("""
### Upload a Custom RVC Model
Provide a **`.zip`** containing:
- **`model.pth`** β weights (required)
- **`model.index`** β FAISS index (optional, improves voice matching)
**Built-in models** (pre-downloaded on startup):
Vestia Zeta v1 Β· Vestia Zeta v2 Β· Ayunda Risu Β· Gawr Gura
            """)
            with gr.Row():
                with gr.Column(scale=1):
                    up_zip = gr.File(label="Model ZIP", file_types=[".zip"])
                    up_name = gr.Textbox(
                        label="Model Name",
                        placeholder="Leave blank to use zip filename",
                    )
                    up_btn = gr.Button("π€ Load Model", variant="primary")
                    up_status = gr.Textbox(label="Status", interactive=False, lines=2)
                with gr.Column(scale=1):
                    gr.Markdown("### Loaded Models")
                    models_table = gr.Dataframe(
                        col_count=(1, "fixed"),
                        value=[[m] for m in initial_models],
                        interactive=False,
                        label="",
                    )
                    refresh_btn = gr.Button("π Refresh")
            # Both handlers also update the Convert tab's model dropdown so a
            # freshly uploaded model is selectable immediately.
            up_btn.click(
                fn=upload_model,
                inputs=[up_zip, up_name],
                outputs=[up_status, model_dd, models_table],
            )
            refresh_btn.click(
                fn=refresh_models,
                outputs=[models_table, model_dd],
            )
        # ── TAB 4: Jobs ───────────────────────────────────────────────────────
        with gr.Tab("π Jobs"):
            gr.Markdown("All submitted jobs, newest first. Click **Refresh** to update.")
            # Callable values with every=10: Gradio re-evaluates them every
            # 10 seconds while the tab is open.
            queue_status = gr.Markdown(value=get_queue_info, every=10)
            jobs_table = gr.Dataframe(
                headers=["Job ID", "Model", "Status", "Time", "Download"],
                col_count=(5, "fixed"),
                value=get_jobs_table,
                interactive=False,
                wrap=True,
                datatype=["str", "str", "str", "str", "markdown"],
                every=10,
            )
            refresh_jobs_btn = gr.Button("π Refresh")
            def _refresh_jobs():
                # Manual refresh: return a fresh queue summary and jobs table.
                return get_queue_info(), get_jobs_table()
            refresh_jobs_btn.click(fn=_refresh_jobs, outputs=[queue_status, jobs_table])
        # ── TAB 5: Help ───────────────────────────────────────────────────────
        with gr.Tab("βΉοΈ Help"):
            # Static documentation; the f-string interpolates the detected
            # device label and the input-duration limit from lib.config.
            gr.Markdown(f"""
## How it works
RVC (Retrieval-Based Voice Conversion) transforms a voice recording to sound
like a target speaker using only that speaker's model file.
---
## Quick Guide
1. Open the **Convert** tab
2. **Record** via microphone or **upload** an audio file (wav, mp3, flac, ogg β¦)
3. Choose a **model** from the dropdown β 4 models are pre-loaded on startup
4. Set **Pitch Shift** if needed (e.g. male β female: try +12 semitones)
5. Click **π Convert Voice** and wait for the result
---
## Built-in Models
| Model | Description |
|---|---|
| **Vestia Zeta v1** | Hololive ID VTuber, v1 model |
| **Vestia Zeta v2** | Hololive ID VTuber, v2 model (recommended) |
| **Ayunda Risu** | Hololive ID VTuber |
| **Gawr Gura** | Hololive EN VTuber |
---
## Pitch Extraction Methods
| Method | Speed | Quality | Best for |
|---|---|---|---|
| **rmvpe** | β‘β‘β‘ | β β β β | General use (default) |
| **fcpe** | β‘β‘ | β β β β | Singing |
| **crepe** | β‘ | β β β β β | Highest quality, slow |
| **crepe-tiny** | β‘β‘ | β β β | Low resource |
---
## Advanced Settings
| Setting | Description |
|---|---|
| **Index Rate** | Influence of FAISS index on output timbre (0.75 recommended) |
| **Protect Consonants** | Prevents artefacts on consonants (0.5 = max) |
| **Respiration Filter Radius** | Smooths pitch curve β higher reduces breath noise (0β7, default 3) |
| **Volume Envelope Mix** | 0.25 = natural blend Β· 1 = preserve input loudness |
| **Noise Reduction** | Removes background noise before conversion |
| **Split Long Audio** | Chunks audio for recordings > 60 s |
| **Autotune** | Snaps pitch to nearest musical note |
---
## Output Formats
| Format | Size | Quality |
|---|---|---|
| **WAV** | Large | Lossless |
| **FLAC** | Medium | Lossless compressed |
| **MP3** | Small | Lossy |
| **OPUS** | Tiny (~64 kbps) | Telegram/Discord quality |
---
**Device:** `{DEVICE_LABEL}`
**Max input duration:** {MAX_INPUT_DURATION // 60} minutes
---
## Credits
Engine: [Ultimate RVC](https://github.com/JackismyShephard/ultimate-rvc)
            """)
| # Wire convert button after all tabs | |
| def _submit_and_extract_id(*args): | |
| import re | |
| status, audio = submit_job(*args) | |
| match = re.search(r"[a-f0-9]{8}", status or "") | |
| job_id = match.group(0) if match else "" | |
| return status, audio, job_id, get_queue_info(), get_jobs_table() | |
    # Convert-tab wiring: input order must match submit_job's parameter order
    # (mic path, file path, model, then the tuning parameters).
    convert_btn.click(
        fn=_submit_and_extract_id,
        inputs=[
            inp_mic, inp_file, model_dd,
            pitch_sl, f0_radio,
            index_rate_sl, protect_sl, vol_env_sl,
            clean_cb, clean_sl,
            split_cb, autotune_cb, autotune_sl,
            filter_radius_sl,
            fmt_radio,
            reverb_cb, reverb_room_sl, reverb_damp_sl, reverb_wet_sl,
        ],
        outputs=[out_status, out_audio, job_id_box, queue_status, jobs_table],
    )
    def _poll_and_refresh(job_id):
        # Look up one job by id and refresh the queue/jobs views at the same time.
        status, file = poll_job(job_id)
        return status, file, get_queue_info(), get_jobs_table()
    poll_btn.click(
        fn=_poll_and_refresh,
        inputs=[job_id_box],
        outputs=[poll_status, poll_audio, queue_status, jobs_table],
    )
    # Video convert
    # Input order must match process_video's signature exactly.
    video_convert_btn.click(
        fn=process_video,
        inputs=[
            video_file, video_model, video_pitch, video_f0,
            video_index, video_protect, video_vol,
            video_clean, video_clean_strength,
            video_split, video_autotune, video_autotune_strength,
            video_filter, video_format,
            video_reverb, video_reverb_room, video_reverb_damp, video_reverb_wet
        ],
        outputs=[video_output, video_status]
    )
# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Up to 5 conversions can be processed concurrently from the shared queue.
    demo.queue(default_concurrency_limit=5)
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (container/Space host)
        server_port=int(os.getenv("PORT", 7860)),
        max_threads=10,
        ssr_mode=False,
        css=CSS,  # NOTE(review): `css` is normally a gr.Blocks() argument, not launch() — verify against the pinned Gradio version
    )