# RVC-CH

# Running

# File size: 16,398 Bytes

"""
RVC Voice Conversion – HuggingFace Space

Simple, fast, GPU/CPU auto-detected.
Now with video upload and 5-output generation (acapella, original audio, instrumental, RVC on original, RVC on acapella).
Original job queue and all tabs fully preserved.
"""
from __future__ import annotations

import html
import os
import re

import gradio as gr

from lib.config import (
    BUILTIN_MODELS,
    CSS,
    DEVICE_LABEL,
    MAX_INPUT_DURATION,
    logger,
)
from lib.jobs import (
    get_jobs_table,
    get_queue_info,
    poll_job,
    submit_job,
    submit_full_pipeline,
)
from lib.models import list_models, startup_downloads
from lib.ui import (
    refresh_models,
    toggle_autotune,
    upload_model,
    create_video_section,
    create_five_outputs,
)

# ── Startup ───────────────────────────────────────────────────────────────────
# Status line shown in the page header; it is interpolated into raw HTML
# further down, which is why it carries HTML entities such as &nbsp;.
startup_status = ""
# Model name returned by startup_downloads(); stays empty if startup fails.
default_model = ""
try:
    default_model = startup_downloads()
    startup_status = f"✅ Ready &nbsp;·&nbsp; {DEVICE_LABEL}"
except Exception as e:
    # Best-effort startup: the UI must still come up when a download fails,
    # so the problem is reported in the header and the log instead of raised.
    # The exception text is HTML-escaped because the status string is rendered
    # as markup; a message such as urllib's "<urlopen error ...>" would
    # otherwise be parsed as a tag and vanish from the page.
    startup_status = (
        f"⚠️ Some assets unavailable: {html.escape(str(e))}"
        f" &nbsp;·&nbsp; {DEVICE_LABEL}"
    )
    logger.warning("Startup download issue: %s", e)

# Pre-select the startup default when it is actually listed; otherwise fall
# back to the first listed model, or to no selection when none are listed.
initial_models = list_models()
initial_value = default_model if default_model in initial_models else (
    initial_models[0] if initial_models else None
)

# ── Gradio UI ─────────────────────────────────────────────────────────────────
with gr.Blocks(title="RVC Voice Conversion - Full Suite", delete_cache=(3600, 3600)) as demo:

    # Page header. startup_status is inserted verbatim into the markup, so
    # whatever the startup code stored in it is rendered as HTML.
    gr.HTML(f"""
    <div id="header">
        <h1>🎙️ RVC Voice Conversion - Full Suite</h1>
        <p>Conversão de voz com suporte a vídeos, extração de acapella/instrumental e 5 saídas!</p>
    </div>
    <p id="status">{startup_status}</p>
    """)

    with gr.Tabs():

        # ── TAB 1: Convert ────────────────────────────────────────────────────
        with gr.Tab("🎤 Convert"):
            # Layout of the Convert tab. Components are created in display
            # order; the event handlers that consume them are wired after the
            # tabs, further down in the file.
            gr.Markdown("## Opção 1: Conversão RVC clássica (um arquivo de saída)")
            with gr.Row():
                # Left column: audio source, model choice and basic settings.
                with gr.Column(scale=1):
                    gr.Markdown("### 🔊 Input Audio")
                    # Two alternative audio sources; both components deliver a
                    # file path (type="filepath") and both are handler inputs.
                    with gr.Tabs():
                        with gr.Tab("🎙️ Microphone"):
                            inp_mic = gr.Audio(
                                sources=["microphone"],
                                type="filepath",
                                label="Record",
                            )
                        with gr.Tab("📁 Upload File"):
                            inp_file = gr.Audio(
                                sources=["upload"],
                                type="filepath",
                                label="Upload audio (wav / mp3 / flac / ogg …)",
                            )

                    gr.Markdown("### 🤖 Model")
                    # Choices and initial selection are computed once at startup.
                    model_dd = gr.Dropdown(
                        choices=initial_models,
                        value=initial_value,
                        label="Active Voice Model",
                        interactive=True,
                    )

                    gr.Markdown("### 🎚️ Basic Settings")
                    pitch_sl = gr.Slider(
                        minimum=-24, maximum=24, value=0, step=1,
                        label="Pitch Shift (semitones)",
                        info="0 = unchanged · positive = higher · negative = lower",
                    )
                    f0_radio = gr.Radio(
                        choices=["rmvpe", "fcpe", "crepe", "crepe-tiny"],
                        value="rmvpe",
                        label="Pitch Extraction Method",
                        info="rmvpe = fastest & accurate · crepe = highest quality (slower)",
                    )

                # Right column: advanced options, reverb and output format.
                with gr.Column(scale=1):
                    gr.Markdown("### ⚙️ Advanced Settings")
                    # Collapsed by default (open=False).
                    with gr.Accordion("Expand advanced options", open=False):
                        index_rate_sl = gr.Slider(
                            0.0, 1.0, value=0.75, step=0.05,
                            label="Index Rate",
                            info="How strongly the FAISS index influences timbre (0 = off)",
                        )
                        protect_sl = gr.Slider(
                            0.0, 0.5, value=0.5, step=0.01,
                            label="Protect Consonants",
                            info="Protects unvoiced consonants — 0.5 = max protection",
                        )
                        filter_radius_sl = gr.Slider(
                            0, 7, value=3, step=1,
                            label="Respiration Filter Radius",
                            info="Median filter on pitch — higher = smoother, reduces breath noise",
                        )
                        vol_env_sl = gr.Slider(
                            0.0, 1.0, value=0.25, step=0.05,
                            label="Volume Envelope Mix",
                            info="0.25 = natural blend · 1 = preserve input loudness · 0 = model output",
                        )
                        with gr.Row():
                            clean_cb = gr.Checkbox(value=False, label="Noise Reduction")
                            clean_sl = gr.Slider(
                                0.0, 1.0, value=0.5, step=0.05,
                                label="Reduction Strength",
                            )
                        with gr.Row():
                            split_cb = gr.Checkbox(value=False, label="Split Long Audio")
                            autotune_cb = gr.Checkbox(value=False, label="Autotune")
                            # The strength slider starts hidden; toggle_autotune
                            # receives the checkbox value and its result is
                            # applied to the slider (presumably a visibility
                            # update; confirm in lib.ui).
                            autotune_sl = gr.Slider(
                                0.0, 1.0, value=1.0, step=0.05,
                                label="Autotune Strength",
                                visible=False,
                            )
                            autotune_cb.change(
                                fn=toggle_autotune,
                                inputs=autotune_cb,
                                outputs=autotune_sl,
                            )

                    gr.Markdown("**🎛️ Reverb**")
                    reverb_cb = gr.Checkbox(value=False, label="Enable Reverb")
                    # Reverb sliders live in a group that is hidden until the
                    # checkbox above is ticked.
                    with gr.Group(visible=False) as reverb_group:
                        reverb_room_sl = gr.Slider(
                            0.0, 1.0, value=0.15, step=0.05,
                            label="Room Size",
                            info="Larger = bigger sounding space",
                        )
                        reverb_damp_sl = gr.Slider(
                            0.0, 1.0, value=0.7, step=0.05,
                            label="Damping",
                            info="Higher = more absorption, less echo tail",
                        )
                        reverb_wet_sl = gr.Slider(
                            0.0, 1.0, value=0.15, step=0.05,
                            label="Wet Level",
                            info="How much reverb is mixed in (0.15 = subtle)",
                        )
                    # Group visibility follows the checkbox state.
                    reverb_cb.change(
                        fn=lambda v: gr.update(visible=v),
                        inputs=reverb_cb,
                        outputs=reverb_group,
                    )

                    fmt_radio = gr.Radio(
                        choices=["WAV", "MP3", "FLAC", "OPUS"],
                        value="WAV",
                        label="Output Format",
                        info="WAV = lossless, large file; MP3/OPUS = smaller",
                    )

            # Trigger and result widgets for the classic (single-output) mode.
            with gr.Row():
                classic_btn = gr.Button("🚀 Convert Voice (Classic)", variant="primary")
                classic_status = gr.Markdown(value="")
                classic_audio = gr.Audio(label="Converted Audio", type="filepath", interactive=False)

            gr.Markdown("---")
            gr.Markdown("## Opção 2: Pipeline completo – 5 saídas (vídeo + separação de faixas + RVC)")

            # Option 2 (full pipeline): video input on the left, trigger and
            # status on the right.
            with gr.Row():
                with gr.Column(scale=1):
                    video_input = create_video_section()
                    gr.Markdown("*(Ou use áudio/microfone acima – o pipeline aceita qualquer fonte)*")
                with gr.Column(scale=1):
                    full_btn = gr.Button("🚀 Full Pipeline (5 Outputs)", variant="secondary")
                    full_status = gr.Markdown(value="")

            # The five output components
            (entrada_acapella, entrada_audio, entrada_instrumental,
             saida_audio, saida_acapella) = create_five_outputs()

            # Legacy job status check section
            gr.Markdown("---")
            gr.Markdown("### 🔍 Verificar status de um job (clássico)")
            with gr.Row():
                job_id_box = gr.Textbox(
                    label="Job ID",
                    placeholder="e.g. a3f2b1c9",
                    scale=3,
                )
                poll_btn = gr.Button("🔄 Check", scale=1)
            poll_status = gr.Markdown(value="")
            poll_audio = gr.Audio(label="Result", type="filepath", interactive=False)

        # ── TAB 2: Models (original) ──────────────────────────────────────────
        with gr.Tab("📦 Models"):
            # Static instructions; the built-in model names below are plain
            # text and are not generated from any list.
            gr.Markdown("""
            ### Upload a Custom RVC Model
            Provide a **`.zip`** containing:
            - **`model.pth`** — weights (required)
            - **`model.index`** — FAISS index (optional, improves voice matching)

            **Built-in models** (pre-downloaded on startup):
            Vestia Zeta v1 · Vestia Zeta v2 · Ayunda Risu · Gawr Gura
            """)
            with gr.Row():
                # Left column: upload form.
                with gr.Column(scale=1):
                    up_zip = gr.File(label="Model ZIP", file_types=[".zip"])
                    up_name = gr.Textbox(
                        label="Model Name",
                        placeholder="Leave blank to use zip filename",
                    )
                    up_btn = gr.Button("📤 Load Model", variant="primary")
                    up_status = gr.Textbox(label="Status", interactive=False, lines=2)
                # Right column: read-only, single-column table of model names,
                # seeded with the list computed at startup.
                with gr.Column(scale=1):
                    gr.Markdown("### Loaded Models")
                    models_table = gr.Dataframe(
                        col_count=(1, "fixed"),
                        value=[[m] for m in initial_models],
                        interactive=False,
                        label="",
                    )
                    refresh_btn = gr.Button("🔄 Refresh")

            # upload_model's results update the status box, the model dropdown
            # on the Convert tab, and the table, in that order.
            up_btn.click(
                fn=upload_model,
                inputs=[up_zip, up_name],
                outputs=[up_status, model_dd, models_table],
            )
            # refresh_models takes no inputs; note its output order (table
            # first, dropdown second) differs from upload_model's.
            refresh_btn.click(
                fn=refresh_models,
                outputs=[models_table, model_dd],
            )

        # ── TAB 3: Jobs (original, untouched) ─────────────────────────────────
        with gr.Tab("📋 Jobs"):
            gr.Markdown("All submitted jobs, newest first. Click **Refresh** to update.")
            # Both widgets receive a callable (not its result) plus every=10,
            # which asks Gradio to recompute the value periodically; see the
            # Gradio component docs for the exact timer semantics.
            queue_status = gr.Markdown(value=get_queue_info, every=10)
            jobs_table = gr.Dataframe(
                headers=["Job ID", "Model", "Status", "Time", "Download"],
                col_count=(5, "fixed"),
                value=get_jobs_table,
                interactive=False,
                wrap=True,
                # The last column is rendered as markdown.
                datatype=["str", "str", "str", "str", "markdown"],
                every=10,
            )
            refresh_jobs_btn = gr.Button("🔄 Refresh")

            def _refresh_jobs():
                """Return fresh queue info and jobs table for a manual refresh."""
                return get_queue_info(), get_jobs_table()

            refresh_jobs_btn.click(fn=_refresh_jobs, outputs=[queue_status, jobs_table])

        # ── TAB 4: Help (updated) ─────────────────────────────────────────────
        with gr.Tab("ℹ️ Help"):
            # Static help text. Only DEVICE_LABEL and MAX_INPUT_DURATION are
            # interpolated; the integer division by 60 assumes the limit is
            # expressed in seconds (confirm in lib.config).
            gr.Markdown(f"""
            ## How it works
            RVC (Retrieval-Based Voice Conversion) transforms a voice recording to sound
            like a target speaker using only that speaker's model file.

            ---

            ## Two conversion modes

            ### 1. Classic Mode (single output)
            - Upload an audio file or record your voice
            - Choose a model and settings
            - Click **Convert Voice (Classic)**
            - Result is added to the job queue – you can monitor it in the **Jobs** tab

            ### 2. Full Pipeline Mode (5 outputs)
            - Works with **video (MP4)** or any audio source
            - Automatically extracts **acapella** (vocals) and **instrumental** using Demucs
            - Runs RVC conversion on **both** the original audio and the acapella
            - Returns 5 files:
                1. `entrada_acapella.mp3` – extracted vocals
                2. `entrada.mp3` – original audio
                3. `entrada_instrumental.mp3` – background music
                4. `saida.mp3` – RVC applied to original audio
                5. `saida_acapella.mp3` – RVC applied only to vocals
            - All 5 files appear directly on the interface – no need to poll jobs

            ---

            **Device:** `{DEVICE_LABEL}`
            **Max input duration:** {MAX_INPUT_DURATION // 60} minutes

            ---

            ## Credits
            Engine: [Ultimate RVC](https://github.com/JackismyShephard/ultimate-rvc)
            Separation: [Demucs](https://github.com/facebookresearch/demucs)
            """)

    # ── Event handlers ────────────────────────────────────────────────────────
    def _submit_classic(*args):
        """Submit a classic conversion and mirror its job ID into the poll box.

        All positional arguments are forwarded unchanged to ``submit_job``,
        which yields a ``(status, audio)`` pair. The first run of eight
        lowercase-hex characters found in the status text is taken as the job
        ID; when the status is empty or holds no such run, an empty string is
        used instead.

        Returns:
            ``(status, audio, job_id, queue_info, jobs_table)``, matching the
            order of the outputs bound to the classic convert button.
        """
        status, audio = submit_job(*args)
        id_match = re.search(r"[a-f0-9]{8}", status or "")
        job_id = "" if id_match is None else id_match.group(0)
        queue_info = get_queue_info()
        jobs_rows = get_jobs_table()
        return status, audio, job_id, queue_info, jobs_rows

    # Settings shared by both conversion modes, listed in the positional order
    # in which the handlers receive them after the audio/video sources.
    _shared_settings = [
        model_dd,
        pitch_sl,
        f0_radio,
        index_rate_sl,
        protect_sl,
        vol_env_sl,
        clean_cb,
        clean_sl,
        split_cb,
        autotune_cb,
        autotune_sl,
        filter_radius_sl,
        fmt_radio,
        reverb_cb,
        reverb_room_sl,
        reverb_damp_sl,
        reverb_wet_sl,
    ]

    # Classic mode: microphone and upload sources plus the shared settings;
    # the results also refresh the job ID box and the Jobs tab widgets.
    classic_btn.click(
        fn=_submit_classic,
        inputs=[inp_mic, inp_file, *_shared_settings],
        outputs=[classic_status, classic_audio, job_id_box, queue_status, jobs_table],
    )

    # Full pipeline: same inputs preceded by the video source; the results
    # fill the status line and the five output components.
    full_btn.click(
        fn=submit_full_pipeline,
        inputs=[video_input, inp_mic, inp_file, *_shared_settings],
        outputs=[
            full_status,
            entrada_acapella,
            entrada_audio,
            entrada_instrumental,
            saida_audio,
            saida_acapella,
        ],
    )

    def _poll_and_refresh(job_id):
        """Look up one job and refresh the queue widgets in the same call.

        Args:
            job_id: Value of the job ID text box, passed straight to
                ``poll_job``.

        Returns:
            ``(status, file, queue_info, jobs_table)`` where the first two
            items are the pair produced by ``poll_job``.
        """
        job_status, result_file = poll_job(job_id)
        queue_info = get_queue_info()
        jobs_rows = get_jobs_table()
        return job_status, result_file, queue_info, jobs_rows

    # Manual status check: polls the job typed into the box and refreshes the
    # Jobs tab widgets along with the poll result.
    _poll_outputs = [poll_status, poll_audio, queue_status, jobs_table]
    poll_btn.click(fn=_poll_and_refresh, inputs=[job_id_box], outputs=_poll_outputs)

# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Enable the event queue before serving; 5 is the default concurrency
    # limit applied to event listeners.
    demo.queue(default_concurrency_limit=5)

    launch_options = {
        "server_name": "0.0.0.0",  # listen on all interfaces
        "server_port": int(os.getenv("PORT", 7860)),  # PORT env var, else 7860
        "max_threads": 10,
        "ssr_mode": False,
        "css": CSS,
    }
    demo.launch(**launch_options)