Spaces:

PlotweaverAI
/

PlotweaverNigerianVoice

Sleeping

File size: 10,519 Bytes

"""
PlotweaverNigerianVoice
A Gradio Space for the PlotweaverAI Nigerian-English fine-tuned F5-TTS model.
Two modes:
  1. Default voice: type text -> generated speech using the Nigerian-English
     voice baked into this Space (sample.wav / sample.txt from the model repo).
  2. Custom voice clone: upload your own short reference clip (+ transcript,
     or leave blank to auto-transcribe) -> generated speech in that voice.
Note: F5-TTS's own infer pipeline can auto-transcribe (Whisper, via
transformers) and auto-trims reference audio. For a blank transcript the
clone tab still calls the model's transcribe() explicitly, so that the
transcript actually used can be shown back to the user.
Model: PlotweaverAI/nigerian-english-ft-tts (private HF model repo)
Base architecture: F5-TTS (SWivid/F5-TTS)
"""

import os
import threading

import gradio as gr
import torch
from huggingface_hub import hf_hub_download

# --- Optional: only present when running on HF Spaces with a GPU tier ----
try:
    import spaces

    ON_SPACES = True
except ImportError:
    ON_SPACES = False


def gpu_decorator(func):
    """Return `func` wrapped in spaces.GPU when applicable, else `func` itself.

    The wrap is applied only if the optional `spaces` package imported
    (ON_SPACES is true) *and* torch reports CUDA as available. The presence
    of `spaces` alone is deliberately not used as the signal: the package is
    reported to import on CPU-only Spaces as well, and CPU inference should
    stay un-wrapped.
    """
    use_gpu_wrapper = ON_SPACES and torch.cuda.is_available()
    return spaces.GPU(func) if use_gpu_wrapper else func


# F5-TTS imports (package: f5-tts, installed from PyPI / git in requirements.txt)
from f5_tts.api import F5TTS


# Hugging Face model repo holding the fine-tuned checkpoint and sample voice.
MODEL_REPO = "PlotweaverAI/nigerian-english-ft-tts"

# Access token read from the environment (set as a Space secret; the repo is
# private). None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Name of the F5-TTS architecture the checkpoint was fine-tuned from.
# Defaults to "F5TTS_v1_Base"; set the F5TTS_ARCH secret/variable when the
# training run started from another base (e.g. "F5TTS_Base").
MODEL_ARCH = os.getenv("F5TTS_ARCH", "F5TTS_v1_Base")

# Inference device, decided once at import time from CUDA availability.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"


# State for the lazily loaded model; populated by get_model() under the lock.
_model_lock = threading.Lock()
_tts_model = _default_ref_audio = _default_ref_text = None


def _download_model_files():
    """Fetch the checkpoint, vocab and bundled sample voice from MODEL_REPO.

    Every file is requested via hf_hub_download using HF_TOKEN (the repo is
    described as private, so the token is expected to grant read access).

    Returns:
        A 4-tuple ``(ckpt_path, vocab_path, sample_wav_path, sample_text)``.
        The first three are the local paths returned by hf_hub_download for
        "model_last.pt", "vocab.txt" and "sample.wav"; the last is the
        whitespace-stripped UTF-8 contents of "sample.txt".
    """

    def fetch(filename):
        # One hub download, always against the same repo / token.
        return hf_hub_download(
            repo_id=MODEL_REPO, filename=filename, token=HF_TOKEN
        )

    checkpoint = fetch("model_last.pt")
    vocab = fetch("vocab.txt")
    voice_clip = fetch("sample.wav")
    voice_transcript_file = fetch("sample.txt")

    with open(voice_transcript_file, "r", encoding="utf-8") as handle:
        voice_transcript = handle.read().strip()

    return checkpoint, vocab, voice_clip, voice_transcript


def get_model():
    """Lazily load the F5-TTS model + default reference voice exactly once.

    The first caller downloads the model files and builds the F5TTS object
    while holding ``_model_lock``; later callers get the cached objects.

    Returns:
        A 3-tuple ``(model, default_ref_audio, default_ref_text)``: the
        F5TTS instance, the local path of the bundled sample.wav, and the
        text of sample.txt.

    Raises:
        RuntimeError: if constructing F5TTS raises RuntimeError; it is
            re-raised with a hint about the F5TTS_ARCH setting and the
            original error chained as the cause.
    """
    global _tts_model, _default_ref_audio, _default_ref_text

    # Lock-free fast path. `_tts_model` doubles as the "everything is ready"
    # flag, so it must be the LAST global assigned below; otherwise a
    # concurrent caller could observe the model while the default reference
    # audio/text are still None.
    if _tts_model is not None:
        return _tts_model, _default_ref_audio, _default_ref_text

    with _model_lock:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if _tts_model is None:
            ckpt_path, vocab_path, sample_wav_path, sample_text = (
                _download_model_files()
            )

            try:
                model = F5TTS(
                    model=MODEL_ARCH,
                    ckpt_file=ckpt_path,
                    vocab_file=vocab_path,
                    device=DEVICE,
                )
            except RuntimeError as e:
                raise RuntimeError(
                    f"Failed to load checkpoint with architecture '{MODEL_ARCH}'. "
                    "If this fine-tune was trained from a different F5-TTS base "
                    "(e.g. 'F5TTS_Base' instead of 'F5TTS_v1_Base'), set the "
                    "F5TTS_ARCH variable in your Space settings to match. "
                    f"Original error: {e}"
                ) from e

            # Publish the reference voice first and the model last (see the
            # fast-path comment above).
            _default_ref_audio = sample_wav_path
            _default_ref_text = sample_text
            _tts_model = model

        return _tts_model, _default_ref_audio, _default_ref_text


@gpu_decorator
def generate_default_voice(text: str, speed: float, nfe_steps: int):
    """Synthesize `text` in the default reference voice returned by get_model().

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        speed: Passed through to ``model.infer`` as ``speed``.
        nfe_steps: Converted with ``int()`` and passed as ``nfe_step``.

    Returns:
        ``(sample_rate, waveform)`` as produced by ``model.infer``.

    Raises:
        gr.Error: if `text` is missing or only whitespace.
    """
    prompt = text.strip() if text else ""
    if not prompt:
        raise gr.Error("Please enter some text to generate speech for.")

    tts, voice_clip, voice_transcript = get_model()

    waveform, sample_rate, _ = tts.infer(
        ref_file=voice_clip,
        ref_text=voice_transcript,
        gen_text=prompt,
        speed=speed,
        nfe_step=int(nfe_steps),
        remove_silence=True,
    )
    return sample_rate, waveform


@gpu_decorator
def generate_cloned_voice(
    ref_audio_path: str,
    ref_text: str,
    gen_text: str,
    speed: float,
    nfe_steps: int,
):
    """Synthesize `gen_text` using an uploaded clip as the reference voice.

    Args:
        ref_audio_path: Path of the reference clip; None means nothing was
            provided.
        ref_text: Transcript of the clip. When empty or whitespace-only, the
            clip is transcribed with ``model.transcribe``.
        gen_text: Text to speak; surrounding whitespace is stripped.
        speed: Passed through to ``model.infer`` as ``speed``.
        nfe_steps: Converted with ``int()`` and passed as ``nfe_step``.

    Returns:
        ``((sample_rate, waveform), transcript)`` where `transcript` is the
        reference text that was actually used for inference.

    Raises:
        gr.Error: if no clip was given, if `gen_text` is blank, or if the
            automatic transcription comes back empty.
    """
    if ref_audio_path is None:
        raise gr.Error("Please upload a reference voice clip first.")

    target_text = gen_text.strip() if gen_text else ""
    if not target_text:
        raise gr.Error("Please enter the text you want spoken in the cloned voice.")

    tts = get_model()[0]

    reference_text = (ref_text or "").strip()
    if not reference_text:
        # No usable transcript supplied: fall back to the model's own
        # transcribe() (a Whisper pipeline, per the F5-TTS API).
        reference_text = tts.transcribe(ref_audio_path)
        if not reference_text:
            raise gr.Error(
                "Could not auto-transcribe the uploaded clip. "
                "Please type the transcript manually and try again."
            )

    waveform, sample_rate, _ = tts.infer(
        ref_file=ref_audio_path,
        ref_text=reference_text,
        gen_text=target_text,
        speed=speed,
        nfe_step=int(nfe_steps),
        remove_silence=True,
    )
    return (sample_rate, waveform), reference_text

# Page-level CSS handed to gr.Blocks below: centers the #title / #subtitle
# elements and gives .cpu-note text a smaller, subdued look.
CSS = """
#title { text-align: center; margin-bottom: 0.5em; }
#subtitle { text-align: center; color: var(--body-text-color-subdued); margin-bottom: 1.5em; }
.cpu-note { font-size: 0.85em; color: var(--body-text-color-subdued); }
"""

# Build the UI. Components are created in layout order inside nested context
# managers, so the statement order below is the on-page order.
with gr.Blocks(css=CSS, title="Plotweaver Nigerian Voice") as demo:
    # Page header: title, subtitle, and a hardware / licensing notice.
    gr.Markdown("# Plotweaver Nigerian Voice", elem_id="title")
    gr.Markdown(
        "Nigerian-English text-to-speech, fine-tuned from F5-TTS by Plotweaver AI. "
        "Generate speech in our Nigerian voice, or clone a voice from your own clip.",
        elem_id="subtitle",
    )
    # NOTE(review): this notice is static text; it is shown regardless of the
    # DEVICE that was actually selected at import time.
    gr.Markdown(
        "Running on free CPU hardware — generation can take **30–90+ seconds** "
        "per request, longer for longer text. Research / non-commercial use only "
        "(base F5-TTS is CC-BY-NC).",
        elem_classes="cpu-note",
    )

    with gr.Tabs():

        # --- Tab 1: speak typed text in the default (bundled) voice ---------
        with gr.Tab("Nigerian Voice (Text → Speech)"):
            with gr.Row():
                # Left column: text input, advanced settings, generate button.
                with gr.Column(scale=1):
                    default_text = gr.Textbox(
                        label="Text to speak",
                        placeholder="Type what you want the Nigerian voice to say...",
                        lines=6,
                        max_lines=12,
                    )
                    with gr.Accordion("Advanced settings", open=False):
                        # Speed: 0.5-2.0 in 0.05 steps, default 1.0.
                        default_speed = gr.Slider(
                            0.5, 2.0, value=1.0, step=0.05, label="Speed"
                        )
                        # NFE steps: 8-64 in steps of 2, default 24.
                        default_nfe = gr.Slider(
                            8, 64, value=24, step=2,
                            label="Quality steps (NFE)",
                            info="Higher = better quality but slower. 16-24 recommended on CPU.",
                        )
                    default_btn = gr.Button("Generate Speech", variant="primary")
                # Right column: audio player for the generated result.
                with gr.Column(scale=1):
                    default_audio_out = gr.Audio(label="Generated audio", type="numpy")

            # Inputs are listed in the same order as the parameters of
            # generate_default_voice (text, speed, nfe_steps).
            default_btn.click(
                fn=generate_default_voice,
                inputs=[default_text, default_speed, default_nfe],
                outputs=[default_audio_out],
            )

        # --- Tab 2: speak typed text in a voice cloned from an uploaded clip -
        with gr.Tab("Clone a Voice (Upload Clip → Speech)"):
            gr.Markdown(
                "Upload a short, clean voice clip (5-15 seconds works best). "
                "Add the transcript of that clip if you have it — or leave it "
                "blank and we'll auto-transcribe it with Whisper."
            )
            with gr.Row():
                # Left column: reference clip, its transcript, target text,
                # advanced settings, generate button.
                with gr.Column(scale=1):
                    # type="filepath": the handler receives a path to the clip.
                    ref_audio_in = gr.Audio(
                        label="Reference voice clip",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    ref_text_in = gr.Textbox(
                        label="Transcript of the clip (optional — auto-transcribed if left blank)",
                        placeholder="Leave blank to auto-transcribe with Whisper...",
                        lines=3,
                    )
                    clone_gen_text = gr.Textbox(
                        label="Text to speak in this cloned voice",
                        placeholder="Type what you want spoken in the uploaded voice...",
                        lines=5,
                        max_lines=12,
                    )
                    with gr.Accordion("Advanced settings", open=False):
                        # Same ranges and defaults as the sliders on tab 1.
                        clone_speed = gr.Slider(
                            0.5, 2.0, value=1.0, step=0.05, label="Speed"
                        )
                        clone_nfe = gr.Slider(
                            8, 64, value=24, step=2,
                            label="Quality steps (NFE)",
                            info="Higher = better quality but slower. 16-24 recommended on CPU.",
                        )
                    clone_btn = gr.Button("Generate Cloned Speech", variant="primary")
                # Right column: generated audio plus the transcript that was
                # used for the reference clip (read-only).
                with gr.Column(scale=1):
                    clone_audio_out = gr.Audio(label="Generated audio", type="numpy")
                    used_transcript_out = gr.Textbox(
                        label="Transcript used for the reference clip",
                        interactive=False,
                    )

            # Inputs follow the parameter order of generate_cloned_voice; its
            # two return values fill the audio player and the transcript box.
            clone_btn.click(
                fn=generate_cloned_voice,
                inputs=[ref_audio_in, ref_text_in, clone_gen_text, clone_speed, clone_nfe],
                outputs=[clone_audio_out, used_transcript_out],
            )

    # Footer: attribution and licensing note.
    gr.Markdown(
        "---\nBuilt on [F5-TTS](https://github.com/SWivid/F5-TTS) "
        "(CC-BY-NC license) · Fine-tuned voice by Plotweaver AI · "
        "Non-commercial / research use.",
        elem_classes="cpu-note",
    )

# Launch the Gradio app only when this file is executed directly, not when it
# is imported as a module.
if __name__ == "__main__":
    demo.launch()