| """ |
| PlotweaverNigerianVoice |
| A Gradio Space for the PlotweaverAI Nigerian-English fine-tuned F5-TTS model. |
| Two modes: |
| 1. Default voice: type text -> generated speech using the Nigerian-English |
| voice baked into this Space (sample.wav / sample.txt from the model repo). |
| 2. Custom voice clone: upload your own short reference clip (+ transcript, |
| or leave blank to auto-transcribe) -> generated speech in that voice. |
| Note: F5-TTS's own infer pipeline already auto-transcribes (Whisper, via |
| transformers) and auto-trims reference audio when ref_text is left blank, |
| so we lean on that built-in behavior rather than duplicating it here. |
| Model: PlotweaverAI/nigerian-english-ft-tts (private HF model repo) |
| Base architecture: F5-TTS (SWivid/F5-TTS) |
| """ |
|
|
| import os |
| import threading |
|
|
| import gradio as gr |
| import torch |
| from huggingface_hub import hf_hub_download |
|
|
| |
| try: |
| import spaces |
|
|
| ON_SPACES = True |
| except ImportError: |
| ON_SPACES = False |
|
|
|
|
def gpu_decorator(func):
    """Apply ``spaces.GPU`` to *func* only when CUDA is really available.

    The ``spaces`` package can be importable on a CPU-only Space, so checking
    ``ON_SPACES`` by itself is not enough: wrapping CPU inference in
    ``spaces.GPU`` would fail when the function is called. The decision is
    therefore based on both the import succeeding and
    ``torch.cuda.is_available()``.

    Returns:
        ``spaces.GPU(func)`` when both conditions hold, otherwise *func*
        itself, untouched.
    """
    # `and` short-circuits, so CUDA is only probed when `spaces` imported.
    zero_gpu_usable = ON_SPACES and torch.cuda.is_available()
    if not zero_gpu_usable:
        return func
    return spaces.GPU(func)
|
|
|
|
| |
| from f5_tts.api import F5TTS |
|
|
|
|
# Hugging Face model repo that holds the fine-tuned checkpoint and its assets.
MODEL_REPO = "PlotweaverAI/nigerian-english-ft-tts"
# Access token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Architecture name passed to F5TTS(model=...). Can be overridden through the
# F5TTS_ARCH environment variable; defaults to "F5TTS_v1_Base".
MODEL_ARCH = os.getenv("F5TTS_ARCH", "F5TTS_v1_Base")

# Inference device, chosen once at import time.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# State for the lazily loaded model; get_model() fills these in while holding
# _model_lock. All three start out unset.
_model_lock = threading.Lock()
_tts_model = _default_ref_audio = _default_ref_text = None
|
|
|
|
def _download_model_files():
    """Fetch the checkpoint, vocab and bundled sample voice from MODEL_REPO.

    Every file is requested through ``hf_hub_download`` using ``HF_TOKEN``.
    The sample transcript file is then read as UTF-8 and stripped of
    surrounding whitespace.

    Returns:
        A 4-tuple ``(ckpt_path, vocab_path, sample_wav_path, sample_text)``:
        three local file paths followed by the transcript text itself.
    """

    def fetch(filename):
        # One place for the repo/token arguments shared by every download.
        return hf_hub_download(repo_id=MODEL_REPO, filename=filename, token=HF_TOKEN)

    # The generator is consumed left to right, so downloads happen in the
    # order the filenames are listed.
    wanted = ("model_last.pt", "vocab.txt", "sample.wav", "sample.txt")
    ckpt_path, vocab_path, sample_wav_path, sample_txt_path = (
        fetch(filename) for filename in wanted
    )

    with open(sample_txt_path, "r", encoding="utf-8") as handle:
        sample_text = handle.read().strip()

    return ckpt_path, vocab_path, sample_wav_path, sample_text
|
|
|
|
def get_model():
    """Lazily load the F5-TTS model + default reference voice exactly once.

    The first caller downloads the model files and constructs ``F5TTS`` while
    holding ``_model_lock``. Later callers return the cached objects through a
    lock-free fast path.

    Returns:
        A ``(model, default_ref_audio, default_ref_text)`` tuple: the loaded
        ``F5TTS`` instance, the local path of the bundled sample clip, and
        the transcript of that clip.

    Raises:
        RuntimeError: If constructing ``F5TTS`` raises ``RuntimeError``. The
            error is re-raised with a hint about the ``F5TTS_ARCH`` setting
            and chained to the original exception.
    """
    global _tts_model, _default_ref_audio, _default_ref_text

    # Fast path: no lock once the model has been published.
    if _tts_model is not None:
        return _tts_model, _default_ref_audio, _default_ref_text

    with _model_lock:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if _tts_model is not None:
            return _tts_model, _default_ref_audio, _default_ref_text

        ckpt_path, vocab_path, sample_wav_path, sample_text = _download_model_files()

        try:
            model = F5TTS(
                model=MODEL_ARCH,
                ckpt_file=ckpt_path,
                vocab_file=vocab_path,
                device=DEVICE,
            )
        except RuntimeError as e:
            raise RuntimeError(
                f"Failed to load checkpoint with architecture '{MODEL_ARCH}'. "
                "If this fine-tune was trained from a different F5-TTS base "
                "(e.g. 'F5TTS_Base' instead of 'F5TTS_v1_Base'), set the "
                "F5TTS_ARCH variable in your Space settings to match. "
                f"Original error: {e}"
            ) from e

        # Publish the reference voice BEFORE the model. The fast path above
        # treats a non-None _tts_model as "everything is ready" and reads the
        # other two globals without the lock, so _tts_model must be assigned
        # last. Otherwise a concurrent caller could get (model, None, None).
        _default_ref_audio = sample_wav_path
        _default_ref_text = sample_text
        _tts_model = model

        return _tts_model, _default_ref_audio, _default_ref_text
|
|
|
|
@gpu_decorator
def generate_default_voice(text: str, speed: float, nfe_steps: int):
    """Synthesize *text* using the default reference voice from ``get_model()``.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        speed: Passed to ``model.infer`` as ``speed``.
        nfe_steps: Converted to ``int`` and passed as ``nfe_step``.

    Returns:
        A ``(sample_rate, waveform)`` tuple built from the first two values
        returned by ``model.infer``.

    Raises:
        gr.Error: If *text* is empty or only whitespace.
    """
    prompt = text.strip() if text else ""
    if not prompt:
        raise gr.Error("Please enter some text to generate speech for.")

    tts, voice_clip, voice_transcript = get_model()

    # infer() returns three values; the third is not used here.
    audio, sample_rate, _unused = tts.infer(
        ref_file=voice_clip,
        ref_text=voice_transcript,
        gen_text=prompt,
        speed=speed,
        nfe_step=int(nfe_steps),
        remove_silence=True,
    )

    return (sample_rate, audio)
|
|
|
|
@gpu_decorator
def generate_cloned_voice(
    ref_audio_path: str,
    ref_text: str,
    gen_text: str,
    speed: float,
    nfe_steps: int,
):
    """Synthesize *gen_text* in the voice of an uploaded reference clip.

    Args:
        ref_audio_path: File path of the reference clip, or ``None`` when
            nothing was uploaded.
        ref_text: Transcript of the reference clip. When empty or only
            whitespace, ``model.transcribe`` is called on the clip instead.
        gen_text: Text to speak; surrounding whitespace is stripped.
        speed: Passed to ``model.infer`` as ``speed``.
        nfe_steps: Converted to ``int`` and passed as ``nfe_step``.

    Returns:
        ``((sample_rate, waveform), transcript)`` where *transcript* is the
        reference text actually handed to ``model.infer``.

    Raises:
        gr.Error: If no clip was supplied, if *gen_text* is blank, or if the
            automatic transcription comes back empty.
    """
    if ref_audio_path is None:
        raise gr.Error("Please upload a reference voice clip first.")

    target_text = gen_text.strip() if gen_text else ""
    if not target_text:
        raise gr.Error("Please enter the text you want spoken in the cloned voice.")

    tts = get_model()[0]

    # Use the user-supplied transcript when there is one; fall back to
    # transcribing the clip only when it is blank.
    transcript = (ref_text or "").strip() or tts.transcribe(ref_audio_path)
    if not transcript:
        raise gr.Error(
            "Could not auto-transcribe the uploaded clip. "
            "Please type the transcript manually and try again."
        )

    # infer() returns three values; the third is not used here.
    audio, sample_rate, _unused = tts.infer(
        ref_file=ref_audio_path,
        ref_text=transcript,
        gen_text=target_text,
        speed=speed,
        nfe_step=int(nfe_steps),
        remove_silence=True,
    )

    return (sample_rate, audio), transcript
|
|
# Page-level styles: centered title/subtitle and a small, subdued note class.
CSS = """
#title { text-align: center; margin-bottom: 0.5em; }
#subtitle { text-align: center; color: var(--body-text-color-subdued); margin-bottom: 1.5em; }
.cpu-note { font-size: 0.85em; color: var(--body-text-color-subdued); }
"""


def _advanced_settings():
    """Build the collapsed "Advanced settings" accordion used by both tabs.

    Must be called inside an active Gradio layout context so the accordion is
    attached where the caller wants it. Returns ``(speed_slider, nfe_slider)``.
    """
    with gr.Accordion("Advanced settings", open=False):
        speed_slider = gr.Slider(
            minimum=0.5, maximum=2.0, value=1.0, step=0.05, label="Speed"
        )
        nfe_slider = gr.Slider(
            minimum=8,
            maximum=64,
            value=24,
            step=2,
            label="Quality steps (NFE)",
            info="Higher = better quality but slower. 16-24 recommended on CPU.",
        )
    return speed_slider, nfe_slider


with gr.Blocks(css=CSS, title="Plotweaver Nigerian Voice") as demo:
    # ---- Page header -------------------------------------------------------
    gr.Markdown("# Plotweaver Nigerian Voice", elem_id="title")
    gr.Markdown(
        "Nigerian-English text-to-speech, fine-tuned from F5-TTS by Plotweaver AI. "
        "Generate speech in our Nigerian voice, or clone a voice from your own clip.",
        elem_id="subtitle",
    )
    gr.Markdown(
        "Running on free CPU hardware — generation can take **30–90+ seconds** "
        "per request, longer for longer text. Research / non-commercial use only "
        "(base F5-TTS is CC-BY-NC).",
        elem_classes="cpu-note",
    )

    with gr.Tabs():
        # ---- Tab 1: built-in voice ----------------------------------------
        with gr.Tab("Nigerian Voice (Text → Speech)"):
            with gr.Row():
                with gr.Column(scale=1):
                    default_text = gr.Textbox(
                        label="Text to speak",
                        placeholder="Type what you want the Nigerian voice to say...",
                        lines=6,
                        max_lines=12,
                    )
                    default_speed, default_nfe = _advanced_settings()
                    default_btn = gr.Button("Generate Speech", variant="primary")
                with gr.Column(scale=1):
                    default_audio_out = gr.Audio(label="Generated audio", type="numpy")

            default_btn.click(
                fn=generate_default_voice,
                inputs=[default_text, default_speed, default_nfe],
                outputs=[default_audio_out],
            )

        # ---- Tab 2: voice cloning -----------------------------------------
        with gr.Tab("Clone a Voice (Upload Clip → Speech)"):
            gr.Markdown(
                "Upload a short, clean voice clip (5-15 seconds works best). "
                "Add the transcript of that clip if you have it — or leave it "
                "blank and we'll auto-transcribe it with Whisper."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    ref_audio_in = gr.Audio(
                        label="Reference voice clip",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    ref_text_in = gr.Textbox(
                        label="Transcript of the clip (optional — auto-transcribed if left blank)",
                        placeholder="Leave blank to auto-transcribe with Whisper...",
                        lines=3,
                    )
                    clone_gen_text = gr.Textbox(
                        label="Text to speak in this cloned voice",
                        placeholder="Type what you want spoken in the uploaded voice...",
                        lines=5,
                        max_lines=12,
                    )
                    clone_speed, clone_nfe = _advanced_settings()
                    clone_btn = gr.Button("Generate Cloned Speech", variant="primary")
                with gr.Column(scale=1):
                    clone_audio_out = gr.Audio(label="Generated audio", type="numpy")
                    used_transcript_out = gr.Textbox(
                        label="Transcript used for the reference clip",
                        interactive=False,
                    )

            clone_btn.click(
                fn=generate_cloned_voice,
                inputs=[ref_audio_in, ref_text_in, clone_gen_text, clone_speed, clone_nfe],
                outputs=[clone_audio_out, used_transcript_out],
            )

    # ---- Footer ------------------------------------------------------------
    gr.Markdown(
        "---\nBuilt on [F5-TTS](https://github.com/SWivid/F5-TTS) "
        "(CC-BY-NC license) · Fine-tuned voice by Plotweaver AI · "
        "Non-commercial / research use.",
        elem_classes="cpu-note",
    )


# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()