"""
PlotweaverNigerianVoice

A Gradio Space for the PlotweaverAI Nigerian-English fine-tuned F5-TTS model.

Two modes:
  1. Default voice: type text -> generated speech using the Nigerian-English
     voice bundled with the model repo (sample.wav / sample.txt).
  2. Custom voice clone: upload your own short reference clip (+ transcript,
     or leave blank to auto-transcribe) -> generated speech in that voice.

Note: when the transcript is left blank, the clone path asks the model's
transcribe() helper (Whisper-based) for one, so the transcript that was
actually used can be shown back to the user.

Model: PlotweaverAI/nigerian-english-ft-tts (private HF model repo)
Base architecture: F5-TTS (SWivid/F5-TTS)
"""

import os
import threading

import gradio as gr
import torch
from huggingface_hub import hf_hub_download

# --- Optional dependency: the `spaces` package (Hugging Face Spaces) ---------
try:
    import spaces
except ImportError:
    ON_SPACES = False
else:
    ON_SPACES = True


def gpu_decorator(func):
    """Return `func` wrapped in `spaces.GPU` only when a GPU is usable.

    The wrapper is applied when both conditions hold: the `spaces` package
    imported successfully AND torch reports CUDA as available. In every other
    case (no `spaces`, or a CPU-only host where `spaces` may still be
    importable) the function is returned untouched, so CPU inference runs as a
    plain call.
    """
    gpu_usable = ON_SPACES and torch.cuda.is_available()
    if not gpu_usable:
        return func
    return spaces.GPU(func)


# F5-TTS imports (package: f5-tts, installed from PyPI / git in requirements.txt)
from f5_tts.api import F5TTS

MODEL_REPO = "PlotweaverAI/nigerian-english-ft-tts"
HF_TOKEN = os.environ.get("HF_TOKEN")  # set as a Space secret (repo is private)

# Architecture the checkpoint was fine-tuned from. F5TTS_v1_Base is the current
# default for new finetunes; override with the F5TTS_ARCH secret/variable if
# your training run used a different base (e.g. "F5TTS_Base").
MODEL_ARCH = os.environ.get("F5TTS_ARCH", "F5TTS_v1_Base")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Serializes the one-time model load performed by get_model().
_model_lock = threading.Lock()
# Lazily populated cache. `_tts_model` doubles as the "everything is loaded"
# flag, so get_model() must always assign it LAST.
_tts_model = None
_default_ref_audio = None
_default_ref_text = None


def _download_model_files():
    """Fetch the checkpoint, vocab and bundled sample voice from MODEL_REPO.

    Downloads `model_last.pt`, `vocab.txt`, `sample.wav` and `sample.txt`
    (in that order) via `hf_hub_download`, authenticating with HF_TOKEN.

    Returns:
        A 4-tuple `(ckpt_path, vocab_path, sample_wav_path, sample_text)`
        where the first three are local file paths and `sample_text` is the
        stripped UTF-8 content of `sample.txt`.
    """
    ckpt_path, vocab_path, sample_wav_path, sample_txt_path = (
        hf_hub_download(repo_id=MODEL_REPO, filename=name, token=HF_TOKEN)
        for name in ("model_last.pt", "vocab.txt", "sample.wav", "sample.txt")
    )

    with open(sample_txt_path, "r", encoding="utf-8") as f:
        sample_text = f.read().strip()

    return ckpt_path, vocab_path, sample_wav_path, sample_text


def get_model():
    """Lazily load the F5-TTS model and default reference voice exactly once.

    Returns:
        `(model, default_ref_audio_path, default_ref_text)`.

    Raises:
        RuntimeError: if constructing `F5TTS` raises a RuntimeError; the
            message points at the F5TTS_ARCH setting and includes the
            original error, which is also chained as `__cause__`.
    """
    global _tts_model, _default_ref_audio, _default_ref_text

    # Lock-free fast path. This is only safe because the loader below assigns
    # `_tts_model` after the two reference globals are already in place.
    if _tts_model is not None:
        return _tts_model, _default_ref_audio, _default_ref_text

    with _model_lock:
        # Re-check under the lock: another thread may have finished loading
        # while this one was waiting.
        if _tts_model is None:
            ckpt_path, vocab_path, sample_wav_path, sample_text = (
                _download_model_files()
            )

            try:
                model = F5TTS(
                    model=MODEL_ARCH,
                    ckpt_file=ckpt_path,
                    vocab_file=vocab_path,
                    device=DEVICE,
                )
            except RuntimeError as e:
                raise RuntimeError(
                    f"Failed to load checkpoint with architecture '{MODEL_ARCH}'. "
                    "If this fine-tune was trained from a different F5-TTS base "
                    "(e.g. 'F5TTS_Base' instead of 'F5TTS_v1_Base'), set the "
                    "F5TTS_ARCH variable in your Space settings to match. "
                    f"Original error: {e}"
                ) from e

            # Publish the reference voice first and the model last. Assigning
            # `_tts_model` first would let a concurrent caller on the fast
            # path observe a non-None model paired with None reference data.
            _default_ref_audio = sample_wav_path
            _default_ref_text = sample_text
            _tts_model = model

        return _tts_model, _default_ref_audio, _default_ref_text


def _synthesize(model, ref_file, ref_text, gen_text, speed, nfe_steps):
    """Run one `model.infer` call and return `(sample_rate, waveform)`.

    `gen_text` is stripped and `nfe_steps` is coerced to int before the call
    (Gradio sliders may hand back floats).
    """
    # NOTE(review): in the f5-tts API, `remove_silence` appears to be applied
    # only when `file_wave` is also given — confirm whether it has any effect
    # on the in-memory waveform returned here.
    wav, sr, _ = model.infer(
        ref_file=ref_file,
        ref_text=ref_text,
        gen_text=gen_text.strip(),
        speed=speed,
        nfe_step=int(nfe_steps),
        remove_silence=True,
    )
    return sr, wav


@gpu_decorator
def generate_default_voice(text: str, speed: float, nfe_steps: int):
    """Speak `text` in the bundled default reference voice.

    Returns:
        `(sample_rate, waveform)` for the `gr.Audio(type="numpy")` output.

    Raises:
        gr.Error: if `text` is empty or only whitespace.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to generate speech for.")

    model, ref_audio, ref_text = get_model()
    return _synthesize(model, ref_audio, ref_text, text, speed, nfe_steps)


@gpu_decorator
def generate_cloned_voice(
    ref_audio_path: str,
    ref_text: str,
    gen_text: str,
    speed: float,
    nfe_steps: int,
):
    """Speak `gen_text` in the voice of the uploaded reference clip.

    If `ref_text` is blank, the clip is transcribed with `model.transcribe`
    and that transcript is used instead.

    Returns:
        `((sample_rate, waveform), transcript)` where `transcript` is the
        reference text that was actually used.

    Raises:
        gr.Error: if no clip was provided, `gen_text` is blank, or
            auto-transcription produced an empty result.
    """
    # `not` (rather than `is None`) also rejects an empty path string.
    if not ref_audio_path:
        raise gr.Error("Please upload a reference voice clip first.")
    if not gen_text or not gen_text.strip():
        raise gr.Error("Please enter the text you want spoken in the cloned voice.")

    model, _, _ = get_model()

    transcript = ref_text.strip() if ref_text else ""
    if not transcript:
        # F5-TTS's transcribe() uses a Whisper pipeline under the hood.
        transcript = model.transcribe(ref_audio_path)
        if not transcript:
            raise gr.Error(
                "Could not auto-transcribe the uploaded clip. "
                "Please type the transcript manually and try again."
            )

    audio = _synthesize(
        model, ref_audio_path, transcript, gen_text, speed, nfe_steps
    )
    return audio, transcript


CSS = """
#title { text-align: center; margin-bottom: 0.5em; }
#subtitle { text-align: center; color: var(--body-text-color-subdued); margin-bottom: 1.5em; }
.cpu-note { font-size: 0.85em; color: var(--body-text-color-subdued); }
"""

with gr.Blocks(css=CSS, title="Plotweaver Nigerian Voice") as demo:
    gr.Markdown("# Plotweaver Nigerian Voice", elem_id="title")
    gr.Markdown(
        "Nigerian-English text-to-speech, fine-tuned from F5-TTS by Plotweaver AI. "
        "Generate speech in our Nigerian voice, or clone a voice from your own clip.",
        elem_id="subtitle",
    )
    gr.Markdown(
        "Running on free CPU hardware — generation can take **30–90+ seconds** "
        "per request, longer for longer text. Research / non-commercial use only "
        "(base F5-TTS is CC-BY-NC).",
        elem_classes="cpu-note",
    )

    with gr.Tabs():
        # --- Tab 1: fixed Nigerian-English voice ------------------------------
        with gr.Tab("Nigerian Voice (Text → Speech)"):
            with gr.Row():
                with gr.Column(scale=1):
                    default_text = gr.Textbox(
                        label="Text to speak",
                        placeholder="Type what you want the Nigerian voice to say...",
                        lines=6,
                        max_lines=12,
                    )
                    with gr.Accordion("Advanced settings", open=False):
                        default_speed = gr.Slider(
                            0.5, 2.0, value=1.0, step=0.05, label="Speed"
                        )
                        default_nfe = gr.Slider(
                            8,
                            64,
                            value=24,
                            step=2,
                            label="Quality steps (NFE)",
                            info="Higher = better quality but slower. 16-24 recommended on CPU.",
                        )
                    default_btn = gr.Button("Generate Speech", variant="primary")
                with gr.Column(scale=1):
                    default_audio_out = gr.Audio(label="Generated audio", type="numpy")

            default_btn.click(
                fn=generate_default_voice,
                inputs=[default_text, default_speed, default_nfe],
                outputs=[default_audio_out],
            )

        # --- Tab 2: clone a voice from an uploaded / recorded clip ------------
        with gr.Tab("Clone a Voice (Upload Clip → Speech)"):
            gr.Markdown(
                "Upload a short, clean voice clip (5-15 seconds works best). "
                "Add the transcript of that clip if you have it — or leave it "
                "blank and we'll auto-transcribe it with Whisper."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    ref_audio_in = gr.Audio(
                        label="Reference voice clip",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    ref_text_in = gr.Textbox(
                        label="Transcript of the clip (optional — auto-transcribed if left blank)",
                        placeholder="Leave blank to auto-transcribe with Whisper...",
                        lines=3,
                    )
                    clone_gen_text = gr.Textbox(
                        label="Text to speak in this cloned voice",
                        placeholder="Type what you want spoken in the uploaded voice...",
                        lines=5,
                        max_lines=12,
                    )
                    with gr.Accordion("Advanced settings", open=False):
                        clone_speed = gr.Slider(
                            0.5, 2.0, value=1.0, step=0.05, label="Speed"
                        )
                        clone_nfe = gr.Slider(
                            8,
                            64,
                            value=24,
                            step=2,
                            label="Quality steps (NFE)",
                            info="Higher = better quality but slower. 16-24 recommended on CPU.",
                        )
                    clone_btn = gr.Button("Generate Cloned Speech", variant="primary")
                with gr.Column(scale=1):
                    clone_audio_out = gr.Audio(label="Generated audio", type="numpy")
                    used_transcript_out = gr.Textbox(
                        label="Transcript used for the reference clip",
                        interactive=False,
                    )

            clone_btn.click(
                fn=generate_cloned_voice,
                inputs=[ref_audio_in, ref_text_in, clone_gen_text, clone_speed, clone_nfe],
                outputs=[clone_audio_out, used_transcript_out],
            )

    gr.Markdown(
        "---\nBuilt on [F5-TTS](https://github.com/SWivid/F5-TTS) "
        "(CC-BY-NC license) · Fine-tuned voice by Plotweaver AI · "
        "Non-commercial / research use.",
        elem_classes="cpu-note",
    )


if __name__ == "__main__":
    demo.launch()