offiongbassey's picture
Update app
4fc0704 verified
Raw
History Blame Contribute Delete
10.5 kB
"""
PlotweaverNigerianVoice
A Gradio Space for the PlotweaverAI Nigerian-English fine-tuned F5-TTS model.
Two modes:
1. Default voice: type text -> generated speech using the Nigerian-English
voice baked into this Space (sample.wav / sample.txt from the model repo).
2. Custom voice clone: upload your own short reference clip (+ transcript,
or leave blank to auto-transcribe) -> generated speech in that voice.
Note: when the reference transcript is left blank, the clone path calls the
model's transcribe() helper (Whisper, via transformers) itself, so the exact
transcript passed to infer() can be shown back to the user. Reference-audio
trimming is left to F5-TTS's own infer pipeline.
Model: PlotweaverAI/nigerian-english-ft-tts (private HF model repo)
Base architecture: F5-TTS (SWivid/F5-TTS)
"""
import os
import threading
import gradio as gr
import torch
from huggingface_hub import hf_hub_download
# --- Optional dependency: record whether the `spaces` package is importable ---
try:
    import spaces
except ImportError:
    # Package not installed in this environment.
    ON_SPACES = False
else:
    ON_SPACES = True
def gpu_decorator(func):
    """Return *func* wrapped in ``spaces.GPU`` when a GPU is usable, else *func*.

    The wrapper is applied only if both conditions hold: the ``spaces``
    package was imported (``ON_SPACES``) and torch reports CUDA as available.
    The CUDA check is deliberate: ``spaces`` being importable does not by
    itself mean a GPU is attached, and the original author notes that
    wrapping CPU-only inference in ``spaces.GPU`` fails at call time.
    """
    # `and` short-circuits, so torch is only queried when `spaces` imported.
    gpu_usable = ON_SPACES and torch.cuda.is_available()
    return spaces.GPU(func) if gpu_usable else func
# F5-TTS imports (package: f5-tts, installed from PyPI / git in requirements.txt)
from f5_tts.api import F5TTS
# Hugging Face model repo that holds the checkpoint, vocab and sample voice.
MODEL_REPO = "PlotweaverAI/nigerian-english-ft-tts"

# Access token for the (private) model repo, supplied via the environment;
# None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# F5-TTS architecture name handed to the model constructor. Defaults to
# "F5TTS_v1_Base"; set the F5TTS_ARCH environment variable when the
# checkpoint was fine-tuned from a different base (e.g. "F5TTS_Base").
MODEL_ARCH = os.getenv("F5TTS_ARCH", "F5TTS_v1_Base")

# Inference device: CUDA when torch can see it, otherwise CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Lazy-load state: the lock serialises the one-time model load in get_model();
# the three slots stay None until that load has happened.
_model_lock = threading.Lock()
_tts_model = _default_ref_audio = _default_ref_text = None
def _download_model_files():
    """Fetch the model artefacts from ``MODEL_REPO`` and read the sample text.

    Four files are requested, in this order: ``model_last.pt``, ``vocab.txt``,
    ``sample.wav`` and ``sample.txt``. Every request passes ``HF_TOKEN``.

    Returns:
        A 4-tuple ``(ckpt_path, vocab_path, sample_wav_path, sample_text)``
        where the first three are the local paths returned by
        ``hf_hub_download`` and the last is the whitespace-stripped content
        of ``sample.txt``.
    """

    def fetch(filename):
        # One download from the model repo, authenticated with the token.
        return hf_hub_download(
            repo_id=MODEL_REPO, filename=filename, token=HF_TOKEN
        )

    # The generator is consumed left to right, so download order is preserved.
    ckpt_path, vocab_path, sample_wav_path, sample_txt_path = (
        fetch(name)
        for name in ("model_last.pt", "vocab.txt", "sample.wav", "sample.txt")
    )

    with open(sample_txt_path, encoding="utf-8") as handle:
        sample_text = handle.read().strip()

    return ckpt_path, vocab_path, sample_wav_path, sample_text
def get_model():
    """Lazily load the F5-TTS model and default reference voice exactly once.

    The first call downloads the model files, builds the ``F5TTS`` instance
    and caches it together with the bundled sample voice in module globals.
    Later calls return the cached values without taking the lock.

    Returns:
        A 3-tuple ``(model, default_ref_audio, default_ref_text)``: the
        ``F5TTS`` instance, the local path of ``sample.wav`` and the text
        read from ``sample.txt``.

    Raises:
        RuntimeError: if constructing ``F5TTS`` raises ``RuntimeError``; the
            message points at the ``F5TTS_ARCH`` setting and includes the
            original error. Errors from the download step propagate as-is.
    """
    global _tts_model, _default_ref_audio, _default_ref_text

    if _tts_model is None:
        with _model_lock:
            # Re-check under the lock: another thread may have finished the
            # load while this one was waiting.
            if _tts_model is None:
                ckpt_path, vocab_path, sample_wav_path, sample_text = (
                    _download_model_files()
                )
                try:
                    model = F5TTS(
                        model=MODEL_ARCH,
                        ckpt_file=ckpt_path,
                        vocab_file=vocab_path,
                        device=DEVICE,
                    )
                except RuntimeError as e:
                    raise RuntimeError(
                        f"Failed to load checkpoint with architecture '{MODEL_ARCH}'. "
                        "If this fine-tune was trained from a different F5-TTS base "
                        "(e.g. 'F5TTS_Base' instead of 'F5TTS_v1_Base'), set the "
                        "F5TTS_ARCH variable in your Space settings to match. "
                        f"Original error: {e}"
                    ) from e
                # Publish the reference voice BEFORE the model. The unlocked
                # check above keys on `_tts_model`, so it must be assigned
                # last; otherwise a concurrent caller could see a non-None
                # model and return (model, None, None).
                _default_ref_audio = sample_wav_path
                _default_ref_text = sample_text
                _tts_model = model

    return _tts_model, _default_ref_audio, _default_ref_text
@gpu_decorator
def generate_default_voice(text: str, speed: float, nfe_steps: int):
    """Synthesise *text* using the default reference voice from ``get_model()``.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        speed: Forwarded to ``model.infer`` as ``speed``.
        nfe_steps: Forwarded to ``model.infer`` as ``nfe_step`` after ``int()``.

    Returns:
        ``(sample_rate, waveform)`` as returned by ``model.infer``, in the
        order the output audio component receives it.

    Raises:
        gr.Error: if *text* is empty or whitespace-only.
    """
    prompt = (text or "").strip()
    if not prompt:
        raise gr.Error("Please enter some text to generate speech for.")

    tts, voice_clip, voice_transcript = get_model()
    # The third value returned by infer() is not used here.
    waveform, sample_rate, _ = tts.infer(
        ref_file=voice_clip,
        ref_text=voice_transcript,
        gen_text=prompt,
        speed=speed,
        nfe_step=int(nfe_steps),
        remove_silence=True,
    )
    return sample_rate, waveform
@gpu_decorator
def generate_cloned_voice(
    ref_audio_path: str,
    ref_text: str,
    gen_text: str,
    speed: float,
    nfe_steps: int,
):
    """Synthesise *gen_text* in the voice of a user-supplied reference clip.

    Args:
        ref_audio_path: Path of the reference clip; ``None`` when nothing
            was provided.
        ref_text: Transcript of the clip. When empty or whitespace-only, the
            clip is transcribed with ``model.transcribe``.
        gen_text: Text to speak; surrounding whitespace is stripped.
        speed: Forwarded to ``model.infer`` as ``speed``.
        nfe_steps: Forwarded to ``model.infer`` as ``nfe_step`` after ``int()``.

    Returns:
        ``((sample_rate, waveform), transcript)``: the generated audio and
        the reference transcript that was actually used.

    Raises:
        gr.Error: if no clip was given, if *gen_text* is blank, or if the
            automatic transcription comes back empty.
    """
    if ref_audio_path is None:
        raise gr.Error("Please upload a reference voice clip first.")

    target_text = (gen_text or "").strip()
    if not target_text:
        raise gr.Error("Please enter the text you want spoken in the cloned voice.")

    # Only the model is needed here; the default voice is not used.
    tts = get_model()[0]

    transcript = (ref_text or "").strip()
    if not transcript:
        # No usable transcript supplied: ask the model to transcribe the clip.
        transcript = tts.transcribe(ref_audio_path)
        if not transcript:
            raise gr.Error(
                "Could not auto-transcribe the uploaded clip. "
                "Please type the transcript manually and try again."
            )

    waveform, sample_rate, _ = tts.infer(
        ref_file=ref_audio_path,
        ref_text=transcript,
        gen_text=target_text,
        speed=speed,
        nfe_step=int(nfe_steps),
        remove_silence=True,
    )
    return (sample_rate, waveform), transcript
# Stylesheet passed to gr.Blocks(css=...). The selectors correspond to the
# elem_id values "title" / "subtitle" and the elem_classes value "cpu-note"
# given to the Markdown components in the UI definition.
CSS = """
#title { text-align: center; margin-bottom: 0.5em; }
#subtitle { text-align: center; color: var(--body-text-color-subdued); margin-bottom: 1.5em; }
.cpu-note { font-size: 0.85em; color: var(--body-text-color-subdued); }
"""
# UI definition: a page header, two tabs (default voice / voice cloning) and
# a footer. Each tab pairs an input column with an output column, and its
# button is wired to the matching generate_* function.
with gr.Blocks(css=CSS, title="Plotweaver Nigerian Voice") as demo:
    # --- Header: title, subtitle and hardware / licence notice ---
    gr.Markdown("# Plotweaver Nigerian Voice", elem_id="title")
    gr.Markdown(
        "Nigerian-English text-to-speech, fine-tuned from F5-TTS by Plotweaver AI. "
        "Generate speech in our Nigerian voice, or clone a voice from your own clip.",
        elem_id="subtitle",
    )
    gr.Markdown(
        "Running on free CPU hardware — generation can take **30–90+ seconds** "
        "per request, longer for longer text. Research / non-commercial use only "
        "(base F5-TTS is CC-BY-NC).",
        elem_classes="cpu-note",
    )
    with gr.Tabs():
        # --- Tab 1: speak typed text in the default voice ---
        with gr.Tab("Nigerian Voice (Text → Speech)"):
            with gr.Row():
                # Left column: text input, tuning sliders, trigger button.
                with gr.Column(scale=1):
                    default_text = gr.Textbox(
                        label="Text to speak",
                        placeholder="Type what you want the Nigerian voice to say...",
                        lines=6,
                        max_lines=12,
                    )
                    with gr.Accordion("Advanced settings", open=False):
                        default_speed = gr.Slider(
                            0.5, 2.0, value=1.0, step=0.05, label="Speed"
                        )
                        default_nfe = gr.Slider(
                            8, 64, value=24, step=2,
                            label="Quality steps (NFE)",
                            info="Higher = better quality but slower. 16-24 recommended on CPU.",
                        )
                    default_btn = gr.Button("Generate Speech", variant="primary")
                # Right column: generated audio.
                with gr.Column(scale=1):
                    default_audio_out = gr.Audio(label="Generated audio", type="numpy")
            # Inputs are listed in generate_default_voice's parameter order.
            default_btn.click(
                fn=generate_default_voice,
                inputs=[default_text, default_speed, default_nfe],
                outputs=[default_audio_out],
            )
        # --- Tab 2: clone a voice from an uploaded or recorded clip ---
        with gr.Tab("Clone a Voice (Upload Clip → Speech)"):
            gr.Markdown(
                "Upload a short, clean voice clip (5-15 seconds works best). "
                "Add the transcript of that clip if you have it — or leave it "
                "blank and we'll auto-transcribe it with Whisper."
            )
            with gr.Row():
                # Left column: reference clip, optional transcript, target
                # text, tuning sliders, trigger button.
                with gr.Column(scale=1):
                    # type="filepath": the handler receives a path string.
                    ref_audio_in = gr.Audio(
                        label="Reference voice clip",
                        type="filepath",
                        sources=["upload", "microphone"],
                    )
                    ref_text_in = gr.Textbox(
                        label="Transcript of the clip (optional — auto-transcribed if left blank)",
                        placeholder="Leave blank to auto-transcribe with Whisper...",
                        lines=3,
                    )
                    clone_gen_text = gr.Textbox(
                        label="Text to speak in this cloned voice",
                        placeholder="Type what you want spoken in the uploaded voice...",
                        lines=5,
                        max_lines=12,
                    )
                    with gr.Accordion("Advanced settings", open=False):
                        clone_speed = gr.Slider(
                            0.5, 2.0, value=1.0, step=0.05, label="Speed"
                        )
                        clone_nfe = gr.Slider(
                            8, 64, value=24, step=2,
                            label="Quality steps (NFE)",
                            info="Higher = better quality but slower. 16-24 recommended on CPU.",
                        )
                    clone_btn = gr.Button("Generate Cloned Speech", variant="primary")
                # Right column: generated audio plus the transcript that was
                # used for the reference clip (read-only).
                with gr.Column(scale=1):
                    clone_audio_out = gr.Audio(label="Generated audio", type="numpy")
                    used_transcript_out = gr.Textbox(
                        label="Transcript used for the reference clip",
                        interactive=False,
                    )
            # Inputs follow generate_cloned_voice's parameter order; its two
            # return values map onto the two outputs.
            clone_btn.click(
                fn=generate_cloned_voice,
                inputs=[ref_audio_in, ref_text_in, clone_gen_text, clone_speed, clone_nfe],
                outputs=[clone_audio_out, used_transcript_out],
            )
    # --- Footer: attribution and licence reminder ---
    gr.Markdown(
        "---\nBuilt on [F5-TTS](https://github.com/SWivid/F5-TTS) "
        "(CC-BY-NC license) · Fine-tuned voice by Plotweaver AI · "
        "Non-commercial / research use.",
        elem_classes="cpu-note",
    )
# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()