Spaces:

PlotweaverAI
/

PlotweaverNigerianVoice

Sleeping

App Files Files Community

PlotweaverNigerianVoice / app.py

offiongbassey

Update app

4fc0704 verified 12 days ago

Raw

History Blame Contribute Delete

10.5 kB

	"""
	PlotweaverNigerianVoice
	A Gradio Space for the PlotweaverAI Nigerian-English fine-tuned F5-TTS model.
	Two modes:
	1. Default voice: type text -> generated speech using the Nigerian-English
	voice baked into this Space (sample.wav / sample.txt from the model repo).
	2. Custom voice clone: upload your own short reference clip (+ transcript,
	or leave blank to auto-transcribe) -> generated speech in that voice.
	Note: F5-TTS ships its own Whisper-based transcription (via transformers)
	and auto-trims reference audio. When the reference transcript is left blank
	we call the model's transcribe() helper instead of re-implementing it here.
	Model: PlotweaverAI/nigerian-english-ft-tts (private HF model repo)
	Base architecture: F5-TTS (SWivid/F5-TTS)
	"""

	import os
	import threading

	import gradio as gr
	import torch
	from huggingface_hub import hf_hub_download

	# --- Optional: only present when running on HF Spaces with a GPU tier ----
	# `spaces` is the Hugging Face Spaces helper package; it is optional so the
	# app can also run where the package is not installed.
	try:
	    import spaces
	except ImportError:
	    ON_SPACES = False
	else:
	    ON_SPACES = True


	def gpu_decorator(func):
	    """Return *func* wrapped in ``spaces.GPU`` only when CUDA is really there.

	    On a CPU Space the ``spaces`` package still imports, so checking
	    ON_SPACES alone would wrap CPU-only inference in ``spaces.GPU`` and
	    raise at call time. The wrapper is therefore applied only when the
	    package imported AND torch reports CUDA as available; in every other
	    case the function is handed back untouched.
	    """
	    if not (ON_SPACES and torch.cuda.is_available()):
	        return func
	    return spaces.GPU(func)


	# F5-TTS imports (package: f5-tts, installed from PyPI / git in requirements.txt)
	from f5_tts.api import F5TTS


	# Hugging Face model repo holding the fine-tuned checkpoint and sample voice.
	MODEL_REPO = "PlotweaverAI/nigerian-english-ft-tts"

	# Read token for the (private) model repo; configured as a Space secret.
	# Stays None when the variable is not set.
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Architecture the checkpoint was fine-tuned from. F5TTS_v1_Base is the current
	# default for new finetunes; set the F5TTS_ARCH secret/variable to override it
	# when the training run used a different base (e.g. "F5TTS_Base").
	MODEL_ARCH = os.getenv("F5TTS_ARCH", "F5TTS_v1_Base")

	# Inference device: CUDA when torch can see it, CPU otherwise.
	DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"


	# State for the lazily-loaded model; populated by get_model().
	_model_lock = threading.Lock()
	_tts_model = None
	_default_ref_audio = None
	_default_ref_text = None


	def _download_model_files():
	    """Fetch the checkpoint, vocab and bundled sample voice from MODEL_REPO.

	    Each file is pulled with ``hf_hub_download`` using HF_TOKEN (the repo is
	    private, so the token needs read access).

	    Returns:
	        A 4-tuple ``(ckpt_path, vocab_path, sample_wav_path, sample_text)``
	        where the first three are the values returned by ``hf_hub_download``
	        and ``sample_text`` is the stripped UTF-8 content of ``sample.txt``.
	    """

	    def _fetch(filename):
	        # Single place that knows the repo id and the auth token.
	        return hf_hub_download(
	            repo_id=MODEL_REPO, filename=filename, token=HF_TOKEN
	        )

	    # Files are requested one after another, in this order.
	    ckpt_path, vocab_path, sample_wav_path, sample_txt_path = (
	        _fetch(name)
	        for name in ("model_last.pt", "vocab.txt", "sample.wav", "sample.txt")
	    )

	    with open(sample_txt_path, "r", encoding="utf-8") as handle:
	        sample_text = handle.read().strip()

	    return ckpt_path, vocab_path, sample_wav_path, sample_text


	def get_model():
	    """Lazily load the F5-TTS model + default reference voice exactly once.

	    The first call downloads the model files and constructs the F5TTS
	    instance; later calls return the cached objects. Loading is guarded by
	    ``_model_lock`` with a check before and after acquiring it, so
	    concurrent first calls trigger only one load.

	    Returns:
	        A 3-tuple ``(model, default_ref_audio, default_ref_text)``.

	    Raises:
	        RuntimeError: if constructing F5TTS raises RuntimeError; the
	            re-raised error adds a hint about the F5TTS_ARCH setting and
	            chains the original exception.
	    """
	    global _tts_model, _default_ref_audio, _default_ref_text

	    # Lock-free fast path. This is only safe because _tts_model is the LAST
	    # global assigned by the loader below: once it is visible, the reference
	    # audio/text globals are already populated.
	    if _tts_model is not None:
	        return _tts_model, _default_ref_audio, _default_ref_text

	    with _model_lock:
	        # Re-check under the lock: another thread may have finished loading
	        # while this one was waiting.
	        if _tts_model is None:
	            ckpt_path, vocab_path, sample_wav_path, sample_text = (
	                _download_model_files()
	            )

	            try:
	                model = F5TTS(
	                    model=MODEL_ARCH,
	                    ckpt_file=ckpt_path,
	                    vocab_file=vocab_path,
	                    device=DEVICE,
	                )
	            except RuntimeError as e:
	                raise RuntimeError(
	                    f"Failed to load checkpoint with architecture '{MODEL_ARCH}'. "
	                    "If this fine-tune was trained from a different F5-TTS base "
	                    "(e.g. 'F5TTS_Base' instead of 'F5TTS_v1_Base'), set the "
	                    "F5TTS_ARCH variable in your Space settings to match. "
	                    f"Original error: {e}"
	                ) from e

	            # Publish the reference voice first and the model last, so a
	            # caller on the fast path can never observe a non-None model
	            # paired with a still-None reference clip or transcript.
	            _default_ref_audio = sample_wav_path
	            _default_ref_text = sample_text
	            _tts_model = model

	        return _tts_model, _default_ref_audio, _default_ref_text


	@gpu_decorator
	def generate_default_voice(text: str, speed: float, nfe_steps: int):
	    """Synthesize *text* with the default reference voice from get_model().

	    Args:
	        text: Text to speak; leading/trailing whitespace is stripped.
	        speed: Passed through to ``model.infer`` as ``speed``.
	        nfe_steps: Converted with ``int()`` and passed as ``nfe_step``.

	    Returns:
	        ``(sr, wav)`` built from the first two values returned by
	        ``model.infer`` (the third is discarded).

	    Raises:
	        gr.Error: if *text* is empty or whitespace-only.
	    """
	    prompt = text.strip() if text else ""
	    if not prompt:
	        raise gr.Error("Please enter some text to generate speech for.")

	    tts, voice_clip, voice_transcript = get_model()

	    infer_kwargs = dict(
	        ref_file=voice_clip,
	        ref_text=voice_transcript,
	        gen_text=prompt,
	        speed=speed,
	        nfe_step=int(nfe_steps),
	        remove_silence=True,
	    )
	    waveform, sample_rate, _ = tts.infer(**infer_kwargs)

	    return sample_rate, waveform


	@gpu_decorator
	def generate_cloned_voice(
	    ref_audio_path: str,
	    ref_text: str,
	    gen_text: str,
	    speed: float,
	    nfe_steps: int,
	):
	    """Synthesize *gen_text* using an uploaded clip as the reference voice.

	    Args:
	        ref_audio_path: Path of the reference clip; ``None`` means nothing
	            was uploaded.
	        ref_text: Transcript of the clip. When empty/blank the clip is
	            transcribed with ``model.transcribe``.
	        gen_text: Text to speak; leading/trailing whitespace is stripped.
	        speed: Passed through to ``model.infer`` as ``speed``.
	        nfe_steps: Converted with ``int()`` and passed as ``nfe_step``.

	    Returns:
	        ``((sr, wav), transcript)`` where ``transcript`` is the reference
	        transcript that was actually used.

	    Raises:
	        gr.Error: if no clip was provided, *gen_text* is blank, or the
	            automatic transcription comes back empty.
	    """
	    if ref_audio_path is None:
	        raise gr.Error("Please upload a reference voice clip first.")

	    target_text = gen_text.strip() if gen_text else ""
	    if not target_text:
	        raise gr.Error("Please enter the text you want spoken in the cloned voice.")

	    tts, _, _ = get_model()

	    # Prefer the user-supplied transcript; fall back to auto-transcription
	    # only when it is blank. F5-TTS's transcribe() uses a Whisper pipeline
	    # under the hood.
	    transcript = (ref_text or "").strip() or tts.transcribe(ref_audio_path)
	    if not transcript:
	        raise gr.Error(
	            "Could not auto-transcribe the uploaded clip. "
	            "Please type the transcript manually and try again."
	        )

	    infer_kwargs = dict(
	        ref_file=ref_audio_path,
	        ref_text=transcript,
	        gen_text=target_text,
	        speed=speed,
	        nfe_step=int(nfe_steps),
	        remove_silence=True,
	    )
	    waveform, sample_rate, _ = tts.infer(**infer_kwargs)

	    return (sample_rate, waveform), transcript

	CSS = """
	#title { text-align: center; margin-bottom: 0.5em; }
	#subtitle { text-align: center; color: var(--body-text-color-subdued); margin-bottom: 1.5em; }
	.cpu-note { font-size: 0.85em; color: var(--body-text-color-subdued); }
	"""

	# Static copy shown in the UI, kept out of the layout code for readability.
	_SUBTITLE_MD = (
	    "Nigerian-English text-to-speech, fine-tuned from F5-TTS by Plotweaver AI. "
	    "Generate speech in our Nigerian voice, or clone a voice from your own clip."
	)
	_CPU_NOTE_MD = (
	    "Running on free CPU hardware — generation can take 30–90+ seconds "
	    "per request, longer for longer text. Research / non-commercial use only "
	    "(base F5-TTS is CC-BY-NC)."
	)
	_CLONE_HELP_MD = (
	    "Upload a short, clean voice clip (5-15 seconds works best). "
	    "Add the transcript of that clip if you have it — or leave it "
	    "blank and we'll auto-transcribe it with Whisper."
	)
	_FOOTER_MD = (
	    "---\nBuilt on [F5-TTS](https://github.com/SWivid/F5-TTS) "
	    "(CC-BY-NC license) · Fine-tuned voice by Plotweaver AI · "
	    "Non-commercial / research use."
	)


	def _advanced_settings():
	    """Build the collapsed "Advanced settings" accordion used by both tabs.

	    Must be called while a Gradio layout context is open. Returns the
	    ``(speed, nfe)`` slider pair created inside the accordion.
	    """
	    with gr.Accordion("Advanced settings", open=False):
	        speed_slider = gr.Slider(
	            minimum=0.5, maximum=2.0, value=1.0, step=0.05, label="Speed"
	        )
	        nfe_slider = gr.Slider(
	            minimum=8,
	            maximum=64,
	            value=24,
	            step=2,
	            label="Quality steps (NFE)",
	            info="Higher = better quality but slower. 16-24 recommended on CPU.",
	        )
	    return speed_slider, nfe_slider


	with gr.Blocks(css=CSS, title="Plotweaver Nigerian Voice") as demo:
	    # Page header: title, subtitle and the hardware / licence note.
	    gr.Markdown("# Plotweaver Nigerian Voice", elem_id="title")
	    gr.Markdown(_SUBTITLE_MD, elem_id="subtitle")
	    gr.Markdown(_CPU_NOTE_MD, elem_classes="cpu-note")

	    with gr.Tabs():

	        # --- Tab 1: built-in Nigerian voice --------------------------------
	        with gr.Tab("Nigerian Voice (Text → Speech)"):
	            with gr.Row():
	                with gr.Column(scale=1):
	                    default_text = gr.Textbox(
	                        label="Text to speak",
	                        placeholder="Type what you want the Nigerian voice to say...",
	                        lines=6,
	                        max_lines=12,
	                    )
	                    default_speed, default_nfe = _advanced_settings()
	                    default_btn = gr.Button("Generate Speech", variant="primary")
	                with gr.Column(scale=1):
	                    default_audio_out = gr.Audio(
	                        label="Generated audio", type="numpy"
	                    )

	            default_btn.click(
	                fn=generate_default_voice,
	                inputs=[default_text, default_speed, default_nfe],
	                outputs=[default_audio_out],
	            )

	        # --- Tab 2: clone a voice from an uploaded / recorded clip ---------
	        with gr.Tab("Clone a Voice (Upload Clip → Speech)"):
	            gr.Markdown(_CLONE_HELP_MD)
	            with gr.Row():
	                with gr.Column(scale=1):
	                    ref_audio_in = gr.Audio(
	                        label="Reference voice clip",
	                        type="filepath",
	                        sources=["upload", "microphone"],
	                    )
	                    ref_text_in = gr.Textbox(
	                        label="Transcript of the clip (optional — auto-transcribed if left blank)",
	                        placeholder="Leave blank to auto-transcribe with Whisper...",
	                        lines=3,
	                    )
	                    clone_gen_text = gr.Textbox(
	                        label="Text to speak in this cloned voice",
	                        placeholder="Type what you want spoken in the uploaded voice...",
	                        lines=5,
	                        max_lines=12,
	                    )
	                    clone_speed, clone_nfe = _advanced_settings()
	                    clone_btn = gr.Button(
	                        "Generate Cloned Speech", variant="primary"
	                    )
	                with gr.Column(scale=1):
	                    clone_audio_out = gr.Audio(
	                        label="Generated audio", type="numpy"
	                    )
	                    used_transcript_out = gr.Textbox(
	                        label="Transcript used for the reference clip",
	                        interactive=False,
	                    )

	            clone_btn.click(
	                fn=generate_cloned_voice,
	                inputs=[
	                    ref_audio_in,
	                    ref_text_in,
	                    clone_gen_text,
	                    clone_speed,
	                    clone_nfe,
	                ],
	                outputs=[clone_audio_out, used_transcript_out],
	            )

	    # Footer with attribution and licence reminder.
	    gr.Markdown(_FOOTER_MD, elem_classes="cpu-note")

	# Start the Gradio server only when executed as a script.
	if __name__ == "__main__":
	    demo.launch()