# Hugging Face Space: Tokenizer / app.py
# Author: britto224 — "Update app.py" — commit 4c4af4f (verified)
"""
Kanade Tokenizer β€” Text-to-Audio with Voice Cloning
=====================================================
v3 fixes:
- kokoro pinned to 0.7.16 (Python 3.13 compatible; 0.9.x requires <3.13)
- espeak-ng installed via packages.txt (OS-level, not pip)
- Gradio 6 API: theme/css passed to launch(), not Blocks()
- No internet required β€” 100% offline inference
Pipeline:
Text β†’ Kokoro TTS (offline) β†’ intermediate WAV
Reference Audio β†’ Kanade encode β†’ global_embedding (WHO)
intermediate WAV β†’ Kanade encode β†’ content_token_indices (WHAT)
Kanade decode(content_tokens + speaker_embedding) β†’ mel
Vocoder β†’ final WAV βœ…
"""
import os
import tempfile
import numpy as np
import torch
import soundfile as sf
import gradio as gr
from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
from kokoro import KPipeline
# ─────────────────────────────────────────────────────────────────────────────
# Global setup: runs once at import time (module-level side effects are
# intentional here — Spaces load models at startup so requests are fast).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when available
MODEL_ID = "frothywater/kanade-25hz-clean"  # Kanade checkpoint on the HF Hub
KOKORO_SR = 24000  # Kokoro's native output sample rate (Hz)
print(f"[init] Loading Kanade ({DEVICE})…")
kanade = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
# Vocoder choice comes from the Kanade checkpoint's own config.
vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE)
SR = kanade.config.sample_rate  # 16000
print("[init] Kanade ready.")
print("[init] Loading Kokoro TTS…")
# Two pipelines are kept resident; tts_to_wav() picks one by lang code.
_kokoro_us = KPipeline(lang_code='a')   # American English
_kokoro_uk = KPipeline(lang_code='b')   # British English
print("[init] All models ready.")
# ── Voice menu ────────────────────────────────────────────────────────────────
# Maps UI dropdown label -> (Kokoro lang code, Kokoro voice id).
VOICES = {
    "πŸ‡ΊπŸ‡Έ Female β€” Heart (warm)": ("a", "af_heart"),
    "πŸ‡ΊπŸ‡Έ Female β€” Bella (smooth)": ("a", "af_bella"),
    "πŸ‡ΊπŸ‡Έ Female β€” Nicole (breathy)": ("a", "af_nicole"),
    "πŸ‡ΊπŸ‡Έ Female β€” Sarah": ("a", "af_sarah"),
    "πŸ‡ΊπŸ‡Έ Male β€” Adam": ("a", "am_adam"),
    "πŸ‡ΊπŸ‡Έ Male β€” Michael": ("a", "am_michael"),
    "πŸ‡¬πŸ‡§ Female β€” Emma": ("b", "bf_emma"),
    "πŸ‡¬πŸ‡§ Male β€” George": ("b", "bm_george"),
    "πŸ‡¬πŸ‡§ Male β€” Lewis": ("b", "bm_lewis"),
}
# ── helpers ───────────────────────────────────────────────────────────────────
def tts_to_wav(text: str, lang: str, voice_id: str) -> str:
    """Run Kokoro TTS (offline) and return a temp WAV at the Kanade sample rate.

    Args:
        text: Text to synthesise.
        lang: Kokoro language code ('a' = American English, else British).
        voice_id: Kokoro voice identifier (e.g. "af_heart").

    Returns:
        Path to a temporary 16 kHz mono WAV; the caller must delete it.

    Raises:
        RuntimeError: If Kokoro yields no audio chunks for the given text.
    """
    pipe = _kokoro_us if lang == 'a' else _kokoro_uk
    # np.asarray guards against Kokoro versions that yield torch tensors
    # rather than numpy arrays for each generated chunk.
    chunks = [np.asarray(audio) for _, _, audio in pipe(text, voice=voice_id, speed=1.0)]
    if not chunks:
        raise RuntimeError("Kokoro produced no audio. Check your text.")
    audio_24k = np.concatenate(chunks)
    import librosa  # deferred: heavy import, only needed for resampling
    audio_16k = librosa.resample(audio_24k, orig_sr=KOKORO_SR, target_sr=SR)
    # mkstemp + immediate close: NamedTemporaryFile(delete=False) kept an
    # open handle while soundfile wrote to the same path, which leaks the
    # descriptor and fails outright on Windows.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    sf.write(wav_path, audio_16k, SR)
    return wav_path
def load_tensor(path: str) -> torch.Tensor:
    """Read the audio file at *path*, resampled to Kanade's rate, on DEVICE."""
    waveform = load_audio(path, sample_rate=SR)
    return waveform.to(DEVICE)
# ── inference ─────────────────────────────────────────────────────────────────
def synthesize(text, voice_label, ref_audio_path, speed):
    """Full pipeline: text → Kokoro TTS → Kanade voice conversion → audio.

    Args:
        text: Text to speak.
        voice_label: Key into VOICES selecting the base TTS voice.
        ref_audio_path: Path of the reference clip whose voice is cloned.
        speed: Playback speed factor (1.0 = unchanged).

    Returns:
        (sample_rate, waveform ndarray) as expected by gr.Audio(type="numpy").

    Raises:
        gr.Error: On empty text or missing reference audio.
    """
    if not text.strip():
        raise gr.Error("Please enter some text.")
    if ref_audio_path is None:
        raise gr.Error("Please upload a reference audio clip.")
    lang, voice_id = VOICES[voice_label]

    gr.Info("Step 1/4 β€” Synthesising text with Kokoro (offline)…")
    tts_path = tts_to_wav(text, lang, voice_id)
    gr.Info("Step 2/4 β€” Extracting content tokens…")
    try:
        tts_wav = load_tensor(tts_path)
    finally:
        # Remove the intermediate WAV even if loading fails (leak fix).
        os.unlink(tts_path)
    with torch.inference_mode():
        tts_feat = kanade.encode(tts_wav)

    gr.Info("Step 3/4 β€” Extracting speaker embedding from reference…")
    ref_wav = load_tensor(ref_audio_path)
    with torch.inference_mode():
        ref_feat = kanade.encode(ref_wav)

    gr.Info("Step 4/4 β€” Decoding with cloned voice…")
    with torch.inference_mode():
        # WHAT (content tokens) comes from the TTS audio,
        # WHO (global embedding) from the reference clip.
        mel = kanade.decode(
            content_token_indices=tts_feat.content_token_indices,
            global_embedding=ref_feat.global_embedding,
        )
        waveform = vocode(vocoder, mel.unsqueeze(0))
    audio_np = waveform.squeeze().cpu().float().numpy()

    # The old strict `> 0.05` threshold exactly excluded the slider's
    # adjacent 0.95/1.05 steps (step=0.05), silently ignoring them.
    # A small epsilon skips only a true 1.0 setting.
    if abs(speed - 1.0) > 1e-3:
        import librosa  # deferred heavy import
        audio_np = librosa.effects.time_stretch(audio_np, rate=speed)
    return int(SR), audio_np
# ── UI ────────────────────────────────────────────────────────────────────────
# Custom CSS: center the title/banner, hide the Gradio footer.
CSS = """
#title { text-align: center; }
#banner { text-align: center; color: #6366f1; }
footer { display: none !important; }
"""
# NOTE: component creation order defines the rendered layout — do not reorder.
with gr.Blocks(title="Kanade TTS Voice Cloner") as demo:
    gr.Markdown("# πŸŽ™οΈ Kanade β€” Text-to-Audio with Voice Cloning", elem_id="title")
    gr.Markdown(
        "Enter text Β· Upload a **reference audio** Β· Get your text spoken "
        "**in that person's voice** β€” fully offline.",
        elem_id="banner",
    )
    with gr.Row():
        with gr.Column(scale=3):
            # Left column: what to say and how (base voice supplies content only;
            # the cloned timbre comes from the reference clip on the right).
            text_in = gr.Textbox(label="πŸ“ Text to synthesise", lines=5,
                                 placeholder="Type anything here…")
            voice_dd = gr.Dropdown(label="πŸ”Š Base TTS voice (content only)",
                                   choices=list(VOICES), value=list(VOICES)[0])
            speed_sl = gr.Slider(label="⏩ Speed", minimum=0.7, maximum=1.5,
                                 value=1.0, step=0.05)
        with gr.Column(scale=2):
            # Right column: the voice to clone (filepath is passed to synthesize).
            ref_audio = gr.Audio(label="🎀 Reference audio (voice to clone)",
                                 type="filepath",
                                 sources=["upload", "microphone"])
            gr.Markdown("πŸ’‘ 5–30 sec Β· clean speech Β· single speaker")
    btn = gr.Button("πŸš€ Generate", variant="primary", size="lg")
    # type="numpy" matches synthesize()'s (sample_rate, ndarray) return value.
    out = gr.Audio(label="πŸ”ˆ Output", type="numpy")
    btn.click(fn=synthesize,
              inputs=[text_in, voice_dd, ref_audio, speed_sl],
              outputs=out)
    gr.Markdown(
        "---\n"
        "**Models:** "
        "[`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean) Β· "
        "[`hexgrad/Kokoro-82M`](https://huggingface.co/hexgrad/Kokoro-82M)"
    )
if __name__ == "__main__":
    # NOTE(review): the header docstring claims Gradio 6 accepts theme/css in
    # launch(); on Gradio ≤5 these kwargs belong to gr.Blocks() instead —
    # confirm against the pinned Gradio version before changing.
    demo.launch(theme=gr.themes.Soft(), css=CSS)