# Hugging Face Space: Tokenizer / app.py
# Author: britto224 — "Update app.py" — commit 4c4af4f (verified)
"""
Kanade Tokenizer β€” Text-to-Audio with Voice Cloning
=====================================================
v3 fixes:
- kokoro pinned to 0.7.16 (Python 3.13 compatible; 0.9.x requires <3.13)
- espeak-ng installed via packages.txt (OS-level, not pip)
- Gradio 6 API: theme/css passed to launch(), not Blocks()
- No internet required β€” 100% offline inference
Pipeline:
Text β†’ Kokoro TTS (offline) β†’ intermediate WAV
Reference Audio β†’ Kanade encode β†’ global_embedding (WHO)
intermediate WAV β†’ Kanade encode β†’ content_token_indices (WHAT)
Kanade decode(content_tokens + speaker_embedding) β†’ mel
Vocoder β†’ final WAV βœ…
"""
import os
import tempfile
import numpy as np
import torch
import soundfile as sf
import gradio as gr
from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
from kokoro import KPipeline
# ─────────────────────────────────────────────────────────────────────────────
# Global setup: runs once at import time (module-level side effects are
# intentional here — Spaces load models at startup so requests are fast).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when available
MODEL_ID = "frothywater/kanade-25hz-clean"  # Kanade checkpoint on the HF Hub
KOKORO_SR = 24000  # Kokoro's native output sample rate (Hz)
print(f"[init] Loading Kanade ({DEVICE})…")
kanade = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
# Vocoder choice comes from the Kanade checkpoint's own config.
vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE)
SR = kanade.config.sample_rate  # 16000
print("[init] Kanade ready.")
print("[init] Loading Kokoro TTS…")
# Two pipelines are kept resident; tts_to_wav() picks one by lang code.
_kokoro_us = KPipeline(lang_code='a')   # American English
_kokoro_uk = KPipeline(lang_code='b')   # British English
print("[init] All models ready.")
# ── Voice menu ────────────────────────────────────────────────────────────────
# Maps UI dropdown label -> (Kokoro lang code, Kokoro voice id).
VOICES = {
    "πŸ‡ΊπŸ‡Έ Female β€” Heart (warm)": ("a", "af_heart"),
    "πŸ‡ΊπŸ‡Έ Female β€” Bella (smooth)": ("a", "af_bella"),
    "πŸ‡ΊπŸ‡Έ Female β€” Nicole (breathy)": ("a", "af_nicole"),
    "πŸ‡ΊπŸ‡Έ Female β€” Sarah": ("a", "af_sarah"),
    "πŸ‡ΊπŸ‡Έ Male β€” Adam": ("a", "am_adam"),
    "πŸ‡ΊπŸ‡Έ Male β€” Michael": ("a", "am_michael"),
    "πŸ‡¬πŸ‡§ Female β€” Emma": ("b", "bf_emma"),
    "πŸ‡¬πŸ‡§ Male β€” George": ("b", "bm_george"),
    "πŸ‡¬πŸ‡§ Male β€” Lewis": ("b", "bm_lewis"),
}
# ── helpers ───────────────────────────────────────────────────────────────────
def tts_to_wav(text: str, lang: str, voice_id: str) -> str:
    """Run Kokoro TTS (offline) and return a temp WAV at the Kanade sample rate.

    Args:
        text: Text to synthesise.
        lang: Kokoro language code ('a' = American English, else British).
        voice_id: Kokoro voice identifier (e.g. "af_heart").

    Returns:
        Path to a temporary 16 kHz mono WAV; the caller must delete it.

    Raises:
        RuntimeError: If Kokoro yields no audio chunks for the given text.
    """
    pipe = _kokoro_us if lang == 'a' else _kokoro_uk
    # np.asarray guards against Kokoro versions that yield torch tensors
    # rather than numpy arrays for each generated chunk.
    chunks = [np.asarray(audio) for _, _, audio in pipe(text, voice=voice_id, speed=1.0)]
    if not chunks:
        raise RuntimeError("Kokoro produced no audio. Check your text.")
    audio_24k = np.concatenate(chunks)
    import librosa  # deferred: heavy import, only needed for resampling
    audio_16k = librosa.resample(audio_24k, orig_sr=KOKORO_SR, target_sr=SR)
    # mkstemp + immediate close: NamedTemporaryFile(delete=False) kept an
    # open handle while soundfile wrote to the same path, which leaks the
    # descriptor and fails outright on Windows.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    sf.write(wav_path, audio_16k, SR)
    return wav_path
def load_tensor(path: str) -> torch.Tensor:
    """Read the audio file at *path*, resampled to Kanade's rate, on DEVICE."""
    waveform = load_audio(path, sample_rate=SR)
    return waveform.to(DEVICE)
# ── inference ─────────────────────────────────────────────────────────────────
def synthesize(text, voice_label, ref_audio_path, speed):
    """Full pipeline: text → Kokoro TTS → Kanade voice conversion → audio.

    Args:
        text: Text to speak.
        voice_label: Key into VOICES selecting the base TTS voice.
        ref_audio_path: Path of the reference clip whose voice is cloned.
        speed: Playback speed factor (1.0 = unchanged).

    Returns:
        (sample_rate, waveform ndarray) as expected by gr.Audio(type="numpy").

    Raises:
        gr.Error: On empty text or missing reference audio.
    """
    if not text.strip():
        raise gr.Error("Please enter some text.")
    if ref_audio_path is None:
        raise gr.Error("Please upload a reference audio clip.")
    lang, voice_id = VOICES[voice_label]

    gr.Info("Step 1/4 β€” Synthesising text with Kokoro (offline)…")
    tts_path = tts_to_wav(text, lang, voice_id)
    gr.Info("Step 2/4 β€” Extracting content tokens…")
    try:
        tts_wav = load_tensor(tts_path)
    finally:
        # Remove the intermediate WAV even if loading fails (leak fix).
        os.unlink(tts_path)
    with torch.inference_mode():
        tts_feat = kanade.encode(tts_wav)

    gr.Info("Step 3/4 β€” Extracting speaker embedding from reference…")
    ref_wav = load_tensor(ref_audio_path)
    with torch.inference_mode():
        ref_feat = kanade.encode(ref_wav)

    gr.Info("Step 4/4 β€” Decoding with cloned voice…")
    with torch.inference_mode():
        # WHAT (content tokens) comes from the TTS audio,
        # WHO (global embedding) from the reference clip.
        mel = kanade.decode(
            content_token_indices=tts_feat.content_token_indices,
            global_embedding=ref_feat.global_embedding,
        )
        waveform = vocode(vocoder, mel.unsqueeze(0))
    audio_np = waveform.squeeze().cpu().float().numpy()

    # The old strict `> 0.05` threshold exactly excluded the slider's
    # adjacent 0.95/1.05 steps (step=0.05), silently ignoring them.
    # A small epsilon skips only a true 1.0 setting.
    if abs(speed - 1.0) > 1e-3:
        import librosa  # deferred heavy import
        audio_np = librosa.effects.time_stretch(audio_np, rate=speed)
    return int(SR), audio_np
# ── UI ────────────────────────────────────────────────────────────────────────
# Custom CSS: center the title/banner, hide the Gradio footer.
CSS = """
#title { text-align: center; }
#banner { text-align: center; color: #6366f1; }
footer { display: none !important; }
"""
# NOTE: component creation order defines the rendered layout — do not reorder.
with gr.Blocks(title="Kanade TTS Voice Cloner") as demo:
    gr.Markdown("# πŸŽ™οΈ Kanade β€” Text-to-Audio with Voice Cloning", elem_id="title")
    gr.Markdown(
        "Enter text Β· Upload a **reference audio** Β· Get your text spoken "
        "**in that person's voice** β€” fully offline.",
        elem_id="banner",
    )
    with gr.Row():
        with gr.Column(scale=3):
            # Left column: what to say and how (base voice supplies content only;
            # the cloned timbre comes from the reference clip on the right).
            text_in = gr.Textbox(label="πŸ“ Text to synthesise", lines=5,
                                 placeholder="Type anything here…")
            voice_dd = gr.Dropdown(label="πŸ”Š Base TTS voice (content only)",
                                   choices=list(VOICES), value=list(VOICES)[0])
            speed_sl = gr.Slider(label="⏩ Speed", minimum=0.7, maximum=1.5,
                                 value=1.0, step=0.05)
        with gr.Column(scale=2):
            # Right column: the voice to clone (filepath is passed to synthesize).
            ref_audio = gr.Audio(label="🎀 Reference audio (voice to clone)",
                                 type="filepath",
                                 sources=["upload", "microphone"])
            gr.Markdown("πŸ’‘ 5–30 sec Β· clean speech Β· single speaker")
    btn = gr.Button("πŸš€ Generate", variant="primary", size="lg")
    # type="numpy" matches synthesize()'s (sample_rate, ndarray) return value.
    out = gr.Audio(label="πŸ”ˆ Output", type="numpy")
    btn.click(fn=synthesize,
              inputs=[text_in, voice_dd, ref_audio, speed_sl],
              outputs=out)
    gr.Markdown(
        "---\n"
        "**Models:** "
        "[`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean) Β· "
        "[`hexgrad/Kokoro-82M`](https://huggingface.co/hexgrad/Kokoro-82M)"
    )
if __name__ == "__main__":
    # NOTE(review): the header docstring claims Gradio 6 accepts theme/css in
    # launch(); on Gradio ≤5 these kwargs belong to gr.Blocks() instead —
    # confirm against the pinned Gradio version before changing.
    demo.launch(theme=gr.themes.Soft(), css=CSS)