""" Kanade Tokenizer — Text-to-Audio with Voice Cloning ===================================================== v3 fixes: - kokoro pinned to 0.7.16 (Python 3.13 compatible; 0.9.x requires <3.13) - espeak-ng installed via packages.txt (OS-level, not pip) - Gradio 6 API: theme/css passed to launch(), not Blocks() - No internet required — 100% offline inference Pipeline: Text → Kokoro TTS (offline) → intermediate WAV Reference Audio → Kanade encode → global_embedding (WHO) intermediate WAV → Kanade encode → content_token_indices (WHAT) Kanade decode(content_tokens + speaker_embedding) → mel Vocoder → final WAV ✅ """ import os import tempfile import numpy as np import torch import soundfile as sf import gradio as gr from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode from kokoro import KPipeline # ───────────────────────────────────────────────────────────────────────────── DEVICE = "cuda" if torch.cuda.is_available() else "cpu" MODEL_ID = "frothywater/kanade-25hz-clean" KOKORO_SR = 24000 print(f"[init] Loading Kanade ({DEVICE})…") kanade = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE) vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE) SR = kanade.config.sample_rate # 16000 print("[init] Kanade ready.") print("[init] Loading Kokoro TTS…") _kokoro_us = KPipeline(lang_code='a') # American English _kokoro_uk = KPipeline(lang_code='b') # British English print("[init] All models ready.") # ── Voice menu ──────────────────────────────────────────────────────────────── VOICES = { "🇺🇸 Female — Heart (warm)": ("a", "af_heart"), "🇺🇸 Female — Bella (smooth)": ("a", "af_bella"), "🇺🇸 Female — Nicole (breathy)": ("a", "af_nicole"), "🇺🇸 Female — Sarah": ("a", "af_sarah"), "🇺🇸 Male — Adam": ("a", "am_adam"), "🇺🇸 Male — Michael": ("a", "am_michael"), "🇬🇧 Female — Emma": ("b", "bf_emma"), "🇬🇧 Male — George": ("b", "bm_george"), "🇬🇧 Male — Lewis": ("b", "bm_lewis"), } # ── helpers 
# ── helpers ───────────────────────────────────────────────────────────────────
def tts_to_wav(text: str, lang: str, voice_id: str) -> str:
    """Kokoro TTS (offline) → temp WAV at Kanade sample rate.

    Returns the path of a temporary WAV file; the caller is responsible
    for deleting it.

    Raises:
        RuntimeError: if Kokoro yields no audio chunks for the text.
    """
    pipe = _kokoro_us if lang == 'a' else _kokoro_uk
    chunks = [audio for _, _, audio in pipe(text, voice=voice_id, speed=1.0)]
    if not chunks:
        raise RuntimeError("Kokoro produced no audio. Check your text.")
    audio_24k = np.concatenate(chunks)
    # Skip the resample entirely when the rates already match (no-op otherwise).
    if SR != KOKORO_SR:
        import librosa
        audio_out = librosa.resample(audio_24k, orig_sr=KOKORO_SR, target_sr=SR)
    else:
        audio_out = audio_24k
    # mkstemp + close the OS handle BEFORE sf.write re-opens the path:
    # keeping a NamedTemporaryFile open while writing to tmp.name fails on
    # Windows (file locked by the open handle).
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    sf.write(path, audio_out, SR)
    return path


def load_tensor(path: str) -> torch.Tensor:
    """Load an audio file at the Kanade sample rate onto the active device."""
    return load_audio(path, sample_rate=SR).to(DEVICE)


# ── inference ─────────────────────────────────────────────────────────────────
def synthesize(text, voice_label, ref_audio_path, speed):
    """Full pipeline: text → Kokoro TTS → Kanade voice conversion → audio.

    Args:
        text: text to speak.
        voice_label: key into VOICES selecting the base TTS voice.
        ref_audio_path: path to the reference clip whose voice is cloned.
        speed: playback-speed factor applied as a final time-stretch.

    Returns:
        (sample_rate, waveform) tuple for a Gradio numpy Audio output.

    Raises:
        gr.Error: on empty text or missing reference audio.
    """
    if not text.strip():
        raise gr.Error("Please enter some text.")
    if ref_audio_path is None:
        raise gr.Error("Please upload a reference audio clip.")
    lang, voice_id = VOICES[voice_label]

    gr.Info("Step 1/4 — Synthesising text with Kokoro (offline)…")
    tts_path = tts_to_wav(text, lang, voice_id)

    gr.Info("Step 2/4 — Extracting content tokens…")
    # finally: the temp WAV must be removed even if load_tensor raises —
    # the original only unlinked on success, leaking the file on error.
    try:
        tts_wav = load_tensor(tts_path)
    finally:
        os.unlink(tts_path)
    with torch.inference_mode():
        tts_feat = kanade.encode(tts_wav)

    gr.Info("Step 3/4 — Extracting speaker embedding from reference…")
    ref_wav = load_tensor(ref_audio_path)
    with torch.inference_mode():
        ref_feat = kanade.encode(ref_wav)

    gr.Info("Step 4/4 — Decoding with cloned voice…")
    with torch.inference_mode():
        # Content (WHAT is said) from the TTS pass, speaker identity
        # (WHO says it) from the reference clip.
        mel = kanade.decode(
            content_token_indices=tts_feat.content_token_indices,
            global_embedding=ref_feat.global_embedding,
        )
        waveform = vocode(vocoder, mel.unsqueeze(0))

    audio_np = waveform.squeeze().cpu().float().numpy()
    # Only stretch when the slider moved meaningfully off 1.0.
    if abs(speed - 1.0) > 0.05:
        import librosa
        audio_np = librosa.effects.time_stretch(audio_np, rate=speed)
    return int(SR), audio_np
# ── UI ────────────────────────────────────────────────────────────────────────
CSS = """
#title { text-align: center; }
#banner { text-align: center; color: #6366f1; }
footer { display: none !important; }
"""

with gr.Blocks(title="Kanade TTS Voice Cloner") as demo:
    gr.Markdown("# 🎙️ Kanade — Text-to-Audio with Voice Cloning", elem_id="title")
    gr.Markdown(
        "Enter text · Upload a **reference audio** · Get your text spoken "
        "**in that person's voice** — fully offline.",
        elem_id="banner",
    )

    with gr.Row():
        # Left column: the text and base-voice controls.
        with gr.Column(scale=3):
            text_in = gr.Textbox(
                label="📝 Text to synthesise",
                lines=5,
                placeholder="Type anything here…",
            )
            voice_choices = list(VOICES)
            voice_dd = gr.Dropdown(
                label="🔊 Base TTS voice (content only)",
                choices=voice_choices,
                value=voice_choices[0],
            )
            speed_sl = gr.Slider(
                label="⏩ Speed", minimum=0.7, maximum=1.5, value=1.0, step=0.05,
            )
        # Right column: the voice to clone.
        with gr.Column(scale=2):
            ref_audio = gr.Audio(
                label="🎤 Reference audio (voice to clone)",
                type="filepath",
                sources=["upload", "microphone"],
            )
            gr.Markdown("💡 5–30 sec · clean speech · single speaker")

    btn = gr.Button("🚀 Generate", variant="primary", size="lg")
    out = gr.Audio(label="🔈 Output", type="numpy")
    btn.click(
        fn=synthesize,
        inputs=[text_in, voice_dd, ref_audio, speed_sl],
        outputs=out,
    )

    gr.Markdown(
        "---\n"
        "**Models:** "
        "[`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean) · "
        "[`hexgrad/Kokoro-82M`](https://huggingface.co/hexgrad/Kokoro-82M)"
    )

if __name__ == "__main__":
    # Per the header note, theme/css go to launch() under this Gradio version.
    demo.launch(theme=gr.themes.Soft(), css=CSS)