"""
Kanade Tokenizer — Text-to-Audio with Voice Cloning
===================================================
v3 fixes:
- kokoro pinned to 0.7.16 (Python 3.13 compatible; 0.9.x requires <3.13)
- espeak-ng installed via packages.txt (OS-level, not pip)
- Gradio 6 API: theme/css passed to launch(), not Blocks()
- No internet required — 100% offline inference

Pipeline:
Text → Kokoro TTS (offline) → intermediate WAV
Reference Audio → Kanade encode → global_embedding (WHO)
intermediate WAV → Kanade encode → content_token_indices (WHAT)
Kanade decode(content_tokens + speaker_embedding) → mel
Vocoder → final WAV
"""
|
|
| import os |
| import tempfile |
| import numpy as np |
| import torch |
| import soundfile as sf |
| import gradio as gr |
|
|
| from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode |
| from kokoro import KPipeline |
|
|
| |
# ---- Runtime configuration -------------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "frothywater/kanade-25hz-clean"  # Kanade checkpoint ID (Hugging Face Hub)
KOKORO_SR = 24000  # Kokoro TTS always emits audio at 24 kHz

# ---- Model loading (runs once, at import time) -----------------------------
# NOTE(review): all models are loaded eagerly at module import; a cold start
# therefore pays the full download/load cost before the UI appears.
print(f"[init] Loading Kanade ({DEVICE})β¦")
kanade = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE)
SR = kanade.config.sample_rate  # Kanade's native rate; all audio is resampled to this
print("[init] Kanade ready.")

print("[init] Loading Kokoro TTSβ¦")
_kokoro_us = KPipeline(lang_code='a')  # 'a' = American English voices
_kokoro_uk = KPipeline(lang_code='b')  # 'b' = British English voices
print("[init] All models ready.")
|
|
| |
# UI label -> (Kokoro lang_code, Kokoro voice id).
# The lang_code selects the pipeline ('a' = US, 'b' = UK); the voice id picks
# the base speaker whose *content pronunciation* is used before voice cloning.
VOICES = {
    "πΊπΈ Female β Heart (warm)": ("a", "af_heart"),
    "πΊπΈ Female β Bella (smooth)": ("a", "af_bella"),
    "πΊπΈ Female β Nicole (breathy)": ("a", "af_nicole"),
    "πΊπΈ Female β Sarah": ("a", "af_sarah"),
    "πΊπΈ Male β Adam": ("a", "am_adam"),
    "πΊπΈ Male β Michael": ("a", "am_michael"),
    "π¬π§ Female β Emma": ("b", "bf_emma"),
    "π¬π§ Male β George": ("b", "bm_george"),
    "π¬π§ Male β Lewis": ("b", "bm_lewis"),
}
|
|
| |
|
|
def tts_to_wav(text: str, lang: str, voice_id: str) -> str:
    """Synthesise *text* with Kokoro (offline) and write it to a temp WAV.

    Args:
        text: Text to speak.
        lang: Kokoro lang_code — 'a' (US English) selects the US pipeline,
            anything else falls back to the UK pipeline.
        voice_id: Kokoro voice identifier (e.g. "af_heart").

    Returns:
        Path to a temporary WAV file resampled to Kanade's rate ``SR``.
        The caller owns the file and is responsible for deleting it.

    Raises:
        RuntimeError: if Kokoro yields no audio chunks for the input text.
    """
    import librosa  # local import: already a transitive dependency, keeps startup lean

    pipe = _kokoro_us if lang == 'a' else _kokoro_uk
    # np.asarray normalizes whatever chunk type the pipeline yields
    # (presumably numpy arrays; torch tensors also convert via __array__).
    chunks = [np.asarray(audio) for _, _, audio in pipe(text, voice=voice_id, speed=1.0)]
    if not chunks:
        raise RuntimeError("Kokoro produced no audio. Check your text.")
    audio_24k = np.concatenate(chunks)

    # Only resample when the rates actually differ — Kanade's rate may be 24 kHz.
    if SR != KOKORO_SR:
        audio_out = librosa.resample(audio_24k, orig_sr=KOKORO_SR, target_sr=SR)
    else:
        audio_out = audio_24k

    # mkstemp + close-before-write: writing to a still-open NamedTemporaryFile's
    # name fails on Windows, where the open handle locks the file.
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    sf.write(path, audio_out, SR)
    return path
|
|
|
|
def load_tensor(path: str) -> torch.Tensor:
    """Read *path* at Kanade's sample rate and move the waveform to DEVICE."""
    waveform = load_audio(path, sample_rate=SR)
    return waveform.to(DEVICE)
|
|
|
|
| |
|
|
def synthesize(text, voice_label, ref_audio_path, speed):
    """Full cloning pipeline: text -> Kokoro TTS -> Kanade re-voicing.

    Args:
        text: Text to synthesise.
        voice_label: Key into ``VOICES`` selecting the base TTS voice.
        ref_audio_path: Filepath of the reference clip whose voice is cloned.
        speed: Playback-speed factor; applied via time-stretch when it
            deviates from 1.0 by more than 0.05.

    Returns:
        ``(sample_rate, audio)`` tuple as expected by ``gr.Audio(type="numpy")``.

    Raises:
        gr.Error: on empty text or missing reference audio.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")
    if ref_audio_path is None:
        raise gr.Error("Please upload a reference audio clip.")

    lang, voice_id = VOICES[voice_label]

    gr.Info("Step 1/4 β Synthesising text with Kokoro (offline)β¦")
    tts_path = tts_to_wav(text, lang, voice_id)

    gr.Info("Step 2/4 β Extracting content tokensβ¦")
    # finally: ensures the temp WAV is removed even if loading/encoding raises,
    # instead of leaking one file per failed request.
    try:
        tts_wav = load_tensor(tts_path)
    finally:
        os.unlink(tts_path)
    with torch.inference_mode():
        tts_feat = kanade.encode(tts_wav)  # content tokens = WHAT is said

    gr.Info("Step 3/4 β Extracting speaker embedding from referenceβ¦")
    ref_wav = load_tensor(ref_audio_path)
    with torch.inference_mode():
        ref_feat = kanade.encode(ref_wav)  # global embedding = WHO says it

    gr.Info("Step 4/4 β Decoding with cloned voiceβ¦")
    with torch.inference_mode():
        # Mix-and-match: content from the TTS pass, speaker from the reference.
        mel = kanade.decode(
            content_token_indices=tts_feat.content_token_indices,
            global_embedding=ref_feat.global_embedding,
        )
        waveform = vocode(vocoder, mel.unsqueeze(0))

    audio_np = waveform.squeeze().cpu().float().numpy()

    # Post-hoc speed adjustment; skipped near 1.0 to avoid a needless STFT pass.
    if abs(speed - 1.0) > 0.05:
        import librosa
        audio_np = librosa.effects.time_stretch(audio_np, rate=speed)

    return int(SR), audio_np
|
|
|
|
| |
|
|
# Custom stylesheet passed to demo.launch(css=...) (Gradio 6 moved theme/css
# from Blocks() to launch(); see the v3 notes in the module docstring).
CSS = """
#title { text-align: center; }
#banner { text-align: center; color: #6366f1; }
footer { display: none !important; }
"""
|
|
# ---- UI layout -------------------------------------------------------------
# Two-column form: text/voice/speed controls on the left, reference-audio
# upload on the right, generate button and output player below.
with gr.Blocks(title="Kanade TTS Voice Cloner") as demo:
    gr.Markdown("# ποΈ Kanade β Text-to-Audio with Voice Cloning", elem_id="title")
    gr.Markdown(
        "Enter text Β· Upload a **reference audio** Β· Get your text spoken "
        "**in that person's voice** β fully offline.",
        elem_id="banner",
    )
    with gr.Row():
        with gr.Column(scale=3):
            text_in = gr.Textbox(label="π Text to synthesise", lines=5,
                                 placeholder="Type anything hereβ¦")
            # Dropdown choices come straight from the VOICES mapping keys.
            voice_dd = gr.Dropdown(label="π Base TTS voice (content only)",
                                   choices=list(VOICES), value=list(VOICES)[0])
            # Slider range mirrors the post-hoc time-stretch applied in synthesize().
            speed_sl = gr.Slider(label="β© Speed", minimum=0.7, maximum=1.5,
                                 value=1.0, step=0.05)
        with gr.Column(scale=2):
            # type="filepath" so synthesize() receives a path it can pass to Kanade.
            ref_audio = gr.Audio(label="π€ Reference audio (voice to clone)",
                                 type="filepath",
                                 sources=["upload", "microphone"])
            gr.Markdown("π‘ 5β30 sec Β· clean speech Β· single speaker")

    btn = gr.Button("π Generate", variant="primary", size="lg")
    # type="numpy" matches synthesize()'s (sample_rate, ndarray) return value.
    out = gr.Audio(label="π Output", type="numpy")

    btn.click(fn=synthesize,
              inputs=[text_in, voice_dd, ref_audio, speed_sl],
              outputs=out)

    gr.Markdown(
        "---\n"
        "**Models:** "
        "[`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean) Β· "
        "[`hexgrad/Kokoro-82M`](https://huggingface.co/hexgrad/Kokoro-82M)"
    )
|
|
if __name__ == "__main__":
    # Gradio 6: theme and css are launch() arguments, not Blocks() arguments.
    demo.launch(theme=gr.themes.Soft(), css=CSS)