"""
Kanade Tokenizer — Text-to-Audio with Voice Cloning
=====================================================

v3 fixes:
  - kokoro pinned to 0.7.16 (Python 3.13 compatible; 0.9.x requires <3.13)
  - espeak-ng installed via packages.txt (OS-level, not pip)
  - Gradio 6 API: theme/css passed to launch(), not Blocks()
  - No internet required — 100% offline inference

Pipeline:
  Text → Kokoro TTS (offline) → intermediate WAV
  Reference Audio → Kanade encode → global_embedding (WHO)
  intermediate WAV → Kanade encode → content_token_indices (WHAT)
  Kanade decode(content_tokens + speaker_embedding) → mel
  Vocoder → final WAV ✓
"""
import os
import tempfile
import numpy as np
import torch
import soundfile as sf
import gradio as gr
from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
from kokoro import KPipeline
# ─────────────────────────────────────────────────────────────────────────────
# Runtime configuration shared by every function in this module.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is available
MODEL_ID = "frothywater/kanade-25hz-clean"  # identifier handed to KanadeModel.from_pretrained
KOKORO_SR = 24000  # sample rate tts_to_wav treats Kokoro's output as having

# All models are loaded once, at import time, and reused for every request.
print(f"[init] Loading Kanade ({DEVICE})β¦")
kanade = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
# The vocoder is selected by the name stored in the Kanade model config.
vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE)
SR = kanade.config.sample_rate  # Kanade working sample rate (16000 per the original note)
print("[init] Kanade ready.")

# One Kokoro pipeline per language code; tts_to_wav picks between them.
print("[init] Loading Kokoro TTSβ¦")
_kokoro_us = KPipeline(lang_code='a')  # American English
_kokoro_uk = KPipeline(lang_code='b')  # British English
print("[init] All models ready.")
# ── Voice menu ───────────────────────────────────────────────────────────────
# Maps the label shown in the voice dropdown to a (language code, voice id)
# pair. The language code ('a' / 'b') selects which Kokoro pipeline
# tts_to_wav uses; the voice id is passed to that pipeline as ``voice=``.
# The first entry is the dropdown's default value.
VOICES = {
    "πΊπΈ Female β Heart (warm)": ("a", "af_heart"),
    "πΊπΈ Female β Bella (smooth)": ("a", "af_bella"),
    "πΊπΈ Female β Nicole (breathy)": ("a", "af_nicole"),
    "πΊπΈ Female β Sarah": ("a", "af_sarah"),
    "πΊπΈ Male β Adam": ("a", "am_adam"),
    "πΊπΈ Male β Michael": ("a", "am_michael"),
    "π¬π§ Female β Emma": ("b", "bf_emma"),
    "π¬π§ Male β George": ("b", "bm_george"),
    "π¬π§ Male β Lewis": ("b", "bm_lewis"),
}
# ── helpers ──────────────────────────────────────────────────────────────────
def tts_to_wav(text: str, lang: str, voice_id: str) -> str:
    """Synthesise *text* with Kokoro and write it to a temporary WAV file.

    The audio chunks yielded by the Kokoro pipeline are concatenated,
    resampled from ``KOKORO_SR`` to the Kanade sample rate ``SR`` and
    written to a fresh ``.wav`` file.

    Args:
        text: Text to speak.
        lang: Kokoro language code. ``'a'`` selects the American English
            pipeline; any other value selects the British English one.
        voice_id: Kokoro voice identifier, passed through as ``voice=``.

    Returns:
        Path of the temporary WAV file. The caller owns the file and is
        responsible for deleting it.

    Raises:
        RuntimeError: If the pipeline yields no audio chunks for *text*.
    """
    import librosa  # imported lazily, only needed for resampling

    pipe = _kokoro_us if lang == 'a' else _kokoro_uk
    chunks = [audio for _, _, audio in pipe(text, voice=voice_id, speed=1.0)]
    if not chunks:
        raise RuntimeError("Kokoro produced no audio. Check your text.")

    audio_native = np.concatenate(chunks)
    audio_resampled = librosa.resample(
        audio_native, orig_sr=KOKORO_SR, target_sr=SR
    )

    # Reserve a unique path and close our handle *before* soundfile opens it:
    # writing by name to a file that is still held open fails on Windows.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(wav_path, audio_resampled, SR)
    except BaseException:
        # Do not leave an orphaned temp file behind if the write fails.
        os.unlink(wav_path)
        raise
    return wav_path
def load_tensor(path: str) -> torch.Tensor:
    """Load the audio file at *path* at sample rate ``SR`` and move it to ``DEVICE``."""
    waveform = load_audio(path, sample_rate=SR)
    return waveform.to(DEVICE)
# ── inference ────────────────────────────────────────────────────────────────
def synthesize(text, voice_label, ref_audio_path, speed):
    """Speak *text* in the voice of the reference clip.

    Steps: Kokoro renders the text to a temporary WAV; Kanade encodes that
    WAV and the reference clip; Kanade decodes the content tokens of the
    former with the global embedding of the latter; the vocoder turns the
    resulting mel into a waveform.

    Args:
        text: Text to synthesise.
        voice_label: Key into ``VOICES`` choosing the base Kokoro voice.
        ref_audio_path: File path of the reference audio to clone, or
            ``None`` when nothing was provided.
        speed: Playback rate. A time-stretch is applied only when it
            differs from 1.0 by more than 0.05.

    Returns:
        ``(sample_rate, samples)`` where ``sample_rate`` is ``int(SR)`` and
        ``samples`` is a NumPy array.

    Raises:
        gr.Error: If *text* is empty/blank or no reference audio was given.
        KeyError: If *voice_label* is not a key of ``VOICES``.
    """
    # ``not text`` also covers a None value, which has no .strip().
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")
    if ref_audio_path is None:
        raise gr.Error("Please upload a reference audio clip.")
    lang, voice_id = VOICES[voice_label]

    gr.Info("Step 1/4 β Synthesising text with Kokoro (offline)β¦")
    tts_path = tts_to_wav(text, lang, voice_id)

    gr.Info("Step 2/4 β Extracting content tokensβ¦")
    try:
        tts_wav = load_tensor(tts_path)
    finally:
        # Remove the intermediate WAV even when loading it fails, so a bad
        # request does not leak a temp file.
        os.unlink(tts_path)
    with torch.inference_mode():
        tts_feat = kanade.encode(tts_wav)

    gr.Info("Step 3/4 β Extracting speaker embedding from referenceβ¦")
    ref_wav = load_tensor(ref_audio_path)
    with torch.inference_mode():
        ref_feat = kanade.encode(ref_wav)

    gr.Info("Step 4/4 β Decoding with cloned voiceβ¦")
    with torch.inference_mode():
        # Content comes from the TTS render, speaker identity from the
        # reference clip.
        mel = kanade.decode(
            content_token_indices=tts_feat.content_token_indices,
            global_embedding=ref_feat.global_embedding,
        )
        waveform = vocode(vocoder, mel.unsqueeze(0))
    audio_np = waveform.squeeze().cpu().float().numpy()

    # Skip the stretch for rates close to 1.0.
    if abs(speed - 1.0) > 0.05:
        import librosa
        audio_np = librosa.effects.time_stretch(audio_np, rate=speed)
    return int(SR), audio_np
# ── UI ───────────────────────────────────────────────────────────────────────
# Stylesheet passed to demo.launch(): centres the elements whose elem_id is
# "title" / "banner" (and colours the banner), and hides the page footer.
CSS = """
#title { text-align: center; }
#banner { text-align: center; color: #6366f1; }
footer { display: none !important; }
"""
# Page layout: header, a two-column input row, the generate button, the
# output player and a footer with model links.
with gr.Blocks(title="Kanade TTS Voice Cloner") as demo:
    # Header; the elem_id values are the hooks targeted by CSS.
    gr.Markdown("# ποΈ Kanade β Text-to-Audio with Voice Cloning", elem_id="title")
    gr.Markdown(
        "Enter text Β· Upload a **reference audio** Β· Get your text spoken "
        "**in that person's voice** β fully offline.",
        elem_id="banner",
    )
    with gr.Row():
        # Left column: what to say and how to say it.
        with gr.Column(scale=3):
            text_in = gr.Textbox(label="π Text to synthesise", lines=5,
                                 placeholder="Type anything hereβ¦")
            # Choices are the VOICES labels; the first one is the default.
            voice_dd = gr.Dropdown(label="π Base TTS voice (content only)",
                                   choices=list(VOICES), value=list(VOICES)[0])
            speed_sl = gr.Slider(label="β© Speed", minimum=0.7, maximum=1.5,
                                 value=1.0, step=0.05)
        # Right column: the voice to clone.
        with gr.Column(scale=2):
            # type="filepath" so synthesize receives a path (or None).
            ref_audio = gr.Audio(label="π€ Reference audio (voice to clone)",
                                 type="filepath",
                                 sources=["upload", "microphone"])
            gr.Markdown("π‘ 5β30 sec Β· clean speech Β· single speaker")
    btn = gr.Button("π Generate", variant="primary", size="lg")
    # type="numpy" matches the (sample_rate, array) tuple synthesize returns.
    out = gr.Audio(label="π Output", type="numpy")
    # Input order must match synthesize's parameter order.
    btn.click(fn=synthesize,
              inputs=[text_in, voice_dd, ref_audio, speed_sl],
              outputs=out)
    gr.Markdown(
        "---\n"
        "**Models:** "
        "[`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean) Β· "
        "[`hexgrad/Kokoro-82M`](https://huggingface.co/hexgrad/Kokoro-82M)"
    )
# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    # Theme and CSS are supplied to launch() rather than to gr.Blocks(),
    # as required by the Gradio 6 API this file targets.
    demo.launch(theme=gr.themes.Soft(), css=CSS)