File size: 7,002 Bytes
ae2f25b
 
 
4c4af4f
 
 
 
 
ae2f25b
 
4c4af4f
 
 
 
 
ae2f25b
 
2cba492
ae2f25b
 
ab7c93f
ae2f25b
ab7c93f
2cba492
ae2f25b
ab7c93f
 
 
 
 
4c4af4f
ab7c93f
4c4af4f
ab7c93f
 
4c4af4f
ab7c93f
 
4c4af4f
 
 
 
ab7c93f
4c4af4f
ab7c93f
4c4af4f
 
 
 
 
 
 
 
 
ae2f25b
 
4c4af4f
ae2f25b
ab7c93f
4c4af4f
 
 
ab7c93f
4c4af4f
ab7c93f
 
 
 
 
 
 
 
ae2f25b
 
 
4c4af4f
ae2f25b
 
 
4c4af4f
ab7c93f
4c4af4f
ae2f25b
4c4af4f
 
 
ae2f25b
ab7c93f
ae2f25b
ab7c93f
 
ae2f25b
ab7c93f
4c4af4f
ae2f25b
4c4af4f
ae2f25b
4c4af4f
 
ae2f25b
4c4af4f
ae2f25b
 
 
4c4af4f
 
 
ae2f25b
4c4af4f
ae2f25b
 
 
 
 
 
 
 
 
 
4c4af4f
ae2f25b
 
 
4c4af4f
ae2f25b
 
 
ab7c93f
ae2f25b
 
4c4af4f
 
ae2f25b
 
2cba492
ae2f25b
4c4af4f
 
 
 
 
 
ae2f25b
4c4af4f
 
 
 
ae2f25b
4c4af4f
 
 
 
 
 
ae2f25b
 
4c4af4f
 
 
ab7c93f
2cba492
 
 
4c4af4f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""
Kanade Tokenizer — Text-to-Audio with Voice Cloning
=====================================================
v3 fixes:
  - kokoro pinned to 0.7.16 (Python 3.13 compatible; 0.9.x requires <3.13)
  - espeak-ng installed via packages.txt (OS-level, not pip)
  - Gradio 6 API: theme/css passed to launch(), not Blocks()
  - No internet required — 100% offline inference

Pipeline:
  Text  →  Kokoro TTS (offline)  →  intermediate WAV
  Reference Audio  →  Kanade encode  →  global_embedding  (WHO)
  intermediate WAV  →  Kanade encode  →  content_token_indices  (WHAT)
  Kanade decode(content_tokens + speaker_embedding)  →  mel
  Vocoder  →  final WAV  ✅
"""

import os
import tempfile
import numpy as np
import torch
import soundfile as sf
import gradio as gr

from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
from kokoro import KPipeline

# ─────────────────────────────────────────────────────────────────────────────
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE    = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID  = "frothywater/kanade-25hz-clean"   # identifier passed to KanadeModel.from_pretrained
KOKORO_SR = 24000                             # rate (Hz) Kokoro's output is treated as when resampling

# Models are loaded once at import time and reused by the functions below.
print(f"[init] Loading Kanade ({DEVICE})…")
kanade  = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE)
# Sample rate taken from the model config. NOTE(review): the original comment
# stated 16000 here; that value is not verifiable from this file — confirm.
SR      = kanade.config.sample_rate
print("[init] Kanade ready.")

print("[init] Loading Kokoro TTS…")
_kokoro_us = KPipeline(lang_code='a')   # American English
_kokoro_uk = KPipeline(lang_code='b')   # British English
print("[init] All models ready.")

# ── Voice menu ───────────────────────────────────────────────────────────────
# Maps the label shown in the voice dropdown to (Kokoro lang_code, voice id).
# lang_code 'a' selects the American-English pipeline, 'b' the British one.
# The labels previously contained mis-decoded UTF-8 (mojibake) in place of the
# flag emoji and the em dash; they are restored here.
VOICES = {
    "🇺🇸 Female — Heart (warm)":     ("a", "af_heart"),
    "🇺🇸 Female — Bella (smooth)":   ("a", "af_bella"),
    "🇺🇸 Female — Nicole (breathy)": ("a", "af_nicole"),
    "🇺🇸 Female — Sarah":            ("a", "af_sarah"),
    "🇺🇸 Male — Adam":               ("a", "am_adam"),
    "🇺🇸 Male — Michael":            ("a", "am_michael"),
    "🇬🇧 Female — Emma":             ("b", "bf_emma"),
    "🇬🇧 Male — George":             ("b", "bm_george"),
    "🇬🇧 Male — Lewis":              ("b", "bm_lewis"),
}

# ── helpers ──────────────────────────────────────────────────────────────────

def tts_to_wav(text: str, lang: str, voice_id: str, speed: float = 1.0) -> str:
    """Render *text* with Kokoro and save it as a temporary WAV at rate ``SR``.

    Args:
        text: Text to speak.
        lang: Kokoro language code; ``'a'`` uses the American-English
            pipeline, any other value uses the British-English one.
        voice_id: Kokoro voice identifier (e.g. ``"af_heart"``).
        speed: Speaking-rate factor forwarded to Kokoro. Defaults to 1.0,
            the value that used to be hard-coded.

    Returns:
        Path of a newly created ``.wav`` file. The caller owns the file and
        is responsible for deleting it.

    Raises:
        RuntimeError: If Kokoro yields no audio for *text*.
    """
    pipe = _kokoro_us if lang == 'a' else _kokoro_uk

    chunks = []
    for _, _, audio in pipe(text, voice=voice_id, speed=speed):
        if audio is None:
            # Defensive: ignore segments that carry no audio.
            continue
        if isinstance(audio, torch.Tensor):
            # Make the conversion explicit so it also works off-CPU.
            audio = audio.detach().cpu().numpy()
        chunks.append(np.asarray(audio))
    if not chunks:
        raise RuntimeError("Kokoro produced no audio. Check your text.")
    samples = np.concatenate(chunks)

    if SR != KOKORO_SR:
        # Imported lazily, as before: librosa is only needed for resampling.
        import librosa
        samples = librosa.resample(samples, orig_sr=KOKORO_SR, target_sr=SR)

    # mkstemp + close gives a path with no open handle, so writing by name
    # also works on platforms that refuse to reopen an open temporary file.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(wav_path, samples, SR)
    except Exception:
        # Do not leave a stray temp file behind when the write fails.
        os.unlink(wav_path)
        raise
    return wav_path


def load_tensor(path: str) -> torch.Tensor:
    """Read the audio file at *path* with ``load_audio`` at rate ``SR`` and move it to ``DEVICE``."""
    waveform = load_audio(path, sample_rate=SR)
    return waveform.to(DEVICE)


# ── inference ────────────────────────────────────────────────────────────────

def synthesize(text, voice_label, ref_audio_path, speed):
    """Speak *text* in the voice of the reference clip and return the audio.

    Pipeline: Kokoro renders *text* to a temporary WAV with the selected base
    voice; Kanade encodes that WAV and the reference clip; Kanade decodes the
    TTS clip's content tokens together with the reference clip's global
    embedding into a mel spectrogram; the vocoder turns the mel into a
    waveform. A non-trivial *speed* is applied afterwards by time-stretching
    the final waveform.

    Args:
        text: Text to synthesise.
        voice_label: A key of ``VOICES`` selecting the Kokoro base voice.
        ref_audio_path: File path of the recording whose voice is cloned.
        speed: Rate factor; values within 0.05 of 1.0 (or ``None``) leave
            the audio untouched.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is a float NumPy array,
        the form expected by the ``gr.Audio(type="numpy")`` output.

    Raises:
        gr.Error: If the text is empty, no reference clip was supplied, or
            *voice_label* is not a key of ``VOICES``.
    """
    # ``not text`` also covers None, which has no .strip().
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")
    if not ref_audio_path:
        raise gr.Error("Please upload a reference audio clip.")
    if voice_label not in VOICES:
        # Surface a readable message instead of a bare KeyError traceback.
        raise gr.Error(f"Unknown voice: {voice_label!r}")

    lang, voice_id = VOICES[voice_label]

    gr.Info("Step 1/4 — Synthesising text with Kokoro (offline)…")
    tts_path = tts_to_wav(text, lang, voice_id)

    gr.Info("Step 2/4 — Extracting content tokens…")
    try:
        tts_wav = load_tensor(tts_path)
    finally:
        # The intermediate WAV is only needed until it has been loaded;
        # remove it even when loading fails so temp files do not pile up.
        os.unlink(tts_path)
    with torch.inference_mode():
        tts_feat = kanade.encode(tts_wav)

    gr.Info("Step 3/4 — Extracting speaker embedding from reference…")
    ref_wav = load_tensor(ref_audio_path)
    with torch.inference_mode():
        ref_feat = kanade.encode(ref_wav)

    gr.Info("Step 4/4 — Decoding with cloned voice…")
    with torch.inference_mode():
        # Content (what is said) comes from the TTS clip, the global
        # embedding (who says it) from the reference clip.
        mel = kanade.decode(
            content_token_indices=tts_feat.content_token_indices,
            global_embedding=ref_feat.global_embedding,
        )
        waveform = vocode(vocoder, mel.unsqueeze(0))

    audio_np = waveform.squeeze().cpu().float().numpy()

    if speed is not None and abs(speed - 1.0) > 0.05:
        # Imported lazily, as before: only needed when a speed change is requested.
        import librosa
        audio_np = librosa.effects.time_stretch(audio_np, rate=speed)

    return int(SR), audio_np


# ── UI ───────────────────────────────────────────────────────────────────────

# Stylesheet handed to demo.launch(): centres the elements tagged with
# elem_id "title" and "banner", colours the banner, and hides the footer.
CSS = """
#title  { text-align: center; }
#banner { text-align: center; color: #6366f1; }
footer  { display: none !important; }
"""

# Single-page UI. All user-facing strings below previously contained
# mis-decoded UTF-8 (mojibake) in place of emoji, dashes, middle dots and
# ellipses; they are restored to the intended characters.
with gr.Blocks(title="Kanade TTS Voice Cloner") as demo:
    # Header: title and one-line usage banner (styled via their elem_id).
    gr.Markdown("# 🎙️ Kanade — Text-to-Audio with Voice Cloning", elem_id="title")
    gr.Markdown(
        "Enter text · Upload a **reference audio** · Get your text spoken "
        "**in that person's voice** — fully offline.",
        elem_id="banner",
    )
    with gr.Row():
        # Left column: what to say and how (text, base voice, speed).
        with gr.Column(scale=3):
            text_in = gr.Textbox(
                label="📝 Text to synthesise",
                lines=5,
                placeholder="Type anything here…",
            )
            voice_dd = gr.Dropdown(
                label="🔊 Base TTS voice (content only)",
                choices=list(VOICES),
                value=next(iter(VOICES)),   # first entry of VOICES is the default
            )
            speed_sl = gr.Slider(
                label="⏩ Speed",
                minimum=0.7,
                maximum=1.5,
                value=1.0,
                step=0.05,
            )
        # Right column: the recording whose voice should be cloned.
        with gr.Column(scale=2):
            ref_audio = gr.Audio(
                label="🎤 Reference audio (voice to clone)",
                type="filepath",   # synthesize() receives a path, not samples
                sources=["upload", "microphone"],
            )
            gr.Markdown("💡 5–30 sec · clean speech · single speaker")

    btn = gr.Button("🚀 Generate", variant="primary", size="lg")
    out = gr.Audio(label="🔈 Output", type="numpy")

    # Input order must match synthesize(text, voice_label, ref_audio_path, speed).
    btn.click(
        fn=synthesize,
        inputs=[text_in, voice_dd, ref_audio, speed_sl],
        outputs=out,
    )

    gr.Markdown(
        "---\n"
        "**Models:** "
        "[`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean) · "
        "[`hexgrad/Kokoro-82M`](https://huggingface.co/hexgrad/Kokoro-82M)"
    )

if __name__ == "__main__":
    # Theme and stylesheet are supplied at launch time (see the Gradio 6 note
    # in the module docstring) rather than on gr.Blocks().
    demo.launch(css=CSS, theme=gr.themes.Soft())