Tokenizer

Build error

File size: 7,002 Bytes

ae2f25b
 
 
4c4af4f
 
 
 
 
ae2f25b
 
4c4af4f
 
 
 
 
ae2f25b
 
2cba492
ae2f25b
 
ab7c93f
ae2f25b
ab7c93f
2cba492
ae2f25b
ab7c93f
 
 
 
 
4c4af4f
ab7c93f
4c4af4f
ab7c93f
 
4c4af4f
ab7c93f
 
4c4af4f
 
 
 
ab7c93f
4c4af4f
ab7c93f
4c4af4f
 
 
 
 
 
 
 
 
ae2f25b
 
4c4af4f
ae2f25b
ab7c93f
4c4af4f
 
 
ab7c93f
4c4af4f
ab7c93f
 
 
 
 
 
 
 
ae2f25b
 
 
4c4af4f
ae2f25b
 
 
4c4af4f
ab7c93f
4c4af4f
ae2f25b
4c4af4f
 
 
ae2f25b
ab7c93f
ae2f25b
ab7c93f
 
ae2f25b
ab7c93f
4c4af4f
ae2f25b
4c4af4f
ae2f25b
4c4af4f
 
ae2f25b
4c4af4f
ae2f25b
 
 
4c4af4f
 
 
ae2f25b
4c4af4f
ae2f25b
 
 
 
 
 
 
 
 
 
4c4af4f
ae2f25b
 
 
4c4af4f
ae2f25b
 
 
ab7c93f
ae2f25b
 
4c4af4f
 
ae2f25b
 
2cba492
ae2f25b
4c4af4f
 
 
 
 
 
ae2f25b
4c4af4f
 
 
 
ae2f25b
4c4af4f
 
 
 
 
 
ae2f25b
 
4c4af4f
 
 
ab7c93f
2cba492
 
 
4c4af4f

"""
Kanade Tokenizer — Text-to-Audio with Voice Cloning
=====================================================
v3 fixes:
  - kokoro pinned to 0.7.16 (Python 3.13 compatible; 0.9.x requires <3.13)
  - espeak-ng installed via packages.txt (OS-level, not pip)
  - Gradio 6 API: theme/css passed to launch(), not Blocks()
  - No internet required at inference time — synthesis runs fully offline once the model weights are available locally

Pipeline:
  Text  →  Kokoro TTS (offline)  →  intermediate WAV
  Reference Audio  →  Kanade encode  →  global_embedding  (WHO)
  intermediate WAV  →  Kanade encode  →  content_token_indices  (WHAT)
  Kanade decode(content_tokens + speaker_embedding)  →  mel
  Vocoder  →  final WAV  ✅
"""

import os
import tempfile
import numpy as np
import torch
import soundfile as sf
import gradio as gr

from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
from kokoro import KPipeline

# ─────────────────────────────────────────────────────────────────────────────
# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
DEVICE    = "cuda" if torch.cuda.is_available() else "cpu"
# Identifier handed to KanadeModel.from_pretrained() below.
MODEL_ID  = "frothywater/kanade-25hz-clean"
# Sample rate (Hz) assumed for Kokoro's output; used as orig_sr when resampling.
KOKORO_SR = 24000

# NOTE: everything below executes at import time, so importing this module
# loads every model before the UI is built.
print(f"[init] Loading Kanade ({DEVICE})…")
# Model is put in eval mode and moved to DEVICE; the vocoder named in the
# model config is loaded onto the same device.
kanade  = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE)
# Working sample rate for all Kanade audio I/O in this file.
SR      = kanade.config.sample_rate   # 16000
print("[init] Kanade ready.")

print("[init] Loading Kokoro TTS…")
# One Kokoro pipeline per language code; tts_to_wav() picks between them.
_kokoro_us = KPipeline(lang_code='a')   # American English
_kokoro_uk = KPipeline(lang_code='b')   # British English
print("[init] All models ready.")

# ── Voice menu ────────────────────────────────────────────────────────────────
# Maps the label shown in the voice dropdown to a (lang, voice_id) pair.
# `lang` ('a' or 'b') selects which Kokoro pipeline tts_to_wav() uses and
# `voice_id` is forwarded to that pipeline as its `voice` argument.
VOICES = {
    "🇺🇸 Female — Heart (warm)":     ("a", "af_heart"),
    "🇺🇸 Female — Bella (smooth)":   ("a", "af_bella"),
    "🇺🇸 Female — Nicole (breathy)": ("a", "af_nicole"),
    "🇺🇸 Female — Sarah":            ("a", "af_sarah"),
    "🇺🇸 Male — Adam":               ("a", "am_adam"),
    "🇺🇸 Male — Michael":            ("a", "am_michael"),
    "🇬🇧 Female — Emma":             ("b", "bf_emma"),
    "🇬🇧 Male — George":             ("b", "bm_george"),
    "🇬🇧 Male — Lewis":              ("b", "bm_lewis"),
}

# ── helpers ───────────────────────────────────────────────────────────────────

def tts_to_wav(text: str, lang: str, voice_id: str, speed: float = 1.0) -> str:
    """Synthesise *text* with Kokoro and write it to a temporary WAV file.

    The audio chunks yielded by the Kokoro pipeline are concatenated,
    resampled from ``KOKORO_SR`` to the Kanade sample rate ``SR`` and written
    to a fresh ``.wav`` file. The caller owns the returned file and is
    responsible for deleting it.

    Args:
        text: Text to synthesise.
        lang: Kokoro language code; ``'a'`` selects the American English
            pipeline, any other value selects the British English one.
        voice_id: Kokoro voice identifier, forwarded as ``voice``.
        speed: Speaking-rate factor forwarded to Kokoro (default ``1.0``,
            which matches the previous hard-coded value).

    Returns:
        Path of the temporary WAV file sampled at ``SR``.

    Raises:
        RuntimeError: If Kokoro yields no audio for *text*.
    """
    # Imported lazily, as before, so librosa is only loaded when TTS runs.
    import librosa

    pipe = _kokoro_us if lang == 'a' else _kokoro_uk

    chunks = []
    for _, _, audio in pipe(text, voice=voice_id, speed=speed):
        if audio is None:
            # Defensive: skip segments for which no audio was produced.
            continue
        if hasattr(audio, "detach"):
            # Tensor-like chunk: convert explicitly instead of relying on
            # NumPy's implicit conversion inside np.concatenate.
            audio = audio.detach().cpu().numpy()
        chunks.append(np.asarray(audio))
    if not chunks:
        raise RuntimeError("Kokoro produced no audio. Check your text.")
    audio_24k = np.concatenate(chunks)

    audio_16k = librosa.resample(audio_24k, orig_sr=KOKORO_SR, target_sr=SR)

    # Reserve the path and release the handle *before* soundfile opens the
    # file by name; writing to a still-open NamedTemporaryFile is not
    # portable (it fails on Windows).
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(wav_path, audio_16k, SR)
    except Exception:
        # Do not leave an orphaned temp file behind when the write fails.
        try:
            os.unlink(wav_path)
        except OSError:
            pass  # best-effort cleanup; the original error is what matters
        raise
    return wav_path


def load_tensor(path: str) -> torch.Tensor:
    """Load the audio file at *path* at sample rate ``SR`` and move it to ``DEVICE``."""
    waveform = load_audio(path, sample_rate=SR)
    return waveform.to(DEVICE)


# ── inference ─────────────────────────────────────────────────────────────────

def synthesize(text, voice_label, ref_audio_path, speed):
    """Speak *text* in the voice of the reference clip.

    Steps:
      1. Kokoro renders *text* to an intermediate WAV (``tts_to_wav``).
      2. Kanade encodes that WAV; its ``content_token_indices`` are kept.
      3. Kanade encodes the reference clip; its ``global_embedding`` is kept.
      4. Kanade decodes tokens + embedding to a mel, which the vocoder turns
         into a waveform. If *speed* differs from 1.0 by more than 0.05 the
         result is time-stretched with librosa.

    Args:
        text: Text to synthesise (may be ``None`` or blank; both are rejected).
        voice_label: Key of ``VOICES`` selecting the base Kokoro voice.
        ref_audio_path: File path of the reference audio, or ``None`` when
            nothing was provided.
        speed: Playback-rate factor applied to the final audio.

    Returns:
        ``(sample_rate, samples)`` where ``sample_rate`` is ``int(SR)`` and
        ``samples`` is a float NumPy array.

    Raises:
        gr.Error: If *text* is empty or no reference audio was supplied.
    """
    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")
    if ref_audio_path is None:
        raise gr.Error("Please upload a reference audio clip.")

    lang, voice_id = VOICES[voice_label]

    gr.Info("Step 1/4 — Synthesising text with Kokoro (offline)…")
    tts_path = tts_to_wav(text, lang, voice_id)

    gr.Info("Step 2/4 — Extracting content tokens…")
    try:
        tts_wav = load_tensor(tts_path)
    finally:
        # Always remove the intermediate WAV, even when loading it fails.
        os.unlink(tts_path)
    with torch.inference_mode():
        tts_feat = kanade.encode(tts_wav)

    gr.Info("Step 3/4 — Extracting speaker embedding from reference…")
    ref_wav = load_tensor(ref_audio_path)
    with torch.inference_mode():
        ref_feat = kanade.encode(ref_wav)

    gr.Info("Step 4/4 — Decoding with cloned voice…")
    with torch.inference_mode():
        # Content comes from the TTS audio, speaker identity from the reference.
        mel = kanade.decode(
            content_token_indices=tts_feat.content_token_indices,
            global_embedding=ref_feat.global_embedding,
        )
        # A leading dimension is added before vocoding (presumably the batch
        # axis expected by the vocoder — confirm against kanade_tokenizer).
        waveform = vocode(vocoder, mel.unsqueeze(0))

    audio_np = waveform.squeeze().cpu().float().numpy()

    # Near-1.0 speeds are left untouched to avoid a needless time-stretch pass.
    if abs(speed - 1.0) > 0.05:
        import librosa
        audio_np = librosa.effects.time_stretch(audio_np, rate=speed)

    return int(SR), audio_np


# ── UI ────────────────────────────────────────────────────────────────────────

# Custom stylesheet handed to demo.launch(). The #title and #banner selectors
# match the elem_id values set on the two header Markdown components; the
# last rule hides the page footer.
CSS = """
#title  { text-align: center; }
#banner { text-align: center; color: #6366f1; }
footer  { display: none !important; }
"""

# Build the Gradio interface. Components are laid out in the order they are
# created inside the nested context managers.
with gr.Blocks(title="Kanade TTS Voice Cloner") as demo:
    # Header: title and one-line description (styled through CSS by elem_id).
    gr.Markdown("# 🎙️ Kanade — Text-to-Audio with Voice Cloning", elem_id="title")
    gr.Markdown(
        "Enter text · Upload a **reference audio** · Get your text spoken "
        "**in that person's voice** — fully offline.",
        elem_id="banner",
    )
    with gr.Row():
        # Left column: text, base voice and speed controls.
        with gr.Column(scale=3):
            text_in  = gr.Textbox(label="📝 Text to synthesise", lines=5,
                                   placeholder="Type anything here…")
            # Choices are the VOICES labels; the first one is preselected.
            voice_dd = gr.Dropdown(label="🔊 Base TTS voice (content only)",
                                   choices=list(VOICES), value=list(VOICES)[0])
            speed_sl = gr.Slider(label="⏩ Speed", minimum=0.7, maximum=1.5,
                                  value=1.0, step=0.05)
        # Right column: reference clip whose voice is cloned.
        with gr.Column(scale=2):
            # type="filepath" means synthesize() receives a path string.
            ref_audio = gr.Audio(label="🎤 Reference audio (voice to clone)",
                                  type="filepath",
                                  sources=["upload", "microphone"])
            gr.Markdown("💡 5–30 sec · clean speech · single speaker")

    btn = gr.Button("🚀 Generate", variant="primary", size="lg")
    # Receives the (sample_rate, samples) tuple returned by synthesize().
    out = gr.Audio(label="🔈 Output", type="numpy")

    # Input order must match synthesize(text, voice_label, ref_audio_path, speed).
    btn.click(fn=synthesize,
              inputs=[text_in, voice_dd, ref_audio, speed_sl],
              outputs=out)

    # Footer with links to the two model cards.
    gr.Markdown(
        "---\n"
        "**Models:** "
        "[`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean) · "
        "[`hexgrad/Kokoro-82M`](https://huggingface.co/hexgrad/Kokoro-82M)"
    )

# Start the app only when run as a script; theme and CSS are supplied to
# launch() here (see the "Gradio 6 API" note in the module docstring).
if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft(), css=CSS)