Tokenizer

Build error

App Files Files Community

britto224 committed on Apr 19

Commit

ab7c93f

verified ·

1 Parent(s): ccae3c3

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -89

app.py CHANGED Viewed

@@ -1,11 +1,13 @@
 """
 Kanade Tokenizer — Text-to-Audio with Voice Cloning
 =====================================================
-Original project: Audio-to-Audio (voice conversion)
-This version:     Text + Reference Audio → Cloned Voice Audio
 Pipeline:
-  1. Text  →  [TTS engine]  →  intermediate WAV  (content only)
   2. Reference Audio  →  Kanade encode  →  global_embedding  (speaker identity)
   3. intermediate WAV  →  Kanade encode  →  content_token_indices
   4. Kanade decode(content_tokens, reference_speaker_embedding)  →  output mel
@@ -14,72 +16,76 @@ Pipeline:
 import os
 import tempfile
-import torch
-import gradio as gr
 import numpy as np
 import soundfile as sf
-# ── Kanade ──────────────────────────────────────────────────────────────────
 from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
-# ── TTS back-end (edge-tts is zero-install, async) ──────────────────────────
-import asyncio
-import edge_tts
-# ────────────────────────────────────────────────────────────────────────────
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-MODEL_ID = "frothywater/kanade-25hz-clean"   # change to kanade-12.5hz if preferred
-print(f"[init] Loading Kanade model: {MODEL_ID}  ({DEVICE})")
-kanade   = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
-vocoder  = load_vocoder(kanade.config.vocoder_name).to(DEVICE)
-SR       = kanade.config.sample_rate          # typically 16000
-print("[init] Models ready.")
-# ── TTS voices available via edge-tts ───────────────────────────────────────
-TTS_VOICES = {
-    "English (US) Female — Aria":    "en-US-AriaNeural",
-    "English (US) Male — Guy":       "en-US-GuyNeural",
-    "English (UK) Female — Sonia":   "en-GB-SoniaNeural",
-    "English (UK) Male — Ryan":      "en-GB-RyanNeural",
-    "English (AU) Female — Natasha": "en-AU-NatashaNeural",
-    "English (IN) Female — Neerja":  "en-IN-NeerjaNeural",
 }
 # ── helpers ──────────────────────────────────────────────────────────────────
-def tts_to_wav(text: str, voice: str) -> str:
-    """Run edge-tts and return path to a temp WAV file."""
-    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    tmp.close()
-    async def _run():
-        communicate = edge_tts.Communicate(text, voice)
-        # edge-tts outputs MP3; write to mp3 then convert
-        mp3_path = tmp.name.replace(".wav", ".mp3")
-        await communicate.save(mp3_path)
-        return mp3_path
-    mp3_path = asyncio.run(_run())
-    # Convert MP3 → WAV via soundfile / pydub fallback
-    try:
-        import pydub
-        audio = pydub.AudioSegment.from_mp3(mp3_path)
-        audio = audio.set_frame_rate(SR).set_channels(1)
-        audio.export(tmp.name, format="wav")
-    except Exception:
-        # fallback: scipy / librosa
-        import librosa
-        y, _ = librosa.load(mp3_path, sr=SR, mono=True)
-        sf.write(tmp.name, y, SR)
-    os.unlink(mp3_path)
     return tmp.name
 def load_wav_tensor(path: str) -> torch.Tensor:
-    """Load a WAV file → 1-D float32 tensor at Kanade's sample rate."""
     return load_audio(path, sample_rate=SR).to(DEVICE)
@@ -87,51 +93,49 @@ def load_wav_tensor(path: str) -> torch.Tensor:
 def synthesize(
     text: str,
-    tts_voice_label: str,
-    reference_audio_path: str,
     speed: float,
 ) -> tuple[int, np.ndarray]:
-    """
-    Returns (sample_rate, waveform_numpy) for Gradio Audio output.
-    """
     if not text.strip():
         raise gr.Error("Please enter some text to synthesize.")
     if reference_audio_path is None:
         raise gr.Error("Please upload a reference audio clip (the voice to clone).")
-    voice_id = TTS_VOICES[tts_voice_label]
-    # ── Step 1: Text → intermediate speech WAV ─────────────────────────────
-    gr.Info("Step 1/4 — Synthesising text with TTS…")
-    tts_wav_path = tts_to_wav(text, voice_id)
-    # ── Step 2: Encode TTS audio → content tokens ──────────────────────────
-    gr.Info("Step 2/4 — Extracting content tokens from TTS audio…")
-    tts_waveform = load_wav_tensor(tts_wav_path)
-    os.unlink(tts_wav_path)
     with torch.inference_mode():
         tts_features = kanade.encode(tts_waveform)
-    # ── Step 3: Encode reference audio → speaker embedding ─────────────────
     gr.Info("Step 3/4 — Extracting speaker embedding from reference audio…")
     ref_waveform = load_wav_tensor(reference_audio_path)
     with torch.inference_mode():
         ref_features = kanade.encode(ref_waveform)
-    # ── Step 4: Decode with cloned speaker embedding ────────────────────────
     gr.Info("Step 4/4 — Decoding with cloned voice…")
     with torch.inference_mode():
         mel = kanade.decode(
-            content_token_indices=tts_features.content_token_indices,  # WHAT to say
-            global_embedding=ref_features.global_embedding,            # WHO says it
         )
         waveform = vocode(vocoder, mel.unsqueeze(0))  # (1, samples)
     audio_np = waveform.squeeze().cpu().float().numpy()
-    # Optional speed adjustment via resampling
     if abs(speed - 1.0) > 0.05:
         import librosa
         audio_np = librosa.effects.time_stretch(audio_np, rate=speed)
@@ -139,7 +143,7 @@ def synthesize(
     return int(SR), audio_np
-# ── Gradio UI ─────────────────────────────────────────────────────────────────
 CSS = """
 #title  { text-align: center; }
@@ -147,11 +151,11 @@ CSS = """
 footer  { display: none !important; }
 """
-with gr.Blocks(title="Kanade TTS Voice Cloner", css=CSS, theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🎙️ Kanade — Text-to-Audio with Voice Cloning", elem_id="title")
     gr.Markdown(
-        "Type any text, upload a **reference audio** (the voice you want to clone), "
-        "and Kanade will speak your text **in that person's voice**.",
         elem_id="banner",
     )
@@ -163,10 +167,10 @@ with gr.Blocks(title="Kanade TTS Voice Cloner", css=CSS, theme=gr.themes.Soft())
                 lines=5,
             )
             tts_voice = gr.Dropdown(
-                label="🔊 Base TTS voice (used for content extraction only)",
-                choices=list(TTS_VOICES.keys()),
-                value=list(TTS_VOICES.keys())[0],
-                info="This voice provides pronunciation — the output will sound like the reference speaker.",
             )
             speed_slider = gr.Slider(
                 label="⏩ Speed",
@@ -181,14 +185,16 @@ with gr.Blocks(title="Kanade TTS Voice Cloner", css=CSS, theme=gr.themes.Soft())
             )
             gr.Markdown(
                 "💡 **Tips for best results:**\n"
-                "- Use 5–30 seconds of clean speech\n"
-                "- Single speaker, minimal background noise\n"
                 "- WAV or high-quality MP3\n"
             )
     run_btn = gr.Button("🚀 Generate Cloned Speech", variant="primary", size="lg")
-    output_audio = gr.Audio(label="🔈 Output — Your text in the reference speaker's voice", type="numpy")
     run_btn.click(
         fn=synthesize,
@@ -198,12 +204,15 @@ with gr.Blocks(title="Kanade TTS Voice Cloner", css=CSS, theme=gr.themes.Soft())
     gr.Markdown("---")
     gr.Markdown(
-        "**How it works:** Kanade disentangles speech into *content tokens* (what is said) "
         "and a *global speaker embedding* (who says it). "
-        "We extract content from a TTS-generated intermediate and speaker identity from your "
-        "reference audio, then recombine them. "
-        "Model: [`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean)"
     )
 if __name__ == "__main__":
-    demo.launch()

 """
 Kanade Tokenizer — Text-to-Audio with Voice Cloning
 =====================================================
+Fixes vs v1:
+  - Replaced edge-tts (needs internet) with kokoro (100% offline, local model)
+  - Fixed Gradio 6.0 API: theme/css moved to launch()
+  - asyncio.run() issue eliminated (kokoro is sync)
 Pipeline:
+  1. Text  →  [Kokoro TTS, offline]  →  intermediate WAV  (content only)
   2. Reference Audio  →  Kanade encode  →  global_embedding  (speaker identity)
   3. intermediate WAV  →  Kanade encode  →  content_token_indices
   4. Kanade decode(content_tokens, reference_speaker_embedding)  →  output mel
 import os
 import tempfile
 import numpy as np
+import torch
 import soundfile as sf
+import gradio as gr
+# ── Kanade ───────────────────────────────────────────────────────────────────
 from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
+# ── Kokoro offline TTS ───────────────────────────────────────────────────────
+from kokoro import KPipeline
+# ─────────────────────────────────────────────────────────────────────────────
# Module-level setup: everything below runs once at import time, so both
# models are loaded before the UI is built.

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE    = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID  = "frothywater/kanade-25hz-clean"
KOKORO_SR = 24000   # Kokoro always outputs 24 kHz
print(f"[init] Loading Kanade: {MODEL_ID}  ({DEVICE})")
kanade  = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
# The vocoder is chosen by name from the Kanade model's own config.
vocoder = load_vocoder(kanade.config.vocoder_name).to(DEVICE)
SR      = kanade.config.sample_rate          # taken from the model config; presumably 16000 — TODO confirm
print("[init] Kanade ready.")
print("[init] Loading Kokoro TTS pipeline…")
# lang_code='a' = American English  |  'b' = British English
# One pipeline per language code; tts_to_wav() picks between them.
kokoro_pipeline_us = KPipeline(lang_code='a')
kokoro_pipeline_uk = KPipeline(lang_code='b')
print("[init] Kokoro ready. All models loaded.")
# ── Available Kokoro voices ───────────────────────────────────────────────────
# Full list: https://huggingface.co/hexgrad/Kokoro-82M/tree/main/voices
# Maps the label shown in the voice dropdown to a (lang_code, voice_id) pair.
VOICES = {
    # American English (lang='a')
    "🇺🇸 Female — Heart (warm)":      ("a", "af_heart"),
    "🇺🇸 Female — Bella (smooth)":    ("a", "af_bella"),
    "🇺🇸 Female — Nicole (breathy)":  ("a", "af_nicole"),
    "🇺🇸 Female — Sarah":             ("a", "af_sarah"),
    "🇺🇸 Male — Adam":                ("a", "am_adam"),
    "🇺🇸 Male — Michael":             ("a", "am_michael"),
    # British English (lang='b')
    "🇬🇧 Female — Emma":              ("b", "bf_emma"),
    "🇬🇧 Male — George":              ("b", "bm_george"),
    "🇬🇧 Male — Lewis":               ("b", "bm_lewis"),
}
 # ── helpers ──────────────────────────────────────────────────────────────────
def tts_to_wav(text: str, lang: str, voice_id: str) -> str:
    """Synthesise *text* with Kokoro and write it to a temporary WAV file.

    The audio is produced by one of the module-level Kokoro pipelines,
    resampled from ``KOKORO_SR`` to Kanade's sample rate ``SR`` and written
    to a temp file. The caller owns the returned file and must delete it.

    Args:
        text: Text to speak.
        lang: Kokoro language code; ``'a'`` selects the American English
            pipeline, any other value the British English one.
        voice_id: Kokoro voice name, e.g. ``"af_heart"``.

    Returns:
        Path to a WAV file sampled at ``SR``.

    Raises:
        RuntimeError: If Kokoro yields no audio for the given text.
    """
    import librosa

    pipeline = kokoro_pipeline_us if lang == 'a' else kokoro_pipeline_uk
    segments = pipeline(
        text, voice=voice_id, speed=1.0, split_pattern=r'(?<=[.!?])\s+'
    )
    # Each yielded item unpacks to three values; only the last (audio) is used.
    chunks = [audio for _, _, audio in segments]
    if not chunks:
        raise RuntimeError("Kokoro produced no audio — check your text input.")
    audio_native = np.concatenate(chunks)

    # Resample from Kokoro's output rate to the rate Kanade expects.
    audio_resampled = librosa.resample(
        audio_native, orig_sr=KOKORO_SR, target_sr=SR
    )

    # Reserve a unique path and close the descriptor *before* writing:
    # writing by name while the handle is still open fails on Windows.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(wav_path, audio_resampled, SR)
    except Exception:
        # Do not leave an orphaned temp file behind when the write fails.
        os.unlink(wav_path)
        raise
    return wav_path
 def load_wav_tensor(path: str) -> torch.Tensor:
     return load_audio(path, sample_rate=SR).to(DEVICE)
 def synthesize(
     text: str,
+    voice_label: str,
+    reference_audio_path,
     speed: float,
 ) -> tuple[int, np.ndarray]:
     if not text.strip():
         raise gr.Error("Please enter some text to synthesize.")
     if reference_audio_path is None:
         raise gr.Error("Please upload a reference audio clip (the voice to clone).")
+    lang, voice_id = VOICES[voice_label]
+    # Step 1 — Text → intermediate WAV via Kokoro (offline)
+    gr.Info("Step 1/4 — Synthesising text with Kokoro (offline)…")
+    tts_path = tts_to_wav(text, lang, voice_id)
+    # Step 2 — Encode TTS → content tokens
+    gr.Info("Step 2/4 — Extracting content tokens…")
+    tts_waveform = load_wav_tensor(tts_path)
+    os.unlink(tts_path)
     with torch.inference_mode():
         tts_features = kanade.encode(tts_waveform)
+    # Step 3 — Encode reference audio → speaker embedding
     gr.Info("Step 3/4 — Extracting speaker embedding from reference audio…")
     ref_waveform = load_wav_tensor(reference_audio_path)
     with torch.inference_mode():
         ref_features = kanade.encode(ref_waveform)
+    # Step 4 — Decode: content from TTS, voice from reference
     gr.Info("Step 4/4 — Decoding with cloned voice…")
     with torch.inference_mode():
         mel = kanade.decode(
+            content_token_indices=tts_features.content_token_indices,   # WHAT to say
+            global_embedding=ref_features.global_embedding,             # WHO says it
         )
         waveform = vocode(vocoder, mel.unsqueeze(0))  # (1, samples)
     audio_np = waveform.squeeze().cpu().float().numpy()
+    # Optional speed change via resampling
     if abs(speed - 1.0) > 0.05:
         import librosa
         audio_np = librosa.effects.time_stretch(audio_np, rate=speed)
     return int(SR), audio_np
+# ── Gradio UI (Gradio 6 compatible) ──────────────────────────────────────────
 CSS = """
 #title  { text-align: center; }
 footer  { display: none !important; }
 """
+with gr.Blocks(title="Kanade TTS Voice Cloner") as demo:
     gr.Markdown("# 🎙️ Kanade — Text-to-Audio with Voice Cloning", elem_id="title")
     gr.Markdown(
+        "Type any text · Upload a **reference audio** (the voice to clone) · "
+        "Kanade will speak your text **in that person's voice** — 100% offline.",
         elem_id="banner",
     )
                 lines=5,
             )
             tts_voice = gr.Dropdown(
+                label="🔊 Base TTS voice (Kokoro — offline)",
+                choices=list(VOICES.keys()),
+                value=list(VOICES.keys())[0],
+                info="Provides pronunciation only — output will sound like the reference speaker.",
             )
             speed_slider = gr.Slider(
                 label="⏩ Speed",
             )
             gr.Markdown(
                 "💡 **Tips for best results:**\n"
+                "- 5–30 seconds of clean speech\n"
+                "- Single speaker, minimal noise\n"
                 "- WAV or high-quality MP3\n"
             )
     run_btn = gr.Button("🚀 Generate Cloned Speech", variant="primary", size="lg")
+    output_audio = gr.Audio(
+        label="🔈 Output — Your text in the reference speaker's voice",
+        type="numpy",
+    )
     run_btn.click(
         fn=synthesize,
     gr.Markdown("---")
     gr.Markdown(
+        "**How it works:** Kanade separates speech into *content tokens* (what is said) "
         "and a *global speaker embedding* (who says it). "
+        "Kokoro (82M offline TTS) generates the content — then Kanade re-voices it using your reference. "
+        "Models: [`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean) · "
+        "[`hexgrad/Kokoro-82M`](https://huggingface.co/hexgrad/Kokoro-82M)"
     )
if __name__ == "__main__":
    # Theme and CSS are supplied to launch() rather than to the gr.Blocks()
    # constructor, which only receives the title.
    demo.launch(
        theme=gr.themes.Soft(),
        css=CSS,
    )