Tokenizer

Build error

App Files Files Community

britto224 committed 29 days ago

Commit

ae2f25b

verified ·

1 Parent(s): 2cba492

Update app.py

Browse files

Files changed (1) hide show

app.py +200 -110

app.py CHANGED Viewed

@@ -1,119 +1,209 @@
-import sys
 import os
-import time
 import torch
 import gradio as gr
-# --- 1. PATH SETUP ---
-current_dir = os.path.dirname(os.path.abspath(__file__))
-src_path = os.path.join(current_dir, "src")
-if src_path not in sys.path:
-    sys.path.append(src_path)
-# --- 2. Imports ---
-try:
-    from kanade_tokenizer.model import KanadeModel
-    from kanade_tokenizer.util import load_vocoder, vocode, load_audio
-except ImportError as e:
-    print(f"❌ IMPORT ERROR: {e}")
-    raise e
-# --- Configuration ---
-KANADE_REPO = "frothywater/kanade-25hz-clean"
-KANADE_VOCODER = "hift"
-DEVICE = "cpu"
-SAMPLE_RATE = 24000
-MAX_AUDIO_SECONDS = 30  # Limit audio to 30 seconds
-print(f"🚀 Initializing on {DEVICE}...")
-# --- 3. Load Models ---
-print(f"📥 Loading Kanade...")
-kanade_model = KanadeModel.from_pretrained(repo_id=KANADE_REPO).to(DEVICE).eval()
-print(f"🔊 Loading HiFT Vocoder...")
-kanade_vocoder = load_vocoder(name=KANADE_VOCODER).to(DEVICE).eval()
-print("✅ Models Loaded.")
-# --- Core Inference ---
-def run_inference(source_wav, ref_wav):
-    """Run voice conversion inference on CPU"""
-    with torch.inference_mode():
-        mel_output = kanade_model.voice_conversion(source_wav, ref_wav)
-        generated_wav = vocode(kanade_vocoder, mel_output.unsqueeze(0))
-    return generated_wav
-# --- Main Handler ---
-def voice_conversion(source_path, reference_path):
-    if not source_path or not reference_path:
-        return None, "⚠️ Please provide both source and reference audio."
     try:
-        # Load audio
-        source_wav = load_audio(source_path, sample_rate=SAMPLE_RATE).to(DEVICE)
-        ref_wav = load_audio(reference_path, sample_rate=SAMPLE_RATE).to(DEVICE)
-        # Check duration (30 second limit)
-        max_samples = MAX_AUDIO_SECONDS * SAMPLE_RATE
-        if source_wav.shape[-1] > max_samples:
-            source_wav = source_wav[..., :max_samples]
-        if ref_wav.shape[-1] > max_samples:
-            ref_wav = ref_wav[..., :max_samples]
-        # Run inference
-        start = time.time()
-        final_wav = run_inference(source_wav, ref_wav)
-        proc_time = time.time() - start
-        output_np = final_wav.squeeze().cpu().float().numpy()
-        output_duration = len(output_np) / SAMPLE_RATE
-        # RTF = processing time / audio duration (lower is better, <1 means faster than real-time)
-        rtf = proc_time / output_duration if output_duration > 0 else 0
-        return (SAMPLE_RATE, output_np), f"✅ {proc_time:.2f}s to convert {output_duration:.1f}s of audio | RTF: {rtf:.2f}x"
-    except Exception as e:
-        import traceback
-        traceback.print_exc()
-        return None, f"❌ Error: {str(e)}"
-# --- Gradio Interface ---
-with gr.Blocks(title="Kanade Voice Cloning") as demo:
-    gr.Markdown("""
-    # 🗣️ Kanade Voice Cloning
-    **Model:** `frothywater/kanade-25hz-clean`
-    Convert any audio into a target voice. Upload a source audio (what to say) and a reference audio (whose voice to use).
-    ⏱️ **Limit:** Audio is trimmed to 30 seconds max.
-    """)
     with gr.Row():
-        with gr.Column():
-            source_audio = gr.Audio(label="Source Audio (Content - what to say)", type="filepath")
-            reference_audio = gr.Audio(label="Reference Audio (Target Voice - whose voice)", type="filepath")
-            convert_btn = gr.Button("🎤 Convert Voice", variant="primary")
-        with gr.Column():
-            output_audio = gr.Audio(label="Result")
-            status_text = gr.Textbox(label="Status", interactive=False)
-    convert_btn.click(
-        voice_conversion,
-        inputs=[source_audio, reference_audio],
-        outputs=[output_audio, status_text]
     )
-    gr.Markdown("""
-    ---
-    **Tips:**
-    - For best results, use clean reference audio (3-10 seconds of clear speech)
-    - Source and reference should ideally be similar in speaking pace
-    """)
 if __name__ == "__main__":
-    demo.launch()

+"""
+Kanade Tokenizer — Text-to-Audio with Voice Cloning
+=====================================================
+Original project: Audio-to-Audio (voice conversion)
+This version:     Text + Reference Audio → Cloned Voice Audio
+Pipeline:
+  1. Text  →  [TTS engine]  →  intermediate WAV  (content only)
+  2. Reference Audio  →  Kanade encode  →  global_embedding  (speaker identity)
+  3. intermediate WAV  →  Kanade encode  →  content_token_indices
+  4. Kanade decode(content_tokens, reference_speaker_embedding)  →  output mel
+  5. Vocoder  →  final WAV  (your text, in the reference speaker's voice)
+"""
 import os
+import tempfile
 import torch
 import gradio as gr
+import numpy as np
+import soundfile as sf
+# ── Kanade ──────────────────────────────────────────────────────────────────
+from kanade_tokenizer import KanadeModel, load_audio, load_vocoder, vocode
+# ── TTS back-end (edge-tts: third-party package that must be installed; async API, calls an online service) ──
+import asyncio
+import edge_tts
+# ────────────────────────────────────────────────────────────────────────────
+# Run on the GPU when torch can see one; otherwise fall back to CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MODEL_ID = "frothywater/kanade-25hz-clean"   # change to kanade-12.5hz if preferred
+print(f"[init] Loading Kanade model: {MODEL_ID}  ({DEVICE})")
+kanade   = KanadeModel.from_pretrained(MODEL_ID).eval().to(DEVICE)
+# The vocoder is only used for inference, so switch it to eval mode like the
+# main model (dropout / normalisation layers must not run in training mode).
+vocoder  = load_vocoder(kanade.config.vocoder_name).to(DEVICE).eval()
+# Sample rate is taken from the model config; it is used both when loading
+# input audio and as the rate reported for the generated audio.
+SR       = kanade.config.sample_rate
+print("[init] Models ready.")
+# ── TTS voices available via edge-tts ───────────────────────────────────────
+# Maps the label shown in the UI dropdown to the edge-tts voice identifier.
+TTS_VOICES = {
+    "English (US) Female — Aria":    "en-US-AriaNeural",
+    "English (US) Male — Guy":       "en-US-GuyNeural",
+    "English (UK) Female — Sonia":   "en-GB-SoniaNeural",
+    "English (UK) Male — Ryan":      "en-GB-RyanNeural",
+    "English (AU) Female — Natasha": "en-AU-NatashaNeural",
+    "English (IN) Female — Neerja":  "en-IN-NeerjaNeural",
+}
+# ── helpers ──────────────────────────────────────────────────────────────────
+def tts_to_wav(text: str, voice: str) -> str:
+    """Synthesise *text* with edge-tts and return the path of a temp WAV file.
+
+    edge-tts is asked to save an MP3, which is then converted to a mono WAV
+    at the module sample rate ``SR``. The intermediate MP3 is always removed;
+    the WAV is removed too if synthesis or conversion fails.
+
+    Args:
+        text: The text to speak.
+        voice: An edge-tts voice identifier (a value of ``TTS_VOICES``).
+
+    Returns:
+        Path to the generated WAV file. The caller owns it and must delete it.
+
+    Raises:
+        Exception: Whatever edge-tts or the librosa/soundfile fallback raises
+            when synthesis or conversion fails.
+    """
+    # Reserve both paths up front instead of deriving the MP3 name from the
+    # WAV name with str.replace (which is unreserved and would also rewrite a
+    # ".wav" that happens to appear elsewhere in the path).
+    wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
+    os.close(wav_fd)
+    mp3_fd, mp3_path = tempfile.mkstemp(suffix=".mp3")
+    os.close(mp3_fd)
+    try:
+        # edge-tts outputs MP3; write to mp3 then convert
+        asyncio.run(edge_tts.Communicate(text, voice).save(mp3_path))
+        try:
+            import pydub
+            audio = pydub.AudioSegment.from_mp3(mp3_path)
+            audio = audio.set_frame_rate(SR).set_channels(1)
+            audio.export(wav_path, format="wav")
+        except Exception:
+            # Deliberate best-effort: pydub may be missing or unable to decode
+            # (NOTE(review): presumably when ffmpeg is absent — confirm), so
+            # fall back to librosa + soundfile.
+            import librosa
+            samples, _ = librosa.load(mp3_path, sr=SR, mono=True)
+            sf.write(wav_path, samples, SR)
+    except Exception:
+        # Do not leave a half-written or empty WAV behind on failure.
+        if os.path.exists(wav_path):
+            os.unlink(wav_path)
+        raise
+    finally:
+        if os.path.exists(mp3_path):
+            os.unlink(mp3_path)
+    return wav_path
+def load_wav_tensor(path: str) -> torch.Tensor:
+    """Read the audio file at *path* at sample rate ``SR`` and move it to ``DEVICE``.
+
+    NOTE(review): the tensor's shape and dtype are whatever ``load_audio``
+    returns — confirm against the kanade_tokenizer documentation.
+    """
+    waveform = load_audio(path, sample_rate=SR)
+    return waveform.to(DEVICE)
+# ── main inference ────────────────────────────────────────────────────────────
+def synthesize(
+    text: str,
+    tts_voice_label: str,
+    reference_audio_path: str,
+    speed: float,
+) -> tuple[int, np.ndarray]:
+    """Speak *text* using the speaker identity of the reference recording.
+
+    The text is first rendered by edge-tts, Kanade encodes that audio and the
+    reference audio, and the decoder combines the content tokens of the former
+    with the global embedding of the latter before vocoding.
+
+    Args:
+        text: Text to synthesise; must contain non-whitespace characters.
+        tts_voice_label: A key of ``TTS_VOICES`` selecting the base TTS voice.
+        reference_audio_path: Path to the audio clip whose voice is cloned.
+        speed: Playback-rate factor; values within 0.05 of 1.0 are ignored.
+
+    Returns:
+        ``(sample_rate, waveform_numpy)`` for a Gradio Audio output.
+
+    Raises:
+        gr.Error: If the text is empty, no reference audio was given, or the
+            voice label is unknown.
+    """
+    # Guard against None as well as blank text so the user sees a clear
+    # message instead of an AttributeError from ``None.strip()``.
+    if not text or not text.strip():
+        raise gr.Error("Please enter some text to synthesize.")
+    if not reference_audio_path:
+        raise gr.Error("Please upload a reference audio clip (the voice to clone).")
+    voice_id = TTS_VOICES.get(tts_voice_label)
+    if voice_id is None:
+        raise gr.Error(f"Unknown TTS voice: {tts_voice_label!r}")
+    # ── Step 1: Text → intermediate speech WAV ─────────────────────────────
+    gr.Info("Step 1/4 — Synthesising text with TTS…")
+    tts_wav_path = tts_to_wav(text, voice_id)
+    # ── Step 2: Encode TTS audio → content tokens ──────────────────────────
+    gr.Info("Step 2/4 — Extracting content tokens from TTS audio…")
+    try:
+        tts_waveform = load_wav_tensor(tts_wav_path)
+    finally:
+        # Remove the temp file even when loading fails, so it never leaks.
+        os.unlink(tts_wav_path)
+    with torch.inference_mode():
+        tts_features = kanade.encode(tts_waveform)
+    # ── Step 3: Encode reference audio → speaker embedding ─────────────────
+    gr.Info("Step 3/4 — Extracting speaker embedding from reference audio…")
+    ref_waveform = load_wav_tensor(reference_audio_path)
+    with torch.inference_mode():
+        ref_features = kanade.encode(ref_waveform)
+    # ── Step 4: Decode with cloned speaker embedding ────────────────────────
+    gr.Info("Step 4/4 — Decoding with cloned voice…")
+    with torch.inference_mode():
+        mel = kanade.decode(
+            content_token_indices=tts_features.content_token_indices,  # WHAT to say
+            global_embedding=ref_features.global_embedding,            # WHO says it
+        )
+        waveform = vocode(vocoder, mel.unsqueeze(0))
+    audio_np = waveform.squeeze().cpu().float().numpy()
+    # Optional speed adjustment: time-stretch (changes duration, not pitch).
+    # Skipped for factors within 0.05 of 1.0 to avoid needless processing.
+    if abs(speed - 1.0) > 0.05:
+        import librosa
+        audio_np = librosa.effects.time_stretch(audio_np, rate=speed)
+    # NOTE(review): reports SR as the output rate, i.e. assumes the vocoder
+    # emits audio at the model's sample rate — confirm.
+    return int(SR), audio_np
+# ── Gradio UI ─────────────────────────────────────────────────────────────────
+# Page-level CSS: centres the title and banner, tints the banner, and hides
+# the page footer element.
+CSS = """
+#title  { text-align: center; }
+#banner { text-align: center; color: #6366f1; font-size: 0.9em; }
+footer  { display: none !important; }
+"""
+# NOTE(review): newer Gradio major versions accept `css`/`theme` on launch()
+# rather than on Blocks() — confirm against the Gradio version this Space pins.
+with gr.Blocks(title="Kanade TTS Voice Cloner", css=CSS, theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🎙️ Kanade — Text-to-Audio with Voice Cloning", elem_id="title")
+    gr.Markdown(
+        "Type any text, upload a **reference audio** (the voice you want to clone), "
+        "and Kanade will speak your text **in that person's voice**.",
+        elem_id="banner",
+    )
+    # Two-column input row: text / voice / speed on the left (scale 3),
+    # reference audio and usage tips on the right (scale 2).
     with gr.Row():
+        with gr.Column(scale=3):
+            text_input = gr.Textbox(
+                label="📝 Text to synthesise",
+                placeholder="Enter any text here…",
+                lines=5,
+            )
+            # Choices are the keys of TTS_VOICES; the first key is the default.
+            tts_voice = gr.Dropdown(
+                label="🔊 Base TTS voice (used for content extraction only)",
+                choices=list(TTS_VOICES.keys()),
+                value=list(TTS_VOICES.keys())[0],
+                info="This voice provides pronunciation — the output will sound like the reference speaker.",
+            )
+            speed_slider = gr.Slider(
+                label="⏩ Speed",
+                minimum=0.7, maximum=1.5, value=1.0, step=0.05,
+            )
+        with gr.Column(scale=2):
+            # type="filepath": synthesize() receives a path string, not raw samples.
+            reference_audio = gr.Audio(
+                label="🎤 Reference audio (voice to clone)",
+                type="filepath",
+                sources=["upload", "microphone"],
+            )
+            gr.Markdown(
+                "💡 **Tips for best results:**\n"
+                "- Use 5–30 seconds of clean speech\n"
+                "- Single speaker, minimal background noise\n"
+                "- WAV or high-quality MP3\n"
+            )
+    run_btn = gr.Button("🚀 Generate Cloned Speech", variant="primary", size="lg")
+    # type="numpy": filled from the (sample_rate, ndarray) tuple synthesize() returns.
+    output_audio = gr.Audio(label="🔈 Output — Your text in the reference speaker's voice", type="numpy")
+    # The inputs list must stay in the same order as synthesize()'s parameters:
+    # text, tts_voice_label, reference_audio_path, speed.
+    run_btn.click(
+        fn=synthesize,
+        inputs=[text_input, tts_voice, reference_audio, speed_slider],
+        outputs=output_audio,
+    )
+    gr.Markdown("---")
+    gr.Markdown(
+        "**How it works:** Kanade disentangles speech into *content tokens* (what is said) "
+        "and a *global speaker embedding* (who says it). "
+        "We extract content from a TTS-generated intermediate and speaker identity from your "
+        "reference audio, then recombine them. "
+        "Model: [`frothywater/kanade-25hz-clean`](https://huggingface.co/frothywater/kanade-25hz-clean)"
     )
+# Start the Gradio server only when this file is run as a script.
 if __name__ == "__main__":
+    demo.launch()