Spaces:

auralodyssey
/

api

Running

App Files Files Community

auralodyssey committed on Dec 31, 2025

Commit

4d29cb7

verified ·

1 Parent(s): 9986dc0

app.py

Browse files

main logic for kokoro

Files changed (1) hide show

app.py +213 -0

app.py ADDED Viewed

	@@ -0,0 +1,213 @@

+# app.py
+import os
+import sys
+import time
+import json
+import numpy as np
+import tempfile
+from huggingface_hub import snapshot_download
+from onnxruntime import InferenceSession, SessionOptions, GraphOptimizationLevel
+import scipy.io.wavfile as wavfile
+import gradio as gr
+# Misaki G2P
+try:
+    from misaki import en as misaki_en
+except Exception as e:
+    print("Misaki import failed", e)
+    raise
# Config
# The repo id, local download directory and model file can each be overridden
# through an environment variable; an unset or empty variable falls back to the
# built-in default shown here.
HF_REPO = os.environ.get("KOKORO_HF_REPO") or "onnx-community/Kokoro-82M-v1.0-ONNX"
LOCAL_DIR = os.environ.get("KOKORO_LOCAL_DIR") or "/tmp/kokoro_model"
# Path of the preferred ONNX file, relative to the downloaded repo root.
ONNX_SUBPATH = os.environ.get("KOKORO_ONNX_SUBPATH") or "onnx/model_q8f16.onnx"  # best CPU quantized file
# Name of the sub-folder (inside the repo) that holds the per-voice ``.bin`` files.
VOICES_DIRNAME = "voices"
SAMPLE_RATE = 24000  # Kokoro uses 24k in README
# Ensure local dir exists before anything tries to download into it.
os.makedirs(LOCAL_DIR, exist_ok=True)
def download_repo():
    """Fetch a snapshot of the HF_REPO model repository and return its local path.

    The snapshot is requested with both ``cache_dir`` and ``local_dir`` set to
    LOCAL_DIR. A progress line is printed before the download starts and again
    once the snapshot path is known.

    NOTE(review): no file filter is passed, so presumably every file in the
    repo is fetched, not only the model variant that is actually loaded.
    """
    print("Downloading model repo snapshot from HF. This may take several minutes on first run.")
    snapshot_kwargs = {
        "repo_id": HF_REPO,
        "cache_dir": LOCAL_DIR,
        "local_dir": LOCAL_DIR,
        "repo_type": "model",
    }
    snapshot_path = snapshot_download(**snapshot_kwargs)
    print("Snapshot downloaded to", snapshot_path)
    return snapshot_path
def load_tokenizer_map(repo_dir):
    """Build the ``piece -> id`` lookup from ``tokenizer.json`` inside *repo_dir*.

    The vocabulary is taken from ``tok["model"]["vocab"]`` when present,
    otherwise from a top-level ``tok["vocab"]``. Three vocabulary layouts are
    understood:

    * a dict mapping piece -> id (returned as-is),
    * a list of pieces, where the list index is the id,
    * a list of ``[piece, score]`` pairs, where the list index is the id.

    Args:
        repo_dir: Directory that contains ``tokenizer.json``.

    Returns:
        dict mapping each piece to its id; empty when the file has no vocab.

    Raises:
        FileNotFoundError: ``tokenizer.json`` does not exist in *repo_dir*.
        ValueError: the JSON document is not an object at the top level.
    """
    tok_path = os.path.join(repo_dir, "tokenizer.json")
    if not os.path.exists(tok_path):
        raise FileNotFoundError(f"tokenizer.json not found at {tok_path}")
    with open(tok_path, "r", encoding="utf-8") as f:
        tok = json.load(f)
    if not isinstance(tok, dict):
        raise ValueError(f"unexpected tokenizer.json structure at {tok_path}")

    # tokenizer.json may follow HF tokenizers format; we need map: piece -> id
    model = tok.get("model")
    if isinstance(model, dict) and "vocab" in model:
        vocab = model["vocab"]
    else:
        vocab = tok.get("vocab", {})

    if isinstance(vocab, dict):
        # typical mapping piece -> id
        return vocab

    piece_to_id = {}
    for index, entry in enumerate(vocab or ()):
        if isinstance(entry, (list, tuple)):
            # [piece, score] pair: only the piece is a usable (hashable) key.
            if not entry:
                continue
            entry = entry[0]
        piece_to_id[entry] = index
    return piece_to_id
def make_session(onnx_path):
    """Create an ONNX Runtime inference session for the model at *onnx_path*.

    The session is built with the graph optimization level set to
    ``ORT_ENABLE_ALL`` and with ``CPUExecutionProvider`` as its only provider.
    """
    options = SessionOptions()
    options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
    return InferenceSession(
        onnx_path,
        sess_options=options,
        providers=["CPUExecutionProvider"],  # CPU provider explicit
    )
# Lazy global
# Module-level singletons; all start as None and are filled by ensure_loaded().
_repo_dir = None  # local path of the downloaded model snapshot
_sess = None  # inference session returned by make_session()
_piece_to_id = None  # tokenizer vocabulary returned by load_tokenizer_map()
_voices_arr = None  # dict voice_name -> numpy array; {} once the voices folder is verified


def ensure_loaded():
    """Populate the module-level model state on first use.

    Each piece of state is initialised only while it is still ``None``, so
    repeated calls are cheap and a step that failed earlier is retried on the
    next call:

    1. download the repo snapshot,
    2. load the tokenizer vocabulary,
    3. open the ONNX session (preferring ONNX_SUBPATH, otherwise the
       alphabetically first ``*.onnx`` file found in ``<repo>/onnx``),
    4. verify that the voices folder exists.

    Raises:
        FileNotFoundError: no ONNX model file or no voices folder in the repo
            (also propagated from load_tokenizer_map when tokenizer.json is
            missing).
    """
    global _repo_dir, _sess, _piece_to_id, _voices_arr
    if _repo_dir is None:
        _repo_dir = download_repo()
    if _piece_to_id is None:
        _piece_to_id = load_tokenizer_map(_repo_dir)
    if _sess is None:
        onnx_path = os.path.join(_repo_dir, ONNX_SUBPATH)
        if not os.path.exists(onnx_path):
            # try alternative names
            onnx_dir = os.path.join(_repo_dir, "onnx")
            candidates = []
            if os.path.isdir(onnx_dir):
                # Sorted so the fallback choice does not depend on the
                # arbitrary order in which os.listdir returns entries.
                candidates = sorted(p for p in os.listdir(onnx_dir) if p.endswith(".onnx"))
            if not candidates:
                raise FileNotFoundError("No ONNX model file found in repo/onnx")
            onnx_path = os.path.join(onnx_dir, candidates[0])
        print("Loading onnx model:", onnx_path)
        _sess = make_session(onnx_path)
    if _voices_arr is None:
        # Only the folder's presence is checked here; individual voice files
        # are read later, when a voice is actually requested.
        voices_path = os.path.join(_repo_dir, VOICES_DIRNAME)
        if not os.path.isdir(voices_path):
            raise FileNotFoundError("voices folder not found in repo")
        _voices_arr = {}  # dict voice_name -> numpy array
# misaki G2P pipeline, created on the first call to tokens_from_misaki() and
# reused afterwards so it is not rebuilt for every request.
_g2p = None


def _flatten_tokens(tokens):
    """Yield tokens one at a time, expanding a single level of nested lists."""
    for entry in tokens or ():
        if isinstance(entry, list):
            yield from entry
        else:
            yield entry


def _token_to_id(token, piece_to_id):
    """Return the integer id for one token, or None when it cannot be resolved."""
    token_id = getattr(token, "id", None)
    if token_id is not None:
        return int(token_id)
    if isinstance(token, (int, np.integer)):
        return int(token)
    # fallback: try string repr and map using tokenizer mapping
    mapped = piece_to_id.get(str(token))
    return None if mapped is None else int(mapped)


def _ids_from_phonemes(phonemes, piece_to_id):
    """Map a phoneme string to ids symbol by symbol, skipping unknown symbols."""
    if not isinstance(phonemes, str):
        return []
    return [int(piece_to_id[symbol]) for symbol in phonemes if symbol in piece_to_id]


def tokens_from_misaki(text):
    """Convert *text* to model token ids using the misaki English G2P.

    Ids are taken from the G2P tokens when they provide them (an ``id``
    attribute, a plain integer, or a string form found in the tokenizer map).
    When any token cannot be resolved that way, or no token yields an id, the
    ids are instead derived from the phoneme string, one symbol at a time,
    through the tokenizer map.

    NOTE(review): the per-symbol fallback assumes the tokenizer vocabulary is
    keyed by individual phoneme characters - confirm against the model card.

    Args:
        text: Input text to phonemize.

    Returns:
        ``(flat_ids, phonemes)``: the list of integer token ids and the
        phonemes value returned by the G2P.

    Raises:
        ValueError: a token could not be resolved and the phoneme fallback
            produced nothing, or the result is longer than 510 ids.
    """
    global _g2p
    if _g2p is None:
        _g2p = misaki_en.G2P(trf=False, british=False, fallback=None)
    phonemes, tokens = _g2p(text)

    # The map is None until ensure_loaded() has run; treat that as "empty".
    vocab = _piece_to_id or {}

    flat_ids = []
    unresolved = []
    for tk in _flatten_tokens(tokens):
        token_id = _token_to_id(tk, vocab)
        if token_id is None:
            unresolved.append(tk)
            break
        flat_ids.append(token_id)

    if unresolved or not flat_ids:
        phoneme_ids = _ids_from_phonemes(phonemes, vocab)
        if phoneme_ids:
            flat_ids = phoneme_ids
        elif unresolved:
            raise ValueError("Unknown token and not in tokenizer map: " + str(unresolved[0]))

    # sanity
    if len(flat_ids) > 510:
        raise ValueError("Tokenized length exceeds model context length (<=510).")
    return flat_ids, phonemes
def load_voice_vector(repo_dir, voice):
    """Load the style array of *voice* from ``<repo_dir>/<VOICES_DIRNAME>/<voice>.bin``.

    The file is read as raw float32 values and reshaped to ``(-1, 1, 256)``.

    Args:
        repo_dir: Root directory of the downloaded model repo.
        voice: Bare voice name (file name without the ``.bin`` suffix).

    Returns:
        numpy float32 array of shape ``(N, 1, 256)`` with ``N >= 1``.

    Raises:
        FileNotFoundError: the voices folder or the voice file is missing.
        ValueError: *voice* is not a plain file name, or the file is empty or
            its size is not a whole number of 256-float rows.
    """
    voices_folder = os.path.join(repo_dir, VOICES_DIRNAME)
    if not os.path.exists(voices_folder):
        raise FileNotFoundError("voices folder missing")
    # The voice name is supplied by the caller (UI / API); refuse anything
    # containing a directory component so the lookup cannot leave the folder.
    if not isinstance(voice, str) or not voice or os.path.basename(voice) != voice:
        raise ValueError(f"invalid voice name: {voice!r}")
    file_path = os.path.join(voices_folder, f"{voice}.bin")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"voice file {voice}.bin not found in voices folder")
    flat = np.fromfile(file_path, dtype=np.float32)
    if flat.size == 0 or flat.size % 256 != 0:
        raise ValueError(
            f"voice file {voice}.bin has {flat.size} float32 values; "
            "expected a non-zero multiple of 256"
        )
    return flat.reshape(-1, 1, 256)  # shape checks per README
def infer_kokoro(text, voice="af_bella", speed=1.0):
    """Synthesize *text* with the given voice and return the path of a WAV file.

    Steps: make sure the model state is loaded, tokenize the text, pick the
    style row for the token count, run the ONNX session, convert the result to
    16-bit PCM and write it to a new temporary ``.wav`` file at SAMPLE_RATE.

    Args:
        text: Text to speak.
        voice: Voice name understood by load_voice_vector.
        speed: Positive, finite speed factor passed to the model.

    Returns:
        Path of the written WAV file. The file is not deleted by this
        function; the caller owns it.

    Raises:
        ValueError: *speed* is not a positive finite number (tokenization and
            voice-loading errors propagate unchanged).
    """
    ensure_loaded()

    speed_value = float(speed)
    if not np.isfinite(speed_value) or speed_value <= 0.0:
        raise ValueError(f"speed must be a positive finite number, got {speed!r}")

    # get token ids
    token_ids, _phonemes = tokens_from_misaki(text)

    # load voice vector
    style_arr = load_voice_vector(_repo_dir, voice)
    if style_arr.shape[0] == 0:
        raise ValueError(f"voice {voice!r} contains no style vectors")
    # pick style vector by length tokens; README uses voices[len(tokens)].
    # Clamped to the last row so long inputs cannot index past the array.
    idx = min(len(token_ids), style_arr.shape[0] - 1)
    ref_s = style_arr[idx]  # shape (1, 256) expected

    # build input tokens with pad 0 at start and end
    ort_inputs = {
        "input_ids": np.array([[0, *token_ids, 0]], dtype=np.int64),
        "style": ref_s.astype(np.float32),
        "speed": np.full((1,), speed_value, dtype=np.float32),
    }
    out = _sess.run(None, ort_inputs)[0]  # expected shape: (1, T)

    # Work in float32 before scaling: in a narrower float type, 32767 is not
    # exactly representable and full-scale samples could wrap around in int16.
    audio = np.clip(np.asarray(out[0], dtype=np.float32), -1.0, 1.0)
    # map float32 [-1,1] to int16
    pcm16 = (audio * 32767.0).astype(np.int16)

    # Reserve a temp path and close our handle before writing, so the writer
    # is never asked to open a file that this process still holds open.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        wavfile.write(wav_path, SAMPLE_RATE, pcm16)
    except Exception:
        # Do not leave an empty or partial file behind on failure.
        try:
            os.unlink(wav_path)
        except OSError:
            pass
        raise
    return wav_path
# Gradio UI and API
with gr.Blocks() as demo:
    gr.Markdown("### Kokoro ONNX TTS CPU Space")
    with gr.Row():
        txt = gr.Textbox(label="Text", value="Hello world", lines=3)
        voice = gr.Dropdown(choices=[], label="Voice (loaded after model)", value="af_bella")
    speed = gr.Slider(0.5, 2.0, value=1.0, step=0.01, label="Speed")
    btn = gr.Button("Synthesize")
    # "filepath" because synth() returns the path of the generated wav file.
    audio_out = gr.Audio(label="Audio", type="filepath")

    def on_load():
        """Load the model state and return an update that fills the voice dropdown."""
        ensure_loaded()
        # read voices folder names
        vf = os.path.join(_repo_dir, VOICES_DIRNAME)
        voices_list = sorted(f[:-4] for f in os.listdir(vf) if f.endswith(".bin"))
        # Keep the dropdown's initial default when that voice exists; otherwise
        # fall back to the first name in sorted order.
        if "af_bella" in voices_list:
            default_voice = "af_bella"
        elif voices_list:
            default_voice = voices_list[0]
        else:
            default_voice = None
        return gr.update(choices=voices_list, value=default_voice)

    def synth(text_in, voice_in, speed_in):
        """Synthesize *text_in* and return the wav path, or None for blank text."""
        if not text_in or not text_in.strip():
            return None
        t0 = time.perf_counter()
        wav_path = infer_kokoro(text_in, voice_in, speed_in)
        elapsed = time.perf_counter() - t0
        print(f"Generated audio at {wav_path} in {elapsed:.2f}s")
        return wav_path

    # The update returned by on_load must be routed to the dropdown; without
    # an output component the returned choices are never applied.
    demo.load(on_load, inputs=None, outputs=[voice])
    # NOTE(review): api_name carries a leading slash; kept unchanged because
    # existing API clients may depend on the resulting endpoint name - confirm.
    btn.click(synth, inputs=[txt, voice, speed], outputs=[audio_out], api_name="/tts")
if __name__ == "__main__":
    # keep low concurrency on free CPU space
    try:
        # Keyword used by newer Gradio releases.
        demo.queue(default_concurrency_limit=1)
    except TypeError:
        # Older Gradio releases do not know that keyword; use the legacy one.
        demo.queue(concurrency_count=1)
    demo.launch()