"""Load the Kokoro English text-to-speech model from the Hugging Face Hub."""

import atexit
from functools import lru_cache
import os
import tempfile
from pathlib import Path

import sherpa_onnx
from huggingface_hub import hf_hub_download, snapshot_download

ENGLISH_REPO_ID = "vidhi0405/TextToSpeech"

# Sub-directory of the repository that holds every model asset used below.
_MODEL_SUBFOLDER = "kokoro-en-v0_19"

_HF_URL_PREFIX = "https://huggingface.co/"


def _normalize_repo_id(repo_id: str) -> str:
    """Return *repo_id* as a bare ``owner/name`` identifier.

    Surrounding whitespace is stripped. If the value is a full
    ``https://huggingface.co/...`` URL, the URL prefix and any leading or
    trailing slashes are removed as well.
    """
    cleaned = repo_id.strip()
    if cleaned.startswith(_HF_URL_PREFIX):
        cleaned = cleaned.removeprefix(_HF_URL_PREFIX).strip("/")
    return cleaned


def _get_file(repo_id: str, filename: str, subfolder: str) -> str:
    """Fetch one file via ``hf_hub_download`` and return what it returns."""
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )


def _remove_quietly(path: str) -> None:
    """Delete *path* on a best-effort basis."""
    try:
        os.unlink(path)
    except OSError:
        # Cleanup only: the file may already be gone or still be locked,
        # and neither case should raise during interpreter shutdown.
        pass


def _write_sanitized_tokens(tokens_raw: str) -> str:
    """Copy *tokens_raw* to a temporary file without blank lines.

    Whitespace-only lines are dropped to prevent parsing errors; all other
    lines are written back unchanged. The temporary file is registered for
    removal at interpreter exit so it is not left behind on disk.

    Returns:
        Path of the sanitized temporary file.
    """
    with open(tokens_raw, "r", encoding="utf-8") as src:
        kept_lines = [line for line in src if line.strip()]

    fd, sanitized_path = tempfile.mkstemp(suffix=".txt", text=True)
    # Register cleanup before writing so the file is removed at exit even
    # if the write below fails.
    atexit.register(_remove_quietly, sanitized_path)
    with os.fdopen(fd, "w", encoding="utf-8") as dst:
        dst.writelines(kept_lines)
    return sanitized_path


@lru_cache(maxsize=2)
def get_pretrained_model(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Download the model assets and build an ``OfflineTts`` instance.

    Results are memoized by ``lru_cache`` on the exact ``(repo_id, speed)``
    arguments, keeping at most two entries.

    Args:
        repo_id: Repository identifier or ``https://huggingface.co/`` URL.
            After normalization it must equal ``ENGLISH_REPO_ID``.
        speed: Positive speaking-rate factor. The model is configured with
            ``length_scale = 1.0 / speed``.

    Returns:
        A ``sherpa_onnx.OfflineTts`` built from the downloaded files.

    Raises:
        ValueError: If the repository is not supported, or if *speed* is
            not greater than zero.
    """
    source_repo = _normalize_repo_id(repo_id)
    if source_repo != ENGLISH_REPO_ID:
        raise ValueError(f"Unsupported repo_id: {repo_id}. Use {ENGLISH_REPO_ID}")

    # Validate before any download: a zero speed would otherwise surface as
    # a ZeroDivisionError only after all files had been fetched. Written as
    # "not speed > 0" so that NaN is rejected too.
    if not speed > 0:
        raise ValueError(f"speed must be a positive number, got {speed!r}")

    model = _get_file(
        repo_id=source_repo,
        filename="model.onnx",
        subfolder=_MODEL_SUBFOLDER,
    )
    tokens_raw = _get_file(
        repo_id=source_repo,
        filename="tokens.txt",
        subfolder=_MODEL_SUBFOLDER,
    )
    # Sanitize tokens file to prevent parsing errors (e.g. empty lines)
    tokens = _write_sanitized_tokens(tokens_raw)

    voices = _get_file(
        repo_id=source_repo,
        filename="voices.bin",
        subfolder=_MODEL_SUBFOLDER,
    )

    # The espeak-ng data is a directory tree, so it is fetched as a filtered
    # snapshot rather than file by file.
    root_dir = snapshot_download(
        repo_id=source_repo,
        allow_patterns=[f"{_MODEL_SUBFOLDER}/espeak-ng-data/*"],
    )
    data_dir = str(Path(root_dir) / _MODEL_SUBFOLDER / "espeak-ng-data")

    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            kokoro=sherpa_onnx.OfflineTtsKokoroModelConfig(
                model=model,
                voices=voices,
                tokens=tokens,
                data_dir=data_dir,
                length_scale=1.0 / speed,
            ),
            provider="cpu",
            debug=True,
            num_threads=2,
        ),
        max_num_sentences=1,
    )
    return sherpa_onnx.OfflineTts(tts_config)