"""HARP/Gradio endpoint that wraps the Kokoro open-weight TTS model.

Builds two KPipelines (American / British English), loads the 82M-parameter
KModel on CPU (and GPU when CUDA is available), and exposes a single
`process_fn` endpoint that synthesizes a full utterance and returns a wav file.
"""

import os
import random

import gradio as gr
import numpy as np
import spaces
import torch
from audiotools import AudioSignal
from kokoro import KModel, KPipeline
from pyharp.core import ModelCard, build_endpoint
from pyharp.media.audio import save_audio

model_card = ModelCard(
    name="Kokoro Text To Speech",
    description=(
        "Kokoro is an open-weight TTS model with 82 million parameters.\n"
        "Despite its lightweight architecture, it delivers comparable quality to larger models "
        "while being significantly faster and more cost-efficient."
    ),
    author="Yinghao Aaron Li, Cong Han, Vinay S. Raghavan, Gavin Mischler, Nima Mesgarani (StyleTTS2)",
    tags=["tts"],
)

# The "original" deployment lives under the hexgrad org; anything else is a duplicate.
IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
CUDA_AVAILABLE = torch.cuda.is_available()

if not IS_DUPLICATE:
    # Version banner for the original Space only.
    import kokoro
    import misaki
    print('DEBUG', kokoro.__version__, CUDA_AVAILABLE, misaki.__version__)

# The original Space caps requests at 5000 characters; duplicates are uncapped.
CHAR_LIMIT = None if IS_DUPLICATE else 5000

# CPU model always; a second GPU copy only when CUDA is present.
models = {
    gpu: KModel().to('cuda' if gpu else 'cpu').eval()
    for gpu in [False] + ([True] if CUDA_AVAILABLE else [])
}
# One G2P pipeline per language code ('a' = American English, 'b' = British English).
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
# Pin the pronunciation of the model's own name in each lexicon.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'


@spaces.GPU(duration=30)
def forward_gpu(ps, ref_s, speed):
    """Run the GPU model inside a (ZeroGPU) allocation window."""
    return models[True](ps, ref_s, speed)


def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Yield ``(sample_rate, waveform)`` chunks of synthesized speech for *text*.

    The text is truncated to CHAR_LIMIT when a limit is configured. If the GPU
    call raises a Gradio error (e.g. quota exhausted), synthesis falls back to
    the CPU model for the remaining chunks of the current request.
    """
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    # The first character of the voice id selects the language pipeline.
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    for _, ps, _ in pipeline(text, voice, speed):
        # Reference style vector, indexed by phoneme-sequence length.
        ref_s = pack[len(ps) - 1]
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if use_gpu:
                gr.Warning(str(e))
                gr.Info('Switching to CPU')
                audio = models[False](ps, ref_s, speed)
            else:
                # gr.Error expects a message string, not the exception object.
                raise gr.Error(str(e))
        yield 24000, audio.numpy()


# Display name -> Kokoro voice id (the id's first character encodes the language).
CHOICES = {
    '[US-Female] Heart': 'af_heart',
    '[US-Female] Bella': 'af_bella',
    '[US-Female] Nicole': 'af_nicole',
    '[US-Female] Aoede': 'af_aoede',
    '[US-Female] Kore': 'af_kore',
    '[US-Female] Sarah': 'af_sarah',
    '[US-Female] Nova': 'af_nova',
    '[US-Female] Sky': 'af_sky',
    '[US-Female] Alloy': 'af_alloy',
    '[US-Female] Jessica': 'af_jessica',
    '[US-Female] River': 'af_river',
    '[US-Male] Michael': 'am_michael',
    '[US-Male] Fenrir': 'am_fenrir',
    '[US-Male] Puck': 'am_puck',
    '[US-Male] Echo': 'am_echo',
    '[US-Male] Eric': 'am_eric',
    '[US-Male] Liam': 'am_liam',
    '[US-Male] Onyx': 'am_onyx',
    '[US-Male] Santa': 'am_santa',
    '[US-Male] Adam': 'am_adam',
    '[UK-Female] Emma': 'bf_emma',
    '[UK-Female] Isabella': 'bf_isabella',
    '[UK-Female] Alice': 'bf_alice',
    '[UK-Female] Lily': 'bf_lily',
    '[UK-Male] George': 'bm_george',
    '[UK-Male] Fable': 'bm_fable',
    '[UK-Male] Lewis': 'bm_lewis',
    '[UK-Male] Daniel': 'bm_daniel',
}

# Pre-load every voice pack so the first synthesis request is not slowed by I/O.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)

API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
API_NAME = None if API_OPEN else False


def process_fn(text_input: str, speaker: str):
    """HARP endpoint: synthesize *text_input* with *speaker*, return a wav file path.

    Args:
        text_input: Raw text to speak.
        speaker: A display name that must be a key of CHOICES.

    Raises:
        gr.Error: if synthesis produced no audio (e.g. empty input).
    """
    voice = CHOICES[speaker]
    chunks = list(generate_all(text_input, voice, speed=1, use_gpu=True))
    if not chunks:
        # Guard against an empty generator so we fail with a clear message
        # instead of an IndexError on chunks[0].
        raise gr.Error('No audio was generated. Please provide some input text.')
    fs = chunks[0][0]
    wav = np.concatenate([audio for _, audio in chunks])
    sig = AudioSignal(wav.astype("float32"), sample_rate=fs)
    return save_audio(sig)


with gr.Blocks() as app:
    gr.Markdown("## 💬 Kokoro Text To Speech")

    # Inputs
    text_input = gr.Textbox(
        label="Text Input",
        info="Up to 5000 character text input. To get the best performance, please start a new line for each sentence."
    ).harp_required(True)
    speaker_dropdown = gr.Dropdown(
        list(CHOICES.keys()),
        value='[US-Female] Heart',
        label='Voice',
        info='US and UK accented male and female voices available'
    )

    # Outputs
    output_wav = gr.Audio(
        type="filepath",
        label="Synthesized Speech"
    )

    _ = build_endpoint(
        model_card=model_card,
        input_components=[
            text_input,
            speaker_dropdown
        ],
        output_components=[
            output_wav
        ],
        process_fn=process_fn
    )


if __name__ == '__main__':
    app.queue(api_open=API_OPEN).launch(show_api=API_OPEN, ssr_mode=True)