Spaces: Running on Zero
| import spaces | |
| from kokoro import KModel, KPipeline | |
| import gradio as gr | |
| import os | |
| import random | |
| import torch | |
| import numpy as np | |
| from pyharp.core import ModelCard, build_endpoint | |
| from pyharp.media.audio import save_audio | |
| from audiotools import AudioSignal | |
# HARP model card: metadata shown to clients that discover this endpoint.
_DESCRIPTION = (
    "Kokoro is an open-weight TTS model with 82 million parameters.\n"
    "Despite its lightweight architecture, it delivers comparable quality to larger models "
    "while being significantly faster and more cost-efficient."
)
model_card = ModelCard(
    name="Kokoro Text To Speech",
    description=_DESCRIPTION,
    author="Yinghao Aaron Li, Cong Han, Vinay S. Raghavan, Gavin Mischler, Nima Mesgarani (StyleTTS2)",
    tags=["tts"],
)
# True when this is a user's duplicate of the Space rather than the official
# 'hexgrad/' deployment (SPACE_ID is set by the HF Spaces runtime; empty locally).
IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
CUDA_AVAILABLE = torch.cuda.is_available()
if not IS_DUPLICATE:
    # Only the official Space logs library versions for debugging.
    import kokoro
    import misaki
    print('DEBUG', kokoro.__version__, CUDA_AVAILABLE, misaki.__version__)
# The official Space caps input length; duplicates are uncapped.
CHAR_LIMIT = None if IS_DUPLICATE else 5000
# One model per device: models[False] on CPU, models[True] on CUDA (built only if available).
models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
# One G2P pipeline per language code: 'a' = American English, 'b' = British English.
# model=False: pipelines do text processing only; synthesis goes through `models`.
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
# Pin the pronunciation of the word "kokoro" in each accent's lexicon.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
def forward_gpu(ps, ref_s, speed):
    # Run inference on the CUDA-resident model (models[True]).
    # NOTE(review): on ZeroGPU Spaces this wrapper is typically decorated with
    # @spaces.GPU (the `spaces` import suggests so) — confirm whether the
    # decorator was dropped intentionally.
    return models[True](ps, ref_s, speed)
def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Stream synthesized speech for *text*, one audio chunk per text segment.

    Args:
        text: Input text; stripped and truncated when CHAR_LIMIT is active.
        voice: Kokoro voice id; its first letter selects the language pipeline
            ('a' = American English, 'b' = British English).
        speed: Playback-speed multiplier passed through to the model.
        use_gpu: Prefer the CUDA model; ignored when CUDA is unavailable.

    Yields:
        (sample_rate, samples) tuples — 24 kHz numpy audio, one per segment.
    """
    text = text if CHAR_LIMIT is None else text.strip()[:CHAR_LIMIT]
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    for _, ps, _ in pipeline(text, voice, speed):
        # Reference style vector selected by phoneme-sequence length.
        ref_s = pack[len(ps)-1]
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if use_gpu:
                # GPU-side failure (e.g. quota): warn and retry this segment on CPU.
                gr.Warning(str(e))
                gr.Info('Switching to CPU')
                audio = models[False](ps, ref_s, speed)
            else:
                # Re-raise with the message string and keep the cause chained.
                raise gr.Error(str(e)) from e
        yield 24000, audio.numpy()
# Dropdown label -> Kokoro voice id. The id prefix encodes accent and gender:
# 'a' = American, 'b' = British; 'f' = female, 'm' = male.
_VOICES = [
    ('[US-Female] Heart', 'af_heart'),
    ('[US-Female] Bella', 'af_bella'),
    ('[US-Female] Nicole', 'af_nicole'),
    ('[US-Female] Aoede', 'af_aoede'),
    ('[US-Female] Kore', 'af_kore'),
    ('[US-Female] Sarah', 'af_sarah'),
    ('[US-Female] Nova', 'af_nova'),
    ('[US-Female] Sky', 'af_sky'),
    ('[US-Female] Alloy', 'af_alloy'),
    ('[US-Female] Jessica', 'af_jessica'),
    ('[US-Female] River', 'af_river'),
    ('[US-Male] Michael', 'am_michael'),
    ('[US-Male] Fenrir', 'am_fenrir'),
    ('[US-Male] Puck', 'am_puck'),
    ('[US-Male] Echo', 'am_echo'),
    ('[US-Male] Eric', 'am_eric'),
    ('[US-Male] Liam', 'am_liam'),
    ('[US-Male] Onyx', 'am_onyx'),
    ('[US-Male] Santa', 'am_santa'),
    ('[US-Male] Adam', 'am_adam'),
    ('[UK-Female] Emma', 'bf_emma'),
    ('[UK-Female] Isabella', 'bf_isabella'),
    ('[UK-Female] Alice', 'bf_alice'),
    ('[UK-Female] Lily', 'bf_lily'),
    ('[UK-Male] George', 'bm_george'),
    ('[UK-Male] Fable', 'bm_fable'),
    ('[UK-Male] Lewis', 'bm_lewis'),
    ('[UK-Male] Daniel', 'bm_daniel'),
]
# dict() preserves insertion order, so the dropdown order is unchanged.
CHOICES = dict(_VOICES)
# Warm the voice cache for every selectable voice so the first request
# doesn't pay the load cost.
for voice_id in CHOICES.values():
    pipelines[voice_id[0]].load_voice(voice_id)

# Expose the REST API only on unofficial deployments of this Space.
API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
API_NAME = False if not API_OPEN else None
def process_fn(text_input: str, speaker: str):
    """HARP processing endpoint: synthesize *text_input* with the chosen voice.

    Args:
        text_input: Text to speak.
        speaker: Display label from CHOICES (e.g. '[US-Female] Heart').

    Returns:
        Filepath of the rendered audio (via pyharp's save_audio).

    Raises:
        gr.Error: If synthesis produced no audio (e.g. empty input).
    """
    voice_id = CHOICES[speaker]
    # Drain the streaming generator; all chunks share one sample rate.
    results = list(generate_all(text_input, voice_id, speed=1, use_gpu=True))
    if not results:
        # Empty/whitespace input yields no segments; fail with a clear message
        # instead of an IndexError.
        raise gr.Error('No audio was generated — please enter some text.')
    fs = results[0][0]
    wav = np.concatenate([chunk for _, chunk in results])
    sig = AudioSignal(wav.astype("float32"), sample_rate=fs)
    return save_audio(sig)
# Gradio UI, registered as a HARP endpoint via pyharp's build_endpoint.
with gr.Blocks() as app:
    gr.Markdown("## 💬 Kokoro Text To Speech")
    # Inputs
    # NOTE(review): .harp_required() is added by pyharp, not stock Gradio — confirm.
    text_input = gr.Textbox(
        label="Text Input",
        info="Up to 5000 character text input. To get the best performance, please start a new line for each sentence."
    ).harp_required(True)
    speaker_dropdown = gr.Dropdown(
        list(CHOICES.keys()), value='[US-Female] Heart', label='Voice', info='US and UK accented male and female voices available'
    )
    # Outputs
    output_wav = gr.Audio(
        type="filepath",
        label="Synthesized Speech"
    )
    # Wires process_fn to the components above as the HARP processing endpoint.
    _ = build_endpoint(
        model_card=model_card,
        input_components=[
            text_input,
            # language_dropdown,
            speaker_dropdown
        ],
        output_components=[
            output_wav
        ],
        process_fn=process_fn
    )
if __name__ == '__main__':
    # Queueing is required for generator-based streaming outputs;
    # API visibility mirrors API_OPEN (hidden on the official Space).
    app.queue(api_open=API_OPEN).launch(show_api=API_OPEN, ssr_mode=True)