# Kokoro-TTS / app.py
# Author: Huiran Yu — "Update speaker description" (commit 6c81c71)
import spaces
from kokoro import KModel, KPipeline
import gradio as gr
import os
import random
import torch
import numpy as np
from pyharp.core import ModelCard, build_endpoint
from pyharp.media.audio import save_audio
from audiotools import AudioSignal
# PyHARP model card: metadata describing this endpoint to HARP clients.
model_card = ModelCard(
    name="Kokoro Text To Speech",
    description=("Kokoro is an open-weight TTS model with 82 million parameters.\n"
                 "Despite its lightweight architecture, it delivers comparable quality to larger models "
                 "while being significantly faster and more cost-efficient."),
    author="Yinghao Aaron Li, Cong Han, Vinay S. Raghavan, Gavin Mischler, Nima Mesgarani (StyleTTS2)",
    tags=["tts"]
)
# True when running outside the official 'hexgrad/' Space (i.e. a user duplicate).
IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
CUDA_AVAILABLE = torch.cuda.is_available()
if not IS_DUPLICATE:
    # Official Space only: log library versions for debugging.
    import kokoro
    import misaki
    print('DEBUG', kokoro.__version__, CUDA_AVAILABLE, misaki.__version__)
# Duplicates get unlimited input; the official Space caps text at 5000 characters.
CHAR_LIMIT = None if IS_DUPLICATE else 5000
# Always build a CPU model; add a GPU-resident one only when CUDA is available.
# Keyed by bool: models[False] = CPU, models[True] = GPU.
models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
# One G2P pipeline per language code ('a' = US English, 'b' = UK English,
# matching the voice-id prefixes in CHOICES); model=False -> text processing only.
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
# Pin the pronunciation of the model's own name in both lexicons.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
@spaces.GPU(duration=30)
def forward_gpu(ps, ref_s, speed):
    """Synthesize one segment on the GPU model inside a ZeroGPU time slice."""
    gpu_model = models[True]
    return gpu_model(ps, ref_s, speed)
def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Stream synthesized speech for ``text``, one chunk per pipeline segment.

    Args:
        text: Input text; stripped and truncated to CHAR_LIMIT when a limit is set.
        voice: Kokoro voice id; its first character selects the language pipeline
            ('a' = US English, 'b' = UK English).
        speed: Speed multiplier forwarded to the model.
        use_gpu: Request GPU inference; forced off when CUDA is unavailable.

    Yields:
        (24000, numpy.ndarray) tuples — sample rate and waveform for each segment.

    Raises:
        gr.exceptions.Error: Propagated from CPU inference (no fallback path).
    """
    if CHAR_LIMIT is not None:
        text = text.strip()[:CHAR_LIMIT]
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    for _, ps, _ in pipeline(text, voice, speed):
        # Reference style vector is selected by phoneme-sequence length.
        ref_s = pack[len(ps) - 1]
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if not use_gpu:
                # CPU path has no fallback: re-raise the original error unchanged
                # (previously re-wrapped as gr.Error(e), which mangled the message).
                raise
            # GPU failure (e.g. ZeroGPU quota): warn and retry this segment on CPU.
            gr.Warning(str(e))
            gr.Info('Switching to CPU')
            audio = models[False](ps, ref_s, speed)
        yield 24000, audio.numpy()
# Dropdown display name -> Kokoro voice id. The id prefix encodes accent and
# gender: 'a' = American / 'b' = British, then 'f' = female / 'm' = male.
CHOICES = {
    '[US-Female] Heart': 'af_heart',
    '[US-Female] Bella': 'af_bella',
    '[US-Female] Nicole': 'af_nicole',
    '[US-Female] Aoede': 'af_aoede',
    '[US-Female] Kore': 'af_kore',
    '[US-Female] Sarah': 'af_sarah',
    '[US-Female] Nova': 'af_nova',
    '[US-Female] Sky': 'af_sky',
    '[US-Female] Alloy': 'af_alloy',
    '[US-Female] Jessica': 'af_jessica',
    '[US-Female] River': 'af_river',
    '[US-Male] Michael': 'am_michael',
    '[US-Male] Fenrir': 'am_fenrir',
    '[US-Male] Puck': 'am_puck',
    '[US-Male] Echo': 'am_echo',
    '[US-Male] Eric': 'am_eric',
    '[US-Male] Liam': 'am_liam',
    '[US-Male] Onyx': 'am_onyx',
    '[US-Male] Santa': 'am_santa',
    '[US-Male] Adam': 'am_adam',
    '[UK-Female] Emma': 'bf_emma',
    '[UK-Female] Isabella': 'bf_isabella',
    '[UK-Female] Alice': 'bf_alice',
    '[UK-Female] Lily': 'bf_lily',
    '[UK-Male] George': 'bm_george',
    '[UK-Male] Fable': 'bm_fable',
    '[UK-Male] Lewis': 'bm_lewis',
    '[UK-Male] Daniel': 'bm_daniel',
}
# Pre-load every voice pack at startup so first requests don't pay the cost.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
# Open the HTTP API on duplicates only; the official Space keeps it closed.
API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
API_NAME = None if API_OPEN else False
def process_fn(text_input: str, speaker: str):
    """HARP endpoint: synthesize ``text_input`` with the chosen speaker.

    Args:
        text_input: Text to synthesize.
        speaker: Display name from CHOICES (e.g. '[US-Female] Heart').

    Returns:
        Filepath of the rendered audio (via pyharp's save_audio).

    Raises:
        gr.Error: If synthesis yielded no audio segments.
        KeyError: If ``speaker`` is not a known display name.
    """
    voice = CHOICES[speaker]
    results = list(generate_all(text_input, voice, speed=1, use_gpu=True))
    # Guard the indexing below: an empty/whitespace input can yield no segments,
    # which previously surfaced as a bare IndexError.
    if not results:
        raise gr.Error('No audio was generated — please provide non-empty text.')
    fs = results[0][0]
    wav = np.concatenate([chunk for _, chunk in results])
    sig = AudioSignal(wav.astype("float32"), sample_rate=fs)
    return save_audio(sig)
# Gradio UI + HARP endpoint wiring. build_endpoint binds process_fn to the
# declared input/output components so HARP hosts can invoke it remotely.
with gr.Blocks() as app:
    gr.Markdown("## 💬 Kokoro Text To Speech")
    # Inputs
    text_input = gr.Textbox(
        label="Text Input",
        info="Up to 5000 character text input. To get the best performance, please start a new line for each sentence."
    ).harp_required(True)  # pyharp extension: mark this field mandatory for HARP clients
    speaker_dropdown = gr.Dropdown(
        list(CHOICES.keys()), value='[US-Female] Heart', label='Voice', info='US and UK accented male and female voices available'
    )
    # Outputs
    output_wav = gr.Audio(
        type="filepath",  # process_fn returns a path from save_audio, not raw samples
        label="Synthesized Speech"
    )
    _ = build_endpoint(
        model_card=model_card,
        input_components=[
            text_input,
            # language_dropdown,
            speaker_dropdown
        ],
        output_components=[
            output_wav
        ],
        process_fn=process_fn
    )
if __name__ == '__main__':
    # Queue requests and expose/hide the HTTP API according to API_OPEN.
    app.queue(api_open=API_OPEN).launch(show_api=API_OPEN, ssr_mode=True)