# Kokoro-TTS / app.py
# Author: Huiran Yu — "Update speaker description" (commit 6c81c71)
import spaces
from kokoro import KModel, KPipeline
import gradio as gr
import os
import random
import torch
import numpy as np
from pyharp.core import ModelCard, build_endpoint
from pyharp.media.audio import save_audio
from audiotools import AudioSignal
# PyHARP model card: metadata describing this endpoint to HARP clients.
model_card = ModelCard(
    name="Kokoro Text To Speech",
    description=("Kokoro is an open-weight TTS model with 82 million parameters.\n"
                 "Despite its lightweight architecture, it delivers comparable quality to larger models "
                 "while being significantly faster and more cost-efficient."),
    author="Yinghao Aaron Li, Cong Han, Vinay S. Raghavan, Gavin Mischler, Nima Mesgarani (StyleTTS2)",
    tags=["tts"]
)
# True when running outside the official 'hexgrad/' Space (i.e. a user duplicate).
IS_DUPLICATE = not os.getenv('SPACE_ID', '').startswith('hexgrad/')
CUDA_AVAILABLE = torch.cuda.is_available()
if not IS_DUPLICATE:
    # Official Space only: log library versions for debugging.
    import kokoro
    import misaki
    print('DEBUG', kokoro.__version__, CUDA_AVAILABLE, misaki.__version__)
# Duplicates get unlimited input; the official Space caps text at 5000 characters.
CHAR_LIMIT = None if IS_DUPLICATE else 5000
# Always build a CPU model; add a GPU-resident one only when CUDA is available.
# Keyed by bool: models[False] = CPU, models[True] = GPU.
models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
# One G2P pipeline per language code ('a' = US English, 'b' = UK English,
# matching the voice-id prefixes in CHOICES); model=False -> text processing only.
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
# Pin the pronunciation of the model's own name in both lexicons.
pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
@spaces.GPU(duration=30)
def forward_gpu(ps, ref_s, speed):
    """Synthesize one segment on the GPU model inside a ZeroGPU time slice."""
    gpu_model = models[True]
    return gpu_model(ps, ref_s, speed)
def generate_all(text, voice='af_heart', speed=1, use_gpu=CUDA_AVAILABLE):
    """Stream synthesized speech for ``text``, one chunk per pipeline segment.

    Args:
        text: Input text; stripped and truncated to CHAR_LIMIT when a limit is set.
        voice: Kokoro voice id; its first character selects the language pipeline
            ('a' = US English, 'b' = UK English).
        speed: Speed multiplier forwarded to the model.
        use_gpu: Request GPU inference; forced off when CUDA is unavailable.

    Yields:
        (24000, numpy.ndarray) tuples — sample rate and waveform for each segment.

    Raises:
        gr.exceptions.Error: Propagated from CPU inference (no fallback path).
    """
    if CHAR_LIMIT is not None:
        text = text.strip()[:CHAR_LIMIT]
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    for _, ps, _ in pipeline(text, voice, speed):
        # Reference style vector is selected by phoneme-sequence length.
        ref_s = pack[len(ps) - 1]
        try:
            if use_gpu:
                audio = forward_gpu(ps, ref_s, speed)
            else:
                audio = models[False](ps, ref_s, speed)
        except gr.exceptions.Error as e:
            if not use_gpu:
                # CPU path has no fallback: re-raise the original error unchanged
                # (previously re-wrapped as gr.Error(e), which mangled the message).
                raise
            # GPU failure (e.g. ZeroGPU quota): warn and retry this segment on CPU.
            gr.Warning(str(e))
            gr.Info('Switching to CPU')
            audio = models[False](ps, ref_s, speed)
        yield 24000, audio.numpy()
# Dropdown display name -> Kokoro voice id. The id prefix encodes accent and
# gender: 'a' = American / 'b' = British, then 'f' = female / 'm' = male.
CHOICES = {
    '[US-Female] Heart': 'af_heart',
    '[US-Female] Bella': 'af_bella',
    '[US-Female] Nicole': 'af_nicole',
    '[US-Female] Aoede': 'af_aoede',
    '[US-Female] Kore': 'af_kore',
    '[US-Female] Sarah': 'af_sarah',
    '[US-Female] Nova': 'af_nova',
    '[US-Female] Sky': 'af_sky',
    '[US-Female] Alloy': 'af_alloy',
    '[US-Female] Jessica': 'af_jessica',
    '[US-Female] River': 'af_river',
    '[US-Male] Michael': 'am_michael',
    '[US-Male] Fenrir': 'am_fenrir',
    '[US-Male] Puck': 'am_puck',
    '[US-Male] Echo': 'am_echo',
    '[US-Male] Eric': 'am_eric',
    '[US-Male] Liam': 'am_liam',
    '[US-Male] Onyx': 'am_onyx',
    '[US-Male] Santa': 'am_santa',
    '[US-Male] Adam': 'am_adam',
    '[UK-Female] Emma': 'bf_emma',
    '[UK-Female] Isabella': 'bf_isabella',
    '[UK-Female] Alice': 'bf_alice',
    '[UK-Female] Lily': 'bf_lily',
    '[UK-Male] George': 'bm_george',
    '[UK-Male] Fable': 'bm_fable',
    '[UK-Male] Lewis': 'bm_lewis',
    '[UK-Male] Daniel': 'bm_daniel',
}
# Pre-load every voice pack at startup so first requests don't pay the cost.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
# Open the HTTP API on duplicates only; the official Space keeps it closed.
API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
API_NAME = None if API_OPEN else False
def process_fn(text_input: str, speaker: str):
    """HARP endpoint: synthesize ``text_input`` with the chosen speaker.

    Args:
        text_input: Text to synthesize.
        speaker: Display name from CHOICES (e.g. '[US-Female] Heart').

    Returns:
        Filepath of the rendered audio (via pyharp's save_audio).

    Raises:
        gr.Error: If synthesis yielded no audio segments.
        KeyError: If ``speaker`` is not a known display name.
    """
    voice = CHOICES[speaker]
    results = list(generate_all(text_input, voice, speed=1, use_gpu=True))
    # Guard the indexing below: an empty/whitespace input can yield no segments,
    # which previously surfaced as a bare IndexError.
    if not results:
        raise gr.Error('No audio was generated — please provide non-empty text.')
    fs = results[0][0]
    wav = np.concatenate([chunk for _, chunk in results])
    sig = AudioSignal(wav.astype("float32"), sample_rate=fs)
    return save_audio(sig)
# Gradio UI + HARP endpoint wiring. build_endpoint binds process_fn to the
# declared input/output components so HARP hosts can invoke it remotely.
with gr.Blocks() as app:
    gr.Markdown("## 💬 Kokoro Text To Speech")
    # Inputs
    text_input = gr.Textbox(
        label="Text Input",
        info="Up to 5000 character text input. To get the best performance, please start a new line for each sentence."
    ).harp_required(True)  # pyharp extension: mark this field mandatory for HARP clients
    speaker_dropdown = gr.Dropdown(
        list(CHOICES.keys()), value='[US-Female] Heart', label='Voice', info='US and UK accented male and female voices available'
    )
    # Outputs
    output_wav = gr.Audio(
        type="filepath",  # process_fn returns a path from save_audio, not raw samples
        label="Synthesized Speech"
    )
    _ = build_endpoint(
        model_card=model_card,
        input_components=[
            text_input,
            # language_dropdown,
            speaker_dropdown
        ],
        output_components=[
            output_wav
        ],
        process_fn=process_fn
    )
if __name__ == '__main__':
    # Queue requests and expose/hide the HTTP API according to API_OPEN.
    app.queue(api_open=API_OPEN).launch(show_api=API_OPEN, ssr_mode=True)