Kokoro-es-TTS

Running

App Files Files Community

Kokoro-es-TTS / app.py

igortamara

bugfix - Spanish does not have lexicon

2a7a09c 10 months ago

raw

history blame contribute delete

7.69 kB

	import spaces
	from kokoro import KModel, KPipeline
	import gradio as gr
	import os
	import random
	import torch

	# The Space counts as a duplicate unless SPACE_ID sits under the 'igortamara/' namespace.
	IS_DUPLICATE = not os.environ.get('SPACE_ID', '').startswith('igortamara/')
	CUDA_AVAILABLE = torch.cuda.is_available()
	if not IS_DUPLICATE:
	    # Version banner, printed only when running as the non-duplicate Space.
	    import kokoro
	    import misaki
	    print('DEBUG', kokoro.__version__, CUDA_AVAILABLE, misaki.__version__)

	# Cap on input characters; None (no cap) when running as a duplicate.
	CHAR_LIMIT = 5000 if not IS_DUPLICATE else None
	# Models keyed by "runs on GPU": the CPU model always exists, the CUDA one only when available.
	models = {False: KModel().to('cpu').eval()}
	if CUDA_AVAILABLE:
	    models[True] = KModel().to('cuda').eval()
	# Pipelines keyed by language code; only 'e' is configured (presumably Spanish, given the
	# 'ef_'/'em_' voice ids used in this file -- confirm against the kokoro docs).
	pipelines = {'e': KPipeline(lang_code='e', model=False)}

	@spaces.GPU(duration=30)
	def forward_gpu(ps, ref_s, speed):
	    """Run one segment through the CUDA model.

	    The arguments are forwarded unchanged to ``models[True]`` and its result
	    is returned as-is. The function is wrapped by ``spaces.GPU(duration=30)``.
	    """
	    gpu_model = models[True]
	    return gpu_model(ps, ref_s, speed)

	def generate_first(text, voice='ef_dora', speed=1, use_gpu=CUDA_AVAILABLE):
	    """Synthesize audio for only the first segment the pipeline yields.

	    Args:
	        text: Input text; stripped and cut to ``CHAR_LIMIT`` characters when a
	            limit is set, otherwise used untouched.
	        voice: Voice id; its first character selects the pipeline.
	        speed: Speed value handed to both the pipeline and the model.
	        use_gpu: Ask for the GPU model; has no effect when CUDA is unavailable.

	    Returns:
	        ``((24000, audio_array), ps)`` for the first segment, where ``ps`` is
	        the middle element of the pipeline result, or ``(None, '')`` when the
	        pipeline yields nothing. 24000 is presumably the sample rate in Hz --
	        confirm against the model.

	    Raises:
	        gr.Error: When the CPU path raises a Gradio error. On the GPU path the
	            error is surfaced as a warning and the segment is retried on CPU.
	    """
	    if CHAR_LIMIT is not None:
	        text = text.strip()[:CHAR_LIMIT]
	    pipeline = pipelines[voice[0]]
	    pack = pipeline.load_voice(voice)
	    on_gpu = use_gpu and CUDA_AVAILABLE

	    # Only the first segment is needed; later segments are never requested.
	    first_segment = next(iter(pipeline(text, voice, speed)), None)
	    if first_segment is None:
	        return None, ''
	    _, ps, _ = first_segment

	    # The voice pack is indexed by the length of ``ps`` minus one.
	    ref_s = pack[len(ps) - 1]
	    try:
	        audio = forward_gpu(ps, ref_s, speed) if on_gpu else models[False](ps, ref_s, speed)
	    except gr.exceptions.Error as e:
	        if not on_gpu:
	            raise gr.Error(e)
	        # GPU attempt failed: tell the user and retry the same segment on CPU.
	        gr.Warning(str(e))
	        gr.Info('Intentando con CPU. Para evitar este error, cambie el Hardware a CPU.')
	        audio = models[False](ps, ref_s, speed)
	    return (24000, audio.numpy()), ps

	# Arena API
	def predict(text, voice='ef_dora', speed=1):
	    """Return only the audio part of ``generate_first``, always called with ``use_gpu=False``."""
	    audio, _ = generate_first(text, voice, speed, use_gpu=False)
	    return audio

	def tokenize_first(text, voice='ef_dora'):
	    """Return ``ps`` for the first segment the pipeline yields, without synthesizing audio.

	    The pipeline is chosen by the first character of ``voice``. An empty
	    string is returned when the pipeline yields no segment.
	    """
	    pipeline = pipelines[voice[0]]
	    first_segment = next(iter(pipeline(text, voice)), None)
	    if first_segment is None:
	        return ''
	    _, ps, _ = first_segment
	    return ps

	def generate_all(text, voice='ef_dora', speed=1, use_gpu=CUDA_AVAILABLE):
	    """Stream audio for every segment the pipeline yields.

	    Args:
	        text: Input text; stripped and cut to ``CHAR_LIMIT`` characters when a
	            limit is set, otherwise used untouched.
	        voice: Voice id; its first character selects the pipeline.
	        speed: Speed value handed to both the pipeline and the model.
	        use_gpu: Ask for the GPU model; has no effect when CUDA is unavailable.

	    Yields:
	        ``(24000, audio_array)`` per segment. Right after the first segment an
	        extra one-sample zero chunk is yielded (presumably a workaround for
	        the first-click streaming issue mentioned in the UI note -- confirm).

	    Raises:
	        gr.Error: When the CPU path raises a Gradio error. On the GPU path the
	            error is surfaced as a warning and the segment is retried on CPU.
	    """
	    if CHAR_LIMIT is not None:
	        text = text.strip()[:CHAR_LIMIT]
	    pipeline = pipelines[voice[0]]
	    pack = pipeline.load_voice(voice)
	    on_gpu = use_gpu and CUDA_AVAILABLE

	    for index, (_, ps, _) in enumerate(pipeline(text, voice, speed)):
	        # The voice pack is indexed by the length of ``ps`` minus one.
	        ref_s = pack[len(ps) - 1]
	        try:
	            audio = forward_gpu(ps, ref_s, speed) if on_gpu else models[False](ps, ref_s, speed)
	        except gr.exceptions.Error as e:
	            if not on_gpu:
	                raise gr.Error(e)
	            # GPU attempt failed: tell the user and retry the same segment on CPU.
	            gr.Warning(str(e))
	            gr.Info('Cambiando a CPU')
	            audio = models[False](ps, ref_s, speed)
	        yield 24000, audio.numpy()
	        if index == 0:
	            yield 24000, torch.zeros(1).numpy()

	# Quote pool loaded once at import time: one quote per line of es.txt.
	# The encoding is pinned to UTF-8 so accented Spanish text does not depend on
	# the platform's default locale encoding, and blank lines are skipped so the
	# pool never contains an empty quote.
	with open('es.txt', 'r', encoding='utf-8') as r:
	    random_quotes = [quote for quote in (line.strip() for line in r) if quote]

	def get_random_quote():
	    """Return one entry of ``random_quotes`` picked with ``random.choice``."""
	    quote = random.choice(random_quotes)
	    return quote

	def get_gatsby():
	    """Return the contents of ``gatsby5k.md`` with surrounding whitespace stripped.

	    The file is read on every call. It is decoded explicitly as UTF-8 so the
	    result does not depend on the platform's default locale encoding.
	    """
	    with open('gatsby5k.md', 'r', encoding='utf-8') as r:
	        return r.read().strip()

	def get_frankenstein():
	    """Return the contents of ``frankenstein5k.md`` with surrounding whitespace stripped.

	    The file is read on every call. It is decoded explicitly as UTF-8 so the
	    result does not depend on the platform's default locale encoding.
	    """
	    with open('frankenstein5k.md', 'r', encoding='utf-8') as r:
	        return r.read().strip()

	# Dropdown entries: display label -> voice id. The first character of each id
	# is the key used to look up its pipeline in ``pipelines``.
	CHOICES = {
	    '🇪🇸 🚺 Dora ❤️': 'ef_dora',
	    '🇪🇸 🚹 Alex': 'em_alex',
	    '🇪🇸 🚹 Santa': 'em_santa',
	}
	# Call load_voice for every listed voice at import time (presumably so the
	# voice data is already loaded before the first request -- confirm against
	# KPipeline.load_voice).
	for v in CHOICES.values():
	    pipelines[v[0]].load_voice(v)

	# Markdown help text rendered inside the tokens accordion of the "Generar" tab.
	TOKEN_NOTE = '''
	💡 Ajusta la pronunciación con la sintaxis de enlace de Markdown y /barras diagonales/ así `[Kokoro](/kˈOkəɹO/)`

	💬 Para ajustar la entonación, usa puntuación `;:,.!?—…"()“”` o estrés `ˈ` y `ˌ`

	⬇️ Disminuye el estrés `[1 nivel](-1)` o `[2 niveles](-2)`

	⬆️ Incrementa un nivel `[o](+2)` 2 niveles (solo funciona en palabras menos estresadas, usualmente cortas)
	'''

	# "Generar" tab: a single audio output plus the tokens used to produce it.
	with gr.Blocks() as generate_tab:
	    out_audio = gr.Audio(
	        label='Audio resultante',
	        interactive=False,
	        streaming=False,
	        autoplay=True,
	    )
	    generate_btn = gr.Button('Generar', variant='primary')
	    with gr.Accordion('Tokens generados', open=True):
	        out_ps = gr.Textbox(
	            interactive=False,
	            show_label=False,
	            info='Tokens usados para generar el audio, contexto de máximo 510.',
	        )
	        tokenize_btn = gr.Button('Tokenizar', variant='secondary')
	        gr.Markdown(TOKEN_NOTE)
	        # Hidden button; it only exists so an event handler can be attached to it.
	        predict_btn = gr.Button('Predecir', variant='secondary', visible=False)

	# Notes for the "Stream" tab. The character-limit lines are added only when a
	# limit is active; the paragraphs are then joined with blank lines for Markdown.
	STREAM_NOTE = ['⚠️ Gradio tiene un bug que puede no generar ningún audio la primera vez que hagas clic en `Stream`.']
	if CHAR_LIMIT is not None:
	    STREAM_NOTE += [
	        f'✂️ Cada stream se limita a {CHAR_LIMIT} caracteres.',
	        '🚀 ¿Quieres más caracteres? Puedes [usar Kokoro directamente](https://huggingface.co/hexgrad/Kokoro-82M#usage) o duplicar este espacio:',
	    ]
	STREAM_NOTE = '\n\n'.join(STREAM_NOTE)

	# "Stream" tab: streaming audio output with start/stop controls and the note.
	with gr.Blocks() as stream_tab:
	    out_stream = gr.Audio(
	        label='Stream de audio generado',
	        interactive=False,
	        streaming=True,
	        autoplay=True,
	    )
	    with gr.Row():
	        stream_btn = gr.Button('Stream', variant='primary')
	        stop_btn = gr.Button('Detener', variant='stop')
	    with gr.Accordion('Nota', open=True):
	        gr.Markdown(STREAM_NOTE)
	        gr.DuplicateButton()

	# Markdown banner shown at the top of the app.
	BANNER_TEXT = '''
	[*Kokoro* es un modelo de TTS de peso abierto con 82 millones de parámetros.](https://huggingface.co/hexgrad/Kokoro-82M)

	Este demo solo muestra español, puedes encontrar el [original](https://huggingface.co/spaces/hexgrad/Kokoro-TTS) o usarlo directamente para contar con otros idiomas.
	'''
	# The API is closed only when SPACE_ID equals 'hexgrad/Kokoro-TTS'.
	# NOTE(review): the duplicate check in this file uses the 'igortamara/' namespace,
	# so this comparison is presumably always True for this Space -- confirm intent.
	API_OPEN = os.getenv('SPACE_ID') != 'hexgrad/Kokoro-TTS'
	# api_name passed to every event handler: None when the API is open, False otherwise.
	API_NAME = False if not API_OPEN else None

	with gr.Blocks() as app:
	    with gr.Row():
	        gr.Markdown(BANNER_TEXT, container=True)
	    with gr.Row():
	        # Left column: text input and synthesis controls.
	        with gr.Column():
	            text = gr.Textbox(
	                label='Texto a leer',
	                info=f"Máximo ~500 caracteres para «generar», o {'∞' if CHAR_LIMIT is None else CHAR_LIMIT} caracteres usando «Stream»",
	            )
	            with gr.Row():
	                voice = gr.Dropdown(
	                    list(CHOICES.items()),
	                    value='ef_dora',
	                    label='Voz',
	                    info='La calidad y disponibilidad varían por idioma',
	                )
	                use_gpu = gr.Dropdown(
	                    [('ZeroGPU 🚀', True), ('CPU 🐌', False)],
	                    value=CUDA_AVAILABLE,
	                    label='Hardware',
	                    info='La GPU usualmente es más rápida, pero tiene quota de uso',
	                    interactive=CUDA_AVAILABLE,
	                )
	            speed = gr.Slider(
	                minimum=0.5,
	                maximum=2,
	                value=1,
	                step=0.1,
	                label='Velocidad',
	            )
	            random_btn = gr.Button('🎲 Cita aleatoria 💬', variant='secondary')
	            with gr.Row():
	                gatsby_btn = gr.Button('🥂 Gatsby 📕', variant='secondary')
	                frankenstein_btn = gr.Button('💀 Frankenstein 📗', variant='secondary')
	        # Right column: the two output tabs built earlier in the file.
	        with gr.Column():
	            gr.TabbedInterface([generate_tab, stream_tab], ['Generar', 'Stream'])

	    # Sample-text buttons fill the input textbox.
	    random_btn.click(
	        fn=get_random_quote,
	        inputs=[],
	        outputs=[text],
	        api_name=API_NAME,
	    )
	    gatsby_btn.click(
	        fn=get_gatsby,
	        inputs=[],
	        outputs=[text],
	        api_name=API_NAME,
	    )
	    frankenstein_btn.click(
	        fn=get_frankenstein,
	        inputs=[],
	        outputs=[text],
	        api_name=API_NAME,
	    )
	    # Synthesis and tokenization handlers.
	    generate_btn.click(
	        fn=generate_first,
	        inputs=[text, voice, speed, use_gpu],
	        outputs=[out_audio, out_ps],
	        api_name=API_NAME,
	    )
	    tokenize_btn.click(
	        fn=tokenize_first,
	        inputs=[text, voice],
	        outputs=[out_ps],
	        api_name=API_NAME,
	    )
	    stream_event = stream_btn.click(
	        fn=generate_all,
	        inputs=[text, voice, speed, use_gpu],
	        outputs=[out_stream],
	        api_name=API_NAME,
	    )
	    # The stop button runs no function; it only cancels the running stream event.
	    stop_btn.click(fn=None, cancels=stream_event)
	    predict_btn.click(
	        fn=predict,
	        inputs=[text, voice, speed],
	        outputs=[out_audio],
	        api_name=API_NAME,
	    )

	# Script entry point: enable the queue, then launch the app.
	if __name__ == '__main__':
	    queued_app = app.queue(api_open=API_OPEN)
	    queued_app.launch(show_api=API_OPEN, ssr_mode=True)