# kokoro-tts-api / main.py
# (Hugging Face Space file-viewer header preserved as comments:
#  eder0782 — "Update main.py (#14)" — commit 0355a1c verified)
import asyncio
import base64
import io
import time
import uuid
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import torch
import soundfile as sf
import numpy as np
from kokoro import KModel, KPipeline
import os

# Cache directory for Hugging Face model downloads.
# NOTE(review): this is set AFTER `from kokoro import ...` above; if the hub
# library reads the variable at import time this assignment may arrive too
# late to take effect — confirm.
os.environ['HUGGINGFACE_HUB_CACHE'] = '/app/models'

app = FastAPI()

# Whether a CUDA device is present; decides which model instances are built
# and which one generate_audio() tries first.
CUDA_AVAILABLE = torch.cuda.is_available()

try:
    # One KModel per device flag: models[False] always exists on CPU;
    # models[True] is added on GPU only when CUDA is available.
    models = {gpu: KModel(repo_id='hexgrad/Kokoro-82M').to('cuda' if gpu else 'cpu').eval() for gpu in [False] + ([True] if CUDA_AVAILABLE else [])}
except Exception as e:
    # "Erro ao inicializar o modelo" = "Error initializing the model" (pt-BR).
    print(f"Erro ao inicializar o modelo: {str(e)}")
    raise

# One phonemization pipeline per language code; model=False because inference
# goes through the shared `models` dict above. Codes 'a'/'p'/'e' match the
# voice-id prefixes in CHOICES (presumably en-US / pt-BR / Spanish — the flag
# emojis below suggest so; confirm against kokoro's lang_code docs).
pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in ['a', 'p', 'e']}
# Display label -> kokoro voice id. The id's first character is the language
# code used to pick a pipeline (see the preload loop below and
# generate_audio()). Insertion order is the order returned by GET /voices.
CHOICES = {
    'πŸ‡ΊπŸ‡Έ 🚺 Heart ❀️': 'af_heart',
    'πŸ‡ΊπŸ‡Έ 🚺 Alloy': 'af_alloy',
    'πŸ‡ΊπŸ‡Έ 🚺 Aoede': 'af_aoede',
    'πŸ‡ΊπŸ‡Έ 🚺 Bella πŸ”₯': 'af_bella',
    'πŸ‡ΊπŸ‡Έ 🚺 Jessica': 'af_jessica',
    'πŸ‡ΊπŸ‡Έ 🚺 Kore': 'af_kore',
    'πŸ‡ΊπŸ‡Έ 🚺 Nicole 🎧': 'af_nicole',
    'πŸ‡ΊπŸ‡Έ 🚺 Nova': 'af_nova',
    'πŸ‡ΊπŸ‡Έ 🚺 River': 'af_river',
    'πŸ‡ΊπŸ‡Έ 🚺 Sarah': 'af_sarah',
    'πŸ‡ΊπŸ‡Έ 🚺 Sky': 'af_sky',
    'πŸ‡ΊπŸ‡Έ 🚹 Adam': 'am_adam',
    'πŸ‡ΊπŸ‡Έ 🚹 Echo': 'am_echo',
    'πŸ‡ΊπŸ‡Έ 🚹 Eric': 'am_eric',
    'πŸ‡ΊπŸ‡Έ 🚹 Fenrir': 'am_fenrir',
    'πŸ‡ΊπŸ‡Έ 🚹 Liam': 'am_liam',
    'πŸ‡ΊπŸ‡Έ 🚹 Michael': 'am_michael',
    'πŸ‡ΊπŸ‡Έ 🚹 Onyx': 'am_onyx',
    'πŸ‡ΊπŸ‡Έ 🚹 Puck': 'am_puck',
    'πŸ‡ΊπŸ‡Έ 🚹 Santa': 'am_santa',
    'πŸ‡§πŸ‡· 🚺 Dora': 'pf_dora',
    'πŸ‡§πŸ‡· 🚹 Alex': 'pm_alex',
    'πŸ‡§πŸ‡· 🚹 Santa': 'pm_santa',
    'πŸ‡ͺπŸ‡Έ 🚺 Dora': 'ef_dora',
    'πŸ‡ͺπŸ‡Έ 🚹 Alex': 'em_alex',
    'πŸ‡ͺπŸ‡Έ 🚹 Santa': 'em_santa',
}

# Pre-load every voice into its language pipeline at startup so the first
# request does not pay the load cost; v[0] is the language-code prefix.
for v in CHOICES.values():
    pipelines[v[0]].load_voice(v)
class PredictRequest(BaseModel):
    """Request body for POST /predict."""
    # Text to synthesize (required).
    text: str
    # Kokoro voice id; must be one of the values in CHOICES (see GET /voices).
    voice: str = 'af_heart'
    # Speed multiplier passed through to the pipeline and model.
    speed: float = 1.0
def generate_audio(text: str, voice: str, speed: float, use_gpu: bool = CUDA_AVAILABLE):
    """Synthesize `text` with the given kokoro voice.

    Args:
        text: Input text to synthesize.
        voice: Kokoro voice id (e.g. 'af_heart'); its first character selects
            the language pipeline.
        speed: Speed multiplier forwarded to the pipeline and model.
        use_gpu: Try the CUDA model first, falling back to CPU on failure.
            Ignored when CUDA is unavailable.

    Returns:
        (sample_rate, audio): (24000, np.ndarray) on success, or (None, None)
        when the pipeline yields no audio segments.
    """
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    audios = []
    for _, ps, _ in pipeline(text, voice, speed):
        # Reference style vector indexed by phoneme-sequence length
        # (kokoro voice packs are organized this way).
        ref_s = pack[len(ps) - 1]
        try:
            audio = models[use_gpu](ps, ref_s, speed)
        except Exception as e:
            if not use_gpu:
                raise
            # GPU inference failed (e.g. OOM); report it instead of silently
            # swallowing, then retry the same segment on CPU.
            print(f"GPU inference failed, retrying on CPU: {e}")
            audio = models[False](ps, ref_s, speed)
        # Bug fix: move the tensor to CPU before converting — a CUDA tensor
        # cannot be turned into a numpy array directly, so the GPU path would
        # raise here. detach()/cpu() are no-ops for a plain CPU tensor.
        audios.append(audio.detach().cpu().numpy())
    if not audios:
        # Bug fix: previously returned (None, '') — the empty string passed
        # the caller's `audio_data is None` check and crashed in sf.write.
        return None, None
    # Concatenate all generated segments into one waveform at 24 kHz.
    return 24000, np.concatenate(audios)
@app.post("/predict")
async def predict(request: PredictRequest):
    """Synthesize speech and return it as base64-encoded WAV.

    Returns:
        JSON with `audio_base64` (WAV bytes, base64), `duration_seconds`,
        and `generation_time_seconds`; or HTTP 400 when no audio was produced.
    """
    start_time = time.time()
    sample_rate, audio_data = generate_audio(request.text, request.voice, request.speed, use_gpu=CUDA_AVAILABLE)
    # Bug fix: generate_audio signals failure with sample_rate=None, but the
    # old check (`audio_data is None`) let an empty-string payload through to
    # sf.write and crashed. Testing sample_rate catches the failure sentinel.
    if sample_rate is None or audio_data is None:
        return JSONResponse(status_code=400, content={"error": "Failed to generate audio"})
    # Encode the waveform as WAV in memory, then base64 for JSON transport.
    buffer = io.BytesIO()
    sf.write(buffer, audio_data, sample_rate, format='WAV')
    buffer.seek(0)
    audio_base64 = base64.b64encode(buffer.read()).decode("utf-8")
    duration = len(audio_data) / sample_rate
    generation_time = time.time() - start_time
    return {
        "audio_base64": audio_base64,
        "duration_seconds": round(duration, 2),
        "generation_time_seconds": round(generation_time, 2)
    }
@app.get("/voices")
async def get_voices():
    """List every available voice as a {name, value} pair, in CHOICES order."""
    voices = []
    for label, voice_id in CHOICES.items():
        voices.append({"name": label, "value": voice_id})
    return voices
if __name__ == "__main__":
    # Standalone launch: serve on all interfaces, port 7860 (the port
    # conventionally used by Hugging Face Spaces).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)