# "Spaces: Paused" — Hugging Face Space status banner captured during page
# extraction; not part of the application source. Kept as a comment so the
# file remains valid Python.
import os

# Must be set BEFORE importing kokoro: huggingface_hub resolves its cache
# directory from this variable at import time, so setting it after the
# kokoro import (as the original code did) had no effect on where model
# weights are downloaded/stored.
os.environ['HUGGINGFACE_HUB_CACHE'] = '/app/models'

import asyncio
import base64
import io
import time
import uuid

import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from pydantic import BaseModel

from kokoro import KModel, KPipeline
app = FastAPI()

# Whether a CUDA device is present; also decides if a GPU model copy is built.
CUDA_AVAILABLE = torch.cuda.is_available()

# One synthesis model per device: key False -> CPU copy, key True -> GPU copy
# (the True entry exists only when CUDA is available).
try:
    models = {}
    for on_gpu in ([False, True] if CUDA_AVAILABLE else [False]):
        target_device = 'cuda' if on_gpu else 'cpu'
        models[on_gpu] = KModel(repo_id='hexgrad/Kokoro-82M').to(target_device).eval()
except Exception as e:
    print(f"Erro ao inicializar o modelo: {str(e)}")
    raise

# One text-processing pipeline per language code; the voice-id prefixes below
# ('af_/am_', 'pf_/pm_', 'ef_/em_') map onto these codes. model=False means the
# pipelines carry no model of their own — synthesis goes through `models` above.
pipelines = {}
for lang_code in ('a', 'p', 'e'):
    pipelines[lang_code] = KPipeline(lang_code=lang_code, model=False)
# Display-name -> voice-id map exposed by the voices listing and accepted by
# generate_audio. The first letter of each id selects the language pipeline
# ('a', 'p', 'e'); second letter appears to encode gender (f/m) — inferred
# from the id pattern, verify against the Kokoro voice catalog.
# NOTE(review): the display-name keys are mojibake (flag/gender emoji
# mis-decoded, e.g. 'πΊπΈ'). They are runtime dict keys, so they are kept
# byte-for-byte here; restore the original UTF-8 from upstream rather than
# hand-editing these bytes.
CHOICES = {
'πΊπΈ πΊ Heart β€οΈ': 'af_heart',
'πΊπΈ πΊ Alloy': 'af_alloy',
'πΊπΈ πΊ Aoede': 'af_aoede',
'πΊπΈ πΊ Bella π₯': 'af_bella',
'πΊπΈ πΊ Jessica': 'af_jessica',
'πΊπΈ πΊ Kore': 'af_kore',
'πΊπΈ πΊ Nicole π§': 'af_nicole',
'πΊπΈ πΊ Nova': 'af_nova',
'πΊπΈ πΊ River': 'af_river',
'πΊπΈ πΊ Sarah': 'af_sarah',
'πΊπΈ πΊ Sky': 'af_sky',
'πΊπΈ πΉ Adam': 'am_adam',
'πΊπΈ πΉ Echo': 'am_echo',
'πΊπΈ πΉ Eric': 'am_eric',
'πΊπΈ πΉ Fenrir': 'am_fenrir',
'πΊπΈ πΉ Liam': 'am_liam',
'πΊπΈ πΉ Michael': 'am_michael',
'πΊπΈ πΉ Onyx': 'am_onyx',
'πΊπΈ πΉ Puck': 'am_puck',
'πΊπΈ πΉ Santa': 'am_santa',
'π§π· πΊ Dora': 'pf_dora',
'π§π· πΉ Alex': 'pm_alex',
'π§π· πΉ Santa': 'pm_santa',
'πͺπΈ πΊ Dora': 'ef_dora',
'πͺπΈ πΉ Alex': 'em_alex',
'πͺπΈ πΉ Santa': 'em_santa',
}
# Eagerly load every selectable voice so the first request per voice does not
# pay the load cost.
for voice_id in CHOICES.values():
    lang_pipeline = pipelines[voice_id[0]]  # first letter of the id is the language code
    lang_pipeline.load_voice(voice_id)
class PredictRequest(BaseModel):
    """Request body for speech synthesis."""

    # Text to synthesize.
    text: str
    # Voice id — one of the values of CHOICES (e.g. 'af_heart').
    voice: str = 'af_heart'
    # Speech speed multiplier; 1.0 = normal speed.
    speed: float = 1.0
def generate_audio(text: str, voice: str, speed: float, use_gpu: bool = CUDA_AVAILABLE):
    """Synthesize `text` with the given Kokoro voice.

    Args:
        text: Input text; the pipeline may split it into several segments.
        voice: Voice id (a value of CHOICES); its first letter selects the
            language pipeline.
        speed: Speech speed multiplier.
        use_gpu: Prefer the GPU model copy; silently ignored when CUDA is
            unavailable.

    Returns:
        (sample_rate, waveform) — 24000 Hz and a 1-D float numpy array —
        or (None, None) when the pipeline produced no segments.
        BUG FIX: previously returned (None, '') on empty output, which broke
        callers testing the audio element against None.
    """
    pipeline = pipelines[voice[0]]
    pack = pipeline.load_voice(voice)
    use_gpu = use_gpu and CUDA_AVAILABLE
    segments = []
    for _, ps, _ in pipeline(text, voice, speed):
        # Reference style vector selected by phoneme-sequence length.
        ref_s = pack[len(ps) - 1]
        try:
            # models is keyed by bool: True -> GPU copy, False -> CPU copy.
            audio = models[use_gpu](ps, ref_s, speed)
        except Exception:
            if not use_gpu:
                # CPU path failed: nothing to fall back to. Bare raise
                # preserves the original traceback (was `raise e`).
                raise
            # GPU inference failed (e.g. OOM); best-effort retry on CPU.
            audio = models[False](ps, ref_s, speed)
        segments.append(audio.numpy())
    if not segments:
        return None, None
    # Concatenate all generated segments into one waveform.
    return 24000, np.concatenate(segments)
# NOTE(review): no @app.post(...) decorator is visible in this chunk, so this
# handler is never registered as a route here — confirm registration elsewhere.
async def predict(request: PredictRequest):
    """Synthesize request.text and return base64-encoded WAV plus timing stats.

    Returns a dict with audio_base64, duration_seconds and
    generation_time_seconds, or a 400 JSONResponse on failure.
    """
    start_time = time.time()
    sample_rate, audio_data = generate_audio(request.text, request.voice, request.speed, use_gpu=CUDA_AVAILABLE)
    # BUG FIX: generate_audio may signal failure as (None, '') — and '' is not
    # None, so the old `audio_data is None` check never fired and sf.write then
    # crashed on sample_rate=None. Checking both elements handles either sentinel.
    if sample_rate is None or audio_data is None:
        return JSONResponse(status_code=400, content={"error": "Failed to generate audio"})
    # Encode the waveform as WAV in memory, then base64 for JSON transport.
    buffer = io.BytesIO()
    sf.write(buffer, audio_data, sample_rate, format='WAV')
    buffer.seek(0)
    audio_base64 = base64.b64encode(buffer.read()).decode("utf-8")
    duration = len(audio_data) / sample_rate
    generation_time = time.time() - start_time
    return {
        "audio_base64": audio_base64,
        "duration_seconds": round(duration, 2),
        "generation_time_seconds": round(generation_time, 2)
    }
async def get_voices():
    """List selectable voices as [{'name': display_name, 'value': voice_id}, ...]."""
    # NOTE(review): no route decorator visible in this chunk — confirm this is
    # registered with `app` elsewhere.
    voices = []
    for display_name, voice_id in CHOICES.items():
        voices.append({"name": display_name, "value": voice_id})
    return voices
if __name__ == "__main__":
    # Dev/container entrypoint: serve the ASGI app on all interfaces, port 7860.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)