from __future__ import annotations import tempfile from functools import lru_cache from pathlib import Path import gradio as gr import numpy as np import soundfile as sf from kokoro import KPipeline SPACE_TITLE = "VoiceMM TTS API" SAMPLE_RATE = 24_000 MAX_CHARS = 450 VOICE_OPTIONS = { "pf_dora": "Dora, feminina e clara", "pm_alex": "Alex, masculina e neutra", "pm_santa": "Santa, masculina e encorpada", } EXAMPLES = [ [ "Seu produto ficou pronto. Agora ele tem uma voz que passa confianca, ritmo e presenca.", "pf_dora", 1.0, ], [ "Apresente sua startup em vinte segundos: problema, promessa e chamada para acao.", "pm_alex", 1.05, ], [ "Bem-vindo ao VoiceMM. Transforme roteiro em audio com uma interface simples e bonita.", "pm_santa", 0.95, ], ] CSS = """ .gradio-container { background: radial-gradient(circle at top left, rgba(237, 180, 93, 0.18), transparent 30%), radial-gradient(circle at top right, rgba(33, 181, 168, 0.12), transparent 28%), #0f1518; } .voicelek-shell { max-width: 1024px; margin: 0 auto; } .voicelek-kicker { letter-spacing: 0.18em; text-transform: uppercase; color: #efbf74; font-size: 0.8rem; } """ @lru_cache(maxsize=8) def get_pipeline(lang_code: str) -> KPipeline: return KPipeline(lang_code=lang_code) def normalize_text(text: str) -> str: cleaned = " ".join((text or "").split()) if not cleaned: raise gr.Error("Digite algum texto antes de gerar o audio.") if len(cleaned) > MAX_CHARS: raise gr.Error( f"Use no maximo {MAX_CHARS} caracteres por vez para manter a latencia boa no plano gratis." ) return cleaned def synthesize(text: str, voice: str, speed: float) -> tuple[str, str]: cleaned = normalize_text(text) pipeline = get_pipeline(voice[0]) chunks: list[np.ndarray] = [] for _, _, audio in pipeline(cleaned, voice=voice, speed=float(speed)): chunks.append(np.asarray(audio, dtype=np.float32)) if not chunks: raise gr.Error("O modelo nao conseguiu gerar audio para esse texto.") waveform = np.concatenate(chunks) output_dir = Path(tempfile.mkdtemp(prefix="voicelek_")) output_path = output_dir / "voicelek-output.wav" sf.write(output_path, waveform, SAMPLE_RATE) duration_seconds = len(waveform) / SAMPLE_RATE details = ( f"**Voz:** {VOICE_OPTIONS[voice]} \n" f"**Velocidade:** {speed:.2f}x \n" f"**Entrada:** {len(cleaned)} caracteres \n" f"**Duracao estimada:** {duration_seconds:.1f}s" ) return str(output_path), details with gr.Blocks(title=SPACE_TITLE) as demo: with gr.Column(elem_classes="voicelek-shell"): gr.Markdown( """