from __future__ import annotations import tempfile from functools import lru_cache from pathlib import Path import gradio as gr import numpy as np import soundfile as sf from kokoro import KPipeline SPACE_TITLE = "VoiceMM TTS API" SAMPLE_RATE = 24_000 MAX_CHARS = 450 VOICE_OPTIONS = { "pf_dora": "Dora, feminina e clara", "pm_alex": "Alex, masculina e neutra", "pm_santa": "Santa, masculina e encorpada", } EXAMPLES = [ [ "Seu produto ficou pronto. Agora ele tem uma voz que passa confianca, ritmo e presenca.", "pf_dora", 1.0, ], [ "Apresente sua startup em vinte segundos: problema, promessa e chamada para acao.", "pm_alex", 1.05, ], [ "Bem-vindo ao VoiceMM. Transforme roteiro em audio com uma interface simples e bonita.", "pm_santa", 0.95, ], ] CSS = """ .gradio-container { background: radial-gradient(circle at top left, rgba(237, 180, 93, 0.18), transparent 30%), radial-gradient(circle at top right, rgba(33, 181, 168, 0.12), transparent 28%), #0f1518; } .voicelek-shell { max-width: 1024px; margin: 0 auto; } .voicelek-kicker { letter-spacing: 0.18em; text-transform: uppercase; color: #efbf74; font-size: 0.8rem; } """ @lru_cache(maxsize=8) def get_pipeline(lang_code: str) -> KPipeline: return KPipeline(lang_code=lang_code) def normalize_text(text: str) -> str: cleaned = " ".join((text or "").split()) if not cleaned: raise gr.Error("Digite algum texto antes de gerar o audio.") if len(cleaned) > MAX_CHARS: raise gr.Error( f"Use no maximo {MAX_CHARS} caracteres por vez para manter a latencia boa no plano gratis." ) return cleaned def synthesize(text: str, voice: str, speed: float) -> tuple[str, str]: cleaned = normalize_text(text) pipeline = get_pipeline(voice[0]) chunks: list[np.ndarray] = [] for _, _, audio in pipeline(cleaned, voice=voice, speed=float(speed)): chunks.append(np.asarray(audio, dtype=np.float32)) if not chunks: raise gr.Error("O modelo nao conseguiu gerar audio para esse texto.") waveform = np.concatenate(chunks) output_dir = Path(tempfile.mkdtemp(prefix="voicelek_")) output_path = output_dir / "voicelek-output.wav" sf.write(output_path, waveform, SAMPLE_RATE) duration_seconds = len(waveform) / SAMPLE_RATE details = ( f"**Voz:** {VOICE_OPTIONS[voice]} \n" f"**Velocidade:** {speed:.2f}x \n" f"**Entrada:** {len(cleaned)} caracteres \n" f"**Duracao estimada:** {duration_seconds:.1f}s" ) return str(output_path), details with gr.Blocks(title=SPACE_TITLE) as demo: with gr.Column(elem_classes="voicelek-shell"): gr.Markdown( """
VoiceMM
# API de TTS em portugues brasileiro Esta Space foi pensada para ser o backend de um frontend estatico no GitHub Pages. O endpoint publico principal e `"/synthesize"`. """, ) with gr.Row(): with gr.Column(scale=3): text_input = gr.Textbox( label="Texto", lines=8, max_lines=12, placeholder="Cole aqui sua copy, roteiro, CTA ou locucao curta.", value=EXAMPLES[0][0], ) with gr.Column(scale=2): voice_input = gr.Dropdown( choices=[(label, key) for key, label in VOICE_OPTIONS.items()], value="pf_dora", label="Voz", ) speed_input = gr.Slider( minimum=0.8, maximum=1.25, value=1.0, step=0.05, label="Velocidade", ) generate_button = gr.Button("Gerar audio", variant="primary") audio_output = gr.Audio( label="Saida", type="filepath", format="wav", ) details_output = gr.Markdown( value="Pronto para receber chamadas via navegador ou direto pela API do Gradio." ) gr.Examples( examples=EXAMPLES, inputs=[text_input, voice_input, speed_input], label="Exemplos rapidos", ) generate_button.click( fn=synthesize, inputs=[text_input, voice_input, speed_input], outputs=[audio_output, details_output], api_name="synthesize", ) demo.queue(default_concurrency_limit=1, max_size=16) if __name__ == "__main__": demo.launch( theme=gr.themes.Soft( primary_hue="amber", secondary_hue="teal", neutral_hue="slate", ), css=CSS, footer_links=["api", "gradio", "settings"], )