import gradio as gr import requests import os from deployment_options import voice_id_2_name, defualt_values, voice_name_2_note import uuid API_URL = "https://sentivue-endpoint.hf.space" ENDPOINT_URL = "https://sentivue-endpoint.hf.space/v1/tts" ENDPOINT_TOKEN = os.getenv("endpoint_READ") print(f"Public demo will call endpoint: {ENDPOINT_URL}") print(f"Token loaded: {'Yes' if ENDPOINT_TOKEN else 'No'}") voice_names = list(voice_id_2_name.values()) voice_names_display_dict = { f'{voice_name} ({voice_name_2_note[voice_name]})' : voice_name for voice_name in voice_names } voice_names_display_default = defualt_values['voice_name'] def generate_speech(text: str, voice_name: str): """ Calls the private FastAPI endpoint and returns audio """ if not text.strip(): return None, "Please enter some text" if not ENDPOINT_TOKEN: return None, "Error: endpoint_READ token not found in environment" try: voice_name_2_id = {} for vid, name in voice_id_2_name.items(): voice_name_2_id[name] = vid voice_id = voice_name_2_id[voice_name] payload = { "text": text } print(f"Sending request to: {ENDPOINT_URL}/{voice_id}") print(f"Payload: {payload}") response = requests.post( f"{ENDPOINT_URL}/{voice_id}", headers={ "Authorization": f"Bearer {ENDPOINT_TOKEN}", "Content-Type": "application/json" }, json=payload, # timeout=60, stream=True ) response.raise_for_status() # # Return raw WAV bytes - Gradio handles the rest # return response.content, "Success!" # # Save to temporary WAV file # with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file: # tmp_file.write(response.content) # tmp_path = tmp_file.name # return tmp_path, "Success!" # Save to a regular file in current directory (not temp) # Generate unique ID for output file generation_id = str(uuid.uuid4())[:15] output_path = f"speech_{voice_id}_{generation_id}.wav" with open(output_path, 'wb') as f: for chunk in response.iter_content(chunk_size=8192): if chunk: f.write(chunk) return output_path, "Success!" except requests.exceptions.RequestException as e: error_msg = f"Error calling endpoint: {str(e)}" print(error_msg) return None, error_msg except Exception as e: error_msg = f"Unexpected error: {str(e)}" print(error_msg) return None, error_msg def check_readiness(): try: URL = f"{API_URL}/health" health = requests.get( URL, headers={ "Authorization": f"Bearer {ENDPOINT_TOKEN}", }, timeout=5) data = health.json() if data.get("ready"): msg = "✅ Ready" print(URL, msg) return gr.Button("🔊 Generate Speech", interactive=True), msg, gr.Timer(active=False) # STOP else: msg = "🔄 2/2: Preparing our model, it takes a few seconds..." print(URL, msg) return gr.Button(msg, interactive=False), msg, gr.Timer(active=True) # CONTINUE except: # /health didn't respond msg = "⏳ 1/2: Preparing our server, it takes around 2 minutes..." print(URL, msg) return gr.Button(msg, interactive=False), msg, gr.Timer(active=True) # CONTINUE # ── Gradio Interface ──────────────────────────────────────────────────────── with gr.Blocks( title="pt-PT TTS - Demo", css=""" body { zoom: 1.2; /* 110% zoom */ } """ ) as demo: # Header Section gr.Markdown( """ # 🎙️ Síntese de Voz em Português Europeu (pt-PT) — Public Preview Síntese de voz natural em português europeu (pt-PT), com prosódia fluida e pronúncia correta de números. High-quality European Portuguese (pt-PT) speech synthesis with natural prosody and accurate number pronunciation. """ ) gr.Markdown( """ ### Especificações Técnicas - **Tamanho do modelo:** ~3B parâmetros - **Arquitetura:** Backbone de TTS baseado em LLM - **Dados de Treino:** +11k horas de voz pt-PT curada Model Size: ~3B parameters | Architecture: LLM-based TTS backbone | Training Data: +11k hours of curated pt-PT speech """ ) gr.Markdown( """ Nota: Para melhor desempenho e compatibilidade de áudio, recomendamos o uso do Google Chrome. Note: For best audio performance and compatibility, we recommend using Google Chrome. """ ) # gr.Markdown("---") # Main Generation Interface # gr.Markdown("## Generate Speech") with gr.Row(): # Left Column - Input Controls with gr.Column(scale=5): text_input = gr.Textbox( label="📝 Text to Synthesize", placeholder="Enter Portuguese text here... (e.g., 'Olá! Este é um teste do sistema de síntese de voz.')", lines=6, max_lines=10, ) with gr.Row(): voice_dropdown = gr.Dropdown( choices=list(voice_names_display_dict.items()), value=voice_names_display_default, label="🎭 Voice Selection", info="More voices coming soon" ) submit_btn = gr.Button( "🔊 Generate Speech", variant="primary", size="lg", interactive=False, ) # Right Column - Output with gr.Column(scale=4): audio_output = gr.Audio( label="🔊 Generated Audio", type="filepath", autoplay=False, ) status_text = gr.Textbox( label="Status", interactive=False, ) # Example Inputs gr.Markdown("### 💡 Example Texts") gr.Examples( examples=[ ["Olá! Bem-vindo ao sistema de síntese de voz em português europeu."], ["A temperatura hoje está entre 5 e 9 graus Celsius."], ["Lisboa é a capital de Portugal, fundada antes do ano 1200."] ], inputs=text_input, ) # Footer gr.Markdown( """