pt-PT_TTS_Demo / app.py
m-nagy's picture
fix: pass original voice name to generate_speech instead of display name
b0728cf
import gradio as gr
import requests
import os
from deployment_options import voice_id_2_name, defualt_values, voice_name_2_note
import uuid
# Root of the hosted endpoint; the health probe is built from this.
API_URL = "https://sentivue-endpoint.hf.space"
# Text-to-speech route; the voice id is appended per request.
ENDPOINT_URL = "https://sentivue-endpoint.hf.space/v1/tts"
# Bearer token read from the environment (None when the variable is unset).
ENDPOINT_TOKEN = os.environ.get("endpoint_READ")

# Startup diagnostics: report the target URL and whether a token is present,
# without ever printing the token itself.
print("Public demo will call endpoint:", ENDPOINT_URL)
print("Token loaded:", "Yes" if ENDPOINT_TOKEN else "No")

# Voice names in the order they appear in the id -> name table.
voice_names = list(voice_id_2_name.values())

# Dropdown label "<name> (<note>)" -> plain voice name.
voice_names_display_dict = dict(
    (f"{name} ({voice_name_2_note[name]})", name)
    for name in voice_names
)

# Voice that is preselected in the UI.
voice_names_display_default = defualt_values["voice_name"]
def generate_speech(text: str, voice_name: str):
    """Synthesize *text* with the given voice via the private TTS endpoint.

    The audio is streamed from ``{ENDPOINT_URL}/{voice_id}`` and written to a
    uniquely named WAV file in the current working directory.

    Args:
        text: Text to synthesize. ``None``, empty and whitespace-only values
            are rejected before any request is made.
        voice_name: A voice name that appears as a value in
            ``voice_id_2_name``.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path of
        the written WAV file on success and ``None`` on any failure; the
        function reports errors through ``status_message`` instead of raising.
    """
    # `not text` also covers None, which would otherwise crash on .strip().
    if not text or not text.strip():
        return None, "Please enter some text"

    if not ENDPOINT_TOKEN:
        return None, "Error: endpoint_READ token not found in environment"

    try:
        # The endpoint is addressed by voice id, so invert the id -> name table.
        voice_name_2_id = {name: vid for vid, name in voice_id_2_name.items()}
        if voice_name not in voice_name_2_id:
            error_msg = f"Error: unknown voice {voice_name!r}"
            print(error_msg)
            return None, error_msg
        voice_id = voice_name_2_id[voice_name]

        payload = {"text": text}
        request_url = f"{ENDPOINT_URL}/{voice_id}"

        print(f"Sending request to: {request_url}")
        print(f"Payload: {payload}")

        # NOTE(review): no timeout is configured, so a stalled endpoint blocks
        # this call indefinitely — consider a (connect, read) timeout.
        # The `with` block guarantees the streamed connection is released even
        # when raise_for_status() fails before the body has been consumed.
        with requests.post(
            request_url,
            headers={
                "Authorization": f"Bearer {ENDPOINT_TOKEN}",
                "Content-Type": "application/json",
            },
            json=payload,
            stream=True,
        ) as response:
            response.raise_for_status()

            # Save to a regular file in the current directory (not a temp
            # file); a random suffix keeps concurrent generations apart.
            generation_id = str(uuid.uuid4())[:15]
            output_path = f"speech_{voice_id}_{generation_id}.wav"

            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        return output_path, "Success!"

    except requests.exceptions.RequestException as e:
        error_msg = f"Error calling endpoint: {str(e)}"
        print(error_msg)
        return None, error_msg
    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        print(error_msg)
        return None, error_msg
def check_readiness():
    """Probe the endpoint's ``/health`` route and report whether it is ready.

    Returns:
        A ``(button, status_message, timer)`` tuple:

        * ready (the JSON body has a truthy ``"ready"`` field): an enabled
          "Generate Speech" button, the message ``"✅ Ready"`` and an inactive
          timer, which stops further polling;
        * reachable but not ready: a disabled button carrying the
          "2/2: Preparing our model" message and an active timer;
        * request or JSON decoding failed: a disabled button carrying the
          "1/2: Preparing our server" message and an active timer.
    """
    # Built outside the try so the failure handler can always log it.
    url = f"{API_URL}/health"

    try:
        health = requests.get(
            url,
            headers={
                "Authorization": f"Bearer {ENDPOINT_TOKEN}",
            },
            timeout=5,
        )
        ready = bool(health.json().get("ready"))
    except Exception:
        # Best-effort probe: any failure (no response, timeout, non-JSON or
        # unexpected body) is treated as "server still starting". Catching
        # Exception rather than using a bare except keeps KeyboardInterrupt
        # and SystemExit propagating.
        msg = "⏳ 1/2: Preparing our server, it takes around 2 minutes..."
        print(url, msg)
        return gr.Button(msg, interactive=False), msg, gr.Timer(active=True)  # CONTINUE

    if ready:
        msg = "✅ Ready"
        print(url, msg)
        return gr.Button("🔊 Generate Speech", interactive=True), msg, gr.Timer(active=False)  # STOP

    msg = "🔄 2/2: Preparing our model, it takes a few seconds..."
    print(url, msg)
    return gr.Button(msg, interactive=False), msg, gr.Timer(active=True)  # CONTINUE
# ── Gradio Interface ────────────────────────────────────────────────────────
# Declares the demo page: bilingual header text, a two-column layout (inputs on
# the left, audio and status on the right), example texts, a footer, a polling
# timer that drives check_readiness, and the handlers that call generate_speech.
with gr.Blocks(
    title="pt-PT TTS - Demo",
    # NOTE(review): the CSS sets zoom to 1.2 while its inline comment says
    # 110% — confirm which one is intended.
    css="""
body {
zoom: 1.2; /* 110% zoom */
}
"""
) as demo:
    # Header Section
    gr.Markdown(
        """
# 🎙️ Síntese de Voz em Português Europeu (pt-PT) — Public Preview
Síntese de voz natural em português europeu (pt-PT), com prosódia fluida e pronúncia correta de números.
<small>High-quality European Portuguese (pt-PT) speech synthesis with natural prosody and accurate number pronunciation.</small>
"""
    )
    # Technical specifications, in Portuguese with an English summary below.
    gr.Markdown(
        """
### Especificações Técnicas
- **Tamanho do modelo:** ~3B parâmetros
- **Arquitetura:** Backbone de TTS baseado em LLM
- **Dados de Treino:** +11k horas de voz pt-PT curada
<small>
Model Size: ~3B parameters | Architecture: LLM-based TTS backbone | Training Data: +11k hours of curated pt-PT speech
</small>
"""
    )
    # Browser recommendation note.
    gr.Markdown(
        """
Nota: Para melhor desempenho e compatibilidade de áudio, recomendamos o uso do Google Chrome.
<small>
Note: For best audio performance and compatibility, we recommend using Google Chrome.
</small>
"""
    )
    # gr.Markdown("---")
    # Main Generation Interface
    # gr.Markdown("## Generate Speech")
    with gr.Row():
        # Left Column - Input Controls
        with gr.Column(scale=5):
            text_input = gr.Textbox(
                label="📝 Text to Synthesize",
                placeholder="Enter Portuguese text here... (e.g., 'Olá! Este é um teste do sistema de síntese de voz.')",
                lines=6,
                max_lines=10,
            )
            # NOTE(review): the file's indentation was lost; the button is
            # assumed to sit below this row rather than inside it — confirm.
            with gr.Row():
                voice_dropdown = gr.Dropdown(
                    # Pairs of (display label, voice name) taken from
                    # voice_names_display_dict; presumably Gradio shows the
                    # first item and passes the second to handlers — confirm
                    # against the Dropdown docs.
                    choices=list(voice_names_display_dict.items()),
                    value=voice_names_display_default,
                    label="🎭 Voice Selection",
                    info="More voices coming soon"
                )
            # Starts disabled; check_readiness returns an enabled replacement
            # once the health probe reports ready.
            submit_btn = gr.Button(
                "🔊 Generate Speech",
                variant="primary",
                size="lg",
                interactive=False,
            )
        # Right Column - Output
        with gr.Column(scale=4):
            # type="filepath" matches generate_speech, which returns the path
            # of the WAV file it wrote.
            audio_output = gr.Audio(
                label="🔊 Generated Audio",
                type="filepath",
                autoplay=False,
            )
            # Shared status line: written by both check_readiness and
            # generate_speech.
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
            )
    # Example Inputs
    gr.Markdown("### 💡 Example Texts")
    gr.Examples(
        examples=[
            ["Olá! Bem-vindo ao sistema de síntese de voz em português europeu."],
            ["A temperatura hoje está entre 5 e 9 graus Celsius."],
            ["Lisboa é a capital de Portugal, fundada antes do ano 1200."]
        ],
        inputs=text_input,
    )
    # Footer
    gr.Markdown(
        """
<div style="text-align: center">
Criado com ❤️ pela SentiVue
</div>
<div style="text-align: center">
Built with ❤️ by SentiVue
</div>
"""
    )
    # Readiness polling: each tick runs check_readiness, which updates the
    # button and status text and returns the timer's next active state
    # (inactive once the endpoint is ready).
    # presumably value=1 is the tick interval in seconds — confirm against the
    # gr.Timer docs.
    timer = gr.Timer(value=1)
    # demo.load(fn=wake_server, outputs=[submit_btn, status_text])
    timer.tick(fn=check_readiness, outputs=[submit_btn, status_text, timer])
    # Event Handlers
    # Clicking the button and submitting the textbox both run generate_speech
    # with the same inputs and outputs.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown],
        outputs=[audio_output, status_text],
    )
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown],
        outputs=[audio_output, status_text]
    )
# Enable the request queue and start the app.
demo.queue().launch()