# pt-PT_TTS_Demo / app-divide-text.py
# Last commit: "UI: update UI and add space info" (e280b3f), author: m-nagy
import gradio as gr
import requests
import os
from deployment_options import voice_id_2_name, defualt_values
import uuid
# Base URL of the remote TTS service; the voice id is appended as the last
# path segment when a request is made.
ENDPOINT_URL = "https://sentivue-endpoint.hf.space/v1/tts"

# Bearer token for the endpoint, read from the `endpoint_READ` environment
# variable. This is None when the variable is not set.
ENDPOINT_TOKEN = os.environ.get("endpoint_READ")

# Startup diagnostics: show the target URL and whether a token was found,
# without ever printing the token itself.
print(f"Public demo will call endpoint: {ENDPOINT_URL}")
print("Token loaded: " + ("Yes" if ENDPOINT_TOKEN else "No"))

# Human-readable voice names, in the order they appear in `voice_id_2_name`;
# used as the choices of the voice dropdown.
voice_names = [*voice_id_2_name.values()]
# Seconds to wait while establishing the connection to the endpoint. Only the
# connect phase is bounded: synthesis time depends on the input length, so the
# read phase is deliberately left without a limit.
_CONNECT_TIMEOUT_SECONDS = 10


def _discard_partial_file(path):
    """Best-effort removal of an output file left behind by a failed download."""
    try:
        os.remove(path)
    except OSError:
        # The file was never created or is already gone; nothing to clean up.
        pass


def generate_speech(text: str, voice_name: str):
    """
    Synthesize `text` with the selected voice through the remote TTS endpoint.

    The text is POSTed as JSON (``{"text": text}``) to
    ``{ENDPOINT_URL}/{voice_id}`` with a bearer token, and the response body is
    streamed to a uniquely named ``speech_<voice_id>_<id>.wav`` file in the
    current working directory.

    Args:
        text: Text to synthesize. ``None``, empty and whitespace-only input
            is rejected without contacting the endpoint.
        voice_name: Display name of the voice; it must be one of the values
            of ``voice_id_2_name``.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
        of the written file on success and ``None`` on failure, in which case
        ``status_message`` describes the problem. Request and I/O errors are
        reported through the message rather than raised.
    """
    if not text or not text.strip():
        return None, "Please enter some text"

    if not ENDPOINT_TOKEN:
        return None, "Error: endpoint_READ token not found in environment"

    # Set as soon as a file name is chosen, so a failed download can be
    # cleaned up instead of leaving a truncated file on disk.
    output_path = None
    try:
        # The UI works with display names; the endpoint expects the voice id.
        voice_name_2_id = {name: vid for vid, name in voice_id_2_name.items()}
        if voice_name not in voice_name_2_id:
            return None, f"Error: unknown voice '{voice_name}'"
        voice_id = voice_name_2_id[voice_name]

        request_url = f"{ENDPOINT_URL}/{voice_id}"
        payload = {"text": text}

        print(f"Sending request to: {request_url}")
        print(f"Payload: {payload}")

        # The context manager releases the streamed connection even when the
        # status check or the download fails part-way through.
        with requests.post(
            request_url,
            headers={
                "Authorization": f"Bearer {ENDPOINT_TOKEN}",
                "Content-Type": "application/json",
            },
            json=payload,
            timeout=(_CONNECT_TIMEOUT_SECONDS, None),
            stream=True,
        ) as response:
            response.raise_for_status()

            # Unique suffix so concurrent generations never share a file.
            generation_id = str(uuid.uuid4())[:15]
            output_path = f"speech_{voice_id}_{generation_id}.wav"

            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        return output_path, "Success!"
    except requests.exceptions.RequestException as e:
        error_msg = f"Error calling endpoint: {str(e)}"
    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"

    # Only reached when one of the handlers above caught an error.
    if output_path is not None:
        _discard_partial_file(output_path)
    print(error_msg)
    return None, error_msg
# ── Gradio Interface ────────────────────────────────────────────────────────
# The Markdown shown in the UI is kept in module-level constants so its lines
# stay at column 0 no matter how deeply the layout below is nested.

# Introductory text rendered at the top of the page.
_INTRO_MARKDOWN = """
# pt-PT Text-to-Speech Demo
High-quality, natural-sounding speech synthesis for European Portuguese (pt-PT) with human-like prosody and accurate number pronunciation.
### Voices
- **Current:** André
- **More voices:** Coming soon, with **extended emotion and prosody control via tags**
"""

# Informational text rendered below the controls. The blank line before the
# closing sentence keeps it from being rendered as part of the last bullet.
_INFO_MARKDOWN = """
### Info
- **Language:** pt-PT (European Portuguese only)
- **Voice:** André
- **Model size:** ~3B parameters
- **Architecture:** LLM-based TTS backbone
- **Training data:** +11k hours of curated pt-PT speech
- **Inference:** Streaming audio generation via remote endpoint
### API
- **Status:** Coming soon
### Fine-tuning
- **Status:** Coming soon
- **Requirements:** ~1.5 hours of recorded speech to fine-tune a custom voice

The system is designed for natural prosody, accent fidelity, and long-form synthesis in European Portuguese.
"""

with gr.Blocks(title="pt-PT TTS - Demo") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # Left column: the inputs and the trigger button.
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Write something here...",
                lines=4
            )
            voice_dropdown = gr.Dropdown(
                choices=voice_names,
                value=defualt_values['voice_name'],
                label="Voice"
            )
            submit_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: the generated audio and the status message.
        with gr.Column(scale=2):
            audio_output = gr.Audio(
                label="Generated Speech",
                autoplay=False
            )
            status_text = gr.Textbox(label="Status", interactive=False)

    # generate_speech returns (audio_path, status_message); the two values
    # are routed to the audio player and the status box, in that order.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown],
        outputs=[audio_output, status_text]
    )

    gr.Markdown(_INFO_MARKDOWN)

# Enable Gradio's request queue, then start the app.
demo.queue().launch()