Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import requests | |
| import os | |
| from deployment_options import voice_id_2_name, defualt_values | |
| import uuid | |
# Base URL of the TTS endpoint; the voice id is appended as a path segment
# for each request.
ENDPOINT_URL = "https://sentivue-endpoint.hf.space/v1/tts"

# Bearer token for the endpoint, read from the environment
# (None when the variable is not set).
ENDPOINT_TOKEN = os.environ.get("endpoint_READ")

# Startup diagnostics: report the target URL and whether a token was found,
# without printing the token itself.
print(f"Public demo will call endpoint: {ENDPOINT_URL}")
if ENDPOINT_TOKEN:
    print("Token loaded: Yes")
else:
    print("Token loaded: No")

# Display names offered in the voice dropdown, in mapping order.
voice_names = [name for name in voice_id_2_name.values()]
def generate_speech(text: str, voice_name: str):
    """
    Call the private FastAPI TTS endpoint and save the returned audio.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected without contacting the endpoint.
        voice_name: Display name of the voice; it is mapped back to its id
            through ``voice_id_2_name``.

    Returns:
        A ``(audio_path, status_message)`` tuple. On success ``audio_path``
        is the ``.wav`` file written to the current directory and the
        message is ``"Success!"``. On any failure ``audio_path`` is ``None``
        and the message describes the problem; no exception is propagated
        to the caller.
    """
    if not text or not text.strip():
        return None, "Please enter some text"
    if not ENDPOINT_TOKEN:
        return None, "Error: endpoint_READ token not found in environment"

    # Tracks the file being written so a failed download can be cleaned up.
    output_path = None
    try:
        # Invert the id -> name mapping so the dropdown value can be resolved.
        voice_name_2_id = {name: vid for vid, name in voice_id_2_name.items()}
        if voice_name not in voice_name_2_id:
            return None, f"Error: unknown voice '{voice_name}'"
        voice_id = voice_name_2_id[voice_name]

        payload = {
            "text": text
        }
        request_url = f"{ENDPOINT_URL}/{voice_id}"
        print(f"Sending request to: {request_url}")
        print(f"Payload: {payload}")

        # The context manager releases the streamed connection even when an
        # error occurs mid-download.
        with requests.post(
            request_url,
            headers={
                "Authorization": f"Bearer {ENDPOINT_TOKEN}",
                "Content-Type": "application/json"
            },
            json=payload,
            # (connect, read) seconds. The read timeout bounds the wait
            # between received bytes, not the whole download, so long
            # audio still streams while a stalled endpoint cannot hang
            # the worker forever.
            timeout=(10, 300),
            stream=True
        ) as response:
            response.raise_for_status()

            # Save to a regular file in the current directory (not temp),
            # with a unique id so concurrent generations do not collide.
            generation_id = str(uuid.uuid4())[:15]
            output_path = f"speech_{voice_id}_{generation_id}.wav"
            with open(output_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        return output_path, "Success!"
    except requests.exceptions.RequestException as e:
        error_msg = f"Error calling endpoint: {str(e)}"
    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"

    # Failure path shared by both handlers.
    print(error_msg)
    if output_path is not None:
        # Do not leave a truncated audio file behind.
        try:
            os.remove(output_path)
        except OSError:
            pass
    return None, error_msg
# ── Gradio Interface ────────────────────────────────────────────────────────
# Builds the demo page: an intro, an input column (text, voice, button), an
# output column (audio player, status), and an info footer.
with gr.Blocks(title="pt-PT TTS - Demo") as demo:
    gr.Markdown("""
    # pt-PT Text-to-Speech Demo
    High-quality, natural-sounding speech synthesis for European Portuguese (pt-PT) with human-like prosody and accurate number pronunciation.
    ### Voices
    - **Current:** André
    - **More voices:** Coming soon, with **extended emotion and prosody control via tags**
    """)

    with gr.Row():
        # Left column: user inputs and the trigger button.
        with gr.Column(scale=3):
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Write something here...",
                lines=4
            )
            voice_dropdown = gr.Dropdown(
                choices=voice_names,
                value=defualt_values['voice_name'],
                label="Voice"
            )
            submit_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: generated audio and a read-only status line.
        with gr.Column(scale=2):
            audio_output = gr.Audio(
                label="Generated Speech",
                autoplay=False
            )
            status_text = gr.Textbox(label="Status", interactive=False)

    # generate_speech returns (audio_path, status_message), matching the
    # order of the two output components.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown],
        outputs=[audio_output, status_text]
    )

    gr.Markdown("""
    ### Info
    - **Language:** pt-PT (European Portuguese only)
    - **Voice:** André
    - **Model size:** ~3B parameters
    - **Architecture:** LLM-based TTS backbone
    - **Training data:** +11k hours of curated pt-PT speech
    - **Inference:** Streaming audio generation via remote endpoint
    ### API
    - **Status:** Coming soon
    ### Fine-tuning
    - **Status:** Coming soon
    - **Requirements:** ~1.5 hours of recorded speech to fine-tune a custom voice
    The system is designed for natural prosody, accent fidelity, and long-form synthesis in European Portuguese.
    """)

# Enable request queuing, then start the app.
demo.queue().launch()