Spaces:

SentiVue
/

pt-PT_TTS_Demo

Sleeping

File size: 7,250 Bytes

e280b3f

import gradio as gr
import requests
import os
from deployment_options import voice_id_2_name, defualt_values

import uuid

# TTS endpoint base URL and the bearer token (read from the environment)
# that is sent with every request to it.
ENDPOINT_URL = "https://sentivue-endpoint.hf.space/v1/tts"
ENDPOINT_TOKEN = os.environ.get("endpoint_READ")

# Startup diagnostics: show the target URL and whether a token was found,
# without printing the token value itself.
print(f"Public demo will call endpoint: {ENDPOINT_URL}")
print(f"Token loaded: {'Yes' if ENDPOINT_TOKEN else 'No'}")

# Human-readable voice names, in the mapping's own order, for the dropdown.
voice_names = [*voice_id_2_name.values()]
def _discard_partial_file(path: str) -> None:
    """Best-effort removal of an output file left behind by a failed download."""
    try:
        os.remove(path)
    except OSError:
        # Either nothing was written yet or the file cannot be removed; the
        # caller is already reporting the original failure, so stay quiet.
        pass


def generate_speech(text: str, voice_name: str):
    """
    Call the private FastAPI endpoint and save the returned audio to disk.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected without contacting the endpoint.
        voice_name: Display name of the voice; it is mapped back to its voice
            id through ``voice_id_2_name``.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
        of the written ``.wav`` file in the current working directory, or
        ``None`` when no audio was produced; ``status_message`` is
        ``"Success!"`` or a description of the problem.

    This function does not raise: every failure is reported through the
    status message so the UI can display it.
    """
    # Guard against None as well as blank strings.
    if not text or not text.strip():
        return None, "Please enter some text"

    if not ENDPOINT_TOKEN:
        return None, "Error: endpoint_READ token not found in environment"

    # The UI works with display names; the endpoint URL needs the voice id.
    voice_name_2_id = {name: vid for vid, name in voice_id_2_name.items()}
    voice_id = voice_name_2_id.get(voice_name)
    if voice_id is None:
        error_msg = f"Error: unknown voice {voice_name!r}"
        print(error_msg)
        return None, error_msg

    request_url = f"{ENDPOINT_URL}/{voice_id}"
    payload = {"text": text}

    # The file is written to the current directory (not a temp dir); a random
    # suffix keeps separate generations from overwriting each other.
    generation_id = str(uuid.uuid4())[:15]
    output_path = f"speech_{voice_id}_{generation_id}.wav"

    try:
        print(f"Sending request to: {request_url}")
        print(f"Payload: {payload}")

        # (connect, read) timeouts: without them a stalled endpoint would
        # block this worker forever. The read timeout applies per chunk and
        # is generous because synthesis may take a while to start streaming.
        # The context manager releases the connection even if streaming fails.
        with requests.post(
            request_url,
            headers={
                "Authorization": f"Bearer {ENDPOINT_TOKEN}",
                "Content-Type": "application/json",
            },
            json=payload,
            timeout=(10, 300),
            stream=True,
        ) as response:
            response.raise_for_status()

            with open(output_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)

        return output_path, "Success!"

    except requests.exceptions.RequestException as e:
        # Do not leave a truncated audio file behind.
        _discard_partial_file(output_path)
        error_msg = f"Error calling endpoint: {str(e)}"
        print(error_msg)
        return None, error_msg
    except Exception as e:
        # UI boundary: report anything else instead of crashing the handler.
        _discard_partial_file(output_path)
        error_msg = f"Unexpected error: {str(e)}"
        print(error_msg)
        return None, error_msg


# ── Gradio Interface ────────────────────────────────────────────────────────

# Build the demo page. Components appear in the order they are created inside
# the nested `with` contexts, so the statement order below defines the layout:
# header, specifications, input/output row, examples, info columns, footer.
with gr.Blocks(
    title="pt-PT TTS - Demo",
    # NOTE(review): the CSS comment below says 110%, but the rule sets zoom to 1.2.
    css="""
    body {
        zoom: 1.2; /* 110% zoom */
    }
    """
    ) as demo:
    
    # Header Section
    gr.Markdown(
        """
        # 🎙️ European Portuguese Text-to-Speech
        
        High-quality, natural-sounding speech synthesis for pt-PT with human-like prosody and accurate number pronunciation.
        """
    )
    
    # Model Information Card
    # Disabled variant: the same specifications wrapped in a collapsed accordion.
    # with gr.Accordion("📋 Model Information", open=False):
    #     gr.Markdown(
    #         """            
    #         ### Technical Specifications
    #         - **Model Size:** ~3B parameters
    #         - **Architecture:** LLM-based TTS backbone
    #         - **Training Data:** +11k hours of curated pt-PT speech
    #         """
    #     )
    # Active variant: specifications are always visible.
    gr.Markdown(
        """            
        ### Technical Specifications
        - **Model Size:** ~3B parameters
        - **Architecture:** LLM-based TTS backbone
        - **Training Data:** +11k hours of curated pt-PT speech
        """
    )
    
    
    # gr.Markdown("---")
    
    # Main Generation Interface
    # gr.Markdown("## Generate Speech")
    
    with gr.Row():
        # Left Column - Input Controls (scale 5 vs. 4 for the output column)
        with gr.Column(scale=5):
            text_input = gr.Textbox(
                label="📝 Text to Synthesize",
                placeholder="Enter Portuguese text here... (e.g., 'Olá! Este é um teste do sistema de síntese de voz.')",
                lines=6,
                max_lines=10,
            )
            
            # Voice picker and submit button share one row.
            with gr.Row():
                # Choices are the display names; generate_speech maps the
                # selected name back to a voice id.
                voice_dropdown = gr.Dropdown(
                    choices=voice_names,
                    value=defualt_values['voice_name'],
                    label="🎭 Voice Selection",
                    info="More voices coming soon"
                )
                
                submit_btn = gr.Button(
                    "🎵 Generate Speech",
                    variant="primary",
                    size="lg"
                )
        
        # Right Column - Output
        with gr.Column(scale=4):
            # type="filepath" matches generate_speech, which returns the path
            # of the written .wav file (or None on failure).
            audio_output = gr.Audio(
                label="🔊 Generated Audio",
                type="filepath",
                autoplay=False,
            )
            
            # Read-only box that shows the status message from generate_speech.
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
            )
    
    # Example Inputs
    gr.Markdown("### 💡 Example Texts")
    # Each example is a one-element row feeding the single text input.
    gr.Examples(
        examples=[
            ["Olá! Bem-vindo ao sistema de síntese de voz em português europeu."],
            ["A temperatura hoje está entre 15 e 20 graus Celsius."],
            ["Lisboa é a capital de Portugal, fundada antes do ano 1200."]
        ],
        inputs=text_input,
    )
    
    gr.Markdown("---")
    
    # Information Section: three static columns (voices, API, fine-tuning).
    with gr.Row():
        with gr.Column():
            # NOTE(review): this voice list is hard-coded text, while the
            # dropdown is populated from voice_names; the two can drift apart.
            gr.Markdown(
                """
                ### 🎤 Available Voices
                
                **Current Voice:**
                - André (Default)
                
                **Coming Soon:**
                - Additional voices
                - Extended emotion control
                - Prosody control via tags
                """
            )
        
        with gr.Column():
            gr.Markdown(
                """
                ### 🔌 API Access
                
                **Status:** Coming soon
                
                The API will allow programmatic access to the TTS system with full voice control and streaming support.
                """
            )
        
        with gr.Column():
            gr.Markdown(
                """
                ### 🎨 Fine-tuning
                
                **Status:** Coming soon
                
                **Requirements:**
                - ~1.5 hours of recorded speech
                - Create custom voice clones
                - Maintain natural prosody
                """
            )
    
    # Footer
    gr.Markdown(
        """        
        <div style="text-align: center">
            Built with ❤️ for European Portuguese • Powered by advanced LLM-based TTS
        </div>
        """
    )
    
    # Event Handlers
    # Clicking the button and submitting the textbox both run generate_speech
    # with the same inputs and write to the same (audio, status) outputs.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown],
        outputs=[audio_output, status_text],
    )
    
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown],
        outputs=[audio_output, status_text]
    )

# Enable the request queue, then start the app.
demo.queue().launch()