File size: 7,250 Bytes
e280b3f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import gradio as gr
import requests
import os
from deployment_options import voice_id_2_name, defualt_values

import uuid

# Private FastAPI TTS endpoint (HF Space); the voice id is appended per request.
ENDPOINT_URL = "https://sentivue-endpoint.hf.space/v1/tts"
# Bearer token injected via the Space's secrets under the key "endpoint_READ".
ENDPOINT_TOKEN = os.getenv("endpoint_READ")

# Startup diagnostics: confirm the target URL and whether the token was found
# (the token value itself is never printed).
print(f"Public demo will call endpoint: {ENDPOINT_URL}")
print(f"Token loaded: {'Yes' if ENDPOINT_TOKEN else 'No'}")

# Human-readable voice names for the dropdown; mapped back to ids in generate_speech.
voice_names = list(voice_id_2_name.values())
def generate_speech(text: str, voice_name: str):
    """Synthesize speech by calling the private FastAPI TTS endpoint.

    Args:
        text: Portuguese text to synthesize.
        voice_name: Human-readable voice name; must be one of the values of
            ``voice_id_2_name``.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
        of the generated WAV file, or ``None`` on any failure, in which case
        ``status_message`` describes the error.
    """
    if not text.strip():
        return None, "Please enter some text"

    if not ENDPOINT_TOKEN:
        return None, "Error: endpoint_READ token not found in environment"

    try:
        # Invert the id -> name mapping so the dropdown's display name can be
        # resolved back to the endpoint's voice id.
        voice_name_2_id = {name: vid for vid, name in voice_id_2_name.items()}

        voice_id = voice_name_2_id.get(voice_name)
        if voice_id is None:
            # Fail with a clear message instead of letting a KeyError surface
            # as a generic "Unexpected error".
            return None, f"Error: unknown voice '{voice_name}'"

        payload = {
            "text": text
        }

        print(f"Sending request to: {ENDPOINT_URL}/{voice_id}")
        print(f"Payload: {payload}")

        # Stream the response so large WAV payloads are written to disk in
        # chunks instead of being buffered fully in memory.
        response = requests.post(
            f"{ENDPOINT_URL}/{voice_id}",
            headers={
                "Authorization": f"Bearer {ENDPOINT_TOKEN}",
                "Content-Type": "application/json"
            },
            json=payload,
            timeout=60,  # don't hang the demo forever if the endpoint stalls
            stream=True
        )

        response.raise_for_status()

        # Save to a regular file in the current directory (Gradio serves it
        # via the "filepath" audio output). A short random suffix keeps
        # concurrent generations from clobbering each other's files.
        generation_id = uuid.uuid4().hex[:15]
        output_path = f"speech_{voice_id}_{generation_id}.wav"

        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)

        return output_path, "Success!"

    except requests.exceptions.RequestException as e:
        # Network / HTTP-level failures (includes timeouts and 4xx/5xx).
        error_msg = f"Error calling endpoint: {str(e)}"
        print(error_msg)
        return None, error_msg
    except Exception as e:
        # Last-resort guard so the UI always gets a (None, message) pair.
        error_msg = f"Unexpected error: {str(e)}"
        print(error_msg)
        return None, error_msg


# ── Gradio Interface ────────────────────────────────────────────────────────

# The whole UI is declared at import time and demo.queue().launch() at the
# bottom starts the server, so importing this module has side effects.
# NOTE(review): the CSS comment below says "110% zoom" but zoom is 1.2 (120%)
# — confirm which scale is intended.
with gr.Blocks(
    title="pt-PT TTS - Demo",
    css="""
    body {
        zoom: 1.2; /* 110% zoom */
    }
    """
    ) as demo:
    
    # Header Section
    gr.Markdown(
        """
        # πŸŽ™οΈ European Portuguese Text-to-Speech
        
        High-quality, natural-sounding speech synthesis for pt-PT with human-like prosody and accurate number pronunciation.
        """
    )
    
    # Model Information Card
    # with gr.Accordion("πŸ“‹ Model Information", open=False):
    #     gr.Markdown(
    #         """            
    #         ### Technical Specifications
    #         - **Model Size:** ~3B parameters
    #         - **Architecture:** LLM-based TTS backbone
    #         - **Training Data:** +11k hours of curated pt-PT speech
    #         """
    #     )
    # Always-visible variant of the (disabled) accordion above.
    gr.Markdown(
        """            
        ### Technical Specifications
        - **Model Size:** ~3B parameters
        - **Architecture:** LLM-based TTS backbone
        - **Training Data:** +11k hours of curated pt-PT speech
        """
    )
    
    
    # gr.Markdown("---")
    
    # Main Generation Interface
    # gr.Markdown("## Generate Speech")
    
    with gr.Row():
        # Left Column - Input Controls
        with gr.Column(scale=5):
            text_input = gr.Textbox(
                label="πŸ“ Text to Synthesize",
                placeholder="Enter Portuguese text here... (e.g., 'OlΓ‘! Este Γ© um teste do sistema de sΓ­ntese de voz.')",
                lines=6,
                max_lines=10,
            )
            
            with gr.Row():
                # Display names only; generate_speech maps the selected name
                # back to the endpoint's voice id.
                # NOTE(review): "defualt_values" [sic] — misspelled name comes
                # from the deployment_options module, not fixable here.
                voice_dropdown = gr.Dropdown(
                    choices=voice_names,
                    value=defualt_values['voice_name'],
                    label="🎭 Voice Selection",
                    info="More voices coming soon"
                )
                
                submit_btn = gr.Button(
                    "🎡 Generate Speech",
                    variant="primary",
                    size="lg"
                )
        
        # Right Column - Output
        with gr.Column(scale=4):
            # type="filepath" matches generate_speech returning a WAV path.
            audio_output = gr.Audio(
                label="πŸ”Š Generated Audio",
                type="filepath",
                autoplay=False,
            )
            
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
            )
    
    # Example Inputs — clicking one fills only the text box; the voice
    # dropdown keeps its current selection.
    gr.Markdown("### πŸ’‘ Example Texts")
    gr.Examples(
        examples=[
            ["OlΓ‘! Bem-vindo ao sistema de sΓ­ntese de voz em portuguΓͺs europeu."],
            ["A temperatura hoje estΓ‘ entre 15 e 20 graus Celsius."],
            ["Lisboa Γ© a capital de Portugal, fundada antes do ano 1200."]
        ],
        inputs=text_input,
    )
    
    gr.Markdown("---")
    
    # Information Section — three static info columns (voices / API / fine-tuning).
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                ### 🎀 Available Voices
                
                **Current Voice:**
                - AndrΓ© (Default)
                
                **Coming Soon:**
                - Additional voices
                - Extended emotion control
                - Prosody control via tags
                """
            )
        
        with gr.Column():
            gr.Markdown(
                """
                ### πŸ”Œ API Access
                
                **Status:** Coming soon
                
                The API will allow programmatic access to the TTS system with full voice control and streaming support.
                """
            )
        
        with gr.Column():
            gr.Markdown(
                """
                ### 🎨 Fine-tuning
                
                **Status:** Coming soon
                
                **Requirements:**
                - ~1.5 hours of recorded speech
                - Create custom voice clones
                - Maintain natural prosody
                """
            )
    
    # Footer
    gr.Markdown(
        """        
        <div style="text-align: center">
            Built with ❀️ for European Portuguese β€’ Powered by advanced LLM-based TTS
        </div>
        """
    )
    
    # Event Handlers — button click and Enter-in-textbox both trigger the
    # same generation function with the same inputs/outputs.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown],
        outputs=[audio_output, status_text],
    )
    
    text_input.submit(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown],
        outputs=[audio_output, status_text]
    )

# queue() serializes/queues concurrent requests before launching the server.
demo.queue().launch()