File size: 7,605 Bytes
bad894e
de64ba8
bad894e
 
de64ba8
 
5df7f8b
 
 
 
 
 
 
 
 
 
 
 
 
bad894e
de64ba8
bad894e
4f01cd3
de64ba8
bad894e
de64ba8
bad894e
de64ba8
 
 
 
3a268b8
de64ba8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a268b8
de64ba8
3a268b8
de64ba8
 
 
 
 
30cf8cd
 
 
 
de64ba8
30cf8cd
 
de64ba8
30cf8cd
 
 
 
 
 
 
 
 
 
 
 
3a268b8
30cf8cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a268b8
bad894e
de64ba8
 
 
 
 
 
 
 
 
 
 
 
 
 
5df7f8b
 
 
 
 
 
 
 
 
 
 
 
de64ba8
 
 
 
5df7f8b
 
 
 
 
 
 
de64ba8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30cf8cd
de64ba8
30cf8cd
 
 
 
 
 
 
 
 
 
 
de64ba8
 
 
 
 
 
30cf8cd
de64ba8
 
 
 
 
30cf8cd
 
 
de64ba8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30cf8cd
 
 
 
 
 
 
de64ba8
 
 
 
 
 
 
 
 
 
 
 
 
4f01cd3
bad894e
de64ba8
 
 
4f01cd3
de64ba8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import os
import sys
import tempfile
import torch
import gradio as gr
from datetime import datetime
import numpy as np

# Try to import audio libraries
# scipy is the preferred WAV writer; soundfile is only tried when scipy is
# missing. Both flags are bound up front so either one can be tested safely
# no matter which import succeeded.
USE_SCIPY = False
USE_SOUNDFILE = False
try:
    import scipy.io.wavfile as wavfile
    USE_SCIPY = True
except ImportError:
    try:
        import soundfile as sf
        USE_SOUNDFILE = True
    except ImportError:
        # Neither backend is installed: leave both flags False so the code
        # that writes audio can report the missing dependency.
        pass

# Configuration
# Model package file handed to torch.package.PackageImporter at startup.
MODEL_PATH = "v4_indic.pt"
# Voice used as the default argument and as the fallback for unknown speakers.
DEFAULT_SPEAKER = "hindi_female"
# Default audio sample rate, in Hz.
DEFAULT_SAMPLE_RATE = 48000

# Startup banner stamped with the current (naive, local) time.
print(f"===== Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====")

# Load the model
# This runs at import time, so importing this module requires MODEL_PATH to
# exist and be readable.
# NOTE(review): load_pickle unpickles objects from the package file, and
# unpickling can execute arbitrary code — MODEL_PATH must come from a trusted
# source.
print(f"Loading model from {MODEL_PATH}")
m = torch.package.PackageImporter(MODEL_PATH).load_pickle("tts_models", "model")
print(f"Model object loaded: {type(m).__name__}")

# Inspect apply_tts signature
# Printed for diagnostics; `sig` is not read anywhere else in the visible code.
import inspect
sig = inspect.signature(m.apply_tts)
print(f"apply_tts signature: {sig}")

# Available speakers
# Each entry pairs a language with the voice genders listed for it; speaker
# ids are "<language>_<gender>". Manipuri is listed with a female voice only.
_VOICE_GENDERS = (
    ("bengali", ("female", "male")),
    ("gujarati", ("female", "male")),
    ("hindi", ("female", "male")),
    ("kannada", ("female", "male")),
    ("malayalam", ("female", "male")),
    ("manipuri", ("female",)),
    ("rajasthani", ("female", "male")),
    ("tamil", ("female", "male")),
    ("telugu", ("female", "male")),
)

# Flattened in table order, so the list order matches the table above.
AVAILABLE_SPEAKERS = [
    f"{language}_{gender}"
    for language, genders in _VOICE_GENDERS
    for gender in genders
]

def _call_apply_tts(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Clean the input text and run the model's apply_tts on it.

    The model is called with up to three keyword combinations, in order,
    stopping at the first one that does not raise:
    ``ssml_text`` + speaker + sample_rate, then ``text`` + speaker +
    sample_rate, then ``text`` + speaker only.

    Args:
        text: Text to synthesize. Zero-width joiner/non-joiner characters and
            surrounding whitespace are removed before synthesis.
        speaker: Speaker voice to use. Values not in AVAILABLE_SPEAKERS are
            replaced by DEFAULT_SPEAKER (a warning is printed).
        sample_rate: Audio sample rate passed to the model.

    Returns:
        The audio returned by the model. When the model returns a tuple, its
        first element is returned.

    Raises:
        ValueError: If the text is empty after cleaning, or if every call
            attempt fails (the last model error is chained as the cause).
    """
    # Validate speaker
    if speaker not in AVAILABLE_SPEAKERS:
        print(f"Warning: Invalid speaker '{speaker}', using default '{DEFAULT_SPEAKER}'")
        speaker = DEFAULT_SPEAKER

    # Remove zero-width characters BEFORE the emptiness check: str.strip()
    # does not treat them as whitespace, so text made only of them would
    # otherwise slip through and reach the model as an empty string.
    text = (text or "").replace('\u200d', '').replace('\u200c', '').strip()
    if not text:
        raise ValueError("Text cannot be empty")

    print(f"Calling apply_tts with text: '{text}', speaker: '{speaker}', sample_rate: {sample_rate}")

    # (failure-log label, success message, keyword arguments) per attempt.
    # The last attempt has no label because its failure is reported by the
    # "All attempts failed" message instead.
    # NOTE(review): if only the minimal call succeeds, the audio is produced
    # at whatever rate the model picks on its own, which may not match the
    # `sample_rate` the caller later uses — TODO confirm.
    attempts = (
        (
            "ssml_text",
            "Success with ssml_text parameter",
            {"ssml_text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        (
            "text",
            "Success with text parameter",
            {"text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        (
            None,
            "Success with minimal parameters",
            {"text": text, "speaker": speaker},
        ),
    )

    last_error = None
    for label, success_message, kwargs in attempts:
        try:
            res = m.apply_tts(**kwargs)
        except Exception as err:
            # Broad on purpose: the exception types the model raises for an
            # unsupported keyword or unsupported text are not known here.
            last_error = err
            if label is not None:
                print(f"{label} attempt failed: {err}")
            continue
        print(success_message)
        break
    else:
        print(f"All attempts failed. Last error: {last_error}")
        raise ValueError(
            f"Text processing failed. The model may not support this text. Error: {last_error}"
        ) from last_error

    # Handle different return types
    return res[0] if isinstance(res, tuple) else res


def synthesize_text_to_wavfile(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Synthesize text to audio and save it to a temporary WAV file.

    Args:
        text: Text to synthesize
        speaker: Speaker voice to use
        sample_rate: Audio sample rate, used both for synthesis and as the
            rate written to the WAV file

    Returns:
        Path to the generated WAV file. The file is not removed by this
        function; the caller owns it.

    Raises:
        ValueError: If the text cannot be synthesized or the model returns
            no samples.
        RuntimeError: If neither scipy nor soundfile is available.
    """
    audio = _call_apply_tts(text, speaker, sample_rate)

    # Convert to numpy array if needed. detach() first: numpy() refuses
    # tensors that are still attached to an autograd graph.
    if torch.is_tensor(audio):
        audio = audio.detach().cpu().numpy()
    audio = np.asarray(audio)

    if audio.size == 0:
        # max() on an empty array fails with an opaque reduction error.
        raise ValueError("Model returned empty audio")

    # Ensure audio is 16-bit PCM
    if audio.dtype != np.int16:
        # Scale down only when samples exceed the -1..1 range; quieter audio
        # is deliberately left at its original level.
        peak = np.abs(audio).max()
        if peak > 1.0:
            audio = audio / peak
        audio = (audio * 32767).astype(np.int16)

    # Check for a writer before creating the file so nothing is left behind.
    # (USE_SOUNDFILE is only evaluated when USE_SCIPY is false.)
    if not (USE_SCIPY or USE_SOUNDFILE):
        raise RuntimeError("No audio library available. Please install scipy or soundfile.")

    # mkstemp returns an open descriptor; close it and let the audio library
    # reopen the file by path.
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        if USE_SCIPY:
            wavfile.write(path, sample_rate, audio)
        else:
            sf.write(path, audio, sample_rate)
    except Exception:
        # Do not leave an empty or half-written temp file behind on failure.
        try:
            os.remove(path)
        except OSError:
            pass
        raise

    return path


def tts_gradio_fn(text, speaker, sample_rate):
    """
    Gradio interface function: validate the input and synthesize speech.

    Args:
        text: Input text
        speaker: Selected speaker voice
        sample_rate: Audio sample rate

    Returns:
        Path to generated audio file

    Raises:
        gr.Error: If the text is empty, longer than 200 characters after
            trimming, or synthesis fails. The underlying exception is chained
            as the cause.
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise gr.Error("Please enter some text to synthesize")

    # Reject over-long input. The limit is applied to the trimmed text so
    # that surrounding whitespace does not count against the user.
    if len(cleaned) > 200:
        raise gr.Error("Text is too long. Please use shorter text (under 200 characters)")

    try:
        return synthesize_text_to_wavfile(cleaned, speaker, sample_rate)
    except ValueError as e:
        # ValueError signals text the model could not process.
        raise gr.Error(f"Text processing failed: {str(e)}. Try simpler text or a different language.") from e
    except Exception as e:
        # Anything else (I/O, missing audio backend, ...) is shown to the
        # user as a generic generation failure.
        raise gr.Error(f"Speech generation failed: {str(e)}") from e


# Create Gradio interface
with gr.Blocks(title="Silero v4 Indic TTS") as demo:
    # Page header: title, tagline and a usage caveat.
    gr.Markdown("# Silero v4 Indic Text-to-Speech")
    gr.Markdown("Convert text to speech in multiple Indian languages")
    gr.Markdown("⚠️ **Note:** Use simple, short phrases for best results. Complex sentences may fail.")

    with gr.Row():
        # First column: the inputs and the trigger button.
        with gr.Column():
            text_input = gr.Textbox(
                lines=3,
                label="Enter Text",
                info="Keep text short and simple for best results",
                placeholder="नमस्ते (Enter short text in Hindi, Bengali, Tamil, etc.)",
            )

            speaker_dropdown = gr.Dropdown(
                label="Select Speaker Voice",
                choices=AVAILABLE_SPEAKERS,
                value=DEFAULT_SPEAKER,
            )

            sample_rate_dropdown = gr.Dropdown(
                label="Sample Rate (Hz)",
                choices=[8000, 16000, 24000, 48000],
                value=DEFAULT_SAMPLE_RATE,
            )

            submit_btn = gr.Button("Generate Speech", variant="primary")

        # Second column: the synthesized audio, delivered as a file path.
        with gr.Column():
            audio_output = gr.Audio(type="filepath", label="Generated Audio")

    # The same three inputs feed both the examples table and the button.
    tts_inputs = [text_input, speaker_dropdown, sample_rate_dropdown]

    # Examples: one row per (text, speaker, sample rate) preset.
    example_rows = [
        ["नमस्ते", "hindi_female", 48000],
        ["आप कैसे हैं", "hindi_male", 48000],
        ["হ্যালো", "bengali_female", 48000],
        ["வணக்கம்", "tamil_female", 48000],
        ["హలో", "telugu_female", 48000],
        ["ಹಲೋ", "kannada_female", 48000],
        ["હેલો", "gujarati_female", 48000],
    ]
    gr.Examples(
        examples=example_rows,
        fn=tts_gradio_fn,
        inputs=tts_inputs,
        outputs=audio_output,
        cache_examples=False,
    )

    # Clicking the button runs synthesis and routes the result to the player.
    submit_btn.click(
        fn=tts_gradio_fn,
        inputs=tts_inputs,
        outputs=audio_output,
    )

# Launch the app with API enabled
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # bind to all network interfaces
        "server_port": 7860,
        "show_api": True,  # This enables the API documentation
    }
    demo.launch(**launch_options)