"""Gradio front-end for the Silero v4 Indic text-to-speech model.

At import time the module loads the packaged model from ``MODEL_PATH`` and
builds the Gradio ``demo``; running the file as a script launches the server.
"""

import inspect
import os
import sys
import tempfile
from datetime import datetime

import torch
import gradio as gr
import numpy as np

# Try to import audio libraries. Either one is enough to write WAV files;
# scipy is preferred when both are installed.
try:
    import scipy.io.wavfile as wavfile
    USE_SCIPY = True
except ImportError:
    USE_SCIPY = False

try:
    import soundfile as sf
    USE_SOUNDFILE = True
except ImportError:
    USE_SOUNDFILE = False

# Configuration
MODEL_PATH = "v4_indic.pt"
DEFAULT_SPEAKER = "hindi_female"
DEFAULT_SAMPLE_RATE = 48000

print(f"===== Application Startup at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} =====")

# Load the model.
# NOTE(review): load_pickle unpickles the package contents; only point
# MODEL_PATH at a trusted file.
print(f"Loading model from {MODEL_PATH}")
m = torch.package.PackageImporter(MODEL_PATH).load_pickle("tts_models", "model")
print(f"Model object loaded: {type(m).__name__}")

# Inspect apply_tts signature (logged for debugging and used below to find the
# model's declared default sample rate).
sig = inspect.signature(m.apply_tts)
print(f"apply_tts signature: {sig}")

# Available speakers
AVAILABLE_SPEAKERS = [
    "bengali_female", "bengali_male",
    "gujarati_female", "gujarati_male",
    "hindi_female", "hindi_male",
    "kannada_female", "kannada_male",
    "malayalam_female", "malayalam_male",
    "manipuri_female",
    "rajasthani_female", "rajasthani_male",
    "tamil_female", "tamil_male",
    "telugu_female", "telugu_male"
]

# Translation table deleting U+200D (zero-width joiner) and U+200C
# (zero-width non-joiner) from the input text.
_ZERO_WIDTH_TABLE = str.maketrans("", "", "\u200d\u200c")


def _clean_text(text):
    """Return *text* without zero-width joiners and surrounding whitespace.

    The zero-width characters are removed *before* the emptiness check so that
    input consisting only of such characters is rejected instead of reaching
    the model as an empty string.

    Raises:
        ValueError: If *text* is ``None`` or nothing is left after cleaning.
    """
    cleaned = (text or "").translate(_ZERO_WIDTH_TABLE).strip()
    if not cleaned:
        raise ValueError("Text cannot be empty")
    return cleaned


def _declared_default_sample_rate():
    """Return the integer default of apply_tts's ``sample_rate``, or ``None``.

    ``None`` is returned when the signature has no ``sample_rate`` parameter or
    the parameter has no integer default.
    """
    param = sig.parameters.get("sample_rate")
    if param is None or not isinstance(param.default, int):
        return None
    return param.default


def _apply_tts_with_fallback(text, speaker, sample_rate):
    """Run the model, trying progressively simpler argument sets.

    Args:
        text: Text to synthesize.
        speaker: Speaker name; unknown names fall back to ``DEFAULT_SPEAKER``.
        sample_rate: Requested sample rate.

    Returns:
        ``(audio, effective_rate)`` where ``effective_rate`` is the rate the
        audio should be written with.

    Raises:
        ValueError: If the text is empty or every attempt fails.
    """
    # Validate speaker
    if speaker not in AVAILABLE_SPEAKERS:
        print(f"Warning: Invalid speaker '{speaker}', using default '{DEFAULT_SPEAKER}'")
        speaker = DEFAULT_SPEAKER

    # Clean and validate text
    text = _clean_text(text)

    print(f"Calling apply_tts with text: '{text}', speaker: '{speaker}', sample_rate: {sample_rate}")

    # Ordered attempts: (label for failure log, success log line, kwargs).
    attempts = (
        # Try with ssml_text parameter (some models prefer this)
        (
            "ssml_text",
            "Success with ssml_text parameter",
            {"ssml_text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        # Try with text parameter
        (
            "text",
            "Success with text parameter",
            {"text": text, "speaker": speaker, "sample_rate": sample_rate},
        ),
        # Try minimal parameters
        (
            "minimal",
            "Success with minimal parameters",
            {"text": text, "speaker": speaker},
        ),
    )

    last_index = len(attempts) - 1
    for index, (label, success_message, kwargs) in enumerate(attempts):
        try:
            res = m.apply_tts(**kwargs)
        except Exception as exc:
            # Deliberately broad: any model failure should trigger the next,
            # simpler attempt rather than abort the request.
            if index == last_index:
                print(f"All attempts failed. Last error: {exc}")
                raise ValueError(
                    f"Text processing failed. The model may not support this text. Error: {exc}"
                ) from exc
            print(f"{label} attempt failed: {exc}")
            continue
        print(success_message)
        break

    # The minimal attempt does not pass sample_rate, so the model used its own
    # default. Report that rate so the WAV header matches the samples.
    effective_rate = sample_rate
    if "sample_rate" not in kwargs:
        declared = _declared_default_sample_rate()
        if declared is None:
            print(
                "Warning: audio was generated without an explicit sample_rate and "
                "the model default is unknown; keeping the requested rate "
                f"{sample_rate}"
            )
        elif declared != sample_rate:
            print(
                f"Warning: requested sample_rate {sample_rate} was not used; "
                f"audio is at the model default {declared}"
            )
            effective_rate = declared

    # Handle different return types
    audio = res[0] if isinstance(res, tuple) else res
    return audio, effective_rate


def _call_apply_tts(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Wrapper to call apply_tts with proper error handling.

    Args:
        text: Text to synthesize
        speaker: Speaker voice to use (invalid names fall back to the default)
        sample_rate: Audio sample rate

    Returns:
        The audio returned by the model (first element if it returns a tuple)

    Raises:
        ValueError: If the text is empty or every apply_tts attempt fails
    """
    audio, _ = _apply_tts_with_fallback(text, speaker, sample_rate)
    return audio


def _to_int16_pcm(audio):
    """Convert model output to a 16-bit PCM numpy array.

    Tensors are moved to the CPU first. Data already in int16 is returned
    unchanged; anything else is scaled down when its peak exceeds 1.0 and then
    mapped onto the int16 range.

    Raises:
        ValueError: If the model returned no samples.
    """
    if torch.is_tensor(audio):
        # detach() so tensors that still track gradients can be converted.
        audio = audio.detach().cpu().numpy()
    audio = np.asarray(audio)

    if audio.size == 0:
        raise ValueError("Model returned no audio samples")

    if audio.dtype == np.int16:
        return audio

    # Normalize to -1 to 1 range if needed
    peak = np.abs(audio).max()
    if peak > 1.0:
        audio = audio / peak

    # Convert to 16-bit PCM
    return (audio * 32767).astype(np.int16)


def synthesize_text_to_wavfile(text, speaker=DEFAULT_SPEAKER, sample_rate=DEFAULT_SAMPLE_RATE):
    """
    Synthesize text to audio and save to temporary WAV file.

    Args:
        text: Text to synthesize
        speaker: Speaker voice to use
        sample_rate: Audio sample rate

    Returns:
        Path to generated WAV file

    Raises:
        RuntimeError: If neither scipy nor soundfile is installed
        ValueError: If the text is empty, synthesis fails, or no audio is produced
    """
    # Fail before doing any work (and before creating a temp file) when there
    # is no way to write the result.
    if not (USE_SCIPY or USE_SOUNDFILE):
        raise RuntimeError("No audio library available. Please install scipy or soundfile.")

    audio, effective_rate = _apply_tts_with_fallback(text, speaker, sample_rate)
    pcm = _to_int16_pcm(audio)
    effective_rate = int(effective_rate)

    # Create temporary file
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    # Save audio using available library
    try:
        if USE_SCIPY:
            wavfile.write(path, effective_rate, pcm)
        else:
            sf.write(path, pcm, effective_rate)
    except Exception:
        # Do not leave an orphaned temp file behind when writing fails.
        try:
            os.remove(path)
        except OSError:
            pass  # best-effort cleanup; the original error is what matters
        raise

    return path


def tts_gradio_fn(text, speaker, sample_rate):
    """
    Gradio interface function.

    Args:
        text: Input text
        speaker: Selected speaker voice
        sample_rate: Audio sample rate

    Returns:
        Path to generated audio file

    Raises:
        gr.Error: If the text is empty, too long, or synthesis fails
    """
    cleaned = text.strip() if text else ""
    if not cleaned:
        raise gr.Error("Please enter some text to synthesize")

    # Reject text that is too long; surrounding whitespace does not count.
    if len(cleaned) > 200:
        raise gr.Error("Text is too long. Please use shorter text (under 200 characters)")

    try:
        return synthesize_text_to_wavfile(cleaned, speaker, sample_rate)
    except ValueError as e:
        raise gr.Error(f"Text processing failed: {str(e)}. Try simpler text or a different language.") from e
    except Exception as e:
        # Top-level UI boundary: surface any other failure to the user.
        raise gr.Error(f"Speech generation failed: {str(e)}") from e


# Create Gradio interface
with gr.Blocks(title="Silero v4 Indic TTS") as demo:
    gr.Markdown("# Silero v4 Indic Text-to-Speech")
    gr.Markdown("Convert text to speech in multiple Indian languages")
    gr.Markdown("⚠️ **Note:** Use simple, short phrases for best results. Complex sentences may fail.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="नमस्ते (Enter short text in Hindi, Bengali, Tamil, etc.)",
                lines=3,
                info="Keep text short and simple for best results"
            )
            speaker_dropdown = gr.Dropdown(
                choices=AVAILABLE_SPEAKERS,
                value=DEFAULT_SPEAKER,
                label="Select Speaker Voice"
            )
            sample_rate_dropdown = gr.Dropdown(
                choices=[8000, 16000, 24000, 48000],
                value=DEFAULT_SAMPLE_RATE,
                label="Sample Rate (Hz)"
            )
            submit_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath"
            )

    # Examples
    gr.Examples(
        examples=[
            ["नमस्ते", "hindi_female", 48000],
            ["आप कैसे हैं", "hindi_male", 48000],
            ["হ্যালো", "bengali_female", 48000],
            ["வணக்கம்", "tamil_female", 48000],
            ["హలో", "telugu_female", 48000],
            ["ಹಲೋ", "kannada_female", 48000],
            ["હેલો", "gujarati_female", 48000],
        ],
        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
        outputs=audio_output,
        fn=tts_gradio_fn,
        cache_examples=False
    )

    submit_btn.click(
        fn=tts_gradio_fn,
        inputs=[text_input, speaker_dropdown, sample_rate_dropdown],
        outputs=audio_output
    )

# Launch the app with API enabled
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_api=True  # This enables the API documentation
    )