import gradio as gr

from models.tts import TTSModel
from utils.audio_utils import save_audio, get_cached_audio, get_audio_filename
from utils.input_validation import validate_input
from config.language_mapping import (
    LANGUAGE_VOICE_MAPPING,
    construct_description,
    EMOTION_DESC,
    SPEED_DESC,
    PITCH_DESC,
    BACKGROUND_NOISE_DESC,
    REVERBERATION_DESC,
    QUALITY_DESC,
    get_speakers_for_language,
)

def generate_speech(
    text,
    language,
    speaker,
    emotion="Neutral",
    speed="Normal",
    pitch="Medium",
    background_noise="Minimal",
    reverberation="Close",
    quality="High",
):
    try:
        # Validate inputs
        validate_input(text, language)

        # Check if audio is already cached
        cached_audio = get_cached_audio(
            text, language, speaker, emotion, speed,
            pitch, background_noise, reverberation, quality
        )
        if cached_audio:
            return cached_audio

        # Get the description using the imported constructor
        description = construct_description(
            speaker,
            language,
            emotion,
            speed,
            pitch,
            background_noise,
            reverberation,
            quality,
        )

        # Generate audio
        tts_model = TTSModel()
        audio_array = tts_model.generate_audio(text, description)

        # Save the generated audio
        filename = get_audio_filename(
            text, language, speaker, emotion, speed,
            pitch, background_noise, reverberation, quality
        )
        filepath = save_audio(audio_array, filename)
        return filepath

    except Exception as e:
        raise gr.Error(str(e))

# Create Gradio interface
with gr.Blocks(title="Indic Text-to-Speech") as demo:
    gr.Markdown("# Indian Local Text-to-Speech Synthesizer")
    gr.Markdown("Generate natural speech in multiple Indian languages using AI4Bharat's model")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to speak",
                placeholder="Enter the text you want to convert to speech...",
                lines=3
            )

            with gr.Row():
                language_input = gr.Dropdown(
                    choices=sorted(LANGUAGE_VOICE_MAPPING.keys()),
                    label="Language",
                    value="English"
                )
                speaker_input = gr.Dropdown(
                    choices=LANGUAGE_VOICE_MAPPING["English"],  # Default choices
                    label="Speaker",
                    value=LANGUAGE_VOICE_MAPPING["English"][0]  # Default value
                )

            with gr.Row():
                emotion_input = gr.Dropdown(
                    choices=list(EMOTION_DESC.keys()),
                    label="Expressivity",
                    value="Neutral"
                )
                speed_input = gr.Dropdown(
                    choices=list(SPEED_DESC.keys()),
                    label="Speaking Speed",
                    value="Normal"
                )

            with gr.Row():
                pitch_input = gr.Dropdown(
                    choices=list(PITCH_DESC.keys()),
                    label="Pitch",
                    value="Medium"
                )
                background_input = gr.Dropdown(
                    choices=list(BACKGROUND_NOISE_DESC.keys()),
                    label="Background Noise",
                    value="Minimal"
                )

            with gr.Row():
                reverb_input = gr.Dropdown(
                    choices=list(REVERBERATION_DESC.keys()),
                    label="Reverberation",
                    value="Close"
                )
                quality_input = gr.Dropdown(
                    choices=list(QUALITY_DESC.keys()),
                    label="Audio Quality",
                    value="High"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"  # generate_speech returns a file path, not a numpy array
            )

    # Update speaker choices when language changes
    def update_speakers(language):
        speakers = get_speakers_for_language(language)
        return gr.Dropdown(choices=speakers, value=speakers[0])

    language_input.change(
        fn=update_speakers,
        inputs=[language_input],
        outputs=[speaker_input]
    )

    # Connect the components
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            language_input,
            speaker_input,
            emotion_input,
            speed_input,
            pitch_input,
            background_input,
            reverb_input,
            quality_input
        ],
        outputs=audio_output
    )

    # Example inputs. cache_examples=True below already runs generate_speech on
    # each row once at startup and stores the results, so no separate
    # pre-generation pass is needed.
    examples = [
        ["Hello, how are you?", "English", "Thoma", "Happy", "Normal", "Medium", "Minimal", "Close", "High"],
        ["नमस्ते, आप कैसे हैं?", "Hindi", "Rohit", "Neutral", "Normal", "Medium", "None", "Very Close", "Studio"],
        ["ನಮಸ್ಕಾರ, ಹೇಗಿದ್ದೀರಾ?", "Kannada", "Suresh", "Highly Expressive", "Fast", "High", "Minimal", "Moderate", "High"],
        ["How are you doing today?", "English", "Mary", "Monotone", "Slow", "Low", "Moderate", "Distant", "Good"],
    ]

    # Add examples with cached outputs
    gr.Examples(
        examples=examples,
        inputs=[
            text_input,
            language_input,
            speaker_input,
            emotion_input,
            speed_input,
            pitch_input,
            background_input,
            reverb_input,
            quality_input
        ],
        outputs=audio_output,
        fn=generate_speech,
        cache_examples=True,
        preprocess=False,   # Don't preprocess inputs
        postprocess=False   # Don't postprocess outputs
    )

if __name__ == "__main__":
    demo.launch()
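
# ---------------------------------------------------------------------------
# The local modules imported at the top (models/tts.py, utils/audio_utils.py,
# utils/input_validation.py, config/language_mapping.py) are not shown in this
# file. The sketches below are assumptions about how the two pieces that
# generate_speech() leans on most might look -- the TTS wrapper and the audio
# cache -- assuming the Parler-TTS checkpoint "ai4bharat/indic-parler-tts"
# (suggested by the "AI4Bharat's model" blurb above). Names such as CACHE_DIR
# are hypothetical; none of this is the Space's actual implementation.
# ---------------------------------------------------------------------------

# models/tts.py -- hedged sketch of the TTSModel wrapper
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer


class TTSModel:
    MODEL_ID = "ai4bharat/indic-parler-tts"  # assumed checkpoint

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = ParlerTTSForConditionalGeneration.from_pretrained(
            self.MODEL_ID
        ).to(self.device)
        # Parler-TTS conditions on two text streams: the prompt (what to say)
        # and the description (how to say it), each with its own tokenizer.
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
        self.description_tokenizer = AutoTokenizer.from_pretrained(
            self.model.config.text_encoder._name_or_path
        )
        self.sampling_rate = self.model.config.sampling_rate

    def generate_audio(self, text, description):
        desc = self.description_tokenizer(description, return_tensors="pt").to(self.device)
        prompt = self.tokenizer(text, return_tensors="pt").to(self.device)
        generation = self.model.generate(
            input_ids=desc.input_ids,
            attention_mask=desc.attention_mask,
            prompt_input_ids=prompt.input_ids,
            prompt_attention_mask=prompt.attention_mask,
        )
        # 1-D float waveform sampled at self.sampling_rate
        return generation.cpu().numpy().squeeze()


# utils/audio_utils.py -- hedged sketch of the file cache behind
# get_cached_audio/get_audio_filename/save_audio: the filename is a hash of
# every parameter that affects the output, so identical requests reuse a file.
import hashlib
import os

import soundfile as sf

CACHE_DIR = "audio_cache"  # hypothetical location


def get_audio_filename(*params):
    key = "|".join(str(p) for p in params)
    return hashlib.md5(key.encode("utf-8")).hexdigest() + ".wav"


def get_cached_audio(*params):
    path = os.path.join(CACHE_DIR, get_audio_filename(*params))
    return path if os.path.exists(path) else None


def save_audio(audio_array, filename, sampling_rate=44100):
    # Parler-TTS decodes at 44.1 kHz; the real helper presumably reads the
    # rate from the model config instead of hard-coding it.
    os.makedirs(CACHE_DIR, exist_ok=True)
    path = os.path.join(CACHE_DIR, filename)
    sf.write(path, audio_array, sampling_rate)
    return path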