Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from transformers import VitsModel, AutoTokenizer | |
# 1. Language display name -> Hugging Face model ID.
# The insertion order here is also the order the languages are listed in the UI dropdown.
model_mapping = dict(
    (
        ("isiXhosa", "UBC-NLP/Simba-TTS-xho"),
        ("Sesotho", "UBC-NLP/Simba-TTS-sot"),
        ("Afrikaans", "UBC-NLP/Simba-TTS-afr"),
    )
)
# 2. Sample passage per language, keyed by the same display names as model_mapping.
# Each passage is written as adjacent string literals (one per sentence) that Python
# joins into a single string; the "\n\n" piece is the paragraph break before the
# closing sentence.
sample_texts = {
    "isiXhosa": (
        "Molo! Ndiyathemba ukuba uphilile. "
        "Namhlanje ilanga liyakhanya kwaye umoya upholile kancinci. "
        "Abantu bahamba-hamba ezitalatweni, abanye baya emsebenzini, abanye baya ezikolweni. "
        "Ekhaya, abantwana badlala ngaphandle, bevuya kwaye behleka. "
        "Ubomi buyaqhubeka mihla le, kwaye kubalulekile ukuxabisa amaxesha amancinci anika uvuyo."
        "\n\n"
        "Ukuba unemibuzo okanye ufuna enye into, undazise!"
    ),
    "Sesotho": (
        "Dumela! Ke ts'epa hore o phetse hantle. "
        "Kajeno letsatsi lea chaba, mme moya o foka butle. "
        "Batho ba bang ba ya mosebetsing, ba bang ba ya sekolong. "
        "Bana bona ba bapala kantle, ba thabile ba bile ba tsheha. "
        "Bophelo bo tswela pele letsatsi le letsatsi, mme ho bohlokwa ho ananela dintho tse nyane tse re thabisang."
        "\n\n"
        "Haeba o hloka thuso kapa o batla mongolo o mong, mpolelle!"
    ),
    "Afrikaans": (
        "Hallo! Ek hoop dit gaan goed met jou. "
        "Vandag skyn die son en daar waai ’n ligte wind. "
        "Mense is besig om werk toe te gaan en kinders maak gereed vir skool. "
        "In die buurt hoor jy voëls sing en mense gesels. "
        "Die lewe gaan aan, en dit is belangrik om die klein oomblikke te waardeer wat vreugde bring."
        "\n\n"
        "Laat weet gerus as jy nog iets nodig het!"
    ),
}
# Module-level caches keyed by language name: one for models, one for tokenizers.
# They start empty; entries are added the first time a language is synthesized.
loaded_models, loaded_tokenizers = {}, {}
def synthesize_speech(language, text):
    """Synthesize speech for ``text`` with the TTS model selected by ``language``.

    Args:
        language: Key into ``model_mapping`` (the value of the language dropdown).
        text: Text to speak. ``None``, empty, or whitespace-only text yields no audio.

    Returns:
        ``None`` when there is nothing to synthesize; otherwise a
        ``(sample_rate, audio_data)`` tuple where ``sample_rate`` comes from the
        model config and ``audio_data`` is the squeezed waveform as a NumPy array.

    Raises:
        KeyError: If ``language`` is not a key of ``model_mapping``.
    """
    # Treat a missing value like blank input; calling .strip() on None would
    # raise AttributeError instead of simply producing no audio.
    if text is None or not text.strip():
        return None

    model_id = model_mapping[language]

    # Lazy load the model and tokenizer. Both are loaded into locals first and
    # cached only after both loads succeeded: caching the model before the
    # tokenizer load could leave a model without a tokenizer if the second load
    # failed, and every later call for that language would then skip loading
    # and die with a KeyError on the tokenizer lookup.
    if language not in loaded_models or language not in loaded_tokenizers:
        print(f"Loading {language} model for the first time...")
        model = VitsModel.from_pretrained(model_id)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        loaded_models[language] = model
        loaded_tokenizers[language] = tokenizer

    model = loaded_models[language]
    tokenizer = loaded_tokenizers[language]

    # Generate the audio; gradients are not needed for inference.
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs).waveform

    sample_rate = model.config.sampling_rate
    # Drop singleton dimensions and move to host memory as a NumPy array.
    audio_data = output.squeeze().cpu().numpy()
    return (sample_rate, audio_data)
def update_text(language):
    """Look up the sample passage for ``language``; unknown languages give an empty string."""
    if language in sample_texts:
        return sample_texts[language]
    return ""
# 3. Web interface, assembled with Gradio Blocks.
# NOTE(review): the nesting below is reconstructed from statement order, since the
# original indentation was not available — confirm against the deployed layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Simba African TTS Synthesizer")
    gr.Markdown("Generate high-quality Text-to-Speech for isiXhosa, Sesotho, and Afrikaans.")

    with gr.Row():
        with gr.Column():
            # Language selector; isiXhosa is preselected.
            lang_dropdown = gr.Dropdown(
                label="Select Language",
                choices=list(model_mapping),
                value="isiXhosa",
            )
            # Starts with the isiXhosa sample so the text agrees with the preselected language.
            text_input = gr.Textbox(
                label="Text to Synthesize",
                value=sample_texts["isiXhosa"],
                lines=6,
            )
            submit_btn = gr.Button("Generate Audio", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # 4. Event wiring.
    # Changing the language replaces the textbox content with that language's sample text.
    lang_dropdown.change(
        fn=update_text,
        inputs=lang_dropdown,
        outputs=text_input,
    )
    # Clicking the button synthesizes the current text in the selected language.
    submit_btn.click(
        fn=synthesize_speech,
        inputs=[lang_dropdown, text_input],
        outputs=audio_output,
    )

# 5. Start the server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()