import gradio as gr
import torch
from transformers import VitsModel, AutoTokenizer

# 1. Map our languages to their Hugging Face model IDs
model_mapping = {
    "isiXhosa": "UBC-NLP/Simba-TTS-xho",
    "Sesotho": "UBC-NLP/Simba-TTS-sot",
    "Afrikaans": "UBC-NLP/Simba-TTS-afr",
}

# 2. Define the sample texts for each language
# (adjacent string literals are concatenated by Python; the resulting text is
# one paragraph, a blank line, then a closing sentence)
sample_texts = {
    "isiXhosa": (
        "Molo! Ndiyathemba ukuba uphilile. "
        "Namhlanje ilanga liyakhanya kwaye umoya upholile kancinci. "
        "Abantu bahamba-hamba ezitalatweni, abanye baya emsebenzini, "
        "abanye baya ezikolweni. "
        "Ekhaya, abantwana badlala ngaphandle, bevuya kwaye behleka. "
        "Ubomi buyaqhubeka mihla le, kwaye kubalulekile ukuxabisa "
        "amaxesha amancinci anika uvuyo.\n\n"
        "Ukuba unemibuzo okanye ufuna enye into, undazise!"
    ),
    "Sesotho": (
        "Dumela! Ke ts'epa hore o phetse hantle. "
        "Kajeno letsatsi lea chaba, mme moya o foka butle. "
        "Batho ba bang ba ya mosebetsing, ba bang ba ya sekolong. "
        "Bana bona ba bapala kantle, ba thabile ba bile ba tsheha. "
        "Bophelo bo tswela pele letsatsi le letsatsi, mme ho bohlokwa "
        "ho ananela dintho tse nyane tse re thabisang.\n\n"
        "Haeba o hloka thuso kapa o batla mongolo o mong, mpolelle!"
    ),
    "Afrikaans": (
        "Hallo! Ek hoop dit gaan goed met jou. "
        "Vandag skyn die son en daar waai ’n ligte wind. "
        "Mense is besig om werk toe te gaan en kinders maak gereed vir skool. "
        "In die buurt hoor jy voëls sing en mense gesels. "
        "Die lewe gaan aan, en dit is belangrik om die klein oomblikke "
        "te waardeer wat vreugde bring.\n\n"
        "Laat weet gerus as jy nog iets nodig het!"
    ),
}

# Dictionaries to hold loaded models (keyed by language name, filled lazily)
loaded_models = {}
loaded_tokenizers = {}


def _get_model_and_tokenizer(language):
    """Return the cached ``(model, tokenizer)`` pair for *language*.

    The pair is loaded from the Hugging Face model ID in ``model_mapping`` the
    first time a language is requested and kept in ``loaded_models`` /
    ``loaded_tokenizers`` afterwards.

    Raises:
        KeyError: if *language* is not a key of ``model_mapping``.
    """
    model_id = model_mapping[language]

    # Check both caches so a half-filled state can never be mistaken for
    # "already loaded".
    if language not in loaded_models or language not in loaded_tokenizers:
        print(f"Loading {language} model for the first time...")
        # Load both objects before touching the caches: if either load raises,
        # nothing is cached and the next call simply retries.
        model = VitsModel.from_pretrained(model_id)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        loaded_models[language] = model
        loaded_tokenizers[language] = tokenizer

    return loaded_models[language], loaded_tokenizers[language]


def synthesize_speech(language, text):
    """Core function that takes UI inputs and returns audio for the UI.

    Args:
        language: A key of ``model_mapping`` selecting which model to use.
        text: The text to synthesize.

    Returns:
        ``None`` when *text* is missing, empty, or only whitespace; otherwise
        a ``(sample_rate, audio_data)`` tuple, where ``sample_rate`` is read
        from the model config and ``audio_data`` is the squeezed waveform as a
        NumPy array.

    Raises:
        KeyError: if *language* is not a key of ``model_mapping``.
    """
    # Nothing to synthesize; also tolerates a None payload.
    if not text or not text.strip():
        return None

    # Lazy load the model and tokenizer
    model, tokenizer = _get_model_and_tokenizer(language)

    # Generate the audio
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        output = model(**inputs).waveform

    sample_rate = model.config.sampling_rate
    # Drop size-1 dimensions and hand the samples over as a NumPy array.
    audio_data = output.squeeze().cpu().numpy()
    return (sample_rate, audio_data)


def update_text(language):
    """Returns the sample text for the selected language.

    Falls back to an empty string when *language* has no entry in
    ``sample_texts``.
    """
    return sample_texts.get(language, "")


# 3. Build the Web Interface using Gradio Blocks
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Simba African TTS Synthesizer")
    gr.Markdown(
        "Generate high-quality Text-to-Speech for isiXhosa, Sesotho, and Afrikaans."
    )

    with gr.Row():
        with gr.Column():
            # Dropdown with isiXhosa as the default
            lang_dropdown = gr.Dropdown(
                choices=list(model_mapping.keys()),
                value="isiXhosa",
                label="Select Language",
            )
            # Textbox prepopulated with isiXhosa text to match the default
            # dropdown value
            text_input = gr.Textbox(
                lines=6,
                value=sample_texts["isiXhosa"],
                label="Text to Synthesize",
            )
            submit_btn = gr.Button("Generate Audio", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", type="numpy")

    # 4. Set up the Interactivity
    # When the dropdown changes, run `update_text` and push the result into
    # `text_input`
    lang_dropdown.change(fn=update_text, inputs=lang_dropdown, outputs=text_input)

    # When the button is clicked, run `synthesize_speech`
    submit_btn.click(
        fn=synthesize_speech,
        inputs=[lang_dropdown, text_input],
        outputs=audio_output,
    )

# 5. Launch the local server
if __name__ == "__main__":
    demo.launch()