| import os |
| import gradio as gr |
| import whisper |
| import requests |
| import tempfile |
| from neon_tts_plugin_coqui import CoquiTTS |
|
|
| |
| |
|
|
| |
# Whisper checkpoints: "base" is used only for fast language detection,
# while "medium" performs the actual transcription/translation decoding
# (see whisper_stt below). Both load at import time.
model = whisper.load_model("base")
model_med = whisper.load_model("medium")
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
| |
| API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom" |
| HF_TOKEN = os.environ["HF_TOKEN"] |
| headers = {"Authorization": f"Bearer {HF_TOKEN}"} |
| |
| |
|
|
|
|
| |
# Coqui TTS plugin setup: LANGUAGES is the plugin's supported language codes
# (logged for reference); coquiTTS is a single shared synthesizer instance
# reused by tts() for every request.
LANGUAGES = list(CoquiTTS.langs.keys())
coquiTTS = CoquiTTS()
print(f"Languages for Coqui are: {LANGUAGES}")
| |
| |
| |
| |
|
|
|
|
| |
def driver_fun(audio):
    """Pipeline entry point wired to the Gradio button.

    Runs speech-to-text, generates an AI reply in both the detected
    language and English, and synthesizes speech for the reply.

    Returns (transcript, english_translation, reply_native, reply_en, wav_path).
    """
    transcript, translation_en, detected_lang = whisper_stt(audio)

    # One Bloom reply in the speaker's language, one in English.
    reply_native = lang_model_response(transcript, detected_lang)
    reply_en = lang_model_response(translation_en, 'en')

    # Only Spanish/French get a native-language voice here; every other
    # language falls back to speaking the English reply.
    if detected_lang in ('es', 'fr'):
        speech = tts(reply_native, detected_lang)
    else:
        speech = tts(reply_en, 'en')
    return transcript, translation_en, reply_native, reply_en, speech
|
|
|
|
| |
def whisper_stt(audio):
    """Transcribe and translate an audio file with Whisper.

    Returns (native_transcript, english_translation, detected_lang_code).
    """
    print("Inside Whisper TTS")

    # Load the waveform and pad/trim it to the 30-second window Whisper expects.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))

    # Log-Mel spectrogram, moved to the same device as the base model.
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # Language identification runs on the small "base" model for speed.
    _, probs = model.detect_language(mel)
    lang = max(probs, key=probs.get)
    print(f"Detected language: {lang}")

    # Decode twice with the "medium" model: once in the detected language,
    # once translated to English.
    opts_native = whisper.DecodingOptions(fp16=False, language=lang, task='transcribe')
    opts_english = whisper.DecodingOptions(fp16=False, language='en', task='translate')
    decoded_native = whisper.decode(model_med, mel, opts_native)
    decoded_english = whisper.decode(model_med, mel, opts_english)

    print(f"transcript is : {decoded_native.text}")
    print(f"translation is : {decoded_english.text}")

    return decoded_native.text, decoded_english.text, lang
|
|
|
|
| |
def lang_model_response(prompt, language):
    """Send a few-shot Q/A prompt to the Bloom hosted-inference API.

    Parameters:
        prompt: user utterance (Whisper transcript or its English translation).
        language: ISO code; 'en'/'es'/'fr' get a tailored few-shot prefix and
            answer-marker, any other code sends the prompt unchanged.

    Returns:
        The extracted answer string, or "" when the API reply is malformed
        (error payload, rate limit, model still loading).
    """
    print(f"Inside lang_model_response - Prompt is :{prompt}")
    # One-worked-example few-shot prefixes per supported language.
    p_en = """Question: How are you doing today?
Answer: I am doing good, thanks.
Question: """
    p_es = """Pregunta: Cómo estás hoy?
Responder: Estoy bien, gracias.
Pregunta: """
    p_fr = """Question: Comment vas-tu aujourd'hui?
Réponse: Je vais bien, merci.
Question: """

    # Empty transcript (e.g. silence / failed STT): substitute a generic question.
    if len(prompt) == 0:
        prompt = """Question: Can you help me please?
Answer: Sure, I am here for you.
Question: """

    # Marker that precedes the answer we want to cut out of the generation.
    answer_markers = {'en': "Answer: ", 'es': "Responder: ", 'fr': "Réponse: "}

    if language == 'en':
        prompt = p_en + prompt + "\n" + "Answer: "
    elif language == 'es':
        prompt = p_es + prompt + "\n" + "Responder: "
    elif language == 'fr':
        prompt = p_fr + prompt + "\n" + "Réponse: "

    json_ = {"inputs": prompt,
             "parameters":
             {
                 "top_p": 0.90,
                 "max_new_tokens": 64,
                 "temperature": 1.1,
                 "return_full_text": False,
                 "do_sample": True,
             },
             "options":
             {"use_cache": True,
              "wait_for_model": True,
              }, }
    response = requests.post(API_URL, headers=headers, json=json_)

    output = response.json()
    # Bug fix: on failure the API returns a dict such as {"error": "..."}
    # (rate limit, bad token, model loading) instead of a list; the original
    # output[0]['generated_text'] then raised. Degrade to an empty answer.
    if not isinstance(output, list) or not output or 'generated_text' not in output[0]:
        print(f"Bloom API error response is : {output}")
        return ""
    output_tmp = output[0]['generated_text']
    print(f"Bloom API Response is : {output_tmp}")

    marker = answer_markers.get(language)
    if marker is not None:
        # Bug fix: the original indexed split(marker)[2] unconditionally and
        # raised IndexError whenever the marker occurred fewer than twice in
        # the generation (likely, since return_full_text=False strips the
        # prompt). Keep the [2] behavior when possible, else take the last
        # segment after the marker.
        parts = output_tmp.split(marker)
        segment = parts[2] if len(parts) > 2 else parts[-1]
        solution = segment.split("\n")[0]
    else:
        # Unsupported language: keep only the first sentence-ish chunk.
        if '?' in output_tmp:
            solution = output_tmp.split("?")[0]
        elif '.' in output_tmp:
            solution = output_tmp.split(".")[0]
        elif ',' in output_tmp:
            solution = output_tmp.split(",")[0]
        else:
            solution = output_tmp
        print(f"Another language was used : {language}")

    print(f"Final Bloom Response after splits is: {solution}")
    return solution
|
|
| |
def tts(text, language):
    """Synthesize `text` with the shared Coqui TTS instance.

    Falls back to an English voice when `language` is not one of the codes
    Coqui supports here. Returns the path of the generated WAV file.
    """
    print(f"Inside tts - language is : {language}")
    # Languages with an available Coqui voice; anything else speaks English.
    supported = {'en', 'es', 'fr', 'de', 'pl', 'uk', 'ro',
                 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga'}
    if language not in supported:
        language = 'en'
    # delete=False: the file must outlive this call because Gradio reads it
    # back by path for the audio output component.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        coquiTTS.get_tts(text, wav_file, speaker={"language": language})
        return wav_file.name
|
|
# --- Gradio UI -------------------------------------------------------------
# Left column: microphone input, trigger button, transcript/translation boxes.
# Right column: synthesized audio reply plus the two generated-text boxes.
# The button wires driver_fun's 5-tuple return to the 5 output components
# in the same order. demo.launch() blocks, so this must stay last.
demo = gr.Blocks()
with demo:
    gr.Markdown("<h1><center>Talk to Your Multilingual AI Assistant</center></h1>")
    gr.Markdown(
        """Model pipeline consisting of - <br>- [**Whisper**](https://github.com/openai/whisper)for Speech-to-text, <br>- [**Bloom**](https://huggingface.co/bigscience/bloom) for Text-generation, and <br>- [**CoquiTTS**](https://huggingface.co/coqui) for Text-To-Speech. <br><br> Front end is built using [**Gradio Block API**](https://gradio.app/docs/#blocks).<br>All three models are Multilingual, however, there are only these three overlapping languages among them - <u>Spanish (es), French(fr), and English(en). Hence it would be suggested to test this ML-App using these three languages to get the best results</u>. If an English voice input is given then both the textbox on the left-hand side would show the same transcripts. However, if the input is either in _Spanish_ or _French_, then the first textbox would show the language transcript, while the next one would show its English translations.<br>If you want to reuse the App, simply click on the small cross button in the top right corner of your voice record panel, and then press record again!
    """)
    with gr.Row():
        with gr.Column():
            in_audio = gr.Audio(source="microphone", type="filepath", label='Record your voice here in English, Spanish or French for best results-')
            b1 = gr.Button("AI response pipeline (Whisper - Bloom - Coqui pipeline)")
            out_transcript = gr.Textbox(label= 'English/Spanish/French Transcript of your Audio using OpenAI Whisper')
            out_translation_en = gr.Textbox(label= 'English Translation of audio using OpenAI Whisper')
        with gr.Column():
            out_audio = gr.Audio(label='AI response in Audio form in your language - This will be either in Spanish, or in French or in English for all other languages -')
            out_generated_text = gr.Textbox(label= 'AI response to your query in your preferred language using Bloom! ')
            out_generated_text_en = gr.Textbox(label= 'AI response to your query in English using Bloom! ')

    # Outputs must match driver_fun's return order exactly.
    b1.click(driver_fun,inputs=[in_audio], outputs=[out_transcript, out_translation_en, out_generated_text,out_generated_text_en, out_audio])

demo.launch(enable_queue=True, debug=True)