| | import os |
| | import gradio as gr |
| | import torch |
| | import numpy |
| | import librosa |
| | import languages_dic |
| | from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline |
| |
|
| |
|
# Heading displayed above both tabs of the demo.
title = "Multilanguage Transcription and Translation"

# Sentence listing the input languages; appended verbatim to each description.
availableLang = "Afrikaans, Arabic, Armenian, Azerbaijani, Belarusian, Bosnian, Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, Galician, German, Greek, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Italian, Japanese, Kannada, Kazakh, Korean, Latvian, Lithuanian, Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh."

# Second paragraph, identical on both tabs, followed by the blank line that
# separates it from the language list.
_SPEED_NOTE = (
    "<p style='font-size: 16px;'> This demo uses the ASR system Whisper and runs on CPU basis hence responses might be slow. </p> \n"
    "\n"
)


def _describe(lead_sentence):
    """Return a tab description: lead paragraph, speed note, language list."""
    lead_paragraph = "<p style='font-size: 18px;'> " + lead_sentence + " </p>\n"
    return lead_paragraph + _SPEED_NOTE + availableLang


# Description for the file-upload tab.
description1 = _describe(
    "Transcribe an audio file containing a speech in any of the languages listed below and translate it to English."
)

# Description for the microphone tab.
description2 = _describe(
    "Transcribe a recording with your microphone of a speech in any of the languages listed below and translate it to English."
)
| |
|
| |
|
# Use the first CUDA GPU when PyTorch reports one, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
| | |
| |
|
class LM:
    """Container for the loaded speech-model assets, keyed by size name.

    Attributes:
        LMsizes: Size names handled by the demo (shared by all instances).
        model: Dict mapping a size name to its loaded model.
        processor: Dict mapping a size name to its processor.
        pipe: Dict mapping a size name to its ASR pipeline.
    """

    # Known size names; read-only configuration, so sharing it is fine.
    LMsizes = ["base", "small", "medium"]

    def __init__(self):
        # Created per instance: dict literals placed in the class body would
        # be one shared object mutated by every LM instance.
        self.model = {}
        self.processor = {}
        self.pipe = {}
| |
|
# Load every configured Whisper size once at import time so requests only
# have to look the assets up.
myLM = LM()

for LMsize in myLM.LMsizes:
    modelType = "openai/whisper-" + LMsize
    myLM.model[LMsize] = WhisperForConditionalGeneration.from_pretrained(modelType).to(device)
    myLM.processor[LMsize] = WhisperProcessor.from_pretrained(modelType)
    # Reuse the model and processor parts loaded above. Passing the checkpoint
    # name instead makes the pipeline load a second copy of the same weights.
    myLM.pipe[LMsize] = pipeline(
        task="automatic-speech-recognition",
        model=myLM.model[LMsize],
        tokenizer=myLM.processor[LMsize].tokenizer,
        feature_extractor=myLM.processor[LMsize].feature_extractor,
        device=device,
        chunk_length_s=29,
        stride_length_s=[5, 0],
    )
| | |
| |
|
| |
|
def detect_language(audio_path, model, processor, asr_pipe_whisper):
    """Return the name of the language detected at the start of an audio file.

    Args:
        audio_path: Path of the audio file to analyse.
        model: Model whose ``generate`` method produces the token ids.
        processor: Processor providing the ``feature_extractor``.
        asr_pipe_whisper: Pipeline whose tokenizer decodes the language token.

    Returns:
        The entry of ``languages_dic.LANGUAGES`` for the decoded language
        code, or ``None`` when the code is not in that mapping.
    """
    # Only the first 20 seconds are loaded, resampled to 16 kHz mono.
    waveform, sample_rate = librosa.load(audio_path, sr=16000, mono=True, duration=20)

    features = processor.feature_extractor(
        waveform, sampling_rate=sample_rate, return_tensors="pt"
    )
    model_input = features.input_features.to(device)

    # NOTE(review): only the token at position 1 is used below; capping the
    # generation length might save time - confirm against the installed
    # transformers version before changing.
    generated_ids = model.generate(model_input, task="transcribe")

    # Position 1 of the first sequence is decoded as the language tag
    # (presumably of the form "<|xx|>" - TODO confirm with the tokenizer).
    language_tag = asr_pipe_whisper.tokenizer.decode(generated_ids[0, 1])
    language_code = language_tag.strip("<|>")
    return languages_dic.LANGUAGES.get(language_code)
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
def getLM(modelsize):
    """Return the (model, processor, pipeline) triple for a size selection.

    Args:
        modelsize: Size label such as ``"small -244M"``; only the text before
            the first space is used as the lookup key.

    Returns:
        Tuple ``(model, processor, pipe)`` taken from ``myLM``.

    Raises:
        KeyError: If the size name has no loaded model.
    """
    # The previous ``len(...) > 0`` guard around this step could never be
    # false, because splitting on " " always yields at least one element.
    size_key = modelsize.strip().partition(" ")[0]
    if size_key not in myLM.model:
        raise KeyError(
            f"Unknown model size {size_key!r}; expected one of {myLM.LMsizes}"
        )
    return (myLM.model[size_key], myLM.processor[size_key], myLM.pipe[size_key])
| |
|
| | |
def processAudio(audio_path, modelsize):
    """Transcribe and translate one audio file and report its language.

    Args:
        audio_path: Path of the audio file handed over by the audio input.
        modelsize: Size label selected in the UI, resolved through ``getLM``.

    Returns:
        Tuple ``(detected language, transcription text, translation text)``.

    Raises:
        ValueError: If no audio path was supplied.
    """
    # Submitting without a file or recording yields no path; fail with a
    # clear message before any model work is started.
    if not audio_path:
        raise ValueError(
            "No audio input provided: upload a file or record with the microphone first."
        )

    model, processor, asr_pipe_whisper = getLM(modelsize)
    translation = asr_pipe_whisper(audio_path, max_new_tokens=256, generate_kwargs={"task": "translate"})
    transcription = asr_pipe_whisper(audio_path, generate_kwargs={"task": "transcribe"})

    inputLang = detect_language(audio_path, model, processor, asr_pipe_whisper)
    return (inputLang, transcription["text"], translation["text"])
| | |
| |
|
# Help text shown next to the model-size selector on both tabs.
modelsizeInfo = "Try out the performance for different model sizes. Larger models are more robust and deliver better results but are also slower."


def _build_interface(audio_source, description):
    """Create one tab: an audio input, a size selector and three text outputs.

    Args:
        audio_source: Value for the audio component's ``source`` argument.
        description: HTML description shown under the title.
    """
    audio_input = gr.Audio(source=audio_source, type="filepath", label="Audio Input")
    # getLM uses the text before the first space of the chosen label as key.
    size_selector = gr.Radio(
        ["base - 74M", "small -244M", "medium - 769M"],
        label="Select the model size",
        info=modelsizeInfo,
        value="small -244M",
    )
    result_boxes = [
        gr.Textbox(label="Detected input language"),
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Translation to english"),
    ]
    return gr.Interface(
        fn=processAudio,
        inputs=[audio_input, size_selector],
        outputs=result_boxes,
        title=title,
        description=description,
    )


# Tab for uploaded audio files.
app1 = _build_interface("upload", description1)

# Tab for microphone recordings.
app2 = _build_interface("microphone", description2)
| |
|
# Combine both interfaces into one tabbed application.
_tab_apps = [app1, app2]
_tab_names = ["Audio File", "Microphone"]
demo = gr.TabbedInterface(_tab_apps, _tab_names)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
| |
|