from deep_translator import GoogleTranslator
from transformers import pipeline
import gradio as gr
import time
from pytube import YouTube

# Speech-recognition pipeline shared by every transcription request.
pipe = pipeline(model="tlord/whisper")  # change to "your-username/the-name-you-picked"

# ---------------------------------------------------------------------------
# Earlier interface prototypes, kept commented out for reference.
# ---------------------------------------------------------------------------
# def transcribe(audio, state = ""):
#     time.sleep(2)
#     text = pipe(audio)["text"]
#     state += text + " "
#     return state, state

# iface = gr.Interface(
#     title="Whisper Small Swedish",
#     description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
#     fn=transcribe,
#     inputs=[gr.Audio(source="microphone", type="filepath", streaming=True), "state"],
#     outputs=["text", "state"],
#     live=True
# )

# chatbot = gr.Chatbot().style(color_map=("green", "gray"))
# iface = gr.Interface(
#     title="Whisper Sentiment Analysis in Swedish",
#     description="Say something and the Oracle will respond depending on your mood.",
#     fn=transcribe,
#     inputs=[gr.Audio(source="microphone", type="filepath"), "state"],
#     outputs=[chatbot, "state"],
#     allow_flagging="never",
# )

# Human-readable language name -> language code handed to GoogleTranslator.
# Keys are shown verbatim in the UI dropdown, so they must stay unique.
LANGUAGES = {
    'afrikaans': 'af',
    'albanian': 'sq',
    'amharic': 'am',
    'arabic': 'ar',
    'armenian': 'hy',
    'azerbaijani': 'az',
    'basque': 'eu',
    'belarusian': 'be',
    'bengali': 'bn',
    'bosnian': 'bs',
    'bulgarian': 'bg',
    'catalan': 'ca',
    'cebuano': 'ceb',
    'chichewa': 'ny',
    # Was 'zh-c' / 'zh-t', which are not valid Google language codes.
    'chinese (simplified)': 'zh-CN',
    'chinese (traditional)': 'zh-TW',
    'corsican': 'co',
    'croatian': 'hr',
    'czech': 'cs',
    'danish': 'da',
    'dutch': 'nl',
    'english': 'en',
    'esperanto': 'eo',
    'estonian': 'et',
    'filipino': 'tl',
    'finnish': 'fi',
    'french': 'fr',
    'frisian': 'fy',
    'galician': 'gl',
    'georgian': 'ka',
    'german': 'de',
    'greek': 'el',
    'gujarati': 'gu',
    'haitian creole': 'ht',
    'hausa': 'ha',
    'hawaiian': 'haw',
    # The key used to appear twice ('iw' then 'he'), so the second entry
    # silently replaced the first. Only one entry can exist per key.
    # NOTE(review): 'iw' is believed to be the code deep_translator lists
    # for Hebrew -- confirm against the installed version.
    'hebrew': 'iw',
    'hindi': 'hi',
    'hmong': 'hmn',
    'hungarian': 'hu',
    'icelandic': 'is',
    'igbo': 'ig',
    'indonesian': 'id',
    'irish': 'ga',
    'italian': 'it',
    'japanese': 'ja',
    'javanese': 'jw',
    'kannada': 'kn',
    'kazakh': 'kk',
    'khmer': 'km',
    'korean': 'ko',
    'kurdish (kurmanji)': 'ku',
    'kyrgyz': 'ky',
    'lao': 'lo',
    'latin': 'la',
    'latvian': 'lv',
    'lithuanian': 'lt',
    'luxembourgish': 'lb',
    'macedonian': 'mk',
    'malagasy': 'mg',
    'malay': 'ms',
    'malayalam': 'ml',
    'maltese': 'mt',
    'maori': 'mi',
    'marathi': 'mr',
    'mongolian': 'mn',
    'myanmar (burmese)': 'my',
    'nepali': 'ne',
    'norwegian': 'no',
    'odia': 'or',
    'pashto': 'ps',
    'persian': 'fa',
    'polish': 'pl',
    'portuguese': 'pt',
    'punjabi': 'pa',
    'romanian': 'ro',
    'russian': 'ru',
    'samoan': 'sm',
    'scots gaelic': 'gd',
    'serbian': 'sr',
    'sesotho': 'st',
    'shona': 'sn',
    'sindhi': 'sd',
    'sinhala': 'si',
    'slovak': 'sk',
    'slovenian': 'sl',
    'somali': 'so',
    'spanish': 'es',
    'sundanese': 'su',
    'swahili': 'sw',
    'tajik': 'tg',
    'tamil': 'ta',
    'telugu': 'te',
    'thai': 'th',
    'turkish': 'tr',
    'ukrainian': 'uk',
    'urdu': 'ur',
    'uyghur': 'ug',
    'uzbek': 'uz',
    'vietnamese': 'vi',
    'welsh': 'cy',
    'xhosa': 'xh',
    'yiddish': 'yi',
    'yoruba': 'yo',
    'zulu': 'zu',
}


def get_soundfile(link):
    """Download the first audio-only stream of a YouTube video.

    Args:
        link: URL of the YouTube video.

    Returns:
        Whatever ``Stream.download`` returns for the saved file (used by the
        caller as the audio input of the speech-recognition pipeline).

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(link)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        # Indexing [0] on an empty query would only give an opaque IndexError.
        raise ValueError(f"No audio-only stream available for {link!r}")
    # NOTE: the download always goes to the same file name, so each call
    # overwrites the previous download.
    return stream.download(filename="tmp.mp4")


def translate(message, lang):
    """Translate Swedish text into the requested language.

    Args:
        message: Text to translate; the source language is fixed to ``'sv'``.
        lang: Target language code (a value of ``LANGUAGES``).

    Returns:
        The translated text, or ``"Error, sorry!"`` when there is nothing to
        translate or the translator returns an empty result.
    """
    if not message or not message.strip():
        # Nothing to translate: skip the translator call entirely.
        return "Error, sorry!"
    res = GoogleTranslator(source='sv', target=lang).translate(message)
    if res:
        return res
    return "Error, sorry!"

def transcribe(audio, lang, history, link):
    """Transcribe audio, translate the transcript and extend the chat history.

    Args:
        audio: Audio input for the speech-recognition pipeline (the UI passes
            the value of the microphone component). Ignored when ``link`` is
            non-blank.
        lang: Key of ``LANGUAGES`` naming the target language; falls back to
            ``'english'`` when empty or ``None``.
        history: List of ``(transcript, translation)`` tuples accumulated so
            far, or a falsy value to start a new list.
        link: Optional YouTube URL. When non-blank, its audio is downloaded
            and used instead of ``audio``.

    Returns:
        The updated history twice: once for the chatbot display and once for
        the stored state.

    Raises:
        ValueError: If neither a recording nor a link was supplied.
        KeyError: If ``lang`` is not a key of ``LANGUAGES``.
    """
    # Treat None / whitespace-only text as "no link given".
    link = (link or "").strip()
    if link:
        audio = get_soundfile(link)
    if audio is None:
        # Fail with an actionable message instead of handing None to the
        # speech-recognition pipeline.
        raise ValueError(
            "No audio to transcribe: record something or provide a YouTube link."
        )
    if not lang:
        lang = 'english'
    history = history or []
    lang_code = LANGUAGES[lang]
    text = pipe(audio)["text"]
    history.append((text, translate(text, lang_code)))
    return history, history


# UI: inputs (language, microphone, link, submit) in the left column, the
# chat-style transcript/translation log in the right column.
with gr.Blocks() as demo:
    history = gr.State([])
    with gr.Row():
        with gr.Column():
            language = gr.Dropdown(list(LANGUAGES.keys()), value="english")
            audio = gr.Audio(source="microphone", type="filepath")
            link = gr.Textbox(label = "Put YouTube link here", value="")
            submit = gr.Button(value="Translate")
        with gr.Column():
            chatbot = gr.Chatbot().style(color_map=("green", "gray"))
    # The handler returns the history twice, matching the two outputs below.
    submit.click(transcribe, inputs=[audio, language, history, link], outputs=[chatbot, history])

demo.launch()