# Source: Hugging Face Space "jacobda/whisper"
# Space status when captured: Runtime error
# File size: 4,756 bytes

from deep_translator import GoogleTranslator
from transformers import pipeline
import gradio as gr
import time
from pytube import YouTube

# Model pipeline, created once at import time and reused by transcribe() for
# every request. It loads the "tlord/whisper" checkpoint (presumably a
# fine-tuned Whisper speech-recognition model - confirm on the model card).
# To use a different checkpoint, change the id to
# "your-username/the-name-you-picked".
pipe = pipeline(model="tlord/whisper")

# def transcribe(audio, state = ""):
    # time.sleep(2)
    # text = pipe(audio)["text"]
    # state += text + " "
    # return state, state

# iface = gr.Interface(
#     title="Whisper Small Swedish",
#     description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
#     fn=transcribe, 
#     inputs=[gr.Audio(source="microphone", type="filepath", streaming=True), "state"], 
#     outputs=["text", "state"],
#     live=True
# )

# chatbot = gr.Chatbot().style(color_map=("green", "gray"))
# iface = gr.Interface(
#     title="Whisper Sentiment Analysis in Swedish",
#     description="Say something and the Oracle will respond depending on your mood.",
#     fn=transcribe,
#     inputs=[gr.Audio(source="microphone", type="filepath"), "state"],
#     outputs=[chatbot, "state"],
#     allow_flagging="never",
# )

# Maps the language names offered in the UI dropdown to the language codes
# handed to GoogleTranslator as the translation target.
#
# Each name must appear exactly once: a dict literal silently keeps only the
# last value of a repeated key. Hebrew uses Google's legacy code 'iw', and
# the Chinese variants use the full region-qualified codes.
LANGUAGES = {
    'afrikaans': 'af',
    'albanian': 'sq',
    'amharic': 'am',
    'arabic': 'ar',
    'armenian': 'hy',
    'azerbaijani': 'az',
    'basque': 'eu',
    'belarusian': 'be',
    'bengali': 'bn',
    'bosnian': 'bs',
    'bulgarian': 'bg',
    'catalan': 'ca',
    'cebuano': 'ceb',
    'chichewa': 'ny',
    'chinese (simplified)': 'zh-CN',
    'chinese (traditional)': 'zh-TW',
    'corsican': 'co',
    'croatian': 'hr',
    'czech': 'cs',
    'danish': 'da',
    'dutch': 'nl',
    'english': 'en',
    'esperanto': 'eo',
    'estonian': 'et',
    'filipino': 'tl',
    'finnish': 'fi',
    'french': 'fr',
    'frisian': 'fy',
    'galician': 'gl',
    'georgian': 'ka',
    'german': 'de',
    'greek': 'el',
    'gujarati': 'gu',
    'haitian creole': 'ht',
    'hausa': 'ha',
    'hawaiian': 'haw',
    'hebrew': 'iw',
    'hindi': 'hi',
    'hmong': 'hmn',
    'hungarian': 'hu',
    'icelandic': 'is',
    'igbo': 'ig',
    'indonesian': 'id',
    'irish': 'ga',
    'italian': 'it',
    'japanese': 'ja',
    'javanese': 'jw',
    'kannada': 'kn',
    'kazakh': 'kk',
    'khmer': 'km',
    'korean': 'ko',
    'kurdish (kurmanji)': 'ku',
    'kyrgyz': 'ky',
    'lao': 'lo',
    'latin': 'la',
    'latvian': 'lv',
    'lithuanian': 'lt',
    'luxembourgish': 'lb',
    'macedonian': 'mk',
    'malagasy': 'mg',
    'malay': 'ms',
    'malayalam': 'ml',
    'maltese': 'mt',
    'maori': 'mi',
    'marathi': 'mr',
    'mongolian': 'mn',
    'myanmar (burmese)': 'my',
    'nepali': 'ne',
    'norwegian': 'no',
    'odia': 'or',
    'pashto': 'ps',
    'persian': 'fa',
    'polish': 'pl',
    'portuguese': 'pt',
    'punjabi': 'pa',
    'romanian': 'ro',
    'russian': 'ru',
    'samoan': 'sm',
    'scots gaelic': 'gd',
    'serbian': 'sr',
    'sesotho': 'st',
    'shona': 'sn',
    'sindhi': 'sd',
    'sinhala': 'si',
    'slovak': 'sk',
    'slovenian': 'sl',
    'somali': 'so',
    'spanish': 'es',
    'sundanese': 'su',
    'swahili': 'sw',
    'tajik': 'tg',
    'tamil': 'ta',
    'telugu': 'te',
    'thai': 'th',
    'turkish': 'tr',
    'ukrainian': 'uk',
    'urdu': 'ur',
    'uyghur': 'ug',
    'uzbek': 'uz',
    'vietnamese': 'vi',
    'welsh': 'cy',
    'xhosa': 'xh',
    'yiddish': 'yi',
    'yoruba': 'yo',
    'zulu': 'zu',
}

def get_soundfile(link):
    """Download the audio of a YouTube video and return the local file path.

    Args:
        link: URL of the YouTube video.

    Returns:
        The path returned by the stream's ``download`` call.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(link)
    # first() yields None instead of raising IndexError when nothing matches,
    # which lets us report the actual problem to the caller.
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError(f"No audio-only stream available for {link!r}")

    # NOTE: the filename is fixed, so each download reuses the same
    # "tmp.mp4" and concurrent requests would share that file.
    return stream.download(filename="tmp.mp4")

def translate(message, lang, source='sv'):
    """Translate *message* into *lang* using Google Translate.

    Args:
        message: Text to translate.
        lang: Target language code (a value of ``LANGUAGES``).
        source: Source language code; defaults to Swedish (``'sv'``).

    Returns:
        The translated text, or ``"Error, sorry!"`` when there is nothing to
        translate or the translator returns no result.
    """
    # Empty or whitespace-only input (e.g. a silent recording) has nothing to
    # translate; skip the network call and report the failure directly.
    if not message or not message.strip():
        return "Error, sorry!"

    res = GoogleTranslator(source=source, target=lang).translate(message)
    # The translator may return None or an empty string on failure.
    return res if res else "Error, sorry!"

def transcribe(audio, lang, history, link):
    """Transcribe audio, translate the text, and append both to the history.

    Args:
        audio: Path of the recorded audio file, or None/empty if nothing was
            recorded.
        lang: Target language name (a key of ``LANGUAGES``); falls back to
            ``'english'`` when None or empty.
        history: List of ``(transcription, translation)`` pairs so far, or
            None.
        link: Optional YouTube URL. When non-blank, its audio is used instead
            of the recording.

    Returns:
        A ``(history, history)`` tuple: the same updated list twice, one for
        the chatbot output and one for the stored state. When there is no
        audio to process the history is returned unchanged.
    """
    # Work on a copy so the caller's list is never mutated in place.
    history = list(history or [])

    # A provided link takes precedence over the microphone recording.
    # Truthiness also covers link=None, which `link != ""` would let through.
    if link and link.strip():
        audio = get_soundfile(link.strip())

    # Nothing recorded and no link given: there is nothing to transcribe.
    if not audio:
        return history, history

    lang_code = LANGUAGES[lang or 'english']
    text = pipe(audio)["text"]
    history.append((text, translate(text, lang_code)))

    return history, history

# Build the UI. The first column holds the inputs (target language, microphone
# recording, optional YouTube link, submit button); the second column holds
# the chat-style output.
with gr.Blocks() as demo:
    # State holding the list of (transcription, translation) pairs that
    # transcribe() extends on every click; starts empty.
    history = gr.State([])
    with gr.Row():
        with gr.Column():
            # Choices are the language names; transcribe() maps the selected
            # name to its code through LANGUAGES.
            language = gr.Dropdown(list(LANGUAGES.keys()), value="english")
            # type="filepath": transcribe() receives a path to the recording.
            audio = gr.Audio(source="microphone", type="filepath")
            # When non-empty, transcribe() uses this link instead of the recording.
            link = gr.Textbox(label = "Put YouTube link here", value="")
            submit = gr.Button(value="Translate")
        with gr.Column():
            chatbot = gr.Chatbot().style(color_map=("green", "gray"))
    # transcribe() returns the history twice: once for the chatbot display and
    # once to store back into the state.
    submit.click(transcribe, inputs=[audio, language, history, link], outputs=[chatbot, history])
    
# Start the app; this runs whenever the module is executed or imported.
demo.launch()