File size: 4,756 Bytes
440158f
 
61da54a
440158f
 
61da54a
440158f
61da54a
440158f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9da9992
 
 
 
 
 
440158f
 
 
 
 
 
 
cc38f71
9bbc2de
d84be0f
cc38f71
d84be0f
9bbc2de
440158f
 
 
 
 
 
 
 
 
 
 
d84be0f
cc38f71
d84be0f
440158f
 
 
cc38f71
440158f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from deep_translator import GoogleTranslator
from transformers import pipeline
import gradio as gr
import time
from pytube import YouTube

# Speech-to-text pipeline, created once when the module is imported.
# transcribe() calls it as pipe(audio)["text"] to obtain the transcription.
pipe = pipeline(model="tlord/whisper")  # change to "your-username/the-name-you-picked"

# def transcribe(audio, state = ""):
    # time.sleep(2)
    # text = pipe(audio)["text"]
    # state += text + " "
    # return state, state

# iface = gr.Interface(
#     title="Whisper Small Swedish",
#     description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
#     fn=transcribe, 
#     inputs=[gr.Audio(source="microphone", type="filepath", streaming=True), "state"], 
#     outputs=["text", "state"],
#     live=True
# )

# chatbot = gr.Chatbot().style(color_map=("green", "gray"))
# iface = gr.Interface(
#     title="Whisper Sentiment Analysis in Swedish",
#     description="Say something and the Oracle will respond depending on your mood.",
#     fn=transcribe,
#     inputs=[gr.Audio(source="microphone", type="filepath"), "state"],
#     outputs=[chatbot, "state"],
#     allow_flagging="never",
# )

# Maps the language name shown in the UI dropdown to the language code that is
# handed to GoogleTranslator as the translation target.
#
# Fixes relative to the previous table:
#   * 'hebrew' was listed twice ('iw' and 'he'); a dict literal silently keeps
#     only the last duplicate, so the first entry was dead. A single entry is
#     kept, using 'iw' (the legacy Google code; 'he' is not accepted by every
#     deep_translator release -- TODO confirm against the pinned version).
#   * The Chinese codes were truncated to 'zh-c' / 'zh-t', which are not valid
#     Google language codes; they are restored to 'zh-CN' / 'zh-TW'.
LANGUAGES = {
    'afrikaans': 'af',
    'albanian': 'sq',
    'amharic': 'am',
    'arabic': 'ar',
    'armenian': 'hy',
    'azerbaijani': 'az',
    'basque': 'eu',
    'belarusian': 'be',
    'bengali': 'bn',
    'bosnian': 'bs',
    'bulgarian': 'bg',
    'catalan': 'ca',
    'cebuano': 'ceb',
    'chichewa': 'ny',
    'chinese (simplified)': 'zh-CN',
    'chinese (traditional)': 'zh-TW',
    'corsican': 'co',
    'croatian': 'hr',
    'czech': 'cs',
    'danish': 'da',
    'dutch': 'nl',
    'english': 'en',
    'esperanto': 'eo',
    'estonian': 'et',
    'filipino': 'tl',
    'finnish': 'fi',
    'french': 'fr',
    'frisian': 'fy',
    'galician': 'gl',
    'georgian': 'ka',
    'german': 'de',
    'greek': 'el',
    'gujarati': 'gu',
    'haitian creole': 'ht',
    'hausa': 'ha',
    'hawaiian': 'haw',
    'hebrew': 'iw',
    'hindi': 'hi',
    'hmong': 'hmn',
    'hungarian': 'hu',
    'icelandic': 'is',
    'igbo': 'ig',
    'indonesian': 'id',
    'irish': 'ga',
    'italian': 'it',
    'japanese': 'ja',
    'javanese': 'jw',
    'kannada': 'kn',
    'kazakh': 'kk',
    'khmer': 'km',
    'korean': 'ko',
    'kurdish (kurmanji)': 'ku',
    'kyrgyz': 'ky',
    'lao': 'lo',
    'latin': 'la',
    'latvian': 'lv',
    'lithuanian': 'lt',
    'luxembourgish': 'lb',
    'macedonian': 'mk',
    'malagasy': 'mg',
    'malay': 'ms',
    'malayalam': 'ml',
    'maltese': 'mt',
    'maori': 'mi',
    'marathi': 'mr',
    'mongolian': 'mn',
    'myanmar (burmese)': 'my',
    'nepali': 'ne',
    'norwegian': 'no',
    'odia': 'or',
    'pashto': 'ps',
    'persian': 'fa',
    'polish': 'pl',
    'portuguese': 'pt',
    'punjabi': 'pa',
    'romanian': 'ro',
    'russian': 'ru',
    'samoan': 'sm',
    'scots gaelic': 'gd',
    'serbian': 'sr',
    'sesotho': 'st',
    'shona': 'sn',
    'sindhi': 'sd',
    'sinhala': 'si',
    'slovak': 'sk',
    'slovenian': 'sl',
    'somali': 'so',
    'spanish': 'es',
    'sundanese': 'su',
    'swahili': 'sw',
    'tajik': 'tg',
    'tamil': 'ta',
    'telugu': 'te',
    'thai': 'th',
    'turkish': 'tr',
    'ukrainian': 'uk',
    'urdu': 'ur',
    'uyghur': 'ug',
    'uzbek': 'uz',
    'vietnamese': 'vi',
    'welsh': 'cy',
    'xhosa': 'xh',
    'yiddish': 'yi',
    'yoruba': 'yo',
    'zulu': 'zu',
}

def get_soundfile(link):
    """Download the first audio-only stream of a YouTube video.

    Args:
        link: URL of the YouTube video.

    Returns:
        The value returned by the stream's ``download`` call (presumably the
        path of the saved file -- confirm against the pytube docs). The file
        is always written under the fixed name ``tmp.mp4``.

    Raises:
        IndexError: if the video exposes no audio-only stream.
    """
    audio_streams = YouTube(link).streams.filter(only_audio=True)
    first_stream = audio_streams[0]
    return first_stream.download(filename="tmp.mp4")

def translate(message, lang, source='sv'):
    """Translate *message* into the language identified by *lang*.

    Args:
        message: Text to translate.
        lang: Target language code passed to GoogleTranslator.
        source: Source language code. Defaults to ``'sv'`` (Swedish), which
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The translated text, or the literal string ``"Error, sorry!"`` when
        the translator returns ``None`` or an empty string.
    """
    res = GoogleTranslator(source=source, target=lang).translate(message)
    # A None or empty result both mean there is no usable translation.
    return res if res else "Error, sorry!"

def transcribe(audio, lang, history, link):
    """Transcribe an audio clip and append the result to the chat history.

    Args:
        audio: Path of the recorded audio file, or ``None`` when nothing was
            recorded.
        lang: Key of ``LANGUAGES`` naming the translation target. ``None`` or
            an empty string falls back to ``'english'``.
        history: List of ``(transcription, translation)`` tuples collected so
            far; ``None`` or an empty value starts a new list.
        link: Optional YouTube URL. When non-blank, its audio is downloaded
            and used instead of *audio*.

    Returns:
        ``(history, history)`` -- the same list twice, once for the chatbot
        display and once for the session state.

    Raises:
        KeyError: if *lang* is not a key of ``LANGUAGES``.
    """
    history = history or []

    # link may arrive as None or as whitespace only; neither is a real URL,
    # so neither should be handed to the downloader.
    link = (link or "").strip()
    if link:
        audio = get_soundfile(link)

    if not audio:
        # No recording and no link: there is nothing to transcribe, so leave
        # the conversation untouched instead of failing inside the pipeline.
        return history, history

    if not lang:
        lang = 'english'
    lang_code = LANGUAGES[lang]

    text = pipe(audio)["text"]
    history.append((text, translate(text, lang_code)))

    return history, history

# Build the Gradio UI: inputs in the left column, the chat display in the
# right column.
with gr.Blocks() as demo:
    # State holding the list of (transcription, translation) pairs.
    history = gr.State([])
    with gr.Row():
        with gr.Column():
            # Target language for the translation; choices are the LANGUAGES keys.
            language = gr.Dropdown(list(LANGUAGES.keys()), value="english")
            # Microphone recording, passed to transcribe() as a file path.
            audio = gr.Audio(source="microphone", type="filepath")
            # Optional YouTube URL; when non-empty, transcribe() uses it instead
            # of the recording.
            link = gr.Textbox(label = "Put YouTube link here", value="")
            submit = gr.Button(value="Translate")
        with gr.Column():
            chatbot = gr.Chatbot().style(color_map=("green", "gray"))
    # transcribe() returns (history, history): the first value updates the
    # chatbot display, the second is stored back into the state.
    submit.click(transcribe, inputs=[audio, language, history, link], outputs=[chatbot, history])
    
# Start the Gradio app.
demo.launch()