Implement OpenAI Whisper (STT) functionality
Browse files
app.py
CHANGED
|
@@ -9,8 +9,8 @@ import gradio as gr
|
|
| 9 |
import requests
|
| 10 |
|
| 11 |
# UNCOMMENT TO USE WHISPER
|
| 12 |
-
|
| 13 |
-
|
| 14 |
|
| 15 |
from langchain import ConversationChain, LLMChain
|
| 16 |
|
|
@@ -59,29 +59,29 @@ POLLY_VOICE_DATA = PollyVoiceData()
|
|
| 59 |
|
| 60 |
|
| 61 |
# UNCOMMENT TO USE WHISPER
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
|
| 66 |
|
| 67 |
# UNCOMMENT TO USE WHISPER
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
#
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
|
| 86 |
|
| 87 |
# Pertains to Express-inator functionality
|
|
@@ -470,14 +470,15 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
|
|
| 470 |
submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
|
| 471 |
|
| 472 |
# UNCOMMENT TO USE WHISPER
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
|
| 478 |
gr.Examples(
|
| 479 |
examples=["How many people live in Canada?",
|
| 480 |
"What is 2 to the 30th power?",
|
|
|
|
| 481 |
"How much did it rain in SF today?",
|
| 482 |
"Get me information about the movie 'Avatar'",
|
| 483 |
"What are the top tech headlines in the US?",
|
|
@@ -514,7 +515,7 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
|
|
| 514 |
TRANSLATE_TO_DEFAULT, "Arabic", "Arabic (Gulf)", "Catalan", "Chinese (Cantonese)", "Chinese (Mandarin)",
|
| 515 |
"Danish", "Dutch", "English (Australian)", "English (British)", "English (Indian)", "English (New Zealand)",
|
| 516 |
"English (South African)", "English (US)", "English (Welsh)", "Finnish", "French", "French (Canadian)",
|
| 517 |
-
"German", "German (Austrian)", "Hindi", "Icelandic", "Indonesian", "Italian", "Japanese", "Korean", "Norwegian", "Polish",
|
| 518 |
"Portuguese (Brazilian)", "Portuguese (European)", "Romanian", "Russian", "Spanish (European)",
|
| 519 |
"Spanish (Mexican)", "Spanish (US)", "Swedish", "Turkish", "Ukrainian", "Welsh",
|
| 520 |
"emojis", "Gen Z slang", "how the stereotypical Karen would say it", "Klingon",
|
|
|
|
| 9 |
import requests
|
| 10 |
|
| 11 |
# UNCOMMENT TO USE WHISPER
|
| 12 |
+
import warnings
|
| 13 |
+
import whisper
|
| 14 |
|
| 15 |
from langchain import ConversationChain, LLMChain
|
| 16 |
|
|
|
|
| 59 |
|
| 60 |
|
| 61 |
# UNCOMMENT TO USE WHISPER
|
| 62 |
+
# Silence every Python warning for the whole process: no category or module
# filter is given, so this is not limited to Whisper.
# NOTE(review): presumably added to hide noisy Whisper/torch warnings;
# confirm, and consider narrowing the filter.
warnings.filterwarnings("ignore")
|
| 63 |
+
# Load the "tiny" Whisper checkpoint once at import time; transcribe() reuses
# this module-level instance for every request.
WHISPER_MODEL = whisper.load_model("tiny")
|
| 64 |
+
# Debug output: prints the loaded model object at startup.
print("WHISPER_MODEL", WHISPER_MODEL)
|
| 65 |
|
| 66 |
|
| 67 |
# UNCOMMENT TO USE WHISPER
|
| 68 |
+
def transcribe(aud_inp):
    """Transcribe a recorded audio file to text with the shared Whisper model.

    Args:
        aud_inp: Path to the audio file to transcribe, or ``None`` / an
            empty string when there is no recording.

    Returns:
        The decoded text, or ``""`` when there is no input or Whisper
        produced no text.
    """
    # Nothing recorded (or the recording was cleared): nothing to decode.
    if not aud_inp:
        return ""

    # Whisper decodes a fixed-size window, so the clip is padded or trimmed
    # to that length (30 seconds per the Whisper README) before the
    # log-Mel spectrogram is computed on the model's device.
    audio = whisper.pad_or_trim(whisper.load_audio(aud_inp))
    mel = whisper.log_mel_spectrogram(audio).to(WHISPER_MODEL.device)

    # With no language set in DecodingOptions, decode() detects the spoken
    # language itself, so a separate detect_language() call is redundant.
    result = whisper.decode(WHISPER_MODEL, mel, whisper.DecodingOptions())

    # Guard before touching result.text so the debug print cannot fail on a
    # missing result.
    result_text = result.text if result is not None and result.text else ""
    print("result.text", result_text)
    return result_text
|
| 85 |
|
| 86 |
|
| 87 |
# Pertains to Express-inator functionality
|
|
|
|
| 470 |
submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
|
| 471 |
|
| 472 |
# UNCOMMENT TO USE WHISPER
|
| 473 |
+
with gr.Row():
|
| 474 |
+
audio_comp = gr.Microphone(source="microphone", type="filepath", label="Just say it!",
|
| 475 |
+
interactive=True, streaming=False)
|
| 476 |
+
audio_comp.change(transcribe, inputs=[audio_comp], outputs=[message])
|
| 477 |
|
| 478 |
gr.Examples(
|
| 479 |
examples=["How many people live in Canada?",
|
| 480 |
"What is 2 to the 30th power?",
|
| 481 |
+
"If x+y=10 and x-y=4, what are x and y?",
|
| 482 |
"How much did it rain in SF today?",
|
| 483 |
"Get me information about the movie 'Avatar'",
|
| 484 |
"What are the top tech headlines in the US?",
|
|
|
|
| 515 |
TRANSLATE_TO_DEFAULT, "Arabic", "Arabic (Gulf)", "Catalan", "Chinese (Cantonese)", "Chinese (Mandarin)",
|
| 516 |
"Danish", "Dutch", "English (Australian)", "English (British)", "English (Indian)", "English (New Zealand)",
|
| 517 |
"English (South African)", "English (US)", "English (Welsh)", "Finnish", "French", "French (Canadian)",
|
| 518 |
+
"German", "German (Austrian)", "Georgian", "Hindi", "Icelandic", "Indonesian", "Italian", "Japanese", "Korean", "Norwegian", "Polish",
|
| 519 |
"Portuguese (Brazilian)", "Portuguese (European)", "Romanian", "Russian", "Spanish (European)",
|
| 520 |
"Spanish (Mexican)", "Spanish (US)", "Swedish", "Turkish", "Ukrainian", "Welsh",
|
| 521 |
"emojis", "Gen Z slang", "how the stereotypical Karen would say it", "Klingon",
|