JLW committed on
Commit
70c5171
·
1 Parent(s): 27f7f87

Implement OpenAI Whisper (STT) functionality

Browse files
Files changed (1) hide show
  1. app.py +28 -27
app.py CHANGED
@@ -9,8 +9,8 @@ import gradio as gr
9
  import requests
10
 
11
  # UNCOMMENT TO USE WHISPER
12
- # import warnings
13
- # import whisper
14
 
15
  from langchain import ConversationChain, LLMChain
16
 
@@ -59,29 +59,29 @@ POLLY_VOICE_DATA = PollyVoiceData()
59
 
60
 
61
  # UNCOMMENT TO USE WHISPER
62
- # warnings.filterwarnings("ignore")
63
- # WHISPER_MODEL = whisper.load_model("tiny")
64
- # print("WHISPER_MODEL", WHISPER_MODEL)
65
 
66
 
67
  # UNCOMMENT TO USE WHISPER
68
- # def transcribe(aud_inp):
69
- # if aud_inp is None:
70
- # return ""
71
- # aud = whisper.load_audio(aud_inp)
72
- # aud = whisper.pad_or_trim(aud)
73
- # mel = whisper.log_mel_spectrogram(aud).to(WHISPER_MODEL.device)
74
- # _, probs = WHISPER_MODEL.detect_language(mel)
75
- #
76
- # options = whisper.DecodingOptions()
77
- # # options = whisper.DecodingOptions(language="ja")
78
- #
79
- # result = whisper.decode(WHISPER_MODEL, mel, options)
80
- # print("result.text", result.text)
81
- # result_text = ""
82
- # if result and result.text:
83
- # result_text = result.text
84
- # return result_text
85
 
86
 
87
  # Pertains to Express-inator functionality
@@ -470,14 +470,15 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
470
  submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
471
 
472
  # UNCOMMENT TO USE WHISPER
473
- # with gr.Row():
474
- # audio_comp = gr.Microphone(source="microphone", type="filepath", label="Just say it!",
475
- # interactive=True, streaming=False)
476
- # audio_comp.change(transcribe, inputs=[audio_comp], outputs=[message])
477
 
478
  gr.Examples(
479
  examples=["How many people live in Canada?",
480
  "What is 2 to the 30th power?",
 
481
  "How much did it rain in SF today?",
482
  "Get me information about the movie 'Avatar'",
483
  "What are the top tech headlines in the US?",
@@ -514,7 +515,7 @@ with gr.Blocks(css=".gradio-container {background-color: lightgray}") as block:
514
  TRANSLATE_TO_DEFAULT, "Arabic", "Arabic (Gulf)", "Catalan", "Chinese (Cantonese)", "Chinese (Mandarin)",
515
  "Danish", "Dutch", "English (Australian)", "English (British)", "English (Indian)", "English (New Zealand)",
516
  "English (South African)", "English (US)", "English (Welsh)", "Finnish", "French", "French (Canadian)",
517
- "German", "German (Austrian)", "Hindi", "Icelandic", "Indonesian", "Italian", "Japanese", "Korean", "Norwegian", "Polish",
518
  "Portuguese (Brazilian)", "Portuguese (European)", "Romanian", "Russian", "Spanish (European)",
519
  "Spanish (Mexican)", "Spanish (US)", "Swedish", "Turkish", "Ukrainian", "Welsh",
520
  "emojis", "Gen Z slang", "how the stereotypical Karen would say it", "Klingon",
 
9
  import requests
10
 
11
  # UNCOMMENT TO USE WHISPER
12
+ import warnings
13
+ import whisper
14
 
15
  from langchain import ConversationChain, LLMChain
16
 
 
59
 
60
 
61
  # UNCOMMENT TO USE WHISPER
62
+ warnings.filterwarnings("ignore")
63
+ WHISPER_MODEL = whisper.load_model("tiny")
64
+ print("WHISPER_MODEL", WHISPER_MODEL)
65
 
66
 
67
  # UNCOMMENT TO USE WHISPER
68
def transcribe(aud_inp):
    """Transcribe a recorded audio file to text with the shared Whisper model.

    Args:
        aud_inp: Path of the audio file to transcribe, or None when no
            recording is available.

    Returns:
        The decoded text, or "" when there is no input or nothing was decoded.
    """
    if aud_inp is None:
        return ""

    # Load the audio, pad/trim it to the length Whisper expects, then build
    # the log-Mel spectrogram on the same device as the model.
    aud = whisper.pad_or_trim(whisper.load_audio(aud_inp))
    mel = whisper.log_mel_spectrogram(aud).to(WHISPER_MODEL.device)

    # The previous explicit WHISPER_MODEL.detect_language(mel) call was
    # dropped: its result was never used, and decode() detects the language
    # itself when DecodingOptions.language is left unset. To force a language,
    # pass it explicitly, e.g. whisper.DecodingOptions(language="ja").
    # NOTE(review): DecodingOptions defaults to fp16=True; confirm this is
    # acceptable on CPU-only hosts (otherwise pass fp16=False).
    options = whisper.DecodingOptions()
    result = whisper.decode(WHISPER_MODEL, mel, options)

    # Guard before touching result.text so a missing/empty result yields "".
    result_text = result.text if result and result.text else ""
    print("result.text", result_text)
    return result_text
85
 
86
 
87
  # Pertains to Express-inator functionality
 
470
  submit = gr.Button(value="Send", variant="secondary").style(full_width=False)
471
 
472
  # UNCOMMENT TO USE WHISPER
473
+ with gr.Row():
474
+ audio_comp = gr.Microphone(source="microphone", type="filepath", label="Just say it!",
475
+ interactive=True, streaming=False)
476
+ audio_comp.change(transcribe, inputs=[audio_comp], outputs=[message])
477
 
478
  gr.Examples(
479
  examples=["How many people live in Canada?",
480
  "What is 2 to the 30th power?",
481
+ "If x+y=10 and x-y=4, what are x and y?",
482
  "How much did it rain in SF today?",
483
  "Get me information about the movie 'Avatar'",
484
  "What are the top tech headlines in the US?",
 
515
  TRANSLATE_TO_DEFAULT, "Arabic", "Arabic (Gulf)", "Catalan", "Chinese (Cantonese)", "Chinese (Mandarin)",
516
  "Danish", "Dutch", "English (Australian)", "English (British)", "English (Indian)", "English (New Zealand)",
517
  "English (South African)", "English (US)", "English (Welsh)", "Finnish", "French", "French (Canadian)",
518
+ "German", "German (Austrian)", "Georgian", "Hindi", "Icelandic", "Indonesian", "Italian", "Japanese", "Korean", "Norwegian", "Polish",
519
  "Portuguese (Brazilian)", "Portuguese (European)", "Romanian", "Russian", "Spanish (European)",
520
  "Spanish (Mexican)", "Spanish (US)", "Swedish", "Turkish", "Ukrainian", "Welsh",
521
  "emojis", "Gen Z slang", "how the stereotypical Karen would say it", "Klingon",