Spaces:

Keyven
/

Multimodal-Vision-Insight

Runtime error

Keyven committed on Oct 2, 2023

Commit

2449b43

1 Parent(s): 2d256fb

whisper integration

Files changed (1) hide show

app.py CHANGED Viewed

@@ -5,6 +5,12 @@ import re
 import copy
 import secrets
 from pathlib import Path
 # Constants
 BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
@@ -46,6 +52,15 @@ def format_text(text):
     text = "".join(lines)
     return text
 def get_chat_response(chatbot, task_history):
     global model, tokenizer
@@ -133,6 +148,12 @@ def handle_regeneration(chatbot, task_history):
 with gr.Blocks(theme='gradio/soft') as demo:
     gr.Markdown("# Qwen-VL Multimodal-Vision-Insight")
     gr.Markdown(
         "## Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven))\n"
@@ -162,6 +183,7 @@ with gr.Blocks(theme='gradio/soft') as demo:
     clear_btn.click(clear_history, [task_history], [chatbot], show_progress=True)
     regen_btn.click(handle_regeneration, [chatbot, task_history], [chatbot], show_progress=True)
     upload_btn.upload(handle_file_upload, [chatbot, task_history, upload_btn], [chatbot, task_history], show_progress=True)
 demo.launch()

 import copy
 import secrets
 from pathlib import Path
+import os
+os.system("pip install git+https://github.com/openai/whisper.git")
+import whisper
+model_whisper = whisper.load_model("small")
 # Constants
 BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
     text = "".join(lines)
     return text
+def transcribe_audio(audio):
+    audio = whisper.load_audio(audio)
+    audio = whisper.pad_or_trim(audio)
+    mel = whisper.log_mel_spectrogram(audio).to(model_whisper.device)
+    _, probs = model_whisper.detect_language(mel)
+    options = whisper.DecodingOptions(fp16 = False)
+    result = whisper.decode(model_whisper, mel, options)
+    return result.text
 def get_chat_response(chatbot, task_history):
     global model, tokenizer
 with gr.Blocks(theme='gradio/soft') as demo:
+    audio = gr.Audio(
+        label="Input Audio",
+        show_label=False,
+        source="microphone",
+        type="filepath"
+    )
     gr.Markdown("# Qwen-VL Multimodal-Vision-Insight")
     gr.Markdown(
         "## Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven))\n"
     clear_btn.click(clear_history, [task_history], [chatbot], show_progress=True)
     regen_btn.click(handle_regeneration, [chatbot, task_history], [chatbot], show_progress=True)
     upload_btn.upload(handle_file_upload, [chatbot, task_history, upload_btn], [chatbot, task_history], show_progress=True)
+    audio.change(transcribe_audio, inputs=[audio], outputs=[query])  # Gradio's listener is .change(); .on_change() does not exist
 demo.launch()