Spaces:

Keyven
/

Multimodal-Vision-Insight

Runtime error

App Files Community

Keyven committed on Oct 2, 2023

Commit

0ddcdf3

1 Parent(s): e70cad0

updating UI

Browse files

Files changed (1) hide show

app.py +21 -30

app.py CHANGED Viewed

@@ -5,12 +5,6 @@ import re
 import copy
 import secrets
 from pathlib import Path
-import os
-os.system("pip install git+https://github.com/openai/whisper.git")
-import whisper
-model_whisper = whisper.load_model("small")
 # Constants
 BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
@@ -52,15 +46,6 @@ def format_text(text):
     text = "".join(lines)
     return text
-def transcribe_audio(audio):
-    audio = whisper.load_audio(audio)
-    audio = whisper.pad_or_trim(audio)
-    mel = whisper.log_mel_spectrogram(audio).to(model_whisper.device)
-    _, probs = model_whisper.detect_language(mel)
-    options = whisper.DecodingOptions(fp16 = False)
-    result = whisper.decode(model_whisper, mel, options)
-    return result.text
 def get_chat_response(chatbot, task_history):
     global model, tokenizer
@@ -148,28 +133,36 @@ def handle_regeneration(chatbot, task_history):
 with gr.Blocks(theme='gradio/soft') as demo:
-    audio = gr.Audio(
-        label="Input Audio",
-        show_label=False,
-        source="microphone",
-        type="filepath"
-    )
     gr.Markdown("# Qwen-VL Multimodal-Vision-Insight")
     gr.Markdown(
         "## Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven))\n"
         "Special thanks to [@Artificialguybr](https://twitter.com/artificialguybr) for the inspiration from his code.\n"
         "### Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud\n"
     )
-    chatbot = gr.Chatbot(label='Qwen-VL-Chat', elem_classes="control-height", height=520)
-    query = gr.Textbox(lines=2, label='Input')
     task_history = gr.State([])
     with gr.Row():
-            upload_btn = gr.UploadButton("📁 Upload", file_types=["image"], elem_classes="control-width")
-            submit_btn = gr.Button("🚀 Submit", elem_classes="control-width", variant="primary")
-            regen_btn = gr.Button("🤔️ Regenerate", elem_classes="control-width")
-            clear_btn = gr.Button("🧹 Clear History", elem_classes="control-width", variant="secondary")
     gr.Markdown("### Key Features:\n- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.\n- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.\n- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.")
     submit_btn.click(handle_text_input, [chatbot, task_history, query], [chatbot, task_history]).then(
         get_chat_response, [chatbot, task_history], [chatbot], show_progress=True
@@ -179,8 +172,6 @@ with gr.Blocks(theme='gradio/soft') as demo:
     clear_btn.click(clear_history, [task_history], [chatbot], show_progress=True)
     regen_btn.click(handle_regeneration, [chatbot, task_history], [chatbot], show_progress=True)
     upload_btn.upload(handle_file_upload, [chatbot, task_history, upload_btn], [chatbot, task_history], show_progress=True)
-    audio.change(transcribe_audio, inputs=[audio], outputs=[query])
 demo.launch()

 import copy
 import secrets
 from pathlib import Path
 # Constants
 BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>"
     text = "".join(lines)
     return text
 def get_chat_response(chatbot, task_history):
     global model, tokenizer
 with gr.Blocks(theme='gradio/soft') as demo:
     gr.Markdown("# Qwen-VL Multimodal-Vision-Insight")
     gr.Markdown(
         "## Developed by Keyvan Hardani (Keyvven on [Twitter](https://twitter.com/Keyvven))\n"
         "Special thanks to [@Artificialguybr](https://twitter.com/artificialguybr) for the inspiration from his code.\n"
         "### Qwen-VL: A Multimodal Large Vision Language Model by Alibaba Cloud\n"
     )
+    chatbot = gr.Chatbot([("Hello", "Hi"), ("Describe the image", "I can describe images. Please upload one.")], label='Qwen-VL-Chat', elem_classes="control-height", height=520)
+    gr.Markdown(
+    "### Chat with Qwen-VL\n"
+    "You can ask questions or make statements in the chat input below. "
+    "You can also upload an image and ask questions about it like "
+    "'Describe this image', 'What can you see in this image?', or "
+    "'Explain what's happening in this image'."
+    )
+    query = gr.Textbox(
+        lines=2,
+        label='Chat Input',
+        placeholder='Type your question or statement here, or upload an image and ask about it...',
+        hint='E.g., "Describe this image" or "What is the capital of France?"'
+    )
     task_history = gr.State([])
     with gr.Row():
+        upload_btn = gr.File("🖼️ Upload", file_types=["image"], elem_classes="control-width", label='Upload File')
+        submit_btn = gr.Button("🚀 Submit", elem_classes="control-width", variant="primary")
+        regen_btn = gr.Button("🔄 Regenerate", elem_classes="control-width")
+        clear_btn = gr.Button("🧹 Clear History", elem_classes="control-width", variant="secondary")
     gr.Markdown("### Key Features:\n- **Strong Performance**: Surpasses existing LVLMs on multiple English benchmarks including Zero-shot Captioning and VQA.\n- **Multi-lingual Support**: Supports English, Chinese, and multi-lingual conversation.\n- **High Resolution**: Utilizes 448*448 resolution for fine-grained recognition and understanding.")
     submit_btn.click(handle_text_input, [chatbot, task_history, query], [chatbot, task_history]).then(
         get_chat_response, [chatbot, task_history], [chatbot], show_progress=True
     clear_btn.click(clear_history, [task_history], [chatbot], show_progress=True)
     regen_btn.click(handle_regeneration, [chatbot, task_history], [chatbot], show_progress=True)
     upload_btn.upload(handle_file_upload, [chatbot, task_history, upload_btn], [chatbot, task_history], show_progress=True)
 demo.launch()