Spaces:

saa231
/

MutimodalVisionAssistant

Paused

App Files Community

saa231 committed on Apr 27, 2025

Commit

070fc32

verified ·

1 Parent(s): 334328d

Update app.py

Browse files

merged audio inputs

Files changed (1) hide show

app.py +8 -12

app.py CHANGED Viewed

@@ -4,10 +4,8 @@ import os
 from project_model import process_inputs, session
-# --- Handle Initial and Follow-up ---
-def handle_question(image, audio, followup_audio=None):
-    # Initial check for image and audio for first question
-    if image and audio:
         message, answer_audio = process_inputs(session, image=image, audio_path=audio)
         # Save images
@@ -31,11 +29,10 @@ def handle_question(image, audio, followup_audio=None):
             #f"Your browser does not support the audio element.\n"
             #f"</audio>"
         )
-    # Handle follow-up if no initial image/audio but only follow-up audio
-    elif followup_audio:
-        message, answer_audio = process_inputs(session, audio_path=followup_audio)
-        # Save original and annotated images again
         original_path = "uploaded_image.png"
         annotated_path = "annotated_image.png"
@@ -53,7 +50,7 @@ def handle_question(image, audio, followup_audio=None):
             #f"![Annotated Image](file/{annotated_path})"
         )
     else:
-        return "Please upload both an image and an audio clip for the initial question or record a follow-up question.", None
     return markdown_reply, answer_audio
@@ -65,8 +62,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             image_input = gr.Image(label="Upload or Capture Image", sources=["upload", "webcam"], type="pil")
-            audio_input = gr.Audio(label="Initial Question (Voice)", sources=["microphone"], type="filepath")
-            followup_audio_input = gr.Audio(label="Follow-up Question (Voice)", sources=["microphone"], type="filepath")
             submit_btn = gr.Button("Submit Question")
@@ -77,7 +73,7 @@ with gr.Blocks() as demo:
     # Single button now for both initial and follow-up inputs
     submit_btn.click(
         fn=handle_question,
-        inputs=[image_input, audio_input, followup_audio_input],
         outputs=[status_output, audio_output]
     )

 from project_model import process_inputs, session
+def handle_question(image, audio):
+    if image and audio:  # Handle the initial question with image and audio
         message, answer_audio = process_inputs(session, image=image, audio_path=audio)
         # Save images
             #f"Your browser does not support the audio element.\n"
             #f"</audio>"
         )
+    elif audio:  # Handle follow-up with just audio
+        message, answer_audio = process_inputs(session, audio_path=audio)
+        # Save the current image if it exists
         original_path = "uploaded_image.png"
         annotated_path = "annotated_image.png"
             #f"![Annotated Image](file/{annotated_path})"
         )
     else:
+        return "Please upload an image and/or record an audio clip.", None
     return markdown_reply, answer_audio
     with gr.Row():
         with gr.Column():
             image_input = gr.Image(label="Upload or Capture Image", sources=["upload", "webcam"], type="pil")
+            audio_input = gr.Audio(label="Ask a Question (Voice)", sources=["microphone"], type="filepath")
             submit_btn = gr.Button("Submit Question")
     # Single button now for both initial and follow-up inputs
     submit_btn.click(
         fn=handle_question,
+        inputs=[image_input, audio_input],
         outputs=[status_output, audio_output]
     )