saa231 committed
Commit 7fa1011 · verified · 1 Parent(s): cb2c3a0

Update app.py

Files changed (1)
  1. app.py +28 -65
app.py CHANGED
@@ -1,78 +1,41 @@
-import gradio as gr
 from PIL import Image
-from project_model import process_inputs, VisualQAState
 
-# Create a session object to manage conversation state per image
-session = VisualQAState()
 
-# Global variables to keep track of the current image and the conversation history
-current_image = None
-chat_history = []
 
-# Unified handler for new questions or new images
-def handle_inputs(new_image, audio, followup_text, tts_enabled):
-    global current_image, chat_history
-
-    # If a new image is uploaded, reset current session state
-    if new_image is not None:
-        current_image = new_image
-        chat_history.clear()  # Clear previous Q&A history
-        question = followup_text.strip() if followup_text else "Describe the image"
-        # Process the new image and question/audio
-        response, audio_output = process_inputs(
-            session, image=current_image, question=question, audio_path=audio, enable_tts=tts_enabled
-        )
-    elif current_image is not None and (followup_text or audio):
-        # Follow-up question for current image
-        question = followup_text.strip() if followup_text else ""
-        response, audio_output = process_inputs(
-            session, image=None, question=question, audio_path=audio, enable_tts=tts_enabled
-        )
-    else:
-        # No input given
-        return "Please upload an image and ask a question.", None, None, ""
-
-    # Append the Q&A to chat history
-    chat_history.append(f"🗨️ **Q:** {question}\n🧠 **A:** {response}")
-    history_text = "\n\n".join(chat_history)  # Format chat as markdown
-
-    return response, current_image, audio_output if tts_enabled else None, history_text
-
-# Build the Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output + History")
 
     with gr.Row():
         with gr.Column():
-            # User inputs
-            image_input = gr.Image(
-                label="Upload or Capture New Image", sources=["upload", "webcam"], type="pil"
-            )
-            audio_input = gr.Audio(
-                label="Record Voice (Optional)", sources=["microphone"], type="filepath"
-            )
-            followup_text = gr.Textbox(
-                label="Type a Question or Follow-up", placeholder="e.g., What’s going on?"
-            )
-            tts_toggle = gr.Checkbox(
-                label="Enable Audio Response", value=True
-            )
-            submit_btn = gr.Button("Ask")
-
         with gr.Column():
-            # Outputs: Answer, image shown back, audio (if enabled), and chat history
-            status_output = gr.Textbox(label="Answer", interactive=False)
-            image_display = gr.Image(label="Current Image")
             audio_output = gr.Audio(label="Answer Audio", interactive=False)
-            chat_box = gr.Markdown(label="Chat History")
 
-    # Link the submit button to the handler function
-    submit_btn.click(
-        fn=handle_inputs,
-        inputs=[image_input, audio_input, followup_text, tts_toggle],
-        outputs=[status_output, image_display, audio_output, chat_box]
-    )
 
-# Launch the app
 if __name__ == "__main__":
-    demo.launch(show_error=True, share=True)
+import gradio as gr  # still needed below for gr.Blocks and the widgets
 from PIL import Image
+from project_model import process_inputs, session
 
+def handle_initial(image, audio):
+    if image is None or audio is None:
+        return "Please upload both an image and an audio clip.", None, None
+    message, answer_audio = process_inputs(session, image=image, audio_path=audio)
+    return message, image, answer_audio
 
+def handle_followup(followup_audio):
+    if followup_audio is None:
+        return "Please record a follow-up question.", None, None
+    message, answer_audio = process_inputs(session, audio_path=followup_audio)
+    return message, session.current_image, answer_audio  # reuse the session's image
 
 with gr.Blocks() as demo:
+    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output")
 
     with gr.Row():
         with gr.Column():
+            image_input = gr.Image(label="Upload or Capture Image", sources=["upload", "webcam"], type="pil")
+            audio_input = gr.Audio(label="Initial Question (Voice)", sources=["microphone"], type="filepath")
+            submit_btn = gr.Button("Submit Initial Q&A")
+
+            gr.Markdown("### 🎤 Ask a Follow-up Question")
+            followup_audio_input = gr.Audio(label="Follow-up Question", sources=["microphone"], type="filepath")
+            followup_btn = gr.Button("Ask Follow-up")
+
         with gr.Column():
+            status_output = gr.Textbox(label="Response", interactive=False)
+            image_display = gr.Image(label="Context Image")
             audio_output = gr.Audio(label="Answer Audio", interactive=False)
 
+    submit_btn.click(fn=handle_initial, inputs=[image_input, audio_input],
+                     outputs=[status_output, image_display, audio_output])
+
+    followup_btn.click(fn=handle_followup, inputs=followup_audio_input,
+                       outputs=[status_output, image_display, audio_output])
 
 if __name__ == "__main__":
+    demo.launch(show_error=True, share=True)
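
Note: this commit only touches app.py; `process_inputs` and `session` come from project_model, which is not part of the diff. Below is a minimal stub of that interface, inferred from how app.py calls it: `process_inputs(session, image=..., audio_path=...)` returning a `(message, answer_audio)` pair, and a module-level `session` exposing `current_image`. The `VisualQAState` name is taken from the previous revision; the underscore-prefixed helpers are hypothetical placeholders standing in for the real ASR, VQA, and TTS components, not the actual implementation.

# project_model.py: a minimal stand-in for the module app.py imports.
# Only the interface is taken from app.py; the helpers below are
# hypothetical placeholders for the real ASR, VQA, and TTS pipeline.

class VisualQAState:
    """Per-session context: the image under discussion and the Q&A history."""
    def __init__(self):
        self.current_image = None
        self.history = []

def _transcribe(audio_path):
    # Placeholder: a real implementation would run speech-to-text here.
    return f"(question transcribed from {audio_path})"

def _answer(image, question):
    # Placeholder: a real implementation would query a VQA model here.
    return f"(answer about the current image for: {question})"

def _synthesize(text):
    # Placeholder: a real implementation would return a path to a TTS audio file.
    return None

def process_inputs(session, image=None, audio_path=None):
    # A new image starts a fresh conversation; otherwise reuse the stored one.
    if image is not None:
        session.current_image = image
        session.history.clear()
    if session.current_image is None:
        return "Please provide an image first.", None
    question = _transcribe(audio_path)
    answer = _answer(session.current_image, question)
    session.history.append((question, answer))
    return answer, _synthesize(answer)

# The module-level session object that app.py imports.
session = VisualQAState()

With a stub like this on the Python path, `python app.py` brings up the Gradio UI end to end, which makes it easy to check the wiring of this change (button bindings, output routing, image reuse across follow-ups) before real models are plugged in.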