| from PIL import Image |
| import gradio as gr |
| import os |
| from project_model import process_inputs, session |
|
|
|
|
def handle_question(image, audio):
    """Answer a (possibly follow-up) spoken question about an image.

    Args:
        image: PIL image from the Gradio ``Image`` component, or ``None`` if
            the user asked a follow-up without re-uploading.
        audio: Filesystem path to the recorded question (``type="filepath"``),
            or ``None``/empty if nothing was recorded.

    Returns:
        Tuple of (markdown reply string, audio answer) suitable for the
        ``Markdown`` and ``Audio`` output components. The audio slot is
        ``None`` when the inputs are insufficient to answer.
    """
    # Explicit None-check: truthiness of image objects is fragile (e.g. it
    # raises for numpy arrays, and is ambiguous for some image types).
    if image is not None and audio:
        # Fresh question: a new image accompanies the recording.
        message, answer_audio = process_inputs(session, image=image, audio_path=audio)
        _save_session_images(image)
    elif audio:
        # Follow-up question: reuse the image kept on the session.
        if not session.current_image:
            return "No initial image found for the follow-up question.", None

        message, answer_audio = process_inputs(
            session, image=session.current_image, audio_path=audio
        )
        _save_session_images(session.current_image)
    else:
        return "Please upload an image and/or record an audio clip.", None

    # NOTE(review): the original reply markdown was truncated in this copy of
    # the file; it appeared to continue past the bolded message (presumably
    # embedding the saved original/annotated images) — TODO restore if needed.
    markdown_reply = f"**{message}**\n\n"
    return markdown_reply, answer_audio


def _save_session_images(original):
    """Persist the question's original image and, when present, the
    session's annotated image to fixed PNG paths in the working directory.

    Both branches of ``handle_question`` previously duplicated this logic.
    """
    original_path = "uploaded_image.png"
    annotated_path = "annotated_image.png"

    if original is not None:
        original.save(original_path)
    if session.annotated_image:
        session.annotated_image.save(annotated_path)
|
|
|
|
| |
# Build the Gradio UI: left column takes the inputs, right column shows the
# answer. Component creation order is preserved so the layout is unchanged.
with gr.Blocks() as demo:
    gr.Markdown("## Multimodal Visual Q&A with Audio Output")

    with gr.Row():
        # Input side: an image (upload or webcam) plus a voice recording.
        with gr.Column():
            image_input = gr.Image(
                label="Upload or Capture Image",
                sources=["upload", "webcam"],
                type="pil",
            )
            audio_input = gr.Audio(
                label="Ask a Question (Voice)",
                sources=["microphone"],
                type="filepath",
            )
            submit_btn = gr.Button("Submit Question")

        # Output side: the markdown answer and its spoken rendition.
        with gr.Column():
            status_output = gr.Markdown(label="Response")
            audio_output = gr.Audio(label="Audio Answer", interactive=False)

    # Route the button press through the handler defined above.
    submit_btn.click(
        fn=handle_question,
        inputs=[image_input, audio_input],
        outputs=[status_output, audio_output],
    )
|
|
if __name__ == "__main__":
    # show_error surfaces handler exceptions in the browser UI;
    # share=True requests a temporary public Gradio tunnel URL.
    demo.launch(show_error=True, share=True)