# app.py — Hugging Face Space by saa231 (commit 69a31d8, verified), 3.09 kB
from PIL import Image
import gradio as gr
import os
from project_model import process_inputs, session
def handle_question(image, audio):
    """Answer a spoken question about an image.

    Two flows are supported:
      * initial question  -- both an image and an audio clip are supplied;
      * follow-up question -- only audio is supplied, and the image stored
        on ``session`` by the previous turn is reused.

    Args:
        image: PIL image from the Gradio image component, or None.
        audio: filesystem path of the recorded question, or None.

    Returns:
        Tuple ``(markdown_reply, answer_audio_path)``; the audio element is
        None when input is missing or no prior image exists for a follow-up.
    """
    # Explicit None checks — don't rely on truthiness of a PIL image object.
    if image is not None and audio is not None:
        # Initial question: analyse the freshly uploaded image.
        message, answer_audio = process_inputs(session, image=image, audio_path=audio)
        image.save("uploaded_image.png")
    elif audio is not None:
        # Follow-up: reuse the image remembered by the session.
        if not session.current_image:
            return "No initial image found for the follow-up question.", None
        message, answer_audio = process_inputs(
            session, image=session.current_image, audio_path=audio
        )
        session.current_image.save("uploaded_image.png")
    else:
        return "Please upload an image and/or record an audio clip.", None

    # Persist the annotated (object-detection) image when the model produced one.
    if session.annotated_image:
        session.annotated_image.save("annotated_image.png")

    # Rendered by a gr.Markdown component; the spoken answer is played separately.
    markdown_reply = f"**{message}**\n\n"
    return markdown_reply, answer_audio
# --- Gradio App ---
# Layout: inputs (image + microphone + button) on the left, the markdown
# reply and the synthesized audio answer on the right.
with gr.Blocks() as demo:
    gr.Markdown("## Multimodal Visual Q&A with Audio Output")
    with gr.Row():
        with gr.Column():
            img_in = gr.Image(label="Upload or Capture Image", sources=["upload", "webcam"], type="pil")
            mic_in = gr.Audio(label="Ask a Question (Voice)", sources=["microphone"], type="filepath")
            ask_btn = gr.Button("Submit Question")
        with gr.Column():
            reply_md = gr.Markdown(label="Response")  # Markdown so bold/links render
            answer_out = gr.Audio(label="Audio Answer", interactive=False)
    # A single button serves both the initial and the follow-up questions.
    ask_btn.click(fn=handle_question, inputs=[img_in, mic_in], outputs=[reply_md, answer_out])
# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch(show_error=True, share=True)