File size: 2,510 Bytes
b931330 3103aa5 1057b7a 7fa1011 2a693ec 23013ca 070fc32 109c765 37ccf02 109c765 1057b7a 109c765 1057b7a 109c765 1057b7a 109c765 03a525c 334328d 03a525c 3c53d13 e268082 3b94126 8cdf492 070fc32 334328d 3b94126 109c765 70f14f4 3b94126 109c765 3b94126 334328d 070fc32 70f14f4 03a525c 3c53d13 a72a60d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
from PIL import Image
import gradio as gr
import uuid
from project_model import process_inputs, session
def handle_question(image, audio):
    """Answer a question about an image and return Markdown text plus audio.

    Supplying an image starts a fresh conversation: the image is stored as
    ``session.current_image`` and ``session.messages`` / ``session.images``
    are cleared. When no image is supplied, the image already stored on the
    session is reused.

    Args:
        image: Newly supplied image, or ``None`` to keep using the stored one.
            It must provide a ``save(path)`` method (the UI in this file is
            configured to pass PIL images).
        audio: Forwarded to ``process_inputs`` as ``audio_path``. May be
            ``None`` or empty when only an image is supplied.

    Returns:
        A ``(reply, answer_audio)`` tuple. ``reply`` is the message wrapped in
        Markdown bold. When input is missing, or a ``ValueError`` is raised
        while handling the request, ``reply`` is a plain explanatory message
        and ``answer_audio`` is ``None``.
    """
    try:
        # Identity check rather than truthiness: the truth value of an image
        # object is not a reliable "was something uploaded" signal, and
        # array-like inputs raise ValueError when coerced to bool.
        has_new_image = image is not None

        # No input provided
        if not has_new_image and not audio:
            return "Please upload an image and/or record an audio clip.", None

        # Any new image (with or without audio) restarts the conversation.
        if has_new_image:
            session.current_image = image
            session.messages = []
            session.images = []

        if session.current_image is None:
            return "No initial image found. Please upload an image first.", None

        # Process inputs
        message, answer_audio = process_inputs(
            session, image=session.current_image, audio_path=audio
        )

        # Save images (only if a new image was uploaded). The random suffix
        # keeps files from different uploads from overwriting each other.
        if has_new_image:
            unique_id = uuid.uuid4().hex
            image.save(f"uploaded_image_{unique_id}.png")
            # NOTE(review): truthiness kept as in the original because the
            # type of session.annotated_image is not visible from this file.
            if session.annotated_image:
                session.annotated_image.save(f"annotated_image_{unique_id}.png")

        # Build Markdown reply
        return f"**{message}**\n\n", answer_audio
    except ValueError as e:
        return f"Error: {e}", None
# --- Gradio App ---
# Two-column page: inputs and the submit button first, answers second.
with gr.Blocks() as demo:
    gr.Markdown("## Multimodal Visual Q&A with Audio Output")

    with gr.Row():
        with gr.Column():
            # type="pil" makes the handler receive an image object rather
            # than a file path.
            image_input = gr.Image(
                label="Upload or Capture Image",
                sources=["upload", "webcam"],
                type="pil",
            )
            # type="filepath" makes the handler receive the recording's path.
            audio_input = gr.Audio(
                label="Ask a Question (Voice)",
                sources=["microphone"],
                type="filepath",
            )
            submit_btn = gr.Button("Submit Question")

        with gr.Column():
            # Markdown component so the bold-formatted reply is rendered.
            status_output = gr.Markdown(label="Response")
            audio_output = gr.Audio(label="Audio Answer", interactive=False)

    # Wire the button: the two inputs go to handle_question, and its
    # (text, audio) result fills the two output components in order.
    submit_btn.click(
        fn=handle_question,
        inputs=[image_input, audio_input],
        outputs=[status_output, audio_output],
    )

# Start the server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(show_error=True)
|