File size: 2,510 Bytes
b931330
3103aa5
1057b7a
7fa1011
2a693ec
23013ca
070fc32
109c765
 
 
 
 
 
 
37ccf02
109c765
 
 
 
 
 
 
 
 
 
 
1057b7a
 
 
 
109c765
1057b7a
 
109c765
 
 
 
 
 
 
 
1057b7a
109c765
 
 
03a525c
334328d
03a525c
3c53d13
e268082
3b94126
 
 
8cdf492
070fc32
334328d
3b94126
 
109c765
70f14f4
3b94126
109c765
3b94126
334328d
070fc32
70f14f4
03a525c
3c53d13
 
a72a60d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from PIL import Image
import gradio as gr
import uuid
from project_model import process_inputs, session


def handle_question(image, audio):
    """Answer a spoken question about an image; return text and audio.

    A newly supplied image starts a fresh conversation: it replaces
    ``session.current_image`` and clears ``session.messages`` and
    ``session.images``. When only audio is supplied, the question is asked
    about the image stored by an earlier call.

    Args:
        image: The newly uploaded/captured image, or ``None`` when no new
            image was provided. Must expose ``.save(path)`` (the UI in this
            file passes a PIL image).
        audio: File path of the recorded question, or ``None``/empty when
            nothing was recorded.

    Returns:
        A ``(markdown_text, answer_audio)`` tuple. ``answer_audio`` is the
        second value returned by ``process_inputs``; it is ``None`` when the
        call ends early (no input, no stored image) or a ``ValueError`` was
        raised while processing.
    """
    # Use one explicit identity test everywhere instead of mixing
    # ``not image`` / ``if image:`` with ``image is not None``.
    has_new_image = image is not None

    # Nothing to do: neither a new image nor a recorded question.
    if not has_new_image and not audio:
        return "Please upload an image and/or record an audio clip.", None

    try:
        # A new image (with or without audio) resets the conversation state.
        if has_new_image:
            session.current_image = image
            session.messages = []
            session.images = []

        # Audio-only follow-up, but no image was ever stored.
        if session.current_image is None:
            return "No initial image found. Please upload an image first.", None

        message, answer_audio = process_inputs(
            session, image=session.current_image, audio_path=audio
        )

        # Persist the upload (and its annotated version, if any) under a
        # unique name. Files land in the current working directory; nothing
        # in this function reads them back.
        if has_new_image:
            unique_id = uuid.uuid4().hex
            image.save(f"uploaded_image_{unique_id}.png")
            if session.annotated_image:
                session.annotated_image.save(f"annotated_image_{unique_id}.png")

        # Bold Markdown reply followed by a blank line.
        return f"**{message}**\n\n", answer_audio

    except ValueError as e:
        # Report processing/validation failures as text instead of raising.
        return f"Error: {e}", None


# --- Gradio App ---
# Two-column layout: inputs (image, microphone, submit button) on the left,
# outputs (Markdown text, audio player) on the right.
with gr.Blocks() as demo:
    gr.Markdown(value="## Multimodal Visual Q&A with Audio Output")

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            image_input = gr.Image(
                label="Upload or Capture Image",
                sources=["upload", "webcam"],
                type="pil",
            )
            audio_input = gr.Audio(
                label="Ask a Question (Voice)",
                sources=["microphone"],
                type="filepath",
            )
            submit_btn = gr.Button(value="Submit Question")

        # Right column: the answer as formatted text and as audio.
        with gr.Column():
            status_output = gr.Markdown(label="Response")
            audio_output = gr.Audio(label="Audio Answer", interactive=False)

    # Clicking the button runs handle_question on (image, audio) and routes
    # its two return values to the Markdown and audio outputs, in that order.
    submit_btn.click(
        handle_question,
        [image_input, audio_input],
        [status_output, audio_output],
    )

# Start the app only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch(show_error=True)