|
|
import gradio as gr |
|
|
from PIL import Image |
|
|
from project_module import process_inputs, VisualQAState |
|
|
|
|
|
|
|
|
# Module-level conversation state shared by every request handled by this
# process: one Q&A session object, the most recently supplied image, and
# the running Q/A transcript shown in the chat panel.
# NOTE(review): these globals are shared across all concurrent users of the
# Gradio app — per-user isolation would need gr.State; confirm intended.
session = VisualQAState()




current_image = None


chat_history = []
|
|
|
|
|
|
|
|
def handle_inputs(new_image, audio, followup_text, tts_enabled):
    """Run one round of the visual Q&A interaction.

    Args:
        new_image: Freshly uploaded/captured PIL image, or None if the user
            is following up on the current image.
        audio: Filepath of a recorded voice question, or None.
        followup_text: Typed question or follow-up; may be None or blank.
        tts_enabled: When True, the synthesized answer audio is returned.

    Returns:
        Tuple of (answer_text, displayed_image, answer_audio_or_None,
        history_markdown) matching the click handler's output components.
    """
    global current_image, chat_history

    # Normalize the typed input once; whitespace-only text counts as empty.
    # (Previously "   " passed the follow-up gate and triggered a model call
    # with an empty question and no audio.)
    typed = followup_text.strip() if followup_text else ""

    if new_image is not None:
        # A new image starts a fresh conversation.
        current_image = new_image
        chat_history.clear()
        question = typed or "Describe the image"
        response, audio_output = process_inputs(
            session, image=current_image, question=question, audio_path=audio, enable_tts=tts_enabled
        )
    elif current_image is not None and (typed or audio):
        # Follow-up on the existing image: text question, voice question,
        # or both (process_inputs resolves the audio into a question).
        question = typed
        response, audio_output = process_inputs(
            session, image=None, question=question, audio_path=audio, enable_tts=tts_enabled
        )
    else:
        # Nothing actionable: no image yet, or no question/audio supplied.
        return "Please upload an image and ask a question.", None, None, ""

    # Append this exchange and render the full transcript as markdown.
    chat_history.append(f"🗨️ **Q:** {question}\n🧠 **A:** {response}")
    history_text = "\n\n".join(chat_history)

    return response, current_image, audio_output if tts_enabled else None, history_text
|
|
|
|
|
|
|
|
with gr.Blocks() as demo:
    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output + History")

    with gr.Row():
        # Left column: every user-facing input control.
        with gr.Column():
            img_in = gr.Image(
                type="pil",
                sources=["upload", "webcam"],
                label="Upload or Capture New Image",
            )
            mic_in = gr.Audio(
                type="filepath",
                sources=["microphone"],
                label="Record Voice (Optional)",
            )
            question_box = gr.Textbox(
                placeholder="e.g., What’s going on?",
                label="Type a Question or Follow-up",
            )
            tts_checkbox = gr.Checkbox(value=True, label="Enable Audio Response")
            ask_button = gr.Button("Ask")

        # Right column: model outputs mirrored back to the user.
        with gr.Column():
            answer_box = gr.Textbox(label="Answer", interactive=False)
            shown_image = gr.Image(label="Current Image")
            answer_audio = gr.Audio(label="Answer Audio", interactive=False)
            history_panel = gr.Markdown(label="Chat History")

    # Wire the button through handle_inputs; input/output ordering must
    # match the function's parameters and its returned tuple.
    ask_button.click(
        fn=handle_inputs,
        inputs=[img_in, mic_in, question_box, tts_checkbox],
        outputs=[answer_box, shown_image, answer_audio, history_panel],
    )
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
demo.launch(show_error=True, share=True) |
|
|
|