# app.py — Multimodal Visual Q&A Gradio demo
# (Hugging Face Space by saa231, commit 2ede072, 3.18 kB — web-view header
#  lines removed so the file parses as Python.)
import gradio as gr
from PIL import Image
from project_module import process_inputs, VisualQAState
# Conversation-state object passed into process_inputs on every call;
# it lets the backend keep per-image dialogue context between questions.
session = VisualQAState()

# Module-level state: the image currently under discussion and the rendered
# Q&A transcript (one markdown string per exchange).
# NOTE(review): module globals are shared by every visitor of a hosted
# Space, so concurrent users would see each other's conversation — gr.State
# would give per-session isolation. Confirm single-user use is intended.
current_image = None
chat_history = []
# Unified handler for new questions or new images
def handle_inputs(new_image, audio, followup_text, tts_enabled):
    """Process a new image upload or a follow-up question about the current image.

    Args:
        new_image: PIL image from the gr.Image input, or None if unchanged.
        audio: filepath of a recorded voice question, or None.
        followup_text: typed question text (may be empty/whitespace).
        tts_enabled: whether to return a spoken answer.

    Returns:
        (answer_text, image_to_display, answer_audio_or_None, history_markdown)
    """
    global current_image, chat_history

    # Normalize the typed question once; whitespace-only input counts as
    # empty (the original `if followup_text` treated "   " as a question).
    typed = (followup_text or "").strip()

    if new_image is not None:
        # A new image starts a fresh conversation: reset shared state.
        current_image = new_image
        chat_history.clear()
        # Fall back to a generic prompt when no usable question was typed.
        question = typed or "Describe the image"
        response, audio_output = process_inputs(
            session,
            image=current_image,
            question=question,
            audio_path=audio,
            enable_tts=tts_enabled,
        )
    elif current_image is not None and (typed or audio):
        # Follow-up about the image already in play. The question may be
        # empty when the user asked by voice only — presumably the audio
        # file carries the question; process_inputs handles transcription.
        question = typed
        response, audio_output = process_inputs(
            session,
            image=None,
            question=question,
            audio_path=audio,
            enable_tts=tts_enabled,
        )
    else:
        # No image and no question — nothing to do.
        return "Please upload an image and ask a question.", None, None, ""

    # Append the exchange and render the transcript as markdown.
    chat_history.append(f"🗨️ **Q:** {question}\n🧠 **A:** {response}")
    history_text = "\n\n".join(chat_history)

    return response, current_image, audio_output if tts_enabled else None, history_text
# ---- Gradio interface -------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output + History")

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            img_in = gr.Image(
                label="Upload or Capture New Image",
                sources=["upload", "webcam"],
                type="pil",
            )
            mic_in = gr.Audio(
                label="Record Voice (Optional)",
                sources=["microphone"],
                type="filepath",
            )
            question_box = gr.Textbox(
                label="Type a Question or Follow-up",
                placeholder="e.g., What’s going on?",
            )
            speak_answer = gr.Checkbox(label="Enable Audio Response", value=True)
            ask_button = gr.Button("Ask")

        # Right column: the model's answer, the image echoed back,
        # the optional TTS audio, and the running transcript.
        with gr.Column():
            answer_box = gr.Textbox(label="Answer", interactive=False)
            shown_image = gr.Image(label="Current Image")
            answer_audio = gr.Audio(label="Answer Audio", interactive=False)
            transcript = gr.Markdown(label="Chat History")

    # Route the button press through the shared handler.
    ask_button.click(
        fn=handle_inputs,
        inputs=[img_in, mic_in, question_box, speak_answer],
        outputs=[answer_box, shown_image, answer_audio, transcript],
    )
# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    # show_error surfaces server tracebacks in the browser UI;
    # share=True requests a temporary public gradio.live tunnel link.
    demo.launch(show_error=True, share=True)