import gradio as gr
from PIL import Image

from project_module import process_inputs, VisualQAState

# Question sent when a new image arrives without any usable typed question.
DEFAULT_QUESTION = "Describe the image"

# Message shown when there is nothing to process.
NO_INPUT_MESSAGE = "Please upload an image and ask a question."

# Session object handed to every process_inputs() call.
session = VisualQAState()

# Module-level state: the image currently under discussion and the Q&A log.
# NOTE(review): these globals live in the server process, so they look shared
# by every client connected to it -- confirm that is acceptable, especially
# with `share=True` in the launch call below.
current_image = None
chat_history = []


def handle_inputs(new_image, audio, followup_text, tts_enabled):
    """Handle one click of the "Ask" button.

    A non-None ``new_image`` starts a fresh conversation: it becomes the
    current image and the chat history is cleared. Otherwise, if an image is
    already stored and the user supplied text or audio, the input is treated
    as a follow-up question. With nothing usable to process, a prompt message
    is returned and ``process_inputs`` is not called.

    Args:
        new_image: Image from the image input, or None when it is empty.
        audio: Value of the audio input (the component is configured with
            ``type="filepath"``), or None.
        followup_text: Typed question; may be None, empty, or whitespace.
        tts_enabled: Whether the audio answer should be returned to the UI.

    Returns:
        A 4-tuple ``(answer, image, answer_audio, history_markdown)`` matching
        the outputs wired to the submit button.
    """
    global current_image

    # Strip once up front so that whitespace-only text counts as "no text" in
    # both branches, instead of being sent on as an empty question.
    typed_question = followup_text.strip() if followup_text else ""

    if new_image is not None:
        # NOTE(review): if the image input still holds the previous upload on
        # the next click, this branch runs again and clears the history --
        # confirm that is the intended behaviour.
        current_image = new_image
        chat_history.clear()
        question = typed_question or DEFAULT_QUESTION
        image_for_call = current_image
    elif current_image is not None and (typed_question or audio):
        # Follow-up on the stored image: no image is passed to process_inputs.
        question = typed_question
        image_for_call = None
    else:
        return NO_INPUT_MESSAGE, None, None, ""

    response, audio_output = process_inputs(
        session,
        image=image_for_call,
        question=question,
        audio_path=audio,
        enable_tts=tts_enabled,
    )

    # Record the exchange and render the full history as markdown.
    chat_history.append(f"🗨️ **Q:** {question}\n🧠 **A:** {response}")
    history_text = "\n\n".join(chat_history)

    return (
        response,
        current_image,
        audio_output if tts_enabled else None,
        history_text,
    )


# Build the Gradio UI.
with gr.Blocks() as demo:
    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output + History")

    with gr.Row():
        with gr.Column():
            # User inputs.
            image_input = gr.Image(
                label="Upload or Capture New Image",
                sources=["upload", "webcam"],
                type="pil",
            )
            audio_input = gr.Audio(
                label="Record Voice (Optional)",
                sources=["microphone"],
                type="filepath",
            )
            followup_text = gr.Textbox(
                label="Type a Question or Follow-up",
                placeholder="e.g., What’s going on?",
            )
            tts_toggle = gr.Checkbox(
                label="Enable Audio Response",
                value=True,
            )
            submit_btn = gr.Button("Ask")

        with gr.Column():
            # Outputs: answer, image shown back, audio (if enabled), history.
            status_output = gr.Textbox(label="Answer", interactive=False)
            image_display = gr.Image(label="Current Image")
            audio_output = gr.Audio(label="Answer Audio", interactive=False)
            chat_box = gr.Markdown(label="Chat History")

    # Link the submit button to the handler function.
    submit_btn.click(
        fn=handle_inputs,
        inputs=[image_input, audio_input, followup_text, tts_toggle],
        outputs=[status_output, image_display, audio_output, chat_box],
    )

# Launch the app.
if __name__ == "__main__":
    demo.launch(show_error=True, share=True)