# app.py — Multimodal Visual Q&A Gradio demo
# (Hugging Face Space by saa231, commit 2ede072, 3.18 kB — web-view header
#  lines removed so the file parses as Python.)
import gradio as gr
from PIL import Image
from project_module import process_inputs, VisualQAState
# Conversation-state object passed into process_inputs on every call;
# it lets the backend keep per-image dialogue context between questions.
session = VisualQAState()

# Module-level state: the image currently under discussion and the rendered
# Q&A transcript (one markdown string per exchange).
# NOTE(review): module globals are shared by every visitor of a hosted
# Space, so concurrent users would see each other's conversation — gr.State
# would give per-session isolation. Confirm single-user use is intended.
current_image = None
chat_history = []
# Unified handler for new questions or new images
def handle_inputs(new_image, audio, followup_text, tts_enabled):
    """Process a new image upload or a follow-up question about the current image.

    Args:
        new_image: PIL image from the gr.Image input, or None if unchanged.
        audio: filepath of a recorded voice question, or None.
        followup_text: typed question text (may be empty/whitespace).
        tts_enabled: whether to return a spoken answer.

    Returns:
        (answer_text, image_to_display, answer_audio_or_None, history_markdown)
    """
    global current_image, chat_history

    # Normalize the typed question once; whitespace-only input counts as
    # empty (the original `if followup_text` treated "   " as a question).
    typed = (followup_text or "").strip()

    if new_image is not None:
        # A new image starts a fresh conversation: reset shared state.
        current_image = new_image
        chat_history.clear()
        # Fall back to a generic prompt when no usable question was typed.
        question = typed or "Describe the image"
        response, audio_output = process_inputs(
            session,
            image=current_image,
            question=question,
            audio_path=audio,
            enable_tts=tts_enabled,
        )
    elif current_image is not None and (typed or audio):
        # Follow-up about the image already in play. The question may be
        # empty when the user asked by voice only — presumably the audio
        # file carries the question; process_inputs handles transcription.
        question = typed
        response, audio_output = process_inputs(
            session,
            image=None,
            question=question,
            audio_path=audio,
            enable_tts=tts_enabled,
        )
    else:
        # No image and no question — nothing to do.
        return "Please upload an image and ask a question.", None, None, ""

    # Append the exchange and render the transcript as markdown.
    chat_history.append(f"🗨️ **Q:** {question}\n🧠 **A:** {response}")
    history_text = "\n\n".join(chat_history)

    return response, current_image, audio_output if tts_enabled else None, history_text
# ---- Gradio interface -------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output + History")

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            img_in = gr.Image(
                label="Upload or Capture New Image",
                sources=["upload", "webcam"],
                type="pil",
            )
            mic_in = gr.Audio(
                label="Record Voice (Optional)",
                sources=["microphone"],
                type="filepath",
            )
            question_box = gr.Textbox(
                label="Type a Question or Follow-up",
                placeholder="e.g., What’s going on?",
            )
            speak_answer = gr.Checkbox(label="Enable Audio Response", value=True)
            ask_button = gr.Button("Ask")

        # Right column: the model's answer, the image echoed back,
        # the optional TTS audio, and the running transcript.
        with gr.Column():
            answer_box = gr.Textbox(label="Answer", interactive=False)
            shown_image = gr.Image(label="Current Image")
            answer_audio = gr.Audio(label="Answer Audio", interactive=False)
            transcript = gr.Markdown(label="Chat History")

    # Route the button press through the shared handler.
    ask_button.click(
        fn=handle_inputs,
        inputs=[img_in, mic_in, question_box, speak_answer],
        outputs=[answer_box, shown_image, answer_audio, transcript],
    )
# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    # show_error surfaces server tracebacks in the browser UI;
    # share=True requests a temporary public gradio.live tunnel link.
    demo.launch(show_error=True, share=True)