saa231 committed
Commit 7fa1011 · verified · 1 Parent(s): cb2c3a0

Update app.py

Files changed (1)
  1. app.py +28 -65
app.py CHANGED
@@ -1,78 +1,41 @@
-import gradio as gr
 from PIL import Image
-from project_model import process_inputs, VisualQAState
 
-# Create a session object to manage conversation state per image
-session = VisualQAState()
 
-# Global variables to keep track of the current image and the conversation history
-current_image = None
-chat_history = []
 
-# Unified handler for new questions or new images
-def handle_inputs(new_image, audio, followup_text, tts_enabled):
-    global current_image, chat_history
-
-    # If a new image is uploaded, reset current session state
-    if new_image is not None:
-        current_image = new_image
-        chat_history.clear()  # Clear previous Q&A history
-        question = followup_text.strip() if followup_text else "Describe the image"
-        # Process the new image and question/audio
-        response, audio_output = process_inputs(
-            session, image=current_image, question=question, audio_path=audio, enable_tts=tts_enabled
-        )
-    elif current_image is not None and (followup_text or audio):
-        # Follow-up question for current image
-        question = followup_text.strip() if followup_text else ""
-        response, audio_output = process_inputs(
-            session, image=None, question=question, audio_path=audio, enable_tts=tts_enabled
-        )
-    else:
-        # No input given
-        return "Please upload an image and ask a question.", None, None, ""
-
-    # Append the Q&A to chat history
-    chat_history.append(f"🗨️ **Q:** {question}\n🧠 **A:** {response}")
-    history_text = "\n\n".join(chat_history)  # Format chat as markdown
-
-    return response, current_image, audio_output if tts_enabled else None, history_text
-
-# Build the Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output + History")
 
     with gr.Row():
         with gr.Column():
-            # User inputs
-            image_input = gr.Image(
-                label="Upload or Capture New Image", sources=["upload", "webcam"], type="pil"
-            )
-            audio_input = gr.Audio(
-                label="Record Voice (Optional)", sources=["microphone"], type="filepath"
-            )
-            followup_text = gr.Textbox(
-                label="Type a Question or Follow-up", placeholder="e.g., What’s going on?"
-            )
-            tts_toggle = gr.Checkbox(
-                label="Enable Audio Response", value=True
-            )
-            submit_btn = gr.Button("Ask")
-
         with gr.Column():
-            # Outputs: Answer, image shown back, audio (if enabled), and chat history
-            status_output = gr.Textbox(label="Answer", interactive=False)
-            image_display = gr.Image(label="Current Image")
             audio_output = gr.Audio(label="Answer Audio", interactive=False)
-            chat_box = gr.Markdown(label="Chat History")
 
-    # Link the submit button to the handler function
-    submit_btn.click(
-        fn=handle_inputs,
-        inputs=[image_input, audio_input, followup_text, tts_toggle],
-        outputs=[status_output, image_display, audio_output, chat_box]
-    )
 
-# Launch the app
 if __name__ == "__main__":
-    demo.launch(show_error=True, share=True)
+import gradio as gr  # still needed below for gr.Blocks and the widgets
 from PIL import Image
+from project_model import process_inputs, session
 
+def handle_initial(image, audio):
+    if image is None or audio is None:
+        return "Please upload both an image and an audio clip.", None, None
+    message, answer_audio = process_inputs(session, image=image, audio_path=audio)
+    return message, image, answer_audio
 
+def handle_followup(followup_audio):
+    if followup_audio is None:
+        return "Please record a follow-up question.", None, None
+    message, answer_audio = process_inputs(session, audio_path=followup_audio)
+    return message, session.current_image, answer_audio  # reuse the session's image
 
 with gr.Blocks() as demo:
+    gr.Markdown("## 👁️🎙️ Multimodal Visual Q&A with Audio Output")
 
     with gr.Row():
         with gr.Column():
+            image_input = gr.Image(label="Upload or Capture Image", sources=["upload", "webcam"], type="pil")
+            audio_input = gr.Audio(label="Initial Question (Voice)", sources=["microphone"], type="filepath")
+            submit_btn = gr.Button("Submit Initial Q&A")
+
+            gr.Markdown("### 🎤 Ask a Follow-up Question")
+            followup_audio_input = gr.Audio(label="Follow-up Question", sources=["microphone"], type="filepath")
+            followup_btn = gr.Button("Ask Follow-up")
+
         with gr.Column():
+            status_output = gr.Textbox(label="Response", interactive=False)
+            image_display = gr.Image(label="Context Image")
             audio_output = gr.Audio(label="Answer Audio", interactive=False)
 
+    submit_btn.click(fn=handle_initial, inputs=[image_input, audio_input],
+                     outputs=[status_output, image_display, audio_output])
+
+    followup_btn.click(fn=handle_followup, inputs=followup_audio_input,
+                       outputs=[status_output, image_display, audio_output])
 
 if __name__ == "__main__":
+    demo.launch(show_error=True, share=True)
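
Note: this commit only touches app.py; `process_inputs` and `session` come from project_model, which is not part of the diff. Below is a minimal stub of that interface, inferred from how app.py calls it: `process_inputs(session, image=..., audio_path=...)` returning a `(message, answer_audio)` pair, and a module-level `session` exposing `current_image`. The `VisualQAState` name is taken from the previous revision; the underscore-prefixed helpers are hypothetical placeholders standing in for the real ASR, VQA, and TTS components, not the actual implementation.

# project_model.py: a minimal stand-in for the module app.py imports.
# Only the interface is taken from app.py; the helpers below are
# hypothetical placeholders for the real ASR, VQA, and TTS pipeline.

class VisualQAState:
    """Per-session context: the image under discussion and the Q&A history."""
    def __init__(self):
        self.current_image = None
        self.history = []

def _transcribe(audio_path):
    # Placeholder: a real implementation would run speech-to-text here.
    return f"(question transcribed from {audio_path})"

def _answer(image, question):
    # Placeholder: a real implementation would query a VQA model here.
    return f"(answer about the current image for: {question})"

def _synthesize(text):
    # Placeholder: a real implementation would return a path to a TTS audio file.
    return None

def process_inputs(session, image=None, audio_path=None):
    # A new image starts a fresh conversation; otherwise reuse the stored one.
    if image is not None:
        session.current_image = image
        session.history.clear()
    if session.current_image is None:
        return "Please provide an image first.", None
    question = _transcribe(audio_path)
    answer = _answer(session.current_image, question)
    session.history.append((question, answer))
    return answer, _synthesize(answer)

# The module-level session object that app.py imports.
session = VisualQAState()

With a stub like this on the Python path, `python app.py` brings up the Gradio UI end to end, which makes it easy to check the wiring of this change (button bindings, output routing, image reuse across follow-ups) before real models are plugged in.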