Saroj Bono committed on
Commit
f53082b
·
1 Parent(s): fc0e5f3

Fix: Refactor to gr.Interface for max compatibility

Browse files
Files changed (1) hide show
  1. app.py +19 -31
app.py CHANGED
@@ -5,6 +5,7 @@ An application that uses vision and voice to answer questions about the world.
5
  import gradio as gr
6
  from transformers import pipeline
7
  from PIL import Image
 
8
 
9
  # --- AI Model Initialization ---
10
  # Load a pre-trained model that is supported for visual-question-answering.
@@ -14,27 +15,23 @@ vqa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-fi
14
  print("Model initialized successfully!")
15
 
16
  # --- Main Processing Function ---
17
- def process_inputs(image, audio_question):
18
  """
19
  Takes a webcam image and a spoken question, and returns the AI's answer.
20
  """
21
- if image is None:
22
  return "Please provide an image from the webcam."
23
- if audio_question is None:
24
  return "Please ask a question using the microphone."
25
 
26
  try:
27
- # For now, we'll use a placeholder for the question.
28
- # In the next step, we'll add speech-to-text.
29
  question = "What do you see in this image?"
30
 
31
- # Convert the NumPy array from the webcam to a PIL Image
32
- pil_image = Image.fromarray(image)
33
-
34
  print(f"Processing question: '{question}'")
35
- result = vqa_pipeline(image=pil_image, question=question)
36
 
37
- # Extract the answer from the model's output
38
  answer = result[0]['answer'] if result else "I'm not sure."
39
  print(f"AI Answer: {answer}")
40
 
@@ -44,27 +41,18 @@ def process_inputs(image, audio_question):
44
  print(f"An error occurred: {e}")
45
  return "Sorry, I encountered an error. Please try again."
46
 
47
- # --- User Interface Definition ---
48
- with gr.Blocks(theme=gr.themes.Soft(), title="Seeing-Eye AI Assistant") as demo:
49
- gr.Markdown("# 👁️ Seeing-Eye AI Assistant")
50
- gr.Markdown("Point your webcam at an object and ask a question. For now, any recording will ask: 'What do you see?'")
51
-
52
- with gr.Row():
53
- webcam_input = gr.Image(source="webcam", label="Webcam Feed")
54
-
55
- with gr.Column():
56
- audio_input = gr.Audio(type="filepath", label="Ask a Question")
57
- ai_output_text = gr.Textbox(label="AI Assistant's Response", interactive=False)
58
-
59
- # Connect the components to the processing function
60
- audio_input.stop_recording(
61
- fn=process_inputs,
62
- inputs=[webcam_input, audio_input],
63
- outputs=ai_output_text
64
- )
65
-
66
- gr.Markdown("---")
67
- gr.Markdown("Built with Gradio and Hugging Face Transformers.")
68
 
69
  if __name__ == "__main__":
70
  demo.queue().launch(share=True)
 
5
  import gradio as gr
6
  from transformers import pipeline
7
  from PIL import Image
8
+ import numpy as np
9
 
10
  # --- AI Model Initialization ---
11
  # Load a pre-trained model that is supported for visual-question-answering.
 
15
  print("Model initialized successfully!")
16
 
17
# --- Main Processing Function ---
def process_inputs(webcam_image, mic_audio):
    """
    Answer a spoken question about a webcam image.

    Args:
        webcam_image: The captured webcam frame (a PIL Image in this
            setup — TODO confirm Gradio delivers a PIL Image here and
            not a NumPy array).
        mic_audio: Filepath of the recorded question audio. Currently
            only checked for presence; speech-to-text is not wired up yet.

    Returns:
        str: The model's answer, or a user-facing prompt/error message.
    """
    # Guard clauses: prompt the user instead of crashing on missing input.
    if webcam_image is None:
        return "Please provide an image from the webcam."
    if mic_audio is None:
        return "Please ask a question using the microphone."

    try:
        # Placeholder for the real question from speech-to-text
        question = "What do you see in this image?"

        # The webcam input is already a PIL Image in this setup
        print(f"Processing question: '{question}'")
        result = vqa_pipeline(image=webcam_image, question=question)

        # The pipeline returns a ranked list of answers; take the top one.
        answer = result[0]['answer'] if result else "I'm not sure."
        print(f"AI Answer: {answer}")
        # NOTE(review): this success-path return is hidden in the diff's
        # context gap; without it the function would return None.
        return answer
    except Exception as e:
        # Best-effort UI: surface a friendly message rather than a traceback.
        print(f"An error occurred: {e}")
        return "Sorry, I encountered an error. Please try again."
43
 
44
# --- User Interface Definition using gr.Interface for maximum compatibility ---
# Build each component up front so the Interface call stays readable.
webcam_feed = gr.Image(source="webcam", label="Webcam Feed")
spoken_question = gr.Audio(source="microphone", type="filepath", label="Ask a Question")
response_box = gr.Textbox(label="AI Assistant's Response")

demo = gr.Interface(
    fn=process_inputs,
    inputs=[webcam_feed, spoken_question],
    outputs=response_box,
    title="👁️ Seeing-Eye AI Assistant",
    description="Point your webcam at an object, ask a question, and see the AI's response.",
    allow_flagging="never",
)
 
 
 
 
 
 
 
 
 
56
 
57
if __name__ == "__main__":
    # Enable request queueing, then launch with a public share link.
    app = demo.queue()
    app.launch(share=True)