Spaces:
Configuration error
Configuration error
Saroj Bono committed on
Commit ·
f53082b
1
Parent(s): fc0e5f3
Fix: Refactor to gr.Interface for max compatibility
Browse files
app.py
CHANGED
|
@@ -5,6 +5,7 @@ An application that uses vision and voice to answer questions about the world.
|
|
| 5 |
import gradio as gr
|
| 6 |
from transformers import pipeline
|
| 7 |
from PIL import Image
|
|
|
|
| 8 |
|
| 9 |
# --- AI Model Initialization ---
|
| 10 |
# Load a pre-trained model that is supported for visual-question-answering.
|
|
@@ -14,27 +15,23 @@ vqa_pipeline = pipeline("visual-question-answering", model="dandelin/vilt-b32-fi
|
|
| 14 |
print("Model initialized successfully!")
|
| 15 |
|
| 16 |
# --- Main Processing Function ---
|
| 17 |
-
def process_inputs(
|
| 18 |
"""
|
| 19 |
Takes a webcam image and a spoken question, and returns the AI's answer.
|
| 20 |
"""
|
| 21 |
-
if
|
| 22 |
return "Please provide an image from the webcam."
|
| 23 |
-
if
|
| 24 |
return "Please ask a question using the microphone."
|
| 25 |
|
| 26 |
try:
|
| 27 |
-
#
|
| 28 |
-
# In the next step, we'll add speech-to-text.
|
| 29 |
question = "What do you see in this image?"
|
| 30 |
|
| 31 |
-
#
|
| 32 |
-
pil_image = Image.fromarray(image)
|
| 33 |
-
|
| 34 |
print(f"Processing question: '{question}'")
|
| 35 |
-
result = vqa_pipeline(image=
|
| 36 |
|
| 37 |
-
# Extract the answer from the model's output
|
| 38 |
answer = result[0]['answer'] if result else "I'm not sure."
|
| 39 |
print(f"AI Answer: {answer}")
|
| 40 |
|
|
@@ -44,27 +41,18 @@ def process_inputs(image, audio_question):
|
|
| 44 |
print(f"An error occurred: {e}")
|
| 45 |
return "Sorry, I encountered an error. Please try again."
|
| 46 |
|
| 47 |
-
# --- User Interface Definition ---
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
# Connect the components to the processing function
|
| 60 |
-
audio_input.stop_recording(
|
| 61 |
-
fn=process_inputs,
|
| 62 |
-
inputs=[webcam_input, audio_input],
|
| 63 |
-
outputs=ai_output_text
|
| 64 |
-
)
|
| 65 |
-
|
| 66 |
-
gr.Markdown("---")
|
| 67 |
-
gr.Markdown("Built with Gradio and Hugging Face Transformers.")
|
| 68 |
|
| 69 |
if __name__ == "__main__":
|
| 70 |
demo.queue().launch(share=True)
|
|
|
|
| 5 |
import gradio as gr
|
| 6 |
from transformers import pipeline
|
| 7 |
from PIL import Image
|
| 8 |
+
import numpy as np
|
| 9 |
|
| 10 |
# --- AI Model Initialization ---
|
| 11 |
# Load a pre-trained model that is supported for visual-question-answering.
|
|
|
|
| 15 |
print("Model initialized successfully!")
|
| 16 |
|
| 17 |
# --- Main Processing Function ---
|
| 18 |
+
def process_inputs(webcam_image, mic_audio):
|
| 19 |
"""
|
| 20 |
Takes a webcam image and a spoken question, and returns the AI's answer.
|
| 21 |
"""
|
| 22 |
+
if webcam_image is None:
|
| 23 |
return "Please provide an image from the webcam."
|
| 24 |
+
if mic_audio is None:
|
| 25 |
return "Please ask a question using the microphone."
|
| 26 |
|
| 27 |
try:
|
| 28 |
+
# Placeholder for the real question from speech-to-text
|
|
|
|
| 29 |
question = "What do you see in this image?"
|
| 30 |
|
| 31 |
+
# The webcam input is already a PIL Image in this setup
|
|
|
|
|
|
|
| 32 |
print(f"Processing question: '{question}'")
|
| 33 |
+
result = vqa_pipeline(image=webcam_image, question=question)
|
| 34 |
|
|
|
|
| 35 |
answer = result[0]['answer'] if result else "I'm not sure."
|
| 36 |
print(f"AI Answer: {answer}")
|
| 37 |
|
|
|
|
| 41 |
print(f"An error occurred: {e}")
|
| 42 |
return "Sorry, I encountered an error. Please try again."
|
| 43 |
|
| 44 |
+
# --- User Interface Definition using gr.Interface for maximum compatibility ---
|
| 45 |
+
demo = gr.Interface(
|
| 46 |
+
fn=process_inputs,
|
| 47 |
+
inputs=[
|
| 48 |
+
gr.Image(source="webcam", label="Webcam Feed"),
|
| 49 |
+
gr.Audio(source="microphone", type="filepath", label="Ask a Question")
|
| 50 |
+
],
|
| 51 |
+
outputs=gr.Textbox(label="AI Assistant's Response"),
|
| 52 |
+
title="👁️ Seeing-Eye AI Assistant",
|
| 53 |
+
description="Point your webcam at an object, ask a question, and see the AI's response.",
|
| 54 |
+
allow_flagging="never",
|
| 55 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: enable the request queue (needed for long-running model
# calls) and start the Gradio server with a public share link.
if __name__ == "__main__":
    app = demo.queue()
    app.launch(share=True)
|