Spaces:

OppaAI
/

Robot_MCP_Client

Sleeping

App Files Community

OppaAI committed on Nov 24, 2025

Commit

ef5183f

verified ·

1 Parent(s): 79f6e03

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -34

app.py CHANGED Viewed

@@ -6,6 +6,13 @@ from fastmcp.client import StreamableHttpTransport
 import asyncio
 import ast
 import json
 # -------------------------------
 # MCP server info
@@ -22,72 +29,61 @@ HTTP_TRANSPORT = StreamableHttpTransport(url=MCP_SERVER_URL)
 MCP_CLIENT = Client(transport=HTTP_TRANSPORT, name=SERVER_NAME)
 # -------------------------------
-# Async function using user's HF token
 # -------------------------------
 async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None = None):
-    """
-    Send webcam image to MCP server using user's HF token and process the response.
-    """
-    # 1. CHECK LOGIN
     if oauth_token is None:
-        return "Please log in using the button above.", "", "", "", "", "", "", ""
-    # 2. CHECK IMAGE
     if image is None:
         return "", "", "", "", "", "", "", ""
     try:
-        # 3. PREPARE IMAGE
         buffered = io.BytesIO()
         image.save(buffered, format="JPEG")
         b64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
-        # 4. PREPARE PAYLOAD
         payload = {
             "hf_token_input": oauth_token.token,
             "robot_id_input": ROBOT_ID,
             "image_b64_input": b64_img
         }
-        # 5. CALL MCP SERVER
         async with MCP_CLIENT:
             response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)
             if response.is_error:
-                # Handle error content safely
                 error_msg = "Unknown Error"
                 if hasattr(response, 'content') and isinstance(response.content, list):
                     error_msg = " ".join([getattr(item, 'text', '') for item in response.content])
                 raise Exception(f"MCP Tool Error: {error_msg}")
-            # ---------------------------------------------------------
-            # FIX: Handle List Content
-            # The 'content' is a list of objects (e.g., TextContent).
-            # We iterate through the list and join the text parts.
-            # ---------------------------------------------------------
             raw_text = ""
             if hasattr(response, 'content') and isinstance(response.content, list):
                 for item in response.content:
-                    # Check if the item has a 'text' attribute
                     if hasattr(item, 'text'):
                         raw_text += item.text
             else:
-                # Fallback for unexpected structure
                 raw_text = str(response)
-            # 6. PARSE RESPONSE
             try:
                 response_dict = json.loads(raw_text)
             except json.JSONDecodeError:
                 try:
                     response_dict = ast.literal_eval(raw_text)
                 except Exception:
-                    # If parsing fails completely, return the raw text in description
                     return f"Parsing Error. Raw output: {raw_text}", "", "", "", "", "", "", ""
             vlm_result = response_dict.get("result", {})
-            # 7. EXTRACT DATA
             description_out = vlm_result.get("description", "")
             environment_out = vlm_result.get("environment", "")
             indoor_outdoor_out = vlm_result.get("indoor_or_outdoor", "")
@@ -111,33 +107,44 @@ async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None =
             )
     except Exception as e:
-        print(f"Error calling MCP API: {e}")
         return f"Error: {e}", "", "", "", "", "", "", ""
 # -------------------------------
 # Gradio UI
 # -------------------------------
-with gr.Blocks() as demo:
     gr.Markdown("## 🎥 Robot Vision Webcam Stream (MCP Client)")
     gr.LoginButton()
     with gr.Row():
         webcam_input = gr.Image(
-            label="Captured from Web-Cam",
             sources=["webcam"],
             type="pil"
         )
         with gr.Column():
-            description_out = gr.Textbox(label="Description", lines=5)
-            environment_out = gr.Textbox(label="Environment", lines=3)
-            indoor_outdoor_out = gr.Textbox(label="Indoor/Outdoor", lines=1)
-            lighting_condition_out = gr.Textbox(label="Lighting Condition", lines=1)
-            human_out = gr.Textbox(label="Human Detected", lines=3)
-            animals_out = gr.Textbox(label="Animals Detected", lines=2)
-            objects_out = gr.Textbox(label="Objects Detected", lines=2)
-            hazards_out = gr.Textbox(label="Hazards Identified", lines=2)
     webcam_input.stream(
         process_webcam_stream_async,
         inputs=[webcam_input],
@@ -151,7 +158,10 @@ with gr.Blocks() as demo:
             objects_out,
             hazards_out
         ],
-        stream_every=1.0
     )
 if __name__ == "__main__":

 import asyncio
 import ast
 import json
+import warnings
+# -------------------------------
+# 0. CLEANUP: Ignore the spammy DeprecationWarnings and UserWarnings
+# -------------------------------
+warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
 # -------------------------------
 # MCP server info
 MCP_CLIENT = Client(transport=HTTP_TRANSPORT, name=SERVER_NAME)
 # -------------------------------
+# Async function
 # -------------------------------
 async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None = None):
+    # 1. Login Check
     if oauth_token is None:
+        return "⚠️ Please log in via the button above to start.", "", "", "", "", "", "", ""
+    # 2. Image Check
     if image is None:
         return "", "", "", "", "", "", "", ""
     try:
+        # 3. Process Image
         buffered = io.BytesIO()
         image.save(buffered, format="JPEG")
         b64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
         payload = {
             "hf_token_input": oauth_token.token,
             "robot_id_input": ROBOT_ID,
             "image_b64_input": b64_img
         }
+        # 4. Call MCP Server
         async with MCP_CLIENT:
             response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)
+            # Handle MCP Errors
             if response.is_error:
                 error_msg = "Unknown Error"
                 if hasattr(response, 'content') and isinstance(response.content, list):
                     error_msg = " ".join([getattr(item, 'text', '') for item in response.content])
                 raise Exception(f"MCP Tool Error: {error_msg}")
+            # 5. Extract Text from Response List
             raw_text = ""
             if hasattr(response, 'content') and isinstance(response.content, list):
                 for item in response.content:
                     if hasattr(item, 'text'):
                         raw_text += item.text
             else:
                 raw_text = str(response)
+            # 6. Parse JSON/Dict
             try:
                 response_dict = json.loads(raw_text)
             except json.JSONDecodeError:
                 try:
                     response_dict = ast.literal_eval(raw_text)
                 except Exception:
                     return f"Parsing Error. Raw output: {raw_text}", "", "", "", "", "", "", ""
             vlm_result = response_dict.get("result", {})
+            # 7. Map to Outputs
             description_out = vlm_result.get("description", "")
             environment_out = vlm_result.get("environment", "")
             indoor_outdoor_out = vlm_result.get("indoor_or_outdoor", "")
             )
     except Exception as e:
+        print(f"Error: {e}")
         return f"Error: {e}", "", "", "", "", "", "", ""
 # -------------------------------
 # Gradio UI
 # -------------------------------
+with gr.Blocks(title="Robot Vision MCP") as demo:
     gr.Markdown("## 🎥 Robot Vision Webcam Stream (MCP Client)")
+    # Login Button
     gr.LoginButton()
     with gr.Row():
         webcam_input = gr.Image(
+            label="Webcam Input",
             sources=["webcam"],
             type="pil"
         )
         with gr.Column():
+            description_out = gr.Textbox(label="Description", lines=4)
+            with gr.Row():
+                environment_out = gr.Textbox(label="Environment")
+                indoor_outdoor_out = gr.Textbox(label="In/Out")
+            with gr.Row():
+                human_out = gr.Textbox(label="Humans")
+                hazards_out = gr.Textbox(label="Hazards")
+            # Hidden / Extra fields (optional, add back if needed)
+            lighting_condition_out = gr.Textbox(visible=False)
+            animals_out = gr.Textbox(visible=False)
+            objects_out = gr.Textbox(visible=False)
+    # -------------------------------
+    # STREAM CONFIGURATION (The Important Fix)
+    # -------------------------------
     webcam_input.stream(
         process_webcam_stream_async,
         inputs=[webcam_input],
             objects_out,
             hazards_out
         ],
+        # Update every 3 seconds to give the AI time to think
+        stream_every=3.0,
+        # Wait for the previous request to finish before sending a new one
+        concurrency_limit=1
     )
 if __name__ == "__main__":