Spaces:

OppaAI
/

Robot_MCP_Client

Sleeping

App Files Community

OppaAI committed on Nov 24, 2025

Commit

79f6e03

verified ·

1 Parent(s): 306ab5e

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -25

app.py CHANGED Viewed

@@ -28,23 +28,23 @@ async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None =
     """
     Send webcam image to MCP server using user's HF token and process the response.
     """
-    # 1. CHECK LOGIN: If no token, ask user to log in
     if oauth_token is None:
         return "Please log in using the button above.", "", "", "", "", "", "", ""
-    # 2. CHECK IMAGE: If camera hasn't loaded yet
     if image is None:
         return "", "", "", "", "", "", "", ""
     try:
-        # 3. PREPARE IMAGE: Convert to Base64
         buffered = io.BytesIO()
         image.save(buffered, format="JPEG")
         b64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
-        # 4. PREPARE PAYLOAD: Inject the user's token
         payload = {
-            "hf_token_input": oauth_token.token, # <--- Token used here
             "robot_id_input": ROBOT_ID,
             "image_b64_input": b64_img
         }
@@ -54,17 +54,36 @@ async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None =
             response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)
             if response.is_error:
-                error_text = response.content.text if response.content else "Unknown error"
-                raise Exception(f"MCP Tool Error: {error_text}")
-            raw_text = response.content.text
-            # 6. PARSE RESPONSE (Handle both JSON and Python Dict strings)
             try:
                 response_dict = json.loads(raw_text)
             except json.JSONDecodeError:
-                # Fallback if server returns single quotes
-                response_dict = ast.literal_eval(raw_text)
             vlm_result = response_dict.get("result", {})
@@ -78,7 +97,6 @@ async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None =
             objects_list = vlm_result.get("objects", [])
             hazards_out = vlm_result.get("hazards", "")
-            # Convert list to string
             objects_str = ", ".join(objects_list) if isinstance(objects_list, list) else str(objects_list)
             return (
@@ -102,19 +120,14 @@ async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None =
 # -------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("## 🎥 Robot Vision Webcam Stream (MCP Client)")
-    # The Login Button (Required for oauth_token)
     gr.LoginButton()
     with gr.Row():
-        # Input: Webcam
         webcam_input = gr.Image(
             label="Captured from Web-Cam",
             sources=["webcam"],
             type="pil"
         )
-        # Outputs
         with gr.Column():
             description_out = gr.Textbox(label="Description", lines=5)
             environment_out = gr.Textbox(label="Environment", lines=3)
@@ -125,11 +138,6 @@ with gr.Blocks() as demo:
             objects_out = gr.Textbox(label="Objects Detected", lines=2)
             hazards_out = gr.Textbox(label="Hazards Identified", lines=2)
-    # -------------------------------
-    # Event Trigger
-    # -------------------------------
-    # CRITICAL FIX: Do NOT include gr.OAuthToken() in inputs.
-    # Gradio automatically injects it because it's in the function signature.
     webcam_input.stream(
         process_webcam_stream_async,
         inputs=[webcam_input],
@@ -147,4 +155,4 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    demo.launch()

     """
     Send webcam image to MCP server using user's HF token and process the response.
     """
+    # 1. CHECK LOGIN
     if oauth_token is None:
         return "Please log in using the button above.", "", "", "", "", "", "", ""
+    # 2. CHECK IMAGE
     if image is None:
         return "", "", "", "", "", "", "", ""
     try:
+        # 3. PREPARE IMAGE
         buffered = io.BytesIO()
         image.save(buffered, format="JPEG")
         b64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
+        # 4. PREPARE PAYLOAD
         payload = {
+            "hf_token_input": oauth_token.token,
             "robot_id_input": ROBOT_ID,
             "image_b64_input": b64_img
         }
             response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)
             if response.is_error:
+                # Handle error content safely
+                error_msg = "Unknown Error"
+                if hasattr(response, 'content') and isinstance(response.content, list):
+                    error_msg = " ".join([getattr(item, 'text', '') for item in response.content])
+                raise Exception(f"MCP Tool Error: {error_msg}")
+            # ---------------------------------------------------------
+            # FIX: Handle List Content
+            # The 'content' is a list of objects (e.g., TextContent).
+            # We iterate through the list and join the text parts.
+            # ---------------------------------------------------------
+            raw_text = ""
+            if hasattr(response, 'content') and isinstance(response.content, list):
+                for item in response.content:
+                    # Check if the item has a 'text' attribute
+                    if hasattr(item, 'text'):
+                        raw_text += item.text
+            else:
+                # Fallback for unexpected structure
+                raw_text = str(response)
+            # 6. PARSE RESPONSE
             try:
                 response_dict = json.loads(raw_text)
             except json.JSONDecodeError:
+                try:
+                    response_dict = ast.literal_eval(raw_text)
+                except Exception:
+                    # If parsing fails completely, return the raw text in description
+                    return f"Parsing Error. Raw output: {raw_text}", "", "", "", "", "", "", ""
             vlm_result = response_dict.get("result", {})
             objects_list = vlm_result.get("objects", [])
             hazards_out = vlm_result.get("hazards", "")
             objects_str = ", ".join(objects_list) if isinstance(objects_list, list) else str(objects_list)
             return (
 # -------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("## 🎥 Robot Vision Webcam Stream (MCP Client)")
     gr.LoginButton()
     with gr.Row():
         webcam_input = gr.Image(
             label="Captured from Web-Cam",
             sources=["webcam"],
             type="pil"
         )
         with gr.Column():
             description_out = gr.Textbox(label="Description", lines=5)
             environment_out = gr.Textbox(label="Environment", lines=3)
             objects_out = gr.Textbox(label="Objects Detected", lines=2)
             hazards_out = gr.Textbox(label="Hazards Identified", lines=2)
     webcam_input.stream(
         process_webcam_stream_async,
         inputs=[webcam_input],
     )
 if __name__ == "__main__":
+    demo.launch(ssr_mode=False)