Spaces:

OppaAI
/

Robot_MCP_Client

Sleeping

App Files Community

OppaAI committed on Nov 24, 2025

Commit

306ab5e

verified ·

1 Parent(s): f745b5c

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -30

app.py CHANGED Viewed

@@ -5,11 +5,12 @@ from fastmcp import Client
 from fastmcp.client import StreamableHttpTransport
 import asyncio
 import ast
 # -------------------------------
 # MCP server info
 # -------------------------------
-ROBOT_ID = "Robot_MCP_Client"  # Local client identifier
 MCP_SERVER_URL = "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/"
 SERVER_NAME = "Robot_MCP_Server"
 TOOL_NAME = "Robot_MCP_Server_robot_watch"
@@ -20,44 +21,54 @@ TOOL_NAME = "Robot_MCP_Server_robot_watch"
 HTTP_TRANSPORT = StreamableHttpTransport(url=MCP_SERVER_URL)
 MCP_CLIENT = Client(transport=HTTP_TRANSPORT, name=SERVER_NAME)
 # -------------------------------
 # Async function using user's HF token
 # -------------------------------
-async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None):
     """
     Send webcam image to MCP server using user's HF token and process the response.
     """
     if oauth_token is None:
-        return "Please log in first.", "", "", "", "", "", "", ""
     if image is None:
         return "", "", "", "", "", "", "", ""
-    # Convert image to Base64
-    buffered = io.BytesIO()
-    image.save(buffered, format="JPEG")
-    b64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
-    # Payload with user token
-    payload = {
-        "hf_token_input": oauth_token.token,
-        "robot_id_input": ROBOT_ID,
-        "image_b64_input": b64_img
-    }
     try:
         async with MCP_CLIENT:
             response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)
             if response.is_error:
                 error_text = response.content.text if response.content else "Unknown error"
                 raise Exception(f"MCP Tool Error: {error_text}")
             raw_text = response.content.text
-            response_dict = ast.literal_eval(raw_text)
             vlm_result = response_dict.get("result", {})
-            # Extract all fields
             description_out = vlm_result.get("description", "")
             environment_out = vlm_result.get("environment", "")
             indoor_outdoor_out = vlm_result.get("indoor_or_outdoor", "")
@@ -67,7 +78,7 @@ async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None):
             objects_list = vlm_result.get("objects", [])
             hazards_out = vlm_result.get("hazards", "")
-            # Convert lists to strings
             objects_str = ", ".join(objects_list) if isinstance(objects_list, list) else str(objects_list)
             return (
@@ -83,8 +94,6 @@ async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None):
     except Exception as e:
         print(f"Error calling MCP API: {e}")
-        import traceback
-        traceback.print_exc()
         return f"Error: {e}", "", "", "", "", "", "", ""
@@ -92,11 +101,20 @@ async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None):
 # Gradio UI
 # -------------------------------
 with gr.Blocks() as demo:
-    # Hugging Face OAuth login button
-    gr.LoginButton()
     gr.Markdown("## 🎥 Robot Vision Webcam Stream (MCP Client)")
     with gr.Row():
-        webcam_input = gr.Image(label="Captured from Web-Cam", sources=["upload", "webcam"], type="pil")
         with gr.Column():
             description_out = gr.Textbox(label="Description", lines=5)
             environment_out = gr.Textbox(label="Environment", lines=3)
@@ -107,13 +125,14 @@ with gr.Blocks() as demo:
             objects_out = gr.Textbox(label="Objects Detected", lines=2)
             hazards_out = gr.Textbox(label="Hazards Identified", lines=2)
-    # Only webcam input in inputs; Gradio automatically injects oauth_token
     webcam_input.stream(
         process_webcam_stream_async,
-        inputs=[
-            webcam_input,
-            gr.OAuthToken()
-        ],
         outputs=[
             description_out,
             environment_out,
@@ -128,4 +147,4 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    demo.launch()

 from fastmcp.client import StreamableHttpTransport
 import asyncio
 import ast
+import json
 # -------------------------------
 # MCP server info
 # -------------------------------
+ROBOT_ID = "Robot_MCP_Client"
 MCP_SERVER_URL = "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/"
 SERVER_NAME = "Robot_MCP_Server"
 TOOL_NAME = "Robot_MCP_Server_robot_watch"
 HTTP_TRANSPORT = StreamableHttpTransport(url=MCP_SERVER_URL)
 MCP_CLIENT = Client(transport=HTTP_TRANSPORT, name=SERVER_NAME)
 # -------------------------------
 # Async function using user's HF token
 # -------------------------------
+async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None = None):
     """
     Send webcam image to MCP server using user's HF token and process the response.
     """
+    # 1. CHECK LOGIN: If no token, ask user to log in
     if oauth_token is None:
+        return "Please log in using the button above.", "", "", "", "", "", "", ""
+    # 2. CHECK IMAGE: If camera hasn't loaded yet
     if image is None:
         return "", "", "", "", "", "", "", ""
     try:
+        # 3. PREPARE IMAGE: Convert to Base64
+        buffered = io.BytesIO()
+        image.save(buffered, format="JPEG")
+        b64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
+        # 4. PREPARE PAYLOAD: Inject the user's token
+        payload = {
+            "hf_token_input": oauth_token.token, # <--- Token used here
+            "robot_id_input": ROBOT_ID,
+            "image_b64_input": b64_img
+        }
+        # 5. CALL MCP SERVER
         async with MCP_CLIENT:
             response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)
             if response.is_error:
                 error_text = response.content.text if response.content else "Unknown error"
                 raise Exception(f"MCP Tool Error: {error_text}")
             raw_text = response.content.text
+            # 6. PARSE RESPONSE (Handle both JSON and Python Dict strings)
+            try:
+                response_dict = json.loads(raw_text)
+            except json.JSONDecodeError:
+                # Fallback if server returns single quotes
+                response_dict = ast.literal_eval(raw_text)
             vlm_result = response_dict.get("result", {})
+            # 7. EXTRACT DATA
             description_out = vlm_result.get("description", "")
             environment_out = vlm_result.get("environment", "")
             indoor_outdoor_out = vlm_result.get("indoor_or_outdoor", "")
             objects_list = vlm_result.get("objects", [])
             hazards_out = vlm_result.get("hazards", "")
+            # Convert list to string
             objects_str = ", ".join(objects_list) if isinstance(objects_list, list) else str(objects_list)
             return (
     except Exception as e:
         print(f"Error calling MCP API: {e}")
         return f"Error: {e}", "", "", "", "", "", "", ""
 # Gradio UI
 # -------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("## 🎥 Robot Vision Webcam Stream (MCP Client)")
+    # The Login Button (Required for oauth_token)
+    gr.LoginButton()
     with gr.Row():
+        # Input: Webcam
+        webcam_input = gr.Image(
+            label="Captured from Web-Cam",
+            sources=["webcam"],
+            type="pil"
+        )
+        # Outputs
         with gr.Column():
             description_out = gr.Textbox(label="Description", lines=5)
             environment_out = gr.Textbox(label="Environment", lines=3)
             objects_out = gr.Textbox(label="Objects Detected", lines=2)
             hazards_out = gr.Textbox(label="Hazards Identified", lines=2)
+    # -------------------------------
+    # Event Trigger
+    # -------------------------------
+    # CRITICAL FIX: Do NOT include gr.OAuthToken() in inputs.
+    # Gradio automatically injects it because it's in the function signature.
     webcam_input.stream(
         process_webcam_stream_async,
+        inputs=[webcam_input],
         outputs=[
             description_out,
             environment_out,
     )
 if __name__ == "__main__":
+    demo.launch()