Spaces:

OppaAI
/

Robot_MCP_Client

Sleeping

File size: 6,088 Bytes

165189d
627d59b
165189d
1fb1e3b
5253b0d
1fb1e3b
3565497
306ab5e
8eaa3a3
165189d
5253b0d
3565497
5253b0d
306ab5e
9ac3c29
 
 
 
 
4decfa0
 
5253b0d
4decfa0
5253b0d
 
 
4decfa0
 
 
3565497
9ac3c29
3565497
9ac3c29
3565497
 
4decfa0
9ac3c29
 
 
 
 
 
 
 
 
 
 
 
 
 
4decfa0
 
 
306ab5e
4decfa0
9ac3c29
79f6e03
 
 
 
 
9ac3c29
 
 
 
 
79f6e03
 
 
9ac3c29
79f6e03
 
 
9ac3c29
79f6e03
 
9ac3c29
306ab5e
 
 
79f6e03
 
 
9ac3c29
79f6e03
306ab5e
4decfa0
3565497
9ac3c29
4decfa0
 
0e3d6b3
 
f745b5c
 
0e3d6b3
f745b5c
3565497
4decfa0
3565497
0e3d6b3
 
 
 
 
 
f745b5c
0e3d6b3
f745b5c
0e3d6b3
3565497
4decfa0
9ac3c29
0e3d6b3
165189d
9cd6aba
5253b0d
 
 
9ac3c29
3565497
df6f581
 
 
 
 
 
 
 
306ab5e
27c0f8e
306ab5e
9ac3c29
306ab5e
 
 
b18ef1e
9ac3c29
 
 
 
 
 
 
 
 
27c0f8e
17f5b16
306ab5e
3c36d2f
 
 
 
 
 
 
3565497
3c36d2f
7caebc5
9ac3c29
27c0f8e
165189d
 
79f6e03

import base64
import io
import gradio as gr
from fastmcp import Client
from fastmcp.client import StreamableHttpTransport
import asyncio
import ast
import json
import os

# -------------------------------
# MCP server info
# -------------------------------
# Endpoint of the MCP server and the name of the tool this client invokes.
MCP_SERVER_URL = "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/"
SERVER_NAME = "Robot_MCP_Server"
TOOL_NAME = "Robot_MCP_Server_robot_watch"

# Identifier sent with every tool call as "robot_id_input".
ROBOT_ID = "Robot_MCP_Client"

# Hugging Face token taken from the environment. When it is absent or empty
# a sentinel string is stored instead; process_webcam_stream_async checks
# for that exact sentinel and refuses to call the server.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    print("Warning: HF_TOKEN not found. API calls may fail.")
    HF_TOKEN = "missing_token_placeholder"

# -------------------------------
# Initialize MCP client globally
# -------------------------------
# One transport/client pair is created at import time and shared by every
# call to the stream handler.
HTTP_TRANSPORT = StreamableHttpTransport(url=MCP_SERVER_URL)
MCP_CLIENT = Client(
    transport=HTTP_TRANSPORT,
    name=SERVER_NAME,
)

# -------------------------------
# Async stream handler: sends one webcam frame to the MCP tool,
# authenticating with the HF_TOKEN read from the environment
# -------------------------------
async def process_webcam_stream_async(image):
    """Send one webcam frame to the MCP vision tool and return its scene fields.

    Args:
        image: The current frame (a PIL image, as the webcam component is
            created with ``type="pil"``), or ``None`` when no frame is
            available.

    Returns:
        An 8-tuple in the order the UI text boxes expect: description,
        environment, indoor/outdoor, lighting condition, human, animals,
        objects (joined with ", ") and hazards. When anything goes wrong the
        first element carries an error message and the other seven are "".
        This function never raises for request/response failures; they are
        printed and reported through the first element instead.
    """
    if image is None:
        return _error_outputs("")

    if HF_TOKEN == "missing_token_placeholder":
        return _error_outputs("Error: HF_TOKEN not set locally.")

    # Top-level boundary of a UI callback: every failure (encoding, network,
    # tool error, malformed payload) is turned into a visible message rather
    # than an exception escaping into the stream loop.
    try:
        payload = {
            "hf_token_input": HF_TOKEN,
            "robot_id_input": ROBOT_ID,
            # Encoding happens inside the try so a bad frame is reported in
            # the UI like any other error.
            "image_b64_input": _encode_jpeg_b64(image),
        }

        async with MCP_CLIENT:
            response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)

        content = getattr(response, "content", None)

        if response.is_error:
            error_msg = "Unknown Error"
            if isinstance(content, list):
                error_msg = " ".join(getattr(item, "text", "") for item in content)
            raise RuntimeError(f"MCP Tool Error: {error_msg}")

        # The result content is a list of content objects; only those that
        # expose a ``text`` attribute contribute to the payload.
        if isinstance(content, list):
            raw_text = "".join(item.text for item in content if hasattr(item, "text"))
        else:
            # Fallback for an unexpected response structure.
            raw_text = str(response)

        parsed = _parse_tool_output(raw_text)
        if not isinstance(parsed, dict):
            # Covers both unparseable text and valid-but-non-object payloads
            # (null, list, bare string), which cannot carry a "result" key.
            return _error_outputs(f"Parsing Error. Raw output: {raw_text}")

        vlm_result = parsed.get("result", {})
        if not isinstance(vlm_result, dict):
            return _error_outputs(f"Unexpected result format: {vlm_result}")

        return (
            vlm_result.get("description", ""),
            vlm_result.get("environment", ""),
            vlm_result.get("indoor_or_outdoor", ""),
            vlm_result.get("lighting_condition", ""),
            vlm_result.get("human", ""),
            vlm_result.get("animals", ""),
            _format_objects(vlm_result.get("objects", [])),
            vlm_result.get("hazards", ""),
        )

    except Exception as e:
        print(f"Error calling MCP API: {e}")
        return _error_outputs(f"Error: {e}")


def _error_outputs(message):
    """Return the 8-field output tuple with *message* first and the rest empty."""
    return (message, "", "", "", "", "", "", "")


def _encode_jpeg_b64(image):
    """Return *image* encoded as JPEG and then Base64, as a text string."""
    # JPEG has no alpha channel or palette support, so frames in such modes
    # would fail to save; normalise everything that is not already RGB.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def _parse_tool_output(raw_text):
    """Parse *raw_text* as JSON, else as a Python literal; None if both fail."""
    try:
        return json.loads(raw_text)
    except json.JSONDecodeError:
        pass
    # Some servers return a Python repr (single quotes) instead of JSON.
    # literal_eval only evaluates literals, never arbitrary code.
    try:
        return ast.literal_eval(raw_text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None


def _format_objects(objects):
    """Render the detected-objects value as a single comma-separated string."""
    if isinstance(objects, list):
        # Entries are stringified so a non-string entry cannot make join()
        # raise and discard the whole result.
        return ", ".join(map(str, objects))
    return str(objects)


# -------------------------------
# Gradio UI
# -------------------------------
with gr.Blocks() as demo:
    # Page title followed by a short explanation of the dashboard.
    gr.Markdown("## 🎥 Robot Vision Webcam Stream (MCP Client)")
    gr.Markdown(
        """
This interface captures a live webcam feed and sends each frame to the MCP Client for analysis.
The system extracts detailed information from the scene — including descriptions, detected objects,
humans, animals, environmental context, lighting conditions, and potential hazards.
Use this dashboard to observe how the robot interprets the world in real time.
        """
    )

    with gr.Row():
        # Webcam capture; frames reach the handler as PIL images.
        webcam_input = gr.Image(
            type="pil",
            sources=["webcam"],
            label="Captured from Web-Cam",
        )

        # One text box per field of the handler's 8-tuple result.
        with gr.Column():
            description_out = gr.Textbox(lines=5, label="Description")
            environment_out = gr.Textbox(lines=3, label="Environment")
            indoor_outdoor_out = gr.Textbox(lines=1, label="Indoor/Outdoor")
            lighting_condition_out = gr.Textbox(lines=1, label="Lighting Condition")
            human_out = gr.Textbox(lines=3, label="Human Detected")
            animals_out = gr.Textbox(lines=2, label="Animals Detected")
            objects_out = gr.Textbox(lines=2, label="Objects Detected")
            hazards_out = gr.Textbox(lines=2, label="Hazards Identified")

    # Wire the webcam stream to the async handler. The outputs list must stay
    # in the same order as the tuple returned by process_webcam_stream_async.
    webcam_input.stream(
        fn=process_webcam_stream_async,
        inputs=[webcam_input],
        outputs=[
            description_out,
            environment_out,
            indoor_outdoor_out,
            lighting_condition_out,
            human_out,
            animals_out,
            objects_out,
            hazards_out,
        ],
        stream_every=1.0,
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(ssr_mode=False)