Spaces:

OppaAI
/

Robot_MCP_Server

Sleeping

App Files Files Community

OppaAI committed on Nov 23, 2025

Commit

54151d7

verified ·

1 Parent(s): 867053d

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -38

app.py CHANGED Viewed

@@ -7,17 +7,19 @@ from datetime import datetime
 import traceback
 from typing import Optional, Dict, Any
 import asyncio
-from fastmcp import Client
 # --- Configuration ---
 HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "OppaAI/Robot_MCP")
 HF_VLM_MODEL = os.environ.get("HF_VLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
 REMOTE_MCP_URL = os.environ.get("REMOTE_MCP_URL", "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/sse")
 # -----------------------------------------------------
 # Save and upload image to HF
 # -----------------------------------------------------
-def save_and_upload_image(image_b64: str, hf_token: str):
     try:
         image_bytes = base64.b64decode(image_b64)
         size_bytes = len(image_bytes)
@@ -70,25 +72,11 @@ def safe_parse_json_from_text(text: str) -> Optional[Dict[str, Any]]:
         return None
     return None
-# -----------------------------------------------------
-# Call remote MCP tool asynchronously
-# -----------------------------------------------------
-async def call_remote_tool(tool_name: str, **kwargs):
-    async with Client(REMOTE_MCP_URL) as client:
-        result = await client.call_tool(tool_name, **kwargs)
-        return result
-def validate_and_call_tool(tool_name: str, tool_args: dict) -> Dict[str, Any]:
-    try:
-        return asyncio.run(call_remote_tool(tool_name, **tool_args))
-    except Exception as e:
-        traceback.print_exc()
-        return {"error": f"Remote tool execution error: {str(e)}"}
 # -----------------------------------------------------
 # Main pipeline: image → VLM → remote tool
 # -----------------------------------------------------
-def process_and_describe(payload: Dict[str, Any]) -> Dict[str, Any]:
     if isinstance(payload, str):
         try:
             payload = json.loads(payload)
@@ -105,7 +93,7 @@ def process_and_describe(payload: Dict[str, Any]) -> Dict[str, Any]:
         return {"error": "image_b64 missing"}
     # Save + Upload
-    _, hf_url, _, size_bytes = save_and_upload_image(image_b64, hf_token)
     if not hf_url:
         return {"error": "Image upload failed"}
@@ -113,24 +101,19 @@ def process_and_describe(payload: Dict[str, Any]) -> Dict[str, Any]:
     system_prompt = f"""
 Respond in STRICT JSON ONLY.
 Rules:
-1. Provide a long detail description of what you see.
-2. Decide ONE MCP tool to call from:
-   - chat_with_human
-3. If a human is gesturing with open hand, then set "tool_name": "chat_with_human".
-4. Otherwise, set "tool_name": "" and "arguments": {{}}
 Output format:
 {{
  "description": "...",
- "tool_name": "chat_with_human",
- "arguments": {{}}
 }}
 """
     messages = [
         {"role": "system", "content": system_prompt},
         {"role": "user", "content": [
-            {"type": "text", "text": "Analyze the image and call the appropriate MCP tool."},
             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
         ]}
     ]
@@ -151,21 +134,11 @@ Output format:
     if parsed is None:
         return {"status": "model_no_json", "robot_id": robot_id, "vlm_raw": vlm_output, "message": "VLM returned invalid JSON"}
-    # Call the MCP tool directly if VLM chooses one
-    tool_name = parsed.get("tool_name")
-    tool_args = parsed.get("arguments") or {}
-    tool_result = None
-    if tool_name:
-        tool_result = validate_and_call_tool(tool_name, tool_args)
     return {
         "status": "success",
         "robot_id": robot_id,
         "file_size_bytes": size_bytes,
         "vlm_description": parsed.get("description"),
-        "chosen_tool": tool_name,
-        "tool_arguments": tool_args,
-        "tool_execution_result": tool_result,
         "vlm_raw": vlm_output
     }

 import traceback
 from typing import Optional, Dict, Any
 import asyncio
+from fastmcp import Client, FastMCP
 # --- Configuration ---
 HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "OppaAI/Robot_MCP")
 HF_VLM_MODEL = os.environ.get("HF_VLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
 REMOTE_MCP_URL = os.environ.get("REMOTE_MCP_URL", "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/sse")
+mcp = FastMCP("Robot_MCP_Server")
 # -----------------------------------------------------
 # Save and upload image to HF
 # -----------------------------------------------------
+def upload_image(image_b64: str, hf_token: str):
     try:
         image_bytes = base64.b64decode(image_b64)
         size_bytes = len(image_bytes)
         return None
     return None
 # -----------------------------------------------------
 # Main pipeline: image → upload → VLM description
 # -----------------------------------------------------
+@mcp.tool()
+def robot_watch(payload: Dict[str, Any]) -> Dict[str, Any]:
     if isinstance(payload, str):
         try:
             payload = json.loads(payload)
         return {"error": "image_b64 missing"}
     # Save + Upload
+    _, hf_url, _, size_bytes = upload_image(image_b64, hf_token)
     if not hf_url:
         return {"error": "Image upload failed"}
     system_prompt = f"""
 Respond in STRICT JSON ONLY.
 Rules:
+Provide a long detail description of what you see
 Output format:
 {{
  "description": "...",
+ "human": brief description of humans if any (eg. a man with glasses)
+ "environment": category of the environment (eg. room)
 }}
 """
     messages = [
         {"role": "system", "content": system_prompt},
         {"role": "user", "content": [
+            {"type": "text", "text": "Analyze the image and provide the description."},
             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
         ]}
     ]
     if parsed is None:
         return {"status": "model_no_json", "robot_id": robot_id, "vlm_raw": vlm_output, "message": "VLM returned invalid JSON"}
     return {
         "status": "success",
         "robot_id": robot_id,
         "file_size_bytes": size_bytes,
         "vlm_description": parsed.get("description"),
         "vlm_raw": vlm_output
     }