Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,17 +7,19 @@ from datetime import datetime
|
|
| 7 |
import traceback
|
| 8 |
from typing import Optional, Dict, Any
|
| 9 |
import asyncio
|
| 10 |
-
from fastmcp import Client
|
| 11 |
|
| 12 |
# --- Configuration ---
|
| 13 |
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "OppaAI/Robot_MCP")
|
| 14 |
HF_VLM_MODEL = os.environ.get("HF_VLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
|
| 15 |
REMOTE_MCP_URL = os.environ.get("REMOTE_MCP_URL", "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/sse")
|
| 16 |
|
|
|
|
|
|
|
| 17 |
# -----------------------------------------------------
|
| 18 |
# Save and upload image to HF
|
| 19 |
# -----------------------------------------------------
|
| 20 |
-
def
|
| 21 |
try:
|
| 22 |
image_bytes = base64.b64decode(image_b64)
|
| 23 |
size_bytes = len(image_bytes)
|
|
@@ -70,25 +72,11 @@ def safe_parse_json_from_text(text: str) -> Optional[Dict[str, Any]]:
|
|
| 70 |
return None
|
| 71 |
return None
|
| 72 |
|
| 73 |
-
# -----------------------------------------------------
|
| 74 |
-
# Call remote MCP tool asynchronously
|
| 75 |
-
# -----------------------------------------------------
|
| 76 |
-
async def call_remote_tool(tool_name: str, **kwargs):
|
| 77 |
-
async with Client(REMOTE_MCP_URL) as client:
|
| 78 |
-
result = await client.call_tool(tool_name, **kwargs)
|
| 79 |
-
return result
|
| 80 |
-
|
| 81 |
-
def validate_and_call_tool(tool_name: str, tool_args: dict) -> Dict[str, Any]:
|
| 82 |
-
try:
|
| 83 |
-
return asyncio.run(call_remote_tool(tool_name, **tool_args))
|
| 84 |
-
except Exception as e:
|
| 85 |
-
traceback.print_exc()
|
| 86 |
-
return {"error": f"Remote tool execution error: {str(e)}"}
|
| 87 |
-
|
| 88 |
# -----------------------------------------------------
|
| 89 |
# Main pipeline: image → VLM → remote tool
|
| 90 |
# -----------------------------------------------------
|
| 91 |
-
|
|
|
|
| 92 |
if isinstance(payload, str):
|
| 93 |
try:
|
| 94 |
payload = json.loads(payload)
|
|
@@ -105,7 +93,7 @@ def process_and_describe(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 105 |
return {"error": "image_b64 missing"}
|
| 106 |
|
| 107 |
# Save + Upload
|
| 108 |
-
_, hf_url, _, size_bytes =
|
| 109 |
if not hf_url:
|
| 110 |
return {"error": "Image upload failed"}
|
| 111 |
|
|
@@ -113,24 +101,19 @@ def process_and_describe(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 113 |
system_prompt = f"""
|
| 114 |
Respond in STRICT JSON ONLY.
|
| 115 |
Rules:
|
| 116 |
-
|
| 117 |
-
2. Decide ONE MCP tool to call from:
|
| 118 |
-
- chat_with_human
|
| 119 |
-
3. If a human is gesturing with open hand, then set "tool_name": "chat_with_human".
|
| 120 |
-
4. Otherwise, set "tool_name": "" and "arguments": {{}}
|
| 121 |
-
|
| 122 |
Output format:
|
| 123 |
{{
|
| 124 |
"description": "...",
|
| 125 |
-
"
|
| 126 |
-
"
|
| 127 |
}}
|
| 128 |
"""
|
| 129 |
|
| 130 |
messages = [
|
| 131 |
{"role": "system", "content": system_prompt},
|
| 132 |
{"role": "user", "content": [
|
| 133 |
-
{"type": "text", "text": "Analyze the image and
|
| 134 |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
|
| 135 |
]}
|
| 136 |
]
|
|
@@ -151,21 +134,11 @@ Output format:
|
|
| 151 |
if parsed is None:
|
| 152 |
return {"status": "model_no_json", "robot_id": robot_id, "vlm_raw": vlm_output, "message": "VLM returned invalid JSON"}
|
| 153 |
|
| 154 |
-
# Call the MCP tool directly if VLM chooses one
|
| 155 |
-
tool_name = parsed.get("tool_name")
|
| 156 |
-
tool_args = parsed.get("arguments") or {}
|
| 157 |
-
tool_result = None
|
| 158 |
-
if tool_name:
|
| 159 |
-
tool_result = validate_and_call_tool(tool_name, tool_args)
|
| 160 |
-
|
| 161 |
return {
|
| 162 |
"status": "success",
|
| 163 |
"robot_id": robot_id,
|
| 164 |
"file_size_bytes": size_bytes,
|
| 165 |
"vlm_description": parsed.get("description"),
|
| 166 |
-
"chosen_tool": tool_name,
|
| 167 |
-
"tool_arguments": tool_args,
|
| 168 |
-
"tool_execution_result": tool_result,
|
| 169 |
"vlm_raw": vlm_output
|
| 170 |
}
|
| 171 |
|
|
|
|
| 7 |
import traceback
|
| 8 |
from typing import Optional, Dict, Any
|
| 9 |
import asyncio
|
| 10 |
+
from fastmcp import Client, FastMCP
|
| 11 |
|
| 12 |
# --- Configuration ---
|
| 13 |
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "OppaAI/Robot_MCP")
|
| 14 |
HF_VLM_MODEL = os.environ.get("HF_VLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
|
| 15 |
REMOTE_MCP_URL = os.environ.get("REMOTE_MCP_URL", "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/sse")
|
| 16 |
|
| 17 |
+
mcp = FastMCP("Robot_MCP_Server")
|
| 18 |
+
|
| 19 |
# -----------------------------------------------------
|
| 20 |
# Save and upload image to HF
|
| 21 |
# -----------------------------------------------------
|
| 22 |
+
def upload_image(image_b64: str, hf_token: str):
|
| 23 |
try:
|
| 24 |
image_bytes = base64.b64decode(image_b64)
|
| 25 |
size_bytes = len(image_bytes)
|
|
|
|
| 72 |
return None
|
| 73 |
return None
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
# -----------------------------------------------------
|
| 76 |
# Main pipeline: image → VLM description (remote tool call removed)
|
| 77 |
# -----------------------------------------------------
|
| 78 |
+
@mcp.tool()
|
| 79 |
+
def robot_watch(payload: Dict[str, Any]) -> Dict[str, Any]:
|
| 80 |
if isinstance(payload, str):
|
| 81 |
try:
|
| 82 |
payload = json.loads(payload)
|
|
|
|
| 93 |
return {"error": "image_b64 missing"}
|
| 94 |
|
| 95 |
# Save + Upload
|
| 96 |
+
_, hf_url, _, size_bytes = upload_image(image_b64, hf_token)
|
| 97 |
if not hf_url:
|
| 98 |
return {"error": "Image upload failed"}
|
| 99 |
|
|
|
|
| 101 |
system_prompt = f"""
|
| 102 |
Respond in STRICT JSON ONLY.
|
| 103 |
Rules:
|
| 104 |
+
Provide a long detail description of what you see
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
Output format:
|
| 106 |
{{
|
| 107 |
"description": "...",
|
| 108 |
+
"human": brief description of humans if any (eg. a man with glasses)
|
| 109 |
+
"environment": category of the environment (eg. room)
|
| 110 |
}}
|
| 111 |
"""
|
| 112 |
|
| 113 |
messages = [
|
| 114 |
{"role": "system", "content": system_prompt},
|
| 115 |
{"role": "user", "content": [
|
| 116 |
+
{"type": "text", "text": "Analyze the image and provide the description."},
|
| 117 |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
|
| 118 |
]}
|
| 119 |
]
|
|
|
|
| 134 |
if parsed is None:
|
| 135 |
return {"status": "model_no_json", "robot_id": robot_id, "vlm_raw": vlm_output, "message": "VLM returned invalid JSON"}
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
return {
|
| 138 |
"status": "success",
|
| 139 |
"robot_id": robot_id,
|
| 140 |
"file_size_bytes": size_bytes,
|
| 141 |
"vlm_description": parsed.get("description"),
|
|
|
|
|
|
|
|
|
|
| 142 |
"vlm_raw": vlm_output
|
| 143 |
}
|
| 144 |
|