Spaces:

OppaAI
/

Robot_MCP_Server

Sleeping

App Files Files Community

OppaAI committed on Nov 21, 2025

Commit

d192cfe

verified ·

1 Parent(s): a4f7543

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -123

app.py CHANGED Viewed

@@ -9,32 +9,29 @@ import traceback
 import threading
 from typing import Tuple, Optional, Dict, Any
-# --- Config ---
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
 HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
-# In-memory processed requests cache to prevent duplicate execution for identical request_id
 PROCESSED_REQUESTS: Dict[str, Dict[str, Any]] = {}
 PROCESSED_LOCK = threading.Lock()
-# ==========================================
 # Robot Tools
-# ==========================================
 def tool_speak(text: str, emotion: str = "neutral") -> dict:
     return {"status": "success", "action_executed": "speak", "payload": {"text": text, "emotion": emotion}}
 def tool_navigate(direction: str, distance_meters: float) -> dict:
     if distance_meters > 5.0:
-        return {"status": "error", "message": "Safety limit: Cannot move more than 5m at once."}
     return {"status": "success", "action_executed": "navigate", "payload": {"direction": direction, "distance": distance_meters}}
 def tool_scan_hazard(hazard_type: str, severity: str) -> dict:
     timestamp = datetime.now().isoformat()
-    log_entry = f"[{timestamp}] WARNING: {hazard_type} detected (Severity: {severity})"
-    return {"status": "warning_logged", "log": log_entry}
 def tool_analyze_human(clothing_color: str, estimated_action: str) -> dict:
-    return {"status": "human_tracked", "details": f"Human wearing {clothing_color} is likely {estimated_action}."}
 TOOL_REGISTRY = {
     "speak": tool_speak,
@@ -43,178 +40,168 @@ TOOL_REGISTRY = {
     "analyze_human": tool_analyze_human
 }
-# ==========================================
-# Helper: Save & Upload
-# ==========================================
 def save_and_upload_image(image_b64: str, hf_token: str) -> Tuple[Optional[str], Optional[str], Optional[str], int]:
     try:
         image_bytes = base64.b64decode(image_b64)
         size_bytes = len(image_bytes)
-        if size_bytes < 10:
-            raise ValueError("Decoded image is too small or invalid base64")
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
-        local_tmp_path = f"/tmp/robot_img_{timestamp}.jpg"
-        with open(local_tmp_path, "wb") as f:
             f.write(image_bytes)
         filename = f"robot_{timestamp}.jpg"
-        path_in_repo = filename
         upload_file(
-            path_or_fileobj=local_tmp_path,
-            path_in_repo=path_in_repo,
             repo_id=HF_DATASET_REPO,
             token=hf_token,
             repo_type="dataset"
         )
-        hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
-        return local_tmp_path, hf_image_url, path_in_repo, size_bytes
     except Exception as e:
         traceback.print_exc()
         return None, None, None, 0
-# ==========================================
-# JSON parsing helper
-# ==========================================
 def safe_parse_json_from_text(text: str) -> Optional[dict]:
     if not text:
         return None
-    t = text.strip()
-    if t.startswith("```") and "```" in t[3:]:
-        t = t.strip("`")
-    start = t.find("{")
-    end = t.rfind("}")
-    if start >= 0 and end > start:
-        candidate = t[start:end+1]
-        try:
-            return json.loads(candidate)
-        except Exception:
-            try:
-                return json.loads(t)
-            except Exception:
-                return None
-    else:
-        try:
-            return json.loads(t)
-        except Exception:
-            return None
-# ==========================================
-# Tool executor
-# ==========================================
 def validate_and_call_tool(tool_name: str, tool_args: dict):
     if not tool_name:
-        return {"error": "No tool_name provided by VLM."}
     if tool_name not in TOOL_REGISTRY:
-        return {"error": f"Tool '{tool_name}' not found in registry."}
     try:
         return TOOL_REGISTRY[tool_name](**tool_args)
-    except TypeError as e:
-        return {"error": f"Tool call argument mismatch: {str(e)}"}
     except Exception as e:
         traceback.print_exc()
-        return {"error": f"Tool execution failed: {str(e)}"}
-# ==========================================
-# Main logic
-# ==========================================
 def process_and_describe(payload):
-    # If payload is str, try to parse it
     if isinstance(payload, str):
         try:
             payload = json.loads(payload)
         except Exception as e:
             return {"error": f"Invalid JSON string: {str(e)}"}
-    vlm_text = ""
-    tool_result = None
-    action_data = {}
     try:
         hf_token = payload.get("hf_token")
         if not hf_token:
-            return {"error": "HF token not provided in payload."}
-        request_id = payload.get("request_id") or payload.get("robot_id") or None
-        if request_id:
-            with PROCESSED_LOCK:
-                if request_id in PROCESSED_REQUESTS:
-                    return PROCESSED_REQUESTS[request_id]
         robot_id = payload.get("robot_id", "unknown")
         image_b64 = payload.get("image_b64")
         if not image_b64:
-            return {"error": "No image provided in payload."}
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
         if not hf_url:
-            return {"error": "Image upload failed.", "debug": {"local_tmp_path": local_tmp_path, "size_bytes": size_bytes}}
-        # Build system prompt
-        tools_desc = json.dumps({
-            "speak": {"text": "string", "emotion": "string"},
-            "navigate": {"direction": "forward/left/right", "distance_meters": "float"},
-            "scan_hazard": {"hazard_type": "string", "severity": "low/medium/high"},
-            "analyze_human": {"clothing_color": "string", "estimated_action": "string"}
-        }, indent=2)
-        system_prompt = f"""
-You are a Robot Control AI. Analyze the image and choose ONE tool to execute.
-AVAILABLE TOOLS (JSON Schema):
-{tools_desc}
-INSTRUCTIONS:
-1. Describe what you see briefly.
-2. Select the single most appropriate tool and provide arguments matching the schema.
-RESPONSE FORMAT (Strict JSON):
-{{
-  "description": "Brief visual description",
-  "tool_name": "name_of_tool",
-  "arguments": {{ ...args matching schema... }}
-}}
 """
-        messages_payload = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": [
-                {"type": "text", "text": "Analyze this camera feed and decide on an action."},
-                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
             ]}
         ]
-        hf_client = InferenceClient(token=hf_token)
-        chat_completion = hf_client.chat.completions.create(
             model=HF_VLM_MODEL,
-            messages=messages_payload,
             max_tokens=300,
             temperature=0.1
         )
-        vlm_text = chat_completion.choices[0].message.content.strip()
-        parsed = safe_parse_json_from_text(vlm_text)
         if parsed is None:
-            result = {
                 "status": "model_no_json",
                 "robot_id": robot_id,
                 "image_url": hf_url,
-                "vlm_raw": vlm_text,
-                "message": "VLM did not return valid JSON following the required schema."
             }
-            if request_id:
-                with PROCESSED_LOCK:
-                    PROCESSED_REQUESTS[request_id] = result
-            return result
-        action_data = parsed
-        tool_name = action_data.get("tool_name")
-        tool_args = action_data.get("arguments", {}) or {}
-        if not isinstance(tool_args, dict):
-            tool_args = {}
         tool_result = validate_and_call_tool(tool_name, tool_args)
@@ -223,31 +210,31 @@ RESPONSE FORMAT (Strict JSON):
             "robot_id": robot_id,
             "image_url": hf_url,
             "image_bytes": size_bytes,
-            "analysis": action_data.get("description"),
             "chosen_tool": tool_name,
             "tool_arguments": tool_args,
             "tool_execution_result": tool_result,
-            "vlm_raw": vlm_text
         }
-        if request_id:
-            with PROCESSED_LOCK:
-                PROCESSED_REQUESTS[request_id] = result
         return result
     except Exception as e:
         traceback.print_exc()
-        return {"error": f"Server error: {str(e)}", "vlm_raw": vlm_text}
-# --- Gradio Interface ---
 iface = gr.Interface(
     fn=process_and_describe,
-    inputs=gr.JSON(label="Input (JSON with 'image_b64', 'hf_token', optional 'request_id')"),
-    outputs=gr.JSON(label="Robot Command Output"),
     api_name="predict",
-    allow_flagging="never",
-    live=False
 )
 if __name__ == "__main__":

 import threading
 from typing import Tuple, Optional, Dict, Any
 # Hugging Face dataset repo that save_and_upload_image uploads camera frames to.
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
 # Vision-language model name passed to the chat-completion call.
 HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
 # NOTE(review): no code visible in this view reads or writes the two names
 # below; confirm whether they are still needed.
 PROCESSED_REQUESTS: Dict[str, Dict[str, Any]] = {}
 PROCESSED_LOCK = threading.Lock()
+# --------------------
 # Robot Tools
+# --------------------
 def tool_speak(text: str, emotion: str = "neutral") -> dict:
     """Build the result record for a "speak" action.

     Args:
         text: What the robot should say.
         emotion: Emotion label attached to the utterance.

     Returns:
         A success record whose payload echoes ``text`` and ``emotion``.
     """
     utterance = {"text": text, "emotion": emotion}
     return {
         "status": "success",
         "action_executed": "speak",
         "payload": utterance,
     }
 def tool_navigate(direction: str, distance_meters: float) -> dict:
     """Build the result record for a "navigate" action, enforcing a 5 m cap.

     Args:
         direction: Direction to move in.
         distance_meters: Distance to travel. The cap applies to its
             magnitude, so a negative value cannot bypass it.

     Returns:
         A success record echoing ``direction`` and ``distance_meters``, or an
         error record when the distance exceeds the 5 m limit or is NaN.
     """
     # Written as "not (... <= limit)" on purpose: every comparison with NaN
     # is False, so this form rejects NaN as well as out-of-range values.
     if not abs(distance_meters) <= 5.0:
         return {"status": "error", "message": "Safety limit exceeded"}
     return {
         "status": "success",
         "action_executed": "navigate",
         "payload": {"direction": direction, "distance": distance_meters},
     }
 def tool_scan_hazard(hazard_type: str, severity: str) -> dict:
     """Build a timestamped hazard log record.

     Args:
         hazard_type: Kind of hazard that was observed.
         severity: Severity label for the hazard.

     Returns:
         A record with status ``"warning_logged"`` and a ``log`` line that
         starts with the current local time in ISO format.
     """
     observed_at = datetime.now().isoformat()
     log_line = f"[{observed_at}] HAZARD: {hazard_type} (Severity: {severity})"
     return {"status": "warning_logged", "log": log_line}
 def tool_analyze_human(clothing_color: str, estimated_action: str) -> dict:
     """Build the tracking record for an observed person.

     Args:
         clothing_color: Colour of the person's clothing.
         estimated_action: What the person appears to be doing.

     Returns:
         A record with status ``"human_tracked"`` and a one-sentence summary.
     """
     summary = f"Human wearing {clothing_color} is {estimated_action}"
     return {"status": "human_tracked", "details": summary}
 TOOL_REGISTRY = {
     "speak": tool_speak,
     "analyze_human": tool_analyze_human
 }
+# --------------------
+# Save + Upload
+# --------------------
 def save_and_upload_image(image_b64: str, hf_token: str) -> Tuple[Optional[str], Optional[str], Optional[str], int]:
     """Decode a base64 image, save it under /tmp, and upload it to the dataset repo.

     Args:
         image_b64: Base64 text of the image. Whitespace inside the text
             (for example line-wrapped base64) is ignored.
         hf_token: Token forwarded to ``upload_file``.

     Returns:
         ``(local_path, url, filename, size_bytes)`` on success. On any
         failure (invalid or empty base64, file write error, upload error)
         the traceback is printed and ``(None, None, None, 0)`` is returned;
         nothing is raised to the caller.
     """
     try:
         # Decode strictly: without validate=True, b64decode silently drops
         # characters outside the base64 alphabet and "succeeds" on garbage.
         # Whitespace is removed first so wrapped base64 is still accepted.
         if isinstance(image_b64, str):
             image_b64 = "".join(image_b64.split())
         image_bytes = base64.b64decode(image_b64, validate=True)
         size_bytes = len(image_bytes)
         print("[debug] decoded image bytes:", size_bytes)
         if size_bytes == 0:
             # Refuse to write and upload an empty file.
             raise ValueError("Decoded image is empty")

         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
         local_path = f"/tmp/robot_img_{timestamp}.jpg"
         with open(local_path, "wb") as f:
             f.write(image_bytes)
         print("[debug] wrote local tmp file:", local_path)

         # The temp file is left in place because its path is returned.
         filename = f"robot_{timestamp}.jpg"
         upload_file(
             path_or_fileobj=local_path,
             path_in_repo=filename,
             repo_id=HF_DATASET_REPO,
             token=hf_token,
             repo_type="dataset"
         )
         print("[debug] upload successful:", filename)

         url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{filename}"
         return local_path, url, filename, size_bytes
     except Exception:
         # Deliberate catch-all at this boundary: callers rely on the
         # all-None tuple rather than an exception to detect failure.
         traceback.print_exc()
         return None, None, None, 0
+# --------------------
+# JSON Parse Helper
+# --------------------
 def safe_parse_json_from_text(text: str) -> Optional[dict]:
     """Extract a JSON object from model output, tolerating surrounding text.

     The whole text is tried first. If that does not yield an object, the
     span from the first ``{`` to the last ``}`` is tried, which covers
     Markdown code fences and prose wrapped around the JSON.

     Args:
         text: Raw text that may contain a JSON object.

     Returns:
         The parsed ``dict``, or ``None`` when ``text`` is empty, is not a
         string, contains no parseable JSON, or the JSON is not an object
         (for example a list, number, or bare string).
     """
     if not text or not isinstance(text, str):
         return None

     candidates = [text]
     start = text.find("{")
     end = text.rfind("}")
     if 0 <= start < end:
         candidates.append(text[start:end + 1])

     for candidate in candidates:
         try:
             value = json.loads(candidate)
         except (ValueError, RecursionError):
             # JSONDecodeError is a ValueError; RecursionError guards against
             # pathologically nested input.
             continue
         # Callers use .get() on the result, so only an object is acceptable.
         if isinstance(value, dict):
             return value
     return None
+# --------------------
+# Tool validation + exec
+# --------------------
 def validate_and_call_tool(tool_name: str, tool_args: dict):
     """Look up ``tool_name`` in ``TOOL_REGISTRY`` and call it with ``tool_args``.

     Args:
         tool_name: Registry key of the tool to run.
         tool_args: Keyword arguments for the tool.

     Returns:
         The tool's return value, or ``{"error": ...}`` when the name is
         missing or unknown, the arguments are not a dict, or the tool raises.
     """
     if not tool_name:
         return {"error": "Missing tool_name"}
     # A non-string name (e.g. a list from model-generated JSON) can be
     # unhashable, which would make the registry lookup itself raise.
     if not isinstance(tool_name, str) or tool_name not in TOOL_REGISTRY:
         return {"error": f"Unknown tool '{tool_name}'"}
     if not isinstance(tool_args, dict):
         return {"error": "Tool arguments must be a JSON object"}
     try:
         return TOOL_REGISTRY[tool_name](**tool_args)
     except Exception as e:
         # Tools are arbitrary callables; report any failure as an error record.
         traceback.print_exc()
         return {"error": f"Tool error: {str(e)}"}
+# --------------------
+# Main Function
+# --------------------
 def process_and_describe(payload):
+    # If string → parse JSON
     if isinstance(payload, str):
         try:
             payload = json.loads(payload)
         except Exception as e:
+            print("[error] invalid JSON from client:", payload)
             return {"error": f"Invalid JSON string: {str(e)}"}
+    print("\n================ NEW REQUEST ================")
+    print("[debug] Incoming payload:", payload)
     try:
         hf_token = payload.get("hf_token")
         if not hf_token:
+            return {"error": "hf_token missing"}
         robot_id = payload.get("robot_id", "unknown")
         image_b64 = payload.get("image_b64")
         if not image_b64:
+            return {"error": "image_b64 missing"}
+        # Save & Upload
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
         if not hf_url:
+            print("[error] Image upload failed.")
+            return {"error": "Image upload failed"}
+        print("[debug] HF image URL:", hf_url)
+        # Build prompt
+        system_prompt = """
+Respond in STRICT JSON:
+{
+ "description":"short visual description",
+ "tool_name":"name",
+ "arguments": { ... }
+}
 """
+        messages = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": [
+                {"type": "text", "text": "Analyze image and select one tool"},
+                {"type": "image_url",
+                 "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
             ]}
         ]
+        print("[debug] Calling VLM model...")
+        client = InferenceClient(token=hf_token)
+        response = client.chat.completions.create(
             model=HF_VLM_MODEL,
+            messages=messages,
             max_tokens=300,
             temperature=0.1
         )
+        vlm_output = response.choices[0].message.content.strip()
+        # 🔥 Print the raw VLM output (as requested)
+        print("\n------ VLM RAW OUTPUT ------")
+        print(vlm_output)
+        print("------ END VLM RAW ------\n")
+        parsed = safe_parse_json_from_text(vlm_output)
         if parsed is None:
+            print("[error] VLM did NOT return valid JSON")
+            return {
                 "status": "model_no_json",
                 "robot_id": robot_id,
                 "image_url": hf_url,
+                "vlm_raw": vlm_output,
+                "message": "VLM did not output valid JSON"
             }
+        tool_name = parsed.get("tool_name")
+        tool_args = parsed.get("arguments") or {}
+        print("[debug] Parsed JSON:", parsed)
         tool_result = validate_and_call_tool(tool_name, tool_args)
             "robot_id": robot_id,
             "image_url": hf_url,
             "image_bytes": size_bytes,
+            "analysis": parsed.get("description"),
             "chosen_tool": tool_name,
             "tool_arguments": tool_args,
             "tool_execution_result": tool_result,
+            "vlm_raw": vlm_output
         }
+        print("[debug] Final result:", result)
+        print("============================================\n")
         return result
     except Exception as e:
         traceback.print_exc()
+        return {"error": f"Server exception: {str(e)}"}
+# --------------------
+# Gradio
+# --------------------
 # Interface wiring: one JSON input is handed to process_and_describe and its
 # return value is shown as JSON. The endpoint is named "predict" and
 # flagging is turned off.
 iface = gr.Interface(
     fn=process_and_describe,
+    inputs=gr.JSON(label="Input JSON"),
+    outputs=gr.JSON(label="Output JSON"),
     api_name="predict",
+    allow_flagging="never"
 )
 if __name__ == "__main__":