Spaces:

OppaAI
/

Robot_MCP_Server

Sleeping

App Files Files Community

OppaAI commited on Nov 20, 2025

Commit

024277f

verified ·

1 Parent(s): 08216b8

Update app.py

Browse files

Files changed (1) hide show

app.py +185 -92

app.py CHANGED Viewed

@@ -1,23 +1,26 @@
 import os
 import base64
 import json
 import gradio as gr
 from huggingface_hub import upload_file, InferenceClient
 from datetime import datetime
 # --- Config ---
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
 HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
 # ==========================================
-# 1. DEFINE ROBOT TOOLS
 # ==========================================
 def tool_speak(text: str, emotion: str = "neutral") -> dict:
-    """
-    Command the robot to speak text via TTS.
-    """
-    # In a real scenario, this would send a signal to the robot's speaker driver
     return {
         "status": "success",
         "action_executed": "speak",
@@ -25,12 +28,8 @@ def tool_speak(text: str, emotion: str = "neutral") -> dict:
     }
 def tool_navigate(direction: str, distance_meters: float) -> dict:
-    """
-    Move the robot. Direction options: 'forward', 'backward', 'left', 'right'.
-    """
     if distance_meters > 5.0:
         return {"status": "error", "message": "Safety limit: Cannot move more than 5m at once."}
     return {
         "status": "success",
         "action_executed": "navigate",
@@ -38,28 +37,20 @@ def tool_navigate(direction: str, distance_meters: float) -> dict:
     }
 def tool_scan_hazard(hazard_type: str, severity: str) -> dict:
-    """
-    Log a safety hazard if seen in the image (e.g., 'fire', 'water', 'obstacle').
-    """
     timestamp = datetime.now().isoformat()
     log_entry = f"[{timestamp}] WARNING: {hazard_type} detected (Severity: {severity})"
-    # Here you would write to a log file or trigger an alarm
     return {
         "status": "warning_logged",
         "log": log_entry
     }
 def tool_analyze_human(clothing_color: str, estimated_action: str) -> dict:
-    """
-    Specialized analysis when a human is detected.
-    """
     return {
         "status": "human_tracked",
         "details": f"Human wearing {clothing_color} is likely {estimated_action}."
     }
-# --- Tool Dispatcher ---
-# This maps string names to the actual Python functions
 TOOL_REGISTRY = {
     "speak": tool_speak,
     "navigate": tool_navigate,
@@ -68,20 +59,33 @@ TOOL_REGISTRY = {
 }
 # ==========================================
-# 2. HELPER FUNCTIONS
 # ==========================================
-def save_and_upload_image(image_b64: str, hf_token: str):
     try:
         image_bytes = base64.b64decode(image_b64)
-        local_tmp_path = "/tmp/tmp.jpg"
         with open(local_tmp_path, "wb") as f:
             f.write(image_bytes)
-        # Create unique filename to avoid overwriting
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        path_in_repo = f"images/robot_{timestamp}.jpg"
         upload_file(
             path_or_fileobj=local_tmp_path,
             path_in_repo=path_in_repo,
@@ -91,38 +95,108 @@ def save_and_upload_image(image_b64: str, hf_token: str):
         )
         hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
-        return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)
     except Exception as e:
-        print(f"Upload failed: {e}")
         return None, None, None, 0
 # ==========================================
-# 3. MAIN LOGIC
 # ==========================================
 def process_and_describe(payload: dict):
-    tool_result = None
     vlm_text = ""
     action_data = {}
     try:
         hf_token = payload.get("hf_token")
         if not hf_token:
-            return {"error": "HF token not provided in payload."}
         robot_id = payload.get("robot_id", "unknown")
         image_b64 = payload.get("image_b64")
         if not image_b64:
-            return {"error": "No image provided."}
-        # Upload Image
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
-        # Initialize HF Client
-        hf_client = InferenceClient(token=hf_token)
-        # --- Dynamic System Prompt Construction ---
         tools_desc = json.dumps({
             "speak": {"text": "string", "emotion": "string"},
             "navigate": {"direction": "forward/left/right", "distance_meters": "float"},
@@ -131,26 +205,24 @@ def process_and_describe(payload: dict):
         }, indent=2)
         system_prompt = f"""
-        You are a Robot Control AI. Analyze the image and choose ONE tool to execute.
-        AVAILABLE TOOLS (JSON Schema):
-        {tools_desc}
-        INSTRUCTIONS:
-        1. Describe what you see briefly.
-        2. Select the most appropriate tool based on the visual context.
-           - If you see a person -> use 'analyze_human' OR 'speak'.
-           - If you see a clear path -> use 'navigate'.
-           - If you see fire/mess -> use 'scan_hazard'.
-        RESPONSE FORMAT (Strict JSON):
-        {{
-            "description": "Brief visual description",
-            "tool_name": "name_of_tool",
-            "arguments": {{ ...args matching schema... }}
-        }}
-        """
         messages_payload = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": [
@@ -159,59 +231,80 @@ def process_and_describe(payload: dict):
             ]}
         ]
-        # Call VLM
         chat_completion = hf_client.chat.completions.create(
             model=HF_VLM_MODEL,
             messages=messages_payload,
             max_tokens=300,
-            temperature=0.1 # Low temp for reliable JSON
         )
         vlm_text = chat_completion.choices[0].message.content.strip()
-        # Clean up markdown code blocks if the model adds them (```json ... ```)
-        if vlm_text.startswith("```"):
-            vlm_text = vlm_text.strip("`").replace("json", "").strip()
-        # Parse JSON
-        try:
-            action_data = json.loads(vlm_text)
-            # --- TOOL EXECUTION BLOCK ---
-            tool_name = action_data.get("tool_name")
-            tool_args = action_data.get("arguments", {})
-            if tool_name in TOOL_REGISTRY:
-                # Execute the Python function dynamically
-                print(f"Executing tool: {tool_name} with args {tool_args}")
-                tool_result = TOOL_REGISTRY[tool_name](**tool_args)
-            else:
-                tool_result = {"error": f"Tool '{tool_name}' not found in registry."}
-        except json.JSONDecodeError:
-            action_data = {"description": vlm_text, "tool_name": None}
-            tool_result = {"error": "Model did not return valid JSON."}
-        return {
             "status": "success",
             "robot_id": robot_id,
             "image_url": hf_url,
             "analysis": action_data.get("description"),
-            "chosen_tool": action_data.get("tool_name"),
-            "tool_arguments": action_data.get("arguments"),
-            "tool_execution_result": tool_result
         }
     except Exception as e:
-        return {"error": f"Server error: {str(e)}", "raw_response": vlm_text}
 # --- Gradio Interface ---
-demo = gr.Interface(
     fn=process_and_describe,
-    inputs=gr.JSON(label="Input (JSON with 'image_b64' and 'hf_token')"),
     outputs=gr.JSON(label="Robot Command Output"),
-    api_name="predict"
 )
 if __name__ == "__main__":
-    demo.launch()

+# app.py
 import os
 import base64
 import json
 import gradio as gr
 from huggingface_hub import upload_file, InferenceClient
 from datetime import datetime
+import traceback
+import threading
+from typing import Tuple, Optional, Dict, Any
 # --- Config ---
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
 HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
+# In-memory processed requests cache to prevent duplicate execution for identical request_id
+PROCESSED_REQUESTS: Dict[str, Dict[str, Any]] = {}
+PROCESSED_LOCK = threading.Lock()
 # ==========================================
+# Robot Tools (unchanged semantics)
 # ==========================================
 def tool_speak(text: str, emotion: str = "neutral") -> dict:
     return {
         "status": "success",
         "action_executed": "speak",
     }
 def tool_navigate(direction: str, distance_meters: float) -> dict:
     if distance_meters > 5.0:
         return {"status": "error", "message": "Safety limit: Cannot move more than 5m at once."}
     return {
         "status": "success",
         "action_executed": "navigate",
     }
 def tool_scan_hazard(hazard_type: str, severity: str) -> dict:
     timestamp = datetime.now().isoformat()
     log_entry = f"[{timestamp}] WARNING: {hazard_type} detected (Severity: {severity})"
+    # (in real system: write to file/logging infra)
     return {
         "status": "warning_logged",
         "log": log_entry
     }
 def tool_analyze_human(clothing_color: str, estimated_action: str) -> dict:
     return {
         "status": "human_tracked",
         "details": f"Human wearing {clothing_color} is likely {estimated_action}."
     }
 TOOL_REGISTRY = {
     "speak": tool_speak,
     "navigate": tool_navigate,
 }
 # ==========================================
+# Helper: Save & Upload (robust)
 # ==========================================
+def save_and_upload_image(image_b64: str, hf_token: str) -> Tuple[Optional[str], Optional[str], Optional[str], int]:
+    """
+    Save a base64 image to a uniquely named /tmp file and upload to HF dataset repo.
+    Returns: local_tmp_path, hf_url, path_in_repo, size_bytes
+    """
     try:
+        # decode
         image_bytes = base64.b64decode(image_b64)
+        size_bytes = len(image_bytes)
+        print("[debug] decoded image bytes:", size_bytes)
+        if size_bytes < 10:
+            raise ValueError("Decoded image is too small or invalid base64")
+        # unique tmp filename (avoid collision across workers)
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        local_tmp_path = f"/tmp/robot_img_{timestamp}.jpg"
         with open(local_tmp_path, "wb") as f:
             f.write(image_bytes)
+        print(f"[debug] wrote local tmp file: {local_tmp_path}")
+        # Prepare filename in repo (put at repo root to avoid folder permission issues)
+        filename = f"robot_{timestamp}.jpg"
+        path_in_repo = filename
+        # upload_file might raise. capture exception and show traceback
         upload_file(
             path_or_fileobj=local_tmp_path,
             path_in_repo=path_in_repo,
         )
         hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
+        print("[debug] upload successful:", hf_image_url)
+        return local_tmp_path, hf_image_url, path_in_repo, size_bytes
     except Exception as e:
+        print("[error] save_and_upload_image failed:", e)
+        traceback.print_exc()
         return None, None, None, 0
 # ==========================================
+# Main logic
 # ==========================================
+def safe_parse_json_from_text(text: str) -> Optional[dict]:
+    """
+    Try to extract JSON object from model output.
+    Accepts raw JSON, or a ```json\n{...}``` block, or text with JSON substring.
+    Returns dict or None.
+    """
+    if not text:
+        return None
+    # remove markdown fences
+    t = text.strip()
+    if t.startswith("```") and "```" in t[3:]:
+        # remove outer fences
+        t = t.strip("`")
+    # find first '{' and last '}' to try to extract JSON substring
+    start = t.find("{")
+    end = t.rfind("}")
+    if start >= 0 and end > start:
+        candidate = t[start:end+1]
+        try:
+            return json.loads(candidate)
+        except Exception:
+            # fallback: try the whole text
+            try:
+                return json.loads(t)
+            except Exception:
+                return None
+    else:
+        try:
+            return json.loads(t)
+        except Exception:
+            return None
+def validate_and_call_tool(tool_name: str, tool_args: dict):
+    if not tool_name:
+        return {"error": "No tool_name provided by VLM."}
+    if tool_name not in TOOL_REGISTRY:
+        return {"error": f"Tool '{tool_name}' not found in registry."}
+    # safe-call: ensure dict args only contain acceptable keys for that tool
+    try:
+        result = TOOL_REGISTRY[tool_name](**tool_args)
+        return result
+    except TypeError as e:
+        return {"error": f"Tool call argument mismatch: {str(e)}"}
+    except Exception as e:
+        traceback.print_exc()
+        return {"error": f"Tool execution failed: {str(e)}"}
 def process_and_describe(payload: dict):
+    """
+    payload expects keys:
+      - hf_token (string)
+      - image_b64 (base64 str)
+      - robot_id (optional)
+      - request_id (optional)  # recommended to dedupe retries
+    """
     vlm_text = ""
+    tool_result = None
     action_data = {}
     try:
+        # basic checks
         hf_token = payload.get("hf_token")
         if not hf_token:
+            return {"error": "HF token not provided in payload. Token must have datasets write permission if uploading."}
+        request_id = payload.get("request_id") or payload.get("robot_id") or None
+        if request_id:
+            with PROCESSED_LOCK:
+                if request_id in PROCESSED_REQUESTS:
+                    print("[info] duplicate request_id detected; returning cached result")
+                    return PROCESSED_REQUESTS[request_id]
         robot_id = payload.get("robot_id", "unknown")
         image_b64 = payload.get("image_b64")
         if not image_b64:
+            return {"error": "No image provided in payload."}
+        # Save & upload (only once per invocation)
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
+        if not hf_url:
+            # Upload failed: return error with helpful debug info
+            return {
+                "error": "Image upload failed on server.",
+                "debug": {
+                    "local_tmp_path": local_tmp_path,
+                    "path_in_repo": path_in_repo,
+                    "size_bytes": size_bytes
+                }
+            }
+        # Build system prompt (kept compact)
         tools_desc = json.dumps({
             "speak": {"text": "string", "emotion": "string"},
             "navigate": {"direction": "forward/left/right", "distance_meters": "float"},
         }, indent=2)
         system_prompt = f"""
+You are a Robot Control AI. Analyze the image and choose ONE tool to execute.
+AVAILABLE TOOLS (JSON Schema):
+{tools_desc}
+INSTRUCTIONS:
+1. Describe what you see briefly.
+2. Select the single most appropriate tool and provide arguments matching the schema.
+RESPONSE FORMAT (Strict JSON):
+{{
+  "description": "Brief visual description",
+  "tool_name": "name_of_tool",
+  "arguments": {{ ...args matching schema... }}
+}}
+"""
+        # Build messages payload for VLM - include the uploaded HF URL (some VLMs can fetch it)
         messages_payload = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": [
             ]}
         ]
+        # Instantiate HF Inference client and call chat completion
+        hf_client = InferenceClient(token=hf_token)
+        # NOTE: huggingface InferenceClient usage may vary by version. We use the chat completions create call.
         chat_completion = hf_client.chat.completions.create(
             model=HF_VLM_MODEL,
             messages=messages_payload,
             max_tokens=300,
+            temperature=0.1
         )
         vlm_text = chat_completion.choices[0].message.content.strip()
+        print("[debug] VLM raw output:", vlm_text[:1000])
+        # attempt to parse JSON
+        parsed = safe_parse_json_from_text(vlm_text)
+        if parsed is None:
+            # If the model didn't return JSON, return descriptive fallback but do not execute tools
+            result = {
+                "status": "model_no_json",
+                "robot_id": robot_id,
+                "image_url": hf_url,
+                "vlm_raw": vlm_text,
+                "message": "VLM did not return valid JSON following the required schema."
+            }
+            if request_id:
+                with PROCESSED_LOCK:
+                    PROCESSED_REQUESTS[request_id] = result
+            return result
+        action_data = parsed
+        tool_name = action_data.get("tool_name")
+        tool_args = action_data.get("arguments", {}) or {}
+        # Validate that arguments is a dict
+        if not isinstance(tool_args, dict):
+            tool_args = {}
+        # Execute the tool once and capture result
+        print(f"[info] Executing tool: {tool_name} with args {tool_args}")
+        tool_result = validate_and_call_tool(tool_name, tool_args)
+        result = {
             "status": "success",
             "robot_id": robot_id,
             "image_url": hf_url,
+            "image_bytes": size_bytes,
             "analysis": action_data.get("description"),
+            "chosen_tool": tool_name,
+            "tool_arguments": tool_args,
+            "tool_execution_result": tool_result,
+            "vlm_raw": vlm_text
         }
+        if request_id:
+            with PROCESSED_LOCK:
+                PROCESSED_REQUESTS[request_id] = result
+        return result
     except Exception as e:
+        traceback.print_exc()
+        return {"error": f"Server error: {str(e)}", "vlm_raw": vlm_text}
 # --- Gradio Interface ---
+iface = gr.Interface(
     fn=process_and_describe,
+    inputs=gr.JSON(label="Input (JSON with 'image_b64', 'hf_token', optional 'request_id')"),
     outputs=gr.JSON(label="Robot Command Output"),
+    api_name="predict",
+    allow_flagging="never",
+    live=False
 )
 if __name__ == "__main__":
+    # When deploying to HF Space: set server_name and server_port via env if you need
+    iface.launch()