Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import os
|
|
| 2 |
import base64
|
| 3 |
import json
|
| 4 |
import gradio as gr
|
| 5 |
-
from huggingface_hub import
|
| 6 |
from datetime import datetime
|
| 7 |
import traceback
|
| 8 |
from typing import Optional, Dict, Any
|
|
@@ -31,7 +31,6 @@ def save_and_upload_image(image_b64: str, hf_token: str):
|
|
| 31 |
|
| 32 |
filename = f"robot_{timestamp}.jpg"
|
| 33 |
|
| 34 |
-
from huggingface_hub import HfApi
|
| 35 |
api = HfApi()
|
| 36 |
api.upload_file(
|
| 37 |
path_or_fileobj=local_path,
|
|
@@ -110,19 +109,28 @@ def process_and_describe(payload: Dict[str, Any]) -> Dict[str, Any]:
|
|
| 110 |
if not hf_url:
|
| 111 |
return {"error": "Image upload failed"}
|
| 112 |
|
| 113 |
-
# VLM system prompt
|
| 114 |
system_prompt = f"""
|
| 115 |
-
Respond in STRICT JSON ONLY
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
{{
|
| 117 |
-
"description": "
|
| 118 |
-
"tool_name": "
|
| 119 |
-
"arguments": {{
|
| 120 |
}}
|
| 121 |
"""
|
|
|
|
| 122 |
messages = [
|
| 123 |
{"role": "system", "content": system_prompt},
|
| 124 |
{"role": "user", "content": [
|
| 125 |
-
{"type": "text", "text": "Analyze the image and
|
| 126 |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
|
| 127 |
]}
|
| 128 |
]
|
|
@@ -143,9 +151,12 @@ Respond in STRICT JSON ONLY:
|
|
| 143 |
if parsed is None:
|
| 144 |
return {"status": "model_no_json", "robot_id": robot_id, "vlm_raw": vlm_output, "message": "VLM returned invalid JSON"}
|
| 145 |
|
|
|
|
| 146 |
tool_name = parsed.get("tool_name")
|
| 147 |
tool_args = parsed.get("arguments") or {}
|
| 148 |
-
tool_result =
|
|
|
|
|
|
|
| 149 |
|
| 150 |
return {
|
| 151 |
"status": "success",
|
|
@@ -177,4 +188,4 @@ if __name__ == "__main__":
|
|
| 177 |
print(f"[Config] HF_VLM_MODEL: {HF_VLM_MODEL}")
|
| 178 |
print(f"[Config] REMOTE_MCP_URL: {REMOTE_MCP_URL}")
|
| 179 |
print("[Gradio] Launching interface...")
|
| 180 |
-
app.launch(
|
|
|
|
| 2 |
import base64
|
| 3 |
import json
|
| 4 |
import gradio as gr
|
| 5 |
+
from huggingface_hub import HfApi, InferenceClient
|
| 6 |
from datetime import datetime
|
| 7 |
import traceback
|
| 8 |
from typing import Optional, Dict, Any
|
|
|
|
| 31 |
|
| 32 |
filename = f"robot_{timestamp}.jpg"
|
| 33 |
|
|
|
|
| 34 |
api = HfApi()
|
| 35 |
api.upload_file(
|
| 36 |
path_or_fileobj=local_path,
|
|
|
|
| 109 |
if not hf_url:
|
| 110 |
return {"error": "Image upload failed"}
|
| 111 |
|
| 112 |
+
# VLM system prompt: decide MCP tool automatically
|
| 113 |
system_prompt = f"""
|
| 114 |
+
Respond in STRICT JSON ONLY.
|
| 115 |
+
Rules:
|
| 116 |
+
1. Provide a short description of what you see.
|
| 117 |
+
2. Decide ONE MCP tool to call from:
|
| 118 |
+
- chat_with_human
|
| 119 |
+
3. If a human is looking directly at the robot and waving, set "tool_name": "chat_with_human".
|
| 120 |
+
4. Otherwise, set "tool_name": null and leave "arguments": {{}}
|
| 121 |
+
|
| 122 |
+
Output format:
|
| 123 |
{{
|
| 124 |
+
"description": "...",
|
| 125 |
+
"tool_name": "chat_with_human | null",
|
| 126 |
+
"arguments": {{}}
|
| 127 |
}}
|
| 128 |
"""
|
| 129 |
+
|
| 130 |
messages = [
|
| 131 |
{"role": "system", "content": system_prompt},
|
| 132 |
{"role": "user", "content": [
|
| 133 |
+
{"type": "text", "text": "Analyze the image and call the appropriate MCP tool."},
|
| 134 |
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
|
| 135 |
]}
|
| 136 |
]
|
|
|
|
| 151 |
if parsed is None:
|
| 152 |
return {"status": "model_no_json", "robot_id": robot_id, "vlm_raw": vlm_output, "message": "VLM returned invalid JSON"}
|
| 153 |
|
| 154 |
+
# Call the MCP tool directly if VLM chooses one
|
| 155 |
tool_name = parsed.get("tool_name")
|
| 156 |
tool_args = parsed.get("arguments") or {}
|
| 157 |
+
tool_result = None
|
| 158 |
+
if tool_name:
|
| 159 |
+
tool_result = validate_and_call_tool(tool_name, tool_args)
|
| 160 |
|
| 161 |
return {
|
| 162 |
"status": "success",
|
|
|
|
| 188 |
print(f"[Config] HF_VLM_MODEL: {HF_VLM_MODEL}")
|
| 189 |
print(f"[Config] REMOTE_MCP_URL: {REMOTE_MCP_URL}")
|
| 190 |
print("[Gradio] Launching interface...")
|
| 191 |
+
app.launch()
|