OppaAI committed on
Commit
d192cfe
·
verified ·
1 Parent(s): a4f7543

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -123
app.py CHANGED
@@ -9,32 +9,29 @@ import traceback
9
  import threading
10
  from typing import Tuple, Optional, Dict, Any
11
 
12
- # --- Config ---
13
  HF_DATASET_REPO = "OppaAI/Robot_MCP"
14
  HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
15
 
16
- # In-memory processed requests cache to prevent duplicate execution for identical request_id
17
  PROCESSED_REQUESTS: Dict[str, Dict[str, Any]] = {}
18
  PROCESSED_LOCK = threading.Lock()
19
 
20
- # ==========================================
21
  # Robot Tools
22
- # ==========================================
23
  def tool_speak(text: str, emotion: str = "neutral") -> dict:
24
  return {"status": "success", "action_executed": "speak", "payload": {"text": text, "emotion": emotion}}
25
 
26
  def tool_navigate(direction: str, distance_meters: float) -> dict:
27
  if distance_meters > 5.0:
28
- return {"status": "error", "message": "Safety limit: Cannot move more than 5m at once."}
29
  return {"status": "success", "action_executed": "navigate", "payload": {"direction": direction, "distance": distance_meters}}
30
 
31
  def tool_scan_hazard(hazard_type: str, severity: str) -> dict:
32
  timestamp = datetime.now().isoformat()
33
- log_entry = f"[{timestamp}] WARNING: {hazard_type} detected (Severity: {severity})"
34
- return {"status": "warning_logged", "log": log_entry}
35
 
36
  def tool_analyze_human(clothing_color: str, estimated_action: str) -> dict:
37
- return {"status": "human_tracked", "details": f"Human wearing {clothing_color} is likely {estimated_action}."}
38
 
39
  TOOL_REGISTRY = {
40
  "speak": tool_speak,
@@ -43,178 +40,168 @@ TOOL_REGISTRY = {
43
  "analyze_human": tool_analyze_human
44
  }
45
 
46
- # ==========================================
47
- # Helper: Save & Upload
48
- # ==========================================
49
  def save_and_upload_image(image_b64: str, hf_token: str) -> Tuple[Optional[str], Optional[str], Optional[str], int]:
50
  try:
51
  image_bytes = base64.b64decode(image_b64)
52
  size_bytes = len(image_bytes)
53
- if size_bytes < 10:
54
- raise ValueError("Decoded image is too small or invalid base64")
55
 
56
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
57
- local_tmp_path = f"/tmp/robot_img_{timestamp}.jpg"
58
- with open(local_tmp_path, "wb") as f:
59
  f.write(image_bytes)
60
 
 
 
61
  filename = f"robot_{timestamp}.jpg"
62
- path_in_repo = filename
63
 
64
  upload_file(
65
- path_or_fileobj=local_tmp_path,
66
- path_in_repo=path_in_repo,
67
  repo_id=HF_DATASET_REPO,
68
  token=hf_token,
69
  repo_type="dataset"
70
  )
 
71
 
72
- hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
73
- return local_tmp_path, hf_image_url, path_in_repo, size_bytes
74
 
75
  except Exception as e:
76
  traceback.print_exc()
77
  return None, None, None, 0
78
 
79
- # ==========================================
80
- # JSON parsing helper
81
- # ==========================================
 
82
  def safe_parse_json_from_text(text: str) -> Optional[dict]:
83
  if not text:
84
  return None
85
- t = text.strip()
86
- if t.startswith("```") and "```" in t[3:]:
87
- t = t.strip("`")
88
- start = t.find("{")
89
- end = t.rfind("}")
90
- if start >= 0 and end > start:
91
- candidate = t[start:end+1]
92
- try:
93
- return json.loads(candidate)
94
- except Exception:
95
- try:
96
- return json.loads(t)
97
- except Exception:
98
- return None
99
- else:
100
- try:
101
- return json.loads(t)
102
- except Exception:
103
- return None
104
 
105
- # ==========================================
106
- # Tool executor
107
- # ==========================================
108
  def validate_and_call_tool(tool_name: str, tool_args: dict):
109
  if not tool_name:
110
- return {"error": "No tool_name provided by VLM."}
111
  if tool_name not in TOOL_REGISTRY:
112
- return {"error": f"Tool '{tool_name}' not found in registry."}
113
  try:
114
  return TOOL_REGISTRY[tool_name](**tool_args)
115
- except TypeError as e:
116
- return {"error": f"Tool call argument mismatch: {str(e)}"}
117
  except Exception as e:
118
  traceback.print_exc()
119
- return {"error": f"Tool execution failed: {str(e)}"}
120
 
121
- # ==========================================
122
- # Main logic
123
- # ==========================================
 
124
  def process_and_describe(payload):
125
- # If payload is str, try to parse it
 
126
  if isinstance(payload, str):
127
  try:
128
  payload = json.loads(payload)
129
  except Exception as e:
 
130
  return {"error": f"Invalid JSON string: {str(e)}"}
131
 
132
- vlm_text = ""
133
- tool_result = None
134
- action_data = {}
135
 
136
  try:
137
  hf_token = payload.get("hf_token")
138
  if not hf_token:
139
- return {"error": "HF token not provided in payload."}
140
-
141
- request_id = payload.get("request_id") or payload.get("robot_id") or None
142
- if request_id:
143
- with PROCESSED_LOCK:
144
- if request_id in PROCESSED_REQUESTS:
145
- return PROCESSED_REQUESTS[request_id]
146
 
147
  robot_id = payload.get("robot_id", "unknown")
148
  image_b64 = payload.get("image_b64")
149
  if not image_b64:
150
- return {"error": "No image provided in payload."}
151
 
 
152
  local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
153
  if not hf_url:
154
- return {"error": "Image upload failed.", "debug": {"local_tmp_path": local_tmp_path, "size_bytes": size_bytes}}
155
-
156
- # Build system prompt
157
- tools_desc = json.dumps({
158
- "speak": {"text": "string", "emotion": "string"},
159
- "navigate": {"direction": "forward/left/right", "distance_meters": "float"},
160
- "scan_hazard": {"hazard_type": "string", "severity": "low/medium/high"},
161
- "analyze_human": {"clothing_color": "string", "estimated_action": "string"}
162
- }, indent=2)
163
-
164
- system_prompt = f"""
165
- You are a Robot Control AI. Analyze the image and choose ONE tool to execute.
166
-
167
- AVAILABLE TOOLS (JSON Schema):
168
- {tools_desc}
169
-
170
- INSTRUCTIONS:
171
- 1. Describe what you see briefly.
172
- 2. Select the single most appropriate tool and provide arguments matching the schema.
173
-
174
- RESPONSE FORMAT (Strict JSON):
175
- {{
176
- "description": "Brief visual description",
177
- "tool_name": "name_of_tool",
178
- "arguments": {{ ...args matching schema... }}
179
- }}
180
  """
181
 
182
- messages_payload = [
183
  {"role": "system", "content": system_prompt},
184
  {"role": "user", "content": [
185
- {"type": "text", "text": "Analyze this camera feed and decide on an action."},
186
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
 
187
  ]}
188
  ]
189
 
190
- hf_client = InferenceClient(token=hf_token)
191
- chat_completion = hf_client.chat.completions.create(
 
 
192
  model=HF_VLM_MODEL,
193
- messages=messages_payload,
194
  max_tokens=300,
195
  temperature=0.1
196
  )
197
 
198
- vlm_text = chat_completion.choices[0].message.content.strip()
199
- parsed = safe_parse_json_from_text(vlm_text)
 
 
 
 
 
 
 
200
  if parsed is None:
201
- result = {
 
202
  "status": "model_no_json",
203
  "robot_id": robot_id,
204
  "image_url": hf_url,
205
- "vlm_raw": vlm_text,
206
- "message": "VLM did not return valid JSON following the required schema."
207
  }
208
- if request_id:
209
- with PROCESSED_LOCK:
210
- PROCESSED_REQUESTS[request_id] = result
211
- return result
212
 
213
- action_data = parsed
214
- tool_name = action_data.get("tool_name")
215
- tool_args = action_data.get("arguments", {}) or {}
216
- if not isinstance(tool_args, dict):
217
- tool_args = {}
218
 
219
  tool_result = validate_and_call_tool(tool_name, tool_args)
220
 
@@ -223,31 +210,31 @@ RESPONSE FORMAT (Strict JSON):
223
  "robot_id": robot_id,
224
  "image_url": hf_url,
225
  "image_bytes": size_bytes,
226
- "analysis": action_data.get("description"),
227
  "chosen_tool": tool_name,
228
  "tool_arguments": tool_args,
229
  "tool_execution_result": tool_result,
230
- "vlm_raw": vlm_text
231
  }
232
 
233
- if request_id:
234
- with PROCESSED_LOCK:
235
- PROCESSED_REQUESTS[request_id] = result
236
-
237
  return result
238
 
239
  except Exception as e:
240
  traceback.print_exc()
241
- return {"error": f"Server error: {str(e)}", "vlm_raw": vlm_text}
 
242
 
243
- # --- Gradio Interface ---
 
 
244
  iface = gr.Interface(
245
  fn=process_and_describe,
246
- inputs=gr.JSON(label="Input (JSON with 'image_b64', 'hf_token', optional 'request_id')"),
247
- outputs=gr.JSON(label="Robot Command Output"),
248
  api_name="predict",
249
- allow_flagging="never",
250
- live=False
251
  )
252
 
253
  if __name__ == "__main__":
 
9
  import threading
10
  from typing import Tuple, Optional, Dict, Any
11
 
 
# Hugging Face dataset repository that uploaded images are pushed to.
HF_DATASET_REPO = "OppaAI/Robot_MCP"
# Vision-language model id passed to the chat completion call.
HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"

# Result cache keyed by string id, plus the lock guarding it.
# NOTE(review): nothing in the visible post-commit code reads or writes these
# two names any more - confirm against the full file before removing them.
PROCESSED_REQUESTS: Dict[str, Dict[str, Any]] = {}
PROCESSED_LOCK = threading.Lock()
17
 
18
+ # --------------------
19
  # Robot Tools
20
+ # --------------------
def tool_speak(text: str, emotion: str = "neutral") -> dict:
    """Build the result record for a "speak" action.

    Args:
        text: Text echoed into the payload.
        emotion: Emotion label echoed into the payload; defaults to "neutral".

    Returns:
        A dict with "status", "action_executed" and a "payload" holding both
        arguments.
    """
    spoken = {"text": text, "emotion": emotion}
    return {"status": "success", "action_executed": "speak", "payload": spoken}
23
 
def tool_navigate(direction: str, distance_meters: float) -> dict:
    """Build the result record for a "navigate" action, enforcing a 5 m cap.

    Args:
        direction: Direction label, echoed into the payload unchanged.
        distance_meters: Requested travel distance.

    Returns:
        ``{"status": "error", "message": "Safety limit exceeded"}`` when the
        magnitude of the distance is above 5.0 or the value is NaN; otherwise
        a success dict whose payload echoes the direction and the distance.
    """
    # Phrased as "not (|d| <= 5)" instead of "d > 5" so that NaN, infinities
    # and large negative distances are rejected too rather than slipping
    # past the cap.
    if not (abs(distance_meters) <= 5.0):
        return {"status": "error", "message": "Safety limit exceeded"}
    return {
        "status": "success",
        "action_executed": "navigate",
        "payload": {"direction": direction, "distance": distance_meters},
    }
28
 
def tool_scan_hazard(hazard_type: str, severity: str) -> dict:
    """Build a timestamped hazard log record.

    Args:
        hazard_type: Hazard label placed in the log line.
        severity: Severity label placed in the log line.

    Returns:
        A dict with "status" set to "warning_logged" and a "log" string that
        starts with the current local time in ISO format.
    """
    now_iso = datetime.now().isoformat()
    line = f"[{now_iso}] HAZARD: {hazard_type} (Severity: {severity})"
    return {"status": "warning_logged", "log": line}
 
32
 
def tool_analyze_human(clothing_color: str, estimated_action: str) -> dict:
    """Build a "human_tracked" record describing a person.

    Args:
        clothing_color: Clothing colour placed in the details sentence.
        estimated_action: Action phrase placed in the details sentence.

    Returns:
        A dict with "status" set to "human_tracked" and a "details" sentence.
    """
    sentence = f"Human wearing {clothing_color} is {estimated_action}"
    return dict(status="human_tracked", details=sentence)
35
 
36
  TOOL_REGISTRY = {
37
  "speak": tool_speak,
 
40
  "analyze_human": tool_analyze_human
41
  }
42
 
43
+ # --------------------
44
+ # Save + Upload
45
+ # --------------------
def save_and_upload_image(image_b64: str, hf_token: str) -> Tuple[Optional[str], Optional[str], Optional[str], int]:
    """Decode a base64 image, write it under /tmp and upload it to the dataset repo.

    Args:
        image_b64: Base64 text of the image.
        hf_token: Token forwarded to ``upload_file``.

    Returns:
        ``(local_path, url, filename, size_bytes)`` on success, or
        ``(None, None, None, 0)`` if any step raised (the traceback is
        printed, the exception is not propagated).
    """
    try:
        raw = base64.b64decode(image_b64)
        size_bytes = len(raw)
        print("[debug] decoded image bytes:", size_bytes)

        # One microsecond-resolution stamp names both the temp file and the
        # file inside the repo.
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        local_path = f"/tmp/robot_img_{stamp}.jpg"
        filename = f"robot_{stamp}.jpg"

        with open(local_path, "wb") as fh:
            fh.write(raw)
        print("[debug] wrote local tmp file:", local_path)

        upload_file(
            path_or_fileobj=local_path,
            path_in_repo=filename,
            repo_id=HF_DATASET_REPO,
            token=hf_token,
            repo_type="dataset",
        )
        print("[debug] upload successful:", filename)

        url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{filename}"
    except Exception:
        traceback.print_exc()
        return None, None, None, 0
    return local_path, url, filename, size_bytes
76
 
77
+
78
+ # --------------------
79
+ # JSON Parse Helper
80
+ # --------------------
def safe_parse_json_from_text(text: str) -> Optional[dict]:
    """Extract a JSON object from model output text.

    The text is first parsed as-is. If that does not yield an object, the
    span from the first "{" to the last "}" is tried, which skips Markdown
    code fences and any prose surrounding the object.

    Args:
        text: Raw text returned by the model.

    Returns:
        The parsed object as a dict, or None when the text is empty, is not
        valid JSON, or is valid JSON of another type (list, number, string).
    """
    if not text:
        return None

    candidates = [text]
    start = text.find("{")
    end = text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested input.
            continue
        # Callers use dict methods on the result, so valid JSON that is not
        # an object is treated the same as "no JSON found".
        if isinstance(parsed, dict):
            return parsed
    return None
102
+
103
 
104
+ # --------------------
105
+ # Tool validation + exec
106
+ # --------------------
def validate_and_call_tool(tool_name: str, tool_args: dict):
    """Look up a tool in TOOL_REGISTRY and call it with keyword arguments.

    Args:
        tool_name: Registry key of the tool to run.
        tool_args: Keyword arguments for the tool.

    Returns:
        The tool's return value, or ``{"error": ...}`` when the name is
        missing or unknown, the arguments are not a dict, or the tool raised
        (the traceback is printed in that last case).
    """
    if not tool_name:
        return {"error": "Missing tool_name"}
    # Both values originate from parsed model output. A non-string name such
    # as a list is unhashable and would make the registry lookup itself
    # raise, so it is reported as an unknown tool instead.
    if not isinstance(tool_name, str) or tool_name not in TOOL_REGISTRY:
        return {"error": f"Unknown tool '{tool_name}'"}
    if not isinstance(tool_args, dict):
        return {"error": "Tool error: arguments must be a JSON object"}
    try:
        return TOOL_REGISTRY[tool_name](**tool_args)
    except Exception as e:
        traceback.print_exc()
        return {"error": f"Tool error: {str(e)}"}
117
 
118
+
119
+ # --------------------
120
+ # Main Function
121
+ # --------------------
122
  def process_and_describe(payload):
123
+
124
+ # If string → parse JSON
125
  if isinstance(payload, str):
126
  try:
127
  payload = json.loads(payload)
128
  except Exception as e:
129
+ print("[error] invalid JSON from client:", payload)
130
  return {"error": f"Invalid JSON string: {str(e)}"}
131
 
132
+ print("\n================ NEW REQUEST ================")
133
+ print("[debug] Incoming payload:", payload)
 
134
 
135
  try:
136
  hf_token = payload.get("hf_token")
137
  if not hf_token:
138
+ return {"error": "hf_token missing"}
 
 
 
 
 
 
139
 
140
  robot_id = payload.get("robot_id", "unknown")
141
  image_b64 = payload.get("image_b64")
142
  if not image_b64:
143
+ return {"error": "image_b64 missing"}
144
 
145
+ # Save & Upload
146
  local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
147
  if not hf_url:
148
+ print("[error] Image upload failed.")
149
+ return {"error": "Image upload failed"}
150
+
151
+ print("[debug] HF image URL:", hf_url)
152
+
153
+ # Build prompt
154
+ system_prompt = """
155
+ Respond in STRICT JSON:
156
+ {
157
+ "description":"short visual description",
158
+ "tool_name":"name",
159
+ "arguments": { ... }
160
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  """
162
 
163
+ messages = [
164
  {"role": "system", "content": system_prompt},
165
  {"role": "user", "content": [
166
+ {"type": "text", "text": "Analyze image and select one tool"},
167
+ {"type": "image_url",
168
+ "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
169
  ]}
170
  ]
171
 
172
+ print("[debug] Calling VLM model...")
173
+
174
+ client = InferenceClient(token=hf_token)
175
+ response = client.chat.completions.create(
176
  model=HF_VLM_MODEL,
177
+ messages=messages,
178
  max_tokens=300,
179
  temperature=0.1
180
  )
181
 
182
+ vlm_output = response.choices[0].message.content.strip()
183
+
184
+ # 🔥 PRINT VLM RAW OUTPUT (as you requested)
185
+ print("\n------ VLM RAW OUTPUT ------")
186
+ print(vlm_output)
187
+ print("------ END VLM RAW ------\n")
188
+
189
+ parsed = safe_parse_json_from_text(vlm_output)
190
+
191
  if parsed is None:
192
+ print("[error] VLM did NOT return valid JSON")
193
+ return {
194
  "status": "model_no_json",
195
  "robot_id": robot_id,
196
  "image_url": hf_url,
197
+ "vlm_raw": vlm_output,
198
+ "message": "VLM did not output valid JSON"
199
  }
 
 
 
 
200
 
201
+ tool_name = parsed.get("tool_name")
202
+ tool_args = parsed.get("arguments") or {}
203
+
204
+ print("[debug] Parsed JSON:", parsed)
 
205
 
206
  tool_result = validate_and_call_tool(tool_name, tool_args)
207
 
 
210
  "robot_id": robot_id,
211
  "image_url": hf_url,
212
  "image_bytes": size_bytes,
213
+ "analysis": parsed.get("description"),
214
  "chosen_tool": tool_name,
215
  "tool_arguments": tool_args,
216
  "tool_execution_result": tool_result,
217
+ "vlm_raw": vlm_output
218
  }
219
 
220
+ print("[debug] Final result:", result)
221
+ print("============================================\n")
 
 
222
  return result
223
 
224
  except Exception as e:
225
  traceback.print_exc()
226
+ return {"error": f"Server exception: {str(e)}"}
227
+
228
 
229
+ # --------------------
230
+ # Gradio
231
+ # --------------------
# Gradio app: one JSON-in / JSON-out endpoint around process_and_describe,
# registered under the API name "predict" with flagging turned off.
iface = gr.Interface(
    fn=process_and_describe,
    api_name="predict",
    allow_flagging="never",
    inputs=gr.JSON(label="Input JSON"),
    outputs=gr.JSON(label="Output JSON"),
)
239
 
240
  if __name__ == "__main__":