OppaAI committed on
Commit
024277f
·
verified ·
1 Parent(s): 08216b8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -92
app.py CHANGED
@@ -1,23 +1,26 @@
 
1
  import os
2
  import base64
3
  import json
4
  import gradio as gr
5
  from huggingface_hub import upload_file, InferenceClient
6
  from datetime import datetime
 
 
 
7
 
8
  # --- Config ---
9
  HF_DATASET_REPO = "OppaAI/Robot_MCP"
10
  HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
11
 
 
 
 
 
12
  # ==========================================
13
- # 1. DEFINE ROBOT TOOLS
14
  # ==========================================
15
-
16
  def tool_speak(text: str, emotion: str = "neutral") -> dict:
17
- """
18
- Command the robot to speak text via TTS.
19
- """
20
- # In a real scenario, this would send a signal to the robot's speaker driver
21
  return {
22
  "status": "success",
23
  "action_executed": "speak",
@@ -25,12 +28,8 @@ def tool_speak(text: str, emotion: str = "neutral") -> dict:
25
  }
26
 
27
  def tool_navigate(direction: str, distance_meters: float) -> dict:
28
- """
29
- Move the robot. Direction options: 'forward', 'backward', 'left', 'right'.
30
- """
31
  if distance_meters > 5.0:
32
  return {"status": "error", "message": "Safety limit: Cannot move more than 5m at once."}
33
-
34
  return {
35
  "status": "success",
36
  "action_executed": "navigate",
@@ -38,28 +37,20 @@ def tool_navigate(direction: str, distance_meters: float) -> dict:
38
  }
39
 
40
  def tool_scan_hazard(hazard_type: str, severity: str) -> dict:
41
- """
42
- Log a safety hazard if seen in the image (e.g., 'fire', 'water', 'obstacle').
43
- """
44
  timestamp = datetime.now().isoformat()
45
  log_entry = f"[{timestamp}] WARNING: {hazard_type} detected (Severity: {severity})"
46
- # Here you would write to a log file or trigger an alarm
47
  return {
48
  "status": "warning_logged",
49
  "log": log_entry
50
  }
51
 
52
  def tool_analyze_human(clothing_color: str, estimated_action: str) -> dict:
53
- """
54
- Specialized analysis when a human is detected.
55
- """
56
  return {
57
  "status": "human_tracked",
58
  "details": f"Human wearing {clothing_color} is likely {estimated_action}."
59
  }
60
 
61
- # --- Tool Dispatcher ---
62
- # This maps string names to the actual Python functions
63
  TOOL_REGISTRY = {
64
  "speak": tool_speak,
65
  "navigate": tool_navigate,
@@ -68,20 +59,33 @@ TOOL_REGISTRY = {
68
  }
69
 
70
  # ==========================================
71
- # 2. HELPER FUNCTIONS
72
  # ==========================================
73
-
74
- def save_and_upload_image(image_b64: str, hf_token: str):
 
 
 
75
  try:
 
76
  image_bytes = base64.b64decode(image_b64)
77
- local_tmp_path = "/tmp/tmp.jpg"
 
 
 
 
 
 
 
78
  with open(local_tmp_path, "wb") as f:
79
  f.write(image_bytes)
 
80
 
81
- # Create unique filename to avoid overwriting
82
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
83
- path_in_repo = f"images/robot_{timestamp}.jpg"
84
-
 
85
  upload_file(
86
  path_or_fileobj=local_tmp_path,
87
  path_in_repo=path_in_repo,
@@ -91,38 +95,108 @@ def save_and_upload_image(image_b64: str, hf_token: str):
91
  )
92
 
93
  hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
94
- return local_tmp_path, hf_image_url, path_in_repo, len(image_bytes)
 
 
95
  except Exception as e:
96
- print(f"Upload failed: {e}")
 
97
  return None, None, None, 0
98
 
99
  # ==========================================
100
- # 3. MAIN LOGIC
101
  # ==========================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  def process_and_describe(payload: dict):
104
- tool_result = None
 
 
 
 
 
 
105
  vlm_text = ""
 
106
  action_data = {}
107
 
108
  try:
 
109
  hf_token = payload.get("hf_token")
110
  if not hf_token:
111
- return {"error": "HF token not provided in payload."}
 
 
 
 
 
 
 
112
 
113
  robot_id = payload.get("robot_id", "unknown")
114
  image_b64 = payload.get("image_b64")
115
-
116
  if not image_b64:
117
- return {"error": "No image provided."}
118
 
119
- # Upload Image
120
  local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
 
 
 
 
 
 
 
 
 
 
121
 
122
- # Initialize HF Client
123
- hf_client = InferenceClient(token=hf_token)
124
-
125
- # --- Dynamic System Prompt Construction ---
126
  tools_desc = json.dumps({
127
  "speak": {"text": "string", "emotion": "string"},
128
  "navigate": {"direction": "forward/left/right", "distance_meters": "float"},
@@ -131,26 +205,24 @@ def process_and_describe(payload: dict):
131
  }, indent=2)
132
 
133
  system_prompt = f"""
134
- You are a Robot Control AI. Analyze the image and choose ONE tool to execute.
135
-
136
- AVAILABLE TOOLS (JSON Schema):
137
- {tools_desc}
138
-
139
- INSTRUCTIONS:
140
- 1. Describe what you see briefly.
141
- 2. Select the most appropriate tool based on the visual context.
142
- - If you see a person -> use 'analyze_human' OR 'speak'.
143
- - If you see a clear path -> use 'navigate'.
144
- - If you see fire/mess -> use 'scan_hazard'.
145
-
146
- RESPONSE FORMAT (Strict JSON):
147
- {{
148
- "description": "Brief visual description",
149
- "tool_name": "name_of_tool",
150
- "arguments": {{ ...args matching schema... }}
151
- }}
152
- """
153
 
 
 
 
 
 
 
 
 
 
154
  messages_payload = [
155
  {"role": "system", "content": system_prompt},
156
  {"role": "user", "content": [
@@ -159,59 +231,80 @@ def process_and_describe(payload: dict):
159
  ]}
160
  ]
161
 
162
- # Call VLM
 
 
 
163
  chat_completion = hf_client.chat.completions.create(
164
  model=HF_VLM_MODEL,
165
  messages=messages_payload,
166
  max_tokens=300,
167
- temperature=0.1 # Low temp for reliable JSON
168
  )
169
 
170
  vlm_text = chat_completion.choices[0].message.content.strip()
171
-
172
- # Clean up markdown code blocks if the model adds them (```json ... ```)
173
- if vlm_text.startswith("```"):
174
- vlm_text = vlm_text.strip("`").replace("json", "").strip()
175
 
176
- # Parse JSON
177
- try:
178
- action_data = json.loads(vlm_text)
179
-
180
- # --- TOOL EXECUTION BLOCK ---
181
- tool_name = action_data.get("tool_name")
182
- tool_args = action_data.get("arguments", {})
183
-
184
- if tool_name in TOOL_REGISTRY:
185
- # Execute the Python function dynamically
186
- print(f"Executing tool: {tool_name} with args {tool_args}")
187
- tool_result = TOOL_REGISTRY[tool_name](**tool_args)
188
- else:
189
- tool_result = {"error": f"Tool '{tool_name}' not found in registry."}
190
-
191
- except json.JSONDecodeError:
192
- action_data = {"description": vlm_text, "tool_name": None}
193
- tool_result = {"error": "Model did not return valid JSON."}
194
-
195
- return {
 
 
 
 
 
 
 
 
 
196
  "status": "success",
197
  "robot_id": robot_id,
198
  "image_url": hf_url,
 
199
  "analysis": action_data.get("description"),
200
- "chosen_tool": action_data.get("tool_name"),
201
- "tool_arguments": action_data.get("arguments"),
202
- "tool_execution_result": tool_result
 
203
  }
204
 
 
 
 
 
 
 
205
  except Exception as e:
206
- return {"error": f"Server error: {str(e)}", "raw_response": vlm_text}
 
207
 
208
  # --- Gradio Interface ---
209
- demo = gr.Interface(
210
  fn=process_and_describe,
211
- inputs=gr.JSON(label="Input (JSON with 'image_b64' and 'hf_token')"),
212
  outputs=gr.JSON(label="Robot Command Output"),
213
- api_name="predict"
 
 
214
  )
215
 
216
  if __name__ == "__main__":
217
- demo.launch()
 
 
1
+ # app.py
2
  import os
3
  import base64
4
  import json
5
  import gradio as gr
6
  from huggingface_hub import upload_file, InferenceClient
7
  from datetime import datetime
8
+ import traceback
9
+ import threading
10
+ from typing import Tuple, Optional, Dict, Any
11
 
12
  # --- Config ---
13
  HF_DATASET_REPO = "OppaAI/Robot_MCP"
14
  HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
15
 
16
+ # In-memory processed requests cache to prevent duplicate execution for identical request_id
17
+ PROCESSED_REQUESTS: Dict[str, Dict[str, Any]] = {}
18
+ PROCESSED_LOCK = threading.Lock()
19
+
20
  # ==========================================
21
+ # Robot Tools (unchanged semantics)
22
  # ==========================================
 
23
  def tool_speak(text: str, emotion: str = "neutral") -> dict:
 
 
 
 
24
  return {
25
  "status": "success",
26
  "action_executed": "speak",
 
28
  }
29
 
30
  def tool_navigate(direction: str, distance_meters: float) -> dict:
 
 
 
31
  if distance_meters > 5.0:
32
  return {"status": "error", "message": "Safety limit: Cannot move more than 5m at once."}
 
33
  return {
34
  "status": "success",
35
  "action_executed": "navigate",
 
37
  }
38
 
39
  def tool_scan_hazard(hazard_type: str, severity: str) -> dict:
 
 
 
40
  timestamp = datetime.now().isoformat()
41
  log_entry = f"[{timestamp}] WARNING: {hazard_type} detected (Severity: {severity})"
42
+ # (in real system: write to file/logging infra)
43
  return {
44
  "status": "warning_logged",
45
  "log": log_entry
46
  }
47
 
48
  def tool_analyze_human(clothing_color: str, estimated_action: str) -> dict:
 
 
 
49
  return {
50
  "status": "human_tracked",
51
  "details": f"Human wearing {clothing_color} is likely {estimated_action}."
52
  }
53
 
 
 
54
  TOOL_REGISTRY = {
55
  "speak": tool_speak,
56
  "navigate": tool_navigate,
 
59
  }
60
 
61
  # ==========================================
62
+ # Helper: Save & Upload (robust)
63
  # ==========================================
64
+ def save_and_upload_image(image_b64: str, hf_token: str) -> Tuple[Optional[str], Optional[str], Optional[str], int]:
65
+ """
66
+ Save a base64 image to a uniquely named /tmp file and upload to HF dataset repo.
67
+ Returns: local_tmp_path, hf_url, path_in_repo, size_bytes
68
+ """
69
  try:
70
+ # decode
71
  image_bytes = base64.b64decode(image_b64)
72
+ size_bytes = len(image_bytes)
73
+ print("[debug] decoded image bytes:", size_bytes)
74
+ if size_bytes < 10:
75
+ raise ValueError("Decoded image is too small or invalid base64")
76
+
77
+ # unique tmp filename (avoid collision across workers)
78
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
79
+ local_tmp_path = f"/tmp/robot_img_{timestamp}.jpg"
80
  with open(local_tmp_path, "wb") as f:
81
  f.write(image_bytes)
82
+ print(f"[debug] wrote local tmp file: {local_tmp_path}")
83
 
84
+ # Prepare filename in repo (put at repo root to avoid folder permission issues)
85
+ filename = f"robot_{timestamp}.jpg"
86
+ path_in_repo = filename
87
+
88
+ # upload_file might raise. capture exception and show traceback
89
  upload_file(
90
  path_or_fileobj=local_tmp_path,
91
  path_in_repo=path_in_repo,
 
95
  )
96
 
97
  hf_image_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
98
+ print("[debug] upload successful:", hf_image_url)
99
+ return local_tmp_path, hf_image_url, path_in_repo, size_bytes
100
+
101
  except Exception as e:
102
+ print("[error] save_and_upload_image failed:", e)
103
+ traceback.print_exc()
104
  return None, None, None, 0
105
 
106
  # ==========================================
107
+ # Main logic
108
  # ==========================================
109
def safe_parse_json_from_text(text: str) -> Optional[dict]:
    """Extract the first JSON object embedded in a model reply.

    Accepts raw JSON, a fenced ```json ... ``` block, or prose that contains
    a JSON object. Decoding starts at the first '{' and stops at the end of
    that object, so trailing prose or closing fences (even ones containing
    braces) do not break parsing.

    Args:
        text: Raw model output.

    Returns:
        The decoded object as a dict, or None when the input is empty, is not
        a string, contains no decodable object, or decodes to something other
        than a dict. Callers go on to call ``.get`` on the result, so lists,
        numbers and strings are deliberately rejected.
    """
    if not isinstance(text, str):
        return None
    start = text.find("{")
    if start < 0:
        # No object at all: bare arrays/scalars are not a valid tool request.
        return None
    try:
        # raw_decode parses exactly one JSON value beginning at `start` and
        # ignores whatever follows it (code fences, explanations, ...).
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return parsed if isinstance(parsed, dict) else None
140
+
141
def validate_and_call_tool(tool_name: str, tool_args: dict):
    """Run the registered robot tool chosen by the VLM and return its result.

    Args:
        tool_name: Key into TOOL_REGISTRY, as named by the model.
        tool_args: Keyword arguments the model proposed for that tool.

    Returns:
        The tool's own result on success, otherwise ``{"error": "..."}``
        explaining why nothing was executed or what failed.
    """
    import inspect  # local import: only this function needs it

    if not tool_name:
        return {"error": "No tool_name provided by VLM."}
    # Model output is untrusted: a non-string name (e.g. a JSON list) would be
    # unhashable and make the registry lookup itself raise.
    if not isinstance(tool_name, str) or tool_name not in TOOL_REGISTRY:
        return {"error": f"Tool '{tool_name}' not found in registry."}
    if not isinstance(tool_args, dict):
        return {
            "error": "Tool call argument mismatch: arguments must be an object, "
                     f"got {type(tool_args).__name__}"
        }

    tool = TOOL_REGISTRY[tool_name]

    # Check the arguments against the tool's signature BEFORE calling it, so a
    # TypeError raised inside the tool body is not misreported as a mismatch.
    try:
        inspect.signature(tool).bind(**tool_args)
    except TypeError as e:
        return {"error": f"Tool call argument mismatch: {str(e)}"}

    try:
        return tool(**tool_args)
    except Exception as e:
        traceback.print_exc()
        return {"error": f"Tool execution failed: {str(e)}"}
155
 
156
  def process_and_describe(payload: dict):
157
+ """
158
+ payload expects keys:
159
+ - hf_token (string)
160
+ - image_b64 (base64 str)
161
+ - robot_id (optional)
162
+ - request_id (optional) # recommended to dedupe retries
163
+ """
164
  vlm_text = ""
165
+ tool_result = None
166
  action_data = {}
167
 
168
  try:
169
+ # basic checks
170
  hf_token = payload.get("hf_token")
171
  if not hf_token:
172
+ return {"error": "HF token not provided in payload. Token must have datasets write permission if uploading."}
173
+
174
+ request_id = payload.get("request_id") or payload.get("robot_id") or None
175
+ if request_id:
176
+ with PROCESSED_LOCK:
177
+ if request_id in PROCESSED_REQUESTS:
178
+ print("[info] duplicate request_id detected; returning cached result")
179
+ return PROCESSED_REQUESTS[request_id]
180
 
181
  robot_id = payload.get("robot_id", "unknown")
182
  image_b64 = payload.get("image_b64")
 
183
  if not image_b64:
184
+ return {"error": "No image provided in payload."}
185
 
186
+ # Save & upload (only once per invocation)
187
  local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
188
+ if not hf_url:
189
+ # Upload failed: return error with helpful debug info
190
+ return {
191
+ "error": "Image upload failed on server.",
192
+ "debug": {
193
+ "local_tmp_path": local_tmp_path,
194
+ "path_in_repo": path_in_repo,
195
+ "size_bytes": size_bytes
196
+ }
197
+ }
198
 
199
+ # Build system prompt (kept compact)
 
 
 
200
  tools_desc = json.dumps({
201
  "speak": {"text": "string", "emotion": "string"},
202
  "navigate": {"direction": "forward/left/right", "distance_meters": "float"},
 
205
  }, indent=2)
206
 
207
  system_prompt = f"""
208
+ You are a Robot Control AI. Analyze the image and choose ONE tool to execute.
209
+
210
+ AVAILABLE TOOLS (JSON Schema):
211
+ {tools_desc}
212
+
213
+ INSTRUCTIONS:
214
+ 1. Describe what you see briefly.
215
+ 2. Select the single most appropriate tool and provide arguments matching the schema.
 
 
 
 
 
 
 
 
 
 
 
216
 
217
+ RESPONSE FORMAT (Strict JSON):
218
+ {{
219
+ "description": "Brief visual description",
220
+ "tool_name": "name_of_tool",
221
+ "arguments": {{ ...args matching schema... }}
222
+ }}
223
+ """
224
+
225
+ # Build messages payload for VLM - include the uploaded HF URL (some VLMs can fetch it)
226
  messages_payload = [
227
  {"role": "system", "content": system_prompt},
228
  {"role": "user", "content": [
 
231
  ]}
232
  ]
233
 
234
+ # Instantiate HF Inference client and call chat completion
235
+ hf_client = InferenceClient(token=hf_token)
236
+
237
+ # NOTE: huggingface InferenceClient usage may vary by version. We use the chat completions create call.
238
  chat_completion = hf_client.chat.completions.create(
239
  model=HF_VLM_MODEL,
240
  messages=messages_payload,
241
  max_tokens=300,
242
+ temperature=0.1
243
  )
244
 
245
  vlm_text = chat_completion.choices[0].message.content.strip()
246
+ print("[debug] VLM raw output:", vlm_text[:1000])
 
 
 
247
 
248
+ # attempt to parse JSON
249
+ parsed = safe_parse_json_from_text(vlm_text)
250
+ if parsed is None:
251
+ # If the model didn't return JSON, return descriptive fallback but do not execute tools
252
+ result = {
253
+ "status": "model_no_json",
254
+ "robot_id": robot_id,
255
+ "image_url": hf_url,
256
+ "vlm_raw": vlm_text,
257
+ "message": "VLM did not return valid JSON following the required schema."
258
+ }
259
+ if request_id:
260
+ with PROCESSED_LOCK:
261
+ PROCESSED_REQUESTS[request_id] = result
262
+ return result
263
+
264
+ action_data = parsed
265
+ tool_name = action_data.get("tool_name")
266
+ tool_args = action_data.get("arguments", {}) or {}
267
+
268
+ # Validate that arguments is a dict
269
+ if not isinstance(tool_args, dict):
270
+ tool_args = {}
271
+
272
+ # Execute the tool once and capture result
273
+ print(f"[info] Executing tool: {tool_name} with args {tool_args}")
274
+ tool_result = validate_and_call_tool(tool_name, tool_args)
275
+
276
+ result = {
277
  "status": "success",
278
  "robot_id": robot_id,
279
  "image_url": hf_url,
280
+ "image_bytes": size_bytes,
281
  "analysis": action_data.get("description"),
282
+ "chosen_tool": tool_name,
283
+ "tool_arguments": tool_args,
284
+ "tool_execution_result": tool_result,
285
+ "vlm_raw": vlm_text
286
  }
287
 
288
+ if request_id:
289
+ with PROCESSED_LOCK:
290
+ PROCESSED_REQUESTS[request_id] = result
291
+
292
+ return result
293
+
294
  except Exception as e:
295
+ traceback.print_exc()
296
+ return {"error": f"Server error: {str(e)}", "vlm_raw": vlm_text}
297
 
298
  # --- Gradio Interface ---
299
+ iface = gr.Interface(
300
  fn=process_and_describe,
301
+ inputs=gr.JSON(label="Input (JSON with 'image_b64', 'hf_token', optional 'request_id')"),
302
  outputs=gr.JSON(label="Robot Command Output"),
303
+ api_name="predict",
304
+ allow_flagging="never",
305
+ live=False
306
  )
307
 
308
  if __name__ == "__main__":
309
+ # When deploying to an HF Space, set server_name and server_port via environment variables if needed
310
+ iface.launch()