OppaAI committed on
Commit bbcef43 · verified · 1 Parent(s): a10dd0b

Update app.py

Files changed (1)
  1. app.py +40 -31
app.py CHANGED
@@ -1,10 +1,8 @@
 import os
 import base64
-import requests
-import tempfile
 import gradio as gr
 from huggingface_hub import upload_file, InferenceClient
-from PIL import Image
+import json

 # --- Config ---
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
@@ -14,11 +12,11 @@ HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
 def save_and_upload_image(image_b64, hf_token):
     """Save image to /tmp and upload to HF dataset."""
     image_bytes = base64.b64decode(image_b64)
-    local_tmp_path = f"/tmp/tmp.jpg"
+    local_tmp_path = "/tmp/tmp.jpg"
     with open(local_tmp_path, "wb") as f:
         f.write(image_bytes)

-    path_in_repo = f"images/tmp.jpg"
+    path_in_repo = "images/tmp.jpg"
     upload_file(
         path_or_fileobj=local_tmp_path,
         path_in_repo=path_in_repo,
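The hunk ends mid-call: the remaining upload_file(...) arguments and the function's return fall outside the diff context. Judging from the call site, which unpacks local_tmp_path, hf_url, path_in_repo, size_bytes, the elided tail plausibly looks like the sketch below; the repo_id/repo_type/token kwargs and the URL scheme are assumptions, not part of this commit:

    # Sketch of the elided tail of save_and_upload_image (assumed, not shown in the diff):
    upload_file(
        path_or_fileobj=local_tmp_path,
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,   # assumed: the dataset repo from Config
        repo_type="dataset",       # assumed: uploads target a dataset repo
        token=hf_token,
    )
    size_bytes = os.path.getsize(local_tmp_path)
    # Assumed URL scheme for files in a HF dataset repo:
    hf_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/{path_in_repo}"
    return local_tmp_path, hf_url, path_in_repo, size_bytes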
@@ -34,54 +32,63 @@ def save_and_upload_image(image_b64, hf_token):
 # --- Main MCP function ---
 def process_and_describe(payload: dict):
     try:
-        # 1️⃣ Use robot-sent token
         hf_token = payload.get("hf_token")
         if not hf_token:
             return {"error": "HF token not provided in payload."}

         robot_id = payload.get("robot_id", "unknown")
-        image_b64 = payload["image_b64"]
+        image_b64 = payload.get("image_b64")
+        if not image_b64:
+            return {"error": "No image provided."}

-        # 2️⃣ Save image temporarily
+        # Save & upload
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)

-        # 3️⃣ Initialize HF client per request
+        # Init HF client
         hf_client = InferenceClient(token=hf_token)

-        # 4️⃣ System prompt for robot action
+        # System prompt: describe + suggest action
         system_prompt = """
-        You are a helpful robot assistant.
-        When receiving an image, you must:
+        You are a helpful robot assistant.
         1. Describe the image in detail.
-        2. Suggest what the robot should do next based on what it sees.
-        - If you see a human figure, suggest the robot to say 'Hi'.
-        - If you see a ball, suggest the robot to go towards it.
-        - If you see obstacles, suggest 'stop' or 'avoid'.
-        - If you see a red button, suggest 'press the button'.
-        Always provide the suggested actions in JSON format:
-        {"move": "forward/stop/left/right", "interact": "press/say/do nothing"}
+        2. Suggest what the robot should do next based on what it sees:
+        - Human figure → say 'Hi'.
+        - Ball → move towards it.
+        - Obstacles → stop or avoid.
+        - Red button → press it.
+        Always respond in JSON:
+        {"description": "...", "action": {"move": "...", "interact": "..."}}
         """

-        # 5️⃣ Prepare multimodal message payload
         messages_payload = [
             {"role": "system", "content": system_prompt},
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "Here is an image."},
-                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
-                ]
-            }
+            {"role": "user", "content": [
+                {"type": "text", "text": "Here is an image."},
+                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
+            ]}
         ]

-        # 6️⃣ Call VLM
+        # Call VLM
         chat_completion = hf_client.chat.completions.create(
             model=HF_VLM_MODEL,
             messages=messages_payload,
-            max_tokens=200,
+            max_tokens=300
         )

-        vlm_text = chat_completion.choices[0].message.content.strip()
+        # Robustly extract text
+        try:
+            vlm_text = chat_completion.choices[0].message.content.strip()
+        except Exception:
+            # fallback if structure is different
+            vlm_text = str(chat_completion)
+
+        # Attempt to parse JSON from VLM
+        action_data = {}
+        try:
+            action_data = json.loads(vlm_text)
+        except Exception:
+            # If VLM didn't return valid JSON, wrap text as description
+            action_data = {"description": vlm_text, "action": {"move": "unknown", "interact": "unknown"}}

         return {
             "saved_to_hf_hub": True,
@@ -90,7 +97,9 @@ def process_and_describe(payload: dict):
             "image_url": hf_url,
             "file_size_bytes": size_bytes,
             "robot_id": robot_id,
-            "vlm_response": vlm_text
+            "vlm_response": vlm_text,
+            "vlm_action": action_data.get("action", {}),
+            "vlm_description": action_data.get("description", "")
         }

     except Exception as e:
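Net effect on the function's contract: image_b64 is now read with .get() and validated instead of raising KeyError, max_tokens rises from 200 to 300, and the response gains structured vlm_action and vlm_description fields alongside the raw vlm_response. A minimal caller sketch; the file name, robot id, and token below are placeholders, and calling process_and_describe directly assumes it is exposed as a plain Python function:

import base64

def build_payload(image_path: str, hf_token: str, robot_id: str = "robot-01") -> dict:
    # Field names match what process_and_describe reads from its payload.
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    return {"hf_token": hf_token, "robot_id": robot_id, "image_b64": image_b64}

result = process_and_describe(build_payload("frame.jpg", hf_token="hf_..."))
if "error" not in result:
    print(result["vlm_description"])
    print(result["vlm_action"])  # e.g. {"move": "...", "interact": "..."}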
 