OppaAI committed on
Commit
a10dd0b
·
verified ·
1 Parent(s): 406e27f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -12
app.py CHANGED
@@ -8,7 +8,6 @@ from PIL import Image
8
 
9
  # --- Config ---
10
  HF_DATASET_REPO = "OppaAI/Robot_MCP"
11
- # Model specifically for VLM (image-to-text) tasks on Hugging Face
12
  HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
13
 
14
  # --- Helper Functions ---
@@ -24,7 +23,7 @@ def save_and_upload_image(image_b64, hf_token):
24
  path_or_fileobj=local_tmp_path,
25
  path_in_repo=path_in_repo,
26
  repo_id=HF_DATASET_REPO,
27
- token=hf_token, # ← use token from payload
28
  repo_type="dataset"
29
  )
30
 
@@ -35,7 +34,7 @@ def save_and_upload_image(image_b64, hf_token):
35
  # --- Main MCP function ---
36
  def process_and_describe(payload: dict):
37
  try:
38
- # 1️⃣ Use robot-sent token if available, otherwise fallback
39
  hf_token = payload.get("hf_token")
40
  if not hf_token:
41
  return {"error": "HF token not provided in payload."}
@@ -43,28 +42,43 @@ def process_and_describe(payload: dict):
43
  robot_id = payload.get("robot_id", "unknown")
44
  image_b64 = payload["image_b64"]
45
 
46
- # 2️⃣ Save image temporarily (for tracking)
47
  local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
48
 
49
  # 3️⃣ Initialize HF client per request
50
  hf_client = InferenceClient(token=hf_token)
51
 
52
- # 4️⃣ Prepare multimodal message payload
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  messages_payload = [
 
54
  {
55
  "role": "user",
56
  "content": [
57
- {"type": "text", "text": "Describe this image in detail."},
58
- {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
59
- ],
60
  }
61
  ]
62
 
63
- # 5️⃣ Call VLM
64
  chat_completion = hf_client.chat.completions.create(
65
  model=HF_VLM_MODEL,
66
  messages=messages_payload,
67
- max_tokens=150,
68
  )
69
 
70
  vlm_text = chat_completion.choices[0].message.content.strip()
@@ -76,12 +90,13 @@ def process_and_describe(payload: dict):
76
  "image_url": hf_url,
77
  "file_size_bytes": size_bytes,
78
  "robot_id": robot_id,
79
- "vlm_description": vlm_text
80
  }
81
 
82
  except Exception as e:
83
  return {"error": f"An API error occurred: {str(e)}"}
84
 
 
85
  # --- Gradio MCP Interface ---
86
  demo = gr.Interface(
87
  fn=process_and_describe,
@@ -91,5 +106,4 @@ demo = gr.Interface(
91
  )
92
 
93
  if __name__ == "__main__":
94
- # Ensure you have the latest huggingface-hub: pip install --upgrade huggingface-hub Pillow requests
95
  demo.launch(mcp_server=True)
 
8
 
9
  # --- Config ---
10
  HF_DATASET_REPO = "OppaAI/Robot_MCP"
 
11
  HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
12
 
13
  # --- Helper Functions ---
 
23
  path_or_fileobj=local_tmp_path,
24
  path_in_repo=path_in_repo,
25
  repo_id=HF_DATASET_REPO,
26
+ token=hf_token,
27
  repo_type="dataset"
28
  )
29
 
 
34
  # --- Main MCP function ---
35
  def process_and_describe(payload: dict):
36
  try:
37
+ # 1️⃣ Use robot-sent token
38
  hf_token = payload.get("hf_token")
39
  if not hf_token:
40
  return {"error": "HF token not provided in payload."}
 
42
  robot_id = payload.get("robot_id", "unknown")
43
  image_b64 = payload["image_b64"]
44
 
45
+ # 2️⃣ Save image temporarily
46
  local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
47
 
48
  # 3️⃣ Initialize HF client per request
49
  hf_client = InferenceClient(token=hf_token)
50
 
51
+ # 4️⃣ System prompt for robot action
52
+ system_prompt = """
53
+ You are a helpful robot assistant.
54
+ When receiving an image, you must:
55
+ 1. Describe the image in detail.
56
+ 2. Suggest what the robot should do next based on what it sees.
57
+ - If you see a human figure, suggest the robot to say 'Hi'.
58
+ - If you see a ball, suggest the robot to go towards it.
59
+ - If you see obstacles, suggest 'stop' or 'avoid'.
60
+ - If you see a red button, suggest 'press the button'.
61
+ Always provide the suggested actions in JSON format:
62
+ {"move": "forward/stop/left/right", "interact": "press/say/do nothing"}
63
+ """
64
+
65
+ # 5️⃣ Prepare multimodal message payload
66
  messages_payload = [
67
+ {"role": "system", "content": system_prompt},
68
  {
69
  "role": "user",
70
  "content": [
71
+ {"type": "text", "text": "Here is an image."},
72
+ {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
73
+ ]
74
  }
75
  ]
76
 
77
+ # 6️⃣ Call VLM
78
  chat_completion = hf_client.chat.completions.create(
79
  model=HF_VLM_MODEL,
80
  messages=messages_payload,
81
+ max_tokens=200,
82
  )
83
 
84
  vlm_text = chat_completion.choices[0].message.content.strip()
 
90
  "image_url": hf_url,
91
  "file_size_bytes": size_bytes,
92
  "robot_id": robot_id,
93
+ "vlm_response": vlm_text
94
  }
95
 
96
  except Exception as e:
97
  return {"error": f"An API error occurred: {str(e)}"}
98
 
99
+
100
  # --- Gradio MCP Interface ---
101
  demo = gr.Interface(
102
  fn=process_and_describe,
 
106
  )
107
 
108
  if __name__ == "__main__":
 
109
  demo.launch(mcp_server=True)