OppaAI committed on
Commit
559b78a
·
verified ·
1 Parent(s): 5c2da8f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -115
app.py CHANGED
@@ -3,6 +3,7 @@ import base64
3
  import json
4
  from datetime import datetime
5
  import traceback
 
6
 
7
  import gradio as gr
8
  from huggingface_hub import HfApi, InferenceClient
@@ -14,90 +15,34 @@ from pydantic import BaseModel, Field
14
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "OppaAI/Robot_MCP")
15
  HF_VLM_MODEL = os.environ.get("HF_VLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
16
 
17
- # -------------------------------
18
- # Pydantic schema for the tool payload
19
- # -------------------------------
20
class RobotWatchPayload(BaseModel):
    """Input schema for the robot VLM analysis tool.

    Fields:
        hf_token:  Hugging Face API token used for uploads and inference.
        robot_id:  Identifier of the robot sending the image; defaults to
                   "unknown" when the caller does not supply one.
        image_b64: The image to analyze, as a base64-encoded string.
    """

    hf_token: str = Field(description="Your Hugging Face API token.")
    robot_id: str = Field(default="unknown", description="Robot identifier.")
    image_b64: str = Field(description="Base64 encoded image data.")
32
 
 
33
 
34
  # -------------------------------
35
- # Helper function: Upload image to Hugging Face dataset
36
- # -------------------------------
37
# -------------------------------
# Helper function: Upload image to Hugging Face dataset
# -------------------------------
def upload_image(image_b64: str, hf_token: str):
    """
    Decode a base64 image string, save it under /tmp, and upload it to the
    Hugging Face dataset repo configured by HF_DATASET_REPO.

    Args:
        image_b64 (str): Base64 encoded image data.
        hf_token (str): Hugging Face API token used for the upload.

    Returns:
        tuple: (local_path, hf_url, filename, size_bytes) on success, or
        (None, None, None, 0) on any failure. Errors are printed rather than
        raised so callers can degrade gracefully (best-effort upload).
    """
    try:
        image_bytes = base64.b64decode(image_b64)
        os.makedirs("/tmp", exist_ok=True)

        # Microsecond-resolution timestamp keeps concurrent uploads unique.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        local_path = f"/tmp/robot_img_{timestamp}.jpg"

        # Save locally first; upload_file reads from this path.
        with open(local_path, "wb") as f:
            f.write(image_bytes)

        filename = f"robot_{timestamp}.jpg"

        # Upload to the Hugging Face dataset under tmp/<filename>.
        api = HfApi()
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=f"tmp/{filename}",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=hf_token,
        )

        # Direct-download URL for the file just uploaded (must match
        # path_in_repo above).
        hf_url = (
            f"https://huggingface.co/datasets/{HF_DATASET_REPO}"
            f"/resolve/main/tmp/{filename}"
        )
        return local_path, hf_url, filename, len(image_bytes)

    except Exception:
        # Best-effort: log the traceback and return sentinel values instead
        # of raising, so the caller can report a clean "upload failed" error.
        traceback.print_exc()
        return None, None, None, 0
78
-
79
-
80
- # -------------------------------
81
- # Helper function: Parse JSON safely
82
  # -------------------------------
83
  def safe_parse_json_from_text(text: str):
84
  """
85
  Attempts to parse JSON from text returned by the VLM model.
86
- Strips any leading/trailing characters and handles malformed responses.
87
-
88
- Args:
89
- text (str): Raw text output from the model.
90
-
91
- Returns:
92
- dict or None: Parsed JSON dictionary, or None if parsing fails.
93
  """
94
  if not text:
95
  return None
96
  try:
97
- return json.loads(text)
98
- except:
99
- pass
100
-
 
 
 
 
 
101
  cleaned = text.strip().strip("`").strip()
102
  if cleaned.lower().startswith("json"):
103
  cleaned = cleaned[4:].strip()
@@ -111,30 +56,23 @@ def safe_parse_json_from_text(text: str):
111
 
112
 
113
  # -------------------------------
114
- # Core VLM analysis function
115
  # -------------------------------
116
  def run_vlm_analysis(payload: RobotWatchPayload):
117
  """
118
  Main logic for analyzing an image using Hugging Face VLM model.
119
-
120
- Args:
121
- payload (RobotWatchPayload): Validated payload containing token, robot_id, and image.
122
-
123
- Returns:
124
- dict: Analysis result including description, objects, and raw VLM output.
125
  """
126
  hf_token = payload.hf_token
127
  image_b64 = payload.image_b64
128
  robot_id = payload.robot_id
129
 
130
- # Upload the image to Hugging Face dataset
131
  _, hf_url, _, size_bytes = upload_image(image_b64, hf_token)
132
  if not hf_url:
133
  return {"error": "Image upload failed"}
134
 
135
- # System prompt instructs VLM to return strict JSON
136
  system_prompt = """
137
- Respond in STRICT JSON ONLY:
138
  {
139
  "description": "...",
140
  "human": "...",
@@ -165,7 +103,7 @@ Respond in STRICT JSON ONLY:
165
  vlm_output = resp.choices[0].message.content.strip()
166
  parsed = safe_parse_json_from_text(vlm_output) or {}
167
 
168
- return {
169
  "status": "success",
170
  "robot_id": robot_id,
171
  "file_size_bytes": size_bytes,
@@ -173,59 +111,34 @@ Respond in STRICT JSON ONLY:
173
  "result": parsed,
174
  "vlm_raw": vlm_output
175
  }
 
 
 
176
 
177
 
178
- # -------------------------------
179
- # Gradio interface function
180
- # -------------------------------
181
# -------------------------------
# Gradio interface function
# -------------------------------
def robot_watch(
    hf_token_input: str,
    robot_id_input: str,
    image_b64_input: str
):
    """
    Gradio-facing wrapper around run_vlm_analysis.

    Bundles the three text inputs from the UI into a RobotWatchPayload and
    delegates to the core analysis logic.

    Args:
        hf_token_input (str): Hugging Face API token from the UI.
        robot_id_input (str): Robot ID from the UI.
        image_b64_input (str): Base64 image string from the UI.

    Returns:
        dict: Result from run_vlm_analysis, or an error dict when the image
        field is empty.
    """
    # Guard clause: nothing to analyze without an image.
    if not image_b64_input:
        return {"error": "Base64 image string is empty."}

    return run_vlm_analysis(
        RobotWatchPayload(
            hf_token=hf_token_input,
            robot_id=robot_id_input,
            image_b64=image_b64_input,
        )
    )
211
 
212
 
213
- # -------------------------------
214
- # Gradio App
215
- # -------------------------------
216
# -------------------------------
# Gradio App
# -------------------------------
# Three free-text widgets map 1:1 onto robot_watch's parameters.
_input_widgets = [
    gr.Textbox(label="Hugging Face Token", lines=1),
    gr.Textbox(label="Robot ID", lines=1, value="unknown"),
    gr.Textbox(label="Image Base64 String", lines=5),
]

app = gr.Interface(
    fn=robot_watch,
    inputs=_input_widgets,
    outputs=gr.Json(label="Tool Output"),
    title="Robot CV MCP Server",
    description="Interface for robot VLM analysis using individual fields, including base64 image string.",
    api_name="predict",
)
228
 
229
if __name__ == "__main__":
    # Run as a script: serve the Gradio UI with the MCP server enabled.
    app.launch(mcp_server=True)
 
3
  import json
4
  from datetime import datetime
5
  import traceback
6
+ import ast # Added import for ast.literal_eval if you decide to keep previous structure
7
 
8
  import gradio as gr
9
  from huggingface_hub import HfApi, InferenceClient
 
15
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "OppaAI/Robot_MCP")
16
  HF_VLM_MODEL = os.environ.get("HF_VLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
17
 
18
+ # ... (RobotWatchPayload class definition remains the same) ...
 
 
19
  class RobotWatchPayload(BaseModel):
 
 
 
 
 
 
 
 
20
  hf_token: str = Field(description="Your Hugging Face API token.")
21
  robot_id: str = Field(description="Robot identifier.", default="unknown")
22
  image_b64: str = Field(description="Base64 encoded image data.")
23
 
24
+ # ... (upload_image helper function remains the same) ...
25
 
26
  # -------------------------------
27
+ # Helper function: Parse JSON safely (Modified to use literal_eval as fallback)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # -------------------------------
29
  def safe_parse_json_from_text(text: str):
30
  """
31
  Attempts to parse JSON from text returned by the VLM model.
32
+ Handles malformed responses by using json.loads first, then literal_eval.
 
 
 
 
 
 
33
  """
34
  if not text:
35
  return None
36
  try:
37
+ return json.loads(text) # Try parsing as strict JSON first
38
+ except json.JSONDecodeError:
39
+ try:
40
+ # Fallback to literal_eval if it's a Python-formatted string
41
+ return ast.literal_eval(text)
42
+ except (ValueError, SyntaxError):
43
+ pass # Return None below if both fail
44
+
45
+ # ... (rest of manual parsing logic if needed, but the above usually works) ...
46
  cleaned = text.strip().strip("`").strip()
47
  if cleaned.lower().startswith("json"):
48
  cleaned = cleaned[4:].strip()
 
56
 
57
 
58
  # -------------------------------
59
+ # Core VLM analysis function (Modified prompt to encourage 'objects' output)
60
  # -------------------------------
61
  def run_vlm_analysis(payload: RobotWatchPayload):
62
  """
63
  Main logic for analyzing an image using Hugging Face VLM model.
 
 
 
 
 
 
64
  """
65
  hf_token = payload.hf_token
66
  image_b64 = payload.image_b64
67
  robot_id = payload.robot_id
68
 
 
69
  _, hf_url, _, size_bytes = upload_image(image_b64, hf_token)
70
  if not hf_url:
71
  return {"error": "Image upload failed"}
72
 
73
+ # System prompt changed: Emphasize that 'objects' should be non-empty.
74
  system_prompt = """
75
+ Respond in STRICT JSON ONLY. Ensure the "objects" list is never empty; list general items if specific ones are not clear.
76
  {
77
  "description": "...",
78
  "human": "...",
 
103
  vlm_output = resp.choices[0].message.content.strip()
104
  parsed = safe_parse_json_from_text(vlm_output) or {}
105
 
106
+ response_dict = {
107
  "status": "success",
108
  "robot_id": robot_id,
109
  "file_size_bytes": size_bytes,
 
111
  "result": parsed,
112
  "vlm_raw": vlm_output
113
  }
114
+
115
+ # 👇 Return a proper JSON string to fix client-side parsing error
116
+ return json.dumps(response_dict)
117
 
118
 
119
+ # ... (Gradio interface function 'robot_watch' remains the same) ...
 
 
120
  def robot_watch(
121
  hf_token_input: str,
122
  robot_id_input: str,
123
  image_b64_input: str
124
  ):
125
+ # ... (function body remains the same) ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  payload_instance = RobotWatchPayload(
127
  hf_token=hf_token_input,
128
  robot_id=robot_id_input,
129
  image_b64=image_b64_input
130
  )
 
 
131
  result = run_vlm_analysis(payload_instance)
132
+ return result # This returns the JSON string from run_vlm_analysis
133
 
134
 
135
+ # ... (Gradio App definition and launch remain the same) ...
 
 
136
  app = gr.Interface(
137
  fn=robot_watch,
138
+ inputs=[...],
139
+ outputs=gr.Json(label="Tool Output"), # gr.Json handles the string automatically
140
+ # ...
 
 
 
 
 
 
141
  )
142
 
143
  if __name__ == "__main__":
 
144
  app.launch(mcp_server=True)