OppaAI committed on
Commit
01a3239
·
verified ·
1 Parent(s): 559b78a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -28
app.py CHANGED
@@ -3,7 +3,6 @@ import base64
3
  import json
4
  from datetime import datetime
5
  import traceback
6
- import ast # Added import for ast.literal_eval if you decide to keep previous structure
7
 
8
  import gradio as gr
9
  from huggingface_hub import HfApi, InferenceClient
@@ -15,34 +14,86 @@ from pydantic import BaseModel, Field
15
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "OppaAI/Robot_MCP")
16
  HF_VLM_MODEL = os.environ.get("HF_VLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
17
 
18
- # ... (RobotWatchPayload class definition remains the same) ...
 
 
19
  class RobotWatchPayload(BaseModel):
 
 
 
 
 
 
 
 
20
  hf_token: str = Field(description="Your Hugging Face API token.")
21
  robot_id: str = Field(description="Robot identifier.", default="unknown")
22
  image_b64: str = Field(description="Base64 encoded image data.")
23
 
24
- # ... (upload_image helper function remains the same) ...
25
 
26
  # -------------------------------
27
- # Helper function: Parse JSON safely (Modified to use literal_eval as fallback)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # -------------------------------
29
  def safe_parse_json_from_text(text: str):
30
  """
31
  Attempts to parse JSON from text returned by the VLM model.
32
- Handles malformed responses by using json.loads first, then literal_eval.
 
 
 
 
33
  """
34
  if not text:
35
  return None
36
  try:
37
- return json.loads(text) # Try parsing as strict JSON first
38
- except json.JSONDecodeError:
39
- try:
40
- # Fallback to literal_eval if it's a Python-formatted string
41
- return ast.literal_eval(text)
42
- except (ValueError, SyntaxError):
43
- pass # Return None below if both fail
44
-
45
- # ... (rest of manual parsing logic if needed, but the above usually works) ...
46
  cleaned = text.strip().strip("`").strip()
47
  if cleaned.lower().startswith("json"):
48
  cleaned = cleaned[4:].strip()
@@ -56,21 +107,26 @@ def safe_parse_json_from_text(text: str):
56
 
57
 
58
  # -------------------------------
59
- # Core VLM analysis function (Modified prompt to encourage 'objects' output)
60
  # -------------------------------
61
  def run_vlm_analysis(payload: RobotWatchPayload):
62
  """
63
  Main logic for analyzing an image using Hugging Face VLM model.
 
 
 
 
64
  """
65
  hf_token = payload.hf_token
66
  image_b64 = payload.image_b64
67
  robot_id = payload.robot_id
68
 
 
69
  _, hf_url, _, size_bytes = upload_image(image_b64, hf_token)
70
  if not hf_url:
71
  return {"error": "Image upload failed"}
72
 
73
- # System prompt changed: Emphasize that 'objects' should be non-empty.
74
  system_prompt = """
75
  Respond in STRICT JSON ONLY. Ensure the "objects" list is never empty; list general items if specific ones are not clear.
76
  {
@@ -103,7 +159,7 @@ Respond in STRICT JSON ONLY. Ensure the "objects" list is never empty; list gene
103
  vlm_output = resp.choices[0].message.content.strip()
104
  parsed = safe_parse_json_from_text(vlm_output) or {}
105
 
106
- response_dict = {
107
  "status": "success",
108
  "robot_id": robot_id,
109
  "file_size_bytes": size_bytes,
@@ -111,34 +167,57 @@ Respond in STRICT JSON ONLY. Ensure the "objects" list is never empty; list gene
111
  "result": parsed,
112
  "vlm_raw": vlm_output
113
  }
114
-
115
- # 👇 Return a proper JSON string to fix client-side parsing error
116
- return json.dumps(response_dict)
117
 
118
 
119
- # ... (Gradio interface function 'robot_watch' remains the same) ...
 
 
120
  def robot_watch(
121
  hf_token_input: str,
122
  robot_id_input: str,
123
  image_b64_input: str
124
  ):
125
- # ... (function body remains the same) ...
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  payload_instance = RobotWatchPayload(
127
  hf_token=hf_token_input,
128
  robot_id=robot_id_input,
129
  image_b64=image_b64_input
130
  )
 
 
131
  result = run_vlm_analysis(payload_instance)
132
- return result # This returns the JSON string from run_vlm_analysis
133
 
134
 
135
- # ... (Gradio App definition and launch remain the same) ...
 
 
136
  app = gr.Interface(
137
  fn=robot_watch,
138
- inputs=[...],
139
- outputs=gr.Json(label="Tool Output"), # gr.Json handles the string automatically
140
- # ...
 
 
 
 
 
 
141
  )
142
 
143
  if __name__ == "__main__":
144
- app.launch(mcp_server=True)
 
 
3
  import json
4
  from datetime import datetime
5
  import traceback
 
6
 
7
  import gradio as gr
8
  from huggingface_hub import HfApi, InferenceClient
 
14
  HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "OppaAI/Robot_MCP")
15
  HF_VLM_MODEL = os.environ.get("HF_VLM_MODEL", "Qwen/Qwen2.5-VL-7B-Instruct")
16
 
17
+ # -------------------------------
18
+ # Pydantic schema for the tool payload
19
+ # -------------------------------
20
class RobotWatchPayload(BaseModel):
    """
    Input schema for the robot VLM analysis tool.

    Fields:
        hf_token: Hugging Face API token used for upload and inference.
        robot_id: Identifier of the reporting robot; defaults to "unknown".
        image_b64: Base64-encoded image to analyze.
    """
    hf_token: str = Field(description="Your Hugging Face API token.")
    robot_id: str = Field(default="unknown", description="Robot identifier.")
    image_b64: str = Field(description="Base64 encoded image data.")
32
 
 
33
 
34
  # -------------------------------
35
+ # Helper function: Upload image to Hugging Face dataset
36
+ # -------------------------------
37
def upload_image(image_b64: str, hf_token: str):
    """
    Decode a base64 image, save it to /tmp, and upload it to the HF dataset repo.

    Args:
        image_b64 (str): Base64 encoded image data.
        hf_token (str): Hugging Face API token used for the upload.

    Returns:
        tuple: (local_path, hf_url, filename, size_bytes) on success,
        or (None, None, None, 0) if decoding or uploading fails.
    """
    try:
        image_bytes = base64.b64decode(image_b64)
        os.makedirs("/tmp", exist_ok=True)

        # Unique timestamped filename so concurrent uploads never collide.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        local_path = f"/tmp/robot_img_{timestamp}.jpg"

        # Save locally before handing the file to the HF upload API.
        with open(local_path, "wb") as f:
            f.write(image_bytes)

        filename = f"robot_{timestamp}.jpg"

        # Upload under tmp/<filename>. The previous code interpolated a
        # literal placeholder instead of the generated filename (leaving
        # `filename` unused), so every upload targeted the same repo path.
        api = HfApi()
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=f"tmp/{filename}",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            token=hf_token,
        )

        hf_url = f"https://huggingface.co/datasets/{HF_DATASET_REPO}/resolve/main/tmp/{filename}"
        return local_path, hf_url, filename, len(image_bytes)

    except Exception:
        # Best-effort helper: log the traceback and signal failure to the
        # caller via the (None, None, None, 0) sentinel tuple.
        traceback.print_exc()
        return None, None, None, 0
76
+
77
+
78
+ # -------------------------------
79
+ # Helper function: Parse JSON safely
80
  # -------------------------------
81
  def safe_parse_json_from_text(text: str):
82
  """
83
  Attempts to parse JSON from text returned by the VLM model.
84
+ Strips any leading/trailing characters and handles malformed responses.
85
+ Args:
86
+ text (str): Raw text output from the model.
87
+ Returns:
88
+ dict or None: Parsed JSON dictionary, or None if parsing fails.
89
  """
90
  if not text:
91
  return None
92
  try:
93
+ return json.loads(text)
94
+ except:
95
+ pass
96
+
 
 
 
 
 
97
  cleaned = text.strip().strip("`").strip()
98
  if cleaned.lower().startswith("json"):
99
  cleaned = cleaned[4:].strip()
 
107
 
108
 
109
  # -------------------------------
110
+ # Core VLM analysis function
111
  # -------------------------------
112
  def run_vlm_analysis(payload: RobotWatchPayload):
113
  """
114
  Main logic for analyzing an image using Hugging Face VLM model.
115
+ Args:
116
+ payload (RobotWatchPayload): Validated payload containing token, robot_id, and image.
117
+ Returns:
118
+ dict: Analysis result including description, objects, and raw VLM output.
119
  """
120
  hf_token = payload.hf_token
121
  image_b64 = payload.image_b64
122
  robot_id = payload.robot_id
123
 
124
+ # Upload the image to Hugging Face dataset
125
  _, hf_url, _, size_bytes = upload_image(image_b64, hf_token)
126
  if not hf_url:
127
  return {"error": "Image upload failed"}
128
 
129
+ # System prompt instructs VLM to return strict JSON
130
  system_prompt = """
131
  Respond in STRICT JSON ONLY. Ensure the "objects" list is never empty; list general items if specific ones are not clear.
132
  {
 
159
  vlm_output = resp.choices[0].message.content.strip()
160
  parsed = safe_parse_json_from_text(vlm_output) or {}
161
 
162
+ return {
163
  "status": "success",
164
  "robot_id": robot_id,
165
  "file_size_bytes": size_bytes,
 
167
  "result": parsed,
168
  "vlm_raw": vlm_output
169
  }
 
 
 
170
 
171
 
172
+ # -------------------------------
173
+ # Gradio interface function
174
+ # -------------------------------
175
def robot_watch(
    hf_token_input: str,
    robot_id_input: str,
    image_b64_input: str
):
    """
    Gradio wrapper around run_vlm_analysis.

    Builds a RobotWatchPayload from the individual UI fields and delegates
    to the core analysis routine.

    Args:
        hf_token_input (str): Hugging Face API token from the UI.
        robot_id_input (str): Robot identifier from the UI.
        image_b64_input (str): Base64-encoded image from the UI.

    Returns:
        dict: Result of run_vlm_analysis, or an error dict for empty input.
    """
    # Guard clause: reject a missing image before doing any work.
    if not image_b64_input:
        return {"error": "Base64 image string is empty."}

    return run_vlm_analysis(
        RobotWatchPayload(
            hf_token=hf_token_input,
            robot_id=robot_id_input,
            image_b64=image_b64_input,
        )
    )
203
 
204
 
205
+ # -------------------------------
206
+ # Gradio App
207
+ # -------------------------------
208
# Build the Gradio interface: three text inputs mapped onto robot_watch,
# with the tool result rendered as JSON.
app = gr.Interface(
    fn=robot_watch,
    inputs=[
        gr.Textbox(label="Hugging Face Token", lines=1),
        gr.Textbox(label="Robot ID", lines=1, value="unknown"),
        gr.Textbox(label="Image Base64 String", lines=5),
    ],
    outputs=gr.Json(label="Tool Output"),
    title="Robot CV MCP Server",
    description="Interface for robot VLM analysis using individual fields, including base64 image string.",
    api_name="predict",
)

if __name__ == "__main__":
    # Expose the interface as an MCP server in addition to the web UI.
    app.launch(mcp_server=True)