OppaAI committed on
Commit
3565497
·
verified ·
1 Parent(s): 0e3d6b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -78
app.py CHANGED
@@ -1,26 +1,15 @@
1
- import os
2
  import base64
3
- import time
4
  import io
5
  import gradio as gr
6
  from fastmcp import Client
7
  from fastmcp.client import StreamableHttpTransport
8
  import asyncio
9
- from dotenv import load_dotenv
10
- import ast # For safely evaluating Python literals returned from server
11
 
12
  # -------------------------------
13
- # Load environment variables
14
  # -------------------------------
15
- load_dotenv()
16
-
17
  ROBOT_ID = "Robot_MCP_Client" # Local client identifier
18
- HF_TOKEN = os.environ.get("HF_TOKEN")
19
- if not HF_TOKEN:
20
- print("Warning: HF_TOKEN not found. API calls may fail.")
21
- HF_TOKEN = "missing_token_placeholder" # Placeholder to avoid crash
22
-
23
- # MCP server info
24
  MCP_SERVER_URL = "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/"
25
  SERVER_NAME = "Robot_MCP_Server"
26
  TOOL_NAME = "Robot_MCP_Server_robot_watch"
@@ -31,82 +20,55 @@ TOOL_NAME = "Robot_MCP_Server_robot_watch"
31
  HTTP_TRANSPORT = StreamableHttpTransport(url=MCP_SERVER_URL)
32
  MCP_CLIENT = Client(transport=HTTP_TRANSPORT, name=SERVER_NAME)
33
 
34
-
35
- async def process_webcam_stream_async(image):
 
 
36
  """
37
- Send webcam image to MCP server and process the response.
38
-
39
- Args:
40
- image (PIL.Image or None): Image captured from webcam or uploaded.
41
-
42
- Returns:
43
- tuple: (description, environment, indoor_or_outdoor, lighting_condition, human, animals_str, objects_str, hazards_str)
44
- description (str): General description of the scene.
45
- environment (str): Description of the surrounding environment.
46
- indoor_or_outdoor (str): Whether the scene appears to be indoors or outdoors.
47
- lighting_condition (str): Lighting condition (e.g., bright, dim, natural, artificial).
48
- human (str): Information about any humans detected.
49
- animals_str (str): Information about any animals detected, or "none".
50
- objects_str (str): Comma-separated list of detected objects.
51
- hazards_str (str): Comma-separated list of hazards, or "none".
52
-
53
  """
54
- if image is None:
55
- return "", "", "", ""
56
 
57
- if HF_TOKEN == "missing_token_placeholder":
58
- return "Error: HF_TOKEN not set locally.", "", "", ""
59
 
60
  # Convert image to Base64 string
61
  buffered = io.BytesIO()
62
  image.save(buffered, format="JPEG")
63
  b64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
64
 
65
- # Prepare payload according to server's expected fields
66
  payload = {
67
- "hf_token_input": HF_TOKEN,
68
  "robot_id_input": ROBOT_ID,
69
  "image_b64_input": b64_img
70
  }
71
 
72
  try:
73
- # Use async context to call MCP server tool
74
  async with MCP_CLIENT:
75
  response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)
76
-
77
  if response.is_error:
78
- # Extract error message using the correct attribute access
79
  error_text = response.content.text if response.content else "Unknown error"
80
  raise Exception(f"MCP Tool Error: {error_text}")
81
-
82
- # Server may return Python-style string (single quotes)
83
- # Corrected: Access the combined text content directly
84
- raw_text = response.content.text
85
  response_dict = ast.literal_eval(raw_text)
86
-
87
- # -------------------------------
88
- # Extract fields from response
89
- # -------------------------------
90
  vlm_result = response_dict.get("result", {})
91
-
92
  description_out = vlm_result.get("description", "")
93
  human_out = vlm_result.get("human", "")
94
  environment_out = vlm_result.get("environment", "")
95
-
96
- # New fields (assuming your server update added these)
97
  indoor_outdoor_out = vlm_result.get("indoor_or_outdoor", "")
98
  lighting_condition_out = vlm_result.get("lighting_condition", "")
99
- animals_list = vlm_result.get("animals", []) # Assuming animals are in a list
100
- hazards_list = vlm_result.get("hazards", []) # Assuming hazards are in a list
101
-
102
  objects_list = vlm_result.get("objects", [])
103
-
104
- # Convert lists to a comma-separated string for display
105
  objects_str = ", ".join(objects_list) if isinstance(objects_list, list) else str(objects_list)
106
  animals_str = ", ".join(animals_list) if isinstance(animals_list, list) else str(animals_list)
107
  hazards_str = ", ".join(hazards_list) if isinstance(hazards_list, list) else str(hazards_list)
108
-
109
- # Return all 8 fields in the correct order
110
  return (
111
  description_out,
112
  environment_out,
@@ -117,35 +79,22 @@ async def process_webcam_stream_async(image):
117
  objects_str,
118
  hazards_str
119
  )
120
-
121
  except Exception as e:
122
  print(f"Error calling remote MCP API: {e}")
123
  import traceback
124
  traceback.print_exc()
125
- # Ensure error returns 8 values as well to maintain consistency
126
  return f"Error: {e}", "", "", "", "", "", "", ""
127
 
128
-
129
  # -------------------------------
130
  # Gradio UI
131
  # -------------------------------
132
  with gr.Blocks() as demo:
133
- gr.Markdown("## 🎥 Robot Vision Webcam Stream (using MCP Client)")
134
- gr.Markdown("""
135
- ### 🔑 Hugging Face Token Required
136
- To use this application, you must set a valid **Hugging Face API Token** in your local environment variables: `HF_TOKEN`.
137
- **A write token is required** to upload images to the public dataset associated with this space.
138
- Resource usage for VLM inference will be tracked against your account.
139
- """)
140
  with gr.Row():
141
- # Webcam / upload image input
142
- webcam_input = gr.Image(
143
- label="Captured from Web-Cam",
144
- sources=["upload", "webcam"],
145
- type="pil"
146
- )
147
  with gr.Column():
148
- # Output fields for MCP response
149
  description_out = gr.Textbox(label="Description", lines=5)
150
  environment_out = gr.Textbox(label="Environment", lines=3)
151
  indoor_outdoor_out = gr.Textbox(label="Indoor/Outdoor", lines=1)
@@ -155,18 +104,17 @@ with gr.Blocks() as demo:
155
  objects_out = gr.Textbox(label="Objects Detected", lines=2)
156
  hazards_out = gr.Textbox(label="Hazards Identified", lines=2)
157
 
158
- # Stream webcam input to server every 0.5 seconds
159
  webcam_input.stream(
160
  process_webcam_stream_async,
161
- inputs=[webcam_input],
162
  outputs=[
163
  description_out,
164
- objects_out,
165
  environment_out,
166
  indoor_outdoor_out,
167
  lighting_condition_out,
168
  human_out,
169
  animals_out,
 
170
  hazards_out
171
  ],
172
  stream_every=1.0
 
 
1
  import base64
 
2
  import io
3
  import gradio as gr
4
  from fastmcp import Client
5
  from fastmcp.client import StreamableHttpTransport
6
  import asyncio
7
+ import ast
 
8
 
9
  # -------------------------------
10
+ # MCP server info
11
  # -------------------------------
 
 
12
  ROBOT_ID = "Robot_MCP_Client" # Local client identifier
 
 
 
 
 
 
13
  MCP_SERVER_URL = "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/"
14
  SERVER_NAME = "Robot_MCP_Server"
15
  TOOL_NAME = "Robot_MCP_Server_robot_watch"
 
20
  HTTP_TRANSPORT = StreamableHttpTransport(url=MCP_SERVER_URL)
21
  MCP_CLIENT = Client(transport=HTTP_TRANSPORT, name=SERVER_NAME)
22
 
23
+ # -------------------------------
24
+ # Async function using user's token
25
+ # -------------------------------
26
+ async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None):
27
  """
28
+ Send webcam image to MCP server using user's HF token and process the response.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  """
30
+ if oauth_token is None:
31
+ return "Please log in first.", "", "", "", "", "", "", ""
32
 
33
+ if image is None:
34
+ return "", "", "", "", "", "", "", ""
35
 
36
  # Convert image to Base64 string
37
  buffered = io.BytesIO()
38
  image.save(buffered, format="JPEG")
39
  b64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
40
 
41
+ # Payload with user token
42
  payload = {
43
+ "hf_token_input": oauth_token.token,
44
  "robot_id_input": ROBOT_ID,
45
  "image_b64_input": b64_img
46
  }
47
 
48
  try:
 
49
  async with MCP_CLIENT:
50
  response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)
 
51
  if response.is_error:
 
52
  error_text = response.content.text if response.content else "Unknown error"
53
  raise Exception(f"MCP Tool Error: {error_text}")
54
+
55
+ raw_text = response.content.text
 
 
56
  response_dict = ast.literal_eval(raw_text)
 
 
 
 
57
  vlm_result = response_dict.get("result", {})
58
+
59
  description_out = vlm_result.get("description", "")
60
  human_out = vlm_result.get("human", "")
61
  environment_out = vlm_result.get("environment", "")
 
 
62
  indoor_outdoor_out = vlm_result.get("indoor_or_outdoor", "")
63
  lighting_condition_out = vlm_result.get("lighting_condition", "")
64
+ animals_list = vlm_result.get("animals", [])
65
+ hazards_list = vlm_result.get("hazards", [])
 
66
  objects_list = vlm_result.get("objects", [])
67
+
 
68
  objects_str = ", ".join(objects_list) if isinstance(objects_list, list) else str(objects_list)
69
  animals_str = ", ".join(animals_list) if isinstance(animals_list, list) else str(animals_list)
70
  hazards_str = ", ".join(hazards_list) if isinstance(hazards_list, list) else str(hazards_list)
71
+
 
72
  return (
73
  description_out,
74
  environment_out,
 
79
  objects_str,
80
  hazards_str
81
  )
82
+
83
  except Exception as e:
84
  print(f"Error calling remote MCP API: {e}")
85
  import traceback
86
  traceback.print_exc()
 
87
  return f"Error: {e}", "", "", "", "", "", "", ""
88
 
 
89
  # -------------------------------
90
  # Gradio UI
91
  # -------------------------------
92
  with gr.Blocks() as demo:
93
+ gr.LoginButton()
94
+ gr.Markdown("## 🎥 Robot Vision Webcam Stream (MCP Client)")
 
 
 
 
 
95
  with gr.Row():
96
+ webcam_input = gr.Image(label="Captured from Web-Cam", sources=["upload", "webcam"], type="pil")
 
 
 
 
 
97
  with gr.Column():
 
98
  description_out = gr.Textbox(label="Description", lines=5)
99
  environment_out = gr.Textbox(label="Environment", lines=3)
100
  indoor_outdoor_out = gr.Textbox(label="Indoor/Outdoor", lines=1)
 
104
  objects_out = gr.Textbox(label="Objects Detected", lines=2)
105
  hazards_out = gr.Textbox(label="Hazards Identified", lines=2)
106
 
 
107
  webcam_input.stream(
108
  process_webcam_stream_async,
109
+ inputs=[webcam_input, gr.OAuthToken()],
110
  outputs=[
111
  description_out,
 
112
  environment_out,
113
  indoor_outdoor_out,
114
  lighting_condition_out,
115
  human_out,
116
  animals_out,
117
+ objects_out,
118
  hazards_out
119
  ],
120
  stream_every=1.0