OppaAI committed on
Commit
5253b0d
·
verified ·
1 Parent(s): 4decfa0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -32
app.py CHANGED
@@ -4,43 +4,60 @@ import time
4
  import io
5
  import gradio as gr
6
  from fastmcp import Client
7
- from fastmcp.client import StreamableHttpTransport
8
  import asyncio
9
  from dotenv import load_dotenv
10
- import ast # Import the Abstract Syntax Tree module
11
 
12
- # Load environment variables (ensure .env is set up locally)
 
 
13
  load_dotenv()
14
 
15
- ROBOT_ID = "Robot_MCP_Client"
16
  HF_TOKEN = os.environ.get("HF_TOKEN")
17
  if not HF_TOKEN:
18
  print("Warning: HF_TOKEN not found. API calls may fail.")
19
- HF_TOKEN = "missing_token_placeholder"
20
 
21
- # The MCP URL of your remote server
22
  MCP_SERVER_URL = "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/"
23
  SERVER_NAME = "Robot_MCP_Server"
24
- TOOL_NAME = "Robot_MCP_Server_robot_watch"
25
 
26
-
27
- # Initialize the MCP client globally
 
28
  HTTP_TRANSPORT = StreamableHttpTransport(url=MCP_SERVER_URL)
29
  MCP_CLIENT = Client(transport=HTTP_TRANSPORT, name=SERVER_NAME)
30
 
31
 
32
  async def process_webcam_stream_async(image):
33
- """Send webcam image to HF MCP Server using MCP protocol and get result"""
 
 
 
 
 
 
 
 
 
 
 
 
34
  if image is None:
35
  return "", "", "", ""
36
 
37
  if HF_TOKEN == "missing_token_placeholder":
38
  return "Error: HF_TOKEN not set locally.", "", "", ""
39
 
 
40
  buffered = io.BytesIO()
41
  image.save(buffered, format="JPEG")
42
  b64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
43
 
 
44
  payload = {
45
  "hf_token_input": HF_TOKEN,
46
  "robot_id_input": ROBOT_ID,
@@ -48,62 +65,67 @@ async def process_webcam_stream_async(image):
48
  }
49
 
50
  try:
 
51
  async with MCP_CLIENT:
52
  response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)
53
-
54
  if response.is_error:
 
55
  error_text = response.content[0].text if response.content and isinstance(response.content, list) else "Unknown error"
56
  raise Exception(f"MCP Tool Error: {error_text}")
57
 
58
- # Use ast.literal_eval because your server returns Python-formatted strings (single quotes)
59
  raw_text = response.content[0].text
60
  response_dict = ast.literal_eval(raw_text)
61
-
62
- # --- 👇 EXTRACTING EACH FIELD CORRECTLY 👇 ---
 
 
63
  vlm_result = response_dict.get("result", {})
64
-
65
  description_out = vlm_result.get("description", "")
66
  human_out = vlm_result.get("human", "")
67
  objects_list = vlm_result.get("objects", [])
68
  environment_out = vlm_result.get("environment", "")
69
 
70
- # Format the objects list into a single string for display
71
  objects_str = ", ".join(objects_list) if isinstance(objects_list, list) else str(objects_list)
72
 
73
- return (
74
- description_out,
75
- human_out,
76
- objects_str,
77
- environment_out
78
- )
79
-
80
  except Exception as e:
81
- print(f"Error calling remote MCP API: {e}")
82
  import traceback
83
- traceback.print_exc() # Helps debug other errors
84
  return f"Error: {e}", "", "", ""
85
 
 
 
 
 
86
  with gr.Blocks() as demo:
87
  gr.Markdown("## 🎥 Robot Vision Webcam Stream (using MCP Client)")
88
  gr.Markdown("""
89
  ### 🔑 Hugging Face Token Required
90
  To use this application, you must set a valid **Hugging Face API Token** in your local environment variables: `HF_TOKEN`.
91
- **A write token is required** to upload images to the public dataset associated with this space. The resource usage for VLM inference will be tracked against *your* account.
 
92
  """)
93
  with gr.Row():
 
94
  webcam_input = gr.Image(
95
  label="Captured from Web-Cam",
96
  sources=["upload", "webcam"],
97
  type="pil"
98
  )
99
  with gr.Column():
100
- # --- 👇 INCREASED 'lines' PARAMETER HERE 👇 ---
101
- description_out = gr.Textbox(label="Description", lines=5) # Made larger
102
- human_out = gr.Textbox(label="Human", lines=3) # Made larger
103
- objects_out = gr.Textbox(label="Objects", lines=2) # Made larger
104
- environment_out = gr.Textbox(label="Environment", lines=3) # Made larger
105
- # --------------------------------------------------
106
 
 
107
  webcam_input.stream(
108
  process_webcam_stream_async,
109
  inputs=[webcam_input],
 
4
  import io
5
  import gradio as gr
6
  from fastmcp import Client
7
+ from fastmcp.client import StreamableHttpTransport
8
  import asyncio
9
  from dotenv import load_dotenv
10
+ import ast # For safely evaluating Python literals returned from server
11
 
12
+ # -------------------------------
13
+ # Load environment variables
14
+ # -------------------------------
15
  load_dotenv()
16
 
17
+ ROBOT_ID = "Robot_MCP_Client" # Local client identifier
18
  HF_TOKEN = os.environ.get("HF_TOKEN")
19
  if not HF_TOKEN:
20
  print("Warning: HF_TOKEN not found. API calls may fail.")
21
+ HF_TOKEN = "missing_token_placeholder" # Placeholder to avoid crash
22
 
23
+ # MCP server info
24
  MCP_SERVER_URL = "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/"
25
  SERVER_NAME = "Robot_MCP_Server"
26
+ TOOL_NAME = "Robot_MCP_Server_robot_watch"
27
 
28
+ # -------------------------------
29
+ # Initialize MCP client globally
30
+ # -------------------------------
31
  HTTP_TRANSPORT = StreamableHttpTransport(url=MCP_SERVER_URL)
32
  MCP_CLIENT = Client(transport=HTTP_TRANSPORT, name=SERVER_NAME)
33
 
34
 
35
  async def process_webcam_stream_async(image):
36
+ """
37
+ Send webcam image to MCP server and process the response.
38
+
39
+ Args:
40
+ image (PIL.Image or None): Image captured from webcam or uploaded.
41
+
42
+ Returns:
43
+ tuple: (description, human, objects_str, environment)
44
+ description (str): Description of scene.
45
+ human (str): Human-related information.
46
+ objects_str (str): Comma-separated list of objects.
47
+ environment (str): Environment description.
48
+ """
49
  if image is None:
50
  return "", "", "", ""
51
 
52
  if HF_TOKEN == "missing_token_placeholder":
53
  return "Error: HF_TOKEN not set locally.", "", "", ""
54
 
55
+ # Convert image to Base64 string
56
  buffered = io.BytesIO()
57
  image.save(buffered, format="JPEG")
58
  b64_img = base64.b64encode(buffered.getvalue()).decode("utf-8")
59
 
60
+ # Prepare payload according to server's expected fields
61
  payload = {
62
  "hf_token_input": HF_TOKEN,
63
  "robot_id_input": ROBOT_ID,
 
65
  }
66
 
67
  try:
68
+ # Use async context to call MCP server tool
69
  async with MCP_CLIENT:
70
  response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)
71
+
72
  if response.is_error:
73
+ # Extract error message
74
  error_text = response.content[0].text if response.content and isinstance(response.content, list) else "Unknown error"
75
  raise Exception(f"MCP Tool Error: {error_text}")
76
 
77
+ # Server may return Python-style string (single quotes)
78
  raw_text = response.content[0].text
79
  response_dict = ast.literal_eval(raw_text)
80
+
81
+ # -------------------------------
82
+ # Extract fields from response
83
+ # -------------------------------
84
  vlm_result = response_dict.get("result", {})
85
+
86
  description_out = vlm_result.get("description", "")
87
  human_out = vlm_result.get("human", "")
88
  objects_list = vlm_result.get("objects", [])
89
  environment_out = vlm_result.get("environment", "")
90
 
91
+ # Convert objects list to a comma-separated string for display
92
  objects_str = ", ".join(objects_list) if isinstance(objects_list, list) else str(objects_list)
93
 
94
+ return description_out, human_out, objects_str, environment_out
95
+
 
 
 
 
 
96
  except Exception as e:
97
+ print(f"Error calling remote MCP API: {e}")
98
  import traceback
99
+ traceback.print_exc()
100
  return f"Error: {e}", "", "", ""
101
 
102
+
103
+ # -------------------------------
104
+ # Gradio UI
105
+ # -------------------------------
106
  with gr.Blocks() as demo:
107
  gr.Markdown("## 🎥 Robot Vision Webcam Stream (using MCP Client)")
108
  gr.Markdown("""
109
  ### 🔑 Hugging Face Token Required
110
  To use this application, you must set a valid **Hugging Face API Token** in your local environment variables: `HF_TOKEN`.
111
+ **A write token is required** to upload images to the public dataset associated with this space.
112
+ Resource usage for VLM inference will be tracked against your account.
113
  """)
114
  with gr.Row():
115
+ # Webcam / upload image input
116
  webcam_input = gr.Image(
117
  label="Captured from Web-Cam",
118
  sources=["upload", "webcam"],
119
  type="pil"
120
  )
121
  with gr.Column():
122
+ # Output fields for MCP response
123
+ description_out = gr.Textbox(label="Description", lines=5)
124
+ human_out = gr.Textbox(label="Human", lines=3)
125
+ objects_out = gr.Textbox(label="Objects", lines=2)
126
+ environment_out = gr.Textbox(label="Environment", lines=3)
 
127
 
128
+ # Stream webcam input to server every 0.5 seconds
129
  webcam_input.stream(
130
  process_webcam_stream_async,
131
  inputs=[webcam_input],