OppaAI committed on
Commit
5410665
·
verified ·
1 Parent(s): 18adda8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -22
app.py CHANGED
@@ -3,9 +3,7 @@ import base64
3
  import gradio as gr
4
  from huggingface_hub import upload_file, InferenceClient
5
  import json
6
- from fastmcp import MCP, MCPClient
7
- from playsound import playsound
8
- from gtts import gTTS
9
 
10
  # --- Config ---
11
  HF_DATASET_REPO = "OppaAI/Robot_MCP"
@@ -16,16 +14,12 @@ mcp = MCP() # 用於定義工具
16
 
17
  # --- MCP Tool ---
18
  @mcp.tools()
19
- def say_hi(text="Hi, How are you doing?"):
20
- # 1️⃣ 生成 mp3
21
- tts = gTTS(text=text, lang="en")
22
- tmp_path = "/tmp/say_hi.mp3"
23
- tts.save(tmp_path)
24
-
25
- # 2️⃣ 播放音檔
26
- playsound(tmp_path)
27
-
28
- return f"Played: {text}"
29
 
30
  # --- Helper Functions ---
31
  def save_and_upload_image(image_b64, hf_token):
@@ -58,16 +52,22 @@ def process_and_describe(payload: dict):
58
  if not image_b64:
59
  return {"error": "No image provided."}
60
 
 
61
  local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
62
  hf_client = InferenceClient(token=hf_token)
63
 
 
64
  system_prompt = """
65
  You are a helpful robot assistant.
66
  1. Describe the image in detail.
67
- 2. Suggest what the robot should do next.
68
- - Human figure say 'Hi'.
69
- Always respond in JSON:
70
- {"description": "...", "action": "say_hi"}
 
 
 
 
71
  """
72
 
73
  messages_payload = [
@@ -78,24 +78,31 @@ def process_and_describe(payload: dict):
78
  ]}
79
  ]
80
 
 
81
  chat_completion = hf_client.chat.completions.create(
82
  model=HF_VLM_MODEL,
83
  messages=messages_payload,
84
- max_tokens=200
85
  )
86
 
 
87
  vlm_text = chat_completion.choices[0].message.content.strip()
88
  action_data = {}
89
  try:
90
  action_data = json.loads(vlm_text)
91
  except Exception:
92
- action_data = {"description": vlm_text, "action": "unknown"}
 
 
 
 
93
 
94
- # --- Call MCP tool ---
95
  vlm_action = action_data.get("action")
96
  tool_result = None
97
  if vlm_action == "say_hi":
98
- tool_result = say_hi(text="Hi!") # 這裡會生成 /tmp/say_hi.mp3
 
99
 
100
  return {
101
  "saved_to_hf_hub": True,
@@ -116,7 +123,7 @@ def process_and_describe(payload: dict):
116
  # --- Gradio MCP Interface ---
117
  demo = gr.Interface(
118
  fn=process_and_describe,
119
- inputs=gr.JSON(label="Input Payload"),
120
  outputs=gr.JSON(label="Reply to Jetson"),
121
  api_name="predict"
122
  )
 
3
  import gradio as gr
4
  from huggingface_hub import upload_file, InferenceClient
5
  import json
6
+ from fastmcp import MCP
 
 
7
 
8
  # --- Config ---
9
  HF_DATASET_REPO = "OppaAI/Robot_MCP"
 
14
 
15
  # --- MCP Tool ---
16
  @mcp.tools()
17
+ def say_hi(greeting_text="Hi!"):
18
+ """Return a greeting command in JSON."""
19
+ return {
20
+ "command": "say_hi",
21
+ "text": greeting_text
22
+ }
 
 
 
 
23
 
24
  # --- Helper Functions ---
25
  def save_and_upload_image(image_b64, hf_token):
 
52
  if not image_b64:
53
  return {"error": "No image provided."}
54
 
55
+ # Save image and upload to HF
56
  local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
57
  hf_client = InferenceClient(token=hf_token)
58
 
59
+ # System prompt for VLM
60
  system_prompt = """
61
  You are a helpful robot assistant.
62
  1. Describe the image in detail.
63
+ 2. Suggest what the robot should do next:
64
+ - If you see a human figure, suggest saying 'Hi' in a friendly and varied way.
65
+ Always respond in JSON format:
66
+ {
67
+ "description": "...",
68
+ "action": "say_hi",
69
+ "greeting_text": "a friendly greeting that can be different each time"
70
+ }
71
  """
72
 
73
  messages_payload = [
 
78
  ]}
79
  ]
80
 
81
+ # Call VLM
82
  chat_completion = hf_client.chat.completions.create(
83
  model=HF_VLM_MODEL,
84
  messages=messages_payload,
85
+ max_tokens=300
86
  )
87
 
88
+ # Extract VLM text
89
  vlm_text = chat_completion.choices[0].message.content.strip()
90
  action_data = {}
91
  try:
92
  action_data = json.loads(vlm_text)
93
  except Exception:
94
+ action_data = {
95
+ "description": vlm_text,
96
+ "action": "unknown",
97
+ "greeting_text": "Hi!"
98
+ }
99
 
100
+ # --- Call MCP tool if needed ---
101
  vlm_action = action_data.get("action")
102
  tool_result = None
103
  if vlm_action == "say_hi":
104
+ greeting_text = action_data.get("greeting_text", "Hi!")
105
+ tool_result = say_hi(greeting_text=greeting_text)
106
 
107
  return {
108
  "saved_to_hf_hub": True,
 
123
  # --- Gradio MCP Interface ---
124
  demo = gr.Interface(
125
  fn=process_and_describe,
126
+ inputs=gr.JSON(label="Input Payload (Dict format with 'image_b64')"),
127
  outputs=gr.JSON(label="Reply to Jetson"),
128
  api_name="predict"
129
  )