Update app.py
app.py (CHANGED)
@@ -8,7 +8,6 @@ from PIL import Image
 
 # --- Config ---
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
-# Model specifically for VLM (image-to-text) tasks on Hugging Face
 HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
 
 # --- Helper Functions ---
@@ -24,7 +23,7 @@ def save_and_upload_image(image_b64, hf_token):
         path_or_fileobj=local_tmp_path,
         path_in_repo=path_in_repo,
         repo_id=HF_DATASET_REPO,
-        token=hf_token,
+        token=hf_token,
         repo_type="dataset"
     )
 
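The hunk above only shows the tail of save_and_upload_image. For orientation, a minimal sketch of what the full helper plausibly does, assuming the base64 payload is decoded to a temp file and pushed with huggingface_hub's upload_file / hf_hub_url; every name outside the lines shown in the diff is a guess, not the author's code:

import base64, os, tempfile, uuid
from huggingface_hub import upload_file, hf_hub_url

def save_and_upload_image(image_b64: str, hf_token: str):
    # Decode the base64 payload into a throwaway local file (hypothetical reconstruction)
    image_bytes = base64.b64decode(image_b64)
    local_tmp_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.jpg")
    with open(local_tmp_path, "wb") as f:
        f.write(image_bytes)

    # Push the file into the dataset repo using the robot-sent token
    path_in_repo = f"images/{os.path.basename(local_tmp_path)}"
    upload_file(
        path_or_fileobj=local_tmp_path,
        path_in_repo=path_in_repo,
        repo_id=HF_DATASET_REPO,
        token=hf_token,
        repo_type="dataset"
    )
    hf_url = hf_hub_url(repo_id=HF_DATASET_REPO, filename=path_in_repo, repo_type="dataset")
    return local_tmp_path, hf_url, path_in_repo, len(image_bytes)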
@@ -35,7 +34,7 @@ def save_and_upload_image(image_b64, hf_token):
 # --- Main MCP function ---
 def process_and_describe(payload: dict):
     try:
-        # 1️⃣ Use robot-sent token
+        # 1️⃣ Use robot-sent token
         hf_token = payload.get("hf_token")
         if not hf_token:
             return {"error": "HF token not provided in payload."}
@@ -43,28 +42,43 @@ def process_and_describe(payload: dict):
         robot_id = payload.get("robot_id", "unknown")
         image_b64 = payload["image_b64"]
 
-        # 2️⃣ Save image temporarily
+        # 2️⃣ Save image temporarily
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64, hf_token)
 
         # 3️⃣ Initialize HF client per request
         hf_client = InferenceClient(token=hf_token)
 
-        # 4️⃣
+        # 4️⃣ System prompt for robot action
+        system_prompt = """
+        You are a helpful robot assistant.
+        When receiving an image, you must:
+        1. Describe the image in detail.
+        2. Suggest what the robot should do next based on what it sees.
+           - If you see a human figure, suggest the robot to say 'Hi'.
+           - If you see a ball, suggest the robot to go towards it.
+           - If you see obstacles, suggest 'stop' or 'avoid'.
+           - If you see a red button, suggest 'press the button'.
+        Always provide the suggested actions in JSON format:
+        {"move": "forward/stop/left/right", "interact": "press/say/do nothing"}
+        """
+
+        # 5️⃣ Prepare multimodal message payload
         messages_payload = [
+            {"role": "system", "content": system_prompt},
             {
                 "role": "user",
                 "content": [
-                    {"type": "text", "text": "
-                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
-                ]
+                    {"type": "text", "text": "Here is an image."},
+                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}}
+                ]
             }
         ]
 
-        #
+        # 6️⃣ Call VLM
         chat_completion = hf_client.chat.completions.create(
             model=HF_VLM_MODEL,
             messages=messages_payload,
-            max_tokens=
+            max_tokens=200,
         )
 
         vlm_text = chat_completion.choices[0].message.content.strip()
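The new system prompt asks the model to answer with a JSON action object, but vlm_text travels onward as raw text. A hypothetical sketch of how a consumer of the response might pull the action out of the reply; extract_action and its fallback defaults are illustrative, and the regex assumes the flat, one-level object the prompt requests:

import json, re

def extract_action(vlm_text: str) -> dict:
    # Find the first {...} block; the model may wrap the JSON in prose.
    match = re.search(r"\{.*?\}", vlm_text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            pass
    # Safe default when no parseable action comes back
    return {"move": "stop", "interact": "do nothing"}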
@@ -76,12 +90,13 @@ def process_and_describe(payload: dict):
             "image_url": hf_url,
             "file_size_bytes": size_bytes,
             "robot_id": robot_id,
-            "
+            "vlm_response": vlm_text
         }
 
     except Exception as e:
         return {"error": f"An API error occurred: {str(e)}"}
 
+
 # --- Gradio MCP Interface ---
 demo = gr.Interface(
     fn=process_and_describe,
@@ -91,5 +106,4 @@ demo = gr.Interface(
 )
 
 if __name__ == "__main__":
-    # Ensure you have the latest huggingface-hub: pip install --upgrade huggingface-hub Pillow requests
     demo.launch(mcp_server=True)
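For reference, a sketch of how a robot client might call the Space once deployed. The Space id and api_name here are assumptions (gr.Interface exposes /predict by default), not confirmed by this commit:

import base64
from gradio_client import Client

client = Client("OppaAI/Robot_MCP")  # assumed Space id, for illustration only

with open("frame.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "hf_token": "hf_...",   # robot-sent token, per process_and_describe
    "robot_id": "robot-01",
    "image_b64": image_b64,
}

result = client.predict(payload, api_name="/predict")
print(result)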