OppaAI committed
Commit dac9550 · verified · 1 Parent(s): 51deb36

Update app.py

Files changed (1):
app.py (+14 −17)
app.py CHANGED
@@ -7,17 +7,10 @@ from huggingface_hub import upload_file, InferenceClient
 from PIL import Image
 
 # --- Config ---
-#HF_TOKEN = os.environ.get("HF_CV_ROBOT_TOKEN")
 HF_DATASET_REPO = "OppaAI/Robot_MCP"
 # Model specifically for VLM (image-to-text) tasks on Hugging Face
 HF_VLM_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
 
-#if not HF_TOKEN:
-#    raise ValueError("HF_TOKEN environment variable not set.")
-
-# Initialize the Hugging Face Inference Client
-hf_client = InferenceClient(token=HF_TOKEN)
-
 # --- Helper Functions ---
 def save_and_upload_image(image_b64):
     """Save image to /tmp and upload to HF dataset."""
@@ -42,33 +35,38 @@ def save_and_upload_image(image_b64):
 # --- Main MCP function ---
 def process_and_describe(payload: dict):
     try:
-        hf_token = payload.get("hf_token", HF_TOKEN)
+        # 1️⃣ Use robot-sent token if available, otherwise fallback
+        hf_token = payload.get("hf_token")
+        if not hf_token:
+            return {"error": "HF token not provided in payload."}
+
         robot_id = payload.get("robot_id", "unknown")
         image_b64 = payload["image_b64"]
 
-        # 1️⃣ Save & upload image (needed for tracking, but B64 is used for VLM call)
+        # 2️⃣ Save image temporarily (for tracking)
         local_tmp_path, hf_url, path_in_repo, size_bytes = save_and_upload_image(image_b64)
-
-        # 2️⃣ Prepare the multimodal message payload for the conversational API
+
+        # 3️⃣ Initialize HF client per request
+        hf_client = InferenceClient(token=hf_token)
+
+        # 4️⃣ Prepare multimodal message payload
         messages_payload = [
             {
                 "role": "user",
                 "content": [
                     {"type": "text", "text": "Describe this image in detail."},
-                    # Pass the original Base64 string directly in the required format
                     {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
                 ],
             }
         ]
 
-        # 3️⃣ Call VLM using hf_client.chat.completions.create (The correct method for 'conversational' task)
+        # 5️⃣ Call VLM
         chat_completion = hf_client.chat.completions.create(
             model=HF_VLM_MODEL,
             messages=messages_payload,
-            max_tokens=150,  # Use max_tokens instead of max_new_tokens for this method
+            max_tokens=150,
         )
-
-        # Extract the text content from the response object
+
         vlm_text = chat_completion.choices[0].message.content.strip()
 
         return {
@@ -82,7 +80,6 @@ def process_and_describe(payload: dict):
         }
 
     except Exception as e:
-        # Added better error handling
        return {"error": f"An API error occurred: {str(e)}"}
 
 # --- Gradio MCP Interface ---
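After this commit the Space no longer reads a server-side HF_TOKEN at import time; each request must carry its own token, and the `InferenceClient` is created per request. Below is a minimal sketch of what a robot-side caller might look like. Only the payload keys (`hf_token`, `robot_id`, `image_b64`) come from app.py; the helper name, file path, and token value are illustrative placeholders.

```python
# Hypothetical caller sketch for process_and_describe (not part of this commit).
import base64

def build_payload(image_path: str, hf_token: str, robot_id: str = "robot_01") -> dict:
    """Base64-encode a JPEG and wrap it in the payload shape app.py expects."""
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")
    return {
        "hf_token": hf_token,    # required: the server now rejects requests without it
        "robot_id": robot_id,    # optional: server defaults to "unknown"
        "image_b64": image_b64,  # raw Base64; the "data:image/jpeg;base64," prefix is added server-side
    }

# payload = build_payload("frame.jpg", hf_token="hf_...")
# result = process_and_describe(payload)  # dict with the VLM description, or {"error": ...}
```

Constructing the client per request means the Space holds no shared credential; each caller's quota and permissions apply to its own VLM call, at the cost of a small per-request setup overhead.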