Spaces:

OppaAI
/

Robot_MCP_Server

Sleeping

App Files Files Community

OppaAI committed on Nov 16, 2025

Commit

6c10eb2

verified ·

1 Parent(s): cb0d5e3

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -54

app.py CHANGED Viewed

@@ -3,74 +3,49 @@ import base64
 from PIL import Image
 import io
 import json
-import torch
-from transformers import AutoModelForVision2Seq, AutoProcessor
-# ------------------------------------------------------------
-# 1. Load VLLM Model (Qwen2-VL-7B-Instruct)
-# ------------------------------------------------------------
-model_name = "Qwen/Qwen2-VL-7B-Instruct"  # HF 官方推薦名稱（VL）
-processor = AutoProcessor.from_pretrained(model_name)
-model = AutoModelForVision2Seq.from_pretrained(
-    model_name,
-    torch_dtype=torch.float16,
-    low_cpu_mem_usage=True
-).to("cuda")
-# ------------------------------------------------------------
-# 2. Main Process Function
-# ------------------------------------------------------------
-def process(payload):
     try:
-        # Get the data
-        data = payload
-        img_bytes = base64.b64decode(data["image_b64"])
         img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
-        # ------------------------------------------------------------
-        # 3. Vision-Language model inference
-        # ------------------------------------------------------------
-        prompt = "Describe what you see in this image in detail."
-        inputs = processor(images=img, text=prompt, return_tensors="pt").to("cuda", torch.float16)
-        output_ids = model.generate(
-            **inputs,
-            max_new_tokens=200,
-            temperature=0.2
-        )
-        response_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
-        # ------------------------------------------------------------
-        # 4. Return results to Jetson
-        # ------------------------------------------------------------
         reply = {
             "received": True,
-            "robot_id": data.get("robot_id"),
             "size": img.size,
-            "vllm_analysis": response_text
         }
         return reply
     except Exception as e:
-        return None, {"error": str(e)}
-# ------------------------------------------------------------
-# 5. Gradio UI
-# ------------------------------------------------------------
 demo = gr.Interface(
     fn=process,
     inputs=gr.JSON(label="Input Payload (Dict format)"),
-    outputs=[
-        gr.Image(type="pil", label="Image Preview"),
-        gr.JSON(label="Reply to Jetson")
-    ],
     api_name="predict"
 )
-demo.launch()

 from PIL import Image
 import io
 import json
+import requests
+HF_VLM_API = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-7B-Instruct"
+HF_TOKEN = "ROBOT_MCP_TOKEN"  # API TOKEN
+def call_vlm_api(img: Image):
+    # encode image to bytes
+    buf = io.BytesIO()
+    img.save(buf, format="JPEG")
+    img_bytes = buf.getvalue()
+    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+    payload = {"inputs": [{"image": img_bytes, "text": "Describe the image in detail."}]}
+    resp = requests.post(HF_VLM_API, headers=headers, json=payload, timeout=60)
+    if resp.status_code == 200:
+        return resp.json()[0].get("generated_text", "")
+    else:
+        return f"VLM API error: {resp.status_code}"
+def process(payload: dict):
     try:
+        img_bytes = base64.b64decode(payload["image_b64"])
         img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
+        vlm_text = call_vlm_api(img)
         reply = {
             "received": True,
+            "robot_id": payload.get("robot_id", "unknown"),
             "size": img.size,
+            "vllm_analysis": vlm_text
         }
         return reply
     except Exception as e:
+        return {"error": str(e)}
 demo = gr.Interface(
     fn=process,
     inputs=gr.JSON(label="Input Payload (Dict format)"),
+    outputs=gr.JSON(label="Reply to Jetson"),
     api_name="predict"
 )
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)