Spaces:

OppaAI
/

Robot_MCP_Server

Sleeping

App Files Files Community

OppaAI commited on Nov 16, 2025

Commit

95595f1

verified ·

1 Parent(s): d2ce059

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -15

app.py CHANGED Viewed

@@ -3,41 +3,72 @@ import base64
 from PIL import Image
 import io
 import json
-# 修改函式以確保它接收一個字典（這是 gradio_client 預設發送的格式）
 def process(payload):
     try:
-        # 如果客戶端已經傳送字典，直接使用 payload:
-        data = payload
-        # decode base64 image
         img_bytes = base64.b64decode(data["image_b64"])
-        img = Image.open(io.BytesIO(img_bytes))
-        # This goes to Jetson
         reply = {
             "received": True,
             "robot_id": data.get("robot_id"),
-            "size": img.size
         }
-        # *** 關鍵修改：回傳一個包含圖片和 JSON 回覆的元組 (tuple) ***
-        # Gradio 會自動將第一個值賦給第一個輸出元件 (gr.Image)
-        # 第二個值賦給第二個輸出元件 (gr.JSON)
         return img, reply
     except Exception as e:
-        # 發生錯誤時，確保回傳兩個值，其中圖片值為 None
         return None, {"error": str(e)}
 demo = gr.Interface(
     fn=process,
-    # 我們將輸入定義為 JSON，這允許後端接收字典格式
     inputs=gr.JSON(label="Input Payload (Dict format)"),
     outputs=[
-        gr.Image(type="pil", label="Image Preview"), # 現在將接收 img 物件
-        gr.JSON(label="Reply to Jetson")            # 現在將接收 reply 字典
     ],
     api_name="predict"
 )

 from PIL import Image
 import io
 import json
+import torch
+from transformers import AutoModelForVision2Seq, AutoProcessor
+# ------------------------------------------------------------
+# 1. Load VLLM Model (Qwen3-VL-8B-Instruct)
+# ------------------------------------------------------------
+model_name = "Qwen/Qwen2-VL-7B-Instruct"  # HF 官方推薦名稱（VL）
+processor = AutoProcessor.from_pretrained(model_name)
+model = AutoModelForVision2Seq.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,
+    low_cpu_mem_usage=True
+).to("cuda")
+# ------------------------------------------------------------
+# 2. Main Process Function
+# ------------------------------------------------------------
 def process(payload):
     try:
+        # 取得資料
+        data = payload
         img_bytes = base64.b64decode(data["image_b64"])
+        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
+        # ------------------------------------------------------------
+        # 3. Vision-Language model inference
+        # ------------------------------------------------------------
+        prompt = "Describe what you see in this image in detail."
+        inputs = processor(images=img, text=prompt, return_tensors="pt").to("cuda", torch.float16)
+        output_ids = model.generate(
+            **inputs,
+            max_new_tokens=200,
+            temperature=0.2
+        )
+        response_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
+        # ------------------------------------------------------------
+        # 4. Return results to Jetson
+        # ------------------------------------------------------------
         reply = {
             "received": True,
             "robot_id": data.get("robot_id"),
+            "size": img.size,
+            "vllm_analysis": response_text
         }
         return img, reply
     except Exception as e:
         return None, {"error": str(e)}
+# ------------------------------------------------------------
+# 5. Gradio UI
+# ------------------------------------------------------------
 demo = gr.Interface(
     fn=process,
     inputs=gr.JSON(label="Input Payload (Dict format)"),
     outputs=[
+        gr.Image(type="pil", label="Image Preview"),
+        gr.JSON(label="Reply to Jetson")
     ],
     api_name="predict"
 )