Spaces:

OppaAI
/

Robot_MCP_Server

Sleeping

File size: 2,352 Bytes

d081bf3
d82a7f0
 
 
 
95595f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d081bf3
d2ce059
afac99d
95595f1
 
afac99d
95595f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
afac99d
eb6d527
 
 
95595f1
 
eb6d527
 
9fc69b3
afac99d
 
eb6d527
 
95595f1
 
 
d081bf3
 
d82a7f0
d2ce059
eb6d527
95595f1
 
eb6d527
afac99d
d081bf3
 
a966c99

import gradio as gr
import base64
from PIL import Image
import io
import json
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor

# ------------------------------------------------------------
# 1. Load VLLM Model (Qwen3-VL-8B-Instruct)
# ------------------------------------------------------------

model_name = "Qwen/Qwen2-VL-7B-Instruct"  # HF 官方推薦名稱（VL）
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
).to("cuda")

# ------------------------------------------------------------
# 2. Main Process Function
# ------------------------------------------------------------

def process(payload):
    try:
        # 取得資料
        data = payload
        img_bytes = base64.b64decode(data["image_b64"])
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")

        # ------------------------------------------------------------
        # 3. Vision-Language model inference
        # ------------------------------------------------------------

        prompt = "Describe what you see in this image in detail."
        inputs = processor(images=img, text=prompt, return_tensors="pt").to("cuda", torch.float16)

        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.2
        )
        response_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]

        # ------------------------------------------------------------
        # 4. Return results to Jetson
        # ------------------------------------------------------------

        reply = {
            "received": True,
            "robot_id": data.get("robot_id"),
            "size": img.size,
            "vllm_analysis": response_text
        }

        return reply

    except Exception as e:
        return None, {"error": str(e)}

# ------------------------------------------------------------
# 5. Gradio UI
# ------------------------------------------------------------

demo = gr.Interface(
    fn=process,
    inputs=gr.JSON(label="Input Payload (Dict format)"),
    outputs=[
        gr.Image(type="pil", label="Image Preview"),
        gr.JSON(label="Reply to Jetson")
    ],
    api_name="predict"
)

demo.launch()