OppaAI committed
Commit 6c10eb2 · verified · 1 Parent(s): cb0d5e3

Update app.py

Files changed (1):
  app.py +29 -54
app.py CHANGED
@@ -3,74 +3,49 @@ import base64
 from PIL import Image
 import io
 import json
-import torch
-from transformers import AutoModelForVision2Seq, AutoProcessor
-
-# ------------------------------------------------------------
-# 1. Load VLM model (Qwen2-VL-7B-Instruct)
-# ------------------------------------------------------------
-
-model_name = "Qwen/Qwen2-VL-7B-Instruct"  # officially recommended HF model name (VL)
-processor = AutoProcessor.from_pretrained(model_name)
-model = AutoModelForVision2Seq.from_pretrained(
-    model_name,
-    torch_dtype=torch.float16,
-    low_cpu_mem_usage=True
-).to("cuda")
-
-# ------------------------------------------------------------
-# 2. Main Process Function
-# ------------------------------------------------------------
-
-def process(payload):
+import requests
+
+HF_VLM_API = "https://api-inference.huggingface.co/models/Qwen/Qwen2-VL-7B-Instruct"
+HF_TOKEN = "ROBOT_MCP_TOKEN"  # placeholder; replace with a real HF API token (e.g. a Space secret)
+
+def call_vlm_api(img: Image.Image):
+    # Encode the image as base64 JPEG so the payload stays JSON-serializable
+    buf = io.BytesIO()
+    img.save(buf, format="JPEG")
+    img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
+
+    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+    payload = {"inputs": [{"image": img_b64, "text": "Describe the image in detail."}]}
+
+    resp = requests.post(HF_VLM_API, headers=headers, json=payload, timeout=60)
+    if resp.status_code == 200:
+        return resp.json()[0].get("generated_text", "")
+    else:
+        return f"VLM API error: {resp.status_code}"
+
+def process(payload: dict):
     try:
-        # get the input data
-        data = payload
-        img_bytes = base64.b64decode(data["image_b64"])
+        img_bytes = base64.b64decode(payload["image_b64"])
         img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
 
-        # ------------------------------------------------------------
-        # 3. Vision-Language model inference
-        # ------------------------------------------------------------
-
-        prompt = "Describe what you see in this image in detail."
-        inputs = processor(images=img, text=prompt, return_tensors="pt").to("cuda", torch.float16)
-
-        output_ids = model.generate(
-            **inputs,
-            max_new_tokens=200,
-            temperature=0.2
-        )
-        response_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
-
-        # ------------------------------------------------------------
-        # 4. Return results to Jetson
-        # ------------------------------------------------------------
+        vlm_text = call_vlm_api(img)
 
         reply = {
             "received": True,
-            "robot_id": data.get("robot_id"),
+            "robot_id": payload.get("robot_id", "unknown"),
             "size": img.size,
-            "vllm_analysis": response_text
+            "vllm_analysis": vlm_text
         }
-
         return reply
-
     except Exception as e:
-        return None, {"error": str(e)}
-
-# ------------------------------------------------------------
-# 5. Gradio UI
-# ------------------------------------------------------------
+        return {"error": str(e)}
 
 demo = gr.Interface(
     fn=process,
     inputs=gr.JSON(label="Input Payload (Dict format)"),
-    outputs=[
-        gr.Image(type="pil", label="Image Preview"),
-        gr.JSON(label="Reply to Jetson")
-    ],
+    outputs=gr.JSON(label="Reply to Jetson"),
     api_name="predict"
 )
 
-demo.launch()
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)
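
For reference, a minimal client-side sketch of how the Jetson could call this endpoint once the Space is running. The Space ID, robot ID, and frame path below are placeholders (not part of the commit), and it assumes the gradio_client package:

# Jetson-side sketch: base64-encode a camera frame and send it to the
# Space's /predict endpoint exposed by the gr.Interface above.
import base64
from gradio_client import Client  # pip install gradio_client

client = Client("OppaAI/robot-vlm-space")  # hypothetical Space ID

with open("frame.jpg", "rb") as f:  # example camera frame
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {"robot_id": "jetson-01", "image_b64": img_b64}
reply = client.predict(payload, api_name="/predict")
print(reply)  # e.g. {"received": True, "robot_id": "jetson-01", "size": [640, 480], "vllm_analysis": "..."}

Note that launch(mcp_server=True) needs a recent Gradio with the MCP extra installed (pip install "gradio[mcp]"); with it, process is also exposed as an MCP tool alongside the HTTP API used above.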