OppaAI commited on
Commit
95595f1
·
verified ·
1 Parent(s): d2ce059

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -15
app.py CHANGED
@@ -3,41 +3,72 @@ import base64
3
  from PIL import Image
4
  import io
5
  import json
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- # 修改函式以確保它接收一個字典(這是 gradio_client 預設發送的格式)
8
  def process(payload):
9
  try:
10
- # 如果客戶端已經傳送字典,直接使用 payload:
11
- data = payload
12
-
13
- # decode base64 image
14
  img_bytes = base64.b64decode(data["image_b64"])
15
- img = Image.open(io.BytesIO(img_bytes))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- # This goes to Jetson
18
  reply = {
19
  "received": True,
20
  "robot_id": data.get("robot_id"),
21
- "size": img.size
 
22
  }
23
 
24
- # *** 關鍵修改:回傳一個包含圖片和 JSON 回覆的元組 (tuple) ***
25
- # Gradio 會自動將第一個值賦給第一個輸出元件 (gr.Image)
26
- # 第二個值賦給第二個輸出元件 (gr.JSON)
27
  return img, reply
28
 
29
  except Exception as e:
30
- # 發生錯誤時,確保回傳兩個值,其中圖片值為 None
31
  return None, {"error": str(e)}
32
 
 
 
 
33
 
34
  demo = gr.Interface(
35
  fn=process,
36
- # 我們將輸入定義為 JSON,這允許後端接收字典格式
37
  inputs=gr.JSON(label="Input Payload (Dict format)"),
38
  outputs=[
39
- gr.Image(type="pil", label="Image Preview"), # 現在將接收 img 物件
40
- gr.JSON(label="Reply to Jetson") # 現在將接收 reply 字典
41
  ],
42
  api_name="predict"
43
  )
 
3
  from PIL import Image
4
  import io
5
  import json
6
+ import torch
7
+ from transformers import AutoModelForVision2Seq, AutoProcessor
8
+
9
+ # ------------------------------------------------------------
10
+ # 1. Load VLLM Model (Qwen3-VL-8B-Instruct)
11
+ # ------------------------------------------------------------
12
+
13
+ model_name = "Qwen/Qwen2-VL-7B-Instruct" # HF 官方推薦名稱(VL)
14
+ processor = AutoProcessor.from_pretrained(model_name)
15
+ model = AutoModelForVision2Seq.from_pretrained(
16
+ model_name,
17
+ torch_dtype=torch.float16,
18
+ low_cpu_mem_usage=True
19
+ ).to("cuda")
20
+
21
+ # ------------------------------------------------------------
22
+ # 2. Main Process Function
23
+ # ------------------------------------------------------------
24
 
 
25
  def process(payload):
26
  try:
27
+ # 取得資料
28
+ data = payload
 
 
29
  img_bytes = base64.b64decode(data["image_b64"])
30
+ img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
31
+
32
+ # ------------------------------------------------------------
33
+ # 3. Vision-Language model inference
34
+ # ------------------------------------------------------------
35
+
36
+ prompt = "Describe what you see in this image in detail."
37
+ inputs = processor(images=img, text=prompt, return_tensors="pt").to("cuda", torch.float16)
38
+
39
+ output_ids = model.generate(
40
+ **inputs,
41
+ max_new_tokens=200,
42
+ temperature=0.2
43
+ )
44
+ response_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
45
+
46
+ # ------------------------------------------------------------
47
+ # 4. Return results to Jetson
48
+ # ------------------------------------------------------------
49
 
 
50
  reply = {
51
  "received": True,
52
  "robot_id": data.get("robot_id"),
53
+ "size": img.size,
54
+ "vllm_analysis": response_text
55
  }
56
 
 
 
 
57
  return img, reply
58
 
59
  except Exception as e:
 
60
  return None, {"error": str(e)}
61
 
62
+ # ------------------------------------------------------------
63
+ # 5. Gradio UI
64
+ # ------------------------------------------------------------
65
 
66
  demo = gr.Interface(
67
  fn=process,
 
68
  inputs=gr.JSON(label="Input Payload (Dict format)"),
69
  outputs=[
70
+ gr.Image(type="pil", label="Image Preview"),
71
+ gr.JSON(label="Reply to Jetson")
72
  ],
73
  api_name="predict"
74
  )