songtianhui commited on
Commit
01d6309
·
1 Parent(s): 46f0dc4

update example code

Browse files
Files changed (1) hide show
  1. README.md +62 -26
README.md CHANGED
@@ -80,36 +80,72 @@ Without introducing any complex architectures or special patterns, we show how e
80
 
81
  # Model Usage
82
 
83
- ## Inference with 🤗 Hugging Face Transformers
84
 
85
- It is recommended to use python=3.10, torch>=2.1.0, and transformers=4.48.2 as the development environment.
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  ```python
88
- from PIL import Image
89
- from transformers import AutoModelForCausalLM, AutoProcessor
90
- model_path = "sthui/SimpleSeg-Kimi-VL"
91
- model = AutoModelForCausalLM.from_pretrained(
92
- model_path,
93
- torch_dtype="auto",
94
- device_map="auto",
95
- trust_remote_code=True,
96
- )
97
- processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  image_path = "./figures/octopus.png"
99
- image = Image.open(image_path)
100
- messages = [
101
- {"role": "user", "content": [{"type": "image", "image": image_path}, {"type": "text", "text": "Output the polygon coordinates of octopus in the image."}]}
102
- ]
103
- text = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
104
- inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device)
105
- generated_ids = model.generate(**inputs, max_new_tokens=512)
106
- generated_ids_trimmed = [
107
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
108
- ]
109
- response = processor.batch_decode(
110
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
111
- )[0]
112
- print(response)
113
  ```
114
 
115
 
 
80
 
81
  # Model Usage
82
 
83
+ ## Inference
84
 
85
+ We recommend using vLLM for production deployment. This requires `vllm>=0.12.0` and passing `--trust-remote-code`.
86
+
87
+ First, start the vLLM server:
88
+
89
+ ```bash
90
+ vllm serve sthui/SimpleSeg-Qwen2.5-VL \
91
+ --trust-remote-code \
92
+ --tensor-parallel-size 4 \
93
+ --served-model-name SimpleSeg-Qwen2.5-VL \
94
+ --host 0.0.0.0 \
95
+ --port 8000
96
+ ```
97
+
98
+ Then run the following Python code to perform inference:
99
 
100
  ```python
101
+
102
+ import base64
103
+ from openai import OpenAI
104
+
105
+ # vLLM server configuration
106
+ VLLM_BASE_URL = "http://localhost:8000/v1"
107
+ MODEL_NAME = "SimpleSeg-Qwen2.5-VL" # Should match --served-model-name in vllm serve
108
+
109
+ def encode_image(image_path: str) -> str:
110
+ """Encode image to base64 string."""
111
+ with open(image_path, "rb") as f:
112
+ return base64.b64encode(f.read()).decode()
113
+
114
+ def inference(image_path: str, instruction: str) -> str:
115
+ """Run segmentation inference via vLLM."""
116
+ client = OpenAI(base_url=VLLM_BASE_URL, api_key="EMPTY")
117
+
118
+ messages = [
119
+ {
120
+ "role": "user",
121
+ "content": [
122
+ {
123
+ "type": "image_url",
124
+ "image_url": {"url": f"data:image/png;base64,{encode_image(image_path)}"}
125
+ },
126
+ {"type": "text", "text": instruction},
127
+ ],
128
+ },
129
+ ]
130
+
131
+ response = client.chat.completions.create(
132
+ model=MODEL_NAME,
133
+ messages=messages,
134
+ max_tokens=4096,
135
+ temperature=0,
136
+ )
137
+
138
+ return response.choices[0].message.content
139
+
140
+ # Example usage
141
  image_path = "./figures/octopus.png"
142
+ instruction = "Output the polygon coordinates of octopus in the image."
143
+
144
+ response = inference(image_path, instruction)
145
+ print("Model output:", response)
146
+
147
+
148
+
 
 
 
 
 
 
 
149
  ```
150
 
151