CCCCyx committed on
Commit
45d5886
·
verified ·
1 Parent(s): c53c874

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +99 -122
README.md CHANGED
@@ -94,31 +94,17 @@ We conducted a comprehensive evaluation of **MOSS-VL-Instruct-0408** across four
94
  ## 🚀 Quickstart
95
 
96
  <details>
97
- <summary><strong>Queue-based offline inference (Python)</strong></summary>
98
 
99
  <br>
100
 
101
  ```python
102
- import os
103
- import queue
104
- import threading
105
-
106
  import torch
107
  from transformers import AutoModelForCausalLM, AutoProcessor
108
 
109
  checkpoint = "path/to/checkpoint"
110
- video_path = "data/example_video.mp4"
111
- prompt = "Describe the video."
112
-
113
- max_new_tokens = 1024
114
- temperature = 1.0
115
- top_k = 50
116
- top_p = 1.0
117
- repetition_penalty = 1.0
118
-
119
- video_fps = 1.0
120
- video_minlen = 8
121
- video_maxlen = 256
122
 
123
 
124
  def load_model(checkpoint: str):
@@ -137,72 +123,36 @@ def load_model(checkpoint: str):
137
  return model, processor
138
 
139
 
140
- if not checkpoint:
141
- raise ValueError("Missing `checkpoint`.")
142
- if not video_path:
143
- raise ValueError("Missing `video_path`.")
144
- if not os.path.isfile(video_path):
145
- raise FileNotFoundError(f"Video not found: {video_path}")
146
-
147
  model, processor = load_model(checkpoint)
148
- new_queries: "queue.Queue[dict]" = queue.Queue()
149
- output_text_queue: "queue.Queue[str]" = queue.Queue()
150
-
151
- query = {
152
- "prompt": prompt,
153
- "images": [],
154
- "videos": [video_path],
155
- "media_kwargs": {
156
- "video_fps": video_fps,
157
- "video_minlen": video_minlen,
158
- "video_maxlen": video_maxlen,
159
- },
160
- "generate_kwargs": {
161
- "temperature": temperature,
162
- "top_k": top_k,
163
- "top_p": top_p,
164
- "max_new_tokens": max_new_tokens,
165
- "repetition_penalty": repetition_penalty,
166
- "do_sample": False,
167
- },
168
- }
169
-
170
-
171
- def drain_output():
172
- while True:
173
- tok = output_text_queue.get()
174
- if tok == "<|round_end|>":
175
- break
176
- print(tok, end="", flush=True)
177
 
178
-
179
- worker = threading.Thread(
180
- target=model.offline_generate,
181
- args=(processor, new_queries, output_text_queue),
182
- kwargs={"vision_chunked_length": 64},
183
- daemon=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  )
185
- worker.start()
186
-
187
- new_queries.put(query)
188
- drain_output()
189
 
190
- new_queries.put({"stop_offline_generate": True})
191
- worker.join(timeout=5.0)
192
  ```
193
 
194
- For image-only usage, keep the same template and change:
195
-
196
- - replace `video_path` with `image_path`
197
- - validate `image_path` instead of `video_path`
198
- - set `images` to `[image_path]`
199
- - set `videos` to `[]`
200
- - remove `media_kwargs` if you do not need video-specific controls
201
-
202
  </details>
203
 
204
  <details>
205
- <summary><strong>Batched offline inference (Python)</strong></summary>
206
 
207
  <br>
208
 
@@ -211,21 +161,8 @@ import torch
211
  from transformers import AutoModelForCausalLM, AutoProcessor
212
 
213
  checkpoint = "path/to/checkpoint"
214
-
215
- shared_generate_kwargs = {
216
- "temperature": 1.0,
217
- "top_k": 50,
218
- "top_p": 1.0,
219
- "max_new_tokens": 256,
220
- "repetition_penalty": 1.0,
221
- "do_sample": False,
222
- }
223
-
224
- shared_media_kwargs = {
225
- "video_fps": 1.0,
226
- "video_minlen": 8,
227
- "video_maxlen": 256,
228
- }
229
 
230
 
231
  def load_model(checkpoint: str):
@@ -245,55 +182,95 @@ def load_model(checkpoint: str):
245
 
246
 
247
  model, processor = load_model(checkpoint)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  queries = [
249
  {
250
  "prompt": "Describe sample A.",
251
  "images": [],
252
  "videos": ["data/sample_a.mp4"],
253
- "media_kwargs": dict(shared_media_kwargs),
254
- "generate_kwargs": dict(shared_generate_kwargs),
 
 
 
 
 
 
 
255
  },
256
  {
257
  "prompt": "Describe sample B.",
258
  "images": [],
259
  "videos": ["data/sample_b.mp4"],
260
- "media_kwargs": dict(shared_media_kwargs),
261
- "generate_kwargs": dict(shared_generate_kwargs),
 
 
 
 
 
 
 
262
  },
263
  ]
264
 
265
  with torch.no_grad():
266
- result = model.offline_batch_generate(
267
- processor,
268
- queries,
269
- session_states=None,
270
- vision_chunked_length=64,
271
- )
272
 
273
  texts = [item["text"] for item in result["results"]]
274
- session_states = result["session_states"]
275
- ```
276
-
277
- ```python
278
- followup_queries = [
279
- {
280
- "prompt": "Summarize sample A in one sentence.",
281
- "generate_kwargs": dict(shared_generate_kwargs),
282
- },
283
- {
284
- "prompt": "Restart sample B and answer again.",
285
- "reset_session": True,
286
- "generate_kwargs": dict(shared_generate_kwargs),
287
- },
288
- ]
289
-
290
- with torch.no_grad():
291
- followup_result = model.offline_batch_generate(
292
- processor,
293
- followup_queries,
294
- session_states=session_states,
295
- vision_chunked_length=64,
296
- )
297
  ```
298
 
299
  </details>
 
94
  ## 🚀 Quickstart
95
 
96
  <details>
97
+ <summary><strong>Single-image offline inference (Python)</strong></summary>
98
 
99
  <br>
100
 
101
  ```python
 
 
 
 
102
  import torch
103
  from transformers import AutoModelForCausalLM, AutoProcessor
104
 
105
  checkpoint = "path/to/checkpoint"
106
+ image_path = "data/example_image.jpg"
107
+ prompt = "Describe this image."
 
 
 
 
 
 
 
 
 
 
108
 
109
 
110
  def load_model(checkpoint: str):
 
123
  return model, processor
124
 
125
 
 
 
 
 
 
 
 
126
  model, processor = load_model(checkpoint)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
+ text = model.offline_image_generate(
129
+ processor,
130
+ prompt=prompt,
131
+ image=image_path,
132
+ shortest_edge=4096,
133
+ longest_edge=16777216,
134
+ multi_image_max_pixels=201326592,
135
+ patch_size=16,
136
+ temporal_patch_size=1,
137
+ merge_size=2,
138
+ image_mean=[0.5, 0.5, 0.5],
139
+ image_std=[0.5, 0.5, 0.5],
140
+ max_new_tokens=256,
141
+ temperature=1.0,
142
+ top_k=50,
143
+ top_p=1.0,
144
+ repetition_penalty=1.0,
145
+ do_sample=False,
146
+ vision_chunked_length=64,
147
  )
 
 
 
 
148
 
149
+ print(text)
 
150
  ```
151
 
 
 
 
 
 
 
 
 
152
  </details>
153
 
154
  <details>
155
+ <summary><strong>Single-video offline inference (Python)</strong></summary>
156
 
157
  <br>
158
 
 
161
  from transformers import AutoModelForCausalLM, AutoProcessor
162
 
163
  checkpoint = "path/to/checkpoint"
164
+ video_path = "data/example_video.mp4"
165
+ prompt = "Describe this video."
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
 
168
  def load_model(checkpoint: str):
 
182
 
183
 
184
  model, processor = load_model(checkpoint)
185
+
186
+ text = model.offline_video_generate(
187
+ processor,
188
+ prompt=prompt,
189
+ video=video_path,
190
+ shortest_edge=4096,
191
+ longest_edge=16777216,
192
+ video_max_pixels=201326592,
193
+ patch_size=16,
194
+ temporal_patch_size=1,
195
+ merge_size=2,
196
+ video_fps=1.0,
197
+ min_frames=1,
198
+ max_frames=256,
199
+ num_extract_threads=4,
200
+ image_mean=[0.5, 0.5, 0.5],
201
+ image_std=[0.5, 0.5, 0.5],
202
+ max_new_tokens=256,
203
+ temperature=1.0,
204
+ top_k=50,
205
+ top_p=1.0,
206
+ repetition_penalty=1.0,
207
+ do_sample=False,
208
+ vision_chunked_length=64,
209
+ )
210
+
211
+ print(text)
212
+ ```
213
+
214
+ </details>
215
+
216
+ <details>
217
+ <summary><strong>Batched offline inference (Python)</strong></summary>
218
+
219
+ <br>
220
+
221
+ ```python
222
+ import torch
223
+ from transformers import AutoModelForCausalLM, AutoProcessor
224
+
225
+ checkpoint = "path/to/checkpoint"
226
+ processor = AutoProcessor.from_pretrained(
227
+ checkpoint,
228
+ trust_remote_code=True,
229
+ frame_extract_num_threads=1,
230
+ )
231
+ model = AutoModelForCausalLM.from_pretrained(
232
+ checkpoint,
233
+ trust_remote_code=True,
234
+ device_map="auto",
235
+ torch_dtype=torch.bfloat16,
236
+ attn_implementation="flash_attention_2",
237
+ )
238
+
239
  queries = [
240
  {
241
  "prompt": "Describe sample A.",
242
  "images": [],
243
  "videos": ["data/sample_a.mp4"],
244
+ "media_kwargs": {"video_fps": 1.0, "min_frames": 8, "max_frames": 256},
245
+ "generate_kwargs": {
246
+ "temperature": 1.0,
247
+ "top_k": 50,
248
+ "top_p": 1.0,
249
+ "max_new_tokens": 256,
250
+ "repetition_penalty": 1.0,
251
+ "do_sample": False,
252
+ },
253
  },
254
  {
255
  "prompt": "Describe sample B.",
256
  "images": [],
257
  "videos": ["data/sample_b.mp4"],
258
+ "media_kwargs": {"video_fps": 1.0, "min_frames": 8, "max_frames": 256},
259
+ "generate_kwargs": {
260
+ "temperature": 1.0,
261
+ "top_k": 50,
262
+ "top_p": 1.0,
263
+ "max_new_tokens": 256,
264
+ "repetition_penalty": 1.0,
265
+ "do_sample": False,
266
+ },
267
  },
268
  ]
269
 
270
  with torch.no_grad():
271
+ result = model.offline_batch_generate(processor, queries, vision_chunked_length=64)
 
 
 
 
 
272
 
273
  texts = [item["text"] for item in result["results"]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  ```
275
 
276
  </details>