OpenMOSS-Team
/

MOSS-VL-Instruct-0408

@@ -94,31 +94,17 @@ We conducted a comprehensive evaluation of **MOSS-VL-Instruct-0408** across four
 ## 🚀 Quickstart
 <details>
-<summary><strong>Queue-based offline inference (Python)</strong></summary>
 <br>
 ```python
-import os
-import queue
-import threading
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 checkpoint = "path/to/checkpoint"
-video_path = "data/example_video.mp4"
-prompt = "Describe the video."
-max_new_tokens = 1024
-temperature = 1.0
-top_k = 50
-top_p = 1.0
-repetition_penalty = 1.0
-video_fps = 1.0
-video_minlen = 8
-video_maxlen = 256
 def load_model(checkpoint: str):
@@ -137,72 +123,36 @@ def load_model(checkpoint: str):
     return model, processor
-if not checkpoint:
-    raise ValueError("Missing `checkpoint`.")
-if not video_path:
-    raise ValueError("Missing `video_path`.")
-if not os.path.isfile(video_path):
-    raise FileNotFoundError(f"Video not found: {video_path}")
 model, processor = load_model(checkpoint)
-new_queries: "queue.Queue[dict]" = queue.Queue()
-output_text_queue: "queue.Queue[str]" = queue.Queue()
-query = {
-    "prompt": prompt,
-    "images": [],
-    "videos": [video_path],
-    "media_kwargs": {
-        "video_fps": video_fps,
-        "video_minlen": video_minlen,
-        "video_maxlen": video_maxlen,
-    },
-    "generate_kwargs": {
-        "temperature": temperature,
-        "top_k": top_k,
-        "top_p": top_p,
-        "max_new_tokens": max_new_tokens,
-        "repetition_penalty": repetition_penalty,
-        "do_sample": False,
-    },
-}
-def drain_output():
-    while True:
-        tok = output_text_queue.get()
-        if tok == "<|round_end|>":
-            break
-        print(tok, end="", flush=True)
-worker = threading.Thread(
-    target=model.offline_generate,
-    args=(processor, new_queries, output_text_queue),
-    kwargs={"vision_chunked_length": 64},
-    daemon=True,
 )
-worker.start()
-new_queries.put(query)
-drain_output()
-new_queries.put({"stop_offline_generate": True})
-worker.join(timeout=5.0)
 ```
-For image-only usage, keep the same template and change:
-- replace `video_path` with `image_path`
-- validate `image_path` instead of `video_path`
-- set `images` to `[image_path]`
-- set `videos` to `[]`
-- remove `media_kwargs` if you do not need video-specific controls
 </details>
 <details>
-<summary><strong>Batched offline inference (Python)</strong></summary>
 <br>
@@ -211,21 +161,8 @@ import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 checkpoint = "path/to/checkpoint"
-shared_generate_kwargs = {
-    "temperature": 1.0,
-    "top_k": 50,
-    "top_p": 1.0,
-    "max_new_tokens": 256,
-    "repetition_penalty": 1.0,
-    "do_sample": False,
-}
-shared_media_kwargs = {
-    "video_fps": 1.0,
-    "video_minlen": 8,
-    "video_maxlen": 256,
-}
 def load_model(checkpoint: str):
@@ -245,55 +182,95 @@ def load_model(checkpoint: str):
 model, processor = load_model(checkpoint)
 queries = [
     {
         "prompt": "Describe sample A.",
         "images": [],
         "videos": ["data/sample_a.mp4"],
-        "media_kwargs": dict(shared_media_kwargs),
-        "generate_kwargs": dict(shared_generate_kwargs),
     },
     {
         "prompt": "Describe sample B.",
         "images": [],
         "videos": ["data/sample_b.mp4"],
-        "media_kwargs": dict(shared_media_kwargs),
-        "generate_kwargs": dict(shared_generate_kwargs),
     },
 ]
 with torch.no_grad():
-    result = model.offline_batch_generate(
-        processor,
-        queries,
-        session_states=None,
-        vision_chunked_length=64,
-    )
 texts = [item["text"] for item in result["results"]]
-session_states = result["session_states"]
-```
-```python
-followup_queries = [
-    {
-        "prompt": "Summarize sample A in one sentence.",
-        "generate_kwargs": dict(shared_generate_kwargs),
-    },
-    {
-        "prompt": "Restart sample B and answer again.",
-        "reset_session": True,
-        "generate_kwargs": dict(shared_generate_kwargs),
-    },
-]
-with torch.no_grad():
-    followup_result = model.offline_batch_generate(
-        processor,
-        followup_queries,
-        session_states=session_states,
-        vision_chunked_length=64,
-    )
 ```
 </details>

 ## 🚀 Quickstart
 <details>
+<summary><strong>Single-image offline inference (Python)</strong></summary>
 <br>
 ```python
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 checkpoint = "path/to/checkpoint"
+image_path = "data/example_image.jpg"
+prompt = "Describe this image."
 def load_model(checkpoint: str):
     return model, processor
 model, processor = load_model(checkpoint)
+text = model.offline_image_generate(
+    processor,
+    prompt=prompt,
+    image=image_path,
+    shortest_edge=4096,
+    longest_edge=16777216,
+    multi_image_max_pixels=201326592,
+    patch_size=16,
+    temporal_patch_size=1,
+    merge_size=2,
+    image_mean=[0.5, 0.5, 0.5],
+    image_std=[0.5, 0.5, 0.5],
+    max_new_tokens=256,
+    temperature=1.0,
+    top_k=50,
+    top_p=1.0,
+    repetition_penalty=1.0,
+    do_sample=False,
+    vision_chunked_length=64,
 )
+print(text)
 ```
 </details>
 <details>
+<summary><strong>Single-video offline inference (Python)</strong></summary>
 <br>
 ```python
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor
 checkpoint = "path/to/checkpoint"
+video_path = "data/example_video.mp4"
+prompt = "Describe this video."
 def load_model(checkpoint: str):
 model, processor = load_model(checkpoint)
+text = model.offline_video_generate(
+    processor,
+    prompt=prompt,
+    video=video_path,
+    shortest_edge=4096,
+    longest_edge=16777216,
+    video_max_pixels=201326592,
+    patch_size=16,
+    temporal_patch_size=1,
+    merge_size=2,
+    video_fps=1.0,
+    min_frames=1,
+    max_frames=256,
+    num_extract_threads=4,
+    image_mean=[0.5, 0.5, 0.5],
+    image_std=[0.5, 0.5, 0.5],
+    max_new_tokens=256,
+    temperature=1.0,
+    top_k=50,
+    top_p=1.0,
+    repetition_penalty=1.0,
+    do_sample=False,
+    vision_chunked_length=64,
+)
+print(text)
+```
+</details>
+<details>
+<summary><strong>Batched offline inference (Python)</strong></summary>
+<br>
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor
+checkpoint = "path/to/checkpoint"
+processor = AutoProcessor.from_pretrained(
+    checkpoint,
+    trust_remote_code=True,
+    frame_extract_num_threads=1,
+)
+model = AutoModelForCausalLM.from_pretrained(
+    checkpoint,
+    trust_remote_code=True,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+)
 queries = [
     {
         "prompt": "Describe sample A.",
         "images": [],
         "videos": ["data/sample_a.mp4"],
+        "media_kwargs": {"video_fps": 1.0, "min_frames": 8, "max_frames": 256},
+        "generate_kwargs": {
+            "temperature": 1.0,
+            "top_k": 50,
+            "top_p": 1.0,
+            "max_new_tokens": 256,
+            "repetition_penalty": 1.0,
+            "do_sample": False,
+        },
     },
     {
         "prompt": "Describe sample B.",
         "images": [],
         "videos": ["data/sample_b.mp4"],
+        "media_kwargs": {"video_fps": 1.0, "min_frames": 8, "max_frames": 256},
+        "generate_kwargs": {
+            "temperature": 1.0,
+            "top_k": 50,
+            "top_p": 1.0,
+            "max_new_tokens": 256,
+            "repetition_penalty": 1.0,
+            "do_sample": False,
+        },
     },
 ]
 with torch.no_grad():
+    result = model.offline_batch_generate(processor, queries, vision_chunked_length=64)
 texts = [item["text"] for item in result["results"]]
 ```
 </details>