Update modeling_moss_vl.py
Browse files- modeling_moss_vl.py +330 -48
modeling_moss_vl.py
CHANGED
|
@@ -14,6 +14,7 @@
|
|
| 14 |
# limitations under the License.
|
| 15 |
"""PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
|
| 16 |
|
|
|
|
| 17 |
from dataclasses import dataclass
|
| 18 |
import queue
|
| 19 |
import threading
|
|
@@ -2160,6 +2161,7 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
|
|
| 2160 |
super().__init__(config)
|
| 2161 |
self.model = MossVLModel(config)
|
| 2162 |
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
|
|
|
|
| 2163 |
|
| 2164 |
self.post_init()
|
| 2165 |
|
|
@@ -2459,7 +2461,65 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
|
|
| 2459 |
|
| 2460 |
return [{"role": "user", "content": content}]
|
| 2461 |
|
| 2462 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2463 |
processed_messages = []
|
| 2464 |
for message in messages:
|
| 2465 |
message_copy = dict(message)
|
|
@@ -2528,7 +2588,11 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
|
|
| 2528 |
|
| 2529 |
def _offline_prepare_inputs(self, processor, query: Dict[str, Any]):
|
| 2530 |
messages = self._offline_prepare_messages(processor, query)
|
| 2531 |
-
input_text = self._offline_prepare_input_text(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2532 |
all_images, all_videos = self._offline_collect_media(messages)
|
| 2533 |
media_kwargs = dict(query.get("media_kwargs") or {})
|
| 2534 |
processor_kwargs = self._offline_build_processor_kwargs(
|
|
@@ -2543,25 +2607,26 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
|
|
| 2543 |
modified_multi_image = False
|
| 2544 |
modified_video = False
|
| 2545 |
|
| 2546 |
-
|
| 2547 |
-
|
| 2548 |
-
|
| 2549 |
-
|
| 2550 |
-
|
| 2551 |
-
|
| 2552 |
-
|
| 2553 |
-
|
| 2554 |
-
|
| 2555 |
-
|
| 2556 |
-
|
| 2557 |
-
|
| 2558 |
-
|
| 2559 |
-
|
| 2560 |
-
|
| 2561 |
-
|
| 2562 |
-
image_proc
|
| 2563 |
-
|
| 2564 |
-
video_proc
|
|
|
|
| 2565 |
|
| 2566 |
text_device = self.get_input_embeddings().weight.device
|
| 2567 |
vision_device = self.visual.patch_embed.proj.weight.device
|
|
@@ -2773,7 +2838,13 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
|
|
| 2773 |
current_session,
|
| 2774 |
)
|
| 2775 |
working_messages_list.append(working_messages)
|
| 2776 |
-
input_texts.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2777 |
|
| 2778 |
all_images, all_videos = self._offline_collect_media(working_messages)
|
| 2779 |
all_images_per_query.append(all_images)
|
|
@@ -2798,31 +2869,32 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
|
|
| 2798 |
modified_video = False
|
| 2799 |
orig_padding_side = None
|
| 2800 |
|
| 2801 |
-
|
| 2802 |
-
|
| 2803 |
-
|
| 2804 |
-
|
| 2805 |
-
|
| 2806 |
-
|
| 2807 |
-
|
| 2808 |
-
|
| 2809 |
-
|
| 2810 |
-
|
| 2811 |
-
|
| 2812 |
-
|
| 2813 |
-
|
| 2814 |
-
|
| 2815 |
-
|
| 2816 |
-
|
| 2817 |
-
|
| 2818 |
-
|
| 2819 |
-
|
| 2820 |
-
|
| 2821 |
-
image_proc
|
| 2822 |
-
|
| 2823 |
-
video_proc
|
| 2824 |
-
|
| 2825 |
-
tokenizer
|
|
|
|
| 2826 |
|
| 2827 |
text_device = self.get_input_embeddings().weight.device
|
| 2828 |
vision_device = self.visual.patch_embed.proj.weight.device
|
|
@@ -2905,7 +2977,11 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
|
|
| 2905 |
for index, (query, session_state) in enumerate(zip(prepared_queries, session_states)):
|
| 2906 |
current_session = [] if query.get("reset_session") or query.get("clear_history") else session_state
|
| 2907 |
working_messages = self._offline_build_session_messages(processor, query, current_session)
|
| 2908 |
-
input_text = self._offline_prepare_input_text(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2909 |
|
| 2910 |
if tokenizer is not None:
|
| 2911 |
token_ids = tokenizer(input_text, add_special_tokens=False)["input_ids"]
|
|
@@ -2972,6 +3048,210 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
|
|
| 2972 |
new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
|
| 2973 |
return processor.decode(new_tokens, skip_special_tokens=True)
|
| 2974 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2975 |
def offline_generate(
|
| 2976 |
self,
|
| 2977 |
processor,
|
|
@@ -2990,6 +3270,8 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
|
|
| 2990 |
- `prompt` / `messages`
|
| 2991 |
- `images` / `videos`
|
| 2992 |
- `media_kwargs` / `generate_kwargs`
|
|
|
|
|
|
|
| 2993 |
- `thinking_mode` (`no_thinking` or `deep_thinking`, plus compatible aliases)
|
| 2994 |
- `system_prompt_type` (`text_image` or `video`, plus compatible aliases)
|
| 2995 |
- `system_prompt` for an explicit override
|
|
|
|
| 14 |
# limitations under the License.
|
| 15 |
"""PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
|
| 16 |
|
| 17 |
+
import copy
|
| 18 |
from dataclasses import dataclass
|
| 19 |
import queue
|
| 20 |
import threading
|
|
|
|
| 2161 |
super().__init__(config)
|
| 2162 |
self.model = MossVLModel(config)
|
| 2163 |
self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
|
| 2164 |
+
self._offline_processor_lock = threading.RLock()
|
| 2165 |
|
| 2166 |
self.post_init()
|
| 2167 |
|
|
|
|
| 2461 |
|
| 2462 |
return [{"role": "user", "content": content}]
|
| 2463 |
|
| 2464 |
+
@staticmethod
def _offline_extract_content_parts(content: Any) -> Tuple[str, List[Any], List[Any]]:
    """Split a message ``content`` payload into text, images and videos.

    Args:
        content: Either a plain string, a list whose items are strings or
            dicts (``{"type": "image", "image": ...}``, ``{"text": ...}``,
            ``{"video": ...}`` / ``{"video_path": ...}``, ...), or any other
            object, which is stringified when truthy.

    Returns:
        A ``(text, images, videos)`` tuple. ``text`` is the concatenation of
        every text fragment in order; ``images`` and ``videos`` hold the media
        objects in the order they were encountered.
    """

    def _is_usable(value: Any) -> bool:
        # Builtin scalars/containers keep plain truthiness semantics, so an
        # empty string or empty list still falls through to the fallback key.
        if value is None:
            return False
        if isinstance(value, (str, bytes, bool, int, float, list, tuple, dict, set, frozenset)):
            return bool(value)
        # Any other object (e.g. array- or tensor-like media) is accepted
        # as-is: calling bool() on such objects may raise, so never do it.
        return True

    def _first_media(entry: Dict[str, Any], primary_key: str, fallback_key: str) -> Any:
        # Equivalent to `entry.get(primary) or entry.get(fallback)` without
        # truth-testing arbitrary media objects.
        candidate = entry.get(primary_key)
        if _is_usable(candidate):
            return candidate
        return entry.get(fallback_key)

    if isinstance(content, str):
        return content, [], []
    if not isinstance(content, list):
        return (str(content) if content else ""), [], []

    text_parts: List[str] = []
    images: List[Any] = []
    videos: List[Any] = []
    for item in content:
        if isinstance(item, str):
            text_parts.append(item)
            continue
        if not isinstance(item, dict):
            # Anything that is neither a dict nor a string is ignored.
            continue

        if item.get("type") == "image" or "image" in item or "image_url" in item:
            image = _first_media(item, "image", "image_url")
            if image is not None:
                images.append(image)
        elif item.get("type") == "video" or "video" in item or "video_path" in item:
            video = _first_media(item, "video", "video_path")
            if video is not None:
                videos.append(video)

        # A media dict may also carry text, so this is checked independently
        # of the image/video branch above.
        if "text" in item and item["text"] is not None:
            text_parts.append(str(item["text"]))

    return "".join(text_parts), images, videos
|
| 2491 |
+
|
| 2492 |
+
@staticmethod
def _offline_resolve_use_template(query: Dict[str, Any]) -> bool:
    """Return the query's ``use_template`` flag coerced to ``bool``.

    A missing key counts as ``False``.
    """
    requested = query.get("use_template", False)
    return True if requested else False
|
| 2495 |
+
|
| 2496 |
+
def _offline_prepare_input_text(
|
| 2497 |
+
self,
|
| 2498 |
+
processor,
|
| 2499 |
+
messages: List[Dict[str, Any]],
|
| 2500 |
+
use_template: bool,
|
| 2501 |
+
) -> str:
|
| 2502 |
+
if not use_template:
|
| 2503 |
+
if any(isinstance(message, dict) and message.get("role") == "system" for message in messages):
|
| 2504 |
+
raise ValueError("system messages require use_template=True")
|
| 2505 |
+
|
| 2506 |
+
parts = ["<|im_start|>"]
|
| 2507 |
+
for message in messages:
|
| 2508 |
+
role = message.get("role", "user") if isinstance(message, dict) else "user"
|
| 2509 |
+
content = message.get("content", "") if isinstance(message, dict) else message
|
| 2510 |
+
text, msg_images, msg_videos = self._offline_extract_content_parts(content)
|
| 2511 |
+
|
| 2512 |
+
if role == "user":
|
| 2513 |
+
media_tokens = ""
|
| 2514 |
+
if msg_images:
|
| 2515 |
+
media_tokens += "<|image|>" * len(msg_images)
|
| 2516 |
+
if msg_videos:
|
| 2517 |
+
media_tokens += "<|video|>" * len(msg_videos)
|
| 2518 |
+
parts.append(f"{media_tokens}{text}")
|
| 2519 |
+
else:
|
| 2520 |
+
parts.append(f"{text}<|im_end|>")
|
| 2521 |
+
return "".join(parts)
|
| 2522 |
+
|
| 2523 |
processed_messages = []
|
| 2524 |
for message in messages:
|
| 2525 |
message_copy = dict(message)
|
|
|
|
| 2588 |
|
| 2589 |
def _offline_prepare_inputs(self, processor, query: Dict[str, Any]):
|
| 2590 |
messages = self._offline_prepare_messages(processor, query)
|
| 2591 |
+
input_text = self._offline_prepare_input_text(
|
| 2592 |
+
processor,
|
| 2593 |
+
messages,
|
| 2594 |
+
use_template=self._offline_resolve_use_template(query),
|
| 2595 |
+
)
|
| 2596 |
all_images, all_videos = self._offline_collect_media(messages)
|
| 2597 |
media_kwargs = dict(query.get("media_kwargs") or {})
|
| 2598 |
processor_kwargs = self._offline_build_processor_kwargs(
|
|
|
|
| 2607 |
modified_multi_image = False
|
| 2608 |
modified_video = False
|
| 2609 |
|
| 2610 |
+
with self._offline_processor_lock:
|
| 2611 |
+
try:
|
| 2612 |
+
multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
|
| 2613 |
+
if multi_image_max_pixels is not None and image_proc is not None:
|
| 2614 |
+
orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
|
| 2615 |
+
image_proc.multi_image_max_pixels = multi_image_max_pixels
|
| 2616 |
+
modified_multi_image = True
|
| 2617 |
+
|
| 2618 |
+
video_max_pixels = media_kwargs.get("video_max_pixels")
|
| 2619 |
+
if video_max_pixels is not None and video_proc is not None:
|
| 2620 |
+
orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
|
| 2621 |
+
video_proc.video_max_pixels = video_max_pixels
|
| 2622 |
+
modified_video = True
|
| 2623 |
+
|
| 2624 |
+
inputs = processor(**processor_kwargs)
|
| 2625 |
+
finally:
|
| 2626 |
+
if modified_multi_image and image_proc is not None:
|
| 2627 |
+
image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
|
| 2628 |
+
if modified_video and video_proc is not None:
|
| 2629 |
+
video_proc.video_max_pixels = orig_video_max_pixels
|
| 2630 |
|
| 2631 |
text_device = self.get_input_embeddings().weight.device
|
| 2632 |
vision_device = self.visual.patch_embed.proj.weight.device
|
|
|
|
| 2838 |
current_session,
|
| 2839 |
)
|
| 2840 |
working_messages_list.append(working_messages)
|
| 2841 |
+
input_texts.append(
|
| 2842 |
+
self._offline_prepare_input_text(
|
| 2843 |
+
processor,
|
| 2844 |
+
working_messages,
|
| 2845 |
+
use_template=self._offline_resolve_use_template(query),
|
| 2846 |
+
)
|
| 2847 |
+
)
|
| 2848 |
|
| 2849 |
all_images, all_videos = self._offline_collect_media(working_messages)
|
| 2850 |
all_images_per_query.append(all_images)
|
|
|
|
| 2869 |
modified_video = False
|
| 2870 |
orig_padding_side = None
|
| 2871 |
|
| 2872 |
+
with self._offline_processor_lock:
|
| 2873 |
+
try:
|
| 2874 |
+
multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
|
| 2875 |
+
if multi_image_max_pixels is not None and image_proc is not None:
|
| 2876 |
+
orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
|
| 2877 |
+
image_proc.multi_image_max_pixels = multi_image_max_pixels
|
| 2878 |
+
modified_multi_image = True
|
| 2879 |
+
|
| 2880 |
+
video_max_pixels = media_kwargs.get("video_max_pixels")
|
| 2881 |
+
if video_max_pixels is not None and video_proc is not None:
|
| 2882 |
+
orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
|
| 2883 |
+
video_proc.video_max_pixels = video_max_pixels
|
| 2884 |
+
modified_video = True
|
| 2885 |
+
|
| 2886 |
+
if tokenizer is not None and hasattr(tokenizer, "padding_side"):
|
| 2887 |
+
orig_padding_side = tokenizer.padding_side
|
| 2888 |
+
tokenizer.padding_side = "left"
|
| 2889 |
+
|
| 2890 |
+
inputs = processor(**processor_kwargs)
|
| 2891 |
+
finally:
|
| 2892 |
+
if modified_multi_image and image_proc is not None:
|
| 2893 |
+
image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
|
| 2894 |
+
if modified_video and video_proc is not None:
|
| 2895 |
+
video_proc.video_max_pixels = orig_video_max_pixels
|
| 2896 |
+
if tokenizer is not None and orig_padding_side is not None:
|
| 2897 |
+
tokenizer.padding_side = orig_padding_side
|
| 2898 |
|
| 2899 |
text_device = self.get_input_embeddings().weight.device
|
| 2900 |
vision_device = self.visual.patch_embed.proj.weight.device
|
|
|
|
| 2977 |
for index, (query, session_state) in enumerate(zip(prepared_queries, session_states)):
|
| 2978 |
current_session = [] if query.get("reset_session") or query.get("clear_history") else session_state
|
| 2979 |
working_messages = self._offline_build_session_messages(processor, query, current_session)
|
| 2980 |
+
input_text = self._offline_prepare_input_text(
|
| 2981 |
+
processor,
|
| 2982 |
+
working_messages,
|
| 2983 |
+
use_template=self._offline_resolve_use_template(query),
|
| 2984 |
+
)
|
| 2985 |
|
| 2986 |
if tokenizer is not None:
|
| 2987 |
token_ids = tokenizer(input_text, add_special_tokens=False)["input_ids"]
|
|
|
|
| 3048 |
new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
|
| 3049 |
return processor.decode(new_tokens, skip_special_tokens=True)
|
| 3050 |
|
| 3051 |
+
@staticmethod
def _offline_capture_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
    """Snapshot the current values of the attributes about to be overridden.

    Args:
        target: Object whose attributes will be read; may be ``None``.
        overrides: Mapping whose keys name the attributes to snapshot.

    Returns:
        ``None`` when ``target`` is ``None`` or ``overrides`` is empty/``None``;
        otherwise a dict mapping each override name to a deep copy of the
        attribute's current value on ``target``.

    Raises:
        AttributeError: If ``target`` lacks one of the named attributes.
    """
    if target is not None and overrides:
        snapshot: Dict[str, Any] = {}
        for attr_name in overrides:
            # Deep copy so later in-place mutation of the live value cannot
            # alter the snapshot.
            snapshot[attr_name] = copy.deepcopy(getattr(target, attr_name))
        return snapshot
    return None
|
| 3056 |
+
|
| 3057 |
+
@staticmethod
def _offline_apply_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> None:
    """Assign every override onto ``target`` as a deep copy.

    Does nothing when ``target`` is ``None`` or ``overrides`` is
    empty/``None``.
    """
    if target is not None and overrides:
        for attr_name in overrides:
            # Copy so the target never shares mutable state with the caller's
            # override mapping.
            setattr(target, attr_name, copy.deepcopy(overrides[attr_name]))
|
| 3063 |
+
|
| 3064 |
+
@staticmethod
def _offline_restore_processor_attrs(target, snapshot: Optional[Dict[str, Any]]) -> None:
    """Write the snapshotted attribute values back onto ``target``.

    Does nothing when ``target`` or ``snapshot`` is ``None``. Each value is
    deep-copied on assignment.
    """
    if snapshot is None or target is None:
        return
    for attr_name in snapshot:
        setattr(target, attr_name, copy.deepcopy(snapshot[attr_name]))
|
| 3070 |
+
|
| 3071 |
+
def _offline_generate_one_with_processor_overrides(
    self,
    processor,
    query: Dict[str, Any],
    image_processor_overrides: Optional[Dict[str, Any]] = None,
    video_processor_overrides: Optional[Dict[str, Any]] = None,
) -> str:
    """Run a single offline generation with temporary processor attributes.

    The image/video sub-processors of ``processor`` have the given attributes
    overridden for the duration of ``self._offline_generate_one`` and are put
    back to their previous values afterwards, even if generation raises.

    Args:
        processor: Object that may expose ``image_processor`` and
            ``video_processor`` attributes; missing ones are treated as
            ``None`` and skipped.
        query: Query dict forwarded unchanged to ``_offline_generate_one``.
        image_processor_overrides: Attribute name -> value to set on the image
            processor, or ``None``.
        video_processor_overrides: Attribute name -> value to set on the video
            processor, or ``None``.

    Returns:
        Whatever ``self._offline_generate_one(processor, query)`` returns.
    """
    image_proc = getattr(processor, "image_processor", None)
    video_proc = getattr(processor, "video_processor", None)

    with self._offline_processor_lock:
        # Snapshot while holding the lock. If it were taken before acquiring
        # the lock, a concurrent call could be mid-override, and this call
        # would record those temporary values and later "restore" them
        # permanently.
        image_snapshot = self._offline_capture_processor_attrs(image_proc, image_processor_overrides)
        video_snapshot = self._offline_capture_processor_attrs(video_proc, video_processor_overrides)
        try:
            self._offline_apply_processor_attrs(image_proc, image_processor_overrides)
            self._offline_apply_processor_attrs(video_proc, video_processor_overrides)
            return self._offline_generate_one(processor, query)
        finally:
            self._offline_restore_processor_attrs(image_proc, image_snapshot)
            self._offline_restore_processor_attrs(video_proc, video_snapshot)
|
| 3091 |
+
|
| 3092 |
+
def offline_image_generate(
    self,
    processor,
    prompt: str = "",
    image: Any = None,
    *,
    shortest_edge: int = 4096,
    longest_edge: int = 16777216,
    multi_image_max_pixels: int = 201326592,
    patch_size: int = 16,
    temporal_patch_size: int = 1,
    merge_size: int = 2,
    image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
    image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
    max_new_tokens: int = 1024,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 1.0,
    repetition_penalty: float = 1.0,
    do_sample: bool = False,
    vision_chunked_length: int = 64,
    use_template: bool = False,
    thinking_mode: Optional[str] = None,
    system_prompt_type: Optional[str] = None,
    system_prompt: Optional[str] = None,
) -> str:
    """
    Generate text offline for one image, exposing the image preprocessing knobs.

    The keyword defaults are intended to mirror `preprocessor_config.json`, so a
    single call shows the complete image preprocessing setup without needing the
    batch entry point.

    The arguments are packed into a query dict (`prompt`, `images`, `videos`,
    `media_kwargs`, `generate_kwargs`, `use_template`, plus the optional
    `thinking_mode` / `system_prompt_type` / `system_prompt` when given) and a
    dict of image-processor attribute overrides; both are handed to
    `_offline_generate_one_with_processor_overrides`, whose result is returned.

    Raises:
        ValueError: If `image` is `None`.
    """
    if image is None:
        raise ValueError("`image` is required.")

    media_kwargs = dict(
        min_pixels=shortest_edge,
        max_pixels=longest_edge,
        multi_image_max_pixels=multi_image_max_pixels,
    )
    generate_kwargs = dict(
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=do_sample,
        vision_chunked_length=vision_chunked_length,
    )
    query: Dict[str, Any] = dict(
        prompt=prompt,
        images=[image],
        videos=[],
        media_kwargs=media_kwargs,
        generate_kwargs=generate_kwargs,
        use_template=use_template,
    )

    # Optional prompt controls are only forwarded when explicitly provided.
    optional_fields = (
        ("thinking_mode", thinking_mode),
        ("system_prompt_type", system_prompt_type),
        ("system_prompt", system_prompt),
    )
    for field_name, field_value in optional_fields:
        if field_value is not None:
            query[field_name] = field_value

    mean_values = None if image_mean is None else list(image_mean)
    std_values = None if image_std is None else list(image_std)
    image_processor_overrides = dict(
        size=dict(shortest_edge=shortest_edge, longest_edge=longest_edge),
        multi_image_max_pixels=multi_image_max_pixels,
        patch_size=patch_size,
        temporal_patch_size=temporal_patch_size,
        merge_size=merge_size,
        image_mean=mean_values,
        image_std=std_values,
    )
    return self._offline_generate_one_with_processor_overrides(
        processor,
        query,
        image_processor_overrides=image_processor_overrides,
    )
|
| 3167 |
+
|
| 3168 |
+
def offline_video_generate(
    self,
    processor,
    prompt: str = "",
    video: Any = None,
    *,
    shortest_edge: int = 4096,
    longest_edge: int = 16777216,
    video_max_pixels: int = 201326592,
    patch_size: int = 16,
    temporal_patch_size: int = 1,
    merge_size: int = 2,
    video_fps: float = 1.0,
    min_frames: int = 1,
    max_frames: int = 256,
    num_extract_threads: int = 4,
    image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
    image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
    max_new_tokens: int = 1024,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 1.0,
    repetition_penalty: float = 1.0,
    do_sample: bool = False,
    vision_chunked_length: int = 64,
    use_template: bool = False,
    thinking_mode: Optional[str] = None,
    system_prompt_type: Optional[str] = None,
    system_prompt: Optional[str] = None,
) -> str:
    """
    Generate text offline for one video, exposing the video preprocessing knobs.

    The keyword defaults are intended to mirror `video_preprocessor_config.json`,
    giving a standalone video entry point that shows the effective preprocessing
    settings.

    The arguments are packed into a query dict (`prompt`, `images`, `videos`,
    `media_kwargs`, `generate_kwargs`, `use_template`, plus the optional
    `thinking_mode` / `system_prompt_type` / `system_prompt` when given) and a
    dict of video-processor attribute overrides; both are handed to
    `_offline_generate_one_with_processor_overrides`, whose result is returned.

    Raises:
        ValueError: If `video` is `None`.
    """
    if video is None:
        raise ValueError("`video` is required.")

    media_kwargs = dict(
        min_pixels=shortest_edge,
        max_pixels=longest_edge,
        video_max_pixels=video_max_pixels,
        video_fps=video_fps,
        min_frames=min_frames,
        max_frames=max_frames,
    )
    generate_kwargs = dict(
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=do_sample,
        vision_chunked_length=vision_chunked_length,
    )
    query: Dict[str, Any] = dict(
        prompt=prompt,
        images=[],
        videos=[video],
        media_kwargs=media_kwargs,
        generate_kwargs=generate_kwargs,
        use_template=use_template,
    )

    # Optional prompt controls are only forwarded when explicitly provided.
    optional_fields = (
        ("thinking_mode", thinking_mode),
        ("system_prompt_type", system_prompt_type),
        ("system_prompt", system_prompt),
    )
    for field_name, field_value in optional_fields:
        if field_value is not None:
            query[field_name] = field_value

    mean_values = None if image_mean is None else list(image_mean)
    std_values = None if image_std is None else list(image_std)
    video_processor_overrides = dict(
        size=dict(shortest_edge=shortest_edge, longest_edge=longest_edge),
        video_max_pixels=video_max_pixels,
        patch_size=patch_size,
        temporal_patch_size=temporal_patch_size,
        merge_size=merge_size,
        video_fps=video_fps,
        min_frames=min_frames,
        max_frames=max_frames,
        num_extract_threads=num_extract_threads,
        image_mean=mean_values,
        image_std=std_values,
    )
    return self._offline_generate_one_with_processor_overrides(
        processor,
        query,
        video_processor_overrides=video_processor_overrides,
    )
|
| 3254 |
+
|
| 3255 |
def offline_generate(
|
| 3256 |
self,
|
| 3257 |
processor,
|
|
|
|
| 3270 |
- `prompt` / `messages`
|
| 3271 |
- `images` / `videos`
|
| 3272 |
- `media_kwargs` / `generate_kwargs`
|
| 3273 |
+
- `use_template` to switch between backend-style pretrain prompting
|
| 3274 |
+
(`False`, default for base) and tokenizer chat template prompting (`True`)
|
| 3275 |
- `thinking_mode` (`no_thinking` or `deep_thinking`, plus compatible aliases)
|
| 3276 |
- `system_prompt_type` (`text_image` or `video`, plus compatible aliases)
|
| 3277 |
- `system_prompt` for an explicit override
|