CCCCyx committed on
Commit
df57422
·
verified ·
1 Parent(s): 7d687d9

Update modeling_moss_vl.py

Browse files
Files changed (1) hide show
  1. modeling_moss_vl.py +330 -48
modeling_moss_vl.py CHANGED
@@ -14,6 +14,7 @@
14
  # limitations under the License.
15
  """PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
16
 
 
17
  from dataclasses import dataclass
18
  import queue
19
  import threading
@@ -2160,6 +2161,7 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2160
  super().__init__(config)
2161
  self.model = MossVLModel(config)
2162
  self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
 
2163
 
2164
  self.post_init()
2165
 
@@ -2459,7 +2461,65 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2459
 
2460
  return [{"role": "user", "content": content}]
2461
 
2462
- def _offline_prepare_input_text(self, processor, messages: List[Dict[str, Any]]) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2463
  processed_messages = []
2464
  for message in messages:
2465
  message_copy = dict(message)
@@ -2528,7 +2588,11 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2528
 
2529
  def _offline_prepare_inputs(self, processor, query: Dict[str, Any]):
2530
  messages = self._offline_prepare_messages(processor, query)
2531
- input_text = self._offline_prepare_input_text(processor, messages)
 
 
 
 
2532
  all_images, all_videos = self._offline_collect_media(messages)
2533
  media_kwargs = dict(query.get("media_kwargs") or {})
2534
  processor_kwargs = self._offline_build_processor_kwargs(
@@ -2543,25 +2607,26 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2543
  modified_multi_image = False
2544
  modified_video = False
2545
 
2546
- try:
2547
- multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2548
- if multi_image_max_pixels is not None and image_proc is not None:
2549
- orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2550
- image_proc.multi_image_max_pixels = multi_image_max_pixels
2551
- modified_multi_image = True
2552
-
2553
- video_max_pixels = media_kwargs.get("video_max_pixels")
2554
- if video_max_pixels is not None and video_proc is not None:
2555
- orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2556
- video_proc.video_max_pixels = video_max_pixels
2557
- modified_video = True
2558
-
2559
- inputs = processor(**processor_kwargs)
2560
- finally:
2561
- if modified_multi_image and image_proc is not None:
2562
- image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2563
- if modified_video and video_proc is not None:
2564
- video_proc.video_max_pixels = orig_video_max_pixels
 
2565
 
2566
  text_device = self.get_input_embeddings().weight.device
2567
  vision_device = self.visual.patch_embed.proj.weight.device
@@ -2773,7 +2838,13 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2773
  current_session,
2774
  )
2775
  working_messages_list.append(working_messages)
2776
- input_texts.append(self._offline_prepare_input_text(processor, working_messages))
 
 
 
 
 
 
2777
 
2778
  all_images, all_videos = self._offline_collect_media(working_messages)
2779
  all_images_per_query.append(all_images)
@@ -2798,31 +2869,32 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2798
  modified_video = False
2799
  orig_padding_side = None
2800
 
2801
- try:
2802
- multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2803
- if multi_image_max_pixels is not None and image_proc is not None:
2804
- orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2805
- image_proc.multi_image_max_pixels = multi_image_max_pixels
2806
- modified_multi_image = True
2807
-
2808
- video_max_pixels = media_kwargs.get("video_max_pixels")
2809
- if video_max_pixels is not None and video_proc is not None:
2810
- orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2811
- video_proc.video_max_pixels = video_max_pixels
2812
- modified_video = True
2813
-
2814
- if tokenizer is not None and hasattr(tokenizer, "padding_side"):
2815
- orig_padding_side = tokenizer.padding_side
2816
- tokenizer.padding_side = "left"
2817
-
2818
- inputs = processor(**processor_kwargs)
2819
- finally:
2820
- if modified_multi_image and image_proc is not None:
2821
- image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2822
- if modified_video and video_proc is not None:
2823
- video_proc.video_max_pixels = orig_video_max_pixels
2824
- if tokenizer is not None and orig_padding_side is not None:
2825
- tokenizer.padding_side = orig_padding_side
 
2826
 
2827
  text_device = self.get_input_embeddings().weight.device
2828
  vision_device = self.visual.patch_embed.proj.weight.device
@@ -2905,7 +2977,11 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2905
  for index, (query, session_state) in enumerate(zip(prepared_queries, session_states)):
2906
  current_session = [] if query.get("reset_session") or query.get("clear_history") else session_state
2907
  working_messages = self._offline_build_session_messages(processor, query, current_session)
2908
- input_text = self._offline_prepare_input_text(processor, working_messages)
 
 
 
 
2909
 
2910
  if tokenizer is not None:
2911
  token_ids = tokenizer(input_text, add_special_tokens=False)["input_ids"]
@@ -2972,6 +3048,210 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2972
  new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
2973
  return processor.decode(new_tokens, skip_special_tokens=True)
2974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2975
  def offline_generate(
2976
  self,
2977
  processor,
@@ -2990,6 +3270,8 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2990
  - `prompt` / `messages`
2991
  - `images` / `videos`
2992
  - `media_kwargs` / `generate_kwargs`
 
 
2993
  - `thinking_mode` (`no_thinking` or `deep_thinking`, plus compatible aliases)
2994
  - `system_prompt_type` (`text_image` or `video`, plus compatible aliases)
2995
  - `system_prompt` for an explicit override
 
14
  # limitations under the License.
15
  """PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
16
 
17
+ import copy
18
  from dataclasses import dataclass
19
  import queue
20
  import threading
 
2161
  super().__init__(config)
2162
  self.model = MossVLModel(config)
2163
  self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
2164
+ self._offline_processor_lock = threading.RLock()
2165
 
2166
  self.post_init()
2167
 
 
2461
 
2462
  return [{"role": "user", "content": content}]
2463
 
2464
+ @staticmethod
2465
+ def _offline_extract_content_parts(content: Any) -> Tuple[str, List[Any], List[Any]]:
2466
+ if isinstance(content, str):
2467
+ return content, [], []
2468
+ if not isinstance(content, list):
2469
+ return (str(content) if content else ""), [], []
2470
+
2471
+ text_parts: List[str] = []
2472
+ images: List[Any] = []
2473
+ videos: List[Any] = []
2474
+ for item in content:
2475
+ if isinstance(item, dict):
2476
+ if item.get("type") == "image" or "image" in item or "image_url" in item:
2477
+ image = item.get("image") or item.get("image_url")
2478
+ if image is not None:
2479
+ images.append(image)
2480
+ elif item.get("type") == "video" or "video" in item or "video_path" in item:
2481
+ video = item.get("video") or item.get("video_path")
2482
+ if video is not None:
2483
+ videos.append(video)
2484
+
2485
+ if "text" in item and item["text"] is not None:
2486
+ text_parts.append(str(item["text"]))
2487
+ elif isinstance(item, str):
2488
+ text_parts.append(item)
2489
+
2490
+ return "".join(text_parts), images, videos
2491
+
2492
+ @staticmethod
2493
+ def _offline_resolve_use_template(query: Dict[str, Any]) -> bool:
2494
+ return bool(query.get("use_template", False))
2495
+
2496
+ def _offline_prepare_input_text(
2497
+ self,
2498
+ processor,
2499
+ messages: List[Dict[str, Any]],
2500
+ use_template: bool,
2501
+ ) -> str:
2502
+ if not use_template:
2503
+ if any(isinstance(message, dict) and message.get("role") == "system" for message in messages):
2504
+ raise ValueError("system messages require use_template=True")
2505
+
2506
+ parts = ["<|im_start|>"]
2507
+ for message in messages:
2508
+ role = message.get("role", "user") if isinstance(message, dict) else "user"
2509
+ content = message.get("content", "") if isinstance(message, dict) else message
2510
+ text, msg_images, msg_videos = self._offline_extract_content_parts(content)
2511
+
2512
+ if role == "user":
2513
+ media_tokens = ""
2514
+ if msg_images:
2515
+ media_tokens += "<|image|>" * len(msg_images)
2516
+ if msg_videos:
2517
+ media_tokens += "<|video|>" * len(msg_videos)
2518
+ parts.append(f"{media_tokens}{text}")
2519
+ else:
2520
+ parts.append(f"{text}<|im_end|>")
2521
+ return "".join(parts)
2522
+
2523
  processed_messages = []
2524
  for message in messages:
2525
  message_copy = dict(message)
 
2588
 
2589
  def _offline_prepare_inputs(self, processor, query: Dict[str, Any]):
2590
  messages = self._offline_prepare_messages(processor, query)
2591
+ input_text = self._offline_prepare_input_text(
2592
+ processor,
2593
+ messages,
2594
+ use_template=self._offline_resolve_use_template(query),
2595
+ )
2596
  all_images, all_videos = self._offline_collect_media(messages)
2597
  media_kwargs = dict(query.get("media_kwargs") or {})
2598
  processor_kwargs = self._offline_build_processor_kwargs(
 
2607
  modified_multi_image = False
2608
  modified_video = False
2609
 
2610
+ with self._offline_processor_lock:
2611
+ try:
2612
+ multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2613
+ if multi_image_max_pixels is not None and image_proc is not None:
2614
+ orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2615
+ image_proc.multi_image_max_pixels = multi_image_max_pixels
2616
+ modified_multi_image = True
2617
+
2618
+ video_max_pixels = media_kwargs.get("video_max_pixels")
2619
+ if video_max_pixels is not None and video_proc is not None:
2620
+ orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2621
+ video_proc.video_max_pixels = video_max_pixels
2622
+ modified_video = True
2623
+
2624
+ inputs = processor(**processor_kwargs)
2625
+ finally:
2626
+ if modified_multi_image and image_proc is not None:
2627
+ image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2628
+ if modified_video and video_proc is not None:
2629
+ video_proc.video_max_pixels = orig_video_max_pixels
2630
 
2631
  text_device = self.get_input_embeddings().weight.device
2632
  vision_device = self.visual.patch_embed.proj.weight.device
 
2838
  current_session,
2839
  )
2840
  working_messages_list.append(working_messages)
2841
+ input_texts.append(
2842
+ self._offline_prepare_input_text(
2843
+ processor,
2844
+ working_messages,
2845
+ use_template=self._offline_resolve_use_template(query),
2846
+ )
2847
+ )
2848
 
2849
  all_images, all_videos = self._offline_collect_media(working_messages)
2850
  all_images_per_query.append(all_images)
 
2869
  modified_video = False
2870
  orig_padding_side = None
2871
 
2872
+ with self._offline_processor_lock:
2873
+ try:
2874
+ multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2875
+ if multi_image_max_pixels is not None and image_proc is not None:
2876
+ orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2877
+ image_proc.multi_image_max_pixels = multi_image_max_pixels
2878
+ modified_multi_image = True
2879
+
2880
+ video_max_pixels = media_kwargs.get("video_max_pixels")
2881
+ if video_max_pixels is not None and video_proc is not None:
2882
+ orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2883
+ video_proc.video_max_pixels = video_max_pixels
2884
+ modified_video = True
2885
+
2886
+ if tokenizer is not None and hasattr(tokenizer, "padding_side"):
2887
+ orig_padding_side = tokenizer.padding_side
2888
+ tokenizer.padding_side = "left"
2889
+
2890
+ inputs = processor(**processor_kwargs)
2891
+ finally:
2892
+ if modified_multi_image and image_proc is not None:
2893
+ image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2894
+ if modified_video and video_proc is not None:
2895
+ video_proc.video_max_pixels = orig_video_max_pixels
2896
+ if tokenizer is not None and orig_padding_side is not None:
2897
+ tokenizer.padding_side = orig_padding_side
2898
 
2899
  text_device = self.get_input_embeddings().weight.device
2900
  vision_device = self.visual.patch_embed.proj.weight.device
 
2977
  for index, (query, session_state) in enumerate(zip(prepared_queries, session_states)):
2978
  current_session = [] if query.get("reset_session") or query.get("clear_history") else session_state
2979
  working_messages = self._offline_build_session_messages(processor, query, current_session)
2980
+ input_text = self._offline_prepare_input_text(
2981
+ processor,
2982
+ working_messages,
2983
+ use_template=self._offline_resolve_use_template(query),
2984
+ )
2985
 
2986
  if tokenizer is not None:
2987
  token_ids = tokenizer(input_text, add_special_tokens=False)["input_ids"]
 
3048
  new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
3049
  return processor.decode(new_tokens, skip_special_tokens=True)
3050
 
3051
+ @staticmethod
3052
+ def _offline_capture_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
3053
+ if target is None or not overrides:
3054
+ return None
3055
+ return {name: copy.deepcopy(getattr(target, name)) for name in overrides}
3056
+
3057
+ @staticmethod
3058
+ def _offline_apply_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> None:
3059
+ if target is None or not overrides:
3060
+ return
3061
+ for name, value in overrides.items():
3062
+ setattr(target, name, copy.deepcopy(value))
3063
+
3064
+ @staticmethod
3065
+ def _offline_restore_processor_attrs(target, snapshot: Optional[Dict[str, Any]]) -> None:
3066
+ if target is None or snapshot is None:
3067
+ return
3068
+ for name, value in snapshot.items():
3069
+ setattr(target, name, copy.deepcopy(value))
3070
+
3071
+ def _offline_generate_one_with_processor_overrides(
3072
+ self,
3073
+ processor,
3074
+ query: Dict[str, Any],
3075
+ image_processor_overrides: Optional[Dict[str, Any]] = None,
3076
+ video_processor_overrides: Optional[Dict[str, Any]] = None,
3077
+ ) -> str:
3078
+ image_proc = getattr(processor, "image_processor", None)
3079
+ video_proc = getattr(processor, "video_processor", None)
3080
+ image_snapshot = self._offline_capture_processor_attrs(image_proc, image_processor_overrides)
3081
+ video_snapshot = self._offline_capture_processor_attrs(video_proc, video_processor_overrides)
3082
+
3083
+ with self._offline_processor_lock:
3084
+ try:
3085
+ self._offline_apply_processor_attrs(image_proc, image_processor_overrides)
3086
+ self._offline_apply_processor_attrs(video_proc, video_processor_overrides)
3087
+ return self._offline_generate_one(processor, query)
3088
+ finally:
3089
+ self._offline_restore_processor_attrs(image_proc, image_snapshot)
3090
+ self._offline_restore_processor_attrs(video_proc, video_snapshot)
3091
+
3092
+ def offline_image_generate(
3093
+ self,
3094
+ processor,
3095
+ prompt: str = "",
3096
+ image: Any = None,
3097
+ *,
3098
+ shortest_edge: int = 4096,
3099
+ longest_edge: int = 16777216,
3100
+ multi_image_max_pixels: int = 201326592,
3101
+ patch_size: int = 16,
3102
+ temporal_patch_size: int = 1,
3103
+ merge_size: int = 2,
3104
+ image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
3105
+ image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
3106
+ max_new_tokens: int = 1024,
3107
+ temperature: float = 1.0,
3108
+ top_k: int = 50,
3109
+ top_p: float = 1.0,
3110
+ repetition_penalty: float = 1.0,
3111
+ do_sample: bool = False,
3112
+ vision_chunked_length: int = 64,
3113
+ use_template: bool = False,
3114
+ thinking_mode: Optional[str] = None,
3115
+ system_prompt_type: Optional[str] = None,
3116
+ system_prompt: Optional[str] = None,
3117
+ ) -> str:
3118
+ """
3119
+ Single-image offline generation with explicit image preprocessor defaults.
3120
+
3121
+ The default values mirror `preprocessor_config.json` so README examples can
3122
+ surface the full image preprocessing setup without requiring a batch wrapper.
3123
+ """
3124
+ if image is None:
3125
+ raise ValueError("`image` is required.")
3126
+ query: Dict[str, Any] = {
3127
+ "prompt": prompt,
3128
+ "images": [image],
3129
+ "videos": [],
3130
+ "media_kwargs": {
3131
+ "min_pixels": shortest_edge,
3132
+ "max_pixels": longest_edge,
3133
+ "multi_image_max_pixels": multi_image_max_pixels,
3134
+ },
3135
+ "generate_kwargs": {
3136
+ "max_new_tokens": max_new_tokens,
3137
+ "temperature": temperature,
3138
+ "top_k": top_k,
3139
+ "top_p": top_p,
3140
+ "repetition_penalty": repetition_penalty,
3141
+ "do_sample": do_sample,
3142
+ "vision_chunked_length": vision_chunked_length,
3143
+ },
3144
+ "use_template": use_template,
3145
+ }
3146
+ if thinking_mode is not None:
3147
+ query["thinking_mode"] = thinking_mode
3148
+ if system_prompt_type is not None:
3149
+ query["system_prompt_type"] = system_prompt_type
3150
+ if system_prompt is not None:
3151
+ query["system_prompt"] = system_prompt
3152
+
3153
+ image_processor_overrides = {
3154
+ "size": {"shortest_edge": shortest_edge, "longest_edge": longest_edge},
3155
+ "multi_image_max_pixels": multi_image_max_pixels,
3156
+ "patch_size": patch_size,
3157
+ "temporal_patch_size": temporal_patch_size,
3158
+ "merge_size": merge_size,
3159
+ "image_mean": list(image_mean) if image_mean is not None else None,
3160
+ "image_std": list(image_std) if image_std is not None else None,
3161
+ }
3162
+ return self._offline_generate_one_with_processor_overrides(
3163
+ processor,
3164
+ query,
3165
+ image_processor_overrides=image_processor_overrides,
3166
+ )
3167
+
3168
+ def offline_video_generate(
3169
+ self,
3170
+ processor,
3171
+ prompt: str = "",
3172
+ video: Any = None,
3173
+ *,
3174
+ shortest_edge: int = 4096,
3175
+ longest_edge: int = 16777216,
3176
+ video_max_pixels: int = 201326592,
3177
+ patch_size: int = 16,
3178
+ temporal_patch_size: int = 1,
3179
+ merge_size: int = 2,
3180
+ video_fps: float = 1.0,
3181
+ min_frames: int = 1,
3182
+ max_frames: int = 256,
3183
+ num_extract_threads: int = 4,
3184
+ image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
3185
+ image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
3186
+ max_new_tokens: int = 1024,
3187
+ temperature: float = 1.0,
3188
+ top_k: int = 50,
3189
+ top_p: float = 1.0,
3190
+ repetition_penalty: float = 1.0,
3191
+ do_sample: bool = False,
3192
+ vision_chunked_length: int = 64,
3193
+ use_template: bool = False,
3194
+ thinking_mode: Optional[str] = None,
3195
+ system_prompt_type: Optional[str] = None,
3196
+ system_prompt: Optional[str] = None,
3197
+ ) -> str:
3198
+ """
3199
+ Single-video offline generation with explicit video preprocessor defaults.
3200
+
3201
+ The default values mirror `video_preprocessor_config.json` so README examples
3202
+ can show a standalone video entry point with the effective preprocessing knobs.
3203
+ """
3204
+ if video is None:
3205
+ raise ValueError("`video` is required.")
3206
+ query: Dict[str, Any] = {
3207
+ "prompt": prompt,
3208
+ "images": [],
3209
+ "videos": [video],
3210
+ "media_kwargs": {
3211
+ "min_pixels": shortest_edge,
3212
+ "max_pixels": longest_edge,
3213
+ "video_max_pixels": video_max_pixels,
3214
+ "video_fps": video_fps,
3215
+ "min_frames": min_frames,
3216
+ "max_frames": max_frames,
3217
+ },
3218
+ "generate_kwargs": {
3219
+ "max_new_tokens": max_new_tokens,
3220
+ "temperature": temperature,
3221
+ "top_k": top_k,
3222
+ "top_p": top_p,
3223
+ "repetition_penalty": repetition_penalty,
3224
+ "do_sample": do_sample,
3225
+ "vision_chunked_length": vision_chunked_length,
3226
+ },
3227
+ "use_template": use_template,
3228
+ }
3229
+ if thinking_mode is not None:
3230
+ query["thinking_mode"] = thinking_mode
3231
+ if system_prompt_type is not None:
3232
+ query["system_prompt_type"] = system_prompt_type
3233
+ if system_prompt is not None:
3234
+ query["system_prompt"] = system_prompt
3235
+
3236
+ video_processor_overrides = {
3237
+ "size": {"shortest_edge": shortest_edge, "longest_edge": longest_edge},
3238
+ "video_max_pixels": video_max_pixels,
3239
+ "patch_size": patch_size,
3240
+ "temporal_patch_size": temporal_patch_size,
3241
+ "merge_size": merge_size,
3242
+ "video_fps": video_fps,
3243
+ "min_frames": min_frames,
3244
+ "max_frames": max_frames,
3245
+ "num_extract_threads": num_extract_threads,
3246
+ "image_mean": list(image_mean) if image_mean is not None else None,
3247
+ "image_std": list(image_std) if image_std is not None else None,
3248
+ }
3249
+ return self._offline_generate_one_with_processor_overrides(
3250
+ processor,
3251
+ query,
3252
+ video_processor_overrides=video_processor_overrides,
3253
+ )
3254
+
3255
  def offline_generate(
3256
  self,
3257
  processor,
 
3270
  - `prompt` / `messages`
3271
  - `images` / `videos`
3272
  - `media_kwargs` / `generate_kwargs`
3273
+ - `use_template` to switch between backend-style pretrain prompting
3274
+ (`False`, default for base) and tokenizer chat template prompting (`True`)
3275
  - `thinking_mode` (`no_thinking` or `deep_thinking`, plus compatible aliases)
3276
  - `system_prompt_type` (`text_image` or `video`, plus compatible aliases)
3277
  - `system_prompt` for an explicit override