CCCCyx committed on
Commit
bf6f7a6
·
verified ·
1 Parent(s): aec5558

Update modeling_moss_vl.py

Browse files
Files changed (1) hide show
  1. modeling_moss_vl.py +244 -44
modeling_moss_vl.py CHANGED
@@ -14,6 +14,7 @@
14
  # limitations under the License.
15
  """PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
16
 
 
17
  from dataclasses import dataclass
18
  import queue
19
  import threading
@@ -2160,6 +2161,7 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2160
  super().__init__(config)
2161
  self.model = MossVLModel(config)
2162
  self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
 
2163
 
2164
  self.post_init()
2165
 
@@ -2543,25 +2545,26 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2543
  modified_multi_image = False
2544
  modified_video = False
2545
 
2546
- try:
2547
- multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2548
- if multi_image_max_pixels is not None and image_proc is not None:
2549
- orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2550
- image_proc.multi_image_max_pixels = multi_image_max_pixels
2551
- modified_multi_image = True
2552
-
2553
- video_max_pixels = media_kwargs.get("video_max_pixels")
2554
- if video_max_pixels is not None and video_proc is not None:
2555
- orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2556
- video_proc.video_max_pixels = video_max_pixels
2557
- modified_video = True
2558
-
2559
- inputs = processor(**processor_kwargs)
2560
- finally:
2561
- if modified_multi_image and image_proc is not None:
2562
- image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2563
- if modified_video and video_proc is not None:
2564
- video_proc.video_max_pixels = orig_video_max_pixels
 
2565
 
2566
  text_device = self.get_input_embeddings().weight.device
2567
  vision_device = self.visual.patch_embed.proj.weight.device
@@ -2798,31 +2801,32 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2798
  modified_video = False
2799
  orig_padding_side = None
2800
 
2801
- try:
2802
- multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2803
- if multi_image_max_pixels is not None and image_proc is not None:
2804
- orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2805
- image_proc.multi_image_max_pixels = multi_image_max_pixels
2806
- modified_multi_image = True
2807
-
2808
- video_max_pixels = media_kwargs.get("video_max_pixels")
2809
- if video_max_pixels is not None and video_proc is not None:
2810
- orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2811
- video_proc.video_max_pixels = video_max_pixels
2812
- modified_video = True
2813
-
2814
- if tokenizer is not None and hasattr(tokenizer, "padding_side"):
2815
- orig_padding_side = tokenizer.padding_side
2816
- tokenizer.padding_side = "left"
2817
-
2818
- inputs = processor(**processor_kwargs)
2819
- finally:
2820
- if modified_multi_image and image_proc is not None:
2821
- image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2822
- if modified_video and video_proc is not None:
2823
- video_proc.video_max_pixels = orig_video_max_pixels
2824
- if tokenizer is not None and orig_padding_side is not None:
2825
- tokenizer.padding_side = orig_padding_side
 
2826
 
2827
  text_device = self.get_input_embeddings().weight.device
2828
  vision_device = self.visual.patch_embed.proj.weight.device
@@ -2972,6 +2976,202 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
2972
  new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
2973
  return processor.decode(new_tokens, skip_special_tokens=True)
2974
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2975
  def offline_generate(
2976
  self,
2977
  processor,
 
14
  # limitations under the License.
15
  """PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
16
 
17
+ import copy
18
  from dataclasses import dataclass
19
  import queue
20
  import threading
 
2161
  super().__init__(config)
2162
  self.model = MossVLModel(config)
2163
  self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
2164
+ self._offline_processor_lock = threading.RLock()
2165
 
2166
  self.post_init()
2167
 
 
2545
  modified_multi_image = False
2546
  modified_video = False
2547
 
2548
+ with self._offline_processor_lock:
2549
+ try:
2550
+ multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2551
+ if multi_image_max_pixels is not None and image_proc is not None:
2552
+ orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2553
+ image_proc.multi_image_max_pixels = multi_image_max_pixels
2554
+ modified_multi_image = True
2555
+
2556
+ video_max_pixels = media_kwargs.get("video_max_pixels")
2557
+ if video_max_pixels is not None and video_proc is not None:
2558
+ orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2559
+ video_proc.video_max_pixels = video_max_pixels
2560
+ modified_video = True
2561
+
2562
+ inputs = processor(**processor_kwargs)
2563
+ finally:
2564
+ if modified_multi_image and image_proc is not None:
2565
+ image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2566
+ if modified_video and video_proc is not None:
2567
+ video_proc.video_max_pixels = orig_video_max_pixels
2568
 
2569
  text_device = self.get_input_embeddings().weight.device
2570
  vision_device = self.visual.patch_embed.proj.weight.device
 
2801
  modified_video = False
2802
  orig_padding_side = None
2803
 
2804
+ with self._offline_processor_lock:
2805
+ try:
2806
+ multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
2807
+ if multi_image_max_pixels is not None and image_proc is not None:
2808
+ orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
2809
+ image_proc.multi_image_max_pixels = multi_image_max_pixels
2810
+ modified_multi_image = True
2811
+
2812
+ video_max_pixels = media_kwargs.get("video_max_pixels")
2813
+ if video_max_pixels is not None and video_proc is not None:
2814
+ orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
2815
+ video_proc.video_max_pixels = video_max_pixels
2816
+ modified_video = True
2817
+
2818
+ if tokenizer is not None and hasattr(tokenizer, "padding_side"):
2819
+ orig_padding_side = tokenizer.padding_side
2820
+ tokenizer.padding_side = "left"
2821
+
2822
+ inputs = processor(**processor_kwargs)
2823
+ finally:
2824
+ if modified_multi_image and image_proc is not None:
2825
+ image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
2826
+ if modified_video and video_proc is not None:
2827
+ video_proc.video_max_pixels = orig_video_max_pixels
2828
+ if tokenizer is not None and orig_padding_side is not None:
2829
+ tokenizer.padding_side = orig_padding_side
2830
 
2831
  text_device = self.get_input_embeddings().weight.device
2832
  vision_device = self.visual.patch_embed.proj.weight.device
 
2976
  new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
2977
  return processor.decode(new_tokens, skip_special_tokens=True)
2978
 
2979
+ @staticmethod
2980
+ def _offline_capture_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
2981
+ if target is None or not overrides:
2982
+ return None
2983
+ return {name: copy.deepcopy(getattr(target, name)) for name in overrides}
2984
+
2985
+ @staticmethod
2986
+ def _offline_apply_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> None:
2987
+ if target is None or not overrides:
2988
+ return
2989
+ for name, value in overrides.items():
2990
+ setattr(target, name, copy.deepcopy(value))
2991
+
2992
+ @staticmethod
2993
+ def _offline_restore_processor_attrs(target, snapshot: Optional[Dict[str, Any]]) -> None:
2994
+ if target is None or snapshot is None:
2995
+ return
2996
+ for name, value in snapshot.items():
2997
+ setattr(target, name, copy.deepcopy(value))
2998
+
2999
def _offline_generate_one_with_processor_overrides(
    self,
    processor,
    query: Dict[str, Any],
    image_processor_overrides: Optional[Dict[str, Any]] = None,
    video_processor_overrides: Optional[Dict[str, Any]] = None,
) -> str:
    """Run one offline generation with temporary processor attribute overrides.

    The image/video sub-processors are shared mutable state on *processor*,
    so overrides are applied under ``self._offline_processor_lock`` and
    unconditionally restored afterwards.

    Args:
        processor: HF-style composite processor exposing optional
            ``image_processor`` / ``video_processor`` attributes.
        query: Single-query payload forwarded to ``self._offline_generate_one``.
        image_processor_overrides: Attribute name → value to set on the image
            processor for the duration of this call, or ``None``.
        video_processor_overrides: Same, for the video processor.

    Returns:
        The decoded generation string from ``self._offline_generate_one``.
    """
    image_proc = getattr(processor, "image_processor", None)
    video_proc = getattr(processor, "video_processor", None)

    with self._offline_processor_lock:
        # Snapshot *inside* the lock. Capturing before acquisition could
        # record another thread's temporary overrides (applied while that
        # thread still holds the lock) and later "restore" those instead of
        # the processor's real defaults.
        image_snapshot = self._offline_capture_processor_attrs(image_proc, image_processor_overrides)
        video_snapshot = self._offline_capture_processor_attrs(video_proc, video_processor_overrides)
        try:
            self._offline_apply_processor_attrs(image_proc, image_processor_overrides)
            self._offline_apply_processor_attrs(video_proc, video_processor_overrides)
            return self._offline_generate_one(processor, query)
        finally:
            # Always restore, even if preprocessing or generation raised.
            self._offline_restore_processor_attrs(image_proc, image_snapshot)
            self._offline_restore_processor_attrs(video_proc, video_snapshot)
3019
+
3020
def offline_image_generate(
    self,
    processor,
    prompt: str,
    image: Any,
    *,
    shortest_edge: int = 4096,
    longest_edge: int = 16777216,
    multi_image_max_pixels: int = 201326592,
    patch_size: int = 16,
    temporal_patch_size: int = 1,
    merge_size: int = 2,
    image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
    image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
    max_new_tokens: int = 1024,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 1.0,
    repetition_penalty: float = 1.0,
    do_sample: bool = False,
    vision_chunked_length: int = 64,
    thinking_mode: Optional[str] = None,
    system_prompt_type: Optional[str] = None,
    system_prompt: Optional[str] = None,
) -> str:
    """
    Single-image offline generation with explicit image preprocessor defaults.

    The default values mirror `preprocessor_config.json` so README examples can
    surface the full image preprocessing setup without requiring a batch wrapper.
    """
    # Sampling/decoding knobs forwarded verbatim to the generation path.
    generation_options = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_k": top_k,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": do_sample,
        "vision_chunked_length": vision_chunked_length,
    }
    query: Dict[str, Any] = {
        "prompt": prompt,
        "images": [image],
        "videos": [],
        "media_kwargs": {
            "min_pixels": shortest_edge,
            "max_pixels": longest_edge,
            "multi_image_max_pixels": multi_image_max_pixels,
        },
        "generate_kwargs": generation_options,
    }
    # Optional chat-template knobs are forwarded only when explicitly given.
    for key, value in (
        ("thinking_mode", thinking_mode),
        ("system_prompt_type", system_prompt_type),
        ("system_prompt", system_prompt),
    ):
        if value is not None:
            query[key] = value

    # Temporary attribute overrides for the image preprocessor; applied and
    # restored by the helper under the processor lock.
    image_overrides = {
        "size": {"shortest_edge": shortest_edge, "longest_edge": longest_edge},
        "multi_image_max_pixels": multi_image_max_pixels,
        "patch_size": patch_size,
        "temporal_patch_size": temporal_patch_size,
        "merge_size": merge_size,
        "image_mean": None if image_mean is None else list(image_mean),
        "image_std": None if image_std is None else list(image_std),
    }
    return self._offline_generate_one_with_processor_overrides(
        processor,
        query,
        image_processor_overrides=image_overrides,
    )
3091
+
3092
def offline_video_generate(
    self,
    processor,
    prompt: str,
    video: Any,
    *,
    shortest_edge: int = 4096,
    longest_edge: int = 16777216,
    video_max_pixels: int = 201326592,
    patch_size: int = 16,
    temporal_patch_size: int = 1,
    merge_size: int = 2,
    video_fps: float = 1.0,
    min_frames: int = 1,
    max_frames: int = 256,
    num_extract_threads: int = 4,
    image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
    image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
    max_new_tokens: int = 1024,
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 1.0,
    repetition_penalty: float = 1.0,
    do_sample: bool = False,
    vision_chunked_length: int = 64,
    thinking_mode: Optional[str] = None,
    system_prompt_type: Optional[str] = None,
    system_prompt: Optional[str] = None,
) -> str:
    """
    Single-video offline generation with explicit video preprocessor defaults.

    The default values mirror `video_preprocessor_config.json` so README examples
    can show a standalone video entry point with the effective preprocessing knobs.
    """
    # Sampling/decoding knobs forwarded verbatim to the generation path.
    generation_options = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_k": top_k,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": do_sample,
        "vision_chunked_length": vision_chunked_length,
    }
    query: Dict[str, Any] = {
        "prompt": prompt,
        "images": [],
        "videos": [video],
        "media_kwargs": {
            "min_pixels": shortest_edge,
            "max_pixels": longest_edge,
            "video_max_pixels": video_max_pixels,
            "video_fps": video_fps,
            "min_frames": min_frames,
            "max_frames": max_frames,
        },
        "generate_kwargs": generation_options,
    }
    # Optional chat-template knobs are forwarded only when explicitly given.
    for key, value in (
        ("thinking_mode", thinking_mode),
        ("system_prompt_type", system_prompt_type),
        ("system_prompt", system_prompt),
    ):
        if value is not None:
            query[key] = value

    # Temporary attribute overrides for the video preprocessor; applied and
    # restored by the helper under the processor lock.
    video_overrides = {
        "size": {"shortest_edge": shortest_edge, "longest_edge": longest_edge},
        "video_max_pixels": video_max_pixels,
        "patch_size": patch_size,
        "temporal_patch_size": temporal_patch_size,
        "merge_size": merge_size,
        "video_fps": video_fps,
        "min_frames": min_frames,
        "max_frames": max_frames,
        "num_extract_threads": num_extract_threads,
        "image_mean": None if image_mean is None else list(image_mean),
        "image_std": None if image_std is None else list(image_std),
    }
    return self._offline_generate_one_with_processor_overrides(
        processor,
        query,
        video_processor_overrides=video_overrides,
    )
3174
+
3175
  def offline_generate(
3176
  self,
3177
  processor,