Update modeling_moss_vl.py
modeling_moss_vl.py  CHANGED  (+244 -44)
@@ -14,6 +14,7 @@
 # limitations under the License.
 """PyTorch MossVL model - Qwen3VL Vision + Text with Cross Attention"""
 
+import copy
 from dataclasses import dataclass
 import queue
 import threading
@@ -2160,6 +2161,7 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
         super().__init__(config)
         self.model = MossVLModel(config)
         self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self._offline_processor_lock = threading.RLock()
 
         self.post_init()
 
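The new `_offline_processor_lock` exists because the offline helpers below temporarily mutate attributes on a processor that may be shared across threads; a reentrant lock also stays safe if one locked helper ends up calling another. A minimal sketch of the hazard the lock closes, with `FakeProcessor`, `lock`, and `run_with_override` as illustrative names that are not part of this diff:

import threading

class FakeProcessor:
    # Stand-in for a shared image/video processor whose attributes are
    # temporarily overridden per call.
    multi_image_max_pixels = 201326592

lock = threading.RLock()  # reentrant, so a locked helper may call another locked helper

def run_with_override(proc, value):
    # Serialize the mutate -> use -> restore window so concurrent callers
    # never observe each other's temporary override.
    with lock:
        orig = proc.multi_image_max_pixels
        try:
            proc.multi_image_max_pixels = value
            return proc.multi_image_max_pixels  # stands in for processor(**processor_kwargs)
        finally:
            proc.multi_image_max_pixels = orig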
@@ -2543,25 +2545,26 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
         modified_multi_image = False
         modified_video = False
 
-        try:
-            multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
-            if multi_image_max_pixels is not None and image_proc is not None:
-                orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
-                image_proc.multi_image_max_pixels = multi_image_max_pixels
-                modified_multi_image = True
-
-            video_max_pixels = media_kwargs.get("video_max_pixels")
-            if video_max_pixels is not None and video_proc is not None:
-                orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
-                video_proc.video_max_pixels = video_max_pixels
-                modified_video = True
-
-            inputs = processor(**processor_kwargs)
-        finally:
-            if modified_multi_image and image_proc is not None:
-                image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
-            if modified_video and video_proc is not None:
-                video_proc.video_max_pixels = orig_video_max_pixels
+        with self._offline_processor_lock:
+            try:
+                multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
+                if multi_image_max_pixels is not None and image_proc is not None:
+                    orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
+                    image_proc.multi_image_max_pixels = multi_image_max_pixels
+                    modified_multi_image = True
+
+                video_max_pixels = media_kwargs.get("video_max_pixels")
+                if video_max_pixels is not None and video_proc is not None:
+                    orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
+                    video_proc.video_max_pixels = video_max_pixels
+                    modified_video = True
+
+                inputs = processor(**processor_kwargs)
+            finally:
+                if modified_multi_image and image_proc is not None:
+                    image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
+                if modified_video and video_proc is not None:
+                    video_proc.video_max_pixels = orig_video_max_pixels
 
         text_device = self.get_input_embeddings().weight.device
         vision_device = self.visual.patch_embed.proj.weight.device
 
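The guarded block above is a snapshot/override/restore pattern: record the old attribute, install the override, run the processor, and put the old value back in a `finally`. As a generic illustration (not part of the diff), the same idea written as a context manager; note the diff itself restores `None` when the attribute was previously absent, whereas this sketch removes the attribute again:

from contextlib import contextmanager

_MISSING = object()

@contextmanager
def temporary_attr(target, name, value):
    # Install `value` as `target.name` for the duration of the block,
    # then restore the previous value (or delete the attribute entirely
    # if it did not exist before).
    orig = getattr(target, name, _MISSING)
    setattr(target, name, value)
    try:
        yield target
    finally:
        if orig is _MISSING:
            delattr(target, name)
        else:
            setattr(target, name, orig)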
@@ -2798,31 +2801,32 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
         modified_video = False
         orig_padding_side = None
 
-        try:
-            multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
-            if multi_image_max_pixels is not None and image_proc is not None:
-                orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
-                image_proc.multi_image_max_pixels = multi_image_max_pixels
-                modified_multi_image = True
-
-            video_max_pixels = media_kwargs.get("video_max_pixels")
-            if video_max_pixels is not None and video_proc is not None:
-                orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
-                video_proc.video_max_pixels = video_max_pixels
-                modified_video = True
-
-            if tokenizer is not None and hasattr(tokenizer, "padding_side"):
-                orig_padding_side = tokenizer.padding_side
-                tokenizer.padding_side = "left"
-
-            inputs = processor(**processor_kwargs)
-        finally:
-            if modified_multi_image and image_proc is not None:
-                image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
-            if modified_video and video_proc is not None:
-                video_proc.video_max_pixels = orig_video_max_pixels
-            if tokenizer is not None and orig_padding_side is not None:
-                tokenizer.padding_side = orig_padding_side
+        with self._offline_processor_lock:
+            try:
+                multi_image_max_pixels = media_kwargs.get("multi_image_max_pixels")
+                if multi_image_max_pixels is not None and image_proc is not None:
+                    orig_multi_image_max_pixels = getattr(image_proc, "multi_image_max_pixels", None)
+                    image_proc.multi_image_max_pixels = multi_image_max_pixels
+                    modified_multi_image = True
+
+                video_max_pixels = media_kwargs.get("video_max_pixels")
+                if video_max_pixels is not None and video_proc is not None:
+                    orig_video_max_pixels = getattr(video_proc, "video_max_pixels", None)
+                    video_proc.video_max_pixels = video_max_pixels
+                    modified_video = True
+
+                if tokenizer is not None and hasattr(tokenizer, "padding_side"):
+                    orig_padding_side = tokenizer.padding_side
+                    tokenizer.padding_side = "left"
+
+                inputs = processor(**processor_kwargs)
+            finally:
+                if modified_multi_image and image_proc is not None:
+                    image_proc.multi_image_max_pixels = orig_multi_image_max_pixels
+                if modified_video and video_proc is not None:
+                    video_proc.video_max_pixels = orig_video_max_pixels
+                if tokenizer is not None and orig_padding_side is not None:
+                    tokenizer.padding_side = orig_padding_side
 
         text_device = self.get_input_embeddings().weight.device
         vision_device = self.visual.patch_embed.proj.weight.device
 
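Forcing `tokenizer.padding_side = "left"` matters for batched decoder-only generation: with left padding every prompt in the batch ends at the same column, so each row's continuation starts exactly at `input_ids.shape[1]` and slices like the `outputs[0][inputs["input_ids"].shape[1]:]` used elsewhere in this file stay valid. A toy, model-free illustration of that invariant:

PAD = 0
prompts = [[7, 8], [1, 2, 3, 4]]
width = max(len(p) for p in prompts)

# Left padding: pads go before the prompt, so all prompts end at column `width`.
left_padded = [[PAD] * (width - len(p)) + p for p in prompts]
generated = [row + [101, 102] for row in left_padded]  # the model appends on the right

for row in generated:
    assert row[width:] == [101, 102]  # new tokens start at the same column in every row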
@@ -2972,6 +2976,202 @@ class MossVLForConditionalGeneration(MossVLPreTrainedModel, GenerationMixin):
         new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
         return processor.decode(new_tokens, skip_special_tokens=True)
 
+    @staticmethod
+    def _offline_capture_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
+        if target is None or not overrides:
+            return None
+        return {name: copy.deepcopy(getattr(target, name)) for name in overrides}
+
+    @staticmethod
+    def _offline_apply_processor_attrs(target, overrides: Optional[Dict[str, Any]]) -> None:
+        if target is None or not overrides:
+            return
+        for name, value in overrides.items():
+            setattr(target, name, copy.deepcopy(value))
+
+    @staticmethod
+    def _offline_restore_processor_attrs(target, snapshot: Optional[Dict[str, Any]]) -> None:
+        if target is None or snapshot is None:
+            return
+        for name, value in snapshot.items():
+            setattr(target, name, copy.deepcopy(value))
+
+    def _offline_generate_one_with_processor_overrides(
+        self,
+        processor,
+        query: Dict[str, Any],
+        image_processor_overrides: Optional[Dict[str, Any]] = None,
+        video_processor_overrides: Optional[Dict[str, Any]] = None,
+    ) -> str:
+        image_proc = getattr(processor, "image_processor", None)
+        video_proc = getattr(processor, "video_processor", None)
+        image_snapshot = self._offline_capture_processor_attrs(image_proc, image_processor_overrides)
+        video_snapshot = self._offline_capture_processor_attrs(video_proc, video_processor_overrides)
+
+        with self._offline_processor_lock:
+            try:
+                self._offline_apply_processor_attrs(image_proc, image_processor_overrides)
+                self._offline_apply_processor_attrs(video_proc, video_processor_overrides)
+                return self._offline_generate_one(processor, query)
+            finally:
+                self._offline_restore_processor_attrs(image_proc, image_snapshot)
+                self._offline_restore_processor_attrs(video_proc, video_snapshot)
+
+    def offline_image_generate(
+        self,
+        processor,
+        prompt: str,
+        image: Any,
+        *,
+        shortest_edge: int = 4096,
+        longest_edge: int = 16777216,
+        multi_image_max_pixels: int = 201326592,
+        patch_size: int = 16,
+        temporal_patch_size: int = 1,
+        merge_size: int = 2,
+        image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
+        image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
+        max_new_tokens: int = 1024,
+        temperature: float = 1.0,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        repetition_penalty: float = 1.0,
+        do_sample: bool = False,
+        vision_chunked_length: int = 64,
+        thinking_mode: Optional[str] = None,
+        system_prompt_type: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+    ) -> str:
+        """
+        Single-image offline generation with explicit image preprocessor defaults.
+
+        The default values mirror `preprocessor_config.json` so README examples can
+        surface the full image preprocessing setup without requiring a batch wrapper.
+        """
+        query: Dict[str, Any] = {
+            "prompt": prompt,
+            "images": [image],
+            "videos": [],
+            "media_kwargs": {
+                "min_pixels": shortest_edge,
+                "max_pixels": longest_edge,
+                "multi_image_max_pixels": multi_image_max_pixels,
+            },
+            "generate_kwargs": {
+                "max_new_tokens": max_new_tokens,
+                "temperature": temperature,
+                "top_k": top_k,
+                "top_p": top_p,
+                "repetition_penalty": repetition_penalty,
+                "do_sample": do_sample,
+                "vision_chunked_length": vision_chunked_length,
+            },
+        }
+        if thinking_mode is not None:
+            query["thinking_mode"] = thinking_mode
+        if system_prompt_type is not None:
+            query["system_prompt_type"] = system_prompt_type
+        if system_prompt is not None:
+            query["system_prompt"] = system_prompt
+
+        image_processor_overrides = {
+            "size": {"shortest_edge": shortest_edge, "longest_edge": longest_edge},
+            "multi_image_max_pixels": multi_image_max_pixels,
+            "patch_size": patch_size,
+            "temporal_patch_size": temporal_patch_size,
+            "merge_size": merge_size,
+            "image_mean": list(image_mean) if image_mean is not None else None,
+            "image_std": list(image_std) if image_std is not None else None,
+        }
+        return self._offline_generate_one_with_processor_overrides(
+            processor,
+            query,
+            image_processor_overrides=image_processor_overrides,
+        )
+
+    def offline_video_generate(
+        self,
+        processor,
+        prompt: str,
+        video: Any,
+        *,
+        shortest_edge: int = 4096,
+        longest_edge: int = 16777216,
+        video_max_pixels: int = 201326592,
+        patch_size: int = 16,
+        temporal_patch_size: int = 1,
+        merge_size: int = 2,
+        video_fps: float = 1.0,
+        min_frames: int = 1,
+        max_frames: int = 256,
+        num_extract_threads: int = 4,
+        image_mean: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
+        image_std: Optional[Union[List[float], Tuple[float, ...]]] = (0.5, 0.5, 0.5),
+        max_new_tokens: int = 1024,
+        temperature: float = 1.0,
+        top_k: int = 50,
+        top_p: float = 1.0,
+        repetition_penalty: float = 1.0,
+        do_sample: bool = False,
+        vision_chunked_length: int = 64,
+        thinking_mode: Optional[str] = None,
+        system_prompt_type: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+    ) -> str:
+        """
+        Single-video offline generation with explicit video preprocessor defaults.
+
+        The default values mirror `video_preprocessor_config.json` so README examples
+        can show a standalone video entry point with the effective preprocessing knobs.
+        """
+        query: Dict[str, Any] = {
+            "prompt": prompt,
+            "images": [],
+            "videos": [video],
+            "media_kwargs": {
+                "min_pixels": shortest_edge,
+                "max_pixels": longest_edge,
+                "video_max_pixels": video_max_pixels,
+                "video_fps": video_fps,
+                "min_frames": min_frames,
+                "max_frames": max_frames,
+            },
+            "generate_kwargs": {
+                "max_new_tokens": max_new_tokens,
+                "temperature": temperature,
+                "top_k": top_k,
+                "top_p": top_p,
+                "repetition_penalty": repetition_penalty,
+                "do_sample": do_sample,
+                "vision_chunked_length": vision_chunked_length,
+            },
+        }
+        if thinking_mode is not None:
+            query["thinking_mode"] = thinking_mode
+        if system_prompt_type is not None:
+            query["system_prompt_type"] = system_prompt_type
+        if system_prompt is not None:
+            query["system_prompt"] = system_prompt
+
+        video_processor_overrides = {
+            "size": {"shortest_edge": shortest_edge, "longest_edge": longest_edge},
+            "video_max_pixels": video_max_pixels,
+            "patch_size": patch_size,
+            "temporal_patch_size": temporal_patch_size,
+            "merge_size": merge_size,
+            "video_fps": video_fps,
+            "min_frames": min_frames,
+            "max_frames": max_frames,
+            "num_extract_threads": num_extract_threads,
+            "image_mean": list(image_mean) if image_mean is not None else None,
+            "image_std": list(image_std) if image_std is not None else None,
+        }
+        return self._offline_generate_one_with_processor_overrides(
+            processor,
+            query,
+            video_processor_overrides=video_processor_overrides,
+        )
+
     def offline_generate(
         self,
         processor,
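Taken together, the new methods give README-style snippets a single-image and a single-video entry point with the preprocessing knobs spelled out, and the snapshot helpers deep-copy attribute values (hence the new `import copy`) so mutable overrides such as the `size` dict cannot be aliased into processor state. A hedged usage sketch based on the signatures above, assuming `model` (a loaded `MossVLForConditionalGeneration`) and its matching `processor` are constructed elsewhere; the file names are placeholders:

answer = model.offline_image_generate(
    processor,
    prompt="Describe this image.",
    image="example.jpg",   # typed Any in the signature; a path or PIL image is the expected input
    max_new_tokens=256,
    do_sample=False,
)

summary = model.offline_video_generate(
    processor,
    prompt="Summarize the clip.",
    video="example.mp4",
    video_fps=1.0,
    max_frames=256,
)
print(answer)
print(summary)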