luotingdan committed on
Commit
456ec15
·
1 Parent(s): 77ddf22

update processor config and support transformers 5.0+

Browse files
config.json CHANGED
@@ -4,6 +4,7 @@
4
  ],
5
  "auto_map": {
6
  "AutoConfig": "configuration_step3p7.Step3p7Config",
 
7
  "AutoModelForCausalLM": "modeling_step3p7.Step3p7ForConditionalGeneration"
8
  },
9
  "model_type": "step3p7",
@@ -1213,4 +1214,4 @@
1213
  "vit_large_projector"
1214
  ]
1215
  }
1216
- }
 
4
  ],
5
  "auto_map": {
6
  "AutoConfig": "configuration_step3p7.Step3p7Config",
7
+ "AutoProcessor": "processing_step3.Step3VLProcessor",
8
  "AutoModelForCausalLM": "modeling_step3p7.Step3p7ForConditionalGeneration"
9
  },
10
  "model_type": "step3p7",
 
1214
  "vit_large_projector"
1215
  ]
1216
  }
1217
+ }
configuration_step3p7.py CHANGED
@@ -91,23 +91,10 @@ class Step3p7TextConfig(PretrainedConfig):
91
  **kwargs,
92
  ) -> None:
93
  torch_dtype = kwargs.get("torch_dtype")
94
- layer_types = _normalize_per_layer_values(layer_types,
95
  num_hidden_layers)
96
- swiglu_limits = _normalize_per_layer_values(swiglu_limits,
97
- num_hidden_layers)
98
- swiglu_limits_shared = _normalize_per_layer_values(
99
- swiglu_limits_shared, num_hidden_layers)
100
- partial_rotary_factors = kwargs.get("partial_rotary_factors")
101
- kwargs["partial_rotary_factors"] = _normalize_per_layer_values(
102
- partial_rotary_factors, num_hidden_layers)
103
- if isinstance(rope_theta, list):
104
- rope_theta = _normalize_per_layer_values(rope_theta,
105
- num_hidden_layers)
106
  if isinstance(rope_scaling, dict):
107
  rope_scaling = dict(rope_scaling)
108
- if use_rope_layers:
109
- use_rope_layers = _normalize_per_layer_values(
110
- use_rope_layers, num_hidden_layers)
111
  if share_expert_dim is None:
112
  share_expert_dim = share_expert_dims
113
  self.hidden_size = hidden_size
@@ -128,7 +115,7 @@ class Step3p7TextConfig(PretrainedConfig):
128
  self.head_dim = head_dim
129
  self.norm_expert_weight = norm_expert_weight
130
  self.moe_layers_enum = moe_layers_enum
131
- self.layer_types = layer_types
132
  self.sliding_window = sliding_window
133
  self.pad_token_id = pad_token_id
134
  self.attention_dropout = attention_dropout
@@ -145,6 +132,7 @@ class Step3p7TextConfig(PretrainedConfig):
145
  super().__init__(**kwargs)
146
  if torch_dtype is not None:
147
  self.torch_dtype = torch_dtype
 
148
 
149
  def to_dict(self):
150
  output = super().to_dict()
@@ -216,4 +204,4 @@ class Step3p7Config(PretrainedConfig):
216
  self.max_position_embeddings = text_config.max_position_embeddings
217
  self.image_token_id = image_token_id
218
  # Help Auto classes find the correct implementation when saving/loading.
219
- super().__init__(**kwargs)
 
91
  **kwargs,
92
  ) -> None:
93
  torch_dtype = kwargs.get("torch_dtype")
94
+ trim_layer_types = _normalize_per_layer_values(layer_types,
95
  num_hidden_layers)
 
 
 
 
 
 
 
 
 
 
96
  if isinstance(rope_scaling, dict):
97
  rope_scaling = dict(rope_scaling)
 
 
 
98
  if share_expert_dim is None:
99
  share_expert_dim = share_expert_dims
100
  self.hidden_size = hidden_size
 
115
  self.head_dim = head_dim
116
  self.norm_expert_weight = norm_expert_weight
117
  self.moe_layers_enum = moe_layers_enum
118
+ self.layer_types = trim_layer_types
119
  self.sliding_window = sliding_window
120
  self.pad_token_id = pad_token_id
121
  self.attention_dropout = attention_dropout
 
132
  super().__init__(**kwargs)
133
  if torch_dtype is not None:
134
  self.torch_dtype = torch_dtype
135
+ self.layer_types = layer_types
136
 
137
  def to_dict(self):
138
  output = super().to_dict()
 
204
  self.max_position_embeddings = text_config.max_position_embeddings
205
  self.image_token_id = image_token_id
206
  # Help Auto classes find the correct implementation when saving/loading.
207
+ super().__init__(**kwargs)
modeling_step3p7.py CHANGED
@@ -199,36 +199,40 @@ class Step3p7PreTrainedModel(PreTrainedModel):
199
  class Step3p7RotaryEmbedding(nn.Module):
200
  def __init__(self, config: Step3p7TextConfig, device=None, layer_idx=None):
201
  super().__init__()
202
- # BC: "rope_type" was originally "type"
203
  self.layer_idx = layer_idx
204
- self.original_rope_parameters = None
205
- if config.rope_parameters is not None:
206
- self.original_rope_parameters = config.rope_parameters
207
- config.rope_parameters = dict(config.rope_parameters)
208
- self.rope_type = config.rope_parameters.get(
209
- "rope_type", config.rope_parameters.get("type")
210
- )
211
- else:
212
- self.rope_type = "default"
213
  self.max_seq_len_cached = config.max_position_embeddings
214
  self.original_max_seq_len = config.max_position_embeddings
215
 
216
- partial_rotary_factors = getattr(
217
- config, "partial_rotary_factors", None
218
- )
 
 
 
219
  if partial_rotary_factors is not None:
220
- config.partial_rotary_factor = partial_rotary_factors[self.layer_idx]
221
- else:
222
- config.partial_rotary_factor = 1.0
223
 
224
- self.rope_theta = config.rope_theta
225
- if isinstance(config.rope_theta, list):
226
- self.rope_theta = config.rope_theta.copy()
227
- config.rope_theta = self.rope_theta[self.layer_idx]
228
 
229
  self.config = copy.copy(config)
 
 
 
230
  if config.rope_parameters is not None:
231
- self.config.rope_parameters = dict(config.rope_parameters)
 
 
 
 
 
 
 
 
 
 
232
  self.rope_init_fn = self.compute_default_rope_parameters
233
  if self.rope_type != "default":
234
  self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
@@ -238,8 +242,6 @@ class Step3p7RotaryEmbedding(nn.Module):
238
 
239
  self.register_buffer("inv_freq", inv_freq, persistent=False)
240
  self.original_inv_freq = self.inv_freq
241
- config.rope_theta = self.rope_theta
242
- config.rope_parameters = self.original_rope_parameters
243
 
244
  @torch.no_grad()
245
  @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
@@ -288,10 +290,14 @@ class Step3p7RotaryEmbedding(nn.Module):
288
  post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
289
  """
290
  base = config.rope_theta
291
- dim = (
 
 
 
292
  getattr(config, "head_dim", None)
293
  or config.hidden_size // config.num_attention_heads
294
  )
 
295
 
296
  attention_factor = 1.0 # Unused in this type of RoPE
297
 
@@ -968,7 +974,6 @@ class Step3p7TextModel(Step3p7TextPreTrainedModel, GenerationMixin):
968
  mask_kwargs = {
969
  "config": self.config,
970
  "attention_mask": attention_mask,
971
- "cache_position": cache_position,
972
  "past_key_values": past_key_values,
973
  "position_ids": position_ids,
974
  }
@@ -1381,7 +1386,12 @@ class Step3p7ForConditionalGeneration(Step3p7PreTrainedModel, GenerationMixin):
1381
  **kwargs,
1382
  )
1383
 
1384
- if cache_position[0] == 0:
 
 
 
 
 
1385
  # During cached decoding, input ids no longer contain image tokens,
1386
  # so pixel values should only be passed at the first step.
1387
  model_inputs["pixel_values"] = pixel_values
@@ -1392,4 +1402,4 @@ class Step3p7ForConditionalGeneration(Step3p7PreTrainedModel, GenerationMixin):
1392
  if key.startswith("language_model."):
1393
  return key[len("language_model.") :], True
1394
 
1395
- return key, False
 
199
  class Step3p7RotaryEmbedding(nn.Module):
200
  def __init__(self, config: Step3p7TextConfig, device=None, layer_idx=None):
201
  super().__init__()
 
202
  self.layer_idx = layer_idx
 
 
 
 
 
 
 
 
 
203
  self.max_seq_len_cached = config.max_position_embeddings
204
  self.original_max_seq_len = config.max_position_embeddings
205
 
206
+ rope_theta = config.rope_theta
207
+ if isinstance(rope_theta, list):
208
+ rope_theta = rope_theta[0 if layer_idx is None else layer_idx]
209
+
210
+ partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
211
+ partial_rotary_factors = getattr(config, "partial_rotary_factors", None)
212
  if partial_rotary_factors is not None:
213
+ partial_rotary_factor = partial_rotary_factors[
214
+ 0 if layer_idx is None else layer_idx
215
+ ]
216
 
217
+ self.rope_theta = rope_theta
218
+ self.partial_rotary_factor = partial_rotary_factor
 
 
219
 
220
  self.config = copy.copy(config)
221
+ self.config.rope_theta = rope_theta
222
+ self.config.partial_rotary_factor = partial_rotary_factor
223
+
224
  if config.rope_parameters is not None:
225
+ self.config.rope_parameters = copy.deepcopy(config.rope_parameters)
226
+ self.config.rope_parameters["rope_theta"] = rope_theta
227
+ self.config.rope_parameters["partial_rotary_factor"] = (
228
+ partial_rotary_factor
229
+ )
230
+ self.rope_type = self.config.rope_parameters.get(
231
+ "rope_type", self.config.rope_parameters.get("type")
232
+ )
233
+ else:
234
+ self.rope_type = "default"
235
+
236
  self.rope_init_fn = self.compute_default_rope_parameters
237
  if self.rope_type != "default":
238
  self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
 
242
 
243
  self.register_buffer("inv_freq", inv_freq, persistent=False)
244
  self.original_inv_freq = self.inv_freq
 
 
245
 
246
  @torch.no_grad()
247
  @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
 
290
  post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
291
  """
292
  base = config.rope_theta
293
+ partial_rotary_factor = getattr(
294
+ config, "partial_rotary_factor", 1.0
295
+ )
296
+ head_dim = (
297
  getattr(config, "head_dim", None)
298
  or config.hidden_size // config.num_attention_heads
299
  )
300
+ dim = int(head_dim * partial_rotary_factor)
301
 
302
  attention_factor = 1.0 # Unused in this type of RoPE
303
 
 
974
  mask_kwargs = {
975
  "config": self.config,
976
  "attention_mask": attention_mask,
 
977
  "past_key_values": past_key_values,
978
  "position_ids": position_ids,
979
  }
 
1386
  **kwargs,
1387
  )
1388
 
1389
+ generation_cache_position = model_inputs.get("cache_position", cache_position)
1390
+ is_prefill = past_key_values is None
1391
+ if generation_cache_position is not None and generation_cache_position.numel() > 0:
1392
+ is_prefill = generation_cache_position[0].item() == 0
1393
+
1394
+ if is_prefill:
1395
  # During cached decoding, input ids no longer contain image tokens,
1396
  # so pixel values should only be passed at the first step.
1397
  model_inputs["pixel_values"] = pixel_values
 
1402
  if key.startswith("language_model."):
1403
  return key[len("language_model.") :], True
1404
 
1405
+ return key, False
processing_step3.py CHANGED
@@ -16,6 +16,7 @@ from torchvision.transforms.functional import InterpolationMode
16
  from transformers.feature_extraction_utils import BatchFeature, TensorType
17
  from transformers.image_utils import ImageInput
18
  from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 
19
  from math import ceil
20
  from itertools import product
21
 
@@ -255,6 +256,16 @@ class Step3VLProcessor(ProcessorMixin):
255
  attributes = ["tokenizer"]
256
  tokenizer_class = "AutoTokenizer"
257
 
 
 
 
 
 
 
 
 
 
 
258
  def __init__(
259
  self,
260
  tokenizer=None,
 
16
  from transformers.feature_extraction_utils import BatchFeature, TensorType
17
  from transformers.image_utils import ImageInput
18
  from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
19
+ from transformers.tokenization_utils_tokenizers import TokenizersBackend
20
  from math import ceil
21
  from itertools import product
22
 
 
256
  attributes = ["tokenizer"]
257
  tokenizer_class = "AutoTokenizer"
258
 
259
+ @classmethod
260
+ def _load_tokenizer_from_pretrained(
261
+ cls, sub_processor_type, pretrained_model_name_or_path, subfolder="", **kwargs
262
+ ):
263
+ return TokenizersBackend.from_pretrained(
264
+ pretrained_model_name_or_path,
265
+ subfolder=subfolder,
266
+ **kwargs,
267
+ )
268
+
269
  def __init__(
270
  self,
271
  tokenizer=None,