shruthib committed on
Commit
795a2b1
·
verified ·
1 Parent(s): fd80a76

Update transformers to 4.51.3 (#18)

Browse files

- Update transformers to 4.51.3 (43e59dc346665e6de9e4edfd1f3a4807c629804e)
- Raise error if vision tower receives kwargs (443f857075c53b4af6ed4c00af8036c9cc9a11ce)

Files changed (4) hide show
  1. README.md +2 -2
  2. config.json +19 -2
  3. generation_config.json +1 -1
  4. modeling_maira2.py +32 -8
README.md CHANGED
@@ -84,10 +84,10 @@ pillow
84
  protobuf
85
  sentencepiece
86
  torch
87
- transformers>=4.48.0,<4.49
88
  ```
89
 
90
- Note: MAIRA-2 has last been tested with transformers v4.48.0.
91
 
92
  First, initialise the model and put it in eval mode.
93
  ```python
 
84
  protobuf
85
  sentencepiece
86
  torch
87
+ transformers>=4.48.0,<4.52
88
  ```
89
 
90
+ Note: MAIRA-2 has last been tested with transformers v4.51.3.
91
 
92
  First, initialise the model and put it in eval mode.
93
  ```python
config.json CHANGED
@@ -8,7 +8,6 @@
8
  "AutoModelForVision2Seq": "modeling_maira2.Maira2ForConditionalGeneration"
9
  },
10
  "hidden_size": 4096,
11
- "ignore_index": -100,
12
  "image_seq_length": 576,
13
  "image_token_index": 32204,
14
  "model_type": "maira2",
@@ -21,19 +20,33 @@
21
  "architectures": [
22
  "LlamaForCausalLM"
23
  ],
 
 
 
 
 
 
 
24
  "max_position_embeddings": 4096,
 
25
  "model_type": "llama",
 
 
 
26
  "pad_token_id": 0,
 
27
  "rms_norm_eps": 1e-05,
28
  "rope_scaling": {
29
  "factor": 1.5,
30
  "rope_type": "linear"
31
  },
 
32
  "torch_dtype": "bfloat16",
 
33
  "vocab_size": 32207
34
  },
35
  "torch_dtype": "float32",
36
- "transformers_version": "4.48.0",
37
  "vision_config": {
38
  "apply_layernorm": true,
39
  "architectures": [
@@ -45,11 +58,13 @@
45
  "hidden_dropout_prob": 0.0,
46
  "hidden_size": 768,
47
  "image_size": 518,
 
48
  "layer_norm_eps": 1e-06,
49
  "layerscale_value": 1.0,
50
  "mlp_ratio": 4,
51
  "model_type": "dinov2",
52
  "num_attention_heads": 12,
 
53
  "num_hidden_layers": 12,
54
  "out_features": [
55
  "stage12"
@@ -57,6 +72,7 @@
57
  "out_indices": [
58
  12
59
  ],
 
60
  "qkv_bias": true,
61
  "reshape_hidden_states": false,
62
  "stage_names": [
@@ -75,6 +91,7 @@
75
  "stage12"
76
  ],
77
  "torch_dtype": "float32",
 
78
  "use_swiglu_ffn": false
79
  },
80
  "vision_feature_layer": -1,
 
8
  "AutoModelForVision2Seq": "modeling_maira2.Maira2ForConditionalGeneration"
9
  },
10
  "hidden_size": 4096,
 
11
  "image_seq_length": 576,
12
  "image_token_index": 32204,
13
  "model_type": "maira2",
 
20
  "architectures": [
21
  "LlamaForCausalLM"
22
  ],
23
+ "attention_bias": false,
24
+ "attention_dropout": 0.0,
25
+ "head_dim": 128,
26
+ "hidden_act": "silu",
27
+ "hidden_size": 4096,
28
+ "initializer_range": 0.02,
29
+ "intermediate_size": 11008,
30
  "max_position_embeddings": 4096,
31
+ "mlp_bias": false,
32
  "model_type": "llama",
33
+ "num_attention_heads": 32,
34
+ "num_hidden_layers": 32,
35
+ "num_key_value_heads": 32,
36
  "pad_token_id": 0,
37
+ "pretraining_tp": 1,
38
  "rms_norm_eps": 1e-05,
39
  "rope_scaling": {
40
  "factor": 1.5,
41
  "rope_type": "linear"
42
  },
43
+ "rope_theta": 10000.0,
44
  "torch_dtype": "bfloat16",
45
+ "use_cache": true,
46
  "vocab_size": 32207
47
  },
48
  "torch_dtype": "float32",
49
+ "transformers_version": "4.51.3",
50
  "vision_config": {
51
  "apply_layernorm": true,
52
  "architectures": [
 
58
  "hidden_dropout_prob": 0.0,
59
  "hidden_size": 768,
60
  "image_size": 518,
61
+ "initializer_range": 0.02,
62
  "layer_norm_eps": 1e-06,
63
  "layerscale_value": 1.0,
64
  "mlp_ratio": 4,
65
  "model_type": "dinov2",
66
  "num_attention_heads": 12,
67
+ "num_channels": 3,
68
  "num_hidden_layers": 12,
69
  "out_features": [
70
  "stage12"
 
72
  "out_indices": [
73
  12
74
  ],
75
+ "patch_size": 14,
76
  "qkv_bias": true,
77
  "reshape_hidden_states": false,
78
  "stage_names": [
 
91
  "stage12"
92
  ],
93
  "torch_dtype": "float32",
94
+ "use_mask_token": true,
95
  "use_swiglu_ffn": false
96
  },
97
  "vision_feature_layer": -1,
generation_config.json CHANGED
@@ -5,5 +5,5 @@
5
  "max_length": 4096,
6
  "max_new_tokens": 450,
7
  "pad_token_id": 0,
8
- "transformers_version": "4.48.0"
9
  }
 
5
  "max_length": 4096,
6
  "max_new_tokens": 450,
7
  "pad_token_id": 0,
8
+ "transformers_version": "4.51.3"
9
  }
modeling_maira2.py CHANGED
@@ -2,9 +2,16 @@
2
  # Licensed under the MSRLA License. See LICENSE in the repo root for license information.
3
 
4
 
 
 
5
  import torch
6
  from torch.nn import Linear, Module, Sequential
7
- from transformers import AutoBackbone, AutoModelForCausalLM, LlavaForConditionalGeneration, LlavaPreTrainedModel
 
 
 
 
 
8
  from transformers.activations import ACT2FN
9
  from transformers.utils import check_min_version
10
 
@@ -48,7 +55,6 @@ class Maira2ForConditionalGeneration(LlavaForConditionalGeneration):
48
  config_class = Maira2Config
49
 
50
  def __init__(self, config: Maira2Config) -> None:
51
-
52
  # Check transformers version is at least 4.46.0.dev0 otherwise the model fails
53
  # silently since get_image_features is not called in the forward pass
54
  check_min_version("4.46.0.dev0")
@@ -62,11 +68,17 @@ class Maira2ForConditionalGeneration(LlavaForConditionalGeneration):
62
  config.text_config,
63
  attn_implementation=config._attn_implementation,
64
  )
65
- self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
 
 
66
  self.post_init()
67
 
68
  def get_image_features(
69
- self, pixel_values: torch.FloatTensor, vision_feature_layer: int, vision_feature_select_strategy: str
 
 
 
 
70
  ) -> torch.Tensor:
71
  """
72
  This method extracts the image features from the vision backbone using the specified feature layer and
@@ -74,15 +86,27 @@ class Maira2ForConditionalGeneration(LlavaForConditionalGeneration):
74
  class instead of the `hidden_states` which are used in the default implementation of `get_image_features` in LlavaForConditionalGeneration.
75
  The feature_maps returned by Dinov2Backbone are the hidden_states with a layernorm applied to them.
76
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
 
78
  selected_image_feature = image_outputs.feature_maps[vision_feature_layer]
79
 
80
  if vision_feature_select_strategy == "default":
81
  selected_image_feature = selected_image_feature[:, 1:]
82
- elif vision_feature_select_strategy == "full":
83
- selected_image_feature = selected_image_feature
84
- else:
85
- raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
86
 
87
  image_features = self.multi_modal_projector(selected_image_feature)
88
  return image_features # type: ignore[no-any-return]
 
2
  # Licensed under the MSRLA License. See LICENSE in the repo root for license information.
3
 
4
 
5
+ from typing import Any
6
+
7
  import torch
8
  from torch.nn import Linear, Module, Sequential
9
+ from transformers import (
10
+ AutoBackbone,
11
+ AutoModelForCausalLM,
12
+ LlavaForConditionalGeneration,
13
+ LlavaPreTrainedModel,
14
+ )
15
  from transformers.activations import ACT2FN
16
  from transformers.utils import check_min_version
17
 
 
55
  config_class = Maira2Config
56
 
57
  def __init__(self, config: Maira2Config) -> None:
 
58
  # Check transformers version is at least 4.46.0.dev0 otherwise the model fails
59
  # silently since get_image_features is not called in the forward pass
60
  check_min_version("4.46.0.dev0")
 
68
  config.text_config,
69
  attn_implementation=config._attn_implementation,
70
  )
71
+ self.pad_token_id = (
72
+ self.config.pad_token_id if self.config.pad_token_id is not None else -1
73
+ )
74
  self.post_init()
75
 
76
  def get_image_features(
77
+ self,
78
+ pixel_values: torch.FloatTensor,
79
+ vision_feature_layer: int | list[int],
80
+ vision_feature_select_strategy: str,
81
+ **kwargs: Any,
82
  ) -> torch.Tensor:
83
  """
84
  This method extracts the image features from the vision backbone using the specified feature layer and
 
86
  class instead of the `hidden_states` which are used in the default implementation of `get_image_features` in LlavaForConditionalGeneration.
87
  The feature_maps returned by Dinov2Backbone are the hidden_states with a layernorm applied to them.
88
  """
89
+ if isinstance(vision_feature_layer, list):
90
+ raise ValueError(
91
+ "MAIRA-2 does not support list values for vision_feature_layer."
92
+ )
93
+
94
+ if vision_feature_select_strategy not in ["default", "full"]:
95
+ raise ValueError(
96
+ f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
97
+ )
98
+
99
+ extra_kwargs = {k: v for k, v in kwargs.items() if v is not None}
100
+ if extra_kwargs:
101
+ raise ValueError(
102
+ f"MAIRA-2 does not support passing extra kwargs to the vision tower, received: {extra_kwargs}"
103
+ )
104
  image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
105
+
106
  selected_image_feature = image_outputs.feature_maps[vision_feature_layer]
107
 
108
  if vision_feature_select_strategy == "default":
109
  selected_image_feature = selected_image_feature[:, 1:]
 
 
 
 
110
 
111
  image_features = self.multi_modal_projector(selected_image_feature)
112
  return image_features # type: ignore[no-any-return]