Erin Ho committed on
Commit
35d0d51
·
1 Parent(s): e440269
Files changed (1) hide show
  1. modeling_qwen2_rm.py +2 -123
modeling_qwen2_rm.py CHANGED
@@ -26,7 +26,6 @@ import torch
26
  import torch.utils.checkpoint
27
  from torch import nn
28
  from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
29
- from transformers.configuration_utils import PretrainedConfig
30
 
31
  from transformers.activations import ACT2FN
32
  from transformers.cache_utils import Cache, DynamicCache#, StaticCache
@@ -46,7 +45,7 @@ from transformers.utils import (
46
  logging,
47
  replace_return_docstrings,
48
  )
49
- # from .configuration_qwen2_rm import Qwen2RMConfig as Qwen2Config
50
 
51
 
52
  if is_flash_attn_2_available():
@@ -56,130 +55,10 @@ if is_flash_attn_2_available():
56
  logger = logging.get_logger(__name__)
57
 
58
 
59
-
60
-
61
-
62
- class Qwen2Config(PretrainedConfig):
63
- r"""
64
- This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
65
- Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
66
- with the defaults will yield a similar configuration to that of
67
- Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
68
-
69
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
70
- documentation from [`PretrainedConfig`] for more information.
71
-
72
-
73
- Args:
74
- vocab_size (`int`, *optional*, defaults to 151936):
75
- Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
76
- `inputs_ids` passed when calling [`Qwen2Model`]
77
- hidden_size (`int`, *optional*, defaults to 4096):
78
- Dimension of the hidden representations.
79
- intermediate_size (`int`, *optional*, defaults to 22016):
80
- Dimension of the MLP representations.
81
- num_hidden_layers (`int`, *optional*, defaults to 32):
82
- Number of hidden layers in the Transformer encoder.
83
- num_attention_heads (`int`, *optional*, defaults to 32):
84
- Number of attention heads for each attention layer in the Transformer encoder.
85
- num_key_value_heads (`int`, *optional*, defaults to 32):
86
- This is the number of key_value heads that should be used to implement Grouped Query Attention. If
87
- `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
88
- `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
89
- converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
90
- by meanpooling all the original heads within that group. For more details checkout [this
91
- paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
92
- hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
93
- The non-linear activation function (function or string) in the decoder.
94
- max_position_embeddings (`int`, *optional*, defaults to 32768):
95
- The maximum sequence length that this model might ever be used with.
96
- initializer_range (`float`, *optional*, defaults to 0.02):
97
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
98
- rms_norm_eps (`float`, *optional*, defaults to 1e-06):
99
- The epsilon used by the rms normalization layers.
100
- use_cache (`bool`, *optional*, defaults to `True`):
101
- Whether or not the model should return the last key/values attentions (not used by all models). Only
102
- relevant if `config.is_decoder=True`.
103
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
104
- Whether the model's input and output word embeddings should be tied.
105
- rope_theta (`float`, *optional*, defaults to 10000.0):
106
- The base period of the RoPE embeddings.
107
- use_sliding_window (`bool`, *optional*, defaults to `False`):
108
- Whether to use sliding window attention.
109
- sliding_window (`int`, *optional*, defaults to 4096):
110
- Sliding window attention (SWA) window size. If not specified, will default to `4096`.
111
- max_window_layers (`int`, *optional*, defaults to 28):
112
- The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
113
- attention_dropout (`float`, *optional*, defaults to 0.0):
114
- The dropout ratio for the attention probabilities.
115
-
116
- ```python
117
- >>> from transformers import Qwen2Model, Qwen2Config
118
-
119
- >>> # Initializing a Qwen2 style configuration
120
- >>> configuration = Qwen2Config()
121
-
122
- >>> # Initializing a model from the Qwen2-7B style configuration
123
- >>> model = Qwen2Model(configuration)
124
-
125
- >>> # Accessing the model configuration
126
- >>> configuration = model.config
127
- ```"""
128
-
129
- model_type = "qwen2"
130
- keys_to_ignore_at_inference = ["past_key_values"]
131
-
132
- def __init__(
133
- self,
134
- vocab_size=151936,
135
- hidden_size=4096,
136
- intermediate_size=22016,
137
- num_hidden_layers=32,
138
- num_attention_heads=32,
139
- num_key_value_heads=32,
140
- hidden_act="silu",
141
- max_position_embeddings=32768,
142
- initializer_range=0.02,
143
- rms_norm_eps=1e-6,
144
- use_cache=True,
145
- tie_word_embeddings=False,
146
- rope_theta=10000.0,
147
- use_sliding_window=False,
148
- sliding_window=4096,
149
- max_window_layers=28,
150
- attention_dropout=0.0,
151
- **kwargs,
152
- ):
153
- self.vocab_size = vocab_size
154
- self.max_position_embeddings = max_position_embeddings
155
- self.hidden_size = hidden_size
156
- self.intermediate_size = intermediate_size
157
- self.num_hidden_layers = num_hidden_layers
158
- self.num_attention_heads = num_attention_heads
159
- self.use_sliding_window = use_sliding_window
160
- self.sliding_window = sliding_window if use_sliding_window else None
161
- self.max_window_layers = max_window_layers
162
-
163
- # for backward compatibility
164
- if num_key_value_heads is None:
165
- num_key_value_heads = num_attention_heads
166
-
167
- self.num_key_value_heads = num_key_value_heads
168
- self.hidden_act = hidden_act
169
- self.initializer_range = initializer_range
170
- self.rms_norm_eps = rms_norm_eps
171
- self.use_cache = use_cache
172
- self.rope_theta = rope_theta
173
- self.attention_dropout = attention_dropout
174
-
175
- super().__init__(
176
- tie_word_embeddings=tie_word_embeddings,
177
- **kwargs,
178
- )
179
-
180
  _CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta"
181
  _CONFIG_FOR_DOC = "Qwen2Config"
182
 
 
183
  # Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
184
  def _prepare_4d_causal_attention_mask_with_cache_position(
185
  attention_mask: torch.Tensor,
 
26
  import torch.utils.checkpoint
27
  from torch import nn
28
  from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
29
 
30
  from transformers.activations import ACT2FN
31
  from transformers.cache_utils import Cache, DynamicCache#, StaticCache
 
45
  logging,
46
  replace_return_docstrings,
47
  )
48
+ from .configuration_qwen2_rm import Qwen2RMConfig as Qwen2Config
49
 
50
 
51
  if is_flash_attn_2_available():
 
55
  logger = logging.get_logger(__name__)
56
 
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  _CHECKPOINT_FOR_DOC = "Qwen/Qwen2-7B-beta"
59
  _CONFIG_FOR_DOC = "Qwen2Config"
60
 
61
+
62
  # Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
63
  def _prepare_4d_causal_attention_mask_with_cache_position(
64
  attention_mask: torch.Tensor,