mmcarpi committed on
Commit
0347e64
·
verified ·
1 Parent(s): 7e9dc48

Upload custom model with source code and tokenizer

Browse files
Files changed (3) hide show
  1. config.json +1 -0
  2. pytorch_model.bin +1 -1
  3. qwen.py +10 -4
config.json CHANGED
@@ -10,6 +10,7 @@
10
  },
11
  "cls_token_id": 1,
12
  "context_length": 1024,
 
13
  "dropout_rate": 0.0,
14
  "embedding_dim": 384,
15
  "head_dim": 128,
 
10
  },
11
  "cls_token_id": 1,
12
  "context_length": 1024,
13
+ "dropout_p": 0.0,
14
  "dropout_rate": 0.0,
15
  "embedding_dim": 384,
16
  "head_dim": 128,
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:393214c1dd86d89df226806c609500210a77fc89e3a0f3da3bbcf78506365afe
3
  size 274585131
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ff9098ed08946c6a31592eceabe3adaeb14c53f03f1882534dc8f295e55a7e5
3
  size 274585131
qwen.py CHANGED
@@ -49,7 +49,7 @@ class FlexQwenConfig(PretrainedConfig):
49
  cls_token_id: int = 1,
50
  pad_token_id: int = 3,
51
  tie_word_embeddings: bool = False,
52
- dropout_rate: float = 0.0,
53
  **kwargs,
54
  ):
55
  super().__init__(
@@ -87,7 +87,7 @@ class FlexQwenConfig(PretrainedConfig):
87
  # Standard HF Config params
88
  self.tie_word_embeddings = tie_word_embeddings
89
 
90
- self.dropout_rate = dropout_rate
91
 
92
 
93
  class FlexQwenPreTrainedModel(PreTrainedModel):
@@ -117,6 +117,7 @@ class GroupedQueryAttention(nn.Module):
117
  head_dim: int | None = None,
118
  qk_norm: bool = False,
119
  rms_norm_eps: float = 1e-6,
 
120
  device: torch.device | None = None,
121
  dtype: torch.dtype | None = None,
122
  ):
@@ -129,6 +130,7 @@ class GroupedQueryAttention(nn.Module):
129
  self.num_heads = num_heads
130
  self.num_kv_groups = num_kv_groups
131
  self.group_size = num_heads // num_kv_groups
 
132
 
133
  if head_dim is None:
134
  assert in_features % num_heads == 0, (
@@ -210,7 +212,7 @@ class GroupedQueryAttention(nn.Module):
210
  key,
211
  value,
212
  attn_mask=attention_mask,
213
- dropout_p=0.0,
214
  enable_gqa=True,
215
  )
216
  out = self.out_proj(
@@ -234,6 +236,7 @@ class Transformer(nn.Module):
234
  moe_num_experts: int = 0,
235
  moe_hidden_dim: int = 128,
236
  rms_norm_eps: float = 1e-6,
 
237
  device: torch.device | None = None,
238
  dtype: torch.dtype | None = None,
239
  ):
@@ -245,6 +248,8 @@ class Transformer(nn.Module):
245
  head_dim=head_dim,
246
  num_kv_groups=num_kv_groups,
247
  qk_norm=qk_norm,
 
 
248
  **factory_kwargs,
249
  )
250
 
@@ -331,6 +336,7 @@ class FlexQwen(FlexQwenPreTrainedModel):
331
  moe_num_experts=config.moe_num_experts,
332
  moe_hidden_dim=config.moe_hidden_dim,
333
  rms_norm_eps=config.rms_norm_eps,
 
334
  device=device,
335
  dtype=dtype,
336
  )
@@ -499,7 +505,7 @@ class FlexQwenForSequenceClassification(FlexQwenPreTrainedModel):
499
  super().__init__(config)
500
  self.num_labels = config.num_labels
501
  self.model = FlexQwen(config, device=device, dtype=dtype)
502
- self.dropout = nn.Dropout(p=config.dropout_rate)
503
  self.score = CastedLinear(
504
  config.embedding_dim,
505
  self.num_labels,
 
49
  cls_token_id: int = 1,
50
  pad_token_id: int = 3,
51
  tie_word_embeddings: bool = False,
52
+ dropout_p: float = 0.0,
53
  **kwargs,
54
  ):
55
  super().__init__(
 
87
  # Standard HF Config params
88
  self.tie_word_embeddings = tie_word_embeddings
89
 
90
+ self.dropout_p = dropout_p
91
 
92
 
93
  class FlexQwenPreTrainedModel(PreTrainedModel):
 
117
  head_dim: int | None = None,
118
  qk_norm: bool = False,
119
  rms_norm_eps: float = 1e-6,
120
+ dropout_p: float = 0.0,
121
  device: torch.device | None = None,
122
  dtype: torch.dtype | None = None,
123
  ):
 
130
  self.num_heads = num_heads
131
  self.num_kv_groups = num_kv_groups
132
  self.group_size = num_heads // num_kv_groups
133
+ self.dropout_p = dropout_p
134
 
135
  if head_dim is None:
136
  assert in_features % num_heads == 0, (
 
212
  key,
213
  value,
214
  attn_mask=attention_mask,
215
+ dropout_p=self.dropout_p,
216
  enable_gqa=True,
217
  )
218
  out = self.out_proj(
 
236
  moe_num_experts: int = 0,
237
  moe_hidden_dim: int = 128,
238
  rms_norm_eps: float = 1e-6,
239
+ dropout_p: float = 0.0,
240
  device: torch.device | None = None,
241
  dtype: torch.dtype | None = None,
242
  ):
 
248
  head_dim=head_dim,
249
  num_kv_groups=num_kv_groups,
250
  qk_norm=qk_norm,
251
+ rms_norm_eps=rms_norm_eps,
252
+ dropout_p=dropout_p,
253
  **factory_kwargs,
254
  )
255
 
 
336
  moe_num_experts=config.moe_num_experts,
337
  moe_hidden_dim=config.moe_hidden_dim,
338
  rms_norm_eps=config.rms_norm_eps,
339
+ dropout_p=config.dropout_p,
340
  device=device,
341
  dtype=dtype,
342
  )
 
505
  super().__init__(config)
506
  self.num_labels = config.num_labels
507
  self.model = FlexQwen(config, device=device, dtype=dtype)
508
+ self.dropout = nn.Dropout(p=config.dropout_p)
509
  self.score = CastedLinear(
510
  config.embedding_dim,
511
  self.num_labels,