Safetensors
Hebrew
neobert
custom_code
Shaltiel committed on
Commit
834c629
·
verified ·
1 Parent(s): a824d0f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ vocab.txt filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "NeoBERTLMHead"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "modeling_neobert.NeoBERTConfig",
7
+ "AutoModel": "modeling_neobert.NeoBERT",
8
+ "AutoModelForMaskedLM": "modeling_neobert.NeoBERTLMHead",
9
+ "AutoModelForSequenceClassification": "modeling_neobert.NeoBERTForSequenceClassification",
10
+ "AutoModelForTokenClassification": "modeling_neobert.NeoBERTForTokenClassification",
11
+ "AutoModelForQuestionAnswering": "modeling_neobert.NeoBERTForQuestionAnswering"
12
+ },
13
+ "decoder_init_range": 0.02,
14
+ "dim_head": 64,
15
+ "embedding_init_range": 0.02,
16
+ "encoder_init_range": 0.02,
17
+ "hidden_size": 768,
18
+ "intermediate_size": 3072,
19
+ "kwargs": {
20
+ "decoder_init_range": 0.02
21
+ },
22
+ "max_length": 4096,
23
+ "model_type": "neobert",
24
+ "norm_eps": 1e-06,
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 28,
27
+ "pad_token_id": 3,
28
+ "torch_dtype": "bfloat16",
29
+ "transformers_version": "4.53.0",
30
+ "vocab_size": 128000
31
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fd2d7b72b126e959f66dd694757bfb658d88823c171052581c307163127069c
3
+ size 922061824
modeling_neobert.py ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
6
+ from torch.nn.functional import scaled_dot_product_attention
7
+
8
+ from typing import Optional, Tuple, Union
9
+ import numpy as np
10
+
11
try:
    from xformers.ops import SwiGLU
except ImportError:
    # Pure-PyTorch fallback, used only when xformers is not installed.
    class SwiGLU(nn.Module):
        """
        A Module that mimicks the call to :attr:`xformers.ops.swiglu`,
        and holds the weights for the 3 linear layers.

        Computes ``w3(silu(gate) * x)`` where ``gate`` and ``x`` come either
        from the packed projection ``w12`` or from separate ``w1``/``w2``.
        """

        def __init__(
            self,
            in_features: int,
            hidden_features: int,
            out_features: Optional[int] = None,
            bias: bool = True,
            *,
            _pack_weights: bool = True,
        ) -> None:
            """Create a SwiGLU module.

            Args:
                in_features (int): Number of features of the input.
                hidden_features (int): Number of hidden features.
                out_features (Optional[int], optional): Number of output features.
                    Defaults to ``in_features`` when None.
                bias (bool, optional): Whether linear layers also include a bias. Defaults to True.
                _pack_weights (bool): Fuse w1/w2 into a single ``w12`` projection.
            """
            super().__init__()
            out_features = out_features or in_features
            hidden_features = hidden_features or in_features

            if _pack_weights:
                # Single matmul producing both the gate and the value halves.
                self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
            else:
                self.w12 = None
                self.w1 = nn.Linear(in_features, hidden_features, bias=bias)
                self.w2 = nn.Linear(in_features, hidden_features, bias=bias)
            self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

            self.hidden_features = hidden_features
            self.out_features = out_features
            self.in_features = in_features
            # Interface parity with xformers' SwiGLU. NOTE: written without an
            # annotation on purpose — the original annotated assignment
            # ``self.op: Optional[SwiGLUOp] = None`` evaluates ``SwiGLUOp`` at
            # runtime (non-simple target, PEP 526), a name that only exists in
            # xformers, so this fallback crashed exactly when xformers was absent.
            self.op = None

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            """Computes :attr:`swiglu` with the module's weights.

            Args:
                x (torch.Tensor): A Tensor of shape ``[..., in_features]``

            Returns:
                torch.Tensor: A Tensor of shape ``[..., out_features]``
            """
            if self.w12 is not None:
                gate, x = self.w12(x).chunk(2, dim=-1)
                hidden = F.silu(gate) * x
            else:
                hidden = F.silu(self.w1(x)) * self.w2(x)

            return self.w3(hidden)
72
+
73
+
74
+ try:
75
+ from flash_attn.flash_attn_interface import flash_attn_varlen_func
76
+
77
+ FLASH_ATTN_AVAILABLE = True
78
+ except ImportError:
79
+ FLASH_ATTN_AVAILABLE = False
80
+
81
+ from transformers import (
82
+ PreTrainedModel,
83
+ PretrainedConfig,
84
+ DataCollatorForLanguageModeling,
85
+ )
86
+ from transformers.modeling_outputs import (
87
+ BaseModelOutput,
88
+ MaskedLMOutput,
89
+ SequenceClassifierOutput,
90
+ TokenClassifierOutput,
91
+ QuestionAnsweringModelOutput
92
+ )
93
+
94
+ import torch
95
+ from typing import Tuple
96
+
97
def precompute_freqs(dim: int, end: int, theta: float = 10000.0, *, device=None, dtype=torch.float32):
    """Precompute rotary-embedding tables.

    Returns a ``(cos, sin)`` pair of tensors, each of shape ``[end, dim // 2]``,
    using only real dtypes (no complex tensors).
    """
    half = dim // 2
    # Inverse frequencies: theta ** (-2i / dim) for i in [0, half).
    exponents = (2.0 * torch.arange(half, device=device, dtype=dtype)) / dim
    inv_freq = theta ** -exponents
    positions = torch.arange(end, device=device, dtype=dtype)
    # angles[p, i] = p * inv_freq[i]
    angles = positions[:, None] * inv_freq[None, :]
    return torch.cos(angles), torch.sin(angles)
107
+
108
def reshape_for_broadcast(freqs: torch.Tensor, x: torch.Tensor):
    """Insert batch and head axes so ``freqs`` broadcasts against ``x``.

    ``freqs`` is ``[L, h]`` and ``x`` is ``[B, L, H, h]`` (a half-dim tensor);
    the result is ``[1, L, 1, h]``.
    """
    expected = (x.shape[1], x.shape[-1])
    assert freqs.shape == expected, (freqs.shape, x.shape)
    return freqs.unsqueeze(0).unsqueeze(2)  # [1, L, 1, h]


# Rotary embedding without complex numbers (megatron-core pairing: first half
# of the head dim is rotated against the second half).
def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs: tuple[torch.Tensor, torch.Tensor]):
    """Apply rotary position embeddings to query and key tensors.

    ``xq``/``xk`` are ``[B, L, H, D]``; ``freqs`` is a ``(cos, sin)`` pair of
    shape ``[L, D // 2]``. Returns the rotated ``(xq, xk)``.
    """
    half = xq.shape[-1] // 2
    q_lo, q_hi = xq[..., :half], xq[..., half:]
    k_lo, k_hi = xk[..., :half], xk[..., half:]

    cos_tab, sin_tab = freqs
    cos_tab = reshape_for_broadcast(cos_tab.type_as(q_lo), q_lo)  # [1, L, 1, h]
    sin_tab = reshape_for_broadcast(sin_tab.type_as(q_lo), q_lo)  # [1, L, 1, h]

    # Standard 2D rotation applied pairwise across the two halves.
    rotated_q = torch.cat(
        [q_lo * cos_tab - q_hi * sin_tab, q_lo * sin_tab + q_hi * cos_tab], dim=-1
    )
    rotated_k = torch.cat(
        [k_lo * cos_tab - k_hi * sin_tab, k_lo * sin_tab + k_hi * cos_tab], dim=-1
    )
    return rotated_q, rotated_k
131
+
132
class NeoBERTEagerRMSNorm(nn.Module):
    """RMS normalization layer, numerically equivalent to ``nn.RMSNorm``.

    Used when the eager attention implementation is selected.
    """

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        # Learnable per-channel scale, initialized to ones.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Normalize in fp32 for numerical stability, then cast back.
        orig_dtype = hidden_states.dtype
        x = hidden_states.to(torch.float32)
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        normalized = x * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normalized.to(orig_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
150
+
151
+
152
class NeoBERTConfig(PretrainedConfig):
    """Configuration for NeoBERT models.

    All config parameters must have a default value so the config can be
    instantiated without arguments.
    """

    model_type = "neobert"

    def __init__(
        self,
        hidden_size: int = 768,
        num_hidden_layers: int = 28,
        num_attention_heads: int = 12,
        intermediate_size: int = 3072,
        embedding_init_range: float = 0.02,
        encoder_init_range: float = 0.02,
        norm_eps: float = 1e-06,
        vocab_size: int = 30522,
        pad_token_id: int = 0,
        max_length: int = 1024,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if hidden_size % num_attention_heads != 0:
            raise ValueError("Hidden size must be divisible by the number of heads.")

        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Per-head dimension derived from the model width.
        self.dim_head = hidden_size // num_attention_heads
        self.intermediate_size = intermediate_size
        self.embedding_init_range = embedding_init_range
        self.encoder_init_range = encoder_init_range
        self.norm_eps = norm_eps
        self.vocab_size = vocab_size
        self.pad_token_id = pad_token_id
        self.max_length = max_length
        # Keep any remaining kwargs around for downstream heads to read.
        self.kwargs = kwargs
186
+
187
+
188
class EncoderBlock(nn.Module):
    """Transformer encoder block: pre-norm attention + pre-norm SwiGLU FFN,
    each with a residual connection."""

    def __init__(self, config: NeoBERTConfig):
        super().__init__()

        self.config = config

        # Attention: fused QKV projection and output projection, both bias-free.
        self.qkv = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size * 3, bias=False)
        self.wo = nn.Linear(in_features=config.hidden_size, out_features=config.hidden_size, bias=False)

        # Feedforward network
        # Original NeoBERT:
        # multiple_of = 8
        # intermediate_size = int(2 * config.intermediate_size / 3)
        # intermediate_size = multiple_of * ((intermediate_size + multiple_of - 1) // multiple_of)
        intermediate_size = config.intermediate_size
        self.ffn = SwiGLU(config.hidden_size, intermediate_size, config.hidden_size, bias=False)

        # Layer norms: use the eager RMSNorm re-implementation only when the
        # attention implementation is 'eager'; otherwise use nn.RMSNorm.
        rms_norm_cls = nn.RMSNorm if config._attn_implementation != 'eager' else NeoBERTEagerRMSNorm
        self.attention_norm = rms_norm_cls(config.hidden_size, config.norm_eps)
        self.ffn_norm = rms_norm_cls(config.hidden_size, config.norm_eps)

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: torch.Tensor,
        freqs_cis: torch.Tensor,
        output_attentions: bool,
        max_seqlen: int = None,
        cu_seqlens: torch.Tensor = None,
    ):
        """Run one encoder block.

        Returns ``(hidden_states, attn_weights)``; ``attn_weights`` is None
        unless the eager attention path produced them.
        """
        # Attention (pre-norm)
        attn_output, attn_weights = self._att_block(
            self.attention_norm(x), attention_mask, freqs_cis, output_attentions, max_seqlen, cu_seqlens
        )

        # Residual
        x = x + attn_output

        # Feed-forward (pre-norm) + residual
        x = x + self.ffn(self.ffn_norm(x))

        return x, attn_weights

    def _att_block(
        self,
        x: torch.Tensor,
        attention_mask: torch.Tensor,
        freqs_cis: torch.Tensor,
        output_attentions: bool,
        max_seqlen: int = None,
        cu_seqlens: torch.Tensor = None,
    ):
        """Multi-head attention with RoPE; dispatches between flash-attn
        (packed sequences), eager (when weights are requested), and SDPA."""
        batch_size, seq_len, _ = x.shape

        # Split the fused projection into per-head Q, K, V: each [B, L, H, dim_head].
        xq, xk, xv = self.qkv(x).view(batch_size, seq_len, self.config.num_attention_heads, self.config.dim_head * 3).chunk(3, axis=-1)

        # Rotary position embeddings applied to queries and keys only.
        xq, xk = apply_rotary_emb(xq, xk, freqs_cis)

        # Attn block
        attn_weights = None

        # Flash attention if the tensors are packed
        if cu_seqlens is not None:
            attn = flash_attn_varlen_func(
                q=xq.squeeze(0),
                k=xk.squeeze(0),
                v=xv.squeeze(0),
                cu_seqlens_q=cu_seqlens,
                cu_seqlens_k=cu_seqlens,
                max_seqlen_q=max_seqlen,
                max_seqlen_k=max_seqlen,
                dropout_p=0.0,
                causal=False,
            )
        # Eager attention if attention weights are needed in the output
        elif output_attentions or self.config._attn_implementation == 'eager':
            # Scaled dot-product scores: [B, H, L, L].
            attn_weights = xq.permute(0, 2, 1, 3) @ xk.permute(0, 2, 3, 1) / (xq.size(-1) ** 0.5)
            if attention_mask is not None:
                # NOTE(review): scores are *multiplied* by the 0/1 mask rather
                # than offset by a large negative bias before softmax, so masked
                # positions contribute exp(0)=1 instead of 0 — confirm this is
                # the intended masking semantics.
                attn_weights = attn_weights * attention_mask
            attn_weights = attn_weights.softmax(-1)
            attn = attn_weights @ xv.permute(0, 2, 1, 3)
            attn = attn.transpose(1, 2)
        # Fall back to SDPA otherwise
        else:
            # NOTE(review): this path dereferences attention_mask.bool()
            # unconditionally — it will raise if attention_mask is None.
            attn = scaled_dot_product_attention(
                query=xq.transpose(1, 2),
                key=xk.transpose(1, 2),
                value=xv.transpose(1, 2),
                attn_mask=attention_mask.bool(),
                dropout_p=0,
            ).transpose(1, 2)

        # Merge heads back to [B, L, hidden_size] and project out.
        return self.wo(attn.reshape(batch_size, seq_len, self.config.num_attention_heads * self.config.dim_head)), attn_weights
285
+
286
+
287
class NeoBERTPreTrainedModel(PreTrainedModel):
    """Base class wiring NeoBERT models into the HF pretrained-model machinery."""

    config_class = NeoBERTConfig
    base_model_prefix = "model"
    _supports_cache_class = True

    def _init_weights(self, module):
        # Uniform initialization; the ranges come from the config.
        if isinstance(module, nn.Linear):
            bound = self.config.encoder_init_range
            module.weight.data.uniform_(-bound, bound)
        elif isinstance(module, nn.Embedding):
            bound = self.config.embedding_init_range
            module.weight.data.uniform_(-bound, bound)
297
+
298
+
299
class NeoBERT(NeoBERTPreTrainedModel):
    """Bare NeoBERT encoder: token embeddings, rotary-attention encoder
    blocks, and a final RMS norm."""

    config_class = NeoBERTConfig

    def __init__(self, config: NeoBERTConfig):
        super().__init__(config)

        self.config = config

        # Token embedding table.
        self.encoder = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)

        # Ensures freqs_cis is moved to the same devices as the model. Non-persistent buffers are not saved in the state_dict.
        cos, sin = precompute_freqs(config.hidden_size // config.num_attention_heads, config.max_length)
        self.register_buffer("freqs_cos", cos, persistent=False)
        self.register_buffer("freqs_sin", sin, persistent=False)

        self.transformer_encoder = nn.ModuleList()
        for _ in range(config.num_hidden_layers):
            self.transformer_encoder.append(EncoderBlock(config))

        # Final norm mirrors the per-block choice: nn.RMSNorm unless 'eager'.
        rms_norm_cls = nn.RMSNorm if config._attn_implementation != 'eager' else NeoBERTEagerRMSNorm
        self.layer_norm = rms_norm_cls(config.hidden_size, config.norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        position_ids: torch.Tensor = None,
        max_seqlen: int = None,
        cu_seqlens: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,  # kept in to not break compatibility with tokenizer(...), ignored
        output_hidden_states: bool = False,
        output_attentions: bool = False,
        **kwargs,
    ):
        """Encode a batch of token ids (or precomputed embeddings).

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be provided.
        ``cu_seqlens``/``max_seqlen`` enable the packed flash-attention path
        (CUDA only). Returns a :class:`BaseModelOutput`.
        """
        # Initialize per-layer collectors.
        hidden_states, attentions = [], []

        # XOR check: raise unless exactly one of the two inputs is given.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        # Expand the (Batch, Length) mask to a broadcastable (Batch, 1, 1, Length).
        if attention_mask is not None:
            attention_mask = attention_mask[:, None, None, :]
            # attention_mask = attention_mask.unsqueeze(1).unsqueeze(1).repeat(1, self.config.num_attention_heads, attention_mask.size(-1), 1)

        # Checks to be done if inputs are packed sequences
        if cu_seqlens is not None:
            assert (
                FLASH_ATTN_AVAILABLE
            ), "Flash-attention is not available. Please ''pip install flash_attn'', or provide un-packed sequences."
            assert not output_attentions, "Output attentions is not supported when sequences are packed."
            assert max_seqlen is not None, "Missing max_seqlen. It must be provided when cu_seqlens are not None."
            assert (input_ids if input_ids is not None else inputs_embeds).shape[
                0
            ] == 1, "Cumulative sequence lengths are provided but inputs are not packed."
            assert (
                input_ids if input_ids is not None else inputs_embeds
            ).is_cuda, "Packing uses an implementation of flash-attention and is only supported on GPU."

        # RoPE: gather per-position (cos, sin) tables, either by explicit
        # position_ids or by the first L rows.
        if position_ids is not None:
            freqs = (self.freqs_cos[position_ids], self.freqs_sin[position_ids])
        else:
            L = (input_ids if input_ids is not None else inputs_embeds).shape[1]
            freqs = (self.freqs_cos[:L], self.freqs_sin[:L])

        # Embedding lookup (skipped when embeddings are passed in directly).
        x = self.encoder(input_ids) if input_ids is not None else inputs_embeds

        # Transformer encoder stack.
        for layer in self.transformer_encoder:
            x, attn = layer(x, attention_mask, freqs, output_attentions, max_seqlen, cu_seqlens)
            if output_hidden_states:
                hidden_states.append(x)
            if output_attentions:
                attentions.append(attn)

        # Final normalization layer
        x = self.layer_norm(x)

        # Return the output of the last hidden layer
        return BaseModelOutput(
            last_hidden_state=x,
            hidden_states=hidden_states if output_hidden_states else None,
            attentions=attentions if output_attentions else None,
        )
389
+
390
+
391
class NeoBERTLMHead(NeoBERTPreTrainedModel):
    """NeoBERT with a masked-language-modeling decoder head on top."""

    config_class = NeoBERTConfig

    def __init__(self, config: NeoBERTConfig):
        super().__init__(config)

        self.config = config

        # Backbone encoder plus a projection onto the vocabulary.
        self.model = NeoBERT(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)

        self.post_init()

    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: torch.Tensor = None,
        max_seqlen: int = None,
        cu_seqlens: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,  # kept in to not break compatibility with tokenizer(...), ignored
        output_hidden_states: bool = False,
        output_attentions: bool = False,
        **kwargs,
    ):
        """Encode the inputs and return per-token vocabulary logits."""
        encoder_out = self.model.forward(
            input_ids=input_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            max_seqlen=max_seqlen,
            cu_seqlens=cu_seqlens,
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
        )

        token_logits = self.decoder(encoder_out.last_hidden_state)

        return MaskedLMOutput(
            hidden_states=encoder_out.hidden_states if output_hidden_states else None,
            attentions=encoder_out.attentions if output_attentions else None,
            logits=token_logits,
        )
435
+
436
+
437
class NeoBERTForTokenClassification(NeoBERTPreTrainedModel):
    """NeoBERT with a per-token classification head (dense -> tanh -> linear)."""

    config_class = NeoBERTConfig

    def __init__(self, config: NeoBERTConfig):
        super().__init__(config)

        self.config = config

        # Head hyperparameters; fall back to defaults when absent from config.
        self.num_labels = getattr(config, "num_labels", 2)
        self.classifier_dropout = getattr(config, "classifier_dropout", 0.1)
        self.classifier_init_range = getattr(config, "classifier_init_range", 0.02)

        self.model = NeoBERT(config)

        self.dense = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.dropout = nn.Dropout(self.classifier_dropout)
        self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)

        self.post_init()

    def _init_weights(self, module):
        # Normal init for the head (overrides the base class's uniform init).
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.classifier_init_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        position_ids: torch.Tensor = None,
        max_seqlen: int = None,
        cu_seqlens: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,  # kept in to not break compatibility with tokenizer(...), ignored
        output_hidden_states: bool = False,
        output_attentions: bool = False,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ):
        """Per-token classification; returns logits of shape [B, L, num_labels]
        and, when ``labels`` is given, a cross-entropy loss over non-padded
        positions."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        output = self.model.forward(
            input_ids=input_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            max_seqlen=max_seqlen,
            cu_seqlens=cu_seqlens,
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
        )
        hidden_states = output.last_hidden_state

        # Classification head: dropout -> dense -> tanh -> dropout -> linear.
        x = self.dropout(hidden_states)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)

        logits = self.classifier(x)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # only keep active parts of the loss (positions where mask == 1)
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)[active_loss]
                active_labels = labels.view(-1)[active_loss]
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # Tuple return (loss?, logits) when return_dict is disabled.
        if not return_dict:
            result = (logits,)
            return ((loss,) + result) if loss is not None else result

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=output.hidden_states if output_hidden_states else None,
            attentions=output.attentions if output_attentions else None,
        )
520
+
521
+
522
class NeoBERTForSequenceClassification(NeoBERTPreTrainedModel):
    """NeoBERT with a sequence-classification head over the first token."""

    config_class = NeoBERTConfig

    def __init__(self, config: NeoBERTConfig):
        super().__init__(config)

        self.config = config

        # Head hyperparameters; fall back to defaults when absent from config.
        self.num_labels = getattr(config, "num_labels", 2)
        self.classifier_dropout = getattr(config, "classifier_dropout", 0.1)
        self.classifier_init_range = getattr(config, "classifier_init_range", 0.02)

        self.model = NeoBERT(config)

        self.dense = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.dropout = nn.Dropout(self.classifier_dropout)
        self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)

        self.post_init()

    def _init_weights(self, module):
        # Normal init for the head (overrides the base class's uniform init).
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.classifier_init_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        position_ids: torch.Tensor = None,
        max_seqlen: int = None,
        cu_seqlens: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,  # kept in to not break compatibility with tokenizer(...), ignored
        output_hidden_states: bool = False,
        output_attentions: bool = False,
        labels: Optional[torch.Tensor] = None,
        return_dict: Optional[bool] = None,
    ):
        """Sequence classification/regression.

        Pools the hidden state at position 0 (the CLS position), applies the
        head, and — when ``labels`` is given — picks MSE / cross-entropy / BCE
        based on ``config.problem_type`` (inferred once from labels if unset).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        output = self.model.forward(
            input_ids=input_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            max_seqlen=max_seqlen,
            cu_seqlens=cu_seqlens,
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
        )
        hidden_states = output.last_hidden_state

        # First-token pooling, then dropout -> dense -> tanh -> dropout -> linear.
        x = hidden_states[:, 0, :]
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)

        logits = self.classifier(x)

        loss = None
        if labels is not None:
            # Infer (and cache on the config) the problem type on first use.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        # Tuple return (loss?, logits) when return_dict is disabled.
        if not return_dict:
            result = (logits,)
            return ((loss,) + result) if loss is not None else result

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=output.hidden_states if output_hidden_states else None,
            attentions=output.attentions if output_attentions else None,
        )
617
+
618
class NeoBERTForQuestionAnswering(NeoBERTPreTrainedModel):
    """NeoBERT with an extractive-QA head (start/end span logits)."""

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Backbone plus a 2-way projection (start logit, end logit) per token.
        self.model = NeoBERT(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        position_ids: torch.Tensor = None,
        max_seqlen: int = None,
        cu_seqlens: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,  # kept in to not break compatibility with tokenizer(...), ignored
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        output_hidden_states: bool = False,
        output_attentions: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        """Compute span-start/span-end logits and, when both position tensors
        are given, the averaged cross-entropy span loss."""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # Force the dict output whenever extra outputs were requested.
        if output_attentions or output_hidden_states: return_dict = True

        output = self.model.forward(
            input_ids=input_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            max_seqlen=max_seqlen,
            cu_seqlens=cu_seqlens,
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=True
        )
        hidden_states = output.last_hidden_state

        # Split the 2-channel projection into start and end logits: [B, L] each.
        logits = self.qa_outputs(hidden_states)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # Clamped out-of-range positions equal ignored_index and are skipped.
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        # Tuple return (loss?, start_logits, end_logits) when return_dict is off.
        if not return_dict:
            output = (start_logits, end_logits)
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=output.hidden_states,
            attentions=output.attentions,
        )
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[SEP]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[BLANK]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "bos_token": "[CLS]",
53
+ "clean_up_tokenization_spaces": true,
54
+ "cls_token": "[CLS]",
55
+ "do_lower_case": true,
56
+ "eos_token": "[SEP]",
57
+ "extra_special_tokens": {},
58
+ "mask_token": "[MASK]",
59
+ "model_max_length": 4096,
60
+ "pad_token": "[PAD]",
61
+ "sep_token": "[SEP]",
62
+ "strip_accents": null,
63
+ "tokenize_chinese_chars": true,
64
+ "tokenizer_class": "BertTokenizer",
65
+ "unk_token": "[UNK]"
66
+ }
vocab.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fb90bfa35244d26f0065d1fcd0b5becc3da3d44d616a7e2aacaf6320b9fa2d0
3
+ size 1500244