apps1 committed on
Commit
85dacbc
·
verified ·
1 Parent(s): 33f1143

Model save

Browse files
README.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: NeuML/bert-hash-nano
5
+ tags:
6
+ - generated_from_trainer
7
+ model-index:
8
+ - name: without_distillation
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # without_distillation
16
+
17
+ This model is a fine-tuned version of [NeuML/bert-hash-nano](https://huggingface.co/NeuML/bert-hash-nano) on an unknown dataset.
18
+
19
+ ## Model description
20
+
21
+ More information needed
22
+
23
+ ## Intended uses & limitations
24
+
25
+ More information needed
26
+
27
+ ## Training and evaluation data
28
+
29
+ More information needed
30
+
31
+ ## Training procedure
32
+
33
+ ### Training hyperparameters
34
+
35
+ The following hyperparameters were used during training:
36
+ - learning_rate: 0.0005
37
+ - train_batch_size: 32
38
+ - eval_batch_size: 32
39
+ - seed: 42
40
+ - optimizer: AdamW (`OptimizerNames.ADAMW_TORCH`) with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
41
+ - lr_scheduler_type: linear
42
+ - num_epochs: 10
43
+
44
+ ### Training results
45
+
46
+
47
+
48
+ ### Framework versions
49
+
50
+ - Transformers 5.8.1
51
+ - Pytorch 2.10.0+cu128
52
+ - Datasets 4.8.3
53
+ - Tokenizers 0.22.2
config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertHashForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_bert_hash.BertHashConfig",
9
+ "AutoModel": "modeling_bert_hash.BertHashModel",
10
+ "AutoModelForMaskedLM": "modeling_bert_hash.BertHashForMaskedLM",
11
+ "AutoModelForSequenceClassification": "modeling_bert_hash.BertHashForSequenceClassification"
12
+ },
13
+ "bos_token_id": null,
14
+ "classifier_dropout": null,
15
+ "dtype": "float32",
16
+ "eos_token_id": null,
17
+ "hidden_act": "gelu",
18
+ "hidden_dropout_prob": 0.1,
19
+ "hidden_size": 128,
20
+ "id2label": {
21
+ "0": "LABEL_0",
22
+ "1": "LABEL_1",
23
+ "2": "LABEL_2"
24
+ },
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 512,
27
+ "is_decoder": false,
28
+ "label2id": {
29
+ "LABEL_0": 0,
30
+ "LABEL_1": 1,
31
+ "LABEL_2": 2
32
+ },
33
+ "layer_norm_eps": 1e-12,
34
+ "max_position_embeddings": 512,
35
+ "model_type": "bert_hash",
36
+ "num_attention_heads": 2,
37
+ "num_hidden_layers": 2,
38
+ "pad_token_id": 0,
39
+ "position_embedding_type": "absolute",
40
+ "problem_type": "single_label_classification",
41
+ "projections": 16,
42
+ "tie_word_embeddings": true,
43
+ "transformers_version": "5.8.1",
44
+ "type_vocab_size": 2,
45
+ "use_cache": false,
46
+ "vocab_size": 30522
47
+ }
configuration_bert_hash.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.models.bert.configuration_bert import BertConfig
2
+
3
+
4
class BertHashConfig(BertConfig):
    """
    BERT configuration carrying one additional field, `projections`.

    Args:
        projections (int, optional, defaults to 5): stored verbatim as `self.projections`.
        **kwargs: every other option, forwarded untouched to `BertConfig`.
    """

    model_type = "bert_hash"

    def __init__(self, projections=5, **kwargs):
        # The parent class consumes all standard BERT options first.
        super().__init__(**kwargs)

        # Extra field introduced by this subclass; set after the parent initializer has run.
        self.projections = projections
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4000eb0c9c0a0051512c218a5a4fa3a21d6004df298de68d257294dfdb9fc833
3
+ size 3884844
modeling_bert_hash.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Union
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
6
+
7
+ from transformers.cache_utils import Cache, DynamicCache, EncoderDecoderCache
8
+ from transformers.masking_utils import create_bidirectional_mask, create_causal_mask
9
+ from transformers.models.bert.modeling_bert import BertEncoder, BertPooler, BertPreTrainedModel, BertOnlyMLMHead
10
+ from transformers.modeling_outputs import (
11
+ BaseModelOutputWithPoolingAndCrossAttentions,
12
+ MaskedLMOutput,
13
+ SequenceClassifierOutput,
14
+ )
15
+ from transformers.processing_utils import Unpack
16
+ from transformers.utils import TransformersKwargs, auto_docstring, logging
17
+ from transformers.utils.generic import can_return_tuple, merge_with_config_defaults
18
+ from transformers.utils.output_capturing import capture_outputs
19
+
20
+ from .configuration_bert_hash import BertHashConfig
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
class BertHashTokens(nn.Module):
    """
    Two-stage token embedding: a narrow lookup table followed by a linear projection up to the hidden size.

    Each token id is first mapped to a vector of `config.projections` floats (the "hash" of the token), and
    that small vector is then projected to `config.hidden_size`. With `projections=5` this is comparable to
    assigning every token a 160-bit hash (5 x float32).

    The point of the factorization is to shrink the token-embedding parameter count.

    For example:
        Standard token embeddings:
            30,522 (vocab size) x 768 (hidden size) = 23,440,896 parameters
            23,440,896 x 4 (float32) = 93,763,584 bytes

        Hash token embeddings:
            30,522 (vocab size) x 5 (hash buckets) + 5 x 768 (projection matrix) = 156,450 parameters
            156,450 x 4 (float32) = 625,800 bytes

        (The figures above count weight matrices only; the projection layer created here also has a bias
        vector of `hidden_size` elements.)
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        hash_width = config.projections

        # Narrow lookup table: one `hash_width`-sized row per vocabulary entry.
        self.embeddings = nn.Embedding(config.vocab_size, hash_width, padding_idx=config.pad_token_id)

        # Lifts the narrow per-token vector to the width the encoder expects.
        self.projections = nn.Linear(hash_width, config.hidden_size)

    def forward(self, input_ids):
        """Look up the narrow embedding for each id in `input_ids` and project it to the hidden size."""
        token_hashes = self.embeddings(input_ids)
        return self.projections(token_hashes)
59
+
60
+ class BertHashEmbeddings(nn.Module):
61
+ """Construct the embeddings from word, position and token_type embeddings."""
62
+
63
+ def __init__(self, config):
64
+ super().__init__()
65
+ self.word_embeddings = BertHashTokens(config)
66
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
67
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
68
+
69
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
70
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
71
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
72
+ self.register_buffer(
73
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
74
+ )
75
+ self.register_buffer(
76
+ "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
77
+ )
78
+
79
+ def forward(
80
+ self,
81
+ input_ids: torch.LongTensor | None = None,
82
+ token_type_ids: torch.LongTensor | None = None,
83
+ position_ids: torch.LongTensor | None = None,
84
+ inputs_embeds: torch.FloatTensor | None = None,
85
+ past_key_values_length: int = 0,
86
+ ) -> torch.Tensor:
87
+ if input_ids is not None:
88
+ input_shape = input_ids.size()
89
+ else:
90
+ input_shape = inputs_embeds.size()[:-1]
91
+
92
+ batch_size, seq_length = input_shape
93
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
94
+
95
+ if position_ids is None:
96
+ position_ids = (
97
+ torch.arange(seq_length, dtype=torch.long, device=device)
98
+ .unsqueeze(0)
99
+ .expand(batch_size, seq_length)
100
+ )
101
+
102
+ # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
103
+ # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
104
+ # issue #5664
105
+ if token_type_ids is None:
106
+ if hasattr(self, "token_type_ids"):
107
+ # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0])
108
+ buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1)
109
+ buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids)
110
+ token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length)
111
+ else:
112
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
113
+
114
+ if inputs_embeds is None:
115
+ inputs_embeds = self.word_embeddings(input_ids)
116
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
117
+ embeddings = inputs_embeds + token_type_embeddings
118
+
119
+ position_embeddings = self.position_embeddings(position_ids)
120
+ embeddings = embeddings + position_embeddings
121
+
122
+ embeddings = self.LayerNorm(embeddings)
123
+ embeddings = self.dropout(embeddings)
124
+ return embeddings
125
+
126
+
127
@auto_docstring(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """
)
class BertHashModel(BertPreTrainedModel):
    config_class = BertHashConfig

    # This model builds its embeddings with `BertHashEmbeddings` (see `__init__`), not the stock
    # `BertEmbeddings`, so that is the class name that has to be listed here.
    _no_split_modules = ["BertHashEmbeddings", "BertLayer"]

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config
        self.gradient_checkpointing = False

        self.embeddings = BertHashEmbeddings(config)
        self.encoder = BertEncoder(config)

        self.pooler = BertPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        # The token lookup table lives one level down, inside the BertHashTokens wrapper.
        return self.embeddings.word_embeddings.embeddings

    def set_input_embeddings(self, value):
        # Mirror get_input_embeddings(): a plain embedding table replaces only the inner lookup table, so the
        # projection layer of BertHashTokens is kept. Previously the whole wrapper was overwritten, which
        # meant a get -> modify -> set round trip silently dropped the projection.
        if isinstance(value, BertHashTokens):
            # Backward compatible: callers that pass a complete BertHashTokens still swap the whole wrapper.
            self.embeddings.word_embeddings = value
        else:
            self.embeddings.word_embeddings.embeddings = value

    @merge_with_config_defaults
    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        past_key_values: Cache | None = None,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | BaseModelOutputWithPoolingAndCrossAttentions:
        # Exactly one of the two input forms must be supplied: the check fires when both or neither are given.
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        # Caching is only honored in decoder mode; as an encoder it is forced off.
        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = (
                EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
                if encoder_hidden_states is not None or self.config.is_encoder_decoder
                else DynamicCache(config=self.config)
            )

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )

        attention_mask, encoder_attention_mask = self._create_attention_masks(
            attention_mask=attention_mask,
            encoder_attention_mask=encoder_attention_mask,
            embedding_output=embedding_output,
            encoder_hidden_states=encoder_hidden_states,
            past_key_values=past_key_values,
        )

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            position_ids=position_ids,
            **kwargs,
        )
        sequence_output = encoder_outputs.last_hidden_state
        # The pooler is optional (see `add_pooling_layer`); without it the pooled output is None.
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
        )

    def _create_attention_masks(
        self,
        attention_mask,
        encoder_attention_mask,
        embedding_output,
        encoder_hidden_states,
        past_key_values,
    ):
        """
        Convert the user-supplied masks into the form handed to the encoder.

        The self-attention mask is built with `create_causal_mask` when `config.is_decoder` is set and with
        `create_bidirectional_mask` otherwise. A cross-attention mask is only built when
        `encoder_attention_mask` is provided; otherwise it is passed through as `None`.

        Returns:
            A `(attention_mask, encoder_attention_mask)` tuple.
        """
        if self.config.is_decoder:
            attention_mask = create_causal_mask(
                config=self.config,
                inputs_embeds=embedding_output,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
            )
        else:
            attention_mask = create_bidirectional_mask(
                config=self.config,
                inputs_embeds=embedding_output,
                attention_mask=attention_mask,
            )

        if encoder_attention_mask is not None:
            encoder_attention_mask = create_bidirectional_mask(
                config=self.config,
                inputs_embeds=embedding_output,
                attention_mask=encoder_attention_mask,
                encoder_hidden_states=encoder_hidden_states,
            )

        return attention_mask, encoder_attention_mask
266
+
267
+
268
@auto_docstring
class BertForMaskedLM(BertPreTrainedModel):
    # NOTE(review): the first target below names `bert.embeddings.word_embeddings.weight`, but in this file
    # `word_embeddings` is a BertHashTokens module whose parameters are `embeddings.weight` and
    # `projections.*` - there is no direct `weight`. Confirm how this entry resolves when weights are tied.
    _tied_weights_keys = {
        "cls.predictions.decoder.weight": "bert.embeddings.word_embeddings.weight",
        "cls.predictions.decoder.bias": "cls.predictions.bias",
    }
    config_class = BertHashConfig

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        # The MLM head consumes per-token hidden states, so the pooling layer is not built.
        self.bert = BertHashModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        # Keep the head's standalone bias attribute pointing at the new decoder's bias.
        self.cls.predictions.decoder = new_embeddings
        self.cls.predictions.bias = new_embeddings.bias

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        encoder_hidden_states: torch.Tensor | None = None,
        encoder_attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | MaskedLMOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            return_dict=True,
            **kwargs,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            # Flatten to (tokens, vocab) vs. (tokens,) so every position is scored independently.
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


# The shipped config.json maps `AutoModelForMaskedLM` to `modeling_bert_hash.BertHashForMaskedLM`. Without
# this alias that attribute does not exist in the module and the auto-class lookup cannot resolve it. The
# original class name is kept so existing imports keep working.
BertHashForMaskedLM = BertForMaskedLM
344
+
345
+
346
+
347
@auto_docstring(
    custom_intro="""
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class BertHashForSequenceClassification(BertPreTrainedModel):
    config_class = BertHashConfig

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.bert = BertHashModel(config)

        # A dedicated classifier dropout wins when configured; otherwise reuse the hidden dropout rate.
        dropout_prob = config.classifier_dropout
        if dropout_prob is None:
            dropout_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        token_type_ids: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        inputs_embeds: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor] | SequenceClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            return_dict=True,
            **kwargs,
        )

        # Index 1 of the backbone output is the pooled representation fed to the classification head.
        logits = self.classifier(self.dropout(outputs[1]))

        loss = None
        if labels is not None:
            # Infer the problem type once, from the label count and label dtype, and remember it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    inferred_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    inferred_type = "single_label_classification"
                else:
                    inferred_type = "multi_label_classification"
                self.config.problem_type = inferred_type

            problem_type = self.config.problem_type
            if problem_type == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    # Single-target regression: drop singleton dimensions on both sides before comparing.
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif problem_type == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": true,
5
+ "is_local": false,
6
+ "local_files_only": false,
7
+ "mask_token": "[MASK]",
8
+ "model_max_length": 512,
9
+ "pad_token": "[PAD]",
10
+ "sep_token": "[SEP]",
11
+ "strip_accents": null,
12
+ "tokenize_chinese_chars": true,
13
+ "tokenizer_class": "BertTokenizer",
14
+ "unk_token": "[UNK]"
15
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d64176ded3e893eb3173645129081f03bc47574987ac198ce04e75f938f046d
3
+ size 5265