Upload 7 files

Files changed:
- config.json +2 -2
- configuration_ltgbert.py +26 -2
- modeling_ltgbert.py +44 -15
config.json
CHANGED

@@ -5,8 +5,8 @@
   ],
   "attention_probs_dropout_prob": 0.1,
   "auto_map": {
-    "AutoConfig": "
-    "AutoModelForMaskedLM": "
+    "AutoConfig": "configuration_ltgbert.LtgBertConfig",
+    "AutoModelForMaskedLM": "modeling_ltgbert.LtgBertForMaskedLM",
     "AutoModelForSequenceClassification": "modeling_ltgbert.LtgBertForSequenceClassification"
   },
   "classifier_dropout": 0.2,
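
The `auto_map` entries now point at the repo-local `configuration_ltgbert.py` and `modeling_ltgbert.py` modules, so the custom classes resolve through the `Auto*` factories when remote code is trusted. A minimal usage sketch; the repo id below is a hypothetical placeholder, not one confirmed by this commit:

```python
from transformers import AutoConfig, AutoModelForMaskedLM

repo_id = "ltg/some-ltgbert-checkpoint"  # hypothetical placeholder

# trust_remote_code=True is required because LtgBertConfig / LtgBertForMaskedLM live
# in the repository's own Python files rather than inside the transformers library.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)
```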
configuration_ltgbert.py
CHANGED

@@ -19,6 +19,30 @@
 from transformers.configuration_utils import PretrainedConfig


+LTG_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "bnc-bert-span": "https://huggingface.co/ltg/bnc-bert-span",
+    "bnc-bert-span-2x": "https://huggingface.co/ltg/bnc-bert-span-2x",
+    "bnc-bert-span-0.5x": "https://huggingface.co/ltg/bnc-bert-span-0.5x",
+    "bnc-bert-span-0.25x": "https://huggingface.co/ltg/bnc-bert-span-0.25x",
+    "bnc-bert-span-order": "https://huggingface.co/ltg/bnc-bert-span-order",
+    "bnc-bert-span-document": "https://huggingface.co/ltg/bnc-bert-span-document",
+    "bnc-bert-span-word": "https://huggingface.co/ltg/bnc-bert-span-word",
+    "bnc-bert-span-subword": "https://huggingface.co/ltg/bnc-bert-span-subword",
+
+    "norbert3-xs": "https://huggingface.co/ltg/norbert3-xs/config.json",
+    "norbert3-small": "https://huggingface.co/ltg/norbert3-small/config.json",
+    "norbert3-base": "https://huggingface.co/ltg/norbert3-base/config.json",
+    "norbert3-large": "https://huggingface.co/ltg/norbert3-large/config.json",
+
+    "norbert3-oversampled-base": "https://huggingface.co/ltg/norbert3-oversampled-base/config.json",
+    "norbert3-ncc-base": "https://huggingface.co/ltg/norbert3-ncc-base/config.json",
+    "norbert3-nak-base": "https://huggingface.co/ltg/norbert3-nak-base/config.json",
+    "norbert3-nb-base": "https://huggingface.co/ltg/norbert3-nb-base/config.json",
+    "norbert3-wiki-base": "https://huggingface.co/ltg/norbert3-wiki-base/config.json",
+    "norbert3-c4-base": "https://huggingface.co/ltg/norbert3-c4-base/config.json"
+}
+
+
 class LtgBertConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`LtgBertModel`]. It is used to

@@ -49,7 +73,7 @@ class LtgBertConfig(PretrainedConfig):
         classifier_dropout (`float`, *optional*):
             The dropout ratio for the classification head.
     """
-    model_type = "
+    model_type = "ltgbert"
     def __init__(
         self,
         vocab_size=16384,

@@ -80,4 +104,4 @@ class LtgBertConfig(PretrainedConfig):
         self.output_all_encoded_layers = output_all_encoded_layers
         self.position_bucket_size = position_bucket_size
         self.layer_norm_eps = layer_norm_eps
-        self.classifier_dropout = classifier_dropout
+        self.classifier_dropout = classifier_dropout
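
With `model_type` set to `"ltgbert"`, the config serializes and round-trips like any other `PretrainedConfig`. A small sketch, assuming `configuration_ltgbert.py` is importable from the working directory and that the `__init__` defaults shown above suffice:

```python
# Hedged sketch: exercises only what this commit touches (model_type, classifier_dropout).
from configuration_ltgbert import LtgBertConfig

config = LtgBertConfig(vocab_size=16384, classifier_dropout=0.2)
print(config.model_type)            # "ltgbert"
print(config.classifier_dropout)    # 0.2, stored by __init__
config.save_pretrained("./ltgbert-config")  # writes config.json including "model_type": "ltgbert"
```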
modeling_ltgbert.py
CHANGED

@@ -39,10 +39,34 @@ from transformers.pytorch_utils import softmax_backward_data
 from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward


-_CHECKPOINT_FOR_DOC = "ltg/
+_CHECKPOINT_FOR_DOC = "ltg/bnc-bert-span"
 _CONFIG_FOR_DOC = "LtgBertConfig"


+LTG_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "bnc-bert-span",
+    "bnc-bert-span-2x",
+    "bnc-bert-span-0.5x",
+    "bnc-bert-span-0.25x",
+    "bnc-bert-span-order",
+    "bnc-bert-span-document",
+    "bnc-bert-span-word",
+    "bnc-bert-span-subword",
+
+    "norbert3-xs",
+    "norbert3-small",
+    "norbert3-base",
+    "norbert3-large",
+
+    "norbert3-oversampled-base",
+    "norbert3-ncc-base",
+    "norbert3-nak-base",
+    "norbert3-nb-base",
+    "norbert3-wiki-base",
+    "norbert3-c4-base"
+]
+
+
 class Encoder(nn.Module):
     def __init__(self, config, activation_checkpointing=False):
         super().__init__()

@@ -224,8 +248,10 @@ class Attention(nn.Module):

         attention_scores = torch.bmm(query, key.transpose(1, 2) * self.scale)

-
-        query_pos
+        query_pos, key_pos = self.in_proj_qk(self.dropout(relative_embedding)).chunk(2, dim=-1)  # shape: [2T-1, D]
+        query_pos = query_pos.view(-1, self.num_heads, self.head_size)  # shape: [2T-1, H, D]
+        key_pos = key_pos.view(-1, self.num_heads, self.head_size)  # shape: [2T-1, H, D]
+
         query = query.view(batch_size, self.num_heads, query_len, self.head_size)
         key = key.view(batch_size, self.num_heads, query_len, self.head_size)

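
The restored lines project the shared relative-position embedding into per-head query/key position vectors. A self-contained shape sketch with arbitrary sizes; `in_proj_qk` is modeled here as a plain linear layer mapping `hidden_size` to `2 * hidden_size` (an assumption consistent with the `chunk(2, dim=-1)` split), and the dropout applied in the real code is omitted:

```python
import torch
import torch.nn as nn

T, hidden_size, num_heads = 128, 384, 6            # arbitrary illustrative sizes
head_size = hidden_size // num_heads

relative_embedding = torch.randn(2 * T - 1, hidden_size)   # one row per relative offset
in_proj_qk = nn.Linear(hidden_size, 2 * hidden_size)        # assumed joint Q/K projection

query_pos, key_pos = in_proj_qk(relative_embedding).chunk(2, dim=-1)  # each [2T-1, hidden_size]
query_pos = query_pos.view(-1, num_heads, head_size)                  # [2T-1, H, head_size]
key_pos = key_pos.view(-1, num_heads, head_size)                      # [2T-1, H, head_size]

assert query_pos.shape == key_pos.shape == (2 * T - 1, num_heads, head_size)
```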
@@ -367,8 +393,6 @@ class LtgBertModel(LtgBertPreTrainedModel):
     ) -> List[torch.Tensor]:
         if input_ids is not None:
             input_shape = input_ids.size()
-        # elif inputs_embeds is not None:
-        #     input_shape = inputs_embeds.size()[:-1]
         else:
             raise ValueError("You have to specify input_ids")


@@ -380,9 +404,7 @@ class LtgBertModel(LtgBertPreTrainedModel):
         else:
             attention_mask = ~attention_mask.bool()
             attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-        # if inputs_embeds is None:
-        #     static_embeddings, relative_embedding = self.embedding(input_ids.t())
+
         static_embeddings, relative_embedding = self.embedding(input_ids.t())
         contextualized_embeddings, attention_probs = self.transformer(static_embeddings, attention_mask, relative_embedding)
         contextualized_embeddings = [e.transpose(0, 1) for e in contextualized_embeddings]
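
This path keeps the backbone's time-major convention: embeddings are computed from `input_ids.t()` (sequence dimension first) and the per-layer outputs are transposed back to batch-first before being returned. A small shape sketch with arbitrary sizes:

```python
import torch

batch_size, seq_len, hidden_size = 2, 8, 384       # arbitrary illustrative sizes
input_ids = torch.zeros(batch_size, seq_len, dtype=torch.long)

time_major_ids = input_ids.t()                      # [seq_len, batch_size], as fed to self.embedding
hidden = torch.randn(seq_len, batch_size, hidden_size)   # layout the transformer operates in
batch_first = hidden.transpose(0, 1)                # [batch_size, seq_len, hidden_size], as returned

assert batch_first.shape == (batch_size, seq_len, hidden_size)
```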
@@ -409,7 +431,8 @@ class LtgBertModel(LtgBertPreTrainedModel):
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids,
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)

         if not return_dict:
             return (

@@ -456,7 +479,8 @@ class LtgBertForMaskedLM(LtgBertModel):
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids,
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)
         subword_prediction = self.classifier(sequence_output)

         masked_lm_loss = None

@@ -554,8 +578,9 @@ class LtgBertForSequenceClassification(LtgBertModel):
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids,
-
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          inputs_embeds=inputs_embeds,
+                                                                                                          attention_mask=attention_mask)
         logits = self.head(sequence_output[:, 0, :])

         loss = None

@@ -628,7 +653,8 @@ class LtgBertForTokenClassification(LtgBertModel):
     ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids,
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)
         logits = self.head(sequence_output)

         loss = None

@@ -684,7 +710,8 @@ class LtgBertForQuestionAnswering(LtgBertModel):
     ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids,
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=input_ids,
+                                                                                                          attention_mask=attention_mask)
         logits = self.head(sequence_output)

         start_logits, end_logits = logits.split(1, dim=-1)

@@ -762,7 +789,8 @@ class LtgBertForMultipleChoice(LtgBertModel):
         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
         flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None

-        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(flat_input_ids,
+        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids=flat_input_ids,
+                                                                                                          attention_mask=flat_attention_mask)
         logits = self.head(sequence_output)
         reshaped_logits = logits.view(-1, num_choices)


@@ -785,3 +813,4 @@ class LtgBertForMultipleChoice(LtgBertModel):
             hidden_states=contextualized_embeddings if output_hidden_states else None,
             attentions=attention_probs if output_attentions else None
         )
+
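
All task heads now pass explicit keyword arguments to `get_contextualized_embeddings`, which returns the last-layer states, the per-layer states, and the attention probabilities. A hedged sketch of how a head consumes that triple; the helper below is illustrative, not part of the model code:

```python
def run_classification_head(model, head, input_ids, attention_mask=None):
    # Keyword arguments mirror the calls in the forward methods above.
    sequence_output, contextualized_embeddings, attention_probs = model.get_contextualized_embeddings(
        input_ids=input_ids,
        attention_mask=attention_mask,
    )
    # sequence_output is batch-first; sequence classification reads the first ([CLS]) position,
    # while token-level heads apply `head` to every position instead.
    return head(sequence_output[:, 0, :])
```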