Create README.md
Browse files
README.md
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Pretrained model based on [microsoft/deberta-v3-base](https://huggingface.co/microsoft/deberta-v3-base) with further mathematical pre-training.
|
| 2 |
+
|
| 3 |
+
Compared to deberta-v3-base, 300 additional mathematical LaTeX tokens have been added before the mathematical pre-training. As this additional pre-training used NSP-like tasks, a pooling layer has been added to the model (`bias` and `weight`). If you don't need this pooling layer, just use the standard transformers DeBERTa model. If you want to use the additional pooling layer like the BERT one, a wrapper class like the following may be used:
|
| 4 |
+
```python
|
| 5 |
+
from typing import Mapping, Any
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
from torch import nn
|
| 9 |
+
from transformers import DebertaV2Model, DebertaV2Tokenizer, AutoConfig, AutoTokenizer
|
| 10 |
+
|
| 11 |
+
class DebertaV2ModelWithPoolingLayer:
|
| 12 |
+
|
| 13 |
+
def __init__(self, pretrained_model_name):
|
| 14 |
+
super(DebertaV2ModelWithPoolingLayer, self).__init__()
|
| 15 |
+
|
| 16 |
+
# Load the Deberta model and tokenizer
|
| 17 |
+
self.deberta = DebertaV2Model.from_pretrained(pretrained_model_name)
|
| 18 |
+
self.tokenizer = DebertaV2Tokenizer.from_pretrained(pretrained_model_name)
|
| 19 |
+
|
| 20 |
+
# Add a pooling layer (Linear + tanh activation) for the CLS token
|
| 21 |
+
self.pooling_layer = nn.Sequential(
|
| 22 |
+
nn.Linear(self.deberta.config.hidden_size, self.deberta.config.hidden_size),
|
| 23 |
+
nn.Tanh()
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
self.config = self.deberta.config
|
| 27 |
+
self.embeddings = self.deberta.embeddings
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def forward(self, input_ids, attention_mask, *args, **kwargs):
|
| 31 |
+
# Forward pass through the Deberta model
|
| 32 |
+
outputs = self.deberta(input_ids, attention_mask=attention_mask, *args, **kwargs)
|
| 33 |
+
|
| 34 |
+
# Extract the hidden states from the output
|
| 35 |
+
hidden_states = outputs.last_hidden_state
|
| 36 |
+
|
| 37 |
+
# Get the CLS token representation (first token)
|
| 38 |
+
cls_token = hidden_states[:, 0, :]
|
| 39 |
+
|
| 40 |
+
# Apply the pooling layer to the CLS token representation
|
| 41 |
+
pooled_output = self.pooling_layer(cls_token)
|
| 42 |
+
# Include the pooled_output in the output dictionary as 'pooling_layer'
|
| 43 |
+
outputs["pooler_output"] = pooled_output
|
| 44 |
+
|
| 45 |
+
return outputs
|
| 46 |
+
|
| 47 |
+
def save_pretrained(self, path):
|
| 48 |
+
# Save the model's state_dict, configuration, and tokenizer
|
| 49 |
+
state_dict = self.deberta.state_dict()
|
| 50 |
+
state_dict.update(self.pooling_layer[0].state_dict())
|
| 51 |
+
|
| 52 |
+
torch.save(state_dict, f"{path}/pytorch_model.bin")
|
| 53 |
+
self.deberta.config.save_pretrained(path)
|
| 54 |
+
self.tokenizer.save_pretrained(path)
|
| 55 |
+
|
| 56 |
+
def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
|
| 57 |
+
pooler_keys = ['bias', 'weight']
|
| 58 |
+
deberta_state_dict = {k: v for k, v in state_dict.items() if k not in pooler_keys}
|
| 59 |
+
pooler_state_dict = {k: v for k, v in state_dict.items() if k in pooler_keys}
|
| 60 |
+
self.deberta.load_state_dict(deberta_state_dict, strict=strict)
|
| 61 |
+
self.pooling_layer[0].load_state_dict(pooler_state_dict)
|
| 62 |
+
|
| 63 |
+
@classmethod
|
| 64 |
+
def from_pretrained(cls, name):
|
| 65 |
+
# Initialize the instance
|
| 66 |
+
instance = cls(name)
|
| 67 |
+
|
| 68 |
+
try:
|
| 69 |
+
# Load the model's state_dict
|
| 70 |
+
instance.load_state_dict(torch.load(f"{name}/pytorch_model.bin"))
|
| 71 |
+
except FileNotFoundError:
|
| 72 |
+
print("Could not find DeBERTa pooling layer. Initialize new values")
|
| 73 |
+
|
| 74 |
+
# Load the configuration and tokenizer
|
| 75 |
+
instance.deberta.config = AutoConfig.from_pretrained(name)
|
| 76 |
+
instance.tokenizer = AutoTokenizer.from_pretrained(name)
|
| 77 |
+
|
| 78 |
+
return instance
|
| 79 |
+
```
|