|
|
"""HELM-BERT configuration.""" |
|
|
|
|
|
from transformers import PretrainedConfig |
|
|
|
|
|
|
|
|
class HELMBertConfig(PretrainedConfig): |
|
|
"""Configuration class for HELM-BERT model. |
|
|
|
|
|
This configuration class stores all the parameters needed to instantiate a HELM-BERT model. |
|
|
It inherits from PretrainedConfig and can be used with HuggingFace's from_pretrained and |
|
|
save_pretrained methods. |
|
|
|
|
|
Args: |
|
|
vocab_size: Size of the vocabulary (default: 78 for HELM character vocabulary) |
|
|
hidden_size: Dimensionality of the encoder layers (default: 768) |
|
|
num_hidden_layers: Number of transformer layers (default: 6) |
|
|
num_attention_heads: Number of attention heads (default: 12) |
|
|
intermediate_size: Dimensionality of the feed-forward layer (default: 3072) |
|
|
hidden_dropout_prob: Dropout probability for hidden layers (default: 0.1) |
|
|
attention_probs_dropout_prob: Dropout probability for attention (default: 0.1) |
|
|
max_position_embeddings: Maximum sequence length (default: 512) |
|
|
max_relative_positions: Maximum relative position distance (default: 512) |
|
|
position_buckets: Number of position buckets for log-bucketing (default: 256) |
|
|
pos_att_type: Position attention types, pipe-separated (default: "c2p|p2c") |
|
|
share_att_key: Whether to share attention key projections (default: False) |
|
|
ngie_kernel_size: Kernel size for nGiE convolution (default: 3) |
|
|
ngie_dropout: Dropout for nGiE layer (default: 0.1) |
|
|
pad_token_id: ID of padding token (default: 0) |
|
|
bos_token_id: ID of beginning-of-sequence token (default: 1) |
|
|
eos_token_id: ID of end-of-sequence token (default: 2) |
|
|
sep_token_id: ID of separator token (default: 2) |
|
|
mask_token_id: ID of mask token (default: 4) |
|
|
|
|
|
Example: |
|
|
>>> from helmbert import HELMBertConfig, HELMBertModel |
|
|
>>> config = HELMBertConfig(hidden_size=768, num_hidden_layers=6) |
|
|
>>> model = HELMBertModel(config) |
|
|
""" |
|
|
|
|
|
model_type = "helmbert" |
|
|
|
|
|
def __init__( |
|
|
self, |
|
|
vocab_size: int = 78, |
|
|
hidden_size: int = 768, |
|
|
num_hidden_layers: int = 6, |
|
|
num_attention_heads: int = 12, |
|
|
intermediate_size: int = 3072, |
|
|
hidden_dropout_prob: float = 0.1, |
|
|
attention_probs_dropout_prob: float = 0.1, |
|
|
max_position_embeddings: int = 512, |
|
|
|
|
|
max_relative_positions: int = 512, |
|
|
position_buckets: int = 256, |
|
|
pos_att_type: str = "c2p|p2c", |
|
|
share_att_key: bool = False, |
|
|
|
|
|
ngie_kernel_size: int = 3, |
|
|
ngie_dropout: float = 0.1, |
|
|
|
|
|
pad_token_id: int = 0, |
|
|
bos_token_id: int = 1, |
|
|
eos_token_id: int = 2, |
|
|
sep_token_id: int = 2, |
|
|
mask_token_id: int = 4, |
|
|
|
|
|
num_labels: int = 2, |
|
|
problem_type: str = None, |
|
|
classifier_num_layers: int = 0, |
|
|
classifier_dropout: float = 0.1, |
|
|
**kwargs, |
|
|
): |
|
|
super().__init__( |
|
|
pad_token_id=pad_token_id, |
|
|
bos_token_id=bos_token_id, |
|
|
eos_token_id=eos_token_id, |
|
|
**kwargs, |
|
|
) |
|
|
|
|
|
|
|
|
self.vocab_size = vocab_size |
|
|
self.hidden_size = hidden_size |
|
|
self.num_hidden_layers = num_hidden_layers |
|
|
self.num_attention_heads = num_attention_heads |
|
|
self.intermediate_size = intermediate_size |
|
|
self.hidden_dropout_prob = hidden_dropout_prob |
|
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob |
|
|
self.max_position_embeddings = max_position_embeddings |
|
|
|
|
|
|
|
|
self.max_relative_positions = max_relative_positions |
|
|
self.position_buckets = position_buckets |
|
|
self.pos_att_type = pos_att_type |
|
|
self.share_att_key = share_att_key |
|
|
|
|
|
|
|
|
self.ngie_kernel_size = ngie_kernel_size |
|
|
self.ngie_dropout = ngie_dropout |
|
|
|
|
|
|
|
|
self.sep_token_id = sep_token_id |
|
|
self.mask_token_id = mask_token_id |
|
|
|
|
|
|
|
|
self.num_labels = num_labels |
|
|
self.problem_type = problem_type |
|
|
self.classifier_num_layers = classifier_num_layers |
|
|
self.classifier_dropout = classifier_dropout |
|
|
|