UTR-LM-MLM / configuration_utrlm.py
Taykhoom's picture
Upload folder using huggingface_hub
0a535de verified
from transformers import PretrainedConfig
class UtrLmConfig(PretrainedConfig):
    """
    Configuration for UTR-LM (ESM2-based RNA language model).

    Vocab (10 tokens):
        <pad>:0  <eos>:1  <unk>:2  A:3  G:4  C:5  T:6  <cls>:7  <mask>:8  <sep>:9

    Every named constructor argument is stored unchanged as an instance
    attribute of the same name. How each value is consumed is decided by the
    modeling/tokenization code, which lives outside this file, so the
    descriptions below marked "presumably" are read off the argument names
    and should be confirmed against that code.

    Args:
        num_layers: Presumably the number of transformer layers.
        embed_dim: Embedding width; also exposed read-only as ``hidden_size``.
        attention_heads: Presumably the number of attention heads per layer.
        alphabet_size: Vocabulary size (10 for the vocab listed above).
        padding_idx: Index of ``<pad>``. Also used as the default for the
            generic ``pad_token_id`` when the caller does not pass one.
        mask_idx: Index of ``<mask>``.
        cls_idx: Index of ``<cls>``.
        eos_idx: Index of ``<eos>``.
        prepend_bos: Presumably whether a leading special token is added to
            each sequence.
        append_eos: Presumably whether ``<eos>`` is appended to each sequence.
        token_dropout: Presumably toggles ESM-style token dropout.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``. An ``auto_map``
            entry, if present, is merged over the defaults defined here
            instead of being discarded.
    """

    model_type = "utrlm"

    def __init__(
        self,
        num_layers: int = 6,
        embed_dim: int = 128,
        attention_heads: int = 16,
        alphabet_size: int = 10,
        padding_idx: int = 0,
        mask_idx: int = 8,
        cls_idx: int = 7,
        eos_idx: int = 1,
        prepend_bos: bool = True,
        append_eos: bool = True,
        token_dropout: bool = True,
        **kwargs,
    ):
        # Keep the generic pad_token_id in sync with padding_idx unless the
        # caller (or a loaded config.json) sets it explicitly.
        kwargs.setdefault("pad_token_id", padding_idx)

        # Written into config.json so AutoModel / AutoModelForMaskedLM resolve
        # the correct classes when loading from the Hub with
        # trust_remote_code=True.
        auto_map = {
            "AutoConfig": "configuration_utrlm.UtrLmConfig",
            "AutoTokenizer": "tokenization_utrlm.UtrLmTokenizer",
            "AutoModel": "modeling_utrlm.UtrLmModel",
            "AutoModelForMaskedLM": "modeling_utrlm.UtrLmForMaskedLM",
        }
        # An auto_map arriving through kwargs may carry repo-qualified class
        # references; let those win over the bare defaults rather than
        # overwriting them, while still guaranteeing all four keys exist.
        supplied_auto_map = kwargs.pop("auto_map", None)
        if supplied_auto_map:
            auto_map.update(supplied_auto_map)

        super().__init__(**kwargs)

        self.auto_map = auto_map

        self.num_layers = num_layers
        self.embed_dim = embed_dim
        self.attention_heads = attention_heads
        self.alphabet_size = alphabet_size
        self.padding_idx = padding_idx
        self.mask_idx = mask_idx
        self.cls_idx = cls_idx
        self.eos_idx = eos_idx
        self.prepend_bos = prepend_bos
        self.append_eos = append_eos
        self.token_dropout = token_dropout

    @property
    def hidden_size(self) -> int:
        """Read-only alias for ``embed_dim``."""
        return self.embed_dim