from transformers import PretrainedConfig


class UtrLmConfig(PretrainedConfig):
    """Hyperparameter container for UTR-LM, an ESM2-based RNA language model.

    Vocabulary layout (10 entries by default, see ``alphabet_size``):

    ===== =======================================================
    index token
    ===== =======================================================
    0     padding (default ``padding_idx``)
    1     end-of-sequence (default ``eos_idx``)
    2     special token -- name not visible in this file
    3     ``A``
    4     ``G``
    5     ``C``
    6     ``T``
    7     CLS / beginning-of-sequence (default ``cls_idx``)
    8     mask (default ``mask_idx``)
    9     special token -- name not visible in this file
    ===== =======================================================

    Every constructor argument is stored unchanged as an attribute of the
    same name. How each value is interpreted is decided by the modeling and
    tokenization code, which lives outside this file.

    Args:
        num_layers: Stored as ``self.num_layers`` (presumably the number of
            transformer layers -- confirm against the modeling code).
        embed_dim: Stored as ``self.embed_dim``; also exposed through the
            read-only ``hidden_size`` property.
        attention_heads: Stored as ``self.attention_heads``.
        alphabet_size: Stored as ``self.alphabet_size`` (vocabulary size).
        padding_idx: Index of the padding token. Also used as the default
            for ``pad_token_id`` when the caller does not pass one.
        mask_idx: Index of the mask token.
        cls_idx: Index of the CLS token.
        eos_idx: Index of the end-of-sequence token.
        prepend_bos: Stored as ``self.prepend_bos``.
        append_eos: Stored as ``self.append_eos``.
        token_dropout: Stored as ``self.token_dropout``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "utrlm"

    def __init__(
        self,
        num_layers: int = 6,
        embed_dim: int = 128,
        attention_heads: int = 16,
        alphabet_size: int = 10,
        padding_idx: int = 0,
        mask_idx: int = 8,
        cls_idx: int = 7,
        eos_idx: int = 1,
        prepend_bos: bool = True,
        append_eos: bool = True,
        token_dropout: bool = True,
        **kwargs,
    ):
        # An explicitly supplied pad_token_id wins; otherwise mirror the
        # vocabulary's padding index into the base-class field.
        if "pad_token_id" not in kwargs:
            kwargs["pad_token_id"] = padding_idx
        super().__init__(**kwargs)

        # Serialized into config.json so that AutoModel / AutoModelForMaskedLM
        # pick the right classes when the checkpoint is loaded from the Hub
        # with trust_remote_code=True.
        remote_entry_points = (
            ("AutoConfig", "configuration_utrlm.UtrLmConfig"),
            ("AutoTokenizer", "tokenization_utrlm.UtrLmTokenizer"),
            ("AutoModel", "modeling_utrlm.UtrLmModel"),
            ("AutoModelForMaskedLM", "modeling_utrlm.UtrLmForMaskedLM"),
        )
        self.auto_map = dict(remote_entry_points)

        # Architecture sizes.
        self.num_layers = num_layers
        self.embed_dim = embed_dim
        self.attention_heads = attention_heads

        # Vocabulary size and special-token indices.
        self.alphabet_size = alphabet_size
        self.padding_idx = padding_idx
        self.mask_idx = mask_idx
        self.cls_idx = cls_idx
        self.eos_idx = eos_idx

        # Sequence-framing and embedding flags.
        self.prepend_bos = prepend_bos
        self.append_eos = append_eos
        self.token_dropout = token_dropout

    @property
    def hidden_size(self) -> int:
        """Read-only alias that returns ``self.embed_dim``."""
        return self.embed_dim