File size: 1,724 Bytes
03d9aff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from transformers import PretrainedConfig


class UtrLmConfig(PretrainedConfig):
    """
    Settings container for UTR-LM, an ESM2-based RNA language model.

    Every constructor argument is stored unchanged as an instance attribute
    of the same name. The default special-token indices line up with the
    10-token vocabulary:

        <pad>:0  <eos>:1  <unk>:2  A:3  G:4  C:5  T:6  <cls>:7  <mask>:8  <sep>:9

    Args:
        num_layers: Layer count (presumably transformer blocks; interpreted
            by the modeling code, which is not part of this file).
        embed_dim: Embedding width; also exposed through ``hidden_size``.
        attention_heads: Attention head count (interpreted by the modeling
            code).
        alphabet_size: Vocabulary size; the default matches the 10 tokens
            listed above.
        padding_idx: Index of ``<pad>``. Also used as the fallback for the
            base class's ``pad_token_id`` when the caller does not pass one.
        mask_idx: Index of ``<mask>``.
        cls_idx: Index of ``<cls>``.
        eos_idx: Index of ``<eos>``.
        prepend_bos: Stored flag; presumably tells the consumer to prepend a
            start token to each sequence (confirm against tokenizer/model).
        append_eos: Stored flag; presumably tells the consumer to append
            ``<eos>`` to each sequence (confirm against tokenizer/model).
        token_dropout: Stored flag; semantics are defined by the modeling
            code.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    model_type = "utrlm"

    def __init__(
        self,
        num_layers: int = 6,
        embed_dim: int = 128,
        attention_heads: int = 16,
        alphabet_size: int = 10,
        padding_idx: int = 0,
        mask_idx: int = 8,
        cls_idx: int = 7,
        eos_idx: int = 1,
        prepend_bos: bool = True,
        append_eos: bool = True,
        token_dropout: bool = True,
        **kwargs,
    ):
        # A pad_token_id supplied by the caller takes precedence; otherwise
        # the base class is handed padding_idx.
        if "pad_token_id" not in kwargs:
            kwargs["pad_token_id"] = padding_idx
        super().__init__(**kwargs)

        # Assigned unconditionally after the base initialiser has run. The
        # mapping ends up in config.json so that AutoModel and
        # AutoModelForMaskedLM pick the right classes when the checkpoint is
        # loaded from the Hub with trust_remote_code=True.
        self.auto_map = dict(
            AutoConfig="configuration_utrlm.UtrLmConfig",
            AutoTokenizer="tokenization_utrlm.UtrLmTokenizer",
            AutoModel="modeling_utrlm.UtrLmModel",
            AutoModelForMaskedLM="modeling_utrlm.UtrLmForMaskedLM",
        )

        # Architecture hyper-parameters.
        self.num_layers, self.embed_dim, self.attention_heads = (
            num_layers,
            embed_dim,
            attention_heads,
        )

        # Vocabulary size and special-token indices.
        (
            self.alphabet_size,
            self.padding_idx,
            self.mask_idx,
            self.cls_idx,
            self.eos_idx,
        ) = (alphabet_size, padding_idx, mask_idx, cls_idx, eos_idx)

        # Sequence-framing and dropout switches.
        self.prepend_bos, self.append_eos, self.token_dropout = (
            prepend_bos,
            append_eos,
            token_dropout,
        )

    @property
    def hidden_size(self) -> int:
        """Read-only alias that returns ``embed_dim``."""
        return self.embed_dim