sarahalamdari committed on
Commit
062f736
·
verified ·
1 Parent(s): 6235f57

Add files using upload-large-folder tool

Browse files
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
__init__.py ADDED
File without changes
config.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "JambaForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "attn_layer_offset": 4,
7
+ "attn_layer_period": 8,
8
+ "auto_map": {
9
+ "AutoConfig": "ai21labs/Jamba-v0.1--configuration_jamba.JambaConfig",
10
+ "AutoModel": "ai21labs/Jamba-v0.1--modeling_jamba.JambaModel",
11
+ "AutoModelForCausalLM": "ai21labs/Jamba-v0.1--modeling_jamba.JambaForCausalLM",
12
+ "AutoModelForSequenceClassification": "ai21labs/Jamba-v0.1--model.JambaForSequenceClassification"
13
+ },
14
+ "bos_token_id": 29,
15
+ "eos_token_id": 27,
16
+ "expert_layer_offset": 1,
17
+ "expert_layer_period": 2,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 256,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 1024,
22
+ "mamba_conv_bias": true,
23
+ "mamba_d_conv": 4,
24
+ "mamba_d_state": 16,
25
+ "mamba_dt_rank": 16,
26
+ "mamba_expand": 2,
27
+ "mamba_proj_bias": false,
28
+ "max_position_embeddings": 262144,
29
+ "model_type": "jamba",
30
+ "num_attention_heads": 16,
31
+ "num_experts": 16,
32
+ "num_experts_per_tok": 2,
33
+ "num_hidden_layers": 24,
34
+ "num_key_value_heads": 8,
35
+ "num_logits_to_keep": 1,
36
+ "output_router_logits": true,
37
+ "pad_token_id": 30,
38
+ "rms_norm_eps": 1e-06,
39
+ "router_aux_loss_coef": 0.001,
40
+ "sliding_window": null,
41
+ "tie_word_embeddings": false,
42
+ "torch_dtype": "bfloat16",
43
+ "transformers_version": "4.51.3",
44
+ "use_cache": false,
45
+ "use_mamba_kernels": true,
46
+ "vocab_size": 40
47
+ }
data_summary_card.md ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Data Summary for microsoft_Dayhoff-170m-UR90, Dayhoff-3b-UR90, Dayhoff-170m-GR, Dayhoff-170m-UR50-BRn, Dayhoff-3b-GR-HM-c, Dayhoff-3b-GR-HM, Dayhoff-170m-UR50, Dayhoff-170m-UR50-BRq, Dayhoff-170m-UR50-BRu
3
+
4
+
5
+
6
+
7
+
8
+ ## 1. General information
9
+
10
+ **1.0.1 Version of the Summary:** 1.0
11
+
12
+
13
+
14
+ **1.0.2 Last update:** 4-Dec-2025
15
+
16
+
17
+
18
+ ## 1.1 Model Developer Identification
19
+
20
+ **1.1.1 Model Developer name and contact details:** Microsoft Corporation at One Microsoft Way, Redmond, WA 98052. Tel: 425-882-8080
21
+
22
+
23
+
24
+ ## 1.2 Model Identification
25
+
26
+ **1.2.1 Versioned model name(s):** Dayhoff
27
+
28
+
29
+
30
+ **1.2.2 Model release date:** 25-Jul-2025
31
+
32
+
33
+
34
+ ## 1.3 Overall training data size and characteristics
35
+
36
+ ### 1.3.1 Size of dataset and characteristics
37
+
38
+ **1.3.1.A Text training data size:** Not applicable.
39
+
40
+
41
+
42
+ **1.3.1.B Text training data content:** Not applicable. Text data is not part of the training data.
43
+
44
+
45
+
46
+ **1.3.1.C Image training data size:** Not applicable.
47
+
48
+
49
+
50
+ **1.3.1.D Image training data content:** Not applicable. Images are not part of the training data.
51
+
52
+
53
+
54
+ **1.3.1.E Audio training data size:** Not applicable.
55
+
56
+
57
+
58
+ **1.3.1.F Audio training data content:** Not applicable. Audio data is not part of the training data.
59
+
60
+
61
+
62
+ **1.3.1.G Video training data size:** Not applicable.
63
+
64
+
65
+
66
+ **1.3.1.H Video training data content:** Not applicable. Video data is not part of the training data.
67
+
68
+
69
+
70
+ **1.3.1.I Other training data size:** Training data consists of protein sequences and multiple sequence alignments; sizes include 3.34 billion sequences across 1.7 billion clusters (Gigaref), 46 million structure-derived synthetic sequences (BackboneRef), and 16 million MSAs (OpenProteinSet).
71
+
72
+
73
+
74
+ **1.3.1.J Other training data content:**
75
+
76
+
77
+
78
+ **1.3.2 Latest date of data acquisition/collection for model training:** UniRef (January 2024), Gigaref (July 2024), BackboneRef (July 2024), OpenProteinSet (August 2023)
79
+
80
+
81
+
82
+ **1.3.3 Is data collection ongoing to update the model with new data collection after deployment?** No
83
+
84
+
85
+
86
+ **1.3.4 Date the training dataset was first used to train the model:** April 2024
87
+
88
+
89
+
90
+ **1.3.5 Rationale or purpose of data selection:** Datasets combine large-scale metagenomic and structure-based synthetic protein sequences to maximize coverage, diversity, and novelty of protein sequence space, supporting tasks like zero-shot mutation effect prediction, motif scaffolding, and guided generation of novel proteins with improved cellular expression rates.
91
+
92
+
93
+
94
+ ## 2. List of data sources
95
+
96
+ ### 2.1 Publicly available datasets
97
+
98
+ **2.1.1 Have you used publicly available datasets to train the model?** Yes
99
+
100
+
101
+
102
+ ## 2.2 Private non-publicly available datasets obtained from third parties
103
+
104
+ ### 2.2.1 Datasets commercially licensed by rights holders or their representatives
105
+
106
+ **2.2.1.A Have you concluded transactional commercial licensing agreement(s) with rights holder(s) or with their representatives?** No
107
+
108
+
109
+
110
+ ### 2.2.2 Private datasets obtained from other third-parties
111
+
112
+ **2.2.2.A Have you obtained private datasets from third parties that are not licensed as described in Section 2.2.1, such as data obtained from providers of private databases, or data intermediaries?** No
113
+
114
+
115
+
116
+ ## 2.3 Personal Information
117
+
118
+ **2.3.1 Was personal data used to train the model?** Microsoft follows all relevant laws and regulations pertaining to personal information.
119
+
120
+
121
+
122
+
123
+
124
+
125
+
126
+ ## 2.4 Synthetic data
127
+
128
+ **2.4.1 Was any synthetic AI-generated data used to train the model?** Yes
129
+
130
+
131
+
132
+ ## 3. Data processing aspects
133
+
134
+ ### 3.1 Respect of reservation of rights from text and data mining exception or limitation
135
+
136
+ **3.1.1 Does this dataset include any data protected by copyright, trademark, or patent?** Microsoft follows all required regulations and laws for processing data protected by copyright, trademark, or patent.
137
+
138
+
139
+
140
+ ## 3.2 Other information
141
+
142
+ **3.2.1 Does the dataset include information about consumer groups without revealing individual consumer identities?** Microsoft follows all required regulations and laws for protecting consumer identities.
143
+
144
+
145
+
146
+
147
+
148
+ **3.2.2 Was the dataset cleaned or modified before model training?** Yes
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 29,
4
+ "eos_token_id": 27,
5
+ "pad_token_id": 30,
6
+ "transformers_version": "4.51.3",
7
+ "use_cache": false
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cd100adee7d6ef17c65271c769e8b51d6b4ed1220c1aa904bf868f2115703a2
3
+ size 341054112
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "@",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "*",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "mask_token": {
17
+ "content": "#",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "pad_token": {
24
+ "content": "!",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "sep_token": {
31
+ "content": "/",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "27": {
4
+ "content": "*",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "28": {
12
+ "content": "#",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "29": {
20
+ "content": "@",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "30": {
28
+ "content": "!",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "31": {
36
+ "content": "/",
37
+ "lstrip": false,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "auto_map": {
45
+ "AutoTokenizer": [
46
+ "tokenizers.ProteinTokenizer",
47
+ null
48
+ ]
49
+ },
50
+ "bos_token": "@",
51
+ "clean_up_tokenization_spaces": false,
52
+ "eos_token": "*",
53
+ "extra_special_tokens": {},
54
+ "mask_token": "#",
55
+ "model_max_length": 2048,
56
+ "pad_token": "!",
57
+ "sep_token": "/",
58
+ "tokenizer_class": "ProteinTokenizer"
59
+ }
tokenizers.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, Optional, Union
3
+
4
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
5
+
6
+ MASK = "#"  # default mask_token of ProteinTokenizer
7
+ MSA_PAD = "!"  # default pad_token of ProteinTokenizer
8
+ UL_ALPHABET_PLUS = "ACDEFGHIKLMNPQRSTVWYBZXJOU-*#@!/[]{}"  # default protein_alphabet; a character's position is its token id
9
+ MSA_AAS = "ACDEFGHIKLMNPQRSTVWYBZXJOU-"  # default all_aas: the letter characters of the alphabet plus "-"
10
+ GAP = "-"  # default gap_token of ProteinTokenizer
11
+ START = "@"  # default bos_token of ProteinTokenizer
12
+ STOP = "*"  # default eos_token of ProteinTokenizer
13
+ SEP = "/"  # default sep_token of ProteinTokenizer
14
+ END_AL = "]"  # not referenced in the visible code; presumably an end-of-segment marker paired with START_AL -- TODO confirm
15
+ END_UL = "}"  # not referenced in the visible code; presumably an end-of-segment marker paired with START_UL -- TODO confirm
16
+ START_AL = "["  # not referenced in the visible code; presumably a start-of-segment marker -- TODO confirm
17
+ START_UL = "{"  # not referenced in the visible code; presumably a start-of-segment marker -- TODO confirm
18
+
19
class ProteinTokenizer(PreTrainedTokenizer):
    """Character-level tokenizer in which every alphabet character is one token.

    A token's id is the position of its character in ``protein_alphabet``.
    No special tokens are inserted automatically, and sequence pairs are not
    supported (the pair-aware hooks raise ``NotImplementedError``).
    """

    def __init__(
        self,
        protein_alphabet: str = UL_ALPHABET_PLUS,
        model_max_length: int = 2048,
        pad_token=MSA_PAD,
        mask_token=MASK,
        all_aas=MSA_AAS,
        gap_token=GAP,
        bos_token=START,
        eos_token=STOP,
        sep_token=SEP,
        **kwargs
    ):
        """Build the character vocabulary and register the special tokens.

        Args:
            protein_alphabet: Characters making up the vocabulary; the index
                of a character is its token id.
            model_max_length: Model maximum sequence length.
            pad_token: Padding token (string or ``AddedToken``).
            mask_token: Mask token (string or ``AddedToken``).
            all_aas: Characters stored, one per element, as ``self.all_aas``.
            gap_token: Gap character, stored as ``self.gap_token``.
            bos_token: Beginning-of-sequence token (string or ``AddedToken``).
            eos_token: End-of-sequence token (string or ``AddedToken``).
            sep_token: Separator token (string or ``AddedToken``).
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # The lookup tables are populated before super().__init__() because
        # the base constructor may query the vocabulary while it registers
        # the special tokens.
        self.alphabet = list("".join(protein_alphabet))
        self.all_aas = list("".join(all_aas))
        self.a_to_i = {u: i for i, u in enumerate(self.alphabet)}
        self.i_to_a = dict(enumerate(self.alphabet))
        self.gap_token = gap_token

        def _as_added_token(token):
            # Plain strings are wrapped so that surrounding whitespace is
            # never stripped; ready-made AddedToken objects pass through.
            if isinstance(token, str):
                return AddedToken(token, lstrip=False, rstrip=False)
            return token

        super().__init__(
            pad_token=_as_added_token(pad_token),
            mask_token=_as_added_token(mask_token),
            eos_token=_as_added_token(eos_token),
            bos_token=_as_added_token(bos_token),
            sep_token=_as_added_token(sep_token),
            model_max_length=model_max_length,
            **kwargs
        )

    @staticmethod
    def _reject_pair(token_ids_1) -> None:
        """Raise ``NotImplementedError`` if a second sequence was supplied."""
        if token_ids_1 is not None:
            raise NotImplementedError("This tokenizer does not support two sequences")

    @property
    def vocab_size(self):
        """Number of characters in the alphabet."""
        return len(self.alphabet)

    @property
    def gap_token_id(self):
        """Token id of ``self.gap_token``."""
        return self.convert_tokens_to_ids(self.gap_token)

    def get_vocab(self):
        """Return the character -> id mapping.

        A copy is returned so callers cannot alter the tokenizer's own table.
        """
        return dict(self.a_to_i)

    def _tokenize(self, text: str) -> List[str]:
        """Split ``text`` into its individual characters."""
        return list(text)

    def _convert_token_to_id(self, token) -> int:
        """Look up the id of ``token``; raises ``KeyError`` if it is not in the alphabet."""
        return self.a_to_i[token]

    def _convert_id_to_token(self, index) -> str:
        """Look up the character for ``index``; raises ``KeyError`` if out of range."""
        return self.i_to_a[index]

    def convert_tokens_to_string(self, tokens):
        """Concatenate ``tokens`` with no separator."""
        return "".join(tokens)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Return ``token_ids_0`` unchanged; no special tokens are added.

        Raises:
            NotImplementedError: If ``token_ids_1`` is given.
        """
        self._reject_pair(token_ids_1)
        return token_ids_0

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """Return a special-token mask for ``token_ids_0``.

        When ``already_has_special_tokens`` is true the base-class
        implementation is used. Otherwise the mask is all zeros, since this
        tokenizer never inserts special tokens on its own.

        Raises:
            NotImplementedError: If ``token_ids_1`` is given and
                ``already_has_special_tokens`` is false.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        self._reject_pair(token_ids_1)
        return [0] * len(token_ids_0)

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Return an all-zero token-type list as long as ``token_ids_0``.

        Raises:
            NotImplementedError: If ``token_ids_1`` is given.
        """
        self._reject_pair(token_ids_1)
        return [0] * len(token_ids_0)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
        """Delegate to ``PreTrainedTokenizer.save_pretrained`` (return value is discarded)."""
        super().save_pretrained(save_directory, **kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        """Write nothing and return an empty tuple.

        The vocabulary is derived entirely from the constructor's alphabet
        argument, so this method produces no vocabulary file.
        """
        return ()