Add files using upload-large-folder tool

Browse files

Files changed (9) hide show

LICENSE +21 -0
__init__.py +0 -0
config.json +47 -0
data_summary_card.md +148 -0
generation_config.json +8 -0
model.safetensors +3 -0
special_tokens_map.json +37 -0
tokenizer_config.json +59 -0
tokenizers.py +127 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) Microsoft Corporation.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

__init__.py ADDED Viewed

File without changes

config.json ADDED Viewed

	@@ -0,0 +1,47 @@

+{
+  "architectures": [
+    "JambaForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "attn_layer_offset": 4,
+  "attn_layer_period": 8,
+  "auto_map": {
+    "AutoConfig": "ai21labs/Jamba-v0.1--configuration_jamba.JambaConfig",
+    "AutoModel": "ai21labs/Jamba-v0.1--modeling_jamba.JambaModel",
+    "AutoModelForCausalLM": "ai21labs/Jamba-v0.1--modeling_jamba.JambaForCausalLM",
+    "AutoModelForSequenceClassification": "ai21labs/Jamba-v0.1--model.JambaForSequenceClassification"
+  },
+  "bos_token_id": 29,
+  "eos_token_id": 27,
+  "expert_layer_offset": 1,
+  "expert_layer_period": 2,
+  "hidden_act": "silu",
+  "hidden_size": 256,
+  "initializer_range": 0.02,
+  "intermediate_size": 1024,
+  "mamba_conv_bias": true,
+  "mamba_d_conv": 4,
+  "mamba_d_state": 16,
+  "mamba_dt_rank": 16,
+  "mamba_expand": 2,
+  "mamba_proj_bias": false,
+  "max_position_embeddings": 262144,
+  "model_type": "jamba",
+  "num_attention_heads": 16,
+  "num_experts": 16,
+  "num_experts_per_tok": 2,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 8,
+  "num_logits_to_keep": 1,
+  "output_router_logits": true,
+  "pad_token_id": 30,
+  "rms_norm_eps": 1e-06,
+  "router_aux_loss_coef": 0.001,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.51.3",
+  "use_cache": false,
+  "use_mamba_kernels": true,
+  "vocab_size": 40
+}

data_summary_card.md ADDED Viewed

	@@ -0,0 +1,148 @@

+# Data Summary for microsoft_Dayhoff-170m-UR90, Dayhoff-3b-UR90, Dayhoff-170m-GR, Dayhoff-170m-UR50-BRn, Dayhoff-3b-GR-HM-c, Dayhoff-3b-GR-HM, Dayhoff-170m-UR50, Dayhoff-170m-UR50-BRq, Dayhoff-170m-UR50-BRu
+## 1. General information
+**1.0.1 Version of the Summary:** 1.0
+**1.0.2 Last update:** 4-Dec-2025
+## 1.1 Model Developer Identification
+**1.1.1 Model Developer name and contact details:** Microsoft Corporation at One Microsoft Way, Redmond, WA 98052. Tel: 425-882-8080
+## 1.2 Model Identification
+**1.2.1 Versioned model name(s):** Dayhoff
+**1.2.2 Model release date:** 25-Jul-2025
+## 1.3 Overall training data size and characteristics
+### 1.3.1 Size of dataset and characteristics
+**1.3.1.A Text training data size:** Not applicable.
+**1.3.1.B Text training data content:** Not applicable. Text data is not part of the training data.
+**1.3.1.C Image training data size:** Not applicable.
+**1.3.1.D Image training data content:** Not applicable. Images are not part of the training data.
+**1.3.1.E Audio training data size:** Not applicable.
+**1.3.1.F Audio training data content:** Not applicable. Audio data is not part of the training data.
+**1.3.1.G Video training data size:** Not applicable.
+**1.3.1.H Video training data content:** Not applicable. Video data is not part of the training data.
+**1.3.1.I Other training data size:** Training data consists of protein sequences and multiple sequence alignments (MSAs); sizes include 3.34 billion sequences across 1.7 billion clusters (GigaRef), 46 million structure-derived synthetic sequences (BackboneRef), and 16 million MSAs (OpenProteinSet).
+**1.3.1.J Other training data content:**
+**1.3.2 Latest date of data acquisition/collection for model training:** UniRef (January 2024), GigaRef (July 2024), BackboneRef (July 2024), OpenProteinSet (August 2023)
+**1.3.3 Is data collection ongoing to update the model with new data collection after deployment?** No
+**1.3.4 Date the training dataset was first used to train the model:** April 2024
+**1.3.5 Rationale or purpose of data selection:** Datasets combine large-scale metagenomic and structure-based synthetic protein sequences to maximize coverage, diversity, and novelty of protein sequence space, supporting tasks like zero-shot mutation effect prediction, motif scaffolding, and guided generation of novel proteins with improved cellular expression rates.
+## 2. List of data sources
+### 2.1 Publicly available datasets
+**2.1.1 Have you used publicly available datasets to train the model?** Yes
+## 2.2 Private non-publicly available datasets obtained from third parties
+### 2.2.1 Datasets commercially licensed by rights holders or their representatives
+**2.2.1.A Have you concluded transactional commercial licensing agreement(s) with rights holder(s) or with their representatives?** No
+### 2.2.2 Private datasets obtained from other third-parties
+**2.2.2.A Have you obtained private datasets from third parties that are not licensed as described in Section 2.2.1, such as data obtained from providers of private databases, or data intermediaries?** No
+## 2.3 Personal Information
+**2.3.1 Was personal data used to train the model?** Microsoft follows all relevant laws and regulations pertaining to personal information.
+## 2.4 Synthetic data
+**2.4.1 Was any synthetic AI-generated data used to train the model?** Yes
+## 3. Data processing aspects
+### 3.1 Respect of reservation of rights from text and data mining exception or limitation
+**3.1.1 Does this dataset include any data protected by copyright, trademark, or patent?** Microsoft follows all required regulations and laws for processing data protected by copyright, trademark, or patent.
+## 3.2 Other information
+**3.2.1 Does the dataset include information about consumer groups without revealing individual consumer identities?** Microsoft follows all required regulations and laws for protecting consumer identities.
+**3.2.2 Was the dataset cleaned or modified before model training?** Yes

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 29,
+  "eos_token_id": 27,
+  "pad_token_id": 30,
+  "transformers_version": "4.51.3",
+  "use_cache": false
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cd100adee7d6ef17c65271c769e8b51d6b4ed1220c1aa904bf868f2115703a2
+size 341054112

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "bos_token": {
+    "content": "@",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "*",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "#",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "!",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "/",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "added_tokens_decoder": {
+    "27": {
+      "content": "*",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "28": {
+      "content": "#",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "29": {
+      "content": "@",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "30": {
+      "content": "!",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "31": {
+      "content": "/",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenizers.ProteinTokenizer",
+      null
+    ]
+  },
+  "bos_token": "@",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "*",
+  "extra_special_tokens": {},
+  "mask_token": "#",
+  "model_max_length": 2048,
+  "pad_token": "!",
+  "sep_token": "/",
+  "tokenizer_class": "ProteinTokenizer"
+}

tokenizers.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import os
+from typing import List, Optional, Union
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+# Single-character symbols used by ProteinTokenizer below. Each symbol is one
+# token; its id is its index in the alphabet string handed to the tokenizer.
+# Default mask token of ProteinTokenizer.
+MASK = "#"
+# Default padding token of ProteinTokenizer.
+MSA_PAD = "!"
+# Default tokenizer alphabet. Character ORDER defines the token ids
+# (index 0 = "A", ..., 26 = "-", 27 = "*", 28 = "#", 29 = "@", 30 = "!",
+# 31 = "/"), so this string must not be reordered.
+UL_ALPHABET_PLUS = "ACDEFGHIKLMNPQRSTVWYBZXJOU-*#@!/[]{}"
+# Default value of the tokenizer's ``all_aas`` argument: the leading 27
+# characters of UL_ALPHABET_PLUS (residue letters plus the gap symbol).
+MSA_AAS = "ACDEFGHIKLMNPQRSTVWYBZXJOU-"
+# Default gap token of ProteinTokenizer.
+GAP = "-"
+# Default beginning-of-sequence token of ProteinTokenizer.
+START = "@"
+# Default end-of-sequence token of ProteinTokenizer.
+STOP = "*"
+# Default separator token of ProteinTokenizer.
+SEP = "/"
+# The four bracket symbols below are part of UL_ALPHABET_PLUS but are not
+# referenced by any other code visible in this file.
+# NOTE(review): names suggest start/end delimiters for "AL" and "UL" sequence
+# layouts -- confirm against the training code.
+END_AL = "]"
+END_UL = "}"
+START_AL = "["
+START_UL = "{"
+class ProteinTokenizer(PreTrainedTokenizer):
+    def __init__(
+        self,
+        protein_alphabet: str = UL_ALPHABET_PLUS,
+        model_max_length: int = 2048,
+        pad_token=MSA_PAD,
+        mask_token=MASK,
+        all_aas=MSA_AAS,
+        gap_token=GAP,
+        bos_token=START,
+        eos_token=STOP,
+        sep_token=SEP,
+        **kwargs
+    ):
+        """Character tokenizer for Hugging Face transformers.
+        model_max_length (int): Model maximum sequence length.
+        """
+        self.alphabet = list("".join(protein_alphabet))
+        self.all_aas = list("".join(all_aas))
+        self.a_to_i = {u: i for i, u in enumerate(self.alphabet)}
+        self.i_to_a = {i: u for i, u in enumerate(self.alphabet)}
+        self.gap_token = gap_token
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+        mask_token = AddedToken(mask_token, lstrip=False, rstrip=False) if isinstance(mask_token, str) else mask_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+        gap_token = AddedToken(gap_token, lstrip=False, rstrip=False) if isinstance(gap_token, str) else gap_token
+        super().__init__(
+            pad_token=pad_token,
+            mask_token=mask_token,
+            eos_token=eos_token,
+            bos_token=bos_token,
+            sep_token=sep_token,
+            model_max_length=model_max_length,
+            **kwargs
+        )
+    @property
+    def vocab_size(self):
+        return len(self.alphabet)
+    @property
+    def gap_token_id(self):
+        return self.convert_tokens_to_ids(self.gap_token)
+    def get_vocab(self):
+        return self.a_to_i
+    def _tokenize(self, text: str) -> List[str]:
+        return list(text)
+    def _convert_token_to_id(self, token) -> int:
+        return self.a_to_i[token]
+    def _convert_id_to_token(self, index) -> str:
+        return self.i_to_a[index]
+    def convert_tokens_to_string(self, tokens):
+        return "".join(tokens)
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        result = token_ids_0
+        if token_ids_1 is not None:
+            raise NotImplementedError("This tokenizer does not support two sequences")
+        return result
+    def get_special_tokens_mask(
+        self,
+        token_ids_0: List[int],
+        token_ids_1: Optional[List[int]] = None,
+        already_has_special_tokens: bool = False,
+    ) -> List[int]:
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True,
+            )
+        result = [0] * len(token_ids_0)
+        if token_ids_1 is not None:
+            raise NotImplementedError("This tokenizer does not support two sequences")
+        return result
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Identifies the type of token. 0 for the first sentence, 1 for the second sentence if it exists
+        """
+        result = len(token_ids_0) * [0]
+        if token_ids_1 is not None:
+            raise NotImplementedError("This tokenizer does not support two sequences")
+        return result
+    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
+        super().save_pretrained(save_directory, **kwargs)
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
+        return ()