Mishamq committed on
Commit a5649d8 · verified · 1 Parent(s): 310f831

Upload folder using huggingface_hub

README.md ADDED
---
license: apache-2.0
language:
- en
library_name: transformers
tags:
- genomics
- dna
- mamba
- hybrid
- biology
---

# HybriDNA-3B

HybriDNA is a hybrid Mamba-Attention model for DNA sequence modeling. This is the 3B-parameter variant.

## Model Description

HybriDNA combines the efficiency of Mamba state space models with the expressiveness of attention mechanisms in a hybrid architecture. The model alternates between Mamba and attention layers to achieve both computational efficiency and strong sequence modeling capabilities.

### Architecture

- **Parameters**: ~3B
- **Hidden Size**: 4096
- **Layers**: 16 (hybrid Mamba + Attention)
- **Attention Heads**: 32
- **Key-Value Heads**: 8 (Grouped Query Attention)
- **Mamba Version**: Mamba-2
- **Vocabulary**: 12 tokens (A, C, G, T, N + special tokens)
- **Max Sequence Length**: 131,202 bp

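The exact layer layout can be derived from this repository's `config.json` (`attn_layer_period: 8`, `attn_layer_offset: 4`) using the same rule as the `layers_block_type` property in `configuration_hybridna.py`; a minimal sketch:

```python
# Reproduce the layer-type schedule used by HybriDNAConfig.layers_block_type:
# layer i is an attention layer when i % attn_layer_period == attn_layer_offset.
attn_layer_period = 8   # from config.json
attn_layer_offset = 4   # from config.json
num_hidden_layers = 16  # from config.json

layers = [
    "attention" if i % attn_layer_period == attn_layer_offset else "mamba"
    for i in range(num_hidden_layers)
]
print(layers)  # attention at layers 4 and 12; all other layers are Mamba-2
```

Consistent with this schedule, the safetensors weight index contains `self_attn.*` weights for layer 12 and `mamba.*` weights for the surrounding layers.
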
## Installation

```bash
pip install transformers torch mamba-ssm causal-conv1d flash-attn
```

## Usage

### Text Generation

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "Mishamq/HybriDNA-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

prompt = "ACGTACGT"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```

### Embeddings

```python
from transformers import AutoTokenizer, AutoModel
import torch

model_name = "Mishamq/HybriDNA-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

sequence = "ACGTACGTACGTACGT"
inputs = tokenizer(sequence, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state
```

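To get a single vector per sequence, the token-level `last_hidden_state` is commonly reduced by masked mean pooling (one common option, not prescribed by this repo); a minimal numpy sketch on dummy data, where only the shapes stand in for the model outputs:

```python
import numpy as np

# Dummy stand-ins for model outputs: batch of 2 sequences, 5 tokens, hidden size 4.
hidden = np.arange(2 * 5 * 4, dtype=np.float64).reshape(2, 5, 4)
attention_mask = np.array([[1, 1, 1, 0, 0],
                           [1, 1, 1, 1, 1]])

# Zero out padded positions, then average over the real tokens only.
mask = attention_mask[:, :, None]     # (batch, seq, 1)
summed = (hidden * mask).sum(axis=1)  # (batch, hidden)
counts = mask.sum(axis=1)             # (batch, 1)
pooled = summed / counts              # (batch, hidden)
print(pooled.shape)  # (2, 4)
```
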
## Model Variants

| Model | Parameters | Hidden Size | Layers |
|-------|------------|-------------|--------|
| [HybriDNA-300M](https://huggingface.co/Mishamq/HybriDNA-300M) | 300M | 1024 | 24 |
| [HybriDNA-3B](https://huggingface.co/Mishamq/HybriDNA-3B) | 3B | 4096 | 16 |
| [HybriDNA-7B](https://huggingface.co/Mishamq/HybriDNA-7B) | 7B | 4096 | 32 |

## Citation

If you use HybriDNA in your research, please cite:

```bibtex
@article{ma2025hybridna,
  title={HybriDNA: A Hybrid Transformer-Mamba2 Long-Range DNA Language Model},
  author={Ma, Mingqian and Liu, Guoqing and Cao, Chuan and Deng, Pan and Dao, Tri and Gu, Albert and Jin, Peiran and Yang, Zhao and Xia, Yingce and Luo, Renqian and others},
  journal={arXiv preprint arXiv:2502.10807},
  year={2025}
}
```

## License

Apache 2.0
config.json ADDED
{
  "_name_or_path": "./",
  "architectures": [
    "HybriDNAForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attn_layer_offset": 4,
  "attn_layer_period": 8,
  "auto_map": {
    "AutoConfig": "configuration_hybridna.HybriDNAConfig",
    "AutoModel": "modeling_hybridna.HybriDNAModel",
    "AutoModelForCausalLM": "modeling_hybridna.HybriDNAForCausalLM"
  },
  "bos_token_id": 2,
  "chunk_size": 256,
  "eos_token_id": 1,
  "expert_layer_offset": 7565761,
  "expert_layer_period": 2,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "mamba_conv_bias": true,
  "mamba_d_conv": 4,
  "mamba_d_state": 64,
  "mamba_dt_rank": 64,
  "mamba_expand": 2,
  "mamba_proj_bias": false,
  "mamba_version": "mamba-2",
  "max_position_embeddings": 8194,
  "model_type": "hybridna",
  "n_groups": 8,
  "num_attention_heads": 32,
  "num_experts": 8,
  "num_experts_per_tok": 2,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "num_logits_to_keep": 2,
  "output_router_logits": false,
  "pad_token_id": 4,
  "rms_norm_eps": 1e-06,
  "router_aux_loss_coef": 0.001,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "time_step_floor": 0.0001,
  "time_step_limit": [
    0.0,
    Infinity
  ],
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.4",
  "use_cache": false,
  "use_mamba_kernels": true,
  "vocab_size": 12
}
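Note that `time_step_limit` above uses the bare token `Infinity`, which is not valid strict JSON but is accepted by lenient parsers such as Python's `json` module (its default `parse_constant` handling); a quick check:

```python
import json

# Python's json module accepts Infinity/-Infinity/NaN by default, so the
# shipped config.json loads without special handling.
snippet = '{"time_step_limit": [0.0, Infinity]}'
limits = json.loads(snippet)["time_step_limit"]
print(limits)  # [0.0, inf]
```
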
configuration_hybridna.py ADDED
import math

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class HybriDNAConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`HybriDNA`] model. It is adapted from AI21's Jamba model.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 65536):
            Vocabulary size of the HybriDNA model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`HybriDNAModel`]
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
            model has an output word embedding layer.
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details check out [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
            integer value, only the last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
            logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
            sequence may use a lot of memory, so setting `num_logits_to_keep=1` will reduce the memory footprint
            significantly.
        pad_token_id (`int`, *optional*, defaults to 0):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If not specified, will default to `None`.
        max_position_embeddings (`int`, *optional*, defaults to 262144):
            This value doesn't have any real effect. The maximum sequence length that this model is intended to be
            used with. It can be used with longer sequences, but performance may degrade.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
            Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
            `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises a ValueError if
            `True` and the kernels are not available.
        mamba_d_state (`int`, *optional*, defaults to 16):
            The dimension of the mamba state space latents.
        mamba_d_conv (`int`, *optional*, defaults to 4):
            The size of the mamba convolution kernel.
        mamba_expand (`int`, *optional*, defaults to 2):
            Expanding factor (relative to hidden_size) used to determine the mamba intermediate size.
        mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
            Rank of the mamba discretization projection matrix. `"auto"` means that it will default to
            `math.ceil(self.hidden_size / 16)`.
        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
            Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
            Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"])
            of the mamba mixer block.
        head_dim (`int`, *optional*, defaults to 64):
            Dimension of each attention head.
        chunk_size (`int`, *optional*, defaults to 256):
            The size of each chunk for processing.
        n_groups (`int`, *optional*, defaults to 8):
            Number of groups for the evolution matrices of mamba 2.
        time_step_min (`float`, *optional*, defaults to 0.001):
            Minimum `time_step` used to bound `dt_proj.bias`.
        time_step_max (`float`, *optional*, defaults to 0.1):
            Maximum `time_step` used to bound `dt_proj.bias`.
        time_step_floor (`float`, *optional*, defaults to 0.0001):
            Minimum clamping value of the `dt_proj.bias` layer initialization.
        time_step_limit (`tuple`, *optional*, defaults to `(0.0, inf)`):
            Accepted range of time step values.
        attn_layer_period (`int`, *optional*, defaults to 8):
            Period of attention layers in the hybrid stack; see `layers_block_type`.
        attn_layer_offset (`int`, *optional*, defaults to 4):
            Offset within each period at which the attention layer is placed; see `layers_block_type`.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether to return the router logits from mixture-of-experts layers.
    """

    model_type = "hybridna"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=65536,
        tie_word_embeddings=False,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        num_logits_to_keep=1,
        sliding_window=None,
        max_position_embeddings=262144,
        attention_dropout=0.0,
        use_mamba_kernels=True,
        mamba_d_state=16,
        mamba_d_conv=4,
        mamba_expand=2,
        mamba_dt_rank="auto",
        mamba_conv_bias=True,
        mamba_proj_bias=False,
        head_dim=64,
        chunk_size=256,
        n_groups=8,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        time_step_min=0.001,
        time_step_max=0.1,
        time_step_floor=1e-4,
        time_step_limit=(0.0, float("inf")),
        attn_layer_period=8,
        attn_layer_offset=4,
        output_router_logits=False,
        **kwargs,
    ):
        self.output_router_logits = output_router_logits
        self.vocab_size = vocab_size
        self.tie_word_embeddings = tie_word_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window
        self.max_position_embeddings = max_position_embeddings
        self.attention_dropout = attention_dropout

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps

        self.use_cache = use_cache
        self.num_logits_to_keep = num_logits_to_keep

        self.use_mamba_kernels = use_mamba_kernels
        self.mamba_d_state = mamba_d_state
        self.mamba_d_conv = mamba_d_conv
        self.mamba_expand = mamba_expand
        self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
        self.mamba_conv_bias = mamba_conv_bias
        self.mamba_proj_bias = mamba_proj_bias
        self.head_dim = head_dim
        self.chunk_size = chunk_size
        self.n_groups = n_groups
        self.time_step_limit = time_step_limit
        self.time_step_min = time_step_min
        self.time_step_max = time_step_max
        self.time_step_floor = time_step_floor
        # Explicit defaults so that `layers_block_type` works even when these
        # values are not supplied via config.json (the shipped config sets 8 and 4).
        self.attn_layer_period = attn_layer_period
        self.attn_layer_offset = attn_layer_offset

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            output_router_logits=output_router_logits,
            **kwargs,
        )

    @property
    def layers_block_type(self):
        return [
            "attention" if i % self.attn_layer_period == self.attn_layer_offset else "mamba"
            for i in range(self.num_hidden_layers)
        ]
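The `"auto"` rule for `mamba_dt_rank` resolves to `math.ceil(hidden_size / 16)`; for this model's hidden size:

```python
import math

hidden_size = 4096  # from config.json
dt_rank = math.ceil(hidden_size / 16)  # what mamba_dt_rank="auto" would yield
print(dt_rank)  # 256
```

Note that the shipped `config.json` pins `"mamba_dt_rank": 64` explicitly, overriding the auto rule.
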
generation_config.json ADDED
{
  "_from_model_config": true,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "pad_token_id": 4,
  "transformers_version": "4.42.4",
  "use_cache": false
}
hybridna_tokenizer.py ADDED
from transformers import PreTrainedTokenizer
from typing import List, Optional, Union, Dict, Tuple
import numpy as np
import json
import os


class HybriDNATokenizer(PreTrainedTokenizer):
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self,
                 model_max_length: int,
                 bos_token="[BOS]",
                 eos_token="[SEP]",
                 sep_token="[SEP]",
                 cls_token="[CLS]",
                 pad_token="[PAD]",
                 mask_token="[MASK]",
                 unk_token="[UNK]",
                 **kwargs):
        """Character-level DNA tokenizer for Hugging Face transformers.

        The vocabulary is fixed: the characters A, C, G, T, N (ids 7-11) plus
        the following special tokens:
            "[CLS]": 0
            "[SEP]": 1
            "[BOS]": 2
            "[MASK]": 3
            "[PAD]": 4
            "[RESERVED]": 5
            "[UNK]": 6
        Any character outside this set is mapped to [UNK] (id 6).

        Args:
            model_max_length (int): Model maximum sequence length.
        """
        self.characters = ('A', 'C', 'G', 'T', 'N')
        self.model_max_length = model_max_length

        self._vocab_str_to_int = {
            "[CLS]": 0,
            "[SEP]": 1,
            "[BOS]": 2,
            "[MASK]": 3,
            "[PAD]": 4,
            "[RESERVED]": 5,
            "[UNK]": 6,
            **{ch: i + 7 for i, ch in enumerate(self.characters)},
        }
        self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
        self._bos_id = self._vocab_str_to_int["[BOS]"]
        self._eos_id = self._vocab_str_to_int["[SEP]"]
        self._pad_id = self._vocab_str_to_int["[PAD]"]
        self._unk_id = self._vocab_str_to_int["[UNK]"]
        self._bos_np = np.array([self._bos_id], dtype=np.uint16)
        self._eos_np = np.array([self._eos_id], dtype=np.uint16)
        # Byte-level lookup table: the ASCII codes of A/C/G/T/N map to their
        # token ids, every other byte maps to [UNK].
        self._numpy_lookup = np.full(256, self._unk_id, dtype=np.uint16)
        for ch in self.characters:
            self._numpy_lookup[ord(ch)] = self._vocab_str_to_int[ch]
        add_prefix_space = kwargs.pop("add_prefix_space", False)
        padding_side = kwargs.pop("padding_side", "left")

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            unk_token=unk_token,
            add_prefix_space=add_prefix_space,
            model_max_length=model_max_length,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return len(self._vocab_str_to_int)

    def _tokenize(self, text: str) -> List[str]:
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])

    def _convert_id_to_token(self, index: int) -> str:
        return self._vocab_int_to_str[index]

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        result = ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            result += ([0] * len(token_ids_1)) + [1]
        return result

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        result = bos + token_ids_0 + eos
        if token_ids_1 is not None:
            result += token_ids_1 + eos
        return result

    def create_attention_mask(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:
        """Creates an attention mask to differentiate between padding and non-padding tokens.

        Args:
            token_ids_0 (List[int]): List of token IDs for the first sequence.
            token_ids_1 (Optional[List[int]]): List of token IDs for the second sequence if available.

        Returns:
            List[int]: A list where 1 represents non-padding tokens and 0 represents padding tokens.
        """
        mask = [1] * len(token_ids_0)
        if token_ids_1 is not None:
            mask += [1] * len(token_ids_1)
        return mask

    def get_vocab(self) -> Dict[str, int]:
        return self._vocab_str_to_int

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple:
        vocab_file = os.path.join(save_directory, (filename_prefix or '') + 'vocab.json')
        with open(vocab_file, 'w') as f:
            json.dump(self._vocab_str_to_int, f)
        return (vocab_file,)

    def __call__(
        self,
        text: Union[str, List[str]],
        *,
        padding: Union[bool, str] = True,
        truncation: bool = True,
        max_length: Optional[int] = None,
        add_special_tokens: bool = True,
        return_tensors: Optional[str] = None,
    ):
        # ---------- detect batch vs single ----------
        is_batch = not isinstance(text, str)
        seqs = text if is_batch else [text]  # always work on a list internally
        max_len = max_length or self.model_max_length

        # ---------- encode every sequence ----------
        batch_input_ids = []
        for seq in seqs:
            seq_bytes = np.frombuffer(seq.encode("ascii", "ignore"), dtype=np.uint8)
            ids = self._numpy_lookup[seq_bytes]
            if add_special_tokens:
                ids = np.concatenate((self._bos_np, ids, self._eos_np))
            if truncation and ids.size > max_len:
                ids = ids[:max_len]
            batch_input_ids.append(ids.astype(np.uint16, copy=False))

        # ---------- pad ----------
        if padding and batch_input_ids:
            if padding == "max_length":
                pad_len = max_len
            else:  # True or "longest": pad to the longest sequence in the batch
                pad_len = max(ids.size for ids in batch_input_ids)
            pad_len = min(pad_len, max_len)
            padded_ids = []
            for ids in batch_input_ids:
                if ids.size < pad_len:
                    ids = np.pad(ids, (0, pad_len - ids.size), constant_values=self._pad_id)
                elif ids.size > pad_len:
                    ids = ids[:pad_len]
                padded_ids.append(np.ascontiguousarray(ids, dtype=np.uint16))
            batch_input_ids = padded_ids

        # ---------- masks ----------
        batch_attention = [
            (ids != self._pad_id).astype(np.uint8, copy=False) for ids in batch_input_ids
        ]

        if return_tensors == "pt":
            # NOTE: return_tensors="pt" is handled here so the README examples
            # work. torch has no uint16 dtype, so ids are widened to int64, and a
            # batch dimension is kept even for a single sequence (HF convention).
            # This assumes uniform lengths, i.e. padding was not disabled.
            import torch
            input_ids = torch.from_numpy(np.asarray(batch_input_ids, dtype=np.int64))
            attention_mask = torch.from_numpy(np.asarray(batch_attention, dtype=np.int64))
            return {"input_ids": input_ids, "attention_mask": attention_mask}

        # ---------- collapse back if it was a single example ----------
        if not is_batch:
            batch_input_ids = batch_input_ids[0]
            batch_attention = batch_attention[0]

        return {
            "input_ids": batch_input_ids,
            "attention_mask": batch_attention,
        }
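The fixed vocabulary above is small enough to spell out; a standalone sketch of the mapping and of how a sequence is framed with `[BOS]`/`[SEP]` (ids taken from the tokenizer source; note they match `bos_token_id: 2` and `eos_token_id: 1` in generation_config.json):

```python
# Vocabulary as defined in HybriDNATokenizer: 7 special tokens + 5 bases = 12 ids.
vocab = {
    "[CLS]": 0, "[SEP]": 1, "[BOS]": 2, "[MASK]": 3,
    "[PAD]": 4, "[RESERVED]": 5, "[UNK]": 6,
    "A": 7, "C": 8, "G": 9, "T": 10, "N": 11,
}

def encode(seq: str) -> list:
    """Character-level encoding with [BOS] ... [SEP] framing; unknowns -> [UNK]."""
    body = [vocab.get(ch, vocab["[UNK]"]) for ch in seq]
    return [vocab["[BOS]"]] + body + [vocab["[SEP]"]]

print(encode("ACGT"))  # [2, 7, 8, 9, 10, 1]
```
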
model-00001-of-00002.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:6bc123358e87c99d4b88ea170e3eaf6d05145a3c1160497ddf57fb08c91f1c9e
size 4956986656
model-00002-of-00002.safetensors ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:cf7cee3faa7bfab0d97a3498720e8185a3724f631401619a20ab1edb549b5b65
size 1281837808
model.safetensors.index.json ADDED
{
  "metadata": {
    "total_size": 6238801920
  },
  "weight_map": {
    "lm_head.weight": "model-00002-of-00002.safetensors",
    "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
    "model.final_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.0.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mamba.A_log": "model-00001-of-00002.safetensors",
    "model.layers.0.mamba.D": "model-00001-of-00002.safetensors",
    "model.layers.0.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
    "model.layers.0.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mamba.dt_bias": "model-00001-of-00002.safetensors",
    "model.layers.0.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mamba.norm.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mamba.A_log": "model-00001-of-00002.safetensors",
    "model.layers.1.mamba.D": "model-00001-of-00002.safetensors",
    "model.layers.1.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
    "model.layers.1.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mamba.dt_bias": "model-00001-of-00002.safetensors",
    "model.layers.1.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mamba.norm.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.1.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mamba.A_log": "model-00001-of-00002.safetensors",
    "model.layers.10.mamba.D": "model-00001-of-00002.safetensors",
    "model.layers.10.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
    "model.layers.10.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mamba.dt_bias": "model-00001-of-00002.safetensors",
    "model.layers.10.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mamba.norm.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.10.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mamba.A_log": "model-00001-of-00002.safetensors",
    "model.layers.11.mamba.D": "model-00001-of-00002.safetensors",
    "model.layers.11.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
    "model.layers.11.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mamba.dt_bias": "model-00001-of-00002.safetensors",
    "model.layers.11.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mamba.norm.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.11.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.feed_forward.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.12.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.12.pre_ff_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.feed_forward.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.13.feed_forward.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.13.feed_forward.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.13.mamba.A_log": "model-00002-of-00002.safetensors",
    "model.layers.13.mamba.D": "model-00002-of-00002.safetensors",
    "model.layers.13.mamba.conv1d.bias": "model-00002-of-00002.safetensors",
    "model.layers.13.mamba.conv1d.weight": "model-00002-of-00002.safetensors",
    "model.layers.13.mamba.dt_bias": "model-00002-of-00002.safetensors",
    "model.layers.13.mamba.in_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.13.mamba.norm.weight": "model-00002-of-00002.safetensors",
    "model.layers.13.mamba.out_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.13.pre_ff_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.14.feed_forward.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.14.feed_forward.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.14.feed_forward.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.14.mamba.A_log": "model-00002-of-00002.safetensors",
    "model.layers.14.mamba.D": "model-00002-of-00002.safetensors",
    "model.layers.14.mamba.conv1d.bias": "model-00002-of-00002.safetensors",
    "model.layers.14.mamba.conv1d.weight": "model-00002-of-00002.safetensors",
    "model.layers.14.mamba.dt_bias": "model-00002-of-00002.safetensors",
    "model.layers.14.mamba.in_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.14.mamba.norm.weight": "model-00002-of-00002.safetensors",
    "model.layers.14.mamba.out_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.14.pre_ff_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.15.feed_forward.down_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.15.feed_forward.gate_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.15.feed_forward.up_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.15.mamba.A_log": "model-00002-of-00002.safetensors",
    "model.layers.15.mamba.D": "model-00002-of-00002.safetensors",
    "model.layers.15.mamba.conv1d.bias": "model-00002-of-00002.safetensors",
    "model.layers.15.mamba.conv1d.weight": "model-00002-of-00002.safetensors",
    "model.layers.15.mamba.dt_bias": "model-00002-of-00002.safetensors",
    "model.layers.15.mamba.in_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.15.mamba.norm.weight": "model-00002-of-00002.safetensors",
    "model.layers.15.mamba.out_proj.weight": "model-00002-of-00002.safetensors",
    "model.layers.15.pre_ff_layernorm.weight": "model-00002-of-00002.safetensors",
    "model.layers.2.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mamba.A_log": "model-00001-of-00002.safetensors",
    "model.layers.2.mamba.D": "model-00001-of-00002.safetensors",
    "model.layers.2.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
    "model.layers.2.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mamba.dt_bias": "model-00001-of-00002.safetensors",
    "model.layers.2.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mamba.norm.weight": "model-00001-of-00002.safetensors",
    "model.layers.2.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.2.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.3.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.3.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
124
+ "model.layers.3.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.3.mamba.A_log": "model-00001-of-00002.safetensors",
127
+ "model.layers.3.mamba.D": "model-00001-of-00002.safetensors",
128
+ "model.layers.3.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
129
+ "model.layers.3.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.3.mamba.dt_bias": "model-00001-of-00002.safetensors",
131
+ "model.layers.3.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.3.mamba.norm.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.3.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.3.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.4.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.4.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.4.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.4.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.5.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.5.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.5.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.5.mamba.A_log": "model-00001-of-00002.safetensors",
149
+ "model.layers.5.mamba.D": "model-00001-of-00002.safetensors",
150
+ "model.layers.5.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
151
+ "model.layers.5.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.5.mamba.dt_bias": "model-00001-of-00002.safetensors",
153
+ "model.layers.5.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.5.mamba.norm.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.5.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.5.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.6.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.6.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.6.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
160
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.6.mamba.A_log": "model-00001-of-00002.safetensors",
162
+ "model.layers.6.mamba.D": "model-00001-of-00002.safetensors",
163
+ "model.layers.6.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
164
+ "model.layers.6.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.6.mamba.dt_bias": "model-00001-of-00002.safetensors",
166
+ "model.layers.6.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.6.mamba.norm.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.6.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.6.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.7.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.7.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
172
+ "model.layers.7.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
174
+ "model.layers.7.mamba.A_log": "model-00001-of-00002.safetensors",
175
+ "model.layers.7.mamba.D": "model-00001-of-00002.safetensors",
176
+ "model.layers.7.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
177
+ "model.layers.7.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.7.mamba.dt_bias": "model-00001-of-00002.safetensors",
179
+ "model.layers.7.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
180
+ "model.layers.7.mamba.norm.weight": "model-00001-of-00002.safetensors",
181
+ "model.layers.7.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
182
+ "model.layers.7.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.8.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
184
+ "model.layers.8.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.8.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
186
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.8.mamba.A_log": "model-00001-of-00002.safetensors",
188
+ "model.layers.8.mamba.D": "model-00001-of-00002.safetensors",
189
+ "model.layers.8.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
190
+ "model.layers.8.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
191
+ "model.layers.8.mamba.dt_bias": "model-00001-of-00002.safetensors",
192
+ "model.layers.8.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
193
+ "model.layers.8.mamba.norm.weight": "model-00001-of-00002.safetensors",
194
+ "model.layers.8.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
195
+ "model.layers.8.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors",
196
+ "model.layers.9.feed_forward.down_proj.weight": "model-00001-of-00002.safetensors",
197
+ "model.layers.9.feed_forward.gate_proj.weight": "model-00001-of-00002.safetensors",
198
+ "model.layers.9.feed_forward.up_proj.weight": "model-00001-of-00002.safetensors",
199
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
200
+ "model.layers.9.mamba.A_log": "model-00001-of-00002.safetensors",
201
+ "model.layers.9.mamba.D": "model-00001-of-00002.safetensors",
202
+ "model.layers.9.mamba.conv1d.bias": "model-00001-of-00002.safetensors",
203
+ "model.layers.9.mamba.conv1d.weight": "model-00001-of-00002.safetensors",
204
+ "model.layers.9.mamba.dt_bias": "model-00001-of-00002.safetensors",
205
+ "model.layers.9.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
206
+ "model.layers.9.mamba.norm.weight": "model-00001-of-00002.safetensors",
207
+ "model.layers.9.mamba.out_proj.weight": "model-00001-of-00002.safetensors",
208
+ "model.layers.9.pre_ff_layernorm.weight": "model-00001-of-00002.safetensors"
209
+ }
210
+ }
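The `weight_map` above pairs every tensor name with the shard file that stores it, and the parameter names also expose the hybrid layout: attention layers carry `self_attn.*` weights (layer 4 here), while Mamba layers carry `mamba.*` weights. A minimal sketch of reading such an index — the small dictionary below is a hypothetical excerpt standing in for the full map:

```python
# Hypothetical excerpt of the "weight_map" from the safetensors index;
# the real file maps every tensor name to its shard.
weight_map = {
    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.5.mamba.in_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.13.mamba.in_proj.weight": "model-00002-of-00002.safetensors",
}

def layer_types(weight_map):
    """Classify each layer as 'attention' or 'mamba' from its parameter names."""
    types = {}
    for name in weight_map:
        parts = name.split(".")
        if parts[:2] == ["model", "layers"]:
            idx = int(parts[2])
            if ".self_attn." in name:
                types[idx] = "attention"
            elif ".mamba." in name:
                types[idx] = "mamba"
    return types

def shard_contents(weight_map):
    """Group tensor names by the shard file that stores them."""
    shards = {}
    for name, shard in weight_map.items():
        shards.setdefault(shard, []).append(name)
    return shards
```

`layer_types` on the excerpt yields `{4: "attention", 5: "mamba", 13: "mamba"}`, matching the alternating Mamba/Attention pattern described in the model card.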
modeling_hybridna.py ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "bos_token": {
+ "content": "[BOS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "cls_token": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "mask_token": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "sep_token": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,70 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "[CLS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "[SEP]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "[BOS]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "[MASK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "4": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "6": {
+ "content": "[UNK]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "auto_map": {
+ "AutoTokenizer": [
+ "hybridna_tokenizer.HybriDNATokenizer",
+ null
+ ]
+ },
+ "bos_token": "[BOS]",
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "[CLS]",
+ "eos_token": "[SEP]",
+ "mask_token": "[MASK]",
+ "model_max_length": 131202,
+ "pad_token": "[PAD]",
+ "padding_side": "left",
+ "sep_token": "[SEP]",
+ "tokenizer_class": "HybriDNATokenizer",
+ "unk_token": "[UNK]"
+ }
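The config sets `"padding_side": "left"`, so batched prompts are padded at the front with `[PAD]` (id 4 in the vocab) up to the length of the longest sequence — the usual choice for decoder-style generation, since it keeps the real sequence ends aligned at the final positions. A minimal sketch of what that padding produces (the helper name is illustrative, not part of the tokenizer's API):

```python
PAD_ID = 4  # "[PAD]" in vocab.json

def pad_batch_left(sequences, pad_id=PAD_ID):
    """Left-pad a list of token-id lists to the length of the longest one."""
    width = max(len(seq) for seq in sequences)
    return [[pad_id] * (width - len(seq)) + seq for seq in sequences]
```

For example, `pad_batch_left([[7, 8], [7, 8, 9, 10]])` prepends two pad ids to the shorter sequence, yielding `[[4, 4, 7, 8], [7, 8, 9, 10]]`.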
vocab.json ADDED
@@ -0,0 +1 @@
+ {"[CLS]": 0, "[SEP]": 1, "[BOS]": 2, "[MASK]": 3, "[PAD]": 4, "[RESERVED]": 5, "[UNK]": 6, "A": 7, "C": 8, "G": 9, "T": 10, "N": 11}
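The vocabulary is character-level: each nucleotide (A, C, G, T, plus the ambiguity code N) maps to a single token id, with seven special tokens occupying ids 0–6. A hedged sketch of that encoding — whether the actual `HybriDNATokenizer` prepends `[BOS]` is an assumption here, so it is exposed as a flag:

```python
# Vocabulary copied from vocab.json above.
VOCAB = {"[CLS]": 0, "[SEP]": 1, "[BOS]": 2, "[MASK]": 3, "[PAD]": 4,
         "[RESERVED]": 5, "[UNK]": 6,
         "A": 7, "C": 8, "G": 9, "T": 10, "N": 11}
UNK_ID = VOCAB["[UNK]"]

def encode(seq, add_bos=True):
    """Character-level encoding: one token per base; unknown characters map to [UNK]."""
    ids = [VOCAB.get(base, UNK_ID) for base in seq.upper()]
    return ([VOCAB["[BOS]"]] if add_bos else []) + ids
```

With this mapping, `encode("ACGTN")` gives `[2, 7, 8, 9, 10, 11]`, and any character outside the DNA alphabet falls back to id 6 (`[UNK]`).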