Upload 9 files

Browse files

Files changed (9) hide show

__init__.py +0 -0
clean_split_sequence.py +11 -0
config.json +32 -0
model.safetensors +3 -0
special_tokens_map.json +37 -0
species_token_type.py +0 -0
synonomous_codons.py +23 -0
tokenizer.json +199 -0
tokenizer_config.json +60 -0

__init__.py ADDED Viewed

File without changes

clean_split_sequence.py ADDED Viewed

	@@ -0,0 +1,11 @@

def clean_split_sequence(seq: str) -> str:
    """Normalize a nucleotide sequence and split it into space-separated codons.

    The sequence is stripped of all whitespace, upper-cased, and converted
    from RNA to DNA notation (``U`` -> ``T``). It is then cut into chunks of
    three characters, starting at the first base, and the chunks are joined
    with single spaces.

    Whitespace is ignored so that line-wrapped input (e.g. sequence text
    copied from a FASTA record) and the function's own output are accepted;
    calling the function on its own result returns that result unchanged.

    Args:
        seq: Nucleotide sequence. Case-insensitive; may contain A, T, G, C,
            U and arbitrary whitespace.

    Returns:
        The cleaned sequence as codons separated by single spaces, e.g.
        ``"ATG GCC TAA"``. An empty (or all-whitespace) input yields ``""``.
        If the cleaned length is not a multiple of three, the final chunk
        holds the remaining one or two bases.

    Raises:
        ValueError: If the sequence contains any non-whitespace character
            other than A, T, G, C or U. The message names the first
            offending character (after upper-casing).
    """
    # Remove every whitespace character before validating, so wrapped or
    # pre-split input is not rejected as "invalid".
    seq = "".join(seq.split()).upper().replace('U', 'T')

    # lstrip() consumes the leading run of valid bases; whatever is left
    # begins with the first invalid character in the sequence.
    remainder = seq.lstrip("ATGC")
    if remainder:
        raise ValueError(f"Invalid character '{remainder[0]}' found in sequence. Only A, T, G, C, and U are allowed.")

    return " ".join(seq[i:i + 3] for i in range(0, len(seq), 3))

config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "architectures": [
+    "SynCodonLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu_new",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-07,
+  "legacy": true,
+  "max_position_embeddings": 1024,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 768,
+  "pos_att_type": [
+    "p2c",
+    "c2p"
+  ],
+  "position_biased_input": true,
+  "relative_attention": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "type_vocab_size": 501,
+  "vocab_size": 69
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f2462757de28e7dc0881e013e2a6d0aa4fe75ce005133f1a9d67fb011904fc2
+size 410508700

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "bos_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

species_token_type.py ADDED Viewed

The diff for this file is too large to render. See raw diff

synonomous_codons.py ADDED Viewed

	@@ -0,0 +1,23 @@

# Lookup table from a one-letter residue code to the list of DNA codons
# (T, not U) grouped under it. "*" collects the stop codons. The 64 codons
# appear exactly once each; key order and per-key codon order are fixed by
# the rows below.
synonymous_codons = {
    residue: codon_row.split()
    for residue, codon_row in (
        ("A", "GCT GCC GCA GCG"),
        ("C", "TGT TGC"),
        ("D", "GAT GAC"),
        ("E", "GAA GAG"),
        ("F", "TTT TTC"),
        ("G", "GGT GGC GGA GGG"),
        ("H", "CAT CAC"),
        ("I", "ATT ATC ATA"),
        ("K", "AAA AAG"),
        ("L", "TTA TTG CTT CTC CTA CTG"),
        ("M", "ATG"),
        ("N", "AAT AAC"),
        ("P", "CCT CCC CCA CCG"),
        ("Q", "CAA CAG"),
        ("R", "CGT CGC CGA CGG AGA AGG"),
        ("S", "TCT TCC TCA TCG AGT AGC"),
        ("T", "ACT ACC ACA ACG"),
        ("V", "GTT GTC GTA GTG"),
        ("W", "TGG"),
        ("Y", "TAT TAC"),
        ("*", "TAA TAG TGA"),
    )
}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,199 @@

+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "[PAD]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "[CLS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "[SEP]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "[UNK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 4,
+      "content": "[MASK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Whitespace"
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "[CLS]",
+          "type_id": 500
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 500
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 500
+        }
+      }
+    ],
+    "pair": [
+      {
+        "SpecialToken": {
+          "id": "[CLS]",
+          "type_id": 500
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 500
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 500
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 500
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[SEP]",
+          "type_id": 500
+        }
+      }
+    ],
+    "special_tokens": {
+      "[CLS]": {
+        "id": "[CLS]",
+        "ids": [1],
+        "tokens": ["[CLS]"]
+      },
+      "[SEP]": {
+        "id": "[SEP]",
+        "ids": [2],
+        "tokens": ["[SEP]"]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {
+      "[PAD]": 0,
+      "[CLS]": 1,
+      "[SEP]": 2,
+      "[UNK]": 3,
+      "[MASK]": 4,
+      "GCT": 5,
+      "GCC": 6,
+      "GCA": 7,
+      "GCG": 8,
+      "TGT": 9,
+      "TGC": 10,
+      "GAT": 11,
+      "GAC": 12,
+      "GAA": 13,
+      "GAG": 14,
+      "TTT": 15,
+      "TTC": 16,
+      "GGT": 17,
+      "GGC": 18,
+      "GGA": 19,
+      "GGG": 20,
+      "CAT": 21,
+      "CAC": 22,
+      "ATT": 23,
+      "ATC": 24,
+      "ATA": 25,
+      "AAA": 26,
+      "AAG": 27,
+      "TTA": 28,
+      "TTG": 29,
+      "CTT": 30,
+      "CTC": 31,
+      "CTA": 32,
+      "CTG": 33,
+      "ATG": 34,
+      "AAT": 35,
+      "AAC": 36,
+      "CCT": 37,
+      "CCC": 38,
+      "CCA": 39,
+      "CCG": 40,
+      "CAA": 41,
+      "CAG": 42,
+      "CGT": 43,
+      "CGC": 44,
+      "CGA": 45,
+      "CGG": 46,
+      "AGA": 47,
+      "AGG": 48,
+      "TCT": 49,
+      "TCC": 50,
+      "TCA": 51,
+      "TCG": 52,
+      "AGT": 53,
+      "AGC": 54,
+      "ACT": 55,
+      "ACC": 56,
+      "ACA": 57,
+      "ACG": 58,
+      "GTT": 59,
+      "GTC": 60,
+      "GTA": 61,
+      "GTG": 62,
+      "TGG": 63,
+      "TAT": 64,
+      "TAC": 65,
+      "TAA": 66,
+      "TAG": 67,
+      "TGA": 68
+    },
+    "unk_token": "[UNK]"
+  }
+}

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,60 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "[SEP]",
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "max_length": 1024,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_to_multiple_of": null,
+  "pad_token": "[PAD]",
+  "pad_token_type_id": 500,
+  "padding_side": "right",
+  "stride": 0,
+  "tokenizer_class": "PreTrainedTokenizerFast",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "[UNK]"
+}