pszmk committed on
Commit
8b6cdbf
·
verified ·
1 Parent(s): 2628c81

Upload protein amino-acid fast tokenizer

Browse files
Files changed (4) hide show
  1. README.md +66 -0
  2. special_tokens_map.json +8 -0
  3. tokenizer.json +190 -0
  4. tokenizer_config.json +66 -0
README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ tags:
5
+ - protein
6
+ - amino-acid
7
+ - tokenizer
8
+ - biology
9
+ license: mit
10
+ library_name: transformers
11
+ ---
12
+
13
+ # Protein Amino-Acid Fast Tokenizer
14
+
15
+ Fast Rust-backed tokenizer for protein sequences.
16
+
17
+ ## Features
18
+
19
+ - **1 token = 1 amino acid** — character-level tokenization
20
+ - **Fast Rust backend** — efficient processing via HuggingFace Tokenizers
21
+ - **Transformer-ready** — compatible with `AutoTokenizer`
22
+
23
+ ## Usage
24
+
25
+ ```python
26
+ from transformers import AutoTokenizer
27
+
28
+ tokenizer = AutoTokenizer.from_pretrained("pszmk/protein-aa-fast-tokenizer")
29
+
30
+ # Single sequence
31
+ tokens = tokenizer("MKTLLILAVAVCSAA")
32
+ print(tokens)
33
+ # {'input_ids': [2, 16, 14, ...], 'attention_mask': [1, 1, ...]}
34
+
35
+ # Batch with padding
36
+ batch = tokenizer(
37
+ ["MKTLLILAVAVCSAA", "ACDEFGHIK"],
38
+ padding=True,
39
+ return_tensors="pt",
40
+ )
41
+ ```
42
+
43
+ ## Vocabulary
44
+
45
+ | ID | Token | Description |
46
+ |----|-------|-------------|
47
+ | 0 | `<PAD>` | Padding |
48
+ | 1 | `<MASK>` | Masked token |
49
+ | 2 | `<CLS>` | Classification / Start |
50
+ | 3 | `<SEP>` | Separator |
51
+ | 4 | `<EOS>` | End of sequence |
52
+ | 5 | `<UNK>` | Unknown |
53
+ | 6-25 | A-Y | Standard amino acids |
54
+ | 26 | X | Any amino acid |
55
+ | 27 | B | Asparagine or Aspartic acid |
56
+ | 28 | Z | Glutamine or Glutamic acid |
57
+
58
+ ## Template Processing
59
+
60
+ - **Single sequence:** `<CLS> SEQUENCE <EOS>`
61
+ - **Pair sequences:** `<CLS> SEQ_A <SEP> SEQ_B <EOS>`
62
+
63
+ ## Citation
64
+
65
+ Part of the LAMP (Latent Anti-Microbial Peptides) project.
66
+
special_tokens_map.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "<CLS>",
3
+ "eos_token": "<EOS>",
4
+ "mask_token": "<MASK>",
5
+ "pad_token": "<PAD>",
6
+ "sep_token": "<SEP>",
7
+ "unk_token": "<UNK>"
8
+ }
tokenizer.json ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<PAD>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<MASK>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<CLS>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<SEP>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<EOS>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "<UNK>",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ }
60
+ ],
61
+ "normalizer": null,
62
+ "pre_tokenizer": {
63
+ "type": "Split",
64
+ "pattern": {
65
+ "String": ""
66
+ },
67
+ "behavior": "Isolated",
68
+ "invert": false
69
+ },
70
+ "post_processor": {
71
+ "type": "TemplateProcessing",
72
+ "single": [
73
+ {
74
+ "SpecialToken": {
75
+ "id": "<CLS>",
76
+ "type_id": 0
77
+ }
78
+ },
79
+ {
80
+ "Sequence": {
81
+ "id": "A",
82
+ "type_id": 0
83
+ }
84
+ },
85
+ {
86
+ "SpecialToken": {
87
+ "id": "<EOS>",
88
+ "type_id": 0
89
+ }
90
+ }
91
+ ],
92
+ "pair": [
93
+ {
94
+ "SpecialToken": {
95
+ "id": "<CLS>",
96
+ "type_id": 0
97
+ }
98
+ },
99
+ {
100
+ "Sequence": {
101
+ "id": "A",
102
+ "type_id": 0
103
+ }
104
+ },
105
+ {
106
+ "SpecialToken": {
107
+ "id": "<SEP>",
108
+ "type_id": 0
109
+ }
110
+ },
111
+ {
112
+ "Sequence": {
113
+ "id": "B",
114
+ "type_id": 0
115
+ }
116
+ },
117
+ {
118
+ "SpecialToken": {
119
+ "id": "<EOS>",
120
+ "type_id": 0
121
+ }
122
+ }
123
+ ],
124
+ "special_tokens": {
125
+ "<CLS>": {
126
+ "id": "<CLS>",
127
+ "ids": [
128
+ 2
129
+ ],
130
+ "tokens": [
131
+ "<CLS>"
132
+ ]
133
+ },
134
+ "<EOS>": {
135
+ "id": "<EOS>",
136
+ "ids": [
137
+ 4
138
+ ],
139
+ "tokens": [
140
+ "<EOS>"
141
+ ]
142
+ },
143
+ "<SEP>": {
144
+ "id": "<SEP>",
145
+ "ids": [
146
+ 3
147
+ ],
148
+ "tokens": [
149
+ "<SEP>"
150
+ ]
151
+ }
152
+ }
153
+ },
154
+ "decoder": null,
155
+ "model": {
156
+ "type": "WordLevel",
157
+ "vocab": {
158
+ "<PAD>": 0,
159
+ "<MASK>": 1,
160
+ "<CLS>": 2,
161
+ "<SEP>": 3,
162
+ "<EOS>": 4,
163
+ "<UNK>": 5,
164
+ "A": 6,
165
+ "C": 7,
166
+ "D": 8,
167
+ "E": 9,
168
+ "F": 10,
169
+ "G": 11,
170
+ "H": 12,
171
+ "I": 13,
172
+ "K": 14,
173
+ "L": 15,
174
+ "M": 16,
175
+ "N": 17,
176
+ "P": 18,
177
+ "Q": 19,
178
+ "R": 20,
179
+ "S": 21,
180
+ "T": 22,
181
+ "V": 23,
182
+ "W": 24,
183
+ "Y": 25,
184
+ "X": 26,
185
+ "B": 27,
186
+ "Z": 28
187
+ },
188
+ "unk_token": "<UNK>"
189
+ }
190
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<PAD>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<MASK>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<CLS>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<SEP>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<EOS>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<UNK>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "clean_up_tokenization_spaces": false,
53
+ "cls_token": "<CLS>",
54
+ "eos_token": "<EOS>",
55
+ "extra_special_tokens": {},
56
+ "mask_token": "<MASK>",
57
+ "model_input_names": [
58
+ "input_ids",
59
+ "attention_mask"
60
+ ],
61
+ "model_max_length": 1000000000000000019884624838656,
62
+ "pad_token": "<PAD>",
63
+ "sep_token": "<SEP>",
64
+ "tokenizer_class": "PreTrainedTokenizerFast",
65
+ "unk_token": "<UNK>"
66
+ }