FreakingPotato committed
Commit b77d27f · verified · 1 Parent(s): 5249136

Upload tokenizer.py with huggingface_hub

Files changed (1)
  1. tokenizer.py +221 -0
tokenizer.py ADDED
@@ -0,0 +1,221 @@
+ from typing import List, Dict, Optional, Union, Any, Tuple
+ import os
+ from transformers import PreTrainedTokenizer
+ from itertools import product
+ import json
+
+ class NucEL_Tokenizer(PreTrainedTokenizer):
+     """
+     K-mer tokenizer for DNA sequences, inheriting from Hugging Face's PreTrainedTokenizer.
+     Handles k-mer tokenization with support for special tokens, padding, and truncation.
+     """
+
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         k: int = 6,
+         model_max_length: int = 2048,
+         pad_token: str = "[PAD]",
+         unk_token: str = "[UNK]",
+         sep_token: str = "[SEP]",
+         cls_token: str = "[CLS]",
+         mask_token: str = "[MASK]",
+         bos_token: str = "[BOS]",
+         eos_token: str = "[EOS]",
+         num_reserved_tokens: int = 16,
+         **kwargs
+     ):
+         """Initialize the k-mer tokenizer."""
+         self.k = k
+         self.nucleotides = ['A', 'C', 'G', 'T']
+         self.num_reserved_tokens = num_reserved_tokens
+
+         # Define special tokens
+         self.special_tokens = {
+             "pad_token": pad_token,
+             "unk_token": unk_token,
+             "sep_token": sep_token,
+             "cls_token": cls_token,
+             "mask_token": mask_token,
+             "bos_token": bos_token,
+             "eos_token": eos_token,
+         }
+
+         # Build vocabulary (includes special tokens, nucleotides, and k-mers)
+         self._init_vocabulary()
+
+ # Now initialize the parent class.
49
+ super().__init__(
50
+ model_max_length=model_max_length,
51
+ pad_token=pad_token,
52
+ unk_token=unk_token,
53
+ sep_token=sep_token,
54
+ cls_token=cls_token,
55
+ mask_token=mask_token,
56
+ bos_token=bos_token,
57
+ eos_token=eos_token,
58
+ **kwargs
59
+ )
60
+
+     def _init_vocabulary(self):
+         """Initialize the vocabulary with special tokens, nucleotides, and k-mers."""
+         # Get special tokens in a specific order
+         special_tokens = [
+             self.special_tokens["pad_token"],
+             self.special_tokens["unk_token"],
+             self.special_tokens["cls_token"],
+             self.special_tokens["sep_token"],
+             self.special_tokens["mask_token"],
+             self.special_tokens["bos_token"],
+             self.special_tokens["eos_token"]
+         ]
+
+         # Add individual nucleotides
+         nucleotides = self.nucleotides
+
+         # Generate all possible k-mers
+         kmers = [''.join(p) for p in product(self.nucleotides, repeat=self.k)]
+
+         # Add reserved tokens for future use
+         reserved_tokens = [f"[RESERVED_{i}]" for i in range(self.num_reserved_tokens)]
+
+         # Combine all tokens in a specific order
+         all_tokens = special_tokens + nucleotides + kmers + reserved_tokens
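+         # Size check for the default k=6: 7 special + 4 bases + 4**6 = 4096
+         # k-mers + 16 reserved = 4123 tokens in total.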
+
+         # Create vocabulary: token -> index
+         self.vocab = {}
+         for idx, token in enumerate(all_tokens):
+             self.vocab[token] = idx
+
+         # Create reverse mapping: index -> token
+         self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
+
+     @property
+     def vocab_size(self) -> int:
+         """Return the size of the vocabulary."""
+         return len(self.vocab)
+
+     def get_vocab(self) -> Dict[str, int]:
+         """Return the vocabulary dictionary."""
+         return self.vocab.copy()
+
+     def _tokenize(self, text: str) -> List[str]:
+         """
+         Tokenize a DNA sequence into k-mers and individual nucleotides.
+
+         Args:
+             text: DNA sequence to tokenize
+
+         Returns:
+             List of tokens.
+         """
+         text = text.upper().strip()
+         tokens = [self.cls_token]
+         i = 0
+
+         while i < len(text):
+             # Try to get a k-mer
+             if i <= len(text) - self.k:
+                 kmer = text[i:i+self.k]
+                 if kmer in self.vocab:
+                     tokens.append(kmer)
+                     i += self.k
+                     continue
+
+             # Fallback: tokenize a single nucleotide
+             if i < len(text):
+                 nucleotide = text[i]
+                 if nucleotide in self.nucleotides:
+                     tokens.append(nucleotide)
+                 else:
+                     tokens.append(self.unk_token)
+                 i += 1
+
+         return tokens
+
+     def _convert_token_to_id(self, token: str) -> int:
+         """Convert a token to its ID in the vocabulary."""
+         return self.vocab.get(token, self.vocab[self.unk_token])
+
+     def _convert_id_to_token(self, index: int) -> str:
+         """Convert an ID to its token in the vocabulary."""
+         return self.ids_to_tokens.get(index, self.unk_token)
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """Save the tokenizer vocabulary to a directory."""
+         if not filename_prefix:
+             filename_prefix = "vocab"
+
+         vocab_file = os.path.join(save_directory, f"{filename_prefix}.json")
+
+         with open(vocab_file, 'w', encoding='utf-8') as f:
+             json.dump(self.vocab, f, ensure_ascii=False, indent=2)
+
+         return (vocab_file,)
+
+     def save_pretrained(self, save_directory: str, legacy_format: bool = True, filename_prefix: Optional[str] = None, **kwargs):
+         """
+         Save the tokenizer configuration and vocabulary.
+         """
+         # Save the vocabulary
+         vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
+
+         # Save the config
+         config = {
+             'k': self.k,
+             'model_max_length': self.model_max_length,
+             'padding_side': self.padding_side,
+             'truncation_side': self.truncation_side,
+             'special_tokens': {
+                 'pad_token': self.pad_token,
+                 'unk_token': self.unk_token,
+                 'sep_token': self.sep_token,
+                 'cls_token': self.cls_token,
+                 'mask_token': self.mask_token,
+                 'bos_token': self.bos_token,
+                 'eos_token': self.eos_token,
+             }
+         }
+
+         super().save_pretrained(save_directory, config=config, legacy_format=legacy_format, **kwargs)
+
+         return vocab_files
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
+         """
+         Load a tokenizer from a pretrained model.
+         """
+         # Load the tokenizer configuration
+         config_file = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
+         with open(config_file, 'r', encoding='utf-8') as f:
+             config = json.load(f)
+
+         # Load the vocabulary
+         vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
+         with open(vocab_file, 'r', encoding='utf-8') as f:
+             vocab = json.load(f)
+
+         # Extract k from config (add it to your tokenizer_config.json if not present)
+         k = config.get('k', 6)
+
+         # Create tokenizer instance - tokens are at top level in tokenizer_config.json
+         tokenizer = cls(
+             k=k,
+             model_max_length=config.get('model_max_length', 2048),
+             pad_token=config.get('pad_token', '[PAD]'),
+             unk_token=config.get('unk_token', '[UNK]'),
+             sep_token=config.get('sep_token', '[SEP]'),
+             cls_token=config.get('cls_token', '[CLS]'),
+             mask_token=config.get('mask_token', '[MASK]'),
+             bos_token=config.get('bos_token', '[BOS]'),
+             eos_token=config.get('eos_token', '[EOS]'),
+             **kwargs
+         )
+
+         # Override the vocabulary with the saved one
+         tokenizer.vocab = vocab
+         tokenizer.ids_to_tokens = {idx: token for token, idx in vocab.items()}
+
+         return tokenizer
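
A minimal usage sketch for the tokenizer above, assuming the default k=6 and a recent transformers version (the sequence and the local path are only illustrative):

    tokenizer = NucEL_Tokenizer(k=6)
    print(tokenizer.vocab_size)      # 4123 = 7 special + 4 bases + 4096 k-mers + 16 reserved

    seq = "ACGTACGTAC"
    print(tokenizer.tokenize(seq))   # ['[CLS]', 'ACGTAC', 'G', 'T', 'A', 'C']
    enc = tokenizer(seq, padding="max_length", max_length=16)
    print(enc["input_ids"])          # token ids, right-padded with the [PAD] id
    print(enc["attention_mask"])     # 1 for real tokens, 0 for padding

    tokenizer.save_pretrained("./nucel_tokenizer")   # writes vocab.json and tokenizer_config.json
    reloaded = NucEL_Tokenizer.from_pretrained("./nucel_tokenizer")
    assert reloaded.get_vocab() == tokenizer.get_vocab()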