loubnabnl HF Staff committed on
Commit
401b752
·
verified ·
1 Parent(s): c3a251e

Promote hybrid step-286000 to main (300B CE + 300B FNS, total 600B tokens)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
chat_template.jinja ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
27
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
28
+ {%- elif message.role == "assistant" %}
29
+ {%- set content = message.content %}
30
+ {%- set reasoning_content = '' %}
31
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
32
+ {%- set reasoning_content = message.reasoning_content %}
33
+ {%- else %}
34
+ {%- if '</think>' in message.content %}
35
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
36
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
+ {%- endif %}
38
+ {%- endif %}
39
+ {%- if loop.index0 > ns.last_query_index %}
40
+ {%- if loop.last or (not loop.last and reasoning_content) %}
41
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
42
+ {%- else %}
43
+ {{- '<|im_start|>' + message.role + '\n' + content }}
44
+ {%- endif %}
45
+ {%- else %}
46
+ {{- '<|im_start|>' + message.role + '\n' + content }}
47
+ {%- endif %}
48
+ {%- if message.tool_calls %}
49
+ {%- for tool_call in message.tool_calls %}
50
+ {%- if (loop.first and content) or (not loop.first) %}
51
+ {{- '\n' }}
52
+ {%- endif %}
53
+ {%- if tool_call.function %}
54
+ {%- set tool_call = tool_call.function %}
55
+ {%- endif %}
56
+ {{- '<tool_call>\n{"name": "' }}
57
+ {{- tool_call.name }}
58
+ {{- '", "arguments": ' }}
59
+ {%- if tool_call.arguments is string %}
60
+ {{- tool_call.arguments }}
61
+ {%- else %}
62
+ {{- tool_call.arguments | tojson }}
63
+ {%- endif %}
64
+ {{- '}\n</tool_call>' }}
65
+ {%- endfor %}
66
+ {%- endif %}
67
+ {{- '<|im_end|>\n' }}
68
+ {%- elif message.role == "tool" %}
69
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
70
+ {{- '<|im_start|>user' }}
71
+ {%- endif %}
72
+ {{- '\n<tool_response>\n' }}
73
+ {{- message.content }}
74
+ {{- '\n</tool_response>' }}
75
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
76
+ {{- '<|im_end|>\n' }}
77
+ {%- endif %}
78
+ {%- endif %}
79
+ {%- endfor %}
80
+ {%- if add_generation_prompt %}
81
+ {{- '<|im_start|>assistant\n' }}
82
+ {%- if enable_thinking is defined and enable_thinking is false %}
83
+ {{- '<think>\n\n</think>\n\n' }}
84
+ {%- endif %}
85
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "float32",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "max_position_embeddings": 8192,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 28,
20
+ "num_key_value_heads": 8,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-06,
23
+ "rope_scaling": null,
24
+ "rope_theta": 500000.0,
25
+ "tie_word_embeddings": true,
26
+ "transformers_version": "4.57.6",
27
+ "use_cache": true,
28
+ "vocab_size": 155776
29
+ }
dna_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "k": 6,
3
+ "dna_start_id": 151669,
4
+ "dna_vocab_size": 4107,
5
+ "dna_special_tokens": [
6
+ "<dna>",
7
+ "</dna>",
8
+ "<oov>"
9
+ ]
10
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.57.6"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e257506988203fdb8bb46976ee81c97e24f29073754bbff70137c7704dbadaa8
3
+ size 1023817968
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
tokenizer.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HybridDNATokenizer: Combines Qwen3 BPE tokenization with DNA 6-mer tokenization.
3
+
4
+ DNA sequences wrapped in <dna>...</dna> tags are tokenized as 6-mers.
5
+ All other text uses Qwen3's BPE tokenization.
6
+
7
+ Supports token_mask for Fine-grained Nucleotide Supervision (FNS):
8
+ -2: padding token
9
+ -1: text token (BPE)
10
+ 0: DNA special token (<dna>, </dna>, <oov>)
11
+ 1-5: partial 6-mer (number of valid bases)
12
+ 6: full 6-mer
13
+ """
14
+
15
+ import os
16
+ import json
17
+ import itertools
18
+ from typing import List, Optional, Tuple, Dict, Union, Any
19
+
20
+ from transformers import PreTrainedTokenizer, AutoTokenizer, BatchEncoding
21
+
22
+
23
class HybridDNATokenizer(PreTrainedTokenizer):
    """
    Hybrid tokenizer combining Qwen3 BPE with DNA k-mer (default 6-mer) tokenization.

    DNA regions must be wrapped in <dna>...</dna> tags to be tokenized as k-mers.
    Without tags, DNA sequences are tokenized as regular BPE text.

    The DNA vocabulary (special tokens, all k-mers over A/T/C/G and a few
    ``<unused_i>`` filler tokens) is appended after the base vocabulary, and the
    combined size is rounded up to a multiple of 128.
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        base_tokenizer_path: Optional[str] = None,
        k: int = 6,
        **kwargs
    ):
        """
        Build the hybrid tokenizer.

        Args:
            base_tokenizer_path: Accepted for interface compatibility (it is what
                ``from_pretrained`` passes in).
            k: Length of the DNA k-mers.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``; ``eos_token``
                and ``pad_token`` are consumed here first.
        """
        self.k = k

        # Load base tokenizer (Qwen3-4B-Base).
        # NOTE(review): `base_tokenizer_path` is not used; the base tokenizer is
        # always loaded from this hard-coded Hub id. Confirm this is intended
        # before relying on a locally modified base vocabulary.
        self._base_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B-Base")

        # Get base vocabulary
        self._base_vocab = self._base_tokenizer.get_vocab()
        self._base_vocab_size = len(self._base_vocab)

        # Initialize DNA vocabulary
        self._init_dna_vocab()

        # Build combined vocabulary (must exist before the parent constructor
        # runs, since it is what get_vocab()/vocab_size expose).
        self._build_combined_vocab()

        # Set special tokens
        self._eos_token = kwargs.pop('eos_token', None) or "<|endoftext|>"
        self._pad_token = kwargs.pop('pad_token', None) or self._base_tokenizer.pad_token or "<|endoftext|>"

        # Initialize parent class
        super().__init__(
            eos_token=self._eos_token,
            pad_token=self._pad_token,
            **kwargs
        )

        self.special_tokens = self.dna_special_tokens + [self._eos_token, self._pad_token]

    def _init_dna_vocab(self):
        """Initialize DNA vocabulary (special tokens + k-mers + padding for 128 alignment)."""
        bases = ['A', 'T', 'C', 'G']

        # DNA special tokens
        self.dna_special_tokens = ["<dna>", "</dna>", "<oov>"]

        # Generate all k-mer combinations (4^k = 4096 for k=6)
        self.kmers = [''.join(kmer) for kmer in itertools.product(bases, repeat=self.k)]

        # DNA tokens start after base vocabulary
        self.dna_start_id = self._base_vocab_size

        # All DNA tokens get new IDs (no reuse of base vocab IDs, even for
        # overlapping tokens like CCCCCC — they have different semantics in
        # DNA context vs BPE context, per Qiuyi's recommendation)
        base_dna_tokens = self.dna_special_tokens + self.kmers

        # Calculate padding for 128 alignment
        total_vocab_unpadded = self._base_vocab_size + len(base_dna_tokens)
        target_vocab_size = ((total_vocab_unpadded + 127) // 128) * 128
        num_padding_tokens = target_vocab_size - total_vocab_unpadded

        # Add unused padding tokens
        self.padding_tokens = [f"<unused_{i}>" for i in range(num_padding_tokens)]

        # Create DNA token mappings — all get sequential new IDs, real DNA
        # tokens first, then the <unused_i> fillers.
        self.dna_token_to_id = {}
        self.dna_id_to_token = {}

        current_id = self.dna_start_id
        for token in base_dna_tokens + self.padding_tokens:
            self.dna_token_to_id[token] = current_id
            self.dna_id_to_token[current_id] = token
            current_id += 1

        self.dna_vocab_size = len(base_dna_tokens) + len(self.padding_tokens)

        # Set DNA special token IDs
        self.dna_begin_token_id = self.dna_token_to_id["<dna>"]
        self.dna_end_token_id = self.dna_token_to_id["</dna>"]
        self.oov_token_id = self.dna_token_to_id["<oov>"]

    def _build_combined_vocab(self):
        """Build combined vocabulary (base + DNA)."""
        self._vocab = self._base_vocab.copy()

        # A DNA token whose string already exists in the base vocabulary keeps
        # its base id in the token -> id map ...
        for token, token_id in self.dna_token_to_id.items():
            if token not in self._vocab:
                self._vocab[token] = token_id

        # ... but every DNA id is still resolvable in the id -> token map.
        self._id_to_token = {v: k for k, v in self._vocab.items()}
        for token_id, token in self.dna_id_to_token.items():
            if token_id not in self._id_to_token:
                self._id_to_token[token_id] = token

    @property
    def vocab_size(self) -> int:
        """One more than the largest id present in the combined vocabulary."""
        return max(self._vocab.values()) + 1

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the combined token -> id mapping."""
        return self._vocab.copy()

    def __len__(self):
        # Override default (len(get_vocab())) because get_vocab() deduplicates
        # CCCCCC which exists as both BPE (ID 91443) and DNA 6-mer (ID 154402).
        return self.vocab_size

    def _split_by_dna_tags(self, text: str) -> List[Tuple[str, bool]]:
        """
        Split ``text`` into ``(segment, is_dna)`` pairs in order of appearance.

        DNA segments keep whichever of their ``<dna>`` / ``</dna>`` tags are
        present. A ``</dna>`` with no preceding ``<dna>`` makes everything up to
        and including it a DNA segment; a ``<dna>`` that is never closed makes
        the rest of the text a DNA segment.
        """
        close_len = len('</dna>')
        segments = []
        i = 0
        n = len(text)

        while i < n:
            start_pos = text.find('<dna>', i)
            end_pos = text.find('</dna>', i)

            # No more tags: the rest is plain text.
            if start_pos == -1 and end_pos == -1:
                remaining = text[i:]
                if remaining:
                    segments.append((remaining, False))
                break

            # Only a closing tag remains: text up to it is an unopened DNA region.
            if start_pos == -1 and end_pos != -1:
                dna_region = text[i:end_pos + close_len]
                if dna_region:
                    segments.append((dna_region, True))
                i = end_pos + close_len
                continue

            # Only an opening tag remains: DNA region runs to the end of the text.
            if start_pos != -1 and end_pos == -1:
                if i < start_pos:
                    normal_text = text[i:start_pos]
                    if normal_text:
                        segments.append((normal_text, False))
                dna_region = text[start_pos:]
                if dna_region:
                    segments.append((dna_region, True))
                break

            if start_pos < end_pos:
                # Regular <dna>...</dna> pair, possibly preceded by plain text.
                if i < start_pos:
                    normal_text = text[i:start_pos]
                    if normal_text:
                        segments.append((normal_text, False))
                dna_region = text[start_pos:end_pos + close_len]
                if dna_region:
                    segments.append((dna_region, True))
                i = end_pos + close_len
            else:
                # A closing tag comes before the next opening tag.
                dna_region = text[i:end_pos + close_len]
                if dna_region:
                    segments.append((dna_region, True))
                i = end_pos + close_len

        return segments

    def _parse_dna_region(self, dna_region: str) -> Tuple[str, bool, bool]:
        """
        Strip the tags from a DNA segment.

        Returns:
            ``(content, has_start, has_end)`` where ``content`` is the segment
            without its leading ``<dna>`` / trailing ``</dna>`` and with
            surrounding whitespace removed.
        """
        if dna_region == '<dna>':
            return '', True, False
        elif dna_region == '</dna>':
            return '', False, True

        has_start = dna_region.startswith('<dna>')
        has_end = dna_region.endswith('</dna>')

        content = dna_region
        if has_start:
            content = content[5:]
        if has_end and content.endswith('</dna>'):
            content = content[:-6]

        return content.strip(), has_start, has_end

    def _process_dna_sequence(self, dna_seq: str) -> Dict:
        """
        Cut an (upper-cased) DNA string into non-overlapping k-mers.

        Chunks containing anything other than A/T/C/G become ``"<oov>"``. A
        trailing chunk shorter than ``k`` is right-padded with ``'A'``.

        Returns:
            Dict with ``kmer_tokens`` (list of token strings),
            ``padding_length`` (number of ``'A'`` added to the last chunk, 0 if
            none) and ``valid_length`` (real bases in the last chunk, ``k`` if
            no padding was needed).
        """
        k = self.k
        dna_seq = dna_seq.upper()

        kmer_tokens = []
        valid_bases = set('ATCG')

        def is_valid_kmer(kmer):
            return len(kmer) == k and all(base in valid_bases for base in kmer)

        for i in range(0, len(dna_seq) - k + 1, k):
            kmer = dna_seq[i:i+k]
            if is_valid_kmer(kmer):
                kmer_tokens.append(kmer)
            else:
                kmer_tokens.append("<oov>")

        processed_length = len(kmer_tokens) * k
        remaining = dna_seq[processed_length:]
        padding_length = 0
        valid_length = k

        if remaining:
            padding_needed = k - len(remaining)
            padded = remaining + 'A' * padding_needed

            if is_valid_kmer(padded):
                kmer_tokens.append(padded)
            else:
                kmer_tokens.append("<oov>")

            padding_length = padding_needed
            valid_length = len(remaining)

        return {
            "kmer_tokens": kmer_tokens,
            "padding_length": padding_length,
            "valid_length": valid_length,
        }

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Split ``text`` into single characters (``encode`` does not use this)."""
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token string to its id; DNA tokens take precedence over base tokens."""
        if token in self.dna_token_to_id:
            return self.dna_token_to_id[token]
        # Unknown tokens fall back to the base tokenizer's unk id, or 0 if it has none.
        return self._base_vocab.get(token, self._base_tokenizer.unk_token_id or 0)

    def _convert_id_to_token(self, index: int) -> str:
        """Map an id to its token string; unknown ids yield ``"<oov>"``."""
        if index in self.dna_id_to_token:
            return self.dna_id_to_token[index]
        return self._id_to_token.get(index, "<oov>")

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Concatenate token strings without any separator."""
        return "".join(tokens)

    def encode(
        self,
        text: str,
        add_special_tokens: bool = False,
        return_token_mask: bool = False,
        **kwargs
    ) -> Union[List[int], Tuple[List[int], List[int]]]:
        """
        Encode ``text`` into token ids, using k-mers inside DNA regions and the
        base BPE tokenizer everywhere else.

        Args:
            text: Input string, optionally containing ``<dna>...</dna>`` regions.
            add_special_tokens: If true, append the EOS id (when one is defined).
            return_token_mask: If true, also return a per-token mask:
                ``-1`` for text/EOS tokens, ``0`` for ``<dna>``, ``</dna>`` and
                ``<oov>``, the number of real bases for a padded final k-mer,
                and ``k`` for a full k-mer.

        Returns:
            The list of ids, or ``(ids, token_mask)`` when ``return_token_mask``.
        """
        segments = self._split_by_dna_tags(text)

        token_ids = []
        token_mask = [] if return_token_mask else None

        for segment_content, is_dna in segments:
            if is_dna:
                dna_content, has_start, has_end = self._parse_dna_region(segment_content)

                if has_start:
                    token_ids.append(self.dna_begin_token_id)
                    if return_token_mask:
                        token_mask.append(0)

                if dna_content:
                    result = self._process_dna_sequence(dna_content)
                    last_idx = len(result["kmer_tokens"]) - 1

                    for idx, kmer in enumerate(result["kmer_tokens"]):
                        token_id = self.dna_token_to_id.get(kmer, self.oov_token_id)
                        token_ids.append(token_id)

                        if return_token_mask:
                            if kmer == "<oov>":
                                token_mask.append(0)
                            elif idx == last_idx and result["padding_length"] > 0:
                                # Only the final k-mer can be a padded partial one.
                                token_mask.append(result["valid_length"])
                            else:
                                token_mask.append(self.k)

                if has_end:
                    token_ids.append(self.dna_end_token_id)
                    if return_token_mask:
                        token_mask.append(0)
            else:
                base_ids = self._base_tokenizer.encode(
                    segment_content,
                    add_special_tokens=False
                )
                token_ids.extend(base_ids)
                if return_token_mask:
                    token_mask.extend([-1] * len(base_ids))

        if add_special_tokens and self.eos_token_id is not None:
            token_ids.append(self.eos_token_id)
            if return_token_mask:
                token_mask.append(-1)

        if return_token_mask:
            return token_ids, token_mask
        return token_ids

    def decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        **kwargs
    ) -> str:
        """
        Decode token ids back into text.

        Args:
            token_ids: A single id, a sequence of ids, or any object exposing
                ``tolist()`` (e.g. a 1-D tensor or array of ids).
            skip_special_tokens: If true, EOS/PAD ids and the ``<dna>`` /
                ``</dna>`` tags are omitted from the output; k-mers are kept.

        Returns:
            The decoded string. k-mers are emitted as stored in the vocabulary,
            so ``'A'`` padding added to a final partial k-mer is not removed.
        """
        # Tensors/arrays: their elements are not usable as dict keys, so turn
        # them into plain Python ints first.
        if hasattr(token_ids, "tolist"):
            token_ids = token_ids.tolist()
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        else:
            token_ids = [int(tid) for tid in token_ids]

        if skip_special_tokens:
            special_ids = {self.eos_token_id, self.pad_token_id}
            token_ids = [tid for tid in token_ids if tid not in special_ids]

        parts = []
        i = 0
        n = len(token_ids)

        while i < n:
            tid = token_ids[i]

            if tid == self.dna_begin_token_id:
                dna_tokens = []
                i += 1

                # NOTE(review): ids that are not DNA tokens are silently dropped
                # while a <dna> region is open (including everything after an
                # unclosed <dna>). Kept as-is; confirm this is intended.
                while i < n and token_ids[i] != self.dna_end_token_id:
                    if token_ids[i] in self.dna_id_to_token:
                        dna_tokens.append(self.dna_id_to_token[token_ids[i]])
                    i += 1

                dna_seq = ''.join(dna_tokens)
                closed = i < n and token_ids[i] == self.dna_end_token_id

                if skip_special_tokens:
                    parts.append(dna_seq)
                else:
                    parts.append(f"<dna>{dna_seq}")
                    if closed:
                        parts.append("</dna>")
                if closed:
                    i += 1  # consume the </dna> id

            elif tid in self.dna_id_to_token:
                # DNA token outside an opened region, e.g. when decoding only
                # the continuation of a region whose <dna> is in the prompt.
                token = self.dna_id_to_token[tid]
                # k-mers consist of A/T/C/G only; every other DNA-vocab token
                # ("<dna>", "</dna>", "<oov>", "<unused_i>") starts with "<".
                is_kmer = not token.startswith("<")
                if is_kmer or not skip_special_tokens:
                    parts.append(token)
                i += 1

            else:
                # Run of base-vocabulary ids: hand it to the base tokenizer in
                # one call so multi-token characters decode correctly.
                text_ids = []
                while i < n and token_ids[i] not in self.dna_id_to_token:
                    text_ids.append(token_ids[i])
                    i += 1

                if text_ids:
                    decoded = self._base_tokenizer.decode(text_ids, skip_special_tokens=skip_special_tokens)
                    parts.append(decoded)

        return ''.join(parts)

    def batch_decode(
        self,
        sequences: Union[List[int], List[List[int]], "torch.Tensor"],
        skip_special_tokens: bool = False,
        **kwargs
    ) -> List[str]:
        """
        Decode each element of ``sequences`` with :meth:`decode`.

        Elements may be id sequences, objects with ``tolist()`` (tensor rows),
        or bare ints (each int is then decoded on its own).
        """
        decoded = []
        for seq in sequences:
            if hasattr(seq, 'tolist'):
                seq = seq.tolist()
            elif not isinstance(seq, int):
                seq = list(seq)
            decoded.append(
                self.decode(seq, skip_special_tokens=skip_special_tokens, **kwargs)
            )
        return decoded

    def __call__(
        self,
        text: Union[str, List[str]],
        add_special_tokens: bool = False,
        padding: bool = False,
        truncation: bool = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = None,
        return_token_mask: bool = False,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Encode one string or a list of strings.

        Args:
            text: A string, or a list of strings (treated as a batch).
            add_special_tokens: Forwarded to :meth:`encode`.
            padding: If truthy, right-pad every sequence with the pad id to the
                longest sequence (capped at ``max_length`` when given; longer
                sequences are cut to that cap).
            truncation: If true and ``max_length`` is set, cut every sequence to
                ``max_length`` ids, with or without padding.
            max_length: Length cap used by ``padding`` / ``truncation``.
            return_tensors: ``"pt"`` converts the outputs to torch tensors (a
                single string gets a leading batch dimension of 1); the value is
                also passed to ``BatchEncoding`` as ``tensor_type``.
            return_token_mask: If true, add a ``token_mask`` entry (see
                :meth:`encode`); padded positions are marked ``-2``.

        Returns:
            A ``BatchEncoding`` with ``input_ids``, ``attention_mask`` and
            optionally ``token_mask``.
        """
        is_batch = isinstance(text, list)
        texts = text if is_batch else [text]

        all_ids = []
        all_masks = [] if return_token_mask else None

        for t in texts:
            if return_token_mask:
                ids, mask = self.encode(t, add_special_tokens=add_special_tokens, return_token_mask=True)
                all_ids.append(ids)
                all_masks.append(mask)
            else:
                ids = self.encode(t, add_special_tokens=add_special_tokens, return_token_mask=False)
                all_ids.append(ids)

        # Honour truncation on its own; previously it only took effect as a
        # side effect of padding.
        if truncation and max_length:
            all_ids = [ids[:max_length] for ids in all_ids]
            if return_token_mask:
                all_masks = [mask[:max_length] for mask in all_masks]

        if padding:
            # default=0 keeps an empty batch from raising in max().
            max_len = max((len(ids) for ids in all_ids), default=0)
            if max_length:
                max_len = min(max_len, max_length)

            padded_ids = []
            attention_masks = []
            padded_token_masks = [] if return_token_mask else None

            for idx, ids in enumerate(all_ids):
                pad_len = max_len - len(ids)

                if pad_len > 0:
                    ids = ids + [self.pad_token_id] * pad_len
                    attn = [1] * (max_len - pad_len) + [0] * pad_len
                    if return_token_mask:
                        mask = all_masks[idx] + [-2] * pad_len
                else:
                    ids = ids[:max_len]
                    attn = [1] * max_len
                    if return_token_mask:
                        mask = all_masks[idx][:max_len]

                padded_ids.append(ids)
                attention_masks.append(attn)
                if return_token_mask:
                    padded_token_masks.append(mask)

            all_ids = padded_ids
            all_masks = padded_token_masks
        else:
            attention_masks = [[1] * len(ids) for ids in all_ids]

        result = {
            "input_ids": all_ids if is_batch else all_ids[0],
            "attention_mask": attention_masks if is_batch else attention_masks[0],
        }

        if return_token_mask:
            result["token_mask"] = all_masks if is_batch else all_masks[0]

        if return_tensors == "pt":
            import torch
            if is_batch:
                result["input_ids"] = torch.tensor(result["input_ids"])
                result["attention_mask"] = torch.tensor(result["attention_mask"])
                if return_token_mask:
                    result["token_mask"] = torch.tensor(result["token_mask"])
            else:
                # Single input: add a batch dimension of size 1.
                result["input_ids"] = torch.tensor([result["input_ids"]])
                result["attention_mask"] = torch.tensor([result["attention_mask"]])
                if return_token_mask:
                    result["token_mask"] = torch.tensor([result["token_mask"]])

        return BatchEncoding(result, tensor_type=return_tensors)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Write the combined vocabulary to ``[<prefix>-]vocab.json`` and return its path in a 1-tuple."""
        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + "vocab.json"
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(self._vocab, f, ensure_ascii=False, indent=2)

        return (vocab_file,)

    def save_pretrained(self, save_directory: str, **kwargs):
        """
        Save everything needed to reload this tokenizer into ``save_directory``:
        the base tokenizer files, ``dna_config.json``, an updated
        ``tokenizer_config.json`` (tokenizer class, ``auto_map`` and ``k``) and a
        copy of this module as ``tokenizer.py``.

        Returns:
            ``(save_directory,)``
        """
        os.makedirs(save_directory, exist_ok=True)

        # Save base tokenizer files
        self._base_tokenizer.save_pretrained(save_directory)

        # Save DNA config
        dna_config = {
            "k": self.k,
            "dna_start_id": self.dna_start_id,
            "dna_vocab_size": self.dna_vocab_size,
            "dna_special_tokens": self.dna_special_tokens,
        }

        dna_config_path = os.path.join(save_directory, "dna_config.json")
        with open(dna_config_path, "w", encoding="utf-8") as f:
            json.dump(dna_config, f, indent=2)

        # Update tokenizer_config.json with auto_map
        config_path = os.path.join(save_directory, "tokenizer_config.json")

        if os.path.exists(config_path):
            # Read with the same encoding the file is written with below.
            with open(config_path, "r", encoding="utf-8") as f:
                config = json.load(f)
        else:
            config = {}

        config.update({
            "tokenizer_class": "HybridDNATokenizer",
            "auto_map": {
                "AutoTokenizer": ["tokenizer.HybridDNATokenizer", None]
            },
            "k": self.k,
        })

        with open(config_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2, ensure_ascii=False)

        # Copy this tokenizer.py to save directory
        import shutil
        src_py = os.path.abspath(__file__)
        dst_py = os.path.abspath(os.path.join(save_directory, "tokenizer.py"))
        if os.path.exists(src_py):
            # Saving into the directory this module lives in must not try to
            # copy the file onto itself (shutil raises SameFileError).
            same_file = os.path.exists(dst_py) and os.path.samefile(src_py, dst_py)
            if not same_file:
                shutil.copy2(src_py, dst_py)

        return (save_directory,)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
        """
        Instantiate the tokenizer, reading ``k`` from ``dna_config.json`` (or,
        failing that, ``tokenizer_config.json``) when
        ``pretrained_model_name_or_path`` is a local directory containing one.
        ``k`` defaults to 6 otherwise.
        """
        k = 6
        dna_config_path = os.path.join(pretrained_model_name_or_path, "dna_config.json")

        if os.path.exists(dna_config_path):
            with open(dna_config_path, "r", encoding="utf-8") as f:
                dna_config = json.load(f)
            k = dna_config.get("k", 6)
        else:
            config_path = os.path.join(pretrained_model_name_or_path, "tokenizer_config.json")
            if os.path.exists(config_path):
                with open(config_path, "r", encoding="utf-8") as f:
                    config = json.load(f)
                k = config.get("k", 6)

        return cls(base_tokenizer_path=pretrained_model_name_or_path, k=k, **kwargs)
tokenizer_config.json ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|endoftext|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "HybridDNATokenizer",
238
+ "unk_token": null,
239
+ "auto_map": {
240
+ "AutoTokenizer": [
241
+ "tokenizer.HybridDNATokenizer",
242
+ null
243
+ ]
244
+ },
245
+ "k": 6
246
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff