seconds-0 committed · verified
Commit d4bae99 · 1 Parent(s): 9a7a74a

Upload tokenizer

chat_template.jinja ADDED
@@ -0,0 +1,3 @@
+ {% for m in messages %}{% if m['role']=='user' %}<|user|>{{ m['content'] }}
+ {% elif m['role']=='assistant' %}<|assistant|>{{ m['content'] }}
+ {% endif %}{% endfor %}<|assistant|>
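
For reference, a minimal sketch of how this three-line template renders, using jinja2 directly (transformers applies the same file through tokenizer.apply_chat_template); the messages are illustrative:

    from jinja2 import Template

    template = Template(
        "{% for m in messages %}{% if m['role']=='user' %}<|user|>{{ m['content'] }}\n"
        "{% elif m['role']=='assistant' %}<|assistant|>{{ m['content'] }}\n"
        "{% endif %}{% endfor %}<|assistant|>"
    )
    messages = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi!"},
        {"role": "user", "content": "What can you do?"},
    ]
    print(template.render(messages=messages))
    # <|user|>Hello
    # <|assistant|>Hi!
    # <|user|>What can you do?
    # <|assistant|>

Note that the template always ends with a bare <|assistant|> tag, so the rendered prompt is ready for generation.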
special_tokens_map.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "eos_token": "<0>",
+   "pad_token": "<0>"
+ }
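
Because the byte-level vocabulary in tokenization_nsa.py (below) maps each token "<i>" to id i, this file pins both EOS and PAD to "<0>", i.e. byte 0. A quick check, instantiating the class directly (a sketch, not part of the commit):

    from tokenization_nsa import NSAByteTokenizer  # the class added below

    tok = NSAByteTokenizer(eos_token="<0>", pad_token="<0>")
    assert tok.eos_token_id == 0 and tok.pad_token_id == 0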
tokenization_nsa.py ADDED
@@ -0,0 +1,73 @@
+ # Remote code: byte-level tokenizer for NSA
+ from typing import List, Optional, Dict
+ import json
+ from transformers import PreTrainedTokenizer
+
+
+ class NSAByteTokenizer(PreTrainedTokenizer):
+     """A simple byte-level tokenizer with a fixed vocab size of 256.
+
+     - Encodes the UTF-8 bytes of the input string as token ids 0..255.
+     - No special tokens by default; EOS/PAD can be configured via the special tokens map.
+     - Decoding uses UTF-8 with replacement for invalid sequences.
+     """
+
+     def __init__(self, **kwargs):
+         # Build a stable 256-entry vocab mapping before base init (base may query the vocab)
+         self._vocab: Dict[str, int] = {f"<{i}>": i for i in range(256)}
+         self._ids_to_tokens: Dict[int, str] = {i: f"<{i}>" for i in range(256)}
+         super().__init__(**kwargs)
+         # Only return input_ids and attention_mask to avoid unused token_type_ids in generation
+         self.model_input_names = ["input_ids", "attention_mask"]
+
+     @property
+     def vocab_size(self) -> int:  # type: ignore[override]
+         return 256
+
+     def get_vocab(self) -> Dict[str, int]:  # type: ignore[override]
+         return dict(self._vocab)
+
+     def _tokenize(self, text: str) -> List[str]:  # type: ignore[override]
+         data = text.encode("utf-8", errors="replace")
+         return [f"<{b}>" for b in data]
+
+     def _convert_token_to_id(self, token: str) -> int:  # type: ignore[override]
+         if token in self._vocab:
+             return self._vocab[token]
+         # Fallback: try to parse the numeric value inside <..>
+         if token.startswith("<") and token.endswith(">"):
+             try:
+                 v = int(token[1:-1])
+                 if 0 <= v < 256:
+                     return v
+             except Exception:
+                 pass
+         return 0
+
+     def _convert_id_to_token(self, index: int) -> str:  # type: ignore[override]
+         return self._ids_to_tokens.get(int(index) % 256, "<0>")
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:  # type: ignore[override]
+         bs = []
+         for t in tokens:
+             if t in self._vocab:
+                 bs.append(self._vocab[t])
+             else:
+                 try:
+                     if t.startswith("<") and t.endswith(">"):
+                         v = int(t[1:-1])
+                         if 0 <= v < 256:
+                             bs.append(v)
+                             continue
+                 except Exception:
+                     pass
+         return bytes(bs).decode("utf-8", errors="replace")
+
+     def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:  # type: ignore[override]
+         if token_ids_1 is None:
+             return token_ids_0
+         return token_ids_0 + token_ids_1
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):  # type: ignore[override]
+         # Nothing to save besides the special tokens map, which the base class handles.
+         return ()
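
A minimal usage sketch of the tokenizer above, instantiated directly without any download (the special-token kwargs mirror special_tokens_map.json):

    from tokenization_nsa import NSAByteTokenizer

    tok = NSAByteTokenizer(eos_token="<0>", pad_token="<0>")
    enc = tok("héllo")                   # UTF-8 bytes -> ids in 0..255
    print(enc["input_ids"])              # [104, 195, 169, 108, 108, 111] ("é" is two bytes)
    print(tok.decode(enc["input_ids"]))  # héllo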
tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<0>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_nsa.NSAByteTokenizer",
+       null
+     ]
+   },
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<0>",
+   "extra_special_tokens": {},
+   "model_max_length": 2048,
+   "pad_token": "<0>",
+   "tokenizer_class": "NSAByteTokenizer"
+ }
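
The auto_map entry is what lets AutoTokenizer import NSAByteTokenizer from tokenization_nsa.py; the null second slot means no fast (Rust) tokenizer is provided. A loading sketch, with "path/to/nsa-tokenizer" standing in for the actual repo id or local directory:

    from transformers import AutoTokenizer

    # trust_remote_code is required so the custom class in tokenization_nsa.py gets imported
    tok = AutoTokenizer.from_pretrained("path/to/nsa-tokenizer", trust_remote_code=True)
    print(tok("hello")["input_ids"])           # [104, 101, 108, 108, 111]
    print(tok.eos_token_id, tok.pad_token_id)  # 0 0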