dpe1/jules-tinyreasoner / src /tokenizer.py
dpe1's picture
download
raw
3.02 kB
import torch
class CharTokenizer:
    """Character-level tokenizer with bracketed special tokens.

    The vocabulary is the digits, the lowercase ASCII letters, space and
    ASCII punctuation, followed by the special tokens listed in
    ``special_tokens``. Uppercase letters are not stored directly: they are
    written as the ``[UPPER]`` token followed by the lowercase letter.

    Attributes set by ``__init__``:
        chars: string of all single-character vocabulary entries.
        special_tokens: list of multi-character special token strings.
        stoi / itos: token-string -> id and id -> token-string mappings.
        vocab_size: total number of ids.
        *_token_id: ids of the individual special tokens.
    """

    def __init__(self):
        # Base characters: digits (10), lowercase english (26), space and punctuation (33).
        # Plus special tokens: [UPPER], [DEFINE], [SYMPY], [CAPABILITY_STOP], [PAD], [BOS], [EOS]
        self.chars = "0123456789abcdefghijklmnopqrstuvwxyz !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
        self.special_tokens = ["[UPPER]", "[DEFINE]", "[SYMPY]", "[CAPABILITY_STOP]", "[PAD]", "[BOS]", "[EOS]"]

        # Characters take ids 0..len(chars)-1; special tokens follow directly after.
        self.stoi = {char: i for i, char in enumerate(self.chars)}
        start_idx = len(self.chars)
        for i, token in enumerate(self.special_tokens):
            self.stoi[token] = start_idx + i
        self.itos = {i: s for s, i in self.stoi.items()}
        self.vocab_size = len(self.stoi)

        self.pad_token_id = self.stoi["[PAD]"]
        self.bos_token_id = self.stoi["[BOS]"]
        self.eos_token_id = self.stoi["[EOS]"]
        self.upper_token_id = self.stoi["[UPPER]"]
        self.define_token_id = self.stoi["[DEFINE]"]
        self.sympy_token_id = self.stoi["[SYMPY]"]
        self.stop_token_id = self.stoi["[CAPABILITY_STOP]"]

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Special token strings appearing literally in ``text`` become their
        single id. An uppercase character whose lowercase form is in the
        vocabulary becomes ``[UPPER]`` followed by the lowercase id. Any
        character outside the vocabulary becomes the id of ``'?'``.

        Args:
            text: the string to tokenize.

        Returns:
            A list of integer token ids (empty for empty input).
        """
        # Placeholder for out-of-vocabulary characters; computed once.
        unknown_id = self.stoi.get('?', self.stoi[' '])
        ids = []
        pos = 0
        end = len(text)
        while pos < end:
            # Special tokens take priority over character-by-character encoding.
            special = next(
                (tok for tok in self.special_tokens if text.startswith(tok, pos)),
                None,
            )
            if special is not None:
                ids.append(self.stoi[special])
                pos += len(special)
                continue

            char = text[pos]
            pos += 1
            if char.isupper():
                lowered = char.lower()
                if lowered in self.stoi:
                    ids.append(self.upper_token_id)
                    ids.append(self.stoi[lowered])
                else:
                    # No [UPPER] marker here: the placeholder has no case,
                    # so the marker would carry no information.
                    ids.append(unknown_id)
            else:
                ids.append(self.stoi.get(char, unknown_id))
        return ids

    def decode(self, tokens):
        """Convert token ids back into a string.

        ``[UPPER]`` is not emitted; it uppercases the next character token.
        Other special tokens are emitted as their bracketed text and cancel a
        pending ``[UPPER]``. Ids that are not in the vocabulary contribute
        nothing to the output.

        Args:
            tokens: an iterable of token ids. Objects exposing ``tolist()``
                (e.g. tensors or arrays) are converted to a plain list first,
                so their elements are looked up as Python ints.

        Returns:
            The decoded string (empty for empty input).
        """
        # Tensor elements hash by identity and would never match the int keys
        # of ``itos``; converting up front makes the lookup work.
        if hasattr(tokens, "tolist"):
            tokens = tokens.tolist()

        pieces = []
        upper_next = False
        for token in tokens:
            piece = self.itos.get(token, "")
            if piece == "[UPPER]":
                upper_next = True
            elif piece in self.special_tokens:
                pieces.append(piece)
                upper_next = False
            else:
                pieces.append(piece.upper() if upper_next else piece)
                upper_next = False
        return "".join(pieces)
if __name__ == "__main__":
    # Smoke test: a string mixing uppercase letters, digits, punctuation and
    # special tokens must survive an encode/decode round trip unchanged.
    sample = "Hello World! 123 [DEFINE]test[CAPABILITY_STOP]"
    tok = CharTokenizer()
    ids = tok.encode(sample)
    round_trip = tok.decode(ids)

    report = (("Original", sample), ("Encoded", ids), ("Decoded", round_trip))
    for label, value in report:
        print(f"{label}: {value}")

    assert sample == round_trip, f"Mismatch: {sample} != {round_trip}"
    print("Tokenizer test passed!")

Xet Storage Details

Size:
3.02 kB
·
Xet hash:
3fdc076415c0efe5fc97be4118871415a7ae1017b8461ecc9c4062a4621bfca2

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.