Buckets:

dpe1
/

jules-tinyreasoner

Files

xet

dpe1/jules-tinyreasoner/src/tokenizer.py

dpe1

10 days ago

download

raw

3.02 kB

	import torch

	class CharTokenizer:
	def __init__(self):
	# Base characters: numbers (10), lowercase english (26), punctuation/space (around 32)
	# Plus special tokens: [UPPER], [DEFINE], [SYMPY], [CAPABILITY_STOP], [PAD], [BOS], [EOS]

	self.chars = "0123456789abcdefghijklmnopqrstuvwxyz !\"#$%&'()*+,-./:;<=>?@[\\]^_`{\|}~"
	self.special_tokens = ["[UPPER]", "[DEFINE]", "[SYMPY]", "[CAPABILITY_STOP]", "[PAD]", "[BOS]", "[EOS]"]

	self.stoi = {char: i for i, char in enumerate(self.chars)}
	start_idx = len(self.chars)
	for i, token in enumerate(self.special_tokens):
	self.stoi[token] = start_idx + i

	self.itos = {i: s for s, i in self.stoi.items()}
	self.vocab_size = len(self.stoi)

	self.pad_token_id = self.stoi["[PAD]"]
	self.bos_token_id = self.stoi["[BOS]"]
	self.eos_token_id = self.stoi["[EOS]"]
	self.upper_token_id = self.stoi["[UPPER]"]
	self.define_token_id = self.stoi["[DEFINE]"]
	self.sympy_token_id = self.stoi["[SYMPY]"]
	self.stop_token_id = self.stoi["[CAPABILITY_STOP]"]

	def encode(self, text):
	tokens = []
	i = 0
	while i < len(text):
	# Check for special tokens first
	found_special = False
	for token in self.special_tokens:
	if text.startswith(token, i):
	tokens.append(self.stoi[token])
	i += len(token)
	found_special = True
	break
	if found_special:
	continue

	char = text[i]
	if char.isupper():
	tokens.append(self.upper_token_id)
	char = char.lower()

	if char in self.stoi:
	tokens.append(self.stoi[char])
	else:
	# Unknown characters treated as space or ignored? Let's just skip or use a '?'
	tokens.append(self.stoi.get('?', self.stoi[' ']))
	i += 1
	return tokens

	def decode(self, tokens):
	res = ""
	upper_next = False
	i = 0
	while i < len(tokens):
	t = tokens[i]
	s = self.itos.get(t, "")
	if s == "[UPPER]":
	upper_next = True
	elif s in self.special_tokens:
	res += s
	upper_next = False
	else:
	if upper_next:
	res += s.upper()
	upper_next = False
	else:
	res += s
	i += 1
	return res

	if __name__ == "__main__":
	tokenizer = CharTokenizer()
	test_str = "Hello World! 123 [DEFINE]test[CAPABILITY_STOP]"
	encoded = tokenizer.encode(test_str)
	decoded = tokenizer.decode(encoded)
	print(f"Original: {test_str}")
	print(f"Encoded: {encoded}")
	print(f"Decoded: {decoded}")
	assert test_str == decoded, f"Mismatch: {test_str} != {decoded}"
	print("Tokenizer test passed!")

Xet Storage Details

Size: 3.02 kB
Xet hash: 3fdc076415c0efe5fc97be4118871415a7ae1017b8461ecc9c4062a4621bfca2

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.