# tokenizer.py class ChessTokenizer: def __init__(self, chars = "abcdefgh0123456789prnqkPRNBQK/- w"): self.char_to_int = {"": 0, "": 1, "": 2} self.int_to_char = {0: "", 1: "", 2: ""} for i, char in enumerate(chars): self.char_to_int[char] = i + 3 self.int_to_char[i + 3] = char self.vocab_size = len(self.char_to_int) def encode(self, text, is_target=False): tokens = [self.char_to_int[c] for c in text if c in self.char_to_int] if is_target: tokens = [self.char_to_int[""]] + tokens + [self.char_to_int[""]] return tokens def decode(self, tokens): chars = [] for token in tokens: if token in (0, 1): continue if token == 2: break chars.append(self.int_to_char.get(token, "")) return "".join(chars)