import json
import os

import torch


class ChessTokenizer:
    """Vocabulary-based tokenizer for chess move sequences."""

    def __init__(self, vocab=None):
        self.vocab = vocab if vocab else {}
        # Reverse mapping for decoding ids back to tokens.
        self.id_to_token = {v: k for k, v in self.vocab.items()}
        self.pad_token_id = self.vocab.get("[PAD]", 0)
        self.bos_token_id = self.vocab.get("[BOS]", 1)
        self.eos_token_id = self.vocab.get("[EOS]", 2)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def _convert_token_to_id(self, token):
        # Unknown tokens fall back to [UNK]; returns None if [UNK] is absent from the vocab.
        return self.vocab.get(token, self.vocab.get("[UNK]"))

    def pad(self, encoded_inputs, padding=True, max_length=None,
            pad_to_multiple_of=None, return_tensors=None):
        """Right-pad a batch of encodings to a common length with the pad token.

        `padding` is accepted for API compatibility; this implementation always pads.
        Sequences longer than `max_length` are left as-is (no truncation).
        """
        batch_ids = [x["input_ids"] for x in encoded_inputs]
        # Pad to max_length if given, otherwise to the longest sequence in the batch.
        max_len = max_length if max_length is not None else max(len(ids) for ids in batch_ids)
        if pad_to_multiple_of is not None:
            # Round the target length up to the nearest multiple.
            max_len = ((max_len + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of
        padded_batch = [ids + [self.pad_token_id] * (max_len - len(ids)) for ids in batch_ids]
        if return_tensors == "pt":
            return {"input_ids": torch.tensor(padded_batch, dtype=torch.long)}
        return {"input_ids": padded_batch}

    def save_pretrained(self, save_directory):
        # Persist the vocabulary so the tokenizer can be reconstructed later.
        os.makedirs(save_directory, exist_ok=True)
        with open(os.path.join(save_directory, "vocab.json"), "w") as f:
            json.dump(self.vocab, f, indent=4)

    @classmethod
    def from_pretrained(cls, load_directory):
        # Rebuild the tokenizer from a vocabulary saved by save_pretrained.
        with open(os.path.join(load_directory, "vocab.json"), "r") as f:
            vocab = json.load(f)
        return cls(vocab)
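
# Minimal usage sketch (added for illustration; the vocabulary and token ids
# below are hypothetical, not part of the original source).
if __name__ == "__main__":
    vocab = {"[PAD]": 0, "[BOS]": 1, "[EOS]": 2, "[UNK]": 3, "e2e4": 4, "e7e5": 5}
    tokenizer = ChessTokenizer(vocab)

    # Pad two sequences of different lengths into one rectangular tensor.
    batch = [
        {"input_ids": [tokenizer.bos_token_id, 4, 5, tokenizer.eos_token_id]},
        {"input_ids": [tokenizer.bos_token_id, 4, tokenizer.eos_token_id]},
    ]
    padded = tokenizer.pad(batch, return_tensors="pt")
    print(padded["input_ids"].shape)  # torch.Size([2, 4])

    # Round-trip through save_pretrained / from_pretrained.
    tokenizer.save_pretrained("chess_tokenizer")
    reloaded = ChessTokenizer.from_pretrained("chess_tokenizer")
    assert reloaded.vocab_size == tokenizer.vocab_size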