```python
import json
import os

import torch


class ChessTokenizer:
    """Minimal tokenizer over a chess-move vocabulary, mirroring the small
    slice of the Hugging Face tokenizer API that a collator/Trainer needs."""

    def __init__(self, vocab=None):
        self.vocab = vocab if vocab else {}
        self.id_to_token = {v: k for k, v in self.vocab.items()}
        # Fall back to conventional ids if the special tokens are absent.
        self.pad_token_id = self.vocab.get("[PAD]", 0)
        self.bos_token_id = self.vocab.get("[BOS]", 1)
        self.eos_token_id = self.vocab.get("[EOS]", 2)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def _convert_token_to_id(self, token):
        # Unknown moves map to the [UNK] id (None if [UNK] is also missing).
        return self.vocab.get(token, self.vocab.get("[UNK]"))

    def pad(self, encoded_inputs, padding=True, max_length=None,
            pad_to_multiple_of=None, return_tensors=None):
        # Right-pad every sequence to the longest one in the batch.
        # padding / max_length / pad_to_multiple_of are accepted for API
        # compatibility; only dynamic per-batch padding is implemented here.
        batch_ids = [x["input_ids"] for x in encoded_inputs]
        max_len = max(len(ids) for ids in batch_ids)
        padded_batch = []
        for ids in batch_ids:
            padded_batch.append(ids + [self.pad_token_id] * (max_len - len(ids)))
        if return_tensors == "pt":
            return {"input_ids": torch.tensor(padded_batch, dtype=torch.long)}
        return {"input_ids": padded_batch}

    def save_pretrained(self, save_directory):
        os.makedirs(save_directory, exist_ok=True)
        with open(os.path.join(save_directory, "vocab.json"), "w") as f:
            json.dump(self.vocab, f, indent=4)

    @classmethod
    def from_pretrained(cls, load_directory):
        with open(os.path.join(load_directory, "vocab.json"), "r") as f:
            vocab = json.load(f)
        return cls(vocab)
```
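As a quick sanity check, here is a minimal round-trip sketch. The toy vocabulary and the `chess_tok` directory name are illustrative assumptions, not part of the class above:

```python
# Toy vocabulary for illustration only; a real one would cover the full move set.
vocab = {"[PAD]": 0, "[BOS]": 1, "[EOS]": 2, "[UNK]": 3,
         "e2e4": 4, "e7e5": 5, "g1f3": 6}
tok = ChessTokenizer(vocab)

# Unknown moves fall back to the [UNK] id.
ids = [tok._convert_token_to_id(m) for m in ["e2e4", "h7h5"]]  # -> [4, 3]

# Dynamic padding: the shorter row is right-padded with [PAD].
batch = [
    {"input_ids": [tok.bos_token_id, 4, 5, tok.eos_token_id]},
    {"input_ids": [tok.bos_token_id, 6, tok.eos_token_id]},
]
padded = tok.pad(batch, return_tensors="pt")
print(padded["input_ids"].shape)  # torch.Size([2, 4])

# Save/load round-trip via vocab.json.
tok.save_pretrained("chess_tok")
tok2 = ChessTokenizer.from_pretrained("chess_tok")
assert tok2.vocab_size == tok.vocab_size
```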