# chess-submission-v3 / tokenizer.py
# (Hugging Face upload residue removed: uploader/commit metadata is not Python source.)
import json, os, torch
class ChessTokenizer:
    """Minimal vocabulary-backed tokenizer for chess-move sequences.

    Holds a ``token -> id`` mapping, exposes HF-style batch padding, and
    provides ``save_pretrained`` / ``from_pretrained`` helpers backed by a
    ``vocab.json`` file.
    """

    def __init__(self, vocab=None):
        # Defensive copy so external mutation of the caller's dict cannot
        # desynchronize ``vocab`` and ``id_to_token``.
        self.vocab = dict(vocab) if vocab else {}
        self.id_to_token = {v: k for k, v in self.vocab.items()}
        # Special-token ids fall back to conventional defaults when the
        # vocabulary does not define the corresponding entries.
        self.pad_token_id = self.vocab.get("[PAD]", 0)
        self.bos_token_id = self.vocab.get("[BOS]", 1)
        self.eos_token_id = self.vocab.get("[EOS]", 2)

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary."""
        return len(self.vocab)

    def _convert_token_to_id(self, token):
        """Map *token* to its id; unknown tokens map to the ``[UNK]`` id
        (or ``None`` when the vocabulary defines no ``[UNK]`` entry)."""
        return self.vocab.get(token, self.vocab.get("[UNK]"))

    def pad(self, encoded_inputs, padding=True, max_length=None,
            pad_to_multiple_of=None, return_tensors=None):
        """Pad a batch of encodings to a common length.

        Parameters
        ----------
        encoded_inputs : list of dicts, each with an ``input_ids`` list.
        padding : kept for HF-style API compatibility; padding to the
            chosen target length is always applied.
        max_length : if given, pad at least to this length. Sequences
            already longer are NOT truncated (backward-compatible with the
            previous behavior, which ignored this argument entirely).
        pad_to_multiple_of : if given, round the target length up to the
            nearest multiple (e.g. for tensor-core alignment).
        return_tensors : ``"pt"`` for a ``torch.long`` tensor, otherwise
            plain Python lists.

        Returns
        -------
        dict with a single ``"input_ids"`` entry.
        """
        batch_ids = [x["input_ids"] for x in encoded_inputs]
        # BUGFIX: the original crashed on an empty batch (max() of an empty
        # sequence) and silently ignored max_length / pad_to_multiple_of.
        target = max((len(ids) for ids in batch_ids), default=0)
        if max_length is not None:
            target = max(target, max_length)
        if pad_to_multiple_of:
            # Ceiling division without math.ceil: -(-a // b) rounds up.
            target = -(-target // pad_to_multiple_of) * pad_to_multiple_of
        padded_batch = [
            ids + [self.pad_token_id] * (target - len(ids)) for ids in batch_ids
        ]
        if return_tensors == "pt":
            return {"input_ids": torch.tensor(padded_batch, dtype=torch.long)}
        return {"input_ids": padded_batch}

    def save_pretrained(self, save_directory):
        """Write the vocabulary to ``<save_directory>/vocab.json``,
        creating the directory if needed."""
        os.makedirs(save_directory, exist_ok=True)
        path = os.path.join(save_directory, "vocab.json")
        # Explicit encoding: json text should not depend on the platform
        # default locale encoding.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=4)

    @classmethod
    def from_pretrained(cls, load_directory):
        """Load a tokenizer from ``<load_directory>/vocab.json``."""
        path = os.path.join(load_directory, "vocab.json")
        with open(path, "r", encoding="utf-8") as f:
            vocab = json.load(f)
        return cls(vocab)