import json
import os
import torch

class ChessTokenizer:
    """Minimal vocabulary-based tokenizer for chess move sequences,
    mimicking a small subset of the Hugging Face tokenizer interface
    (`pad`, `save_pretrained`, `from_pretrained`)."""

    def __init__(self, vocab=None):
        self.vocab = vocab if vocab else {}
        # Reverse mapping for decoding ids back to move tokens.
        self.id_to_token = {v: k for k, v in self.vocab.items()}
        # Special-token ids, falling back to conventional defaults when
        # the tokens are absent from the vocab.
        self.pad_token_id = self.vocab.get("[PAD]", 0)
        self.bos_token_id = self.vocab.get("[BOS]", 1)
        self.eos_token_id = self.vocab.get("[EOS]", 2)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def _convert_token_to_id(self, token):
        # Fall back to the [UNK] id for out-of-vocabulary tokens; note this
        # returns None if the vocab defines no "[UNK]" entry.
        return self.vocab.get(token, self.vocab.get("[UNK]"))

    def pad(self, encoded_inputs, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors=None):
        """Pad a batch of encodings to a common length, mirroring the
        Hugging Face `tokenizer.pad` signature; `padding` is accepted
        for interface compatibility (sequences are always padded)."""
        batch_ids = [x["input_ids"] for x in encoded_inputs]
        # Pad to the longest sequence in the batch, or to max_length if given.
        max_len = max_length if max_length is not None else max(len(ids) for ids in batch_ids)
        # Round the target length up to a multiple, if requested.
        if pad_to_multiple_of is not None and max_len % pad_to_multiple_of != 0:
            max_len = ((max_len // pad_to_multiple_of) + 1) * pad_to_multiple_of
        padded_batch = []
        for ids in batch_ids:
            # Truncate anything longer than the target, then right-pad.
            ids = ids[:max_len]
            padded_batch.append(ids + [self.pad_token_id] * (max_len - len(ids)))
        if return_tensors == "pt":
            return {"input_ids": torch.tensor(padded_batch, dtype=torch.long)}
        return {"input_ids": padded_batch}

    def save_pretrained(self, save_directory):
        os.makedirs(save_directory, exist_ok=True)
        with open(os.path.join(save_directory, "vocab.json"), "w") as f:
            json.dump(self.vocab, f, indent=4)

    @classmethod
    def from_pretrained(cls, load_directory):
        with open(os.path.join(load_directory, "vocab.json"), "r") as f:
            vocab = json.load(f)
        return cls(vocab)
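

# Illustrative usage sketch: the vocabulary and moves below are a hypothetical
# toy example; a real vocab would be built from a corpus of games.
if __name__ == "__main__":
    vocab = {"[PAD]": 0, "[BOS]": 1, "[EOS]": 2, "[UNK]": 3, "e4": 4, "e5": 5, "Nf3": 6}
    tokenizer = ChessTokenizer(vocab)

    # Two move sequences of different lengths.
    batch = [
        {"input_ids": [tokenizer.bos_token_id,
                       tokenizer._convert_token_to_id("e4"),
                       tokenizer._convert_token_to_id("e5"),
                       tokenizer.eos_token_id]},
        {"input_ids": [tokenizer.bos_token_id,
                       tokenizer._convert_token_to_id("Nf3"),
                       tokenizer.eos_token_id]},
    ]

    # Pad to a common length and get a PyTorch tensor back.
    padded = tokenizer.pad(batch, return_tensors="pt")
    print(padded["input_ids"])   # shape (2, 4); the second row ends in [PAD]
    print(tokenizer.vocab_size)  # 7

    # Round-trip the vocabulary through disk.
    tokenizer.save_pretrained("chess_tokenizer")
    reloaded = ChessTokenizer.from_pretrained("chess_tokenizer")
    assert reloaded.vocab == vocab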