File size: 1,856 Bytes
b0fd683
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

from typing import List, Optional
import tokenmonster
from transformers import PreTrainedTokenizer


TOKENMONSTER_URL = (
    "https://huggingface.co/gvlassis/tokenmonster/resolve/main/"
    "englishcode-32000-strict-nocapcode-v1-eot%3D14199.vocab"
    "?download=true"
)


class CloverLMTokenizer(PreTrainedTokenizer):
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_url: str = TOKENMONSTER_URL,
                 eot_id: int = 14199, **kwargs):
        self._tm = tokenmonster.load(vocab_url)
        self._eot_id = eot_id
        self._vocab_size = 32000

        super().__init__(
            eos_token="<eot>",
            pad_token="<eot>",
            bos_token="<eot>",
            **kwargs,
        )
        self.eos_token_id = eot_id
        self.pad_token_id = eot_id
        self.bos_token_id = eot_id

    @property
    def vocab_size(self) -> int:
        return self._vocab_size

    def get_vocab(self):
        return {f"<tok_{i}>": i for i in range(self._vocab_size)}

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        ids = self._tm.tokenize(text).tolist()
        return [str(i) for i in ids]

    def _convert_token_to_id(self, token: str) -> int:
        return int(token)

    def _convert_id_to_token(self, index: int) -> str:
        return str(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        ids = [int(t) for t in tokens]
        return self._tm.decode(ids)

    @property
    def all_special_tokens_extended(self):
        return [self.eos_token]

    @property
    def all_special_tokens(self):
        return [self.eos_token]

    @property
    def all_special_ids(self):
        return [self._eot_id]

    def save_vocabulary(self, save_directory: str,
                        filename_prefix: Optional[str] = None):
        return ()