import os
import json
from transformers import PreTrainedTokenizer
from huggingface_hub import hf_hub_download


class MedicalcodeTokenizer(PreTrainedTokenizer):
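    """Maps medical codes to token sequences, with optional precomputed
    embeddings, all loaded from JSON files."""
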
    def __init__(self, vocab_file, code2tokens_file, embedding_file, **kwargs):
        with open(vocab_file, "r") as f:
            self.vocab = json.load(f)
        with open(code2tokens_file, "r") as f:
            self.code2tok = json.load(f)
        with open(embedding_file, 'r') as f:
            self.code2emb = json.load(f)

        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        self.tokens_to_ids = self.vocab

        # Vocab attributes are set before super().__init__, since the base
        # class may look tokens up during initialization.
        super().__init__(**kwargs)

    def _tokenize(self, text):
        # Known codes expand to their precomputed token sequence.
        if text in self.code2tok:
            return [f"token_{t}" for t in self.code2tok[text]]
        # Unseen codes fall back to inference; _infer_and_register is
        # expected to be defined elsewhere (it is not shown in this file).
        return self._infer_and_register(text)

    def _convert_token_to_id(self, token):
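        # Fall back to the unk token's id; default to 0 if unk_token is
        # missing from the vocab.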
        return self.vocab.get(token, self.vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, idx):
        return self.ids_to_tokens.get(idx, self.unk_token)

    def embed(self, text):
        # Tokenize first; for unseen codes this presumably registers them
        # via _infer_and_register as a side effect.
        tokens = self._tokenize(text)
        # Prefer a precomputed embedding; otherwise return the token sequence.
        if text in self.code2emb:
            return self.code2emb[text]
        return tokens

    @property
    def vocab_size(self):
        # Required by PreTrainedTokenizer, whose base property is not
        # implemented; reports the base vocabulary size.
        return len(self.vocab)

    def get_vocab(self):
        return self.vocab

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        return token_ids_0 if token_ids_1 is None else token_ids_0 + token_ids_1

    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
        # No special tokens are ever added, so the mask is all zeros.
        mask = [0] * len(token_ids_0)
        if token_ids_1 is not None:
            mask += [0] * len(token_ids_1)
        return mask

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        return [0] * len(token_ids_0) if token_ids_1 is None else [0] * len(token_ids_0) + [1] * len(token_ids_1)

    def save_pretrained(self, save_directory, **kwargs):
        # Accept extra kwargs for compatibility with the base-class signature.
        os.makedirs(save_directory, exist_ok=True)
        with open(os.path.join(save_directory, "vocab.json"), "w") as f:
            json.dump(self.vocab, f, indent=2)
        with open(os.path.join(save_directory, "code2tokens.json"), "w") as f:
            json.dump(self.code2tok, f, indent=2)
        with open(os.path.join(save_directory, "code2embeddings.json"), "w") as f:
            json.dump(self.code2emb, f, indent=2)
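        # auto_map lets AutoTokenizer resolve this custom class when the
        # repo is loaded with trust_remote_code=True.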
        tokenizer_config = {
            "tokenizer_class": "tokenizer.MedicalcodeTokenizer",
            "vocab_file": "vocab.json",
            "code2tokens_file": "code2tokens.json",
            "code2embedding_file": "code2embeddings.json",
            "auto_map": {
                "AutoTokenizer": ["tokenizer", "MedicalcodeTokenizer"]
            }
        }
        with open(os.path.join(save_directory, "tokenizer_config.json"), "w") as f:
            json.dump(tokenizer_config, f, indent=2)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
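        # Note: hf_hub_download resolves files from the Hugging Face Hub
        # only; loading from a local directory would need a separate branch.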
        vocab_file = hf_hub_download(pretrained_model_name_or_path, "vocab.json")
        code2tokens_file = hf_hub_download(pretrained_model_name_or_path, "code2tokens.json")
        embedding_file = hf_hub_download(pretrained_model_name_or_path, "code2embeddings.json")
        return cls(vocab_file=vocab_file, code2tokens_file=code2tokens_file, embedding_file=embedding_file, **kwargs)
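

# Minimal usage sketch (the repo id and medical code below are placeholders,
# not part of the original file):
#
#     tok = MedicalcodeTokenizer.from_pretrained("org/medical-code-tokenizer")
#     tokens = tok._tokenize("E11.9")           # token sequence for a code
#     ids = [tok._convert_token_to_id(t) for t in tokens]
#     vec = tok.embed("E11.9")                  # embedding if known, else tokens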