File size: 3,242 Bytes
6957bb5 eceb763 6957bb5 8c303bd 6957bb5 48e91ff 6957bb5 ee828b8 0e38f41 ee828b8 6957bb5 ee828b8 6957bb5 48e91ff 6957bb5 48e91ff 6957bb5 4268d0d 48e91ff 6957bb5 22c914e 6957bb5 c742bef 48e91ff c742bef 6957bb5 b272d5a 48e91ff |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import os
import json
from transformers import PreTrainedTokenizer
from huggingface_hub import hf_hub_download
class MedicalcodeTokenizer(PreTrainedTokenizer):
    """Tokenizer that maps medical codes to pre-assigned sub-token sequences.

    Three JSON files back the tokenizer:

    * ``vocab_file``       -- token string -> integer id.
    * ``code2tokens_file`` -- medical code -> list of token numbers.
    * ``embedding_file``   -- medical code -> precomputed embedding.
    """

    def __init__(self, vocab_file, code2tokens_file, embedding_file, **kwargs):
        # Load all mappings *before* super().__init__(), because the base
        # class may call get_vocab() during its own initialization.
        with open(vocab_file, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        with open(code2tokens_file, "r", encoding="utf-8") as f:
            self.code2tok = json.load(f)
        with open(embedding_file, "r", encoding="utf-8") as f:
            self.code2emb = json.load(f)
        self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
        self.tokens_to_ids = self.vocab  # alias: both names refer to one dict
        super().__init__(**kwargs)

    def _tokenize(self, text):
        """Map a known code to its ``token_<n>`` strings.

        NOTE(review): ``_infer_and_register`` is not defined anywhere in this
        file -- confirm it is provided elsewhere; otherwise unknown codes
        raise AttributeError here.
        """
        if text in self.code2tok:
            return [f"token_{t}" for t in self.code2tok[text]]
        # BUG FIX: the original had this return statement duplicated; the
        # second copy was unreachable dead code and has been removed.
        return self._infer_and_register(text)

    def _convert_token_to_id(self, token):
        """Return the id for *token*, falling back to the unk id (or 0)."""
        return self.vocab.get(token, self.vocab.get(self.unk_token, 0))

    def _convert_id_to_token(self, idx):
        """Return the token string for *idx*, or the unk token."""
        return self.ids_to_tokens.get(idx, self.unk_token)

    def embed(self, text):
        """Return the precomputed embedding for *text* if one exists,
        otherwise the token list produced by ``_tokenize``.

        BUG FIX: the original tokenized unconditionally and only afterwards
        checked ``code2emb``; for a code that has an embedding but no token
        mapping, that crashed inside ``_infer_and_register``. Checking the
        embedding table first avoids the crash and the wasted tokenization.
        """
        if text in self.code2emb:
            return self.code2emb[text]
        return self._tokenize(text)

    def get_vocab(self):
        """Return the full token -> id mapping."""
        return self.vocab

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Concatenate sequences; this tokenizer adds no special tokens."""
        return token_ids_0 if token_ids_1 is None else token_ids_0 + token_ids_1

    def get_special_tokens_mask(self, token_ids, already_has_special_tokens=False):
        """No positions are special tokens, so the mask is all zeros."""
        return [0] * len(token_ids)

    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
        """Segment ids: 0 for the first sequence, 1 for the optional second."""
        if token_ids_1 is None:
            return [0] * len(token_ids_0)
        return [0] * len(token_ids_0) + [1] * len(token_ids_1)

    def save_pretrained(self, save_directory):
        """Write the three JSON data files plus ``tokenizer_config.json``."""
        os.makedirs(save_directory, exist_ok=True)
        with open(os.path.join(save_directory, "vocab.json"), "w", encoding="utf-8") as f:
            json.dump(self.vocab, f, indent=2)
        with open(os.path.join(save_directory, "code2tokens.json"), "w", encoding="utf-8") as f:
            json.dump(self.code2tok, f, indent=2)
        with open(os.path.join(save_directory, "code2embeddings.json"), "w", encoding="utf-8") as f:
            json.dump(self.code2emb, f, indent=2)
        tokenizer_config = {
            "tokenizer_class": "tokenizer.MedicalcodeTokenizer",
            "vocab_file": "vocab.json",
            "code2tokens_file": "code2tokens.json",
            # BUG FIX: the original wrote the key "code2embedding_file", which
            # does not match the __init__ keyword "embedding_file" and broke
            # save -> load round-trips through the config.
            "embedding_file": "code2embeddings.json",
            "auto_map": {
                "AutoTokenizer": ["tokenizer", "MedicalcodeTokenizer"]
            }
        }
        with open(os.path.join(save_directory, "tokenizer_config.json"), "w", encoding="utf-8") as f:
            json.dump(tokenizer_config, f, indent=2)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load the tokenizer from a local directory or a Hub repo id.

        BUG FIX: the original always called ``hf_hub_download``, which fails
        for a local directory produced by ``save_pretrained``; local paths
        are now read directly.
        """
        if os.path.isdir(pretrained_model_name_or_path):
            base = pretrained_model_name_or_path
            vocab_file = os.path.join(base, "vocab.json")
            code2tokens_file = os.path.join(base, "code2tokens.json")
            embedding_file = os.path.join(base, "code2embeddings.json")
        else:
            vocab_file = hf_hub_download(pretrained_model_name_or_path, "vocab.json")
            code2tokens_file = hf_hub_download(pretrained_model_name_or_path, "code2tokens.json")
            embedding_file = hf_hub_download(pretrained_model_name_or_path, "code2embeddings.json")
        return cls(
            vocab_file=vocab_file,
            code2tokens_file=code2tokens_file,
            embedding_file=embedding_file,
            **kwargs,
        )
|