Update tokenizer.py
Browse files- tokenizer.py +9 -1
tokenizer.py
CHANGED
|
@@ -19,7 +19,9 @@ class MedicalcodeTokenizer(PreTrainedTokenizer):
|
|
| 19 |
super().__init__(**kwargs)
|
| 20 |
|
| 21 |
def _tokenize(self, text):
|
| 22 |
-
|
|
|
|
|
|
|
| 23 |
|
| 24 |
def _convert_token_to_id(self, token):
    """Return the vocabulary id for ``token``.

    Falls back to the id stored for ``self.unk_token``; if that is also
    missing from ``self.vocab``, returns 0.
    """
    return self.vocab.get(token, self.vocab.get(self.unk_token, 0))
|
|
@@ -27,6 +29,12 @@ class MedicalcodeTokenizer(PreTrainedTokenizer):
|
|
| 27 |
def _convert_id_to_token(self, idx):
    """Return the token stored for ``idx`` in ``self.ids_to_tokens``.

    Ids that are not present map to ``self.unk_token``.
    """
    return self.ids_to_tokens.get(idx, self.unk_token)
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
def get_vocab(self):
    """Return ``self.vocab`` (the mapping object itself, not a copy)."""
    return self.vocab
|
| 32 |
|
|
|
|
| 19 |
super().__init__(**kwargs)
|
| 20 |
|
| 21 |
def _tokenize(self, text):
    """Return the token entry for ``text``.

    A ``text`` already present in ``self.code2tok`` returns the stored
    entry. Otherwise the call is delegated to
    ``self._infer_and_register(text)`` and its result is returned.
    """
    if text in self.code2tok:
        return self.code2tok[text]
    # NOTE(review): `_infer_and_register` is defined outside this view;
    # presumably it adds the unseen code to the vocab -- confirm.
    return self._infer_and_register(text)
|
| 25 |
|
| 26 |
def _convert_token_to_id(self, token):
    """Return the vocabulary id for ``token``.

    Falls back to the id stored for ``self.unk_token``; if that is also
    missing from ``self.vocab``, returns 0.
    """
    return self.vocab.get(token, self.vocab.get(self.unk_token, 0))
|
|
|
|
| 29 |
def _convert_id_to_token(self, idx):
    """Return the token stored for ``idx`` in ``self.ids_to_tokens``.

    Ids that are not present map to ``self.unk_token``.
    """
    return self.ids_to_tokens.get(idx, self.unk_token)
|
| 31 |
|
| 32 |
+
def embed(self, text):
    """Return the stored embedding for ``text``, else its tokenization.

    ``self._tokenize(text)`` is always called first. If ``text`` is a key
    of ``self.code2emb`` the stored value is returned; otherwise the
    result of ``_tokenize`` is returned.
    """
    # NOTE(review): tokenization runs even when the embedding is cached,
    # presumably so unseen codes still get registered -- confirm.
    tokens = self._tokenize(text)
    if text in self.code2emb:
        return self.code2emb[text]
    # NOTE(review): the fallback returns tokens, not an embedding, so
    # callers must handle both result kinds -- confirm this is intended.
    return tokens
|
| 37 |
+
|
| 38 |
def get_vocab(self):
    """Return ``self.vocab`` (the mapping object itself, not a copy)."""
    return self.vocab
|
| 40 |
|