Blair1213 committed on
Commit
ee828b8
·
verified ·
1 Parent(s): eceb763

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +9 -1
tokenizer.py CHANGED
@@ -19,7 +19,9 @@ class MedicalcodeTokenizer(PreTrainedTokenizer):
19
  super().__init__(**kwargs)
20
 
21
  def _tokenize(self, text):
22
- return self.code2tok.get(text, [])
 
 
23
 
24
  def _convert_token_to_id(self, token):
25
  return self.vocab.get(token, self.vocab.get(self.unk_token, 0))
@@ -27,6 +29,12 @@ class MedicalcodeTokenizer(PreTrainedTokenizer):
27
  def _convert_id_to_token(self, idx):
28
  return self.ids_to_tokens.get(idx, self.unk_token)
29
 
 
 
 
 
 
 
30
  def get_vocab(self):
31
  return self.vocab
32
 
 
19
  super().__init__(**kwargs)
20
 
21
  def _tokenize(self, text):
22
+ if text in self.code2tok:
23
+ return self.code2tok[text]
24
+ return self._infer_and_register(text)
25
 
26
  def _convert_token_to_id(self, token):
27
  return self.vocab.get(token, self.vocab.get(self.unk_token, 0))
 
29
  def _convert_id_to_token(self, idx):
30
  return self.ids_to_tokens.get(idx, self.unk_token)
31
 
32
+ def embed(self, text):
33
+ tokens = self._tokenize(text)
34
+ if text in self.code2emb:
35
+ return self.code2emb[text]
36
+ return tokens
37
+
38
  def get_vocab(self):
39
  return self.vocab
40