Blair1213 committed on
Commit
8c303bd
·
verified ·
1 Parent(s): 8436c9d

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +2 -24
tokenizer.py CHANGED
@@ -4,7 +4,7 @@ import torch
4
  from transformers import PreTrainedTokenizer
5
 
6
 
7
- class MedTok(PreTrainedTokenizer):
8
  def __init__(self, vocab_file, code2tokens_file, embedding_file, **kwargs):
9
  print(f"Loading vocab from: {vocab_file}")
10
  print(f"Loading token map from: {code2tokens_file}")
@@ -104,26 +104,4 @@ class MedTok(PreTrainedTokenizer):
104
  vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
105
  code2tokens_file = os.path.join(pretrained_model_name_or_path, "code2tokens.json")
106
  embedding_file = os.path.join(pretrained_model_name_or_path, "code2embeddings.json")
107
- return cls(vocab_file=vocab_file, code2tokens_file=code2tokens_file, embedding_file=embedding_file, **kwargs)
108
-
109
-
110
-
111
- '''vocab_path = "vocab.json"
112
- token_path = "code2tokens.json"
113
- embedding_path = "code2embeddings.json"
114
-
115
- tokenizer = MedTok(
116
- vocab_file=vocab_path,
117
- code2tokens_file=token_path,
118
- embedding_file=embedding_path,
119
- unk_token='[UNK]',
120
- pad_token='[PAD]',
121
- )
122
-
123
- tokens = tokenizer.tokenize("E11.9")
124
- ids = tokenizer.encode("E11.9")
125
- embed = tokenizer.embed("E11.9")
126
- print("Tokens:", tokens)
127
- print("Token IDs:", ids)
128
- print("Decoded:", tokenizer.decode(tokens))
129
- print("Embedding:", embed)'''
 
4
  from transformers import PreTrainedTokenizer
5
 
6
 
7
+ class MedicalcodeTokenizer(PreTrainedTokenizer):
8
  def __init__(self, vocab_file, code2tokens_file, embedding_file, **kwargs):
9
  print(f"Loading vocab from: {vocab_file}")
10
  print(f"Loading token map from: {code2tokens_file}")
 
104
  vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
105
  code2tokens_file = os.path.join(pretrained_model_name_or_path, "code2tokens.json")
106
  embedding_file = os.path.join(pretrained_model_name_or_path, "code2embeddings.json")
107
+ return cls(vocab_file=vocab_file, code2tokens_file=code2tokens_file, embedding_file=embedding_file, **kwargs)