Update tokenizer.py
Browse files
- tokenizer.py +2 -24
tokenizer.py
CHANGED
|
@@ -4,7 +4,7 @@ import torch
|
|
| 4 |
from transformers import PreTrainedTokenizer
|
| 5 |
|
| 6 |
|
| 7 |
-
class MedTok(PreTrainedTokenizer):
|
| 8 |
def __init__(self, vocab_file, code2tokens_file, embedding_file, **kwargs):
|
| 9 |
print(f"Loading vocab from: {vocab_file}")
|
| 10 |
print(f"Loading token map from: {code2tokens_file}")
|
|
@@ -104,26 +104,4 @@ class MedTok(PreTrainedTokenizer):
|
|
| 104 |
vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
|
| 105 |
code2tokens_file = os.path.join(pretrained_model_name_or_path, "code2tokens.json")
|
| 106 |
embedding_file = os.path.join(pretrained_model_name_or_path, "code2embeddings.json")
|
| 107 |
-
return cls(vocab_file=vocab_file, code2tokens_file=code2tokens_file, embedding_file=embedding_file, **kwargs)
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
'''vocab_path = "vocab.json"
|
| 112 |
-
token_path = "code2tokens.json"
|
| 113 |
-
embedding_path = "code2embeddings.json"
|
| 114 |
-
|
| 115 |
-
tokenizer = MedTok(
|
| 116 |
-
vocab_file=vocab_path,
|
| 117 |
-
code2tokens_file=token_path,
|
| 118 |
-
embedding_file=embedding_path,
|
| 119 |
-
unk_token='[UNK]',
|
| 120 |
-
pad_token='[PAD]',
|
| 121 |
-
)
|
| 122 |
-
|
| 123 |
-
tokens = tokenizer.tokenize("E11.9")
|
| 124 |
-
ids = tokenizer.encode("E11.9")
|
| 125 |
-
embed = tokenizer.embed("E11.9")
|
| 126 |
-
print("Tokens:", tokens)
|
| 127 |
-
print("Token IDs:", ids)
|
| 128 |
-
print("Decoded:", tokenizer.decode(tokens))
|
| 129 |
-
print("Embedding:", embed)'''
|
|
|
|
| 4 |
from transformers import PreTrainedTokenizer
|
| 5 |
|
| 6 |
|
| 7 |
+
class MedicalcodeTokenizer(PreTrainedTokenizer):
|
| 8 |
def __init__(self, vocab_file, code2tokens_file, embedding_file, **kwargs):
|
| 9 |
print(f"Loading vocab from: {vocab_file}")
|
| 10 |
print(f"Loading token map from: {code2tokens_file}")
|
|
|
|
| 104 |
vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
|
| 105 |
code2tokens_file = os.path.join(pretrained_model_name_or_path, "code2tokens.json")
|
| 106 |
embedding_file = os.path.join(pretrained_model_name_or_path, "code2embeddings.json")
|
| 107 |
+
return cls(vocab_file=vocab_file, code2tokens_file=code2tokens_file, embedding_file=embedding_file, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|