Blair1213 committed on
Commit
48e91ff
·
verified ·
1 Parent(s): 7666a84

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +7 -48
tokenizer.py CHANGED
@@ -1,43 +1,24 @@
1
  import os
2
  import json
3
- import torch
4
  from transformers import PreTrainedTokenizer
5
 
6
 
7
  class MedicalcodeTokenizer(PreTrainedTokenizer):
8
  def __init__(self, vocab_file, code2tokens_file, embedding_file, **kwargs):
9
- print(f"Loading vocab from: {vocab_file}")
10
- print(f"Loading token map from: {code2tokens_file}")
11
-
12
  with open(vocab_file, "r") as f:
13
  self.vocab = json.load(f)
14
-
15
  with open(code2tokens_file, "r") as f:
16
  self.code2tok = json.load(f)
17
-
18
  with open(embedding_file, 'r') as f:
19
  self.code2emb = json.load(f)
20
 
21
  self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
22
- self.tokens_to_ids = self.vocab # alias
23
 
24
  super().__init__(**kwargs)
25
 
26
- # ---------- required interface ----------
27
  def _tokenize(self, text):
28
- if text in self.code2tok:
29
- return self.code2tok[text]
30
- return self._infer_and_register(text)
31
-
32
- def embed(self, text):
33
- tokens = self._tokenize(text) # 先分词
34
- if text in self.code2emb:
35
- return self.code2emb[text]
36
- return ids
37
-
38
- def encode(self, text):
39
- tokens = self._tokenize(text) # 先分词
40
- return tokens
41
 
42
  def _convert_token_to_id(self, token):
43
  return self.vocab.get(token, self.vocab.get(self.unk_token, 0))
@@ -49,50 +30,29 @@ class MedicalcodeTokenizer(PreTrainedTokenizer):
49
  return self.vocab
50
 
51
  def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
52
- """
53
- Build model inputs from a sequence or a pair of sequences by adding special tokens.
54
- token_ids_0: list[int] — input ids for the first sequence
55
- token_ids_1: Optional[list[int]] — input ids for the second sequence (if any)
56
- """
57
- if token_ids_1 is None:
58
- return token_ids_0
59
- else:
60
- return token_ids_0 + token_ids_1
61
-
62
 
63
  def get_special_tokens_mask(self, token_ids, already_has_special_tokens=False):
64
  return [0] * len(token_ids)
65
 
66
  def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
67
- if token_ids_1 is None:
68
- return [0] * len(token_ids_0)
69
- return [0] * len(token_ids_0) + [1] * len(token_ids_1)
70
-
71
- # ---------- dynamic extension interface (disabled) ----------
72
- def _infer_and_register(self, code, code_desc="This is a medical code"):
73
- raise NotImplementedError("Dynamic token generation is disabled in this version.")
74
-
75
- # ---------- saving ----------
76
- def save_updates(self, out_vocab="vocab.json", out_map="code2tokens.json"):
77
- json.dump(self.vocab, open(out_vocab, "w"), indent=2)
78
- json.dump(self.code2tok, open(out_map, "w"), indent=2)
79
 
80
  def save_pretrained(self, save_directory):
81
- import os, json
82
  os.makedirs(save_directory, exist_ok=True)
83
  with open(os.path.join(save_directory, "vocab.json"), "w") as f:
84
  json.dump(self.vocab, f, indent=2)
85
  with open(os.path.join(save_directory, "code2tokens.json"), "w") as f:
86
  json.dump(self.code2tok, f, indent=2)
87
  with open(os.path.join(save_directory, "code2embeddings.json"), "w") as f:
88
- json.dump(self.code2embed, f, indent=2)
89
  tokenizer_config = {
90
  "tokenizer_class": "tokenizer.MedicalcodeTokenizer",
91
  "vocab_file": "vocab.json",
92
  "code2tokens_file": "code2tokens.json",
93
  "code2embedding_file": "code2embeddings.json",
94
  "auto_map": {
95
- "AutoTokenizer": "MedTok"
96
  }
97
  }
98
  with open(os.path.join(save_directory, "tokenizer_config.json"), "w") as f:
@@ -100,8 +60,7 @@ class MedicalcodeTokenizer(PreTrainedTokenizer):
100
 
101
  @classmethod
102
  def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
103
- import os, json
104
  vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
105
  code2tokens_file = os.path.join(pretrained_model_name_or_path, "code2tokens.json")
106
  embedding_file = os.path.join(pretrained_model_name_or_path, "code2embeddings.json")
107
- return cls(vocab_file=vocab_file, code2tokens_file=code2tokens_file, embedding_file=embedding_file, **kwargs)
 
1
  import os
2
  import json
 
3
  from transformers import PreTrainedTokenizer
4
 
5
 
6
  class MedicalcodeTokenizer(PreTrainedTokenizer):
7
  def __init__(self, vocab_file, code2tokens_file, embedding_file, **kwargs):
 
 
 
8
  with open(vocab_file, "r") as f:
9
  self.vocab = json.load(f)
 
10
  with open(code2tokens_file, "r") as f:
11
  self.code2tok = json.load(f)
 
12
  with open(embedding_file, 'r') as f:
13
  self.code2emb = json.load(f)
14
 
15
  self.ids_to_tokens = {v: k for k, v in self.vocab.items()}
16
+ self.tokens_to_ids = self.vocab
17
 
18
  super().__init__(**kwargs)
19
 
 
20
  def _tokenize(self, text):
21
+ return self.code2tok.get(text, [])
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  def _convert_token_to_id(self, token):
24
  return self.vocab.get(token, self.vocab.get(self.unk_token, 0))
 
30
  return self.vocab
31
 
32
  def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
33
+ return token_ids_0 if token_ids_1 is None else token_ids_0 + token_ids_1
 
 
 
 
 
 
 
 
 
34
 
35
  def get_special_tokens_mask(self, token_ids, already_has_special_tokens=False):
36
  return [0] * len(token_ids)
37
 
38
  def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
39
+ return [0] * len(token_ids_0) if token_ids_1 is None else [0] * len(token_ids_0) + [1] * len(token_ids_1)
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  def save_pretrained(self, save_directory):
 
42
  os.makedirs(save_directory, exist_ok=True)
43
  with open(os.path.join(save_directory, "vocab.json"), "w") as f:
44
  json.dump(self.vocab, f, indent=2)
45
  with open(os.path.join(save_directory, "code2tokens.json"), "w") as f:
46
  json.dump(self.code2tok, f, indent=2)
47
  with open(os.path.join(save_directory, "code2embeddings.json"), "w") as f:
48
+ json.dump(self.code2emb, f, indent=2)
49
  tokenizer_config = {
50
  "tokenizer_class": "tokenizer.MedicalcodeTokenizer",
51
  "vocab_file": "vocab.json",
52
  "code2tokens_file": "code2tokens.json",
53
  "code2embedding_file": "code2embeddings.json",
54
  "auto_map": {
55
+ "AutoTokenizer": ["tokenizer", "MedicalcodeTokenizer"]
56
  }
57
  }
58
  with open(os.path.join(save_directory, "tokenizer_config.json"), "w") as f:
 
60
 
61
  @classmethod
62
  def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
 
63
  vocab_file = os.path.join(pretrained_model_name_or_path, "vocab.json")
64
  code2tokens_file = os.path.join(pretrained_model_name_or_path, "code2tokens.json")
65
  embedding_file = os.path.join(pretrained_model_name_or_path, "code2embeddings.json")
66
+ return cls(vocab_file=vocab_file, code2tokens_file=code2tokens_file, embedding_file=embedding_file, **kwargs)