codemurt committed (verified)
Commit 6368be6 · Parent(s): 55906f9

Update char_tokenizer.py

Files changed (1): char_tokenizer.py (+17 -12)
char_tokenizer.py CHANGED
@@ -39,7 +39,20 @@ class CharTokenizer(PreTrainedTokenizer):
         do_lower_case=False,
         *args,
         **kwargs
-    ):
+    ):
+        self.do_lower_case = do_lower_case
+        self.space_token = space_token
+
+        if not vocab_file or not os.path.isfile(vocab_file):
+            self.vocab = OrderedDict()
+            special_tokens = [pad_token, unk_token, bos_token, eos_token, cls_token, sep_token, mask_token]
+            for i, token in enumerate(special_tokens):
+                self.vocab[token] = i
+        else:
+            self.vocab = load_vocab(vocab_file)
+
+        self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+
         super().__init__(
             pad_token=pad_token,
             unk_token=unk_token,
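The substantive change in this hunk is ordering: the vocab tables are now built before super().__init__() runs. Presumably the motivation is that on newer transformers releases, PreTrainedTokenizer.__init__ resolves the special tokens through the subclass's vocab (via get_vocab()), so self.vocab and self.ids_to_tokens must already exist when the base class initializes. A minimal self-contained sketch of the pattern, with an illustrative class name and token set rather than this repo's full code:

# Illustrative sketch, not the repo's code: shows the init-order pattern
# the hunk above adopts.
from collections import OrderedDict
from transformers import PreTrainedTokenizer

class TinyCharTokenizer(PreTrainedTokenizer):
    def __init__(self, pad_token="[PAD]", unk_token="[UNK]", **kwargs):
        # Build the lookup tables first: the base-class __init__ below
        # registers special tokens against this vocab.
        self.vocab = OrderedDict((t, i) for i, t in enumerate([pad_token, unk_token]))
        self.ids_to_tokens = OrderedDict((i, t) for t, i in self.vocab.items())
        super().__init__(pad_token=pad_token, unk_token=unk_token, **kwargs)

    @property
    def vocab_size(self):
        return len(self.vocab)

    def get_vocab(self):
        return dict(self.vocab)

    def _tokenize(self, text):
        return list(text)  # character-level tokenization

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab[self.unk_token])

    def _convert_id_to_token(self, index):
        return self.ids_to_tokens.get(index, self.unk_token)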
@@ -51,15 +64,6 @@ class CharTokenizer(PreTrainedTokenizer):
             do_lower_case=do_lower_case,
             **kwargs
         )
-        self.do_lower_case = do_lower_case
-        self.space_token = space_token
-
-        if not vocab_file or not os.path.isfile(vocab_file):
-            self.vocab = OrderedDict()
-            self.ids_to_tokens = OrderedDict()
-        else:
-            self.vocab = load_vocab(vocab_file)
-            self.ids_to_tokens = OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])

     def train(self, file_path):
         vocab = set()
@@ -74,9 +78,10 @@ class CharTokenizer(PreTrainedTokenizer):
         special_tokens = [self.pad_token, self.unk_token, self.bos_token, self.eos_token]
         vocab = special_tokens + vocab

+        self.vocab = OrderedDict()
         for i, ch in enumerate(vocab):
             self.vocab[ch] = i
-        self.ids_to_tokens = vocab
+        self.ids_to_tokens = OrderedDict([(i, ch) for i, ch in enumerate(vocab)])

     @property
     def vocab_size(self):
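train() now starts from a fresh OrderedDict and rebuilds ids_to_tokens as a proper id-to-character mapping; previously ids_to_tokens was left pointing at the plain vocab list. A hypothetical round trip, assuming the constructor's defaults suffice and a small text file at corpus.txt contains the characters being encoded:

tok = CharTokenizer()          # assumes the constructor's defaults suffice
tok.train("corpus.txt")        # illustrative path; special tokens first, then the file's characters

ids = [tok._convert_token_to_id(c) for c in "abc"]
chars = [tok._convert_id_to_token(i) for i in ids]
assert "".join(chars) == "abc"  # round trip holds for characters seen in training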
@@ -91,7 +96,7 @@ class CharTokenizer(PreTrainedTokenizer):
         return self.vocab.get(token, self.vocab[self.unk_token])

     def _convert_id_to_token(self, index):
-        return self.ids_to_tokens[index]
+        return self.ids_to_tokens.get(index, self.unk_token)

     def prepare_for_tokenization(
         self, text, is_split_into_words: bool = False, spaces=0, **kwargs
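With that last hunk, _convert_id_to_token is total: an index with no entry falls back to the unk token via dict.get instead of raising (a KeyError from the OrderedDict, or an IndexError back when ids_to_tokens was a list). Continuing the sketch above:

print(tok._convert_id_to_token(10_000))  # 10_000 stands in for any out-of-vocab id -> tok.unk_token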
 