kkuramitsu committed on
Commit
078eb99
·
1 Parent(s): bc6a4ce

added tokenizer code

Browse files
Files changed (2) hide show
  1. kogitune.py +133 -0
  2. tokenizer_config.json +2 -2
kogitune.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union, Optional
2
+
3
+ from transformers import T5Tokenizer
4
+ import re
5
+
6
+ def _upper_repl(matchobj):
7
+ #print(matchobj, matchobj.group(0))
8
+ return '<cZ>' + matchobj.group(0).lower()
9
+
10
+ def _cap_repl(matchobj):
11
+ #print(matchobj, matchobj.group(0))
12
+ return matchobj.group(0)[4:].upper()
13
+
14
# An upper-case letter immediately followed by a lower-case one, i.e. the
# start of a capitalized word ('He' in 'Hello'). Used by pre_encode.
_UpperPattern = re.compile('([A-Z][a-z])')
# The '<cZ>' capitalization marker plus the lower-case letter that follows
# it, as produced by _upper_repl. Used by post_decode to restore case.
_CapitalizedPattern = re.compile(r'(\<cZ\>[a-z])')
16
+
17
def pre_encode(s):
    """Normalize input before SentencePiece tokenization.

    - A capitalized pair such as 'Ab' becomes '<cZ>ab', so case survives
      tokenization as an explicit marker token.
    - Tabs are collapsed to a single space; newlines become '<nL>'.
    - Tuples and lists (text pairs / pre-tokenized input, as accepted by
      the transformers encode APIs) are encoded element-wise.
      (Lists were previously passed through untouched, which left batch
      pair inputs un-normalized.)
    - Any other value (e.g. None) is returned unchanged.
    """
    if isinstance(s, str):
        s = _UpperPattern.sub(_upper_repl, s)
        return s.replace('\t', ' ').replace('\n', '<nL>')
    if isinstance(s, tuple):
        return tuple(pre_encode(x) for x in s)
    if isinstance(s, list):
        return [pre_encode(x) for x in s]
    return s
24
+
25
def post_decode(s):
    """Invert pre_encode on decoded output.

    Restores capitalization from '<cZ>' markers and turns '<nL>' back
    into real newlines. Non-string values are returned untouched.
    """
    if not isinstance(s, str):
        return s
    restored = _CapitalizedPattern.sub(_cap_repl, s)
    return restored.replace('<nL>', '\n')
29
+
30
+
31
class KawagoeT5Tokenizer(T5Tokenizer):
    """T5Tokenizer that round-trips capitalization and newlines.

    Every encode entry point runs its text through pre_encode()
    ('Ab' -> '<cZ>ab', newline -> '<nL>') before delegating to the parent,
    and every decode entry point runs the result through post_decode(),
    so the underlying SentencePiece model never sees raw case or line
    breaks.
    """

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False) -> List[str]:
        """Pre-encode then tokenize with the parent tokenizer.

        NOTE(review): `pair` is pre-encoded here but never forwarded to the
        parent call below, and `add_special_tokens` is likewise unused (the
        forwarding is commented out) — confirm this is intentional.
        """
        text=pre_encode(text)
        pair=pre_encode(pair)
        return super().tokenize(text) #, add_special_tokens=add_special_tokens)

    def encode_plus(self,
        text,
        text_pair = None,
        add_special_tokens: bool = True,
        padding = False,
        truncation = None,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        """Pre-encode `text` and `text_pair`, then delegate to the parent
        encode_plus with every keyword forwarded unchanged."""
        text = pre_encode(text)
        text_pair=pre_encode(text_pair)
        return super().encode_plus(text,
            text_pair=text_pair,
            add_special_tokens = add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def batch_encode_plus(self,
        batch_text_or_text_pairs,
        add_special_tokens: bool = True,
        padding = False,
        truncation = None,
        max_length = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ):
        """Pre-encode each batch element, then delegate to the parent
        batch_encode_plus with every keyword forwarded unchanged.

        NOTE(review): pre_encode handles str and tuple elements; list-shaped
        pairs / pre-tokenized elements pass through it untouched — verify
        callers only supply str or tuple elements.
        """
        batch_text_or_text_pairs = [pre_encode(x) for x in batch_text_or_text_pairs]
        return super().batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            stride=stride,
            is_split_into_words=is_split_into_words,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def convert_tokens_to_string(self, tokens: List[str]):
        """Join tokens with the parent implementation, then post-decode the
        result (restore capitalization and newlines)."""
        s = super().convert_tokens_to_string(tokens)
        return post_decode(s)

    def decode(self,
        token_ids,
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs):
        """Decode with the parent tokenizer, then post-decode the string
        (restore capitalization and newlines)."""
        s = super().decode(token_ids,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs)
        return post_decode(s)
tokenizer_config.json CHANGED
@@ -8,7 +8,7 @@
8
  "model_max_length": 1000000000000000019884624838656,
9
  "pad_token": "<pad>",
10
  "sp_model_kwargs": {},
11
- "tokenizer_class": "T5Tokenizer",
12
- "unk_token": "<unk>",
13
  "use_fast": false
14
  }
 
8
  "model_max_length": 1000000000000000019884624838656,
9
  "pad_token": "<pad>",
10
  "sp_model_kwargs": {},
11
+ "tokenizer_class": "KawagoeT5Tokenizer",
12
+ "auto_map": {"AutoTokenizer": ["kogitune.KawagoeT5Tokenizer", null]},
13
  "use_fast": false
14
  }