pszemraj committed
Commit 625f47e · verified · 0 parent(s)

Super-squash branch 'main' using huggingface_hub

Files changed (5)
  1. .gitattributes +35 -0
  2. README.md +90 -0
  3. special_tokens_map.json +51 -0
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +294 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,90 @@
+ ---
+ datasets:
+ - BEE-spoke-data/bees-internal
+ language:
+ - en
+ license: apache-2.0
+ ---
+
+ # BeeTokenizer
+
+ > note: this is **literally** a tokenizer trained on beekeeping text
+
+ After minutes of hard work, it is now available.
+
+ ```python
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained("BEE-spoke-data/BeeTokenizer")
+
+ test_string = "When dealing with Varroa destructor mites, it's crucial to administer the right acaricides during the late autumn months, but only after ensuring that the worker bee population is free from pesticide contamination."
+
+ output = tokenizer(test_string)
+ print(f"Test string: {test_string}")
+ print(f"Tokens ({len(output.input_ids)}):\n\t{output.input_ids}")
+ ```
+
+ ## Notes
+
+ 1. the default tokenizer (on branch `main`) has a vocab size of 32000
+ 2. based on the `SentencePieceBPETokenizer` class (see the sketch below)
+
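A tokenizer with these properties can be produced with the `tokenizers` library; a minimal sketch, assuming a placeholder corpus file (`beekeeping_corpus.txt`) and borrowing only the vocab size from the Notes and the special tokens from `special_tokens_map.json` below:

```python
from tokenizers import SentencePieceBPETokenizer

# Hypothetical training sketch -- the corpus file is a placeholder, not the
# actual training data; only vocab_size and the special tokens come from this repo.
tokenizer = SentencePieceBPETokenizer()
tokenizer.train(
    ["beekeeping_corpus.txt"],  # placeholder corpus file
    vocab_size=32000,           # matches the Notes above
    special_tokens=["<bos>", "<pad>", "<unk>", "<cls>", "<sep>", "<mask>", "<|endoftext|>"],
)
tokenizer.save("tokenizer.json")
```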
+ <details>
+ <summary>How to Tokenize Text and Retrieve Offsets</summary>
+
+ To tokenize a complex sentence and also retrieve the offset mapping, you can use the following Python code snippet:
+
+ ```python
+ from transformers import AutoTokenizer
+
+ # Initialize the tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("BEE-spoke-data/BeeTokenizer")
+
+ # Sample complex sentence related to beekeeping
+ test_string = "When dealing with Varroa destructor mites, it's crucial to administer the right acaricides during the late autumn months, but only after ensuring that the worker bee population is free from pesticide contamination."
+
+ # Tokenize the input string and get the offsets mapping
+ output = tokenizer.encode_plus(test_string, return_offsets_mapping=True)
+
+ print(f"Test string: {test_string}")
+
+ # Tokens
+ tokens = tokenizer.convert_ids_to_tokens(output['input_ids'])
+ print(f"Tokens: {tokens}")
+
+ # Offsets
+ offsets = output['offset_mapping']
+ print(f"Offsets: {offsets}")
+ ```
+
+ This should result in the following (_Feb '24 version_):
+
+ ```
+ >>> print(f"Test string: {test_string}")
+ Test string: When dealing with Varroa destructor mites, it's crucial to administer the right acaricides during the late autumn months, but only after ensuring that the worker bee population is free from pesticide contamination.
+ >>>
+ >>> # Tokens
+ >>> tokens = tokenizer.convert_ids_to_tokens(output['input_ids'])
+ >>> print(f"Tokens: {tokens}")
+ Tokens: ['When', '▁dealing', '▁with', '▁Varroa', '▁destructor', '▁mites,', "▁it's", '▁cru', 'cial', '▁to', '▁administer', '▁the', '▁right', '▁acar', 'icides', '▁during', '▁the', '▁late', '▁autumn', '▁months,', '▁but', '▁only', '▁after', '▁ensuring', '▁that', '▁the', '▁worker', '▁bee', '▁population', '▁is', '▁free', '▁from', '▁pesticide', '▁contam', 'ination.']
+ >>>
+ >>> # Offsets
+ >>> offsets = output['offset_mapping']
+ >>> print(f"Offsets: {offsets}")
+ Offsets: [(0, 4), (4, 12), (12, 17), (17, 24), (24, 35), (35, 42), (42, 47), (47, 51), (51, 55), (55, 58), (58, 69), (69, 73), (73, 79), (79, 84), (84, 90), (90, 97), (97, 101), (101, 106), (106, 113), (113, 121), (121, 125), (125, 130), (130, 136), (136, 145), (145, 150), (150, 154), (154, 161), (161, 165), (165, 176), (176, 179), (179, 184), (184, 189), (189, 199), (199, 206), (206, 214)]
+ ```
+
+ If you compare this to the output of [the llama tokenizer](https://huggingface.co/fxmarty/tiny-llama-fast-tokenizer) (below), you can quickly see which is more suited for beekeeping-related language modeling.
+
+ ```
+ >>> print(f"Test string: {test_string}")
+ Test string: When dealing with Varroa destructor mites, it's crucial to administer the right acaricides during the late autumn months, but only after ensuring that the worker bee population is free from pesticide contamination.
+ >>> # Tokens
+ >>> tokens = tokenizer.convert_ids_to_tokens(output['input_ids'])
+ >>> print(f"Tokens: {tokens}")
+ Tokens: ['<s>', '▁When', '▁dealing', '▁with', '▁Var', 'ro', 'a', '▁destruct', 'or', '▁mit', 'es', ',', '▁it', "'", 's', '▁cru', 'cial', '▁to', '▁admin', 'ister', '▁the', '▁right', '▁ac', 'ar', 'ic', 'ides', '▁during', '▁the', '▁late', '▁aut', 'umn', '▁months', ',', '▁but', '▁only', '▁after', '▁ens', 'uring', '▁that', '▁the', '▁worker', '▁be', 'e', '▁population', '▁is', '▁free', '▁from', '▁p', 'estic', 'ide', '▁cont', 'am', 'ination', '.']
+ >>> offsets = output['offset_mapping']
+ >>> print(f"Offsets: {offsets}")
+ Offsets: [(0, 0), (0, 4), (4, 12), (12, 17), (17, 21), (21, 23), (23, 24), (24, 33), (33, 35), (35, 39), (39, 41), (41, 42), (42, 45), (45, 46), (46, 47), (47, 51), (51, 55), (55, 58), (58, 64), (64, 69), (69, 73), (73, 79), (79, 82), (82, 84), (84, 86), (86, 90), (90, 97), (97, 101), (101, 106), (106, 110), (110, 113), (113, 120), (120, 121), (121, 125), (125, 130), (130, 136), (136, 140), (140, 145), (145, 150), (150, 154), (154, 161), (161, 164), (164, 165), (165, 176), (176, 179), (179, 184), (184, 189), (189, 191), (191, 196), (196, 199), (199, 204), (204, 206), (206, 213), (213, 214)]
+ ```
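For a quick side-by-side of the two outputs above (35 tokens from the BeeTokenizer versus 54 from the llama tokenizer), a short script along these lines reproduces the comparison; it assumes only the two repo IDs linked above:

```python
from transformers import AutoTokenizer

test_string = (
    "When dealing with Varroa destructor mites, it's crucial to administer "
    "the right acaricides during the late autumn months, but only after ensuring "
    "that the worker bee population is free from pesticide contamination."
)

# Print the token count each tokenizer produces for the same sentence
for repo_id in ("BEE-spoke-data/BeeTokenizer", "fxmarty/tiny-llama-fast-tokenizer"):
    tok = AutoTokenizer.from_pretrained(repo_id)
    print(f"{repo_id}: {len(tok(test_string).input_ids)} tokens")
```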
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "bos_token": {
+ "content": "<bos>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "cls_token": {
+ "content": "<cls>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "mask_token": {
+ "content": "<mask>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "sep_token": {
+ "content": "<sep>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
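Once the repo is loaded with `AutoTokenizer`, this map is what `tokenizer.special_tokens_map` returns; a quick check:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("BEE-spoke-data/BeeTokenizer")
# Mirrors special_tokens_map.json: bos, cls, eos, mask, pad, sep, unk tokens
print(tok.special_tokens_map)
```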
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,294 @@
+ {
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<bos>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<pad>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "3": {
+ "content": "<cls>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "4": {
+ "content": "<sep>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "5": {
+ "content": "<mask>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "6": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "31972": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31973": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31974": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31975": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31976": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31977": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31978": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31979": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31980": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31981": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31982": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31983": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31984": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31985": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31986": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31987": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31988": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31989": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31990": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31991": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31992": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31993": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31994": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31995": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31996": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31997": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31998": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "31999": {
+ "content": " ",
+ "lstrip": false,
+ "normalized": true,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "bos_token": "<bos>",
+ "clean_up_tokenization_spaces": true,
+ "cls_token": "<cls>",
+ "eos_token": "<|endoftext|>",
+ "mask_token": "<mask>",
+ "model_max_length": 1000000000000000019884624838656,
+ "pad_token": "<pad>",
+ "sep_token": "<sep>",
+ "tokenizer_class": "PreTrainedTokenizerFast",
+ "unk_token": "<unk>"
+ }
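The `model_max_length` above is the sentinel value `transformers` writes when no maximum length is configured (int(1e30)), so downstream users generally set it themselves; a minimal sketch, assuming a hypothetical 2048-token context:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("BEE-spoke-data/BeeTokenizer")
tok.model_max_length = 2048  # assumed context length; match it to your model
ids = tok("Requeening a colony in early spring.", truncation=True).input_ids
```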