Adapters
GhostNetworkUser commited on
Commit
a4facfd
·
verified ·
1 Parent(s): 6b02503

Upload 3 files

Browse files
Files changed (3) hide show
  1. __init__.cpython-312.pyc +0 -0
  2. __init__.py +13 -6
  3. __init__.pyi +382 -363
__init__.cpython-312.pyc CHANGED
Binary files a/__init__.cpython-312.pyc and b/__init__.cpython-312.pyc differ
 
__init__.py CHANGED
@@ -1,8 +1,15 @@
1
  # Generated content DO NOT EDIT
2
- from .. import models
3
 
4
- Model = models.Model
5
- BPE = models.BPE
6
- Unigram = models.Unigram
7
- WordLevel = models.WordLevel
8
- WordPiece = models.WordPiece
 
 
 
 
 
 
 
 
1
  # Generated content DO NOT EDIT
2
+ from .. import pre_tokenizers
3
 
4
+ PreTokenizer = pre_tokenizers.PreTokenizer
5
+ BertPreTokenizer = pre_tokenizers.BertPreTokenizer
6
+ ByteLevel = pre_tokenizers.ByteLevel
7
+ CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
8
+ Digits = pre_tokenizers.Digits
9
+ Metaspace = pre_tokenizers.Metaspace
10
+ Punctuation = pre_tokenizers.Punctuation
11
+ Sequence = pre_tokenizers.Sequence
12
+ Split = pre_tokenizers.Split
13
+ UnicodeScripts = pre_tokenizers.UnicodeScripts
14
+ Whitespace = pre_tokenizers.Whitespace
15
+ WhitespaceSplit = pre_tokenizers.WhitespaceSplit
__init__.pyi CHANGED
@@ -1,591 +1,610 @@
1
  # Generated content DO NOT EDIT
2
- class Model:
3
  """
4
- Base class for all models
5
 
6
- The model represents the actual tokenization algorithm. This is the part that
7
- will contain and manage the learned vocabulary.
8
-
9
- This class cannot be constructed directly. Please use one of the concrete models.
10
  """
11
- def get_trainer(self):
12
  """
13
- Get the associated :class:`~tokenizers.trainers.Trainer`
14
 
15
- Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
16
- :class:`~tokenizers.models.Model`.
 
 
 
17
 
18
- Returns:
19
- :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
 
 
20
  """
21
  pass
22
 
23
- def id_to_token(self, id):
24
  """
25
- Get the token associated to an ID
 
 
 
 
 
 
26
 
27
  Args:
28
- id (:obj:`int`):
29
- An ID to convert to a token
30
 
31
  Returns:
32
- :obj:`str`: The token associated to the ID
 
33
  """
34
  pass
35
 
36
- def save(self, folder, prefix):
37
- """
38
- Save the current model
39
-
40
- Save the current model in the given folder, using the given prefix for the various
41
- files that will get created.
42
- Any file with the same name that already exists in this folder will be overwritten.
43
-
44
- Args:
45
- folder (:obj:`str`):
46
- The path to the target folder in which to save the various files
47
-
48
- prefix (:obj:`str`, `optional`):
49
- An optional prefix, used to prefix each file name
50
 
51
- Returns:
52
- :obj:`List[str]`: The list of saved files
53
- """
 
54
  pass
55
 
56
- def token_to_id(self, tokens):
57
  """
58
- Get the ID associated to a token
59
 
60
- Args:
61
- token (:obj:`str`):
62
- A token to convert to an ID
 
 
63
 
64
- Returns:
65
- :obj:`int`: The ID associated to the token
 
 
66
  """
67
  pass
68
 
69
- def tokenize(self, sequence):
70
  """
71
- Tokenize a sequence
 
 
 
 
 
 
72
 
73
  Args:
74
  sequence (:obj:`str`):
75
- A sequence to tokenize
76
 
77
  Returns:
78
- A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
 
79
  """
80
  pass
81
 
82
- class BPE(Model):
83
  """
84
- An implementation of the BPE (Byte-Pair Encoding) algorithm
85
 
86
- Args:
87
- vocab (:obj:`Dict[str, int]`, `optional`):
88
- A dictionary of string keys and their ids :obj:`{"am": 0,...}`
89
-
90
- merges (:obj:`List[Tuple[str, str]]`, `optional`):
91
- A list of pairs of tokens (:obj:`Tuple[str, str]`) :obj:`[("a", "b"),...]`
92
-
93
- cache_capacity (:obj:`int`, `optional`):
94
- The number of words that the BPE cache can contain. The cache allows
95
- to speed-up the process by keeping the result of the merge operations
96
- for a number of words.
97
-
98
- dropout (:obj:`float`, `optional`):
99
- A float between 0 and 1 that represents the BPE dropout to use.
100
-
101
- unk_token (:obj:`str`, `optional`):
102
- The unknown token to be used by the model.
103
-
104
- continuing_subword_prefix (:obj:`str`, `optional`):
105
- The prefix to attach to subword units that don't represent a beginning of word.
106
-
107
- end_of_word_suffix (:obj:`str`, `optional`):
108
- The suffix to attach to subword units that represent an end of word.
109
-
110
- fuse_unk (:obj:`bool`, `optional`):
111
- Whether to fuse any subsequent unknown tokens into a single one
112
-
113
- byte_fallback (:obj:`bool`, `optional`):
114
- Whether to use spm byte-fallback trick (defaults to False)
115
 
116
- ignore_merges (:obj:`bool`, `optional`):
117
- Whether or not to match tokens with the vocab before using merges.
 
 
 
 
 
118
  """
119
- def __init__(
120
- self,
121
- vocab=None,
122
- merges=None,
123
- cache_capacity=None,
124
- dropout=None,
125
- unk_token=None,
126
- continuing_subword_prefix=None,
127
- end_of_word_suffix=None,
128
- fuse_unk=None,
129
- byte_fallback=False,
130
- ignore_merges=False,
131
- ):
132
  pass
133
 
134
  @staticmethod
135
- def from_file(cls, vocab, merge, **kwargs):
136
  """
137
- Instantiate a BPE model from the given files.
138
-
139
- This method is roughly equivalent to doing::
140
-
141
- vocab, merges = BPE.read_file(vocab_filename, merges_filename)
142
- bpe = BPE(vocab, merges)
143
 
144
- If you don't need to keep the :obj:`vocab, merges` values lying around,
145
- this method is more optimized than manually calling
146
- :meth:`~tokenizers.models.BPE.read_file` to initialize a :class:`~tokenizers.models.BPE`
147
-
148
- Args:
149
- vocab (:obj:`str`):
150
- The path to a :obj:`vocab.json` file
151
-
152
- merges (:obj:`str`):
153
- The path to a :obj:`merges.txt` file
154
 
155
  Returns:
156
- :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
157
  """
158
  pass
159
 
160
- def get_trainer(self):
161
  """
162
- Get the associated :class:`~tokenizers.trainers.Trainer`
163
 
164
- Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
165
- :class:`~tokenizers.models.Model`.
166
-
167
- Returns:
168
- :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
169
- """
170
- pass
171
-
172
- def id_to_token(self, id):
173
- """
174
- Get the token associated to an ID
175
 
176
  Args:
177
- id (:obj:`int`):
178
- An ID to convert to a token
179
-
180
- Returns:
181
- :obj:`str`: The token associated to the ID
182
  """
183
  pass
184
 
185
- @staticmethod
186
- def read_file(self, vocab, merges):
187
  """
188
- Read a :obj:`vocab.json` and a :obj:`merges.txt` files
189
 
190
- This method provides a way to read and parse the content of these files,
191
- returning the relevant data structures. If you want to instantiate some BPE models
192
- from memory, this method gives you the expected input from the standard files.
 
 
193
 
194
  Args:
195
- vocab (:obj:`str`):
196
- The path to a :obj:`vocab.json` file
197
-
198
- merges (:obj:`str`):
199
- The path to a :obj:`merges.txt` file
200
 
201
  Returns:
202
- A :obj:`Tuple` with the vocab and the merges:
203
- The vocabulary and merges loaded into memory
204
  """
205
  pass
206
 
207
- def save(self, folder, prefix):
208
- """
209
- Save the current model
210
-
211
- Save the current model in the given folder, using the given prefix for the various
212
- files that will get created.
213
- Any file with the same name that already exists in this folder will be overwritten.
214
-
215
- Args:
216
- folder (:obj:`str`):
217
- The path to the target folder in which to save the various files
218
-
219
- prefix (:obj:`str`, `optional`):
220
- An optional prefix, used to prefix each file name
221
 
222
- Returns:
223
- :obj:`List[str]`: The list of saved files
 
 
 
224
  """
225
- pass
226
 
227
- def token_to_id(self, tokens):
228
- """
229
- Get the ID associated to a token
 
 
230
 
231
  Args:
232
- token (:obj:`str`):
233
- A token to convert to an ID
234
-
235
- Returns:
236
- :obj:`int`: The ID associated to the token
237
  """
238
  pass
239
 
240
- def tokenize(self, sequence):
241
  """
242
- Tokenize a sequence
 
 
 
 
 
 
243
 
244
  Args:
245
  sequence (:obj:`str`):
246
- A sequence to tokenize
247
 
248
  Returns:
249
- A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
 
250
  """
251
  pass
252
 
253
- class Unigram(Model):
254
  """
255
- An implementation of the Unigram algorithm
256
 
257
  Args:
258
- vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
259
- A list of vocabulary items and their relative score [("am", -0.2442),...]
 
 
 
 
 
 
260
  """
261
- def __init__(self, vocab, unk_id, byte_fallback):
262
  pass
263
 
264
- def get_trainer(self):
265
  """
266
- Get the associated :class:`~tokenizers.trainers.Trainer`
267
 
268
- Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
269
- :class:`~tokenizers.models.Model`.
 
 
 
270
 
271
- Returns:
272
- :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
 
 
273
  """
274
  pass
275
 
276
- def id_to_token(self, id):
277
  """
278
- Get the token associated to an ID
 
 
 
 
 
 
279
 
280
  Args:
281
- id (:obj:`int`):
282
- An ID to convert to a token
283
 
284
  Returns:
285
- :obj:`str`: The token associated to the ID
 
286
  """
287
  pass
288
 
289
- def save(self, folder, prefix):
290
- """
291
- Save the current model
292
 
293
- Save the current model in the given folder, using the given prefix for the various
294
- files that will get created.
295
- Any file with the same name that already exists in this folder will be overwritten.
296
 
297
- Args:
298
- folder (:obj:`str`):
299
- The path to the target folder in which to save the various files
 
300
 
301
- prefix (:obj:`str`, `optional`):
302
- An optional prefix, used to prefix each file name
 
 
 
303
 
304
- Returns:
305
- :obj:`List[str]`: The list of saved files
306
- """
307
  pass
308
 
309
- def token_to_id(self, tokens):
310
  """
311
- Get the ID associated to a token
312
 
313
- Args:
314
- token (:obj:`str`):
315
- A token to convert to an ID
 
 
316
 
317
- Returns:
318
- :obj:`int`: The ID associated to the token
 
 
319
  """
320
  pass
321
 
322
- def tokenize(self, sequence):
323
  """
324
- Tokenize a sequence
 
 
 
 
 
 
325
 
326
  Args:
327
  sequence (:obj:`str`):
328
- A sequence to tokenize
329
 
330
  Returns:
331
- A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
 
332
  """
333
  pass
334
 
335
- class WordLevel(Model):
336
  """
337
- An implementation of the WordLevel algorithm
338
-
339
- Most simple tokenizer model based on mapping tokens to their corresponding id.
340
 
341
  Args:
342
- vocab (:obj:`str`, `optional`):
343
- A dictionary of string keys and their ids :obj:`{"am": 0,...}`
344
-
345
- unk_token (:obj:`str`, `optional`):
346
- The unknown token to be used by the model.
347
  """
348
- def __init__(self, vocab, unk_token):
349
  pass
350
 
351
- @staticmethod
352
- def from_file(vocab, unk_token):
353
  """
354
- Instantiate a WordLevel model from the given file
355
-
356
- This method is roughly equivalent to doing::
357
 
358
- vocab = WordLevel.read_file(vocab_filename)
359
- wordlevel = WordLevel(vocab)
360
-
361
- If you don't need to keep the :obj:`vocab` values lying around, this method is
362
- more optimized than manually calling :meth:`~tokenizers.models.WordLevel.read_file` to
363
- initialize a :class:`~tokenizers.models.WordLevel`
364
 
365
  Args:
366
- vocab (:obj:`str`):
367
- The path to a :obj:`vocab.json` file
368
-
369
- Returns:
370
- :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
371
  """
372
  pass
373
 
374
- def get_trainer(self):
375
  """
376
- Get the associated :class:`~tokenizers.trainers.Trainer`
377
 
378
- Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
379
- :class:`~tokenizers.models.Model`.
 
 
 
 
 
 
 
380
 
381
  Returns:
382
- :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
 
383
  """
384
  pass
385
 
386
- def id_to_token(self, id):
 
 
 
 
 
 
 
387
  """
388
- Get the token associated to an ID
389
 
390
- Args:
391
- id (:obj:`int`):
392
- An ID to convert to a token
 
 
393
 
394
- Returns:
395
- :obj:`str`: The token associated to the ID
 
 
396
  """
397
  pass
398
 
399
- @staticmethod
400
- def read_file(vocab):
401
  """
402
- Read a :obj:`vocab.json`
403
 
404
- This method provides a way to read and parse the content of a vocabulary file,
405
- returning the relevant data structures. If you want to instantiate some WordLevel models
406
- from memory, this method gives you the expected input from the standard files.
 
 
407
 
408
  Args:
409
- vocab (:obj:`str`):
410
- The path to a :obj:`vocab.json` file
411
 
412
  Returns:
413
- :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
 
414
  """
415
  pass
416
 
417
- def save(self, folder, prefix):
418
- """
419
- Save the current model
420
-
421
- Save the current model in the given folder, using the given prefix for the various
422
- files that will get created.
423
- Any file with the same name that already exists in this folder will be overwritten.
424
-
425
- Args:
426
- folder (:obj:`str`):
427
- The path to the target folder in which to save the various files
428
 
429
- prefix (:obj:`str`, `optional`):
430
- An optional prefix, used to prefix each file name
 
431
 
432
- Returns:
433
- :obj:`List[str]`: The list of saved files
434
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
435
  pass
436
 
437
- def token_to_id(self, tokens):
438
  """
439
- Get the ID associated to a token
440
 
441
- Args:
442
- token (:obj:`str`):
443
- A token to convert to an ID
 
 
444
 
445
- Returns:
446
- :obj:`int`: The ID associated to the token
 
 
447
  """
448
  pass
449
 
450
- def tokenize(self, sequence):
451
  """
452
- Tokenize a sequence
 
 
 
 
 
 
453
 
454
  Args:
455
  sequence (:obj:`str`):
456
- A sequence to tokenize
457
 
458
  Returns:
459
- A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
 
460
  """
461
  pass
462
 
463
- class WordPiece(Model):
464
  """
465
- An implementation of the WordPiece algorithm
466
-
467
- Args:
468
- vocab (:obj:`Dict[str, int]`, `optional`):
469
- A dictionary of string keys and their ids :obj:`{"am": 0,...}`
470
-
471
- unk_token (:obj:`str`, `optional`):
472
- The unknown token to be used by the model.
473
-
474
- max_input_chars_per_word (:obj:`int`, `optional`):
475
- The maximum number of characters to authorize in a single word.
476
  """
477
- def __init__(self, vocab, unk_token, max_input_chars_per_word):
478
  pass
479
 
480
- @staticmethod
481
- def from_file(vocab, **kwargs):
482
  """
483
- Instantiate a WordPiece model from the given file
484
-
485
- This method is roughly equivalent to doing::
486
-
487
- vocab = WordPiece.read_file(vocab_filename)
488
- wordpiece = WordPiece(vocab)
489
 
490
- If you don't need to keep the :obj:`vocab` values lying around, this method is
491
- more optimized than manually calling :meth:`~tokenizers.models.WordPiece.read_file` to
492
- initialize a :class:`~tokenizers.models.WordPiece`
 
 
493
 
494
  Args:
495
- vocab (:obj:`str`):
496
- The path to a :obj:`vocab.txt` file
497
-
498
- Returns:
499
- :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
500
  """
501
  pass
502
 
503
- def get_trainer(self):
504
  """
505
- Get the associated :class:`~tokenizers.trainers.Trainer`
506
 
507
- Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
508
- :class:`~tokenizers.models.Model`.
509
-
510
- Returns:
511
- :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
512
- """
513
- pass
514
-
515
- def id_to_token(self, id):
516
- """
517
- Get the token associated to an ID
518
 
519
  Args:
520
- id (:obj:`int`):
521
- An ID to convert to a token
522
 
523
  Returns:
524
- :obj:`str`: The token associated to the ID
 
525
  """
526
  pass
527
 
528
- @staticmethod
529
- def read_file(vocab):
 
 
 
 
 
 
530
  """
531
- Read a :obj:`vocab.txt` file
532
 
533
- This method provides a way to read and parse the content of a standard `vocab.txt`
534
- file as used by the WordPiece Model, returning the relevant data structures. If you
535
- want to instantiate some WordPiece models from memory, this method gives you the
536
- expected input from the standard files.
 
537
 
538
  Args:
539
- vocab (:obj:`str`):
540
- The path to a :obj:`vocab.txt` file
541
-
542
- Returns:
543
- :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict`
544
  """
545
  pass
546
 
547
- def save(self, folder, prefix):
548
  """
549
- Save the current model
550
 
551
- Save the current model in the given folder, using the given prefix for the various
552
- files that will get created.
553
- Any file with the same name that already exists in this folder will be overwritten.
 
 
554
 
555
  Args:
556
- folder (:obj:`str`):
557
- The path to the target folder in which to save the various files
558
-
559
- prefix (:obj:`str`, `optional`):
560
- An optional prefix, used to prefix each file name
561
 
562
  Returns:
563
- :obj:`List[str]`: The list of saved files
 
564
  """
565
  pass
566
 
567
- def token_to_id(self, tokens):
 
 
 
 
 
 
 
568
  """
569
- Get the ID associated to a token
570
 
571
- Args:
572
- token (:obj:`str`):
573
- A token to convert to an ID
 
 
574
 
575
- Returns:
576
- :obj:`int`: The ID associated to the token
 
 
577
  """
578
  pass
579
 
580
- def tokenize(self, sequence):
581
  """
582
- Tokenize a sequence
 
 
 
 
 
 
583
 
584
  Args:
585
  sequence (:obj:`str`):
586
- A sequence to tokenize
587
 
588
  Returns:
589
- A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
 
590
  """
591
  pass
 
1
  # Generated content DO NOT EDIT
2
+ class PreTokenizer:
3
  """
4
+ Base class for all pre-tokenizers
5
 
6
+ This class is not supposed to be instantiated directly. Instead, any implementation of a
7
+ PreTokenizer will return an instance of this class when instantiated.
 
 
8
  """
9
+ def pre_tokenize(self, pretok):
10
  """
11
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
12
 
13
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
14
+ keep track of the pre-tokenization, and leverage the capabilities of the
15
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
16
+ the pre-tokenization of a raw string, you can use
17
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
18
 
19
+ Args:
20
+ pretok (:class:`~tokenizers.PreTokenizedString):
21
+ The pre-tokenized string on which to apply this
22
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
23
  """
24
  pass
25
 
26
+ def pre_tokenize_str(self, sequence):
27
  """
28
+ Pre tokenize the given string
29
+
30
+ This method provides a way to visualize the effect of a
31
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
32
+ alignment, nor does it provide all the capabilities of the
33
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
34
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
35
 
36
  Args:
37
+ sequence (:obj:`str`):
38
+ A string to pre-tokeize
39
 
40
  Returns:
41
+ :obj:`List[Tuple[str, Offsets]]`:
42
+ A list of tuple with the pre-tokenized parts and their offsets
43
  """
44
  pass
45
 
46
+ class BertPreTokenizer(PreTokenizer):
47
+ """
48
+ BertPreTokenizer
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ This pre-tokenizer splits tokens on spaces, and also on punctuation.
51
+ Each occurence of a punctuation character will be treated separately.
52
+ """
53
+ def __init__(self):
54
  pass
55
 
56
+ def pre_tokenize(self, pretok):
57
  """
58
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
59
 
60
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
61
+ keep track of the pre-tokenization, and leverage the capabilities of the
62
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
63
+ the pre-tokenization of a raw string, you can use
64
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
65
 
66
+ Args:
67
+ pretok (:class:`~tokenizers.PreTokenizedString):
68
+ The pre-tokenized string on which to apply this
69
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
70
  """
71
  pass
72
 
73
+ def pre_tokenize_str(self, sequence):
74
  """
75
+ Pre tokenize the given string
76
+
77
+ This method provides a way to visualize the effect of a
78
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
79
+ alignment, nor does it provide all the capabilities of the
80
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
81
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
82
 
83
  Args:
84
  sequence (:obj:`str`):
85
+ A string to pre-tokeize
86
 
87
  Returns:
88
+ :obj:`List[Tuple[str, Offsets]]`:
89
+ A list of tuple with the pre-tokenized parts and their offsets
90
  """
91
  pass
92
 
93
+ class ByteLevel(PreTokenizer):
94
  """
95
+ ByteLevel PreTokenizer
96
 
97
+ This pre-tokenizer takes care of replacing all bytes of the given string
98
+ with a corresponding representation, as well as splitting into words.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
+ Args:
101
+ add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
102
+ Whether to add a space to the first word if there isn't already one. This
103
+ lets us treat `hello` exactly like `say hello`.
104
+ use_regex (:obj:`bool`, `optional`, defaults to :obj:`True`):
105
+ Set this to :obj:`False` to prevent this `pre_tokenizer` from using
106
+ the GPT2 specific regexp for spliting on whitespace.
107
  """
108
+ def __init__(self, add_prefix_space=True, use_regex=True):
 
 
 
 
 
 
 
 
 
 
 
 
109
  pass
110
 
111
  @staticmethod
112
+ def alphabet():
113
  """
114
+ Returns the alphabet used by this PreTokenizer.
 
 
 
 
 
115
 
116
+ Since the ByteLevel works as its name suggests, at the byte level, it
117
+ encodes each byte value to a unique visible character. This means that there is a
118
+ total of 256 different characters composing this alphabet.
 
 
 
 
 
 
 
119
 
120
  Returns:
121
+ :obj:`List[str]`: A list of characters that compose the alphabet
122
  """
123
  pass
124
 
125
+ def pre_tokenize(self, pretok):
126
  """
127
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
128
 
129
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
130
+ keep track of the pre-tokenization, and leverage the capabilities of the
131
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
132
+ the pre-tokenization of a raw string, you can use
133
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
 
 
 
 
 
 
134
 
135
  Args:
136
+ pretok (:class:`~tokenizers.PreTokenizedString):
137
+ The pre-tokenized string on which to apply this
138
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
 
 
139
  """
140
  pass
141
 
142
+ def pre_tokenize_str(self, sequence):
 
143
  """
144
+ Pre tokenize the given string
145
 
146
+ This method provides a way to visualize the effect of a
147
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
148
+ alignment, nor does it provide all the capabilities of the
149
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
150
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
151
 
152
  Args:
153
+ sequence (:obj:`str`):
154
+ A string to pre-tokeize
 
 
 
155
 
156
  Returns:
157
+ :obj:`List[Tuple[str, Offsets]]`:
158
+ A list of tuple with the pre-tokenized parts and their offsets
159
  """
160
  pass
161
 
162
+ class CharDelimiterSplit(PreTokenizer):
163
+ """
164
+ This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
 
 
 
 
 
 
 
 
 
 
 
165
 
166
+ Args:
167
+ delimiter: str:
168
+ The delimiter char that will be used to split input
169
+ """
170
+ def pre_tokenize(self, pretok):
171
  """
172
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
173
 
174
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
175
+ keep track of the pre-tokenization, and leverage the capabilities of the
176
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
177
+ the pre-tokenization of a raw string, you can use
178
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
179
 
180
  Args:
181
+ pretok (:class:`~tokenizers.PreTokenizedString):
182
+ The pre-tokenized string on which to apply this
183
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
 
 
184
  """
185
  pass
186
 
187
+ def pre_tokenize_str(self, sequence):
188
  """
189
+ Pre tokenize the given string
190
+
191
+ This method provides a way to visualize the effect of a
192
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
193
+ alignment, nor does it provide all the capabilities of the
194
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
195
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
196
 
197
  Args:
198
  sequence (:obj:`str`):
199
+ A string to pre-tokeize
200
 
201
  Returns:
202
+ :obj:`List[Tuple[str, Offsets]]`:
203
+ A list of tuple with the pre-tokenized parts and their offsets
204
  """
205
  pass
206
 
207
+ class Digits(PreTokenizer):
208
  """
209
+ This pre-tokenizer simply splits using the digits in separate tokens
210
 
211
  Args:
212
+ individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
213
+ If set to True, digits will each be separated as follows::
214
+
215
+ "Call 123 please" -> "Call ", "1", "2", "3", " please"
216
+
217
+ If set to False, digits will grouped as follows::
218
+
219
+ "Call 123 please" -> "Call ", "123", " please"
220
  """
221
+ def __init__(self, individual_digits=False):
222
  pass
223
 
224
+ def pre_tokenize(self, pretok):
225
  """
226
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
227
 
228
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
229
+ keep track of the pre-tokenization, and leverage the capabilities of the
230
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
231
+ the pre-tokenization of a raw string, you can use
232
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
233
 
234
+ Args:
235
+ pretok (:class:`~tokenizers.PreTokenizedString):
236
+ The pre-tokenized string on which to apply this
237
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
238
  """
239
  pass
240
 
241
+ def pre_tokenize_str(self, sequence):
242
  """
243
+ Pre tokenize the given string
244
+
245
+ This method provides a way to visualize the effect of a
246
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
247
+ alignment, nor does it provide all the capabilities of the
248
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
249
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
250
 
251
  Args:
252
+ sequence (:obj:`str`):
253
+ A string to pre-tokeize
254
 
255
  Returns:
256
+ :obj:`List[Tuple[str, Offsets]]`:
257
+ A list of tuple with the pre-tokenized parts and their offsets
258
  """
259
  pass
260
 
261
+ class Metaspace(PreTokenizer):
262
+ """
263
+ Metaspace pre-tokenizer
264
 
265
+ This pre-tokenizer replaces any whitespace by the provided replacement character.
266
+ It then tries to split on these spaces.
 
267
 
268
+ Args:
269
+ replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
270
+ The replacement character. Must be exactly one character. By default we
271
+ use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
272
 
273
+ prepend_scheme (:obj:`str`, `optional`, defaults to :obj:`"always"`):
274
+ Whether to add a space to the first word if there isn't already one. This
275
+ lets us treat `hello` exactly like `say hello`.
276
+ Choices: "always", "never", "first". First means the space is only added on the first
277
+ token (relevant when special tokens are used or other pre_tokenizer are used).
278
 
279
+ """
280
+ def __init__(self, replacement="_", prepend_scheme="always", split=True):
 
281
  pass
282
 
283
+ def pre_tokenize(self, pretok):
284
  """
285
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
286
 
287
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
288
+ keep track of the pre-tokenization, and leverage the capabilities of the
289
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
290
+ the pre-tokenization of a raw string, you can use
291
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
292
 
293
+ Args:
294
+ pretok (:class:`~tokenizers.PreTokenizedString):
295
+ The pre-tokenized string on which to apply this
296
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
297
  """
298
  pass
299
 
300
+ def pre_tokenize_str(self, sequence):
301
  """
302
+ Pre tokenize the given string
303
+
304
+ This method provides a way to visualize the effect of a
305
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
306
+ alignment, nor does it provide all the capabilities of the
307
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
308
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
309
 
310
  Args:
311
  sequence (:obj:`str`):
312
+ A string to pre-tokeize
313
 
314
  Returns:
315
+ :obj:`List[Tuple[str, Offsets]]`:
316
+ A list of tuple with the pre-tokenized parts and their offsets
317
  """
318
  pass
319
 
320
+ class Punctuation(PreTokenizer):
321
  """
322
+ This pre-tokenizer simply splits on punctuation as individual characters.
 
 
323
 
324
  Args:
325
+ behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
326
+ The behavior to use when splitting.
327
+ Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
328
+ "contiguous"
 
329
  """
330
+ def __init__(self, behavior="isolated"):
331
  pass
332
 
333
+ def pre_tokenize(self, pretok):
 
334
  """
335
+ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
 
 
336
 
337
+ This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
338
+ keep track of the pre-tokenization, and leverage the capabilities of the
339
+ :class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
340
+ the pre-tokenization of a raw string, you can use
341
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
 
342
 
343
  Args:
344
+ pretok (:class:`~tokenizers.PreTokenizedString):
345
+ The pre-tokenized string on which to apply this
346
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer`
 
 
347
  """
348
  pass
349
 
350
+ def pre_tokenize_str(self, sequence):
351
  """
352
+ Pre tokenize the given string
353
 
354
+ This method provides a way to visualize the effect of a
355
+ :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
356
+ alignment, nor does it provide all the capabilities of the
357
+ :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
358
+ :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
359
+
360
+ Args:
361
+ sequence (:obj:`str`):
362
+ A string to pre-tokeize
363
 
364
  Returns:
365
+ :obj:`List[Tuple[str, Offsets]]`:
366
+ A list of tuple with the pre-tokenized parts and their offsets
367
  """
368
  pass
369
 
370
class Sequence(PreTokenizer):
    """
    This pre-tokenizer composes other pre_tokenizers and applies them in sequence
    """

    def __init__(self, pretokenizers):
        """
        Instantiate a new Sequence pre-tokenizer.

        Args:
            pretokenizers (:obj:`List[PreTokenizer]`):
                The pre-tokenizers to compose, applied in order.
        """
        pass

    def pre_tokenize(self, pretok):
        """
        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place

        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
        keep track of the pre-tokenization, and leverage the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result
        of the pre-tokenization of a raw string, you can use
        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`

        Args:
            pretok (:class:`~tokenizers.PreTokenizedString`):
                The pre-tokenized string on which to apply this
                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
        """
        pass

    def pre_tokenize_str(self, sequence):
        """
        Pre-tokenize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track
        of the alignment, nor does it provide all the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can
        use :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

        Args:
            sequence (:obj:`str`):
                A string to pre-tokenize

        Returns:
            :obj:`List[Tuple[str, Offsets]]`:
                A list of tuples with the pre-tokenized parts and their offsets
        """
        pass
413
 
414
class Split(PreTokenizer):
    """
    Split PreTokenizer

    This versatile pre-tokenizer splits using the provided pattern and
    according to the provided behavior. The pattern can be inverted by
    making use of the invert flag.

    Args:
        pattern (:obj:`str` or :class:`~tokenizers.Regex`):
            A pattern used to split the string. Usually a string or a regex built
            with `tokenizers.Regex`. If you want to use a regex pattern, it has to
            be wrapped around a `tokenizers.Regex`, otherwise we consider it as a
            string pattern. For example `pattern="|"` means you want to split on
            `|` (imagine a csv file for example), while
            `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
            The behavior to use when splitting.
            Choices: "removed", "isolated", "merged_with_previous",
            "merged_with_next", "contiguous"
        invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to invert the pattern.
    """

    def __init__(self, pattern, behavior, invert=False):
        pass

    def pre_tokenize(self, pretok):
        """
        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place

        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
        keep track of the pre-tokenization, and leverage the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result
        of the pre-tokenization of a raw string, you can use
        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`

        Args:
            pretok (:class:`~tokenizers.PreTokenizedString`):
                The pre-tokenized string on which to apply this
                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
        """
        pass

    def pre_tokenize_str(self, sequence):
        """
        Pre-tokenize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track
        of the alignment, nor does it provide all the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can
        use :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

        Args:
            sequence (:obj:`str`):
                A string to pre-tokenize

        Returns:
            :obj:`List[Tuple[str, Offsets]]`:
                A list of tuples with the pre-tokenized parts and their offsets
        """
        pass
476
 
477
class UnicodeScripts(PreTokenizer):
    """
    This pre-tokenizer splits on characters that belong to different language families.
    It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
    Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
    This mimics the SentencePiece Unigram implementation.
    """

    def __init__(self):
        pass

    def pre_tokenize(self, pretok):
        """
        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place

        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
        keep track of the pre-tokenization, and leverage the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result
        of the pre-tokenization of a raw string, you can use
        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`

        Args:
            pretok (:class:`~tokenizers.PreTokenizedString`):
                The pre-tokenized string on which to apply this
                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
        """
        pass

    def pre_tokenize_str(self, sequence):
        """
        Pre-tokenize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track
        of the alignment, nor does it provide all the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can
        use :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

        Args:
            sequence (:obj:`str`):
                A string to pre-tokenize

        Returns:
            :obj:`List[Tuple[str, Offsets]]`:
                A list of tuples with the pre-tokenized parts and their offsets
        """
        pass
523
 
524
class Whitespace(PreTokenizer):
    # Raw docstring: `\w`/`\s` would otherwise be invalid escape sequences.
    r"""
    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
    """

    def __init__(self):
        pass

    def pre_tokenize(self, pretok):
        """
        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place

        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
        keep track of the pre-tokenization, and leverage the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result
        of the pre-tokenization of a raw string, you can use
        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`

        Args:
            pretok (:class:`~tokenizers.PreTokenizedString`):
                The pre-tokenized string on which to apply this
                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
        """
        pass

    def pre_tokenize_str(self, sequence):
        """
        Pre-tokenize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track
        of the alignment, nor does it provide all the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can
        use :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

        Args:
            sequence (:obj:`str`):
                A string to pre-tokenize

        Returns:
            :obj:`List[Tuple[str, Offsets]]`:
                A list of tuples with the pre-tokenized parts and their offsets
        """
        pass
567
 
568
class WhitespaceSplit(PreTokenizer):
    """
    This pre-tokenizer simply splits on the whitespace. Works like `.split()`
    """

    def __init__(self):
        pass

    def pre_tokenize(self, pretok):
        """
        Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place

        This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
        keep track of the pre-tokenization, and leverage the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you just want to see the result
        of the pre-tokenization of a raw string, you can use
        :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`

        Args:
            pretok (:class:`~tokenizers.PreTokenizedString`):
                The pre-tokenized string on which to apply this
                :class:`~tokenizers.pre_tokenizers.PreTokenizer`
        """
        pass

    def pre_tokenize_str(self, sequence):
        """
        Pre-tokenize the given string

        This method provides a way to visualize the effect of a
        :class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track
        of the alignment, nor does it provide all the capabilities of the
        :class:`~tokenizers.PreTokenizedString`. If you need some of these, you can
        use :meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`

        Args:
            sequence (:obj:`str`):
                A string to pre-tokenize

        Returns:
            :obj:`List[Tuple[str, Offsets]]`:
                A list of tuples with the pre-tokenized parts and their offsets
        """
        pass