ankur-bohra committed on
Commit
6494c25
·
verified ·
1 Parent(s): d6b03fc

Fix escaped sequences

Browse files
Files changed (1) hide show
  1. tokenization_inflm.py +1 -1
tokenization_inflm.py CHANGED
@@ -33,7 +33,7 @@ from tokenizers.pre_tokenizers import Digits, Split, ByteLevel
33
  import os
34
 
35
  # same as gpt4 cl-base-100k
36
- PATTERN = Regex("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+\s+(\S)+")
37
 
38
  logger = logging.get_logger(__name__)
39
 
 
33
  import os
34
 
35
  # same as gpt4 cl-base-100k
36
+ PATTERN = Regex(r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+\s+(\S)+")
37
 
38
  logger = logging.get_logger(__name__)
39