Fix escape sequences
Browse files
- tokenization_inflm.py +1 -1
tokenization_inflm.py
CHANGED
|
@@ -33,7 +33,7 @@ from tokenizers.pre_tokenizers import Digits, Split, ByteLevel
|
|
| 33 |
import os
|
| 34 |
|
| 35 |
# same as gpt4 cl-base-100k
|
| 36 |
-
PATTERN = Regex("(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+\s+(\S)+")
|
| 37 |
|
| 38 |
logger = logging.get_logger(__name__)
|
| 39 |
|
|
|
|
| 33 |
import os
|
| 34 |
|
| 35 |
# same as gpt4 cl-base-100k
|
| 36 |
+
PATTERN = Regex(r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+\s+(\S)+")
|
| 37 |
|
| 38 |
logger = logging.get_logger(__name__)
|
| 39 |
|