Nattapong Tapachoom committed on
Commit ·
bf83b0d
1
Parent(s): 12f0d60
Update tokenizer.json and vocab.json for improved formatting and added new token
Browse files
- tokenizer.json +6 -118
- vocab.json +2 -1
tokenizer.json
CHANGED
|
@@ -1093,144 +1093,32 @@
|
|
| 1093 |
{
|
| 1094 |
"type": "Replace",
|
| 1095 |
"pattern": {
|
| 1096 |
-
"String": "“"
|
| 1097 |
},
|
| 1098 |
-
"content": "\""
|
| 1099 |
},
|
| 1100 |
{
|
| 1101 |
"type": "Replace",
|
| 1102 |
"pattern": {
|
| 1103 |
-
"String": "”"
|
| 1104 |
},
|
| 1105 |
-
"content": "\""
|
| 1106 |
-
},
|
| 1107 |
-
{
|
| 1108 |
-
"type": "Replace",
|
| 1109 |
-
"pattern": {
|
| 1110 |
-
"String": "‘"
|
| 1111 |
-
},
|
| 1112 |
-
"content": "'"
|
| 1113 |
-
},
|
| 1114 |
-
{
|
| 1115 |
-
"type": "Replace",
|
| 1116 |
-
"pattern": {
|
| 1117 |
-
"String": "’"
|
| 1118 |
-
},
|
| 1119 |
-
"content": "'"
|
| 1120 |
-
},
|
| 1121 |
-
{
|
| 1122 |
-
"type": "Replace",
|
| 1123 |
-
"pattern": {
|
| 1124 |
-
"String": "—"
|
| 1125 |
-
},
|
| 1126 |
-
"content": "-"
|
| 1127 |
-
},
|
| 1128 |
-
{
|
| 1129 |
-
"type": "Replace",
|
| 1130 |
-
"pattern": {
|
| 1131 |
-
"String": "–"
|
| 1132 |
-
},
|
| 1133 |
-
"content": "-"
|
| 1134 |
}
|
| 1135 |
]
|
| 1136 |
},
|
| 1137 |
"pre_tokenizer": {
|
| 1138 |
"type": "Sequence",
|
| 1139 |
"pretokenizers": [
|
| 1140 |
-
{
|
| 1141 |
-
"type": "Whitespace"
|
| 1142 |
-
},
|
| 1143 |
-
{
|
| 1144 |
-
"type": "Split",
|
| 1145 |
-
"pattern": {
|
| 1146 |
-
"String": "(?:[\\U0001F1E6-\\U0001F1FF]{2}|[\\U0001F300-\\U0001FAFF]|[\\u2600-\\u26FF]|[\\u2700-\\u27BF]|[\\U0001F900-\\U0001F9FF]|[\\U0001F600-\\U0001F64F]|[\\U0001F680-\\U0001F6FF]|(?:[\\U0001F300-\\U0001FAFF]|[\\U0001F1E6-\\U0001F1FF]|[\\u2600-\\u26FF]|[\\u2700-\\u27BF])(?:[\\uFE0F\\u200D][^\\s]){0,5})"
|
| 1147 |
-
},
|
| 1148 |
-
"behavior": "Isolated",
|
| 1149 |
-
"invert": false
|
| 1150 |
-
},
|
| 1151 |
-
{
|
| 1152 |
-
"type": "Split",
|
| 1153 |
-
"pattern": {
|
| 1154 |
-
"String": "[0-9\\u0E50-\\u0E59]+(?:กม\\.|ชม\\.|ซม\\.|มม\\.|°C|cm\\.|mm\\.|km\\.|kg\\.|g\\.|mg\\.|m\\.|l\\.|ml\\.|Hz|kHz|MHz|GHz|%)"
|
| 1155 |
-
},
|
| 1156 |
-
"behavior": "Isolated",
|
| 1157 |
-
"invert": false
|
| 1158 |
-
},
|
| 1159 |
-
{
|
| 1160 |
-
"type": "Split",
|
| 1161 |
-
"pattern": {
|
| 1162 |
-
"String": "(?:กม\\.|ชม\\.|ซม\\.|มม\\.|°C|cm\\.|mm\\.|km\\.|kg\\.|g\\.|mg\\.|m\\.|l\\.|ml\\.|Hz|kHz|MHz|GHz|%)"
|
| 1163 |
-
},
|
| 1164 |
-
"behavior": "Isolated",
|
| 1165 |
-
"invert": false
|
| 1166 |
-
},
|
| 1167 |
-
{
|
| 1168 |
-
"type": "Split",
|
| 1169 |
-
"pattern": {
|
| 1170 |
-
"String": "[0-9\\u0E50-\\u0E59]{1,2}:[0-9\\u0E50-\\u0E59]{2}|[0-9\\u0E50-\\u0E59]{1,2}\\.[0-9\\u0E50-\\u0E59]{2}|[0-9\\u0E50-\\u0E59]{1,2}/[0-9\\u0E50-\\u0E59]{1,2}/[0-9\\u0E50-\\u0E59]{2,4}"
|
| 1171 |
-
},
|
| 1172 |
-
"behavior": "Isolated",
|
| 1173 |
-
"invert": false
|
| 1174 |
-
},
|
| 1175 |
-
{
|
| 1176 |
-
"type": "Split",
|
| 1177 |
-
"pattern": {
|
| 1178 |
-
"String": "(?:\\$|฿|€|£)\\d+(?:\\.\\d+)?"
|
| 1179 |
-
},
|
| 1180 |
-
"behavior": "Isolated",
|
| 1181 |
-
"invert": false
|
| 1182 |
-
},
|
| 1183 |
-
{
|
| 1184 |
-
"type": "Split",
|
| 1185 |
-
"pattern": {
|
| 1186 |
-
"String": "https?://[^\\s)\\]\\}<>]+"
|
| 1187 |
-
},
|
| 1188 |
-
"behavior": "Isolated",
|
| 1189 |
-
"invert": false
|
| 1190 |
-
},
|
| 1191 |
{
|
| 1192 |
"type": "Split",
|
| 1193 |
"pattern": {
|
| 1194 |
-
"String": "
|
| 1195 |
},
|
| 1196 |
"behavior": "Isolated",
|
| 1197 |
"invert": false
|
| 1198 |
},
|
| 1199 |
{
|
| 1200 |
-
"type": "Split"
|
| 1201 |
-
"pattern": {
|
| 1202 |
-
"String": "#[\\w\\u0E00-\\u0E7F]+"
|
| 1203 |
-
},
|
| 1204 |
-
"behavior": "Isolated",
|
| 1205 |
-
"invert": false
|
| 1206 |
-
},
|
| 1207 |
-
{
|
| 1208 |
-
"type": "Split",
|
| 1209 |
-
"pattern": {
|
| 1210 |
-
"String": "@[A-Za-z0-9_]+"
|
| 1211 |
-
},
|
| 1212 |
-
"behavior": "Isolated",
|
| 1213 |
-
"invert": false
|
| 1214 |
-
},
|
| 1215 |
-
{
|
| 1216 |
-
"type": "Split",
|
| 1217 |
-
"pattern": {
|
| 1218 |
-
"String": "[A-Za-z]:\\\\[^\\s]+"
|
| 1219 |
-
},
|
| 1220 |
-
"behavior": "Isolated",
|
| 1221 |
-
"invert": false
|
| 1222 |
-
},
|
| 1223 |
-
{
|
| 1224 |
-
"type": "Split",
|
| 1225 |
-
"pattern": {
|
| 1226 |
-
"String": "/[^\\s]+"
|
| 1227 |
-
},
|
| 1228 |
-
"behavior": "Isolated",
|
| 1229 |
-
"invert": false
|
| 1230 |
-
},
|
| 1231 |
-
{
|
| 1232 |
-
"type": "Digits",
|
| 1233 |
-
"individual_digits": false
|
| 1234 |
}
|
| 1235 |
]
|
| 1236 |
},
|
|
|
|
| 1093 |
{
|
| 1094 |
"type": "Replace",
|
| 1095 |
"pattern": {
|
| 1096 |
+
"String": "\r\n"
|
| 1097 |
},
|
| 1098 |
+
"content": "\n"
|
| 1099 |
},
|
| 1100 |
{
|
| 1101 |
"type": "Replace",
|
| 1102 |
"pattern": {
|
| 1103 |
+
"String": "\r"
|
| 1104 |
},
|
| 1105 |
+
"content": "\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1106 |
}
|
| 1107 |
]
|
| 1108 |
},
|
| 1109 |
"pre_tokenizer": {
|
| 1110 |
"type": "Sequence",
|
| 1111 |
"pretokenizers": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1112 |
{
|
| 1113 |
"type": "Split",
|
| 1114 |
"pattern": {
|
| 1115 |
+
"String": "\n"
|
| 1116 |
},
|
| 1117 |
"behavior": "Isolated",
|
| 1118 |
"invert": false
|
| 1119 |
},
|
| 1120 |
{
|
| 1121 |
+
"type": "Whitespace"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1122 |
}
|
| 1123 |
]
|
| 1124 |
},
|
vocab.json
CHANGED
|
@@ -35588,5 +35588,6 @@
|
|
| 35588 |
"หนามยอก": 19797,
|
| 35589 |
"รซ์": 6623,
|
| 35590 |
"มสิน": 7082,
|
| 35591 |
-
"สแตมฟอร์": 9937
|
|
|
|
| 35592 |
}
|
|
|
|
| 35588 |
"หนามยอก": 19797,
|
| 35589 |
"รซ์": 6623,
|
| 35590 |
"มสิน": 7082,
|
| 35591 |
+
"สแตมฟอร์": 9937,
|
| 35592 |
+
"<NL>": 35593
|
| 35593 |
}
|