Nattapong Tapachoom committed on
Commit
bf83b0d
·
1 Parent(s): 12f0d60

Update tokenizer.json and vocab.json for improved formatting and add a new token

Browse files
Files changed (2) hide show
  1. tokenizer.json +6 -118
  2. vocab.json +2 -1
tokenizer.json CHANGED
@@ -1093,144 +1093,32 @@
1093
  {
1094
  "type": "Replace",
1095
  "pattern": {
1096
- "String": ""
1097
  },
1098
- "content": "\""
1099
  },
1100
  {
1101
  "type": "Replace",
1102
  "pattern": {
1103
- "String": ""
1104
  },
1105
- "content": "\""
1106
- },
1107
- {
1108
- "type": "Replace",
1109
- "pattern": {
1110
- "String": "‘"
1111
- },
1112
- "content": "'"
1113
- },
1114
- {
1115
- "type": "Replace",
1116
- "pattern": {
1117
- "String": "’"
1118
- },
1119
- "content": "'"
1120
- },
1121
- {
1122
- "type": "Replace",
1123
- "pattern": {
1124
- "String": "—"
1125
- },
1126
- "content": "-"
1127
- },
1128
- {
1129
- "type": "Replace",
1130
- "pattern": {
1131
- "String": "–"
1132
- },
1133
- "content": "-"
1134
  }
1135
  ]
1136
  },
1137
  "pre_tokenizer": {
1138
  "type": "Sequence",
1139
  "pretokenizers": [
1140
- {
1141
- "type": "Whitespace"
1142
- },
1143
- {
1144
- "type": "Split",
1145
- "pattern": {
1146
- "String": "(?:[\\U0001F1E6-\\U0001F1FF]{2}|[\\U0001F300-\\U0001FAFF]|[\\u2600-\\u26FF]|[\\u2700-\\u27BF]|[\\U0001F900-\\U0001F9FF]|[\\U0001F600-\\U0001F64F]|[\\U0001F680-\\U0001F6FF]|(?:[\\U0001F300-\\U0001FAFF]|[\\U0001F1E6-\\U0001F1FF]|[\\u2600-\\u26FF]|[\\u2700-\\u27BF])(?:[\\uFE0F\\u200D][^\\s]){0,5})"
1147
- },
1148
- "behavior": "Isolated",
1149
- "invert": false
1150
- },
1151
- {
1152
- "type": "Split",
1153
- "pattern": {
1154
- "String": "[0-9\\u0E50-\\u0E59]+(?:กม\\.|ชม\\.|ซม\\.|มม\\.|°C|cm\\.|mm\\.|km\\.|kg\\.|g\\.|mg\\.|m\\.|l\\.|ml\\.|Hz|kHz|MHz|GHz|%)"
1155
- },
1156
- "behavior": "Isolated",
1157
- "invert": false
1158
- },
1159
- {
1160
- "type": "Split",
1161
- "pattern": {
1162
- "String": "(?:กม\\.|ชม\\.|ซม\\.|มม\\.|°C|cm\\.|mm\\.|km\\.|kg\\.|g\\.|mg\\.|m\\.|l\\.|ml\\.|Hz|kHz|MHz|GHz|%)"
1163
- },
1164
- "behavior": "Isolated",
1165
- "invert": false
1166
- },
1167
- {
1168
- "type": "Split",
1169
- "pattern": {
1170
- "String": "[0-9\\u0E50-\\u0E59]{1,2}:[0-9\\u0E50-\\u0E59]{2}|[0-9\\u0E50-\\u0E59]{1,2}\\.[0-9\\u0E50-\\u0E59]{2}|[0-9\\u0E50-\\u0E59]{1,2}/[0-9\\u0E50-\\u0E59]{1,2}/[0-9\\u0E50-\\u0E59]{2,4}"
1171
- },
1172
- "behavior": "Isolated",
1173
- "invert": false
1174
- },
1175
- {
1176
- "type": "Split",
1177
- "pattern": {
1178
- "String": "(?:\\$|฿|€|£)\\d+(?:\\.\\d+)?"
1179
- },
1180
- "behavior": "Isolated",
1181
- "invert": false
1182
- },
1183
- {
1184
- "type": "Split",
1185
- "pattern": {
1186
- "String": "https?://[^\\s)\\]\\}<>]+"
1187
- },
1188
- "behavior": "Isolated",
1189
- "invert": false
1190
- },
1191
  {
1192
  "type": "Split",
1193
  "pattern": {
1194
- "String": "[A-Za-z0-9._%+\\-]+@[A-Za-z0-9.\\-]+\\.[A-Za-z]{2,}"
1195
  },
1196
  "behavior": "Isolated",
1197
  "invert": false
1198
  },
1199
  {
1200
- "type": "Split",
1201
- "pattern": {
1202
- "String": "#[\\w\\u0E00-\\u0E7F]+"
1203
- },
1204
- "behavior": "Isolated",
1205
- "invert": false
1206
- },
1207
- {
1208
- "type": "Split",
1209
- "pattern": {
1210
- "String": "@[A-Za-z0-9_]+"
1211
- },
1212
- "behavior": "Isolated",
1213
- "invert": false
1214
- },
1215
- {
1216
- "type": "Split",
1217
- "pattern": {
1218
- "String": "[A-Za-z]:\\\\[^\\s]+"
1219
- },
1220
- "behavior": "Isolated",
1221
- "invert": false
1222
- },
1223
- {
1224
- "type": "Split",
1225
- "pattern": {
1226
- "String": "/[^\\s]+"
1227
- },
1228
- "behavior": "Isolated",
1229
- "invert": false
1230
- },
1231
- {
1232
- "type": "Digits",
1233
- "individual_digits": false
1234
  }
1235
  ]
1236
  },
 
1093
  {
1094
  "type": "Replace",
1095
  "pattern": {
1096
+ "String": "\r\n"
1097
  },
1098
+ "content": "\n"
1099
  },
1100
  {
1101
  "type": "Replace",
1102
  "pattern": {
1103
+ "String": "\r"
1104
  },
1105
+ "content": "\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1106
  }
1107
  ]
1108
  },
1109
  "pre_tokenizer": {
1110
  "type": "Sequence",
1111
  "pretokenizers": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1112
  {
1113
  "type": "Split",
1114
  "pattern": {
1115
+ "String": "\n"
1116
  },
1117
  "behavior": "Isolated",
1118
  "invert": false
1119
  },
1120
  {
1121
+ "type": "Whitespace"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1122
  }
1123
  ]
1124
  },
vocab.json CHANGED
@@ -35588,5 +35588,6 @@
35588
  "หนามยอก": 19797,
35589
  "รซ์": 6623,
35590
  "มสิน": 7082,
35591
- "สแตมฟอร์": 9937
 
35592
  }
 
35588
  "หนามยอก": 19797,
35589
  "รซ์": 6623,
35590
  "มสิน": 7082,
35591
+ "สแตมฟอร์": 9937,
35592
+ "<NL>": 35593
35593
  }