hans00 committed on
Commit
e53e5cd
·
verified ·
1 Parent(s): d850d55

fix(tokenizer): add bare-name special tokens (BOS/EOS/START_SPEECH/STOP_SPEECH/EXAGGERATION) so post_processor template lookups resolve to correct IDs

Browse files

The post_processor template references 'BOS'/'EOS'/'START_SPEECH'/'EXAGGERATION', but those
names aren't in added_tokens, so they were silently resolving to UNK (1). transformers.js
looks up post-processor template names via added_tokens_map, so adding entries whose
content is the bare name (content=BOS / id=255, etc.) fixes the resolved sequence to match
the English model template. STOP_SPEECH (id 6562) is added as well, although the templates
in this diff do not reference it.

Files changed (1) hide show
  1. tokenizer.json +182 -29
tokenizer.json CHANGED
@@ -12,6 +12,15 @@
12
  "rstrip": false,
13
  "normalized": false
14
  },
 
 
 
 
 
 
 
 
 
15
  {
16
  "id": 1,
17
  "special": true,
@@ -39,6 +48,15 @@
39
  "rstrip": false,
40
  "normalized": false
41
  },
 
 
 
 
 
 
 
 
 
42
  {
43
  "id": 604,
44
  "content": "[UH]",
@@ -1064,6 +1082,33 @@
1064
  "rstrip": false,
1065
  "normalized": false,
1066
  "special": true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1067
  }
1068
  ],
1069
  "normalizer": {
@@ -1071,7 +1116,9 @@
1071
  "normalizers": [
1072
  {
1073
  "type": "Replace",
1074
- "pattern": { "String": " " },
 
 
1075
  "content": "[SPACE]"
1076
  }
1077
  ]
@@ -1082,47 +1129,153 @@
1082
  "post_processor": {
1083
  "type": "TemplateProcessing",
1084
  "single": [
1085
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 0 } },
1086
- { "SpecialToken": { "id": "BOS", "type_id": 0 } },
1087
- { "Sequence": { "id": "A", "type_id": 0 } },
1088
- { "SpecialToken": { "id": "EOS", "type_id": 0 } },
1089
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
1090
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1091
  ],
1092
  "pair": [
1093
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 0 } },
1094
- { "SpecialToken": { "id": "BOS", "type_id": 0 } },
1095
- { "Sequence": { "id": "A", "type_id": 0 } },
1096
- { "SpecialToken": { "id": "EOS", "type_id": 0 } },
1097
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
1098
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 0 } },
1099
- { "SpecialToken": { "id": "EXAGGERATION", "type_id": 1 } },
1100
- { "SpecialToken": { "id": "BOS", "type_id": 1 } },
1101
- { "Sequence": { "id": "B", "type_id": 1 } },
1102
- { "SpecialToken": { "id": "EOS", "type_id": 1 } },
1103
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 1 } },
1104
- { "SpecialToken": { "id": "START_SPEECH", "type_id": 1 } }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1105
  ],
1106
  "special_tokens": {
1107
  "BOS": {
1108
  "id": "BOS",
1109
- "ids": [255],
1110
- "tokens": ["<s>"]
 
 
 
 
1111
  },
1112
  "EOS": {
1113
  "id": "EOS",
1114
- "ids": [0],
1115
- "tokens": ["</s>"]
 
 
 
 
1116
  },
1117
  "EXAGGERATION": {
1118
  "id": "EXAGGERATION",
1119
- "ids": [6563],
1120
- "tokens": ["<EXAGGERATION>"]
 
 
 
 
1121
  },
1122
  "START_SPEECH": {
1123
  "id": "START_SPEECH",
1124
- "ids": [6561],
1125
- "tokens": ["<START_SPEECH>"]
 
 
 
 
1126
  }
1127
  }
1128
  },
@@ -3539,7 +3692,7 @@
3539
  "ώ": 2401,
3540
  "Έ": 2402,
3541
  "Ό": 2403,
3542
- "Ή": 2404,
3543
  "ž": 2405,
3544
  "š": 2406,
3545
  "ū": 2407,
@@ -3589,7 +3742,7 @@
3589
  "ụ": 2451,
3590
  "ọ": 2452,
3591
  "ạ": 2453
3592
- },
3593
  "merges": [
3594
  "t h",
3595
  "i n",
 
12
  "rstrip": false,
13
  "normalized": false
14
  },
15
+ {
16
+ "id": 0,
17
+ "content": "EOS",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
  {
25
  "id": 1,
26
  "special": true,
 
48
  "rstrip": false,
49
  "normalized": false
50
  },
51
+ {
52
+ "id": 255,
53
+ "content": "BOS",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
  {
61
  "id": 604,
62
  "content": "[UH]",
 
1082
  "rstrip": false,
1083
  "normalized": false,
1084
  "special": true
1085
+ },
1086
+ {
1087
+ "id": 6561,
1088
+ "content": "START_SPEECH",
1089
+ "single_word": false,
1090
+ "lstrip": false,
1091
+ "rstrip": false,
1092
+ "normalized": false,
1093
+ "special": true
1094
+ },
1095
+ {
1096
+ "id": 6562,
1097
+ "content": "STOP_SPEECH",
1098
+ "single_word": false,
1099
+ "lstrip": false,
1100
+ "rstrip": false,
1101
+ "normalized": false,
1102
+ "special": true
1103
+ },
1104
+ {
1105
+ "id": 6563,
1106
+ "content": "EXAGGERATION",
1107
+ "single_word": false,
1108
+ "lstrip": false,
1109
+ "rstrip": false,
1110
+ "normalized": false,
1111
+ "special": true
1112
  }
1113
  ],
1114
  "normalizer": {
 
1116
  "normalizers": [
1117
  {
1118
  "type": "Replace",
1119
+ "pattern": {
1120
+ "String": " "
1121
+ },
1122
  "content": "[SPACE]"
1123
  }
1124
  ]
 
1129
  "post_processor": {
1130
  "type": "TemplateProcessing",
1131
  "single": [
1132
+ {
1133
+ "SpecialToken": {
1134
+ "id": "EXAGGERATION",
1135
+ "type_id": 0
1136
+ }
1137
+ },
1138
+ {
1139
+ "SpecialToken": {
1140
+ "id": "BOS",
1141
+ "type_id": 0
1142
+ }
1143
+ },
1144
+ {
1145
+ "Sequence": {
1146
+ "id": "A",
1147
+ "type_id": 0
1148
+ }
1149
+ },
1150
+ {
1151
+ "SpecialToken": {
1152
+ "id": "EOS",
1153
+ "type_id": 0
1154
+ }
1155
+ },
1156
+ {
1157
+ "SpecialToken": {
1158
+ "id": "START_SPEECH",
1159
+ "type_id": 0
1160
+ }
1161
+ },
1162
+ {
1163
+ "SpecialToken": {
1164
+ "id": "START_SPEECH",
1165
+ "type_id": 0
1166
+ }
1167
+ }
1168
  ],
1169
  "pair": [
1170
+ {
1171
+ "SpecialToken": {
1172
+ "id": "EXAGGERATION",
1173
+ "type_id": 0
1174
+ }
1175
+ },
1176
+ {
1177
+ "SpecialToken": {
1178
+ "id": "BOS",
1179
+ "type_id": 0
1180
+ }
1181
+ },
1182
+ {
1183
+ "Sequence": {
1184
+ "id": "A",
1185
+ "type_id": 0
1186
+ }
1187
+ },
1188
+ {
1189
+ "SpecialToken": {
1190
+ "id": "EOS",
1191
+ "type_id": 0
1192
+ }
1193
+ },
1194
+ {
1195
+ "SpecialToken": {
1196
+ "id": "START_SPEECH",
1197
+ "type_id": 0
1198
+ }
1199
+ },
1200
+ {
1201
+ "SpecialToken": {
1202
+ "id": "START_SPEECH",
1203
+ "type_id": 0
1204
+ }
1205
+ },
1206
+ {
1207
+ "SpecialToken": {
1208
+ "id": "EXAGGERATION",
1209
+ "type_id": 1
1210
+ }
1211
+ },
1212
+ {
1213
+ "SpecialToken": {
1214
+ "id": "BOS",
1215
+ "type_id": 1
1216
+ }
1217
+ },
1218
+ {
1219
+ "Sequence": {
1220
+ "id": "B",
1221
+ "type_id": 1
1222
+ }
1223
+ },
1224
+ {
1225
+ "SpecialToken": {
1226
+ "id": "EOS",
1227
+ "type_id": 1
1228
+ }
1229
+ },
1230
+ {
1231
+ "SpecialToken": {
1232
+ "id": "START_SPEECH",
1233
+ "type_id": 1
1234
+ }
1235
+ },
1236
+ {
1237
+ "SpecialToken": {
1238
+ "id": "START_SPEECH",
1239
+ "type_id": 1
1240
+ }
1241
+ }
1242
  ],
1243
  "special_tokens": {
1244
  "BOS": {
1245
  "id": "BOS",
1246
+ "ids": [
1247
+ 255
1248
+ ],
1249
+ "tokens": [
1250
+ "<s>"
1251
+ ]
1252
  },
1253
  "EOS": {
1254
  "id": "EOS",
1255
+ "ids": [
1256
+ 0
1257
+ ],
1258
+ "tokens": [
1259
+ "</s>"
1260
+ ]
1261
  },
1262
  "EXAGGERATION": {
1263
  "id": "EXAGGERATION",
1264
+ "ids": [
1265
+ 6563
1266
+ ],
1267
+ "tokens": [
1268
+ "<EXAGGERATION>"
1269
+ ]
1270
  },
1271
  "START_SPEECH": {
1272
  "id": "START_SPEECH",
1273
+ "ids": [
1274
+ 6561
1275
+ ],
1276
+ "tokens": [
1277
+ "<START_SPEECH>"
1278
+ ]
1279
  }
1280
  }
1281
  },
 
3692
  "ώ": 2401,
3693
  "Έ": 2402,
3694
  "Ό": 2403,
3695
+ "Ή": 2404,
3696
  "ž": 2405,
3697
  "š": 2406,
3698
  "ū": 2407,
 
3742
  "ụ": 2451,
3743
  "ọ": 2452,
3744
  "ạ": 2453
3745
+ },
3746
  "merges": [
3747
  "t h",
3748
  "i n",