Instructions to use BricksDisplay/chatterbox-multilingual-ONNX with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Chatterbox
How to use BricksDisplay/chatterbox-multilingual-ONNX with Chatterbox:
# pip install chatterbox-tts
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS

model = ChatterboxTTS.from_pretrained(device="cuda")

text = "Ezreal and Jinx teamed up with Ahri, Yasuo, and Teemo to take down the enemy's Nexus in an epic late-game pentakill."
wav = model.generate(text)
ta.save("test-1.wav", wav, model.sr)

# If you want to synthesize with a different voice, specify the audio prompt
AUDIO_PROMPT_PATH="YOUR_FILE.wav"
wav = model.generate(text, audio_prompt_path=AUDIO_PROMPT_PATH)
ta.save("test-2.wav", wav, model.sr)
- Notebooks
- Google Colab
- Kaggle
fix(tokenizer): add bare-name special tokens (BOS/EOS/START_SPEECH/STOP_SPEECH/EXAGGERATION) so post_processor template lookups resolve to correct IDs
Browse files
The post-processor template references 'BOS'/'EOS'/'START_SPEECH'/'EXAGGERATION', but those
names were not in added_tokens, so they were silently resolving to UNK (id 1). transformers.js
looks up post-processor template names via added_tokens_map, so adding entries whose
content is the bare name (e.g. content "BOS" with id 255) makes the resolved sequence
match the English model template.
- tokenizer.json +182 -29
tokenizer.json
CHANGED
|
@@ -12,6 +12,15 @@
|
|
| 12 |
"rstrip": false,
|
| 13 |
"normalized": false
|
| 14 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
{
|
| 16 |
"id": 1,
|
| 17 |
"special": true,
|
|
@@ -39,6 +48,15 @@
|
|
| 39 |
"rstrip": false,
|
| 40 |
"normalized": false
|
| 41 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
{
|
| 43 |
"id": 604,
|
| 44 |
"content": "[UH]",
|
|
@@ -1064,6 +1082,33 @@
|
|
| 1064 |
"rstrip": false,
|
| 1065 |
"normalized": false,
|
| 1066 |
"special": true
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1067 |
}
|
| 1068 |
],
|
| 1069 |
"normalizer": {
|
|
@@ -1071,7 +1116,9 @@
|
|
| 1071 |
"normalizers": [
|
| 1072 |
{
|
| 1073 |
"type": "Replace",
|
| 1074 |
-
"pattern": {
|
|
|
|
|
|
|
| 1075 |
"content": "[SPACE]"
|
| 1076 |
}
|
| 1077 |
]
|
|
@@ -1082,47 +1129,153 @@
|
|
| 1082 |
"post_processor": {
|
| 1083 |
"type": "TemplateProcessing",
|
| 1084 |
"single": [
|
| 1085 |
-
{
|
| 1086 |
-
|
| 1087 |
-
|
| 1088 |
-
|
| 1089 |
-
|
| 1090 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1091 |
],
|
| 1092 |
"pair": [
|
| 1093 |
-
{
|
| 1094 |
-
|
| 1095 |
-
|
| 1096 |
-
|
| 1097 |
-
|
| 1098 |
-
|
| 1099 |
-
{
|
| 1100 |
-
|
| 1101 |
-
|
| 1102 |
-
|
| 1103 |
-
|
| 1104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1105 |
],
|
| 1106 |
"special_tokens": {
|
| 1107 |
"BOS": {
|
| 1108 |
"id": "BOS",
|
| 1109 |
-
"ids": [
|
| 1110 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1111 |
},
|
| 1112 |
"EOS": {
|
| 1113 |
"id": "EOS",
|
| 1114 |
-
"ids": [
|
| 1115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1116 |
},
|
| 1117 |
"EXAGGERATION": {
|
| 1118 |
"id": "EXAGGERATION",
|
| 1119 |
-
"ids": [
|
| 1120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1121 |
},
|
| 1122 |
"START_SPEECH": {
|
| 1123 |
"id": "START_SPEECH",
|
| 1124 |
-
"ids": [
|
| 1125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1126 |
}
|
| 1127 |
}
|
| 1128 |
},
|
|
@@ -3539,7 +3692,7 @@
|
|
| 3539 |
"ώ": 2401,
|
| 3540 |
"Έ": 2402,
|
| 3541 |
"Ό": 2403,
|
| 3542 |
-
"Ή": 2404,
|
| 3543 |
"ž": 2405,
|
| 3544 |
"š": 2406,
|
| 3545 |
"ū": 2407,
|
|
@@ -3589,7 +3742,7 @@
|
|
| 3589 |
"ụ": 2451,
|
| 3590 |
"ọ": 2452,
|
| 3591 |
"ạ": 2453
|
| 3592 |
-
|
| 3593 |
"merges": [
|
| 3594 |
"t h",
|
| 3595 |
"i n",
|
|
|
|
| 12 |
"rstrip": false,
|
| 13 |
"normalized": false
|
| 14 |
},
|
| 15 |
+
{
|
| 16 |
+
"id": 0,
|
| 17 |
+
"content": "EOS",
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"rstrip": false,
|
| 21 |
+
"normalized": false,
|
| 22 |
+
"special": true
|
| 23 |
+
},
|
| 24 |
{
|
| 25 |
"id": 1,
|
| 26 |
"special": true,
|
|
|
|
| 48 |
"rstrip": false,
|
| 49 |
"normalized": false
|
| 50 |
},
|
| 51 |
+
{
|
| 52 |
+
"id": 255,
|
| 53 |
+
"content": "BOS",
|
| 54 |
+
"single_word": false,
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"rstrip": false,
|
| 57 |
+
"normalized": false,
|
| 58 |
+
"special": true
|
| 59 |
+
},
|
| 60 |
{
|
| 61 |
"id": 604,
|
| 62 |
"content": "[UH]",
|
|
|
|
| 1082 |
"rstrip": false,
|
| 1083 |
"normalized": false,
|
| 1084 |
"special": true
|
| 1085 |
+
},
|
| 1086 |
+
{
|
| 1087 |
+
"id": 6561,
|
| 1088 |
+
"content": "START_SPEECH",
|
| 1089 |
+
"single_word": false,
|
| 1090 |
+
"lstrip": false,
|
| 1091 |
+
"rstrip": false,
|
| 1092 |
+
"normalized": false,
|
| 1093 |
+
"special": true
|
| 1094 |
+
},
|
| 1095 |
+
{
|
| 1096 |
+
"id": 6562,
|
| 1097 |
+
"content": "STOP_SPEECH",
|
| 1098 |
+
"single_word": false,
|
| 1099 |
+
"lstrip": false,
|
| 1100 |
+
"rstrip": false,
|
| 1101 |
+
"normalized": false,
|
| 1102 |
+
"special": true
|
| 1103 |
+
},
|
| 1104 |
+
{
|
| 1105 |
+
"id": 6563,
|
| 1106 |
+
"content": "EXAGGERATION",
|
| 1107 |
+
"single_word": false,
|
| 1108 |
+
"lstrip": false,
|
| 1109 |
+
"rstrip": false,
|
| 1110 |
+
"normalized": false,
|
| 1111 |
+
"special": true
|
| 1112 |
}
|
| 1113 |
],
|
| 1114 |
"normalizer": {
|
|
|
|
| 1116 |
"normalizers": [
|
| 1117 |
{
|
| 1118 |
"type": "Replace",
|
| 1119 |
+
"pattern": {
|
| 1120 |
+
"String": " "
|
| 1121 |
+
},
|
| 1122 |
"content": "[SPACE]"
|
| 1123 |
}
|
| 1124 |
]
|
|
|
|
| 1129 |
"post_processor": {
|
| 1130 |
"type": "TemplateProcessing",
|
| 1131 |
"single": [
|
| 1132 |
+
{
|
| 1133 |
+
"SpecialToken": {
|
| 1134 |
+
"id": "EXAGGERATION",
|
| 1135 |
+
"type_id": 0
|
| 1136 |
+
}
|
| 1137 |
+
},
|
| 1138 |
+
{
|
| 1139 |
+
"SpecialToken": {
|
| 1140 |
+
"id": "BOS",
|
| 1141 |
+
"type_id": 0
|
| 1142 |
+
}
|
| 1143 |
+
},
|
| 1144 |
+
{
|
| 1145 |
+
"Sequence": {
|
| 1146 |
+
"id": "A",
|
| 1147 |
+
"type_id": 0
|
| 1148 |
+
}
|
| 1149 |
+
},
|
| 1150 |
+
{
|
| 1151 |
+
"SpecialToken": {
|
| 1152 |
+
"id": "EOS",
|
| 1153 |
+
"type_id": 0
|
| 1154 |
+
}
|
| 1155 |
+
},
|
| 1156 |
+
{
|
| 1157 |
+
"SpecialToken": {
|
| 1158 |
+
"id": "START_SPEECH",
|
| 1159 |
+
"type_id": 0
|
| 1160 |
+
}
|
| 1161 |
+
},
|
| 1162 |
+
{
|
| 1163 |
+
"SpecialToken": {
|
| 1164 |
+
"id": "START_SPEECH",
|
| 1165 |
+
"type_id": 0
|
| 1166 |
+
}
|
| 1167 |
+
}
|
| 1168 |
],
|
| 1169 |
"pair": [
|
| 1170 |
+
{
|
| 1171 |
+
"SpecialToken": {
|
| 1172 |
+
"id": "EXAGGERATION",
|
| 1173 |
+
"type_id": 0
|
| 1174 |
+
}
|
| 1175 |
+
},
|
| 1176 |
+
{
|
| 1177 |
+
"SpecialToken": {
|
| 1178 |
+
"id": "BOS",
|
| 1179 |
+
"type_id": 0
|
| 1180 |
+
}
|
| 1181 |
+
},
|
| 1182 |
+
{
|
| 1183 |
+
"Sequence": {
|
| 1184 |
+
"id": "A",
|
| 1185 |
+
"type_id": 0
|
| 1186 |
+
}
|
| 1187 |
+
},
|
| 1188 |
+
{
|
| 1189 |
+
"SpecialToken": {
|
| 1190 |
+
"id": "EOS",
|
| 1191 |
+
"type_id": 0
|
| 1192 |
+
}
|
| 1193 |
+
},
|
| 1194 |
+
{
|
| 1195 |
+
"SpecialToken": {
|
| 1196 |
+
"id": "START_SPEECH",
|
| 1197 |
+
"type_id": 0
|
| 1198 |
+
}
|
| 1199 |
+
},
|
| 1200 |
+
{
|
| 1201 |
+
"SpecialToken": {
|
| 1202 |
+
"id": "START_SPEECH",
|
| 1203 |
+
"type_id": 0
|
| 1204 |
+
}
|
| 1205 |
+
},
|
| 1206 |
+
{
|
| 1207 |
+
"SpecialToken": {
|
| 1208 |
+
"id": "EXAGGERATION",
|
| 1209 |
+
"type_id": 1
|
| 1210 |
+
}
|
| 1211 |
+
},
|
| 1212 |
+
{
|
| 1213 |
+
"SpecialToken": {
|
| 1214 |
+
"id": "BOS",
|
| 1215 |
+
"type_id": 1
|
| 1216 |
+
}
|
| 1217 |
+
},
|
| 1218 |
+
{
|
| 1219 |
+
"Sequence": {
|
| 1220 |
+
"id": "B",
|
| 1221 |
+
"type_id": 1
|
| 1222 |
+
}
|
| 1223 |
+
},
|
| 1224 |
+
{
|
| 1225 |
+
"SpecialToken": {
|
| 1226 |
+
"id": "EOS",
|
| 1227 |
+
"type_id": 1
|
| 1228 |
+
}
|
| 1229 |
+
},
|
| 1230 |
+
{
|
| 1231 |
+
"SpecialToken": {
|
| 1232 |
+
"id": "START_SPEECH",
|
| 1233 |
+
"type_id": 1
|
| 1234 |
+
}
|
| 1235 |
+
},
|
| 1236 |
+
{
|
| 1237 |
+
"SpecialToken": {
|
| 1238 |
+
"id": "START_SPEECH",
|
| 1239 |
+
"type_id": 1
|
| 1240 |
+
}
|
| 1241 |
+
}
|
| 1242 |
],
|
| 1243 |
"special_tokens": {
|
| 1244 |
"BOS": {
|
| 1245 |
"id": "BOS",
|
| 1246 |
+
"ids": [
|
| 1247 |
+
255
|
| 1248 |
+
],
|
| 1249 |
+
"tokens": [
|
| 1250 |
+
"<s>"
|
| 1251 |
+
]
|
| 1252 |
},
|
| 1253 |
"EOS": {
|
| 1254 |
"id": "EOS",
|
| 1255 |
+
"ids": [
|
| 1256 |
+
0
|
| 1257 |
+
],
|
| 1258 |
+
"tokens": [
|
| 1259 |
+
"</s>"
|
| 1260 |
+
]
|
| 1261 |
},
|
| 1262 |
"EXAGGERATION": {
|
| 1263 |
"id": "EXAGGERATION",
|
| 1264 |
+
"ids": [
|
| 1265 |
+
6563
|
| 1266 |
+
],
|
| 1267 |
+
"tokens": [
|
| 1268 |
+
"<EXAGGERATION>"
|
| 1269 |
+
]
|
| 1270 |
},
|
| 1271 |
"START_SPEECH": {
|
| 1272 |
"id": "START_SPEECH",
|
| 1273 |
+
"ids": [
|
| 1274 |
+
6561
|
| 1275 |
+
],
|
| 1276 |
+
"tokens": [
|
| 1277 |
+
"<START_SPEECH>"
|
| 1278 |
+
]
|
| 1279 |
}
|
| 1280 |
}
|
| 1281 |
},
|
|
|
|
| 3692 |
"ώ": 2401,
|
| 3693 |
"Έ": 2402,
|
| 3694 |
"Ό": 2403,
|
| 3695 |
+
"Ή": 2404,
|
| 3696 |
"ž": 2405,
|
| 3697 |
"š": 2406,
|
| 3698 |
"ū": 2407,
|
|
|
|
| 3742 |
"ụ": 2451,
|
| 3743 |
"ọ": 2452,
|
| 3744 |
"ạ": 2453
|
| 3745 |
+
},
|
| 3746 |
"merges": [
|
| 3747 |
"t h",
|
| 3748 |
"i n",
|