Spaces:
Running
Running
| import { FalconTokenizer } from "../../../src/tokenizers.js"; | |
| import { BASE_TEST_STRINGS, FALCON_TEST_STRINGS } from "../test_strings.js"; | |
| export const TOKENIZER_CLASS = FalconTokenizer; | |
| export const TEST_CONFIG = { | |
| "tiiuae/falcon-7b": { | |
| SIMPLE: { | |
| text: BASE_TEST_STRINGS.SIMPLE, | |
| tokens: ["How", "\u0120are", "\u0120you", "\u0120doing", "?"], | |
| ids: [1830, 362, 299, 1836, 42], | |
| decoded: "How are you doing?", | |
| }, | |
| SIMPLE_WITH_PUNCTUATION: { | |
| text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION, | |
| tokens: ["You", "\u0120should", "'", "ve", "\u0120done", "\u0120this"], | |
| ids: [1357, 808, 18, 298, 1782, 414], | |
| decoded: "You should've done this", | |
| }, | |
| NUMBERS: { | |
| text: BASE_TEST_STRINGS.NUMBERS, | |
| tokens: ["012", "345", "678", "9", "\u0120", "0", "\u0120", "1", "\u0120", "2", "\u0120", "3", "\u0120", "4", "\u0120", "5", "\u0120", "6", "\u0120", "7", "\u0120", "8", "\u0120", "9", "\u0120", "10", "\u0120", "100", "\u0120", "100", "0"], | |
| ids: [24445, 29094, 41583, 36, 204, 27, 204, 28, 204, 29, 204, 30, 204, 31, 204, 32, 204, 33, 204, 34, 204, 35, 204, 36, 204, 696, 204, 1425, 204, 1425, 27], | |
| decoded: "0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000", | |
| }, | |
| TEXT_WITH_NUMBERS: { | |
| text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS, | |
| tokens: ["The", "\u0120company", "\u0120was", "\u0120founded", "\u0120in", "\u0120", "201", "6", "."], | |
| ids: [487, 1438, 398, 9923, 272, 204, 626, 33, 25], | |
| decoded: "The company was founded in 2016.", | |
| }, | |
| PUNCTUATION: { | |
| text: BASE_TEST_STRINGS.PUNCTUATION, | |
| tokens: ["A", "\u010a", "'", "ll", "\u0120", "!!", "to", "?'", "d", "''", "d", "\u0120of", ",", "\u0120can", "'", "t", "."], | |
| ids: [44, 193, 18, 567, 204, 1409, 534, 12493, 79, 7544, 79, 275, 23, 418, 18, 95, 25], | |
| decoded: "A\n'll!!to?'d''d of, can't.", | |
| }, | |
| PYTHON_CODE: { | |
| text: BASE_TEST_STRINGS.PYTHON_CODE, | |
| tokens: ["def", "\u0120main", "():", "\u010a", "\u0109", "pass"], | |
| ids: [3071, 1316, 13160, 193, 192, 5412], | |
| decoded: "def main():\n\tpass", | |
| }, | |
| JAVASCRIPT_CODE: { | |
| text: BASE_TEST_STRINGS.JAVASCRIPT_CODE, | |
| tokens: ["let", "\u0120a", "\u0120", "=", "\u0120obj", ".", "toString", "();", "\u010a", "toString", "();"], | |
| ids: [1025, 241, 204, 40, 13756, 25, 19409, 2032, 193, 19409, 2032], | |
| decoded: "let a = obj.toString();\ntoString();", | |
| }, | |
| NEWLINES: { | |
| text: BASE_TEST_STRINGS.NEWLINES, | |
| tokens: ["This", "\u010a", "\u010a", "is", "\u010a", "a", "\u010a", "test", "."], | |
| ids: [1182, 193, 193, 259, 193, 76, 193, 4780, 25], | |
| decoded: "This\n\nis\na\ntest.", | |
| }, | |
| BASIC: { | |
| text: BASE_TEST_STRINGS.BASIC, | |
| tokens: ["UN", "want", "\u00c3\u00a9d", ",", "running"], | |
| ids: [4000, 32108, 5706, 23, 27386], | |
| decoded: "UNwant\u00e9d,running", | |
| }, | |
| CONTROL_TOKENS: { | |
| text: BASE_TEST_STRINGS.CONTROL_TOKENS, | |
| tokens: ["1", "\u0100", "2", "\u00ef\u00bf", "\u00bd", "3"], | |
| ids: [28, 186, 29, 13112, 133, 30], | |
| decoded: "1\u00002\ufffd3", | |
| }, | |
| HELLO_WORLD_TITLECASE: { | |
| text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE, | |
| tokens: ["Hello", "\u0120World"], | |
| ids: [9856, 2889], | |
| decoded: "Hello World", | |
| }, | |
| HELLO_WORLD_LOWERCASE: { | |
| text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE, | |
| tokens: ["hello", "\u0120world"], | |
| ids: [30835, 1079], | |
| decoded: "hello world", | |
| }, | |
| CHINESE_ONLY: { | |
| text: BASE_TEST_STRINGS.CHINESE_ONLY, | |
| tokens: ["\u00e7\u0136\u0141\u00e6\u00b4\u00bb", "\u00e7\u013c\u0126", "\u00e7\u013e\u0141", "\u00e8\u00b0", "\u013d", "\u00e6\u013a\u00af"], | |
| ids: [32725, 1105, 15498, 8061, 233, 2364], | |
| decoded: "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", | |
| }, | |
| LEADING_SPACE: { | |
| text: BASE_TEST_STRINGS.LEADING_SPACE, | |
| tokens: ["\u0120\u0120", "\u0120leading", "\u0120space"], | |
| ids: [258, 3736, 2151], | |
| decoded: " leading space", | |
| }, | |
| TRAILING_SPACE: { | |
| text: BASE_TEST_STRINGS.TRAILING_SPACE, | |
| tokens: ["tra", "iling", "\u0120space", "\u0120\u0120\u0120"], | |
| ids: [9172, 4447, 2151, 466], | |
| decoded: "trailing space ", | |
| }, | |
| DOUBLE_SPACE: { | |
| text: BASE_TEST_STRINGS.DOUBLE_SPACE, | |
| tokens: ["Hi", "\u0120", "\u0120Hello"], | |
| ids: [5516, 204, 23090], | |
| decoded: "Hi Hello", | |
| }, | |
| CURRENCY: { | |
| text: BASE_TEST_STRINGS.CURRENCY, | |
| tokens: ["test", "\u0120", "$", "1", "\u0120R", "2", "\u0120", "#", "3", "\u0120\u00e2\u0124\u00ac", "4", "\u0120\u00c2\u00a3", "5", "\u0120\u00c2", "\u00a5", "6", "\u0120\u00e2\u0124", "\u00a3", "7", "\u0120\u00e2\u0124", "\u00b9", "8", "\u0120\u00e2\u0124", "\u00b1", "9", "\u0120test"], | |
| ids: [4780, 204, 15, 28, 382, 29, 204, 14, 30, 6471, 31, 5131, 32, 3068, 110, 33, 25631, 108, 34, 25631, 129, 35, 25631, 121, 36, 1318], | |
| decoded: "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", | |
| }, | |
| CURRENCY_WITH_DECIMALS: { | |
| text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS, | |
| tokens: ["I", "\u0120bought", "\u0120an", "\u0120apple", "\u0120for", "\u0120", "$", "1", ".", "00", "\u0120at", "\u0120the", "\u0120store", "."], | |
| ids: [52, 5659, 267, 12381, 312, 204, 15, 28, 25, 527, 388, 248, 2946, 25], | |
| decoded: "I bought an apple for $1.00 at the store.", | |
| }, | |
| ELLIPSIS: { | |
| text: BASE_TEST_STRINGS.ELLIPSIS, | |
| tokens: ["you", "\u00e2\u0122\u00a6", "\u0120\u0120"], | |
| ids: [5667, 898, 258], | |
| decoded: "you\u2026 ", | |
| }, | |
| TEXT_WITH_ESCAPE_CHARACTERS: { | |
| text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS, | |
| tokens: ["you", "\u00e2\u0122\u00a6", "\u00c2\u0142\u00c2\u0142"], | |
| ids: [5667, 898, 60482], | |
| decoded: "you\u2026\u00a0\u00a0", | |
| }, | |
| TEXT_WITH_ESCAPE_CHARACTERS_2: { | |
| text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2, | |
| tokens: ["you", "\u00e2\u0122\u00a6", "\u00c2\u0142", "\u00c2\u0142", "you", "\u00e2\u0122\u00a6", "\u00c2\u0142\u00c2\u0142"], | |
| ids: [5667, 898, 4381, 4381, 5667, 898, 60482], | |
| decoded: "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0", | |
| }, | |
| TILDE_NORMALIZATION: { | |
| text: BASE_TEST_STRINGS.TILDE_NORMALIZATION, | |
| tokens: ["we", "ird", "\u0120", "\u00ef", "\u00bd", "\u0140", "\u0120edge", "\u0120", "\u00ef", "\u00bd", "\u0140", "\u0120case"], | |
| ids: [698, 1505, 204, 181, 133, 236, 5753, 204, 181, 133, 236, 1494], | |
| decoded: "weird \uff5e edge \uff5e case", | |
| }, | |
| SPIECE_UNDERSCORE: { | |
| text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE, | |
| tokens: ["\u00e2\u0138", "\u0123", "This", "\u0120\u00e2\u0138", "\u0123", "is", "\u0120\u00e2\u0138", "\u0123", "a", "\u0120\u00e2\u0138", "\u0123", "test", "\u0120\u00e2\u0138", "\u0123", "."], | |
| ids: [13856, 207, 1182, 26607, 207, 259, 26607, 207, 76, 26607, 207, 4780, 26607, 207, 25], | |
| decoded: "\u2581This \u2581is \u2581a \u2581test \u2581.", | |
| }, | |
| NUMBERS_SPLIT: { | |
| text: FALCON_TEST_STRINGS.NUMBERS_SPLIT, | |
| tokens: ["12", "\u0120and", "\u0120", "123", "\u0120and", "\u0120", "123", "4"], | |
| ids: [928, 273, 204, 10963, 273, 204, 10963, 31], | |
| decoded: "12 and 123 and 1234", | |
| }, | |
| }, | |
| "tiiuae/falcon-rw-1b": { | |
| SIMPLE_WITH_PUNCTUATION: { | |
| text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION, | |
| tokens: ["You", "\u0120should", "'ve", "\u0120done", "\u0120this"], | |
| ids: [1639, 815, 1053, 1760, 428], | |
| decoded: "You should've done this", | |
| }, | |
| NUMBERS: { | |
| text: BASE_TEST_STRINGS.NUMBERS, | |
| tokens: ["01", "23", "45", "67", "89", "\u01200", "\u01201", "\u01202", "\u01203", "\u01204", "\u01205", "\u01206", "\u01207", "\u01208", "\u01209", "\u012010", "\u0120100", "\u01201000"], | |
| ids: [486, 1954, 2231, 3134, 4531, 657, 352, 362, 513, 604, 642, 718, 767, 807, 860, 838, 1802, 8576], | |
| decoded: "0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000", | |
| }, | |
| TEXT_WITH_NUMBERS: { | |
| text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS, | |
| tokens: ["The", "\u0120company", "\u0120was", "\u0120founded", "\u0120in", "\u01202016", "."], | |
| ids: [464, 1664, 373, 9393, 287, 1584, 13], | |
| decoded: "The company was founded in 2016.", | |
| }, | |
| PUNCTUATION: { | |
| text: BASE_TEST_STRINGS.PUNCTUATION, | |
| tokens: ["A", "\u010a", "'ll", "\u0120!!", "to", "?'", "d", "''", "d", "\u0120of", ",", "\u0120can", "'t", "."], | |
| ids: [32, 198, 1183, 37867, 1462, 8348, 67, 7061, 67, 286, 11, 460, 470, 13], | |
| decoded: "A\n'll!!to?'d''d of, can't.", | |
| }, | |
| JAVASCRIPT_CODE: { | |
| text: BASE_TEST_STRINGS.JAVASCRIPT_CODE, | |
| tokens: ["let", "\u0120a", "\u0120=", "\u0120obj", ".", "to", "String", "();", "\u010a", "to", "String", "();"], | |
| ids: [1616, 257, 796, 26181, 13, 1462, 10100, 9783, 198, 1462, 10100, 9783], | |
| decoded: "let a = obj.toString();\ntoString();", | |
| }, | |
| BASIC: { | |
| text: BASE_TEST_STRINGS.BASIC, | |
| tokens: ["UN", "want", "\u00c3\u00a9", "d", ",", "running"], | |
| ids: [4944, 42949, 2634, 67, 11, 20270], | |
| decoded: "UNwant\u00e9d,running", | |
| }, | |
| CONTROL_TOKENS: { | |
| text: BASE_TEST_STRINGS.CONTROL_TOKENS, | |
| tokens: ["1", "\u0100", "2", "\u00ef\u00bf\u00bd", "3"], | |
| ids: [16, 188, 17, 4210, 18], | |
| decoded: "1\u00002\ufffd3", | |
| }, | |
| CHINESE_ONLY: { | |
| text: BASE_TEST_STRINGS.CHINESE_ONLY, | |
| tokens: ["\u00e7\u0136\u0141", "\u00e6", "\u00b4", "\u00bb", "\u00e7\u013c\u0126", "\u00e7\u013e", "\u0141", "\u00e8", "\u00b0", "\u013d", "\u00e6\u013a\u00af"], | |
| ids: [37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 42468], | |
| decoded: "\u751f\u6d3b\u7684\u771f\u8c1b\u662f", | |
| }, | |
| LEADING_SPACE: { | |
| text: BASE_TEST_STRINGS.LEADING_SPACE, | |
| tokens: ["\u0120", "\u0120", "\u0120leading", "\u0120space"], | |
| ids: [220, 220, 3756, 2272], | |
| decoded: " leading space", | |
| }, | |
| TRAILING_SPACE: { | |
| text: BASE_TEST_STRINGS.TRAILING_SPACE, | |
| tokens: ["tra", "iling", "\u0120space", "\u0120", "\u0120", "\u0120"], | |
| ids: [9535, 4386, 2272, 220, 220, 220], | |
| decoded: "trailing space ", | |
| }, | |
| CURRENCY: { | |
| text: BASE_TEST_STRINGS.CURRENCY, | |
| tokens: ["test", "\u0120$", "1", "\u0120R", "2", "\u0120#", "3", "\u0120\u00e2\u0124\u00ac", "4", "\u0120\u00c2\u00a3", "5", "\u0120\u00c2\u00a5", "6", "\u0120\u00e2", "\u0124", "\u00a3", "7", "\u0120\u00e2", "\u0124", "\u00b9", "8", "\u0120\u00e2", "\u0124", "\u00b1", "9", "\u0120test"], | |
| ids: [9288, 720, 16, 371, 17, 1303, 18, 10432, 19, 4248, 20, 38221, 21, 2343, 224, 96, 22, 2343, 224, 117, 23, 2343, 224, 109, 24, 1332], | |
| decoded: "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test", | |
| }, | |
| CURRENCY_WITH_DECIMALS: { | |
| text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS, | |
| tokens: ["I", "\u0120bought", "\u0120an", "\u0120apple", "\u0120for", "\u0120$", "1", ".", "00", "\u0120at", "\u0120the", "\u0120store", "."], | |
| ids: [40, 5839, 281, 17180, 329, 720, 16, 13, 405, 379, 262, 3650, 13], | |
| decoded: "I bought an apple for $1.00 at the store.", | |
| }, | |
| ELLIPSIS: { | |
| text: BASE_TEST_STRINGS.ELLIPSIS, | |
| tokens: ["you", "\u00e2\u0122\u00a6", "\u0120", "\u0120"], | |
| ids: [5832, 1399, 220, 220], | |
| decoded: "you\u2026 ", | |
| }, | |
| TILDE_NORMALIZATION: { | |
| text: BASE_TEST_STRINGS.TILDE_NORMALIZATION, | |
| tokens: ["we", "ird", "\u0120\u00ef", "\u00bd", "\u0140", "\u0120edge", "\u0120\u00ef", "\u00bd", "\u0140", "\u0120case"], | |
| ids: [732, 1447, 27332, 121, 252, 5743, 27332, 121, 252, 1339], | |
| decoded: "weird \uff5e edge \uff5e case", | |
| }, | |
| NUMBERS_SPLIT: { | |
| text: FALCON_TEST_STRINGS.NUMBERS_SPLIT, | |
| tokens: ["12", "\u0120and", "\u0120123", "\u0120and", "\u012012", "34"], | |
| ids: [1065, 290, 17031, 290, 1105, 2682], | |
| decoded: "12 and 123 and 1234", | |
| }, | |
| }, | |
| }; | |