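// NOTE(review): the three lines below look like web-page residue (avatar alt
// text, commit message, short commit hash) rather than source code. They are
// kept verbatim but commented out so that the module parses.
// rnnandi's picture
// Add all files to convert gemma3 model to onnx
// ca97aa9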
import { FalconTokenizer } from "../../../src/tokenizers.js";
import { BASE_TEST_STRINGS, FALCON_TEST_STRINGS } from "../test_strings.js";
// Tokenizer class that the fixtures in this file describe. Exported under a
// generic name, presumably so a shared test runner can pick it up without
// knowing the concrete class (runner not visible here; confirm).
export const TOKENIZER_CLASS = FalconTokenizer;
/**
 * Expected tokenization results, keyed first by model id and then by test-case
 * name. Every test case has the same four fields:
 *
 * - `text`    - the input string, taken from the shared test-string tables.
 * - `tokens`  - the expected token strings in byte-level form, where each raw
 *               byte is shown as a printable character (e.g. `\u0120` is a
 *               space, `\u010a` a newline, `\u0109` a tab).
 * - `ids`     - the expected integer ids, parallel to `tokens`.
 * - `decoded` - the expected string after decoding `ids` back to text.
 *
 * `decoded` must keep runs of consecutive spaces exactly as the tokens encode
 * them (one space per `\u0120`). The whitespace-only cases (LEADING_SPACE,
 * TRAILING_SPACE, DOUBLE_SPACE, ELLIPSIS) exist to check precisely that, so
 * their expectations must not be whitespace-normalized.
 */
export const TEST_CONFIG = {
  "tiiuae/falcon-7b": {
    SIMPLE: {
      text: BASE_TEST_STRINGS.SIMPLE,
      tokens: ["How", "\u0120are", "\u0120you", "\u0120doing", "?"],
      ids: [1830, 362, 299, 1836, 42],
      decoded: "How are you doing?",
    },
    SIMPLE_WITH_PUNCTUATION: {
      text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION,
      tokens: ["You", "\u0120should", "'", "ve", "\u0120done", "\u0120this"],
      ids: [1357, 808, 18, 298, 1782, 414],
      decoded: "You should've done this",
    },
    NUMBERS: {
      text: BASE_TEST_STRINGS.NUMBERS,
      tokens: ["012", "345", "678", "9", "\u0120", "0", "\u0120", "1", "\u0120", "2", "\u0120", "3", "\u0120", "4", "\u0120", "5", "\u0120", "6", "\u0120", "7", "\u0120", "8", "\u0120", "9", "\u0120", "10", "\u0120", "100", "\u0120", "100", "0"],
      ids: [24445, 29094, 41583, 36, 204, 27, 204, 28, 204, 29, 204, 30, 204, 31, 204, 32, 204, 33, 204, 34, 204, 35, 204, 36, 204, 696, 204, 1425, 204, 1425, 27],
      decoded: "0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000",
    },
    TEXT_WITH_NUMBERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
      tokens: ["The", "\u0120company", "\u0120was", "\u0120founded", "\u0120in", "\u0120", "201", "6", "."],
      ids: [487, 1438, 398, 9923, 272, 204, 626, 33, 25],
      decoded: "The company was founded in 2016.",
    },
    PUNCTUATION: {
      text: BASE_TEST_STRINGS.PUNCTUATION,
      tokens: ["A", "\u010a", "'", "ll", "\u0120", "!!", "to", "?'", "d", "''", "d", "\u0120of", ",", "\u0120can", "'", "t", "."],
      ids: [44, 193, 18, 567, 204, 1409, 534, 12493, 79, 7544, 79, 275, 23, 418, 18, 95, 25],
      decoded: "A\n'll!!to?'d''d of, can't.",
    },
    PYTHON_CODE: {
      text: BASE_TEST_STRINGS.PYTHON_CODE,
      tokens: ["def", "\u0120main", "():", "\u010a", "\u0109", "pass"],
      ids: [3071, 1316, 13160, 193, 192, 5412],
      decoded: "def main():\n\tpass",
    },
    JAVASCRIPT_CODE: {
      text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
      tokens: ["let", "\u0120a", "\u0120", "=", "\u0120obj", ".", "toString", "();", "\u010a", "toString", "();"],
      ids: [1025, 241, 204, 40, 13756, 25, 19409, 2032, 193, 19409, 2032],
      decoded: "let a = obj.toString();\ntoString();",
    },
    NEWLINES: {
      text: BASE_TEST_STRINGS.NEWLINES,
      tokens: ["This", "\u010a", "\u010a", "is", "\u010a", "a", "\u010a", "test", "."],
      ids: [1182, 193, 193, 259, 193, 76, 193, 4780, 25],
      decoded: "This\n\nis\na\ntest.",
    },
    BASIC: {
      text: BASE_TEST_STRINGS.BASIC,
      tokens: ["UN", "want", "\u00c3\u00a9d", ",", "running"],
      ids: [4000, 32108, 5706, 23, 27386],
      decoded: "UNwant\u00e9d,running",
    },
    CONTROL_TOKENS: {
      text: BASE_TEST_STRINGS.CONTROL_TOKENS,
      tokens: ["1", "\u0100", "2", "\u00ef\u00bf", "\u00bd", "3"],
      ids: [28, 186, 29, 13112, 133, 30],
      decoded: "1\u00002\ufffd3",
    },
    HELLO_WORLD_TITLECASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE,
      tokens: ["Hello", "\u0120World"],
      ids: [9856, 2889],
      decoded: "Hello World",
    },
    HELLO_WORLD_LOWERCASE: {
      text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE,
      tokens: ["hello", "\u0120world"],
      ids: [30835, 1079],
      decoded: "hello world",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      tokens: ["\u00e7\u0136\u0141\u00e6\u00b4\u00bb", "\u00e7\u013c\u0126", "\u00e7\u013e\u0141", "\u00e8\u00b0", "\u013d", "\u00e6\u013a\u00af"],
      ids: [32725, 1105, 15498, 8061, 233, 2364],
      decoded: "\u751f\u6d3b\u7684\u771f\u8c1b\u662f",
    },
    LEADING_SPACE: {
      text: BASE_TEST_STRINGS.LEADING_SPACE,
      tokens: ["\u0120\u0120", "\u0120leading", "\u0120space"],
      ids: [258, 3736, 2151],
      // Three leading spaces: two from "\u0120\u0120" plus one from "\u0120leading".
      decoded: "   leading space",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      tokens: ["tra", "iling", "\u0120space", "\u0120\u0120\u0120"],
      ids: [9172, 4447, 2151, 466],
      // Three trailing spaces, one per "\u0120" in the final token.
      decoded: "trailing space   ",
    },
    DOUBLE_SPACE: {
      text: BASE_TEST_STRINGS.DOUBLE_SPACE,
      tokens: ["Hi", "\u0120", "\u0120Hello"],
      ids: [5516, 204, 23090],
      // Two spaces: the standalone "\u0120" plus the one prefixed to "Hello".
      decoded: "Hi  Hello",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      tokens: ["test", "\u0120", "$", "1", "\u0120R", "2", "\u0120", "#", "3", "\u0120\u00e2\u0124\u00ac", "4", "\u0120\u00c2\u00a3", "5", "\u0120\u00c2", "\u00a5", "6", "\u0120\u00e2\u0124", "\u00a3", "7", "\u0120\u00e2\u0124", "\u00b9", "8", "\u0120\u00e2\u0124", "\u00b1", "9", "\u0120test"],
      ids: [4780, 204, 15, 28, 382, 29, 204, 14, 30, 6471, 31, 5131, 32, 3068, 110, 33, 25631, 108, 34, 25631, 129, 35, 25631, 121, 36, 1318],
      decoded: "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test",
    },
    CURRENCY_WITH_DECIMALS: {
      text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
      tokens: ["I", "\u0120bought", "\u0120an", "\u0120apple", "\u0120for", "\u0120", "$", "1", ".", "00", "\u0120at", "\u0120the", "\u0120store", "."],
      ids: [52, 5659, 267, 12381, 312, 204, 15, 28, 25, 527, 388, 248, 2946, 25],
      decoded: "I bought an apple for $1.00 at the store.",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u0120\u0120"],
      ids: [5667, 898, 258],
      // Two trailing spaces, one per "\u0120" in the final token.
      decoded: "you\u2026  ",
    },
    TEXT_WITH_ESCAPE_CHARACTERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u00c2\u0142\u00c2\u0142"],
      ids: [5667, 898, 60482],
      decoded: "you\u2026\u00a0\u00a0",
    },
    TEXT_WITH_ESCAPE_CHARACTERS_2: {
      text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u00c2\u0142", "\u00c2\u0142", "you", "\u00e2\u0122\u00a6", "\u00c2\u0142\u00c2\u0142"],
      ids: [5667, 898, 4381, 4381, 5667, 898, 60482],
      decoded: "you\u2026\u00a0\u00a0you\u2026\u00a0\u00a0",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      tokens: ["we", "ird", "\u0120", "\u00ef", "\u00bd", "\u0140", "\u0120edge", "\u0120", "\u00ef", "\u00bd", "\u0140", "\u0120case"],
      ids: [698, 1505, 204, 181, 133, 236, 5753, 204, 181, 133, 236, 1494],
      decoded: "weird \uff5e edge \uff5e case",
    },
    SPIECE_UNDERSCORE: {
      text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE,
      tokens: ["\u00e2\u0138", "\u0123", "This", "\u0120\u00e2\u0138", "\u0123", "is", "\u0120\u00e2\u0138", "\u0123", "a", "\u0120\u00e2\u0138", "\u0123", "test", "\u0120\u00e2\u0138", "\u0123", "."],
      ids: [13856, 207, 1182, 26607, 207, 259, 26607, 207, 76, 26607, 207, 4780, 26607, 207, 25],
      decoded: "\u2581This \u2581is \u2581a \u2581test \u2581.",
    },
    NUMBERS_SPLIT: {
      text: FALCON_TEST_STRINGS.NUMBERS_SPLIT,
      tokens: ["12", "\u0120and", "\u0120", "123", "\u0120and", "\u0120", "123", "4"],
      ids: [928, 273, 204, 10963, 273, 204, 10963, 31],
      decoded: "12 and 123 and 1234",
    },
  },
  "tiiuae/falcon-rw-1b": {
    SIMPLE_WITH_PUNCTUATION: {
      text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION,
      tokens: ["You", "\u0120should", "'ve", "\u0120done", "\u0120this"],
      ids: [1639, 815, 1053, 1760, 428],
      decoded: "You should've done this",
    },
    NUMBERS: {
      text: BASE_TEST_STRINGS.NUMBERS,
      tokens: ["01", "23", "45", "67", "89", "\u01200", "\u01201", "\u01202", "\u01203", "\u01204", "\u01205", "\u01206", "\u01207", "\u01208", "\u01209", "\u012010", "\u0120100", "\u01201000"],
      ids: [486, 1954, 2231, 3134, 4531, 657, 352, 362, 513, 604, 642, 718, 767, 807, 860, 838, 1802, 8576],
      decoded: "0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000",
    },
    TEXT_WITH_NUMBERS: {
      text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
      tokens: ["The", "\u0120company", "\u0120was", "\u0120founded", "\u0120in", "\u01202016", "."],
      ids: [464, 1664, 373, 9393, 287, 1584, 13],
      decoded: "The company was founded in 2016.",
    },
    PUNCTUATION: {
      text: BASE_TEST_STRINGS.PUNCTUATION,
      tokens: ["A", "\u010a", "'ll", "\u0120!!", "to", "?'", "d", "''", "d", "\u0120of", ",", "\u0120can", "'t", "."],
      ids: [32, 198, 1183, 37867, 1462, 8348, 67, 7061, 67, 286, 11, 460, 470, 13],
      decoded: "A\n'll!!to?'d''d of, can't.",
    },
    JAVASCRIPT_CODE: {
      text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
      tokens: ["let", "\u0120a", "\u0120=", "\u0120obj", ".", "to", "String", "();", "\u010a", "to", "String", "();"],
      ids: [1616, 257, 796, 26181, 13, 1462, 10100, 9783, 198, 1462, 10100, 9783],
      decoded: "let a = obj.toString();\ntoString();",
    },
    BASIC: {
      text: BASE_TEST_STRINGS.BASIC,
      tokens: ["UN", "want", "\u00c3\u00a9", "d", ",", "running"],
      ids: [4944, 42949, 2634, 67, 11, 20270],
      decoded: "UNwant\u00e9d,running",
    },
    CONTROL_TOKENS: {
      text: BASE_TEST_STRINGS.CONTROL_TOKENS,
      tokens: ["1", "\u0100", "2", "\u00ef\u00bf\u00bd", "3"],
      ids: [16, 188, 17, 4210, 18],
      decoded: "1\u00002\ufffd3",
    },
    CHINESE_ONLY: {
      text: BASE_TEST_STRINGS.CHINESE_ONLY,
      tokens: ["\u00e7\u0136\u0141", "\u00e6", "\u00b4", "\u00bb", "\u00e7\u013c\u0126", "\u00e7\u013e", "\u0141", "\u00e8", "\u00b0", "\u013d", "\u00e6\u013a\u00af"],
      ids: [37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 42468],
      decoded: "\u751f\u6d3b\u7684\u771f\u8c1b\u662f",
    },
    LEADING_SPACE: {
      text: BASE_TEST_STRINGS.LEADING_SPACE,
      tokens: ["\u0120", "\u0120", "\u0120leading", "\u0120space"],
      ids: [220, 220, 3756, 2272],
      // Three leading spaces: two standalone "\u0120" plus one from "\u0120leading".
      decoded: "   leading space",
    },
    TRAILING_SPACE: {
      text: BASE_TEST_STRINGS.TRAILING_SPACE,
      tokens: ["tra", "iling", "\u0120space", "\u0120", "\u0120", "\u0120"],
      ids: [9535, 4386, 2272, 220, 220, 220],
      // Three trailing spaces, one per standalone "\u0120" token.
      decoded: "trailing space   ",
    },
    CURRENCY: {
      text: BASE_TEST_STRINGS.CURRENCY,
      tokens: ["test", "\u0120$", "1", "\u0120R", "2", "\u0120#", "3", "\u0120\u00e2\u0124\u00ac", "4", "\u0120\u00c2\u00a3", "5", "\u0120\u00c2\u00a5", "6", "\u0120\u00e2", "\u0124", "\u00a3", "7", "\u0120\u00e2", "\u0124", "\u00b9", "8", "\u0120\u00e2", "\u0124", "\u00b1", "9", "\u0120test"],
      ids: [9288, 720, 16, 371, 17, 1303, 18, 10432, 19, 4248, 20, 38221, 21, 2343, 224, 96, 22, 2343, 224, 117, 23, 2343, 224, 109, 24, 1332],
      decoded: "test $1 R2 #3 \u20ac4 \u00a35 \u00a56 \u20a37 \u20b98 \u20b19 test",
    },
    CURRENCY_WITH_DECIMALS: {
      text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
      tokens: ["I", "\u0120bought", "\u0120an", "\u0120apple", "\u0120for", "\u0120$", "1", ".", "00", "\u0120at", "\u0120the", "\u0120store", "."],
      ids: [40, 5839, 281, 17180, 329, 720, 16, 13, 405, 379, 262, 3650, 13],
      decoded: "I bought an apple for $1.00 at the store.",
    },
    ELLIPSIS: {
      text: BASE_TEST_STRINGS.ELLIPSIS,
      tokens: ["you", "\u00e2\u0122\u00a6", "\u0120", "\u0120"],
      ids: [5832, 1399, 220, 220],
      // Two trailing spaces, one per standalone "\u0120" token.
      decoded: "you\u2026  ",
    },
    TILDE_NORMALIZATION: {
      text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
      tokens: ["we", "ird", "\u0120\u00ef", "\u00bd", "\u0140", "\u0120edge", "\u0120\u00ef", "\u00bd", "\u0140", "\u0120case"],
      ids: [732, 1447, 27332, 121, 252, 5743, 27332, 121, 252, 1339],
      decoded: "weird \uff5e edge \uff5e case",
    },
    NUMBERS_SPLIT: {
      text: FALCON_TEST_STRINGS.NUMBERS_SPLIT,
      tokens: ["12", "\u0120and", "\u0120123", "\u0120and", "\u012012", "34"],
      ids: [1065, 290, 17031, 290, 1105, 2682],
      decoded: "12 and 123 and 1234",
    },
  },
};