rnnandi's picture
Add all files to convert gemma3 model to onnx
ca97aa9
import { AlbertTokenizer } from "../../../src/tokenizers.js";
import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js";
export const TOKENIZER_CLASS = AlbertTokenizer;
export const TEST_CONFIG = {
// - uses `StripAccents` normalizer
"Xenova/albert-base-v2": {
SIMPLE: {
text: BASE_TEST_STRINGS.SIMPLE,
tokens: ["\u2581how", "\u2581are", "\u2581you", "\u2581doing", "?"],
ids: [2, 184, 50, 42, 845, 60, 3],
decoded: "[CLS] how are you doing?[SEP]",
},
SIMPLE_WITH_PUNCTUATION: {
text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION,
tokens: ["\u2581you", "\u2581should", "'", "ve", "\u2581done", "\u2581this"],
ids: [2, 42, 378, 22, 195, 677, 48, 3],
decoded: "[CLS] you should've done this[SEP]",
},
NUMBERS: {
text: BASE_TEST_STRINGS.NUMBERS,
tokens: ["\u25810", "12", "345", "67", "89", "\u25810", "\u25811", "\u25812", "\u25813", "\u25814", "\u25815", "\u25816", "\u25817", "\u25818", "\u25819", "\u258110", "\u2581100", "\u25811000"],
ids: [2, 713, 918, 21997, 4167, 3877, 713, 137, 172, 203, 268, 331, 400, 453, 469, 561, 332, 808, 6150, 3],
decoded: "[CLS] 0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000[SEP]",
},
TEXT_WITH_NUMBERS: {
text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS,
tokens: ["\u2581the", "\u2581company", "\u2581was", "\u2581founded", "\u2581in", "\u25812016", "."],
ids: [2, 14, 237, 23, 785, 19, 690, 9, 3],
decoded: "[CLS] the company was founded in 2016.[SEP]",
},
PUNCTUATION: {
text: BASE_TEST_STRINGS.PUNCTUATION,
tokens: ["\u2581a", "\u2581", "'", "ll", "\u2581", "!!", "to", "?'", "d", '"', "d", "\u2581of", ",", "\u2581can", "'", "t", "."],
ids: [2, 21, 13, 22, 211, 13, 19015, 262, 5663, 43, 7, 43, 16, 15, 92, 22, 38, 9, 3],
decoded: "[CLS] a 'll!!to?'d\"d of, can't.[SEP]",
},
PYTHON_CODE: {
text: BASE_TEST_STRINGS.PYTHON_CODE,
tokens: ["\u2581def", "\u2581main", "(", ")", ":", "\u2581pass"],
ids: [2, 6312, 407, 5, 6, 45, 1477, 3],
decoded: "[CLS] def main(): pass[SEP]",
},
JAVASCRIPT_CODE: {
text: BASE_TEST_STRINGS.JAVASCRIPT_CODE,
tokens: ["\u2581let", "\u2581a", "\u2581=", "\u2581ob", "j", ".", "to", "string", "(", ")", ";", "\u2581to", "string", "(", ")", ";"],
ids: [2, 408, 21, 800, 5122, 728, 9, 262, 11130, 5, 6, 73, 20, 11130, 5, 6, 73, 3],
decoded: "[CLS] let a = obj.tostring(); tostring();[SEP]",
},
NEWLINES: {
text: BASE_TEST_STRINGS.NEWLINES,
tokens: ["\u2581this", "\u2581is", "\u2581a", "\u2581test", "."],
ids: [2, 48, 25, 21, 1289, 9, 3],
decoded: "[CLS] this is a test.[SEP]",
},
BASIC: {
text: BASE_TEST_STRINGS.BASIC,
tokens: ["\u2581unwanted", ",", "running"],
ids: [2, 21095, 15, 11325, 3],
decoded: "[CLS] unwanted,running[SEP]",
},
CONTROL_TOKENS: {
text: BASE_TEST_STRINGS.CONTROL_TOKENS,
tokens: ["\u25811", "\u0000", "2", "\u25813"],
ids: [2, 137, 1, 135, 203, 3],
decoded: "[CLS] 1<unk>2 3[SEP]",
},
HELLO_WORLD_TITLECASE: {
text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE,
tokens: ["\u2581hello", "\u2581world"],
ids: [2, 10975, 126, 3],
decoded: "[CLS] hello world[SEP]",
},
HELLO_WORLD_LOWERCASE: {
text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE,
tokens: ["\u2581hello", "\u2581world"],
ids: [2, 10975, 126, 3],
decoded: "[CLS] hello world[SEP]",
},
CHINESE_ONLY: {
text: BASE_TEST_STRINGS.CHINESE_ONLY,
tokens: ["\u2581", "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"],
ids: [2, 13, 1, 3],
decoded: "[CLS] <unk>[SEP]",
},
LEADING_SPACE: {
text: BASE_TEST_STRINGS.LEADING_SPACE,
tokens: ["\u2581leading", "\u2581space"],
ids: [2, 1005, 726, 3],
decoded: "[CLS] leading space[SEP]",
},
TRAILING_SPACE: {
text: BASE_TEST_STRINGS.TRAILING_SPACE,
tokens: ["\u2581trailing", "\u2581space"],
ids: [2, 14323, 726, 3],
decoded: "[CLS] trailing space[SEP]",
},
DOUBLE_SPACE: {
text: BASE_TEST_STRINGS.DOUBLE_SPACE,
tokens: ["\u2581hi", "\u2581hello"],
ids: [2, 4148, 10975, 3],
decoded: "[CLS] hi hello[SEP]",
},
CURRENCY: {
text: BASE_TEST_STRINGS.CURRENCY,
tokens: ["\u2581test", "\u2581$1", "\u2581r", "2", "\u2581#3", "\u2581", "\u20ac", "4", "\u2581", "\u00a3", "5", "\u2581", "\u00a5", "6", "\u2581", "\u20a3", "7", "\u2581", "\u20b9", "8", "\u2581", "\u20b1", "9", "\u2581test"],
ids: [2, 1289, 3742, 761, 135, 11489, 13, 12, 300, 13, 11, 264, 13, 1, 379, 13, 1, 465, 13, 1, 457, 13, 1, 518, 1289, 3],
decoded: "[CLS] test $1 r2 #3 \u20ac4 \u00a35 <unk>6 <unk>7 <unk>8 <unk>9 test[SEP]",
},
CURRENCY_WITH_DECIMALS: {
text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS,
tokens: ["\u2581i", "\u2581bought", "\u2581an", "\u2581apple", "\u2581for", "\u2581$1", ".", "00", "\u2581at", "\u2581the", "\u2581store", "."],
ids: [2, 31, 2448, 40, 4037, 26, 3742, 9, 2032, 35, 14, 1718, 9, 3],
decoded: "[CLS] i bought an apple for $1.00 at the store.[SEP]",
},
ELLIPSIS: {
text: BASE_TEST_STRINGS.ELLIPSIS,
tokens: ["\u2581you", ".", ".", "."],
ids: [2, 42, 9, 9, 9, 3],
decoded: "[CLS] you...[SEP]",
},
TEXT_WITH_ESCAPE_CHARACTERS: {
text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS,
tokens: ["\u2581you", ".", ".", "."],
ids: [2, 42, 9, 9, 9, 3],
decoded: "[CLS] you...[SEP]",
},
TEXT_WITH_ESCAPE_CHARACTERS_2: {
text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2,
tokens: ["\u2581you", ".", ".", ".", "\u2581you", ".", ".", "."],
ids: [2, 42, 9, 9, 9, 42, 9, 9, 9, 3],
decoded: "[CLS] you... you...[SEP]",
},
TILDE_NORMALIZATION: {
text: BASE_TEST_STRINGS.TILDE_NORMALIZATION,
tokens: ["\u2581weird", "\u2581", "~", "\u2581edge", "\u2581", "~", "\u2581case"],
ids: [2, 5455, 13, 1, 1407, 13, 1, 610, 3],
decoded: "[CLS] weird <unk> edge <unk> case[SEP]",
},
SPIECE_UNDERSCORE: {
text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE,
tokens: ["\u2581this", "\u2581is", "\u2581a", "\u2581test", "\u2581", "."],
ids: [2, 48, 25, 21, 1289, 13, 9, 3],
decoded: "[CLS] this is a test.[SEP]",
},
POPULAR_EMOJIS: {
text: BASE_TEST_STRINGS.POPULAR_EMOJIS,
tokens: ["\u2581", "\ud83d\ude02", "\u2581", "\ud83d\udc4d", "\u2581", "\ud83e\udd23", "\u2581", "\ud83d\ude0d", "\u2581", "\ud83d\ude2d", "\u2581", "\ud83c\udf89", "\u2581", "\ud83d\ude4f", "\u2581", "\ud83d\ude0a", "\u2581", "\ud83d\udd25", "\u2581", "\ud83d\ude01", "\u2581", "\ud83d\ude05", "\u2581", "\ud83e\udd17", "\u2581", "\ud83d\ude06", "\u2581", "\ud83d\udc4f", "\u2581", "\u2764", "\u2581", "\ud83d\udc9c", "\u2581", "\ud83d\udc9a", "\u2581", "\ud83d\udc97", "\u2581", "\ud83d\udc99", "\u2581", "\ud83d\udda4", "\u2581", "\ud83d\ude0e", "\u2581", "\ud83d\udc4c", "\u2581", "\ud83e\udd73", "\u2581", "\ud83d\udcaa", "\u2581", "\u2728", "\u2581", "\ud83d\udc49", "\u2581", "\ud83d\udc40", "\u2581", "\ud83d\udcaf", "\u2581", "\ud83c\udf88", "\u2581", "\ud83d\ude48", "\u2581", "\ud83d\ude4c", "\u2581", "\ud83d\udc80", "\u2581", "\ud83d\udc47", "\u2581", "\ud83d\udc4b", "\u2581", "\u2705", "\u2581", "\ud83c\udf81", "\u2581", "\ud83c\udf1e", "\u2581", "\ud83c\udf38", "\u2581", "\ud83d\udcb0"],
ids: [2, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 3],
decoded: "[CLS] <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>[SEP]",
},
MULTIBYTE_EMOJIS: {
text: BASE_TEST_STRINGS.MULTIBYTE_EMOJIS,
tokens: ["\u2581", "\u2728", "\u2581", "\ud83e\udd17", "\u2581", "\ud83d\udc41", "\u2581", "\ud83d\udc71\ud83c\udffb", "\u2581", "\ud83d\udd75", "\u2581", "\u2642", "\u2581", "\ud83e\uddd9\ud83c\udffb", "\u2581", "\u2642", "\u2581", "\ud83d\udc68\ud83c\udffb", "\u2581", "\ud83c\udf3e", "\u2581", "\ud83e\uddd1", "\u2581", "\ud83e\udd1d", "\u2581", "\ud83e\uddd1", "\u2581", "\ud83d\udc69", "\u2581", "\u2764", "\u2581", "\ud83d\udc8b", "\u2581", "\ud83d\udc68", "\u2581", "\ud83d\udc69", "\u2581", "\ud83d\udc69", "\u2581", "\ud83d\udc67", "\u2581", "\ud83d\udc66", "\u2581", "\ud83e\uddd1\ud83c\udffb", "\u2581", "\ud83e\udd1d", "\u2581", "\ud83e\uddd1\ud83c\udffb", "\u2581", "\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f", "\u2581", "\ud83d\udc68\ud83c\udffb", "\u2581", "\u2764", "\u2581", "\ud83d\udc8b", "\u2581", "\ud83d\udc68\ud83c\udffc"],
ids: [2, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 3],
decoded: "[CLS] <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>[SEP]",
},
CHINESE_LATIN_MIXED: {
text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED,
tokens: ["\u2581", "ah", "\u535a\u63a8", "zz"],
ids: [2, 13, 1307, 1, 5092, 3],
decoded: "[CLS] ah<unk>zz[SEP]",
},
SIMPLE_WITH_ACCENTS: {
text: BERT_TEST_STRINGS.SIMPLE_WITH_ACCENTS,
tokens: ["\u2581hello"],
ids: [2, 10975, 3],
decoded: "[CLS] hello[SEP]",
},
MIXED_CASE_WITHOUT_ACCENTS: {
text: BERT_TEST_STRINGS.MIXED_CASE_WITHOUT_ACCENTS,
tokens: ["\u2581hello", "!", "how", "\u2581are", "\u2581you", "?"],
ids: [2, 10975, 187, 1544, 50, 42, 60, 3],
decoded: "[CLS] hello!how are you?[SEP]",
},
MIXED_CASE_WITH_ACCENTS: {
text: BERT_TEST_STRINGS.MIXED_CASE_WITH_ACCENTS,
tokens: ["\u2581hall", "o", "!", "how", "\u2581are", "\u2581you", "?"],
ids: [2, 554, 111, 187, 1544, 50, 42, 60, 3],
decoded: "[CLS] hallo!how are you?[SEP]",
},
},
};