import { AlbertTokenizer } from "../../../src/tokenizers.js"; import { BASE_TEST_STRINGS, BERT_TEST_STRINGS } from "../test_strings.js"; export const TOKENIZER_CLASS = AlbertTokenizer; export const TEST_CONFIG = { // - uses `StripAccents` normalizer "Xenova/albert-base-v2": { SIMPLE: { text: BASE_TEST_STRINGS.SIMPLE, tokens: ["\u2581how", "\u2581are", "\u2581you", "\u2581doing", "?"], ids: [2, 184, 50, 42, 845, 60, 3], decoded: "[CLS] how are you doing?[SEP]", }, SIMPLE_WITH_PUNCTUATION: { text: BASE_TEST_STRINGS.SIMPLE_WITH_PUNCTUATION, tokens: ["\u2581you", "\u2581should", "'", "ve", "\u2581done", "\u2581this"], ids: [2, 42, 378, 22, 195, 677, 48, 3], decoded: "[CLS] you should've done this[SEP]", }, NUMBERS: { text: BASE_TEST_STRINGS.NUMBERS, tokens: ["\u25810", "12", "345", "67", "89", "\u25810", "\u25811", "\u25812", "\u25813", "\u25814", "\u25815", "\u25816", "\u25817", "\u25818", "\u25819", "\u258110", "\u2581100", "\u25811000"], ids: [2, 713, 918, 21997, 4167, 3877, 713, 137, 172, 203, 268, 331, 400, 453, 469, 561, 332, 808, 6150, 3], decoded: "[CLS] 0123456789 0 1 2 3 4 5 6 7 8 9 10 100 1000[SEP]", }, TEXT_WITH_NUMBERS: { text: BASE_TEST_STRINGS.TEXT_WITH_NUMBERS, tokens: ["\u2581the", "\u2581company", "\u2581was", "\u2581founded", "\u2581in", "\u25812016", "."], ids: [2, 14, 237, 23, 785, 19, 690, 9, 3], decoded: "[CLS] the company was founded in 2016.[SEP]", }, PUNCTUATION: { text: BASE_TEST_STRINGS.PUNCTUATION, tokens: ["\u2581a", "\u2581", "'", "ll", "\u2581", "!!", "to", "?'", "d", '"', "d", "\u2581of", ",", "\u2581can", "'", "t", "."], ids: [2, 21, 13, 22, 211, 13, 19015, 262, 5663, 43, 7, 43, 16, 15, 92, 22, 38, 9, 3], decoded: "[CLS] a 'll!!to?'d\"d of, can't.[SEP]", }, PYTHON_CODE: { text: BASE_TEST_STRINGS.PYTHON_CODE, tokens: ["\u2581def", "\u2581main", "(", ")", ":", "\u2581pass"], ids: [2, 6312, 407, 5, 6, 45, 1477, 3], decoded: "[CLS] def main(): pass[SEP]", }, JAVASCRIPT_CODE: { text: BASE_TEST_STRINGS.JAVASCRIPT_CODE, tokens: ["\u2581let", "\u2581a", "\u2581=", "\u2581ob", "j", ".", "to", "string", "(", ")", ";", "\u2581to", "string", "(", ")", ";"], ids: [2, 408, 21, 800, 5122, 728, 9, 262, 11130, 5, 6, 73, 20, 11130, 5, 6, 73, 3], decoded: "[CLS] let a = obj.tostring(); tostring();[SEP]", }, NEWLINES: { text: BASE_TEST_STRINGS.NEWLINES, tokens: ["\u2581this", "\u2581is", "\u2581a", "\u2581test", "."], ids: [2, 48, 25, 21, 1289, 9, 3], decoded: "[CLS] this is a test.[SEP]", }, BASIC: { text: BASE_TEST_STRINGS.BASIC, tokens: ["\u2581unwanted", ",", "running"], ids: [2, 21095, 15, 11325, 3], decoded: "[CLS] unwanted,running[SEP]", }, CONTROL_TOKENS: { text: BASE_TEST_STRINGS.CONTROL_TOKENS, tokens: ["\u25811", "\u0000", "2", "\u25813"], ids: [2, 137, 1, 135, 203, 3], decoded: "[CLS] 12 3[SEP]", }, HELLO_WORLD_TITLECASE: { text: BASE_TEST_STRINGS.HELLO_WORLD_TITLECASE, tokens: ["\u2581hello", "\u2581world"], ids: [2, 10975, 126, 3], decoded: "[CLS] hello world[SEP]", }, HELLO_WORLD_LOWERCASE: { text: BASE_TEST_STRINGS.HELLO_WORLD_LOWERCASE, tokens: ["\u2581hello", "\u2581world"], ids: [2, 10975, 126, 3], decoded: "[CLS] hello world[SEP]", }, CHINESE_ONLY: { text: BASE_TEST_STRINGS.CHINESE_ONLY, tokens: ["\u2581", "\u751f\u6d3b\u7684\u771f\u8c1b\u662f"], ids: [2, 13, 1, 3], decoded: "[CLS] [SEP]", }, LEADING_SPACE: { text: BASE_TEST_STRINGS.LEADING_SPACE, tokens: ["\u2581leading", "\u2581space"], ids: [2, 1005, 726, 3], decoded: "[CLS] leading space[SEP]", }, TRAILING_SPACE: { text: BASE_TEST_STRINGS.TRAILING_SPACE, tokens: ["\u2581trailing", "\u2581space"], ids: [2, 14323, 726, 3], decoded: "[CLS] trailing space[SEP]", }, DOUBLE_SPACE: { text: BASE_TEST_STRINGS.DOUBLE_SPACE, tokens: ["\u2581hi", "\u2581hello"], ids: [2, 4148, 10975, 3], decoded: "[CLS] hi hello[SEP]", }, CURRENCY: { text: BASE_TEST_STRINGS.CURRENCY, tokens: ["\u2581test", "\u2581$1", "\u2581r", "2", "\u2581#3", "\u2581", "\u20ac", "4", "\u2581", "\u00a3", "5", "\u2581", "\u00a5", "6", "\u2581", "\u20a3", "7", "\u2581", "\u20b9", "8", "\u2581", "\u20b1", "9", "\u2581test"], ids: [2, 1289, 3742, 761, 135, 11489, 13, 12, 300, 13, 11, 264, 13, 1, 379, 13, 1, 465, 13, 1, 457, 13, 1, 518, 1289, 3], decoded: "[CLS] test $1 r2 #3 \u20ac4 \u00a35 6 7 8 9 test[SEP]", }, CURRENCY_WITH_DECIMALS: { text: BASE_TEST_STRINGS.CURRENCY_WITH_DECIMALS, tokens: ["\u2581i", "\u2581bought", "\u2581an", "\u2581apple", "\u2581for", "\u2581$1", ".", "00", "\u2581at", "\u2581the", "\u2581store", "."], ids: [2, 31, 2448, 40, 4037, 26, 3742, 9, 2032, 35, 14, 1718, 9, 3], decoded: "[CLS] i bought an apple for $1.00 at the store.[SEP]", }, ELLIPSIS: { text: BASE_TEST_STRINGS.ELLIPSIS, tokens: ["\u2581you", ".", ".", "."], ids: [2, 42, 9, 9, 9, 3], decoded: "[CLS] you...[SEP]", }, TEXT_WITH_ESCAPE_CHARACTERS: { text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS, tokens: ["\u2581you", ".", ".", "."], ids: [2, 42, 9, 9, 9, 3], decoded: "[CLS] you...[SEP]", }, TEXT_WITH_ESCAPE_CHARACTERS_2: { text: BASE_TEST_STRINGS.TEXT_WITH_ESCAPE_CHARACTERS_2, tokens: ["\u2581you", ".", ".", ".", "\u2581you", ".", ".", "."], ids: [2, 42, 9, 9, 9, 42, 9, 9, 9, 3], decoded: "[CLS] you... you...[SEP]", }, TILDE_NORMALIZATION: { text: BASE_TEST_STRINGS.TILDE_NORMALIZATION, tokens: ["\u2581weird", "\u2581", "~", "\u2581edge", "\u2581", "~", "\u2581case"], ids: [2, 5455, 13, 1, 1407, 13, 1, 610, 3], decoded: "[CLS] weird edge case[SEP]", }, SPIECE_UNDERSCORE: { text: BASE_TEST_STRINGS.SPIECE_UNDERSCORE, tokens: ["\u2581this", "\u2581is", "\u2581a", "\u2581test", "\u2581", "."], ids: [2, 48, 25, 21, 1289, 13, 9, 3], decoded: "[CLS] this is a test.[SEP]", }, POPULAR_EMOJIS: { text: BASE_TEST_STRINGS.POPULAR_EMOJIS, tokens: ["\u2581", "\ud83d\ude02", "\u2581", "\ud83d\udc4d", "\u2581", "\ud83e\udd23", "\u2581", "\ud83d\ude0d", "\u2581", "\ud83d\ude2d", "\u2581", "\ud83c\udf89", "\u2581", "\ud83d\ude4f", "\u2581", "\ud83d\ude0a", "\u2581", "\ud83d\udd25", "\u2581", "\ud83d\ude01", "\u2581", "\ud83d\ude05", "\u2581", "\ud83e\udd17", "\u2581", "\ud83d\ude06", "\u2581", "\ud83d\udc4f", "\u2581", "\u2764", "\u2581", "\ud83d\udc9c", "\u2581", "\ud83d\udc9a", "\u2581", "\ud83d\udc97", "\u2581", "\ud83d\udc99", "\u2581", "\ud83d\udda4", "\u2581", "\ud83d\ude0e", "\u2581", "\ud83d\udc4c", "\u2581", "\ud83e\udd73", "\u2581", "\ud83d\udcaa", "\u2581", "\u2728", "\u2581", "\ud83d\udc49", "\u2581", "\ud83d\udc40", "\u2581", "\ud83d\udcaf", "\u2581", "\ud83c\udf88", "\u2581", "\ud83d\ude48", "\u2581", "\ud83d\ude4c", "\u2581", "\ud83d\udc80", "\u2581", "\ud83d\udc47", "\u2581", "\ud83d\udc4b", "\u2581", "\u2705", "\u2581", "\ud83c\udf81", "\u2581", "\ud83c\udf1e", "\u2581", "\ud83c\udf38", "\u2581", "\ud83d\udcb0"], ids: [2, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 3], decoded: "[CLS] [SEP]", }, MULTIBYTE_EMOJIS: { text: BASE_TEST_STRINGS.MULTIBYTE_EMOJIS, tokens: ["\u2581", "\u2728", "\u2581", "\ud83e\udd17", "\u2581", "\ud83d\udc41", "\u2581", "\ud83d\udc71\ud83c\udffb", "\u2581", "\ud83d\udd75", "\u2581", "\u2642", "\u2581", "\ud83e\uddd9\ud83c\udffb", "\u2581", "\u2642", "\u2581", "\ud83d\udc68\ud83c\udffb", "\u2581", "\ud83c\udf3e", "\u2581", "\ud83e\uddd1", "\u2581", "\ud83e\udd1d", "\u2581", "\ud83e\uddd1", "\u2581", "\ud83d\udc69", "\u2581", "\u2764", "\u2581", "\ud83d\udc8b", "\u2581", "\ud83d\udc68", "\u2581", "\ud83d\udc69", "\u2581", "\ud83d\udc69", "\u2581", "\ud83d\udc67", "\u2581", "\ud83d\udc66", "\u2581", "\ud83e\uddd1\ud83c\udffb", "\u2581", "\ud83e\udd1d", "\u2581", "\ud83e\uddd1\ud83c\udffb", "\u2581", "\ud83c\udff4\udb40\udc67\udb40\udc62\udb40\udc65\udb40\udc6e\udb40\udc67\udb40\udc7f", "\u2581", "\ud83d\udc68\ud83c\udffb", "\u2581", "\u2764", "\u2581", "\ud83d\udc8b", "\u2581", "\ud83d\udc68\ud83c\udffc"], ids: [2, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 13, 1, 3], decoded: "[CLS] [SEP]", }, CHINESE_LATIN_MIXED: { text: BERT_TEST_STRINGS.CHINESE_LATIN_MIXED, tokens: ["\u2581", "ah", "\u535a\u63a8", "zz"], ids: [2, 13, 1307, 1, 5092, 3], decoded: "[CLS] ahzz[SEP]", }, SIMPLE_WITH_ACCENTS: { text: BERT_TEST_STRINGS.SIMPLE_WITH_ACCENTS, tokens: ["\u2581hello"], ids: [2, 10975, 3], decoded: "[CLS] hello[SEP]", }, MIXED_CASE_WITHOUT_ACCENTS: { text: BERT_TEST_STRINGS.MIXED_CASE_WITHOUT_ACCENTS, tokens: ["\u2581hello", "!", "how", "\u2581are", "\u2581you", "?"], ids: [2, 10975, 187, 1544, 50, 42, 60, 3], decoded: "[CLS] hello!how are you?[SEP]", }, MIXED_CASE_WITH_ACCENTS: { text: BERT_TEST_STRINGS.MIXED_CASE_WITH_ACCENTS, tokens: ["\u2581hall", "o", "!", "how", "\u2581are", "\u2581you", "?"], ids: [2, 554, 111, 187, 1544, 50, 42, 60, 3], decoded: "[CLS] hallo!how are you?[SEP]", }, }, };