diff --git "a/tokenizer.json" "b/tokenizer.json" new file mode 100644--- /dev/null +++ "b/tokenizer.json" @@ -0,0 +1,19541 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 5, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 6, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 7, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 8, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": null, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": "", + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "": 5, + "": 6, + "": 7, + "": 9, + "!": 10, + "\"": 11, + "#": 12, + "$": 13, + "%": 14, + "&": 15, + "'": 16, + "(": 17, + ")": 18, + "*": 19, + "+": 20, + ",": 21, + "-": 22, + ".": 23, + "/": 24, + "0": 25, + "1": 26, + "2": 27, + "3": 28, + "4": 29, + "5": 30, + "6": 31, + "7": 32, + "8": 33, + "9": 34, + ":": 35, + ";": 36, + "<": 37, + "=": 38, + ">": 39, + "?": 40, + "@": 41, + "A": 42, + "B": 43, + "C": 44, + "D": 45, + "E": 46, + "F": 47, + "G": 48, + "H": 49, + "I": 50, + "J": 51, + "K": 52, + "L": 53, + "M": 54, + "N": 55, + "O": 56, + "P": 57, + "Q": 58, + "R": 59, + "S": 60, + "T": 61, + "U": 62, + "V": 63, + "W": 64, + "X": 65, + "Y": 66, + "Z": 67, + "[": 68, + "\\": 69, + "]": 70, + "^": 71, + "_": 72, + "`": 73, + "a": 74, + "b": 75, + "c": 76, + "d": 77, + "e": 78, + "f": 79, + "g": 80, + "h": 81, + "i": 82, + "j": 83, + "k": 84, + "l": 85, + "m": 86, + "n": 87, + "o": 88, + "p": 89, + "q": 90, + "r": 91, + "s": 92, + "t": 93, + "u": 94, + "v": 95, + "w": 96, + "x": 97, + "y": 98, + "z": 99, + "{": 100, + "|": 101, + "}": 102, + "~": 103, + "¡": 104, + "¢": 105, + "£": 106, + "¤": 107, + "¥": 108, + "¦": 109, + "§": 110, + "¨": 111, + "©": 112, + "ª": 113, + "«": 114, + "¬": 115, + "®": 116, + "¯": 117, + "°": 118, + "±": 119, + "²": 120, + "³": 121, + "´": 122, + "µ": 123, + "¶": 124, + "·": 125, + "¸": 126, + "¹": 127, + "º": 128, + "»": 129, + "¼": 130, + "½": 131, + "¾": 132, + "¿": 133, + "À": 134, + "Á": 135, + "Â": 136, + "Ã": 137, + "Ä": 138, + "Å": 139, + "Æ": 140, + "Ç": 141, + "È": 142, + "É": 143, + "Ê": 144, + "Ë": 145, + "Ì": 146, + "Í": 147, + "Î": 148, + "Ï": 149, + "Ð": 150, + "Ñ": 151, + "Ò": 152, + "Ó": 153, + "Ô": 154, + "Õ": 155, + "Ö": 156, + "×": 157, + "Ø": 158, + "Ù": 159, + "Ú": 160, + "Û": 161, + "Ü": 162, + "Ý": 163, + "Þ": 164, + "ß": 165, + "à": 166, + "á": 167, + "â": 168, + "ã": 169, + "ä": 170, + "å": 171, + "æ": 172, + "ç": 173, + "è": 174, + "é": 175, + "ê": 176, + "ë": 177, + "ì": 178, + "í": 179, + "î": 180, + "ï": 181, + "ð": 182, + "ñ": 183, + "ò": 184, + "ó": 185, + "ô": 186, + "õ": 187, + "ö": 188, + "÷": 189, + "ø": 190, + "ù": 191, + "ú": 192, + "û": 193, + "ü": 194, + "ý": 195, + "þ": 196, + "ÿ": 197, + "Ā": 198, + "ā": 199, + "Ă": 200, + "ă": 201, + "Ą": 202, + "ą": 203, + "Ć": 204, + "ć": 205, + "Ĉ": 206, + "ĉ": 207, + "Ċ": 208, + "ċ": 209, + "Č": 210, + "č": 211, + "Ď": 212, + "ď": 213, + "Đ": 214, + "đ": 215, + "Ē": 216, + "ē": 217, + "Ĕ": 218, + "ĕ": 219, + "Ė": 220, + "ė": 221, + "Ę": 222, + "ę": 223, + "Ě": 224, + "ě": 225, + "Ĝ": 226, + "ĝ": 227, + "Ğ": 228, + "ğ": 229, + "Ġ": 230, + "ġ": 231, + "Ģ": 232, + "ģ": 233, + "Ĥ": 234, + "ĥ": 235, + "Ħ": 236, + "ħ": 237, + "Ĩ": 238, + "ĩ": 239, + "Ī": 240, + "ī": 241, + "Ĭ": 242, + "ĭ": 243, + "Į": 244, + "į": 245, + "İ": 246, + "ı": 247, + "IJ": 248, + "ij": 249, + "Ĵ": 250, + "ĵ": 251, + "Ķ": 252, + "ķ": 253, + "ĸ": 254, + "Ĺ": 255, + "ĺ": 256, + "Ļ": 257, + "ļ": 258, + "Ľ": 259, + "ľ": 260, + "Ŀ": 261, + "ŀ": 262, + "Ł": 263, + "ł": 264, + "Ń": 265, + "Ġt": 266, + "in": 267, + "Ġa": 268, + "re": 269, + "he": 270, + "ou": 271, + "Ġs": 272, + "..": 273, + "ĠĊ": 274, + "Ġw": 275, + "at": 276, + "**": 277, + "ing": 278, + "on": 279, + "Ġi": 280, + "ĠĠĊ": 281, + "âĢ": 282, + "Ġy": 283, + "er": 284, + "Ġb": 285, + "Ġm": 286, + "Ġthe": 287, + "Ġf": 288, + "en": 289, + "Ġyou": 290, + "Ġc": 291, + "an": 292, + "es": 293, + "Ġth": 294, + "Ġto": 295, + "el": 296, + "or": 297, + "Ġd": 298, + "Ġn": 299, + "is": 300, + "Ġl": 301, + "nd": 302, + "ot": 303, + "ed": 304, + "it": 305, + "Ġp": 306, + "ĠI": 307, + "Ġin": 308, + "Ġit": 309, + "al": 310, + "st": 311, + "Ġo": 312, + "Ġh": 313, + "Ġ**": 314, + "ow": 315, + "Ġre": 316, + "ion": 317, + "om": 318, + "et": 319, + "ar": 320, + "Ġe": 321, + "ll": 322, + "...": 323, + "Ġis": 324, + "ce": 325, + "âĢĻ": 326, + "Ġbe": 327, + "ct": 328, + "Ġof": 329, + "Ġst": 330, + "ve": 331, + "Ġand": 332, + "ay": 333, + "ut": 334, + "se": 335, + "ly": 336, + "Ġthat": 337, + "hat": 338, + "ld": 339, + "âĢĶ": 340, + "Ġnot": 341, + "le": 342, + "Ġme": 343, + ".**": 344, + "ent": 345, + "as": 346, + "id": 347, + "ke": 348, + "gh": 349, + "Ġhe": 350, + "ver": 351, + "Ġg": 352, + "Ġyour": 353, + "ad": 354, + "Ġfor": 355, + "hing": 356, + "Ġon": 357, + "ith": 358, + "Ġ*": 359, + "ust": 360, + "Ġli": 361, + "ra": 362, + "ore": 363, + "Ġ..": 364, + "Ġwith": 365, + "You": 366, + "Ġu": 367, + "Ġfe": 368, + "ch": 369, + "The": 370, + "ill": 371, + "Ġas": 372, + "ro": 373, + "Ġ...": 374, + "ep": 375, + "ic": 376, + "Ġex": 377, + "ght": 378, + "ation": 379, + "Ġint": 380, + "me": 381, + "Ġfeel": 382, + "Ġare": 383, + "Ġ.": 384, + "ur": 385, + "Ġj": 386, + "And": 387, + "ess": 388, + "Ġthis": 389, + "out": 390, + "Ġdo": 391, + "Ġ-": 392, + "Ġne": 393, + "ri": 394, + "ris": 395, + "Ġlo": 396, + "Ġan": 397, + "Ġwas": 398, + "Ġcon": 399, + "Ġmy": 400, + "ap": 401, + "and": 402, + "ru": 403, + "Ġcan": 404, + "res": 405, + "Ġlike": 406, + "Ġsom": 407, + "Ġjust": 408, + "Ġwhat": 409, + "Iris": 410, + "Ġsh": 411, + "Mel": 412, + "