diff --git "a/tokenizer.json" "b/tokenizer.json" new file mode 100644--- /dev/null +++ "b/tokenizer.json" @@ -0,0 +1,94414 @@ +{ + "version": "1.0", + "truncation": { + "direction": "Right", + "max_length": 128, + "strategy": "LongestFirst", + "stride": 0 + }, + "padding": { + "strategy": { + "Fixed": 128 + }, + "direction": "Right", + "pad_to_multiple_of": null, + "pad_id": 1, + "pad_type_id": 0, + "pad_token": "" + }, + "added_tokens": [ + { + "id": 0, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 1, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 3, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 4, + "content": "", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": null, + "pre_tokenizer": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": true, + "use_regex": true + }, + "post_processor": null, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": true, + "trim_offsets": true, + "use_regex": true + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "": 0, + "": 1, + "": 2, + "": 3, + "": 4, + "'": 5, + "-": 6, + ".": 7, + "/": 8, + "<": 9, + ">": 10, + "?": 11, + "A": 12, + "B": 13, + "C": 14, + "D": 15, + "E": 16, + "F": 17, + "G": 18, + "H": 19, + "I": 20, + "J": 21, + "K": 22, + "L": 23, + "M": 24, + "N": 25, + "O": 26, + "P": 27, + "Q": 28, + "R": 29, + "S": 30, + "T": 31, + "U": 32, + "V": 33, + "W": 34, + "X": 35, + "Y": 36, + "Z": 37, + "_": 38, + "a": 39, + "b": 40, + "c": 41, + "d": 42, + "e": 43, + "f": 44, + "g": 45, + "h": 46, + "i": 47, + "j": 48, + "k": 49, + "l": 50, + "m": 51, + "n": 52, + "o": 53, + "p": 54, + "q": 55, + "r": 56, + "s": 57, + "t": 58, + "u": 59, + "v": 60, + "w": 61, + "x": 62, + "y": 63, + "z": 64, + "·": 65, + "º": 66, + "Â": 67, + "Ä": 68, + "Ġ": 69, + "an": 70, + "ey": 71, + "Ġd": 72, + "on": 73, + "Ġa": 74, + "Ġs": 75, + "Ġg": 76, + "Ġb": 77, + "Ġt": 78, + "Ġh": 79, + "kk": 80, + "Ġe": 81, + "ar": 82, + "or": 83, + "eg": 84, + "Ġp": 85, + "ot": 86, + "di": 87, + "Ġag": 88, + "Ġm": 89, + "ur": 90, + "Ġdeg": 91, + "ol": 92, + "in": 93, + "en": 94, + "Ġekk": 95, + "Ġj": 96, + "ong": 97, + "un": 98, + "al": 99, + "Ġdegong": 100, + "Ġar": 101, + "Ġi": 102, + "Ġgur": 103, + "Ġagey": 104, + "Ġn": 105, + "Ġekkan": 106, + "od": 107, + "at": 108, + "Ġo": 109, + "Ġaro": 110, + "ag": 111, + "Ġhe": 112, + "Ġsi": 113, + "Ġl": 114, + "ang": 115, + "Ġ<": 116, + "Ġse": 117, + "Ġte": 118, + "Ġdo": 119, + "Ġhi": 120, + "Ġol": 121, + "Ġman": 122, + "ai": 123, + "Ġagon": 124, + "ch": 125, + "oi": 126, + "ye": 127, + "edi": 128, + "ani": 129, + "Ġguri": 130, + "Ġdega": 131, + "Ġgor": 132, + "aj": 133, + "Ġu": 134, + "st": 135, + "uj": 136, + "il": 137, + "Ġob": 138, + "Ġr": 139, + "er": 140, + "ic": 141, + "ara": 142, + "ul": 143, + "gan": 144, + "Ġtey": 145, + "ay": 146, + "yot": 147, + "Ġbi": 148, + "Ġdogan": 149, + "Ġdi": 150, + "Ġk": 151, + "Ġc": 152, + "Ġna": 153, + "us": 154, + "Ġdol": 155, + "de": 156, + "eni": 157, + "Ġ", + "