hf-tokenizer / tokenizer.json
bylang's picture
Upload tokenizer
d3c4784 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<unk>": 0,
"\"": 1,
",": 2,
".": 3,
":": 4,
"?": 5,
"A": 6,
"B": 7,
"E": 8,
"G": 9,
"H": 10,
"K": 11,
"N": 12,
"O": 13,
"S": 14,
"T": 15,
"Y": 16,
"Z": 17,
"a": 18,
"b": 19,
"c": 20,
"d": 21,
"e": 22,
"g": 23,
"h": 24,
"i": 25,
"k": 26,
"l": 27,
"m": 28,
"n": 29,
"o": 30,
"p": 31,
"r": 32,
"s": 33,
"t": 34,
"u": 35,
"v": 36,
"y": 37,
"z": 38,
"Ç": 39,
"ç": 40,
"ö": 41,
"ü": 42,
"ğ": 43,
"ı": 44,
"Ş": 45,
"ş": 46,
"“": 47,
"”": 48,
"en": 49,
"ma": 50,
"bi": 51,
"er": 52,
"ol": 53,
"de": 54,
"li": 55,
"ar": 56,
"or": 57,
"yor": 58,
"da": 59,
"di": 60,
"bir": 61,
"bu": 62,
"nl": 63
},
"merges": [
[
"e",
"n"
],
[
"m",
"a"
],
[
"b",
"i"
],
[
"e",
"r"
],
[
"o",
"l"
],
[
"d",
"e"
],
[
"l",
"i"
],
[
"a",
"r"
],
[
"o",
"r"
],
[
"y",
"or"
],
[
"d",
"a"
],
[
"d",
"i"
],
[
"bi",
"r"
],
[
"b",
"u"
],
[
"n",
"l"
]
]
}
}