hf_tokenizer_V / tokenizer.json
mel-nur's picture
Upload tokenizer
21e934f verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": null,
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"<unk>": 0,
"A": 1,
"F": 2,
"M": 3,
"N": 4,
"P": 5,
"T": 6,
"a": 7,
"c": 8,
"d": 9,
"e": 10,
"f": 11,
"g": 12,
"h": 13,
"i": 14,
"l": 15,
"m": 16,
"n": 17,
"o": 18,
"p": 19,
"r": 20,
"s": 21,
"t": 22,
"u": 23,
"v": 24,
"y": 25,
"in": 26,
"an": 27,
"he": 28,
"is": 29,
"at": 30,
"ch": 31,
"ed": 32,
"ing": 33,
"The": 34,
"al": 35,
"as": 36,
"ar": 37,
"ce": 38,
"cat": 39,
"do": 40,
"er": 41,
"or": 42,
"ran": 43,
"the": 44,
"ang": 45,
"chas": 46,
"dog": 47,
"chased": 48,
"At": 49,
"Fran": 50,
"Ma": 51,
"Nat": 52,
"Par": 53,
"Tran": 54,
"ag": 55,
"ap": 56,
"cin": 57,
"cap": 58,
"en": 59,
"ev": 60,
"eed": 61,
"ear": 62,
"fas": 63
},
"merges": [
[
"i",
"n"
],
[
"a",
"n"
],
[
"h",
"e"
],
[
"i",
"s"
],
[
"a",
"t"
],
[
"c",
"h"
],
[
"e",
"d"
],
[
"in",
"g"
],
[
"T",
"he"
],
[
"a",
"l"
],
[
"a",
"s"
],
[
"a",
"r"
],
[
"c",
"e"
],
[
"c",
"at"
],
[
"d",
"o"
],
[
"e",
"r"
],
[
"o",
"r"
],
[
"r",
"an"
],
[
"t",
"he"
],
[
"an",
"g"
],
[
"ch",
"as"
],
[
"do",
"g"
],
[
"chas",
"ed"
],
[
"A",
"t"
],
[
"F",
"ran"
],
[
"M",
"a"
],
[
"N",
"at"
],
[
"P",
"ar"
],
[
"T",
"ran"
],
[
"a",
"g"
],
[
"a",
"p"
],
[
"c",
"in"
],
[
"c",
"ap"
],
[
"e",
"n"
],
[
"e",
"v"
],
[
"e",
"ed"
],
[
"e",
"ar"
],
[
"f",
"as"
]
]
}
}