greffe-de-caca-v1 / tokenizer.json
Eleonord's picture
Ajout du tokenizer associé au modèle final
7d18093 verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[TOXIC]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[REPARENCE]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SUIVANT]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[REMPLI]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 85,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[TOXIC]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"[TOXIC]": 0,
"[REPARENCE]": 1,
"[SUIVANT]": 2,
"[REMPLI]": 3,
"[MASK]": 4,
"!": 5,
"'": 6,
".": 7,
"G": 8,
"a": 9,
"b": 10,
"c": 11,
"d": 12,
"e": 13,
"f": 14,
"g": 15,
"i": 16,
"j": 17,
"l": 18,
"m": 19,
"n": 20,
"o": 21,
"p": 22,
"r": 23,
"s": 24,
"t": 25,
"u": 26,
"v": 27,
"x": 28,
"z": 29,
"ca": 30,
"re": 31,
"Gre": 32,
"fe": 33,
"ffe": 34,
"caca": 35,
"Greffe": 36,
"ou": 37,
"an": 38,
"de": 39,
"du": 40,
"dan": 41,
"es": 42,
"vou": 43,
"Greffez": 44,
"dans": 45,
"vous": 46,
"les": 47,
"bou": 48,
"di": 49,
"le": 50,
"boudi": 51,
"boudin": 52,
"ac": 53,
"am": 54,
"as": 55,
"au": 56,
"br": 57,
"bes": 58,
"ce": 59,
"ds": 60,
"en": 61,
"ez": 62,
"eau": 63,
"eds": 64,
"gen": 65,
"il": 66,
"ieds": 67,
"jam": 68,
"mac": 69,
"nez": 70,
"ore": 71,
"omac": 72,
"pieds": 73,
"rv": 74,
"tomac": 75,
"oux": 76,
"estomac": 77,
"bras": 78,
"cerv": 79,
"genoux": 80,
"illes": 81,
"jambes": 82,
"oreilles": 83,
"cerveau": 84
},
"merges": [
[
"c",
"a"
],
[
"r",
"e"
],
[
"G",
"re"
],
[
"f",
"e"
],
[
"f",
"fe"
],
[
"ca",
"ca"
],
[
"Gre",
"ffe"
],
[
"o",
"u"
],
[
"a",
"n"
],
[
"d",
"e"
],
[
"d",
"u"
],
[
"d",
"an"
],
[
"e",
"s"
],
[
"v",
"ou"
],
[
"Greffe",
"z"
],
[
"dan",
"s"
],
[
"vou",
"s"
],
[
"l",
"es"
],
[
"b",
"ou"
],
[
"d",
"i"
],
[
"l",
"e"
],
[
"bou",
"di"
],
[
"boudi",
"n"
],
[
"a",
"c"
],
[
"a",
"m"
],
[
"a",
"s"
],
[
"a",
"u"
],
[
"b",
"r"
],
[
"b",
"es"
],
[
"c",
"e"
],
[
"d",
"s"
],
[
"e",
"n"
],
[
"e",
"z"
],
[
"e",
"au"
],
[
"e",
"ds"
],
[
"g",
"en"
],
[
"i",
"l"
],
[
"i",
"eds"
],
[
"j",
"am"
],
[
"m",
"ac"
],
[
"n",
"ez"
],
[
"o",
"re"
],
[
"o",
"mac"
],
[
"p",
"ieds"
],
[
"r",
"v"
],
[
"t",
"omac"
],
[
"ou",
"x"
],
[
"es",
"tomac"
],
[
"br",
"as"
],
[
"ce",
"rv"
],
[
"gen",
"oux"
],
[
"il",
"les"
],
[
"jam",
"bes"
],
[
"ore",
"illes"
],
[
"cerv",
"eau"
]
]
}
}