Mini-marius-language / tokenizer.json
Clemylia's picture
Ajout du tokenizer associé au modèle final
2d0ebbd verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 3,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 4,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": null,
"pre_tokenizer": {
"type": "Whitespace"
},
"post_processor": null,
"decoder": null,
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "[UNK]",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": false,
"byte_fallback": false,
"ignore_merges": false,
"vocab": {
"[UNK]": 0,
"[CLS]": 1,
"[SEP]": 2,
"[PAD]": 3,
"[MASK]": 4,
"!": 5,
".": 6,
"G": 7,
"M": 8,
"Q": 9,
"T": 10,
"W": 11,
"a": 12,
"b": 13,
"d": 14,
"e": 15,
"f": 16,
"i": 17,
"j": 18,
"l": 19,
"m": 20,
"n": 21,
"o": 22,
"p": 23,
"q": 24,
"r": 25,
"s": 26,
"t": 27,
"u": 28,
"v": 29,
"w": 30,
"ç": 31,
"é": 32,
"..": 33,
"ou": 34,
"aou": 35,
"aouf": 36,
"Waouf": 37,
"rr": 38,
"is": 39,
"Grr": 40,
"le": 41,
"on": 42,
"su": 43,
"...": 44,
"Grrr": 45,
"suis": 46,
"as": 47,
"ble": 48,
"dis": 49,
"ible": 50,
"je": 51,
"oi": 52,
"pon": 53,
"pas": 54,
"rd": 55,
"sou": 56,
"te": 57,
"uoi": 58,
"....": 59,
"dispon": 60,
"sourd": 61,
"disponible": 62,
"!!": 63,
"!.": 64,
"Ma": 65,
"Quoi": 66,
"Ta": 67,
"Tu": 68,
"aa": 69,
"ai": 70,
"dé": 71,
"eu": 72,
"fai": 73,
"ir": 74,
"iu": 75,
"me": 76,
"non": 77,
"or": 78,
"quoi": 79,
"riu": 80,
"ste": 81,
"sor": 82,
"tir": 83,
"veu": 84,
"waouf": 85,
"waa": 86,
"ça": 87,
"les": 88,
"teste": 89,
"Mariu": 90,
"déteste": 91,
"fait": 92,
"sortir": 93,
"veut": 94,
"Marius": 95
},
"merges": [
[
".",
"."
],
[
"o",
"u"
],
[
"a",
"ou"
],
[
"aou",
"f"
],
[
"W",
"aouf"
],
[
"r",
"r"
],
[
"i",
"s"
],
[
"G",
"rr"
],
[
"l",
"e"
],
[
"o",
"n"
],
[
"s",
"u"
],
[
"..",
"."
],
[
"Grr",
"r"
],
[
"su",
"is"
],
[
"a",
"s"
],
[
"b",
"le"
],
[
"d",
"is"
],
[
"i",
"ble"
],
[
"j",
"e"
],
[
"o",
"i"
],
[
"p",
"on"
],
[
"p",
"as"
],
[
"r",
"d"
],
[
"s",
"ou"
],
[
"t",
"e"
],
[
"u",
"oi"
],
[
"..",
".."
],
[
"dis",
"pon"
],
[
"sou",
"rd"
],
[
"dispon",
"ible"
],
[
"!",
"!"
],
[
"!",
"."
],
[
"M",
"a"
],
[
"Q",
"uoi"
],
[
"T",
"a"
],
[
"T",
"u"
],
[
"a",
"a"
],
[
"a",
"i"
],
[
"d",
"é"
],
[
"e",
"u"
],
[
"f",
"ai"
],
[
"i",
"r"
],
[
"i",
"u"
],
[
"m",
"e"
],
[
"n",
"on"
],
[
"o",
"r"
],
[
"q",
"uoi"
],
[
"r",
"iu"
],
[
"s",
"te"
],
[
"s",
"or"
],
[
"t",
"ir"
],
[
"v",
"eu"
],
[
"w",
"aouf"
],
[
"w",
"aa"
],
[
"ç",
"a"
],
[
"le",
"s"
],
[
"te",
"ste"
],
[
"Ma",
"riu"
],
[
"dé",
"teste"
],
[
"fai",
"t"
],
[
"sor",
"tir"
],
[
"veu",
"t"
],
[
"Mariu",
"s"
]
]
}
}