mimic_cov-small_tokenizer / tokenizer.json
mariamma342's picture
Upload tokenizer
29f747e verified
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 0,
"content": "<unk>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 1,
"content": "<s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 2,
"content": "</s>",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "Sequence",
"normalizers": [
{
"type": "Prepend",
"prepend": "▁"
},
{
"type": "Replace",
"pattern": {
"String": " "
},
"content": "▁"
}
]
},
"pre_tokenizer": null,
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "<s>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "<s>",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "<s>",
"type_id": 1
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
}
],
"special_tokens": {
"<s>": {
"id": "<s>",
"ids": [
1
],
"tokens": [
"<s>"
]
}
}
},
"decoder": {
"type": "Sequence",
"decoders": [
{
"type": "Replace",
"pattern": {
"String": "▁"
},
"content": " "
},
{
"type": "ByteFallback"
},
{
"type": "Fuse"
},
{
"type": "Strip",
"content": " ",
"start": 1,
"stop": 0
}
]
},
"model": {
"type": "BPE",
"dropout": null,
"unk_token": "<unk>",
"continuing_subword_prefix": null,
"end_of_word_suffix": null,
"fuse_unk": true,
"byte_fallback": true,
"vocab": {
"<unk>": 0,
"<s>": 1,
"</s>": 2,
"\n": 3,
"'": 4,
",": 5,
".": 6,
":": 7,
"<": 8,
">": 9,
"A": 10,
"C": 11,
"D": 12,
"E": 13,
"F": 14,
"I": 15,
"L": 16,
"N": 17,
"O": 18,
"P": 19,
"R": 20,
"S": 21,
"T": 22,
"U": 23,
"\\": 24,
"a": 25,
"b": 26,
"c": 27,
"d": 28,
"e": 29,
"f": 30,
"g": 31,
"h": 32,
"i": 33,
"l": 34,
"m": 35,
"n": 36,
"o": 37,
"p": 38,
"q": 39,
"r": 40,
"s": 41,
"t": 42,
"u": 43,
"v": 44,
"w": 45,
"x": 46,
"y": 47,
"▁": 48,
"ng": 49,
"di": 50,
"ung": 51,
"in": 52,
"fin": 53,
"lung": 54,
"▁fin": 55,
"▁lung": 56,
"ngs": 57,
"dings": 58,
"▁findings": 59,
"ar": 60,
"es": 61,
"▁L": 62,
"is": 63,
"ma": 64,
"ge": 65,
"ist": 66,
"▁n": 67,
"<i": 68,
"am": 69,
"of": 70,
"▁<i": 71,
"▁of": 72,
"mage": 73,
"▁nam": 74,
"▁<image": 75,
"▁names": 76,
"▁<image>": 77,
"▁List": 78,
"he": 79,
"The": 80,
"▁ar": 81,
"▁The": 82,
"▁are": 83,
"on": 84,
"le": 85,
"▁P": 86,
"si": 87,
"po": 88,
"al": 89,
"▁N": 90,
"one": 91,
"▁None": 92,
"as": 93,
"▁C": 94,
"um": 95,
"ur": 96,
"me": 97,
"ome": 98,
"diome": 99,
"ardiome": 100,
"▁Lung": 101,
"ac": 102,
"leur": 103,
"▁Pleur": 104,
"▁Pleural": 105,
"vi": 106,
"fu": 107,
"Ef": 108,
"sion": 109,
"▁PleuralEf": 110,
"fusion": 111,
"▁PleuralEffusion": 112,
"De": 113,
"Su": 114,
"ces": 115,
"ppo": 116,
"rt": 117,
"▁Su": 118,
"vices": 119,
"Devices": 120,
"pport": 121,
"▁Support": 122,
"▁SupportDevices": 123,
"it": 124,
"Op": 125,
"▁LungOp": 126,
"acit": 127,
"▁LungOpacit": 128,
"▁LungOpacity": 129,
"▁A": 130,
"te": 131,
"ct": 132,
"lect": 133,
"asis": 134,
"▁Ate": 135,
"lectasis": 136,
"▁Atelectasis": 137,
"▁po": 138,
"bl": 139,
"ssi": 140,
"▁possi": 141,
"bly": 142,
"▁possibly": 143,
"▁E": 144,
"eum": 145,
"neum": 146,
"▁Pneum": 147,
"gal": 148,
"▁Cardiome": 149,
"galy": 150,
"▁Cardiomegaly": 151,
"an": 152,
"▁an": 153,
"▁and": 154,
"ia": 155,
"onia": 156,
"▁Pneumonia": 157,
"de": 158,
"▁Ede": 159,
"▁Edema": 160,
"ion": 161,
"or": 162,
"Cardiome": 163,
"dCardiome": 164,
"lar": 165,
"nlar": 166,
"tin": 167,
"dias": 168,
"gedCardiome": 169,
"▁Enlar": 170,
"tinum": 171,
"diastinum": 172,
"gedCardiomediastinum": 173,
"▁EnlargedCardiomediastinum": 174,
"at": 175,
"li": 176,
"dat": 177,
"oli": 178,
"soli": 179,
"onsoli": 180,
"▁Consoli": 181,
"dation": 182,
"▁Consolidation": 183,
"ax": 184,
"hor": 185,
"ot": 186,
"▁Pneumot": 187,
"horax": 188,
"▁Pneumothorax": 189,
"Les": 190,
"▁LungLes": 191,
"▁LungLesion": 192,
"Fr": 193,
"tur": 194,
"▁Fr": 195,
"actur": 196,
"▁Fractur": 197,
"▁Fracture": 198,
"mal": 199,
"▁Nor": 200,
"▁Normal": 201,
"ovi": 202,
"▁Covi": 203,
"▁Covid": 204,
"the": 205,
"Othe": 206,
"▁PleuralOthe": 207,
"▁PleuralOther": 208,
"er": 209,
"us": 210,
"sist": 211,
"▁a": 212,
"▁c": 213,
"▁as": 214,
"▁us": 215,
"ant": 216,
"sistant": 217,
"▁assistant": 218,
"▁user": 219,
"'s": 220,
"AN": 221,
"ER": 222,
"IS": 223,
"List": 224,
"SS": 225,
"SER": 226,
"T:": 227,
"TAN": 228,
"USER": 229,
"\\n": 230,
"ai": 231,
"be": 232,
"ce": 233,
"ci": 234,
"ee": 235,
"fi": 236,
"gi": 237,
"hat": 238,
"io": 239,
"ifi": 240,
"lp": 241,
"lit": 242,
"lli": 243,
"nce": 244,
"qu": 245,
"sw": 246,
"to": 247,
"tw": 248,
"tion": 249,
"tai": 250,
"tifi": 251,
"ves": 252,
"▁\n": 253,
"▁,": 254,
"▁.": 255,
"▁:": 256,
"▁in": 257,
"▁he": 258,
"▁de": 259,
"▁the": 260,
"▁USER": 261,
"▁be": 262,
"▁gi": 263,
"▁qu": 264,
"▁to": 265,
"estion": 266,
"gence": 267,
"▁nsw": 268,
"▁<image>\\n": 269,
"▁artifi": 270,
"led": 271,
"urio": 272,
"ful": 273,
"▁ASS": 274,
"telli": 275,
"▁polit": 276,
"ers": 277,
"▁chat": 278,
"▁curio": 279,
"▁user's": 280,
"ISTAN": 281,
"cial": 282,
"een": 283,
"lpful": 284,
"tween": 285,
"tailed": 286,
"▁intelli": 287,
"▁helpful": 288,
"▁detailed": 289,
"▁USER:": 290,
"▁between": 291,
"▁gives": 292,
"▁question": 293,
"▁nswers": 294,
"▁<image>\\nList": 295,
"▁artificial": 296,
"▁ASSISTAN": 297,
"▁polite": 298,
"▁curious": 299,
"▁intelligence": 300,
"▁questions": 301,
"▁ASSISTANT:": 302
},
"merges": [
"n g",
"d i",
"u ng",
"i n",
"f in",
"l ung",
"▁ fin",
"▁ lung",
"ng s",
"di ngs",
"▁fin dings",
"a r",
"e s",
"▁ L",
"i s",
"m a",
"g e",
"is t",
"▁ n",
"< i",
"a m",
"o f",
"▁ <i",
"▁ of",
"ma ge",
"▁n am",
"▁<i mage",
"▁nam es",
"▁<image >",
"▁L ist",
"h e",
"T he",
"▁ ar",
"▁ The",
"▁ar e",
"o n",
"l e",
"▁ P",
"s i",
"p o",
"a l",
"▁ N",
"on e",
"▁N one",
"a s",
"▁ C",
"u m",
"u r",
"m e",
"o me",
"di ome",
"ar diome",
"▁L ung",
"a c",
"le ur",
"▁P leur",
"▁Pleur al",
"v i",
"f u",
"E f",
"si on",
"▁Pleural Ef",
"fu sion",
"▁PleuralEf fusion",
"D e",
"S u",
"c es",
"p po",
"r t",
"▁ Su",
"vi ces",
"De vices",
"ppo rt",
"▁Su pport",
"▁Support Devices",
"i t",
"O p",
"▁Lung Op",
"ac it",
"▁LungOp acit",
"▁LungOpacit y",
"▁ A",
"t e",
"c t",
"le ct",
"as is",
"▁A te",
"lect asis",
"▁Ate lectasis",
"▁ po",
"b l",
"s si",
"▁po ssi",
"bl y",
"▁possi bly",
"▁ E",
"e um",
"n eum",
"▁P neum",
"g al",
"▁C ardiome",
"gal y",
"▁Cardiome galy",
"a n",
"▁ an",
"▁an d",
"i a",
"on ia",
"▁Pneum onia",
"d e",
"▁E de",
"▁Ede ma",
"i on",
"o r",
"C ardiome",
"d Cardiome",
"l ar",
"n lar",
"t in",
"di as",
"ge dCardiome",
"▁E nlar",
"tin um",
"dias tinum",
"gedCardiome diastinum",
"▁Enlar gedCardiomediastinum",
"a t",
"l i",
"d at",
"o li",
"s oli",
"on soli",
"▁C onsoli",
"dat ion",
"▁Consoli dation",
"a x",
"h or",
"o t",
"▁Pneum ot",
"hor ax",
"▁Pneumot horax",
"L es",
"▁Lung Les",
"▁LungLes ion",
"F r",
"t ur",
"▁ Fr",
"ac tur",
"▁Fr actur",
"▁Fractur e",
"ma l",
"▁N or",
"▁Nor mal",
"o vi",
"▁C ovi",
"▁Covi d",
"t he",
"O the",
"▁Pleural Othe",
"▁PleuralOthe r",
"e r",
"u s",
"s ist",
"▁ a",
"▁ c",
"▁ as",
"▁ us",
"an t",
"sist ant",
"▁as sistant",
"▁us er",
"' s",
"A N",
"E R",
"I S",
"L ist",
"S S",
"S ER",
"T :",
"T AN",
"U SER",
"\\ n",
"a i",
"b e",
"c e",
"c i",
"e e",
"f i",
"g i",
"h at",
"i o",
"i fi",
"l p",
"l it",
"l li",
"n ce",
"q u",
"s w",
"t o",
"t w",
"t ion",
"t ai",
"t ifi",
"v es",
"▁ \n",
"▁ ,",
"▁ .",
"▁ :",
"▁ in",
"▁ he",
"▁ de",
"▁ the",
"▁ USER",
"▁ be",
"▁ gi",
"▁ qu",
"▁ to",
"es tion",
"ge nce",
"▁n sw",
"▁<image> \\n",
"▁ar tifi",
"le d",
"ur io",
"fu l",
"▁A SS",
"te lli",
"▁po lit",
"er s",
"▁c hat",
"▁c urio",
"▁user 's",
"IS TAN",
"ci al",
"ee n",
"lp ful",
"tw een",
"tai led",
"▁in telli",
"▁he lpful",
"▁de tailed",
"▁USER :",
"▁be tween",
"▁gi ves",
"▁qu estion",
"▁nsw ers",
"▁<image>\\n List",
"▁artifi cial",
"▁ASS ISTAN",
"▁polit e",
"▁curio us",
"▁intelli gence",
"▁question s",
"▁ASSISTAN T:"
]
}
}