good / tokenizer.json
callidus's picture
Upload folder using huggingface_hub
54b8ba7 verified
{
"word2idx": {
"<PAD>": 0,
"<UNK>": 1,
"<SOS>": 2,
"<EOS>": 3,
"artificial": 4,
"intelligence": 5,
"is": 6,
"transforming": 7,
"the": 8,
"world": 9,
"machine": 10,
"learning": 11,
"algorithms": 12,
"learn": 13,
"from": 14,
"data": 15,
"and": 16,
"make": 17,
"predictions": 18,
"deep": 19,
"uses": 20,
"neural": 21,
"networks": 22,
"with": 23,
"multiple": 24,
"layers": 25,
"to": 26,
"process": 27,
"information": 28,
"natural": 29,
"language": 30,
"processing": 31,
"helps": 32,
"computers": 33,
"understand": 34,
"human": 35,
"text": 36,
"computer": 37,
"vision": 38,
"enables": 39,
"machines": 40,
"interpret": 41,
"visual": 42,
"images": 43,
"videos": 44,
"robots": 45,
"are": 46,
"becoming": 47,
"more": 48,
"sophisticated": 49,
"ai": 50,
"technology": 51,
"autonomous": 52,
"vehicles": 53,
"use": 54,
"navigate": 55,
"roads": 56,
"safely": 57,
"healthcare": 58,
"being": 59,
"revolutionized": 60,
"by": 61,
"diagnostics": 62,
"education": 63,
"enhanced": 64,
"through": 65,
"personalized": 66,
"systems": 67,
"powered": 68,
"science": 69,
"combines": 70,
"statistics": 71,
"programming": 72,
"big": 73,
"analytics": 74,
"reveals": 75,
"hidden": 76,
"patterns": 77,
"in": 78,
"large": 79,
"datasets": 80,
"cloud": 81,
"computing": 82,
"provides": 83,
"scalable": 84,
"infrastructure": 85,
"for": 86,
"applications": 87,
"cybersecurity": 88,
"protect": 89,
"digital": 90,
"assets": 91,
"threats": 92,
"internet": 93,
"of": 94,
"things": 95,
"connects": 96,
"everyday": 97,
"devices": 98,
"smart": 99,
"homes": 100,
"automate": 101,
"tasks": 102,
"save": 103,
"energy": 104,
"virtual": 105,
"assistants": 106,
"help": 107,
"people": 108,
"daily": 109,
"activities": 110,
"using": 111,
"inspired": 112,
"brain": 113,
"structure": 114,
"training": 115,
"essential": 116,
"models": 117,
"supervised": 118,
"labeled": 119,
"unsupervised": 120,
"finds": 121,
"unlabeled": 122,
"automatically": 123,
"reinforcement": 124,
"trains": 125,
"agents": 126,
"rewards": 127,
"penalties": 128,
"transfer": 129,
"reuses": 130,
"knowledge": 131,
"one": 132,
"task": 133,
"another": 134,
"step": 135,
"efficiently": 136,
"languages": 137,
"like": 138,
"python": 139,
"popular": 140,
"development": 141,
"mathematical": 142,
"optimization": 143,
"improves": 144,
"model": 145,
"performance": 146,
"over": 147,
"time": 148,
"statistical": 149,
"analysis": 150,
"distributions": 151,
"probability": 152,
"theory": 153,
"fundamental": 154,
"linear": 155,
"algebra": 156,
"operations": 157,
"core": 158,
"network": 159,
"computations": 160,
"gradient": 161,
"descent": 162,
"optimizes": 163,
"weights": 164,
"during": 165,
"backpropagation": 166,
"calculates": 167,
"gradients": 168,
"activation": 169,
"functions": 170,
"introduce": 171,
"nonlinearity": 172,
"into": 173,
"convolutional": 174,
"excel": 175,
"at": 176,
"image": 177,
"recurrent": 178,
"sequential": 179,
"speech": 180,
"transformer": 181,
"attention": 182,
"mechanisms": 183,
"better": 184,
"can": 185,
"generate": 186,
"responses": 187,
"generative": 188,
"create": 189,
"new": 190,
"content": 191,
"similar": 192,
"ethics": 193,
"ensures": 194,
"responsible": 195,
"deployment": 196,
"bias": 197,
"lead": 198,
"unfair": 199,
"outcomes": 200,
"discrimination": 201,
"privacy": 202,
"concerns": 203,
"arise": 204,
"collecting": 205,
"personal": 206,
"transparency": 207,
"builds": 208,
"trust": 209,
"users": 210,
"future": 211,
"will": 212,
"integrate": 213,
"innovation": 214,
"drives": 215,
"progress": 216,
"research": 217,
"scientists": 218,
"engineers": 219,
"collaborate": 220,
"on": 221,
"breakthrough": 222,
"solutions": 223,
"industry": 224,
"adoption": 225,
"continues": 226,
"accelerate": 227,
"rapidly": 228
},
"idx2word": {
"0": "<PAD>",
"1": "<UNK>",
"2": "<SOS>",
"3": "<EOS>",
"4": "artificial",
"5": "intelligence",
"6": "is",
"7": "transforming",
"8": "the",
"9": "world",
"10": "machine",
"11": "learning",
"12": "algorithms",
"13": "learn",
"14": "from",
"15": "data",
"16": "and",
"17": "make",
"18": "predictions",
"19": "deep",
"20": "uses",
"21": "neural",
"22": "networks",
"23": "with",
"24": "multiple",
"25": "layers",
"26": "to",
"27": "process",
"28": "information",
"29": "natural",
"30": "language",
"31": "processing",
"32": "helps",
"33": "computers",
"34": "understand",
"35": "human",
"36": "text",
"37": "computer",
"38": "vision",
"39": "enables",
"40": "machines",
"41": "interpret",
"42": "visual",
"43": "images",
"44": "videos",
"45": "robots",
"46": "are",
"47": "becoming",
"48": "more",
"49": "sophisticated",
"50": "ai",
"51": "technology",
"52": "autonomous",
"53": "vehicles",
"54": "use",
"55": "navigate",
"56": "roads",
"57": "safely",
"58": "healthcare",
"59": "being",
"60": "revolutionized",
"61": "by",
"62": "diagnostics",
"63": "education",
"64": "enhanced",
"65": "through",
"66": "personalized",
"67": "systems",
"68": "powered",
"69": "science",
"70": "combines",
"71": "statistics",
"72": "programming",
"73": "big",
"74": "analytics",
"75": "reveals",
"76": "hidden",
"77": "patterns",
"78": "in",
"79": "large",
"80": "datasets",
"81": "cloud",
"82": "computing",
"83": "provides",
"84": "scalable",
"85": "infrastructure",
"86": "for",
"87": "applications",
"88": "cybersecurity",
"89": "protect",
"90": "digital",
"91": "assets",
"92": "threats",
"93": "internet",
"94": "of",
"95": "things",
"96": "connects",
"97": "everyday",
"98": "devices",
"99": "smart",
"100": "homes",
"101": "automate",
"102": "tasks",
"103": "save",
"104": "energy",
"105": "virtual",
"106": "assistants",
"107": "help",
"108": "people",
"109": "daily",
"110": "activities",
"111": "using",
"112": "inspired",
"113": "brain",
"114": "structure",
"115": "training",
"116": "essential",
"117": "models",
"118": "supervised",
"119": "labeled",
"120": "unsupervised",
"121": "finds",
"122": "unlabeled",
"123": "automatically",
"124": "reinforcement",
"125": "trains",
"126": "agents",
"127": "rewards",
"128": "penalties",
"129": "transfer",
"130": "reuses",
"131": "knowledge",
"132": "one",
"133": "task",
"134": "another",
"135": "step",
"136": "efficiently",
"137": "languages",
"138": "like",
"139": "python",
"140": "popular",
"141": "development",
"142": "mathematical",
"143": "optimization",
"144": "improves",
"145": "model",
"146": "performance",
"147": "over",
"148": "time",
"149": "statistical",
"150": "analysis",
"151": "distributions",
"152": "probability",
"153": "theory",
"154": "fundamental",
"155": "linear",
"156": "algebra",
"157": "operations",
"158": "core",
"159": "network",
"160": "computations",
"161": "gradient",
"162": "descent",
"163": "optimizes",
"164": "weights",
"165": "during",
"166": "backpropagation",
"167": "calculates",
"168": "gradients",
"169": "activation",
"170": "functions",
"171": "introduce",
"172": "nonlinearity",
"173": "into",
"174": "convolutional",
"175": "excel",
"176": "at",
"177": "image",
"178": "recurrent",
"179": "sequential",
"180": "speech",
"181": "transformer",
"182": "attention",
"183": "mechanisms",
"184": "better",
"185": "can",
"186": "generate",
"187": "responses",
"188": "generative",
"189": "create",
"190": "new",
"191": "content",
"192": "similar",
"193": "ethics",
"194": "ensures",
"195": "responsible",
"196": "deployment",
"197": "bias",
"198": "lead",
"199": "unfair",
"200": "outcomes",
"201": "discrimination",
"202": "privacy",
"203": "concerns",
"204": "arise",
"205": "collecting",
"206": "personal",
"207": "transparency",
"208": "builds",
"209": "trust",
"210": "users",
"211": "future",
"212": "will",
"213": "integrate",
"214": "innovation",
"215": "drives",
"216": "progress",
"217": "research",
"218": "scientists",
"219": "engineers",
"220": "collaborate",
"221": "on",
"222": "breakthrough",
"223": "solutions",
"224": "industry",
"225": "adoption",
"226": "continues",
"227": "accelerate",
"228": "rapidly"
},
"vocab_size": 2000,
"special_tokens": [
"<PAD>",
"<UNK>",
"<SOS>",
"<EOS>"
]
}