[ { "loss": 1.9181, "grad_norm": 0.8695156574249268, "learning_rate": 1.8e-05, "entropy": 1.457271361351013, "num_tokens": 223301.0, "mean_token_accuracy": 0.5781058162450791, "epoch": 0.016406890894175553, "step": 10 }, { "loss": 1.7798, "grad_norm": 0.6769833564758301, "learning_rate": 3.8e-05, "entropy": 1.6721824526786804, "num_tokens": 449106.0, "mean_token_accuracy": 0.585400715470314, "epoch": 0.03281378178835111, "step": 20 }, { "loss": 1.5073, "grad_norm": 0.903017520904541, "learning_rate": 5.8e-05, "entropy": 1.568133682012558, "num_tokens": 681766.0, "mean_token_accuracy": 0.6311331987380981, "epoch": 0.04922067268252666, "step": 30 }, { "loss": 1.1072, "grad_norm": 0.4333101511001587, "learning_rate": 7.800000000000001e-05, "entropy": 1.0811064839363098, "num_tokens": 919022.0, "mean_token_accuracy": 0.7193854421377182, "epoch": 0.06562756357670221, "step": 40 }, { "loss": 0.9973, "grad_norm": 0.33554309606552124, "learning_rate": 9.8e-05, "entropy": 0.9973530560731888, "num_tokens": 1148282.0, "mean_token_accuracy": 0.7452630162239074, "epoch": 0.08203445447087777, "step": 50 }, { "loss": 0.8928, "grad_norm": 0.338527649641037, "learning_rate": 9.993628283308581e-05, "entropy": 0.8918034493923187, "num_tokens": 1372211.0, "mean_token_accuracy": 0.7673784762620925, "epoch": 0.09844134536505332, "step": 60 }, { "loss": 0.8782, "grad_norm": 0.32297801971435547, "learning_rate": 9.971623444249021e-05, "entropy": 0.8743007063865662, "num_tokens": 1604226.0, "mean_token_accuracy": 0.7686779618263244, "epoch": 0.11484823625922888, "step": 70 }, { "loss": 0.8258, "grad_norm": 0.33724743127822876, "learning_rate": 9.933976038510333e-05, "entropy": 0.8295134097337723, "num_tokens": 1838081.0, "mean_token_accuracy": 0.7803715646266938, "epoch": 0.13125512715340443, "step": 80 }, { "loss": 0.7891, "grad_norm": 0.3447152078151703, "learning_rate": 9.88080451875917e-05, "entropy": 0.788675120472908, "num_tokens": 2071855.0, "mean_token_accuracy": 0.7867679089307785, "epoch": 0.14766201804757997, "step": 90 }, { "loss": 0.8272, "grad_norm": 0.3010920286178589, "learning_rate": 9.812276182268236e-05, "entropy": 0.8329822063446045, "num_tokens": 2299745.0, "mean_token_accuracy": 0.7769433975219726, "epoch": 0.16406890894175555, "step": 100 }, { "loss": 0.797, "grad_norm": 0.3699570298194885, "learning_rate": 9.728606644537178e-05, "entropy": 0.8039814531803131, "num_tokens": 2528567.0, "mean_token_accuracy": 0.784355628490448, "epoch": 0.1804757998359311, "step": 110 }, { "loss": 0.7743, "grad_norm": 0.3595486581325531, "learning_rate": 9.63005916088644e-05, "entropy": 0.7768057614564896, "num_tokens": 2751275.0, "mean_token_accuracy": 0.7889596104621888, "epoch": 0.19688269073010664, "step": 120 }, { "loss": 0.7735, "grad_norm": 0.3556647002696991, "learning_rate": 9.516943798158649e-05, "entropy": 0.776089146733284, "num_tokens": 2978023.0, "mean_token_accuracy": 0.7897650897502899, "epoch": 0.2132895816242822, "step": 130 }, { "loss": 0.7622, "grad_norm": 0.3334764242172241, "learning_rate": 9.389616459133597e-05, "entropy": 0.7803491950035095, "num_tokens": 3204517.0, "mean_token_accuracy": 0.7920954555273056, "epoch": 0.22969647251845776, "step": 140 }, { "loss": 0.7576, "grad_norm": 0.4009985029697418, "learning_rate": 9.248477762726437e-05, "entropy": 0.7722930639982224, "num_tokens": 3434323.0, "mean_token_accuracy": 0.792155721783638, "epoch": 0.2461033634126333, "step": 150 }, { "loss": 0.7741, "grad_norm": 0.3408016264438629, "learning_rate": 9.093971783492355e-05, "entropy": 0.7827848076820374, "num_tokens": 3661534.0, "mean_token_accuracy": 0.7909984678030014, "epoch": 0.26251025430680885, "step": 160 }, { "loss": 0.7409, "grad_norm": 0.3399813175201416, "learning_rate": 8.926584654403724e-05, "entropy": 0.7509049952030182, "num_tokens": 3890921.0, "mean_token_accuracy": 0.7963089287281037, "epoch": 0.27891714520098443, "step": 170 }, { "loss": 0.7478, "grad_norm": 0.30348479747772217, "learning_rate": 8.746843037295937e-05, "entropy": 0.7562560260295867, "num_tokens": 4119537.0, "mean_token_accuracy": 0.794661870598793, "epoch": 0.29532403609515995, "step": 180 }, { "loss": 0.737, "grad_norm": 0.3403272330760956, "learning_rate": 8.555312465794403e-05, "entropy": 0.7475169450044632, "num_tokens": 4357977.0, "mean_token_accuracy": 0.796341797709465, "epoch": 0.3117309269893355, "step": 190 }, { "loss": 0.7512, "grad_norm": 0.3219321072101593, "learning_rate": 8.352595565936554e-05, "entropy": 0.7560538798570633, "num_tokens": 4581046.0, "mean_token_accuracy": 0.7952380329370499, "epoch": 0.3281378178835111, "step": 200 }, { "loss": 0.7317, "grad_norm": 0.3038958013057709, "learning_rate": 8.139330160087374e-05, "entropy": 0.7322431743144989, "num_tokens": 4814227.0, "mean_token_accuracy": 0.7974914610385895, "epoch": 0.3445447087776866, "step": 210 }, { "loss": 0.7205, "grad_norm": 0.32955309748649597, "learning_rate": 7.916187260114263e-05, "entropy": 0.7275226473808288, "num_tokens": 5048157.0, "mean_token_accuracy": 0.8014717370271682, "epoch": 0.3609515996718622, "step": 220 }, { "loss": 0.7282, "grad_norm": 0.4224933683872223, "learning_rate": 7.68386895613546e-05, "entropy": 0.7310106873512268, "num_tokens": 5274071.0, "mean_token_accuracy": 0.7992997527122497, "epoch": 0.37735849056603776, "step": 230 }, { "loss": 0.6971, "grad_norm": 0.33381229639053345, "learning_rate": 7.443106207484776e-05, "entropy": 0.6995288044214248, "num_tokens": 5509360.0, "mean_token_accuracy": 0.8060027480125427, "epoch": 0.3937653814602133, "step": 240 }, { "loss": 0.711, "grad_norm": 0.31851011514663696, "learning_rate": 7.194656542843102e-05, "entropy": 0.7142476379871369, "num_tokens": 5738285.0, "mean_token_accuracy": 0.8031993210315704, "epoch": 0.41017227235438886, "step": 250 }, { "loss": 0.7237, "grad_norm": 0.33157357573509216, "learning_rate": 6.939301676772927e-05, "entropy": 0.7256091266870499, "num_tokens": 5962580.0, "mean_token_accuracy": 0.8013624370098114, "epoch": 0.4265791632485644, "step": 260 }, { "loss": 0.721, "grad_norm": 0.3370811641216278, "learning_rate": 6.677845050155107e-05, "entropy": 0.7265744864940643, "num_tokens": 6196094.0, "mean_token_accuracy": 0.7991349190473557, "epoch": 0.44298605414273995, "step": 270 }, { "loss": 0.6843, "grad_norm": 0.38120874762535095, "learning_rate": 6.411109302266616e-05, "entropy": 0.6908316820859909, "num_tokens": 6420601.0, "mean_token_accuracy": 0.8087756901979446, "epoch": 0.4593929450369155, "step": 280 }, { "loss": 0.7072, "grad_norm": 0.38430851697921753, "learning_rate": 6.139933682453036e-05, "entropy": 0.7136244118213654, "num_tokens": 6655119.0, "mean_token_accuracy": 0.8047497570514679, "epoch": 0.47579983593109104, "step": 290 }, { "loss": 0.7252, "grad_norm": 0.3509667217731476, "learning_rate": 5.8651714095396135e-05, "entropy": 0.7334865719079972, "num_tokens": 6876910.0, "mean_token_accuracy": 0.799770200252533, "epoch": 0.4922067268252666, "step": 300 }, { "loss": 0.6821, "grad_norm": 0.3153151571750641, "learning_rate": 5.587686987289189e-05, "entropy": 0.6873683601617813, "num_tokens": 7112299.0, "mean_token_accuracy": 0.8088241666555405, "epoch": 0.5086136177194421, "step": 310 }, { "loss": 0.7196, "grad_norm": 0.34774187207221985, "learning_rate": 5.3083534843535074e-05, "entropy": 0.7214434593915939, "num_tokens": 7346455.0, "mean_token_accuracy": 0.8039845436811447, "epoch": 0.5250205086136177, "step": 320 }, { "loss": 0.6638, "grad_norm": 0.387768030166626, "learning_rate": 5.028049787276249e-05, "entropy": 0.6638175457715988, "num_tokens": 7571791.0, "mean_token_accuracy": 0.812444058060646, "epoch": 0.5414273995077933, "step": 330 }, { "loss": 0.6766, "grad_norm": 0.3517005741596222, "learning_rate": 4.7476578351907954e-05, "entropy": 0.6799941658973694, "num_tokens": 7801695.0, "mean_token_accuracy": 0.811230742931366, "epoch": 0.5578342904019689, "step": 340 }, { "loss": 0.6779, "grad_norm": 0.32577675580978394, "learning_rate": 4.468059844913444e-05, "entropy": 0.6814499109983444, "num_tokens": 8039821.0, "mean_token_accuracy": 0.8104382246732712, "epoch": 0.5742411812961444, "step": 350 }, { "loss": 0.6539, "grad_norm": 0.35933127999305725, "learning_rate": 4.1901355351628945e-05, "entropy": 0.6585495263338089, "num_tokens": 8273149.0, "mean_token_accuracy": 0.8166852772235871, "epoch": 0.5906480721903199, "step": 360 }, { "loss": 0.6843, "grad_norm": 0.31598055362701416, "learning_rate": 3.914759358639719e-05, "entropy": 0.6861207246780395, "num_tokens": 8503164.0, "mean_token_accuracy": 0.8086160510778427, "epoch": 0.6070549630844955, "step": 370 }, { "loss": 0.7094, "grad_norm": 0.3427006006240845, "learning_rate": 3.642797750674629e-05, "entropy": 0.7133786290884018, "num_tokens": 8726435.0, "mean_token_accuracy": 0.8027824640274048, "epoch": 0.623461853978671, "step": 380 }, { "loss": 0.6877, "grad_norm": 0.34877264499664307, "learning_rate": 3.375106403102389e-05, "entropy": 0.6881168276071549, "num_tokens": 8954291.0, "mean_token_accuracy": 0.8073496133089065, "epoch": 0.6398687448728466, "step": 390 }, { "loss": 0.6835, "grad_norm": 0.3225726783275604, "learning_rate": 3.112527571938717e-05, "entropy": 0.6862167656421662, "num_tokens": 9177163.0, "mean_token_accuracy": 0.8089945495128632, "epoch": 0.6562756357670222, "step": 400 }, { "loss": 0.7008, "grad_norm": 0.329756498336792, "learning_rate": 2.8558874273312674e-05, "entropy": 0.7071986079216004, "num_tokens": 9404151.0, "mean_token_accuracy": 0.8044474363327027, "epoch": 0.6726825266611977, "step": 410 }, { "loss": 0.6947, "grad_norm": 0.3715651035308838, "learning_rate": 2.605993454122687e-05, "entropy": 0.69432153403759, "num_tokens": 9639400.0, "mean_token_accuracy": 0.8064981371164321, "epoch": 0.6890894175553732, "step": 420 }, { "loss": 0.7066, "grad_norm": 0.3599180281162262, "learning_rate": 2.3636319112045496e-05, "entropy": 0.7111173301935196, "num_tokens": 9867668.0, "mean_token_accuracy": 0.8044642627239227, "epoch": 0.7054963084495488, "step": 430 }, { "loss": 0.7259, "grad_norm": 0.2912443280220032, "learning_rate": 2.1295653576560163e-05, "entropy": 0.7254415988922119, "num_tokens": 10100826.0, "mean_token_accuracy": 0.8003069430589675, "epoch": 0.7219031993437244, "step": 440 }, { "loss": 0.6761, "grad_norm": 0.30693626403808594, "learning_rate": 1.9045302534508297e-05, "entropy": 0.6833124309778214, "num_tokens": 10332359.0, "mean_token_accuracy": 0.8109049916267395, "epoch": 0.7383100902379, "step": 450 }, { "loss": 0.736, "grad_norm": 0.3155220150947571, "learning_rate": 1.6892346422817946e-05, "entropy": 0.736938726902008, "num_tokens": 10563841.0, "mean_token_accuracy": 0.7979681819677353, "epoch": 0.7547169811320755, "step": 460 }, { "loss": 0.6945, "grad_norm": 0.3748078942298889, "learning_rate": 1.4843559237933473e-05, "entropy": 0.7031238079071045, "num_tokens": 10788876.0, "mean_token_accuracy": 0.8057133972644805, "epoch": 0.771123872026251, "step": 470 }, { "loss": 0.6776, "grad_norm": 0.3635546565055847, "learning_rate": 1.2905387222316822e-05, "entropy": 0.6805126667022705, "num_tokens": 11015156.0, "mean_token_accuracy": 0.8101104766130447, "epoch": 0.7875307629204266, "step": 480 }, { "loss": 0.676, "grad_norm": 0.3111382722854614, "learning_rate": 1.1083928582183711e-05, "entropy": 0.6774959295988083, "num_tokens": 11245860.0, "mean_token_accuracy": 0.8107922226190567, "epoch": 0.8039376538146021, "step": 490 }, { "loss": 0.6742, "grad_norm": 0.32188844680786133, "learning_rate": 9.384914300290748e-06, "entropy": 0.6842435419559478, "num_tokens": 11476241.0, "mean_token_accuracy": 0.8111602008342743, "epoch": 0.8203445447087777, "step": 500 }, { "loss": 0.6544, "grad_norm": 0.36185422539711, "learning_rate": 7.813690104143557e-06, "entropy": 0.6514311820268631, "num_tokens": 11708112.0, "mean_token_accuracy": 0.8149820327758789, "epoch": 0.8367514356029533, "step": 510 }, { "loss": 0.6765, "grad_norm": 0.3183876574039459, "learning_rate": 6.375199646360142e-06, "entropy": 0.6856429934501648, "num_tokens": 11939337.0, "mean_token_accuracy": 0.8090052843093872, "epoch": 0.8531583264971287, "step": 520 }, { "loss": 0.6761, "grad_norm": 0.3287002742290497, "learning_rate": 5.073968950110941e-06, "entropy": 0.6834310472011567, "num_tokens": 12174723.0, "mean_token_accuracy": 0.8104397505521774, "epoch": 0.8695652173913043, "step": 530 }, { "loss": 0.6751, "grad_norm": 0.35229238867759705, "learning_rate": 3.914092168575306e-06, "entropy": 0.6824660181999207, "num_tokens": 12398555.0, "mean_token_accuracy": 0.8104325562715531, "epoch": 0.8859721082854799, "step": 540 }, { "loss": 0.6834, "grad_norm": 0.38912639021873474, "learning_rate": 2.8992187032210518e-06, "entropy": 0.682240468263626, "num_tokens": 12624846.0, "mean_token_accuracy": 0.8091065347194671, "epoch": 0.9023789991796555, "step": 550 }, { "loss": 0.696, "grad_norm": 0.306355744600296, "learning_rate": 2.032541721437209e-06, "entropy": 0.7058492481708527, "num_tokens": 12859015.0, "mean_token_accuracy": 0.8039765357971191, "epoch": 0.918785890073831, "step": 560 }, { "loss": 0.6727, "grad_norm": 0.38508960604667664, "learning_rate": 1.3167881096480372e-06, "entropy": 0.681548210978508, "num_tokens": 13083551.0, "mean_token_accuracy": 0.8100948423147202, "epoch": 0.9351927809680065, "step": 570 }, { "loss": 0.7208, "grad_norm": 0.33893731236457825, "learning_rate": 7.542098935195918e-07, "entropy": 0.7220237284898758, "num_tokens": 13308857.0, "mean_token_accuracy": 0.8005945891141891, "epoch": 0.9515996718621821, "step": 580 }, { "loss": 0.6759, "grad_norm": 0.3534739911556244, "learning_rate": 3.465771522536854e-07, "entropy": 0.6739370882511139, "num_tokens": 13543857.0, "mean_token_accuracy": 0.8097480118274689, "epoch": 0.9680065627563577, "step": 590 }, { "loss": 0.6865, "grad_norm": 0.3553875982761383, "learning_rate": 9.517244926393609e-08, "entropy": 0.6908959478139878, "num_tokens": 13769574.0, "mean_token_accuracy": 0.806584045290947, "epoch": 0.9844134536505332, "step": 600 }, { "loss": 0.6525, "grad_norm": 0.5078703761100769, "learning_rate": 7.867967567354306e-10, "entropy": 0.6598060852602908, "num_tokens": 13978118.0, "mean_token_accuracy": 0.8165042933664823, "epoch": 1.0, "step": 610 }, { "train_runtime": 6449.4338, "train_samples_per_second": 3.024, "train_steps_per_second": 0.095, "total_flos": 7.156995496917074e+18, "train_loss": 0.7796625786140317, "epoch": 1.0, "step": 610 } ]