diff --git "a/checkpoint-6076/trainer_state.json" "b/checkpoint-6076/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-6076/trainer_state.json" @@ -0,0 +1,12304 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 608, + "global_step": 6076, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_entropy": 2.563803545593872, + "eval_loss": 1.3544182777404785, + "eval_mean_token_accuracy": 0.687368397152733, + "eval_num_tokens": 0.0, + "eval_runtime": 198.6529, + "eval_samples_per_second": 41.917, + "eval_steps_per_second": 6.987, + "step": 0 + }, + { + "entropy": 2.578546404838562, + "epoch": 0.0003291639236339697, + "grad_norm": 58.16326141357422, + "learning_rate": 0.0, + "loss": 1.4746, + "mean_token_accuracy": 0.6601307392120361, + "num_tokens": 2101.0, + "step": 1 + }, + { + "entropy": 2.547638326883316, + "epoch": 0.0016458196181698486, + "grad_norm": 50.99738311767578, + "learning_rate": 2.6315789473684213e-07, + "loss": 1.2383, + "mean_token_accuracy": 0.7003932222723961, + "num_tokens": 10649.0, + "step": 5 + }, + { + "entropy": 2.5566530227661133, + "epoch": 0.0032916392363396972, + "grad_norm": 38.35203552246094, + "learning_rate": 5.921052631578947e-07, + "loss": 1.0069, + "mean_token_accuracy": 0.7398434042930603, + "num_tokens": 21271.0, + "step": 10 + }, + { + "entropy": 2.452682948112488, + "epoch": 0.004937458854509546, + "grad_norm": 27.224767684936523, + "learning_rate": 9.210526315789474e-07, + "loss": 0.5295, + "mean_token_accuracy": 0.8550694227218628, + "num_tokens": 32112.0, + "step": 15 + }, + { + "entropy": 2.1593106389045715, + "epoch": 0.0065832784726793945, + "grad_norm": 23.221702575683594, + "learning_rate": 1.25e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.9244555592536926, + "num_tokens": 42919.0, + "step": 20 + }, + { + "entropy": 1.6036222577095032, + "epoch": 0.008229098090849244, + "grad_norm": 15.987848281860352, + "learning_rate": 1.5789473684210526e-06, + "loss": 0.1681, + "mean_token_accuracy": 0.9467954277992249, + "num_tokens": 53618.0, + "step": 25 + }, + { + "entropy": 1.486020290851593, + "epoch": 0.009874917709019092, + "grad_norm": 11.537003517150879, + "learning_rate": 1.9078947368421057e-06, + "loss": 0.1446, + "mean_token_accuracy": 0.9633475482463837, + "num_tokens": 64755.0, + "step": 30 + }, + { + "entropy": 1.6325503945350648, + "epoch": 0.01152073732718894, + "grad_norm": 7.058774471282959, + "learning_rate": 2.236842105263158e-06, + "loss": 0.149, + "mean_token_accuracy": 0.9636693239212036, + "num_tokens": 75388.0, + "step": 35 + }, + { + "entropy": 1.7897157430648805, + "epoch": 0.013166556945358789, + "grad_norm": 6.356208324432373, + "learning_rate": 2.565789473684211e-06, + "loss": 0.0838, + "mean_token_accuracy": 0.9791653394699097, + "num_tokens": 86153.0, + "step": 40 + }, + { + "entropy": 1.7598371386528016, + "epoch": 0.014812376563528637, + "grad_norm": 11.368063926696777, + "learning_rate": 2.8947368421052634e-06, + "loss": 0.0981, + "mean_token_accuracy": 0.9723430037498474, + "num_tokens": 96649.0, + "step": 45 + }, + { + "entropy": 1.5412654876708984, + "epoch": 0.016458196181698487, + "grad_norm": 11.02826976776123, + "learning_rate": 3.223684210526316e-06, + "loss": 0.0802, + "mean_token_accuracy": 0.9778924286365509, + "num_tokens": 107209.0, + "step": 50 + }, + { + "entropy": 1.419243288040161, + "epoch": 0.018104015799868336, + "grad_norm": 14.368644714355469, + "learning_rate": 3.5526315789473687e-06, + "loss": 0.1422, + "mean_token_accuracy": 0.9643596112728119, + "num_tokens": 117665.0, + "step": 55 + }, + { + "entropy": 1.5144587397575378, + "epoch": 0.019749835418038184, + "grad_norm": 8.765934944152832, + "learning_rate": 3.8815789473684214e-06, + "loss": 0.1094, + "mean_token_accuracy": 0.9742422461509704, + "num_tokens": 128149.0, + "step": 60 + }, + { + "entropy": 1.6769550323486329, + "epoch": 0.021395655036208033, + "grad_norm": 9.26172161102295, + "learning_rate": 4.210526315789474e-06, + "loss": 0.0886, + "mean_token_accuracy": 0.9756901502609253, + "num_tokens": 138845.0, + "step": 65 + }, + { + "entropy": 1.804041564464569, + "epoch": 0.02304147465437788, + "grad_norm": 7.930450439453125, + "learning_rate": 4.539473684210527e-06, + "loss": 0.1117, + "mean_token_accuracy": 0.9712024033069611, + "num_tokens": 149695.0, + "step": 70 + }, + { + "entropy": 1.8376032829284668, + "epoch": 0.02468729427254773, + "grad_norm": 5.366224765777588, + "learning_rate": 4.8684210526315795e-06, + "loss": 0.0885, + "mean_token_accuracy": 0.9742565512657165, + "num_tokens": 160769.0, + "step": 75 + }, + { + "entropy": 1.8037242650985719, + "epoch": 0.026333113890717578, + "grad_norm": 50.815311431884766, + "learning_rate": 5.197368421052632e-06, + "loss": 0.0664, + "mean_token_accuracy": 0.9830366194248199, + "num_tokens": 171557.0, + "step": 80 + }, + { + "entropy": 1.7092326641082765, + "epoch": 0.027978933508887426, + "grad_norm": 22.222219467163086, + "learning_rate": 5.526315789473685e-06, + "loss": 0.0962, + "mean_token_accuracy": 0.9750658094882965, + "num_tokens": 182226.0, + "step": 85 + }, + { + "entropy": 1.6772519588470458, + "epoch": 0.029624753127057275, + "grad_norm": 7.502695560455322, + "learning_rate": 5.855263157894738e-06, + "loss": 0.1128, + "mean_token_accuracy": 0.9759301841259003, + "num_tokens": 193341.0, + "step": 90 + }, + { + "entropy": 1.8118021130561828, + "epoch": 0.031270572745227126, + "grad_norm": 4.66135311126709, + "learning_rate": 6.18421052631579e-06, + "loss": 0.0788, + "mean_token_accuracy": 0.9772343516349793, + "num_tokens": 204280.0, + "step": 95 + }, + { + "entropy": 1.7658962607383728, + "epoch": 0.032916392363396975, + "grad_norm": 9.973278045654297, + "learning_rate": 6.513157894736842e-06, + "loss": 0.0693, + "mean_token_accuracy": 0.9800639986991883, + "num_tokens": 214849.0, + "step": 100 + }, + { + "entropy": 1.6705022692680358, + "epoch": 0.03456221198156682, + "grad_norm": 12.13133430480957, + "learning_rate": 6.842105263157896e-06, + "loss": 0.1029, + "mean_token_accuracy": 0.9713725507259369, + "num_tokens": 225617.0, + "step": 105 + }, + { + "entropy": 1.6237557530403137, + "epoch": 0.03620803159973667, + "grad_norm": 12.934727668762207, + "learning_rate": 7.1710526315789475e-06, + "loss": 0.092, + "mean_token_accuracy": 0.9715362131595612, + "num_tokens": 236011.0, + "step": 110 + }, + { + "entropy": 1.6253692150115966, + "epoch": 0.03785385121790652, + "grad_norm": 5.850583076477051, + "learning_rate": 7.500000000000001e-06, + "loss": 0.0856, + "mean_token_accuracy": 0.9789658904075622, + "num_tokens": 246517.0, + "step": 115 + }, + { + "entropy": 1.699216902256012, + "epoch": 0.03949967083607637, + "grad_norm": 9.97748851776123, + "learning_rate": 7.828947368421054e-06, + "loss": 0.1076, + "mean_token_accuracy": 0.9751021385192871, + "num_tokens": 257039.0, + "step": 120 + }, + { + "entropy": 1.7488463997840882, + "epoch": 0.04114549045424622, + "grad_norm": 9.265748023986816, + "learning_rate": 8.157894736842106e-06, + "loss": 0.0841, + "mean_token_accuracy": 0.9726714611053466, + "num_tokens": 267666.0, + "step": 125 + }, + { + "entropy": 1.7969783902168275, + "epoch": 0.042791310072416065, + "grad_norm": 6.479616641998291, + "learning_rate": 8.486842105263159e-06, + "loss": 0.0759, + "mean_token_accuracy": 0.9793728470802308, + "num_tokens": 278047.0, + "step": 130 + }, + { + "entropy": 1.6162196516990661, + "epoch": 0.044437129690585914, + "grad_norm": 8.1745023727417, + "learning_rate": 8.81578947368421e-06, + "loss": 0.0684, + "mean_token_accuracy": 0.9798145830631256, + "num_tokens": 288638.0, + "step": 135 + }, + { + "entropy": 1.603439712524414, + "epoch": 0.04608294930875576, + "grad_norm": 11.199191093444824, + "learning_rate": 9.144736842105264e-06, + "loss": 0.1103, + "mean_token_accuracy": 0.973408317565918, + "num_tokens": 299280.0, + "step": 140 + }, + { + "entropy": 1.7563074707984925, + "epoch": 0.04772876892692561, + "grad_norm": 7.97125244140625, + "learning_rate": 9.473684210526315e-06, + "loss": 0.1014, + "mean_token_accuracy": 0.9699687659740448, + "num_tokens": 309815.0, + "step": 145 + }, + { + "entropy": 1.954578173160553, + "epoch": 0.04937458854509546, + "grad_norm": 5.959860324859619, + "learning_rate": 9.80263157894737e-06, + "loss": 0.0905, + "mean_token_accuracy": 0.9725431621074676, + "num_tokens": 320598.0, + "step": 150 + }, + { + "entropy": 1.997165060043335, + "epoch": 0.05102040816326531, + "grad_norm": 4.954457759857178, + "learning_rate": 1.0131578947368421e-05, + "loss": 0.0671, + "mean_token_accuracy": 0.983484423160553, + "num_tokens": 331592.0, + "step": 155 + }, + { + "entropy": 1.9534252166748047, + "epoch": 0.052666227781435156, + "grad_norm": 8.918072700500488, + "learning_rate": 1.0460526315789474e-05, + "loss": 0.0995, + "mean_token_accuracy": 0.9751366376876831, + "num_tokens": 342211.0, + "step": 160 + }, + { + "entropy": 1.8752238154411316, + "epoch": 0.054312047399605004, + "grad_norm": 8.222981452941895, + "learning_rate": 1.0789473684210528e-05, + "loss": 0.1081, + "mean_token_accuracy": 0.9731775224208832, + "num_tokens": 352894.0, + "step": 165 + }, + { + "entropy": 1.8922731161117554, + "epoch": 0.05595786701777485, + "grad_norm": 6.892168045043945, + "learning_rate": 1.111842105263158e-05, + "loss": 0.1093, + "mean_token_accuracy": 0.9666784107685089, + "num_tokens": 363571.0, + "step": 170 + }, + { + "entropy": 1.8998683929443358, + "epoch": 0.0576036866359447, + "grad_norm": 3.256152629852295, + "learning_rate": 1.1447368421052632e-05, + "loss": 0.0701, + "mean_token_accuracy": 0.9812213480472565, + "num_tokens": 374077.0, + "step": 175 + }, + { + "entropy": 1.8406203866004944, + "epoch": 0.05924950625411455, + "grad_norm": 5.9513983726501465, + "learning_rate": 1.1776315789473684e-05, + "loss": 0.0642, + "mean_token_accuracy": 0.9829223692417145, + "num_tokens": 384936.0, + "step": 180 + }, + { + "entropy": 1.8397523403167724, + "epoch": 0.0608953258722844, + "grad_norm": 5.223180770874023, + "learning_rate": 1.2105263157894737e-05, + "loss": 0.1066, + "mean_token_accuracy": 0.9780671417713165, + "num_tokens": 395309.0, + "step": 185 + }, + { + "entropy": 1.8732271909713745, + "epoch": 0.06254114549045425, + "grad_norm": 5.581716060638428, + "learning_rate": 1.2434210526315791e-05, + "loss": 0.1243, + "mean_token_accuracy": 0.9716349482536316, + "num_tokens": 406122.0, + "step": 190 + }, + { + "entropy": 1.8642462372779847, + "epoch": 0.0641869651086241, + "grad_norm": 4.00003719329834, + "learning_rate": 1.2763157894736844e-05, + "loss": 0.0881, + "mean_token_accuracy": 0.9758350074291229, + "num_tokens": 416622.0, + "step": 195 + }, + { + "entropy": 1.763167428970337, + "epoch": 0.06583278472679395, + "grad_norm": 4.925802707672119, + "learning_rate": 1.3092105263157895e-05, + "loss": 0.0865, + "mean_token_accuracy": 0.9732065200805664, + "num_tokens": 427374.0, + "step": 200 + }, + { + "entropy": 1.707397425174713, + "epoch": 0.06747860434496379, + "grad_norm": 3.0551066398620605, + "learning_rate": 1.3421052631578948e-05, + "loss": 0.081, + "mean_token_accuracy": 0.9832678198814392, + "num_tokens": 437906.0, + "step": 205 + }, + { + "entropy": 1.6210663080215455, + "epoch": 0.06912442396313365, + "grad_norm": 5.318910598754883, + "learning_rate": 1.375e-05, + "loss": 0.0959, + "mean_token_accuracy": 0.978535383939743, + "num_tokens": 448512.0, + "step": 210 + }, + { + "entropy": 1.7267163753509522, + "epoch": 0.07077024358130349, + "grad_norm": 18.240541458129883, + "learning_rate": 1.4078947368421055e-05, + "loss": 0.0878, + "mean_token_accuracy": 0.9779551386833191, + "num_tokens": 459296.0, + "step": 215 + }, + { + "entropy": 1.9074785947799682, + "epoch": 0.07241606319947334, + "grad_norm": 4.598775386810303, + "learning_rate": 1.4407894736842108e-05, + "loss": 0.1177, + "mean_token_accuracy": 0.9697455763816833, + "num_tokens": 469857.0, + "step": 220 + }, + { + "entropy": 2.076517927646637, + "epoch": 0.07406188281764318, + "grad_norm": 6.2975239753723145, + "learning_rate": 1.4736842105263159e-05, + "loss": 0.1191, + "mean_token_accuracy": 0.9704063773155213, + "num_tokens": 480679.0, + "step": 225 + }, + { + "entropy": 2.075191152095795, + "epoch": 0.07570770243581304, + "grad_norm": 5.611932754516602, + "learning_rate": 1.5065789473684211e-05, + "loss": 0.1126, + "mean_token_accuracy": 0.9733084321022034, + "num_tokens": 491266.0, + "step": 230 + }, + { + "entropy": 1.8972731590270997, + "epoch": 0.07735352205398288, + "grad_norm": 7.2364888191223145, + "learning_rate": 1.5394736842105264e-05, + "loss": 0.0767, + "mean_token_accuracy": 0.9798118114471436, + "num_tokens": 502150.0, + "step": 235 + }, + { + "entropy": 1.9421729803085328, + "epoch": 0.07899934167215274, + "grad_norm": 5.142597198486328, + "learning_rate": 1.572368421052632e-05, + "loss": 0.0984, + "mean_token_accuracy": 0.9767365634441376, + "num_tokens": 512761.0, + "step": 240 + }, + { + "entropy": 2.1024407386779784, + "epoch": 0.08064516129032258, + "grad_norm": 6.71290922164917, + "learning_rate": 1.605263157894737e-05, + "loss": 0.0941, + "mean_token_accuracy": 0.968624371290207, + "num_tokens": 523326.0, + "step": 245 + }, + { + "entropy": 2.108752429485321, + "epoch": 0.08229098090849243, + "grad_norm": 6.765599727630615, + "learning_rate": 1.638157894736842e-05, + "loss": 0.1249, + "mean_token_accuracy": 0.9727495968341827, + "num_tokens": 534182.0, + "step": 250 + }, + { + "entropy": 2.1626508831977844, + "epoch": 0.08393680052666228, + "grad_norm": 6.46060848236084, + "learning_rate": 1.6710526315789475e-05, + "loss": 0.0776, + "mean_token_accuracy": 0.9789380371570587, + "num_tokens": 544990.0, + "step": 255 + }, + { + "entropy": 2.010746192932129, + "epoch": 0.08558262014483213, + "grad_norm": 5.445334434509277, + "learning_rate": 1.703947368421053e-05, + "loss": 0.0855, + "mean_token_accuracy": 0.9773862421512604, + "num_tokens": 555445.0, + "step": 260 + }, + { + "entropy": 1.7812985062599183, + "epoch": 0.08722843976300197, + "grad_norm": 5.686192512512207, + "learning_rate": 1.736842105263158e-05, + "loss": 0.0891, + "mean_token_accuracy": 0.975337165594101, + "num_tokens": 566278.0, + "step": 265 + }, + { + "entropy": 1.6327629923820495, + "epoch": 0.08887425938117183, + "grad_norm": 6.93804931640625, + "learning_rate": 1.769736842105263e-05, + "loss": 0.1093, + "mean_token_accuracy": 0.9774836421012878, + "num_tokens": 577347.0, + "step": 270 + }, + { + "entropy": 1.6540838599205017, + "epoch": 0.09052007899934167, + "grad_norm": 4.582030773162842, + "learning_rate": 1.8026315789473685e-05, + "loss": 0.0712, + "mean_token_accuracy": 0.9831354200839997, + "num_tokens": 587873.0, + "step": 275 + }, + { + "entropy": 1.614749014377594, + "epoch": 0.09216589861751152, + "grad_norm": 4.63378381729126, + "learning_rate": 1.835526315789474e-05, + "loss": 0.0818, + "mean_token_accuracy": 0.9796861112117767, + "num_tokens": 598584.0, + "step": 280 + }, + { + "entropy": 1.7127745985984801, + "epoch": 0.09381171823568137, + "grad_norm": 5.947144508361816, + "learning_rate": 1.868421052631579e-05, + "loss": 0.0864, + "mean_token_accuracy": 0.9765213131904602, + "num_tokens": 608860.0, + "step": 285 + }, + { + "entropy": 1.871683084964752, + "epoch": 0.09545753785385122, + "grad_norm": 7.703008651733398, + "learning_rate": 1.9013157894736845e-05, + "loss": 0.0671, + "mean_token_accuracy": 0.9811056196689606, + "num_tokens": 619377.0, + "step": 290 + }, + { + "entropy": 1.9299902677536012, + "epoch": 0.09710335747202106, + "grad_norm": 6.416894912719727, + "learning_rate": 1.9342105263157896e-05, + "loss": 0.089, + "mean_token_accuracy": 0.9779207348823548, + "num_tokens": 630207.0, + "step": 295 + }, + { + "entropy": 2.0591522693634032, + "epoch": 0.09874917709019092, + "grad_norm": 2.4779272079467773, + "learning_rate": 1.9671052631578947e-05, + "loss": 0.0552, + "mean_token_accuracy": 0.9879588544368744, + "num_tokens": 640467.0, + "step": 300 + }, + { + "entropy": 2.069437396526337, + "epoch": 0.10039499670836076, + "grad_norm": 6.207917213439941, + "learning_rate": 2e-05, + "loss": 0.1163, + "mean_token_accuracy": 0.9745943665504455, + "num_tokens": 651341.0, + "step": 305 + }, + { + "entropy": 2.21907696723938, + "epoch": 0.10204081632653061, + "grad_norm": 4.81263542175293, + "learning_rate": 1.9999962969732823e-05, + "loss": 0.0853, + "mean_token_accuracy": 0.9776013195514679, + "num_tokens": 661968.0, + "step": 310 + }, + { + "entropy": 2.249755620956421, + "epoch": 0.10368663594470046, + "grad_norm": 5.646886348724365, + "learning_rate": 1.999985187920555e-05, + "loss": 0.114, + "mean_token_accuracy": 0.9764432370662689, + "num_tokens": 672739.0, + "step": 315 + }, + { + "entropy": 2.1538678646087646, + "epoch": 0.10533245556287031, + "grad_norm": 6.742599010467529, + "learning_rate": 1.9999666729240908e-05, + "loss": 0.0768, + "mean_token_accuracy": 0.9786048173904419, + "num_tokens": 683514.0, + "step": 320 + }, + { + "entropy": 1.9418102025985717, + "epoch": 0.10697827518104015, + "grad_norm": 5.256528854370117, + "learning_rate": 1.9999407521210143e-05, + "loss": 0.091, + "mean_token_accuracy": 0.9797745287418366, + "num_tokens": 694254.0, + "step": 325 + }, + { + "entropy": 1.8409188032150268, + "epoch": 0.10862409479921001, + "grad_norm": 6.152730464935303, + "learning_rate": 1.9999074257032953e-05, + "loss": 0.0924, + "mean_token_accuracy": 0.9798021256923676, + "num_tokens": 704933.0, + "step": 330 + }, + { + "entropy": 1.9683008909225463, + "epoch": 0.11026991441737985, + "grad_norm": 6.580195903778076, + "learning_rate": 1.9998666939177514e-05, + "loss": 0.1345, + "mean_token_accuracy": 0.9660695493221283, + "num_tokens": 715336.0, + "step": 335 + }, + { + "entropy": 2.197187530994415, + "epoch": 0.1119157340355497, + "grad_norm": 5.056039810180664, + "learning_rate": 1.9998185570660445e-05, + "loss": 0.1243, + "mean_token_accuracy": 0.9728079676628113, + "num_tokens": 726034.0, + "step": 340 + }, + { + "entropy": 2.3285026073455812, + "epoch": 0.11356155365371955, + "grad_norm": 6.029480934143066, + "learning_rate": 1.9997630155046784e-05, + "loss": 0.1302, + "mean_token_accuracy": 0.9665205538272857, + "num_tokens": 737083.0, + "step": 345 + }, + { + "entropy": 2.1871792316436767, + "epoch": 0.1152073732718894, + "grad_norm": 5.840641498565674, + "learning_rate": 1.9997000696449973e-05, + "loss": 0.087, + "mean_token_accuracy": 0.9797586441040039, + "num_tokens": 747987.0, + "step": 350 + }, + { + "entropy": 2.055765151977539, + "epoch": 0.11685319289005924, + "grad_norm": 10.675565719604492, + "learning_rate": 1.9996297199531813e-05, + "loss": 0.1199, + "mean_token_accuracy": 0.9669549405574799, + "num_tokens": 758532.0, + "step": 355 + }, + { + "entropy": 2.0415255546569826, + "epoch": 0.1184990125082291, + "grad_norm": 6.482625961303711, + "learning_rate": 1.9995519669502438e-05, + "loss": 0.1343, + "mean_token_accuracy": 0.9686403274536133, + "num_tokens": 769143.0, + "step": 360 + }, + { + "entropy": 2.146265912055969, + "epoch": 0.12014483212639894, + "grad_norm": 6.276276588439941, + "learning_rate": 1.9994668112120283e-05, + "loss": 0.0893, + "mean_token_accuracy": 0.9802504718303681, + "num_tokens": 779714.0, + "step": 365 + }, + { + "entropy": 2.267709231376648, + "epoch": 0.1217906517445688, + "grad_norm": 3.6624529361724854, + "learning_rate": 1.999374253369202e-05, + "loss": 0.1289, + "mean_token_accuracy": 0.967824923992157, + "num_tokens": 790409.0, + "step": 370 + }, + { + "entropy": 2.3208181142807005, + "epoch": 0.12343647136273865, + "grad_norm": 8.087434768676758, + "learning_rate": 1.999274294107254e-05, + "loss": 0.0958, + "mean_token_accuracy": 0.9716035008430481, + "num_tokens": 801020.0, + "step": 375 + }, + { + "entropy": 2.2529816150665285, + "epoch": 0.1250822909809085, + "grad_norm": 6.710372447967529, + "learning_rate": 1.9991669341664873e-05, + "loss": 0.1048, + "mean_token_accuracy": 0.971696001291275, + "num_tokens": 811861.0, + "step": 380 + }, + { + "entropy": 2.1597923040390015, + "epoch": 0.12672811059907835, + "grad_norm": 3.219034194946289, + "learning_rate": 1.9990521743420156e-05, + "loss": 0.1137, + "mean_token_accuracy": 0.9790872097015381, + "num_tokens": 822548.0, + "step": 385 + }, + { + "entropy": 2.06735520362854, + "epoch": 0.1283739302172482, + "grad_norm": 2.9641964435577393, + "learning_rate": 1.9989300154837564e-05, + "loss": 0.0935, + "mean_token_accuracy": 0.9800699293613434, + "num_tokens": 833368.0, + "step": 390 + }, + { + "entropy": 2.0970317363739013, + "epoch": 0.13001974983541803, + "grad_norm": 4.221366882324219, + "learning_rate": 1.9988004584964243e-05, + "loss": 0.0939, + "mean_token_accuracy": 0.9755605041980744, + "num_tokens": 844003.0, + "step": 395 + }, + { + "entropy": 2.073064410686493, + "epoch": 0.1316655694535879, + "grad_norm": 7.461073398590088, + "learning_rate": 1.9986635043395258e-05, + "loss": 0.1238, + "mean_token_accuracy": 0.9669863104820251, + "num_tokens": 854150.0, + "step": 400 + }, + { + "entropy": 1.975202488899231, + "epoch": 0.13331138907175774, + "grad_norm": 6.416271209716797, + "learning_rate": 1.9985191540273506e-05, + "loss": 0.0933, + "mean_token_accuracy": 0.9760487675666809, + "num_tokens": 864751.0, + "step": 405 + }, + { + "entropy": 1.9307495832443238, + "epoch": 0.13495720868992758, + "grad_norm": 6.192378044128418, + "learning_rate": 1.9983674086289647e-05, + "loss": 0.0756, + "mean_token_accuracy": 0.9799009144306183, + "num_tokens": 875364.0, + "step": 410 + }, + { + "entropy": 1.9320047855377198, + "epoch": 0.13660302830809742, + "grad_norm": 4.568058013916016, + "learning_rate": 1.9982082692682027e-05, + "loss": 0.0647, + "mean_token_accuracy": 0.9817326724529266, + "num_tokens": 885830.0, + "step": 415 + }, + { + "entropy": 1.8869742393493651, + "epoch": 0.1382488479262673, + "grad_norm": 6.1201019287109375, + "learning_rate": 1.998041737123659e-05, + "loss": 0.1027, + "mean_token_accuracy": 0.9717364609241486, + "num_tokens": 896622.0, + "step": 420 + }, + { + "entropy": 1.9822751045227052, + "epoch": 0.13989466754443713, + "grad_norm": 4.20133638381958, + "learning_rate": 1.9978678134286796e-05, + "loss": 0.1138, + "mean_token_accuracy": 0.9745773077011108, + "num_tokens": 907113.0, + "step": 425 + }, + { + "entropy": 2.109739363193512, + "epoch": 0.14154048716260698, + "grad_norm": 4.79110860824585, + "learning_rate": 1.997686499471353e-05, + "loss": 0.0759, + "mean_token_accuracy": 0.9797537565231323, + "num_tokens": 917765.0, + "step": 430 + }, + { + "entropy": 2.127437674999237, + "epoch": 0.14318630678077682, + "grad_norm": 7.135578632354736, + "learning_rate": 1.9974977965945e-05, + "loss": 0.0929, + "mean_token_accuracy": 0.9721114039421082, + "num_tokens": 928469.0, + "step": 435 + }, + { + "entropy": 1.9786985039710998, + "epoch": 0.1448321263989467, + "grad_norm": 5.621862888336182, + "learning_rate": 1.9973017061956638e-05, + "loss": 0.1085, + "mean_token_accuracy": 0.9724132001399994, + "num_tokens": 939212.0, + "step": 440 + }, + { + "entropy": 1.7866165399551392, + "epoch": 0.14647794601711653, + "grad_norm": 3.2050716876983643, + "learning_rate": 1.9970982297271007e-05, + "loss": 0.0794, + "mean_token_accuracy": 0.978398722410202, + "num_tokens": 950445.0, + "step": 445 + }, + { + "entropy": 1.7645732879638671, + "epoch": 0.14812376563528637, + "grad_norm": 6.321614742279053, + "learning_rate": 1.996887368695768e-05, + "loss": 0.0907, + "mean_token_accuracy": 0.9744451761245727, + "num_tokens": 961106.0, + "step": 450 + }, + { + "entropy": 1.7317794203758239, + "epoch": 0.1497695852534562, + "grad_norm": 4.161308765411377, + "learning_rate": 1.9966691246633143e-05, + "loss": 0.0625, + "mean_token_accuracy": 0.9846729040145874, + "num_tokens": 971898.0, + "step": 455 + }, + { + "entropy": 1.742343807220459, + "epoch": 0.15141540487162608, + "grad_norm": 4.4447503089904785, + "learning_rate": 1.9964434992460662e-05, + "loss": 0.0797, + "mean_token_accuracy": 0.9820732772350311, + "num_tokens": 982441.0, + "step": 460 + }, + { + "entropy": 1.8356537818908691, + "epoch": 0.15306122448979592, + "grad_norm": 10.55756664276123, + "learning_rate": 1.9962104941150177e-05, + "loss": 0.1023, + "mean_token_accuracy": 0.9745754778385163, + "num_tokens": 993128.0, + "step": 465 + }, + { + "entropy": 2.004080033302307, + "epoch": 0.15470704410796576, + "grad_norm": 3.8827741146087646, + "learning_rate": 1.995970110995817e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9818957507610321, + "num_tokens": 1003682.0, + "step": 470 + }, + { + "entropy": 2.0348508238792418, + "epoch": 0.1563528637261356, + "grad_norm": 5.195158958435059, + "learning_rate": 1.9957223516687545e-05, + "loss": 0.0934, + "mean_token_accuracy": 0.9747053384780884, + "num_tokens": 1014255.0, + "step": 475 + }, + { + "entropy": 2.0494486331939696, + "epoch": 0.15799868334430547, + "grad_norm": 6.356441497802734, + "learning_rate": 1.995467217968749e-05, + "loss": 0.1203, + "mean_token_accuracy": 0.9745293378829956, + "num_tokens": 1024794.0, + "step": 480 + }, + { + "entropy": 2.107176351547241, + "epoch": 0.15964450296247532, + "grad_norm": 4.578567028045654, + "learning_rate": 1.9952047117853345e-05, + "loss": 0.0774, + "mean_token_accuracy": 0.9803178489208222, + "num_tokens": 1035500.0, + "step": 485 + }, + { + "entropy": 2.1053681492805483, + "epoch": 0.16129032258064516, + "grad_norm": 3.887092113494873, + "learning_rate": 1.9949348350626456e-05, + "loss": 0.0903, + "mean_token_accuracy": 0.9732440710067749, + "num_tokens": 1046221.0, + "step": 490 + }, + { + "entropy": 2.17068989276886, + "epoch": 0.162936142198815, + "grad_norm": 8.976771354675293, + "learning_rate": 1.9946575897994042e-05, + "loss": 0.0549, + "mean_token_accuracy": 0.9809908986091613, + "num_tokens": 1056570.0, + "step": 495 + }, + { + "entropy": 2.05935001373291, + "epoch": 0.16458196181698487, + "grad_norm": 4.243111610412598, + "learning_rate": 1.994372978048903e-05, + "loss": 0.0691, + "mean_token_accuracy": 0.9819040298461914, + "num_tokens": 1067196.0, + "step": 500 + }, + { + "entropy": 2.047864282131195, + "epoch": 0.1662277814351547, + "grad_norm": 3.212517738342285, + "learning_rate": 1.9940810019189912e-05, + "loss": 0.0839, + "mean_token_accuracy": 0.9838090300559997, + "num_tokens": 1077680.0, + "step": 505 + }, + { + "entropy": 2.0122323632240295, + "epoch": 0.16787360105332455, + "grad_norm": 7.405489444732666, + "learning_rate": 1.9937816635720614e-05, + "loss": 0.1078, + "mean_token_accuracy": 0.9703534841537476, + "num_tokens": 1088097.0, + "step": 510 + }, + { + "entropy": 2.0119399070739745, + "epoch": 0.1695194206714944, + "grad_norm": 5.046123027801514, + "learning_rate": 1.9934749652250275e-05, + "loss": 0.085, + "mean_token_accuracy": 0.9790121138095855, + "num_tokens": 1098795.0, + "step": 515 + }, + { + "entropy": 2.0802261352539064, + "epoch": 0.17116524028966426, + "grad_norm": 4.109484672546387, + "learning_rate": 1.9931609091493154e-05, + "loss": 0.0873, + "mean_token_accuracy": 0.9734878361225128, + "num_tokens": 1109471.0, + "step": 520 + }, + { + "entropy": 2.095242714881897, + "epoch": 0.1728110599078341, + "grad_norm": 3.953889846801758, + "learning_rate": 1.9928394976708403e-05, + "loss": 0.089, + "mean_token_accuracy": 0.9813098549842835, + "num_tokens": 1119968.0, + "step": 525 + }, + { + "entropy": 2.2089867115020754, + "epoch": 0.17445687952600394, + "grad_norm": 5.200376510620117, + "learning_rate": 1.9925107331699928e-05, + "loss": 0.1003, + "mean_token_accuracy": 0.9750366508960724, + "num_tokens": 1130579.0, + "step": 530 + }, + { + "entropy": 2.2357950687408445, + "epoch": 0.17610269914417379, + "grad_norm": 5.884069919586182, + "learning_rate": 1.992174618081621e-05, + "loss": 0.0867, + "mean_token_accuracy": 0.979562520980835, + "num_tokens": 1140933.0, + "step": 535 + }, + { + "entropy": 2.1410197973251344, + "epoch": 0.17774851876234365, + "grad_norm": 6.01693058013916, + "learning_rate": 1.9918311548950102e-05, + "loss": 0.0957, + "mean_token_accuracy": 0.9744680047035217, + "num_tokens": 1151887.0, + "step": 540 + }, + { + "entropy": 2.228221225738525, + "epoch": 0.1793943383805135, + "grad_norm": 3.3614420890808105, + "learning_rate": 1.991480346153868e-05, + "loss": 0.0887, + "mean_token_accuracy": 0.9763470590114594, + "num_tokens": 1162403.0, + "step": 545 + }, + { + "entropy": 2.1968831777572633, + "epoch": 0.18104015799868334, + "grad_norm": 4.788167476654053, + "learning_rate": 1.9911221944563022e-05, + "loss": 0.0929, + "mean_token_accuracy": 0.9781689345836639, + "num_tokens": 1173014.0, + "step": 550 + }, + { + "entropy": 2.0954206466674803, + "epoch": 0.18268597761685318, + "grad_norm": 9.105097770690918, + "learning_rate": 1.9907567024548037e-05, + "loss": 0.0834, + "mean_token_accuracy": 0.9764321863651275, + "num_tokens": 1183721.0, + "step": 555 + }, + { + "entropy": 2.082006883621216, + "epoch": 0.18433179723502305, + "grad_norm": 5.6749267578125, + "learning_rate": 1.990383872856226e-05, + "loss": 0.0805, + "mean_token_accuracy": 0.9781780660152435, + "num_tokens": 1194206.0, + "step": 560 + }, + { + "entropy": 2.0844071984291075, + "epoch": 0.1859776168531929, + "grad_norm": 5.848039150238037, + "learning_rate": 1.9900037084217637e-05, + "loss": 0.1125, + "mean_token_accuracy": 0.9710779368877411, + "num_tokens": 1205005.0, + "step": 565 + }, + { + "entropy": 2.138554549217224, + "epoch": 0.18762343647136273, + "grad_norm": 4.573561668395996, + "learning_rate": 1.9896162119669367e-05, + "loss": 0.0784, + "mean_token_accuracy": 0.978986918926239, + "num_tokens": 1215653.0, + "step": 570 + }, + { + "entropy": 2.1205518007278443, + "epoch": 0.1892692560895326, + "grad_norm": 7.720390796661377, + "learning_rate": 1.9892213863615634e-05, + "loss": 0.1176, + "mean_token_accuracy": 0.9739040195941925, + "num_tokens": 1226075.0, + "step": 575 + }, + { + "entropy": 2.1874123096466063, + "epoch": 0.19091507570770244, + "grad_norm": 3.979853630065918, + "learning_rate": 1.9888192345297438e-05, + "loss": 0.092, + "mean_token_accuracy": 0.9774543702602386, + "num_tokens": 1236488.0, + "step": 580 + }, + { + "entropy": 2.240754795074463, + "epoch": 0.19256089532587228, + "grad_norm": 2.265650510787964, + "learning_rate": 1.9884097594498355e-05, + "loss": 0.0671, + "mean_token_accuracy": 0.9827798247337342, + "num_tokens": 1246837.0, + "step": 585 + }, + { + "entropy": 2.20197674036026, + "epoch": 0.19420671494404212, + "grad_norm": 8.468692779541016, + "learning_rate": 1.9879929641544328e-05, + "loss": 0.0818, + "mean_token_accuracy": 0.9744803309440613, + "num_tokens": 1257412.0, + "step": 590 + }, + { + "entropy": 2.1012311816215514, + "epoch": 0.195852534562212, + "grad_norm": 4.669355869293213, + "learning_rate": 1.9875688517303442e-05, + "loss": 0.0844, + "mean_token_accuracy": 0.9798676431179046, + "num_tokens": 1268044.0, + "step": 595 + }, + { + "entropy": 2.048796272277832, + "epoch": 0.19749835418038184, + "grad_norm": 3.2685132026672363, + "learning_rate": 1.987137425318569e-05, + "loss": 0.0601, + "mean_token_accuracy": 0.9844081580638886, + "num_tokens": 1278849.0, + "step": 600 + }, + { + "entropy": 2.0971350193023683, + "epoch": 0.19914417379855168, + "grad_norm": 5.5244317054748535, + "learning_rate": 1.9866986881142737e-05, + "loss": 0.1095, + "mean_token_accuracy": 0.9722402155399322, + "num_tokens": 1289463.0, + "step": 605 + }, + { + "epoch": 0.20013166556945358, + "eval_entropy": 2.089414228537584, + "eval_loss": 0.0923554077744484, + "eval_mean_token_accuracy": 0.9766228082200635, + "eval_num_tokens": 1295891.0, + "eval_runtime": 196.5996, + "eval_samples_per_second": 42.355, + "eval_steps_per_second": 7.06, + "step": 608 + }, + { + "entropy": 2.0954473495483397, + "epoch": 0.20078999341672152, + "grad_norm": 8.191391944885254, + "learning_rate": 1.9862526433667702e-05, + "loss": 0.1212, + "mean_token_accuracy": 0.97202570438385, + "num_tokens": 1300049.0, + "step": 610 + }, + { + "entropy": 2.0918750286102297, + "epoch": 0.2024358130348914, + "grad_norm": 5.4046478271484375, + "learning_rate": 1.9857992943794894e-05, + "loss": 0.0611, + "mean_token_accuracy": 0.984895896911621, + "num_tokens": 1310575.0, + "step": 615 + }, + { + "entropy": 2.0892592668533325, + "epoch": 0.20408163265306123, + "grad_norm": 4.6009840965271, + "learning_rate": 1.9853386445099585e-05, + "loss": 0.107, + "mean_token_accuracy": 0.9737550795078278, + "num_tokens": 1321474.0, + "step": 620 + }, + { + "entropy": 2.1745733857154845, + "epoch": 0.20572745227123107, + "grad_norm": 5.094756603240967, + "learning_rate": 1.9848706971697744e-05, + "loss": 0.0719, + "mean_token_accuracy": 0.9817912340164184, + "num_tokens": 1332086.0, + "step": 625 + }, + { + "entropy": 2.1878711700439455, + "epoch": 0.2073732718894009, + "grad_norm": 8.04099178314209, + "learning_rate": 1.98439545582458e-05, + "loss": 0.1345, + "mean_token_accuracy": 0.9673551738262176, + "num_tokens": 1342659.0, + "step": 630 + }, + { + "entropy": 2.1850104570388793, + "epoch": 0.20901909150757078, + "grad_norm": 5.618947982788086, + "learning_rate": 1.9839129239940392e-05, + "loss": 0.1271, + "mean_token_accuracy": 0.9722469210624695, + "num_tokens": 1353255.0, + "step": 635 + }, + { + "entropy": 2.2415198564529417, + "epoch": 0.21066491112574062, + "grad_norm": 4.641305923461914, + "learning_rate": 1.9834231052518074e-05, + "loss": 0.0918, + "mean_token_accuracy": 0.9772710859775543, + "num_tokens": 1363984.0, + "step": 640 + }, + { + "entropy": 2.126233756542206, + "epoch": 0.21231073074391046, + "grad_norm": 6.022129535675049, + "learning_rate": 1.9829260032255093e-05, + "loss": 0.1311, + "mean_token_accuracy": 0.9707091450691223, + "num_tokens": 1374522.0, + "step": 645 + }, + { + "entropy": 2.136021304130554, + "epoch": 0.2139565503620803, + "grad_norm": 5.552321910858154, + "learning_rate": 1.9824216215967082e-05, + "loss": 0.12, + "mean_token_accuracy": 0.9679823756217957, + "num_tokens": 1385283.0, + "step": 650 + }, + { + "entropy": 2.1720472097396852, + "epoch": 0.21560236998025017, + "grad_norm": 2.7927908897399902, + "learning_rate": 1.9819099641008817e-05, + "loss": 0.1065, + "mean_token_accuracy": 0.9764181435108185, + "num_tokens": 1395464.0, + "step": 655 + }, + { + "entropy": 2.1303334712982176, + "epoch": 0.21724818959842002, + "grad_norm": 6.158787250518799, + "learning_rate": 1.9813910345273927e-05, + "loss": 0.0986, + "mean_token_accuracy": 0.9694771349430085, + "num_tokens": 1406190.0, + "step": 660 + }, + { + "entropy": 2.0735477566719056, + "epoch": 0.21889400921658986, + "grad_norm": 3.1743812561035156, + "learning_rate": 1.9808648367194614e-05, + "loss": 0.1286, + "mean_token_accuracy": 0.9723605871200561, + "num_tokens": 1417233.0, + "step": 665 + }, + { + "entropy": 1.9879053115844727, + "epoch": 0.2205398288347597, + "grad_norm": 4.825967311859131, + "learning_rate": 1.980331374574137e-05, + "loss": 0.0633, + "mean_token_accuracy": 0.9823581516742707, + "num_tokens": 1427885.0, + "step": 670 + }, + { + "entropy": 1.953224778175354, + "epoch": 0.22218564845292957, + "grad_norm": 5.118729591369629, + "learning_rate": 1.979790652042268e-05, + "loss": 0.0851, + "mean_token_accuracy": 0.9767945885658265, + "num_tokens": 1438245.0, + "step": 675 + }, + { + "entropy": 1.881700599193573, + "epoch": 0.2238314680710994, + "grad_norm": 4.021531105041504, + "learning_rate": 1.9792426731284745e-05, + "loss": 0.1035, + "mean_token_accuracy": 0.9744091331958771, + "num_tokens": 1449016.0, + "step": 680 + }, + { + "entropy": 1.907604944705963, + "epoch": 0.22547728768926925, + "grad_norm": 3.6351332664489746, + "learning_rate": 1.9786874418911187e-05, + "loss": 0.069, + "mean_token_accuracy": 0.9822580933570861, + "num_tokens": 1459773.0, + "step": 685 + }, + { + "entropy": 1.8353791952133178, + "epoch": 0.2271231073074391, + "grad_norm": 5.1667022705078125, + "learning_rate": 1.9781249624422714e-05, + "loss": 0.0554, + "mean_token_accuracy": 0.9839705526828766, + "num_tokens": 1470406.0, + "step": 690 + }, + { + "entropy": 1.7690637350082397, + "epoch": 0.22876892692560896, + "grad_norm": 1.4933720827102661, + "learning_rate": 1.9775552389476865e-05, + "loss": 0.0929, + "mean_token_accuracy": 0.9805386900901795, + "num_tokens": 1481079.0, + "step": 695 + }, + { + "entropy": 1.7553959727287292, + "epoch": 0.2304147465437788, + "grad_norm": 7.114426612854004, + "learning_rate": 1.976978275626766e-05, + "loss": 0.0793, + "mean_token_accuracy": 0.9830655574798584, + "num_tokens": 1491605.0, + "step": 700 + }, + { + "entropy": 1.6573581099510193, + "epoch": 0.23206056616194864, + "grad_norm": 8.001904487609863, + "learning_rate": 1.976394076752531e-05, + "loss": 0.0476, + "mean_token_accuracy": 0.986677348613739, + "num_tokens": 1502176.0, + "step": 705 + }, + { + "entropy": 1.6726499915122985, + "epoch": 0.2337063857801185, + "grad_norm": 2.7234785556793213, + "learning_rate": 1.9758026466515902e-05, + "loss": 0.0755, + "mean_token_accuracy": 0.9787233471870422, + "num_tokens": 1512737.0, + "step": 710 + }, + { + "entropy": 1.6567459583282471, + "epoch": 0.23535220539828836, + "grad_norm": 4.0856547355651855, + "learning_rate": 1.975203989704106e-05, + "loss": 0.0923, + "mean_token_accuracy": 0.980503261089325, + "num_tokens": 1523545.0, + "step": 715 + }, + { + "entropy": 1.7129284858703613, + "epoch": 0.2369980250164582, + "grad_norm": 5.151584625244141, + "learning_rate": 1.9745981103437643e-05, + "loss": 0.0799, + "mean_token_accuracy": 0.979759806394577, + "num_tokens": 1534017.0, + "step": 720 + }, + { + "entropy": 1.7500203251838684, + "epoch": 0.23864384463462804, + "grad_norm": 4.301064968109131, + "learning_rate": 1.9739850130577393e-05, + "loss": 0.0907, + "mean_token_accuracy": 0.9790736913681031, + "num_tokens": 1544763.0, + "step": 725 + }, + { + "entropy": 1.741828978061676, + "epoch": 0.24028966425279788, + "grad_norm": 8.185233116149902, + "learning_rate": 1.973364702386663e-05, + "loss": 0.1146, + "mean_token_accuracy": 0.9718895375728607, + "num_tokens": 1555495.0, + "step": 730 + }, + { + "entropy": 1.706503689289093, + "epoch": 0.24193548387096775, + "grad_norm": 2.9231204986572266, + "learning_rate": 1.972737182924589e-05, + "loss": 0.0904, + "mean_token_accuracy": 0.9795828342437745, + "num_tokens": 1566520.0, + "step": 735 + }, + { + "entropy": 1.6863952279090881, + "epoch": 0.2435813034891376, + "grad_norm": 5.155311584472656, + "learning_rate": 1.9721024593189596e-05, + "loss": 0.0856, + "mean_token_accuracy": 0.9796004176139832, + "num_tokens": 1577011.0, + "step": 740 + }, + { + "entropy": 1.6566576838493348, + "epoch": 0.24522712310730743, + "grad_norm": 5.252721309661865, + "learning_rate": 1.9714605362705725e-05, + "loss": 0.0973, + "mean_token_accuracy": 0.9750127613544464, + "num_tokens": 1587496.0, + "step": 745 + }, + { + "entropy": 1.646758258342743, + "epoch": 0.2468729427254773, + "grad_norm": 6.778387069702148, + "learning_rate": 1.9708114185335434e-05, + "loss": 0.0768, + "mean_token_accuracy": 0.9819158792495728, + "num_tokens": 1598276.0, + "step": 750 + }, + { + "entropy": 1.5819305777549744, + "epoch": 0.24851876234364714, + "grad_norm": 2.566246747970581, + "learning_rate": 1.9701551109152732e-05, + "loss": 0.0955, + "mean_token_accuracy": 0.9769714951515198, + "num_tokens": 1609214.0, + "step": 755 + }, + { + "entropy": 1.5540341138839722, + "epoch": 0.250164581961817, + "grad_norm": 7.977406978607178, + "learning_rate": 1.9694916182764113e-05, + "loss": 0.1004, + "mean_token_accuracy": 0.9762414395809174, + "num_tokens": 1620096.0, + "step": 760 + }, + { + "entropy": 1.6054931163787842, + "epoch": 0.25181040157998685, + "grad_norm": 3.9608373641967773, + "learning_rate": 1.96882094553082e-05, + "loss": 0.0981, + "mean_token_accuracy": 0.9779304265975952, + "num_tokens": 1630872.0, + "step": 765 + }, + { + "entropy": 1.6776898503303528, + "epoch": 0.2534562211981567, + "grad_norm": 5.155510425567627, + "learning_rate": 1.9681430976455363e-05, + "loss": 0.0898, + "mean_token_accuracy": 0.9730437695980072, + "num_tokens": 1641601.0, + "step": 770 + }, + { + "entropy": 1.8331576704978942, + "epoch": 0.25510204081632654, + "grad_norm": 3.792520523071289, + "learning_rate": 1.9674580796407392e-05, + "loss": 0.098, + "mean_token_accuracy": 0.9796571433544159, + "num_tokens": 1652224.0, + "step": 775 + }, + { + "entropy": 1.8529532432556153, + "epoch": 0.2567478604344964, + "grad_norm": 5.989309310913086, + "learning_rate": 1.966765896589708e-05, + "loss": 0.1201, + "mean_token_accuracy": 0.9693608701229095, + "num_tokens": 1662822.0, + "step": 780 + }, + { + "entropy": 1.8614219188690186, + "epoch": 0.2583936800526662, + "grad_norm": 4.173523426055908, + "learning_rate": 1.9660665536187875e-05, + "loss": 0.092, + "mean_token_accuracy": 0.9794945657253266, + "num_tokens": 1673483.0, + "step": 785 + }, + { + "entropy": 1.8263102531433106, + "epoch": 0.26003949967083606, + "grad_norm": 3.7985875606536865, + "learning_rate": 1.965360055907349e-05, + "loss": 0.0998, + "mean_token_accuracy": 0.9791225135326386, + "num_tokens": 1684086.0, + "step": 790 + }, + { + "entropy": 1.8916141629219054, + "epoch": 0.2616853192890059, + "grad_norm": 5.6334052085876465, + "learning_rate": 1.9646464086877524e-05, + "loss": 0.0919, + "mean_token_accuracy": 0.9774609744548798, + "num_tokens": 1694892.0, + "step": 795 + }, + { + "entropy": 1.8735905647277833, + "epoch": 0.2633311389071758, + "grad_norm": 4.0586466789245605, + "learning_rate": 1.963925617245307e-05, + "loss": 0.0778, + "mean_token_accuracy": 0.9765025436878204, + "num_tokens": 1705771.0, + "step": 800 + }, + { + "entropy": 1.8850264430046082, + "epoch": 0.26497695852534564, + "grad_norm": 3.566467523574829, + "learning_rate": 1.963197686918233e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9790683507919311, + "num_tokens": 1716121.0, + "step": 805 + }, + { + "entropy": 1.8341678500175476, + "epoch": 0.2666227781435155, + "grad_norm": 3.038248062133789, + "learning_rate": 1.9624626230976208e-05, + "loss": 0.0674, + "mean_token_accuracy": 0.9794325351715087, + "num_tokens": 1726679.0, + "step": 810 + }, + { + "entropy": 1.8264055967330932, + "epoch": 0.2682685977616853, + "grad_norm": 7.074533462524414, + "learning_rate": 1.961720431227393e-05, + "loss": 0.0827, + "mean_token_accuracy": 0.9796413004398346, + "num_tokens": 1737353.0, + "step": 815 + }, + { + "entropy": 1.8825155735015868, + "epoch": 0.26991441737985516, + "grad_norm": 6.188788890838623, + "learning_rate": 1.9609711168042612e-05, + "loss": 0.0939, + "mean_token_accuracy": 0.9734999477863312, + "num_tokens": 1748061.0, + "step": 820 + }, + { + "entropy": 1.9580488204956055, + "epoch": 0.271560236998025, + "grad_norm": 3.9740562438964844, + "learning_rate": 1.9602146853776894e-05, + "loss": 0.0535, + "mean_token_accuracy": 0.9856287181377411, + "num_tokens": 1758768.0, + "step": 825 + }, + { + "entropy": 1.9614617347717285, + "epoch": 0.27320605661619485, + "grad_norm": 4.990133285522461, + "learning_rate": 1.9594511425498487e-05, + "loss": 0.0551, + "mean_token_accuracy": 0.9855091035366058, + "num_tokens": 1769139.0, + "step": 830 + }, + { + "entropy": 1.7971890330314637, + "epoch": 0.2748518762343647, + "grad_norm": 5.05476713180542, + "learning_rate": 1.958680493975578e-05, + "loss": 0.0687, + "mean_token_accuracy": 0.9821464836597442, + "num_tokens": 1780069.0, + "step": 835 + }, + { + "entropy": 1.7543489336967468, + "epoch": 0.2764976958525346, + "grad_norm": 4.147070407867432, + "learning_rate": 1.957902745362341e-05, + "loss": 0.0743, + "mean_token_accuracy": 0.9777528941631317, + "num_tokens": 1790802.0, + "step": 840 + }, + { + "entropy": 1.6835835099220275, + "epoch": 0.2781435154707044, + "grad_norm": 7.310057640075684, + "learning_rate": 1.957117902470187e-05, + "loss": 0.0913, + "mean_token_accuracy": 0.9787399888038635, + "num_tokens": 1801255.0, + "step": 845 + }, + { + "entropy": 1.7269906282424927, + "epoch": 0.27978933508887427, + "grad_norm": 6.00834321975708, + "learning_rate": 1.956325971111703e-05, + "loss": 0.1044, + "mean_token_accuracy": 0.9713066816329956, + "num_tokens": 1811973.0, + "step": 850 + }, + { + "entropy": 1.7431592702865601, + "epoch": 0.2814351547070441, + "grad_norm": 4.275935173034668, + "learning_rate": 1.955526957151976e-05, + "loss": 0.0821, + "mean_token_accuracy": 0.9789995789527893, + "num_tokens": 1822550.0, + "step": 855 + }, + { + "entropy": 1.7215226411819458, + "epoch": 0.28308097432521395, + "grad_norm": 2.8283443450927734, + "learning_rate": 1.954720866508546e-05, + "loss": 0.0592, + "mean_token_accuracy": 0.9890708029270172, + "num_tokens": 1833405.0, + "step": 860 + }, + { + "entropy": 1.7675551056861878, + "epoch": 0.2847267939433838, + "grad_norm": 4.092453956604004, + "learning_rate": 1.9539077051513624e-05, + "loss": 0.0992, + "mean_token_accuracy": 0.973135906457901, + "num_tokens": 1843944.0, + "step": 865 + }, + { + "entropy": 1.7665972590446473, + "epoch": 0.28637261356155364, + "grad_norm": 3.9631991386413574, + "learning_rate": 1.9530874791027425e-05, + "loss": 0.1248, + "mean_token_accuracy": 0.9706246018409729, + "num_tokens": 1854547.0, + "step": 870 + }, + { + "entropy": 1.7626410603523255, + "epoch": 0.2880184331797235, + "grad_norm": 5.616950988769531, + "learning_rate": 1.952260194437324e-05, + "loss": 0.0911, + "mean_token_accuracy": 0.9797743022441864, + "num_tokens": 1864880.0, + "step": 875 + }, + { + "entropy": 1.7034204244613647, + "epoch": 0.2896642527978934, + "grad_norm": 3.930392265319824, + "learning_rate": 1.9514258572820216e-05, + "loss": 0.0789, + "mean_token_accuracy": 0.9830069661140441, + "num_tokens": 1875669.0, + "step": 880 + }, + { + "entropy": 1.72216739654541, + "epoch": 0.2913100724160632, + "grad_norm": 3.14703631401062, + "learning_rate": 1.9505844738159807e-05, + "loss": 0.0851, + "mean_token_accuracy": 0.9783756732940674, + "num_tokens": 1886494.0, + "step": 885 + }, + { + "entropy": 1.671413540840149, + "epoch": 0.29295589203423306, + "grad_norm": 4.539898872375488, + "learning_rate": 1.949736050270532e-05, + "loss": 0.0766, + "mean_token_accuracy": 0.9854376316070557, + "num_tokens": 1897127.0, + "step": 890 + }, + { + "entropy": 1.6798272371292113, + "epoch": 0.2946017116524029, + "grad_norm": 3.341653347015381, + "learning_rate": 1.948880592929146e-05, + "loss": 0.0631, + "mean_token_accuracy": 0.9869305789470673, + "num_tokens": 1907594.0, + "step": 895 + }, + { + "entropy": 1.6980504512786865, + "epoch": 0.29624753127057274, + "grad_norm": 3.1306231021881104, + "learning_rate": 1.9480181081273846e-05, + "loss": 0.0964, + "mean_token_accuracy": 0.9781007289886474, + "num_tokens": 1918632.0, + "step": 900 + }, + { + "entropy": 1.7191000819206237, + "epoch": 0.2978933508887426, + "grad_norm": 3.709050416946411, + "learning_rate": 1.947148602252858e-05, + "loss": 0.0767, + "mean_token_accuracy": 0.9797729074954986, + "num_tokens": 1929270.0, + "step": 905 + }, + { + "entropy": 1.7052234768867494, + "epoch": 0.2995391705069124, + "grad_norm": 2.6057310104370117, + "learning_rate": 1.946272081745171e-05, + "loss": 0.0611, + "mean_token_accuracy": 0.9820807099342346, + "num_tokens": 1940143.0, + "step": 910 + }, + { + "entropy": 1.7210368633270263, + "epoch": 0.30118499012508226, + "grad_norm": 5.367160320281982, + "learning_rate": 1.9453885530958835e-05, + "loss": 0.0817, + "mean_token_accuracy": 0.9767018914222717, + "num_tokens": 1951059.0, + "step": 915 + }, + { + "entropy": 1.6308549642562866, + "epoch": 0.30283080974325216, + "grad_norm": 5.0577592849731445, + "learning_rate": 1.9444980228484542e-05, + "loss": 0.0641, + "mean_token_accuracy": 0.9800835251808167, + "num_tokens": 1962214.0, + "step": 920 + }, + { + "entropy": 1.6711305975914001, + "epoch": 0.304476629361422, + "grad_norm": 6.653420925140381, + "learning_rate": 1.9436004975981986e-05, + "loss": 0.108, + "mean_token_accuracy": 0.9743327260017395, + "num_tokens": 1972789.0, + "step": 925 + }, + { + "entropy": 1.679636800289154, + "epoch": 0.30612244897959184, + "grad_norm": 4.994918346405029, + "learning_rate": 1.9426959839922367e-05, + "loss": 0.0796, + "mean_token_accuracy": 0.9823659062385559, + "num_tokens": 1983264.0, + "step": 930 + }, + { + "entropy": 1.7234082102775574, + "epoch": 0.3077682685977617, + "grad_norm": 5.689176082611084, + "learning_rate": 1.941784488729444e-05, + "loss": 0.0774, + "mean_token_accuracy": 0.9792181611061096, + "num_tokens": 1993783.0, + "step": 935 + }, + { + "entropy": 1.7042930126190186, + "epoch": 0.3094140882159315, + "grad_norm": 2.4102768898010254, + "learning_rate": 1.9408660185604035e-05, + "loss": 0.0876, + "mean_token_accuracy": 0.9759888887405396, + "num_tokens": 2004295.0, + "step": 940 + }, + { + "entropy": 1.6130717277526856, + "epoch": 0.31105990783410137, + "grad_norm": 2.775099277496338, + "learning_rate": 1.939940580287354e-05, + "loss": 0.0805, + "mean_token_accuracy": 0.9795459330081939, + "num_tokens": 2015224.0, + "step": 945 + }, + { + "entropy": 1.5929541826248168, + "epoch": 0.3127057274522712, + "grad_norm": 3.403689384460449, + "learning_rate": 1.9390081807641413e-05, + "loss": 0.0899, + "mean_token_accuracy": 0.9769740760326385, + "num_tokens": 2025787.0, + "step": 950 + }, + { + "entropy": 1.5571842432022094, + "epoch": 0.3143515470704411, + "grad_norm": 3.0090384483337402, + "learning_rate": 1.938068826896166e-05, + "loss": 0.06, + "mean_token_accuracy": 0.9822336971759796, + "num_tokens": 2036384.0, + "step": 955 + }, + { + "entropy": 1.5760810375213623, + "epoch": 0.31599736668861095, + "grad_norm": 2.722409725189209, + "learning_rate": 1.9371225256403328e-05, + "loss": 0.0888, + "mean_token_accuracy": 0.9776059925556183, + "num_tokens": 2047090.0, + "step": 960 + }, + { + "entropy": 1.6288114428520202, + "epoch": 0.3176431863067808, + "grad_norm": 2.649474620819092, + "learning_rate": 1.9361692840049997e-05, + "loss": 0.066, + "mean_token_accuracy": 0.979485136270523, + "num_tokens": 2057684.0, + "step": 965 + }, + { + "entropy": 1.710892951488495, + "epoch": 0.31928900592495063, + "grad_norm": 2.8936843872070312, + "learning_rate": 1.935209109049925e-05, + "loss": 0.0862, + "mean_token_accuracy": 0.9751926064491272, + "num_tokens": 2068108.0, + "step": 970 + }, + { + "entropy": 1.7829145908355712, + "epoch": 0.32093482554312047, + "grad_norm": 2.203496217727661, + "learning_rate": 1.9342420078862153e-05, + "loss": 0.0902, + "mean_token_accuracy": 0.9730205178260803, + "num_tokens": 2078727.0, + "step": 975 + }, + { + "entropy": 1.6852322578430177, + "epoch": 0.3225806451612903, + "grad_norm": 2.909604072570801, + "learning_rate": 1.933267987676274e-05, + "loss": 0.0754, + "mean_token_accuracy": 0.9842035174369812, + "num_tokens": 2089939.0, + "step": 980 + }, + { + "entropy": 1.73391033411026, + "epoch": 0.32422646477946016, + "grad_norm": 2.8957767486572266, + "learning_rate": 1.9322870556337466e-05, + "loss": 0.0894, + "mean_token_accuracy": 0.9820044219493866, + "num_tokens": 2100675.0, + "step": 985 + }, + { + "entropy": 1.7834203481674193, + "epoch": 0.32587228439763, + "grad_norm": 5.757988929748535, + "learning_rate": 1.931299219023469e-05, + "loss": 0.085, + "mean_token_accuracy": 0.977233600616455, + "num_tokens": 2111220.0, + "step": 990 + }, + { + "entropy": 1.7618568420410157, + "epoch": 0.3275181040157999, + "grad_norm": 5.181485176086426, + "learning_rate": 1.9303044851614106e-05, + "loss": 0.0899, + "mean_token_accuracy": 0.9746578216552735, + "num_tokens": 2121728.0, + "step": 995 + }, + { + "entropy": 1.7923735499382019, + "epoch": 0.32916392363396973, + "grad_norm": 9.63333797454834, + "learning_rate": 1.9293028614146246e-05, + "loss": 0.1188, + "mean_token_accuracy": 0.9688325583934784, + "num_tokens": 2132434.0, + "step": 1000 + }, + { + "entropy": 1.860191786289215, + "epoch": 0.3308097432521396, + "grad_norm": 4.145689487457275, + "learning_rate": 1.9282943552011892e-05, + "loss": 0.0798, + "mean_token_accuracy": 0.9769950270652771, + "num_tokens": 2142917.0, + "step": 1005 + }, + { + "entropy": 1.833854353427887, + "epoch": 0.3324555628703094, + "grad_norm": 3.677757501602173, + "learning_rate": 1.927278973990156e-05, + "loss": 0.0828, + "mean_token_accuracy": 0.9798845946788788, + "num_tokens": 2153639.0, + "step": 1010 + }, + { + "entropy": 1.8186659574508668, + "epoch": 0.33410138248847926, + "grad_norm": 4.619242191314697, + "learning_rate": 1.9262567253014922e-05, + "loss": 0.0607, + "mean_token_accuracy": 0.9790556609630585, + "num_tokens": 2164673.0, + "step": 1015 + }, + { + "entropy": 1.8815788626670837, + "epoch": 0.3357472021066491, + "grad_norm": 6.019654273986816, + "learning_rate": 1.925227616706026e-05, + "loss": 0.0804, + "mean_token_accuracy": 0.9770128130912781, + "num_tokens": 2175078.0, + "step": 1020 + }, + { + "entropy": 1.8578821659088134, + "epoch": 0.33739302172481894, + "grad_norm": 4.046188831329346, + "learning_rate": 1.924191655825391e-05, + "loss": 0.0688, + "mean_token_accuracy": 0.979873389005661, + "num_tokens": 2185679.0, + "step": 1025 + }, + { + "entropy": 1.9458543419837953, + "epoch": 0.3390388413429888, + "grad_norm": 8.663795471191406, + "learning_rate": 1.9231488503319687e-05, + "loss": 0.1165, + "mean_token_accuracy": 0.9754189252853394, + "num_tokens": 2196306.0, + "step": 1030 + }, + { + "entropy": 1.9784336686134338, + "epoch": 0.3406846609611587, + "grad_norm": 4.623157501220703, + "learning_rate": 1.9220992079488322e-05, + "loss": 0.0863, + "mean_token_accuracy": 0.9824258208274841, + "num_tokens": 2207144.0, + "step": 1035 + }, + { + "entropy": 2.0585790038108827, + "epoch": 0.3423304805793285, + "grad_norm": 5.089199066162109, + "learning_rate": 1.9210427364496894e-05, + "loss": 0.0813, + "mean_token_accuracy": 0.9741666853427887, + "num_tokens": 2217699.0, + "step": 1040 + }, + { + "entropy": 1.9863484263420106, + "epoch": 0.34397630019749836, + "grad_norm": 2.4876182079315186, + "learning_rate": 1.9199794436588244e-05, + "loss": 0.0936, + "mean_token_accuracy": 0.9748422205448151, + "num_tokens": 2228579.0, + "step": 1045 + }, + { + "entropy": 2.034103310108185, + "epoch": 0.3456221198156682, + "grad_norm": 3.3714184761047363, + "learning_rate": 1.9189093374510403e-05, + "loss": 0.1052, + "mean_token_accuracy": 0.9673223495483398, + "num_tokens": 2239194.0, + "step": 1050 + }, + { + "entropy": 1.954673457145691, + "epoch": 0.34726793943383805, + "grad_norm": 3.50931453704834, + "learning_rate": 1.917832425751601e-05, + "loss": 0.0578, + "mean_token_accuracy": 0.9865812182426452, + "num_tokens": 2249697.0, + "step": 1055 + }, + { + "entropy": 1.8305863380432128, + "epoch": 0.3489137590520079, + "grad_norm": 5.005392074584961, + "learning_rate": 1.9167487165361726e-05, + "loss": 0.0775, + "mean_token_accuracy": 0.9806217133998871, + "num_tokens": 2260788.0, + "step": 1060 + }, + { + "entropy": 1.7882127285003662, + "epoch": 0.35055957867017773, + "grad_norm": 1.8662757873535156, + "learning_rate": 1.9156582178307625e-05, + "loss": 0.1092, + "mean_token_accuracy": 0.9730004489421844, + "num_tokens": 2271354.0, + "step": 1065 + }, + { + "entropy": 1.7463540315628052, + "epoch": 0.35220539828834757, + "grad_norm": 7.321222305297852, + "learning_rate": 1.9145609377116635e-05, + "loss": 0.0659, + "mean_token_accuracy": 0.9823782205581665, + "num_tokens": 2282285.0, + "step": 1070 + }, + { + "entropy": 1.7453363537788391, + "epoch": 0.35385121790651747, + "grad_norm": 3.2284722328186035, + "learning_rate": 1.9134568843053895e-05, + "loss": 0.0813, + "mean_token_accuracy": 0.9803031504154205, + "num_tokens": 2293037.0, + "step": 1075 + }, + { + "entropy": 1.76043781042099, + "epoch": 0.3554970375246873, + "grad_norm": 2.238819122314453, + "learning_rate": 1.91234606578862e-05, + "loss": 0.0761, + "mean_token_accuracy": 0.9841830551624298, + "num_tokens": 2303850.0, + "step": 1080 + }, + { + "entropy": 1.7409037947654724, + "epoch": 0.35714285714285715, + "grad_norm": 7.227831840515137, + "learning_rate": 1.911228490388136e-05, + "loss": 0.0743, + "mean_token_accuracy": 0.9812696516513825, + "num_tokens": 2314805.0, + "step": 1085 + }, + { + "entropy": 1.8239703059196473, + "epoch": 0.358788676761027, + "grad_norm": 5.955824851989746, + "learning_rate": 1.9101041663807606e-05, + "loss": 0.0915, + "mean_token_accuracy": 0.976921284198761, + "num_tokens": 2325296.0, + "step": 1090 + }, + { + "entropy": 1.8835476994514466, + "epoch": 0.36043449637919683, + "grad_norm": 2.8221843242645264, + "learning_rate": 1.9089731020932972e-05, + "loss": 0.0631, + "mean_token_accuracy": 0.9831080317497254, + "num_tokens": 2336116.0, + "step": 1095 + }, + { + "entropy": 1.9333994030952453, + "epoch": 0.3620803159973667, + "grad_norm": 6.433844089508057, + "learning_rate": 1.907835305902469e-05, + "loss": 0.0998, + "mean_token_accuracy": 0.9790532469749451, + "num_tokens": 2346756.0, + "step": 1100 + }, + { + "entropy": 2.0366763949394224, + "epoch": 0.3637261356155365, + "grad_norm": 13.404642105102539, + "learning_rate": 1.906690786234855e-05, + "loss": 0.0569, + "mean_token_accuracy": 0.9880981385707855, + "num_tokens": 2357587.0, + "step": 1105 + }, + { + "entropy": 2.0615396976470945, + "epoch": 0.36537195523370636, + "grad_norm": 4.546095371246338, + "learning_rate": 1.9055395515668288e-05, + "loss": 0.0903, + "mean_token_accuracy": 0.9765397369861603, + "num_tokens": 2368379.0, + "step": 1110 + }, + { + "entropy": 2.036512088775635, + "epoch": 0.36701777485187626, + "grad_norm": 4.192592144012451, + "learning_rate": 1.9043816104244964e-05, + "loss": 0.0779, + "mean_token_accuracy": 0.9820389151573181, + "num_tokens": 2379490.0, + "step": 1115 + }, + { + "entropy": 2.113160729408264, + "epoch": 0.3686635944700461, + "grad_norm": 4.126070976257324, + "learning_rate": 1.9032169713836314e-05, + "loss": 0.0884, + "mean_token_accuracy": 0.9770908057689667, + "num_tokens": 2390060.0, + "step": 1120 + }, + { + "entropy": 2.1656644344329834, + "epoch": 0.37030941408821594, + "grad_norm": 2.7143678665161133, + "learning_rate": 1.9020456430696126e-05, + "loss": 0.0493, + "mean_token_accuracy": 0.9862718224525452, + "num_tokens": 2400221.0, + "step": 1125 + }, + { + "entropy": 2.1137581944465635, + "epoch": 0.3719552337063858, + "grad_norm": 4.085227012634277, + "learning_rate": 1.9008676341573606e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.988205760717392, + "num_tokens": 2410885.0, + "step": 1130 + }, + { + "entropy": 2.044228506088257, + "epoch": 0.3736010533245556, + "grad_norm": 5.65952205657959, + "learning_rate": 1.8996829533712723e-05, + "loss": 0.0629, + "mean_token_accuracy": 0.984690648317337, + "num_tokens": 2421494.0, + "step": 1135 + }, + { + "entropy": 2.0729115128517153, + "epoch": 0.37524687294272546, + "grad_norm": 3.8629510402679443, + "learning_rate": 1.898491609485156e-05, + "loss": 0.0862, + "mean_token_accuracy": 0.9760863423347473, + "num_tokens": 2432022.0, + "step": 1140 + }, + { + "entropy": 2.1294867634773254, + "epoch": 0.3768926925608953, + "grad_norm": 4.639437198638916, + "learning_rate": 1.8972936113221696e-05, + "loss": 0.0759, + "mean_token_accuracy": 0.9800122320652008, + "num_tokens": 2442748.0, + "step": 1145 + }, + { + "entropy": 2.141499698162079, + "epoch": 0.3785385121790652, + "grad_norm": 3.745588541030884, + "learning_rate": 1.8960889677547506e-05, + "loss": 0.0843, + "mean_token_accuracy": 0.9796210825443268, + "num_tokens": 2453552.0, + "step": 1150 + }, + { + "entropy": 2.2506140947341917, + "epoch": 0.38018433179723504, + "grad_norm": 1.9200206995010376, + "learning_rate": 1.8948776877045535e-05, + "loss": 0.0818, + "mean_token_accuracy": 0.9776529312133789, + "num_tokens": 2464201.0, + "step": 1155 + }, + { + "entropy": 2.2783297538757323, + "epoch": 0.3818301514154049, + "grad_norm": 3.2144906520843506, + "learning_rate": 1.893659780142384e-05, + "loss": 0.0748, + "mean_token_accuracy": 0.9799034297466278, + "num_tokens": 2475109.0, + "step": 1160 + }, + { + "entropy": 2.235105013847351, + "epoch": 0.3834759710335747, + "grad_norm": 11.074976921081543, + "learning_rate": 1.8924352540881298e-05, + "loss": 0.0759, + "mean_token_accuracy": 0.9808390080928803, + "num_tokens": 2485624.0, + "step": 1165 + }, + { + "entropy": 2.216382932662964, + "epoch": 0.38512179065174457, + "grad_norm": 5.54982328414917, + "learning_rate": 1.891204118610696e-05, + "loss": 0.1232, + "mean_token_accuracy": 0.9734435498714447, + "num_tokens": 2496301.0, + "step": 1170 + }, + { + "entropy": 2.362920618057251, + "epoch": 0.3867676102699144, + "grad_norm": 4.762061595916748, + "learning_rate": 1.8899663828279387e-05, + "loss": 0.0906, + "mean_token_accuracy": 0.9737257599830628, + "num_tokens": 2506985.0, + "step": 1175 + }, + { + "entropy": 2.3827850580215455, + "epoch": 0.38841342988808425, + "grad_norm": 4.1375732421875, + "learning_rate": 1.8887220559065946e-05, + "loss": 0.0868, + "mean_token_accuracy": 0.9808934390544891, + "num_tokens": 2517845.0, + "step": 1180 + }, + { + "entropy": 2.4273977756500242, + "epoch": 0.3900592495062541, + "grad_norm": 3.6323893070220947, + "learning_rate": 1.8874711470622152e-05, + "loss": 0.0928, + "mean_token_accuracy": 0.9732855081558227, + "num_tokens": 2528576.0, + "step": 1185 + }, + { + "entropy": 2.3609946250915526, + "epoch": 0.391705069124424, + "grad_norm": 2.174260377883911, + "learning_rate": 1.886213665559099e-05, + "loss": 0.0884, + "mean_token_accuracy": 0.9792202174663543, + "num_tokens": 2539169.0, + "step": 1190 + }, + { + "entropy": 2.293043828010559, + "epoch": 0.39335088874259383, + "grad_norm": 4.491889953613281, + "learning_rate": 1.8849496207102204e-05, + "loss": 0.0859, + "mean_token_accuracy": 0.9761190593242646, + "num_tokens": 2549674.0, + "step": 1195 + }, + { + "entropy": 2.2694995403289795, + "epoch": 0.39499670836076367, + "grad_norm": 5.4928812980651855, + "learning_rate": 1.8836790218771637e-05, + "loss": 0.0828, + "mean_token_accuracy": 0.9821516633033752, + "num_tokens": 2560364.0, + "step": 1200 + }, + { + "entropy": 2.3214454650878906, + "epoch": 0.3966425279789335, + "grad_norm": 3.3526692390441895, + "learning_rate": 1.882401878470052e-05, + "loss": 0.0621, + "mean_token_accuracy": 0.9838208556175232, + "num_tokens": 2570839.0, + "step": 1205 + }, + { + "entropy": 2.4305625438690184, + "epoch": 0.39828834759710335, + "grad_norm": 2.5236058235168457, + "learning_rate": 1.8811181999474763e-05, + "loss": 0.0772, + "mean_token_accuracy": 0.9798738360404968, + "num_tokens": 2581459.0, + "step": 1210 + }, + { + "entropy": 2.4986673831939696, + "epoch": 0.3999341672152732, + "grad_norm": 5.663600921630859, + "learning_rate": 1.8798279958164295e-05, + "loss": 0.0663, + "mean_token_accuracy": 0.9851954817771912, + "num_tokens": 2591722.0, + "step": 1215 + }, + { + "epoch": 0.40026333113890716, + "eval_entropy": 2.455367434179405, + "eval_loss": 0.07768969982862473, + "eval_mean_token_accuracy": 0.9805304530889912, + "eval_num_tokens": 2593845.0, + "eval_runtime": 196.539, + "eval_samples_per_second": 42.368, + "eval_steps_per_second": 7.062, + "step": 1216 + }, + { + "entropy": 2.4370908737182617, + "epoch": 0.40157998683344304, + "grad_norm": 3.0529227256774902, + "learning_rate": 1.878531275632232e-05, + "loss": 0.1109, + "mean_token_accuracy": 0.9730750441551208, + "num_tokens": 2602413.0, + "step": 1220 + }, + { + "entropy": 2.4589828968048097, + "epoch": 0.4032258064516129, + "grad_norm": 7.966182231903076, + "learning_rate": 1.8772280489984628e-05, + "loss": 0.0739, + "mean_token_accuracy": 0.9801295638084412, + "num_tokens": 2613250.0, + "step": 1225 + }, + { + "entropy": 2.4703043937683105, + "epoch": 0.4048716260697828, + "grad_norm": 4.381008148193359, + "learning_rate": 1.875918325566888e-05, + "loss": 0.0812, + "mean_token_accuracy": 0.9783048331737518, + "num_tokens": 2624053.0, + "step": 1230 + }, + { + "entropy": 2.4899261713027956, + "epoch": 0.4065174456879526, + "grad_norm": 4.341674327850342, + "learning_rate": 1.8746021150373892e-05, + "loss": 0.0768, + "mean_token_accuracy": 0.9822542846202851, + "num_tokens": 2634584.0, + "step": 1235 + }, + { + "entropy": 2.5025168657302856, + "epoch": 0.40816326530612246, + "grad_norm": 4.821120262145996, + "learning_rate": 1.873279427157892e-05, + "loss": 0.0649, + "mean_token_accuracy": 0.9847510397434235, + "num_tokens": 2645173.0, + "step": 1240 + }, + { + "entropy": 2.5438794136047362, + "epoch": 0.4098090849242923, + "grad_norm": 4.269495010375977, + "learning_rate": 1.8719502717242937e-05, + "loss": 0.0801, + "mean_token_accuracy": 0.9804035782814026, + "num_tokens": 2655404.0, + "step": 1245 + }, + { + "entropy": 2.467151665687561, + "epoch": 0.41145490454246214, + "grad_norm": 2.994798421859741, + "learning_rate": 1.8706146585803903e-05, + "loss": 0.0676, + "mean_token_accuracy": 0.9815965473651886, + "num_tokens": 2666063.0, + "step": 1250 + }, + { + "entropy": 2.3562156200408935, + "epoch": 0.413100724160632, + "grad_norm": 4.536716461181641, + "learning_rate": 1.8692725976178038e-05, + "loss": 0.0718, + "mean_token_accuracy": 0.9820040583610534, + "num_tokens": 2676929.0, + "step": 1255 + }, + { + "entropy": 2.3580038785934447, + "epoch": 0.4147465437788018, + "grad_norm": 3.2308404445648193, + "learning_rate": 1.8679240987759098e-05, + "loss": 0.0654, + "mean_token_accuracy": 0.9846524596214294, + "num_tokens": 2687554.0, + "step": 1260 + }, + { + "entropy": 2.24311740398407, + "epoch": 0.41639236339697167, + "grad_norm": 3.5214755535125732, + "learning_rate": 1.8665691720417624e-05, + "loss": 0.0644, + "mean_token_accuracy": 0.9793017745018006, + "num_tokens": 2698138.0, + "step": 1265 + }, + { + "entropy": 2.195026898384094, + "epoch": 0.41803818301514156, + "grad_norm": 2.515350818634033, + "learning_rate": 1.865207827450022e-05, + "loss": 0.0624, + "mean_token_accuracy": 0.9875331342220306, + "num_tokens": 2708644.0, + "step": 1270 + }, + { + "entropy": 2.1811726808547975, + "epoch": 0.4196840026333114, + "grad_norm": 3.122403144836426, + "learning_rate": 1.8638400750828793e-05, + "loss": 0.0525, + "mean_token_accuracy": 0.9861293792724609, + "num_tokens": 2719229.0, + "step": 1275 + }, + { + "entropy": 2.1384164810180666, + "epoch": 0.42132982225148125, + "grad_norm": 3.760925531387329, + "learning_rate": 1.8624659250699807e-05, + "loss": 0.1054, + "mean_token_accuracy": 0.973668885231018, + "num_tokens": 2729859.0, + "step": 1280 + }, + { + "entropy": 2.136362624168396, + "epoch": 0.4229756418696511, + "grad_norm": 3.5055840015411377, + "learning_rate": 1.8610853875883553e-05, + "loss": 0.0707, + "mean_token_accuracy": 0.980605673789978, + "num_tokens": 2740393.0, + "step": 1285 + }, + { + "entropy": 2.1360629320144655, + "epoch": 0.42462146148782093, + "grad_norm": 5.643499851226807, + "learning_rate": 1.8596984728623374e-05, + "loss": 0.0891, + "mean_token_accuracy": 0.9784270226955414, + "num_tokens": 2750842.0, + "step": 1290 + }, + { + "entropy": 2.1816317081451415, + "epoch": 0.42626728110599077, + "grad_norm": 2.8327138423919678, + "learning_rate": 1.858305191163491e-05, + "loss": 0.0915, + "mean_token_accuracy": 0.9775237262248992, + "num_tokens": 2761249.0, + "step": 1295 + }, + { + "entropy": 2.2412596225738524, + "epoch": 0.4279131007241606, + "grad_norm": 1.0889869928359985, + "learning_rate": 1.8569055528105356e-05, + "loss": 0.0952, + "mean_token_accuracy": 0.9791838228702545, + "num_tokens": 2771715.0, + "step": 1300 + }, + { + "entropy": 2.200755214691162, + "epoch": 0.4295589203423305, + "grad_norm": 2.3900723457336426, + "learning_rate": 1.855499568169267e-05, + "loss": 0.0761, + "mean_token_accuracy": 0.9767322957515716, + "num_tokens": 2782449.0, + "step": 1305 + }, + { + "entropy": 2.1307253360748293, + "epoch": 0.43120473996050035, + "grad_norm": 3.4063174724578857, + "learning_rate": 1.854087247652483e-05, + "loss": 0.0675, + "mean_token_accuracy": 0.9809297919273376, + "num_tokens": 2792843.0, + "step": 1310 + }, + { + "entropy": 2.0838756799697875, + "epoch": 0.4328505595786702, + "grad_norm": 2.3350863456726074, + "learning_rate": 1.8526686017199046e-05, + "loss": 0.0859, + "mean_token_accuracy": 0.9777861475944519, + "num_tokens": 2803434.0, + "step": 1315 + }, + { + "entropy": 2.061713254451752, + "epoch": 0.43449637919684003, + "grad_norm": 3.2840187549591064, + "learning_rate": 1.8512436408780995e-05, + "loss": 0.0994, + "mean_token_accuracy": 0.979932302236557, + "num_tokens": 2814246.0, + "step": 1320 + }, + { + "entropy": 2.1339207768440245, + "epoch": 0.4361421988150099, + "grad_norm": 3.665444850921631, + "learning_rate": 1.8498123756804038e-05, + "loss": 0.0704, + "mean_token_accuracy": 0.9794367909431457, + "num_tokens": 2825017.0, + "step": 1325 + }, + { + "entropy": 2.1423274278640747, + "epoch": 0.4377880184331797, + "grad_norm": 4.194466590881348, + "learning_rate": 1.848374816726844e-05, + "loss": 0.0636, + "mean_token_accuracy": 0.9790065705776214, + "num_tokens": 2835765.0, + "step": 1330 + }, + { + "entropy": 2.220040726661682, + "epoch": 0.43943383805134956, + "grad_norm": 2.7448790073394775, + "learning_rate": 1.8469309746640587e-05, + "loss": 0.0651, + "mean_token_accuracy": 0.9824341356754303, + "num_tokens": 2846294.0, + "step": 1335 + }, + { + "entropy": 2.1996994495391844, + "epoch": 0.4410796576695194, + "grad_norm": 3.1929495334625244, + "learning_rate": 1.845480860185219e-05, + "loss": 0.0431, + "mean_token_accuracy": 0.9881280601024628, + "num_tokens": 2856892.0, + "step": 1340 + }, + { + "entropy": 2.0707924485206606, + "epoch": 0.4427254772876893, + "grad_norm": 7.250471115112305, + "learning_rate": 1.8440244840299507e-05, + "loss": 0.0697, + "mean_token_accuracy": 0.983168751001358, + "num_tokens": 2867704.0, + "step": 1345 + }, + { + "entropy": 1.9964185953140259, + "epoch": 0.44437129690585914, + "grad_norm": 5.546008586883545, + "learning_rate": 1.8425618569842528e-05, + "loss": 0.0749, + "mean_token_accuracy": 0.9826294600963592, + "num_tokens": 2878605.0, + "step": 1350 + }, + { + "entropy": 2.0520959854125977, + "epoch": 0.446017116524029, + "grad_norm": 2.6965854167938232, + "learning_rate": 1.8410929898804197e-05, + "loss": 0.0748, + "mean_token_accuracy": 0.983626252412796, + "num_tokens": 2889180.0, + "step": 1355 + }, + { + "entropy": 2.033618211746216, + "epoch": 0.4476629361421988, + "grad_norm": 6.2912116050720215, + "learning_rate": 1.83961789359696e-05, + "loss": 0.0658, + "mean_token_accuracy": 0.9775195240974426, + "num_tokens": 2899792.0, + "step": 1360 + }, + { + "entropy": 2.0571701526641846, + "epoch": 0.44930875576036866, + "grad_norm": 5.728684425354004, + "learning_rate": 1.838136579058515e-05, + "loss": 0.1018, + "mean_token_accuracy": 0.9756032049655914, + "num_tokens": 2910450.0, + "step": 1365 + }, + { + "entropy": 2.135818696022034, + "epoch": 0.4509545753785385, + "grad_norm": 2.3949079513549805, + "learning_rate": 1.8366490572357798e-05, + "loss": 0.0688, + "mean_token_accuracy": 0.9789416313171386, + "num_tokens": 2921065.0, + "step": 1370 + }, + { + "entropy": 2.213190221786499, + "epoch": 0.45260039499670834, + "grad_norm": 3.1379809379577637, + "learning_rate": 1.8351553391454203e-05, + "loss": 0.0926, + "mean_token_accuracy": 0.9748626470565795, + "num_tokens": 2931783.0, + "step": 1375 + }, + { + "entropy": 2.259020519256592, + "epoch": 0.4542462146148782, + "grad_norm": 6.139921188354492, + "learning_rate": 1.8336554358499923e-05, + "loss": 0.0883, + "mean_token_accuracy": 0.9798992872238159, + "num_tokens": 2942729.0, + "step": 1380 + }, + { + "entropy": 2.2399872303009034, + "epoch": 0.4558920342330481, + "grad_norm": 4.462457180023193, + "learning_rate": 1.83214935845786e-05, + "loss": 0.0942, + "mean_token_accuracy": 0.9766364514827728, + "num_tokens": 2953444.0, + "step": 1385 + }, + { + "entropy": 2.3818634748458862, + "epoch": 0.4575378538512179, + "grad_norm": 2.3201687335968018, + "learning_rate": 1.830637118123113e-05, + "loss": 0.06, + "mean_token_accuracy": 0.987183290719986, + "num_tokens": 2963968.0, + "step": 1390 + }, + { + "entropy": 2.319228458404541, + "epoch": 0.45918367346938777, + "grad_norm": 1.5512436628341675, + "learning_rate": 1.8291187260454842e-05, + "loss": 0.0429, + "mean_token_accuracy": 0.9886379659175872, + "num_tokens": 2974544.0, + "step": 1395 + }, + { + "entropy": 2.212476873397827, + "epoch": 0.4608294930875576, + "grad_norm": 3.864231586456299, + "learning_rate": 1.827594193470266e-05, + "loss": 0.0719, + "mean_token_accuracy": 0.9817937731742858, + "num_tokens": 2985035.0, + "step": 1400 + }, + { + "entropy": 2.160155749320984, + "epoch": 0.46247531270572745, + "grad_norm": 4.27418851852417, + "learning_rate": 1.8260635316882288e-05, + "loss": 0.0752, + "mean_token_accuracy": 0.9822825253009796, + "num_tokens": 2995814.0, + "step": 1405 + }, + { + "entropy": 2.1970561265945436, + "epoch": 0.4641211323238973, + "grad_norm": 7.332748889923096, + "learning_rate": 1.8245267520355348e-05, + "loss": 0.0787, + "mean_token_accuracy": 0.9827774345874787, + "num_tokens": 3006260.0, + "step": 1410 + }, + { + "entropy": 2.339077877998352, + "epoch": 0.46576695194206713, + "grad_norm": 1.8340346813201904, + "learning_rate": 1.8229838658936566e-05, + "loss": 0.0763, + "mean_token_accuracy": 0.980904471874237, + "num_tokens": 3016932.0, + "step": 1415 + }, + { + "entropy": 2.3882060766220095, + "epoch": 0.467412771560237, + "grad_norm": 2.010164737701416, + "learning_rate": 1.8214348846892913e-05, + "loss": 0.0488, + "mean_token_accuracy": 0.9874413549900055, + "num_tokens": 3027677.0, + "step": 1420 + }, + { + "entropy": 2.291885328292847, + "epoch": 0.46905859117840687, + "grad_norm": 2.1564724445343018, + "learning_rate": 1.8198798198942768e-05, + "loss": 0.0702, + "mean_token_accuracy": 0.9820050477981568, + "num_tokens": 3038413.0, + "step": 1425 + }, + { + "entropy": 2.285910177230835, + "epoch": 0.4707044107965767, + "grad_norm": 3.993086576461792, + "learning_rate": 1.8183186830255058e-05, + "loss": 0.0707, + "mean_token_accuracy": 0.9839109480381012, + "num_tokens": 3049009.0, + "step": 1430 + }, + { + "entropy": 2.243725609779358, + "epoch": 0.47235023041474655, + "grad_norm": 3.17380952835083, + "learning_rate": 1.8167514856448413e-05, + "loss": 0.0509, + "mean_token_accuracy": 0.9843453109264374, + "num_tokens": 3059423.0, + "step": 1435 + }, + { + "entropy": 2.16868679523468, + "epoch": 0.4739960500329164, + "grad_norm": 2.891127824783325, + "learning_rate": 1.815178239359031e-05, + "loss": 0.0586, + "mean_token_accuracy": 0.9830320298671722, + "num_tokens": 3070009.0, + "step": 1440 + }, + { + "entropy": 2.055533969402313, + "epoch": 0.47564186965108624, + "grad_norm": 4.964085578918457, + "learning_rate": 1.8135989558196207e-05, + "loss": 0.039, + "mean_token_accuracy": 0.9861751973628998, + "num_tokens": 3080665.0, + "step": 1445 + }, + { + "entropy": 2.027339553833008, + "epoch": 0.4772876892692561, + "grad_norm": 7.009729862213135, + "learning_rate": 1.812013646722869e-05, + "loss": 0.096, + "mean_token_accuracy": 0.9795560419559479, + "num_tokens": 3090979.0, + "step": 1450 + }, + { + "entropy": 2.1618866443634035, + "epoch": 0.4789335088874259, + "grad_norm": 3.8036680221557617, + "learning_rate": 1.8104223238096596e-05, + "loss": 0.0902, + "mean_token_accuracy": 0.9777205526828766, + "num_tokens": 3101297.0, + "step": 1455 + }, + { + "entropy": 2.24229896068573, + "epoch": 0.48057932850559576, + "grad_norm": 2.5701446533203125, + "learning_rate": 1.808824998865415e-05, + "loss": 0.0397, + "mean_token_accuracy": 0.9876787722110748, + "num_tokens": 3111894.0, + "step": 1460 + }, + { + "entropy": 2.251754140853882, + "epoch": 0.48222514812376566, + "grad_norm": 5.5385847091674805, + "learning_rate": 1.8072216837200094e-05, + "loss": 0.0769, + "mean_token_accuracy": 0.9802028417587281, + "num_tokens": 3122674.0, + "step": 1465 + }, + { + "entropy": 2.185230827331543, + "epoch": 0.4838709677419355, + "grad_norm": 4.962352752685547, + "learning_rate": 1.80561239024768e-05, + "loss": 0.0529, + "mean_token_accuracy": 0.9858423829078674, + "num_tokens": 3133175.0, + "step": 1470 + }, + { + "entropy": 2.1357677221298217, + "epoch": 0.48551678736010534, + "grad_norm": 6.6793389320373535, + "learning_rate": 1.8039971303669407e-05, + "loss": 0.0698, + "mean_token_accuracy": 0.983429902791977, + "num_tokens": 3143713.0, + "step": 1475 + }, + { + "entropy": 2.1006932973861696, + "epoch": 0.4871626069782752, + "grad_norm": 1.9220439195632935, + "learning_rate": 1.8023759160404923e-05, + "loss": 0.0704, + "mean_token_accuracy": 0.9849179744720459, + "num_tokens": 3154604.0, + "step": 1480 + }, + { + "entropy": 2.1457018852233887, + "epoch": 0.488808426596445, + "grad_norm": 2.7263882160186768, + "learning_rate": 1.8007487592751343e-05, + "loss": 0.0578, + "mean_token_accuracy": 0.9825065851211547, + "num_tokens": 3165594.0, + "step": 1485 + }, + { + "entropy": 2.2146376371383667, + "epoch": 0.49045424621461486, + "grad_norm": 4.0069804191589355, + "learning_rate": 1.799115672121677e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.9832662463188171, + "num_tokens": 3176301.0, + "step": 1490 + }, + { + "entropy": 2.2213491678237913, + "epoch": 0.4921000658327847, + "grad_norm": 2.8298332691192627, + "learning_rate": 1.7974766666748516e-05, + "loss": 0.0803, + "mean_token_accuracy": 0.9778595089912414, + "num_tokens": 3186898.0, + "step": 1495 + }, + { + "entropy": 2.203892719745636, + "epoch": 0.4937458854509546, + "grad_norm": 2.422091484069824, + "learning_rate": 1.7958317550732193e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.9813643217086792, + "num_tokens": 3197716.0, + "step": 1500 + }, + { + "entropy": 2.1539814710617065, + "epoch": 0.49539170506912444, + "grad_norm": 1.6230992078781128, + "learning_rate": 1.7941809494990838e-05, + "loss": 0.0702, + "mean_token_accuracy": 0.9812954783439636, + "num_tokens": 3208681.0, + "step": 1505 + }, + { + "entropy": 2.0968565344810486, + "epoch": 0.4970375246872943, + "grad_norm": 4.276401042938232, + "learning_rate": 1.792524262178399e-05, + "loss": 0.0957, + "mean_token_accuracy": 0.9712316334247589, + "num_tokens": 3219313.0, + "step": 1510 + }, + { + "entropy": 1.9671455860137939, + "epoch": 0.4986833443054641, + "grad_norm": 2.3620660305023193, + "learning_rate": 1.7908617053806802e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9848778009414673, + "num_tokens": 3229908.0, + "step": 1515 + }, + { + "entropy": 1.9334992289543151, + "epoch": 0.500329163923634, + "grad_norm": 2.5839757919311523, + "learning_rate": 1.7891932914189112e-05, + "loss": 0.0542, + "mean_token_accuracy": 0.9889938116073609, + "num_tokens": 3240444.0, + "step": 1520 + }, + { + "entropy": 1.9328988552093507, + "epoch": 0.5019749835418038, + "grad_norm": 2.2053468227386475, + "learning_rate": 1.7875190326494552e-05, + "loss": 0.0706, + "mean_token_accuracy": 0.9844483017921448, + "num_tokens": 3251129.0, + "step": 1525 + }, + { + "entropy": 1.900172483921051, + "epoch": 0.5036208031599737, + "grad_norm": 3.773327112197876, + "learning_rate": 1.7858389414719628e-05, + "loss": 0.0567, + "mean_token_accuracy": 0.9843137383460998, + "num_tokens": 3261867.0, + "step": 1530 + }, + { + "entropy": 1.9451073169708253, + "epoch": 0.5052666227781435, + "grad_norm": 3.007178544998169, + "learning_rate": 1.7841530303292782e-05, + "loss": 0.0478, + "mean_token_accuracy": 0.9842455625534058, + "num_tokens": 3272277.0, + "step": 1535 + }, + { + "entropy": 1.9109816670417785, + "epoch": 0.5069124423963134, + "grad_norm": 6.694243431091309, + "learning_rate": 1.78246131170735e-05, + "loss": 0.0934, + "mean_token_accuracy": 0.9757687628269196, + "num_tokens": 3282740.0, + "step": 1540 + }, + { + "entropy": 1.945954716205597, + "epoch": 0.5085582620144832, + "grad_norm": 4.0865936279296875, + "learning_rate": 1.780763798135136e-05, + "loss": 0.055, + "mean_token_accuracy": 0.9839641332626343, + "num_tokens": 3293583.0, + "step": 1545 + }, + { + "entropy": 1.8939694523811341, + "epoch": 0.5102040816326531, + "grad_norm": 5.349853038787842, + "learning_rate": 1.779060502184513e-05, + "loss": 0.093, + "mean_token_accuracy": 0.9804658055305481, + "num_tokens": 3304579.0, + "step": 1550 + }, + { + "entropy": 1.9622906923294068, + "epoch": 0.5118499012508229, + "grad_norm": 4.101441860198975, + "learning_rate": 1.777351436470182e-05, + "loss": 0.0978, + "mean_token_accuracy": 0.9735360503196716, + "num_tokens": 3315316.0, + "step": 1555 + }, + { + "entropy": 2.1062827825546266, + "epoch": 0.5134957208689928, + "grad_norm": 2.2984564304351807, + "learning_rate": 1.775636613649574e-05, + "loss": 0.0659, + "mean_token_accuracy": 0.9824669599533081, + "num_tokens": 3325816.0, + "step": 1560 + }, + { + "entropy": 2.0575469732284546, + "epoch": 0.5151415404871627, + "grad_norm": 3.7546675205230713, + "learning_rate": 1.7739160464227593e-05, + "loss": 0.0902, + "mean_token_accuracy": 0.9760908842086792, + "num_tokens": 3336506.0, + "step": 1565 + }, + { + "entropy": 1.938406229019165, + "epoch": 0.5167873601053324, + "grad_norm": 2.3365416526794434, + "learning_rate": 1.7721897475323508e-05, + "loss": 0.084, + "mean_token_accuracy": 0.9817908525466919, + "num_tokens": 3346922.0, + "step": 1570 + }, + { + "entropy": 1.9214160919189454, + "epoch": 0.5184331797235023, + "grad_norm": 3.7224996089935303, + "learning_rate": 1.7704577297634096e-05, + "loss": 0.0595, + "mean_token_accuracy": 0.9839179992675782, + "num_tokens": 3357419.0, + "step": 1575 + }, + { + "entropy": 1.8380493998527527, + "epoch": 0.5200789993416721, + "grad_norm": 4.267509937286377, + "learning_rate": 1.768720005943353e-05, + "loss": 0.087, + "mean_token_accuracy": 0.9799594342708587, + "num_tokens": 3368205.0, + "step": 1580 + }, + { + "entropy": 1.8690474152565002, + "epoch": 0.521724818959842, + "grad_norm": 2.147115468978882, + "learning_rate": 1.7669765889418553e-05, + "loss": 0.1006, + "mean_token_accuracy": 0.9794728994369507, + "num_tokens": 3378592.0, + "step": 1585 + }, + { + "entropy": 1.880600130558014, + "epoch": 0.5233706385780118, + "grad_norm": 3.2792811393737793, + "learning_rate": 1.7652274916707566e-05, + "loss": 0.0709, + "mean_token_accuracy": 0.983178836107254, + "num_tokens": 3389050.0, + "step": 1590 + }, + { + "entropy": 1.9782520651817321, + "epoch": 0.5250164581961817, + "grad_norm": 2.2735812664031982, + "learning_rate": 1.7634727270839645e-05, + "loss": 0.0628, + "mean_token_accuracy": 0.9815650939941406, + "num_tokens": 3399987.0, + "step": 1595 + }, + { + "entropy": 2.048723006248474, + "epoch": 0.5266622778143516, + "grad_norm": 6.50067138671875, + "learning_rate": 1.761712308177359e-05, + "loss": 0.1481, + "mean_token_accuracy": 0.959316509962082, + "num_tokens": 3410677.0, + "step": 1600 + }, + { + "entropy": 2.094959032535553, + "epoch": 0.5283080974325214, + "grad_norm": 7.985395908355713, + "learning_rate": 1.7599462479886976e-05, + "loss": 0.0756, + "mean_token_accuracy": 0.9803730607032776, + "num_tokens": 3421071.0, + "step": 1605 + }, + { + "entropy": 2.046134078502655, + "epoch": 0.5299539170506913, + "grad_norm": 5.17471981048584, + "learning_rate": 1.7581745595975158e-05, + "loss": 0.0956, + "mean_token_accuracy": 0.9822999238967896, + "num_tokens": 3432117.0, + "step": 1610 + }, + { + "entropy": 2.171485185623169, + "epoch": 0.5315997366688611, + "grad_norm": 1.3401827812194824, + "learning_rate": 1.7563972561250323e-05, + "loss": 0.0513, + "mean_token_accuracy": 0.9851057410240174, + "num_tokens": 3442596.0, + "step": 1615 + }, + { + "entropy": 2.094390308856964, + "epoch": 0.533245556287031, + "grad_norm": 5.245898246765137, + "learning_rate": 1.7546143507340517e-05, + "loss": 0.068, + "mean_token_accuracy": 0.9802603662014008, + "num_tokens": 3453151.0, + "step": 1620 + }, + { + "entropy": 2.0529758810997008, + "epoch": 0.5348913759052007, + "grad_norm": 2.95365309715271, + "learning_rate": 1.7528258566288666e-05, + "loss": 0.0872, + "mean_token_accuracy": 0.9790965497493744, + "num_tokens": 3463685.0, + "step": 1625 + }, + { + "entropy": 2.0168527364730835, + "epoch": 0.5365371955233706, + "grad_norm": 3.4701573848724365, + "learning_rate": 1.75103178705516e-05, + "loss": 0.0809, + "mean_token_accuracy": 0.979897940158844, + "num_tokens": 3474671.0, + "step": 1630 + }, + { + "entropy": 2.0435917496681215, + "epoch": 0.5381830151415404, + "grad_norm": 2.95245361328125, + "learning_rate": 1.7492321552999076e-05, + "loss": 0.0423, + "mean_token_accuracy": 0.9878913462162018, + "num_tokens": 3485155.0, + "step": 1635 + }, + { + "entropy": 1.9816929221153259, + "epoch": 0.5398288347597103, + "grad_norm": 3.2681093215942383, + "learning_rate": 1.747426974691277e-05, + "loss": 0.053, + "mean_token_accuracy": 0.9853954315185547, + "num_tokens": 3495688.0, + "step": 1640 + }, + { + "entropy": 1.8952534437179565, + "epoch": 0.5414746543778802, + "grad_norm": 3.5420217514038086, + "learning_rate": 1.7456162585985335e-05, + "loss": 0.055, + "mean_token_accuracy": 0.9834610342979431, + "num_tokens": 3506316.0, + "step": 1645 + }, + { + "entropy": 1.9181370496749879, + "epoch": 0.54312047399605, + "grad_norm": 2.8061180114746094, + "learning_rate": 1.7438000204319365e-05, + "loss": 0.0615, + "mean_token_accuracy": 0.9796540081501007, + "num_tokens": 3517096.0, + "step": 1650 + }, + { + "entropy": 1.900493335723877, + "epoch": 0.5447662936142199, + "grad_norm": 6.022247791290283, + "learning_rate": 1.7419782736426433e-05, + "loss": 0.0686, + "mean_token_accuracy": 0.9824603438377381, + "num_tokens": 3527608.0, + "step": 1655 + }, + { + "entropy": 1.9555213689804076, + "epoch": 0.5464121132323897, + "grad_norm": 2.4826130867004395, + "learning_rate": 1.7401510317226077e-05, + "loss": 0.0602, + "mean_token_accuracy": 0.9860040962696075, + "num_tokens": 3538016.0, + "step": 1660 + }, + { + "entropy": 1.9975322604179382, + "epoch": 0.5480579328505596, + "grad_norm": 1.6209336519241333, + "learning_rate": 1.7383183082044814e-05, + "loss": 0.0632, + "mean_token_accuracy": 0.9838661909103393, + "num_tokens": 3548608.0, + "step": 1665 + }, + { + "entropy": 2.0439995765686034, + "epoch": 0.5497037524687294, + "grad_norm": 2.632179021835327, + "learning_rate": 1.7364801166615124e-05, + "loss": 0.0615, + "mean_token_accuracy": 0.9858087658882141, + "num_tokens": 3558846.0, + "step": 1670 + }, + { + "entropy": 2.084509313106537, + "epoch": 0.5513495720868993, + "grad_norm": 2.9909346103668213, + "learning_rate": 1.7346364707074453e-05, + "loss": 0.045, + "mean_token_accuracy": 0.9885495722293853, + "num_tokens": 3569553.0, + "step": 1675 + }, + { + "entropy": 2.1361532688140867, + "epoch": 0.5529953917050692, + "grad_norm": 2.4530577659606934, + "learning_rate": 1.732787383996421e-05, + "loss": 0.0783, + "mean_token_accuracy": 0.9758898675441742, + "num_tokens": 3580282.0, + "step": 1680 + }, + { + "entropy": 2.134485626220703, + "epoch": 0.554641211323239, + "grad_norm": 2.1198272705078125, + "learning_rate": 1.7309328702228742e-05, + "loss": 0.0757, + "mean_token_accuracy": 0.9795774221420288, + "num_tokens": 3591130.0, + "step": 1685 + }, + { + "entropy": 2.2107550859451295, + "epoch": 0.5562870309414089, + "grad_norm": 3.947378158569336, + "learning_rate": 1.729072943121433e-05, + "loss": 0.1107, + "mean_token_accuracy": 0.9773122131824493, + "num_tokens": 3601615.0, + "step": 1690 + }, + { + "entropy": 2.156628942489624, + "epoch": 0.5579328505595786, + "grad_norm": 5.117584228515625, + "learning_rate": 1.727207616466817e-05, + "loss": 0.0573, + "mean_token_accuracy": 0.9852468788623809, + "num_tokens": 3612527.0, + "step": 1695 + }, + { + "entropy": 2.1495004177093504, + "epoch": 0.5595786701777485, + "grad_norm": 3.0924577713012695, + "learning_rate": 1.725336904073735e-05, + "loss": 0.0499, + "mean_token_accuracy": 0.9855070650577545, + "num_tokens": 3623196.0, + "step": 1700 + }, + { + "entropy": 2.139863908290863, + "epoch": 0.5612244897959183, + "grad_norm": 2.3820176124572754, + "learning_rate": 1.723460819796783e-05, + "loss": 0.0439, + "mean_token_accuracy": 0.9874242305755615, + "num_tokens": 3634081.0, + "step": 1705 + }, + { + "entropy": 2.0737234473228456, + "epoch": 0.5628703094140882, + "grad_norm": 4.342321395874023, + "learning_rate": 1.7215793775303415e-05, + "loss": 0.0857, + "mean_token_accuracy": 0.9764466941356659, + "num_tokens": 3644499.0, + "step": 1710 + }, + { + "entropy": 2.0394096612930297, + "epoch": 0.5645161290322581, + "grad_norm": 2.6576485633850098, + "learning_rate": 1.719692591208472e-05, + "loss": 0.0679, + "mean_token_accuracy": 0.9818757474422455, + "num_tokens": 3655219.0, + "step": 1715 + }, + { + "entropy": 2.023374152183533, + "epoch": 0.5661619486504279, + "grad_norm": 1.6631029844284058, + "learning_rate": 1.7178004748048157e-05, + "loss": 0.0681, + "mean_token_accuracy": 0.9818334817886353, + "num_tokens": 3665785.0, + "step": 1720 + }, + { + "entropy": 2.0491710901260376, + "epoch": 0.5678077682685978, + "grad_norm": 1.3181400299072266, + "learning_rate": 1.7159030423324873e-05, + "loss": 0.0395, + "mean_token_accuracy": 0.9904541313648224, + "num_tokens": 3676536.0, + "step": 1725 + }, + { + "entropy": 1.9250925421714782, + "epoch": 0.5694535878867676, + "grad_norm": 3.0907676219940186, + "learning_rate": 1.7140003078439727e-05, + "loss": 0.0726, + "mean_token_accuracy": 0.9785450220108032, + "num_tokens": 3687519.0, + "step": 1730 + }, + { + "entropy": 1.9846270561218262, + "epoch": 0.5710994075049375, + "grad_norm": 2.5993564128875732, + "learning_rate": 1.712092285431026e-05, + "loss": 0.0729, + "mean_token_accuracy": 0.9780683517456055, + "num_tokens": 3698184.0, + "step": 1735 + }, + { + "entropy": 1.9407028794288634, + "epoch": 0.5727452271231073, + "grad_norm": 2.01981520652771, + "learning_rate": 1.710178989224562e-05, + "loss": 0.0693, + "mean_token_accuracy": 0.9816851794719696, + "num_tokens": 3709017.0, + "step": 1740 + }, + { + "entropy": 1.9194607377052306, + "epoch": 0.5743910467412772, + "grad_norm": 2.252833843231201, + "learning_rate": 1.7082604333945557e-05, + "loss": 0.0855, + "mean_token_accuracy": 0.9747888624668122, + "num_tokens": 3719609.0, + "step": 1745 + }, + { + "entropy": 1.9611817717552185, + "epoch": 0.576036866359447, + "grad_norm": 6.348813056945801, + "learning_rate": 1.7063366321499338e-05, + "loss": 0.0764, + "mean_token_accuracy": 0.9813452184200286, + "num_tokens": 3730007.0, + "step": 1750 + }, + { + "entropy": 1.9477728486061097, + "epoch": 0.5776826859776168, + "grad_norm": 2.917712926864624, + "learning_rate": 1.7044075997384707e-05, + "loss": 0.0559, + "mean_token_accuracy": 0.9842540562152863, + "num_tokens": 3740830.0, + "step": 1755 + }, + { + "entropy": 2.0349454164505003, + "epoch": 0.5793285055957867, + "grad_norm": 2.951350212097168, + "learning_rate": 1.7024733504466843e-05, + "loss": 0.0465, + "mean_token_accuracy": 0.9853815793991089, + "num_tokens": 3751273.0, + "step": 1760 + }, + { + "entropy": 2.0125353813171385, + "epoch": 0.5809743252139565, + "grad_norm": 2.382908582687378, + "learning_rate": 1.7005338985997273e-05, + "loss": 0.0593, + "mean_token_accuracy": 0.9872986257076264, + "num_tokens": 3761728.0, + "step": 1765 + }, + { + "entropy": 2.04194176197052, + "epoch": 0.5826201448321264, + "grad_norm": 5.703618049621582, + "learning_rate": 1.6985892585612848e-05, + "loss": 0.1025, + "mean_token_accuracy": 0.9684024691581726, + "num_tokens": 3772794.0, + "step": 1770 + }, + { + "entropy": 2.026109528541565, + "epoch": 0.5842659644502962, + "grad_norm": 2.672579288482666, + "learning_rate": 1.6966394447334638e-05, + "loss": 0.0634, + "mean_token_accuracy": 0.9840988755226135, + "num_tokens": 3783388.0, + "step": 1775 + }, + { + "entropy": 2.0058682441711424, + "epoch": 0.5859117840684661, + "grad_norm": 2.395892858505249, + "learning_rate": 1.69468447155669e-05, + "loss": 0.0432, + "mean_token_accuracy": 0.9877268433570862, + "num_tokens": 3793954.0, + "step": 1780 + }, + { + "entropy": 1.9445565462112426, + "epoch": 0.5875576036866359, + "grad_norm": 2.7134897708892822, + "learning_rate": 1.6927243535095995e-05, + "loss": 0.077, + "mean_token_accuracy": 0.9795953154563903, + "num_tokens": 3804724.0, + "step": 1785 + }, + { + "entropy": 1.936465561389923, + "epoch": 0.5892034233048058, + "grad_norm": 1.9062567949295044, + "learning_rate": 1.6907591051089313e-05, + "loss": 0.0685, + "mean_token_accuracy": 0.9803402423858643, + "num_tokens": 3815254.0, + "step": 1790 + }, + { + "entropy": 1.9493210196495057, + "epoch": 0.5908492429229757, + "grad_norm": 3.1633780002593994, + "learning_rate": 1.6887887409094195e-05, + "loss": 0.0475, + "mean_token_accuracy": 0.9850197255611419, + "num_tokens": 3825885.0, + "step": 1795 + }, + { + "entropy": 1.8948396682739257, + "epoch": 0.5924950625411455, + "grad_norm": 3.2793633937835693, + "learning_rate": 1.6868132755036875e-05, + "loss": 0.0539, + "mean_token_accuracy": 0.9861530363559723, + "num_tokens": 3836784.0, + "step": 1800 + }, + { + "entropy": 1.935602605342865, + "epoch": 0.5941408821593154, + "grad_norm": 3.2881906032562256, + "learning_rate": 1.6848327235221368e-05, + "loss": 0.0827, + "mean_token_accuracy": 0.9799518406391143, + "num_tokens": 3847307.0, + "step": 1805 + }, + { + "entropy": 1.9104020118713378, + "epoch": 0.5957867017774852, + "grad_norm": 1.5671672821044922, + "learning_rate": 1.6828470996328418e-05, + "loss": 0.0566, + "mean_token_accuracy": 0.9839206755161285, + "num_tokens": 3858140.0, + "step": 1810 + }, + { + "entropy": 1.9541502475738526, + "epoch": 0.5974325213956551, + "grad_norm": 3.0334932804107666, + "learning_rate": 1.680856418541439e-05, + "loss": 0.0717, + "mean_token_accuracy": 0.9826797246932983, + "num_tokens": 3868625.0, + "step": 1815 + }, + { + "entropy": 1.9071632623672485, + "epoch": 0.5990783410138248, + "grad_norm": 3.8940181732177734, + "learning_rate": 1.6788606949910188e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.9818158507347107, + "num_tokens": 3879340.0, + "step": 1820 + }, + { + "epoch": 0.6003949967083607, + "eval_entropy": 1.9597396565445562, + "eval_loss": 0.06656259298324585, + "eval_mean_token_accuracy": 0.9824842680084602, + "eval_num_tokens": 3887690.0, + "eval_runtime": 195.6298, + "eval_samples_per_second": 42.565, + "eval_steps_per_second": 7.095, + "step": 1824 + }, + { + "entropy": 1.948818302154541, + "epoch": 0.6007241606319947, + "grad_norm": 2.146757125854492, + "learning_rate": 1.6768599437620166e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.9824907779693604, + "num_tokens": 3889882.0, + "step": 1825 + }, + { + "entropy": 1.953722858428955, + "epoch": 0.6023699802501645, + "grad_norm": 4.7035112380981445, + "learning_rate": 1.6748541796721026e-05, + "loss": 0.0759, + "mean_token_accuracy": 0.9829168796539307, + "num_tokens": 3900763.0, + "step": 1830 + }, + { + "entropy": 1.9083998680114747, + "epoch": 0.6040157998683344, + "grad_norm": 3.567713737487793, + "learning_rate": 1.6728434175760733e-05, + "loss": 0.0602, + "mean_token_accuracy": 0.9855620503425598, + "num_tokens": 3911307.0, + "step": 1835 + }, + { + "entropy": 1.9374099254608155, + "epoch": 0.6056616194865043, + "grad_norm": 5.015064716339111, + "learning_rate": 1.6708276723657396e-05, + "loss": 0.0556, + "mean_token_accuracy": 0.986617261171341, + "num_tokens": 3921526.0, + "step": 1840 + }, + { + "entropy": 1.9269861102104187, + "epoch": 0.6073074391046741, + "grad_norm": 3.2826483249664307, + "learning_rate": 1.6688069589698194e-05, + "loss": 0.0727, + "mean_token_accuracy": 0.9803520023822785, + "num_tokens": 3932144.0, + "step": 1845 + }, + { + "entropy": 1.8954154014587403, + "epoch": 0.608953258722844, + "grad_norm": 6.194475173950195, + "learning_rate": 1.6667812923538226e-05, + "loss": 0.0841, + "mean_token_accuracy": 0.9779143512248993, + "num_tokens": 3943304.0, + "step": 1850 + }, + { + "entropy": 1.989688503742218, + "epoch": 0.6105990783410138, + "grad_norm": 4.787328243255615, + "learning_rate": 1.664750687519945e-05, + "loss": 0.0542, + "mean_token_accuracy": 0.9866823434829712, + "num_tokens": 3953946.0, + "step": 1855 + }, + { + "entropy": 2.018328094482422, + "epoch": 0.6122448979591837, + "grad_norm": 6.004918098449707, + "learning_rate": 1.662715159506955e-05, + "loss": 0.0683, + "mean_token_accuracy": 0.9783054530620575, + "num_tokens": 3964563.0, + "step": 1860 + }, + { + "entropy": 2.0147560954093935, + "epoch": 0.6138907175773535, + "grad_norm": 3.540149450302124, + "learning_rate": 1.6606747233900816e-05, + "loss": 0.0612, + "mean_token_accuracy": 0.983444994688034, + "num_tokens": 3974938.0, + "step": 1865 + }, + { + "entropy": 2.0170241355895997, + "epoch": 0.6155365371955234, + "grad_norm": 2.3520843982696533, + "learning_rate": 1.6586293942809034e-05, + "loss": 0.0675, + "mean_token_accuracy": 0.980956619977951, + "num_tokens": 3985380.0, + "step": 1870 + }, + { + "entropy": 2.022734725475311, + "epoch": 0.6171823568136933, + "grad_norm": 2.997664451599121, + "learning_rate": 1.6565791873272373e-05, + "loss": 0.0435, + "mean_token_accuracy": 0.9847745478153229, + "num_tokens": 3995690.0, + "step": 1875 + }, + { + "entropy": 2.0023676991462707, + "epoch": 0.618828176431863, + "grad_norm": 2.1239171028137207, + "learning_rate": 1.6545241177130254e-05, + "loss": 0.0486, + "mean_token_accuracy": 0.9879401743412017, + "num_tokens": 4006475.0, + "step": 1880 + }, + { + "entropy": 1.9986587166786194, + "epoch": 0.620473996050033, + "grad_norm": 3.1917483806610107, + "learning_rate": 1.652464200658223e-05, + "loss": 0.0688, + "mean_token_accuracy": 0.9839587926864624, + "num_tokens": 4017079.0, + "step": 1885 + }, + { + "entropy": 2.038959813117981, + "epoch": 0.6221198156682027, + "grad_norm": 2.038222551345825, + "learning_rate": 1.650399451418686e-05, + "loss": 0.0706, + "mean_token_accuracy": 0.98506378531456, + "num_tokens": 4027591.0, + "step": 1890 + }, + { + "entropy": 2.021359896659851, + "epoch": 0.6237656352863726, + "grad_norm": 3.2785909175872803, + "learning_rate": 1.6483298852860584e-05, + "loss": 0.063, + "mean_token_accuracy": 0.9845045149326325, + "num_tokens": 4038090.0, + "step": 1895 + }, + { + "entropy": 2.045257091522217, + "epoch": 0.6254114549045424, + "grad_norm": 1.1910496950149536, + "learning_rate": 1.646255517587656e-05, + "loss": 0.0417, + "mean_token_accuracy": 0.9895485579967499, + "num_tokens": 4048536.0, + "step": 1900 + }, + { + "entropy": 2.0391509175300597, + "epoch": 0.6270572745227123, + "grad_norm": 6.128629207611084, + "learning_rate": 1.644176363686358e-05, + "loss": 0.0824, + "mean_token_accuracy": 0.9755302667617798, + "num_tokens": 4059195.0, + "step": 1905 + }, + { + "entropy": 2.0614427804946898, + "epoch": 0.6287030941408822, + "grad_norm": 2.5974950790405273, + "learning_rate": 1.6420924389804887e-05, + "loss": 0.0469, + "mean_token_accuracy": 0.989526915550232, + "num_tokens": 4069610.0, + "step": 1910 + }, + { + "entropy": 2.0109457969665527, + "epoch": 0.630348913759052, + "grad_norm": 4.8466362953186035, + "learning_rate": 1.6400037589037062e-05, + "loss": 0.0814, + "mean_token_accuracy": 0.9819941699504853, + "num_tokens": 4080106.0, + "step": 1915 + }, + { + "entropy": 2.0402795314788817, + "epoch": 0.6319947333772219, + "grad_norm": 3.8476603031158447, + "learning_rate": 1.6379103389248867e-05, + "loss": 0.0909, + "mean_token_accuracy": 0.9787125289440155, + "num_tokens": 4090739.0, + "step": 1920 + }, + { + "entropy": 2.1076374292373656, + "epoch": 0.6336405529953917, + "grad_norm": 3.3376083374023438, + "learning_rate": 1.63581219454801e-05, + "loss": 0.0598, + "mean_token_accuracy": 0.98504838347435, + "num_tokens": 4101214.0, + "step": 1925 + }, + { + "entropy": 2.0517534494400023, + "epoch": 0.6352863726135616, + "grad_norm": 6.078008651733398, + "learning_rate": 1.6337093413120463e-05, + "loss": 0.0745, + "mean_token_accuracy": 0.9847012758255005, + "num_tokens": 4111702.0, + "step": 1930 + }, + { + "entropy": 2.0724000096321107, + "epoch": 0.6369321922317314, + "grad_norm": 1.9413197040557861, + "learning_rate": 1.631601794790838e-05, + "loss": 0.0533, + "mean_token_accuracy": 0.9855819940567017, + "num_tokens": 4122503.0, + "step": 1935 + }, + { + "entropy": 2.0283157587051392, + "epoch": 0.6385780118499013, + "grad_norm": 3.654207468032837, + "learning_rate": 1.629489570592988e-05, + "loss": 0.0704, + "mean_token_accuracy": 0.9817177176475524, + "num_tokens": 4132981.0, + "step": 1940 + }, + { + "entropy": 1.9633773922920228, + "epoch": 0.640223831468071, + "grad_norm": 5.509681701660156, + "learning_rate": 1.6273726843617413e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9831919133663177, + "num_tokens": 4143616.0, + "step": 1945 + }, + { + "entropy": 1.8812169671058654, + "epoch": 0.6418696510862409, + "grad_norm": 5.453320503234863, + "learning_rate": 1.625251151774871e-05, + "loss": 0.0743, + "mean_token_accuracy": 0.9811854064464569, + "num_tokens": 4154118.0, + "step": 1950 + }, + { + "entropy": 1.8234861969947815, + "epoch": 0.6435154707044108, + "grad_norm": 1.3750858306884766, + "learning_rate": 1.62312498854456e-05, + "loss": 0.0351, + "mean_token_accuracy": 0.987557852268219, + "num_tokens": 4165209.0, + "step": 1955 + }, + { + "entropy": 1.8196370124816894, + "epoch": 0.6451612903225806, + "grad_norm": 3.4758753776550293, + "learning_rate": 1.620994210417287e-05, + "loss": 0.0615, + "mean_token_accuracy": 0.9843147337436676, + "num_tokens": 4175834.0, + "step": 1960 + }, + { + "entropy": 1.8451374650001526, + "epoch": 0.6468071099407505, + "grad_norm": 0.8775233626365662, + "learning_rate": 1.6188588331737086e-05, + "loss": 0.1016, + "mean_token_accuracy": 0.9828826725482941, + "num_tokens": 4186493.0, + "step": 1965 + }, + { + "entropy": 1.8466584086418152, + "epoch": 0.6484529295589203, + "grad_norm": 2.0667057037353516, + "learning_rate": 1.6167188726285433e-05, + "loss": 0.041, + "mean_token_accuracy": 0.985245656967163, + "num_tokens": 4196992.0, + "step": 1970 + }, + { + "entropy": 1.820866048336029, + "epoch": 0.6500987491770902, + "grad_norm": 1.9866774082183838, + "learning_rate": 1.6145743446304524e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.9827068746089935, + "num_tokens": 4207748.0, + "step": 1975 + }, + { + "entropy": 1.7237545490264892, + "epoch": 0.65174456879526, + "grad_norm": 4.054563045501709, + "learning_rate": 1.6124252650619257e-05, + "loss": 0.0721, + "mean_token_accuracy": 0.9837478876113892, + "num_tokens": 4218653.0, + "step": 1980 + }, + { + "entropy": 1.7705234289169312, + "epoch": 0.6533903884134299, + "grad_norm": 2.3912136554718018, + "learning_rate": 1.610271649839161e-05, + "loss": 0.0548, + "mean_token_accuracy": 0.9835083067417145, + "num_tokens": 4229354.0, + "step": 1985 + }, + { + "entropy": 1.733774983882904, + "epoch": 0.6550362080315998, + "grad_norm": 7.83612060546875, + "learning_rate": 1.608113514911948e-05, + "loss": 0.0886, + "mean_token_accuracy": 0.9790404379367829, + "num_tokens": 4240302.0, + "step": 1990 + }, + { + "entropy": 1.6919760227203369, + "epoch": 0.6566820276497696, + "grad_norm": 4.122557163238525, + "learning_rate": 1.6059508762635482e-05, + "loss": 0.0658, + "mean_token_accuracy": 0.9874409794807434, + "num_tokens": 4250965.0, + "step": 1995 + }, + { + "entropy": 1.727248990535736, + "epoch": 0.6583278472679395, + "grad_norm": 2.4015748500823975, + "learning_rate": 1.6037837499105804e-05, + "loss": 0.0619, + "mean_token_accuracy": 0.9821118116378784, + "num_tokens": 4261750.0, + "step": 2000 + }, + { + "entropy": 1.7900878310203552, + "epoch": 0.6599736668861093, + "grad_norm": 3.262315034866333, + "learning_rate": 1.601612151902897e-05, + "loss": 0.053, + "mean_token_accuracy": 0.9853106379508972, + "num_tokens": 4272360.0, + "step": 2005 + }, + { + "entropy": 1.8539373755455018, + "epoch": 0.6616194865042792, + "grad_norm": 3.7115604877471924, + "learning_rate": 1.5994360983234698e-05, + "loss": 0.0681, + "mean_token_accuracy": 0.9834797859191895, + "num_tokens": 4283012.0, + "step": 2010 + }, + { + "entropy": 1.888733983039856, + "epoch": 0.6632653061224489, + "grad_norm": 1.2049510478973389, + "learning_rate": 1.5972556052882672e-05, + "loss": 0.0399, + "mean_token_accuracy": 0.9876511216163635, + "num_tokens": 4293353.0, + "step": 2015 + }, + { + "entropy": 1.8981561660766602, + "epoch": 0.6649111257406188, + "grad_norm": 4.529847621917725, + "learning_rate": 1.595070688946138e-05, + "loss": 0.0757, + "mean_token_accuracy": 0.9844479858875275, + "num_tokens": 4303907.0, + "step": 2020 + }, + { + "entropy": 1.9379532098770142, + "epoch": 0.6665569453587886, + "grad_norm": 1.7552553415298462, + "learning_rate": 1.592881365478688e-05, + "loss": 0.0662, + "mean_token_accuracy": 0.983349347114563, + "num_tokens": 4314505.0, + "step": 2025 + }, + { + "entropy": 1.9738388657569885, + "epoch": 0.6682027649769585, + "grad_norm": 2.010423183441162, + "learning_rate": 1.590687651100165e-05, + "loss": 0.0707, + "mean_token_accuracy": 0.9822420716285706, + "num_tokens": 4325178.0, + "step": 2030 + }, + { + "entropy": 1.9870564699172975, + "epoch": 0.6698485845951284, + "grad_norm": 3.9561121463775635, + "learning_rate": 1.5884895620573346e-05, + "loss": 0.0568, + "mean_token_accuracy": 0.9854467332363128, + "num_tokens": 4335932.0, + "step": 2035 + }, + { + "entropy": 1.989486038684845, + "epoch": 0.6714944042132982, + "grad_norm": 2.525803565979004, + "learning_rate": 1.5862871146293616e-05, + "loss": 0.0593, + "mean_token_accuracy": 0.9792122423648835, + "num_tokens": 4346565.0, + "step": 2040 + }, + { + "entropy": 1.9590755701065063, + "epoch": 0.6731402238314681, + "grad_norm": 3.8919122219085693, + "learning_rate": 1.5840803251276892e-05, + "loss": 0.0805, + "mean_token_accuracy": 0.9800050914287567, + "num_tokens": 4357420.0, + "step": 2045 + }, + { + "entropy": 2.030180037021637, + "epoch": 0.6747860434496379, + "grad_norm": 3.5018019676208496, + "learning_rate": 1.5818692098959187e-05, + "loss": 0.099, + "mean_token_accuracy": 0.9742182731628418, + "num_tokens": 4368006.0, + "step": 2050 + }, + { + "entropy": 2.0822962045669557, + "epoch": 0.6764318630678078, + "grad_norm": 4.686304092407227, + "learning_rate": 1.5796537853096875e-05, + "loss": 0.0766, + "mean_token_accuracy": 0.9759474039077759, + "num_tokens": 4378686.0, + "step": 2055 + }, + { + "entropy": 2.134053909778595, + "epoch": 0.6780776826859776, + "grad_norm": 4.483613014221191, + "learning_rate": 1.5774340677765483e-05, + "loss": 0.0743, + "mean_token_accuracy": 0.9787837326526642, + "num_tokens": 4389381.0, + "step": 2060 + }, + { + "entropy": 2.101311683654785, + "epoch": 0.6797235023041475, + "grad_norm": 4.380194187164307, + "learning_rate": 1.575210073735848e-05, + "loss": 0.0875, + "mean_token_accuracy": 0.9795165956020355, + "num_tokens": 4400273.0, + "step": 2065 + }, + { + "entropy": 2.1265645384788514, + "epoch": 0.6813693219223174, + "grad_norm": 5.121950626373291, + "learning_rate": 1.572981819658605e-05, + "loss": 0.0539, + "mean_token_accuracy": 0.9842699348926545, + "num_tokens": 4411134.0, + "step": 2070 + }, + { + "entropy": 2.063078737258911, + "epoch": 0.6830151415404871, + "grad_norm": 1.915604829788208, + "learning_rate": 1.5707493220473886e-05, + "loss": 0.039, + "mean_token_accuracy": 0.9892246127128601, + "num_tokens": 4421711.0, + "step": 2075 + }, + { + "entropy": 2.0187936067581176, + "epoch": 0.684660961158657, + "grad_norm": 6.068004131317139, + "learning_rate": 1.568512597436195e-05, + "loss": 0.0724, + "mean_token_accuracy": 0.9800882399082184, + "num_tokens": 4432281.0, + "step": 2080 + }, + { + "entropy": 2.019049596786499, + "epoch": 0.6863067807768268, + "grad_norm": 3.6800379753112793, + "learning_rate": 1.566271662390326e-05, + "loss": 0.0666, + "mean_token_accuracy": 0.9861168086528778, + "num_tokens": 4442927.0, + "step": 2085 + }, + { + "entropy": 2.0945070028305053, + "epoch": 0.6879526003949967, + "grad_norm": 5.768988132476807, + "learning_rate": 1.564026533506267e-05, + "loss": 0.0637, + "mean_token_accuracy": 0.9825628995895386, + "num_tokens": 4453272.0, + "step": 2090 + }, + { + "entropy": 2.1388468503952027, + "epoch": 0.6895984200131665, + "grad_norm": 3.217071294784546, + "learning_rate": 1.5617772274115618e-05, + "loss": 0.0478, + "mean_token_accuracy": 0.9849779725074768, + "num_tokens": 4464006.0, + "step": 2095 + }, + { + "entropy": 2.1074278712272645, + "epoch": 0.6912442396313364, + "grad_norm": 2.122255563735962, + "learning_rate": 1.559523760764692e-05, + "loss": 0.1015, + "mean_token_accuracy": 0.9739717364311218, + "num_tokens": 4475018.0, + "step": 2100 + }, + { + "entropy": 2.1118789672851563, + "epoch": 0.6928900592495063, + "grad_norm": 2.033972978591919, + "learning_rate": 1.5572661502549514e-05, + "loss": 0.0485, + "mean_token_accuracy": 0.9862671077251435, + "num_tokens": 4485676.0, + "step": 2105 + }, + { + "entropy": 2.1021223068237305, + "epoch": 0.6945358788676761, + "grad_norm": 0.6050166487693787, + "learning_rate": 1.5550044126023245e-05, + "loss": 0.0395, + "mean_token_accuracy": 0.9876242458820343, + "num_tokens": 4496186.0, + "step": 2110 + }, + { + "entropy": 1.970346164703369, + "epoch": 0.696181698485846, + "grad_norm": 3.1835389137268066, + "learning_rate": 1.5527385645573613e-05, + "loss": 0.0538, + "mean_token_accuracy": 0.9837459921836853, + "num_tokens": 4507128.0, + "step": 2115 + }, + { + "entropy": 1.9633802771568298, + "epoch": 0.6978275181040158, + "grad_norm": 0.6275081634521484, + "learning_rate": 1.5504686229010535e-05, + "loss": 0.0388, + "mean_token_accuracy": 0.987060683965683, + "num_tokens": 4517561.0, + "step": 2120 + }, + { + "entropy": 1.889497458934784, + "epoch": 0.6994733377221857, + "grad_norm": 8.642451286315918, + "learning_rate": 1.54819460444471e-05, + "loss": 0.0617, + "mean_token_accuracy": 0.9837220013141632, + "num_tokens": 4528490.0, + "step": 2125 + }, + { + "entropy": 1.8756322503089904, + "epoch": 0.7011191573403555, + "grad_norm": 1.7640129327774048, + "learning_rate": 1.545916526029833e-05, + "loss": 0.0606, + "mean_token_accuracy": 0.9831870555877685, + "num_tokens": 4539249.0, + "step": 2130 + }, + { + "entropy": 1.84752117395401, + "epoch": 0.7027649769585254, + "grad_norm": 1.39552903175354, + "learning_rate": 1.5436344045279935e-05, + "loss": 0.079, + "mean_token_accuracy": 0.9791609466075897, + "num_tokens": 4550292.0, + "step": 2135 + }, + { + "entropy": 1.8886283040046692, + "epoch": 0.7044107965766951, + "grad_norm": 1.880411148071289, + "learning_rate": 1.5413482568407044e-05, + "loss": 0.0807, + "mean_token_accuracy": 0.9817405939102173, + "num_tokens": 4560802.0, + "step": 2140 + }, + { + "entropy": 1.8611098527908325, + "epoch": 0.706056616194865, + "grad_norm": 2.7298200130462646, + "learning_rate": 1.539058099899299e-05, + "loss": 0.0653, + "mean_token_accuracy": 0.9814600467681884, + "num_tokens": 4571690.0, + "step": 2145 + }, + { + "entropy": 1.791181170940399, + "epoch": 0.7077024358130349, + "grad_norm": 5.456151485443115, + "learning_rate": 1.5367639506648006e-05, + "loss": 0.0708, + "mean_token_accuracy": 0.9785441160202026, + "num_tokens": 4582204.0, + "step": 2150 + }, + { + "entropy": 1.752633535861969, + "epoch": 0.7093482554312047, + "grad_norm": 4.673461437225342, + "learning_rate": 1.5344658261278013e-05, + "loss": 0.0629, + "mean_token_accuracy": 0.982696121931076, + "num_tokens": 4592993.0, + "step": 2155 + }, + { + "entropy": 1.810698115825653, + "epoch": 0.7109940750493746, + "grad_norm": 4.658589839935303, + "learning_rate": 1.532163743308335e-05, + "loss": 0.0453, + "mean_token_accuracy": 0.990430223941803, + "num_tokens": 4603443.0, + "step": 2160 + }, + { + "entropy": 1.7778592824935913, + "epoch": 0.7126398946675444, + "grad_norm": 3.202155828475952, + "learning_rate": 1.5298577192557487e-05, + "loss": 0.069, + "mean_token_accuracy": 0.98820481300354, + "num_tokens": 4614058.0, + "step": 2165 + }, + { + "entropy": 1.7779869794845582, + "epoch": 0.7142857142857143, + "grad_norm": 1.4483567476272583, + "learning_rate": 1.5275477710485812e-05, + "loss": 0.0808, + "mean_token_accuracy": 0.9856044292449951, + "num_tokens": 4625236.0, + "step": 2170 + }, + { + "entropy": 1.900163722038269, + "epoch": 0.7159315339038841, + "grad_norm": 1.0799473524093628, + "learning_rate": 1.525233915794432e-05, + "loss": 0.0379, + "mean_token_accuracy": 0.9869786322116851, + "num_tokens": 4635887.0, + "step": 2175 + }, + { + "entropy": 1.8798472046852113, + "epoch": 0.717577353522054, + "grad_norm": 4.686890602111816, + "learning_rate": 1.522916170629836e-05, + "loss": 0.0359, + "mean_token_accuracy": 0.9911372125148773, + "num_tokens": 4646429.0, + "step": 2180 + }, + { + "entropy": 1.9202223777770997, + "epoch": 0.7192231731402239, + "grad_norm": 3.380596399307251, + "learning_rate": 1.5205945527201386e-05, + "loss": 0.0877, + "mean_token_accuracy": 0.9812184333801269, + "num_tokens": 4656951.0, + "step": 2185 + }, + { + "entropy": 1.916778802871704, + "epoch": 0.7208689927583937, + "grad_norm": 2.9656357765197754, + "learning_rate": 1.5182690792593659e-05, + "loss": 0.0866, + "mean_token_accuracy": 0.9801094174385071, + "num_tokens": 4667789.0, + "step": 2190 + }, + { + "entropy": 1.9705326914787293, + "epoch": 0.7225148123765636, + "grad_norm": 1.6277377605438232, + "learning_rate": 1.515939767470098e-05, + "loss": 0.0684, + "mean_token_accuracy": 0.9821272671222687, + "num_tokens": 4678462.0, + "step": 2195 + }, + { + "entropy": 2.0496770977973937, + "epoch": 0.7241606319947334, + "grad_norm": 2.6740543842315674, + "learning_rate": 1.5136066346033431e-05, + "loss": 0.0827, + "mean_token_accuracy": 0.9822946667671204, + "num_tokens": 4689142.0, + "step": 2200 + }, + { + "entropy": 1.99622882604599, + "epoch": 0.7258064516129032, + "grad_norm": 2.084319829940796, + "learning_rate": 1.5112696979384076e-05, + "loss": 0.0622, + "mean_token_accuracy": 0.9843969225883484, + "num_tokens": 4699742.0, + "step": 2205 + }, + { + "entropy": 1.9976887702941895, + "epoch": 0.727452271231073, + "grad_norm": 4.397096633911133, + "learning_rate": 1.5089289747827698e-05, + "loss": 0.0672, + "mean_token_accuracy": 0.9822742283344269, + "num_tokens": 4710529.0, + "step": 2210 + }, + { + "entropy": 2.00170783996582, + "epoch": 0.7290980908492429, + "grad_norm": 1.066505789756775, + "learning_rate": 1.5065844824719498e-05, + "loss": 0.0403, + "mean_token_accuracy": 0.9920450508594513, + "num_tokens": 4721625.0, + "step": 2215 + }, + { + "entropy": 2.0141910314559937, + "epoch": 0.7307439104674127, + "grad_norm": 3.045076370239258, + "learning_rate": 1.504236238369383e-05, + "loss": 0.0529, + "mean_token_accuracy": 0.9883207738399505, + "num_tokens": 4732195.0, + "step": 2220 + }, + { + "entropy": 2.031786823272705, + "epoch": 0.7323897300855826, + "grad_norm": 5.866887092590332, + "learning_rate": 1.5018842598662913e-05, + "loss": 0.08, + "mean_token_accuracy": 0.9810294449329376, + "num_tokens": 4742750.0, + "step": 2225 + }, + { + "entropy": 2.0390859127044676, + "epoch": 0.7340355497037525, + "grad_norm": 3.7776222229003906, + "learning_rate": 1.499528564381553e-05, + "loss": 0.0844, + "mean_token_accuracy": 0.9762147605419159, + "num_tokens": 4753095.0, + "step": 2230 + }, + { + "entropy": 2.058468997478485, + "epoch": 0.7356813693219223, + "grad_norm": 2.748897075653076, + "learning_rate": 1.497169169361574e-05, + "loss": 0.0628, + "mean_token_accuracy": 0.9859977424144745, + "num_tokens": 4763609.0, + "step": 2235 + }, + { + "entropy": 1.9909031867980957, + "epoch": 0.7373271889400922, + "grad_norm": 1.4906212091445923, + "learning_rate": 1.4948060922801609e-05, + "loss": 0.0506, + "mean_token_accuracy": 0.9845727860927582, + "num_tokens": 4774709.0, + "step": 2240 + }, + { + "entropy": 1.9794199824333192, + "epoch": 0.738973008558262, + "grad_norm": 5.407120227813721, + "learning_rate": 1.492439350638388e-05, + "loss": 0.0638, + "mean_token_accuracy": 0.9835636436939239, + "num_tokens": 4785745.0, + "step": 2245 + }, + { + "entropy": 2.052793502807617, + "epoch": 0.7406188281764319, + "grad_norm": 0.8577380776405334, + "learning_rate": 1.4900689619644709e-05, + "loss": 0.0635, + "mean_token_accuracy": 0.9855947732925415, + "num_tokens": 4796471.0, + "step": 2250 + }, + { + "entropy": 2.1061079859733582, + "epoch": 0.7422646477946017, + "grad_norm": 3.701122283935547, + "learning_rate": 1.4876949438136348e-05, + "loss": 0.0884, + "mean_token_accuracy": 0.9817906200885773, + "num_tokens": 4807169.0, + "step": 2255 + }, + { + "entropy": 2.124563980102539, + "epoch": 0.7439104674127716, + "grad_norm": 0.8693974018096924, + "learning_rate": 1.4853173137679845e-05, + "loss": 0.0565, + "mean_token_accuracy": 0.9871229946613311, + "num_tokens": 4818068.0, + "step": 2260 + }, + { + "entropy": 2.1583971261978148, + "epoch": 0.7455562870309415, + "grad_norm": 3.7275748252868652, + "learning_rate": 1.4829360894363755e-05, + "loss": 0.0685, + "mean_token_accuracy": 0.981871497631073, + "num_tokens": 4829134.0, + "step": 2265 + }, + { + "entropy": 2.1540443658828736, + "epoch": 0.7472021066491112, + "grad_norm": 5.0215373039245605, + "learning_rate": 1.4805512884542828e-05, + "loss": 0.1157, + "mean_token_accuracy": 0.9756322801113129, + "num_tokens": 4839612.0, + "step": 2270 + }, + { + "entropy": 2.0997665166854858, + "epoch": 0.7488479262672811, + "grad_norm": 1.8622218370437622, + "learning_rate": 1.4781629284836689e-05, + "loss": 0.076, + "mean_token_accuracy": 0.9799229741096497, + "num_tokens": 4850560.0, + "step": 2275 + }, + { + "entropy": 2.082018828392029, + "epoch": 0.7504937458854509, + "grad_norm": 3.323389768600464, + "learning_rate": 1.4757710272128562e-05, + "loss": 0.0679, + "mean_token_accuracy": 0.9854858696460724, + "num_tokens": 4861281.0, + "step": 2280 + }, + { + "entropy": 2.145115876197815, + "epoch": 0.7521395655036208, + "grad_norm": 3.6958255767822266, + "learning_rate": 1.4733756023563932e-05, + "loss": 0.0801, + "mean_token_accuracy": 0.9791579902172088, + "num_tokens": 4872072.0, + "step": 2285 + }, + { + "entropy": 2.2006231546401978, + "epoch": 0.7537853851217906, + "grad_norm": 2.5469164848327637, + "learning_rate": 1.4709766716549246e-05, + "loss": 0.0564, + "mean_token_accuracy": 0.9853972256183624, + "num_tokens": 4882541.0, + "step": 2290 + }, + { + "entropy": 2.155518102645874, + "epoch": 0.7554312047399605, + "grad_norm": 3.5733323097229004, + "learning_rate": 1.4685742528750584e-05, + "loss": 0.0609, + "mean_token_accuracy": 0.9829520225524903, + "num_tokens": 4893417.0, + "step": 2295 + }, + { + "entropy": 2.142457294464111, + "epoch": 0.7570770243581304, + "grad_norm": 3.7663276195526123, + "learning_rate": 1.4661683638092375e-05, + "loss": 0.0666, + "mean_token_accuracy": 0.9813594579696655, + "num_tokens": 4904063.0, + "step": 2300 + }, + { + "entropy": 2.1797530174255373, + "epoch": 0.7587228439763002, + "grad_norm": 3.873552083969116, + "learning_rate": 1.4637590222756041e-05, + "loss": 0.0482, + "mean_token_accuracy": 0.9882281720638275, + "num_tokens": 4914583.0, + "step": 2305 + }, + { + "entropy": 2.1009862065315246, + "epoch": 0.7603686635944701, + "grad_norm": 2.9890084266662598, + "learning_rate": 1.4613462461178714e-05, + "loss": 0.0542, + "mean_token_accuracy": 0.9857582569122314, + "num_tokens": 4925125.0, + "step": 2310 + }, + { + "entropy": 2.1062487721443177, + "epoch": 0.7620144832126399, + "grad_norm": 2.7217583656311035, + "learning_rate": 1.4589300532051875e-05, + "loss": 0.0649, + "mean_token_accuracy": 0.987279736995697, + "num_tokens": 4935529.0, + "step": 2315 + }, + { + "entropy": 2.0136227369308473, + "epoch": 0.7636603028308098, + "grad_norm": 6.62637996673584, + "learning_rate": 1.4565104614320065e-05, + "loss": 0.0809, + "mean_token_accuracy": 0.9829951286315918, + "num_tokens": 4946511.0, + "step": 2320 + }, + { + "entropy": 2.0690441846847536, + "epoch": 0.7653061224489796, + "grad_norm": 4.281233787536621, + "learning_rate": 1.4540874887179546e-05, + "loss": 0.0676, + "mean_token_accuracy": 0.9836612105369568, + "num_tokens": 4957109.0, + "step": 2325 + }, + { + "entropy": 2.0863526821136475, + "epoch": 0.7669519420671495, + "grad_norm": 2.6421709060668945, + "learning_rate": 1.451661153007697e-05, + "loss": 0.0828, + "mean_token_accuracy": 0.9836499035358429, + "num_tokens": 4967988.0, + "step": 2330 + }, + { + "entropy": 2.0738728761672975, + "epoch": 0.7685977616853192, + "grad_norm": 1.4690742492675781, + "learning_rate": 1.4492314722708057e-05, + "loss": 0.0488, + "mean_token_accuracy": 0.9871618390083313, + "num_tokens": 4978584.0, + "step": 2335 + }, + { + "entropy": 2.100434124469757, + "epoch": 0.7702435813034891, + "grad_norm": 3.107667922973633, + "learning_rate": 1.4467984645016259e-05, + "loss": 0.0678, + "mean_token_accuracy": 0.9847745180130005, + "num_tokens": 4989176.0, + "step": 2340 + }, + { + "entropy": 2.0772449016571044, + "epoch": 0.771889400921659, + "grad_norm": 2.9565625190734863, + "learning_rate": 1.4443621477191434e-05, + "loss": 0.0564, + "mean_token_accuracy": 0.9867813110351562, + "num_tokens": 4999990.0, + "step": 2345 + }, + { + "entropy": 2.0808992266654966, + "epoch": 0.7735352205398288, + "grad_norm": 4.1320390701293945, + "learning_rate": 1.4419225399668504e-05, + "loss": 0.0641, + "mean_token_accuracy": 0.982687771320343, + "num_tokens": 5010626.0, + "step": 2350 + }, + { + "entropy": 2.0873427987098694, + "epoch": 0.7751810401579987, + "grad_norm": 7.7029595375061035, + "learning_rate": 1.4394796593126121e-05, + "loss": 0.0628, + "mean_token_accuracy": 0.9848801851272583, + "num_tokens": 5021336.0, + "step": 2355 + }, + { + "entropy": 2.007732594013214, + "epoch": 0.7768268597761685, + "grad_norm": 1.4012236595153809, + "learning_rate": 1.4370335238485336e-05, + "loss": 0.042, + "mean_token_accuracy": 0.9886705160140992, + "num_tokens": 5032297.0, + "step": 2360 + }, + { + "entropy": 2.067774224281311, + "epoch": 0.7784726793943384, + "grad_norm": 3.532789945602417, + "learning_rate": 1.4345841516908244e-05, + "loss": 0.093, + "mean_token_accuracy": 0.9788289248943329, + "num_tokens": 5042834.0, + "step": 2365 + }, + { + "entropy": 2.1315778255462647, + "epoch": 0.7801184990125082, + "grad_norm": 3.001561403274536, + "learning_rate": 1.4321315609796656e-05, + "loss": 0.0596, + "mean_token_accuracy": 0.985314530134201, + "num_tokens": 5053499.0, + "step": 2370 + }, + { + "entropy": 2.088835906982422, + "epoch": 0.7817643186306781, + "grad_norm": 1.8496758937835693, + "learning_rate": 1.4296757698790752e-05, + "loss": 0.0544, + "mean_token_accuracy": 0.9835005939006806, + "num_tokens": 5064232.0, + "step": 2375 + }, + { + "entropy": 2.1136799812316895, + "epoch": 0.783410138248848, + "grad_norm": 3.9325993061065674, + "learning_rate": 1.4272167965767735e-05, + "loss": 0.0599, + "mean_token_accuracy": 0.9857981920242309, + "num_tokens": 5074681.0, + "step": 2380 + }, + { + "entropy": 2.082188093662262, + "epoch": 0.7850559578670178, + "grad_norm": 2.6712751388549805, + "learning_rate": 1.424754659284048e-05, + "loss": 0.0425, + "mean_token_accuracy": 0.9875874161720276, + "num_tokens": 5085364.0, + "step": 2385 + }, + { + "entropy": 2.0782926797866823, + "epoch": 0.7867017774851877, + "grad_norm": 2.5082037448883057, + "learning_rate": 1.422289376235619e-05, + "loss": 0.0924, + "mean_token_accuracy": 0.9773596286773681, + "num_tokens": 5095986.0, + "step": 2390 + }, + { + "entropy": 2.1379014015197755, + "epoch": 0.7883475971033574, + "grad_norm": 3.3415675163269043, + "learning_rate": 1.4198209656895045e-05, + "loss": 0.0538, + "mean_token_accuracy": 0.9802229166030884, + "num_tokens": 5106549.0, + "step": 2395 + }, + { + "entropy": 2.1798630714416505, + "epoch": 0.7899934167215273, + "grad_norm": 2.5034339427948, + "learning_rate": 1.4173494459268848e-05, + "loss": 0.0529, + "mean_token_accuracy": 0.9868557870388031, + "num_tokens": 5117206.0, + "step": 2400 + }, + { + "entropy": 2.1523491382598876, + "epoch": 0.7916392363396971, + "grad_norm": 2.866143226623535, + "learning_rate": 1.4148748352519677e-05, + "loss": 0.0434, + "mean_token_accuracy": 0.9868989825248718, + "num_tokens": 5127670.0, + "step": 2405 + }, + { + "entropy": 2.1185644268989563, + "epoch": 0.793285055957867, + "grad_norm": 3.293668270111084, + "learning_rate": 1.4123971519918516e-05, + "loss": 0.0704, + "mean_token_accuracy": 0.9803002297878265, + "num_tokens": 5138300.0, + "step": 2410 + }, + { + "entropy": 2.1225445747375487, + "epoch": 0.7949308755760369, + "grad_norm": 3.2386057376861572, + "learning_rate": 1.4099164144963914e-05, + "loss": 0.0745, + "mean_token_accuracy": 0.976332038640976, + "num_tokens": 5148898.0, + "step": 2415 + }, + { + "entropy": 2.0981351375579833, + "epoch": 0.7965766951942067, + "grad_norm": 2.420224666595459, + "learning_rate": 1.4074326411380617e-05, + "loss": 0.0554, + "mean_token_accuracy": 0.9834634006023407, + "num_tokens": 5159779.0, + "step": 2420 + }, + { + "entropy": 2.102435576915741, + "epoch": 0.7982225148123766, + "grad_norm": 4.499454975128174, + "learning_rate": 1.4049458503118206e-05, + "loss": 0.0728, + "mean_token_accuracy": 0.9828149318695069, + "num_tokens": 5170439.0, + "step": 2425 + }, + { + "entropy": 2.02862309217453, + "epoch": 0.7998683344305464, + "grad_norm": 1.3120136260986328, + "learning_rate": 1.4024560604349738e-05, + "loss": 0.07, + "mean_token_accuracy": 0.9814626455307007, + "num_tokens": 5181355.0, + "step": 2430 + }, + { + "epoch": 0.8005266622778143, + "eval_entropy": 2.0658205878150566, + "eval_loss": 0.05905340239405632, + "eval_mean_token_accuracy": 0.9841244661327054, + "eval_num_tokens": 5185715.0, + "eval_runtime": 197.2045, + "eval_samples_per_second": 42.225, + "eval_steps_per_second": 7.038, + "step": 2432 + }, + { + "entropy": 1.9992692589759826, + "epoch": 0.8015141540487163, + "grad_norm": 2.716399669647217, + "learning_rate": 1.3999632899470377e-05, + "loss": 0.0415, + "mean_token_accuracy": 0.9883556425571441, + "num_tokens": 5192402.0, + "step": 2435 + }, + { + "entropy": 2.073114001750946, + "epoch": 0.8031599736668861, + "grad_norm": 4.84559440612793, + "learning_rate": 1.3974675573096046e-05, + "loss": 0.0516, + "mean_token_accuracy": 0.9831765830516815, + "num_tokens": 5203184.0, + "step": 2440 + }, + { + "entropy": 2.0994094610214233, + "epoch": 0.804805793285056, + "grad_norm": 3.8086822032928467, + "learning_rate": 1.3949688810062033e-05, + "loss": 0.0714, + "mean_token_accuracy": 0.9829389989376068, + "num_tokens": 5213537.0, + "step": 2445 + }, + { + "entropy": 2.1006295680999756, + "epoch": 0.8064516129032258, + "grad_norm": 4.448069095611572, + "learning_rate": 1.3924672795421638e-05, + "loss": 0.0567, + "mean_token_accuracy": 0.9861303806304932, + "num_tokens": 5224028.0, + "step": 2450 + }, + { + "entropy": 2.0546552419662474, + "epoch": 0.8080974325213957, + "grad_norm": 2.4029576778411865, + "learning_rate": 1.389962771444481e-05, + "loss": 0.0561, + "mean_token_accuracy": 0.986747932434082, + "num_tokens": 5234600.0, + "step": 2455 + }, + { + "entropy": 2.0826905846595762, + "epoch": 0.8097432521395656, + "grad_norm": 3.8037831783294678, + "learning_rate": 1.3874553752616747e-05, + "loss": 0.0647, + "mean_token_accuracy": 0.9837893486022949, + "num_tokens": 5245430.0, + "step": 2460 + }, + { + "entropy": 2.101279282569885, + "epoch": 0.8113890717577353, + "grad_norm": 4.736576080322266, + "learning_rate": 1.3849451095636555e-05, + "loss": 0.0393, + "mean_token_accuracy": 0.9862057268619537, + "num_tokens": 5256031.0, + "step": 2465 + }, + { + "entropy": 2.0845310568809508, + "epoch": 0.8130348913759052, + "grad_norm": 3.912712812423706, + "learning_rate": 1.3824319929415856e-05, + "loss": 0.098, + "mean_token_accuracy": 0.9780865371227264, + "num_tokens": 5266625.0, + "step": 2470 + }, + { + "entropy": 1.9965445041656493, + "epoch": 0.814680710994075, + "grad_norm": 2.1379354000091553, + "learning_rate": 1.3799160440077407e-05, + "loss": 0.0305, + "mean_token_accuracy": 0.9910949647426606, + "num_tokens": 5277179.0, + "step": 2475 + }, + { + "entropy": 1.9245966672897339, + "epoch": 0.8163265306122449, + "grad_norm": 3.4994077682495117, + "learning_rate": 1.3773972813953726e-05, + "loss": 0.0606, + "mean_token_accuracy": 0.9876502931118012, + "num_tokens": 5287715.0, + "step": 2480 + }, + { + "entropy": 1.872160291671753, + "epoch": 0.8179723502304147, + "grad_norm": 4.733576774597168, + "learning_rate": 1.3748757237585729e-05, + "loss": 0.0691, + "mean_token_accuracy": 0.9822778880596161, + "num_tokens": 5298582.0, + "step": 2485 + }, + { + "entropy": 1.8784523367881776, + "epoch": 0.8196181698485846, + "grad_norm": 3.7372028827667236, + "learning_rate": 1.372351389772131e-05, + "loss": 0.0713, + "mean_token_accuracy": 0.9856967926025391, + "num_tokens": 5309140.0, + "step": 2490 + }, + { + "entropy": 1.9651299357414245, + "epoch": 0.8212639894667545, + "grad_norm": 3.022817611694336, + "learning_rate": 1.3698242981314e-05, + "loss": 0.0547, + "mean_token_accuracy": 0.9867731034755707, + "num_tokens": 5319522.0, + "step": 2495 + }, + { + "entropy": 1.943724513053894, + "epoch": 0.8229098090849243, + "grad_norm": 3.299440622329712, + "learning_rate": 1.3672944675521555e-05, + "loss": 0.0434, + "mean_token_accuracy": 0.985489410161972, + "num_tokens": 5330359.0, + "step": 2500 + }, + { + "entropy": 1.9148884296417237, + "epoch": 0.8245556287030942, + "grad_norm": 2.022271156311035, + "learning_rate": 1.3647619167704578e-05, + "loss": 0.0457, + "mean_token_accuracy": 0.9900146067142487, + "num_tokens": 5341225.0, + "step": 2505 + }, + { + "entropy": 1.8972573161125184, + "epoch": 0.826201448321264, + "grad_norm": 1.6173745393753052, + "learning_rate": 1.3622266645425135e-05, + "loss": 0.0371, + "mean_token_accuracy": 0.9884720623493195, + "num_tokens": 5351753.0, + "step": 2510 + }, + { + "entropy": 1.845130443572998, + "epoch": 0.8278472679394339, + "grad_norm": 3.775874376296997, + "learning_rate": 1.359688729644536e-05, + "loss": 0.0575, + "mean_token_accuracy": 0.985155212879181, + "num_tokens": 5362524.0, + "step": 2515 + }, + { + "entropy": 1.8224045038223267, + "epoch": 0.8294930875576036, + "grad_norm": 2.454981803894043, + "learning_rate": 1.3571481308726064e-05, + "loss": 0.0544, + "mean_token_accuracy": 0.9880367636680603, + "num_tokens": 5373427.0, + "step": 2520 + }, + { + "entropy": 1.854708468914032, + "epoch": 0.8311389071757735, + "grad_norm": 4.371317386627197, + "learning_rate": 1.3546048870425356e-05, + "loss": 0.0705, + "mean_token_accuracy": 0.9838429808616638, + "num_tokens": 5383996.0, + "step": 2525 + }, + { + "entropy": 1.8091017842292785, + "epoch": 0.8327847267939433, + "grad_norm": 2.3962645530700684, + "learning_rate": 1.3520590169897232e-05, + "loss": 0.0525, + "mean_token_accuracy": 0.9856068968772889, + "num_tokens": 5394554.0, + "step": 2530 + }, + { + "entropy": 1.827188992500305, + "epoch": 0.8344305464121132, + "grad_norm": 3.678561210632324, + "learning_rate": 1.3495105395690185e-05, + "loss": 0.0538, + "mean_token_accuracy": 0.9860463380813599, + "num_tokens": 5405311.0, + "step": 2535 + }, + { + "entropy": 1.8938659191131593, + "epoch": 0.8360763660302831, + "grad_norm": 4.154452323913574, + "learning_rate": 1.3469594736545816e-05, + "loss": 0.0566, + "mean_token_accuracy": 0.982678347826004, + "num_tokens": 5415855.0, + "step": 2540 + }, + { + "entropy": 1.8611347198486328, + "epoch": 0.8377221856484529, + "grad_norm": 5.7794365882873535, + "learning_rate": 1.344405838139743e-05, + "loss": 0.06, + "mean_token_accuracy": 0.9849008440971374, + "num_tokens": 5426534.0, + "step": 2545 + }, + { + "entropy": 1.8572628855705262, + "epoch": 0.8393680052666228, + "grad_norm": 3.2509214878082275, + "learning_rate": 1.341849651936864e-05, + "loss": 0.0449, + "mean_token_accuracy": 0.9872718393802643, + "num_tokens": 5437422.0, + "step": 2550 + }, + { + "entropy": 1.9231997847557067, + "epoch": 0.8410138248847926, + "grad_norm": 1.5836360454559326, + "learning_rate": 1.3392909339771957e-05, + "loss": 0.0687, + "mean_token_accuracy": 0.9836991250514984, + "num_tokens": 5447835.0, + "step": 2555 + }, + { + "entropy": 2.006228494644165, + "epoch": 0.8426596445029625, + "grad_norm": 2.372229814529419, + "learning_rate": 1.3367297032107404e-05, + "loss": 0.0748, + "mean_token_accuracy": 0.9789046466350555, + "num_tokens": 5458430.0, + "step": 2560 + }, + { + "entropy": 2.013643407821655, + "epoch": 0.8443054641211323, + "grad_norm": 2.6997487545013428, + "learning_rate": 1.33416597860611e-05, + "loss": 0.0693, + "mean_token_accuracy": 0.9819782853126526, + "num_tokens": 5469093.0, + "step": 2565 + }, + { + "entropy": 2.086654710769653, + "epoch": 0.8459512837393022, + "grad_norm": 5.783359050750732, + "learning_rate": 1.331599779150386e-05, + "loss": 0.0602, + "mean_token_accuracy": 0.9840903103351593, + "num_tokens": 5479883.0, + "step": 2570 + }, + { + "entropy": 1.9687761425971986, + "epoch": 0.8475971033574721, + "grad_norm": 1.4621000289916992, + "learning_rate": 1.3290311238489784e-05, + "loss": 0.037, + "mean_token_accuracy": 0.9909431576728821, + "num_tokens": 5490442.0, + "step": 2575 + }, + { + "entropy": 1.9323675155639648, + "epoch": 0.8492429229756419, + "grad_norm": 3.7601616382598877, + "learning_rate": 1.3264600317254854e-05, + "loss": 0.0535, + "mean_token_accuracy": 0.9863576173782349, + "num_tokens": 5501192.0, + "step": 2580 + }, + { + "entropy": 1.993829894065857, + "epoch": 0.8508887425938118, + "grad_norm": 2.846726655960083, + "learning_rate": 1.3238865218215535e-05, + "loss": 0.0735, + "mean_token_accuracy": 0.9818755805492401, + "num_tokens": 5511779.0, + "step": 2585 + }, + { + "entropy": 1.9815747618675232, + "epoch": 0.8525345622119815, + "grad_norm": 2.043933868408203, + "learning_rate": 1.3213106131967339e-05, + "loss": 0.0394, + "mean_token_accuracy": 0.9880852580070496, + "num_tokens": 5522208.0, + "step": 2590 + }, + { + "entropy": 1.9507970690727234, + "epoch": 0.8541803818301514, + "grad_norm": 3.3365821838378906, + "learning_rate": 1.3187323249283439e-05, + "loss": 0.0449, + "mean_token_accuracy": 0.987623393535614, + "num_tokens": 5532774.0, + "step": 2595 + }, + { + "entropy": 1.8794512629508973, + "epoch": 0.8558262014483212, + "grad_norm": 4.568225860595703, + "learning_rate": 1.316151676111324e-05, + "loss": 0.047, + "mean_token_accuracy": 0.9890295684337616, + "num_tokens": 5543614.0, + "step": 2600 + }, + { + "entropy": 1.8917754292488098, + "epoch": 0.8574720210664911, + "grad_norm": 1.8697177171707153, + "learning_rate": 1.313568685858097e-05, + "loss": 0.0604, + "mean_token_accuracy": 0.9812738180160523, + "num_tokens": 5554191.0, + "step": 2605 + }, + { + "entropy": 1.8788981676101684, + "epoch": 0.859117840684661, + "grad_norm": 2.4049696922302246, + "learning_rate": 1.3109833732984272e-05, + "loss": 0.0368, + "mean_token_accuracy": 0.9879702389240265, + "num_tokens": 5564836.0, + "step": 2610 + }, + { + "entropy": 1.8883351802825927, + "epoch": 0.8607636603028308, + "grad_norm": 3.053053379058838, + "learning_rate": 1.3083957575792772e-05, + "loss": 0.0608, + "mean_token_accuracy": 0.9815441846847535, + "num_tokens": 5575491.0, + "step": 2615 + }, + { + "entropy": 1.8524894714355469, + "epoch": 0.8624094799210007, + "grad_norm": 2.5347635746002197, + "learning_rate": 1.3058058578646673e-05, + "loss": 0.0676, + "mean_token_accuracy": 0.9812525391578675, + "num_tokens": 5586127.0, + "step": 2620 + }, + { + "entropy": 1.9090933799743652, + "epoch": 0.8640552995391705, + "grad_norm": 4.13297176361084, + "learning_rate": 1.3032136933355336e-05, + "loss": 0.0675, + "mean_token_accuracy": 0.9843396723270417, + "num_tokens": 5596745.0, + "step": 2625 + }, + { + "entropy": 1.8861599802970885, + "epoch": 0.8657011191573404, + "grad_norm": 3.5086593627929688, + "learning_rate": 1.3006192831895846e-05, + "loss": 0.0521, + "mean_token_accuracy": 0.9839343369007111, + "num_tokens": 5607882.0, + "step": 2630 + }, + { + "entropy": 1.9625224947929383, + "epoch": 0.8673469387755102, + "grad_norm": 2.0559775829315186, + "learning_rate": 1.2980226466411605e-05, + "loss": 0.06, + "mean_token_accuracy": 0.9822891354560852, + "num_tokens": 5618184.0, + "step": 2635 + }, + { + "entropy": 1.8167145729064942, + "epoch": 0.8689927583936801, + "grad_norm": 11.442062377929688, + "learning_rate": 1.2954238029210906e-05, + "loss": 0.0468, + "mean_token_accuracy": 0.9872013688087463, + "num_tokens": 5628846.0, + "step": 2640 + }, + { + "entropy": 1.7704446077346803, + "epoch": 0.8706385780118499, + "grad_norm": 3.554111957550049, + "learning_rate": 1.2928227712765504e-05, + "loss": 0.0784, + "mean_token_accuracy": 0.9803166925907135, + "num_tokens": 5639683.0, + "step": 2645 + }, + { + "entropy": 1.746169638633728, + "epoch": 0.8722843976300197, + "grad_norm": 4.623286247253418, + "learning_rate": 1.290219570970919e-05, + "loss": 0.0622, + "mean_token_accuracy": 0.9823844611644745, + "num_tokens": 5650210.0, + "step": 2650 + }, + { + "entropy": 1.671170949935913, + "epoch": 0.8739302172481896, + "grad_norm": 3.731656551361084, + "learning_rate": 1.2876142212836373e-05, + "loss": 0.0572, + "mean_token_accuracy": 0.9831017911434173, + "num_tokens": 5660768.0, + "step": 2655 + }, + { + "entropy": 1.7002404808998108, + "epoch": 0.8755760368663594, + "grad_norm": 3.3224618434906006, + "learning_rate": 1.2850067415100643e-05, + "loss": 0.0796, + "mean_token_accuracy": 0.9770964682102203, + "num_tokens": 5671563.0, + "step": 2660 + }, + { + "entropy": 1.7098248720169067, + "epoch": 0.8772218564845293, + "grad_norm": 4.906748294830322, + "learning_rate": 1.2823971509613338e-05, + "loss": 0.0776, + "mean_token_accuracy": 0.9807847082614899, + "num_tokens": 5682040.0, + "step": 2665 + }, + { + "entropy": 1.8245792269706727, + "epoch": 0.8788676761026991, + "grad_norm": 3.9868807792663574, + "learning_rate": 1.2797854689642136e-05, + "loss": 0.074, + "mean_token_accuracy": 0.9827134788036347, + "num_tokens": 5692607.0, + "step": 2670 + }, + { + "entropy": 1.8712535619735717, + "epoch": 0.880513495720869, + "grad_norm": 2.729797601699829, + "learning_rate": 1.2771717148609598e-05, + "loss": 0.0657, + "mean_token_accuracy": 0.9844698071479797, + "num_tokens": 5703254.0, + "step": 2675 + }, + { + "entropy": 1.8171938061714172, + "epoch": 0.8821593153390388, + "grad_norm": 2.3974361419677734, + "learning_rate": 1.2745559080091749e-05, + "loss": 0.0492, + "mean_token_accuracy": 0.98358274102211, + "num_tokens": 5713800.0, + "step": 2680 + }, + { + "entropy": 1.7879636526107787, + "epoch": 0.8838051349572087, + "grad_norm": 0.965469479560852, + "learning_rate": 1.2719380677816648e-05, + "loss": 0.0679, + "mean_token_accuracy": 0.9823775053024292, + "num_tokens": 5724150.0, + "step": 2685 + }, + { + "entropy": 1.752970778942108, + "epoch": 0.8854509545753786, + "grad_norm": 3.1124043464660645, + "learning_rate": 1.2693182135662933e-05, + "loss": 0.0431, + "mean_token_accuracy": 0.9858216881752014, + "num_tokens": 5734916.0, + "step": 2690 + }, + { + "entropy": 1.8105898737907409, + "epoch": 0.8870967741935484, + "grad_norm": 1.5042496919631958, + "learning_rate": 1.2666963647658413e-05, + "loss": 0.0514, + "mean_token_accuracy": 0.9884253561496734, + "num_tokens": 5745344.0, + "step": 2695 + }, + { + "entropy": 1.7806416988372802, + "epoch": 0.8887425938117183, + "grad_norm": 2.657801389694214, + "learning_rate": 1.2640725407978607e-05, + "loss": 0.0344, + "mean_token_accuracy": 0.9910404562950135, + "num_tokens": 5755871.0, + "step": 2700 + }, + { + "entropy": 1.7876773238182069, + "epoch": 0.8903884134298881, + "grad_norm": 2.7087864875793457, + "learning_rate": 1.2614467610945323e-05, + "loss": 0.0704, + "mean_token_accuracy": 0.9819835543632507, + "num_tokens": 5766351.0, + "step": 2705 + }, + { + "entropy": 1.7520806312561035, + "epoch": 0.892034233048058, + "grad_norm": 2.7425425052642822, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.0391, + "mean_token_accuracy": 0.9887621998786926, + "num_tokens": 5776778.0, + "step": 2710 + }, + { + "entropy": 1.7028562068939208, + "epoch": 0.8936800526662277, + "grad_norm": 1.6712589263916016, + "learning_rate": 1.2561894122828315e-05, + "loss": 0.0588, + "mean_token_accuracy": 0.9863285422325134, + "num_tokens": 5787425.0, + "step": 2715 + }, + { + "entropy": 1.6849963307380675, + "epoch": 0.8953258722843976, + "grad_norm": 4.329936504364014, + "learning_rate": 1.2535578821106648e-05, + "loss": 0.0553, + "mean_token_accuracy": 0.984381890296936, + "num_tokens": 5798063.0, + "step": 2720 + }, + { + "entropy": 1.7037844061851501, + "epoch": 0.8969716919025674, + "grad_norm": 2.813596248626709, + "learning_rate": 1.2509244740752748e-05, + "loss": 0.0612, + "mean_token_accuracy": 0.9847327530384063, + "num_tokens": 5808513.0, + "step": 2725 + }, + { + "entropy": 1.763570249080658, + "epoch": 0.8986175115207373, + "grad_norm": 5.475368976593018, + "learning_rate": 1.2482892076798216e-05, + "loss": 0.0659, + "mean_token_accuracy": 0.9854476451873779, + "num_tokens": 5819032.0, + "step": 2730 + }, + { + "entropy": 1.8239656686782837, + "epoch": 0.9002633311389072, + "grad_norm": 2.5298304557800293, + "learning_rate": 1.2456521024412287e-05, + "loss": 0.0817, + "mean_token_accuracy": 0.983061420917511, + "num_tokens": 5829661.0, + "step": 2735 + }, + { + "entropy": 1.9393372416496277, + "epoch": 0.901909150757077, + "grad_norm": 2.5725841522216797, + "learning_rate": 1.243013177890039e-05, + "loss": 0.0466, + "mean_token_accuracy": 0.987521517276764, + "num_tokens": 5840288.0, + "step": 2740 + }, + { + "entropy": 2.0387110352516173, + "epoch": 0.9035549703752469, + "grad_norm": 2.8396382331848145, + "learning_rate": 1.2403724535702679e-05, + "loss": 0.1089, + "mean_token_accuracy": 0.9775011360645294, + "num_tokens": 5850912.0, + "step": 2745 + }, + { + "entropy": 2.0455712914466857, + "epoch": 0.9052007899934167, + "grad_norm": 3.0769193172454834, + "learning_rate": 1.2377299490392618e-05, + "loss": 0.0575, + "mean_token_accuracy": 0.9838365972042084, + "num_tokens": 5861530.0, + "step": 2750 + }, + { + "entropy": 2.0284469962120055, + "epoch": 0.9068466096115866, + "grad_norm": 3.171985626220703, + "learning_rate": 1.23508568386755e-05, + "loss": 0.0433, + "mean_token_accuracy": 0.986922162771225, + "num_tokens": 5872294.0, + "step": 2755 + }, + { + "entropy": 2.027714431285858, + "epoch": 0.9084924292297564, + "grad_norm": 1.4176881313323975, + "learning_rate": 1.2324396776387014e-05, + "loss": 0.0582, + "mean_token_accuracy": 0.9820313692092896, + "num_tokens": 5882926.0, + "step": 2760 + }, + { + "entropy": 1.9616046786308288, + "epoch": 0.9101382488479263, + "grad_norm": 1.9177738428115845, + "learning_rate": 1.2297919499491797e-05, + "loss": 0.0621, + "mean_token_accuracy": 0.9839075744152069, + "num_tokens": 5893515.0, + "step": 2765 + }, + { + "entropy": 1.8943446040153504, + "epoch": 0.9117840684660962, + "grad_norm": 3.43756103515625, + "learning_rate": 1.2271425204081981e-05, + "loss": 0.0688, + "mean_token_accuracy": 0.9814131796360016, + "num_tokens": 5904107.0, + "step": 2770 + }, + { + "entropy": 1.8612539172172546, + "epoch": 0.913429888084266, + "grad_norm": 2.007723808288574, + "learning_rate": 1.2244914086375726e-05, + "loss": 0.0566, + "mean_token_accuracy": 0.9836967706680297, + "num_tokens": 5914642.0, + "step": 2775 + }, + { + "entropy": 1.8655507802963256, + "epoch": 0.9150757077024358, + "grad_norm": 2.231388568878174, + "learning_rate": 1.2218386342715793e-05, + "loss": 0.0776, + "mean_token_accuracy": 0.9767457246780396, + "num_tokens": 5925381.0, + "step": 2780 + }, + { + "entropy": 1.9478851079940795, + "epoch": 0.9167215273206056, + "grad_norm": 3.511413335800171, + "learning_rate": 1.2191842169568067e-05, + "loss": 0.0627, + "mean_token_accuracy": 0.9826519787311554, + "num_tokens": 5936411.0, + "step": 2785 + }, + { + "entropy": 1.9535033702850342, + "epoch": 0.9183673469387755, + "grad_norm": 1.3818976879119873, + "learning_rate": 1.2165281763520106e-05, + "loss": 0.0284, + "mean_token_accuracy": 0.9931237936019898, + "num_tokens": 5947401.0, + "step": 2790 + }, + { + "entropy": 2.046694076061249, + "epoch": 0.9200131665569453, + "grad_norm": 2.708840847015381, + "learning_rate": 1.2138705321279709e-05, + "loss": 0.0357, + "mean_token_accuracy": 0.9874314427375793, + "num_tokens": 5957977.0, + "step": 2795 + }, + { + "entropy": 1.9919468641281128, + "epoch": 0.9216589861751152, + "grad_norm": 3.4317140579223633, + "learning_rate": 1.2112113039673418e-05, + "loss": 0.0491, + "mean_token_accuracy": 0.9871409773826599, + "num_tokens": 5968581.0, + "step": 2800 + }, + { + "entropy": 2.0010817289352416, + "epoch": 0.9233048057932851, + "grad_norm": 3.1390058994293213, + "learning_rate": 1.2085505115645095e-05, + "loss": 0.0463, + "mean_token_accuracy": 0.9875739812850952, + "num_tokens": 5979403.0, + "step": 2805 + }, + { + "entropy": 1.9707128882408143, + "epoch": 0.9249506254114549, + "grad_norm": 2.6084043979644775, + "learning_rate": 1.2058881746254447e-05, + "loss": 0.0593, + "mean_token_accuracy": 0.9853511095046997, + "num_tokens": 5990059.0, + "step": 2810 + }, + { + "entropy": 2.035856246948242, + "epoch": 0.9265964450296248, + "grad_norm": 1.959696888923645, + "learning_rate": 1.203224312867557e-05, + "loss": 0.0434, + "mean_token_accuracy": 0.9854652583599091, + "num_tokens": 6000522.0, + "step": 2815 + }, + { + "entropy": 2.0164458394050597, + "epoch": 0.9282422646477946, + "grad_norm": 5.868241786956787, + "learning_rate": 1.2005589460195486e-05, + "loss": 0.0773, + "mean_token_accuracy": 0.9825805127620697, + "num_tokens": 6011268.0, + "step": 2820 + }, + { + "entropy": 2.0600091218948364, + "epoch": 0.9298880842659645, + "grad_norm": 4.795652389526367, + "learning_rate": 1.1978920938212691e-05, + "loss": 0.0591, + "mean_token_accuracy": 0.984807425737381, + "num_tokens": 6022053.0, + "step": 2825 + }, + { + "entropy": 2.1145251750946046, + "epoch": 0.9315339038841343, + "grad_norm": 1.8055132627487183, + "learning_rate": 1.1952237760235686e-05, + "loss": 0.0611, + "mean_token_accuracy": 0.982151460647583, + "num_tokens": 6032622.0, + "step": 2830 + }, + { + "entropy": 2.1461195945739746, + "epoch": 0.9331797235023042, + "grad_norm": 3.280885934829712, + "learning_rate": 1.192554012388151e-05, + "loss": 0.0711, + "mean_token_accuracy": 0.9854592263698578, + "num_tokens": 6043224.0, + "step": 2835 + }, + { + "entropy": 2.1928713798522947, + "epoch": 0.934825543120474, + "grad_norm": 2.116999864578247, + "learning_rate": 1.1898828226874284e-05, + "loss": 0.0536, + "mean_token_accuracy": 0.9854759395122528, + "num_tokens": 6053910.0, + "step": 2840 + }, + { + "entropy": 2.153935670852661, + "epoch": 0.9364713627386438, + "grad_norm": 3.3620643615722656, + "learning_rate": 1.1872102267043748e-05, + "loss": 0.0525, + "mean_token_accuracy": 0.9844566106796264, + "num_tokens": 6064731.0, + "step": 2845 + }, + { + "entropy": 2.120525050163269, + "epoch": 0.9381171823568137, + "grad_norm": 2.1633312702178955, + "learning_rate": 1.1845362442323784e-05, + "loss": 0.0356, + "mean_token_accuracy": 0.9903881013393402, + "num_tokens": 6075595.0, + "step": 2850 + }, + { + "entropy": 2.168933653831482, + "epoch": 0.9397630019749835, + "grad_norm": 2.078237771987915, + "learning_rate": 1.1818608950750967e-05, + "loss": 0.045, + "mean_token_accuracy": 0.9854483067989349, + "num_tokens": 6085983.0, + "step": 2855 + }, + { + "entropy": 2.104006254673004, + "epoch": 0.9414088215931534, + "grad_norm": 1.4718915224075317, + "learning_rate": 1.1791841990463083e-05, + "loss": 0.0375, + "mean_token_accuracy": 0.9886513173580169, + "num_tokens": 6096996.0, + "step": 2860 + }, + { + "entropy": 2.048595929145813, + "epoch": 0.9430546412113232, + "grad_norm": 1.9670352935791016, + "learning_rate": 1.1765061759697669e-05, + "loss": 0.0316, + "mean_token_accuracy": 0.9901619255542755, + "num_tokens": 6107708.0, + "step": 2865 + }, + { + "entropy": 1.970208466053009, + "epoch": 0.9447004608294931, + "grad_norm": 5.284751892089844, + "learning_rate": 1.1738268456790548e-05, + "loss": 0.0672, + "mean_token_accuracy": 0.981162142753601, + "num_tokens": 6118639.0, + "step": 2870 + }, + { + "entropy": 1.8950253486633302, + "epoch": 0.9463462804476629, + "grad_norm": 4.18200159072876, + "learning_rate": 1.171146228017435e-05, + "loss": 0.0414, + "mean_token_accuracy": 0.9900772452354432, + "num_tokens": 6129781.0, + "step": 2875 + }, + { + "entropy": 1.8563172101974488, + "epoch": 0.9479921000658328, + "grad_norm": 2.1970016956329346, + "learning_rate": 1.1684643428377056e-05, + "loss": 0.058, + "mean_token_accuracy": 0.9863704919815064, + "num_tokens": 6140682.0, + "step": 2880 + }, + { + "entropy": 1.8867647051811218, + "epoch": 0.9496379196840027, + "grad_norm": 1.9754729270935059, + "learning_rate": 1.1657812100020507e-05, + "loss": 0.0588, + "mean_token_accuracy": 0.9856977164745331, + "num_tokens": 6151093.0, + "step": 2885 + }, + { + "entropy": 1.875185477733612, + "epoch": 0.9512837393021725, + "grad_norm": 4.4464592933654785, + "learning_rate": 1.1630968493818961e-05, + "loss": 0.062, + "mean_token_accuracy": 0.9841465830802918, + "num_tokens": 6161567.0, + "step": 2890 + }, + { + "entropy": 1.8699720621109008, + "epoch": 0.9529295589203424, + "grad_norm": 2.7016501426696777, + "learning_rate": 1.1604112808577603e-05, + "loss": 0.0727, + "mean_token_accuracy": 0.9823096752166748, + "num_tokens": 6172041.0, + "step": 2895 + }, + { + "entropy": 1.9755351066589355, + "epoch": 0.9545753785385122, + "grad_norm": 14.661170959472656, + "learning_rate": 1.1577245243191068e-05, + "loss": 0.0641, + "mean_token_accuracy": 0.979709017276764, + "num_tokens": 6182650.0, + "step": 2900 + }, + { + "entropy": 2.0633350372314454, + "epoch": 0.956221198156682, + "grad_norm": 3.1461470127105713, + "learning_rate": 1.155036599664198e-05, + "loss": 0.0693, + "mean_token_accuracy": 0.9809099555015564, + "num_tokens": 6193204.0, + "step": 2905 + }, + { + "entropy": 2.077620244026184, + "epoch": 0.9578670177748518, + "grad_norm": 1.7489817142486572, + "learning_rate": 1.1523475267999477e-05, + "loss": 0.0777, + "mean_token_accuracy": 0.9819590091705322, + "num_tokens": 6203825.0, + "step": 2910 + }, + { + "entropy": 2.087475097179413, + "epoch": 0.9595128373930217, + "grad_norm": 3.104898691177368, + "learning_rate": 1.1496573256417733e-05, + "loss": 0.0437, + "mean_token_accuracy": 0.9875694811344147, + "num_tokens": 6214779.0, + "step": 2915 + }, + { + "entropy": 2.038085675239563, + "epoch": 0.9611586570111915, + "grad_norm": 3.818824052810669, + "learning_rate": 1.1469660161134481e-05, + "loss": 0.0563, + "mean_token_accuracy": 0.9853502690792084, + "num_tokens": 6225540.0, + "step": 2920 + }, + { + "entropy": 1.9863147258758544, + "epoch": 0.9628044766293614, + "grad_norm": 3.5189995765686035, + "learning_rate": 1.1442736181469546e-05, + "loss": 0.0534, + "mean_token_accuracy": 0.9900292754173279, + "num_tokens": 6236269.0, + "step": 2925 + }, + { + "entropy": 1.8946941256523133, + "epoch": 0.9644502962475313, + "grad_norm": 1.297648549079895, + "learning_rate": 1.1415801516823358e-05, + "loss": 0.0523, + "mean_token_accuracy": 0.9893131017684936, + "num_tokens": 6247052.0, + "step": 2930 + }, + { + "entropy": 1.9294702768325807, + "epoch": 0.9660961158657011, + "grad_norm": 2.645850419998169, + "learning_rate": 1.1388856366675482e-05, + "loss": 0.0705, + "mean_token_accuracy": 0.9818682849407196, + "num_tokens": 6257581.0, + "step": 2935 + }, + { + "entropy": 1.9064095735549926, + "epoch": 0.967741935483871, + "grad_norm": 2.203904151916504, + "learning_rate": 1.1361900930583143e-05, + "loss": 0.0589, + "mean_token_accuracy": 0.9860333859920501, + "num_tokens": 6268381.0, + "step": 2940 + }, + { + "entropy": 1.8867696881294251, + "epoch": 0.9693877551020408, + "grad_norm": 4.154993534088135, + "learning_rate": 1.1334935408179736e-05, + "loss": 0.0495, + "mean_token_accuracy": 0.9842896819114685, + "num_tokens": 6279423.0, + "step": 2945 + }, + { + "entropy": 1.8813761711120605, + "epoch": 0.9710335747202107, + "grad_norm": 1.7893234491348267, + "learning_rate": 1.1307959999173362e-05, + "loss": 0.0524, + "mean_token_accuracy": 0.9866505920886993, + "num_tokens": 6290068.0, + "step": 2950 + }, + { + "entropy": 1.915200400352478, + "epoch": 0.9726793943383805, + "grad_norm": 2.213712692260742, + "learning_rate": 1.1280974903345347e-05, + "loss": 0.064, + "mean_token_accuracy": 0.9851447463035583, + "num_tokens": 6300817.0, + "step": 2955 + }, + { + "entropy": 1.9231054186820984, + "epoch": 0.9743252139565504, + "grad_norm": 1.9981787204742432, + "learning_rate": 1.1253980320548746e-05, + "loss": 0.0434, + "mean_token_accuracy": 0.9863059282302856, + "num_tokens": 6311420.0, + "step": 2960 + }, + { + "entropy": 1.9738001823425293, + "epoch": 0.9759710335747203, + "grad_norm": 0.5440634489059448, + "learning_rate": 1.1226976450706887e-05, + "loss": 0.041, + "mean_token_accuracy": 0.9861558496952056, + "num_tokens": 6321771.0, + "step": 2965 + }, + { + "entropy": 1.9686074256896973, + "epoch": 0.97761685319289, + "grad_norm": 5.0454936027526855, + "learning_rate": 1.119996349381187e-05, + "loss": 0.0772, + "mean_token_accuracy": 0.9804552078247071, + "num_tokens": 6332139.0, + "step": 2970 + }, + { + "entropy": 1.9591160893440247, + "epoch": 0.9792626728110599, + "grad_norm": 1.6458666324615479, + "learning_rate": 1.1172941649923096e-05, + "loss": 0.0463, + "mean_token_accuracy": 0.9881701529026031, + "num_tokens": 6342546.0, + "step": 2975 + }, + { + "entropy": 2.0319183111190795, + "epoch": 0.9809084924292297, + "grad_norm": 2.887859582901001, + "learning_rate": 1.1145911119165789e-05, + "loss": 0.0553, + "mean_token_accuracy": 0.9828070223331451, + "num_tokens": 6353442.0, + "step": 2980 + }, + { + "entropy": 2.058086669445038, + "epoch": 0.9825543120473996, + "grad_norm": 0.9121004343032837, + "learning_rate": 1.11188721017295e-05, + "loss": 0.0366, + "mean_token_accuracy": 0.9888731300830841, + "num_tokens": 6363972.0, + "step": 2985 + }, + { + "entropy": 1.9924060463905335, + "epoch": 0.9842001316655694, + "grad_norm": 1.7889047861099243, + "learning_rate": 1.1091824797866639e-05, + "loss": 0.0469, + "mean_token_accuracy": 0.9876609027385712, + "num_tokens": 6374648.0, + "step": 2990 + }, + { + "entropy": 1.9006078600883485, + "epoch": 0.9858459512837393, + "grad_norm": 3.3813469409942627, + "learning_rate": 1.1064769407890986e-05, + "loss": 0.0228, + "mean_token_accuracy": 0.9935931503772736, + "num_tokens": 6385573.0, + "step": 2995 + }, + { + "entropy": 1.8375752925872804, + "epoch": 0.9874917709019092, + "grad_norm": 4.215015888214111, + "learning_rate": 1.1037706132176197e-05, + "loss": 0.0313, + "mean_token_accuracy": 0.9893752455711364, + "num_tokens": 6396486.0, + "step": 3000 + }, + { + "entropy": 1.7838311195373535, + "epoch": 0.989137590520079, + "grad_norm": 1.322409749031067, + "learning_rate": 1.1010635171154342e-05, + "loss": 0.071, + "mean_token_accuracy": 0.9886986196041108, + "num_tokens": 6407411.0, + "step": 3005 + }, + { + "entropy": 1.7372970938682557, + "epoch": 0.9907834101382489, + "grad_norm": 2.0234224796295166, + "learning_rate": 1.098355672531441e-05, + "loss": 0.0549, + "mean_token_accuracy": 0.9890697836875916, + "num_tokens": 6418480.0, + "step": 3010 + }, + { + "entropy": 1.8203161001205443, + "epoch": 0.9924292297564187, + "grad_norm": 4.083996295928955, + "learning_rate": 1.0956470995200816e-05, + "loss": 0.0559, + "mean_token_accuracy": 0.9861119568347931, + "num_tokens": 6429376.0, + "step": 3015 + }, + { + "entropy": 1.8980080366134644, + "epoch": 0.9940750493745886, + "grad_norm": 2.3635177612304688, + "learning_rate": 1.0929378181411918e-05, + "loss": 0.0407, + "mean_token_accuracy": 0.9869163811206818, + "num_tokens": 6440084.0, + "step": 3020 + }, + { + "entropy": 1.9592399954795838, + "epoch": 0.9957208689927584, + "grad_norm": 1.469984769821167, + "learning_rate": 1.0902278484598549e-05, + "loss": 0.046, + "mean_token_accuracy": 0.9860125243663788, + "num_tokens": 6450632.0, + "step": 3025 + }, + { + "entropy": 1.943799901008606, + "epoch": 0.9973666886109283, + "grad_norm": 2.726168394088745, + "learning_rate": 1.0875172105462513e-05, + "loss": 0.0617, + "mean_token_accuracy": 0.9823821485042572, + "num_tokens": 6461324.0, + "step": 3030 + }, + { + "entropy": 1.979371726512909, + "epoch": 0.999012508229098, + "grad_norm": 3.588101625442505, + "learning_rate": 1.0848059244755093e-05, + "loss": 0.04, + "mean_token_accuracy": 0.9917591452598572, + "num_tokens": 6472058.0, + "step": 3035 + }, + { + "entropy": 2.0093634843826296, + "epoch": 1.000658327847268, + "grad_norm": 1.96505868434906, + "learning_rate": 1.0820940103275594e-05, + "loss": 0.0413, + "mean_token_accuracy": 0.9876884579658508, + "num_tokens": 6482261.0, + "step": 3040 + }, + { + "epoch": 1.000658327847268, + "eval_entropy": 2.00399693047966, + "eval_loss": 0.05508637800812721, + "eval_mean_token_accuracy": 0.9851879490383764, + "eval_num_tokens": 6482261.0, + "eval_runtime": 196.5531, + "eval_samples_per_second": 42.365, + "eval_steps_per_second": 7.062, + "step": 3040 + }, + { + "entropy": 2.0180766105651857, + "epoch": 1.0023041474654377, + "grad_norm": 1.1969290971755981, + "learning_rate": 1.079381488186982e-05, + "loss": 0.0215, + "mean_token_accuracy": 0.9949158251285553, + "num_tokens": 6492816.0, + "step": 3045 + }, + { + "entropy": 1.9490231990814209, + "epoch": 1.0039499670836076, + "grad_norm": 0.8785367012023926, + "learning_rate": 1.0766683781428617e-05, + "loss": 0.0451, + "mean_token_accuracy": 0.9905852675437927, + "num_tokens": 6503215.0, + "step": 3050 + }, + { + "entropy": 1.9013155460357667, + "epoch": 1.0055957867017775, + "grad_norm": 2.4878368377685547, + "learning_rate": 1.0739547002886361e-05, + "loss": 0.0177, + "mean_token_accuracy": 0.9950673639774322, + "num_tokens": 6514095.0, + "step": 3055 + }, + { + "entropy": 1.8226952791213988, + "epoch": 1.0072416063199474, + "grad_norm": 1.9651074409484863, + "learning_rate": 1.0712404747219481e-05, + "loss": 0.0253, + "mean_token_accuracy": 0.9917102456092834, + "num_tokens": 6525039.0, + "step": 3060 + }, + { + "entropy": 1.7761475205421449, + "epoch": 1.008887425938117, + "grad_norm": 0.24293629825115204, + "learning_rate": 1.0685257215444975e-05, + "loss": 0.0185, + "mean_token_accuracy": 0.9937527477741241, + "num_tokens": 6535691.0, + "step": 3065 + }, + { + "entropy": 1.7371309757232667, + "epoch": 1.010533245556287, + "grad_norm": 1.073920726776123, + "learning_rate": 1.0658104608618917e-05, + "loss": 0.0273, + "mean_token_accuracy": 0.9902878165245056, + "num_tokens": 6546449.0, + "step": 3070 + }, + { + "entropy": 1.7515403628349304, + "epoch": 1.0121790651744569, + "grad_norm": 1.7505379915237427, + "learning_rate": 1.0630947127834962e-05, + "loss": 0.0213, + "mean_token_accuracy": 0.9931905210018158, + "num_tokens": 6557347.0, + "step": 3075 + }, + { + "entropy": 1.723526620864868, + "epoch": 1.0138248847926268, + "grad_norm": 3.507692575454712, + "learning_rate": 1.0603784974222862e-05, + "loss": 0.0429, + "mean_token_accuracy": 0.9902272820472717, + "num_tokens": 6568263.0, + "step": 3080 + }, + { + "entropy": 1.7295622825622559, + "epoch": 1.0154707044107967, + "grad_norm": 1.3187499046325684, + "learning_rate": 1.0576618348946982e-05, + "loss": 0.0139, + "mean_token_accuracy": 0.9940691888332367, + "num_tokens": 6579159.0, + "step": 3085 + }, + { + "entropy": 1.7557840943336487, + "epoch": 1.0171165240289664, + "grad_norm": 0.35923171043395996, + "learning_rate": 1.0549447453204793e-05, + "loss": 0.0163, + "mean_token_accuracy": 0.9933087825775146, + "num_tokens": 6589930.0, + "step": 3090 + }, + { + "entropy": 1.7555920958518982, + "epoch": 1.0187623436471362, + "grad_norm": 2.311398983001709, + "learning_rate": 1.0522272488225411e-05, + "loss": 0.0226, + "mean_token_accuracy": 0.9911885142326355, + "num_tokens": 6600428.0, + "step": 3095 + }, + { + "entropy": 1.6874248266220093, + "epoch": 1.0204081632653061, + "grad_norm": 2.7232701778411865, + "learning_rate": 1.049509365526807e-05, + "loss": 0.0322, + "mean_token_accuracy": 0.9916168689727783, + "num_tokens": 6611008.0, + "step": 3100 + }, + { + "entropy": 1.6934003949165344, + "epoch": 1.022053982883476, + "grad_norm": 3.6890053749084473, + "learning_rate": 1.0467911155620664e-05, + "loss": 0.0252, + "mean_token_accuracy": 0.9931854546070099, + "num_tokens": 6621437.0, + "step": 3105 + }, + { + "entropy": 1.620060884952545, + "epoch": 1.0236998025016457, + "grad_norm": 1.3696199655532837, + "learning_rate": 1.044072519059824e-05, + "loss": 0.055, + "mean_token_accuracy": 0.9850696086883545, + "num_tokens": 6632746.0, + "step": 3110 + }, + { + "entropy": 1.725310730934143, + "epoch": 1.0253456221198156, + "grad_norm": 0.7018516659736633, + "learning_rate": 1.0413535961541499e-05, + "loss": 0.0239, + "mean_token_accuracy": 0.9919134438037872, + "num_tokens": 6643199.0, + "step": 3115 + }, + { + "entropy": 1.6991363763809204, + "epoch": 1.0269914417379855, + "grad_norm": 0.6958704590797424, + "learning_rate": 1.0386343669815333e-05, + "loss": 0.0304, + "mean_token_accuracy": 0.991345739364624, + "num_tokens": 6653990.0, + "step": 3120 + }, + { + "entropy": 1.7753076672554016, + "epoch": 1.0286372613561554, + "grad_norm": 3.0339064598083496, + "learning_rate": 1.0359148516807302e-05, + "loss": 0.0442, + "mean_token_accuracy": 0.9869567334651947, + "num_tokens": 6664947.0, + "step": 3125 + }, + { + "entropy": 1.7934301972389222, + "epoch": 1.0302830809743253, + "grad_norm": 3.500246047973633, + "learning_rate": 1.0331950703926165e-05, + "loss": 0.0589, + "mean_token_accuracy": 0.9841761589050293, + "num_tokens": 6675621.0, + "step": 3130 + }, + { + "entropy": 1.8426469326019288, + "epoch": 1.031928900592495, + "grad_norm": 1.5313292741775513, + "learning_rate": 1.0304750432600377e-05, + "loss": 0.0261, + "mean_token_accuracy": 0.9919860601425171, + "num_tokens": 6686232.0, + "step": 3135 + }, + { + "entropy": 1.8473934888839723, + "epoch": 1.0335747202106649, + "grad_norm": 2.6027140617370605, + "learning_rate": 1.02775479042766e-05, + "loss": 0.0334, + "mean_token_accuracy": 0.9884464621543885, + "num_tokens": 6696624.0, + "step": 3140 + }, + { + "entropy": 1.8324166655540466, + "epoch": 1.0352205398288348, + "grad_norm": 0.43423017859458923, + "learning_rate": 1.0250343320418215e-05, + "loss": 0.0232, + "mean_token_accuracy": 0.9935197830200195, + "num_tokens": 6707078.0, + "step": 3145 + }, + { + "entropy": 1.8373860239982605, + "epoch": 1.0368663594470047, + "grad_norm": 3.9265055656433105, + "learning_rate": 1.0223136882503821e-05, + "loss": 0.0292, + "mean_token_accuracy": 0.9914808392524719, + "num_tokens": 6717515.0, + "step": 3150 + }, + { + "entropy": 1.850171196460724, + "epoch": 1.0385121790651746, + "grad_norm": 1.000176191329956, + "learning_rate": 1.0195928792025754e-05, + "loss": 0.0236, + "mean_token_accuracy": 0.9916164934635162, + "num_tokens": 6727947.0, + "step": 3155 + }, + { + "entropy": 1.8611099600791932, + "epoch": 1.0401579986833442, + "grad_norm": 0.9246273636817932, + "learning_rate": 1.016871925048858e-05, + "loss": 0.0208, + "mean_token_accuracy": 0.9946603596210479, + "num_tokens": 6738673.0, + "step": 3160 + }, + { + "entropy": 1.8230043292045592, + "epoch": 1.0418038183015141, + "grad_norm": 2.162306785583496, + "learning_rate": 1.0141508459407622e-05, + "loss": 0.0322, + "mean_token_accuracy": 0.9894011855125427, + "num_tokens": 6749351.0, + "step": 3165 + }, + { + "entropy": 1.810911202430725, + "epoch": 1.043449637919684, + "grad_norm": 0.9168899655342102, + "learning_rate": 1.0114296620307455e-05, + "loss": 0.0227, + "mean_token_accuracy": 0.992879319190979, + "num_tokens": 6760013.0, + "step": 3170 + }, + { + "entropy": 1.7807167649269104, + "epoch": 1.045095457537854, + "grad_norm": 3.3147082328796387, + "learning_rate": 1.0087083934720407e-05, + "loss": 0.0245, + "mean_token_accuracy": 0.9914108633995056, + "num_tokens": 6770834.0, + "step": 3175 + }, + { + "entropy": 1.7569178104400636, + "epoch": 1.0467412771560236, + "grad_norm": 0.19522562623023987, + "learning_rate": 1.0059870604185087e-05, + "loss": 0.0244, + "mean_token_accuracy": 0.9945703566074371, + "num_tokens": 6781679.0, + "step": 3180 + }, + { + "entropy": 1.800702166557312, + "epoch": 1.0483870967741935, + "grad_norm": 0.2634085714817047, + "learning_rate": 1.003265683024487e-05, + "loss": 0.022, + "mean_token_accuracy": 0.9923857629299164, + "num_tokens": 6792206.0, + "step": 3185 + }, + { + "entropy": 1.78583163022995, + "epoch": 1.0500329163923634, + "grad_norm": 2.332631826400757, + "learning_rate": 1.0005442814446427e-05, + "loss": 0.0354, + "mean_token_accuracy": 0.9924177765846253, + "num_tokens": 6802785.0, + "step": 3190 + }, + { + "entropy": 1.7854429602622985, + "epoch": 1.0516787360105333, + "grad_norm": 3.857931613922119, + "learning_rate": 9.97822875833821e-06, + "loss": 0.0436, + "mean_token_accuracy": 0.9893366694450378, + "num_tokens": 6813246.0, + "step": 3195 + }, + { + "entropy": 1.7886322617530823, + "epoch": 1.0533245556287032, + "grad_norm": 0.9116196632385254, + "learning_rate": 9.951014863468971e-06, + "loss": 0.0308, + "mean_token_accuracy": 0.9903899192810058, + "num_tokens": 6824016.0, + "step": 3200 + }, + { + "entropy": 1.831316077709198, + "epoch": 1.0549703752468729, + "grad_norm": 4.522629737854004, + "learning_rate": 9.92380133138627e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9942168056964874, + "num_tokens": 6834519.0, + "step": 3205 + }, + { + "entropy": 1.8389117002487183, + "epoch": 1.0566161948650428, + "grad_norm": 3.8139638900756836, + "learning_rate": 9.896588363634983e-06, + "loss": 0.0348, + "mean_token_accuracy": 0.9891720294952393, + "num_tokens": 6845234.0, + "step": 3210 + }, + { + "entropy": 1.858761990070343, + "epoch": 1.0582620144832127, + "grad_norm": 1.8006324768066406, + "learning_rate": 9.869376161755797e-06, + "loss": 0.0308, + "mean_token_accuracy": 0.9904006004333497, + "num_tokens": 6855897.0, + "step": 3215 + }, + { + "entropy": 1.9025790572166443, + "epoch": 1.0599078341013826, + "grad_norm": 1.6231938600540161, + "learning_rate": 9.842164927283734e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9929196774959564, + "num_tokens": 6866586.0, + "step": 3220 + }, + { + "entropy": 1.8303877234458923, + "epoch": 1.0615536537195522, + "grad_norm": 2.040353298187256, + "learning_rate": 9.814954861746661e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9953079938888549, + "num_tokens": 6877250.0, + "step": 3225 + }, + { + "entropy": 1.799347722530365, + "epoch": 1.0631994733377221, + "grad_norm": 3.7308638095855713, + "learning_rate": 9.787746166663765e-06, + "loss": 0.0368, + "mean_token_accuracy": 0.9898776650428772, + "num_tokens": 6887880.0, + "step": 3230 + }, + { + "entropy": 1.7600205421447754, + "epoch": 1.064845292955892, + "grad_norm": 2.044337749481201, + "learning_rate": 9.760539043544105e-06, + "loss": 0.0464, + "mean_token_accuracy": 0.9879779398441315, + "num_tokens": 6898466.0, + "step": 3235 + }, + { + "entropy": 1.7838748335838317, + "epoch": 1.066491112574062, + "grad_norm": 0.9999270439147949, + "learning_rate": 9.733333693885078e-06, + "loss": 0.0282, + "mean_token_accuracy": 0.9914038121700287, + "num_tokens": 6909129.0, + "step": 3240 + }, + { + "entropy": 1.747081458568573, + "epoch": 1.0681369321922318, + "grad_norm": 0.998917281627655, + "learning_rate": 9.706130319170968e-06, + "loss": 0.0376, + "mean_token_accuracy": 0.9899788439273834, + "num_tokens": 6919953.0, + "step": 3245 + }, + { + "entropy": 1.8464920043945312, + "epoch": 1.0697827518104015, + "grad_norm": 0.4222843050956726, + "learning_rate": 9.678929120871414e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.994493979215622, + "num_tokens": 6930503.0, + "step": 3250 + }, + { + "entropy": 1.8613513350486754, + "epoch": 1.0714285714285714, + "grad_norm": 5.423548698425293, + "learning_rate": 9.651730300439954e-06, + "loss": 0.0657, + "mean_token_accuracy": 0.9828275620937348, + "num_tokens": 6941390.0, + "step": 3255 + }, + { + "entropy": 1.809139084815979, + "epoch": 1.0730743910467413, + "grad_norm": 1.3602713346481323, + "learning_rate": 9.62453405931249e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9926668286323548, + "num_tokens": 6952211.0, + "step": 3260 + }, + { + "entropy": 1.8179643034934998, + "epoch": 1.0747202106649112, + "grad_norm": 1.4231082201004028, + "learning_rate": 9.597340598905851e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9913124799728393, + "num_tokens": 6962822.0, + "step": 3265 + }, + { + "entropy": 1.7864566802978517, + "epoch": 1.076366030283081, + "grad_norm": 1.2587391138076782, + "learning_rate": 9.57015012061625e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.9924269318580627, + "num_tokens": 6973461.0, + "step": 3270 + }, + { + "entropy": 1.7307344913482665, + "epoch": 1.0780118499012508, + "grad_norm": 0.7557618021965027, + "learning_rate": 9.542962825817827e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9931135952472687, + "num_tokens": 6984212.0, + "step": 3275 + }, + { + "entropy": 1.7689846754074097, + "epoch": 1.0796576695194207, + "grad_norm": 3.2683939933776855, + "learning_rate": 9.515778915861136e-06, + "loss": 0.0494, + "mean_token_accuracy": 0.9892696917057038, + "num_tokens": 6994814.0, + "step": 3280 + }, + { + "entropy": 1.862963318824768, + "epoch": 1.0813034891375906, + "grad_norm": 2.394611120223999, + "learning_rate": 9.488598592071668e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9937925040721893, + "num_tokens": 7005314.0, + "step": 3285 + }, + { + "entropy": 1.8323248624801636, + "epoch": 1.0829493087557605, + "grad_norm": 0.8254493474960327, + "learning_rate": 9.461422055748357e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.9926515042781829, + "num_tokens": 7015752.0, + "step": 3290 + }, + { + "entropy": 1.8604917049407959, + "epoch": 1.0845951283739301, + "grad_norm": 0.3408713936805725, + "learning_rate": 9.434249508162076e-06, + "loss": 0.0322, + "mean_token_accuracy": 0.991688358783722, + "num_tokens": 7026426.0, + "step": 3295 + }, + { + "entropy": 1.8841025471687316, + "epoch": 1.0862409479921, + "grad_norm": 2.080605983734131, + "learning_rate": 9.407081150554172e-06, + "loss": 0.0284, + "mean_token_accuracy": 0.9886221408843994, + "num_tokens": 7036922.0, + "step": 3300 + }, + { + "entropy": 1.9134224891662597, + "epoch": 1.08788676761027, + "grad_norm": 1.2799135446548462, + "learning_rate": 9.379917184134949e-06, + "loss": 0.0271, + "mean_token_accuracy": 0.9915643930435181, + "num_tokens": 7047343.0, + "step": 3305 + }, + { + "entropy": 1.8705042123794555, + "epoch": 1.0895325872284398, + "grad_norm": 0.8514411449432373, + "learning_rate": 9.352757810082196e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9933820009231568, + "num_tokens": 7057946.0, + "step": 3310 + }, + { + "entropy": 1.8473629593849181, + "epoch": 1.0911784068466095, + "grad_norm": 0.8632181286811829, + "learning_rate": 9.325603229539684e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9923954606056213, + "num_tokens": 7068260.0, + "step": 3315 + }, + { + "entropy": 1.8017066240310669, + "epoch": 1.0928242264647794, + "grad_norm": 1.6297268867492676, + "learning_rate": 9.298453643615692e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9935080289840699, + "num_tokens": 7078998.0, + "step": 3320 + }, + { + "entropy": 1.7489684343338012, + "epoch": 1.0944700460829493, + "grad_norm": 0.9504618048667908, + "learning_rate": 9.2713092533815e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9920541942119598, + "num_tokens": 7090092.0, + "step": 3325 + }, + { + "entropy": 1.753550660610199, + "epoch": 1.0961158657011192, + "grad_norm": 2.1300246715545654, + "learning_rate": 9.244170259869918e-06, + "loss": 0.0414, + "mean_token_accuracy": 0.9851863026618958, + "num_tokens": 7100582.0, + "step": 3330 + }, + { + "entropy": 1.7830381631851195, + "epoch": 1.097761685319289, + "grad_norm": 1.9206792116165161, + "learning_rate": 9.217036864073776e-06, + "loss": 0.0305, + "mean_token_accuracy": 0.989460003376007, + "num_tokens": 7110954.0, + "step": 3335 + }, + { + "entropy": 1.7082879304885865, + "epoch": 1.0994075049374588, + "grad_norm": 1.520218849182129, + "learning_rate": 9.189909266944459e-06, + "loss": 0.0367, + "mean_token_accuracy": 0.9915473163127899, + "num_tokens": 7121660.0, + "step": 3340 + }, + { + "entropy": 1.7186639666557313, + "epoch": 1.1010533245556287, + "grad_norm": 0.7555062770843506, + "learning_rate": 9.162787669390398e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.9881877601146698, + "num_tokens": 7132239.0, + "step": 3345 + }, + { + "entropy": 1.7227881669998169, + "epoch": 1.1026991441737986, + "grad_norm": 1.7483959197998047, + "learning_rate": 9.135672272275593e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.9922352850437164, + "num_tokens": 7143078.0, + "step": 3350 + }, + { + "entropy": 1.7311092615127563, + "epoch": 1.1043449637919684, + "grad_norm": 0.43958139419555664, + "learning_rate": 9.10856327641813e-06, + "loss": 0.0279, + "mean_token_accuracy": 0.9921256840229035, + "num_tokens": 7153701.0, + "step": 3355 + }, + { + "entropy": 1.6882481575012207, + "epoch": 1.1059907834101383, + "grad_norm": 2.0378565788269043, + "learning_rate": 9.081460882588668e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.995065587759018, + "num_tokens": 7164253.0, + "step": 3360 + }, + { + "entropy": 1.6925435185432434, + "epoch": 1.107636603028308, + "grad_norm": 0.6830073595046997, + "learning_rate": 9.054365291508998e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.9935117542743683, + "num_tokens": 7175017.0, + "step": 3365 + }, + { + "entropy": 1.7125192880630493, + "epoch": 1.109282422646478, + "grad_norm": 0.8072521090507507, + "learning_rate": 9.027276703850505e-06, + "loss": 0.0279, + "mean_token_accuracy": 0.9914268732070923, + "num_tokens": 7185582.0, + "step": 3370 + }, + { + "entropy": 1.6770400524139404, + "epoch": 1.1109282422646478, + "grad_norm": 3.0566606521606445, + "learning_rate": 9.000195320232724e-06, + "loss": 0.0457, + "mean_token_accuracy": 0.987758994102478, + "num_tokens": 7196254.0, + "step": 3375 + }, + { + "entropy": 1.7625356912612915, + "epoch": 1.1125740618828177, + "grad_norm": 2.011699914932251, + "learning_rate": 8.973121341221823e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9929955065250397, + "num_tokens": 7206923.0, + "step": 3380 + }, + { + "entropy": 1.7391006708145142, + "epoch": 1.1142198815009876, + "grad_norm": 2.011082887649536, + "learning_rate": 8.946054967329142e-06, + "loss": 0.0243, + "mean_token_accuracy": 0.9917931973934173, + "num_tokens": 7217251.0, + "step": 3385 + }, + { + "entropy": 1.732879626750946, + "epoch": 1.1158657011191573, + "grad_norm": 1.593976378440857, + "learning_rate": 8.918996399009689e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.9882166802883148, + "num_tokens": 7227764.0, + "step": 3390 + }, + { + "entropy": 1.6700308084487916, + "epoch": 1.1175115207373272, + "grad_norm": 1.129323124885559, + "learning_rate": 8.891945836660673e-06, + "loss": 0.0283, + "mean_token_accuracy": 0.9915276169776917, + "num_tokens": 7238505.0, + "step": 3395 + }, + { + "entropy": 1.688683032989502, + "epoch": 1.119157340355497, + "grad_norm": 2.0232725143432617, + "learning_rate": 8.864903480619996e-06, + "loss": 0.0405, + "mean_token_accuracy": 0.9898577332496643, + "num_tokens": 7249030.0, + "step": 3400 + }, + { + "entropy": 1.6767473936080932, + "epoch": 1.120803159973667, + "grad_norm": 1.5392571687698364, + "learning_rate": 8.837869531164792e-06, + "loss": 0.0253, + "mean_token_accuracy": 0.9922107934951783, + "num_tokens": 7259760.0, + "step": 3405 + }, + { + "entropy": 1.6974989771842957, + "epoch": 1.1224489795918366, + "grad_norm": 1.918980360031128, + "learning_rate": 8.810844188509946e-06, + "loss": 0.0405, + "mean_token_accuracy": 0.9870936632156372, + "num_tokens": 7270389.0, + "step": 3410 + }, + { + "entropy": 1.6960701942443848, + "epoch": 1.1240947992100065, + "grad_norm": 1.4752129316329956, + "learning_rate": 8.783827652806577e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9912503600120545, + "num_tokens": 7281116.0, + "step": 3415 + }, + { + "entropy": 1.6978899121284485, + "epoch": 1.1257406188281764, + "grad_norm": 2.9839565753936768, + "learning_rate": 8.756820124140602e-06, + "loss": 0.0327, + "mean_token_accuracy": 0.9897141098976135, + "num_tokens": 7291516.0, + "step": 3420 + }, + { + "entropy": 1.6348261475563048, + "epoch": 1.1273864384463463, + "grad_norm": 2.958974599838257, + "learning_rate": 8.729821802531213e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9899660766124725, + "num_tokens": 7302127.0, + "step": 3425 + }, + { + "entropy": 1.671372151374817, + "epoch": 1.129032258064516, + "grad_norm": 1.5821384191513062, + "learning_rate": 8.70283288792943e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9928022623062134, + "num_tokens": 7312938.0, + "step": 3430 + }, + { + "entropy": 1.7028130054473878, + "epoch": 1.130678077682686, + "grad_norm": 1.062685489654541, + "learning_rate": 8.67585358021659e-06, + "loss": 0.037, + "mean_token_accuracy": 0.9904503941535949, + "num_tokens": 7323516.0, + "step": 3435 + }, + { + "entropy": 1.698065197467804, + "epoch": 1.1323238973008558, + "grad_norm": 3.0057718753814697, + "learning_rate": 8.648884079202896e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9931861937046051, + "num_tokens": 7333911.0, + "step": 3440 + }, + { + "entropy": 1.6505720615386963, + "epoch": 1.1339697169190257, + "grad_norm": 1.6598554849624634, + "learning_rate": 8.6219245846259e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9931141972541809, + "num_tokens": 7344558.0, + "step": 3445 + }, + { + "entropy": 1.6310924172401429, + "epoch": 1.1356155365371956, + "grad_norm": 2.144071578979492, + "learning_rate": 8.594975296149076e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9929544270038605, + "num_tokens": 7355291.0, + "step": 3450 + }, + { + "entropy": 1.600458061695099, + "epoch": 1.1372613561553653, + "grad_norm": 0.6878706216812134, + "learning_rate": 8.568036413360283e-06, + "loss": 0.0291, + "mean_token_accuracy": 0.9917424976825714, + "num_tokens": 7365889.0, + "step": 3455 + }, + { + "entropy": 1.6151419758796692, + "epoch": 1.1389071757735352, + "grad_norm": 1.7664144039154053, + "learning_rate": 8.541108135770327e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9897194743156433, + "num_tokens": 7376467.0, + "step": 3460 + }, + { + "entropy": 1.6531212329864502, + "epoch": 1.140552995391705, + "grad_norm": 3.0760936737060547, + "learning_rate": 8.514190662811477e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9931772351264954, + "num_tokens": 7387048.0, + "step": 3465 + }, + { + "entropy": 1.6679503798484803, + "epoch": 1.142198815009875, + "grad_norm": 2.6920888423919678, + "learning_rate": 8.48728419383597e-06, + "loss": 0.0298, + "mean_token_accuracy": 0.9894031941890716, + "num_tokens": 7397403.0, + "step": 3470 + }, + { + "entropy": 1.5688734650611877, + "epoch": 1.1438446346280449, + "grad_norm": 2.5779929161071777, + "learning_rate": 8.46038892811456e-06, + "loss": 0.0249, + "mean_token_accuracy": 0.9909287571907044, + "num_tokens": 7408276.0, + "step": 3475 + }, + { + "entropy": 1.6224292755126952, + "epoch": 1.1454904542462145, + "grad_norm": 0.8870710134506226, + "learning_rate": 8.433505064835012e-06, + "loss": 0.0293, + "mean_token_accuracy": 0.9906193792819977, + "num_tokens": 7418781.0, + "step": 3480 + }, + { + "entropy": 1.644513690471649, + "epoch": 1.1471362738643844, + "grad_norm": 1.0466939210891724, + "learning_rate": 8.406632803100665e-06, + "loss": 0.034, + "mean_token_accuracy": 0.9898303151130676, + "num_tokens": 7429487.0, + "step": 3485 + }, + { + "entropy": 1.6431127905845642, + "epoch": 1.1487820934825543, + "grad_norm": 2.5962440967559814, + "learning_rate": 8.379772341928916e-06, + "loss": 0.0348, + "mean_token_accuracy": 0.9878771305084229, + "num_tokens": 7439874.0, + "step": 3490 + }, + { + "entropy": 1.6485112071037293, + "epoch": 1.1504279131007242, + "grad_norm": 5.498775005340576, + "learning_rate": 8.352923880249784e-06, + "loss": 0.0329, + "mean_token_accuracy": 0.9888358414173126, + "num_tokens": 7450555.0, + "step": 3495 + }, + { + "entropy": 1.648984158039093, + "epoch": 1.1520737327188941, + "grad_norm": 1.443813681602478, + "learning_rate": 8.326087616904401e-06, + "loss": 0.038, + "mean_token_accuracy": 0.9887622356414795, + "num_tokens": 7461255.0, + "step": 3500 + }, + { + "entropy": 1.6465593934059144, + "epoch": 1.1537195523370638, + "grad_norm": 1.363283395767212, + "learning_rate": 8.299263750643577e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9922839045524597, + "num_tokens": 7472013.0, + "step": 3505 + }, + { + "entropy": 1.6541425347328187, + "epoch": 1.1553653719552337, + "grad_norm": 2.518134355545044, + "learning_rate": 8.272452480126292e-06, + "loss": 0.0341, + "mean_token_accuracy": 0.9866783797740937, + "num_tokens": 7483066.0, + "step": 3510 + }, + { + "entropy": 1.6475726962089539, + "epoch": 1.1570111915734036, + "grad_norm": 2.8579108715057373, + "learning_rate": 8.245654003918253e-06, + "loss": 0.0305, + "mean_token_accuracy": 0.9904314935207367, + "num_tokens": 7493486.0, + "step": 3515 + }, + { + "entropy": 1.6358031511306763, + "epoch": 1.1586570111915735, + "grad_norm": 2.28322172164917, + "learning_rate": 8.218868520490404e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9955400943756103, + "num_tokens": 7504061.0, + "step": 3520 + }, + { + "entropy": 1.6113605737686156, + "epoch": 1.1603028308097432, + "grad_norm": 1.8561283349990845, + "learning_rate": 8.192096228217464e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9960749268531799, + "num_tokens": 7514617.0, + "step": 3525 + }, + { + "entropy": 1.5644055128097534, + "epoch": 1.161948650427913, + "grad_norm": 2.0931262969970703, + "learning_rate": 8.165337325376467e-06, + "loss": 0.0282, + "mean_token_accuracy": 0.9914573907852173, + "num_tokens": 7525370.0, + "step": 3530 + }, + { + "entropy": 1.502221119403839, + "epoch": 1.163594470046083, + "grad_norm": 2.6928858757019043, + "learning_rate": 8.138592010145273e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9963742852210998, + "num_tokens": 7536226.0, + "step": 3535 + }, + { + "entropy": 1.5423286318778993, + "epoch": 1.1652402896642529, + "grad_norm": 2.7901740074157715, + "learning_rate": 8.111860480601117e-06, + "loss": 0.0464, + "mean_token_accuracy": 0.9899639308452606, + "num_tokens": 7546765.0, + "step": 3540 + }, + { + "entropy": 1.6044429183006286, + "epoch": 1.1668861092824225, + "grad_norm": 0.5412013530731201, + "learning_rate": 8.085142934719131e-06, + "loss": 0.015, + "mean_token_accuracy": 0.9947813093662262, + "num_tokens": 7557291.0, + "step": 3545 + }, + { + "entropy": 1.5845131874084473, + "epoch": 1.1685319289005924, + "grad_norm": 1.3245034217834473, + "learning_rate": 8.058439570370896e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.9917524337768555, + "num_tokens": 7568110.0, + "step": 3550 + }, + { + "entropy": 1.576395034790039, + "epoch": 1.1701777485187623, + "grad_norm": 1.6202564239501953, + "learning_rate": 8.031750585322948e-06, + "loss": 0.0353, + "mean_token_accuracy": 0.9901875615119934, + "num_tokens": 7578544.0, + "step": 3555 + }, + { + "entropy": 1.6079319357872008, + "epoch": 1.1718235681369322, + "grad_norm": 1.0341594219207764, + "learning_rate": 8.005076177235337e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.9924035966396332, + "num_tokens": 7589338.0, + "step": 3560 + }, + { + "entropy": 1.5887961030006408, + "epoch": 1.1734693877551021, + "grad_norm": 2.502521276473999, + "learning_rate": 7.978416543660157e-06, + "loss": 0.042, + "mean_token_accuracy": 0.9888204395771026, + "num_tokens": 7599872.0, + "step": 3565 + }, + { + "entropy": 1.5681854605674743, + "epoch": 1.1751152073732718, + "grad_norm": 0.6041249632835388, + "learning_rate": 7.951771882040082e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9941197037696838, + "num_tokens": 7610467.0, + "step": 3570 + }, + { + "entropy": 1.5940211772918702, + "epoch": 1.1767610269914417, + "grad_norm": 2.8404202461242676, + "learning_rate": 7.92514238970689e-06, + "loss": 0.0284, + "mean_token_accuracy": 0.9908346593379974, + "num_tokens": 7621740.0, + "step": 3575 + }, + { + "entropy": 1.6273322224617004, + "epoch": 1.1784068466096116, + "grad_norm": 2.6306891441345215, + "learning_rate": 7.898528263880032e-06, + "loss": 0.0421, + "mean_token_accuracy": 0.9906798958778381, + "num_tokens": 7632253.0, + "step": 3580 + }, + { + "entropy": 1.6058209180831908, + "epoch": 1.1800526662277815, + "grad_norm": 2.210401773452759, + "learning_rate": 7.871929701665147e-06, + "loss": 0.02, + "mean_token_accuracy": 0.9937019526958466, + "num_tokens": 7642945.0, + "step": 3585 + }, + { + "entropy": 1.6328086972236633, + "epoch": 1.1816984858459514, + "grad_norm": 1.6380540132522583, + "learning_rate": 7.8453469000526e-06, + "loss": 0.0288, + "mean_token_accuracy": 0.9929502904415131, + "num_tokens": 7653553.0, + "step": 3590 + }, + { + "entropy": 1.6912413835525513, + "epoch": 1.183344305464121, + "grad_norm": 4.160943508148193, + "learning_rate": 7.818780055916052e-06, + "loss": 0.0195, + "mean_token_accuracy": 0.9919560194015503, + "num_tokens": 7663891.0, + "step": 3595 + }, + { + "entropy": 1.6919382691383362, + "epoch": 1.184990125082291, + "grad_norm": 0.695886492729187, + "learning_rate": 7.792229366010959e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.9903152227401734, + "num_tokens": 7674488.0, + "step": 3600 + }, + { + "entropy": 1.6337382435798644, + "epoch": 1.1866359447004609, + "grad_norm": 1.5294524431228638, + "learning_rate": 7.765695026973155e-06, + "loss": 0.0196, + "mean_token_accuracy": 0.9943537652492523, + "num_tokens": 7685143.0, + "step": 3605 + }, + { + "entropy": 1.6397574901580811, + "epoch": 1.1882817643186308, + "grad_norm": 2.282261610031128, + "learning_rate": 7.73917723531737e-06, + "loss": 0.0403, + "mean_token_accuracy": 0.9924940526485443, + "num_tokens": 7695569.0, + "step": 3610 + }, + { + "entropy": 1.5973503470420838, + "epoch": 1.1899275839368006, + "grad_norm": 5.261276721954346, + "learning_rate": 7.71267618743579e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9901642620563507, + "num_tokens": 7706303.0, + "step": 3615 + }, + { + "entropy": 1.6234167337417602, + "epoch": 1.1915734035549703, + "grad_norm": 2.048280954360962, + "learning_rate": 7.686192079596586e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9935329794883728, + "num_tokens": 7716665.0, + "step": 3620 + }, + { + "entropy": 1.6198643565177917, + "epoch": 1.1932192231731402, + "grad_norm": 1.0738399028778076, + "learning_rate": 7.659725107942484e-06, + "loss": 0.0267, + "mean_token_accuracy": 0.9920990586280822, + "num_tokens": 7727458.0, + "step": 3625 + }, + { + "entropy": 1.658357810974121, + "epoch": 1.1948650427913101, + "grad_norm": 1.3478312492370605, + "learning_rate": 7.633275468489278e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9936649382114411, + "num_tokens": 7738325.0, + "step": 3630 + }, + { + "entropy": 1.7477381587028504, + "epoch": 1.19651086240948, + "grad_norm": 2.4919769763946533, + "learning_rate": 7.606843357124426e-06, + "loss": 0.0399, + "mean_token_accuracy": 0.9883269786834716, + "num_tokens": 7748772.0, + "step": 3635 + }, + { + "entropy": 1.7540024757385253, + "epoch": 1.1981566820276497, + "grad_norm": 1.3142441511154175, + "learning_rate": 7.5804289696055445e-06, + "loss": 0.0355, + "mean_token_accuracy": 0.9913724660873413, + "num_tokens": 7759350.0, + "step": 3640 + }, + { + "entropy": 1.7711892247200012, + "epoch": 1.1998025016458196, + "grad_norm": 2.53527569770813, + "learning_rate": 7.554032501559002e-06, + "loss": 0.0313, + "mean_token_accuracy": 0.9935957908630371, + "num_tokens": 7770189.0, + "step": 3645 + }, + { + "epoch": 1.2007899934167214, + "eval_entropy": 1.8328242014223972, + "eval_loss": 0.055253952741622925, + "eval_mean_token_accuracy": 0.9852129074527483, + "eval_num_tokens": 7776505.0, + "eval_runtime": 197.2997, + "eval_samples_per_second": 42.205, + "eval_steps_per_second": 7.035, + "step": 3648 + }, + { + "entropy": 1.820277488231659, + "epoch": 1.2014483212639895, + "grad_norm": 1.2625054121017456, + "learning_rate": 7.52765414847846e-06, + "loss": 0.0367, + "mean_token_accuracy": 0.9870195090770721, + "num_tokens": 7780633.0, + "step": 3650 + }, + { + "entropy": 1.850312602519989, + "epoch": 1.2030941408821594, + "grad_norm": 2.3248748779296875, + "learning_rate": 7.5012941057234015e-06, + "loss": 0.0348, + "mean_token_accuracy": 0.988330626487732, + "num_tokens": 7791063.0, + "step": 3655 + }, + { + "entropy": 1.7486730933189392, + "epoch": 1.204739960500329, + "grad_norm": 2.7954654693603516, + "learning_rate": 7.4749525685177145e-06, + "loss": 0.0345, + "mean_token_accuracy": 0.9921814143657685, + "num_tokens": 7801794.0, + "step": 3660 + }, + { + "entropy": 1.7875786900520325, + "epoch": 1.206385780118499, + "grad_norm": 1.7330853939056396, + "learning_rate": 7.44862973194823e-06, + "loss": 0.0377, + "mean_token_accuracy": 0.9861465573310852, + "num_tokens": 7812680.0, + "step": 3665 + }, + { + "entropy": 1.7702386498451232, + "epoch": 1.2080315997366688, + "grad_norm": 1.6117557287216187, + "learning_rate": 7.422325790963286e-06, + "loss": 0.0228, + "mean_token_accuracy": 0.9921964347362519, + "num_tokens": 7823265.0, + "step": 3670 + }, + { + "entropy": 1.750121569633484, + "epoch": 1.2096774193548387, + "grad_norm": 1.6110137701034546, + "learning_rate": 7.39604094037127e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9932965815067292, + "num_tokens": 7833616.0, + "step": 3675 + }, + { + "entropy": 1.7086394667625426, + "epoch": 1.2113232389730086, + "grad_norm": 5.368286609649658, + "learning_rate": 7.369775374839196e-06, + "loss": 0.0257, + "mean_token_accuracy": 0.9906518340110779, + "num_tokens": 7844464.0, + "step": 3680 + }, + { + "entropy": 1.7639584064483642, + "epoch": 1.2129690585911783, + "grad_norm": 0.626293420791626, + "learning_rate": 7.343529288891239e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9907599687576294, + "num_tokens": 7855198.0, + "step": 3685 + }, + { + "entropy": 1.7602010250091553, + "epoch": 1.2146148782093482, + "grad_norm": 0.723297119140625, + "learning_rate": 7.317302876907318e-06, + "loss": 0.0312, + "mean_token_accuracy": 0.9907152950763702, + "num_tokens": 7865513.0, + "step": 3690 + }, + { + "entropy": 1.780686092376709, + "epoch": 1.2162606978275181, + "grad_norm": 0.540903627872467, + "learning_rate": 7.29109633312164e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9916972815990448, + "num_tokens": 7875954.0, + "step": 3695 + }, + { + "entropy": 1.7540069580078126, + "epoch": 1.217906517445688, + "grad_norm": 1.341984510421753, + "learning_rate": 7.2649098516212715e-06, + "loss": 0.0382, + "mean_token_accuracy": 0.9883949160575867, + "num_tokens": 7886845.0, + "step": 3700 + }, + { + "entropy": 1.759465503692627, + "epoch": 1.219552337063858, + "grad_norm": 1.8915423154830933, + "learning_rate": 7.2387436263446885e-06, + "loss": 0.0222, + "mean_token_accuracy": 0.9931784868240356, + "num_tokens": 7897311.0, + "step": 3705 + }, + { + "entropy": 1.768437671661377, + "epoch": 1.2211981566820276, + "grad_norm": 1.4099783897399902, + "learning_rate": 7.2125978510803565e-06, + "loss": 0.0268, + "mean_token_accuracy": 0.9915161907672883, + "num_tokens": 7907999.0, + "step": 3710 + }, + { + "entropy": 1.7534563660621643, + "epoch": 1.2228439763001975, + "grad_norm": 1.8560601472854614, + "learning_rate": 7.1864727194652874e-06, + "loss": 0.0295, + "mean_token_accuracy": 0.9926966905593873, + "num_tokens": 7918701.0, + "step": 3715 + }, + { + "entropy": 1.7628652215003968, + "epoch": 1.2244897959183674, + "grad_norm": 1.2373379468917847, + "learning_rate": 7.1603684249835966e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9931581020355225, + "num_tokens": 7929045.0, + "step": 3720 + }, + { + "entropy": 1.7709421277046205, + "epoch": 1.2261356155365373, + "grad_norm": 1.8868582248687744, + "learning_rate": 7.134285160965091e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9951976418495179, + "num_tokens": 7939549.0, + "step": 3725 + }, + { + "entropy": 1.7510177731513976, + "epoch": 1.2277814351547072, + "grad_norm": 1.6280816793441772, + "learning_rate": 7.108223120583806e-06, + "loss": 0.0431, + "mean_token_accuracy": 0.9894875645637512, + "num_tokens": 7950189.0, + "step": 3730 + }, + { + "entropy": 1.7418721914291382, + "epoch": 1.2294272547728768, + "grad_norm": 2.297192335128784, + "learning_rate": 7.0821824968566186e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9930408895015717, + "num_tokens": 7960790.0, + "step": 3735 + }, + { + "entropy": 1.7448366165161133, + "epoch": 1.2310730743910467, + "grad_norm": 2.935821294784546, + "learning_rate": 7.056163482641769e-06, + "loss": 0.027, + "mean_token_accuracy": 0.9896710455417633, + "num_tokens": 7971434.0, + "step": 3740 + }, + { + "entropy": 1.6850064516067504, + "epoch": 1.2327188940092166, + "grad_norm": 1.6496083736419678, + "learning_rate": 7.030166270637475e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9923489093780518, + "num_tokens": 7981902.0, + "step": 3745 + }, + { + "entropy": 1.6959288597106934, + "epoch": 1.2343647136273865, + "grad_norm": 4.362974643707275, + "learning_rate": 7.004191053380469e-06, + "loss": 0.028, + "mean_token_accuracy": 0.9899141073226929, + "num_tokens": 7992608.0, + "step": 3750 + }, + { + "entropy": 1.7090837240219117, + "epoch": 1.2360105332455562, + "grad_norm": 1.9156920909881592, + "learning_rate": 6.978238023244608e-06, + "loss": 0.0274, + "mean_token_accuracy": 0.9905971348285675, + "num_tokens": 8003032.0, + "step": 3755 + }, + { + "entropy": 1.6994237184524537, + "epoch": 1.237656352863726, + "grad_norm": 1.834991455078125, + "learning_rate": 6.952307372439411e-06, + "loss": 0.0264, + "mean_token_accuracy": 0.9915358006954194, + "num_tokens": 8013591.0, + "step": 3760 + }, + { + "entropy": 1.6717028617858887, + "epoch": 1.239302172481896, + "grad_norm": 1.8597207069396973, + "learning_rate": 6.926399293008668e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9935126841068268, + "num_tokens": 8024009.0, + "step": 3765 + }, + { + "entropy": 1.6924420833587646, + "epoch": 1.240947992100066, + "grad_norm": 2.2939343452453613, + "learning_rate": 6.900513976829e-06, + "loss": 0.0323, + "mean_token_accuracy": 0.9890012562274932, + "num_tokens": 8034690.0, + "step": 3770 + }, + { + "entropy": 1.746755886077881, + "epoch": 1.2425938117182356, + "grad_norm": 2.126143455505371, + "learning_rate": 6.874651615608441e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9934778928756713, + "num_tokens": 8045114.0, + "step": 3775 + }, + { + "entropy": 1.6977684855461121, + "epoch": 1.2442396313364055, + "grad_norm": 0.6126706600189209, + "learning_rate": 6.848812400885022e-06, + "loss": 0.0192, + "mean_token_accuracy": 0.9939151763916015, + "num_tokens": 8055532.0, + "step": 3780 + }, + { + "entropy": 1.6669098496437074, + "epoch": 1.2458854509545754, + "grad_norm": 0.8724939227104187, + "learning_rate": 6.822996524025343e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9912198483943939, + "num_tokens": 8066233.0, + "step": 3785 + }, + { + "entropy": 1.6577038049697876, + "epoch": 1.2475312705727453, + "grad_norm": 0.9917463064193726, + "learning_rate": 6.7972041762231735e-06, + "loss": 0.025, + "mean_token_accuracy": 0.990100759267807, + "num_tokens": 8076870.0, + "step": 3790 + }, + { + "entropy": 1.6371492862701416, + "epoch": 1.2491770901909152, + "grad_norm": 3.3986692428588867, + "learning_rate": 6.771435548498013e-06, + "loss": 0.035, + "mean_token_accuracy": 0.9852859020233155, + "num_tokens": 8087788.0, + "step": 3795 + }, + { + "entropy": 1.6349127650260926, + "epoch": 1.2508229098090848, + "grad_norm": 0.3032724857330322, + "learning_rate": 6.745690831693701e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.993361896276474, + "num_tokens": 8098543.0, + "step": 3800 + }, + { + "entropy": 1.6615733742713927, + "epoch": 1.2524687294272547, + "grad_norm": 1.261225700378418, + "learning_rate": 6.719970216476982e-06, + "loss": 0.0253, + "mean_token_accuracy": 0.9910578489303589, + "num_tokens": 8109340.0, + "step": 3805 + }, + { + "entropy": 1.680140745639801, + "epoch": 1.2541145490454246, + "grad_norm": 1.3544028997421265, + "learning_rate": 6.694273893336112e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9918768882751465, + "num_tokens": 8119963.0, + "step": 3810 + }, + { + "entropy": 1.6792070388793945, + "epoch": 1.2557603686635945, + "grad_norm": 1.9016791582107544, + "learning_rate": 6.668602052579425e-06, + "loss": 0.0263, + "mean_token_accuracy": 0.9923485815525055, + "num_tokens": 8130505.0, + "step": 3815 + }, + { + "entropy": 1.7175181269645692, + "epoch": 1.2574061882817644, + "grad_norm": 0.3682861924171448, + "learning_rate": 6.6429548843339554e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.991203784942627, + "num_tokens": 8141063.0, + "step": 3820 + }, + { + "entropy": 1.6892056941986084, + "epoch": 1.259052007899934, + "grad_norm": 5.687158107757568, + "learning_rate": 6.617332578543991e-06, + "loss": 0.0289, + "mean_token_accuracy": 0.9894017159938813, + "num_tokens": 8152016.0, + "step": 3825 + }, + { + "entropy": 1.7227007865905761, + "epoch": 1.260697827518104, + "grad_norm": 2.130305290222168, + "learning_rate": 6.591735324969703e-06, + "loss": 0.0423, + "mean_token_accuracy": 0.9899180591106415, + "num_tokens": 8162786.0, + "step": 3830 + }, + { + "entropy": 1.766884195804596, + "epoch": 1.262343647136274, + "grad_norm": 2.274033546447754, + "learning_rate": 6.566163313185725e-06, + "loss": 0.0261, + "mean_token_accuracy": 0.9887346982955932, + "num_tokens": 8173362.0, + "step": 3835 + }, + { + "entropy": 1.8262192487716675, + "epoch": 1.2639894667544438, + "grad_norm": 0.5273262858390808, + "learning_rate": 6.540616732579732e-06, + "loss": 0.0387, + "mean_token_accuracy": 0.9902793288230896, + "num_tokens": 8183767.0, + "step": 3840 + }, + { + "entropy": 1.8414955258369445, + "epoch": 1.2656352863726137, + "grad_norm": 0.5974478721618652, + "learning_rate": 6.515095772351072e-06, + "loss": 0.0124, + "mean_token_accuracy": 0.99626624584198, + "num_tokens": 8194098.0, + "step": 3845 + }, + { + "entropy": 1.8668495297431946, + "epoch": 1.2672811059907834, + "grad_norm": 1.7619863748550415, + "learning_rate": 6.489600621509338e-06, + "loss": 0.0206, + "mean_token_accuracy": 0.9931797504425048, + "num_tokens": 8204769.0, + "step": 3850 + }, + { + "entropy": 1.8715619444847107, + "epoch": 1.2689269256089533, + "grad_norm": 2.3929049968719482, + "learning_rate": 6.464131468872978e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.9927472591400146, + "num_tokens": 8215368.0, + "step": 3855 + }, + { + "entropy": 1.8072044491767882, + "epoch": 1.2705727452271232, + "grad_norm": 2.2075207233428955, + "learning_rate": 6.4386885030679e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.992115718126297, + "num_tokens": 8226508.0, + "step": 3860 + }, + { + "entropy": 1.905867862701416, + "epoch": 1.2722185648452928, + "grad_norm": 1.7269535064697266, + "learning_rate": 6.413271912526071e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.993303781747818, + "num_tokens": 8237461.0, + "step": 3865 + }, + { + "entropy": 1.8554383277893067, + "epoch": 1.2738643844634627, + "grad_norm": 1.8244965076446533, + "learning_rate": 6.3878818854841095e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9931906521320343, + "num_tokens": 8248162.0, + "step": 3870 + }, + { + "entropy": 1.903454637527466, + "epoch": 1.2755102040816326, + "grad_norm": 1.2271250486373901, + "learning_rate": 6.36251860998192e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9948872745037078, + "num_tokens": 8258852.0, + "step": 3875 + }, + { + "entropy": 1.8506133675575256, + "epoch": 1.2771560236998025, + "grad_norm": 1.973997712135315, + "learning_rate": 6.337182273861273e-06, + "loss": 0.03, + "mean_token_accuracy": 0.9918236672878266, + "num_tokens": 8269756.0, + "step": 3880 + }, + { + "entropy": 1.8246717214584351, + "epoch": 1.2788018433179724, + "grad_norm": 1.5060253143310547, + "learning_rate": 6.311873064764429e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.992903220653534, + "num_tokens": 8280446.0, + "step": 3885 + }, + { + "entropy": 1.7956886053085328, + "epoch": 1.280447662936142, + "grad_norm": 2.8490633964538574, + "learning_rate": 6.2865911701327445e-06, + "loss": 0.0434, + "mean_token_accuracy": 0.9881740510463715, + "num_tokens": 8291519.0, + "step": 3890 + }, + { + "entropy": 1.7901652932167054, + "epoch": 1.282093482554312, + "grad_norm": 2.23622727394104, + "learning_rate": 6.261336777205278e-06, + "loss": 0.0304, + "mean_token_accuracy": 0.9898859024047851, + "num_tokens": 8302444.0, + "step": 3895 + }, + { + "entropy": 1.795331597328186, + "epoch": 1.2837393021724819, + "grad_norm": 2.6051158905029297, + "learning_rate": 6.236110073017417e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9921227872371674, + "num_tokens": 8313239.0, + "step": 3900 + }, + { + "entropy": 1.8181136965751648, + "epoch": 1.2853851217906518, + "grad_norm": 1.3954198360443115, + "learning_rate": 6.210911244399477e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9936698138713836, + "num_tokens": 8323898.0, + "step": 3905 + }, + { + "entropy": 1.801988172531128, + "epoch": 1.2870309414088217, + "grad_norm": 1.1723895072937012, + "learning_rate": 6.185740477975335e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9923419177532196, + "num_tokens": 8334473.0, + "step": 3910 + }, + { + "entropy": 1.783213496208191, + "epoch": 1.2886767610269914, + "grad_norm": 4.490993499755859, + "learning_rate": 6.160597960161024e-06, + "loss": 0.0452, + "mean_token_accuracy": 0.9866098284721374, + "num_tokens": 8345014.0, + "step": 3915 + }, + { + "entropy": 1.73745299577713, + "epoch": 1.2903225806451613, + "grad_norm": 2.8388960361480713, + "learning_rate": 6.135483877163383e-06, + "loss": 0.0243, + "mean_token_accuracy": 0.9913448512554168, + "num_tokens": 8355724.0, + "step": 3920 + }, + { + "entropy": 1.7496454119682312, + "epoch": 1.2919684002633312, + "grad_norm": 1.7147318124771118, + "learning_rate": 6.1103984149786444e-06, + "loss": 0.0262, + "mean_token_accuracy": 0.9924720883369446, + "num_tokens": 8366189.0, + "step": 3925 + }, + { + "entropy": 1.7078065276145935, + "epoch": 1.293614219881501, + "grad_norm": 2.615743398666382, + "learning_rate": 6.085341759391089e-06, + "loss": 0.0138, + "mean_token_accuracy": 0.9948653101921081, + "num_tokens": 8376886.0, + "step": 3930 + }, + { + "entropy": 1.7135319232940673, + "epoch": 1.295260039499671, + "grad_norm": 0.48522359132766724, + "learning_rate": 6.060314095971641e-06, + "loss": 0.0258, + "mean_token_accuracy": 0.9912464320659637, + "num_tokens": 8387635.0, + "step": 3935 + }, + { + "entropy": 1.7192439675331115, + "epoch": 1.2969058591178406, + "grad_norm": 0.810149073600769, + "learning_rate": 6.035315610076518e-06, + "loss": 0.0246, + "mean_token_accuracy": 0.9908073008060455, + "num_tokens": 8398117.0, + "step": 3940 + }, + { + "entropy": 1.6878206253051757, + "epoch": 1.2985516787360105, + "grad_norm": 1.6084260940551758, + "learning_rate": 6.010346486845837e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9926587820053101, + "num_tokens": 8408773.0, + "step": 3945 + }, + { + "entropy": 1.682581651210785, + "epoch": 1.3001974983541804, + "grad_norm": 2.0567915439605713, + "learning_rate": 5.985406911202263e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9948483347892761, + "num_tokens": 8419245.0, + "step": 3950 + }, + { + "entropy": 1.6870299458503724, + "epoch": 1.3018433179723503, + "grad_norm": 2.500347375869751, + "learning_rate": 5.960497067849627e-06, + "loss": 0.0298, + "mean_token_accuracy": 0.9893729627132416, + "num_tokens": 8430019.0, + "step": 3955 + }, + { + "entropy": 1.7270505785942079, + "epoch": 1.3034891375905202, + "grad_norm": 2.2009787559509277, + "learning_rate": 5.935617141271554e-06, + "loss": 0.0255, + "mean_token_accuracy": 0.9894290864467621, + "num_tokens": 8440514.0, + "step": 3960 + }, + { + "entropy": 1.7091917991638184, + "epoch": 1.3051349572086899, + "grad_norm": 3.052794933319092, + "learning_rate": 5.910767315730119e-06, + "loss": 0.0366, + "mean_token_accuracy": 0.9881552219390869, + "num_tokens": 8451099.0, + "step": 3965 + }, + { + "entropy": 1.7440539956092835, + "epoch": 1.3067807768268598, + "grad_norm": 1.05941641330719, + "learning_rate": 5.885947775264447e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9918931663036347, + "num_tokens": 8461462.0, + "step": 3970 + }, + { + "entropy": 1.751047670841217, + "epoch": 1.3084265964450297, + "grad_norm": 1.5784897804260254, + "learning_rate": 5.861158703689389e-06, + "loss": 0.0216, + "mean_token_accuracy": 0.9915867269039154, + "num_tokens": 8472292.0, + "step": 3975 + }, + { + "entropy": 1.7028122067451477, + "epoch": 1.3100724160631994, + "grad_norm": 1.145462155342102, + "learning_rate": 5.836400284594126e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.9926209390163422, + "num_tokens": 8482780.0, + "step": 3980 + }, + { + "entropy": 1.7101820945739745, + "epoch": 1.3117182356813692, + "grad_norm": 1.7238191366195679, + "learning_rate": 5.811672701340847e-06, + "loss": 0.0314, + "mean_token_accuracy": 0.9896239399909973, + "num_tokens": 8493301.0, + "step": 3985 + }, + { + "entropy": 1.7027180314064025, + "epoch": 1.3133640552995391, + "grad_norm": 1.4847729206085205, + "learning_rate": 5.786976137063336e-06, + "loss": 0.0209, + "mean_token_accuracy": 0.9931684076786041, + "num_tokens": 8504099.0, + "step": 3990 + }, + { + "entropy": 1.7060617804527283, + "epoch": 1.315009874917709, + "grad_norm": 0.6948561072349548, + "learning_rate": 5.762310774665682e-06, + "loss": 0.0404, + "mean_token_accuracy": 0.9896627187728881, + "num_tokens": 8514523.0, + "step": 3995 + }, + { + "entropy": 1.7185169339179993, + "epoch": 1.316655694535879, + "grad_norm": 1.8867969512939453, + "learning_rate": 5.737676796820871e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9954242944717407, + "num_tokens": 8525127.0, + "step": 4000 + }, + { + "entropy": 1.6883883237838746, + "epoch": 1.3183015141540486, + "grad_norm": 2.294797420501709, + "learning_rate": 5.713074385969457e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9943160593509675, + "num_tokens": 8535541.0, + "step": 4005 + }, + { + "entropy": 1.6336073637008668, + "epoch": 1.3199473337722185, + "grad_norm": 0.9374617338180542, + "learning_rate": 5.688503724318217e-06, + "loss": 0.0102, + "mean_token_accuracy": 0.9958161950111389, + "num_tokens": 8546220.0, + "step": 4010 + }, + { + "entropy": 1.6534652829170227, + "epoch": 1.3215931533903884, + "grad_norm": 1.0187243223190308, + "learning_rate": 5.663964993838779e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9947121858596801, + "num_tokens": 8556714.0, + "step": 4015 + }, + { + "entropy": 1.6586306810379028, + "epoch": 1.3232389730085583, + "grad_norm": 2.5042691230773926, + "learning_rate": 5.639458376266295e-06, + "loss": 0.0344, + "mean_token_accuracy": 0.9897667288780212, + "num_tokens": 8567099.0, + "step": 4020 + }, + { + "entropy": 1.6488097548484801, + "epoch": 1.3248847926267282, + "grad_norm": 0.3930192291736603, + "learning_rate": 5.614984053098076e-06, + "loss": 0.029, + "mean_token_accuracy": 0.9916586577892303, + "num_tokens": 8577866.0, + "step": 4025 + }, + { + "entropy": 1.6415641903877258, + "epoch": 1.3265306122448979, + "grad_norm": 1.2496612071990967, + "learning_rate": 5.590542205592283e-06, + "loss": 0.0288, + "mean_token_accuracy": 0.9903595089912415, + "num_tokens": 8588800.0, + "step": 4030 + }, + { + "entropy": 1.6364824771881104, + "epoch": 1.3281764318630678, + "grad_norm": 2.232058048248291, + "learning_rate": 5.566133014766526e-06, + "loss": 0.0333, + "mean_token_accuracy": 0.9903694272041321, + "num_tokens": 8599559.0, + "step": 4035 + }, + { + "entropy": 1.699565863609314, + "epoch": 1.3298222514812377, + "grad_norm": 2.588693618774414, + "learning_rate": 5.541756661396591e-06, + "loss": 0.0229, + "mean_token_accuracy": 0.9930707275867462, + "num_tokens": 8610049.0, + "step": 4040 + }, + { + "entropy": 1.6720836758613586, + "epoch": 1.3314680710994076, + "grad_norm": 1.1052114963531494, + "learning_rate": 5.517413326015046e-06, + "loss": 0.0325, + "mean_token_accuracy": 0.9897372424602509, + "num_tokens": 8620905.0, + "step": 4045 + }, + { + "entropy": 1.7122917652130127, + "epoch": 1.3331138907175775, + "grad_norm": 1.7371225357055664, + "learning_rate": 5.493103188909939e-06, + "loss": 0.0256, + "mean_token_accuracy": 0.9902097463607789, + "num_tokens": 8631605.0, + "step": 4050 + }, + { + "entropy": 1.7047467947006225, + "epoch": 1.3347597103357471, + "grad_norm": 3.5729825496673584, + "learning_rate": 5.468826430123435e-06, + "loss": 0.021, + "mean_token_accuracy": 0.992638248205185, + "num_tokens": 8642346.0, + "step": 4055 + }, + { + "entropy": 1.7031975507736206, + "epoch": 1.336405529953917, + "grad_norm": 2.91983962059021, + "learning_rate": 5.444583229450518e-06, + "loss": 0.0317, + "mean_token_accuracy": 0.9902465045452118, + "num_tokens": 8653100.0, + "step": 4060 + }, + { + "entropy": 1.697403860092163, + "epoch": 1.338051349572087, + "grad_norm": 1.3388501405715942, + "learning_rate": 5.4203737664376235e-06, + "loss": 0.0412, + "mean_token_accuracy": 0.9881835162639618, + "num_tokens": 8663816.0, + "step": 4065 + }, + { + "entropy": 1.7462014317512513, + "epoch": 1.3396971691902566, + "grad_norm": 5.225465774536133, + "learning_rate": 5.3961982203813215e-06, + "loss": 0.0383, + "mean_token_accuracy": 0.9885516107082367, + "num_tokens": 8674657.0, + "step": 4070 + }, + { + "entropy": 1.7663294553756714, + "epoch": 1.3413429888084267, + "grad_norm": 1.6548188924789429, + "learning_rate": 5.3720567703270135e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.99189293384552, + "num_tokens": 8685457.0, + "step": 4075 + }, + { + "entropy": 1.7495159268379212, + "epoch": 1.3429888084265964, + "grad_norm": 1.063901662826538, + "learning_rate": 5.34794959506755e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9917633235454559, + "num_tokens": 8695979.0, + "step": 4080 + }, + { + "entropy": 1.7705445170402527, + "epoch": 1.3446346280447663, + "grad_norm": 0.8427658677101135, + "learning_rate": 5.323876873141973e-06, + "loss": 0.0224, + "mean_token_accuracy": 0.9938309490680695, + "num_tokens": 8706574.0, + "step": 4085 + }, + { + "entropy": 1.7015425443649292, + "epoch": 1.3462804476629362, + "grad_norm": 4.552727222442627, + "learning_rate": 5.299838782834141e-06, + "loss": 0.0273, + "mean_token_accuracy": 0.9894029378890992, + "num_tokens": 8717255.0, + "step": 4090 + }, + { + "entropy": 1.7069133877754212, + "epoch": 1.3479262672811059, + "grad_norm": 4.266534805297852, + "learning_rate": 5.275835502171439e-06, + "loss": 0.024, + "mean_token_accuracy": 0.992630559206009, + "num_tokens": 8728164.0, + "step": 4095 + }, + { + "entropy": 1.730119228363037, + "epoch": 1.3495720868992758, + "grad_norm": 2.2071967124938965, + "learning_rate": 5.251867208923439e-06, + "loss": 0.0151, + "mean_token_accuracy": 0.9952519774436951, + "num_tokens": 8738646.0, + "step": 4100 + }, + { + "entropy": 1.7257731676101684, + "epoch": 1.3512179065174457, + "grad_norm": 3.7456114292144775, + "learning_rate": 5.227934080600611e-06, + "loss": 0.032, + "mean_token_accuracy": 0.9912447273731232, + "num_tokens": 8749142.0, + "step": 4105 + }, + { + "entropy": 1.6916332483291625, + "epoch": 1.3528637261356156, + "grad_norm": 2.742368459701538, + "learning_rate": 5.2040362944529765e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9944073140621186, + "num_tokens": 8759739.0, + "step": 4110 + }, + { + "entropy": 1.6806737899780273, + "epoch": 1.3545095457537855, + "grad_norm": 1.4229061603546143, + "learning_rate": 5.180174027468818e-06, + "loss": 0.0184, + "mean_token_accuracy": 0.9935629963874817, + "num_tokens": 8770377.0, + "step": 4115 + }, + { + "entropy": 1.6647310853004456, + "epoch": 1.3561553653719551, + "grad_norm": 2.2806296348571777, + "learning_rate": 5.156347456373359e-06, + "loss": 0.0251, + "mean_token_accuracy": 0.9908474087715149, + "num_tokens": 8780910.0, + "step": 4120 + }, + { + "entropy": 1.6626045823097229, + "epoch": 1.357801184990125, + "grad_norm": 1.6852504014968872, + "learning_rate": 5.1325567576274595e-06, + "loss": 0.0328, + "mean_token_accuracy": 0.9896870791912079, + "num_tokens": 8791765.0, + "step": 4125 + }, + { + "entropy": 1.7072691798210144, + "epoch": 1.359447004608295, + "grad_norm": 0.524549126625061, + "learning_rate": 5.108802107426307e-06, + "loss": 0.0152, + "mean_token_accuracy": 0.9940847456455231, + "num_tokens": 8802312.0, + "step": 4130 + }, + { + "entropy": 1.7237987995147706, + "epoch": 1.3610928242264648, + "grad_norm": 2.021660566329956, + "learning_rate": 5.085083681698108e-06, + "loss": 0.0437, + "mean_token_accuracy": 0.9889907777309418, + "num_tokens": 8813114.0, + "step": 4135 + }, + { + "entropy": 1.7635319709777832, + "epoch": 1.3627386438446347, + "grad_norm": 3.0634493827819824, + "learning_rate": 5.061401656102791e-06, + "loss": 0.04, + "mean_token_accuracy": 0.9863172173500061, + "num_tokens": 8823776.0, + "step": 4140 + }, + { + "entropy": 1.7534740567207336, + "epoch": 1.3643844634628044, + "grad_norm": 1.1463490724563599, + "learning_rate": 5.0377562060307e-06, + "loss": 0.032, + "mean_token_accuracy": 0.9871594190597535, + "num_tokens": 8834704.0, + "step": 4145 + }, + { + "entropy": 1.7889039039611816, + "epoch": 1.3660302830809743, + "grad_norm": 0.34571316838264465, + "learning_rate": 5.014147506601308e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.9936646819114685, + "num_tokens": 8845254.0, + "step": 4150 + }, + { + "entropy": 1.8070958495140075, + "epoch": 1.3676761026991442, + "grad_norm": 1.5474847555160522, + "learning_rate": 4.990575732661902e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9930408537387848, + "num_tokens": 8855930.0, + "step": 4155 + }, + { + "entropy": 1.7869633316993714, + "epoch": 1.369321922317314, + "grad_norm": 1.3527427911758423, + "learning_rate": 4.9670410587862995e-06, + "loss": 0.0341, + "mean_token_accuracy": 0.9886087715625763, + "num_tokens": 8866777.0, + "step": 4160 + }, + { + "entropy": 1.8497827887535094, + "epoch": 1.370967741935484, + "grad_norm": 1.8562260866165161, + "learning_rate": 4.943543659273548e-06, + "loss": 0.0476, + "mean_token_accuracy": 0.9888940870761871, + "num_tokens": 8877374.0, + "step": 4165 + }, + { + "entropy": 1.8215659737586976, + "epoch": 1.3726135615536537, + "grad_norm": 1.3257911205291748, + "learning_rate": 4.920083708146655e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9913751244544983, + "num_tokens": 8888152.0, + "step": 4170 + }, + { + "entropy": 1.8450712442398072, + "epoch": 1.3742593811718236, + "grad_norm": 0.5730196833610535, + "learning_rate": 4.896661379151259e-06, + "loss": 0.0186, + "mean_token_accuracy": 0.9925144612789154, + "num_tokens": 8898690.0, + "step": 4175 + }, + { + "entropy": 1.8470216035842895, + "epoch": 1.3759052007899935, + "grad_norm": 1.4637004137039185, + "learning_rate": 4.873276845754388e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9950567662715912, + "num_tokens": 8909192.0, + "step": 4180 + }, + { + "entropy": 1.7927292108535766, + "epoch": 1.3775510204081631, + "grad_norm": 2.084494113922119, + "learning_rate": 4.849930281143146e-06, + "loss": 0.0424, + "mean_token_accuracy": 0.9881355404853821, + "num_tokens": 8920016.0, + "step": 4185 + }, + { + "entropy": 1.815053677558899, + "epoch": 1.3791968400263332, + "grad_norm": 1.7285300493240356, + "learning_rate": 4.826621858223431e-06, + "loss": 0.0247, + "mean_token_accuracy": 0.992021232843399, + "num_tokens": 8930392.0, + "step": 4190 + }, + { + "entropy": 1.80452299118042, + "epoch": 1.380842659644503, + "grad_norm": 4.97409725189209, + "learning_rate": 4.803351749618679e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9906729996204376, + "num_tokens": 8940931.0, + "step": 4195 + }, + { + "entropy": 1.7994644165039062, + "epoch": 1.3824884792626728, + "grad_norm": 1.7542165517807007, + "learning_rate": 4.780120127668553e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9934443414211274, + "num_tokens": 8951603.0, + "step": 4200 + }, + { + "entropy": 1.800536823272705, + "epoch": 1.3841342988808427, + "grad_norm": 1.1492308378219604, + "learning_rate": 4.756927164427685e-06, + "loss": 0.0318, + "mean_token_accuracy": 0.9890105724334717, + "num_tokens": 8962110.0, + "step": 4205 + }, + { + "entropy": 1.7841524839401246, + "epoch": 1.3857801184990124, + "grad_norm": 2.564023971557617, + "learning_rate": 4.733773031664398e-06, + "loss": 0.0292, + "mean_token_accuracy": 0.9909674048423767, + "num_tokens": 8972933.0, + "step": 4210 + }, + { + "entropy": 1.8291552066802979, + "epoch": 1.3874259381171823, + "grad_norm": 0.70340496301651, + "learning_rate": 4.710657900859447e-06, + "loss": 0.0205, + "mean_token_accuracy": 0.9959937572479248, + "num_tokens": 8983508.0, + "step": 4215 + }, + { + "entropy": 1.8385101079940795, + "epoch": 1.3890717577353522, + "grad_norm": 1.934988021850586, + "learning_rate": 4.687581943204711e-06, + "loss": 0.031, + "mean_token_accuracy": 0.9913569748401642, + "num_tokens": 8994226.0, + "step": 4220 + }, + { + "entropy": 1.8304347276687623, + "epoch": 1.390717577353522, + "grad_norm": 1.6579409837722778, + "learning_rate": 4.664545329601977e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.9913767516613007, + "num_tokens": 9004843.0, + "step": 4225 + }, + { + "entropy": 1.842625379562378, + "epoch": 1.392363396971692, + "grad_norm": 1.2633566856384277, + "learning_rate": 4.641548230661633e-06, + "loss": 0.0327, + "mean_token_accuracy": 0.990666514635086, + "num_tokens": 9015138.0, + "step": 4230 + }, + { + "entropy": 1.822058653831482, + "epoch": 1.3940092165898617, + "grad_norm": 2.8293750286102295, + "learning_rate": 4.618590816701422e-06, + "loss": 0.0265, + "mean_token_accuracy": 0.9897272527217865, + "num_tokens": 9025856.0, + "step": 4235 + }, + { + "entropy": 1.8393550515174866, + "epoch": 1.3956550362080316, + "grad_norm": 0.7082259654998779, + "learning_rate": 4.5956732577451745e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9960557222366333, + "num_tokens": 9036445.0, + "step": 4240 + }, + { + "entropy": 1.8048294067382813, + "epoch": 1.3973008558262014, + "grad_norm": 0.9008603096008301, + "learning_rate": 4.572795723521565e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9946598470211029, + "num_tokens": 9047123.0, + "step": 4245 + }, + { + "entropy": 1.8322034358978272, + "epoch": 1.3989466754443713, + "grad_norm": 0.7377194166183472, + "learning_rate": 4.549958383462829e-06, + "loss": 0.0232, + "mean_token_accuracy": 0.9925904154777527, + "num_tokens": 9057591.0, + "step": 4250 + }, + { + "entropy": 1.8085259079933167, + "epoch": 1.4005924950625412, + "grad_norm": 0.7783912420272827, + "learning_rate": 4.527161406703525e-06, + "loss": 0.0381, + "mean_token_accuracy": 0.9926593840122223, + "num_tokens": 9068234.0, + "step": 4255 + }, + { + "epoch": 1.400921658986175, + "eval_entropy": 1.8214755838130325, + "eval_loss": 0.05032181739807129, + "eval_mean_token_accuracy": 0.9866692206959216, + "eval_num_tokens": 9070425.0, + "eval_runtime": 195.1937, + "eval_samples_per_second": 42.66, + "eval_steps_per_second": 7.111, + "step": 4256 + }, + { + "entropy": 1.8127322912216186, + "epoch": 1.402238314680711, + "grad_norm": 2.2625997066497803, + "learning_rate": 4.504404962079293e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9912942230701447, + "num_tokens": 9079026.0, + "step": 4260 + }, + { + "entropy": 1.825773298740387, + "epoch": 1.4038841342988808, + "grad_norm": 1.3523027896881104, + "learning_rate": 4.481689218125561e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9924202978610992, + "num_tokens": 9089666.0, + "step": 4265 + }, + { + "entropy": 1.790573537349701, + "epoch": 1.4055299539170507, + "grad_norm": 2.0441343784332275, + "learning_rate": 4.459014343076356e-06, + "loss": 0.0412, + "mean_token_accuracy": 0.9893443644046783, + "num_tokens": 9100438.0, + "step": 4270 + }, + { + "entropy": 1.8189255356788636, + "epoch": 1.4071757735352206, + "grad_norm": 1.1708464622497559, + "learning_rate": 4.436380504863008e-06, + "loss": 0.0169, + "mean_token_accuracy": 0.9938211143016815, + "num_tokens": 9111194.0, + "step": 4275 + }, + { + "entropy": 1.8196102857589722, + "epoch": 1.4088215931533905, + "grad_norm": 2.159729242324829, + "learning_rate": 4.413787871112934e-06, + "loss": 0.0197, + "mean_token_accuracy": 0.9932897388935089, + "num_tokens": 9122132.0, + "step": 4280 + }, + { + "entropy": 1.805807101726532, + "epoch": 1.4104674127715602, + "grad_norm": 2.360366106033325, + "learning_rate": 4.391236609148381e-06, + "loss": 0.0336, + "mean_token_accuracy": 0.9923444032669068, + "num_tokens": 9132718.0, + "step": 4285 + }, + { + "entropy": 1.813056230545044, + "epoch": 1.41211323238973, + "grad_norm": 0.6563708782196045, + "learning_rate": 4.3687268859852105e-06, + "loss": 0.0137, + "mean_token_accuracy": 0.9964501917362213, + "num_tokens": 9143366.0, + "step": 4290 + }, + { + "entropy": 1.8228686213493348, + "epoch": 1.4137590520079, + "grad_norm": 1.703528642654419, + "learning_rate": 4.34625886833163e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9924368858337402, + "num_tokens": 9154131.0, + "step": 4295 + }, + { + "entropy": 1.763486886024475, + "epoch": 1.4154048716260696, + "grad_norm": 2.912816286087036, + "learning_rate": 4.323832722586979e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9942897677421569, + "num_tokens": 9165049.0, + "step": 4300 + }, + { + "entropy": 1.8135482668876648, + "epoch": 1.4170506912442398, + "grad_norm": 1.8574903011322021, + "learning_rate": 4.301448614840487e-06, + "loss": 0.0423, + "mean_token_accuracy": 0.9888120889663696, + "num_tokens": 9175582.0, + "step": 4305 + }, + { + "entropy": 1.8078377962112426, + "epoch": 1.4186965108624094, + "grad_norm": 1.1886160373687744, + "learning_rate": 4.279106710870059e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9940821826457977, + "num_tokens": 9186081.0, + "step": 4310 + }, + { + "entropy": 1.7859516978263854, + "epoch": 1.4203423304805793, + "grad_norm": 1.7242977619171143, + "learning_rate": 4.256807176141028e-06, + "loss": 0.0131, + "mean_token_accuracy": 0.99482541680336, + "num_tokens": 9196757.0, + "step": 4315 + }, + { + "entropy": 1.7669077515602112, + "epoch": 1.4219881500987492, + "grad_norm": 2.3228442668914795, + "learning_rate": 4.2345501758049365e-06, + "loss": 0.0278, + "mean_token_accuracy": 0.9920706391334534, + "num_tokens": 9207097.0, + "step": 4320 + }, + { + "entropy": 1.8107630848884582, + "epoch": 1.423633969716919, + "grad_norm": 2.6625027656555176, + "learning_rate": 4.2123358746983225e-06, + "loss": 0.041, + "mean_token_accuracy": 0.9908157408237457, + "num_tokens": 9217832.0, + "step": 4325 + }, + { + "entropy": 1.8029837489128113, + "epoch": 1.4252797893350888, + "grad_norm": 1.867008090019226, + "learning_rate": 4.190164437341479e-06, + "loss": 0.0204, + "mean_token_accuracy": 0.9934187710285187, + "num_tokens": 9228324.0, + "step": 4330 + }, + { + "entropy": 1.7727699518203734, + "epoch": 1.4269256089532587, + "grad_norm": 5.103766918182373, + "learning_rate": 4.168036027937267e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9946336686611176, + "num_tokens": 9239199.0, + "step": 4335 + }, + { + "entropy": 1.8098699450492859, + "epoch": 1.4285714285714286, + "grad_norm": 0.7097901105880737, + "learning_rate": 4.145950810369863e-06, + "loss": 0.0226, + "mean_token_accuracy": 0.9928686380386352, + "num_tokens": 9249956.0, + "step": 4340 + }, + { + "entropy": 1.7923429489135743, + "epoch": 1.4302172481895985, + "grad_norm": 1.6268037557601929, + "learning_rate": 4.1239089482035686e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9906105041503906, + "num_tokens": 9260596.0, + "step": 4345 + }, + { + "entropy": 1.7969419360160828, + "epoch": 1.4318630678077682, + "grad_norm": 1.7350503206253052, + "learning_rate": 4.10191060468159e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9944913208484649, + "num_tokens": 9271060.0, + "step": 4350 + }, + { + "entropy": 1.7492168545722961, + "epoch": 1.433508887425938, + "grad_norm": 3.6994593143463135, + "learning_rate": 4.079955942724845e-06, + "loss": 0.0326, + "mean_token_accuracy": 0.989514821767807, + "num_tokens": 9281476.0, + "step": 4355 + }, + { + "entropy": 1.720914614200592, + "epoch": 1.435154707044108, + "grad_norm": 0.9848082661628723, + "learning_rate": 4.0580451249307195e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9936264753341675, + "num_tokens": 9292136.0, + "step": 4360 + }, + { + "entropy": 1.7023021578788757, + "epoch": 1.4368005266622779, + "grad_norm": 0.8126395344734192, + "learning_rate": 4.03617831357191e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.994192260503769, + "num_tokens": 9302979.0, + "step": 4365 + }, + { + "entropy": 1.6948873043060302, + "epoch": 1.4384463462804478, + "grad_norm": 4.326878547668457, + "learning_rate": 4.014355670595189e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9916200697422027, + "num_tokens": 9313652.0, + "step": 4370 + }, + { + "entropy": 1.7300003051757813, + "epoch": 1.4400921658986174, + "grad_norm": 0.8770890235900879, + "learning_rate": 3.99257735762021e-06, + "loss": 0.0311, + "mean_token_accuracy": 0.9879982471466064, + "num_tokens": 9324219.0, + "step": 4375 + }, + { + "entropy": 1.728563916683197, + "epoch": 1.4417379855167873, + "grad_norm": 1.0835927724838257, + "learning_rate": 3.970843535938332e-06, + "loss": 0.0218, + "mean_token_accuracy": 0.9914672791957855, + "num_tokens": 9334800.0, + "step": 4380 + }, + { + "entropy": 1.7277835011482239, + "epoch": 1.4433838051349572, + "grad_norm": 1.7556816339492798, + "learning_rate": 3.949154366511395e-06, + "loss": 0.024, + "mean_token_accuracy": 0.9926247537136078, + "num_tokens": 9345539.0, + "step": 4385 + }, + { + "entropy": 1.729397702217102, + "epoch": 1.4450296247531271, + "grad_norm": 0.777664840221405, + "learning_rate": 3.927510009970548e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9917639851570129, + "num_tokens": 9356267.0, + "step": 4390 + }, + { + "entropy": 1.7339849829673768, + "epoch": 1.446675444371297, + "grad_norm": 2.712010383605957, + "learning_rate": 3.905910626615046e-06, + "loss": 0.0407, + "mean_token_accuracy": 0.9871343851089478, + "num_tokens": 9367167.0, + "step": 4395 + }, + { + "entropy": 1.7719183206558227, + "epoch": 1.4483212639894667, + "grad_norm": 1.5582504272460938, + "learning_rate": 3.884356376411089e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9926240026950837, + "num_tokens": 9377539.0, + "step": 4400 + }, + { + "entropy": 1.7510367512702942, + "epoch": 1.4499670836076366, + "grad_norm": 1.32984459400177, + "learning_rate": 3.862847418990592e-06, + "loss": 0.0321, + "mean_token_accuracy": 0.9916741907596588, + "num_tokens": 9388175.0, + "step": 4405 + }, + { + "entropy": 1.7010186672210694, + "epoch": 1.4516129032258065, + "grad_norm": 2.487252950668335, + "learning_rate": 3.841383913650052e-06, + "loss": 0.0429, + "mean_token_accuracy": 0.9880555152893067, + "num_tokens": 9398803.0, + "step": 4410 + }, + { + "entropy": 1.7146742224693299, + "epoch": 1.4532587228439762, + "grad_norm": 1.8718352317810059, + "learning_rate": 3.819966019349334e-06, + "loss": 0.0141, + "mean_token_accuracy": 0.9953644871711731, + "num_tokens": 9409524.0, + "step": 4415 + }, + { + "entropy": 1.6977456569671632, + "epoch": 1.454904542462146, + "grad_norm": 0.736453115940094, + "learning_rate": 3.7985938947105073e-06, + "loss": 0.0157, + "mean_token_accuracy": 0.9962522923946381, + "num_tokens": 9420376.0, + "step": 4420 + }, + { + "entropy": 1.7264664053916932, + "epoch": 1.456550362080316, + "grad_norm": 1.4576939344406128, + "learning_rate": 3.7772676980166655e-06, + "loss": 0.0282, + "mean_token_accuracy": 0.9920338273048401, + "num_tokens": 9430868.0, + "step": 4425 + }, + { + "entropy": 1.6992250204086303, + "epoch": 1.4581961816984859, + "grad_norm": 1.5100425481796265, + "learning_rate": 3.7559875872107677e-06, + "loss": 0.0359, + "mean_token_accuracy": 0.9908050179481507, + "num_tokens": 9441532.0, + "step": 4430 + }, + { + "entropy": 1.7052590131759644, + "epoch": 1.4598420013166558, + "grad_norm": 2.6018645763397217, + "learning_rate": 3.7347537198944483e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.992498379945755, + "num_tokens": 9452300.0, + "step": 4435 + }, + { + "entropy": 1.6945829153060914, + "epoch": 1.4614878209348254, + "grad_norm": 1.3602592945098877, + "learning_rate": 3.71356625332686e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9949553668498993, + "num_tokens": 9462853.0, + "step": 4440 + }, + { + "entropy": 1.6705161213874817, + "epoch": 1.4631336405529953, + "grad_norm": 0.6543161273002625, + "learning_rate": 3.6924253444235224e-06, + "loss": 0.0178, + "mean_token_accuracy": 0.9919328927993775, + "num_tokens": 9473560.0, + "step": 4445 + }, + { + "entropy": 1.6843745946884154, + "epoch": 1.4647794601711652, + "grad_norm": 1.519394874572754, + "learning_rate": 3.671331149755123e-06, + "loss": 0.0143, + "mean_token_accuracy": 0.9956124365329743, + "num_tokens": 9484022.0, + "step": 4450 + }, + { + "entropy": 1.6935065150260926, + "epoch": 1.4664252797893351, + "grad_norm": 5.51362943649292, + "learning_rate": 3.6502838255464045e-06, + "loss": 0.0319, + "mean_token_accuracy": 0.9897434711456299, + "num_tokens": 9494354.0, + "step": 4455 + }, + { + "entropy": 1.632759189605713, + "epoch": 1.468071099407505, + "grad_norm": 1.1584962606430054, + "learning_rate": 3.6292835276749715e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.992765587568283, + "num_tokens": 9505092.0, + "step": 4460 + }, + { + "entropy": 1.6529253125190735, + "epoch": 1.4697169190256747, + "grad_norm": 0.6863316893577576, + "learning_rate": 3.6083304116701535e-06, + "loss": 0.018, + "mean_token_accuracy": 0.9955762267112732, + "num_tokens": 9515748.0, + "step": 4465 + }, + { + "entropy": 1.6669641017913819, + "epoch": 1.4713627386438446, + "grad_norm": 0.19445084035396576, + "learning_rate": 3.587424632711841e-06, + "loss": 0.0305, + "mean_token_accuracy": 0.9905413866043091, + "num_tokens": 9526278.0, + "step": 4470 + }, + { + "entropy": 1.6730605006217956, + "epoch": 1.4730085582620145, + "grad_norm": 2.6678473949432373, + "learning_rate": 3.56656634562936e-06, + "loss": 0.024, + "mean_token_accuracy": 0.9918078184127808, + "num_tokens": 9537087.0, + "step": 4475 + }, + { + "entropy": 1.6874117732048035, + "epoch": 1.4746543778801844, + "grad_norm": 0.8383910655975342, + "learning_rate": 3.5457557049002934e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9937337219715119, + "num_tokens": 9547735.0, + "step": 4480 + }, + { + "entropy": 1.68024080991745, + "epoch": 1.4763001974983543, + "grad_norm": 2.533155918121338, + "learning_rate": 3.52499286464936e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9925665855407715, + "num_tokens": 9558405.0, + "step": 4485 + }, + { + "entropy": 1.6580557942390441, + "epoch": 1.477946017116524, + "grad_norm": 4.7806830406188965, + "learning_rate": 3.5042779786472602e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9935833632946014, + "num_tokens": 9569152.0, + "step": 4490 + }, + { + "entropy": 1.6794435620307921, + "epoch": 1.4795918367346939, + "grad_norm": 2.641050100326538, + "learning_rate": 3.4836112003095524e-06, + "loss": 0.0237, + "mean_token_accuracy": 0.9917592287063599, + "num_tokens": 9579673.0, + "step": 4495 + }, + { + "entropy": 1.6873149633407594, + "epoch": 1.4812376563528638, + "grad_norm": 1.5926048755645752, + "learning_rate": 3.4629926826954997e-06, + "loss": 0.0347, + "mean_token_accuracy": 0.9919157564640045, + "num_tokens": 9590146.0, + "step": 4500 + }, + { + "entropy": 1.6695775389671326, + "epoch": 1.4828834759710336, + "grad_norm": 3.867030382156372, + "learning_rate": 3.4424225785069444e-06, + "loss": 0.0248, + "mean_token_accuracy": 0.993567830324173, + "num_tokens": 9600795.0, + "step": 4505 + }, + { + "entropy": 1.6816540002822875, + "epoch": 1.4845292955892035, + "grad_norm": 2.597640037536621, + "learning_rate": 3.421901040087177e-06, + "loss": 0.0253, + "mean_token_accuracy": 0.9921609818935394, + "num_tokens": 9611539.0, + "step": 4510 + }, + { + "entropy": 1.6816054821014403, + "epoch": 1.4861751152073732, + "grad_norm": 0.9247928857803345, + "learning_rate": 3.4014282194198045e-06, + "loss": 0.0328, + "mean_token_accuracy": 0.9923509240150452, + "num_tokens": 9621948.0, + "step": 4515 + }, + { + "entropy": 1.7215697646141053, + "epoch": 1.4878209348255431, + "grad_norm": 1.9585981369018555, + "learning_rate": 3.381004268127638e-06, + "loss": 0.0158, + "mean_token_accuracy": 0.995547354221344, + "num_tokens": 9632351.0, + "step": 4520 + }, + { + "entropy": 1.7168682098388672, + "epoch": 1.489466754443713, + "grad_norm": 1.5342450141906738, + "learning_rate": 3.360629337471548e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9930002510547637, + "num_tokens": 9643093.0, + "step": 4525 + }, + { + "entropy": 1.6930954933166504, + "epoch": 1.4911125740618827, + "grad_norm": 1.625235915184021, + "learning_rate": 3.340303578349361e-06, + "loss": 0.0175, + "mean_token_accuracy": 0.9939290165901185, + "num_tokens": 9653833.0, + "step": 4530 + }, + { + "entropy": 1.7007075071334838, + "epoch": 1.4927583936800526, + "grad_norm": 0.7999654412269592, + "learning_rate": 3.3200271412947294e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9946927070617676, + "num_tokens": 9664334.0, + "step": 4535 + }, + { + "entropy": 1.670901095867157, + "epoch": 1.4944042132982225, + "grad_norm": 0.8723247647285461, + "learning_rate": 3.2998001764760414e-06, + "loss": 0.0173, + "mean_token_accuracy": 0.9957589745521546, + "num_tokens": 9675067.0, + "step": 4540 + }, + { + "entropy": 1.7091269135475158, + "epoch": 1.4960500329163924, + "grad_norm": 3.8337857723236084, + "learning_rate": 3.2796228336952663e-06, + "loss": 0.0306, + "mean_token_accuracy": 0.9932047963142395, + "num_tokens": 9685487.0, + "step": 4545 + }, + { + "entropy": 1.7051352858543396, + "epoch": 1.4976958525345623, + "grad_norm": 2.0214622020721436, + "learning_rate": 3.259495262386888e-06, + "loss": 0.0257, + "mean_token_accuracy": 0.9929771959781647, + "num_tokens": 9696067.0, + "step": 4550 + }, + { + "entropy": 1.716159415245056, + "epoch": 1.499341672152732, + "grad_norm": 1.7375526428222656, + "learning_rate": 3.2394176116167818e-06, + "loss": 0.0188, + "mean_token_accuracy": 0.9942406177520752, + "num_tokens": 9706729.0, + "step": 4555 + }, + { + "entropy": 1.712713897228241, + "epoch": 1.500987491770902, + "grad_norm": 3.6430742740631104, + "learning_rate": 3.2193900300810908e-06, + "loss": 0.0252, + "mean_token_accuracy": 0.9919379532337189, + "num_tokens": 9717306.0, + "step": 4560 + }, + { + "entropy": 1.6966816425323485, + "epoch": 1.5026333113890717, + "grad_norm": 0.11951875686645508, + "learning_rate": 3.1994126661051628e-06, + "loss": 0.0111, + "mean_token_accuracy": 0.9968602657318115, + "num_tokens": 9728013.0, + "step": 4565 + }, + { + "entropy": 1.7037412881851197, + "epoch": 1.5042791310072416, + "grad_norm": 1.588755488395691, + "learning_rate": 3.179485667642419e-06, + "loss": 0.0284, + "mean_token_accuracy": 0.989536440372467, + "num_tokens": 9738874.0, + "step": 4570 + }, + { + "entropy": 1.6956771850585937, + "epoch": 1.5059249506254115, + "grad_norm": 0.9138298034667969, + "learning_rate": 3.1596091822732755e-06, + "loss": 0.013, + "mean_token_accuracy": 0.994870787858963, + "num_tokens": 9749398.0, + "step": 4575 + }, + { + "entropy": 1.7180339097976685, + "epoch": 1.5075707702435812, + "grad_norm": 3.6119987964630127, + "learning_rate": 3.1397833572040414e-06, + "loss": 0.0471, + "mean_token_accuracy": 0.9881613373756408, + "num_tokens": 9760121.0, + "step": 4580 + }, + { + "entropy": 1.7082261800765992, + "epoch": 1.5092165898617511, + "grad_norm": 3.047520875930786, + "learning_rate": 3.1200083392658464e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9932926952838897, + "num_tokens": 9770729.0, + "step": 4585 + }, + { + "entropy": 1.696124267578125, + "epoch": 1.510862409479921, + "grad_norm": 4.276211738586426, + "learning_rate": 3.1002842749135175e-06, + "loss": 0.025, + "mean_token_accuracy": 0.992525440454483, + "num_tokens": 9781733.0, + "step": 4590 + }, + { + "entropy": 1.689008605480194, + "epoch": 1.5125082290980907, + "grad_norm": 0.7173812389373779, + "learning_rate": 3.0806113102245395e-06, + "loss": 0.027, + "mean_token_accuracy": 0.9932247400283813, + "num_tokens": 9792414.0, + "step": 4595 + }, + { + "entropy": 1.6812505960464477, + "epoch": 1.5141540487162608, + "grad_norm": 1.7421464920043945, + "learning_rate": 3.0609895908979347e-06, + "loss": 0.0297, + "mean_token_accuracy": 0.9890223205089569, + "num_tokens": 9802799.0, + "step": 4600 + }, + { + "entropy": 1.7080901622772218, + "epoch": 1.5157998683344305, + "grad_norm": 2.5039777755737305, + "learning_rate": 3.041419262253208e-06, + "loss": 0.0217, + "mean_token_accuracy": 0.993123060464859, + "num_tokens": 9813536.0, + "step": 4605 + }, + { + "entropy": 1.703446888923645, + "epoch": 1.5174456879526004, + "grad_norm": 1.0789647102355957, + "learning_rate": 3.021900469229253e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9934683322906495, + "num_tokens": 9824215.0, + "step": 4610 + }, + { + "entropy": 1.7133625149726868, + "epoch": 1.5190915075707703, + "grad_norm": 1.688781499862671, + "learning_rate": 3.0024333563833007e-06, + "loss": 0.0276, + "mean_token_accuracy": 0.9893021941184997, + "num_tokens": 9834752.0, + "step": 4615 + }, + { + "entropy": 1.7214069485664367, + "epoch": 1.52073732718894, + "grad_norm": 3.265244722366333, + "learning_rate": 2.983018067889828e-06, + "loss": 0.0352, + "mean_token_accuracy": 0.9897504031658173, + "num_tokens": 9845611.0, + "step": 4620 + }, + { + "entropy": 1.711961841583252, + "epoch": 1.52238314680711, + "grad_norm": 6.8311333656311035, + "learning_rate": 2.963654747539494e-06, + "loss": 0.0225, + "mean_token_accuracy": 0.9920111656188965, + "num_tokens": 9856980.0, + "step": 4625 + }, + { + "entropy": 1.7189836978912354, + "epoch": 1.5240289664252797, + "grad_norm": 1.7447693347930908, + "learning_rate": 2.9443435387380936e-06, + "loss": 0.012, + "mean_token_accuracy": 0.9950068473815918, + "num_tokens": 9867771.0, + "step": 4630 + }, + { + "entropy": 1.7515931606292725, + "epoch": 1.5256747860434496, + "grad_norm": 1.2482762336730957, + "learning_rate": 2.9250845845054586e-06, + "loss": 0.0279, + "mean_token_accuracy": 0.9916983366012573, + "num_tokens": 9878291.0, + "step": 4635 + }, + { + "entropy": 1.746778416633606, + "epoch": 1.5273206056616195, + "grad_norm": 0.6369831562042236, + "learning_rate": 2.9058780274744426e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9942344844341278, + "num_tokens": 9888741.0, + "step": 4640 + }, + { + "entropy": 1.741689419746399, + "epoch": 1.5289664252797892, + "grad_norm": 1.0881776809692383, + "learning_rate": 2.8867240098898297e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9928946614265441, + "num_tokens": 9899481.0, + "step": 4645 + }, + { + "entropy": 1.715234887599945, + "epoch": 1.5306122448979593, + "grad_norm": 2.7303154468536377, + "learning_rate": 2.8676226736072975e-06, + "loss": 0.0286, + "mean_token_accuracy": 0.9908973276615143, + "num_tokens": 9910093.0, + "step": 4650 + }, + { + "entropy": 1.7300845026969909, + "epoch": 1.532258064516129, + "grad_norm": 0.8988632559776306, + "learning_rate": 2.848574160092362e-06, + "loss": 0.0265, + "mean_token_accuracy": 0.9913157403469086, + "num_tokens": 9920988.0, + "step": 4655 + }, + { + "entropy": 1.7755305886268615, + "epoch": 1.533903884134299, + "grad_norm": 1.2511268854141235, + "learning_rate": 2.829578610419337e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.9922440350055695, + "num_tokens": 9931448.0, + "step": 4660 + }, + { + "entropy": 1.7372457265853882, + "epoch": 1.5355497037524688, + "grad_norm": 1.8746650218963623, + "learning_rate": 2.810636165270274e-06, + "loss": 0.0448, + "mean_token_accuracy": 0.9905406355857849, + "num_tokens": 9942334.0, + "step": 4665 + }, + { + "entropy": 1.7496211767196654, + "epoch": 1.5371955233706385, + "grad_norm": 3.1501920223236084, + "learning_rate": 2.7917469649339356e-06, + "loss": 0.0177, + "mean_token_accuracy": 0.9929421722888947, + "num_tokens": 9952925.0, + "step": 4670 + }, + { + "entropy": 1.769037902355194, + "epoch": 1.5388413429888086, + "grad_norm": 1.5442581176757812, + "learning_rate": 2.7729111493047458e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9939168155193329, + "num_tokens": 9963276.0, + "step": 4675 + }, + { + "entropy": 1.7426852703094482, + "epoch": 1.5404871626069783, + "grad_norm": 1.5778064727783203, + "learning_rate": 2.754128857881768e-06, + "loss": 0.022, + "mean_token_accuracy": 0.9926289975643158, + "num_tokens": 9974069.0, + "step": 4680 + }, + { + "entropy": 1.7343324899673462, + "epoch": 1.5421329822251482, + "grad_norm": 0.8580919504165649, + "learning_rate": 2.735400229767652e-06, + "loss": 0.0208, + "mean_token_accuracy": 0.9934548079967499, + "num_tokens": 9984603.0, + "step": 4685 + }, + { + "entropy": 1.7297178983688355, + "epoch": 1.543778801843318, + "grad_norm": 1.2886171340942383, + "learning_rate": 2.7167254036676183e-06, + "loss": 0.0261, + "mean_token_accuracy": 0.9902363300323487, + "num_tokens": 9995352.0, + "step": 4690 + }, + { + "entropy": 1.6793412566184998, + "epoch": 1.5454246214614877, + "grad_norm": 0.6308950185775757, + "learning_rate": 2.698104517888427e-06, + "loss": 0.0088, + "mean_token_accuracy": 0.9957121074199676, + "num_tokens": 10006198.0, + "step": 4695 + }, + { + "entropy": 1.712625253200531, + "epoch": 1.5470704410796576, + "grad_norm": 4.383965969085693, + "learning_rate": 2.679537710337352e-06, + "loss": 0.0301, + "mean_token_accuracy": 0.990826541185379, + "num_tokens": 10017039.0, + "step": 4700 + }, + { + "entropy": 1.7353214979171754, + "epoch": 1.5487162606978275, + "grad_norm": 0.49334779381752014, + "learning_rate": 2.6610251185211657e-06, + "loss": 0.029, + "mean_token_accuracy": 0.9952426433563233, + "num_tokens": 10027402.0, + "step": 4705 + }, + { + "entropy": 1.6911566019058228, + "epoch": 1.5503620803159972, + "grad_norm": 2.034087896347046, + "learning_rate": 2.6425668795451107e-06, + "loss": 0.0281, + "mean_token_accuracy": 0.9905034959316253, + "num_tokens": 10038046.0, + "step": 4710 + }, + { + "entropy": 1.69894437789917, + "epoch": 1.5520078999341673, + "grad_norm": 2.4216017723083496, + "learning_rate": 2.624163130111891e-06, + "loss": 0.0165, + "mean_token_accuracy": 0.9943257391452789, + "num_tokens": 10048564.0, + "step": 4715 + }, + { + "entropy": 1.7041826963424682, + "epoch": 1.553653719552337, + "grad_norm": 1.8711634874343872, + "learning_rate": 2.605814006520655e-06, + "loss": 0.0154, + "mean_token_accuracy": 0.9945900738239288, + "num_tokens": 10059017.0, + "step": 4720 + }, + { + "entropy": 1.6660914540290832, + "epoch": 1.555299539170507, + "grad_norm": 0.6987809538841248, + "learning_rate": 2.587519644666001e-06, + "loss": 0.03, + "mean_token_accuracy": 0.991539990901947, + "num_tokens": 10069792.0, + "step": 4725 + }, + { + "entropy": 1.6404382228851317, + "epoch": 1.5569453587886768, + "grad_norm": 0.9349547028541565, + "learning_rate": 2.5692801800369406e-06, + "loss": 0.0161, + "mean_token_accuracy": 0.9950057923793793, + "num_tokens": 10080522.0, + "step": 4730 + }, + { + "entropy": 1.6546894431114196, + "epoch": 1.5585911784068465, + "grad_norm": 2.1536412239074707, + "learning_rate": 2.5510957477159257e-06, + "loss": 0.0295, + "mean_token_accuracy": 0.9922216713428498, + "num_tokens": 10091760.0, + "step": 4735 + }, + { + "entropy": 1.6477965712547302, + "epoch": 1.5602369980250166, + "grad_norm": 0.56485915184021, + "learning_rate": 2.5329664823778444e-06, + "loss": 0.0114, + "mean_token_accuracy": 0.995609724521637, + "num_tokens": 10102446.0, + "step": 4740 + }, + { + "entropy": 1.699891173839569, + "epoch": 1.5618828176431863, + "grad_norm": 2.4956679344177246, + "learning_rate": 2.514892518288988e-06, + "loss": 0.0398, + "mean_token_accuracy": 0.9890912711620331, + "num_tokens": 10113234.0, + "step": 4745 + }, + { + "entropy": 1.6791865229606628, + "epoch": 1.5635286372613562, + "grad_norm": 2.926832914352417, + "learning_rate": 2.4968739893061132e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9936101496219635, + "num_tokens": 10124034.0, + "step": 4750 + }, + { + "entropy": 1.6816865921020507, + "epoch": 1.565174456879526, + "grad_norm": 1.028403401374817, + "learning_rate": 2.4789110288754038e-06, + "loss": 0.0326, + "mean_token_accuracy": 0.9936225533485412, + "num_tokens": 10134671.0, + "step": 4755 + }, + { + "entropy": 1.7139351129531861, + "epoch": 1.5668202764976957, + "grad_norm": 4.163893222808838, + "learning_rate": 2.461003770031504e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9935286998748779, + "num_tokens": 10145418.0, + "step": 4760 + }, + { + "entropy": 1.7017668962478638, + "epoch": 1.5684660961158658, + "grad_norm": 2.05292010307312, + "learning_rate": 2.4431523453965266e-06, + "loss": 0.0295, + "mean_token_accuracy": 0.9896788716316223, + "num_tokens": 10156080.0, + "step": 4765 + }, + { + "entropy": 1.7077699899673462, + "epoch": 1.5701119157340355, + "grad_norm": 1.2193189859390259, + "learning_rate": 2.4253568871790857e-06, + "loss": 0.0426, + "mean_token_accuracy": 0.9874286651611328, + "num_tokens": 10166779.0, + "step": 4770 + }, + { + "entropy": 1.7373961925506591, + "epoch": 1.5717577353522054, + "grad_norm": 0.09195420891046524, + "learning_rate": 2.407617527173285e-06, + "loss": 0.0234, + "mean_token_accuracy": 0.9929518520832061, + "num_tokens": 10177151.0, + "step": 4775 + }, + { + "entropy": 1.753466045856476, + "epoch": 1.5734035549703753, + "grad_norm": 1.3904253244400024, + "learning_rate": 2.3899343967577803e-06, + "loss": 0.0269, + "mean_token_accuracy": 0.990022599697113, + "num_tokens": 10187902.0, + "step": 4780 + }, + { + "entropy": 1.7257111310958861, + "epoch": 1.575049374588545, + "grad_norm": 1.9471051692962646, + "learning_rate": 2.3723076268947777e-06, + "loss": 0.0231, + "mean_token_accuracy": 0.9913094937801361, + "num_tokens": 10198342.0, + "step": 4785 + }, + { + "entropy": 1.6994914054870605, + "epoch": 1.576695194206715, + "grad_norm": 1.1079803705215454, + "learning_rate": 2.354737348129077e-06, + "loss": 0.0127, + "mean_token_accuracy": 0.994962739944458, + "num_tokens": 10209017.0, + "step": 4790 + }, + { + "entropy": 1.7078461527824402, + "epoch": 1.5783410138248848, + "grad_norm": 0.8264980316162109, + "learning_rate": 2.337223690587098e-06, + "loss": 0.019, + "mean_token_accuracy": 0.9939144611358642, + "num_tokens": 10219775.0, + "step": 4795 + }, + { + "entropy": 1.690078580379486, + "epoch": 1.5799868334430547, + "grad_norm": 1.1476809978485107, + "learning_rate": 2.3197667839759307e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9934214890003205, + "num_tokens": 10230654.0, + "step": 4800 + }, + { + "entropy": 1.7367305874824523, + "epoch": 1.5816326530612246, + "grad_norm": 0.9815186262130737, + "learning_rate": 2.302366757582355e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9940285742282867, + "num_tokens": 10241163.0, + "step": 4805 + }, + { + "entropy": 1.7170762538909912, + "epoch": 1.5832784726793943, + "grad_norm": 2.2614142894744873, + "learning_rate": 2.285023740271893e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.994717925786972, + "num_tokens": 10252132.0, + "step": 4810 + }, + { + "entropy": 1.6848518610000611, + "epoch": 1.5849242922975642, + "grad_norm": 0.6454903483390808, + "learning_rate": 2.267737860487865e-06, + "loss": 0.0182, + "mean_token_accuracy": 0.9939591407775878, + "num_tokens": 10263052.0, + "step": 4815 + }, + { + "entropy": 1.7132264494895935, + "epoch": 1.586570111915734, + "grad_norm": 1.0455392599105835, + "learning_rate": 2.2505092462504153e-06, + "loss": 0.0134, + "mean_token_accuracy": 0.9961919903755188, + "num_tokens": 10273616.0, + "step": 4820 + }, + { + "entropy": 1.7440693497657775, + "epoch": 1.5882159315339037, + "grad_norm": 1.3470888137817383, + "learning_rate": 2.2333380251555826e-06, + "loss": 0.0279, + "mean_token_accuracy": 0.9919908702373504, + "num_tokens": 10284093.0, + "step": 4825 + }, + { + "entropy": 1.7089533686637879, + "epoch": 1.5898617511520738, + "grad_norm": 1.1258236169815063, + "learning_rate": 2.2162243243743485e-06, + "loss": 0.0233, + "mean_token_accuracy": 0.9929437875747681, + "num_tokens": 10294993.0, + "step": 4830 + }, + { + "entropy": 1.677238130569458, + "epoch": 1.5915075707702435, + "grad_norm": 3.9911301136016846, + "learning_rate": 2.1991682706516935e-06, + "loss": 0.0294, + "mean_token_accuracy": 0.9925793290138245, + "num_tokens": 10305763.0, + "step": 4835 + }, + { + "entropy": 1.679075539112091, + "epoch": 1.5931533903884134, + "grad_norm": 2.0645158290863037, + "learning_rate": 2.1821699903056627e-06, + "loss": 0.028, + "mean_token_accuracy": 0.9916234374046325, + "num_tokens": 10316288.0, + "step": 4840 + }, + { + "entropy": 1.6844815373420716, + "epoch": 1.5947992100065833, + "grad_norm": 0.7942723631858826, + "learning_rate": 2.1652296092264324e-06, + "loss": 0.0288, + "mean_token_accuracy": 0.9914738476276398, + "num_tokens": 10327136.0, + "step": 4845 + }, + { + "entropy": 1.681308126449585, + "epoch": 1.596445029624753, + "grad_norm": 2.3584067821502686, + "learning_rate": 2.148347252875368e-06, + "loss": 0.0106, + "mean_token_accuracy": 0.9962167620658875, + "num_tokens": 10337795.0, + "step": 4850 + }, + { + "entropy": 1.6970802187919616, + "epoch": 1.598090849242923, + "grad_norm": 0.7867595553398132, + "learning_rate": 2.1315230462840985e-06, + "loss": 0.0253, + "mean_token_accuracy": 0.9912108719348908, + "num_tokens": 10348719.0, + "step": 4855 + }, + { + "entropy": 1.706315529346466, + "epoch": 1.5997366688610928, + "grad_norm": 1.8649603128433228, + "learning_rate": 2.114757114053605e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.992872542142868, + "num_tokens": 10359442.0, + "step": 4860 + }, + { + "epoch": 1.6010533245556287, + "eval_entropy": 1.6943830250834868, + "eval_loss": 0.048175811767578125, + "eval_mean_token_accuracy": 0.9870549200333505, + "eval_num_tokens": 10368037.0, + "eval_runtime": 196.4348, + "eval_samples_per_second": 42.391, + "eval_steps_per_second": 7.066, + "step": 4864 + }, + { + "entropy": 1.7008237361907959, + "epoch": 1.6013824884792627, + "grad_norm": 0.8595287799835205, + "learning_rate": 2.098049580353273e-06, + "loss": 0.0191, + "mean_token_accuracy": 0.9942748367786407, + "num_tokens": 10370122.0, + "step": 4865 + }, + { + "entropy": 1.697910237312317, + "epoch": 1.6030283080974326, + "grad_norm": 1.504879355430603, + "learning_rate": 2.08140056891999e-06, + "loss": 0.0133, + "mean_token_accuracy": 0.9958468794822692, + "num_tokens": 10380595.0, + "step": 4870 + }, + { + "entropy": 1.6834197640419006, + "epoch": 1.6046741277156022, + "grad_norm": 6.986445903778076, + "learning_rate": 2.0648102030572225e-06, + "loss": 0.03, + "mean_token_accuracy": 0.9888522446155548, + "num_tokens": 10391431.0, + "step": 4875 + }, + { + "entropy": 1.6846235394477844, + "epoch": 1.6063199473337724, + "grad_norm": 4.83704137802124, + "learning_rate": 2.048278605634113e-06, + "loss": 0.0198, + "mean_token_accuracy": 0.9933947384357452, + "num_tokens": 10402189.0, + "step": 4880 + }, + { + "entropy": 1.6738305926322936, + "epoch": 1.607965766951942, + "grad_norm": 1.9007184505462646, + "learning_rate": 2.0318058990845467e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9959712147712707, + "num_tokens": 10412659.0, + "step": 4885 + }, + { + "entropy": 1.668698501586914, + "epoch": 1.609611586570112, + "grad_norm": 5.049774169921875, + "learning_rate": 2.0153922054062758e-06, + "loss": 0.0212, + "mean_token_accuracy": 0.9936519682407379, + "num_tokens": 10423408.0, + "step": 4890 + }, + { + "entropy": 1.6653715133666993, + "epoch": 1.6112574061882818, + "grad_norm": 2.830352306365967, + "learning_rate": 1.999037646159989e-06, + "loss": 0.0142, + "mean_token_accuracy": 0.9954032123088836, + "num_tokens": 10434264.0, + "step": 4895 + }, + { + "entropy": 1.6867196083068847, + "epoch": 1.6129032258064515, + "grad_norm": 2.5599348545074463, + "learning_rate": 1.9827423424684267e-06, + "loss": 0.0332, + "mean_token_accuracy": 0.9906599521636963, + "num_tokens": 10445083.0, + "step": 4900 + }, + { + "entropy": 1.6548929691314698, + "epoch": 1.6145490454246214, + "grad_norm": 1.4400912523269653, + "learning_rate": 1.966506415015477e-06, + "loss": 0.0193, + "mean_token_accuracy": 0.994045513868332, + "num_tokens": 10455748.0, + "step": 4905 + }, + { + "entropy": 1.704959547519684, + "epoch": 1.6161948650427913, + "grad_norm": 1.0404367446899414, + "learning_rate": 1.9503299840452927e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9917968034744262, + "num_tokens": 10466258.0, + "step": 4910 + }, + { + "entropy": 1.7084056258201599, + "epoch": 1.6178406846609612, + "grad_norm": 1.2847148180007935, + "learning_rate": 1.9342131693613763e-06, + "loss": 0.0245, + "mean_token_accuracy": 0.9919559478759765, + "num_tokens": 10476806.0, + "step": 4915 + }, + { + "entropy": 1.6556385517120362, + "epoch": 1.619486504279131, + "grad_norm": 2.1901090145111084, + "learning_rate": 1.9181560903257234e-06, + "loss": 0.0242, + "mean_token_accuracy": 0.992319130897522, + "num_tokens": 10487881.0, + "step": 4920 + }, + { + "entropy": 1.6394746541976928, + "epoch": 1.6211323238973008, + "grad_norm": 1.6991859674453735, + "learning_rate": 1.9021588658579249e-06, + "loss": 0.0259, + "mean_token_accuracy": 0.9934229254722595, + "num_tokens": 10498864.0, + "step": 4925 + }, + { + "entropy": 1.7088184237480164, + "epoch": 1.6227781435154707, + "grad_norm": 1.4522404670715332, + "learning_rate": 1.8862216144342692e-06, + "loss": 0.0176, + "mean_token_accuracy": 0.9922803401947021, + "num_tokens": 10509379.0, + "step": 4930 + }, + { + "entropy": 1.6745522737503051, + "epoch": 1.6244239631336406, + "grad_norm": 0.5447269678115845, + "learning_rate": 1.870344454086901e-06, + "loss": 0.0315, + "mean_token_accuracy": 0.9911853075027466, + "num_tokens": 10519980.0, + "step": 4935 + }, + { + "entropy": 1.6869741559028626, + "epoch": 1.6260697827518102, + "grad_norm": 1.314532995223999, + "learning_rate": 1.8545275024029141e-06, + "loss": 0.0194, + "mean_token_accuracy": 0.9942934930324554, + "num_tokens": 10530726.0, + "step": 4940 + }, + { + "entropy": 1.6622373104095458, + "epoch": 1.6277156023699804, + "grad_norm": 1.3033603429794312, + "learning_rate": 1.838770876523498e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9951237618923188, + "num_tokens": 10541525.0, + "step": 4945 + }, + { + "entropy": 1.6855845212936402, + "epoch": 1.62936142198815, + "grad_norm": 2.0838241577148438, + "learning_rate": 1.8230746931430642e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9907946407794952, + "num_tokens": 10552191.0, + "step": 4950 + }, + { + "entropy": 1.6786072969436645, + "epoch": 1.63100724160632, + "grad_norm": 0.6673282384872437, + "learning_rate": 1.807439068508392e-06, + "loss": 0.0107, + "mean_token_accuracy": 0.9967107653617859, + "num_tokens": 10562922.0, + "step": 4955 + }, + { + "entropy": 1.6981304287910461, + "epoch": 1.6326530612244898, + "grad_norm": 0.6075028777122498, + "learning_rate": 1.7918641184177444e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.992760181427002, + "num_tokens": 10573352.0, + "step": 4960 + }, + { + "entropy": 1.6855650663375854, + "epoch": 1.6342988808426595, + "grad_norm": 1.076545000076294, + "learning_rate": 1.7763499582200405e-06, + "loss": 0.023, + "mean_token_accuracy": 0.9909692645072937, + "num_tokens": 10584337.0, + "step": 4965 + }, + { + "entropy": 1.6684273719787597, + "epoch": 1.6359447004608296, + "grad_norm": 0.9483628869056702, + "learning_rate": 1.7608967028139767e-06, + "loss": 0.025, + "mean_token_accuracy": 0.9940790653228759, + "num_tokens": 10595236.0, + "step": 4970 + }, + { + "entropy": 1.6795395970344544, + "epoch": 1.6375905200789993, + "grad_norm": 2.4686009883880615, + "learning_rate": 1.7455044666471875e-06, + "loss": 0.0407, + "mean_token_accuracy": 0.9889774978160858, + "num_tokens": 10605904.0, + "step": 4975 + }, + { + "entropy": 1.7077094316482544, + "epoch": 1.6392363396971692, + "grad_norm": 1.020444393157959, + "learning_rate": 1.7301733637153994e-06, + "loss": 0.0318, + "mean_token_accuracy": 0.9912243783473969, + "num_tokens": 10616311.0, + "step": 4980 + }, + { + "entropy": 1.6756536841392518, + "epoch": 1.640882159315339, + "grad_norm": 4.183596134185791, + "learning_rate": 1.7149035075615795e-06, + "loss": 0.0325, + "mean_token_accuracy": 0.9907270252704621, + "num_tokens": 10627079.0, + "step": 4985 + }, + { + "entropy": 1.6901473999023438, + "epoch": 1.6425279789335088, + "grad_norm": 1.742541790008545, + "learning_rate": 1.6996950112750964e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9967920780181885, + "num_tokens": 10637698.0, + "step": 4990 + }, + { + "entropy": 1.665801465511322, + "epoch": 1.6441737985516789, + "grad_norm": 2.1923062801361084, + "learning_rate": 1.6845479874908865e-06, + "loss": 0.0236, + "mean_token_accuracy": 0.9916431307792664, + "num_tokens": 10648267.0, + "step": 4995 + }, + { + "entropy": 1.6393486261367798, + "epoch": 1.6458196181698486, + "grad_norm": 0.6840113401412964, + "learning_rate": 1.6694625483886195e-06, + "loss": 0.0164, + "mean_token_accuracy": 0.9945251405239105, + "num_tokens": 10659109.0, + "step": 5000 + }, + { + "entropy": 1.6628060221672059, + "epoch": 1.6474654377880185, + "grad_norm": 2.219745397567749, + "learning_rate": 1.6544388056918614e-06, + "loss": 0.0254, + "mean_token_accuracy": 0.9924133539199829, + "num_tokens": 10669828.0, + "step": 5005 + }, + { + "entropy": 1.6695847511291504, + "epoch": 1.6491112574061884, + "grad_norm": 1.5060310363769531, + "learning_rate": 1.6394768706672547e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9916559994220734, + "num_tokens": 10680094.0, + "step": 5010 + }, + { + "entropy": 1.659652292728424, + "epoch": 1.650757077024358, + "grad_norm": 3.3251304626464844, + "learning_rate": 1.6245768541236894e-06, + "loss": 0.0202, + "mean_token_accuracy": 0.9898216426372528, + "num_tokens": 10690770.0, + "step": 5015 + }, + { + "entropy": 1.6149708151817321, + "epoch": 1.652402896642528, + "grad_norm": 1.9098771810531616, + "learning_rate": 1.6097388664114833e-06, + "loss": 0.0147, + "mean_token_accuracy": 0.9929858863353729, + "num_tokens": 10702024.0, + "step": 5020 + }, + { + "entropy": 1.6477410674095154, + "epoch": 1.6540487162606978, + "grad_norm": 1.96040940284729, + "learning_rate": 1.5949630174215647e-06, + "loss": 0.0144, + "mean_token_accuracy": 0.9963162183761597, + "num_tokens": 10712833.0, + "step": 5025 + }, + { + "entropy": 1.6466076374053955, + "epoch": 1.6556945358788677, + "grad_norm": 0.9714366793632507, + "learning_rate": 1.580249416584666e-06, + "loss": 0.0365, + "mean_token_accuracy": 0.9881292939186096, + "num_tokens": 10723440.0, + "step": 5030 + }, + { + "entropy": 1.669557237625122, + "epoch": 1.6573403554970376, + "grad_norm": 1.7277604341506958, + "learning_rate": 1.5655981728704973e-06, + "loss": 0.0239, + "mean_token_accuracy": 0.991591501235962, + "num_tokens": 10733955.0, + "step": 5035 + }, + { + "entropy": 1.6399479150772094, + "epoch": 1.6589861751152073, + "grad_norm": 5.28700590133667, + "learning_rate": 1.5510093947869508e-06, + "loss": 0.0201, + "mean_token_accuracy": 0.9937320530414582, + "num_tokens": 10744942.0, + "step": 5040 + }, + { + "entropy": 1.688657033443451, + "epoch": 1.6606319947333772, + "grad_norm": 6.3327813148498535, + "learning_rate": 1.536483190379302e-06, + "loss": 0.0227, + "mean_token_accuracy": 0.9922419965267182, + "num_tokens": 10755594.0, + "step": 5045 + }, + { + "entropy": 1.6918665885925293, + "epoch": 1.662277814351547, + "grad_norm": 0.1568760871887207, + "learning_rate": 1.522019667229393e-06, + "loss": 0.009, + "mean_token_accuracy": 0.9979844152927398, + "num_tokens": 10766398.0, + "step": 5050 + }, + { + "entropy": 1.6314533591270446, + "epoch": 1.6639236339697168, + "grad_norm": 1.880570411682129, + "learning_rate": 1.5076189324548506e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.9929390490055084, + "num_tokens": 10777255.0, + "step": 5055 + }, + { + "entropy": 1.6015793800354003, + "epoch": 1.6655694535878869, + "grad_norm": 1.6532872915267944, + "learning_rate": 1.493281092708283e-06, + "loss": 0.0153, + "mean_token_accuracy": 0.9948068380355835, + "num_tokens": 10788343.0, + "step": 5060 + }, + { + "entropy": 1.6586069226264955, + "epoch": 1.6672152732060566, + "grad_norm": 1.2856965065002441, + "learning_rate": 1.479006254176505e-06, + "loss": 0.0203, + "mean_token_accuracy": 0.992956918478012, + "num_tokens": 10799130.0, + "step": 5065 + }, + { + "entropy": 1.6501345872879027, + "epoch": 1.6688610928242265, + "grad_norm": 2.5434141159057617, + "learning_rate": 1.4647945225797244e-06, + "loss": 0.0261, + "mean_token_accuracy": 0.9912129878997803, + "num_tokens": 10809881.0, + "step": 5070 + }, + { + "entropy": 1.614508295059204, + "epoch": 1.6705069124423964, + "grad_norm": 0.4630468189716339, + "learning_rate": 1.4506460031707903e-06, + "loss": 0.0148, + "mean_token_accuracy": 0.9954249680042266, + "num_tokens": 10820794.0, + "step": 5075 + }, + { + "entropy": 1.645851767063141, + "epoch": 1.672152732060566, + "grad_norm": 2.122525930404663, + "learning_rate": 1.4365608007343922e-06, + "loss": 0.026, + "mean_token_accuracy": 0.9927963316440582, + "num_tokens": 10831234.0, + "step": 5080 + }, + { + "entropy": 1.6439926147460937, + "epoch": 1.6737985516787361, + "grad_norm": 0.7459424138069153, + "learning_rate": 1.4225390195862932e-06, + "loss": 0.0241, + "mean_token_accuracy": 0.9936447679996491, + "num_tokens": 10842096.0, + "step": 5085 + }, + { + "entropy": 1.6597126364707946, + "epoch": 1.6754443712969058, + "grad_norm": 1.7774500846862793, + "learning_rate": 1.4085807635725491e-06, + "loss": 0.0235, + "mean_token_accuracy": 0.9914094746112824, + "num_tokens": 10852586.0, + "step": 5090 + }, + { + "entropy": 1.7044395089149476, + "epoch": 1.6770901909150757, + "grad_norm": 1.1517224311828613, + "learning_rate": 1.3946861360687548e-06, + "loss": 0.0174, + "mean_token_accuracy": 0.9941137075424195, + "num_tokens": 10862934.0, + "step": 5095 + }, + { + "entropy": 1.6369341015815735, + "epoch": 1.6787360105332456, + "grad_norm": 0.2626267671585083, + "learning_rate": 1.380855239979264e-06, + "loss": 0.017, + "mean_token_accuracy": 0.99591423869133, + "num_tokens": 10873852.0, + "step": 5100 + }, + { + "entropy": 1.6440617442131042, + "epoch": 1.6803818301514153, + "grad_norm": 2.385711193084717, + "learning_rate": 1.3670881777364276e-06, + "loss": 0.0389, + "mean_token_accuracy": 0.9895196318626404, + "num_tokens": 10884490.0, + "step": 5105 + }, + { + "entropy": 1.6631293177604676, + "epoch": 1.6820276497695854, + "grad_norm": 2.767446994781494, + "learning_rate": 1.3533850512998515e-06, + "loss": 0.043, + "mean_token_accuracy": 0.9870490610599518, + "num_tokens": 10894958.0, + "step": 5110 + }, + { + "entropy": 1.671304202079773, + "epoch": 1.683673469387755, + "grad_norm": 1.706010103225708, + "learning_rate": 1.339745962155613e-06, + "loss": 0.0214, + "mean_token_accuracy": 0.9934303402900696, + "num_tokens": 10905373.0, + "step": 5115 + }, + { + "entropy": 1.652925717830658, + "epoch": 1.685319289005925, + "grad_norm": 1.8445219993591309, + "learning_rate": 1.3261710113155436e-06, + "loss": 0.0286, + "mean_token_accuracy": 0.9917166590690613, + "num_tokens": 10916144.0, + "step": 5120 + }, + { + "entropy": 1.6853961229324341, + "epoch": 1.6869651086240949, + "grad_norm": 0.4994262754917145, + "learning_rate": 1.3126602993164505e-06, + "loss": 0.0155, + "mean_token_accuracy": 0.9949258744716645, + "num_tokens": 10926960.0, + "step": 5125 + }, + { + "entropy": 1.6451457619667054, + "epoch": 1.6886109282422646, + "grad_norm": 0.9345157742500305, + "learning_rate": 1.2992139262193893e-06, + "loss": 0.0185, + "mean_token_accuracy": 0.9947284281253814, + "num_tokens": 10937405.0, + "step": 5130 + }, + { + "entropy": 1.624656081199646, + "epoch": 1.6902567478604344, + "grad_norm": 1.3270047903060913, + "learning_rate": 1.2858319916089156e-06, + "loss": 0.0168, + "mean_token_accuracy": 0.99341681599617, + "num_tokens": 10948172.0, + "step": 5135 + }, + { + "entropy": 1.659364914894104, + "epoch": 1.6919025674786043, + "grad_norm": 0.4350810647010803, + "learning_rate": 1.2725145945923588e-06, + "loss": 0.0183, + "mean_token_accuracy": 0.99389089345932, + "num_tokens": 10958763.0, + "step": 5140 + }, + { + "entropy": 1.6685426354408264, + "epoch": 1.6935483870967742, + "grad_norm": 1.704720377922058, + "learning_rate": 1.2592618337990647e-06, + "loss": 0.0285, + "mean_token_accuracy": 0.9913337886333465, + "num_tokens": 10969305.0, + "step": 5145 + }, + { + "entropy": 1.6044569730758667, + "epoch": 1.6951942067149441, + "grad_norm": 0.8705942034721375, + "learning_rate": 1.2460738073796929e-06, + "loss": 0.0238, + "mean_token_accuracy": 0.9927083373069763, + "num_tokens": 10980236.0, + "step": 5150 + }, + { + "entropy": 1.685863721370697, + "epoch": 1.6968400263331138, + "grad_norm": 0.5871350765228271, + "learning_rate": 1.2329506130054703e-06, + "loss": 0.0187, + "mean_token_accuracy": 0.9933398306369782, + "num_tokens": 10990582.0, + "step": 5155 + }, + { + "entropy": 1.6742502808570863, + "epoch": 1.6984858459512837, + "grad_norm": 2.4911105632781982, + "learning_rate": 1.219892347867474e-06, + "loss": 0.0298, + "mean_token_accuracy": 0.9904028534889221, + "num_tokens": 11000966.0, + "step": 5160 + }, + { + "entropy": 1.6669535398483277, + "epoch": 1.7001316655694536, + "grad_norm": 1.9511528015136719, + "learning_rate": 1.2068991086759175e-06, + "loss": 0.0377, + "mean_token_accuracy": 0.9925346195697784, + "num_tokens": 11011630.0, + "step": 5165 + }, + { + "entropy": 1.6653909087181091, + "epoch": 1.7017774851876233, + "grad_norm": 0.7113050222396851, + "learning_rate": 1.1939709916594222e-06, + "loss": 0.0297, + "mean_token_accuracy": 0.9918692409992218, + "num_tokens": 11022348.0, + "step": 5170 + }, + { + "entropy": 1.6599775552749634, + "epoch": 1.7034233048057934, + "grad_norm": 2.2692055702209473, + "learning_rate": 1.1811080925643125e-06, + "loss": 0.0344, + "mean_token_accuracy": 0.9922800958156586, + "num_tokens": 11032982.0, + "step": 5175 + }, + { + "entropy": 1.6723967909812927, + "epoch": 1.705069124423963, + "grad_norm": 1.5152289867401123, + "learning_rate": 1.1683105066539068e-06, + "loss": 0.0172, + "mean_token_accuracy": 0.9953593909740448, + "num_tokens": 11043325.0, + "step": 5180 + }, + { + "entropy": 1.6335657358169555, + "epoch": 1.706714944042133, + "grad_norm": 1.356180191040039, + "learning_rate": 1.1555783287078116e-06, + "loss": 0.0215, + "mean_token_accuracy": 0.9901393771171569, + "num_tokens": 11054285.0, + "step": 5185 + }, + { + "entropy": 1.6311080694198608, + "epoch": 1.7083607636603029, + "grad_norm": 1.751895785331726, + "learning_rate": 1.142911653021217e-06, + "loss": 0.0163, + "mean_token_accuracy": 0.994689530134201, + "num_tokens": 11064881.0, + "step": 5190 + }, + { + "entropy": 1.6046368837356568, + "epoch": 1.7100065832784725, + "grad_norm": 0.38154032826423645, + "learning_rate": 1.1303105734041996e-06, + "loss": 0.0181, + "mean_token_accuracy": 0.9927674353122711, + "num_tokens": 11076136.0, + "step": 5195 + }, + { + "entropy": 1.6612093567848205, + "epoch": 1.7116524028966427, + "grad_norm": 0.979333221912384, + "learning_rate": 1.1177751831810279e-06, + "loss": 0.0335, + "mean_token_accuracy": 0.9923636972904205, + "num_tokens": 11086778.0, + "step": 5200 + }, + { + "entropy": 1.6807520270347596, + "epoch": 1.7132982225148123, + "grad_norm": 1.7828707695007324, + "learning_rate": 1.1053055751894726e-06, + "loss": 0.0277, + "mean_token_accuracy": 0.993100905418396, + "num_tokens": 11097370.0, + "step": 5205 + }, + { + "entropy": 1.6403346300125121, + "epoch": 1.7149440421329822, + "grad_norm": 2.5002477169036865, + "learning_rate": 1.0929018417801129e-06, + "loss": 0.017, + "mean_token_accuracy": 0.9920848309993744, + "num_tokens": 11108166.0, + "step": 5210 + }, + { + "entropy": 1.6517832636833192, + "epoch": 1.7165898617511521, + "grad_norm": 1.3457746505737305, + "learning_rate": 1.0805640748156675e-06, + "loss": 0.0213, + "mean_token_accuracy": 0.9931346833705902, + "num_tokens": 11118902.0, + "step": 5215 + }, + { + "entropy": 1.6854795217514038, + "epoch": 1.7182356813693218, + "grad_norm": 0.07055312395095825, + "learning_rate": 1.0682923656702948e-06, + "loss": 0.0211, + "mean_token_accuracy": 0.9933703184127808, + "num_tokens": 11129409.0, + "step": 5220 + }, + { + "entropy": 1.7007460117340087, + "epoch": 1.719881500987492, + "grad_norm": 1.2991859912872314, + "learning_rate": 1.0560868052289253e-06, + "loss": 0.0244, + "mean_token_accuracy": 0.9902635872364044, + "num_tokens": 11140079.0, + "step": 5225 + }, + { + "entropy": 1.6376017451286315, + "epoch": 1.7215273206056616, + "grad_norm": 0.83013516664505, + "learning_rate": 1.0439474838865981e-06, + "loss": 0.013, + "mean_token_accuracy": 0.9975570976734162, + "num_tokens": 11150683.0, + "step": 5230 + }, + { + "entropy": 1.6345697283744811, + "epoch": 1.7231731402238315, + "grad_norm": 3.94348406791687, + "learning_rate": 1.031874491547773e-06, + "loss": 0.0189, + "mean_token_accuracy": 0.9938337445259094, + "num_tokens": 11161334.0, + "step": 5235 + }, + { + "entropy": 1.648432207107544, + "epoch": 1.7248189598420014, + "grad_norm": 1.0848654508590698, + "learning_rate": 1.0198679176256742e-06, + "loss": 0.0287, + "mean_token_accuracy": 0.9918507993221283, + "num_tokens": 11171950.0, + "step": 5240 + }, + { + "entropy": 1.660215425491333, + "epoch": 1.726464779460171, + "grad_norm": 1.0720781087875366, + "learning_rate": 1.0079278510416313e-06, + "loss": 0.0297, + "mean_token_accuracy": 0.9905134677886963, + "num_tokens": 11182536.0, + "step": 5245 + }, + { + "entropy": 1.6251054883003235, + "epoch": 1.728110599078341, + "grad_norm": 2.8627305030822754, + "learning_rate": 9.960543802244195e-07, + "loss": 0.0214, + "mean_token_accuracy": 0.991985023021698, + "num_tokens": 11193429.0, + "step": 5250 + }, + { + "entropy": 1.668004846572876, + "epoch": 1.7297564186965109, + "grad_norm": 0.5310274958610535, + "learning_rate": 9.842475931095895e-07, + "loss": 0.0398, + "mean_token_accuracy": 0.9901407361030579, + "num_tokens": 11204189.0, + "step": 5255 + }, + { + "entropy": 1.6230967044830322, + "epoch": 1.7314022383146808, + "grad_norm": 1.8132059574127197, + "learning_rate": 9.725075771388449e-07, + "loss": 0.0181, + "mean_token_accuracy": 0.9925452589988708, + "num_tokens": 11214880.0, + "step": 5260 + }, + { + "entropy": 1.629646909236908, + "epoch": 1.7330480579328507, + "grad_norm": 0.9202048182487488, + "learning_rate": 9.60834419259369e-07, + "loss": 0.0242, + "mean_token_accuracy": 0.9919378876686096, + "num_tokens": 11225689.0, + "step": 5265 + }, + { + "entropy": 1.6248352885246278, + "epoch": 1.7346938775510203, + "grad_norm": 0.7040166258811951, + "learning_rate": 9.492282059231917e-07, + "loss": 0.0173, + "mean_token_accuracy": 0.9930191397666931, + "num_tokens": 11236662.0, + "step": 5270 + }, + { + "entropy": 1.6555840373039246, + "epoch": 1.7363396971691902, + "grad_norm": 0.31907936930656433, + "learning_rate": 9.376890230865487e-07, + "loss": 0.0105, + "mean_token_accuracy": 0.9968835175037384, + "num_tokens": 11247227.0, + "step": 5275 + }, + { + "entropy": 1.6610607862472535, + "epoch": 1.7379855167873601, + "grad_norm": 1.0085439682006836, + "learning_rate": 9.262169562092483e-07, + "loss": 0.0167, + "mean_token_accuracy": 0.9929597020149231, + "num_tokens": 11258079.0, + "step": 5280 + }, + { + "entropy": 1.6653624534606934, + "epoch": 1.7396313364055298, + "grad_norm": 1.6425352096557617, + "learning_rate": 9.148120902540281e-07, + "loss": 0.0244, + "mean_token_accuracy": 0.9898296535015106, + "num_tokens": 11268656.0, + "step": 5285 + }, + { + "entropy": 1.6512943148612975, + "epoch": 1.7412771560237, + "grad_norm": 1.9848991632461548, + "learning_rate": 9.034745096859332e-07, + "loss": 0.0165, + "mean_token_accuracy": 0.9939022302627564, + "num_tokens": 11279189.0, + "step": 5290 + }, + { + "entropy": 1.633478820323944, + "epoch": 1.7429229756418696, + "grad_norm": 1.188881278038025, + "learning_rate": 8.922042984716972e-07, + "loss": 0.0211, + "mean_token_accuracy": 0.9940697014331817, + "num_tokens": 11289941.0, + "step": 5295 + }, + { + "entropy": 1.6428375720977784, + "epoch": 1.7445687952600395, + "grad_norm": 1.3060122728347778, + "learning_rate": 8.810015400790994e-07, + "loss": 0.0146, + "mean_token_accuracy": 0.9937053561210633, + "num_tokens": 11300919.0, + "step": 5300 + }, + { + "entropy": 1.644476592540741, + "epoch": 1.7462146148782094, + "grad_norm": 4.126039981842041, + "learning_rate": 8.69866317476371e-07, + "loss": 0.0436, + "mean_token_accuracy": 0.9879814326763153, + "num_tokens": 11311626.0, + "step": 5305 + }, + { + "entropy": 1.6524176239967345, + "epoch": 1.747860434496379, + "grad_norm": 0.9086925387382507, + "learning_rate": 8.587987131315656e-07, + "loss": 0.0384, + "mean_token_accuracy": 0.9912387907505036, + "num_tokens": 11322432.0, + "step": 5310 + }, + { + "entropy": 1.6595850586891174, + "epoch": 1.7495062541145492, + "grad_norm": 1.3107917308807373, + "learning_rate": 8.477988090119515e-07, + "loss": 0.0209, + "mean_token_accuracy": 0.9909784734249115, + "num_tokens": 11333086.0, + "step": 5315 + }, + { + "entropy": 1.6489954590797424, + "epoch": 1.7511520737327189, + "grad_norm": 0.6780989766120911, + "learning_rate": 8.36866686583404e-07, + "loss": 0.013, + "mean_token_accuracy": 0.9960228025913238, + "num_tokens": 11343698.0, + "step": 5320 + }, + { + "entropy": 1.6577156782150269, + "epoch": 1.7527978933508888, + "grad_norm": 2.5030269622802734, + "learning_rate": 8.260024268098121e-07, + "loss": 0.0407, + "mean_token_accuracy": 0.9897329092025757, + "num_tokens": 11354384.0, + "step": 5325 + }, + { + "entropy": 1.6797374844551087, + "epoch": 1.7544437129690587, + "grad_norm": 1.1785377264022827, + "learning_rate": 8.152061101524578e-07, + "loss": 0.0204, + "mean_token_accuracy": 0.992442113161087, + "num_tokens": 11364888.0, + "step": 5330 + }, + { + "entropy": 1.6517166137695312, + "epoch": 1.7560895325872283, + "grad_norm": 2.3063104152679443, + "learning_rate": 8.044778165694434e-07, + "loss": 0.0249, + "mean_token_accuracy": 0.9921338021755218, + "num_tokens": 11375641.0, + "step": 5335 + }, + { + "entropy": 1.644195032119751, + "epoch": 1.7577353522053984, + "grad_norm": 2.3999085426330566, + "learning_rate": 7.93817625515082e-07, + "loss": 0.0175, + "mean_token_accuracy": 0.9942924916744232, + "num_tokens": 11386312.0, + "step": 5340 + }, + { + "entropy": 1.6763506770133971, + "epoch": 1.7593811718235681, + "grad_norm": 2.3939199447631836, + "learning_rate": 7.832256159393181e-07, + "loss": 0.0242, + "mean_token_accuracy": 0.990136843919754, + "num_tokens": 11396648.0, + "step": 5345 + }, + { + "entropy": 1.7076772809028626, + "epoch": 1.761026991441738, + "grad_norm": 4.3696184158325195, + "learning_rate": 7.727018662871432e-07, + "loss": 0.0236, + "mean_token_accuracy": 0.9929444313049316, + "num_tokens": 11407191.0, + "step": 5350 + }, + { + "entropy": 1.6799395561218262, + "epoch": 1.762672811059908, + "grad_norm": 1.7837049961090088, + "learning_rate": 7.62246454498009e-07, + "loss": 0.0212, + "mean_token_accuracy": 0.993122935295105, + "num_tokens": 11417870.0, + "step": 5355 + }, + { + "entropy": 1.6892127633094787, + "epoch": 1.7643186306780776, + "grad_norm": 1.9029659032821655, + "learning_rate": 7.518594580052519e-07, + "loss": 0.0253, + "mean_token_accuracy": 0.9921809673309326, + "num_tokens": 11428181.0, + "step": 5360 + }, + { + "entropy": 1.6493794798851014, + "epoch": 1.7659644502962475, + "grad_norm": 0.7186495065689087, + "learning_rate": 7.415409537355222e-07, + "loss": 0.016, + "mean_token_accuracy": 0.9955087423324585, + "num_tokens": 11439027.0, + "step": 5365 + }, + { + "entropy": 1.668405246734619, + "epoch": 1.7676102699144174, + "grad_norm": 0.64583420753479, + "learning_rate": 7.312910181082178e-07, + "loss": 0.0149, + "mean_token_accuracy": 0.9964608550071716, + "num_tokens": 11449456.0, + "step": 5370 + }, + { + "entropy": 1.6470329999923705, + "epoch": 1.7692560895325873, + "grad_norm": 0.6577222347259521, + "learning_rate": 7.211097270349065e-07, + "loss": 0.0128, + "mean_token_accuracy": 0.9961907029151916, + "num_tokens": 11459922.0, + "step": 5375 + }, + { + "entropy": 1.6427839875221253, + "epoch": 1.7709019091507572, + "grad_norm": 1.2375580072402954, + "learning_rate": 7.109971559187767e-07, + "loss": 0.0251, + "mean_token_accuracy": 0.9925231218338013, + "num_tokens": 11470923.0, + "step": 5380 + }, + { + "entropy": 1.6566937565803528, + "epoch": 1.7725477287689269, + "grad_norm": 2.0263426303863525, + "learning_rate": 7.00953379654068e-07, + "loss": 0.012, + "mean_token_accuracy": 0.9966720283031464, + "num_tokens": 11481319.0, + "step": 5385 + }, + { + "entropy": 1.647239327430725, + "epoch": 1.7741935483870968, + "grad_norm": 0.439744770526886, + "learning_rate": 6.909784726255242e-07, + "loss": 0.0136, + "mean_token_accuracy": 0.9953515648841857, + "num_tokens": 11491892.0, + "step": 5390 + }, + { + "entropy": 1.653624951839447, + "epoch": 1.7758393680052666, + "grad_norm": 2.360499620437622, + "learning_rate": 6.810725087078395e-07, + "loss": 0.0177, + "mean_token_accuracy": 0.9931002259254456, + "num_tokens": 11502606.0, + "step": 5395 + }, + { + "entropy": 1.6672588109970092, + "epoch": 1.7774851876234363, + "grad_norm": 1.195021390914917, + "learning_rate": 6.712355612651145e-07, + "loss": 0.0175, + "mean_token_accuracy": 0.9928203105926514, + "num_tokens": 11513121.0, + "step": 5400 + }, + { + "entropy": 1.6415475726127624, + "epoch": 1.7791310072416064, + "grad_norm": 3.7338359355926514, + "learning_rate": 6.614677031503059e-07, + "loss": 0.0386, + "mean_token_accuracy": 0.9931011736392975, + "num_tokens": 11523757.0, + "step": 5405 + }, + { + "entropy": 1.630179488658905, + "epoch": 1.7807768268597761, + "grad_norm": 2.090383768081665, + "learning_rate": 6.517690067046922e-07, + "loss": 0.0193, + "mean_token_accuracy": 0.9923229575157165, + "num_tokens": 11534852.0, + "step": 5410 + }, + { + "entropy": 1.6325619697570801, + "epoch": 1.782422646477946, + "grad_norm": 0.8092725872993469, + "learning_rate": 6.421395437573386e-07, + "loss": 0.0219, + "mean_token_accuracy": 0.9927444994449616, + "num_tokens": 11545588.0, + "step": 5415 + }, + { + "entropy": 1.6684589862823487, + "epoch": 1.784068466096116, + "grad_norm": 1.0808240175247192, + "learning_rate": 6.325793856245632e-07, + "loss": 0.0084, + "mean_token_accuracy": 0.9980384469032287, + "num_tokens": 11555985.0, + "step": 5420 + }, + { + "entropy": 1.676847243309021, + "epoch": 1.7857142857142856, + "grad_norm": 1.1335194110870361, + "learning_rate": 6.230886031094063e-07, + "loss": 0.0168, + "mean_token_accuracy": 0.9945612967014312, + "num_tokens": 11566484.0, + "step": 5425 + }, + { + "entropy": 1.6188385486602783, + "epoch": 1.7873601053324557, + "grad_norm": 1.116150140762329, + "learning_rate": 6.136672665011089e-07, + "loss": 0.0198, + "mean_token_accuracy": 0.9933637619018555, + "num_tokens": 11577115.0, + "step": 5430 + }, + { + "entropy": 1.6665136098861695, + "epoch": 1.7890059249506254, + "grad_norm": 1.42592191696167, + "learning_rate": 6.043154455745981e-07, + "loss": 0.0205, + "mean_token_accuracy": 0.9911065042018891, + "num_tokens": 11587678.0, + "step": 5435 + }, + { + "entropy": 1.642556118965149, + "epoch": 1.7906517445687953, + "grad_norm": 1.1183552742004395, + "learning_rate": 5.950332095899547e-07, + "loss": 0.0191, + "mean_token_accuracy": 0.9941525161266327, + "num_tokens": 11598261.0, + "step": 5440 + }, + { + "entropy": 1.6583900451660156, + "epoch": 1.7922975641869652, + "grad_norm": 1.2433513402938843, + "learning_rate": 5.858206272919165e-07, + "loss": 0.0263, + "mean_token_accuracy": 0.9929942131042481, + "num_tokens": 11609100.0, + "step": 5445 + }, + { + "entropy": 1.630812132358551, + "epoch": 1.7939433838051349, + "grad_norm": 3.484126567840576, + "learning_rate": 5.766777669093604e-07, + "loss": 0.032, + "mean_token_accuracy": 0.9925698578357697, + "num_tokens": 11619934.0, + "step": 5450 + }, + { + "entropy": 1.595421350002289, + "epoch": 1.795589203423305, + "grad_norm": 0.8385356664657593, + "learning_rate": 5.676046961547987e-07, + "loss": 0.0335, + "mean_token_accuracy": 0.9886408507823944, + "num_tokens": 11630717.0, + "step": 5455 + }, + { + "entropy": 1.6572367787361144, + "epoch": 1.7972350230414746, + "grad_norm": 0.9135512113571167, + "learning_rate": 5.586014822238772e-07, + "loss": 0.0208, + "mean_token_accuracy": 0.9933992445468902, + "num_tokens": 11641288.0, + "step": 5460 + }, + { + "entropy": 1.5970224499702455, + "epoch": 1.7988808426596445, + "grad_norm": 1.7872576713562012, + "learning_rate": 5.496681917948809e-07, + "loss": 0.0154, + "mean_token_accuracy": 0.9937366545200348, + "num_tokens": 11652159.0, + "step": 5465 + }, + { + "entropy": 1.632554793357849, + "epoch": 1.8005266622778144, + "grad_norm": 0.10970994830131531, + "learning_rate": 5.408048910282348e-07, + "loss": 0.032, + "mean_token_accuracy": 0.9962066173553467, + "num_tokens": 11662941.0, + "step": 5470 + }, + { + "epoch": 1.8011849901250823, + "eval_entropy": 1.6544960326694962, + "eval_loss": 0.04707782715559006, + "eval_mean_token_accuracy": 0.9874065164100884, + "eval_num_tokens": 11667189.0, + "eval_runtime": 196.3646, + "eval_samples_per_second": 42.406, + "eval_steps_per_second": 7.068, + "step": 5472 + }, + { + "entropy": 1.6609861731529236, + "epoch": 1.8021724818959841, + "grad_norm": 1.7392816543579102, + "learning_rate": 5.320116455660185e-07, + "loss": 0.0206, + "mean_token_accuracy": 0.9918794929981232, + "num_tokens": 11673648.0, + "step": 5475 + }, + { + "entropy": 1.6410995721817017, + "epoch": 1.803818301514154, + "grad_norm": 1.1604136228561401, + "learning_rate": 5.232885205314797e-07, + "loss": 0.032, + "mean_token_accuracy": 0.9902990460395813, + "num_tokens": 11684381.0, + "step": 5480 + }, + { + "entropy": 1.6263534784317017, + "epoch": 1.805464121132324, + "grad_norm": 1.2235734462738037, + "learning_rate": 5.146355805285452e-07, + "loss": 0.0198, + "mean_token_accuracy": 0.994633013010025, + "num_tokens": 11694890.0, + "step": 5485 + }, + { + "entropy": 1.642207968235016, + "epoch": 1.8071099407504936, + "grad_norm": 3.1930456161499023, + "learning_rate": 5.06052889641353e-07, + "loss": 0.0133, + "mean_token_accuracy": 0.994309377670288, + "num_tokens": 11705533.0, + "step": 5490 + }, + { + "entropy": 1.6500659108161926, + "epoch": 1.8087557603686637, + "grad_norm": 3.067206621170044, + "learning_rate": 4.975405114337695e-07, + "loss": 0.0303, + "mean_token_accuracy": 0.9904295146465302, + "num_tokens": 11716171.0, + "step": 5495 + }, + { + "entropy": 1.636317217350006, + "epoch": 1.8104015799868334, + "grad_norm": 0.6358233094215393, + "learning_rate": 4.890985089489231e-07, + "loss": 0.0135, + "mean_token_accuracy": 0.9961594045162201, + "num_tokens": 11726724.0, + "step": 5500 + }, + { + "entropy": 1.6213058233261108, + "epoch": 1.8120473996050033, + "grad_norm": 1.792466402053833, + "learning_rate": 4.807269447087348e-07, + "loss": 0.019, + "mean_token_accuracy": 0.9931978344917297, + "num_tokens": 11737311.0, + "step": 5505 + }, + { + "entropy": 1.6493823170661925, + "epoch": 1.8136932192231732, + "grad_norm": 1.9440345764160156, + "learning_rate": 4.7242588071345965e-07, + "loss": 0.0278, + "mean_token_accuracy": 0.9932690799236298, + "num_tokens": 11747961.0, + "step": 5510 + }, + { + "entropy": 1.6495756149291991, + "epoch": 1.8153390388413428, + "grad_norm": 1.2750672101974487, + "learning_rate": 4.6419537844121565e-07, + "loss": 0.0189, + "mean_token_accuracy": 0.9953626215457916, + "num_tokens": 11758583.0, + "step": 5515 + }, + { + "entropy": 1.6222103834152222, + "epoch": 1.816984858459513, + "grad_norm": 2.3976049423217773, + "learning_rate": 4.5603549884754463e-07, + "loss": 0.0208, + "mean_token_accuracy": 0.9919763743877411, + "num_tokens": 11769636.0, + "step": 5520 + }, + { + "entropy": 1.6041205644607544, + "epoch": 1.8186306780776826, + "grad_norm": 1.3786771297454834, + "learning_rate": 4.479463023649555e-07, + "loss": 0.0127, + "mean_token_accuracy": 0.9947966277599335, + "num_tokens": 11780222.0, + "step": 5525 + }, + { + "entropy": 1.657669985294342, + "epoch": 1.8202764976958525, + "grad_norm": 3.0026445388793945, + "learning_rate": 4.3992784890246276e-07, + "loss": 0.0108, + "mean_token_accuracy": 0.9966068983078002, + "num_tokens": 11790566.0, + "step": 5530 + }, + { + "entropy": 1.6583294034004212, + "epoch": 1.8219223173140224, + "grad_norm": 2.0243186950683594, + "learning_rate": 4.319801978451654e-07, + "loss": 0.0269, + "mean_token_accuracy": 0.9926669538021088, + "num_tokens": 11801065.0, + "step": 5535 + }, + { + "entropy": 1.6717671513557435, + "epoch": 1.823568136932192, + "grad_norm": 0.7521846294403076, + "learning_rate": 4.241034080537909e-07, + "loss": 0.0156, + "mean_token_accuracy": 0.9947370052337646, + "num_tokens": 11811699.0, + "step": 5540 + }, + { + "entropy": 1.657283067703247, + "epoch": 1.8252139565503622, + "grad_norm": 2.917626142501831, + "learning_rate": 4.162975378642653e-07, + "loss": 0.0252, + "mean_token_accuracy": 0.9930805206298828, + "num_tokens": 11822213.0, + "step": 5545 + }, + { + "entropy": 1.6172433972358704, + "epoch": 1.826859776168532, + "grad_norm": 0.5599117875099182, + "learning_rate": 4.085626450872782e-07, + "loss": 0.0151, + "mean_token_accuracy": 0.9949965596199035, + "num_tokens": 11833106.0, + "step": 5550 + }, + { + "entropy": 1.6172014594078064, + "epoch": 1.8285055957867018, + "grad_norm": 1.6733261346817017, + "learning_rate": 4.008987870078629e-07, + "loss": 0.0168, + "mean_token_accuracy": 0.9940386712551117, + "num_tokens": 11843679.0, + "step": 5555 + }, + { + "entropy": 1.6307536125183106, + "epoch": 1.8301514154048717, + "grad_norm": 2.103327512741089, + "learning_rate": 3.9330602038495925e-07, + "loss": 0.0234, + "mean_token_accuracy": 0.9932344555854797, + "num_tokens": 11854343.0, + "step": 5560 + }, + { + "entropy": 1.6388854265213013, + "epoch": 1.8317972350230414, + "grad_norm": 0.9723978638648987, + "learning_rate": 3.8578440145100373e-07, + "loss": 0.0197, + "mean_token_accuracy": 0.9940811336040497, + "num_tokens": 11864727.0, + "step": 5565 + }, + { + "entropy": 1.6083611011505128, + "epoch": 1.8334430546412115, + "grad_norm": 1.644818902015686, + "learning_rate": 3.783339859115065e-07, + "loss": 0.0251, + "mean_token_accuracy": 0.990217787027359, + "num_tokens": 11875655.0, + "step": 5570 + }, + { + "entropy": 1.650509774684906, + "epoch": 1.8350888742593812, + "grad_norm": 4.321114540100098, + "learning_rate": 3.709548289446452e-07, + "loss": 0.0359, + "mean_token_accuracy": 0.9882233738899231, + "num_tokens": 11886276.0, + "step": 5575 + }, + { + "entropy": 1.6342867493629456, + "epoch": 1.836734693877551, + "grad_norm": 0.9216455221176147, + "learning_rate": 3.636469852008473e-07, + "loss": 0.0163, + "mean_token_accuracy": 0.9964061677455902, + "num_tokens": 11896970.0, + "step": 5580 + }, + { + "entropy": 1.6529955506324767, + "epoch": 1.838380513495721, + "grad_norm": 1.3248862028121948, + "learning_rate": 3.564105088023984e-07, + "loss": 0.0174, + "mean_token_accuracy": 0.9947122275829315, + "num_tokens": 11907415.0, + "step": 5585 + }, + { + "entropy": 1.6415375351905823, + "epoch": 1.8400263331138906, + "grad_norm": 1.5896852016448975, + "learning_rate": 3.4924545334302675e-07, + "loss": 0.0124, + "mean_token_accuracy": 0.9948625266551971, + "num_tokens": 11918125.0, + "step": 5590 + }, + { + "entropy": 1.6392775058746338, + "epoch": 1.8416721527320605, + "grad_norm": 0.6162107586860657, + "learning_rate": 3.421518718875161e-07, + "loss": 0.0204, + "mean_token_accuracy": 0.993281239271164, + "num_tokens": 11928566.0, + "step": 5595 + }, + { + "entropy": 1.6502568006515503, + "epoch": 1.8433179723502304, + "grad_norm": 0.8682472109794617, + "learning_rate": 3.351298169713102e-07, + "loss": 0.0241, + "mean_token_accuracy": 0.992335319519043, + "num_tokens": 11939271.0, + "step": 5600 + }, + { + "entropy": 1.6440783381462096, + "epoch": 1.8449637919684, + "grad_norm": 1.3868647813796997, + "learning_rate": 3.281793406001232e-07, + "loss": 0.0284, + "mean_token_accuracy": 0.9914430260658265, + "num_tokens": 11949781.0, + "step": 5605 + }, + { + "entropy": 1.6473740935325623, + "epoch": 1.8466096115865702, + "grad_norm": 1.2047520875930786, + "learning_rate": 3.213004942495546e-07, + "loss": 0.022, + "mean_token_accuracy": 0.9923123598098755, + "num_tokens": 11960650.0, + "step": 5610 + }, + { + "entropy": 1.6165038704872132, + "epoch": 1.84825543120474, + "grad_norm": 1.3083264827728271, + "learning_rate": 3.144933288647067e-07, + "loss": 0.01, + "mean_token_accuracy": 0.9968856811523438, + "num_tokens": 11971426.0, + "step": 5615 + }, + { + "entropy": 1.6375835180282592, + "epoch": 1.8499012508229098, + "grad_norm": 2.6903843879699707, + "learning_rate": 3.0775789485981254e-07, + "loss": 0.0134, + "mean_token_accuracy": 0.9949893295764923, + "num_tokens": 11982373.0, + "step": 5620 + }, + { + "entropy": 1.6399108648300171, + "epoch": 1.8515470704410797, + "grad_norm": 0.4139382839202881, + "learning_rate": 3.010942421178531e-07, + "loss": 0.0415, + "mean_token_accuracy": 0.9894947111606598, + "num_tokens": 11993082.0, + "step": 5625 + }, + { + "entropy": 1.6450653195381164, + "epoch": 1.8531928900592494, + "grad_norm": 1.1827541589736938, + "learning_rate": 2.9450241999020024e-07, + "loss": 0.0166, + "mean_token_accuracy": 0.9943609952926635, + "num_tokens": 12003839.0, + "step": 5630 + }, + { + "entropy": 1.6487621068954468, + "epoch": 1.8548387096774195, + "grad_norm": 0.9702763557434082, + "learning_rate": 2.879824772962381e-07, + "loss": 0.0148, + "mean_token_accuracy": 0.9935874819755555, + "num_tokens": 12014594.0, + "step": 5635 + }, + { + "entropy": 1.6262930750846862, + "epoch": 1.8564845292955892, + "grad_norm": 0.6445523500442505, + "learning_rate": 2.81534462323011e-07, + "loss": 0.0182, + "mean_token_accuracy": 0.9919591426849366, + "num_tokens": 12025486.0, + "step": 5640 + }, + { + "entropy": 1.6225888967514037, + "epoch": 1.858130348913759, + "grad_norm": 1.6470600366592407, + "learning_rate": 2.7515842282486274e-07, + "loss": 0.0263, + "mean_token_accuracy": 0.9922280848026276, + "num_tokens": 12035884.0, + "step": 5645 + }, + { + "entropy": 1.637891697883606, + "epoch": 1.859776168531929, + "grad_norm": 0.8527348637580872, + "learning_rate": 2.688544060230835e-07, + "loss": 0.0201, + "mean_token_accuracy": 0.9935072481632232, + "num_tokens": 12046847.0, + "step": 5650 + }, + { + "entropy": 1.6528636813163757, + "epoch": 1.8614219881500986, + "grad_norm": 0.8655990362167358, + "learning_rate": 2.626224586055581e-07, + "loss": 0.0191, + "mean_token_accuracy": 0.9932506561279297, + "num_tokens": 12057498.0, + "step": 5655 + }, + { + "entropy": 1.6207883477210998, + "epoch": 1.8630678077682687, + "grad_norm": 0.7990636229515076, + "learning_rate": 2.5646262672642033e-07, + "loss": 0.0169, + "mean_token_accuracy": 0.9947772800922394, + "num_tokens": 12068243.0, + "step": 5660 + }, + { + "entropy": 1.6112945556640625, + "epoch": 1.8647136273864384, + "grad_norm": 2.073958396911621, + "learning_rate": 2.503749560057178e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9919489622116089, + "num_tokens": 12079308.0, + "step": 5665 + }, + { + "entropy": 1.6480265259742737, + "epoch": 1.8663594470046083, + "grad_norm": 2.8769478797912598, + "learning_rate": 2.4435949152906144e-07, + "loss": 0.0235, + "mean_token_accuracy": 0.9915726184844971, + "num_tokens": 12090018.0, + "step": 5670 + }, + { + "entropy": 1.616892433166504, + "epoch": 1.8680052666227782, + "grad_norm": 0.648573100566864, + "learning_rate": 2.3841627784730536e-07, + "loss": 0.0174, + "mean_token_accuracy": 0.993311470746994, + "num_tokens": 12100850.0, + "step": 5675 + }, + { + "entropy": 1.6181252598762512, + "epoch": 1.869651086240948, + "grad_norm": 2.1162548065185547, + "learning_rate": 2.325453589762061e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9941606819629669, + "num_tokens": 12111601.0, + "step": 5680 + }, + { + "entropy": 1.6273619651794433, + "epoch": 1.8712969058591178, + "grad_norm": 3.485987901687622, + "learning_rate": 2.2674677839610305e-07, + "loss": 0.0351, + "mean_token_accuracy": 0.9911395490169526, + "num_tokens": 12122330.0, + "step": 5685 + }, + { + "entropy": 1.6523537158966064, + "epoch": 1.8729427254772877, + "grad_norm": 0.9486113786697388, + "learning_rate": 2.2102057905159292e-07, + "loss": 0.0188, + "mean_token_accuracy": 0.9935419499874115, + "num_tokens": 12132919.0, + "step": 5690 + }, + { + "entropy": 1.6415381669998168, + "epoch": 1.8745885450954576, + "grad_norm": 1.972302794456482, + "learning_rate": 2.1536680335121684e-07, + "loss": 0.0198, + "mean_token_accuracy": 0.9924857258796692, + "num_tokens": 12143394.0, + "step": 5695 + }, + { + "entropy": 1.6063251495361328, + "epoch": 1.8762343647136275, + "grad_norm": 0.8765217661857605, + "learning_rate": 2.0978549316713615e-07, + "loss": 0.0138, + "mean_token_accuracy": 0.9943941414356232, + "num_tokens": 12154201.0, + "step": 5700 + }, + { + "entropy": 1.6426220178604125, + "epoch": 1.8778801843317972, + "grad_norm": 1.477146029472351, + "learning_rate": 2.0427668983483361e-07, + "loss": 0.0221, + "mean_token_accuracy": 0.9906249940395355, + "num_tokens": 12164635.0, + "step": 5705 + }, + { + "entropy": 1.6509037852287292, + "epoch": 1.879526003949967, + "grad_norm": 1.7725666761398315, + "learning_rate": 1.9884043415280274e-07, + "loss": 0.0229, + "mean_token_accuracy": 0.9909674286842346, + "num_tokens": 12175320.0, + "step": 5710 + }, + { + "entropy": 1.649256443977356, + "epoch": 1.881171823568137, + "grad_norm": 1.5584267377853394, + "learning_rate": 1.9347676638224122e-07, + "loss": 0.0182, + "mean_token_accuracy": 0.9936787724494934, + "num_tokens": 12185966.0, + "step": 5715 + }, + { + "entropy": 1.6484339714050293, + "epoch": 1.8828176431863066, + "grad_norm": 0.69387286901474, + "learning_rate": 1.8818572624676124e-07, + "loss": 0.0179, + "mean_token_accuracy": 0.9950119018554687, + "num_tokens": 12196531.0, + "step": 5720 + }, + { + "entropy": 1.618002474308014, + "epoch": 1.8844634628044767, + "grad_norm": 3.0518853664398193, + "learning_rate": 1.8296735293208745e-07, + "loss": 0.0324, + "mean_token_accuracy": 0.991709417104721, + "num_tokens": 12207170.0, + "step": 5725 + }, + { + "entropy": 1.6460561990737914, + "epoch": 1.8861092824226464, + "grad_norm": 0.9351851940155029, + "learning_rate": 1.7782168508577168e-07, + "loss": 0.0178, + "mean_token_accuracy": 0.9925060391426086, + "num_tokens": 12217651.0, + "step": 5730 + }, + { + "entropy": 1.6722620725631714, + "epoch": 1.8877551020408163, + "grad_norm": 2.6632797718048096, + "learning_rate": 1.7274876081690429e-07, + "loss": 0.0423, + "mean_token_accuracy": 0.9879862844944001, + "num_tokens": 12227987.0, + "step": 5735 + }, + { + "entropy": 1.6359737396240235, + "epoch": 1.8894009216589862, + "grad_norm": 0.5745594501495361, + "learning_rate": 1.6774861769583538e-07, + "loss": 0.0291, + "mean_token_accuracy": 0.9925860285758972, + "num_tokens": 12238790.0, + "step": 5740 + }, + { + "entropy": 1.6011884927749633, + "epoch": 1.8910467412771559, + "grad_norm": 1.4916797876358032, + "learning_rate": 1.628212927538908e-07, + "loss": 0.0285, + "mean_token_accuracy": 0.9890454232692718, + "num_tokens": 12249683.0, + "step": 5745 + }, + { + "entropy": 1.6060463190078735, + "epoch": 1.892692560895326, + "grad_norm": 0.7883341312408447, + "learning_rate": 1.5796682248310214e-07, + "loss": 0.0156, + "mean_token_accuracy": 0.9944408297538757, + "num_tokens": 12260660.0, + "step": 5750 + }, + { + "entropy": 1.6521755695343017, + "epoch": 1.8943383805134957, + "grad_norm": 0.684786856174469, + "learning_rate": 1.5318524283593706e-07, + "loss": 0.0072, + "mean_token_accuracy": 0.9973987996578216, + "num_tokens": 12271559.0, + "step": 5755 + }, + { + "entropy": 1.6584128499031068, + "epoch": 1.8959842001316656, + "grad_norm": 3.097158432006836, + "learning_rate": 1.484765892250284e-07, + "loss": 0.0216, + "mean_token_accuracy": 0.9943334817886352, + "num_tokens": 12281996.0, + "step": 5760 + }, + { + "entropy": 1.6525746941566468, + "epoch": 1.8976300197498355, + "grad_norm": 4.315025329589844, + "learning_rate": 1.4384089652291544e-07, + "loss": 0.0293, + "mean_token_accuracy": 0.9913449168205262, + "num_tokens": 12292609.0, + "step": 5765 + }, + { + "entropy": 1.608083975315094, + "epoch": 1.8992758393680051, + "grad_norm": 0.8511555790901184, + "learning_rate": 1.3927819906178864e-07, + "loss": 0.0162, + "mean_token_accuracy": 0.9940433919429779, + "num_tokens": 12303345.0, + "step": 5770 + }, + { + "entropy": 1.654729962348938, + "epoch": 1.9009216589861753, + "grad_norm": 5.2224860191345215, + "learning_rate": 1.3478853063322862e-07, + "loss": 0.0203, + "mean_token_accuracy": 0.9930156350135804, + "num_tokens": 12313946.0, + "step": 5775 + }, + { + "entropy": 1.6443889737129211, + "epoch": 1.902567478604345, + "grad_norm": 0.4680110812187195, + "learning_rate": 1.3037192448795754e-07, + "loss": 0.0185, + "mean_token_accuracy": 0.9930983424186707, + "num_tokens": 12324639.0, + "step": 5780 + }, + { + "entropy": 1.634881365299225, + "epoch": 1.9042132982225148, + "grad_norm": 1.4533123970031738, + "learning_rate": 1.2602841333559934e-07, + "loss": 0.0181, + "mean_token_accuracy": 0.995710015296936, + "num_tokens": 12335427.0, + "step": 5785 + }, + { + "entropy": 1.6775338172912597, + "epoch": 1.9058591178406847, + "grad_norm": 1.9515726566314697, + "learning_rate": 1.217580293444276e-07, + "loss": 0.0208, + "mean_token_accuracy": 0.9887213408946991, + "num_tokens": 12345700.0, + "step": 5790 + }, + { + "entropy": 1.6608016729354858, + "epoch": 1.9075049374588544, + "grad_norm": 2.2838118076324463, + "learning_rate": 1.1756080414113691e-07, + "loss": 0.0221, + "mean_token_accuracy": 0.9925664782524108, + "num_tokens": 12356355.0, + "step": 5795 + }, + { + "entropy": 1.6454607725143433, + "epoch": 1.9091507570770243, + "grad_norm": 1.0771596431732178, + "learning_rate": 1.1343676881059751e-07, + "loss": 0.0106, + "mean_token_accuracy": 0.9954189419746399, + "num_tokens": 12366898.0, + "step": 5800 + }, + { + "entropy": 1.6636238932609557, + "epoch": 1.9107965766951942, + "grad_norm": 3.005011796951294, + "learning_rate": 1.0938595389563988e-07, + "loss": 0.0143, + "mean_token_accuracy": 0.994761872291565, + "num_tokens": 12377443.0, + "step": 5805 + }, + { + "entropy": 1.6605603098869324, + "epoch": 1.912442396313364, + "grad_norm": 2.230198621749878, + "learning_rate": 1.0540838939681164e-07, + "loss": 0.0242, + "mean_token_accuracy": 0.9926084816455841, + "num_tokens": 12388275.0, + "step": 5810 + }, + { + "entropy": 1.606773316860199, + "epoch": 1.914088215931534, + "grad_norm": 0.7968350052833557, + "learning_rate": 1.0150410477216987e-07, + "loss": 0.0105, + "mean_token_accuracy": 0.9964967429637909, + "num_tokens": 12399282.0, + "step": 5815 + }, + { + "entropy": 1.6078164100646972, + "epoch": 1.9157340355497037, + "grad_norm": 0.7457708716392517, + "learning_rate": 9.767312893705583e-08, + "loss": 0.0203, + "mean_token_accuracy": 0.9911931037902832, + "num_tokens": 12410043.0, + "step": 5820 + }, + { + "entropy": 1.6395422101020813, + "epoch": 1.9173798551678736, + "grad_norm": 0.5907423496246338, + "learning_rate": 9.391549026387948e-08, + "loss": 0.0238, + "mean_token_accuracy": 0.9929281890392303, + "num_tokens": 12420722.0, + "step": 5825 + }, + { + "entropy": 1.6461820960044862, + "epoch": 1.9190256747860435, + "grad_norm": 2.0906271934509277, + "learning_rate": 9.023121658191636e-08, + "loss": 0.0187, + "mean_token_accuracy": 0.993992280960083, + "num_tokens": 12431395.0, + "step": 5830 + }, + { + "entropy": 1.6568657994270324, + "epoch": 1.9206714944042131, + "grad_norm": 0.6122414469718933, + "learning_rate": 8.662033517709113e-08, + "loss": 0.0306, + "mean_token_accuracy": 0.9899275124073028, + "num_tokens": 12442077.0, + "step": 5835 + }, + { + "entropy": 1.6694289207458497, + "epoch": 1.9223173140223833, + "grad_norm": 0.9769027829170227, + "learning_rate": 8.308287279178651e-08, + "loss": 0.0214, + "mean_token_accuracy": 0.9897110819816589, + "num_tokens": 12452490.0, + "step": 5840 + }, + { + "entropy": 1.6428742289543152, + "epoch": 1.923963133640553, + "grad_norm": 1.7451066970825195, + "learning_rate": 7.961885562463689e-08, + "loss": 0.0205, + "mean_token_accuracy": 0.993678605556488, + "num_tokens": 12463113.0, + "step": 5845 + }, + { + "entropy": 1.6416840195655822, + "epoch": 1.9256089532587228, + "grad_norm": 0.20614351332187653, + "learning_rate": 7.622830933033954e-08, + "loss": 0.0159, + "mean_token_accuracy": 0.9947439730167389, + "num_tokens": 12473584.0, + "step": 5850 + }, + { + "entropy": 1.668267822265625, + "epoch": 1.9272547728768927, + "grad_norm": 1.4911457300186157, + "learning_rate": 7.291125901946027e-08, + "loss": 0.0293, + "mean_token_accuracy": 0.9900563299655915, + "num_tokens": 12483990.0, + "step": 5855 + }, + { + "entropy": 1.6057178854942322, + "epoch": 1.9289005924950624, + "grad_norm": 1.5931812524795532, + "learning_rate": 6.966772925825149e-08, + "loss": 0.0199, + "mean_token_accuracy": 0.9930034399032592, + "num_tokens": 12494711.0, + "step": 5860 + }, + { + "entropy": 1.6411123514175414, + "epoch": 1.9305464121132325, + "grad_norm": 2.541999578475952, + "learning_rate": 6.649774406846777e-08, + "loss": 0.0191, + "mean_token_accuracy": 0.9947657942771911, + "num_tokens": 12505189.0, + "step": 5865 + }, + { + "entropy": 1.6578918814659118, + "epoch": 1.9321922317314022, + "grad_norm": 1.0876408815383911, + "learning_rate": 6.340132692718936e-08, + "loss": 0.0143, + "mean_token_accuracy": 0.994833791255951, + "num_tokens": 12515680.0, + "step": 5870 + }, + { + "entropy": 1.620399296283722, + "epoch": 1.933838051349572, + "grad_norm": 1.0122355222702026, + "learning_rate": 6.037850076664686e-08, + "loss": 0.02, + "mean_token_accuracy": 0.9926087379455566, + "num_tokens": 12526219.0, + "step": 5875 + }, + { + "entropy": 1.619743776321411, + "epoch": 1.935483870967742, + "grad_norm": 2.4715607166290283, + "learning_rate": 5.742928797405234e-08, + "loss": 0.027, + "mean_token_accuracy": 0.9934347212314606, + "num_tokens": 12536968.0, + "step": 5880 + }, + { + "entropy": 1.6266910910606385, + "epoch": 1.9371296905859117, + "grad_norm": 1.2927361726760864, + "learning_rate": 5.455371039143176e-08, + "loss": 0.0296, + "mean_token_accuracy": 0.9922801196575165, + "num_tokens": 12547695.0, + "step": 5885 + }, + { + "entropy": 1.6311318516731261, + "epoch": 1.9387755102040818, + "grad_norm": 1.5303946733474731, + "learning_rate": 5.175178931546842e-08, + "loss": 0.0256, + "mean_token_accuracy": 0.9924924194812774, + "num_tokens": 12558345.0, + "step": 5890 + }, + { + "entropy": 1.6298449873924254, + "epoch": 1.9404213298222515, + "grad_norm": 0.6128053069114685, + "learning_rate": 4.902354549733979e-08, + "loss": 0.0268, + "mean_token_accuracy": 0.9905460000038147, + "num_tokens": 12568986.0, + "step": 5895 + }, + { + "entropy": 1.633393156528473, + "epoch": 1.9420671494404214, + "grad_norm": 1.1418019533157349, + "learning_rate": 4.636899914256421e-08, + "loss": 0.0184, + "mean_token_accuracy": 0.9936664879322052, + "num_tokens": 12579759.0, + "step": 5900 + }, + { + "entropy": 1.63808730840683, + "epoch": 1.9437129690585913, + "grad_norm": 2.710805654525757, + "learning_rate": 4.378816991085333e-08, + "loss": 0.0198, + "mean_token_accuracy": 0.9934788465499877, + "num_tokens": 12590306.0, + "step": 5905 + }, + { + "entropy": 1.6379319310188294, + "epoch": 1.945358788676761, + "grad_norm": 1.7254809141159058, + "learning_rate": 4.128107691596772e-08, + "loss": 0.0228, + "mean_token_accuracy": 0.9918417274951935, + "num_tokens": 12600767.0, + "step": 5910 + }, + { + "entropy": 1.6477123498916626, + "epoch": 1.9470046082949308, + "grad_norm": 0.8686738014221191, + "learning_rate": 3.884773872557035e-08, + "loss": 0.0285, + "mean_token_accuracy": 0.989762258529663, + "num_tokens": 12611296.0, + "step": 5915 + }, + { + "entropy": 1.6127866506576538, + "epoch": 1.9486504279131007, + "grad_norm": 1.5231475830078125, + "learning_rate": 3.648817336109556e-08, + "loss": 0.0185, + "mean_token_accuracy": 0.9937713503837585, + "num_tokens": 12622049.0, + "step": 5920 + }, + { + "entropy": 1.6393255352973939, + "epoch": 1.9502962475312706, + "grad_norm": 1.2301123142242432, + "learning_rate": 3.420239829761029e-08, + "loss": 0.0169, + "mean_token_accuracy": 0.9921809434890747, + "num_tokens": 12632714.0, + "step": 5925 + }, + { + "entropy": 1.661654818058014, + "epoch": 1.9519420671494405, + "grad_norm": 0.7805505990982056, + "learning_rate": 3.199043046368644e-08, + "loss": 0.0181, + "mean_token_accuracy": 0.9945761322975158, + "num_tokens": 12643222.0, + "step": 5930 + }, + { + "entropy": 1.6112356901168823, + "epoch": 1.9535878867676102, + "grad_norm": 1.137802004814148, + "learning_rate": 2.985228624127534e-08, + "loss": 0.0189, + "mean_token_accuracy": 0.9933175027370453, + "num_tokens": 12654137.0, + "step": 5935 + }, + { + "entropy": 1.6136484742164612, + "epoch": 1.95523370638578, + "grad_norm": 1.2796485424041748, + "learning_rate": 2.778798146558903e-08, + "loss": 0.0161, + "mean_token_accuracy": 0.9944588780403137, + "num_tokens": 12664918.0, + "step": 5940 + }, + { + "entropy": 1.615454363822937, + "epoch": 1.95687952600395, + "grad_norm": 1.3428839445114136, + "learning_rate": 2.5797531424976983e-08, + "loss": 0.0275, + "mean_token_accuracy": 0.993310171365738, + "num_tokens": 12675751.0, + "step": 5945 + }, + { + "entropy": 1.63991322517395, + "epoch": 1.9585253456221197, + "grad_norm": 1.0178812742233276, + "learning_rate": 2.388095086081954e-08, + "loss": 0.0189, + "mean_token_accuracy": 0.9943699061870575, + "num_tokens": 12686462.0, + "step": 5950 + }, + { + "entropy": 1.6252501964569093, + "epoch": 1.9601711652402898, + "grad_norm": 0.5898308157920837, + "learning_rate": 2.2038253967415768e-08, + "loss": 0.0262, + "mean_token_accuracy": 0.9889247179031372, + "num_tokens": 12697111.0, + "step": 5955 + }, + { + "entropy": 1.6657869219779968, + "epoch": 1.9618169848584595, + "grad_norm": 1.152711033821106, + "learning_rate": 2.0269454391874665e-08, + "loss": 0.0195, + "mean_token_accuracy": 0.9946586787700653, + "num_tokens": 12707827.0, + "step": 5960 + }, + { + "entropy": 1.6191688776016235, + "epoch": 1.9634628044766294, + "grad_norm": 1.1193846464157104, + "learning_rate": 1.8574565234023014e-08, + "loss": 0.0101, + "mean_token_accuracy": 0.9955754101276397, + "num_tokens": 12718563.0, + "step": 5965 + }, + { + "entropy": 1.603171467781067, + "epoch": 1.9651086240947993, + "grad_norm": 1.5283174514770508, + "learning_rate": 1.6953599046299895e-08, + "loss": 0.0175, + "mean_token_accuracy": 0.9934708416461945, + "num_tokens": 12729286.0, + "step": 5970 + }, + { + "entropy": 1.6272645711898803, + "epoch": 1.966754443712969, + "grad_norm": 0.8465014100074768, + "learning_rate": 1.5406567833666785e-08, + "loss": 0.0151, + "mean_token_accuracy": 0.9951033234596253, + "num_tokens": 12739732.0, + "step": 5975 + }, + { + "entropy": 1.644085705280304, + "epoch": 1.968400263331139, + "grad_norm": 0.6121142506599426, + "learning_rate": 1.3933483053519825e-08, + "loss": 0.0142, + "mean_token_accuracy": 0.9949011683464051, + "num_tokens": 12750492.0, + "step": 5980 + }, + { + "entropy": 1.6154101610183715, + "epoch": 1.9700460829493087, + "grad_norm": 2.3735482692718506, + "learning_rate": 1.2534355615603233e-08, + "loss": 0.033, + "mean_token_accuracy": 0.9925001919269562, + "num_tokens": 12761291.0, + "step": 5985 + }, + { + "entropy": 1.6425586223602295, + "epoch": 1.9716919025674786, + "grad_norm": 2.7295331954956055, + "learning_rate": 1.1209195881930479e-08, + "loss": 0.0247, + "mean_token_accuracy": 0.9938769578933716, + "num_tokens": 12771987.0, + "step": 5990 + }, + { + "entropy": 1.5966406106948852, + "epoch": 1.9733377221856485, + "grad_norm": 2.0931482315063477, + "learning_rate": 9.958013666704347e-09, + "loss": 0.0198, + "mean_token_accuracy": 0.9945861279964447, + "num_tokens": 12782717.0, + "step": 5995 + }, + { + "entropy": 1.6259140491485595, + "epoch": 1.9749835418038182, + "grad_norm": 0.6469723582267761, + "learning_rate": 8.780818236248101e-09, + "loss": 0.0242, + "mean_token_accuracy": 0.9942210376262665, + "num_tokens": 12793255.0, + "step": 6000 + }, + { + "entropy": 1.6549716949462892, + "epoch": 1.9766293614219883, + "grad_norm": 0.901390790939331, + "learning_rate": 7.67761830893443e-09, + "loss": 0.0137, + "mean_token_accuracy": 0.9956705689430236, + "num_tokens": 12804142.0, + "step": 6005 + }, + { + "entropy": 1.6143463373184204, + "epoch": 1.978275181040158, + "grad_norm": 1.2343865633010864, + "learning_rate": 6.648422055118842e-09, + "loss": 0.0127, + "mean_token_accuracy": 0.9951093852519989, + "num_tokens": 12815008.0, + "step": 6010 + }, + { + "entropy": 1.6510612845420838, + "epoch": 1.9799210006583279, + "grad_norm": 2.802551031112671, + "learning_rate": 5.693237097085247e-09, + "loss": 0.0162, + "mean_token_accuracy": 0.9935081660747528, + "num_tokens": 12825498.0, + "step": 6015 + }, + { + "entropy": 1.6498525738716125, + "epoch": 1.9815668202764978, + "grad_norm": 0.6338088512420654, + "learning_rate": 4.8120705089849116e-09, + "loss": 0.0147, + "mean_token_accuracy": 0.9955414116382599, + "num_tokens": 12835988.0, + "step": 6020 + }, + { + "entropy": 1.6122831344604491, + "epoch": 1.9832126398946675, + "grad_norm": 2.181898355484009, + "learning_rate": 4.00492881678427e-09, + "loss": 0.0294, + "mean_token_accuracy": 0.9928855717182159, + "num_tokens": 12846863.0, + "step": 6025 + }, + { + "entropy": 1.638089156150818, + "epoch": 1.9848584595128373, + "grad_norm": 2.5133092403411865, + "learning_rate": 3.271817998216076e-09, + "loss": 0.025, + "mean_token_accuracy": 0.9939551532268525, + "num_tokens": 12857465.0, + "step": 6030 + }, + { + "entropy": 1.5842849373817445, + "epoch": 1.9865042791310072, + "grad_norm": 3.628753662109375, + "learning_rate": 2.612743482741653e-09, + "loss": 0.0359, + "mean_token_accuracy": 0.9892144203186035, + "num_tokens": 12868438.0, + "step": 6035 + }, + { + "entropy": 1.676398503780365, + "epoch": 1.9881500987491771, + "grad_norm": 2.431900978088379, + "learning_rate": 2.0277101514987184e-09, + "loss": 0.0276, + "mean_token_accuracy": 0.991285365819931, + "num_tokens": 12878820.0, + "step": 6040 + }, + { + "entropy": 1.6270844221115113, + "epoch": 1.989795918367347, + "grad_norm": 3.490521192550659, + "learning_rate": 1.5167223372780648e-09, + "loss": 0.0291, + "mean_token_accuracy": 0.9905443787574768, + "num_tokens": 12889541.0, + "step": 6045 + }, + { + "entropy": 1.595809280872345, + "epoch": 1.9914417379855167, + "grad_norm": 2.535038709640503, + "learning_rate": 1.0797838244802627e-09, + "loss": 0.022, + "mean_token_accuracy": 0.9905419945716858, + "num_tokens": 12900791.0, + "step": 6050 + }, + { + "entropy": 1.594471561908722, + "epoch": 1.9930875576036866, + "grad_norm": 1.6428083181381226, + "learning_rate": 7.168978490978973e-10, + "loss": 0.0099, + "mean_token_accuracy": 0.9953214168548584, + "num_tokens": 12911687.0, + "step": 6055 + }, + { + "entropy": 1.6622145771980286, + "epoch": 1.9947333772218565, + "grad_norm": 1.0861161947250366, + "learning_rate": 4.2806709868115084e-10, + "loss": 0.0256, + "mean_token_accuracy": 0.9900858104228973, + "num_tokens": 12922176.0, + "step": 6060 + }, + { + "entropy": 1.616791796684265, + "epoch": 1.9963791968400262, + "grad_norm": 2.0424978733062744, + "learning_rate": 2.1329371232892138e-10, + "loss": 0.0261, + "mean_token_accuracy": 0.9941427707672119, + "num_tokens": 12932662.0, + "step": 6065 + }, + { + "entropy": 1.6235838294029237, + "epoch": 1.9980250164581963, + "grad_norm": 0.6039044260978699, + "learning_rate": 7.257928066217723e-11, + "loss": 0.0321, + "mean_token_accuracy": 0.9933849751949311, + "num_tokens": 12943359.0, + "step": 6070 + }, + { + "entropy": 1.6359187722206117, + "epoch": 1.999670836076366, + "grad_norm": 3.8299636840820312, + "learning_rate": 5.924845819516023e-12, + "loss": 0.0217, + "mean_token_accuracy": 0.9923797845840454, + "num_tokens": 12954081.0, + "step": 6075 + } + ], + "logging_steps": 5, + "max_steps": 6076, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 608, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.285029236308019e+16, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}