[ { "loss": 3.1532291412353515, "grad_norm": 0.7252150177955627, "learning_rate": 4.166666666666666e-08, "entropy": 0.909868873655796, "num_tokens": 59625.0, "mean_token_accuracy": 0.5191385246813297, "epoch": 0.004185413832792717, "step": 10 }, { "loss": 3.2842838287353517, "grad_norm": 0.8542681336402893, "learning_rate": 8.796296296296296e-08, "entropy": 0.9062351793050766, "num_tokens": 115055.0, "mean_token_accuracy": 0.5090377777814865, "epoch": 0.008370827665585435, "step": 20 }, { "loss": 3.120407485961914, "grad_norm": 0.8136049509048462, "learning_rate": 1.3425925925925926e-07, "entropy": 0.8844040989875793, "num_tokens": 173001.0, "mean_token_accuracy": 0.5300635486841202, "epoch": 0.012556241498378152, "step": 30 }, { "loss": 3.1775161743164064, "grad_norm": 0.7916132807731628, "learning_rate": 1.8055555555555554e-07, "entropy": 0.9001422733068466, "num_tokens": 230759.0, "mean_token_accuracy": 0.5147604435682297, "epoch": 0.01674165533117087, "step": 40 }, { "loss": 3.1207983016967775, "grad_norm": 0.8365122675895691, "learning_rate": 2.2685185185185184e-07, "entropy": 0.9065567880868912, "num_tokens": 290037.0, "mean_token_accuracy": 0.516672582924366, "epoch": 0.020927069163963585, "step": 50 }, { "loss": 3.110250473022461, "grad_norm": 0.8340612649917603, "learning_rate": 2.731481481481481e-07, "entropy": 0.894629393517971, "num_tokens": 348666.0, "mean_token_accuracy": 0.5243608556687832, "epoch": 0.025112482996756304, "step": 60 }, { "loss": 3.213370513916016, "grad_norm": 0.7208571434020996, "learning_rate": 3.194444444444444e-07, "entropy": 0.9139490529894829, "num_tokens": 404064.0, "mean_token_accuracy": 0.510741152614355, "epoch": 0.02929789682954902, "step": 70 }, { "loss": 3.1525503158569337, "grad_norm": 0.8069764971733093, "learning_rate": 3.657407407407407e-07, "entropy": 0.8901543036103249, "num_tokens": 459991.0, "mean_token_accuracy": 0.5182561405003071, "epoch": 0.03348331066234174, "step": 80 }, { "loss": 3.2056060791015626, "grad_norm": 0.7775021195411682, "learning_rate": 4.12037037037037e-07, "entropy": 0.9084297090768814, "num_tokens": 515607.0, "mean_token_accuracy": 0.5117158360779286, "epoch": 0.037668724495134455, "step": 90 }, { "loss": 3.1619991302490233, "grad_norm": 0.7517663240432739, "learning_rate": 4.5833333333333327e-07, "entropy": 0.9325117334723473, "num_tokens": 571331.0, "mean_token_accuracy": 0.5146001622080802, "epoch": 0.04185413832792717, "step": 100 }, { "loss": 3.083780288696289, "grad_norm": 0.7766073346138, "learning_rate": 5.046296296296297e-07, "entropy": 0.9166652098298073, "num_tokens": 628370.0, "mean_token_accuracy": 0.523348405212164, "epoch": 0.04603955216071989, "step": 110 }, { "loss": 2.9515827178955076, "grad_norm": 0.7737402319908142, "learning_rate": 5.509259259259259e-07, "entropy": 0.8990944147109985, "num_tokens": 691820.0, "mean_token_accuracy": 0.5330506779253483, "epoch": 0.05022496599351261, "step": 120 }, { "loss": 3.009184646606445, "grad_norm": 0.7089793086051941, "learning_rate": 5.972222222222222e-07, "entropy": 0.9393417268991471, "num_tokens": 750187.0, "mean_token_accuracy": 0.52148522362113, "epoch": 0.054410379826305325, "step": 130 }, { "loss": 2.931972885131836, "grad_norm": 0.6590924263000488, "learning_rate": 6.435185185185184e-07, "entropy": 0.9367253214120865, "num_tokens": 810727.0, "mean_token_accuracy": 0.52733798250556, "epoch": 0.05859579365909804, "step": 140 }, { "loss": 2.9645376205444336, "grad_norm": 0.8957489728927612, "learning_rate": 6.898148148148148e-07, "entropy": 0.9482719719409942, "num_tokens": 869700.0, "mean_token_accuracy": 0.5252767078578472, "epoch": 0.06278120749189076, "step": 150 }, { "loss": 2.9582921981811525, "grad_norm": 0.7582520842552185, "learning_rate": 7.361111111111111e-07, "entropy": 0.9640893578529358, "num_tokens": 929913.0, "mean_token_accuracy": 0.5209386304020882, "epoch": 0.06696662132468348, "step": 160 }, { "loss": 2.942486381530762, "grad_norm": 0.8944061994552612, "learning_rate": 7.824074074074074e-07, "entropy": 1.0264274433255196, "num_tokens": 987456.0, "mean_token_accuracy": 0.5134634062647819, "epoch": 0.0711520351574762, "step": 170 }, { "loss": 2.8791921615600584, "grad_norm": 0.8744268417358398, "learning_rate": 8.287037037037036e-07, "entropy": 1.0515225112438202, "num_tokens": 1045578.0, "mean_token_accuracy": 0.5203944936394691, "epoch": 0.07533744899026891, "step": 180 }, { "loss": 2.671405029296875, "grad_norm": 0.6432790160179138, "learning_rate": 8.75e-07, "entropy": 1.0437349081039429, "num_tokens": 1105831.0, "mean_token_accuracy": 0.5427842013537884, "epoch": 0.07952286282306163, "step": 190 }, { "loss": 2.658889389038086, "grad_norm": 0.606299877166748, "learning_rate": 9.212962962962962e-07, "entropy": 1.1231032446026803, "num_tokens": 1162666.0, "mean_token_accuracy": 0.5306987896561622, "epoch": 0.08370827665585434, "step": 200 }, { "eval_loss": 2.64770245552063, "eval_runtime": 46.2508, "eval_samples_per_second": 137.749, "eval_steps_per_second": 5.751, "eval_entropy": 1.1016171117474263, "eval_num_tokens": 1162666.0, "eval_mean_token_accuracy": 0.5367447220741358, "epoch": 0.08370827665585434, "step": 200 }, { "loss": 2.6236072540283204, "grad_norm": 0.497915118932724, "learning_rate": 9.675925925925926e-07, "entropy": 1.17362399995327, "num_tokens": 1219379.0, "mean_token_accuracy": 0.5290989577770233, "epoch": 0.08789369048864706, "step": 210 }, { "loss": 2.469617462158203, "grad_norm": 0.46626466512680054, "learning_rate": 9.999995407884271e-07, "entropy": 1.1599584549665451, "num_tokens": 1280477.0, "mean_token_accuracy": 0.54561567902565, "epoch": 0.09207910432143979, "step": 220 }, { "loss": 2.5247661590576174, "grad_norm": 0.4741360545158386, "learning_rate": 9.999913770505991e-07, "entropy": 1.2275266259908677, "num_tokens": 1337757.0, "mean_token_accuracy": 0.5331762477755546, "epoch": 0.0962645181542325, "step": 230 }, { "loss": 2.395884704589844, "grad_norm": 0.48130306601524353, "learning_rate": 9.999730088029378e-07, "entropy": 1.2018450900912285, "num_tokens": 1397229.0, "mean_token_accuracy": 0.5442760087549686, "epoch": 0.10044993198702522, "step": 240 }, { "loss": 2.3129932403564455, "grad_norm": 0.43950778245925903, "learning_rate": 9.99944436420327e-07, "entropy": 1.2097549244761467, "num_tokens": 1457262.0, "mean_token_accuracy": 0.5529780350625515, "epoch": 0.10463534581981794, "step": 250 }, { "loss": 2.4272939682006838, "grad_norm": 0.4790073335170746, "learning_rate": 9.999056604859114e-07, "entropy": 1.2772572070360184, "num_tokens": 1511629.0, "mean_token_accuracy": 0.5349524199962616, "epoch": 0.10882075965261065, "step": 260 }, { "loss": 2.3150350570678713, "grad_norm": 0.3617287278175354, "learning_rate": 9.998566817910835e-07, "entropy": 1.2845856487751006, "num_tokens": 1567691.0, "mean_token_accuracy": 0.5448164060711861, "epoch": 0.11300617348540337, "step": 270 }, { "loss": 2.2049863815307615, "grad_norm": 0.31779831647872925, "learning_rate": 9.997975013354675e-07, "entropy": 1.271442210674286, "num_tokens": 1628624.0, "mean_token_accuracy": 0.5550637729465961, "epoch": 0.11719158731819608, "step": 280 }, { "loss": 2.1529918670654298, "grad_norm": 0.32820868492126465, "learning_rate": 9.997281203268995e-07, "entropy": 1.279099041223526, "num_tokens": 1689782.0, "mean_token_accuracy": 0.5614619500935077, "epoch": 0.1213770011509888, "step": 290 }, { "loss": 2.1452268600463866, "grad_norm": 0.3186478018760681, "learning_rate": 9.99648540181402e-07, "entropy": 1.323472622036934, "num_tokens": 1749745.0, "mean_token_accuracy": 0.556930074095726, "epoch": 0.1255624149837815, "step": 300 }, { "loss": 2.134934997558594, "grad_norm": 0.30079811811447144, "learning_rate": 9.995587625231564e-07, "entropy": 1.3632762670516967, "num_tokens": 1807987.0, "mean_token_accuracy": 0.5517515152692795, "epoch": 0.12974782881657423, "step": 310 }, { "loss": 1.9839101791381837, "grad_norm": 0.361982136964798, "learning_rate": 9.994587891844675e-07, "entropy": 1.3298138067126275, "num_tokens": 1867566.0, "mean_token_accuracy": 0.57241967394948, "epoch": 0.13393324264936696, "step": 320 }, { "loss": 2.0360176086425783, "grad_norm": 0.3045739233493805, "learning_rate": 9.99348622205729e-07, "entropy": 1.3972268849611282, "num_tokens": 1926290.0, "mean_token_accuracy": 0.5540065504610538, "epoch": 0.13811865648215968, "step": 330 }, { "loss": 2.0286712646484375, "grad_norm": 0.31182143092155457, "learning_rate": 9.99228263835379e-07, "entropy": 1.4252518475055695, "num_tokens": 1985494.0, "mean_token_accuracy": 0.5559632822871208, "epoch": 0.1423040703149524, "step": 340 }, { "loss": 2.064646911621094, "grad_norm": 0.32973718643188477, "learning_rate": 9.990977165298569e-07, "entropy": 1.4729229807853699, "num_tokens": 2044451.0, "mean_token_accuracy": 0.542438729852438, "epoch": 0.1464894841477451, "step": 350 }, { "loss": 1.9804191589355469, "grad_norm": 0.289235383272171, "learning_rate": 9.989569829535508e-07, "entropy": 1.465097615122795, "num_tokens": 2104274.0, "mean_token_accuracy": 0.5540824517607689, "epoch": 0.15067489798053782, "step": 360 }, { "loss": 1.9926959991455078, "grad_norm": 0.29901251196861267, "learning_rate": 9.988060659787448e-07, "entropy": 1.5359219849109649, "num_tokens": 2159709.0, "mean_token_accuracy": 0.5466853015124797, "epoch": 0.15486031181333054, "step": 370 }, { "loss": 1.9560510635375976, "grad_norm": 0.2495652139186859, "learning_rate": 9.986449686855592e-07, "entropy": 1.5187518745660782, "num_tokens": 2215716.0, "mean_token_accuracy": 0.5528687633574009, "epoch": 0.15904572564612326, "step": 380 }, { "loss": 1.9273345947265625, "grad_norm": 0.24777130782604218, "learning_rate": 9.984736943618888e-07, "entropy": 1.521226641535759, "num_tokens": 2271587.0, "mean_token_accuracy": 0.5515810877084732, "epoch": 0.163231139478916, "step": 390 }, { "loss": 1.8669830322265626, "grad_norm": 0.22564777731895447, "learning_rate": 9.982922465033348e-07, "entropy": 1.5213739037513734, "num_tokens": 2333246.0, "mean_token_accuracy": 0.5613327234983444, "epoch": 0.16741655331170868, "step": 400 }, { "eval_loss": 1.8593320846557617, "eval_runtime": 43.8766, "eval_samples_per_second": 145.203, "eval_steps_per_second": 6.062, "eval_entropy": 1.4658072012707704, "eval_num_tokens": 2333246.0, "eval_mean_token_accuracy": 0.5688402788307434, "epoch": 0.16741655331170868, "step": 400 }, { "loss": 1.804941177368164, "grad_norm": 0.23039200901985168, "learning_rate": 9.981006288131342e-07, "entropy": 1.4880431205034257, "num_tokens": 2391775.0, "mean_token_accuracy": 0.5735716104507447, "epoch": 0.1716019671445014, "step": 410 }, { "loss": 1.8068187713623047, "grad_norm": 0.2117597907781601, "learning_rate": 9.978988452020832e-07, "entropy": 1.5207171112298965, "num_tokens": 2449834.0, "mean_token_accuracy": 0.5626526214182377, "epoch": 0.17578738097729413, "step": 420 }, { "loss": 1.8408426284790038, "grad_norm": 0.18858124315738678, "learning_rate": 9.97686899788459e-07, "entropy": 1.5431180804967881, "num_tokens": 2509956.0, "mean_token_accuracy": 0.5639997899532319, "epoch": 0.17997279481008685, "step": 430 }, { "loss": 1.8002569198608398, "grad_norm": 0.20246392488479614, "learning_rate": 9.974647968979339e-07, "entropy": 1.5480373591184615, "num_tokens": 2566510.0, "mean_token_accuracy": 0.5718876734375954, "epoch": 0.18415820864287957, "step": 440 }, { "loss": 1.7867300033569335, "grad_norm": 0.1937456578016281, "learning_rate": 9.972325410634885e-07, "entropy": 1.531213068962097, "num_tokens": 2624567.0, "mean_token_accuracy": 0.5752546131610871, "epoch": 0.1883436224756723, "step": 450 }, { "loss": 1.755255889892578, "grad_norm": 0.1926703304052353, "learning_rate": 9.969901370253187e-07, "entropy": 1.5252358853816985, "num_tokens": 2682287.0, "mean_token_accuracy": 0.5824674129486084, "epoch": 0.192529036308465, "step": 460 }, { "loss": 1.772911834716797, "grad_norm": 0.21854747831821442, "learning_rate": 9.96737589730738e-07, "entropy": 1.575288510322571, "num_tokens": 2735734.0, "mean_token_accuracy": 0.5827064469456673, "epoch": 0.1967144501412577, "step": 470 }, { "loss": 1.6943496704101562, "grad_norm": 0.16926386952400208, "learning_rate": 9.964749043340788e-07, "entropy": 1.4982535749673844, "num_tokens": 2794073.0, "mean_token_accuracy": 0.5961055085062981, "epoch": 0.20089986397405044, "step": 480 }, { "loss": 1.759925651550293, "grad_norm": 0.18107837438583374, "learning_rate": 9.962020861965843e-07, "entropy": 1.5479711294174194, "num_tokens": 2852819.0, "mean_token_accuracy": 0.5893502697348595, "epoch": 0.20508527780684316, "step": 490 }, { "loss": 1.6459325790405273, "grad_norm": 0.19736573100090027, "learning_rate": 9.959191408863014e-07, "entropy": 1.4799151957035064, "num_tokens": 2916292.0, "mean_token_accuracy": 0.6062492698431015, "epoch": 0.20927069163963588, "step": 500 }, { "loss": 1.6710922241210937, "grad_norm": 0.20262014865875244, "learning_rate": 9.956260741779665e-07, "entropy": 1.5181541979312896, "num_tokens": 2975466.0, "mean_token_accuracy": 0.6021158128976822, "epoch": 0.21345610547242858, "step": 510 }, { "loss": 1.6909679412841796, "grad_norm": 0.22447596490383148, "learning_rate": 9.953228920528865e-07, "entropy": 1.523398867249489, "num_tokens": 3033907.0, "mean_token_accuracy": 0.5984062060713768, "epoch": 0.2176415193052213, "step": 520 }, { "loss": 1.6476320266723632, "grad_norm": 0.20474065840244293, "learning_rate": 9.950096006988182e-07, "entropy": 1.5151092141866684, "num_tokens": 3090006.0, "mean_token_accuracy": 0.6024264812469482, "epoch": 0.22182693313801402, "step": 530 }, { "loss": 1.656897735595703, "grad_norm": 0.2609263062477112, "learning_rate": 9.946862065098413e-07, "entropy": 1.5152370780706406, "num_tokens": 3145289.0, "mean_token_accuracy": 0.6096841841936111, "epoch": 0.22601234697080674, "step": 540 }, { "loss": 1.540487289428711, "grad_norm": 0.21252916753292084, "learning_rate": 9.943527160862281e-07, "entropy": 1.4425812840461731, "num_tokens": 3202970.0, "mean_token_accuracy": 0.6310827702283859, "epoch": 0.23019776080359947, "step": 550 }, { "loss": 1.5528440475463867, "grad_norm": 0.16846199333667755, "learning_rate": 9.940091362343086e-07, "entropy": 1.4279247790575027, "num_tokens": 3258895.0, "mean_token_accuracy": 0.6345707163214683, "epoch": 0.23438317463639216, "step": 560 }, { "loss": 1.5468204498291016, "grad_norm": 0.1584591120481491, "learning_rate": 9.936554739663315e-07, "entropy": 1.4223629891872407, "num_tokens": 3315602.0, "mean_token_accuracy": 0.6405046731233597, "epoch": 0.23856858846918488, "step": 570 }, { "loss": 1.5502227783203124, "grad_norm": 0.1557629555463791, "learning_rate": 9.932917365003216e-07, "entropy": 1.3991417795419694, "num_tokens": 3376637.0, "mean_token_accuracy": 0.6378504887223244, "epoch": 0.2427540023019776, "step": 580 }, { "loss": 1.47230224609375, "grad_norm": 0.16370368003845215, "learning_rate": 9.929179312599317e-07, "entropy": 1.3776833653450011, "num_tokens": 3435084.0, "mean_token_accuracy": 0.655489268898964, "epoch": 0.24693941613477033, "step": 590 }, { "loss": 1.5052314758300782, "grad_norm": 0.15081895887851715, "learning_rate": 9.925340658742926e-07, "entropy": 1.4087885320186615, "num_tokens": 3491147.0, "mean_token_accuracy": 0.6534322142601013, "epoch": 0.251124829967563, "step": 600 }, { "eval_loss": 1.5126348733901978, "eval_runtime": 43.6936, "eval_samples_per_second": 145.811, "eval_steps_per_second": 6.088, "eval_entropy": 1.3902130023877424, "eval_num_tokens": 3491147.0, "eval_mean_token_accuracy": 0.654797031018967, "epoch": 0.251124829967563, "step": 600 }, { "loss": 1.5386703491210938, "grad_norm": 0.13764511048793793, "learning_rate": 9.921401481778548e-07, "entropy": 1.4563434034585954, "num_tokens": 3547282.0, "mean_token_accuracy": 0.6465040192008018, "epoch": 0.2553102438003558, "step": 610 }, { "loss": 1.5452125549316407, "grad_norm": 0.12197960168123245, "learning_rate": 9.917361862102316e-07, "entropy": 1.4381413817405702, "num_tokens": 3604669.0, "mean_token_accuracy": 0.6443240866065025, "epoch": 0.25949565763314847, "step": 620 }, { "loss": 1.5622711181640625, "grad_norm": 0.11413100361824036, "learning_rate": 9.913221882160325e-07, "entropy": 1.4670737832784653, "num_tokens": 3660716.0, "mean_token_accuracy": 0.6438136756420135, "epoch": 0.2636810714659412, "step": 630 }, { "loss": 1.5195579528808594, "grad_norm": 0.11969699710607529, "learning_rate": 9.908981626446967e-07, "entropy": 1.44781274497509, "num_tokens": 3716067.0, "mean_token_accuracy": 0.6527407199144364, "epoch": 0.2678664852987339, "step": 640 }, { "loss": 1.562466812133789, "grad_norm": 0.10654503107070923, "learning_rate": 9.904641181503193e-07, "entropy": 1.4735447496175766, "num_tokens": 3772158.0, "mean_token_accuracy": 0.6453444182872772, "epoch": 0.2720518991315266, "step": 650 }, { "loss": 1.4119970321655273, "grad_norm": 0.11862610280513763, "learning_rate": 9.900200635914762e-07, "entropy": 1.3951878100633621, "num_tokens": 3833284.0, "mean_token_accuracy": 0.6668122097849846, "epoch": 0.27623731296431936, "step": 660 }, { "loss": 1.4448695182800293, "grad_norm": 0.11796533316373825, "learning_rate": 9.895660080310418e-07, "entropy": 1.4141918390989303, "num_tokens": 3890126.0, "mean_token_accuracy": 0.6602638632059097, "epoch": 0.28042272679711205, "step": 670 }, { "loss": 1.4727934837341308, "grad_norm": 0.10836026817560196, "learning_rate": 9.891019607360042e-07, "entropy": 1.4221189886331558, "num_tokens": 3946816.0, "mean_token_accuracy": 0.6601494640111923, "epoch": 0.2846081406299048, "step": 680 }, { "loss": 1.4374773979187012, "grad_norm": 0.11260558664798737, "learning_rate": 9.88627931177278e-07, "entropy": 1.388827046751976, "num_tokens": 4006104.0, "mean_token_accuracy": 0.6628721192479133, "epoch": 0.2887935544626975, "step": 690 }, { "loss": 1.4522128105163574, "grad_norm": 0.09237143397331238, "learning_rate": 9.88143929029508e-07, "entropy": 1.4040265291929246, "num_tokens": 4064859.0, "mean_token_accuracy": 0.6620885074138642, "epoch": 0.2929789682954902, "step": 700 }, { "loss": 1.4630813598632812, "grad_norm": 0.11288689821958542, "learning_rate": 9.876499641708741e-07, "entropy": 1.4170700162649155, "num_tokens": 4122576.0, "mean_token_accuracy": 0.6592713505029678, "epoch": 0.29716438212828294, "step": 710 }, { "loss": 1.3941055297851563, "grad_norm": 0.09874723106622696, "learning_rate": 9.871460466828888e-07, "entropy": 1.3975009769201279, "num_tokens": 4180815.0, "mean_token_accuracy": 0.6696879684925079, "epoch": 0.30134979596107564, "step": 720 }, { "loss": 1.4689726829528809, "grad_norm": 0.10610879957675934, "learning_rate": 9.866321868501912e-07, "entropy": 1.4678748458623887, "num_tokens": 4239666.0, "mean_token_accuracy": 0.6571864351630211, "epoch": 0.3055352097938684, "step": 730 }, { "loss": 1.479258155822754, "grad_norm": 0.12200459837913513, "learning_rate": 9.861083951603377e-07, "entropy": 1.430861946940422, "num_tokens": 4297700.0, "mean_token_accuracy": 0.6564601019024849, "epoch": 0.3097206236266611, "step": 740 }, { "loss": 1.4389605522155762, "grad_norm": 0.12583598494529724, "learning_rate": 9.855746823035876e-07, "entropy": 1.432998749613762, "num_tokens": 4355152.0, "mean_token_accuracy": 0.6649609237909317, "epoch": 0.3139060374594538, "step": 750 }, { "loss": 1.451263999938965, "grad_norm": 0.11383051425218582, "learning_rate": 9.850310591726846e-07, "entropy": 1.4290786892175675, "num_tokens": 4414094.0, "mean_token_accuracy": 0.6572059765458107, "epoch": 0.31809145129224653, "step": 760 }, { "loss": 1.5145987510681151, "grad_norm": 0.10066704452037811, "learning_rate": 9.844775368626358e-07, "entropy": 1.4577032029628754, "num_tokens": 4472646.0, "mean_token_accuracy": 0.6500703617930412, "epoch": 0.3222768651250392, "step": 770 }, { "loss": 1.4831979751586915, "grad_norm": 0.08656121045351028, "learning_rate": 9.839141266704833e-07, "entropy": 1.4568557769060135, "num_tokens": 4529048.0, "mean_token_accuracy": 0.6541818514466285, "epoch": 0.326462278957832, "step": 780 }, { "loss": 1.422746181488037, "grad_norm": 0.10030363500118256, "learning_rate": 9.833408400950753e-07, "entropy": 1.4236784011125565, "num_tokens": 4587248.0, "mean_token_accuracy": 0.6693221822381019, "epoch": 0.33064769279062467, "step": 790 }, { "loss": 1.511890697479248, "grad_norm": 0.1208115741610527, "learning_rate": 9.827576888368306e-07, "entropy": 1.4541470259428024, "num_tokens": 4643727.0, "mean_token_accuracy": 0.6536489054560661, "epoch": 0.33483310662341736, "step": 800 }, { "eval_loss": 1.4582873582839966, "eval_runtime": 43.7122, "eval_samples_per_second": 145.749, "eval_steps_per_second": 6.085, "eval_entropy": 1.4234498008749539, "eval_num_tokens": 4643727.0, "eval_mean_token_accuracy": 0.6611158966569972, "epoch": 0.33483310662341736, "step": 800 }, { "loss": 1.519627285003662, "grad_norm": 0.10558852553367615, "learning_rate": 9.821646847974998e-07, "entropy": 1.4889809876680373, "num_tokens": 4699602.0, "mean_token_accuracy": 0.6542887255549431, "epoch": 0.3390185204562101, "step": 810 }, { "loss": 1.4185623168945312, "grad_norm": 0.11619652807712555, "learning_rate": 9.815618400799228e-07, "entropy": 1.4101483166217803, "num_tokens": 4759712.0, "mean_token_accuracy": 0.6648697286844254, "epoch": 0.3432039342890028, "step": 820 }, { "loss": 1.4773643493652344, "grad_norm": 0.10418440401554108, "learning_rate": 9.809491669877815e-07, "entropy": 1.4431717425584794, "num_tokens": 4817721.0, "mean_token_accuracy": 0.6592238992452621, "epoch": 0.34738934812179556, "step": 830 }, { "loss": 1.4270614624023437, "grad_norm": 0.09047893434762955, "learning_rate": 9.803266780253487e-07, "entropy": 1.4172182738780976, "num_tokens": 4876476.0, "mean_token_accuracy": 0.6634449914097786, "epoch": 0.35157476195458826, "step": 840 }, { "loss": 1.4365344047546387, "grad_norm": 0.11413703858852386, "learning_rate": 9.796943858972328e-07, "entropy": 1.424839785695076, "num_tokens": 4935356.0, "mean_token_accuracy": 0.664018252491951, "epoch": 0.35576017578738095, "step": 850 }, { "loss": 1.4368658065795898, "grad_norm": 0.10197298973798752, "learning_rate": 9.790523035081194e-07, "entropy": 1.4327729046344757, "num_tokens": 4996023.0, "mean_token_accuracy": 0.667100901901722, "epoch": 0.3599455896201737, "step": 860 }, { "loss": 1.4285932540893556, "grad_norm": 0.09999420493841171, "learning_rate": 9.78400443962506e-07, "entropy": 1.4451387345790863, "num_tokens": 5054143.0, "mean_token_accuracy": 0.6665249273180962, "epoch": 0.3641310034529664, "step": 870 }, { "loss": 1.4537543296813964, "grad_norm": 0.12824219465255737, "learning_rate": 9.777388205644365e-07, "entropy": 1.4365610003471374, "num_tokens": 5109151.0, "mean_token_accuracy": 0.6605026423931122, "epoch": 0.36831641728575915, "step": 880 }, { "loss": 1.4052467346191406, "grad_norm": 0.10136168450117111, "learning_rate": 9.770674468172288e-07, "entropy": 1.4461679026484489, "num_tokens": 5169545.0, "mean_token_accuracy": 0.6698134854435921, "epoch": 0.37250183111855184, "step": 890 }, { "loss": 1.5341646194458007, "grad_norm": 0.125015527009964, "learning_rate": 9.763863364231995e-07, "entropy": 1.4948209792375564, "num_tokens": 5226362.0, "mean_token_accuracy": 0.6506395027041435, "epoch": 0.3766872449513446, "step": 900 }, { "loss": 1.451594066619873, "grad_norm": 0.12184764444828033, "learning_rate": 9.75695503283383e-07, "entropy": 1.454634991288185, "num_tokens": 5287471.0, "mean_token_accuracy": 0.6617233619093895, "epoch": 0.3808726587841373, "step": 910 }, { "loss": 1.3663444519042969, "grad_norm": 0.09586543589830399, "learning_rate": 9.749949614972505e-07, "entropy": 1.4007300227880477, "num_tokens": 5346427.0, "mean_token_accuracy": 0.6761364534497261, "epoch": 0.38505807261693, "step": 920 }, { "loss": 1.4433299064636231, "grad_norm": 0.09879063069820404, "learning_rate": 9.74284725362419e-07, "entropy": 1.44069661796093, "num_tokens": 5406471.0, "mean_token_accuracy": 0.6573658585548401, "epoch": 0.38924348644972273, "step": 930 }, { "loss": 1.3213248252868652, "grad_norm": 0.09394767135381699, "learning_rate": 9.735648093743621e-07, "entropy": 1.3663470640778541, "num_tokens": 5468090.0, "mean_token_accuracy": 0.6877701610326767, "epoch": 0.3934289002825154, "step": 940 }, { "loss": 1.3884021759033203, "grad_norm": 0.10035385936498642, "learning_rate": 9.728352282261124e-07, "entropy": 1.4055696964263915, "num_tokens": 5527409.0, "mean_token_accuracy": 0.6716061800718307, "epoch": 0.3976143141153082, "step": 950 }, { "loss": 1.4895167350769043, "grad_norm": 0.1361590176820755, "learning_rate": 9.72095996807963e-07, "entropy": 1.4704587817192079, "num_tokens": 5586447.0, "mean_token_accuracy": 0.6556992784142495, "epoch": 0.40179972794810087, "step": 960 }, { "loss": 1.422182846069336, "grad_norm": 0.12393207103013992, "learning_rate": 9.713471302071624e-07, "entropy": 1.4276411414146424, "num_tokens": 5644917.0, "mean_token_accuracy": 0.663788178563118, "epoch": 0.40598514178089357, "step": 970 }, { "loss": 1.4414152145385741, "grad_norm": 0.12177922576665878, "learning_rate": 9.705886437076078e-07, "entropy": 1.4314857304096222, "num_tokens": 5706907.0, "mean_token_accuracy": 0.6635714635252953, "epoch": 0.4101705556136863, "step": 980 }, { "loss": 1.4422160148620606, "grad_norm": 0.09565871953964233, "learning_rate": 9.698205527895317e-07, "entropy": 1.4681658923625946, "num_tokens": 5767067.0, "mean_token_accuracy": 0.6618433445692062, "epoch": 0.414355969446479, "step": 990 }, { "loss": 1.3973498344421387, "grad_norm": 0.11843396723270416, "learning_rate": 9.69042873129187e-07, "entropy": 1.4245391979813575, "num_tokens": 5826368.0, "mean_token_accuracy": 0.6698687911033631, "epoch": 0.41854138327927176, "step": 1000 }, { "eval_loss": 1.430882215499878, "eval_runtime": 42.5472, "eval_samples_per_second": 149.739, "eval_steps_per_second": 6.252, "eval_entropy": 1.4260875381025158, "eval_num_tokens": 5826368.0, "eval_mean_token_accuracy": 0.6650281033121553, "epoch": 0.41854138327927176, "step": 1000 }, { "loss": 1.454050064086914, "grad_norm": 0.11938533186912537, "learning_rate": 9.682556205985273e-07, "entropy": 1.447835522890091, "num_tokens": 5882058.0, "mean_token_accuracy": 0.6635008811950683, "epoch": 0.42272679711206446, "step": 1010 }, { "loss": 1.3930376052856446, "grad_norm": 0.1063380166888237, "learning_rate": 9.674588112648819e-07, "entropy": 1.4178766876459121, "num_tokens": 5938913.0, "mean_token_accuracy": 0.6699633210897445, "epoch": 0.42691221094485715, "step": 1020 }, { "loss": 1.4532501220703125, "grad_norm": 0.1084047332406044, "learning_rate": 9.666524613906283e-07, "entropy": 1.4572493433952332, "num_tokens": 5993465.0, "mean_token_accuracy": 0.6680980160832405, "epoch": 0.4310976247776499, "step": 1030 }, { "loss": 1.4012516021728516, "grad_norm": 0.10825818032026291, "learning_rate": 9.658365874328613e-07, "entropy": 1.434103360772133, "num_tokens": 6049913.0, "mean_token_accuracy": 0.670105955004692, "epoch": 0.4352830386104426, "step": 1040 }, { "loss": 1.4673041343688964, "grad_norm": 0.09340775012969971, "learning_rate": 9.650112060430556e-07, "entropy": 1.4505166023969651, "num_tokens": 6106055.0, "mean_token_accuracy": 0.6630285322666168, "epoch": 0.43946845244323535, "step": 1050 }, { "loss": 1.425284481048584, "grad_norm": 0.12269195914268494, "learning_rate": 9.641763340667264e-07, "entropy": 1.438645276427269, "num_tokens": 6165982.0, "mean_token_accuracy": 0.6631047874689102, "epoch": 0.44365386627602804, "step": 1060 }, { "loss": 1.4093000411987304, "grad_norm": 0.12008947134017944, "learning_rate": 9.633319885430863e-07, "entropy": 1.4247242331504821, "num_tokens": 6221254.0, "mean_token_accuracy": 0.668901015818119, "epoch": 0.44783928010882074, "step": 1070 }, { "loss": 1.4575057983398438, "grad_norm": 0.12600930035114288, "learning_rate": 9.62478186704697e-07, "entropy": 1.4588077813386917, "num_tokens": 6281193.0, "mean_token_accuracy": 0.659762179851532, "epoch": 0.4520246939416135, "step": 1080 }, { "loss": 1.3971601486206056, "grad_norm": 0.09669267386198044, "learning_rate": 9.616149459771174e-07, "entropy": 1.4378665208816528, "num_tokens": 6338625.0, "mean_token_accuracy": 0.6723957479000091, "epoch": 0.4562101077744062, "step": 1090 }, { "loss": 1.4475428581237793, "grad_norm": 0.10479287803173065, "learning_rate": 9.607422839785487e-07, "entropy": 1.4612567931413651, "num_tokens": 6398522.0, "mean_token_accuracy": 0.6564841374754906, "epoch": 0.46039552160719893, "step": 1100 }, { "loss": 1.4175043106079102, "grad_norm": 0.10358787328004837, "learning_rate": 9.598602185194733e-07, "entropy": 1.4467926740646362, "num_tokens": 6458089.0, "mean_token_accuracy": 0.669213418662548, "epoch": 0.4645809354399916, "step": 1110 }, { "loss": 1.4430898666381835, "grad_norm": 0.12753859162330627, "learning_rate": 9.589687676022933e-07, "entropy": 1.4614018350839615, "num_tokens": 6517387.0, "mean_token_accuracy": 0.663593128323555, "epoch": 0.4687663492727843, "step": 1120 }, { "loss": 1.4554133415222168, "grad_norm": 0.11801481246948242, "learning_rate": 9.580679494209621e-07, "entropy": 1.463664811849594, "num_tokens": 6574281.0, "mean_token_accuracy": 0.6585227012634277, "epoch": 0.47295176310557707, "step": 1130 }, { "loss": 1.4516281127929687, "grad_norm": 0.1230725646018982, "learning_rate": 9.57157782360612e-07, "entropy": 1.4588176727294921, "num_tokens": 6632526.0, "mean_token_accuracy": 0.6620682567358017, "epoch": 0.47713717693836977, "step": 1140 }, { "loss": 1.3834566116333007, "grad_norm": 0.10615360736846924, "learning_rate": 9.562382849971814e-07, "entropy": 1.4231864005327224, "num_tokens": 6686576.0, "mean_token_accuracy": 0.6769091472029686, "epoch": 0.4813225907711625, "step": 1150 }, { "loss": 1.3678070068359376, "grad_norm": 0.10580965131521225, "learning_rate": 9.553094760970338e-07, "entropy": 1.4144569963216782, "num_tokens": 6743418.0, "mean_token_accuracy": 0.6736478328704834, "epoch": 0.4855080046039552, "step": 1160 }, { "loss": 1.4649283409118652, "grad_norm": 0.11393830180168152, "learning_rate": 9.543713746165746e-07, "entropy": 1.461512914299965, "num_tokens": 6801169.0, "mean_token_accuracy": 0.6581070765852928, "epoch": 0.4896934184367479, "step": 1170 }, { "loss": 1.3680376052856444, "grad_norm": 0.19611844420433044, "learning_rate": 9.534239997018663e-07, "entropy": 1.4197842329740524, "num_tokens": 6858807.0, "mean_token_accuracy": 0.6744951158761978, "epoch": 0.49387883226954066, "step": 1180 }, { "loss": 1.4589731216430664, "grad_norm": 0.12470986694097519, "learning_rate": 9.52467370688235e-07, "entropy": 1.4711190968751908, "num_tokens": 6915842.0, "mean_token_accuracy": 0.6595605373382568, "epoch": 0.49806424610233335, "step": 1190 }, { "loss": 1.3511184692382812, "grad_norm": 0.1231166198849678, "learning_rate": 9.515015070998781e-07, "entropy": 1.3929312020540237, "num_tokens": 6973364.0, "mean_token_accuracy": 0.6785273075103759, "epoch": 0.502249659935126, "step": 1200 }, { "eval_loss": 1.4083536863327026, "eval_runtime": 43.0534, "eval_samples_per_second": 147.979, "eval_steps_per_second": 6.178, "eval_entropy": 1.401145983907513, "eval_num_tokens": 6973364.0, "eval_mean_token_accuracy": 0.6672393374873283, "epoch": 0.502249659935126, "step": 1200 }, { "loss": 1.4147989273071289, "grad_norm": 0.10981585085391998, "learning_rate": 9.505264286494644e-07, "entropy": 1.4393782436847686, "num_tokens": 7029183.0, "mean_token_accuracy": 0.6653257578611373, "epoch": 0.5064350737679189, "step": 1210 }, { "loss": 1.4123595237731934, "grad_norm": 0.12332361936569214, "learning_rate": 9.495421552377325e-07, "entropy": 1.4351352035999299, "num_tokens": 7089107.0, "mean_token_accuracy": 0.6679085582494736, "epoch": 0.5106204876007115, "step": 1220 }, { "loss": 1.35689115524292, "grad_norm": 0.10939253121614456, "learning_rate": 9.485487069530841e-07, "entropy": 1.384123608469963, "num_tokens": 7145731.0, "mean_token_accuracy": 0.6764253750443459, "epoch": 0.5148059014335042, "step": 1230 }, { "loss": 1.4721358299255372, "grad_norm": 0.1354241967201233, "learning_rate": 9.475461040711745e-07, "entropy": 1.4555100411176682, "num_tokens": 7201497.0, "mean_token_accuracy": 0.6551220327615738, "epoch": 0.5189913152662969, "step": 1240 }, { "loss": 1.406270408630371, "grad_norm": 0.11071319878101349, "learning_rate": 9.465343670544987e-07, "entropy": 1.446416699886322, "num_tokens": 7255249.0, "mean_token_accuracy": 0.6669346168637276, "epoch": 0.5231767290990896, "step": 1250 }, { "loss": 1.409125804901123, "grad_norm": 0.1242227554321289, "learning_rate": 9.455135165519734e-07, "entropy": 1.4336748003959656, "num_tokens": 7312069.0, "mean_token_accuracy": 0.6685505136847496, "epoch": 0.5273621429318824, "step": 1260 }, { "loss": 1.353925609588623, "grad_norm": 0.12051878869533539, "learning_rate": 9.444835733985157e-07, "entropy": 1.3861510157585144, "num_tokens": 7374935.0, "mean_token_accuracy": 0.6735975816845894, "epoch": 0.5315475567646751, "step": 1270 }, { "loss": 1.3926225662231446, "grad_norm": 0.1231522411108017, "learning_rate": 9.434445586146182e-07, "entropy": 1.431991320848465, "num_tokens": 7429456.0, "mean_token_accuracy": 0.6716481134295463, "epoch": 0.5357329705974678, "step": 1280 }, { "loss": 1.3677814483642579, "grad_norm": 0.10811372101306915, "learning_rate": 9.423964934059202e-07, "entropy": 1.4019683420658111, "num_tokens": 7487005.0, "mean_token_accuracy": 0.6747205436229706, "epoch": 0.5399183844302605, "step": 1290 }, { "loss": 1.3889549255371094, "grad_norm": 0.12505528330802917, "learning_rate": 9.413393991627736e-07, "entropy": 1.3941765069961547, "num_tokens": 7547594.0, "mean_token_accuracy": 0.6716236621141434, "epoch": 0.5441037982630532, "step": 1300 }, { "loss": 1.388343048095703, "grad_norm": 0.11002212017774582, "learning_rate": 9.40273297459808e-07, "entropy": 1.4113761156797409, "num_tokens": 7605828.0, "mean_token_accuracy": 0.6661069095134735, "epoch": 0.548289212095846, "step": 1310 }, { "loss": 1.3891004562377929, "grad_norm": 0.14147064089775085, "learning_rate": 9.391982100554889e-07, "entropy": 1.4317275822162627, "num_tokens": 7661455.0, "mean_token_accuracy": 0.6669554397463798, "epoch": 0.5524746259286387, "step": 1320 }, { "loss": 1.3904253959655761, "grad_norm": 0.13139671087265015, "learning_rate": 9.38114158891675e-07, "entropy": 1.4096351087093353, "num_tokens": 7719091.0, "mean_token_accuracy": 0.671739687025547, "epoch": 0.5566600397614314, "step": 1330 }, { "loss": 1.463707733154297, "grad_norm": 0.09927231818437576, "learning_rate": 9.370211660931693e-07, "entropy": 1.4864629238843918, "num_tokens": 7774511.0, "mean_token_accuracy": 0.660004960000515, "epoch": 0.5608454535942241, "step": 1340 }, { "loss": 1.3764376640319824, "grad_norm": 0.11545363068580627, "learning_rate": 9.35919253967268e-07, "entropy": 1.3998028621077538, "num_tokens": 7836251.0, "mean_token_accuracy": 0.6720214635133743, "epoch": 0.5650308674270168, "step": 1350 }, { "loss": 1.3152969360351563, "grad_norm": 0.1053733229637146, "learning_rate": 9.348084450033051e-07, "entropy": 1.3938700079917907, "num_tokens": 7893911.0, "mean_token_accuracy": 0.6841806307435035, "epoch": 0.5692162812598096, "step": 1360 }, { "loss": 1.422788143157959, "grad_norm": 0.09823399037122726, "learning_rate": 9.336887618721938e-07, "entropy": 1.445565864443779, "num_tokens": 7949863.0, "mean_token_accuracy": 0.6624092936515809, "epoch": 0.5734016950926023, "step": 1370 }, { "loss": 1.3210840225219727, "grad_norm": 0.1335407942533493, "learning_rate": 9.325602274259629e-07, "entropy": 1.3757253885269165, "num_tokens": 8008384.0, "mean_token_accuracy": 0.6824934765696525, "epoch": 0.577587108925395, "step": 1380 }, { "loss": 1.397932243347168, "grad_norm": 0.09968513995409012, "learning_rate": 9.314228646972919e-07, "entropy": 1.4251334190368652, "num_tokens": 8067031.0, "mean_token_accuracy": 0.666124664247036, "epoch": 0.5817725227581877, "step": 1390 }, { "loss": 1.312647533416748, "grad_norm": 0.12575951218605042, "learning_rate": 9.302766968990387e-07, "entropy": 1.355531930923462, "num_tokens": 8126287.0, "mean_token_accuracy": 0.6826214835047721, "epoch": 0.5859579365909804, "step": 1400 }, { "eval_loss": 1.386446237564087, "eval_runtime": 42.6243, "eval_samples_per_second": 149.469, "eval_steps_per_second": 6.241, "eval_entropy": 1.4034466080199508, "eval_num_tokens": 8126287.0, "eval_mean_token_accuracy": 0.6737931832335049, "epoch": 0.5859579365909804, "step": 1400 }, { "loss": 1.38052396774292, "grad_norm": 0.13619256019592285, "learning_rate": 9.291217474237685e-07, "entropy": 1.404805138707161, "num_tokens": 8184847.0, "mean_token_accuracy": 0.6700320944190026, "epoch": 0.5901433504237732, "step": 1410 }, { "loss": 1.4232772827148437, "grad_norm": 0.12265791743993759, "learning_rate": 9.27958039843274e-07, "entropy": 1.4586470276117325, "num_tokens": 8243143.0, "mean_token_accuracy": 0.6625824689865112, "epoch": 0.5943287642565659, "step": 1420 }, { "loss": 1.3759157180786132, "grad_norm": 0.12311021983623505, "learning_rate": 9.267855979080959e-07, "entropy": 1.4208383083343505, "num_tokens": 8301096.0, "mean_token_accuracy": 0.6705714225769043, "epoch": 0.5985141780893586, "step": 1430 }, { "loss": 1.4408933639526367, "grad_norm": 0.10979989171028137, "learning_rate": 9.256044455470372e-07, "entropy": 1.4562449276447296, "num_tokens": 8357561.0, "mean_token_accuracy": 0.6647118896245956, "epoch": 0.6026995919221513, "step": 1440 }, { "loss": 1.4200193405151367, "grad_norm": 0.10581167787313461, "learning_rate": 9.244146068666756e-07, "entropy": 1.4489133656024933, "num_tokens": 8411021.0, "mean_token_accuracy": 0.6702521324157715, "epoch": 0.606885005754944, "step": 1450 }, { "loss": 1.3639183044433594, "grad_norm": 0.12785717844963074, "learning_rate": 9.232161061508707e-07, "entropy": 1.3970074653625488, "num_tokens": 8473715.0, "mean_token_accuracy": 0.6738650560379028, "epoch": 0.6110704195877368, "step": 1460 }, { "loss": 1.3117795944213868, "grad_norm": 0.11914683878421783, "learning_rate": 9.220089678602692e-07, "entropy": 1.3731692731380463, "num_tokens": 8536821.0, "mean_token_accuracy": 0.6784457266330719, "epoch": 0.6152558334205295, "step": 1470 }, { "loss": 1.3580459594726562, "grad_norm": 0.10762108862400055, "learning_rate": 9.20793216631805e-07, "entropy": 1.3978804230690003, "num_tokens": 8596217.0, "mean_token_accuracy": 0.6741666734218598, "epoch": 0.6194412472533222, "step": 1480 }, { "loss": 1.3858207702636718, "grad_norm": 0.13189709186553955, "learning_rate": 9.195688772781969e-07, "entropy": 1.4172445833683014, "num_tokens": 8649547.0, "mean_token_accuracy": 0.6702063709497452, "epoch": 0.6236266610861149, "step": 1490 }, { "loss": 1.3870158195495605, "grad_norm": 0.13120818138122559, "learning_rate": 9.183359747874416e-07, "entropy": 1.424094271659851, "num_tokens": 8704916.0, "mean_token_accuracy": 0.669642123579979, "epoch": 0.6278120749189076, "step": 1500 }, { "loss": 1.4398550033569335, "grad_norm": 0.12010879069566727, "learning_rate": 9.170945343223045e-07, "entropy": 1.4305728733539582, "num_tokens": 8760259.0, "mean_token_accuracy": 0.6612218707799912, "epoch": 0.6319974887517004, "step": 1510 }, { "loss": 1.3878154754638672, "grad_norm": 0.1339423507452011, "learning_rate": 9.15844581219805e-07, "entropy": 1.3878618061542511, "num_tokens": 8816700.0, "mean_token_accuracy": 0.6718688145279884, "epoch": 0.6361829025844931, "step": 1520 }, { "loss": 1.3522814750671386, "grad_norm": 0.13170458376407623, "learning_rate": 9.145861409907009e-07, "entropy": 1.3895842641592027, "num_tokens": 8876509.0, "mean_token_accuracy": 0.6753421723842621, "epoch": 0.6403683164172858, "step": 1530 }, { "loss": 1.3812095642089843, "grad_norm": 0.1139625683426857, "learning_rate": 9.133192393189664e-07, "entropy": 1.4209527760744094, "num_tokens": 8936438.0, "mean_token_accuracy": 0.6720142468810082, "epoch": 0.6445537302500784, "step": 1540 }, { "loss": 1.4154645919799804, "grad_norm": 0.13268420100212097, "learning_rate": 9.120439020612685e-07, "entropy": 1.424301978945732, "num_tokens": 8994731.0, "mean_token_accuracy": 0.6668044954538346, "epoch": 0.6487391440828711, "step": 1550 }, { "loss": 1.3785716056823731, "grad_norm": 0.11167196929454803, "learning_rate": 9.107601552464393e-07, "entropy": 1.3881200447678566, "num_tokens": 9052527.0, "mean_token_accuracy": 0.6731992438435555, "epoch": 0.652924557915664, "step": 1560 }, { "loss": 1.3963075637817384, "grad_norm": 0.1282496154308319, "learning_rate": 9.094680250749447e-07, "entropy": 1.408314546942711, "num_tokens": 9111578.0, "mean_token_accuracy": 0.6680608317255974, "epoch": 0.6571099717484566, "step": 1570 }, { "loss": 1.3251177787780761, "grad_norm": 0.12457749992609024, "learning_rate": 9.081675379183494e-07, "entropy": 1.3645547151565551, "num_tokens": 9171878.0, "mean_token_accuracy": 0.6805019825696945, "epoch": 0.6612953855812493, "step": 1580 }, { "loss": 1.3337480545043945, "grad_norm": 0.10987865179777145, "learning_rate": 9.068587203187794e-07, "entropy": 1.3783577740192414, "num_tokens": 9231431.0, "mean_token_accuracy": 0.6761843442916871, "epoch": 0.665480799414042, "step": 1590 }, { "loss": 1.3129050254821777, "grad_norm": 0.11137118935585022, "learning_rate": 9.055415989883792e-07, "entropy": 1.3690737694501878, "num_tokens": 9287759.0, "mean_token_accuracy": 0.6817014619708062, "epoch": 0.6696662132468347, "step": 1600 }, { "eval_loss": 1.3662420511245728, "eval_runtime": 43.7555, "eval_samples_per_second": 145.605, "eval_steps_per_second": 6.079, "eval_entropy": 1.3844989090037525, "eval_num_tokens": 9287759.0, "eval_mean_token_accuracy": 0.6762344077565616, "epoch": 0.6696662132468347, "step": 1600 }, { "loss": 1.3789652824401855, "grad_norm": 0.11303029209375381, "learning_rate": 9.042162008087678e-07, "entropy": 1.388508751988411, "num_tokens": 9347815.0, "mean_token_accuracy": 0.671443772315979, "epoch": 0.6738516270796275, "step": 1610 }, { "loss": 1.3409759521484375, "grad_norm": 0.12162081152200699, "learning_rate": 9.028825528304891e-07, "entropy": 1.3988509953022004, "num_tokens": 9404534.0, "mean_token_accuracy": 0.6778050258755683, "epoch": 0.6780370409124202, "step": 1620 }, { "loss": 1.286928367614746, "grad_norm": 0.1191353127360344, "learning_rate": 9.015406822724603e-07, "entropy": 1.3400784492492677, "num_tokens": 9465006.0, "mean_token_accuracy": 0.6883344247937202, "epoch": 0.6822224547452129, "step": 1630 }, { "loss": 1.3931745529174804, "grad_norm": 0.09988338500261307, "learning_rate": 9.001906165214163e-07, "entropy": 1.4158646211028099, "num_tokens": 9523244.0, "mean_token_accuracy": 0.6664687514305114, "epoch": 0.6864078685780056, "step": 1640 }, { "loss": 1.3149008750915527, "grad_norm": 0.1224365308880806, "learning_rate": 8.988323831313509e-07, "entropy": 1.3621025055646896, "num_tokens": 9583571.0, "mean_token_accuracy": 0.6805920660495758, "epoch": 0.6905932824107983, "step": 1650 }, { "loss": 1.3128664016723632, "grad_norm": 0.10845732688903809, "learning_rate": 8.974660098229538e-07, "entropy": 1.366037741303444, "num_tokens": 9640353.0, "mean_token_accuracy": 0.6822919920086861, "epoch": 0.6947786962435911, "step": 1660 }, { "loss": 1.3836250305175781, "grad_norm": 0.12312953174114227, "learning_rate": 8.960915244830462e-07, "entropy": 1.4012254863977431, "num_tokens": 9701108.0, "mean_token_accuracy": 0.6682980388402939, "epoch": 0.6989641100763838, "step": 1670 }, { "loss": 1.298573875427246, "grad_norm": 0.10932071506977081, "learning_rate": 8.947089551640099e-07, "entropy": 1.351333498954773, "num_tokens": 9758477.0, "mean_token_accuracy": 0.6857402086257934, "epoch": 0.7031495239091765, "step": 1680 }, { "loss": 1.3268583297729493, "grad_norm": 0.1166784018278122, "learning_rate": 8.933183300832159e-07, "entropy": 1.3652890086174012, "num_tokens": 9816530.0, "mean_token_accuracy": 0.6774859979748726, "epoch": 0.7073349377419692, "step": 1690 }, { "loss": 1.37611722946167, "grad_norm": 0.1278134286403656, "learning_rate": 8.919196776224483e-07, "entropy": 1.399143072962761, "num_tokens": 9872452.0, "mean_token_accuracy": 0.6704028770327568, "epoch": 0.7115203515747619, "step": 1700 }, { "loss": 1.3107229232788087, "grad_norm": 0.12152674794197083, "learning_rate": 8.905130263273252e-07, "entropy": 1.3753829419612884, "num_tokens": 9934101.0, "mean_token_accuracy": 0.68070268034935, "epoch": 0.7157057654075547, "step": 1710 }, { "loss": 1.3585830688476563, "grad_norm": 0.12099979817867279, "learning_rate": 8.890984049067154e-07, "entropy": 1.3618301630020142, "num_tokens": 9993614.0, "mean_token_accuracy": 0.6762332633137703, "epoch": 0.7198911792403474, "step": 1720 }, { "loss": 1.302845287322998, "grad_norm": 0.11998716741800308, "learning_rate": 8.876758422321534e-07, "entropy": 1.356363880634308, "num_tokens": 10047945.0, "mean_token_accuracy": 0.6853278845548629, "epoch": 0.7240765930731401, "step": 1730 }, { "loss": 1.3057265281677246, "grad_norm": 0.11447525024414062, "learning_rate": 8.862453673372495e-07, "entropy": 1.3511420711874962, "num_tokens": 10105849.0, "mean_token_accuracy": 0.6814648106694221, "epoch": 0.7282620069059328, "step": 1740 }, { "loss": 1.379593563079834, "grad_norm": 0.13615551590919495, "learning_rate": 8.848070094170972e-07, "entropy": 1.4266703605651856, "num_tokens": 10160689.0, "mean_token_accuracy": 0.6730331972241401, "epoch": 0.7324474207387256, "step": 1750 }, { "loss": 1.3482324600219726, "grad_norm": 0.1049669086933136, "learning_rate": 8.833607978276782e-07, "entropy": 1.365234938263893, "num_tokens": 10219317.0, "mean_token_accuracy": 0.6763183102011681, "epoch": 0.7366328345715183, "step": 1760 }, { "loss": 1.308854579925537, "grad_norm": 0.11895614117383957, "learning_rate": 8.819067620852621e-07, "entropy": 1.3593208014965057, "num_tokens": 10281133.0, "mean_token_accuracy": 0.6821026623249054, "epoch": 0.740818248404311, "step": 1770 }, { "loss": 1.3750693321228027, "grad_norm": 0.13367140293121338, "learning_rate": 8.804449318658047e-07, "entropy": 1.391082948446274, "num_tokens": 10338588.0, "mean_token_accuracy": 0.6708121821284294, "epoch": 0.7450036622371037, "step": 1780 }, { "loss": 1.3176989555358887, "grad_norm": 0.10955236107110977, "learning_rate": 8.789753370043425e-07, "entropy": 1.373031947016716, "num_tokens": 10398744.0, "mean_token_accuracy": 0.6810923710465431, "epoch": 0.7491890760698964, "step": 1790 }, { "loss": 1.3639984130859375, "grad_norm": 0.12343617528676987, "learning_rate": 8.77498007494383e-07, "entropy": 1.4000030606985092, "num_tokens": 10458537.0, "mean_token_accuracy": 0.6697928130626678, "epoch": 0.7533744899026892, "step": 1800 }, { "eval_loss": 1.350634217262268, "eval_runtime": 42.5241, "eval_samples_per_second": 149.821, "eval_steps_per_second": 6.255, "eval_entropy": 1.3988571140102875, "eval_num_tokens": 10458537.0, "eval_mean_token_accuracy": 0.6771848898633082, "epoch": 0.7533744899026892, "step": 1800 }, { "loss": 1.3396940231323242, "grad_norm": 0.14190584421157837, "learning_rate": 8.760129734872932e-07, "entropy": 1.3851164013147355, "num_tokens": 10516646.0, "mean_token_accuracy": 0.6750243782997132, "epoch": 0.7575599037354819, "step": 1810 }, { "loss": 1.3640681266784669, "grad_norm": 0.11394577473402023, "learning_rate": 8.745202652916841e-07, "entropy": 1.400177638232708, "num_tokens": 10576044.0, "mean_token_accuracy": 0.6688720732927322, "epoch": 0.7617453175682746, "step": 1820 }, { "loss": 1.4145827293395996, "grad_norm": 0.1021205335855484, "learning_rate": 8.73019913372792e-07, "entropy": 1.4293284267187119, "num_tokens": 10635490.0, "mean_token_accuracy": 0.664357790350914, "epoch": 0.7659307314010673, "step": 1830 }, { "loss": 1.3291969299316406, "grad_norm": 0.104949451982975, "learning_rate": 8.715119483518568e-07, "entropy": 1.392353293299675, "num_tokens": 10696235.0, "mean_token_accuracy": 0.6753359526395798, "epoch": 0.77011614523386, "step": 1840 }, { "loss": 1.3674373626708984, "grad_norm": 0.13051320612430573, "learning_rate": 8.699964010054972e-07, "entropy": 1.3989370226860047, "num_tokens": 10756113.0, "mean_token_accuracy": 0.6702560499310494, "epoch": 0.7743015590666528, "step": 1850 }, { "loss": 1.3329706192016602, "grad_norm": 0.11483673751354218, "learning_rate": 8.684733022650819e-07, "entropy": 1.368683397769928, "num_tokens": 10811097.0, "mean_token_accuracy": 0.6795622929930687, "epoch": 0.7784869728994455, "step": 1860 }, { "loss": 1.34647216796875, "grad_norm": 0.12257901579141617, "learning_rate": 8.669426832160995e-07, "entropy": 1.3777292981743812, "num_tokens": 10869645.0, "mean_token_accuracy": 0.6771846890449524, "epoch": 0.7826723867322382, "step": 1870 }, { "loss": 1.2678668022155761, "grad_norm": 0.10710500180721283, "learning_rate": 8.65404575097523e-07, "entropy": 1.319590486586094, "num_tokens": 10929506.0, "mean_token_accuracy": 0.6875983402132988, "epoch": 0.7868578005650309, "step": 1880 }, { "loss": 1.364974021911621, "grad_norm": 0.11756409704685211, "learning_rate": 8.638590093011722e-07, "entropy": 1.400401759147644, "num_tokens": 10984931.0, "mean_token_accuracy": 0.67054513245821, "epoch": 0.7910432143978235, "step": 1890 }, { "loss": 1.3333361625671387, "grad_norm": 0.13867364823818207, "learning_rate": 8.623060173710743e-07, "entropy": 1.369761797785759, "num_tokens": 11040065.0, "mean_token_accuracy": 0.6754815384745598, "epoch": 0.7952286282306164, "step": 1900 }, { "loss": 1.2708181381225585, "grad_norm": 0.10772886127233505, "learning_rate": 8.607456310028185e-07, "entropy": 1.3362341210246087, "num_tokens": 11101320.0, "mean_token_accuracy": 0.6911322221159935, "epoch": 0.799414042063409, "step": 1910 }, { "loss": 1.3482179641723633, "grad_norm": 0.13811437785625458, "learning_rate": 8.591778820429104e-07, "entropy": 1.3786241382360458, "num_tokens": 11159403.0, "mean_token_accuracy": 0.676637114584446, "epoch": 0.8035994558962017, "step": 1920 }, { "loss": 1.3001495361328126, "grad_norm": 0.11261286586523056, "learning_rate": 8.576028024881208e-07, "entropy": 1.342548942565918, "num_tokens": 11215300.0, "mean_token_accuracy": 0.6838564172387123, "epoch": 0.8077848697289944, "step": 1930 }, { "loss": 1.37919921875, "grad_norm": 0.11729196459054947, "learning_rate": 8.560204244848339e-07, "entropy": 1.399843516945839, "num_tokens": 11274016.0, "mean_token_accuracy": 0.6692644655704498, "epoch": 0.8119702835617871, "step": 1940 }, { "loss": 1.3153133392333984, "grad_norm": 0.11101414263248444, "learning_rate": 8.544307803283903e-07, "entropy": 1.3550761044025421, "num_tokens": 11331840.0, "mean_token_accuracy": 0.6830023691058159, "epoch": 0.8161556973945799, "step": 1950 }, { "loss": 1.3670063018798828, "grad_norm": 0.10791585594415665, "learning_rate": 8.528339024624287e-07, "entropy": 1.3926001816987992, "num_tokens": 11388250.0, "mean_token_accuracy": 0.6758360341191292, "epoch": 0.8203411112273726, "step": 1960 }, { "loss": 1.322612190246582, "grad_norm": 0.12179048359394073, "learning_rate": 8.512298234782227e-07, "entropy": 1.3523173958063126, "num_tokens": 11444623.0, "mean_token_accuracy": 0.6819486439228057, "epoch": 0.8245265250601653, "step": 1970 }, { "loss": 1.4050199508666992, "grad_norm": 0.1269518882036209, "learning_rate": 8.496185761140165e-07, "entropy": 1.4183282285928727, "num_tokens": 11501456.0, "mean_token_accuracy": 0.6670055955648422, "epoch": 0.828711938892958, "step": 1980 }, { "loss": 1.3573097229003905, "grad_norm": 0.09794802963733673, "learning_rate": 8.480001932543561e-07, "entropy": 1.3888707369565965, "num_tokens": 11562134.0, "mean_token_accuracy": 0.6723511442542076, "epoch": 0.8328973527257507, "step": 1990 }, { "loss": 1.3131244659423829, "grad_norm": 0.12277819216251373, "learning_rate": 8.463747079294192e-07, "entropy": 1.3465208828449249, "num_tokens": 11618831.0, "mean_token_accuracy": 0.6795975625514984, "epoch": 0.8370827665585435, "step": 2000 }, { "eval_loss": 1.3373528718948364, "eval_runtime": 42.4003, "eval_samples_per_second": 150.258, "eval_steps_per_second": 6.274, "eval_entropy": 1.3585354107663148, "eval_num_tokens": 11618831.0, "eval_mean_token_accuracy": 0.678231207947982, "epoch": 0.8370827665585435, "step": 2000 }, { "loss": 1.3982874870300293, "grad_norm": 0.13690534234046936, "learning_rate": 8.447421533143396e-07, "entropy": 1.4036804780364036, "num_tokens": 11676394.0, "mean_token_accuracy": 0.6648698434233665, "epoch": 0.8412681803913362, "step": 2010 }, { "loss": 1.321161937713623, "grad_norm": 0.16348762810230255, "learning_rate": 8.431025627285313e-07, "entropy": 1.349110186100006, "num_tokens": 11730143.0, "mean_token_accuracy": 0.6850418791174888, "epoch": 0.8454535942241289, "step": 2020 }, { "loss": 1.337346076965332, "grad_norm": 0.12358900159597397, "learning_rate": 8.414559696350078e-07, "entropy": 1.3770191550254822, "num_tokens": 11786616.0, "mean_token_accuracy": 0.6773856431245804, "epoch": 0.8496390080569216, "step": 2030 }, { "loss": 1.341224193572998, "grad_norm": 0.11463375389575958, "learning_rate": 8.398024076396996e-07, "entropy": 1.345393455028534, "num_tokens": 11845477.0, "mean_token_accuracy": 0.6754840731620788, "epoch": 0.8538244218897143, "step": 2040 }, { "loss": 1.3237956047058106, "grad_norm": 0.12505337595939636, "learning_rate": 8.381419104907681e-07, "entropy": 1.3643497437238694, "num_tokens": 11901746.0, "mean_token_accuracy": 0.6791232407093049, "epoch": 0.8580098357225071, "step": 2050 }, { "loss": 1.3346891403198242, "grad_norm": 0.15036678314208984, "learning_rate": 8.364745120779164e-07, "entropy": 1.3704555958509446, "num_tokens": 11959605.0, "mean_token_accuracy": 0.6759614482522011, "epoch": 0.8621952495552998, "step": 2060 }, { "loss": 1.4080591201782227, "grad_norm": 0.14488154649734497, "learning_rate": 8.348002464316987e-07, "entropy": 1.4137721806764603, "num_tokens": 12018839.0, "mean_token_accuracy": 0.6624691441655159, "epoch": 0.8663806633880925, "step": 2070 }, { "loss": 1.3576594352722169, "grad_norm": 0.1306961327791214, "learning_rate": 8.331191477228246e-07, "entropy": 1.4100464552640914, "num_tokens": 12077962.0, "mean_token_accuracy": 0.6744375959038734, "epoch": 0.8705660772208852, "step": 2080 }, { "loss": 1.3189333915710448, "grad_norm": 0.09990637004375458, "learning_rate": 8.314312502614625e-07, "entropy": 1.3474989101290702, "num_tokens": 12137755.0, "mean_token_accuracy": 0.6803866818547248, "epoch": 0.8747514910536779, "step": 2090 }, { "loss": 1.350827980041504, "grad_norm": 0.1305275708436966, "learning_rate": 8.29736588496539e-07, "entropy": 1.384324887394905, "num_tokens": 12194836.0, "mean_token_accuracy": 0.6731877833604812, "epoch": 0.8789369048864707, "step": 2100 }, { "loss": 1.3458109855651856, "grad_norm": 0.12695269286632538, "learning_rate": 8.280351970150358e-07, "entropy": 1.3462085962295531, "num_tokens": 12254568.0, "mean_token_accuracy": 0.6745196804404259, "epoch": 0.8831223187192634, "step": 2110 }, { "loss": 1.3157236099243164, "grad_norm": 0.12223149091005325, "learning_rate": 8.263271105412843e-07, "entropy": 1.345698779821396, "num_tokens": 12313266.0, "mean_token_accuracy": 0.6800820276141166, "epoch": 0.8873077325520561, "step": 2120 }, { "loss": 1.3625286102294922, "grad_norm": 0.12075755000114441, "learning_rate": 8.246123639362557e-07, "entropy": 1.3751042202115058, "num_tokens": 12368266.0, "mean_token_accuracy": 0.6779290676116944, "epoch": 0.8914931463848488, "step": 2130 }, { "loss": 1.3247100830078125, "grad_norm": 0.13140852749347687, "learning_rate": 8.22890992196851e-07, "entropy": 1.3399439036846161, "num_tokens": 12427195.0, "mean_token_accuracy": 0.6778766274452209, "epoch": 0.8956785602176415, "step": 2140 }, { "loss": 1.3223968505859376, "grad_norm": 0.11262480914592743, "learning_rate": 8.211630304551856e-07, "entropy": 1.3523710697889328, "num_tokens": 12481690.0, "mean_token_accuracy": 0.6801952719688416, "epoch": 0.8998639740504343, "step": 2150 }, { "loss": 1.2305709838867187, "grad_norm": 0.1140614002943039, "learning_rate": 8.194285139778727e-07, "entropy": 1.2833492413163186, "num_tokens": 12544082.0, "mean_token_accuracy": 0.696322962641716, "epoch": 0.904049387883227, "step": 2160 }, { "loss": 1.2584315299987794, "grad_norm": 0.1213318482041359, "learning_rate": 8.176874781653042e-07, "entropy": 1.2917151510715486, "num_tokens": 12605884.0, "mean_token_accuracy": 0.6918731480836868, "epoch": 0.9082348017160197, "step": 2170 }, { "loss": 1.4043787956237792, "grad_norm": 0.11265023797750473, "learning_rate": 8.159399585509271e-07, "entropy": 1.4147561937570572, "num_tokens": 12662340.0, "mean_token_accuracy": 0.6659792140126228, "epoch": 0.9124202155488124, "step": 2180 }, { "loss": 1.2520899772644043, "grad_norm": 0.12448090314865112, "learning_rate": 8.14185990800518e-07, "entropy": 1.278790497779846, "num_tokens": 12718634.0, "mean_token_accuracy": 0.6945044815540313, "epoch": 0.9166056293816051, "step": 2190 }, { "loss": 1.3444849967956543, "grad_norm": 0.12102659791707993, "learning_rate": 8.124256107114569e-07, "entropy": 1.3645626872777938, "num_tokens": 12774125.0, "mean_token_accuracy": 0.6725556075572967, "epoch": 0.9207910432143979, "step": 2200 }, { "eval_loss": 1.3292649984359741, "eval_runtime": 42.3319, "eval_samples_per_second": 150.501, "eval_steps_per_second": 6.284, "eval_entropy": 1.3460397827894168, "eval_num_tokens": 12774125.0, "eval_mean_token_accuracy": 0.6789818390419609, "epoch": 0.9207910432143979, "step": 2200 }, { "loss": 1.3911532402038573, "grad_norm": 0.1305384337902069, "learning_rate": 8.106588542119957e-07, "entropy": 1.3969669669866562, "num_tokens": 12832025.0, "mean_token_accuracy": 0.6678112506866455, "epoch": 0.9249764570471906, "step": 2210 }, { "loss": 1.3338760375976562, "grad_norm": 0.1187131404876709, "learning_rate": 8.088857573605237e-07, "entropy": 1.375227126479149, "num_tokens": 12888734.0, "mean_token_accuracy": 0.6802457317709922, "epoch": 0.9291618708799833, "step": 2220 }, { "loss": 1.2834582328796387, "grad_norm": 0.14116276800632477, "learning_rate": 8.071063563448339e-07, "entropy": 1.3024362832307816, "num_tokens": 12943886.0, "mean_token_accuracy": 0.6881816878914833, "epoch": 0.933347284712776, "step": 2230 }, { "loss": 1.312158203125, "grad_norm": 0.12964707612991333, "learning_rate": 8.053206874813829e-07, "entropy": 1.364695656299591, "num_tokens": 13000723.0, "mean_token_accuracy": 0.6795030117034913, "epoch": 0.9375326985455686, "step": 2240 }, { "loss": 1.3068957328796387, "grad_norm": 0.12915638089179993, "learning_rate": 8.035287872145502e-07, "entropy": 1.3586914032697677, "num_tokens": 13059283.0, "mean_token_accuracy": 0.6813771218061447, "epoch": 0.9417181123783614, "step": 2250 }, { "loss": 1.3330992698669433, "grad_norm": 0.10278042405843735, "learning_rate": 8.017306921158942e-07, "entropy": 1.3742854058742524, "num_tokens": 13118033.0, "mean_token_accuracy": 0.6798395842313767, "epoch": 0.9459035262111541, "step": 2260 }, { "loss": 1.4351073265075684, "grad_norm": 0.14834155142307281, "learning_rate": 7.99926438883406e-07, "entropy": 1.409215834736824, "num_tokens": 13173943.0, "mean_token_accuracy": 0.6597715452313423, "epoch": 0.9500889400439468, "step": 2270 }, { "loss": 1.2880861282348632, "grad_norm": 0.10686289519071579, "learning_rate": 7.981160643407603e-07, "entropy": 1.3126128152012826, "num_tokens": 13233131.0, "mean_token_accuracy": 0.6831002920866013, "epoch": 0.9542743538767395, "step": 2280 }, { "loss": 1.37518310546875, "grad_norm": 0.1137382760643959, "learning_rate": 7.962996054365642e-07, "entropy": 1.3762210130691528, "num_tokens": 13289669.0, "mean_token_accuracy": 0.6739885672926903, "epoch": 0.9584597677095322, "step": 2290 }, { "loss": 1.2979955673217773, "grad_norm": 0.12840472161769867, "learning_rate": 7.944770992436026e-07, "entropy": 1.3360363632440566, "num_tokens": 13347131.0, "mean_token_accuracy": 0.6823042362928391, "epoch": 0.962645181542325, "step": 2300 }, { "loss": 1.3241994857788086, "grad_norm": 0.11081521958112717, "learning_rate": 7.926485829580814e-07, "entropy": 1.3549024030566215, "num_tokens": 13405191.0, "mean_token_accuracy": 0.6771049797534943, "epoch": 0.9668305953751177, "step": 2310 }, { "loss": 1.3815485000610352, "grad_norm": 0.11393143981695175, "learning_rate": 7.908140938988692e-07, "entropy": 1.3816259652376175, "num_tokens": 13463913.0, "mean_token_accuracy": 0.6688653215765953, "epoch": 0.9710160092079104, "step": 2320 }, { "loss": 1.298836898803711, "grad_norm": 0.12833324074745178, "learning_rate": 7.889736695067348e-07, "entropy": 1.3122636392712592, "num_tokens": 13523958.0, "mean_token_accuracy": 0.6812730312347413, "epoch": 0.9752014230407031, "step": 2330 }, { "loss": 1.3618934631347657, "grad_norm": 0.13094215095043182, "learning_rate": 7.87127347343584e-07, "entropy": 1.3717454001307487, "num_tokens": 13581681.0, "mean_token_accuracy": 0.6716930896043778, "epoch": 0.9793868368734958, "step": 2340 }, { "loss": 1.3211997032165528, "grad_norm": 0.14429600536823273, "learning_rate": 7.852751650916917e-07, "entropy": 1.3575677514076232, "num_tokens": 13641102.0, "mean_token_accuracy": 0.6812080055475235, "epoch": 0.9835722507062886, "step": 2350 }, { "loss": 1.3058280944824219, "grad_norm": 0.13168948888778687, "learning_rate": 7.83417160552934e-07, "entropy": 1.3377871721982957, "num_tokens": 13697001.0, "mean_token_accuracy": 0.6837321490049362, "epoch": 0.9877576645390813, "step": 2360 }, { "loss": 1.320173168182373, "grad_norm": 0.13248135149478912, "learning_rate": 7.815533716480158e-07, "entropy": 1.3715132981538773, "num_tokens": 13754970.0, "mean_token_accuracy": 0.6818105265498161, "epoch": 0.991943078371874, "step": 2370 }, { "loss": 1.3184805870056153, "grad_norm": 0.1117711067199707, "learning_rate": 7.796838364156977e-07, "entropy": 1.3519122838973998, "num_tokens": 13814161.0, "mean_token_accuracy": 0.6789533212780953, "epoch": 0.9961284922046667, "step": 2380 }, { "loss": 1.381266212463379, "grad_norm": 0.3463696539402008, "learning_rate": 7.778085930120191e-07, "entropy": 1.3519603207304671, "num_tokens": 13865252.0, "mean_token_accuracy": 0.6710431801306235, "epoch": 1.0, "step": 2390 }, { "loss": 1.3650718688964845, "grad_norm": 0.10732991993427277, "learning_rate": 7.759276797095196e-07, "entropy": 1.3758342564105988, "num_tokens": 13925700.0, "mean_token_accuracy": 0.6686381295323371, "epoch": 1.0041854138327928, "step": 2400 }, { "eval_loss": 1.324312686920166, "eval_runtime": 42.9132, "eval_samples_per_second": 148.463, "eval_steps_per_second": 6.199, "eval_entropy": 1.3503220336777824, "eval_num_tokens": 13925700.0, "eval_mean_token_accuracy": 0.6799099082337287, "epoch": 1.0041854138327928, "step": 2400 }, { "loss": 1.275872802734375, "grad_norm": 0.10125313699245453, "learning_rate": 7.740411348964576e-07, "entropy": 1.3205101490020752, "num_tokens": 13983023.0, "mean_token_accuracy": 0.6841968685388565, "epoch": 1.0083708276655854, "step": 2410 }, { "loss": 1.3197596549987793, "grad_norm": 0.12487287819385529, "learning_rate": 7.721489970760275e-07, "entropy": 1.3373865127563476, "num_tokens": 14044602.0, "mean_token_accuracy": 0.677445650100708, "epoch": 1.0125562414983782, "step": 2420 }, { "loss": 1.2989977836608886, "grad_norm": 0.1548726111650467, "learning_rate": 7.702513048655733e-07, "entropy": 1.3116925165057183, "num_tokens": 14104408.0, "mean_token_accuracy": 0.6814423218369484, "epoch": 1.0167416553311708, "step": 2430 }, { "loss": 1.2706897735595704, "grad_norm": 0.11503283679485321, "learning_rate": 7.683480969958003e-07, "entropy": 1.310747703909874, "num_tokens": 14162736.0, "mean_token_accuracy": 0.6892054408788681, "epoch": 1.0209270691639636, "step": 2440 }, { "loss": 1.2928138732910157, "grad_norm": 0.12240534275770187, "learning_rate": 7.664394123099853e-07, "entropy": 1.3191738039255143, "num_tokens": 14221626.0, "mean_token_accuracy": 0.6869289621710777, "epoch": 1.0251124829967564, "step": 2450 }, { "loss": 1.293262767791748, "grad_norm": 0.11519357562065125, "learning_rate": 7.64525289763184e-07, "entropy": 1.320760977268219, "num_tokens": 14280841.0, "mean_token_accuracy": 0.6857645198702812, "epoch": 1.029297896829549, "step": 2460 }, { "loss": 1.3051738739013672, "grad_norm": 0.11736012250185013, "learning_rate": 7.626057684214341e-07, "entropy": 1.316636176407337, "num_tokens": 14338816.0, "mean_token_accuracy": 0.6803102239966392, "epoch": 1.0334833106623418, "step": 2470 }, { "loss": 1.3561962127685547, "grad_norm": 0.13388119637966156, "learning_rate": 7.606808874609605e-07, "entropy": 1.36598659157753, "num_tokens": 14395539.0, "mean_token_accuracy": 0.6725652754306793, "epoch": 1.0376687244951344, "step": 2480 }, { "loss": 1.3418392181396483, "grad_norm": 0.12838061153888702, "learning_rate": 7.587506861673737e-07, "entropy": 1.3244032382965087, "num_tokens": 14451789.0, "mean_token_accuracy": 0.6773718982934952, "epoch": 1.0418541383279272, "step": 2490 }, { "loss": 1.2931674003601075, "grad_norm": 0.12182667851448059, "learning_rate": 7.568152039348695e-07, "entropy": 1.3194489538669587, "num_tokens": 14510441.0, "mean_token_accuracy": 0.6842545494437218, "epoch": 1.04603955216072, "step": 2500 }, { "loss": 1.2959155082702636, "grad_norm": 0.11124531924724579, "learning_rate": 7.548744802654241e-07, "entropy": 1.3410497322678565, "num_tokens": 14571458.0, "mean_token_accuracy": 0.6812979131937027, "epoch": 1.0502249659935126, "step": 2510 }, { "loss": 1.3612911224365234, "grad_norm": 0.12837456166744232, "learning_rate": 7.529285547679882e-07, "entropy": 1.3736032456159593, "num_tokens": 14627118.0, "mean_token_accuracy": 0.672698700428009, "epoch": 1.0544103798263054, "step": 2520 }, { "loss": 1.2740073204040527, "grad_norm": 0.1267591416835785, "learning_rate": 7.509774671576785e-07, "entropy": 1.3048336684703827, "num_tokens": 14685752.0, "mean_token_accuracy": 0.6858905151486396, "epoch": 1.058595793659098, "step": 2530 }, { "loss": 1.3410483360290528, "grad_norm": 0.11439883708953857, "learning_rate": 7.490212572549666e-07, "entropy": 1.3314668446779252, "num_tokens": 14742644.0, "mean_token_accuracy": 0.6746952176094055, "epoch": 1.0627812074918908, "step": 2540 }, { "loss": 1.2937799453735352, "grad_norm": 0.12421438843011856, "learning_rate": 7.470599649848681e-07, "entropy": 1.3203342527151107, "num_tokens": 14801546.0, "mean_token_accuracy": 0.6863655790686607, "epoch": 1.0669666213246836, "step": 2550 }, { "loss": 1.3474176406860352, "grad_norm": 0.11059686541557312, "learning_rate": 7.450936303761256e-07, "entropy": 1.3507545605301856, "num_tokens": 14861872.0, "mean_token_accuracy": 0.6777540385723114, "epoch": 1.0711520351574761, "step": 2560 }, { "loss": 1.2592041015625, "grad_norm": 0.12262172996997833, "learning_rate": 7.431222935603929e-07, "entropy": 1.2903067260980605, "num_tokens": 14919917.0, "mean_token_accuracy": 0.6862245246767997, "epoch": 1.075337448990269, "step": 2570 }, { "loss": 1.3285273551940917, "grad_norm": 0.1249430701136589, "learning_rate": 7.411459947714156e-07, "entropy": 1.346482941508293, "num_tokens": 14977173.0, "mean_token_accuracy": 0.677224400639534, "epoch": 1.0795228628230615, "step": 2580 }, { "loss": 1.3090217590332032, "grad_norm": 0.15991806983947754, "learning_rate": 7.391647743442103e-07, "entropy": 1.3448469370603562, "num_tokens": 15036719.0, "mean_token_accuracy": 0.6807536914944649, "epoch": 1.0837082766558543, "step": 2590 }, { "loss": 1.385681438446045, "grad_norm": 0.12378425896167755, "learning_rate": 7.37178672714241e-07, "entropy": 1.4169642955064774, "num_tokens": 15093272.0, "mean_token_accuracy": 0.666177037358284, "epoch": 1.0878936904886471, "step": 2600 }, { "eval_loss": 1.3201794624328613, "eval_runtime": 43.6566, "eval_samples_per_second": 145.934, "eval_steps_per_second": 6.093, "eval_entropy": 1.318832386705212, "eval_num_tokens": 15093272.0, "eval_mean_token_accuracy": 0.6805890722382337, "epoch": 1.0878936904886471, "step": 2600 }, { "loss": 1.3124534606933593, "grad_norm": 0.14278866350650787, "learning_rate": 7.351877304165939e-07, "entropy": 1.3207478374242783, "num_tokens": 15151531.0, "mean_token_accuracy": 0.6814302504062653, "epoch": 1.0920791043214397, "step": 2610 }, { "loss": 1.310394859313965, "grad_norm": 0.11016988754272461, "learning_rate": 7.331919880851505e-07, "entropy": 1.3247565850615501, "num_tokens": 15208078.0, "mean_token_accuracy": 0.6797660425305366, "epoch": 1.0962645181542325, "step": 2620 }, { "loss": 1.3397459030151366, "grad_norm": 0.12294236570596695, "learning_rate": 7.311914864517574e-07, "entropy": 1.344627757370472, "num_tokens": 15262908.0, "mean_token_accuracy": 0.6800432533025742, "epoch": 1.1004499319870251, "step": 2630 }, { "loss": 1.3385157585144043, "grad_norm": 0.1285414695739746, "learning_rate": 7.291862663453963e-07, "entropy": 1.342196998000145, "num_tokens": 15323145.0, "mean_token_accuracy": 0.6765275478363038, "epoch": 1.104635345819818, "step": 2640 }, { "loss": 1.30029239654541, "grad_norm": 0.13284096121788025, "learning_rate": 7.271763686913493e-07, "entropy": 1.3492845341563224, "num_tokens": 15380781.0, "mean_token_accuracy": 0.6857595443725586, "epoch": 1.1088207596526107, "step": 2650 }, { "loss": 1.287161159515381, "grad_norm": 0.12089403718709946, "learning_rate": 7.251618345103646e-07, "entropy": 1.3121826618909835, "num_tokens": 15439602.0, "mean_token_accuracy": 0.6850664153695106, "epoch": 1.1130061734854033, "step": 2660 }, { "loss": 1.2762629508972168, "grad_norm": 0.12427452206611633, "learning_rate": 7.231427049178192e-07, "entropy": 1.2992495775222779, "num_tokens": 15495803.0, "mean_token_accuracy": 0.6846798285841942, "epoch": 1.1171915873181961, "step": 2670 }, { "loss": 1.274948501586914, "grad_norm": 0.13808666169643402, "learning_rate": 7.211190211228791e-07, "entropy": 1.305306363105774, "num_tokens": 15550588.0, "mean_token_accuracy": 0.6887386977672577, "epoch": 1.1213770011509887, "step": 2680 }, { "loss": 1.2809961318969727, "grad_norm": 0.1604543924331665, "learning_rate": 7.190908244276592e-07, "entropy": 1.291318878531456, "num_tokens": 15607839.0, "mean_token_accuracy": 0.6838915839791297, "epoch": 1.1255624149837815, "step": 2690 }, { "loss": 1.3102614402770996, "grad_norm": 0.1264321208000183, "learning_rate": 7.170581562263795e-07, "entropy": 1.3290839582681655, "num_tokens": 15666987.0, "mean_token_accuracy": 0.6819840222597122, "epoch": 1.1297478288165743, "step": 2700 }, { "loss": 1.3671725273132325, "grad_norm": 0.1209392324090004, "learning_rate": 7.150210580045207e-07, "entropy": 1.3735456377267838, "num_tokens": 15724955.0, "mean_token_accuracy": 0.6725474014878273, "epoch": 1.133933242649367, "step": 2710 }, { "loss": 1.3231231689453125, "grad_norm": 0.12559957802295685, "learning_rate": 7.129795713379776e-07, "entropy": 1.340329071879387, "num_tokens": 15782149.0, "mean_token_accuracy": 0.6805369645357132, "epoch": 1.1381186564821597, "step": 2720 }, { "loss": 1.2828726768493652, "grad_norm": 0.13034865260124207, "learning_rate": 7.109337378922102e-07, "entropy": 1.2797758102416992, "num_tokens": 15835973.0, "mean_token_accuracy": 0.6902579948306083, "epoch": 1.1423040703149523, "step": 2730 }, { "loss": 1.329068374633789, "grad_norm": 0.1187472939491272, "learning_rate": 7.088835994213937e-07, "entropy": 1.3206837117671966, "num_tokens": 15895605.0, "mean_token_accuracy": 0.6760165989398956, "epoch": 1.146489484147745, "step": 2740 }, { "loss": 1.2608001708984375, "grad_norm": 0.11278735101222992, "learning_rate": 7.068291977675661e-07, "entropy": 1.314364343881607, "num_tokens": 15956260.0, "mean_token_accuracy": 0.6899202361702919, "epoch": 1.150674897980538, "step": 2750 }, { "loss": 1.3079211235046386, "grad_norm": 0.10432706028223038, "learning_rate": 7.047705748597741e-07, "entropy": 1.3454543590545653, "num_tokens": 16013636.0, "mean_token_accuracy": 0.6848849534988404, "epoch": 1.1548603118133305, "step": 2760 }, { "loss": 1.3299365043640137, "grad_norm": 0.1423172652721405, "learning_rate": 7.027077727132178e-07, "entropy": 1.3436584562063216, "num_tokens": 16070788.0, "mean_token_accuracy": 0.6782758548855782, "epoch": 1.1590457256461233, "step": 2770 }, { "loss": 1.2507868766784669, "grad_norm": 0.12985938787460327, "learning_rate": 7.006408334283929e-07, "entropy": 1.300880002975464, "num_tokens": 16132003.0, "mean_token_accuracy": 0.6908931702375412, "epoch": 1.163231139478916, "step": 2780 }, { "loss": 1.3301843643188476, "grad_norm": 0.14071504771709442, "learning_rate": 6.985697991902313e-07, "entropy": 1.3270384550094605, "num_tokens": 16192149.0, "mean_token_accuracy": 0.6777920231223107, "epoch": 1.1674165533117087, "step": 2790 }, { "loss": 1.2743472099304198, "grad_norm": 0.1140187457203865, "learning_rate": 6.964947122672406e-07, "entropy": 1.3053037211298943, "num_tokens": 16251607.0, "mean_token_accuracy": 0.6888150230050087, "epoch": 1.1716019671445015, "step": 2800 }, { "eval_loss": 1.3166502714157104, "eval_runtime": 43.6438, "eval_samples_per_second": 145.977, "eval_steps_per_second": 6.095, "eval_entropy": 1.3201469900016498, "eval_num_tokens": 16251607.0, "eval_mean_token_accuracy": 0.6811264934844541, "epoch": 1.1716019671445015, "step": 2800 }, { "loss": 1.3409744262695313, "grad_norm": 0.10443054884672165, "learning_rate": 6.944156150106407e-07, "entropy": 1.342512857913971, "num_tokens": 16312813.0, "mean_token_accuracy": 0.6724711164832116, "epoch": 1.175787380977294, "step": 2810 }, { "loss": 1.3988855361938477, "grad_norm": 0.1189141720533371, "learning_rate": 6.923325498535005e-07, "entropy": 1.396900659799576, "num_tokens": 16370227.0, "mean_token_accuracy": 0.6674019232392311, "epoch": 1.1799727948100869, "step": 2820 }, { "loss": 1.3619994163513183, "grad_norm": 0.11577111482620239, "learning_rate": 6.902455593098711e-07, "entropy": 1.3739877551794053, "num_tokens": 16431284.0, "mean_token_accuracy": 0.6685123056173324, "epoch": 1.1841582086428795, "step": 2830 }, { "loss": 1.2823293685913086, "grad_norm": 0.1623101532459259, "learning_rate": 6.881546859739178e-07, "entropy": 1.287187758088112, "num_tokens": 16490232.0, "mean_token_accuracy": 0.6852916941046715, "epoch": 1.1883436224756723, "step": 2840 }, { "loss": 1.295179557800293, "grad_norm": 0.1286296844482422, "learning_rate": 6.860599725190516e-07, "entropy": 1.3181857854127883, "num_tokens": 16549313.0, "mean_token_accuracy": 0.6848940759897232, "epoch": 1.192529036308465, "step": 2850 }, { "loss": 1.351776695251465, "grad_norm": 0.12631654739379883, "learning_rate": 6.839614616970579e-07, "entropy": 1.3548940598964692, "num_tokens": 16607551.0, "mean_token_accuracy": 0.6714933633804321, "epoch": 1.1967144501412577, "step": 2860 }, { "loss": 1.3425410270690918, "grad_norm": 0.1304273158311844, "learning_rate": 6.818591963372242e-07, "entropy": 1.3392845541238785, "num_tokens": 16667124.0, "mean_token_accuracy": 0.6802757531404495, "epoch": 1.2008998639740505, "step": 2870 }, { "loss": 1.3046100616455079, "grad_norm": 0.12168211489915848, "learning_rate": 6.797532193454654e-07, "entropy": 1.3106303334236145, "num_tokens": 16725868.0, "mean_token_accuracy": 0.6820132330060005, "epoch": 1.2050852778068433, "step": 2880 }, { "loss": 1.266930389404297, "grad_norm": 0.16589786112308502, "learning_rate": 6.776435737034484e-07, "entropy": 1.2930086612701417, "num_tokens": 16780751.0, "mean_token_accuracy": 0.6916173666715622, "epoch": 1.2092706916396359, "step": 2890 }, { "loss": 1.290895366668701, "grad_norm": 0.11063241213560104, "learning_rate": 6.755303024677153e-07, "entropy": 1.3148932754993439, "num_tokens": 16838274.0, "mean_token_accuracy": 0.6836249440908432, "epoch": 1.2134561054724287, "step": 2900 }, { "loss": 1.3228137016296386, "grad_norm": 0.10892044007778168, "learning_rate": 6.734134487688043e-07, "entropy": 1.3384662061929702, "num_tokens": 16896457.0, "mean_token_accuracy": 0.6798223108053207, "epoch": 1.2176415193052212, "step": 2910 }, { "loss": 1.321933650970459, "grad_norm": 0.13741441071033478, "learning_rate": 6.712930558103691e-07, "entropy": 1.3460487127304077, "num_tokens": 16955127.0, "mean_token_accuracy": 0.6765735790133476, "epoch": 1.221826933138014, "step": 2920 }, { "loss": 1.3986333847045898, "grad_norm": 0.13116198778152466, "learning_rate": 6.691691668682977e-07, "entropy": 1.3796002447605134, "num_tokens": 17010269.0, "mean_token_accuracy": 0.6650555938482284, "epoch": 1.2260123469708066, "step": 2930 }, { "loss": 1.3275323867797852, "grad_norm": 0.1158343181014061, "learning_rate": 6.670418252898284e-07, "entropy": 1.3303757071495057, "num_tokens": 17067471.0, "mean_token_accuracy": 0.6782015576958657, "epoch": 1.2301977608035994, "step": 2940 }, { "loss": 1.299326515197754, "grad_norm": 0.13345105946063995, "learning_rate": 6.649110744926669e-07, "entropy": 1.319593369960785, "num_tokens": 17123848.0, "mean_token_accuracy": 0.6838883191347123, "epoch": 1.2343831746363922, "step": 2950 }, { "loss": 1.2679595947265625, "grad_norm": 0.13203385472297668, "learning_rate": 6.627769579640975e-07, "entropy": 1.2961439684033393, "num_tokens": 17180001.0, "mean_token_accuracy": 0.6859666183590889, "epoch": 1.2385685884691848, "step": 2960 }, { "loss": 1.3538383483886718, "grad_norm": 0.1471163034439087, "learning_rate": 6.606395192600978e-07, "entropy": 1.3352440029382706, "num_tokens": 17233262.0, "mean_token_accuracy": 0.6754373088479042, "epoch": 1.2427540023019776, "step": 2970 }, { "loss": 1.3183878898620605, "grad_norm": 0.12840019166469574, "learning_rate": 6.584988020044485e-07, "entropy": 1.349251627922058, "num_tokens": 17287228.0, "mean_token_accuracy": 0.6815307438373566, "epoch": 1.2469394161347704, "step": 2980 }, { "loss": 1.2082359313964843, "grad_norm": 0.15220077335834503, "learning_rate": 6.563548498878438e-07, "entropy": 1.2743981599807739, "num_tokens": 17348380.0, "mean_token_accuracy": 0.6978771463036537, "epoch": 1.251124829967563, "step": 2990 }, { "loss": 1.3413416862487793, "grad_norm": 0.12702776491641998, "learning_rate": 6.542077066669993e-07, "entropy": 1.338026624917984, "num_tokens": 17403328.0, "mean_token_accuracy": 0.675315049290657, "epoch": 1.2553102438003558, "step": 3000 }, { "eval_loss": 1.3136601448059082, "eval_runtime": 43.154, "eval_samples_per_second": 147.634, "eval_steps_per_second": 6.164, "eval_entropy": 1.3217154624766874, "eval_num_tokens": 17403328.0, "eval_mean_token_accuracy": 0.6815465722317086, "epoch": 1.2553102438003558, "step": 3000 }, { "loss": 1.3451406478881835, "grad_norm": 0.1156093031167984, "learning_rate": 6.52057416163759e-07, "entropy": 1.3520446419715881, "num_tokens": 17460648.0, "mean_token_accuracy": 0.6702774554491043, "epoch": 1.2594956576331484, "step": 3010 }, { "loss": 1.3822593688964844, "grad_norm": 0.12327724695205688, "learning_rate": 6.499040222642007e-07, "entropy": 1.365411925315857, "num_tokens": 17519410.0, "mean_token_accuracy": 0.6692025378346443, "epoch": 1.2636810714659412, "step": 3020 }, { "loss": 1.3682982444763183, "grad_norm": 0.12616313993930817, "learning_rate": 6.477475689177407e-07, "entropy": 1.3488513588905335, "num_tokens": 17575033.0, "mean_token_accuracy": 0.6721004649996758, "epoch": 1.2678664852987338, "step": 3030 }, { "loss": 1.3206647872924804, "grad_norm": 0.11206343024969101, "learning_rate": 6.455881001362372e-07, "entropy": 1.3416712805628777, "num_tokens": 17634281.0, "mean_token_accuracy": 0.6792711272835732, "epoch": 1.2720518991315266, "step": 3040 }, { "loss": 1.368018913269043, "grad_norm": 0.1311446875333786, "learning_rate": 6.434256599930909e-07, "entropy": 1.37212732732296, "num_tokens": 17689407.0, "mean_token_accuracy": 0.6717290371656418, "epoch": 1.2762373129643194, "step": 3050 }, { "loss": 1.3607032775878907, "grad_norm": 0.14133571088314056, "learning_rate": 6.412602926223464e-07, "entropy": 1.3578105926513673, "num_tokens": 17748080.0, "mean_token_accuracy": 0.6729270294308662, "epoch": 1.280422726797112, "step": 3060 }, { "loss": 1.259375, "grad_norm": 0.12888510525226593, "learning_rate": 6.390920422177909e-07, "entropy": 1.2880975693464278, "num_tokens": 17809457.0, "mean_token_accuracy": 0.6906314134597779, "epoch": 1.2846081406299048, "step": 3070 }, { "loss": 1.2357722282409669, "grad_norm": 0.09784252196550369, "learning_rate": 6.36920953032053e-07, "entropy": 1.3111811935901643, "num_tokens": 17871869.0, "mean_token_accuracy": 0.6910292714834213, "epoch": 1.2887935544626976, "step": 3080 }, { "loss": 1.2405315399169923, "grad_norm": 0.13264605402946472, "learning_rate": 6.347470693756987e-07, "entropy": 1.2896562367677689, "num_tokens": 17933114.0, "mean_token_accuracy": 0.6923574149608612, "epoch": 1.2929789682954902, "step": 3090 }, { "loss": 1.2628044128417968, "grad_norm": 0.11528719961643219, "learning_rate": 6.325704356163273e-07, "entropy": 1.2994973942637444, "num_tokens": 17989694.0, "mean_token_accuracy": 0.6884831428527832, "epoch": 1.297164382128283, "step": 3100 }, { "loss": 1.2996297836303712, "grad_norm": 0.1078164130449295, "learning_rate": 6.303910961776664e-07, "entropy": 1.315569232404232, "num_tokens": 18049760.0, "mean_token_accuracy": 0.6808459624648094, "epoch": 1.3013497959610756, "step": 3110 }, { "loss": 1.3285269737243652, "grad_norm": 0.13121522963047028, "learning_rate": 6.282090955386642e-07, "entropy": 1.3389025837183, "num_tokens": 18106326.0, "mean_token_accuracy": 0.6797921672463417, "epoch": 1.3055352097938684, "step": 3120 }, { "loss": 1.3413330078125, "grad_norm": 0.1084539070725441, "learning_rate": 6.260244782325829e-07, "entropy": 1.3604058563709258, "num_tokens": 18165478.0, "mean_token_accuracy": 0.6726910755038261, "epoch": 1.309720623626661, "step": 3130 }, { "loss": 1.3873212814331055, "grad_norm": 0.11267993599176407, "learning_rate": 6.238372888460892e-07, "entropy": 1.404004666209221, "num_tokens": 18221418.0, "mean_token_accuracy": 0.6710177347064018, "epoch": 1.3139060374594538, "step": 3140 }, { "loss": 1.312180519104004, "grad_norm": 0.12451887875795364, "learning_rate": 6.216475720183437e-07, "entropy": 1.322364729642868, "num_tokens": 18278027.0, "mean_token_accuracy": 0.6799433350563049, "epoch": 1.3180914512922466, "step": 3150 }, { "loss": 1.2946537017822266, "grad_norm": 0.15065018832683563, "learning_rate": 6.194553724400911e-07, "entropy": 1.3054640024900437, "num_tokens": 18334990.0, "mean_token_accuracy": 0.6847308576107025, "epoch": 1.3222768651250392, "step": 3160 }, { "loss": 1.2657323837280274, "grad_norm": 0.11712754517793655, "learning_rate": 6.172607348527474e-07, "entropy": 1.2842485100030898, "num_tokens": 18393253.0, "mean_token_accuracy": 0.6883261352777481, "epoch": 1.326462278957832, "step": 3170 }, { "loss": 1.3007762908935547, "grad_norm": 0.16621780395507812, "learning_rate": 6.150637040474868e-07, "entropy": 1.3247014865279199, "num_tokens": 18449407.0, "mean_token_accuracy": 0.6868977710604668, "epoch": 1.3306476927906248, "step": 3180 }, { "loss": 1.2898554801940918, "grad_norm": 0.13524088263511658, "learning_rate": 6.128643248643274e-07, "entropy": 1.305448915064335, "num_tokens": 18506773.0, "mean_token_accuracy": 0.6856573060154915, "epoch": 1.3348331066234174, "step": 3190 }, { "loss": 1.3588788986206055, "grad_norm": 0.1192813366651535, "learning_rate": 6.106626421912163e-07, "entropy": 1.3537309616804123, "num_tokens": 18568894.0, "mean_token_accuracy": 0.6747590154409409, "epoch": 1.3390185204562102, "step": 3200 }, { "eval_loss": 1.3109967708587646, "eval_runtime": 43.5438, "eval_samples_per_second": 146.312, "eval_steps_per_second": 6.109, "eval_entropy": 1.3213256283810264, "eval_num_tokens": 18568894.0, "eval_mean_token_accuracy": 0.6819047645518654, "epoch": 1.3390185204562102, "step": 3200 }, { "loss": 1.3187339782714844, "grad_norm": 0.12998685240745544, "learning_rate": 6.084587009631135e-07, "entropy": 1.3331160172820091, "num_tokens": 18627669.0, "mean_token_accuracy": 0.6809702217578888, "epoch": 1.3432039342890028, "step": 3210 }, { "loss": 1.287522792816162, "grad_norm": 0.11987276375293732, "learning_rate": 6.062525461610746e-07, "entropy": 1.3107565701007844, "num_tokens": 18690323.0, "mean_token_accuracy": 0.6874667569994927, "epoch": 1.3473893481217956, "step": 3220 }, { "loss": 1.2994555473327636, "grad_norm": 0.14244310557842255, "learning_rate": 6.040442228113328e-07, "entropy": 1.3177940219640731, "num_tokens": 18749330.0, "mean_token_accuracy": 0.6785065039992333, "epoch": 1.3515747619545881, "step": 3230 }, { "loss": 1.348573875427246, "grad_norm": 0.12585744261741638, "learning_rate": 6.018337759843803e-07, "entropy": 1.3356850504875184, "num_tokens": 18805120.0, "mean_token_accuracy": 0.676536102592945, "epoch": 1.355760175787381, "step": 3240 }, { "loss": 1.3361006736755372, "grad_norm": 0.1416776031255722, "learning_rate": 5.996212507940475e-07, "entropy": 1.355094811320305, "num_tokens": 18861563.0, "mean_token_accuracy": 0.6747770622372627, "epoch": 1.3599455896201738, "step": 3250 }, { "loss": 1.2256298065185547, "grad_norm": 0.10666567087173462, "learning_rate": 5.974066923965835e-07, "entropy": 1.2700331062078476, "num_tokens": 18922654.0, "mean_token_accuracy": 0.697255577147007, "epoch": 1.3641310034529663, "step": 3260 }, { "loss": 1.3261382102966308, "grad_norm": 0.1291145235300064, "learning_rate": 5.951901459897337e-07, "entropy": 1.351950439810753, "num_tokens": 18980966.0, "mean_token_accuracy": 0.6802997335791587, "epoch": 1.3683164172857591, "step": 3270 }, { "loss": 1.339816188812256, "grad_norm": 0.12637273967266083, "learning_rate": 5.929716568118176e-07, "entropy": 1.341824659705162, "num_tokens": 19041925.0, "mean_token_accuracy": 0.6740039184689521, "epoch": 1.372501831118552, "step": 3280 }, { "loss": 1.322571086883545, "grad_norm": 0.13360652327537537, "learning_rate": 5.907512701408049e-07, "entropy": 1.3231751516461372, "num_tokens": 19097885.0, "mean_token_accuracy": 0.6797602906823158, "epoch": 1.3766872449513445, "step": 3290 }, { "loss": 1.2750181198120116, "grad_norm": 0.1212676391005516, "learning_rate": 5.885290312933929e-07, "entropy": 1.2946186915040017, "num_tokens": 19156636.0, "mean_token_accuracy": 0.6879573374986648, "epoch": 1.3808726587841373, "step": 3300 }, { "loss": 1.2478185653686524, "grad_norm": 0.10930495709180832, "learning_rate": 5.863049856240797e-07, "entropy": 1.2818130880594254, "num_tokens": 19215539.0, "mean_token_accuracy": 0.6884996458888054, "epoch": 1.38505807261693, "step": 3310 }, { "loss": 1.2877882957458495, "grad_norm": 0.14618222415447235, "learning_rate": 5.840791785242399e-07, "entropy": 1.3158632695674897, "num_tokens": 19275155.0, "mean_token_accuracy": 0.6801917359232903, "epoch": 1.3892434864497227, "step": 3320 }, { "loss": 1.3204275131225587, "grad_norm": 0.13408797979354858, "learning_rate": 5.818516554211983e-07, "entropy": 1.3392174810171127, "num_tokens": 19331554.0, "mean_token_accuracy": 0.6769860580563545, "epoch": 1.3934289002825153, "step": 3330 }, { "loss": 1.2603473663330078, "grad_norm": 0.15371856093406677, "learning_rate": 5.796224617773012e-07, "entropy": 1.2774315923452377, "num_tokens": 19389359.0, "mean_token_accuracy": 0.6910146772861481, "epoch": 1.3976143141153081, "step": 3340 }, { "loss": 1.3022661209106445, "grad_norm": 0.1194000095129013, "learning_rate": 5.773916430889905e-07, "entropy": 1.3322788611054421, "num_tokens": 19449266.0, "mean_token_accuracy": 0.6764059454202652, "epoch": 1.401799727948101, "step": 3350 }, { "loss": 1.280670738220215, "grad_norm": 0.1414560228586197, "learning_rate": 5.751592448858737e-07, "entropy": 1.28292535841465, "num_tokens": 19505413.0, "mean_token_accuracy": 0.6831368803977966, "epoch": 1.4059851417808935, "step": 3360 }, { "loss": 1.2672719955444336, "grad_norm": 0.1158173456788063, "learning_rate": 5.729253127297955e-07, "entropy": 1.2811419636011123, "num_tokens": 19564391.0, "mean_token_accuracy": 0.6885835364460945, "epoch": 1.4101705556136863, "step": 3370 }, { "loss": 1.3353286743164063, "grad_norm": 0.12490648031234741, "learning_rate": 5.706898922139074e-07, "entropy": 1.3280266046524047, "num_tokens": 19623582.0, "mean_token_accuracy": 0.6795374467968941, "epoch": 1.4143559694464791, "step": 3380 }, { "loss": 1.2236414909362794, "grad_norm": 0.1268617957830429, "learning_rate": 5.684530289617376e-07, "entropy": 1.281736159324646, "num_tokens": 19682008.0, "mean_token_accuracy": 0.6963353782892228, "epoch": 1.4185413832792717, "step": 3390 }, { "loss": 1.365687370300293, "grad_norm": 0.12744104862213135, "learning_rate": 5.662147686262595e-07, "entropy": 1.3710105925798417, "num_tokens": 19735892.0, "mean_token_accuracy": 0.6697546020150185, "epoch": 1.4227267971120645, "step": 3400 }, { "eval_loss": 1.3087373971939087, "eval_runtime": 43.1953, "eval_samples_per_second": 147.493, "eval_steps_per_second": 6.158, "eval_entropy": 1.314033669636662, "eval_num_tokens": 19735892.0, "eval_mean_token_accuracy": 0.6822930157632756, "epoch": 1.4227267971120645, "step": 3400 }, { "loss": 1.2659673690795898, "grad_norm": 0.12900042533874512, "learning_rate": 5.639751568889601e-07, "entropy": 1.2991064012050628, "num_tokens": 19795701.0, "mean_token_accuracy": 0.6906736105680465, "epoch": 1.426912210944857, "step": 3410 }, { "loss": 1.3444564819335938, "grad_norm": 0.141217440366745, "learning_rate": 5.617342394589076e-07, "entropy": 1.328627872467041, "num_tokens": 19852770.0, "mean_token_accuracy": 0.6754001170396805, "epoch": 1.43109762477765, "step": 3420 }, { "loss": 1.306549644470215, "grad_norm": 0.12908576428890228, "learning_rate": 5.594920620718189e-07, "entropy": 1.3152456805109978, "num_tokens": 19912101.0, "mean_token_accuracy": 0.6831103786826134, "epoch": 1.4352830386104425, "step": 3430 }, { "loss": 1.4111559867858887, "grad_norm": 0.12521252036094666, "learning_rate": 5.572486704891254e-07, "entropy": 1.3963622391223907, "num_tokens": 19969427.0, "mean_token_accuracy": 0.6657738149166107, "epoch": 1.4394684524432353, "step": 3440 }, { "loss": 1.3437400817871095, "grad_norm": 0.14173802733421326, "learning_rate": 5.550041104970396e-07, "entropy": 1.3275486350059509, "num_tokens": 20030520.0, "mean_token_accuracy": 0.6768685072660446, "epoch": 1.443653866276028, "step": 3450 }, { "loss": 1.3257243156433105, "grad_norm": 0.12309889495372772, "learning_rate": 5.527584279056207e-07, "entropy": 1.3419605940580368, "num_tokens": 20088125.0, "mean_token_accuracy": 0.6762049332261085, "epoch": 1.4478392801088207, "step": 3460 }, { "loss": 1.226247215270996, "grad_norm": 0.13416838645935059, "learning_rate": 5.505116685478394e-07, "entropy": 1.284440317749977, "num_tokens": 20147039.0, "mean_token_accuracy": 0.6947048246860504, "epoch": 1.4520246939416135, "step": 3470 }, { "loss": 1.3330312728881837, "grad_norm": 0.1232227310538292, "learning_rate": 5.48263878278642e-07, "entropy": 1.3413183093070984, "num_tokens": 20205035.0, "mean_token_accuracy": 0.67842618227005, "epoch": 1.4562101077744063, "step": 3480 }, { "loss": 1.3752978324890137, "grad_norm": 0.1453479379415512, "learning_rate": 5.460151029740161e-07, "entropy": 1.3477472990751267, "num_tokens": 20260344.0, "mean_token_accuracy": 0.6687687709927559, "epoch": 1.4603955216071989, "step": 3490 }, { "loss": 1.221930980682373, "grad_norm": 0.1277054399251938, "learning_rate": 5.437653885300522e-07, "entropy": 1.261066934466362, "num_tokens": 20318023.0, "mean_token_accuracy": 0.6955515563488006, "epoch": 1.4645809354399917, "step": 3500 }, { "loss": 1.2911027908325194, "grad_norm": 0.12536244094371796, "learning_rate": 5.415147808620086e-07, "entropy": 1.3049872070550919, "num_tokens": 20376931.0, "mean_token_accuracy": 0.6845586389303208, "epoch": 1.4687663492727843, "step": 3510 }, { "loss": 1.289406967163086, "grad_norm": 0.10206779837608337, "learning_rate": 5.392633259033735e-07, "entropy": 1.3262745544314385, "num_tokens": 20435694.0, "mean_token_accuracy": 0.6822992920875549, "epoch": 1.472951763105577, "step": 3520 }, { "loss": 1.36239652633667, "grad_norm": 0.12939555943012238, "learning_rate": 5.370110696049282e-07, "entropy": 1.353842854499817, "num_tokens": 20494766.0, "mean_token_accuracy": 0.6707737103104592, "epoch": 1.4771371769383697, "step": 3530 }, { "loss": 1.2756587028503419, "grad_norm": 0.11803429573774338, "learning_rate": 5.34758057933808e-07, "entropy": 1.3108687788248061, "num_tokens": 20552412.0, "mean_token_accuracy": 0.6879084140062333, "epoch": 1.4813225907711625, "step": 3540 }, { "loss": 1.328935432434082, "grad_norm": 0.13244083523750305, "learning_rate": 5.325043368725662e-07, "entropy": 1.3331556499004364, "num_tokens": 20610804.0, "mean_token_accuracy": 0.6783339202404022, "epoch": 1.4855080046039553, "step": 3550 }, { "loss": 1.334804153442383, "grad_norm": 0.14279146492481232, "learning_rate": 5.302499524182327e-07, "entropy": 1.3319466978311538, "num_tokens": 20668475.0, "mean_token_accuracy": 0.6799613311886787, "epoch": 1.4896934184367479, "step": 3560 }, { "loss": 1.2445655822753907, "grad_norm": 0.137944757938385, "learning_rate": 5.279949505813783e-07, "entropy": 1.2786899566650392, "num_tokens": 20731142.0, "mean_token_accuracy": 0.6872789070010186, "epoch": 1.4938788322695407, "step": 3570 }, { "loss": 1.2799750328063966, "grad_norm": 0.14270278811454773, "learning_rate": 5.257393773851733e-07, "entropy": 1.3207889288663863, "num_tokens": 20791636.0, "mean_token_accuracy": 0.6855424389243125, "epoch": 1.4980642461023335, "step": 3580 }, { "loss": 1.2714473724365234, "grad_norm": 0.13160590827465057, "learning_rate": 5.234832788644492e-07, "entropy": 1.2881531581282615, "num_tokens": 20850942.0, "mean_token_accuracy": 0.6868263691663742, "epoch": 1.502249659935126, "step": 3590 }, { "loss": 1.377396297454834, "grad_norm": 0.13724471628665924, "learning_rate": 5.212267010647594e-07, "entropy": 1.350425472855568, "num_tokens": 20909794.0, "mean_token_accuracy": 0.6673172801733017, "epoch": 1.5064350737679189, "step": 3600 }, { "eval_loss": 1.3067371845245361, "eval_runtime": 43.2705, "eval_samples_per_second": 147.237, "eval_steps_per_second": 6.147, "eval_entropy": 1.3166241054248093, "eval_num_tokens": 20909794.0, "eval_mean_token_accuracy": 0.6824193930715546, "epoch": 1.5064350737679189, "step": 3600 }, { "loss": 1.3214588165283203, "grad_norm": 0.12896448373794556, "learning_rate": 5.189696900414387e-07, "entropy": 1.3233384594321251, "num_tokens": 20966668.0, "mean_token_accuracy": 0.6812221944332123, "epoch": 1.5106204876007117, "step": 3610 }, { "loss": 1.2983431816101074, "grad_norm": 0.11282876133918762, "learning_rate": 5.167122918586641e-07, "entropy": 1.3307133883237838, "num_tokens": 21019757.0, "mean_token_accuracy": 0.6807741552591324, "epoch": 1.5148059014335042, "step": 3620 }, { "loss": 1.2715835571289062, "grad_norm": 0.15096786618232727, "learning_rate": 5.144545525885137e-07, "entropy": 1.3070465952157975, "num_tokens": 21077694.0, "mean_token_accuracy": 0.6909742683172226, "epoch": 1.5189913152662968, "step": 3630 }, { "loss": 1.2847407341003418, "grad_norm": 0.13508452475070953, "learning_rate": 5.121965183100278e-07, "entropy": 1.2937607616186142, "num_tokens": 21135607.0, "mean_token_accuracy": 0.6900022774934769, "epoch": 1.5231767290990896, "step": 3640 }, { "loss": 1.3469314575195312, "grad_norm": 0.12108864635229111, "learning_rate": 5.099382351082666e-07, "entropy": 1.3381920427083969, "num_tokens": 21196736.0, "mean_token_accuracy": 0.6733641669154167, "epoch": 1.5273621429318824, "step": 3650 }, { "loss": 1.3375173568725587, "grad_norm": 0.10356143862009048, "learning_rate": 5.076797490733718e-07, "entropy": 1.339997085928917, "num_tokens": 21253173.0, "mean_token_accuracy": 0.6792127892374993, "epoch": 1.531547556764675, "step": 3660 }, { "loss": 1.2826983451843261, "grad_norm": 0.13479599356651306, "learning_rate": 5.054211062996241e-07, "entropy": 1.305300708115101, "num_tokens": 21309039.0, "mean_token_accuracy": 0.6865562707185745, "epoch": 1.5357329705974678, "step": 3670 }, { "loss": 1.2372420310974122, "grad_norm": 0.13392086327075958, "learning_rate": 5.031623528845032e-07, "entropy": 1.2712924674153327, "num_tokens": 21368207.0, "mean_token_accuracy": 0.6935058936476708, "epoch": 1.5399183844302606, "step": 3680 }, { "loss": 1.232171630859375, "grad_norm": 0.13666661083698273, "learning_rate": 5.009035349277469e-07, "entropy": 1.2765518009662629, "num_tokens": 21425778.0, "mean_token_accuracy": 0.6935865059494972, "epoch": 1.5441037982630532, "step": 3690 }, { "loss": 1.2454364776611329, "grad_norm": 0.11271411925554276, "learning_rate": 4.986446985304105e-07, "entropy": 1.2914676815271378, "num_tokens": 21484225.0, "mean_token_accuracy": 0.6902065351605415, "epoch": 1.548289212095846, "step": 3700 }, { "loss": 1.3030299186706542, "grad_norm": 0.1399720460176468, "learning_rate": 4.963858897939254e-07, "entropy": 1.3240129977464676, "num_tokens": 21541427.0, "mean_token_accuracy": 0.6800246313214302, "epoch": 1.5524746259286388, "step": 3710 }, { "loss": 1.2892935752868653, "grad_norm": 0.12150213122367859, "learning_rate": 4.941271548191588e-07, "entropy": 1.3206008851528168, "num_tokens": 21600143.0, "mean_token_accuracy": 0.6826386615633965, "epoch": 1.5566600397614314, "step": 3720 }, { "loss": 1.257300853729248, "grad_norm": 0.12151734530925751, "learning_rate": 4.918685397054718e-07, "entropy": 1.3101585179567337, "num_tokens": 21656388.0, "mean_token_accuracy": 0.6894284501671791, "epoch": 1.560845453594224, "step": 3730 }, { "loss": 1.233230972290039, "grad_norm": 0.11451518535614014, "learning_rate": 4.896100905497803e-07, "entropy": 1.2788519978523254, "num_tokens": 21715109.0, "mean_token_accuracy": 0.6912301525473594, "epoch": 1.5650308674270168, "step": 3740 }, { "loss": 1.3578661918640136, "grad_norm": 0.12948159873485565, "learning_rate": 4.873518534456119e-07, "entropy": 1.3681051909923554, "num_tokens": 21772746.0, "mean_token_accuracy": 0.6737246960401535, "epoch": 1.5692162812598096, "step": 3750 }, { "loss": 1.3305506706237793, "grad_norm": 0.132918581366539, "learning_rate": 4.850938744821674e-07, "entropy": 1.3546297058463097, "num_tokens": 21830592.0, "mean_token_accuracy": 0.6777592465281487, "epoch": 1.5734016950926022, "step": 3760 }, { "loss": 1.2513206481933594, "grad_norm": 0.11073267459869385, "learning_rate": 4.828361997433782e-07, "entropy": 1.3076282858848571, "num_tokens": 21892167.0, "mean_token_accuracy": 0.68986496925354, "epoch": 1.577587108925395, "step": 3770 }, { "loss": 1.271761131286621, "grad_norm": 0.1349179595708847, "learning_rate": 4.805788753069673e-07, "entropy": 1.3031177580356599, "num_tokens": 21952326.0, "mean_token_accuracy": 0.6844272211194038, "epoch": 1.5817725227581878, "step": 3780 }, { "loss": 1.3118972778320312, "grad_norm": 0.16216766834259033, "learning_rate": 4.783219472435081e-07, "entropy": 1.3089008510112763, "num_tokens": 22012045.0, "mean_token_accuracy": 0.6802516788244247, "epoch": 1.5859579365909804, "step": 3790 }, { "loss": 1.276634979248047, "grad_norm": 0.13579685986042023, "learning_rate": 4.760654616154842e-07, "entropy": 1.309640994668007, "num_tokens": 22068798.0, "mean_token_accuracy": 0.6864374697208404, "epoch": 1.5901433504237732, "step": 3800 }, { "eval_loss": 1.3049228191375732, "eval_runtime": 43.7061, "eval_samples_per_second": 145.769, "eval_steps_per_second": 6.086, "eval_entropy": 1.3191418598469038, "eval_num_tokens": 22068798.0, "eval_mean_token_accuracy": 0.6827442256131566, "epoch": 1.5901433504237732, "step": 3800 }, { "loss": 1.2272584915161133, "grad_norm": 0.14792950451374054, "learning_rate": 4.7380946447634935e-07, "entropy": 1.2662998199462892, "num_tokens": 22128395.0, "mean_token_accuracy": 0.6912488013505935, "epoch": 1.594328764256566, "step": 3810 }, { "loss": 1.3102972030639648, "grad_norm": 0.13187845051288605, "learning_rate": 4.7155400186958744e-07, "entropy": 1.3161917060613633, "num_tokens": 22185985.0, "mean_token_accuracy": 0.6805871248245239, "epoch": 1.5985141780893586, "step": 3820 }, { "loss": 1.2989760398864747, "grad_norm": 0.11845917999744415, "learning_rate": 4.6929911982777325e-07, "entropy": 1.3359744518995285, "num_tokens": 22241668.0, "mean_token_accuracy": 0.6845213517546653, "epoch": 1.6026995919221512, "step": 3830 }, { "loss": 1.3590301513671874, "grad_norm": 0.15108934044837952, "learning_rate": 4.670448643716322e-07, "entropy": 1.3409444272518158, "num_tokens": 22297005.0, "mean_token_accuracy": 0.6736213758587837, "epoch": 1.606885005754944, "step": 3840 }, { "loss": 1.3382845878601075, "grad_norm": 0.1333889216184616, "learning_rate": 4.6479128150910196e-07, "entropy": 1.3449043482542038, "num_tokens": 22357044.0, "mean_token_accuracy": 0.675865213572979, "epoch": 1.6110704195877368, "step": 3850 }, { "loss": 1.310408592224121, "grad_norm": 0.13304699957370758, "learning_rate": 4.625384172343926e-07, "entropy": 1.3386895060539246, "num_tokens": 22413961.0, "mean_token_accuracy": 0.6803735584020615, "epoch": 1.6152558334205294, "step": 3860 }, { "loss": 1.2976115226745606, "grad_norm": 0.11845416575670242, "learning_rate": 4.602863175270483e-07, "entropy": 1.3106703519821168, "num_tokens": 22473509.0, "mean_token_accuracy": 0.6821468025445938, "epoch": 1.6194412472533222, "step": 3870 }, { "loss": 1.2801057815551757, "grad_norm": 0.13403619825839996, "learning_rate": 4.580350283510088e-07, "entropy": 1.295821413397789, "num_tokens": 22527439.0, "mean_token_accuracy": 0.688027186691761, "epoch": 1.623626661086115, "step": 3880 }, { "loss": 1.2886218070983886, "grad_norm": 0.14298778772354126, "learning_rate": 4.55784595653672e-07, "entropy": 1.300849825143814, "num_tokens": 22586180.0, "mean_token_accuracy": 0.6847355782985687, "epoch": 1.6278120749189076, "step": 3890 }, { "loss": 1.3447054862976073, "grad_norm": 0.13079994916915894, "learning_rate": 4.535350653649549e-07, "entropy": 1.3326701998710633, "num_tokens": 22642360.0, "mean_token_accuracy": 0.6774184912443161, "epoch": 1.6319974887517004, "step": 3900 }, { "loss": 1.3529298782348633, "grad_norm": 0.10864491760730743, "learning_rate": 4.512864833963571e-07, "entropy": 1.338020858168602, "num_tokens": 22699591.0, "mean_token_accuracy": 0.6726731553673744, "epoch": 1.6361829025844932, "step": 3910 }, { "loss": 1.2351530075073243, "grad_norm": 0.12713301181793213, "learning_rate": 4.4903889564002394e-07, "entropy": 1.2726581797003746, "num_tokens": 22757192.0, "mean_token_accuracy": 0.6924531191587449, "epoch": 1.6403683164172858, "step": 3920 }, { "loss": 1.2388504028320313, "grad_norm": 0.12611106038093567, "learning_rate": 4.467923479678091e-07, "entropy": 1.2651499658823013, "num_tokens": 22813695.0, "mean_token_accuracy": 0.6929998561739922, "epoch": 1.6445537302500783, "step": 3930 }, { "loss": 1.2846209526062011, "grad_norm": 0.1429147869348526, "learning_rate": 4.4454688623033894e-07, "entropy": 1.3101652726531028, "num_tokens": 22873575.0, "mean_token_accuracy": 0.6836878523230553, "epoch": 1.6487391440828711, "step": 3940 }, { "loss": 1.3049224853515624, "grad_norm": 0.1356530785560608, "learning_rate": 4.4230255625607637e-07, "entropy": 1.3245902001857757, "num_tokens": 22930361.0, "mean_token_accuracy": 0.6820057585835457, "epoch": 1.652924557915664, "step": 3950 }, { "loss": 1.3374080657958984, "grad_norm": 0.13884375989437103, "learning_rate": 4.400594038503864e-07, "entropy": 1.3170197814702989, "num_tokens": 22987617.0, "mean_token_accuracy": 0.6769963175058364, "epoch": 1.6571099717484565, "step": 3960 }, { "loss": 1.3491817474365235, "grad_norm": 0.1354888528585434, "learning_rate": 4.3781747479459974e-07, "entropy": 1.3501463949680328, "num_tokens": 23042051.0, "mean_token_accuracy": 0.6761467263102532, "epoch": 1.6612953855812493, "step": 3970 }, { "loss": 1.3287680625915528, "grad_norm": 0.16015098989009857, "learning_rate": 4.355768148450799e-07, "entropy": 1.3458044916391372, "num_tokens": 23098670.0, "mean_token_accuracy": 0.6781758189201355, "epoch": 1.6654807994140421, "step": 3980 }, { "loss": 1.2587160110473632, "grad_norm": 0.13395771384239197, "learning_rate": 4.3333746973228854e-07, "entropy": 1.2841592252254486, "num_tokens": 23157362.0, "mean_token_accuracy": 0.6881028071045876, "epoch": 1.6696662132468347, "step": 3990 }, { "loss": 1.3319854736328125, "grad_norm": 0.13594871759414673, "learning_rate": 4.310994851598522e-07, "entropy": 1.3213010758161545, "num_tokens": 23212648.0, "mean_token_accuracy": 0.6781036898493766, "epoch": 1.6738516270796275, "step": 4000 }, { "eval_loss": 1.3033655881881714, "eval_runtime": 43.5667, "eval_samples_per_second": 146.235, "eval_steps_per_second": 6.106, "eval_entropy": 1.31763897399257, "eval_num_tokens": 23212648.0, "eval_mean_token_accuracy": 0.6829292187117096, "epoch": 1.6738516270796275, "step": 4000 }, { "loss": 1.2960718154907227, "grad_norm": 0.13045227527618408, "learning_rate": 4.288629068036296e-07, "entropy": 1.3475263714790344, "num_tokens": 23274106.0, "mean_token_accuracy": 0.6837118580937386, "epoch": 1.6780370409124203, "step": 4010 }, { "loss": 1.2239046096801758, "grad_norm": 0.1388995498418808, "learning_rate": 4.2662778031077993e-07, "entropy": 1.2441598355770112, "num_tokens": 23333462.0, "mean_token_accuracy": 0.6996882349252701, "epoch": 1.682222454745213, "step": 4020 }, { "loss": 1.356397533416748, "grad_norm": 0.13702833652496338, "learning_rate": 4.243941512988304e-07, "entropy": 1.363625492155552, "num_tokens": 23392153.0, "mean_token_accuracy": 0.6763727009296417, "epoch": 1.6864078685780055, "step": 4030 }, { "loss": 1.2513771057128906, "grad_norm": 0.14271850883960724, "learning_rate": 4.221620653547454e-07, "entropy": 1.2843372076749802, "num_tokens": 23454405.0, "mean_token_accuracy": 0.6873761117458344, "epoch": 1.6905932824107983, "step": 4040 }, { "loss": 1.2633016586303711, "grad_norm": 0.1479983925819397, "learning_rate": 4.199315680339968e-07, "entropy": 1.2902348592877388, "num_tokens": 23515963.0, "mean_token_accuracy": 0.6904997587203979, "epoch": 1.6947786962435911, "step": 4050 }, { "loss": 1.3018023490905761, "grad_norm": 0.12075834721326828, "learning_rate": 4.1770270485963294e-07, "entropy": 1.315699815750122, "num_tokens": 23573373.0, "mean_token_accuracy": 0.6817046746611595, "epoch": 1.6989641100763837, "step": 4060 }, { "loss": 1.3296629905700683, "grad_norm": 0.15214762091636658, "learning_rate": 4.154755213213513e-07, "entropy": 1.339156760275364, "num_tokens": 23630153.0, "mean_token_accuracy": 0.6760370403528213, "epoch": 1.7031495239091765, "step": 4070 }, { "loss": 1.288606834411621, "grad_norm": 0.1338847577571869, "learning_rate": 4.132500628745681e-07, "entropy": 1.308351318538189, "num_tokens": 23689525.0, "mean_token_accuracy": 0.6800839513540268, "epoch": 1.7073349377419693, "step": 4080 }, { "loss": 1.3122243881225586, "grad_norm": 0.13693219423294067, "learning_rate": 4.110263749394918e-07, "entropy": 1.310598623752594, "num_tokens": 23746173.0, "mean_token_accuracy": 0.6841694295406342, "epoch": 1.711520351574762, "step": 4090 }, { "loss": 1.3211769104003905, "grad_norm": 0.12190598249435425, "learning_rate": 4.0880450290019594e-07, "entropy": 1.3578921407461166, "num_tokens": 23804574.0, "mean_token_accuracy": 0.6757835909724236, "epoch": 1.7157057654075547, "step": 4100 }, { "loss": 1.2779497146606444, "grad_norm": 0.14035965502262115, "learning_rate": 4.0658449210369295e-07, "entropy": 1.311075533926487, "num_tokens": 23859817.0, "mean_token_accuracy": 0.6860598146915435, "epoch": 1.7198911792403475, "step": 4110 }, { "loss": 1.289837646484375, "grad_norm": 0.11380521208047867, "learning_rate": 4.0436638785900797e-07, "entropy": 1.3117400839924813, "num_tokens": 23918028.0, "mean_token_accuracy": 0.6838786184787751, "epoch": 1.72407659307314, "step": 4120 }, { "loss": 1.2879505157470703, "grad_norm": 0.15264193713665009, "learning_rate": 4.0215023543625494e-07, "entropy": 1.3319763213396072, "num_tokens": 23977871.0, "mean_token_accuracy": 0.6834924459457398, "epoch": 1.7282620069059327, "step": 4130 }, { "loss": 1.2729723930358887, "grad_norm": 0.13197912275791168, "learning_rate": 3.999360800657121e-07, "entropy": 1.3003861784934998, "num_tokens": 24032724.0, "mean_token_accuracy": 0.6865213885903358, "epoch": 1.7324474207387257, "step": 4140 }, { "loss": 1.2687364578247071, "grad_norm": 0.12328355014324188, "learning_rate": 3.977239669368997e-07, "entropy": 1.2848697736859322, "num_tokens": 24091459.0, "mean_token_accuracy": 0.686721895635128, "epoch": 1.7366328345715183, "step": 4150 }, { "loss": 1.310719871520996, "grad_norm": 0.13127021491527557, "learning_rate": 3.955139411976564e-07, "entropy": 1.3004455357789992, "num_tokens": 24145064.0, "mean_token_accuracy": 0.6823625862598419, "epoch": 1.7408182484043109, "step": 4160 }, { "loss": 1.277029323577881, "grad_norm": 0.13161002099514008, "learning_rate": 3.9330604795321877e-07, "entropy": 1.2868661388754845, "num_tokens": 24202651.0, "mean_token_accuracy": 0.6845416814088822, "epoch": 1.7450036622371037, "step": 4170 }, { "loss": 1.2249256134033204, "grad_norm": 0.1313517987728119, "learning_rate": 3.911003322653009e-07, "entropy": 1.2720478802919388, "num_tokens": 24259573.0, "mean_token_accuracy": 0.6942620486021042, "epoch": 1.7491890760698965, "step": 4180 }, { "loss": 1.3295866966247558, "grad_norm": 0.1546325832605362, "learning_rate": 3.888968391511738e-07, "entropy": 1.32426298558712, "num_tokens": 24322196.0, "mean_token_accuracy": 0.6730754569172859, "epoch": 1.753374489902689, "step": 4190 }, { "loss": 1.3283195495605469, "grad_norm": 0.13663379848003387, "learning_rate": 3.866956135827475e-07, "entropy": 1.3125308185815812, "num_tokens": 24376182.0, "mean_token_accuracy": 0.6829302325844765, "epoch": 1.7575599037354819, "step": 4200 }, { "eval_loss": 1.3021423816680908, "eval_runtime": 43.7163, "eval_samples_per_second": 145.735, "eval_steps_per_second": 6.085, "eval_entropy": 1.3239203189548694, "eval_num_tokens": 24376182.0, "eval_mean_token_accuracy": 0.6831337011846385, "epoch": 1.7575599037354819, "step": 4200 }, { "loss": 1.2734957695007325, "grad_norm": 0.13826783001422882, "learning_rate": 3.844967004856526e-07, "entropy": 1.3006668120622635, "num_tokens": 24433979.0, "mean_token_accuracy": 0.6882383152842522, "epoch": 1.7617453175682747, "step": 4210 }, { "loss": 1.298065757751465, "grad_norm": 0.11341753602027893, "learning_rate": 3.8230014473832386e-07, "entropy": 1.3199127793312073, "num_tokens": 24496717.0, "mean_token_accuracy": 0.6763370648026467, "epoch": 1.7659307314010673, "step": 4220 }, { "loss": 1.2942770004272461, "grad_norm": 0.11990880221128464, "learning_rate": 3.801059911710835e-07, "entropy": 1.3037174761295318, "num_tokens": 24556339.0, "mean_token_accuracy": 0.6810034438967705, "epoch": 1.7701161452338599, "step": 4230 }, { "loss": 1.3638200759887695, "grad_norm": 0.12909641861915588, "learning_rate": 3.779142845652275e-07, "entropy": 1.37214894592762, "num_tokens": 24610844.0, "mean_token_accuracy": 0.6698960587382317, "epoch": 1.7743015590666529, "step": 4240 }, { "loss": 1.4049521446228028, "grad_norm": 0.137081578373909, "learning_rate": 3.757250696521104e-07, "entropy": 1.3875975281000137, "num_tokens": 24663935.0, "mean_token_accuracy": 0.6685925871133804, "epoch": 1.7784869728994455, "step": 4250 }, { "loss": 1.2346957206726075, "grad_norm": 0.15599705278873444, "learning_rate": 3.7353839111223285e-07, "entropy": 1.2952020585536956, "num_tokens": 24724653.0, "mean_token_accuracy": 0.6917664587497712, "epoch": 1.782672386732238, "step": 4260 }, { "loss": 1.3122922897338867, "grad_norm": 0.14105546474456787, "learning_rate": 3.713542935743299e-07, "entropy": 1.3242159157991409, "num_tokens": 24783350.0, "mean_token_accuracy": 0.6838966220617294, "epoch": 1.7868578005650309, "step": 4270 }, { "loss": 1.2724437713623047, "grad_norm": 0.14495964348316193, "learning_rate": 3.6917282161445986e-07, "entropy": 1.2849380433559419, "num_tokens": 24840720.0, "mean_token_accuracy": 0.6882339790463448, "epoch": 1.7910432143978237, "step": 4280 }, { "loss": 1.2272959709167481, "grad_norm": 0.1268715113401413, "learning_rate": 3.66994019755095e-07, "entropy": 1.2522281989455224, "num_tokens": 24900268.0, "mean_token_accuracy": 0.6954008027911186, "epoch": 1.7952286282306162, "step": 4290 }, { "loss": 1.2925883293151856, "grad_norm": 0.12049921602010727, "learning_rate": 3.648179324642119e-07, "entropy": 1.3150138720870017, "num_tokens": 24955875.0, "mean_token_accuracy": 0.6815980896353722, "epoch": 1.799414042063409, "step": 4300 }, { "loss": 1.2687823295593261, "grad_norm": 0.1410578191280365, "learning_rate": 3.62644604154385e-07, "entropy": 1.292095237970352, "num_tokens": 25015757.0, "mean_token_accuracy": 0.6861520081758499, "epoch": 1.8035994558962019, "step": 4310 }, { "loss": 1.3122770309448242, "grad_norm": 0.1278466135263443, "learning_rate": 3.6047407918187923e-07, "entropy": 1.32326979637146, "num_tokens": 25073319.0, "mean_token_accuracy": 0.6822131305932999, "epoch": 1.8077848697289944, "step": 4320 }, { "loss": 1.239914321899414, "grad_norm": 0.1450994610786438, "learning_rate": 3.5830640184574567e-07, "entropy": 1.2679915323853492, "num_tokens": 25132470.0, "mean_token_accuracy": 0.6903218165040016, "epoch": 1.811970283561787, "step": 4330 }, { "loss": 1.3607330322265625, "grad_norm": 0.14295367896556854, "learning_rate": 3.5614161638691655e-07, "entropy": 1.361120554804802, "num_tokens": 25185195.0, "mean_token_accuracy": 0.6752493545413017, "epoch": 1.81615569739458, "step": 4340 }, { "loss": 1.2877880096435548, "grad_norm": 0.1336987167596817, "learning_rate": 3.539797669873029e-07, "entropy": 1.294544619321823, "num_tokens": 25241604.0, "mean_token_accuracy": 0.6817367270588874, "epoch": 1.8203411112273726, "step": 4350 }, { "loss": 1.2616640090942384, "grad_norm": 0.12443723529577255, "learning_rate": 3.518208977688924e-07, "entropy": 1.3023397505283356, "num_tokens": 25301515.0, "mean_token_accuracy": 0.6868541851639748, "epoch": 1.8245265250601652, "step": 4360 }, { "loss": 1.2237573623657227, "grad_norm": 0.14553169906139374, "learning_rate": 3.496650527928495e-07, "entropy": 1.2511302560567856, "num_tokens": 25357723.0, "mean_token_accuracy": 0.6974032506346702, "epoch": 1.828711938892958, "step": 4370 }, { "loss": 1.3280474662780761, "grad_norm": 0.12313038110733032, "learning_rate": 3.4751227605861544e-07, "entropy": 1.3370114535093307, "num_tokens": 25417249.0, "mean_token_accuracy": 0.6781404823064804, "epoch": 1.8328973527257508, "step": 4380 }, { "loss": 1.3116769790649414, "grad_norm": 0.12443029880523682, "learning_rate": 3.453626115030103e-07, "entropy": 1.323847246170044, "num_tokens": 25476722.0, "mean_token_accuracy": 0.6824665144085884, "epoch": 1.8370827665585434, "step": 4390 }, { "loss": 1.2541227340698242, "grad_norm": 0.14306563138961792, "learning_rate": 3.4321610299933754e-07, "entropy": 1.275883974134922, "num_tokens": 25536071.0, "mean_token_accuracy": 0.6896202132105828, "epoch": 1.8412681803913362, "step": 4400 }, { "eval_loss": 1.300899624824524, "eval_runtime": 43.6667, "eval_samples_per_second": 145.901, "eval_steps_per_second": 6.092, "eval_entropy": 1.31194052436298, "eval_num_tokens": 25536071.0, "eval_mean_token_accuracy": 0.6834116909736977, "epoch": 1.8412681803913362, "step": 4400 }, { "loss": 1.3052658081054687, "grad_norm": 0.12484145909547806, "learning_rate": 3.410727943564865e-07, "entropy": 1.304879105091095, "num_tokens": 25592326.0, "mean_token_accuracy": 0.6803866416215897, "epoch": 1.845453594224129, "step": 4410 }, { "loss": 1.2852392196655273, "grad_norm": 0.1245272308588028, "learning_rate": 3.3893272931804004e-07, "entropy": 1.2998150080442428, "num_tokens": 25650560.0, "mean_token_accuracy": 0.6859546720981597, "epoch": 1.8496390080569216, "step": 4420 }, { "loss": 1.3093093872070312, "grad_norm": 0.13694874942302704, "learning_rate": 3.367959515613809e-07, "entropy": 1.326773339509964, "num_tokens": 25710390.0, "mean_token_accuracy": 0.6779543459415436, "epoch": 1.8538244218897142, "step": 4430 }, { "loss": 1.3431434631347656, "grad_norm": 0.13425195217132568, "learning_rate": 3.346625046968003e-07, "entropy": 1.3320137143135071, "num_tokens": 25765683.0, "mean_token_accuracy": 0.6735352456569672, "epoch": 1.8580098357225072, "step": 4440 }, { "loss": 1.2724491119384767, "grad_norm": 0.13366232812404633, "learning_rate": 3.325324322666081e-07, "entropy": 1.27188421189785, "num_tokens": 25824731.0, "mean_token_accuracy": 0.689846420288086, "epoch": 1.8621952495552998, "step": 4450 }, { "loss": 1.2903520584106445, "grad_norm": 0.12611544132232666, "learning_rate": 3.3040577774424437e-07, "entropy": 1.3168232500553132, "num_tokens": 25885073.0, "mean_token_accuracy": 0.6854806423187256, "epoch": 1.8663806633880924, "step": 4460 }, { "loss": 1.3013708114624023, "grad_norm": 0.14317074418067932, "learning_rate": 3.2828258453339155e-07, "entropy": 1.3177704036235809, "num_tokens": 25942626.0, "mean_token_accuracy": 0.6822627365589142, "epoch": 1.8705660772208852, "step": 4470 }, { "loss": 1.332556915283203, "grad_norm": 0.14360307157039642, "learning_rate": 3.261628959670889e-07, "entropy": 1.3369245409965516, "num_tokens": 25997260.0, "mean_token_accuracy": 0.6774563640356064, "epoch": 1.874751491053678, "step": 4480 }, { "loss": 1.2616004943847656, "grad_norm": 0.13137495517730713, "learning_rate": 3.240467553068481e-07, "entropy": 1.2717559725046157, "num_tokens": 26055446.0, "mean_token_accuracy": 0.6905860707163811, "epoch": 1.8789369048864706, "step": 4490 }, { "loss": 1.408462142944336, "grad_norm": 0.13120928406715393, "learning_rate": 3.2193420574177034e-07, "entropy": 1.3706548005342483, "num_tokens": 26111925.0, "mean_token_accuracy": 0.6645766496658325, "epoch": 1.8831223187192634, "step": 4500 }, { "loss": 1.2302813529968262, "grad_norm": 0.15121367573738098, "learning_rate": 3.1982529038766505e-07, "entropy": 1.274702313542366, "num_tokens": 26171418.0, "mean_token_accuracy": 0.6942442029714584, "epoch": 1.8873077325520562, "step": 4510 }, { "loss": 1.2387846946716308, "grad_norm": 0.11012545973062515, "learning_rate": 3.1772005228616933e-07, "entropy": 1.2893740877509117, "num_tokens": 26232638.0, "mean_token_accuracy": 0.6922576785087585, "epoch": 1.8914931463848488, "step": 4520 }, { "loss": 1.292118453979492, "grad_norm": 0.14446091651916504, "learning_rate": 3.156185344038699e-07, "entropy": 1.3311437577009202, "num_tokens": 26293065.0, "mean_token_accuracy": 0.6810448184609413, "epoch": 1.8956785602176414, "step": 4530 }, { "loss": 1.33145170211792, "grad_norm": 0.14474819600582123, "learning_rate": 3.135207796314263e-07, "entropy": 1.3151442527770996, "num_tokens": 26349311.0, "mean_token_accuracy": 0.6806560069322586, "epoch": 1.8998639740504344, "step": 4540 }, { "loss": 1.2638792037963866, "grad_norm": 0.11845609545707703, "learning_rate": 3.114268307826953e-07, "entropy": 1.2752373963594437, "num_tokens": 26407067.0, "mean_token_accuracy": 0.6892140090465546, "epoch": 1.904049387883227, "step": 4550 }, { "loss": 1.341792106628418, "grad_norm": 0.1555166393518448, "learning_rate": 3.093367305938572e-07, "entropy": 1.3313662111759186, "num_tokens": 26463271.0, "mean_token_accuracy": 0.6772884294390679, "epoch": 1.9082348017160196, "step": 4560 }, { "loss": 1.2394842147827148, "grad_norm": 0.13164710998535156, "learning_rate": 3.072505217225435e-07, "entropy": 1.2927237793803215, "num_tokens": 26519442.0, "mean_token_accuracy": 0.688689187169075, "epoch": 1.9124202155488124, "step": 4570 }, { "loss": 1.2526350021362305, "grad_norm": 0.12433302402496338, "learning_rate": 3.051682467469663e-07, "entropy": 1.3005468085408212, "num_tokens": 26576793.0, "mean_token_accuracy": 0.6895026102662086, "epoch": 1.9166056293816052, "step": 4580 }, { "loss": 1.300935935974121, "grad_norm": 0.14517027139663696, "learning_rate": 3.030899481650496e-07, "entropy": 1.3120550215244293, "num_tokens": 26632676.0, "mean_token_accuracy": 0.6833431273698807, "epoch": 1.9207910432143978, "step": 4590 }, { "loss": 1.305576705932617, "grad_norm": 0.13038092851638794, "learning_rate": 3.010156683935614e-07, "entropy": 1.3124109566211701, "num_tokens": 26690585.0, "mean_token_accuracy": 0.6795723259449005, "epoch": 1.9249764570471906, "step": 4600 }, { "eval_loss": 1.299921989440918, "eval_runtime": 43.8297, "eval_samples_per_second": 145.358, "eval_steps_per_second": 6.069, "eval_entropy": 1.3205744122204028, "eval_num_tokens": 26690585.0, "eval_mean_token_accuracy": 0.6834659193243299, "epoch": 1.9249764570471906, "step": 4600 }, { "loss": 1.2759065628051758, "grad_norm": 0.13065700232982635, "learning_rate": 2.9894544976724845e-07, "entropy": 1.3232569113373756, "num_tokens": 26750126.0, "mean_token_accuracy": 0.6856075286865234, "epoch": 1.9291618708799834, "step": 4610 }, { "loss": 1.3960214614868165, "grad_norm": 0.13134630024433136, "learning_rate": 2.968793345379722e-07, "entropy": 1.3706552177667617, "num_tokens": 26809589.0, "mean_token_accuracy": 0.6644584119319916, "epoch": 1.933347284712776, "step": 4620 }, { "loss": 1.2981806755065919, "grad_norm": 0.13195905089378357, "learning_rate": 2.9481736487384615e-07, "entropy": 1.2926361411809921, "num_tokens": 26868122.0, "mean_token_accuracy": 0.6837931454181672, "epoch": 1.9375326985455685, "step": 4630 }, { "loss": 1.3205986022949219, "grad_norm": 0.1443631947040558, "learning_rate": 2.9275958285837567e-07, "entropy": 1.3107433021068573, "num_tokens": 26928383.0, "mean_token_accuracy": 0.6803469866514206, "epoch": 1.9417181123783616, "step": 4640 }, { "loss": 1.2733318328857421, "grad_norm": 0.12333279103040695, "learning_rate": 2.907060304895984e-07, "entropy": 1.306384412944317, "num_tokens": 26987347.0, "mean_token_accuracy": 0.6883521243929863, "epoch": 1.9459035262111541, "step": 4650 }, { "loss": 1.2352895736694336, "grad_norm": 0.15071088075637817, "learning_rate": 2.8865674967922815e-07, "entropy": 1.2537823468446732, "num_tokens": 27044802.0, "mean_token_accuracy": 0.6937527641654014, "epoch": 1.9500889400439467, "step": 4660 }, { "loss": 1.2543825149536132, "grad_norm": 0.11400660872459412, "learning_rate": 2.866117822517982e-07, "entropy": 1.2866296932101249, "num_tokens": 27102535.0, "mean_token_accuracy": 0.6873494073748588, "epoch": 1.9542743538767395, "step": 4670 }, { "loss": 1.3186541557312013, "grad_norm": 0.13829229772090912, "learning_rate": 2.8457116994380913e-07, "entropy": 1.331754493713379, "num_tokens": 27160092.0, "mean_token_accuracy": 0.6785844698548317, "epoch": 1.9584597677095323, "step": 4680 }, { "loss": 1.305195140838623, "grad_norm": 0.13482722640037537, "learning_rate": 2.8253495440287555e-07, "entropy": 1.3345891624689101, "num_tokens": 27216273.0, "mean_token_accuracy": 0.6796174451708794, "epoch": 1.962645181542325, "step": 4690 }, { "loss": 1.2968315124511718, "grad_norm": 0.13567803800106049, "learning_rate": 2.805031771868774e-07, "entropy": 1.3210385277867318, "num_tokens": 27274967.0, "mean_token_accuracy": 0.6838418498635292, "epoch": 1.9668305953751177, "step": 4700 }, { "loss": 1.2280590057373046, "grad_norm": 0.11562594771385193, "learning_rate": 2.784758797631113e-07, "entropy": 1.2723073571920396, "num_tokens": 27332706.0, "mean_token_accuracy": 0.693420697748661, "epoch": 1.9710160092079105, "step": 4710 }, { "loss": 1.2968725204467773, "grad_norm": 0.1304372102022171, "learning_rate": 2.7645310350744293e-07, "entropy": 1.3245373040437698, "num_tokens": 27391429.0, "mean_token_accuracy": 0.685273765027523, "epoch": 1.9752014230407031, "step": 4720 }, { "loss": 1.2400999069213867, "grad_norm": 0.12765555083751678, "learning_rate": 2.744348897034657e-07, "entropy": 1.2704340279102326, "num_tokens": 27449195.0, "mean_token_accuracy": 0.6926597207784653, "epoch": 1.9793868368734957, "step": 4730 }, { "loss": 1.313642406463623, "grad_norm": 0.12744539976119995, "learning_rate": 2.724212795416544e-07, "entropy": 1.323761799931526, "num_tokens": 27507409.0, "mean_token_accuracy": 0.6781192749738694, "epoch": 1.9835722507062887, "step": 4740 }, { "loss": 1.3404882431030274, "grad_norm": 0.1226453185081482, "learning_rate": 2.704123141185275e-07, "entropy": 1.3297797441482544, "num_tokens": 27562728.0, "mean_token_accuracy": 0.6767387732863426, "epoch": 1.9877576645390813, "step": 4750 }, { "loss": 1.3537111282348633, "grad_norm": 0.1401350498199463, "learning_rate": 2.6840803443580715e-07, "entropy": 1.3468406647443771, "num_tokens": 27622323.0, "mean_token_accuracy": 0.6730136394500732, "epoch": 1.991943078371874, "step": 4760 }, { "loss": 1.337536907196045, "grad_norm": 0.13678760826587677, "learning_rate": 2.664084813995818e-07, "entropy": 1.3439167469739914, "num_tokens": 27679189.0, "mean_token_accuracy": 0.676570326089859, "epoch": 1.9961284922046667, "step": 4770 }, { "loss": 1.2568793296813965, "grad_norm": 0.37100762128829956, "learning_rate": 2.644136958194727e-07, "entropy": 1.2735960676863387, "num_tokens": 27730417.0, "mean_token_accuracy": 0.6950372699144725, "epoch": 2.0, "step": 4780 }, { "loss": 1.2392065048217773, "grad_norm": 0.12272343039512634, "learning_rate": 2.624237184078004e-07, "entropy": 1.2709258124232292, "num_tokens": 27790663.0, "mean_token_accuracy": 0.6911343216896058, "epoch": 2.0041854138327926, "step": 4790 }, { "loss": 1.2543585777282715, "grad_norm": 0.11581531912088394, "learning_rate": 2.6043858977875287e-07, "entropy": 1.3081357836723329, "num_tokens": 27848150.0, "mean_token_accuracy": 0.6878180950880051, "epoch": 2.0083708276655856, "step": 4800 }, { "eval_loss": 1.2991561889648438, "eval_runtime": 42.5158, "eval_samples_per_second": 149.85, "eval_steps_per_second": 6.256, "eval_entropy": 1.2998529121391755, "eval_num_tokens": 27848150.0, "eval_mean_token_accuracy": 0.683588629378412, "epoch": 2.0083708276655856, "step": 4800 }, { "loss": 1.26077880859375, "grad_norm": 0.1441148817539215, "learning_rate": 2.584583504475587e-07, "entropy": 1.2779432222247125, "num_tokens": 27905176.0, "mean_token_accuracy": 0.6864334151148797, "epoch": 2.012556241498378, "step": 4810 }, { "loss": 1.2669543266296386, "grad_norm": 0.1253192126750946, "learning_rate": 2.5648304082965775e-07, "entropy": 1.2866142064332962, "num_tokens": 27963167.0, "mean_token_accuracy": 0.6874963492155075, "epoch": 2.016741655331171, "step": 4820 }, { "loss": 1.3348177909851073, "grad_norm": 0.14420834183692932, "learning_rate": 2.5451270123987843e-07, "entropy": 1.3393577009439468, "num_tokens": 28020052.0, "mean_token_accuracy": 0.6757590815424919, "epoch": 2.020927069163964, "step": 4830 }, { "loss": 1.2622191429138183, "grad_norm": 0.13085411489009857, "learning_rate": 2.5254737189161373e-07, "entropy": 1.3007038220763207, "num_tokens": 28078981.0, "mean_token_accuracy": 0.6892907366156578, "epoch": 2.0251124829967564, "step": 4840 }, { "loss": 1.3640257835388183, "grad_norm": 0.15776140987873077, "learning_rate": 2.5058709289600067e-07, "entropy": 1.3638720154762267, "num_tokens": 28134625.0, "mean_token_accuracy": 0.6710668623447418, "epoch": 2.029297896829549, "step": 4850 }, { "loss": 1.2844989776611329, "grad_norm": 0.11809894442558289, "learning_rate": 2.486319042611019e-07, "entropy": 1.2931891083717346, "num_tokens": 28194798.0, "mean_token_accuracy": 0.6872560605406761, "epoch": 2.0334833106623416, "step": 4860 }, { "loss": 1.3038467407226562, "grad_norm": 0.1337338089942932, "learning_rate": 2.4668184589108867e-07, "entropy": 1.33267682492733, "num_tokens": 28252833.0, "mean_token_accuracy": 0.6824387982487679, "epoch": 2.0376687244951346, "step": 4870 }, { "loss": 1.3354660034179688, "grad_norm": 0.15302719175815582, "learning_rate": 2.4473695758542707e-07, "entropy": 1.343110579252243, "num_tokens": 28312031.0, "mean_token_accuracy": 0.6740260154008866, "epoch": 2.041854138327927, "step": 4880 }, { "loss": 1.2677290916442872, "grad_norm": 0.13939176499843597, "learning_rate": 2.4279727903806556e-07, "entropy": 1.2891633421182633, "num_tokens": 28370177.0, "mean_token_accuracy": 0.6864463344216347, "epoch": 2.0460395521607198, "step": 4890 }, { "loss": 1.2482310295104981, "grad_norm": 0.1433423012495041, "learning_rate": 2.408628498366242e-07, "entropy": 1.2710548743605614, "num_tokens": 28429335.0, "mean_token_accuracy": 0.6896666899323464, "epoch": 2.050224965993513, "step": 4900 }, { "loss": 1.2301183700561524, "grad_norm": 0.12237236648797989, "learning_rate": 2.389337094615875e-07, "entropy": 1.278349894285202, "num_tokens": 28489915.0, "mean_token_accuracy": 0.6930847212672233, "epoch": 2.0544103798263054, "step": 4910 }, { "loss": 1.2912443161010743, "grad_norm": 0.17072777450084686, "learning_rate": 2.370098972854987e-07, "entropy": 1.277844424545765, "num_tokens": 28547300.0, "mean_token_accuracy": 0.68348438590765, "epoch": 2.058595793659098, "step": 4920 }, { "loss": 1.2300211906433105, "grad_norm": 0.12249208986759186, "learning_rate": 2.3509145257215495e-07, "entropy": 1.2522578805685043, "num_tokens": 28607326.0, "mean_token_accuracy": 0.6944442689418793, "epoch": 2.0627812074918905, "step": 4930 }, { "loss": 1.2790916442871094, "grad_norm": 0.22464126348495483, "learning_rate": 2.3317841447580767e-07, "entropy": 1.2809948831796647, "num_tokens": 28661604.0, "mean_token_accuracy": 0.6859934300184249, "epoch": 2.0669666213246836, "step": 4940 }, { "loss": 1.2724569320678711, "grad_norm": 0.13732433319091797, "learning_rate": 2.312708220403623e-07, "entropy": 1.31765376329422, "num_tokens": 28718818.0, "mean_token_accuracy": 0.6880935072898865, "epoch": 2.071152035157476, "step": 4950 }, { "loss": 1.3442106246948242, "grad_norm": 0.12912045419216156, "learning_rate": 2.2936871419858194e-07, "entropy": 1.3523348033428193, "num_tokens": 28775584.0, "mean_token_accuracy": 0.6736010074615478, "epoch": 2.0753374489902687, "step": 4960 }, { "loss": 1.1986734390258789, "grad_norm": 0.13469566404819489, "learning_rate": 2.2747212977129217e-07, "entropy": 1.2723553344607352, "num_tokens": 28835156.0, "mean_token_accuracy": 0.6995670750737191, "epoch": 2.0795228628230618, "step": 4970 }, { "loss": 1.3475229263305664, "grad_norm": 0.1290500909090042, "learning_rate": 2.2558110746658953e-07, "entropy": 1.3560008838772775, "num_tokens": 28895271.0, "mean_token_accuracy": 0.6725587636232376, "epoch": 2.0837082766558543, "step": 4980 }, { "loss": 1.342457389831543, "grad_norm": 0.1255948692560196, "learning_rate": 2.236956858790513e-07, "entropy": 1.3365329071879386, "num_tokens": 28953018.0, "mean_token_accuracy": 0.679172757267952, "epoch": 2.087893690488647, "step": 4990 }, { "loss": 1.2630849838256837, "grad_norm": 0.13795071840286255, "learning_rate": 2.218159034889469e-07, "entropy": 1.2892632216215134, "num_tokens": 29012433.0, "mean_token_accuracy": 0.6912027075886726, "epoch": 2.09207910432144, "step": 5000 }, { "eval_loss": 1.298377513885498, "eval_runtime": 43.7919, "eval_samples_per_second": 145.484, "eval_steps_per_second": 6.074, "eval_entropy": 1.3099012760291422, "eval_num_tokens": 29012433.0, "eval_mean_token_accuracy": 0.6836510939257485, "epoch": 2.09207910432144, "step": 5000 }, { "loss": 1.2960596084594727, "grad_norm": 0.11307420581579208, "learning_rate": 2.1994179866145396e-07, "entropy": 1.3118484735488891, "num_tokens": 29070217.0, "mean_token_accuracy": 0.6797604545950889, "epoch": 2.0962645181542325, "step": 5010 }, { "loss": 1.3223162651062013, "grad_norm": 0.15304112434387207, "learning_rate": 2.180734096458746e-07, "entropy": 1.3404868721961976, "num_tokens": 29126476.0, "mean_token_accuracy": 0.6759276837110519, "epoch": 2.100449931987025, "step": 5020 }, { "loss": 1.3450361251831056, "grad_norm": 0.11615368723869324, "learning_rate": 2.1621077457485427e-07, "entropy": 1.3462235242128373, "num_tokens": 29184125.0, "mean_token_accuracy": 0.6701866090297699, "epoch": 2.104635345819818, "step": 5030 }, { "loss": 1.3292051315307618, "grad_norm": 0.1241302341222763, "learning_rate": 2.1435393146360453e-07, "entropy": 1.3317017763853074, "num_tokens": 29243309.0, "mean_token_accuracy": 0.6787121832370758, "epoch": 2.1088207596526107, "step": 5040 }, { "loss": 1.309870719909668, "grad_norm": 0.12809441983699799, "learning_rate": 2.1250291820912648e-07, "entropy": 1.3308863699436189, "num_tokens": 29302274.0, "mean_token_accuracy": 0.6813490375876426, "epoch": 2.1130061734854033, "step": 5050 }, { "loss": 1.259312343597412, "grad_norm": 0.11709679663181305, "learning_rate": 2.1065777258943763e-07, "entropy": 1.2945900693535806, "num_tokens": 29359001.0, "mean_token_accuracy": 0.6841064542531967, "epoch": 2.117191587318196, "step": 5060 }, { "loss": 1.1917829513549805, "grad_norm": 0.13013018667697906, "learning_rate": 2.0881853226280082e-07, "entropy": 1.252656841278076, "num_tokens": 29417257.0, "mean_token_accuracy": 0.7048160001635552, "epoch": 2.121377001150989, "step": 5070 }, { "loss": 1.2949867248535156, "grad_norm": 0.15123531222343445, "learning_rate": 2.0698523476695506e-07, "entropy": 1.316368493437767, "num_tokens": 29474012.0, "mean_token_accuracy": 0.6840205147862435, "epoch": 2.1255624149837815, "step": 5080 }, { "loss": 1.231495475769043, "grad_norm": 0.13549339771270752, "learning_rate": 2.0515791751835066e-07, "entropy": 1.261933021247387, "num_tokens": 29535364.0, "mean_token_accuracy": 0.6923261538147927, "epoch": 2.129747828816574, "step": 5090 }, { "loss": 1.265492820739746, "grad_norm": 0.12323841452598572, "learning_rate": 2.0333661781138406e-07, "entropy": 1.2891878262162209, "num_tokens": 29594045.0, "mean_token_accuracy": 0.6874890491366387, "epoch": 2.133933242649367, "step": 5100 }, { "loss": 1.3455522537231446, "grad_norm": 0.12925904989242554, "learning_rate": 2.015213728176381e-07, "entropy": 1.355113722383976, "num_tokens": 29654672.0, "mean_token_accuracy": 0.6736163109540939, "epoch": 2.1381186564821597, "step": 5110 }, { "loss": 1.2876879692077636, "grad_norm": 0.10625462979078293, "learning_rate": 1.9971221958512259e-07, "entropy": 1.308001670241356, "num_tokens": 29713404.0, "mean_token_accuracy": 0.6850254252552986, "epoch": 2.1423040703149523, "step": 5120 }, { "loss": 1.269423484802246, "grad_norm": 0.14946334064006805, "learning_rate": 1.9790919503751786e-07, "entropy": 1.2912926644086837, "num_tokens": 29768834.0, "mean_token_accuracy": 0.6910573810338974, "epoch": 2.146489484147745, "step": 5130 }, { "loss": 1.3150415420532227, "grad_norm": 0.15966582298278809, "learning_rate": 1.961123359734222e-07, "entropy": 1.3350969046354293, "num_tokens": 29823986.0, "mean_token_accuracy": 0.6827343329787254, "epoch": 2.150674897980538, "step": 5140 }, { "loss": 1.2534076690673828, "grad_norm": 0.13799019157886505, "learning_rate": 1.9432167906560025e-07, "entropy": 1.2794459909200668, "num_tokens": 29882161.0, "mean_token_accuracy": 0.6894301295280456, "epoch": 2.1548603118133305, "step": 5150 }, { "loss": 1.226758861541748, "grad_norm": 0.16427931189537048, "learning_rate": 1.9253726086023376e-07, "entropy": 1.2521668612957, "num_tokens": 29938237.0, "mean_token_accuracy": 0.6923803791403771, "epoch": 2.159045725646123, "step": 5160 }, { "loss": 1.2537633895874023, "grad_norm": 0.13021980226039886, "learning_rate": 1.9075911777617776e-07, "entropy": 1.2832251608371734, "num_tokens": 29993951.0, "mean_token_accuracy": 0.6919069468975068, "epoch": 2.163231139478916, "step": 5170 }, { "loss": 1.2582441329956056, "grad_norm": 0.13316968083381653, "learning_rate": 1.8898728610421473e-07, "entropy": 1.2960840493440628, "num_tokens": 30053405.0, "mean_token_accuracy": 0.6867008566856384, "epoch": 2.1674165533117087, "step": 5180 }, { "loss": 1.2535063743591308, "grad_norm": 0.1502976417541504, "learning_rate": 1.8722180200631598e-07, "entropy": 1.291701939702034, "num_tokens": 30111434.0, "mean_token_accuracy": 0.6882436692714691, "epoch": 2.1716019671445013, "step": 5190 }, { "loss": 1.2767062187194824, "grad_norm": 0.1319260597229004, "learning_rate": 1.8546270151490278e-07, "entropy": 1.298856572806835, "num_tokens": 30168307.0, "mean_token_accuracy": 0.68586795181036, "epoch": 2.1757873809772943, "step": 5200 }, { "eval_loss": 1.297808051109314, "eval_runtime": 43.6552, "eval_samples_per_second": 145.939, "eval_steps_per_second": 6.093, "eval_entropy": 1.307837866750875, "eval_num_tokens": 30168307.0, "eval_mean_token_accuracy": 0.6838091374339914, "epoch": 2.1757873809772943, "step": 5200 }, { "loss": 1.2791316032409668, "grad_norm": 0.12315330654382706, "learning_rate": 1.8371002053211048e-07, "entropy": 1.3057184204459191, "num_tokens": 30225681.0, "mean_token_accuracy": 0.6861935615539551, "epoch": 2.179972794810087, "step": 5210 }, { "loss": 1.32224760055542, "grad_norm": 0.13483846187591553, "learning_rate": 1.819637948290569e-07, "entropy": 1.3247323662042618, "num_tokens": 30283602.0, "mean_token_accuracy": 0.677856071293354, "epoch": 2.1841582086428795, "step": 5220 }, { "loss": 1.245813751220703, "grad_norm": 0.12423646450042725, "learning_rate": 1.8022406004511114e-07, "entropy": 1.2820057839155197, "num_tokens": 30343652.0, "mean_token_accuracy": 0.6916850328445434, "epoch": 2.1883436224756725, "step": 5230 }, { "loss": 1.313099193572998, "grad_norm": 0.1301707625389099, "learning_rate": 1.7849085168716704e-07, "entropy": 1.3053890287876129, "num_tokens": 30400983.0, "mean_token_accuracy": 0.6804193690419197, "epoch": 2.192529036308465, "step": 5240 }, { "loss": 1.2556833267211913, "grad_norm": 0.1505342423915863, "learning_rate": 1.7676420512891842e-07, "entropy": 1.2873410269618035, "num_tokens": 30459009.0, "mean_token_accuracy": 0.6887684732675552, "epoch": 2.1967144501412577, "step": 5250 }, { "loss": 1.2559351921081543, "grad_norm": 0.13919785618782043, "learning_rate": 1.7504415561013614e-07, "entropy": 1.2811901897192002, "num_tokens": 30516861.0, "mean_token_accuracy": 0.6915321722626686, "epoch": 2.2008998639740502, "step": 5260 }, { "loss": 1.2761926651000977, "grad_norm": 0.12455730140209198, "learning_rate": 1.7333073823595025e-07, "entropy": 1.2844579115509986, "num_tokens": 30575979.0, "mean_token_accuracy": 0.6861526161432266, "epoch": 2.2050852778068433, "step": 5270 }, { "loss": 1.2840014457702638, "grad_norm": 0.13087549805641174, "learning_rate": 1.7162398797613282e-07, "entropy": 1.2940828785300256, "num_tokens": 30633600.0, "mean_token_accuracy": 0.685159420967102, "epoch": 2.209270691639636, "step": 5280 }, { "loss": 1.3325956344604493, "grad_norm": 0.15323391556739807, "learning_rate": 1.6992393966438405e-07, "entropy": 1.3237911939620972, "num_tokens": 30693015.0, "mean_token_accuracy": 0.6795177638530732, "epoch": 2.2134561054724284, "step": 5290 }, { "loss": 1.310387420654297, "grad_norm": 0.12490073591470718, "learning_rate": 1.6823062799762205e-07, "entropy": 1.3257877498865127, "num_tokens": 30749233.0, "mean_token_accuracy": 0.6818015187978744, "epoch": 2.2176415193052215, "step": 5300 }, { "loss": 1.2663789749145509, "grad_norm": 0.1327386498451233, "learning_rate": 1.6654408753527361e-07, "entropy": 1.3193859189748764, "num_tokens": 30809674.0, "mean_token_accuracy": 0.6879936501383781, "epoch": 2.221826933138014, "step": 5310 }, { "loss": 1.3301811218261719, "grad_norm": 0.14070047438144684, "learning_rate": 1.6486435269856985e-07, "entropy": 1.3461501210927964, "num_tokens": 30867279.0, "mean_token_accuracy": 0.6762196362018585, "epoch": 2.2260123469708066, "step": 5320 }, { "loss": 1.2323862075805665, "grad_norm": 0.14718832075595856, "learning_rate": 1.6319145776984361e-07, "entropy": 1.2663889586925507, "num_tokens": 30923604.0, "mean_token_accuracy": 0.6963629499077797, "epoch": 2.2301977608035997, "step": 5330 }, { "loss": 1.4061556816101075, "grad_norm": 0.11397302895784378, "learning_rate": 1.6152543689182885e-07, "entropy": 1.3801796600222587, "num_tokens": 30983941.0, "mean_token_accuracy": 0.6657746851444244, "epoch": 2.2343831746363922, "step": 5340 }, { "loss": 1.3607137680053711, "grad_norm": 0.13541868329048157, "learning_rate": 1.5986632406696515e-07, "entropy": 1.3243082225322724, "num_tokens": 31042120.0, "mean_token_accuracy": 0.6700464963912964, "epoch": 2.238568588469185, "step": 5350 }, { "loss": 1.2886553764343263, "grad_norm": 0.1389724761247635, "learning_rate": 1.5821415315670251e-07, "entropy": 1.3397713720798492, "num_tokens": 31102163.0, "mean_token_accuracy": 0.6880467623472214, "epoch": 2.2427540023019774, "step": 5360 }, { "loss": 1.220026397705078, "grad_norm": 0.1286465972661972, "learning_rate": 1.5656895788081104e-07, "entropy": 1.256170129776001, "num_tokens": 31159675.0, "mean_token_accuracy": 0.6972913891077042, "epoch": 2.2469394161347704, "step": 5370 }, { "loss": 1.2820704460144043, "grad_norm": 0.1213146299123764, "learning_rate": 1.5493077181669272e-07, "entropy": 1.2981676012277603, "num_tokens": 31219684.0, "mean_token_accuracy": 0.688413429260254, "epoch": 2.251124829967563, "step": 5380 }, { "loss": 1.2337745666503905, "grad_norm": 0.1330552101135254, "learning_rate": 1.532996283986957e-07, "entropy": 1.2481247037649155, "num_tokens": 31284113.0, "mean_token_accuracy": 0.694562304019928, "epoch": 2.2553102438003556, "step": 5390 }, { "loss": 1.3011648178100585, "grad_norm": 0.14552603662014008, "learning_rate": 1.5167556091743238e-07, "entropy": 1.3327119797468185, "num_tokens": 31344186.0, "mean_token_accuracy": 0.6838112965226173, "epoch": 2.2594956576331486, "step": 5400 }, { "eval_loss": 1.297374963760376, "eval_runtime": 43.7773, "eval_samples_per_second": 145.532, "eval_steps_per_second": 6.076, "eval_entropy": 1.315984815135038, "eval_num_tokens": 31344186.0, "eval_mean_token_accuracy": 0.6838296656321762, "epoch": 2.2594956576331486, "step": 5400 }, { "loss": 1.256122875213623, "grad_norm": 0.11425146460533142, "learning_rate": 1.5005860251909918e-07, "entropy": 1.2993682414293288, "num_tokens": 31399330.0, "mean_token_accuracy": 0.6879714965820313, "epoch": 2.263681071465941, "step": 5410 }, { "loss": 1.243597412109375, "grad_norm": 0.14105035364627838, "learning_rate": 1.4844878620480124e-07, "entropy": 1.2901643484830856, "num_tokens": 31458043.0, "mean_token_accuracy": 0.6898476853966713, "epoch": 2.267866485298734, "step": 5420 }, { "loss": 1.3062746047973632, "grad_norm": 0.1269349455833435, "learning_rate": 1.4684614482987805e-07, "entropy": 1.3157608151435851, "num_tokens": 31515675.0, "mean_token_accuracy": 0.6781650841236114, "epoch": 2.272051899131527, "step": 5430 }, { "loss": 1.3145343780517578, "grad_norm": 0.13170845806598663, "learning_rate": 1.452507111032329e-07, "entropy": 1.3244775086641312, "num_tokens": 31573727.0, "mean_token_accuracy": 0.6795195579528809, "epoch": 2.2762373129643194, "step": 5440 }, { "loss": 1.2743712425231934, "grad_norm": 0.13130150735378265, "learning_rate": 1.4366251758666558e-07, "entropy": 1.3025973543524743, "num_tokens": 31632527.0, "mean_token_accuracy": 0.6849103718996048, "epoch": 2.280422726797112, "step": 5450 }, { "loss": 1.2729061126708985, "grad_norm": 0.14398252964019775, "learning_rate": 1.4208159669420817e-07, "entropy": 1.2966506034135818, "num_tokens": 31688226.0, "mean_token_accuracy": 0.6885296568274498, "epoch": 2.2846081406299046, "step": 5460 }, { "loss": 1.308814811706543, "grad_norm": 0.1466449350118637, "learning_rate": 1.405079806914623e-07, "entropy": 1.306171926856041, "num_tokens": 31743518.0, "mean_token_accuracy": 0.6793627932667732, "epoch": 2.2887935544626976, "step": 5470 }, { "loss": 1.295256996154785, "grad_norm": 0.12834027409553528, "learning_rate": 1.389417016949419e-07, "entropy": 1.316891822218895, "num_tokens": 31800911.0, "mean_token_accuracy": 0.6853345051407814, "epoch": 2.29297896829549, "step": 5480 }, { "loss": 1.3327623367309571, "grad_norm": 0.1662720888853073, "learning_rate": 1.3738279167141725e-07, "entropy": 1.3393938541412354, "num_tokens": 31860118.0, "mean_token_accuracy": 0.6735303267836571, "epoch": 2.297164382128283, "step": 5490 }, { "loss": 1.279651165008545, "grad_norm": 0.12805919349193573, "learning_rate": 1.3583128243726227e-07, "entropy": 1.2862314611673356, "num_tokens": 31917654.0, "mean_token_accuracy": 0.6885863587260246, "epoch": 2.301349795961076, "step": 5500 }, { "loss": 1.2778766632080079, "grad_norm": 0.16033422946929932, "learning_rate": 1.3428720565780578e-07, "entropy": 1.300406639277935, "num_tokens": 31974868.0, "mean_token_accuracy": 0.6882018774747849, "epoch": 2.3055352097938684, "step": 5510 }, { "loss": 1.342056941986084, "grad_norm": 0.16284961998462677, "learning_rate": 1.327505928466842e-07, "entropy": 1.3293492585420608, "num_tokens": 32033943.0, "mean_token_accuracy": 0.6768848091363907, "epoch": 2.309720623626661, "step": 5520 }, { "loss": 1.206116008758545, "grad_norm": 0.1340523660182953, "learning_rate": 1.3122147536519985e-07, "entropy": 1.258744315803051, "num_tokens": 32095146.0, "mean_token_accuracy": 0.6991240099072457, "epoch": 2.3139060374594536, "step": 5530 }, { "loss": 1.255533218383789, "grad_norm": 0.12596993148326874, "learning_rate": 1.2969988442167934e-07, "entropy": 1.2745139241218566, "num_tokens": 32158070.0, "mean_token_accuracy": 0.6862679213285446, "epoch": 2.3180914512922466, "step": 5540 }, { "loss": 1.270913314819336, "grad_norm": 0.14521045982837677, "learning_rate": 1.2818585107083797e-07, "entropy": 1.2917841017246245, "num_tokens": 32213049.0, "mean_token_accuracy": 0.688049279153347, "epoch": 2.322276865125039, "step": 5550 }, { "loss": 1.2614711761474608, "grad_norm": 0.12460001558065414, "learning_rate": 1.2667940621314516e-07, "entropy": 1.288702441751957, "num_tokens": 32270375.0, "mean_token_accuracy": 0.691080367565155, "epoch": 2.326462278957832, "step": 5560 }, { "loss": 1.297041893005371, "grad_norm": 0.12860845029354095, "learning_rate": 1.2518058059419356e-07, "entropy": 1.2874844074249268, "num_tokens": 32327913.0, "mean_token_accuracy": 0.6812907472252846, "epoch": 2.330647692790625, "step": 5570 }, { "loss": 1.2307467460632324, "grad_norm": 0.13343603909015656, "learning_rate": 1.2368940480407242e-07, "entropy": 1.2836890518665314, "num_tokens": 32385583.0, "mean_token_accuracy": 0.6963008731603623, "epoch": 2.3348331066234174, "step": 5580 }, { "loss": 1.3668609619140626, "grad_norm": 0.13145415484905243, "learning_rate": 1.2220590927674286e-07, "entropy": 1.3669442266225815, "num_tokens": 32441025.0, "mean_token_accuracy": 0.671772038936615, "epoch": 2.33901852045621, "step": 5590 }, { "loss": 1.3068817138671875, "grad_norm": 0.13144521415233612, "learning_rate": 1.2073012428941588e-07, "entropy": 1.3122945204377174, "num_tokens": 32499899.0, "mean_token_accuracy": 0.674852766096592, "epoch": 2.343203934289003, "step": 5600 }, { "eval_loss": 1.2969086170196533, "eval_runtime": 43.7803, "eval_samples_per_second": 145.522, "eval_steps_per_second": 6.076, "eval_entropy": 1.312249709789018, "eval_num_tokens": 32499899.0, "eval_mean_token_accuracy": 0.6838675230965578, "epoch": 2.343203934289003, "step": 5600 }, { "loss": 1.33385009765625, "grad_norm": 0.125252828001976, "learning_rate": 1.1926207996193638e-07, "entropy": 1.3582130268216133, "num_tokens": 32556560.0, "mean_token_accuracy": 0.678239768743515, "epoch": 2.3473893481217956, "step": 5610 }, { "loss": 1.247665023803711, "grad_norm": 0.13559651374816895, "learning_rate": 1.178018062561662e-07, "entropy": 1.2727186426520347, "num_tokens": 32617607.0, "mean_token_accuracy": 0.6923329353332519, "epoch": 2.351574761954588, "step": 5620 }, { "loss": 1.2945147514343263, "grad_norm": 0.12413690984249115, "learning_rate": 1.1634933297537425e-07, "entropy": 1.3126976788043976, "num_tokens": 32676183.0, "mean_token_accuracy": 0.6811081647872925, "epoch": 2.355760175787381, "step": 5630 }, { "loss": 1.2760995864868163, "grad_norm": 0.15444409847259521, "learning_rate": 1.1490468976362766e-07, "entropy": 1.3008133977651597, "num_tokens": 32732392.0, "mean_token_accuracy": 0.6872219279408455, "epoch": 2.3599455896201738, "step": 5640 }, { "loss": 1.302404022216797, "grad_norm": 0.1389995664358139, "learning_rate": 1.1346790610518636e-07, "entropy": 1.3151475220918656, "num_tokens": 32788966.0, "mean_token_accuracy": 0.6797765508294106, "epoch": 2.3641310034529663, "step": 5650 }, { "loss": 1.288191795349121, "grad_norm": 0.14642177522182465, "learning_rate": 1.1203901132390225e-07, "entropy": 1.3152502685785294, "num_tokens": 32849483.0, "mean_token_accuracy": 0.6831487894058228, "epoch": 2.368316417285759, "step": 5660 }, { "loss": 1.2493846893310547, "grad_norm": 0.13448752462863922, "learning_rate": 1.1061803458261976e-07, "entropy": 1.2866099685430528, "num_tokens": 32907775.0, "mean_token_accuracy": 0.6911607295274734, "epoch": 2.372501831118552, "step": 5670 }, { "loss": 1.2729656219482421, "grad_norm": 0.1279105842113495, "learning_rate": 1.0920500488258134e-07, "entropy": 1.294448482990265, "num_tokens": 32966950.0, "mean_token_accuracy": 0.6881255716085434, "epoch": 2.3766872449513445, "step": 5680 }, { "loss": 1.2728429794311524, "grad_norm": 0.1403297632932663, "learning_rate": 1.0779995106283552e-07, "entropy": 1.2703639656305312, "num_tokens": 33022913.0, "mean_token_accuracy": 0.6847912818193436, "epoch": 2.380872658784137, "step": 5690 }, { "loss": 1.299112606048584, "grad_norm": 0.11831526458263397, "learning_rate": 1.0640290179964756e-07, "entropy": 1.324224580824375, "num_tokens": 33079983.0, "mean_token_accuracy": 0.6824282988905906, "epoch": 2.38505807261693, "step": 5700 }, { "loss": 1.327120018005371, "grad_norm": 0.13661810755729675, "learning_rate": 1.0501388560591523e-07, "entropy": 1.3056075662374496, "num_tokens": 33136523.0, "mean_token_accuracy": 0.677336810529232, "epoch": 2.3892434864497227, "step": 5710 }, { "loss": 1.3516573905944824, "grad_norm": 0.12402050942182541, "learning_rate": 1.0363293083058622e-07, "entropy": 1.3417491644620896, "num_tokens": 33194784.0, "mean_token_accuracy": 0.675346839427948, "epoch": 2.3934289002825153, "step": 5720 }, { "loss": 1.3104659080505372, "grad_norm": 0.13492602109909058, "learning_rate": 1.0226006565807982e-07, "entropy": 1.3131451904773712, "num_tokens": 33251897.0, "mean_token_accuracy": 0.6822021931409836, "epoch": 2.3976143141153083, "step": 5730 }, { "loss": 1.3267166137695312, "grad_norm": 0.13064873218536377, "learning_rate": 1.0089531810771163e-07, "entropy": 1.3214107781648636, "num_tokens": 33307060.0, "mean_token_accuracy": 0.6773762717843056, "epoch": 2.401799727948101, "step": 5740 }, { "loss": 1.316312599182129, "grad_norm": 0.15154863893985748, "learning_rate": 9.953871603312141e-08, "entropy": 1.3157601684331894, "num_tokens": 33362416.0, "mean_token_accuracy": 0.6785706043243408, "epoch": 2.4059851417808935, "step": 5750 }, { "loss": 1.2769282341003418, "grad_norm": 0.12917333841323853, "learning_rate": 9.819028712170512e-08, "entropy": 1.281336858868599, "num_tokens": 33422722.0, "mean_token_accuracy": 0.6885020643472671, "epoch": 2.4101705556136865, "step": 5760 }, { "loss": 1.3640668869018555, "grad_norm": 0.1428443044424057, "learning_rate": 9.68500588940498e-08, "entropy": 1.3538337886333465, "num_tokens": 33483556.0, "mean_token_accuracy": 0.6655726253986358, "epoch": 2.414355969446479, "step": 5770 }, { "loss": 1.251881980895996, "grad_norm": 0.14195656776428223, "learning_rate": 9.551805870337104e-08, "entropy": 1.2702584967017174, "num_tokens": 33543254.0, "mean_token_accuracy": 0.6900080740451813, "epoch": 2.4185413832792717, "step": 5780 }, { "loss": 1.3439226150512695, "grad_norm": 0.16237884759902954, "learning_rate": 9.419431373495612e-08, "entropy": 1.3545999929308892, "num_tokens": 33601741.0, "mean_token_accuracy": 0.6744641482830047, "epoch": 2.4227267971120643, "step": 5790 }, { "loss": 1.314307975769043, "grad_norm": 0.15124961733818054, "learning_rate": 9.287885100560771e-08, "entropy": 1.320368728041649, "num_tokens": 33657327.0, "mean_token_accuracy": 0.6819353699684143, "epoch": 2.4269122109448573, "step": 5800 }, { "eval_loss": 1.2966619729995728, "eval_runtime": 42.94, "eval_samples_per_second": 148.37, "eval_steps_per_second": 6.195, "eval_entropy": 1.3098934839542646, "eval_num_tokens": 33657327.0, "eval_mean_token_accuracy": 0.6839417216921211, "epoch": 2.4269122109448573, "step": 5800 }, { "loss": 1.2694414138793946, "grad_norm": 0.1228335052728653, "learning_rate": 9.157169736309384e-08, "entropy": 1.29910968542099, "num_tokens": 33713833.0, "mean_token_accuracy": 0.6918980091810226, "epoch": 2.43109762477765, "step": 5810 }, { "loss": 1.3777572631835937, "grad_norm": 0.12483090162277222, "learning_rate": 9.02728794855988e-08, "entropy": 1.3498617202043532, "num_tokens": 33771185.0, "mean_token_accuracy": 0.6699911892414093, "epoch": 2.4352830386104425, "step": 5820 }, { "loss": 1.3352084159851074, "grad_norm": 0.14710542559623718, "learning_rate": 8.898242388117949e-08, "entropy": 1.3336048945784569, "num_tokens": 33828941.0, "mean_token_accuracy": 0.6756347686052322, "epoch": 2.4394684524432355, "step": 5830 }, { "loss": 1.310356903076172, "grad_norm": 0.16344612836837769, "learning_rate": 8.770035688722399e-08, "entropy": 1.3448477059602737, "num_tokens": 33890202.0, "mean_token_accuracy": 0.6772162079811096, "epoch": 2.443653866276028, "step": 5840 }, { "loss": 1.2573143005371095, "grad_norm": 0.13473457098007202, "learning_rate": 8.642670466991381e-08, "entropy": 1.27697846442461, "num_tokens": 33945323.0, "mean_token_accuracy": 0.6902707099914551, "epoch": 2.4478392801088207, "step": 5850 }, { "loss": 1.2643320083618164, "grad_norm": 0.12609098851680756, "learning_rate": 8.516149322369054e-08, "entropy": 1.3083055540919304, "num_tokens": 34005115.0, "mean_token_accuracy": 0.6905182540416718, "epoch": 2.4520246939416133, "step": 5860 }, { "loss": 1.3385416984558105, "grad_norm": 0.1266658753156662, "learning_rate": 8.390474837072492e-08, "entropy": 1.330283808708191, "num_tokens": 34061458.0, "mean_token_accuracy": 0.6766823455691338, "epoch": 2.4562101077744063, "step": 5870 }, { "loss": 1.247739601135254, "grad_norm": 0.1117442175745964, "learning_rate": 8.265649576038946e-08, "entropy": 1.2861711964011193, "num_tokens": 34117371.0, "mean_token_accuracy": 0.6933671846985817, "epoch": 2.460395521607199, "step": 5880 }, { "loss": 1.3221072196960448, "grad_norm": 0.1515118032693863, "learning_rate": 8.141676086873573e-08, "entropy": 1.3331793665885925, "num_tokens": 34172239.0, "mean_token_accuracy": 0.6770834714174271, "epoch": 2.4645809354399915, "step": 5890 }, { "loss": 1.269434928894043, "grad_norm": 0.13494186103343964, "learning_rate": 8.018556899797396e-08, "entropy": 1.2998870089650154, "num_tokens": 34234355.0, "mean_token_accuracy": 0.687006613612175, "epoch": 2.4687663492727845, "step": 5900 }, { "loss": 1.326502799987793, "grad_norm": 0.14256730675697327, "learning_rate": 7.896294527595638e-08, "entropy": 1.350116790831089, "num_tokens": 34295462.0, "mean_token_accuracy": 0.6760937020182609, "epoch": 2.472951763105577, "step": 5910 }, { "loss": 1.3374545097351074, "grad_norm": 0.15305058658123016, "learning_rate": 7.774891465566518e-08, "entropy": 1.3536745309829712, "num_tokens": 34353670.0, "mean_token_accuracy": 0.6751428216695785, "epoch": 2.4771371769383697, "step": 5920 }, { "loss": 1.3062148094177246, "grad_norm": 0.1269030123949051, "learning_rate": 7.654350191470216e-08, "entropy": 1.3079909563064576, "num_tokens": 34409937.0, "mean_token_accuracy": 0.6825913473963737, "epoch": 2.4813225907711627, "step": 5930 }, { "loss": 1.3188300132751465, "grad_norm": 0.11656031757593155, "learning_rate": 7.534673165478417e-08, "entropy": 1.3348352879285812, "num_tokens": 34470681.0, "mean_token_accuracy": 0.673864497244358, "epoch": 2.4855080046039553, "step": 5940 }, { "loss": 1.263766098022461, "grad_norm": 0.12488370388746262, "learning_rate": 7.415862830124032e-08, "entropy": 1.3003046184778213, "num_tokens": 34530193.0, "mean_token_accuracy": 0.6913181528449058, "epoch": 2.489693418436748, "step": 5950 }, { "loss": 1.297060203552246, "grad_norm": 0.14231456816196442, "learning_rate": 7.297921610251323e-08, "entropy": 1.3110292360186577, "num_tokens": 34585018.0, "mean_token_accuracy": 0.6845840275287628, "epoch": 2.493878832269541, "step": 5960 }, { "loss": 1.2582796096801758, "grad_norm": 0.13470889627933502, "learning_rate": 7.180851912966501e-08, "entropy": 1.276314914226532, "num_tokens": 34640793.0, "mean_token_accuracy": 0.6882349893450737, "epoch": 2.4980642461023335, "step": 5970 }, { "loss": 1.2998489379882812, "grad_norm": 0.15721286833286285, "learning_rate": 7.064656127588508e-08, "entropy": 1.3124357014894485, "num_tokens": 34694819.0, "mean_token_accuracy": 0.6838565751910209, "epoch": 2.502249659935126, "step": 5980 }, { "loss": 1.306645965576172, "grad_norm": 0.11785798519849777, "learning_rate": 6.949336625600316e-08, "entropy": 1.3165518283843993, "num_tokens": 34751259.0, "mean_token_accuracy": 0.6841330319643021, "epoch": 2.5064350737679186, "step": 5990 }, { "loss": 1.3169086456298829, "grad_norm": 0.11234049499034882, "learning_rate": 6.834895760600517e-08, "entropy": 1.3190216064453124, "num_tokens": 34808644.0, "mean_token_accuracy": 0.6796000450849533, "epoch": 2.5106204876007117, "step": 6000 }, { "eval_loss": 1.2964025735855103, "eval_runtime": 43.5438, "eval_samples_per_second": 146.312, "eval_steps_per_second": 6.109, "eval_entropy": 1.3063210257910247, "eval_num_tokens": 34808644.0, "eval_mean_token_accuracy": 0.6839508241728732, "epoch": 2.5106204876007117, "step": 6000 }, { "loss": 1.2840510368347169, "grad_norm": 0.15145522356033325, "learning_rate": 6.721335868255229e-08, "entropy": 1.2826346635818482, "num_tokens": 34863928.0, "mean_token_accuracy": 0.6869580999016762, "epoch": 2.5148059014335042, "step": 6010 }, { "loss": 1.267976665496826, "grad_norm": 0.13854120671749115, "learning_rate": 6.60865926625051e-08, "entropy": 1.2938668191432954, "num_tokens": 34926879.0, "mean_token_accuracy": 0.6888927921652794, "epoch": 2.518991315266297, "step": 6020 }, { "loss": 1.332705307006836, "grad_norm": 0.13647040724754333, "learning_rate": 6.496868254245025e-08, "entropy": 1.3259623274207115, "num_tokens": 34984932.0, "mean_token_accuracy": 0.6760903507471084, "epoch": 2.52317672909909, "step": 6030 }, { "loss": 1.2476840019226074, "grad_norm": 0.13999158143997192, "learning_rate": 6.385965113823039e-08, "entropy": 1.2729045450687408, "num_tokens": 35042011.0, "mean_token_accuracy": 0.6900967061519623, "epoch": 2.5273621429318824, "step": 6040 }, { "loss": 1.2344088554382324, "grad_norm": 0.13583189249038696, "learning_rate": 6.275952108448018e-08, "entropy": 1.276967915892601, "num_tokens": 35100640.0, "mean_token_accuracy": 0.6938279047608376, "epoch": 2.531547556764675, "step": 6050 }, { "loss": 1.253906536102295, "grad_norm": 0.14375676214694977, "learning_rate": 6.166831483416229e-08, "entropy": 1.2771710246801375, "num_tokens": 35158864.0, "mean_token_accuracy": 0.6879908561706543, "epoch": 2.5357329705974676, "step": 6060 }, { "loss": 1.3232227325439454, "grad_norm": 0.12950512766838074, "learning_rate": 6.058605465811085e-08, "entropy": 1.3327802419662476, "num_tokens": 35216659.0, "mean_token_accuracy": 0.6788379862904549, "epoch": 2.5399183844302606, "step": 6070 }, { "loss": 1.2316680908203126, "grad_norm": 0.14246362447738647, "learning_rate": 5.9512762644576054e-08, "entropy": 1.2707349091768265, "num_tokens": 35276829.0, "mean_token_accuracy": 0.6953957095742226, "epoch": 2.544103798263053, "step": 6080 }, { "loss": 1.2903837203979491, "grad_norm": 0.14620284736156464, "learning_rate": 5.844846069877329e-08, "entropy": 1.3116936787962914, "num_tokens": 35335255.0, "mean_token_accuracy": 0.6844032138586045, "epoch": 2.5482892120958462, "step": 6090 }, { "loss": 1.2689558029174806, "grad_norm": 0.1156439483165741, "learning_rate": 5.7393170542436694e-08, "entropy": 1.2803223952651024, "num_tokens": 35398265.0, "mean_token_accuracy": 0.6874969124794006, "epoch": 2.552474625928639, "step": 6100 }, { "loss": 1.2471202850341796, "grad_norm": 0.14455300569534302, "learning_rate": 5.6346913713375076e-08, "entropy": 1.2767839536070824, "num_tokens": 35457467.0, "mean_token_accuracy": 0.6907158330082893, "epoch": 2.5566600397614314, "step": 6110 }, { "loss": 1.2724437713623047, "grad_norm": 0.1570362001657486, "learning_rate": 5.5309711565033055e-08, "entropy": 1.2919223070144654, "num_tokens": 35514814.0, "mean_token_accuracy": 0.688132356107235, "epoch": 2.560845453594224, "step": 6120 }, { "loss": 1.3232336044311523, "grad_norm": 0.15807446837425232, "learning_rate": 5.4281585266054755e-08, "entropy": 1.3095539420843125, "num_tokens": 35571529.0, "mean_token_accuracy": 0.6772763684391976, "epoch": 2.5650308674270166, "step": 6130 }, { "loss": 1.2672816276550294, "grad_norm": 0.12889625132083893, "learning_rate": 5.326255579985173e-08, "entropy": 1.2937352240085602, "num_tokens": 35630131.0, "mean_token_accuracy": 0.6905935257673264, "epoch": 2.5692162812598096, "step": 6140 }, { "loss": 1.2375890731811523, "grad_norm": 0.13400977849960327, "learning_rate": 5.225264396417522e-08, "entropy": 1.2529444962739944, "num_tokens": 35690381.0, "mean_token_accuracy": 0.6925143092870713, "epoch": 2.573401695092602, "step": 6150 }, { "loss": 1.261359405517578, "grad_norm": 0.10552431643009186, "learning_rate": 5.125187037069123e-08, "entropy": 1.2899439319968224, "num_tokens": 35755420.0, "mean_token_accuracy": 0.6874815404415131, "epoch": 2.577587108925395, "step": 6160 }, { "loss": 1.2803380966186524, "grad_norm": 0.12414208054542542, "learning_rate": 5.026025544455986e-08, "entropy": 1.3017450347542763, "num_tokens": 35812900.0, "mean_token_accuracy": 0.6863995373249054, "epoch": 2.581772522758188, "step": 6170 }, { "loss": 1.2126134872436523, "grad_norm": 0.11522486060857773, "learning_rate": 4.9277819424018815e-08, "entropy": 1.255126628279686, "num_tokens": 35872604.0, "mean_token_accuracy": 0.6964934691786766, "epoch": 2.5859579365909804, "step": 6180 }, { "loss": 1.253915023803711, "grad_norm": 0.11392216384410858, "learning_rate": 4.830458235996976e-08, "entropy": 1.2905526503920555, "num_tokens": 35930850.0, "mean_token_accuracy": 0.6916301295161247, "epoch": 2.590143350423773, "step": 6190 }, { "loss": 1.2767410278320312, "grad_norm": 0.12976758182048798, "learning_rate": 4.7340564115569804e-08, "entropy": 1.283238247036934, "num_tokens": 35988841.0, "mean_token_accuracy": 0.687064278125763, "epoch": 2.594328764256566, "step": 6200 }, { "eval_loss": 1.2962790727615356, "eval_runtime": 43.7745, "eval_samples_per_second": 145.542, "eval_steps_per_second": 6.077, "eval_entropy": 1.3085140944423532, "eval_num_tokens": 35988841.0, "eval_mean_token_accuracy": 0.6838811584433219, "epoch": 2.594328764256566, "step": 6200 }, { "loss": 1.3264198303222656, "grad_norm": 0.13761702179908752, "learning_rate": 4.638578436582552e-08, "entropy": 1.3237684190273284, "num_tokens": 36044753.0, "mean_token_accuracy": 0.67743628770113, "epoch": 2.5985141780893586, "step": 6210 }, { "loss": 1.257272720336914, "grad_norm": 0.1342718005180359, "learning_rate": 4.544026259719158e-08, "entropy": 1.288350522518158, "num_tokens": 36103553.0, "mean_token_accuracy": 0.6897284865379334, "epoch": 2.602699591922151, "step": 6220 }, { "loss": 1.2772136688232423, "grad_norm": 0.12511885166168213, "learning_rate": 4.4504018107173304e-08, "entropy": 1.3188367202877997, "num_tokens": 36162370.0, "mean_token_accuracy": 0.6876938834786415, "epoch": 2.606885005754944, "step": 6230 }, { "loss": 1.3926493644714355, "grad_norm": 0.15498943626880646, "learning_rate": 4.3577070003932234e-08, "entropy": 1.3926087036728858, "num_tokens": 36218877.0, "mean_token_accuracy": 0.6645623058080673, "epoch": 2.611070419587737, "step": 6240 }, { "loss": 1.2876951217651367, "grad_norm": 0.13086406886577606, "learning_rate": 4.265943720589688e-08, "entropy": 1.3051115587353705, "num_tokens": 36274783.0, "mean_token_accuracy": 0.6853921875357628, "epoch": 2.6152558334205294, "step": 6250 }, { "loss": 1.3133883476257324, "grad_norm": 0.15468242764472961, "learning_rate": 4.175113844137596e-08, "entropy": 1.3004416555166245, "num_tokens": 36330126.0, "mean_token_accuracy": 0.6820774778723717, "epoch": 2.619441247253322, "step": 6260 }, { "loss": 1.3524452209472657, "grad_norm": 0.14072105288505554, "learning_rate": 4.08521922481766e-08, "entropy": 1.3288619458675384, "num_tokens": 36386837.0, "mean_token_accuracy": 0.6738715380430221, "epoch": 2.623626661086115, "step": 6270 }, { "loss": 1.2715925216674804, "grad_norm": 0.13218800723552704, "learning_rate": 3.9962616973225784e-08, "entropy": 1.3046256229281425, "num_tokens": 36446096.0, "mean_token_accuracy": 0.6843361258506775, "epoch": 2.6278120749189076, "step": 6280 }, { "loss": 1.3278305053710937, "grad_norm": 0.12469816952943802, "learning_rate": 3.90824307721957e-08, "entropy": 1.3449779450893402, "num_tokens": 36502354.0, "mean_token_accuracy": 0.6795141100883484, "epoch": 2.6319974887517006, "step": 6290 }, { "loss": 1.2798616409301757, "grad_norm": 0.12675845623016357, "learning_rate": 3.821165160913381e-08, "entropy": 1.290797685086727, "num_tokens": 36560258.0, "mean_token_accuracy": 0.6858594298362732, "epoch": 2.636182902584493, "step": 6300 }, { "loss": 1.2590128898620605, "grad_norm": 0.14704617857933044, "learning_rate": 3.735029725609567e-08, "entropy": 1.2577021181583405, "num_tokens": 36616149.0, "mean_token_accuracy": 0.6896016135811806, "epoch": 2.6403683164172858, "step": 6310 }, { "loss": 1.2890483856201171, "grad_norm": 0.14774031937122345, "learning_rate": 3.649838529278232e-08, "entropy": 1.3169309496879578, "num_tokens": 36675723.0, "mean_token_accuracy": 0.683777266740799, "epoch": 2.6445537302500783, "step": 6320 }, { "loss": 1.3321017265319823, "grad_norm": 0.16364073753356934, "learning_rate": 3.565593310618165e-08, "entropy": 1.3246647357940673, "num_tokens": 36731369.0, "mean_token_accuracy": 0.6758654475212097, "epoch": 2.648739144082871, "step": 6330 }, { "loss": 1.3064552307128907, "grad_norm": 0.1471003293991089, "learning_rate": 3.48229578902135e-08, "entropy": 1.3057933449745178, "num_tokens": 36790065.0, "mean_token_accuracy": 0.6810683965682983, "epoch": 2.652924557915664, "step": 6340 }, { "loss": 1.279481792449951, "grad_norm": 0.14192688465118408, "learning_rate": 3.39994766453785e-08, "entropy": 1.303479927778244, "num_tokens": 36851965.0, "mean_token_accuracy": 0.6806197896599769, "epoch": 2.6571099717484565, "step": 6350 }, { "loss": 1.2731066703796388, "grad_norm": 0.1496737152338028, "learning_rate": 3.3185506178411593e-08, "entropy": 1.3053823009133338, "num_tokens": 36911260.0, "mean_token_accuracy": 0.685496874153614, "epoch": 2.6612953855812496, "step": 6360 }, { "loss": 1.3233325958251954, "grad_norm": 0.1397167593240738, "learning_rate": 3.238106310193822e-08, "entropy": 1.3544006377458573, "num_tokens": 36968655.0, "mean_token_accuracy": 0.6773801222443581, "epoch": 2.665480799414042, "step": 6370 }, { "loss": 1.273273754119873, "grad_norm": 0.1353992074728012, "learning_rate": 3.158616383413648e-08, "entropy": 1.2890938267111778, "num_tokens": 37023385.0, "mean_token_accuracy": 0.6872632309794426, "epoch": 2.6696662132468347, "step": 6380 }, { "loss": 1.269486427307129, "grad_norm": 0.14086788892745972, "learning_rate": 3.080082459840072e-08, "entropy": 1.29090928286314, "num_tokens": 37081937.0, "mean_token_accuracy": 0.6892599433660507, "epoch": 2.6738516270796273, "step": 6390 }, { "loss": 1.2413150787353515, "grad_norm": 0.147287517786026, "learning_rate": 3.0025061423011366e-08, "entropy": 1.2670000731945037, "num_tokens": 37139738.0, "mean_token_accuracy": 0.6992234885692596, "epoch": 2.6780370409124203, "step": 6400 }, { "eval_loss": 1.2961639165878296, "eval_runtime": 43.8119, "eval_samples_per_second": 145.417, "eval_steps_per_second": 6.071, "eval_entropy": 1.3104161902477867, "eval_num_tokens": 37139738.0, "eval_mean_token_accuracy": 0.6839385229842108, "epoch": 2.6780370409124203, "step": 6400 }, { "loss": 1.2215076446533204, "grad_norm": 0.12445386499166489, "learning_rate": 2.92588901408074e-08, "entropy": 1.2454605296254158, "num_tokens": 37196528.0, "mean_token_accuracy": 0.6982016503810883, "epoch": 2.682222454745213, "step": 6410 }, { "loss": 1.3178998947143554, "grad_norm": 0.14504940807819366, "learning_rate": 2.8502326388863073e-08, "entropy": 1.3217849105596542, "num_tokens": 37251881.0, "mean_token_accuracy": 0.6820187479257583, "epoch": 2.6864078685780055, "step": 6420 }, { "loss": 1.2787066459655763, "grad_norm": 0.1357499063014984, "learning_rate": 2.7755385608169368e-08, "entropy": 1.293387584388256, "num_tokens": 37308322.0, "mean_token_accuracy": 0.6850109323859215, "epoch": 2.6905932824107985, "step": 6430 }, { "loss": 1.3130731582641602, "grad_norm": 0.1391121745109558, "learning_rate": 2.701808304331826e-08, "entropy": 1.3160455033183098, "num_tokens": 37367065.0, "mean_token_accuracy": 0.6788717776536941, "epoch": 2.694778696243591, "step": 6440 }, { "loss": 1.324008083343506, "grad_norm": 0.17093950510025024, "learning_rate": 2.6290433742191697e-08, "entropy": 1.3303591817617417, "num_tokens": 37423674.0, "mean_token_accuracy": 0.6737320378422738, "epoch": 2.6989641100763837, "step": 6450 }, { "loss": 1.3142354965209961, "grad_norm": 0.11529888957738876, "learning_rate": 2.5572452555654766e-08, "entropy": 1.3235249876976014, "num_tokens": 37478758.0, "mean_token_accuracy": 0.6834891051054001, "epoch": 2.7031495239091763, "step": 6460 }, { "loss": 1.3274757385253906, "grad_norm": 0.14736518263816833, "learning_rate": 2.4864154137252348e-08, "entropy": 1.323398619890213, "num_tokens": 37535301.0, "mean_token_accuracy": 0.676533716917038, "epoch": 2.7073349377419693, "step": 6470 }, { "loss": 1.3244875907897948, "grad_norm": 0.1322164386510849, "learning_rate": 2.4165552942910005e-08, "entropy": 1.3245794102549553, "num_tokens": 37593859.0, "mean_token_accuracy": 0.6777049407362938, "epoch": 2.711520351574762, "step": 6480 }, { "loss": 1.3350665092468261, "grad_norm": 0.15845166146755219, "learning_rate": 2.3476663230639294e-08, "entropy": 1.311027655005455, "num_tokens": 37650900.0, "mean_token_accuracy": 0.6777532756328583, "epoch": 2.715705765407555, "step": 6490 }, { "loss": 1.3243823051452637, "grad_norm": 0.1440412998199463, "learning_rate": 2.279749906024625e-08, "entropy": 1.316267091035843, "num_tokens": 37709121.0, "mean_token_accuracy": 0.6794763222336769, "epoch": 2.7198911792403475, "step": 6500 }, { "loss": 1.2479528427124023, "grad_norm": 0.1294708549976349, "learning_rate": 2.2128074293044973e-08, "entropy": 1.2721221387386321, "num_tokens": 37769916.0, "mean_token_accuracy": 0.6923005178570747, "epoch": 2.72407659307314, "step": 6510 }, { "loss": 1.2882716178894043, "grad_norm": 0.1557369977235794, "learning_rate": 2.1468402591574176e-08, "entropy": 1.2828272953629494, "num_tokens": 37827782.0, "mean_token_accuracy": 0.6824876755475998, "epoch": 2.7282620069059327, "step": 6520 }, { "loss": 1.3350841522216796, "grad_norm": 0.14170175790786743, "learning_rate": 2.0818497419318847e-08, "entropy": 1.3358486652374268, "num_tokens": 37889014.0, "mean_token_accuracy": 0.6755576729774475, "epoch": 2.7324474207387257, "step": 6530 }, { "loss": 1.2199977874755858, "grad_norm": 0.1191650778055191, "learning_rate": 2.017837204043521e-08, "entropy": 1.2538551360368728, "num_tokens": 37949279.0, "mean_token_accuracy": 0.6934547841548919, "epoch": 2.7366328345715183, "step": 6540 }, { "loss": 1.3062921524047852, "grad_norm": 0.12893186509609222, "learning_rate": 1.954803951947992e-08, "entropy": 1.2967224359512328, "num_tokens": 38003642.0, "mean_token_accuracy": 0.679786990582943, "epoch": 2.740818248404311, "step": 6550 }, { "loss": 1.3133017539978027, "grad_norm": 0.15340933203697205, "learning_rate": 1.8927512721143733e-08, "entropy": 1.3341112226247787, "num_tokens": 38061010.0, "mean_token_accuracy": 0.6817509040236474, "epoch": 2.745003662237104, "step": 6560 }, { "loss": 1.2640517234802247, "grad_norm": 0.14441581070423126, "learning_rate": 1.831680430998872e-08, "entropy": 1.2839445233345033, "num_tokens": 38118091.0, "mean_token_accuracy": 0.6878686159849167, "epoch": 2.7491890760698965, "step": 6570 }, { "loss": 1.3473605155944823, "grad_norm": 0.1267869770526886, "learning_rate": 1.7715926750189736e-08, "entropy": 1.3488942801952362, "num_tokens": 38173499.0, "mean_token_accuracy": 0.673286820948124, "epoch": 2.753374489902689, "step": 6580 }, { "loss": 1.2386550903320312, "grad_norm": 0.1410847008228302, "learning_rate": 1.7124892305280248e-08, "entropy": 1.2806343123316766, "num_tokens": 38232916.0, "mean_token_accuracy": 0.6908275470137596, "epoch": 2.7575599037354817, "step": 6590 }, { "loss": 1.2414596557617188, "grad_norm": 0.15371793508529663, "learning_rate": 1.6543713037901863e-08, "entropy": 1.2626485541462897, "num_tokens": 38293881.0, "mean_token_accuracy": 0.6889653459191323, "epoch": 2.7617453175682747, "step": 6600 }, { "eval_loss": 1.2960588932037354, "eval_runtime": 43.7085, "eval_samples_per_second": 145.761, "eval_steps_per_second": 6.086, "eval_entropy": 1.3095905193708892, "eval_num_tokens": 38293881.0, "eval_mean_token_accuracy": 0.6839702539426044, "epoch": 2.7617453175682747, "step": 6600 }, { "loss": 1.2328216552734375, "grad_norm": 0.14509297907352448, "learning_rate": 1.5972400809558305e-08, "entropy": 1.2864874497056007, "num_tokens": 38354279.0, "mean_token_accuracy": 0.6910390242934227, "epoch": 2.7659307314010673, "step": 6610 }, { "loss": 1.3384916305541992, "grad_norm": 0.13343819975852966, "learning_rate": 1.541096728037322e-08, "entropy": 1.325030580163002, "num_tokens": 38412664.0, "mean_token_accuracy": 0.6726853728294373, "epoch": 2.77011614523386, "step": 6620 }, { "loss": 1.2254823684692382, "grad_norm": 0.1242731511592865, "learning_rate": 1.4859423908851976e-08, "entropy": 1.2863211989402772, "num_tokens": 38478157.0, "mean_token_accuracy": 0.6928248971700668, "epoch": 2.774301559066653, "step": 6630 }, { "loss": 1.3083304405212401, "grad_norm": 0.13707546889781952, "learning_rate": 1.43177819516484e-08, "entropy": 1.3157197803258895, "num_tokens": 38536491.0, "mean_token_accuracy": 0.6797169283032417, "epoch": 2.7784869728994455, "step": 6640 }, { "loss": 1.3308774948120117, "grad_norm": 0.18031752109527588, "learning_rate": 1.3786052463334363e-08, "entropy": 1.3267883569002152, "num_tokens": 38592693.0, "mean_token_accuracy": 0.6799778997898102, "epoch": 2.782672386732238, "step": 6650 }, { "loss": 1.254627799987793, "grad_norm": 0.13183258473873138, "learning_rate": 1.3264246296174675e-08, "entropy": 1.2955256581306458, "num_tokens": 38649528.0, "mean_token_accuracy": 0.6876810878515244, "epoch": 2.7868578005650306, "step": 6660 }, { "loss": 1.3457258224487305, "grad_norm": 0.13854053616523743, "learning_rate": 1.2752374099905371e-08, "entropy": 1.33312628865242, "num_tokens": 38706620.0, "mean_token_accuracy": 0.6743656143546104, "epoch": 2.7910432143978237, "step": 6670 }, { "loss": 1.3620613098144532, "grad_norm": 0.15011081099510193, "learning_rate": 1.2250446321516173e-08, "entropy": 1.3549015790224075, "num_tokens": 38764666.0, "mean_token_accuracy": 0.6692588478326797, "epoch": 2.7952286282306162, "step": 6680 }, { "loss": 1.259181022644043, "grad_norm": 0.15715329349040985, "learning_rate": 1.1758473205037812e-08, "entropy": 1.279186724126339, "num_tokens": 38823325.0, "mean_token_accuracy": 0.6928275167942047, "epoch": 2.7994140420634093, "step": 6690 }, { "loss": 1.2290419578552245, "grad_norm": 0.1388121098279953, "learning_rate": 1.127646479133243e-08, "entropy": 1.2684768080711364, "num_tokens": 38880457.0, "mean_token_accuracy": 0.694522102177143, "epoch": 2.803599455896202, "step": 6700 }, { "loss": 1.3010470390319824, "grad_norm": 0.15357019007205963, "learning_rate": 1.0804430917888795e-08, "entropy": 1.3155488684773444, "num_tokens": 38936834.0, "mean_token_accuracy": 0.6843758165836334, "epoch": 2.8077848697289944, "step": 6710 }, { "loss": 1.33712797164917, "grad_norm": 0.1296703815460205, "learning_rate": 1.0342381218621798e-08, "entropy": 1.3263304769992827, "num_tokens": 38992580.0, "mean_token_accuracy": 0.6779214948415756, "epoch": 2.811970283561787, "step": 6720 }, { "loss": 1.326553726196289, "grad_norm": 0.11933048069477081, "learning_rate": 9.890325123675324e-09, "entropy": 1.3206409364938736, "num_tokens": 39048629.0, "mean_token_accuracy": 0.6810004383325576, "epoch": 2.81615569739458, "step": 6730 }, { "loss": 1.2582717895507813, "grad_norm": 0.13642576336860657, "learning_rate": 9.44827185923036e-09, "entropy": 1.3072202578186989, "num_tokens": 39109993.0, "mean_token_accuracy": 0.6876247569918632, "epoch": 2.8203411112273726, "step": 6740 }, { "loss": 1.3292433738708496, "grad_norm": 0.14473669230937958, "learning_rate": 9.016230447316142e-09, "entropy": 1.312994186580181, "num_tokens": 39167300.0, "mean_token_accuracy": 0.6826678797602653, "epoch": 2.824526525060165, "step": 6750 }, { "loss": 1.2947887420654296, "grad_norm": 0.14324665069580078, "learning_rate": 8.59420970562652e-09, "entropy": 1.301099643111229, "num_tokens": 39225969.0, "mean_token_accuracy": 0.6844487801194191, "epoch": 2.8287119388929582, "step": 6760 }, { "loss": 1.3236183166503905, "grad_norm": 0.12903346121311188, "learning_rate": 8.182218247339557e-09, "entropy": 1.3240256026387214, "num_tokens": 39284458.0, "mean_token_accuracy": 0.6791065171360969, "epoch": 2.832897352725751, "step": 6770 }, { "loss": 1.2615336418151855, "grad_norm": 0.13563166558742523, "learning_rate": 7.7802644809421e-09, "entropy": 1.2816553667187691, "num_tokens": 39343117.0, "mean_token_accuracy": 0.6875470012426377, "epoch": 2.8370827665585434, "step": 6780 }, { "loss": 1.302596092224121, "grad_norm": 0.1292281448841095, "learning_rate": 7.388356610057878e-09, "entropy": 1.3054527580738067, "num_tokens": 39399894.0, "mean_token_accuracy": 0.6824864596128464, "epoch": 2.841268180391336, "step": 6790 }, { "loss": 1.266486930847168, "grad_norm": 0.13657227158546448, "learning_rate": 7.006502633280398e-09, "entropy": 1.2712685942649842, "num_tokens": 39460071.0, "mean_token_accuracy": 0.6891940608620644, "epoch": 2.845453594224129, "step": 6800 }, { "eval_loss": 1.2960532903671265, "eval_runtime": 43.6876, "eval_samples_per_second": 145.831, "eval_steps_per_second": 6.089, "eval_entropy": 1.3085124981134457, "eval_num_tokens": 39460071.0, "eval_mean_token_accuracy": 0.6840056230251054, "epoch": 2.845453594224129, "step": 6800 }, { "loss": 1.2665786743164062, "grad_norm": 0.13498687744140625, "learning_rate": 6.6347103440092534e-09, "entropy": 1.304034498333931, "num_tokens": 39521096.0, "mean_token_accuracy": 0.6852239608764649, "epoch": 2.8496390080569216, "step": 6810 }, { "loss": 1.2787626266479493, "grad_norm": 0.16180647909641266, "learning_rate": 6.272987330291635e-09, "entropy": 1.2805368885397912, "num_tokens": 39577586.0, "mean_token_accuracy": 0.6863118633627892, "epoch": 2.853824421889714, "step": 6820 }, { "loss": 1.298048973083496, "grad_norm": 0.14410941302776337, "learning_rate": 5.921340974666733e-09, "entropy": 1.315412837266922, "num_tokens": 39635685.0, "mean_token_accuracy": 0.6825838565826416, "epoch": 2.858009835722507, "step": 6830 }, { "loss": 1.2789037704467774, "grad_norm": 0.12575282156467438, "learning_rate": 5.57977845401586e-09, "entropy": 1.3064978927373887, "num_tokens": 39698219.0, "mean_token_accuracy": 0.6813527047634125, "epoch": 2.8621952495553, "step": 6840 }, { "loss": 1.2924142837524415, "grad_norm": 0.14836302399635315, "learning_rate": 5.248306739415453e-09, "entropy": 1.3290347814559937, "num_tokens": 39758992.0, "mean_token_accuracy": 0.6835691928863525, "epoch": 2.8663806633880924, "step": 6850 }, { "loss": 1.3144015312194823, "grad_norm": 0.1364041566848755, "learning_rate": 4.926932595994804e-09, "entropy": 1.3275774329900742, "num_tokens": 39818224.0, "mean_token_accuracy": 0.6781793549656868, "epoch": 2.870566077220885, "step": 6860 }, { "loss": 1.2337275505065919, "grad_norm": 0.15484337508678436, "learning_rate": 4.61566258279833e-09, "entropy": 1.2519328325986863, "num_tokens": 39875194.0, "mean_token_accuracy": 0.6923688799142838, "epoch": 2.874751491053678, "step": 6870 }, { "loss": 1.3397406578063964, "grad_norm": 0.13291706144809723, "learning_rate": 4.314503052651408e-09, "entropy": 1.3448736280202866, "num_tokens": 39930172.0, "mean_token_accuracy": 0.6757234945893288, "epoch": 2.8789369048864706, "step": 6880 }, { "loss": 1.337942886352539, "grad_norm": 0.1345345377922058, "learning_rate": 4.023460152030811e-09, "entropy": 1.3406923681497573, "num_tokens": 39989342.0, "mean_token_accuracy": 0.6747447595000267, "epoch": 2.8831223187192636, "step": 6890 }, { "loss": 1.2957025527954102, "grad_norm": 0.14848950505256653, "learning_rate": 3.74253982093925e-09, "entropy": 1.3245025753974915, "num_tokens": 40044850.0, "mean_token_accuracy": 0.6843625560402871, "epoch": 2.887307732552056, "step": 6900 }, { "loss": 1.2321189880371093, "grad_norm": 0.13322904706001282, "learning_rate": 3.471747792784141e-09, "entropy": 1.262104222178459, "num_tokens": 40102787.0, "mean_token_accuracy": 0.6951159760355949, "epoch": 2.891493146384849, "step": 6910 }, { "loss": 1.2964617729187011, "grad_norm": 0.11292250454425812, "learning_rate": 3.211089594260585e-09, "entropy": 1.3182623267173768, "num_tokens": 40163366.0, "mean_token_accuracy": 0.6819359913468361, "epoch": 2.8956785602176414, "step": 6920 }, { "loss": 1.3324262619018554, "grad_norm": 0.15124228596687317, "learning_rate": 2.9605705452387943e-09, "entropy": 1.3444043919444084, "num_tokens": 40221526.0, "mean_token_accuracy": 0.6710021272301674, "epoch": 2.8998639740504344, "step": 6930 }, { "loss": 1.2928382873535156, "grad_norm": 0.1519566923379898, "learning_rate": 2.7201957586550084e-09, "entropy": 1.299992610514164, "num_tokens": 40277117.0, "mean_token_accuracy": 0.6823776334524154, "epoch": 2.904049387883227, "step": 6940 }, { "loss": 1.2859591484069823, "grad_norm": 0.128694087266922, "learning_rate": 2.489970140407638e-09, "entropy": 1.3093111872673036, "num_tokens": 40334263.0, "mean_token_accuracy": 0.6845874279737473, "epoch": 2.9082348017160196, "step": 6950 }, { "loss": 1.2404520988464356, "grad_norm": 0.13099578022956848, "learning_rate": 2.2698983892568413e-09, "entropy": 1.2709315478801728, "num_tokens": 40388570.0, "mean_token_accuracy": 0.6908060133457183, "epoch": 2.9124202155488126, "step": 6960 }, { "loss": 1.30335693359375, "grad_norm": 0.12444788217544556, "learning_rate": 2.0599849967287696e-09, "entropy": 1.324267864227295, "num_tokens": 40447336.0, "mean_token_accuracy": 0.6855339229106903, "epoch": 2.916605629381605, "step": 6970 }, { "loss": 1.3318076133728027, "grad_norm": 0.14273810386657715, "learning_rate": 1.860234247023973e-09, "entropy": 1.345059370994568, "num_tokens": 40509083.0, "mean_token_accuracy": 0.6763337209820748, "epoch": 2.9207910432143978, "step": 6980 }, { "loss": 1.2318530082702637, "grad_norm": 0.13854491710662842, "learning_rate": 1.6706502169296366e-09, "entropy": 1.2656858801841735, "num_tokens": 40566757.0, "mean_token_accuracy": 0.6908913642168045, "epoch": 2.9249764570471903, "step": 6990 }, { "loss": 1.2753348350524902, "grad_norm": 0.1317194700241089, "learning_rate": 1.4912367757366485e-09, "entropy": 1.290731391310692, "num_tokens": 40626221.0, "mean_token_accuracy": 0.6856264978647232, "epoch": 2.9291618708799834, "step": 7000 }, { "eval_loss": 1.296015739440918, "eval_runtime": 42.3973, "eval_samples_per_second": 150.269, "eval_steps_per_second": 6.274, "eval_entropy": 1.3096338595662798, "eval_num_tokens": 40626221.0, "eval_mean_token_accuracy": 0.6839476989624196, "epoch": 2.9291618708799834, "step": 7000 }, { "loss": 1.3039022445678712, "grad_norm": 0.1216905489563942, "learning_rate": 1.3219975851607724e-09, "entropy": 1.3058283895254135, "num_tokens": 40684010.0, "mean_token_accuracy": 0.6836184665560723, "epoch": 2.933347284712776, "step": 7010 }, { "loss": 1.3275947570800781, "grad_norm": 0.15599027276039124, "learning_rate": 1.1629360992673754e-09, "entropy": 1.3284942299127578, "num_tokens": 40742081.0, "mean_token_accuracy": 0.6771559327840805, "epoch": 2.9375326985455685, "step": 7020 }, { "loss": 1.2864752769470216, "grad_norm": 0.12884531915187836, "learning_rate": 1.014055564401539e-09, "entropy": 1.2828212678432465, "num_tokens": 40801021.0, "mean_token_accuracy": 0.68373833745718, "epoch": 2.9417181123783616, "step": 7030 }, { "loss": 1.318696880340576, "grad_norm": 0.1270311027765274, "learning_rate": 8.753590191213356e-10, "entropy": 1.3187800377607346, "num_tokens": 40858738.0, "mean_token_accuracy": 0.6756755083799362, "epoch": 2.945903526211154, "step": 7040 }, { "loss": 1.274400520324707, "grad_norm": 0.15278004109859467, "learning_rate": 7.468492941362647e-10, "entropy": 1.2988332599401473, "num_tokens": 40917615.0, "mean_token_accuracy": 0.6798348844051361, "epoch": 2.9500889400439467, "step": 7050 }, { "loss": 1.2988153457641602, "grad_norm": 0.1352374255657196, "learning_rate": 6.285290122489128e-10, "entropy": 1.3167090728878974, "num_tokens": 40975462.0, "mean_token_accuracy": 0.6834619447588921, "epoch": 2.9542743538767393, "step": 7060 }, { "loss": 1.3265647888183594, "grad_norm": 0.13059331476688385, "learning_rate": 5.204005883019392e-10, "entropy": 1.325848352909088, "num_tokens": 41034824.0, "mean_token_accuracy": 0.6760321959853173, "epoch": 2.9584597677095323, "step": 7070 }, { "loss": 1.3572219848632812, "grad_norm": 0.14386902749538422, "learning_rate": 4.224662291285597e-10, "entropy": 1.356474344432354, "num_tokens": 41092574.0, "mean_token_accuracy": 0.6717446967959404, "epoch": 2.962645181542325, "step": 7080 }, { "loss": 1.362534523010254, "grad_norm": 0.12292881309986115, "learning_rate": 3.347279335074726e-10, "entropy": 1.3564091578125954, "num_tokens": 41151205.0, "mean_token_accuracy": 0.6701879113912582, "epoch": 2.966830595375118, "step": 7090 }, { "loss": 1.28519287109375, "grad_norm": 0.17997978627681732, "learning_rate": 2.571874921221129e-10, "entropy": 1.2821272403001784, "num_tokens": 41208932.0, "mean_token_accuracy": 0.6873822212219238, "epoch": 2.9710160092079105, "step": 7100 }, { "loss": 1.338641357421875, "grad_norm": 0.13907863199710846, "learning_rate": 1.8984648752429221e-10, "entropy": 1.3435455054044723, "num_tokens": 41265349.0, "mean_token_accuracy": 0.6759589716792107, "epoch": 2.975201423040703, "step": 7110 }, { "loss": 1.3811635971069336, "grad_norm": 0.13205569982528687, "learning_rate": 1.3270629410150335e-10, "entropy": 1.4000085026025773, "num_tokens": 41324164.0, "mean_token_accuracy": 0.6713890418410301, "epoch": 2.9793868368734957, "step": 7120 }, { "loss": 1.2836057662963867, "grad_norm": 0.13349401950836182, "learning_rate": 8.576807804921981e-11, "entropy": 1.3049908488988877, "num_tokens": 41383405.0, "mean_token_accuracy": 0.6830727905035019, "epoch": 2.9835722507062887, "step": 7130 }, { "loss": 1.2696043014526368, "grad_norm": 0.17530304193496704, "learning_rate": 4.903279734697063e-11, "entropy": 1.2787989050149917, "num_tokens": 41441198.0, "mean_token_accuracy": 0.6855189517140389, "epoch": 2.9877576645390813, "step": 7140 }, { "loss": 1.2740005493164062, "grad_norm": 0.15486350655555725, "learning_rate": 2.2501201738689414e-11, "entropy": 1.293800377845764, "num_tokens": 41496526.0, "mean_token_accuracy": 0.6852918058633805, "epoch": 2.991943078371874, "step": 7150 }, { "loss": 1.309797191619873, "grad_norm": 0.15995003283023834, "learning_rate": 6.173832717559779e-12, "entropy": 1.321927347779274, "num_tokens": 41546682.0, "mean_token_accuracy": 0.6836003750562668, "epoch": 2.996128492204667, "step": 7160 }, { "loss": 1.2691694259643556, "grad_norm": 0.2798561751842499, "learning_rate": 5.102351502417335e-14, "entropy": 1.282310128211975, "num_tokens": 41595613.0, "mean_token_accuracy": 0.6916061446473405, "epoch": 3.0, "step": 7170 }, { "train_runtime": 24564.0326, "train_samples_per_second": 7.003, "train_steps_per_second": 0.292, "total_flos": 1.1844488431738552e+18, "train_loss": 1.40203029108513, "epoch": 3.0, "step": 7170 } ]