{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9736963217332812, "eval_steps": 100, "global_step": 10500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002832159167345205, "grad_norm": 266.82720947265625, "learning_rate": 8.49056603773585e-08, "loss": 1.9703, "mean_token_accuracy": 0.26932485126890243, "num_tokens": 81920.0, "step": 10 }, { "epoch": 0.00566431833469041, "grad_norm": 113.50809478759766, "learning_rate": 1.7924528301886793e-07, "loss": 1.1437, "mean_token_accuracy": 0.5134050875902176, "num_tokens": 163840.0, "step": 20 }, { "epoch": 0.008496477502035615, "grad_norm": 26.898744583129883, "learning_rate": 2.735849056603774e-07, "loss": 0.5105, "mean_token_accuracy": 0.7163405127823352, "num_tokens": 245760.0, "step": 30 }, { "epoch": 0.01132863666938082, "grad_norm": 5.971513748168945, "learning_rate": 3.679245283018868e-07, "loss": 0.3397, "mean_token_accuracy": 0.7338307224214077, "num_tokens": 327680.0, "step": 40 }, { "epoch": 0.014160795836726024, "grad_norm": 2.588014602661133, "learning_rate": 4.622641509433962e-07, "loss": 0.2587, "mean_token_accuracy": 0.7489114515483379, "num_tokens": 409600.0, "step": 50 }, { "epoch": 0.01699295500407123, "grad_norm": 2.071500301361084, "learning_rate": 5.566037735849057e-07, "loss": 0.2269, "mean_token_accuracy": 0.7486179083585739, "num_tokens": 491520.0, "step": 60 }, { "epoch": 0.019825114171416434, "grad_norm": 2.0729641914367676, "learning_rate": 6.509433962264152e-07, "loss": 0.2169, "mean_token_accuracy": 0.7509662434458733, "num_tokens": 573440.0, "step": 70 }, { "epoch": 0.02265727333876164, "grad_norm": 2.071258544921875, "learning_rate": 7.452830188679246e-07, "loss": 0.2081, "mean_token_accuracy": 0.74461839646101, "num_tokens": 655360.0, "step": 80 }, { "epoch": 0.025489432506106843, "grad_norm": 1.6631486415863037, "learning_rate": 8.396226415094339e-07, "loss": 0.1976, "mean_token_accuracy": 0.7725538149476051, "num_tokens": 737280.0, "step": 90 }, { "epoch": 0.028321591673452048, "grad_norm": 1.8779876232147217, "learning_rate": 9.339622641509434e-07, "loss": 0.2339, "mean_token_accuracy": 0.7346746560186148, "num_tokens": 819200.0, "step": 100 }, { "epoch": 0.031153750840797252, "grad_norm": 1.9458152055740356, "learning_rate": 1.028301886792453e-06, "loss": 0.2159, "mean_token_accuracy": 0.7403008837252856, "num_tokens": 901120.0, "step": 110 }, { "epoch": 0.03398591000814246, "grad_norm": 2.350053071975708, "learning_rate": 1.1226415094339625e-06, "loss": 0.2016, "mean_token_accuracy": 0.7510640893131495, "num_tokens": 983040.0, "step": 120 }, { "epoch": 0.03681806917548766, "grad_norm": 1.6409040689468384, "learning_rate": 1.2169811320754718e-06, "loss": 0.1897, "mean_token_accuracy": 0.7467832688242197, "num_tokens": 1064960.0, "step": 130 }, { "epoch": 0.03965022834283287, "grad_norm": 2.1001358032226562, "learning_rate": 1.3113207547169812e-06, "loss": 0.1914, "mean_token_accuracy": 0.7644324846565723, "num_tokens": 1146880.0, "step": 140 }, { "epoch": 0.04248238751017807, "grad_norm": 1.8581039905548096, "learning_rate": 1.4056603773584907e-06, "loss": 0.188, "mean_token_accuracy": 0.7524094928056002, "num_tokens": 1228800.0, "step": 150 }, { "epoch": 0.04531454667752328, "grad_norm": 1.5342082977294922, "learning_rate": 1.5e-06, "loss": 0.1665, "mean_token_accuracy": 0.7694471593946218, "num_tokens": 1310720.0, "step": 160 }, { "epoch": 0.04814670584486848, "grad_norm": 1.8682209253311157, "learning_rate": 1.5943396226415096e-06, "loss": 0.2128, "mean_token_accuracy": 0.7429182983934879, "num_tokens": 1392640.0, "step": 170 }, { "epoch": 0.05097886501221369, "grad_norm": 1.4317920207977295, "learning_rate": 1.688679245283019e-06, "loss": 0.1809, "mean_token_accuracy": 0.7637964800000191, "num_tokens": 1474560.0, "step": 180 }, { "epoch": 0.053811024179558894, "grad_norm": 1.533575177192688, "learning_rate": 1.7830188679245283e-06, "loss": 0.2103, "mean_token_accuracy": 0.757950098067522, "num_tokens": 1556480.0, "step": 190 }, { "epoch": 0.056643183346904095, "grad_norm": 1.434243083000183, "learning_rate": 1.877358490566038e-06, "loss": 0.2003, "mean_token_accuracy": 0.740178570151329, "num_tokens": 1638400.0, "step": 200 }, { "epoch": 0.0594753425142493, "grad_norm": 1.764602780342102, "learning_rate": 1.9716981132075476e-06, "loss": 0.2021, "mean_token_accuracy": 0.7489114448428154, "num_tokens": 1720320.0, "step": 210 }, { "epoch": 0.062307501681594504, "grad_norm": 2.117680549621582, "learning_rate": 2.066037735849057e-06, "loss": 0.1785, "mean_token_accuracy": 0.7592955023050308, "num_tokens": 1802240.0, "step": 220 }, { "epoch": 0.06513966084893971, "grad_norm": 1.6958810091018677, "learning_rate": 2.1603773584905662e-06, "loss": 0.1683, "mean_token_accuracy": 0.7667441286146641, "num_tokens": 1884160.0, "step": 230 }, { "epoch": 0.06797182001628492, "grad_norm": 2.1871819496154785, "learning_rate": 2.2547169811320756e-06, "loss": 0.1999, "mean_token_accuracy": 0.7490459881722927, "num_tokens": 1966080.0, "step": 240 }, { "epoch": 0.07080397918363011, "grad_norm": 1.6220905780792236, "learning_rate": 2.349056603773585e-06, "loss": 0.1793, "mean_token_accuracy": 0.7611423686146737, "num_tokens": 2048000.0, "step": 250 }, { "epoch": 0.07363613835097532, "grad_norm": 1.859153151512146, "learning_rate": 2.4433962264150947e-06, "loss": 0.2042, "mean_token_accuracy": 0.7344055779278278, "num_tokens": 2129920.0, "step": 260 }, { "epoch": 0.07646829751832053, "grad_norm": 1.7373570203781128, "learning_rate": 2.537735849056604e-06, "loss": 0.1934, "mean_token_accuracy": 0.7504525419324637, "num_tokens": 2211840.0, "step": 270 }, { "epoch": 0.07930045668566574, "grad_norm": 1.7976300716400146, "learning_rate": 2.6320754716981134e-06, "loss": 0.1616, "mean_token_accuracy": 0.7769936423748731, "num_tokens": 2293760.0, "step": 280 }, { "epoch": 0.08213261585301095, "grad_norm": 1.6964890956878662, "learning_rate": 2.7264150943396227e-06, "loss": 0.2287, "mean_token_accuracy": 0.7370841491967439, "num_tokens": 2375680.0, "step": 290 }, { "epoch": 0.08496477502035614, "grad_norm": 1.399920105934143, "learning_rate": 2.820754716981132e-06, "loss": 0.1949, "mean_token_accuracy": 0.753412426635623, "num_tokens": 2457600.0, "step": 300 }, { "epoch": 0.08779693418770135, "grad_norm": 1.213510274887085, "learning_rate": 2.915094339622642e-06, "loss": 0.179, "mean_token_accuracy": 0.7716364972293377, "num_tokens": 2539520.0, "step": 310 }, { "epoch": 0.09062909335504656, "grad_norm": 2.2313196659088135, "learning_rate": 3.009433962264151e-06, "loss": 0.1994, "mean_token_accuracy": 0.7466487281024456, "num_tokens": 2621440.0, "step": 320 }, { "epoch": 0.09346125252239176, "grad_norm": 2.2745234966278076, "learning_rate": 3.1037735849056605e-06, "loss": 0.195, "mean_token_accuracy": 0.7552592940628529, "num_tokens": 2703360.0, "step": 330 }, { "epoch": 0.09629341168973696, "grad_norm": 1.491312026977539, "learning_rate": 3.19811320754717e-06, "loss": 0.1576, "mean_token_accuracy": 0.7606898214668035, "num_tokens": 2785280.0, "step": 340 }, { "epoch": 0.09912557085708217, "grad_norm": 1.663541316986084, "learning_rate": 3.292452830188679e-06, "loss": 0.1937, "mean_token_accuracy": 0.7564946185797453, "num_tokens": 2867200.0, "step": 350 }, { "epoch": 0.10195773002442737, "grad_norm": 2.4689948558807373, "learning_rate": 3.386792452830189e-06, "loss": 0.2032, "mean_token_accuracy": 0.744471624866128, "num_tokens": 2949120.0, "step": 360 }, { "epoch": 0.10478988919177258, "grad_norm": 1.5209397077560425, "learning_rate": 3.4811320754716982e-06, "loss": 0.1945, "mean_token_accuracy": 0.7584026407450437, "num_tokens": 3031040.0, "step": 370 }, { "epoch": 0.10762204835911779, "grad_norm": 2.1992151737213135, "learning_rate": 3.5754716981132076e-06, "loss": 0.1682, "mean_token_accuracy": 0.7578889414668083, "num_tokens": 3112960.0, "step": 380 }, { "epoch": 0.11045420752646298, "grad_norm": 1.4862502813339233, "learning_rate": 3.669811320754717e-06, "loss": 0.163, "mean_token_accuracy": 0.7855063613504172, "num_tokens": 3194880.0, "step": 390 }, { "epoch": 0.11328636669380819, "grad_norm": 1.8908488750457764, "learning_rate": 3.764150943396227e-06, "loss": 0.1833, "mean_token_accuracy": 0.7670621357858181, "num_tokens": 3276800.0, "step": 400 }, { "epoch": 0.1161185258611534, "grad_norm": 2.041515350341797, "learning_rate": 3.8584905660377364e-06, "loss": 0.1947, "mean_token_accuracy": 0.7530699595808983, "num_tokens": 3358720.0, "step": 410 }, { "epoch": 0.1189506850284986, "grad_norm": 2.2269811630249023, "learning_rate": 3.952830188679246e-06, "loss": 0.1928, "mean_token_accuracy": 0.737194225937128, "num_tokens": 3440640.0, "step": 420 }, { "epoch": 0.1217828441958438, "grad_norm": 1.5617631673812866, "learning_rate": 4.047169811320755e-06, "loss": 0.185, "mean_token_accuracy": 0.7586839504539966, "num_tokens": 3522560.0, "step": 430 }, { "epoch": 0.12461500336318901, "grad_norm": 1.7558482885360718, "learning_rate": 4.141509433962265e-06, "loss": 0.1947, "mean_token_accuracy": 0.7349804311990737, "num_tokens": 3604480.0, "step": 440 }, { "epoch": 0.1274471625305342, "grad_norm": 2.1484475135803223, "learning_rate": 4.235849056603774e-06, "loss": 0.1682, "mean_token_accuracy": 0.7538405071943999, "num_tokens": 3686400.0, "step": 450 }, { "epoch": 0.13027932169787942, "grad_norm": 1.708686351776123, "learning_rate": 4.3301886792452836e-06, "loss": 0.1855, "mean_token_accuracy": 0.7438111562281847, "num_tokens": 3768320.0, "step": 460 }, { "epoch": 0.13311148086522462, "grad_norm": 2.117609739303589, "learning_rate": 4.424528301886793e-06, "loss": 0.1602, "mean_token_accuracy": 0.7443003915250301, "num_tokens": 3850240.0, "step": 470 }, { "epoch": 0.13594364003256984, "grad_norm": 1.7930119037628174, "learning_rate": 4.518867924528302e-06, "loss": 0.1691, "mean_token_accuracy": 0.7597113497555256, "num_tokens": 3932160.0, "step": 480 }, { "epoch": 0.13877579919991503, "grad_norm": 1.7393847703933716, "learning_rate": 4.613207547169812e-06, "loss": 0.1905, "mean_token_accuracy": 0.7555283781141042, "num_tokens": 4014080.0, "step": 490 }, { "epoch": 0.14160795836726023, "grad_norm": 1.5610122680664062, "learning_rate": 4.707547169811321e-06, "loss": 0.1943, "mean_token_accuracy": 0.7373776897788048, "num_tokens": 4096000.0, "step": 500 }, { "epoch": 0.14444011753460545, "grad_norm": 1.9428974390029907, "learning_rate": 4.801886792452831e-06, "loss": 0.174, "mean_token_accuracy": 0.7455234818160534, "num_tokens": 4177920.0, "step": 510 }, { "epoch": 0.14727227670195064, "grad_norm": 1.3793879747390747, "learning_rate": 4.8962264150943404e-06, "loss": 0.1844, "mean_token_accuracy": 0.7617783773690462, "num_tokens": 4259840.0, "step": 520 }, { "epoch": 0.15010443586929587, "grad_norm": 1.913741111755371, "learning_rate": 4.990566037735849e-06, "loss": 0.1915, "mean_token_accuracy": 0.7580357156693935, "num_tokens": 4341760.0, "step": 530 }, { "epoch": 0.15293659503664106, "grad_norm": 1.7148768901824951, "learning_rate": 5.084905660377359e-06, "loss": 0.1826, "mean_token_accuracy": 0.7776908032596112, "num_tokens": 4423680.0, "step": 540 }, { "epoch": 0.15576875420398625, "grad_norm": 1.5805952548980713, "learning_rate": 5.179245283018869e-06, "loss": 0.1717, "mean_token_accuracy": 0.7660102739930152, "num_tokens": 4505600.0, "step": 550 }, { "epoch": 0.15860091337133148, "grad_norm": 2.2117791175842285, "learning_rate": 5.273584905660379e-06, "loss": 0.1799, "mean_token_accuracy": 0.7643346380442381, "num_tokens": 4587520.0, "step": 560 }, { "epoch": 0.16143307253867667, "grad_norm": 2.560072898864746, "learning_rate": 5.3679245283018875e-06, "loss": 0.1645, "mean_token_accuracy": 0.7761619407683611, "num_tokens": 4669440.0, "step": 570 }, { "epoch": 0.1642652317060219, "grad_norm": 1.3699485063552856, "learning_rate": 5.462264150943397e-06, "loss": 0.1922, "mean_token_accuracy": 0.7476883552968502, "num_tokens": 4751360.0, "step": 580 }, { "epoch": 0.16709739087336709, "grad_norm": 2.1325595378875732, "learning_rate": 5.556603773584906e-06, "loss": 0.2123, "mean_token_accuracy": 0.7427103746682405, "num_tokens": 4833280.0, "step": 590 }, { "epoch": 0.16992955004071228, "grad_norm": 1.8622149229049683, "learning_rate": 5.650943396226416e-06, "loss": 0.1742, "mean_token_accuracy": 0.7783268127590418, "num_tokens": 4915200.0, "step": 600 }, { "epoch": 0.1727617092080575, "grad_norm": 1.980405569076538, "learning_rate": 5.745283018867926e-06, "loss": 0.1746, "mean_token_accuracy": 0.762096381559968, "num_tokens": 4997120.0, "step": 610 }, { "epoch": 0.1755938683754027, "grad_norm": 1.8079500198364258, "learning_rate": 5.839622641509435e-06, "loss": 0.1843, "mean_token_accuracy": 0.754831212759018, "num_tokens": 5079040.0, "step": 620 }, { "epoch": 0.1784260275427479, "grad_norm": 1.4955353736877441, "learning_rate": 5.933962264150944e-06, "loss": 0.1798, "mean_token_accuracy": 0.7440679997205735, "num_tokens": 5160960.0, "step": 630 }, { "epoch": 0.1812581867100931, "grad_norm": 1.749066948890686, "learning_rate": 6.028301886792453e-06, "loss": 0.2077, "mean_token_accuracy": 0.7515533279627562, "num_tokens": 5242880.0, "step": 640 }, { "epoch": 0.1840903458774383, "grad_norm": 1.4171102046966553, "learning_rate": 6.122641509433963e-06, "loss": 0.1739, "mean_token_accuracy": 0.77824119143188, "num_tokens": 5324800.0, "step": 650 }, { "epoch": 0.18692250504478353, "grad_norm": 1.8210899829864502, "learning_rate": 6.216981132075473e-06, "loss": 0.2088, "mean_token_accuracy": 0.7350538142025471, "num_tokens": 5406720.0, "step": 660 }, { "epoch": 0.18975466421212872, "grad_norm": 1.6177716255187988, "learning_rate": 6.311320754716982e-06, "loss": 0.2077, "mean_token_accuracy": 0.7522382594645023, "num_tokens": 5488640.0, "step": 670 }, { "epoch": 0.19258682337947391, "grad_norm": 1.6177340745925903, "learning_rate": 6.4056603773584915e-06, "loss": 0.1824, "mean_token_accuracy": 0.7379158493131399, "num_tokens": 5570560.0, "step": 680 }, { "epoch": 0.19541898254681914, "grad_norm": 1.8311525583267212, "learning_rate": 6.5000000000000004e-06, "loss": 0.1753, "mean_token_accuracy": 0.772040119022131, "num_tokens": 5652480.0, "step": 690 }, { "epoch": 0.19825114171416433, "grad_norm": 1.4334756135940552, "learning_rate": 6.59433962264151e-06, "loss": 0.1995, "mean_token_accuracy": 0.735653131082654, "num_tokens": 5734400.0, "step": 700 }, { "epoch": 0.20108330088150955, "grad_norm": 1.3499442338943481, "learning_rate": 6.68867924528302e-06, "loss": 0.1767, "mean_token_accuracy": 0.7688723120838403, "num_tokens": 5816320.0, "step": 710 }, { "epoch": 0.20391546004885475, "grad_norm": 1.579254388809204, "learning_rate": 6.783018867924529e-06, "loss": 0.1674, "mean_token_accuracy": 0.7721868887543678, "num_tokens": 5898240.0, "step": 720 }, { "epoch": 0.20674761921619994, "grad_norm": 1.6362155675888062, "learning_rate": 6.877358490566039e-06, "loss": 0.1811, "mean_token_accuracy": 0.7621208380907774, "num_tokens": 5980160.0, "step": 730 }, { "epoch": 0.20957977838354516, "grad_norm": 1.319257378578186, "learning_rate": 6.9716981132075476e-06, "loss": 0.1791, "mean_token_accuracy": 0.7685665376484394, "num_tokens": 6062080.0, "step": 740 }, { "epoch": 0.21241193755089036, "grad_norm": 1.20759916305542, "learning_rate": 7.066037735849057e-06, "loss": 0.1881, "mean_token_accuracy": 0.7567514650523662, "num_tokens": 6144000.0, "step": 750 }, { "epoch": 0.21524409671823558, "grad_norm": 1.620995044708252, "learning_rate": 7.160377358490567e-06, "loss": 0.1977, "mean_token_accuracy": 0.7412793543189764, "num_tokens": 6225920.0, "step": 760 }, { "epoch": 0.21807625588558077, "grad_norm": 1.9897656440734863, "learning_rate": 7.254716981132076e-06, "loss": 0.2096, "mean_token_accuracy": 0.731763694807887, "num_tokens": 6307840.0, "step": 770 }, { "epoch": 0.22090841505292597, "grad_norm": 1.6105620861053467, "learning_rate": 7.349056603773586e-06, "loss": 0.1811, "mean_token_accuracy": 0.7511252447962761, "num_tokens": 6389760.0, "step": 780 }, { "epoch": 0.2237405742202712, "grad_norm": 1.685012698173523, "learning_rate": 7.443396226415095e-06, "loss": 0.2011, "mean_token_accuracy": 0.7308096878230572, "num_tokens": 6471680.0, "step": 790 }, { "epoch": 0.22657273338761638, "grad_norm": 1.711360216140747, "learning_rate": 7.5377358490566044e-06, "loss": 0.1684, "mean_token_accuracy": 0.7621575362980366, "num_tokens": 6553600.0, "step": 800 }, { "epoch": 0.22940489255496158, "grad_norm": 1.4329050779342651, "learning_rate": 7.632075471698114e-06, "loss": 0.1714, "mean_token_accuracy": 0.7760273981839418, "num_tokens": 6635520.0, "step": 810 }, { "epoch": 0.2322370517223068, "grad_norm": 1.5542831420898438, "learning_rate": 7.726415094339623e-06, "loss": 0.1808, "mean_token_accuracy": 0.7752568498253822, "num_tokens": 6717440.0, "step": 820 }, { "epoch": 0.235069210889652, "grad_norm": 1.6315128803253174, "learning_rate": 7.820754716981132e-06, "loss": 0.1725, "mean_token_accuracy": 0.7600415855646133, "num_tokens": 6799360.0, "step": 830 }, { "epoch": 0.2379013700569972, "grad_norm": 1.826146125793457, "learning_rate": 7.915094339622643e-06, "loss": 0.2227, "mean_token_accuracy": 0.7291095897555351, "num_tokens": 6881280.0, "step": 840 }, { "epoch": 0.2407335292243424, "grad_norm": 1.3974148035049438, "learning_rate": 8.009433962264152e-06, "loss": 0.1958, "mean_token_accuracy": 0.7599926635622978, "num_tokens": 6963200.0, "step": 850 }, { "epoch": 0.2435656883916876, "grad_norm": 2.1663241386413574, "learning_rate": 8.10377358490566e-06, "loss": 0.1748, "mean_token_accuracy": 0.7746697634458541, "num_tokens": 7045120.0, "step": 860 }, { "epoch": 0.24639784755903282, "grad_norm": 1.7251940965652466, "learning_rate": 8.198113207547171e-06, "loss": 0.1753, "mean_token_accuracy": 0.7653498005121946, "num_tokens": 7127040.0, "step": 870 }, { "epoch": 0.24923000672637802, "grad_norm": 1.5113860368728638, "learning_rate": 8.29245283018868e-06, "loss": 0.1682, "mean_token_accuracy": 0.7651785716414452, "num_tokens": 7208960.0, "step": 880 }, { "epoch": 0.2520621658937232, "grad_norm": 1.8497939109802246, "learning_rate": 8.386792452830189e-06, "loss": 0.1857, "mean_token_accuracy": 0.7495229963213206, "num_tokens": 7290880.0, "step": 890 }, { "epoch": 0.2548943250610684, "grad_norm": 1.4041365385055542, "learning_rate": 8.481132075471698e-06, "loss": 0.2014, "mean_token_accuracy": 0.7499388515949249, "num_tokens": 7372800.0, "step": 900 }, { "epoch": 0.25772648422841365, "grad_norm": 1.8879481554031372, "learning_rate": 8.575471698113208e-06, "loss": 0.1833, "mean_token_accuracy": 0.7608488276600838, "num_tokens": 7454720.0, "step": 910 }, { "epoch": 0.26055864339575885, "grad_norm": 1.7014100551605225, "learning_rate": 8.669811320754717e-06, "loss": 0.1728, "mean_token_accuracy": 0.7700831688940525, "num_tokens": 7536640.0, "step": 920 }, { "epoch": 0.26339080256310404, "grad_norm": 1.5690687894821167, "learning_rate": 8.764150943396226e-06, "loss": 0.1866, "mean_token_accuracy": 0.756568007916212, "num_tokens": 7618560.0, "step": 930 }, { "epoch": 0.26622296173044924, "grad_norm": 2.307182550430298, "learning_rate": 8.858490566037737e-06, "loss": 0.1565, "mean_token_accuracy": 0.7694104690104723, "num_tokens": 7700480.0, "step": 940 }, { "epoch": 0.26905512089779443, "grad_norm": 1.7262402772903442, "learning_rate": 8.952830188679246e-06, "loss": 0.1834, "mean_token_accuracy": 0.759197648242116, "num_tokens": 7782400.0, "step": 950 }, { "epoch": 0.2718872800651397, "grad_norm": 1.8886555433273315, "learning_rate": 9.047169811320755e-06, "loss": 0.1756, "mean_token_accuracy": 0.7790362030267716, "num_tokens": 7864320.0, "step": 960 }, { "epoch": 0.2747194392324849, "grad_norm": 1.3037172555923462, "learning_rate": 9.141509433962265e-06, "loss": 0.1779, "mean_token_accuracy": 0.7587818037718534, "num_tokens": 7946240.0, "step": 970 }, { "epoch": 0.27755159839983007, "grad_norm": 1.4751389026641846, "learning_rate": 9.235849056603774e-06, "loss": 0.2168, "mean_token_accuracy": 0.747052350640297, "num_tokens": 8028160.0, "step": 980 }, { "epoch": 0.28038375756717526, "grad_norm": 1.681829571723938, "learning_rate": 9.330188679245283e-06, "loss": 0.1992, "mean_token_accuracy": 0.7572773993015289, "num_tokens": 8110080.0, "step": 990 }, { "epoch": 0.28321591673452046, "grad_norm": 1.6176552772521973, "learning_rate": 9.424528301886792e-06, "loss": 0.1823, "mean_token_accuracy": 0.7453522514551878, "num_tokens": 8192000.0, "step": 1000 }, { "epoch": 0.2860480759018657, "grad_norm": 2.524972677230835, "learning_rate": 9.518867924528303e-06, "loss": 0.1856, "mean_token_accuracy": 0.7482142861932516, "num_tokens": 8273920.0, "step": 1010 }, { "epoch": 0.2888802350692109, "grad_norm": 1.5858063697814941, "learning_rate": 9.613207547169812e-06, "loss": 0.1634, "mean_token_accuracy": 0.7739970661699772, "num_tokens": 8355840.0, "step": 1020 }, { "epoch": 0.2917123942365561, "grad_norm": 1.5420701503753662, "learning_rate": 9.70754716981132e-06, "loss": 0.1842, "mean_token_accuracy": 0.7614236760884523, "num_tokens": 8437760.0, "step": 1030 }, { "epoch": 0.2945445534039013, "grad_norm": 1.387930989265442, "learning_rate": 9.801886792452831e-06, "loss": 0.1768, "mean_token_accuracy": 0.7464530315250159, "num_tokens": 8519680.0, "step": 1040 }, { "epoch": 0.2973767125712465, "grad_norm": 1.4573379755020142, "learning_rate": 9.89622641509434e-06, "loss": 0.1768, "mean_token_accuracy": 0.7664016596972942, "num_tokens": 8601600.0, "step": 1050 }, { "epoch": 0.30020887173859173, "grad_norm": 1.6145235300064087, "learning_rate": 9.990566037735849e-06, "loss": 0.1941, "mean_token_accuracy": 0.746453034132719, "num_tokens": 8683520.0, "step": 1060 }, { "epoch": 0.3030410309059369, "grad_norm": 1.4419227838516235, "learning_rate": 9.990559110458409e-06, "loss": 0.1666, "mean_token_accuracy": 0.7650807224214077, "num_tokens": 8765440.0, "step": 1070 }, { "epoch": 0.3058731900732821, "grad_norm": 1.5552211999893188, "learning_rate": 9.980069233189973e-06, "loss": 0.1791, "mean_token_accuracy": 0.7611545950174332, "num_tokens": 8847360.0, "step": 1080 }, { "epoch": 0.3087053492406273, "grad_norm": 2.0157933235168457, "learning_rate": 9.969579355921536e-06, "loss": 0.1761, "mean_token_accuracy": 0.7626590002328157, "num_tokens": 8929280.0, "step": 1090 }, { "epoch": 0.3115375084079725, "grad_norm": 1.5939455032348633, "learning_rate": 9.9590894786531e-06, "loss": 0.1876, "mean_token_accuracy": 0.7538405060768127, "num_tokens": 9011200.0, "step": 1100 }, { "epoch": 0.31436966757531776, "grad_norm": 1.5303661823272705, "learning_rate": 9.948599601384664e-06, "loss": 0.1857, "mean_token_accuracy": 0.7512720163911581, "num_tokens": 9093120.0, "step": 1110 }, { "epoch": 0.31720182674266295, "grad_norm": 1.552887201309204, "learning_rate": 9.938109724116229e-06, "loss": 0.1967, "mean_token_accuracy": 0.7368884541094303, "num_tokens": 9175040.0, "step": 1120 }, { "epoch": 0.32003398591000815, "grad_norm": 1.7698150873184204, "learning_rate": 9.927619846847793e-06, "loss": 0.1841, "mean_token_accuracy": 0.7445694729685783, "num_tokens": 9256960.0, "step": 1130 }, { "epoch": 0.32286614507735334, "grad_norm": 1.4545252323150635, "learning_rate": 9.917129969579356e-06, "loss": 0.1884, "mean_token_accuracy": 0.7585983354598284, "num_tokens": 9338880.0, "step": 1140 }, { "epoch": 0.32569830424469853, "grad_norm": 1.4329719543457031, "learning_rate": 9.906640092310922e-06, "loss": 0.1614, "mean_token_accuracy": 0.7699975535273552, "num_tokens": 9420800.0, "step": 1150 }, { "epoch": 0.3285304634120438, "grad_norm": 1.382524847984314, "learning_rate": 9.896150215042485e-06, "loss": 0.1851, "mean_token_accuracy": 0.7470645803958178, "num_tokens": 9502720.0, "step": 1160 }, { "epoch": 0.331362622579389, "grad_norm": 1.696998953819275, "learning_rate": 9.885660337774049e-06, "loss": 0.1773, "mean_token_accuracy": 0.7684319972991943, "num_tokens": 9584640.0, "step": 1170 }, { "epoch": 0.33419478174673417, "grad_norm": 1.4591037034988403, "learning_rate": 9.875170460505613e-06, "loss": 0.1859, "mean_token_accuracy": 0.7473825812339783, "num_tokens": 9666560.0, "step": 1180 }, { "epoch": 0.33702694091407936, "grad_norm": 1.4047170877456665, "learning_rate": 9.864680583237178e-06, "loss": 0.1928, "mean_token_accuracy": 0.7406678073108196, "num_tokens": 9748480.0, "step": 1190 }, { "epoch": 0.33985910008142456, "grad_norm": 1.5141708850860596, "learning_rate": 9.854190705968742e-06, "loss": 0.1789, "mean_token_accuracy": 0.7470645792782307, "num_tokens": 9830400.0, "step": 1200 }, { "epoch": 0.3426912592487698, "grad_norm": 1.480858564376831, "learning_rate": 9.843700828700305e-06, "loss": 0.1753, "mean_token_accuracy": 0.7382827743887901, "num_tokens": 9912320.0, "step": 1210 }, { "epoch": 0.345523418416115, "grad_norm": 1.7005478143692017, "learning_rate": 9.833210951431869e-06, "loss": 0.1779, "mean_token_accuracy": 0.7673067521303892, "num_tokens": 9994240.0, "step": 1220 }, { "epoch": 0.3483555775834602, "grad_norm": 1.1456356048583984, "learning_rate": 9.822721074163432e-06, "loss": 0.1622, "mean_token_accuracy": 0.7759295474737883, "num_tokens": 10076160.0, "step": 1230 }, { "epoch": 0.3511877367508054, "grad_norm": 1.5683114528656006, "learning_rate": 9.812231196894996e-06, "loss": 0.1566, "mean_token_accuracy": 0.7548679035156965, "num_tokens": 10158080.0, "step": 1240 }, { "epoch": 0.3540198959181506, "grad_norm": 1.7464343309402466, "learning_rate": 9.801741319626561e-06, "loss": 0.187, "mean_token_accuracy": 0.762108613550663, "num_tokens": 10240000.0, "step": 1250 }, { "epoch": 0.3568520550854958, "grad_norm": 1.6385228633880615, "learning_rate": 9.791251442358125e-06, "loss": 0.1741, "mean_token_accuracy": 0.7760885510593653, "num_tokens": 10321920.0, "step": 1260 }, { "epoch": 0.35968421425284103, "grad_norm": 1.4134849309921265, "learning_rate": 9.780761565089689e-06, "loss": 0.1746, "mean_token_accuracy": 0.7730430532246828, "num_tokens": 10403840.0, "step": 1270 }, { "epoch": 0.3625163734201862, "grad_norm": 1.717035174369812, "learning_rate": 9.770271687821254e-06, "loss": 0.206, "mean_token_accuracy": 0.7483977485448122, "num_tokens": 10485760.0, "step": 1280 }, { "epoch": 0.3653485325875314, "grad_norm": 1.2547250986099243, "learning_rate": 9.759781810552818e-06, "loss": 0.1897, "mean_token_accuracy": 0.7486179027706384, "num_tokens": 10567680.0, "step": 1290 }, { "epoch": 0.3681806917548766, "grad_norm": 1.6868046522140503, "learning_rate": 9.749291933284381e-06, "loss": 0.1805, "mean_token_accuracy": 0.765325341373682, "num_tokens": 10649600.0, "step": 1300 }, { "epoch": 0.3710128509222218, "grad_norm": 1.5401535034179688, "learning_rate": 9.738802056015945e-06, "loss": 0.1612, "mean_token_accuracy": 0.75227495059371, "num_tokens": 10731520.0, "step": 1310 }, { "epoch": 0.37384501008956705, "grad_norm": 1.3699342012405396, "learning_rate": 9.72831217874751e-06, "loss": 0.1884, "mean_token_accuracy": 0.7462940290570259, "num_tokens": 10813440.0, "step": 1320 }, { "epoch": 0.37667716925691225, "grad_norm": 1.4594292640686035, "learning_rate": 9.717822301479074e-06, "loss": 0.1651, "mean_token_accuracy": 0.7724070448428393, "num_tokens": 10895360.0, "step": 1330 }, { "epoch": 0.37950932842425744, "grad_norm": 1.6993435621261597, "learning_rate": 9.707332424210638e-06, "loss": 0.1753, "mean_token_accuracy": 0.7441780816763639, "num_tokens": 10977280.0, "step": 1340 }, { "epoch": 0.38234148759160264, "grad_norm": 1.5424118041992188, "learning_rate": 9.696842546942201e-06, "loss": 0.166, "mean_token_accuracy": 0.7565435383468866, "num_tokens": 11059200.0, "step": 1350 }, { "epoch": 0.38517364675894783, "grad_norm": 1.8051396608352661, "learning_rate": 9.686352669673765e-06, "loss": 0.1925, "mean_token_accuracy": 0.7536937363445759, "num_tokens": 11141120.0, "step": 1360 }, { "epoch": 0.3880058059262931, "grad_norm": 1.0824856758117676, "learning_rate": 9.675862792405329e-06, "loss": 0.1479, "mean_token_accuracy": 0.7759662423282861, "num_tokens": 11223040.0, "step": 1370 }, { "epoch": 0.3908379650936383, "grad_norm": 1.5008798837661743, "learning_rate": 9.665372915136894e-06, "loss": 0.1912, "mean_token_accuracy": 0.7346012704074383, "num_tokens": 11304960.0, "step": 1380 }, { "epoch": 0.39367012426098347, "grad_norm": 1.494032382965088, "learning_rate": 9.654883037868458e-06, "loss": 0.2023, "mean_token_accuracy": 0.7443003918975591, "num_tokens": 11386880.0, "step": 1390 }, { "epoch": 0.39650228342832866, "grad_norm": 1.5937719345092773, "learning_rate": 9.644393160600021e-06, "loss": 0.1821, "mean_token_accuracy": 0.7559686895459891, "num_tokens": 11468800.0, "step": 1400 }, { "epoch": 0.39933444259567386, "grad_norm": 1.3472212553024292, "learning_rate": 9.633903283331587e-06, "loss": 0.1827, "mean_token_accuracy": 0.7601394332945347, "num_tokens": 11550720.0, "step": 1410 }, { "epoch": 0.4021666017630191, "grad_norm": 1.5027636289596558, "learning_rate": 9.62341340606315e-06, "loss": 0.1789, "mean_token_accuracy": 0.7659491177648305, "num_tokens": 11632640.0, "step": 1420 }, { "epoch": 0.4049987609303643, "grad_norm": 1.5893715620040894, "learning_rate": 9.612923528794714e-06, "loss": 0.189, "mean_token_accuracy": 0.7464408036321402, "num_tokens": 11714560.0, "step": 1430 }, { "epoch": 0.4078309200977095, "grad_norm": 1.4710123538970947, "learning_rate": 9.602433651526278e-06, "loss": 0.1735, "mean_token_accuracy": 0.7448018580675125, "num_tokens": 11796480.0, "step": 1440 }, { "epoch": 0.4106630792650547, "grad_norm": 1.8729687929153442, "learning_rate": 9.591943774257843e-06, "loss": 0.1783, "mean_token_accuracy": 0.7488992147147655, "num_tokens": 11878400.0, "step": 1450 }, { "epoch": 0.4134952384323999, "grad_norm": 1.1461834907531738, "learning_rate": 9.581453896989405e-06, "loss": 0.1938, "mean_token_accuracy": 0.7519447162747384, "num_tokens": 11960320.0, "step": 1460 }, { "epoch": 0.41632739759974513, "grad_norm": 1.3577286005020142, "learning_rate": 9.57096401972097e-06, "loss": 0.198, "mean_token_accuracy": 0.7434809241443873, "num_tokens": 12042240.0, "step": 1470 }, { "epoch": 0.4191595567670903, "grad_norm": 1.5820889472961426, "learning_rate": 9.560474142452534e-06, "loss": 0.1828, "mean_token_accuracy": 0.7584637984633446, "num_tokens": 12124160.0, "step": 1480 }, { "epoch": 0.4219917159344355, "grad_norm": 1.5319749116897583, "learning_rate": 9.549984265184098e-06, "loss": 0.1718, "mean_token_accuracy": 0.7484466746449471, "num_tokens": 12206080.0, "step": 1490 }, { "epoch": 0.4248238751017807, "grad_norm": 1.5034176111221313, "learning_rate": 9.539494387915663e-06, "loss": 0.2045, "mean_token_accuracy": 0.7461350295692682, "num_tokens": 12288000.0, "step": 1500 }, { "epoch": 0.4276560342691259, "grad_norm": 1.3665423393249512, "learning_rate": 9.529004510647227e-06, "loss": 0.1868, "mean_token_accuracy": 0.7508561633527279, "num_tokens": 12369920.0, "step": 1510 }, { "epoch": 0.43048819343647116, "grad_norm": 1.5005970001220703, "learning_rate": 9.51851463337879e-06, "loss": 0.1679, "mean_token_accuracy": 0.7619006883352994, "num_tokens": 12451840.0, "step": 1520 }, { "epoch": 0.43332035260381635, "grad_norm": 1.434239387512207, "learning_rate": 9.508024756110354e-06, "loss": 0.1736, "mean_token_accuracy": 0.7496942266821861, "num_tokens": 12533760.0, "step": 1530 }, { "epoch": 0.43615251177116154, "grad_norm": 1.8729488849639893, "learning_rate": 9.49753487884192e-06, "loss": 0.1912, "mean_token_accuracy": 0.7433586075901986, "num_tokens": 12615680.0, "step": 1540 }, { "epoch": 0.43898467093850674, "grad_norm": 1.6623575687408447, "learning_rate": 9.487045001573483e-06, "loss": 0.1895, "mean_token_accuracy": 0.7445327788591385, "num_tokens": 12697600.0, "step": 1550 }, { "epoch": 0.44181683010585193, "grad_norm": 1.6611781120300293, "learning_rate": 9.476555124305047e-06, "loss": 0.1696, "mean_token_accuracy": 0.769838546589017, "num_tokens": 12779520.0, "step": 1560 }, { "epoch": 0.4446489892731971, "grad_norm": 1.4206360578536987, "learning_rate": 9.46606524703661e-06, "loss": 0.158, "mean_token_accuracy": 0.7799290582537651, "num_tokens": 12861440.0, "step": 1570 }, { "epoch": 0.4474811484405424, "grad_norm": 1.3145809173583984, "learning_rate": 9.455575369768174e-06, "loss": 0.1728, "mean_token_accuracy": 0.7565435416996479, "num_tokens": 12943360.0, "step": 1580 }, { "epoch": 0.45031330760788757, "grad_norm": 1.5584267377853394, "learning_rate": 9.445085492499737e-06, "loss": 0.1624, "mean_token_accuracy": 0.7784858159720898, "num_tokens": 13025280.0, "step": 1590 }, { "epoch": 0.45314546677523276, "grad_norm": 1.6326663494110107, "learning_rate": 9.434595615231303e-06, "loss": 0.1881, "mean_token_accuracy": 0.7289505872875452, "num_tokens": 13107200.0, "step": 1600 }, { "epoch": 0.45597762594257796, "grad_norm": 1.6709638833999634, "learning_rate": 9.424105737962866e-06, "loss": 0.2019, "mean_token_accuracy": 0.7344545021653175, "num_tokens": 13189120.0, "step": 1610 }, { "epoch": 0.45880978510992315, "grad_norm": 1.4046727418899536, "learning_rate": 9.41361586069443e-06, "loss": 0.1831, "mean_token_accuracy": 0.7332558706402779, "num_tokens": 13271040.0, "step": 1620 }, { "epoch": 0.4616419442772684, "grad_norm": 1.172830581665039, "learning_rate": 9.403125983425995e-06, "loss": 0.1723, "mean_token_accuracy": 0.7792196672409772, "num_tokens": 13352960.0, "step": 1630 }, { "epoch": 0.4644741034446136, "grad_norm": 1.5077433586120605, "learning_rate": 9.392636106157559e-06, "loss": 0.1579, "mean_token_accuracy": 0.7679672226309776, "num_tokens": 13434880.0, "step": 1640 }, { "epoch": 0.4673062626119588, "grad_norm": 1.376905083656311, "learning_rate": 9.382146228889123e-06, "loss": 0.154, "mean_token_accuracy": 0.7797822885215282, "num_tokens": 13516800.0, "step": 1650 }, { "epoch": 0.470138421779304, "grad_norm": 1.7126880884170532, "learning_rate": 9.371656351620686e-06, "loss": 0.1697, "mean_token_accuracy": 0.7503669280558825, "num_tokens": 13598720.0, "step": 1660 }, { "epoch": 0.4729705809466492, "grad_norm": 1.1549543142318726, "learning_rate": 9.361166474352252e-06, "loss": 0.1773, "mean_token_accuracy": 0.7700954005122185, "num_tokens": 13680640.0, "step": 1670 }, { "epoch": 0.4758027401139944, "grad_norm": 1.4870892763137817, "learning_rate": 9.350676597083815e-06, "loss": 0.1794, "mean_token_accuracy": 0.7565680038183927, "num_tokens": 13762560.0, "step": 1680 }, { "epoch": 0.4786348992813396, "grad_norm": 1.429553508758545, "learning_rate": 9.340186719815379e-06, "loss": 0.1899, "mean_token_accuracy": 0.7497431498020888, "num_tokens": 13844480.0, "step": 1690 }, { "epoch": 0.4814670584486848, "grad_norm": 1.374057650566101, "learning_rate": 9.329696842546943e-06, "loss": 0.1791, "mean_token_accuracy": 0.743480920419097, "num_tokens": 13926400.0, "step": 1700 }, { "epoch": 0.48429921761603, "grad_norm": 1.4187721014022827, "learning_rate": 9.319206965278506e-06, "loss": 0.1637, "mean_token_accuracy": 0.7638943243771792, "num_tokens": 14008320.0, "step": 1710 }, { "epoch": 0.4871313767833752, "grad_norm": 1.6536214351654053, "learning_rate": 9.30871708801007e-06, "loss": 0.181, "mean_token_accuracy": 0.7542074382305145, "num_tokens": 14090240.0, "step": 1720 }, { "epoch": 0.48996353595072045, "grad_norm": 1.1387007236480713, "learning_rate": 9.298227210741635e-06, "loss": 0.1571, "mean_token_accuracy": 0.7923801340162754, "num_tokens": 14172160.0, "step": 1730 }, { "epoch": 0.49279569511806565, "grad_norm": 1.2004634141921997, "learning_rate": 9.287737333473199e-06, "loss": 0.174, "mean_token_accuracy": 0.7598581194877625, "num_tokens": 14254080.0, "step": 1740 }, { "epoch": 0.49562785428541084, "grad_norm": 1.7080684900283813, "learning_rate": 9.277247456204763e-06, "loss": 0.1857, "mean_token_accuracy": 0.7612524475902319, "num_tokens": 14336000.0, "step": 1750 }, { "epoch": 0.49846001345275603, "grad_norm": 1.2792556285858154, "learning_rate": 9.266757578936328e-06, "loss": 0.1782, "mean_token_accuracy": 0.7547089047729969, "num_tokens": 14417920.0, "step": 1760 }, { "epoch": 0.5012921726201013, "grad_norm": 1.3763514757156372, "learning_rate": 9.256267701667892e-06, "loss": 0.1776, "mean_token_accuracy": 0.7574853233993053, "num_tokens": 14499840.0, "step": 1770 }, { "epoch": 0.5041243317874464, "grad_norm": 1.4386488199234009, "learning_rate": 9.245777824399455e-06, "loss": 0.1826, "mean_token_accuracy": 0.7463307186961174, "num_tokens": 14581760.0, "step": 1780 }, { "epoch": 0.5069564909547917, "grad_norm": 1.232284665107727, "learning_rate": 9.235287947131019e-06, "loss": 0.1689, "mean_token_accuracy": 0.7607509799301624, "num_tokens": 14663680.0, "step": 1790 }, { "epoch": 0.5097886501221368, "grad_norm": 1.2922643423080444, "learning_rate": 9.224798069862584e-06, "loss": 0.1596, "mean_token_accuracy": 0.7610078312456607, "num_tokens": 14745600.0, "step": 1800 }, { "epoch": 0.5126208092894821, "grad_norm": 1.464067816734314, "learning_rate": 9.214308192594146e-06, "loss": 0.1602, "mean_token_accuracy": 0.7783512733876705, "num_tokens": 14827520.0, "step": 1810 }, { "epoch": 0.5154529684568273, "grad_norm": 1.3232061862945557, "learning_rate": 9.203818315325712e-06, "loss": 0.2053, "mean_token_accuracy": 0.7439090043306351, "num_tokens": 14909440.0, "step": 1820 }, { "epoch": 0.5182851276241724, "grad_norm": 1.03334641456604, "learning_rate": 9.193328438057275e-06, "loss": 0.1668, "mean_token_accuracy": 0.7713429540395736, "num_tokens": 14991360.0, "step": 1830 }, { "epoch": 0.5211172867915177, "grad_norm": 1.3097628355026245, "learning_rate": 9.182838560788839e-06, "loss": 0.1718, "mean_token_accuracy": 0.7638209369033575, "num_tokens": 15073280.0, "step": 1840 }, { "epoch": 0.5239494459588628, "grad_norm": 1.0242208242416382, "learning_rate": 9.172348683520403e-06, "loss": 0.1659, "mean_token_accuracy": 0.769740704447031, "num_tokens": 15155200.0, "step": 1850 }, { "epoch": 0.5267816051262081, "grad_norm": 1.0100687742233276, "learning_rate": 9.161858806251968e-06, "loss": 0.162, "mean_token_accuracy": 0.7591364931315183, "num_tokens": 15237120.0, "step": 1860 }, { "epoch": 0.5296137642935533, "grad_norm": 1.5963248014450073, "learning_rate": 9.151368928983532e-06, "loss": 0.1601, "mean_token_accuracy": 0.7817270051687956, "num_tokens": 15319040.0, "step": 1870 }, { "epoch": 0.5324459234608985, "grad_norm": 1.2102185487747192, "learning_rate": 9.140879051715095e-06, "loss": 0.1942, "mean_token_accuracy": 0.7444716237485409, "num_tokens": 15400960.0, "step": 1880 }, { "epoch": 0.5352780826282437, "grad_norm": 1.4839965105056763, "learning_rate": 9.13038917444666e-06, "loss": 0.1713, "mean_token_accuracy": 0.7551247533410788, "num_tokens": 15482880.0, "step": 1890 }, { "epoch": 0.5381102417955889, "grad_norm": 1.0946964025497437, "learning_rate": 9.119899297178224e-06, "loss": 0.1774, "mean_token_accuracy": 0.7500733818858862, "num_tokens": 15564800.0, "step": 1900 }, { "epoch": 0.5409424009629341, "grad_norm": 1.4754222631454468, "learning_rate": 9.109409419909788e-06, "loss": 0.1912, "mean_token_accuracy": 0.7326932437717915, "num_tokens": 15646720.0, "step": 1910 }, { "epoch": 0.5437745601302794, "grad_norm": 1.741654396057129, "learning_rate": 9.098919542641352e-06, "loss": 0.1881, "mean_token_accuracy": 0.7564579259604216, "num_tokens": 15728640.0, "step": 1920 }, { "epoch": 0.5466067192976245, "grad_norm": 1.2974053621292114, "learning_rate": 9.088429665372915e-06, "loss": 0.1829, "mean_token_accuracy": 0.7555528402328491, "num_tokens": 15810560.0, "step": 1930 }, { "epoch": 0.5494388784649697, "grad_norm": 1.1609007120132446, "learning_rate": 9.077939788104479e-06, "loss": 0.2078, "mean_token_accuracy": 0.7468077309429646, "num_tokens": 15892480.0, "step": 1940 }, { "epoch": 0.5522710376323149, "grad_norm": 1.1889935731887817, "learning_rate": 9.067449910836044e-06, "loss": 0.1687, "mean_token_accuracy": 0.7571917813271284, "num_tokens": 15974400.0, "step": 1950 }, { "epoch": 0.5551031967996601, "grad_norm": 1.1042529344558716, "learning_rate": 9.056960033567608e-06, "loss": 0.1815, "mean_token_accuracy": 0.7431751452386379, "num_tokens": 16056320.0, "step": 1960 }, { "epoch": 0.5579353559670054, "grad_norm": 1.4160428047180176, "learning_rate": 9.046470156299171e-06, "loss": 0.1498, "mean_token_accuracy": 0.7774951070547104, "num_tokens": 16138240.0, "step": 1970 }, { "epoch": 0.5607675151343505, "grad_norm": 1.5061430931091309, "learning_rate": 9.035980279030737e-06, "loss": 0.1856, "mean_token_accuracy": 0.753571430966258, "num_tokens": 16220160.0, "step": 1980 }, { "epoch": 0.5635996743016958, "grad_norm": 1.5059019327163696, "learning_rate": 9.0254904017623e-06, "loss": 0.1745, "mean_token_accuracy": 0.742893836274743, "num_tokens": 16302080.0, "step": 1990 }, { "epoch": 0.5664318334690409, "grad_norm": 1.8156989812850952, "learning_rate": 9.015000524493864e-06, "loss": 0.1766, "mean_token_accuracy": 0.7376467682421207, "num_tokens": 16384000.0, "step": 2000 }, { "epoch": 0.5692639926363862, "grad_norm": 1.6337097883224487, "learning_rate": 9.004510647225428e-06, "loss": 0.171, "mean_token_accuracy": 0.7512108586728573, "num_tokens": 16465920.0, "step": 2010 }, { "epoch": 0.5720961518037314, "grad_norm": 1.2677297592163086, "learning_rate": 8.994020769956993e-06, "loss": 0.1924, "mean_token_accuracy": 0.7320450074970722, "num_tokens": 16547840.0, "step": 2020 }, { "epoch": 0.5749283109710766, "grad_norm": 1.142505407333374, "learning_rate": 8.983530892688557e-06, "loss": 0.1757, "mean_token_accuracy": 0.7441413898020983, "num_tokens": 16629760.0, "step": 2030 }, { "epoch": 0.5777604701384218, "grad_norm": 1.18378746509552, "learning_rate": 8.97304101542012e-06, "loss": 0.1758, "mean_token_accuracy": 0.7544520568102598, "num_tokens": 16711680.0, "step": 2040 }, { "epoch": 0.5805926293057669, "grad_norm": 1.37006676197052, "learning_rate": 8.962551138151684e-06, "loss": 0.1382, "mean_token_accuracy": 0.7852617416530847, "num_tokens": 16793600.0, "step": 2050 }, { "epoch": 0.5834247884731122, "grad_norm": 1.435603141784668, "learning_rate": 8.952061260883248e-06, "loss": 0.1422, "mean_token_accuracy": 0.783781798183918, "num_tokens": 16875520.0, "step": 2060 }, { "epoch": 0.5862569476404574, "grad_norm": 1.6944469213485718, "learning_rate": 8.941571383614811e-06, "loss": 0.1973, "mean_token_accuracy": 0.7334637954831124, "num_tokens": 16957440.0, "step": 2070 }, { "epoch": 0.5890891068078026, "grad_norm": 1.4558755159378052, "learning_rate": 8.931081506346377e-06, "loss": 0.1886, "mean_token_accuracy": 0.7556017655879259, "num_tokens": 17039360.0, "step": 2080 }, { "epoch": 0.5919212659751478, "grad_norm": 1.1582504510879517, "learning_rate": 8.92059162907794e-06, "loss": 0.1633, "mean_token_accuracy": 0.7658268067985773, "num_tokens": 17121280.0, "step": 2090 }, { "epoch": 0.594753425142493, "grad_norm": 1.3503059148788452, "learning_rate": 8.910101751809504e-06, "loss": 0.174, "mean_token_accuracy": 0.7463551849126816, "num_tokens": 17203200.0, "step": 2100 }, { "epoch": 0.5975855843098382, "grad_norm": 1.2336933612823486, "learning_rate": 8.89961187454107e-06, "loss": 0.1848, "mean_token_accuracy": 0.7446795504540205, "num_tokens": 17285120.0, "step": 2110 }, { "epoch": 0.6004177434771835, "grad_norm": 1.483437180519104, "learning_rate": 8.889121997272633e-06, "loss": 0.1695, "mean_token_accuracy": 0.7712818019092083, "num_tokens": 17367040.0, "step": 2120 }, { "epoch": 0.6032499026445286, "grad_norm": 1.1503459215164185, "learning_rate": 8.878632120004197e-06, "loss": 0.1841, "mean_token_accuracy": 0.742465752735734, "num_tokens": 17448960.0, "step": 2130 }, { "epoch": 0.6060820618118739, "grad_norm": 1.1363428831100464, "learning_rate": 8.86814224273576e-06, "loss": 0.1427, "mean_token_accuracy": 0.7742661446332931, "num_tokens": 17530880.0, "step": 2140 }, { "epoch": 0.608914220979219, "grad_norm": 1.3419382572174072, "learning_rate": 8.857652365467326e-06, "loss": 0.16, "mean_token_accuracy": 0.7700220141559839, "num_tokens": 17612800.0, "step": 2150 }, { "epoch": 0.6117463801465642, "grad_norm": 1.662088394165039, "learning_rate": 8.84716248819889e-06, "loss": 0.1554, "mean_token_accuracy": 0.7936276912689209, "num_tokens": 17694720.0, "step": 2160 }, { "epoch": 0.6145785393139095, "grad_norm": 1.3297075033187866, "learning_rate": 8.836672610930453e-06, "loss": 0.1737, "mean_token_accuracy": 0.7436643790453672, "num_tokens": 17776640.0, "step": 2170 }, { "epoch": 0.6174106984812546, "grad_norm": 1.0014644861221313, "learning_rate": 8.826182733662017e-06, "loss": 0.1924, "mean_token_accuracy": 0.751846868172288, "num_tokens": 17858560.0, "step": 2180 }, { "epoch": 0.6202428576485999, "grad_norm": 1.3324036598205566, "learning_rate": 8.81569285639358e-06, "loss": 0.1574, "mean_token_accuracy": 0.7664628185331821, "num_tokens": 17940480.0, "step": 2190 }, { "epoch": 0.623075016815945, "grad_norm": 1.3868496417999268, "learning_rate": 8.805202979125144e-06, "loss": 0.1431, "mean_token_accuracy": 0.7895792543888092, "num_tokens": 18022400.0, "step": 2200 }, { "epoch": 0.6259071759832903, "grad_norm": 1.2042073011398315, "learning_rate": 8.79471310185671e-06, "loss": 0.1986, "mean_token_accuracy": 0.7573874734342099, "num_tokens": 18104320.0, "step": 2210 }, { "epoch": 0.6287393351506355, "grad_norm": 1.271372675895691, "learning_rate": 8.784223224588273e-06, "loss": 0.1824, "mean_token_accuracy": 0.7366682969033718, "num_tokens": 18186240.0, "step": 2220 }, { "epoch": 0.6315714943179807, "grad_norm": 1.3485766649246216, "learning_rate": 8.773733347319837e-06, "loss": 0.1819, "mean_token_accuracy": 0.7487402144819498, "num_tokens": 18268160.0, "step": 2230 }, { "epoch": 0.6344036534853259, "grad_norm": 1.2651562690734863, "learning_rate": 8.763243470051402e-06, "loss": 0.2052, "mean_token_accuracy": 0.7405455011874438, "num_tokens": 18350080.0, "step": 2240 }, { "epoch": 0.637235812652671, "grad_norm": 1.3391584157943726, "learning_rate": 8.752753592782966e-06, "loss": 0.1847, "mean_token_accuracy": 0.7606164362281561, "num_tokens": 18432000.0, "step": 2250 }, { "epoch": 0.6400679718200163, "grad_norm": 1.5242635011672974, "learning_rate": 8.74226371551453e-06, "loss": 0.1701, "mean_token_accuracy": 0.7709637947380543, "num_tokens": 18513920.0, "step": 2260 }, { "epoch": 0.6429001309873615, "grad_norm": 1.1486823558807373, "learning_rate": 8.731773838246093e-06, "loss": 0.1428, "mean_token_accuracy": 0.7824608612805605, "num_tokens": 18595840.0, "step": 2270 }, { "epoch": 0.6457322901547067, "grad_norm": 1.2304952144622803, "learning_rate": 8.721283960977657e-06, "loss": 0.1747, "mean_token_accuracy": 0.7551247533410788, "num_tokens": 18677760.0, "step": 2280 }, { "epoch": 0.6485644493220519, "grad_norm": 1.6934930086135864, "learning_rate": 8.71079408370922e-06, "loss": 0.1919, "mean_token_accuracy": 0.7427592962980271, "num_tokens": 18759680.0, "step": 2290 }, { "epoch": 0.6513966084893971, "grad_norm": 1.3010927438735962, "learning_rate": 8.700304206440786e-06, "loss": 0.1728, "mean_token_accuracy": 0.7676614500582218, "num_tokens": 18841600.0, "step": 2300 }, { "epoch": 0.6542287676567423, "grad_norm": 1.4188960790634155, "learning_rate": 8.68981432917235e-06, "loss": 0.1783, "mean_token_accuracy": 0.760494127497077, "num_tokens": 18923520.0, "step": 2310 }, { "epoch": 0.6570609268240876, "grad_norm": 1.5634803771972656, "learning_rate": 8.679324451903913e-06, "loss": 0.1684, "mean_token_accuracy": 0.7487035218626261, "num_tokens": 19005440.0, "step": 2320 }, { "epoch": 0.6598930859914327, "grad_norm": 1.5351383686065674, "learning_rate": 8.668834574635478e-06, "loss": 0.1563, "mean_token_accuracy": 0.7576565533876419, "num_tokens": 19087360.0, "step": 2330 }, { "epoch": 0.662725245158778, "grad_norm": 1.0073812007904053, "learning_rate": 8.658344697367042e-06, "loss": 0.1714, "mean_token_accuracy": 0.7514187876135111, "num_tokens": 19169280.0, "step": 2340 }, { "epoch": 0.6655574043261231, "grad_norm": 1.3139631748199463, "learning_rate": 8.647854820098605e-06, "loss": 0.1981, "mean_token_accuracy": 0.7340264186263085, "num_tokens": 19251200.0, "step": 2350 }, { "epoch": 0.6683895634934683, "grad_norm": 1.451295018196106, "learning_rate": 8.637364942830169e-06, "loss": 0.1717, "mean_token_accuracy": 0.7439701590687037, "num_tokens": 19333120.0, "step": 2360 }, { "epoch": 0.6712217226608136, "grad_norm": 1.2492328882217407, "learning_rate": 8.626875065561734e-06, "loss": 0.1619, "mean_token_accuracy": 0.7639677129685879, "num_tokens": 19415040.0, "step": 2370 }, { "epoch": 0.6740538818281587, "grad_norm": 1.2084134817123413, "learning_rate": 8.616385188293298e-06, "loss": 0.1615, "mean_token_accuracy": 0.762181993573904, "num_tokens": 19496960.0, "step": 2380 }, { "epoch": 0.676886040995504, "grad_norm": 1.1642097234725952, "learning_rate": 8.605895311024862e-06, "loss": 0.1981, "mean_token_accuracy": 0.7449730925261975, "num_tokens": 19578880.0, "step": 2390 }, { "epoch": 0.6797182001628491, "grad_norm": 1.147381067276001, "learning_rate": 8.595405433756425e-06, "loss": 0.1697, "mean_token_accuracy": 0.7619740672409534, "num_tokens": 19660800.0, "step": 2400 }, { "epoch": 0.6825503593301944, "grad_norm": 1.1871576309204102, "learning_rate": 8.584915556487989e-06, "loss": 0.1619, "mean_token_accuracy": 0.7527886509895325, "num_tokens": 19742720.0, "step": 2410 }, { "epoch": 0.6853825184975396, "grad_norm": 1.5566953420639038, "learning_rate": 8.574425679219553e-06, "loss": 0.1666, "mean_token_accuracy": 0.7512108597904443, "num_tokens": 19824640.0, "step": 2420 }, { "epoch": 0.6882146776648848, "grad_norm": 1.1822768449783325, "learning_rate": 8.563935801951118e-06, "loss": 0.1805, "mean_token_accuracy": 0.7522015657275916, "num_tokens": 19906560.0, "step": 2430 }, { "epoch": 0.69104683683223, "grad_norm": 1.454461693763733, "learning_rate": 8.553445924682682e-06, "loss": 0.1352, "mean_token_accuracy": 0.7862891390919685, "num_tokens": 19988480.0, "step": 2440 }, { "epoch": 0.6938789959995751, "grad_norm": 1.5111033916473389, "learning_rate": 8.542956047414245e-06, "loss": 0.1452, "mean_token_accuracy": 0.7622798405587673, "num_tokens": 20070400.0, "step": 2450 }, { "epoch": 0.6967111551669204, "grad_norm": 1.065933108329773, "learning_rate": 8.53246617014581e-06, "loss": 0.1874, "mean_token_accuracy": 0.734662427380681, "num_tokens": 20152320.0, "step": 2460 }, { "epoch": 0.6995433143342655, "grad_norm": 1.5525884628295898, "learning_rate": 8.521976292877374e-06, "loss": 0.165, "mean_token_accuracy": 0.7557729911059141, "num_tokens": 20234240.0, "step": 2470 }, { "epoch": 0.7023754735016108, "grad_norm": 1.109391212463379, "learning_rate": 8.511486415608938e-06, "loss": 0.1533, "mean_token_accuracy": 0.7744496073573828, "num_tokens": 20316160.0, "step": 2480 }, { "epoch": 0.705207632668956, "grad_norm": 1.4385027885437012, "learning_rate": 8.500996538340502e-06, "loss": 0.2138, "mean_token_accuracy": 0.7235445238649845, "num_tokens": 20398080.0, "step": 2490 }, { "epoch": 0.7080397918363012, "grad_norm": 1.4169104099273682, "learning_rate": 8.490506661072067e-06, "loss": 0.1599, "mean_token_accuracy": 0.7705968704074622, "num_tokens": 20480000.0, "step": 2500 }, { "epoch": 0.7108719510036464, "grad_norm": 1.1053571701049805, "learning_rate": 8.48001678380363e-06, "loss": 0.2001, "mean_token_accuracy": 0.7382583156228065, "num_tokens": 20561920.0, "step": 2510 }, { "epoch": 0.7137041101709916, "grad_norm": 1.3876012563705444, "learning_rate": 8.469526906535194e-06, "loss": 0.1672, "mean_token_accuracy": 0.7606042090803384, "num_tokens": 20643840.0, "step": 2520 }, { "epoch": 0.7165362693383368, "grad_norm": 1.4283491373062134, "learning_rate": 8.459037029266758e-06, "loss": 0.1586, "mean_token_accuracy": 0.7576565530151129, "num_tokens": 20725760.0, "step": 2530 }, { "epoch": 0.7193684285056821, "grad_norm": 1.2347828149795532, "learning_rate": 8.448547151998322e-06, "loss": 0.1851, "mean_token_accuracy": 0.7495719160884619, "num_tokens": 20807680.0, "step": 2540 }, { "epoch": 0.7222005876730272, "grad_norm": 1.6317230463027954, "learning_rate": 8.438057274729885e-06, "loss": 0.2028, "mean_token_accuracy": 0.7395547956228257, "num_tokens": 20889600.0, "step": 2550 }, { "epoch": 0.7250327468403724, "grad_norm": 1.1536918878555298, "learning_rate": 8.42756739746145e-06, "loss": 0.1637, "mean_token_accuracy": 0.7681873787194491, "num_tokens": 20971520.0, "step": 2560 }, { "epoch": 0.7278649060077176, "grad_norm": 1.200775384902954, "learning_rate": 8.417077520193014e-06, "loss": 0.1524, "mean_token_accuracy": 0.7535469718277454, "num_tokens": 21053440.0, "step": 2570 }, { "epoch": 0.7306970651750628, "grad_norm": 1.301642656326294, "learning_rate": 8.406587642924578e-06, "loss": 0.169, "mean_token_accuracy": 0.7641144812107086, "num_tokens": 21135360.0, "step": 2580 }, { "epoch": 0.7335292243424081, "grad_norm": 1.3900648355484009, "learning_rate": 8.396097765656143e-06, "loss": 0.1586, "mean_token_accuracy": 0.8010885536670684, "num_tokens": 21217280.0, "step": 2590 }, { "epoch": 0.7363613835097532, "grad_norm": 1.4697816371917725, "learning_rate": 8.385607888387707e-06, "loss": 0.2202, "mean_token_accuracy": 0.7074486285448074, "num_tokens": 21299200.0, "step": 2600 }, { "epoch": 0.7391935426770985, "grad_norm": 1.1869508028030396, "learning_rate": 8.37511801111927e-06, "loss": 0.1856, "mean_token_accuracy": 0.7503424655646086, "num_tokens": 21381120.0, "step": 2610 }, { "epoch": 0.7420257018444436, "grad_norm": 1.3715872764587402, "learning_rate": 8.364628133850834e-06, "loss": 0.1539, "mean_token_accuracy": 0.7680406052619219, "num_tokens": 21463040.0, "step": 2620 }, { "epoch": 0.7448578610117889, "grad_norm": 1.3592404127120972, "learning_rate": 8.3541382565824e-06, "loss": 0.1716, "mean_token_accuracy": 0.751406553760171, "num_tokens": 21544960.0, "step": 2630 }, { "epoch": 0.7476900201791341, "grad_norm": 1.3478460311889648, "learning_rate": 8.343648379313962e-06, "loss": 0.1528, "mean_token_accuracy": 0.7696673188358545, "num_tokens": 21626880.0, "step": 2640 }, { "epoch": 0.7505221793464792, "grad_norm": 1.1898484230041504, "learning_rate": 8.333158502045527e-06, "loss": 0.1712, "mean_token_accuracy": 0.7787548899650574, "num_tokens": 21708800.0, "step": 2650 }, { "epoch": 0.7533543385138245, "grad_norm": 1.1102921962738037, "learning_rate": 8.32266862477709e-06, "loss": 0.1704, "mean_token_accuracy": 0.7460004903376103, "num_tokens": 21790720.0, "step": 2660 }, { "epoch": 0.7561864976811696, "grad_norm": 1.4233191013336182, "learning_rate": 8.312178747508654e-06, "loss": 0.1478, "mean_token_accuracy": 0.7663894306868315, "num_tokens": 21872640.0, "step": 2670 }, { "epoch": 0.7590186568485149, "grad_norm": 1.4859589338302612, "learning_rate": 8.30168887024022e-06, "loss": 0.2034, "mean_token_accuracy": 0.7479207452386618, "num_tokens": 21954560.0, "step": 2680 }, { "epoch": 0.7618508160158601, "grad_norm": 1.936881184577942, "learning_rate": 8.291198992971783e-06, "loss": 0.2002, "mean_token_accuracy": 0.7527274955064058, "num_tokens": 22036480.0, "step": 2690 }, { "epoch": 0.7646829751832053, "grad_norm": 1.481729507446289, "learning_rate": 8.280709115703347e-06, "loss": 0.166, "mean_token_accuracy": 0.7429916851222516, "num_tokens": 22118400.0, "step": 2700 }, { "epoch": 0.7675151343505505, "grad_norm": 1.3797627687454224, "learning_rate": 8.27021923843491e-06, "loss": 0.1909, "mean_token_accuracy": 0.7521526429802179, "num_tokens": 22200320.0, "step": 2710 }, { "epoch": 0.7703472935178957, "grad_norm": 1.4727095365524292, "learning_rate": 8.259729361166476e-06, "loss": 0.1746, "mean_token_accuracy": 0.7602128215134144, "num_tokens": 22282240.0, "step": 2720 }, { "epoch": 0.7731794526852409, "grad_norm": 1.140485167503357, "learning_rate": 8.24923948389804e-06, "loss": 0.1766, "mean_token_accuracy": 0.7472113493829966, "num_tokens": 22364160.0, "step": 2730 }, { "epoch": 0.7760116118525862, "grad_norm": 1.3125388622283936, "learning_rate": 8.238749606629603e-06, "loss": 0.1497, "mean_token_accuracy": 0.7740582197904586, "num_tokens": 22446080.0, "step": 2740 }, { "epoch": 0.7788437710199313, "grad_norm": 1.659411907196045, "learning_rate": 8.228259729361168e-06, "loss": 0.1589, "mean_token_accuracy": 0.7627446215599776, "num_tokens": 22528000.0, "step": 2750 }, { "epoch": 0.7816759301872765, "grad_norm": 1.2783703804016113, "learning_rate": 8.21776985209273e-06, "loss": 0.1644, "mean_token_accuracy": 0.7572529379278421, "num_tokens": 22609920.0, "step": 2760 }, { "epoch": 0.7845080893546217, "grad_norm": 1.441229224205017, "learning_rate": 8.207279974824294e-06, "loss": 0.1494, "mean_token_accuracy": 0.7544642824679613, "num_tokens": 22691840.0, "step": 2770 }, { "epoch": 0.7873402485219669, "grad_norm": 1.2756493091583252, "learning_rate": 8.19679009755586e-06, "loss": 0.2087, "mean_token_accuracy": 0.7334148738533258, "num_tokens": 22773760.0, "step": 2780 }, { "epoch": 0.7901724076893122, "grad_norm": 1.2635856866836548, "learning_rate": 8.186300220287423e-06, "loss": 0.1961, "mean_token_accuracy": 0.7443615455180407, "num_tokens": 22855680.0, "step": 2790 }, { "epoch": 0.7930045668566573, "grad_norm": 1.1410996913909912, "learning_rate": 8.175810343018987e-06, "loss": 0.1903, "mean_token_accuracy": 0.7431262202560902, "num_tokens": 22937600.0, "step": 2800 }, { "epoch": 0.7958367260240026, "grad_norm": 1.0032896995544434, "learning_rate": 8.165320465750552e-06, "loss": 0.1543, "mean_token_accuracy": 0.7704990223050118, "num_tokens": 23019520.0, "step": 2810 }, { "epoch": 0.7986688851913477, "grad_norm": 1.3099137544631958, "learning_rate": 8.154830588482116e-06, "loss": 0.1548, "mean_token_accuracy": 0.7717465735971928, "num_tokens": 23101440.0, "step": 2820 }, { "epoch": 0.801501044358693, "grad_norm": 1.2551571130752563, "learning_rate": 8.14434071121368e-06, "loss": 0.1605, "mean_token_accuracy": 0.7638820949941874, "num_tokens": 23183360.0, "step": 2830 }, { "epoch": 0.8043332035260382, "grad_norm": 1.0557252168655396, "learning_rate": 8.133850833945243e-06, "loss": 0.1658, "mean_token_accuracy": 0.7664138920605182, "num_tokens": 23265280.0, "step": 2840 }, { "epoch": 0.8071653626933833, "grad_norm": 1.4299802780151367, "learning_rate": 8.123360956676808e-06, "loss": 0.1789, "mean_token_accuracy": 0.7451810210943222, "num_tokens": 23347200.0, "step": 2850 }, { "epoch": 0.8099975218607286, "grad_norm": 1.3597182035446167, "learning_rate": 8.112871079408372e-06, "loss": 0.2101, "mean_token_accuracy": 0.7526296459138393, "num_tokens": 23429120.0, "step": 2860 }, { "epoch": 0.8128296810280737, "grad_norm": 1.0189119577407837, "learning_rate": 8.102381202139936e-06, "loss": 0.1717, "mean_token_accuracy": 0.767196673899889, "num_tokens": 23511040.0, "step": 2870 }, { "epoch": 0.815661840195419, "grad_norm": 1.206203579902649, "learning_rate": 8.0918913248715e-06, "loss": 0.1733, "mean_token_accuracy": 0.746844419836998, "num_tokens": 23592960.0, "step": 2880 }, { "epoch": 0.8184939993627642, "grad_norm": 1.3932020664215088, "learning_rate": 8.081401447603063e-06, "loss": 0.1906, "mean_token_accuracy": 0.7472235806286335, "num_tokens": 23674880.0, "step": 2890 }, { "epoch": 0.8213261585301094, "grad_norm": 1.356952428817749, "learning_rate": 8.070911570334627e-06, "loss": 0.1632, "mean_token_accuracy": 0.7697284713387489, "num_tokens": 23756800.0, "step": 2900 }, { "epoch": 0.8241583176974546, "grad_norm": 1.5694111585617065, "learning_rate": 8.060421693066192e-06, "loss": 0.1567, "mean_token_accuracy": 0.7708047945052385, "num_tokens": 23838720.0, "step": 2910 }, { "epoch": 0.8269904768647998, "grad_norm": 1.342678189277649, "learning_rate": 8.049931815797756e-06, "loss": 0.1503, "mean_token_accuracy": 0.7684931494295597, "num_tokens": 23920640.0, "step": 2920 }, { "epoch": 0.829822636032145, "grad_norm": 1.040459156036377, "learning_rate": 8.03944193852932e-06, "loss": 0.1595, "mean_token_accuracy": 0.7476394321769476, "num_tokens": 24002560.0, "step": 2930 }, { "epoch": 0.8326547951994903, "grad_norm": 1.049788236618042, "learning_rate": 8.028952061260885e-06, "loss": 0.1516, "mean_token_accuracy": 0.7659613501280547, "num_tokens": 24084480.0, "step": 2940 }, { "epoch": 0.8354869543668354, "grad_norm": 1.5024118423461914, "learning_rate": 8.018462183992448e-06, "loss": 0.1543, "mean_token_accuracy": 0.766157042980194, "num_tokens": 24166400.0, "step": 2950 }, { "epoch": 0.8383191135341806, "grad_norm": 1.4191840887069702, "learning_rate": 8.007972306724012e-06, "loss": 0.1553, "mean_token_accuracy": 0.7762720167636872, "num_tokens": 24248320.0, "step": 2960 }, { "epoch": 0.8411512727015258, "grad_norm": 1.8435730934143066, "learning_rate": 7.997482429455576e-06, "loss": 0.1957, "mean_token_accuracy": 0.7429672233760357, "num_tokens": 24330240.0, "step": 2970 }, { "epoch": 0.843983431868871, "grad_norm": 1.5547730922698975, "learning_rate": 7.986992552187141e-06, "loss": 0.1676, "mean_token_accuracy": 0.7445694722235203, "num_tokens": 24412160.0, "step": 2980 }, { "epoch": 0.8468155910362163, "grad_norm": 1.5078247785568237, "learning_rate": 7.976502674918703e-06, "loss": 0.1695, "mean_token_accuracy": 0.7468933440744877, "num_tokens": 24494080.0, "step": 2990 }, { "epoch": 0.8496477502035614, "grad_norm": 1.1769497394561768, "learning_rate": 7.966012797650268e-06, "loss": 0.1529, "mean_token_accuracy": 0.7612402152270079, "num_tokens": 24576000.0, "step": 3000 }, { "epoch": 0.8524799093709067, "grad_norm": 1.2363755702972412, "learning_rate": 7.955522920381832e-06, "loss": 0.1642, "mean_token_accuracy": 0.7628669273108244, "num_tokens": 24657920.0, "step": 3010 }, { "epoch": 0.8553120685382518, "grad_norm": 1.3236780166625977, "learning_rate": 7.945033043113396e-06, "loss": 0.17, "mean_token_accuracy": 0.7679794508963823, "num_tokens": 24739840.0, "step": 3020 }, { "epoch": 0.8581442277055971, "grad_norm": 1.2692599296569824, "learning_rate": 7.934543165844961e-06, "loss": 0.1562, "mean_token_accuracy": 0.7695083174854517, "num_tokens": 24821760.0, "step": 3030 }, { "epoch": 0.8609763868729423, "grad_norm": 1.6181145906448364, "learning_rate": 7.924053288576525e-06, "loss": 0.189, "mean_token_accuracy": 0.7479696679860354, "num_tokens": 24903680.0, "step": 3040 }, { "epoch": 0.8638085460402875, "grad_norm": 1.2541261911392212, "learning_rate": 7.913563411308088e-06, "loss": 0.1697, "mean_token_accuracy": 0.7549779832363128, "num_tokens": 24985600.0, "step": 3050 }, { "epoch": 0.8666407052076327, "grad_norm": 1.2831072807312012, "learning_rate": 7.903073534039652e-06, "loss": 0.1824, "mean_token_accuracy": 0.7485567510128022, "num_tokens": 25067520.0, "step": 3060 }, { "epoch": 0.8694728643749778, "grad_norm": 1.5733733177185059, "learning_rate": 7.892583656771217e-06, "loss": 0.1825, "mean_token_accuracy": 0.7598458901047707, "num_tokens": 25149440.0, "step": 3070 }, { "epoch": 0.8723050235423231, "grad_norm": 1.6035773754119873, "learning_rate": 7.88209377950278e-06, "loss": 0.1555, "mean_token_accuracy": 0.7691169291734695, "num_tokens": 25231360.0, "step": 3080 }, { "epoch": 0.8751371827096683, "grad_norm": 1.3902678489685059, "learning_rate": 7.871603902234344e-06, "loss": 0.2113, "mean_token_accuracy": 0.7383194722235202, "num_tokens": 25313280.0, "step": 3090 }, { "epoch": 0.8779693418770135, "grad_norm": 1.2184810638427734, "learning_rate": 7.86111402496591e-06, "loss": 0.1601, "mean_token_accuracy": 0.7690557759255171, "num_tokens": 25395200.0, "step": 3100 }, { "epoch": 0.8808015010443587, "grad_norm": 1.6569554805755615, "learning_rate": 7.850624147697472e-06, "loss": 0.1499, "mean_token_accuracy": 0.7663160473108291, "num_tokens": 25477120.0, "step": 3110 }, { "epoch": 0.8836336602117039, "grad_norm": 1.7487496137619019, "learning_rate": 7.840134270429035e-06, "loss": 0.18, "mean_token_accuracy": 0.7471012711524964, "num_tokens": 25559040.0, "step": 3120 }, { "epoch": 0.8864658193790491, "grad_norm": 1.5647265911102295, "learning_rate": 7.8296443931606e-06, "loss": 0.1659, "mean_token_accuracy": 0.7528498053550721, "num_tokens": 25640960.0, "step": 3130 }, { "epoch": 0.8892979785463943, "grad_norm": 1.6271146535873413, "learning_rate": 7.819154515892164e-06, "loss": 0.1741, "mean_token_accuracy": 0.7671844448894263, "num_tokens": 25722880.0, "step": 3140 }, { "epoch": 0.8921301377137395, "grad_norm": 1.1338396072387695, "learning_rate": 7.808664638623728e-06, "loss": 0.2053, "mean_token_accuracy": 0.7528008822351694, "num_tokens": 25804800.0, "step": 3150 }, { "epoch": 0.8949622968810848, "grad_norm": 1.3062673807144165, "learning_rate": 7.798174761355293e-06, "loss": 0.1793, "mean_token_accuracy": 0.7356409002095461, "num_tokens": 25886720.0, "step": 3160 }, { "epoch": 0.8977944560484299, "grad_norm": 1.1194148063659668, "learning_rate": 7.787684884086857e-06, "loss": 0.1788, "mean_token_accuracy": 0.7578400194644928, "num_tokens": 25968640.0, "step": 3170 }, { "epoch": 0.9006266152157751, "grad_norm": 1.395133376121521, "learning_rate": 7.77719500681842e-06, "loss": 0.1848, "mean_token_accuracy": 0.7355063576251268, "num_tokens": 26050560.0, "step": 3180 }, { "epoch": 0.9034587743831203, "grad_norm": 1.2578686475753784, "learning_rate": 7.766705129549984e-06, "loss": 0.1519, "mean_token_accuracy": 0.7749510753899813, "num_tokens": 26132480.0, "step": 3190 }, { "epoch": 0.9062909335504655, "grad_norm": 1.3314322233200073, "learning_rate": 7.75621525228155e-06, "loss": 0.1502, "mean_token_accuracy": 0.7856042020022869, "num_tokens": 26214400.0, "step": 3200 }, { "epoch": 0.9091230927178108, "grad_norm": 1.2085371017456055, "learning_rate": 7.745725375013113e-06, "loss": 0.1558, "mean_token_accuracy": 0.7697651658207179, "num_tokens": 26296320.0, "step": 3210 }, { "epoch": 0.9119552518851559, "grad_norm": 0.9980440735816956, "learning_rate": 7.735235497744677e-06, "loss": 0.1688, "mean_token_accuracy": 0.7536325797438621, "num_tokens": 26378240.0, "step": 3220 }, { "epoch": 0.9147874110525012, "grad_norm": 1.2215497493743896, "learning_rate": 7.72474562047624e-06, "loss": 0.1939, "mean_token_accuracy": 0.7408145820721984, "num_tokens": 26460160.0, "step": 3230 }, { "epoch": 0.9176195702198463, "grad_norm": 0.8396080732345581, "learning_rate": 7.714255743207804e-06, "loss": 0.1356, "mean_token_accuracy": 0.7769447147846222, "num_tokens": 26542080.0, "step": 3240 }, { "epoch": 0.9204517293871916, "grad_norm": 1.25995671749115, "learning_rate": 7.703765865939368e-06, "loss": 0.1421, "mean_token_accuracy": 0.7663282781839371, "num_tokens": 26624000.0, "step": 3250 }, { "epoch": 0.9232838885545368, "grad_norm": 1.246471643447876, "learning_rate": 7.693275988670933e-06, "loss": 0.1612, "mean_token_accuracy": 0.7616316061466932, "num_tokens": 26705920.0, "step": 3260 }, { "epoch": 0.9261160477218819, "grad_norm": 1.1481192111968994, "learning_rate": 7.682786111402497e-06, "loss": 0.1811, "mean_token_accuracy": 0.7447040125727653, "num_tokens": 26787840.0, "step": 3270 }, { "epoch": 0.9289482068892272, "grad_norm": 1.4435616731643677, "learning_rate": 7.67229623413406e-06, "loss": 0.1617, "mean_token_accuracy": 0.7634173180907965, "num_tokens": 26869760.0, "step": 3280 }, { "epoch": 0.9317803660565723, "grad_norm": 1.628287672996521, "learning_rate": 7.661806356865626e-06, "loss": 0.1809, "mean_token_accuracy": 0.7530699599534273, "num_tokens": 26951680.0, "step": 3290 }, { "epoch": 0.9346125252239176, "grad_norm": 1.2515571117401123, "learning_rate": 7.65131647959719e-06, "loss": 0.1719, "mean_token_accuracy": 0.757363012060523, "num_tokens": 27033600.0, "step": 3300 }, { "epoch": 0.9374446843912628, "grad_norm": 1.031917929649353, "learning_rate": 7.640826602328753e-06, "loss": 0.1963, "mean_token_accuracy": 0.7545132089406252, "num_tokens": 27115520.0, "step": 3310 }, { "epoch": 0.940276843558608, "grad_norm": 1.420943260192871, "learning_rate": 7.630336725060317e-06, "loss": 0.192, "mean_token_accuracy": 0.7166218187659978, "num_tokens": 27197440.0, "step": 3320 }, { "epoch": 0.9431090027259532, "grad_norm": 1.009371042251587, "learning_rate": 7.619846847791882e-06, "loss": 0.1591, "mean_token_accuracy": 0.7791462808847427, "num_tokens": 27279360.0, "step": 3330 }, { "epoch": 0.9459411618932984, "grad_norm": 1.2882252931594849, "learning_rate": 7.609356970523446e-06, "loss": 0.1623, "mean_token_accuracy": 0.7690680027008057, "num_tokens": 27361280.0, "step": 3340 }, { "epoch": 0.9487733210606436, "grad_norm": 0.997870922088623, "learning_rate": 7.598867093255009e-06, "loss": 0.1511, "mean_token_accuracy": 0.7653131123632193, "num_tokens": 27443200.0, "step": 3350 }, { "epoch": 0.9516054802279889, "grad_norm": 1.2685824632644653, "learning_rate": 7.588377215986573e-06, "loss": 0.1871, "mean_token_accuracy": 0.7422578245401382, "num_tokens": 27525120.0, "step": 3360 }, { "epoch": 0.954437639395334, "grad_norm": 0.9891111254692078, "learning_rate": 7.577887338718137e-06, "loss": 0.1722, "mean_token_accuracy": 0.7568248521536589, "num_tokens": 27607040.0, "step": 3370 }, { "epoch": 0.9572697985626792, "grad_norm": 1.0070608854293823, "learning_rate": 7.567397461449701e-06, "loss": 0.1675, "mean_token_accuracy": 0.7582803323864937, "num_tokens": 27688960.0, "step": 3380 }, { "epoch": 0.9601019577300244, "grad_norm": 1.4117594957351685, "learning_rate": 7.556907584181266e-06, "loss": 0.1423, "mean_token_accuracy": 0.7796966724097729, "num_tokens": 27770880.0, "step": 3390 }, { "epoch": 0.9629341168973696, "grad_norm": 1.1754200458526611, "learning_rate": 7.5464177069128295e-06, "loss": 0.171, "mean_token_accuracy": 0.7379158515483141, "num_tokens": 27852800.0, "step": 3400 }, { "epoch": 0.9657662760647149, "grad_norm": 1.0996763706207275, "learning_rate": 7.535927829644394e-06, "loss": 0.1447, "mean_token_accuracy": 0.7799168288707733, "num_tokens": 27934720.0, "step": 3410 }, { "epoch": 0.96859843523206, "grad_norm": 1.1180808544158936, "learning_rate": 7.525437952375958e-06, "loss": 0.1776, "mean_token_accuracy": 0.7429305303841829, "num_tokens": 28016640.0, "step": 3420 }, { "epoch": 0.9714305943994053, "grad_norm": 1.1991347074508667, "learning_rate": 7.514948075107522e-06, "loss": 0.1788, "mean_token_accuracy": 0.7498776897788048, "num_tokens": 28098560.0, "step": 3430 }, { "epoch": 0.9742627535667504, "grad_norm": 1.049326777458191, "learning_rate": 7.504458197839086e-06, "loss": 0.1962, "mean_token_accuracy": 0.7542685892432928, "num_tokens": 28180480.0, "step": 3440 }, { "epoch": 0.9770949127340957, "grad_norm": 1.31307053565979, "learning_rate": 7.49396832057065e-06, "loss": 0.163, "mean_token_accuracy": 0.7507827796041966, "num_tokens": 28262400.0, "step": 3450 }, { "epoch": 0.9799270719014409, "grad_norm": 1.402658462524414, "learning_rate": 7.483478443302215e-06, "loss": 0.181, "mean_token_accuracy": 0.7379892360419035, "num_tokens": 28344320.0, "step": 3460 }, { "epoch": 0.982759231068786, "grad_norm": 1.5197118520736694, "learning_rate": 7.472988566033778e-06, "loss": 0.1732, "mean_token_accuracy": 0.7527519583702087, "num_tokens": 28426240.0, "step": 3470 }, { "epoch": 0.9855913902361313, "grad_norm": 1.3193233013153076, "learning_rate": 7.462498688765341e-06, "loss": 0.1693, "mean_token_accuracy": 0.75834148414433, "num_tokens": 28508160.0, "step": 3480 }, { "epoch": 0.9884235494034764, "grad_norm": 1.3374930620193481, "learning_rate": 7.452008811496906e-06, "loss": 0.1661, "mean_token_accuracy": 0.7599926609545946, "num_tokens": 28590080.0, "step": 3490 }, { "epoch": 0.9912557085708217, "grad_norm": 1.3857275247573853, "learning_rate": 7.44151893422847e-06, "loss": 0.1759, "mean_token_accuracy": 0.7574486292898654, "num_tokens": 28672000.0, "step": 3500 }, { "epoch": 0.9940878677381669, "grad_norm": 1.3750524520874023, "learning_rate": 7.431029056960034e-06, "loss": 0.1738, "mean_token_accuracy": 0.7544887505471707, "num_tokens": 28753920.0, "step": 3510 }, { "epoch": 0.9969200269055121, "grad_norm": 1.3511148691177368, "learning_rate": 7.4205391796915985e-06, "loss": 0.1532, "mean_token_accuracy": 0.7683219172060489, "num_tokens": 28835840.0, "step": 3520 }, { "epoch": 0.9997521860728573, "grad_norm": 1.6001296043395996, "learning_rate": 7.410049302423162e-06, "loss": 0.1632, "mean_token_accuracy": 0.7503546994179487, "num_tokens": 28917760.0, "step": 3530 }, { "epoch": 1.0025489432506107, "grad_norm": 1.2244106531143188, "learning_rate": 7.399559425154727e-06, "loss": 0.1596, "mean_token_accuracy": 0.7478882304475277, "num_tokens": 28998656.0, "step": 3540 }, { "epoch": 1.005381102417956, "grad_norm": 1.0550057888031006, "learning_rate": 7.38906954788629e-06, "loss": 0.159, "mean_token_accuracy": 0.7704011708498001, "num_tokens": 29080576.0, "step": 3550 }, { "epoch": 1.008213261585301, "grad_norm": 0.9888843894004822, "learning_rate": 7.378579670617855e-06, "loss": 0.1623, "mean_token_accuracy": 0.7576443288475275, "num_tokens": 29162496.0, "step": 3560 }, { "epoch": 1.0110454207526463, "grad_norm": 1.0321097373962402, "learning_rate": 7.368089793349419e-06, "loss": 0.1424, "mean_token_accuracy": 0.757803326845169, "num_tokens": 29244416.0, "step": 3570 }, { "epoch": 1.0138775799199915, "grad_norm": 1.2819013595581055, "learning_rate": 7.357599916080983e-06, "loss": 0.1325, "mean_token_accuracy": 0.7876712325960398, "num_tokens": 29326336.0, "step": 3580 }, { "epoch": 1.0167097390873368, "grad_norm": 1.5579744577407837, "learning_rate": 7.347110038812546e-06, "loss": 0.1633, "mean_token_accuracy": 0.7363869857043028, "num_tokens": 29408256.0, "step": 3590 }, { "epoch": 1.019541898254682, "grad_norm": 1.3319820165634155, "learning_rate": 7.33662016154411e-06, "loss": 0.1511, "mean_token_accuracy": 0.7569349359720945, "num_tokens": 29490176.0, "step": 3600 }, { "epoch": 1.022374057422027, "grad_norm": 1.2774381637573242, "learning_rate": 7.326130284275675e-06, "loss": 0.1542, "mean_token_accuracy": 0.7640288658440113, "num_tokens": 29572096.0, "step": 3610 }, { "epoch": 1.0252062165893723, "grad_norm": 1.1909984350204468, "learning_rate": 7.315640407007238e-06, "loss": 0.1501, "mean_token_accuracy": 0.7641267124563456, "num_tokens": 29654016.0, "step": 3620 }, { "epoch": 1.0280383757567175, "grad_norm": 1.1910202503204346, "learning_rate": 7.305150529738803e-06, "loss": 0.1164, "mean_token_accuracy": 0.7849681958556175, "num_tokens": 29735936.0, "step": 3630 }, { "epoch": 1.0308705349240628, "grad_norm": 1.4356422424316406, "learning_rate": 7.2946606524703665e-06, "loss": 0.154, "mean_token_accuracy": 0.7766022492200136, "num_tokens": 29817856.0, "step": 3640 }, { "epoch": 1.033702694091408, "grad_norm": 1.3545632362365723, "learning_rate": 7.284170775201931e-06, "loss": 0.1394, "mean_token_accuracy": 0.7448263213038444, "num_tokens": 29899776.0, "step": 3650 }, { "epoch": 1.036534853258753, "grad_norm": 1.5208156108856201, "learning_rate": 7.273680897933495e-06, "loss": 0.1425, "mean_token_accuracy": 0.7730919763445854, "num_tokens": 29981696.0, "step": 3660 }, { "epoch": 1.0393670124260983, "grad_norm": 1.0131839513778687, "learning_rate": 7.263191020665059e-06, "loss": 0.1415, "mean_token_accuracy": 0.758243640512228, "num_tokens": 30063616.0, "step": 3670 }, { "epoch": 1.0421991715934436, "grad_norm": 1.3783141374588013, "learning_rate": 7.252701143396624e-06, "loss": 0.1479, "mean_token_accuracy": 0.7646771010011435, "num_tokens": 30145536.0, "step": 3680 }, { "epoch": 1.0450313307607888, "grad_norm": 1.1283754110336304, "learning_rate": 7.242211266128187e-06, "loss": 0.1458, "mean_token_accuracy": 0.7575587097555398, "num_tokens": 30227456.0, "step": 3690 }, { "epoch": 1.047863489928134, "grad_norm": 1.1113072633743286, "learning_rate": 7.23172138885975e-06, "loss": 0.1406, "mean_token_accuracy": 0.7670376732945442, "num_tokens": 30309376.0, "step": 3700 }, { "epoch": 1.050695649095479, "grad_norm": 1.6660853624343872, "learning_rate": 7.221231511591315e-06, "loss": 0.1478, "mean_token_accuracy": 0.7698385510593653, "num_tokens": 30391296.0, "step": 3710 }, { "epoch": 1.0535278082628243, "grad_norm": 1.4135735034942627, "learning_rate": 7.210741634322878e-06, "loss": 0.1529, "mean_token_accuracy": 0.7496086109429598, "num_tokens": 30473216.0, "step": 3720 }, { "epoch": 1.0563599674301696, "grad_norm": 1.173937201499939, "learning_rate": 7.200251757054443e-06, "loss": 0.1679, "mean_token_accuracy": 0.7544765170663595, "num_tokens": 30555136.0, "step": 3730 }, { "epoch": 1.0591921265975148, "grad_norm": 1.410231351852417, "learning_rate": 7.189761879786007e-06, "loss": 0.14, "mean_token_accuracy": 0.755675145611167, "num_tokens": 30637056.0, "step": 3740 }, { "epoch": 1.06202428576486, "grad_norm": 1.293808937072754, "learning_rate": 7.179272002517571e-06, "loss": 0.1739, "mean_token_accuracy": 0.7561521483585238, "num_tokens": 30718976.0, "step": 3750 }, { "epoch": 1.064856444932205, "grad_norm": 1.1845042705535889, "learning_rate": 7.168782125249135e-06, "loss": 0.1536, "mean_token_accuracy": 0.7682240687310695, "num_tokens": 30800896.0, "step": 3760 }, { "epoch": 1.0676886040995504, "grad_norm": 1.1527948379516602, "learning_rate": 7.158292247980699e-06, "loss": 0.1423, "mean_token_accuracy": 0.7801247552037239, "num_tokens": 30882816.0, "step": 3770 }, { "epoch": 1.0705207632668956, "grad_norm": 1.3724867105484009, "learning_rate": 7.1478023707122636e-06, "loss": 0.1544, "mean_token_accuracy": 0.7594300344586372, "num_tokens": 30964736.0, "step": 3780 }, { "epoch": 1.0733529224342409, "grad_norm": 0.9297869801521301, "learning_rate": 7.137312493443827e-06, "loss": 0.1331, "mean_token_accuracy": 0.7688600789755583, "num_tokens": 31046656.0, "step": 3790 }, { "epoch": 1.076185081601586, "grad_norm": 1.4095525741577148, "learning_rate": 7.126822616175392e-06, "loss": 0.1515, "mean_token_accuracy": 0.7549168284982443, "num_tokens": 31128576.0, "step": 3800 }, { "epoch": 1.0790172407689311, "grad_norm": 1.3171921968460083, "learning_rate": 7.116332738906956e-06, "loss": 0.1745, "mean_token_accuracy": 0.7319349318742752, "num_tokens": 31210496.0, "step": 3810 }, { "epoch": 1.0818493999362764, "grad_norm": 1.1299370527267456, "learning_rate": 7.105842861638519e-06, "loss": 0.157, "mean_token_accuracy": 0.7691046964377165, "num_tokens": 31292416.0, "step": 3820 }, { "epoch": 1.0846815591036216, "grad_norm": 1.124587059020996, "learning_rate": 7.095352984370083e-06, "loss": 0.1445, "mean_token_accuracy": 0.7558219194412231, "num_tokens": 31374336.0, "step": 3830 }, { "epoch": 1.0875137182709669, "grad_norm": 1.730022668838501, "learning_rate": 7.084863107101647e-06, "loss": 0.1585, "mean_token_accuracy": 0.7561766151338816, "num_tokens": 31456256.0, "step": 3840 }, { "epoch": 1.0903458774383121, "grad_norm": 1.2283018827438354, "learning_rate": 7.074373229833212e-06, "loss": 0.1205, "mean_token_accuracy": 0.7735445186495781, "num_tokens": 31538176.0, "step": 3850 }, { "epoch": 1.0931780366056572, "grad_norm": 1.1412532329559326, "learning_rate": 7.063883352564775e-06, "loss": 0.1528, "mean_token_accuracy": 0.77469422519207, "num_tokens": 31620096.0, "step": 3860 }, { "epoch": 1.0960101957730024, "grad_norm": 1.291068434715271, "learning_rate": 7.05339347529634e-06, "loss": 0.1655, "mean_token_accuracy": 0.7527152627706528, "num_tokens": 31702016.0, "step": 3870 }, { "epoch": 1.0988423549403477, "grad_norm": 1.4640803337097168, "learning_rate": 7.0429035980279035e-06, "loss": 0.1396, "mean_token_accuracy": 0.7694960858672857, "num_tokens": 31783936.0, "step": 3880 }, { "epoch": 1.101674514107693, "grad_norm": 1.4632534980773926, "learning_rate": 7.032413720759468e-06, "loss": 0.147, "mean_token_accuracy": 0.7770547911524772, "num_tokens": 31865856.0, "step": 3890 }, { "epoch": 1.1045066732750382, "grad_norm": 1.4930795431137085, "learning_rate": 7.021923843491032e-06, "loss": 0.142, "mean_token_accuracy": 0.7510029401630163, "num_tokens": 31947776.0, "step": 3900 }, { "epoch": 1.1073388324423832, "grad_norm": 1.2306910753250122, "learning_rate": 7.011433966222596e-06, "loss": 0.1275, "mean_token_accuracy": 0.7758928559720516, "num_tokens": 32029696.0, "step": 3910 }, { "epoch": 1.1101709916097284, "grad_norm": 1.280820608139038, "learning_rate": 7.000944088954161e-06, "loss": 0.1342, "mean_token_accuracy": 0.7799902185797691, "num_tokens": 32111616.0, "step": 3920 }, { "epoch": 1.1130031507770737, "grad_norm": 1.2635548114776611, "learning_rate": 6.990454211685724e-06, "loss": 0.1363, "mean_token_accuracy": 0.7661325864493846, "num_tokens": 32193536.0, "step": 3930 }, { "epoch": 1.115835309944419, "grad_norm": 1.3457741737365723, "learning_rate": 6.979964334417287e-06, "loss": 0.1484, "mean_token_accuracy": 0.7757950119674206, "num_tokens": 32275456.0, "step": 3940 }, { "epoch": 1.1186674691117642, "grad_norm": 1.2960811853408813, "learning_rate": 6.9694744571488516e-06, "loss": 0.146, "mean_token_accuracy": 0.7605308219790459, "num_tokens": 32357376.0, "step": 3950 }, { "epoch": 1.1214996282791092, "grad_norm": 1.4071904420852661, "learning_rate": 6.958984579880416e-06, "loss": 0.1458, "mean_token_accuracy": 0.7659124247729778, "num_tokens": 32439296.0, "step": 3960 }, { "epoch": 1.1243317874464545, "grad_norm": 1.0110926628112793, "learning_rate": 6.94849470261198e-06, "loss": 0.1189, "mean_token_accuracy": 0.7743273012340068, "num_tokens": 32521216.0, "step": 3970 }, { "epoch": 1.1271639466137997, "grad_norm": 1.2925784587860107, "learning_rate": 6.938004825343544e-06, "loss": 0.1483, "mean_token_accuracy": 0.7629525434225798, "num_tokens": 32603136.0, "step": 3980 }, { "epoch": 1.129996105781145, "grad_norm": 1.7860318422317505, "learning_rate": 6.927514948075108e-06, "loss": 0.1779, "mean_token_accuracy": 0.7448140896856785, "num_tokens": 32685056.0, "step": 3990 }, { "epoch": 1.1328282649484902, "grad_norm": 1.1383624076843262, "learning_rate": 6.917025070806672e-06, "loss": 0.1519, "mean_token_accuracy": 0.7636007823050022, "num_tokens": 32766976.0, "step": 4000 }, { "epoch": 1.1356604241158352, "grad_norm": 1.4539917707443237, "learning_rate": 6.906535193538236e-06, "loss": 0.1601, "mean_token_accuracy": 0.7487769085913897, "num_tokens": 32848896.0, "step": 4010 }, { "epoch": 1.1384925832831805, "grad_norm": 1.3108891248703003, "learning_rate": 6.8960453162698005e-06, "loss": 0.1404, "mean_token_accuracy": 0.7612157497555018, "num_tokens": 32930816.0, "step": 4020 }, { "epoch": 1.1413247424505257, "grad_norm": 1.7028967142105103, "learning_rate": 6.885555439001364e-06, "loss": 0.1485, "mean_token_accuracy": 0.7627690806984901, "num_tokens": 33012736.0, "step": 4030 }, { "epoch": 1.144156901617871, "grad_norm": 1.3381489515304565, "learning_rate": 6.875065561732929e-06, "loss": 0.1699, "mean_token_accuracy": 0.7397994138300419, "num_tokens": 33094656.0, "step": 4040 }, { "epoch": 1.146989060785216, "grad_norm": 1.515177845954895, "learning_rate": 6.864575684464493e-06, "loss": 0.1503, "mean_token_accuracy": 0.7573385510593653, "num_tokens": 33176576.0, "step": 4050 }, { "epoch": 1.1498212199525613, "grad_norm": 1.0826586484909058, "learning_rate": 6.854085807196056e-06, "loss": 0.1369, "mean_token_accuracy": 0.7894447185099125, "num_tokens": 33258496.0, "step": 4060 }, { "epoch": 1.1526533791199065, "grad_norm": 1.1878966093063354, "learning_rate": 6.84359592992762e-06, "loss": 0.1478, "mean_token_accuracy": 0.7772627171128988, "num_tokens": 33340416.0, "step": 4070 }, { "epoch": 1.1554855382872518, "grad_norm": 1.1523600816726685, "learning_rate": 6.833106052659184e-06, "loss": 0.1576, "mean_token_accuracy": 0.763062622025609, "num_tokens": 33422336.0, "step": 4080 }, { "epoch": 1.158317697454597, "grad_norm": 1.426184058189392, "learning_rate": 6.822616175390749e-06, "loss": 0.1467, "mean_token_accuracy": 0.754097356274724, "num_tokens": 33504256.0, "step": 4090 }, { "epoch": 1.1611498566219423, "grad_norm": 1.400943398475647, "learning_rate": 6.812126298122312e-06, "loss": 0.1474, "mean_token_accuracy": 0.7744006849825382, "num_tokens": 33586176.0, "step": 4100 }, { "epoch": 1.1639820157892873, "grad_norm": 1.2228891849517822, "learning_rate": 6.801636420853877e-06, "loss": 0.1277, "mean_token_accuracy": 0.7860200580209493, "num_tokens": 33668096.0, "step": 4110 }, { "epoch": 1.1668141749566325, "grad_norm": 1.4715205430984497, "learning_rate": 6.79114654358544e-06, "loss": 0.1651, "mean_token_accuracy": 0.7729818988591433, "num_tokens": 33750016.0, "step": 4120 }, { "epoch": 1.1696463341239778, "grad_norm": 1.3441720008850098, "learning_rate": 6.780656666317005e-06, "loss": 0.1568, "mean_token_accuracy": 0.7634907033294439, "num_tokens": 33831936.0, "step": 4130 }, { "epoch": 1.172478493291323, "grad_norm": 1.5211853981018066, "learning_rate": 6.7701667890485686e-06, "loss": 0.1615, "mean_token_accuracy": 0.7547089036554098, "num_tokens": 33913856.0, "step": 4140 }, { "epoch": 1.175310652458668, "grad_norm": 1.182962417602539, "learning_rate": 6.759676911780133e-06, "loss": 0.166, "mean_token_accuracy": 0.7567636977881194, "num_tokens": 33995776.0, "step": 4150 }, { "epoch": 1.1781428116260133, "grad_norm": 1.1260063648223877, "learning_rate": 6.7491870345116976e-06, "loss": 0.134, "mean_token_accuracy": 0.7878791600465774, "num_tokens": 34077696.0, "step": 4160 }, { "epoch": 1.1809749707933586, "grad_norm": 1.1578644514083862, "learning_rate": 6.738697157243261e-06, "loss": 0.1604, "mean_token_accuracy": 0.7316903125494718, "num_tokens": 34159616.0, "step": 4170 }, { "epoch": 1.1838071299607038, "grad_norm": 1.318691372871399, "learning_rate": 6.728207279974824e-06, "loss": 0.1523, "mean_token_accuracy": 0.7676736813038587, "num_tokens": 34241536.0, "step": 4180 }, { "epoch": 1.186639289128049, "grad_norm": 1.1302433013916016, "learning_rate": 6.7177174027063885e-06, "loss": 0.1545, "mean_token_accuracy": 0.769630628824234, "num_tokens": 34323456.0, "step": 4190 }, { "epoch": 1.1894714482953943, "grad_norm": 1.767404556274414, "learning_rate": 6.707227525437953e-06, "loss": 0.1728, "mean_token_accuracy": 0.7577054813504219, "num_tokens": 34405376.0, "step": 4200 }, { "epoch": 1.1923036074627393, "grad_norm": 1.2372157573699951, "learning_rate": 6.696737648169517e-06, "loss": 0.1359, "mean_token_accuracy": 0.7718688827008009, "num_tokens": 34487296.0, "step": 4210 }, { "epoch": 1.1951357666300846, "grad_norm": 1.8191885948181152, "learning_rate": 6.686247770901081e-06, "loss": 0.1364, "mean_token_accuracy": 0.7800146721303463, "num_tokens": 34569216.0, "step": 4220 }, { "epoch": 1.1979679257974298, "grad_norm": 1.0338172912597656, "learning_rate": 6.675757893632645e-06, "loss": 0.1295, "mean_token_accuracy": 0.7698630146682263, "num_tokens": 34651136.0, "step": 4230 }, { "epoch": 1.200800084964775, "grad_norm": 2.0343899726867676, "learning_rate": 6.665268016364209e-06, "loss": 0.1522, "mean_token_accuracy": 0.7629280831664801, "num_tokens": 34733056.0, "step": 4240 }, { "epoch": 1.20363224413212, "grad_norm": 1.6398661136627197, "learning_rate": 6.654778139095773e-06, "loss": 0.142, "mean_token_accuracy": 0.7607142835855484, "num_tokens": 34814976.0, "step": 4250 }, { "epoch": 1.2064644032994654, "grad_norm": 1.2537733316421509, "learning_rate": 6.6442882618273375e-06, "loss": 0.1491, "mean_token_accuracy": 0.7633072391152382, "num_tokens": 34896896.0, "step": 4260 }, { "epoch": 1.2092965624668106, "grad_norm": 1.158907175064087, "learning_rate": 6.633798384558902e-06, "loss": 0.1138, "mean_token_accuracy": 0.7818126231431961, "num_tokens": 34978816.0, "step": 4270 }, { "epoch": 1.2121287216341559, "grad_norm": 1.2836761474609375, "learning_rate": 6.623308507290466e-06, "loss": 0.1317, "mean_token_accuracy": 0.792551365494728, "num_tokens": 35060736.0, "step": 4280 }, { "epoch": 1.214960880801501, "grad_norm": 1.0998327732086182, "learning_rate": 6.612818630022028e-06, "loss": 0.141, "mean_token_accuracy": 0.7732142861932516, "num_tokens": 35142656.0, "step": 4290 }, { "epoch": 1.2177930399688464, "grad_norm": 1.312498688697815, "learning_rate": 6.602328752753593e-06, "loss": 0.1562, "mean_token_accuracy": 0.7645792573690414, "num_tokens": 35224576.0, "step": 4300 }, { "epoch": 1.2206251991361914, "grad_norm": 1.7708628177642822, "learning_rate": 6.591838875485157e-06, "loss": 0.1476, "mean_token_accuracy": 0.7577544063329696, "num_tokens": 35306496.0, "step": 4310 }, { "epoch": 1.2234573583035366, "grad_norm": 1.1896047592163086, "learning_rate": 6.581348998216721e-06, "loss": 0.1482, "mean_token_accuracy": 0.7766022499650717, "num_tokens": 35388416.0, "step": 4320 }, { "epoch": 1.2262895174708819, "grad_norm": 1.1819688081741333, "learning_rate": 6.5708591209482856e-06, "loss": 0.141, "mean_token_accuracy": 0.7666952088475227, "num_tokens": 35470336.0, "step": 4330 }, { "epoch": 1.2291216766382271, "grad_norm": 1.2172794342041016, "learning_rate": 6.560369243679849e-06, "loss": 0.1653, "mean_token_accuracy": 0.7531678069382906, "num_tokens": 35552256.0, "step": 4340 }, { "epoch": 1.2319538358055722, "grad_norm": 1.5508960485458374, "learning_rate": 6.549879366411414e-06, "loss": 0.1649, "mean_token_accuracy": 0.7559686873108149, "num_tokens": 35634176.0, "step": 4350 }, { "epoch": 1.2347859949729174, "grad_norm": 1.5100057125091553, "learning_rate": 6.539389489142977e-06, "loss": 0.1381, "mean_token_accuracy": 0.7603351261466742, "num_tokens": 35716096.0, "step": 4360 }, { "epoch": 1.2376181541402627, "grad_norm": 1.6024094820022583, "learning_rate": 6.528899611874542e-06, "loss": 0.1322, "mean_token_accuracy": 0.7459393355995416, "num_tokens": 35798016.0, "step": 4370 }, { "epoch": 1.240450313307608, "grad_norm": 2.0412209033966064, "learning_rate": 6.5184097346061055e-06, "loss": 0.1532, "mean_token_accuracy": 0.7603840488940478, "num_tokens": 35879936.0, "step": 4380 }, { "epoch": 1.2432824724749532, "grad_norm": 1.3321079015731812, "learning_rate": 6.50791985733767e-06, "loss": 0.1552, "mean_token_accuracy": 0.770278862118721, "num_tokens": 35961856.0, "step": 4390 }, { "epoch": 1.2461146316422984, "grad_norm": 1.4159857034683228, "learning_rate": 6.4974299800692345e-06, "loss": 0.1428, "mean_token_accuracy": 0.7683341484516859, "num_tokens": 36043776.0, "step": 4400 }, { "epoch": 1.2489467908096434, "grad_norm": 1.6967610120773315, "learning_rate": 6.486940102800797e-06, "loss": 0.1743, "mean_token_accuracy": 0.7465753432363271, "num_tokens": 36125696.0, "step": 4410 }, { "epoch": 1.2517789499769887, "grad_norm": 1.4607009887695312, "learning_rate": 6.476450225532361e-06, "loss": 0.1561, "mean_token_accuracy": 0.7734711334109307, "num_tokens": 36207616.0, "step": 4420 }, { "epoch": 1.254611109144334, "grad_norm": 1.37501859664917, "learning_rate": 6.4659603482639255e-06, "loss": 0.1295, "mean_token_accuracy": 0.7805283755064011, "num_tokens": 36289536.0, "step": 4430 }, { "epoch": 1.2574432683116792, "grad_norm": 1.6977213621139526, "learning_rate": 6.45547047099549e-06, "loss": 0.1615, "mean_token_accuracy": 0.7621942244470119, "num_tokens": 36371456.0, "step": 4440 }, { "epoch": 1.2602754274790242, "grad_norm": 1.40463125705719, "learning_rate": 6.444980593727054e-06, "loss": 0.1618, "mean_token_accuracy": 0.7591487251222133, "num_tokens": 36453376.0, "step": 4450 }, { "epoch": 1.2631075866463695, "grad_norm": 1.112905502319336, "learning_rate": 6.434490716458618e-06, "loss": 0.1392, "mean_token_accuracy": 0.7783635046333075, "num_tokens": 36535296.0, "step": 4460 }, { "epoch": 1.2659397458137147, "grad_norm": 1.1654239892959595, "learning_rate": 6.424000839190182e-06, "loss": 0.1447, "mean_token_accuracy": 0.7743028368800878, "num_tokens": 36617216.0, "step": 4470 }, { "epoch": 1.26877190498106, "grad_norm": 1.368154525756836, "learning_rate": 6.413510961921746e-06, "loss": 0.1726, "mean_token_accuracy": 0.740496576204896, "num_tokens": 36699136.0, "step": 4480 }, { "epoch": 1.2716040641484052, "grad_norm": 1.3509780168533325, "learning_rate": 6.40302108465331e-06, "loss": 0.1513, "mean_token_accuracy": 0.7747798439115285, "num_tokens": 36781056.0, "step": 4490 }, { "epoch": 1.2744362233157505, "grad_norm": 1.2839014530181885, "learning_rate": 6.392531207384874e-06, "loss": 0.1611, "mean_token_accuracy": 0.7618884552270174, "num_tokens": 36862976.0, "step": 4500 }, { "epoch": 1.2772683824830955, "grad_norm": 1.5664631128311157, "learning_rate": 6.382041330116439e-06, "loss": 0.148, "mean_token_accuracy": 0.7698385529220104, "num_tokens": 36944896.0, "step": 4510 }, { "epoch": 1.2801005416504407, "grad_norm": 1.3136051893234253, "learning_rate": 6.3715514528480026e-06, "loss": 0.1355, "mean_token_accuracy": 0.7603840511292219, "num_tokens": 37026816.0, "step": 4520 }, { "epoch": 1.282932700817786, "grad_norm": 1.2616698741912842, "learning_rate": 6.361061575579565e-06, "loss": 0.1371, "mean_token_accuracy": 0.7675758324563503, "num_tokens": 37108736.0, "step": 4530 }, { "epoch": 1.2857648599851312, "grad_norm": 1.853359341621399, "learning_rate": 6.35057169831113e-06, "loss": 0.1712, "mean_token_accuracy": 0.7541829720139503, "num_tokens": 37190656.0, "step": 4540 }, { "epoch": 1.2885970191524763, "grad_norm": 1.133120059967041, "learning_rate": 6.340081821042694e-06, "loss": 0.1307, "mean_token_accuracy": 0.7765533246099949, "num_tokens": 37272576.0, "step": 4550 }, { "epoch": 1.2914291783198215, "grad_norm": 1.3735811710357666, "learning_rate": 6.329591943774258e-06, "loss": 0.1641, "mean_token_accuracy": 0.7597358107566834, "num_tokens": 37354496.0, "step": 4560 }, { "epoch": 1.2942613374871668, "grad_norm": 1.4799385070800781, "learning_rate": 6.3191020665058225e-06, "loss": 0.1664, "mean_token_accuracy": 0.7421599786728621, "num_tokens": 37436416.0, "step": 4570 }, { "epoch": 1.297093496654512, "grad_norm": 1.1046998500823975, "learning_rate": 6.308612189237386e-06, "loss": 0.1572, "mean_token_accuracy": 0.7445083215832711, "num_tokens": 37518336.0, "step": 4580 }, { "epoch": 1.2999256558218573, "grad_norm": 0.9093125462532043, "learning_rate": 6.298122311968951e-06, "loss": 0.1299, "mean_token_accuracy": 0.7606409013271331, "num_tokens": 37600256.0, "step": 4590 }, { "epoch": 1.3027578149892025, "grad_norm": 1.3680236339569092, "learning_rate": 6.287632434700514e-06, "loss": 0.1778, "mean_token_accuracy": 0.7463551837950945, "num_tokens": 37682176.0, "step": 4600 }, { "epoch": 1.3055899741565475, "grad_norm": 1.3345285654067993, "learning_rate": 6.277142557432079e-06, "loss": 0.146, "mean_token_accuracy": 0.7786937344819307, "num_tokens": 37764096.0, "step": 4610 }, { "epoch": 1.3084221333238928, "grad_norm": 0.9545773267745972, "learning_rate": 6.266652680163643e-06, "loss": 0.153, "mean_token_accuracy": 0.7764432489871979, "num_tokens": 37846016.0, "step": 4620 }, { "epoch": 1.311254292491238, "grad_norm": 1.472604751586914, "learning_rate": 6.256162802895207e-06, "loss": 0.1552, "mean_token_accuracy": 0.7620963774621486, "num_tokens": 37927936.0, "step": 4630 }, { "epoch": 1.3140864516585833, "grad_norm": 0.9628563523292542, "learning_rate": 6.2456729256267715e-06, "loss": 0.1247, "mean_token_accuracy": 0.7696550875902176, "num_tokens": 38009856.0, "step": 4640 }, { "epoch": 1.3169186108259283, "grad_norm": 1.549745798110962, "learning_rate": 6.235183048358334e-06, "loss": 0.1557, "mean_token_accuracy": 0.7659246563911438, "num_tokens": 38091776.0, "step": 4650 }, { "epoch": 1.3197507699932736, "grad_norm": 1.3386218547821045, "learning_rate": 6.224693171089898e-06, "loss": 0.1327, "mean_token_accuracy": 0.7911815065890551, "num_tokens": 38173696.0, "step": 4660 }, { "epoch": 1.3225829291606188, "grad_norm": 1.1905710697174072, "learning_rate": 6.2142032938214624e-06, "loss": 0.1507, "mean_token_accuracy": 0.7516022492200136, "num_tokens": 38255616.0, "step": 4670 }, { "epoch": 1.325415088327964, "grad_norm": 1.2225019931793213, "learning_rate": 6.203713416553027e-06, "loss": 0.131, "mean_token_accuracy": 0.7831702534109354, "num_tokens": 38337536.0, "step": 4680 }, { "epoch": 1.3282472474953093, "grad_norm": 1.262401819229126, "learning_rate": 6.1932235392845906e-06, "loss": 0.1471, "mean_token_accuracy": 0.7721991177648306, "num_tokens": 38419456.0, "step": 4690 }, { "epoch": 1.3310794066626546, "grad_norm": 1.3216712474822998, "learning_rate": 6.182733662016155e-06, "loss": 0.1493, "mean_token_accuracy": 0.7624143823981285, "num_tokens": 38501376.0, "step": 4700 }, { "epoch": 1.3339115658299996, "grad_norm": 1.8408701419830322, "learning_rate": 6.172243784747719e-06, "loss": 0.1636, "mean_token_accuracy": 0.7578522481024266, "num_tokens": 38583296.0, "step": 4710 }, { "epoch": 1.3367437249973448, "grad_norm": 1.0912449359893799, "learning_rate": 6.161753907479283e-06, "loss": 0.1532, "mean_token_accuracy": 0.761484831944108, "num_tokens": 38665216.0, "step": 4720 }, { "epoch": 1.33957588416469, "grad_norm": 1.055835247039795, "learning_rate": 6.151264030210847e-06, "loss": 0.1374, "mean_token_accuracy": 0.748581214621663, "num_tokens": 38747136.0, "step": 4730 }, { "epoch": 1.3424080433320353, "grad_norm": 1.3156987428665161, "learning_rate": 6.140774152942411e-06, "loss": 0.1362, "mean_token_accuracy": 0.7838674157857894, "num_tokens": 38829056.0, "step": 4740 }, { "epoch": 1.3452402024993804, "grad_norm": 1.524060845375061, "learning_rate": 6.130284275673976e-06, "loss": 0.1512, "mean_token_accuracy": 0.7603840507566929, "num_tokens": 38910976.0, "step": 4750 }, { "epoch": 1.3480723616667256, "grad_norm": 1.510961651802063, "learning_rate": 6.1197943984055395e-06, "loss": 0.1749, "mean_token_accuracy": 0.7445572406053543, "num_tokens": 38992896.0, "step": 4760 }, { "epoch": 1.3509045208340709, "grad_norm": 1.1437835693359375, "learning_rate": 6.109304521137102e-06, "loss": 0.1338, "mean_token_accuracy": 0.7704867891967296, "num_tokens": 39074816.0, "step": 4770 }, { "epoch": 1.3537366800014161, "grad_norm": 1.2372264862060547, "learning_rate": 6.098814643868667e-06, "loss": 0.1458, "mean_token_accuracy": 0.7585249520838261, "num_tokens": 39156736.0, "step": 4780 }, { "epoch": 1.3565688391687614, "grad_norm": 1.0541081428527832, "learning_rate": 6.088324766600231e-06, "loss": 0.1435, "mean_token_accuracy": 0.7624633044004441, "num_tokens": 39238656.0, "step": 4790 }, { "epoch": 1.3594009983361066, "grad_norm": 0.9931426048278809, "learning_rate": 6.077834889331795e-06, "loss": 0.1343, "mean_token_accuracy": 0.7574853219091893, "num_tokens": 39320576.0, "step": 4800 }, { "epoch": 1.3622331575034516, "grad_norm": 1.1268242597579956, "learning_rate": 6.0673450120633595e-06, "loss": 0.1546, "mean_token_accuracy": 0.7514921735972167, "num_tokens": 39402496.0, "step": 4810 }, { "epoch": 1.365065316670797, "grad_norm": 1.043038249015808, "learning_rate": 6.056855134794923e-06, "loss": 0.1509, "mean_token_accuracy": 0.756115460768342, "num_tokens": 39484416.0, "step": 4820 }, { "epoch": 1.3678974758381421, "grad_norm": 1.308717966079712, "learning_rate": 6.046365257526488e-06, "loss": 0.1739, "mean_token_accuracy": 0.7565557733178139, "num_tokens": 39566336.0, "step": 4830 }, { "epoch": 1.3707296350054874, "grad_norm": 0.9775325059890747, "learning_rate": 6.035875380258051e-06, "loss": 0.1419, "mean_token_accuracy": 0.7432362988591195, "num_tokens": 39648256.0, "step": 4840 }, { "epoch": 1.3735617941728324, "grad_norm": 1.4292516708374023, "learning_rate": 6.025385502989616e-06, "loss": 0.1871, "mean_token_accuracy": 0.7510518580675125, "num_tokens": 39730176.0, "step": 4850 }, { "epoch": 1.3763939533401777, "grad_norm": 1.0225476026535034, "learning_rate": 6.01489562572118e-06, "loss": 0.1439, "mean_token_accuracy": 0.7606286656111478, "num_tokens": 39812096.0, "step": 4860 }, { "epoch": 1.379226112507523, "grad_norm": 1.322291374206543, "learning_rate": 6.004405748452744e-06, "loss": 0.1662, "mean_token_accuracy": 0.7463918786495924, "num_tokens": 39894016.0, "step": 4870 }, { "epoch": 1.3820582716748682, "grad_norm": 1.3240032196044922, "learning_rate": 5.9939158711843084e-06, "loss": 0.1905, "mean_token_accuracy": 0.7313111577183008, "num_tokens": 39975936.0, "step": 4880 }, { "epoch": 1.3848904308422134, "grad_norm": 1.168860912322998, "learning_rate": 5.983425993915871e-06, "loss": 0.1392, "mean_token_accuracy": 0.7595768075436353, "num_tokens": 40057856.0, "step": 4890 }, { "epoch": 1.3877225900095587, "grad_norm": 1.1481465101242065, "learning_rate": 5.972936116647436e-06, "loss": 0.1654, "mean_token_accuracy": 0.7479696717113257, "num_tokens": 40139776.0, "step": 4900 }, { "epoch": 1.3905547491769037, "grad_norm": 1.3022313117980957, "learning_rate": 5.962446239378999e-06, "loss": 0.1757, "mean_token_accuracy": 0.7338918764144182, "num_tokens": 40221696.0, "step": 4910 }, { "epoch": 1.393386908344249, "grad_norm": 1.5090274810791016, "learning_rate": 5.951956362110564e-06, "loss": 0.1603, "mean_token_accuracy": 0.766927595064044, "num_tokens": 40303616.0, "step": 4920 }, { "epoch": 1.3962190675115942, "grad_norm": 1.2486814260482788, "learning_rate": 5.9414664848421275e-06, "loss": 0.1412, "mean_token_accuracy": 0.771477498114109, "num_tokens": 40385536.0, "step": 4930 }, { "epoch": 1.3990512266789392, "grad_norm": 1.6799087524414062, "learning_rate": 5.930976607573692e-06, "loss": 0.1489, "mean_token_accuracy": 0.7679305300116539, "num_tokens": 40467456.0, "step": 4940 }, { "epoch": 1.4018833858462845, "grad_norm": 1.310074806213379, "learning_rate": 5.920486730305256e-06, "loss": 0.1419, "mean_token_accuracy": 0.7625611569732428, "num_tokens": 40549376.0, "step": 4950 }, { "epoch": 1.4047155450136297, "grad_norm": 0.9224068522453308, "learning_rate": 5.90999685303682e-06, "loss": 0.1492, "mean_token_accuracy": 0.7620841458439827, "num_tokens": 40631296.0, "step": 4960 }, { "epoch": 1.407547704180975, "grad_norm": 1.083561897277832, "learning_rate": 5.899506975768385e-06, "loss": 0.1428, "mean_token_accuracy": 0.7647504866123199, "num_tokens": 40713216.0, "step": 4970 }, { "epoch": 1.4103798633483202, "grad_norm": 1.225934624671936, "learning_rate": 5.889017098499948e-06, "loss": 0.1476, "mean_token_accuracy": 0.7516511730849743, "num_tokens": 40795136.0, "step": 4980 }, { "epoch": 1.4132120225156655, "grad_norm": 1.319635033607483, "learning_rate": 5.878527221231513e-06, "loss": 0.1508, "mean_token_accuracy": 0.7660714261233806, "num_tokens": 40877056.0, "step": 4990 }, { "epoch": 1.4160441816830107, "grad_norm": 1.386938214302063, "learning_rate": 5.868037343963076e-06, "loss": 0.164, "mean_token_accuracy": 0.7583659503608942, "num_tokens": 40958976.0, "step": 5000 }, { "epoch": 1.4188763408503557, "grad_norm": 0.8875451683998108, "learning_rate": 5.857547466694639e-06, "loss": 0.1281, "mean_token_accuracy": 0.7618150662630796, "num_tokens": 41040896.0, "step": 5010 }, { "epoch": 1.421708500017701, "grad_norm": 1.4074711799621582, "learning_rate": 5.847057589426204e-06, "loss": 0.1426, "mean_token_accuracy": 0.7689579267054796, "num_tokens": 41122816.0, "step": 5020 }, { "epoch": 1.4245406591850462, "grad_norm": 0.9906668066978455, "learning_rate": 5.836567712157768e-06, "loss": 0.1549, "mean_token_accuracy": 0.7471501965075731, "num_tokens": 41204736.0, "step": 5030 }, { "epoch": 1.4273728183523913, "grad_norm": 1.2162476778030396, "learning_rate": 5.826077834889332e-06, "loss": 0.1366, "mean_token_accuracy": 0.7777030356228352, "num_tokens": 41286656.0, "step": 5040 }, { "epoch": 1.4302049775197365, "grad_norm": 1.366721510887146, "learning_rate": 5.8155879576208964e-06, "loss": 0.1357, "mean_token_accuracy": 0.7593321919441223, "num_tokens": 41368576.0, "step": 5050 }, { "epoch": 1.4330371366870818, "grad_norm": 1.6212693452835083, "learning_rate": 5.80509808035246e-06, "loss": 0.1595, "mean_token_accuracy": 0.7368273001164198, "num_tokens": 41450496.0, "step": 5060 }, { "epoch": 1.435869295854427, "grad_norm": 1.4507249593734741, "learning_rate": 5.794608203084025e-06, "loss": 0.1724, "mean_token_accuracy": 0.7467955019325018, "num_tokens": 41532416.0, "step": 5070 }, { "epoch": 1.4387014550217723, "grad_norm": 1.33669912815094, "learning_rate": 5.784118325815588e-06, "loss": 0.1357, "mean_token_accuracy": 0.7817270044237375, "num_tokens": 41614336.0, "step": 5080 }, { "epoch": 1.4415336141891175, "grad_norm": 1.1292166709899902, "learning_rate": 5.773628448547153e-06, "loss": 0.1186, "mean_token_accuracy": 0.8025929532945156, "num_tokens": 41696256.0, "step": 5090 }, { "epoch": 1.4443657733564628, "grad_norm": 1.426797866821289, "learning_rate": 5.763138571278717e-06, "loss": 0.1302, "mean_token_accuracy": 0.7718444231897592, "num_tokens": 41778176.0, "step": 5100 }, { "epoch": 1.4471979325238078, "grad_norm": 1.319903016090393, "learning_rate": 5.752648694010281e-06, "loss": 0.164, "mean_token_accuracy": 0.7661203496158123, "num_tokens": 41860096.0, "step": 5110 }, { "epoch": 1.450030091691153, "grad_norm": 1.547183632850647, "learning_rate": 5.742158816741844e-06, "loss": 0.1427, "mean_token_accuracy": 0.7687255389988422, "num_tokens": 41942016.0, "step": 5120 }, { "epoch": 1.4528622508584983, "grad_norm": 1.4972529411315918, "learning_rate": 5.731668939473408e-06, "loss": 0.1495, "mean_token_accuracy": 0.7581580251455307, "num_tokens": 42023936.0, "step": 5130 }, { "epoch": 1.4556944100258433, "grad_norm": 1.038957953453064, "learning_rate": 5.721179062204973e-06, "loss": 0.1394, "mean_token_accuracy": 0.7617416813969612, "num_tokens": 42105856.0, "step": 5140 }, { "epoch": 1.4585265691931886, "grad_norm": 1.362169623374939, "learning_rate": 5.710689184936536e-06, "loss": 0.1429, "mean_token_accuracy": 0.7665239717811346, "num_tokens": 42187776.0, "step": 5150 }, { "epoch": 1.4613587283605338, "grad_norm": 1.1335233449935913, "learning_rate": 5.700199307668101e-06, "loss": 0.1367, "mean_token_accuracy": 0.7705968640744686, "num_tokens": 42269696.0, "step": 5160 }, { "epoch": 1.464190887527879, "grad_norm": 0.9468916058540344, "learning_rate": 5.6897094303996645e-06, "loss": 0.1434, "mean_token_accuracy": 0.7534858129918576, "num_tokens": 42351616.0, "step": 5170 }, { "epoch": 1.4670230466952243, "grad_norm": 1.2155487537384033, "learning_rate": 5.679219553131229e-06, "loss": 0.1287, "mean_token_accuracy": 0.7738013707101346, "num_tokens": 42433536.0, "step": 5180 }, { "epoch": 1.4698552058625696, "grad_norm": 1.442675232887268, "learning_rate": 5.668729675862793e-06, "loss": 0.1713, "mean_token_accuracy": 0.7675269097089767, "num_tokens": 42515456.0, "step": 5190 }, { "epoch": 1.4726873650299148, "grad_norm": 1.367919683456421, "learning_rate": 5.658239798594357e-06, "loss": 0.1722, "mean_token_accuracy": 0.7303693741559982, "num_tokens": 42597376.0, "step": 5200 }, { "epoch": 1.4755195241972598, "grad_norm": 1.0879158973693848, "learning_rate": 5.647749921325922e-06, "loss": 0.1595, "mean_token_accuracy": 0.7517612487077713, "num_tokens": 42679296.0, "step": 5210 }, { "epoch": 1.478351683364605, "grad_norm": 1.4965977668762207, "learning_rate": 5.637260044057485e-06, "loss": 0.1597, "mean_token_accuracy": 0.7517734851688147, "num_tokens": 42761216.0, "step": 5220 }, { "epoch": 1.4811838425319503, "grad_norm": 1.3967041969299316, "learning_rate": 5.62677016678905e-06, "loss": 0.1633, "mean_token_accuracy": 0.7461717240512371, "num_tokens": 42843136.0, "step": 5230 }, { "epoch": 1.4840160016992954, "grad_norm": 1.0910017490386963, "learning_rate": 5.616280289520613e-06, "loss": 0.1531, "mean_token_accuracy": 0.7749755389988422, "num_tokens": 42925056.0, "step": 5240 }, { "epoch": 1.4868481608666406, "grad_norm": 1.6112931966781616, "learning_rate": 5.605790412252177e-06, "loss": 0.1513, "mean_token_accuracy": 0.759821429848671, "num_tokens": 43006976.0, "step": 5250 }, { "epoch": 1.4896803200339859, "grad_norm": 1.4555633068084717, "learning_rate": 5.595300534983741e-06, "loss": 0.1218, "mean_token_accuracy": 0.7777030356228352, "num_tokens": 43088896.0, "step": 5260 }, { "epoch": 1.4925124792013311, "grad_norm": 1.2095690965652466, "learning_rate": 5.584810657715305e-06, "loss": 0.1361, "mean_token_accuracy": 0.7724070467054844, "num_tokens": 43170816.0, "step": 5270 }, { "epoch": 1.4953446383686764, "grad_norm": 1.534693956375122, "learning_rate": 5.574320780446869e-06, "loss": 0.1303, "mean_token_accuracy": 0.7665484361350536, "num_tokens": 43252736.0, "step": 5280 }, { "epoch": 1.4981767975360216, "grad_norm": 1.6403086185455322, "learning_rate": 5.563830903178433e-06, "loss": 0.1587, "mean_token_accuracy": 0.7772994108498097, "num_tokens": 43334656.0, "step": 5290 }, { "epoch": 1.5010089567033669, "grad_norm": 1.2470252513885498, "learning_rate": 5.553341025909997e-06, "loss": 0.1367, "mean_token_accuracy": 0.7570327784866094, "num_tokens": 43416576.0, "step": 5300 }, { "epoch": 1.503841115870712, "grad_norm": 1.2972415685653687, "learning_rate": 5.5428511486415615e-06, "loss": 0.1445, "mean_token_accuracy": 0.7555039104074239, "num_tokens": 43498496.0, "step": 5310 }, { "epoch": 1.5066732750380571, "grad_norm": 1.089989185333252, "learning_rate": 5.532361271373126e-06, "loss": 0.1497, "mean_token_accuracy": 0.7693737719208003, "num_tokens": 43580416.0, "step": 5320 }, { "epoch": 1.5095054342054024, "grad_norm": 1.447652816772461, "learning_rate": 5.52187139410469e-06, "loss": 0.1393, "mean_token_accuracy": 0.7628546942025423, "num_tokens": 43662336.0, "step": 5330 }, { "epoch": 1.5123375933727474, "grad_norm": 1.2589353322982788, "learning_rate": 5.511381516836254e-06, "loss": 0.1449, "mean_token_accuracy": 0.7809809196740389, "num_tokens": 43744256.0, "step": 5340 }, { "epoch": 1.5151697525400927, "grad_norm": 1.2996416091918945, "learning_rate": 5.500891639567818e-06, "loss": 0.1127, "mean_token_accuracy": 0.7794275935739279, "num_tokens": 43826176.0, "step": 5350 }, { "epoch": 1.518001911707438, "grad_norm": 1.3021950721740723, "learning_rate": 5.490401762299381e-06, "loss": 0.1266, "mean_token_accuracy": 0.7822773978114128, "num_tokens": 43908096.0, "step": 5360 }, { "epoch": 1.5208340708747832, "grad_norm": 1.0567891597747803, "learning_rate": 5.479911885030945e-06, "loss": 0.1426, "mean_token_accuracy": 0.7647627212107182, "num_tokens": 43990016.0, "step": 5370 }, { "epoch": 1.5236662300421284, "grad_norm": 1.370571494102478, "learning_rate": 5.46942200776251e-06, "loss": 0.1348, "mean_token_accuracy": 0.7819593951106072, "num_tokens": 44071936.0, "step": 5380 }, { "epoch": 1.5264983892094737, "grad_norm": 1.4271764755249023, "learning_rate": 5.458932130494073e-06, "loss": 0.1608, "mean_token_accuracy": 0.7575709402561188, "num_tokens": 44153856.0, "step": 5390 }, { "epoch": 1.529330548376819, "grad_norm": 1.4001497030258179, "learning_rate": 5.448442253225638e-06, "loss": 0.1587, "mean_token_accuracy": 0.7693982385098934, "num_tokens": 44235776.0, "step": 5400 }, { "epoch": 1.532162707544164, "grad_norm": 1.462554693222046, "learning_rate": 5.4379523759572014e-06, "loss": 0.1278, "mean_token_accuracy": 0.7782411959022284, "num_tokens": 44317696.0, "step": 5410 }, { "epoch": 1.5349948667115092, "grad_norm": 1.293890357017517, "learning_rate": 5.427462498688766e-06, "loss": 0.133, "mean_token_accuracy": 0.7597725078463554, "num_tokens": 44399616.0, "step": 5420 }, { "epoch": 1.5378270258788542, "grad_norm": 1.263534665107727, "learning_rate": 5.41697262142033e-06, "loss": 0.1479, "mean_token_accuracy": 0.7797089036554098, "num_tokens": 44481536.0, "step": 5430 }, { "epoch": 1.5406591850461995, "grad_norm": 1.1234465837478638, "learning_rate": 5.406482744151894e-06, "loss": 0.1493, "mean_token_accuracy": 0.7515777889639139, "num_tokens": 44563456.0, "step": 5440 }, { "epoch": 1.5434913442135447, "grad_norm": 1.2508556842803955, "learning_rate": 5.395992866883459e-06, "loss": 0.144, "mean_token_accuracy": 0.7621208388358355, "num_tokens": 44645376.0, "step": 5450 }, { "epoch": 1.54632350338089, "grad_norm": 1.0123307704925537, "learning_rate": 5.385502989615022e-06, "loss": 0.1378, "mean_token_accuracy": 0.7777886517345906, "num_tokens": 44727296.0, "step": 5460 }, { "epoch": 1.5491556625482352, "grad_norm": 1.485517144203186, "learning_rate": 5.375013112346587e-06, "loss": 0.1426, "mean_token_accuracy": 0.7553082205355167, "num_tokens": 44809216.0, "step": 5470 }, { "epoch": 1.5519878217155805, "grad_norm": 1.2062337398529053, "learning_rate": 5.3645232350781495e-06, "loss": 0.1269, "mean_token_accuracy": 0.7857509776949883, "num_tokens": 44891136.0, "step": 5480 }, { "epoch": 1.5548199808829257, "grad_norm": 1.4012064933776855, "learning_rate": 5.354033357809714e-06, "loss": 0.1487, "mean_token_accuracy": 0.7709026386961341, "num_tokens": 44973056.0, "step": 5490 }, { "epoch": 1.557652140050271, "grad_norm": 1.3313307762145996, "learning_rate": 5.343543480541278e-06, "loss": 0.1419, "mean_token_accuracy": 0.7683096852153539, "num_tokens": 45054976.0, "step": 5500 }, { "epoch": 1.560484299217616, "grad_norm": 1.7144908905029297, "learning_rate": 5.333053603272842e-06, "loss": 0.1654, "mean_token_accuracy": 0.7630503926426172, "num_tokens": 45136896.0, "step": 5510 }, { "epoch": 1.5633164583849612, "grad_norm": 1.4874516725540161, "learning_rate": 5.322563726004406e-06, "loss": 0.1448, "mean_token_accuracy": 0.7626100767403841, "num_tokens": 45218816.0, "step": 5520 }, { "epoch": 1.5661486175523063, "grad_norm": 1.499945044517517, "learning_rate": 5.31207384873597e-06, "loss": 0.1874, "mean_token_accuracy": 0.7353473532944917, "num_tokens": 45300736.0, "step": 5530 }, { "epoch": 1.5689807767196515, "grad_norm": 0.9582758545875549, "learning_rate": 5.301583971467534e-06, "loss": 0.128, "mean_token_accuracy": 0.7822407066822052, "num_tokens": 45382656.0, "step": 5540 }, { "epoch": 1.5718129358869968, "grad_norm": 1.0210357904434204, "learning_rate": 5.2910940941990985e-06, "loss": 0.1438, "mean_token_accuracy": 0.7709148716181516, "num_tokens": 45464576.0, "step": 5550 }, { "epoch": 1.574645095054342, "grad_norm": 1.0268406867980957, "learning_rate": 5.280604216930663e-06, "loss": 0.1212, "mean_token_accuracy": 0.7854941338300705, "num_tokens": 45546496.0, "step": 5560 }, { "epoch": 1.5774772542216873, "grad_norm": 1.132442593574524, "learning_rate": 5.270114339662227e-06, "loss": 0.1103, "mean_token_accuracy": 0.7826810192316771, "num_tokens": 45628416.0, "step": 5570 }, { "epoch": 1.5803094133890325, "grad_norm": 0.9779514074325562, "learning_rate": 5.259624462393791e-06, "loss": 0.1465, "mean_token_accuracy": 0.7640410970896483, "num_tokens": 45710336.0, "step": 5580 }, { "epoch": 1.5831415725563778, "grad_norm": 1.5635689496994019, "learning_rate": 5.249134585125355e-06, "loss": 0.1445, "mean_token_accuracy": 0.7751345373690128, "num_tokens": 45792256.0, "step": 5590 }, { "epoch": 1.585973731723723, "grad_norm": 1.183363437652588, "learning_rate": 5.2386447078569184e-06, "loss": 0.1432, "mean_token_accuracy": 0.7729329708963633, "num_tokens": 45874176.0, "step": 5600 }, { "epoch": 1.588805890891068, "grad_norm": 1.1412488222122192, "learning_rate": 5.228154830588482e-06, "loss": 0.134, "mean_token_accuracy": 0.7692147780209779, "num_tokens": 45956096.0, "step": 5610 }, { "epoch": 1.5916380500584133, "grad_norm": 0.8815058469772339, "learning_rate": 5.217664953320047e-06, "loss": 0.1501, "mean_token_accuracy": 0.7613258309662342, "num_tokens": 46038016.0, "step": 5620 }, { "epoch": 1.5944702092257583, "grad_norm": 1.241381049156189, "learning_rate": 5.20717507605161e-06, "loss": 0.1576, "mean_token_accuracy": 0.7301002964377403, "num_tokens": 46119936.0, "step": 5630 }, { "epoch": 1.5973023683931036, "grad_norm": 0.9993299841880798, "learning_rate": 5.196685198783175e-06, "loss": 0.1162, "mean_token_accuracy": 0.7772137984633446, "num_tokens": 46201856.0, "step": 5640 }, { "epoch": 1.6001345275604488, "grad_norm": 1.267540693283081, "learning_rate": 5.186195321514738e-06, "loss": 0.1439, "mean_token_accuracy": 0.7400440324097872, "num_tokens": 46283776.0, "step": 5650 }, { "epoch": 1.602966686727794, "grad_norm": 1.2300502061843872, "learning_rate": 5.175705444246303e-06, "loss": 0.1426, "mean_token_accuracy": 0.7766144797205925, "num_tokens": 46365696.0, "step": 5660 }, { "epoch": 1.6057988458951393, "grad_norm": 1.20527982711792, "learning_rate": 5.165215566977867e-06, "loss": 0.1295, "mean_token_accuracy": 0.7617539137601852, "num_tokens": 46447616.0, "step": 5670 }, { "epoch": 1.6086310050624846, "grad_norm": 1.313085675239563, "learning_rate": 5.154725689709431e-06, "loss": 0.1473, "mean_token_accuracy": 0.7565435413271189, "num_tokens": 46529536.0, "step": 5680 }, { "epoch": 1.6114631642298298, "grad_norm": 1.4303587675094604, "learning_rate": 5.1442358124409955e-06, "loss": 0.1538, "mean_token_accuracy": 0.7667196657508611, "num_tokens": 46611456.0, "step": 5690 }, { "epoch": 1.614295323397175, "grad_norm": 1.424620509147644, "learning_rate": 5.133745935172559e-06, "loss": 0.1444, "mean_token_accuracy": 0.762353228777647, "num_tokens": 46693376.0, "step": 5700 }, { "epoch": 1.61712748256452, "grad_norm": 1.2423510551452637, "learning_rate": 5.123256057904122e-06, "loss": 0.152, "mean_token_accuracy": 0.7459148727357388, "num_tokens": 46775296.0, "step": 5710 }, { "epoch": 1.6199596417318654, "grad_norm": 1.3926820755004883, "learning_rate": 5.1127661806356865e-06, "loss": 0.1508, "mean_token_accuracy": 0.7559931499883532, "num_tokens": 46857216.0, "step": 5720 }, { "epoch": 1.6227918008992104, "grad_norm": 1.3906859159469604, "learning_rate": 5.102276303367251e-06, "loss": 0.1369, "mean_token_accuracy": 0.7666585128754377, "num_tokens": 46939136.0, "step": 5730 }, { "epoch": 1.6256239600665556, "grad_norm": 1.1301428079605103, "learning_rate": 5.091786426098815e-06, "loss": 0.1258, "mean_token_accuracy": 0.7873776897788047, "num_tokens": 47021056.0, "step": 5740 }, { "epoch": 1.6284561192339009, "grad_norm": 1.4063740968704224, "learning_rate": 5.081296548830379e-06, "loss": 0.1379, "mean_token_accuracy": 0.7387475535273552, "num_tokens": 47102976.0, "step": 5750 }, { "epoch": 1.6312882784012461, "grad_norm": 1.215005874633789, "learning_rate": 5.070806671561943e-06, "loss": 0.1831, "mean_token_accuracy": 0.7394324894994497, "num_tokens": 47184896.0, "step": 5760 }, { "epoch": 1.6341204375685914, "grad_norm": 1.3542537689208984, "learning_rate": 5.060316794293507e-06, "loss": 0.1494, "mean_token_accuracy": 0.7688111521303653, "num_tokens": 47266816.0, "step": 5770 }, { "epoch": 1.6369525967359366, "grad_norm": 1.3587734699249268, "learning_rate": 5.049826917025071e-06, "loss": 0.1153, "mean_token_accuracy": 0.7815802372992039, "num_tokens": 47348736.0, "step": 5780 }, { "epoch": 1.6397847559032819, "grad_norm": 1.0946316719055176, "learning_rate": 5.0393370397566354e-06, "loss": 0.1269, "mean_token_accuracy": 0.786545991152525, "num_tokens": 47430656.0, "step": 5790 }, { "epoch": 1.6426169150706271, "grad_norm": 1.2802456617355347, "learning_rate": 5.0288471624882e-06, "loss": 0.1466, "mean_token_accuracy": 0.7504403118044138, "num_tokens": 47512576.0, "step": 5800 }, { "epoch": 1.6454490742379722, "grad_norm": 1.2711937427520752, "learning_rate": 5.018357285219764e-06, "loss": 0.1353, "mean_token_accuracy": 0.7765166345983744, "num_tokens": 47594496.0, "step": 5810 }, { "epoch": 1.6482812334053174, "grad_norm": 1.4547213315963745, "learning_rate": 5.007867407951328e-06, "loss": 0.1488, "mean_token_accuracy": 0.7489726033061743, "num_tokens": 47676416.0, "step": 5820 }, { "epoch": 1.6511133925726624, "grad_norm": 1.2024240493774414, "learning_rate": 4.997377530682892e-06, "loss": 0.1423, "mean_token_accuracy": 0.7631237763911486, "num_tokens": 47758336.0, "step": 5830 }, { "epoch": 1.6539455517400077, "grad_norm": 1.295454740524292, "learning_rate": 4.986887653414455e-06, "loss": 0.1336, "mean_token_accuracy": 0.7587573397904634, "num_tokens": 47840256.0, "step": 5840 }, { "epoch": 1.656777710907353, "grad_norm": 1.1817426681518555, "learning_rate": 4.97639777614602e-06, "loss": 0.1566, "mean_token_accuracy": 0.758659491315484, "num_tokens": 47922176.0, "step": 5850 }, { "epoch": 1.6596098700746982, "grad_norm": 1.86770761013031, "learning_rate": 4.9659078988775835e-06, "loss": 0.1462, "mean_token_accuracy": 0.7527764152735472, "num_tokens": 48004096.0, "step": 5860 }, { "epoch": 1.6624420292420434, "grad_norm": 1.2514455318450928, "learning_rate": 4.955418021609147e-06, "loss": 0.1657, "mean_token_accuracy": 0.7383561626076698, "num_tokens": 48086016.0, "step": 5870 }, { "epoch": 1.6652741884093887, "grad_norm": 1.2668324708938599, "learning_rate": 4.944928144340712e-06, "loss": 0.1415, "mean_token_accuracy": 0.7599681988358498, "num_tokens": 48167936.0, "step": 5880 }, { "epoch": 1.668106347576734, "grad_norm": 1.3593950271606445, "learning_rate": 4.934438267072275e-06, "loss": 0.1151, "mean_token_accuracy": 0.7792196657508612, "num_tokens": 48249856.0, "step": 5890 }, { "epoch": 1.6709385067440792, "grad_norm": 1.359831690788269, "learning_rate": 4.92394838980384e-06, "loss": 0.155, "mean_token_accuracy": 0.7778253436088562, "num_tokens": 48331776.0, "step": 5900 }, { "epoch": 1.6737706659114242, "grad_norm": 1.2708696126937866, "learning_rate": 4.913458512535404e-06, "loss": 0.1266, "mean_token_accuracy": 0.7753913901746273, "num_tokens": 48413696.0, "step": 5910 }, { "epoch": 1.6766028250787695, "grad_norm": 1.0099893808364868, "learning_rate": 4.902968635266967e-06, "loss": 0.1264, "mean_token_accuracy": 0.7803204517811537, "num_tokens": 48495616.0, "step": 5920 }, { "epoch": 1.6794349842461145, "grad_norm": 1.1538772583007812, "learning_rate": 4.892478757998532e-06, "loss": 0.1739, "mean_token_accuracy": 0.742918298766017, "num_tokens": 48577536.0, "step": 5930 }, { "epoch": 1.6822671434134597, "grad_norm": 1.347820520401001, "learning_rate": 4.881988880730096e-06, "loss": 0.16, "mean_token_accuracy": 0.7535591971129179, "num_tokens": 48659456.0, "step": 5940 }, { "epoch": 1.685099302580805, "grad_norm": 1.5099506378173828, "learning_rate": 4.87149900346166e-06, "loss": 0.1462, "mean_token_accuracy": 0.7506604705005884, "num_tokens": 48741376.0, "step": 5950 }, { "epoch": 1.6879314617481502, "grad_norm": 1.7748444080352783, "learning_rate": 4.861009126193224e-06, "loss": 0.1623, "mean_token_accuracy": 0.75568737834692, "num_tokens": 48823296.0, "step": 5960 }, { "epoch": 1.6907636209154955, "grad_norm": 1.3099833726882935, "learning_rate": 4.850519248924788e-06, "loss": 0.1371, "mean_token_accuracy": 0.7739481426775455, "num_tokens": 48905216.0, "step": 5970 }, { "epoch": 1.6935957800828407, "grad_norm": 1.0896867513656616, "learning_rate": 4.840029371656352e-06, "loss": 0.1472, "mean_token_accuracy": 0.7578889455646276, "num_tokens": 48987136.0, "step": 5980 }, { "epoch": 1.696427939250186, "grad_norm": 1.1280524730682373, "learning_rate": 4.829539494387916e-06, "loss": 0.1401, "mean_token_accuracy": 0.770523483864963, "num_tokens": 49069056.0, "step": 5990 }, { "epoch": 1.6992600984175312, "grad_norm": 1.3631882667541504, "learning_rate": 4.81904961711948e-06, "loss": 0.1449, "mean_token_accuracy": 0.7820083159953356, "num_tokens": 49150976.0, "step": 6000 }, { "epoch": 1.7020922575848763, "grad_norm": 1.32793128490448, "learning_rate": 4.808559739851044e-06, "loss": 0.1307, "mean_token_accuracy": 0.7666218191385269, "num_tokens": 49232896.0, "step": 6010 }, { "epoch": 1.7049244167522215, "grad_norm": 1.7551764249801636, "learning_rate": 4.798069862582608e-06, "loss": 0.1592, "mean_token_accuracy": 0.7585494127124548, "num_tokens": 49314816.0, "step": 6020 }, { "epoch": 1.7077565759195665, "grad_norm": 1.4971647262573242, "learning_rate": 4.787579985314172e-06, "loss": 0.137, "mean_token_accuracy": 0.7634050861001015, "num_tokens": 49396736.0, "step": 6030 }, { "epoch": 1.7105887350869118, "grad_norm": 1.153831124305725, "learning_rate": 4.777090108045736e-06, "loss": 0.1369, "mean_token_accuracy": 0.7616193741559982, "num_tokens": 49478656.0, "step": 6040 }, { "epoch": 1.713420894254257, "grad_norm": 1.137524962425232, "learning_rate": 4.7666002307773005e-06, "loss": 0.1487, "mean_token_accuracy": 0.7590998068451882, "num_tokens": 49560576.0, "step": 6050 }, { "epoch": 1.7162530534216023, "grad_norm": 1.3998011350631714, "learning_rate": 4.756110353508864e-06, "loss": 0.1277, "mean_token_accuracy": 0.7648116413503885, "num_tokens": 49642496.0, "step": 6060 }, { "epoch": 1.7190852125889475, "grad_norm": 1.186885952949524, "learning_rate": 4.745620476240429e-06, "loss": 0.1366, "mean_token_accuracy": 0.7636741679161787, "num_tokens": 49724416.0, "step": 6070 }, { "epoch": 1.7219173717562928, "grad_norm": 1.1694114208221436, "learning_rate": 4.735130598971992e-06, "loss": 0.1252, "mean_token_accuracy": 0.7741927593946457, "num_tokens": 49806336.0, "step": 6080 }, { "epoch": 1.724749530923638, "grad_norm": 1.310731053352356, "learning_rate": 4.724640721703557e-06, "loss": 0.1436, "mean_token_accuracy": 0.7724681984633207, "num_tokens": 49888256.0, "step": 6090 }, { "epoch": 1.7275816900909833, "grad_norm": 2.1491355895996094, "learning_rate": 4.7141508444351205e-06, "loss": 0.1477, "mean_token_accuracy": 0.7605430491268634, "num_tokens": 49970176.0, "step": 6100 }, { "epoch": 1.7304138492583283, "grad_norm": 1.3095204830169678, "learning_rate": 4.703660967166684e-06, "loss": 0.1354, "mean_token_accuracy": 0.7659368887543678, "num_tokens": 50052096.0, "step": 6110 }, { "epoch": 1.7332460084256736, "grad_norm": 1.7687945365905762, "learning_rate": 4.693171089898249e-06, "loss": 0.1361, "mean_token_accuracy": 0.7774706445634365, "num_tokens": 50134016.0, "step": 6120 }, { "epoch": 1.7360781675930186, "grad_norm": 1.420762062072754, "learning_rate": 4.682681212629812e-06, "loss": 0.1424, "mean_token_accuracy": 0.757228472083807, "num_tokens": 50215936.0, "step": 6130 }, { "epoch": 1.7389103267603638, "grad_norm": 1.2249056100845337, "learning_rate": 4.672191335361377e-06, "loss": 0.1365, "mean_token_accuracy": 0.767049903050065, "num_tokens": 50297856.0, "step": 6140 }, { "epoch": 1.741742485927709, "grad_norm": 1.0154017210006714, "learning_rate": 4.661701458092941e-06, "loss": 0.1388, "mean_token_accuracy": 0.7649706471711397, "num_tokens": 50379776.0, "step": 6150 }, { "epoch": 1.7445746450950543, "grad_norm": 1.2950698137283325, "learning_rate": 4.651211580824504e-06, "loss": 0.1711, "mean_token_accuracy": 0.7326320927590132, "num_tokens": 50461696.0, "step": 6160 }, { "epoch": 1.7474068042623996, "grad_norm": 1.328643798828125, "learning_rate": 4.640721703556069e-06, "loss": 0.1493, "mean_token_accuracy": 0.7542196661233902, "num_tokens": 50543616.0, "step": 6170 }, { "epoch": 1.7502389634297448, "grad_norm": 1.2442131042480469, "learning_rate": 4.630231826287633e-06, "loss": 0.139, "mean_token_accuracy": 0.7598825871944428, "num_tokens": 50625536.0, "step": 6180 }, { "epoch": 1.75307112259709, "grad_norm": 1.5204259157180786, "learning_rate": 4.619741949019197e-06, "loss": 0.1361, "mean_token_accuracy": 0.7618517607450486, "num_tokens": 50707456.0, "step": 6190 }, { "epoch": 1.755903281764435, "grad_norm": 0.8497937321662903, "learning_rate": 4.609252071750761e-06, "loss": 0.1284, "mean_token_accuracy": 0.7812622345983982, "num_tokens": 50789376.0, "step": 6200 }, { "epoch": 1.7587354409317804, "grad_norm": 1.1050366163253784, "learning_rate": 4.598762194482325e-06, "loss": 0.1308, "mean_token_accuracy": 0.7537304311990738, "num_tokens": 50871296.0, "step": 6210 }, { "epoch": 1.7615676000991256, "grad_norm": 1.61236572265625, "learning_rate": 4.5882723172138886e-06, "loss": 0.1252, "mean_token_accuracy": 0.766181505843997, "num_tokens": 50953216.0, "step": 6220 }, { "epoch": 1.7643997592664706, "grad_norm": 1.2308703660964966, "learning_rate": 4.577782439945453e-06, "loss": 0.1459, "mean_token_accuracy": 0.7547455996274948, "num_tokens": 51035136.0, "step": 6230 }, { "epoch": 1.7672319184338159, "grad_norm": 1.1633412837982178, "learning_rate": 4.567292562677017e-06, "loss": 0.1381, "mean_token_accuracy": 0.7818003922700882, "num_tokens": 51117056.0, "step": 6240 }, { "epoch": 1.7700640776011611, "grad_norm": 1.1778638362884521, "learning_rate": 4.556802685408581e-06, "loss": 0.1446, "mean_token_accuracy": 0.7586961854249239, "num_tokens": 51198976.0, "step": 6250 }, { "epoch": 1.7728962367685064, "grad_norm": 1.3454240560531616, "learning_rate": 4.546312808140146e-06, "loss": 0.1745, "mean_token_accuracy": 0.7514432486146688, "num_tokens": 51280896.0, "step": 6260 }, { "epoch": 1.7757283959358516, "grad_norm": 1.7052457332611084, "learning_rate": 4.535822930871709e-06, "loss": 0.1511, "mean_token_accuracy": 0.7420376721769572, "num_tokens": 51362816.0, "step": 6270 }, { "epoch": 1.7785605551031969, "grad_norm": 1.7256077527999878, "learning_rate": 4.525333053603273e-06, "loss": 0.163, "mean_token_accuracy": 0.7409980431199074, "num_tokens": 51444736.0, "step": 6280 }, { "epoch": 1.7813927142705421, "grad_norm": 1.2285022735595703, "learning_rate": 4.5148431763348375e-06, "loss": 0.1499, "mean_token_accuracy": 0.740044030174613, "num_tokens": 51526656.0, "step": 6290 }, { "epoch": 1.7842248734378872, "grad_norm": 1.3115322589874268, "learning_rate": 4.504353299066401e-06, "loss": 0.1343, "mean_token_accuracy": 0.7759050875902176, "num_tokens": 51608576.0, "step": 6300 }, { "epoch": 1.7870570326052324, "grad_norm": 1.179888367652893, "learning_rate": 4.493863421797966e-06, "loss": 0.1357, "mean_token_accuracy": 0.7581090994179249, "num_tokens": 51690496.0, "step": 6310 }, { "epoch": 1.7898891917725777, "grad_norm": 1.0342109203338623, "learning_rate": 4.483373544529529e-06, "loss": 0.1601, "mean_token_accuracy": 0.7542074400931597, "num_tokens": 51772416.0, "step": 6320 }, { "epoch": 1.7927213509399227, "grad_norm": 0.945540189743042, "learning_rate": 4.472883667261094e-06, "loss": 0.1168, "mean_token_accuracy": 0.768896771222353, "num_tokens": 51854336.0, "step": 6330 }, { "epoch": 1.795553510107268, "grad_norm": 1.3624308109283447, "learning_rate": 4.4623937899926575e-06, "loss": 0.134, "mean_token_accuracy": 0.7692147742956876, "num_tokens": 51936256.0, "step": 6340 }, { "epoch": 1.7983856692746132, "grad_norm": 1.515984296798706, "learning_rate": 4.451903912724221e-06, "loss": 0.1229, "mean_token_accuracy": 0.7707925647497177, "num_tokens": 52018176.0, "step": 6350 }, { "epoch": 1.8012178284419584, "grad_norm": 1.5331177711486816, "learning_rate": 4.441414035455786e-06, "loss": 0.1212, "mean_token_accuracy": 0.7510885518044234, "num_tokens": 52100096.0, "step": 6360 }, { "epoch": 1.8040499876093037, "grad_norm": 1.2075921297073364, "learning_rate": 4.430924158187349e-06, "loss": 0.1368, "mean_token_accuracy": 0.7677226033061743, "num_tokens": 52182016.0, "step": 6370 }, { "epoch": 1.806882146776649, "grad_norm": 1.0744962692260742, "learning_rate": 4.420434280918914e-06, "loss": 0.1346, "mean_token_accuracy": 0.7766756396740675, "num_tokens": 52263936.0, "step": 6380 }, { "epoch": 1.8097143059439942, "grad_norm": 1.2855663299560547, "learning_rate": 4.409944403650477e-06, "loss": 0.1473, "mean_token_accuracy": 0.7603473570197821, "num_tokens": 52345856.0, "step": 6390 }, { "epoch": 1.8125464651113392, "grad_norm": 1.4224014282226562, "learning_rate": 4.399454526382042e-06, "loss": 0.1531, "mean_token_accuracy": 0.7285836603492498, "num_tokens": 52427776.0, "step": 6400 }, { "epoch": 1.8153786242786845, "grad_norm": 0.9965189695358276, "learning_rate": 4.3889646491136056e-06, "loss": 0.1343, "mean_token_accuracy": 0.7847725041210651, "num_tokens": 52509696.0, "step": 6410 }, { "epoch": 1.8182107834460297, "grad_norm": 1.1762076616287231, "learning_rate": 4.37847477184517e-06, "loss": 0.1233, "mean_token_accuracy": 0.7741682980209589, "num_tokens": 52591616.0, "step": 6420 }, { "epoch": 1.8210429426133747, "grad_norm": 1.0796014070510864, "learning_rate": 4.367984894576734e-06, "loss": 0.1316, "mean_token_accuracy": 0.7858243614435196, "num_tokens": 52673536.0, "step": 6430 }, { "epoch": 1.82387510178072, "grad_norm": 1.0122427940368652, "learning_rate": 4.357495017308298e-06, "loss": 0.1419, "mean_token_accuracy": 0.7776174165308476, "num_tokens": 52755456.0, "step": 6440 }, { "epoch": 1.8267072609480652, "grad_norm": 1.4302805662155151, "learning_rate": 4.347005140039862e-06, "loss": 0.1383, "mean_token_accuracy": 0.7606042064726353, "num_tokens": 52837376.0, "step": 6450 }, { "epoch": 1.8295394201154105, "grad_norm": 1.604728102684021, "learning_rate": 4.3365152627714255e-06, "loss": 0.1399, "mean_token_accuracy": 0.768982382491231, "num_tokens": 52919296.0, "step": 6460 }, { "epoch": 1.8323715792827557, "grad_norm": 1.041070818901062, "learning_rate": 4.32602538550299e-06, "loss": 0.1486, "mean_token_accuracy": 0.7785225037485362, "num_tokens": 53001216.0, "step": 6470 }, { "epoch": 1.835203738450101, "grad_norm": 1.6519886255264282, "learning_rate": 4.315535508234554e-06, "loss": 0.15, "mean_token_accuracy": 0.7633683957159519, "num_tokens": 53083136.0, "step": 6480 }, { "epoch": 1.8380358976174462, "grad_norm": 1.0633420944213867, "learning_rate": 4.305045630966118e-06, "loss": 0.161, "mean_token_accuracy": 0.7400684945285321, "num_tokens": 53165056.0, "step": 6490 }, { "epoch": 1.8408680567847913, "grad_norm": 1.340437889099121, "learning_rate": 4.294555753697683e-06, "loss": 0.1445, "mean_token_accuracy": 0.7808708392083645, "num_tokens": 53246976.0, "step": 6500 }, { "epoch": 1.8437002159521365, "grad_norm": 1.051612377166748, "learning_rate": 4.2840658764292455e-06, "loss": 0.1379, "mean_token_accuracy": 0.7578767087310553, "num_tokens": 53328896.0, "step": 6510 }, { "epoch": 1.8465323751194818, "grad_norm": 1.1848019361495972, "learning_rate": 4.27357599916081e-06, "loss": 0.1477, "mean_token_accuracy": 0.7733732894062996, "num_tokens": 53410816.0, "step": 6520 }, { "epoch": 1.8493645342868268, "grad_norm": 1.636946678161621, "learning_rate": 4.2630861218923745e-06, "loss": 0.1106, "mean_token_accuracy": 0.789028862118721, "num_tokens": 53492736.0, "step": 6530 }, { "epoch": 1.852196693454172, "grad_norm": 1.2405682802200317, "learning_rate": 4.252596244623938e-06, "loss": 0.1462, "mean_token_accuracy": 0.7626467708498239, "num_tokens": 53574656.0, "step": 6540 }, { "epoch": 1.8550288526215173, "grad_norm": 1.2553350925445557, "learning_rate": 4.242106367355503e-06, "loss": 0.1347, "mean_token_accuracy": 0.7882705483585596, "num_tokens": 53656576.0, "step": 6550 }, { "epoch": 1.8578610117888625, "grad_norm": 1.2830207347869873, "learning_rate": 4.231616490087066e-06, "loss": 0.1479, "mean_token_accuracy": 0.7792685911059379, "num_tokens": 53738496.0, "step": 6560 }, { "epoch": 1.8606931709562078, "grad_norm": 1.353459358215332, "learning_rate": 4.22112661281863e-06, "loss": 0.153, "mean_token_accuracy": 0.7514677092432975, "num_tokens": 53820416.0, "step": 6570 }, { "epoch": 1.863525330123553, "grad_norm": 1.3550572395324707, "learning_rate": 4.210636735550194e-06, "loss": 0.1357, "mean_token_accuracy": 0.7496453028172254, "num_tokens": 53902336.0, "step": 6580 }, { "epoch": 1.8663574892908983, "grad_norm": 1.4738825559616089, "learning_rate": 4.200146858281758e-06, "loss": 0.1401, "mean_token_accuracy": 0.7623043045401573, "num_tokens": 53984256.0, "step": 6590 }, { "epoch": 1.8691896484582433, "grad_norm": 0.9880483150482178, "learning_rate": 4.1896569810133226e-06, "loss": 0.1352, "mean_token_accuracy": 0.7831947140395641, "num_tokens": 54066176.0, "step": 6600 }, { "epoch": 1.8720218076255886, "grad_norm": 1.3302457332611084, "learning_rate": 4.179167103744887e-06, "loss": 0.1479, "mean_token_accuracy": 0.7596501972526312, "num_tokens": 54148096.0, "step": 6610 }, { "epoch": 1.8748539667929338, "grad_norm": 1.125423550605774, "learning_rate": 4.168677226476451e-06, "loss": 0.1507, "mean_token_accuracy": 0.7573018599301576, "num_tokens": 54230016.0, "step": 6620 }, { "epoch": 1.8776861259602788, "grad_norm": 1.5439138412475586, "learning_rate": 4.158187349208014e-06, "loss": 0.122, "mean_token_accuracy": 0.7678816065192222, "num_tokens": 54311936.0, "step": 6630 }, { "epoch": 1.880518285127624, "grad_norm": 1.3888694047927856, "learning_rate": 4.147697471939579e-06, "loss": 0.1352, "mean_token_accuracy": 0.7666829742491246, "num_tokens": 54393856.0, "step": 6640 }, { "epoch": 1.8833504442949693, "grad_norm": 1.2198402881622314, "learning_rate": 4.1372075946711425e-06, "loss": 0.1507, "mean_token_accuracy": 0.7729941289871931, "num_tokens": 54475776.0, "step": 6650 }, { "epoch": 1.8861826034623146, "grad_norm": 1.471026062965393, "learning_rate": 4.126717717402707e-06, "loss": 0.162, "mean_token_accuracy": 0.7677592966705561, "num_tokens": 54557696.0, "step": 6660 }, { "epoch": 1.8890147626296598, "grad_norm": 1.1513080596923828, "learning_rate": 4.116227840134271e-06, "loss": 0.1212, "mean_token_accuracy": 0.7760151669383049, "num_tokens": 54639616.0, "step": 6670 }, { "epoch": 1.891846921797005, "grad_norm": 1.2265801429748535, "learning_rate": 4.105737962865835e-06, "loss": 0.1377, "mean_token_accuracy": 0.7539138946682215, "num_tokens": 54721536.0, "step": 6680 }, { "epoch": 1.8946790809643503, "grad_norm": 1.4949698448181152, "learning_rate": 4.095248085597399e-06, "loss": 0.1375, "mean_token_accuracy": 0.7611423674970865, "num_tokens": 54803456.0, "step": 6690 }, { "epoch": 1.8975112401316954, "grad_norm": 1.4778004884719849, "learning_rate": 4.0847582083289625e-06, "loss": 0.1396, "mean_token_accuracy": 0.7655821930617094, "num_tokens": 54885376.0, "step": 6700 }, { "epoch": 1.9003433992990406, "grad_norm": 1.1195168495178223, "learning_rate": 4.074268331060527e-06, "loss": 0.1429, "mean_token_accuracy": 0.7397137958556413, "num_tokens": 54967296.0, "step": 6710 }, { "epoch": 1.9031755584663859, "grad_norm": 1.469353437423706, "learning_rate": 4.063778453792091e-06, "loss": 0.1364, "mean_token_accuracy": 0.7581702508032322, "num_tokens": 55049216.0, "step": 6720 }, { "epoch": 1.906007717633731, "grad_norm": 1.1045783758163452, "learning_rate": 4.053288576523655e-06, "loss": 0.1345, "mean_token_accuracy": 0.779806750640273, "num_tokens": 55131136.0, "step": 6730 }, { "epoch": 1.9088398768010761, "grad_norm": 1.467956781387329, "learning_rate": 4.04279869925522e-06, "loss": 0.1324, "mean_token_accuracy": 0.7604329735040665, "num_tokens": 55213056.0, "step": 6740 }, { "epoch": 1.9116720359684214, "grad_norm": 1.5500046014785767, "learning_rate": 4.032308821986783e-06, "loss": 0.1297, "mean_token_accuracy": 0.7708170246332884, "num_tokens": 55294976.0, "step": 6750 }, { "epoch": 1.9145041951357666, "grad_norm": 1.1060036420822144, "learning_rate": 4.021818944718347e-06, "loss": 0.1627, "mean_token_accuracy": 0.7569471649825573, "num_tokens": 55376896.0, "step": 6760 }, { "epoch": 1.917336354303112, "grad_norm": 0.980268657207489, "learning_rate": 4.011329067449911e-06, "loss": 0.1391, "mean_token_accuracy": 0.7578767161816359, "num_tokens": 55458816.0, "step": 6770 }, { "epoch": 1.9201685134704571, "grad_norm": 1.228349208831787, "learning_rate": 4.000839190181475e-06, "loss": 0.1337, "mean_token_accuracy": 0.7779843404889106, "num_tokens": 55540736.0, "step": 6780 }, { "epoch": 1.9230006726378024, "grad_norm": 1.593131184577942, "learning_rate": 3.9903493129130396e-06, "loss": 0.1245, "mean_token_accuracy": 0.7812133099883795, "num_tokens": 55622656.0, "step": 6790 }, { "epoch": 1.9258328318051474, "grad_norm": 1.2142200469970703, "learning_rate": 3.979859435644603e-06, "loss": 0.165, "mean_token_accuracy": 0.7408512722700834, "num_tokens": 55704576.0, "step": 6800 }, { "epoch": 1.9286649909724927, "grad_norm": 1.1077088117599487, "learning_rate": 3.969369558376167e-06, "loss": 0.1532, "mean_token_accuracy": 0.7530332658439874, "num_tokens": 55786496.0, "step": 6810 }, { "epoch": 1.9314971501398377, "grad_norm": 1.250959038734436, "learning_rate": 3.958879681107731e-06, "loss": 0.1474, "mean_token_accuracy": 0.7682363007217645, "num_tokens": 55868416.0, "step": 6820 }, { "epoch": 1.934329309307183, "grad_norm": 1.5984784364700317, "learning_rate": 3.948389803839295e-06, "loss": 0.1492, "mean_token_accuracy": 0.748727984726429, "num_tokens": 55950336.0, "step": 6830 }, { "epoch": 1.9371614684745282, "grad_norm": 1.3626652956008911, "learning_rate": 3.9378999265708595e-06, "loss": 0.1194, "mean_token_accuracy": 0.7751956906169652, "num_tokens": 56032256.0, "step": 6840 }, { "epoch": 1.9399936276418734, "grad_norm": 0.8952313661575317, "learning_rate": 3.927410049302424e-06, "loss": 0.1412, "mean_token_accuracy": 0.7478962812572718, "num_tokens": 56114176.0, "step": 6850 }, { "epoch": 1.9428257868092187, "grad_norm": 1.1956207752227783, "learning_rate": 3.916920172033988e-06, "loss": 0.1387, "mean_token_accuracy": 0.7808708418160677, "num_tokens": 56196096.0, "step": 6860 }, { "epoch": 1.945657945976564, "grad_norm": 1.451227068901062, "learning_rate": 3.906430294765551e-06, "loss": 0.1513, "mean_token_accuracy": 0.7566046971827746, "num_tokens": 56278016.0, "step": 6870 }, { "epoch": 1.9484901051439092, "grad_norm": 1.3304190635681152, "learning_rate": 3.895940417497116e-06, "loss": 0.1396, "mean_token_accuracy": 0.7749633073806763, "num_tokens": 56359936.0, "step": 6880 }, { "epoch": 1.9513222643112544, "grad_norm": 1.8782711029052734, "learning_rate": 3.8854505402286795e-06, "loss": 0.121, "mean_token_accuracy": 0.7934931486845016, "num_tokens": 56441856.0, "step": 6890 }, { "epoch": 1.9541544234785995, "grad_norm": 1.382428765296936, "learning_rate": 3.874960662960244e-06, "loss": 0.1436, "mean_token_accuracy": 0.7599070426076651, "num_tokens": 56523776.0, "step": 6900 }, { "epoch": 1.9569865826459447, "grad_norm": 1.2815287113189697, "learning_rate": 3.864470785691808e-06, "loss": 0.1534, "mean_token_accuracy": 0.7486423697322607, "num_tokens": 56605696.0, "step": 6910 }, { "epoch": 1.9598187418132897, "grad_norm": 1.132082223892212, "learning_rate": 3.853980908423372e-06, "loss": 0.1278, "mean_token_accuracy": 0.7660836562514305, "num_tokens": 56687616.0, "step": 6920 }, { "epoch": 1.962650900980635, "grad_norm": 1.0676507949829102, "learning_rate": 3.843491031154936e-06, "loss": 0.1275, "mean_token_accuracy": 0.7827788632363081, "num_tokens": 56769536.0, "step": 6930 }, { "epoch": 1.9654830601479802, "grad_norm": 1.2757993936538696, "learning_rate": 3.833001153886499e-06, "loss": 0.1342, "mean_token_accuracy": 0.7745107628405095, "num_tokens": 56851456.0, "step": 6940 }, { "epoch": 1.9683152193153255, "grad_norm": 1.2901620864868164, "learning_rate": 3.822511276618064e-06, "loss": 0.1526, "mean_token_accuracy": 0.7572896286845208, "num_tokens": 56933376.0, "step": 6950 }, { "epoch": 1.9711473784826707, "grad_norm": 1.3894282579421997, "learning_rate": 3.812021399349628e-06, "loss": 0.1451, "mean_token_accuracy": 0.7742539130151271, "num_tokens": 57015296.0, "step": 6960 }, { "epoch": 1.973979537650016, "grad_norm": 1.3360012769699097, "learning_rate": 3.801531522081192e-06, "loss": 0.1298, "mean_token_accuracy": 0.7658023487776517, "num_tokens": 57097216.0, "step": 6970 }, { "epoch": 1.9768116968173612, "grad_norm": 1.1383564472198486, "learning_rate": 3.791041644812756e-06, "loss": 0.1368, "mean_token_accuracy": 0.7686521537601948, "num_tokens": 57179136.0, "step": 6980 }, { "epoch": 1.9796438559847065, "grad_norm": 1.2149908542633057, "learning_rate": 3.78055176754432e-06, "loss": 0.1301, "mean_token_accuracy": 0.7609466712921857, "num_tokens": 57261056.0, "step": 6990 }, { "epoch": 1.9824760151520515, "grad_norm": 1.2677174806594849, "learning_rate": 3.770061890275884e-06, "loss": 0.1497, "mean_token_accuracy": 0.7816780813038349, "num_tokens": 57342976.0, "step": 7000 }, { "epoch": 1.9853081743193968, "grad_norm": 1.2141501903533936, "learning_rate": 3.759572013007448e-06, "loss": 0.1356, "mean_token_accuracy": 0.7747798431664705, "num_tokens": 57424896.0, "step": 7010 }, { "epoch": 1.9881403334867418, "grad_norm": 1.7239429950714111, "learning_rate": 3.7490821357390124e-06, "loss": 0.1708, "mean_token_accuracy": 0.7312010746449232, "num_tokens": 57506816.0, "step": 7020 }, { "epoch": 1.990972492654087, "grad_norm": 1.4694973230361938, "learning_rate": 3.7385922584705765e-06, "loss": 0.1418, "mean_token_accuracy": 0.7763209376484156, "num_tokens": 57588736.0, "step": 7030 }, { "epoch": 1.9938046518214323, "grad_norm": 1.2129154205322266, "learning_rate": 3.72810238120214e-06, "loss": 0.1644, "mean_token_accuracy": 0.7523361068218947, "num_tokens": 57670656.0, "step": 7040 }, { "epoch": 1.9966368109887775, "grad_norm": 1.0555603504180908, "learning_rate": 3.7176125039337042e-06, "loss": 0.1415, "mean_token_accuracy": 0.7746942289173603, "num_tokens": 57752576.0, "step": 7050 }, { "epoch": 1.9994689701561228, "grad_norm": 1.5597031116485596, "learning_rate": 3.7071226266652683e-06, "loss": 0.1409, "mean_token_accuracy": 0.7636497039347887, "num_tokens": 57834496.0, "step": 7060 }, { "epoch": 2.002265727333876, "grad_norm": 1.1237660646438599, "learning_rate": 3.6966327493968324e-06, "loss": 0.1065, "mean_token_accuracy": 0.7782952212080171, "num_tokens": 57915392.0, "step": 7070 }, { "epoch": 2.0050978865012214, "grad_norm": 1.2385786771774292, "learning_rate": 3.6861428721283965e-06, "loss": 0.11, "mean_token_accuracy": 0.7639187891036272, "num_tokens": 57997312.0, "step": 7080 }, { "epoch": 2.0079300456685667, "grad_norm": 1.055507779121399, "learning_rate": 3.6756529948599605e-06, "loss": 0.113, "mean_token_accuracy": 0.7746575351804494, "num_tokens": 58079232.0, "step": 7090 }, { "epoch": 2.010762204835912, "grad_norm": 1.1237515211105347, "learning_rate": 3.665163117591524e-06, "loss": 0.0966, "mean_token_accuracy": 0.781531311571598, "num_tokens": 58161152.0, "step": 7100 }, { "epoch": 2.013594364003257, "grad_norm": 1.7721480131149292, "learning_rate": 3.6546732403230883e-06, "loss": 0.1252, "mean_token_accuracy": 0.761117908358574, "num_tokens": 58243072.0, "step": 7110 }, { "epoch": 2.016426523170602, "grad_norm": 1.310492992401123, "learning_rate": 3.6441833630546523e-06, "loss": 0.1261, "mean_token_accuracy": 0.7766267094761133, "num_tokens": 58324992.0, "step": 7120 }, { "epoch": 2.0192586823379473, "grad_norm": 1.3270450830459595, "learning_rate": 3.6336934857862164e-06, "loss": 0.1702, "mean_token_accuracy": 0.729219663143158, "num_tokens": 58406912.0, "step": 7130 }, { "epoch": 2.0220908415052925, "grad_norm": 1.4443167448043823, "learning_rate": 3.623203608517781e-06, "loss": 0.1288, "mean_token_accuracy": 0.7712328769266605, "num_tokens": 58488832.0, "step": 7140 }, { "epoch": 2.0249230006726378, "grad_norm": 1.5470432043075562, "learning_rate": 3.612713731249345e-06, "loss": 0.1346, "mean_token_accuracy": 0.7625000022351742, "num_tokens": 58570752.0, "step": 7150 }, { "epoch": 2.027755159839983, "grad_norm": 1.260907530784607, "learning_rate": 3.6022238539809086e-06, "loss": 0.104, "mean_token_accuracy": 0.7806873768568039, "num_tokens": 58652672.0, "step": 7160 }, { "epoch": 2.0305873190073283, "grad_norm": 0.9440592527389526, "learning_rate": 3.5917339767124727e-06, "loss": 0.1294, "mean_token_accuracy": 0.75951565541327, "num_tokens": 58734592.0, "step": 7170 }, { "epoch": 2.0334194781746735, "grad_norm": 1.4341137409210205, "learning_rate": 3.581244099444037e-06, "loss": 0.1188, "mean_token_accuracy": 0.7584393329918384, "num_tokens": 58816512.0, "step": 7180 }, { "epoch": 2.0362516373420187, "grad_norm": 1.1970900297164917, "learning_rate": 3.570754222175601e-06, "loss": 0.0963, "mean_token_accuracy": 0.7981164366006851, "num_tokens": 58898432.0, "step": 7190 }, { "epoch": 2.039083796509364, "grad_norm": 1.7112830877304077, "learning_rate": 3.560264344907165e-06, "loss": 0.1528, "mean_token_accuracy": 0.7553693726658821, "num_tokens": 58980352.0, "step": 7200 }, { "epoch": 2.0419159556767092, "grad_norm": 1.4228155612945557, "learning_rate": 3.549774467638729e-06, "loss": 0.1225, "mean_token_accuracy": 0.7504280801862478, "num_tokens": 59062272.0, "step": 7210 }, { "epoch": 2.044748114844054, "grad_norm": 1.5878945589065552, "learning_rate": 3.5392845903702927e-06, "loss": 0.1153, "mean_token_accuracy": 0.7910225056111813, "num_tokens": 59144192.0, "step": 7220 }, { "epoch": 2.0475802740113993, "grad_norm": 1.1206371784210205, "learning_rate": 3.5287947131018567e-06, "loss": 0.1279, "mean_token_accuracy": 0.7498165342956782, "num_tokens": 59226112.0, "step": 7230 }, { "epoch": 2.0504124331787446, "grad_norm": 1.6621618270874023, "learning_rate": 3.518304835833421e-06, "loss": 0.1217, "mean_token_accuracy": 0.7606286689639091, "num_tokens": 59308032.0, "step": 7240 }, { "epoch": 2.05324459234609, "grad_norm": 1.387544870376587, "learning_rate": 3.507814958564985e-06, "loss": 0.1446, "mean_token_accuracy": 0.7522015646100044, "num_tokens": 59389952.0, "step": 7250 }, { "epoch": 2.056076751513435, "grad_norm": 1.303863525390625, "learning_rate": 3.4973250812965494e-06, "loss": 0.1362, "mean_token_accuracy": 0.7659858129918575, "num_tokens": 59471872.0, "step": 7260 }, { "epoch": 2.0589089106807803, "grad_norm": 1.3478611707687378, "learning_rate": 3.4868352040281135e-06, "loss": 0.1246, "mean_token_accuracy": 0.7809809193015098, "num_tokens": 59553792.0, "step": 7270 }, { "epoch": 2.0617410698481256, "grad_norm": 1.1989835500717163, "learning_rate": 3.476345326759677e-06, "loss": 0.1085, "mean_token_accuracy": 0.7756604697555304, "num_tokens": 59635712.0, "step": 7280 }, { "epoch": 2.064573229015471, "grad_norm": 1.1155551671981812, "learning_rate": 3.465855449491241e-06, "loss": 0.1089, "mean_token_accuracy": 0.7770547963678837, "num_tokens": 59717632.0, "step": 7290 }, { "epoch": 2.067405388182816, "grad_norm": 1.2903120517730713, "learning_rate": 3.4553655722228053e-06, "loss": 0.1234, "mean_token_accuracy": 0.7721501953899861, "num_tokens": 59799552.0, "step": 7300 }, { "epoch": 2.0702375473501613, "grad_norm": 1.6666812896728516, "learning_rate": 3.4448756949543694e-06, "loss": 0.1253, "mean_token_accuracy": 0.7631360098719597, "num_tokens": 59881472.0, "step": 7310 }, { "epoch": 2.073069706517506, "grad_norm": 1.2040691375732422, "learning_rate": 3.4343858176859334e-06, "loss": 0.1359, "mean_token_accuracy": 0.7778008833527565, "num_tokens": 59963392.0, "step": 7320 }, { "epoch": 2.0759018656848514, "grad_norm": 1.2499768733978271, "learning_rate": 3.4238959404174975e-06, "loss": 0.1113, "mean_token_accuracy": 0.7751345403492451, "num_tokens": 60045312.0, "step": 7330 }, { "epoch": 2.0787340248521966, "grad_norm": 0.9466302990913391, "learning_rate": 3.413406063149061e-06, "loss": 0.1402, "mean_token_accuracy": 0.7586472611874342, "num_tokens": 60127232.0, "step": 7340 }, { "epoch": 2.081566184019542, "grad_norm": 1.0079811811447144, "learning_rate": 3.4029161858806252e-06, "loss": 0.1033, "mean_token_accuracy": 0.7875611506402492, "num_tokens": 60209152.0, "step": 7350 }, { "epoch": 2.084398343186887, "grad_norm": 1.493399977684021, "learning_rate": 3.3924263086121893e-06, "loss": 0.1128, "mean_token_accuracy": 0.7791707415133715, "num_tokens": 60291072.0, "step": 7360 }, { "epoch": 2.0872305023542324, "grad_norm": 1.5899913311004639, "learning_rate": 3.381936431343754e-06, "loss": 0.154, "mean_token_accuracy": 0.7400317970663309, "num_tokens": 60372992.0, "step": 7370 }, { "epoch": 2.0900626615215776, "grad_norm": 1.5314220190048218, "learning_rate": 3.371446554075318e-06, "loss": 0.1232, "mean_token_accuracy": 0.7905455000698567, "num_tokens": 60454912.0, "step": 7380 }, { "epoch": 2.092894820688923, "grad_norm": 1.2721341848373413, "learning_rate": 3.360956676806882e-06, "loss": 0.1085, "mean_token_accuracy": 0.7729696653783321, "num_tokens": 60536832.0, "step": 7390 }, { "epoch": 2.095726979856268, "grad_norm": 1.1642765998840332, "learning_rate": 3.3504667995384456e-06, "loss": 0.1081, "mean_token_accuracy": 0.7889799427241087, "num_tokens": 60618752.0, "step": 7400 }, { "epoch": 2.098559139023613, "grad_norm": 1.3702284097671509, "learning_rate": 3.3399769222700097e-06, "loss": 0.1094, "mean_token_accuracy": 0.7912671204656363, "num_tokens": 60700672.0, "step": 7410 }, { "epoch": 2.101391298190958, "grad_norm": 1.2944626808166504, "learning_rate": 3.3294870450015738e-06, "loss": 0.1313, "mean_token_accuracy": 0.7547211341559887, "num_tokens": 60782592.0, "step": 7420 }, { "epoch": 2.1042234573583034, "grad_norm": 1.1357483863830566, "learning_rate": 3.318997167733138e-06, "loss": 0.1069, "mean_token_accuracy": 0.7714530322700739, "num_tokens": 60864512.0, "step": 7430 }, { "epoch": 2.1070556165256487, "grad_norm": 1.0823742151260376, "learning_rate": 3.308507290464702e-06, "loss": 0.1106, "mean_token_accuracy": 0.7675146773457527, "num_tokens": 60946432.0, "step": 7440 }, { "epoch": 2.109887775692994, "grad_norm": 1.2482222318649292, "learning_rate": 3.298017413196266e-06, "loss": 0.1122, "mean_token_accuracy": 0.7684442289173603, "num_tokens": 61028352.0, "step": 7450 }, { "epoch": 2.112719934860339, "grad_norm": 1.9791706800460815, "learning_rate": 3.2875275359278296e-06, "loss": 0.1222, "mean_token_accuracy": 0.7773728009313345, "num_tokens": 61110272.0, "step": 7460 }, { "epoch": 2.1155520940276844, "grad_norm": 1.8661694526672363, "learning_rate": 3.2770376586593937e-06, "loss": 0.1091, "mean_token_accuracy": 0.7745596896857023, "num_tokens": 61192192.0, "step": 7470 }, { "epoch": 2.1183842531950297, "grad_norm": 0.9961443543434143, "learning_rate": 3.2665477813909578e-06, "loss": 0.1212, "mean_token_accuracy": 0.7665973592549562, "num_tokens": 61274112.0, "step": 7480 }, { "epoch": 2.121216412362375, "grad_norm": 1.1755738258361816, "learning_rate": 3.2560579041225223e-06, "loss": 0.1216, "mean_token_accuracy": 0.7842465721070766, "num_tokens": 61356032.0, "step": 7490 }, { "epoch": 2.12404857152972, "grad_norm": 1.227830171585083, "learning_rate": 3.2455680268540864e-06, "loss": 0.1372, "mean_token_accuracy": 0.7647749528288841, "num_tokens": 61437952.0, "step": 7500 }, { "epoch": 2.1268807306970654, "grad_norm": 1.0966523885726929, "learning_rate": 3.2350781495856504e-06, "loss": 0.1087, "mean_token_accuracy": 0.7892367880791425, "num_tokens": 61519872.0, "step": 7510 }, { "epoch": 2.12971288986441, "grad_norm": 1.2551891803741455, "learning_rate": 3.224588272317214e-06, "loss": 0.1314, "mean_token_accuracy": 0.7765900179743767, "num_tokens": 61601792.0, "step": 7520 }, { "epoch": 2.1325450490317555, "grad_norm": 1.0475414991378784, "learning_rate": 3.214098395048778e-06, "loss": 0.1402, "mean_token_accuracy": 0.769019079208374, "num_tokens": 61683712.0, "step": 7530 }, { "epoch": 2.1353772081991007, "grad_norm": 1.271935224533081, "learning_rate": 3.2036085177803422e-06, "loss": 0.1228, "mean_token_accuracy": 0.7731042068451643, "num_tokens": 61765632.0, "step": 7540 }, { "epoch": 2.138209367366446, "grad_norm": 0.95322585105896, "learning_rate": 3.1931186405119063e-06, "loss": 0.1477, "mean_token_accuracy": 0.7742539115250111, "num_tokens": 61847552.0, "step": 7550 }, { "epoch": 2.141041526533791, "grad_norm": 1.5371066331863403, "learning_rate": 3.1826287632434704e-06, "loss": 0.1342, "mean_token_accuracy": 0.7704500969499349, "num_tokens": 61929472.0, "step": 7560 }, { "epoch": 2.1438736857011365, "grad_norm": 1.1497443914413452, "learning_rate": 3.1721388859750345e-06, "loss": 0.1218, "mean_token_accuracy": 0.7789872772991657, "num_tokens": 62011392.0, "step": 7570 }, { "epoch": 2.1467058448684817, "grad_norm": 1.277105689048767, "learning_rate": 3.161649008706598e-06, "loss": 0.1234, "mean_token_accuracy": 0.7584148742258549, "num_tokens": 62093312.0, "step": 7580 }, { "epoch": 2.149538004035827, "grad_norm": 1.6442451477050781, "learning_rate": 3.151159131438162e-06, "loss": 0.133, "mean_token_accuracy": 0.7566413894295693, "num_tokens": 62175232.0, "step": 7590 }, { "epoch": 2.152370163203172, "grad_norm": 2.0741281509399414, "learning_rate": 3.1406692541697263e-06, "loss": 0.1461, "mean_token_accuracy": 0.7669642839580775, "num_tokens": 62257152.0, "step": 7600 }, { "epoch": 2.155202322370517, "grad_norm": 1.4436347484588623, "learning_rate": 3.1301793769012908e-06, "loss": 0.1265, "mean_token_accuracy": 0.7654109600931406, "num_tokens": 62339072.0, "step": 7610 }, { "epoch": 2.1580344815378623, "grad_norm": 1.2718037366867065, "learning_rate": 3.119689499632855e-06, "loss": 0.1241, "mean_token_accuracy": 0.7671110529452563, "num_tokens": 62420992.0, "step": 7620 }, { "epoch": 2.1608666407052075, "grad_norm": 0.9997752904891968, "learning_rate": 3.109199622364419e-06, "loss": 0.1085, "mean_token_accuracy": 0.7868273008614779, "num_tokens": 62502912.0, "step": 7630 }, { "epoch": 2.1636987998725528, "grad_norm": 1.042555332183838, "learning_rate": 3.0987097450959826e-06, "loss": 0.1211, "mean_token_accuracy": 0.7586227994412184, "num_tokens": 62584832.0, "step": 7640 }, { "epoch": 2.166530959039898, "grad_norm": 1.1687451601028442, "learning_rate": 3.0882198678275466e-06, "loss": 0.1112, "mean_token_accuracy": 0.7717954970896244, "num_tokens": 62666752.0, "step": 7650 }, { "epoch": 2.1693631182072433, "grad_norm": 1.1718406677246094, "learning_rate": 3.0777299905591107e-06, "loss": 0.1104, "mean_token_accuracy": 0.7741560660302639, "num_tokens": 62748672.0, "step": 7660 }, { "epoch": 2.1721952773745885, "grad_norm": 1.01470148563385, "learning_rate": 3.0672401132906748e-06, "loss": 0.1273, "mean_token_accuracy": 0.7732387490570545, "num_tokens": 62830592.0, "step": 7670 }, { "epoch": 2.1750274365419338, "grad_norm": 1.2278531789779663, "learning_rate": 3.056750236022239e-06, "loss": 0.1351, "mean_token_accuracy": 0.7633439332246781, "num_tokens": 62912512.0, "step": 7680 }, { "epoch": 2.177859595709279, "grad_norm": 1.0352813005447388, "learning_rate": 3.046260358753803e-06, "loss": 0.1331, "mean_token_accuracy": 0.7671477481722832, "num_tokens": 62994432.0, "step": 7690 }, { "epoch": 2.1806917548766243, "grad_norm": 1.7667654752731323, "learning_rate": 3.0357704814853666e-06, "loss": 0.1137, "mean_token_accuracy": 0.7720034249126911, "num_tokens": 63076352.0, "step": 7700 }, { "epoch": 2.1835239140439695, "grad_norm": 1.1187483072280884, "learning_rate": 3.0252806042169307e-06, "loss": 0.1132, "mean_token_accuracy": 0.7777274932712317, "num_tokens": 63158272.0, "step": 7710 }, { "epoch": 2.1863560732113143, "grad_norm": 1.072558045387268, "learning_rate": 3.014790726948495e-06, "loss": 0.1159, "mean_token_accuracy": 0.7706457927823067, "num_tokens": 63240192.0, "step": 7720 }, { "epoch": 2.1891882323786596, "grad_norm": 1.152151346206665, "learning_rate": 3.0043008496800592e-06, "loss": 0.1277, "mean_token_accuracy": 0.7525562614202499, "num_tokens": 63322112.0, "step": 7730 }, { "epoch": 2.192020391546005, "grad_norm": 1.0852047204971313, "learning_rate": 2.9938109724116233e-06, "loss": 0.1016, "mean_token_accuracy": 0.7921232886612415, "num_tokens": 63404032.0, "step": 7740 }, { "epoch": 2.19485255071335, "grad_norm": 1.2948185205459595, "learning_rate": 2.983321095143187e-06, "loss": 0.1386, "mean_token_accuracy": 0.7624510746449232, "num_tokens": 63485952.0, "step": 7750 }, { "epoch": 2.1976847098806953, "grad_norm": 0.9734096527099609, "learning_rate": 2.972831217874751e-06, "loss": 0.1207, "mean_token_accuracy": 0.7791585147380828, "num_tokens": 63567872.0, "step": 7760 }, { "epoch": 2.2005168690480406, "grad_norm": 1.1331419944763184, "learning_rate": 2.962341340606315e-06, "loss": 0.11, "mean_token_accuracy": 0.7842588059604167, "num_tokens": 63649792.0, "step": 7770 }, { "epoch": 2.203349028215386, "grad_norm": 1.4239418506622314, "learning_rate": 2.951851463337879e-06, "loss": 0.111, "mean_token_accuracy": 0.7711839515715837, "num_tokens": 63731712.0, "step": 7780 }, { "epoch": 2.206181187382731, "grad_norm": 0.8610633015632629, "learning_rate": 2.9413615860694433e-06, "loss": 0.1222, "mean_token_accuracy": 0.7689946174621582, "num_tokens": 63813632.0, "step": 7790 }, { "epoch": 2.2090133465500763, "grad_norm": 1.6536685228347778, "learning_rate": 2.9308717088010073e-06, "loss": 0.1338, "mean_token_accuracy": 0.7779721118509769, "num_tokens": 63895552.0, "step": 7800 }, { "epoch": 2.211845505717421, "grad_norm": 1.4962118864059448, "learning_rate": 2.920381831532571e-06, "loss": 0.1228, "mean_token_accuracy": 0.7519080217927694, "num_tokens": 63977472.0, "step": 7810 }, { "epoch": 2.2146776648847664, "grad_norm": 1.2594947814941406, "learning_rate": 2.909891954264135e-06, "loss": 0.1291, "mean_token_accuracy": 0.7706824846565723, "num_tokens": 64059392.0, "step": 7820 }, { "epoch": 2.2175098240521116, "grad_norm": 1.313718557357788, "learning_rate": 2.899402076995699e-06, "loss": 0.1329, "mean_token_accuracy": 0.7718933440744877, "num_tokens": 64141312.0, "step": 7830 }, { "epoch": 2.220341983219457, "grad_norm": 1.3639971017837524, "learning_rate": 2.8889121997272636e-06, "loss": 0.1388, "mean_token_accuracy": 0.7530088070780039, "num_tokens": 64223232.0, "step": 7840 }, { "epoch": 2.223174142386802, "grad_norm": 1.2895123958587646, "learning_rate": 2.8784223224588277e-06, "loss": 0.1366, "mean_token_accuracy": 0.7582681007683277, "num_tokens": 64305152.0, "step": 7850 }, { "epoch": 2.2260063015541474, "grad_norm": 1.492924451828003, "learning_rate": 2.8679324451903918e-06, "loss": 0.1269, "mean_token_accuracy": 0.7615704495459795, "num_tokens": 64387072.0, "step": 7860 }, { "epoch": 2.2288384607214926, "grad_norm": 1.5129145383834839, "learning_rate": 2.8574425679219554e-06, "loss": 0.0955, "mean_token_accuracy": 0.7921232879161835, "num_tokens": 64468992.0, "step": 7870 }, { "epoch": 2.231670619888838, "grad_norm": 1.1694437265396118, "learning_rate": 2.8469526906535195e-06, "loss": 0.1098, "mean_token_accuracy": 0.7837451074272395, "num_tokens": 64550912.0, "step": 7880 }, { "epoch": 2.234502779056183, "grad_norm": 1.4399017095565796, "learning_rate": 2.8364628133850836e-06, "loss": 0.1484, "mean_token_accuracy": 0.7374999996274709, "num_tokens": 64632832.0, "step": 7890 }, { "epoch": 2.2373349382235284, "grad_norm": 1.6011334657669067, "learning_rate": 2.8259729361166477e-06, "loss": 0.0958, "mean_token_accuracy": 0.7819227010011673, "num_tokens": 64714752.0, "step": 7900 }, { "epoch": 2.240167097390873, "grad_norm": 1.7093037366867065, "learning_rate": 2.8154830588482117e-06, "loss": 0.1033, "mean_token_accuracy": 0.7958537172526121, "num_tokens": 64796672.0, "step": 7910 }, { "epoch": 2.2429992565582184, "grad_norm": 1.4303596019744873, "learning_rate": 2.804993181579776e-06, "loss": 0.1204, "mean_token_accuracy": 0.7735445212572813, "num_tokens": 64878592.0, "step": 7920 }, { "epoch": 2.2458314157255637, "grad_norm": 1.651093602180481, "learning_rate": 2.7945033043113395e-06, "loss": 0.1316, "mean_token_accuracy": 0.7649951051920653, "num_tokens": 64960512.0, "step": 7930 }, { "epoch": 2.248663574892909, "grad_norm": 0.8308888077735901, "learning_rate": 2.7840134270429035e-06, "loss": 0.1251, "mean_token_accuracy": 0.7785714261233807, "num_tokens": 65042432.0, "step": 7940 }, { "epoch": 2.251495734060254, "grad_norm": 1.3998208045959473, "learning_rate": 2.7735235497744676e-06, "loss": 0.1319, "mean_token_accuracy": 0.7520425610244275, "num_tokens": 65124352.0, "step": 7950 }, { "epoch": 2.2543278932275994, "grad_norm": 1.1849442720413208, "learning_rate": 2.763033672506032e-06, "loss": 0.1173, "mean_token_accuracy": 0.7766878642141819, "num_tokens": 65206272.0, "step": 7960 }, { "epoch": 2.2571600523949447, "grad_norm": 1.3609066009521484, "learning_rate": 2.752543795237596e-06, "loss": 0.1445, "mean_token_accuracy": 0.7534613508731127, "num_tokens": 65288192.0, "step": 7970 }, { "epoch": 2.25999221156229, "grad_norm": 1.4002113342285156, "learning_rate": 2.7420539179691603e-06, "loss": 0.1151, "mean_token_accuracy": 0.7851394332945347, "num_tokens": 65370112.0, "step": 7980 }, { "epoch": 2.262824370729635, "grad_norm": 1.3668949604034424, "learning_rate": 2.731564040700724e-06, "loss": 0.1318, "mean_token_accuracy": 0.7521281778812409, "num_tokens": 65452032.0, "step": 7990 }, { "epoch": 2.2656565298969804, "grad_norm": 1.7958375215530396, "learning_rate": 2.721074163432288e-06, "loss": 0.135, "mean_token_accuracy": 0.7572773959487676, "num_tokens": 65533952.0, "step": 8000 }, { "epoch": 2.268488689064325, "grad_norm": 1.2995786666870117, "learning_rate": 2.710584286163852e-06, "loss": 0.1242, "mean_token_accuracy": 0.7715875700116157, "num_tokens": 65615872.0, "step": 8010 }, { "epoch": 2.2713208482316705, "grad_norm": 1.426206111907959, "learning_rate": 2.700094408895416e-06, "loss": 0.1122, "mean_token_accuracy": 0.7696306265890598, "num_tokens": 65697792.0, "step": 8020 }, { "epoch": 2.2741530073990157, "grad_norm": 1.3701132535934448, "learning_rate": 2.6896045316269802e-06, "loss": 0.1183, "mean_token_accuracy": 0.7692147731781006, "num_tokens": 65779712.0, "step": 8030 }, { "epoch": 2.276985166566361, "grad_norm": 1.0385992527008057, "learning_rate": 2.6791146543585443e-06, "loss": 0.1173, "mean_token_accuracy": 0.7765044033527374, "num_tokens": 65861632.0, "step": 8040 }, { "epoch": 2.279817325733706, "grad_norm": 1.2461035251617432, "learning_rate": 2.668624777090108e-06, "loss": 0.124, "mean_token_accuracy": 0.7599804311990738, "num_tokens": 65943552.0, "step": 8050 }, { "epoch": 2.2826494849010515, "grad_norm": 1.4159597158432007, "learning_rate": 2.658134899821672e-06, "loss": 0.1013, "mean_token_accuracy": 0.7964652631431818, "num_tokens": 66025472.0, "step": 8060 }, { "epoch": 2.2854816440683967, "grad_norm": 1.4617013931274414, "learning_rate": 2.647645022553236e-06, "loss": 0.1248, "mean_token_accuracy": 0.7644324880093336, "num_tokens": 66107392.0, "step": 8070 }, { "epoch": 2.288313803235742, "grad_norm": 1.1074751615524292, "learning_rate": 2.6371551452848006e-06, "loss": 0.1061, "mean_token_accuracy": 0.7882093951106072, "num_tokens": 66189312.0, "step": 8080 }, { "epoch": 2.291145962403087, "grad_norm": 1.4659677743911743, "learning_rate": 2.6266652680163647e-06, "loss": 0.1232, "mean_token_accuracy": 0.7460004922002554, "num_tokens": 66271232.0, "step": 8090 }, { "epoch": 2.293978121570432, "grad_norm": 1.3314685821533203, "learning_rate": 2.6161753907479287e-06, "loss": 0.1087, "mean_token_accuracy": 0.7831947173923254, "num_tokens": 66353152.0, "step": 8100 }, { "epoch": 2.2968102807377777, "grad_norm": 1.7255728244781494, "learning_rate": 2.6056855134794924e-06, "loss": 0.1242, "mean_token_accuracy": 0.7608121309429408, "num_tokens": 66435072.0, "step": 8110 }, { "epoch": 2.2996424399051225, "grad_norm": 0.8937911987304688, "learning_rate": 2.5951956362110565e-06, "loss": 0.114, "mean_token_accuracy": 0.778534734621644, "num_tokens": 66516992.0, "step": 8120 }, { "epoch": 2.3024745990724678, "grad_norm": 1.47197687625885, "learning_rate": 2.5847057589426205e-06, "loss": 0.142, "mean_token_accuracy": 0.7618028372526169, "num_tokens": 66598912.0, "step": 8130 }, { "epoch": 2.305306758239813, "grad_norm": 1.488588809967041, "learning_rate": 2.5742158816741846e-06, "loss": 0.1352, "mean_token_accuracy": 0.7768468666821718, "num_tokens": 66680832.0, "step": 8140 }, { "epoch": 2.3081389174071583, "grad_norm": 1.2545872926712036, "learning_rate": 2.5637260044057487e-06, "loss": 0.1156, "mean_token_accuracy": 0.7812866933643818, "num_tokens": 66762752.0, "step": 8150 }, { "epoch": 2.3109710765745035, "grad_norm": 0.91876620054245, "learning_rate": 2.5532361271373128e-06, "loss": 0.1133, "mean_token_accuracy": 0.7933219138532877, "num_tokens": 66844672.0, "step": 8160 }, { "epoch": 2.3138032357418488, "grad_norm": 1.1803959608078003, "learning_rate": 2.5427462498688764e-06, "loss": 0.1485, "mean_token_accuracy": 0.7601761255413294, "num_tokens": 66926592.0, "step": 8170 }, { "epoch": 2.316635394909194, "grad_norm": 1.4414998292922974, "learning_rate": 2.5322563726004405e-06, "loss": 0.1219, "mean_token_accuracy": 0.7790484368801117, "num_tokens": 67008512.0, "step": 8180 }, { "epoch": 2.3194675540765393, "grad_norm": 1.4225188493728638, "learning_rate": 2.521766495332005e-06, "loss": 0.1297, "mean_token_accuracy": 0.7588062636554241, "num_tokens": 67090432.0, "step": 8190 }, { "epoch": 2.3222997132438845, "grad_norm": 1.4619642496109009, "learning_rate": 2.511276618063569e-06, "loss": 0.1431, "mean_token_accuracy": 0.7619129199534654, "num_tokens": 67172352.0, "step": 8200 }, { "epoch": 2.3251318724112293, "grad_norm": 1.2214300632476807, "learning_rate": 2.500786740795133e-06, "loss": 0.1031, "mean_token_accuracy": 0.7702544037252664, "num_tokens": 67254272.0, "step": 8210 }, { "epoch": 2.3279640315785746, "grad_norm": 1.1540178060531616, "learning_rate": 2.490296863526697e-06, "loss": 0.1149, "mean_token_accuracy": 0.7846746612340212, "num_tokens": 67336192.0, "step": 8220 }, { "epoch": 2.33079619074592, "grad_norm": 0.9820622205734253, "learning_rate": 2.479806986258261e-06, "loss": 0.117, "mean_token_accuracy": 0.7940557729452848, "num_tokens": 67418112.0, "step": 8230 }, { "epoch": 2.333628349913265, "grad_norm": 1.272473692893982, "learning_rate": 2.469317108989825e-06, "loss": 0.1365, "mean_token_accuracy": 0.7350905116647481, "num_tokens": 67500032.0, "step": 8240 }, { "epoch": 2.3364605090806103, "grad_norm": 1.4759598970413208, "learning_rate": 2.458827231721389e-06, "loss": 0.123, "mean_token_accuracy": 0.7754525396972894, "num_tokens": 67581952.0, "step": 8250 }, { "epoch": 2.3392926682479556, "grad_norm": 1.1790000200271606, "learning_rate": 2.448337354452953e-06, "loss": 0.1213, "mean_token_accuracy": 0.7618272997438907, "num_tokens": 67663872.0, "step": 8260 }, { "epoch": 2.342124827415301, "grad_norm": 1.3157241344451904, "learning_rate": 2.437847477184517e-06, "loss": 0.1161, "mean_token_accuracy": 0.7648727998137475, "num_tokens": 67745792.0, "step": 8270 }, { "epoch": 2.344956986582646, "grad_norm": 1.382219672203064, "learning_rate": 2.4273575999160812e-06, "loss": 0.1284, "mean_token_accuracy": 0.7651418801397085, "num_tokens": 67827712.0, "step": 8280 }, { "epoch": 2.3477891457499913, "grad_norm": 1.2287477254867554, "learning_rate": 2.4168677226476453e-06, "loss": 0.1136, "mean_token_accuracy": 0.7735322885215282, "num_tokens": 67909632.0, "step": 8290 }, { "epoch": 2.350621304917336, "grad_norm": 1.2429749965667725, "learning_rate": 2.406377845379209e-06, "loss": 0.1211, "mean_token_accuracy": 0.7679305296391249, "num_tokens": 67991552.0, "step": 8300 }, { "epoch": 2.3534534640846814, "grad_norm": 1.40024733543396, "learning_rate": 2.3958879681107735e-06, "loss": 0.1385, "mean_token_accuracy": 0.7510029342025518, "num_tokens": 68073472.0, "step": 8310 }, { "epoch": 2.3562856232520266, "grad_norm": 1.351151466369629, "learning_rate": 2.3853980908423375e-06, "loss": 0.1343, "mean_token_accuracy": 0.7658145766705274, "num_tokens": 68155392.0, "step": 8320 }, { "epoch": 2.359117782419372, "grad_norm": 1.0294612646102905, "learning_rate": 2.374908213573901e-06, "loss": 0.1433, "mean_token_accuracy": 0.767025439441204, "num_tokens": 68237312.0, "step": 8330 }, { "epoch": 2.361949941586717, "grad_norm": 1.4289076328277588, "learning_rate": 2.3644183363054653e-06, "loss": 0.113, "mean_token_accuracy": 0.7760763227939605, "num_tokens": 68319232.0, "step": 8340 }, { "epoch": 2.3647821007540624, "grad_norm": 1.1143102645874023, "learning_rate": 2.3539284590370293e-06, "loss": 0.0943, "mean_token_accuracy": 0.7843566548079253, "num_tokens": 68401152.0, "step": 8350 }, { "epoch": 2.3676142599214076, "grad_norm": 2.267212390899658, "learning_rate": 2.3434385817685934e-06, "loss": 0.1186, "mean_token_accuracy": 0.7689334638416767, "num_tokens": 68483072.0, "step": 8360 }, { "epoch": 2.370446419088753, "grad_norm": 1.6281898021697998, "learning_rate": 2.3329487045001575e-06, "loss": 0.105, "mean_token_accuracy": 0.7923312105238438, "num_tokens": 68564992.0, "step": 8370 }, { "epoch": 2.373278578256098, "grad_norm": 1.3597502708435059, "learning_rate": 2.3224588272317216e-06, "loss": 0.1444, "mean_token_accuracy": 0.7395670253783464, "num_tokens": 68646912.0, "step": 8380 }, { "epoch": 2.3761107374234434, "grad_norm": 1.215596079826355, "learning_rate": 2.3119689499632856e-06, "loss": 0.1217, "mean_token_accuracy": 0.7679305259138346, "num_tokens": 68728832.0, "step": 8390 }, { "epoch": 2.3789428965907886, "grad_norm": 1.2109073400497437, "learning_rate": 2.3014790726948497e-06, "loss": 0.132, "mean_token_accuracy": 0.758402644097805, "num_tokens": 68810752.0, "step": 8400 }, { "epoch": 2.3817750557581334, "grad_norm": 1.3109997510910034, "learning_rate": 2.290989195426414e-06, "loss": 0.147, "mean_token_accuracy": 0.7524706453084946, "num_tokens": 68892672.0, "step": 8410 }, { "epoch": 2.3846072149254787, "grad_norm": 0.8913787007331848, "learning_rate": 2.2804993181579774e-06, "loss": 0.1445, "mean_token_accuracy": 0.7826687820255757, "num_tokens": 68974592.0, "step": 8420 }, { "epoch": 2.387439374092824, "grad_norm": 1.5869669914245605, "learning_rate": 2.270009440889542e-06, "loss": 0.1088, "mean_token_accuracy": 0.7752446208149195, "num_tokens": 69056512.0, "step": 8430 }, { "epoch": 2.390271533260169, "grad_norm": 0.9672917127609253, "learning_rate": 2.259519563621106e-06, "loss": 0.1299, "mean_token_accuracy": 0.763197161257267, "num_tokens": 69138432.0, "step": 8440 }, { "epoch": 2.3931036924275144, "grad_norm": 1.2218233346939087, "learning_rate": 2.2490296863526697e-06, "loss": 0.1157, "mean_token_accuracy": 0.7761130161583424, "num_tokens": 69220352.0, "step": 8450 }, { "epoch": 2.3959358515948597, "grad_norm": 1.2007724046707153, "learning_rate": 2.2385398090842337e-06, "loss": 0.1109, "mean_token_accuracy": 0.77244373857975, "num_tokens": 69302272.0, "step": 8460 }, { "epoch": 2.398768010762205, "grad_norm": 1.3817311525344849, "learning_rate": 2.2280499318157982e-06, "loss": 0.1496, "mean_token_accuracy": 0.7545254405587911, "num_tokens": 69384192.0, "step": 8470 }, { "epoch": 2.40160016992955, "grad_norm": 1.0182404518127441, "learning_rate": 2.217560054547362e-06, "loss": 0.1125, "mean_token_accuracy": 0.7736423641443253, "num_tokens": 69466112.0, "step": 8480 }, { "epoch": 2.4044323290968954, "grad_norm": 1.4888405799865723, "learning_rate": 2.207070177278926e-06, "loss": 0.1441, "mean_token_accuracy": 0.7494863025844097, "num_tokens": 69548032.0, "step": 8490 }, { "epoch": 2.40726448826424, "grad_norm": 1.682157278060913, "learning_rate": 2.19658030001049e-06, "loss": 0.1116, "mean_token_accuracy": 0.7845278859138489, "num_tokens": 69629952.0, "step": 8500 }, { "epoch": 2.4100966474315855, "grad_norm": 1.2417086362838745, "learning_rate": 2.186090422742054e-06, "loss": 0.1332, "mean_token_accuracy": 0.7469911925494671, "num_tokens": 69711872.0, "step": 8510 }, { "epoch": 2.4129288065989307, "grad_norm": 1.0602362155914307, "learning_rate": 2.175600545473618e-06, "loss": 0.118, "mean_token_accuracy": 0.7913282752037049, "num_tokens": 69793792.0, "step": 8520 }, { "epoch": 2.415760965766276, "grad_norm": 1.6940639019012451, "learning_rate": 2.1651106682051823e-06, "loss": 0.124, "mean_token_accuracy": 0.7758806262165308, "num_tokens": 69875712.0, "step": 8530 }, { "epoch": 2.418593124933621, "grad_norm": 1.2620666027069092, "learning_rate": 2.1546207909367463e-06, "loss": 0.1107, "mean_token_accuracy": 0.7848703525960445, "num_tokens": 69957632.0, "step": 8540 }, { "epoch": 2.4214252841009665, "grad_norm": 1.2057409286499023, "learning_rate": 2.1441309136683104e-06, "loss": 0.1236, "mean_token_accuracy": 0.7542930539697409, "num_tokens": 70039552.0, "step": 8550 }, { "epoch": 2.4242574432683117, "grad_norm": 1.1586029529571533, "learning_rate": 2.1336410363998745e-06, "loss": 0.1291, "mean_token_accuracy": 0.7688600767403841, "num_tokens": 70121472.0, "step": 8560 }, { "epoch": 2.427089602435657, "grad_norm": 1.0324428081512451, "learning_rate": 2.123151159131438e-06, "loss": 0.1232, "mean_token_accuracy": 0.7772504888474941, "num_tokens": 70203392.0, "step": 8570 }, { "epoch": 2.429921761603002, "grad_norm": 1.3371713161468506, "learning_rate": 2.1126612818630022e-06, "loss": 0.0994, "mean_token_accuracy": 0.7973581213504076, "num_tokens": 70285312.0, "step": 8580 }, { "epoch": 2.4327539207703475, "grad_norm": 1.5676774978637695, "learning_rate": 2.1021714045945667e-06, "loss": 0.1214, "mean_token_accuracy": 0.7777397260069847, "num_tokens": 70367232.0, "step": 8590 }, { "epoch": 2.4355860799376927, "grad_norm": 1.5341993570327759, "learning_rate": 2.0916815273261304e-06, "loss": 0.12, "mean_token_accuracy": 0.7651418760418892, "num_tokens": 70449152.0, "step": 8600 }, { "epoch": 2.4384182391050375, "grad_norm": 1.1318984031677246, "learning_rate": 2.0811916500576945e-06, "loss": 0.1135, "mean_token_accuracy": 0.783365948498249, "num_tokens": 70531072.0, "step": 8610 }, { "epoch": 2.4412503982723828, "grad_norm": 1.357151985168457, "learning_rate": 2.0707017727892585e-06, "loss": 0.1268, "mean_token_accuracy": 0.7591364953666926, "num_tokens": 70612992.0, "step": 8620 }, { "epoch": 2.444082557439728, "grad_norm": 1.4665955305099487, "learning_rate": 2.0602118955208226e-06, "loss": 0.1193, "mean_token_accuracy": 0.766915363073349, "num_tokens": 70694912.0, "step": 8630 }, { "epoch": 2.4469147166070733, "grad_norm": 1.5252732038497925, "learning_rate": 2.0497220182523867e-06, "loss": 0.1283, "mean_token_accuracy": 0.7542930487543344, "num_tokens": 70776832.0, "step": 8640 }, { "epoch": 2.4497468757744185, "grad_norm": 1.6196575164794922, "learning_rate": 2.0392321409839508e-06, "loss": 0.1409, "mean_token_accuracy": 0.761081213876605, "num_tokens": 70858752.0, "step": 8650 }, { "epoch": 2.4525790349417638, "grad_norm": 1.064545750617981, "learning_rate": 2.028742263715515e-06, "loss": 0.1198, "mean_token_accuracy": 0.7740092940628529, "num_tokens": 70940672.0, "step": 8660 }, { "epoch": 2.455411194109109, "grad_norm": 2.187279224395752, "learning_rate": 2.018252386447079e-06, "loss": 0.1214, "mean_token_accuracy": 0.7668419752269984, "num_tokens": 71022592.0, "step": 8670 }, { "epoch": 2.4582433532764543, "grad_norm": 1.5529414415359497, "learning_rate": 2.007762509178643e-06, "loss": 0.118, "mean_token_accuracy": 0.7827299419790507, "num_tokens": 71104512.0, "step": 8680 }, { "epoch": 2.4610755124437995, "grad_norm": 1.3601223230361938, "learning_rate": 1.9972726319102066e-06, "loss": 0.1174, "mean_token_accuracy": 0.7767979476600886, "num_tokens": 71186432.0, "step": 8690 }, { "epoch": 2.4639076716111443, "grad_norm": 1.203856110572815, "learning_rate": 1.9867827546417707e-06, "loss": 0.1183, "mean_token_accuracy": 0.7660225074738264, "num_tokens": 71268352.0, "step": 8700 }, { "epoch": 2.4667398307784896, "grad_norm": 1.3757646083831787, "learning_rate": 1.976292877373335e-06, "loss": 0.1198, "mean_token_accuracy": 0.7904231905937195, "num_tokens": 71350272.0, "step": 8710 }, { "epoch": 2.469571989945835, "grad_norm": 1.464977502822876, "learning_rate": 1.965803000104899e-06, "loss": 0.1167, "mean_token_accuracy": 0.7592465754598379, "num_tokens": 71432192.0, "step": 8720 }, { "epoch": 2.47240414911318, "grad_norm": 1.0755860805511475, "learning_rate": 1.955313122836463e-06, "loss": 0.1195, "mean_token_accuracy": 0.7648361060768366, "num_tokens": 71514112.0, "step": 8730 }, { "epoch": 2.4752363082805253, "grad_norm": 1.1315070390701294, "learning_rate": 1.944823245568027e-06, "loss": 0.1331, "mean_token_accuracy": 0.7784858118742705, "num_tokens": 71596032.0, "step": 8740 }, { "epoch": 2.4780684674478706, "grad_norm": 1.3340259790420532, "learning_rate": 1.934333368299591e-06, "loss": 0.113, "mean_token_accuracy": 0.7791218183934688, "num_tokens": 71677952.0, "step": 8750 }, { "epoch": 2.480900626615216, "grad_norm": 1.6691911220550537, "learning_rate": 1.923843491031155e-06, "loss": 0.1425, "mean_token_accuracy": 0.748287669941783, "num_tokens": 71759872.0, "step": 8760 }, { "epoch": 2.483732785782561, "grad_norm": 1.6373320817947388, "learning_rate": 1.913353613762719e-06, "loss": 0.1228, "mean_token_accuracy": 0.7790239740163087, "num_tokens": 71841792.0, "step": 8770 }, { "epoch": 2.4865649449499063, "grad_norm": 1.4492214918136597, "learning_rate": 1.902863736494283e-06, "loss": 0.1, "mean_token_accuracy": 0.7805650692433119, "num_tokens": 71923712.0, "step": 8780 }, { "epoch": 2.4893971041172516, "grad_norm": 1.5409858226776123, "learning_rate": 1.8923738592258472e-06, "loss": 0.1361, "mean_token_accuracy": 0.7608243644237518, "num_tokens": 72005632.0, "step": 8790 }, { "epoch": 2.492229263284597, "grad_norm": 1.140271782875061, "learning_rate": 1.881883981957411e-06, "loss": 0.1161, "mean_token_accuracy": 0.7726638954132795, "num_tokens": 72087552.0, "step": 8800 }, { "epoch": 2.4950614224519416, "grad_norm": 1.4284526109695435, "learning_rate": 1.8713941046889753e-06, "loss": 0.1604, "mean_token_accuracy": 0.7403375681489706, "num_tokens": 72169472.0, "step": 8810 }, { "epoch": 2.497893581619287, "grad_norm": 1.4512755870819092, "learning_rate": 1.8609042274205394e-06, "loss": 0.1107, "mean_token_accuracy": 0.774828764423728, "num_tokens": 72251392.0, "step": 8820 }, { "epoch": 2.500725740786632, "grad_norm": 1.543485403060913, "learning_rate": 1.8504143501521033e-06, "loss": 0.1289, "mean_token_accuracy": 0.7575831692665815, "num_tokens": 72333312.0, "step": 8830 }, { "epoch": 2.5035578999539774, "grad_norm": 1.533150553703308, "learning_rate": 1.8399244728836673e-06, "loss": 0.1455, "mean_token_accuracy": 0.7610567532479763, "num_tokens": 72415232.0, "step": 8840 }, { "epoch": 2.5063900591213226, "grad_norm": 1.922784447669983, "learning_rate": 1.8294345956152316e-06, "loss": 0.1093, "mean_token_accuracy": 0.7744985327124596, "num_tokens": 72497152.0, "step": 8850 }, { "epoch": 2.509222218288668, "grad_norm": 1.2514212131500244, "learning_rate": 1.8189447183467953e-06, "loss": 0.1429, "mean_token_accuracy": 0.7551247548311949, "num_tokens": 72579072.0, "step": 8860 }, { "epoch": 2.512054377456013, "grad_norm": 0.8660805821418762, "learning_rate": 1.8084548410783596e-06, "loss": 0.1419, "mean_token_accuracy": 0.7588184926658869, "num_tokens": 72660992.0, "step": 8870 }, { "epoch": 2.5148865366233584, "grad_norm": 1.2345062494277954, "learning_rate": 1.7979649638099236e-06, "loss": 0.1272, "mean_token_accuracy": 0.7659980446100235, "num_tokens": 72742912.0, "step": 8880 }, { "epoch": 2.5177186957907036, "grad_norm": 1.2266974449157715, "learning_rate": 1.7874750865414875e-06, "loss": 0.1154, "mean_token_accuracy": 0.7668175145983696, "num_tokens": 72824832.0, "step": 8890 }, { "epoch": 2.5205508549580484, "grad_norm": 1.330416202545166, "learning_rate": 1.7769852092730516e-06, "loss": 0.108, "mean_token_accuracy": 0.7775929540395736, "num_tokens": 72906752.0, "step": 8900 }, { "epoch": 2.523383014125394, "grad_norm": 1.626082181930542, "learning_rate": 1.7664953320046159e-06, "loss": 0.1301, "mean_token_accuracy": 0.7622920747846365, "num_tokens": 72988672.0, "step": 8910 }, { "epoch": 2.526215173292739, "grad_norm": 0.860943615436554, "learning_rate": 1.7560054547361797e-06, "loss": 0.1222, "mean_token_accuracy": 0.7588918808847666, "num_tokens": 73070592.0, "step": 8920 }, { "epoch": 2.529047332460084, "grad_norm": 1.5452173948287964, "learning_rate": 1.7455155774677438e-06, "loss": 0.1338, "mean_token_accuracy": 0.7492049925029278, "num_tokens": 73152512.0, "step": 8930 }, { "epoch": 2.5318794916274294, "grad_norm": 1.633522868156433, "learning_rate": 1.7350257001993079e-06, "loss": 0.1433, "mean_token_accuracy": 0.7465875737369061, "num_tokens": 73234432.0, "step": 8940 }, { "epoch": 2.5347116507947747, "grad_norm": 1.2171915769577026, "learning_rate": 1.7245358229308717e-06, "loss": 0.114, "mean_token_accuracy": 0.777262720093131, "num_tokens": 73316352.0, "step": 8950 }, { "epoch": 2.53754380996212, "grad_norm": 1.367807388305664, "learning_rate": 1.7140459456624358e-06, "loss": 0.1259, "mean_token_accuracy": 0.756103229150176, "num_tokens": 73398272.0, "step": 8960 }, { "epoch": 2.540375969129465, "grad_norm": 1.4970428943634033, "learning_rate": 1.703556068394e-06, "loss": 0.1065, "mean_token_accuracy": 0.7895425587892533, "num_tokens": 73480192.0, "step": 8970 }, { "epoch": 2.5432081282968104, "grad_norm": 1.1710160970687866, "learning_rate": 1.693066191125564e-06, "loss": 0.1121, "mean_token_accuracy": 0.7776785720139742, "num_tokens": 73562112.0, "step": 8980 }, { "epoch": 2.5460402874641552, "grad_norm": 0.9374082088470459, "learning_rate": 1.682576313857128e-06, "loss": 0.1112, "mean_token_accuracy": 0.7624633032828569, "num_tokens": 73644032.0, "step": 8990 }, { "epoch": 2.548872446631501, "grad_norm": 1.0552085638046265, "learning_rate": 1.6720864365886921e-06, "loss": 0.1042, "mean_token_accuracy": 0.7797822907567025, "num_tokens": 73725952.0, "step": 9000 }, { "epoch": 2.5517046057988457, "grad_norm": 1.1519678831100464, "learning_rate": 1.661596559320256e-06, "loss": 0.1289, "mean_token_accuracy": 0.764640410989523, "num_tokens": 73807872.0, "step": 9010 }, { "epoch": 2.554536764966191, "grad_norm": 1.1929394006729126, "learning_rate": 1.65110668205182e-06, "loss": 0.1074, "mean_token_accuracy": 0.7785836592316627, "num_tokens": 73889792.0, "step": 9020 }, { "epoch": 2.5573689241335362, "grad_norm": 1.3088452816009521, "learning_rate": 1.6406168047833843e-06, "loss": 0.0986, "mean_token_accuracy": 0.7910469707101584, "num_tokens": 73971712.0, "step": 9030 }, { "epoch": 2.5602010833008815, "grad_norm": 1.795518159866333, "learning_rate": 1.6301269275149482e-06, "loss": 0.1081, "mean_token_accuracy": 0.7642245594412088, "num_tokens": 74053632.0, "step": 9040 }, { "epoch": 2.5630332424682267, "grad_norm": 1.3358420133590698, "learning_rate": 1.6196370502465123e-06, "loss": 0.1147, "mean_token_accuracy": 0.7747553832828998, "num_tokens": 74135552.0, "step": 9050 }, { "epoch": 2.565865401635572, "grad_norm": 1.489589810371399, "learning_rate": 1.6091471729780763e-06, "loss": 0.1207, "mean_token_accuracy": 0.7772504851222038, "num_tokens": 74217472.0, "step": 9060 }, { "epoch": 2.5686975608029172, "grad_norm": 1.3394817113876343, "learning_rate": 1.5986572957096402e-06, "loss": 0.1376, "mean_token_accuracy": 0.7540484316647053, "num_tokens": 74299392.0, "step": 9070 }, { "epoch": 2.5715297199702625, "grad_norm": 1.119963526725769, "learning_rate": 1.5881674184412043e-06, "loss": 0.1185, "mean_token_accuracy": 0.7621819950640202, "num_tokens": 74381312.0, "step": 9080 }, { "epoch": 2.5743618791376077, "grad_norm": 1.4001566171646118, "learning_rate": 1.5776775411727686e-06, "loss": 0.104, "mean_token_accuracy": 0.7811521515250206, "num_tokens": 74463232.0, "step": 9090 }, { "epoch": 2.5771940383049525, "grad_norm": 1.5772784948349, "learning_rate": 1.5671876639043324e-06, "loss": 0.1152, "mean_token_accuracy": 0.7650195706635714, "num_tokens": 74545152.0, "step": 9100 }, { "epoch": 2.5800261974722978, "grad_norm": 1.7766703367233276, "learning_rate": 1.5566977866358965e-06, "loss": 0.113, "mean_token_accuracy": 0.7801981404423713, "num_tokens": 74627072.0, "step": 9110 }, { "epoch": 2.582858356639643, "grad_norm": 1.4249588251113892, "learning_rate": 1.5462079093674606e-06, "loss": 0.1236, "mean_token_accuracy": 0.7641022481024266, "num_tokens": 74708992.0, "step": 9120 }, { "epoch": 2.5856905158069883, "grad_norm": 1.6609476804733276, "learning_rate": 1.5357180320990244e-06, "loss": 0.1154, "mean_token_accuracy": 0.763050389662385, "num_tokens": 74790912.0, "step": 9130 }, { "epoch": 2.5885226749743335, "grad_norm": 0.9137653708457947, "learning_rate": 1.5252281548305885e-06, "loss": 0.1179, "mean_token_accuracy": 0.7632950112223625, "num_tokens": 74872832.0, "step": 9140 }, { "epoch": 2.5913548341416788, "grad_norm": 0.9380526542663574, "learning_rate": 1.5147382775621528e-06, "loss": 0.1242, "mean_token_accuracy": 0.7689334619790316, "num_tokens": 74954752.0, "step": 9150 }, { "epoch": 2.594186993309024, "grad_norm": 1.246500849723816, "learning_rate": 1.5042484002937167e-06, "loss": 0.1106, "mean_token_accuracy": 0.758524950966239, "num_tokens": 75036672.0, "step": 9160 }, { "epoch": 2.5970191524763693, "grad_norm": 1.2258425951004028, "learning_rate": 1.4937585230252807e-06, "loss": 0.121, "mean_token_accuracy": 0.7841976564377546, "num_tokens": 75118592.0, "step": 9170 }, { "epoch": 2.5998513116437145, "grad_norm": 1.4543510675430298, "learning_rate": 1.4832686457568448e-06, "loss": 0.0928, "mean_token_accuracy": 0.794337086752057, "num_tokens": 75200512.0, "step": 9180 }, { "epoch": 2.6026834708110593, "grad_norm": 1.4098447561264038, "learning_rate": 1.4727787684884087e-06, "loss": 0.1181, "mean_token_accuracy": 0.7629280813038349, "num_tokens": 75282432.0, "step": 9190 }, { "epoch": 2.605515629978405, "grad_norm": 1.3578165769577026, "learning_rate": 1.4622888912199728e-06, "loss": 0.1072, "mean_token_accuracy": 0.7831213314086198, "num_tokens": 75364352.0, "step": 9200 }, { "epoch": 2.60834778914575, "grad_norm": 1.7388701438903809, "learning_rate": 1.451799013951537e-06, "loss": 0.1029, "mean_token_accuracy": 0.7875366933643818, "num_tokens": 75446272.0, "step": 9210 }, { "epoch": 2.611179948313095, "grad_norm": 1.3704735040664673, "learning_rate": 1.441309136683101e-06, "loss": 0.1546, "mean_token_accuracy": 0.7491927597671747, "num_tokens": 75528192.0, "step": 9220 }, { "epoch": 2.6140121074804403, "grad_norm": 1.2139005661010742, "learning_rate": 1.430819259414665e-06, "loss": 0.1298, "mean_token_accuracy": 0.7586350254714489, "num_tokens": 75610112.0, "step": 9230 }, { "epoch": 2.6168442666477856, "grad_norm": 2.0187840461730957, "learning_rate": 1.420329382146229e-06, "loss": 0.1319, "mean_token_accuracy": 0.7546477496623993, "num_tokens": 75692032.0, "step": 9240 }, { "epoch": 2.619676425815131, "grad_norm": 1.0713800191879272, "learning_rate": 1.409839504877793e-06, "loss": 0.0989, "mean_token_accuracy": 0.7925269067287445, "num_tokens": 75773952.0, "step": 9250 }, { "epoch": 2.622508584982476, "grad_norm": 1.284598469734192, "learning_rate": 1.3993496276093572e-06, "loss": 0.1198, "mean_token_accuracy": 0.7871208406984807, "num_tokens": 75855872.0, "step": 9260 }, { "epoch": 2.6253407441498213, "grad_norm": 0.9530990123748779, "learning_rate": 1.3888597503409213e-06, "loss": 0.1194, "mean_token_accuracy": 0.7739114474505187, "num_tokens": 75937792.0, "step": 9270 }, { "epoch": 2.6281729033171666, "grad_norm": 1.252050757408142, "learning_rate": 1.3783698730724852e-06, "loss": 0.1143, "mean_token_accuracy": 0.7736790612339973, "num_tokens": 76019712.0, "step": 9280 }, { "epoch": 2.631005062484512, "grad_norm": 1.2160993814468384, "learning_rate": 1.3678799958040492e-06, "loss": 0.1035, "mean_token_accuracy": 0.7817025430500507, "num_tokens": 76101632.0, "step": 9290 }, { "epoch": 2.6338372216518566, "grad_norm": 1.4404122829437256, "learning_rate": 1.3573901185356133e-06, "loss": 0.1285, "mean_token_accuracy": 0.7545743621885777, "num_tokens": 76183552.0, "step": 9300 }, { "epoch": 2.636669380819202, "grad_norm": 1.158105492591858, "learning_rate": 1.3469002412671772e-06, "loss": 0.1127, "mean_token_accuracy": 0.7763209372758866, "num_tokens": 76265472.0, "step": 9310 }, { "epoch": 2.639501539986547, "grad_norm": 1.2974953651428223, "learning_rate": 1.3364103639987415e-06, "loss": 0.1254, "mean_token_accuracy": 0.7779109582304955, "num_tokens": 76347392.0, "step": 9320 }, { "epoch": 2.6423336991538924, "grad_norm": 1.4528638124465942, "learning_rate": 1.3259204867303055e-06, "loss": 0.1089, "mean_token_accuracy": 0.7840998075902462, "num_tokens": 76429312.0, "step": 9330 }, { "epoch": 2.6451658583212376, "grad_norm": 0.9896726012229919, "learning_rate": 1.3154306094618694e-06, "loss": 0.103, "mean_token_accuracy": 0.7768224064260721, "num_tokens": 76511232.0, "step": 9340 }, { "epoch": 2.647998017488583, "grad_norm": 1.1756311655044556, "learning_rate": 1.3049407321934335e-06, "loss": 0.1033, "mean_token_accuracy": 0.7787426613271237, "num_tokens": 76593152.0, "step": 9350 }, { "epoch": 2.650830176655928, "grad_norm": 1.280672550201416, "learning_rate": 1.2944508549249975e-06, "loss": 0.1049, "mean_token_accuracy": 0.7903131127357483, "num_tokens": 76675072.0, "step": 9360 }, { "epoch": 2.6536623358232734, "grad_norm": 1.228232979774475, "learning_rate": 1.2839609776565614e-06, "loss": 0.1449, "mean_token_accuracy": 0.743456457555294, "num_tokens": 76756992.0, "step": 9370 }, { "epoch": 2.6564944949906186, "grad_norm": 1.4639639854431152, "learning_rate": 1.2734711003881257e-06, "loss": 0.1358, "mean_token_accuracy": 0.7659980464726687, "num_tokens": 76838912.0, "step": 9380 }, { "epoch": 2.6593266541579634, "grad_norm": 1.4914389848709106, "learning_rate": 1.2629812231196898e-06, "loss": 0.1121, "mean_token_accuracy": 0.7777764193713665, "num_tokens": 76920832.0, "step": 9390 }, { "epoch": 2.662158813325309, "grad_norm": 1.1283109188079834, "learning_rate": 1.2524913458512536e-06, "loss": 0.113, "mean_token_accuracy": 0.7669520601630211, "num_tokens": 77002752.0, "step": 9400 }, { "epoch": 2.664990972492654, "grad_norm": 1.1668506860733032, "learning_rate": 1.2420014685828177e-06, "loss": 0.1224, "mean_token_accuracy": 0.7749633088707923, "num_tokens": 77084672.0, "step": 9410 }, { "epoch": 2.667823131659999, "grad_norm": 1.8604751825332642, "learning_rate": 1.2315115913143818e-06, "loss": 0.1349, "mean_token_accuracy": 0.7719178080558777, "num_tokens": 77166592.0, "step": 9420 }, { "epoch": 2.6706552908273444, "grad_norm": 2.2527692317962646, "learning_rate": 1.2210217140459456e-06, "loss": 0.1357, "mean_token_accuracy": 0.7704500976949931, "num_tokens": 77248512.0, "step": 9430 }, { "epoch": 2.6734874499946897, "grad_norm": 1.1649688482284546, "learning_rate": 1.21053183677751e-06, "loss": 0.1056, "mean_token_accuracy": 0.7830601751804351, "num_tokens": 77330432.0, "step": 9440 }, { "epoch": 2.676319609162035, "grad_norm": 1.1416834592819214, "learning_rate": 1.2000419595090738e-06, "loss": 0.1265, "mean_token_accuracy": 0.7677592922002077, "num_tokens": 77412352.0, "step": 9450 }, { "epoch": 2.67915176832938, "grad_norm": 1.1690260171890259, "learning_rate": 1.1895520822406379e-06, "loss": 0.1164, "mean_token_accuracy": 0.7738502897322178, "num_tokens": 77494272.0, "step": 9460 }, { "epoch": 2.6819839274967254, "grad_norm": 1.4305615425109863, "learning_rate": 1.179062204972202e-06, "loss": 0.1248, "mean_token_accuracy": 0.7664261247962714, "num_tokens": 77576192.0, "step": 9470 }, { "epoch": 2.6848160866640707, "grad_norm": 1.3226728439331055, "learning_rate": 1.168572327703766e-06, "loss": 0.1253, "mean_token_accuracy": 0.7715998016297817, "num_tokens": 77658112.0, "step": 9480 }, { "epoch": 2.687648245831416, "grad_norm": 1.2239925861358643, "learning_rate": 1.1580824504353299e-06, "loss": 0.1412, "mean_token_accuracy": 0.7618395283818244, "num_tokens": 77740032.0, "step": 9490 }, { "epoch": 2.6904804049987607, "grad_norm": 1.3090022802352905, "learning_rate": 1.1475925731668942e-06, "loss": 0.1944, "mean_token_accuracy": 0.7248899202793837, "num_tokens": 77821952.0, "step": 9500 }, { "epoch": 2.693312564166106, "grad_norm": 1.6963791847229004, "learning_rate": 1.137102695898458e-06, "loss": 0.1448, "mean_token_accuracy": 0.7517612528055906, "num_tokens": 77903872.0, "step": 9510 }, { "epoch": 2.6961447233334512, "grad_norm": 1.5691133737564087, "learning_rate": 1.1266128186300221e-06, "loss": 0.1062, "mean_token_accuracy": 0.7805895309895277, "num_tokens": 77985792.0, "step": 9520 }, { "epoch": 2.6989768825007965, "grad_norm": 1.3455393314361572, "learning_rate": 1.1161229413615862e-06, "loss": 0.1181, "mean_token_accuracy": 0.7727250501513481, "num_tokens": 78067712.0, "step": 9530 }, { "epoch": 2.7018090416681417, "grad_norm": 1.7499293088912964, "learning_rate": 1.1056330640931503e-06, "loss": 0.1621, "mean_token_accuracy": 0.7321550864726305, "num_tokens": 78149632.0, "step": 9540 }, { "epoch": 2.704641200835487, "grad_norm": 1.078167200088501, "learning_rate": 1.0951431868247141e-06, "loss": 0.1142, "mean_token_accuracy": 0.7804916825145483, "num_tokens": 78231552.0, "step": 9550 }, { "epoch": 2.7074733600028322, "grad_norm": 1.411314845085144, "learning_rate": 1.0846533095562784e-06, "loss": 0.1143, "mean_token_accuracy": 0.7715141884982586, "num_tokens": 78313472.0, "step": 9560 }, { "epoch": 2.7103055191701775, "grad_norm": 1.734834909439087, "learning_rate": 1.0741634322878423e-06, "loss": 0.1425, "mean_token_accuracy": 0.7414505925029516, "num_tokens": 78395392.0, "step": 9570 }, { "epoch": 2.7131376783375227, "grad_norm": 1.7494261264801025, "learning_rate": 1.0636735550194063e-06, "loss": 0.1213, "mean_token_accuracy": 0.759222112223506, "num_tokens": 78477312.0, "step": 9580 }, { "epoch": 2.7159698375048675, "grad_norm": 1.2814098596572876, "learning_rate": 1.0531836777509704e-06, "loss": 0.1335, "mean_token_accuracy": 0.7782045040279627, "num_tokens": 78559232.0, "step": 9590 }, { "epoch": 2.7188019966722132, "grad_norm": 1.2416023015975952, "learning_rate": 1.0426938004825345e-06, "loss": 0.131, "mean_token_accuracy": 0.7627446163445711, "num_tokens": 78641152.0, "step": 9600 }, { "epoch": 2.721634155839558, "grad_norm": 1.2916755676269531, "learning_rate": 1.0322039232140984e-06, "loss": 0.1292, "mean_token_accuracy": 0.7665728945285082, "num_tokens": 78723072.0, "step": 9610 }, { "epoch": 2.7244663150069033, "grad_norm": 0.9685536026954651, "learning_rate": 1.0217140459456624e-06, "loss": 0.0966, "mean_token_accuracy": 0.7933586113154888, "num_tokens": 78804992.0, "step": 9620 }, { "epoch": 2.7272984741742485, "grad_norm": 1.0701133012771606, "learning_rate": 1.0112241686772265e-06, "loss": 0.0929, "mean_token_accuracy": 0.768480920419097, "num_tokens": 78886912.0, "step": 9630 }, { "epoch": 2.730130633341594, "grad_norm": 1.155450701713562, "learning_rate": 1.0007342914087906e-06, "loss": 0.1217, "mean_token_accuracy": 0.7712084148079157, "num_tokens": 78968832.0, "step": 9640 }, { "epoch": 2.732962792508939, "grad_norm": 1.2108891010284424, "learning_rate": 9.902444141403547e-07, "loss": 0.1269, "mean_token_accuracy": 0.759784734621644, "num_tokens": 79050752.0, "step": 9650 }, { "epoch": 2.7357949516762843, "grad_norm": 1.3404109477996826, "learning_rate": 9.797545368719187e-07, "loss": 0.115, "mean_token_accuracy": 0.7742294497787953, "num_tokens": 79132672.0, "step": 9660 }, { "epoch": 2.7386271108436295, "grad_norm": 0.9352473616600037, "learning_rate": 9.692646596034828e-07, "loss": 0.1153, "mean_token_accuracy": 0.7558341480791568, "num_tokens": 79214592.0, "step": 9670 }, { "epoch": 2.741459270010975, "grad_norm": 1.2585588693618774, "learning_rate": 9.587747823350467e-07, "loss": 0.1447, "mean_token_accuracy": 0.7387475546449422, "num_tokens": 79296512.0, "step": 9680 }, { "epoch": 2.74429142917832, "grad_norm": 1.4785575866699219, "learning_rate": 9.482849050666109e-07, "loss": 0.1194, "mean_token_accuracy": 0.7637353233993054, "num_tokens": 79378432.0, "step": 9690 }, { "epoch": 2.747123588345665, "grad_norm": 0.9869931936264038, "learning_rate": 9.377950277981748e-07, "loss": 0.1237, "mean_token_accuracy": 0.7830968666821718, "num_tokens": 79460352.0, "step": 9700 }, { "epoch": 2.74995574751301, "grad_norm": 1.2523363828659058, "learning_rate": 9.273051505297388e-07, "loss": 0.1295, "mean_token_accuracy": 0.7593199610710144, "num_tokens": 79542272.0, "step": 9710 }, { "epoch": 2.7527879066803553, "grad_norm": 1.2600061893463135, "learning_rate": 9.16815273261303e-07, "loss": 0.1209, "mean_token_accuracy": 0.7813111554831267, "num_tokens": 79624192.0, "step": 9720 }, { "epoch": 2.7556200658477006, "grad_norm": 0.9577277898788452, "learning_rate": 9.063253959928669e-07, "loss": 0.1156, "mean_token_accuracy": 0.7740337550640106, "num_tokens": 79706112.0, "step": 9730 }, { "epoch": 2.758452225015046, "grad_norm": 1.1340205669403076, "learning_rate": 8.958355187244309e-07, "loss": 0.1038, "mean_token_accuracy": 0.7865215256810189, "num_tokens": 79788032.0, "step": 9740 }, { "epoch": 2.761284384182391, "grad_norm": 1.5387784242630005, "learning_rate": 8.853456414559951e-07, "loss": 0.1328, "mean_token_accuracy": 0.7656678043305873, "num_tokens": 79869952.0, "step": 9750 }, { "epoch": 2.7641165433497363, "grad_norm": 1.7430437803268433, "learning_rate": 8.748557641875591e-07, "loss": 0.095, "mean_token_accuracy": 0.7949119359254837, "num_tokens": 79951872.0, "step": 9760 }, { "epoch": 2.7669487025170816, "grad_norm": 1.7460997104644775, "learning_rate": 8.64365886919123e-07, "loss": 0.1196, "mean_token_accuracy": 0.775464779511094, "num_tokens": 80033792.0, "step": 9770 }, { "epoch": 2.769780861684427, "grad_norm": 1.1114528179168701, "learning_rate": 8.538760096506872e-07, "loss": 0.1293, "mean_token_accuracy": 0.7551736798137426, "num_tokens": 80115712.0, "step": 9780 }, { "epoch": 2.7726130208517716, "grad_norm": 1.3568215370178223, "learning_rate": 8.433861323822512e-07, "loss": 0.0965, "mean_token_accuracy": 0.7976272024214268, "num_tokens": 80197632.0, "step": 9790 }, { "epoch": 2.7754451800191173, "grad_norm": 1.039504885673523, "learning_rate": 8.328962551138151e-07, "loss": 0.1181, "mean_token_accuracy": 0.7570694729685783, "num_tokens": 80279552.0, "step": 9800 }, { "epoch": 2.778277339186462, "grad_norm": 0.9073276519775391, "learning_rate": 8.224063778453793e-07, "loss": 0.1327, "mean_token_accuracy": 0.7564946163445712, "num_tokens": 80361472.0, "step": 9810 }, { "epoch": 2.7811094983538074, "grad_norm": 2.061521291732788, "learning_rate": 8.119165005769433e-07, "loss": 0.1195, "mean_token_accuracy": 0.7596379648894072, "num_tokens": 80443392.0, "step": 9820 }, { "epoch": 2.7839416575211526, "grad_norm": 1.21349036693573, "learning_rate": 8.014266233085073e-07, "loss": 0.126, "mean_token_accuracy": 0.7478473570197821, "num_tokens": 80525312.0, "step": 9830 }, { "epoch": 2.786773816688498, "grad_norm": 1.4586316347122192, "learning_rate": 7.909367460400715e-07, "loss": 0.1223, "mean_token_accuracy": 0.770731408149004, "num_tokens": 80607232.0, "step": 9840 }, { "epoch": 2.789605975855843, "grad_norm": 1.3496206998825073, "learning_rate": 7.804468687716354e-07, "loss": 0.1015, "mean_token_accuracy": 0.7758072383701802, "num_tokens": 80689152.0, "step": 9850 }, { "epoch": 2.7924381350231884, "grad_norm": 1.2071694135665894, "learning_rate": 7.699569915031994e-07, "loss": 0.1146, "mean_token_accuracy": 0.774987768009305, "num_tokens": 80771072.0, "step": 9860 }, { "epoch": 2.7952702941905336, "grad_norm": 1.2012773752212524, "learning_rate": 7.594671142347636e-07, "loss": 0.1262, "mean_token_accuracy": 0.7779476504772902, "num_tokens": 80852992.0, "step": 9870 }, { "epoch": 2.7981024533578784, "grad_norm": 1.2166376113891602, "learning_rate": 7.489772369663275e-07, "loss": 0.1141, "mean_token_accuracy": 0.779562134295702, "num_tokens": 80934912.0, "step": 9880 }, { "epoch": 2.800934612525224, "grad_norm": 1.269511103630066, "learning_rate": 7.384873596978916e-07, "loss": 0.1247, "mean_token_accuracy": 0.7734099797904491, "num_tokens": 81016832.0, "step": 9890 }, { "epoch": 2.803766771692569, "grad_norm": 1.0128493309020996, "learning_rate": 7.279974824294557e-07, "loss": 0.1261, "mean_token_accuracy": 0.7762353252619505, "num_tokens": 81098752.0, "step": 9900 }, { "epoch": 2.806598930859914, "grad_norm": 1.538405179977417, "learning_rate": 7.175076051610197e-07, "loss": 0.1291, "mean_token_accuracy": 0.7826198644936084, "num_tokens": 81180672.0, "step": 9910 }, { "epoch": 2.8094310900272594, "grad_norm": 1.5747365951538086, "learning_rate": 7.070177278925837e-07, "loss": 0.1306, "mean_token_accuracy": 0.776382091268897, "num_tokens": 81262592.0, "step": 9920 }, { "epoch": 2.8122632491946047, "grad_norm": 1.071977972984314, "learning_rate": 6.965278506241478e-07, "loss": 0.1108, "mean_token_accuracy": 0.7729818969964981, "num_tokens": 81344512.0, "step": 9930 }, { "epoch": 2.81509540836195, "grad_norm": 1.172013282775879, "learning_rate": 6.860379733557118e-07, "loss": 0.1414, "mean_token_accuracy": 0.7524706482887268, "num_tokens": 81426432.0, "step": 9940 }, { "epoch": 2.817927567529295, "grad_norm": 1.3133201599121094, "learning_rate": 6.755480960872759e-07, "loss": 0.1192, "mean_token_accuracy": 0.7713796466588974, "num_tokens": 81508352.0, "step": 9950 }, { "epoch": 2.8207597266966404, "grad_norm": 1.6226385831832886, "learning_rate": 6.650582188188398e-07, "loss": 0.1244, "mean_token_accuracy": 0.7702299427241087, "num_tokens": 81590272.0, "step": 9960 }, { "epoch": 2.8235918858639857, "grad_norm": 1.5946696996688843, "learning_rate": 6.545683415504039e-07, "loss": 0.1186, "mean_token_accuracy": 0.7681873787194491, "num_tokens": 81672192.0, "step": 9970 }, { "epoch": 2.826424045031331, "grad_norm": 1.3367503881454468, "learning_rate": 6.44078464281968e-07, "loss": 0.129, "mean_token_accuracy": 0.7627201572060585, "num_tokens": 81754112.0, "step": 9980 }, { "epoch": 2.8292562041986757, "grad_norm": 1.6041656732559204, "learning_rate": 6.335885870135319e-07, "loss": 0.0955, "mean_token_accuracy": 0.7912304297089576, "num_tokens": 81836032.0, "step": 9990 }, { "epoch": 2.8320883633660214, "grad_norm": 1.589345097541809, "learning_rate": 6.23098709745096e-07, "loss": 0.1391, "mean_token_accuracy": 0.7658023487776517, "num_tokens": 81917952.0, "step": 10000 }, { "epoch": 2.8349205225333662, "grad_norm": 2.1276166439056396, "learning_rate": 6.126088324766601e-07, "loss": 0.1176, "mean_token_accuracy": 0.7733365941792727, "num_tokens": 81999872.0, "step": 10010 }, { "epoch": 2.8377526817007115, "grad_norm": 1.4682493209838867, "learning_rate": 6.021189552082242e-07, "loss": 0.1185, "mean_token_accuracy": 0.7733732853084803, "num_tokens": 82081792.0, "step": 10020 }, { "epoch": 2.8405848408680567, "grad_norm": 1.2164413928985596, "learning_rate": 5.916290779397881e-07, "loss": 0.1097, "mean_token_accuracy": 0.7680772997438907, "num_tokens": 82163712.0, "step": 10030 }, { "epoch": 2.843417000035402, "grad_norm": 1.2230916023254395, "learning_rate": 5.811392006713522e-07, "loss": 0.1209, "mean_token_accuracy": 0.770596868917346, "num_tokens": 82245632.0, "step": 10040 }, { "epoch": 2.8462491592027472, "grad_norm": 1.3805590867996216, "learning_rate": 5.706493234029163e-07, "loss": 0.1109, "mean_token_accuracy": 0.7664261229336262, "num_tokens": 82327552.0, "step": 10050 }, { "epoch": 2.8490813183700925, "grad_norm": 1.2335084676742554, "learning_rate": 5.601594461344803e-07, "loss": 0.1087, "mean_token_accuracy": 0.7922333665192127, "num_tokens": 82409472.0, "step": 10060 }, { "epoch": 2.8519134775374377, "grad_norm": 1.4766696691513062, "learning_rate": 5.496695688660443e-07, "loss": 0.0949, "mean_token_accuracy": 0.7757460854947567, "num_tokens": 82491392.0, "step": 10070 }, { "epoch": 2.8547456367047825, "grad_norm": 1.2470474243164062, "learning_rate": 5.391796915976084e-07, "loss": 0.112, "mean_token_accuracy": 0.7856042079627514, "num_tokens": 82573312.0, "step": 10080 }, { "epoch": 2.8575777958721282, "grad_norm": 1.7810742855072021, "learning_rate": 5.286898143291724e-07, "loss": 0.121, "mean_token_accuracy": 0.7693003930151463, "num_tokens": 82655232.0, "step": 10090 }, { "epoch": 2.860409955039473, "grad_norm": 1.3474197387695312, "learning_rate": 5.181999370607364e-07, "loss": 0.1182, "mean_token_accuracy": 0.7601394318044186, "num_tokens": 82737152.0, "step": 10100 }, { "epoch": 2.8632421142068183, "grad_norm": 1.096218466758728, "learning_rate": 5.077100597923005e-07, "loss": 0.13, "mean_token_accuracy": 0.7537181980907917, "num_tokens": 82819072.0, "step": 10110 }, { "epoch": 2.8660742733741635, "grad_norm": 1.064784049987793, "learning_rate": 4.972201825238645e-07, "loss": 0.1348, "mean_token_accuracy": 0.7513820916414261, "num_tokens": 82900992.0, "step": 10120 }, { "epoch": 2.868906432541509, "grad_norm": 1.5605591535568237, "learning_rate": 4.867303052554286e-07, "loss": 0.141, "mean_token_accuracy": 0.7740704540163279, "num_tokens": 82982912.0, "step": 10130 }, { "epoch": 2.871738591708854, "grad_norm": 1.420284390449524, "learning_rate": 4.7624042798699264e-07, "loss": 0.11, "mean_token_accuracy": 0.7743272982537747, "num_tokens": 83064832.0, "step": 10140 }, { "epoch": 2.8745707508761993, "grad_norm": 1.2748111486434937, "learning_rate": 4.657505507185566e-07, "loss": 0.1273, "mean_token_accuracy": 0.7646037183701992, "num_tokens": 83146752.0, "step": 10150 }, { "epoch": 2.8774029100435445, "grad_norm": 1.1738097667694092, "learning_rate": 4.552606734501207e-07, "loss": 0.1224, "mean_token_accuracy": 0.7754525426775217, "num_tokens": 83228672.0, "step": 10160 }, { "epoch": 2.88023506921089, "grad_norm": 1.5003738403320312, "learning_rate": 4.4477079618168476e-07, "loss": 0.1128, "mean_token_accuracy": 0.7775073368102312, "num_tokens": 83310592.0, "step": 10170 }, { "epoch": 2.883067228378235, "grad_norm": 1.2533864974975586, "learning_rate": 4.3428091891324873e-07, "loss": 0.1311, "mean_token_accuracy": 0.7413649678230285, "num_tokens": 83392512.0, "step": 10180 }, { "epoch": 2.88589938754558, "grad_norm": 1.5065313577651978, "learning_rate": 4.237910416448128e-07, "loss": 0.1546, "mean_token_accuracy": 0.77030332647264, "num_tokens": 83474432.0, "step": 10190 }, { "epoch": 2.8887315467129255, "grad_norm": 1.491937518119812, "learning_rate": 4.133011643763768e-07, "loss": 0.1268, "mean_token_accuracy": 0.7824853226542473, "num_tokens": 83556352.0, "step": 10200 }, { "epoch": 2.8915637058802703, "grad_norm": 1.166266918182373, "learning_rate": 4.0281128710794085e-07, "loss": 0.1116, "mean_token_accuracy": 0.782081701233983, "num_tokens": 83638272.0, "step": 10210 }, { "epoch": 2.8943958650476156, "grad_norm": 1.42288076877594, "learning_rate": 3.923214098395049e-07, "loss": 0.1282, "mean_token_accuracy": 0.7608121354132891, "num_tokens": 83720192.0, "step": 10220 }, { "epoch": 2.897228024214961, "grad_norm": 1.6304948329925537, "learning_rate": 3.818315325710689e-07, "loss": 0.1231, "mean_token_accuracy": 0.7633317038416862, "num_tokens": 83802112.0, "step": 10230 }, { "epoch": 2.900060183382306, "grad_norm": 1.4208807945251465, "learning_rate": 3.7134165530263297e-07, "loss": 0.0992, "mean_token_accuracy": 0.7627568505704403, "num_tokens": 83884032.0, "step": 10240 }, { "epoch": 2.9028923425496513, "grad_norm": 1.291266679763794, "learning_rate": 3.6085177803419705e-07, "loss": 0.11, "mean_token_accuracy": 0.7762964777648449, "num_tokens": 83965952.0, "step": 10250 }, { "epoch": 2.9057245017169966, "grad_norm": 1.5174055099487305, "learning_rate": 3.5036190076576107e-07, "loss": 0.1259, "mean_token_accuracy": 0.7930772956460714, "num_tokens": 84047872.0, "step": 10260 }, { "epoch": 2.908556660884342, "grad_norm": 1.2579764127731323, "learning_rate": 3.398720234973251e-07, "loss": 0.1173, "mean_token_accuracy": 0.7738992158323527, "num_tokens": 84129792.0, "step": 10270 }, { "epoch": 2.9113888200516866, "grad_norm": 1.7533577680587769, "learning_rate": 3.2938214622888917e-07, "loss": 0.1219, "mean_token_accuracy": 0.7630137003958225, "num_tokens": 84211712.0, "step": 10280 }, { "epoch": 2.9142209792190323, "grad_norm": 1.3265914916992188, "learning_rate": 3.188922689604532e-07, "loss": 0.139, "mean_token_accuracy": 0.7553082194179297, "num_tokens": 84293632.0, "step": 10290 }, { "epoch": 2.917053138386377, "grad_norm": 1.803127408027649, "learning_rate": 3.084023916920172e-07, "loss": 0.1207, "mean_token_accuracy": 0.7525073390454053, "num_tokens": 84375552.0, "step": 10300 }, { "epoch": 2.9198852975537224, "grad_norm": 1.6787763833999634, "learning_rate": 2.979125144235813e-07, "loss": 0.1139, "mean_token_accuracy": 0.7773361060768366, "num_tokens": 84457472.0, "step": 10310 }, { "epoch": 2.9227174567210676, "grad_norm": 1.486560344696045, "learning_rate": 2.874226371551453e-07, "loss": 0.1424, "mean_token_accuracy": 0.739921722188592, "num_tokens": 84539392.0, "step": 10320 }, { "epoch": 2.925549615888413, "grad_norm": 1.3302429914474487, "learning_rate": 2.7693275988670933e-07, "loss": 0.0954, "mean_token_accuracy": 0.7770058684051037, "num_tokens": 84621312.0, "step": 10330 }, { "epoch": 2.928381775055758, "grad_norm": 1.5905101299285889, "learning_rate": 2.664428826182734e-07, "loss": 0.1068, "mean_token_accuracy": 0.7657045032829046, "num_tokens": 84703232.0, "step": 10340 }, { "epoch": 2.9312139342231034, "grad_norm": 1.2340965270996094, "learning_rate": 2.559530053498374e-07, "loss": 0.121, "mean_token_accuracy": 0.7530577316880226, "num_tokens": 84785152.0, "step": 10350 }, { "epoch": 2.9340460933904486, "grad_norm": 1.4800512790679932, "learning_rate": 2.454631280814015e-07, "loss": 0.1025, "mean_token_accuracy": 0.783512718975544, "num_tokens": 84867072.0, "step": 10360 }, { "epoch": 2.936878252557794, "grad_norm": 1.4509563446044922, "learning_rate": 2.349732508129655e-07, "loss": 0.1136, "mean_token_accuracy": 0.7643346376717091, "num_tokens": 84948992.0, "step": 10370 }, { "epoch": 2.939710411725139, "grad_norm": 1.5300997495651245, "learning_rate": 2.2448337354452955e-07, "loss": 0.1394, "mean_token_accuracy": 0.7647504940629005, "num_tokens": 85030912.0, "step": 10380 }, { "epoch": 2.942542570892484, "grad_norm": 1.0120151042938232, "learning_rate": 2.139934962760936e-07, "loss": 0.1119, "mean_token_accuracy": 0.7749999992549419, "num_tokens": 85112832.0, "step": 10390 }, { "epoch": 2.9453747300598296, "grad_norm": 1.1445319652557373, "learning_rate": 2.0350361900765764e-07, "loss": 0.1343, "mean_token_accuracy": 0.7669031299650669, "num_tokens": 85194752.0, "step": 10400 }, { "epoch": 2.9482068892271744, "grad_norm": 1.1299060583114624, "learning_rate": 1.9301374173922166e-07, "loss": 0.1373, "mean_token_accuracy": 0.7592465754598379, "num_tokens": 85276672.0, "step": 10410 }, { "epoch": 2.9510390483945197, "grad_norm": 1.0287593603134155, "learning_rate": 1.8252386447078569e-07, "loss": 0.1243, "mean_token_accuracy": 0.7871330726891757, "num_tokens": 85358592.0, "step": 10420 }, { "epoch": 2.953871207561865, "grad_norm": 1.2568093538284302, "learning_rate": 1.7203398720234976e-07, "loss": 0.0979, "mean_token_accuracy": 0.77096379250288, "num_tokens": 85440512.0, "step": 10430 }, { "epoch": 2.95670336672921, "grad_norm": 2.05387020111084, "learning_rate": 1.6154410993391378e-07, "loss": 0.1146, "mean_token_accuracy": 0.7754647746682167, "num_tokens": 85522432.0, "step": 10440 }, { "epoch": 2.9595355258965554, "grad_norm": 1.3246551752090454, "learning_rate": 1.5105423266547783e-07, "loss": 0.1108, "mean_token_accuracy": 0.779549902677536, "num_tokens": 85604352.0, "step": 10450 }, { "epoch": 2.9623676850639007, "grad_norm": 1.5421769618988037, "learning_rate": 1.4056435539704185e-07, "loss": 0.1215, "mean_token_accuracy": 0.7485934443771839, "num_tokens": 85686272.0, "step": 10460 }, { "epoch": 2.965199844231246, "grad_norm": 1.457680583000183, "learning_rate": 1.300744781286059e-07, "loss": 0.1266, "mean_token_accuracy": 0.7566780854016543, "num_tokens": 85768192.0, "step": 10470 }, { "epoch": 2.9680320033985907, "grad_norm": 1.1517871618270874, "learning_rate": 1.1958460086016993e-07, "loss": 0.1209, "mean_token_accuracy": 0.7584148690104484, "num_tokens": 85850112.0, "step": 10480 }, { "epoch": 2.9708641625659364, "grad_norm": 1.3935081958770752, "learning_rate": 1.0909472359173399e-07, "loss": 0.1103, "mean_token_accuracy": 0.782497552037239, "num_tokens": 85932032.0, "step": 10490 }, { "epoch": 2.9736963217332812, "grad_norm": 1.209938883781433, "learning_rate": 9.860484632329804e-08, "loss": 0.1417, "mean_token_accuracy": 0.7534368880093097, "num_tokens": 86013952.0, "step": 10500 } ], "logging_steps": 10, "max_steps": 10593, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.2731807970767667e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }