{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 418, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023923444976076554, "grad_norm": 20.119582297751847, "learning_rate": 0.0, "loss": 2.1821, "num_tokens": 274125.0, "step": 1 }, { "epoch": 0.004784688995215311, "grad_norm": 23.170097201666195, "learning_rate": 7.692307692307694e-07, "loss": 2.2358, "num_tokens": 493377.0, "step": 2 }, { "epoch": 0.007177033492822967, "grad_norm": 23.45399110433363, "learning_rate": 1.5384615384615387e-06, "loss": 2.2012, "num_tokens": 686897.0, "step": 3 }, { "epoch": 0.009569377990430622, "grad_norm": 19.228005860305963, "learning_rate": 2.307692307692308e-06, "loss": 2.2219, "num_tokens": 914354.0, "step": 4 }, { "epoch": 0.011961722488038277, "grad_norm": 18.548172181534362, "learning_rate": 3.0769230769230774e-06, "loss": 2.1958, "num_tokens": 1087390.0, "step": 5 }, { "epoch": 0.014354066985645933, "grad_norm": 13.728999411657618, "learning_rate": 3.846153846153847e-06, "loss": 2.1247, "num_tokens": 1268762.0, "step": 6 }, { "epoch": 0.01674641148325359, "grad_norm": 8.505233076733274, "learning_rate": 4.615384615384616e-06, "loss": 2.0737, "num_tokens": 1433561.0, "step": 7 }, { "epoch": 0.019138755980861243, "grad_norm": 4.1867059667712025, "learning_rate": 5.384615384615385e-06, "loss": 1.938, "num_tokens": 1655898.0, "step": 8 }, { "epoch": 0.0215311004784689, "grad_norm": 3.3689397757728203, "learning_rate": 6.153846153846155e-06, "loss": 1.8762, "num_tokens": 1904754.0, "step": 9 }, { "epoch": 0.023923444976076555, "grad_norm": 2.6965544763096254, "learning_rate": 6.923076923076923e-06, "loss": 1.8481, "num_tokens": 2100951.0, "step": 10 }, { "epoch": 0.02631578947368421, "grad_norm": 2.0908111385220045, "learning_rate": 7.692307692307694e-06, "loss": 1.7457, "num_tokens": 2264681.0, "step": 11 }, { "epoch": 0.028708133971291867, "grad_norm": 2.170718723726301, "learning_rate": 8.461538461538462e-06, "loss": 1.7225, "num_tokens": 2459076.0, "step": 12 }, { "epoch": 0.03110047846889952, "grad_norm": 2.16857982636961, "learning_rate": 9.230769230769232e-06, "loss": 1.6537, "num_tokens": 2606612.0, "step": 13 }, { "epoch": 0.03349282296650718, "grad_norm": 1.5656854944876009, "learning_rate": 1e-05, "loss": 1.6801, "num_tokens": 2766328.0, "step": 14 }, { "epoch": 0.03588516746411483, "grad_norm": 1.6899464949924934, "learning_rate": 9.999864615158956e-06, "loss": 1.3963, "num_tokens": 2939734.0, "step": 15 }, { "epoch": 0.03827751196172249, "grad_norm": 1.2147889414450102, "learning_rate": 9.999458468782065e-06, "loss": 1.6588, "num_tokens": 3209741.0, "step": 16 }, { "epoch": 0.04066985645933014, "grad_norm": 1.3059422864639767, "learning_rate": 9.998781585307577e-06, "loss": 1.2028, "num_tokens": 3331253.0, "step": 17 }, { "epoch": 0.0430622009569378, "grad_norm": 0.8168354152517865, "learning_rate": 9.997834005464281e-06, "loss": 1.5119, "num_tokens": 3550942.0, "step": 18 }, { "epoch": 0.045454545454545456, "grad_norm": 0.7578450765410201, "learning_rate": 9.996615786269036e-06, "loss": 1.5165, "num_tokens": 3734184.0, "step": 19 }, { "epoch": 0.04784688995215311, "grad_norm": 0.772984535484589, "learning_rate": 9.995127001023362e-06, "loss": 1.4925, "num_tokens": 3923612.0, "step": 20 }, { "epoch": 0.050239234449760764, "grad_norm": 0.7657276095351829, "learning_rate": 9.993367739309013e-06, "loss": 1.3945, "num_tokens": 4090661.0, "step": 21 }, { "epoch": 0.05263157894736842, "grad_norm": 0.6839298661119211, "learning_rate": 9.991338106982598e-06, "loss": 1.46, "num_tokens": 4300333.0, "step": 22 }, { "epoch": 0.05502392344497608, "grad_norm": 0.7054066291049598, "learning_rate": 9.98903822616921e-06, "loss": 1.3554, "num_tokens": 4483986.0, "step": 23 }, { "epoch": 0.05741626794258373, "grad_norm": 0.7193972470009606, "learning_rate": 9.986468235255065e-06, "loss": 1.4998, "num_tokens": 4682593.0, "step": 24 }, { "epoch": 0.05980861244019139, "grad_norm": 0.6625723448730417, "learning_rate": 9.983628288879193e-06, "loss": 1.4898, "num_tokens": 4880940.0, "step": 25 }, { "epoch": 0.06220095693779904, "grad_norm": 0.7404539912651659, "learning_rate": 9.98051855792412e-06, "loss": 1.3321, "num_tokens": 5074700.0, "step": 26 }, { "epoch": 0.0645933014354067, "grad_norm": 0.7881264974132591, "learning_rate": 9.977139229505596e-06, "loss": 1.2212, "num_tokens": 5225193.0, "step": 27 }, { "epoch": 0.06698564593301436, "grad_norm": 0.6060089446257308, "learning_rate": 9.973490506961326e-06, "loss": 1.5731, "num_tokens": 5447459.0, "step": 28 }, { "epoch": 0.06937799043062201, "grad_norm": 0.618254776059864, "learning_rate": 9.969572609838745e-06, "loss": 1.4722, "num_tokens": 5676623.0, "step": 29 }, { "epoch": 0.07177033492822966, "grad_norm": 0.6304080009866732, "learning_rate": 9.965385773881795e-06, "loss": 1.3474, "num_tokens": 5898924.0, "step": 30 }, { "epoch": 0.07416267942583732, "grad_norm": 0.6104465608230878, "learning_rate": 9.960930251016752e-06, "loss": 1.4138, "num_tokens": 6089369.0, "step": 31 }, { "epoch": 0.07655502392344497, "grad_norm": 0.6581355504876419, "learning_rate": 9.956206309337067e-06, "loss": 1.4661, "num_tokens": 6294065.0, "step": 32 }, { "epoch": 0.07894736842105263, "grad_norm": 0.5866617107994286, "learning_rate": 9.951214233087223e-06, "loss": 1.4306, "num_tokens": 6515957.0, "step": 33 }, { "epoch": 0.08133971291866028, "grad_norm": 0.605393818271364, "learning_rate": 9.945954322645643e-06, "loss": 1.3046, "num_tokens": 6725025.0, "step": 34 }, { "epoch": 0.08373205741626795, "grad_norm": 0.5778342378194031, "learning_rate": 9.940426894506608e-06, "loss": 1.4363, "num_tokens": 6949955.0, "step": 35 }, { "epoch": 0.0861244019138756, "grad_norm": 0.6258805596031615, "learning_rate": 9.934632281261221e-06, "loss": 1.3519, "num_tokens": 7152815.0, "step": 36 }, { "epoch": 0.08851674641148326, "grad_norm": 0.5788764918533683, "learning_rate": 9.928570831577396e-06, "loss": 1.4289, "num_tokens": 7365760.0, "step": 37 }, { "epoch": 0.09090909090909091, "grad_norm": 0.6104478143341243, "learning_rate": 9.922242910178862e-06, "loss": 1.4927, "num_tokens": 7619917.0, "step": 38 }, { "epoch": 0.09330143540669857, "grad_norm": 0.666007518151506, "learning_rate": 9.915648897823232e-06, "loss": 1.1965, "num_tokens": 7772797.0, "step": 39 }, { "epoch": 0.09569377990430622, "grad_norm": 0.7244485739284531, "learning_rate": 9.908789191279093e-06, "loss": 1.3198, "num_tokens": 7978612.0, "step": 40 }, { "epoch": 0.09808612440191387, "grad_norm": 0.6500652663575426, "learning_rate": 9.901664203302126e-06, "loss": 1.3692, "num_tokens": 8181944.0, "step": 41 }, { "epoch": 0.10047846889952153, "grad_norm": 0.6523516464098081, "learning_rate": 9.89427436261027e-06, "loss": 1.2651, "num_tokens": 8349921.0, "step": 42 }, { "epoch": 0.10287081339712918, "grad_norm": 0.6141096849362858, "learning_rate": 9.886620113857926e-06, "loss": 1.1674, "num_tokens": 8513062.0, "step": 43 }, { "epoch": 0.10526315789473684, "grad_norm": 0.5176000363276883, "learning_rate": 9.878701917609208e-06, "loss": 1.3363, "num_tokens": 8739362.0, "step": 44 }, { "epoch": 0.1076555023923445, "grad_norm": 0.6496907081327192, "learning_rate": 9.870520250310223e-06, "loss": 1.2051, "num_tokens": 8882227.0, "step": 45 }, { "epoch": 0.11004784688995216, "grad_norm": 0.5781609822463768, "learning_rate": 9.862075604260402e-06, "loss": 1.4038, "num_tokens": 9101362.0, "step": 46 }, { "epoch": 0.11244019138755981, "grad_norm": 0.7191639780141069, "learning_rate": 9.853368487582888e-06, "loss": 1.1333, "num_tokens": 9286876.0, "step": 47 }, { "epoch": 0.11483253588516747, "grad_norm": 0.6406116951034948, "learning_rate": 9.84439942419395e-06, "loss": 1.4121, "num_tokens": 9459192.0, "step": 48 }, { "epoch": 0.11722488038277512, "grad_norm": 0.5661996222062946, "learning_rate": 9.835168953771463e-06, "loss": 1.322, "num_tokens": 9724803.0, "step": 49 }, { "epoch": 0.11961722488038277, "grad_norm": 0.5715728086031884, "learning_rate": 9.825677631722436e-06, "loss": 1.3516, "num_tokens": 9933571.0, "step": 50 }, { "epoch": 0.12200956937799043, "grad_norm": 0.6325774615690734, "learning_rate": 9.815926029149593e-06, "loss": 1.258, "num_tokens": 10136490.0, "step": 51 }, { "epoch": 0.12440191387559808, "grad_norm": 0.5904482238857803, "learning_rate": 9.805914732817007e-06, "loss": 1.293, "num_tokens": 10340564.0, "step": 52 }, { "epoch": 0.12679425837320574, "grad_norm": 0.5710320806437825, "learning_rate": 9.795644345114796e-06, "loss": 1.2765, "num_tokens": 10553400.0, "step": 53 }, { "epoch": 0.1291866028708134, "grad_norm": 0.622309054620362, "learning_rate": 9.78511548402287e-06, "loss": 1.123, "num_tokens": 10758112.0, "step": 54 }, { "epoch": 0.13157894736842105, "grad_norm": 0.7557997838257337, "learning_rate": 9.77432878307376e-06, "loss": 1.1149, "num_tokens": 10934718.0, "step": 55 }, { "epoch": 0.1339712918660287, "grad_norm": 0.4774648627893749, "learning_rate": 9.763284891314481e-06, "loss": 1.4329, "num_tokens": 11227923.0, "step": 56 }, { "epoch": 0.13636363636363635, "grad_norm": 0.6518939385243675, "learning_rate": 9.751984473267498e-06, "loss": 1.2629, "num_tokens": 11417535.0, "step": 57 }, { "epoch": 0.13875598086124402, "grad_norm": 0.5370370863120535, "learning_rate": 9.740428208890716e-06, "loss": 1.3426, "num_tokens": 11651380.0, "step": 58 }, { "epoch": 0.14114832535885166, "grad_norm": 0.5696851508370838, "learning_rate": 9.728616793536588e-06, "loss": 1.125, "num_tokens": 11830736.0, "step": 59 }, { "epoch": 0.14354066985645933, "grad_norm": 0.5644132429290988, "learning_rate": 9.716550937910268e-06, "loss": 1.2145, "num_tokens": 12023638.0, "step": 60 }, { "epoch": 0.145933014354067, "grad_norm": 0.7461647382617252, "learning_rate": 9.70423136802684e-06, "loss": 1.204, "num_tokens": 12234061.0, "step": 61 }, { "epoch": 0.14832535885167464, "grad_norm": 0.5086888568285274, "learning_rate": 9.691658825167641e-06, "loss": 1.3124, "num_tokens": 12472421.0, "step": 62 }, { "epoch": 0.1507177033492823, "grad_norm": 0.5053241954118645, "learning_rate": 9.67883406583566e-06, "loss": 1.3634, "num_tokens": 12734106.0, "step": 63 }, { "epoch": 0.15311004784688995, "grad_norm": 0.5179034670964426, "learning_rate": 9.665757861710008e-06, "loss": 1.3053, "num_tokens": 12960684.0, "step": 64 }, { "epoch": 0.15550239234449761, "grad_norm": 0.5461947358982723, "learning_rate": 9.652430999599491e-06, "loss": 1.2969, "num_tokens": 13170331.0, "step": 65 }, { "epoch": 0.15789473684210525, "grad_norm": 0.6423563162262463, "learning_rate": 9.638854281395271e-06, "loss": 1.3541, "num_tokens": 13397481.0, "step": 66 }, { "epoch": 0.16028708133971292, "grad_norm": 0.5755576573234283, "learning_rate": 9.625028524022606e-06, "loss": 1.2183, "num_tokens": 13638917.0, "step": 67 }, { "epoch": 0.16267942583732056, "grad_norm": 0.6393096708849371, "learning_rate": 9.610954559391704e-06, "loss": 1.2774, "num_tokens": 13845779.0, "step": 68 }, { "epoch": 0.16507177033492823, "grad_norm": 0.6238780043211961, "learning_rate": 9.596633234347661e-06, "loss": 1.0493, "num_tokens": 14015645.0, "step": 69 }, { "epoch": 0.1674641148325359, "grad_norm": 0.6004590974749275, "learning_rate": 9.582065410619503e-06, "loss": 1.1128, "num_tokens": 14174170.0, "step": 70 }, { "epoch": 0.16985645933014354, "grad_norm": 0.5353801191806298, "learning_rate": 9.567251964768343e-06, "loss": 1.2534, "num_tokens": 14391398.0, "step": 71 }, { "epoch": 0.1722488038277512, "grad_norm": 0.5703356560477955, "learning_rate": 9.55219378813463e-06, "loss": 1.2457, "num_tokens": 14610731.0, "step": 72 }, { "epoch": 0.17464114832535885, "grad_norm": 0.5213842592670314, "learning_rate": 9.53689178678452e-06, "loss": 1.3794, "num_tokens": 14858252.0, "step": 73 }, { "epoch": 0.17703349282296652, "grad_norm": 0.5665738251245545, "learning_rate": 9.521346881455356e-06, "loss": 1.3718, "num_tokens": 15084332.0, "step": 74 }, { "epoch": 0.17942583732057416, "grad_norm": 0.5432851738944047, "learning_rate": 9.505560007500263e-06, "loss": 1.2429, "num_tokens": 15352232.0, "step": 75 }, { "epoch": 0.18181818181818182, "grad_norm": 0.6029856670534988, "learning_rate": 9.489532114831876e-06, "loss": 1.1883, "num_tokens": 15574514.0, "step": 76 }, { "epoch": 0.18421052631578946, "grad_norm": 0.5636116286831033, "learning_rate": 9.473264167865172e-06, "loss": 1.1939, "num_tokens": 15788273.0, "step": 77 }, { "epoch": 0.18660287081339713, "grad_norm": 0.5273294226554239, "learning_rate": 9.456757145459445e-06, "loss": 1.3284, "num_tokens": 16058083.0, "step": 78 }, { "epoch": 0.18899521531100477, "grad_norm": 0.6091499871383838, "learning_rate": 9.44001204085941e-06, "loss": 1.1578, "num_tokens": 16222078.0, "step": 79 }, { "epoch": 0.19138755980861244, "grad_norm": 0.5729867351707406, "learning_rate": 9.423029861635431e-06, "loss": 1.1448, "num_tokens": 16452197.0, "step": 80 }, { "epoch": 0.1937799043062201, "grad_norm": 0.5753208503065251, "learning_rate": 9.405811629622904e-06, "loss": 1.3236, "num_tokens": 16678106.0, "step": 81 }, { "epoch": 0.19617224880382775, "grad_norm": 0.613469703266833, "learning_rate": 9.388358380860763e-06, "loss": 1.1021, "num_tokens": 16908054.0, "step": 82 }, { "epoch": 0.19856459330143542, "grad_norm": 0.6002222062441086, "learning_rate": 9.370671165529146e-06, "loss": 1.1476, "num_tokens": 17140981.0, "step": 83 }, { "epoch": 0.20095693779904306, "grad_norm": 0.5295041630429093, "learning_rate": 9.3527510478862e-06, "loss": 1.2725, "num_tokens": 17364693.0, "step": 84 }, { "epoch": 0.20334928229665072, "grad_norm": 0.5369203542352684, "learning_rate": 9.334599106204051e-06, "loss": 1.2895, "num_tokens": 17563578.0, "step": 85 }, { "epoch": 0.20574162679425836, "grad_norm": 0.5193929587177428, "learning_rate": 9.316216432703918e-06, "loss": 1.2499, "num_tokens": 17740374.0, "step": 86 }, { "epoch": 0.20813397129186603, "grad_norm": 0.49812886005887325, "learning_rate": 9.29760413349039e-06, "loss": 1.3455, "num_tokens": 18015806.0, "step": 87 }, { "epoch": 0.21052631578947367, "grad_norm": 0.5190241504997857, "learning_rate": 9.278763328484875e-06, "loss": 1.0828, "num_tokens": 18245485.0, "step": 88 }, { "epoch": 0.21291866028708134, "grad_norm": 0.534699634820348, "learning_rate": 9.259695151358215e-06, "loss": 1.2029, "num_tokens": 18441471.0, "step": 89 }, { "epoch": 0.215311004784689, "grad_norm": 0.5368146817909797, "learning_rate": 9.240400749462467e-06, "loss": 1.13, "num_tokens": 18659186.0, "step": 90 }, { "epoch": 0.21770334928229665, "grad_norm": 0.6643155654192867, "learning_rate": 9.220881283761868e-06, "loss": 1.1626, "num_tokens": 18811916.0, "step": 91 }, { "epoch": 0.22009569377990432, "grad_norm": 0.5953751009151461, "learning_rate": 9.20113792876298e-06, "loss": 1.1446, "num_tokens": 18974285.0, "step": 92 }, { "epoch": 0.22248803827751196, "grad_norm": 0.6067628035324104, "learning_rate": 9.181171872444015e-06, "loss": 1.2417, "num_tokens": 19182034.0, "step": 93 }, { "epoch": 0.22488038277511962, "grad_norm": 0.6396322460129866, "learning_rate": 9.160984316183354e-06, "loss": 1.0376, "num_tokens": 19324593.0, "step": 94 }, { "epoch": 0.22727272727272727, "grad_norm": 0.5167898612058803, "learning_rate": 9.140576474687263e-06, "loss": 1.0627, "num_tokens": 19559212.0, "step": 95 }, { "epoch": 0.22966507177033493, "grad_norm": 0.6898506829068124, "learning_rate": 9.1199495759168e-06, "loss": 1.0682, "num_tokens": 19734777.0, "step": 96 }, { "epoch": 0.23205741626794257, "grad_norm": 0.5632751758217261, "learning_rate": 9.099104861013922e-06, "loss": 1.2069, "num_tokens": 19924776.0, "step": 97 }, { "epoch": 0.23444976076555024, "grad_norm": 0.4975676948616479, "learning_rate": 9.078043584226816e-06, "loss": 1.2944, "num_tokens": 20166431.0, "step": 98 }, { "epoch": 0.23684210526315788, "grad_norm": 0.5811862630357938, "learning_rate": 9.056767012834417e-06, "loss": 1.2261, "num_tokens": 20342559.0, "step": 99 }, { "epoch": 0.23923444976076555, "grad_norm": 0.6205394909309613, "learning_rate": 9.035276427070166e-06, "loss": 1.1827, "num_tokens": 20528647.0, "step": 100 }, { "epoch": 0.24162679425837322, "grad_norm": 0.6101249338540917, "learning_rate": 9.013573120044968e-06, "loss": 1.0195, "num_tokens": 20735927.0, "step": 101 }, { "epoch": 0.24401913875598086, "grad_norm": 0.5589655982664236, "learning_rate": 8.991658397669384e-06, "loss": 1.2941, "num_tokens": 20973055.0, "step": 102 }, { "epoch": 0.24641148325358853, "grad_norm": 0.602415461668376, "learning_rate": 8.96953357857507e-06, "loss": 0.9238, "num_tokens": 21131698.0, "step": 103 }, { "epoch": 0.24880382775119617, "grad_norm": 0.4635975776481471, "learning_rate": 8.947199994035402e-06, "loss": 1.206, "num_tokens": 21426277.0, "step": 104 }, { "epoch": 0.2511961722488038, "grad_norm": 0.5416414335210736, "learning_rate": 8.924658987885403e-06, "loss": 1.1863, "num_tokens": 21629826.0, "step": 105 }, { "epoch": 0.2535885167464115, "grad_norm": 0.703889948074174, "learning_rate": 8.901911916440867e-06, "loss": 1.0592, "num_tokens": 21805342.0, "step": 106 }, { "epoch": 0.25598086124401914, "grad_norm": 0.5638998814508404, "learning_rate": 8.878960148416747e-06, "loss": 1.2387, "num_tokens": 21993750.0, "step": 107 }, { "epoch": 0.2583732057416268, "grad_norm": 0.5224818527209029, "learning_rate": 8.855805064844808e-06, "loss": 1.3391, "num_tokens": 22182974.0, "step": 108 }, { "epoch": 0.2607655502392344, "grad_norm": 0.5975570946282182, "learning_rate": 8.832448058990522e-06, "loss": 1.1119, "num_tokens": 22406584.0, "step": 109 }, { "epoch": 0.2631578947368421, "grad_norm": 0.5342575640517132, "learning_rate": 8.80889053626923e-06, "loss": 1.1556, "num_tokens": 22591986.0, "step": 110 }, { "epoch": 0.26555023923444976, "grad_norm": 0.6463928995023777, "learning_rate": 8.785133914161586e-06, "loss": 1.0927, "num_tokens": 22755674.0, "step": 111 }, { "epoch": 0.2679425837320574, "grad_norm": 0.5540394516081272, "learning_rate": 8.761179622128264e-06, "loss": 1.1932, "num_tokens": 22979344.0, "step": 112 }, { "epoch": 0.2703349282296651, "grad_norm": 0.5639562135925512, "learning_rate": 8.737029101523931e-06, "loss": 1.1062, "num_tokens": 23213393.0, "step": 113 }, { "epoch": 0.2727272727272727, "grad_norm": 0.47416665855465817, "learning_rate": 8.712683805510547e-06, "loss": 1.0925, "num_tokens": 23440736.0, "step": 114 }, { "epoch": 0.2751196172248804, "grad_norm": 0.6750642922896175, "learning_rate": 8.6881451989699e-06, "loss": 1.2461, "num_tokens": 23595366.0, "step": 115 }, { "epoch": 0.27751196172248804, "grad_norm": 0.5459520630146212, "learning_rate": 8.66341475841548e-06, "loss": 1.1222, "num_tokens": 23807492.0, "step": 116 }, { "epoch": 0.2799043062200957, "grad_norm": 0.5301705350454893, "learning_rate": 8.638493971903621e-06, "loss": 1.3022, "num_tokens": 24019959.0, "step": 117 }, { "epoch": 0.2822966507177033, "grad_norm": 0.6424194649582932, "learning_rate": 8.613384338943982e-06, "loss": 1.0574, "num_tokens": 24205265.0, "step": 118 }, { "epoch": 0.284688995215311, "grad_norm": 0.5546308776167657, "learning_rate": 8.588087370409303e-06, "loss": 1.2411, "num_tokens": 24429509.0, "step": 119 }, { "epoch": 0.28708133971291866, "grad_norm": 0.480470812260585, "learning_rate": 8.562604588444498e-06, "loss": 1.2674, "num_tokens": 24680453.0, "step": 120 }, { "epoch": 0.2894736842105263, "grad_norm": 0.5297827708710372, "learning_rate": 8.536937526375075e-06, "loss": 1.2252, "num_tokens": 24893378.0, "step": 121 }, { "epoch": 0.291866028708134, "grad_norm": 0.770470928681588, "learning_rate": 8.511087728614863e-06, "loss": 1.0353, "num_tokens": 25020898.0, "step": 122 }, { "epoch": 0.2942583732057416, "grad_norm": 0.5337837938457338, "learning_rate": 8.485056750573088e-06, "loss": 1.2966, "num_tokens": 25273187.0, "step": 123 }, { "epoch": 0.2966507177033493, "grad_norm": 0.592552325078839, "learning_rate": 8.458846158560787e-06, "loss": 1.1754, "num_tokens": 25469601.0, "step": 124 }, { "epoch": 0.29904306220095694, "grad_norm": 0.5958320399693818, "learning_rate": 8.43245752969655e-06, "loss": 1.069, "num_tokens": 25648408.0, "step": 125 }, { "epoch": 0.3014354066985646, "grad_norm": 0.624744711279868, "learning_rate": 8.40589245181163e-06, "loss": 1.1037, "num_tokens": 25866106.0, "step": 126 }, { "epoch": 0.3038277511961722, "grad_norm": 0.6392805038022229, "learning_rate": 8.379152523354407e-06, "loss": 1.1845, "num_tokens": 26058009.0, "step": 127 }, { "epoch": 0.3062200956937799, "grad_norm": 0.5505337156956458, "learning_rate": 8.352239353294196e-06, "loss": 1.245, "num_tokens": 26327152.0, "step": 128 }, { "epoch": 0.30861244019138756, "grad_norm": 0.5429338635678093, "learning_rate": 8.325154561024445e-06, "loss": 1.3208, "num_tokens": 26559334.0, "step": 129 }, { "epoch": 0.31100478468899523, "grad_norm": 0.5543720622642925, "learning_rate": 8.29789977626528e-06, "loss": 1.217, "num_tokens": 26754982.0, "step": 130 }, { "epoch": 0.3133971291866029, "grad_norm": 0.6525624593414054, "learning_rate": 8.270476638965463e-06, "loss": 1.0719, "num_tokens": 26887851.0, "step": 131 }, { "epoch": 0.3157894736842105, "grad_norm": 0.6284711216463389, "learning_rate": 8.242886799203696e-06, "loss": 1.1727, "num_tokens": 27042502.0, "step": 132 }, { "epoch": 0.3181818181818182, "grad_norm": 0.5632325030743454, "learning_rate": 8.215131917089342e-06, "loss": 1.1525, "num_tokens": 27248040.0, "step": 133 }, { "epoch": 0.32057416267942584, "grad_norm": 0.6252698594109136, "learning_rate": 8.187213662662539e-06, "loss": 1.0868, "num_tokens": 27463386.0, "step": 134 }, { "epoch": 0.3229665071770335, "grad_norm": 0.55667567195552, "learning_rate": 8.159133715793701e-06, "loss": 1.1098, "num_tokens": 27684485.0, "step": 135 }, { "epoch": 0.3253588516746411, "grad_norm": 0.5109763125317217, "learning_rate": 8.13089376608245e-06, "loss": 1.1185, "num_tokens": 27901192.0, "step": 136 }, { "epoch": 0.3277511961722488, "grad_norm": 0.5657322857245803, "learning_rate": 8.102495512755939e-06, "loss": 1.3105, "num_tokens": 28138162.0, "step": 137 }, { "epoch": 0.33014354066985646, "grad_norm": 0.5063120233634636, "learning_rate": 8.073940664566623e-06, "loss": 1.2374, "num_tokens": 28355174.0, "step": 138 }, { "epoch": 0.33253588516746413, "grad_norm": 0.5701958065694588, "learning_rate": 8.045230939689425e-06, "loss": 1.1063, "num_tokens": 28521259.0, "step": 139 }, { "epoch": 0.3349282296650718, "grad_norm": 0.540247926031648, "learning_rate": 8.016368065618361e-06, "loss": 1.0551, "num_tokens": 28746191.0, "step": 140 }, { "epoch": 0.3373205741626794, "grad_norm": 0.5340355745257312, "learning_rate": 7.987353779062598e-06, "loss": 1.235, "num_tokens": 29022355.0, "step": 141 }, { "epoch": 0.3397129186602871, "grad_norm": 0.5292859186809687, "learning_rate": 7.958189825841942e-06, "loss": 1.1531, "num_tokens": 29238427.0, "step": 142 }, { "epoch": 0.34210526315789475, "grad_norm": 0.7322544316739465, "learning_rate": 7.928877960781808e-06, "loss": 0.9135, "num_tokens": 29379111.0, "step": 143 }, { "epoch": 0.3444976076555024, "grad_norm": 0.5080774575481332, "learning_rate": 7.899419947607611e-06, "loss": 1.2097, "num_tokens": 29627097.0, "step": 144 }, { "epoch": 0.34688995215311, "grad_norm": 0.5832151085081759, "learning_rate": 7.869817558838654e-06, "loss": 1.0816, "num_tokens": 29832123.0, "step": 145 }, { "epoch": 0.3492822966507177, "grad_norm": 0.5206108052264397, "learning_rate": 7.840072575681468e-06, "loss": 1.108, "num_tokens": 30048644.0, "step": 146 }, { "epoch": 0.35167464114832536, "grad_norm": 0.5570271309488313, "learning_rate": 7.810186787922645e-06, "loss": 1.1653, "num_tokens": 30247851.0, "step": 147 }, { "epoch": 0.35406698564593303, "grad_norm": 0.4918371375990957, "learning_rate": 7.78016199382112e-06, "loss": 1.1408, "num_tokens": 30527686.0, "step": 148 }, { "epoch": 0.35645933014354064, "grad_norm": 0.5481932300046403, "learning_rate": 7.75e-06, "loss": 1.2044, "num_tokens": 30723713.0, "step": 149 }, { "epoch": 0.3588516746411483, "grad_norm": 0.6651847229876482, "learning_rate": 7.719702621337834e-06, "loss": 1.0119, "num_tokens": 30898218.0, "step": 150 }, { "epoch": 0.361244019138756, "grad_norm": 0.46633215220880386, "learning_rate": 7.68927168085942e-06, "loss": 1.1705, "num_tokens": 31126739.0, "step": 151 }, { "epoch": 0.36363636363636365, "grad_norm": 0.5876480626961219, "learning_rate": 7.658709009626109e-06, "loss": 0.9351, "num_tokens": 31301729.0, "step": 152 }, { "epoch": 0.3660287081339713, "grad_norm": 0.49945896590659167, "learning_rate": 7.628016446625626e-06, "loss": 1.2641, "num_tokens": 31531161.0, "step": 153 }, { "epoch": 0.3684210526315789, "grad_norm": 0.5384303848101453, "learning_rate": 7.597195838661426e-06, "loss": 1.1977, "num_tokens": 31785635.0, "step": 154 }, { "epoch": 0.3708133971291866, "grad_norm": 0.6031598977170286, "learning_rate": 7.566249040241553e-06, "loss": 1.0982, "num_tokens": 32017995.0, "step": 155 }, { "epoch": 0.37320574162679426, "grad_norm": 0.5114284004215709, "learning_rate": 7.53517791346707e-06, "loss": 1.2633, "num_tokens": 32246103.0, "step": 156 }, { "epoch": 0.37559808612440193, "grad_norm": 0.511553264808467, "learning_rate": 7.503984327920003e-06, "loss": 1.1566, "num_tokens": 32461173.0, "step": 157 }, { "epoch": 0.37799043062200954, "grad_norm": 0.4861428494553005, "learning_rate": 7.472670160550849e-06, "loss": 1.2219, "num_tokens": 32710394.0, "step": 158 }, { "epoch": 0.3803827751196172, "grad_norm": 0.591981436959529, "learning_rate": 7.441237295565642e-06, "loss": 1.275, "num_tokens": 32910997.0, "step": 159 }, { "epoch": 0.3827751196172249, "grad_norm": 0.5171815810924354, "learning_rate": 7.409687624312569e-06, "loss": 1.2906, "num_tokens": 33191166.0, "step": 160 }, { "epoch": 0.38516746411483255, "grad_norm": 0.6093674065623558, "learning_rate": 7.378023045168181e-06, "loss": 1.1703, "num_tokens": 33380845.0, "step": 161 }, { "epoch": 0.3875598086124402, "grad_norm": 0.5521223923681069, "learning_rate": 7.346245463423148e-06, "loss": 1.1532, "num_tokens": 33553617.0, "step": 162 }, { "epoch": 0.38995215311004783, "grad_norm": 0.5177157946810159, "learning_rate": 7.314356791167626e-06, "loss": 1.1612, "num_tokens": 33785498.0, "step": 163 }, { "epoch": 0.3923444976076555, "grad_norm": 0.5060522779515988, "learning_rate": 7.282358947176207e-06, "loss": 1.3366, "num_tokens": 34019728.0, "step": 164 }, { "epoch": 0.39473684210526316, "grad_norm": 0.5610143836266379, "learning_rate": 7.250253856792452e-06, "loss": 1.2572, "num_tokens": 34236289.0, "step": 165 }, { "epoch": 0.39712918660287083, "grad_norm": 0.5606343028811931, "learning_rate": 7.218043451813058e-06, "loss": 1.0956, "num_tokens": 34415700.0, "step": 166 }, { "epoch": 0.39952153110047844, "grad_norm": 0.5775794108416966, "learning_rate": 7.185729670371605e-06, "loss": 1.015, "num_tokens": 34605985.0, "step": 167 }, { "epoch": 0.4019138755980861, "grad_norm": 0.6312411170295402, "learning_rate": 7.153314456821942e-06, "loss": 0.922, "num_tokens": 34748670.0, "step": 168 }, { "epoch": 0.4043062200956938, "grad_norm": 0.5132788880980301, "learning_rate": 7.120799761621198e-06, "loss": 1.2394, "num_tokens": 34976413.0, "step": 169 }, { "epoch": 0.40669856459330145, "grad_norm": 0.5618840133734496, "learning_rate": 7.08818754121241e-06, "loss": 1.0443, "num_tokens": 35182351.0, "step": 170 }, { "epoch": 0.4090909090909091, "grad_norm": 0.5771799652861468, "learning_rate": 7.0554797579068155e-06, "loss": 1.0114, "num_tokens": 35384554.0, "step": 171 }, { "epoch": 0.41148325358851673, "grad_norm": 0.4649122455940863, "learning_rate": 7.022678379765766e-06, "loss": 1.2349, "num_tokens": 35658712.0, "step": 172 }, { "epoch": 0.4138755980861244, "grad_norm": 0.57386723032485, "learning_rate": 6.989785380482313e-06, "loss": 1.0024, "num_tokens": 35853348.0, "step": 173 }, { "epoch": 0.41626794258373206, "grad_norm": 0.5785841074184913, "learning_rate": 6.956802739262446e-06, "loss": 1.1307, "num_tokens": 36048889.0, "step": 174 }, { "epoch": 0.41866028708133973, "grad_norm": 0.5209762559962196, "learning_rate": 6.923732440706005e-06, "loss": 1.032, "num_tokens": 36250421.0, "step": 175 }, { "epoch": 0.42105263157894735, "grad_norm": 0.49999578979845366, "learning_rate": 6.890576474687264e-06, "loss": 1.3027, "num_tokens": 36467223.0, "step": 176 }, { "epoch": 0.423444976076555, "grad_norm": 0.44607951021905534, "learning_rate": 6.857336836235195e-06, "loss": 1.2908, "num_tokens": 36786228.0, "step": 177 }, { "epoch": 0.4258373205741627, "grad_norm": 0.5405149465909439, "learning_rate": 6.824015525413428e-06, "loss": 1.2206, "num_tokens": 36987436.0, "step": 178 }, { "epoch": 0.42822966507177035, "grad_norm": 0.5101094166751247, "learning_rate": 6.790614547199908e-06, "loss": 1.3338, "num_tokens": 37173969.0, "step": 179 }, { "epoch": 0.430622009569378, "grad_norm": 0.5018404262587114, "learning_rate": 6.7571359113662405e-06, "loss": 0.9635, "num_tokens": 37430838.0, "step": 180 }, { "epoch": 0.43301435406698563, "grad_norm": 0.5186179578093245, "learning_rate": 6.723581632356783e-06, "loss": 1.1317, "num_tokens": 37614321.0, "step": 181 }, { "epoch": 0.4354066985645933, "grad_norm": 0.5092089036024817, "learning_rate": 6.689953729167411e-06, "loss": 1.1989, "num_tokens": 37828436.0, "step": 182 }, { "epoch": 0.43779904306220097, "grad_norm": 0.5779182575588276, "learning_rate": 6.65625422522405e-06, "loss": 1.0699, "num_tokens": 37994173.0, "step": 183 }, { "epoch": 0.44019138755980863, "grad_norm": 0.5213748156719571, "learning_rate": 6.622485148260916e-06, "loss": 1.142, "num_tokens": 38226513.0, "step": 184 }, { "epoch": 0.44258373205741625, "grad_norm": 0.5124918281868935, "learning_rate": 6.588648530198505e-06, "loss": 1.0789, "num_tokens": 38424535.0, "step": 185 }, { "epoch": 0.4449760765550239, "grad_norm": 0.4965284532552029, "learning_rate": 6.554746407021332e-06, "loss": 1.2216, "num_tokens": 38662320.0, "step": 186 }, { "epoch": 0.4473684210526316, "grad_norm": 0.5776130784552208, "learning_rate": 6.520780818655421e-06, "loss": 1.2425, "num_tokens": 38852666.0, "step": 187 }, { "epoch": 0.44976076555023925, "grad_norm": 0.5433597025027418, "learning_rate": 6.486753808845565e-06, "loss": 1.1762, "num_tokens": 39020645.0, "step": 188 }, { "epoch": 0.45215311004784686, "grad_norm": 0.5851211313289845, "learning_rate": 6.45266742503235e-06, "loss": 1.1301, "num_tokens": 39229647.0, "step": 189 }, { "epoch": 0.45454545454545453, "grad_norm": 0.5580553839960908, "learning_rate": 6.418523718228952e-06, "loss": 1.1287, "num_tokens": 39423404.0, "step": 190 }, { "epoch": 0.4569377990430622, "grad_norm": 0.5702668222438311, "learning_rate": 6.3843247428977365e-06, "loss": 1.1402, "num_tokens": 39603933.0, "step": 191 }, { "epoch": 0.45933014354066987, "grad_norm": 0.5524617168766218, "learning_rate": 6.350072556826632e-06, "loss": 1.0908, "num_tokens": 39799631.0, "step": 192 }, { "epoch": 0.46172248803827753, "grad_norm": 0.5054083920538464, "learning_rate": 6.315769221005313e-06, "loss": 1.1696, "num_tokens": 40042491.0, "step": 193 }, { "epoch": 0.46411483253588515, "grad_norm": 0.4984596483043875, "learning_rate": 6.281416799501188e-06, "loss": 0.9211, "num_tokens": 40228565.0, "step": 194 }, { "epoch": 0.4665071770334928, "grad_norm": 0.5341608488908804, "learning_rate": 6.247017359335199e-06, "loss": 1.2083, "num_tokens": 40410247.0, "step": 195 }, { "epoch": 0.4688995215311005, "grad_norm": 0.5046486493573384, "learning_rate": 6.2125729703574534e-06, "loss": 1.2149, "num_tokens": 40651771.0, "step": 196 }, { "epoch": 0.47129186602870815, "grad_norm": 0.6097314899954371, "learning_rate": 6.178085705122675e-06, "loss": 1.0858, "num_tokens": 40855435.0, "step": 197 }, { "epoch": 0.47368421052631576, "grad_norm": 0.5774665348623625, "learning_rate": 6.143557638765494e-06, "loss": 1.122, "num_tokens": 41030495.0, "step": 198 }, { "epoch": 0.47607655502392343, "grad_norm": 0.48860350341505726, "learning_rate": 6.108990848875591e-06, "loss": 1.3412, "num_tokens": 41277045.0, "step": 199 }, { "epoch": 0.4784688995215311, "grad_norm": 0.5361962907700251, "learning_rate": 6.074387415372677e-06, "loss": 1.0927, "num_tokens": 41500279.0, "step": 200 }, { "epoch": 0.48086124401913877, "grad_norm": 0.6039231448287091, "learning_rate": 6.039749420381349e-06, "loss": 1.1362, "num_tokens": 41677455.0, "step": 201 }, { "epoch": 0.48325358851674644, "grad_norm": 0.5131741268531921, "learning_rate": 6.005078948105808e-06, "loss": 1.2406, "num_tokens": 41894065.0, "step": 202 }, { "epoch": 0.48564593301435405, "grad_norm": 0.47724842296291775, "learning_rate": 5.970378084704441e-06, "loss": 1.0304, "num_tokens": 42128139.0, "step": 203 }, { "epoch": 0.4880382775119617, "grad_norm": 0.5240356233196276, "learning_rate": 5.935648918164308e-06, "loss": 1.0814, "num_tokens": 42333521.0, "step": 204 }, { "epoch": 0.4904306220095694, "grad_norm": 0.5251041508662586, "learning_rate": 5.90089353817549e-06, "loss": 1.1679, "num_tokens": 42533301.0, "step": 205 }, { "epoch": 0.49282296650717705, "grad_norm": 0.6532050533136743, "learning_rate": 5.866114036005363e-06, "loss": 0.9818, "num_tokens": 42694701.0, "step": 206 }, { "epoch": 0.49521531100478466, "grad_norm": 0.6836388656935797, "learning_rate": 5.831312504372762e-06, "loss": 1.0012, "num_tokens": 42809151.0, "step": 207 }, { "epoch": 0.49760765550239233, "grad_norm": 0.5030489700232146, "learning_rate": 5.796491037322054e-06, "loss": 1.1244, "num_tokens": 43035639.0, "step": 208 }, { "epoch": 0.5, "grad_norm": 0.5562880150972886, "learning_rate": 5.761651730097142e-06, "loss": 1.3298, "num_tokens": 43207069.0, "step": 209 }, { "epoch": 0.5023923444976076, "grad_norm": 0.5324885775750403, "learning_rate": 5.726796679015392e-06, "loss": 1.3305, "num_tokens": 43475398.0, "step": 210 }, { "epoch": 0.5047846889952153, "grad_norm": 0.6085427119073632, "learning_rate": 5.691927981341488e-06, "loss": 1.0097, "num_tokens": 43641183.0, "step": 211 }, { "epoch": 0.507177033492823, "grad_norm": 0.6541524113634078, "learning_rate": 5.657047735161256e-06, "loss": 0.7888, "num_tokens": 43820730.0, "step": 212 }, { "epoch": 0.5095693779904307, "grad_norm": 0.5724267971985464, "learning_rate": 5.622158039255394e-06, "loss": 1.1429, "num_tokens": 44013162.0, "step": 213 }, { "epoch": 0.5119617224880383, "grad_norm": 0.4888491874482519, "learning_rate": 5.58726099297321e-06, "loss": 1.0386, "num_tokens": 44259910.0, "step": 214 }, { "epoch": 0.5143540669856459, "grad_norm": 0.5678338260313958, "learning_rate": 5.552358696106288e-06, "loss": 1.175, "num_tokens": 44480685.0, "step": 215 }, { "epoch": 0.5167464114832536, "grad_norm": 0.5262339117176533, "learning_rate": 5.517453248762142e-06, "loss": 1.233, "num_tokens": 44690652.0, "step": 216 }, { "epoch": 0.5191387559808612, "grad_norm": 0.5686946242510297, "learning_rate": 5.482546751237859e-06, "loss": 0.9377, "num_tokens": 44905510.0, "step": 217 }, { "epoch": 0.5215311004784688, "grad_norm": 0.5096154075649568, "learning_rate": 5.447641303893715e-06, "loss": 0.9606, "num_tokens": 45121618.0, "step": 218 }, { "epoch": 0.5239234449760766, "grad_norm": 0.5027532121976238, "learning_rate": 5.412739007026791e-06, "loss": 1.3208, "num_tokens": 45328957.0, "step": 219 }, { "epoch": 0.5263157894736842, "grad_norm": 0.5955398795356434, "learning_rate": 5.377841960744607e-06, "loss": 1.0519, "num_tokens": 45470498.0, "step": 220 }, { "epoch": 0.5287081339712919, "grad_norm": 0.5632402633040062, "learning_rate": 5.342952264838748e-06, "loss": 1.0009, "num_tokens": 45690586.0, "step": 221 }, { "epoch": 0.5311004784688995, "grad_norm": 0.5530392228322656, "learning_rate": 5.308072018658512e-06, "loss": 1.0197, "num_tokens": 45915829.0, "step": 222 }, { "epoch": 0.5334928229665071, "grad_norm": 0.5560740776916706, "learning_rate": 5.273203320984611e-06, "loss": 1.0086, "num_tokens": 46125336.0, "step": 223 }, { "epoch": 0.5358851674641149, "grad_norm": 0.47936312873685966, "learning_rate": 5.23834826990286e-06, "loss": 1.2004, "num_tokens": 46386175.0, "step": 224 }, { "epoch": 0.5382775119617225, "grad_norm": 0.5451628089579803, "learning_rate": 5.203508962677947e-06, "loss": 1.1559, "num_tokens": 46618828.0, "step": 225 }, { "epoch": 0.5406698564593302, "grad_norm": 0.5352825379096331, "learning_rate": 5.168687495627239e-06, "loss": 1.1977, "num_tokens": 46873878.0, "step": 226 }, { "epoch": 0.5430622009569378, "grad_norm": 0.5328607455361074, "learning_rate": 5.1338859639946396e-06, "loss": 1.0719, "num_tokens": 47110612.0, "step": 227 }, { "epoch": 0.5454545454545454, "grad_norm": 0.5355655159606746, "learning_rate": 5.099106461824513e-06, "loss": 1.1536, "num_tokens": 47297604.0, "step": 228 }, { "epoch": 0.5478468899521531, "grad_norm": 0.652585538954601, "learning_rate": 5.064351081835695e-06, "loss": 1.1744, "num_tokens": 47508300.0, "step": 229 }, { "epoch": 0.5502392344497608, "grad_norm": 0.5726602885947132, "learning_rate": 5.02962191529556e-06, "loss": 0.9178, "num_tokens": 47674186.0, "step": 230 }, { "epoch": 0.5526315789473685, "grad_norm": 0.5227349746690181, "learning_rate": 4.9949210518941945e-06, "loss": 1.0537, "num_tokens": 47869064.0, "step": 231 }, { "epoch": 0.5550239234449761, "grad_norm": 0.5270482777761917, "learning_rate": 4.960250579618652e-06, "loss": 1.1318, "num_tokens": 48073543.0, "step": 232 }, { "epoch": 0.5574162679425837, "grad_norm": 0.5628820736414913, "learning_rate": 4.925612584627325e-06, "loss": 1.0542, "num_tokens": 48249518.0, "step": 233 }, { "epoch": 0.5598086124401914, "grad_norm": 0.5460319678028444, "learning_rate": 4.8910091511244115e-06, "loss": 1.0131, "num_tokens": 48471001.0, "step": 234 }, { "epoch": 0.562200956937799, "grad_norm": 0.5503254822986171, "learning_rate": 4.856442361234507e-06, "loss": 1.0773, "num_tokens": 48720980.0, "step": 235 }, { "epoch": 0.5645933014354066, "grad_norm": 0.5091545126296911, "learning_rate": 4.821914294877327e-06, "loss": 1.1478, "num_tokens": 48922782.0, "step": 236 }, { "epoch": 0.5669856459330144, "grad_norm": 0.5074108889085012, "learning_rate": 4.787427029642549e-06, "loss": 1.2534, "num_tokens": 49149522.0, "step": 237 }, { "epoch": 0.569377990430622, "grad_norm": 0.5849957930987398, "learning_rate": 4.752982640664804e-06, "loss": 1.0202, "num_tokens": 49321177.0, "step": 238 }, { "epoch": 0.5717703349282297, "grad_norm": 0.5347992211342384, "learning_rate": 4.718583200498814e-06, "loss": 1.2032, "num_tokens": 49544634.0, "step": 239 }, { "epoch": 0.5741626794258373, "grad_norm": 0.5280959102930131, "learning_rate": 4.684230778994688e-06, "loss": 1.1751, "num_tokens": 49724091.0, "step": 240 }, { "epoch": 0.5765550239234449, "grad_norm": 0.5164476203177735, "learning_rate": 4.64992744317337e-06, "loss": 1.1098, "num_tokens": 49929099.0, "step": 241 }, { "epoch": 0.5789473684210527, "grad_norm": 0.6667023806983443, "learning_rate": 4.615675257102265e-06, "loss": 0.9402, "num_tokens": 50081941.0, "step": 242 }, { "epoch": 0.5813397129186603, "grad_norm": 0.5023784414967131, "learning_rate": 4.58147628177105e-06, "loss": 1.01, "num_tokens": 50306579.0, "step": 243 }, { "epoch": 0.583732057416268, "grad_norm": 0.5370878293075974, "learning_rate": 4.547332574967653e-06, "loss": 1.079, "num_tokens": 50544895.0, "step": 244 }, { "epoch": 0.5861244019138756, "grad_norm": 0.5090426584844939, "learning_rate": 4.513246191154434e-06, "loss": 1.1825, "num_tokens": 50788203.0, "step": 245 }, { "epoch": 0.5885167464114832, "grad_norm": 0.4792828066902539, "learning_rate": 4.479219181344579e-06, "loss": 1.2301, "num_tokens": 51053982.0, "step": 246 }, { "epoch": 0.5909090909090909, "grad_norm": 0.49219719144165075, "learning_rate": 4.44525359297867e-06, "loss": 1.1711, "num_tokens": 51259911.0, "step": 247 }, { "epoch": 0.5933014354066986, "grad_norm": 0.5340406735561365, "learning_rate": 4.4113514698014955e-06, "loss": 1.1956, "num_tokens": 51473886.0, "step": 248 }, { "epoch": 0.5956937799043063, "grad_norm": 0.5702889032524951, "learning_rate": 4.377514851739085e-06, "loss": 1.1091, "num_tokens": 51735586.0, "step": 249 }, { "epoch": 0.5980861244019139, "grad_norm": 0.5115029340630267, "learning_rate": 4.3437457747759515e-06, "loss": 1.1343, "num_tokens": 51923001.0, "step": 250 }, { "epoch": 0.6004784688995215, "grad_norm": 0.4738251807559482, "learning_rate": 4.310046270832592e-06, "loss": 1.07, "num_tokens": 52167211.0, "step": 251 }, { "epoch": 0.6028708133971292, "grad_norm": 0.562569354089248, "learning_rate": 4.276418367643218e-06, "loss": 0.9359, "num_tokens": 52345300.0, "step": 252 }, { "epoch": 0.6052631578947368, "grad_norm": 0.6492878859321651, "learning_rate": 4.242864088633762e-06, "loss": 0.8908, "num_tokens": 52537210.0, "step": 253 }, { "epoch": 0.6076555023923444, "grad_norm": 0.6078233345214087, "learning_rate": 4.2093854528000955e-06, "loss": 0.8913, "num_tokens": 52695428.0, "step": 254 }, { "epoch": 0.6100478468899522, "grad_norm": 0.5115019352055596, "learning_rate": 4.175984474586572e-06, "loss": 1.0335, "num_tokens": 52945131.0, "step": 255 }, { "epoch": 0.6124401913875598, "grad_norm": 0.5875660189403787, "learning_rate": 4.142663163764806e-06, "loss": 0.941, "num_tokens": 53101160.0, "step": 256 }, { "epoch": 0.6148325358851675, "grad_norm": 0.5230885907461125, "learning_rate": 4.109423525312738e-06, "loss": 1.1472, "num_tokens": 53341330.0, "step": 257 }, { "epoch": 0.6172248803827751, "grad_norm": 0.676100542426314, "learning_rate": 4.076267559293996e-06, "loss": 0.9226, "num_tokens": 53477820.0, "step": 258 }, { "epoch": 0.6196172248803827, "grad_norm": 0.6027764896908601, "learning_rate": 4.043197260737556e-06, "loss": 1.1615, "num_tokens": 53655177.0, "step": 259 }, { "epoch": 0.6220095693779905, "grad_norm": 0.5114599101755669, "learning_rate": 4.0102146195176895e-06, "loss": 1.0848, "num_tokens": 53871093.0, "step": 260 }, { "epoch": 0.6244019138755981, "grad_norm": 0.5616877393452973, "learning_rate": 3.977321620234236e-06, "loss": 1.1293, "num_tokens": 54051884.0, "step": 261 }, { "epoch": 0.6267942583732058, "grad_norm": 0.5951828000342995, "learning_rate": 3.944520242093186e-06, "loss": 1.1116, "num_tokens": 54243302.0, "step": 262 }, { "epoch": 0.6291866028708134, "grad_norm": 0.5533241097093147, "learning_rate": 3.911812458787592e-06, "loss": 1.0339, "num_tokens": 54449587.0, "step": 263 }, { "epoch": 0.631578947368421, "grad_norm": 0.6391714671501187, "learning_rate": 3.8792002383788044e-06, "loss": 1.0188, "num_tokens": 54573282.0, "step": 264 }, { "epoch": 0.6339712918660287, "grad_norm": 0.48381850337769244, "learning_rate": 3.846685543178058e-06, "loss": 1.2549, "num_tokens": 54826368.0, "step": 265 }, { "epoch": 0.6363636363636364, "grad_norm": 0.49990948075130837, "learning_rate": 3.8142703296283954e-06, "loss": 1.1331, "num_tokens": 55080391.0, "step": 266 }, { "epoch": 0.638755980861244, "grad_norm": 0.5427808072503959, "learning_rate": 3.7819565481869426e-06, "loss": 1.1618, "num_tokens": 55285642.0, "step": 267 }, { "epoch": 0.6411483253588517, "grad_norm": 0.5747721632491769, "learning_rate": 3.7497461432075477e-06, "loss": 1.1053, "num_tokens": 55481520.0, "step": 268 }, { "epoch": 0.6435406698564593, "grad_norm": 0.5301204962544379, "learning_rate": 3.717641052823795e-06, "loss": 1.1108, "num_tokens": 55706780.0, "step": 269 }, { "epoch": 0.645933014354067, "grad_norm": 0.5775776454615925, "learning_rate": 3.6856432088323746e-06, "loss": 1.1119, "num_tokens": 55902431.0, "step": 270 }, { "epoch": 0.6483253588516746, "grad_norm": 0.5001600002488803, "learning_rate": 3.6537545365768543e-06, "loss": 0.9535, "num_tokens": 56104220.0, "step": 271 }, { "epoch": 0.6507177033492823, "grad_norm": 0.5699808255124916, "learning_rate": 3.6219769548318205e-06, "loss": 1.0524, "num_tokens": 56257950.0, "step": 272 }, { "epoch": 0.65311004784689, "grad_norm": 0.5003276838892392, "learning_rate": 3.5903123756874315e-06, "loss": 1.1485, "num_tokens": 56488654.0, "step": 273 }, { "epoch": 0.6555023923444976, "grad_norm": 0.6033119191336221, "learning_rate": 3.558762704434361e-06, "loss": 1.024, "num_tokens": 56686270.0, "step": 274 }, { "epoch": 0.6578947368421053, "grad_norm": 0.4693280395015428, "learning_rate": 3.527329839449152e-06, "loss": 1.136, "num_tokens": 56931317.0, "step": 275 }, { "epoch": 0.6602870813397129, "grad_norm": 0.5278398302464965, "learning_rate": 3.496015672079998e-06, "loss": 1.1571, "num_tokens": 57127263.0, "step": 276 }, { "epoch": 0.6626794258373205, "grad_norm": 0.49190545922349904, "learning_rate": 3.4648220865329312e-06, "loss": 1.0427, "num_tokens": 57354122.0, "step": 277 }, { "epoch": 0.6650717703349283, "grad_norm": 0.4934205228618601, "learning_rate": 3.4337509597584466e-06, "loss": 1.2705, "num_tokens": 57579975.0, "step": 278 }, { "epoch": 0.6674641148325359, "grad_norm": 0.6046200272271364, "learning_rate": 3.402804161338577e-06, "loss": 0.9143, "num_tokens": 57767139.0, "step": 279 }, { "epoch": 0.6698564593301436, "grad_norm": 0.5256841221145759, "learning_rate": 3.371983553374375e-06, "loss": 1.0864, "num_tokens": 57969542.0, "step": 280 }, { "epoch": 0.6722488038277512, "grad_norm": 0.5879727234811725, "learning_rate": 3.3412909903738937e-06, "loss": 0.9625, "num_tokens": 58145028.0, "step": 281 }, { "epoch": 0.6746411483253588, "grad_norm": 0.6263377798428889, "learning_rate": 3.310728319140581e-06, "loss": 0.9234, "num_tokens": 58312705.0, "step": 282 }, { "epoch": 0.6770334928229665, "grad_norm": 0.5407307381090947, "learning_rate": 3.2802973786621665e-06, "loss": 1.0687, "num_tokens": 58527623.0, "step": 283 }, { "epoch": 0.6794258373205742, "grad_norm": 0.5502001614125057, "learning_rate": 3.2500000000000015e-06, "loss": 1.1427, "num_tokens": 58772116.0, "step": 284 }, { "epoch": 0.6818181818181818, "grad_norm": 0.5686855066649326, "learning_rate": 3.2198380061788803e-06, "loss": 1.031, "num_tokens": 58948693.0, "step": 285 }, { "epoch": 0.6842105263157895, "grad_norm": 0.56427208726594, "learning_rate": 3.1898132120773566e-06, "loss": 1.0001, "num_tokens": 59160106.0, "step": 286 }, { "epoch": 0.6866028708133971, "grad_norm": 0.5178015680501699, "learning_rate": 3.1599274243185314e-06, "loss": 1.2459, "num_tokens": 59393828.0, "step": 287 }, { "epoch": 0.6889952153110048, "grad_norm": 0.6161696867803992, "learning_rate": 3.1301824411613473e-06, "loss": 1.077, "num_tokens": 59592707.0, "step": 288 }, { "epoch": 0.6913875598086124, "grad_norm": 0.49780237640470854, "learning_rate": 3.1005800523923906e-06, "loss": 1.1431, "num_tokens": 59812582.0, "step": 289 }, { "epoch": 0.69377990430622, "grad_norm": 0.5031207474545651, "learning_rate": 3.071122039218194e-06, "loss": 1.1467, "num_tokens": 60043641.0, "step": 290 }, { "epoch": 0.6961722488038278, "grad_norm": 0.574254924525526, "learning_rate": 3.0418101741580586e-06, "loss": 1.1918, "num_tokens": 60234442.0, "step": 291 }, { "epoch": 0.6985645933014354, "grad_norm": 0.5016769304104969, "learning_rate": 3.012646220937403e-06, "loss": 1.31, "num_tokens": 60456123.0, "step": 292 }, { "epoch": 0.7009569377990431, "grad_norm": 0.5058935049560537, "learning_rate": 2.98363193438164e-06, "loss": 0.9371, "num_tokens": 60672710.0, "step": 293 }, { "epoch": 0.7033492822966507, "grad_norm": 0.5351125304814696, "learning_rate": 2.9547690603105774e-06, "loss": 1.0698, "num_tokens": 60894772.0, "step": 294 }, { "epoch": 0.7057416267942583, "grad_norm": 0.5128628418090031, "learning_rate": 2.926059335433378e-06, "loss": 1.2298, "num_tokens": 61142587.0, "step": 295 }, { "epoch": 0.7081339712918661, "grad_norm": 0.5144613524379172, "learning_rate": 2.897504487244061e-06, "loss": 0.9337, "num_tokens": 61352129.0, "step": 296 }, { "epoch": 0.7105263157894737, "grad_norm": 0.5861410143772018, "learning_rate": 2.8691062339175512e-06, "loss": 0.9923, "num_tokens": 61498549.0, "step": 297 }, { "epoch": 0.7129186602870813, "grad_norm": 0.5481256980886055, "learning_rate": 2.8408662842063002e-06, "loss": 1.0957, "num_tokens": 61687826.0, "step": 298 }, { "epoch": 0.715311004784689, "grad_norm": 0.5582805882931381, "learning_rate": 2.8127863373374637e-06, "loss": 1.09, "num_tokens": 61877628.0, "step": 299 }, { "epoch": 0.7177033492822966, "grad_norm": 0.5983921444578938, "learning_rate": 2.7848680829106602e-06, "loss": 1.0968, "num_tokens": 62078858.0, "step": 300 }, { "epoch": 0.7200956937799043, "grad_norm": 0.5339997006585953, "learning_rate": 2.7571132007963074e-06, "loss": 1.1891, "num_tokens": 62265457.0, "step": 301 }, { "epoch": 0.722488038277512, "grad_norm": 0.5449456499746453, "learning_rate": 2.7295233610345384e-06, "loss": 1.0269, "num_tokens": 62488733.0, "step": 302 }, { "epoch": 0.7248803827751196, "grad_norm": 0.5699604526936535, "learning_rate": 2.7021002237347206e-06, "loss": 1.1336, "num_tokens": 62714416.0, "step": 303 }, { "epoch": 0.7272727272727273, "grad_norm": 0.6413790402904914, "learning_rate": 2.6748454389755576e-06, "loss": 0.9382, "num_tokens": 62890365.0, "step": 304 }, { "epoch": 0.7296650717703349, "grad_norm": 0.5390387726292147, "learning_rate": 2.647760646705804e-06, "loss": 1.0829, "num_tokens": 63120765.0, "step": 305 }, { "epoch": 0.7320574162679426, "grad_norm": 0.5984653976738545, "learning_rate": 2.620847476645594e-06, "loss": 0.9221, "num_tokens": 63320228.0, "step": 306 }, { "epoch": 0.7344497607655502, "grad_norm": 0.5801251118440074, "learning_rate": 2.5941075481883705e-06, "loss": 1.1212, "num_tokens": 63509873.0, "step": 307 }, { "epoch": 0.7368421052631579, "grad_norm": 0.5636489099209283, "learning_rate": 2.567542470303452e-06, "loss": 1.078, "num_tokens": 63745029.0, "step": 308 }, { "epoch": 0.7392344497607656, "grad_norm": 0.48725639119647585, "learning_rate": 2.5411538414392146e-06, "loss": 1.2125, "num_tokens": 63953310.0, "step": 309 }, { "epoch": 0.7416267942583732, "grad_norm": 0.549253240822144, "learning_rate": 2.5149432494269134e-06, "loss": 1.1192, "num_tokens": 64147381.0, "step": 310 }, { "epoch": 0.7440191387559809, "grad_norm": 0.5491580770023559, "learning_rate": 2.4889122713851397e-06, "loss": 0.9919, "num_tokens": 64340436.0, "step": 311 }, { "epoch": 0.7464114832535885, "grad_norm": 0.5164385106756677, "learning_rate": 2.463062473624927e-06, "loss": 1.0476, "num_tokens": 64568538.0, "step": 312 }, { "epoch": 0.7488038277511961, "grad_norm": 0.583840880433391, "learning_rate": 2.437395411555504e-06, "loss": 1.1016, "num_tokens": 64759586.0, "step": 313 }, { "epoch": 0.7511961722488039, "grad_norm": 0.5528719370540063, "learning_rate": 2.4119126295906997e-06, "loss": 1.1974, "num_tokens": 64942864.0, "step": 314 }, { "epoch": 0.7535885167464115, "grad_norm": 0.6028168080715274, "learning_rate": 2.3866156610560186e-06, "loss": 1.0019, "num_tokens": 65142788.0, "step": 315 }, { "epoch": 0.7559808612440191, "grad_norm": 0.5816986940686796, "learning_rate": 2.3615060280963797e-06, "loss": 1.2118, "num_tokens": 65362360.0, "step": 316 }, { "epoch": 0.7583732057416268, "grad_norm": 0.5809244671898545, "learning_rate": 2.3365852415845225e-06, "loss": 1.1267, "num_tokens": 65547922.0, "step": 317 }, { "epoch": 0.7607655502392344, "grad_norm": 0.5262370165475527, "learning_rate": 2.3118548010301015e-06, "loss": 1.1893, "num_tokens": 65731553.0, "step": 318 }, { "epoch": 0.7631578947368421, "grad_norm": 0.5357040610680347, "learning_rate": 2.2873161944894552e-06, "loss": 1.1869, "num_tokens": 65951250.0, "step": 319 }, { "epoch": 0.7655502392344498, "grad_norm": 0.5570433795031379, "learning_rate": 2.262970898476071e-06, "loss": 0.9916, "num_tokens": 66175000.0, "step": 320 }, { "epoch": 0.7679425837320574, "grad_norm": 0.604494546666767, "learning_rate": 2.2388203778717407e-06, "loss": 1.1347, "num_tokens": 66357517.0, "step": 321 }, { "epoch": 0.7703349282296651, "grad_norm": 0.5827904281357608, "learning_rate": 2.2148660858384147e-06, "loss": 1.0356, "num_tokens": 66566078.0, "step": 322 }, { "epoch": 0.7727272727272727, "grad_norm": 0.5218976553836495, "learning_rate": 2.1911094637307715e-06, "loss": 1.1124, "num_tokens": 66784937.0, "step": 323 }, { "epoch": 0.7751196172248804, "grad_norm": 0.49417380874831474, "learning_rate": 2.1675519410094803e-06, "loss": 1.1203, "num_tokens": 67057361.0, "step": 324 }, { "epoch": 0.777511961722488, "grad_norm": 0.6319926280044286, "learning_rate": 2.144194935155192e-06, "loss": 1.038, "num_tokens": 67276459.0, "step": 325 }, { "epoch": 0.7799043062200957, "grad_norm": 0.553450207558276, "learning_rate": 2.121039851583254e-06, "loss": 1.0843, "num_tokens": 67454638.0, "step": 326 }, { "epoch": 0.7822966507177034, "grad_norm": 0.5159208111364086, "learning_rate": 2.098088083559135e-06, "loss": 0.9358, "num_tokens": 67667938.0, "step": 327 }, { "epoch": 0.784688995215311, "grad_norm": 0.5059115925994171, "learning_rate": 2.0753410121145984e-06, "loss": 1.1579, "num_tokens": 67859669.0, "step": 328 }, { "epoch": 0.7870813397129187, "grad_norm": 0.5613491350937895, "learning_rate": 2.0528000059646e-06, "loss": 1.0022, "num_tokens": 68056005.0, "step": 329 }, { "epoch": 0.7894736842105263, "grad_norm": 0.5374042116513947, "learning_rate": 2.0304664214249326e-06, "loss": 1.0718, "num_tokens": 68255467.0, "step": 330 }, { "epoch": 0.7918660287081339, "grad_norm": 0.503580387927313, "learning_rate": 2.0083416023306163e-06, "loss": 1.1493, "num_tokens": 68469900.0, "step": 331 }, { "epoch": 0.7942583732057417, "grad_norm": 0.5884447457044938, "learning_rate": 1.986426879955034e-06, "loss": 0.9502, "num_tokens": 68685343.0, "step": 332 }, { "epoch": 0.7966507177033493, "grad_norm": 0.6834427409407543, "learning_rate": 1.9647235729298346e-06, "loss": 0.9018, "num_tokens": 68834514.0, "step": 333 }, { "epoch": 0.7990430622009569, "grad_norm": 0.5189288186456062, "learning_rate": 1.9432329871655837e-06, "loss": 1.2691, "num_tokens": 69046003.0, "step": 334 }, { "epoch": 0.8014354066985646, "grad_norm": 0.516776960640009, "learning_rate": 1.9219564157731848e-06, "loss": 1.0057, "num_tokens": 69272731.0, "step": 335 }, { "epoch": 0.8038277511961722, "grad_norm": 0.4831598833288486, "learning_rate": 1.9008951389860785e-06, "loss": 1.1143, "num_tokens": 69508303.0, "step": 336 }, { "epoch": 0.80622009569378, "grad_norm": 0.5753229158728437, "learning_rate": 1.8800504240832012e-06, "loss": 1.1146, "num_tokens": 69706781.0, "step": 337 }, { "epoch": 0.8086124401913876, "grad_norm": 0.5983941033127453, "learning_rate": 1.8594235253127373e-06, "loss": 1.1979, "num_tokens": 69926110.0, "step": 338 }, { "epoch": 0.8110047846889952, "grad_norm": 0.5114846230853078, "learning_rate": 1.8390156838166464e-06, "loss": 1.016, "num_tokens": 70133509.0, "step": 339 }, { "epoch": 0.8133971291866029, "grad_norm": 0.5260668256751079, "learning_rate": 1.8188281275559866e-06, "loss": 1.0266, "num_tokens": 70365768.0, "step": 340 }, { "epoch": 0.8157894736842105, "grad_norm": 0.5595038468322735, "learning_rate": 1.7988620712370197e-06, "loss": 1.1005, "num_tokens": 70548685.0, "step": 341 }, { "epoch": 0.8181818181818182, "grad_norm": 0.6890712705743423, "learning_rate": 1.7791187162381325e-06, "loss": 1.0739, "num_tokens": 70725591.0, "step": 342 }, { "epoch": 0.8205741626794258, "grad_norm": 0.5344037158436257, "learning_rate": 1.759599250537534e-06, "loss": 1.1548, "num_tokens": 70943507.0, "step": 343 }, { "epoch": 0.8229665071770335, "grad_norm": 0.5589105656078766, "learning_rate": 1.740304848641787e-06, "loss": 1.0402, "num_tokens": 71137045.0, "step": 344 }, { "epoch": 0.8253588516746412, "grad_norm": 0.5768929116638776, "learning_rate": 1.7212366715151263e-06, "loss": 0.9768, "num_tokens": 71350643.0, "step": 345 }, { "epoch": 0.8277511961722488, "grad_norm": 0.6276817700534357, "learning_rate": 1.702395866509612e-06, "loss": 0.9183, "num_tokens": 71539784.0, "step": 346 }, { "epoch": 0.8301435406698564, "grad_norm": 0.5484078243741392, "learning_rate": 1.6837835672960834e-06, "loss": 1.1514, "num_tokens": 71742614.0, "step": 347 }, { "epoch": 0.8325358851674641, "grad_norm": 0.5193578245554346, "learning_rate": 1.6654008937959498e-06, "loss": 0.9674, "num_tokens": 71994797.0, "step": 348 }, { "epoch": 0.8349282296650717, "grad_norm": 0.528358256622246, "learning_rate": 1.6472489521138016e-06, "loss": 1.108, "num_tokens": 72191401.0, "step": 349 }, { "epoch": 0.8373205741626795, "grad_norm": 0.5611551275004363, "learning_rate": 1.629328834470857e-06, "loss": 1.1481, "num_tokens": 72346485.0, "step": 350 }, { "epoch": 0.8397129186602871, "grad_norm": 0.4671315072196002, "learning_rate": 1.611641619139238e-06, "loss": 1.1736, "num_tokens": 72601665.0, "step": 351 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5555560185216512, "learning_rate": 1.5941883703770968e-06, "loss": 1.1533, "num_tokens": 72836095.0, "step": 352 }, { "epoch": 0.8444976076555024, "grad_norm": 0.5288816745801785, "learning_rate": 1.57697013836457e-06, "loss": 1.0494, "num_tokens": 73049430.0, "step": 353 }, { "epoch": 0.84688995215311, "grad_norm": 0.6233482042563366, "learning_rate": 1.5599879591405917e-06, "loss": 1.0147, "num_tokens": 73196007.0, "step": 354 }, { "epoch": 0.8492822966507177, "grad_norm": 0.5363849538121136, "learning_rate": 1.5432428545405554e-06, "loss": 1.1694, "num_tokens": 73396469.0, "step": 355 }, { "epoch": 0.8516746411483254, "grad_norm": 0.5932100916233094, "learning_rate": 1.526735832134829e-06, "loss": 1.0174, "num_tokens": 73584128.0, "step": 356 }, { "epoch": 0.854066985645933, "grad_norm": 0.6127092810753643, "learning_rate": 1.5104678851681253e-06, "loss": 0.8168, "num_tokens": 73717071.0, "step": 357 }, { "epoch": 0.8564593301435407, "grad_norm": 0.6293206669166083, "learning_rate": 1.4944399924997372e-06, "loss": 0.7752, "num_tokens": 73883367.0, "step": 358 }, { "epoch": 0.8588516746411483, "grad_norm": 0.531317141972036, "learning_rate": 1.4786531185446455e-06, "loss": 1.1077, "num_tokens": 74123207.0, "step": 359 }, { "epoch": 0.861244019138756, "grad_norm": 0.44768314533679704, "learning_rate": 1.4631082132154806e-06, "loss": 1.2024, "num_tokens": 74395731.0, "step": 360 }, { "epoch": 0.8636363636363636, "grad_norm": 0.4788316306745224, "learning_rate": 1.4478062118653703e-06, "loss": 1.1751, "num_tokens": 74663304.0, "step": 361 }, { "epoch": 0.8660287081339713, "grad_norm": 0.4783192674308249, "learning_rate": 1.4327480352316581e-06, "loss": 1.1805, "num_tokens": 74907925.0, "step": 362 }, { "epoch": 0.868421052631579, "grad_norm": 0.5707901460896949, "learning_rate": 1.417934589380498e-06, "loss": 1.0742, "num_tokens": 75130243.0, "step": 363 }, { "epoch": 0.8708133971291866, "grad_norm": 0.6017414939136261, "learning_rate": 1.4033667656523405e-06, "loss": 0.9557, "num_tokens": 75352077.0, "step": 364 }, { "epoch": 0.8732057416267942, "grad_norm": 0.4853066070350836, "learning_rate": 1.389045440608296e-06, "loss": 1.08, "num_tokens": 75592089.0, "step": 365 }, { "epoch": 0.8755980861244019, "grad_norm": 0.5253451321715548, "learning_rate": 1.374971475977394e-06, "loss": 1.2071, "num_tokens": 75818956.0, "step": 366 }, { "epoch": 0.8779904306220095, "grad_norm": 0.5659204983119508, "learning_rate": 1.361145718604731e-06, "loss": 1.1936, "num_tokens": 76017603.0, "step": 367 }, { "epoch": 0.8803827751196173, "grad_norm": 0.479841142759106, "learning_rate": 1.3475690004005098e-06, "loss": 1.191, "num_tokens": 76290864.0, "step": 368 }, { "epoch": 0.8827751196172249, "grad_norm": 0.5872255230326239, "learning_rate": 1.3342421382899936e-06, "loss": 1.0301, "num_tokens": 76529427.0, "step": 369 }, { "epoch": 0.8851674641148325, "grad_norm": 0.5029097871572791, "learning_rate": 1.3211659341643412e-06, "loss": 1.2066, "num_tokens": 76742589.0, "step": 370 }, { "epoch": 0.8875598086124402, "grad_norm": 0.584840618113796, "learning_rate": 1.308341174832359e-06, "loss": 0.9768, "num_tokens": 76939827.0, "step": 371 }, { "epoch": 0.8899521531100478, "grad_norm": 0.44994308377297715, "learning_rate": 1.2957686319731623e-06, "loss": 1.2925, "num_tokens": 77190390.0, "step": 372 }, { "epoch": 0.8923444976076556, "grad_norm": 0.614291349507059, "learning_rate": 1.2834490620897342e-06, "loss": 1.0009, "num_tokens": 77368607.0, "step": 373 }, { "epoch": 0.8947368421052632, "grad_norm": 0.5540701345571359, "learning_rate": 1.2713832064634127e-06, "loss": 1.281, "num_tokens": 77595326.0, "step": 374 }, { "epoch": 0.8971291866028708, "grad_norm": 0.5930336764639087, "learning_rate": 1.259571791109285e-06, "loss": 1.1882, "num_tokens": 77757257.0, "step": 375 }, { "epoch": 0.8995215311004785, "grad_norm": 0.5601557384818509, "learning_rate": 1.2480155267325039e-06, "loss": 0.9335, "num_tokens": 77966559.0, "step": 376 }, { "epoch": 0.9019138755980861, "grad_norm": 0.5146670174651209, "learning_rate": 1.2367151086855187e-06, "loss": 1.1928, "num_tokens": 78180912.0, "step": 377 }, { "epoch": 0.9043062200956937, "grad_norm": 0.585671381043156, "learning_rate": 1.2256712169262415e-06, "loss": 1.0569, "num_tokens": 78336709.0, "step": 378 }, { "epoch": 0.9066985645933014, "grad_norm": 0.5144842875674174, "learning_rate": 1.2148845159771311e-06, "loss": 1.0092, "num_tokens": 78603450.0, "step": 379 }, { "epoch": 0.9090909090909091, "grad_norm": 0.594728768695324, "learning_rate": 1.2043556548852065e-06, "loss": 1.0245, "num_tokens": 78852293.0, "step": 380 }, { "epoch": 0.9114832535885168, "grad_norm": 0.46010783326706295, "learning_rate": 1.1940852671829938e-06, "loss": 1.2352, "num_tokens": 79112672.0, "step": 381 }, { "epoch": 0.9138755980861244, "grad_norm": 0.601262109893317, "learning_rate": 1.184073970850408e-06, "loss": 1.1504, "num_tokens": 79319617.0, "step": 382 }, { "epoch": 0.916267942583732, "grad_norm": 0.5038692624203227, "learning_rate": 1.174322368277565e-06, "loss": 1.1967, "num_tokens": 79549771.0, "step": 383 }, { "epoch": 0.9186602870813397, "grad_norm": 0.5753103173201497, "learning_rate": 1.1648310462285386e-06, "loss": 1.1225, "num_tokens": 79738016.0, "step": 384 }, { "epoch": 0.9210526315789473, "grad_norm": 0.614917920007612, "learning_rate": 1.1556005758060517e-06, "loss": 0.9872, "num_tokens": 79913100.0, "step": 385 }, { "epoch": 0.9234449760765551, "grad_norm": 0.5342918968914316, "learning_rate": 1.146631512417113e-06, "loss": 1.0676, "num_tokens": 80103047.0, "step": 386 }, { "epoch": 0.9258373205741627, "grad_norm": 0.5439716109099237, "learning_rate": 1.1379243957395987e-06, "loss": 1.0585, "num_tokens": 80292737.0, "step": 387 }, { "epoch": 0.9282296650717703, "grad_norm": 0.5342393003750865, "learning_rate": 1.1294797496897786e-06, "loss": 1.1836, "num_tokens": 80512263.0, "step": 388 }, { "epoch": 0.930622009569378, "grad_norm": 0.4855841313887977, "learning_rate": 1.121298082390793e-06, "loss": 1.0198, "num_tokens": 80713362.0, "step": 389 }, { "epoch": 0.9330143540669856, "grad_norm": 0.5404438942427807, "learning_rate": 1.113379886142075e-06, "loss": 0.9669, "num_tokens": 80921168.0, "step": 390 }, { "epoch": 0.9354066985645934, "grad_norm": 0.5507820902601309, "learning_rate": 1.105725637389732e-06, "loss": 1.0652, "num_tokens": 81149885.0, "step": 391 }, { "epoch": 0.937799043062201, "grad_norm": 0.5015294273795851, "learning_rate": 1.0983357966978747e-06, "loss": 1.1452, "num_tokens": 81384820.0, "step": 392 }, { "epoch": 0.9401913875598086, "grad_norm": 0.5530079510762682, "learning_rate": 1.0912108087209075e-06, "loss": 1.0865, "num_tokens": 81577699.0, "step": 393 }, { "epoch": 0.9425837320574163, "grad_norm": 0.49796992979545124, "learning_rate": 1.084351102176769e-06, "loss": 0.9428, "num_tokens": 81803396.0, "step": 394 }, { "epoch": 0.9449760765550239, "grad_norm": 0.5777758192642776, "learning_rate": 1.0777570898211406e-06, "loss": 1.0373, "num_tokens": 81968827.0, "step": 395 }, { "epoch": 0.9473684210526315, "grad_norm": 0.5754456579892182, "learning_rate": 1.0714291684226054e-06, "loss": 1.0265, "num_tokens": 82166516.0, "step": 396 }, { "epoch": 0.9497607655502392, "grad_norm": 0.558633769969428, "learning_rate": 1.0653677187387787e-06, "loss": 1.0473, "num_tokens": 82338824.0, "step": 397 }, { "epoch": 0.9521531100478469, "grad_norm": 0.6176260102445734, "learning_rate": 1.0595731054933937e-06, "loss": 1.0043, "num_tokens": 82531186.0, "step": 398 }, { "epoch": 0.9545454545454546, "grad_norm": 0.5126700946523376, "learning_rate": 1.0540456773543596e-06, "loss": 1.2646, "num_tokens": 82735927.0, "step": 399 }, { "epoch": 0.9569377990430622, "grad_norm": 0.5671634428425157, "learning_rate": 1.0487857669127782e-06, "loss": 1.1623, "num_tokens": 82904745.0, "step": 400 }, { "epoch": 0.9593301435406698, "grad_norm": 0.559489922062985, "learning_rate": 1.0437936906629336e-06, "loss": 1.0435, "num_tokens": 83074515.0, "step": 401 }, { "epoch": 0.9617224880382775, "grad_norm": 0.5577904608135668, "learning_rate": 1.039069748983248e-06, "loss": 0.7559, "num_tokens": 83243340.0, "step": 402 }, { "epoch": 0.9641148325358851, "grad_norm": 0.5215879777836743, "learning_rate": 1.0346142261182064e-06, "loss": 1.1583, "num_tokens": 83474214.0, "step": 403 }, { "epoch": 0.9665071770334929, "grad_norm": 0.5509462473469403, "learning_rate": 1.0304273901612566e-06, "loss": 1.0304, "num_tokens": 83644954.0, "step": 404 }, { "epoch": 0.9688995215311005, "grad_norm": 0.536818549153514, "learning_rate": 1.0265094930386741e-06, "loss": 1.2204, "num_tokens": 83861919.0, "step": 405 }, { "epoch": 0.9712918660287081, "grad_norm": 0.5740452675590582, "learning_rate": 1.0228607704944048e-06, "loss": 0.9858, "num_tokens": 84024816.0, "step": 406 }, { "epoch": 0.9736842105263158, "grad_norm": 0.5261150137396471, "learning_rate": 1.0194814420758806e-06, "loss": 1.1349, "num_tokens": 84239403.0, "step": 407 }, { "epoch": 0.9760765550239234, "grad_norm": 0.6448679502450355, "learning_rate": 1.0163717111208086e-06, "loss": 0.9748, "num_tokens": 84432507.0, "step": 408 }, { "epoch": 0.9784688995215312, "grad_norm": 0.5218518317378777, "learning_rate": 1.0135317647449362e-06, "loss": 0.9739, "num_tokens": 84644408.0, "step": 409 }, { "epoch": 0.9808612440191388, "grad_norm": 0.5596368200732923, "learning_rate": 1.0109617738307914e-06, "loss": 1.0414, "num_tokens": 84854304.0, "step": 410 }, { "epoch": 0.9832535885167464, "grad_norm": 0.5348740586634487, "learning_rate": 1.0086618930174011e-06, "loss": 1.1507, "num_tokens": 85056365.0, "step": 411 }, { "epoch": 0.9856459330143541, "grad_norm": 0.554299617798691, "learning_rate": 1.006632260690988e-06, "loss": 1.0713, "num_tokens": 85211462.0, "step": 412 }, { "epoch": 0.9880382775119617, "grad_norm": 0.5659307655892759, "learning_rate": 1.0048729989766396e-06, "loss": 0.9576, "num_tokens": 85413979.0, "step": 413 }, { "epoch": 0.9904306220095693, "grad_norm": 0.6180230319552571, "learning_rate": 1.0033842137309649e-06, "loss": 0.9867, "num_tokens": 85564746.0, "step": 414 }, { "epoch": 0.992822966507177, "grad_norm": 0.49348429130589355, "learning_rate": 1.0021659945357202e-06, "loss": 1.2502, "num_tokens": 85821465.0, "step": 415 }, { "epoch": 0.9952153110047847, "grad_norm": 0.5397948420594149, "learning_rate": 1.0012184146924225e-06, "loss": 1.1626, "num_tokens": 86064119.0, "step": 416 }, { "epoch": 0.9976076555023924, "grad_norm": 0.7144358111953418, "learning_rate": 1.0005415312179367e-06, "loss": 0.8718, "num_tokens": 86205361.0, "step": 417 }, { "epoch": 1.0, "grad_norm": 0.546093577829937, "learning_rate": 1.0001353848410461e-06, "loss": 1.0204, "num_tokens": 86399088.0, "step": 418 }, { "epoch": 1.0, "eval_loss": 0.6695132851600647, "eval_num_tokens": 86399088.0, "eval_runtime": 101.4457, "eval_samples_per_second": 29.296, "eval_steps_per_second": 3.667, "step": 418 }, { "epoch": 1.0, "step": 418, "total_flos": 290901703622656.0, "train_loss": 1.1731020922295785, "train_runtime": 3083.067, "train_samples_per_second": 8.674, "train_steps_per_second": 0.136 } ], "logging_steps": 1, "max_steps": 418, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 290901703622656.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }