{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 418,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023923444976076554,
"grad_norm": 20.119582297751847,
"learning_rate": 0.0,
"loss": 2.1821,
"num_tokens": 274125.0,
"step": 1
},
{
"epoch": 0.004784688995215311,
"grad_norm": 23.170097201666195,
"learning_rate": 7.692307692307694e-07,
"loss": 2.2358,
"num_tokens": 493377.0,
"step": 2
},
{
"epoch": 0.007177033492822967,
"grad_norm": 23.45399110433363,
"learning_rate": 1.5384615384615387e-06,
"loss": 2.2012,
"num_tokens": 686897.0,
"step": 3
},
{
"epoch": 0.009569377990430622,
"grad_norm": 19.228005860305963,
"learning_rate": 2.307692307692308e-06,
"loss": 2.2219,
"num_tokens": 914354.0,
"step": 4
},
{
"epoch": 0.011961722488038277,
"grad_norm": 18.548172181534362,
"learning_rate": 3.0769230769230774e-06,
"loss": 2.1958,
"num_tokens": 1087390.0,
"step": 5
},
{
"epoch": 0.014354066985645933,
"grad_norm": 13.728999411657618,
"learning_rate": 3.846153846153847e-06,
"loss": 2.1247,
"num_tokens": 1268762.0,
"step": 6
},
{
"epoch": 0.01674641148325359,
"grad_norm": 8.505233076733274,
"learning_rate": 4.615384615384616e-06,
"loss": 2.0737,
"num_tokens": 1433561.0,
"step": 7
},
{
"epoch": 0.019138755980861243,
"grad_norm": 4.1867059667712025,
"learning_rate": 5.384615384615385e-06,
"loss": 1.938,
"num_tokens": 1655898.0,
"step": 8
},
{
"epoch": 0.0215311004784689,
"grad_norm": 3.3689397757728203,
"learning_rate": 6.153846153846155e-06,
"loss": 1.8762,
"num_tokens": 1904754.0,
"step": 9
},
{
"epoch": 0.023923444976076555,
"grad_norm": 2.6965544763096254,
"learning_rate": 6.923076923076923e-06,
"loss": 1.8481,
"num_tokens": 2100951.0,
"step": 10
},
{
"epoch": 0.02631578947368421,
"grad_norm": 2.0908111385220045,
"learning_rate": 7.692307692307694e-06,
"loss": 1.7457,
"num_tokens": 2264681.0,
"step": 11
},
{
"epoch": 0.028708133971291867,
"grad_norm": 2.170718723726301,
"learning_rate": 8.461538461538462e-06,
"loss": 1.7225,
"num_tokens": 2459076.0,
"step": 12
},
{
"epoch": 0.03110047846889952,
"grad_norm": 2.16857982636961,
"learning_rate": 9.230769230769232e-06,
"loss": 1.6537,
"num_tokens": 2606612.0,
"step": 13
},
{
"epoch": 0.03349282296650718,
"grad_norm": 1.5656854944876009,
"learning_rate": 1e-05,
"loss": 1.6801,
"num_tokens": 2766328.0,
"step": 14
},
{
"epoch": 0.03588516746411483,
"grad_norm": 1.6899464949924934,
"learning_rate": 9.999864615158956e-06,
"loss": 1.3963,
"num_tokens": 2939734.0,
"step": 15
},
{
"epoch": 0.03827751196172249,
"grad_norm": 1.2147889414450102,
"learning_rate": 9.999458468782065e-06,
"loss": 1.6588,
"num_tokens": 3209741.0,
"step": 16
},
{
"epoch": 0.04066985645933014,
"grad_norm": 1.3059422864639767,
"learning_rate": 9.998781585307577e-06,
"loss": 1.2028,
"num_tokens": 3331253.0,
"step": 17
},
{
"epoch": 0.0430622009569378,
"grad_norm": 0.8168354152517865,
"learning_rate": 9.997834005464281e-06,
"loss": 1.5119,
"num_tokens": 3550942.0,
"step": 18
},
{
"epoch": 0.045454545454545456,
"grad_norm": 0.7578450765410201,
"learning_rate": 9.996615786269036e-06,
"loss": 1.5165,
"num_tokens": 3734184.0,
"step": 19
},
{
"epoch": 0.04784688995215311,
"grad_norm": 0.772984535484589,
"learning_rate": 9.995127001023362e-06,
"loss": 1.4925,
"num_tokens": 3923612.0,
"step": 20
},
{
"epoch": 0.050239234449760764,
"grad_norm": 0.7657276095351829,
"learning_rate": 9.993367739309013e-06,
"loss": 1.3945,
"num_tokens": 4090661.0,
"step": 21
},
{
"epoch": 0.05263157894736842,
"grad_norm": 0.6839298661119211,
"learning_rate": 9.991338106982598e-06,
"loss": 1.46,
"num_tokens": 4300333.0,
"step": 22
},
{
"epoch": 0.05502392344497608,
"grad_norm": 0.7054066291049598,
"learning_rate": 9.98903822616921e-06,
"loss": 1.3554,
"num_tokens": 4483986.0,
"step": 23
},
{
"epoch": 0.05741626794258373,
"grad_norm": 0.7193972470009606,
"learning_rate": 9.986468235255065e-06,
"loss": 1.4998,
"num_tokens": 4682593.0,
"step": 24
},
{
"epoch": 0.05980861244019139,
"grad_norm": 0.6625723448730417,
"learning_rate": 9.983628288879193e-06,
"loss": 1.4898,
"num_tokens": 4880940.0,
"step": 25
},
{
"epoch": 0.06220095693779904,
"grad_norm": 0.7404539912651659,
"learning_rate": 9.98051855792412e-06,
"loss": 1.3321,
"num_tokens": 5074700.0,
"step": 26
},
{
"epoch": 0.0645933014354067,
"grad_norm": 0.7881264974132591,
"learning_rate": 9.977139229505596e-06,
"loss": 1.2212,
"num_tokens": 5225193.0,
"step": 27
},
{
"epoch": 0.06698564593301436,
"grad_norm": 0.6060089446257308,
"learning_rate": 9.973490506961326e-06,
"loss": 1.5731,
"num_tokens": 5447459.0,
"step": 28
},
{
"epoch": 0.06937799043062201,
"grad_norm": 0.618254776059864,
"learning_rate": 9.969572609838745e-06,
"loss": 1.4722,
"num_tokens": 5676623.0,
"step": 29
},
{
"epoch": 0.07177033492822966,
"grad_norm": 0.6304080009866732,
"learning_rate": 9.965385773881795e-06,
"loss": 1.3474,
"num_tokens": 5898924.0,
"step": 30
},
{
"epoch": 0.07416267942583732,
"grad_norm": 0.6104465608230878,
"learning_rate": 9.960930251016752e-06,
"loss": 1.4138,
"num_tokens": 6089369.0,
"step": 31
},
{
"epoch": 0.07655502392344497,
"grad_norm": 0.6581355504876419,
"learning_rate": 9.956206309337067e-06,
"loss": 1.4661,
"num_tokens": 6294065.0,
"step": 32
},
{
"epoch": 0.07894736842105263,
"grad_norm": 0.5866617107994286,
"learning_rate": 9.951214233087223e-06,
"loss": 1.4306,
"num_tokens": 6515957.0,
"step": 33
},
{
"epoch": 0.08133971291866028,
"grad_norm": 0.605393818271364,
"learning_rate": 9.945954322645643e-06,
"loss": 1.3046,
"num_tokens": 6725025.0,
"step": 34
},
{
"epoch": 0.08373205741626795,
"grad_norm": 0.5778342378194031,
"learning_rate": 9.940426894506608e-06,
"loss": 1.4363,
"num_tokens": 6949955.0,
"step": 35
},
{
"epoch": 0.0861244019138756,
"grad_norm": 0.6258805596031615,
"learning_rate": 9.934632281261221e-06,
"loss": 1.3519,
"num_tokens": 7152815.0,
"step": 36
},
{
"epoch": 0.08851674641148326,
"grad_norm": 0.5788764918533683,
"learning_rate": 9.928570831577396e-06,
"loss": 1.4289,
"num_tokens": 7365760.0,
"step": 37
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.6104478143341243,
"learning_rate": 9.922242910178862e-06,
"loss": 1.4927,
"num_tokens": 7619917.0,
"step": 38
},
{
"epoch": 0.09330143540669857,
"grad_norm": 0.666007518151506,
"learning_rate": 9.915648897823232e-06,
"loss": 1.1965,
"num_tokens": 7772797.0,
"step": 39
},
{
"epoch": 0.09569377990430622,
"grad_norm": 0.7244485739284531,
"learning_rate": 9.908789191279093e-06,
"loss": 1.3198,
"num_tokens": 7978612.0,
"step": 40
},
{
"epoch": 0.09808612440191387,
"grad_norm": 0.6500652663575426,
"learning_rate": 9.901664203302126e-06,
"loss": 1.3692,
"num_tokens": 8181944.0,
"step": 41
},
{
"epoch": 0.10047846889952153,
"grad_norm": 0.6523516464098081,
"learning_rate": 9.89427436261027e-06,
"loss": 1.2651,
"num_tokens": 8349921.0,
"step": 42
},
{
"epoch": 0.10287081339712918,
"grad_norm": 0.6141096849362858,
"learning_rate": 9.886620113857926e-06,
"loss": 1.1674,
"num_tokens": 8513062.0,
"step": 43
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.5176000363276883,
"learning_rate": 9.878701917609208e-06,
"loss": 1.3363,
"num_tokens": 8739362.0,
"step": 44
},
{
"epoch": 0.1076555023923445,
"grad_norm": 0.6496907081327192,
"learning_rate": 9.870520250310223e-06,
"loss": 1.2051,
"num_tokens": 8882227.0,
"step": 45
},
{
"epoch": 0.11004784688995216,
"grad_norm": 0.5781609822463768,
"learning_rate": 9.862075604260402e-06,
"loss": 1.4038,
"num_tokens": 9101362.0,
"step": 46
},
{
"epoch": 0.11244019138755981,
"grad_norm": 0.7191639780141069,
"learning_rate": 9.853368487582888e-06,
"loss": 1.1333,
"num_tokens": 9286876.0,
"step": 47
},
{
"epoch": 0.11483253588516747,
"grad_norm": 0.6406116951034948,
"learning_rate": 9.84439942419395e-06,
"loss": 1.4121,
"num_tokens": 9459192.0,
"step": 48
},
{
"epoch": 0.11722488038277512,
"grad_norm": 0.5661996222062946,
"learning_rate": 9.835168953771463e-06,
"loss": 1.322,
"num_tokens": 9724803.0,
"step": 49
},
{
"epoch": 0.11961722488038277,
"grad_norm": 0.5715728086031884,
"learning_rate": 9.825677631722436e-06,
"loss": 1.3516,
"num_tokens": 9933571.0,
"step": 50
},
{
"epoch": 0.12200956937799043,
"grad_norm": 0.6325774615690734,
"learning_rate": 9.815926029149593e-06,
"loss": 1.258,
"num_tokens": 10136490.0,
"step": 51
},
{
"epoch": 0.12440191387559808,
"grad_norm": 0.5904482238857803,
"learning_rate": 9.805914732817007e-06,
"loss": 1.293,
"num_tokens": 10340564.0,
"step": 52
},
{
"epoch": 0.12679425837320574,
"grad_norm": 0.5710320806437825,
"learning_rate": 9.795644345114796e-06,
"loss": 1.2765,
"num_tokens": 10553400.0,
"step": 53
},
{
"epoch": 0.1291866028708134,
"grad_norm": 0.622309054620362,
"learning_rate": 9.78511548402287e-06,
"loss": 1.123,
"num_tokens": 10758112.0,
"step": 54
},
{
"epoch": 0.13157894736842105,
"grad_norm": 0.7557997838257337,
"learning_rate": 9.77432878307376e-06,
"loss": 1.1149,
"num_tokens": 10934718.0,
"step": 55
},
{
"epoch": 0.1339712918660287,
"grad_norm": 0.4774648627893749,
"learning_rate": 9.763284891314481e-06,
"loss": 1.4329,
"num_tokens": 11227923.0,
"step": 56
},
{
"epoch": 0.13636363636363635,
"grad_norm": 0.6518939385243675,
"learning_rate": 9.751984473267498e-06,
"loss": 1.2629,
"num_tokens": 11417535.0,
"step": 57
},
{
"epoch": 0.13875598086124402,
"grad_norm": 0.5370370863120535,
"learning_rate": 9.740428208890716e-06,
"loss": 1.3426,
"num_tokens": 11651380.0,
"step": 58
},
{
"epoch": 0.14114832535885166,
"grad_norm": 0.5696851508370838,
"learning_rate": 9.728616793536588e-06,
"loss": 1.125,
"num_tokens": 11830736.0,
"step": 59
},
{
"epoch": 0.14354066985645933,
"grad_norm": 0.5644132429290988,
"learning_rate": 9.716550937910268e-06,
"loss": 1.2145,
"num_tokens": 12023638.0,
"step": 60
},
{
"epoch": 0.145933014354067,
"grad_norm": 0.7461647382617252,
"learning_rate": 9.70423136802684e-06,
"loss": 1.204,
"num_tokens": 12234061.0,
"step": 61
},
{
"epoch": 0.14832535885167464,
"grad_norm": 0.5086888568285274,
"learning_rate": 9.691658825167641e-06,
"loss": 1.3124,
"num_tokens": 12472421.0,
"step": 62
},
{
"epoch": 0.1507177033492823,
"grad_norm": 0.5053241954118645,
"learning_rate": 9.67883406583566e-06,
"loss": 1.3634,
"num_tokens": 12734106.0,
"step": 63
},
{
"epoch": 0.15311004784688995,
"grad_norm": 0.5179034670964426,
"learning_rate": 9.665757861710008e-06,
"loss": 1.3053,
"num_tokens": 12960684.0,
"step": 64
},
{
"epoch": 0.15550239234449761,
"grad_norm": 0.5461947358982723,
"learning_rate": 9.652430999599491e-06,
"loss": 1.2969,
"num_tokens": 13170331.0,
"step": 65
},
{
"epoch": 0.15789473684210525,
"grad_norm": 0.6423563162262463,
"learning_rate": 9.638854281395271e-06,
"loss": 1.3541,
"num_tokens": 13397481.0,
"step": 66
},
{
"epoch": 0.16028708133971292,
"grad_norm": 0.5755576573234283,
"learning_rate": 9.625028524022606e-06,
"loss": 1.2183,
"num_tokens": 13638917.0,
"step": 67
},
{
"epoch": 0.16267942583732056,
"grad_norm": 0.6393096708849371,
"learning_rate": 9.610954559391704e-06,
"loss": 1.2774,
"num_tokens": 13845779.0,
"step": 68
},
{
"epoch": 0.16507177033492823,
"grad_norm": 0.6238780043211961,
"learning_rate": 9.596633234347661e-06,
"loss": 1.0493,
"num_tokens": 14015645.0,
"step": 69
},
{
"epoch": 0.1674641148325359,
"grad_norm": 0.6004590974749275,
"learning_rate": 9.582065410619503e-06,
"loss": 1.1128,
"num_tokens": 14174170.0,
"step": 70
},
{
"epoch": 0.16985645933014354,
"grad_norm": 0.5353801191806298,
"learning_rate": 9.567251964768343e-06,
"loss": 1.2534,
"num_tokens": 14391398.0,
"step": 71
},
{
"epoch": 0.1722488038277512,
"grad_norm": 0.5703356560477955,
"learning_rate": 9.55219378813463e-06,
"loss": 1.2457,
"num_tokens": 14610731.0,
"step": 72
},
{
"epoch": 0.17464114832535885,
"grad_norm": 0.5213842592670314,
"learning_rate": 9.53689178678452e-06,
"loss": 1.3794,
"num_tokens": 14858252.0,
"step": 73
},
{
"epoch": 0.17703349282296652,
"grad_norm": 0.5665738251245545,
"learning_rate": 9.521346881455356e-06,
"loss": 1.3718,
"num_tokens": 15084332.0,
"step": 74
},
{
"epoch": 0.17942583732057416,
"grad_norm": 0.5432851738944047,
"learning_rate": 9.505560007500263e-06,
"loss": 1.2429,
"num_tokens": 15352232.0,
"step": 75
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.6029856670534988,
"learning_rate": 9.489532114831876e-06,
"loss": 1.1883,
"num_tokens": 15574514.0,
"step": 76
},
{
"epoch": 0.18421052631578946,
"grad_norm": 0.5636116286831033,
"learning_rate": 9.473264167865172e-06,
"loss": 1.1939,
"num_tokens": 15788273.0,
"step": 77
},
{
"epoch": 0.18660287081339713,
"grad_norm": 0.5273294226554239,
"learning_rate": 9.456757145459445e-06,
"loss": 1.3284,
"num_tokens": 16058083.0,
"step": 78
},
{
"epoch": 0.18899521531100477,
"grad_norm": 0.6091499871383838,
"learning_rate": 9.44001204085941e-06,
"loss": 1.1578,
"num_tokens": 16222078.0,
"step": 79
},
{
"epoch": 0.19138755980861244,
"grad_norm": 0.5729867351707406,
"learning_rate": 9.423029861635431e-06,
"loss": 1.1448,
"num_tokens": 16452197.0,
"step": 80
},
{
"epoch": 0.1937799043062201,
"grad_norm": 0.5753208503065251,
"learning_rate": 9.405811629622904e-06,
"loss": 1.3236,
"num_tokens": 16678106.0,
"step": 81
},
{
"epoch": 0.19617224880382775,
"grad_norm": 0.613469703266833,
"learning_rate": 9.388358380860763e-06,
"loss": 1.1021,
"num_tokens": 16908054.0,
"step": 82
},
{
"epoch": 0.19856459330143542,
"grad_norm": 0.6002222062441086,
"learning_rate": 9.370671165529146e-06,
"loss": 1.1476,
"num_tokens": 17140981.0,
"step": 83
},
{
"epoch": 0.20095693779904306,
"grad_norm": 0.5295041630429093,
"learning_rate": 9.3527510478862e-06,
"loss": 1.2725,
"num_tokens": 17364693.0,
"step": 84
},
{
"epoch": 0.20334928229665072,
"grad_norm": 0.5369203542352684,
"learning_rate": 9.334599106204051e-06,
"loss": 1.2895,
"num_tokens": 17563578.0,
"step": 85
},
{
"epoch": 0.20574162679425836,
"grad_norm": 0.5193929587177428,
"learning_rate": 9.316216432703918e-06,
"loss": 1.2499,
"num_tokens": 17740374.0,
"step": 86
},
{
"epoch": 0.20813397129186603,
"grad_norm": 0.49812886005887325,
"learning_rate": 9.29760413349039e-06,
"loss": 1.3455,
"num_tokens": 18015806.0,
"step": 87
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.5190241504997857,
"learning_rate": 9.278763328484875e-06,
"loss": 1.0828,
"num_tokens": 18245485.0,
"step": 88
},
{
"epoch": 0.21291866028708134,
"grad_norm": 0.534699634820348,
"learning_rate": 9.259695151358215e-06,
"loss": 1.2029,
"num_tokens": 18441471.0,
"step": 89
},
{
"epoch": 0.215311004784689,
"grad_norm": 0.5368146817909797,
"learning_rate": 9.240400749462467e-06,
"loss": 1.13,
"num_tokens": 18659186.0,
"step": 90
},
{
"epoch": 0.21770334928229665,
"grad_norm": 0.6643155654192867,
"learning_rate": 9.220881283761868e-06,
"loss": 1.1626,
"num_tokens": 18811916.0,
"step": 91
},
{
"epoch": 0.22009569377990432,
"grad_norm": 0.5953751009151461,
"learning_rate": 9.20113792876298e-06,
"loss": 1.1446,
"num_tokens": 18974285.0,
"step": 92
},
{
"epoch": 0.22248803827751196,
"grad_norm": 0.6067628035324104,
"learning_rate": 9.181171872444015e-06,
"loss": 1.2417,
"num_tokens": 19182034.0,
"step": 93
},
{
"epoch": 0.22488038277511962,
"grad_norm": 0.6396322460129866,
"learning_rate": 9.160984316183354e-06,
"loss": 1.0376,
"num_tokens": 19324593.0,
"step": 94
},
{
"epoch": 0.22727272727272727,
"grad_norm": 0.5167898612058803,
"learning_rate": 9.140576474687263e-06,
"loss": 1.0627,
"num_tokens": 19559212.0,
"step": 95
},
{
"epoch": 0.22966507177033493,
"grad_norm": 0.6898506829068124,
"learning_rate": 9.1199495759168e-06,
"loss": 1.0682,
"num_tokens": 19734777.0,
"step": 96
},
{
"epoch": 0.23205741626794257,
"grad_norm": 0.5632751758217261,
"learning_rate": 9.099104861013922e-06,
"loss": 1.2069,
"num_tokens": 19924776.0,
"step": 97
},
{
"epoch": 0.23444976076555024,
"grad_norm": 0.4975676948616479,
"learning_rate": 9.078043584226816e-06,
"loss": 1.2944,
"num_tokens": 20166431.0,
"step": 98
},
{
"epoch": 0.23684210526315788,
"grad_norm": 0.5811862630357938,
"learning_rate": 9.056767012834417e-06,
"loss": 1.2261,
"num_tokens": 20342559.0,
"step": 99
},
{
"epoch": 0.23923444976076555,
"grad_norm": 0.6205394909309613,
"learning_rate": 9.035276427070166e-06,
"loss": 1.1827,
"num_tokens": 20528647.0,
"step": 100
},
{
"epoch": 0.24162679425837322,
"grad_norm": 0.6101249338540917,
"learning_rate": 9.013573120044968e-06,
"loss": 1.0195,
"num_tokens": 20735927.0,
"step": 101
},
{
"epoch": 0.24401913875598086,
"grad_norm": 0.5589655982664236,
"learning_rate": 8.991658397669384e-06,
"loss": 1.2941,
"num_tokens": 20973055.0,
"step": 102
},
{
"epoch": 0.24641148325358853,
"grad_norm": 0.602415461668376,
"learning_rate": 8.96953357857507e-06,
"loss": 0.9238,
"num_tokens": 21131698.0,
"step": 103
},
{
"epoch": 0.24880382775119617,
"grad_norm": 0.4635975776481471,
"learning_rate": 8.947199994035402e-06,
"loss": 1.206,
"num_tokens": 21426277.0,
"step": 104
},
{
"epoch": 0.2511961722488038,
"grad_norm": 0.5416414335210736,
"learning_rate": 8.924658987885403e-06,
"loss": 1.1863,
"num_tokens": 21629826.0,
"step": 105
},
{
"epoch": 0.2535885167464115,
"grad_norm": 0.703889948074174,
"learning_rate": 8.901911916440867e-06,
"loss": 1.0592,
"num_tokens": 21805342.0,
"step": 106
},
{
"epoch": 0.25598086124401914,
"grad_norm": 0.5638998814508404,
"learning_rate": 8.878960148416747e-06,
"loss": 1.2387,
"num_tokens": 21993750.0,
"step": 107
},
{
"epoch": 0.2583732057416268,
"grad_norm": 0.5224818527209029,
"learning_rate": 8.855805064844808e-06,
"loss": 1.3391,
"num_tokens": 22182974.0,
"step": 108
},
{
"epoch": 0.2607655502392344,
"grad_norm": 0.5975570946282182,
"learning_rate": 8.832448058990522e-06,
"loss": 1.1119,
"num_tokens": 22406584.0,
"step": 109
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.5342575640517132,
"learning_rate": 8.80889053626923e-06,
"loss": 1.1556,
"num_tokens": 22591986.0,
"step": 110
},
{
"epoch": 0.26555023923444976,
"grad_norm": 0.6463928995023777,
"learning_rate": 8.785133914161586e-06,
"loss": 1.0927,
"num_tokens": 22755674.0,
"step": 111
},
{
"epoch": 0.2679425837320574,
"grad_norm": 0.5540394516081272,
"learning_rate": 8.761179622128264e-06,
"loss": 1.1932,
"num_tokens": 22979344.0,
"step": 112
},
{
"epoch": 0.2703349282296651,
"grad_norm": 0.5639562135925512,
"learning_rate": 8.737029101523931e-06,
"loss": 1.1062,
"num_tokens": 23213393.0,
"step": 113
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.47416665855465817,
"learning_rate": 8.712683805510547e-06,
"loss": 1.0925,
"num_tokens": 23440736.0,
"step": 114
},
{
"epoch": 0.2751196172248804,
"grad_norm": 0.6750642922896175,
"learning_rate": 8.6881451989699e-06,
"loss": 1.2461,
"num_tokens": 23595366.0,
"step": 115
},
{
"epoch": 0.27751196172248804,
"grad_norm": 0.5459520630146212,
"learning_rate": 8.66341475841548e-06,
"loss": 1.1222,
"num_tokens": 23807492.0,
"step": 116
},
{
"epoch": 0.2799043062200957,
"grad_norm": 0.5301705350454893,
"learning_rate": 8.638493971903621e-06,
"loss": 1.3022,
"num_tokens": 24019959.0,
"step": 117
},
{
"epoch": 0.2822966507177033,
"grad_norm": 0.6424194649582932,
"learning_rate": 8.613384338943982e-06,
"loss": 1.0574,
"num_tokens": 24205265.0,
"step": 118
},
{
"epoch": 0.284688995215311,
"grad_norm": 0.5546308776167657,
"learning_rate": 8.588087370409303e-06,
"loss": 1.2411,
"num_tokens": 24429509.0,
"step": 119
},
{
"epoch": 0.28708133971291866,
"grad_norm": 0.480470812260585,
"learning_rate": 8.562604588444498e-06,
"loss": 1.2674,
"num_tokens": 24680453.0,
"step": 120
},
{
"epoch": 0.2894736842105263,
"grad_norm": 0.5297827708710372,
"learning_rate": 8.536937526375075e-06,
"loss": 1.2252,
"num_tokens": 24893378.0,
"step": 121
},
{
"epoch": 0.291866028708134,
"grad_norm": 0.770470928681588,
"learning_rate": 8.511087728614863e-06,
"loss": 1.0353,
"num_tokens": 25020898.0,
"step": 122
},
{
"epoch": 0.2942583732057416,
"grad_norm": 0.5337837938457338,
"learning_rate": 8.485056750573088e-06,
"loss": 1.2966,
"num_tokens": 25273187.0,
"step": 123
},
{
"epoch": 0.2966507177033493,
"grad_norm": 0.592552325078839,
"learning_rate": 8.458846158560787e-06,
"loss": 1.1754,
"num_tokens": 25469601.0,
"step": 124
},
{
"epoch": 0.29904306220095694,
"grad_norm": 0.5958320399693818,
"learning_rate": 8.43245752969655e-06,
"loss": 1.069,
"num_tokens": 25648408.0,
"step": 125
},
{
"epoch": 0.3014354066985646,
"grad_norm": 0.624744711279868,
"learning_rate": 8.40589245181163e-06,
"loss": 1.1037,
"num_tokens": 25866106.0,
"step": 126
},
{
"epoch": 0.3038277511961722,
"grad_norm": 0.6392805038022229,
"learning_rate": 8.379152523354407e-06,
"loss": 1.1845,
"num_tokens": 26058009.0,
"step": 127
},
{
"epoch": 0.3062200956937799,
"grad_norm": 0.5505337156956458,
"learning_rate": 8.352239353294196e-06,
"loss": 1.245,
"num_tokens": 26327152.0,
"step": 128
},
{
"epoch": 0.30861244019138756,
"grad_norm": 0.5429338635678093,
"learning_rate": 8.325154561024445e-06,
"loss": 1.3208,
"num_tokens": 26559334.0,
"step": 129
},
{
"epoch": 0.31100478468899523,
"grad_norm": 0.5543720622642925,
"learning_rate": 8.29789977626528e-06,
"loss": 1.217,
"num_tokens": 26754982.0,
"step": 130
},
{
"epoch": 0.3133971291866029,
"grad_norm": 0.6525624593414054,
"learning_rate": 8.270476638965463e-06,
"loss": 1.0719,
"num_tokens": 26887851.0,
"step": 131
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.6284711216463389,
"learning_rate": 8.242886799203696e-06,
"loss": 1.1727,
"num_tokens": 27042502.0,
"step": 132
},
{
"epoch": 0.3181818181818182,
"grad_norm": 0.5632325030743454,
"learning_rate": 8.215131917089342e-06,
"loss": 1.1525,
"num_tokens": 27248040.0,
"step": 133
},
{
"epoch": 0.32057416267942584,
"grad_norm": 0.6252698594109136,
"learning_rate": 8.187213662662539e-06,
"loss": 1.0868,
"num_tokens": 27463386.0,
"step": 134
},
{
"epoch": 0.3229665071770335,
"grad_norm": 0.55667567195552,
"learning_rate": 8.159133715793701e-06,
"loss": 1.1098,
"num_tokens": 27684485.0,
"step": 135
},
{
"epoch": 0.3253588516746411,
"grad_norm": 0.5109763125317217,
"learning_rate": 8.13089376608245e-06,
"loss": 1.1185,
"num_tokens": 27901192.0,
"step": 136
},
{
"epoch": 0.3277511961722488,
"grad_norm": 0.5657322857245803,
"learning_rate": 8.102495512755939e-06,
"loss": 1.3105,
"num_tokens": 28138162.0,
"step": 137
},
{
"epoch": 0.33014354066985646,
"grad_norm": 0.5063120233634636,
"learning_rate": 8.073940664566623e-06,
"loss": 1.2374,
"num_tokens": 28355174.0,
"step": 138
},
{
"epoch": 0.33253588516746413,
"grad_norm": 0.5701958065694588,
"learning_rate": 8.045230939689425e-06,
"loss": 1.1063,
"num_tokens": 28521259.0,
"step": 139
},
{
"epoch": 0.3349282296650718,
"grad_norm": 0.540247926031648,
"learning_rate": 8.016368065618361e-06,
"loss": 1.0551,
"num_tokens": 28746191.0,
"step": 140
},
{
"epoch": 0.3373205741626794,
"grad_norm": 0.5340355745257312,
"learning_rate": 7.987353779062598e-06,
"loss": 1.235,
"num_tokens": 29022355.0,
"step": 141
},
{
"epoch": 0.3397129186602871,
"grad_norm": 0.5292859186809687,
"learning_rate": 7.958189825841942e-06,
"loss": 1.1531,
"num_tokens": 29238427.0,
"step": 142
},
{
"epoch": 0.34210526315789475,
"grad_norm": 0.7322544316739465,
"learning_rate": 7.928877960781808e-06,
"loss": 0.9135,
"num_tokens": 29379111.0,
"step": 143
},
{
"epoch": 0.3444976076555024,
"grad_norm": 0.5080774575481332,
"learning_rate": 7.899419947607611e-06,
"loss": 1.2097,
"num_tokens": 29627097.0,
"step": 144
},
{
"epoch": 0.34688995215311,
"grad_norm": 0.5832151085081759,
"learning_rate": 7.869817558838654e-06,
"loss": 1.0816,
"num_tokens": 29832123.0,
"step": 145
},
{
"epoch": 0.3492822966507177,
"grad_norm": 0.5206108052264397,
"learning_rate": 7.840072575681468e-06,
"loss": 1.108,
"num_tokens": 30048644.0,
"step": 146
},
{
"epoch": 0.35167464114832536,
"grad_norm": 0.5570271309488313,
"learning_rate": 7.810186787922645e-06,
"loss": 1.1653,
"num_tokens": 30247851.0,
"step": 147
},
{
"epoch": 0.35406698564593303,
"grad_norm": 0.4918371375990957,
"learning_rate": 7.78016199382112e-06,
"loss": 1.1408,
"num_tokens": 30527686.0,
"step": 148
},
{
"epoch": 0.35645933014354064,
"grad_norm": 0.5481932300046403,
"learning_rate": 7.75e-06,
"loss": 1.2044,
"num_tokens": 30723713.0,
"step": 149
},
{
"epoch": 0.3588516746411483,
"grad_norm": 0.6651847229876482,
"learning_rate": 7.719702621337834e-06,
"loss": 1.0119,
"num_tokens": 30898218.0,
"step": 150
},
{
"epoch": 0.361244019138756,
"grad_norm": 0.46633215220880386,
"learning_rate": 7.68927168085942e-06,
"loss": 1.1705,
"num_tokens": 31126739.0,
"step": 151
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.5876480626961219,
"learning_rate": 7.658709009626109e-06,
"loss": 0.9351,
"num_tokens": 31301729.0,
"step": 152
},
{
"epoch": 0.3660287081339713,
"grad_norm": 0.49945896590659167,
"learning_rate": 7.628016446625626e-06,
"loss": 1.2641,
"num_tokens": 31531161.0,
"step": 153
},
{
"epoch": 0.3684210526315789,
"grad_norm": 0.5384303848101453,
"learning_rate": 7.597195838661426e-06,
"loss": 1.1977,
"num_tokens": 31785635.0,
"step": 154
},
{
"epoch": 0.3708133971291866,
"grad_norm": 0.6031598977170286,
"learning_rate": 7.566249040241553e-06,
"loss": 1.0982,
"num_tokens": 32017995.0,
"step": 155
},
{
"epoch": 0.37320574162679426,
"grad_norm": 0.5114284004215709,
"learning_rate": 7.53517791346707e-06,
"loss": 1.2633,
"num_tokens": 32246103.0,
"step": 156
},
{
"epoch": 0.37559808612440193,
"grad_norm": 0.511553264808467,
"learning_rate": 7.503984327920003e-06,
"loss": 1.1566,
"num_tokens": 32461173.0,
"step": 157
},
{
"epoch": 0.37799043062200954,
"grad_norm": 0.4861428494553005,
"learning_rate": 7.472670160550849e-06,
"loss": 1.2219,
"num_tokens": 32710394.0,
"step": 158
},
{
"epoch": 0.3803827751196172,
"grad_norm": 0.591981436959529,
"learning_rate": 7.441237295565642e-06,
"loss": 1.275,
"num_tokens": 32910997.0,
"step": 159
},
{
"epoch": 0.3827751196172249,
"grad_norm": 0.5171815810924354,
"learning_rate": 7.409687624312569e-06,
"loss": 1.2906,
"num_tokens": 33191166.0,
"step": 160
},
{
"epoch": 0.38516746411483255,
"grad_norm": 0.6093674065623558,
"learning_rate": 7.378023045168181e-06,
"loss": 1.1703,
"num_tokens": 33380845.0,
"step": 161
},
{
"epoch": 0.3875598086124402,
"grad_norm": 0.5521223923681069,
"learning_rate": 7.346245463423148e-06,
"loss": 1.1532,
"num_tokens": 33553617.0,
"step": 162
},
{
"epoch": 0.38995215311004783,
"grad_norm": 0.5177157946810159,
"learning_rate": 7.314356791167626e-06,
"loss": 1.1612,
"num_tokens": 33785498.0,
"step": 163
},
{
"epoch": 0.3923444976076555,
"grad_norm": 0.5060522779515988,
"learning_rate": 7.282358947176207e-06,
"loss": 1.3366,
"num_tokens": 34019728.0,
"step": 164
},
{
"epoch": 0.39473684210526316,
"grad_norm": 0.5610143836266379,
"learning_rate": 7.250253856792452e-06,
"loss": 1.2572,
"num_tokens": 34236289.0,
"step": 165
},
{
"epoch": 0.39712918660287083,
"grad_norm": 0.5606343028811931,
"learning_rate": 7.218043451813058e-06,
"loss": 1.0956,
"num_tokens": 34415700.0,
"step": 166
},
{
"epoch": 0.39952153110047844,
"grad_norm": 0.5775794108416966,
"learning_rate": 7.185729670371605e-06,
"loss": 1.015,
"num_tokens": 34605985.0,
"step": 167
},
{
"epoch": 0.4019138755980861,
"grad_norm": 0.6312411170295402,
"learning_rate": 7.153314456821942e-06,
"loss": 0.922,
"num_tokens": 34748670.0,
"step": 168
},
{
"epoch": 0.4043062200956938,
"grad_norm": 0.5132788880980301,
"learning_rate": 7.120799761621198e-06,
"loss": 1.2394,
"num_tokens": 34976413.0,
"step": 169
},
{
"epoch": 0.40669856459330145,
"grad_norm": 0.5618840133734496,
"learning_rate": 7.08818754121241e-06,
"loss": 1.0443,
"num_tokens": 35182351.0,
"step": 170
},
{
"epoch": 0.4090909090909091,
"grad_norm": 0.5771799652861468,
"learning_rate": 7.0554797579068155e-06,
"loss": 1.0114,
"num_tokens": 35384554.0,
"step": 171
},
{
"epoch": 0.41148325358851673,
"grad_norm": 0.4649122455940863,
"learning_rate": 7.022678379765766e-06,
"loss": 1.2349,
"num_tokens": 35658712.0,
"step": 172
},
{
"epoch": 0.4138755980861244,
"grad_norm": 0.57386723032485,
"learning_rate": 6.989785380482313e-06,
"loss": 1.0024,
"num_tokens": 35853348.0,
"step": 173
},
{
"epoch": 0.41626794258373206,
"grad_norm": 0.5785841074184913,
"learning_rate": 6.956802739262446e-06,
"loss": 1.1307,
"num_tokens": 36048889.0,
"step": 174
},
{
"epoch": 0.41866028708133973,
"grad_norm": 0.5209762559962196,
"learning_rate": 6.923732440706005e-06,
"loss": 1.032,
"num_tokens": 36250421.0,
"step": 175
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.49999578979845366,
"learning_rate": 6.890576474687264e-06,
"loss": 1.3027,
"num_tokens": 36467223.0,
"step": 176
},
{
"epoch": 0.423444976076555,
"grad_norm": 0.44607951021905534,
"learning_rate": 6.857336836235195e-06,
"loss": 1.2908,
"num_tokens": 36786228.0,
"step": 177
},
{
"epoch": 0.4258373205741627,
"grad_norm": 0.5405149465909439,
"learning_rate": 6.824015525413428e-06,
"loss": 1.2206,
"num_tokens": 36987436.0,
"step": 178
},
{
"epoch": 0.42822966507177035,
"grad_norm": 0.5101094166751247,
"learning_rate": 6.790614547199908e-06,
"loss": 1.3338,
"num_tokens": 37173969.0,
"step": 179
},
{
"epoch": 0.430622009569378,
"grad_norm": 0.5018404262587114,
"learning_rate": 6.7571359113662405e-06,
"loss": 0.9635,
"num_tokens": 37430838.0,
"step": 180
},
{
"epoch": 0.43301435406698563,
"grad_norm": 0.5186179578093245,
"learning_rate": 6.723581632356783e-06,
"loss": 1.1317,
"num_tokens": 37614321.0,
"step": 181
},
{
"epoch": 0.4354066985645933,
"grad_norm": 0.5092089036024817,
"learning_rate": 6.689953729167411e-06,
"loss": 1.1989,
"num_tokens": 37828436.0,
"step": 182
},
{
"epoch": 0.43779904306220097,
"grad_norm": 0.5779182575588276,
"learning_rate": 6.65625422522405e-06,
"loss": 1.0699,
"num_tokens": 37994173.0,
"step": 183
},
{
"epoch": 0.44019138755980863,
"grad_norm": 0.5213748156719571,
"learning_rate": 6.622485148260916e-06,
"loss": 1.142,
"num_tokens": 38226513.0,
"step": 184
},
{
"epoch": 0.44258373205741625,
"grad_norm": 0.5124918281868935,
"learning_rate": 6.588648530198505e-06,
"loss": 1.0789,
"num_tokens": 38424535.0,
"step": 185
},
{
"epoch": 0.4449760765550239,
"grad_norm": 0.4965284532552029,
"learning_rate": 6.554746407021332e-06,
"loss": 1.2216,
"num_tokens": 38662320.0,
"step": 186
},
{
"epoch": 0.4473684210526316,
"grad_norm": 0.5776130784552208,
"learning_rate": 6.520780818655421e-06,
"loss": 1.2425,
"num_tokens": 38852666.0,
"step": 187
},
{
"epoch": 0.44976076555023925,
"grad_norm": 0.5433597025027418,
"learning_rate": 6.486753808845565e-06,
"loss": 1.1762,
"num_tokens": 39020645.0,
"step": 188
},
{
"epoch": 0.45215311004784686,
"grad_norm": 0.5851211313289845,
"learning_rate": 6.45266742503235e-06,
"loss": 1.1301,
"num_tokens": 39229647.0,
"step": 189
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.5580553839960908,
"learning_rate": 6.418523718228952e-06,
"loss": 1.1287,
"num_tokens": 39423404.0,
"step": 190
},
{
"epoch": 0.4569377990430622,
"grad_norm": 0.5702668222438311,
"learning_rate": 6.3843247428977365e-06,
"loss": 1.1402,
"num_tokens": 39603933.0,
"step": 191
},
{
"epoch": 0.45933014354066987,
"grad_norm": 0.5524617168766218,
"learning_rate": 6.350072556826632e-06,
"loss": 1.0908,
"num_tokens": 39799631.0,
"step": 192
},
{
"epoch": 0.46172248803827753,
"grad_norm": 0.5054083920538464,
"learning_rate": 6.315769221005313e-06,
"loss": 1.1696,
"num_tokens": 40042491.0,
"step": 193
},
{
"epoch": 0.46411483253588515,
"grad_norm": 0.4984596483043875,
"learning_rate": 6.281416799501188e-06,
"loss": 0.9211,
"num_tokens": 40228565.0,
"step": 194
},
{
"epoch": 0.4665071770334928,
"grad_norm": 0.5341608488908804,
"learning_rate": 6.247017359335199e-06,
"loss": 1.2083,
"num_tokens": 40410247.0,
"step": 195
},
{
"epoch": 0.4688995215311005,
"grad_norm": 0.5046486493573384,
"learning_rate": 6.2125729703574534e-06,
"loss": 1.2149,
"num_tokens": 40651771.0,
"step": 196
},
{
"epoch": 0.47129186602870815,
"grad_norm": 0.6097314899954371,
"learning_rate": 6.178085705122675e-06,
"loss": 1.0858,
"num_tokens": 40855435.0,
"step": 197
},
{
"epoch": 0.47368421052631576,
"grad_norm": 0.5774665348623625,
"learning_rate": 6.143557638765494e-06,
"loss": 1.122,
"num_tokens": 41030495.0,
"step": 198
},
{
"epoch": 0.47607655502392343,
"grad_norm": 0.48860350341505726,
"learning_rate": 6.108990848875591e-06,
"loss": 1.3412,
"num_tokens": 41277045.0,
"step": 199
},
{
"epoch": 0.4784688995215311,
"grad_norm": 0.5361962907700251,
"learning_rate": 6.074387415372677e-06,
"loss": 1.0927,
"num_tokens": 41500279.0,
"step": 200
},
{
"epoch": 0.48086124401913877,
"grad_norm": 0.6039231448287091,
"learning_rate": 6.039749420381349e-06,
"loss": 1.1362,
"num_tokens": 41677455.0,
"step": 201
},
{
"epoch": 0.48325358851674644,
"grad_norm": 0.5131741268531921,
"learning_rate": 6.005078948105808e-06,
"loss": 1.2406,
"num_tokens": 41894065.0,
"step": 202
},
{
"epoch": 0.48564593301435405,
"grad_norm": 0.47724842296291775,
"learning_rate": 5.970378084704441e-06,
"loss": 1.0304,
"num_tokens": 42128139.0,
"step": 203
},
{
"epoch": 0.4880382775119617,
"grad_norm": 0.5240356233196276,
"learning_rate": 5.935648918164308e-06,
"loss": 1.0814,
"num_tokens": 42333521.0,
"step": 204
},
{
"epoch": 0.4904306220095694,
"grad_norm": 0.5251041508662586,
"learning_rate": 5.90089353817549e-06,
"loss": 1.1679,
"num_tokens": 42533301.0,
"step": 205
},
{
"epoch": 0.49282296650717705,
"grad_norm": 0.6532050533136743,
"learning_rate": 5.866114036005363e-06,
"loss": 0.9818,
"num_tokens": 42694701.0,
"step": 206
},
{
"epoch": 0.49521531100478466,
"grad_norm": 0.6836388656935797,
"learning_rate": 5.831312504372762e-06,
"loss": 1.0012,
"num_tokens": 42809151.0,
"step": 207
},
{
"epoch": 0.49760765550239233,
"grad_norm": 0.5030489700232146,
"learning_rate": 5.796491037322054e-06,
"loss": 1.1244,
"num_tokens": 43035639.0,
"step": 208
},
{
"epoch": 0.5,
"grad_norm": 0.5562880150972886,
"learning_rate": 5.761651730097142e-06,
"loss": 1.3298,
"num_tokens": 43207069.0,
"step": 209
},
{
"epoch": 0.5023923444976076,
"grad_norm": 0.5324885775750403,
"learning_rate": 5.726796679015392e-06,
"loss": 1.3305,
"num_tokens": 43475398.0,
"step": 210
},
{
"epoch": 0.5047846889952153,
"grad_norm": 0.6085427119073632,
"learning_rate": 5.691927981341488e-06,
"loss": 1.0097,
"num_tokens": 43641183.0,
"step": 211
},
{
"epoch": 0.507177033492823,
"grad_norm": 0.6541524113634078,
"learning_rate": 5.657047735161256e-06,
"loss": 0.7888,
"num_tokens": 43820730.0,
"step": 212
},
{
"epoch": 0.5095693779904307,
"grad_norm": 0.5724267971985464,
"learning_rate": 5.622158039255394e-06,
"loss": 1.1429,
"num_tokens": 44013162.0,
"step": 213
},
{
"epoch": 0.5119617224880383,
"grad_norm": 0.4888491874482519,
"learning_rate": 5.58726099297321e-06,
"loss": 1.0386,
"num_tokens": 44259910.0,
"step": 214
},
{
"epoch": 0.5143540669856459,
"grad_norm": 0.5678338260313958,
"learning_rate": 5.552358696106288e-06,
"loss": 1.175,
"num_tokens": 44480685.0,
"step": 215
},
{
"epoch": 0.5167464114832536,
"grad_norm": 0.5262339117176533,
"learning_rate": 5.517453248762142e-06,
"loss": 1.233,
"num_tokens": 44690652.0,
"step": 216
},
{
"epoch": 0.5191387559808612,
"grad_norm": 0.5686946242510297,
"learning_rate": 5.482546751237859e-06,
"loss": 0.9377,
"num_tokens": 44905510.0,
"step": 217
},
{
"epoch": 0.5215311004784688,
"grad_norm": 0.5096154075649568,
"learning_rate": 5.447641303893715e-06,
"loss": 0.9606,
"num_tokens": 45121618.0,
"step": 218
},
{
"epoch": 0.5239234449760766,
"grad_norm": 0.5027532121976238,
"learning_rate": 5.412739007026791e-06,
"loss": 1.3208,
"num_tokens": 45328957.0,
"step": 219
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.5955398795356434,
"learning_rate": 5.377841960744607e-06,
"loss": 1.0519,
"num_tokens": 45470498.0,
"step": 220
},
{
"epoch": 0.5287081339712919,
"grad_norm": 0.5632402633040062,
"learning_rate": 5.342952264838748e-06,
"loss": 1.0009,
"num_tokens": 45690586.0,
"step": 221
},
{
"epoch": 0.5311004784688995,
"grad_norm": 0.5530392228322656,
"learning_rate": 5.308072018658512e-06,
"loss": 1.0197,
"num_tokens": 45915829.0,
"step": 222
},
{
"epoch": 0.5334928229665071,
"grad_norm": 0.5560740776916706,
"learning_rate": 5.273203320984611e-06,
"loss": 1.0086,
"num_tokens": 46125336.0,
"step": 223
},
{
"epoch": 0.5358851674641149,
"grad_norm": 0.47936312873685966,
"learning_rate": 5.23834826990286e-06,
"loss": 1.2004,
"num_tokens": 46386175.0,
"step": 224
},
{
"epoch": 0.5382775119617225,
"grad_norm": 0.5451628089579803,
"learning_rate": 5.203508962677947e-06,
"loss": 1.1559,
"num_tokens": 46618828.0,
"step": 225
},
{
"epoch": 0.5406698564593302,
"grad_norm": 0.5352825379096331,
"learning_rate": 5.168687495627239e-06,
"loss": 1.1977,
"num_tokens": 46873878.0,
"step": 226
},
{
"epoch": 0.5430622009569378,
"grad_norm": 0.5328607455361074,
"learning_rate": 5.1338859639946396e-06,
"loss": 1.0719,
"num_tokens": 47110612.0,
"step": 227
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.5355655159606746,
"learning_rate": 5.099106461824513e-06,
"loss": 1.1536,
"num_tokens": 47297604.0,
"step": 228
},
{
"epoch": 0.5478468899521531,
"grad_norm": 0.652585538954601,
"learning_rate": 5.064351081835695e-06,
"loss": 1.1744,
"num_tokens": 47508300.0,
"step": 229
},
{
"epoch": 0.5502392344497608,
"grad_norm": 0.5726602885947132,
"learning_rate": 5.02962191529556e-06,
"loss": 0.9178,
"num_tokens": 47674186.0,
"step": 230
},
{
"epoch": 0.5526315789473685,
"grad_norm": 0.5227349746690181,
"learning_rate": 4.9949210518941945e-06,
"loss": 1.0537,
"num_tokens": 47869064.0,
"step": 231
},
{
"epoch": 0.5550239234449761,
"grad_norm": 0.5270482777761917,
"learning_rate": 4.960250579618652e-06,
"loss": 1.1318,
"num_tokens": 48073543.0,
"step": 232
},
{
"epoch": 0.5574162679425837,
"grad_norm": 0.5628820736414913,
"learning_rate": 4.925612584627325e-06,
"loss": 1.0542,
"num_tokens": 48249518.0,
"step": 233
},
{
"epoch": 0.5598086124401914,
"grad_norm": 0.5460319678028444,
"learning_rate": 4.8910091511244115e-06,
"loss": 1.0131,
"num_tokens": 48471001.0,
"step": 234
},
{
"epoch": 0.562200956937799,
"grad_norm": 0.5503254822986171,
"learning_rate": 4.856442361234507e-06,
"loss": 1.0773,
"num_tokens": 48720980.0,
"step": 235
},
{
"epoch": 0.5645933014354066,
"grad_norm": 0.5091545126296911,
"learning_rate": 4.821914294877327e-06,
"loss": 1.1478,
"num_tokens": 48922782.0,
"step": 236
},
{
"epoch": 0.5669856459330144,
"grad_norm": 0.5074108889085012,
"learning_rate": 4.787427029642549e-06,
"loss": 1.2534,
"num_tokens": 49149522.0,
"step": 237
},
{
"epoch": 0.569377990430622,
"grad_norm": 0.5849957930987398,
"learning_rate": 4.752982640664804e-06,
"loss": 1.0202,
"num_tokens": 49321177.0,
"step": 238
},
{
"epoch": 0.5717703349282297,
"grad_norm": 0.5347992211342384,
"learning_rate": 4.718583200498814e-06,
"loss": 1.2032,
"num_tokens": 49544634.0,
"step": 239
},
{
"epoch": 0.5741626794258373,
"grad_norm": 0.5280959102930131,
"learning_rate": 4.684230778994688e-06,
"loss": 1.1751,
"num_tokens": 49724091.0,
"step": 240
},
{
"epoch": 0.5765550239234449,
"grad_norm": 0.5164476203177735,
"learning_rate": 4.64992744317337e-06,
"loss": 1.1098,
"num_tokens": 49929099.0,
"step": 241
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.6667023806983443,
"learning_rate": 4.615675257102265e-06,
"loss": 0.9402,
"num_tokens": 50081941.0,
"step": 242
},
{
"epoch": 0.5813397129186603,
"grad_norm": 0.5023784414967131,
"learning_rate": 4.58147628177105e-06,
"loss": 1.01,
"num_tokens": 50306579.0,
"step": 243
},
{
"epoch": 0.583732057416268,
"grad_norm": 0.5370878293075974,
"learning_rate": 4.547332574967653e-06,
"loss": 1.079,
"num_tokens": 50544895.0,
"step": 244
},
{
"epoch": 0.5861244019138756,
"grad_norm": 0.5090426584844939,
"learning_rate": 4.513246191154434e-06,
"loss": 1.1825,
"num_tokens": 50788203.0,
"step": 245
},
{
"epoch": 0.5885167464114832,
"grad_norm": 0.4792828066902539,
"learning_rate": 4.479219181344579e-06,
"loss": 1.2301,
"num_tokens": 51053982.0,
"step": 246
},
{
"epoch": 0.5909090909090909,
"grad_norm": 0.49219719144165075,
"learning_rate": 4.44525359297867e-06,
"loss": 1.1711,
"num_tokens": 51259911.0,
"step": 247
},
{
"epoch": 0.5933014354066986,
"grad_norm": 0.5340406735561365,
"learning_rate": 4.4113514698014955e-06,
"loss": 1.1956,
"num_tokens": 51473886.0,
"step": 248
},
{
"epoch": 0.5956937799043063,
"grad_norm": 0.5702889032524951,
"learning_rate": 4.377514851739085e-06,
"loss": 1.1091,
"num_tokens": 51735586.0,
"step": 249
},
{
"epoch": 0.5980861244019139,
"grad_norm": 0.5115029340630267,
"learning_rate": 4.3437457747759515e-06,
"loss": 1.1343,
"num_tokens": 51923001.0,
"step": 250
},
{
"epoch": 0.6004784688995215,
"grad_norm": 0.4738251807559482,
"learning_rate": 4.310046270832592e-06,
"loss": 1.07,
"num_tokens": 52167211.0,
"step": 251
},
{
"epoch": 0.6028708133971292,
"grad_norm": 0.562569354089248,
"learning_rate": 4.276418367643218e-06,
"loss": 0.9359,
"num_tokens": 52345300.0,
"step": 252
},
{
"epoch": 0.6052631578947368,
"grad_norm": 0.6492878859321651,
"learning_rate": 4.242864088633762e-06,
"loss": 0.8908,
"num_tokens": 52537210.0,
"step": 253
},
{
"epoch": 0.6076555023923444,
"grad_norm": 0.6078233345214087,
"learning_rate": 4.2093854528000955e-06,
"loss": 0.8913,
"num_tokens": 52695428.0,
"step": 254
},
{
"epoch": 0.6100478468899522,
"grad_norm": 0.5115019352055596,
"learning_rate": 4.175984474586572e-06,
"loss": 1.0335,
"num_tokens": 52945131.0,
"step": 255
},
{
"epoch": 0.6124401913875598,
"grad_norm": 0.5875660189403787,
"learning_rate": 4.142663163764806e-06,
"loss": 0.941,
"num_tokens": 53101160.0,
"step": 256
},
{
"epoch": 0.6148325358851675,
"grad_norm": 0.5230885907461125,
"learning_rate": 4.109423525312738e-06,
"loss": 1.1472,
"num_tokens": 53341330.0,
"step": 257
},
{
"epoch": 0.6172248803827751,
"grad_norm": 0.676100542426314,
"learning_rate": 4.076267559293996e-06,
"loss": 0.9226,
"num_tokens": 53477820.0,
"step": 258
},
{
"epoch": 0.6196172248803827,
"grad_norm": 0.6027764896908601,
"learning_rate": 4.043197260737556e-06,
"loss": 1.1615,
"num_tokens": 53655177.0,
"step": 259
},
{
"epoch": 0.6220095693779905,
"grad_norm": 0.5114599101755669,
"learning_rate": 4.0102146195176895e-06,
"loss": 1.0848,
"num_tokens": 53871093.0,
"step": 260
},
{
"epoch": 0.6244019138755981,
"grad_norm": 0.5616877393452973,
"learning_rate": 3.977321620234236e-06,
"loss": 1.1293,
"num_tokens": 54051884.0,
"step": 261
},
{
"epoch": 0.6267942583732058,
"grad_norm": 0.5951828000342995,
"learning_rate": 3.944520242093186e-06,
"loss": 1.1116,
"num_tokens": 54243302.0,
"step": 262
},
{
"epoch": 0.6291866028708134,
"grad_norm": 0.5533241097093147,
"learning_rate": 3.911812458787592e-06,
"loss": 1.0339,
"num_tokens": 54449587.0,
"step": 263
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.6391714671501187,
"learning_rate": 3.8792002383788044e-06,
"loss": 1.0188,
"num_tokens": 54573282.0,
"step": 264
},
{
"epoch": 0.6339712918660287,
"grad_norm": 0.48381850337769244,
"learning_rate": 3.846685543178058e-06,
"loss": 1.2549,
"num_tokens": 54826368.0,
"step": 265
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.49990948075130837,
"learning_rate": 3.8142703296283954e-06,
"loss": 1.1331,
"num_tokens": 55080391.0,
"step": 266
},
{
"epoch": 0.638755980861244,
"grad_norm": 0.5427808072503959,
"learning_rate": 3.7819565481869426e-06,
"loss": 1.1618,
"num_tokens": 55285642.0,
"step": 267
},
{
"epoch": 0.6411483253588517,
"grad_norm": 0.5747721632491769,
"learning_rate": 3.7497461432075477e-06,
"loss": 1.1053,
"num_tokens": 55481520.0,
"step": 268
},
{
"epoch": 0.6435406698564593,
"grad_norm": 0.5301204962544379,
"learning_rate": 3.717641052823795e-06,
"loss": 1.1108,
"num_tokens": 55706780.0,
"step": 269
},
{
"epoch": 0.645933014354067,
"grad_norm": 0.5775776454615925,
"learning_rate": 3.6856432088323746e-06,
"loss": 1.1119,
"num_tokens": 55902431.0,
"step": 270
},
{
"epoch": 0.6483253588516746,
"grad_norm": 0.5001600002488803,
"learning_rate": 3.6537545365768543e-06,
"loss": 0.9535,
"num_tokens": 56104220.0,
"step": 271
},
{
"epoch": 0.6507177033492823,
"grad_norm": 0.5699808255124916,
"learning_rate": 3.6219769548318205e-06,
"loss": 1.0524,
"num_tokens": 56257950.0,
"step": 272
},
{
"epoch": 0.65311004784689,
"grad_norm": 0.5003276838892392,
"learning_rate": 3.5903123756874315e-06,
"loss": 1.1485,
"num_tokens": 56488654.0,
"step": 273
},
{
"epoch": 0.6555023923444976,
"grad_norm": 0.6033119191336221,
"learning_rate": 3.558762704434361e-06,
"loss": 1.024,
"num_tokens": 56686270.0,
"step": 274
},
{
"epoch": 0.6578947368421053,
"grad_norm": 0.4693280395015428,
"learning_rate": 3.527329839449152e-06,
"loss": 1.136,
"num_tokens": 56931317.0,
"step": 275
},
{
"epoch": 0.6602870813397129,
"grad_norm": 0.5278398302464965,
"learning_rate": 3.496015672079998e-06,
"loss": 1.1571,
"num_tokens": 57127263.0,
"step": 276
},
{
"epoch": 0.6626794258373205,
"grad_norm": 0.49190545922349904,
"learning_rate": 3.4648220865329312e-06,
"loss": 1.0427,
"num_tokens": 57354122.0,
"step": 277
},
{
"epoch": 0.6650717703349283,
"grad_norm": 0.4934205228618601,
"learning_rate": 3.4337509597584466e-06,
"loss": 1.2705,
"num_tokens": 57579975.0,
"step": 278
},
{
"epoch": 0.6674641148325359,
"grad_norm": 0.6046200272271364,
"learning_rate": 3.402804161338577e-06,
"loss": 0.9143,
"num_tokens": 57767139.0,
"step": 279
},
{
"epoch": 0.6698564593301436,
"grad_norm": 0.5256841221145759,
"learning_rate": 3.371983553374375e-06,
"loss": 1.0864,
"num_tokens": 57969542.0,
"step": 280
},
{
"epoch": 0.6722488038277512,
"grad_norm": 0.5879727234811725,
"learning_rate": 3.3412909903738937e-06,
"loss": 0.9625,
"num_tokens": 58145028.0,
"step": 281
},
{
"epoch": 0.6746411483253588,
"grad_norm": 0.6263377798428889,
"learning_rate": 3.310728319140581e-06,
"loss": 0.9234,
"num_tokens": 58312705.0,
"step": 282
},
{
"epoch": 0.6770334928229665,
"grad_norm": 0.5407307381090947,
"learning_rate": 3.2802973786621665e-06,
"loss": 1.0687,
"num_tokens": 58527623.0,
"step": 283
},
{
"epoch": 0.6794258373205742,
"grad_norm": 0.5502001614125057,
"learning_rate": 3.2500000000000015e-06,
"loss": 1.1427,
"num_tokens": 58772116.0,
"step": 284
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.5686855066649326,
"learning_rate": 3.2198380061788803e-06,
"loss": 1.031,
"num_tokens": 58948693.0,
"step": 285
},
{
"epoch": 0.6842105263157895,
"grad_norm": 0.56427208726594,
"learning_rate": 3.1898132120773566e-06,
"loss": 1.0001,
"num_tokens": 59160106.0,
"step": 286
},
{
"epoch": 0.6866028708133971,
"grad_norm": 0.5178015680501699,
"learning_rate": 3.1599274243185314e-06,
"loss": 1.2459,
"num_tokens": 59393828.0,
"step": 287
},
{
"epoch": 0.6889952153110048,
"grad_norm": 0.6161696867803992,
"learning_rate": 3.1301824411613473e-06,
"loss": 1.077,
"num_tokens": 59592707.0,
"step": 288
},
{
"epoch": 0.6913875598086124,
"grad_norm": 0.49780237640470854,
"learning_rate": 3.1005800523923906e-06,
"loss": 1.1431,
"num_tokens": 59812582.0,
"step": 289
},
{
"epoch": 0.69377990430622,
"grad_norm": 0.5031207474545651,
"learning_rate": 3.071122039218194e-06,
"loss": 1.1467,
"num_tokens": 60043641.0,
"step": 290
},
{
"epoch": 0.6961722488038278,
"grad_norm": 0.574254924525526,
"learning_rate": 3.0418101741580586e-06,
"loss": 1.1918,
"num_tokens": 60234442.0,
"step": 291
},
{
"epoch": 0.6985645933014354,
"grad_norm": 0.5016769304104969,
"learning_rate": 3.012646220937403e-06,
"loss": 1.31,
"num_tokens": 60456123.0,
"step": 292
},
{
"epoch": 0.7009569377990431,
"grad_norm": 0.5058935049560537,
"learning_rate": 2.98363193438164e-06,
"loss": 0.9371,
"num_tokens": 60672710.0,
"step": 293
},
{
"epoch": 0.7033492822966507,
"grad_norm": 0.5351125304814696,
"learning_rate": 2.9547690603105774e-06,
"loss": 1.0698,
"num_tokens": 60894772.0,
"step": 294
},
{
"epoch": 0.7057416267942583,
"grad_norm": 0.5128628418090031,
"learning_rate": 2.926059335433378e-06,
"loss": 1.2298,
"num_tokens": 61142587.0,
"step": 295
},
{
"epoch": 0.7081339712918661,
"grad_norm": 0.5144613524379172,
"learning_rate": 2.897504487244061e-06,
"loss": 0.9337,
"num_tokens": 61352129.0,
"step": 296
},
{
"epoch": 0.7105263157894737,
"grad_norm": 0.5861410143772018,
"learning_rate": 2.8691062339175512e-06,
"loss": 0.9923,
"num_tokens": 61498549.0,
"step": 297
},
{
"epoch": 0.7129186602870813,
"grad_norm": 0.5481256980886055,
"learning_rate": 2.8408662842063002e-06,
"loss": 1.0957,
"num_tokens": 61687826.0,
"step": 298
},
{
"epoch": 0.715311004784689,
"grad_norm": 0.5582805882931381,
"learning_rate": 2.8127863373374637e-06,
"loss": 1.09,
"num_tokens": 61877628.0,
"step": 299
},
{
"epoch": 0.7177033492822966,
"grad_norm": 0.5983921444578938,
"learning_rate": 2.7848680829106602e-06,
"loss": 1.0968,
"num_tokens": 62078858.0,
"step": 300
},
{
"epoch": 0.7200956937799043,
"grad_norm": 0.5339997006585953,
"learning_rate": 2.7571132007963074e-06,
"loss": 1.1891,
"num_tokens": 62265457.0,
"step": 301
},
{
"epoch": 0.722488038277512,
"grad_norm": 0.5449456499746453,
"learning_rate": 2.7295233610345384e-06,
"loss": 1.0269,
"num_tokens": 62488733.0,
"step": 302
},
{
"epoch": 0.7248803827751196,
"grad_norm": 0.5699604526936535,
"learning_rate": 2.7021002237347206e-06,
"loss": 1.1336,
"num_tokens": 62714416.0,
"step": 303
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.6413790402904914,
"learning_rate": 2.6748454389755576e-06,
"loss": 0.9382,
"num_tokens": 62890365.0,
"step": 304
},
{
"epoch": 0.7296650717703349,
"grad_norm": 0.5390387726292147,
"learning_rate": 2.647760646705804e-06,
"loss": 1.0829,
"num_tokens": 63120765.0,
"step": 305
},
{
"epoch": 0.7320574162679426,
"grad_norm": 0.5984653976738545,
"learning_rate": 2.620847476645594e-06,
"loss": 0.9221,
"num_tokens": 63320228.0,
"step": 306
},
{
"epoch": 0.7344497607655502,
"grad_norm": 0.5801251118440074,
"learning_rate": 2.5941075481883705e-06,
"loss": 1.1212,
"num_tokens": 63509873.0,
"step": 307
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.5636489099209283,
"learning_rate": 2.567542470303452e-06,
"loss": 1.078,
"num_tokens": 63745029.0,
"step": 308
},
{
"epoch": 0.7392344497607656,
"grad_norm": 0.48725639119647585,
"learning_rate": 2.5411538414392146e-06,
"loss": 1.2125,
"num_tokens": 63953310.0,
"step": 309
},
{
"epoch": 0.7416267942583732,
"grad_norm": 0.549253240822144,
"learning_rate": 2.5149432494269134e-06,
"loss": 1.1192,
"num_tokens": 64147381.0,
"step": 310
},
{
"epoch": 0.7440191387559809,
"grad_norm": 0.5491580770023559,
"learning_rate": 2.4889122713851397e-06,
"loss": 0.9919,
"num_tokens": 64340436.0,
"step": 311
},
{
"epoch": 0.7464114832535885,
"grad_norm": 0.5164385106756677,
"learning_rate": 2.463062473624927e-06,
"loss": 1.0476,
"num_tokens": 64568538.0,
"step": 312
},
{
"epoch": 0.7488038277511961,
"grad_norm": 0.583840880433391,
"learning_rate": 2.437395411555504e-06,
"loss": 1.1016,
"num_tokens": 64759586.0,
"step": 313
},
{
"epoch": 0.7511961722488039,
"grad_norm": 0.5528719370540063,
"learning_rate": 2.4119126295906997e-06,
"loss": 1.1974,
"num_tokens": 64942864.0,
"step": 314
},
{
"epoch": 0.7535885167464115,
"grad_norm": 0.6028168080715274,
"learning_rate": 2.3866156610560186e-06,
"loss": 1.0019,
"num_tokens": 65142788.0,
"step": 315
},
{
"epoch": 0.7559808612440191,
"grad_norm": 0.5816986940686796,
"learning_rate": 2.3615060280963797e-06,
"loss": 1.2118,
"num_tokens": 65362360.0,
"step": 316
},
{
"epoch": 0.7583732057416268,
"grad_norm": 0.5809244671898545,
"learning_rate": 2.3365852415845225e-06,
"loss": 1.1267,
"num_tokens": 65547922.0,
"step": 317
},
{
"epoch": 0.7607655502392344,
"grad_norm": 0.5262370165475527,
"learning_rate": 2.3118548010301015e-06,
"loss": 1.1893,
"num_tokens": 65731553.0,
"step": 318
},
{
"epoch": 0.7631578947368421,
"grad_norm": 0.5357040610680347,
"learning_rate": 2.2873161944894552e-06,
"loss": 1.1869,
"num_tokens": 65951250.0,
"step": 319
},
{
"epoch": 0.7655502392344498,
"grad_norm": 0.5570433795031379,
"learning_rate": 2.262970898476071e-06,
"loss": 0.9916,
"num_tokens": 66175000.0,
"step": 320
},
{
"epoch": 0.7679425837320574,
"grad_norm": 0.604494546666767,
"learning_rate": 2.2388203778717407e-06,
"loss": 1.1347,
"num_tokens": 66357517.0,
"step": 321
},
{
"epoch": 0.7703349282296651,
"grad_norm": 0.5827904281357608,
"learning_rate": 2.2148660858384147e-06,
"loss": 1.0356,
"num_tokens": 66566078.0,
"step": 322
},
{
"epoch": 0.7727272727272727,
"grad_norm": 0.5218976553836495,
"learning_rate": 2.1911094637307715e-06,
"loss": 1.1124,
"num_tokens": 66784937.0,
"step": 323
},
{
"epoch": 0.7751196172248804,
"grad_norm": 0.49417380874831474,
"learning_rate": 2.1675519410094803e-06,
"loss": 1.1203,
"num_tokens": 67057361.0,
"step": 324
},
{
"epoch": 0.777511961722488,
"grad_norm": 0.6319926280044286,
"learning_rate": 2.144194935155192e-06,
"loss": 1.038,
"num_tokens": 67276459.0,
"step": 325
},
{
"epoch": 0.7799043062200957,
"grad_norm": 0.553450207558276,
"learning_rate": 2.121039851583254e-06,
"loss": 1.0843,
"num_tokens": 67454638.0,
"step": 326
},
{
"epoch": 0.7822966507177034,
"grad_norm": 0.5159208111364086,
"learning_rate": 2.098088083559135e-06,
"loss": 0.9358,
"num_tokens": 67667938.0,
"step": 327
},
{
"epoch": 0.784688995215311,
"grad_norm": 0.5059115925994171,
"learning_rate": 2.0753410121145984e-06,
"loss": 1.1579,
"num_tokens": 67859669.0,
"step": 328
},
{
"epoch": 0.7870813397129187,
"grad_norm": 0.5613491350937895,
"learning_rate": 2.0528000059646e-06,
"loss": 1.0022,
"num_tokens": 68056005.0,
"step": 329
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.5374042116513947,
"learning_rate": 2.0304664214249326e-06,
"loss": 1.0718,
"num_tokens": 68255467.0,
"step": 330
},
{
"epoch": 0.7918660287081339,
"grad_norm": 0.503580387927313,
"learning_rate": 2.0083416023306163e-06,
"loss": 1.1493,
"num_tokens": 68469900.0,
"step": 331
},
{
"epoch": 0.7942583732057417,
"grad_norm": 0.5884447457044938,
"learning_rate": 1.986426879955034e-06,
"loss": 0.9502,
"num_tokens": 68685343.0,
"step": 332
},
{
"epoch": 0.7966507177033493,
"grad_norm": 0.6834427409407543,
"learning_rate": 1.9647235729298346e-06,
"loss": 0.9018,
"num_tokens": 68834514.0,
"step": 333
},
{
"epoch": 0.7990430622009569,
"grad_norm": 0.5189288186456062,
"learning_rate": 1.9432329871655837e-06,
"loss": 1.2691,
"num_tokens": 69046003.0,
"step": 334
},
{
"epoch": 0.8014354066985646,
"grad_norm": 0.516776960640009,
"learning_rate": 1.9219564157731848e-06,
"loss": 1.0057,
"num_tokens": 69272731.0,
"step": 335
},
{
"epoch": 0.8038277511961722,
"grad_norm": 0.4831598833288486,
"learning_rate": 1.9008951389860785e-06,
"loss": 1.1143,
"num_tokens": 69508303.0,
"step": 336
},
{
"epoch": 0.80622009569378,
"grad_norm": 0.5753229158728437,
"learning_rate": 1.8800504240832012e-06,
"loss": 1.1146,
"num_tokens": 69706781.0,
"step": 337
},
{
"epoch": 0.8086124401913876,
"grad_norm": 0.5983941033127453,
"learning_rate": 1.8594235253127373e-06,
"loss": 1.1979,
"num_tokens": 69926110.0,
"step": 338
},
{
"epoch": 0.8110047846889952,
"grad_norm": 0.5114846230853078,
"learning_rate": 1.8390156838166464e-06,
"loss": 1.016,
"num_tokens": 70133509.0,
"step": 339
},
{
"epoch": 0.8133971291866029,
"grad_norm": 0.5260668256751079,
"learning_rate": 1.8188281275559866e-06,
"loss": 1.0266,
"num_tokens": 70365768.0,
"step": 340
},
{
"epoch": 0.8157894736842105,
"grad_norm": 0.5595038468322735,
"learning_rate": 1.7988620712370197e-06,
"loss": 1.1005,
"num_tokens": 70548685.0,
"step": 341
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.6890712705743423,
"learning_rate": 1.7791187162381325e-06,
"loss": 1.0739,
"num_tokens": 70725591.0,
"step": 342
},
{
"epoch": 0.8205741626794258,
"grad_norm": 0.5344037158436257,
"learning_rate": 1.759599250537534e-06,
"loss": 1.1548,
"num_tokens": 70943507.0,
"step": 343
},
{
"epoch": 0.8229665071770335,
"grad_norm": 0.5589105656078766,
"learning_rate": 1.740304848641787e-06,
"loss": 1.0402,
"num_tokens": 71137045.0,
"step": 344
},
{
"epoch": 0.8253588516746412,
"grad_norm": 0.5768929116638776,
"learning_rate": 1.7212366715151263e-06,
"loss": 0.9768,
"num_tokens": 71350643.0,
"step": 345
},
{
"epoch": 0.8277511961722488,
"grad_norm": 0.6276817700534357,
"learning_rate": 1.702395866509612e-06,
"loss": 0.9183,
"num_tokens": 71539784.0,
"step": 346
},
{
"epoch": 0.8301435406698564,
"grad_norm": 0.5484078243741392,
"learning_rate": 1.6837835672960834e-06,
"loss": 1.1514,
"num_tokens": 71742614.0,
"step": 347
},
{
"epoch": 0.8325358851674641,
"grad_norm": 0.5193578245554346,
"learning_rate": 1.6654008937959498e-06,
"loss": 0.9674,
"num_tokens": 71994797.0,
"step": 348
},
{
"epoch": 0.8349282296650717,
"grad_norm": 0.528358256622246,
"learning_rate": 1.6472489521138016e-06,
"loss": 1.108,
"num_tokens": 72191401.0,
"step": 349
},
{
"epoch": 0.8373205741626795,
"grad_norm": 0.5611551275004363,
"learning_rate": 1.629328834470857e-06,
"loss": 1.1481,
"num_tokens": 72346485.0,
"step": 350
},
{
"epoch": 0.8397129186602871,
"grad_norm": 0.4671315072196002,
"learning_rate": 1.611641619139238e-06,
"loss": 1.1736,
"num_tokens": 72601665.0,
"step": 351
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.5555560185216512,
"learning_rate": 1.5941883703770968e-06,
"loss": 1.1533,
"num_tokens": 72836095.0,
"step": 352
},
{
"epoch": 0.8444976076555024,
"grad_norm": 0.5288816745801785,
"learning_rate": 1.57697013836457e-06,
"loss": 1.0494,
"num_tokens": 73049430.0,
"step": 353
},
{
"epoch": 0.84688995215311,
"grad_norm": 0.6233482042563366,
"learning_rate": 1.5599879591405917e-06,
"loss": 1.0147,
"num_tokens": 73196007.0,
"step": 354
},
{
"epoch": 0.8492822966507177,
"grad_norm": 0.5363849538121136,
"learning_rate": 1.5432428545405554e-06,
"loss": 1.1694,
"num_tokens": 73396469.0,
"step": 355
},
{
"epoch": 0.8516746411483254,
"grad_norm": 0.5932100916233094,
"learning_rate": 1.526735832134829e-06,
"loss": 1.0174,
"num_tokens": 73584128.0,
"step": 356
},
{
"epoch": 0.854066985645933,
"grad_norm": 0.6127092810753643,
"learning_rate": 1.5104678851681253e-06,
"loss": 0.8168,
"num_tokens": 73717071.0,
"step": 357
},
{
"epoch": 0.8564593301435407,
"grad_norm": 0.6293206669166083,
"learning_rate": 1.4944399924997372e-06,
"loss": 0.7752,
"num_tokens": 73883367.0,
"step": 358
},
{
"epoch": 0.8588516746411483,
"grad_norm": 0.531317141972036,
"learning_rate": 1.4786531185446455e-06,
"loss": 1.1077,
"num_tokens": 74123207.0,
"step": 359
},
{
"epoch": 0.861244019138756,
"grad_norm": 0.44768314533679704,
"learning_rate": 1.4631082132154806e-06,
"loss": 1.2024,
"num_tokens": 74395731.0,
"step": 360
},
{
"epoch": 0.8636363636363636,
"grad_norm": 0.4788316306745224,
"learning_rate": 1.4478062118653703e-06,
"loss": 1.1751,
"num_tokens": 74663304.0,
"step": 361
},
{
"epoch": 0.8660287081339713,
"grad_norm": 0.4783192674308249,
"learning_rate": 1.4327480352316581e-06,
"loss": 1.1805,
"num_tokens": 74907925.0,
"step": 362
},
{
"epoch": 0.868421052631579,
"grad_norm": 0.5707901460896949,
"learning_rate": 1.417934589380498e-06,
"loss": 1.0742,
"num_tokens": 75130243.0,
"step": 363
},
{
"epoch": 0.8708133971291866,
"grad_norm": 0.6017414939136261,
"learning_rate": 1.4033667656523405e-06,
"loss": 0.9557,
"num_tokens": 75352077.0,
"step": 364
},
{
"epoch": 0.8732057416267942,
"grad_norm": 0.4853066070350836,
"learning_rate": 1.389045440608296e-06,
"loss": 1.08,
"num_tokens": 75592089.0,
"step": 365
},
{
"epoch": 0.8755980861244019,
"grad_norm": 0.5253451321715548,
"learning_rate": 1.374971475977394e-06,
"loss": 1.2071,
"num_tokens": 75818956.0,
"step": 366
},
{
"epoch": 0.8779904306220095,
"grad_norm": 0.5659204983119508,
"learning_rate": 1.361145718604731e-06,
"loss": 1.1936,
"num_tokens": 76017603.0,
"step": 367
},
{
"epoch": 0.8803827751196173,
"grad_norm": 0.479841142759106,
"learning_rate": 1.3475690004005098e-06,
"loss": 1.191,
"num_tokens": 76290864.0,
"step": 368
},
{
"epoch": 0.8827751196172249,
"grad_norm": 0.5872255230326239,
"learning_rate": 1.3342421382899936e-06,
"loss": 1.0301,
"num_tokens": 76529427.0,
"step": 369
},
{
"epoch": 0.8851674641148325,
"grad_norm": 0.5029097871572791,
"learning_rate": 1.3211659341643412e-06,
"loss": 1.2066,
"num_tokens": 76742589.0,
"step": 370
},
{
"epoch": 0.8875598086124402,
"grad_norm": 0.584840618113796,
"learning_rate": 1.308341174832359e-06,
"loss": 0.9768,
"num_tokens": 76939827.0,
"step": 371
},
{
"epoch": 0.8899521531100478,
"grad_norm": 0.44994308377297715,
"learning_rate": 1.2957686319731623e-06,
"loss": 1.2925,
"num_tokens": 77190390.0,
"step": 372
},
{
"epoch": 0.8923444976076556,
"grad_norm": 0.614291349507059,
"learning_rate": 1.2834490620897342e-06,
"loss": 1.0009,
"num_tokens": 77368607.0,
"step": 373
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.5540701345571359,
"learning_rate": 1.2713832064634127e-06,
"loss": 1.281,
"num_tokens": 77595326.0,
"step": 374
},
{
"epoch": 0.8971291866028708,
"grad_norm": 0.5930336764639087,
"learning_rate": 1.259571791109285e-06,
"loss": 1.1882,
"num_tokens": 77757257.0,
"step": 375
},
{
"epoch": 0.8995215311004785,
"grad_norm": 0.5601557384818509,
"learning_rate": 1.2480155267325039e-06,
"loss": 0.9335,
"num_tokens": 77966559.0,
"step": 376
},
{
"epoch": 0.9019138755980861,
"grad_norm": 0.5146670174651209,
"learning_rate": 1.2367151086855187e-06,
"loss": 1.1928,
"num_tokens": 78180912.0,
"step": 377
},
{
"epoch": 0.9043062200956937,
"grad_norm": 0.585671381043156,
"learning_rate": 1.2256712169262415e-06,
"loss": 1.0569,
"num_tokens": 78336709.0,
"step": 378
},
{
"epoch": 0.9066985645933014,
"grad_norm": 0.5144842875674174,
"learning_rate": 1.2148845159771311e-06,
"loss": 1.0092,
"num_tokens": 78603450.0,
"step": 379
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.594728768695324,
"learning_rate": 1.2043556548852065e-06,
"loss": 1.0245,
"num_tokens": 78852293.0,
"step": 380
},
{
"epoch": 0.9114832535885168,
"grad_norm": 0.46010783326706295,
"learning_rate": 1.1940852671829938e-06,
"loss": 1.2352,
"num_tokens": 79112672.0,
"step": 381
},
{
"epoch": 0.9138755980861244,
"grad_norm": 0.601262109893317,
"learning_rate": 1.184073970850408e-06,
"loss": 1.1504,
"num_tokens": 79319617.0,
"step": 382
},
{
"epoch": 0.916267942583732,
"grad_norm": 0.5038692624203227,
"learning_rate": 1.174322368277565e-06,
"loss": 1.1967,
"num_tokens": 79549771.0,
"step": 383
},
{
"epoch": 0.9186602870813397,
"grad_norm": 0.5753103173201497,
"learning_rate": 1.1648310462285386e-06,
"loss": 1.1225,
"num_tokens": 79738016.0,
"step": 384
},
{
"epoch": 0.9210526315789473,
"grad_norm": 0.614917920007612,
"learning_rate": 1.1556005758060517e-06,
"loss": 0.9872,
"num_tokens": 79913100.0,
"step": 385
},
{
"epoch": 0.9234449760765551,
"grad_norm": 0.5342918968914316,
"learning_rate": 1.146631512417113e-06,
"loss": 1.0676,
"num_tokens": 80103047.0,
"step": 386
},
{
"epoch": 0.9258373205741627,
"grad_norm": 0.5439716109099237,
"learning_rate": 1.1379243957395987e-06,
"loss": 1.0585,
"num_tokens": 80292737.0,
"step": 387
},
{
"epoch": 0.9282296650717703,
"grad_norm": 0.5342393003750865,
"learning_rate": 1.1294797496897786e-06,
"loss": 1.1836,
"num_tokens": 80512263.0,
"step": 388
},
{
"epoch": 0.930622009569378,
"grad_norm": 0.4855841313887977,
"learning_rate": 1.121298082390793e-06,
"loss": 1.0198,
"num_tokens": 80713362.0,
"step": 389
},
{
"epoch": 0.9330143540669856,
"grad_norm": 0.5404438942427807,
"learning_rate": 1.113379886142075e-06,
"loss": 0.9669,
"num_tokens": 80921168.0,
"step": 390
},
{
"epoch": 0.9354066985645934,
"grad_norm": 0.5507820902601309,
"learning_rate": 1.105725637389732e-06,
"loss": 1.0652,
"num_tokens": 81149885.0,
"step": 391
},
{
"epoch": 0.937799043062201,
"grad_norm": 0.5015294273795851,
"learning_rate": 1.0983357966978747e-06,
"loss": 1.1452,
"num_tokens": 81384820.0,
"step": 392
},
{
"epoch": 0.9401913875598086,
"grad_norm": 0.5530079510762682,
"learning_rate": 1.0912108087209075e-06,
"loss": 1.0865,
"num_tokens": 81577699.0,
"step": 393
},
{
"epoch": 0.9425837320574163,
"grad_norm": 0.49796992979545124,
"learning_rate": 1.084351102176769e-06,
"loss": 0.9428,
"num_tokens": 81803396.0,
"step": 394
},
{
"epoch": 0.9449760765550239,
"grad_norm": 0.5777758192642776,
"learning_rate": 1.0777570898211406e-06,
"loss": 1.0373,
"num_tokens": 81968827.0,
"step": 395
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.5754456579892182,
"learning_rate": 1.0714291684226054e-06,
"loss": 1.0265,
"num_tokens": 82166516.0,
"step": 396
},
{
"epoch": 0.9497607655502392,
"grad_norm": 0.558633769969428,
"learning_rate": 1.0653677187387787e-06,
"loss": 1.0473,
"num_tokens": 82338824.0,
"step": 397
},
{
"epoch": 0.9521531100478469,
"grad_norm": 0.6176260102445734,
"learning_rate": 1.0595731054933937e-06,
"loss": 1.0043,
"num_tokens": 82531186.0,
"step": 398
},
{
"epoch": 0.9545454545454546,
"grad_norm": 0.5126700946523376,
"learning_rate": 1.0540456773543596e-06,
"loss": 1.2646,
"num_tokens": 82735927.0,
"step": 399
},
{
"epoch": 0.9569377990430622,
"grad_norm": 0.5671634428425157,
"learning_rate": 1.0487857669127782e-06,
"loss": 1.1623,
"num_tokens": 82904745.0,
"step": 400
},
{
"epoch": 0.9593301435406698,
"grad_norm": 0.559489922062985,
"learning_rate": 1.0437936906629336e-06,
"loss": 1.0435,
"num_tokens": 83074515.0,
"step": 401
},
{
"epoch": 0.9617224880382775,
"grad_norm": 0.5577904608135668,
"learning_rate": 1.039069748983248e-06,
"loss": 0.7559,
"num_tokens": 83243340.0,
"step": 402
},
{
"epoch": 0.9641148325358851,
"grad_norm": 0.5215879777836743,
"learning_rate": 1.0346142261182064e-06,
"loss": 1.1583,
"num_tokens": 83474214.0,
"step": 403
},
{
"epoch": 0.9665071770334929,
"grad_norm": 0.5509462473469403,
"learning_rate": 1.0304273901612566e-06,
"loss": 1.0304,
"num_tokens": 83644954.0,
"step": 404
},
{
"epoch": 0.9688995215311005,
"grad_norm": 0.536818549153514,
"learning_rate": 1.0265094930386741e-06,
"loss": 1.2204,
"num_tokens": 83861919.0,
"step": 405
},
{
"epoch": 0.9712918660287081,
"grad_norm": 0.5740452675590582,
"learning_rate": 1.0228607704944048e-06,
"loss": 0.9858,
"num_tokens": 84024816.0,
"step": 406
},
{
"epoch": 0.9736842105263158,
"grad_norm": 0.5261150137396471,
"learning_rate": 1.0194814420758806e-06,
"loss": 1.1349,
"num_tokens": 84239403.0,
"step": 407
},
{
"epoch": 0.9760765550239234,
"grad_norm": 0.6448679502450355,
"learning_rate": 1.0163717111208086e-06,
"loss": 0.9748,
"num_tokens": 84432507.0,
"step": 408
},
{
"epoch": 0.9784688995215312,
"grad_norm": 0.5218518317378777,
"learning_rate": 1.0135317647449362e-06,
"loss": 0.9739,
"num_tokens": 84644408.0,
"step": 409
},
{
"epoch": 0.9808612440191388,
"grad_norm": 0.5596368200732923,
"learning_rate": 1.0109617738307914e-06,
"loss": 1.0414,
"num_tokens": 84854304.0,
"step": 410
},
{
"epoch": 0.9832535885167464,
"grad_norm": 0.5348740586634487,
"learning_rate": 1.0086618930174011e-06,
"loss": 1.1507,
"num_tokens": 85056365.0,
"step": 411
},
{
"epoch": 0.9856459330143541,
"grad_norm": 0.554299617798691,
"learning_rate": 1.006632260690988e-06,
"loss": 1.0713,
"num_tokens": 85211462.0,
"step": 412
},
{
"epoch": 0.9880382775119617,
"grad_norm": 0.5659307655892759,
"learning_rate": 1.0048729989766396e-06,
"loss": 0.9576,
"num_tokens": 85413979.0,
"step": 413
},
{
"epoch": 0.9904306220095693,
"grad_norm": 0.6180230319552571,
"learning_rate": 1.0033842137309649e-06,
"loss": 0.9867,
"num_tokens": 85564746.0,
"step": 414
},
{
"epoch": 0.992822966507177,
"grad_norm": 0.49348429130589355,
"learning_rate": 1.0021659945357202e-06,
"loss": 1.2502,
"num_tokens": 85821465.0,
"step": 415
},
{
"epoch": 0.9952153110047847,
"grad_norm": 0.5397948420594149,
"learning_rate": 1.0012184146924225e-06,
"loss": 1.1626,
"num_tokens": 86064119.0,
"step": 416
},
{
"epoch": 0.9976076555023924,
"grad_norm": 0.7144358111953418,
"learning_rate": 1.0005415312179367e-06,
"loss": 0.8718,
"num_tokens": 86205361.0,
"step": 417
},
{
"epoch": 1.0,
"grad_norm": 0.546093577829937,
"learning_rate": 1.0001353848410461e-06,
"loss": 1.0204,
"num_tokens": 86399088.0,
"step": 418
},
{
"epoch": 1.0,
"eval_loss": 0.6695132851600647,
"eval_num_tokens": 86399088.0,
"eval_runtime": 101.4457,
"eval_samples_per_second": 29.296,
"eval_steps_per_second": 3.667,
"step": 418
},
{
"epoch": 1.0,
"step": 418,
"total_flos": 290901703622656.0,
"train_loss": 1.1731020922295785,
"train_runtime": 3083.067,
"train_samples_per_second": 8.674,
"train_steps_per_second": 0.136
}
],
"logging_steps": 1,
"max_steps": 418,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 290901703622656.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}