deepmath_32B / trainer_state.json
marianna13's picture
Upload folder using huggingface_hub
e821056 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1010,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0049504950495049506,
"grad_norm": 2.7106738805250417,
"learning_rate": 7.920792079207921e-07,
"loss": 0.6976,
"step": 1
},
{
"epoch": 0.009900990099009901,
"grad_norm": 2.696401601038833,
"learning_rate": 1.5841584158415842e-06,
"loss": 0.6904,
"step": 2
},
{
"epoch": 0.01485148514851485,
"grad_norm": 2.692059520203227,
"learning_rate": 2.3762376237623762e-06,
"loss": 0.6909,
"step": 3
},
{
"epoch": 0.019801980198019802,
"grad_norm": 2.49976515730273,
"learning_rate": 3.1683168316831685e-06,
"loss": 0.6768,
"step": 4
},
{
"epoch": 0.024752475247524754,
"grad_norm": 1.967697623740443,
"learning_rate": 3.960396039603961e-06,
"loss": 0.6623,
"step": 5
},
{
"epoch": 0.0297029702970297,
"grad_norm": 1.3506412968967432,
"learning_rate": 4.7524752475247525e-06,
"loss": 0.6329,
"step": 6
},
{
"epoch": 0.034653465346534656,
"grad_norm": 1.2792125314994267,
"learning_rate": 5.544554455445545e-06,
"loss": 0.6231,
"step": 7
},
{
"epoch": 0.039603960396039604,
"grad_norm": 1.0567537067272226,
"learning_rate": 6.336633663366337e-06,
"loss": 0.5908,
"step": 8
},
{
"epoch": 0.04455445544554455,
"grad_norm": 1.0503985206378559,
"learning_rate": 7.128712871287129e-06,
"loss": 0.5854,
"step": 9
},
{
"epoch": 0.04950495049504951,
"grad_norm": 0.9386269831580432,
"learning_rate": 7.920792079207921e-06,
"loss": 0.5732,
"step": 10
},
{
"epoch": 0.054455445544554455,
"grad_norm": 1.1064367369100885,
"learning_rate": 8.712871287128714e-06,
"loss": 0.537,
"step": 11
},
{
"epoch": 0.0594059405940594,
"grad_norm": 1.0457856811185964,
"learning_rate": 9.504950495049505e-06,
"loss": 0.5313,
"step": 12
},
{
"epoch": 0.06435643564356436,
"grad_norm": 0.681887596760736,
"learning_rate": 1.0297029702970298e-05,
"loss": 0.515,
"step": 13
},
{
"epoch": 0.06930693069306931,
"grad_norm": 0.5774254406822782,
"learning_rate": 1.108910891089109e-05,
"loss": 0.5054,
"step": 14
},
{
"epoch": 0.07425742574257425,
"grad_norm": 1.0925049544322916,
"learning_rate": 1.1881188118811881e-05,
"loss": 0.4899,
"step": 15
},
{
"epoch": 0.07920792079207921,
"grad_norm": 0.861721437180681,
"learning_rate": 1.2673267326732674e-05,
"loss": 0.4833,
"step": 16
},
{
"epoch": 0.08415841584158416,
"grad_norm": 0.5864729079716412,
"learning_rate": 1.3465346534653467e-05,
"loss": 0.4782,
"step": 17
},
{
"epoch": 0.0891089108910891,
"grad_norm": 0.5865390324576937,
"learning_rate": 1.4257425742574257e-05,
"loss": 0.4695,
"step": 18
},
{
"epoch": 0.09405940594059406,
"grad_norm": 0.6130092306626196,
"learning_rate": 1.504950495049505e-05,
"loss": 0.4674,
"step": 19
},
{
"epoch": 0.09900990099009901,
"grad_norm": 0.5021736430582571,
"learning_rate": 1.5841584158415843e-05,
"loss": 0.4613,
"step": 20
},
{
"epoch": 0.10396039603960396,
"grad_norm": 0.38770857771222433,
"learning_rate": 1.6633663366336635e-05,
"loss": 0.4518,
"step": 21
},
{
"epoch": 0.10891089108910891,
"grad_norm": 0.3801872343316169,
"learning_rate": 1.7425742574257428e-05,
"loss": 0.4461,
"step": 22
},
{
"epoch": 0.11386138613861387,
"grad_norm": 0.36919542588411713,
"learning_rate": 1.821782178217822e-05,
"loss": 0.4408,
"step": 23
},
{
"epoch": 0.1188118811881188,
"grad_norm": 0.37639723207731884,
"learning_rate": 1.900990099009901e-05,
"loss": 0.4449,
"step": 24
},
{
"epoch": 0.12376237623762376,
"grad_norm": 0.344848768853846,
"learning_rate": 1.9801980198019803e-05,
"loss": 0.4419,
"step": 25
},
{
"epoch": 0.12871287128712872,
"grad_norm": 0.28355527610389414,
"learning_rate": 2.0594059405940595e-05,
"loss": 0.4343,
"step": 26
},
{
"epoch": 0.13366336633663367,
"grad_norm": 0.2754529294047224,
"learning_rate": 2.1386138613861388e-05,
"loss": 0.4237,
"step": 27
},
{
"epoch": 0.13861386138613863,
"grad_norm": 0.2865711175520124,
"learning_rate": 2.217821782178218e-05,
"loss": 0.4263,
"step": 28
},
{
"epoch": 0.14356435643564355,
"grad_norm": 0.2508438147810079,
"learning_rate": 2.297029702970297e-05,
"loss": 0.4248,
"step": 29
},
{
"epoch": 0.1485148514851485,
"grad_norm": 0.24448250670982463,
"learning_rate": 2.3762376237623762e-05,
"loss": 0.4227,
"step": 30
},
{
"epoch": 0.15346534653465346,
"grad_norm": 0.2294048162138678,
"learning_rate": 2.4554455445544555e-05,
"loss": 0.4256,
"step": 31
},
{
"epoch": 0.15841584158415842,
"grad_norm": 0.209271866937176,
"learning_rate": 2.5346534653465348e-05,
"loss": 0.4151,
"step": 32
},
{
"epoch": 0.16336633663366337,
"grad_norm": 0.24176526780414578,
"learning_rate": 2.613861386138614e-05,
"loss": 0.4161,
"step": 33
},
{
"epoch": 0.16831683168316833,
"grad_norm": 0.21049518828920885,
"learning_rate": 2.6930693069306933e-05,
"loss": 0.406,
"step": 34
},
{
"epoch": 0.17326732673267325,
"grad_norm": 0.19342535956001514,
"learning_rate": 2.7722772277227722e-05,
"loss": 0.4126,
"step": 35
},
{
"epoch": 0.1782178217821782,
"grad_norm": 0.19033425678980173,
"learning_rate": 2.8514851485148515e-05,
"loss": 0.4126,
"step": 36
},
{
"epoch": 0.18316831683168316,
"grad_norm": 0.18234331276088547,
"learning_rate": 2.9306930693069308e-05,
"loss": 0.4105,
"step": 37
},
{
"epoch": 0.18811881188118812,
"grad_norm": 0.18097741327633646,
"learning_rate": 3.00990099009901e-05,
"loss": 0.4072,
"step": 38
},
{
"epoch": 0.19306930693069307,
"grad_norm": 0.17660769986639668,
"learning_rate": 3.0891089108910896e-05,
"loss": 0.4082,
"step": 39
},
{
"epoch": 0.19801980198019803,
"grad_norm": 0.16490613450315042,
"learning_rate": 3.1683168316831686e-05,
"loss": 0.4004,
"step": 40
},
{
"epoch": 0.20297029702970298,
"grad_norm": 0.17948074742599796,
"learning_rate": 3.247524752475248e-05,
"loss": 0.3991,
"step": 41
},
{
"epoch": 0.2079207920792079,
"grad_norm": 0.1694173422938168,
"learning_rate": 3.326732673267327e-05,
"loss": 0.39,
"step": 42
},
{
"epoch": 0.21287128712871287,
"grad_norm": 0.16195673711111672,
"learning_rate": 3.405940594059406e-05,
"loss": 0.3906,
"step": 43
},
{
"epoch": 0.21782178217821782,
"grad_norm": 0.18011327698717658,
"learning_rate": 3.4851485148514856e-05,
"loss": 0.3989,
"step": 44
},
{
"epoch": 0.22277227722772278,
"grad_norm": 0.16228918096461187,
"learning_rate": 3.5643564356435645e-05,
"loss": 0.3975,
"step": 45
},
{
"epoch": 0.22772277227722773,
"grad_norm": 0.1770949848045279,
"learning_rate": 3.643564356435644e-05,
"loss": 0.3926,
"step": 46
},
{
"epoch": 0.23267326732673269,
"grad_norm": 0.165950052864865,
"learning_rate": 3.722772277227723e-05,
"loss": 0.3836,
"step": 47
},
{
"epoch": 0.2376237623762376,
"grad_norm": 0.1687315416703815,
"learning_rate": 3.801980198019802e-05,
"loss": 0.3844,
"step": 48
},
{
"epoch": 0.24257425742574257,
"grad_norm": 0.16567211187560885,
"learning_rate": 3.8811881188118816e-05,
"loss": 0.3799,
"step": 49
},
{
"epoch": 0.24752475247524752,
"grad_norm": 0.16916743953716526,
"learning_rate": 3.9603960396039605e-05,
"loss": 0.387,
"step": 50
},
{
"epoch": 0.2524752475247525,
"grad_norm": 0.17623879763389697,
"learning_rate": 4.03960396039604e-05,
"loss": 0.3858,
"step": 51
},
{
"epoch": 0.25742574257425743,
"grad_norm": 0.17467961939214205,
"learning_rate": 4.118811881188119e-05,
"loss": 0.3807,
"step": 52
},
{
"epoch": 0.2623762376237624,
"grad_norm": 0.2944378690201917,
"learning_rate": 4.1980198019801987e-05,
"loss": 0.385,
"step": 53
},
{
"epoch": 0.26732673267326734,
"grad_norm": 0.5178473274848127,
"learning_rate": 4.2772277227722776e-05,
"loss": 0.381,
"step": 54
},
{
"epoch": 0.2722772277227723,
"grad_norm": 0.9941728281188755,
"learning_rate": 4.356435643564357e-05,
"loss": 0.3953,
"step": 55
},
{
"epoch": 0.27722772277227725,
"grad_norm": 0.9303019204930113,
"learning_rate": 4.435643564356436e-05,
"loss": 0.3966,
"step": 56
},
{
"epoch": 0.28217821782178215,
"grad_norm": 0.6881180716715236,
"learning_rate": 4.514851485148515e-05,
"loss": 0.3932,
"step": 57
},
{
"epoch": 0.2871287128712871,
"grad_norm": 0.764197790459852,
"learning_rate": 4.594059405940594e-05,
"loss": 0.3941,
"step": 58
},
{
"epoch": 0.29207920792079206,
"grad_norm": 1.647386456322686,
"learning_rate": 4.6732673267326736e-05,
"loss": 0.4021,
"step": 59
},
{
"epoch": 0.297029702970297,
"grad_norm": 0.7491760288830157,
"learning_rate": 4.7524752475247525e-05,
"loss": 0.3949,
"step": 60
},
{
"epoch": 0.30198019801980197,
"grad_norm": 0.8931216713440562,
"learning_rate": 4.831683168316832e-05,
"loss": 0.3942,
"step": 61
},
{
"epoch": 0.3069306930693069,
"grad_norm": 0.6557824032290299,
"learning_rate": 4.910891089108911e-05,
"loss": 0.3903,
"step": 62
},
{
"epoch": 0.3118811881188119,
"grad_norm": 0.613421803633619,
"learning_rate": 4.9900990099009906e-05,
"loss": 0.3816,
"step": 63
},
{
"epoch": 0.31683168316831684,
"grad_norm": 0.5547034347595892,
"learning_rate": 5.0693069306930696e-05,
"loss": 0.3867,
"step": 64
},
{
"epoch": 0.3217821782178218,
"grad_norm": 0.6397811968860269,
"learning_rate": 5.148514851485149e-05,
"loss": 0.3835,
"step": 65
},
{
"epoch": 0.32673267326732675,
"grad_norm": 0.48090924579294886,
"learning_rate": 5.227722772277228e-05,
"loss": 0.3814,
"step": 66
},
{
"epoch": 0.3316831683168317,
"grad_norm": 0.5627837815105704,
"learning_rate": 5.306930693069308e-05,
"loss": 0.382,
"step": 67
},
{
"epoch": 0.33663366336633666,
"grad_norm": 0.4296757500964271,
"learning_rate": 5.3861386138613866e-05,
"loss": 0.3771,
"step": 68
},
{
"epoch": 0.3415841584158416,
"grad_norm": 0.5124763237012445,
"learning_rate": 5.465346534653466e-05,
"loss": 0.3686,
"step": 69
},
{
"epoch": 0.3465346534653465,
"grad_norm": 0.44620454004036086,
"learning_rate": 5.5445544554455445e-05,
"loss": 0.3768,
"step": 70
},
{
"epoch": 0.35148514851485146,
"grad_norm": 0.40097174912710437,
"learning_rate": 5.623762376237624e-05,
"loss": 0.3732,
"step": 71
},
{
"epoch": 0.3564356435643564,
"grad_norm": 0.3605645786344511,
"learning_rate": 5.702970297029703e-05,
"loss": 0.3705,
"step": 72
},
{
"epoch": 0.3613861386138614,
"grad_norm": 0.41932072631744316,
"learning_rate": 5.7821782178217826e-05,
"loss": 0.3693,
"step": 73
},
{
"epoch": 0.36633663366336633,
"grad_norm": 0.4006777830339425,
"learning_rate": 5.8613861386138615e-05,
"loss": 0.379,
"step": 74
},
{
"epoch": 0.3712871287128713,
"grad_norm": 0.4465969738529599,
"learning_rate": 5.940594059405941e-05,
"loss": 0.3709,
"step": 75
},
{
"epoch": 0.37623762376237624,
"grad_norm": 0.4223804979204032,
"learning_rate": 6.01980198019802e-05,
"loss": 0.3675,
"step": 76
},
{
"epoch": 0.3811881188118812,
"grad_norm": 0.37123640032095456,
"learning_rate": 6.0990099009900997e-05,
"loss": 0.366,
"step": 77
},
{
"epoch": 0.38613861386138615,
"grad_norm": 0.3392928493351884,
"learning_rate": 6.178217821782179e-05,
"loss": 0.3716,
"step": 78
},
{
"epoch": 0.3910891089108911,
"grad_norm": 0.2889058251453323,
"learning_rate": 6.257425742574258e-05,
"loss": 0.3642,
"step": 79
},
{
"epoch": 0.39603960396039606,
"grad_norm": 0.3459814472315841,
"learning_rate": 6.336633663366337e-05,
"loss": 0.3691,
"step": 80
},
{
"epoch": 0.400990099009901,
"grad_norm": 0.38845940118983235,
"learning_rate": 6.415841584158417e-05,
"loss": 0.3711,
"step": 81
},
{
"epoch": 0.40594059405940597,
"grad_norm": 0.42532185159046343,
"learning_rate": 6.495049504950496e-05,
"loss": 0.3675,
"step": 82
},
{
"epoch": 0.41089108910891087,
"grad_norm": 0.5455105953636796,
"learning_rate": 6.574257425742575e-05,
"loss": 0.3633,
"step": 83
},
{
"epoch": 0.4158415841584158,
"grad_norm": 0.5990744796491794,
"learning_rate": 6.653465346534654e-05,
"loss": 0.3583,
"step": 84
},
{
"epoch": 0.4207920792079208,
"grad_norm": 0.49648400280044397,
"learning_rate": 6.732673267326732e-05,
"loss": 0.3664,
"step": 85
},
{
"epoch": 0.42574257425742573,
"grad_norm": 0.4307985110055904,
"learning_rate": 6.811881188118812e-05,
"loss": 0.3673,
"step": 86
},
{
"epoch": 0.4306930693069307,
"grad_norm": 0.5272138230588959,
"learning_rate": 6.891089108910892e-05,
"loss": 0.3653,
"step": 87
},
{
"epoch": 0.43564356435643564,
"grad_norm": 0.6931632267314781,
"learning_rate": 6.970297029702971e-05,
"loss": 0.3725,
"step": 88
},
{
"epoch": 0.4405940594059406,
"grad_norm": 0.9339192352616005,
"learning_rate": 7.04950495049505e-05,
"loss": 0.3718,
"step": 89
},
{
"epoch": 0.44554455445544555,
"grad_norm": 1.0174313097655168,
"learning_rate": 7.128712871287129e-05,
"loss": 0.3835,
"step": 90
},
{
"epoch": 0.4504950495049505,
"grad_norm": 0.6671231145210254,
"learning_rate": 7.207920792079209e-05,
"loss": 0.367,
"step": 91
},
{
"epoch": 0.45544554455445546,
"grad_norm": 0.6175710394910587,
"learning_rate": 7.287128712871288e-05,
"loss": 0.3705,
"step": 92
},
{
"epoch": 0.4603960396039604,
"grad_norm": 0.6273903843989881,
"learning_rate": 7.366336633663368e-05,
"loss": 0.3701,
"step": 93
},
{
"epoch": 0.46534653465346537,
"grad_norm": 0.5030734661096935,
"learning_rate": 7.445544554455446e-05,
"loss": 0.372,
"step": 94
},
{
"epoch": 0.47029702970297027,
"grad_norm": 0.610175464714336,
"learning_rate": 7.524752475247524e-05,
"loss": 0.3702,
"step": 95
},
{
"epoch": 0.4752475247524752,
"grad_norm": 0.45894009874038927,
"learning_rate": 7.603960396039604e-05,
"loss": 0.3695,
"step": 96
},
{
"epoch": 0.4801980198019802,
"grad_norm": 0.5986232687060531,
"learning_rate": 7.683168316831684e-05,
"loss": 0.3657,
"step": 97
},
{
"epoch": 0.48514851485148514,
"grad_norm": 0.46624796933237705,
"learning_rate": 7.762376237623763e-05,
"loss": 0.3614,
"step": 98
},
{
"epoch": 0.4900990099009901,
"grad_norm": 0.43351289175270075,
"learning_rate": 7.841584158415841e-05,
"loss": 0.3683,
"step": 99
},
{
"epoch": 0.49504950495049505,
"grad_norm": 0.4940464058502036,
"learning_rate": 7.920792079207921e-05,
"loss": 0.3621,
"step": 100
},
{
"epoch": 0.5,
"grad_norm": 0.4386737116693806,
"learning_rate": 8e-05,
"loss": 0.3611,
"step": 101
},
{
"epoch": 0.504950495049505,
"grad_norm": 0.33108309935071073,
"learning_rate": 7.999976110803523e-05,
"loss": 0.3571,
"step": 102
},
{
"epoch": 0.5099009900990099,
"grad_norm": 0.43229575461499764,
"learning_rate": 7.99990444349944e-05,
"loss": 0.3588,
"step": 103
},
{
"epoch": 0.5148514851485149,
"grad_norm": 0.39892418329866514,
"learning_rate": 7.999784998943787e-05,
"loss": 0.3621,
"step": 104
},
{
"epoch": 0.5198019801980198,
"grad_norm": 0.39765033553103313,
"learning_rate": 7.999617778563281e-05,
"loss": 0.36,
"step": 105
},
{
"epoch": 0.5247524752475248,
"grad_norm": 0.47174546802256195,
"learning_rate": 7.999402784355303e-05,
"loss": 0.3679,
"step": 106
},
{
"epoch": 0.5297029702970297,
"grad_norm": 0.5434295873242668,
"learning_rate": 7.999140018887873e-05,
"loss": 0.365,
"step": 107
},
{
"epoch": 0.5346534653465347,
"grad_norm": 0.46618471813920354,
"learning_rate": 7.998829485299617e-05,
"loss": 0.362,
"step": 108
},
{
"epoch": 0.5396039603960396,
"grad_norm": 0.32274696751184606,
"learning_rate": 7.998471187299734e-05,
"loss": 0.3573,
"step": 109
},
{
"epoch": 0.5445544554455446,
"grad_norm": 0.2980275103082691,
"learning_rate": 7.998065129167953e-05,
"loss": 0.3604,
"step": 110
},
{
"epoch": 0.5495049504950495,
"grad_norm": 0.3313425678437383,
"learning_rate": 7.997611315754472e-05,
"loss": 0.3559,
"step": 111
},
{
"epoch": 0.5544554455445545,
"grad_norm": 0.34876342801843374,
"learning_rate": 7.997109752479912e-05,
"loss": 0.3605,
"step": 112
},
{
"epoch": 0.5594059405940595,
"grad_norm": 0.27070943874947645,
"learning_rate": 7.996560445335241e-05,
"loss": 0.3578,
"step": 113
},
{
"epoch": 0.5643564356435643,
"grad_norm": 0.30784470099177474,
"learning_rate": 7.995963400881718e-05,
"loss": 0.3525,
"step": 114
},
{
"epoch": 0.5693069306930693,
"grad_norm": 0.3534221286392907,
"learning_rate": 7.995318626250795e-05,
"loss": 0.359,
"step": 115
},
{
"epoch": 0.5742574257425742,
"grad_norm": 0.3474350284931066,
"learning_rate": 7.994626129144047e-05,
"loss": 0.354,
"step": 116
},
{
"epoch": 0.5792079207920792,
"grad_norm": 0.3100297920829696,
"learning_rate": 7.993885917833073e-05,
"loss": 0.3505,
"step": 117
},
{
"epoch": 0.5841584158415841,
"grad_norm": 0.34227574104701863,
"learning_rate": 7.9930980011594e-05,
"loss": 0.357,
"step": 118
},
{
"epoch": 0.5891089108910891,
"grad_norm": 0.33712051818395816,
"learning_rate": 7.992262388534378e-05,
"loss": 0.3527,
"step": 119
},
{
"epoch": 0.594059405940594,
"grad_norm": 0.3626887525897252,
"learning_rate": 7.991379089939062e-05,
"loss": 0.3553,
"step": 120
},
{
"epoch": 0.599009900990099,
"grad_norm": 0.3293657653843729,
"learning_rate": 7.990448115924099e-05,
"loss": 0.3579,
"step": 121
},
{
"epoch": 0.6039603960396039,
"grad_norm": 0.4513819250936864,
"learning_rate": 7.989469477609601e-05,
"loss": 0.3536,
"step": 122
},
{
"epoch": 0.6089108910891089,
"grad_norm": 0.5867550752785323,
"learning_rate": 7.988443186685007e-05,
"loss": 0.3598,
"step": 123
},
{
"epoch": 0.6138613861386139,
"grad_norm": 0.6848907478025485,
"learning_rate": 7.987369255408953e-05,
"loss": 0.3557,
"step": 124
},
{
"epoch": 0.6188118811881188,
"grad_norm": 0.5804172340911018,
"learning_rate": 7.986247696609112e-05,
"loss": 0.3579,
"step": 125
},
{
"epoch": 0.6237623762376238,
"grad_norm": 0.35057273505497694,
"learning_rate": 7.985078523682058e-05,
"loss": 0.3476,
"step": 126
},
{
"epoch": 0.6287128712871287,
"grad_norm": 0.3497443854677124,
"learning_rate": 7.983861750593091e-05,
"loss": 0.3524,
"step": 127
},
{
"epoch": 0.6336633663366337,
"grad_norm": 0.4250139056185807,
"learning_rate": 7.982597391876076e-05,
"loss": 0.357,
"step": 128
},
{
"epoch": 0.6386138613861386,
"grad_norm": 0.31755749351038337,
"learning_rate": 7.981285462633268e-05,
"loss": 0.3513,
"step": 129
},
{
"epoch": 0.6435643564356436,
"grad_norm": 0.26676351402850523,
"learning_rate": 7.979925978535137e-05,
"loss": 0.3566,
"step": 130
},
{
"epoch": 0.6485148514851485,
"grad_norm": 0.32517374670467525,
"learning_rate": 7.978518955820173e-05,
"loss": 0.3548,
"step": 131
},
{
"epoch": 0.6534653465346535,
"grad_norm": 0.2890288825364607,
"learning_rate": 7.977064411294698e-05,
"loss": 0.3472,
"step": 132
},
{
"epoch": 0.6584158415841584,
"grad_norm": 0.22366770202582242,
"learning_rate": 7.975562362332663e-05,
"loss": 0.3516,
"step": 133
},
{
"epoch": 0.6633663366336634,
"grad_norm": 0.31844198115660743,
"learning_rate": 7.974012826875436e-05,
"loss": 0.3515,
"step": 134
},
{
"epoch": 0.6683168316831684,
"grad_norm": 0.3529525464109541,
"learning_rate": 7.972415823431599e-05,
"loss": 0.3525,
"step": 135
},
{
"epoch": 0.6732673267326733,
"grad_norm": 0.255066234755533,
"learning_rate": 7.970771371076715e-05,
"loss": 0.3498,
"step": 136
},
{
"epoch": 0.6782178217821783,
"grad_norm": 0.2593778740218104,
"learning_rate": 7.969079489453107e-05,
"loss": 0.3506,
"step": 137
},
{
"epoch": 0.6831683168316832,
"grad_norm": 0.35100330007534536,
"learning_rate": 7.96734019876962e-05,
"loss": 0.3507,
"step": 138
},
{
"epoch": 0.6881188118811881,
"grad_norm": 0.4006287005322337,
"learning_rate": 7.965553519801385e-05,
"loss": 0.3525,
"step": 139
},
{
"epoch": 0.693069306930693,
"grad_norm": 0.4271373026120573,
"learning_rate": 7.963719473889562e-05,
"loss": 0.3514,
"step": 140
},
{
"epoch": 0.698019801980198,
"grad_norm": 0.5081112805036884,
"learning_rate": 7.961838082941094e-05,
"loss": 0.3604,
"step": 141
},
{
"epoch": 0.7029702970297029,
"grad_norm": 0.5842521863062967,
"learning_rate": 7.959909369428441e-05,
"loss": 0.3515,
"step": 142
},
{
"epoch": 0.7079207920792079,
"grad_norm": 0.5503571352775998,
"learning_rate": 7.957933356389306e-05,
"loss": 0.3524,
"step": 143
},
{
"epoch": 0.7128712871287128,
"grad_norm": 0.42737622709783635,
"learning_rate": 7.955910067426377e-05,
"loss": 0.3497,
"step": 144
},
{
"epoch": 0.7178217821782178,
"grad_norm": 0.4811283193865698,
"learning_rate": 7.953839526707025e-05,
"loss": 0.3519,
"step": 145
},
{
"epoch": 0.7227722772277227,
"grad_norm": 0.5237842031287816,
"learning_rate": 7.951721758963028e-05,
"loss": 0.3543,
"step": 146
},
{
"epoch": 0.7277227722772277,
"grad_norm": 0.33835426109252503,
"learning_rate": 7.949556789490269e-05,
"loss": 0.3495,
"step": 147
},
{
"epoch": 0.7326732673267327,
"grad_norm": 0.4072933799188343,
"learning_rate": 7.94734464414844e-05,
"loss": 0.3525,
"step": 148
},
{
"epoch": 0.7376237623762376,
"grad_norm": 0.4420494788957065,
"learning_rate": 7.945085349360728e-05,
"loss": 0.3515,
"step": 149
},
{
"epoch": 0.7425742574257426,
"grad_norm": 0.3047954686653965,
"learning_rate": 7.942778932113501e-05,
"loss": 0.3526,
"step": 150
},
{
"epoch": 0.7475247524752475,
"grad_norm": 0.36720786547284384,
"learning_rate": 7.940425419955988e-05,
"loss": 0.3511,
"step": 151
},
{
"epoch": 0.7524752475247525,
"grad_norm": 0.2709662965586093,
"learning_rate": 7.938024840999944e-05,
"loss": 0.3464,
"step": 152
},
{
"epoch": 0.7574257425742574,
"grad_norm": 0.3067260723147892,
"learning_rate": 7.935577223919322e-05,
"loss": 0.3496,
"step": 153
},
{
"epoch": 0.7623762376237624,
"grad_norm": 0.33922244765994963,
"learning_rate": 7.933082597949925e-05,
"loss": 0.3444,
"step": 154
},
{
"epoch": 0.7673267326732673,
"grad_norm": 0.24825212387361578,
"learning_rate": 7.930540992889056e-05,
"loss": 0.3462,
"step": 155
},
{
"epoch": 0.7722772277227723,
"grad_norm": 0.28742334966555666,
"learning_rate": 7.927952439095167e-05,
"loss": 0.3415,
"step": 156
},
{
"epoch": 0.7772277227722773,
"grad_norm": 0.28105530967442244,
"learning_rate": 7.925316967487493e-05,
"loss": 0.3489,
"step": 157
},
{
"epoch": 0.7821782178217822,
"grad_norm": 0.2269815480722697,
"learning_rate": 7.922634609545685e-05,
"loss": 0.35,
"step": 158
},
{
"epoch": 0.7871287128712872,
"grad_norm": 0.3460207605487597,
"learning_rate": 7.919905397309429e-05,
"loss": 0.3454,
"step": 159
},
{
"epoch": 0.7920792079207921,
"grad_norm": 0.34695798696000907,
"learning_rate": 7.917129363378069e-05,
"loss": 0.3512,
"step": 160
},
{
"epoch": 0.7970297029702971,
"grad_norm": 0.3417847973040481,
"learning_rate": 7.914306540910216e-05,
"loss": 0.3491,
"step": 161
},
{
"epoch": 0.801980198019802,
"grad_norm": 0.42447506809816166,
"learning_rate": 7.91143696362335e-05,
"loss": 0.3458,
"step": 162
},
{
"epoch": 0.806930693069307,
"grad_norm": 0.4652667317777917,
"learning_rate": 7.908520665793419e-05,
"loss": 0.3471,
"step": 163
},
{
"epoch": 0.8118811881188119,
"grad_norm": 0.45670453442321735,
"learning_rate": 7.905557682254429e-05,
"loss": 0.35,
"step": 164
},
{
"epoch": 0.8168316831683168,
"grad_norm": 0.39419625102522626,
"learning_rate": 7.902548048398028e-05,
"loss": 0.3483,
"step": 165
},
{
"epoch": 0.8217821782178217,
"grad_norm": 0.24325629441917615,
"learning_rate": 7.89949180017308e-05,
"loss": 0.3405,
"step": 166
},
{
"epoch": 0.8267326732673267,
"grad_norm": 0.3145115138550842,
"learning_rate": 7.896388974085246e-05,
"loss": 0.3467,
"step": 167
},
{
"epoch": 0.8316831683168316,
"grad_norm": 0.41909642085754,
"learning_rate": 7.893239607196537e-05,
"loss": 0.3497,
"step": 168
},
{
"epoch": 0.8366336633663366,
"grad_norm": 0.33326007547308273,
"learning_rate": 7.890043737124872e-05,
"loss": 0.3468,
"step": 169
},
{
"epoch": 0.8415841584158416,
"grad_norm": 0.21373097133178287,
"learning_rate": 7.886801402043639e-05,
"loss": 0.347,
"step": 170
},
{
"epoch": 0.8465346534653465,
"grad_norm": 0.28669930148909606,
"learning_rate": 7.883512640681226e-05,
"loss": 0.3497,
"step": 171
},
{
"epoch": 0.8514851485148515,
"grad_norm": 0.34281409481153846,
"learning_rate": 7.880177492320565e-05,
"loss": 0.3476,
"step": 172
},
{
"epoch": 0.8564356435643564,
"grad_norm": 0.32878705062110175,
"learning_rate": 7.876795996798665e-05,
"loss": 0.3443,
"step": 173
},
{
"epoch": 0.8613861386138614,
"grad_norm": 0.2560890954076474,
"learning_rate": 7.873368194506131e-05,
"loss": 0.3449,
"step": 174
},
{
"epoch": 0.8663366336633663,
"grad_norm": 0.2659800616971379,
"learning_rate": 7.869894126386684e-05,
"loss": 0.3494,
"step": 175
},
{
"epoch": 0.8712871287128713,
"grad_norm": 0.2957706400184562,
"learning_rate": 7.866373833936673e-05,
"loss": 0.3427,
"step": 176
},
{
"epoch": 0.8762376237623762,
"grad_norm": 0.2877326856661544,
"learning_rate": 7.862807359204574e-05,
"loss": 0.3404,
"step": 177
},
{
"epoch": 0.8811881188118812,
"grad_norm": 0.2492809343857738,
"learning_rate": 7.859194744790498e-05,
"loss": 0.3423,
"step": 178
},
{
"epoch": 0.8861386138613861,
"grad_norm": 0.20823037683334517,
"learning_rate": 7.855536033845673e-05,
"loss": 0.3417,
"step": 179
},
{
"epoch": 0.8910891089108911,
"grad_norm": 0.2615486876434859,
"learning_rate": 7.851831270071929e-05,
"loss": 0.3447,
"step": 180
},
{
"epoch": 0.8960396039603961,
"grad_norm": 0.28681482554906357,
"learning_rate": 7.848080497721181e-05,
"loss": 0.3423,
"step": 181
},
{
"epoch": 0.900990099009901,
"grad_norm": 0.2979002962137349,
"learning_rate": 7.844283761594899e-05,
"loss": 0.3369,
"step": 182
},
{
"epoch": 0.905940594059406,
"grad_norm": 0.33231625569300743,
"learning_rate": 7.84044110704357e-05,
"loss": 0.3467,
"step": 183
},
{
"epoch": 0.9108910891089109,
"grad_norm": 0.3331695358581735,
"learning_rate": 7.83655257996616e-05,
"loss": 0.3417,
"step": 184
},
{
"epoch": 0.9158415841584159,
"grad_norm": 0.3411061202071129,
"learning_rate": 7.83261822680956e-05,
"loss": 0.3487,
"step": 185
},
{
"epoch": 0.9207920792079208,
"grad_norm": 0.30749202524041036,
"learning_rate": 7.828638094568041e-05,
"loss": 0.3406,
"step": 186
},
{
"epoch": 0.9257425742574258,
"grad_norm": 0.30511452956851215,
"learning_rate": 7.824612230782681e-05,
"loss": 0.3403,
"step": 187
},
{
"epoch": 0.9306930693069307,
"grad_norm": 0.33218486067673636,
"learning_rate": 7.820540683540808e-05,
"loss": 0.3388,
"step": 188
},
{
"epoch": 0.9356435643564357,
"grad_norm": 0.3213728709603462,
"learning_rate": 7.816423501475415e-05,
"loss": 0.3457,
"step": 189
},
{
"epoch": 0.9405940594059405,
"grad_norm": 0.3095238063829038,
"learning_rate": 7.812260733764591e-05,
"loss": 0.348,
"step": 190
},
{
"epoch": 0.9455445544554455,
"grad_norm": 0.35118241238757736,
"learning_rate": 7.80805243013092e-05,
"loss": 0.3467,
"step": 191
},
{
"epoch": 0.9504950495049505,
"grad_norm": 0.370467699158138,
"learning_rate": 7.803798640840901e-05,
"loss": 0.3441,
"step": 192
},
{
"epoch": 0.9554455445544554,
"grad_norm": 0.34511631244982893,
"learning_rate": 7.799499416704338e-05,
"loss": 0.3457,
"step": 193
},
{
"epoch": 0.9603960396039604,
"grad_norm": 0.3235511774722652,
"learning_rate": 7.795154809073735e-05,
"loss": 0.3408,
"step": 194
},
{
"epoch": 0.9653465346534653,
"grad_norm": 0.3539469729464918,
"learning_rate": 7.790764869843684e-05,
"loss": 0.3426,
"step": 195
},
{
"epoch": 0.9702970297029703,
"grad_norm": 0.35801210970598224,
"learning_rate": 7.786329651450248e-05,
"loss": 0.3462,
"step": 196
},
{
"epoch": 0.9752475247524752,
"grad_norm": 0.30433710890475135,
"learning_rate": 7.781849206870325e-05,
"loss": 0.3475,
"step": 197
},
{
"epoch": 0.9801980198019802,
"grad_norm": 0.34885762683733873,
"learning_rate": 7.77732358962103e-05,
"loss": 0.3408,
"step": 198
},
{
"epoch": 0.9851485148514851,
"grad_norm": 0.43270154125711485,
"learning_rate": 7.772752853759039e-05,
"loss": 0.3412,
"step": 199
},
{
"epoch": 0.9900990099009901,
"grad_norm": 0.5193353797335327,
"learning_rate": 7.768137053879957e-05,
"loss": 0.345,
"step": 200
},
{
"epoch": 0.995049504950495,
"grad_norm": 0.5848338133959268,
"learning_rate": 7.763476245117659e-05,
"loss": 0.3402,
"step": 201
},
{
"epoch": 1.0,
"grad_norm": 0.49189050142360197,
"learning_rate": 7.758770483143634e-05,
"loss": 0.3418,
"step": 202
},
{
"epoch": 1.004950495049505,
"grad_norm": 0.3965620050129241,
"learning_rate": 7.754019824166318e-05,
"loss": 0.3272,
"step": 203
},
{
"epoch": 1.00990099009901,
"grad_norm": 0.38729303869225656,
"learning_rate": 7.749224324930421e-05,
"loss": 0.3265,
"step": 204
},
{
"epoch": 1.0148514851485149,
"grad_norm": 0.46382028945219433,
"learning_rate": 7.744384042716258e-05,
"loss": 0.3259,
"step": 205
},
{
"epoch": 1.0198019801980198,
"grad_norm": 0.4961244468745795,
"learning_rate": 7.739499035339055e-05,
"loss": 0.3265,
"step": 206
},
{
"epoch": 1.0247524752475248,
"grad_norm": 0.39391833603810456,
"learning_rate": 7.734569361148262e-05,
"loss": 0.3243,
"step": 207
},
{
"epoch": 1.0297029702970297,
"grad_norm": 0.31996146294099276,
"learning_rate": 7.729595079026856e-05,
"loss": 0.3251,
"step": 208
},
{
"epoch": 1.0346534653465347,
"grad_norm": 0.26647643410303384,
"learning_rate": 7.724576248390639e-05,
"loss": 0.3223,
"step": 209
},
{
"epoch": 1.0396039603960396,
"grad_norm": 0.25657678139008755,
"learning_rate": 7.719512929187527e-05,
"loss": 0.3189,
"step": 210
},
{
"epoch": 1.0445544554455446,
"grad_norm": 0.3153458918560271,
"learning_rate": 7.714405181896831e-05,
"loss": 0.325,
"step": 211
},
{
"epoch": 1.0495049504950495,
"grad_norm": 0.3310266745371117,
"learning_rate": 7.709253067528545e-05,
"loss": 0.3258,
"step": 212
},
{
"epoch": 1.0544554455445545,
"grad_norm": 0.2595609505361843,
"learning_rate": 7.704056647622603e-05,
"loss": 0.3176,
"step": 213
},
{
"epoch": 1.0594059405940595,
"grad_norm": 0.22851822976210578,
"learning_rate": 7.698815984248152e-05,
"loss": 0.3171,
"step": 214
},
{
"epoch": 1.0643564356435644,
"grad_norm": 0.2628390466311596,
"learning_rate": 7.693531140002811e-05,
"loss": 0.3208,
"step": 215
},
{
"epoch": 1.0693069306930694,
"grad_norm": 0.22084804612132355,
"learning_rate": 7.688202178011921e-05,
"loss": 0.3246,
"step": 216
},
{
"epoch": 1.0742574257425743,
"grad_norm": 0.20572617251073919,
"learning_rate": 7.682829161927794e-05,
"loss": 0.3265,
"step": 217
},
{
"epoch": 1.0792079207920793,
"grad_norm": 0.26152370679155457,
"learning_rate": 7.677412155928946e-05,
"loss": 0.3244,
"step": 218
},
{
"epoch": 1.0841584158415842,
"grad_norm": 0.23198561773413004,
"learning_rate": 7.671951224719339e-05,
"loss": 0.3221,
"step": 219
},
{
"epoch": 1.0891089108910892,
"grad_norm": 0.23885371632512684,
"learning_rate": 7.666446433527601e-05,
"loss": 0.3228,
"step": 220
},
{
"epoch": 1.0940594059405941,
"grad_norm": 0.281941194601826,
"learning_rate": 7.660897848106251e-05,
"loss": 0.3183,
"step": 221
},
{
"epoch": 1.099009900990099,
"grad_norm": 0.35643488194797623,
"learning_rate": 7.655305534730916e-05,
"loss": 0.3223,
"step": 222
},
{
"epoch": 1.103960396039604,
"grad_norm": 0.4175750224727906,
"learning_rate": 7.649669560199528e-05,
"loss": 0.3226,
"step": 223
},
{
"epoch": 1.108910891089109,
"grad_norm": 0.4804436533424334,
"learning_rate": 7.643989991831541e-05,
"loss": 0.3261,
"step": 224
},
{
"epoch": 1.113861386138614,
"grad_norm": 0.5026289432175224,
"learning_rate": 7.638266897467117e-05,
"loss": 0.3239,
"step": 225
},
{
"epoch": 1.118811881188119,
"grad_norm": 0.45160715992891726,
"learning_rate": 7.632500345466318e-05,
"loss": 0.3255,
"step": 226
},
{
"epoch": 1.1237623762376239,
"grad_norm": 0.323894832081213,
"learning_rate": 7.62669040470829e-05,
"loss": 0.3235,
"step": 227
},
{
"epoch": 1.1287128712871288,
"grad_norm": 0.23111833292364858,
"learning_rate": 7.620837144590444e-05,
"loss": 0.3261,
"step": 228
},
{
"epoch": 1.1336633663366338,
"grad_norm": 0.35672796574835897,
"learning_rate": 7.61494063502762e-05,
"loss": 0.3244,
"step": 229
},
{
"epoch": 1.1386138613861387,
"grad_norm": 0.38953557445459835,
"learning_rate": 7.609000946451255e-05,
"loss": 0.3275,
"step": 230
},
{
"epoch": 1.1435643564356435,
"grad_norm": 0.2456196439272548,
"learning_rate": 7.603018149808542e-05,
"loss": 0.3242,
"step": 231
},
{
"epoch": 1.1485148514851484,
"grad_norm": 0.2627252229709477,
"learning_rate": 7.596992316561583e-05,
"loss": 0.3263,
"step": 232
},
{
"epoch": 1.1534653465346534,
"grad_norm": 0.3880693283953946,
"learning_rate": 7.590923518686537e-05,
"loss": 0.3227,
"step": 233
},
{
"epoch": 1.1584158415841583,
"grad_norm": 0.36122965194857654,
"learning_rate": 7.584811828672755e-05,
"loss": 0.324,
"step": 234
},
{
"epoch": 1.1633663366336633,
"grad_norm": 0.24771560907735035,
"learning_rate": 7.578657319521918e-05,
"loss": 0.3272,
"step": 235
},
{
"epoch": 1.1683168316831682,
"grad_norm": 0.3549181872881683,
"learning_rate": 7.572460064747167e-05,
"loss": 0.3252,
"step": 236
},
{
"epoch": 1.1732673267326732,
"grad_norm": 0.3701278541218383,
"learning_rate": 7.56622013837222e-05,
"loss": 0.322,
"step": 237
},
{
"epoch": 1.1782178217821782,
"grad_norm": 0.2291300819304423,
"learning_rate": 7.55993761493049e-05,
"loss": 0.3268,
"step": 238
},
{
"epoch": 1.183168316831683,
"grad_norm": 0.30325809144675336,
"learning_rate": 7.553612569464197e-05,
"loss": 0.3239,
"step": 239
},
{
"epoch": 1.188118811881188,
"grad_norm": 0.34766919596131945,
"learning_rate": 7.547245077523466e-05,
"loss": 0.3269,
"step": 240
},
{
"epoch": 1.193069306930693,
"grad_norm": 0.2802166057218898,
"learning_rate": 7.540835215165431e-05,
"loss": 0.3237,
"step": 241
},
{
"epoch": 1.198019801980198,
"grad_norm": 0.20761909947515408,
"learning_rate": 7.534383058953321e-05,
"loss": 0.3233,
"step": 242
},
{
"epoch": 1.202970297029703,
"grad_norm": 0.21713786543933997,
"learning_rate": 7.527888685955551e-05,
"loss": 0.3266,
"step": 243
},
{
"epoch": 1.2079207920792079,
"grad_norm": 0.28332778351110616,
"learning_rate": 7.5213521737448e-05,
"loss": 0.3234,
"step": 244
},
{
"epoch": 1.2128712871287128,
"grad_norm": 0.25218609771205475,
"learning_rate": 7.514773600397076e-05,
"loss": 0.3225,
"step": 245
},
{
"epoch": 1.2178217821782178,
"grad_norm": 0.23493196928481838,
"learning_rate": 7.508153044490796e-05,
"loss": 0.3244,
"step": 246
},
{
"epoch": 1.2227722772277227,
"grad_norm": 0.24313872207402484,
"learning_rate": 7.50149058510584e-05,
"loss": 0.322,
"step": 247
},
{
"epoch": 1.2277227722772277,
"grad_norm": 0.26071446304328083,
"learning_rate": 7.494786301822611e-05,
"loss": 0.325,
"step": 248
},
{
"epoch": 1.2326732673267327,
"grad_norm": 0.2713861363334041,
"learning_rate": 7.488040274721077e-05,
"loss": 0.3229,
"step": 249
},
{
"epoch": 1.2376237623762376,
"grad_norm": 0.2815273546029921,
"learning_rate": 7.481252584379822e-05,
"loss": 0.3229,
"step": 250
},
{
"epoch": 1.2425742574257426,
"grad_norm": 0.31187646080931386,
"learning_rate": 7.47442331187508e-05,
"loss": 0.3207,
"step": 251
},
{
"epoch": 1.2475247524752475,
"grad_norm": 0.2650133379890827,
"learning_rate": 7.467552538779768e-05,
"loss": 0.32,
"step": 252
},
{
"epoch": 1.2524752475247525,
"grad_norm": 0.17403472770593334,
"learning_rate": 7.460640347162508e-05,
"loss": 0.3238,
"step": 253
},
{
"epoch": 1.2574257425742574,
"grad_norm": 0.20063032190918698,
"learning_rate": 7.453686819586655e-05,
"loss": 0.329,
"step": 254
},
{
"epoch": 1.2623762376237624,
"grad_norm": 0.27733239216940847,
"learning_rate": 7.4466920391093e-05,
"loss": 0.3224,
"step": 255
},
{
"epoch": 1.2673267326732673,
"grad_norm": 0.30391912965378015,
"learning_rate": 7.439656089280286e-05,
"loss": 0.3187,
"step": 256
},
{
"epoch": 1.2722772277227723,
"grad_norm": 0.258142425865035,
"learning_rate": 7.432579054141208e-05,
"loss": 0.3213,
"step": 257
},
{
"epoch": 1.2772277227722773,
"grad_norm": 0.22151301347497324,
"learning_rate": 7.425461018224406e-05,
"loss": 0.3201,
"step": 258
},
{
"epoch": 1.2821782178217822,
"grad_norm": 0.23419027691681132,
"learning_rate": 7.418302066551959e-05,
"loss": 0.3267,
"step": 259
},
{
"epoch": 1.2871287128712872,
"grad_norm": 0.26013678205953394,
"learning_rate": 7.411102284634672e-05,
"loss": 0.3259,
"step": 260
},
{
"epoch": 1.2920792079207921,
"grad_norm": 0.2842311667544899,
"learning_rate": 7.403861758471043e-05,
"loss": 0.3187,
"step": 261
},
{
"epoch": 1.297029702970297,
"grad_norm": 0.31228351173236724,
"learning_rate": 7.396580574546251e-05,
"loss": 0.3222,
"step": 262
},
{
"epoch": 1.301980198019802,
"grad_norm": 0.3486572694201658,
"learning_rate": 7.38925881983111e-05,
"loss": 0.3275,
"step": 263
},
{
"epoch": 1.306930693069307,
"grad_norm": 0.3674794148030631,
"learning_rate": 7.381896581781042e-05,
"loss": 0.3215,
"step": 264
},
{
"epoch": 1.311881188118812,
"grad_norm": 0.3160950184823215,
"learning_rate": 7.37449394833502e-05,
"loss": 0.3235,
"step": 265
},
{
"epoch": 1.316831683168317,
"grad_norm": 0.22994487888930124,
"learning_rate": 7.367051007914527e-05,
"loss": 0.3222,
"step": 266
},
{
"epoch": 1.3217821782178218,
"grad_norm": 0.1938850721739046,
"learning_rate": 7.359567849422496e-05,
"loss": 0.324,
"step": 267
},
{
"epoch": 1.3267326732673268,
"grad_norm": 0.19676881488742676,
"learning_rate": 7.352044562242248e-05,
"loss": 0.3259,
"step": 268
},
{
"epoch": 1.3316831683168318,
"grad_norm": 0.2783673393968265,
"learning_rate": 7.344481236236428e-05,
"loss": 0.3201,
"step": 269
},
{
"epoch": 1.3366336633663367,
"grad_norm": 0.33404846697755264,
"learning_rate": 7.336877961745926e-05,
"loss": 0.3172,
"step": 270
},
{
"epoch": 1.3415841584158417,
"grad_norm": 0.32850987411997973,
"learning_rate": 7.329234829588798e-05,
"loss": 0.3201,
"step": 271
},
{
"epoch": 1.3465346534653464,
"grad_norm": 0.2580864877181822,
"learning_rate": 7.321551931059191e-05,
"loss": 0.3257,
"step": 272
},
{
"epoch": 1.3514851485148514,
"grad_norm": 0.15332399404847796,
"learning_rate": 7.313829357926238e-05,
"loss": 0.3261,
"step": 273
},
{
"epoch": 1.3564356435643563,
"grad_norm": 0.20248337274096065,
"learning_rate": 7.306067202432976e-05,
"loss": 0.3224,
"step": 274
},
{
"epoch": 1.3613861386138613,
"grad_norm": 0.30304088082346603,
"learning_rate": 7.29826555729523e-05,
"loss": 0.3255,
"step": 275
},
{
"epoch": 1.3663366336633662,
"grad_norm": 0.2931366381999886,
"learning_rate": 7.290424515700519e-05,
"loss": 0.323,
"step": 276
},
{
"epoch": 1.3712871287128712,
"grad_norm": 0.22992458525253304,
"learning_rate": 7.282544171306933e-05,
"loss": 0.3267,
"step": 277
},
{
"epoch": 1.3762376237623761,
"grad_norm": 0.18041230312676543,
"learning_rate": 7.274624618242022e-05,
"loss": 0.3227,
"step": 278
},
{
"epoch": 1.381188118811881,
"grad_norm": 0.204075595457074,
"learning_rate": 7.266665951101664e-05,
"loss": 0.3241,
"step": 279
},
{
"epoch": 1.386138613861386,
"grad_norm": 0.2695584195519876,
"learning_rate": 7.258668264948941e-05,
"loss": 0.3197,
"step": 280
},
{
"epoch": 1.391089108910891,
"grad_norm": 0.2587498125470864,
"learning_rate": 7.250631655313001e-05,
"loss": 0.3229,
"step": 281
},
{
"epoch": 1.396039603960396,
"grad_norm": 0.2498126566179802,
"learning_rate": 7.242556218187919e-05,
"loss": 0.3235,
"step": 282
},
{
"epoch": 1.400990099009901,
"grad_norm": 0.2769036801401161,
"learning_rate": 7.234442050031543e-05,
"loss": 0.3222,
"step": 283
},
{
"epoch": 1.4059405940594059,
"grad_norm": 0.2951697321170327,
"learning_rate": 7.226289247764354e-05,
"loss": 0.3193,
"step": 284
},
{
"epoch": 1.4108910891089108,
"grad_norm": 0.3140073279063781,
"learning_rate": 7.2180979087683e-05,
"loss": 0.3231,
"step": 285
},
{
"epoch": 1.4158415841584158,
"grad_norm": 0.3294640016367039,
"learning_rate": 7.209868130885634e-05,
"loss": 0.3214,
"step": 286
},
{
"epoch": 1.4207920792079207,
"grad_norm": 0.36565594630893544,
"learning_rate": 7.201600012417745e-05,
"loss": 0.3271,
"step": 287
},
{
"epoch": 1.4257425742574257,
"grad_norm": 0.35379832804519007,
"learning_rate": 7.193293652123989e-05,
"loss": 0.3205,
"step": 288
},
{
"epoch": 1.4306930693069306,
"grad_norm": 0.3223921579634103,
"learning_rate": 7.1849491492205e-05,
"loss": 0.3211,
"step": 289
},
{
"epoch": 1.4356435643564356,
"grad_norm": 0.35467670180093575,
"learning_rate": 7.176566603379015e-05,
"loss": 0.3221,
"step": 290
},
{
"epoch": 1.4405940594059405,
"grad_norm": 0.37798578537913297,
"learning_rate": 7.168146114725673e-05,
"loss": 0.3198,
"step": 291
},
{
"epoch": 1.4455445544554455,
"grad_norm": 0.34966699936779133,
"learning_rate": 7.159687783839832e-05,
"loss": 0.3227,
"step": 292
},
{
"epoch": 1.4504950495049505,
"grad_norm": 0.24726341182024242,
"learning_rate": 7.151191711752854e-05,
"loss": 0.3189,
"step": 293
},
{
"epoch": 1.4554455445544554,
"grad_norm": 0.25949588464207435,
"learning_rate": 7.142657999946906e-05,
"loss": 0.3222,
"step": 294
},
{
"epoch": 1.4603960396039604,
"grad_norm": 0.2855037596259817,
"learning_rate": 7.134086750353747e-05,
"loss": 0.3217,
"step": 295
},
{
"epoch": 1.4653465346534653,
"grad_norm": 0.22186831176071517,
"learning_rate": 7.125478065353512e-05,
"loss": 0.3193,
"step": 296
},
{
"epoch": 1.4702970297029703,
"grad_norm": 0.24569476568558268,
"learning_rate": 7.116832047773484e-05,
"loss": 0.3233,
"step": 297
},
{
"epoch": 1.4752475247524752,
"grad_norm": 0.24947664055115984,
"learning_rate": 7.108148800886869e-05,
"loss": 0.321,
"step": 298
},
{
"epoch": 1.4801980198019802,
"grad_norm": 0.21593927166838858,
"learning_rate": 7.09942842841156e-05,
"loss": 0.3177,
"step": 299
},
{
"epoch": 1.4851485148514851,
"grad_norm": 0.19917076539743275,
"learning_rate": 7.090671034508905e-05,
"loss": 0.3201,
"step": 300
},
{
"epoch": 1.49009900990099,
"grad_norm": 0.20214350623028918,
"learning_rate": 7.081876723782457e-05,
"loss": 0.3222,
"step": 301
},
{
"epoch": 1.495049504950495,
"grad_norm": 0.2385491320035371,
"learning_rate": 7.073045601276723e-05,
"loss": 0.3192,
"step": 302
},
{
"epoch": 1.5,
"grad_norm": 0.23154350378634414,
"learning_rate": 7.064177772475912e-05,
"loss": 0.3196,
"step": 303
},
{
"epoch": 1.504950495049505,
"grad_norm": 0.22628705886217929,
"learning_rate": 7.05527334330268e-05,
"loss": 0.3225,
"step": 304
},
{
"epoch": 1.50990099009901,
"grad_norm": 0.19427424546791436,
"learning_rate": 7.046332420116852e-05,
"loss": 0.3181,
"step": 305
},
{
"epoch": 1.5148514851485149,
"grad_norm": 0.18634215600114334,
"learning_rate": 7.037355109714165e-05,
"loss": 0.3184,
"step": 306
},
{
"epoch": 1.5198019801980198,
"grad_norm": 0.19920642103648958,
"learning_rate": 7.028341519324985e-05,
"loss": 0.317,
"step": 307
},
{
"epoch": 1.5247524752475248,
"grad_norm": 0.2684657108142712,
"learning_rate": 7.019291756613029e-05,
"loss": 0.3296,
"step": 308
},
{
"epoch": 1.5297029702970297,
"grad_norm": 0.2849696623219465,
"learning_rate": 7.010205929674075e-05,
"loss": 0.3202,
"step": 309
},
{
"epoch": 1.5346534653465347,
"grad_norm": 0.21705204588552374,
"learning_rate": 7.001084147034676e-05,
"loss": 0.319,
"step": 310
},
{
"epoch": 1.5396039603960396,
"grad_norm": 0.22560008026003084,
"learning_rate": 6.99192651765086e-05,
"loss": 0.3249,
"step": 311
},
{
"epoch": 1.5445544554455446,
"grad_norm": 0.26614078312578027,
"learning_rate": 6.982733150906833e-05,
"loss": 0.3212,
"step": 312
},
{
"epoch": 1.5495049504950495,
"grad_norm": 0.28899787142239786,
"learning_rate": 6.973504156613666e-05,
"loss": 0.3176,
"step": 313
},
{
"epoch": 1.5544554455445545,
"grad_norm": 0.2800623429447802,
"learning_rate": 6.964239645007989e-05,
"loss": 0.3197,
"step": 314
},
{
"epoch": 1.5594059405940595,
"grad_norm": 0.249656934863319,
"learning_rate": 6.954939726750667e-05,
"loss": 0.3214,
"step": 315
},
{
"epoch": 1.5643564356435644,
"grad_norm": 0.23045070686947017,
"learning_rate": 6.945604512925493e-05,
"loss": 0.3217,
"step": 316
},
{
"epoch": 1.5693069306930694,
"grad_norm": 0.22566199504874904,
"learning_rate": 6.936234115037842e-05,
"loss": 0.3239,
"step": 317
},
{
"epoch": 1.5742574257425743,
"grad_norm": 0.2262227158844052,
"learning_rate": 6.926828645013353e-05,
"loss": 0.3198,
"step": 318
},
{
"epoch": 1.5792079207920793,
"grad_norm": 0.1960066759664643,
"learning_rate": 6.917388215196585e-05,
"loss": 0.3222,
"step": 319
},
{
"epoch": 1.5841584158415842,
"grad_norm": 0.22269151083586428,
"learning_rate": 6.907912938349682e-05,
"loss": 0.3157,
"step": 320
},
{
"epoch": 1.5891089108910892,
"grad_norm": 0.21113188276021733,
"learning_rate": 6.898402927651019e-05,
"loss": 0.3175,
"step": 321
},
{
"epoch": 1.5940594059405941,
"grad_norm": 0.2349933871989314,
"learning_rate": 6.88885829669385e-05,
"loss": 0.3175,
"step": 322
},
{
"epoch": 1.599009900990099,
"grad_norm": 0.24334828788369725,
"learning_rate": 6.879279159484961e-05,
"loss": 0.3207,
"step": 323
},
{
"epoch": 1.603960396039604,
"grad_norm": 0.2375605067454906,
"learning_rate": 6.869665630443295e-05,
"loss": 0.3231,
"step": 324
},
{
"epoch": 1.608910891089109,
"grad_norm": 0.23722656434346312,
"learning_rate": 6.860017824398595e-05,
"loss": 0.3192,
"step": 325
},
{
"epoch": 1.613861386138614,
"grad_norm": 0.23914897802445803,
"learning_rate": 6.85033585659003e-05,
"loss": 0.3165,
"step": 326
},
{
"epoch": 1.618811881188119,
"grad_norm": 0.25680399156979883,
"learning_rate": 6.84061984266481e-05,
"loss": 0.3233,
"step": 327
},
{
"epoch": 1.6237623762376239,
"grad_norm": 0.29013994357664635,
"learning_rate": 6.830869898676822e-05,
"loss": 0.3184,
"step": 328
},
{
"epoch": 1.6287128712871288,
"grad_norm": 0.2596053886368674,
"learning_rate": 6.82108614108523e-05,
"loss": 0.315,
"step": 329
},
{
"epoch": 1.6336633663366338,
"grad_norm": 0.24593548087470338,
"learning_rate": 6.811268686753086e-05,
"loss": 0.3188,
"step": 330
},
{
"epoch": 1.6386138613861387,
"grad_norm": 0.2333605325483033,
"learning_rate": 6.801417652945939e-05,
"loss": 0.3233,
"step": 331
},
{
"epoch": 1.6435643564356437,
"grad_norm": 0.22975229979338618,
"learning_rate": 6.79153315733043e-05,
"loss": 0.3193,
"step": 332
},
{
"epoch": 1.6485148514851486,
"grad_norm": 0.22562916248834147,
"learning_rate": 6.781615317972886e-05,
"loss": 0.3195,
"step": 333
},
{
"epoch": 1.6534653465346536,
"grad_norm": 0.1881999956764054,
"learning_rate": 6.771664253337916e-05,
"loss": 0.3161,
"step": 334
},
{
"epoch": 1.6584158415841586,
"grad_norm": 0.17957554796661543,
"learning_rate": 6.761680082286988e-05,
"loss": 0.3146,
"step": 335
},
{
"epoch": 1.6633663366336635,
"grad_norm": 0.24244282127286865,
"learning_rate": 6.751662924077015e-05,
"loss": 0.3185,
"step": 336
},
{
"epoch": 1.6683168316831685,
"grad_norm": 0.27176749547357265,
"learning_rate": 6.741612898358924e-05,
"loss": 0.325,
"step": 337
},
{
"epoch": 1.6732673267326734,
"grad_norm": 0.23705062747162786,
"learning_rate": 6.731530125176237e-05,
"loss": 0.3172,
"step": 338
},
{
"epoch": 1.6782178217821784,
"grad_norm": 0.18068260445114523,
"learning_rate": 6.721414724963631e-05,
"loss": 0.317,
"step": 339
},
{
"epoch": 1.6831683168316833,
"grad_norm": 0.18686064135931604,
"learning_rate": 6.711266818545494e-05,
"loss": 0.323,
"step": 340
},
{
"epoch": 1.688118811881188,
"grad_norm": 0.2433556257913613,
"learning_rate": 6.701086527134491e-05,
"loss": 0.3197,
"step": 341
},
{
"epoch": 1.693069306930693,
"grad_norm": 0.237609603882351,
"learning_rate": 6.690873972330116e-05,
"loss": 0.3207,
"step": 342
},
{
"epoch": 1.698019801980198,
"grad_norm": 0.19748333286805253,
"learning_rate": 6.68062927611723e-05,
"loss": 0.316,
"step": 343
},
{
"epoch": 1.702970297029703,
"grad_norm": 0.19866434487926096,
"learning_rate": 6.670352560864615e-05,
"loss": 0.3186,
"step": 344
},
{
"epoch": 1.7079207920792079,
"grad_norm": 0.22486559077228344,
"learning_rate": 6.660043949323505e-05,
"loss": 0.3204,
"step": 345
},
{
"epoch": 1.7128712871287128,
"grad_norm": 0.2416905102052713,
"learning_rate": 6.649703564626125e-05,
"loss": 0.3164,
"step": 346
},
{
"epoch": 1.7178217821782178,
"grad_norm": 0.20746633246421747,
"learning_rate": 6.639331530284214e-05,
"loss": 0.324,
"step": 347
},
{
"epoch": 1.7227722772277227,
"grad_norm": 0.18552595723695436,
"learning_rate": 6.628927970187557e-05,
"loss": 0.3227,
"step": 348
},
{
"epoch": 1.7277227722772277,
"grad_norm": 0.19862972904635046,
"learning_rate": 6.618493008602496e-05,
"loss": 0.3176,
"step": 349
},
{
"epoch": 1.7326732673267327,
"grad_norm": 0.22644562653013095,
"learning_rate": 6.608026770170459e-05,
"loss": 0.3127,
"step": 350
},
{
"epoch": 1.7376237623762376,
"grad_norm": 0.23956197789130662,
"learning_rate": 6.597529379906455e-05,
"loss": 0.3195,
"step": 351
},
{
"epoch": 1.7425742574257426,
"grad_norm": 0.20822303615309365,
"learning_rate": 6.587000963197598e-05,
"loss": 0.3161,
"step": 352
},
{
"epoch": 1.7475247524752475,
"grad_norm": 0.1968015427618515,
"learning_rate": 6.576441645801592e-05,
"loss": 0.3198,
"step": 353
},
{
"epoch": 1.7524752475247525,
"grad_norm": 0.21268509183684337,
"learning_rate": 6.565851553845242e-05,
"loss": 0.3187,
"step": 354
},
{
"epoch": 1.7574257425742574,
"grad_norm": 0.23160018089210926,
"learning_rate": 6.555230813822942e-05,
"loss": 0.3174,
"step": 355
},
{
"epoch": 1.7623762376237624,
"grad_norm": 0.22635391545597686,
"learning_rate": 6.544579552595165e-05,
"loss": 0.3182,
"step": 356
},
{
"epoch": 1.7673267326732673,
"grad_norm": 0.19581408322193755,
"learning_rate": 6.533897897386946e-05,
"loss": 0.319,
"step": 357
},
{
"epoch": 1.7722772277227723,
"grad_norm": 0.1863068382565854,
"learning_rate": 6.523185975786366e-05,
"loss": 0.3206,
"step": 358
},
{
"epoch": 1.7772277227722773,
"grad_norm": 0.20289821307826553,
"learning_rate": 6.512443915743024e-05,
"loss": 0.322,
"step": 359
},
{
"epoch": 1.7821782178217822,
"grad_norm": 0.22406897792167674,
"learning_rate": 6.501671845566512e-05,
"loss": 0.3251,
"step": 360
},
{
"epoch": 1.7871287128712872,
"grad_norm": 0.22046628764087864,
"learning_rate": 6.49086989392488e-05,
"loss": 0.3204,
"step": 361
},
{
"epoch": 1.7920792079207921,
"grad_norm": 0.21486546344708518,
"learning_rate": 6.480038189843101e-05,
"loss": 0.3227,
"step": 362
},
{
"epoch": 1.797029702970297,
"grad_norm": 0.20057169420220247,
"learning_rate": 6.469176862701529e-05,
"loss": 0.3181,
"step": 363
},
{
"epoch": 1.801980198019802,
"grad_norm": 0.1983948377595345,
"learning_rate": 6.458286042234352e-05,
"loss": 0.3177,
"step": 364
},
{
"epoch": 1.806930693069307,
"grad_norm": 0.1825724140696818,
"learning_rate": 6.447365858528046e-05,
"loss": 0.3144,
"step": 365
},
{
"epoch": 1.811881188118812,
"grad_norm": 0.2045402810504852,
"learning_rate": 6.436416442019817e-05,
"loss": 0.3183,
"step": 366
},
{
"epoch": 1.8168316831683167,
"grad_norm": 0.23015666963702616,
"learning_rate": 6.425437923496045e-05,
"loss": 0.3195,
"step": 367
},
{
"epoch": 1.8217821782178216,
"grad_norm": 0.216666025851071,
"learning_rate": 6.414430434090725e-05,
"loss": 0.3115,
"step": 368
},
{
"epoch": 1.8267326732673266,
"grad_norm": 0.15099483842056746,
"learning_rate": 6.403394105283897e-05,
"loss": 0.3123,
"step": 369
},
{
"epoch": 1.8316831683168315,
"grad_norm": 0.16186595047832233,
"learning_rate": 6.392329068900072e-05,
"loss": 0.3182,
"step": 370
},
{
"epoch": 1.8366336633663365,
"grad_norm": 0.17482024609831948,
"learning_rate": 6.381235457106664e-05,
"loss": 0.3185,
"step": 371
},
{
"epoch": 1.8415841584158414,
"grad_norm": 0.14165035612092844,
"learning_rate": 6.370113402412412e-05,
"loss": 0.3145,
"step": 372
},
{
"epoch": 1.8465346534653464,
"grad_norm": 0.1539766406897144,
"learning_rate": 6.358963037665787e-05,
"loss": 0.3175,
"step": 373
},
{
"epoch": 1.8514851485148514,
"grad_norm": 0.1430702648589554,
"learning_rate": 6.347784496053416e-05,
"loss": 0.3159,
"step": 374
},
{
"epoch": 1.8564356435643563,
"grad_norm": 0.1728534526806878,
"learning_rate": 6.336577911098493e-05,
"loss": 0.3138,
"step": 375
},
{
"epoch": 1.8613861386138613,
"grad_norm": 0.1631825571245601,
"learning_rate": 6.325343416659166e-05,
"loss": 0.3185,
"step": 376
},
{
"epoch": 1.8663366336633662,
"grad_norm": 0.16660564436193692,
"learning_rate": 6.314081146926964e-05,
"loss": 0.3164,
"step": 377
},
{
"epoch": 1.8712871287128712,
"grad_norm": 0.15419071429371625,
"learning_rate": 6.302791236425169e-05,
"loss": 0.3139,
"step": 378
},
{
"epoch": 1.8762376237623761,
"grad_norm": 0.14922406198716454,
"learning_rate": 6.291473820007227e-05,
"loss": 0.3185,
"step": 379
},
{
"epoch": 1.881188118811881,
"grad_norm": 0.13257966307723432,
"learning_rate": 6.280129032855132e-05,
"loss": 0.3236,
"step": 380
},
{
"epoch": 1.886138613861386,
"grad_norm": 0.16559043781736188,
"learning_rate": 6.268757010477806e-05,
"loss": 0.3174,
"step": 381
},
{
"epoch": 1.891089108910891,
"grad_norm": 0.1901919649105337,
"learning_rate": 6.257357888709492e-05,
"loss": 0.3175,
"step": 382
},
{
"epoch": 1.896039603960396,
"grad_norm": 0.20040247164651906,
"learning_rate": 6.245931803708116e-05,
"loss": 0.3177,
"step": 383
},
{
"epoch": 1.900990099009901,
"grad_norm": 0.20936214961639668,
"learning_rate": 6.234478891953674e-05,
"loss": 0.324,
"step": 384
},
{
"epoch": 1.9059405940594059,
"grad_norm": 0.2524923054319829,
"learning_rate": 6.222999290246595e-05,
"loss": 0.3164,
"step": 385
},
{
"epoch": 1.9108910891089108,
"grad_norm": 0.29228623984013463,
"learning_rate": 6.211493135706109e-05,
"loss": 0.3158,
"step": 386
},
{
"epoch": 1.9158415841584158,
"grad_norm": 0.34235524644727683,
"learning_rate": 6.199960565768611e-05,
"loss": 0.3126,
"step": 387
},
{
"epoch": 1.9207920792079207,
"grad_norm": 0.3411973043712662,
"learning_rate": 6.188401718186013e-05,
"loss": 0.3207,
"step": 388
},
{
"epoch": 1.9257425742574257,
"grad_norm": 0.25638163404336106,
"learning_rate": 6.17681673102411e-05,
"loss": 0.3207,
"step": 389
},
{
"epoch": 1.9306930693069306,
"grad_norm": 0.21633928164084248,
"learning_rate": 6.165205742660915e-05,
"loss": 0.3151,
"step": 390
},
{
"epoch": 1.9356435643564356,
"grad_norm": 0.17868716190088593,
"learning_rate": 6.15356889178502e-05,
"loss": 0.3162,
"step": 391
},
{
"epoch": 1.9405940594059405,
"grad_norm": 0.23795517223714394,
"learning_rate": 6.141906317393934e-05,
"loss": 0.318,
"step": 392
},
{
"epoch": 1.9455445544554455,
"grad_norm": 0.2584866147734799,
"learning_rate": 6.130218158792421e-05,
"loss": 0.3176,
"step": 393
},
{
"epoch": 1.9504950495049505,
"grad_norm": 0.24993881460825326,
"learning_rate": 6.118504555590843e-05,
"loss": 0.3183,
"step": 394
},
{
"epoch": 1.9554455445544554,
"grad_norm": 0.233885680487996,
"learning_rate": 6.10676564770348e-05,
"loss": 0.3168,
"step": 395
},
{
"epoch": 1.9603960396039604,
"grad_norm": 0.22126298891282137,
"learning_rate": 6.0950015753468745e-05,
"loss": 0.316,
"step": 396
},
{
"epoch": 1.9653465346534653,
"grad_norm": 0.15727132928115792,
"learning_rate": 6.083212479038143e-05,
"loss": 0.3162,
"step": 397
},
{
"epoch": 1.9702970297029703,
"grad_norm": 0.1936288808268254,
"learning_rate": 6.0713984995933016e-05,
"loss": 0.3171,
"step": 398
},
{
"epoch": 1.9752475247524752,
"grad_norm": 0.21781666303367697,
"learning_rate": 6.059559778125593e-05,
"loss": 0.3147,
"step": 399
},
{
"epoch": 1.9801980198019802,
"grad_norm": 0.1893055744915759,
"learning_rate": 6.0476964560437864e-05,
"loss": 0.3154,
"step": 400
},
{
"epoch": 1.9851485148514851,
"grad_norm": 0.16631443387482395,
"learning_rate": 6.035808675050497e-05,
"loss": 0.3182,
"step": 401
},
{
"epoch": 1.99009900990099,
"grad_norm": 0.1861546633666588,
"learning_rate": 6.023896577140496e-05,
"loss": 0.3171,
"step": 402
},
{
"epoch": 1.995049504950495,
"grad_norm": 0.173542149801703,
"learning_rate": 6.011960304599003e-05,
"loss": 0.3128,
"step": 403
},
{
"epoch": 2.0,
"grad_norm": 0.20561215287249168,
"learning_rate": 6.000000000000001e-05,
"loss": 0.3137,
"step": 404
},
{
"epoch": 2.004950495049505,
"grad_norm": 0.21980457246155255,
"learning_rate": 5.988015806204521e-05,
"loss": 0.2936,
"step": 405
},
{
"epoch": 2.00990099009901,
"grad_norm": 0.28062396978410536,
"learning_rate": 5.9760078663589454e-05,
"loss": 0.2911,
"step": 406
},
{
"epoch": 2.014851485148515,
"grad_norm": 0.34344935249211755,
"learning_rate": 5.9639763238932893e-05,
"loss": 0.298,
"step": 407
},
{
"epoch": 2.01980198019802,
"grad_norm": 0.3775207626291412,
"learning_rate": 5.9519213225194944e-05,
"loss": 0.2892,
"step": 408
},
{
"epoch": 2.0247524752475248,
"grad_norm": 0.4129173027605364,
"learning_rate": 5.9398430062297104e-05,
"loss": 0.2978,
"step": 409
},
{
"epoch": 2.0297029702970297,
"grad_norm": 0.48484659194676527,
"learning_rate": 5.9277415192945707e-05,
"loss": 0.2936,
"step": 410
},
{
"epoch": 2.0346534653465347,
"grad_norm": 0.5647215424942426,
"learning_rate": 5.915617006261475e-05,
"loss": 0.2984,
"step": 411
},
{
"epoch": 2.0396039603960396,
"grad_norm": 0.46988958631525757,
"learning_rate": 5.903469611952861e-05,
"loss": 0.2926,
"step": 412
},
{
"epoch": 2.0445544554455446,
"grad_norm": 0.2844786650225628,
"learning_rate": 5.891299481464473e-05,
"loss": 0.2949,
"step": 413
},
{
"epoch": 2.0495049504950495,
"grad_norm": 0.39589356406529824,
"learning_rate": 5.8791067601636305e-05,
"loss": 0.2935,
"step": 414
},
{
"epoch": 2.0544554455445545,
"grad_norm": 0.41475513226060795,
"learning_rate": 5.866891593687492e-05,
"loss": 0.2935,
"step": 415
},
{
"epoch": 2.0594059405940595,
"grad_norm": 0.2878905866874,
"learning_rate": 5.8546541279413094e-05,
"loss": 0.2875,
"step": 416
},
{
"epoch": 2.0643564356435644,
"grad_norm": 0.32027862809257346,
"learning_rate": 5.842394509096699e-05,
"loss": 0.2914,
"step": 417
},
{
"epoch": 2.0693069306930694,
"grad_norm": 0.3536068047740315,
"learning_rate": 5.8301128835898814e-05,
"loss": 0.2968,
"step": 418
},
{
"epoch": 2.0742574257425743,
"grad_norm": 0.2864872599359047,
"learning_rate": 5.817809398119937e-05,
"loss": 0.2928,
"step": 419
},
{
"epoch": 2.0792079207920793,
"grad_norm": 0.3534226329728549,
"learning_rate": 5.805484199647059e-05,
"loss": 0.2954,
"step": 420
},
{
"epoch": 2.0841584158415842,
"grad_norm": 0.27157082417435113,
"learning_rate": 5.7931374353907904e-05,
"loss": 0.2915,
"step": 421
},
{
"epoch": 2.089108910891089,
"grad_norm": 0.2785354421375662,
"learning_rate": 5.780769252828268e-05,
"loss": 0.2938,
"step": 422
},
{
"epoch": 2.094059405940594,
"grad_norm": 0.35357055000386345,
"learning_rate": 5.768379799692469e-05,
"loss": 0.2949,
"step": 423
},
{
"epoch": 2.099009900990099,
"grad_norm": 0.2581527386389988,
"learning_rate": 5.7559692239704255e-05,
"loss": 0.291,
"step": 424
},
{
"epoch": 2.103960396039604,
"grad_norm": 0.25995964778388375,
"learning_rate": 5.743537673901485e-05,
"loss": 0.2856,
"step": 425
},
{
"epoch": 2.108910891089109,
"grad_norm": 0.2488711176684702,
"learning_rate": 5.731085297975516e-05,
"loss": 0.2912,
"step": 426
},
{
"epoch": 2.113861386138614,
"grad_norm": 0.2554760977266841,
"learning_rate": 5.718612244931146e-05,
"loss": 0.2907,
"step": 427
},
{
"epoch": 2.118811881188119,
"grad_norm": 0.22671175719855702,
"learning_rate": 5.706118663753982e-05,
"loss": 0.2941,
"step": 428
},
{
"epoch": 2.123762376237624,
"grad_norm": 0.23769071563907318,
"learning_rate": 5.6936047036748335e-05,
"loss": 0.2894,
"step": 429
},
{
"epoch": 2.128712871287129,
"grad_norm": 0.2836621764459792,
"learning_rate": 5.6810705141679246e-05,
"loss": 0.2907,
"step": 430
},
{
"epoch": 2.133663366336634,
"grad_norm": 0.1858854875190047,
"learning_rate": 5.6685162449491125e-05,
"loss": 0.2919,
"step": 431
},
{
"epoch": 2.1386138613861387,
"grad_norm": 0.199748393569554,
"learning_rate": 5.655942045974101e-05,
"loss": 0.2892,
"step": 432
},
{
"epoch": 2.1435643564356437,
"grad_norm": 0.24570429878298897,
"learning_rate": 5.643348067436644e-05,
"loss": 0.2928,
"step": 433
},
{
"epoch": 2.1485148514851486,
"grad_norm": 0.1710956167931347,
"learning_rate": 5.6307344597667555e-05,
"loss": 0.2888,
"step": 434
},
{
"epoch": 2.1534653465346536,
"grad_norm": 0.22400624967389368,
"learning_rate": 5.6181013736289114e-05,
"loss": 0.2933,
"step": 435
},
{
"epoch": 2.1584158415841586,
"grad_norm": 0.18635235837084865,
"learning_rate": 5.605448959920251e-05,
"loss": 0.2891,
"step": 436
},
{
"epoch": 2.1633663366336635,
"grad_norm": 0.17591809964429744,
"learning_rate": 5.5927773697687726e-05,
"loss": 0.2891,
"step": 437
},
{
"epoch": 2.1683168316831685,
"grad_norm": 0.21247736779834164,
"learning_rate": 5.580086754531527e-05,
"loss": 0.2928,
"step": 438
},
{
"epoch": 2.1732673267326734,
"grad_norm": 0.14355206393142206,
"learning_rate": 5.567377265792819e-05,
"loss": 0.2906,
"step": 439
},
{
"epoch": 2.1782178217821784,
"grad_norm": 0.21295542237568282,
"learning_rate": 5.554649055362381e-05,
"loss": 0.2911,
"step": 440
},
{
"epoch": 2.1831683168316833,
"grad_norm": 0.17126899898270218,
"learning_rate": 5.5419022752735764e-05,
"loss": 0.289,
"step": 441
},
{
"epoch": 2.1881188118811883,
"grad_norm": 0.15620418915639625,
"learning_rate": 5.5291370777815693e-05,
"loss": 0.2912,
"step": 442
},
{
"epoch": 2.1930693069306932,
"grad_norm": 0.18366948761566249,
"learning_rate": 5.5163536153615185e-05,
"loss": 0.289,
"step": 443
},
{
"epoch": 2.198019801980198,
"grad_norm": 0.148505159881694,
"learning_rate": 5.503552040706744e-05,
"loss": 0.2885,
"step": 444
},
{
"epoch": 2.202970297029703,
"grad_norm": 0.18651095779714405,
"learning_rate": 5.490732506726911e-05,
"loss": 0.2904,
"step": 445
},
{
"epoch": 2.207920792079208,
"grad_norm": 0.16447675158548666,
"learning_rate": 5.477895166546207e-05,
"loss": 0.291,
"step": 446
},
{
"epoch": 2.212871287128713,
"grad_norm": 0.1453281761545619,
"learning_rate": 5.4650401735014985e-05,
"loss": 0.2943,
"step": 447
},
{
"epoch": 2.217821782178218,
"grad_norm": 0.17954456405595917,
"learning_rate": 5.452167681140515e-05,
"loss": 0.292,
"step": 448
},
{
"epoch": 2.222772277227723,
"grad_norm": 0.1334253529596552,
"learning_rate": 5.4392778432200044e-05,
"loss": 0.2925,
"step": 449
},
{
"epoch": 2.227722772277228,
"grad_norm": 0.1480862531642023,
"learning_rate": 5.426370813703903e-05,
"loss": 0.2893,
"step": 450
},
{
"epoch": 2.232673267326733,
"grad_norm": 0.14228035944887227,
"learning_rate": 5.4134467467614945e-05,
"loss": 0.296,
"step": 451
},
{
"epoch": 2.237623762376238,
"grad_norm": 0.1398882481618865,
"learning_rate": 5.4005057967655634e-05,
"loss": 0.2899,
"step": 452
},
{
"epoch": 2.2425742574257423,
"grad_norm": 0.150249228933869,
"learning_rate": 5.3875481182905595e-05,
"loss": 0.2875,
"step": 453
},
{
"epoch": 2.2475247524752477,
"grad_norm": 0.12606016232940834,
"learning_rate": 5.374573866110746e-05,
"loss": 0.2984,
"step": 454
},
{
"epoch": 2.2524752475247523,
"grad_norm": 0.15069385649777214,
"learning_rate": 5.3615831951983535e-05,
"loss": 0.2916,
"step": 455
},
{
"epoch": 2.2574257425742577,
"grad_norm": 0.14028873576812315,
"learning_rate": 5.348576260721725e-05,
"loss": 0.2855,
"step": 456
},
{
"epoch": 2.262376237623762,
"grad_norm": 0.1434736708641502,
"learning_rate": 5.3355532180434696e-05,
"loss": 0.2866,
"step": 457
},
{
"epoch": 2.2673267326732676,
"grad_norm": 0.15021662510139536,
"learning_rate": 5.3225142227185974e-05,
"loss": 0.2861,
"step": 458
},
{
"epoch": 2.272277227722772,
"grad_norm": 0.14159743878292066,
"learning_rate": 5.309459430492672e-05,
"loss": 0.2893,
"step": 459
},
{
"epoch": 2.2772277227722775,
"grad_norm": 0.15035314890314877,
"learning_rate": 5.2963889972999384e-05,
"loss": 0.294,
"step": 460
},
{
"epoch": 2.282178217821782,
"grad_norm": 0.1429351778691825,
"learning_rate": 5.283303079261471e-05,
"loss": 0.2877,
"step": 461
},
{
"epoch": 2.287128712871287,
"grad_norm": 0.15695781588769755,
"learning_rate": 5.2702018326833044e-05,
"loss": 0.2909,
"step": 462
},
{
"epoch": 2.292079207920792,
"grad_norm": 0.16086443131888203,
"learning_rate": 5.257085414054565e-05,
"loss": 0.2881,
"step": 463
},
{
"epoch": 2.297029702970297,
"grad_norm": 0.11603808269970421,
"learning_rate": 5.243953980045603e-05,
"loss": 0.2939,
"step": 464
},
{
"epoch": 2.301980198019802,
"grad_norm": 0.11831438387847333,
"learning_rate": 5.230807687506122e-05,
"loss": 0.2946,
"step": 465
},
{
"epoch": 2.3069306930693068,
"grad_norm": 0.151830685765451,
"learning_rate": 5.2176466934633045e-05,
"loss": 0.2916,
"step": 466
},
{
"epoch": 2.3118811881188117,
"grad_norm": 0.12678954851295432,
"learning_rate": 5.204471155119938e-05,
"loss": 0.2965,
"step": 467
},
{
"epoch": 2.3168316831683167,
"grad_norm": 0.13602397297885316,
"learning_rate": 5.191281229852534e-05,
"loss": 0.2958,
"step": 468
},
{
"epoch": 2.3217821782178216,
"grad_norm": 0.12563578179567897,
"learning_rate": 5.17807707520945e-05,
"loss": 0.2905,
"step": 469
},
{
"epoch": 2.3267326732673266,
"grad_norm": 0.10565510601351141,
"learning_rate": 5.164858848909009e-05,
"loss": 0.2937,
"step": 470
},
{
"epoch": 2.3316831683168315,
"grad_norm": 0.13170476219629715,
"learning_rate": 5.151626708837612e-05,
"loss": 0.2971,
"step": 471
},
{
"epoch": 2.3366336633663365,
"grad_norm": 0.11937369149884527,
"learning_rate": 5.1383808130478605e-05,
"loss": 0.2885,
"step": 472
},
{
"epoch": 2.3415841584158414,
"grad_norm": 0.11453800690630266,
"learning_rate": 5.1251213197566515e-05,
"loss": 0.2854,
"step": 473
},
{
"epoch": 2.3465346534653464,
"grad_norm": 0.13062929038283053,
"learning_rate": 5.11184838734331e-05,
"loss": 0.2924,
"step": 474
},
{
"epoch": 2.3514851485148514,
"grad_norm": 0.12107898486408612,
"learning_rate": 5.098562174347679e-05,
"loss": 0.293,
"step": 475
},
{
"epoch": 2.3564356435643563,
"grad_norm": 0.09901605670345262,
"learning_rate": 5.085262839468236e-05,
"loss": 0.2913,
"step": 476
},
{
"epoch": 2.3613861386138613,
"grad_norm": 0.13783522952102104,
"learning_rate": 5.071950541560193e-05,
"loss": 0.2895,
"step": 477
},
{
"epoch": 2.366336633663366,
"grad_norm": 0.12148311907430435,
"learning_rate": 5.058625439633599e-05,
"loss": 0.2877,
"step": 478
},
{
"epoch": 2.371287128712871,
"grad_norm": 0.1028377899483092,
"learning_rate": 5.0452876928514434e-05,
"loss": 0.2881,
"step": 479
},
{
"epoch": 2.376237623762376,
"grad_norm": 0.12962503408861803,
"learning_rate": 5.031937460527753e-05,
"loss": 0.2974,
"step": 480
},
{
"epoch": 2.381188118811881,
"grad_norm": 0.1273079806450915,
"learning_rate": 5.018574902125689e-05,
"loss": 0.2882,
"step": 481
},
{
"epoch": 2.386138613861386,
"grad_norm": 0.10935427368540332,
"learning_rate": 5.005200177255645e-05,
"loss": 0.2905,
"step": 482
},
{
"epoch": 2.391089108910891,
"grad_norm": 0.13181865971025042,
"learning_rate": 4.991813445673334e-05,
"loss": 0.2941,
"step": 483
},
{
"epoch": 2.396039603960396,
"grad_norm": 0.14828254882763164,
"learning_rate": 4.9784148672778864e-05,
"loss": 0.2936,
"step": 484
},
{
"epoch": 2.400990099009901,
"grad_norm": 0.10590676383587967,
"learning_rate": 4.965004602109938e-05,
"loss": 0.2869,
"step": 485
},
{
"epoch": 2.405940594059406,
"grad_norm": 0.10897986558224348,
"learning_rate": 4.95158281034972e-05,
"loss": 0.2965,
"step": 486
},
{
"epoch": 2.410891089108911,
"grad_norm": 0.1526575863023953,
"learning_rate": 4.938149652315142e-05,
"loss": 0.2904,
"step": 487
},
{
"epoch": 2.4158415841584158,
"grad_norm": 0.13976101013770628,
"learning_rate": 4.92470528845988e-05,
"loss": 0.2907,
"step": 488
},
{
"epoch": 2.4207920792079207,
"grad_norm": 0.11275068997162369,
"learning_rate": 4.911249879371457e-05,
"loss": 0.2939,
"step": 489
},
{
"epoch": 2.4257425742574257,
"grad_norm": 0.13409375961453313,
"learning_rate": 4.897783585769331e-05,
"loss": 0.2896,
"step": 490
},
{
"epoch": 2.4306930693069306,
"grad_norm": 0.13184720935289135,
"learning_rate": 4.884306568502968e-05,
"loss": 0.2905,
"step": 491
},
{
"epoch": 2.4356435643564356,
"grad_norm": 0.09381659993624109,
"learning_rate": 4.870818988549923e-05,
"loss": 0.2881,
"step": 492
},
{
"epoch": 2.4405940594059405,
"grad_norm": 0.10819786847426302,
"learning_rate": 4.857321007013924e-05,
"loss": 0.2874,
"step": 493
},
{
"epoch": 2.4455445544554455,
"grad_norm": 0.11840707921735176,
"learning_rate": 4.843812785122933e-05,
"loss": 0.2914,
"step": 494
},
{
"epoch": 2.4504950495049505,
"grad_norm": 0.11186031988136662,
"learning_rate": 4.830294484227236e-05,
"loss": 0.2902,
"step": 495
},
{
"epoch": 2.4554455445544554,
"grad_norm": 0.10078610321782347,
"learning_rate": 4.816766265797505e-05,
"loss": 0.2875,
"step": 496
},
{
"epoch": 2.4603960396039604,
"grad_norm": 0.12008233777261267,
"learning_rate": 4.8032282914228743e-05,
"loss": 0.293,
"step": 497
},
{
"epoch": 2.4653465346534653,
"grad_norm": 0.12200388770178253,
"learning_rate": 4.78968072280901e-05,
"loss": 0.2868,
"step": 498
},
{
"epoch": 2.4702970297029703,
"grad_norm": 0.136406459834568,
"learning_rate": 4.7761237217761736e-05,
"loss": 0.2903,
"step": 499
},
{
"epoch": 2.4752475247524752,
"grad_norm": 0.11393281739573007,
"learning_rate": 4.7625574502572975e-05,
"loss": 0.2892,
"step": 500
},
{
"epoch": 2.48019801980198,
"grad_norm": 0.14322465120458702,
"learning_rate": 4.7489820702960444e-05,
"loss": 0.2913,
"step": 501
},
{
"epoch": 2.485148514851485,
"grad_norm": 0.1330468571388596,
"learning_rate": 4.735397744044874e-05,
"loss": 0.29,
"step": 502
},
{
"epoch": 2.49009900990099,
"grad_norm": 0.1914714574897793,
"learning_rate": 4.721804633763105e-05,
"loss": 0.2904,
"step": 503
},
{
"epoch": 2.495049504950495,
"grad_norm": 0.17353960077506989,
"learning_rate": 4.7082029018149816e-05,
"loss": 0.2914,
"step": 504
},
{
"epoch": 2.5,
"grad_norm": 0.10568552222443248,
"learning_rate": 4.694592710667723e-05,
"loss": 0.2879,
"step": 505
},
{
"epoch": 2.504950495049505,
"grad_norm": 0.14992802837908273,
"learning_rate": 4.680974222889595e-05,
"loss": 0.2884,
"step": 506
},
{
"epoch": 2.50990099009901,
"grad_norm": 0.16137694662439006,
"learning_rate": 4.667347601147965e-05,
"loss": 0.2897,
"step": 507
},
{
"epoch": 2.514851485148515,
"grad_norm": 0.12112544582018925,
"learning_rate": 4.653713008207353e-05,
"loss": 0.291,
"step": 508
},
{
"epoch": 2.51980198019802,
"grad_norm": 0.11917815529367859,
"learning_rate": 4.640070606927497e-05,
"loss": 0.2919,
"step": 509
},
{
"epoch": 2.5247524752475248,
"grad_norm": 0.15623317002467732,
"learning_rate": 4.6264205602613944e-05,
"loss": 0.2899,
"step": 510
},
{
"epoch": 2.5297029702970297,
"grad_norm": 0.1381688374311921,
"learning_rate": 4.612763031253372e-05,
"loss": 0.2933,
"step": 511
},
{
"epoch": 2.5346534653465347,
"grad_norm": 0.10724423849127208,
"learning_rate": 4.599098183037127e-05,
"loss": 0.2919,
"step": 512
},
{
"epoch": 2.5396039603960396,
"grad_norm": 0.1247464275436635,
"learning_rate": 4.5854261788337785e-05,
"loss": 0.2913,
"step": 513
},
{
"epoch": 2.5445544554455446,
"grad_norm": 0.13249787487710485,
"learning_rate": 4.571747181949928e-05,
"loss": 0.2895,
"step": 514
},
{
"epoch": 2.5495049504950495,
"grad_norm": 0.10321336217751037,
"learning_rate": 4.558061355775693e-05,
"loss": 0.2938,
"step": 515
},
{
"epoch": 2.5544554455445545,
"grad_norm": 0.10307416532977476,
"learning_rate": 4.5443688637827716e-05,
"loss": 0.2923,
"step": 516
},
{
"epoch": 2.5594059405940595,
"grad_norm": 0.12137127847342442,
"learning_rate": 4.530669869522478e-05,
"loss": 0.2938,
"step": 517
},
{
"epoch": 2.5643564356435644,
"grad_norm": 0.10696843702534209,
"learning_rate": 4.516964536623796e-05,
"loss": 0.2917,
"step": 518
},
{
"epoch": 2.5693069306930694,
"grad_norm": 0.10464426473950372,
"learning_rate": 4.503253028791422e-05,
"loss": 0.2871,
"step": 519
},
{
"epoch": 2.5742574257425743,
"grad_norm": 0.11642336829523302,
"learning_rate": 4.489535509803806e-05,
"loss": 0.2926,
"step": 520
},
{
"epoch": 2.5792079207920793,
"grad_norm": 0.10644012339280991,
"learning_rate": 4.475812143511202e-05,
"loss": 0.2903,
"step": 521
},
{
"epoch": 2.5841584158415842,
"grad_norm": 0.10999866291513487,
"learning_rate": 4.4620830938337055e-05,
"loss": 0.2883,
"step": 522
},
{
"epoch": 2.589108910891089,
"grad_norm": 0.13755068955133282,
"learning_rate": 4.448348524759302e-05,
"loss": 0.2907,
"step": 523
},
{
"epoch": 2.594059405940594,
"grad_norm": 0.11304002693406412,
"learning_rate": 4.4346086003418985e-05,
"loss": 0.2924,
"step": 524
},
{
"epoch": 2.599009900990099,
"grad_norm": 0.10875367629369516,
"learning_rate": 4.420863484699374e-05,
"loss": 0.2895,
"step": 525
},
{
"epoch": 2.603960396039604,
"grad_norm": 0.12194581656327487,
"learning_rate": 4.4071133420116106e-05,
"loss": 0.2922,
"step": 526
},
{
"epoch": 2.608910891089109,
"grad_norm": 0.10928580726928758,
"learning_rate": 4.3933583365185396e-05,
"loss": 0.2956,
"step": 527
},
{
"epoch": 2.613861386138614,
"grad_norm": 0.10472106905680585,
"learning_rate": 4.379598632518175e-05,
"loss": 0.2901,
"step": 528
},
{
"epoch": 2.618811881188119,
"grad_norm": 0.13630901537032983,
"learning_rate": 4.365834394364653e-05,
"loss": 0.2945,
"step": 529
},
{
"epoch": 2.623762376237624,
"grad_norm": 0.12559855808593584,
"learning_rate": 4.35206578646627e-05,
"loss": 0.2897,
"step": 530
},
{
"epoch": 2.628712871287129,
"grad_norm": 0.10534753317516414,
"learning_rate": 4.338292973283512e-05,
"loss": 0.2896,
"step": 531
},
{
"epoch": 2.633663366336634,
"grad_norm": 0.11993140772526223,
"learning_rate": 4.324516119327102e-05,
"loss": 0.2894,
"step": 532
},
{
"epoch": 2.6386138613861387,
"grad_norm": 0.11261616944808854,
"learning_rate": 4.310735389156026e-05,
"loss": 0.292,
"step": 533
},
{
"epoch": 2.6435643564356437,
"grad_norm": 0.09250685300963525,
"learning_rate": 4.296950947375566e-05,
"loss": 0.2912,
"step": 534
},
{
"epoch": 2.6485148514851486,
"grad_norm": 0.09584975853113382,
"learning_rate": 4.2831629586353446e-05,
"loss": 0.2882,
"step": 535
},
{
"epoch": 2.6534653465346536,
"grad_norm": 0.10430183276684997,
"learning_rate": 4.269371587627346e-05,
"loss": 0.2918,
"step": 536
},
{
"epoch": 2.6584158415841586,
"grad_norm": 0.09633743259405408,
"learning_rate": 4.255576999083956e-05,
"loss": 0.2912,
"step": 537
},
{
"epoch": 2.6633663366336635,
"grad_norm": 0.10315192122664113,
"learning_rate": 4.241779357775993e-05,
"loss": 0.2901,
"step": 538
},
{
"epoch": 2.6683168316831685,
"grad_norm": 0.11126301619791243,
"learning_rate": 4.227978828510739e-05,
"loss": 0.2907,
"step": 539
},
{
"epoch": 2.6732673267326734,
"grad_norm": 0.12626969810049277,
"learning_rate": 4.214175576129972e-05,
"loss": 0.2843,
"step": 540
},
{
"epoch": 2.6782178217821784,
"grad_norm": 0.10637402816416124,
"learning_rate": 4.200369765507995e-05,
"loss": 0.291,
"step": 541
},
{
"epoch": 2.6831683168316833,
"grad_norm": 0.11609951156690725,
"learning_rate": 4.18656156154967e-05,
"loss": 0.289,
"step": 542
},
{
"epoch": 2.6881188118811883,
"grad_norm": 0.10635855005794152,
"learning_rate": 4.172751129188447e-05,
"loss": 0.2878,
"step": 543
},
{
"epoch": 2.693069306930693,
"grad_norm": 0.09969116484857603,
"learning_rate": 4.158938633384389e-05,
"loss": 0.2911,
"step": 544
},
{
"epoch": 2.698019801980198,
"grad_norm": 0.10338679032150914,
"learning_rate": 4.1451242391222105e-05,
"loss": 0.29,
"step": 545
},
{
"epoch": 2.7029702970297027,
"grad_norm": 0.09747866123350818,
"learning_rate": 4.1313081114093025e-05,
"loss": 0.2878,
"step": 546
},
{
"epoch": 2.707920792079208,
"grad_norm": 0.09745084051835436,
"learning_rate": 4.117490415273757e-05,
"loss": 0.2893,
"step": 547
},
{
"epoch": 2.7128712871287126,
"grad_norm": 0.09225392778972681,
"learning_rate": 4.1036713157624045e-05,
"loss": 0.2903,
"step": 548
},
{
"epoch": 2.717821782178218,
"grad_norm": 0.09437236963665839,
"learning_rate": 4.089850977938836e-05,
"loss": 0.2881,
"step": 549
},
{
"epoch": 2.7227722772277225,
"grad_norm": 0.09436910160998535,
"learning_rate": 4.076029566881436e-05,
"loss": 0.289,
"step": 550
},
{
"epoch": 2.727722772277228,
"grad_norm": 0.0980607482524191,
"learning_rate": 4.0622072476814045e-05,
"loss": 0.2872,
"step": 551
},
{
"epoch": 2.7326732673267324,
"grad_norm": 0.10439483236771886,
"learning_rate": 4.0483841854407906e-05,
"loss": 0.2934,
"step": 552
},
{
"epoch": 2.737623762376238,
"grad_norm": 0.10513256333702312,
"learning_rate": 4.0345605452705225e-05,
"loss": 0.2933,
"step": 553
},
{
"epoch": 2.7425742574257423,
"grad_norm": 0.09879708977573251,
"learning_rate": 4.020736492288426e-05,
"loss": 0.2892,
"step": 554
},
{
"epoch": 2.7475247524752477,
"grad_norm": 0.10870427744339516,
"learning_rate": 4.006912191617259e-05,
"loss": 0.2885,
"step": 555
},
{
"epoch": 2.7524752475247523,
"grad_norm": 0.10462833839112169,
"learning_rate": 3.993087808382742e-05,
"loss": 0.2908,
"step": 556
},
{
"epoch": 2.7574257425742577,
"grad_norm": 0.11008718087986996,
"learning_rate": 3.9792635077115755e-05,
"loss": 0.2915,
"step": 557
},
{
"epoch": 2.762376237623762,
"grad_norm": 0.11165269895235802,
"learning_rate": 3.9654394547294775e-05,
"loss": 0.2949,
"step": 558
},
{
"epoch": 2.7673267326732676,
"grad_norm": 0.11157871130143804,
"learning_rate": 3.9516158145592093e-05,
"loss": 0.2902,
"step": 559
},
{
"epoch": 2.772277227722772,
"grad_norm": 0.10798653329594901,
"learning_rate": 3.937792752318597e-05,
"loss": 0.29,
"step": 560
},
{
"epoch": 2.7772277227722775,
"grad_norm": 0.08830297670358335,
"learning_rate": 3.923970433118566e-05,
"loss": 0.2911,
"step": 561
},
{
"epoch": 2.782178217821782,
"grad_norm": 0.10692200078028577,
"learning_rate": 3.9101490220611646e-05,
"loss": 0.2888,
"step": 562
},
{
"epoch": 2.7871287128712874,
"grad_norm": 0.09073477619941334,
"learning_rate": 3.8963286842375955e-05,
"loss": 0.2884,
"step": 563
},
{
"epoch": 2.792079207920792,
"grad_norm": 0.10758407723631432,
"learning_rate": 3.882509584726244e-05,
"loss": 0.2884,
"step": 564
},
{
"epoch": 2.7970297029702973,
"grad_norm": 0.08703114989053835,
"learning_rate": 3.868691888590699e-05,
"loss": 0.2905,
"step": 565
},
{
"epoch": 2.801980198019802,
"grad_norm": 0.09815325795728913,
"learning_rate": 3.854875760877791e-05,
"loss": 0.2891,
"step": 566
},
{
"epoch": 2.806930693069307,
"grad_norm": 0.0963193079710705,
"learning_rate": 3.8410613666156126e-05,
"loss": 0.2932,
"step": 567
},
{
"epoch": 2.8118811881188117,
"grad_norm": 0.0934892455160611,
"learning_rate": 3.8272488708115536e-05,
"loss": 0.2877,
"step": 568
},
{
"epoch": 2.8168316831683167,
"grad_norm": 0.11193820279984304,
"learning_rate": 3.81343843845033e-05,
"loss": 0.289,
"step": 569
},
{
"epoch": 2.8217821782178216,
"grad_norm": 0.11073267502904961,
"learning_rate": 3.7996302344920056e-05,
"loss": 0.2881,
"step": 570
},
{
"epoch": 2.8267326732673266,
"grad_norm": 0.10447219952745992,
"learning_rate": 3.785824423870029e-05,
"loss": 0.2932,
"step": 571
},
{
"epoch": 2.8316831683168315,
"grad_norm": 0.09161213745057079,
"learning_rate": 3.772021171489261e-05,
"loss": 0.2888,
"step": 572
},
{
"epoch": 2.8366336633663365,
"grad_norm": 0.10089529395624805,
"learning_rate": 3.7582206422240073e-05,
"loss": 0.2923,
"step": 573
},
{
"epoch": 2.8415841584158414,
"grad_norm": 0.08617518269899792,
"learning_rate": 3.744423000916045e-05,
"loss": 0.2872,
"step": 574
},
{
"epoch": 2.8465346534653464,
"grad_norm": 0.10144855439914764,
"learning_rate": 3.7306284123726545e-05,
"loss": 0.2901,
"step": 575
},
{
"epoch": 2.8514851485148514,
"grad_norm": 0.10158399468636482,
"learning_rate": 3.716837041364657e-05,
"loss": 0.2924,
"step": 576
},
{
"epoch": 2.8564356435643563,
"grad_norm": 0.10116294192144563,
"learning_rate": 3.703049052624434e-05,
"loss": 0.2844,
"step": 577
},
{
"epoch": 2.8613861386138613,
"grad_norm": 0.11957236661139066,
"learning_rate": 3.689264610843975e-05,
"loss": 0.2897,
"step": 578
},
{
"epoch": 2.866336633663366,
"grad_norm": 0.11284051298160382,
"learning_rate": 3.6754838806728985e-05,
"loss": 0.2867,
"step": 579
},
{
"epoch": 2.871287128712871,
"grad_norm": 0.0924325415706199,
"learning_rate": 3.6617070267164895e-05,
"loss": 0.289,
"step": 580
},
{
"epoch": 2.876237623762376,
"grad_norm": 0.1105426281095416,
"learning_rate": 3.647934213533733e-05,
"loss": 0.2875,
"step": 581
},
{
"epoch": 2.881188118811881,
"grad_norm": 0.09174690942839933,
"learning_rate": 3.634165605635347e-05,
"loss": 0.292,
"step": 582
},
{
"epoch": 2.886138613861386,
"grad_norm": 0.10220347134653099,
"learning_rate": 3.6204013674818264e-05,
"loss": 0.2857,
"step": 583
},
{
"epoch": 2.891089108910891,
"grad_norm": 0.0981680983492792,
"learning_rate": 3.606641663481462e-05,
"loss": 0.2919,
"step": 584
},
{
"epoch": 2.896039603960396,
"grad_norm": 0.09807625793972466,
"learning_rate": 3.5928866579883914e-05,
"loss": 0.2902,
"step": 585
},
{
"epoch": 2.900990099009901,
"grad_norm": 0.09723694201470973,
"learning_rate": 3.579136515300627e-05,
"loss": 0.2904,
"step": 586
},
{
"epoch": 2.905940594059406,
"grad_norm": 0.09282029713355545,
"learning_rate": 3.565391399658102e-05,
"loss": 0.2858,
"step": 587
},
{
"epoch": 2.910891089108911,
"grad_norm": 0.09078676323851734,
"learning_rate": 3.5516514752406996e-05,
"loss": 0.2877,
"step": 588
},
{
"epoch": 2.9158415841584158,
"grad_norm": 0.1035730885124026,
"learning_rate": 3.537916906166295e-05,
"loss": 0.2887,
"step": 589
},
{
"epoch": 2.9207920792079207,
"grad_norm": 0.10072443988749984,
"learning_rate": 3.5241878564888006e-05,
"loss": 0.2857,
"step": 590
},
{
"epoch": 2.9257425742574257,
"grad_norm": 0.10201801759402962,
"learning_rate": 3.510464490196195e-05,
"loss": 0.2878,
"step": 591
},
{
"epoch": 2.9306930693069306,
"grad_norm": 0.09498823456658204,
"learning_rate": 3.496746971208579e-05,
"loss": 0.2903,
"step": 592
},
{
"epoch": 2.9356435643564356,
"grad_norm": 0.09439742302971477,
"learning_rate": 3.4830354633762044e-05,
"loss": 0.2885,
"step": 593
},
{
"epoch": 2.9405940594059405,
"grad_norm": 0.10085915056545246,
"learning_rate": 3.4693301304775226e-05,
"loss": 0.2912,
"step": 594
},
{
"epoch": 2.9455445544554455,
"grad_norm": 0.08842640947738424,
"learning_rate": 3.455631136217231e-05,
"loss": 0.2867,
"step": 595
},
{
"epoch": 2.9504950495049505,
"grad_norm": 0.1004373056709111,
"learning_rate": 3.4419386442243084e-05,
"loss": 0.2921,
"step": 596
},
{
"epoch": 2.9554455445544554,
"grad_norm": 0.09684450083741175,
"learning_rate": 3.428252818050074e-05,
"loss": 0.2916,
"step": 597
},
{
"epoch": 2.9603960396039604,
"grad_norm": 0.09286612877802164,
"learning_rate": 3.414573821166222e-05,
"loss": 0.2905,
"step": 598
},
{
"epoch": 2.9653465346534653,
"grad_norm": 0.09408550294226069,
"learning_rate": 3.4009018169628744e-05,
"loss": 0.2871,
"step": 599
},
{
"epoch": 2.9702970297029703,
"grad_norm": 0.09366561753918336,
"learning_rate": 3.38723696874663e-05,
"loss": 0.2906,
"step": 600
},
{
"epoch": 2.9752475247524752,
"grad_norm": 0.10124650316242359,
"learning_rate": 3.373579439738606e-05,
"loss": 0.2885,
"step": 601
},
{
"epoch": 2.98019801980198,
"grad_norm": 0.08127495837017719,
"learning_rate": 3.359929393072505e-05,
"loss": 0.2922,
"step": 602
},
{
"epoch": 2.985148514851485,
"grad_norm": 0.10185253182926989,
"learning_rate": 3.346286991792648e-05,
"loss": 0.2894,
"step": 603
},
{
"epoch": 2.99009900990099,
"grad_norm": 0.09763480465367606,
"learning_rate": 3.3326523988520365e-05,
"loss": 0.288,
"step": 604
},
{
"epoch": 2.995049504950495,
"grad_norm": 0.08124295963133833,
"learning_rate": 3.3190257771104055e-05,
"loss": 0.2865,
"step": 605
},
{
"epoch": 3.0,
"grad_norm": 0.10088677360749207,
"learning_rate": 3.305407289332279e-05,
"loss": 0.2818,
"step": 606
},
{
"epoch": 3.004950495049505,
"grad_norm": 0.13873473329596994,
"learning_rate": 3.2917970981850205e-05,
"loss": 0.2602,
"step": 607
},
{
"epoch": 3.00990099009901,
"grad_norm": 0.12976038560967385,
"learning_rate": 3.2781953662368954e-05,
"loss": 0.2606,
"step": 608
},
{
"epoch": 3.014851485148515,
"grad_norm": 0.15951817148087163,
"learning_rate": 3.264602255955127e-05,
"loss": 0.2577,
"step": 609
},
{
"epoch": 3.01980198019802,
"grad_norm": 0.16279714502214718,
"learning_rate": 3.251017929703956e-05,
"loss": 0.2649,
"step": 610
},
{
"epoch": 3.0247524752475248,
"grad_norm": 0.1492586830551721,
"learning_rate": 3.237442549742704e-05,
"loss": 0.2612,
"step": 611
},
{
"epoch": 3.0297029702970297,
"grad_norm": 0.1491998033398401,
"learning_rate": 3.223876278223828e-05,
"loss": 0.2601,
"step": 612
},
{
"epoch": 3.0346534653465347,
"grad_norm": 0.13488498997370776,
"learning_rate": 3.2103192771909927e-05,
"loss": 0.2625,
"step": 613
},
{
"epoch": 3.0396039603960396,
"grad_norm": 0.1342441983854818,
"learning_rate": 3.196771708577127e-05,
"loss": 0.2597,
"step": 614
},
{
"epoch": 3.0445544554455446,
"grad_norm": 0.12716853452234733,
"learning_rate": 3.1832337342024956e-05,
"loss": 0.2618,
"step": 615
},
{
"epoch": 3.0495049504950495,
"grad_norm": 0.11793012695462536,
"learning_rate": 3.1697055157727654e-05,
"loss": 0.2612,
"step": 616
},
{
"epoch": 3.0544554455445545,
"grad_norm": 0.11621690642718228,
"learning_rate": 3.156187214877068e-05,
"loss": 0.2627,
"step": 617
},
{
"epoch": 3.0594059405940595,
"grad_norm": 0.10824520198545912,
"learning_rate": 3.142678992986078e-05,
"loss": 0.2588,
"step": 618
},
{
"epoch": 3.0643564356435644,
"grad_norm": 0.11763046761959932,
"learning_rate": 3.129181011450077e-05,
"loss": 0.2624,
"step": 619
},
{
"epoch": 3.0693069306930694,
"grad_norm": 0.13250782552006196,
"learning_rate": 3.115693431497033e-05,
"loss": 0.259,
"step": 620
},
{
"epoch": 3.0742574257425743,
"grad_norm": 0.10745625886139168,
"learning_rate": 3.102216414230671e-05,
"loss": 0.2634,
"step": 621
},
{
"epoch": 3.0792079207920793,
"grad_norm": 0.13028975884190627,
"learning_rate": 3.0887501206285436e-05,
"loss": 0.2645,
"step": 622
},
{
"epoch": 3.0841584158415842,
"grad_norm": 0.11204426938100358,
"learning_rate": 3.075294711540123e-05,
"loss": 0.2568,
"step": 623
},
{
"epoch": 3.089108910891089,
"grad_norm": 0.0938552199733989,
"learning_rate": 3.061850347684859e-05,
"loss": 0.2602,
"step": 624
},
{
"epoch": 3.094059405940594,
"grad_norm": 0.1127679213593348,
"learning_rate": 3.0484171896502805e-05,
"loss": 0.2607,
"step": 625
},
{
"epoch": 3.099009900990099,
"grad_norm": 0.10200434233865428,
"learning_rate": 3.034995397890063e-05,
"loss": 0.2616,
"step": 626
},
{
"epoch": 3.103960396039604,
"grad_norm": 0.09991375780190867,
"learning_rate": 3.0215851327221163e-05,
"loss": 0.2623,
"step": 627
},
{
"epoch": 3.108910891089109,
"grad_norm": 0.09232562826749552,
"learning_rate": 3.0081865543266687e-05,
"loss": 0.2614,
"step": 628
},
{
"epoch": 3.113861386138614,
"grad_norm": 0.1065573407714681,
"learning_rate": 2.994799822744356e-05,
"loss": 0.2586,
"step": 629
},
{
"epoch": 3.118811881188119,
"grad_norm": 0.09147796522034173,
"learning_rate": 2.9814250978743115e-05,
"loss": 0.2592,
"step": 630
},
{
"epoch": 3.123762376237624,
"grad_norm": 0.09934546358889536,
"learning_rate": 2.9680625394722483e-05,
"loss": 0.265,
"step": 631
},
{
"epoch": 3.128712871287129,
"grad_norm": 0.09335841251464885,
"learning_rate": 2.9547123071485586e-05,
"loss": 0.2591,
"step": 632
},
{
"epoch": 3.133663366336634,
"grad_norm": 0.08506717200868281,
"learning_rate": 2.9413745603664023e-05,
"loss": 0.2611,
"step": 633
},
{
"epoch": 3.1386138613861387,
"grad_norm": 0.09638638396370519,
"learning_rate": 2.928049458439808e-05,
"loss": 0.2627,
"step": 634
},
{
"epoch": 3.1435643564356437,
"grad_norm": 0.08636142462750247,
"learning_rate": 2.914737160531765e-05,
"loss": 0.2648,
"step": 635
},
{
"epoch": 3.1485148514851486,
"grad_norm": 0.09268694847893381,
"learning_rate": 2.9014378256523218e-05,
"loss": 0.2605,
"step": 636
},
{
"epoch": 3.1534653465346536,
"grad_norm": 0.09876107078789798,
"learning_rate": 2.888151612656692e-05,
"loss": 0.2583,
"step": 637
},
{
"epoch": 3.1584158415841586,
"grad_norm": 0.08411855624881796,
"learning_rate": 2.874878680243349e-05,
"loss": 0.263,
"step": 638
},
{
"epoch": 3.1633663366336635,
"grad_norm": 0.10415546979119643,
"learning_rate": 2.8616191869521412e-05,
"loss": 0.2604,
"step": 639
},
{
"epoch": 3.1683168316831685,
"grad_norm": 0.09152241248584239,
"learning_rate": 2.8483732911623882e-05,
"loss": 0.2617,
"step": 640
},
{
"epoch": 3.1732673267326734,
"grad_norm": 0.08574306643093083,
"learning_rate": 2.8351411510909926e-05,
"loss": 0.2551,
"step": 641
},
{
"epoch": 3.1782178217821784,
"grad_norm": 0.11525551347781693,
"learning_rate": 2.821922924790552e-05,
"loss": 0.2627,
"step": 642
},
{
"epoch": 3.1831683168316833,
"grad_norm": 0.07971057087238882,
"learning_rate": 2.8087187701474667e-05,
"loss": 0.2593,
"step": 643
},
{
"epoch": 3.1881188118811883,
"grad_norm": 0.09070420344687578,
"learning_rate": 2.7955288448800628e-05,
"loss": 0.2647,
"step": 644
},
{
"epoch": 3.1930693069306932,
"grad_norm": 0.08840670408396233,
"learning_rate": 2.7823533065366965e-05,
"loss": 0.2606,
"step": 645
},
{
"epoch": 3.198019801980198,
"grad_norm": 0.08190382673177843,
"learning_rate": 2.7691923124938794e-05,
"loss": 0.2592,
"step": 646
},
{
"epoch": 3.202970297029703,
"grad_norm": 0.08956962744513197,
"learning_rate": 2.756046019954398e-05,
"loss": 0.2617,
"step": 647
},
{
"epoch": 3.207920792079208,
"grad_norm": 0.08875947136387043,
"learning_rate": 2.742914585945436e-05,
"loss": 0.2601,
"step": 648
},
{
"epoch": 3.212871287128713,
"grad_norm": 0.07905536192742312,
"learning_rate": 2.7297981673166963e-05,
"loss": 0.2624,
"step": 649
},
{
"epoch": 3.217821782178218,
"grad_norm": 0.08409581506993473,
"learning_rate": 2.71669692073853e-05,
"loss": 0.2607,
"step": 650
},
{
"epoch": 3.222772277227723,
"grad_norm": 0.07852436746970415,
"learning_rate": 2.7036110027000636e-05,
"loss": 0.2614,
"step": 651
},
{
"epoch": 3.227722772277228,
"grad_norm": 0.10198706585003715,
"learning_rate": 2.690540569507329e-05,
"loss": 0.2603,
"step": 652
},
{
"epoch": 3.232673267326733,
"grad_norm": 0.08152560978569826,
"learning_rate": 2.677485777281403e-05,
"loss": 0.263,
"step": 653
},
{
"epoch": 3.237623762376238,
"grad_norm": 0.1010991457038921,
"learning_rate": 2.6644467819565317e-05,
"loss": 0.2604,
"step": 654
},
{
"epoch": 3.2425742574257423,
"grad_norm": 0.07891142552443962,
"learning_rate": 2.651423739278276e-05,
"loss": 0.2651,
"step": 655
},
{
"epoch": 3.2475247524752477,
"grad_norm": 0.11260260262443215,
"learning_rate": 2.638416804801648e-05,
"loss": 0.2635,
"step": 656
},
{
"epoch": 3.2524752475247523,
"grad_norm": 0.07603687187619462,
"learning_rate": 2.6254261338892536e-05,
"loss": 0.2579,
"step": 657
},
{
"epoch": 3.2574257425742577,
"grad_norm": 0.09383699286259775,
"learning_rate": 2.6124518817094418e-05,
"loss": 0.2624,
"step": 658
},
{
"epoch": 3.262376237623762,
"grad_norm": 0.08743649333602849,
"learning_rate": 2.5994942032344376e-05,
"loss": 0.2586,
"step": 659
},
{
"epoch": 3.2673267326732676,
"grad_norm": 0.08026177278250374,
"learning_rate": 2.5865532532385072e-05,
"loss": 0.2614,
"step": 660
},
{
"epoch": 3.272277227722772,
"grad_norm": 0.07564032476267157,
"learning_rate": 2.573629186296097e-05,
"loss": 0.2586,
"step": 661
},
{
"epoch": 3.2772277227722775,
"grad_norm": 0.07578420912298534,
"learning_rate": 2.560722156779996e-05,
"loss": 0.2579,
"step": 662
},
{
"epoch": 3.282178217821782,
"grad_norm": 0.08086724414241181,
"learning_rate": 2.547832318859487e-05,
"loss": 0.2579,
"step": 663
},
{
"epoch": 3.287128712871287,
"grad_norm": 0.0749779149015514,
"learning_rate": 2.5349598264985028e-05,
"loss": 0.2632,
"step": 664
},
{
"epoch": 3.292079207920792,
"grad_norm": 0.06928132458586024,
"learning_rate": 2.5221048334537952e-05,
"loss": 0.2621,
"step": 665
},
{
"epoch": 3.297029702970297,
"grad_norm": 0.07518881651499724,
"learning_rate": 2.5092674932730886e-05,
"loss": 0.2593,
"step": 666
},
{
"epoch": 3.301980198019802,
"grad_norm": 0.09171673550931758,
"learning_rate": 2.4964479592932574e-05,
"loss": 0.2601,
"step": 667
},
{
"epoch": 3.3069306930693068,
"grad_norm": 0.08658887819981398,
"learning_rate": 2.4836463846384832e-05,
"loss": 0.2652,
"step": 668
},
{
"epoch": 3.3118811881188117,
"grad_norm": 0.07581169369002726,
"learning_rate": 2.470862922218431e-05,
"loss": 0.2601,
"step": 669
},
{
"epoch": 3.3168316831683167,
"grad_norm": 0.08405531192143934,
"learning_rate": 2.4580977247264253e-05,
"loss": 0.2617,
"step": 670
},
{
"epoch": 3.3217821782178216,
"grad_norm": 0.08264190542819465,
"learning_rate": 2.4453509446376192e-05,
"loss": 0.2645,
"step": 671
},
{
"epoch": 3.3267326732673266,
"grad_norm": 0.0875835392802001,
"learning_rate": 2.432622734207182e-05,
"loss": 0.2606,
"step": 672
},
{
"epoch": 3.3316831683168315,
"grad_norm": 0.0777091868248858,
"learning_rate": 2.4199132454684736e-05,
"loss": 0.2635,
"step": 673
},
{
"epoch": 3.3366336633663365,
"grad_norm": 0.09147076046625585,
"learning_rate": 2.40722263023123e-05,
"loss": 0.2547,
"step": 674
},
{
"epoch": 3.3415841584158414,
"grad_norm": 0.0814714175342403,
"learning_rate": 2.3945510400797485e-05,
"loss": 0.2604,
"step": 675
},
{
"epoch": 3.3465346534653464,
"grad_norm": 0.08004074246147477,
"learning_rate": 2.3818986263710886e-05,
"loss": 0.263,
"step": 676
},
{
"epoch": 3.3514851485148514,
"grad_norm": 0.07584161785288035,
"learning_rate": 2.3692655402332455e-05,
"loss": 0.2594,
"step": 677
},
{
"epoch": 3.3564356435643563,
"grad_norm": 0.08695533626808738,
"learning_rate": 2.3566519325633567e-05,
"loss": 0.2601,
"step": 678
},
{
"epoch": 3.3613861386138613,
"grad_norm": 0.0791972098065313,
"learning_rate": 2.3440579540259006e-05,
"loss": 0.2615,
"step": 679
},
{
"epoch": 3.366336633663366,
"grad_norm": 0.08818611699036372,
"learning_rate": 2.3314837550508875e-05,
"loss": 0.2602,
"step": 680
},
{
"epoch": 3.371287128712871,
"grad_norm": 0.07757375545655479,
"learning_rate": 2.3189294858320768e-05,
"loss": 0.2609,
"step": 681
},
{
"epoch": 3.376237623762376,
"grad_norm": 0.08162773290629517,
"learning_rate": 2.3063952963251682e-05,
"loss": 0.259,
"step": 682
},
{
"epoch": 3.381188118811881,
"grad_norm": 0.0756857422551136,
"learning_rate": 2.2938813362460198e-05,
"loss": 0.2558,
"step": 683
},
{
"epoch": 3.386138613861386,
"grad_norm": 0.08379742666430028,
"learning_rate": 2.2813877550688553e-05,
"loss": 0.2643,
"step": 684
},
{
"epoch": 3.391089108910891,
"grad_norm": 0.08340597193070581,
"learning_rate": 2.2689147020244848e-05,
"loss": 0.2608,
"step": 685
},
{
"epoch": 3.396039603960396,
"grad_norm": 0.07863270852195665,
"learning_rate": 2.256462326098516e-05,
"loss": 0.2624,
"step": 686
},
{
"epoch": 3.400990099009901,
"grad_norm": 0.08196381945609525,
"learning_rate": 2.2440307760295755e-05,
"loss": 0.2616,
"step": 687
},
{
"epoch": 3.405940594059406,
"grad_norm": 0.08378698824706224,
"learning_rate": 2.2316202003075347e-05,
"loss": 0.262,
"step": 688
},
{
"epoch": 3.410891089108911,
"grad_norm": 0.07195893904281865,
"learning_rate": 2.2192307471717324e-05,
"loss": 0.2593,
"step": 689
},
{
"epoch": 3.4158415841584158,
"grad_norm": 0.08537050560423447,
"learning_rate": 2.2068625646092103e-05,
"loss": 0.2652,
"step": 690
},
{
"epoch": 3.4207920792079207,
"grad_norm": 0.07000150332485644,
"learning_rate": 2.194515800352942e-05,
"loss": 0.2598,
"step": 691
},
{
"epoch": 3.4257425742574257,
"grad_norm": 0.08681466720555077,
"learning_rate": 2.1821906018800643e-05,
"loss": 0.26,
"step": 692
},
{
"epoch": 3.4306930693069306,
"grad_norm": 0.07771534110147085,
"learning_rate": 2.169887116410121e-05,
"loss": 0.2632,
"step": 693
},
{
"epoch": 3.4356435643564356,
"grad_norm": 0.07591468547904767,
"learning_rate": 2.1576054909033014e-05,
"loss": 0.264,
"step": 694
},
{
"epoch": 3.4405940594059405,
"grad_norm": 0.08082768131651101,
"learning_rate": 2.1453458720586902e-05,
"loss": 0.2648,
"step": 695
},
{
"epoch": 3.4455445544554455,
"grad_norm": 0.08123463224039203,
"learning_rate": 2.13310840631251e-05,
"loss": 0.2616,
"step": 696
},
{
"epoch": 3.4504950495049505,
"grad_norm": 0.08379480947484824,
"learning_rate": 2.1208932398363712e-05,
"loss": 0.2604,
"step": 697
},
{
"epoch": 3.4554455445544554,
"grad_norm": 0.07509673804422137,
"learning_rate": 2.1087005185355292e-05,
"loss": 0.2623,
"step": 698
},
{
"epoch": 3.4603960396039604,
"grad_norm": 0.0858048782262789,
"learning_rate": 2.0965303880471405e-05,
"loss": 0.267,
"step": 699
},
{
"epoch": 3.4653465346534653,
"grad_norm": 0.07547640130167284,
"learning_rate": 2.0843829937385255e-05,
"loss": 0.2626,
"step": 700
},
{
"epoch": 3.4702970297029703,
"grad_norm": 0.07350883425204456,
"learning_rate": 2.072258480705431e-05,
"loss": 0.261,
"step": 701
},
{
"epoch": 3.4752475247524752,
"grad_norm": 0.07191893655858747,
"learning_rate": 2.0601569937702913e-05,
"loss": 0.2622,
"step": 702
},
{
"epoch": 3.48019801980198,
"grad_norm": 0.07616081022762551,
"learning_rate": 2.048078677480507e-05,
"loss": 0.2606,
"step": 703
},
{
"epoch": 3.485148514851485,
"grad_norm": 0.06984625689042373,
"learning_rate": 2.0360236761067117e-05,
"loss": 0.2587,
"step": 704
},
{
"epoch": 3.49009900990099,
"grad_norm": 0.08028605456370366,
"learning_rate": 2.023992133641055e-05,
"loss": 0.2651,
"step": 705
},
{
"epoch": 3.495049504950495,
"grad_norm": 0.07388138739910662,
"learning_rate": 2.0119841937954794e-05,
"loss": 0.2657,
"step": 706
},
{
"epoch": 3.5,
"grad_norm": 0.07223380158067759,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.2629,
"step": 707
},
{
"epoch": 3.504950495049505,
"grad_norm": 0.07459203599924807,
"learning_rate": 1.9880396954009976e-05,
"loss": 0.2663,
"step": 708
},
{
"epoch": 3.50990099009901,
"grad_norm": 0.07581551013145865,
"learning_rate": 1.976103422859506e-05,
"loss": 0.2629,
"step": 709
},
{
"epoch": 3.514851485148515,
"grad_norm": 0.06686751894402097,
"learning_rate": 1.9641913249495026e-05,
"loss": 0.2597,
"step": 710
},
{
"epoch": 3.51980198019802,
"grad_norm": 0.0736426132082484,
"learning_rate": 1.9523035439562146e-05,
"loss": 0.2588,
"step": 711
},
{
"epoch": 3.5247524752475248,
"grad_norm": 0.06871398394666604,
"learning_rate": 1.9404402218744086e-05,
"loss": 0.2618,
"step": 712
},
{
"epoch": 3.5297029702970297,
"grad_norm": 0.0763872712325709,
"learning_rate": 1.9286015004066984e-05,
"loss": 0.2635,
"step": 713
},
{
"epoch": 3.5346534653465347,
"grad_norm": 0.06539343146687637,
"learning_rate": 1.9167875209618592e-05,
"loss": 0.2603,
"step": 714
},
{
"epoch": 3.5396039603960396,
"grad_norm": 0.07878474772100631,
"learning_rate": 1.9049984246531255e-05,
"loss": 0.2637,
"step": 715
},
{
"epoch": 3.5445544554455446,
"grad_norm": 0.06871464146056458,
"learning_rate": 1.8932343522965205e-05,
"loss": 0.2611,
"step": 716
},
{
"epoch": 3.5495049504950495,
"grad_norm": 0.07766195448090221,
"learning_rate": 1.8814954444091595e-05,
"loss": 0.2629,
"step": 717
},
{
"epoch": 3.5544554455445545,
"grad_norm": 0.0684321915163134,
"learning_rate": 1.8697818412075794e-05,
"loss": 0.2602,
"step": 718
},
{
"epoch": 3.5594059405940595,
"grad_norm": 0.07986816367419101,
"learning_rate": 1.8580936826060685e-05,
"loss": 0.2622,
"step": 719
},
{
"epoch": 3.5643564356435644,
"grad_norm": 0.06371681601869503,
"learning_rate": 1.846431108214981e-05,
"loss": 0.2616,
"step": 720
},
{
"epoch": 3.5693069306930694,
"grad_norm": 0.0777995648043413,
"learning_rate": 1.8347942573390865e-05,
"loss": 0.2593,
"step": 721
},
{
"epoch": 3.5742574257425743,
"grad_norm": 0.06952441494079514,
"learning_rate": 1.8231832689758903e-05,
"loss": 0.2664,
"step": 722
},
{
"epoch": 3.5792079207920793,
"grad_norm": 0.0778247922778437,
"learning_rate": 1.8115982818139862e-05,
"loss": 0.263,
"step": 723
},
{
"epoch": 3.5841584158415842,
"grad_norm": 0.06754531987523885,
"learning_rate": 1.80003943423139e-05,
"loss": 0.2652,
"step": 724
},
{
"epoch": 3.589108910891089,
"grad_norm": 0.06763991069949353,
"learning_rate": 1.7885068642938924e-05,
"loss": 0.2647,
"step": 725
},
{
"epoch": 3.594059405940594,
"grad_norm": 0.06988041056890192,
"learning_rate": 1.7770007097534062e-05,
"loss": 0.2617,
"step": 726
},
{
"epoch": 3.599009900990099,
"grad_norm": 0.0675214500152565,
"learning_rate": 1.7655211080463265e-05,
"loss": 0.2601,
"step": 727
},
{
"epoch": 3.603960396039604,
"grad_norm": 0.0726640790012201,
"learning_rate": 1.754068196291885e-05,
"loss": 0.2624,
"step": 728
},
{
"epoch": 3.608910891089109,
"grad_norm": 0.06635841200881153,
"learning_rate": 1.7426421112905095e-05,
"loss": 0.2642,
"step": 729
},
{
"epoch": 3.613861386138614,
"grad_norm": 0.06321818867608649,
"learning_rate": 1.731242989522195e-05,
"loss": 0.2615,
"step": 730
},
{
"epoch": 3.618811881188119,
"grad_norm": 0.07039027608005301,
"learning_rate": 1.7198709671448696e-05,
"loss": 0.26,
"step": 731
},
{
"epoch": 3.623762376237624,
"grad_norm": 0.06583730825694595,
"learning_rate": 1.7085261799927738e-05,
"loss": 0.2626,
"step": 732
},
{
"epoch": 3.628712871287129,
"grad_norm": 0.06315117502780393,
"learning_rate": 1.697208763574833e-05,
"loss": 0.2604,
"step": 733
},
{
"epoch": 3.633663366336634,
"grad_norm": 0.0661718529079368,
"learning_rate": 1.6859188530730387e-05,
"loss": 0.2585,
"step": 734
},
{
"epoch": 3.6386138613861387,
"grad_norm": 0.06191307233046204,
"learning_rate": 1.6746565833408352e-05,
"loss": 0.2611,
"step": 735
},
{
"epoch": 3.6435643564356437,
"grad_norm": 0.06569352083114534,
"learning_rate": 1.6634220889015087e-05,
"loss": 0.2578,
"step": 736
},
{
"epoch": 3.6485148514851486,
"grad_norm": 0.06524584677846361,
"learning_rate": 1.652215503946583e-05,
"loss": 0.2591,
"step": 737
},
{
"epoch": 3.6534653465346536,
"grad_norm": 0.06882261641904054,
"learning_rate": 1.6410369623342144e-05,
"loss": 0.2621,
"step": 738
},
{
"epoch": 3.6584158415841586,
"grad_norm": 0.06599644920193304,
"learning_rate": 1.6298865975875903e-05,
"loss": 0.2621,
"step": 739
},
{
"epoch": 3.6633663366336635,
"grad_norm": 0.06494537997236631,
"learning_rate": 1.6187645428933372e-05,
"loss": 0.2576,
"step": 740
},
{
"epoch": 3.6683168316831685,
"grad_norm": 0.07191765354832381,
"learning_rate": 1.607670931099929e-05,
"loss": 0.2627,
"step": 741
},
{
"epoch": 3.6732673267326734,
"grad_norm": 0.06163702109997666,
"learning_rate": 1.5966058947161035e-05,
"loss": 0.2604,
"step": 742
},
{
"epoch": 3.6782178217821784,
"grad_norm": 0.06896827740527596,
"learning_rate": 1.5855695659092746e-05,
"loss": 0.2627,
"step": 743
},
{
"epoch": 3.6831683168316833,
"grad_norm": 0.07022993469113507,
"learning_rate": 1.5745620765039564e-05,
"loss": 0.2627,
"step": 744
},
{
"epoch": 3.6881188118811883,
"grad_norm": 0.06385564587911713,
"learning_rate": 1.563583557980186e-05,
"loss": 0.2571,
"step": 745
},
{
"epoch": 3.693069306930693,
"grad_norm": 0.06788789737493876,
"learning_rate": 1.5526341414719565e-05,
"loss": 0.2597,
"step": 746
},
{
"epoch": 3.698019801980198,
"grad_norm": 0.06355678554123764,
"learning_rate": 1.541713957765649e-05,
"loss": 0.2584,
"step": 747
},
{
"epoch": 3.7029702970297027,
"grad_norm": 0.06746069765280531,
"learning_rate": 1.5308231372984723e-05,
"loss": 0.2564,
"step": 748
},
{
"epoch": 3.707920792079208,
"grad_norm": 0.06602515183476947,
"learning_rate": 1.5199618101569003e-05,
"loss": 0.2618,
"step": 749
},
{
"epoch": 3.7128712871287126,
"grad_norm": 0.06261540013075592,
"learning_rate": 1.5091301060751207e-05,
"loss": 0.261,
"step": 750
},
{
"epoch": 3.717821782178218,
"grad_norm": 0.06738446233701077,
"learning_rate": 1.4983281544334896e-05,
"loss": 0.2615,
"step": 751
},
{
"epoch": 3.7227722772277225,
"grad_norm": 0.06387167102806292,
"learning_rate": 1.4875560842569767e-05,
"loss": 0.2628,
"step": 752
},
{
"epoch": 3.727722772277228,
"grad_norm": 0.06873548015112758,
"learning_rate": 1.4768140242136353e-05,
"loss": 0.2634,
"step": 753
},
{
"epoch": 3.7326732673267324,
"grad_norm": 0.07004440295456092,
"learning_rate": 1.4661021026130553e-05,
"loss": 0.2625,
"step": 754
},
{
"epoch": 3.737623762376238,
"grad_norm": 0.06913752523862327,
"learning_rate": 1.4554204474048357e-05,
"loss": 0.2603,
"step": 755
},
{
"epoch": 3.7425742574257423,
"grad_norm": 0.0648189635402994,
"learning_rate": 1.4447691861770591e-05,
"loss": 0.2598,
"step": 756
},
{
"epoch": 3.7475247524752477,
"grad_norm": 0.06865893360130693,
"learning_rate": 1.4341484461547585e-05,
"loss": 0.2621,
"step": 757
},
{
"epoch": 3.7524752475247523,
"grad_norm": 0.062626615368903,
"learning_rate": 1.4235583541984092e-05,
"loss": 0.2601,
"step": 758
},
{
"epoch": 3.7574257425742577,
"grad_norm": 0.06552809258055384,
"learning_rate": 1.412999036802404e-05,
"loss": 0.263,
"step": 759
},
{
"epoch": 3.762376237623762,
"grad_norm": 0.06568546114262737,
"learning_rate": 1.4024706200935452e-05,
"loss": 0.2612,
"step": 760
},
{
"epoch": 3.7673267326732676,
"grad_norm": 0.0669186021015185,
"learning_rate": 1.3919732298295431e-05,
"loss": 0.2596,
"step": 761
},
{
"epoch": 3.772277227722772,
"grad_norm": 0.0670319250412692,
"learning_rate": 1.3815069913975045e-05,
"loss": 0.2636,
"step": 762
},
{
"epoch": 3.7772277227722775,
"grad_norm": 0.06654340061509974,
"learning_rate": 1.3710720298124454e-05,
"loss": 0.256,
"step": 763
},
{
"epoch": 3.782178217821782,
"grad_norm": 0.06341792514467096,
"learning_rate": 1.3606684697157876e-05,
"loss": 0.2611,
"step": 764
},
{
"epoch": 3.7871287128712874,
"grad_norm": 0.06468155266586884,
"learning_rate": 1.350296435373876e-05,
"loss": 0.2614,
"step": 765
},
{
"epoch": 3.792079207920792,
"grad_norm": 0.06163612657643488,
"learning_rate": 1.3399560506764959e-05,
"loss": 0.2629,
"step": 766
},
{
"epoch": 3.7970297029702973,
"grad_norm": 0.06406891206439465,
"learning_rate": 1.3296474391353854e-05,
"loss": 0.2576,
"step": 767
},
{
"epoch": 3.801980198019802,
"grad_norm": 0.056663417800829424,
"learning_rate": 1.3193707238827714e-05,
"loss": 0.2562,
"step": 768
},
{
"epoch": 3.806930693069307,
"grad_norm": 0.06181580783140687,
"learning_rate": 1.3091260276698847e-05,
"loss": 0.2601,
"step": 769
},
{
"epoch": 3.8118811881188117,
"grad_norm": 0.05861519134108039,
"learning_rate": 1.2989134728655097e-05,
"loss": 0.261,
"step": 770
},
{
"epoch": 3.8168316831683167,
"grad_norm": 0.06567706503132943,
"learning_rate": 1.288733181454508e-05,
"loss": 0.2632,
"step": 771
},
{
"epoch": 3.8217821782178216,
"grad_norm": 0.06312183468335963,
"learning_rate": 1.2785852750363716e-05,
"loss": 0.2604,
"step": 772
},
{
"epoch": 3.8267326732673266,
"grad_norm": 0.06579516315731176,
"learning_rate": 1.2684698748237633e-05,
"loss": 0.2615,
"step": 773
},
{
"epoch": 3.8316831683168315,
"grad_norm": 0.05991919806623056,
"learning_rate": 1.2583871016410764e-05,
"loss": 0.2593,
"step": 774
},
{
"epoch": 3.8366336633663365,
"grad_norm": 0.06334787351057825,
"learning_rate": 1.2483370759229874e-05,
"loss": 0.2577,
"step": 775
},
{
"epoch": 3.8415841584158414,
"grad_norm": 0.07090959132648503,
"learning_rate": 1.2383199177130135e-05,
"loss": 0.2623,
"step": 776
},
{
"epoch": 3.8465346534653464,
"grad_norm": 0.0603300648619507,
"learning_rate": 1.228335746662086e-05,
"loss": 0.2642,
"step": 777
},
{
"epoch": 3.8514851485148514,
"grad_norm": 0.06486787854340496,
"learning_rate": 1.2183846820271147e-05,
"loss": 0.2649,
"step": 778
},
{
"epoch": 3.8564356435643563,
"grad_norm": 0.06511664568299079,
"learning_rate": 1.2084668426695712e-05,
"loss": 0.261,
"step": 779
},
{
"epoch": 3.8613861386138613,
"grad_norm": 0.06182611155071836,
"learning_rate": 1.198582347054062e-05,
"loss": 0.2649,
"step": 780
},
{
"epoch": 3.866336633663366,
"grad_norm": 0.06327929862771448,
"learning_rate": 1.1887313132469154e-05,
"loss": 0.265,
"step": 781
},
{
"epoch": 3.871287128712871,
"grad_norm": 0.06059354451174003,
"learning_rate": 1.178913858914772e-05,
"loss": 0.2585,
"step": 782
},
{
"epoch": 3.876237623762376,
"grad_norm": 0.062002888382052625,
"learning_rate": 1.1691301013231788e-05,
"loss": 0.2618,
"step": 783
},
{
"epoch": 3.881188118811881,
"grad_norm": 0.05662432666476744,
"learning_rate": 1.1593801573351908e-05,
"loss": 0.2624,
"step": 784
},
{
"epoch": 3.886138613861386,
"grad_norm": 0.05580277635060235,
"learning_rate": 1.1496641434099725e-05,
"loss": 0.2628,
"step": 785
},
{
"epoch": 3.891089108910891,
"grad_norm": 0.057711008207046964,
"learning_rate": 1.1399821756014058e-05,
"loss": 0.2605,
"step": 786
},
{
"epoch": 3.896039603960396,
"grad_norm": 0.05611620119762601,
"learning_rate": 1.1303343695567066e-05,
"loss": 0.2619,
"step": 787
},
{
"epoch": 3.900990099009901,
"grad_norm": 0.06680065656601918,
"learning_rate": 1.1207208405150397e-05,
"loss": 0.2639,
"step": 788
},
{
"epoch": 3.905940594059406,
"grad_norm": 0.060466239287352584,
"learning_rate": 1.1111417033061498e-05,
"loss": 0.2637,
"step": 789
},
{
"epoch": 3.910891089108911,
"grad_norm": 0.053378514763086894,
"learning_rate": 1.1015970723489828e-05,
"loss": 0.2565,
"step": 790
},
{
"epoch": 3.9158415841584158,
"grad_norm": 0.05813131762071988,
"learning_rate": 1.0920870616503194e-05,
"loss": 0.2595,
"step": 791
},
{
"epoch": 3.9207920792079207,
"grad_norm": 0.06355098231546671,
"learning_rate": 1.082611784803417e-05,
"loss": 0.2651,
"step": 792
},
{
"epoch": 3.9257425742574257,
"grad_norm": 0.05915544530618073,
"learning_rate": 1.0731713549866494e-05,
"loss": 0.2616,
"step": 793
},
{
"epoch": 3.9306930693069306,
"grad_norm": 0.05511489007583624,
"learning_rate": 1.0637658849621593e-05,
"loss": 0.2549,
"step": 794
},
{
"epoch": 3.9356435643564356,
"grad_norm": 0.056384857237651013,
"learning_rate": 1.0543954870745088e-05,
"loss": 0.2625,
"step": 795
},
{
"epoch": 3.9405940594059405,
"grad_norm": 0.05635325945291314,
"learning_rate": 1.0450602732493337e-05,
"loss": 0.2608,
"step": 796
},
{
"epoch": 3.9455445544554455,
"grad_norm": 0.057238822301882146,
"learning_rate": 1.0357603549920129e-05,
"loss": 0.2564,
"step": 797
},
{
"epoch": 3.9504950495049505,
"grad_norm": 0.06049015880310532,
"learning_rate": 1.0264958433863353e-05,
"loss": 0.2626,
"step": 798
},
{
"epoch": 3.9554455445544554,
"grad_norm": 0.05650257691466007,
"learning_rate": 1.0172668490931673e-05,
"loss": 0.2576,
"step": 799
},
{
"epoch": 3.9603960396039604,
"grad_norm": 0.057402085639922246,
"learning_rate": 1.0080734823491402e-05,
"loss": 0.2608,
"step": 800
},
{
"epoch": 3.9653465346534653,
"grad_norm": 0.057615806962218595,
"learning_rate": 9.989158529653257e-06,
"loss": 0.2621,
"step": 801
},
{
"epoch": 3.9702970297029703,
"grad_norm": 0.06058933676488647,
"learning_rate": 9.897940703259264e-06,
"loss": 0.2658,
"step": 802
},
{
"epoch": 3.9752475247524752,
"grad_norm": 0.05448946637229122,
"learning_rate": 9.807082433869727e-06,
"loss": 0.263,
"step": 803
},
{
"epoch": 3.98019801980198,
"grad_norm": 0.055961400533007126,
"learning_rate": 9.716584806750151e-06,
"loss": 0.26,
"step": 804
},
{
"epoch": 3.985148514851485,
"grad_norm": 0.06328918241686664,
"learning_rate": 9.626448902858359e-06,
"loss": 0.2596,
"step": 805
},
{
"epoch": 3.99009900990099,
"grad_norm": 0.05626164158686504,
"learning_rate": 9.536675798831499e-06,
"loss": 0.2605,
"step": 806
},
{
"epoch": 3.995049504950495,
"grad_norm": 0.06311974353501029,
"learning_rate": 9.447266566973211e-06,
"loss": 0.26,
"step": 807
},
{
"epoch": 4.0,
"grad_norm": 0.0657005829617962,
"learning_rate": 9.358222275240884e-06,
"loss": 0.2563,
"step": 808
},
{
"epoch": 4.0049504950495045,
"grad_norm": 0.14833670477704655,
"learning_rate": 9.26954398723278e-06,
"loss": 0.2415,
"step": 809
},
{
"epoch": 4.00990099009901,
"grad_norm": 0.09738865646581973,
"learning_rate": 9.181232762175435e-06,
"loss": 0.2363,
"step": 810
},
{
"epoch": 4.014851485148514,
"grad_norm": 0.09300244076891609,
"learning_rate": 9.093289654910946e-06,
"loss": 0.2367,
"step": 811
},
{
"epoch": 4.01980198019802,
"grad_norm": 0.13994293803359162,
"learning_rate": 9.005715715884409e-06,
"loss": 0.2366,
"step": 812
},
{
"epoch": 4.024752475247524,
"grad_norm": 0.11618540992533626,
"learning_rate": 8.918511991131335e-06,
"loss": 0.2371,
"step": 813
},
{
"epoch": 4.02970297029703,
"grad_norm": 0.11676112687203334,
"learning_rate": 8.831679522265167e-06,
"loss": 0.2373,
"step": 814
},
{
"epoch": 4.034653465346534,
"grad_norm": 0.11368974223103682,
"learning_rate": 8.745219346464884e-06,
"loss": 0.2398,
"step": 815
},
{
"epoch": 4.03960396039604,
"grad_norm": 0.10609372237214311,
"learning_rate": 8.659132496462521e-06,
"loss": 0.2389,
"step": 816
},
{
"epoch": 4.044554455445544,
"grad_norm": 0.10199378246411407,
"learning_rate": 8.57342000053095e-06,
"loss": 0.2369,
"step": 817
},
{
"epoch": 4.0495049504950495,
"grad_norm": 0.10641728987433756,
"learning_rate": 8.488082882471476e-06,
"loss": 0.2376,
"step": 818
},
{
"epoch": 4.054455445544554,
"grad_norm": 0.0948066652739402,
"learning_rate": 8.403122161601699e-06,
"loss": 0.2382,
"step": 819
},
{
"epoch": 4.0594059405940595,
"grad_norm": 0.09444767450020332,
"learning_rate": 8.318538852743275e-06,
"loss": 0.2413,
"step": 820
},
{
"epoch": 4.064356435643564,
"grad_norm": 0.09281522330484215,
"learning_rate": 8.23433396620986e-06,
"loss": 0.2357,
"step": 821
},
{
"epoch": 4.069306930693069,
"grad_norm": 0.08359912220801992,
"learning_rate": 8.150508507795005e-06,
"loss": 0.2397,
"step": 822
},
{
"epoch": 4.074257425742574,
"grad_norm": 0.0826517841789896,
"learning_rate": 8.067063478760127e-06,
"loss": 0.2394,
"step": 823
},
{
"epoch": 4.079207920792079,
"grad_norm": 0.07579285661758814,
"learning_rate": 7.983999875822563e-06,
"loss": 0.2351,
"step": 824
},
{
"epoch": 4.084158415841584,
"grad_norm": 0.07850432213332705,
"learning_rate": 7.901318691143678e-06,
"loss": 0.2403,
"step": 825
},
{
"epoch": 4.089108910891089,
"grad_norm": 0.07909754150897867,
"learning_rate": 7.819020912317011e-06,
"loss": 0.2387,
"step": 826
},
{
"epoch": 4.094059405940594,
"grad_norm": 0.06821060953413947,
"learning_rate": 7.73710752235647e-06,
"loss": 0.2372,
"step": 827
},
{
"epoch": 4.099009900990099,
"grad_norm": 0.07023999741301488,
"learning_rate": 7.65557949968459e-06,
"loss": 0.2402,
"step": 828
},
{
"epoch": 4.103960396039604,
"grad_norm": 0.07435090331777651,
"learning_rate": 7.574437818120839e-06,
"loss": 0.2338,
"step": 829
},
{
"epoch": 4.108910891089109,
"grad_norm": 0.07224904309089736,
"learning_rate": 7.4936834468699945e-06,
"loss": 0.2387,
"step": 830
},
{
"epoch": 4.1138613861386135,
"grad_norm": 0.06369678683068347,
"learning_rate": 7.413317350510589e-06,
"loss": 0.2367,
"step": 831
},
{
"epoch": 4.118811881188119,
"grad_norm": 0.06699315639457563,
"learning_rate": 7.333340488983363e-06,
"loss": 0.2375,
"step": 832
},
{
"epoch": 4.123762376237623,
"grad_norm": 0.06690201833514393,
"learning_rate": 7.253753817579792e-06,
"loss": 0.2369,
"step": 833
},
{
"epoch": 4.128712871287129,
"grad_norm": 0.06523178059708698,
"learning_rate": 7.174558286930682e-06,
"loss": 0.2353,
"step": 834
},
{
"epoch": 4.133663366336633,
"grad_norm": 0.06928338209332009,
"learning_rate": 7.095754842994824e-06,
"loss": 0.241,
"step": 835
},
{
"epoch": 4.138613861386139,
"grad_norm": 0.05826199727329262,
"learning_rate": 7.0173444270477075e-06,
"loss": 0.237,
"step": 836
},
{
"epoch": 4.143564356435643,
"grad_norm": 0.0650338593796366,
"learning_rate": 6.939327975670256e-06,
"loss": 0.2389,
"step": 837
},
{
"epoch": 4.148514851485149,
"grad_norm": 0.062373371937288424,
"learning_rate": 6.861706420737628e-06,
"loss": 0.235,
"step": 838
},
{
"epoch": 4.153465346534653,
"grad_norm": 0.059345606049629886,
"learning_rate": 6.784480689408099e-06,
"loss": 0.2374,
"step": 839
},
{
"epoch": 4.158415841584159,
"grad_norm": 0.05938411361311235,
"learning_rate": 6.707651704112028e-06,
"loss": 0.2394,
"step": 840
},
{
"epoch": 4.163366336633663,
"grad_norm": 0.05507804095810438,
"learning_rate": 6.631220382540755e-06,
"loss": 0.2379,
"step": 841
},
{
"epoch": 4.1683168316831685,
"grad_norm": 0.05676343569911684,
"learning_rate": 6.555187637635727e-06,
"loss": 0.2387,
"step": 842
},
{
"epoch": 4.173267326732673,
"grad_norm": 0.05738778495833525,
"learning_rate": 6.479554377577528e-06,
"loss": 0.2379,
"step": 843
},
{
"epoch": 4.178217821782178,
"grad_norm": 0.05800503879065267,
"learning_rate": 6.404321505775053e-06,
"loss": 0.2367,
"step": 844
},
{
"epoch": 4.183168316831683,
"grad_norm": 0.055483651786811077,
"learning_rate": 6.329489920854745e-06,
"loss": 0.2385,
"step": 845
},
{
"epoch": 4.188118811881188,
"grad_norm": 0.05477230373977264,
"learning_rate": 6.255060516649809e-06,
"loss": 0.239,
"step": 846
},
{
"epoch": 4.193069306930693,
"grad_norm": 0.051715088356370745,
"learning_rate": 6.181034182189592e-06,
"loss": 0.2429,
"step": 847
},
{
"epoch": 4.198019801980198,
"grad_norm": 0.05515414502968798,
"learning_rate": 6.107411801688905e-06,
"loss": 0.2379,
"step": 848
},
{
"epoch": 4.202970297029703,
"grad_norm": 0.05507096068990231,
"learning_rate": 6.034194254537502e-06,
"loss": 0.2355,
"step": 849
},
{
"epoch": 4.207920792079208,
"grad_norm": 0.053287860897235355,
"learning_rate": 5.9613824152895765e-06,
"loss": 0.2396,
"step": 850
},
{
"epoch": 4.212871287128713,
"grad_norm": 0.0560101802810022,
"learning_rate": 5.8889771536532855e-06,
"loss": 0.2368,
"step": 851
},
{
"epoch": 4.217821782178218,
"grad_norm": 0.051457826913242195,
"learning_rate": 5.8169793344804085e-06,
"loss": 0.2408,
"step": 852
},
{
"epoch": 4.2227722772277225,
"grad_norm": 0.05428425450968013,
"learning_rate": 5.7453898177559505e-06,
"loss": 0.2355,
"step": 853
},
{
"epoch": 4.227722772277228,
"grad_norm": 0.05221957619974725,
"learning_rate": 5.674209458587929e-06,
"loss": 0.2369,
"step": 854
},
{
"epoch": 4.232673267326732,
"grad_norm": 0.05124594198330728,
"learning_rate": 5.603439107197149e-06,
"loss": 0.2399,
"step": 855
},
{
"epoch": 4.237623762376238,
"grad_norm": 0.051046117213285905,
"learning_rate": 5.5330796089070064e-06,
"loss": 0.2391,
"step": 856
},
{
"epoch": 4.242574257425742,
"grad_norm": 0.052304073144293355,
"learning_rate": 5.463131804133461e-06,
"loss": 0.2374,
"step": 857
},
{
"epoch": 4.247524752475248,
"grad_norm": 0.05229718260879773,
"learning_rate": 5.393596528374923e-06,
"loss": 0.2377,
"step": 858
},
{
"epoch": 4.252475247524752,
"grad_norm": 0.051704981010214965,
"learning_rate": 5.324474612202335e-06,
"loss": 0.2386,
"step": 859
},
{
"epoch": 4.257425742574258,
"grad_norm": 0.04980569031859926,
"learning_rate": 5.255766881249212e-06,
"loss": 0.2382,
"step": 860
},
{
"epoch": 4.262376237623762,
"grad_norm": 0.056633565485021506,
"learning_rate": 5.187474156201786e-06,
"loss": 0.2358,
"step": 861
},
{
"epoch": 4.267326732673268,
"grad_norm": 0.05196973815132134,
"learning_rate": 5.119597252789237e-06,
"loss": 0.2353,
"step": 862
},
{
"epoch": 4.272277227722772,
"grad_norm": 0.054561533142327555,
"learning_rate": 5.052136981773892e-06,
"loss": 0.2379,
"step": 863
},
{
"epoch": 4.2772277227722775,
"grad_norm": 0.049673991948889856,
"learning_rate": 4.9850941489415985e-06,
"loss": 0.2404,
"step": 864
},
{
"epoch": 4.282178217821782,
"grad_norm": 0.052903140339615774,
"learning_rate": 4.918469555092049e-06,
"loss": 0.2383,
"step": 865
},
{
"epoch": 4.287128712871287,
"grad_norm": 0.05190724610448354,
"learning_rate": 4.852263996029259e-06,
"loss": 0.2357,
"step": 866
},
{
"epoch": 4.292079207920792,
"grad_norm": 0.04823404745690664,
"learning_rate": 4.786478262552012e-06,
"loss": 0.2347,
"step": 867
},
{
"epoch": 4.297029702970297,
"grad_norm": 0.05001330463963647,
"learning_rate": 4.7211131404444825e-06,
"loss": 0.2364,
"step": 868
},
{
"epoch": 4.301980198019802,
"grad_norm": 0.053915613343368925,
"learning_rate": 4.656169410466795e-06,
"loss": 0.2395,
"step": 869
},
{
"epoch": 4.306930693069307,
"grad_norm": 0.0521840158401366,
"learning_rate": 4.591647848345711e-06,
"loss": 0.2398,
"step": 870
},
{
"epoch": 4.311881188118812,
"grad_norm": 0.05092405950712264,
"learning_rate": 4.527549224765362e-06,
"loss": 0.2363,
"step": 871
},
{
"epoch": 4.316831683168317,
"grad_norm": 0.04856918141084432,
"learning_rate": 4.463874305358045e-06,
"loss": 0.2398,
"step": 872
},
{
"epoch": 4.321782178217822,
"grad_norm": 0.05172447677120079,
"learning_rate": 4.400623850695103e-06,
"loss": 0.2396,
"step": 873
},
{
"epoch": 4.326732673267327,
"grad_norm": 0.05355917732905732,
"learning_rate": 4.337798616277806e-06,
"loss": 0.2385,
"step": 874
},
{
"epoch": 4.3316831683168315,
"grad_norm": 0.051686520791098325,
"learning_rate": 4.275399352528342e-06,
"loss": 0.2394,
"step": 875
},
{
"epoch": 4.336633663366337,
"grad_norm": 0.05171279734424735,
"learning_rate": 4.213426804780838e-06,
"loss": 0.237,
"step": 876
},
{
"epoch": 4.341584158415841,
"grad_norm": 0.05150346337682817,
"learning_rate": 4.151881713272472e-06,
"loss": 0.239,
"step": 877
},
{
"epoch": 4.346534653465347,
"grad_norm": 0.050892722418566426,
"learning_rate": 4.090764813134644e-06,
"loss": 0.2416,
"step": 878
},
{
"epoch": 4.351485148514851,
"grad_norm": 0.049350733100512516,
"learning_rate": 4.0300768343841805e-06,
"loss": 0.2382,
"step": 879
},
{
"epoch": 4.356435643564357,
"grad_norm": 0.04931095750384826,
"learning_rate": 3.969818501914597e-06,
"loss": 0.2366,
"step": 880
},
{
"epoch": 4.361386138613861,
"grad_norm": 0.050924430998417654,
"learning_rate": 3.909990535487472e-06,
"loss": 0.237,
"step": 881
},
{
"epoch": 4.366336633663367,
"grad_norm": 0.049863362448873925,
"learning_rate": 3.850593649723804e-06,
"loss": 0.2398,
"step": 882
},
{
"epoch": 4.371287128712871,
"grad_norm": 0.04927463375224769,
"learning_rate": 3.7916285540955566e-06,
"loss": 0.2418,
"step": 883
},
{
"epoch": 4.376237623762377,
"grad_norm": 0.0482570661660394,
"learning_rate": 3.733095952917101e-06,
"loss": 0.2372,
"step": 884
},
{
"epoch": 4.381188118811881,
"grad_norm": 0.049192512272021933,
"learning_rate": 3.6749965453368375e-06,
"loss": 0.2364,
"step": 885
},
{
"epoch": 4.3861386138613865,
"grad_norm": 0.04845689086339911,
"learning_rate": 3.617331025328845e-06,
"loss": 0.2361,
"step": 886
},
{
"epoch": 4.391089108910891,
"grad_norm": 0.0502670225409252,
"learning_rate": 3.5601000816846053e-06,
"loss": 0.2372,
"step": 887
},
{
"epoch": 4.396039603960396,
"grad_norm": 0.04808200816842586,
"learning_rate": 3.50330439800473e-06,
"loss": 0.2384,
"step": 888
},
{
"epoch": 4.400990099009901,
"grad_norm": 0.05042628809150087,
"learning_rate": 3.4469446526908555e-06,
"loss": 0.2402,
"step": 889
},
{
"epoch": 4.405940594059406,
"grad_norm": 0.046037281684570205,
"learning_rate": 3.3910215189374916e-06,
"loss": 0.2404,
"step": 890
},
{
"epoch": 4.410891089108911,
"grad_norm": 0.0486971701984787,
"learning_rate": 3.3355356647239987e-06,
"loss": 0.2414,
"step": 891
},
{
"epoch": 4.415841584158416,
"grad_norm": 0.04880396827143136,
"learning_rate": 3.2804877528066225e-06,
"loss": 0.2383,
"step": 892
},
{
"epoch": 4.420792079207921,
"grad_norm": 0.04893364723038564,
"learning_rate": 3.225878440710544e-06,
"loss": 0.2408,
"step": 893
},
{
"epoch": 4.425742574257426,
"grad_norm": 0.047864235721432204,
"learning_rate": 3.171708380722072e-06,
"loss": 0.2375,
"step": 894
},
{
"epoch": 4.430693069306931,
"grad_norm": 0.050004565893982035,
"learning_rate": 3.1179782198807973e-06,
"loss": 0.2355,
"step": 895
},
{
"epoch": 4.435643564356436,
"grad_norm": 0.05153700056471179,
"learning_rate": 3.064688599971901e-06,
"loss": 0.2377,
"step": 896
},
{
"epoch": 4.4405940594059405,
"grad_norm": 0.0488630526140388,
"learning_rate": 3.011840157518493e-06,
"loss": 0.2376,
"step": 897
},
{
"epoch": 4.445544554455446,
"grad_norm": 0.04793270329958219,
"learning_rate": 2.9594335237739778e-06,
"loss": 0.24,
"step": 898
},
{
"epoch": 4.4504950495049505,
"grad_norm": 0.045205252044246934,
"learning_rate": 2.9074693247145513e-06,
"loss": 0.2369,
"step": 899
},
{
"epoch": 4.455445544554456,
"grad_norm": 0.046629357363959435,
"learning_rate": 2.85594818103168e-06,
"loss": 0.2345,
"step": 900
},
{
"epoch": 4.46039603960396,
"grad_norm": 0.04689830172030681,
"learning_rate": 2.804870708124745e-06,
"loss": 0.2366,
"step": 901
},
{
"epoch": 4.465346534653466,
"grad_norm": 0.05083578293715084,
"learning_rate": 2.754237516093623e-06,
"loss": 0.2375,
"step": 902
},
{
"epoch": 4.47029702970297,
"grad_norm": 0.04857771410366898,
"learning_rate": 2.7040492097314498e-06,
"loss": 0.2405,
"step": 903
},
{
"epoch": 4.475247524752476,
"grad_norm": 0.04634397869065822,
"learning_rate": 2.6543063885173936e-06,
"loss": 0.2374,
"step": 904
},
{
"epoch": 4.48019801980198,
"grad_norm": 0.047891106793236694,
"learning_rate": 2.605009646609453e-06,
"loss": 0.2387,
"step": 905
},
{
"epoch": 4.485148514851485,
"grad_norm": 0.048117090467841135,
"learning_rate": 2.556159572837422e-06,
"loss": 0.2415,
"step": 906
},
{
"epoch": 4.49009900990099,
"grad_norm": 0.04865265129692607,
"learning_rate": 2.5077567506957977e-06,
"loss": 0.2362,
"step": 907
},
{
"epoch": 4.4950495049504955,
"grad_norm": 0.04599413264021086,
"learning_rate": 2.459801758336835e-06,
"loss": 0.2372,
"step": 908
},
{
"epoch": 4.5,
"grad_norm": 0.048222084438524646,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.2405,
"step": 909
},
{
"epoch": 4.5049504950495045,
"grad_norm": 0.04704432338353155,
"learning_rate": 2.3652375488234114e-06,
"loss": 0.2391,
"step": 910
},
{
"epoch": 4.50990099009901,
"grad_norm": 0.04745475969682847,
"learning_rate": 2.3186294612004365e-06,
"loss": 0.2395,
"step": 911
},
{
"epoch": 4.514851485148515,
"grad_norm": 0.04862261011896364,
"learning_rate": 2.272471462409622e-06,
"loss": 0.2409,
"step": 912
},
{
"epoch": 4.51980198019802,
"grad_norm": 0.046907173224887085,
"learning_rate": 2.226764103789716e-06,
"loss": 0.2389,
"step": 913
},
{
"epoch": 4.524752475247524,
"grad_norm": 0.04625726359398786,
"learning_rate": 2.181507931296749e-06,
"loss": 0.2409,
"step": 914
},
{
"epoch": 4.52970297029703,
"grad_norm": 0.048230191171381366,
"learning_rate": 2.136703485497531e-06,
"loss": 0.2376,
"step": 915
},
{
"epoch": 4.534653465346535,
"grad_norm": 0.0473264881125143,
"learning_rate": 2.0923513015631646e-06,
"loss": 0.2351,
"step": 916
},
{
"epoch": 4.53960396039604,
"grad_norm": 0.04590922745948315,
"learning_rate": 2.0484519092626652e-06,
"loss": 0.2395,
"step": 917
},
{
"epoch": 4.544554455445544,
"grad_norm": 0.048130979836544845,
"learning_rate": 2.0050058329566367e-06,
"loss": 0.2419,
"step": 918
},
{
"epoch": 4.5495049504950495,
"grad_norm": 0.04562632269161316,
"learning_rate": 1.9620135915909968e-06,
"loss": 0.2364,
"step": 919
},
{
"epoch": 4.554455445544555,
"grad_norm": 0.0476088198793673,
"learning_rate": 1.9194756986908025e-06,
"loss": 0.2391,
"step": 920
},
{
"epoch": 4.5594059405940595,
"grad_norm": 0.04716082029677215,
"learning_rate": 1.8773926623541028e-06,
"loss": 0.2374,
"step": 921
},
{
"epoch": 4.564356435643564,
"grad_norm": 0.04742904413065106,
"learning_rate": 1.835764985245856e-06,
"loss": 0.2394,
"step": 922
},
{
"epoch": 4.569306930693069,
"grad_norm": 0.04569300920088693,
"learning_rate": 1.7945931645919358e-06,
"loss": 0.2358,
"step": 923
},
{
"epoch": 4.574257425742574,
"grad_norm": 0.04518374268437982,
"learning_rate": 1.7538776921731937e-06,
"loss": 0.2413,
"step": 924
},
{
"epoch": 4.579207920792079,
"grad_norm": 0.04609139030601693,
"learning_rate": 1.713619054319593e-06,
"loss": 0.2392,
"step": 925
},
{
"epoch": 4.584158415841584,
"grad_norm": 0.04817915866130937,
"learning_rate": 1.6738177319044036e-06,
"loss": 0.2375,
"step": 926
},
{
"epoch": 4.589108910891089,
"grad_norm": 0.04459736481607962,
"learning_rate": 1.6344742003384161e-06,
"loss": 0.2362,
"step": 927
},
{
"epoch": 4.594059405940594,
"grad_norm": 0.0449026355617313,
"learning_rate": 1.5955889295643111e-06,
"loss": 0.2377,
"step": 928
},
{
"epoch": 4.599009900990099,
"grad_norm": 0.04744519306239131,
"learning_rate": 1.5571623840510185e-06,
"loss": 0.2391,
"step": 929
},
{
"epoch": 4.603960396039604,
"grad_norm": 0.0451351470929595,
"learning_rate": 1.519195022788198e-06,
"loss": 0.2408,
"step": 930
},
{
"epoch": 4.608910891089109,
"grad_norm": 0.04561138995843172,
"learning_rate": 1.481687299280723e-06,
"loss": 0.242,
"step": 931
},
{
"epoch": 4.6138613861386135,
"grad_norm": 0.04424775857738886,
"learning_rate": 1.4446396615432855e-06,
"loss": 0.2384,
"step": 932
},
{
"epoch": 4.618811881188119,
"grad_norm": 0.04631851429293384,
"learning_rate": 1.4080525520950184e-06,
"loss": 0.2442,
"step": 933
},
{
"epoch": 4.623762376237623,
"grad_norm": 0.04514568203810665,
"learning_rate": 1.3719264079542628e-06,
"loss": 0.2369,
"step": 934
},
{
"epoch": 4.628712871287129,
"grad_norm": 0.04575984145611956,
"learning_rate": 1.33626166063328e-06,
"loss": 0.2381,
"step": 935
},
{
"epoch": 4.633663366336633,
"grad_norm": 0.04515268797675634,
"learning_rate": 1.3010587361331673e-06,
"loss": 0.242,
"step": 936
},
{
"epoch": 4.638613861386139,
"grad_norm": 0.045073799413081025,
"learning_rate": 1.2663180549387e-06,
"loss": 0.2375,
"step": 937
},
{
"epoch": 4.643564356435643,
"grad_norm": 0.04670702500193507,
"learning_rate": 1.2320400320133551e-06,
"loss": 0.239,
"step": 938
},
{
"epoch": 4.648514851485149,
"grad_norm": 0.04525298838401546,
"learning_rate": 1.1982250767943593e-06,
"loss": 0.2374,
"step": 939
},
{
"epoch": 4.653465346534653,
"grad_norm": 0.04414989478160124,
"learning_rate": 1.1648735931877543e-06,
"loss": 0.2399,
"step": 940
},
{
"epoch": 4.658415841584159,
"grad_norm": 0.04539179349644283,
"learning_rate": 1.131985979563619e-06,
"loss": 0.238,
"step": 941
},
{
"epoch": 4.663366336633663,
"grad_norm": 0.045565671775401856,
"learning_rate": 1.0995626287512828e-06,
"loss": 0.2382,
"step": 942
},
{
"epoch": 4.6683168316831685,
"grad_norm": 0.04662849759325202,
"learning_rate": 1.0676039280346439e-06,
"loss": 0.243,
"step": 943
},
{
"epoch": 4.673267326732673,
"grad_norm": 0.04464773083623712,
"learning_rate": 1.036110259147547e-06,
"loss": 0.2407,
"step": 944
},
{
"epoch": 4.678217821782178,
"grad_norm": 0.044465821345200274,
"learning_rate": 1.0050819982692083e-06,
"loss": 0.2388,
"step": 945
},
{
"epoch": 4.683168316831683,
"grad_norm": 0.0449944805033351,
"learning_rate": 9.745195160197452e-07,
"loss": 0.2373,
"step": 946
},
{
"epoch": 4.688118811881188,
"grad_norm": 0.04801372663982143,
"learning_rate": 9.444231774557199e-07,
"loss": 0.2396,
"step": 947
},
{
"epoch": 4.693069306930693,
"grad_norm": 0.04438811479086852,
"learning_rate": 9.147933420658117e-07,
"loss": 0.2389,
"step": 948
},
{
"epoch": 4.698019801980198,
"grad_norm": 0.044288975040482834,
"learning_rate": 8.856303637664987e-07,
"loss": 0.2369,
"step": 949
},
{
"epoch": 4.702970297029703,
"grad_norm": 0.046158842268618265,
"learning_rate": 8.569345908978355e-07,
"loss": 0.2387,
"step": 950
},
{
"epoch": 4.707920792079208,
"grad_norm": 0.045244039053761356,
"learning_rate": 8.287063662193095e-07,
"loss": 0.2426,
"step": 951
},
{
"epoch": 4.712871287128713,
"grad_norm": 0.04481365441294429,
"learning_rate": 8.009460269057156e-07,
"loss": 0.2386,
"step": 952
},
{
"epoch": 4.717821782178218,
"grad_norm": 0.04577841830144016,
"learning_rate": 7.736539045431634e-07,
"loss": 0.2415,
"step": 953
},
{
"epoch": 4.7227722772277225,
"grad_norm": 0.04408699692259636,
"learning_rate": 7.468303251250764e-07,
"loss": 0.2409,
"step": 954
},
{
"epoch": 4.727722772277228,
"grad_norm": 0.044862655648373294,
"learning_rate": 7.204756090483411e-07,
"loss": 0.2396,
"step": 955
},
{
"epoch": 4.732673267326732,
"grad_norm": 0.044900920206465036,
"learning_rate": 6.945900711094534e-07,
"loss": 0.2366,
"step": 956
},
{
"epoch": 4.737623762376238,
"grad_norm": 0.04644906035668882,
"learning_rate": 6.691740205007602e-07,
"loss": 0.2402,
"step": 957
},
{
"epoch": 4.742574257425742,
"grad_norm": 0.04499665814919747,
"learning_rate": 6.442277608067838e-07,
"loss": 0.2375,
"step": 958
},
{
"epoch": 4.747524752475248,
"grad_norm": 0.04333842509925558,
"learning_rate": 6.197515900005613e-07,
"loss": 0.238,
"step": 959
},
{
"epoch": 4.752475247524752,
"grad_norm": 0.04382161690247031,
"learning_rate": 5.957458004401328e-07,
"loss": 0.2401,
"step": 960
},
{
"epoch": 4.757425742574258,
"grad_norm": 0.044485838351754986,
"learning_rate": 5.722106788649928e-07,
"loss": 0.2372,
"step": 961
},
{
"epoch": 4.762376237623762,
"grad_norm": 0.0459814339270677,
"learning_rate": 5.491465063927282e-07,
"loss": 0.2384,
"step": 962
},
{
"epoch": 4.767326732673268,
"grad_norm": 0.04407284345840262,
"learning_rate": 5.265535585156079e-07,
"loss": 0.2397,
"step": 963
},
{
"epoch": 4.772277227722772,
"grad_norm": 0.04268369087826167,
"learning_rate": 5.044321050973189e-07,
"loss": 0.2428,
"step": 964
},
{
"epoch": 4.7772277227722775,
"grad_norm": 0.042380738168656576,
"learning_rate": 4.827824103697332e-07,
"loss": 0.2372,
"step": 965
},
{
"epoch": 4.782178217821782,
"grad_norm": 0.043861037556938605,
"learning_rate": 4.616047329297546e-07,
"loss": 0.241,
"step": 966
},
{
"epoch": 4.787128712871287,
"grad_norm": 0.04309315460955193,
"learning_rate": 4.408993257362282e-07,
"loss": 0.2367,
"step": 967
},
{
"epoch": 4.792079207920792,
"grad_norm": 0.043850264542277494,
"learning_rate": 4.206664361069379e-07,
"loss": 0.2406,
"step": 968
},
{
"epoch": 4.797029702970297,
"grad_norm": 0.042518962522091946,
"learning_rate": 4.0090630571560927e-07,
"loss": 0.2381,
"step": 969
},
{
"epoch": 4.801980198019802,
"grad_norm": 0.04280591327583044,
"learning_rate": 3.8161917058906706e-07,
"loss": 0.2362,
"step": 970
},
{
"epoch": 4.806930693069307,
"grad_norm": 0.04428828190691451,
"learning_rate": 3.628052611043842e-07,
"loss": 0.2388,
"step": 971
},
{
"epoch": 4.811881188118812,
"grad_norm": 0.04297748218450373,
"learning_rate": 3.444648019861552e-07,
"loss": 0.2356,
"step": 972
},
{
"epoch": 4.816831683168317,
"grad_norm": 0.04385791587886883,
"learning_rate": 3.265980123038004e-07,
"loss": 0.2386,
"step": 973
},
{
"epoch": 4.821782178217822,
"grad_norm": 0.041261863255062134,
"learning_rate": 3.0920510546894156e-07,
"loss": 0.2371,
"step": 974
},
{
"epoch": 4.826732673267327,
"grad_norm": 0.045027627430168714,
"learning_rate": 2.9228628923285705e-07,
"loss": 0.2413,
"step": 975
},
{
"epoch": 4.8316831683168315,
"grad_norm": 0.04461871544630878,
"learning_rate": 2.7584176568401734e-07,
"loss": 0.2362,
"step": 976
},
{
"epoch": 4.836633663366337,
"grad_norm": 0.04452805342206743,
"learning_rate": 2.5987173124564224e-07,
"loss": 0.2412,
"step": 977
},
{
"epoch": 4.841584158415841,
"grad_norm": 0.043976192764155944,
"learning_rate": 2.4437637667338754e-07,
"loss": 0.2374,
"step": 978
},
{
"epoch": 4.846534653465347,
"grad_norm": 0.04618394921045984,
"learning_rate": 2.2935588705302658e-07,
"loss": 0.2384,
"step": 979
},
{
"epoch": 4.851485148514851,
"grad_norm": 0.042765827362432736,
"learning_rate": 2.148104417982788e-07,
"loss": 0.2369,
"step": 980
},
{
"epoch": 4.856435643564357,
"grad_norm": 0.04141942912952928,
"learning_rate": 2.0074021464864702e-07,
"loss": 0.2368,
"step": 981
},
{
"epoch": 4.861386138613861,
"grad_norm": 0.04381001009904221,
"learning_rate": 1.871453736673301e-07,
"loss": 0.239,
"step": 982
},
{
"epoch": 4.866336633663367,
"grad_norm": 0.04481962497369779,
"learning_rate": 1.740260812392558e-07,
"loss": 0.241,
"step": 983
},
{
"epoch": 4.871287128712871,
"grad_norm": 0.04339245184057915,
"learning_rate": 1.6138249406909558e-07,
"loss": 0.2387,
"step": 984
},
{
"epoch": 4.876237623762377,
"grad_norm": 0.04468071895656673,
"learning_rate": 1.4921476317941719e-07,
"loss": 0.2393,
"step": 985
},
{
"epoch": 4.881188118811881,
"grad_norm": 0.043243224263089276,
"learning_rate": 1.3752303390887733e-07,
"loss": 0.2405,
"step": 986
},
{
"epoch": 4.8861386138613865,
"grad_norm": 0.04411565568410913,
"learning_rate": 1.2630744591048516e-07,
"loss": 0.2388,
"step": 987
},
{
"epoch": 4.891089108910891,
"grad_norm": 0.0439686894180931,
"learning_rate": 1.1556813314993698e-07,
"loss": 0.2387,
"step": 988
},
{
"epoch": 4.896039603960396,
"grad_norm": 0.04258496686208244,
"learning_rate": 1.0530522390400422e-07,
"loss": 0.2382,
"step": 989
},
{
"epoch": 4.900990099009901,
"grad_norm": 0.043027628871607715,
"learning_rate": 9.551884075901463e-08,
"loss": 0.2366,
"step": 990
},
{
"epoch": 4.905940594059406,
"grad_norm": 0.04240544691281539,
"learning_rate": 8.620910060938681e-08,
"loss": 0.2377,
"step": 991
},
{
"epoch": 4.910891089108911,
"grad_norm": 0.0429291141669323,
"learning_rate": 7.737611465622686e-08,
"loss": 0.2391,
"step": 992
},
{
"epoch": 4.915841584158416,
"grad_norm": 0.04405082368021681,
"learning_rate": 6.901998840600055e-08,
"loss": 0.2388,
"step": 993
},
{
"epoch": 4.920792079207921,
"grad_norm": 0.04233502027894486,
"learning_rate": 6.11408216692766e-08,
"loss": 0.2373,
"step": 994
},
{
"epoch": 4.925742574257426,
"grad_norm": 0.04248456835844998,
"learning_rate": 5.373870855954089e-08,
"loss": 0.2395,
"step": 995
},
{
"epoch": 4.930693069306931,
"grad_norm": 0.04337704667173435,
"learning_rate": 4.681373749205964e-08,
"loss": 0.2392,
"step": 996
},
{
"epoch": 4.935643564356436,
"grad_norm": 0.04293673533953289,
"learning_rate": 4.036599118282691e-08,
"loss": 0.2398,
"step": 997
},
{
"epoch": 4.9405940594059405,
"grad_norm": 0.04264178722028713,
"learning_rate": 3.439554664758316e-08,
"loss": 0.2372,
"step": 998
},
{
"epoch": 4.945544554455445,
"grad_norm": 0.04733981864127338,
"learning_rate": 2.890247520089151e-08,
"loss": 0.2389,
"step": 999
},
{
"epoch": 4.9504950495049505,
"grad_norm": 0.043007524302967316,
"learning_rate": 2.3886842455285166e-08,
"loss": 0.235,
"step": 1000
},
{
"epoch": 4.955445544554456,
"grad_norm": 0.042085073458264566,
"learning_rate": 1.934870832047686e-08,
"loss": 0.2364,
"step": 1001
},
{
"epoch": 4.96039603960396,
"grad_norm": 0.04333569651636603,
"learning_rate": 1.528812700266169e-08,
"loss": 0.2362,
"step": 1002
},
{
"epoch": 4.965346534653465,
"grad_norm": 0.04536711780330465,
"learning_rate": 1.1705147003842065e-08,
"loss": 0.2382,
"step": 1003
},
{
"epoch": 4.97029702970297,
"grad_norm": 0.04235293811505524,
"learning_rate": 8.59981112128594e-09,
"loss": 0.2367,
"step": 1004
},
{
"epoch": 4.975247524752476,
"grad_norm": 0.042102375850486706,
"learning_rate": 5.972156446980571e-09,
"loss": 0.2407,
"step": 1005
},
{
"epoch": 4.98019801980198,
"grad_norm": 0.04302505768029455,
"learning_rate": 3.822214367197319e-09,
"loss": 0.2388,
"step": 1006
},
{
"epoch": 4.985148514851485,
"grad_norm": 0.044323506621706,
"learning_rate": 2.150010562140814e-09,
"loss": 0.2391,
"step": 1007
},
{
"epoch": 4.99009900990099,
"grad_norm": 0.04355933275117794,
"learning_rate": 9.555650056070065e-10,
"loss": 0.2353,
"step": 1008
},
{
"epoch": 4.9950495049504955,
"grad_norm": 0.0446808654882407,
"learning_rate": 2.3889196477000497e-10,
"loss": 0.2401,
"step": 1009
},
{
"epoch": 5.0,
"grad_norm": 0.04593523633444536,
"learning_rate": 0.0,
"loss": 0.2343,
"step": 1010
},
{
"epoch": 5.0,
"step": 1010,
"total_flos": 5.689611896487936e+16,
"train_loss": 0.22220699680913794,
"train_runtime": 41938.1616,
"train_samples_per_second": 12.314,
"train_steps_per_second": 0.024
}
],
"logging_steps": 1,
"max_steps": 1010,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.689611896487936e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}