{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1056,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002840909090909091,
"grad_norm": 9.802746734876441,
"learning_rate": 0.0,
"loss": 0.6182,
"step": 1
},
{
"epoch": 0.005681818181818182,
"grad_norm": 12.388093403552265,
"learning_rate": 1.1627906976744187e-07,
"loss": 0.6649,
"step": 2
},
{
"epoch": 0.008522727272727272,
"grad_norm": 12.643095212909474,
"learning_rate": 2.3255813953488374e-07,
"loss": 0.6794,
"step": 3
},
{
"epoch": 0.011363636363636364,
"grad_norm": 9.628453582962425,
"learning_rate": 3.488372093023256e-07,
"loss": 0.5426,
"step": 4
},
{
"epoch": 0.014204545454545454,
"grad_norm": 12.114285873199693,
"learning_rate": 4.651162790697675e-07,
"loss": 0.6628,
"step": 5
},
{
"epoch": 0.017045454545454544,
"grad_norm": 12.590069642332757,
"learning_rate": 5.813953488372094e-07,
"loss": 0.6635,
"step": 6
},
{
"epoch": 0.019886363636363636,
"grad_norm": 11.894881604292143,
"learning_rate": 6.976744186046513e-07,
"loss": 0.6478,
"step": 7
},
{
"epoch": 0.022727272727272728,
"grad_norm": 10.523659604864859,
"learning_rate": 8.139534883720931e-07,
"loss": 0.6382,
"step": 8
},
{
"epoch": 0.02556818181818182,
"grad_norm": 9.260520595400251,
"learning_rate": 9.30232558139535e-07,
"loss": 0.5683,
"step": 9
},
{
"epoch": 0.028409090909090908,
"grad_norm": 8.701673712634479,
"learning_rate": 1.0465116279069768e-06,
"loss": 0.5677,
"step": 10
},
{
"epoch": 0.03125,
"grad_norm": 7.754246744436588,
"learning_rate": 1.1627906976744188e-06,
"loss": 0.5026,
"step": 11
},
{
"epoch": 0.03409090909090909,
"grad_norm": 8.663705476348797,
"learning_rate": 1.2790697674418605e-06,
"loss": 0.6104,
"step": 12
},
{
"epoch": 0.036931818181818184,
"grad_norm": 5.045315784322545,
"learning_rate": 1.3953488372093025e-06,
"loss": 0.4227,
"step": 13
},
{
"epoch": 0.03977272727272727,
"grad_norm": 4.926402953478099,
"learning_rate": 1.5116279069767443e-06,
"loss": 0.4896,
"step": 14
},
{
"epoch": 0.04261363636363636,
"grad_norm": 4.591926718398226,
"learning_rate": 1.6279069767441862e-06,
"loss": 0.4869,
"step": 15
},
{
"epoch": 0.045454545454545456,
"grad_norm": 4.197025239911461,
"learning_rate": 1.7441860465116282e-06,
"loss": 0.4637,
"step": 16
},
{
"epoch": 0.048295454545454544,
"grad_norm": 3.8588657903560684,
"learning_rate": 1.86046511627907e-06,
"loss": 0.4426,
"step": 17
},
{
"epoch": 0.05113636363636364,
"grad_norm": 1.8811670709600292,
"learning_rate": 1.976744186046512e-06,
"loss": 0.4305,
"step": 18
},
{
"epoch": 0.05397727272727273,
"grad_norm": 1.6752451580220031,
"learning_rate": 2.0930232558139536e-06,
"loss": 0.4529,
"step": 19
},
{
"epoch": 0.056818181818181816,
"grad_norm": 1.2090823975791671,
"learning_rate": 2.2093023255813954e-06,
"loss": 0.3613,
"step": 20
},
{
"epoch": 0.05965909090909091,
"grad_norm": 1.1814336772386804,
"learning_rate": 2.3255813953488376e-06,
"loss": 0.4037,
"step": 21
},
{
"epoch": 0.0625,
"grad_norm": 0.8954725283144086,
"learning_rate": 2.4418604651162793e-06,
"loss": 0.3702,
"step": 22
},
{
"epoch": 0.06534090909090909,
"grad_norm": 0.8798870296631145,
"learning_rate": 2.558139534883721e-06,
"loss": 0.3973,
"step": 23
},
{
"epoch": 0.06818181818181818,
"grad_norm": 0.5832983194953867,
"learning_rate": 2.674418604651163e-06,
"loss": 0.3262,
"step": 24
},
{
"epoch": 0.07102272727272728,
"grad_norm": 0.8732475291899245,
"learning_rate": 2.790697674418605e-06,
"loss": 0.3909,
"step": 25
},
{
"epoch": 0.07386363636363637,
"grad_norm": 1.100897285846476,
"learning_rate": 2.9069767441860468e-06,
"loss": 0.3817,
"step": 26
},
{
"epoch": 0.07670454545454546,
"grad_norm": 1.0608377951702355,
"learning_rate": 3.0232558139534885e-06,
"loss": 0.3583,
"step": 27
},
{
"epoch": 0.07954545454545454,
"grad_norm": 1.0224952192594947,
"learning_rate": 3.1395348837209307e-06,
"loss": 0.4162,
"step": 28
},
{
"epoch": 0.08238636363636363,
"grad_norm": 0.8097165887156961,
"learning_rate": 3.2558139534883724e-06,
"loss": 0.3477,
"step": 29
},
{
"epoch": 0.08522727272727272,
"grad_norm": 0.7315228867679278,
"learning_rate": 3.372093023255814e-06,
"loss": 0.3951,
"step": 30
},
{
"epoch": 0.08806818181818182,
"grad_norm": 0.6032121177421607,
"learning_rate": 3.4883720930232564e-06,
"loss": 0.3414,
"step": 31
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.5651833216962348,
"learning_rate": 3.6046511627906977e-06,
"loss": 0.3635,
"step": 32
},
{
"epoch": 0.09375,
"grad_norm": 0.5192255380315864,
"learning_rate": 3.72093023255814e-06,
"loss": 0.3888,
"step": 33
},
{
"epoch": 0.09659090909090909,
"grad_norm": 0.49173473741498314,
"learning_rate": 3.837209302325582e-06,
"loss": 0.3749,
"step": 34
},
{
"epoch": 0.09943181818181818,
"grad_norm": 0.48300590116190206,
"learning_rate": 3.953488372093024e-06,
"loss": 0.3719,
"step": 35
},
{
"epoch": 0.10227272727272728,
"grad_norm": 0.47568795818970555,
"learning_rate": 4.0697674418604655e-06,
"loss": 0.3502,
"step": 36
},
{
"epoch": 0.10511363636363637,
"grad_norm": 0.5738976486828545,
"learning_rate": 4.186046511627907e-06,
"loss": 0.3553,
"step": 37
},
{
"epoch": 0.10795454545454546,
"grad_norm": 0.48281438241706864,
"learning_rate": 4.302325581395349e-06,
"loss": 0.3194,
"step": 38
},
{
"epoch": 0.11079545454545454,
"grad_norm": 0.6040813728082152,
"learning_rate": 4.418604651162791e-06,
"loss": 0.3753,
"step": 39
},
{
"epoch": 0.11363636363636363,
"grad_norm": 0.5510018703021852,
"learning_rate": 4.5348837209302326e-06,
"loss": 0.3497,
"step": 40
},
{
"epoch": 0.11647727272727272,
"grad_norm": 0.4265614122633672,
"learning_rate": 4.651162790697675e-06,
"loss": 0.3067,
"step": 41
},
{
"epoch": 0.11931818181818182,
"grad_norm": 0.3982552723726358,
"learning_rate": 4.767441860465117e-06,
"loss": 0.3166,
"step": 42
},
{
"epoch": 0.12215909090909091,
"grad_norm": 0.42319934937905634,
"learning_rate": 4.883720930232559e-06,
"loss": 0.3406,
"step": 43
},
{
"epoch": 0.125,
"grad_norm": 0.48844669962812265,
"learning_rate": 5e-06,
"loss": 0.3809,
"step": 44
},
{
"epoch": 0.1278409090909091,
"grad_norm": 0.4283299903892573,
"learning_rate": 4.999987977618099e-06,
"loss": 0.3487,
"step": 45
},
{
"epoch": 0.13068181818181818,
"grad_norm": 0.45165901843941525,
"learning_rate": 4.999951910588025e-06,
"loss": 0.3261,
"step": 46
},
{
"epoch": 0.13352272727272727,
"grad_norm": 0.3309060296669714,
"learning_rate": 4.999891799256668e-06,
"loss": 0.3122,
"step": 47
},
{
"epoch": 0.13636363636363635,
"grad_norm": 0.3836084760514636,
"learning_rate": 4.9998076442021725e-06,
"loss": 0.3001,
"step": 48
},
{
"epoch": 0.13920454545454544,
"grad_norm": 0.425230874245839,
"learning_rate": 4.999699446233934e-06,
"loss": 0.3341,
"step": 49
},
{
"epoch": 0.14204545454545456,
"grad_norm": 0.4444798732501407,
"learning_rate": 4.999567206392591e-06,
"loss": 0.3373,
"step": 50
},
{
"epoch": 0.14488636363636365,
"grad_norm": 0.381536539310927,
"learning_rate": 4.999410925950012e-06,
"loss": 0.3267,
"step": 51
},
{
"epoch": 0.14772727272727273,
"grad_norm": 0.3767650025962174,
"learning_rate": 4.99923060640929e-06,
"loss": 0.328,
"step": 52
},
{
"epoch": 0.15056818181818182,
"grad_norm": 0.3903203005773619,
"learning_rate": 4.99902624950472e-06,
"loss": 0.3367,
"step": 53
},
{
"epoch": 0.1534090909090909,
"grad_norm": 0.47731540090520985,
"learning_rate": 4.9987978572017875e-06,
"loss": 0.3749,
"step": 54
},
{
"epoch": 0.15625,
"grad_norm": 0.36341294567474813,
"learning_rate": 4.998545431697149e-06,
"loss": 0.2952,
"step": 55
},
{
"epoch": 0.1590909090909091,
"grad_norm": 0.4160548663852485,
"learning_rate": 4.998268975418606e-06,
"loss": 0.3779,
"step": 56
},
{
"epoch": 0.16193181818181818,
"grad_norm": 0.3664734921308225,
"learning_rate": 4.997968491025093e-06,
"loss": 0.3105,
"step": 57
},
{
"epoch": 0.16477272727272727,
"grad_norm": 0.35755496009312704,
"learning_rate": 4.997643981406638e-06,
"loss": 0.3508,
"step": 58
},
{
"epoch": 0.16761363636363635,
"grad_norm": 0.3738253178296096,
"learning_rate": 4.997295449684345e-06,
"loss": 0.349,
"step": 59
},
{
"epoch": 0.17045454545454544,
"grad_norm": 0.3175005755892801,
"learning_rate": 4.996922899210358e-06,
"loss": 0.2984,
"step": 60
},
{
"epoch": 0.17329545454545456,
"grad_norm": 0.39931619691125575,
"learning_rate": 4.996526333567833e-06,
"loss": 0.3627,
"step": 61
},
{
"epoch": 0.17613636363636365,
"grad_norm": 0.3726199489633269,
"learning_rate": 4.9961057565709015e-06,
"loss": 0.3274,
"step": 62
},
{
"epoch": 0.17897727272727273,
"grad_norm": 0.3954308613768431,
"learning_rate": 4.995661172264632e-06,
"loss": 0.34,
"step": 63
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.3814105011898473,
"learning_rate": 4.995192584924995e-06,
"loss": 0.3122,
"step": 64
},
{
"epoch": 0.1846590909090909,
"grad_norm": 0.3655156038716592,
"learning_rate": 4.99469999905882e-06,
"loss": 0.35,
"step": 65
},
{
"epoch": 0.1875,
"grad_norm": 0.39224970009402493,
"learning_rate": 4.99418341940375e-06,
"loss": 0.3057,
"step": 66
},
{
"epoch": 0.1903409090909091,
"grad_norm": 0.33083770067354695,
"learning_rate": 4.9936428509282e-06,
"loss": 0.3144,
"step": 67
},
{
"epoch": 0.19318181818181818,
"grad_norm": 0.31786460924484966,
"learning_rate": 4.9930782988313065e-06,
"loss": 0.3156,
"step": 68
},
{
"epoch": 0.19602272727272727,
"grad_norm": 0.3515587615165226,
"learning_rate": 4.992489768542877e-06,
"loss": 0.318,
"step": 69
},
{
"epoch": 0.19886363636363635,
"grad_norm": 0.39680326873271354,
"learning_rate": 4.991877265723343e-06,
"loss": 0.3319,
"step": 70
},
{
"epoch": 0.20170454545454544,
"grad_norm": 0.3532333123348208,
"learning_rate": 4.9912407962636965e-06,
"loss": 0.3343,
"step": 71
},
{
"epoch": 0.20454545454545456,
"grad_norm": 0.3684851475562903,
"learning_rate": 4.990580366285441e-06,
"loss": 0.3214,
"step": 72
},
{
"epoch": 0.20738636363636365,
"grad_norm": 0.34703126004025847,
"learning_rate": 4.98989598214053e-06,
"loss": 0.3497,
"step": 73
},
{
"epoch": 0.21022727272727273,
"grad_norm": 0.331786659705209,
"learning_rate": 4.989187650411306e-06,
"loss": 0.3119,
"step": 74
},
{
"epoch": 0.21306818181818182,
"grad_norm": 0.3514432926351399,
"learning_rate": 4.988455377910436e-06,
"loss": 0.3276,
"step": 75
},
{
"epoch": 0.2159090909090909,
"grad_norm": 0.45669134699095365,
"learning_rate": 4.987699171680846e-06,
"loss": 0.3502,
"step": 76
},
{
"epoch": 0.21875,
"grad_norm": 0.3799997391446089,
"learning_rate": 4.98691903899566e-06,
"loss": 0.3389,
"step": 77
},
{
"epoch": 0.2215909090909091,
"grad_norm": 0.32987905423731806,
"learning_rate": 4.986114987358118e-06,
"loss": 0.3154,
"step": 78
},
{
"epoch": 0.22443181818181818,
"grad_norm": 0.37320907794023317,
"learning_rate": 4.985287024501512e-06,
"loss": 0.2865,
"step": 79
},
{
"epoch": 0.22727272727272727,
"grad_norm": 0.3606727238448836,
"learning_rate": 4.9844351583891125e-06,
"loss": 0.3352,
"step": 80
},
{
"epoch": 0.23011363636363635,
"grad_norm": 0.28704484493903537,
"learning_rate": 4.983559397214086e-06,
"loss": 0.2761,
"step": 81
},
{
"epoch": 0.23295454545454544,
"grad_norm": 0.3395805127723043,
"learning_rate": 4.982659749399421e-06,
"loss": 0.3013,
"step": 82
},
{
"epoch": 0.23579545454545456,
"grad_norm": 0.32754503212231606,
"learning_rate": 4.981736223597845e-06,
"loss": 0.3291,
"step": 83
},
{
"epoch": 0.23863636363636365,
"grad_norm": 0.3278411182469415,
"learning_rate": 4.9807888286917425e-06,
"loss": 0.281,
"step": 84
},
{
"epoch": 0.24147727272727273,
"grad_norm": 0.3312034883074764,
"learning_rate": 4.979817573793068e-06,
"loss": 0.3484,
"step": 85
},
{
"epoch": 0.24431818181818182,
"grad_norm": 0.3001329867151946,
"learning_rate": 4.978822468243259e-06,
"loss": 0.2842,
"step": 86
},
{
"epoch": 0.2471590909090909,
"grad_norm": 0.3516159032278349,
"learning_rate": 4.977803521613147e-06,
"loss": 0.3084,
"step": 87
},
{
"epoch": 0.25,
"grad_norm": 0.3782753735314241,
"learning_rate": 4.9767607437028645e-06,
"loss": 0.3381,
"step": 88
},
{
"epoch": 0.2528409090909091,
"grad_norm": 0.3170089268559784,
"learning_rate": 4.97569414454175e-06,
"loss": 0.3215,
"step": 89
},
{
"epoch": 0.2556818181818182,
"grad_norm": 0.29420316873312097,
"learning_rate": 4.9746037343882545e-06,
"loss": 0.2998,
"step": 90
},
{
"epoch": 0.2585227272727273,
"grad_norm": 0.45657642279690197,
"learning_rate": 4.97348952372984e-06,
"loss": 0.3354,
"step": 91
},
{
"epoch": 0.26136363636363635,
"grad_norm": 0.32675165284478025,
"learning_rate": 4.972351523282878e-06,
"loss": 0.2715,
"step": 92
},
{
"epoch": 0.26420454545454547,
"grad_norm": 0.37411987401338476,
"learning_rate": 4.97118974399255e-06,
"loss": 0.331,
"step": 93
},
{
"epoch": 0.26704545454545453,
"grad_norm": 0.2906231907319114,
"learning_rate": 4.970004197032741e-06,
"loss": 0.2635,
"step": 94
},
{
"epoch": 0.26988636363636365,
"grad_norm": 0.42609899782651967,
"learning_rate": 4.968794893805927e-06,
"loss": 0.3662,
"step": 95
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.35277264498485456,
"learning_rate": 4.967561845943074e-06,
"loss": 0.3656,
"step": 96
},
{
"epoch": 0.2755681818181818,
"grad_norm": 0.33825537104063047,
"learning_rate": 4.966305065303519e-06,
"loss": 0.2949,
"step": 97
},
{
"epoch": 0.2784090909090909,
"grad_norm": 0.36200881129772927,
"learning_rate": 4.96502456397486e-06,
"loss": 0.3457,
"step": 98
},
{
"epoch": 0.28125,
"grad_norm": 0.31133758943801504,
"learning_rate": 4.963720354272837e-06,
"loss": 0.2831,
"step": 99
},
{
"epoch": 0.2840909090909091,
"grad_norm": 0.3398462998770164,
"learning_rate": 4.962392448741216e-06,
"loss": 0.308,
"step": 100
},
{
"epoch": 0.2869318181818182,
"grad_norm": 0.2825796948908475,
"learning_rate": 4.961040860151669e-06,
"loss": 0.2634,
"step": 101
},
{
"epoch": 0.2897727272727273,
"grad_norm": 0.38927704510942096,
"learning_rate": 4.9596656015036434e-06,
"loss": 0.2942,
"step": 102
},
{
"epoch": 0.29261363636363635,
"grad_norm": 0.35680520232446933,
"learning_rate": 4.95826668602425e-06,
"loss": 0.3148,
"step": 103
},
{
"epoch": 0.29545454545454547,
"grad_norm": 0.40848691247631896,
"learning_rate": 4.956844127168124e-06,
"loss": 0.3475,
"step": 104
},
{
"epoch": 0.29829545454545453,
"grad_norm": 0.3675982469780909,
"learning_rate": 4.955397938617304e-06,
"loss": 0.3223,
"step": 105
},
{
"epoch": 0.30113636363636365,
"grad_norm": 0.32048567892217283,
"learning_rate": 4.953928134281093e-06,
"loss": 0.316,
"step": 106
},
{
"epoch": 0.3039772727272727,
"grad_norm": 0.3107707861319827,
"learning_rate": 4.952434728295931e-06,
"loss": 0.3031,
"step": 107
},
{
"epoch": 0.3068181818181818,
"grad_norm": 0.38878643961644715,
"learning_rate": 4.950917735025256e-06,
"loss": 0.3355,
"step": 108
},
{
"epoch": 0.3096590909090909,
"grad_norm": 0.3735768679081344,
"learning_rate": 4.949377169059365e-06,
"loss": 0.3008,
"step": 109
},
{
"epoch": 0.3125,
"grad_norm": 0.3808439931809935,
"learning_rate": 4.947813045215277e-06,
"loss": 0.3002,
"step": 110
},
{
"epoch": 0.3153409090909091,
"grad_norm": 0.3256292929675435,
"learning_rate": 4.946225378536587e-06,
"loss": 0.2988,
"step": 111
},
{
"epoch": 0.3181818181818182,
"grad_norm": 0.35150877205189135,
"learning_rate": 4.944614184293321e-06,
"loss": 0.2993,
"step": 112
},
{
"epoch": 0.3210227272727273,
"grad_norm": 0.37494589367664166,
"learning_rate": 4.942979477981797e-06,
"loss": 0.3129,
"step": 113
},
{
"epoch": 0.32386363636363635,
"grad_norm": 0.3506621432286222,
"learning_rate": 4.941321275324463e-06,
"loss": 0.3015,
"step": 114
},
{
"epoch": 0.32670454545454547,
"grad_norm": 0.30804865814837706,
"learning_rate": 4.939639592269757e-06,
"loss": 0.2709,
"step": 115
},
{
"epoch": 0.32954545454545453,
"grad_norm": 0.4334401140811609,
"learning_rate": 4.9379344449919465e-06,
"loss": 0.3211,
"step": 116
},
{
"epoch": 0.33238636363636365,
"grad_norm": 0.4113976286859321,
"learning_rate": 4.936205849890977e-06,
"loss": 0.3486,
"step": 117
},
{
"epoch": 0.3352272727272727,
"grad_norm": 0.38143204868428404,
"learning_rate": 4.934453823592313e-06,
"loss": 0.3248,
"step": 118
},
{
"epoch": 0.3380681818181818,
"grad_norm": 0.3935231496732602,
"learning_rate": 4.9326783829467795e-06,
"loss": 0.3369,
"step": 119
},
{
"epoch": 0.3409090909090909,
"grad_norm": 0.3715854335519974,
"learning_rate": 4.930879545030395e-06,
"loss": 0.3162,
"step": 120
},
{
"epoch": 0.34375,
"grad_norm": 0.2987173708346766,
"learning_rate": 4.929057327144213e-06,
"loss": 0.2704,
"step": 121
},
{
"epoch": 0.3465909090909091,
"grad_norm": 0.3505876441509565,
"learning_rate": 4.927211746814155e-06,
"loss": 0.2897,
"step": 122
},
{
"epoch": 0.3494318181818182,
"grad_norm": 0.3808807666150658,
"learning_rate": 4.925342821790834e-06,
"loss": 0.298,
"step": 123
},
{
"epoch": 0.3522727272727273,
"grad_norm": 0.40265933198110954,
"learning_rate": 4.923450570049398e-06,
"loss": 0.3063,
"step": 124
},
{
"epoch": 0.35511363636363635,
"grad_norm": 0.329984359578131,
"learning_rate": 4.921535009789344e-06,
"loss": 0.281,
"step": 125
},
{
"epoch": 0.35795454545454547,
"grad_norm": 0.3327810259029677,
"learning_rate": 4.91959615943435e-06,
"loss": 0.3035,
"step": 126
},
{
"epoch": 0.36079545454545453,
"grad_norm": 0.33832701513333335,
"learning_rate": 4.917634037632095e-06,
"loss": 0.2817,
"step": 127
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.3446767418817894,
"learning_rate": 4.915648663254081e-06,
"loss": 0.3275,
"step": 128
},
{
"epoch": 0.3664772727272727,
"grad_norm": 0.4067285176470478,
"learning_rate": 4.9136400553954526e-06,
"loss": 0.2644,
"step": 129
},
{
"epoch": 0.3693181818181818,
"grad_norm": 0.32647438056937467,
"learning_rate": 4.91160823337481e-06,
"loss": 0.3012,
"step": 130
},
{
"epoch": 0.3721590909090909,
"grad_norm": 0.2641653305047082,
"learning_rate": 4.909553216734024e-06,
"loss": 0.2551,
"step": 131
},
{
"epoch": 0.375,
"grad_norm": 0.3587439503975781,
"learning_rate": 4.907475025238051e-06,
"loss": 0.3429,
"step": 132
},
{
"epoch": 0.3778409090909091,
"grad_norm": 0.39094595293189244,
"learning_rate": 4.905373678874741e-06,
"loss": 0.3428,
"step": 133
},
{
"epoch": 0.3806818181818182,
"grad_norm": 0.33295666810345625,
"learning_rate": 4.903249197854645e-06,
"loss": 0.3024,
"step": 134
},
{
"epoch": 0.3835227272727273,
"grad_norm": 0.4067834961803898,
"learning_rate": 4.90110160261082e-06,
"loss": 0.388,
"step": 135
},
{
"epoch": 0.38636363636363635,
"grad_norm": 0.3041105753158812,
"learning_rate": 4.898930913798635e-06,
"loss": 0.2791,
"step": 136
},
{
"epoch": 0.38920454545454547,
"grad_norm": 0.3854716077313248,
"learning_rate": 4.89673715229557e-06,
"loss": 0.3516,
"step": 137
},
{
"epoch": 0.39204545454545453,
"grad_norm": 0.41029172649451373,
"learning_rate": 4.894520339201014e-06,
"loss": 0.3221,
"step": 138
},
{
"epoch": 0.39488636363636365,
"grad_norm": 0.31953693308642406,
"learning_rate": 4.892280495836068e-06,
"loss": 0.3268,
"step": 139
},
{
"epoch": 0.3977272727272727,
"grad_norm": 0.4798811586379984,
"learning_rate": 4.890017643743334e-06,
"loss": 0.3115,
"step": 140
},
{
"epoch": 0.4005681818181818,
"grad_norm": 0.3603031050892597,
"learning_rate": 4.887731804686707e-06,
"loss": 0.2844,
"step": 141
},
{
"epoch": 0.4034090909090909,
"grad_norm": 0.40465606169589835,
"learning_rate": 4.885423000651174e-06,
"loss": 0.3573,
"step": 142
},
{
"epoch": 0.40625,
"grad_norm": 0.3643063680731307,
"learning_rate": 4.883091253842592e-06,
"loss": 0.2861,
"step": 143
},
{
"epoch": 0.4090909090909091,
"grad_norm": 0.2855806950882976,
"learning_rate": 4.8807365866874825e-06,
"loss": 0.2856,
"step": 144
},
{
"epoch": 0.4119318181818182,
"grad_norm": 0.43700846878534866,
"learning_rate": 4.878359021832812e-06,
"loss": 0.3025,
"step": 145
},
{
"epoch": 0.4147727272727273,
"grad_norm": 0.3691328488500052,
"learning_rate": 4.875958582145775e-06,
"loss": 0.3516,
"step": 146
},
{
"epoch": 0.41761363636363635,
"grad_norm": 0.3602263970719629,
"learning_rate": 4.873535290713571e-06,
"loss": 0.3276,
"step": 147
},
{
"epoch": 0.42045454545454547,
"grad_norm": 0.2873285630204768,
"learning_rate": 4.871089170843192e-06,
"loss": 0.272,
"step": 148
},
{
"epoch": 0.42329545454545453,
"grad_norm": 0.3275589221978115,
"learning_rate": 4.868620246061185e-06,
"loss": 0.3127,
"step": 149
},
{
"epoch": 0.42613636363636365,
"grad_norm": 0.3595600686315243,
"learning_rate": 4.866128540113436e-06,
"loss": 0.293,
"step": 150
},
{
"epoch": 0.4289772727272727,
"grad_norm": 0.39412366891247624,
"learning_rate": 4.863614076964937e-06,
"loss": 0.3105,
"step": 151
},
{
"epoch": 0.4318181818181818,
"grad_norm": 0.2967856642106585,
"learning_rate": 4.8610768807995575e-06,
"loss": 0.2488,
"step": 152
},
{
"epoch": 0.4346590909090909,
"grad_norm": 0.3353960107255814,
"learning_rate": 4.85851697601981e-06,
"loss": 0.31,
"step": 153
},
{
"epoch": 0.4375,
"grad_norm": 0.3293934153604414,
"learning_rate": 4.855934387246619e-06,
"loss": 0.31,
"step": 154
},
{
"epoch": 0.4403409090909091,
"grad_norm": 0.4020477745824599,
"learning_rate": 4.853329139319076e-06,
"loss": 0.3607,
"step": 155
},
{
"epoch": 0.4431818181818182,
"grad_norm": 0.40194438779646285,
"learning_rate": 4.850701257294212e-06,
"loss": 0.3194,
"step": 156
},
{
"epoch": 0.4460227272727273,
"grad_norm": 0.35880107189234606,
"learning_rate": 4.848050766446746e-06,
"loss": 0.3257,
"step": 157
},
{
"epoch": 0.44886363636363635,
"grad_norm": 0.3225921590602741,
"learning_rate": 4.84537769226885e-06,
"loss": 0.2865,
"step": 158
},
{
"epoch": 0.45170454545454547,
"grad_norm": 0.43105913904133064,
"learning_rate": 4.842682060469899e-06,
"loss": 0.2917,
"step": 159
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.3984098156673031,
"learning_rate": 4.839963896976223e-06,
"loss": 0.3137,
"step": 160
},
{
"epoch": 0.45738636363636365,
"grad_norm": 0.34203541957482897,
"learning_rate": 4.837223227930864e-06,
"loss": 0.3021,
"step": 161
},
{
"epoch": 0.4602272727272727,
"grad_norm": 0.3410914811625815,
"learning_rate": 4.834460079693317e-06,
"loss": 0.3197,
"step": 162
},
{
"epoch": 0.4630681818181818,
"grad_norm": 0.3668120756523038,
"learning_rate": 4.831674478839281e-06,
"loss": 0.3242,
"step": 163
},
{
"epoch": 0.4659090909090909,
"grad_norm": 0.34128762447014865,
"learning_rate": 4.828866452160402e-06,
"loss": 0.2626,
"step": 164
},
{
"epoch": 0.46875,
"grad_norm": 0.34134817423813496,
"learning_rate": 4.826036026664014e-06,
"loss": 0.2771,
"step": 165
},
{
"epoch": 0.4715909090909091,
"grad_norm": 0.3270025125687817,
"learning_rate": 4.823183229572883e-06,
"loss": 0.2921,
"step": 166
},
{
"epoch": 0.4744318181818182,
"grad_norm": 0.3701876487404051,
"learning_rate": 4.820308088324942e-06,
"loss": 0.3315,
"step": 167
},
{
"epoch": 0.4772727272727273,
"grad_norm": 0.4223541290676315,
"learning_rate": 4.8174106305730284e-06,
"loss": 0.3458,
"step": 168
},
{
"epoch": 0.48011363636363635,
"grad_norm": 0.36826807946452467,
"learning_rate": 4.814490884184615e-06,
"loss": 0.3098,
"step": 169
},
{
"epoch": 0.48295454545454547,
"grad_norm": 0.34247450811498126,
"learning_rate": 4.811548877241549e-06,
"loss": 0.2794,
"step": 170
},
{
"epoch": 0.48579545454545453,
"grad_norm": 0.36931394013248037,
"learning_rate": 4.808584638039774e-06,
"loss": 0.3075,
"step": 171
},
{
"epoch": 0.48863636363636365,
"grad_norm": 0.38654212773141833,
"learning_rate": 4.805598195089063e-06,
"loss": 0.2957,
"step": 172
},
{
"epoch": 0.4914772727272727,
"grad_norm": 0.327791247654709,
"learning_rate": 4.802589577112742e-06,
"loss": 0.317,
"step": 173
},
{
"epoch": 0.4943181818181818,
"grad_norm": 0.4180368575468772,
"learning_rate": 4.7995588130474145e-06,
"loss": 0.2873,
"step": 174
},
{
"epoch": 0.4971590909090909,
"grad_norm": 0.41772200012858535,
"learning_rate": 4.7965059320426825e-06,
"loss": 0.3365,
"step": 175
},
{
"epoch": 0.5,
"grad_norm": 0.3622810863279747,
"learning_rate": 4.7934309634608676e-06,
"loss": 0.3406,
"step": 176
},
{
"epoch": 0.5028409090909091,
"grad_norm": 0.33039829085718986,
"learning_rate": 4.790333936876727e-06,
"loss": 0.2582,
"step": 177
},
{
"epoch": 0.5056818181818182,
"grad_norm": 0.2963847161562058,
"learning_rate": 4.78721488207717e-06,
"loss": 0.2621,
"step": 178
},
{
"epoch": 0.5085227272727273,
"grad_norm": 0.3688579036529526,
"learning_rate": 4.7840738290609714e-06,
"loss": 0.3106,
"step": 179
},
{
"epoch": 0.5113636363636364,
"grad_norm": 0.3882009236138182,
"learning_rate": 4.78091080803848e-06,
"loss": 0.2615,
"step": 180
},
{
"epoch": 0.5142045454545454,
"grad_norm": 0.35367280178437593,
"learning_rate": 4.777725849431336e-06,
"loss": 0.3045,
"step": 181
},
{
"epoch": 0.5170454545454546,
"grad_norm": 0.3874603305325755,
"learning_rate": 4.774518983872169e-06,
"loss": 0.3151,
"step": 182
},
{
"epoch": 0.5198863636363636,
"grad_norm": 0.3089601400335368,
"learning_rate": 4.77129024220431e-06,
"loss": 0.2565,
"step": 183
},
{
"epoch": 0.5227272727272727,
"grad_norm": 0.3741939570187776,
"learning_rate": 4.7680396554814886e-06,
"loss": 0.2824,
"step": 184
},
{
"epoch": 0.5255681818181818,
"grad_norm": 0.3684238808190501,
"learning_rate": 4.764767254967544e-06,
"loss": 0.2717,
"step": 185
},
{
"epoch": 0.5284090909090909,
"grad_norm": 0.34181925499552346,
"learning_rate": 4.761473072136114e-06,
"loss": 0.2984,
"step": 186
},
{
"epoch": 0.53125,
"grad_norm": 0.44267647661167453,
"learning_rate": 4.758157138670337e-06,
"loss": 0.3472,
"step": 187
},
{
"epoch": 0.5340909090909091,
"grad_norm": 0.3887831736377981,
"learning_rate": 4.75481948646255e-06,
"loss": 0.3111,
"step": 188
},
{
"epoch": 0.5369318181818182,
"grad_norm": 0.3683856304101638,
"learning_rate": 4.751460147613973e-06,
"loss": 0.3146,
"step": 189
},
{
"epoch": 0.5397727272727273,
"grad_norm": 0.38527593119976,
"learning_rate": 4.748079154434413e-06,
"loss": 0.3314,
"step": 190
},
{
"epoch": 0.5426136363636364,
"grad_norm": 0.4031772051747187,
"learning_rate": 4.744676539441941e-06,
"loss": 0.315,
"step": 191
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.3353722780310112,
"learning_rate": 4.741252335362588e-06,
"loss": 0.269,
"step": 192
},
{
"epoch": 0.5482954545454546,
"grad_norm": 0.3394618273632171,
"learning_rate": 4.737806575130024e-06,
"loss": 0.2745,
"step": 193
},
{
"epoch": 0.5511363636363636,
"grad_norm": 0.4450532210463518,
"learning_rate": 4.734339291885246e-06,
"loss": 0.3188,
"step": 194
},
{
"epoch": 0.5539772727272727,
"grad_norm": 0.397975066441739,
"learning_rate": 4.7308505189762565e-06,
"loss": 0.2985,
"step": 195
},
{
"epoch": 0.5568181818181818,
"grad_norm": 0.3440535351319966,
"learning_rate": 4.727340289957744e-06,
"loss": 0.2809,
"step": 196
},
{
"epoch": 0.5596590909090909,
"grad_norm": 0.6446149440778554,
"learning_rate": 4.723808638590759e-06,
"loss": 0.3218,
"step": 197
},
{
"epoch": 0.5625,
"grad_norm": 0.37606508969708213,
"learning_rate": 4.720255598842392e-06,
"loss": 0.3176,
"step": 198
},
{
"epoch": 0.5653409090909091,
"grad_norm": 0.43147254520622674,
"learning_rate": 4.716681204885442e-06,
"loss": 0.3268,
"step": 199
},
{
"epoch": 0.5681818181818182,
"grad_norm": 0.41993041372097106,
"learning_rate": 4.713085491098093e-06,
"loss": 0.2804,
"step": 200
},
{
"epoch": 0.5710227272727273,
"grad_norm": 0.48960282010679945,
"learning_rate": 4.70946849206358e-06,
"loss": 0.3996,
"step": 201
},
{
"epoch": 0.5738636363636364,
"grad_norm": 0.3375570582028718,
"learning_rate": 4.705830242569859e-06,
"loss": 0.2914,
"step": 202
},
{
"epoch": 0.5767045454545454,
"grad_norm": 0.33067898836626264,
"learning_rate": 4.70217077760927e-06,
"loss": 0.2717,
"step": 203
},
{
"epoch": 0.5795454545454546,
"grad_norm": 0.3919628586280393,
"learning_rate": 4.6984901323781996e-06,
"loss": 0.2758,
"step": 204
},
{
"epoch": 0.5823863636363636,
"grad_norm": 0.37621132131624546,
"learning_rate": 4.6947883422767475e-06,
"loss": 0.2927,
"step": 205
},
{
"epoch": 0.5852272727272727,
"grad_norm": 0.3588621280506994,
"learning_rate": 4.69106544290838e-06,
"loss": 0.3202,
"step": 206
},
{
"epoch": 0.5880681818181818,
"grad_norm": 0.36135048731331515,
"learning_rate": 4.687321470079593e-06,
"loss": 0.3075,
"step": 207
},
{
"epoch": 0.5909090909090909,
"grad_norm": 0.3804960320633388,
"learning_rate": 4.683556459799562e-06,
"loss": 0.304,
"step": 208
},
{
"epoch": 0.59375,
"grad_norm": 0.32482777456644224,
"learning_rate": 4.679770448279801e-06,
"loss": 0.2333,
"step": 209
},
{
"epoch": 0.5965909090909091,
"grad_norm": 0.38423666885394503,
"learning_rate": 4.6759634719338106e-06,
"loss": 0.3079,
"step": 210
},
{
"epoch": 0.5994318181818182,
"grad_norm": 0.3584077009643052,
"learning_rate": 4.672135567376729e-06,
"loss": 0.3078,
"step": 211
},
{
"epoch": 0.6022727272727273,
"grad_norm": 0.43190228684358967,
"learning_rate": 4.668286771424982e-06,
"loss": 0.3693,
"step": 212
},
{
"epoch": 0.6051136363636364,
"grad_norm": 0.3335333217535499,
"learning_rate": 4.664417121095925e-06,
"loss": 0.2978,
"step": 213
},
{
"epoch": 0.6079545454545454,
"grad_norm": 0.3343126694937098,
"learning_rate": 4.660526653607489e-06,
"loss": 0.2654,
"step": 214
},
{
"epoch": 0.6107954545454546,
"grad_norm": 0.400588578067547,
"learning_rate": 4.656615406377824e-06,
"loss": 0.3541,
"step": 215
},
{
"epoch": 0.6136363636363636,
"grad_norm": 0.28366454469863744,
"learning_rate": 4.652683417024933e-06,
"loss": 0.2595,
"step": 216
},
{
"epoch": 0.6164772727272727,
"grad_norm": 0.3333388085745537,
"learning_rate": 4.648730723366321e-06,
"loss": 0.3034,
"step": 217
},
{
"epoch": 0.6193181818181818,
"grad_norm": 0.3802324883963107,
"learning_rate": 4.644757363418622e-06,
"loss": 0.3149,
"step": 218
},
{
"epoch": 0.6221590909090909,
"grad_norm": 0.3323209944938239,
"learning_rate": 4.640763375397235e-06,
"loss": 0.2831,
"step": 219
},
{
"epoch": 0.625,
"grad_norm": 0.3816473948946037,
"learning_rate": 4.636748797715961e-06,
"loss": 0.2901,
"step": 220
},
{
"epoch": 0.6278409090909091,
"grad_norm": 0.45087508944423654,
"learning_rate": 4.632713668986628e-06,
"loss": 0.2668,
"step": 221
},
{
"epoch": 0.6306818181818182,
"grad_norm": 0.3277834281020941,
"learning_rate": 4.628658028018723e-06,
"loss": 0.3115,
"step": 222
},
{
"epoch": 0.6335227272727273,
"grad_norm": 0.4149700033604779,
"learning_rate": 4.624581913819019e-06,
"loss": 0.3049,
"step": 223
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.2986911926260575,
"learning_rate": 4.6204853655911945e-06,
"loss": 0.2828,
"step": 224
},
{
"epoch": 0.6392045454545454,
"grad_norm": 0.38662077935688544,
"learning_rate": 4.6163684227354656e-06,
"loss": 0.3019,
"step": 225
},
{
"epoch": 0.6420454545454546,
"grad_norm": 0.3670137115048512,
"learning_rate": 4.612231124848199e-06,
"loss": 0.2998,
"step": 226
},
{
"epoch": 0.6448863636363636,
"grad_norm": 0.3820920011764151,
"learning_rate": 4.608073511721534e-06,
"loss": 0.3627,
"step": 227
},
{
"epoch": 0.6477272727272727,
"grad_norm": 0.26469955866368194,
"learning_rate": 4.6038956233430034e-06,
"loss": 0.2419,
"step": 228
},
{
"epoch": 0.6505681818181818,
"grad_norm": 0.32240469660709375,
"learning_rate": 4.59969749989514e-06,
"loss": 0.2692,
"step": 229
},
{
"epoch": 0.6534090909090909,
"grad_norm": 0.3896277142098736,
"learning_rate": 4.5954791817551e-06,
"loss": 0.2789,
"step": 230
},
{
"epoch": 0.65625,
"grad_norm": 0.3510490299412409,
"learning_rate": 4.591240709494269e-06,
"loss": 0.281,
"step": 231
},
{
"epoch": 0.6590909090909091,
"grad_norm": 0.3636438474583087,
"learning_rate": 4.586982123877871e-06,
"loss": 0.2998,
"step": 232
},
{
"epoch": 0.6619318181818182,
"grad_norm": 0.3274578399993675,
"learning_rate": 4.582703465864582e-06,
"loss": 0.2758,
"step": 233
},
{
"epoch": 0.6647727272727273,
"grad_norm": 0.3205713499503409,
"learning_rate": 4.5784047766061305e-06,
"loss": 0.2716,
"step": 234
},
{
"epoch": 0.6676136363636364,
"grad_norm": 0.47159005981022434,
"learning_rate": 4.574086097446903e-06,
"loss": 0.3236,
"step": 235
},
{
"epoch": 0.6704545454545454,
"grad_norm": 0.3617567220761258,
"learning_rate": 4.569747469923547e-06,
"loss": 0.2863,
"step": 236
},
{
"epoch": 0.6732954545454546,
"grad_norm": 0.32166940611651096,
"learning_rate": 4.565388935764572e-06,
"loss": 0.31,
"step": 237
},
{
"epoch": 0.6761363636363636,
"grad_norm": 0.3982166865116622,
"learning_rate": 4.56101053688995e-06,
"loss": 0.2874,
"step": 238
},
{
"epoch": 0.6789772727272727,
"grad_norm": 0.4339388465917976,
"learning_rate": 4.5566123154107055e-06,
"loss": 0.3374,
"step": 239
},
{
"epoch": 0.6818181818181818,
"grad_norm": 0.36030799942916975,
"learning_rate": 4.552194313628518e-06,
"loss": 0.2668,
"step": 240
},
{
"epoch": 0.6846590909090909,
"grad_norm": 0.3940718141510353,
"learning_rate": 4.547756574035311e-06,
"loss": 0.3277,
"step": 241
},
{
"epoch": 0.6875,
"grad_norm": 0.4326472723953054,
"learning_rate": 4.5432991393128446e-06,
"loss": 0.3227,
"step": 242
},
{
"epoch": 0.6903409090909091,
"grad_norm": 0.41998189617141085,
"learning_rate": 4.538822052332306e-06,
"loss": 0.339,
"step": 243
},
{
"epoch": 0.6931818181818182,
"grad_norm": 0.36510653915186314,
"learning_rate": 4.534325356153892e-06,
"loss": 0.2637,
"step": 244
},
{
"epoch": 0.6960227272727273,
"grad_norm": 0.4748073641254545,
"learning_rate": 4.529809094026404e-06,
"loss": 0.3226,
"step": 245
},
{
"epoch": 0.6988636363636364,
"grad_norm": 0.3848777680236735,
"learning_rate": 4.525273309386825e-06,
"loss": 0.3401,
"step": 246
},
{
"epoch": 0.7017045454545454,
"grad_norm": 0.286675785535149,
"learning_rate": 4.5207180458599e-06,
"loss": 0.2495,
"step": 247
},
{
"epoch": 0.7045454545454546,
"grad_norm": 0.3770143744991594,
"learning_rate": 4.516143347257726e-06,
"loss": 0.2923,
"step": 248
},
{
"epoch": 0.7073863636363636,
"grad_norm": 0.37240976329747977,
"learning_rate": 4.511549257579322e-06,
"loss": 0.2968,
"step": 249
},
{
"epoch": 0.7102272727272727,
"grad_norm": 0.53790018713925,
"learning_rate": 4.506935821010206e-06,
"loss": 0.298,
"step": 250
},
{
"epoch": 0.7130681818181818,
"grad_norm": 0.3896643010491094,
"learning_rate": 4.502303081921978e-06,
"loss": 0.3125,
"step": 251
},
{
"epoch": 0.7159090909090909,
"grad_norm": 0.32770126981260167,
"learning_rate": 4.497651084871883e-06,
"loss": 0.2781,
"step": 252
},
{
"epoch": 0.71875,
"grad_norm": 0.3541924637393212,
"learning_rate": 4.492979874602389e-06,
"loss": 0.3023,
"step": 253
},
{
"epoch": 0.7215909090909091,
"grad_norm": 0.3735099253437524,
"learning_rate": 4.4882894960407566e-06,
"loss": 0.3225,
"step": 254
},
{
"epoch": 0.7244318181818182,
"grad_norm": 0.3853359485269271,
"learning_rate": 4.483579994298602e-06,
"loss": 0.3119,
"step": 255
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.4232262055395998,
"learning_rate": 4.478851414671469e-06,
"loss": 0.2996,
"step": 256
},
{
"epoch": 0.7301136363636364,
"grad_norm": 0.3403475343187684,
"learning_rate": 4.474103802638389e-06,
"loss": 0.2948,
"step": 257
},
{
"epoch": 0.7329545454545454,
"grad_norm": 0.4197482437210073,
"learning_rate": 4.469337203861447e-06,
"loss": 0.2999,
"step": 258
},
{
"epoch": 0.7357954545454546,
"grad_norm": 0.33941700168906186,
"learning_rate": 4.464551664185339e-06,
"loss": 0.2636,
"step": 259
},
{
"epoch": 0.7386363636363636,
"grad_norm": 0.35067662494508334,
"learning_rate": 4.459747229636933e-06,
"loss": 0.3153,
"step": 260
},
{
"epoch": 0.7414772727272727,
"grad_norm": 0.33432839847763335,
"learning_rate": 4.454923946424827e-06,
"loss": 0.2646,
"step": 261
},
{
"epoch": 0.7443181818181818,
"grad_norm": 0.3486384565640427,
"learning_rate": 4.450081860938904e-06,
"loss": 0.3026,
"step": 262
},
{
"epoch": 0.7471590909090909,
"grad_norm": 0.3647193452879592,
"learning_rate": 4.4452210197498845e-06,
"loss": 0.3208,
"step": 263
},
{
"epoch": 0.75,
"grad_norm": 0.3621939393169193,
"learning_rate": 4.440341469608879e-06,
"loss": 0.3042,
"step": 264
},
{
"epoch": 0.7528409090909091,
"grad_norm": 0.2856803312521231,
"learning_rate": 4.43544325744694e-06,
"loss": 0.2548,
"step": 265
},
{
"epoch": 0.7556818181818182,
"grad_norm": 0.41636147550676134,
"learning_rate": 4.4305264303746085e-06,
"loss": 0.2743,
"step": 266
},
{
"epoch": 0.7585227272727273,
"grad_norm": 0.3149004485762251,
"learning_rate": 4.425591035681465e-06,
"loss": 0.2768,
"step": 267
},
{
"epoch": 0.7613636363636364,
"grad_norm": 0.39793987625802313,
"learning_rate": 4.420637120835668e-06,
"loss": 0.3055,
"step": 268
},
{
"epoch": 0.7642045454545454,
"grad_norm": 0.4058178861459375,
"learning_rate": 4.415664733483502e-06,
"loss": 0.3168,
"step": 269
},
{
"epoch": 0.7670454545454546,
"grad_norm": 0.3732878019312248,
"learning_rate": 4.4106739214489195e-06,
"loss": 0.2935,
"step": 270
},
{
"epoch": 0.7698863636363636,
"grad_norm": 0.31801887671340195,
"learning_rate": 4.405664732733079e-06,
"loss": 0.2768,
"step": 271
},
{
"epoch": 0.7727272727272727,
"grad_norm": 0.43538965465048635,
"learning_rate": 4.400637215513883e-06,
"loss": 0.2644,
"step": 272
},
{
"epoch": 0.7755681818181818,
"grad_norm": 0.3619890541849985,
"learning_rate": 4.395591418145519e-06,
"loss": 0.2671,
"step": 273
},
{
"epoch": 0.7784090909090909,
"grad_norm": 0.43611885998338823,
"learning_rate": 4.390527389157989e-06,
"loss": 0.3481,
"step": 274
},
{
"epoch": 0.78125,
"grad_norm": 0.411038314679305,
"learning_rate": 4.385445177256646e-06,
"loss": 0.3283,
"step": 275
},
{
"epoch": 0.7840909090909091,
"grad_norm": 0.4004177118376606,
"learning_rate": 4.380344831321722e-06,
"loss": 0.3421,
"step": 276
},
{
"epoch": 0.7869318181818182,
"grad_norm": 0.31547958031028983,
"learning_rate": 4.375226400407863e-06,
"loss": 0.2541,
"step": 277
},
{
"epoch": 0.7897727272727273,
"grad_norm": 0.36900762280860266,
"learning_rate": 4.370089933743654e-06,
"loss": 0.3097,
"step": 278
},
{
"epoch": 0.7926136363636364,
"grad_norm": 0.4686945698836896,
"learning_rate": 4.364935480731147e-06,
"loss": 0.2918,
"step": 279
},
{
"epoch": 0.7954545454545454,
"grad_norm": 0.3509902009735286,
"learning_rate": 4.3597630909453835e-06,
"loss": 0.2646,
"step": 280
},
{
"epoch": 0.7982954545454546,
"grad_norm": 0.30875325359327965,
"learning_rate": 4.35457281413392e-06,
"loss": 0.2349,
"step": 281
},
{
"epoch": 0.8011363636363636,
"grad_norm": 0.3943745151294021,
"learning_rate": 4.349364700216346e-06,
"loss": 0.2764,
"step": 282
},
{
"epoch": 0.8039772727272727,
"grad_norm": 0.35558604531483284,
"learning_rate": 4.344138799283814e-06,
"loss": 0.2442,
"step": 283
},
{
"epoch": 0.8068181818181818,
"grad_norm": 0.38278211936173095,
"learning_rate": 4.338895161598541e-06,
"loss": 0.3294,
"step": 284
},
{
"epoch": 0.8096590909090909,
"grad_norm": 0.3932974746294698,
"learning_rate": 4.333633837593341e-06,
"loss": 0.2951,
"step": 285
},
{
"epoch": 0.8125,
"grad_norm": 0.31762648150994005,
"learning_rate": 4.328354877871131e-06,
"loss": 0.2612,
"step": 286
},
{
"epoch": 0.8153409090909091,
"grad_norm": 0.3405862130473983,
"learning_rate": 4.323058333204446e-06,
"loss": 0.2833,
"step": 287
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.31883855959276614,
"learning_rate": 4.317744254534954e-06,
"loss": 0.2609,
"step": 288
},
{
"epoch": 0.8210227272727273,
"grad_norm": 0.39913277335187336,
"learning_rate": 4.312412692972959e-06,
"loss": 0.2758,
"step": 289
},
{
"epoch": 0.8238636363636364,
"grad_norm": 0.39064418227258985,
"learning_rate": 4.307063699796918e-06,
"loss": 0.2664,
"step": 290
},
{
"epoch": 0.8267045454545454,
"grad_norm": 0.3126978473531618,
"learning_rate": 4.301697326452942e-06,
"loss": 0.2572,
"step": 291
},
{
"epoch": 0.8295454545454546,
"grad_norm": 0.3641340405050646,
"learning_rate": 4.296313624554303e-06,
"loss": 0.286,
"step": 292
},
{
"epoch": 0.8323863636363636,
"grad_norm": 0.4168496899263259,
"learning_rate": 4.290912645880936e-06,
"loss": 0.3035,
"step": 293
},
{
"epoch": 0.8352272727272727,
"grad_norm": 0.3466683321895305,
"learning_rate": 4.285494442378945e-06,
"loss": 0.2853,
"step": 294
},
{
"epoch": 0.8380681818181818,
"grad_norm": 0.3572355149237221,
"learning_rate": 4.280059066160098e-06,
"loss": 0.3021,
"step": 295
},
{
"epoch": 0.8409090909090909,
"grad_norm": 0.36054386776426756,
"learning_rate": 4.274606569501332e-06,
"loss": 0.3041,
"step": 296
},
{
"epoch": 0.84375,
"grad_norm": 0.3220431488871405,
"learning_rate": 4.269137004844242e-06,
"loss": 0.2542,
"step": 297
},
{
"epoch": 0.8465909090909091,
"grad_norm": 0.4103185848899213,
"learning_rate": 4.2636504247945865e-06,
"loss": 0.2859,
"step": 298
},
{
"epoch": 0.8494318181818182,
"grad_norm": 0.3444474167498623,
"learning_rate": 4.258146882121772e-06,
"loss": 0.3082,
"step": 299
},
{
"epoch": 0.8522727272727273,
"grad_norm": 0.35145064032825696,
"learning_rate": 4.252626429758354e-06,
"loss": 0.2679,
"step": 300
},
{
"epoch": 0.8551136363636364,
"grad_norm": 0.39931471518127176,
"learning_rate": 4.247089120799521e-06,
"loss": 0.3486,
"step": 301
},
{
"epoch": 0.8579545454545454,
"grad_norm": 0.2860970262972797,
"learning_rate": 4.241535008502587e-06,
"loss": 0.23,
"step": 302
},
{
"epoch": 0.8607954545454546,
"grad_norm": 0.4649020596412495,
"learning_rate": 4.235964146286479e-06,
"loss": 0.3252,
"step": 303
},
{
"epoch": 0.8636363636363636,
"grad_norm": 0.3482820070437071,
"learning_rate": 4.230376587731225e-06,
"loss": 0.2854,
"step": 304
},
{
"epoch": 0.8664772727272727,
"grad_norm": 0.3269410990279316,
"learning_rate": 4.2247723865774336e-06,
"loss": 0.2563,
"step": 305
},
{
"epoch": 0.8693181818181818,
"grad_norm": 0.31949294830520775,
"learning_rate": 4.219151596725782e-06,
"loss": 0.2688,
"step": 306
},
{
"epoch": 0.8721590909090909,
"grad_norm": 0.43502447469171057,
"learning_rate": 4.213514272236499e-06,
"loss": 0.3386,
"step": 307
},
{
"epoch": 0.875,
"grad_norm": 0.3797117211719601,
"learning_rate": 4.207860467328835e-06,
"loss": 0.2855,
"step": 308
},
{
"epoch": 0.8778409090909091,
"grad_norm": 0.3799062699361923,
"learning_rate": 4.202190236380552e-06,
"loss": 0.2545,
"step": 309
},
{
"epoch": 0.8806818181818182,
"grad_norm": 0.3360385792661154,
"learning_rate": 4.196503633927398e-06,
"loss": 0.2909,
"step": 310
},
{
"epoch": 0.8835227272727273,
"grad_norm": 0.4188943106281552,
"learning_rate": 4.190800714662576e-06,
"loss": 0.3291,
"step": 311
},
{
"epoch": 0.8863636363636364,
"grad_norm": 0.43183074366157487,
"learning_rate": 4.185081533436226e-06,
"loss": 0.3303,
"step": 312
},
{
"epoch": 0.8892045454545454,
"grad_norm": 0.35087669084397133,
"learning_rate": 4.179346145254892e-06,
"loss": 0.3152,
"step": 313
},
{
"epoch": 0.8920454545454546,
"grad_norm": 0.34360678080641915,
"learning_rate": 4.173594605280995e-06,
"loss": 0.2726,
"step": 314
},
{
"epoch": 0.8948863636363636,
"grad_norm": 0.39638626020449463,
"learning_rate": 4.1678269688323045e-06,
"loss": 0.3369,
"step": 315
},
{
"epoch": 0.8977272727272727,
"grad_norm": 0.3566510725505037,
"learning_rate": 4.1620432913814026e-06,
"loss": 0.2469,
"step": 316
},
{
"epoch": 0.9005681818181818,
"grad_norm": 0.32842562735745623,
"learning_rate": 4.156243628555151e-06,
"loss": 0.3018,
"step": 317
},
{
"epoch": 0.9034090909090909,
"grad_norm": 0.30679142774263857,
"learning_rate": 4.150428036134161e-06,
"loss": 0.2476,
"step": 318
},
{
"epoch": 0.90625,
"grad_norm": 0.38736943533330265,
"learning_rate": 4.144596570052249e-06,
"loss": 0.279,
"step": 319
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.3461038914128392,
"learning_rate": 4.1387492863959076e-06,
"loss": 0.262,
"step": 320
},
{
"epoch": 0.9119318181818182,
"grad_norm": 0.3328949084965424,
"learning_rate": 4.132886241403756e-06,
"loss": 0.2841,
"step": 321
},
{
"epoch": 0.9147727272727273,
"grad_norm": 0.3684252037786764,
"learning_rate": 4.127007491466008e-06,
"loss": 0.3032,
"step": 322
},
{
"epoch": 0.9176136363636364,
"grad_norm": 0.44163540100916987,
"learning_rate": 4.121113093123925e-06,
"loss": 0.3164,
"step": 323
},
{
"epoch": 0.9204545454545454,
"grad_norm": 0.49048074141989995,
"learning_rate": 4.115203103069273e-06,
"loss": 0.2623,
"step": 324
},
{
"epoch": 0.9232954545454546,
"grad_norm": 0.34827477492871306,
"learning_rate": 4.109277578143779e-06,
"loss": 0.2717,
"step": 325
},
{
"epoch": 0.9261363636363636,
"grad_norm": 0.3603610666299997,
"learning_rate": 4.10333657533858e-06,
"loss": 0.2783,
"step": 326
},
{
"epoch": 0.9289772727272727,
"grad_norm": 0.3901080384564019,
"learning_rate": 4.097380151793681e-06,
"loss": 0.286,
"step": 327
},
{
"epoch": 0.9318181818181818,
"grad_norm": 0.3598672604385207,
"learning_rate": 4.0914083647974025e-06,
"loss": 0.3375,
"step": 328
},
{
"epoch": 0.9346590909090909,
"grad_norm": 0.32775404856254314,
"learning_rate": 4.085421271785824e-06,
"loss": 0.2904,
"step": 329
},
{
"epoch": 0.9375,
"grad_norm": 0.29442351680114387,
"learning_rate": 4.079418930342243e-06,
"loss": 0.2629,
"step": 330
},
{
"epoch": 0.9403409090909091,
"grad_norm": 0.4405796100351076,
"learning_rate": 4.0734013981966125e-06,
"loss": 0.3665,
"step": 331
},
{
"epoch": 0.9431818181818182,
"grad_norm": 0.3334068109525356,
"learning_rate": 4.0673687332249866e-06,
"loss": 0.3079,
"step": 332
},
{
"epoch": 0.9460227272727273,
"grad_norm": 0.32669985590044703,
"learning_rate": 4.061320993448968e-06,
"loss": 0.2904,
"step": 333
},
{
"epoch": 0.9488636363636364,
"grad_norm": 0.3442146928076968,
"learning_rate": 4.055258237035146e-06,
"loss": 0.3146,
"step": 334
},
{
"epoch": 0.9517045454545454,
"grad_norm": 0.4309052746676042,
"learning_rate": 4.04918052229454e-06,
"loss": 0.3446,
"step": 335
},
{
"epoch": 0.9545454545454546,
"grad_norm": 0.35908542610160016,
"learning_rate": 4.043087907682035e-06,
"loss": 0.2534,
"step": 336
},
{
"epoch": 0.9573863636363636,
"grad_norm": 0.3894188962377372,
"learning_rate": 4.036980451795822e-06,
"loss": 0.3262,
"step": 337
},
{
"epoch": 0.9602272727272727,
"grad_norm": 0.37392061032103363,
"learning_rate": 4.030858213376838e-06,
"loss": 0.3158,
"step": 338
},
{
"epoch": 0.9630681818181818,
"grad_norm": 0.3880624083667109,
"learning_rate": 4.02472125130819e-06,
"loss": 0.2908,
"step": 339
},
{
"epoch": 0.9659090909090909,
"grad_norm": 0.4031632690009814,
"learning_rate": 4.018569624614602e-06,
"loss": 0.3279,
"step": 340
},
{
"epoch": 0.96875,
"grad_norm": 0.38583919245780574,
"learning_rate": 4.012403392461837e-06,
"loss": 0.2657,
"step": 341
},
{
"epoch": 0.9715909090909091,
"grad_norm": 0.4657940346556613,
"learning_rate": 4.006222614156132e-06,
"loss": 0.3176,
"step": 342
},
{
"epoch": 0.9744318181818182,
"grad_norm": 0.28406132307929355,
"learning_rate": 4.000027349143633e-06,
"loss": 0.2261,
"step": 343
},
{
"epoch": 0.9772727272727273,
"grad_norm": 0.3809447081607224,
"learning_rate": 3.993817657009808e-06,
"loss": 0.291,
"step": 344
},
{
"epoch": 0.9801136363636364,
"grad_norm": 0.37276416289236974,
"learning_rate": 3.987593597478894e-06,
"loss": 0.3229,
"step": 345
},
{
"epoch": 0.9829545454545454,
"grad_norm": 0.36213806018136363,
"learning_rate": 3.981355230413305e-06,
"loss": 0.2785,
"step": 346
},
{
"epoch": 0.9857954545454546,
"grad_norm": 0.3774008729788378,
"learning_rate": 3.975102615813068e-06,
"loss": 0.272,
"step": 347
},
{
"epoch": 0.9886363636363636,
"grad_norm": 0.3268419464248498,
"learning_rate": 3.968835813815236e-06,
"loss": 0.2468,
"step": 348
},
{
"epoch": 0.9914772727272727,
"grad_norm": 0.401670934547313,
"learning_rate": 3.962554884693323e-06,
"loss": 0.2953,
"step": 349
},
{
"epoch": 0.9943181818181818,
"grad_norm": 0.40169610324443583,
"learning_rate": 3.956259888856708e-06,
"loss": 0.2939,
"step": 350
},
{
"epoch": 0.9971590909090909,
"grad_norm": 0.2891600640815435,
"learning_rate": 3.949950886850069e-06,
"loss": 0.2805,
"step": 351
},
{
"epoch": 1.0,
"grad_norm": 0.3279215818041681,
"learning_rate": 3.943627939352789e-06,
"loss": 0.2598,
"step": 352
},
{
"epoch": 1.0028409090909092,
"grad_norm": 0.3533913319935541,
"learning_rate": 3.9372911071783805e-06,
"loss": 0.2673,
"step": 353
},
{
"epoch": 1.0056818181818181,
"grad_norm": 0.38416565428145066,
"learning_rate": 3.930940451273898e-06,
"loss": 0.2933,
"step": 354
},
{
"epoch": 1.0085227272727273,
"grad_norm": 0.41220420942768127,
"learning_rate": 3.924576032719349e-06,
"loss": 0.2952,
"step": 355
},
{
"epoch": 1.0113636363636365,
"grad_norm": 0.4096268298831798,
"learning_rate": 3.9181979127271076e-06,
"loss": 0.2575,
"step": 356
},
{
"epoch": 1.0142045454545454,
"grad_norm": 0.45379315898269595,
"learning_rate": 3.911806152641333e-06,
"loss": 0.2717,
"step": 357
},
{
"epoch": 1.0170454545454546,
"grad_norm": 0.32770827000624236,
"learning_rate": 3.9054008139373675e-06,
"loss": 0.266,
"step": 358
},
{
"epoch": 1.0198863636363635,
"grad_norm": 0.2965104343367262,
"learning_rate": 3.8989819582211555e-06,
"loss": 0.2548,
"step": 359
},
{
"epoch": 1.0227272727272727,
"grad_norm": 0.4054461782711258,
"learning_rate": 3.892549647228642e-06,
"loss": 0.3398,
"step": 360
},
{
"epoch": 1.0255681818181819,
"grad_norm": 0.39022556113460055,
"learning_rate": 3.886103942825189e-06,
"loss": 0.2826,
"step": 361
},
{
"epoch": 1.0284090909090908,
"grad_norm": 0.3374532413491821,
"learning_rate": 3.879644907004972e-06,
"loss": 0.2644,
"step": 362
},
{
"epoch": 1.03125,
"grad_norm": 0.337718457045594,
"learning_rate": 3.873172601890386e-06,
"loss": 0.2545,
"step": 363
},
{
"epoch": 1.0340909090909092,
"grad_norm": 0.3729922751436951,
"learning_rate": 3.86668708973145e-06,
"loss": 0.2951,
"step": 364
},
{
"epoch": 1.0369318181818181,
"grad_norm": 0.31238473142978845,
"learning_rate": 3.860188432905209e-06,
"loss": 0.2537,
"step": 365
},
{
"epoch": 1.0397727272727273,
"grad_norm": 0.37350151083829397,
"learning_rate": 3.853676693915129e-06,
"loss": 0.2614,
"step": 366
},
{
"epoch": 1.0426136363636365,
"grad_norm": 0.3575634359205247,
"learning_rate": 3.8471519353905025e-06,
"loss": 0.2437,
"step": 367
},
{
"epoch": 1.0454545454545454,
"grad_norm": 0.3537757819725644,
"learning_rate": 3.840614220085837e-06,
"loss": 0.2747,
"step": 368
},
{
"epoch": 1.0482954545454546,
"grad_norm": 0.34943668518465093,
"learning_rate": 3.834063610880263e-06,
"loss": 0.2844,
"step": 369
},
{
"epoch": 1.0511363636363635,
"grad_norm": 0.32611370130766987,
"learning_rate": 3.827500170776921e-06,
"loss": 0.2578,
"step": 370
},
{
"epoch": 1.0539772727272727,
"grad_norm": 0.29743321074762596,
"learning_rate": 3.8209239629023565e-06,
"loss": 0.2361,
"step": 371
},
{
"epoch": 1.0568181818181819,
"grad_norm": 0.3317934285561481,
"learning_rate": 3.814335050505916e-06,
"loss": 0.2645,
"step": 372
},
{
"epoch": 1.0596590909090908,
"grad_norm": 0.40729226447208133,
"learning_rate": 3.8077334969591377e-06,
"loss": 0.2929,
"step": 373
},
{
"epoch": 1.0625,
"grad_norm": 0.35583822537265253,
"learning_rate": 3.801119365755138e-06,
"loss": 0.3036,
"step": 374
},
{
"epoch": 1.0653409090909092,
"grad_norm": 0.47116931222172215,
"learning_rate": 3.7944927205080073e-06,
"loss": 0.2962,
"step": 375
},
{
"epoch": 1.0681818181818181,
"grad_norm": 0.4620500786524589,
"learning_rate": 3.7878536249521935e-06,
"loss": 0.3186,
"step": 376
},
{
"epoch": 1.0710227272727273,
"grad_norm": 0.4310223125222202,
"learning_rate": 3.7812021429418886e-06,
"loss": 0.305,
"step": 377
},
{
"epoch": 1.0738636363636365,
"grad_norm": 0.35860375920691345,
"learning_rate": 3.77453833845042e-06,
"loss": 0.3124,
"step": 378
},
{
"epoch": 1.0767045454545454,
"grad_norm": 0.40493909967111513,
"learning_rate": 3.7678622755696292e-06,
"loss": 0.2649,
"step": 379
},
{
"epoch": 1.0795454545454546,
"grad_norm": 0.3699164344949677,
"learning_rate": 3.7611740185092587e-06,
"loss": 0.3346,
"step": 380
},
{
"epoch": 1.0823863636363635,
"grad_norm": 0.5931781411606138,
"learning_rate": 3.754473631596332e-06,
"loss": 0.2729,
"step": 381
},
{
"epoch": 1.0852272727272727,
"grad_norm": 0.3122039055630976,
"learning_rate": 3.7477611792745384e-06,
"loss": 0.2816,
"step": 382
},
{
"epoch": 1.0880681818181819,
"grad_norm": 0.35273556528651445,
"learning_rate": 3.7410367261036094e-06,
"loss": 0.2765,
"step": 383
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.361323677115818,
"learning_rate": 3.7343003367587e-06,
"loss": 0.2831,
"step": 384
},
{
"epoch": 1.09375,
"grad_norm": 0.3776789429609578,
"learning_rate": 3.727552076029767e-06,
"loss": 0.3006,
"step": 385
},
{
"epoch": 1.0965909090909092,
"grad_norm": 0.4049848534001206,
"learning_rate": 3.7207920088209454e-06,
"loss": 0.3213,
"step": 386
},
{
"epoch": 1.0994318181818181,
"grad_norm": 0.3541711790485223,
"learning_rate": 3.7140202001499214e-06,
"loss": 0.2902,
"step": 387
},
{
"epoch": 1.1022727272727273,
"grad_norm": 0.3501668624619801,
"learning_rate": 3.707236715147312e-06,
"loss": 0.2809,
"step": 388
},
{
"epoch": 1.1051136363636365,
"grad_norm": 0.38321621491594765,
"learning_rate": 3.700441619056035e-06,
"loss": 0.3163,
"step": 389
},
{
"epoch": 1.1079545454545454,
"grad_norm": 0.4044457614031915,
"learning_rate": 3.693634977230681e-06,
"loss": 0.2862,
"step": 390
},
{
"epoch": 1.1107954545454546,
"grad_norm": 0.40951036198359486,
"learning_rate": 3.686816855136891e-06,
"loss": 0.28,
"step": 391
},
{
"epoch": 1.1136363636363635,
"grad_norm": 0.29410719311103134,
"learning_rate": 3.679987318350717e-06,
"loss": 0.2299,
"step": 392
},
{
"epoch": 1.1164772727272727,
"grad_norm": 0.3819079818809885,
"learning_rate": 3.673146432557998e-06,
"loss": 0.32,
"step": 393
},
{
"epoch": 1.1193181818181819,
"grad_norm": 0.3628245440460693,
"learning_rate": 3.666294263553729e-06,
"loss": 0.2724,
"step": 394
},
{
"epoch": 1.1221590909090908,
"grad_norm": 0.34928329721642853,
"learning_rate": 3.659430877241423e-06,
"loss": 0.248,
"step": 395
},
{
"epoch": 1.125,
"grad_norm": 0.442925717620733,
"learning_rate": 3.6525563396324826e-06,
"loss": 0.2942,
"step": 396
},
{
"epoch": 1.1278409090909092,
"grad_norm": 0.4525323331322651,
"learning_rate": 3.6456707168455584e-06,
"loss": 0.3258,
"step": 397
},
{
"epoch": 1.1306818181818181,
"grad_norm": 0.39153087965891287,
"learning_rate": 3.6387740751059218e-06,
"loss": 0.3072,
"step": 398
},
{
"epoch": 1.1335227272727273,
"grad_norm": 0.3886102447660378,
"learning_rate": 3.6318664807448218e-06,
"loss": 0.3415,
"step": 399
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.3642339507412296,
"learning_rate": 3.6249480001988463e-06,
"loss": 0.2691,
"step": 400
},
{
"epoch": 1.1392045454545454,
"grad_norm": 0.3380651370346197,
"learning_rate": 3.6180187000092894e-06,
"loss": 0.2791,
"step": 401
},
{
"epoch": 1.1420454545454546,
"grad_norm": 0.3193544491508243,
"learning_rate": 3.611078646821505e-06,
"loss": 0.2326,
"step": 402
},
{
"epoch": 1.1448863636363638,
"grad_norm": 0.30524333443799656,
"learning_rate": 3.6041279073842684e-06,
"loss": 0.2489,
"step": 403
},
{
"epoch": 1.1477272727272727,
"grad_norm": 0.39683144371135337,
"learning_rate": 3.597166548549136e-06,
"loss": 0.2656,
"step": 404
},
{
"epoch": 1.1505681818181819,
"grad_norm": 0.39975422805218463,
"learning_rate": 3.590194637269798e-06,
"loss": 0.2823,
"step": 405
},
{
"epoch": 1.1534090909090908,
"grad_norm": 0.3781718281788356,
"learning_rate": 3.5832122406014398e-06,
"loss": 0.2545,
"step": 406
},
{
"epoch": 1.15625,
"grad_norm": 0.39633632407524205,
"learning_rate": 3.576219425700092e-06,
"loss": 0.2656,
"step": 407
},
{
"epoch": 1.1590909090909092,
"grad_norm": 0.503126670284463,
"learning_rate": 3.5692162598219877e-06,
"loss": 0.3106,
"step": 408
},
{
"epoch": 1.1619318181818181,
"grad_norm": 0.3803993289484403,
"learning_rate": 3.5622028103229154e-06,
"loss": 0.2777,
"step": 409
},
{
"epoch": 1.1647727272727273,
"grad_norm": 0.32896270814306483,
"learning_rate": 3.555179144657568e-06,
"loss": 0.2681,
"step": 410
},
{
"epoch": 1.1676136363636362,
"grad_norm": 0.45079184347220275,
"learning_rate": 3.548145330378901e-06,
"loss": 0.298,
"step": 411
},
{
"epoch": 1.1704545454545454,
"grad_norm": 0.3409745563125651,
"learning_rate": 3.5411014351374735e-06,
"loss": 0.2829,
"step": 412
},
{
"epoch": 1.1732954545454546,
"grad_norm": 0.3524051821269997,
"learning_rate": 3.5340475266808046e-06,
"loss": 0.2897,
"step": 413
},
{
"epoch": 1.1761363636363638,
"grad_norm": 0.31354296956532873,
"learning_rate": 3.5269836728527194e-06,
"loss": 0.2512,
"step": 414
},
{
"epoch": 1.1789772727272727,
"grad_norm": 0.2819333444591201,
"learning_rate": 3.5199099415926985e-06,
"loss": 0.2336,
"step": 415
},
{
"epoch": 1.1818181818181819,
"grad_norm": 0.3667062945127836,
"learning_rate": 3.5128264009352177e-06,
"loss": 0.2797,
"step": 416
},
{
"epoch": 1.1846590909090908,
"grad_norm": 0.3717065816803459,
"learning_rate": 3.5057331190091036e-06,
"loss": 0.2625,
"step": 417
},
{
"epoch": 1.1875,
"grad_norm": 0.34247191523071263,
"learning_rate": 3.4986301640368726e-06,
"loss": 0.2915,
"step": 418
},
{
"epoch": 1.1903409090909092,
"grad_norm": 0.28055115946196074,
"learning_rate": 3.4915176043340726e-06,
"loss": 0.2323,
"step": 419
},
{
"epoch": 1.1931818181818181,
"grad_norm": 0.3512617852047132,
"learning_rate": 3.4843955083086315e-06,
"loss": 0.276,
"step": 420
},
{
"epoch": 1.1960227272727273,
"grad_norm": 0.3402592655838616,
"learning_rate": 3.477263944460196e-06,
"loss": 0.258,
"step": 421
},
{
"epoch": 1.1988636363636362,
"grad_norm": 0.3440775197912379,
"learning_rate": 3.4701229813794744e-06,
"loss": 0.2686,
"step": 422
},
{
"epoch": 1.2017045454545454,
"grad_norm": 0.32159613738142184,
"learning_rate": 3.4629726877475733e-06,
"loss": 0.2775,
"step": 423
},
{
"epoch": 1.2045454545454546,
"grad_norm": 0.3405153808986929,
"learning_rate": 3.4558131323353423e-06,
"loss": 0.2947,
"step": 424
},
{
"epoch": 1.2073863636363638,
"grad_norm": 0.4111884872726661,
"learning_rate": 3.4486443840027084e-06,
"loss": 0.2427,
"step": 425
},
{
"epoch": 1.2102272727272727,
"grad_norm": 0.38692560086654654,
"learning_rate": 3.4414665116980167e-06,
"loss": 0.3084,
"step": 426
},
{
"epoch": 1.2130681818181819,
"grad_norm": 0.4000466884476275,
"learning_rate": 3.4342795844573634e-06,
"loss": 0.2933,
"step": 427
},
{
"epoch": 1.2159090909090908,
"grad_norm": 0.3605831840618787,
"learning_rate": 3.427083671403937e-06,
"loss": 0.2892,
"step": 428
},
{
"epoch": 1.21875,
"grad_norm": 0.3225439729294941,
"learning_rate": 3.4198788417473485e-06,
"loss": 0.2579,
"step": 429
},
{
"epoch": 1.2215909090909092,
"grad_norm": 0.3869565428112392,
"learning_rate": 3.41266516478297e-06,
"loss": 0.3349,
"step": 430
},
{
"epoch": 1.2244318181818181,
"grad_norm": 0.3790938940448294,
"learning_rate": 3.4054427098912636e-06,
"loss": 0.2836,
"step": 431
},
{
"epoch": 1.2272727272727273,
"grad_norm": 0.33485764653621325,
"learning_rate": 3.3982115465371185e-06,
"loss": 0.2465,
"step": 432
},
{
"epoch": 1.2301136363636362,
"grad_norm": 0.3421027182025914,
"learning_rate": 3.390971744269181e-06,
"loss": 0.2436,
"step": 433
},
{
"epoch": 1.2329545454545454,
"grad_norm": 0.3343569283936874,
"learning_rate": 3.3837233727191856e-06,
"loss": 0.2533,
"step": 434
},
{
"epoch": 1.2357954545454546,
"grad_norm": 0.3490337805677148,
"learning_rate": 3.3764665016012842e-06,
"loss": 0.2401,
"step": 435
},
{
"epoch": 1.2386363636363638,
"grad_norm": 0.3116736362955648,
"learning_rate": 3.3692012007113776e-06,
"loss": 0.2482,
"step": 436
},
{
"epoch": 1.2414772727272727,
"grad_norm": 0.3963218536576595,
"learning_rate": 3.3619275399264444e-06,
"loss": 0.2944,
"step": 437
},
{
"epoch": 1.2443181818181819,
"grad_norm": 0.39432480274886955,
"learning_rate": 3.3546455892038666e-06,
"loss": 0.2918,
"step": 438
},
{
"epoch": 1.2471590909090908,
"grad_norm": 0.3775480283393243,
"learning_rate": 3.3473554185807573e-06,
"loss": 0.2771,
"step": 439
},
{
"epoch": 1.25,
"grad_norm": 0.34490450741107803,
"learning_rate": 3.340057098173288e-06,
"loss": 0.2756,
"step": 440
},
{
"epoch": 1.2528409090909092,
"grad_norm": 0.3324905873722346,
"learning_rate": 3.3327506981760183e-06,
"loss": 0.2608,
"step": 441
},
{
"epoch": 1.2556818181818181,
"grad_norm": 0.47138267546166734,
"learning_rate": 3.32543628886121e-06,
"loss": 0.3077,
"step": 442
},
{
"epoch": 1.2585227272727273,
"grad_norm": 0.2953842775844083,
"learning_rate": 3.3181139405781616e-06,
"loss": 0.2377,
"step": 443
},
{
"epoch": 1.2613636363636362,
"grad_norm": 0.3612627525520785,
"learning_rate": 3.3107837237525274e-06,
"loss": 0.2427,
"step": 444
},
{
"epoch": 1.2642045454545454,
"grad_norm": 0.3653963278501932,
"learning_rate": 3.3034457088856396e-06,
"loss": 0.2559,
"step": 445
},
{
"epoch": 1.2670454545454546,
"grad_norm": 0.3129568330696853,
"learning_rate": 3.2960999665538335e-06,
"loss": 0.2534,
"step": 446
},
{
"epoch": 1.2698863636363638,
"grad_norm": 0.3510947430261117,
"learning_rate": 3.288746567407763e-06,
"loss": 0.2502,
"step": 447
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.3437157582636368,
"learning_rate": 3.281385582171727e-06,
"loss": 0.2525,
"step": 448
},
{
"epoch": 1.2755681818181819,
"grad_norm": 0.3888446263801318,
"learning_rate": 3.274017081642986e-06,
"loss": 0.2885,
"step": 449
},
{
"epoch": 1.2784090909090908,
"grad_norm": 0.35942811400817226,
"learning_rate": 3.2666411366910827e-06,
"loss": 0.2571,
"step": 450
},
{
"epoch": 1.28125,
"grad_norm": 0.41674090701769867,
"learning_rate": 3.2592578182571583e-06,
"loss": 0.2973,
"step": 451
},
{
"epoch": 1.2840909090909092,
"grad_norm": 0.3702323179560626,
"learning_rate": 3.2518671973532704e-06,
"loss": 0.2415,
"step": 452
},
{
"epoch": 1.2869318181818181,
"grad_norm": 0.36007563550430505,
"learning_rate": 3.244469345061715e-06,
"loss": 0.2277,
"step": 453
},
{
"epoch": 1.2897727272727273,
"grad_norm": 0.3914691699646844,
"learning_rate": 3.237064332534336e-06,
"loss": 0.2828,
"step": 454
},
{
"epoch": 1.2926136363636362,
"grad_norm": 0.3522104855581335,
"learning_rate": 3.229652230991843e-06,
"loss": 0.2671,
"step": 455
},
{
"epoch": 1.2954545454545454,
"grad_norm": 0.3553148108185653,
"learning_rate": 3.2222331117231283e-06,
"loss": 0.2817,
"step": 456
},
{
"epoch": 1.2982954545454546,
"grad_norm": 0.3771227330111479,
"learning_rate": 3.2148070460845814e-06,
"loss": 0.274,
"step": 457
},
{
"epoch": 1.3011363636363638,
"grad_norm": 0.41388528735027136,
"learning_rate": 3.2073741054994e-06,
"loss": 0.3181,
"step": 458
},
{
"epoch": 1.3039772727272727,
"grad_norm": 0.33865063205260826,
"learning_rate": 3.199934361456903e-06,
"loss": 0.2634,
"step": 459
},
{
"epoch": 1.3068181818181819,
"grad_norm": 0.3520115660135833,
"learning_rate": 3.1924878855118475e-06,
"loss": 0.2618,
"step": 460
},
{
"epoch": 1.3096590909090908,
"grad_norm": 0.40034402955639337,
"learning_rate": 3.185034749283734e-06,
"loss": 0.2837,
"step": 461
},
{
"epoch": 1.3125,
"grad_norm": 0.34422942980117177,
"learning_rate": 3.1775750244561233e-06,
"loss": 0.2638,
"step": 462
},
{
"epoch": 1.3153409090909092,
"grad_norm": 0.38963794033279253,
"learning_rate": 3.1701087827759434e-06,
"loss": 0.294,
"step": 463
},
{
"epoch": 1.3181818181818181,
"grad_norm": 0.4262376192411251,
"learning_rate": 3.162636096052803e-06,
"loss": 0.3342,
"step": 464
},
{
"epoch": 1.3210227272727273,
"grad_norm": 0.38196782588004025,
"learning_rate": 3.155157036158295e-06,
"loss": 0.281,
"step": 465
},
{
"epoch": 1.3238636363636362,
"grad_norm": 0.39128577037723217,
"learning_rate": 3.147671675025313e-06,
"loss": 0.2864,
"step": 466
},
{
"epoch": 1.3267045454545454,
"grad_norm": 0.3622238856754979,
"learning_rate": 3.1401800846473506e-06,
"loss": 0.2742,
"step": 467
},
{
"epoch": 1.3295454545454546,
"grad_norm": 0.3187408313823274,
"learning_rate": 3.132682337077818e-06,
"loss": 0.2549,
"step": 468
},
{
"epoch": 1.3323863636363638,
"grad_norm": 0.33256196577073566,
"learning_rate": 3.1251785044293425e-06,
"loss": 0.2921,
"step": 469
},
{
"epoch": 1.3352272727272727,
"grad_norm": 0.377119549478706,
"learning_rate": 3.117668658873078e-06,
"loss": 0.2722,
"step": 470
},
{
"epoch": 1.3380681818181819,
"grad_norm": 0.31419013026351733,
"learning_rate": 3.1101528726380085e-06,
"loss": 0.2519,
"step": 471
},
{
"epoch": 1.3409090909090908,
"grad_norm": 0.3471415869479363,
"learning_rate": 3.102631218010257e-06,
"loss": 0.2817,
"step": 472
},
{
"epoch": 1.34375,
"grad_norm": 0.37953158089107286,
"learning_rate": 3.0951037673323863e-06,
"loss": 0.2642,
"step": 473
},
{
"epoch": 1.3465909090909092,
"grad_norm": 0.34488245509452714,
"learning_rate": 3.0875705930027065e-06,
"loss": 0.2499,
"step": 474
},
{
"epoch": 1.3494318181818181,
"grad_norm": 0.29818790329911665,
"learning_rate": 3.0800317674745755e-06,
"loss": 0.2572,
"step": 475
},
{
"epoch": 1.3522727272727273,
"grad_norm": 0.35582979006101406,
"learning_rate": 3.0724873632557068e-06,
"loss": 0.2806,
"step": 476
},
{
"epoch": 1.3551136363636362,
"grad_norm": 0.3886707765043663,
"learning_rate": 3.064937452907465e-06,
"loss": 0.2395,
"step": 477
},
{
"epoch": 1.3579545454545454,
"grad_norm": 0.39452409132776717,
"learning_rate": 3.057382109044177e-06,
"loss": 0.2748,
"step": 478
},
{
"epoch": 1.3607954545454546,
"grad_norm": 0.34362558608870675,
"learning_rate": 3.049821404332424e-06,
"loss": 0.2664,
"step": 479
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.3923547533127044,
"learning_rate": 3.0422554114903514e-06,
"loss": 0.3134,
"step": 480
},
{
"epoch": 1.3664772727272727,
"grad_norm": 0.42311598203108824,
"learning_rate": 3.0346842032869624e-06,
"loss": 0.3227,
"step": 481
},
{
"epoch": 1.3693181818181819,
"grad_norm": 0.49341501720924236,
"learning_rate": 3.0271078525414234e-06,
"loss": 0.2789,
"step": 482
},
{
"epoch": 1.3721590909090908,
"grad_norm": 0.3923870792288359,
"learning_rate": 3.0195264321223584e-06,
"loss": 0.3003,
"step": 483
},
{
"epoch": 1.375,
"grad_norm": 0.5047411107384405,
"learning_rate": 3.0119400149471535e-06,
"loss": 0.2835,
"step": 484
},
{
"epoch": 1.3778409090909092,
"grad_norm": 0.3431083613633404,
"learning_rate": 3.004348673981252e-06,
"loss": 0.2744,
"step": 485
},
{
"epoch": 1.3806818181818181,
"grad_norm": 0.3370392701002557,
"learning_rate": 2.996752482237456e-06,
"loss": 0.2503,
"step": 486
},
{
"epoch": 1.3835227272727273,
"grad_norm": 0.35789574905836263,
"learning_rate": 2.9891515127752172e-06,
"loss": 0.2558,
"step": 487
},
{
"epoch": 1.3863636363636362,
"grad_norm": 0.39542709664531145,
"learning_rate": 2.981545838699943e-06,
"loss": 0.2499,
"step": 488
},
{
"epoch": 1.3892045454545454,
"grad_norm": 0.4799271866705037,
"learning_rate": 2.9739355331622886e-06,
"loss": 0.2845,
"step": 489
},
{
"epoch": 1.3920454545454546,
"grad_norm": 0.30250300604212543,
"learning_rate": 2.966320669357453e-06,
"loss": 0.2428,
"step": 490
},
{
"epoch": 1.3948863636363638,
"grad_norm": 0.27928557627455064,
"learning_rate": 2.9587013205244767e-06,
"loss": 0.2354,
"step": 491
},
{
"epoch": 1.3977272727272727,
"grad_norm": 0.3254689902299252,
"learning_rate": 2.951077559945538e-06,
"loss": 0.2719,
"step": 492
},
{
"epoch": 1.4005681818181819,
"grad_norm": 0.38918459975286523,
"learning_rate": 2.943449460945244e-06,
"loss": 0.2726,
"step": 493
},
{
"epoch": 1.4034090909090908,
"grad_norm": 0.29871192903714955,
"learning_rate": 2.9358170968899323e-06,
"loss": 0.263,
"step": 494
},
{
"epoch": 1.40625,
"grad_norm": 0.3943630183447143,
"learning_rate": 2.9281805411869573e-06,
"loss": 0.2931,
"step": 495
},
{
"epoch": 1.4090909090909092,
"grad_norm": 0.34932644595142737,
"learning_rate": 2.920539867283992e-06,
"loss": 0.2577,
"step": 496
},
{
"epoch": 1.4119318181818181,
"grad_norm": 0.36296363929883135,
"learning_rate": 2.9128951486683144e-06,
"loss": 0.2884,
"step": 497
},
{
"epoch": 1.4147727272727273,
"grad_norm": 0.3536090241186941,
"learning_rate": 2.9052464588661076e-06,
"loss": 0.2518,
"step": 498
},
{
"epoch": 1.4176136363636362,
"grad_norm": 0.4071123114766137,
"learning_rate": 2.8975938714417466e-06,
"loss": 0.2955,
"step": 499
},
{
"epoch": 1.4204545454545454,
"grad_norm": 0.36319240545094117,
"learning_rate": 2.8899374599970943e-06,
"loss": 0.2933,
"step": 500
},
{
"epoch": 1.4232954545454546,
"grad_norm": 0.33541538203913807,
"learning_rate": 2.882277298170792e-06,
"loss": 0.2693,
"step": 501
},
{
"epoch": 1.4261363636363638,
"grad_norm": 0.42293889077814073,
"learning_rate": 2.8746134596375534e-06,
"loss": 0.2907,
"step": 502
},
{
"epoch": 1.4289772727272727,
"grad_norm": 0.3702782961580686,
"learning_rate": 2.866946018107453e-06,
"loss": 0.2701,
"step": 503
},
{
"epoch": 1.4318181818181819,
"grad_norm": 0.3454390175085058,
"learning_rate": 2.8592750473252197e-06,
"loss": 0.2612,
"step": 504
},
{
"epoch": 1.4346590909090908,
"grad_norm": 0.33107307095308464,
"learning_rate": 2.8516006210695244e-06,
"loss": 0.239,
"step": 505
},
{
"epoch": 1.4375,
"grad_norm": 0.3569062909249772,
"learning_rate": 2.843922813152275e-06,
"loss": 0.2755,
"step": 506
},
{
"epoch": 1.4403409090909092,
"grad_norm": 0.37131837135922086,
"learning_rate": 2.836241697417902e-06,
"loss": 0.2623,
"step": 507
},
{
"epoch": 1.4431818181818181,
"grad_norm": 0.3699557028893426,
"learning_rate": 2.8285573477426504e-06,
"loss": 0.2811,
"step": 508
},
{
"epoch": 1.4460227272727273,
"grad_norm": 0.33561480648358855,
"learning_rate": 2.820869838033871e-06,
"loss": 0.2686,
"step": 509
},
{
"epoch": 1.4488636363636362,
"grad_norm": 0.4711840304366533,
"learning_rate": 2.813179242229304e-06,
"loss": 0.2946,
"step": 510
},
{
"epoch": 1.4517045454545454,
"grad_norm": 0.382672820843295,
"learning_rate": 2.805485634296374e-06,
"loss": 0.2945,
"step": 511
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.3264806302650397,
"learning_rate": 2.7977890882314763e-06,
"loss": 0.2658,
"step": 512
},
{
"epoch": 1.4573863636363638,
"grad_norm": 0.3590459125833833,
"learning_rate": 2.7900896780592616e-06,
"loss": 0.2675,
"step": 513
},
{
"epoch": 1.4602272727272727,
"grad_norm": 0.41777977412669154,
"learning_rate": 2.7823874778319316e-06,
"loss": 0.3133,
"step": 514
},
{
"epoch": 1.4630681818181819,
"grad_norm": 0.3700743186678299,
"learning_rate": 2.774682561628519e-06,
"loss": 0.2781,
"step": 515
},
{
"epoch": 1.4659090909090908,
"grad_norm": 0.3586139592020702,
"learning_rate": 2.7669750035541798e-06,
"loss": 0.2709,
"step": 516
},
{
"epoch": 1.46875,
"grad_norm": 0.32668952210259083,
"learning_rate": 2.759264877739481e-06,
"loss": 0.2628,
"step": 517
},
{
"epoch": 1.4715909090909092,
"grad_norm": 0.3304970455370839,
"learning_rate": 2.7515522583396825e-06,
"loss": 0.2859,
"step": 518
},
{
"epoch": 1.4744318181818181,
"grad_norm": 0.3188860297893081,
"learning_rate": 2.74383721953403e-06,
"loss": 0.2435,
"step": 519
},
{
"epoch": 1.4772727272727273,
"grad_norm": 0.3701340525867732,
"learning_rate": 2.736119835525037e-06,
"loss": 0.2571,
"step": 520
},
{
"epoch": 1.4801136363636362,
"grad_norm": 0.4888553988204271,
"learning_rate": 2.728400180537775e-06,
"loss": 0.2956,
"step": 521
},
{
"epoch": 1.4829545454545454,
"grad_norm": 0.4110586830757001,
"learning_rate": 2.720678328819155e-06,
"loss": 0.2396,
"step": 522
},
{
"epoch": 1.4857954545454546,
"grad_norm": 0.3828799651532281,
"learning_rate": 2.712954354637218e-06,
"loss": 0.2701,
"step": 523
},
{
"epoch": 1.4886363636363638,
"grad_norm": 0.359763211121689,
"learning_rate": 2.705228332280418e-06,
"loss": 0.2387,
"step": 524
},
{
"epoch": 1.4914772727272727,
"grad_norm": 0.3785795319364518,
"learning_rate": 2.6975003360569087e-06,
"loss": 0.2761,
"step": 525
},
{
"epoch": 1.4943181818181819,
"grad_norm": 0.34255573500581615,
"learning_rate": 2.689770440293825e-06,
"loss": 0.267,
"step": 526
},
{
"epoch": 1.4971590909090908,
"grad_norm": 0.37025650452574843,
"learning_rate": 2.6820387193365764e-06,
"loss": 0.2781,
"step": 527
},
{
"epoch": 1.5,
"grad_norm": 0.35002281689988746,
"learning_rate": 2.674305247548125e-06,
"loss": 0.2947,
"step": 528
},
{
"epoch": 1.5028409090909092,
"grad_norm": 0.34143779580523753,
"learning_rate": 2.6665700993082705e-06,
"loss": 0.2658,
"step": 529
},
{
"epoch": 1.5056818181818183,
"grad_norm": 0.3560924867441854,
"learning_rate": 2.6588333490129376e-06,
"loss": 0.2742,
"step": 530
},
{
"epoch": 1.5085227272727273,
"grad_norm": 0.32295396334903814,
"learning_rate": 2.65109507107346e-06,
"loss": 0.2382,
"step": 531
},
{
"epoch": 1.5113636363636362,
"grad_norm": 0.33859114158227865,
"learning_rate": 2.6433553399158652e-06,
"loss": 0.2937,
"step": 532
},
{
"epoch": 1.5142045454545454,
"grad_norm": 0.35244369608972004,
"learning_rate": 2.6356142299801544e-06,
"loss": 0.3037,
"step": 533
},
{
"epoch": 1.5170454545454546,
"grad_norm": 0.3336662584141403,
"learning_rate": 2.6278718157195924e-06,
"loss": 0.2844,
"step": 534
},
{
"epoch": 1.5198863636363638,
"grad_norm": 0.35862845558521106,
"learning_rate": 2.620128171599989e-06,
"loss": 0.246,
"step": 535
},
{
"epoch": 1.5227272727272727,
"grad_norm": 0.31358277794725126,
"learning_rate": 2.6123833720989796e-06,
"loss": 0.2653,
"step": 536
},
{
"epoch": 1.5255681818181817,
"grad_norm": 0.36029376106362876,
"learning_rate": 2.6046374917053156e-06,
"loss": 0.2785,
"step": 537
},
{
"epoch": 1.5284090909090908,
"grad_norm": 0.3512123146788697,
"learning_rate": 2.5968906049181425e-06,
"loss": 0.2723,
"step": 538
},
{
"epoch": 1.53125,
"grad_norm": 0.35559911829983626,
"learning_rate": 2.5891427862462853e-06,
"loss": 0.2939,
"step": 539
},
{
"epoch": 1.5340909090909092,
"grad_norm": 0.3774459233336894,
"learning_rate": 2.581394110207532e-06,
"loss": 0.2593,
"step": 540
},
{
"epoch": 1.5369318181818183,
"grad_norm": 0.3213295704503383,
"learning_rate": 2.5736446513279166e-06,
"loss": 0.2615,
"step": 541
},
{
"epoch": 1.5397727272727273,
"grad_norm": 0.33894998490392014,
"learning_rate": 2.5658944841410032e-06,
"loss": 0.2856,
"step": 542
},
{
"epoch": 1.5426136363636362,
"grad_norm": 0.4085808452620872,
"learning_rate": 2.5581436831871666e-06,
"loss": 0.2611,
"step": 543
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.3377548562078041,
"learning_rate": 2.5503923230128787e-06,
"loss": 0.2445,
"step": 544
},
{
"epoch": 1.5482954545454546,
"grad_norm": 0.2986016210832829,
"learning_rate": 2.5426404781699886e-06,
"loss": 0.2345,
"step": 545
},
{
"epoch": 1.5511363636363638,
"grad_norm": 0.3130189679053128,
"learning_rate": 2.534888223215008e-06,
"loss": 0.2648,
"step": 546
},
{
"epoch": 1.5539772727272727,
"grad_norm": 0.29362772394820585,
"learning_rate": 2.5271356327083927e-06,
"loss": 0.2231,
"step": 547
},
{
"epoch": 1.5568181818181817,
"grad_norm": 0.3371287342113354,
"learning_rate": 2.5193827812138268e-06,
"loss": 0.2801,
"step": 548
},
{
"epoch": 1.5596590909090908,
"grad_norm": 0.438680590348071,
"learning_rate": 2.511629743297502e-06,
"loss": 0.3117,
"step": 549
},
{
"epoch": 1.5625,
"grad_norm": 0.3623332826643985,
"learning_rate": 2.5038765935274038e-06,
"loss": 0.2582,
"step": 550
},
{
"epoch": 1.5653409090909092,
"grad_norm": 0.3611764461964591,
"learning_rate": 2.4961234064725966e-06,
"loss": 0.2606,
"step": 551
},
{
"epoch": 1.5681818181818183,
"grad_norm": 0.6683755911265977,
"learning_rate": 2.488370256702499e-06,
"loss": 0.2686,
"step": 552
},
{
"epoch": 1.5710227272727273,
"grad_norm": 0.3699878510363697,
"learning_rate": 2.4806172187861736e-06,
"loss": 0.2823,
"step": 553
},
{
"epoch": 1.5738636363636362,
"grad_norm": 0.3603575134404355,
"learning_rate": 2.4728643672916073e-06,
"loss": 0.2696,
"step": 554
},
{
"epoch": 1.5767045454545454,
"grad_norm": 0.5708462895257692,
"learning_rate": 2.465111776784993e-06,
"loss": 0.3003,
"step": 555
},
{
"epoch": 1.5795454545454546,
"grad_norm": 0.414861092800249,
"learning_rate": 2.4573595218300127e-06,
"loss": 0.2878,
"step": 556
},
{
"epoch": 1.5823863636363638,
"grad_norm": 0.36176025431242964,
"learning_rate": 2.4496076769871226e-06,
"loss": 0.2614,
"step": 557
},
{
"epoch": 1.5852272727272727,
"grad_norm": 0.4170474058146532,
"learning_rate": 2.4418563168128346e-06,
"loss": 0.2868,
"step": 558
},
{
"epoch": 1.5880681818181817,
"grad_norm": 0.3270649689091589,
"learning_rate": 2.4341055158589976e-06,
"loss": 0.2699,
"step": 559
},
{
"epoch": 1.5909090909090908,
"grad_norm": 0.3807070125410976,
"learning_rate": 2.4263553486720838e-06,
"loss": 0.303,
"step": 560
},
{
"epoch": 1.59375,
"grad_norm": 0.3848553762149162,
"learning_rate": 2.4186058897924685e-06,
"loss": 0.2748,
"step": 561
},
{
"epoch": 1.5965909090909092,
"grad_norm": 0.3232840810454203,
"learning_rate": 2.410857213753715e-06,
"loss": 0.2445,
"step": 562
},
{
"epoch": 1.5994318181818183,
"grad_norm": 0.3092676360533537,
"learning_rate": 2.4031093950818583e-06,
"loss": 0.2356,
"step": 563
},
{
"epoch": 1.6022727272727273,
"grad_norm": 0.45118596036379494,
"learning_rate": 2.3953625082946856e-06,
"loss": 0.2837,
"step": 564
},
{
"epoch": 1.6051136363636362,
"grad_norm": 0.34970482571526373,
"learning_rate": 2.3876166279010212e-06,
"loss": 0.2973,
"step": 565
},
{
"epoch": 1.6079545454545454,
"grad_norm": 0.3364465296058301,
"learning_rate": 2.379871828400012e-06,
"loss": 0.2423,
"step": 566
},
{
"epoch": 1.6107954545454546,
"grad_norm": 0.363328151622841,
"learning_rate": 2.372128184280408e-06,
"loss": 0.269,
"step": 567
},
{
"epoch": 1.6136363636363638,
"grad_norm": 0.26766248199292697,
"learning_rate": 2.364385770019846e-06,
"loss": 0.2346,
"step": 568
},
{
"epoch": 1.6164772727272727,
"grad_norm": 0.3913465078730921,
"learning_rate": 2.356644660084135e-06,
"loss": 0.2866,
"step": 569
},
{
"epoch": 1.6193181818181817,
"grad_norm": 0.31905393138162685,
"learning_rate": 2.34890492892654e-06,
"loss": 0.2666,
"step": 570
},
{
"epoch": 1.6221590909090908,
"grad_norm": 0.3432468450311117,
"learning_rate": 2.341166650987064e-06,
"loss": 0.2443,
"step": 571
},
{
"epoch": 1.625,
"grad_norm": 0.34070598347786063,
"learning_rate": 2.333429900691731e-06,
"loss": 0.2968,
"step": 572
},
{
"epoch": 1.6278409090909092,
"grad_norm": 0.4257323783577944,
"learning_rate": 2.3256947524518756e-06,
"loss": 0.275,
"step": 573
},
{
"epoch": 1.6306818181818183,
"grad_norm": 0.35120372623976087,
"learning_rate": 2.317961280663424e-06,
"loss": 0.2779,
"step": 574
},
{
"epoch": 1.6335227272727273,
"grad_norm": 0.3288834361465399,
"learning_rate": 2.3102295597061757e-06,
"loss": 0.262,
"step": 575
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.3781083785525166,
"learning_rate": 2.3024996639430925e-06,
"loss": 0.2705,
"step": 576
},
{
"epoch": 1.6392045454545454,
"grad_norm": 0.3309675255585671,
"learning_rate": 2.2947716677195823e-06,
"loss": 0.2607,
"step": 577
},
{
"epoch": 1.6420454545454546,
"grad_norm": 0.4097606078658523,
"learning_rate": 2.2870456453627823e-06,
"loss": 0.3267,
"step": 578
},
{
"epoch": 1.6448863636363638,
"grad_norm": 0.25572751310886616,
"learning_rate": 2.2793216711808456e-06,
"loss": 0.2278,
"step": 579
},
{
"epoch": 1.6477272727272727,
"grad_norm": 0.3060607584281395,
"learning_rate": 2.2715998194622257e-06,
"loss": 0.2517,
"step": 580
},
{
"epoch": 1.6505681818181817,
"grad_norm": 0.37963871119456877,
"learning_rate": 2.2638801644749636e-06,
"loss": 0.2634,
"step": 581
},
{
"epoch": 1.6534090909090908,
"grad_norm": 0.3762574705485531,
"learning_rate": 2.2561627804659704e-06,
"loss": 0.2534,
"step": 582
},
{
"epoch": 1.65625,
"grad_norm": 0.34282885282451137,
"learning_rate": 2.2484477416603183e-06,
"loss": 0.2666,
"step": 583
},
{
"epoch": 1.6590909090909092,
"grad_norm": 0.3508691585265268,
"learning_rate": 2.24073512226052e-06,
"loss": 0.2589,
"step": 584
},
{
"epoch": 1.6619318181818183,
"grad_norm": 0.38903092342578377,
"learning_rate": 2.2330249964458202e-06,
"loss": 0.2853,
"step": 585
},
{
"epoch": 1.6647727272727273,
"grad_norm": 0.3466002683474289,
"learning_rate": 2.2253174383714816e-06,
"loss": 0.2812,
"step": 586
},
{
"epoch": 1.6676136363636362,
"grad_norm": 0.46395674632161,
"learning_rate": 2.21761252216807e-06,
"loss": 0.2692,
"step": 587
},
{
"epoch": 1.6704545454545454,
"grad_norm": 0.3699824822038089,
"learning_rate": 2.2099103219407392e-06,
"loss": 0.2699,
"step": 588
},
{
"epoch": 1.6732954545454546,
"grad_norm": 0.3805031596017454,
"learning_rate": 2.2022109117685246e-06,
"loss": 0.2953,
"step": 589
},
{
"epoch": 1.6761363636363638,
"grad_norm": 0.37764726137134685,
"learning_rate": 2.1945143657036267e-06,
"loss": 0.2753,
"step": 590
},
{
"epoch": 1.6789772727272727,
"grad_norm": 0.3304479070305256,
"learning_rate": 2.1868207577706964e-06,
"loss": 0.2524,
"step": 591
},
{
"epoch": 1.6818181818181817,
"grad_norm": 0.3587520279737923,
"learning_rate": 2.1791301619661297e-06,
"loss": 0.2602,
"step": 592
},
{
"epoch": 1.6846590909090908,
"grad_norm": 0.3323465218687911,
"learning_rate": 2.17144265225735e-06,
"loss": 0.2692,
"step": 593
},
{
"epoch": 1.6875,
"grad_norm": 0.3572276587914552,
"learning_rate": 2.1637583025820985e-06,
"loss": 0.2858,
"step": 594
},
{
"epoch": 1.6903409090909092,
"grad_norm": 0.37800630772529514,
"learning_rate": 2.156077186847726e-06,
"loss": 0.294,
"step": 595
},
{
"epoch": 1.6931818181818183,
"grad_norm": 0.3421660175170903,
"learning_rate": 2.148399378930476e-06,
"loss": 0.2573,
"step": 596
},
{
"epoch": 1.6960227272727273,
"grad_norm": 0.34254475964042214,
"learning_rate": 2.1407249526747816e-06,
"loss": 0.275,
"step": 597
},
{
"epoch": 1.6988636363636362,
"grad_norm": 0.3715201904697272,
"learning_rate": 2.133053981892547e-06,
"loss": 0.2833,
"step": 598
},
{
"epoch": 1.7017045454545454,
"grad_norm": 0.36015289752626467,
"learning_rate": 2.125386540362447e-06,
"loss": 0.2828,
"step": 599
},
{
"epoch": 1.7045454545454546,
"grad_norm": 0.40367397113055686,
"learning_rate": 2.1177227018292086e-06,
"loss": 0.2621,
"step": 600
},
{
"epoch": 1.7073863636363638,
"grad_norm": 0.32129619035430856,
"learning_rate": 2.110062540002906e-06,
"loss": 0.2757,
"step": 601
},
{
"epoch": 1.7102272727272727,
"grad_norm": 0.3137451287766472,
"learning_rate": 2.1024061285582546e-06,
"loss": 0.2535,
"step": 602
},
{
"epoch": 1.7130681818181817,
"grad_norm": 0.4280343421587481,
"learning_rate": 2.0947535411338936e-06,
"loss": 0.2559,
"step": 603
},
{
"epoch": 1.7159090909090908,
"grad_norm": 0.38561258389624026,
"learning_rate": 2.087104851331686e-06,
"loss": 0.339,
"step": 604
},
{
"epoch": 1.71875,
"grad_norm": 0.3187139343663328,
"learning_rate": 2.0794601327160083e-06,
"loss": 0.224,
"step": 605
},
{
"epoch": 1.7215909090909092,
"grad_norm": 0.4058807325173988,
"learning_rate": 2.0718194588130435e-06,
"loss": 0.2743,
"step": 606
},
{
"epoch": 1.7244318181818183,
"grad_norm": 0.3501025253129524,
"learning_rate": 2.0641829031100685e-06,
"loss": 0.2534,
"step": 607
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.34621897515864436,
"learning_rate": 2.0565505390547558e-06,
"loss": 0.2565,
"step": 608
},
{
"epoch": 1.7301136363636362,
"grad_norm": 0.2972165110796837,
"learning_rate": 2.0489224400544626e-06,
"loss": 0.2472,
"step": 609
},
{
"epoch": 1.7329545454545454,
"grad_norm": 0.28430139406095895,
"learning_rate": 2.041298679475524e-06,
"loss": 0.2278,
"step": 610
},
{
"epoch": 1.7357954545454546,
"grad_norm": 0.3424108937746101,
"learning_rate": 2.033679330642548e-06,
"loss": 0.2708,
"step": 611
},
{
"epoch": 1.7386363636363638,
"grad_norm": 0.34689691643105225,
"learning_rate": 2.026064466837712e-06,
"loss": 0.2489,
"step": 612
},
{
"epoch": 1.7414772727272727,
"grad_norm": 0.36538604704717154,
"learning_rate": 2.018454161300058e-06,
"loss": 0.2959,
"step": 613
},
{
"epoch": 1.7443181818181817,
"grad_norm": 0.3914980478603566,
"learning_rate": 2.0108484872247836e-06,
"loss": 0.2877,
"step": 614
},
{
"epoch": 1.7471590909090908,
"grad_norm": 0.3460591534025964,
"learning_rate": 2.003247517762545e-06,
"loss": 0.2392,
"step": 615
},
{
"epoch": 1.75,
"grad_norm": 0.35201168894909723,
"learning_rate": 1.995651326018748e-06,
"loss": 0.2775,
"step": 616
},
{
"epoch": 1.7528409090909092,
"grad_norm": 0.3907457148602396,
"learning_rate": 1.988059985052847e-06,
"loss": 0.2649,
"step": 617
},
{
"epoch": 1.7556818181818183,
"grad_norm": 0.31089272434312254,
"learning_rate": 1.980473567877643e-06,
"loss": 0.2717,
"step": 618
},
{
"epoch": 1.7585227272727273,
"grad_norm": 0.39029862965581613,
"learning_rate": 1.9728921474585783e-06,
"loss": 0.2996,
"step": 619
},
{
"epoch": 1.7613636363636362,
"grad_norm": 0.37522254054472837,
"learning_rate": 1.965315796713038e-06,
"loss": 0.3206,
"step": 620
},
{
"epoch": 1.7642045454545454,
"grad_norm": 0.37421333571503007,
"learning_rate": 1.957744588509649e-06,
"loss": 0.2953,
"step": 621
},
{
"epoch": 1.7670454545454546,
"grad_norm": 0.4113713231201874,
"learning_rate": 1.9501785956675767e-06,
"loss": 0.2587,
"step": 622
},
{
"epoch": 1.7698863636363638,
"grad_norm": 0.3775256295092349,
"learning_rate": 1.942617890955824e-06,
"loss": 0.2706,
"step": 623
},
{
"epoch": 1.7727272727272727,
"grad_norm": 0.361676860315546,
"learning_rate": 1.935062547092535e-06,
"loss": 0.2573,
"step": 624
},
{
"epoch": 1.7755681818181817,
"grad_norm": 0.3828484280989141,
"learning_rate": 1.927512636744294e-06,
"loss": 0.2635,
"step": 625
},
{
"epoch": 1.7784090909090908,
"grad_norm": 0.3194894627210845,
"learning_rate": 1.9199682325254258e-06,
"loss": 0.2412,
"step": 626
},
{
"epoch": 1.78125,
"grad_norm": 0.3467465431720772,
"learning_rate": 1.9124294069972947e-06,
"loss": 0.2558,
"step": 627
},
{
"epoch": 1.7840909090909092,
"grad_norm": 0.40591415428499084,
"learning_rate": 1.9048962326676145e-06,
"loss": 0.2591,
"step": 628
},
{
"epoch": 1.7869318181818183,
"grad_norm": 0.324247081690912,
"learning_rate": 1.897368781989744e-06,
"loss": 0.2525,
"step": 629
},
{
"epoch": 1.7897727272727273,
"grad_norm": 0.30168524950243947,
"learning_rate": 1.889847127361992e-06,
"loss": 0.2414,
"step": 630
},
{
"epoch": 1.7926136363636362,
"grad_norm": 0.3391445741041072,
"learning_rate": 1.8823313411269226e-06,
"loss": 0.2666,
"step": 631
},
{
"epoch": 1.7954545454545454,
"grad_norm": 0.3695919372425977,
"learning_rate": 1.874821495570658e-06,
"loss": 0.2738,
"step": 632
},
{
"epoch": 1.7982954545454546,
"grad_norm": 0.41985233793486193,
"learning_rate": 1.8673176629221824e-06,
"loss": 0.2843,
"step": 633
},
{
"epoch": 1.8011363636363638,
"grad_norm": 0.34508550168400526,
"learning_rate": 1.8598199153526502e-06,
"loss": 0.2762,
"step": 634
},
{
"epoch": 1.8039772727272727,
"grad_norm": 0.34432258391495646,
"learning_rate": 1.852328324974688e-06,
"loss": 0.2746,
"step": 635
},
{
"epoch": 1.8068181818181817,
"grad_norm": 0.432219335772206,
"learning_rate": 1.8448429638417053e-06,
"loss": 0.293,
"step": 636
},
{
"epoch": 1.8096590909090908,
"grad_norm": 0.30494323840811877,
"learning_rate": 1.8373639039471974e-06,
"loss": 0.2483,
"step": 637
},
{
"epoch": 1.8125,
"grad_norm": 0.38979888807881874,
"learning_rate": 1.8298912172240568e-06,
"loss": 0.2665,
"step": 638
},
{
"epoch": 1.8153409090909092,
"grad_norm": 0.4409357967627925,
"learning_rate": 1.8224249755438773e-06,
"loss": 0.2979,
"step": 639
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.40058073253392457,
"learning_rate": 1.8149652507162662e-06,
"loss": 0.2402,
"step": 640
},
{
"epoch": 1.8210227272727273,
"grad_norm": 0.3781947301668901,
"learning_rate": 1.807512114488153e-06,
"loss": 0.2815,
"step": 641
},
{
"epoch": 1.8238636363636362,
"grad_norm": 0.32982880475917153,
"learning_rate": 1.8000656385430978e-06,
"loss": 0.274,
"step": 642
},
{
"epoch": 1.8267045454545454,
"grad_norm": 0.34588388650165885,
"learning_rate": 1.7926258945006008e-06,
"loss": 0.2415,
"step": 643
},
{
"epoch": 1.8295454545454546,
"grad_norm": 0.46509270816531234,
"learning_rate": 1.7851929539154188e-06,
"loss": 0.2352,
"step": 644
},
{
"epoch": 1.8323863636363638,
"grad_norm": 0.3949892127680776,
"learning_rate": 1.7777668882768723e-06,
"loss": 0.2731,
"step": 645
},
{
"epoch": 1.8352272727272727,
"grad_norm": 0.33118981202064834,
"learning_rate": 1.7703477690081584e-06,
"loss": 0.2062,
"step": 646
},
{
"epoch": 1.8380681818181817,
"grad_norm": 0.41123429927968475,
"learning_rate": 1.762935667465665e-06,
"loss": 0.2603,
"step": 647
},
{
"epoch": 1.8409090909090908,
"grad_norm": 0.4086985175493265,
"learning_rate": 1.7555306549382853e-06,
"loss": 0.2633,
"step": 648
},
{
"epoch": 1.84375,
"grad_norm": 0.3829776136552432,
"learning_rate": 1.7481328026467292e-06,
"loss": 0.2645,
"step": 649
},
{
"epoch": 1.8465909090909092,
"grad_norm": 0.36580249698143114,
"learning_rate": 1.7407421817428432e-06,
"loss": 0.2907,
"step": 650
},
{
"epoch": 1.8494318181818183,
"grad_norm": 0.5114322764325684,
"learning_rate": 1.733358863308918e-06,
"loss": 0.2491,
"step": 651
},
{
"epoch": 1.8522727272727273,
"grad_norm": 0.3758211802363351,
"learning_rate": 1.7259829183570146e-06,
"loss": 0.275,
"step": 652
},
{
"epoch": 1.8551136363636362,
"grad_norm": 0.44005362349975546,
"learning_rate": 1.7186144178282735e-06,
"loss": 0.2759,
"step": 653
},
{
"epoch": 1.8579545454545454,
"grad_norm": 0.41121803130231066,
"learning_rate": 1.7112534325922381e-06,
"loss": 0.2835,
"step": 654
},
{
"epoch": 1.8607954545454546,
"grad_norm": 0.37656111256141905,
"learning_rate": 1.7039000334461673e-06,
"loss": 0.2808,
"step": 655
},
{
"epoch": 1.8636363636363638,
"grad_norm": 0.3651987202447528,
"learning_rate": 1.6965542911143601e-06,
"loss": 0.3218,
"step": 656
},
{
"epoch": 1.8664772727272727,
"grad_norm": 0.40004844795530625,
"learning_rate": 1.6892162762474732e-06,
"loss": 0.2945,
"step": 657
},
{
"epoch": 1.8693181818181817,
"grad_norm": 0.33043091198634184,
"learning_rate": 1.6818860594218396e-06,
"loss": 0.2277,
"step": 658
},
{
"epoch": 1.8721590909090908,
"grad_norm": 0.3346497899463932,
"learning_rate": 1.674563711138791e-06,
"loss": 0.2324,
"step": 659
},
{
"epoch": 1.875,
"grad_norm": 0.32658486289094646,
"learning_rate": 1.6672493018239828e-06,
"loss": 0.242,
"step": 660
},
{
"epoch": 1.8778409090909092,
"grad_norm": 0.3483520142042606,
"learning_rate": 1.659942901826712e-06,
"loss": 0.2724,
"step": 661
},
{
"epoch": 1.8806818181818183,
"grad_norm": 0.3447989906256544,
"learning_rate": 1.6526445814192437e-06,
"loss": 0.2522,
"step": 662
},
{
"epoch": 1.8835227272727273,
"grad_norm": 0.3745982582543309,
"learning_rate": 1.6453544107961338e-06,
"loss": 0.268,
"step": 663
},
{
"epoch": 1.8863636363636362,
"grad_norm": 0.47460009049304464,
"learning_rate": 1.638072460073556e-06,
"loss": 0.3004,
"step": 664
},
{
"epoch": 1.8892045454545454,
"grad_norm": 0.38922747831910864,
"learning_rate": 1.6307987992886221e-06,
"loss": 0.2923,
"step": 665
},
{
"epoch": 1.8920454545454546,
"grad_norm": 0.3619334724335469,
"learning_rate": 1.6235334983987166e-06,
"loss": 0.2929,
"step": 666
},
{
"epoch": 1.8948863636363638,
"grad_norm": 0.4134447223169521,
"learning_rate": 1.6162766272808153e-06,
"loss": 0.2443,
"step": 667
},
{
"epoch": 1.8977272727272727,
"grad_norm": 0.37827695457409233,
"learning_rate": 1.6090282557308199e-06,
"loss": 0.2634,
"step": 668
},
{
"epoch": 1.9005681818181817,
"grad_norm": 0.37553439336248,
"learning_rate": 1.6017884534628821e-06,
"loss": 0.2624,
"step": 669
},
{
"epoch": 1.9034090909090908,
"grad_norm": 0.30503546597237136,
"learning_rate": 1.594557290108737e-06,
"loss": 0.2448,
"step": 670
},
{
"epoch": 1.90625,
"grad_norm": 0.33139361815750534,
"learning_rate": 1.5873348352170309e-06,
"loss": 0.2344,
"step": 671
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.4071705047497215,
"learning_rate": 1.5801211582526515e-06,
"loss": 0.2972,
"step": 672
},
{
"epoch": 1.9119318181818183,
"grad_norm": 0.3520108684037794,
"learning_rate": 1.5729163285960636e-06,
"loss": 0.3064,
"step": 673
},
{
"epoch": 1.9147727272727273,
"grad_norm": 1.044294639450523,
"learning_rate": 1.5657204155426372e-06,
"loss": 0.2764,
"step": 674
},
{
"epoch": 1.9176136363636362,
"grad_norm": 0.2733575442921981,
"learning_rate": 1.5585334883019845e-06,
"loss": 0.2115,
"step": 675
},
{
"epoch": 1.9204545454545454,
"grad_norm": 0.3436043484209694,
"learning_rate": 1.551355615997292e-06,
"loss": 0.2613,
"step": 676
},
{
"epoch": 1.9232954545454546,
"grad_norm": 0.347973792440035,
"learning_rate": 1.5441868676646588e-06,
"loss": 0.2984,
"step": 677
},
{
"epoch": 1.9261363636363638,
"grad_norm": 0.3714627621893232,
"learning_rate": 1.537027312252427e-06,
"loss": 0.2939,
"step": 678
},
{
"epoch": 1.9289772727272727,
"grad_norm": 0.36946185178466473,
"learning_rate": 1.5298770186205262e-06,
"loss": 0.3133,
"step": 679
},
{
"epoch": 1.9318181818181817,
"grad_norm": 0.4445653274012168,
"learning_rate": 1.522736055539804e-06,
"loss": 0.2638,
"step": 680
},
{
"epoch": 1.9346590909090908,
"grad_norm": 0.36757539498984404,
"learning_rate": 1.5156044916913687e-06,
"loss": 0.2594,
"step": 681
},
{
"epoch": 1.9375,
"grad_norm": 0.3740375047815139,
"learning_rate": 1.5084823956659284e-06,
"loss": 0.2816,
"step": 682
},
{
"epoch": 1.9403409090909092,
"grad_norm": 0.3691390978665013,
"learning_rate": 1.5013698359631284e-06,
"loss": 0.3269,
"step": 683
},
{
"epoch": 1.9431818181818183,
"grad_norm": 0.3882033860276734,
"learning_rate": 1.4942668809908966e-06,
"loss": 0.2926,
"step": 684
},
{
"epoch": 1.9460227272727273,
"grad_norm": 0.3303389257139215,
"learning_rate": 1.487173599064783e-06,
"loss": 0.2813,
"step": 685
},
{
"epoch": 1.9488636363636362,
"grad_norm": 0.385716686112769,
"learning_rate": 1.4800900584073025e-06,
"loss": 0.3027,
"step": 686
},
{
"epoch": 1.9517045454545454,
"grad_norm": 0.30969234063219786,
"learning_rate": 1.4730163271472808e-06,
"loss": 0.2848,
"step": 687
},
{
"epoch": 1.9545454545454546,
"grad_norm": 0.39460846418007084,
"learning_rate": 1.465952473319196e-06,
"loss": 0.2638,
"step": 688
},
{
"epoch": 1.9573863636363638,
"grad_norm": 0.38043423948555954,
"learning_rate": 1.458898564862528e-06,
"loss": 0.3017,
"step": 689
},
{
"epoch": 1.9602272727272727,
"grad_norm": 0.344190102552331,
"learning_rate": 1.4518546696211003e-06,
"loss": 0.2475,
"step": 690
},
{
"epoch": 1.9630681818181817,
"grad_norm": 0.3584182768945062,
"learning_rate": 1.4448208553424318e-06,
"loss": 0.2599,
"step": 691
},
{
"epoch": 1.9659090909090908,
"grad_norm": 0.4193998956615056,
"learning_rate": 1.4377971896770854e-06,
"loss": 0.2932,
"step": 692
},
{
"epoch": 1.96875,
"grad_norm": 0.3183638489077071,
"learning_rate": 1.4307837401780129e-06,
"loss": 0.2353,
"step": 693
},
{
"epoch": 1.9715909090909092,
"grad_norm": 0.551291367904842,
"learning_rate": 1.4237805742999078e-06,
"loss": 0.2888,
"step": 694
},
{
"epoch": 1.9744318181818183,
"grad_norm": 0.3836625936106596,
"learning_rate": 1.4167877593985604e-06,
"loss": 0.2606,
"step": 695
},
{
"epoch": 1.9772727272727273,
"grad_norm": 0.33238753899253215,
"learning_rate": 1.4098053627302021e-06,
"loss": 0.2511,
"step": 696
},
{
"epoch": 1.9801136363636362,
"grad_norm": 0.3697431181915492,
"learning_rate": 1.402833451450865e-06,
"loss": 0.2592,
"step": 697
},
{
"epoch": 1.9829545454545454,
"grad_norm": 0.39546306881879256,
"learning_rate": 1.3958720926157326e-06,
"loss": 0.2867,
"step": 698
},
{
"epoch": 1.9857954545454546,
"grad_norm": 0.37081182355768993,
"learning_rate": 1.3889213531784967e-06,
"loss": 0.2774,
"step": 699
},
{
"epoch": 1.9886363636363638,
"grad_norm": 0.31730996135018236,
"learning_rate": 1.3819812999907112e-06,
"loss": 0.2558,
"step": 700
},
{
"epoch": 1.9914772727272727,
"grad_norm": 0.48697102294004946,
"learning_rate": 1.3750519998011545e-06,
"loss": 0.2807,
"step": 701
},
{
"epoch": 1.9943181818181817,
"grad_norm": 0.32660834038500147,
"learning_rate": 1.3681335192551795e-06,
"loss": 0.266,
"step": 702
},
{
"epoch": 1.9971590909090908,
"grad_norm": 0.37088856838391165,
"learning_rate": 1.3612259248940778e-06,
"loss": 0.3023,
"step": 703
},
{
"epoch": 2.0,
"grad_norm": 0.46910986149494815,
"learning_rate": 1.354329283154442e-06,
"loss": 0.354,
"step": 704
},
{
"epoch": 2.002840909090909,
"grad_norm": 0.3578196229806462,
"learning_rate": 1.3474436603675195e-06,
"loss": 0.2863,
"step": 705
},
{
"epoch": 2.0056818181818183,
"grad_norm": 0.3320147175830239,
"learning_rate": 1.3405691227585774e-06,
"loss": 0.2791,
"step": 706
},
{
"epoch": 2.008522727272727,
"grad_norm": 0.4104267883722151,
"learning_rate": 1.333705736446272e-06,
"loss": 0.2599,
"step": 707
},
{
"epoch": 2.0113636363636362,
"grad_norm": 0.44025732665188794,
"learning_rate": 1.326853567442003e-06,
"loss": 0.2648,
"step": 708
},
{
"epoch": 2.0142045454545454,
"grad_norm": 0.4463091829454087,
"learning_rate": 1.320012681649284e-06,
"loss": 0.3235,
"step": 709
},
{
"epoch": 2.0170454545454546,
"grad_norm": 0.3977418006694515,
"learning_rate": 1.3131831448631099e-06,
"loss": 0.2494,
"step": 710
},
{
"epoch": 2.0198863636363638,
"grad_norm": 0.30294420075479717,
"learning_rate": 1.3063650227693192e-06,
"loss": 0.2274,
"step": 711
},
{
"epoch": 2.022727272727273,
"grad_norm": 0.3580935126068431,
"learning_rate": 1.2995583809439655e-06,
"loss": 0.2641,
"step": 712
},
{
"epoch": 2.0255681818181817,
"grad_norm": 0.3633999760316955,
"learning_rate": 1.2927632848526892e-06,
"loss": 0.2664,
"step": 713
},
{
"epoch": 2.028409090909091,
"grad_norm": 0.39362626572566367,
"learning_rate": 1.285979799850079e-06,
"loss": 0.3028,
"step": 714
},
{
"epoch": 2.03125,
"grad_norm": 0.3732307387516034,
"learning_rate": 1.2792079911790554e-06,
"loss": 0.2903,
"step": 715
},
{
"epoch": 2.034090909090909,
"grad_norm": 0.348231549102206,
"learning_rate": 1.2724479239702334e-06,
"loss": 0.2776,
"step": 716
},
{
"epoch": 2.0369318181818183,
"grad_norm": 0.32154175294270404,
"learning_rate": 1.2656996632413e-06,
"loss": 0.2363,
"step": 717
},
{
"epoch": 2.039772727272727,
"grad_norm": 0.3738689076803405,
"learning_rate": 1.2589632738963915e-06,
"loss": 0.2747,
"step": 718
},
{
"epoch": 2.0426136363636362,
"grad_norm": 0.370533612023648,
"learning_rate": 1.2522388207254624e-06,
"loss": 0.2568,
"step": 719
},
{
"epoch": 2.0454545454545454,
"grad_norm": 0.3839434235801676,
"learning_rate": 1.2455263684036687e-06,
"loss": 0.2792,
"step": 720
},
{
"epoch": 2.0482954545454546,
"grad_norm": 0.5003341324574189,
"learning_rate": 1.2388259814907421e-06,
"loss": 0.2769,
"step": 721
},
{
"epoch": 2.0511363636363638,
"grad_norm": 0.3351671952514299,
"learning_rate": 1.2321377244303718e-06,
"loss": 0.2296,
"step": 722
},
{
"epoch": 2.053977272727273,
"grad_norm": 0.2999985412422647,
"learning_rate": 1.22546166154958e-06,
"loss": 0.2284,
"step": 723
},
{
"epoch": 2.0568181818181817,
"grad_norm": 0.3135859144132813,
"learning_rate": 1.2187978570581118e-06,
"loss": 0.251,
"step": 724
},
{
"epoch": 2.059659090909091,
"grad_norm": 0.4125239171099722,
"learning_rate": 1.212146375047808e-06,
"loss": 0.2569,
"step": 725
},
{
"epoch": 2.0625,
"grad_norm": 0.5126461046016878,
"learning_rate": 1.2055072794919927e-06,
"loss": 0.2867,
"step": 726
},
{
"epoch": 2.065340909090909,
"grad_norm": 0.32428865281600694,
"learning_rate": 1.198880634244862e-06,
"loss": 0.2526,
"step": 727
},
{
"epoch": 2.0681818181818183,
"grad_norm": 0.5892083787676873,
"learning_rate": 1.192266503040863e-06,
"loss": 0.2827,
"step": 728
},
{
"epoch": 2.071022727272727,
"grad_norm": 0.2947475596312562,
"learning_rate": 1.1856649494940842e-06,
"loss": 0.2288,
"step": 729
},
{
"epoch": 2.0738636363636362,
"grad_norm": 0.35972607487628616,
"learning_rate": 1.1790760370976445e-06,
"loss": 0.268,
"step": 730
},
{
"epoch": 2.0767045454545454,
"grad_norm": 0.36619988601771414,
"learning_rate": 1.1724998292230804e-06,
"loss": 0.2832,
"step": 731
},
{
"epoch": 2.0795454545454546,
"grad_norm": 0.3733558388597783,
"learning_rate": 1.1659363891197373e-06,
"loss": 0.2723,
"step": 732
},
{
"epoch": 2.0823863636363638,
"grad_norm": 0.39404340487463446,
"learning_rate": 1.1593857799141635e-06,
"loss": 0.2823,
"step": 733
},
{
"epoch": 2.085227272727273,
"grad_norm": 0.39535002691603904,
"learning_rate": 1.152848064609499e-06,
"loss": 0.2765,
"step": 734
},
{
"epoch": 2.0880681818181817,
"grad_norm": 0.4562125910263655,
"learning_rate": 1.1463233060848701e-06,
"loss": 0.2229,
"step": 735
},
{
"epoch": 2.090909090909091,
"grad_norm": 0.34157106543064586,
"learning_rate": 1.139811567094791e-06,
"loss": 0.251,
"step": 736
},
{
"epoch": 2.09375,
"grad_norm": 0.3975912471137775,
"learning_rate": 1.1333129102685504e-06,
"loss": 0.2953,
"step": 737
},
{
"epoch": 2.096590909090909,
"grad_norm": 0.4344936348962993,
"learning_rate": 1.1268273981096154e-06,
"loss": 0.2481,
"step": 738
},
{
"epoch": 2.0994318181818183,
"grad_norm": 0.40663820339750667,
"learning_rate": 1.1203550929950296e-06,
"loss": 0.2704,
"step": 739
},
{
"epoch": 2.102272727272727,
"grad_norm": 0.4525407147079834,
"learning_rate": 1.1138960571748122e-06,
"loss": 0.2308,
"step": 740
},
{
"epoch": 2.1051136363636362,
"grad_norm": 0.36101599924638966,
"learning_rate": 1.107450352771358e-06,
"loss": 0.3198,
"step": 741
},
{
"epoch": 2.1079545454545454,
"grad_norm": 0.4132570992405224,
"learning_rate": 1.1010180417788458e-06,
"loss": 0.3157,
"step": 742
},
{
"epoch": 2.1107954545454546,
"grad_norm": 0.4296796806025471,
"learning_rate": 1.094599186062633e-06,
"loss": 0.2719,
"step": 743
},
{
"epoch": 2.1136363636363638,
"grad_norm": 0.4115860705303619,
"learning_rate": 1.0881938473586672e-06,
"loss": 0.2588,
"step": 744
},
{
"epoch": 2.116477272727273,
"grad_norm": 0.3341390354972397,
"learning_rate": 1.0818020872728935e-06,
"loss": 0.2803,
"step": 745
},
{
"epoch": 2.1193181818181817,
"grad_norm": 0.386666143661149,
"learning_rate": 1.0754239672806526e-06,
"loss": 0.2954,
"step": 746
},
{
"epoch": 2.122159090909091,
"grad_norm": 0.39729795109834065,
"learning_rate": 1.0690595487261032e-06,
"loss": 0.292,
"step": 747
},
{
"epoch": 2.125,
"grad_norm": 0.4632063849794996,
"learning_rate": 1.0627088928216203e-06,
"loss": 0.3011,
"step": 748
},
{
"epoch": 2.127840909090909,
"grad_norm": 0.364788422480122,
"learning_rate": 1.0563720606472116e-06,
"loss": 0.2887,
"step": 749
},
{
"epoch": 2.1306818181818183,
"grad_norm": 0.3613800764493521,
"learning_rate": 1.050049113149932e-06,
"loss": 0.2698,
"step": 750
},
{
"epoch": 2.133522727272727,
"grad_norm": 0.4840054604670755,
"learning_rate": 1.0437401111432928e-06,
"loss": 0.2671,
"step": 751
},
{
"epoch": 2.1363636363636362,
"grad_norm": 0.35647589283664843,
"learning_rate": 1.0374451153066773e-06,
"loss": 0.277,
"step": 752
},
{
"epoch": 2.1392045454545454,
"grad_norm": 0.3070617647042118,
"learning_rate": 1.0311641861847644e-06,
"loss": 0.2262,
"step": 753
},
{
"epoch": 2.1420454545454546,
"grad_norm": 0.36421008528422827,
"learning_rate": 1.0248973841869336e-06,
"loss": 0.2541,
"step": 754
},
{
"epoch": 2.1448863636363638,
"grad_norm": 0.36442145568995793,
"learning_rate": 1.018644769586695e-06,
"loss": 0.2968,
"step": 755
},
{
"epoch": 2.147727272727273,
"grad_norm": 0.5392899583290776,
"learning_rate": 1.0124064025211063e-06,
"loss": 0.2338,
"step": 756
},
{
"epoch": 2.1505681818181817,
"grad_norm": 0.40485627469450297,
"learning_rate": 1.006182342990192e-06,
"loss": 0.2734,
"step": 757
},
{
"epoch": 2.153409090909091,
"grad_norm": 0.36165309778969656,
"learning_rate": 9.99972650856368e-07,
"loss": 0.2717,
"step": 758
},
{
"epoch": 2.15625,
"grad_norm": 0.37054356564143653,
"learning_rate": 9.937773858438677e-07,
"loss": 0.2867,
"step": 759
},
{
"epoch": 2.159090909090909,
"grad_norm": 0.3209190334600411,
"learning_rate": 9.87596607538164e-07,
"loss": 0.2026,
"step": 760
},
{
"epoch": 2.1619318181818183,
"grad_norm": 0.33862908014599463,
"learning_rate": 9.81430375385399e-07,
"loss": 0.2589,
"step": 761
},
{
"epoch": 2.164772727272727,
"grad_norm": 0.33768216225160724,
"learning_rate": 9.752787486918108e-07,
"loss": 0.2832,
"step": 762
},
{
"epoch": 2.1676136363636362,
"grad_norm": 0.33566640920720886,
"learning_rate": 9.691417866231633e-07,
"loss": 0.2646,
"step": 763
},
{
"epoch": 2.1704545454545454,
"grad_norm": 0.296999788237227,
"learning_rate": 9.630195482041778e-07,
"loss": 0.2405,
"step": 764
},
{
"epoch": 2.1732954545454546,
"grad_norm": 0.36623960819597895,
"learning_rate": 9.569120923179661e-07,
"loss": 0.2997,
"step": 765
},
{
"epoch": 2.1761363636363638,
"grad_norm": 0.35989187708509074,
"learning_rate": 9.508194777054613e-07,
"loss": 0.2627,
"step": 766
},
{
"epoch": 2.178977272727273,
"grad_norm": 0.45558444510597795,
"learning_rate": 9.447417629648542e-07,
"loss": 0.2939,
"step": 767
},
{
"epoch": 2.1818181818181817,
"grad_norm": 1.352661504436191,
"learning_rate": 9.386790065510326e-07,
"loss": 0.2674,
"step": 768
},
{
"epoch": 2.184659090909091,
"grad_norm": 0.3484066310248953,
"learning_rate": 9.326312667750143e-07,
"loss": 0.2647,
"step": 769
},
{
"epoch": 2.1875,
"grad_norm": 0.3372643949746599,
"learning_rate": 9.265986018033887e-07,
"loss": 0.2712,
"step": 770
},
{
"epoch": 2.190340909090909,
"grad_norm": 0.45171014423025785,
"learning_rate": 9.205810696577577e-07,
"loss": 0.2531,
"step": 771
},
{
"epoch": 2.1931818181818183,
"grad_norm": 0.3426033696862187,
"learning_rate": 9.14578728214176e-07,
"loss": 0.2594,
"step": 772
},
{
"epoch": 2.196022727272727,
"grad_norm": 0.44139931309445984,
"learning_rate": 9.085916352025983e-07,
"loss": 0.2747,
"step": 773
},
{
"epoch": 2.1988636363636362,
"grad_norm": 0.3644501914038969,
"learning_rate": 9.02619848206319e-07,
"loss": 0.3172,
"step": 774
},
{
"epoch": 2.2017045454545454,
"grad_norm": 0.41216240398841364,
"learning_rate": 8.966634246614208e-07,
"loss": 0.2614,
"step": 775
},
{
"epoch": 2.2045454545454546,
"grad_norm": 0.34732770899892357,
"learning_rate": 8.907224218562219e-07,
"loss": 0.248,
"step": 776
},
{
"epoch": 2.2073863636363638,
"grad_norm": 0.34245013389418555,
"learning_rate": 8.847968969307283e-07,
"loss": 0.295,
"step": 777
},
{
"epoch": 2.210227272727273,
"grad_norm": 0.3893001282929315,
"learning_rate": 8.788869068760758e-07,
"loss": 0.2967,
"step": 778
},
{
"epoch": 2.2130681818181817,
"grad_norm": 0.48226856220499215,
"learning_rate": 8.729925085339929e-07,
"loss": 0.3055,
"step": 779
},
{
"epoch": 2.215909090909091,
"grad_norm": 0.36479498548889644,
"learning_rate": 8.67113758596245e-07,
"loss": 0.2944,
"step": 780
},
{
"epoch": 2.21875,
"grad_norm": 0.311563765449273,
"learning_rate": 8.612507136040926e-07,
"loss": 0.2208,
"step": 781
},
{
"epoch": 2.221590909090909,
"grad_norm": 0.39153959534391375,
"learning_rate": 8.554034299477506e-07,
"loss": 0.2955,
"step": 782
},
{
"epoch": 2.2244318181818183,
"grad_norm": 0.3752941766025436,
"learning_rate": 8.495719638658395e-07,
"loss": 0.2882,
"step": 783
},
{
"epoch": 2.227272727272727,
"grad_norm": 0.34306207357731855,
"learning_rate": 8.437563714448496e-07,
"loss": 0.2855,
"step": 784
},
{
"epoch": 2.2301136363636362,
"grad_norm": 0.2911256041409022,
"learning_rate": 8.379567086185989e-07,
"loss": 0.2245,
"step": 785
},
{
"epoch": 2.2329545454545454,
"grad_norm": 0.38423726910475914,
"learning_rate": 8.321730311676965e-07,
"loss": 0.2881,
"step": 786
},
{
"epoch": 2.2357954545454546,
"grad_norm": 0.28685959612362666,
"learning_rate": 8.264053947190051e-07,
"loss": 0.2168,
"step": 787
},
{
"epoch": 2.2386363636363638,
"grad_norm": 0.3177020831576707,
"learning_rate": 8.206538547451088e-07,
"loss": 0.2392,
"step": 788
},
{
"epoch": 2.241477272727273,
"grad_norm": 0.314674201211804,
"learning_rate": 8.149184665637746e-07,
"loss": 0.2244,
"step": 789
},
{
"epoch": 2.2443181818181817,
"grad_norm": 0.34609325605203806,
"learning_rate": 8.091992853374239e-07,
"loss": 0.2506,
"step": 790
},
{
"epoch": 2.247159090909091,
"grad_norm": 0.37417875469018747,
"learning_rate": 8.034963660726022e-07,
"loss": 0.297,
"step": 791
},
{
"epoch": 2.25,
"grad_norm": 0.4190001624824225,
"learning_rate": 7.978097636194482e-07,
"loss": 0.2822,
"step": 792
},
{
"epoch": 2.252840909090909,
"grad_norm": 0.31172594700443673,
"learning_rate": 7.921395326711664e-07,
"loss": 0.2277,
"step": 793
},
{
"epoch": 2.2556818181818183,
"grad_norm": 0.35515884644954326,
"learning_rate": 7.864857277635027e-07,
"loss": 0.252,
"step": 794
},
{
"epoch": 2.2585227272727275,
"grad_norm": 0.48510568393864467,
"learning_rate": 7.808484032742184e-07,
"loss": 0.2661,
"step": 795
},
{
"epoch": 2.2613636363636362,
"grad_norm": 0.40576550011180185,
"learning_rate": 7.75227613422567e-07,
"loss": 0.2624,
"step": 796
},
{
"epoch": 2.2642045454545454,
"grad_norm": 0.3153702935106711,
"learning_rate": 7.696234122687756e-07,
"loss": 0.2423,
"step": 797
},
{
"epoch": 2.2670454545454546,
"grad_norm": 0.45813794434618704,
"learning_rate": 7.640358537135214e-07,
"loss": 0.2773,
"step": 798
},
{
"epoch": 2.2698863636363638,
"grad_norm": 0.43799221687287815,
"learning_rate": 7.584649914974132e-07,
"loss": 0.2543,
"step": 799
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.36099400774254925,
"learning_rate": 7.5291087920048e-07,
"loss": 0.2554,
"step": 800
},
{
"epoch": 2.2755681818181817,
"grad_norm": 0.3681744190202427,
"learning_rate": 7.47373570241646e-07,
"loss": 0.2393,
"step": 801
},
{
"epoch": 2.278409090909091,
"grad_norm": 0.30088848462434675,
"learning_rate": 7.418531178782281e-07,
"loss": 0.2443,
"step": 802
},
{
"epoch": 2.28125,
"grad_norm": 0.36658882990515207,
"learning_rate": 7.363495752054145e-07,
"loss": 0.2716,
"step": 803
},
{
"epoch": 2.284090909090909,
"grad_norm": 0.3691396379554879,
"learning_rate": 7.30862995155758e-07,
"loss": 0.281,
"step": 804
},
{
"epoch": 2.2869318181818183,
"grad_norm": 0.3976865364065572,
"learning_rate": 7.25393430498669e-07,
"loss": 0.3126,
"step": 805
},
{
"epoch": 2.2897727272727275,
"grad_norm": 0.34972134382431147,
"learning_rate": 7.199409338399024e-07,
"loss": 0.2716,
"step": 806
},
{
"epoch": 2.2926136363636362,
"grad_norm": 0.359990470488163,
"learning_rate": 7.145055576210552e-07,
"loss": 0.282,
"step": 807
},
{
"epoch": 2.2954545454545454,
"grad_norm": 0.32127716098200765,
"learning_rate": 7.090873541190649e-07,
"loss": 0.2537,
"step": 808
},
{
"epoch": 2.2982954545454546,
"grad_norm": 0.3386422816466643,
"learning_rate": 7.036863754456985e-07,
"loss": 0.2663,
"step": 809
},
{
"epoch": 2.3011363636363638,
"grad_norm": 0.43294818109617667,
"learning_rate": 6.983026735470586e-07,
"loss": 0.3144,
"step": 810
},
{
"epoch": 2.3039772727272725,
"grad_norm": 0.3668974255373313,
"learning_rate": 6.929363002030829e-07,
"loss": 0.2665,
"step": 811
},
{
"epoch": 2.3068181818181817,
"grad_norm": 0.3372045903540735,
"learning_rate": 6.875873070270423e-07,
"loss": 0.2291,
"step": 812
},
{
"epoch": 2.309659090909091,
"grad_norm": 0.3686361653405783,
"learning_rate": 6.822557454650472e-07,
"loss": 0.3127,
"step": 813
},
{
"epoch": 2.3125,
"grad_norm": 0.3287416264369441,
"learning_rate": 6.769416667955545e-07,
"loss": 0.2497,
"step": 814
},
{
"epoch": 2.315340909090909,
"grad_norm": 0.378493696975223,
"learning_rate": 6.7164512212887e-07,
"loss": 0.2538,
"step": 815
},
{
"epoch": 2.3181818181818183,
"grad_norm": 0.3527906349071735,
"learning_rate": 6.6636616240666e-07,
"loss": 0.2759,
"step": 816
},
{
"epoch": 2.3210227272727275,
"grad_norm": 0.3283146351073707,
"learning_rate": 6.611048384014601e-07,
"loss": 0.2787,
"step": 817
},
{
"epoch": 2.3238636363636362,
"grad_norm": 0.4262766716182643,
"learning_rate": 6.558612007161876e-07,
"loss": 0.3367,
"step": 818
},
{
"epoch": 2.3267045454545454,
"grad_norm": 0.29243285573134076,
"learning_rate": 6.506352997836537e-07,
"loss": 0.2312,
"step": 819
},
{
"epoch": 2.3295454545454546,
"grad_norm": 0.3708515561207515,
"learning_rate": 6.454271858660816e-07,
"loss": 0.2947,
"step": 820
},
{
"epoch": 2.3323863636363638,
"grad_norm": 0.3031026424988807,
"learning_rate": 6.402369090546173e-07,
"loss": 0.2376,
"step": 821
},
{
"epoch": 2.3352272727272725,
"grad_norm": 0.40063837240803074,
"learning_rate": 6.350645192688531e-07,
"loss": 0.2706,
"step": 822
},
{
"epoch": 2.3380681818181817,
"grad_norm": 0.3931219211524187,
"learning_rate": 6.299100662563459e-07,
"loss": 0.2245,
"step": 823
},
{
"epoch": 2.340909090909091,
"grad_norm": 0.496053050631395,
"learning_rate": 6.247735995921375e-07,
"loss": 0.2665,
"step": 824
},
{
"epoch": 2.34375,
"grad_norm": 0.36983619426324377,
"learning_rate": 6.19655168678279e-07,
"loss": 0.2437,
"step": 825
},
{
"epoch": 2.346590909090909,
"grad_norm": 0.31853434490396093,
"learning_rate": 6.145548227433551e-07,
"loss": 0.237,
"step": 826
},
{
"epoch": 2.3494318181818183,
"grad_norm": 0.3833013165526796,
"learning_rate": 6.094726108420105e-07,
"loss": 0.2321,
"step": 827
},
{
"epoch": 2.3522727272727275,
"grad_norm": 0.34709948082141423,
"learning_rate": 6.044085818544807e-07,
"loss": 0.2435,
"step": 828
},
{
"epoch": 2.3551136363636362,
"grad_norm": 0.346027003213824,
"learning_rate": 5.993627844861172e-07,
"loss": 0.2536,
"step": 829
},
{
"epoch": 2.3579545454545454,
"grad_norm": 0.3350399776737133,
"learning_rate": 5.943352672669215e-07,
"loss": 0.2403,
"step": 830
},
{
"epoch": 2.3607954545454546,
"grad_norm": 0.32396672340715865,
"learning_rate": 5.89326078551081e-07,
"loss": 0.2213,
"step": 831
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.3844292483072848,
"learning_rate": 5.843352665164992e-07,
"loss": 0.249,
"step": 832
},
{
"epoch": 2.3664772727272725,
"grad_norm": 0.35019451009540753,
"learning_rate": 5.793628791643327e-07,
"loss": 0.2888,
"step": 833
},
{
"epoch": 2.3693181818181817,
"grad_norm": 0.3164025713303425,
"learning_rate": 5.744089643185355e-07,
"loss": 0.2515,
"step": 834
},
{
"epoch": 2.372159090909091,
"grad_norm": 0.3308520526667594,
"learning_rate": 5.69473569625392e-07,
"loss": 0.2587,
"step": 835
},
{
"epoch": 2.375,
"grad_norm": 0.3378919456195333,
"learning_rate": 5.645567425530607e-07,
"loss": 0.2433,
"step": 836
},
{
"epoch": 2.377840909090909,
"grad_norm": 0.3354025023866522,
"learning_rate": 5.596585303911217e-07,
"loss": 0.2542,
"step": 837
},
{
"epoch": 2.3806818181818183,
"grad_norm": 0.3756871055057431,
"learning_rate": 5.547789802501164e-07,
"loss": 0.2755,
"step": 838
},
{
"epoch": 2.3835227272727275,
"grad_norm": 0.3363888579054467,
"learning_rate": 5.499181390610958e-07,
"loss": 0.2545,
"step": 839
},
{
"epoch": 2.3863636363636362,
"grad_norm": 0.3726730517187886,
"learning_rate": 5.450760535751734e-07,
"loss": 0.2679,
"step": 840
},
{
"epoch": 2.3892045454545454,
"grad_norm": 0.34128788657594616,
"learning_rate": 5.402527703630681e-07,
"loss": 0.2744,
"step": 841
},
{
"epoch": 2.3920454545454546,
"grad_norm": 0.3112695417600679,
"learning_rate": 5.354483358146617e-07,
"loss": 0.2231,
"step": 842
},
{
"epoch": 2.3948863636363638,
"grad_norm": 0.34694374550516704,
"learning_rate": 5.306627961385538e-07,
"loss": 0.2571,
"step": 843
},
{
"epoch": 2.3977272727272725,
"grad_norm": 0.302981543192964,
"learning_rate": 5.258961973616117e-07,
"loss": 0.2427,
"step": 844
},
{
"epoch": 2.4005681818181817,
"grad_norm": 0.3008721863153869,
"learning_rate": 5.211485853285314e-07,
"loss": 0.2251,
"step": 845
},
{
"epoch": 2.403409090909091,
"grad_norm": 0.33302783458473956,
"learning_rate": 5.164200057013985e-07,
"loss": 0.2711,
"step": 846
},
{
"epoch": 2.40625,
"grad_norm": 0.3898327860869564,
"learning_rate": 5.117105039592444e-07,
"loss": 0.2869,
"step": 847
},
{
"epoch": 2.409090909090909,
"grad_norm": 0.3770328552305208,
"learning_rate": 5.070201253976115e-07,
"loss": 0.2777,
"step": 848
},
{
"epoch": 2.4119318181818183,
"grad_norm": 0.32513904942970184,
"learning_rate": 5.02348915128118e-07,
"loss": 0.2655,
"step": 849
},
{
"epoch": 2.4147727272727275,
"grad_norm": 0.3173329184832482,
"learning_rate": 4.976969180780225e-07,
"loss": 0.2398,
"step": 850
},
{
"epoch": 2.4176136363636362,
"grad_norm": 0.3853995789331807,
"learning_rate": 4.930641789897938e-07,
"loss": 0.2699,
"step": 851
},
{
"epoch": 2.4204545454545454,
"grad_norm": 0.3880784265747346,
"learning_rate": 4.884507424206788e-07,
"loss": 0.2649,
"step": 852
},
{
"epoch": 2.4232954545454546,
"grad_norm": 0.3710421332719178,
"learning_rate": 4.838566527422742e-07,
"loss": 0.2604,
"step": 853
},
{
"epoch": 2.4261363636363638,
"grad_norm": 0.42114780257384915,
"learning_rate": 4.792819541400998e-07,
"loss": 0.2982,
"step": 854
},
{
"epoch": 2.4289772727272725,
"grad_norm": 0.3704518376341159,
"learning_rate": 4.747266906131759e-07,
"loss": 0.2916,
"step": 855
},
{
"epoch": 2.4318181818181817,
"grad_norm": 0.3664937063178789,
"learning_rate": 4.7019090597359624e-07,
"loss": 0.2586,
"step": 856
},
{
"epoch": 2.434659090909091,
"grad_norm": 0.30129914419743803,
"learning_rate": 4.656746438461085e-07,
"loss": 0.233,
"step": 857
},
{
"epoch": 2.4375,
"grad_norm": 0.3610260573998573,
"learning_rate": 4.611779476676956e-07,
"loss": 0.2295,
"step": 858
},
{
"epoch": 2.440340909090909,
"grad_norm": 0.31555005162338934,
"learning_rate": 4.5670086068715564e-07,
"loss": 0.2324,
"step": 859
},
{
"epoch": 2.4431818181818183,
"grad_norm": 0.38647155996115823,
"learning_rate": 4.522434259646896e-07,
"loss": 0.2509,
"step": 860
},
{
"epoch": 2.4460227272727275,
"grad_norm": 0.3295294330692125,
"learning_rate": 4.4780568637148277e-07,
"loss": 0.2409,
"step": 861
},
{
"epoch": 2.4488636363636362,
"grad_norm": 0.40919134523297795,
"learning_rate": 4.4338768458929455e-07,
"loss": 0.2753,
"step": 862
},
{
"epoch": 2.4517045454545454,
"grad_norm": 0.3281509333195072,
"learning_rate": 4.3898946311005054e-07,
"loss": 0.2776,
"step": 863
},
{
"epoch": 2.4545454545454546,
"grad_norm": 0.3003640118064134,
"learning_rate": 4.346110642354284e-07,
"loss": 0.2288,
"step": 864
},
{
"epoch": 2.4573863636363638,
"grad_norm": 0.2856917980597871,
"learning_rate": 4.30252530076454e-07,
"loss": 0.2262,
"step": 865
},
{
"epoch": 2.4602272727272725,
"grad_norm": 0.3716917156792666,
"learning_rate": 4.259139025530981e-07,
"loss": 0.2704,
"step": 866
},
{
"epoch": 2.4630681818181817,
"grad_norm": 0.3646615055009088,
"learning_rate": 4.2159522339387027e-07,
"loss": 0.2422,
"step": 867
},
{
"epoch": 2.465909090909091,
"grad_norm": 0.352013885171188,
"learning_rate": 4.1729653413541795e-07,
"loss": 0.2586,
"step": 868
},
{
"epoch": 2.46875,
"grad_norm": 0.3341889105921635,
"learning_rate": 4.13017876122129e-07,
"loss": 0.2514,
"step": 869
},
{
"epoch": 2.471590909090909,
"grad_norm": 0.3436869862214985,
"learning_rate": 4.087592905057319e-07,
"loss": 0.2663,
"step": 870
},
{
"epoch": 2.4744318181818183,
"grad_norm": 0.3459285477446355,
"learning_rate": 4.0452081824490007e-07,
"loss": 0.2274,
"step": 871
},
{
"epoch": 2.4772727272727275,
"grad_norm": 0.39474776701060227,
"learning_rate": 4.0030250010486106e-07,
"loss": 0.2635,
"step": 872
},
{
"epoch": 2.4801136363636362,
"grad_norm": 0.3588162845171683,
"learning_rate": 3.9610437665699803e-07,
"loss": 0.2702,
"step": 873
},
{
"epoch": 2.4829545454545454,
"grad_norm": 0.3055170644052573,
"learning_rate": 3.919264882784662e-07,
"loss": 0.2642,
"step": 874
},
{
"epoch": 2.4857954545454546,
"grad_norm": 0.4004147388266674,
"learning_rate": 3.8776887515180215e-07,
"loss": 0.2673,
"step": 875
},
{
"epoch": 2.4886363636363638,
"grad_norm": 0.3435684772838886,
"learning_rate": 3.836315772645355e-07,
"loss": 0.2572,
"step": 876
},
{
"epoch": 2.4914772727272725,
"grad_norm": 0.3929983920357782,
"learning_rate": 3.79514634408806e-07,
"loss": 0.314,
"step": 877
},
{
"epoch": 2.4943181818181817,
"grad_norm": 0.3402456651574272,
"learning_rate": 3.7541808618098225e-07,
"loss": 0.2742,
"step": 878
},
{
"epoch": 2.497159090909091,
"grad_norm": 0.3391484648776555,
"learning_rate": 3.713419719812775e-07,
"loss": 0.2957,
"step": 879
},
{
"epoch": 2.5,
"grad_norm": 0.3300372482602716,
"learning_rate": 3.6728633101337283e-07,
"loss": 0.2402,
"step": 880
},
{
"epoch": 2.502840909090909,
"grad_norm": 0.3880324057454857,
"learning_rate": 3.632512022840401e-07,
"loss": 0.225,
"step": 881
},
{
"epoch": 2.5056818181818183,
"grad_norm": 0.40083562156829194,
"learning_rate": 3.592366246027654e-07,
"loss": 0.2885,
"step": 882
},
{
"epoch": 2.5085227272727275,
"grad_norm": 0.3898508645513151,
"learning_rate": 3.552426365813791e-07,
"loss": 0.279,
"step": 883
},
{
"epoch": 2.5113636363636362,
"grad_norm": 0.34747356344583896,
"learning_rate": 3.512692766336795e-07,
"loss": 0.2551,
"step": 884
},
{
"epoch": 2.5142045454545454,
"grad_norm": 0.3697878476145354,
"learning_rate": 3.4731658297506717e-07,
"loss": 0.2584,
"step": 885
},
{
"epoch": 2.5170454545454546,
"grad_norm": 0.3442593343497222,
"learning_rate": 3.433845936221772e-07,
"loss": 0.2323,
"step": 886
},
{
"epoch": 2.5198863636363638,
"grad_norm": 0.4052191198887473,
"learning_rate": 3.394733463925115e-07,
"loss": 0.2895,
"step": 887
},
{
"epoch": 2.5227272727272725,
"grad_norm": 0.3639886136390821,
"learning_rate": 3.355828789040752e-07,
"loss": 0.276,
"step": 888
},
{
"epoch": 2.5255681818181817,
"grad_norm": 0.39883666474289897,
"learning_rate": 3.3171322857501796e-07,
"loss": 0.2858,
"step": 889
},
{
"epoch": 2.528409090909091,
"grad_norm": 0.40889869044433336,
"learning_rate": 3.278644326232713e-07,
"loss": 0.257,
"step": 890
},
{
"epoch": 2.53125,
"grad_norm": 0.3284086126915543,
"learning_rate": 3.2403652806619e-07,
"loss": 0.2699,
"step": 891
},
{
"epoch": 2.534090909090909,
"grad_norm": 0.3806103148982155,
"learning_rate": 3.2022955172019947e-07,
"loss": 0.2607,
"step": 892
},
{
"epoch": 2.5369318181818183,
"grad_norm": 0.414262076377764,
"learning_rate": 3.1644354020043846e-07,
"loss": 0.2709,
"step": 893
},
{
"epoch": 2.5397727272727275,
"grad_norm": 0.3564646964673218,
"learning_rate": 3.1267852992040715e-07,
"loss": 0.2845,
"step": 894
},
{
"epoch": 2.5426136363636362,
"grad_norm": 0.35912306046922576,
"learning_rate": 3.0893455709162023e-07,
"loss": 0.2466,
"step": 895
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.3605968532309376,
"learning_rate": 3.052116577232533e-07,
"loss": 0.2868,
"step": 896
},
{
"epoch": 2.5482954545454546,
"grad_norm": 0.4129969651465434,
"learning_rate": 3.015098676218009e-07,
"loss": 0.2738,
"step": 897
},
{
"epoch": 2.5511363636363638,
"grad_norm": 0.38800081862705826,
"learning_rate": 2.9782922239073084e-07,
"loss": 0.274,
"step": 898
},
{
"epoch": 2.5539772727272725,
"grad_norm": 0.28725463626604075,
"learning_rate": 2.9416975743014134e-07,
"loss": 0.246,
"step": 899
},
{
"epoch": 2.5568181818181817,
"grad_norm": 0.33194490572792595,
"learning_rate": 2.9053150793642013e-07,
"loss": 0.2418,
"step": 900
},
{
"epoch": 2.559659090909091,
"grad_norm": 0.31927368240055043,
"learning_rate": 2.8691450890190794e-07,
"loss": 0.259,
"step": 901
},
{
"epoch": 2.5625,
"grad_norm": 0.4514488260064792,
"learning_rate": 2.833187951145588e-07,
"loss": 0.2674,
"step": 902
},
{
"epoch": 2.565340909090909,
"grad_norm": 0.3952590748072181,
"learning_rate": 2.797444011576089e-07,
"loss": 0.2764,
"step": 903
},
{
"epoch": 2.5681818181818183,
"grad_norm": 0.3035956390116324,
"learning_rate": 2.7619136140924153e-07,
"loss": 0.2361,
"step": 904
},
{
"epoch": 2.5710227272727275,
"grad_norm": 0.365463810965996,
"learning_rate": 2.726597100422565e-07,
"loss": 0.2955,
"step": 905
},
{
"epoch": 2.5738636363636362,
"grad_norm": 0.37417152902560946,
"learning_rate": 2.6914948102374384e-07,
"loss": 0.3007,
"step": 906
},
{
"epoch": 2.5767045454545454,
"grad_norm": 0.36872656495257466,
"learning_rate": 2.656607081147547e-07,
"loss": 0.2647,
"step": 907
},
{
"epoch": 2.5795454545454546,
"grad_norm": 0.381314240650295,
"learning_rate": 2.621934248699767e-07,
"loss": 0.3176,
"step": 908
},
{
"epoch": 2.5823863636363638,
"grad_norm": 0.41529466546435734,
"learning_rate": 2.5874766463741263e-07,
"loss": 0.2482,
"step": 909
},
{
"epoch": 2.5852272727272725,
"grad_norm": 0.4258649726301599,
"learning_rate": 2.553234605580593e-07,
"loss": 0.2618,
"step": 910
},
{
"epoch": 2.5880681818181817,
"grad_norm": 0.3762825021021476,
"learning_rate": 2.5192084556558776e-07,
"loss": 0.2914,
"step": 911
},
{
"epoch": 2.590909090909091,
"grad_norm": 0.3627506684619514,
"learning_rate": 2.4853985238602745e-07,
"loss": 0.2875,
"step": 912
},
{
"epoch": 2.59375,
"grad_norm": 0.3173651745814326,
"learning_rate": 2.451805135374516e-07,
"loss": 0.2421,
"step": 913
},
{
"epoch": 2.596590909090909,
"grad_norm": 0.44802208559240897,
"learning_rate": 2.4184286132966305e-07,
"loss": 0.2803,
"step": 914
},
{
"epoch": 2.5994318181818183,
"grad_norm": 0.36772649044669337,
"learning_rate": 2.3852692786388634e-07,
"loss": 0.3018,
"step": 915
},
{
"epoch": 2.6022727272727275,
"grad_norm": 0.3473737442586536,
"learning_rate": 2.3523274503245624e-07,
"loss": 0.2565,
"step": 916
},
{
"epoch": 2.6051136363636362,
"grad_norm": 0.31723371911082704,
"learning_rate": 2.319603445185109e-07,
"loss": 0.2769,
"step": 917
},
{
"epoch": 2.6079545454545454,
"grad_norm": 0.36837062880150556,
"learning_rate": 2.2870975779569066e-07,
"loss": 0.294,
"step": 918
},
{
"epoch": 2.6107954545454546,
"grad_norm": 0.34124708806422904,
"learning_rate": 2.2548101612783147e-07,
"loss": 0.2516,
"step": 919
},
{
"epoch": 2.6136363636363638,
"grad_norm": 0.3202170151424555,
"learning_rate": 2.2227415056866431e-07,
"loss": 0.254,
"step": 920
},
{
"epoch": 2.6164772727272725,
"grad_norm": 0.4260342271233267,
"learning_rate": 2.1908919196152013e-07,
"loss": 0.2719,
"step": 921
},
{
"epoch": 2.6193181818181817,
"grad_norm": 0.37728441327420986,
"learning_rate": 2.1592617093902978e-07,
"loss": 0.2753,
"step": 922
},
{
"epoch": 2.622159090909091,
"grad_norm": 0.39060937907330195,
"learning_rate": 2.1278511792283018e-07,
"loss": 0.2947,
"step": 923
},
{
"epoch": 2.625,
"grad_norm": 0.30888479325881507,
"learning_rate": 2.0966606312327303e-07,
"loss": 0.2284,
"step": 924
},
{
"epoch": 2.627840909090909,
"grad_norm": 0.40561974710485005,
"learning_rate": 2.065690365391329e-07,
"loss": 0.2943,
"step": 925
},
{
"epoch": 2.6306818181818183,
"grad_norm": 0.355886681039042,
"learning_rate": 2.0349406795731774e-07,
"loss": 0.2462,
"step": 926
},
{
"epoch": 2.6335227272727275,
"grad_norm": 0.37901081172880524,
"learning_rate": 2.0044118695258657e-07,
"loss": 0.2918,
"step": 927
},
{
"epoch": 2.6363636363636362,
"grad_norm": 0.48522777901179487,
"learning_rate": 1.9741042288725893e-07,
"loss": 0.3463,
"step": 928
},
{
"epoch": 2.6392045454545454,
"grad_norm": 0.35552067688931177,
"learning_rate": 1.944018049109375e-07,
"loss": 0.2589,
"step": 929
},
{
"epoch": 2.6420454545454546,
"grad_norm": 0.3245196964527464,
"learning_rate": 1.9141536196022658e-07,
"loss": 0.2667,
"step": 930
},
{
"epoch": 2.6448863636363638,
"grad_norm": 0.397373448701769,
"learning_rate": 1.884511227584518e-07,
"loss": 0.2635,
"step": 931
},
{
"epoch": 2.6477272727272725,
"grad_norm": 0.3230165219575403,
"learning_rate": 1.8550911581538517e-07,
"loss": 0.2524,
"step": 932
},
{
"epoch": 2.6505681818181817,
"grad_norm": 0.3201491067518106,
"learning_rate": 1.825893694269723e-07,
"loss": 0.2704,
"step": 933
},
{
"epoch": 2.653409090909091,
"grad_norm": 0.3806372642940993,
"learning_rate": 1.7969191167505811e-07,
"loss": 0.2891,
"step": 934
},
{
"epoch": 2.65625,
"grad_norm": 0.3315048294973883,
"learning_rate": 1.7681677042711732e-07,
"loss": 0.2469,
"step": 935
},
{
"epoch": 2.659090909090909,
"grad_norm": 0.3429832481491404,
"learning_rate": 1.7396397333598657e-07,
"loss": 0.2344,
"step": 936
},
{
"epoch": 2.6619318181818183,
"grad_norm": 0.31805225672924486,
"learning_rate": 1.711335478395984e-07,
"loss": 0.2301,
"step": 937
},
{
"epoch": 2.6647727272727275,
"grad_norm": 0.347431193735004,
"learning_rate": 1.6832552116071905e-07,
"loss": 0.274,
"step": 938
},
{
"epoch": 2.6676136363636362,
"grad_norm": 0.3276581659477082,
"learning_rate": 1.6553992030668293e-07,
"loss": 0.2569,
"step": 939
},
{
"epoch": 2.6704545454545454,
"grad_norm": 0.4181936566989231,
"learning_rate": 1.6277677206913588e-07,
"loss": 0.2737,
"step": 940
},
{
"epoch": 2.6732954545454546,
"grad_norm": 0.37610721012897674,
"learning_rate": 1.6003610302377708e-07,
"loss": 0.2999,
"step": 941
},
{
"epoch": 2.6761363636363638,
"grad_norm": 0.33046264353939814,
"learning_rate": 1.5731793953010193e-07,
"loss": 0.2427,
"step": 942
},
{
"epoch": 2.6789772727272725,
"grad_norm": 0.3494974820800891,
"learning_rate": 1.5462230773115066e-07,
"loss": 0.264,
"step": 943
},
{
"epoch": 2.6818181818181817,
"grad_norm": 0.3468159326122336,
"learning_rate": 1.5194923355325464e-07,
"loss": 0.3076,
"step": 944
},
{
"epoch": 2.684659090909091,
"grad_norm": 0.40045232274054987,
"learning_rate": 1.492987427057893e-07,
"loss": 0.3051,
"step": 945
},
{
"epoch": 2.6875,
"grad_norm": 0.4030575958079979,
"learning_rate": 1.4667086068092446e-07,
"loss": 0.2437,
"step": 946
},
{
"epoch": 2.690340909090909,
"grad_norm": 0.34082328928674294,
"learning_rate": 1.440656127533821e-07,
"loss": 0.2501,
"step": 947
},
{
"epoch": 2.6931818181818183,
"grad_norm": 0.34010796962843276,
"learning_rate": 1.414830239801898e-07,
"loss": 0.27,
"step": 948
},
{
"epoch": 2.6960227272727275,
"grad_norm": 0.4274695728838406,
"learning_rate": 1.3892311920044282e-07,
"loss": 0.2964,
"step": 949
},
{
"epoch": 2.6988636363636362,
"grad_norm": 0.35443571450269734,
"learning_rate": 1.3638592303506364e-07,
"loss": 0.252,
"step": 950
},
{
"epoch": 2.7017045454545454,
"grad_norm": 0.40737204314859,
"learning_rate": 1.3387145988656537e-07,
"loss": 0.2891,
"step": 951
},
{
"epoch": 2.7045454545454546,
"grad_norm": 0.352138799387513,
"learning_rate": 1.313797539388159e-07,
"loss": 0.2439,
"step": 952
},
{
"epoch": 2.7073863636363638,
"grad_norm": 0.33845536331763004,
"learning_rate": 1.2891082915680864e-07,
"loss": 0.2802,
"step": 953
},
{
"epoch": 2.7102272727272725,
"grad_norm": 0.35504925601892684,
"learning_rate": 1.264647092864288e-07,
"loss": 0.2514,
"step": 954
},
{
"epoch": 2.7130681818181817,
"grad_norm": 0.3609121713893806,
"learning_rate": 1.2404141785422568e-07,
"loss": 0.25,
"step": 955
},
{
"epoch": 2.715909090909091,
"grad_norm": 0.3936221787085924,
"learning_rate": 1.2164097816718818e-07,
"loss": 0.2312,
"step": 956
},
{
"epoch": 2.71875,
"grad_norm": 0.38365034429115125,
"learning_rate": 1.1926341331251756e-07,
"loss": 0.2682,
"step": 957
},
{
"epoch": 2.721590909090909,
"grad_norm": 0.31959559051327435,
"learning_rate": 1.169087461574081e-07,
"loss": 0.2457,
"step": 958
},
{
"epoch": 2.7244318181818183,
"grad_norm": 0.3799557870602865,
"learning_rate": 1.1457699934882715e-07,
"loss": 0.2968,
"step": 959
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.27723093935677195,
"learning_rate": 1.1226819531329342e-07,
"loss": 0.2219,
"step": 960
},
{
"epoch": 2.7301136363636362,
"grad_norm": 0.3534828660456155,
"learning_rate": 1.0998235625666708e-07,
"loss": 0.2433,
"step": 961
},
{
"epoch": 2.7329545454545454,
"grad_norm": 0.35791787748576426,
"learning_rate": 1.0771950416393228e-07,
"loss": 0.2597,
"step": 962
},
{
"epoch": 2.7357954545454546,
"grad_norm": 0.4475649717820448,
"learning_rate": 1.0547966079898637e-07,
"loss": 0.2636,
"step": 963
},
{
"epoch": 2.7386363636363638,
"grad_norm": 0.39027504830647813,
"learning_rate": 1.0326284770443063e-07,
"loss": 0.2728,
"step": 964
},
{
"epoch": 2.7414772727272725,
"grad_norm": 0.4315977251477179,
"learning_rate": 1.0106908620136525e-07,
"loss": 0.2588,
"step": 965
},
{
"epoch": 2.7443181818181817,
"grad_norm": 0.32246952155074843,
"learning_rate": 9.889839738918022e-08,
"loss": 0.2369,
"step": 966
},
{
"epoch": 2.747159090909091,
"grad_norm": 0.3333508436923039,
"learning_rate": 9.675080214535559e-08,
"loss": 0.2574,
"step": 967
},
{
"epoch": 2.75,
"grad_norm": 0.3654840156563527,
"learning_rate": 9.46263211252596e-08,
"loss": 0.3222,
"step": 968
},
{
"epoch": 2.752840909090909,
"grad_norm": 0.3366414190912868,
"learning_rate": 9.252497476194972e-08,
"loss": 0.2926,
"step": 969
},
{
"epoch": 2.7556818181818183,
"grad_norm": 0.3243823618475195,
"learning_rate": 9.044678326597722e-08,
"loss": 0.2484,
"step": 970
},
{
"epoch": 2.7585227272727275,
"grad_norm": 0.34777278160161157,
"learning_rate": 8.839176662519155e-08,
"loss": 0.2349,
"step": 971
},
{
"epoch": 2.7613636363636362,
"grad_norm": 0.34671371366502046,
"learning_rate": 8.635994460454766e-08,
"loss": 0.2574,
"step": 972
},
{
"epoch": 2.7642045454545454,
"grad_norm": 0.38617683116302787,
"learning_rate": 8.435133674591922e-08,
"loss": 0.3007,
"step": 973
},
{
"epoch": 2.7670454545454546,
"grad_norm": 0.4218961579649425,
"learning_rate": 8.2365962367906e-08,
"loss": 0.2916,
"step": 974
},
{
"epoch": 2.7698863636363638,
"grad_norm": 0.3971792338298757,
"learning_rate": 8.040384056565098e-08,
"loss": 0.2563,
"step": 975
},
{
"epoch": 2.7727272727272725,
"grad_norm": 0.3226524769417545,
"learning_rate": 7.846499021065684e-08,
"loss": 0.266,
"step": 976
},
{
"epoch": 2.7755681818181817,
"grad_norm": 0.3540519465775941,
"learning_rate": 7.654942995060283e-08,
"loss": 0.2616,
"step": 977
},
{
"epoch": 2.778409090909091,
"grad_norm": 0.3781537081979966,
"learning_rate": 7.465717820916624e-08,
"loss": 0.2698,
"step": 978
},
{
"epoch": 2.78125,
"grad_norm": 0.3564755050368105,
"learning_rate": 7.278825318584647e-08,
"loss": 0.27,
"step": 979
},
{
"epoch": 2.784090909090909,
"grad_norm": 0.3510249393237661,
"learning_rate": 7.094267285578688e-08,
"loss": 0.2666,
"step": 980
},
{
"epoch": 2.7869318181818183,
"grad_norm": 0.3998246424539849,
"learning_rate": 6.912045496960507e-08,
"loss": 0.2851,
"step": 981
},
{
"epoch": 2.7897727272727275,
"grad_norm": 0.37123966300816885,
"learning_rate": 6.732161705322093e-08,
"loss": 0.2528,
"step": 982
},
{
"epoch": 2.7926136363636362,
"grad_norm": 0.32607742324666744,
"learning_rate": 6.554617640768674e-08,
"loss": 0.2682,
"step": 983
},
{
"epoch": 2.7954545454545454,
"grad_norm": 0.3091883263291907,
"learning_rate": 6.379415010902362e-08,
"loss": 0.2431,
"step": 984
},
{
"epoch": 2.7982954545454546,
"grad_norm": 0.3896435979654701,
"learning_rate": 6.206555500805455e-08,
"loss": 0.2662,
"step": 985
},
{
"epoch": 2.8011363636363638,
"grad_norm": 0.36244662485716045,
"learning_rate": 6.036040773024387e-08,
"loss": 0.2708,
"step": 986
},
{
"epoch": 2.8039772727272725,
"grad_norm": 0.3558651773572941,
"learning_rate": 5.867872467553715e-08,
"loss": 0.3004,
"step": 987
},
{
"epoch": 2.8068181818181817,
"grad_norm": 0.37311773304851065,
"learning_rate": 5.702052201820352e-08,
"loss": 0.3088,
"step": 988
},
{
"epoch": 2.809659090909091,
"grad_norm": 0.411421481665237,
"learning_rate": 5.5385815706678894e-08,
"loss": 0.2923,
"step": 989
},
{
"epoch": 2.8125,
"grad_norm": 0.3759229007631887,
"learning_rate": 5.377462146341439e-08,
"loss": 0.2945,
"step": 990
},
{
"epoch": 2.815340909090909,
"grad_norm": 0.3014861546323833,
"learning_rate": 5.218695478472397e-08,
"loss": 0.2119,
"step": 991
},
{
"epoch": 2.8181818181818183,
"grad_norm": 0.4021583403485505,
"learning_rate": 5.062283094063536e-08,
"loss": 0.2878,
"step": 992
},
{
"epoch": 2.8210227272727275,
"grad_norm": 0.3293364475828707,
"learning_rate": 4.9082264974744665e-08,
"loss": 0.266,
"step": 993
},
{
"epoch": 2.8238636363636362,
"grad_norm": 0.30933470398564117,
"learning_rate": 4.756527170406922e-08,
"loss": 0.2314,
"step": 994
},
{
"epoch": 2.8267045454545454,
"grad_norm": 0.37909174739130147,
"learning_rate": 4.607186571890715e-08,
"loss": 0.2667,
"step": 995
},
{
"epoch": 2.8295454545454546,
"grad_norm": 0.37878603560502083,
"learning_rate": 4.46020613826964e-08,
"loss": 0.2937,
"step": 996
},
{
"epoch": 2.8323863636363638,
"grad_norm": 0.408496513297682,
"learning_rate": 4.3155872831875946e-08,
"loss": 0.2757,
"step": 997
},
{
"epoch": 2.8352272727272725,
"grad_norm": 0.3566593848752578,
"learning_rate": 4.1733313975750586e-08,
"loss": 0.2584,
"step": 998
},
{
"epoch": 2.8380681818181817,
"grad_norm": 0.352150696238673,
"learning_rate": 4.033439849635695e-08,
"loss": 0.2115,
"step": 999
},
{
"epoch": 2.840909090909091,
"grad_norm": 0.39392089147895293,
"learning_rate": 3.895913984833216e-08,
"loss": 0.2816,
"step": 1000
},
{
"epoch": 2.84375,
"grad_norm": 0.3412262767323334,
"learning_rate": 3.760755125878368e-08,
"loss": 0.2431,
"step": 1001
},
{
"epoch": 2.846590909090909,
"grad_norm": 0.3325324503502811,
"learning_rate": 3.627964572716331e-08,
"loss": 0.264,
"step": 1002
},
{
"epoch": 2.8494318181818183,
"grad_norm": 0.35296040111990046,
"learning_rate": 3.497543602514059e-08,
"loss": 0.2614,
"step": 1003
},
{
"epoch": 2.8522727272727275,
"grad_norm": 0.2837474483774213,
"learning_rate": 3.3694934696481275e-08,
"loss": 0.2123,
"step": 1004
},
{
"epoch": 2.8551136363636362,
"grad_norm": 0.34272040018575495,
"learning_rate": 3.24381540569263e-08,
"loss": 0.2808,
"step": 1005
},
{
"epoch": 2.8579545454545454,
"grad_norm": 0.3498353760521046,
"learning_rate": 3.120510619407324e-08,
"loss": 0.251,
"step": 1006
},
{
"epoch": 2.8607954545454546,
"grad_norm": 0.4069913912888687,
"learning_rate": 2.9995802967259516e-08,
"loss": 0.316,
"step": 1007
},
{
"epoch": 2.8636363636363638,
"grad_norm": 0.3361233831001831,
"learning_rate": 2.8810256007449632e-08,
"loss": 0.2293,
"step": 1008
},
{
"epoch": 2.8664772727272725,
"grad_norm": 0.4519558529396144,
"learning_rate": 2.7648476717122287e-08,
"loss": 0.2792,
"step": 1009
},
{
"epoch": 2.8693181818181817,
"grad_norm": 0.409732040720535,
"learning_rate": 2.651047627016068e-08,
"loss": 0.2904,
"step": 1010
},
{
"epoch": 2.872159090909091,
"grad_norm": 0.3250171306579268,
"learning_rate": 2.5396265611745687e-08,
"loss": 0.2463,
"step": 1011
},
{
"epoch": 2.875,
"grad_norm": 0.3856346320602474,
"learning_rate": 2.4305855458250373e-08,
"loss": 0.2356,
"step": 1012
},
{
"epoch": 2.877840909090909,
"grad_norm": 0.3526716263439721,
"learning_rate": 2.3239256297136193e-08,
"loss": 0.258,
"step": 1013
},
{
"epoch": 2.8806818181818183,
"grad_norm": 0.41510762650616695,
"learning_rate": 2.2196478386853624e-08,
"loss": 0.3018,
"step": 1014
},
{
"epoch": 2.8835227272727275,
"grad_norm": 0.2827340090469283,
"learning_rate": 2.117753175674142e-08,
"loss": 0.1949,
"step": 1015
},
{
"epoch": 2.8863636363636362,
"grad_norm": 0.42491879871002564,
"learning_rate": 2.0182426206932503e-08,
"loss": 0.2607,
"step": 1016
},
{
"epoch": 2.8892045454545454,
"grad_norm": 0.3281820518654654,
"learning_rate": 1.921117130825767e-08,
"loss": 0.266,
"step": 1017
},
{
"epoch": 2.8920454545454546,
"grad_norm": 0.5241869397210815,
"learning_rate": 1.82637764021551e-08,
"loss": 0.2566,
"step": 1018
},
{
"epoch": 2.8948863636363638,
"grad_norm": 0.36254656882284764,
"learning_rate": 1.7340250600579588e-08,
"loss": 0.2683,
"step": 1019
},
{
"epoch": 2.8977272727272725,
"grad_norm": 0.32113348760758087,
"learning_rate": 1.6440602785914584e-08,
"loss": 0.2495,
"step": 1020
},
{
"epoch": 2.9005681818181817,
"grad_norm": 0.39293475539987827,
"learning_rate": 1.556484161088806e-08,
"loss": 0.2673,
"step": 1021
},
{
"epoch": 2.903409090909091,
"grad_norm": 0.3692023050105476,
"learning_rate": 1.4712975498488158e-08,
"loss": 0.2676,
"step": 1022
},
{
"epoch": 2.90625,
"grad_norm": 0.3301143389304983,
"learning_rate": 1.3885012641882967e-08,
"loss": 0.2549,
"step": 1023
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.35000768054036296,
"learning_rate": 1.3080961004340308e-08,
"loss": 0.2769,
"step": 1024
},
{
"epoch": 2.9119318181818183,
"grad_norm": 0.3257952826732702,
"learning_rate": 1.2300828319153635e-08,
"loss": 0.2455,
"step": 1025
},
{
"epoch": 2.9147727272727275,
"grad_norm": 0.40990948777245817,
"learning_rate": 1.1544622089565139e-08,
"loss": 0.2999,
"step": 1026
},
{
"epoch": 2.9176136363636362,
"grad_norm": 0.5767979073587421,
"learning_rate": 1.0812349588694426e-08,
"loss": 0.2985,
"step": 1027
},
{
"epoch": 2.9204545454545454,
"grad_norm": 0.39277960762361686,
"learning_rate": 1.010401785947024e-08,
"loss": 0.3085,
"step": 1028
},
{
"epoch": 2.9232954545454546,
"grad_norm": 0.3487911223998262,
"learning_rate": 9.419633714559118e-09,
"loss": 0.2771,
"step": 1029
},
{
"epoch": 2.9261363636363638,
"grad_norm": 0.35424561590037207,
"learning_rate": 8.759203736304067e-09,
"loss": 0.2753,
"step": 1030
},
{
"epoch": 2.9289772727272725,
"grad_norm": 0.29485879447799396,
"learning_rate": 8.122734276657384e-09,
"loss": 0.2089,
"step": 1031
},
{
"epoch": 2.9318181818181817,
"grad_norm": 0.6294311065483419,
"learning_rate": 7.51023145712293e-09,
"loss": 0.3052,
"step": 1032
},
{
"epoch": 2.934659090909091,
"grad_norm": 0.354680706106559,
"learning_rate": 6.921701168694228e-09,
"loss": 0.2638,
"step": 1033
},
{
"epoch": 2.9375,
"grad_norm": 0.31404010890218703,
"learning_rate": 6.357149071800628e-09,
"loss": 0.2396,
"step": 1034
},
{
"epoch": 2.940340909090909,
"grad_norm": 0.3191510959590836,
"learning_rate": 5.816580596250676e-09,
"loss": 0.2652,
"step": 1035
},
{
"epoch": 2.9431818181818183,
"grad_norm": 0.33936071059872674,
"learning_rate": 5.300000941180494e-09,
"loss": 0.2761,
"step": 1036
},
{
"epoch": 2.9460227272727275,
"grad_norm": 0.6694940206582203,
"learning_rate": 4.807415075005206e-09,
"loss": 0.2716,
"step": 1037
},
{
"epoch": 2.9488636363636362,
"grad_norm": 0.3022654996639677,
"learning_rate": 4.338827735368423e-09,
"loss": 0.267,
"step": 1038
},
{
"epoch": 2.9517045454545454,
"grad_norm": 0.31223716729746726,
"learning_rate": 3.894243429098943e-09,
"loss": 0.2556,
"step": 1039
},
{
"epoch": 2.9545454545454546,
"grad_norm": 0.33999761359381697,
"learning_rate": 3.4736664321671777e-09,
"loss": 0.2234,
"step": 1040
},
{
"epoch": 2.9573863636363638,
"grad_norm": 0.38818142260184346,
"learning_rate": 3.0771007896424066e-09,
"loss": 0.2822,
"step": 1041
},
{
"epoch": 2.9602272727272725,
"grad_norm": 0.3915644733747401,
"learning_rate": 2.7045503156555853e-09,
"loss": 0.3089,
"step": 1042
},
{
"epoch": 2.9630681818181817,
"grad_norm": 0.35070734375473045,
"learning_rate": 2.3560185933621526e-09,
"loss": 0.2485,
"step": 1043
},
{
"epoch": 2.965909090909091,
"grad_norm": 0.32676962221864597,
"learning_rate": 2.031508974907337e-09,
"loss": 0.2564,
"step": 1044
},
{
"epoch": 2.96875,
"grad_norm": 0.37376434665996433,
"learning_rate": 1.7310245813939586e-09,
"loss": 0.2843,
"step": 1045
},
{
"epoch": 2.971590909090909,
"grad_norm": 0.3812123549505928,
"learning_rate": 1.4545683028521772e-09,
"loss": 0.2642,
"step": 1046
},
{
"epoch": 2.9744318181818183,
"grad_norm": 0.40366173461812144,
"learning_rate": 1.2021427982128463e-09,
"loss": 0.2714,
"step": 1047
},
{
"epoch": 2.9772727272727275,
"grad_norm": 0.38234650853272395,
"learning_rate": 9.737504952803124e-10,
"loss": 0.2483,
"step": 1048
},
{
"epoch": 2.9801136363636362,
"grad_norm": 0.3581632163317752,
"learning_rate": 7.693935907102102e-10,
"loss": 0.2448,
"step": 1049
},
{
"epoch": 2.9829545454545454,
"grad_norm": 0.44654505449503146,
"learning_rate": 5.890740499878145e-10,
"loss": 0.295,
"step": 1050
},
{
"epoch": 2.9857954545454546,
"grad_norm": 0.33560840489821,
"learning_rate": 4.3279360740972053e-10,
"loss": 0.2217,
"step": 1051
},
{
"epoch": 2.9886363636363638,
"grad_norm": 0.3283855292339783,
"learning_rate": 3.005537660663582e-10,
"loss": 0.219,
"step": 1052
},
{
"epoch": 2.9914772727272725,
"grad_norm": 0.35996516047736465,
"learning_rate": 1.923557978281143e-10,
"loss": 0.2571,
"step": 1053
},
{
"epoch": 2.9943181818181817,
"grad_norm": 0.30525860331677324,
"learning_rate": 1.0820074333256492e-10,
"loss": 0.2571,
"step": 1054
},
{
"epoch": 2.997159090909091,
"grad_norm": 0.37025001234738963,
"learning_rate": 4.808941197531614e-11,
"loss": 0.269,
"step": 1055
},
{
"epoch": 3.0,
"grad_norm": 0.3619551402376093,
"learning_rate": 1.2022381901399815e-11,
"loss": 0.24,
"step": 1056
},
{
"epoch": 3.0,
"step": 1056,
"total_flos": 1454552492015616.0,
"train_loss": 0.28625056774101476,
"train_runtime": 131042.0993,
"train_samples_per_second": 0.258,
"train_steps_per_second": 0.008
}
],
"logging_steps": 1.0,
"max_steps": 1056,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1454552492015616.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}