Light-R1 / trainer_state.json
Lingyue1's picture
Upload folder using huggingface_hub
c66737e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.95475113122172,
"eval_steps": 500,
"global_step": 550,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01809954751131222,
"grad_norm": 1.6741957199641677,
"learning_rate": 8.333333333333333e-07,
"loss": 0.392,
"step": 1
},
{
"epoch": 0.03619909502262444,
"grad_norm": 1.526970859287005,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.3479,
"step": 2
},
{
"epoch": 0.05429864253393665,
"grad_norm": 1.8103690939719148,
"learning_rate": 2.5e-06,
"loss": 0.363,
"step": 3
},
{
"epoch": 0.07239819004524888,
"grad_norm": 1.568077888738942,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.3513,
"step": 4
},
{
"epoch": 0.09049773755656108,
"grad_norm": 1.668945098216231,
"learning_rate": 4.166666666666667e-06,
"loss": 0.3759,
"step": 5
},
{
"epoch": 0.1085972850678733,
"grad_norm": 1.3864660758192329,
"learning_rate": 5e-06,
"loss": 0.3525,
"step": 6
},
{
"epoch": 0.12669683257918551,
"grad_norm": 1.538592504007101,
"learning_rate": 4.99995831202958e-06,
"loss": 0.3904,
"step": 7
},
{
"epoch": 0.14479638009049775,
"grad_norm": 1.2047351614977708,
"learning_rate": 4.999833249508629e-06,
"loss": 0.3924,
"step": 8
},
{
"epoch": 0.16289592760180996,
"grad_norm": 1.0640124047316322,
"learning_rate": 4.999624816608027e-06,
"loss": 0.375,
"step": 9
},
{
"epoch": 0.18099547511312217,
"grad_norm": 0.7966517341350207,
"learning_rate": 4.999333020279094e-06,
"loss": 0.356,
"step": 10
},
{
"epoch": 0.19909502262443438,
"grad_norm": 0.4554353875165799,
"learning_rate": 4.998957870253344e-06,
"loss": 0.3598,
"step": 11
},
{
"epoch": 0.2171945701357466,
"grad_norm": 0.6557533564712539,
"learning_rate": 4.998499379042172e-06,
"loss": 0.3392,
"step": 12
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.7936629840883419,
"learning_rate": 4.997957561936433e-06,
"loss": 0.3691,
"step": 13
},
{
"epoch": 0.25339366515837103,
"grad_norm": 0.7547277609627707,
"learning_rate": 4.997332437005932e-06,
"loss": 0.352,
"step": 14
},
{
"epoch": 0.27149321266968324,
"grad_norm": 0.8087501558896228,
"learning_rate": 4.996624025098819e-06,
"loss": 0.3449,
"step": 15
},
{
"epoch": 0.2895927601809955,
"grad_norm": 0.7820896976667914,
"learning_rate": 4.9958323498409e-06,
"loss": 0.3401,
"step": 16
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.7431270073814646,
"learning_rate": 4.99495743763484e-06,
"loss": 0.3567,
"step": 17
},
{
"epoch": 0.3257918552036199,
"grad_norm": 0.6777032410783791,
"learning_rate": 4.993999317659293e-06,
"loss": 0.3585,
"step": 18
},
{
"epoch": 0.3438914027149321,
"grad_norm": 0.6196369624534765,
"learning_rate": 4.9929580218679195e-06,
"loss": 0.3293,
"step": 19
},
{
"epoch": 0.36199095022624433,
"grad_norm": 0.5604472586874513,
"learning_rate": 4.991833584988326e-06,
"loss": 0.3437,
"step": 20
},
{
"epoch": 0.38009049773755654,
"grad_norm": 0.5137629265098744,
"learning_rate": 4.990626044520905e-06,
"loss": 0.3249,
"step": 21
},
{
"epoch": 0.39819004524886875,
"grad_norm": 0.547237003947588,
"learning_rate": 4.989335440737587e-06,
"loss": 0.3532,
"step": 22
},
{
"epoch": 0.416289592760181,
"grad_norm": 0.4164415963578454,
"learning_rate": 4.987961816680493e-06,
"loss": 0.3533,
"step": 23
},
{
"epoch": 0.4343891402714932,
"grad_norm": 0.35699522892651586,
"learning_rate": 4.986505218160502e-06,
"loss": 0.3268,
"step": 24
},
{
"epoch": 0.45248868778280543,
"grad_norm": 0.4026088211790661,
"learning_rate": 4.984965693755723e-06,
"loss": 0.3332,
"step": 25
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.35057192166080064,
"learning_rate": 4.983343294809875e-06,
"loss": 0.3245,
"step": 26
},
{
"epoch": 0.48868778280542985,
"grad_norm": 0.3639947181438965,
"learning_rate": 4.981638075430572e-06,
"loss": 0.3199,
"step": 27
},
{
"epoch": 0.5067873303167421,
"grad_norm": 0.3387354723957761,
"learning_rate": 4.979850092487525e-06,
"loss": 0.3282,
"step": 28
},
{
"epoch": 0.5248868778280543,
"grad_norm": 0.3528078697583281,
"learning_rate": 4.977979405610635e-06,
"loss": 0.337,
"step": 29
},
{
"epoch": 0.5429864253393665,
"grad_norm": 0.3126032062813636,
"learning_rate": 4.976026077188013e-06,
"loss": 0.3265,
"step": 30
},
{
"epoch": 0.5610859728506787,
"grad_norm": 0.3584209299955196,
"learning_rate": 4.973990172363899e-06,
"loss": 0.3568,
"step": 31
},
{
"epoch": 0.579185520361991,
"grad_norm": 0.4239503710543474,
"learning_rate": 4.9718717590364855e-06,
"loss": 0.3287,
"step": 32
},
{
"epoch": 0.5972850678733032,
"grad_norm": 0.41156579276283284,
"learning_rate": 4.969670907855651e-06,
"loss": 0.3267,
"step": 33
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.33536968371087267,
"learning_rate": 4.967387692220615e-06,
"loss": 0.3367,
"step": 34
},
{
"epoch": 0.6334841628959276,
"grad_norm": 0.30272106018319034,
"learning_rate": 4.965022188277474e-06,
"loss": 0.3236,
"step": 35
},
{
"epoch": 0.6515837104072398,
"grad_norm": 0.28697723150322,
"learning_rate": 4.962574474916678e-06,
"loss": 0.3236,
"step": 36
},
{
"epoch": 0.669683257918552,
"grad_norm": 0.21062422377276369,
"learning_rate": 4.960044633770387e-06,
"loss": 0.3295,
"step": 37
},
{
"epoch": 0.6877828054298643,
"grad_norm": 0.28283155334950705,
"learning_rate": 4.957432749209755e-06,
"loss": 0.3453,
"step": 38
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.2161778814999892,
"learning_rate": 4.954738908342116e-06,
"loss": 0.3645,
"step": 39
},
{
"epoch": 0.7239819004524887,
"grad_norm": 0.2354424408008659,
"learning_rate": 4.9519632010080765e-06,
"loss": 0.3372,
"step": 40
},
{
"epoch": 0.7420814479638009,
"grad_norm": 0.26054770828411217,
"learning_rate": 4.9491057197785205e-06,
"loss": 0.3349,
"step": 41
},
{
"epoch": 0.7601809954751131,
"grad_norm": 0.2596310001381547,
"learning_rate": 4.946166559951523e-06,
"loss": 0.3174,
"step": 42
},
{
"epoch": 0.7782805429864253,
"grad_norm": 0.2763815562688228,
"learning_rate": 4.943145819549169e-06,
"loss": 0.3464,
"step": 43
},
{
"epoch": 0.7963800904977375,
"grad_norm": 0.2508801820692124,
"learning_rate": 4.9400435993142895e-06,
"loss": 0.3277,
"step": 44
},
{
"epoch": 0.8144796380090498,
"grad_norm": 0.25823275674527674,
"learning_rate": 4.936860002707096e-06,
"loss": 0.343,
"step": 45
},
{
"epoch": 0.832579185520362,
"grad_norm": 0.23862916529933217,
"learning_rate": 4.933595135901733e-06,
"loss": 0.3425,
"step": 46
},
{
"epoch": 0.8506787330316742,
"grad_norm": 0.2377285409864031,
"learning_rate": 4.9302491077827366e-06,
"loss": 0.3345,
"step": 47
},
{
"epoch": 0.8687782805429864,
"grad_norm": 0.2054263655021643,
"learning_rate": 4.926822029941406e-06,
"loss": 0.3599,
"step": 48
},
{
"epoch": 0.8868778280542986,
"grad_norm": 0.21857378026560212,
"learning_rate": 4.923314016672075e-06,
"loss": 0.3293,
"step": 49
},
{
"epoch": 0.9049773755656109,
"grad_norm": 0.20834775020466292,
"learning_rate": 4.919725184968307e-06,
"loss": 0.3231,
"step": 50
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.2000139484905926,
"learning_rate": 4.9160556545189895e-06,
"loss": 0.3248,
"step": 51
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.24124485812118368,
"learning_rate": 4.9123055477043454e-06,
"loss": 0.3314,
"step": 52
},
{
"epoch": 0.9592760180995475,
"grad_norm": 0.26803109191751107,
"learning_rate": 4.908474989591846e-06,
"loss": 0.3341,
"step": 53
},
{
"epoch": 0.9773755656108597,
"grad_norm": 0.21490833872159623,
"learning_rate": 4.904564107932048e-06,
"loss": 0.3189,
"step": 54
},
{
"epoch": 0.995475113122172,
"grad_norm": 0.22738113980709365,
"learning_rate": 4.900573033154325e-06,
"loss": 0.3198,
"step": 55
},
{
"epoch": 1.0135746606334841,
"grad_norm": 0.1860953606536629,
"learning_rate": 4.8965018983625245e-06,
"loss": 0.3273,
"step": 56
},
{
"epoch": 1.0316742081447963,
"grad_norm": 0.2170252756734204,
"learning_rate": 4.8923508393305224e-06,
"loss": 0.3058,
"step": 57
},
{
"epoch": 1.0497737556561086,
"grad_norm": 0.19753070998453712,
"learning_rate": 4.888119994497701e-06,
"loss": 0.2949,
"step": 58
},
{
"epoch": 1.0678733031674208,
"grad_norm": 0.21040212719480175,
"learning_rate": 4.883809504964325e-06,
"loss": 0.298,
"step": 59
},
{
"epoch": 1.085972850678733,
"grad_norm": 0.20799415615187367,
"learning_rate": 4.879419514486846e-06,
"loss": 0.3201,
"step": 60
},
{
"epoch": 1.1040723981900453,
"grad_norm": 0.19784508945913667,
"learning_rate": 4.874950169473097e-06,
"loss": 0.3338,
"step": 61
},
{
"epoch": 1.1221719457013575,
"grad_norm": 0.20898074744097636,
"learning_rate": 4.870401618977415e-06,
"loss": 0.3053,
"step": 62
},
{
"epoch": 1.1402714932126696,
"grad_norm": 0.21530409824217756,
"learning_rate": 4.8657740146956724e-06,
"loss": 0.3346,
"step": 63
},
{
"epoch": 1.1583710407239818,
"grad_norm": 0.21656570481740497,
"learning_rate": 4.8610675109602135e-06,
"loss": 0.3175,
"step": 64
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.18916438407683134,
"learning_rate": 4.856282264734708e-06,
"loss": 0.2973,
"step": 65
},
{
"epoch": 1.1945701357466063,
"grad_norm": 0.19298959302885896,
"learning_rate": 4.851418435608919e-06,
"loss": 0.3328,
"step": 66
},
{
"epoch": 1.2126696832579185,
"grad_norm": 0.19382840884955524,
"learning_rate": 4.84647618579338e-06,
"loss": 0.3233,
"step": 67
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.22308099956654967,
"learning_rate": 4.841455680113979e-06,
"loss": 0.3401,
"step": 68
},
{
"epoch": 1.248868778280543,
"grad_norm": 0.1908581730308582,
"learning_rate": 4.836357086006471e-06,
"loss": 0.3199,
"step": 69
},
{
"epoch": 1.2669683257918551,
"grad_norm": 0.1900661127768816,
"learning_rate": 4.83118057351089e-06,
"loss": 0.3193,
"step": 70
},
{
"epoch": 1.2850678733031673,
"grad_norm": 0.1842083788274683,
"learning_rate": 4.825926315265874e-06,
"loss": 0.3093,
"step": 71
},
{
"epoch": 1.3031674208144797,
"grad_norm": 0.19304820753044424,
"learning_rate": 4.820594486502913e-06,
"loss": 0.3147,
"step": 72
},
{
"epoch": 1.3212669683257918,
"grad_norm": 0.1865184743330753,
"learning_rate": 4.815185265040504e-06,
"loss": 0.3371,
"step": 73
},
{
"epoch": 1.3393665158371042,
"grad_norm": 0.21257371675686554,
"learning_rate": 4.809698831278217e-06,
"loss": 0.3556,
"step": 74
},
{
"epoch": 1.3574660633484164,
"grad_norm": 0.19738810108074692,
"learning_rate": 4.804135368190684e-06,
"loss": 0.3098,
"step": 75
},
{
"epoch": 1.3755656108597285,
"grad_norm": 0.20419379710110824,
"learning_rate": 4.798495061321492e-06,
"loss": 0.3037,
"step": 76
},
{
"epoch": 1.3936651583710407,
"grad_norm": 0.21182701854581448,
"learning_rate": 4.792778098776997e-06,
"loss": 0.3046,
"step": 77
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.20966701782750055,
"learning_rate": 4.786984671220053e-06,
"loss": 0.3146,
"step": 78
},
{
"epoch": 1.4298642533936652,
"grad_norm": 0.2228994463496351,
"learning_rate": 4.7811149718636475e-06,
"loss": 0.3133,
"step": 79
},
{
"epoch": 1.4479638009049773,
"grad_norm": 0.2125517747018847,
"learning_rate": 4.7751691964644655e-06,
"loss": 0.3181,
"step": 80
},
{
"epoch": 1.4660633484162897,
"grad_norm": 0.18774294015726306,
"learning_rate": 4.7691475433163515e-06,
"loss": 0.3107,
"step": 81
},
{
"epoch": 1.4841628959276019,
"grad_norm": 0.2105655304494509,
"learning_rate": 4.763050213243705e-06,
"loss": 0.3193,
"step": 82
},
{
"epoch": 1.502262443438914,
"grad_norm": 0.2101302949838479,
"learning_rate": 4.7568774095947804e-06,
"loss": 0.3372,
"step": 83
},
{
"epoch": 1.5203619909502262,
"grad_norm": 0.1761520660073366,
"learning_rate": 4.7506293382349e-06,
"loss": 0.3058,
"step": 84
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.20214706457289192,
"learning_rate": 4.744306207539595e-06,
"loss": 0.34,
"step": 85
},
{
"epoch": 1.5565610859728507,
"grad_norm": 0.21608846929666756,
"learning_rate": 4.737908228387656e-06,
"loss": 0.3285,
"step": 86
},
{
"epoch": 1.5746606334841629,
"grad_norm": 0.19692503921435273,
"learning_rate": 4.731435614154094e-06,
"loss": 0.3134,
"step": 87
},
{
"epoch": 1.5927601809954752,
"grad_norm": 0.19107736826101185,
"learning_rate": 4.72488858070303e-06,
"loss": 0.305,
"step": 88
},
{
"epoch": 1.6108597285067874,
"grad_norm": 0.19148405595657123,
"learning_rate": 4.718267346380492e-06,
"loss": 0.3157,
"step": 89
},
{
"epoch": 1.6289592760180995,
"grad_norm": 0.19180277215162053,
"learning_rate": 4.711572132007139e-06,
"loss": 0.3124,
"step": 90
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.19539080957269014,
"learning_rate": 4.704803160870888e-06,
"loss": 0.3306,
"step": 91
},
{
"epoch": 1.6651583710407238,
"grad_norm": 0.21052797618402563,
"learning_rate": 4.697960658719475e-06,
"loss": 0.3061,
"step": 92
},
{
"epoch": 1.6832579185520362,
"grad_norm": 0.20191616959818315,
"learning_rate": 4.69104485375292e-06,
"loss": 0.3098,
"step": 93
},
{
"epoch": 1.7013574660633484,
"grad_norm": 0.2159013380308242,
"learning_rate": 4.684055976615924e-06,
"loss": 0.3088,
"step": 94
},
{
"epoch": 1.7194570135746607,
"grad_norm": 0.18904626555927467,
"learning_rate": 4.676994260390168e-06,
"loss": 0.2912,
"step": 95
},
{
"epoch": 1.737556561085973,
"grad_norm": 0.19467640291002175,
"learning_rate": 4.6698599405865465e-06,
"loss": 0.303,
"step": 96
},
{
"epoch": 1.755656108597285,
"grad_norm": 0.2880548104749461,
"learning_rate": 4.662653255137308e-06,
"loss": 0.3348,
"step": 97
},
{
"epoch": 1.7737556561085972,
"grad_norm": 0.2019155699824381,
"learning_rate": 4.655374444388127e-06,
"loss": 0.327,
"step": 98
},
{
"epoch": 1.7918552036199094,
"grad_norm": 0.2592156259533593,
"learning_rate": 4.648023751090079e-06,
"loss": 0.3363,
"step": 99
},
{
"epoch": 1.8099547511312217,
"grad_norm": 0.2180192099378802,
"learning_rate": 4.640601420391554e-06,
"loss": 0.3113,
"step": 100
},
{
"epoch": 1.8280542986425339,
"grad_norm": 0.20679493678747934,
"learning_rate": 4.633107699830073e-06,
"loss": 0.3148,
"step": 101
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.2053440368213778,
"learning_rate": 4.625542839324036e-06,
"loss": 0.2967,
"step": 102
},
{
"epoch": 1.8642533936651584,
"grad_norm": 0.19200611510261656,
"learning_rate": 4.617907091164389e-06,
"loss": 0.3188,
"step": 103
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.2302101510970096,
"learning_rate": 4.610200710006206e-06,
"loss": 0.3121,
"step": 104
},
{
"epoch": 1.9004524886877827,
"grad_norm": 0.2221804604843677,
"learning_rate": 4.602423952860199e-06,
"loss": 0.3146,
"step": 105
},
{
"epoch": 1.9185520361990949,
"grad_norm": 0.21983834708053807,
"learning_rate": 4.594577079084146e-06,
"loss": 0.3405,
"step": 106
},
{
"epoch": 1.9366515837104072,
"grad_norm": 0.21085636909889235,
"learning_rate": 4.58666035037424e-06,
"loss": 0.3089,
"step": 107
},
{
"epoch": 1.9547511312217196,
"grad_norm": 0.2016884181282795,
"learning_rate": 4.578674030756364e-06,
"loss": 0.3229,
"step": 108
},
{
"epoch": 1.9728506787330318,
"grad_norm": 0.19657023773974253,
"learning_rate": 4.57061838657728e-06,
"loss": 0.3237,
"step": 109
},
{
"epoch": 1.990950226244344,
"grad_norm": 0.20813455436587358,
"learning_rate": 4.562493686495756e-06,
"loss": 0.3255,
"step": 110
},
{
"epoch": 2.009049773755656,
"grad_norm": 0.18872409832307335,
"learning_rate": 4.5543002014735955e-06,
"loss": 0.2988,
"step": 111
},
{
"epoch": 2.0271493212669682,
"grad_norm": 0.19594421270270285,
"learning_rate": 4.546038204766609e-06,
"loss": 0.3109,
"step": 112
},
{
"epoch": 2.0452488687782804,
"grad_norm": 0.22355285614452686,
"learning_rate": 4.537707971915495e-06,
"loss": 0.3066,
"step": 113
},
{
"epoch": 2.0633484162895925,
"grad_norm": 0.2017792264758022,
"learning_rate": 4.529309780736654e-06,
"loss": 0.2939,
"step": 114
},
{
"epoch": 2.081447963800905,
"grad_norm": 0.20223483022494018,
"learning_rate": 4.520843911312922e-06,
"loss": 0.294,
"step": 115
},
{
"epoch": 2.0995475113122173,
"grad_norm": 0.20322098664858632,
"learning_rate": 4.512310645984231e-06,
"loss": 0.2984,
"step": 116
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.20705072743185104,
"learning_rate": 4.503710269338191e-06,
"loss": 0.2694,
"step": 117
},
{
"epoch": 2.1357466063348416,
"grad_norm": 0.18442012590242893,
"learning_rate": 4.4950430682005995e-06,
"loss": 0.2979,
"step": 118
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.2076635780792367,
"learning_rate": 4.486309331625877e-06,
"loss": 0.2874,
"step": 119
},
{
"epoch": 2.171945701357466,
"grad_norm": 0.19968964517363474,
"learning_rate": 4.477509350887424e-06,
"loss": 0.291,
"step": 120
},
{
"epoch": 2.1900452488687785,
"grad_norm": 0.18959726179400077,
"learning_rate": 4.468643419467909e-06,
"loss": 0.2921,
"step": 121
},
{
"epoch": 2.2081447963800906,
"grad_norm": 0.2388780187488927,
"learning_rate": 4.459711833049485e-06,
"loss": 0.3061,
"step": 122
},
{
"epoch": 2.226244343891403,
"grad_norm": 0.22092548916393367,
"learning_rate": 4.4507148895039165e-06,
"loss": 0.2765,
"step": 123
},
{
"epoch": 2.244343891402715,
"grad_norm": 0.21070917452514223,
"learning_rate": 4.4416528888826595e-06,
"loss": 0.2969,
"step": 124
},
{
"epoch": 2.262443438914027,
"grad_norm": 0.19807472108481627,
"learning_rate": 4.432526133406843e-06,
"loss": 0.3044,
"step": 125
},
{
"epoch": 2.2805429864253393,
"grad_norm": 0.1910225174641335,
"learning_rate": 4.423334927457198e-06,
"loss": 0.3132,
"step": 126
},
{
"epoch": 2.2986425339366514,
"grad_norm": 0.2203923882052516,
"learning_rate": 4.414079577563901e-06,
"loss": 0.3032,
"step": 127
},
{
"epoch": 2.3167420814479636,
"grad_norm": 0.21331518168793756,
"learning_rate": 4.404760392396355e-06,
"loss": 0.3033,
"step": 128
},
{
"epoch": 2.334841628959276,
"grad_norm": 0.21461268917839496,
"learning_rate": 4.3953776827528925e-06,
"loss": 0.3039,
"step": 129
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.1862241130798519,
"learning_rate": 4.385931761550411e-06,
"loss": 0.2793,
"step": 130
},
{
"epoch": 2.3710407239819005,
"grad_norm": 0.19779667332990994,
"learning_rate": 4.376422943813936e-06,
"loss": 0.2849,
"step": 131
},
{
"epoch": 2.3891402714932126,
"grad_norm": 0.20538470648954774,
"learning_rate": 4.366851546666118e-06,
"loss": 0.3129,
"step": 132
},
{
"epoch": 2.4072398190045248,
"grad_norm": 0.20067043214876432,
"learning_rate": 4.357217889316657e-06,
"loss": 0.3041,
"step": 133
},
{
"epoch": 2.425339366515837,
"grad_norm": 0.1997136625573991,
"learning_rate": 4.3475222930516484e-06,
"loss": 0.2839,
"step": 134
},
{
"epoch": 2.4434389140271495,
"grad_norm": 0.20004099038403145,
"learning_rate": 4.3377650812228765e-06,
"loss": 0.3014,
"step": 135
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.19311135694466858,
"learning_rate": 4.327946579237028e-06,
"loss": 0.2834,
"step": 136
},
{
"epoch": 2.479638009049774,
"grad_norm": 0.21078445039076968,
"learning_rate": 4.318067114544838e-06,
"loss": 0.2796,
"step": 137
},
{
"epoch": 2.497737556561086,
"grad_norm": 0.21975365365759061,
"learning_rate": 4.308127016630176e-06,
"loss": 0.2972,
"step": 138
},
{
"epoch": 2.515837104072398,
"grad_norm": 0.21203142423348517,
"learning_rate": 4.2981266169990436e-06,
"loss": 0.3196,
"step": 139
},
{
"epoch": 2.5339366515837103,
"grad_norm": 0.20131092451024465,
"learning_rate": 4.2880662491685345e-06,
"loss": 0.3003,
"step": 140
},
{
"epoch": 2.5520361990950224,
"grad_norm": 0.22294798360675439,
"learning_rate": 4.277946248655701e-06,
"loss": 0.2947,
"step": 141
},
{
"epoch": 2.5701357466063346,
"grad_norm": 0.22859386995564024,
"learning_rate": 4.267766952966369e-06,
"loss": 0.2958,
"step": 142
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.19567845392715985,
"learning_rate": 4.257528701583882e-06,
"loss": 0.2998,
"step": 143
},
{
"epoch": 2.6063348416289593,
"grad_norm": 0.19741413456031112,
"learning_rate": 4.247231835957773e-06,
"loss": 0.3408,
"step": 144
},
{
"epoch": 2.6244343891402715,
"grad_norm": 0.19905612890447116,
"learning_rate": 4.236876699492391e-06,
"loss": 0.3117,
"step": 145
},
{
"epoch": 2.6425339366515836,
"grad_norm": 0.1942385041095113,
"learning_rate": 4.226463637535429e-06,
"loss": 0.3152,
"step": 146
},
{
"epoch": 2.660633484162896,
"grad_norm": 0.22327732166814804,
"learning_rate": 4.215992997366425e-06,
"loss": 0.3142,
"step": 147
},
{
"epoch": 2.6787330316742084,
"grad_norm": 0.1935161282714164,
"learning_rate": 4.2054651281851685e-06,
"loss": 0.3081,
"step": 148
},
{
"epoch": 2.6968325791855206,
"grad_norm": 0.23957566926280122,
"learning_rate": 4.1948803811000585e-06,
"loss": 0.2894,
"step": 149
},
{
"epoch": 2.7149321266968327,
"grad_norm": 0.18805009890662516,
"learning_rate": 4.184239109116393e-06,
"loss": 0.2984,
"step": 150
},
{
"epoch": 2.733031674208145,
"grad_norm": 0.212580814141281,
"learning_rate": 4.173541667124599e-06,
"loss": 0.3097,
"step": 151
},
{
"epoch": 2.751131221719457,
"grad_norm": 0.19712271093008257,
"learning_rate": 4.1627884118883925e-06,
"loss": 0.3177,
"step": 152
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.2278946892968003,
"learning_rate": 4.1519797020328815e-06,
"loss": 0.3101,
"step": 153
},
{
"epoch": 2.7873303167420813,
"grad_norm": 0.21064766645861627,
"learning_rate": 4.141115898032607e-06,
"loss": 0.274,
"step": 154
},
{
"epoch": 2.8054298642533935,
"grad_norm": 0.20995612915210915,
"learning_rate": 4.130197362199521e-06,
"loss": 0.2926,
"step": 155
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.21633523471290103,
"learning_rate": 4.119224458670905e-06,
"loss": 0.2875,
"step": 156
},
{
"epoch": 2.841628959276018,
"grad_norm": 0.21266765467202223,
"learning_rate": 4.1081975533972185e-06,
"loss": 0.2947,
"step": 157
},
{
"epoch": 2.8597285067873304,
"grad_norm": 0.19506346084116072,
"learning_rate": 4.097117014129903e-06,
"loss": 0.296,
"step": 158
},
{
"epoch": 2.8778280542986425,
"grad_norm": 0.1986031276610744,
"learning_rate": 4.085983210409114e-06,
"loss": 0.2988,
"step": 159
},
{
"epoch": 2.8959276018099547,
"grad_norm": 0.22662474336309782,
"learning_rate": 4.074796513551395e-06,
"loss": 0.2952,
"step": 160
},
{
"epoch": 2.914027149321267,
"grad_norm": 0.21721813582738397,
"learning_rate": 4.063557296637295e-06,
"loss": 0.3099,
"step": 161
},
{
"epoch": 2.9321266968325794,
"grad_norm": 0.2133328804989817,
"learning_rate": 4.052265934498929e-06,
"loss": 0.2974,
"step": 162
},
{
"epoch": 2.9502262443438916,
"grad_norm": 0.1960218423105953,
"learning_rate": 4.040922803707474e-06,
"loss": 0.3065,
"step": 163
},
{
"epoch": 2.9683257918552037,
"grad_norm": 0.22167341080572722,
"learning_rate": 4.029528282560609e-06,
"loss": 0.2886,
"step": 164
},
{
"epoch": 2.986425339366516,
"grad_norm": 0.20386239234209946,
"learning_rate": 4.018082751069904e-06,
"loss": 0.3076,
"step": 165
},
{
"epoch": 3.004524886877828,
"grad_norm": 0.23748187918298697,
"learning_rate": 4.006586590948141e-06,
"loss": 0.2985,
"step": 166
},
{
"epoch": 3.02262443438914,
"grad_norm": 0.22617083435609797,
"learning_rate": 3.995040185596588e-06,
"loss": 0.2754,
"step": 167
},
{
"epoch": 3.0407239819004523,
"grad_norm": 0.23986769037952196,
"learning_rate": 3.983443920092206e-06,
"loss": 0.2854,
"step": 168
},
{
"epoch": 3.0588235294117645,
"grad_norm": 0.20150185345396,
"learning_rate": 3.971798181174816e-06,
"loss": 0.2832,
"step": 169
},
{
"epoch": 3.076923076923077,
"grad_norm": 0.20913884879987113,
"learning_rate": 3.960103357234192e-06,
"loss": 0.2986,
"step": 170
},
{
"epoch": 3.0950226244343892,
"grad_norm": 0.20477890710932672,
"learning_rate": 3.948359838297115e-06,
"loss": 0.2876,
"step": 171
},
{
"epoch": 3.1131221719457014,
"grad_norm": 0.2031405516319826,
"learning_rate": 3.9365680160143595e-06,
"loss": 0.2971,
"step": 172
},
{
"epoch": 3.1312217194570136,
"grad_norm": 0.185634722744017,
"learning_rate": 3.924728283647638e-06,
"loss": 0.279,
"step": 173
},
{
"epoch": 3.1493212669683257,
"grad_norm": 0.20745314489894484,
"learning_rate": 3.91284103605648e-06,
"loss": 0.2903,
"step": 174
},
{
"epoch": 3.167420814479638,
"grad_norm": 0.20741649089082642,
"learning_rate": 3.9009066696850664e-06,
"loss": 0.2964,
"step": 175
},
{
"epoch": 3.1855203619909505,
"grad_norm": 0.20883578071304365,
"learning_rate": 3.888925582549006e-06,
"loss": 0.2946,
"step": 176
},
{
"epoch": 3.2036199095022626,
"grad_norm": 0.21451927304435986,
"learning_rate": 3.8768981742220646e-06,
"loss": 0.2811,
"step": 177
},
{
"epoch": 3.2217194570135748,
"grad_norm": 0.21080586953456093,
"learning_rate": 3.864824845822837e-06,
"loss": 0.2825,
"step": 178
},
{
"epoch": 3.239819004524887,
"grad_norm": 0.20609867837665838,
"learning_rate": 3.852706000001367e-06,
"loss": 0.2903,
"step": 179
},
{
"epoch": 3.257918552036199,
"grad_norm": 0.1972989252518201,
"learning_rate": 3.840542040925725e-06,
"loss": 0.2626,
"step": 180
},
{
"epoch": 3.276018099547511,
"grad_norm": 0.21568160972522105,
"learning_rate": 3.828333374268523e-06,
"loss": 0.2906,
"step": 181
},
{
"epoch": 3.2941176470588234,
"grad_norm": 0.1997067887507601,
"learning_rate": 3.81608040719339e-06,
"loss": 0.2862,
"step": 182
},
{
"epoch": 3.3122171945701355,
"grad_norm": 0.2017980016690952,
"learning_rate": 3.8037835483413877e-06,
"loss": 0.2855,
"step": 183
},
{
"epoch": 3.330316742081448,
"grad_norm": 0.20121264738949698,
"learning_rate": 3.7914432078173867e-06,
"loss": 0.2795,
"step": 184
},
{
"epoch": 3.3484162895927603,
"grad_norm": 0.22611700851822947,
"learning_rate": 3.7790597971763892e-06,
"loss": 0.2836,
"step": 185
},
{
"epoch": 3.3665158371040724,
"grad_norm": 0.2353941218093955,
"learning_rate": 3.7666337294097987e-06,
"loss": 0.288,
"step": 186
},
{
"epoch": 3.3846153846153846,
"grad_norm": 0.18605988505854537,
"learning_rate": 3.7541654189316525e-06,
"loss": 0.275,
"step": 187
},
{
"epoch": 3.4027149321266967,
"grad_norm": 0.22628052198695675,
"learning_rate": 3.741655281564796e-06,
"loss": 0.2966,
"step": 188
},
{
"epoch": 3.420814479638009,
"grad_norm": 0.21236583352079183,
"learning_rate": 3.72910373452702e-06,
"loss": 0.2702,
"step": 189
},
{
"epoch": 3.4389140271493215,
"grad_norm": 0.22886628365130654,
"learning_rate": 3.7165111964171407e-06,
"loss": 0.2718,
"step": 190
},
{
"epoch": 3.4570135746606336,
"grad_norm": 0.19177205299999378,
"learning_rate": 3.703878087201044e-06,
"loss": 0.2785,
"step": 191
},
{
"epoch": 3.475113122171946,
"grad_norm": 0.21164213919986813,
"learning_rate": 3.6912048281976764e-06,
"loss": 0.2991,
"step": 192
},
{
"epoch": 3.493212669683258,
"grad_norm": 0.20082392739954888,
"learning_rate": 3.6784918420649952e-06,
"loss": 0.2814,
"step": 193
},
{
"epoch": 3.51131221719457,
"grad_norm": 0.21531730826425216,
"learning_rate": 3.66573955278587e-06,
"loss": 0.2719,
"step": 194
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.20053264760640085,
"learning_rate": 3.6529483856539512e-06,
"loss": 0.2639,
"step": 195
},
{
"epoch": 3.5475113122171944,
"grad_norm": 0.18714903727677973,
"learning_rate": 3.640118767259474e-06,
"loss": 0.2712,
"step": 196
},
{
"epoch": 3.5656108597285066,
"grad_norm": 0.19923843024788357,
"learning_rate": 3.6272511254750403e-06,
"loss": 0.2825,
"step": 197
},
{
"epoch": 3.583710407239819,
"grad_norm": 0.2016875130706868,
"learning_rate": 3.6143458894413463e-06,
"loss": 0.2977,
"step": 198
},
{
"epoch": 3.6018099547511313,
"grad_norm": 0.21861290041385015,
"learning_rate": 3.6014034895528705e-06,
"loss": 0.284,
"step": 199
},
{
"epoch": 3.6199095022624435,
"grad_norm": 0.16879798551287897,
"learning_rate": 3.588424357443521e-06,
"loss": 0.2782,
"step": 200
},
{
"epoch": 3.6380090497737556,
"grad_norm": 0.22087168375536256,
"learning_rate": 3.5754089259722365e-06,
"loss": 0.2902,
"step": 201
},
{
"epoch": 3.6561085972850678,
"grad_norm": 0.2219253724635141,
"learning_rate": 3.5623576292085555e-06,
"loss": 0.294,
"step": 202
},
{
"epoch": 3.6742081447963804,
"grad_norm": 0.19173446074813308,
"learning_rate": 3.549270902418136e-06,
"loss": 0.2715,
"step": 203
},
{
"epoch": 3.6923076923076925,
"grad_norm": 0.20017688015075918,
"learning_rate": 3.536149182048243e-06,
"loss": 0.2823,
"step": 204
},
{
"epoch": 3.7104072398190047,
"grad_norm": 0.19825967232402708,
"learning_rate": 3.5229929057131877e-06,
"loss": 0.2881,
"step": 205
},
{
"epoch": 3.728506787330317,
"grad_norm": 0.218766824892692,
"learning_rate": 3.5098025121797375e-06,
"loss": 0.2999,
"step": 206
},
{
"epoch": 3.746606334841629,
"grad_norm": 0.18895340916933914,
"learning_rate": 3.496578441352481e-06,
"loss": 0.2687,
"step": 207
},
{
"epoch": 3.764705882352941,
"grad_norm": 0.19002028251920547,
"learning_rate": 3.4833211342591565e-06,
"loss": 0.2866,
"step": 208
},
{
"epoch": 3.7828054298642533,
"grad_norm": 0.20990408194136284,
"learning_rate": 3.4700310330359456e-06,
"loss": 0.2805,
"step": 209
},
{
"epoch": 3.8009049773755654,
"grad_norm": 0.1929810608863491,
"learning_rate": 3.4567085809127247e-06,
"loss": 0.2864,
"step": 210
},
{
"epoch": 3.8190045248868776,
"grad_norm": 0.2026264509793902,
"learning_rate": 3.4433542221982863e-06,
"loss": 0.2847,
"step": 211
},
{
"epoch": 3.83710407239819,
"grad_norm": 0.20527047718646593,
"learning_rate": 3.4299684022655196e-06,
"loss": 0.285,
"step": 212
},
{
"epoch": 3.8552036199095023,
"grad_norm": 0.2004805974514508,
"learning_rate": 3.4165515675365558e-06,
"loss": 0.2862,
"step": 213
},
{
"epoch": 3.8733031674208145,
"grad_norm": 0.18650100732919933,
"learning_rate": 3.403104165467883e-06,
"loss": 0.2748,
"step": 214
},
{
"epoch": 3.8914027149321266,
"grad_norm": 0.21391027704520638,
"learning_rate": 3.3896266445354208e-06,
"loss": 0.2875,
"step": 215
},
{
"epoch": 3.909502262443439,
"grad_norm": 0.19076214648617013,
"learning_rate": 3.376119454219565e-06,
"loss": 0.2811,
"step": 216
},
{
"epoch": 3.9276018099547514,
"grad_norm": 0.22812272390771685,
"learning_rate": 3.362583044990195e-06,
"loss": 0.2923,
"step": 217
},
{
"epoch": 3.9457013574660635,
"grad_norm": 0.2114461488141671,
"learning_rate": 3.3490178682916534e-06,
"loss": 0.2784,
"step": 218
},
{
"epoch": 3.9638009049773757,
"grad_norm": 0.20971069504695025,
"learning_rate": 3.335424376527688e-06,
"loss": 0.2796,
"step": 219
},
{
"epoch": 3.981900452488688,
"grad_norm": 0.20721944316747504,
"learning_rate": 3.321803023046366e-06,
"loss": 0.2855,
"step": 220
},
{
"epoch": 4.0,
"grad_norm": 0.19529166110448204,
"learning_rate": 3.3081542621249503e-06,
"loss": 0.2722,
"step": 221
},
{
"epoch": 4.018099547511312,
"grad_norm": 0.2181909689708081,
"learning_rate": 3.2944785489547544e-06,
"loss": 0.2769,
"step": 222
},
{
"epoch": 4.036199095022624,
"grad_norm": 0.2041261831519089,
"learning_rate": 3.2807763396259597e-06,
"loss": 0.2755,
"step": 223
},
{
"epoch": 4.0542986425339365,
"grad_norm": 0.17317381953746722,
"learning_rate": 3.2670480911124045e-06,
"loss": 0.2457,
"step": 224
},
{
"epoch": 4.072398190045249,
"grad_norm": 0.20985257213280492,
"learning_rate": 3.2532942612563436e-06,
"loss": 0.3084,
"step": 225
},
{
"epoch": 4.090497737556561,
"grad_norm": 0.1805568892682367,
"learning_rate": 3.2395153087531767e-06,
"loss": 0.2688,
"step": 226
},
{
"epoch": 4.108597285067873,
"grad_norm": 0.20212288478471152,
"learning_rate": 3.225711693136156e-06,
"loss": 0.2678,
"step": 227
},
{
"epoch": 4.126696832579185,
"grad_norm": 0.20073111869372287,
"learning_rate": 3.211883874761058e-06,
"loss": 0.2636,
"step": 228
},
{
"epoch": 4.144796380090498,
"grad_norm": 0.21913185170065114,
"learning_rate": 3.19803231479083e-06,
"loss": 0.282,
"step": 229
},
{
"epoch": 4.16289592760181,
"grad_norm": 0.20273970778843858,
"learning_rate": 3.184157475180208e-06,
"loss": 0.2689,
"step": 230
},
{
"epoch": 4.180995475113122,
"grad_norm": 0.17451394799617262,
"learning_rate": 3.1702598186603152e-06,
"loss": 0.2583,
"step": 231
},
{
"epoch": 4.199095022624435,
"grad_norm": 0.1918998406915714,
"learning_rate": 3.1563398087232265e-06,
"loss": 0.2795,
"step": 232
},
{
"epoch": 4.217194570135747,
"grad_norm": 0.1970262585975004,
"learning_rate": 3.1423979096065134e-06,
"loss": 0.2605,
"step": 233
},
{
"epoch": 4.235294117647059,
"grad_norm": 0.18769426018045784,
"learning_rate": 3.1284345862777572e-06,
"loss": 0.2592,
"step": 234
},
{
"epoch": 4.253393665158371,
"grad_norm": 0.18870339061291633,
"learning_rate": 3.1144503044190456e-06,
"loss": 0.2642,
"step": 235
},
{
"epoch": 4.271493212669683,
"grad_norm": 0.18389039727257753,
"learning_rate": 3.100445530411442e-06,
"loss": 0.2376,
"step": 236
},
{
"epoch": 4.289592760180995,
"grad_norm": 0.20812476367192412,
"learning_rate": 3.086420731319429e-06,
"loss": 0.2708,
"step": 237
},
{
"epoch": 4.3076923076923075,
"grad_norm": 0.197363611944904,
"learning_rate": 3.0723763748753354e-06,
"loss": 0.2844,
"step": 238
},
{
"epoch": 4.32579185520362,
"grad_norm": 0.20806148893145612,
"learning_rate": 3.0583129294637342e-06,
"loss": 0.2487,
"step": 239
},
{
"epoch": 4.343891402714932,
"grad_norm": 0.1903831131540733,
"learning_rate": 3.044230864105821e-06,
"loss": 0.256,
"step": 240
},
{
"epoch": 4.361990950226244,
"grad_norm": 0.19042569669796638,
"learning_rate": 3.030130648443777e-06,
"loss": 0.2788,
"step": 241
},
{
"epoch": 4.380090497737557,
"grad_norm": 0.18850059553704934,
"learning_rate": 3.0160127527250993e-06,
"loss": 0.2808,
"step": 242
},
{
"epoch": 4.398190045248869,
"grad_norm": 0.20047462331384677,
"learning_rate": 3.0018776477869244e-06,
"loss": 0.2654,
"step": 243
},
{
"epoch": 4.416289592760181,
"grad_norm": 0.20580882731270267,
"learning_rate": 2.9877258050403214e-06,
"loss": 0.2753,
"step": 244
},
{
"epoch": 4.4343891402714934,
"grad_norm": 0.18933219589587963,
"learning_rate": 2.973557696454571e-06,
"loss": 0.2627,
"step": 245
},
{
"epoch": 4.452488687782806,
"grad_norm": 0.2019766662866527,
"learning_rate": 2.9593737945414264e-06,
"loss": 0.2779,
"step": 246
},
{
"epoch": 4.470588235294118,
"grad_norm": 0.21814847735110302,
"learning_rate": 2.9451745723393547e-06,
"loss": 0.2747,
"step": 247
},
{
"epoch": 4.48868778280543,
"grad_norm": 0.2044800165411035,
"learning_rate": 2.930960503397761e-06,
"loss": 0.2726,
"step": 248
},
{
"epoch": 4.506787330316742,
"grad_norm": 0.21142216034006506,
"learning_rate": 2.916732061761192e-06,
"loss": 0.2646,
"step": 249
},
{
"epoch": 4.524886877828054,
"grad_norm": 0.20150189042920258,
"learning_rate": 2.9024897219535326e-06,
"loss": 0.279,
"step": 250
},
{
"epoch": 4.542986425339366,
"grad_norm": 0.19100387279656014,
"learning_rate": 2.8882339589621742e-06,
"loss": 0.2795,
"step": 251
},
{
"epoch": 4.5610859728506785,
"grad_norm": 0.18930829794972215,
"learning_rate": 2.873965248222178e-06,
"loss": 0.2672,
"step": 252
},
{
"epoch": 4.579185520361991,
"grad_norm": 0.18814066866007795,
"learning_rate": 2.859684065600417e-06,
"loss": 0.2478,
"step": 253
},
{
"epoch": 4.597285067873303,
"grad_norm": 0.19644782065692218,
"learning_rate": 2.845390887379706e-06,
"loss": 0.2639,
"step": 254
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.20255200557130154,
"learning_rate": 2.8310861902429176e-06,
"loss": 0.2725,
"step": 255
},
{
"epoch": 4.633484162895927,
"grad_norm": 0.19849695081074425,
"learning_rate": 2.816770451257085e-06,
"loss": 0.2685,
"step": 256
},
{
"epoch": 4.65158371040724,
"grad_norm": 0.20106804584886076,
"learning_rate": 2.80244414785749e-06,
"loss": 0.2572,
"step": 257
},
{
"epoch": 4.669683257918552,
"grad_norm": 0.2021059922332257,
"learning_rate": 2.7881077578317445e-06,
"loss": 0.2924,
"step": 258
},
{
"epoch": 4.6877828054298645,
"grad_norm": 0.21055754243512417,
"learning_rate": 2.7737617593038493e-06,
"loss": 0.2714,
"step": 259
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.18496100638206028,
"learning_rate": 2.759406630718255e-06,
"loss": 0.2609,
"step": 260
},
{
"epoch": 4.723981900452489,
"grad_norm": 0.18437211430441194,
"learning_rate": 2.7450428508239024e-06,
"loss": 0.2662,
"step": 261
},
{
"epoch": 4.742081447963801,
"grad_norm": 0.18828985621936872,
"learning_rate": 2.730670898658255e-06,
"loss": 0.2549,
"step": 262
},
{
"epoch": 4.760180995475113,
"grad_norm": 0.19972804362365068,
"learning_rate": 2.716291253531329e-06,
"loss": 0.2873,
"step": 263
},
{
"epoch": 4.778280542986425,
"grad_norm": 0.2092834898971349,
"learning_rate": 2.7019043950096992e-06,
"loss": 0.2674,
"step": 264
},
{
"epoch": 4.796380090497737,
"grad_norm": 0.19131496744019671,
"learning_rate": 2.6875108029005113e-06,
"loss": 0.2724,
"step": 265
},
{
"epoch": 4.8144796380090495,
"grad_norm": 0.21255643670178404,
"learning_rate": 2.6731109572354795e-06,
"loss": 0.2684,
"step": 266
},
{
"epoch": 4.832579185520362,
"grad_norm": 0.18562764869110326,
"learning_rate": 2.658705338254876e-06,
"loss": 0.271,
"step": 267
},
{
"epoch": 4.850678733031674,
"grad_norm": 0.21207337609644833,
"learning_rate": 2.6442944263915153e-06,
"loss": 0.2719,
"step": 268
},
{
"epoch": 4.868778280542987,
"grad_norm": 0.2129223213748196,
"learning_rate": 2.6298787022547317e-06,
"loss": 0.2666,
"step": 269
},
{
"epoch": 4.886877828054299,
"grad_norm": 0.18692841429903953,
"learning_rate": 2.6154586466143495e-06,
"loss": 0.2755,
"step": 270
},
{
"epoch": 4.904977375565611,
"grad_norm": 0.19199436687113453,
"learning_rate": 2.6010347403846508e-06,
"loss": 0.2864,
"step": 271
},
{
"epoch": 4.923076923076923,
"grad_norm": 0.19327034490069303,
"learning_rate": 2.5866074646083385e-06,
"loss": 0.2694,
"step": 272
},
{
"epoch": 4.9411764705882355,
"grad_norm": 0.26379305562686184,
"learning_rate": 2.572177300440487e-06,
"loss": 0.2597,
"step": 273
},
{
"epoch": 4.959276018099548,
"grad_norm": 0.1894366168665776,
"learning_rate": 2.557744729132503e-06,
"loss": 0.2825,
"step": 274
},
{
"epoch": 4.97737556561086,
"grad_norm": 0.19519701404452072,
"learning_rate": 2.5433102320160713e-06,
"loss": 0.2893,
"step": 275
},
{
"epoch": 4.995475113122172,
"grad_norm": 0.19163121413004777,
"learning_rate": 2.528874290487102e-06,
"loss": 0.2508,
"step": 276
},
{
"epoch": 5.013574660633484,
"grad_norm": 0.18512352279959782,
"learning_rate": 2.5144373859896792e-06,
"loss": 0.2589,
"step": 277
},
{
"epoch": 5.031674208144796,
"grad_norm": 0.18339390733870273,
"learning_rate": 2.5e-06,
"loss": 0.2621,
"step": 278
},
{
"epoch": 5.049773755656108,
"grad_norm": 0.1942547479998011,
"learning_rate": 2.4855626140103216e-06,
"loss": 0.245,
"step": 279
},
{
"epoch": 5.067873303167421,
"grad_norm": 0.201133955927992,
"learning_rate": 2.4711257095128987e-06,
"loss": 0.2428,
"step": 280
},
{
"epoch": 5.085972850678733,
"grad_norm": 0.19802266448824934,
"learning_rate": 2.4566897679839295e-06,
"loss": 0.2756,
"step": 281
},
{
"epoch": 5.104072398190045,
"grad_norm": 0.19714188235491836,
"learning_rate": 2.4422552708674977e-06,
"loss": 0.2626,
"step": 282
},
{
"epoch": 5.122171945701357,
"grad_norm": 0.18710363733656865,
"learning_rate": 2.427822699559514e-06,
"loss": 0.2616,
"step": 283
},
{
"epoch": 5.14027149321267,
"grad_norm": 0.18029896729988643,
"learning_rate": 2.413392535391663e-06,
"loss": 0.2671,
"step": 284
},
{
"epoch": 5.158371040723982,
"grad_norm": 0.19353123935666788,
"learning_rate": 2.3989652596153496e-06,
"loss": 0.2518,
"step": 285
},
{
"epoch": 5.176470588235294,
"grad_norm": 0.1999507247304982,
"learning_rate": 2.3845413533856517e-06,
"loss": 0.2691,
"step": 286
},
{
"epoch": 5.1945701357466065,
"grad_norm": 0.1802458898092889,
"learning_rate": 2.3701212977452683e-06,
"loss": 0.2662,
"step": 287
},
{
"epoch": 5.212669683257919,
"grad_norm": 0.20005237780106283,
"learning_rate": 2.3557055736084847e-06,
"loss": 0.2706,
"step": 288
},
{
"epoch": 5.230769230769231,
"grad_norm": 0.20349821320072675,
"learning_rate": 2.3412946617451242e-06,
"loss": 0.2651,
"step": 289
},
{
"epoch": 5.248868778280543,
"grad_norm": 0.19275858047883396,
"learning_rate": 2.3268890427645213e-06,
"loss": 0.2809,
"step": 290
},
{
"epoch": 5.266968325791855,
"grad_norm": 0.19491454590375834,
"learning_rate": 2.312489197099489e-06,
"loss": 0.242,
"step": 291
},
{
"epoch": 5.285067873303167,
"grad_norm": 0.17860701410760396,
"learning_rate": 2.298095604990302e-06,
"loss": 0.252,
"step": 292
},
{
"epoch": 5.3031674208144794,
"grad_norm": 0.18166338870243837,
"learning_rate": 2.283708746468672e-06,
"loss": 0.2687,
"step": 293
},
{
"epoch": 5.321266968325792,
"grad_norm": 0.20860085100238554,
"learning_rate": 2.269329101341745e-06,
"loss": 0.2749,
"step": 294
},
{
"epoch": 5.339366515837104,
"grad_norm": 0.18128543910141529,
"learning_rate": 2.2549571491760985e-06,
"loss": 0.2423,
"step": 295
},
{
"epoch": 5.357466063348416,
"grad_norm": 0.23828035104300602,
"learning_rate": 2.2405933692817458e-06,
"loss": 0.2582,
"step": 296
},
{
"epoch": 5.375565610859729,
"grad_norm": 0.19867583702537983,
"learning_rate": 2.226238240696151e-06,
"loss": 0.2505,
"step": 297
},
{
"epoch": 5.393665158371041,
"grad_norm": 0.2238993077156904,
"learning_rate": 2.2118922421682563e-06,
"loss": 0.2547,
"step": 298
},
{
"epoch": 5.411764705882353,
"grad_norm": 0.18659890730168405,
"learning_rate": 2.1975558521425106e-06,
"loss": 0.2541,
"step": 299
},
{
"epoch": 5.429864253393665,
"grad_norm": 0.2086208336638683,
"learning_rate": 2.183229548742916e-06,
"loss": 0.2449,
"step": 300
},
{
"epoch": 5.447963800904978,
"grad_norm": 0.19744096649329249,
"learning_rate": 2.1689138097570832e-06,
"loss": 0.2529,
"step": 301
},
{
"epoch": 5.46606334841629,
"grad_norm": 0.1905137878945102,
"learning_rate": 2.1546091126202955e-06,
"loss": 0.2549,
"step": 302
},
{
"epoch": 5.484162895927602,
"grad_norm": 0.18724152511382108,
"learning_rate": 2.1403159343995845e-06,
"loss": 0.2544,
"step": 303
},
{
"epoch": 5.502262443438914,
"grad_norm": 0.18137306072412968,
"learning_rate": 2.1260347517778223e-06,
"loss": 0.2472,
"step": 304
},
{
"epoch": 5.520361990950226,
"grad_norm": 0.21137486256539126,
"learning_rate": 2.111766041037826e-06,
"loss": 0.2663,
"step": 305
},
{
"epoch": 5.538461538461538,
"grad_norm": 0.18969561601900994,
"learning_rate": 2.0975102780464674e-06,
"loss": 0.2654,
"step": 306
},
{
"epoch": 5.5565610859728505,
"grad_norm": 0.18687293378459552,
"learning_rate": 2.083267938238808e-06,
"loss": 0.2521,
"step": 307
},
{
"epoch": 5.574660633484163,
"grad_norm": 0.18563465250651875,
"learning_rate": 2.0690394966022397e-06,
"loss": 0.2599,
"step": 308
},
{
"epoch": 5.592760180995475,
"grad_norm": 0.18961353982721652,
"learning_rate": 2.0548254276606457e-06,
"loss": 0.253,
"step": 309
},
{
"epoch": 5.610859728506787,
"grad_norm": 0.19358594701649867,
"learning_rate": 2.040626205458574e-06,
"loss": 0.268,
"step": 310
},
{
"epoch": 5.628959276018099,
"grad_norm": 0.18903082550740266,
"learning_rate": 2.02644230354543e-06,
"loss": 0.2794,
"step": 311
},
{
"epoch": 5.647058823529412,
"grad_norm": 0.18955280198715693,
"learning_rate": 2.01227419495968e-06,
"loss": 0.2466,
"step": 312
},
{
"epoch": 5.665158371040724,
"grad_norm": 0.21673963839382857,
"learning_rate": 1.9981223522130764e-06,
"loss": 0.2646,
"step": 313
},
{
"epoch": 5.683257918552036,
"grad_norm": 0.18658355423161882,
"learning_rate": 1.9839872472749016e-06,
"loss": 0.2524,
"step": 314
},
{
"epoch": 5.701357466063349,
"grad_norm": 0.18351414151686257,
"learning_rate": 1.9698693515562235e-06,
"loss": 0.2484,
"step": 315
},
{
"epoch": 5.719457013574661,
"grad_norm": 0.19521700431845607,
"learning_rate": 1.9557691358941796e-06,
"loss": 0.241,
"step": 316
},
{
"epoch": 5.737556561085973,
"grad_norm": 0.18325038007655156,
"learning_rate": 1.941687070536267e-06,
"loss": 0.2834,
"step": 317
},
{
"epoch": 5.755656108597285,
"grad_norm": 0.2024434466335083,
"learning_rate": 1.9276236251246655e-06,
"loss": 0.2617,
"step": 318
},
{
"epoch": 5.773755656108597,
"grad_norm": 0.19282545684546182,
"learning_rate": 1.913579268680572e-06,
"loss": 0.251,
"step": 319
},
{
"epoch": 5.791855203619909,
"grad_norm": 0.1985416405665436,
"learning_rate": 1.8995544695885593e-06,
"loss": 0.2528,
"step": 320
},
{
"epoch": 5.8099547511312215,
"grad_norm": 0.19180458814723977,
"learning_rate": 1.8855496955809546e-06,
"loss": 0.2623,
"step": 321
},
{
"epoch": 5.828054298642534,
"grad_norm": 0.19714720164607588,
"learning_rate": 1.8715654137222434e-06,
"loss": 0.2603,
"step": 322
},
{
"epoch": 5.846153846153846,
"grad_norm": 0.17915913395978303,
"learning_rate": 1.8576020903934872e-06,
"loss": 0.2461,
"step": 323
},
{
"epoch": 5.864253393665159,
"grad_norm": 0.1872517611416961,
"learning_rate": 1.8436601912767737e-06,
"loss": 0.2443,
"step": 324
},
{
"epoch": 5.882352941176471,
"grad_norm": 0.2088265937495008,
"learning_rate": 1.8297401813396854e-06,
"loss": 0.2606,
"step": 325
},
{
"epoch": 5.900452488687783,
"grad_norm": 0.20072778739580704,
"learning_rate": 1.8158425248197931e-06,
"loss": 0.2683,
"step": 326
},
{
"epoch": 5.918552036199095,
"grad_norm": 0.20162018571475668,
"learning_rate": 1.801967685209171e-06,
"loss": 0.2674,
"step": 327
},
{
"epoch": 5.9366515837104075,
"grad_norm": 0.19962010438752759,
"learning_rate": 1.7881161252389423e-06,
"loss": 0.2518,
"step": 328
},
{
"epoch": 5.95475113122172,
"grad_norm": 0.1924016139723619,
"learning_rate": 1.7742883068638447e-06,
"loss": 0.2332,
"step": 329
},
{
"epoch": 5.972850678733032,
"grad_norm": 0.19688732396260147,
"learning_rate": 1.7604846912468243e-06,
"loss": 0.2758,
"step": 330
},
{
"epoch": 5.990950226244344,
"grad_norm": 0.21367643724553775,
"learning_rate": 1.7467057387436577e-06,
"loss": 0.2722,
"step": 331
},
{
"epoch": 6.009049773755656,
"grad_norm": 0.18143686535639186,
"learning_rate": 1.7329519088875959e-06,
"loss": 0.2505,
"step": 332
},
{
"epoch": 6.027149321266968,
"grad_norm": 0.19884601017939751,
"learning_rate": 1.719223660374041e-06,
"loss": 0.2406,
"step": 333
},
{
"epoch": 6.04524886877828,
"grad_norm": 0.19790104231314157,
"learning_rate": 1.7055214510452462e-06,
"loss": 0.2459,
"step": 334
},
{
"epoch": 6.0633484162895925,
"grad_norm": 0.21259902967676111,
"learning_rate": 1.6918457378750511e-06,
"loss": 0.256,
"step": 335
},
{
"epoch": 6.081447963800905,
"grad_norm": 0.213170628627418,
"learning_rate": 1.6781969769536356e-06,
"loss": 0.2606,
"step": 336
},
{
"epoch": 6.099547511312217,
"grad_norm": 0.18867147575952214,
"learning_rate": 1.6645756234723127e-06,
"loss": 0.2445,
"step": 337
},
{
"epoch": 6.117647058823529,
"grad_norm": 0.18694162673757048,
"learning_rate": 1.6509821317083466e-06,
"loss": 0.2346,
"step": 338
},
{
"epoch": 6.135746606334842,
"grad_norm": 0.19692152056487713,
"learning_rate": 1.6374169550098052e-06,
"loss": 0.2645,
"step": 339
},
{
"epoch": 6.153846153846154,
"grad_norm": 0.18900423846777845,
"learning_rate": 1.6238805457804353e-06,
"loss": 0.2409,
"step": 340
},
{
"epoch": 6.171945701357466,
"grad_norm": 0.19281737146761763,
"learning_rate": 1.6103733554645794e-06,
"loss": 0.2511,
"step": 341
},
{
"epoch": 6.1900452488687785,
"grad_norm": 0.18576535863582108,
"learning_rate": 1.5968958345321178e-06,
"loss": 0.2562,
"step": 342
},
{
"epoch": 6.208144796380091,
"grad_norm": 0.1937616575487202,
"learning_rate": 1.5834484324634453e-06,
"loss": 0.2558,
"step": 343
},
{
"epoch": 6.226244343891403,
"grad_norm": 0.20266025820130834,
"learning_rate": 1.5700315977344813e-06,
"loss": 0.2619,
"step": 344
},
{
"epoch": 6.244343891402715,
"grad_norm": 0.19244645126328583,
"learning_rate": 1.5566457778017141e-06,
"loss": 0.2357,
"step": 345
},
{
"epoch": 6.262443438914027,
"grad_norm": 0.19529354957198908,
"learning_rate": 1.5432914190872757e-06,
"loss": 0.2547,
"step": 346
},
{
"epoch": 6.280542986425339,
"grad_norm": 0.1977639183994923,
"learning_rate": 1.529968966964055e-06,
"loss": 0.253,
"step": 347
},
{
"epoch": 6.298642533936651,
"grad_norm": 0.19407931113719454,
"learning_rate": 1.5166788657408441e-06,
"loss": 0.2632,
"step": 348
},
{
"epoch": 6.316742081447964,
"grad_norm": 0.19011112857943221,
"learning_rate": 1.5034215586475194e-06,
"loss": 0.2647,
"step": 349
},
{
"epoch": 6.334841628959276,
"grad_norm": 0.2186558043805355,
"learning_rate": 1.490197487820263e-06,
"loss": 0.2395,
"step": 350
},
{
"epoch": 6.352941176470588,
"grad_norm": 0.18367578824384137,
"learning_rate": 1.477007094286813e-06,
"loss": 0.2516,
"step": 351
},
{
"epoch": 6.371040723981901,
"grad_norm": 0.18371310311269254,
"learning_rate": 1.4638508179517583e-06,
"loss": 0.2709,
"step": 352
},
{
"epoch": 6.389140271493213,
"grad_norm": 0.19750798322441557,
"learning_rate": 1.4507290975818648e-06,
"loss": 0.2497,
"step": 353
},
{
"epoch": 6.407239819004525,
"grad_norm": 0.17489326087119314,
"learning_rate": 1.4376423707914462e-06,
"loss": 0.2518,
"step": 354
},
{
"epoch": 6.425339366515837,
"grad_norm": 0.19109685375971255,
"learning_rate": 1.4245910740277642e-06,
"loss": 0.2464,
"step": 355
},
{
"epoch": 6.4434389140271495,
"grad_norm": 0.18732644035351217,
"learning_rate": 1.4115756425564798e-06,
"loss": 0.2554,
"step": 356
},
{
"epoch": 6.461538461538462,
"grad_norm": 0.2042904942174333,
"learning_rate": 1.39859651044713e-06,
"loss": 0.2677,
"step": 357
},
{
"epoch": 6.479638009049774,
"grad_norm": 0.20346012347129977,
"learning_rate": 1.3856541105586545e-06,
"loss": 0.2433,
"step": 358
},
{
"epoch": 6.497737556561086,
"grad_norm": 0.18096207448536866,
"learning_rate": 1.372748874524961e-06,
"loss": 0.248,
"step": 359
},
{
"epoch": 6.515837104072398,
"grad_norm": 0.18311281316650868,
"learning_rate": 1.3598812327405274e-06,
"loss": 0.2433,
"step": 360
},
{
"epoch": 6.53393665158371,
"grad_norm": 0.19877832010020277,
"learning_rate": 1.3470516143460494e-06,
"loss": 0.2419,
"step": 361
},
{
"epoch": 6.552036199095022,
"grad_norm": 0.19411009696243373,
"learning_rate": 1.3342604472141296e-06,
"loss": 0.2485,
"step": 362
},
{
"epoch": 6.570135746606335,
"grad_norm": 0.18775697820498174,
"learning_rate": 1.3215081579350058e-06,
"loss": 0.2514,
"step": 363
},
{
"epoch": 6.588235294117647,
"grad_norm": 0.1974485040630947,
"learning_rate": 1.308795171802324e-06,
"loss": 0.2623,
"step": 364
},
{
"epoch": 6.606334841628959,
"grad_norm": 0.20195192192796554,
"learning_rate": 1.2961219127989562e-06,
"loss": 0.2523,
"step": 365
},
{
"epoch": 6.624434389140271,
"grad_norm": 0.1867586520187508,
"learning_rate": 1.2834888035828597e-06,
"loss": 0.2434,
"step": 366
},
{
"epoch": 6.642533936651584,
"grad_norm": 0.19535767032905008,
"learning_rate": 1.2708962654729812e-06,
"loss": 0.2246,
"step": 367
},
{
"epoch": 6.660633484162896,
"grad_norm": 0.17951796660986621,
"learning_rate": 1.258344718435205e-06,
"loss": 0.2548,
"step": 368
},
{
"epoch": 6.678733031674208,
"grad_norm": 0.1838076745236157,
"learning_rate": 1.2458345810683492e-06,
"loss": 0.2517,
"step": 369
},
{
"epoch": 6.6968325791855206,
"grad_norm": 0.1987502629500275,
"learning_rate": 1.233366270590202e-06,
"loss": 0.2373,
"step": 370
},
{
"epoch": 6.714932126696833,
"grad_norm": 0.1921556070273265,
"learning_rate": 1.2209402028236114e-06,
"loss": 0.2444,
"step": 371
},
{
"epoch": 6.733031674208145,
"grad_norm": 0.18753751737041122,
"learning_rate": 1.2085567921826128e-06,
"loss": 0.2429,
"step": 372
},
{
"epoch": 6.751131221719457,
"grad_norm": 0.17267111610692507,
"learning_rate": 1.1962164516586123e-06,
"loss": 0.2408,
"step": 373
},
{
"epoch": 6.769230769230769,
"grad_norm": 0.1785397882614972,
"learning_rate": 1.1839195928066101e-06,
"loss": 0.2364,
"step": 374
},
{
"epoch": 6.787330316742081,
"grad_norm": 0.1974641160114867,
"learning_rate": 1.171666625731477e-06,
"loss": 0.2502,
"step": 375
},
{
"epoch": 6.8054298642533935,
"grad_norm": 0.1936200917713445,
"learning_rate": 1.1594579590742758e-06,
"loss": 0.2495,
"step": 376
},
{
"epoch": 6.823529411764706,
"grad_norm": 0.20474767855899034,
"learning_rate": 1.1472939999986338e-06,
"loss": 0.2444,
"step": 377
},
{
"epoch": 6.841628959276018,
"grad_norm": 0.21747609011178112,
"learning_rate": 1.1351751541771644e-06,
"loss": 0.2423,
"step": 378
},
{
"epoch": 6.859728506787331,
"grad_norm": 0.2024534108733349,
"learning_rate": 1.1231018257779363e-06,
"loss": 0.2641,
"step": 379
},
{
"epoch": 6.877828054298643,
"grad_norm": 0.19486585090979294,
"learning_rate": 1.1110744174509952e-06,
"loss": 0.2463,
"step": 380
},
{
"epoch": 6.895927601809955,
"grad_norm": 0.17849040364534344,
"learning_rate": 1.0990933303149342e-06,
"loss": 0.2631,
"step": 381
},
{
"epoch": 6.914027149321267,
"grad_norm": 0.19002926125887049,
"learning_rate": 1.0871589639435204e-06,
"loss": 0.2481,
"step": 382
},
{
"epoch": 6.932126696832579,
"grad_norm": 0.18083592050616315,
"learning_rate": 1.0752717163523623e-06,
"loss": 0.241,
"step": 383
},
{
"epoch": 6.950226244343892,
"grad_norm": 0.19496492930938145,
"learning_rate": 1.0634319839856407e-06,
"loss": 0.2527,
"step": 384
},
{
"epoch": 6.968325791855204,
"grad_norm": 0.19417699707230154,
"learning_rate": 1.0516401617028863e-06,
"loss": 0.2322,
"step": 385
},
{
"epoch": 6.986425339366516,
"grad_norm": 0.18003217148044237,
"learning_rate": 1.0398966427658091e-06,
"loss": 0.2357,
"step": 386
},
{
"epoch": 7.004524886877828,
"grad_norm": 0.18246799637458713,
"learning_rate": 1.0282018188251854e-06,
"loss": 0.2568,
"step": 387
},
{
"epoch": 7.02262443438914,
"grad_norm": 0.18781508356688068,
"learning_rate": 1.0165560799077952e-06,
"loss": 0.2387,
"step": 388
},
{
"epoch": 7.040723981900452,
"grad_norm": 0.17588577341825412,
"learning_rate": 1.004959814403413e-06,
"loss": 0.262,
"step": 389
},
{
"epoch": 7.0588235294117645,
"grad_norm": 0.19676767898186667,
"learning_rate": 9.934134090518593e-07,
"loss": 0.2374,
"step": 390
},
{
"epoch": 7.076923076923077,
"grad_norm": 0.19345676011938345,
"learning_rate": 9.81917248930096e-07,
"loss": 0.2162,
"step": 391
},
{
"epoch": 7.095022624434389,
"grad_norm": 0.2178742299523153,
"learning_rate": 9.704717174393912e-07,
"loss": 0.2495,
"step": 392
},
{
"epoch": 7.113122171945701,
"grad_norm": 0.18628703610003405,
"learning_rate": 9.590771962925272e-07,
"loss": 0.2596,
"step": 393
},
{
"epoch": 7.131221719457014,
"grad_norm": 0.18042019029734135,
"learning_rate": 9.477340655010717e-07,
"loss": 0.2465,
"step": 394
},
{
"epoch": 7.149321266968326,
"grad_norm": 0.1924619560299915,
"learning_rate": 9.36442703362706e-07,
"loss": 0.2395,
"step": 395
},
{
"epoch": 7.167420814479638,
"grad_norm": 0.18162050443390207,
"learning_rate": 9.252034864486062e-07,
"loss": 0.2425,
"step": 396
},
{
"epoch": 7.1855203619909505,
"grad_norm": 0.1725352404799184,
"learning_rate": 9.140167895908867e-07,
"loss": 0.2257,
"step": 397
},
{
"epoch": 7.203619909502263,
"grad_norm": 0.17850869622337964,
"learning_rate": 9.028829858700974e-07,
"loss": 0.2313,
"step": 398
},
{
"epoch": 7.221719457013575,
"grad_norm": 0.1896145123389741,
"learning_rate": 8.918024466027822e-07,
"loss": 0.2462,
"step": 399
},
{
"epoch": 7.239819004524887,
"grad_norm": 0.1878899849862918,
"learning_rate": 8.807755413290953e-07,
"loss": 0.2502,
"step": 400
},
{
"epoch": 7.257918552036199,
"grad_norm": 0.19070595484051797,
"learning_rate": 8.698026378004787e-07,
"loss": 0.2433,
"step": 401
},
{
"epoch": 7.276018099547511,
"grad_norm": 0.17359356109341043,
"learning_rate": 8.588841019673938e-07,
"loss": 0.2604,
"step": 402
},
{
"epoch": 7.294117647058823,
"grad_norm": 0.20358309076003017,
"learning_rate": 8.480202979671201e-07,
"loss": 0.2327,
"step": 403
},
{
"epoch": 7.3122171945701355,
"grad_norm": 0.1835516820557226,
"learning_rate": 8.372115881116089e-07,
"loss": 0.2409,
"step": 404
},
{
"epoch": 7.330316742081448,
"grad_norm": 0.18238130931189853,
"learning_rate": 8.264583328754017e-07,
"loss": 0.2393,
"step": 405
},
{
"epoch": 7.34841628959276,
"grad_norm": 0.17542601825119047,
"learning_rate": 8.157608908836071e-07,
"loss": 0.2312,
"step": 406
},
{
"epoch": 7.366515837104072,
"grad_norm": 0.18257023212771115,
"learning_rate": 8.051196188999425e-07,
"loss": 0.2503,
"step": 407
},
{
"epoch": 7.384615384615385,
"grad_norm": 0.1967778738312882,
"learning_rate": 7.945348718148324e-07,
"loss": 0.2419,
"step": 408
},
{
"epoch": 7.402714932126697,
"grad_norm": 0.18755379540882788,
"learning_rate": 7.840070026335758e-07,
"loss": 0.2332,
"step": 409
},
{
"epoch": 7.420814479638009,
"grad_norm": 0.1911070489817504,
"learning_rate": 7.735363624645712e-07,
"loss": 0.2484,
"step": 410
},
{
"epoch": 7.4389140271493215,
"grad_norm": 0.1882055636984676,
"learning_rate": 7.6312330050761e-07,
"loss": 0.2404,
"step": 411
},
{
"epoch": 7.457013574660634,
"grad_norm": 0.20190668623593286,
"learning_rate": 7.527681640422265e-07,
"loss": 0.2526,
"step": 412
},
{
"epoch": 7.475113122171946,
"grad_norm": 0.1974234563343766,
"learning_rate": 7.424712984161192e-07,
"loss": 0.2688,
"step": 413
},
{
"epoch": 7.493212669683258,
"grad_norm": 0.17631879313649837,
"learning_rate": 7.322330470336314e-07,
"loss": 0.2508,
"step": 414
},
{
"epoch": 7.51131221719457,
"grad_norm": 0.18714884817468105,
"learning_rate": 7.220537513442999e-07,
"loss": 0.2486,
"step": 415
},
{
"epoch": 7.529411764705882,
"grad_norm": 0.19399653562175878,
"learning_rate": 7.11933750831467e-07,
"loss": 0.2618,
"step": 416
},
{
"epoch": 7.547511312217194,
"grad_norm": 0.1881943799081702,
"learning_rate": 7.018733830009578e-07,
"loss": 0.2745,
"step": 417
},
{
"epoch": 7.5656108597285066,
"grad_norm": 0.19410422423302068,
"learning_rate": 6.91872983369826e-07,
"loss": 0.2575,
"step": 418
},
{
"epoch": 7.583710407239819,
"grad_norm": 0.19139908757724744,
"learning_rate": 6.819328854551619e-07,
"loss": 0.2431,
"step": 419
},
{
"epoch": 7.601809954751131,
"grad_norm": 0.19407692138480465,
"learning_rate": 6.720534207629731e-07,
"loss": 0.2612,
"step": 420
},
{
"epoch": 7.619909502262443,
"grad_norm": 0.19077609905815648,
"learning_rate": 6.622349187771246e-07,
"loss": 0.2363,
"step": 421
},
{
"epoch": 7.638009049773755,
"grad_norm": 0.19785590661298624,
"learning_rate": 6.524777069483526e-07,
"loss": 0.2165,
"step": 422
},
{
"epoch": 7.656108597285068,
"grad_norm": 0.18170589381863933,
"learning_rate": 6.427821106833429e-07,
"loss": 0.2518,
"step": 423
},
{
"epoch": 7.67420814479638,
"grad_norm": 0.19082550580582264,
"learning_rate": 6.33148453333881e-07,
"loss": 0.2497,
"step": 424
},
{
"epoch": 7.6923076923076925,
"grad_norm": 0.2010429672996338,
"learning_rate": 6.235770561860646e-07,
"loss": 0.2735,
"step": 425
},
{
"epoch": 7.710407239819005,
"grad_norm": 0.20631621699435826,
"learning_rate": 6.140682384495902e-07,
"loss": 0.2638,
"step": 426
},
{
"epoch": 7.728506787330317,
"grad_norm": 0.18857883979117615,
"learning_rate": 6.046223172471083e-07,
"loss": 0.2511,
"step": 427
},
{
"epoch": 7.746606334841629,
"grad_norm": 0.19438107603701976,
"learning_rate": 5.952396076036457e-07,
"loss": 0.2411,
"step": 428
},
{
"epoch": 7.764705882352941,
"grad_norm": 0.18435853585586434,
"learning_rate": 5.85920422436099e-07,
"loss": 0.2337,
"step": 429
},
{
"epoch": 7.782805429864253,
"grad_norm": 0.19759361458272545,
"learning_rate": 5.766650725428027e-07,
"loss": 0.2304,
"step": 430
},
{
"epoch": 7.800904977375565,
"grad_norm": 0.17820786715247264,
"learning_rate": 5.674738665931575e-07,
"loss": 0.2302,
"step": 431
},
{
"epoch": 7.819004524886878,
"grad_norm": 0.18336638108510472,
"learning_rate": 5.583471111173414e-07,
"loss": 0.2415,
"step": 432
},
{
"epoch": 7.83710407239819,
"grad_norm": 0.1861341218211825,
"learning_rate": 5.492851104960839e-07,
"loss": 0.2347,
"step": 433
},
{
"epoch": 7.855203619909502,
"grad_norm": 0.18671520221803245,
"learning_rate": 5.402881669505164e-07,
"loss": 0.2433,
"step": 434
},
{
"epoch": 7.873303167420815,
"grad_norm": 0.18470916369258913,
"learning_rate": 5.313565805320914e-07,
"loss": 0.2392,
"step": 435
},
{
"epoch": 7.891402714932127,
"grad_norm": 0.18145209957770228,
"learning_rate": 5.224906491125778e-07,
"loss": 0.2491,
"step": 436
},
{
"epoch": 7.909502262443439,
"grad_norm": 0.1841316864472566,
"learning_rate": 5.13690668374125e-07,
"loss": 0.2374,
"step": 437
},
{
"epoch": 7.927601809954751,
"grad_norm": 0.16991217903448427,
"learning_rate": 5.049569317994013e-07,
"loss": 0.2222,
"step": 438
},
{
"epoch": 7.9457013574660635,
"grad_norm": 0.18977292588230824,
"learning_rate": 4.962897306618101e-07,
"loss": 0.2413,
"step": 439
},
{
"epoch": 7.963800904977376,
"grad_norm": 0.2034200762540194,
"learning_rate": 4.876893540157692e-07,
"loss": 0.2526,
"step": 440
},
{
"epoch": 7.981900452488688,
"grad_norm": 0.18561076018112563,
"learning_rate": 4.791560886870786e-07,
"loss": 0.2505,
"step": 441
},
{
"epoch": 8.0,
"grad_norm": 0.1808509581648577,
"learning_rate": 4.70690219263347e-07,
"loss": 0.2397,
"step": 442
},
{
"epoch": 8.018099547511312,
"grad_norm": 0.1983786803651098,
"learning_rate": 4.6229202808450587e-07,
"loss": 0.2384,
"step": 443
},
{
"epoch": 8.036199095022624,
"grad_norm": 0.19613362321076386,
"learning_rate": 4.539617952333913e-07,
"loss": 0.2396,
"step": 444
},
{
"epoch": 8.054298642533936,
"grad_norm": 0.18104571677229486,
"learning_rate": 4.4569979852640444e-07,
"loss": 0.2481,
"step": 445
},
{
"epoch": 8.072398190045249,
"grad_norm": 0.18894956462902818,
"learning_rate": 4.3750631350424456e-07,
"loss": 0.2331,
"step": 446
},
{
"epoch": 8.09049773755656,
"grad_norm": 0.1856642703057781,
"learning_rate": 4.2938161342272024e-07,
"loss": 0.2398,
"step": 447
},
{
"epoch": 8.108597285067873,
"grad_norm": 0.19509279291436657,
"learning_rate": 4.2132596924363666e-07,
"loss": 0.2396,
"step": 448
},
{
"epoch": 8.126696832579185,
"grad_norm": 0.18583235612820456,
"learning_rate": 4.1333964962575995e-07,
"loss": 0.2457,
"step": 449
},
{
"epoch": 8.144796380090497,
"grad_norm": 0.19414831334323818,
"learning_rate": 4.0542292091585447e-07,
"loss": 0.2557,
"step": 450
},
{
"epoch": 8.16289592760181,
"grad_norm": 0.1948999434614907,
"learning_rate": 3.975760471398013e-07,
"loss": 0.2346,
"step": 451
},
{
"epoch": 8.180995475113122,
"grad_norm": 0.18223819061827173,
"learning_rate": 3.89799289993795e-07,
"loss": 0.2176,
"step": 452
},
{
"epoch": 8.199095022624434,
"grad_norm": 0.19449644313553408,
"learning_rate": 3.8209290883561205e-07,
"loss": 0.247,
"step": 453
},
{
"epoch": 8.217194570135746,
"grad_norm": 0.1930258214779179,
"learning_rate": 3.7445716067596506e-07,
"loss": 0.2298,
"step": 454
},
{
"epoch": 8.235294117647058,
"grad_norm": 0.18628969575946702,
"learning_rate": 3.668923001699284e-07,
"loss": 0.2385,
"step": 455
},
{
"epoch": 8.25339366515837,
"grad_norm": 0.18169941514755078,
"learning_rate": 3.593985796084468e-07,
"loss": 0.2519,
"step": 456
},
{
"epoch": 8.271493212669684,
"grad_norm": 0.1837119269988211,
"learning_rate": 3.519762489099207e-07,
"loss": 0.2602,
"step": 457
},
{
"epoch": 8.289592760180996,
"grad_norm": 0.1953248401558189,
"learning_rate": 3.446255556118736e-07,
"loss": 0.2567,
"step": 458
},
{
"epoch": 8.307692307692308,
"grad_norm": 0.17837155536528138,
"learning_rate": 3.373467448626916e-07,
"loss": 0.2332,
"step": 459
},
{
"epoch": 8.32579185520362,
"grad_norm": 0.1879124674324348,
"learning_rate": 3.3014005941345406e-07,
"loss": 0.2357,
"step": 460
},
{
"epoch": 8.343891402714933,
"grad_norm": 0.19669583622722217,
"learning_rate": 3.230057396098321e-07,
"loss": 0.2188,
"step": 461
},
{
"epoch": 8.361990950226245,
"grad_norm": 0.19436805306375338,
"learning_rate": 3.1594402338407633e-07,
"loss": 0.2595,
"step": 462
},
{
"epoch": 8.380090497737557,
"grad_norm": 0.1731035690780127,
"learning_rate": 3.0895514624707994e-07,
"loss": 0.2293,
"step": 463
},
{
"epoch": 8.39819004524887,
"grad_norm": 0.19086125694967881,
"learning_rate": 3.020393412805259e-07,
"loss": 0.2305,
"step": 464
},
{
"epoch": 8.416289592760181,
"grad_norm": 0.18779406733198983,
"learning_rate": 2.9519683912911267e-07,
"loss": 0.2596,
"step": 465
},
{
"epoch": 8.434389140271493,
"grad_norm": 0.18546808477280827,
"learning_rate": 2.8842786799286204e-07,
"loss": 0.2435,
"step": 466
},
{
"epoch": 8.452488687782806,
"grad_norm": 0.1896684936541315,
"learning_rate": 2.8173265361950837e-07,
"loss": 0.2386,
"step": 467
},
{
"epoch": 8.470588235294118,
"grad_norm": 0.17852233356583405,
"learning_rate": 2.751114192969709e-07,
"loss": 0.231,
"step": 468
},
{
"epoch": 8.48868778280543,
"grad_norm": 0.18399543647963754,
"learning_rate": 2.685643858459064e-07,
"loss": 0.2477,
"step": 469
},
{
"epoch": 8.506787330316742,
"grad_norm": 0.18054851239071437,
"learning_rate": 2.620917716123444e-07,
"loss": 0.2504,
"step": 470
},
{
"epoch": 8.524886877828054,
"grad_norm": 0.19308936407562874,
"learning_rate": 2.55693792460405e-07,
"loss": 0.2545,
"step": 471
},
{
"epoch": 8.542986425339366,
"grad_norm": 0.19847333989927235,
"learning_rate": 2.4937066176510123e-07,
"loss": 0.2462,
"step": 472
},
{
"epoch": 8.561085972850679,
"grad_norm": 0.20082127472743996,
"learning_rate": 2.4312259040522093e-07,
"loss": 0.2449,
"step": 473
},
{
"epoch": 8.57918552036199,
"grad_norm": 0.1843637491284879,
"learning_rate": 2.3694978675629476e-07,
"loss": 0.2422,
"step": 474
},
{
"epoch": 8.597285067873303,
"grad_norm": 0.18297796260401825,
"learning_rate": 2.3085245668364897e-07,
"loss": 0.2492,
"step": 475
},
{
"epoch": 8.615384615384615,
"grad_norm": 0.18214698681303781,
"learning_rate": 2.2483080353553537e-07,
"loss": 0.2435,
"step": 476
},
{
"epoch": 8.633484162895927,
"grad_norm": 0.1932187580551005,
"learning_rate": 2.1888502813635276e-07,
"loss": 0.2471,
"step": 477
},
{
"epoch": 8.65158371040724,
"grad_norm": 0.1862160611593082,
"learning_rate": 2.1301532877994747e-07,
"loss": 0.2367,
"step": 478
},
{
"epoch": 8.669683257918551,
"grad_norm": 0.1853161129752053,
"learning_rate": 2.0722190122300311e-07,
"loss": 0.2344,
"step": 479
},
{
"epoch": 8.687782805429864,
"grad_norm": 0.18442104500106515,
"learning_rate": 2.0150493867850867e-07,
"loss": 0.2394,
"step": 480
},
{
"epoch": 8.705882352941176,
"grad_norm": 0.1836768530394557,
"learning_rate": 1.9586463180931658e-07,
"loss": 0.242,
"step": 481
},
{
"epoch": 8.723981900452488,
"grad_norm": 0.18225478866484207,
"learning_rate": 1.9030116872178317e-07,
"loss": 0.2571,
"step": 482
},
{
"epoch": 8.742081447963802,
"grad_norm": 0.19072644512673081,
"learning_rate": 1.848147349594967e-07,
"loss": 0.2457,
"step": 483
},
{
"epoch": 8.760180995475114,
"grad_norm": 0.18223857901348137,
"learning_rate": 1.7940551349708734e-07,
"loss": 0.2351,
"step": 484
},
{
"epoch": 8.778280542986426,
"grad_norm": 0.23050285345657223,
"learning_rate": 1.7407368473412678e-07,
"loss": 0.2355,
"step": 485
},
{
"epoch": 8.796380090497738,
"grad_norm": 0.18880764635155572,
"learning_rate": 1.6881942648911077e-07,
"loss": 0.2287,
"step": 486
},
{
"epoch": 8.81447963800905,
"grad_norm": 0.1830117965150596,
"learning_rate": 1.6364291399352916e-07,
"loss": 0.2447,
"step": 487
},
{
"epoch": 8.832579185520363,
"grad_norm": 0.18803881671915923,
"learning_rate": 1.5854431988602175e-07,
"loss": 0.2431,
"step": 488
},
{
"epoch": 8.850678733031675,
"grad_norm": 0.18013778534000302,
"learning_rate": 1.5352381420662144e-07,
"loss": 0.2397,
"step": 489
},
{
"epoch": 8.868778280542987,
"grad_norm": 0.2003292008190993,
"learning_rate": 1.4858156439108097e-07,
"loss": 0.2291,
"step": 490
},
{
"epoch": 8.886877828054299,
"grad_norm": 0.1780640301175049,
"learning_rate": 1.4371773526529216e-07,
"loss": 0.2138,
"step": 491
},
{
"epoch": 8.904977375565611,
"grad_norm": 0.1858049004037094,
"learning_rate": 1.3893248903978695e-07,
"loss": 0.2248,
"step": 492
},
{
"epoch": 8.923076923076923,
"grad_norm": 0.1870658138910751,
"learning_rate": 1.342259853043279e-07,
"loss": 0.2628,
"step": 493
},
{
"epoch": 8.941176470588236,
"grad_norm": 0.1837618747915919,
"learning_rate": 1.2959838102258537e-07,
"loss": 0.2369,
"step": 494
},
{
"epoch": 8.959276018099548,
"grad_norm": 0.1825018533847707,
"learning_rate": 1.2504983052690406e-07,
"loss": 0.2371,
"step": 495
},
{
"epoch": 8.97737556561086,
"grad_norm": 0.18050085376698732,
"learning_rate": 1.2058048551315455e-07,
"loss": 0.2364,
"step": 496
},
{
"epoch": 8.995475113122172,
"grad_norm": 0.17972618184239006,
"learning_rate": 1.1619049503567486e-07,
"loss": 0.2473,
"step": 497
},
{
"epoch": 9.013574660633484,
"grad_norm": 0.1830792217516428,
"learning_rate": 1.1188000550230005e-07,
"loss": 0.2352,
"step": 498
},
{
"epoch": 9.031674208144796,
"grad_norm": 0.17879744556952354,
"learning_rate": 1.0764916066947795e-07,
"loss": 0.2641,
"step": 499
},
{
"epoch": 9.049773755656108,
"grad_norm": 0.18166675174635316,
"learning_rate": 1.0349810163747587e-07,
"loss": 0.2324,
"step": 500
},
{
"epoch": 9.06787330316742,
"grad_norm": 0.169470120760864,
"learning_rate": 9.942696684567488e-08,
"loss": 0.2433,
"step": 501
},
{
"epoch": 9.085972850678733,
"grad_norm": 0.18110948245786077,
"learning_rate": 9.54358920679524e-08,
"loss": 0.2374,
"step": 502
},
{
"epoch": 9.104072398190045,
"grad_norm": 0.18319694777040335,
"learning_rate": 9.152501040815442e-08,
"loss": 0.254,
"step": 503
},
{
"epoch": 9.122171945701357,
"grad_norm": 0.1915504535166829,
"learning_rate": 8.769445229565549e-08,
"loss": 0.2325,
"step": 504
},
{
"epoch": 9.14027149321267,
"grad_norm": 0.17665350982157665,
"learning_rate": 8.394434548101099e-08,
"loss": 0.2251,
"step": 505
},
{
"epoch": 9.158371040723981,
"grad_norm": 0.17427434868030764,
"learning_rate": 8.027481503169371e-08,
"loss": 0.2345,
"step": 506
},
{
"epoch": 9.176470588235293,
"grad_norm": 0.1787409835322033,
"learning_rate": 7.66859833279257e-08,
"loss": 0.2389,
"step": 507
},
{
"epoch": 9.194570135746606,
"grad_norm": 0.18100016492103735,
"learning_rate": 7.317797005859467e-08,
"loss": 0.2519,
"step": 508
},
{
"epoch": 9.212669683257918,
"grad_norm": 0.17821751417293089,
"learning_rate": 6.97508922172635e-08,
"loss": 0.2287,
"step": 509
},
{
"epoch": 9.23076923076923,
"grad_norm": 0.20843753394336795,
"learning_rate": 6.640486409826785e-08,
"loss": 0.2444,
"step": 510
},
{
"epoch": 9.248868778280542,
"grad_norm": 0.17620285125559612,
"learning_rate": 6.313999729290476e-08,
"loss": 0.2601,
"step": 511
},
{
"epoch": 9.266968325791856,
"grad_norm": 0.18672849956899618,
"learning_rate": 5.99564006857109e-08,
"loss": 0.2247,
"step": 512
},
{
"epoch": 9.285067873303168,
"grad_norm": 0.18049275292301087,
"learning_rate": 5.685418045083102e-08,
"loss": 0.2511,
"step": 513
},
{
"epoch": 9.30316742081448,
"grad_norm": 0.17415682650124498,
"learning_rate": 5.383344004847774e-08,
"loss": 0.2122,
"step": 514
},
{
"epoch": 9.321266968325792,
"grad_norm": 0.18556393996618256,
"learning_rate": 5.0894280221479855e-08,
"loss": 0.2294,
"step": 515
},
{
"epoch": 9.339366515837105,
"grad_norm": 0.1838789712871206,
"learning_rate": 4.8036798991923925e-08,
"loss": 0.2223,
"step": 516
},
{
"epoch": 9.357466063348417,
"grad_norm": 0.19715964425866056,
"learning_rate": 4.526109165788439e-08,
"loss": 0.2381,
"step": 517
},
{
"epoch": 9.375565610859729,
"grad_norm": 0.1855854696991745,
"learning_rate": 4.256725079024554e-08,
"loss": 0.2342,
"step": 518
},
{
"epoch": 9.393665158371041,
"grad_norm": 0.18048725239749752,
"learning_rate": 3.995536622961399e-08,
"loss": 0.2524,
"step": 519
},
{
"epoch": 9.411764705882353,
"grad_norm": 0.19277390554384807,
"learning_rate": 3.7425525083322755e-08,
"loss": 0.2488,
"step": 520
},
{
"epoch": 9.429864253393665,
"grad_norm": 0.18825292413778436,
"learning_rate": 3.4977811722526065e-08,
"loss": 0.2263,
"step": 521
},
{
"epoch": 9.447963800904978,
"grad_norm": 0.17855716822938666,
"learning_rate": 3.261230777938607e-08,
"loss": 0.2549,
"step": 522
},
{
"epoch": 9.46606334841629,
"grad_norm": 0.18271161254439716,
"learning_rate": 3.032909214434887e-08,
"loss": 0.2062,
"step": 523
},
{
"epoch": 9.484162895927602,
"grad_norm": 0.1985253721454189,
"learning_rate": 2.8128240963515574e-08,
"loss": 0.2395,
"step": 524
},
{
"epoch": 9.502262443438914,
"grad_norm": 0.17838320881574793,
"learning_rate": 2.600982763610094e-08,
"loss": 0.2526,
"step": 525
},
{
"epoch": 9.520361990950226,
"grad_norm": 0.18314903249677716,
"learning_rate": 2.3973922811987295e-08,
"loss": 0.2264,
"step": 526
},
{
"epoch": 9.538461538461538,
"grad_norm": 0.19459399624660845,
"learning_rate": 2.202059438936588e-08,
"loss": 0.2589,
"step": 527
},
{
"epoch": 9.55656108597285,
"grad_norm": 0.1904697767266005,
"learning_rate": 2.0149907512475585e-08,
"loss": 0.2515,
"step": 528
},
{
"epoch": 9.574660633484163,
"grad_norm": 0.19523143039480956,
"learning_rate": 1.8361924569427204e-08,
"loss": 0.2525,
"step": 529
},
{
"epoch": 9.592760180995475,
"grad_norm": 0.17856164334939217,
"learning_rate": 1.6656705190125078e-08,
"loss": 0.2276,
"step": 530
},
{
"epoch": 9.610859728506787,
"grad_norm": 0.18321195430667842,
"learning_rate": 1.5034306244277042e-08,
"loss": 0.2418,
"step": 531
},
{
"epoch": 9.628959276018099,
"grad_norm": 0.17787016407899692,
"learning_rate": 1.3494781839498428e-08,
"loss": 0.2342,
"step": 532
},
{
"epoch": 9.647058823529411,
"grad_norm": 0.19126723554650038,
"learning_rate": 1.2038183319507957e-08,
"loss": 0.2469,
"step": 533
},
{
"epoch": 9.665158371040723,
"grad_norm": 0.1892990291817674,
"learning_rate": 1.0664559262413831e-08,
"loss": 0.2549,
"step": 534
},
{
"epoch": 9.683257918552036,
"grad_norm": 0.1793510043645716,
"learning_rate": 9.373955479095587e-09,
"loss": 0.2299,
"step": 535
},
{
"epoch": 9.701357466063348,
"grad_norm": 0.18123530213186048,
"learning_rate": 8.166415011675032e-09,
"loss": 0.238,
"step": 536
},
{
"epoch": 9.71945701357466,
"grad_norm": 0.19155362352898522,
"learning_rate": 7.041978132081295e-09,
"loss": 0.2505,
"step": 537
},
{
"epoch": 9.737556561085974,
"grad_norm": 0.19166375374826475,
"learning_rate": 6.00068234070772e-09,
"loss": 0.2486,
"step": 538
},
{
"epoch": 9.755656108597286,
"grad_norm": 0.19649835123518228,
"learning_rate": 5.042562365160375e-09,
"loss": 0.2339,
"step": 539
},
{
"epoch": 9.773755656108598,
"grad_norm": 0.17975277337095447,
"learning_rate": 4.167650159100922e-09,
"loss": 0.2386,
"step": 540
},
{
"epoch": 9.79185520361991,
"grad_norm": 0.19853806613153782,
"learning_rate": 3.375974901181356e-09,
"loss": 0.2651,
"step": 541
},
{
"epoch": 9.809954751131222,
"grad_norm": 0.17741051186070012,
"learning_rate": 2.6675629940689508e-09,
"loss": 0.2345,
"step": 542
},
{
"epoch": 9.828054298642535,
"grad_norm": 0.21010956050591995,
"learning_rate": 2.0424380635675202e-09,
"loss": 0.2433,
"step": 543
},
{
"epoch": 9.846153846153847,
"grad_norm": 0.1925677170949037,
"learning_rate": 1.5006209578286024e-09,
"loss": 0.2442,
"step": 544
},
{
"epoch": 9.864253393665159,
"grad_norm": 0.18453673405456344,
"learning_rate": 1.0421297466570169e-09,
"loss": 0.2302,
"step": 545
},
{
"epoch": 9.882352941176471,
"grad_norm": 0.20600991870093216,
"learning_rate": 6.669797209069018e-10,
"loss": 0.2338,
"step": 546
},
{
"epoch": 9.900452488687783,
"grad_norm": 0.19783049619088353,
"learning_rate": 3.7518339197267774e-10,
"loss": 0.2584,
"step": 547
},
{
"epoch": 9.918552036199095,
"grad_norm": 0.2143160581746704,
"learning_rate": 1.6675049137188094e-10,
"loss": 0.2481,
"step": 548
},
{
"epoch": 9.936651583710407,
"grad_norm": 0.18644523978656508,
"learning_rate": 4.1687970420423165e-11,
"loss": 0.2456,
"step": 549
},
{
"epoch": 9.95475113122172,
"grad_norm": 0.19744141932163012,
"learning_rate": 0.0,
"loss": 0.2503,
"step": 550
},
{
"epoch": 9.95475113122172,
"step": 550,
"total_flos": 9.907464757911224e+17,
"train_loss": 0.2748682842471383,
"train_runtime": 89439.6385,
"train_samples_per_second": 0.395,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1.0,
"max_steps": 550,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.907464757911224e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}